diff options
author | Moonchild <moonchild@palemoon.org> | 2021-03-03 18:48:48 +0000 |
---|---|---|
committer | Moonchild <moonchild@palemoon.org> | 2021-03-04 00:03:46 +0000 |
commit | 44d2b4a86e3d862eb1b68db3d9a29b9dbf3da746 (patch) | |
tree | 9d9cc4d21c93ae3e1a88ab5c160c3be5f6af0ca9 /media/libaom/src/av1/common | |
parent | 353943d1a48086a39ff5f4365b22f8f058d5f66e (diff) | |
download | aura-central-44d2b4a86e3d862eb1b68db3d9a29b9dbf3da746.tar.gz |
Issue mcp-graveyard/UXP#1737 - Import libaom 2.0.2 source
Diffstat (limited to 'media/libaom/src/av1/common')
117 files changed, 20997 insertions, 15545 deletions
diff --git a/media/libaom/src/av1/common/alloccommon.c b/media/libaom/src/av1/common/alloccommon.c index 1bf81c91d..badee3df9 100644 --- a/media/libaom/src/av1/common/alloccommon.c +++ b/media/libaom/src/av1/common/alloccommon.c @@ -15,10 +15,10 @@ #include "aom_mem/aom_mem.h" #include "av1/common/alloccommon.h" +#include "av1/common/av1_common_int.h" #include "av1/common/blockd.h" #include "av1/common/entropymode.h" #include "av1/common/entropymv.h" -#include "av1/common/onyxc_int.h" int av1_get_MBs(int width, int height) { const int aligned_width = ALIGN_POWER_OF_TWO(width, 3); @@ -31,60 +31,6 @@ int av1_get_MBs(int width, int height) { return mb_rows * mb_cols; } -#if LOOP_FILTER_BITMASK -static int alloc_loop_filter_mask(AV1_COMMON *cm) { - aom_free(cm->lf.lfm); - cm->lf.lfm = NULL; - - // Each lfm holds bit masks for all the 4x4 blocks in a max - // 64x64 (128x128 for ext_partitions) region. The stride - // and rows are rounded up / truncated to a multiple of 16 - // (32 for ext_partition). - cm->lf.lfm_stride = (cm->mi_cols + (MI_SIZE_64X64 - 1)) >> MIN_MIB_SIZE_LOG2; - cm->lf.lfm_num = ((cm->mi_rows + (MI_SIZE_64X64 - 1)) >> MIN_MIB_SIZE_LOG2) * - cm->lf.lfm_stride; - cm->lf.lfm = - (LoopFilterMask *)aom_calloc(cm->lf.lfm_num, sizeof(*cm->lf.lfm)); - if (!cm->lf.lfm) return 1; - - unsigned int i; - for (i = 0; i < cm->lf.lfm_num; ++i) av1_zero(cm->lf.lfm[i]); - - return 0; -} - -static void free_loop_filter_mask(AV1_COMMON *cm) { - if (cm->lf.lfm == NULL) return; - - aom_free(cm->lf.lfm); - cm->lf.lfm = NULL; - cm->lf.lfm_num = 0; - cm->lf.lfm_stride = 0; -} -#endif - -void av1_set_mb_mi(AV1_COMMON *cm, int width, int height) { - // Ensure that the decoded width and height are both multiples of - // 8 luma pixels (note: this may only be a multiple of 4 chroma pixels if - // subsampling is used). - // This simplifies the implementation of various experiments, - // eg. cdef, which operates on units of 8x8 luma pixels. 
- const int aligned_width = ALIGN_POWER_OF_TWO(width, 3); - const int aligned_height = ALIGN_POWER_OF_TWO(height, 3); - - cm->mi_cols = aligned_width >> MI_SIZE_LOG2; - cm->mi_rows = aligned_height >> MI_SIZE_LOG2; - cm->mi_stride = calc_mi_size(cm->mi_cols); - - cm->mb_cols = (cm->mi_cols + 2) >> 2; - cm->mb_rows = (cm->mi_rows + 2) >> 2; - cm->MBs = cm->mb_rows * cm->mb_cols; - -#if LOOP_FILTER_BITMASK - alloc_loop_filter_mask(cm); -#endif -} - void av1_free_ref_frame_buffers(BufferPool *pool) { int i; @@ -92,6 +38,9 @@ void av1_free_ref_frame_buffers(BufferPool *pool) { if (pool->frame_bufs[i].ref_count > 0 && pool->frame_bufs[i].raw_frame_buffer.data != NULL) { pool->release_fb_cb(pool->cb_priv, &pool->frame_bufs[i].raw_frame_buffer); + pool->frame_bufs[i].raw_frame_buffer.data = NULL; + pool->frame_bufs[i].raw_frame_buffer.size = 0; + pool->frame_bufs[i].raw_frame_buffer.priv = NULL; pool->frame_bufs[i].ref_count = 0; } aom_free(pool->frame_bufs[i].mvs); @@ -124,20 +73,19 @@ void av1_alloc_restoration_buffers(AV1_COMMON *cm) { // able to quickly answer the question "Where is the <n>'th stripe for tile // row <m>?" To make that efficient, we generate the rst_last_stripe array. int num_stripes = 0; - for (int i = 0; i < cm->tile_rows; ++i) { + for (int i = 0; i < cm->tiles.rows; ++i) { TileInfo tile_info; av1_tile_set_row(&tile_info, cm, i); const int mi_h = tile_info.mi_row_end - tile_info.mi_row_start; const int ext_h = RESTORATION_UNIT_OFFSET + (mi_h << MI_SIZE_LOG2); const int tile_stripes = (ext_h + 63) / 64; num_stripes += tile_stripes; - cm->rst_end_stripe[i] = num_stripes; } // Now we need to allocate enough space to store the line buffers for the // stripes const int frame_w = cm->superres_upscaled_width; - const int use_highbd = cm->seq_params.use_highbitdepth ? 
1 : 0; + const int use_highbd = cm->seq_params.use_highbitdepth; for (int p = 0; p < num_planes; ++p) { const int is_uv = p > 0; @@ -184,106 +132,131 @@ void av1_free_restoration_buffers(AV1_COMMON *cm) { aom_free_frame_buffer(&cm->rst_frame); } -void av1_free_above_context_buffers(AV1_COMMON *cm, - int num_free_above_contexts) { +void av1_free_above_context_buffers(CommonContexts *above_contexts) { int i; - const int num_planes = cm->num_allocated_above_context_planes; + const int num_planes = above_contexts->num_planes; - for (int tile_row = 0; tile_row < num_free_above_contexts; tile_row++) { + for (int tile_row = 0; tile_row < above_contexts->num_tile_rows; tile_row++) { for (i = 0; i < num_planes; i++) { - aom_free(cm->above_context[i][tile_row]); - cm->above_context[i][tile_row] = NULL; + aom_free(above_contexts->entropy[i][tile_row]); + above_contexts->entropy[i][tile_row] = NULL; } - aom_free(cm->above_seg_context[tile_row]); - cm->above_seg_context[tile_row] = NULL; + aom_free(above_contexts->partition[tile_row]); + above_contexts->partition[tile_row] = NULL; - aom_free(cm->above_txfm_context[tile_row]); - cm->above_txfm_context[tile_row] = NULL; + aom_free(above_contexts->txfm[tile_row]); + above_contexts->txfm[tile_row] = NULL; } for (i = 0; i < num_planes; i++) { - aom_free(cm->above_context[i]); - cm->above_context[i] = NULL; + aom_free(above_contexts->entropy[i]); + above_contexts->entropy[i] = NULL; } - aom_free(cm->above_seg_context); - cm->above_seg_context = NULL; + aom_free(above_contexts->partition); + above_contexts->partition = NULL; - aom_free(cm->above_txfm_context); - cm->above_txfm_context = NULL; + aom_free(above_contexts->txfm); + above_contexts->txfm = NULL; - cm->num_allocated_above_contexts = 0; - cm->num_allocated_above_context_mi_col = 0; - cm->num_allocated_above_context_planes = 0; + above_contexts->num_tile_rows = 0; + above_contexts->num_mi_cols = 0; + above_contexts->num_planes = 0; } void av1_free_context_buffers(AV1_COMMON 
*cm) { - cm->free_mi(cm); + cm->mi_params.free_mi(&cm->mi_params); - av1_free_above_context_buffers(cm, cm->num_allocated_above_contexts); + av1_free_above_context_buffers(&cm->above_contexts); -#if LOOP_FILTER_BITMASK - free_loop_filter_mask(cm); +#if CONFIG_LPF_MASK + av1_free_loop_filter_mask(cm); #endif } -int av1_alloc_above_context_buffers(AV1_COMMON *cm, - int num_alloc_above_contexts) { - const int num_planes = av1_num_planes(cm); - int plane_idx; +int av1_alloc_above_context_buffers(CommonContexts *above_contexts, + int num_tile_rows, int num_mi_cols, + int num_planes) { const int aligned_mi_cols = - ALIGN_POWER_OF_TWO(cm->mi_cols, MAX_MIB_SIZE_LOG2); + ALIGN_POWER_OF_TWO(num_mi_cols, MAX_MIB_SIZE_LOG2); // Allocate above context buffers - cm->num_allocated_above_contexts = num_alloc_above_contexts; - cm->num_allocated_above_context_mi_col = aligned_mi_cols; - cm->num_allocated_above_context_planes = num_planes; - for (plane_idx = 0; plane_idx < num_planes; plane_idx++) { - cm->above_context[plane_idx] = (ENTROPY_CONTEXT **)aom_calloc( - num_alloc_above_contexts, sizeof(cm->above_context[0])); - if (!cm->above_context[plane_idx]) return 1; + above_contexts->num_tile_rows = num_tile_rows; + above_contexts->num_mi_cols = aligned_mi_cols; + above_contexts->num_planes = num_planes; + for (int plane_idx = 0; plane_idx < num_planes; plane_idx++) { + above_contexts->entropy[plane_idx] = (ENTROPY_CONTEXT **)aom_calloc( + num_tile_rows, sizeof(above_contexts->entropy[0])); + if (!above_contexts->entropy[plane_idx]) return 1; } - cm->above_seg_context = (PARTITION_CONTEXT **)aom_calloc( - num_alloc_above_contexts, sizeof(cm->above_seg_context)); - if (!cm->above_seg_context) return 1; + above_contexts->partition = (PARTITION_CONTEXT **)aom_calloc( + num_tile_rows, sizeof(above_contexts->partition)); + if (!above_contexts->partition) return 1; - cm->above_txfm_context = (TXFM_CONTEXT **)aom_calloc( - num_alloc_above_contexts, sizeof(cm->above_txfm_context)); - if 
(!cm->above_txfm_context) return 1; + above_contexts->txfm = + (TXFM_CONTEXT **)aom_calloc(num_tile_rows, sizeof(above_contexts->txfm)); + if (!above_contexts->txfm) return 1; - for (int tile_row = 0; tile_row < num_alloc_above_contexts; tile_row++) { - for (plane_idx = 0; plane_idx < num_planes; plane_idx++) { - cm->above_context[plane_idx][tile_row] = (ENTROPY_CONTEXT *)aom_calloc( - aligned_mi_cols, sizeof(*cm->above_context[0][tile_row])); - if (!cm->above_context[plane_idx][tile_row]) return 1; + for (int tile_row = 0; tile_row < num_tile_rows; tile_row++) { + for (int plane_idx = 0; plane_idx < num_planes; plane_idx++) { + above_contexts->entropy[plane_idx][tile_row] = + (ENTROPY_CONTEXT *)aom_calloc( + aligned_mi_cols, sizeof(*above_contexts->entropy[0][tile_row])); + if (!above_contexts->entropy[plane_idx][tile_row]) return 1; } - cm->above_seg_context[tile_row] = (PARTITION_CONTEXT *)aom_calloc( - aligned_mi_cols, sizeof(*cm->above_seg_context[tile_row])); - if (!cm->above_seg_context[tile_row]) return 1; + above_contexts->partition[tile_row] = (PARTITION_CONTEXT *)aom_calloc( + aligned_mi_cols, sizeof(*above_contexts->partition[tile_row])); + if (!above_contexts->partition[tile_row]) return 1; - cm->above_txfm_context[tile_row] = (TXFM_CONTEXT *)aom_calloc( - aligned_mi_cols, sizeof(*cm->above_txfm_context[tile_row])); - if (!cm->above_txfm_context[tile_row]) return 1; + above_contexts->txfm[tile_row] = (TXFM_CONTEXT *)aom_calloc( + aligned_mi_cols, sizeof(*above_contexts->txfm[tile_row])); + if (!above_contexts->txfm[tile_row]) return 1; } return 0; } -int av1_alloc_context_buffers(AV1_COMMON *cm, int width, int height) { - int new_mi_size; - - av1_set_mb_mi(cm, width, height); - new_mi_size = cm->mi_stride * calc_mi_size(cm->mi_rows); - if (cm->mi_alloc_size < new_mi_size) { - cm->free_mi(cm); - if (cm->alloc_mi(cm, new_mi_size)) goto fail; +// Allocate the dynamically allocated arrays in 'mi_params' assuming +// 'mi_params->set_mb_mi()' was already 
called earlier to initialize the rest of +// the struct members. +static int alloc_mi(CommonModeInfoParams *mi_params) { + const int aligned_mi_rows = calc_mi_size(mi_params->mi_rows); + const int mi_grid_size = mi_params->mi_stride * aligned_mi_rows; + const int alloc_size_1d = mi_size_wide[mi_params->mi_alloc_bsize]; + const int alloc_mi_size = + mi_params->mi_alloc_stride * (aligned_mi_rows / alloc_size_1d); + + if (mi_params->mi_alloc_size < alloc_mi_size || + mi_params->mi_grid_size < mi_grid_size) { + mi_params->free_mi(mi_params); + + mi_params->mi_alloc = + aom_calloc(alloc_mi_size, sizeof(*mi_params->mi_alloc)); + if (!mi_params->mi_alloc) return 1; + mi_params->mi_alloc_size = alloc_mi_size; + + mi_params->mi_grid_base = (MB_MODE_INFO **)aom_calloc( + mi_grid_size, sizeof(*mi_params->mi_grid_base)); + if (!mi_params->mi_grid_base) return 1; + mi_params->mi_grid_size = mi_grid_size; + + mi_params->tx_type_map = + aom_calloc(mi_grid_size, sizeof(*mi_params->tx_type_map)); + if (!mi_params->tx_type_map) return 1; } return 0; +} + +int av1_alloc_context_buffers(AV1_COMMON *cm, int width, int height) { + CommonModeInfoParams *const mi_params = &cm->mi_params; + mi_params->set_mb_mi(mi_params, width, height); + if (alloc_mi(mi_params)) goto fail; + return 0; fail: // clear the mi_* values to force a realloc on resync - av1_set_mb_mi(cm, 0, 0); + mi_params->set_mb_mi(mi_params, 0, 0); av1_free_context_buffers(cm); return 1; } @@ -293,8 +266,44 @@ void av1_remove_common(AV1_COMMON *cm) { aom_free(cm->fc); cm->fc = NULL; - aom_free(cm->frame_contexts); - cm->frame_contexts = NULL; + aom_free(cm->default_frame_context); + cm->default_frame_context = NULL; +} + +void av1_init_mi_buffers(CommonModeInfoParams *mi_params) { + mi_params->setup_mi(mi_params); +} + +#if CONFIG_LPF_MASK +int av1_alloc_loop_filter_mask(AV1_COMMON *cm) { + aom_free(cm->lf.lfm); + cm->lf.lfm = NULL; + + // Each lfm holds bit masks for all the 4x4 blocks in a max + // 64x64 (128x128 for 
ext_partitions) region. The stride + // and rows are rounded up / truncated to a multiple of 16 + // (32 for ext_partition). + cm->lf.lfm_stride = + (cm->mi_params.mi_cols + (MI_SIZE_64X64 - 1)) >> MIN_MIB_SIZE_LOG2; + cm->lf.lfm_num = + ((cm->mi_params.mi_rows + (MI_SIZE_64X64 - 1)) >> MIN_MIB_SIZE_LOG2) * + cm->lf.lfm_stride; + cm->lf.lfm = + (LoopFilterMask *)aom_calloc(cm->lf.lfm_num, sizeof(*cm->lf.lfm)); + if (!cm->lf.lfm) return 1; + + unsigned int i; + for (i = 0; i < cm->lf.lfm_num; ++i) av1_zero(cm->lf.lfm[i]); + + return 0; } -void av1_init_context_buffers(AV1_COMMON *cm) { cm->setup_mi(cm); } +void av1_free_loop_filter_mask(AV1_COMMON *cm) { + if (cm->lf.lfm == NULL) return; + + aom_free(cm->lf.lfm); + cm->lf.lfm = NULL; + cm->lf.lfm_num = 0; + cm->lf.lfm_stride = 0; +} +#endif diff --git a/media/libaom/src/av1/common/alloccommon.h b/media/libaom/src/av1/common/alloccommon.h index 8e5896981..fe8e0c530 100644 --- a/media/libaom/src/av1/common/alloccommon.h +++ b/media/libaom/src/av1/common/alloccommon.h @@ -14,21 +14,25 @@ #define INVALID_IDX -1 // Invalid buffer index. 
+#include "config/aom_config.h" + #ifdef __cplusplus extern "C" { #endif struct AV1Common; struct BufferPool; +struct CommonContexts; +struct CommonModeInfoParams; void av1_remove_common(struct AV1Common *cm); -int av1_alloc_above_context_buffers(struct AV1Common *cm, - int num_alloc_above_contexts); -void av1_free_above_context_buffers(struct AV1Common *cm, - int num_free_above_contexts); +int av1_alloc_above_context_buffers(struct CommonContexts *above_contexts, + int num_tile_rows, int num_mi_cols, + int num_planes); +void av1_free_above_context_buffers(struct CommonContexts *above_contexts); int av1_alloc_context_buffers(struct AV1Common *cm, int width, int height); -void av1_init_context_buffers(struct AV1Common *cm); +void av1_init_mi_buffers(struct CommonModeInfoParams *mi_params); void av1_free_context_buffers(struct AV1Common *cm); void av1_free_ref_frame_buffers(struct BufferPool *pool); @@ -38,9 +42,13 @@ void av1_free_restoration_buffers(struct AV1Common *cm); int av1_alloc_state_buffers(struct AV1Common *cm, int width, int height); void av1_free_state_buffers(struct AV1Common *cm); -void av1_set_mb_mi(struct AV1Common *cm, int width, int height); int av1_get_MBs(int width, int height); +#if CONFIG_LPF_MASK +int av1_alloc_loop_filter_mask(struct AV1Common *cm); +void av1_free_loop_filter_mask(struct AV1Common *cm); +#endif + #ifdef __cplusplus } // extern "C" #endif diff --git a/media/libaom/src/av1/common/arm/av1_inv_txfm_neon.c b/media/libaom/src/av1/common/arm/av1_inv_txfm_neon.c index bad411743..2f3567aea 100644 --- a/media/libaom/src/av1/common/arm/av1_inv_txfm_neon.c +++ b/media/libaom/src/av1/common/arm/av1_inv_txfm_neon.c @@ -48,11 +48,11 @@ static const ITX_TYPE_1D hitx_1d_tab[TX_TYPES] = { // 1D functions static const transform_1d_neon lowbd_txfm_all_1d_arr[TX_SIZES][ITX_TYPES_1D] = { - { av1_idct4_new, av1_iadst4_new, av1_iidentity4_c }, - { av1_idct8_new, av1_iadst8_new, av1_iidentity8_c }, - { av1_idct16_new, av1_iadst16_new, 
av1_iidentity16_c }, - { av1_idct32_new, NULL, NULL }, - { av1_idct64_new, NULL, NULL }, + { av1_idct4, av1_iadst4, av1_iidentity4_c }, + { av1_idct8, av1_iadst8, av1_iidentity8_c }, + { av1_idct16, av1_iadst16, av1_iidentity16_c }, + { av1_idct32, NULL, NULL }, + { av1_idct64, NULL, NULL }, }; static INLINE void lowbd_add_flip_buffer_8xn_neon(int16x8_t *in, @@ -248,31 +248,27 @@ static INLINE void btf_16_half_neon(int16x8_t *const x, const int16x4_t c) { x[1] = vcombine_s16(v1[0], v1[1]); } -static INLINE int16x4_t create_s16x4_neon(int16_t *const c0, int16_t *const c1, - int16_t *const c2, - int16_t *const c3) { +static INLINE int16x4_t set_s16x4_neon(const int16_t c0, const int16_t c1, + const int16_t c2, const int16_t c3) { int16x4_t val = vdup_n_s16((int16_t)0); - val = vld1_lane_s16(c0, val, 0); - val = vld1_lane_s16(c1, val, 1); - val = vld1_lane_s16(c2, val, 2); - val = vld1_lane_s16(c3, val, 3); + val = vset_lane_s16(c0, val, 0); + val = vset_lane_s16(c1, val, 1); + val = vset_lane_s16(c2, val, 2); + val = vset_lane_s16(c3, val, 3); return val; } -static INLINE void iadst8_new_neon(int16x8_t *const in, int16x8_t *out, - int8_t cos_bit, int bit) { +static INLINE void iadst8_neon(int16x8_t *const in, int16x8_t *out, + int8_t cos_bit, int bit) { (void)bit; const int32_t *cospi = cospi_arr(cos_bit); - const int16x4_t c0 = - create_s16x4_neon((int16_t *)(cospi + 4), (int16_t *)(cospi + 60), - (int16_t *)(cospi + 20), (int16_t *)(cospi + 44)); - const int16x4_t c1 = - create_s16x4_neon((int16_t *)(cospi + 36), (int16_t *)(cospi + 28), - (int16_t *)(cospi + 52), (int16_t *)(cospi + 12)); - const int16x4_t c2 = - create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32), - (int16_t *)(cospi + 16), (int16_t *)(cospi + 48)); + const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[4], (int16_t)cospi[60], + (int16_t)cospi[20], (int16_t)cospi[44]); + const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[36], (int16_t)cospi[28], + (int16_t)cospi[52], 
(int16_t)cospi[12]); + const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], + (int16_t)cospi[16], (int16_t)cospi[48]); int16x8_t x[8]; int16x8_t s0, s1, s2, s3, s4, s5, s6, s7; @@ -327,22 +323,21 @@ static INLINE void iadst8_new_neon(int16x8_t *const in, int16x8_t *out, // Stage 7 out[0] = x[0]; - out[1] = vnegq_s16(x[4]); + out[1] = vqnegq_s16(x[4]); out[2] = x[6]; - out[3] = vnegq_s16(x[2]); + out[3] = vqnegq_s16(x[2]); out[4] = x[3]; - out[5] = vnegq_s16(x[7]); + out[5] = vqnegq_s16(x[7]); out[6] = x[5]; - out[7] = vnegq_s16(x[1]); + out[7] = vqnegq_s16(x[1]); } -static INLINE void iadst8_low1_new_neon(int16x8_t *const in, int16x8_t *out, - int8_t cos_bit, int bit) { +static INLINE void iadst8_low1_neon(int16x8_t *const in, int16x8_t *out, + int8_t cos_bit, int bit) { (void)bit; const int32_t *cospi = cospi_arr(cos_bit); - const int16x4_t c2 = - create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32), - (int16_t *)(cospi + 16), (int16_t *)(cospi + 48)); + const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], + (int16_t)cospi[16], (int16_t)cospi[48]); int16x8_t x[8]; int16x8_t s0, s1, s4, s5; @@ -381,34 +376,32 @@ static INLINE void iadst8_low1_new_neon(int16x8_t *const in, int16x8_t *out, // Stage 7 out[0] = x[0]; - out[1] = vnegq_s16(x[4]); + out[1] = vqnegq_s16(x[4]); out[2] = x[6]; - out[3] = vnegq_s16(x[2]); + out[3] = vqnegq_s16(x[2]); out[4] = x[3]; - out[5] = vnegq_s16(x[7]); + out[5] = vqnegq_s16(x[7]); out[6] = x[5]; - out[7] = vnegq_s16(x[1]); + out[7] = vqnegq_s16(x[1]); } -static INLINE void idct8_new_neon(int16x8_t *in, int16x8_t *out, int8_t cos_bit, - int bit) { +static INLINE void idct8_neon(int16x8_t *in, int16x8_t *out, int8_t cos_bit, + int bit) { (void)bit; const int32_t *cospi = cospi_arr(cos_bit); int16x8_t step1[8], step2[8]; - const int16x4_t c0 = - create_s16x4_neon((int16_t *)(cospi + 8), (int16_t *)(cospi + 56), - (int16_t *)(cospi + 40), (int16_t *)(cospi + 24)); - const 
int16x4_t c2 = - create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32), - (int16_t *)(cospi + 16), (int16_t *)(cospi + 48)); + const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56], + (int16_t)cospi[40], (int16_t)cospi[24]); + const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], + (int16_t)cospi[16], (int16_t)cospi[48]); // stage 2 btf_16_lane_0_1_neon(in[1], in[7], c0, &step1[7], &step1[4]); btf_16_lane_2_3_neon(in[5], in[3], c0, &step1[6], &step1[5]); // stage 3 - btf_16_lane_0_1_neon(in[0], in[4], c2, &step2[0], &step2[1]); - btf_16_lane_2_3_neon(in[2], in[6], c2, &step2[3], &step2[2]); + btf_16_lane_0_1_neon(in[0], in[4], c1, &step2[0], &step2[1]); + btf_16_lane_2_3_neon(in[2], in[6], c1, &step2[3], &step2[2]); step2[4] = vqaddq_s16(step1[4], step1[5]); step2[5] = vqsubq_s16(step1[4], step1[5]); step2[6] = vqsubq_s16(step1[7], step1[6]); @@ -419,7 +412,7 @@ static INLINE void idct8_new_neon(int16x8_t *in, int16x8_t *out, int8_t cos_bit, step1[1] = vqaddq_s16(step2[1], step2[2]); step1[2] = vqsubq_s16(step2[1], step2[2]); step1[3] = vqsubq_s16(step2[0], step2[3]); - btf_16_lane_0_1_neon(step2[6], step2[5], c2, &step1[6], &step1[5]); + btf_16_lane_0_1_neon(step2[6], step2[5], c1, &step1[6], &step1[5]); // stage 5 out[0] = vqaddq_s16(step1[0], step2[7]); @@ -432,8 +425,8 @@ static INLINE void idct8_new_neon(int16x8_t *in, int16x8_t *out, int8_t cos_bit, out[7] = vqsubq_s16(step1[0], step2[7]); } -static INLINE void idct8_low1_new_neon(int16x8_t *in, int16x8_t *out, - int8_t cos_bit, int bit) { +static INLINE void idct8_low1_neon(int16x8_t *in, int16x8_t *out, + int8_t cos_bit, int bit) { (void)bit; const int32_t *cospi = cospi_arr(cos_bit); int16x8_t step1; @@ -489,19 +482,24 @@ static INLINE void load_buffer_32bit_to_16bit_neon(const int32_t *input, } } -static INLINE void identity8_new_neon(int16x8_t *input, int16x8_t *output, - int8_t cos_bit, int bit) { - (void)bit; - (void)cos_bit; - - output[0] = 
vmulq_n_s16(input[0], (int16_t)2); - output[1] = vmulq_n_s16(input[1], (int16_t)2); - output[2] = vmulq_n_s16(input[2], (int16_t)2); - output[3] = vmulq_n_s16(input[3], (int16_t)2); - output[4] = vmulq_n_s16(input[4], (int16_t)2); - output[5] = vmulq_n_s16(input[5], (int16_t)2); - output[6] = vmulq_n_s16(input[6], (int16_t)2); - output[7] = vmulq_n_s16(input[7], (int16_t)2); +static int16_t sqrt_2_list[TX_SIZES] = { 5793, 2 * 4096, 2 * 5793, 4 * 4096, + 4 * 5793 }; + +static INLINE void identity_txfm_round_neon(int16x8_t *input, int16x8_t *output, + int txw_idx, int8_t size, int bit) { + const int32x4_t dup_bits_n_32x4 = vdupq_n_s32((int32_t)(-bit)); + int16x4_t scale = vdup_n_s16(sqrt_2_list[txw_idx]); + int16x4_t low_i16, high_i16; + int32x4_t low_i32, high_i32; + for (int i = 0; i < size; i++) { + int32x4_t temp_out_low = vmull_s16(vget_low_s16(input[i]), scale); + int32x4_t temp_out_high = vmull_s16(vget_high_s16(input[i]), scale); + low_i32 = vrshlq_s32(vrshrq_n_s32(temp_out_low, 12), dup_bits_n_32x4); + high_i32 = vrshlq_s32(vrshrq_n_s32(temp_out_high, 12), dup_bits_n_32x4); + low_i16 = vqmovn_s32(low_i32); + high_i16 = vqmovn_s32(high_i32); + output[i] = vcombine_s16(low_i16, high_i16); + } } static INLINE void round_shift_for_rect(int16x8_t *input, int16x8_t *output, @@ -520,38 +518,8 @@ static INLINE void round_shift_for_rect(int16x8_t *input, int16x8_t *output, } } -static INLINE void identity16_new_neon(int16x8_t *input, int16x8_t *output, - int8_t cos_bit, int bit) { - (void)bit; - (void)cos_bit; - - int32x4_t out_low, out_high; - int16x4_t low, high; - int16_t scale = (int16_t)(2 * NewSqrt2); - - for (int z = 0; z < 16; ++z) { - out_low = vmull_n_s16(vget_low_s16(input[z]), scale); - out_high = vmull_n_s16(vget_high_s16(input[z]), scale); - - low = vqrshrn_n_s32(out_low, (int32_t)NewSqrt2Bits); - high = vqrshrn_n_s32(out_high, (int32_t)NewSqrt2Bits); - - output[z] = vcombine_s16(low, high); - } -} - -static INLINE void identity32_new_neon(int16x8_t 
*input, int16x8_t *output, - int8_t cos_bit, int bit) { - (void)bit; - (void)cos_bit; - - for (int z = 0; z < 32; ++z) { - output[z] = vmulq_n_s16(input[z], (int16_t)4); - } -} - -static INLINE void idct16_low1_new_neon(int16x8_t *in, int16x8_t *out, - int8_t cos_bit, int bit) { +static INLINE void idct16_low1_neon(int16x8_t *in, int16x8_t *out, + int8_t cos_bit, int bit) { (void)bit; const int32_t *cospi = cospi_arr(cos_bit); int16x8_t step1; @@ -584,25 +552,23 @@ static INLINE void idct16_low1_new_neon(int16x8_t *in, int16x8_t *out, out[15] = step1; } -static INLINE void idct16_new_neon(int16x8_t *in, int16x8_t *out, - int8_t cos_bit, int bit) { +static INLINE void idct16_neon(int16x8_t *in, int16x8_t *out, int8_t cos_bit, + int bit) { (void)bit; const int32_t *cospi = cospi_arr(cos_bit); int16x8_t step1[16], step2[16]; - const int16x4_t c0 = - create_s16x4_neon((int16_t *)(cospi + 4), (int16_t *)(cospi + 60), - (int16_t *)(cospi + 36), (int16_t *)(cospi + 28)); - const int16x4_t c1 = - create_s16x4_neon((int16_t *)(cospi + 20), (int16_t *)(cospi + 44), - (int16_t *)(cospi + 52), (int16_t *)(cospi + 12)); - const int16x4_t c2 = - create_s16x4_neon((int16_t *)(cospi + 8), (int16_t *)(cospi + 56), - (int16_t *)(cospi + 40), (int16_t *)(cospi + 24)); - const int16x4_t c3 = - create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32), - (int16_t *)(cospi + 16), (int16_t *)(cospi + 48)); - + const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[4], (int16_t)cospi[60], + (int16_t)cospi[36], (int16_t)cospi[28]); + const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[20], (int16_t)cospi[44], + (int16_t)cospi[52], (int16_t)cospi[12]); + const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56], + (int16_t)cospi[40], (int16_t)cospi[24]); + const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], + (int16_t)cospi[16], (int16_t)cospi[48]); + const int16x4_t c4 = + set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]), + 
(int16_t)(-cospi[16]), (int16_t)(-cospi[48])); // stage 2 btf_16_lane_0_1_neon(in[1], in[15], c0, &step2[15], &step2[8]); @@ -642,8 +608,7 @@ static INLINE void idct16_new_neon(int16x8_t *in, int16x8_t *out, btf_16_lane_0_1_neon(step1[0], step1[1], c3, &step2[0], &step2[1]); btf_16_lane_2_3_neon(step1[2], step1[3], c3, &step2[3], &step2[2]); btf_16_lane_2_3_neon(step1[14], step1[9], c3, &step2[14], &step2[9]); - btf_16_lane_3_2_neon(vnegq_s16(step1[10]), vnegq_s16(step1[13]), c3, - &step2[10], &step2[13]); + btf_16_lane_3_2_neon(step1[10], step1[13], c4, &step2[10], &step2[13]); step2[4] = vqaddq_s16(step1[4], step1[5]); step2[5] = vqsubq_s16(step1[4], step1[5]); @@ -710,14 +675,16 @@ static INLINE void idct16_new_neon(int16x8_t *in, int16x8_t *out, out[15] = vqsubq_s16(step2[0], step2[15]); } -static INLINE void idct16_low8_new_neon(int16x8_t *in, int16x8_t *out, - int8_t cos_bit, int bit) { +static INLINE void idct16_low8_neon(int16x8_t *in, int16x8_t *out, + int8_t cos_bit, int bit) { (void)bit; const int32_t *cospi = cospi_arr(cos_bit); int16x8_t step1[16], step2[16]; - const int16x4_t c0 = - create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32), - (int16_t *)(cospi + 16), (int16_t *)(cospi + 48)); + const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], + (int16_t)cospi[16], (int16_t)cospi[48]); + const int16x4_t c1 = + set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]), + (int16_t)(-cospi[16]), (int16_t)(-cospi[48])); // stage 1 // stage 2 @@ -753,8 +720,7 @@ static INLINE void idct16_low8_new_neon(int16x8_t *in, int16x8_t *out, btf_16_neon(step1[0], cospi[32], cospi[32], &step2[0], &step2[1]); btf_16_neon(step1[2], cospi[48], cospi[16], &step2[2], &step2[3]); btf_16_lane_2_3_neon(step1[14], step1[9], c0, &step2[14], &step2[9]); - btf_16_lane_3_2_neon(vnegq_s16(step1[10]), vnegq_s16(step1[13]), c0, - &step2[10], &step2[13]); + btf_16_lane_3_2_neon(step1[10], step1[13], c1, &step2[10], &step2[13]); step2[4] = 
vqaddq_s16(step1[4], step1[5]); step2[5] = vqsubq_s16(step1[4], step1[5]); @@ -820,30 +786,23 @@ static INLINE void idct16_low8_new_neon(int16x8_t *in, int16x8_t *out, out[15] = vqsubq_s16(step2[0], step2[15]); } -static INLINE void iadst16_new_neon(int16x8_t *const in, int16x8_t *out, - int8_t cos_bit, int bit) { +static INLINE void iadst16_neon(int16x8_t *const in, int16x8_t *out, + int8_t cos_bit, int bit) { (void)bit; const int32_t *cospi = cospi_arr(cos_bit); - const int16x4_t c0 = - create_s16x4_neon((int16_t *)(cospi + 2), (int16_t *)(cospi + 62), - (int16_t *)(cospi + 10), (int16_t *)(cospi + 54)); - const int16x4_t c1 = - create_s16x4_neon((int16_t *)(cospi + 18), (int16_t *)(cospi + 46), - (int16_t *)(cospi + 26), (int16_t *)(cospi + 38)); - const int16x4_t c2 = - create_s16x4_neon((int16_t *)(cospi + 34), (int16_t *)(cospi + 30), - (int16_t *)(cospi + 42), (int16_t *)(cospi + 22)); - const int16x4_t c3 = - create_s16x4_neon((int16_t *)(cospi + 50), (int16_t *)(cospi + 14), - (int16_t *)(cospi + 58), (int16_t *)(cospi + 6)); - const int16x4_t c4 = - create_s16x4_neon((int16_t *)(cospi + 8), (int16_t *)(cospi + 56), - (int16_t *)(cospi + 40), (int16_t *)(cospi + 24)); - - const int16x4_t c = - create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32), - (int16_t *)(cospi + 16), (int16_t *)(cospi + 48)); + const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[2], (int16_t)cospi[62], + (int16_t)cospi[10], (int16_t)cospi[54]); + const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[18], (int16_t)cospi[46], + (int16_t)cospi[26], (int16_t)cospi[38]); + const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[34], (int16_t)cospi[30], + (int16_t)cospi[42], (int16_t)cospi[22]); + const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[50], (int16_t)cospi[14], + (int16_t)cospi[58], (int16_t)cospi[6]); + const int16x4_t c4 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56], + (int16_t)cospi[40], (int16_t)cospi[24]); + const int16x4_t c5 = 
set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], + (int16_t)cospi[16], (int16_t)cospi[48]); int16x8_t x[16]; int16x8_t t[14]; @@ -933,14 +892,14 @@ static INLINE void iadst16_new_neon(int16x8_t *const in, int16x8_t *out, t[1] = x[1]; t[2] = x[2]; t[3] = x[3]; - btf_16_lane_2_3_neon(x[4], x[5], c, &s4, &s5); - btf_16_lane_3_2_neon(x[7], x[6], c, &s7, &s6); + btf_16_lane_2_3_neon(x[4], x[5], c5, &s4, &s5); + btf_16_lane_3_2_neon(x[7], x[6], c5, &s7, &s6); t[8] = x[8]; t[9] = x[9]; t[10] = x[10]; t[11] = x[11]; - btf_16_lane_2_3_neon(x[12], x[13], c, &s12, &s13); - btf_16_lane_3_2_neon(x[15], x[14], c, &s15, &s14); + btf_16_lane_2_3_neon(x[12], x[13], c5, &s12, &s13); + btf_16_lane_3_2_neon(x[15], x[14], c5, &s15, &s14); // Stage 7 x[0] = vqaddq_s16(t[0], t[2]); @@ -961,40 +920,38 @@ static INLINE void iadst16_new_neon(int16x8_t *const in, int16x8_t *out, x[15] = vqsubq_s16(s13, s15); // Stage 8 - btf_16_half_neon(x + 2, c); - btf_16_half_neon(x + 6, c); - btf_16_half_neon(x + 10, c); - btf_16_half_neon(x + 14, c); + btf_16_half_neon(x + 2, c5); + btf_16_half_neon(x + 6, c5); + btf_16_half_neon(x + 10, c5); + btf_16_half_neon(x + 14, c5); // Stage 9 out[0] = x[0]; - out[1] = vnegq_s16(x[8]); + out[1] = vqnegq_s16(x[8]); out[2] = x[12]; - out[3] = vnegq_s16(x[4]); + out[3] = vqnegq_s16(x[4]); out[4] = x[6]; - out[5] = vnegq_s16(x[14]); + out[5] = vqnegq_s16(x[14]); out[6] = x[10]; - out[7] = vnegq_s16(x[2]); + out[7] = vqnegq_s16(x[2]); out[8] = x[3]; - out[9] = vnegq_s16(x[11]); + out[9] = vqnegq_s16(x[11]); out[10] = x[15]; - out[11] = vnegq_s16(x[7]); + out[11] = vqnegq_s16(x[7]); out[12] = x[5]; - out[13] = vnegq_s16(x[13]); + out[13] = vqnegq_s16(x[13]); out[14] = x[9]; - out[15] = vnegq_s16(x[1]); + out[15] = vqnegq_s16(x[1]); } -static INLINE void iadst16_low1_new_neon(int16x8_t *const in, int16x8_t *out, - int8_t cos_bit, int bit) { +static INLINE void iadst16_low1_neon(int16x8_t *const in, int16x8_t *out, + int8_t cos_bit, int bit) { (void)bit; const 
int32_t *cospi = cospi_arr(cos_bit); - const int16x4_t c4 = - create_s16x4_neon((int16_t *)(cospi + 8), (int16_t *)(cospi + 56), - (int16_t *)(cospi + 40), (int16_t *)(cospi + 24)); - const int16x4_t c = - create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32), - (int16_t *)(cospi + 16), (int16_t *)(cospi + 48)); + const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56], + (int16_t)cospi[40], (int16_t)cospi[24]); + const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], + (int16_t)cospi[16], (int16_t)cospi[48]); int16x8_t x[16]; int16x8_t t[10]; @@ -1016,7 +973,7 @@ static INLINE void iadst16_low1_new_neon(int16x8_t *const in, int16x8_t *out, // Stage 4 t[0] = x[0]; t[1] = x[1]; - btf_16_lane_0_1_neon(x[8], x[9], c4, &s8, &s9); + btf_16_lane_0_1_neon(x[8], x[9], c0, &s8, &s9); // Stage 5 x[0] = t[0]; @@ -1031,10 +988,10 @@ static INLINE void iadst16_low1_new_neon(int16x8_t *const in, int16x8_t *out, // stage 6 t[0] = x[0]; t[1] = x[1]; - btf_16_lane_2_3_neon(x[4], x[5], c, &s4, &s5); + btf_16_lane_2_3_neon(x[4], x[5], c1, &s4, &s5); t[8] = x[8]; t[9] = x[9]; - btf_16_lane_2_3_neon(x[12], x[13], c, &s12, &s13); + btf_16_lane_2_3_neon(x[12], x[13], c1, &s12, &s13); // Stage 7 x[0] = t[0]; @@ -1055,41 +1012,39 @@ static INLINE void iadst16_low1_new_neon(int16x8_t *const in, int16x8_t *out, x[15] = s13; // Stage 8 - btf_16_half_neon(x + 2, c); - btf_16_half_neon(x + 6, c); - btf_16_half_neon(x + 10, c); - btf_16_half_neon(x + 14, c); + btf_16_half_neon(x + 2, c1); + btf_16_half_neon(x + 6, c1); + btf_16_half_neon(x + 10, c1); + btf_16_half_neon(x + 14, c1); // Stage 9 out[0] = x[0]; - out[1] = vnegq_s16(x[8]); + out[1] = vqnegq_s16(x[8]); out[2] = x[12]; - out[3] = vnegq_s16(x[4]); + out[3] = vqnegq_s16(x[4]); out[4] = x[6]; - out[5] = vnegq_s16(x[14]); + out[5] = vqnegq_s16(x[14]); out[6] = x[10]; - out[7] = vnegq_s16(x[2]); + out[7] = vqnegq_s16(x[2]); out[8] = x[3]; - out[9] = vnegq_s16(x[11]); + out[9] = 
vqnegq_s16(x[11]); out[10] = x[15]; - out[11] = vnegq_s16(x[7]); + out[11] = vqnegq_s16(x[7]); out[12] = x[5]; - out[13] = vnegq_s16(x[13]); + out[13] = vqnegq_s16(x[13]); out[14] = x[9]; - out[15] = vnegq_s16(x[1]); + out[15] = vqnegq_s16(x[1]); } -static INLINE void iadst16_low8_new_neon(int16x8_t *const in, int16x8_t *out, - int8_t cos_bit, int bit) { +static INLINE void iadst16_low8_neon(int16x8_t *const in, int16x8_t *out, + int8_t cos_bit, int bit) { (void)bit; const int32_t *cospi = cospi_arr(cos_bit); - const int16x4_t c4 = - create_s16x4_neon((int16_t *)(cospi + 8), (int16_t *)(cospi + 56), - (int16_t *)(cospi + 40), (int16_t *)(cospi + 24)); - const int16x4_t c = - create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32), - (int16_t *)(cospi + 16), (int16_t *)(cospi + 48)); + const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56], + (int16_t)cospi[40], (int16_t)cospi[24]); + const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], + (int16_t)cospi[16], (int16_t)cospi[48]); int16x8_t x[16]; int16x8_t t[14]; @@ -1144,10 +1099,10 @@ static INLINE void iadst16_low8_new_neon(int16x8_t *const in, int16x8_t *out, t[5] = x[5]; t[6] = x[6]; t[7] = x[7]; - btf_16_lane_0_1_neon(x[8], x[9], c4, &s8, &s9); - btf_16_lane_2_3_neon(x[10], x[11], c4, &s10, &s11); - btf_16_lane_1_0_neon(x[13], x[12], c4, &s13, &s12); - btf_16_lane_3_2_neon(x[15], x[14], c4, &s15, &s14); + btf_16_lane_0_1_neon(x[8], x[9], c0, &s8, &s9); + btf_16_lane_2_3_neon(x[10], x[11], c0, &s10, &s11); + btf_16_lane_1_0_neon(x[13], x[12], c0, &s13, &s12); + btf_16_lane_3_2_neon(x[15], x[14], c0, &s15, &s14); // Stage 5 x[0] = vqaddq_s16(t[0], t[4]); @@ -1172,14 +1127,14 @@ static INLINE void iadst16_low8_new_neon(int16x8_t *const in, int16x8_t *out, t[1] = x[1]; t[2] = x[2]; t[3] = x[3]; - btf_16_lane_2_3_neon(x[4], x[5], c, &s4, &s5); - btf_16_lane_3_2_neon(x[7], x[6], c, &s7, &s6); + btf_16_lane_2_3_neon(x[4], x[5], c1, &s4, &s5); + 
btf_16_lane_3_2_neon(x[7], x[6], c1, &s7, &s6); t[8] = x[8]; t[9] = x[9]; t[10] = x[10]; t[11] = x[11]; - btf_16_lane_2_3_neon(x[12], x[13], c, &s12, &s13); - btf_16_lane_3_2_neon(x[15], x[14], c, &s15, &s14); + btf_16_lane_2_3_neon(x[12], x[13], c1, &s12, &s13); + btf_16_lane_3_2_neon(x[15], x[14], c1, &s15, &s14); // Stage 7 x[0] = vqaddq_s16(t[0], t[2]); @@ -1200,60 +1155,58 @@ static INLINE void iadst16_low8_new_neon(int16x8_t *const in, int16x8_t *out, x[15] = vqsubq_s16(s13, s15); // Stage 8 - btf_16_half_neon(x + 2, c); - btf_16_half_neon(x + 6, c); - btf_16_half_neon(x + 10, c); - btf_16_half_neon(x + 14, c); + btf_16_half_neon(x + 2, c1); + btf_16_half_neon(x + 6, c1); + btf_16_half_neon(x + 10, c1); + btf_16_half_neon(x + 14, c1); // Stage 9 out[0] = x[0]; - out[1] = vnegq_s16(x[8]); + out[1] = vqnegq_s16(x[8]); out[2] = x[12]; - out[3] = vnegq_s16(x[4]); + out[3] = vqnegq_s16(x[4]); out[4] = x[6]; - out[5] = vnegq_s16(x[14]); + out[5] = vqnegq_s16(x[14]); out[6] = x[10]; - out[7] = vnegq_s16(x[2]); + out[7] = vqnegq_s16(x[2]); out[8] = x[3]; - out[9] = vnegq_s16(x[11]); + out[9] = vqnegq_s16(x[11]); out[10] = x[15]; - out[11] = vnegq_s16(x[7]); + out[11] = vqnegq_s16(x[7]); out[12] = x[5]; - out[13] = vnegq_s16(x[13]); + out[13] = vqnegq_s16(x[13]); out[14] = x[9]; - out[15] = vnegq_s16(x[1]); + out[15] = vqnegq_s16(x[1]); } -static INLINE void idct32_new_neon(int16x8_t *in, int16x8_t *out, - int8_t cos_bit, int bit) { +static INLINE void idct32_neon(int16x8_t *in, int16x8_t *out, int8_t cos_bit, + int bit) { (void)bit; const int32_t *cospi = cospi_arr(cos_bit); int16x8_t step1[32], step2[32]; - const int16x4_t c0 = - create_s16x4_neon((int16_t *)(cospi + 2), (int16_t *)(cospi + 62), - (int16_t *)(cospi + 34), (int16_t *)(cospi + 30)); - const int16x4_t c1 = - create_s16x4_neon((int16_t *)(cospi + 18), (int16_t *)(cospi + 46), - (int16_t *)(cospi + 50), (int16_t *)(cospi + 14)); - const int16x4_t c2 = - create_s16x4_neon((int16_t *)(cospi + 10), (int16_t 
*)(cospi + 54), - (int16_t *)(cospi + 42), (int16_t *)(cospi + 22)); - const int16x4_t c3 = - create_s16x4_neon((int16_t *)(cospi + 26), (int16_t *)(cospi + 38), - (int16_t *)(cospi + 58), (int16_t *)(cospi + 6)); - const int16x4_t c4 = - create_s16x4_neon((int16_t *)(cospi + 4), (int16_t *)(cospi + 60), - (int16_t *)(cospi + 36), (int16_t *)(cospi + 28)); - const int16x4_t c5 = - create_s16x4_neon((int16_t *)(cospi + 20), (int16_t *)(cospi + 44), - (int16_t *)(cospi + 52), (int16_t *)(cospi + 12)); - const int16x4_t c6 = - create_s16x4_neon((int16_t *)(cospi + 8), (int16_t *)(cospi + 56), - (int16_t *)(cospi + 40), (int16_t *)(cospi + 24)); - const int16x4_t c7 = - create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32), - (int16_t *)(cospi + 16), (int16_t *)(cospi + 48)); + const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[2], (int16_t)cospi[62], + (int16_t)cospi[34], (int16_t)cospi[30]); + const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[18], (int16_t)cospi[46], + (int16_t)cospi[50], (int16_t)cospi[14]); + const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[10], (int16_t)cospi[54], + (int16_t)cospi[42], (int16_t)cospi[22]); + const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[26], (int16_t)cospi[38], + (int16_t)cospi[58], (int16_t)cospi[6]); + const int16x4_t c4 = set_s16x4_neon((int16_t)cospi[4], (int16_t)cospi[60], + (int16_t)cospi[36], (int16_t)cospi[28]); + const int16x4_t c5 = set_s16x4_neon((int16_t)cospi[20], (int16_t)cospi[44], + (int16_t)cospi[52], (int16_t)cospi[12]); + const int16x4_t c6 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56], + (int16_t)cospi[40], (int16_t)cospi[24]); + const int16x4_t c7 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], + (int16_t)cospi[16], (int16_t)cospi[48]); + const int16x4_t c8 = + set_s16x4_neon((int16_t)(-cospi[8]), (int16_t)(-cospi[56]), + (int16_t)(-cospi[40]), (int16_t)(-cospi[24])); + const int16x4_t c9 = + set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]), + 
(int16_t)(-cospi[16]), (int16_t)(-cospi[48])); // stage 2 @@ -1321,11 +1274,9 @@ static INLINE void idct32_new_neon(int16x8_t *in, int16x8_t *out, btf_16_lane_0_1_neon(step1[4], step1[7], c6, &step2[7], &step2[4]); btf_16_lane_2_3_neon(step1[5], step1[6], c6, &step2[6], &step2[5]); btf_16_lane_0_1_neon(step1[30], step1[17], c6, &step2[30], &step2[17]); - btf_16_lane_1_0_neon(vnegq_s16(step1[18]), vnegq_s16(step1[29]), c6, - &step2[18], &step2[29]); + btf_16_lane_1_0_neon(step1[18], step1[29], c8, &step2[18], &step2[29]); btf_16_lane_2_3_neon(step1[26], step1[21], c6, &step2[26], &step2[21]); - btf_16_lane_3_2_neon(vnegq_s16(step1[22]), vnegq_s16(step1[25]), c6, - &step2[22], &step2[25]); + btf_16_lane_3_2_neon(step1[22], step1[25], c8, &step2[22], &step2[25]); step2[0] = step1[0]; step2[1] = step1[1]; @@ -1353,8 +1304,7 @@ static INLINE void idct32_new_neon(int16x8_t *in, int16x8_t *out, btf_16_lane_0_1_neon(step2[0], step2[1], c7, &step1[0], &step1[1]); btf_16_lane_2_3_neon(step2[2], step2[3], c7, &step1[3], &step1[2]); btf_16_lane_2_3_neon(step2[14], step2[9], c7, &step1[14], &step1[9]); - btf_16_lane_3_2_neon(vnegq_s16(step2[10]), vnegq_s16(step2[13]), c7, - &step1[10], &step1[13]); + btf_16_lane_3_2_neon(step2[10], step2[13], c9, &step1[10], &step1[13]); step1[4] = vqaddq_s16(step2[4], step2[5]); step1[5] = vqsubq_s16(step2[4], step2[5]); @@ -1386,10 +1336,8 @@ static INLINE void idct32_new_neon(int16x8_t *in, int16x8_t *out, btf_16_lane_0_1_neon(step1[6], step1[5], c7, &step2[6], &step2[5]); btf_16_lane_2_3_neon(step1[29], step1[18], c7, &step2[29], &step2[18]); btf_16_lane_2_3_neon(step1[28], step1[19], c7, &step2[28], &step2[19]); - btf_16_lane_3_2_neon(vnegq_s16(step1[20]), vnegq_s16(step1[27]), c7, - &step2[20], &step2[27]); - btf_16_lane_3_2_neon(vnegq_s16(step1[21]), vnegq_s16(step1[26]), c7, - &step2[21], &step2[26]); + btf_16_lane_3_2_neon(step1[20], step1[27], c9, &step2[20], &step2[27]); + btf_16_lane_3_2_neon(step1[21], step1[26], c9, &step2[21], 
&step2[26]); step2[0] = vqaddq_s16(step1[0], step1[3]); step2[1] = vqaddq_s16(step1[1], step1[2]); @@ -1516,8 +1464,8 @@ static INLINE void idct32_new_neon(int16x8_t *in, int16x8_t *out, out[31] = vqsubq_s16(step2[0], step2[31]); } -static INLINE void idct32_low1_new_neon(int16x8_t *in, int16x8_t *out, - int8_t cos_bit, int bit) { +static INLINE void idct32_low1_neon(int16x8_t *in, int16x8_t *out, + int8_t cos_bit, int bit) { (void)bit; const int32_t *cospi = cospi_arr(cos_bit); int16x8_t step1; @@ -1573,19 +1521,22 @@ static INLINE void idct32_low1_new_neon(int16x8_t *in, int16x8_t *out, out[31] = step1; } -static INLINE void idct32_low8_new_neon(int16x8_t *in, int16x8_t *out, - int8_t cos_bit, int bit) { +static INLINE void idct32_low8_neon(int16x8_t *in, int16x8_t *out, + int8_t cos_bit, int bit) { (void)bit; const int32_t *cospi = cospi_arr(cos_bit); int16x8_t step1[32], step2[32]; int32x4_t t32[16]; - const int16x4_t c0 = - create_s16x4_neon((int16_t *)(cospi + 8), (int16_t *)(cospi + 56), - (int16_t *)(cospi + 40), (int16_t *)(cospi + 24)); - const int16x4_t c1 = - create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32), - (int16_t *)(cospi + 16), (int16_t *)(cospi + 48)); - + const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56], + (int16_t)cospi[40], (int16_t)cospi[24]); + const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], + (int16_t)cospi[16], cospi[48]); + const int16x4_t c2 = + set_s16x4_neon((int16_t)(-cospi[8]), (int16_t)(-cospi[56]), + (int16_t)(-cospi[40]), (int16_t)(-cospi[24])); + const int16x4_t c3 = + set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]), + (int16_t)(-cospi[16]), (int16_t)(-cospi[48])); // stage 1 // stage 2 @@ -1627,11 +1578,9 @@ static INLINE void idct32_low8_new_neon(int16x8_t *in, int16x8_t *out, btf_16_neon(step1[4], cospi[56], cospi[8], &step2[4], &step2[7]); btf_16_lane_0_1_neon(step1[30], step1[17], c0, &step2[30], &step2[17]); - 
btf_16_lane_1_0_neon(vnegq_s16(step1[18]), vnegq_s16(step1[29]), c0, - &step2[18], &step2[29]); + btf_16_lane_1_0_neon(step1[18], step1[29], c2, &step2[18], &step2[29]); btf_16_lane_2_3_neon(step1[26], step1[21], c0, &step2[26], &step2[21]); - btf_16_lane_3_2_neon(vnegq_s16(step1[22]), vnegq_s16(step1[25]), c0, - &step2[22], &step2[25]); + btf_16_lane_3_2_neon(step1[22], step1[25], c2, &step2[22], &step2[25]); step2[0] = step1[0]; step2[8] = step1[8]; @@ -1659,8 +1608,7 @@ static INLINE void idct32_low8_new_neon(int16x8_t *in, int16x8_t *out, vrshrn_n_s32(t32[1], INV_COS_BIT)); btf_16_lane_2_3_neon(step2[14], step2[9], c1, &step1[14], &step1[9]); - btf_16_lane_3_2_neon(vnegq_s16(step2[10]), vnegq_s16(step2[13]), c1, - &step1[10], &step1[13]); + btf_16_lane_3_2_neon(step2[10], step2[13], c3, &step1[10], &step1[13]); step1[4] = step2[4]; step1[5] = step2[4]; @@ -1692,10 +1640,8 @@ static INLINE void idct32_low8_new_neon(int16x8_t *in, int16x8_t *out, btf_16_lane_0_1_neon(step1[6], step1[5], c1, &step2[6], &step2[5]); btf_16_lane_2_3_neon(step1[29], step1[18], c1, &step2[29], &step2[18]); btf_16_lane_2_3_neon(step1[28], step1[19], c1, &step2[28], &step2[19]); - btf_16_lane_3_2_neon(vnegq_s16(step1[20]), vnegq_s16(step1[27]), c1, - &step2[20], &step2[27]); - btf_16_lane_3_2_neon(vnegq_s16(step1[21]), vnegq_s16(step1[26]), c1, - &step2[21], &step2[26]); + btf_16_lane_3_2_neon(step1[20], step1[27], c3, &step2[20], &step2[27]); + btf_16_lane_3_2_neon(step1[21], step1[26], c3, &step2[21], &step2[26]); step2[0] = step1[0]; step2[1] = step1[0]; @@ -1822,18 +1768,22 @@ static INLINE void idct32_low8_new_neon(int16x8_t *in, int16x8_t *out, out[31] = vqsubq_s16(step2[0], step2[31]); } -static INLINE void idct32_low16_new_neon(int16x8_t *in, int16x8_t *out, - int8_t cos_bit, int bit) { +static INLINE void idct32_low16_neon(int16x8_t *in, int16x8_t *out, + int8_t cos_bit, int bit) { (void)bit; const int32_t *cospi = cospi_arr(cos_bit); int16x8_t step1[32], step2[32]; int32x4_t 
t32[16]; - const int16x4_t c0 = - create_s16x4_neon((int16_t *)(cospi + 8), (int16_t *)(cospi + 56), - (int16_t *)(cospi + 40), (int16_t *)(cospi + 24)); - const int16x4_t c1 = - create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32), - (int16_t *)(cospi + 16), (int16_t *)(cospi + 48)); + const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56], + (int16_t)cospi[40], (int16_t)cospi[24]); + const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], + (int16_t)cospi[16], (int16_t)cospi[48]); + const int16x4_t c2 = + set_s16x4_neon((int16_t)(-cospi[8]), (int16_t)(-cospi[56]), + (int16_t)(-cospi[40]), (int16_t)(-cospi[24])); + const int16x4_t c3 = + set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]), + (int16_t)(-cospi[16]), (int16_t)(-cospi[48])); // stage 1 // stage 2 @@ -1889,11 +1839,9 @@ static INLINE void idct32_low16_new_neon(int16x8_t *in, int16x8_t *out, btf_16_neon(step1[4], cospi[56], cospi[8], &step2[4], &step2[7]); btf_16_neon(step1[6], -cospi[40], cospi[24], &step2[5], &step2[6]); btf_16_lane_0_1_neon(step1[30], step1[17], c0, &step2[30], &step2[17]); - btf_16_lane_1_0_neon(vnegq_s16(step1[18]), vnegq_s16(step1[29]), c0, - &step2[18], &step2[29]); + btf_16_lane_1_0_neon(step1[18], step1[29], c2, &step2[18], &step2[29]); btf_16_lane_2_3_neon(step1[26], step1[21], c0, &step2[26], &step2[21]); - btf_16_lane_3_2_neon(vnegq_s16(step1[22]), vnegq_s16(step1[25]), c0, - &step2[22], &step2[25]); + btf_16_lane_3_2_neon(step1[22], step1[25], c2, &step2[22], &step2[25]); step2[0] = step1[0]; step2[2] = step1[2]; @@ -1924,8 +1872,7 @@ static INLINE void idct32_low16_new_neon(int16x8_t *in, int16x8_t *out, btf_16_neon(step2[2], cospi[48], cospi[16], &step1[2], &step1[3]); btf_16_lane_2_3_neon(step2[14], step2[9], c1, &step1[14], &step1[9]); - btf_16_lane_3_2_neon(vnegq_s16(step2[10]), vnegq_s16(step2[13]), c1, - &step1[10], &step1[13]); + btf_16_lane_3_2_neon(step2[10], step2[13], c3, &step1[10], &step1[13]); 
step1[4] = vqaddq_s16(step2[4], step2[5]); step1[5] = vqsubq_s16(step2[4], step2[5]); @@ -1957,10 +1904,8 @@ static INLINE void idct32_low16_new_neon(int16x8_t *in, int16x8_t *out, btf_16_lane_0_1_neon(step1[6], step1[5], c1, &step2[6], &step2[5]); btf_16_lane_2_3_neon(step1[29], step1[18], c1, &step2[29], &step2[18]); btf_16_lane_2_3_neon(step1[28], step1[19], c1, &step2[28], &step2[19]); - btf_16_lane_3_2_neon(vnegq_s16(step1[20]), vnegq_s16(step1[27]), c1, - &step2[20], &step2[27]); - btf_16_lane_3_2_neon(vnegq_s16(step1[21]), vnegq_s16(step1[26]), c1, - &step2[21], &step2[26]); + btf_16_lane_3_2_neon(step1[20], step1[27], c3, &step2[20], &step2[27]); + btf_16_lane_3_2_neon(step1[21], step1[26], c3, &step2[21], &step2[26]); step2[0] = vqaddq_s16(step1[0], step1[3]); step2[1] = vqaddq_s16(step1[0], step1[2]); @@ -2086,33 +2031,1542 @@ static INLINE void idct32_low16_new_neon(int16x8_t *in, int16x8_t *out, out[30] = vqsubq_s16(step2[1], step2[30]); out[31] = vqsubq_s16(step2[0], step2[31]); } +static INLINE void idct64_stage9_neon(int16x8_t *step2, int16x8_t *step1, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], + (int16_t)cospi[16], (int16_t)cospi[48]); + + btf_16_lane_0_1_neon(step2[27], step2[20], c3, &step1[27], &step1[20]); + btf_16_lane_0_1_neon(step2[26], step2[21], c3, &step1[26], &step1[21]); + btf_16_lane_0_1_neon(step2[25], step2[22], c3, &step1[25], &step1[22]); + btf_16_lane_0_1_neon(step2[24], step2[23], c3, &step1[24], &step1[23]); + + step1[0] = vqaddq_s16(step2[0], step2[15]); + step1[1] = vqaddq_s16(step2[1], step2[14]); + step1[2] = vqaddq_s16(step2[2], step2[13]); + step1[3] = vqaddq_s16(step2[3], step2[12]); + step1[4] = vqaddq_s16(step2[4], step2[11]); + step1[5] = vqaddq_s16(step2[5], step2[10]); + step1[6] = vqaddq_s16(step2[6], step2[9]); + step1[7] = vqaddq_s16(step2[7], step2[8]); + step1[8] = vqsubq_s16(step2[7], step2[8]); + step1[9] = 
vqsubq_s16(step2[6], step2[9]); + step1[10] = vqsubq_s16(step2[5], step2[10]); + step1[11] = vqsubq_s16(step2[4], step2[11]); + step1[12] = vqsubq_s16(step2[3], step2[12]); + step1[13] = vqsubq_s16(step2[2], step2[13]); + step1[14] = vqsubq_s16(step2[1], step2[14]); + step1[15] = vqsubq_s16(step2[0], step2[15]); + step1[16] = step2[16]; + step1[17] = step2[17]; + step1[18] = step2[18]; + step1[19] = step2[19]; + step1[28] = step2[28]; + step1[29] = step2[29]; + step1[30] = step2[30]; + step1[31] = step2[31]; + step1[32] = vqaddq_s16(step2[32], step2[47]); + step1[33] = vqaddq_s16(step2[33], step2[46]); + step1[34] = vqaddq_s16(step2[34], step2[45]); + step1[35] = vqaddq_s16(step2[35], step2[44]); + step1[36] = vqaddq_s16(step2[36], step2[43]); + step1[37] = vqaddq_s16(step2[37], step2[42]); + step1[38] = vqaddq_s16(step2[38], step2[41]); + step1[39] = vqaddq_s16(step2[39], step2[40]); + step1[40] = vqsubq_s16(step2[39], step2[40]); + step1[41] = vqsubq_s16(step2[38], step2[41]); + step1[42] = vqsubq_s16(step2[37], step2[42]); + step1[43] = vqsubq_s16(step2[36], step2[43]); + step1[44] = vqsubq_s16(step2[35], step2[44]); + step1[45] = vqsubq_s16(step2[34], step2[45]); + step1[46] = vqsubq_s16(step2[33], step2[46]); + step1[47] = vqsubq_s16(step2[32], step2[47]); + step1[48] = vqsubq_s16(step2[63], step2[48]); + step1[49] = vqsubq_s16(step2[62], step2[49]); + step1[50] = vqsubq_s16(step2[61], step2[50]); + step1[51] = vqsubq_s16(step2[60], step2[51]); + step1[52] = vqsubq_s16(step2[59], step2[52]); + step1[53] = vqsubq_s16(step2[58], step2[53]); + step1[54] = vqsubq_s16(step2[57], step2[54]); + step1[55] = vqsubq_s16(step2[56], step2[55]); + step1[56] = vqaddq_s16(step2[56], step2[55]); + step1[57] = vqaddq_s16(step2[57], step2[54]); + step1[58] = vqaddq_s16(step2[58], step2[53]); + step1[59] = vqaddq_s16(step2[59], step2[52]); + step1[60] = vqaddq_s16(step2[60], step2[51]); + step1[61] = vqaddq_s16(step2[61], step2[50]); + step1[62] = vqaddq_s16(step2[62], 
step2[49]); + step1[63] = vqaddq_s16(step2[63], step2[48]); +} + +static INLINE void idct64_stage10_neon(int16x8_t *step1, int16x8_t *step2, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], + (int16_t)cospi[16], (int16_t)cospi[48]); + + btf_16_lane_0_1_neon(step1[55], step1[40], c3, &step2[55], &step2[40]); + btf_16_lane_0_1_neon(step1[54], step1[41], c3, &step2[54], &step2[41]); + btf_16_lane_0_1_neon(step1[53], step1[42], c3, &step2[53], &step2[42]); + btf_16_lane_0_1_neon(step1[52], step1[43], c3, &step2[52], &step2[43]); + btf_16_lane_0_1_neon(step1[51], step1[44], c3, &step2[51], &step2[44]); + btf_16_lane_0_1_neon(step1[50], step1[45], c3, &step2[50], &step2[45]); + btf_16_lane_0_1_neon(step1[49], step1[46], c3, &step2[49], &step2[46]); + btf_16_lane_0_1_neon(step1[48], step1[47], c3, &step2[48], &step2[47]); + + step2[0] = vqaddq_s16(step1[0], step1[31]); + step2[1] = vqaddq_s16(step1[1], step1[30]); + step2[2] = vqaddq_s16(step1[2], step1[29]); + step2[3] = vqaddq_s16(step1[3], step1[28]); + step2[4] = vqaddq_s16(step1[4], step1[27]); + step2[5] = vqaddq_s16(step1[5], step1[26]); + step2[6] = vqaddq_s16(step1[6], step1[25]); + step2[7] = vqaddq_s16(step1[7], step1[24]); + step2[8] = vqaddq_s16(step1[8], step1[23]); + step2[9] = vqaddq_s16(step1[9], step1[22]); + step2[10] = vqaddq_s16(step1[10], step1[21]); + step2[11] = vqaddq_s16(step1[11], step1[20]); + step2[12] = vqaddq_s16(step1[12], step1[19]); + step2[13] = vqaddq_s16(step1[13], step1[18]); + step2[14] = vqaddq_s16(step1[14], step1[17]); + step2[15] = vqaddq_s16(step1[15], step1[16]); + step2[16] = vqsubq_s16(step1[15], step1[16]); + step2[17] = vqsubq_s16(step1[14], step1[17]); + step2[18] = vqsubq_s16(step1[13], step1[18]); + step2[19] = vqsubq_s16(step1[12], step1[19]); + step2[20] = vqsubq_s16(step1[11], step1[20]); + step2[21] = vqsubq_s16(step1[10], step1[21]); + step2[22] = vqsubq_s16(step1[9], 
step1[22]); + step2[23] = vqsubq_s16(step1[8], step1[23]); + step2[24] = vqsubq_s16(step1[7], step1[24]); + step2[25] = vqsubq_s16(step1[6], step1[25]); + step2[26] = vqsubq_s16(step1[5], step1[26]); + step2[27] = vqsubq_s16(step1[4], step1[27]); + step2[28] = vqsubq_s16(step1[3], step1[28]); + step2[29] = vqsubq_s16(step1[2], step1[29]); + step2[30] = vqsubq_s16(step1[1], step1[30]); + step2[31] = vqsubq_s16(step1[0], step1[31]); + step2[32] = step1[32]; + step2[33] = step1[33]; + step2[34] = step1[34]; + step2[35] = step1[35]; + step2[36] = step1[36]; + step2[37] = step1[37]; + step2[38] = step1[38]; + step2[39] = step1[39]; + step2[56] = step1[56]; + step2[57] = step1[57]; + step2[58] = step1[58]; + step2[59] = step1[59]; + step2[60] = step1[60]; + step2[61] = step1[61]; + step2[62] = step1[62]; + step2[63] = step1[63]; +} + +static INLINE void idct64_low32_neon(int16x8_t *in, int16x8_t *out, + int8_t cos_bit, int bit) { + (void)bit; + const int32_t *cospi = cospi_arr(cos_bit); + int16x8_t step2[64], step1[64]; + const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[4], (int16_t)cospi[60], + (int16_t)cospi[36], (int16_t)cospi[28]); + const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[20], (int16_t)cospi[44], + (int16_t)cospi[52], (int16_t)cospi[12]); + const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56], + (int16_t)cospi[40], (int16_t)cospi[24]); + const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], + (int16_t)cospi[16], (int16_t)cospi[48]); + const int16x4_t c4 = + set_s16x4_neon((int16_t)(-cospi[4]), (int16_t)(-cospi[60]), + (int16_t)(-cospi[36]), (int16_t)(-cospi[28])); + const int16x4_t c5 = + set_s16x4_neon((int16_t)(-cospi[20]), (int16_t)(-cospi[44]), + (int16_t)(-cospi[52]), (int16_t)(-cospi[12])); + const int16x4_t c6 = + set_s16x4_neon((int16_t)(-cospi[8]), (int16_t)(-cospi[56]), + (int16_t)(-cospi[40]), (int16_t)(-cospi[24])); + const int16x4_t c7 = + set_s16x4_neon((int16_t)(-cospi[32]), 
(int16_t)(-cospi[32]), + (int16_t)(-cospi[16]), (int16_t)(-cospi[48])); + + // stage 1 + // stage 2 + + step2[0] = in[0]; + step2[2] = in[16]; + step2[4] = in[8]; + step2[6] = in[24]; + step2[8] = in[4]; + step2[10] = in[20]; + step2[12] = in[12]; + step2[14] = in[28]; + step2[16] = in[2]; + step2[18] = in[18]; + step2[20] = in[10]; + step2[22] = in[26]; + step2[24] = in[6]; + step2[26] = in[22]; + step2[28] = in[14]; + step2[30] = in[30]; + + btf_16_neon(in[1], cospi[63], cospi[1], &step2[32], &step2[63]); + btf_16_neon(in[31], -cospi[33], cospi[31], &step2[33], &step2[62]); + btf_16_neon(in[17], cospi[47], cospi[17], &step2[34], &step2[61]); + btf_16_neon(in[15], -cospi[49], cospi[15], &step2[35], &step2[60]); + btf_16_neon(in[9], cospi[55], cospi[9], &step2[36], &step2[59]); + btf_16_neon(in[23], -cospi[41], cospi[23], &step2[37], &step2[58]); + btf_16_neon(in[25], cospi[39], cospi[25], &step2[38], &step2[57]); + btf_16_neon(in[7], -cospi[57], cospi[7], &step2[39], &step2[56]); + btf_16_neon(in[5], cospi[59], cospi[5], &step2[40], &step2[55]); + btf_16_neon(in[27], -cospi[37], cospi[27], &step2[41], &step2[54]); + btf_16_neon(in[21], cospi[43], cospi[21], &step2[42], &step2[53]); + btf_16_neon(in[11], -cospi[53], cospi[11], &step2[43], &step2[52]); + btf_16_neon(in[13], cospi[51], cospi[13], &step2[44], &step2[51]); + btf_16_neon(in[19], -cospi[45], cospi[19], &step2[45], &step2[50]); + btf_16_neon(in[29], cospi[35], cospi[29], &step2[46], &step2[49]); + btf_16_neon(in[3], -cospi[61], cospi[3], &step2[47], &step2[48]); + + // stage 3 + + step1[0] = step2[0]; + step1[2] = step2[2]; + step1[4] = step2[4]; + step1[6] = step2[6]; + step1[8] = step2[8]; + step1[10] = step2[10]; + step1[12] = step2[12]; + step1[14] = step2[14]; + + btf_16_neon(step2[16], cospi[62], cospi[2], &step1[16], &step1[31]); + btf_16_neon(step2[30], -cospi[34], cospi[30], &step1[17], &step1[30]); + btf_16_neon(step2[18], cospi[46], cospi[18], &step1[18], &step1[29]); + btf_16_neon(step2[28], 
-cospi[50], cospi[14], &step1[19], &step1[28]); + btf_16_neon(step2[20], cospi[54], cospi[10], &step1[20], &step1[27]); + btf_16_neon(step2[26], -cospi[42], cospi[22], &step1[21], &step1[26]); + btf_16_neon(step2[22], cospi[38], cospi[26], &step1[22], &step1[25]); + btf_16_neon(step2[24], -cospi[58], cospi[6], &step1[23], &step1[24]); + + step1[32] = vqaddq_s16(step2[32], step2[33]); + step1[33] = vqsubq_s16(step2[32], step2[33]); + step1[34] = vqsubq_s16(step2[35], step2[34]); + step1[35] = vqaddq_s16(step2[35], step2[34]); + step1[36] = vqaddq_s16(step2[36], step2[37]); + step1[37] = vqsubq_s16(step2[36], step2[37]); + step1[38] = vqsubq_s16(step2[39], step2[38]); + step1[39] = vqaddq_s16(step2[39], step2[38]); + step1[40] = vqaddq_s16(step2[40], step2[41]); + step1[41] = vqsubq_s16(step2[40], step2[41]); + step1[42] = vqsubq_s16(step2[43], step2[42]); + step1[43] = vqaddq_s16(step2[43], step2[42]); + step1[44] = vqaddq_s16(step2[44], step2[45]); + step1[45] = vqsubq_s16(step2[44], step2[45]); + step1[46] = vqsubq_s16(step2[47], step2[46]); + step1[47] = vqaddq_s16(step2[47], step2[46]); + step1[48] = vqaddq_s16(step2[48], step2[49]); + step1[49] = vqsubq_s16(step2[48], step2[49]); + step1[50] = vqsubq_s16(step2[51], step2[50]); + step1[51] = vqaddq_s16(step2[51], step2[50]); + step1[52] = vqaddq_s16(step2[52], step2[53]); + step1[53] = vqsubq_s16(step2[52], step2[53]); + step1[54] = vqsubq_s16(step2[55], step2[54]); + step1[55] = vqaddq_s16(step2[55], step2[54]); + step1[56] = vqaddq_s16(step2[56], step2[57]); + step1[57] = vqsubq_s16(step2[56], step2[57]); + step1[58] = vqsubq_s16(step2[59], step2[58]); + step1[59] = vqaddq_s16(step2[59], step2[58]); + step1[60] = vqaddq_s16(step2[60], step2[61]); + step1[61] = vqsubq_s16(step2[60], step2[61]); + step1[62] = vqsubq_s16(step2[63], step2[62]); + step1[63] = vqaddq_s16(step2[63], step2[62]); + + // stage 4 + + step2[0] = step1[0]; + step2[2] = step1[2]; + step2[4] = step1[4]; + step2[6] = step1[6]; + + 
btf_16_neon(step1[8], cospi[60], cospi[4], &step2[8], &step2[15]); + btf_16_neon(step1[14], -cospi[36], cospi[28], &step2[9], &step2[14]); + btf_16_neon(step1[10], cospi[44], cospi[20], &step2[10], &step2[13]); + btf_16_neon(step1[12], -cospi[52], cospi[12], &step2[11], &step2[12]); + btf_16_lane_0_1_neon(step1[62], step1[33], c0, &step2[62], &step2[33]); + btf_16_lane_1_0_neon(step1[34], step1[61], c4, &step2[34], &step2[61]); + btf_16_lane_2_3_neon(step1[58], step1[37], c0, &step2[58], &step2[37]); + btf_16_lane_3_2_neon(step1[38], step1[57], c4, &step2[38], &step2[57]); + btf_16_lane_0_1_neon(step1[54], step1[41], c1, &step2[54], &step2[41]); + btf_16_lane_1_0_neon(step1[42], step1[53], c5, &step2[42], &step2[53]); + btf_16_lane_2_3_neon(step1[50], step1[45], c1, &step2[50], &step2[45]); + btf_16_lane_3_2_neon(step1[46], step1[49], c5, &step2[46], &step2[49]); + + step2[16] = vqaddq_s16(step1[16], step1[17]); + step2[17] = vqsubq_s16(step1[16], step1[17]); + step2[18] = vqsubq_s16(step1[19], step1[18]); + step2[19] = vqaddq_s16(step1[19], step1[18]); + step2[20] = vqaddq_s16(step1[20], step1[21]); + step2[21] = vqsubq_s16(step1[20], step1[21]); + step2[22] = vqsubq_s16(step1[23], step1[22]); + step2[23] = vqaddq_s16(step1[23], step1[22]); + step2[24] = vqaddq_s16(step1[24], step1[25]); + step2[25] = vqsubq_s16(step1[24], step1[25]); + step2[26] = vqsubq_s16(step1[27], step1[26]); + step2[27] = vqaddq_s16(step1[27], step1[26]); + step2[28] = vqaddq_s16(step1[28], step1[29]); + step2[29] = vqsubq_s16(step1[28], step1[29]); + step2[30] = vqsubq_s16(step1[31], step1[30]); + step2[31] = vqaddq_s16(step1[31], step1[30]); + step2[32] = step1[32]; + step2[35] = step1[35]; + step2[36] = step1[36]; + step2[39] = step1[39]; + step2[40] = step1[40]; + step2[43] = step1[43]; + step2[44] = step1[44]; + step2[47] = step1[47]; + step2[48] = step1[48]; + step2[51] = step1[51]; + step2[52] = step1[52]; + step2[55] = step1[55]; + step2[56] = step1[56]; + step2[59] = step1[59]; + 
step2[60] = step1[60]; + step2[63] = step1[63]; + + // stage 5 + + step1[0] = step2[0]; + step1[2] = step2[2]; + + btf_16_neon(step2[4], cospi[56], cospi[8], &step1[4], &step1[7]); + btf_16_neon(step2[6], -cospi[40], cospi[24], &step1[5], &step1[6]); + btf_16_lane_0_1_neon(step2[30], step2[17], c2, &step1[30], &step1[17]); + btf_16_lane_1_0_neon(step2[18], step2[29], c6, &step1[18], &step1[29]); + btf_16_lane_2_3_neon(step2[26], step2[21], c2, &step1[26], &step1[21]); + btf_16_lane_3_2_neon(step2[22], step2[25], c6, &step1[22], &step1[25]); + + step1[8] = vqaddq_s16(step2[8], step2[9]); + step1[9] = vqsubq_s16(step2[8], step2[9]); + step1[10] = vqsubq_s16(step2[11], step2[10]); + step1[11] = vqaddq_s16(step2[11], step2[10]); + step1[12] = vqaddq_s16(step2[12], step2[13]); + step1[13] = vqsubq_s16(step2[12], step2[13]); + step1[14] = vqsubq_s16(step2[15], step2[14]); + step1[15] = vqaddq_s16(step2[15], step2[14]); + step1[16] = step2[16]; + step1[19] = step2[19]; + step1[20] = step2[20]; + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[27] = step2[27]; + step1[28] = step2[28]; + step1[31] = step2[31]; + step1[32] = vqaddq_s16(step2[32], step2[35]); + step1[33] = vqaddq_s16(step2[33], step2[34]); + step1[34] = vqsubq_s16(step2[33], step2[34]); + step1[35] = vqsubq_s16(step2[32], step2[35]); + step1[36] = vqsubq_s16(step2[39], step2[36]); + step1[37] = vqsubq_s16(step2[38], step2[37]); + step1[38] = vqaddq_s16(step2[38], step2[37]); + step1[39] = vqaddq_s16(step2[39], step2[36]); + step1[40] = vqaddq_s16(step2[40], step2[43]); + step1[41] = vqaddq_s16(step2[41], step2[42]); + step1[42] = vqsubq_s16(step2[41], step2[42]); + step1[43] = vqsubq_s16(step2[40], step2[43]); + step1[44] = vqsubq_s16(step2[47], step2[44]); + step1[45] = vqsubq_s16(step2[46], step2[45]); + step1[46] = vqaddq_s16(step2[46], step2[45]); + step1[47] = vqaddq_s16(step2[47], step2[44]); + step1[48] = vqaddq_s16(step2[48], step2[51]); + step1[49] = vqaddq_s16(step2[49], step2[50]); + 
step1[50] = vqsubq_s16(step2[49], step2[50]); + step1[51] = vqsubq_s16(step2[48], step2[51]); + step1[52] = vqsubq_s16(step2[55], step2[52]); + step1[53] = vqsubq_s16(step2[54], step2[53]); + step1[54] = vqaddq_s16(step2[54], step2[53]); + step1[55] = vqaddq_s16(step2[55], step2[52]); + step1[56] = vqaddq_s16(step2[56], step2[59]); + step1[57] = vqaddq_s16(step2[57], step2[58]); + step1[58] = vqsubq_s16(step2[57], step2[58]); + step1[59] = vqsubq_s16(step2[56], step2[59]); + step1[60] = vqsubq_s16(step2[63], step2[60]); + step1[61] = vqsubq_s16(step2[62], step2[61]); + step1[62] = vqaddq_s16(step2[62], step2[61]); + step1[63] = vqaddq_s16(step2[63], step2[60]); + + // stage 6 + + btf_16_neon(step1[0], cospi[32], cospi[32], &step2[0], &step2[1]); + btf_16_neon(step1[2], cospi[48], cospi[16], &step2[2], &step2[3]); + btf_16_lane_2_3_neon(step1[14], step1[9], c3, &step2[14], &step2[9]); + btf_16_lane_3_2_neon(step1[10], step1[13], c7, &step2[10], &step2[13]); + btf_16_lane_0_1_neon(step1[61], step1[34], c2, &step2[61], &step2[34]); + btf_16_lane_0_1_neon(step1[60], step1[35], c2, &step2[60], &step2[35]); + btf_16_lane_1_0_neon(step1[36], step1[59], c6, &step2[36], &step2[59]); + btf_16_lane_1_0_neon(step1[37], step1[58], c6, &step2[37], &step2[58]); + btf_16_lane_2_3_neon(step1[53], step1[42], c2, &step2[53], &step2[42]); + btf_16_lane_2_3_neon(step1[52], step1[43], c2, &step2[52], &step2[43]); + btf_16_lane_3_2_neon(step1[44], step1[51], c6, &step2[44], &step2[51]); + btf_16_lane_3_2_neon(step1[45], step1[50], c6, &step2[45], &step2[50]); + + step2[4] = vqaddq_s16(step1[4], step1[5]); + step2[5] = vqsubq_s16(step1[4], step1[5]); + step2[6] = vqsubq_s16(step1[7], step1[6]); + step2[7] = vqaddq_s16(step1[7], step1[6]); + step2[8] = step1[8]; + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; + step2[16] = vqaddq_s16(step1[16], step1[19]); + step2[17] = vqaddq_s16(step1[17], step1[18]); + step2[18] = vqsubq_s16(step1[17], step1[18]); + step2[19] 
= vqsubq_s16(step1[16], step1[19]); + step2[20] = vqsubq_s16(step1[23], step1[20]); + step2[21] = vqsubq_s16(step1[22], step1[21]); + step2[22] = vqaddq_s16(step1[22], step1[21]); + step2[23] = vqaddq_s16(step1[23], step1[20]); + step2[24] = vqaddq_s16(step1[24], step1[27]); + step2[25] = vqaddq_s16(step1[25], step1[26]); + step2[26] = vqsubq_s16(step1[25], step1[26]); + step2[27] = vqsubq_s16(step1[24], step1[27]); + step2[28] = vqsubq_s16(step1[31], step1[28]); + step2[29] = vqsubq_s16(step1[30], step1[29]); + step2[30] = vqaddq_s16(step1[30], step1[29]); + step2[31] = vqaddq_s16(step1[31], step1[28]); + step2[32] = step1[32]; + step2[33] = step1[33]; + step2[38] = step1[38]; + step2[39] = step1[39]; + step2[40] = step1[40]; + step2[41] = step1[41]; + step2[46] = step1[46]; + step2[47] = step1[47]; + step2[48] = step1[48]; + step2[49] = step1[49]; + step2[54] = step1[54]; + step2[55] = step1[55]; + step2[56] = step1[56]; + step2[57] = step1[57]; + step2[62] = step1[62]; + step2[63] = step1[63]; + + // stage 7 + + btf_16_lane_0_1_neon(step2[6], step2[5], c3, &step1[6], &step1[5]); + btf_16_lane_2_3_neon(step2[29], step2[18], c3, &step1[29], &step1[18]); + btf_16_lane_2_3_neon(step2[28], step2[19], c3, &step1[28], &step1[19]); + btf_16_lane_3_2_neon(step2[20], step2[27], c7, &step1[20], &step1[27]); + btf_16_lane_3_2_neon(step2[21], step2[26], c7, &step1[21], &step1[26]); + + step1[0] = vqaddq_s16(step2[0], step2[3]); + step1[1] = vqaddq_s16(step2[1], step2[2]); + step1[2] = vqsubq_s16(step2[1], step2[2]); + step1[3] = vqsubq_s16(step2[0], step2[3]); + step1[4] = step2[4]; + step1[7] = step2[7]; + step1[8] = vqaddq_s16(step2[8], step2[11]); + step1[9] = vqaddq_s16(step2[9], step2[10]); + step1[10] = vqsubq_s16(step2[9], step2[10]); + step1[11] = vqsubq_s16(step2[8], step2[11]); + step1[12] = vqsubq_s16(step2[15], step2[12]); + step1[13] = vqsubq_s16(step2[14], step2[13]); + step1[14] = vqaddq_s16(step2[14], step2[13]); + step1[15] = vqaddq_s16(step2[15], 
step2[12]); + step1[16] = step2[16]; + step1[17] = step2[17]; + step1[22] = step2[22]; + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[25] = step2[25]; + step1[30] = step2[30]; + step1[31] = step2[31]; + step1[32] = vqaddq_s16(step2[32], step2[39]); + step1[33] = vqaddq_s16(step2[33], step2[38]); + step1[34] = vqaddq_s16(step2[34], step2[37]); + step1[35] = vqaddq_s16(step2[35], step2[36]); + step1[36] = vqsubq_s16(step2[35], step2[36]); + step1[37] = vqsubq_s16(step2[34], step2[37]); + step1[38] = vqsubq_s16(step2[33], step2[38]); + step1[39] = vqsubq_s16(step2[32], step2[39]); + step1[40] = vqsubq_s16(step2[47], step2[40]); + step1[41] = vqsubq_s16(step2[46], step2[41]); + step1[42] = vqsubq_s16(step2[45], step2[42]); + step1[43] = vqsubq_s16(step2[44], step2[43]); + step1[44] = vqaddq_s16(step2[43], step2[44]); + step1[45] = vqaddq_s16(step2[42], step2[45]); + step1[46] = vqaddq_s16(step2[41], step2[46]); + step1[47] = vqaddq_s16(step2[40], step2[47]); + step1[48] = vqaddq_s16(step2[48], step2[55]); + step1[49] = vqaddq_s16(step2[49], step2[54]); + step1[50] = vqaddq_s16(step2[50], step2[53]); + step1[51] = vqaddq_s16(step2[51], step2[52]); + step1[52] = vqsubq_s16(step2[51], step2[52]); + step1[53] = vqsubq_s16(step2[50], step2[53]); + step1[54] = vqsubq_s16(step2[49], step2[54]); + step1[55] = vqsubq_s16(step2[48], step2[55]); + step1[56] = vqsubq_s16(step2[63], step2[56]); + step1[57] = vqsubq_s16(step2[62], step2[57]); + step1[58] = vqsubq_s16(step2[61], step2[58]); + step1[59] = vqsubq_s16(step2[60], step2[59]); + step1[60] = vqaddq_s16(step2[59], step2[60]); + step1[61] = vqaddq_s16(step2[58], step2[61]); + step1[62] = vqaddq_s16(step2[57], step2[62]); + step1[63] = vqaddq_s16(step2[56], step2[63]); + + // stage 8 + + btf_16_lane_0_1_neon(step1[13], step1[10], c3, &step2[13], &step2[10]); + btf_16_lane_0_1_neon(step1[12], step1[11], c3, &step2[12], &step2[11]); + btf_16_lane_2_3_neon(step1[59], step1[36], c3, &step2[59], &step2[36]); + 
btf_16_lane_2_3_neon(step1[58], step1[37], c3, &step2[58], &step2[37]); + btf_16_lane_2_3_neon(step1[57], step1[38], c3, &step2[57], &step2[38]); + btf_16_lane_2_3_neon(step1[56], step1[39], c3, &step2[56], &step2[39]); + btf_16_lane_3_2_neon(step1[40], step1[55], c7, &step2[40], &step2[55]); + btf_16_lane_3_2_neon(step1[41], step1[54], c7, &step2[41], &step2[54]); + btf_16_lane_3_2_neon(step1[42], step1[53], c7, &step2[42], &step2[53]); + btf_16_lane_3_2_neon(step1[43], step1[52], c7, &step2[43], &step2[52]); + + step2[0] = vqaddq_s16(step1[0], step1[7]); + step2[1] = vqaddq_s16(step1[1], step1[6]); + step2[2] = vqaddq_s16(step1[2], step1[5]); + step2[3] = vqaddq_s16(step1[3], step1[4]); + step2[4] = vqsubq_s16(step1[3], step1[4]); + step2[5] = vqsubq_s16(step1[2], step1[5]); + step2[6] = vqsubq_s16(step1[1], step1[6]); + step2[7] = vqsubq_s16(step1[0], step1[7]); + step2[8] = step1[8]; + step2[9] = step1[9]; + step2[14] = step1[14]; + step2[15] = step1[15]; + step2[16] = vqaddq_s16(step1[16], step1[23]); + step2[17] = vqaddq_s16(step1[17], step1[22]); + step2[18] = vqaddq_s16(step1[18], step1[21]); + step2[19] = vqaddq_s16(step1[19], step1[20]); + step2[20] = vqsubq_s16(step1[19], step1[20]); + step2[21] = vqsubq_s16(step1[18], step1[21]); + step2[22] = vqsubq_s16(step1[17], step1[22]); + step2[23] = vqsubq_s16(step1[16], step1[23]); + step2[24] = vqsubq_s16(step1[31], step1[24]); + step2[25] = vqsubq_s16(step1[30], step1[25]); + step2[26] = vqsubq_s16(step1[29], step1[26]); + step2[27] = vqsubq_s16(step1[28], step1[27]); + step2[28] = vqaddq_s16(step1[28], step1[27]); + step2[29] = vqaddq_s16(step1[29], step1[26]); + step2[30] = vqaddq_s16(step1[30], step1[25]); + step2[31] = vqaddq_s16(step1[31], step1[24]); + step2[32] = step1[32]; + step2[33] = step1[33]; + step2[34] = step1[34]; + step2[35] = step1[35]; + step2[44] = step1[44]; + step2[45] = step1[45]; + step2[46] = step1[46]; + step2[47] = step1[47]; + step2[48] = step1[48]; + step2[49] = step1[49]; + 
step2[50] = step1[50]; + step2[51] = step1[51]; + step2[60] = step1[60]; + step2[61] = step1[61]; + step2[62] = step1[62]; + step2[63] = step1[63]; + + // stage 9 + idct64_stage9_neon(step2, step1, cos_bit); + + // stage 10 + idct64_stage10_neon(step1, step2, cos_bit); + + // stage 11 + + out[0] = vqaddq_s16(step2[0], step2[63]); + out[1] = vqaddq_s16(step2[1], step2[62]); + out[2] = vqaddq_s16(step2[2], step2[61]); + out[3] = vqaddq_s16(step2[3], step2[60]); + out[4] = vqaddq_s16(step2[4], step2[59]); + out[5] = vqaddq_s16(step2[5], step2[58]); + out[6] = vqaddq_s16(step2[6], step2[57]); + out[7] = vqaddq_s16(step2[7], step2[56]); + out[8] = vqaddq_s16(step2[8], step2[55]); + out[9] = vqaddq_s16(step2[9], step2[54]); + out[10] = vqaddq_s16(step2[10], step2[53]); + out[11] = vqaddq_s16(step2[11], step2[52]); + out[12] = vqaddq_s16(step2[12], step2[51]); + out[13] = vqaddq_s16(step2[13], step2[50]); + out[14] = vqaddq_s16(step2[14], step2[49]); + out[15] = vqaddq_s16(step2[15], step2[48]); + out[16] = vqaddq_s16(step2[16], step2[47]); + out[17] = vqaddq_s16(step2[17], step2[46]); + out[18] = vqaddq_s16(step2[18], step2[45]); + out[19] = vqaddq_s16(step2[19], step2[44]); + out[20] = vqaddq_s16(step2[20], step2[43]); + out[21] = vqaddq_s16(step2[21], step2[42]); + out[22] = vqaddq_s16(step2[22], step2[41]); + out[23] = vqaddq_s16(step2[23], step2[40]); + out[24] = vqaddq_s16(step2[24], step2[39]); + out[25] = vqaddq_s16(step2[25], step2[38]); + out[26] = vqaddq_s16(step2[26], step2[37]); + out[27] = vqaddq_s16(step2[27], step2[36]); + out[28] = vqaddq_s16(step2[28], step2[35]); + out[29] = vqaddq_s16(step2[29], step2[34]); + out[30] = vqaddq_s16(step2[30], step2[33]); + out[31] = vqaddq_s16(step2[31], step2[32]); + out[32] = vqsubq_s16(step2[31], step2[32]); + out[33] = vqsubq_s16(step2[30], step2[33]); + out[34] = vqsubq_s16(step2[29], step2[34]); + out[35] = vqsubq_s16(step2[28], step2[35]); + out[36] = vqsubq_s16(step2[27], step2[36]); + out[37] = 
vqsubq_s16(step2[26], step2[37]); + out[38] = vqsubq_s16(step2[25], step2[38]); + out[39] = vqsubq_s16(step2[24], step2[39]); + out[40] = vqsubq_s16(step2[23], step2[40]); + out[41] = vqsubq_s16(step2[22], step2[41]); + out[42] = vqsubq_s16(step2[21], step2[42]); + out[43] = vqsubq_s16(step2[20], step2[43]); + out[44] = vqsubq_s16(step2[19], step2[44]); + out[45] = vqsubq_s16(step2[18], step2[45]); + out[46] = vqsubq_s16(step2[17], step2[46]); + out[47] = vqsubq_s16(step2[16], step2[47]); + out[48] = vqsubq_s16(step2[15], step2[48]); + out[49] = vqsubq_s16(step2[14], step2[49]); + out[50] = vqsubq_s16(step2[13], step2[50]); + out[51] = vqsubq_s16(step2[12], step2[51]); + out[52] = vqsubq_s16(step2[11], step2[52]); + out[53] = vqsubq_s16(step2[10], step2[53]); + out[54] = vqsubq_s16(step2[9], step2[54]); + out[55] = vqsubq_s16(step2[8], step2[55]); + out[56] = vqsubq_s16(step2[7], step2[56]); + out[57] = vqsubq_s16(step2[6], step2[57]); + out[58] = vqsubq_s16(step2[5], step2[58]); + out[59] = vqsubq_s16(step2[4], step2[59]); + out[60] = vqsubq_s16(step2[3], step2[60]); + out[61] = vqsubq_s16(step2[2], step2[61]); + out[62] = vqsubq_s16(step2[1], step2[62]); + out[63] = vqsubq_s16(step2[0], step2[63]); +} + +static INLINE void idct64_low1_neon(int16x8_t *input, int16x8_t *out, + int8_t cos_bit, int bit) { + (void)bit; + const int32_t *cospi = cospi_arr(cos_bit); + int16x8_t step1; + int32x4_t t32[2]; + + // stage 1 + // stage 2 + // stage 3 + // stage 4 + // stage 5 + // stage 6 + + t32[0] = vmull_n_s16(vget_low_s16(input[0]), cospi[32]); + t32[1] = vmull_n_s16(vget_high_s16(input[0]), cospi[32]); + + step1 = vcombine_s16(vrshrn_n_s32(t32[0], INV_COS_BIT), + vrshrn_n_s32(t32[1], INV_COS_BIT)); + // stage 7 + // stage 8 + // stage 9 + // stage 10 + // stage 11 + out[0] = step1; + out[1] = step1; + out[2] = step1; + out[3] = step1; + out[4] = step1; + out[5] = step1; + out[6] = step1; + out[7] = step1; + out[8] = step1; + out[9] = step1; + out[10] = step1; + out[11] = 
step1; + out[12] = step1; + out[13] = step1; + out[14] = step1; + out[15] = step1; + out[16] = step1; + out[17] = step1; + out[18] = step1; + out[19] = step1; + out[20] = step1; + out[21] = step1; + out[22] = step1; + out[23] = step1; + out[24] = step1; + out[25] = step1; + out[26] = step1; + out[27] = step1; + out[28] = step1; + out[29] = step1; + out[30] = step1; + out[31] = step1; + out[32] = step1; + out[33] = step1; + out[34] = step1; + out[35] = step1; + out[36] = step1; + out[37] = step1; + out[38] = step1; + out[39] = step1; + out[40] = step1; + out[41] = step1; + out[42] = step1; + out[43] = step1; + out[44] = step1; + out[45] = step1; + out[46] = step1; + out[47] = step1; + out[48] = step1; + out[49] = step1; + out[50] = step1; + out[51] = step1; + out[52] = step1; + out[53] = step1; + out[54] = step1; + out[55] = step1; + out[56] = step1; + out[57] = step1; + out[58] = step1; + out[59] = step1; + out[60] = step1; + out[61] = step1; + out[62] = step1; + out[63] = step1; +} + +static INLINE void idct64_low8_neon(int16x8_t *in, int16x8_t *out, + int8_t cos_bit, int bit) { + (void)bit; + const int32_t *cospi = cospi_arr(cos_bit); + int16x8_t step2[64], step1[64]; + + const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[4], (int16_t)cospi[60], + (int16_t)cospi[36], (int16_t)cospi[28]); + const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[20], (int16_t)cospi[44], + (int16_t)cospi[52], (int16_t)cospi[12]); + const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56], + (int16_t)cospi[40], (int16_t)cospi[24]); + const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], + (int16_t)cospi[16], (int16_t)cospi[48]); + const int16x4_t c4 = + set_s16x4_neon((int16_t)(-cospi[36]), (int16_t)(-cospi[28]), + (int16_t)(-cospi[52]), (int16_t)(-cospi[12])); + const int16x4_t c5 = + set_s16x4_neon((int16_t)(-cospi[8]), (int16_t)(-cospi[56]), + (int16_t)(-cospi[40]), (int16_t)(-cospi[24])); + const int16x4_t c6 = + 
set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]), + (int16_t)(-cospi[16]), (int16_t)(-cospi[48])); + + // stage 1 + // stage 2 + + step2[0] = in[0]; + step2[8] = in[4]; + step2[16] = in[2]; + step2[24] = in[6]; + + btf_16_neon(in[1], cospi[63], cospi[1], &step2[32], &step2[63]); + btf_16_neon(in[7], -cospi[57], cospi[7], &step2[39], &step2[56]); + btf_16_neon(in[5], cospi[59], cospi[5], &step2[40], &step2[55]); + btf_16_neon(in[3], -cospi[61], cospi[3], &step2[47], &step2[48]); + + // stage 3 + + step1[0] = step2[0]; + step1[8] = step2[8]; + + btf_16_neon(step2[16], cospi[62], cospi[2], &step1[16], &step1[31]); + btf_16_neon(step2[24], -cospi[58], cospi[6], &step1[23], &step1[24]); + + step1[32] = step2[32]; + step1[33] = step2[32]; + step1[38] = step2[39]; + step1[39] = step2[39]; + step1[40] = step2[40]; + step1[41] = step2[40]; + step1[46] = step2[47]; + step1[47] = step2[47]; + step1[48] = step2[48]; + step1[49] = step2[48]; + step1[54] = step2[55]; + step1[55] = step2[55]; + step1[56] = step2[56]; + step1[57] = step2[56]; + step1[62] = step2[63]; + step1[63] = step2[63]; + + // stage 4 + + step2[0] = step1[0]; + + btf_16_neon(step1[8], cospi[60], cospi[4], &step2[8], &step2[15]); + btf_16_lane_0_1_neon(step1[62], step1[33], c0, &step2[62], &step2[33]); + btf_16_lane_1_0_neon(step1[38], step1[57], c4, &step2[38], &step2[57]); + btf_16_lane_0_1_neon(step1[54], step1[41], c1, &step2[54], &step2[41]); + btf_16_lane_3_2_neon(step1[46], step1[49], c4, &step2[46], &step2[49]); + + step2[16] = step1[16]; + step2[17] = step1[16]; + step2[22] = step1[23]; + step2[23] = step1[23]; + step2[24] = step1[24]; + step2[25] = step1[24]; + step2[30] = step1[31]; + step2[31] = step1[31]; + step2[32] = step1[32]; + step2[39] = step1[39]; + step2[40] = step1[40]; + step2[47] = step1[47]; + step2[48] = step1[48]; + step2[55] = step1[55]; + step2[56] = step1[56]; + step2[63] = step1[63]; + + // stage 5 + + step1[0] = step2[0]; + + btf_16_lane_0_1_neon(step2[30], step2[17], 
c2, &step1[30], &step1[17]); + btf_16_lane_3_2_neon(step2[22], step2[25], c5, &step1[22], &step1[25]); + + step1[8] = step2[8]; + step1[9] = step2[8]; + step1[14] = step2[15]; + step1[15] = step2[15]; + + step1[16] = step2[16]; + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[31] = step2[31]; + step1[32] = step2[32]; + step1[33] = step2[33]; + step1[34] = step2[33]; + step1[35] = step2[32]; + step1[36] = step2[39]; + step1[37] = step2[38]; + step1[38] = step2[38]; + step1[39] = step2[39]; + step1[40] = step2[40]; + step1[41] = step2[41]; + step1[42] = step2[41]; + step1[43] = step2[40]; + step1[44] = step2[47]; + step1[45] = step2[46]; + step1[46] = step2[46]; + step1[47] = step2[47]; + step1[48] = step2[48]; + step1[49] = step2[49]; + step1[50] = step2[49]; + step1[51] = step2[48]; + step1[52] = step2[55]; + step1[53] = step2[54]; + step1[54] = step2[54]; + step1[55] = step2[55]; + step1[56] = step2[56]; + step1[57] = step2[57]; + step1[58] = step2[57]; + step1[59] = step2[56]; + step1[60] = step2[63]; + step1[61] = step2[62]; + step1[62] = step2[62]; + step1[63] = step2[63]; + + // stage 6 + + btf_16_neon(step1[0], cospi[32], cospi[32], &step2[0], &step2[1]); + btf_16_lane_2_3_neon(step1[14], step1[9], c3, &step2[14], &step2[9]); + btf_16_lane_0_1_neon(step1[61], step1[34], c2, &step2[61], &step2[34]); + btf_16_lane_0_1_neon(step1[60], step1[35], c2, &step2[60], &step2[35]); + btf_16_lane_1_0_neon(step1[36], step1[59], c5, &step2[36], &step2[59]); + btf_16_lane_1_0_neon(step1[37], step1[58], c5, &step2[37], &step2[58]); + btf_16_lane_2_3_neon(step1[53], step1[42], c2, &step2[53], &step2[42]); + btf_16_lane_2_3_neon(step1[52], step1[43], c2, &step2[52], &step2[43]); + btf_16_lane_3_2_neon(step1[44], step1[51], c5, &step2[44], &step2[51]); + btf_16_lane_3_2_neon(step1[45], step1[50], c5, &step2[45], &step2[50]); + + step2[8] = step1[8]; + step2[15] = step1[15]; + step2[16] = step1[16]; + step2[17] = step1[17]; + step2[18] = step1[17]; + step2[19] = 
step1[16]; + step2[20] = step1[23]; + step2[21] = step1[22]; + step2[22] = step1[22]; + step2[23] = step1[23]; + step2[24] = step1[24]; + step2[25] = step1[25]; + step2[26] = step1[25]; + step2[27] = step1[24]; + step2[28] = step1[31]; + step2[29] = step1[30]; + step2[30] = step1[30]; + step2[31] = step1[31]; + step2[32] = step1[32]; + step2[33] = step1[33]; + step2[38] = step1[38]; + step2[39] = step1[39]; + step2[40] = step1[40]; + step2[41] = step1[41]; + step2[46] = step1[46]; + step2[47] = step1[47]; + step2[48] = step1[48]; + step2[49] = step1[49]; + step2[54] = step1[54]; + step2[55] = step1[55]; + step2[56] = step1[56]; + step2[57] = step1[57]; + step2[62] = step1[62]; + step2[63] = step1[63]; + + // stage 7 + + btf_16_lane_2_3_neon(step2[29], step2[18], c3, &step1[29], &step1[18]); + btf_16_lane_2_3_neon(step2[28], step2[19], c3, &step1[28], &step1[19]); + btf_16_lane_3_2_neon(step2[20], step2[27], c6, &step1[20], &step1[27]); + btf_16_lane_3_2_neon(step2[21], step2[26], c6, &step1[21], &step1[26]); + + step1[0] = step2[0]; + step1[1] = step2[1]; + step1[2] = step2[1]; + step1[3] = step2[0]; + step1[8] = step2[8]; + step1[9] = step2[9]; + step1[10] = step2[9]; + step1[11] = step2[8]; + step1[12] = step2[15]; + step1[13] = step2[14]; + step1[14] = step2[14]; + step1[15] = step2[15]; + step1[16] = step2[16]; + step1[17] = step2[17]; + step1[22] = step2[22]; + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[25] = step2[25]; + step1[30] = step2[30]; + step1[31] = step2[31]; + step1[32] = vqaddq_s16(step2[32], step2[39]); + step1[33] = vqaddq_s16(step2[33], step2[38]); + step1[34] = vqaddq_s16(step2[34], step2[37]); + step1[35] = vqaddq_s16(step2[35], step2[36]); + step1[36] = vqsubq_s16(step2[35], step2[36]); + step1[37] = vqsubq_s16(step2[34], step2[37]); + step1[38] = vqsubq_s16(step2[33], step2[38]); + step1[39] = vqsubq_s16(step2[32], step2[39]); + step1[40] = vqsubq_s16(step2[47], step2[40]); + step1[41] = vqsubq_s16(step2[46], step2[41]); + 
step1[42] = vqsubq_s16(step2[45], step2[42]); + step1[43] = vqsubq_s16(step2[44], step2[43]); + step1[44] = vqaddq_s16(step2[43], step2[44]); + step1[45] = vqaddq_s16(step2[42], step2[45]); + step1[46] = vqaddq_s16(step2[41], step2[46]); + step1[47] = vqaddq_s16(step2[40], step2[47]); + step1[48] = vqaddq_s16(step2[48], step2[55]); + step1[49] = vqaddq_s16(step2[49], step2[54]); + step1[50] = vqaddq_s16(step2[50], step2[53]); + step1[51] = vqaddq_s16(step2[51], step2[52]); + step1[52] = vqsubq_s16(step2[51], step2[52]); + step1[53] = vqsubq_s16(step2[50], step2[53]); + step1[54] = vqsubq_s16(step2[49], step2[54]); + step1[55] = vqsubq_s16(step2[48], step2[55]); + step1[56] = vqsubq_s16(step2[63], step2[56]); + step1[57] = vqsubq_s16(step2[62], step2[57]); + step1[58] = vqsubq_s16(step2[61], step2[58]); + step1[59] = vqsubq_s16(step2[60], step2[59]); + step1[60] = vqaddq_s16(step2[59], step2[60]); + step1[61] = vqaddq_s16(step2[58], step2[61]); + step1[62] = vqaddq_s16(step2[57], step2[62]); + step1[63] = vqaddq_s16(step2[56], step2[63]); + + // stage 8 + + btf_16_lane_0_1_neon(step1[13], step1[10], c3, &step2[13], &step2[10]); + btf_16_lane_0_1_neon(step1[12], step1[11], c3, &step2[12], &step2[11]); + btf_16_lane_2_3_neon(step1[59], step1[36], c3, &step2[59], &step2[36]); + btf_16_lane_2_3_neon(step1[58], step1[37], c3, &step2[58], &step2[37]); + btf_16_lane_2_3_neon(step1[57], step1[38], c3, &step2[57], &step2[38]); + btf_16_lane_2_3_neon(step1[56], step1[39], c3, &step2[56], &step2[39]); + btf_16_lane_3_2_neon(step1[40], step1[55], c6, &step2[40], &step2[55]); + btf_16_lane_3_2_neon(step1[41], step1[54], c6, &step2[41], &step2[54]); + btf_16_lane_3_2_neon(step1[42], step1[53], c6, &step2[42], &step2[53]); + btf_16_lane_3_2_neon(step1[43], step1[52], c6, &step2[43], &step2[52]); + + step2[0] = step1[0]; + step2[1] = step1[1]; + step2[2] = step1[2]; + step2[3] = step1[3]; + step2[4] = step1[3]; + step2[5] = step1[2]; + step2[6] = step1[1]; + step2[7] = step1[0]; + 
step2[8] = step1[8]; + step2[9] = step1[9]; + step2[14] = step1[14]; + step2[15] = step1[15]; + step2[16] = vqaddq_s16(step1[16], step1[23]); + step2[17] = vqaddq_s16(step1[17], step1[22]); + step2[18] = vqaddq_s16(step1[18], step1[21]); + step2[19] = vqaddq_s16(step1[19], step1[20]); + step2[20] = vqsubq_s16(step1[19], step1[20]); + step2[21] = vqsubq_s16(step1[18], step1[21]); + step2[22] = vqsubq_s16(step1[17], step1[22]); + step2[23] = vqsubq_s16(step1[16], step1[23]); + step2[24] = vqsubq_s16(step1[31], step1[24]); + step2[25] = vqsubq_s16(step1[30], step1[25]); + step2[26] = vqsubq_s16(step1[29], step1[26]); + step2[27] = vqsubq_s16(step1[28], step1[27]); + step2[28] = vqaddq_s16(step1[28], step1[27]); + step2[29] = vqaddq_s16(step1[29], step1[26]); + step2[30] = vqaddq_s16(step1[30], step1[25]); + step2[31] = vqaddq_s16(step1[31], step1[24]); + step2[32] = step1[32]; + step2[33] = step1[33]; + step2[34] = step1[34]; + step2[35] = step1[35]; + step2[44] = step1[44]; + step2[45] = step1[45]; + step2[46] = step1[46]; + step2[47] = step1[47]; + step2[48] = step1[48]; + step2[49] = step1[49]; + step2[50] = step1[50]; + step2[51] = step1[51]; + step2[60] = step1[60]; + step2[61] = step1[61]; + step2[62] = step1[62]; + step2[63] = step1[63]; + + // stage 9 + idct64_stage9_neon(step2, step1, cos_bit); + + // stage 10 + idct64_stage10_neon(step1, step2, cos_bit); + + // stage 11 + + out[0] = vqaddq_s16(step2[0], step2[63]); + out[1] = vqaddq_s16(step2[1], step2[62]); + out[2] = vqaddq_s16(step2[2], step2[61]); + out[3] = vqaddq_s16(step2[3], step2[60]); + out[4] = vqaddq_s16(step2[4], step2[59]); + out[5] = vqaddq_s16(step2[5], step2[58]); + out[6] = vqaddq_s16(step2[6], step2[57]); + out[7] = vqaddq_s16(step2[7], step2[56]); + out[8] = vqaddq_s16(step2[8], step2[55]); + out[9] = vqaddq_s16(step2[9], step2[54]); + out[10] = vqaddq_s16(step2[10], step2[53]); + out[11] = vqaddq_s16(step2[11], step2[52]); + out[12] = vqaddq_s16(step2[12], step2[51]); + out[13] = 
vqaddq_s16(step2[13], step2[50]); + out[14] = vqaddq_s16(step2[14], step2[49]); + out[15] = vqaddq_s16(step2[15], step2[48]); + out[16] = vqaddq_s16(step2[16], step2[47]); + out[17] = vqaddq_s16(step2[17], step2[46]); + out[18] = vqaddq_s16(step2[18], step2[45]); + out[19] = vqaddq_s16(step2[19], step2[44]); + out[20] = vqaddq_s16(step2[20], step2[43]); + out[21] = vqaddq_s16(step2[21], step2[42]); + out[22] = vqaddq_s16(step2[22], step2[41]); + out[23] = vqaddq_s16(step2[23], step2[40]); + out[24] = vqaddq_s16(step2[24], step2[39]); + out[25] = vqaddq_s16(step2[25], step2[38]); + out[26] = vqaddq_s16(step2[26], step2[37]); + out[27] = vqaddq_s16(step2[27], step2[36]); + out[28] = vqaddq_s16(step2[28], step2[35]); + out[29] = vqaddq_s16(step2[29], step2[34]); + out[30] = vqaddq_s16(step2[30], step2[33]); + out[31] = vqaddq_s16(step2[31], step2[32]); + out[32] = vqsubq_s16(step2[31], step2[32]); + out[33] = vqsubq_s16(step2[30], step2[33]); + out[34] = vqsubq_s16(step2[29], step2[34]); + out[35] = vqsubq_s16(step2[28], step2[35]); + out[36] = vqsubq_s16(step2[27], step2[36]); + out[37] = vqsubq_s16(step2[26], step2[37]); + out[38] = vqsubq_s16(step2[25], step2[38]); + out[39] = vqsubq_s16(step2[24], step2[39]); + out[40] = vqsubq_s16(step2[23], step2[40]); + out[41] = vqsubq_s16(step2[22], step2[41]); + out[42] = vqsubq_s16(step2[21], step2[42]); + out[43] = vqsubq_s16(step2[20], step2[43]); + out[44] = vqsubq_s16(step2[19], step2[44]); + out[45] = vqsubq_s16(step2[18], step2[45]); + out[46] = vqsubq_s16(step2[17], step2[46]); + out[47] = vqsubq_s16(step2[16], step2[47]); + out[48] = vqsubq_s16(step2[15], step2[48]); + out[49] = vqsubq_s16(step2[14], step2[49]); + out[50] = vqsubq_s16(step2[13], step2[50]); + out[51] = vqsubq_s16(step2[12], step2[51]); + out[52] = vqsubq_s16(step2[11], step2[52]); + out[53] = vqsubq_s16(step2[10], step2[53]); + out[54] = vqsubq_s16(step2[9], step2[54]); + out[55] = vqsubq_s16(step2[8], step2[55]); + out[56] = vqsubq_s16(step2[7], 
step2[56]); + out[57] = vqsubq_s16(step2[6], step2[57]); + out[58] = vqsubq_s16(step2[5], step2[58]); + out[59] = vqsubq_s16(step2[4], step2[59]); + out[60] = vqsubq_s16(step2[3], step2[60]); + out[61] = vqsubq_s16(step2[2], step2[61]); + out[62] = vqsubq_s16(step2[1], step2[62]); + out[63] = vqsubq_s16(step2[0], step2[63]); +} + +static INLINE void idct64_low16_neon(int16x8_t *in, int16x8_t *out, + int8_t cos_bit, int bit) { + (void)bit; + const int32_t *cospi = cospi_arr(cos_bit); + int16x8_t step2[64], step1[64]; + + const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[4], (int16_t)cospi[60], + (int16_t)cospi[36], (int16_t)cospi[28]); + const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[20], (int16_t)cospi[44], + (int16_t)cospi[52], (int16_t)cospi[12]); + const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56], + (int16_t)cospi[40], (int16_t)cospi[24]); + const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], + (int16_t)cospi[16], (int16_t)cospi[48]); + const int16x4_t c4 = + set_s16x4_neon((int16_t)(-cospi[4]), (int16_t)(-cospi[60]), + (int16_t)(-cospi[36]), (int16_t)(-cospi[28])); + const int16x4_t c5 = + set_s16x4_neon((int16_t)(-cospi[20]), (int16_t)(-cospi[44]), + (int16_t)(-cospi[52]), (int16_t)(-cospi[12])); + const int16x4_t c6 = + set_s16x4_neon((int16_t)(-cospi[8]), (int16_t)(-cospi[56]), + (int16_t)(-cospi[40]), (int16_t)(-cospi[24])); + const int16x4_t c7 = + set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]), + (int16_t)(-cospi[16]), (int16_t)(-cospi[48])); + + // stage 1 + // stage 2 + + step2[0] = in[0]; + step2[4] = in[8]; + step2[8] = in[4]; + step2[12] = in[12]; + step2[16] = in[2]; + step2[20] = in[10]; + step2[24] = in[6]; + step2[28] = in[14]; + + btf_16_neon(in[1], cospi[63], cospi[1], &step2[32], &step2[63]); + btf_16_neon(in[15], -cospi[49], cospi[15], &step2[35], &step2[60]); + btf_16_neon(in[9], cospi[55], cospi[9], &step2[36], &step2[59]); + btf_16_neon(in[7], -cospi[57], cospi[7], 
&step2[39], &step2[56]); + btf_16_neon(in[5], cospi[59], cospi[5], &step2[40], &step2[55]); + btf_16_neon(in[11], -cospi[53], cospi[11], &step2[43], &step2[52]); + btf_16_neon(in[13], cospi[51], cospi[13], &step2[44], &step2[51]); + btf_16_neon(in[3], -cospi[61], cospi[3], &step2[47], &step2[48]); + + // stage 3 + + step1[0] = step2[0]; + step1[4] = step2[4]; + step1[8] = step2[8]; + step1[12] = step2[12]; + + btf_16_neon(step2[16], cospi[62], cospi[2], &step1[16], &step1[31]); + btf_16_neon(step2[20], cospi[54], cospi[10], &step1[20], &step1[27]); + btf_16_neon(step2[24], -cospi[58], cospi[6], &step1[23], &step1[24]); + btf_16_neon(step2[28], -cospi[50], cospi[14], &step1[19], &step1[28]); + + step1[32] = step2[32]; + step1[33] = step2[32]; + step1[34] = step2[35]; + step1[35] = step2[35]; + step1[36] = step2[36]; + step1[37] = step2[36]; + step1[38] = step2[39]; + step1[39] = step2[39]; + step1[40] = step2[40]; + step1[41] = step2[40]; + step1[42] = step2[43]; + step1[43] = step2[43]; + step1[44] = step2[44]; + step1[45] = step2[44]; + step1[46] = step2[47]; + step1[47] = step2[47]; + step1[48] = step2[48]; + step1[49] = step2[48]; + step1[50] = step2[51]; + step1[51] = step2[51]; + step1[52] = step2[52]; + step1[53] = step2[52]; + step1[54] = step2[55]; + step1[55] = step2[55]; + step1[56] = step2[56]; + step1[57] = step2[56]; + step1[58] = step2[59]; + step1[59] = step2[59]; + step1[60] = step2[60]; + step1[61] = step2[60]; + step1[62] = step2[63]; + step1[63] = step2[63]; + + // stage 4 + + step2[0] = step1[0]; + step2[4] = step1[4]; + + btf_16_neon(step1[8], cospi[60], cospi[4], &step2[8], &step2[15]); + btf_16_neon(step1[12], -cospi[52], cospi[12], &step2[11], &step2[12]); + btf_16_lane_0_1_neon(step1[62], step1[33], c0, &step2[62], &step2[33]); + btf_16_lane_1_0_neon(step1[34], step1[61], c4, &step2[34], &step2[61]); + btf_16_lane_2_3_neon(step1[58], step1[37], c0, &step2[58], &step2[37]); + btf_16_lane_3_2_neon(step1[38], step1[57], c4, &step2[38], 
&step2[57]); + btf_16_lane_0_1_neon(step1[54], step1[41], c1, &step2[54], &step2[41]); + btf_16_lane_1_0_neon(step1[42], step1[53], c5, &step2[42], &step2[53]); + btf_16_lane_2_3_neon(step1[50], step1[45], c1, &step2[50], &step2[45]); + btf_16_lane_3_2_neon(step1[46], step1[49], c5, &step2[46], &step2[49]); + + step2[16] = step1[16]; + step2[17] = step1[16]; + step2[18] = step1[19]; + step2[19] = step1[19]; + step2[20] = step1[20]; + step2[21] = step1[20]; + step2[22] = step1[23]; + step2[23] = step1[23]; + step2[24] = step1[24]; + step2[25] = step1[24]; + step2[26] = step1[27]; + step2[27] = step1[27]; + step2[28] = step1[28]; + step2[29] = step1[28]; + step2[30] = step1[31]; + step2[31] = step1[31]; + step2[32] = step1[32]; + step2[35] = step1[35]; + step2[36] = step1[36]; + step2[39] = step1[39]; + step2[40] = step1[40]; + step2[43] = step1[43]; + step2[44] = step1[44]; + step2[47] = step1[47]; + step2[48] = step1[48]; + step2[51] = step1[51]; + step2[52] = step1[52]; + step2[55] = step1[55]; + step2[56] = step1[56]; + step2[59] = step1[59]; + step2[60] = step1[60]; + step2[63] = step1[63]; + + // stage 5 + + step1[0] = step2[0]; + + btf_16_neon(step2[4], cospi[56], cospi[8], &step1[4], &step1[7]); + btf_16_lane_0_1_neon(step2[30], step2[17], c2, &step1[30], &step1[17]); + btf_16_lane_1_0_neon(step2[18], step2[29], c6, &step1[18], &step1[29]); + btf_16_lane_2_3_neon(step2[26], step2[21], c2, &step1[26], &step1[21]); + btf_16_lane_3_2_neon(step2[22], step2[25], c6, &step1[22], &step1[25]); + + step1[8] = step2[8]; + step1[9] = step2[8]; + step1[10] = step2[11]; + step1[11] = step2[11]; + step1[12] = step2[12]; + step1[13] = step2[12]; + step1[14] = step2[15]; + step1[15] = step2[15]; + step1[16] = step2[16]; + step1[19] = step2[19]; + step1[20] = step2[20]; + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[27] = step2[27]; + step1[28] = step2[28]; + step1[31] = step2[31]; + step1[32] = vqaddq_s16(step2[32], step2[35]); + step1[33] = vqaddq_s16(step2[33], 
step2[34]); + step1[34] = vqsubq_s16(step2[33], step2[34]); + step1[35] = vqsubq_s16(step2[32], step2[35]); + step1[36] = vqsubq_s16(step2[39], step2[36]); + step1[37] = vqsubq_s16(step2[38], step2[37]); + step1[38] = vqaddq_s16(step2[38], step2[37]); + step1[39] = vqaddq_s16(step2[39], step2[36]); + step1[40] = vqaddq_s16(step2[40], step2[43]); + step1[41] = vqaddq_s16(step2[41], step2[42]); + step1[42] = vqsubq_s16(step2[41], step2[42]); + step1[43] = vqsubq_s16(step2[40], step2[43]); + step1[44] = vqsubq_s16(step2[47], step2[44]); + step1[45] = vqsubq_s16(step2[46], step2[45]); + step1[46] = vqaddq_s16(step2[46], step2[45]); + step1[47] = vqaddq_s16(step2[47], step2[44]); + step1[48] = vqaddq_s16(step2[48], step2[51]); + step1[49] = vqaddq_s16(step2[49], step2[50]); + step1[50] = vqsubq_s16(step2[49], step2[50]); + step1[51] = vqsubq_s16(step2[48], step2[51]); + step1[52] = vqsubq_s16(step2[55], step2[52]); + step1[53] = vqsubq_s16(step2[54], step2[53]); + step1[54] = vqaddq_s16(step2[54], step2[53]); + step1[55] = vqaddq_s16(step2[55], step2[52]); + step1[56] = vqaddq_s16(step2[56], step2[59]); + step1[57] = vqaddq_s16(step2[57], step2[58]); + step1[58] = vqsubq_s16(step2[57], step2[58]); + step1[59] = vqsubq_s16(step2[56], step2[59]); + step1[60] = vqsubq_s16(step2[63], step2[60]); + step1[61] = vqsubq_s16(step2[62], step2[61]); + step1[62] = vqaddq_s16(step2[62], step2[61]); + step1[63] = vqaddq_s16(step2[63], step2[60]); + + // stage 6 + + btf_16_neon(step1[0], cospi[32], cospi[32], &step2[0], &step2[1]); + btf_16_lane_2_3_neon(step1[14], step1[9], c3, &step2[14], &step2[9]); + btf_16_lane_3_2_neon(step1[10], step1[13], c7, &step2[10], &step2[13]); + btf_16_lane_0_1_neon(step1[61], step1[34], c2, &step2[61], &step2[34]); + btf_16_lane_0_1_neon(step1[60], step1[35], c2, &step2[60], &step2[35]); + btf_16_lane_1_0_neon(step1[36], step1[59], c6, &step2[36], &step2[59]); + btf_16_lane_1_0_neon(step1[37], step1[58], c6, &step2[37], &step2[58]); + 
btf_16_lane_2_3_neon(step1[53], step1[42], c2, &step2[53], &step2[42]); + btf_16_lane_2_3_neon(step1[52], step1[43], c2, &step2[52], &step2[43]); + btf_16_lane_3_2_neon(step1[44], step1[51], c6, &step2[44], &step2[51]); + btf_16_lane_3_2_neon(step1[45], step1[50], c6, &step2[45], &step2[50]); + + step2[4] = step1[4]; + step2[5] = step1[4]; + step2[6] = step1[7]; + step2[7] = step1[7]; + step2[8] = step1[8]; + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; + step2[16] = vqaddq_s16(step1[16], step1[19]); + step2[17] = vqaddq_s16(step1[17], step1[18]); + step2[18] = vqsubq_s16(step1[17], step1[18]); + step2[19] = vqsubq_s16(step1[16], step1[19]); + step2[20] = vqsubq_s16(step1[23], step1[20]); + step2[21] = vqsubq_s16(step1[22], step1[21]); + step2[22] = vqaddq_s16(step1[22], step1[21]); + step2[23] = vqaddq_s16(step1[23], step1[20]); + step2[24] = vqaddq_s16(step1[24], step1[27]); + step2[25] = vqaddq_s16(step1[25], step1[26]); + step2[26] = vqsubq_s16(step1[25], step1[26]); + step2[27] = vqsubq_s16(step1[24], step1[27]); + step2[28] = vqsubq_s16(step1[31], step1[28]); + step2[29] = vqsubq_s16(step1[30], step1[29]); + step2[30] = vqaddq_s16(step1[30], step1[29]); + step2[31] = vqaddq_s16(step1[31], step1[28]); + step2[32] = step1[32]; + step2[33] = step1[33]; + step2[38] = step1[38]; + step2[39] = step1[39]; + step2[40] = step1[40]; + step2[41] = step1[41]; + step2[46] = step1[46]; + step2[47] = step1[47]; + step2[48] = step1[48]; + step2[49] = step1[49]; + step2[54] = step1[54]; + step2[55] = step1[55]; + step2[56] = step1[56]; + step2[57] = step1[57]; + step2[62] = step1[62]; + step2[63] = step1[63]; + + // stage 7 + + btf_16_lane_0_1_neon(step2[6], step2[5], c3, &step1[6], &step1[5]); + btf_16_lane_2_3_neon(step2[29], step2[18], c3, &step1[29], &step1[18]); + btf_16_lane_2_3_neon(step2[28], step2[19], c3, &step1[28], &step1[19]); + btf_16_lane_3_2_neon(step2[20], step2[27], c7, &step1[20], &step1[27]); + btf_16_lane_3_2_neon(step2[21], 
step2[26], c7, &step1[21], &step1[26]); + + step1[0] = step2[0]; + step1[1] = step2[1]; + step1[2] = step2[1]; + step1[3] = step2[0]; + step1[4] = step2[4]; + step1[7] = step2[7]; + step1[8] = vqaddq_s16(step2[8], step2[11]); + step1[9] = vqaddq_s16(step2[9], step2[10]); + step1[10] = vqsubq_s16(step2[9], step2[10]); + step1[11] = vqsubq_s16(step2[8], step2[11]); + step1[12] = vqsubq_s16(step2[15], step2[12]); + step1[13] = vqsubq_s16(step2[14], step2[13]); + step1[14] = vqaddq_s16(step2[14], step2[13]); + step1[15] = vqaddq_s16(step2[15], step2[12]); + step1[16] = step2[16]; + step1[17] = step2[17]; + step1[22] = step2[22]; + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[25] = step2[25]; + step1[30] = step2[30]; + step1[31] = step2[31]; + step1[32] = vqaddq_s16(step2[32], step2[39]); + step1[33] = vqaddq_s16(step2[33], step2[38]); + step1[34] = vqaddq_s16(step2[34], step2[37]); + step1[35] = vqaddq_s16(step2[35], step2[36]); + step1[36] = vqsubq_s16(step2[35], step2[36]); + step1[37] = vqsubq_s16(step2[34], step2[37]); + step1[38] = vqsubq_s16(step2[33], step2[38]); + step1[39] = vqsubq_s16(step2[32], step2[39]); + step1[40] = vqsubq_s16(step2[47], step2[40]); + step1[41] = vqsubq_s16(step2[46], step2[41]); + step1[42] = vqsubq_s16(step2[45], step2[42]); + step1[43] = vqsubq_s16(step2[44], step2[43]); + step1[44] = vqaddq_s16(step2[43], step2[44]); + step1[45] = vqaddq_s16(step2[42], step2[45]); + step1[46] = vqaddq_s16(step2[41], step2[46]); + step1[47] = vqaddq_s16(step2[40], step2[47]); + step1[48] = vqaddq_s16(step2[48], step2[55]); + step1[49] = vqaddq_s16(step2[49], step2[54]); + step1[50] = vqaddq_s16(step2[50], step2[53]); + step1[51] = vqaddq_s16(step2[51], step2[52]); + step1[52] = vqsubq_s16(step2[51], step2[52]); + step1[53] = vqsubq_s16(step2[50], step2[53]); + step1[54] = vqsubq_s16(step2[49], step2[54]); + step1[55] = vqsubq_s16(step2[48], step2[55]); + step1[56] = vqsubq_s16(step2[63], step2[56]); + step1[57] = vqsubq_s16(step2[62], 
step2[57]); + step1[58] = vqsubq_s16(step2[61], step2[58]); + step1[59] = vqsubq_s16(step2[60], step2[59]); + step1[60] = vqaddq_s16(step2[59], step2[60]); + step1[61] = vqaddq_s16(step2[58], step2[61]); + step1[62] = vqaddq_s16(step2[57], step2[62]); + step1[63] = vqaddq_s16(step2[56], step2[63]); + + // stage 8 + + btf_16_lane_0_1_neon(step1[13], step1[10], c3, &step2[13], &step2[10]); + btf_16_lane_0_1_neon(step1[12], step1[11], c3, &step2[12], &step2[11]); + btf_16_lane_2_3_neon(step1[59], step1[36], c3, &step2[59], &step2[36]); + btf_16_lane_2_3_neon(step1[58], step1[37], c3, &step2[58], &step2[37]); + btf_16_lane_2_3_neon(step1[57], step1[38], c3, &step2[57], &step2[38]); + btf_16_lane_2_3_neon(step1[56], step1[39], c3, &step2[56], &step2[39]); + btf_16_lane_3_2_neon(step1[40], step1[55], c7, &step2[40], &step2[55]); + btf_16_lane_3_2_neon(step1[41], step1[54], c7, &step2[41], &step2[54]); + btf_16_lane_3_2_neon(step1[42], step1[53], c7, &step2[42], &step2[53]); + btf_16_lane_3_2_neon(step1[43], step1[52], c7, &step2[43], &step2[52]); + + step2[0] = vqaddq_s16(step1[0], step1[7]); + step2[1] = vqaddq_s16(step1[1], step1[6]); + step2[2] = vqaddq_s16(step1[2], step1[5]); + step2[3] = vqaddq_s16(step1[3], step1[4]); + step2[4] = vqsubq_s16(step1[3], step1[4]); + step2[5] = vqsubq_s16(step1[2], step1[5]); + step2[6] = vqsubq_s16(step1[1], step1[6]); + step2[7] = vqsubq_s16(step1[0], step1[7]); + step2[8] = step1[8]; + step2[9] = step1[9]; + step2[14] = step1[14]; + step2[15] = step1[15]; + step2[16] = vqaddq_s16(step1[16], step1[23]); + step2[17] = vqaddq_s16(step1[17], step1[22]); + step2[18] = vqaddq_s16(step1[18], step1[21]); + step2[19] = vqaddq_s16(step1[19], step1[20]); + step2[20] = vqsubq_s16(step1[19], step1[20]); + step2[21] = vqsubq_s16(step1[18], step1[21]); + step2[22] = vqsubq_s16(step1[17], step1[22]); + step2[23] = vqsubq_s16(step1[16], step1[23]); + step2[24] = vqsubq_s16(step1[31], step1[24]); + step2[25] = vqsubq_s16(step1[30], step1[25]); + 
step2[26] = vqsubq_s16(step1[29], step1[26]); + step2[27] = vqsubq_s16(step1[28], step1[27]); + step2[28] = vqaddq_s16(step1[28], step1[27]); + step2[29] = vqaddq_s16(step1[29], step1[26]); + step2[30] = vqaddq_s16(step1[30], step1[25]); + step2[31] = vqaddq_s16(step1[31], step1[24]); + step2[32] = step1[32]; + step2[33] = step1[33]; + step2[34] = step1[34]; + step2[35] = step1[35]; + step2[44] = step1[44]; + step2[45] = step1[45]; + step2[46] = step1[46]; + step2[47] = step1[47]; + step2[48] = step1[48]; + step2[49] = step1[49]; + step2[50] = step1[50]; + step2[51] = step1[51]; + step2[60] = step1[60]; + step2[61] = step1[61]; + step2[62] = step1[62]; + step2[63] = step1[63]; + + // stage 9 + idct64_stage9_neon(step2, step1, cos_bit); + + // stage 10 + idct64_stage10_neon(step1, step2, cos_bit); + + // stage 11 + + out[0] = vqaddq_s16(step2[0], step2[63]); + out[1] = vqaddq_s16(step2[1], step2[62]); + out[2] = vqaddq_s16(step2[2], step2[61]); + out[3] = vqaddq_s16(step2[3], step2[60]); + out[4] = vqaddq_s16(step2[4], step2[59]); + out[5] = vqaddq_s16(step2[5], step2[58]); + out[6] = vqaddq_s16(step2[6], step2[57]); + out[7] = vqaddq_s16(step2[7], step2[56]); + out[8] = vqaddq_s16(step2[8], step2[55]); + out[9] = vqaddq_s16(step2[9], step2[54]); + out[10] = vqaddq_s16(step2[10], step2[53]); + out[11] = vqaddq_s16(step2[11], step2[52]); + out[12] = vqaddq_s16(step2[12], step2[51]); + out[13] = vqaddq_s16(step2[13], step2[50]); + out[14] = vqaddq_s16(step2[14], step2[49]); + out[15] = vqaddq_s16(step2[15], step2[48]); + out[16] = vqaddq_s16(step2[16], step2[47]); + out[17] = vqaddq_s16(step2[17], step2[46]); + out[18] = vqaddq_s16(step2[18], step2[45]); + out[19] = vqaddq_s16(step2[19], step2[44]); + out[20] = vqaddq_s16(step2[20], step2[43]); + out[21] = vqaddq_s16(step2[21], step2[42]); + out[22] = vqaddq_s16(step2[22], step2[41]); + out[23] = vqaddq_s16(step2[23], step2[40]); + out[24] = vqaddq_s16(step2[24], step2[39]); + out[25] = vqaddq_s16(step2[25], 
step2[38]); + out[26] = vqaddq_s16(step2[26], step2[37]); + out[27] = vqaddq_s16(step2[27], step2[36]); + out[28] = vqaddq_s16(step2[28], step2[35]); + out[29] = vqaddq_s16(step2[29], step2[34]); + out[30] = vqaddq_s16(step2[30], step2[33]); + out[31] = vqaddq_s16(step2[31], step2[32]); + out[32] = vqsubq_s16(step2[31], step2[32]); + out[33] = vqsubq_s16(step2[30], step2[33]); + out[34] = vqsubq_s16(step2[29], step2[34]); + out[35] = vqsubq_s16(step2[28], step2[35]); + out[36] = vqsubq_s16(step2[27], step2[36]); + out[37] = vqsubq_s16(step2[26], step2[37]); + out[38] = vqsubq_s16(step2[25], step2[38]); + out[39] = vqsubq_s16(step2[24], step2[39]); + out[40] = vqsubq_s16(step2[23], step2[40]); + out[41] = vqsubq_s16(step2[22], step2[41]); + out[42] = vqsubq_s16(step2[21], step2[42]); + out[43] = vqsubq_s16(step2[20], step2[43]); + out[44] = vqsubq_s16(step2[19], step2[44]); + out[45] = vqsubq_s16(step2[18], step2[45]); + out[46] = vqsubq_s16(step2[17], step2[46]); + out[47] = vqsubq_s16(step2[16], step2[47]); + out[48] = vqsubq_s16(step2[15], step2[48]); + out[49] = vqsubq_s16(step2[14], step2[49]); + out[50] = vqsubq_s16(step2[13], step2[50]); + out[51] = vqsubq_s16(step2[12], step2[51]); + out[52] = vqsubq_s16(step2[11], step2[52]); + out[53] = vqsubq_s16(step2[10], step2[53]); + out[54] = vqsubq_s16(step2[9], step2[54]); + out[55] = vqsubq_s16(step2[8], step2[55]); + out[56] = vqsubq_s16(step2[7], step2[56]); + out[57] = vqsubq_s16(step2[6], step2[57]); + out[58] = vqsubq_s16(step2[5], step2[58]); + out[59] = vqsubq_s16(step2[4], step2[59]); + out[60] = vqsubq_s16(step2[3], step2[60]); + out[61] = vqsubq_s16(step2[2], step2[61]); + out[62] = vqsubq_s16(step2[1], step2[62]); + out[63] = vqsubq_s16(step2[0], step2[63]); +} // Functions for blocks with eob at DC and within // topleft 8x8, 16x16, 32x32 corner -static const transform_1d_neon - lowbd_txfm_all_1d_zeros_w8_arr[TX_SIZES][ITX_TYPES_1D][4] = { - { - { av1_idct4_new, av1_idct4_new, NULL, NULL }, - { 
av1_iadst4_new, av1_iadst4_new, NULL, NULL }, - { av1_iidentity4_c, av1_iidentity4_c, NULL, NULL }, - }, - { { av1_idct8_new, av1_idct8_new, NULL, NULL }, - { av1_iadst8_new, av1_iadst8_new, NULL, NULL }, - { av1_iidentity8_c, av1_iidentity8_c, NULL, NULL } }, - { - { av1_idct16_new, av1_idct16_new, av1_idct16_new, NULL }, - { av1_iadst16_new, av1_iadst16_new, av1_iadst16_new, NULL }, - { av1_iidentity16_c, av1_iidentity16_c, av1_iidentity16_c, NULL }, - }, - { { av1_idct32_new, av1_idct32_new, av1_idct32_new, av1_idct32_new }, - { NULL, NULL, NULL, NULL }, - { av1_iidentity32_c, av1_iidentity32_c, av1_iidentity32_c, - av1_iidentity32_c } }, - { { av1_idct64_new, av1_idct64_new, av1_idct64_new, av1_idct64_new }, - { NULL, NULL, NULL, NULL }, - { NULL, NULL, NULL, NULL } } - }; - static const transform_neon lowbd_txfm_all_1d_zeros_w_arr[TX_SIZES][ITX_TYPES_1D][4] = { { @@ -2120,108 +3574,35 @@ static const transform_neon { NULL, NULL, NULL, NULL }, { NULL, NULL, NULL, NULL }, }, - { { idct8_low1_new_neon, idct8_new_neon, NULL, NULL }, - { iadst8_low1_new_neon, iadst8_new_neon, NULL, NULL }, - { identity8_new_neon, identity8_new_neon, NULL, NULL } }, + { { idct8_low1_neon, idct8_neon, NULL, NULL }, + { iadst8_low1_neon, iadst8_neon, NULL, NULL }, + { NULL, NULL, NULL, NULL } }, { - { idct16_low1_new_neon, idct16_low8_new_neon, idct16_new_neon, NULL }, - { iadst16_low1_new_neon, iadst16_low8_new_neon, iadst16_new_neon, - NULL }, - { identity16_new_neon, identity16_new_neon, identity16_new_neon, - NULL }, + { idct16_low1_neon, idct16_low8_neon, idct16_neon, NULL }, + { iadst16_low1_neon, iadst16_low8_neon, iadst16_neon, NULL }, + { NULL, NULL, NULL, NULL }, }, - { { idct32_low1_new_neon, idct32_low8_new_neon, idct32_low16_new_neon, - idct32_new_neon }, + { { idct32_low1_neon, idct32_low8_neon, idct32_low16_neon, idct32_neon }, { NULL, NULL, NULL, NULL }, - { identity32_new_neon, identity32_new_neon, identity32_new_neon, - identity32_new_neon } }, - { { NULL, NULL, 
NULL, NULL }, + { NULL, NULL, NULL, NULL } }, + { { idct64_low1_neon, idct64_low8_neon, idct64_low16_neon, + idct64_low32_neon }, { NULL, NULL, NULL, NULL }, { NULL, NULL, NULL, NULL } } }; -static INLINE void lowbd_inv_txfm2d_add_wxh_idtx_neon( - const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, - TX_SIZE tx_size, int eob) { - DECLARE_ALIGNED(32, int, txfm_buf[32 * 32 + 32 + 32]); - int32_t *temp_in = txfm_buf; - - int eobx, eoby; - get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob); - const int8_t *shift = inv_txfm_shift_ls[tx_size]; - const int txw_idx = get_txw_idx(tx_size); - const int txh_idx = get_txh_idx(tx_size); - const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx]; - const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx]; - const int txfm_size_col = tx_size_wide[tx_size]; - const int txfm_size_row = tx_size_high[tx_size]; - const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3; - - const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); - const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col); - - int32_t *temp_out = temp_in + buf_offset; - int32_t *buf = temp_out + buf_offset; - int32_t *buf_ptr = buf; - const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16 }; - int r, bd = 8; - - const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx]; - const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby]; - const transform_1d_neon row_txfm = - lowbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x]; - const transform_1d_neon col_txfm = - lowbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y]; - - assert(col_txfm != NULL); - assert(row_txfm != NULL); - - // row tx - int row_start = (buf_size_nonzero_h_div8 * 8); - for (int i = 0; i < row_start; i++) { - if (abs(rect_type) == 1) { - for (int j = 0; j < txfm_size_col; j++) - temp_in[j] = round_shift((int64_t)input[j] * NewInvSqrt2, NewSqrt2Bits); - row_txfm(temp_in, buf_ptr, cos_bit_row, stage_range); - } else { - row_txfm(input, 
buf_ptr, cos_bit_row, stage_range); - } - av1_round_shift_array(buf_ptr, txfm_size_col, -shift[0]); - input += txfm_size_col; - buf_ptr += txfm_size_col; - } - - // Doing memset for the rows which are not processed in row transform. - memset(buf_ptr, 0, - sizeof(int32_t) * txfm_size_col * (txfm_size_row - row_start)); - - // col tx - for (int c = 0; c < txfm_size_col; c++) { - for (r = 0; r < txfm_size_row; ++r) temp_in[r] = buf[r * txfm_size_col + c]; - - col_txfm(temp_in, temp_out, cos_bit_col, stage_range); - av1_round_shift_array(temp_out, txfm_size_row, -shift[1]); - - for (r = 0; r < txfm_size_row; ++r) { - output[r * stride + c] = - highbd_clip_pixel_add(output[r * stride + c], temp_out[r], bd); - } - } -} - static INLINE void lowbd_inv_txfm2d_add_idtx_neon(const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, TX_SIZE tx_size, int eob) { + (void)tx_type; int16x8_t a[32 * 4]; int16x8_t b[32 * 4]; int eobx, eoby; get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob); - const int8_t *shift = inv_txfm_shift_ls[tx_size]; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; const int txw_idx = get_txw_idx(tx_size); const int txh_idx = get_txh_idx(tx_size); - const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx]; - const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx]; const int txfm_size_col = tx_size_wide[tx_size]; const int txfm_size_row = tx_size_high[tx_size]; lowbd_inv_txfm2d_memset_neon(&a[0], (txfm_size_col * (txfm_size_row) >> 3), @@ -2232,17 +3613,8 @@ static INLINE void lowbd_inv_txfm2d_add_idtx_neon(const int32_t *input, const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3; const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3; - const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx]; - const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby]; const int32_t *input_1; int temp_b = 0; - const transform_neon row_txfm = - 
lowbd_txfm_all_1d_zeros_w_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x]; - const transform_neon col_txfm = - lowbd_txfm_all_1d_zeros_w_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y]; - - assert(col_txfm != NULL); - assert(row_txfm != NULL); for (int i = 0; i < buf_size_nonzero_h_div8; i++) { input_1 = input; @@ -2257,9 +3629,8 @@ static INLINE void lowbd_inv_txfm2d_add_idtx_neon(const int32_t *input, int y = i * txfm_size_col; round_shift_for_rect(&a[y], &a[y], txfm_size_col); } - row_txfm(&a[i * txfm_size_col], &a[i * txfm_size_col], cos_bit_row, 0); - av1_round_shift_array_16_neon(&a[i * txfm_size_col], txfm_size_col, - -shift[0]); + identity_txfm_round_neon(&a[i * txfm_size_col], &a[i * txfm_size_col], + txw_idx, txfm_size_col, -shift[0]); for (int j = 0; j < buf_size_w_div8; ++j) { int k = j * 8 + i * txfm_size_col; transpose_s16_8x8q(&a[k], &b[temp_b + txfm_size_row * j]); @@ -2267,9 +3638,8 @@ static INLINE void lowbd_inv_txfm2d_add_idtx_neon(const int32_t *input, temp_b += 8; } for (int j = 0; j < buf_size_w_div8; ++j) { - col_txfm(&b[j * txfm_size_row], &b[j * txfm_size_row], cos_bit_col, 0); - av1_round_shift_array_16_neon(&b[j * txfm_size_row], txfm_size_row, - -shift[1]); + identity_txfm_round_neon(&b[j * txfm_size_row], &b[j * txfm_size_row], + txh_idx, txfm_size_row, -shift[1]); } if (txfm_size_col >= 16) { for (int i = 0; i < (txfm_size_col >> 4); i++) { @@ -2281,90 +3651,6 @@ static INLINE void lowbd_inv_txfm2d_add_idtx_neon(const int32_t *input, } } -static INLINE void lowbd_inv_txfm2d_add_v_wxh_identity_neon( - const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, - TX_SIZE tx_size, int eob) { - DECLARE_ALIGNED(32, int, txfm_buf[32 * 32 + 32 + 32]); - int32_t *temp_in = txfm_buf; - - int eobx, eoby; - get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob); - const int8_t *shift = inv_txfm_shift_ls[tx_size]; - const int txw_idx = get_txw_idx(tx_size); - const int txh_idx = get_txh_idx(tx_size); - const int cos_bit_col = 
inv_cos_bit_col[txw_idx][txh_idx]; - const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx]; - const int txfm_size_col = tx_size_wide[tx_size]; - const int txfm_size_row = tx_size_high[tx_size]; - const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3; - - const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); - const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col); - - int32_t *temp_out = temp_in + buf_offset; - int32_t *buf = temp_out + buf_offset; - int32_t *buf_ptr = buf; - const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16 }; - int r, bd = 8; - - const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx]; - const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby]; - const transform_1d_neon row_txfm = - lowbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x]; - const transform_1d_neon col_txfm = - lowbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y]; - - assert(col_txfm != NULL); - assert(row_txfm != NULL); - int ud_flip, lr_flip; - get_flip_cfg(tx_type, &ud_flip, &lr_flip); - - // row tx - int row_start = (buf_size_nonzero_h_div8 * 8); - for (int i = 0; i < row_start; i++) { - if (abs(rect_type) == 1) { - for (int j = 0; j < txfm_size_col; j++) - temp_in[j] = round_shift((int64_t)input[j] * NewInvSqrt2, NewSqrt2Bits); - row_txfm(temp_in, buf_ptr, cos_bit_row, stage_range); - } else { - row_txfm(input, buf_ptr, cos_bit_row, stage_range); - } - av1_round_shift_array(buf_ptr, txfm_size_col, -shift[0]); - input += txfm_size_col; - buf_ptr += txfm_size_col; - } - // Doing memset for the rows which are not processed in row transform. 
- memset(buf_ptr, 0, - sizeof(int32_t) * txfm_size_col * (txfm_size_row - row_start)); - - // col tx - for (int c = 0; c < txfm_size_col; c++) { - if (lr_flip == 0) { - for (r = 0; r < txfm_size_row; ++r) - temp_in[r] = buf[r * txfm_size_col + c]; - } else { - // flip left right - for (r = 0; r < txfm_size_row; ++r) - temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)]; - } - col_txfm(temp_in, temp_out, cos_bit_col, stage_range); - av1_round_shift_array(temp_out, txfm_size_row, -shift[1]); - - if (ud_flip == 0) { - for (r = 0; r < txfm_size_row; ++r) { - output[r * stride + c] = - highbd_clip_pixel_add(output[r * stride + c], temp_out[r], bd); - } - } else { - // flip upside down - for (r = 0; r < txfm_size_row; ++r) { - output[r * stride + c] = highbd_clip_pixel_add( - output[r * stride + c], temp_out[txfm_size_row - r - 1], bd); - } - } - } -} - static INLINE void lowbd_inv_txfm2d_add_v_identity_neon( const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, TX_SIZE tx_size, int eob) { @@ -2372,11 +3658,10 @@ static INLINE void lowbd_inv_txfm2d_add_v_identity_neon( int16x8_t b[16 * 2]; int eobx, eoby, ud_flip, lr_flip; get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob); - const int8_t *shift = inv_txfm_shift_ls[tx_size]; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; const int txw_idx = get_txw_idx(tx_size); const int txh_idx = get_txh_idx(tx_size); - const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx]; - const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx]; + const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx]; const int txfm_size_col = tx_size_wide[tx_size]; const int txfm_size_row = tx_size_high[tx_size]; lowbd_inv_txfm2d_memset_neon(&b[0], (txfm_size_col * (txfm_size_row) >> 3), @@ -2386,15 +3671,11 @@ static INLINE void lowbd_inv_txfm2d_add_v_identity_neon( const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3; const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3; const int fun_idx_x = 
lowbd_txfm_all_1d_zeros_idx[eobx]; - const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby]; const int32_t *input_1; int temp_b = 0; const transform_neon row_txfm = lowbd_txfm_all_1d_zeros_w_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x]; - const transform_neon col_txfm = - lowbd_txfm_all_1d_zeros_w_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y]; - assert(col_txfm != NULL); assert(row_txfm != NULL); get_flip_cfg(tx_type, &ud_flip, &lr_flip); @@ -2432,9 +3713,8 @@ static INLINE void lowbd_inv_txfm2d_add_v_identity_neon( } } for (int j = 0; j < buf_size_w_div8; ++j) { - col_txfm(&b[j * txfm_size_row], &b[j * txfm_size_row], cos_bit_col, 0); - av1_round_shift_array_16_neon(&b[j * txfm_size_row], txfm_size_row, - -shift[1]); + identity_txfm_round_neon(&b[j * txfm_size_row], &b[j * txfm_size_row], + txh_idx, txfm_size_row, -shift[1]); } if (txfm_size_col >= 16) { for (int i = 0; i < (txfm_size_col >> 4); i++) { @@ -2446,90 +3726,6 @@ static INLINE void lowbd_inv_txfm2d_add_v_identity_neon( } } -static INLINE void lowbd_inv_txfm2d_add_h_wxh_identity_neon( - const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, - TX_SIZE tx_size, int eob) { - DECLARE_ALIGNED(32, int, txfm_buf[32 * 32 + 32 + 32]); - int32_t *temp_in = txfm_buf; - - int eobx, eoby; - get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob); - const int8_t *shift = inv_txfm_shift_ls[tx_size]; - const int txw_idx = get_txw_idx(tx_size); - const int txh_idx = get_txh_idx(tx_size); - const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx]; - const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx]; - const int txfm_size_col = tx_size_wide[tx_size]; - const int txfm_size_row = tx_size_high[tx_size]; - const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3; - - const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); - const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col); - - int32_t *temp_out = temp_in + buf_offset; - int32_t *buf = temp_out + buf_offset; - int32_t *buf_ptr 
= buf; - const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16 }; - int r, bd = 8; - - const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx]; - const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby]; - const transform_1d_neon row_txfm = - lowbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x]; - const transform_1d_neon col_txfm = - lowbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y]; - - assert(col_txfm != NULL); - assert(row_txfm != NULL); - int ud_flip, lr_flip; - get_flip_cfg(tx_type, &ud_flip, &lr_flip); - - // row tx - int row_start = (buf_size_nonzero_h_div8 * 8); - for (int i = 0; i < row_start; i++) { - if (abs(rect_type) == 1) { - for (int j = 0; j < txfm_size_col; j++) - temp_in[j] = round_shift((int64_t)input[j] * NewInvSqrt2, NewSqrt2Bits); - row_txfm(temp_in, buf_ptr, cos_bit_row, stage_range); - } else { - row_txfm(input, buf_ptr, cos_bit_row, stage_range); - } - av1_round_shift_array(buf_ptr, txfm_size_col, -shift[0]); - input += txfm_size_col; - buf_ptr += txfm_size_col; - } - // Doing memset for the rows which are not processed in row transform. 
- memset(buf_ptr, 0, - sizeof(int32_t) * txfm_size_col * (txfm_size_row - row_start)); - - // col tx - for (int c = 0; c < txfm_size_col; c++) { - if (lr_flip == 0) { - for (r = 0; r < txfm_size_row; ++r) - temp_in[r] = buf[r * txfm_size_col + c]; - } else { - // flip left right - for (r = 0; r < txfm_size_row; ++r) - temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)]; - } - col_txfm(temp_in, temp_out, cos_bit_col, stage_range); - av1_round_shift_array(temp_out, txfm_size_row, -shift[1]); - - if (ud_flip == 0) { - for (r = 0; r < txfm_size_row; ++r) { - output[r * stride + c] = - highbd_clip_pixel_add(output[r * stride + c], temp_out[r], bd); - } - } else { - // flip upside down - for (r = 0; r < txfm_size_row; ++r) { - output[r * stride + c] = highbd_clip_pixel_add( - output[r * stride + c], temp_out[txfm_size_row - r - 1], bd); - } - } - } -} - static INLINE void lowbd_inv_txfm2d_add_h_identity_neon( const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, TX_SIZE tx_size, int eob) { @@ -2537,11 +3733,10 @@ static INLINE void lowbd_inv_txfm2d_add_h_identity_neon( int16x8_t b[16 * 2]; int eobx, eoby, ud_flip, lr_flip; get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob); - const int8_t *shift = inv_txfm_shift_ls[tx_size]; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; const int txw_idx = get_txw_idx(tx_size); const int txh_idx = get_txh_idx(tx_size); - const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx]; - const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx]; + const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx]; const int txfm_size_col = tx_size_wide[tx_size]; const int txfm_size_row = tx_size_high[tx_size]; lowbd_inv_txfm2d_memset_neon(&a[0], (txfm_size_col * (txfm_size_row) >> 3), @@ -2550,17 +3745,13 @@ static INLINE void lowbd_inv_txfm2d_add_h_identity_neon( const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3; const int 
buf_size_nonzero_w_div8 = (eobx + 8) >> 3; - const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx]; const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby]; const int32_t *input_1; int temp_b = 0; - const transform_neon row_txfm = - lowbd_txfm_all_1d_zeros_w_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x]; const transform_neon col_txfm = lowbd_txfm_all_1d_zeros_w_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y]; assert(col_txfm != NULL); - assert(row_txfm != NULL); get_flip_cfg(tx_type, &ud_flip, &lr_flip); @@ -2577,9 +3768,8 @@ static INLINE void lowbd_inv_txfm2d_add_h_identity_neon( int y = i * txfm_size_col; round_shift_for_rect(&a[y], &a[y], txfm_size_col); } - row_txfm(&a[i * txfm_size_col], &a[i * txfm_size_col], cos_bit_row, 0); - av1_round_shift_array_16_neon(&a[i * txfm_size_col], txfm_size_col, - -shift[0]); + identity_txfm_round_neon(&a[i * txfm_size_col], &a[i * txfm_size_col], + txw_idx, txfm_size_col, -shift[0]); for (int j = 0; j < buf_size_w_div8; ++j) { int k = j * 8 + i * txfm_size_col; transpose_s16_8x8q(&a[k], &b[temp_b + txfm_size_row * j]); @@ -2604,24 +3794,24 @@ static INLINE void lowbd_inv_txfm2d_add_h_identity_neon( static INLINE void lowbd_inv_txfm2d_add_4x4_neon(const int32_t *input, uint8_t *output, int stride, - TX_TYPE tx_type, - TX_SIZE tx_size, int eob) { + TX_TYPE tx_type, int eob) { (void)eob; + TX_SIZE tx_size = TX_4X4; DECLARE_ALIGNED(32, int, txfm_buf[4 * 4 + 8 + 8]); int32_t *temp_in = txfm_buf; - const int8_t *shift = inv_txfm_shift_ls[tx_size]; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; const int txw_idx = get_txw_idx(tx_size); const int txh_idx = get_txh_idx(tx_size); - const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx]; - const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx]; + const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx]; const int txfm_size_col = tx_size_wide[tx_size]; const int txfm_size_row = tx_size_high[tx_size]; 
const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col); int32_t *temp_out = temp_in + buf_offset; int32_t *buf = temp_out + buf_offset; int32_t *buf_ptr = buf; - const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16 }; + const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16, 16, 16, 16, 16, 16, 16 }; int r, bd = 8; const transform_1d_neon row_txfm = lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]]; @@ -2647,6 +3837,7 @@ static INLINE void lowbd_inv_txfm2d_add_4x4_neon(const int32_t *input, for (r = 0; r < txfm_size_row; ++r) temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)]; } + clamp_buf(temp_in, txfm_size_row, bd + 8); col_txfm(temp_in, temp_out, cos_bit_col, stage_range); av1_round_shift_array(temp_out, txfm_size_row, -shift[1]); @@ -2666,24 +3857,25 @@ static INLINE void lowbd_inv_txfm2d_add_4x4_neon(const int32_t *input, } void lowbd_inv_txfm2d_add_4x8_neon(const int32_t *input, uint8_t *output, - int stride, TX_TYPE tx_type, TX_SIZE tx_size, - int eob) { + int stride, TX_TYPE tx_type, int eob) { (void)eob; + TX_SIZE tx_size = TX_4X8; DECLARE_ALIGNED(32, int, txfm_buf[4 * 8 + 8 + 8]); int32_t *temp_in = txfm_buf; - const int8_t *shift = inv_txfm_shift_ls[tx_size]; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; const int txw_idx = get_txw_idx(tx_size); const int txh_idx = get_txh_idx(tx_size); - const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx]; - const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx]; + const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx]; const int txfm_size_col = tx_size_wide[tx_size]; const int txfm_size_row = tx_size_high[tx_size]; const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col); int32_t *temp_out = temp_in + buf_offset; int32_t *buf = temp_out + buf_offset; int32_t *buf_ptr = buf; - const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16 }; + const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16, 16, 16, 16, + 16, 16, 16, 
16 }; int r, bd = 8; const transform_1d_neon row_txfm = lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]]; @@ -2711,6 +3903,7 @@ void lowbd_inv_txfm2d_add_4x8_neon(const int32_t *input, uint8_t *output, for (r = 0; r < txfm_size_row; ++r) temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)]; } + clamp_buf(temp_in, txfm_size_row, bd + 8); col_txfm(temp_in, temp_out, cos_bit_col, stage_range); av1_round_shift_array(temp_out, txfm_size_row, -shift[1]); @@ -2730,24 +3923,25 @@ void lowbd_inv_txfm2d_add_4x8_neon(const int32_t *input, uint8_t *output, } void lowbd_inv_txfm2d_add_8x4_neon(const int32_t *input, uint8_t *output, - int stride, TX_TYPE tx_type, TX_SIZE tx_size, - int eob) { + int stride, TX_TYPE tx_type, int eob) { (void)eob; + TX_SIZE tx_size = TX_8X4; DECLARE_ALIGNED(32, int, txfm_buf[8 * 4 + 8 + 8]); int32_t *temp_in = txfm_buf; - const int8_t *shift = inv_txfm_shift_ls[tx_size]; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; const int txw_idx = get_txw_idx(tx_size); const int txh_idx = get_txh_idx(tx_size); - const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx]; - const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx]; + const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx]; const int txfm_size_col = tx_size_wide[tx_size]; const int txfm_size_row = tx_size_high[tx_size]; const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col); int32_t *temp_out = temp_in + buf_offset; int32_t *buf = temp_out + buf_offset; int32_t *buf_ptr = buf; - const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16 }; + const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16, 16, 16, 16, + 16, 16, 16, 16 }; int r, bd = 8; const transform_1d_neon row_txfm = lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]]; @@ -2775,6 +3969,7 @@ void lowbd_inv_txfm2d_add_8x4_neon(const int32_t *input, uint8_t *output, for (r = 0; r < txfm_size_row; ++r) temp_in[r] = buf[r * txfm_size_col + 
(txfm_size_col - c - 1)]; } + clamp_buf(temp_in, txfm_size_row, bd + 8); col_txfm(temp_in, temp_out, cos_bit_col, stage_range); av1_round_shift_array(temp_out, txfm_size_row, -shift[1]); @@ -2794,24 +3989,25 @@ void lowbd_inv_txfm2d_add_8x4_neon(const int32_t *input, uint8_t *output, } void lowbd_inv_txfm2d_add_4x16_neon(const int32_t *input, uint8_t *output, - int stride, TX_TYPE tx_type, - TX_SIZE tx_size, int eob) { + int stride, TX_TYPE tx_type, int eob) { (void)eob; + TX_SIZE tx_size = TX_4X16; DECLARE_ALIGNED(32, int, txfm_buf[4 * 16 + 16 + 16]); int32_t *temp_in = txfm_buf; - const int8_t *shift = inv_txfm_shift_ls[tx_size]; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; const int txw_idx = get_txw_idx(tx_size); const int txh_idx = get_txh_idx(tx_size); - const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx]; - const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx]; + const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx]; const int txfm_size_col = tx_size_wide[tx_size]; const int txfm_size_row = tx_size_high[tx_size]; const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col); int32_t *temp_out = temp_in + buf_offset; int32_t *buf = temp_out + buf_offset; int32_t *buf_ptr = buf; - const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16 }; + const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16 }; int r, bd = 8; const transform_1d_neon row_txfm = lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]]; @@ -2837,6 +4033,7 @@ void lowbd_inv_txfm2d_add_4x16_neon(const int32_t *input, uint8_t *output, for (r = 0; r < txfm_size_row; ++r) temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)]; } + clamp_buf(temp_in, txfm_size_row, bd + 8); col_txfm(temp_in, temp_out, cos_bit_col, stage_range); av1_round_shift_array(temp_out, txfm_size_row, -shift[1]); @@ -2856,25 +4053,25 @@ void lowbd_inv_txfm2d_add_4x16_neon(const int32_t *input, uint8_t 
*output, } void lowbd_inv_txfm2d_add_16x4_neon(const int32_t *input, uint8_t *output, - int stride, TX_TYPE tx_type, - TX_SIZE tx_size, int eob) { + int stride, TX_TYPE tx_type, int eob) { (void)eob; - + TX_SIZE tx_size = TX_16X4; DECLARE_ALIGNED(32, int, txfm_buf[16 * 4 + 16 + 16]); int32_t *temp_in = txfm_buf; - const int8_t *shift = inv_txfm_shift_ls[tx_size]; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; const int txw_idx = get_txw_idx(tx_size); const int txh_idx = get_txh_idx(tx_size); - const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx]; - const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx]; + const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx]; const int txfm_size_col = tx_size_wide[tx_size]; const int txfm_size_row = tx_size_high[tx_size]; const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col); int32_t *temp_out = temp_in + buf_offset; int32_t *buf = temp_out + buf_offset; int32_t *buf_ptr = buf; - const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16 }; + const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16 }; int r, bd = 8; const transform_1d_neon row_txfm = lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]]; @@ -2900,89 +4097,7 @@ void lowbd_inv_txfm2d_add_16x4_neon(const int32_t *input, uint8_t *output, for (r = 0; r < txfm_size_row; ++r) temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)]; } - col_txfm(temp_in, temp_out, cos_bit_col, stage_range); - av1_round_shift_array(temp_out, txfm_size_row, -shift[1]); - - if (ud_flip == 0) { - for (r = 0; r < txfm_size_row; ++r) { - output[r * stride + c] = - highbd_clip_pixel_add(output[r * stride + c], temp_out[r], bd); - } - } else { - // flip upside down - for (r = 0; r < txfm_size_row; ++r) { - output[r * stride + c] = highbd_clip_pixel_add( - output[r * stride + c], temp_out[txfm_size_row - r - 1], bd); - } - } - } -} - -static INLINE void 
lowbd_inv_txfm2d_add_wxh_no_identity_neon( - const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, - TX_SIZE tx_size, int eob) { - DECLARE_ALIGNED(32, int, txfm_buf[64 * 64 + 64 + 64]); - int32_t *temp_in = txfm_buf; - - int eobx, eoby, ud_flip, lr_flip, row_start; - get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob); - const int8_t *shift = inv_txfm_shift_ls[tx_size]; - const int txw_idx = get_txw_idx(tx_size); - const int txh_idx = get_txh_idx(tx_size); - const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx]; - const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx]; - const int txfm_size_col = tx_size_wide[tx_size]; - const int txfm_size_row = tx_size_high[tx_size]; - const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3; - const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); - const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col); - - int32_t *temp_out = temp_in + buf_offset; - int32_t *buf = temp_out + buf_offset; - int32_t *buf_ptr = buf; - const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16 }; - const int bd = 8; - int r; - - const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx]; - const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby]; - const transform_1d_neon row_txfm = - lowbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x]; - const transform_1d_neon col_txfm = - lowbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y]; - - assert(col_txfm != NULL); - assert(row_txfm != NULL); - - get_flip_cfg(tx_type, &ud_flip, &lr_flip); - row_start = (buf_size_nonzero_h_div8 << 3); - - for (int i = 0; i < row_start; i++) { - if (abs(rect_type) == 1) { - for (int j = 0; j < txfm_size_col; j++) - temp_in[j] = round_shift((int64_t)input[j] * NewInvSqrt2, NewSqrt2Bits); - row_txfm(temp_in, buf_ptr, cos_bit_row, stage_range); - } else { - row_txfm(input, buf_ptr, cos_bit_row, stage_range); - } - av1_round_shift_array(buf_ptr, txfm_size_col, -shift[0]); - input += txfm_size_col; - 
buf_ptr += txfm_size_col; - } - - // Doing memset for the rows which are not processed in row transform. - memset(buf_ptr, 0, - sizeof(int32_t) * txfm_size_col * (txfm_size_row - row_start)); - - for (int c = 0; c < txfm_size_col; c++) { - if (lr_flip == 0) { - for (r = 0; r < txfm_size_row; ++r) - temp_in[r] = buf[r * txfm_size_col + c]; - } else { - // flip left right - for (r = 0; r < txfm_size_row; ++r) - temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)]; - } + clamp_buf(temp_in, txfm_size_row, bd + 8); col_txfm(temp_in, temp_out, cos_bit_col, stage_range); av1_round_shift_array(temp_out, txfm_size_row, -shift[1]); @@ -3008,17 +4123,18 @@ static INLINE void lowbd_inv_txfm2d_add_no_identity_neon( int16x8_t b[64 * 8]; int eobx, eoby, ud_flip, lr_flip; get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob); - const int8_t *shift = inv_txfm_shift_ls[tx_size]; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; const int txw_idx = get_txw_idx(tx_size); const int txh_idx = get_txh_idx(tx_size); - const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx]; - const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx]; + const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx]; const int txfm_size_col = tx_size_wide[tx_size]; const int txfm_size_row = tx_size_high[tx_size]; const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); const int buf_size_w_div8 = txfm_size_col >> 3; const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3; const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3; + const int input_stride = AOMMIN(32, txfm_size_col); const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx]; const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby]; const int32_t *input_1; @@ -3038,14 +4154,14 @@ static INLINE void lowbd_inv_txfm2d_add_no_identity_neon( input_1 = input; for (int j = 0; j < buf_size_nonzero_w_div8; ++j) { int k = j * 8 + i * txfm_size_col; - 
load_buffer_32bit_to_16bit_neon(input_1, &a[k], txfm_size_col); + load_buffer_32bit_to_16bit_neon(input_1, &a[k], input_stride); transpose_s16_8x8q(&a[k], &a[k]); input_1 += 8; } - input += (txfm_size_col * 8); + input += (input_stride * 8); if (abs(rect_type) == 1) { int y = i * txfm_size_col; - round_shift_for_rect(&a[y], &a[y], txfm_size_col); + round_shift_for_rect(&a[y], &a[y], input_stride); } row_txfm(&a[i * txfm_size_col], &a[i * txfm_size_col], cos_bit_row, 0); av1_round_shift_array_16_neon(&a[i * txfm_size_col], txfm_size_col, @@ -3083,36 +4199,6 @@ static INLINE void lowbd_inv_txfm2d_add_no_identity_neon( } } -static INLINE void lowbd_inv_txfm2d_add_wxh_universe_neon( - const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, - TX_SIZE tx_size, int eob) { - switch (tx_type) { - case IDTX: - lowbd_inv_txfm2d_add_wxh_idtx_neon(input, output, stride, tx_type, - tx_size, eob); - break; - - case H_DCT: - case H_ADST: - case H_FLIPADST: - lowbd_inv_txfm2d_add_v_wxh_identity_neon(input, output, stride, tx_type, - tx_size, eob); - break; - - case V_DCT: - case V_ADST: - case V_FLIPADST: - lowbd_inv_txfm2d_add_h_wxh_identity_neon(input, output, stride, tx_type, - tx_size, eob); - break; - - default: - lowbd_inv_txfm2d_add_wxh_no_identity_neon(input, output, stride, tx_type, - tx_size, eob); - break; - } -} - static INLINE void lowbd_inv_txfm2d_add_universe_neon( const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, TX_SIZE tx_size, int eob) { @@ -3146,73 +4232,27 @@ static INLINE void lowbd_inv_txfm2d_add_universe_neon( void av1_lowbd_inv_txfm2d_add_neon(const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, TX_SIZE tx_size, int eob) { - int row; switch (tx_size) { case TX_4X4: - lowbd_inv_txfm2d_add_4x4_neon(input, output, stride, tx_type, tx_size, - eob); + lowbd_inv_txfm2d_add_4x4_neon(input, output, stride, tx_type, eob); break; case TX_4X8: - lowbd_inv_txfm2d_add_4x8_neon(input, output, stride, tx_type, tx_size, - eob); 
+ lowbd_inv_txfm2d_add_4x8_neon(input, output, stride, tx_type, eob); break; case TX_8X4: - lowbd_inv_txfm2d_add_8x4_neon(input, output, stride, tx_type, tx_size, - eob); + lowbd_inv_txfm2d_add_8x4_neon(input, output, stride, tx_type, eob); break; case TX_4X16: - lowbd_inv_txfm2d_add_4x16_neon(input, output, stride, tx_type, tx_size, - eob); + lowbd_inv_txfm2d_add_4x16_neon(input, output, stride, tx_type, eob); break; case TX_16X4: - lowbd_inv_txfm2d_add_16x4_neon(input, output, stride, tx_type, tx_size, - eob); + lowbd_inv_txfm2d_add_16x4_neon(input, output, stride, tx_type, eob); break; - case TX_16X64: { - lowbd_inv_txfm2d_add_wxh_universe_neon(input, output, stride, tx_type, - tx_size, eob); - } break; - - case TX_64X16: { - int32_t mod_input[64 * 16]; - for (row = 0; row < 16; ++row) { - memcpy(mod_input + row * 64, input + row * 32, 32 * sizeof(*mod_input)); - memset(mod_input + row * 64 + 32, 0, 32 * sizeof(*mod_input)); - } - lowbd_inv_txfm2d_add_wxh_universe_neon(mod_input, output, stride, tx_type, - tx_size, eob); - } break; - - case TX_32X64: { - lowbd_inv_txfm2d_add_wxh_universe_neon(input, output, stride, tx_type, - tx_size, eob); - } break; - - case TX_64X32: { - int32_t mod_input[64 * 32]; - for (row = 0; row < 32; ++row) { - memcpy(mod_input + row * 64, input + row * 32, 32 * sizeof(*mod_input)); - memset(mod_input + row * 64 + 32, 0, 32 * sizeof(*mod_input)); - } - lowbd_inv_txfm2d_add_wxh_universe_neon(mod_input, output, stride, tx_type, - tx_size, eob); - } break; - - case TX_64X64: { - int32_t mod_input[64 * 64]; - for (row = 0; row < 32; ++row) { - memcpy(mod_input + row * 64, input + row * 32, 32 * sizeof(*mod_input)); - memset(mod_input + row * 64 + 32, 0, 32 * sizeof(*mod_input)); - } - lowbd_inv_txfm2d_add_wxh_universe_neon(mod_input, output, stride, tx_type, - tx_size, eob); - } break; - default: lowbd_inv_txfm2d_add_universe_neon(input, output, stride, tx_type, tx_size, eob); diff --git a/media/libaom/src/av1/common/arm/av1_txfm_neon.c 
b/media/libaom/src/av1/common/arm/av1_txfm_neon.c index de3c54724..7e3a05ab7 100644 --- a/media/libaom/src/av1/common/arm/av1_txfm_neon.c +++ b/media/libaom/src/av1/common/arm/av1_txfm_neon.c @@ -12,6 +12,8 @@ #include <arm_neon.h> #include <assert.h> +#include "config/av1_rtcd.h" + #include "aom_ports/mem.h" #include "av1/common/arm/mem_neon.h" diff --git a/media/libaom/src/av1/common/arm/cfl_neon.c b/media/libaom/src/av1/common/arm/cfl_neon.c index 39025b5e5..371be5f0e 100644 --- a/media/libaom/src/av1/common/arm/cfl_neon.c +++ b/media/libaom/src/av1/common/arm/cfl_neon.c @@ -131,6 +131,7 @@ static void cfl_luma_subsampling_444_lbd_neon(const uint8_t *input, } while ((pred_buf_q3 += CFL_BUF_LINE) < end); } +#if CONFIG_AV1_HIGHBITDEPTH #ifndef __aarch64__ uint16x8_t vpaddq_u16(uint16x8_t a, uint16x8_t b) { return vcombine_u16(vpadd_u16(vget_low_u16(a), vget_high_u16(a)), @@ -247,6 +248,7 @@ static void cfl_luma_subsampling_444_hbd_neon(const uint16_t *input, input += input_stride; } while ((pred_buf_q3 += CFL_BUF_LINE) < end); } +#endif // CONFIG_AV1_HIGHBITDEPTH CFL_GET_SUBSAMPLE_FUNCTION(neon) @@ -511,6 +513,7 @@ static INLINE void cfl_predict_lbd_neon(const int16_t *pred_buf_q3, CFL_PREDICT_FN(neon, lbd) +#if CONFIG_AV1_HIGHBITDEPTH static INLINE uint16x4_t clamp_s16(int16x4_t a, int16x4_t max) { return vreinterpret_u16_s16(vmax_s16(vmin_s16(a, max), vdup_n_s16(0))); } @@ -582,3 +585,4 @@ static INLINE void cfl_predict_hbd_neon(const int16_t *pred_buf_q3, } CFL_PREDICT_FN(neon, hbd) +#endif // CONFIG_AV1_HIGHBITDEPTH diff --git a/media/libaom/src/av1/common/arm/convolve_neon.c b/media/libaom/src/av1/common/arm/convolve_neon.c index d0c4f8ff6..51c96961c 100644 --- a/media/libaom/src/av1/common/arm/convolve_neon.c +++ b/media/libaom/src/av1/common/arm/convolve_neon.c @@ -195,12 +195,12 @@ void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams 
*filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, + const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params) { const uint8_t horiz_offset = filter_params_x->taps / 2 - 1; const int8_t bits = FILTER_BITS - conv_params->round_0; - (void)subpel_y_q4; + (void)subpel_y_qn; (void)conv_params; (void)filter_params_y; @@ -214,7 +214,7 @@ void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS)); const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( - filter_params_x, subpel_x_q4 & SUBPEL_MASK); + filter_params_x, subpel_x_qn & SUBPEL_MASK); const int16x8_t shift_round_0 = vdupq_n_s16(-conv_params->round_0); const int16x8_t shift_by_bits = vdupq_n_s16(-bits); @@ -603,14 +603,14 @@ void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, + const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params) { const int vert_offset = filter_params_y->taps / 2 - 1; src -= vert_offset * src_stride; (void)filter_params_x; - (void)subpel_x_q4; + (void)subpel_x_qn; (void)conv_params; assert(conv_params->round_0 <= FILTER_BITS); @@ -618,7 +618,7 @@ void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS))); const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( - filter_params_y, subpel_y_q4 & SUBPEL_MASK); + filter_params_y, subpel_y_qn & SUBPEL_MASK); if (w <= 4) { uint8x8_t d01; @@ -844,17 +844,110 @@ void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, } } +// Horizontal filtering for convolve_2d_sr for width multiple of 8 +// Processes one row at a time +static INLINE void horiz_filter_w8_single_row( + const uint8_t *src_ptr, int src_stride, 
int16_t *dst_ptr, + const int dst_stride, int width, int height, const int16_t *x_filter, + const int16x8_t horiz_const, const int16x8_t shift_round_0) { + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7; + do { + uint8x8_t t0 = vld1_u8(src_ptr); + s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); // a0 a1 a2 a3 a4 a5 a6 a7 + + int width_tmp = width; + const uint8_t *s = src_ptr + 8; + int16_t *dst_tmp = dst_ptr; + + __builtin_prefetch(dst_ptr); + + do { + t0 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15 + s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t sum = s0; + s0 = s7; + + s1 = vextq_s16(sum, s7, 1); // a1 a2 a3 a4 a5 a6 a7 a8 + s2 = vextq_s16(sum, s7, 2); // a2 a3 a4 a5 a6 a7 a8 a9 + s3 = vextq_s16(sum, s7, 3); // a3 a4 a5 a6 a7 a8 a9 a10 + s4 = vextq_s16(sum, s7, 4); // a4 a5 a6 a7 a8 a9 a10 a11 + s5 = vextq_s16(sum, s7, 5); // a5 a6 a7 a8 a9 a10 a11 a12 + s6 = vextq_s16(sum, s7, 6); // a6 a7 a8 a9 a10 a11 a12 a13 + s7 = vextq_s16(sum, s7, 7); // a7 a8 a9 a10 a11 a12 a13 a14 + + int16x8_t res0 = convolve8_8x8_s16(sum, s1, s2, s3, s4, s5, s6, s7, + x_filter, horiz_const, shift_round_0); + + vst1q_s16(dst_tmp, res0); + + s += 8; + dst_tmp += 8; + width_tmp -= 8; + } while (width_tmp > 0); + src_ptr += src_stride; + dst_ptr += dst_stride; + height--; + } while (height > 0); +} + +// Horizontal filtering for convolve_2d_sr for width <= 4 +// Processes one row at a time +static INLINE void horiz_filter_w4_single_row( + const uint8_t *src_ptr, int src_stride, int16_t *dst_ptr, + const int dst_stride, int width, int height, const int16_t *x_filter, + const int16x4_t horiz_const, const int16x4_t shift_round_0) { + int16x4_t s0, s1, s2, s3, s4, s5, s6, s7; + do { + const uint8_t *s = src_ptr; + + __builtin_prefetch(s); + + uint8x8_t t0 = vld1_u8(s); // a0 a1 a2 a3 a4 a5 a6 a7 + int16x8_t tt0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + s0 = vget_low_s16(tt0); + s4 = vget_high_s16(tt0); + + __builtin_prefetch(dst_ptr); + s += 8; + + t0 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 
a15 + s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + + s1 = vext_s16(s0, s4, 1); // a1 a2 a3 a4 + s2 = vext_s16(s0, s4, 2); // a2 a3 a4 a5 + s3 = vext_s16(s0, s4, 3); // a3 a4 a5 a6 + s5 = vext_s16(s4, s7, 1); // a5 a6 a7 a8 + s6 = vext_s16(s4, s7, 2); // a6 a7 a8 a9 + s7 = vext_s16(s4, s7, 3); // a7 a8 a9 a10 + + int16x4_t d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter, + horiz_const, shift_round_0); + + if (width == 4) { + vst1_s16(dst_ptr, d0); + dst_ptr += dst_stride; + } else if (width == 2) { + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_s16(d0), 0); + dst_ptr += dst_stride; + } + + src_ptr += src_stride; + height--; + } while (height > 0); +} + void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, + const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params) { int im_dst_stride; int width, height; - uint8x8_t t0; #if defined(__aarch64__) + uint8x8_t t0; uint8x8_t t1, t2, t3, t4, t5, t6, t7; + const uint8_t *s; #endif DECLARE_ALIGNED(16, int16_t, @@ -867,7 +960,7 @@ void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, const int horiz_offset = filter_params_x->taps / 2 - 1; const uint8_t *src_ptr = src - vert_offset * src_stride - horiz_offset; - const uint8_t *s; + int16_t *dst_ptr; dst_ptr = im_block; @@ -880,7 +973,7 @@ void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, const int16x8_t vec_round_bits = vdupq_n_s16(-round_bits); const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( - filter_params_x, subpel_x_q4 & SUBPEL_MASK); + filter_params_x, subpel_x_qn & SUBPEL_MASK); int16_t x_filter_tmp[8]; int16x8_t filter_x_coef = vld1q_s16(x_filter); @@ -893,18 +986,14 @@ void 
av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, assert(conv_params->round_0 > 0); if (w <= 4) { - int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, d0; -#if defined(__aarch64__) - int16x4_t s8, s9, s10, d1, d2, d3; -#endif - const int16x4_t horiz_const = vdup_n_s16((1 << (bd + FILTER_BITS - 2))); const int16x4_t shift_round_0 = vdup_n_s16(-(conv_params->round_0 - 1)); +#if defined(__aarch64__) + int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3; do { + assert(height >= 4); s = src_ptr; - -#if defined(__aarch64__) __builtin_prefetch(s + 0 * src_stride); __builtin_prefetch(s + 1 * src_stride); __builtin_prefetch(s + 2 * src_stride); @@ -963,57 +1052,30 @@ void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, src_ptr += 4 * src_stride; dst_ptr += 4 * im_dst_stride; height -= 4; -#else - int16x8_t tt0; - - __builtin_prefetch(s); - - t0 = vld1_u8(s); // a0 a1 a2 a3 a4 a5 a6 a7 - tt0 = vreinterpretq_s16_u16(vmovl_u8(t0)); - s0 = vget_low_s16(tt0); - s4 = vget_high_s16(tt0); - - __builtin_prefetch(dst_ptr); - s += 8; + } while (height >= 4); - t0 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15 - s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); - - s1 = vext_s16(s0, s4, 1); // a1 a2 a3 a4 - s2 = vext_s16(s0, s4, 2); // a2 a3 a4 a5 - s3 = vext_s16(s0, s4, 3); // a3 a4 a5 a6 - s5 = vext_s16(s4, s7, 1); // a5 a6 a7 a8 - s6 = vext_s16(s4, s7, 2); // a6 a7 a8 a9 - s7 = vext_s16(s4, s7, 3); // a7 a8 a9 a10 - - d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp, - horiz_const, shift_round_0); - - if (w == 4) { - vst1_s16(dst_ptr, d0); - dst_ptr += im_dst_stride; - } else if (w == 2) { - vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_s16(d0), 0); - dst_ptr += im_dst_stride; - } - - src_ptr += src_stride; - height -= 1; -#endif - } while (height > 0); - } else { - int16_t *d_tmp; - int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, res0; -#if defined(__aarch64__) - int16x8_t s8, s9, s10, res1, res2, 
res3, res4, res5, res6, res7; - int16x8_t s11, s12, s13, s14; + if (height) { + assert(height < 4); + horiz_filter_w4_single_row(src_ptr, src_stride, dst_ptr, im_dst_stride, w, + height, x_filter_tmp, horiz_const, + shift_round_0); + } +#else + horiz_filter_w4_single_row(src_ptr, src_stride, dst_ptr, im_dst_stride, w, + height, x_filter_tmp, horiz_const, + shift_round_0); #endif + } else { const int16x8_t horiz_const = vdupq_n_s16((1 << (bd + FILTER_BITS - 2))); const int16x8_t shift_round_0 = vdupq_n_s16(-(conv_params->round_0 - 1)); #if defined(__aarch64__) + int16_t *d_tmp; + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14; + int16x8_t res0, res1, res2, res3, res4, res5, res6, res7; do { + assert(height >= 8); __builtin_prefetch(src_ptr + 0 * src_stride); __builtin_prefetch(src_ptr + 1 * src_stride); __builtin_prefetch(src_ptr + 2 * src_stride); @@ -1099,45 +1161,121 @@ void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, src_ptr += 8 * src_stride; dst_ptr += 8 * im_dst_stride; height -= 8; - } while (height > 0); -#else - do { - t0 = vld1_u8(src_ptr); - s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); // a0 a1 a2 a3 a4 a5 a6 a7 + } while (height >= 8); - width = w; - s = src_ptr + 8; - d_tmp = dst_ptr; + if (height >= 4) { + assert(height < 8); + int16x4_t reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9, + reg10, reg11, reg12, reg13, reg14; + int16x4_t d0, d1, d2, d3, d4, d5, d6, d7; + int16x8_t out0, out1, out2, out3; + + __builtin_prefetch(src_ptr + 0 * src_stride); + __builtin_prefetch(src_ptr + 1 * src_stride); + __builtin_prefetch(src_ptr + 2 * src_stride); + __builtin_prefetch(src_ptr + 3 * src_stride); + + load_u8_8x4(src_ptr, src_stride, &t0, &t1, &t2, &t3); + transpose_u8_8x4(&t0, &t1, &t2, &t3); - __builtin_prefetch(dst_ptr); + reg0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + reg1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + reg2 = 
vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + reg3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); + reg4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + reg5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + reg6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + + __builtin_prefetch(dst_ptr + 0 * dst_stride); + __builtin_prefetch(dst_ptr + 1 * dst_stride); + __builtin_prefetch(dst_ptr + 2 * dst_stride); + __builtin_prefetch(dst_ptr + 3 * dst_stride); + + s = src_ptr + 7; + d_tmp = dst_ptr; + width = w; do { - t0 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15 - s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); - int16x8_t sum = s0; - s0 = s7; + load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3); + transpose_u8_8x4(&t0, &t1, &t2, &t3); - s1 = vextq_s16(sum, s7, 1); // a1 a2 a3 a4 a5 a6 a7 a8 - s2 = vextq_s16(sum, s7, 2); // a2 a3 a4 a5 a6 a7 a8 a9 - s3 = vextq_s16(sum, s7, 3); // a3 a4 a5 a6 a7 a8 a9 a10 - s4 = vextq_s16(sum, s7, 4); // a4 a5 a6 a7 a8 a9 a10 a11 - s5 = vextq_s16(sum, s7, 5); // a5 a6 a7 a8 a9 a10 a11 a12 - s6 = vextq_s16(sum, s7, 6); // a6 a7 a8 a9 a10 a11 a12 a13 - s7 = vextq_s16(sum, s7, 7); // a7 a8 a9 a10 a11 a12 a13 a14 + reg7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + reg8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + reg9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + reg10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); + reg11 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + reg12 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + reg13 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + reg14 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); - res0 = convolve8_8x8_s16(sum, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp, - horiz_const, shift_round_0); + d0 = convolve8_4x4(reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, + x_filter_tmp); + + d1 = convolve8_4x4(reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, + x_filter_tmp); + + d2 = convolve8_4x4(reg2, reg3, reg4, reg5, reg6, 
reg7, reg8, reg9, + x_filter_tmp); + + d3 = convolve8_4x4(reg3, reg4, reg5, reg6, reg7, reg8, reg9, reg10, + x_filter_tmp); + + d4 = convolve8_4x4(reg4, reg5, reg6, reg7, reg8, reg9, reg10, reg11, + x_filter_tmp); + + d5 = convolve8_4x4(reg5, reg6, reg7, reg8, reg9, reg10, reg11, reg12, + x_filter_tmp); + + d6 = convolve8_4x4(reg6, reg7, reg8, reg9, reg10, reg11, reg12, reg13, + x_filter_tmp); + + d7 = convolve8_4x4(reg7, reg8, reg9, reg10, reg11, reg12, reg13, reg14, + x_filter_tmp); + + transpose_s16_4x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7, &out0, &out1, + &out2, &out3); - vst1q_s16(d_tmp, res0); + out0 = vaddq_s16(out0, horiz_const); + out0 = vqrshlq_s16(out0, shift_round_0); + out1 = vaddq_s16(out1, horiz_const); + out1 = vqrshlq_s16(out1, shift_round_0); + + out2 = vaddq_s16(out2, horiz_const); + out2 = vqrshlq_s16(out2, shift_round_0); + + out3 = vaddq_s16(out3, horiz_const); + out3 = vqrshlq_s16(out3, shift_round_0); + + store_s16_8x4(d_tmp, im_dst_stride, out0, out1, out2, out3); + + reg0 = reg8; + reg1 = reg9; + reg2 = reg10; + reg3 = reg11; + reg4 = reg12; + reg5 = reg13; + reg6 = reg14; s += 8; d_tmp += 8; width -= 8; } while (width > 0); - src_ptr += src_stride; - dst_ptr += im_dst_stride; - height -= 1; - } while (height > 0); + src_ptr += 4 * src_stride; + dst_ptr += 4 * im_dst_stride; + height -= 4; + } + + if (height) { + assert(height < 4); + horiz_filter_w8_single_row(src_ptr, src_stride, dst_ptr, im_stride, w, + height, x_filter_tmp, horiz_const, + shift_round_0); + } +#else + + horiz_filter_w8_single_row(src_ptr, src_stride, dst_ptr, im_stride, w, + height, x_filter_tmp, horiz_const, + shift_round_0); #endif } @@ -1149,7 +1287,7 @@ void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, const int32_t sub_const = (1 << (offset_bits - conv_params->round_1)) + (1 << (offset_bits - conv_params->round_1 - 1)); const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( - filter_params_y, subpel_y_q4 & SUBPEL_MASK); + 
filter_params_y, subpel_y_qn & SUBPEL_MASK); const int32x4_t round_shift_vec = vdupq_n_s32(-(conv_params->round_1)); const int32x4_t offset_const = vdupq_n_s32(1 << offset_bits); @@ -1409,12 +1547,12 @@ void av1_convolve_2d_copy_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, + const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params) { (void)filter_params_x; (void)filter_params_y; - (void)subpel_x_q4; - (void)subpel_y_q4; + (void)subpel_x_qn; + (void)subpel_y_qn; (void)conv_params; const uint8_t *src1; diff --git a/media/libaom/src/av1/common/arm/convolve_neon.h b/media/libaom/src/av1/common/arm/convolve_neon.h index f382984f2..dbcfab631 100644 --- a/media/libaom/src/av1/common/arm/convolve_neon.h +++ b/media/libaom/src/av1/common/arm/convolve_neon.h @@ -73,7 +73,7 @@ static INLINE uint16x8_t wiener_convolve8_horiz_8x8( int32x4_t sum_0, sum_1; int32x4_t s3_0, s3_1; const int32_t round_const_0 = (1 << (bd + FILTER_BITS - 1)); - const int32_t round_const_1 = (1 << ((bd) + 1 + FILTER_BITS - round0_bits)); + const int32_t round_const_1 = (1 << (bd + 1 + FILTER_BITS - round0_bits)) - 1; /* for the purpose of right shift by { conv_params->round_0 } */ const int32x4_t round_bits = vdupq_n_s32(-round0_bits); @@ -124,7 +124,7 @@ static INLINE uint16x4_t wiener_convolve8_horiz_4x8( int16x4_t sum, temp0, temp1, temp2; const int32_t round_const_0 = (1 << (bd + FILTER_BITS - 1)); - const int32_t round_const_1 = (1 << ((bd) + 1 + FILTER_BITS - round0_bits)); + const int32_t round_const_1 = (1 << (bd + 1 + FILTER_BITS - round0_bits)) - 1; const int32x4_t round_bits = vdupq_n_s32(-round0_bits); const int32x4_t zero = vdupq_n_s32(0); const int32x4_t round_vec_0 = vdupq_n_s32(round_const_0); diff --git a/media/libaom/src/av1/common/arm/jnt_convolve_neon.c 
b/media/libaom/src/av1/common/arm/jnt_convolve_neon.c index e5674ef7c..92112fb85 100644 --- a/media/libaom/src/av1/common/arm/jnt_convolve_neon.c +++ b/media/libaom/src/av1/common/arm/jnt_convolve_neon.c @@ -23,19 +23,17 @@ #include "av1/common/arm/transpose_neon.h" #if !defined(__aarch64__) -static INLINE void compute_avg_4x1(uint16x4_t res0, uint16x4_t d0, - const uint16_t fwd_offset, - const uint16_t bck_offset, - const int16x4_t sub_const_vec, - const int16_t round_bits, - const int use_jnt_comp_avg, uint8x8_t *t0) { +static INLINE void compute_avg_4x1( + uint16x4_t res0, uint16x4_t d0, const uint16_t fwd_offset, + const uint16_t bck_offset, const int16x4_t sub_const_vec, + const int16_t round_bits, const int use_dist_wtd_comp_avg, uint8x8_t *t0) { int16x4_t tmp0; uint16x4_t tmp_u0; uint32x4_t sum0; int32x4_t dst0; int16x8_t tmp4; - if (use_jnt_comp_avg) { + if (use_dist_wtd_comp_avg) { const int32x4_t round_bits_vec = vdupq_n_s32((int32_t)(-round_bits)); sum0 = vmull_n_u16(res0, fwd_offset); @@ -65,12 +63,10 @@ static INLINE void compute_avg_4x1(uint16x4_t res0, uint16x4_t d0, } } -static INLINE void compute_avg_8x1(uint16x8_t res0, uint16x8_t d0, - const uint16_t fwd_offset, - const uint16_t bck_offset, - const int16x4_t sub_const, - const int16_t round_bits, - const int use_jnt_comp_avg, uint8x8_t *t0) { +static INLINE void compute_avg_8x1( + uint16x8_t res0, uint16x8_t d0, const uint16_t fwd_offset, + const uint16_t bck_offset, const int16x4_t sub_const, + const int16_t round_bits, const int use_dist_wtd_comp_avg, uint8x8_t *t0) { int16x4_t tmp0, tmp2; int16x8_t f0; uint32x4_t sum0, sum2; @@ -78,7 +74,7 @@ static INLINE void compute_avg_8x1(uint16x8_t res0, uint16x8_t d0, uint16x8_t tmp_u0; - if (use_jnt_comp_avg) { + if (use_dist_wtd_comp_avg) { const int32x4_t sub_const_vec = vmovl_s16(sub_const); const int32x4_t round_bits_vec = vdupq_n_s32(-(int32_t)round_bits); @@ -123,7 +119,7 @@ static INLINE void compute_avg_4x4( uint16x4_t d0, uint16x4_t d1, 
uint16x4_t d2, uint16x4_t d3, const uint16_t fwd_offset, const uint16_t bck_offset, const int16x4_t sub_const_vec, const int16_t round_bits, - const int use_jnt_comp_avg, uint8x8_t *t0, uint8x8_t *t1) { + const int use_dist_wtd_comp_avg, uint8x8_t *t0, uint8x8_t *t1) { int16x4_t tmp0, tmp1, tmp2, tmp3; uint16x4_t tmp_u0, tmp_u1, tmp_u2, tmp_u3; uint32x4_t sum0, sum1, sum2, sum3; @@ -132,7 +128,7 @@ static INLINE void compute_avg_4x4( int16x8_t tmp4, tmp5; const int16x8_t zero = vdupq_n_s16(0); - if (use_jnt_comp_avg) { + if (use_dist_wtd_comp_avg) { const int32x4_t round_bits_vec = vdupq_n_s32((int32_t)(-round_bits)); const int32x4_t const_vec = vmovl_s16(sub_const_vec); @@ -203,8 +199,8 @@ static INLINE void compute_avg_8x4( uint16x8_t d0, uint16x8_t d1, uint16x8_t d2, uint16x8_t d3, const uint16_t fwd_offset, const uint16_t bck_offset, const int16x4_t sub_const, const int16_t round_bits, - const int use_jnt_comp_avg, uint8x8_t *t0, uint8x8_t *t1, uint8x8_t *t2, - uint8x8_t *t3) { + const int use_dist_wtd_comp_avg, uint8x8_t *t0, uint8x8_t *t1, + uint8x8_t *t2, uint8x8_t *t3) { int16x4_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; int16x8_t f0, f1, f2, f3; uint32x4_t sum0, sum1, sum2, sum3; @@ -214,7 +210,7 @@ static INLINE void compute_avg_8x4( uint16x8_t tmp_u0, tmp_u1, tmp_u2, tmp_u3; const int16x8_t zero = vdupq_n_s16(0); - if (use_jnt_comp_avg) { + if (use_dist_wtd_comp_avg) { const int32x4_t sub_const_vec = vmovl_s16(sub_const); const int32x4_t round_bits_vec = vdupq_n_s32(-(int32_t)round_bits); @@ -319,7 +315,7 @@ static INLINE void compute_avg_8x4( } } -static INLINE void jnt_convolve_2d_horiz_neon( +static INLINE void dist_wtd_convolve_2d_horiz_neon( const uint8_t *src, int src_stride, int16_t *im_block, const int im_stride, int16_t *x_filter_tmp, const int im_h, int w, const int round_0) { const int bd = 8; @@ -563,7 +559,7 @@ static INLINE void jnt_convolve_2d_horiz_neon( } } -static INLINE void jnt_convolve_2d_vert_neon( +static INLINE void 
dist_wtd_convolve_2d_vert_neon( int16_t *im_block, const int im_stride, uint8_t *dst8, int dst8_stride, ConvolveParams *conv_params, const int16_t *y_filter, int h, int w) { uint8_t *dst_u8_ptr, *d_u8; @@ -587,7 +583,7 @@ static INLINE void jnt_convolve_2d_vert_neon( const uint16_t fwd_offset = conv_params->fwd_offset; const uint16_t bck_offset = conv_params->bck_offset; const int do_average = conv_params->do_average; - const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; int16x4_t s0, s1, s2, s3, s4, s5, s6, s7; uint16x4_t res4, d0; @@ -652,8 +648,8 @@ static INLINE void jnt_convolve_2d_vert_neon( d += (dst_stride << 2); compute_avg_4x4(res4, res5, res6, res7, d0, d1, d2, d3, fwd_offset, - bck_offset, sub_const_vec, round_bits, use_jnt_comp_avg, - &t0, &t1); + bck_offset, sub_const_vec, round_bits, + use_dist_wtd_comp_avg, &t0, &t1); vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), 0); d_u8 += dst8_stride; @@ -691,7 +687,7 @@ static INLINE void jnt_convolve_2d_vert_neon( d += (dst_stride); compute_avg_4x1(res4, d0, fwd_offset, bck_offset, sub_const_vec, - round_bits, use_jnt_comp_avg, &t0); + round_bits, use_dist_wtd_comp_avg, &t0); vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), 0); d_u8 += dst8_stride; @@ -717,12 +713,12 @@ static INLINE void jnt_convolve_2d_vert_neon( } while (w > 0); } -void av1_jnt_convolve_2d_neon(const uint8_t *src, int src_stride, uint8_t *dst8, - int dst8_stride, int w, int h, - const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, - ConvolveParams *conv_params) { +void av1_dist_wtd_convolve_2d_neon(const uint8_t *src, int src_stride, + uint8_t *dst8, int dst8_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params) { assert(!(w % 
4)); assert(!(h % 4)); @@ -736,9 +732,9 @@ void av1_jnt_convolve_2d_neon(const uint8_t *src, int src_stride, uint8_t *dst8, const int round_0 = conv_params->round_0 - 1; const uint8_t *src_ptr = src - vert_offset * src_stride - horiz_offset; const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( - filter_params_x, subpel_x_q4 & SUBPEL_MASK); + filter_params_x, subpel_x_qn & SUBPEL_MASK); const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( - filter_params_y, subpel_y_q4 & SUBPEL_MASK); + filter_params_y, subpel_y_qn & SUBPEL_MASK); int16_t x_filter_tmp[8]; int16x8_t filter_x_coef = vld1q_s16(x_filter); @@ -748,19 +744,18 @@ void av1_jnt_convolve_2d_neon(const uint8_t *src, int src_stride, uint8_t *dst8, filter_x_coef = vshrq_n_s16(filter_x_coef, 1); vst1q_s16(&x_filter_tmp[0], filter_x_coef); - jnt_convolve_2d_horiz_neon(src_ptr, src_stride, im_block, im_stride, - x_filter_tmp, im_h, w, round_0); + dist_wtd_convolve_2d_horiz_neon(src_ptr, src_stride, im_block, im_stride, + x_filter_tmp, im_h, w, round_0); - jnt_convolve_2d_vert_neon(im_block, im_stride, dst8, dst8_stride, conv_params, - y_filter, h, w); + dist_wtd_convolve_2d_vert_neon(im_block, im_stride, dst8, dst8_stride, + conv_params, y_filter, h, w); } -void av1_jnt_convolve_2d_copy_neon(const uint8_t *src, int src_stride, - uint8_t *dst8, int dst8_stride, int w, int h, - const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, - ConvolveParams *conv_params) { +void av1_dist_wtd_convolve_2d_copy_neon( + const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params) { uint8x8_t res0_8, res1_8, res2_8, res3_8, tmp_shift0, tmp_shift1, tmp_shift2, tmp_shift3; uint16x8_t res_q0, res_q1, res_q2, res_q3, tmp_q0, tmp_q1, tmp_q2, tmp_q3; @@ 
-783,8 +778,8 @@ void av1_jnt_convolve_2d_copy_neon(const uint8_t *src, int src_stride, (void)filter_params_x; (void)filter_params_y; - (void)subpel_x_q4; - (void)subpel_y_q4; + (void)subpel_x_qn; + (void)subpel_y_qn; if (!(w & 0x07)) { for (y = 0; y < (h >> 2); ++y) { @@ -811,7 +806,7 @@ void av1_jnt_convolve_2d_copy_neon(const uint8_t *src, int src_stride, compute_avg_8x4(tmp_q0, tmp_q1, tmp_q2, tmp_q3, res_q0, res_q1, res_q2, res_q3, conv_params->fwd_offset, conv_params->bck_offset, sub_const_vec, bits, - conv_params->use_jnt_comp_avg, &tmp_shift0, + conv_params->use_dist_wtd_comp_avg, &tmp_shift0, &tmp_shift1, &tmp_shift2, &tmp_shift3); vst1_u8(dst8_1 + (0 * dst8_stride), tmp_shift0); @@ -854,7 +849,7 @@ void av1_jnt_convolve_2d_copy_neon(const uint8_t *src, int src_stride, compute_avg_4x4(tmp4, tmp5, tmp6, tmp7, res4, res5, res6, res7, conv_params->fwd_offset, conv_params->bck_offset, - sub_const_vec, bits, conv_params->use_jnt_comp_avg, + sub_const_vec, bits, conv_params->use_dist_wtd_comp_avg, &tmp_shift0, &tmp_shift1); vst1_lane_u32((uint32_t *)(dst8_1), vreinterpret_u32_u8(tmp_shift0), 0); @@ -881,12 +876,12 @@ void av1_jnt_convolve_2d_copy_neon(const uint8_t *src, int src_stride, } } -void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst8, - int dst8_stride, int w, int h, - const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, - ConvolveParams *conv_params) { +void av1_dist_wtd_convolve_x_neon(const uint8_t *src, int src_stride, + uint8_t *dst8, int dst8_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params) { assert(!(w % 4)); assert(!(h % 4)); @@ -902,14 +897,14 @@ void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst8, 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; const 
uint16_t fwd_offset = conv_params->fwd_offset; const uint16_t bck_offset = conv_params->bck_offset; - const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; (void)filter_params_y; - (void)subpel_y_q4; + (void)subpel_y_qn; // horizontal filter const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( - filter_params_x, subpel_x_q4 & SUBPEL_MASK); + filter_params_x, subpel_x_qn & SUBPEL_MASK); const uint8_t *src_ptr = src - horiz_offset; @@ -1031,8 +1026,8 @@ void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst8, compute_avg_4x4(res4, res5, res6, res7, vreinterpret_u16_s16(d0), vreinterpret_u16_s16(d1), vreinterpret_u16_s16(d2), vreinterpret_u16_s16(d3), fwd_offset, bck_offset, - round_offset_vec, round_bits, use_jnt_comp_avg, &t0, - &t1); + round_offset_vec, round_bits, use_dist_wtd_comp_avg, + &t0, &t1); vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), 0); // 00 01 02 03 @@ -1103,7 +1098,7 @@ void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst8, compute_avg_4x1(res4, vreinterpret_u16_s16(d0), fwd_offset, bck_offset, round_offset_vec, round_bits, - use_jnt_comp_avg, &t0); + use_dist_wtd_comp_avg, &t0); vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), 0); // 00 01 02 03 @@ -1231,11 +1226,12 @@ void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst8, load_u16_8x4(d_tmp, dst_stride, &res8, &res9, &res10, &res11); d_tmp += (dst_stride << 2); - compute_avg_8x4( - res8, res9, res10, res11, vreinterpretq_u16_s16(res0), - vreinterpretq_u16_s16(res1), vreinterpretq_u16_s16(res2), - vreinterpretq_u16_s16(res3), fwd_offset, bck_offset, - round_offset64, round_bits, use_jnt_comp_avg, &t0, &t1, &t2, &t3); + compute_avg_8x4(res8, res9, res10, res11, vreinterpretq_u16_s16(res0), + vreinterpretq_u16_s16(res1), + vreinterpretq_u16_s16(res2), + vreinterpretq_u16_s16(res3), fwd_offset, bck_offset, + round_offset64, 
round_bits, use_dist_wtd_comp_avg, + &t0, &t1, &t2, &t3); store_u8_8x4(d_u8, dst8_stride, t0, t1, t2, t3); d_u8 += (dst8_stride << 2); @@ -1243,11 +1239,12 @@ void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst8, load_u16_8x4(d_tmp, dst_stride, &res8, &res9, &res10, &res11); d_tmp += (dst_stride << 2); - compute_avg_8x4( - res8, res9, res10, res11, vreinterpretq_u16_s16(res4), - vreinterpretq_u16_s16(res5), vreinterpretq_u16_s16(res6), - vreinterpretq_u16_s16(res7), fwd_offset, bck_offset, - round_offset64, round_bits, use_jnt_comp_avg, &t0, &t1, &t2, &t3); + compute_avg_8x4(res8, res9, res10, res11, vreinterpretq_u16_s16(res4), + vreinterpretq_u16_s16(res5), + vreinterpretq_u16_s16(res6), + vreinterpretq_u16_s16(res7), fwd_offset, bck_offset, + round_offset64, round_bits, use_dist_wtd_comp_avg, + &t0, &t1, &t2, &t3); store_u8_8x4(d_u8, dst8_stride, t0, t1, t2, t3); d_u8 += (dst8_stride << 2); @@ -1319,7 +1316,7 @@ void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst8, compute_avg_8x1(res8, vreinterpretq_u16_s16(res0), fwd_offset, bck_offset, round_offset64, round_bits, - use_jnt_comp_avg, &t0); + use_dist_wtd_comp_avg, &t0); vst1_u8(d_u8, t0); d_u8 += (dst8_stride); @@ -1342,12 +1339,12 @@ void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst8, } } -void av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst8, - int dst8_stride, int w, int h, - const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, - ConvolveParams *conv_params) { +void av1_dist_wtd_convolve_y_neon(const uint8_t *src, int src_stride, + uint8_t *dst8, int dst8_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params) { assert(!(w % 4)); assert(!(h % 4)); @@ -1363,15 +1360,15 @@ void 
av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst8, 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; const uint16_t fwd_offset = conv_params->fwd_offset; const uint16_t bck_offset = conv_params->bck_offset; - const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; const int shift_value = (conv_params->round_1 - 1 - bits); (void)filter_params_x; - (void)subpel_x_q4; + (void)subpel_x_qn; // vertical filter const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( - filter_params_y, subpel_y_q4 & SUBPEL_MASK); + filter_params_y, subpel_y_qn & SUBPEL_MASK); const uint8_t *src_ptr = src - (vert_offset * src_stride); @@ -1489,8 +1486,8 @@ void av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst8, compute_avg_4x4(res4, res5, res6, res7, vreinterpret_u16_s16(d0), vreinterpret_u16_s16(d1), vreinterpret_u16_s16(d2), vreinterpret_u16_s16(d3), fwd_offset, bck_offset, - round_offset64, round_bits, use_jnt_comp_avg, &t0, - &t1); + round_offset64, round_bits, use_dist_wtd_comp_avg, + &t0, &t1); vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), 0); d_u8 += dst8_stride; @@ -1535,7 +1532,7 @@ void av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst8, compute_avg_4x1(res4, vreinterpret_u16_s16(d0), fwd_offset, bck_offset, round_offset64, round_bits, - use_jnt_comp_avg, &t0); + use_dist_wtd_comp_avg, &t0); vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), 0); d_u8 += dst8_stride; @@ -1654,11 +1651,12 @@ void av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst8, load_u16_8x4(d_tmp, dst_stride, &res8, &res9, &res10, &res11); d_tmp += (dst_stride << 2); - compute_avg_8x4( - res8, res9, res10, res11, vreinterpretq_u16_s16(res0), - vreinterpretq_u16_s16(res1), vreinterpretq_u16_s16(res2), - vreinterpretq_u16_s16(res3), fwd_offset, bck_offset, - round_offset64, round_bits, use_jnt_comp_avg, &t0, 
&t1, &t2, &t3); + compute_avg_8x4(res8, res9, res10, res11, vreinterpretq_u16_s16(res0), + vreinterpretq_u16_s16(res1), + vreinterpretq_u16_s16(res2), + vreinterpretq_u16_s16(res3), fwd_offset, bck_offset, + round_offset64, round_bits, use_dist_wtd_comp_avg, + &t0, &t1, &t2, &t3); store_u8_8x4(d_u8, dst8_stride, t0, t1, t2, t3); d_u8 += (dst8_stride << 2); @@ -1666,11 +1664,12 @@ void av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst8, load_u16_8x4(d_tmp, dst_stride, &res8, &res9, &res10, &res11); d_tmp += (dst_stride << 2); - compute_avg_8x4( - res8, res9, res10, res11, vreinterpretq_u16_s16(res4), - vreinterpretq_u16_s16(res5), vreinterpretq_u16_s16(res6), - vreinterpretq_u16_s16(res7), fwd_offset, bck_offset, - round_offset64, round_bits, use_jnt_comp_avg, &t0, &t1, &t2, &t3); + compute_avg_8x4(res8, res9, res10, res11, vreinterpretq_u16_s16(res4), + vreinterpretq_u16_s16(res5), + vreinterpretq_u16_s16(res6), + vreinterpretq_u16_s16(res7), fwd_offset, bck_offset, + round_offset64, round_bits, use_dist_wtd_comp_avg, + &t0, &t1, &t2, &t3); store_u8_8x4(d_u8, dst8_stride, t0, t1, t2, t3); d_u8 += (dst8_stride << 2); @@ -1718,7 +1717,7 @@ void av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst8, compute_avg_8x1(res8, vreinterpretq_u16_s16(res0), fwd_offset, bck_offset, round_offset64, round_bits, - use_jnt_comp_avg, &t0); + use_dist_wtd_comp_avg, &t0); vst1_u8(d_u8, t0); d_u8 += (dst8_stride); diff --git a/media/libaom/src/av1/common/arm/mem_neon.h b/media/libaom/src/av1/common/arm/mem_neon.h index c4ae2e784..171055fe1 100644 --- a/media/libaom/src/av1/common/arm/mem_neon.h +++ b/media/libaom/src/av1/common/arm/mem_neon.h @@ -13,6 +13,7 @@ #include <arm_neon.h> #include <string.h> +#include "aom_dsp/aom_dsp_common.h" static INLINE void store_row2_u8_8x8(uint8_t *s, int p, const uint8x8_t s0, const uint8x8_t s1) { @@ -315,6 +316,26 @@ static INLINE void load_s16_8x4(const int16_t *s, ptrdiff_t p, *s3 = vld1q_s16(s); } +// 
Load 4 sets of 4 bytes when alignment is not guaranteed. +static INLINE uint8x16_t load_unaligned_u8q(const uint8_t *buf, int stride) { + uint32_t a; + uint32x4_t a_u32 = vdupq_n_u32(0); + if (stride == 4) return vld1q_u8(buf); + memcpy(&a, buf, 4); + buf += stride; + a_u32 = vsetq_lane_u32(a, a_u32, 0); + memcpy(&a, buf, 4); + buf += stride; + a_u32 = vsetq_lane_u32(a, a_u32, 1); + memcpy(&a, buf, 4); + buf += stride; + a_u32 = vsetq_lane_u32(a, a_u32, 2); + memcpy(&a, buf, 4); + buf += stride; + a_u32 = vsetq_lane_u32(a, a_u32, 3); + return vreinterpretq_u8_u32(a_u32); +} + static INLINE void load_unaligned_u8_4x8(const uint8_t *buf, int stride, uint32x2_t *tu0, uint32x2_t *tu1, uint32x2_t *tu2, uint32x2_t *tu3) { @@ -383,6 +404,15 @@ static INLINE void load_unaligned_u8_4x2(const uint8_t *buf, int stride, *tu0 = vset_lane_u32(a, *tu0, 1); } +/* These intrinsics require immediate values, so we must use #defines + to enforce that. */ +#define store_unaligned_u8_4x1(dst, src, lane) \ + do { \ + uint32_t a; \ + a = vget_lane_u32(vreinterpret_u32_u8(src), lane); \ + memcpy(dst, &a, 4); \ + } while (0) + static INLINE void load_unaligned_u8_2x2(const uint8_t *buf, int stride, uint16x4_t *tu0) { uint16_t a; @@ -491,4 +521,19 @@ static INLINE void store_u32_4x4(uint32_t *s, int32_t p, uint32x4_t s1, vst1q_u32(s, s4); } +static INLINE int16x8_t load_tran_low_to_s16q(const tran_low_t *buf) { + const int32x4_t v0 = vld1q_s32(buf); + const int32x4_t v1 = vld1q_s32(buf + 4); + const int16x4_t s0 = vmovn_s32(v0); + const int16x4_t s1 = vmovn_s32(v1); + return vcombine_s16(s0, s1); +} + +static INLINE void store_s16q_to_tran_low(tran_low_t *buf, const int16x8_t a) { + const int32x4_t v0 = vmovl_s16(vget_low_s16(a)); + const int32x4_t v1 = vmovl_s16(vget_high_s16(a)); + vst1q_s32(buf, v0); + vst1q_s32(buf + 4, v1); +} + #endif // AOM_AV1_COMMON_ARM_MEM_NEON_H_ diff --git a/media/libaom/src/av1/common/arm/selfguided_neon.c b/media/libaom/src/av1/common/arm/selfguided_neon.c 
index b3a37c4cb..fc404a64a 100644 --- a/media/libaom/src/av1/common/arm/selfguided_neon.c +++ b/media/libaom/src/av1/common/arm/selfguided_neon.c @@ -19,8 +19,8 @@ #include "aom_dsp/txfm_common.h" #include "aom_mem/aom_mem.h" #include "aom_ports/mem.h" +#include "av1/common/av1_common_int.h" #include "av1/common/common.h" -#include "av1/common/onyxc_int.h" #include "av1/common/resize.h" #include "av1/common/restoration.h" #include "av1/common/arm/mem_neon.h" @@ -86,7 +86,7 @@ static INLINE void calc_ab_fast_internal_common( for (int x = 0; x < 4; x++) { for (int y = 0; y < 4; y++) { - dst_A16[x * buf_stride + y] = x_by_xplus1[src1[x * buf_stride + y]]; + dst_A16[x * buf_stride + y] = av1_x_by_xplus1[src1[x * buf_stride + y]]; } } load_u16_4x4(dst_A16, buf_stride, &d0, &d1, &d2, &d3); @@ -214,7 +214,7 @@ static INLINE void calc_ab_internal_common( for (int x = 0; x < 4; x++) { for (int y = 0; y < 8; y++) { - dst_A16[x * buf_stride + y] = x_by_xplus1[src1[x * buf_stride + y]]; + dst_A16[x * buf_stride + y] = av1_x_by_xplus1[src1[x * buf_stride + y]]; } } load_u16_8x4(dst_A16, buf_stride, &s16_4, &s16_5, &s16_6, &s16_7); @@ -376,6 +376,21 @@ static INLINE void boxsum2(int16_t *src, const int src_stride, int16_t *dst16, w -= 8; count++; } while (w > 0); + + // memset needed for row pixels as 2nd stage of boxsum filter uses + // first 2 rows of dst16, dst2 buffer which is not filled in first stage. + for (int x = 0; x < 2; x++) { + memset(dst16 + x * dst_stride, 0, (width + 4) * sizeof(*dst16)); + memset(dst2 + x * dst_stride, 0, (width + 4) * sizeof(*dst2)); + } + + // memset needed for extra columns as 2nd stage of boxsum filter uses + // last 2 columns of dst16, dst2 buffer which is not filled in first stage. 
+ for (int x = 2; x < height + 2; x++) { + int dst_offset = x * dst_stride + width + 2; + memset(dst16 + dst_offset, 0, 3 * sizeof(*dst16)); + memset(dst2 + dst_offset, 0, 3 * sizeof(*dst2)); + } } { @@ -467,7 +482,7 @@ static INLINE void calc_ab_internal_lbd(int32_t *A, uint16_t *A16, const uint32_t n = (2 * r + 1) * (2 * r + 1); const uint32x4_t const_n_val = vdupq_n_u32(n); const uint16x8_t sgrproj_sgr = vdupq_n_u16(SGRPROJ_SGR); - const uint16x4_t one_by_n_minus_1_vec = vdup_n_u16(one_by_x[n - 1]); + const uint16x4_t one_by_n_minus_1_vec = vdup_n_u16(av1_one_by_x[n - 1]); const uint32x4_t const_val = vdupq_n_u32(255); uint16x8_t s16_0, s16_1, s16_2, s16_3, s16_4, s16_5, s16_6, s16_7; @@ -509,6 +524,7 @@ static INLINE void calc_ab_internal_lbd(int32_t *A, uint16_t *A16, } while (h > 0); } +#if CONFIG_AV1_HIGHBITDEPTH static INLINE void calc_ab_internal_hbd(int32_t *A, uint16_t *A16, uint16_t *B16, int32_t *B, const int buf_stride, const int width, @@ -522,7 +538,7 @@ static INLINE void calc_ab_internal_hbd(int32_t *A, uint16_t *A16, const int32x4_t bd_min_1_vec = vdupq_n_s32(-((bit_depth - 8) << 1)); const uint32x4_t const_n_val = vdupq_n_u32(n); const uint16x8_t sgrproj_sgr = vdupq_n_u16(SGRPROJ_SGR); - const uint16x4_t one_by_n_minus_1_vec = vdup_n_u16(one_by_x[n - 1]); + const uint16x4_t one_by_n_minus_1_vec = vdup_n_u16(av1_one_by_x[n - 1]); const uint32x4_t const_val = vdupq_n_u32(255); int32x4_t sr0, sr1, sr2, sr3, sr4, sr5, sr6, sr7; @@ -573,6 +589,7 @@ static INLINE void calc_ab_internal_hbd(int32_t *A, uint16_t *A16, h -= (ht_inc * 4); } while (h > 0); } +#endif // CONFIG_AV1_HIGHBITDEPTH static INLINE void calc_ab_fast_internal_lbd(int32_t *A, uint16_t *A16, int32_t *B, const int buf_stride, @@ -584,7 +601,7 @@ static INLINE void calc_ab_fast_internal_lbd(int32_t *A, uint16_t *A16, const uint32_t n = (2 * r + 1) * (2 * r + 1); const uint32x4_t const_n_val = vdupq_n_u32(n); const uint16x4_t sgrproj_sgr = vdup_n_u16(SGRPROJ_SGR); - const uint32x4_t 
one_by_n_minus_1_vec = vdupq_n_u32(one_by_x[n - 1]); + const uint32x4_t one_by_n_minus_1_vec = vdupq_n_u32(av1_one_by_x[n - 1]); const uint32x4_t const_val = vdupq_n_u32(255); int32x4_t sr0, sr1, sr2, sr3, sr4, sr5, sr6, sr7; @@ -626,6 +643,7 @@ static INLINE void calc_ab_fast_internal_lbd(int32_t *A, uint16_t *A16, } while (h > 0); } +#if CONFIG_AV1_HIGHBITDEPTH static INLINE void calc_ab_fast_internal_hbd(int32_t *A, uint16_t *A16, int32_t *B, const int buf_stride, const int width, const int height, @@ -638,7 +656,7 @@ static INLINE void calc_ab_fast_internal_hbd(int32_t *A, uint16_t *A16, const int32x4_t bd_min_1_vec = vdupq_n_s32(-((bit_depth - 8) << 1)); const uint32x4_t const_n_val = vdupq_n_u32(n); const uint16x4_t sgrproj_sgr = vdup_n_u16(SGRPROJ_SGR); - const uint32x4_t one_by_n_minus_1_vec = vdupq_n_u32(one_by_x[n - 1]); + const uint32x4_t one_by_n_minus_1_vec = vdupq_n_u32(av1_one_by_x[n - 1]); const uint32x4_t const_val = vdupq_n_u32(255); int32x4_t sr0, sr1, sr2, sr3, sr4, sr5, sr6, sr7; @@ -679,6 +697,7 @@ static INLINE void calc_ab_fast_internal_hbd(int32_t *A, uint16_t *A16, h -= (ht_inc * 4); } while (h > 0); } +#endif // CONFIG_AV1_HIGHBITDEPTH static INLINE void boxsum1(int16_t *src, const int src_stride, uint16_t *dst1, int32_t *dst2, const int dst_stride, const int width, @@ -788,6 +807,21 @@ static INLINE void boxsum1(int16_t *src, const int src_stride, uint16_t *dst1, w -= 8; count++; } while (w > 0); + + // memset needed for row pixels as 2nd stage of boxsum filter uses + // first 2 rows of dst1, dst2 buffer which is not filled in first stage. + for (int x = 0; x < 2; x++) { + memset(dst1 + x * dst_stride, 0, (width + 4) * sizeof(*dst1)); + memset(dst2 + x * dst_stride, 0, (width + 4) * sizeof(*dst2)); + } + + // memset needed for extra columns as 2nd stage of boxsum filter uses + // last 2 columns of dst1, dst2 buffer which is not filled in first stage. 
+ for (int x = 2; x < height + 2; x++) { + int dst_offset = x * dst_stride + width + 2; + memset(dst1 + dst_offset, 0, 3 * sizeof(*dst1)); + memset(dst2 + dst_offset, 0, 3 * sizeof(*dst2)); + } } { @@ -1145,7 +1179,7 @@ static INLINE void restoration_fast_internal(uint16_t *dgd16, int width, int32_t *dst, int dst_stride, int bit_depth, int sgr_params_idx, int radius_idx) { - const sgr_params_type *const params = &sgr_params[sgr_params_idx]; + const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx]; const int r = params->r[radius_idx]; const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ; const int height_ext = height + 2 * SGRPROJ_BORDER_VERT; @@ -1181,17 +1215,25 @@ static INLINE void restoration_fast_internal(uint16_t *dgd16, int width, // Calculation of a, b. a output is in 16bit tmp_buf which is in range of // [1, 256] for all bit depths. b output is kept in 32bit buffer. - if (8 == bit_depth) { - calc_ab_fast_internal_lbd( - (square_sum_buf - buf_stride - 1), (tmp16_buf - buf_stride - 1), - (sum_buf - buf_stride - 1), buf_stride * 2, width + 2, height + 2, r, - params->s[radius_idx], 2); - } else { +#if CONFIG_AV1_HIGHBITDEPTH + if (bit_depth > 8) { calc_ab_fast_internal_hbd( (square_sum_buf - buf_stride - 1), (tmp16_buf - buf_stride - 1), (sum_buf - buf_stride - 1), buf_stride * 2, width + 2, height + 2, bit_depth, r, params->s[radius_idx], 2); + } else { + calc_ab_fast_internal_lbd( + (square_sum_buf - buf_stride - 1), (tmp16_buf - buf_stride - 1), + (sum_buf - buf_stride - 1), buf_stride * 2, width + 2, height + 2, r, + params->s[radius_idx], 2); } +#else + (void)bit_depth; + calc_ab_fast_internal_lbd((square_sum_buf - buf_stride - 1), + (tmp16_buf - buf_stride - 1), + (sum_buf - buf_stride - 1), buf_stride * 2, + width + 2, height + 2, r, params->s[radius_idx], 2); +#endif final_filter_fast_internal(tmp16_buf, sum_buf, buf_stride, (int16_t *)dgd16, dgd_stride, dst, dst_stride, width, height); } @@ -1200,7 +1242,7 @@ static INLINE void 
restoration_internal(uint16_t *dgd16, int width, int height, int dgd_stride, int32_t *dst, int dst_stride, int bit_depth, int sgr_params_idx, int radius_idx) { - const sgr_params_type *const params = &sgr_params[sgr_params_idx]; + const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx]; const int r = params->r[radius_idx]; const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ; const int height_ext = height + 2 * SGRPROJ_BORDER_VERT; @@ -1235,19 +1277,27 @@ static INLINE void restoration_internal(uint16_t *dgd16, int width, int height, A16 += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ; sum_buf += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ; +#if CONFIG_AV1_HIGHBITDEPTH // Calculation of a, b. a output is in 16bit tmp_buf which is in range of // [1, 256] for all bit depths. b output is kept in 32bit buffer. - if (8 == bit_depth) { - calc_ab_internal_lbd((square_sum_buf - buf_stride - 1), + if (bit_depth > 8) { + calc_ab_internal_hbd((square_sum_buf - buf_stride - 1), (A16 - buf_stride - 1), (sum_buf - buf_stride - 1), (B - buf_stride - 1), buf_stride, width + 2, - height + 2, r, params->s[radius_idx], 1); + height + 2, bit_depth, r, params->s[radius_idx], 1); } else { - calc_ab_internal_hbd((square_sum_buf - buf_stride - 1), + calc_ab_internal_lbd((square_sum_buf - buf_stride - 1), (A16 - buf_stride - 1), (sum_buf - buf_stride - 1), (B - buf_stride - 1), buf_stride, width + 2, - height + 2, bit_depth, r, params->s[radius_idx], 1); + height + 2, r, params->s[radius_idx], 1); } +#else + (void)bit_depth; + calc_ab_internal_lbd((square_sum_buf - buf_stride - 1), + (A16 - buf_stride - 1), (sum_buf - buf_stride - 1), + (B - buf_stride - 1), buf_stride, width + 2, height + 2, + r, params->s[radius_idx], 1); +#endif final_filter_internal(A16, B, buf_stride, (int16_t *)dgd16, dgd_stride, dst, dst_stride, width, height); } @@ -1299,8 +1349,14 @@ static INLINE void src_convert_u8_to_u16(const uint8_t *src, dst_ptr[y + x * dst_stride] = 
src_ptr[y + x * src_stride]; } } + + // memset uninitialized rows of src buffer as they are needed for the + // boxsum filter calculation. + for (int x = height; x < height + 5; x++) + memset(dst + x * dst_stride, 0, (width + 2) * sizeof(*dst)); } +#if CONFIG_AV1_HIGHBITDEPTH static INLINE void src_convert_hbd_copy(const uint16_t *src, int src_stride, uint16_t *dst, const int dst_stride, int width, int height) { @@ -1339,13 +1395,18 @@ static INLINE void src_convert_hbd_copy(const uint16_t *src, int src_stride, memcpy((dst_ptr + x * dst_stride), (src_ptr + x * src_stride), sizeof(uint16_t) * width); } + // memset uninitialized rows of src buffer as they are needed for the + // boxsum filter calculation. + for (int x = height; x < height + 5; x++) + memset(dst + x * dst_stride, 0, (width + 2) * sizeof(*dst)); } +#endif // CONFIG_AV1_HIGHBITDEPTH int av1_selfguided_restoration_neon(const uint8_t *dat8, int width, int height, int stride, int32_t *flt0, int32_t *flt1, int flt_stride, int sgr_params_idx, int bit_depth, int highbd) { - const sgr_params_type *const params = &sgr_params[sgr_params_idx]; + const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx]; assert(!(params->r[0] == 0 && params->r[1] == 0)); uint16_t dgd16_[RESTORATION_PROC_UNIT_PELS]; @@ -1356,6 +1417,7 @@ int av1_selfguided_restoration_neon(const uint8_t *dat8, int width, int height, const int height_ext = height + 2 * SGRPROJ_BORDER_VERT; const int dgd_stride = stride; +#if CONFIG_AV1_HIGHBITDEPTH if (highbd) { const uint16_t *dgd16_tmp = CONVERT_TO_SHORTPTR(dat8); src_convert_hbd_copy( @@ -1370,6 +1432,13 @@ int av1_selfguided_restoration_neon(const uint8_t *dat8, int width, int height, dgd16 - SGRPROJ_BORDER_VERT * dgd16_stride - SGRPROJ_BORDER_HORZ, dgd16_stride, width_ext, height_ext); } +#else + (void)highbd; + src_convert_u8_to_u16( + dat8 - SGRPROJ_BORDER_VERT * dgd_stride - SGRPROJ_BORDER_HORZ, dgd_stride, + dgd16 - SGRPROJ_BORDER_VERT * dgd16_stride - SGRPROJ_BORDER_HORZ, + 
dgd16_stride, width_ext, height_ext); +#endif if (params->r[0] > 0) restoration_fast_internal(dgd16, width, height, dgd16_stride, flt0, @@ -1380,11 +1449,11 @@ int av1_selfguided_restoration_neon(const uint8_t *dat8, int width, int height, return 0; } -void apply_selfguided_restoration_neon(const uint8_t *dat8, int width, - int height, int stride, int eps, - const int *xqd, uint8_t *dst8, - int dst_stride, int32_t *tmpbuf, - int bit_depth, int highbd) { +void av1_apply_selfguided_restoration_neon(const uint8_t *dat8, int width, + int height, int stride, int eps, + const int *xqd, uint8_t *dst8, + int dst_stride, int32_t *tmpbuf, + int bit_depth, int highbd) { int32_t *flt0 = tmpbuf; int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX; assert(width * height <= RESTORATION_UNITPELS_MAX); @@ -1395,11 +1464,12 @@ void apply_selfguided_restoration_neon(const uint8_t *dat8, int width, const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ; const int height_ext = height + 2 * SGRPROJ_BORDER_VERT; const int dgd_stride = stride; - const sgr_params_type *const params = &sgr_params[eps]; + const sgr_params_type *const params = &av1_sgr_params[eps]; int xq[2]; assert(!(params->r[0] == 0 && params->r[1] == 0)); +#if CONFIG_AV1_HIGHBITDEPTH if (highbd) { const uint16_t *dgd16_tmp = CONVERT_TO_SHORTPTR(dat8); src_convert_hbd_copy( @@ -1414,7 +1484,13 @@ void apply_selfguided_restoration_neon(const uint8_t *dat8, int width, dgd16 - SGRPROJ_BORDER_VERT * dgd16_stride - SGRPROJ_BORDER_HORZ, dgd16_stride, width_ext, height_ext); } - +#else + (void)highbd; + src_convert_u8_to_u16( + dat8 - SGRPROJ_BORDER_VERT * dgd_stride - SGRPROJ_BORDER_HORZ, dgd_stride, + dgd16 - SGRPROJ_BORDER_VERT * dgd16_stride - SGRPROJ_BORDER_HORZ, + dgd16_stride, width_ext, height_ext); +#endif if (params->r[0] > 0) restoration_fast_internal(dgd16, width, height, dgd16_stride, flt0, width, bit_depth, eps, 0); @@ -1422,7 +1498,7 @@ void apply_selfguided_restoration_neon(const uint8_t *dat8, int width, 
restoration_internal(dgd16, width, height, dgd16_stride, flt1, width, bit_depth, eps, 1); - decode_xq(xqd, xq, params); + av1_decode_xq(xqd, xq, params); { int16_t *src_ptr; @@ -1485,6 +1561,7 @@ void apply_selfguided_restoration_neon(const uint8_t *dat8, int width, r4 = vreinterpretq_u16_s16(vmaxq_s16(r0, zero)); +#if CONFIG_AV1_HIGHBITDEPTH if (highbd) { r4 = vminq_u16(r4, max); vst1q_u16(dst16_ptr, r4); @@ -1492,6 +1569,11 @@ void apply_selfguided_restoration_neon(const uint8_t *dat8, int width, t0 = vqmovn_u16(r4); vst1_u8(dst_ptr, t0); } +#else + (void)max; + t0 = vqmovn_u16(r4); + vst1_u8(dst_ptr, t0); +#endif w -= 8; count += 8; dst_ptr += 8; diff --git a/media/libaom/src/av1/common/arm/transpose_neon.h b/media/libaom/src/av1/common/arm/transpose_neon.h index 8a3d9f07f..91d89b43f 100644 --- a/media/libaom/src/av1/common/arm/transpose_neon.h +++ b/media/libaom/src/av1/common/arm/transpose_neon.h @@ -250,6 +250,71 @@ static INLINE void transpose_u16_4x8(uint16x4_t *a0, uint16x4_t *a1, vreinterpret_u16_u32(c3.val[1])); } +static INLINE void transpose_s16_4x8(int16x4_t *a0, int16x4_t *a1, + int16x4_t *a2, int16x4_t *a3, + int16x4_t *a4, int16x4_t *a5, + int16x4_t *a6, int16x4_t *a7, + int16x8_t *o0, int16x8_t *o1, + int16x8_t *o2, int16x8_t *o3) { + // Swap 16 bit elements. 
Goes from: + // a0: 00 01 02 03 + // a1: 10 11 12 13 + // a2: 20 21 22 23 + // a3: 30 31 32 33 + // a4: 40 41 42 43 + // a5: 50 51 52 53 + // a6: 60 61 62 63 + // a7: 70 71 72 73 + // to: + // b0.val[0]: 00 10 02 12 + // b0.val[1]: 01 11 03 13 + // b1.val[0]: 20 30 22 32 + // b1.val[1]: 21 31 23 33 + // b2.val[0]: 40 50 42 52 + // b2.val[1]: 41 51 43 53 + // b3.val[0]: 60 70 62 72 + // b3.val[1]: 61 71 63 73 + + int16x4x2_t b0 = vtrn_s16(*a0, *a1); + int16x4x2_t b1 = vtrn_s16(*a2, *a3); + int16x4x2_t b2 = vtrn_s16(*a4, *a5); + int16x4x2_t b3 = vtrn_s16(*a6, *a7); + + // Swap 32 bit elements resulting in: + // c0.val[0]: 00 10 20 30 + // c0.val[1]: 02 12 22 32 + // c1.val[0]: 01 11 21 31 + // c1.val[1]: 03 13 23 33 + // c2.val[0]: 40 50 60 70 + // c2.val[1]: 42 52 62 72 + // c3.val[0]: 41 51 61 71 + // c3.val[1]: 43 53 63 73 + + int32x2x2_t c0 = vtrn_s32(vreinterpret_s32_s16(b0.val[0]), + vreinterpret_s32_s16(b1.val[0])); + int32x2x2_t c1 = vtrn_s32(vreinterpret_s32_s16(b0.val[1]), + vreinterpret_s32_s16(b1.val[1])); + int32x2x2_t c2 = vtrn_s32(vreinterpret_s32_s16(b2.val[0]), + vreinterpret_s32_s16(b3.val[0])); + int32x2x2_t c3 = vtrn_s32(vreinterpret_s32_s16(b2.val[1]), + vreinterpret_s32_s16(b3.val[1])); + + // Swap 64 bit elements resulting in: + // o0: 00 10 20 30 40 50 60 70 + // o1: 01 11 21 31 41 51 61 71 + // o2: 02 12 22 32 42 52 62 72 + // o3: 03 13 23 33 43 53 63 73 + + *o0 = vcombine_s16(vreinterpret_s16_s32(c0.val[0]), + vreinterpret_s16_s32(c2.val[0])); + *o1 = vcombine_s16(vreinterpret_s16_s32(c1.val[0]), + vreinterpret_s16_s32(c3.val[0])); + *o2 = vcombine_s16(vreinterpret_s16_s32(c0.val[1]), + vreinterpret_s16_s32(c2.val[1])); + *o3 = vcombine_s16(vreinterpret_s16_s32(c1.val[1]), + vreinterpret_s16_s32(c3.val[1])); +} + static INLINE void transpose_u16_8x8(uint16x8_t *a0, uint16x8_t *a1, uint16x8_t *a2, uint16x8_t *a3, uint16x8_t *a4, uint16x8_t *a5, @@ -386,7 +451,7 @@ static INLINE void transpose_s16_8x8(int16x8_t *a0, int16x8_t *a1, 
vget_high_s16(vreinterpretq_s16_s32(c3.val[1]))); } -static INLINE int16x8x2_t vpx_vtrnq_s64_to_s16(int32x4_t a0, int32x4_t a1) { +static INLINE int16x8x2_t aom_vtrnq_s64_to_s16(int32x4_t a0, int32x4_t a1) { int16x8x2_t b0; b0.val[0] = vcombine_s16(vreinterpret_s16_s32(vget_low_s32(a0)), vreinterpret_s16_s32(vget_low_s32(a1))); @@ -448,10 +513,10 @@ static INLINE void transpose_s16_8x8q(int16x8_t *a0, int16x8_t *out) { // d2.val[1]: 06 16 26 36 46 56 66 76 // d3.val[0]: 03 13 23 33 43 53 63 73 // d3.val[1]: 07 17 27 37 47 57 67 77 - const int16x8x2_t d0 = vpx_vtrnq_s64_to_s16(c0.val[0], c2.val[0]); - const int16x8x2_t d1 = vpx_vtrnq_s64_to_s16(c1.val[0], c3.val[0]); - const int16x8x2_t d2 = vpx_vtrnq_s64_to_s16(c0.val[1], c2.val[1]); - const int16x8x2_t d3 = vpx_vtrnq_s64_to_s16(c1.val[1], c3.val[1]); + const int16x8x2_t d0 = aom_vtrnq_s64_to_s16(c0.val[0], c2.val[0]); + const int16x8x2_t d1 = aom_vtrnq_s64_to_s16(c1.val[0], c3.val[0]); + const int16x8x2_t d2 = aom_vtrnq_s64_to_s16(c0.val[1], c2.val[1]); + const int16x8x2_t d3 = aom_vtrnq_s64_to_s16(c1.val[1], c3.val[1]); *out = d0.val[0]; *(out + 1) = d1.val[0]; diff --git a/media/libaom/src/av1/common/arm/warp_plane_neon.c b/media/libaom/src/av1/common/arm/warp_plane_neon.c index 7f02d42a7..c10a34fcd 100644 --- a/media/libaom/src/av1/common/arm/warp_plane_neon.c +++ b/media/libaom/src/av1/common/arm/warp_plane_neon.c @@ -20,7 +20,7 @@ #include "av1/common/warped_motion.h" #include "av1/common/scale.h" -/* This is a modified version of 'warped_filter' from warped_motion.c: +/* This is a modified version of 'av1_warped_filter' from warped_motion.c: * Each coefficient is stored in 8 bits instead of 16 bits * The coefficients are rearranged in the column order 0, 2, 4, 6, 1, 3, 5, 7 @@ -333,22 +333,22 @@ static INLINE void vertical_filter_neon(const int16x8_t *src, c3 = vtrnq_s32(vreinterpretq_s32_s16(b3.val[0]), vreinterpretq_s32_s16(b3.val[1])); - f0 = vld1q_s16( - (int16_t *)(warped_filter + ((sy + 0 * gamma) >> 
WARPEDDIFF_PREC_BITS))); - f1 = vld1q_s16( - (int16_t *)(warped_filter + ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS))); - f2 = vld1q_s16( - (int16_t *)(warped_filter + ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS))); - f3 = vld1q_s16( - (int16_t *)(warped_filter + ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS))); - f4 = vld1q_s16( - (int16_t *)(warped_filter + ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS))); - f5 = vld1q_s16( - (int16_t *)(warped_filter + ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS))); - f6 = vld1q_s16( - (int16_t *)(warped_filter + ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS))); - f7 = vld1q_s16( - (int16_t *)(warped_filter + ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS))); + f0 = vld1q_s16((int16_t *)(av1_warped_filter + + ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS))); + f1 = vld1q_s16((int16_t *)(av1_warped_filter + + ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS))); + f2 = vld1q_s16((int16_t *)(av1_warped_filter + + ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS))); + f3 = vld1q_s16((int16_t *)(av1_warped_filter + + ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS))); + f4 = vld1q_s16((int16_t *)(av1_warped_filter + + ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS))); + f5 = vld1q_s16((int16_t *)(av1_warped_filter + + ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS))); + f6 = vld1q_s16((int16_t *)(av1_warped_filter + + ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS))); + f7 = vld1q_s16((int16_t *)(av1_warped_filter + + ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS))); d0 = vtrnq_s32(vreinterpretq_s32_s16(f0), vreinterpretq_s32_s16(f2)); d1 = vtrnq_s32(vreinterpretq_s32_s16(f4), vreinterpretq_s32_s16(f6)); @@ -640,7 +640,7 @@ void av1_warp_affine_neon(const int32_t *mat, const uint8_t *ref, int width, uint16x4_t tmp16_lo = vld1_u16(p); int32x4_t tmp32_lo = vreinterpretq_s32_u32(vmovl_u16(tmp16_lo)); int16x4_t tmp16_low; - if (conv_params->use_jnt_comp_avg) { + if (conv_params->use_dist_wtd_comp_avg) { res_lo = vmulq_s32(res_lo, bwd); tmp32_lo = vmulq_s32(tmp32_lo, fwd); tmp32_lo = 
vaddq_s32(tmp32_lo, res_lo); @@ -671,7 +671,7 @@ void av1_warp_affine_neon(const int32_t *mat, const uint8_t *ref, int width, uint16x4_t tmp16_hi = vld1_u16(p4); int32x4_t tmp32_hi = vreinterpretq_s32_u32(vmovl_u16(tmp16_hi)); int16x4_t tmp16_high; - if (conv_params->use_jnt_comp_avg) { + if (conv_params->use_dist_wtd_comp_avg) { res_hi = vmulq_s32(res_hi, bwd); tmp32_hi = vmulq_s32(tmp32_hi, fwd); tmp32_hi = vaddq_s32(tmp32_hi, res_hi); diff --git a/media/libaom/src/av1/common/av1_common_int.h b/media/libaom/src/av1/common/av1_common_int.h new file mode 100644 index 000000000..0403405e9 --- /dev/null +++ b/media/libaom/src/av1/common/av1_common_int.h @@ -0,0 +1,1557 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AV1_COMMON_AV1_COMMON_INT_H_ +#define AOM_AV1_COMMON_AV1_COMMON_INT_H_ + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "aom/internal/aom_codec_internal.h" +#include "aom_util/aom_thread.h" +#include "av1/common/alloccommon.h" +#include "av1/common/av1_loopfilter.h" +#include "av1/common/entropy.h" +#include "av1/common/entropymode.h" +#include "av1/common/entropymv.h" +#include "av1/common/enums.h" +#include "av1/common/frame_buffers.h" +#include "av1/common/mv.h" +#include "av1/common/quant_common.h" +#include "av1/common/restoration.h" +#include "av1/common/tile_common.h" +#include "av1/common/timing.h" +#include "av1/common/odintrin.h" +#include "av1/encoder/hash_motion.h" +#include "aom_dsp/grain_synthesis.h" +#include "aom_dsp/grain_table.h" +#ifdef __cplusplus +extern "C" { +#endif + +#if defined(__clang__) && defined(__has_warning) +#if __has_feature(cxx_attributes) && __has_warning("-Wimplicit-fallthrough") +#define AOM_FALLTHROUGH_INTENDED [[clang::fallthrough]] // NOLINT +#endif +#elif defined(__GNUC__) && __GNUC__ >= 7 +#define AOM_FALLTHROUGH_INTENDED __attribute__((fallthrough)) // NOLINT +#endif + +#ifndef AOM_FALLTHROUGH_INTENDED +#define AOM_FALLTHROUGH_INTENDED \ + do { \ + } while (0) +#endif + +#define CDEF_MAX_STRENGTHS 16 + +/* Constant values while waiting for the sequence header */ +#define FRAME_ID_LENGTH 15 +#define DELTA_FRAME_ID_LENGTH 14 + +#define FRAME_CONTEXTS (FRAME_BUFFERS + 1) +// Extra frame context which is always kept at default values +#define FRAME_CONTEXT_DEFAULTS (FRAME_CONTEXTS - 1) +#define PRIMARY_REF_BITS 3 +#define PRIMARY_REF_NONE 7 + +#define NUM_PING_PONG_BUFFERS 2 + +#define MAX_NUM_TEMPORAL_LAYERS 8 +#define MAX_NUM_SPATIAL_LAYERS 4 +/* clang-format off */ +// clang-format seems to think this is a pointer dereference and not a +// multiplication. 
+#define MAX_NUM_OPERATING_POINTS \ + (MAX_NUM_TEMPORAL_LAYERS * MAX_NUM_SPATIAL_LAYERS) +/* clang-format on */ + +// TODO(jingning): Turning this on to set up transform coefficient +// processing timer. +#define TXCOEFF_TIMER 0 +#define TXCOEFF_COST_TIMER 0 + +enum { + SINGLE_REFERENCE = 0, + COMPOUND_REFERENCE = 1, + REFERENCE_MODE_SELECT = 2, + REFERENCE_MODES = 3, +} UENUM1BYTE(REFERENCE_MODE); + +enum { + /** + * Frame context updates are disabled + */ + REFRESH_FRAME_CONTEXT_DISABLED, + /** + * Update frame context to values resulting from backward probability + * updates based on entropy/counts in the decoded frame + */ + REFRESH_FRAME_CONTEXT_BACKWARD, +} UENUM1BYTE(REFRESH_FRAME_CONTEXT_MODE); + +#define MFMV_STACK_SIZE 3 +typedef struct { + int_mv mfmv0; + uint8_t ref_frame_offset; +} TPL_MV_REF; + +typedef struct { + int_mv mv; + MV_REFERENCE_FRAME ref_frame; +} MV_REF; + +typedef struct RefCntBuffer { + // For a RefCntBuffer, the following are reference-holding variables: + // - cm->ref_frame_map[] + // - cm->cur_frame + // - cm->scaled_ref_buf[] (encoder only) + // - pbi->output_frame_index[] (decoder only) + // With that definition, 'ref_count' is the number of reference-holding + // variables that are currently referencing this buffer. + // For example: + // - suppose this buffer is at index 'k' in the buffer pool, and + // - Total 'n' of the variables / array elements above have value 'k' (that + // is, they are pointing to buffer at index 'k'). + // Then, pool->frame_bufs[k].ref_count = n. + int ref_count; + + unsigned int order_hint; + unsigned int ref_order_hints[INTER_REFS_PER_FRAME]; + + // These variables are used only in encoder and compare the absolute + // display order hint to compute the relative distance and overcome + // the limitation of get_relative_dist() which returns incorrect + // distance when a very old frame is used as a reference. 
+ unsigned int display_order_hint; + unsigned int ref_display_order_hint[INTER_REFS_PER_FRAME]; + + MV_REF *mvs; + uint8_t *seg_map; + struct segmentation seg; + int mi_rows; + int mi_cols; + // Width and height give the size of the buffer (before any upscaling, unlike + // the sizes that can be derived from the buf structure) + int width; + int height; + WarpedMotionParams global_motion[REF_FRAMES]; + int showable_frame; // frame can be used as show existing frame in future + uint8_t film_grain_params_present; + aom_film_grain_t film_grain_params; + aom_codec_frame_buffer_t raw_frame_buffer; + YV12_BUFFER_CONFIG buf; + FRAME_TYPE frame_type; + + // This is only used in the encoder but needs to be indexed per ref frame + // so it's extremely convenient to keep it here. + int interp_filter_selected[SWITCHABLE]; + + // Inter frame reference frame delta for loop filter + int8_t ref_deltas[REF_FRAMES]; + + // 0 = ZERO_MV, MV + int8_t mode_deltas[MAX_MODE_LF_DELTAS]; + + FRAME_CONTEXT frame_context; +} RefCntBuffer; + +typedef struct BufferPool { +// Protect BufferPool from being accessed by several FrameWorkers at +// the same time during frame parallel decode. +// TODO(hkuang): Try to use atomic variable instead of locking the whole pool. +// TODO(wtc): Remove this. See +// https://chromium-review.googlesource.com/c/webm/libvpx/+/560630. +#if CONFIG_MULTITHREAD + pthread_mutex_t pool_mutex; +#endif + + // Private data associated with the frame buffer callbacks. + void *cb_priv; + + aom_get_frame_buffer_cb_fn_t get_fb_cb; + aom_release_frame_buffer_cb_fn_t release_fb_cb; + + RefCntBuffer frame_bufs[FRAME_BUFFERS]; + + // Frame buffers allocated internally by the codec. 
+ InternalFrameBufferList int_frame_buffers; +} BufferPool; + +typedef struct { + int cdef_damping; + int nb_cdef_strengths; + int cdef_strengths[CDEF_MAX_STRENGTHS]; + int cdef_uv_strengths[CDEF_MAX_STRENGTHS]; + int cdef_bits; +} CdefInfo; + +typedef struct { + int delta_q_present_flag; + // Resolution of delta quant + int delta_q_res; + int delta_lf_present_flag; + // Resolution of delta lf level + int delta_lf_res; + // This is a flag for number of deltas of loop filter level + // 0: use 1 delta, for y_vertical, y_horizontal, u, and v + // 1: use separate deltas for each filter level + int delta_lf_multi; +} DeltaQInfo; + +typedef struct { + int enable_order_hint; // 0 - disable order hint, and related tools + int order_hint_bits_minus_1; // dist_wtd_comp, ref_frame_mvs, + // frame_sign_bias + // if 0, enable_dist_wtd_comp and + // enable_ref_frame_mvs must be set as 0. + int enable_dist_wtd_comp; // 0 - disable dist-wtd compound modes + // 1 - enable it + int enable_ref_frame_mvs; // 0 - disable ref frame mvs + // 1 - enable it +} OrderHintInfo; + +// Sequence header structure. +// Note: All syntax elements of sequence_header_obu that need to be +// bit-identical across multiple sequence headers must be part of this struct, +// so that consistency is checked by are_seq_headers_consistent() function. +// One exception is the last member 'op_params' that is ignored by +// are_seq_headers_consistent() function. +typedef struct SequenceHeader { + int num_bits_width; + int num_bits_height; + int max_frame_width; + int max_frame_height; + uint8_t frame_id_numbers_present_flag; + int frame_id_length; + int delta_frame_id_length; + BLOCK_SIZE sb_size; // Size of the superblock used for this frame + int mib_size; // Size of the superblock in units of MI blocks + int mib_size_log2; // Log 2 of above. 
+ + OrderHintInfo order_hint_info; + + uint8_t force_screen_content_tools; // 0 - force off + // 1 - force on + // 2 - adaptive + uint8_t still_picture; // Video is a single frame still picture + uint8_t reduced_still_picture_hdr; // Use reduced header for still picture + uint8_t force_integer_mv; // 0 - Don't force. MV can use subpel + // 1 - force to integer + // 2 - adaptive + uint8_t enable_filter_intra; // enables/disables filterintra + uint8_t enable_intra_edge_filter; // enables/disables edge upsampling + uint8_t enable_interintra_compound; // enables/disables interintra_compound + uint8_t enable_masked_compound; // enables/disables masked compound + uint8_t enable_dual_filter; // 0 - disable dual interpolation filter + // 1 - enable vert/horz filter selection + uint8_t enable_warped_motion; // 0 - disable warp for the sequence + // 1 - enable warp for the sequence + uint8_t enable_superres; // 0 - Disable superres for the sequence + // and no frame level superres flag + // 1 - Enable superres for the sequence + // enable per-frame superres flag + uint8_t enable_cdef; // To turn on/off CDEF + uint8_t enable_restoration; // To turn on/off loop restoration + BITSTREAM_PROFILE profile; + + // Color config. + aom_bit_depth_t bit_depth; // AOM_BITS_8 in profile 0 or 1, + // AOM_BITS_10 or AOM_BITS_12 in profile 2 or 3. + uint8_t use_highbitdepth; // If true, we need to use 16bit frame buffers. + uint8_t monochrome; // Monochorme video + aom_color_primaries_t color_primaries; + aom_transfer_characteristics_t transfer_characteristics; + aom_matrix_coefficients_t matrix_coefficients; + int color_range; + int subsampling_x; // Chroma subsampling for x + int subsampling_y; // Chroma subsampling for y + aom_chroma_sample_position_t chroma_sample_position; + uint8_t separate_uv_delta_q; + uint8_t film_grain_params_present; + + // Operating point info. 
+ int operating_points_cnt_minus_1; + int operating_point_idc[MAX_NUM_OPERATING_POINTS]; + int timing_info_present; + aom_timing_info_t timing_info; + uint8_t decoder_model_info_present_flag; + aom_dec_model_info_t decoder_model_info; + uint8_t display_model_info_present_flag; + AV1_LEVEL seq_level_idx[MAX_NUM_OPERATING_POINTS]; + uint8_t tier[MAX_NUM_OPERATING_POINTS]; // seq_tier in spec. One bit: 0 or 1. + + // IMPORTANT: the op_params member must be at the end of the struct so that + // are_seq_headers_consistent() can be implemented with a memcmp() call. + // TODO(urvang): We probably don't need the +1 here. + aom_dec_model_op_parameters_t op_params[MAX_NUM_OPERATING_POINTS + 1]; +} SequenceHeader; + +typedef struct { + int skip_mode_allowed; + int skip_mode_flag; + int ref_frame_idx_0; + int ref_frame_idx_1; +} SkipModeInfo; + +typedef struct { + FRAME_TYPE frame_type; + REFERENCE_MODE reference_mode; + + unsigned int order_hint; + unsigned int display_order_hint; + unsigned int frame_number; + SkipModeInfo skip_mode_info; + int refresh_frame_flags; // Which ref frames are overwritten by this frame + int frame_refs_short_signaling; +} CurrentFrame; + +// Struct containing some frame level features. +typedef struct { + bool disable_cdf_update; + bool allow_high_precision_mv; + bool cur_frame_force_integer_mv; // 0 the default in AOM, 1 only integer + bool allow_screen_content_tools; + bool allow_intrabc; + bool allow_warped_motion; + // Whether to use previous frames' motion vectors for prediction. + bool allow_ref_frame_mvs; + bool coded_lossless; // frame is fully lossless at the coded resolution. + bool all_lossless; // frame is fully lossless at the upscaled resolution. 
+ bool reduced_tx_set_used; + bool error_resilient_mode; + bool switchable_motion_mode; + TX_MODE tx_mode; + InterpFilter interp_filter; + int primary_ref_frame; + int byte_alignment; + // Flag signaling how frame contexts should be updated at the end of + // a frame decode + REFRESH_FRAME_CONTEXT_MODE refresh_frame_context; +} FeatureFlags; + +// Struct containing params related to tiles. +typedef struct CommonTileParams { + int cols; // number of tile columns that frame is divided into + int rows; // number of tile rows that frame is divided into + int max_width_sb; // maximum tile width in superblock units. + int max_height_sb; // maximum tile height in superblock units. + // Min width of non-rightmost tile in MI units. Only valid if cols > 1. + int min_inner_width; + + // If true, tiles are uniformly spaced with power-of-two number of rows and + // columns. + // If false, tiles have explicitly configured widths and heights. + int uniform_spacing; + + // Following members are only valid when uniform_spacing == 1 + int log2_cols; // log2 of 'cols'. + int log2_rows; // log2 of 'rows'. + int width; // tile width in MI units + int height; // tile height in MI units + // End of members that are only valid when uniform_spacing == 1 + + // Min num of tile columns possible based on 'max_width_sb' and frame width. + int min_log2_cols; + // Min num of tile rows possible based on 'max_height_sb' and frame height. + int min_log2_rows; + // Min num of tile columns possible based on frame width. + int max_log2_cols; + // Max num of tile columns possible based on frame width. + int max_log2_rows; + // log2 of min number of tiles (same as min_log2_cols + min_log2_rows). + int min_log2; + // col_start_sb[i] is the start position of tile column i in superblock units. + // valid for 0 <= i <= cols + int col_start_sb[MAX_TILE_COLS + 1]; + // row_start_sb[i] is the start position of tile row i in superblock units. 
+ // valid for 0 <= i <= rows + int row_start_sb[MAX_TILE_ROWS + 1]; + // If true, we are using large scale tile mode. + unsigned int large_scale; + // Only relevant when large_scale == 1. + // If true, the independent decoding of a single tile or a section of a frame + // is allowed. + unsigned int single_tile_decoding; +} CommonTileParams; + +// Struct containing params related to MB_MODE_INFO arrays and related info. +typedef struct CommonModeInfoParams CommonModeInfoParams; +struct CommonModeInfoParams { + // Number of rows/cols in the frame in 16 pixel units. + // This is computed from frame width and height aligned to a multiple of 8. + int mb_rows; + int mb_cols; + // Total MBs = mb_rows * mb_cols. + int MBs; + + // Number of rows/cols in the frame in 4 pixel (MB_MODE_INFO) units. + // This is computed from frame width and height aligned to a multiple of 8. + int mi_rows; + int mi_cols; + + // An array of MB_MODE_INFO structs for every 'mi_alloc_bsize' sized block + // in the frame. + // Note: This array should be treated like a scratch memory, and should NOT be + // accessed directly, in most cases. Please use 'mi_grid_base' array instead. + MB_MODE_INFO *mi_alloc; + // Number of allocated elements in 'mi_alloc'. + int mi_alloc_size; + // Stride for 'mi_alloc' array. + int mi_alloc_stride; + // The minimum block size that each element in 'mi_alloc' can correspond to. + // For decoder, this is always BLOCK_4X4. + // For encoder, this is currently set to BLOCK_4X4 for resolution < 4k, + // and BLOCK_8X8 for resolution >= 4k. + BLOCK_SIZE mi_alloc_bsize; + + // Grid of pointers to 4x4 MB_MODE_INFO structs allocated in 'mi_alloc'. + // It's possible that: + // - Multiple pointers in the grid point to the same element in 'mi_alloc' + // (for example, for all 4x4 blocks that belong to the same partition block). + // - Some pointers can be NULL (for example, for blocks outside visible area). 
+ MB_MODE_INFO **mi_grid_base; + // Number of allocated elements in 'mi_grid_base' (and 'tx_type_map' also). + int mi_grid_size; + // Stride for 'mi_grid_base' (and 'tx_type_map' also). + int mi_stride; + + // An array of tx types for each 4x4 block in the frame. + // Number of allocated elements is same as 'mi_grid_size', and stride is + // same as 'mi_grid_size'. So, indexing into 'tx_type_map' is same as that of + // 'mi_grid_base'. + TX_TYPE *tx_type_map; + + // Function pointers to allow separate logic for encoder and decoder. + void (*free_mi)(struct CommonModeInfoParams *mi_params); + void (*setup_mi)(struct CommonModeInfoParams *mi_params); + void (*set_mb_mi)(struct CommonModeInfoParams *mi_params, int width, + int height); +}; + +// Parameters related to quantization at the frame level. +typedef struct CommonQuantParams CommonQuantParams; +struct CommonQuantParams { + // Base qindex of the frame in the range 0 to 255. + int base_qindex; + + // Delta of qindex (from base_qindex) for Y plane DC coefficient. + // Note: y_ac_delta_q is implicitly 0. + int y_dc_delta_q; + + // Delta of qindex (from base_qindex) for U plane DC and AC coefficients. + int u_dc_delta_q; + int v_dc_delta_q; + + // Delta of qindex (from base_qindex) for V plane DC and AC coefficients. + // Same as those for U plane if cm->seq_params.separate_uv_delta_q == 0. + int u_ac_delta_q; + int v_ac_delta_q; + + // Note: The qindex per superblock may have a delta from the qindex obtained + // at frame level from parameters above, based on 'cm->delta_q_info'. + + // The dequantizers below are true dequantizers used only in the + // dequantization process. They have the same coefficient + // shift/scale as TX. 
+ int16_t y_dequant_QTX[MAX_SEGMENTS][2]; + int16_t u_dequant_QTX[MAX_SEGMENTS][2]; + int16_t v_dequant_QTX[MAX_SEGMENTS][2]; + + // Global quant matrix tables + const qm_val_t *giqmatrix[NUM_QM_LEVELS][3][TX_SIZES_ALL]; + const qm_val_t *gqmatrix[NUM_QM_LEVELS][3][TX_SIZES_ALL]; + + // Local quant matrix tables for each frame + const qm_val_t *y_iqmatrix[MAX_SEGMENTS][TX_SIZES_ALL]; + const qm_val_t *u_iqmatrix[MAX_SEGMENTS][TX_SIZES_ALL]; + const qm_val_t *v_iqmatrix[MAX_SEGMENTS][TX_SIZES_ALL]; + + // Flag indicating whether quantization matrices are being used: + // - If true, qm_level_y, qm_level_u and qm_level_v indicate the level + // indices to be used to access appropriate global quant matrix tables. + // - If false, we implicitly use level index 'NUM_QM_LEVELS - 1'. + bool using_qmatrix; + int qmatrix_level_y; + int qmatrix_level_u; + int qmatrix_level_v; +}; + +// Context used for transmitting various symbols in the bistream. +typedef struct CommonContexts CommonContexts; +struct CommonContexts { + // Context used by 'FRAME_CONTEXT.partition_cdf' to transmit partition type. + // partition[i][j] is the context for ith tile row, jth mi_col. + PARTITION_CONTEXT **partition; + + // Context used to derive context for multiple symbols: + // - 'TXB_CTX.txb_skip_ctx' used by 'FRAME_CONTEXT.txb_skip_cdf' to transmit + // to transmit skip_txfm flag. + // - 'TXB_CTX.dc_sign_ctx' used by 'FRAME_CONTEXT.dc_sign_cdf' to transmit + // sign. + // entropy[i][j][k] is the context for ith plane, jth tile row, kth mi_col. + ENTROPY_CONTEXT **entropy[MAX_MB_PLANE]; + + // Context used to derive context for 'FRAME_CONTEXT.txfm_partition_cdf' to + // transmit 'is_split' flag to indicate if this transform block should be + // split into smaller sub-blocks. + // txfm[i][j] is the context for ith tile row, jth mi_col. + TXFM_CONTEXT **txfm; + + // Dimensions that were used to allocate the arrays above. + // If these dimensions change, the arrays may have to be re-allocated. 
+ int num_planes; // Corresponds to av1_num_planes(cm) + int num_tile_rows; // Corresponds to cm->tiles.row + int num_mi_cols; // Corresponds to cm->mi_params.mi_cols +}; + +typedef struct AV1Common { + // Information about the current frame that is being coded. + CurrentFrame current_frame; + // Code and details about current error status. + struct aom_internal_error_info error; + + // AV1 allows two types of frame scaling operations: + // (1) Frame super-resolution: that allows coding a frame at lower resolution + // and after decoding the frame, normatively uscales and restores the frame -- + // inside the coding loop. + // (2) Frame resize: that allows coding frame at lower/higher resolution, and + // then non-normatively upscale the frame at the time of rendering -- outside + // the coding loop. + // Hence, the need for 3 types of dimensions. + + // Coded frame dimensions. + int width; + int height; + + // Rendered frame dimensions, after applying both super-resolution and resize + // to the coded frame. + // Different from coded dimensions if super-resolution and/or resize are + // being used for this frame. + int render_width; + int render_height; + + // Frame dimensions after applying super-resolution to the coded frame (if + // present), but before applying resize. + // Larger than the coded dimensions if super-resolution is being used for + // this frame. + // Different from rendered dimensions if resize is being used for this frame. + int superres_upscaled_width; + int superres_upscaled_height; + + // The denominator of the superres scale used by this frame. + // Note: The numerator is fixed to be SCALE_NUMERATOR. + uint8_t superres_scale_denominator; + + // If true, buffer removal times are present. + bool buffer_removal_time_present; + // buffer_removal_times[op_num] specifies the frame removal time in units of + // DecCT clock ticks counted from the removal time of the last random access + // point for operating point op_num. 
+ // TODO(urvang): We probably don't need the +1 here. + uint32_t buffer_removal_times[MAX_NUM_OPERATING_POINTS + 1]; + // Presentation time of the frame in clock ticks DispCT counted from the + // removal time of the last random access point for the operating point that + // is being decoded. + uint32_t frame_presentation_time; + + // Buffer where previous frame is stored. + RefCntBuffer *prev_frame; + + // Buffer into which the current frame will be stored and other related info. + // TODO(hkuang): Combine this with cur_buf in macroblockd. + RefCntBuffer *cur_frame; + + // For encoder, we have a two-level mapping from reference frame type to the + // corresponding buffer in the buffer pool: + // * 'remapped_ref_idx[i - 1]' maps reference type 'i' (range: LAST_FRAME ... + // EXTREF_FRAME) to a remapped index 'j' (in range: 0 ... REF_FRAMES - 1) + // * Later, 'cm->ref_frame_map[j]' maps the remapped index 'j' to a pointer to + // the reference counted buffer structure RefCntBuffer, taken from the buffer + // pool cm->buffer_pool->frame_bufs. + // + // LAST_FRAME, ..., EXTREF_FRAME + // | | + // v v + // remapped_ref_idx[LAST_FRAME - 1], ..., remapped_ref_idx[EXTREF_FRAME - 1] + // | | + // v v + // ref_frame_map[], ..., ref_frame_map[] + // + // Note: INTRA_FRAME always refers to the current frame, so there's no need to + // have a remapped index for the same. + int remapped_ref_idx[REF_FRAMES]; + + // Scale of the current frame with respect to itself. + // This is currently used for intra block copy, which behaves like an inter + // prediction mode, where the reference frame is the current frame itself. + struct scale_factors sf_identity; + + // Scale factors of the reference frame with respect to the current frame. + // This is required for generating inter prediction and will be non-identity + // for a reference frame, if it has different dimensions than the coded + // dimensions of the current frame. 
+ struct scale_factors ref_scale_factors[REF_FRAMES]; + + // For decoder, ref_frame_map[i] maps reference type 'i' to a pointer to + // the buffer in the buffer pool 'cm->buffer_pool.frame_bufs'. + // For encoder, ref_frame_map[j] (where j = remapped_ref_idx[i]) maps + // remapped reference index 'j' (that is, original reference type 'i') to + // a pointer to the buffer in the buffer pool 'cm->buffer_pool.frame_bufs'. + RefCntBuffer *ref_frame_map[REF_FRAMES]; + + // If true, this frame is actually shown after decoding. + // If false, this frame is coded in the bitstream, but not shown. It is only + // used as a reference for other frames coded later. + int show_frame; + + // If true, this frame can be used as a show-existing frame for other frames + // coded later. + // When 'show_frame' is true, this is always true for all non-keyframes. + // When 'show_frame' is false, this value is transmitted in the bitstream. + int showable_frame; + + // If true, show an existing frame coded before, instead of actually coding a + // frame. The existing frame comes from one of the existing reference buffers, + // as signaled in the bitstream. + int show_existing_frame; + + // Whether some features are allowed or not. + FeatureFlags features; + + // Params related to MB_MODE_INFO arrays and related info. + CommonModeInfoParams mi_params; + +#if CONFIG_ENTROPY_STATS + int coef_cdf_category; +#endif + // Quantization params. + CommonQuantParams quant_params; + + // Segmentation info for current frame. + struct segmentation seg; + + // Segmentation map for previous frame. + uint8_t *last_frame_seg_map; + + // Deblocking filter parameters. + loop_filter_info_n lf_info; + struct loopfilter lf; + + // Loop Restoration filter parameters. + RestorationInfo rst_info[MAX_MB_PLANE]; // Loop Restoration filter info. + int32_t *rst_tmpbuf; // Scratch buffer for self-guided restoration filter. + RestorationLineBuffers *rlbs; // Line buffers required by loop restoration. 
+ YV12_BUFFER_CONFIG rst_frame; // Stores the output of loop restoration. + + // CDEF (Constrained Directional Enhancement Filter) parameters. + CdefInfo cdef_info; + + // Parameters for film grain synthesis. + aom_film_grain_t film_grain_params; + + // Parameters for delta quantization and delta loop filter level. + DeltaQInfo delta_q_info; + + // Global motion parameters for each reference frame. + WarpedMotionParams global_motion[REF_FRAMES]; + + // Elements part of the sequence header, that are applicable for all the + // frames in the video. + SequenceHeader seq_params; + + // Current CDFs of all the symbols for the current frame. + FRAME_CONTEXT *fc; + // Default CDFs used when features.primary_ref_frame = PRIMARY_REF_NONE + // (e.g. for a keyframe). These default CDFs are defined by the bitstream and + // copied from default CDF tables for each symbol. + FRAME_CONTEXT *default_frame_context; + + // Parameters related to tiling. + CommonTileParams tiles; + + // External BufferPool passed from outside. + BufferPool *buffer_pool; + + // Above context buffers and their sizes. + // Note: above contexts are allocated in this struct, as their size is + // dependent on frame width, while left contexts are declared and allocated in + // MACROBLOCKD struct, as they have a fixed size. + CommonContexts above_contexts; + + // When cm->seq_params.frame_id_numbers_present_flag == 1, current and + // reference frame IDs are signaled in the bitstream. + int current_frame_id; + int ref_frame_id[REF_FRAMES]; + + // Motion vectors provided by motion field estimation. + // tpl_mvs[row * stride + col] stores MV for block at [mi_row, mi_col] where: + // mi_row = 2 * row, + // mi_col = 2 * col, and + // stride = cm->mi_params.mi_stride / 2 + TPL_MV_REF *tpl_mvs; + // Allocated size of 'tpl_mvs' array. Refer to 'ensure_mv_buffer()' function. 
+ int tpl_mvs_mem_size; + // ref_frame_sign_bias[k] is 1 if relative distance between reference 'k' and + // current frame is positive; and 0 otherwise. + int ref_frame_sign_bias[REF_FRAMES]; + // ref_frame_side[k] is 1 if relative distance between reference 'k' and + // current frame is positive, -1 if relative distance is 0; and 0 otherwise. + // TODO(jingning): This can be combined with sign_bias later. + int8_t ref_frame_side[REF_FRAMES]; + + // Number of temporal layers: may be > 1 for SVC (scalable vector coding). + unsigned int number_temporal_layers; + // Temporal layer ID of this frame + // (in the range 0 ... (number_temporal_layers - 1)). + int temporal_layer_id; + + // Number of spatial layers: may be > 1 for SVC (scalable vector coding). + unsigned int number_spatial_layers; + // Spatial layer ID of this frame + // (in the range 0 ... (number_spatial_layers - 1)). + int spatial_layer_id; + +#if TXCOEFF_TIMER + int64_t cum_txcoeff_timer; + int64_t txcoeff_timer; + int txb_count; +#endif // TXCOEFF_TIMER + +#if TXCOEFF_COST_TIMER + int64_t cum_txcoeff_cost_timer; + int64_t txcoeff_cost_timer; + int64_t txcoeff_cost_count; +#endif // TXCOEFF_COST_TIMER + +#if CONFIG_LPF_MASK + int is_decoding; +#endif // CONFIG_LPF_MASK +} AV1_COMMON; + +// TODO(hkuang): Don't need to lock the whole pool after implementing atomic +// frame reference count. 
+static void lock_buffer_pool(BufferPool *const pool) { +#if CONFIG_MULTITHREAD + pthread_mutex_lock(&pool->pool_mutex); +#else + (void)pool; +#endif +} + +static void unlock_buffer_pool(BufferPool *const pool) { +#if CONFIG_MULTITHREAD + pthread_mutex_unlock(&pool->pool_mutex); +#else + (void)pool; +#endif +} + +static INLINE YV12_BUFFER_CONFIG *get_ref_frame(AV1_COMMON *cm, int index) { + if (index < 0 || index >= REF_FRAMES) return NULL; + if (cm->ref_frame_map[index] == NULL) return NULL; + return &cm->ref_frame_map[index]->buf; +} + +static INLINE int get_free_fb(AV1_COMMON *cm) { + RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs; + int i; + + lock_buffer_pool(cm->buffer_pool); + for (i = 0; i < FRAME_BUFFERS; ++i) + if (frame_bufs[i].ref_count == 0) break; + + if (i != FRAME_BUFFERS) { + if (frame_bufs[i].buf.use_external_reference_buffers) { + // If this frame buffer's y_buffer, u_buffer, and v_buffer point to the + // external reference buffers. Restore the buffer pointers to point to the + // internally allocated memory. + YV12_BUFFER_CONFIG *ybf = &frame_bufs[i].buf; + ybf->y_buffer = ybf->store_buf_adr[0]; + ybf->u_buffer = ybf->store_buf_adr[1]; + ybf->v_buffer = ybf->store_buf_adr[2]; + ybf->use_external_reference_buffers = 0; + } + + frame_bufs[i].ref_count = 1; + } else { + // We should never run out of free buffers. If this assertion fails, there + // is a reference leak. + assert(0 && "Ran out of free frame buffers. Likely a reference leak."); + // Reset i to be INVALID_IDX to indicate no free buffer found. 
+ i = INVALID_IDX; + } + + unlock_buffer_pool(cm->buffer_pool); + return i; +} + +static INLINE RefCntBuffer *assign_cur_frame_new_fb(AV1_COMMON *const cm) { + // Release the previously-used frame-buffer + if (cm->cur_frame != NULL) { + --cm->cur_frame->ref_count; + cm->cur_frame = NULL; + } + + // Assign a new framebuffer + const int new_fb_idx = get_free_fb(cm); + if (new_fb_idx == INVALID_IDX) return NULL; + + cm->cur_frame = &cm->buffer_pool->frame_bufs[new_fb_idx]; + cm->cur_frame->buf.buf_8bit_valid = 0; + av1_zero(cm->cur_frame->interp_filter_selected); + return cm->cur_frame; +} + +// Modify 'lhs_ptr' to reference the buffer at 'rhs_ptr', and update the ref +// counts accordingly. +static INLINE void assign_frame_buffer_p(RefCntBuffer **lhs_ptr, + RefCntBuffer *rhs_ptr) { + RefCntBuffer *const old_ptr = *lhs_ptr; + if (old_ptr != NULL) { + assert(old_ptr->ref_count > 0); + // One less reference to the buffer at 'old_ptr', so decrease ref count. + --old_ptr->ref_count; + } + + *lhs_ptr = rhs_ptr; + // One more reference to the buffer at 'rhs_ptr', so increase ref count. + ++rhs_ptr->ref_count; +} + +static INLINE int frame_is_intra_only(const AV1_COMMON *const cm) { + return cm->current_frame.frame_type == KEY_FRAME || + cm->current_frame.frame_type == INTRA_ONLY_FRAME; +} + +static INLINE int frame_is_sframe(const AV1_COMMON *cm) { + return cm->current_frame.frame_type == S_FRAME; +} + +// These functions take a reference frame label between LAST_FRAME and +// EXTREF_FRAME inclusive. Note that this is different to the indexing +// previously used by the frame_refs[] array. +static INLINE int get_ref_frame_map_idx(const AV1_COMMON *const cm, + const MV_REFERENCE_FRAME ref_frame) { + return (ref_frame >= LAST_FRAME && ref_frame <= EXTREF_FRAME) + ? 
cm->remapped_ref_idx[ref_frame - LAST_FRAME] + : INVALID_IDX; +} + +static INLINE RefCntBuffer *get_ref_frame_buf( + const AV1_COMMON *const cm, const MV_REFERENCE_FRAME ref_frame) { + const int map_idx = get_ref_frame_map_idx(cm, ref_frame); + return (map_idx != INVALID_IDX) ? cm->ref_frame_map[map_idx] : NULL; +} + +// Both const and non-const versions of this function are provided so that it +// can be used with a const AV1_COMMON if needed. +static INLINE const struct scale_factors *get_ref_scale_factors_const( + const AV1_COMMON *const cm, const MV_REFERENCE_FRAME ref_frame) { + const int map_idx = get_ref_frame_map_idx(cm, ref_frame); + return (map_idx != INVALID_IDX) ? &cm->ref_scale_factors[map_idx] : NULL; +} + +static INLINE struct scale_factors *get_ref_scale_factors( + AV1_COMMON *const cm, const MV_REFERENCE_FRAME ref_frame) { + const int map_idx = get_ref_frame_map_idx(cm, ref_frame); + return (map_idx != INVALID_IDX) ? &cm->ref_scale_factors[map_idx] : NULL; +} + +static INLINE RefCntBuffer *get_primary_ref_frame_buf( + const AV1_COMMON *const cm) { + const int primary_ref_frame = cm->features.primary_ref_frame; + if (primary_ref_frame == PRIMARY_REF_NONE) return NULL; + const int map_idx = get_ref_frame_map_idx(cm, primary_ref_frame + 1); + return (map_idx != INVALID_IDX) ? cm->ref_frame_map[map_idx] : NULL; +} + +// Returns 1 if this frame might allow mvs from some reference frame. 
+static INLINE int frame_might_allow_ref_frame_mvs(const AV1_COMMON *cm) { + return !cm->features.error_resilient_mode && + cm->seq_params.order_hint_info.enable_ref_frame_mvs && + cm->seq_params.order_hint_info.enable_order_hint && + !frame_is_intra_only(cm); +} + +// Returns 1 if this frame might use warped_motion +static INLINE int frame_might_allow_warped_motion(const AV1_COMMON *cm) { + return !cm->features.error_resilient_mode && !frame_is_intra_only(cm) && + cm->seq_params.enable_warped_motion; +} + +static INLINE void ensure_mv_buffer(RefCntBuffer *buf, AV1_COMMON *cm) { + const int buf_rows = buf->mi_rows; + const int buf_cols = buf->mi_cols; + const CommonModeInfoParams *const mi_params = &cm->mi_params; + + if (buf->mvs == NULL || buf_rows != mi_params->mi_rows || + buf_cols != mi_params->mi_cols) { + aom_free(buf->mvs); + buf->mi_rows = mi_params->mi_rows; + buf->mi_cols = mi_params->mi_cols; + CHECK_MEM_ERROR(cm, buf->mvs, + (MV_REF *)aom_calloc(((mi_params->mi_rows + 1) >> 1) * + ((mi_params->mi_cols + 1) >> 1), + sizeof(*buf->mvs))); + aom_free(buf->seg_map); + CHECK_MEM_ERROR( + cm, buf->seg_map, + (uint8_t *)aom_calloc(mi_params->mi_rows * mi_params->mi_cols, + sizeof(*buf->seg_map))); + } + + const int mem_size = + ((mi_params->mi_rows + MAX_MIB_SIZE) >> 1) * (mi_params->mi_stride >> 1); + int realloc = cm->tpl_mvs == NULL; + if (cm->tpl_mvs) realloc |= cm->tpl_mvs_mem_size < mem_size; + + if (realloc) { + aom_free(cm->tpl_mvs); + CHECK_MEM_ERROR(cm, cm->tpl_mvs, + (TPL_MV_REF *)aom_calloc(mem_size, sizeof(*cm->tpl_mvs))); + cm->tpl_mvs_mem_size = mem_size; + } +} + +void cfl_init(CFL_CTX *cfl, const SequenceHeader *seq_params); + +static INLINE int av1_num_planes(const AV1_COMMON *cm) { + return cm->seq_params.monochrome ? 
1 : MAX_MB_PLANE; +} + +static INLINE void av1_init_above_context(CommonContexts *above_contexts, + int num_planes, int tile_row, + MACROBLOCKD *xd) { + for (int i = 0; i < num_planes; ++i) { + xd->above_entropy_context[i] = above_contexts->entropy[i][tile_row]; + } + xd->above_partition_context = above_contexts->partition[tile_row]; + xd->above_txfm_context = above_contexts->txfm[tile_row]; +} + +static INLINE void av1_init_macroblockd(AV1_COMMON *cm, MACROBLOCKD *xd, + tran_low_t *dqcoeff) { + const int num_planes = av1_num_planes(cm); + const CommonQuantParams *const quant_params = &cm->quant_params; + + for (int i = 0; i < num_planes; ++i) { + xd->plane[i].dqcoeff = dqcoeff; + + if (xd->plane[i].plane_type == PLANE_TYPE_Y) { + memcpy(xd->plane[i].seg_dequant_QTX, quant_params->y_dequant_QTX, + sizeof(quant_params->y_dequant_QTX)); + memcpy(xd->plane[i].seg_iqmatrix, quant_params->y_iqmatrix, + sizeof(quant_params->y_iqmatrix)); + + } else { + if (i == AOM_PLANE_U) { + memcpy(xd->plane[i].seg_dequant_QTX, quant_params->u_dequant_QTX, + sizeof(quant_params->u_dequant_QTX)); + memcpy(xd->plane[i].seg_iqmatrix, quant_params->u_iqmatrix, + sizeof(quant_params->u_iqmatrix)); + } else { + memcpy(xd->plane[i].seg_dequant_QTX, quant_params->v_dequant_QTX, + sizeof(quant_params->v_dequant_QTX)); + memcpy(xd->plane[i].seg_iqmatrix, quant_params->v_iqmatrix, + sizeof(quant_params->v_iqmatrix)); + } + } + } + xd->mi_stride = cm->mi_params.mi_stride; + xd->error_info = &cm->error; + cfl_init(&xd->cfl, &cm->seq_params); +} + +static INLINE void set_entropy_context(MACROBLOCKD *xd, int mi_row, int mi_col, + const int num_planes) { + int i; + int row_offset = mi_row; + int col_offset = mi_col; + for (i = 0; i < num_planes; ++i) { + struct macroblockd_plane *const pd = &xd->plane[i]; + // Offset the buffer pointer + const BLOCK_SIZE bsize = xd->mi[0]->sb_type; + if (pd->subsampling_y && (mi_row & 0x01) && (mi_size_high[bsize] == 1)) + row_offset = mi_row - 1; + if 
(pd->subsampling_x && (mi_col & 0x01) && (mi_size_wide[bsize] == 1)) + col_offset = mi_col - 1; + int above_idx = col_offset; + int left_idx = row_offset & MAX_MIB_MASK; + pd->above_entropy_context = + &xd->above_entropy_context[i][above_idx >> pd->subsampling_x]; + pd->left_entropy_context = + &xd->left_entropy_context[i][left_idx >> pd->subsampling_y]; + } +} + +static INLINE int calc_mi_size(int len) { + // len is in mi units. Align to a multiple of SBs. + return ALIGN_POWER_OF_TWO(len, MAX_MIB_SIZE_LOG2); +} + +static INLINE void set_plane_n4(MACROBLOCKD *const xd, int bw, int bh, + const int num_planes) { + int i; + for (i = 0; i < num_planes; i++) { + xd->plane[i].width = (bw * MI_SIZE) >> xd->plane[i].subsampling_x; + xd->plane[i].height = (bh * MI_SIZE) >> xd->plane[i].subsampling_y; + + xd->plane[i].width = AOMMAX(xd->plane[i].width, 4); + xd->plane[i].height = AOMMAX(xd->plane[i].height, 4); + } +} + +static INLINE void set_mi_row_col(MACROBLOCKD *xd, const TileInfo *const tile, + int mi_row, int bh, int mi_col, int bw, + int mi_rows, int mi_cols) { + xd->mb_to_top_edge = -GET_MV_SUBPEL(mi_row * MI_SIZE); + xd->mb_to_bottom_edge = GET_MV_SUBPEL((mi_rows - bh - mi_row) * MI_SIZE); + xd->mb_to_left_edge = -GET_MV_SUBPEL((mi_col * MI_SIZE)); + xd->mb_to_right_edge = GET_MV_SUBPEL((mi_cols - bw - mi_col) * MI_SIZE); + + xd->mi_row = mi_row; + xd->mi_col = mi_col; + + // Are edges available for intra prediction? 
+ xd->up_available = (mi_row > tile->mi_row_start); + + const int ss_x = xd->plane[1].subsampling_x; + const int ss_y = xd->plane[1].subsampling_y; + + xd->left_available = (mi_col > tile->mi_col_start); + xd->chroma_up_available = xd->up_available; + xd->chroma_left_available = xd->left_available; + if (ss_x && bw < mi_size_wide[BLOCK_8X8]) + xd->chroma_left_available = (mi_col - 1) > tile->mi_col_start; + if (ss_y && bh < mi_size_high[BLOCK_8X8]) + xd->chroma_up_available = (mi_row - 1) > tile->mi_row_start; + if (xd->up_available) { + xd->above_mbmi = xd->mi[-xd->mi_stride]; + } else { + xd->above_mbmi = NULL; + } + + if (xd->left_available) { + xd->left_mbmi = xd->mi[-1]; + } else { + xd->left_mbmi = NULL; + } + + const int chroma_ref = ((mi_row & 0x01) || !(bh & 0x01) || !ss_y) && + ((mi_col & 0x01) || !(bw & 0x01) || !ss_x); + xd->is_chroma_ref = chroma_ref; + if (chroma_ref) { + // To help calculate the "above" and "left" chroma blocks, note that the + // current block may cover multiple luma blocks (eg, if partitioned into + // 4x4 luma blocks). + // First, find the top-left-most luma block covered by this chroma block + MB_MODE_INFO **base_mi = + &xd->mi[-(mi_row & ss_y) * xd->mi_stride - (mi_col & ss_x)]; + + // Then, we consider the luma region covered by the left or above 4x4 chroma + // prediction. We want to point to the chroma reference block in that + // region, which is the bottom-right-most mi unit. + // This leads to the following offsets: + MB_MODE_INFO *chroma_above_mi = + xd->chroma_up_available ? base_mi[-xd->mi_stride + ss_x] : NULL; + xd->chroma_above_mbmi = chroma_above_mi; + + MB_MODE_INFO *chroma_left_mi = + xd->chroma_left_available ? base_mi[ss_y * xd->mi_stride - 1] : NULL; + xd->chroma_left_mbmi = chroma_left_mi; + } + + xd->height = bh; + xd->width = bw; + xd->is_sec_rect = 0; + if (xd->width < xd->height) { + // Only mark is_sec_rect as 1 for the last block. 
+ // For PARTITION_VERT_4, it would be (0, 0, 0, 1); + // For other partitions, it would be (0, 1). + if (!((mi_col + xd->width) & (xd->height - 1))) xd->is_sec_rect = 1; + } + + if (xd->width > xd->height) + if (mi_row & (xd->width - 1)) xd->is_sec_rect = 1; +} + +static INLINE aom_cdf_prob *get_y_mode_cdf(FRAME_CONTEXT *tile_ctx, + const MB_MODE_INFO *above_mi, + const MB_MODE_INFO *left_mi) { + const PREDICTION_MODE above = av1_above_block_mode(above_mi); + const PREDICTION_MODE left = av1_left_block_mode(left_mi); + const int above_ctx = intra_mode_context[above]; + const int left_ctx = intra_mode_context[left]; + return tile_ctx->kf_y_cdf[above_ctx][left_ctx]; +} + +static INLINE void update_partition_context(MACROBLOCKD *xd, int mi_row, + int mi_col, BLOCK_SIZE subsize, + BLOCK_SIZE bsize) { + PARTITION_CONTEXT *const above_ctx = xd->above_partition_context + mi_col; + PARTITION_CONTEXT *const left_ctx = + xd->left_partition_context + (mi_row & MAX_MIB_MASK); + + const int bw = mi_size_wide[bsize]; + const int bh = mi_size_high[bsize]; + memset(above_ctx, partition_context_lookup[subsize].above, bw); + memset(left_ctx, partition_context_lookup[subsize].left, bh); +} + +static INLINE int is_chroma_reference(int mi_row, int mi_col, BLOCK_SIZE bsize, + int subsampling_x, int subsampling_y) { + assert(bsize < BLOCK_SIZES_ALL); + const int bw = mi_size_wide[bsize]; + const int bh = mi_size_high[bsize]; + int ref_pos = ((mi_row & 0x01) || !(bh & 0x01) || !subsampling_y) && + ((mi_col & 0x01) || !(bw & 0x01) || !subsampling_x); + return ref_pos; +} + +static INLINE aom_cdf_prob cdf_element_prob(const aom_cdf_prob *cdf, + size_t element) { + assert(cdf != NULL); + return (element > 0 ? 
cdf[element - 1] : CDF_PROB_TOP) - cdf[element]; +} + +static INLINE void partition_gather_horz_alike(aom_cdf_prob *out, + const aom_cdf_prob *const in, + BLOCK_SIZE bsize) { + (void)bsize; + out[0] = CDF_PROB_TOP; + out[0] -= cdf_element_prob(in, PARTITION_HORZ); + out[0] -= cdf_element_prob(in, PARTITION_SPLIT); + out[0] -= cdf_element_prob(in, PARTITION_HORZ_A); + out[0] -= cdf_element_prob(in, PARTITION_HORZ_B); + out[0] -= cdf_element_prob(in, PARTITION_VERT_A); + if (bsize != BLOCK_128X128) out[0] -= cdf_element_prob(in, PARTITION_HORZ_4); + out[0] = AOM_ICDF(out[0]); + out[1] = AOM_ICDF(CDF_PROB_TOP); +} + +static INLINE void partition_gather_vert_alike(aom_cdf_prob *out, + const aom_cdf_prob *const in, + BLOCK_SIZE bsize) { + (void)bsize; + out[0] = CDF_PROB_TOP; + out[0] -= cdf_element_prob(in, PARTITION_VERT); + out[0] -= cdf_element_prob(in, PARTITION_SPLIT); + out[0] -= cdf_element_prob(in, PARTITION_HORZ_A); + out[0] -= cdf_element_prob(in, PARTITION_VERT_A); + out[0] -= cdf_element_prob(in, PARTITION_VERT_B); + if (bsize != BLOCK_128X128) out[0] -= cdf_element_prob(in, PARTITION_VERT_4); + out[0] = AOM_ICDF(out[0]); + out[1] = AOM_ICDF(CDF_PROB_TOP); +} + +static INLINE void update_ext_partition_context(MACROBLOCKD *xd, int mi_row, + int mi_col, BLOCK_SIZE subsize, + BLOCK_SIZE bsize, + PARTITION_TYPE partition) { + if (bsize >= BLOCK_8X8) { + const int hbs = mi_size_wide[bsize] / 2; + BLOCK_SIZE bsize2 = get_partition_subsize(bsize, PARTITION_SPLIT); + switch (partition) { + case PARTITION_SPLIT: + if (bsize != BLOCK_8X8) break; + AOM_FALLTHROUGH_INTENDED; + case PARTITION_NONE: + case PARTITION_HORZ: + case PARTITION_VERT: + case PARTITION_HORZ_4: + case PARTITION_VERT_4: + update_partition_context(xd, mi_row, mi_col, subsize, bsize); + break; + case PARTITION_HORZ_A: + update_partition_context(xd, mi_row, mi_col, bsize2, subsize); + update_partition_context(xd, mi_row + hbs, mi_col, subsize, subsize); + break; + case PARTITION_HORZ_B: + 
update_partition_context(xd, mi_row, mi_col, subsize, subsize); + update_partition_context(xd, mi_row + hbs, mi_col, bsize2, subsize); + break; + case PARTITION_VERT_A: + update_partition_context(xd, mi_row, mi_col, bsize2, subsize); + update_partition_context(xd, mi_row, mi_col + hbs, subsize, subsize); + break; + case PARTITION_VERT_B: + update_partition_context(xd, mi_row, mi_col, subsize, subsize); + update_partition_context(xd, mi_row, mi_col + hbs, bsize2, subsize); + break; + default: assert(0 && "Invalid partition type"); + } + } +} + +static INLINE int partition_plane_context(const MACROBLOCKD *xd, int mi_row, + int mi_col, BLOCK_SIZE bsize) { + const PARTITION_CONTEXT *above_ctx = xd->above_partition_context + mi_col; + const PARTITION_CONTEXT *left_ctx = + xd->left_partition_context + (mi_row & MAX_MIB_MASK); + // Minimum partition point is 8x8. Offset the bsl accordingly. + const int bsl = mi_size_wide_log2[bsize] - mi_size_wide_log2[BLOCK_8X8]; + int above = (*above_ctx >> bsl) & 1, left = (*left_ctx >> bsl) & 1; + + assert(mi_size_wide_log2[bsize] == mi_size_high_log2[bsize]); + assert(bsl >= 0); + + return (left * 2 + above) + bsl * PARTITION_PLOFFSET; +} + +// Return the number of elements in the partition CDF when +// partitioning the (square) block with luma block size of bsize. +static INLINE int partition_cdf_length(BLOCK_SIZE bsize) { + if (bsize <= BLOCK_8X8) + return PARTITION_TYPES; + else if (bsize == BLOCK_128X128) + return EXT_PARTITION_TYPES - 2; + else + return EXT_PARTITION_TYPES; +} + +static INLINE int max_block_wide(const MACROBLOCKD *xd, BLOCK_SIZE bsize, + int plane) { + assert(bsize < BLOCK_SIZES_ALL); + int max_blocks_wide = block_size_wide[bsize]; + + if (xd->mb_to_right_edge < 0) { + const struct macroblockd_plane *const pd = &xd->plane[plane]; + max_blocks_wide += xd->mb_to_right_edge >> (3 + pd->subsampling_x); + } + + // Scale the width in the transform block unit. 
+ return max_blocks_wide >> MI_SIZE_LOG2; +} + +static INLINE int max_block_high(const MACROBLOCKD *xd, BLOCK_SIZE bsize, + int plane) { + int max_blocks_high = block_size_high[bsize]; + + if (xd->mb_to_bottom_edge < 0) { + const struct macroblockd_plane *const pd = &xd->plane[plane]; + max_blocks_high += xd->mb_to_bottom_edge >> (3 + pd->subsampling_y); + } + + // Scale the height in the transform block unit. + return max_blocks_high >> MI_SIZE_LOG2; +} + +static INLINE void av1_zero_above_context(AV1_COMMON *const cm, + const MACROBLOCKD *xd, + int mi_col_start, int mi_col_end, + const int tile_row) { + const SequenceHeader *const seq_params = &cm->seq_params; + const int num_planes = av1_num_planes(cm); + const int width = mi_col_end - mi_col_start; + const int aligned_width = + ALIGN_POWER_OF_TWO(width, seq_params->mib_size_log2); + const int offset_y = mi_col_start; + const int width_y = aligned_width; + const int offset_uv = offset_y >> seq_params->subsampling_x; + const int width_uv = width_y >> seq_params->subsampling_x; + CommonContexts *const above_contexts = &cm->above_contexts; + + av1_zero_array(above_contexts->entropy[0][tile_row] + offset_y, width_y); + if (num_planes > 1) { + if (above_contexts->entropy[1][tile_row] && + above_contexts->entropy[2][tile_row]) { + av1_zero_array(above_contexts->entropy[1][tile_row] + offset_uv, + width_uv); + av1_zero_array(above_contexts->entropy[2][tile_row] + offset_uv, + width_uv); + } else { + aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME, + "Invalid value of planes"); + } + } + + av1_zero_array(above_contexts->partition[tile_row] + mi_col_start, + aligned_width); + + memset(above_contexts->txfm[tile_row] + mi_col_start, + tx_size_wide[TX_SIZES_LARGEST], aligned_width * sizeof(TXFM_CONTEXT)); +} + +static INLINE void av1_zero_left_context(MACROBLOCKD *const xd) { + av1_zero(xd->left_entropy_context); + av1_zero(xd->left_partition_context); + + memset(xd->left_txfm_context_buffer, 
tx_size_high[TX_SIZES_LARGEST], + sizeof(xd->left_txfm_context_buffer)); +} + +// Disable array-bounds checks as the TX_SIZE enum contains values larger than +// TX_SIZES_ALL (TX_INVALID) which make extending the array as a workaround +// infeasible. The assert is enough for static analysis and this or other tools +// asan, valgrind would catch oob access at runtime. +#if defined(__GNUC__) && __GNUC__ >= 4 +#pragma GCC diagnostic ignored "-Warray-bounds" +#endif + +#if defined(__GNUC__) && __GNUC__ >= 4 +#pragma GCC diagnostic warning "-Warray-bounds" +#endif + +static INLINE void set_txfm_ctx(TXFM_CONTEXT *txfm_ctx, uint8_t txs, int len) { + int i; + for (i = 0; i < len; ++i) txfm_ctx[i] = txs; +} + +static INLINE void set_txfm_ctxs(TX_SIZE tx_size, int n4_w, int n4_h, int skip, + const MACROBLOCKD *xd) { + uint8_t bw = tx_size_wide[tx_size]; + uint8_t bh = tx_size_high[tx_size]; + + if (skip) { + bw = n4_w * MI_SIZE; + bh = n4_h * MI_SIZE; + } + + set_txfm_ctx(xd->above_txfm_context, bw, n4_w); + set_txfm_ctx(xd->left_txfm_context, bh, n4_h); +} + +static INLINE int get_mi_grid_idx(const CommonModeInfoParams *const mi_params, + int mi_row, int mi_col) { + return mi_row * mi_params->mi_stride + mi_col; +} + +static INLINE int get_alloc_mi_idx(const CommonModeInfoParams *const mi_params, + int mi_row, int mi_col) { + const int mi_alloc_size_1d = mi_size_wide[mi_params->mi_alloc_bsize]; + const int mi_alloc_row = mi_row / mi_alloc_size_1d; + const int mi_alloc_col = mi_col / mi_alloc_size_1d; + + return mi_alloc_row * mi_params->mi_alloc_stride + mi_alloc_col; +} + +// For this partition block, set pointers in mi_params->mi_grid_base and xd->mi. +static INLINE void set_mi_offsets(const CommonModeInfoParams *const mi_params, + MACROBLOCKD *const xd, int mi_row, + int mi_col) { + // 'mi_grid_base' should point to appropriate memory in 'mi'. 
+ const int mi_grid_idx = get_mi_grid_idx(mi_params, mi_row, mi_col); + const int mi_alloc_idx = get_alloc_mi_idx(mi_params, mi_row, mi_col); + mi_params->mi_grid_base[mi_grid_idx] = &mi_params->mi_alloc[mi_alloc_idx]; + // 'xd->mi' should point to an offset in 'mi_grid_base'; + xd->mi = mi_params->mi_grid_base + mi_grid_idx; + // 'xd->tx_type_map' should point to an offset in 'mi_params->tx_type_map'. + xd->tx_type_map = mi_params->tx_type_map + mi_grid_idx; + xd->tx_type_map_stride = mi_params->mi_stride; +} + +static INLINE void txfm_partition_update(TXFM_CONTEXT *above_ctx, + TXFM_CONTEXT *left_ctx, + TX_SIZE tx_size, TX_SIZE txb_size) { + BLOCK_SIZE bsize = txsize_to_bsize[txb_size]; + int bh = mi_size_high[bsize]; + int bw = mi_size_wide[bsize]; + uint8_t txw = tx_size_wide[tx_size]; + uint8_t txh = tx_size_high[tx_size]; + int i; + for (i = 0; i < bh; ++i) left_ctx[i] = txh; + for (i = 0; i < bw; ++i) above_ctx[i] = txw; +} + +static INLINE TX_SIZE get_sqr_tx_size(int tx_dim) { + switch (tx_dim) { + case 128: + case 64: return TX_64X64; break; + case 32: return TX_32X32; break; + case 16: return TX_16X16; break; + case 8: return TX_8X8; break; + default: return TX_4X4; + } +} + +static INLINE TX_SIZE get_tx_size(int width, int height) { + if (width == height) { + return get_sqr_tx_size(width); + } + if (width < height) { + if (width + width == height) { + switch (width) { + case 4: return TX_4X8; break; + case 8: return TX_8X16; break; + case 16: return TX_16X32; break; + case 32: return TX_32X64; break; + } + } else { + switch (width) { + case 4: return TX_4X16; break; + case 8: return TX_8X32; break; + case 16: return TX_16X64; break; + } + } + } else { + if (height + height == width) { + switch (height) { + case 4: return TX_8X4; break; + case 8: return TX_16X8; break; + case 16: return TX_32X16; break; + case 32: return TX_64X32; break; + } + } else { + switch (height) { + case 4: return TX_16X4; break; + case 8: return TX_32X8; break; + case 16: return 
TX_64X16; break; + } + } + } + assert(0); + return TX_4X4; +} + +static INLINE int txfm_partition_context(const TXFM_CONTEXT *const above_ctx, + const TXFM_CONTEXT *const left_ctx, + BLOCK_SIZE bsize, TX_SIZE tx_size) { + const uint8_t txw = tx_size_wide[tx_size]; + const uint8_t txh = tx_size_high[tx_size]; + const int above = *above_ctx < txw; + const int left = *left_ctx < txh; + int category = TXFM_PARTITION_CONTEXTS; + + // dummy return, not used by others. + if (tx_size <= TX_4X4) return 0; + + TX_SIZE max_tx_size = + get_sqr_tx_size(AOMMAX(block_size_wide[bsize], block_size_high[bsize])); + + if (max_tx_size >= TX_8X8) { + category = + (txsize_sqr_up_map[tx_size] != max_tx_size && max_tx_size > TX_8X8) + + (TX_SIZES - 1 - max_tx_size) * 2; + } + assert(category != TXFM_PARTITION_CONTEXTS); + return category * 3 + above + left; +} + +// Compute the next partition in the direction of the sb_type stored in the mi +// array, starting with bsize. +static INLINE PARTITION_TYPE get_partition(const AV1_COMMON *const cm, + int mi_row, int mi_col, + BLOCK_SIZE bsize) { + const CommonModeInfoParams *const mi_params = &cm->mi_params; + if (mi_row >= mi_params->mi_rows || mi_col >= mi_params->mi_cols) + return PARTITION_INVALID; + + const int offset = mi_row * mi_params->mi_stride + mi_col; + MB_MODE_INFO **mi = mi_params->mi_grid_base + offset; + const BLOCK_SIZE subsize = mi[0]->sb_type; + + if (subsize == bsize) return PARTITION_NONE; + + const int bhigh = mi_size_high[bsize]; + const int bwide = mi_size_wide[bsize]; + const int sshigh = mi_size_high[subsize]; + const int sswide = mi_size_wide[subsize]; + + if (bsize > BLOCK_8X8 && mi_row + bwide / 2 < mi_params->mi_rows && + mi_col + bhigh / 2 < mi_params->mi_cols) { + // In this case, the block might be using an extended partition + // type. 
+ const MB_MODE_INFO *const mbmi_right = mi[bwide / 2]; + const MB_MODE_INFO *const mbmi_below = mi[bhigh / 2 * mi_params->mi_stride]; + + if (sswide == bwide) { + // Smaller height but same width. Is PARTITION_HORZ_4, PARTITION_HORZ or + // PARTITION_HORZ_B. To distinguish the latter two, check if the lower + // half was split. + if (sshigh * 4 == bhigh) return PARTITION_HORZ_4; + assert(sshigh * 2 == bhigh); + + if (mbmi_below->sb_type == subsize) + return PARTITION_HORZ; + else + return PARTITION_HORZ_B; + } else if (sshigh == bhigh) { + // Smaller width but same height. Is PARTITION_VERT_4, PARTITION_VERT or + // PARTITION_VERT_B. To distinguish the latter two, check if the right + // half was split. + if (sswide * 4 == bwide) return PARTITION_VERT_4; + assert(sswide * 2 == bhigh); + + if (mbmi_right->sb_type == subsize) + return PARTITION_VERT; + else + return PARTITION_VERT_B; + } else { + // Smaller width and smaller height. Might be PARTITION_SPLIT or could be + // PARTITION_HORZ_A or PARTITION_VERT_A. If subsize isn't halved in both + // dimensions, we immediately know this is a split (which will recurse to + // get to subsize). Otherwise look down and to the right. With + // PARTITION_VERT_A, the right block will have height bhigh; with + // PARTITION_HORZ_A, the lower block with have width bwide. Otherwise + // it's PARTITION_SPLIT. 
+ if (sswide * 2 != bwide || sshigh * 2 != bhigh) return PARTITION_SPLIT; + + if (mi_size_wide[mbmi_below->sb_type] == bwide) return PARTITION_HORZ_A; + if (mi_size_high[mbmi_right->sb_type] == bhigh) return PARTITION_VERT_A; + + return PARTITION_SPLIT; + } + } + const int vert_split = sswide < bwide; + const int horz_split = sshigh < bhigh; + const int split_idx = (vert_split << 1) | horz_split; + assert(split_idx != 0); + + static const PARTITION_TYPE base_partitions[4] = { + PARTITION_INVALID, PARTITION_HORZ, PARTITION_VERT, PARTITION_SPLIT + }; + + return base_partitions[split_idx]; +} + +static INLINE void set_sb_size(SequenceHeader *const seq_params, + BLOCK_SIZE sb_size) { + seq_params->sb_size = sb_size; + seq_params->mib_size = mi_size_wide[seq_params->sb_size]; + seq_params->mib_size_log2 = mi_size_wide_log2[seq_params->sb_size]; +} + +// Returns true if the frame is fully lossless at the coded resolution. +// Note: If super-resolution is used, such a frame will still NOT be lossless at +// the upscaled resolution. +static INLINE int is_coded_lossless(const AV1_COMMON *cm, + const MACROBLOCKD *xd) { + int coded_lossless = 1; + if (cm->seg.enabled) { + for (int i = 0; i < MAX_SEGMENTS; ++i) { + if (!xd->lossless[i]) { + coded_lossless = 0; + break; + } + } + } else { + coded_lossless = xd->lossless[0]; + } + return coded_lossless; +} + +static INLINE int is_valid_seq_level_idx(AV1_LEVEL seq_level_idx) { + return seq_level_idx == SEQ_LEVEL_MAX || + (seq_level_idx < SEQ_LEVELS && + // The following levels are currently undefined. 
+ seq_level_idx != SEQ_LEVEL_2_2 && seq_level_idx != SEQ_LEVEL_2_3 && + seq_level_idx != SEQ_LEVEL_3_2 && seq_level_idx != SEQ_LEVEL_3_3 && + seq_level_idx != SEQ_LEVEL_4_2 && seq_level_idx != SEQ_LEVEL_4_3 && + seq_level_idx != SEQ_LEVEL_7_0 && seq_level_idx != SEQ_LEVEL_7_1 && + seq_level_idx != SEQ_LEVEL_7_2 && seq_level_idx != SEQ_LEVEL_7_3); +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_COMMON_AV1_COMMON_INT_H_ diff --git a/media/libaom/src/av1/common/av1_inv_txfm1d.c b/media/libaom/src/av1/common/av1_inv_txfm1d.c index 7ef2d6d7f..8d69efcd2 100644 --- a/media/libaom/src/av1/common/av1_inv_txfm1d.c +++ b/media/libaom/src/av1/common/av1_inv_txfm1d.c @@ -13,11 +13,8 @@ #include "av1/common/av1_inv_txfm1d.h" #include "av1/common/av1_txfm.h" -// TODO(angiebird): Make 1-d txfm functions static -// - -void av1_idct4_new(const int32_t *input, int32_t *output, int8_t cos_bit, - const int8_t *stage_range) { +void av1_idct4(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { assert(output != input); const int32_t size = 4; const int32_t *cospi = cospi_arr(cos_bit); @@ -57,8 +54,8 @@ void av1_idct4_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[3] = clamp_value(bf0[0] - bf0[3], stage_range[stage]); } -void av1_idct8_new(const int32_t *input, int32_t *output, int8_t cos_bit, - const int8_t *stage_range) { +void av1_idct8(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { assert(output != input); const int32_t size = 8; const int32_t *cospi = cospi_arr(cos_bit); @@ -138,8 +135,8 @@ void av1_idct8_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[7] = clamp_value(bf0[0] - bf0[7], stage_range[stage]); } -void av1_idct16_new(const int32_t *input, int32_t *output, int8_t cos_bit, - const int8_t *stage_range) { +void av1_idct16(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { assert(output != input); const int32_t size = 
16; const int32_t *cospi = cospi_arr(cos_bit); @@ -303,8 +300,8 @@ void av1_idct16_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[15] = clamp_value(bf0[0] - bf0[15], stage_range[stage]); } -void av1_idct32_new(const int32_t *input, int32_t *output, int8_t cos_bit, - const int8_t *stage_range) { +void av1_idct32(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { assert(output != input); const int32_t size = 32; const int32_t *cospi = cospi_arr(cos_bit); @@ -656,8 +653,8 @@ void av1_idct32_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[31] = clamp_value(bf0[0] - bf0[31], stage_range[stage]); } -void av1_iadst4_new(const int32_t *input, int32_t *output, int8_t cos_bit, - const int8_t *stage_range) { +void av1_iadst4(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { int bit = cos_bit; const int32_t *sinpi = sinpi_arr(bit); int32_t s0, s1, s2, s3, s4, s5, s6, s7; @@ -713,8 +710,8 @@ void av1_iadst4_new(const int32_t *input, int32_t *output, int8_t cos_bit, output[3] = round_shift(x3, bit); } -void av1_iadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit, - const int8_t *stage_range) { +void av1_iadst8(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { assert(output != input); const int32_t size = 8; const int32_t *cospi = cospi_arr(cos_bit); @@ -809,7 +806,6 @@ void av1_iadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit, av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 7 - stage++; bf0 = step; bf1 = output; bf1[0] = bf0[0]; @@ -822,8 +818,8 @@ void av1_iadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[7] = -bf0[1]; } -void av1_iadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit, - const int8_t *stage_range) { +void av1_iadst16(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { assert(output != input); const int32_t 
size = 16; const int32_t *cospi = cospi_arr(cos_bit); @@ -1010,7 +1006,6 @@ void av1_iadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit, av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 9 - stage++; bf0 = step; bf1 = output; bf1[0] = bf0[0]; @@ -1064,8 +1059,8 @@ void av1_iidentity32_c(const int32_t *input, int32_t *output, int8_t cos_bit, for (int i = 0; i < 32; ++i) output[i] = (int32_t)((int64_t)input[i] * 4); } -void av1_idct64_new(const int32_t *input, int32_t *output, int8_t cos_bit, - const int8_t *stage_range) { +void av1_idct64(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { assert(output != input); const int32_t size = 64; const int32_t *cospi = cospi_arr(cos_bit); diff --git a/media/libaom/src/av1/common/av1_inv_txfm1d.h b/media/libaom/src/av1/common/av1_inv_txfm1d.h index c31c019aa..e1d5d98d1 100644 --- a/media/libaom/src/av1/common/av1_inv_txfm1d.h +++ b/media/libaom/src/av1/common/av1_inv_txfm1d.h @@ -29,22 +29,22 @@ static INLINE void clamp_buf(int32_t *buf, int32_t size, int8_t bit) { for (int i = 0; i < size; ++i) buf[i] = clamp_value(buf[i], bit); } -void av1_idct4_new(const int32_t *input, int32_t *output, int8_t cos_bit, - const int8_t *stage_range); -void av1_idct8_new(const int32_t *input, int32_t *output, int8_t cos_bit, - const int8_t *stage_range); -void av1_idct16_new(const int32_t *input, int32_t *output, int8_t cos_bit, - const int8_t *stage_range); -void av1_idct32_new(const int32_t *input, int32_t *output, int8_t cos_bit, - const int8_t *stage_range); -void av1_idct64_new(const int32_t *input, int32_t *output, int8_t cos_bit, - const int8_t *stage_range); -void av1_iadst4_new(const int32_t *input, int32_t *output, int8_t cos_bit, - const int8_t *stage_range); -void av1_iadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit, - const int8_t *stage_range); -void av1_iadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit, - const int8_t 
*stage_range); +void av1_idct4(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_idct8(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_idct16(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_idct32(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_idct64(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_iadst4(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_iadst8(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_iadst16(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); void av1_iidentity4_c(const int32_t *input, int32_t *output, int8_t cos_bit, const int8_t *stage_range); void av1_iidentity8_c(const int32_t *input, int32_t *output, int8_t cos_bit, diff --git a/media/libaom/src/av1/common/av1_inv_txfm1d_cfg.h b/media/libaom/src/av1/common/av1_inv_txfm1d_cfg.h index 7d80a0099..47fedbd2a 100644 --- a/media/libaom/src/av1/common/av1_inv_txfm1d_cfg.h +++ b/media/libaom/src/av1/common/av1_inv_txfm1d_cfg.h @@ -36,12 +36,12 @@ static const int8_t inv_start_range[TX_SIZES_ALL] = { 7, // 64x16 transform }; -extern const int8_t *inv_txfm_shift_ls[TX_SIZES_ALL]; +extern const int8_t *av1_inv_txfm_shift_ls[TX_SIZES_ALL]; -// Values in both inv_cos_bit_col and inv_cos_bit_row are always 12 +// Values in both av1_inv_cos_bit_col and av1_inv_cos_bit_row are always 12 // for each valid row and col combination #define INV_COS_BIT 12 -extern const int8_t inv_cos_bit_col[5 /*row*/][5 /*col*/]; -extern const int8_t inv_cos_bit_row[5 /*row*/][5 /*col*/]; +extern const int8_t av1_inv_cos_bit_col[5 /*row*/][5 /*col*/]; +extern const int8_t av1_inv_cos_bit_row[5 /*row*/][5 /*col*/]; #endif // AOM_AV1_COMMON_AV1_INV_TXFM1D_CFG_H_ diff --git 
a/media/libaom/src/av1/common/av1_inv_txfm2d.c b/media/libaom/src/av1/common/av1_inv_txfm2d.c index 4e6944314..559d12129 100644 --- a/media/libaom/src/av1/common/av1_inv_txfm2d.c +++ b/media/libaom/src/av1/common/av1_inv_txfm2d.c @@ -113,14 +113,14 @@ void av1_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8, static INLINE TxfmFunc inv_txfm_type_to_func(TXFM_TYPE txfm_type) { switch (txfm_type) { - case TXFM_TYPE_DCT4: return av1_idct4_new; - case TXFM_TYPE_DCT8: return av1_idct8_new; - case TXFM_TYPE_DCT16: return av1_idct16_new; - case TXFM_TYPE_DCT32: return av1_idct32_new; - case TXFM_TYPE_DCT64: return av1_idct64_new; - case TXFM_TYPE_ADST4: return av1_iadst4_new; - case TXFM_TYPE_ADST8: return av1_iadst8_new; - case TXFM_TYPE_ADST16: return av1_iadst16_new; + case TXFM_TYPE_DCT4: return av1_idct4; + case TXFM_TYPE_DCT8: return av1_idct8; + case TXFM_TYPE_DCT16: return av1_idct16; + case TXFM_TYPE_DCT32: return av1_idct32; + case TXFM_TYPE_DCT64: return av1_idct64; + case TXFM_TYPE_ADST4: return av1_iadst4; + case TXFM_TYPE_ADST8: return av1_iadst8; + case TXFM_TYPE_ADST16: return av1_iadst16; case TXFM_TYPE_IDENTITY4: return av1_iidentity4_c; case TXFM_TYPE_IDENTITY8: return av1_iidentity8_c; case TXFM_TYPE_IDENTITY16: return av1_iidentity16_c; @@ -149,7 +149,7 @@ static const int8_t inv_shift_32x8[2] = { -2, -4 }; static const int8_t inv_shift_16x64[2] = { -2, -4 }; static const int8_t inv_shift_64x16[2] = { -2, -4 }; -const int8_t *inv_txfm_shift_ls[TX_SIZES_ALL] = { +const int8_t *av1_inv_txfm_shift_ls[TX_SIZES_ALL] = { inv_shift_4x4, inv_shift_8x8, inv_shift_16x16, inv_shift_32x32, inv_shift_64x64, inv_shift_4x8, inv_shift_8x4, inv_shift_8x16, inv_shift_16x8, inv_shift_16x32, inv_shift_32x16, inv_shift_32x64, @@ -158,7 +158,7 @@ const int8_t *inv_txfm_shift_ls[TX_SIZES_ALL] = { }; /* clang-format off */ -const int8_t inv_cos_bit_col[MAX_TXWH_IDX] // txw_idx +const int8_t av1_inv_cos_bit_col[MAX_TXWH_IDX] // txw_idx [MAX_TXWH_IDX] = { // txh_idx 
{ INV_COS_BIT, INV_COS_BIT, INV_COS_BIT, 0, 0 }, { INV_COS_BIT, INV_COS_BIT, INV_COS_BIT, INV_COS_BIT, 0 }, @@ -167,7 +167,7 @@ const int8_t inv_cos_bit_col[MAX_TXWH_IDX] // txw_idx { 0, 0, INV_COS_BIT, INV_COS_BIT, INV_COS_BIT } }; -const int8_t inv_cos_bit_row[MAX_TXWH_IDX] // txw_idx +const int8_t av1_inv_cos_bit_row[MAX_TXWH_IDX] // txw_idx [MAX_TXWH_IDX] = { // txh_idx { INV_COS_BIT, INV_COS_BIT, INV_COS_BIT, 0, 0 }, { INV_COS_BIT, INV_COS_BIT, INV_COS_BIT, INV_COS_BIT, 0 }, @@ -177,23 +177,22 @@ const int8_t inv_cos_bit_row[MAX_TXWH_IDX] // txw_idx }; /* clang-format on */ -const int8_t iadst4_range[7] = { 0, 1, 0, 0, 0, 0, 0 }; +static const int8_t iadst4_range[7] = { 0, 1, 0, 0, 0, 0, 0 }; void av1_get_inv_txfm_cfg(TX_TYPE tx_type, TX_SIZE tx_size, TXFM_2D_FLIP_CFG *cfg) { assert(cfg != NULL); cfg->tx_size = tx_size; - set_flip_cfg(tx_type, cfg); av1_zero(cfg->stage_range_col); av1_zero(cfg->stage_range_row); set_flip_cfg(tx_type, cfg); const TX_TYPE_1D tx_type_1d_col = vtx_tab[tx_type]; const TX_TYPE_1D tx_type_1d_row = htx_tab[tx_type]; - cfg->shift = inv_txfm_shift_ls[tx_size]; + cfg->shift = av1_inv_txfm_shift_ls[tx_size]; const int txw_idx = get_txw_idx(tx_size); const int txh_idx = get_txh_idx(tx_size); - cfg->cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx]; - cfg->cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx]; + cfg->cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx]; + cfg->cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx]; cfg->txfm_type_col = av1_txfm_type_ls[txh_idx][tx_type_1d_col]; if (cfg->txfm_type_col == TXFM_TYPE_ADST4) { memcpy(cfg->stage_range_col, iadst4_range, sizeof(iadst4_range)); @@ -229,7 +228,7 @@ void av1_gen_inv_stage_range(int8_t *stage_range_col, int8_t *stage_range_row, (void)real_range_row; if (cfg->txfm_type_row == TXFM_TYPE_ADST4 && i == 1) { // the adst4 may use 1 extra bit on top of opt_range_row at stage 1 - // so opt_range_col >= real_range_col will not hold + // so opt_range_row >= real_range_row will not hold 
stage_range_row[i] = opt_range_row; } else { assert(opt_range_row >= real_range_row); @@ -242,7 +241,7 @@ void av1_gen_inv_stage_range(int8_t *stage_range_col, int8_t *stage_range_row, cfg->stage_range_col[i] + fwd_shift + shift[0] + bd + 1; (void)real_range_col; if (cfg->txfm_type_col == TXFM_TYPE_ADST4 && i == 1) { - // the adst4 may use 1 extra bit on top of opt_range_row at stage 1 + // the adst4 may use 1 extra bit on top of opt_range_col at stage 1 // so opt_range_col >= real_range_col will not hold stage_range_col[i] = opt_range_col; } else { diff --git a/media/libaom/src/av1/common/av1_loopfilter.c b/media/libaom/src/av1/common/av1_loopfilter.c index 537d8dfe9..c756760de 100644 --- a/media/libaom/src/av1/common/av1_loopfilter.c +++ b/media/libaom/src/av1/common/av1_loopfilter.c @@ -17,8 +17,8 @@ #include "aom_dsp/aom_dsp_common.h" #include "aom_mem/aom_mem.h" #include "aom_ports/mem.h" +#include "av1/common/av1_common_int.h" #include "av1/common/av1_loopfilter.h" -#include "av1/common/onyxc_int.h" #include "av1/common/reconinter.h" #include "av1/common/seg_common.h" @@ -28,11 +28,9 @@ static const SEG_LVL_FEATURES seg_lvl_lf_lut[MAX_MB_PLANE][2] = { { SEG_LVL_ALT_LF_V, SEG_LVL_ALT_LF_V } }; -static const int delta_lf_id_lut[MAX_MB_PLANE][2] = { - { 0, 1 }, { 2, 2 }, { 3, 3 } -}; - -typedef enum EDGE_DIR { VERT_EDGE = 0, HORZ_EDGE = 1, NUM_EDGE_DIRS } EDGE_DIR; +static const int delta_lf_id_lut[MAX_MB_PLANE][2] = { { 0, 1 }, + { 2, 2 }, + { 3, 3 } }; static const int mode_lf_lut[] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // INTRA_MODES @@ -40,393 +38,6 @@ static const int mode_lf_lut[] = { 1, 1, 1, 1, 1, 1, 0, 1 // INTER_COMPOUND_MODES (GLOBAL_GLOBALMV == 0) }; -#if LOOP_FILTER_BITMASK -// 256 bit masks (64x64 / 4x4) for left transform size for Y plane. -// We use 4 uint64_t to represent the 256 bit. -// Each 1 represents a position where we should apply a loop filter -// across the left border of an 4x4 block boundary. 
-// -// In the case of TX_8x8-> ( in low order byte first we end up with -// a mask that looks like this (-- and | are used for better view) -// -// 10101010|10101010 -// 10101010|10101010 -// 10101010|10101010 -// 10101010|10101010 -// 10101010|10101010 -// 10101010|10101010 -// 10101010|10101010 -// 10101010|10101010 -// ----------------- -// 10101010|10101010 -// 10101010|10101010 -// 10101010|10101010 -// 10101010|10101010 -// 10101010|10101010 -// 10101010|10101010 -// 10101010|10101010 -// 10101010|10101010 -// -// A loopfilter should be applied to every other 4x4 horizontally. - -// 256 bit masks (64x64 / 4x4) for above transform size for Y plane. -// We use 4 uint64_t to represent the 256 bit. -// Each 1 represents a position where we should apply a loop filter -// across the top border of an 4x4 block boundary. -// -// In the case of TX_8x8-> ( in low order byte first we end up with -// a mask that looks like this -// -// 11111111|11111111 -// 00000000|00000000 -// 11111111|11111111 -// 00000000|00000000 -// 11111111|11111111 -// 00000000|00000000 -// 11111111|11111111 -// 00000000|00000000 -// ----------------- -// 11111111|11111111 -// 00000000|00000000 -// 11111111|11111111 -// 00000000|00000000 -// 11111111|11111111 -// 00000000|00000000 -// 11111111|11111111 -// 00000000|00000000 -// -// A loopfilter should be applied to every other 4x4 horizontally. 
- -const int mask_id_table_tx_4x4[BLOCK_SIZES_ALL] = { - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, -1, -1, -1, 13, 14, 15, 16, 17, 18 -}; - -const int mask_id_table_tx_8x8[BLOCK_SIZES_ALL] = { - -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, 10, 11, 12, 13 -}; - -const int mask_id_table_tx_16x16[BLOCK_SIZES_ALL] = { - -1, -1, -1, -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, -1, -1, -1, -1, -1, -1, -1, 7, 8 -}; - -const int mask_id_table_tx_32x32[BLOCK_SIZES_ALL] = { -1, -1, -1, -1, -1, -1, - -1, -1, -1, 0, 1, 2, - 3, -1, -1, -1, -1, -1, - -1, -1, -1, -1 }; - -const FilterMask left_mask_univariant_reordered[67] = { - // TX_4X4 - { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 4X4, TX_4X4 - { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 4X8, TX_4X4 - { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 8X4, TX_4X4 - { { 0x0000000000030003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 8X8, TX_4X4 - { { 0x0003000300030003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 8X16, TX_4X4 - { { 0x00000000000f000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 16X8, TX_4X4 - { { 0x000f000f000f000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 16X16, TX_4X4 - { { 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 16X32, TX_4X4 - { { 0x00ff00ff00ff00ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 32X16, TX_4X4 - { { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 32X32, TX_4X4 - { { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, 
0x00ff00ff00ff00ffULL, - 0x00ff00ff00ff00ffULL } }, // block size 32X64, TX_4X4 - { { 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 64X32, TX_4X4 - { { 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL, - 0xffffffffffffffffULL } }, // block size 64X64, TX_4X4 - { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 4X16, TX_4X4 - { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 16X4, TX_4X4 - { { 0x0003000300030003ULL, 0x0003000300030003ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 8X32, TX_4X4 - { { 0x0000000000ff00ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 32X8, TX_4X4 - { { 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x000f000f000f000fULL, - 0x000f000f000f000fULL } }, // block size 16X64, TX_4X4 - { { 0xffffffffffffffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 64X16, TX_4X4 - // TX_8X8 - { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 8X8, TX_8X8 - { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 8X16, TX_8X8 - { { 0x0000000000050005ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 16X8, TX_8X8 - { { 0x0005000500050005ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 16X16, TX_8X8 - { { 0x0005000500050005ULL, 0x0005000500050005ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 16X32, TX_8X8 - { { 0x0055005500550055ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 32X16, TX_8X8 - { { 0x0055005500550055ULL, 0x0055005500550055ULL, 
0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 32X32, TX_8X8 - { { 0x0055005500550055ULL, 0x0055005500550055ULL, 0x0055005500550055ULL, - 0x0055005500550055ULL } }, // block size 32X64, TX_8X8 - { { 0x5555555555555555ULL, 0x5555555555555555ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 64X32, TX_8X8 - { { 0x5555555555555555ULL, 0x5555555555555555ULL, 0x5555555555555555ULL, - 0x5555555555555555ULL } }, // block size 64X64, TX_8X8 - { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 8X32, TX_8X8 - { { 0x0000000000550055ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 32X8, TX_8X8 - { { 0x0005000500050005ULL, 0x0005000500050005ULL, 0x0005000500050005ULL, - 0x0005000500050005ULL } }, // block size 16X64, TX_8X8 - { { 0x5555555555555555ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 64X16, TX_8X8 - // TX_16X16 - { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 16X16, TX_16X16 - { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 16X32, TX_16X16 - { { 0x0011001100110011ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 32X16, TX_16X16 - { { 0x0011001100110011ULL, 0x0011001100110011ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 32X32, TX_16X16 - { { 0x0011001100110011ULL, 0x0011001100110011ULL, 0x0011001100110011ULL, - 0x0011001100110011ULL } }, // block size 32X64, TX_16X16 - { { 0x1111111111111111ULL, 0x1111111111111111ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 64X32, TX_16X16 - { { 0x1111111111111111ULL, 0x1111111111111111ULL, 0x1111111111111111ULL, - 0x1111111111111111ULL } }, // block size 64X64, TX_16X16 - { { 0x0001000100010001ULL, 
0x0001000100010001ULL, 0x0001000100010001ULL, - 0x0001000100010001ULL } }, // block size 16X64, TX_16X16 - { { 0x1111111111111111ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 64X16, TX_16X16 - // TX_32X32 - { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 32X32, TX_32X32 - { { 0x0101010101010101ULL, 0x0101010101010101ULL, 0x0101010101010101ULL, - 0x0101010101010101ULL } }, // block size 32X64, TX_32X32 - { { 0x0101010101010101ULL, 0x0101010101010101ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 64X32, TX_32X32 - { { 0x0101010101010101ULL, 0x0101010101010101ULL, 0x0101010101010101ULL, - 0x0101010101010101ULL } }, // block size 64X64, TX_32X32 - // TX_64X64 - { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL, - 0x0001000100010001ULL } }, // block size 64X64, TX_64X64 - // 2:1, 1:2 transform sizes. - { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 4X8, TX_4X8 - { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 4X16, TX_4X8 - { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 8X4, TX_8X4 - { { 0x0000000000000005ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 16X4, TX_8X4 - { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 8X16, TX_8X16 - { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 8X32, TX_8X16 - { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 16X8, TX_16X8 - { { 0x0000000000110011ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block 
size 32X8, TX_16X8 - { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 16X32, TX_16X32 - { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL, - 0x0001000100010001ULL } }, // block size 16X64, TX_16X32 - { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 32X16, TX_32X16 - { { 0x0101010101010101ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 64X16, TX_32X16 - { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL, - 0x0001000100010001ULL } }, // block size 32X64, TX_32X64 - { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 64X32, TX_64X32 - // 4:1, 1:4 transform sizes. - { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 4X16, TX_4X16 - { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 16X4, TX_16X4 - { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 8X32, TX_8X32 - { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 32X8, TX_32X8 - { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL, - 0x0001000100010001ULL } }, // block size 16X64, TX_16X64 - { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 64X16, TX_64X16 -}; - -const FilterMask above_mask_univariant_reordered[67] = { - // TX_4X4 - { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 4X4, TX_4X4 - { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 4X8, TX_4X4 - { { 
0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 8X4, TX_4X4 - { { 0x0000000000030003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 8X8, TX_4X4 - { { 0x0003000300030003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 8X16, TX_4X4 - { { 0x00000000000f000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 16X8, TX_4X4 - { { 0x000f000f000f000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 16X16, TX_4X4 - { { 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 16X32, TX_4X4 - { { 0x00ff00ff00ff00ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 32X16, TX_4X4 - { { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 32X32, TX_4X4 - { { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, - 0x00ff00ff00ff00ffULL } }, // block size 32X64, TX_4X4 - { { 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 64X32, TX_4X4 - { { 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL, - 0xffffffffffffffffULL } }, // block size 64X64, TX_4x4 - { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 4X16, TX_4X4 - { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 16X4, TX_4X4 - { { 0x0003000300030003ULL, 0x0003000300030003ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 8X32, TX_4X4 - { { 0x0000000000ff00ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 32X8, TX_4X4 - { { 0x000f000f000f000fULL, 
0x000f000f000f000fULL, 0x000f000f000f000fULL, - 0x000f000f000f000fULL } }, // block size 16X64, TX_4X4 - { { 0xffffffffffffffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 64X16, TX_4X4 - // TX_8X8 - { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 8X8, TX_8X8 - { { 0x0000000300000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 8X16, TX_8X8 - { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 16X8, TX_8X8 - { { 0x0000000f0000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 16X16, TX_8X8 - { { 0x0000000f0000000fULL, 0x0000000f0000000fULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 16X32, TX_8X8 - { { 0x000000ff000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 32X16, TX_8X8 - { { 0x000000ff000000ffULL, 0x000000ff000000ffULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 32X32, TX_8X8 - { { 0x000000ff000000ffULL, 0x000000ff000000ffULL, 0x000000ff000000ffULL, - 0x000000ff000000ffULL } }, // block size 32X64, TX_8X8 - { { 0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 64X32, TX_8X8 - { { 0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL, - 0x0000ffff0000ffffULL } }, // block size 64X64, TX_8X8 - { { 0x0000000300000003ULL, 0x0000000300000003ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 8X32, TX_8X8 - { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 32X8, TX_8X8 - { { 0x0000000f0000000fULL, 0x0000000f0000000fULL, 0x0000000f0000000fULL, - 0x0000000f0000000fULL } }, // block size 16X64, TX_8X8 - { { 0x0000ffff0000ffffULL, 
0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 64X16, TX_8X8 - // TX_16X16 - { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 16X16, TX_16X16 - { { 0x000000000000000fULL, 0x000000000000000fULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 16X32, TX_16X16 - { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 32X16, TX_16X16 - { { 0x00000000000000ffULL, 0x00000000000000ffULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 32X32, TX_16X16 - { { 0x00000000000000ffULL, 0x00000000000000ffULL, 0x00000000000000ffULL, - 0x00000000000000ffULL } }, // block size 32X64, TX_16X16 - { { 0x000000000000ffffULL, 0x000000000000ffffULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 64X32, TX_16X16 - { { 0x000000000000ffffULL, 0x000000000000ffffULL, 0x000000000000ffffULL, - 0x000000000000ffffULL } }, // block size 64X64, TX_16X16 - { { 0x000000000000000fULL, 0x000000000000000fULL, 0x000000000000000fULL, - 0x000000000000000fULL } }, // block size 16X64, TX_16X16 - { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 64X16, TX_16X16 - // TX_32X32 - { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 32X32, TX_32X32 - { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x00000000000000ffULL, - 0x0000000000000000ULL } }, // block size 32X64, TX_32X32 - { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 64X32, TX_32X32 - { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x000000000000ffffULL, - 0x0000000000000000ULL } }, // block size 64X64, TX_32X32 - // TX_64X64 - { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block 
size 64X64, TX_64X64 - // 2:1, 1:2 transform sizes. - { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 4X8, TX_4X8 - { { 0x0000000100000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 4X16, TX_4X8 - { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 8X4, TX_8X4 - { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 16X4, TX_8X4 - { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 8X16, TX_8X16 - { { 0x0000000000000003ULL, 0x0000000000000003ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 8X32, TX_8X16 - { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 16X8, TX_16X8 - { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 32X8, TX_16X8 - { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 16X32, TX_16X32 - { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x000000000000000fULL, - 0x0000000000000000ULL } }, // block size 16X64, TX_16X32 - { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 32X16, TX_32X16 - { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 64X16, TX_32X16 - { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 32X64, TX_32X64 - { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 64X32, TX_64X32 - // 4:1, 1:4 transform sizes. 
- { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 4X16, TX_4X16 - { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 16X4, TX_16X4 - { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 8X32, TX_8X32 - { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 32X8, TX_32X8 - { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 16X64, TX_16X64 - { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 64X16, TX_64X16 -}; - -LoopFilterMask *get_loop_filter_mask(const AV1_COMMON *const cm, int mi_row, - int mi_col) { - assert(cm->lf.lfm != NULL); - const int row = mi_row >> MIN_MIB_SIZE_LOG2; // 64x64 - const int col = mi_col >> MIN_MIB_SIZE_LOG2; - return &cm->lf.lfm[row * cm->lf.lfm_stride + col]; -} - -typedef void (*LpfFunc)(uint8_t *s, int p, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh); - -typedef void (*LpfDualFunc)(uint8_t *s, int p, const uint8_t *blimit0, - const uint8_t *limit0, const uint8_t *thresh0, - const uint8_t *blimit1, const uint8_t *limit1, - const uint8_t *thresh1); - -typedef void (*HbdLpfFunc)(uint16_t *s, int p, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh, int bd); - -typedef void (*HbdLpfDualFunc)(uint16_t *s, int p, const uint8_t *blimit0, - const uint8_t *limit0, const uint8_t *thresh0, - const uint8_t *blimit1, const uint8_t *limit1, - const uint8_t *thresh1, int bd); -#endif // LOOP_FILTER_BITMASK - static void update_sharpness(loop_filter_info_n *lfi, int sharpness_lvl) { int lvl; @@ -448,13 +59,13 @@ static void update_sharpness(loop_filter_info_n *lfi, int sharpness_lvl) { } } -uint8_t get_filter_level(const 
AV1_COMMON *cm, const loop_filter_info_n *lfi_n, - const int dir_idx, int plane, - const MB_MODE_INFO *mbmi) { +uint8_t av1_get_filter_level(const AV1_COMMON *cm, + const loop_filter_info_n *lfi_n, const int dir_idx, + int plane, const MB_MODE_INFO *mbmi) { const int segment_id = mbmi->segment_id; - if (cm->delta_lf_present_flag) { - int delta_lf; - if (cm->delta_lf_multi) { + if (cm->delta_q_info.delta_lf_present_flag) { + int8_t delta_lf; + if (cm->delta_q_info.delta_lf_multi) { const int delta_lf_idx = delta_lf_id_lut[plane][dir_idx]; delta_lf = mbmi->delta_lf[delta_lf_idx]; } else { @@ -531,6 +142,9 @@ void av1_loop_filter_frame_init(AV1_COMMON *cm, int plane_start, filt_lvl_r[1] = cm->lf.filter_level_u; filt_lvl_r[2] = cm->lf.filter_level_v; + assert(plane_start >= AOM_PLANE_Y); + assert(plane_end <= MAX_MB_PLANE); + for (plane = plane_start; plane < plane_end; plane++) { if (plane == 0 && !filt_lvl[0] && !filt_lvl_r[0]) break; @@ -542,7 +156,6 @@ void av1_loop_filter_frame_init(AV1_COMMON *cm, int plane_start, for (seg_id = 0; seg_id < MAX_SEGMENTS; seg_id++) { for (int dir = 0; dir < 2; ++dir) { int lvl_seg = (dir == 0) ? filt_lvl[plane] : filt_lvl_r[plane]; - assert(plane >= 0 && plane <= 2); const int seg_lf_feature_id = seg_lvl_lf_lut[plane][dir]; if (segfeature_active(seg, seg_id, seg_lf_feature_id)) { const int data = get_segdata(&cm->seg, seg_id, seg_lf_feature_id); @@ -575,1321 +188,6 @@ void av1_loop_filter_frame_init(AV1_COMMON *cm, int plane_start, } } -#if LOOP_FILTER_BITMASK -// A 64x64 tx block requires 256 bits to represent each 4x4 tx block. -// Every 4 rows is represented by one uint64_t mask. Hence, -// there are 4 uint64_t bitmask[4] to represent the 64x64 block. -// -// Given a location by (mi_col, mi_row), This function returns the index -// 0, 1, 2, 3 to select which bitmask[] to use, and the shift value. -// -// For example, mi_row is the offset of pixels in mi size (4), -// (mi_row / 4) returns which uint64_t. 
-// After locating which uint64_t, mi_row % 4 is the -// row offset, and each row has 16 = 1 << stride_log2 4x4 units. -// Therefore, shift = (row << stride_log2) + mi_col; -int get_index_shift(int mi_col, int mi_row, int *index) { - // *index = mi_row >> 2; - // rows = mi_row % 4; - // stride_log2 = 4; - // shift = (rows << stride_log2) + mi_col; - *index = mi_row >> 2; - return ((mi_row & 3) << 4) | mi_col; -} - -static void check_mask(const FilterMask *lfm) { -#ifndef NDEBUG - for (int i = 0; i < 4; ++i) { - assert(!(lfm[TX_4X4].bits[i] & lfm[TX_8X8].bits[i])); - assert(!(lfm[TX_4X4].bits[i] & lfm[TX_16X16].bits[i])); - assert(!(lfm[TX_4X4].bits[i] & lfm[TX_32X32].bits[i])); - assert(!(lfm[TX_4X4].bits[i] & lfm[TX_64X64].bits[i])); - assert(!(lfm[TX_8X8].bits[i] & lfm[TX_16X16].bits[i])); - assert(!(lfm[TX_8X8].bits[i] & lfm[TX_32X32].bits[i])); - assert(!(lfm[TX_8X8].bits[i] & lfm[TX_64X64].bits[i])); - assert(!(lfm[TX_16X16].bits[i] & lfm[TX_32X32].bits[i])); - assert(!(lfm[TX_16X16].bits[i] & lfm[TX_64X64].bits[i])); - assert(!(lfm[TX_32X32].bits[i] & lfm[TX_64X64].bits[i])); - } -#else - (void)lfm; -#endif -} - -static void check_loop_filter_masks(const LoopFilterMask *lfm, int plane) { - if (plane == 0) { - // Assert if we try to apply 2 different loop filters at the same - // position. 
- check_mask(lfm->left_y); - check_mask(lfm->above_y); - } else if (plane == 1) { - check_mask(lfm->left_u); - check_mask(lfm->above_u); - } else { - check_mask(lfm->left_v); - check_mask(lfm->above_v); - } -} - -static void update_masks(EDGE_DIR dir, int plane, uint64_t *mask, - TX_SIZE sqr_tx_size, LoopFilterMask *lfm) { - if (dir == VERT_EDGE) { - switch (plane) { - case 0: - for (int i = 0; i < 4; ++i) lfm->left_y[sqr_tx_size].bits[i] |= mask[i]; - break; - case 1: - for (int i = 0; i < 4; ++i) lfm->left_u[sqr_tx_size].bits[i] |= mask[i]; - break; - case 2: - for (int i = 0; i < 4; ++i) lfm->left_v[sqr_tx_size].bits[i] |= mask[i]; - break; - default: assert(plane <= 2); - } - } else { - switch (plane) { - case 0: - for (int i = 0; i < 4; ++i) - lfm->above_y[sqr_tx_size].bits[i] |= mask[i]; - break; - case 1: - for (int i = 0; i < 4; ++i) - lfm->above_u[sqr_tx_size].bits[i] |= mask[i]; - break; - case 2: - for (int i = 0; i < 4; ++i) - lfm->above_v[sqr_tx_size].bits[i] |= mask[i]; - break; - default: assert(plane <= 2); - } - } -} - -static int is_frame_boundary(AV1_COMMON *const cm, int plane, int mi_row, - int mi_col, int ssx, int ssy, EDGE_DIR dir) { - if (plane && (ssx || ssy)) { - if (ssx && ssy) { // format 420 - if ((mi_row << MI_SIZE_LOG2) > cm->height || - (mi_col << MI_SIZE_LOG2) > cm->width) - return 1; - } else if (ssx) { // format 422 - if ((mi_row << MI_SIZE_LOG2) >= cm->height || - (mi_col << MI_SIZE_LOG2) > cm->width) - return 1; - } - } else { - if ((mi_row << MI_SIZE_LOG2) >= cm->height || - (mi_col << MI_SIZE_LOG2) >= cm->width) - return 1; - } - - int row_or_col; - if (plane == 0) { - row_or_col = dir == VERT_EDGE ? mi_col : mi_row; - } else { - // chroma sub8x8 block uses bottom/right mi of co-located 8x8 luma block. - // So if mi_col == 1, it is actually the frame boundary. - if (dir == VERT_EDGE) { - row_or_col = ssx ? (mi_col & 0x0FFFFFFE) : mi_col; - } else { - row_or_col = ssy ? 
(mi_row & 0x0FFFFFFE) : mi_row; - } - } - return row_or_col == 0; -} - -static void setup_masks(AV1_COMMON *const cm, int mi_row, int mi_col, int plane, - int ssx, int ssy, TX_SIZE tx_size) { - LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col); - const int x = (mi_col << (MI_SIZE_LOG2 - ssx)); - const int y = (mi_row << (MI_SIZE_LOG2 - ssy)); - // decide whether current vertical/horizontal edge needs loop filtering - for (EDGE_DIR dir = VERT_EDGE; dir <= HORZ_EDGE; ++dir) { - // chroma sub8x8 block uses bottom/right mi of co-located 8x8 luma block. - mi_row |= ssy; - mi_col |= ssx; - - MB_MODE_INFO **mi = cm->mi_grid_visible + mi_row * cm->mi_stride + mi_col; - const MB_MODE_INFO *const mbmi = mi[0]; - const int curr_skip = mbmi->skip && is_inter_block(mbmi); - const BLOCK_SIZE bsize = mbmi->sb_type; - const BLOCK_SIZE bsizec = scale_chroma_bsize(bsize, ssx, ssy); - const BLOCK_SIZE plane_bsize = ss_size_lookup[bsizec][ssx][ssy]; - const uint8_t level = get_filter_level(cm, &cm->lf_info, dir, plane, mbmi); - const int prediction_masks = dir == VERT_EDGE - ? block_size_wide[plane_bsize] - 1 - : block_size_high[plane_bsize] - 1; - const int is_coding_block_border = - dir == VERT_EDGE ? !(x & prediction_masks) : !(y & prediction_masks); - - // TODO(chengchen): step can be optimized. - const int row_step = mi_size_high[TX_4X4] << ssy; - const int col_step = mi_size_wide[TX_4X4] << ssx; - const int mi_height = - dir == VERT_EDGE ? tx_size_high_unit[tx_size] << ssy : row_step; - const int mi_width = - dir == VERT_EDGE ? col_step : tx_size_wide_unit[tx_size] << ssx; - - // assign filter levels - for (int r = mi_row; r < mi_row + mi_height; r += row_step) { - for (int c = mi_col; c < mi_col + mi_width; c += col_step) { - // do not filter frame boundary - // Note: when chroma planes' size are half of luma plane, - // chroma plane mi corresponds to even position. - // If frame size is not even, we still need to filter this chroma - // position. 
Therefore the boundary condition check needs to be - // separated to two cases. - if (plane && (ssx || ssy)) { - if (ssx && ssy) { // format 420 - if ((r << MI_SIZE_LOG2) > cm->height || - (c << MI_SIZE_LOG2) > cm->width) - continue; - } else if (ssx) { // format 422 - if ((r << MI_SIZE_LOG2) >= cm->height || - (c << MI_SIZE_LOG2) > cm->width) - continue; - } - } else { - if ((r << MI_SIZE_LOG2) >= cm->height || - (c << MI_SIZE_LOG2) >= cm->width) - continue; - } - - const int row = r % MI_SIZE_64X64; - const int col = c % MI_SIZE_64X64; - if (plane == 0) { - if (dir == VERT_EDGE) - lfm->lfl_y_ver[row][col] = level; - else - lfm->lfl_y_hor[row][col] = level; - } else if (plane == 1) { - lfm->lfl_u[row][col] = level; - } else { - lfm->lfl_v[row][col] = level; - } - } - } - - for (int r = mi_row; r < mi_row + mi_height; r += row_step) { - for (int c = mi_col; c < mi_col + mi_width; c += col_step) { - // do not filter frame boundary - if (is_frame_boundary(cm, plane, r, c, ssx, ssy, dir)) continue; - - uint64_t mask[4] = { 0 }; - const int prev_row = dir == VERT_EDGE ? r : r - (1 << ssy); - const int prev_col = dir == VERT_EDGE ? c - (1 << ssx) : c; - MB_MODE_INFO **mi_prev = - cm->mi_grid_visible + prev_row * cm->mi_stride + prev_col; - const MB_MODE_INFO *const mbmi_prev = mi_prev[0]; - const int prev_skip = mbmi_prev->skip && is_inter_block(mbmi_prev); - const uint8_t level_prev = - get_filter_level(cm, &cm->lf_info, dir, plane, mbmi_prev); - const int is_edge = - (level || level_prev) && - (!curr_skip || !prev_skip || is_coding_block_border); - - if (is_edge) { - const TX_SIZE prev_tx_size = - plane ? av1_get_max_uv_txsize(mbmi_prev->sb_type, ssx, ssy) - : mbmi_prev->tx_size; - TX_SIZE min_tx_size = (dir == VERT_EDGE) - ? 
AOMMIN(txsize_horz_map[tx_size], - txsize_horz_map[prev_tx_size]) - : AOMMIN(txsize_vert_map[tx_size], - txsize_vert_map[prev_tx_size]); - min_tx_size = AOMMIN(min_tx_size, TX_16X16); - assert(min_tx_size < TX_SIZES); - const int row = r % MI_SIZE_64X64; - const int col = c % MI_SIZE_64X64; - int index = 0; - const int shift = get_index_shift(col, row, &index); - assert(index < 4 && index >= 0); - mask[index] |= ((uint64_t)1 << shift); - // set mask on corresponding bit - update_masks(dir, plane, mask, min_tx_size, lfm); - } - } - } - } -} - -static void setup_tx_block_mask(AV1_COMMON *const cm, int mi_row, int mi_col, - int blk_row, int blk_col, - BLOCK_SIZE plane_bsize, TX_SIZE tx_size, - int plane, int ssx, int ssy) { - blk_row <<= ssy; - blk_col <<= ssx; - if (((mi_row + blk_row) << MI_SIZE_LOG2) >= cm->height || - ((mi_col + blk_col) << MI_SIZE_LOG2) >= cm->width) - return; - - // U/V plane, tx_size is always the largest size - if (plane) { - assert(tx_size_wide[tx_size] <= 32 && tx_size_high[tx_size] <= 32); - setup_masks(cm, mi_row + blk_row, mi_col + blk_col, plane, ssx, ssy, - tx_size); - return; - } - - MB_MODE_INFO **mi = cm->mi_grid_visible + mi_row * cm->mi_stride + mi_col; - const MB_MODE_INFO *const mbmi = mi[0]; - // For Y plane: - // If intra block, tx size is univariant. - // If inter block, tx size follows inter_tx_size. - TX_SIZE plane_tx_size = tx_size; - const int is_inter = is_inter_block(mbmi); - - if (plane == 0) { - if (is_inter) { - if (mbmi->skip) { - // TODO(chengchen): change av1_get_transform_size() to be consistant. 
- // plane_tx_size = get_max_rect_tx_size(plane_bsize); - plane_tx_size = mbmi->tx_size; - } else { - plane_tx_size = mbmi->inter_tx_size[av1_get_txb_size_index( - plane_bsize, blk_row, blk_col)]; - } - } else { - MB_MODE_INFO **mi_this = cm->mi_grid_visible + - (mi_row + blk_row) * cm->mi_stride + mi_col + - blk_col; - const MB_MODE_INFO *const mbmi_this = mi_this[0]; - plane_tx_size = mbmi_this->tx_size; - } - } - - assert(txsize_to_bsize[plane_tx_size] <= plane_bsize); - - if (plane || plane_tx_size == tx_size) { - setup_masks(cm, mi_row + blk_row, mi_col + blk_col, plane, ssx, ssy, - tx_size); - } else { - const TX_SIZE sub_txs = sub_tx_size_map[tx_size]; - const int bsw = tx_size_wide_unit[sub_txs]; - const int bsh = tx_size_high_unit[sub_txs]; - for (int row = 0; row < tx_size_high_unit[tx_size]; row += bsh) { - for (int col = 0; col < tx_size_wide_unit[tx_size]; col += bsw) { - const int offsetr = blk_row + row; - const int offsetc = blk_col + col; - setup_tx_block_mask(cm, mi_row, mi_col, offsetr, offsetc, plane_bsize, - sub_txs, plane, ssx, ssy); - } - } - } -} - -static void setup_fix_block_mask(AV1_COMMON *const cm, int mi_row, int mi_col, - int plane, int ssx, int ssy) { - MB_MODE_INFO **mi = - cm->mi_grid_visible + (mi_row | ssy) * cm->mi_stride + (mi_col | ssx); - const MB_MODE_INFO *const mbmi = mi[0]; - - const BLOCK_SIZE bsize = mbmi->sb_type; - const BLOCK_SIZE bsizec = scale_chroma_bsize(bsize, ssx, ssy); - const BLOCK_SIZE plane_bsize = ss_size_lookup[bsizec][ssx][ssy]; - - const int block_width = mi_size_wide[plane_bsize]; - const int block_height = mi_size_high[plane_bsize]; - - TX_SIZE max_txsize = max_txsize_rect_lookup[plane_bsize]; - // The decoder is designed so that it can process 64x64 luma pixels at a - // time. If this is a chroma plane with subsampling and bsize corresponds to - // a subsampled BLOCK_128X128 then the lookup above will give TX_64X64. 
That - // mustn't be used for the subsampled plane (because it would be bigger than - // a 64x64 luma block) so we round down to TX_32X32. - if (plane && txsize_sqr_up_map[max_txsize] == TX_64X64) { - if (max_txsize == TX_16X64) - max_txsize = TX_16X32; - else if (max_txsize == TX_64X16) - max_txsize = TX_32X16; - else - max_txsize = TX_32X32; - } - - const BLOCK_SIZE txb_size = txsize_to_bsize[max_txsize]; - const int bw = block_size_wide[txb_size] >> tx_size_wide_log2[0]; - const int bh = block_size_high[txb_size] >> tx_size_wide_log2[0]; - const BLOCK_SIZE max_unit_bsize = ss_size_lookup[BLOCK_64X64][ssx][ssy]; - int mu_blocks_wide = block_size_wide[max_unit_bsize] >> tx_size_wide_log2[0]; - int mu_blocks_high = block_size_high[max_unit_bsize] >> tx_size_high_log2[0]; - - mu_blocks_wide = AOMMIN(block_width, mu_blocks_wide); - mu_blocks_high = AOMMIN(block_height, mu_blocks_high); - - // Y: Largest tx_size is 64x64, while superblock size can be 128x128. - // Here we ensure that setup_tx_block_mask process at most a 64x64 block. - // U/V: largest tx size is 32x32. 
- for (int idy = 0; idy < block_height; idy += mu_blocks_high) { - for (int idx = 0; idx < block_width; idx += mu_blocks_wide) { - const int unit_height = AOMMIN(mu_blocks_high + idy, block_height); - const int unit_width = AOMMIN(mu_blocks_wide + idx, block_width); - for (int blk_row = idy; blk_row < unit_height; blk_row += bh) { - for (int blk_col = idx; blk_col < unit_width; blk_col += bw) { - setup_tx_block_mask(cm, mi_row, mi_col, blk_row, blk_col, plane_bsize, - max_txsize, plane, ssx, ssy); - } - } - } - } -} - -static void setup_block_mask(AV1_COMMON *const cm, int mi_row, int mi_col, - BLOCK_SIZE bsize, int plane, int ssx, int ssy) { - if ((mi_row << MI_SIZE_LOG2) >= cm->height || - (mi_col << MI_SIZE_LOG2) >= cm->width) - return; - - const PARTITION_TYPE partition = get_partition(cm, mi_row, mi_col, bsize); - const BLOCK_SIZE subsize = get_partition_subsize(bsize, partition); - const int hbs = mi_size_wide[bsize] / 2; - const int quarter_step = mi_size_wide[bsize] / 4; - const int allow_sub8x8 = (ssx || ssy) ? 
bsize > BLOCK_8X8 : 1; - const int has_next_row = - (((mi_row + hbs) << MI_SIZE_LOG2) < cm->height) & allow_sub8x8; - const int has_next_col = - (((mi_col + hbs) << MI_SIZE_LOG2) < cm->width) & allow_sub8x8; - int i; - - switch (partition) { - case PARTITION_NONE: - setup_fix_block_mask(cm, mi_row, mi_col, plane, ssx, ssy); - break; - case PARTITION_HORZ: - setup_fix_block_mask(cm, mi_row, mi_col, plane, ssx, ssy); - if (has_next_row) - setup_fix_block_mask(cm, mi_row + hbs, mi_col, plane, ssx, ssy); - break; - case PARTITION_VERT: - setup_fix_block_mask(cm, mi_row, mi_col, plane, ssx, ssy); - if (has_next_col) - setup_fix_block_mask(cm, mi_row, mi_col + hbs, plane, ssx, ssy); - break; - case PARTITION_SPLIT: - setup_block_mask(cm, mi_row, mi_col, subsize, plane, ssx, ssy); - if (has_next_col) - setup_block_mask(cm, mi_row, mi_col + hbs, subsize, plane, ssx, ssy); - if (has_next_row) - setup_block_mask(cm, mi_row + hbs, mi_col, subsize, plane, ssx, ssy); - if (has_next_col & has_next_row) - setup_block_mask(cm, mi_row + hbs, mi_col + hbs, subsize, plane, ssx, - ssy); - break; - case PARTITION_HORZ_A: - setup_fix_block_mask(cm, mi_row, mi_col, plane, ssx, ssy); - if (has_next_col) - setup_fix_block_mask(cm, mi_row, mi_col + hbs, plane, ssx, ssy); - if (has_next_row) - setup_fix_block_mask(cm, mi_row + hbs, mi_col, plane, ssx, ssy); - break; - case PARTITION_HORZ_B: - setup_fix_block_mask(cm, mi_row, mi_col, plane, ssx, ssy); - if (has_next_row) - setup_fix_block_mask(cm, mi_row + hbs, mi_col, plane, ssx, ssy); - if (has_next_col & has_next_row) - setup_fix_block_mask(cm, mi_row + hbs, mi_col + hbs, plane, ssx, ssy); - break; - case PARTITION_VERT_A: - setup_fix_block_mask(cm, mi_row, mi_col, plane, ssx, ssy); - if (has_next_row) - setup_fix_block_mask(cm, mi_row + hbs, mi_col, plane, ssx, ssy); - if (has_next_col) - setup_fix_block_mask(cm, mi_row, mi_col + hbs, plane, ssx, ssy); - break; - case PARTITION_VERT_B: - setup_fix_block_mask(cm, mi_row, mi_col, plane, 
ssx, ssy); - if (has_next_col) - setup_fix_block_mask(cm, mi_row, mi_col + hbs, plane, ssx, ssy); - if (has_next_row) - setup_fix_block_mask(cm, mi_row + hbs, mi_col + hbs, plane, ssx, ssy); - break; - case PARTITION_HORZ_4: - for (i = 0; i < 4; ++i) { - int this_mi_row = mi_row + i * quarter_step; - if (i > 0 && (this_mi_row << MI_SIZE_LOG2) >= cm->height) break; - // chroma plane filter the odd location - if (plane && bsize == BLOCK_16X16 && (i & 0x01)) continue; - - setup_fix_block_mask(cm, this_mi_row, mi_col, plane, ssx, ssy); - } - break; - case PARTITION_VERT_4: - for (i = 0; i < 4; ++i) { - int this_mi_col = mi_col + i * quarter_step; - if (i > 0 && this_mi_col >= cm->mi_cols) break; - // chroma plane filter the odd location - if (plane && bsize == BLOCK_16X16 && (i & 0x01)) continue; - - setup_fix_block_mask(cm, mi_row, this_mi_col, plane, ssx, ssy); - } - break; - default: assert(0); - } -} - -// TODO(chengchen): if lossless, do not need to setup mask. But when -// segments enabled, each segment has different lossless settings. 
-void av1_setup_bitmask(AV1_COMMON *const cm, int mi_row, int mi_col, int plane, - int subsampling_x, int subsampling_y, int row_end, - int col_end) { - const int num_64x64 = cm->seq_params.mib_size >> MIN_MIB_SIZE_LOG2; - for (int y = 0; y < num_64x64; ++y) { - for (int x = 0; x < num_64x64; ++x) { - const int row = mi_row + y * MI_SIZE_64X64; - const int col = mi_col + x * MI_SIZE_64X64; - if (row >= row_end || col >= col_end) continue; - if ((row << MI_SIZE_LOG2) >= cm->height || - (col << MI_SIZE_LOG2) >= cm->width) - continue; - - LoopFilterMask *lfm = get_loop_filter_mask(cm, row, col); - if (lfm == NULL) return; - - // init mask to zero - if (plane == 0) { - av1_zero(lfm->left_y); - av1_zero(lfm->above_y); - av1_zero(lfm->lfl_y_ver); - av1_zero(lfm->lfl_y_hor); - } else if (plane == 1) { - av1_zero(lfm->left_u); - av1_zero(lfm->above_u); - av1_zero(lfm->lfl_u); - } else { - av1_zero(lfm->left_v); - av1_zero(lfm->above_v); - av1_zero(lfm->lfl_v); - } - } - } - - // set up bitmask for each superblock - setup_block_mask(cm, mi_row, mi_col, cm->seq_params.sb_size, plane, - subsampling_x, subsampling_y); - - for (int y = 0; y < num_64x64; ++y) { - for (int x = 0; x < num_64x64; ++x) { - const int row = mi_row + y * MI_SIZE_64X64; - const int col = mi_col + x * MI_SIZE_64X64; - if (row >= row_end || col >= col_end) continue; - if ((row << MI_SIZE_LOG2) >= cm->height || - (col << MI_SIZE_LOG2) >= cm->width) - continue; - - LoopFilterMask *lfm = get_loop_filter_mask(cm, row, col); - if (lfm == NULL) return; - - // check if the mask is valid - check_loop_filter_masks(lfm, plane); - - { - // Let 16x16 hold 32x32 (Y/U/V) and 64x64(Y only). - // Even tx size is greater, we only apply max length filter, which - // is 16. 
- if (plane == 0) { - for (int j = 0; j < 4; ++j) { - lfm->left_y[TX_16X16].bits[j] |= lfm->left_y[TX_32X32].bits[j]; - lfm->left_y[TX_16X16].bits[j] |= lfm->left_y[TX_64X64].bits[j]; - lfm->above_y[TX_16X16].bits[j] |= lfm->above_y[TX_32X32].bits[j]; - lfm->above_y[TX_16X16].bits[j] |= lfm->above_y[TX_64X64].bits[j]; - - // set 32x32 and 64x64 to 0 - lfm->left_y[TX_32X32].bits[j] = 0; - lfm->left_y[TX_64X64].bits[j] = 0; - lfm->above_y[TX_32X32].bits[j] = 0; - lfm->above_y[TX_64X64].bits[j] = 0; - } - } else if (plane == 1) { - for (int j = 0; j < 4; ++j) { - lfm->left_u[TX_16X16].bits[j] |= lfm->left_u[TX_32X32].bits[j]; - lfm->above_u[TX_16X16].bits[j] |= lfm->above_u[TX_32X32].bits[j]; - - // set 32x32 to 0 - lfm->left_u[TX_32X32].bits[j] = 0; - lfm->above_u[TX_32X32].bits[j] = 0; - } - } else { - for (int j = 0; j < 4; ++j) { - lfm->left_v[TX_16X16].bits[j] |= lfm->left_v[TX_32X32].bits[j]; - lfm->above_v[TX_16X16].bits[j] |= lfm->above_v[TX_32X32].bits[j]; - - // set 32x32 to 0 - lfm->left_v[TX_32X32].bits[j] = 0; - lfm->above_v[TX_32X32].bits[j] = 0; - } - } - } - - // check if the mask is valid - check_loop_filter_masks(lfm, plane); - } - } -} - -static void filter_selectively_vert_row2( - int subsampling_factor, uint8_t *s, int pitch, int plane, - uint64_t mask_16x16_0, uint64_t mask_8x8_0, uint64_t mask_4x4_0, - uint64_t mask_16x16_1, uint64_t mask_8x8_1, uint64_t mask_4x4_1, - const loop_filter_info_n *lfi_n, uint8_t *lfl, uint8_t *lfl2) { - uint64_t mask; - const int step = 1 << subsampling_factor; - - for (mask = mask_16x16_0 | mask_8x8_0 | mask_4x4_0 | mask_16x16_1 | - mask_8x8_1 | mask_4x4_1; - mask; mask >>= step) { - const loop_filter_thresh *lfi0 = lfi_n->lfthr + *lfl; - const loop_filter_thresh *lfi1 = lfi_n->lfthr + *lfl2; - - if (mask & 1) { - if ((mask_16x16_0 | mask_16x16_1) & 1) { - // chroma plane filters less pixels introduced in deblock_13tap - // experiment - LpfFunc lpf_vertical = plane ? 
aom_lpf_vertical_6 : aom_lpf_vertical_14; - - if ((mask_16x16_0 & mask_16x16_1) & 1) { - if (plane) { - aom_lpf_vertical_6_dual(s, pitch, lfi0->mblim, lfi0->lim, - lfi0->hev_thr, lfi1->mblim, lfi1->lim, - lfi1->hev_thr); - } else { - aom_lpf_vertical_14_dual(s, pitch, lfi0->mblim, lfi0->lim, - lfi0->hev_thr, lfi1->mblim, lfi1->lim, - lfi1->hev_thr); - } - } else if (mask_16x16_0 & 1) { - lpf_vertical(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr); - } else { - lpf_vertical(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim, - lfi1->hev_thr); - } - } - - if ((mask_8x8_0 | mask_8x8_1) & 1) { - // chroma plane filters less pixels introduced in deblock_13tap - // experiment - LpfFunc lpf_vertical = plane ? aom_lpf_vertical_6 : aom_lpf_vertical_8; - - if ((mask_8x8_0 & mask_8x8_1) & 1) { - if (plane) { - aom_lpf_vertical_6_dual(s, pitch, lfi0->mblim, lfi0->lim, - lfi0->hev_thr, lfi1->mblim, lfi1->lim, - lfi1->hev_thr); - } else { - aom_lpf_vertical_8_dual(s, pitch, lfi0->mblim, lfi0->lim, - lfi0->hev_thr, lfi1->mblim, lfi1->lim, - lfi1->hev_thr); - } - } else if (mask_8x8_0 & 1) { - lpf_vertical(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr); - } else { - lpf_vertical(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim, - lfi1->hev_thr); - } - } - - if ((mask_4x4_0 | mask_4x4_1) & 1) { - if ((mask_4x4_0 & mask_4x4_1) & 1) { - aom_lpf_vertical_4_dual(s, pitch, lfi0->mblim, lfi0->lim, - lfi0->hev_thr, lfi1->mblim, lfi1->lim, - lfi1->hev_thr); - } else if (mask_4x4_0 & 1) { - aom_lpf_vertical_4(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr); - } else { - aom_lpf_vertical_4(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim, - lfi1->hev_thr); - } - } - } - - s += 4; - lfl += step; - lfl2 += step; - mask_16x16_0 >>= step; - mask_8x8_0 >>= step; - mask_4x4_0 >>= step; - mask_16x16_1 >>= step; - mask_8x8_1 >>= step; - mask_4x4_1 >>= step; - } -} - -static void highbd_filter_selectively_vert_row2( - int subsampling_factor, uint16_t *s, int pitch, int plane, - uint64_t mask_16x16_0, uint64_t 
mask_8x8_0, uint64_t mask_4x4_0, - uint64_t mask_16x16_1, uint64_t mask_8x8_1, uint64_t mask_4x4_1, - const loop_filter_info_n *lfi_n, uint8_t *lfl, uint8_t *lfl2, int bd) { - uint64_t mask; - const int step = 1 << subsampling_factor; - - for (mask = mask_16x16_0 | mask_8x8_0 | mask_4x4_0 | mask_16x16_1 | - mask_8x8_1 | mask_4x4_1; - mask; mask >>= step) { - const loop_filter_thresh *lfi0 = lfi_n->lfthr + *lfl; - const loop_filter_thresh *lfi1 = lfi_n->lfthr + *lfl2; - - if (mask & 1) { - if ((mask_16x16_0 | mask_16x16_1) & 1) { - // chroma plane filters less pixels introduced in deblock_13tap - // experiment - HbdLpfFunc highbd_lpf_vertical = - plane ? aom_highbd_lpf_vertical_6 : aom_highbd_lpf_vertical_14; - - if ((mask_16x16_0 & mask_16x16_1) & 1) { - if (plane) { - aom_highbd_lpf_vertical_6_dual(s, pitch, lfi0->mblim, lfi0->lim, - lfi0->hev_thr, lfi1->mblim, - lfi1->lim, lfi1->hev_thr, bd); - } else { - aom_highbd_lpf_vertical_14_dual(s, pitch, lfi0->mblim, lfi0->lim, - lfi0->hev_thr, lfi1->mblim, - lfi1->lim, lfi1->hev_thr, bd); - } - } else if (mask_16x16_0 & 1) { - highbd_lpf_vertical(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr, - bd); - } else { - highbd_lpf_vertical(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim, - lfi1->hev_thr, bd); - } - } - - if ((mask_8x8_0 | mask_8x8_1) & 1) { - HbdLpfFunc highbd_lpf_vertical = - plane ? 
aom_highbd_lpf_vertical_6 : aom_highbd_lpf_vertical_8; - - if ((mask_8x8_0 & mask_8x8_1) & 1) { - if (plane) { - aom_highbd_lpf_vertical_6_dual(s, pitch, lfi0->mblim, lfi0->lim, - lfi0->hev_thr, lfi1->mblim, - lfi1->lim, lfi1->hev_thr, bd); - } else { - aom_highbd_lpf_vertical_8_dual(s, pitch, lfi0->mblim, lfi0->lim, - lfi0->hev_thr, lfi1->mblim, - lfi1->lim, lfi1->hev_thr, bd); - } - } else if (mask_8x8_0 & 1) { - highbd_lpf_vertical(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr, - bd); - } else { - highbd_lpf_vertical(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim, - lfi1->hev_thr, bd); - } - } - - if ((mask_4x4_0 | mask_4x4_1) & 1) { - if ((mask_4x4_0 & mask_4x4_1) & 1) { - aom_highbd_lpf_vertical_4_dual(s, pitch, lfi0->mblim, lfi0->lim, - lfi0->hev_thr, lfi1->mblim, lfi1->lim, - lfi1->hev_thr, bd); - } else if (mask_4x4_0 & 1) { - aom_highbd_lpf_vertical_4(s, pitch, lfi0->mblim, lfi0->lim, - lfi0->hev_thr, bd); - } else { - aom_highbd_lpf_vertical_4(s + 4 * pitch, pitch, lfi1->mblim, - lfi1->lim, lfi1->hev_thr, bd); - } - } - } - - s += 4; - lfl += step; - lfl2 += step; - mask_16x16_0 >>= step; - mask_8x8_0 >>= step; - mask_4x4_0 >>= step; - mask_16x16_1 >>= step; - mask_8x8_1 >>= step; - mask_4x4_1 >>= step; - } -} - -static void filter_selectively_horiz(uint8_t *s, int pitch, int plane, - int subsampling, uint64_t mask_16x16, - uint64_t mask_8x8, uint64_t mask_4x4, - const loop_filter_info_n *lfi_n, - const uint8_t *lfl) { - uint64_t mask; - int count; - const int step = 1 << subsampling; - const unsigned int two_block_mask = subsampling ? 5 : 3; - - for (mask = mask_16x16 | mask_8x8 | mask_4x4; mask; mask >>= step * count) { - const loop_filter_thresh *lfi = lfi_n->lfthr + *lfl; - // Next block's thresholds. - const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + step); - - count = 1; - if (mask & 1) { - if (mask_16x16 & 1) { - // chroma plane filters less pixels introduced in deblock_13tap - // experiment - LpfFunc lpf_horizontal = - plane ? 
aom_lpf_horizontal_6 : aom_lpf_horizontal_14; - - if ((mask_16x16 & two_block_mask) == two_block_mask) { - if (plane) { - aom_lpf_horizontal_6_dual(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, lfin->mblim, lfin->lim, - lfin->hev_thr); - } else { - aom_lpf_horizontal_14_dual(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, lfin->mblim, lfin->lim, - lfin->hev_thr); - } - count = 2; - } else { - lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr); - } - } else if (mask_8x8 & 1) { - // chroma plane filters less pixels introduced in deblock_13tap - // experiment - LpfFunc lpf_horizontal = - plane ? aom_lpf_horizontal_6 : aom_lpf_horizontal_8; - - if ((mask_8x8 & two_block_mask) == two_block_mask) { - if (plane) { - aom_lpf_horizontal_6_dual(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, lfin->mblim, lfin->lim, - lfin->hev_thr); - } else { - aom_lpf_horizontal_8_dual(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, lfin->mblim, lfin->lim, - lfin->hev_thr); - } - count = 2; - } else { - lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr); - } - } else if (mask_4x4 & 1) { - if ((mask_4x4 & two_block_mask) == two_block_mask) { - aom_lpf_horizontal_4_dual(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, lfin->mblim, lfin->lim, - lfin->hev_thr); - count = 2; - } else { - aom_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr); - } - } - } - - s += 4 * count; - lfl += step * count; - mask_16x16 >>= step * count; - mask_8x8 >>= step * count; - mask_4x4 >>= step * count; - } -} - -static void highbd_filter_selectively_horiz( - uint16_t *s, int pitch, int plane, int subsampling, uint64_t mask_16x16, - uint64_t mask_8x8, uint64_t mask_4x4, const loop_filter_info_n *lfi_n, - uint8_t *lfl, int bd) { - uint64_t mask; - int count; - const int step = 1 << subsampling; - const unsigned int two_block_mask = subsampling ? 
5 : 3; - - for (mask = mask_16x16 | mask_8x8 | mask_4x4; mask; mask >>= step * count) { - const loop_filter_thresh *lfi = lfi_n->lfthr + *lfl; - // Next block's thresholds. - const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + step); - - count = 1; - if (mask & 1) { - if (mask_16x16 & 1) { - HbdLpfFunc highbd_lpf_horizontal = - plane ? aom_highbd_lpf_horizontal_6 : aom_highbd_lpf_horizontal_14; - - if ((mask_16x16 & two_block_mask) == two_block_mask) { - if (plane) { - aom_highbd_lpf_horizontal_6_dual(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, lfin->mblim, - lfin->lim, lfin->hev_thr, bd); - } else { - aom_highbd_lpf_horizontal_14_dual(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, lfin->mblim, - lfin->lim, lfin->hev_thr, bd); - } - count = 2; - } else { - highbd_lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, - bd); - } - } else if (mask_8x8 & 1) { - HbdLpfFunc highbd_lpf_horizontal = - plane ? aom_highbd_lpf_horizontal_6 : aom_highbd_lpf_horizontal_8; - - if ((mask_8x8 & two_block_mask) == two_block_mask) { - if (plane) { - aom_highbd_lpf_horizontal_6_dual(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, lfin->mblim, - lfin->lim, lfin->hev_thr, bd); - } else { - aom_highbd_lpf_horizontal_8_dual(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, lfin->mblim, - lfin->lim, lfin->hev_thr, bd); - } - count = 2; - } else { - highbd_lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, - bd); - } - } else if (mask_4x4 & 1) { - if ((mask_4x4 & two_block_mask) == two_block_mask) { - aom_highbd_lpf_horizontal_4_dual(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, lfin->mblim, lfin->lim, - lfin->hev_thr, bd); - count = 2; - } else { - aom_highbd_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, bd); - } - } - } - - s += 4 * count; - lfl += step * count; - mask_16x16 >>= step * count; - mask_8x8 >>= step * count; - mask_4x4 >>= step * count; - } -} - -void av1_build_bitmask_vert_info( - AV1_COMMON *const cm, const struct 
macroblockd_plane *const plane_ptr, - int plane) { - const int subsampling_x = plane_ptr->subsampling_x; - const int subsampling_y = plane_ptr->subsampling_y; - const int row_step = (MI_SIZE >> MI_SIZE_LOG2); - const int is_uv = plane > 0; - TX_SIZE tx_size = TX_16X16, prev_tx_size = TX_16X16; - uint8_t level, prev_level = 1; - int skip, prev_skip = 0; - int is_coding_block_border; - - for (int r = 0; (r << MI_SIZE_LOG2) < plane_ptr->dst.height; r += row_step) { - const int mi_row = r << subsampling_y; - const int row = mi_row % MI_SIZE_64X64; - int index = 0; - const int shift = get_index_shift(0, row, &index); - - for (int c = 0; (c << MI_SIZE_LOG2) < plane_ptr->dst.width; - c += (tx_size_wide_unit[TX_64X64] >> subsampling_x)) { - const int mi_col = c << subsampling_x; - LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col); - - for (int col_in_unit = 0; - col_in_unit < (tx_size_wide_unit[TX_64X64] >> subsampling_x);) { - const int x = (c + col_in_unit) << MI_SIZE_LOG2; - if (x >= plane_ptr->dst.width) break; - const int col = col_in_unit << subsampling_x; - const uint64_t mask = ((uint64_t)1 << (shift | col)); - skip = lfm->skip.bits[index] & mask; - is_coding_block_border = lfm->is_vert_border.bits[index] & mask; - switch (plane) { - case 0: level = lfm->lfl_y_ver[row][col]; break; - case 1: level = lfm->lfl_u[row][col]; break; - case 2: level = lfm->lfl_v[row][col]; break; - default: assert(plane >= 0 && plane <= 2); return; - } - for (TX_SIZE ts = TX_4X4; ts <= TX_64X64; ++ts) { - if (is_uv && ts == TX_64X64) continue; - if (lfm->tx_size_ver[is_uv][ts].bits[index] & mask) { - tx_size = ts; - break; - } - } - if ((c + col_in_unit > 0) && (level || prev_level) && - (!prev_skip || !skip || is_coding_block_border)) { - const TX_SIZE min_tx_size = - AOMMIN(TX_16X16, AOMMIN(tx_size, prev_tx_size)); - const int tmp_row = (mi_row | subsampling_y) % MI_SIZE_64X64; - const int tmp_col = (col | subsampling_x) % MI_SIZE_64X64; - const int shift_1 = 
get_index_shift(tmp_col, tmp_row, &index); - const uint64_t mask_1 = ((uint64_t)1 << shift_1); - switch (plane) { - case 0: lfm->left_y[min_tx_size].bits[index] |= mask_1; break; - case 1: lfm->left_u[min_tx_size].bits[index] |= mask_1; break; - case 2: lfm->left_v[min_tx_size].bits[index] |= mask_1; break; - default: assert(plane >= 0 && plane <= 2); return; - } - } - - // update prev info - prev_level = level; - prev_skip = skip; - prev_tx_size = tx_size; - // advance - col_in_unit += tx_size_wide_unit[tx_size]; - } - } - } -} - -void av1_build_bitmask_horz_info( - AV1_COMMON *const cm, const struct macroblockd_plane *const plane_ptr, - int plane) { - const int subsampling_x = plane_ptr->subsampling_x; - const int subsampling_y = plane_ptr->subsampling_y; - const int col_step = (MI_SIZE >> MI_SIZE_LOG2); - const int is_uv = plane > 0; - TX_SIZE tx_size = TX_16X16, prev_tx_size = TX_16X16; - uint8_t level, prev_level = 1; - int skip, prev_skip = 0; - int is_coding_block_border; - - for (int c = 0; (c << MI_SIZE_LOG2) < plane_ptr->dst.width; c += col_step) { - const int mi_col = c << subsampling_x; - const int col = mi_col % MI_SIZE_64X64; - - for (int r = 0; (r << MI_SIZE_LOG2) < plane_ptr->dst.height; - r += (tx_size_high_unit[TX_64X64] >> subsampling_y)) { - const int mi_row = r << subsampling_y; - LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col); - - for (int r_in_unit = 0; - r_in_unit < (tx_size_high_unit[TX_64X64] >> subsampling_y);) { - const int y = (r + r_in_unit) << MI_SIZE_LOG2; - if (y >= plane_ptr->dst.height) break; - const int row = r_in_unit << subsampling_y; - int index = 0; - const int shift = get_index_shift(col, row, &index); - const uint64_t mask = ((uint64_t)1 << shift); - skip = lfm->skip.bits[index] & mask; - is_coding_block_border = lfm->is_horz_border.bits[index] & mask; - switch (plane) { - case 0: level = lfm->lfl_y_hor[row][col]; break; - case 1: level = lfm->lfl_u[row][col]; break; - case 2: level = lfm->lfl_v[row][col]; 
break; - default: assert(plane >= 0 && plane <= 2); return; - } - for (TX_SIZE ts = TX_4X4; ts <= TX_64X64; ++ts) { - if (is_uv && ts == TX_64X64) continue; - if (lfm->tx_size_hor[is_uv][ts].bits[index] & mask) { - tx_size = ts; - break; - } - } - if ((r + r_in_unit > 0) && (level || prev_level) && - (!prev_skip || !skip || is_coding_block_border)) { - const TX_SIZE min_tx_size = - AOMMIN(TX_16X16, AOMMIN(tx_size, prev_tx_size)); - const int tmp_row = (row | subsampling_y) % MI_SIZE_64X64; - const int tmp_col = (mi_col | subsampling_x) % MI_SIZE_64X64; - const int shift_1 = get_index_shift(tmp_col, tmp_row, &index); - const uint64_t mask_1 = ((uint64_t)1 << shift_1); - - switch (plane) { - case 0: lfm->above_y[min_tx_size].bits[index] |= mask_1; break; - case 1: lfm->above_u[min_tx_size].bits[index] |= mask_1; break; - case 2: lfm->above_v[min_tx_size].bits[index] |= mask_1; break; - default: assert(plane >= 0 && plane <= 2); return; - } - } - - // update prev info - prev_level = level; - prev_skip = skip; - prev_tx_size = tx_size; - // advance - r_in_unit += tx_size_high_unit[tx_size]; - } - } - } -} - -void av1_filter_block_plane_bitmask_vert( - AV1_COMMON *const cm, struct macroblockd_plane *const plane_ptr, int pl, - int mi_row, int mi_col) { - struct buf_2d *const dst = &plane_ptr->dst; - uint8_t *const buf0 = dst->buf; - const int ssx = plane_ptr->subsampling_x; - const int ssy = plane_ptr->subsampling_y; - const int mask_cutoff = 0xffff; - const int row_step = 1 << ssy; - const int two_row_step = 2 << ssy; - const int row_stride = dst->stride << MI_SIZE_LOG2; - const int two_row_stride = row_stride << 1; - uint64_t mask_16x16 = 0; - uint64_t mask_8x8 = 0; - uint64_t mask_4x4 = 0; - uint8_t *lfl; - uint8_t *lfl2; - LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col); - assert(lfm); - - // 1. vertical filtering. 
filter two rows at a time - for (int r = 0; - ((mi_row + r) << MI_SIZE_LOG2) < cm->height && r < MI_SIZE_64X64; - r += two_row_step) { - const int row = r | ssy; - const int row_next = row + row_step; - const int col = ssx; - int index = 0; - const int shift = get_index_shift(col, row, &index); - int index_next = 0; - const int shift_next = get_index_shift(col, row_next, &index_next); - switch (pl) { - case 0: - mask_16x16 = lfm->left_y[TX_16X16].bits[index]; - mask_8x8 = lfm->left_y[TX_8X8].bits[index]; - mask_4x4 = lfm->left_y[TX_4X4].bits[index]; - lfl = &lfm->lfl_y_ver[row][col]; - lfl2 = &lfm->lfl_y_ver[row_next][col]; - break; - case 1: - mask_16x16 = lfm->left_u[TX_16X16].bits[index]; - mask_8x8 = lfm->left_u[TX_8X8].bits[index]; - mask_4x4 = lfm->left_u[TX_4X4].bits[index]; - lfl = &lfm->lfl_u[row][col]; - lfl2 = &lfm->lfl_u[row_next][col]; - break; - case 2: - mask_16x16 = lfm->left_v[TX_16X16].bits[index]; - mask_8x8 = lfm->left_v[TX_8X8].bits[index]; - mask_4x4 = lfm->left_v[TX_4X4].bits[index]; - lfl = &lfm->lfl_v[row][col]; - lfl2 = &lfm->lfl_v[row_next][col]; - break; - default: assert(pl >= 0 && pl <= 2); return; - } - uint64_t mask_16x16_0 = (mask_16x16 >> shift) & mask_cutoff; - uint64_t mask_8x8_0 = (mask_8x8 >> shift) & mask_cutoff; - uint64_t mask_4x4_0 = (mask_4x4 >> shift) & mask_cutoff; - uint64_t mask_16x16_1 = (mask_16x16 >> shift_next) & mask_cutoff; - uint64_t mask_8x8_1 = (mask_8x8 >> shift_next) & mask_cutoff; - uint64_t mask_4x4_1 = (mask_4x4 >> shift_next) & mask_cutoff; - - if (cm->seq_params.use_highbitdepth) - highbd_filter_selectively_vert_row2( - ssx, CONVERT_TO_SHORTPTR(dst->buf), dst->stride, pl, mask_16x16_0, - mask_8x8_0, mask_4x4_0, mask_16x16_1, mask_8x8_1, mask_4x4_1, - &cm->lf_info, lfl, lfl2, (int)cm->seq_params.bit_depth); - else - filter_selectively_vert_row2( - ssx, dst->buf, dst->stride, pl, mask_16x16_0, mask_8x8_0, mask_4x4_0, - mask_16x16_1, mask_8x8_1, mask_4x4_1, &cm->lf_info, lfl, lfl2); - dst->buf += 
two_row_stride; - } - // reset buf pointer for horizontal filtering - dst->buf = buf0; -} - -void av1_filter_block_plane_bitmask_horz( - AV1_COMMON *const cm, struct macroblockd_plane *const plane_ptr, int pl, - int mi_row, int mi_col) { - struct buf_2d *const dst = &plane_ptr->dst; - uint8_t *const buf0 = dst->buf; - const int ssx = plane_ptr->subsampling_x; - const int ssy = plane_ptr->subsampling_y; - const int mask_cutoff = 0xffff; - const int row_step = 1 << ssy; - const int row_stride = dst->stride << MI_SIZE_LOG2; - uint64_t mask_16x16 = 0; - uint64_t mask_8x8 = 0; - uint64_t mask_4x4 = 0; - uint8_t *lfl; - LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col); - assert(lfm); - for (int r = 0; - ((mi_row + r) << MI_SIZE_LOG2) < cm->height && r < MI_SIZE_64X64; - r += row_step) { - if (mi_row + r == 0) { - dst->buf += row_stride; - continue; - } - const int row = r | ssy; - const int col = ssx; - int index = 0; - const int shift = get_index_shift(col, row, &index); - switch (pl) { - case 0: - mask_16x16 = lfm->above_y[TX_16X16].bits[index]; - mask_8x8 = lfm->above_y[TX_8X8].bits[index]; - mask_4x4 = lfm->above_y[TX_4X4].bits[index]; - lfl = &lfm->lfl_y_hor[row][col]; - break; - case 1: - mask_16x16 = lfm->above_u[TX_16X16].bits[index]; - mask_8x8 = lfm->above_u[TX_8X8].bits[index]; - mask_4x4 = lfm->above_u[TX_4X4].bits[index]; - lfl = &lfm->lfl_u[row][col]; - break; - case 2: - mask_16x16 = lfm->above_v[TX_16X16].bits[index]; - mask_8x8 = lfm->above_v[TX_8X8].bits[index]; - mask_4x4 = lfm->above_v[TX_4X4].bits[index]; - lfl = &lfm->lfl_v[row][col]; - break; - default: assert(pl >= 0 && pl <= 2); return; - } - mask_16x16 = (mask_16x16 >> shift) & mask_cutoff; - mask_8x8 = (mask_8x8 >> shift) & mask_cutoff; - mask_4x4 = (mask_4x4 >> shift) & mask_cutoff; - - if (cm->seq_params.use_highbitdepth) - highbd_filter_selectively_horiz( - CONVERT_TO_SHORTPTR(dst->buf), dst->stride, pl, ssx, mask_16x16, - mask_8x8, mask_4x4, &cm->lf_info, lfl, 
(int)cm->seq_params.bit_depth); - else - filter_selectively_horiz(dst->buf, dst->stride, pl, ssx, mask_16x16, - mask_8x8, mask_4x4, &cm->lf_info, lfl); - dst->buf += row_stride; - } - // reset buf pointer for next block - dst->buf = buf0; -} - -void av1_filter_block_plane_ver(AV1_COMMON *const cm, - struct macroblockd_plane *const plane_ptr, - int pl, int mi_row, int mi_col) { - struct buf_2d *const dst = &plane_ptr->dst; - int r, c; - const int ssx = plane_ptr->subsampling_x; - const int ssy = plane_ptr->subsampling_y; - const int mask_cutoff = 0xffff; - const int single_step = 1 << ssy; - const int r_step = 2 << ssy; - uint64_t mask_16x16 = 0; - uint64_t mask_8x8 = 0; - uint64_t mask_4x4 = 0; - uint8_t *lfl; - uint8_t *lfl2; - - // filter two rows at a time - for (r = 0; r < cm->seq_params.mib_size && - ((mi_row + r) << MI_SIZE_LOG2 < cm->height); - r += r_step) { - for (c = 0; c < cm->seq_params.mib_size && - ((mi_col + c) << MI_SIZE_LOG2 < cm->width); - c += MI_SIZE_64X64) { - dst->buf += ((c << MI_SIZE_LOG2) >> ssx); - LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row + r, mi_col + c); - assert(lfm); - const int row = ((mi_row + r) | ssy) % MI_SIZE_64X64; - const int col = ((mi_col + c) | ssx) % MI_SIZE_64X64; - int index = 0; - const int shift = get_index_shift(col, row, &index); - // current and next row should belong to the same mask_idx and index - // next row's shift - const int row_next = row + single_step; - int index_next = 0; - const int shift_next = get_index_shift(col, row_next, &index_next); - switch (pl) { - case 0: - mask_16x16 = lfm->left_y[TX_16X16].bits[index]; - mask_8x8 = lfm->left_y[TX_8X8].bits[index]; - mask_4x4 = lfm->left_y[TX_4X4].bits[index]; - lfl = &lfm->lfl_y_ver[row][col]; - lfl2 = &lfm->lfl_y_ver[row_next][col]; - break; - case 1: - mask_16x16 = lfm->left_u[TX_16X16].bits[index]; - mask_8x8 = lfm->left_u[TX_8X8].bits[index]; - mask_4x4 = lfm->left_u[TX_4X4].bits[index]; - lfl = &lfm->lfl_u[row][col]; - lfl2 = 
&lfm->lfl_u[row_next][col]; - break; - case 2: - mask_16x16 = lfm->left_v[TX_16X16].bits[index]; - mask_8x8 = lfm->left_v[TX_8X8].bits[index]; - mask_4x4 = lfm->left_v[TX_4X4].bits[index]; - lfl = &lfm->lfl_v[row][col]; - lfl2 = &lfm->lfl_v[row_next][col]; - break; - default: assert(pl >= 0 && pl <= 2); return; - } - uint64_t mask_16x16_0 = (mask_16x16 >> shift) & mask_cutoff; - uint64_t mask_8x8_0 = (mask_8x8 >> shift) & mask_cutoff; - uint64_t mask_4x4_0 = (mask_4x4 >> shift) & mask_cutoff; - uint64_t mask_16x16_1 = (mask_16x16 >> shift_next) & mask_cutoff; - uint64_t mask_8x8_1 = (mask_8x8 >> shift_next) & mask_cutoff; - uint64_t mask_4x4_1 = (mask_4x4 >> shift_next) & mask_cutoff; - - if (cm->seq_params.use_highbitdepth) - highbd_filter_selectively_vert_row2( - ssx, CONVERT_TO_SHORTPTR(dst->buf), dst->stride, pl, mask_16x16_0, - mask_8x8_0, mask_4x4_0, mask_16x16_1, mask_8x8_1, mask_4x4_1, - &cm->lf_info, lfl, lfl2, (int)cm->seq_params.bit_depth); - else - filter_selectively_vert_row2(ssx, dst->buf, dst->stride, pl, - mask_16x16_0, mask_8x8_0, mask_4x4_0, - mask_16x16_1, mask_8x8_1, mask_4x4_1, - &cm->lf_info, lfl, lfl2); - dst->buf -= ((c << MI_SIZE_LOG2) >> ssx); - } - dst->buf += 2 * MI_SIZE * dst->stride; - } -} - -void av1_filter_block_plane_hor(AV1_COMMON *const cm, - struct macroblockd_plane *const plane_ptr, - int pl, int mi_row, int mi_col) { - struct buf_2d *const dst = &plane_ptr->dst; - int r, c; - const int ssx = plane_ptr->subsampling_x; - const int ssy = plane_ptr->subsampling_y; - const int mask_cutoff = 0xffff; - const int r_step = 1 << ssy; - uint64_t mask_16x16 = 0; - uint64_t mask_8x8 = 0; - uint64_t mask_4x4 = 0; - uint8_t *lfl; - - for (r = 0; r < cm->seq_params.mib_size && - ((mi_row + r) << MI_SIZE_LOG2 < cm->height); - r += r_step) { - for (c = 0; c < cm->seq_params.mib_size && - ((mi_col + c) << MI_SIZE_LOG2 < cm->width); - c += MI_SIZE_64X64) { - if (mi_row + r == 0) continue; - - dst->buf += ((c << MI_SIZE_LOG2) >> ssx); - 
LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row + r, mi_col + c); - assert(lfm); - const int row = ((mi_row + r) | ssy) % MI_SIZE_64X64; - const int col = ((mi_col + c) | ssx) % MI_SIZE_64X64; - int index = 0; - const int shift = get_index_shift(col, row, &index); - switch (pl) { - case 0: - mask_16x16 = lfm->above_y[TX_16X16].bits[index]; - mask_8x8 = lfm->above_y[TX_8X8].bits[index]; - mask_4x4 = lfm->above_y[TX_4X4].bits[index]; - lfl = &lfm->lfl_y_hor[row][col]; - break; - case 1: - mask_16x16 = lfm->above_u[TX_16X16].bits[index]; - mask_8x8 = lfm->above_u[TX_8X8].bits[index]; - mask_4x4 = lfm->above_u[TX_4X4].bits[index]; - lfl = &lfm->lfl_u[row][col]; - break; - case 2: - mask_16x16 = lfm->above_v[TX_16X16].bits[index]; - mask_8x8 = lfm->above_v[TX_8X8].bits[index]; - mask_4x4 = lfm->above_v[TX_4X4].bits[index]; - lfl = &lfm->lfl_v[row][col]; - break; - default: assert(pl >= 0 && pl <= 2); return; - } - mask_16x16 = (mask_16x16 >> shift) & mask_cutoff; - mask_8x8 = (mask_8x8 >> shift) & mask_cutoff; - mask_4x4 = (mask_4x4 >> shift) & mask_cutoff; - - if (cm->seq_params.use_highbitdepth) - highbd_filter_selectively_horiz(CONVERT_TO_SHORTPTR(dst->buf), - dst->stride, pl, ssx, mask_16x16, - mask_8x8, mask_4x4, &cm->lf_info, lfl, - (int)cm->seq_params.bit_depth); - else - filter_selectively_horiz(dst->buf, dst->stride, pl, ssx, mask_16x16, - mask_8x8, mask_4x4, &cm->lf_info, lfl); - dst->buf -= ((c << MI_SIZE_LOG2) >> ssx); - } - dst->buf += MI_SIZE * dst->stride; - } -} -#endif // LOOP_FILTER_BITMASK - static TX_SIZE get_transform_size(const MACROBLOCKD *const xd, const MB_MODE_INFO *const mbmi, const EDGE_DIR edge_dir, const int mi_row, @@ -1914,7 +212,7 @@ static TX_SIZE get_transform_size(const MACROBLOCKD *const xd, tx_size = mb_tx_size; } - // since in case of chrominance or non-square transorm need to convert + // since in case of chrominance or non-square transform need to convert // transform size into transform size in particular direction. 
// for vertical edge, filter direction is horizontal, for horizontal // edge, filter direction is vertical. @@ -1933,7 +231,7 @@ typedef struct AV1_DEBLOCKING_PARAMETERS { } AV1_DEBLOCKING_PARAMETERS; // Return TX_SIZE from get_transform_size(), so it is plane and direction -// awared +// aware static TX_SIZE set_lpf_parameters( AV1_DEBLOCKING_PARAMETERS *const params, const ptrdiff_t mode_step, const AV1_COMMON *const cm, const MACROBLOCKD *const xd, @@ -1958,7 +256,8 @@ static TX_SIZE set_lpf_parameters( // and mi_col should be odd number for chroma plane. const int mi_row = scale_vert | ((y << scale_vert) >> MI_SIZE_LOG2); const int mi_col = scale_horz | ((x << scale_horz) >> MI_SIZE_LOG2); - MB_MODE_INFO **mi = cm->mi_grid_visible + mi_row * cm->mi_stride + mi_col; + MB_MODE_INFO **mi = + cm->mi_params.mi_grid_base + mi_row * cm->mi_params.mi_stride + mi_col; const MB_MODE_INFO *mbmi = mi[0]; // If current mbmi is not correctly setup, return an invalid value to stop // filtering. One example is that if this tile is not coded, then its mbmi @@ -1979,7 +278,7 @@ static TX_SIZE set_lpf_parameters( // prepare outer edge parameters. deblock the edge if it's an edge of a TU { const uint32_t curr_level = - get_filter_level(cm, &cm->lf_info, edge_dir, plane, mbmi); + av1_get_filter_level(cm, &cm->lf_info, edge_dir, plane, mbmi); const int curr_skipped = mbmi->skip && is_inter_block(mbmi); uint32_t level = curr_level; if (coord) { @@ -1994,12 +293,13 @@ static TX_SIZE set_lpf_parameters( xd, mi_prev, edge_dir, pv_row, pv_col, plane, plane_ptr); const uint32_t pv_lvl = - get_filter_level(cm, &cm->lf_info, edge_dir, plane, mi_prev); + av1_get_filter_level(cm, &cm->lf_info, edge_dir, plane, mi_prev); const int pv_skip = mi_prev->skip && is_inter_block(mi_prev); const BLOCK_SIZE bsize = get_plane_block_size(mbmi->sb_type, plane_ptr->subsampling_x, plane_ptr->subsampling_y); + assert(bsize < BLOCK_SIZES_ALL); const int prediction_masks = edge_dir == VERT_EDGE ? 
block_size_wide[bsize] - 1 : block_size_high[bsize] - 1; @@ -2047,21 +347,18 @@ void av1_filter_block_plane_vert(const AV1_COMMON *const cm, const MACROBLOCKD *const xd, const int plane, const MACROBLOCKD_PLANE *const plane_ptr, const uint32_t mi_row, const uint32_t mi_col) { - const int row_step = MI_SIZE >> MI_SIZE_LOG2; const uint32_t scale_horz = plane_ptr->subsampling_x; const uint32_t scale_vert = plane_ptr->subsampling_y; uint8_t *const dst_ptr = plane_ptr->dst.buf; const int dst_stride = plane_ptr->dst.stride; const int y_range = (MAX_MIB_SIZE >> scale_vert); const int x_range = (MAX_MIB_SIZE >> scale_horz); - const int use_highbitdepth = cm->seq_params.use_highbitdepth; - const aom_bit_depth_t bit_depth = cm->seq_params.bit_depth; - for (int y = 0; y < y_range; y += row_step) { + for (int y = 0; y < y_range; y++) { uint8_t *p = dst_ptr + y * MI_SIZE * dst_stride; for (int x = 0; x < x_range;) { // inner loop always filter vertical edges in a MI block. If MI size // is 8x8, it will filter the vertical edge aligned with a 8x8 block. 
- // If 4x4 trasnform is used, it will then filter the internal edge + // If 4x4 transform is used, it will then filter the internal edge // aligned with a 4x4 block const uint32_t curr_x = ((mi_col * MI_SIZE) >> scale_horz) + x * MI_SIZE; const uint32_t curr_y = ((mi_row * MI_SIZE) >> scale_vert) + y * MI_SIZE; @@ -2078,6 +375,9 @@ void av1_filter_block_plane_vert(const AV1_COMMON *const cm, tx_size = TX_4X4; } +#if CONFIG_AV1_HIGHBITDEPTH + const int use_highbitdepth = cm->seq_params.use_highbitdepth; + const aom_bit_depth_t bit_depth = cm->seq_params.bit_depth; switch (params.filter_length) { // apply 4-tap filtering case 4: @@ -2122,6 +422,32 @@ void av1_filter_block_plane_vert(const AV1_COMMON *const cm, // no filtering default: break; } +#else + switch (params.filter_length) { + // apply 4-tap filtering + case 4: + aom_lpf_vertical_4(p, dst_stride, params.mblim, params.lim, + params.hev_thr); + break; + case 6: // apply 6-tap filter for chroma plane only + assert(plane != 0); + aom_lpf_vertical_6(p, dst_stride, params.mblim, params.lim, + params.hev_thr); + break; + // apply 8-tap filtering + case 8: + aom_lpf_vertical_8(p, dst_stride, params.mblim, params.lim, + params.hev_thr); + break; + // apply 14-tap filtering + case 14: + aom_lpf_vertical_14(p, dst_stride, params.mblim, params.lim, + params.hev_thr); + break; + // no filtering + default: break; + } +#endif // CONFIG_AV1_HIGHBITDEPTH // advance the destination pointer advance_units = tx_size_wide_unit[tx_size]; x += advance_units; @@ -2134,21 +460,18 @@ void av1_filter_block_plane_horz(const AV1_COMMON *const cm, const MACROBLOCKD *const xd, const int plane, const MACROBLOCKD_PLANE *const plane_ptr, const uint32_t mi_row, const uint32_t mi_col) { - const int col_step = MI_SIZE >> MI_SIZE_LOG2; const uint32_t scale_horz = plane_ptr->subsampling_x; const uint32_t scale_vert = plane_ptr->subsampling_y; uint8_t *const dst_ptr = plane_ptr->dst.buf; const int dst_stride = plane_ptr->dst.stride; const int 
y_range = (MAX_MIB_SIZE >> scale_vert); const int x_range = (MAX_MIB_SIZE >> scale_horz); - const int use_highbitdepth = cm->seq_params.use_highbitdepth; - const aom_bit_depth_t bit_depth = cm->seq_params.bit_depth; - for (int x = 0; x < x_range; x += col_step) { + for (int x = 0; x < x_range; x++) { uint8_t *p = dst_ptr + x * MI_SIZE; for (int y = 0; y < y_range;) { // inner loop always filter vertical edges in a MI block. If MI size // is 8x8, it will first filter the vertical edge aligned with a 8x8 - // block. If 4x4 trasnform is used, it will then filter the internal + // block. If 4x4 transform is used, it will then filter the internal // edge aligned with a 4x4 block const uint32_t curr_x = ((mi_col * MI_SIZE) >> scale_horz) + x * MI_SIZE; const uint32_t curr_y = ((mi_row * MI_SIZE) >> scale_vert) + y * MI_SIZE; @@ -2157,14 +480,17 @@ void av1_filter_block_plane_horz(const AV1_COMMON *const cm, AV1_DEBLOCKING_PARAMETERS params; memset(¶ms, 0, sizeof(params)); - tx_size = - set_lpf_parameters(¶ms, (cm->mi_stride << scale_vert), cm, xd, - HORZ_EDGE, curr_x, curr_y, plane, plane_ptr); + tx_size = set_lpf_parameters( + ¶ms, (cm->mi_params.mi_stride << scale_vert), cm, xd, HORZ_EDGE, + curr_x, curr_y, plane, plane_ptr); if (tx_size == TX_INVALID) { params.filter_length = 0; tx_size = TX_4X4; } +#if CONFIG_AV1_HIGHBITDEPTH + const int use_highbitdepth = cm->seq_params.use_highbitdepth; + const aom_bit_depth_t bit_depth = cm->seq_params.bit_depth; switch (params.filter_length) { // apply 4-tap filtering case 4: @@ -2210,6 +536,117 @@ void av1_filter_block_plane_horz(const AV1_COMMON *const cm, // no filtering default: break; } +#else + switch (params.filter_length) { + // apply 4-tap filtering + case 4: + aom_lpf_horizontal_4(p, dst_stride, params.mblim, params.lim, + params.hev_thr); + break; + // apply 6-tap filtering + case 6: + assert(plane != 0); + aom_lpf_horizontal_6(p, dst_stride, params.mblim, params.lim, + params.hev_thr); + break; + // apply 8-tap 
filtering + case 8: + aom_lpf_horizontal_8(p, dst_stride, params.mblim, params.lim, + params.hev_thr); + break; + // apply 14-tap filtering + case 14: + aom_lpf_horizontal_14(p, dst_stride, params.mblim, params.lim, + params.hev_thr); + break; + // no filtering + default: break; + } +#endif // CONFIG_AV1_HIGHBITDEPTH + + // advance the destination pointer + advance_units = tx_size_high_unit[tx_size]; + y += advance_units; + p += advance_units * dst_stride * MI_SIZE; + } + } +} + +void av1_filter_block_plane_vert_test(const AV1_COMMON *const cm, + const MACROBLOCKD *const xd, + const int plane, + const MACROBLOCKD_PLANE *const plane_ptr, + const uint32_t mi_row, + const uint32_t mi_col) { + const uint32_t scale_horz = plane_ptr->subsampling_x; + const uint32_t scale_vert = plane_ptr->subsampling_y; + uint8_t *const dst_ptr = plane_ptr->dst.buf; + const int dst_stride = plane_ptr->dst.stride; + const int y_range = cm->mi_params.mi_rows >> scale_vert; + const int x_range = cm->mi_params.mi_cols >> scale_horz; + for (int y = 0; y < y_range; y++) { + uint8_t *p = dst_ptr + y * MI_SIZE * dst_stride; + for (int x = 0; x < x_range;) { + // inner loop always filter vertical edges in a MI block. If MI size + // is 8x8, it will filter the vertical edge aligned with a 8x8 block. 
+ // If 4x4 transform is used, it will then filter the internal edge + // aligned with a 4x4 block + const uint32_t curr_x = ((mi_col * MI_SIZE) >> scale_horz) + x * MI_SIZE; + const uint32_t curr_y = ((mi_row * MI_SIZE) >> scale_vert) + y * MI_SIZE; + uint32_t advance_units; + TX_SIZE tx_size; + AV1_DEBLOCKING_PARAMETERS params; + memset(¶ms, 0, sizeof(params)); + + tx_size = + set_lpf_parameters(¶ms, ((ptrdiff_t)1 << scale_horz), cm, xd, + VERT_EDGE, curr_x, curr_y, plane, plane_ptr); + if (tx_size == TX_INVALID) { + params.filter_length = 0; + tx_size = TX_4X4; + } + + // advance the destination pointer + advance_units = tx_size_wide_unit[tx_size]; + x += advance_units; + p += advance_units * MI_SIZE; + } + } +} + +void av1_filter_block_plane_horz_test(const AV1_COMMON *const cm, + const MACROBLOCKD *const xd, + const int plane, + const MACROBLOCKD_PLANE *const plane_ptr, + const uint32_t mi_row, + const uint32_t mi_col) { + const uint32_t scale_horz = plane_ptr->subsampling_x; + const uint32_t scale_vert = plane_ptr->subsampling_y; + uint8_t *const dst_ptr = plane_ptr->dst.buf; + const int dst_stride = plane_ptr->dst.stride; + const int y_range = cm->mi_params.mi_rows >> scale_vert; + const int x_range = cm->mi_params.mi_cols >> scale_horz; + for (int x = 0; x < x_range; x++) { + uint8_t *p = dst_ptr + x * MI_SIZE; + for (int y = 0; y < y_range;) { + // inner loop always filter vertical edges in a MI block. If MI size + // is 8x8, it will first filter the vertical edge aligned with a 8x8 + // block. 
If 4x4 transform is used, it will then filter the internal + // edge aligned with a 4x4 block + const uint32_t curr_x = ((mi_col * MI_SIZE) >> scale_horz) + x * MI_SIZE; + const uint32_t curr_y = ((mi_row * MI_SIZE) >> scale_vert) + y * MI_SIZE; + uint32_t advance_units; + TX_SIZE tx_size; + AV1_DEBLOCKING_PARAMETERS params; + memset(¶ms, 0, sizeof(params)); + + tx_size = set_lpf_parameters( + ¶ms, (cm->mi_params.mi_stride << scale_vert), cm, xd, HORZ_EDGE, + curr_x, curr_y, plane, plane_ptr); + if (tx_size == TX_INVALID) { + params.filter_length = 0; + tx_size = TX_4X4; + } // advance the destination pointer advance_units = tx_size_high_unit[tx_size]; @@ -2221,18 +658,19 @@ void av1_filter_block_plane_horz(const AV1_COMMON *const cm, static void loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer, AV1_COMMON *cm, MACROBLOCKD *xd, int start, int stop, -#if LOOP_FILTER_BITMASK +#if CONFIG_LPF_MASK int is_decoding, #endif int plane_start, int plane_end) { struct macroblockd_plane *pd = xd->plane; const int col_start = 0; - const int col_end = cm->mi_cols; + const int col_end = cm->mi_params.mi_cols; int mi_row, mi_col; int plane; -#if LOOP_FILTER_BITMASK +#if CONFIG_LPF_MASK if (is_decoding) { + cm->is_decoding = is_decoding; for (plane = plane_start; plane < plane_end; plane++) { if (plane == 0 && !(cm->lf.filter_level[0]) && !(cm->lf.filter_level[1])) break; @@ -2243,24 +681,25 @@ static void loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer, AV1_COMMON *cm, av1_setup_dst_planes(pd, cm->seq_params.sb_size, frame_buffer, 0, 0, plane, plane + 1); + av1_build_bitmask_vert_info(cm, &pd[plane], plane); av1_build_bitmask_horz_info(cm, &pd[plane], plane); // apply loop filtering which only goes through buffer once for (mi_row = start; mi_row < stop; mi_row += MI_SIZE_64X64) { for (mi_col = col_start; mi_col < col_end; mi_col += MI_SIZE_64X64) { - av1_setup_dst_planes(pd, MI_SIZE_64X64, frame_buffer, mi_row, mi_col, + av1_setup_dst_planes(pd, BLOCK_64X64, frame_buffer, 
mi_row, mi_col, plane, plane + 1); av1_filter_block_plane_bitmask_vert(cm, &pd[plane], plane, mi_row, mi_col); if (mi_col - MI_SIZE_64X64 >= 0) { - av1_setup_dst_planes(pd, MI_SIZE_64X64, frame_buffer, mi_row, + av1_setup_dst_planes(pd, BLOCK_64X64, frame_buffer, mi_row, mi_col - MI_SIZE_64X64, plane, plane + 1); av1_filter_block_plane_bitmask_horz(cm, &pd[plane], plane, mi_row, mi_col - MI_SIZE_64X64); } } - av1_setup_dst_planes(pd, MI_SIZE_64X64, frame_buffer, mi_row, + av1_setup_dst_planes(pd, BLOCK_64X64, frame_buffer, mi_row, mi_col - MI_SIZE_64X64, plane, plane + 1); av1_filter_block_plane_bitmask_horz(cm, &pd[plane], plane, mi_row, mi_col - MI_SIZE_64X64); @@ -2278,31 +717,6 @@ static void loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer, AV1_COMMON *cm, else if (plane == 2 && !(cm->lf.filter_level_v)) continue; -#if LOOP_FILTER_BITMASK - // filter all vertical edges every superblock (could be 128x128 or 64x64) - for (mi_row = start; mi_row < stop; mi_row += cm->seq_params.mib_size) { - for (mi_col = col_start; mi_col < col_end; - mi_col += cm->seq_params.mib_size) { - av1_setup_dst_planes(pd, cm->seq_params.sb_size, frame_buffer, mi_row, - mi_col, plane, plane + 1); - - av1_setup_bitmask(cm, mi_row, mi_col, plane, pd[plane].subsampling_x, - pd[plane].subsampling_y, stop, col_end); - av1_filter_block_plane_ver(cm, &pd[plane], plane, mi_row, mi_col); - } - } - - // filter all horizontal edges every superblock - for (mi_row = start; mi_row < stop; mi_row += cm->seq_params.mib_size) { - for (mi_col = col_start; mi_col < col_end; - mi_col += cm->seq_params.mib_size) { - av1_setup_dst_planes(pd, cm->seq_params.sb_size, frame_buffer, mi_row, - mi_col, plane, plane + 1); - - av1_filter_block_plane_hor(cm, &pd[plane], plane, mi_row, mi_col); - } - } -#else if (cm->lf.combine_vert_horz_lf) { // filter all vertical and horizontal edges in every 128x128 super block for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) { @@ -2348,29 +762,28 @@ static void 
loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer, AV1_COMMON *cm, } } } -#endif // LOOP_FILTER_BITMASK } } void av1_loop_filter_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, MACROBLOCKD *xd, -#if LOOP_FILTER_BITMASK +#if CONFIG_LPF_MASK int is_decoding, #endif int plane_start, int plane_end, int partial_frame) { int start_mi_row, end_mi_row, mi_rows_to_filter; start_mi_row = 0; - mi_rows_to_filter = cm->mi_rows; - if (partial_frame && cm->mi_rows > 8) { - start_mi_row = cm->mi_rows >> 1; + mi_rows_to_filter = cm->mi_params.mi_rows; + if (partial_frame && cm->mi_params.mi_rows > 8) { + start_mi_row = cm->mi_params.mi_rows >> 1; start_mi_row &= 0xfffffff8; - mi_rows_to_filter = AOMMAX(cm->mi_rows / 8, 8); + mi_rows_to_filter = AOMMAX(cm->mi_params.mi_rows / 8, 8); } end_mi_row = start_mi_row + mi_rows_to_filter; av1_loop_filter_frame_init(cm, plane_start, plane_end); loop_filter_rows(frame, cm, xd, start_mi_row, end_mi_row, -#if LOOP_FILTER_BITMASK +#if CONFIG_LPF_MASK is_decoding, #endif plane_start, plane_end); diff --git a/media/libaom/src/av1/common/av1_loopfilter.h b/media/libaom/src/av1/common/av1_loopfilter.h index 80ac61178..ce26d1647 100644 --- a/media/libaom/src/av1/common/av1_loopfilter.h +++ b/media/libaom/src/av1/common/av1_loopfilter.h @@ -33,11 +33,12 @@ enum lf_path { LF_PATH_SLOW, }; -#if LOOP_FILTER_BITMASK +enum { VERT_EDGE = 0, HORZ_EDGE = 1, NUM_EDGE_DIRS } UENUM1BYTE(EDGE_DIR); typedef struct { uint64_t bits[4]; } FilterMask; +#if CONFIG_LPF_MASK // This structure holds bit masks for all 4x4 blocks in a 64x64 region. // Each 1 bit represents a position in which we want to apply the loop filter. 
// For Y plane, 4x4 in 64x64 requires 16x16 = 256 bit, therefore we use 4 @@ -61,10 +62,12 @@ typedef struct { uint8_t lfl_y_ver[MI_SIZE_64X64][MI_SIZE_64X64]; // U plane filter level - uint8_t lfl_u[MI_SIZE_64X64][MI_SIZE_64X64]; + uint8_t lfl_u_ver[MI_SIZE_64X64][MI_SIZE_64X64]; + uint8_t lfl_u_hor[MI_SIZE_64X64][MI_SIZE_64X64]; // V plane filter level - uint8_t lfl_v[MI_SIZE_64X64][MI_SIZE_64X64]; + uint8_t lfl_v_ver[MI_SIZE_64X64][MI_SIZE_64X64]; + uint8_t lfl_v_hor[MI_SIZE_64X64][MI_SIZE_64X64]; // other info FilterMask skip; @@ -74,7 +77,7 @@ typedef struct { FilterMask tx_size_ver[2][5]; FilterMask tx_size_hor[2][5]; } LoopFilterMask; -#endif // LOOP_FILTER_BITMASK +#endif // CONFIG_LPF_MASK struct loopfilter { int filter_level[2]; @@ -95,11 +98,11 @@ struct loopfilter { int combine_vert_horz_lf; -#if LOOP_FILTER_BITMASK +#if CONFIG_LPF_MASK LoopFilterMask *lfm; size_t lfm_num; int lfm_stride; -#endif // LOOP_FILTER_BITMASK +#endif // CONFIG_LPF_MASK }; // Need to align this structure so when it is declared and @@ -125,13 +128,13 @@ void av1_loop_filter_init(struct AV1Common *cm); void av1_loop_filter_frame_init(struct AV1Common *cm, int plane_start, int plane_end); -#if LOOP_FILTER_BITMASK +#if CONFIG_LPF_MASK void av1_loop_filter_frame(YV12_BUFFER_CONFIG *frame, struct AV1Common *cm, - struct macroblockd *mbd, int is_decoding, + struct macroblockd *xd, int is_decoding, int plane_start, int plane_end, int partial_frame); #else void av1_loop_filter_frame(YV12_BUFFER_CONFIG *frame, struct AV1Common *cm, - struct macroblockd *mbd, int plane_start, + struct macroblockd *xd, int plane_start, int plane_end, int partial_frame); #endif @@ -154,14 +157,10 @@ typedef struct LoopFilterWorkerData { MACROBLOCKD *xd; } LFWorkerData; -uint8_t get_filter_level(const struct AV1Common *cm, - const loop_filter_info_n *lfi_n, const int dir_idx, - int plane, const MB_MODE_INFO *mbmi); -#if LOOP_FILTER_BITMASK -void av1_setup_bitmask(struct AV1Common *const cm, int mi_row, int 
mi_col, - int plane, int subsampling_x, int subsampling_y, - int row_end, int col_end); - +uint8_t av1_get_filter_level(const struct AV1Common *cm, + const loop_filter_info_n *lfi_n, const int dir_idx, + int plane, const MB_MODE_INFO *mbmi); +#if CONFIG_LPF_MASK void av1_filter_block_plane_ver(struct AV1Common *const cm, struct macroblockd_plane *const plane_ptr, int pl, int mi_row, int mi_col); @@ -169,56 +168,38 @@ void av1_filter_block_plane_ver(struct AV1Common *const cm, void av1_filter_block_plane_hor(struct AV1Common *const cm, struct macroblockd_plane *const plane, int pl, int mi_row, int mi_col); -LoopFilterMask *get_loop_filter_mask(const struct AV1Common *const cm, - int mi_row, int mi_col); -int get_index_shift(int mi_col, int mi_row, int *index); - -static const FilterMask left_txform_mask[TX_SIZES] = { - { { 0x0000000000000001ULL, // TX_4X4, - 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } }, - - { { 0x0000000000010001ULL, // TX_8X8, - 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } }, - - { { 0x0001000100010001ULL, // TX_16X16, - 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } }, - { { 0x0001000100010001ULL, // TX_32X32, - 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } }, +int get_index_shift(int mi_col, int mi_row, int *index); - { { 0x0001000100010001ULL, // TX_64X64, - 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL } }, -}; +void av1_build_bitmask_vert_info( + struct AV1Common *const cm, const struct macroblockd_plane *const plane_ptr, + int plane); -static const uint64_t above_txform_mask[2][TX_SIZES] = { - { - 0x0000000000000001ULL, // TX_4X4 - 0x0000000000000003ULL, // TX_8X8 - 0x000000000000000fULL, // TX_16X16 - 0x00000000000000ffULL, // TX_32X32 - 0x000000000000ffffULL, // TX_64X64 - }, - { - 0x0000000000000001ULL, // TX_4X4 - 0x0000000000000005ULL, // TX_8X8 - 0x0000000000000055ULL, // TX_16X16 - 0x0000000000005555ULL, // TX_32X32 
- 0x0000000055555555ULL, // TX_64X64 - }, -}; +void av1_build_bitmask_horz_info( + struct AV1Common *const cm, const struct macroblockd_plane *const plane_ptr, + int plane); -extern const int mask_id_table_tx_4x4[BLOCK_SIZES_ALL]; +void av1_filter_block_plane_bitmask_vert( + struct AV1Common *const cm, struct macroblockd_plane *const plane_ptr, + int pl, int mi_row, int mi_col); -extern const int mask_id_table_tx_8x8[BLOCK_SIZES_ALL]; +void av1_filter_block_plane_bitmask_horz( + struct AV1Common *const cm, struct macroblockd_plane *const plane_ptr, + int pl, int mi_row, int mi_col); -extern const int mask_id_table_tx_16x16[BLOCK_SIZES_ALL]; +void av1_store_bitmask_univariant_tx(struct AV1Common *cm, int mi_row, + int mi_col, BLOCK_SIZE bsize, + MB_MODE_INFO *mbmi); -extern const int mask_id_table_tx_32x32[BLOCK_SIZES_ALL]; +void av1_store_bitmask_other_info(struct AV1Common *cm, int mi_row, int mi_col, + BLOCK_SIZE bsize, MB_MODE_INFO *mbmi, + int is_horz_coding_block_border, + int is_vert_coding_block_border); -extern const FilterMask left_mask_univariant_reordered[67]; - -extern const FilterMask above_mask_univariant_reordered[67]; -#endif +void av1_store_bitmask_vartx(struct AV1Common *cm, int mi_row, int mi_col, + BLOCK_SIZE bsize, TX_SIZE tx_size, + MB_MODE_INFO *mbmi); +#endif // CONFIG_LPF_MASK #ifdef __cplusplus } // extern "C" diff --git a/media/libaom/src/av1/common/av1_rtcd_defs.pl b/media/libaom/src/av1/common/av1_rtcd_defs.pl index dee1f1c79..296c6c572 100644 --- a/media/libaom/src/av1/common/av1_rtcd_defs.pl +++ b/media/libaom/src/av1/common/av1_rtcd_defs.pl @@ -33,21 +33,46 @@ struct txfm_param; struct aom_variance_vtable; struct search_site_config; struct yv12_buffer_config; +struct NN_CONFIG; +typedef struct NN_CONFIG NN_CONFIG; + +enum { NONE, RELU, SOFTSIGN, SIGMOID } UENUM1BYTE(ACTIVATION); +#if CONFIG_NN_V2 +enum { SOFTMAX_CROSS_ENTROPY } UENUM1BYTE(LOSS); +struct NN_CONFIG_V2; +typedef struct NN_CONFIG_V2 NN_CONFIG_V2; +struct FC_LAYER; 
+typedef struct FC_LAYER FC_LAYER; +#endif // CONFIG_NN_V2 + +struct CNN_CONFIG; +typedef struct CNN_CONFIG CNN_CONFIG; +struct CNN_LAYER_CONFIG; +typedef struct CNN_LAYER_CONFIG CNN_LAYER_CONFIG; +struct CNN_THREAD_DATA; +typedef struct CNN_THREAD_DATA CNN_THREAD_DATA; +struct CNN_BRANCH_CONFIG; +typedef struct CNN_BRANCH_CONFIG CNN_BRANCH_CONFIG; +struct CNN_MULTI_OUT; +typedef struct CNN_MULTI_OUT CNN_MULTI_OUT; /* Function pointers return by CfL functions */ typedef void (*cfl_subsample_lbd_fn)(const uint8_t *input, int input_stride, uint16_t *output_q3); +#if CONFIG_AV1_HIGHBITDEPTH typedef void (*cfl_subsample_hbd_fn)(const uint16_t *input, int input_stride, uint16_t *output_q3); +typedef void (*cfl_predict_hbd_fn)(const int16_t *src, uint16_t *dst, + int dst_stride, int alpha_q3, int bd); +#endif + typedef void (*cfl_subtract_average_fn)(const uint16_t *src, int16_t *dst); typedef void (*cfl_predict_lbd_fn)(const int16_t *src, uint8_t *dst, int dst_stride, int alpha_q3); -typedef void (*cfl_predict_hbd_fn)(const int16_t *src, uint16_t *dst, - int dst_stride, int alpha_q3, int bd); EOF } forward_decls qw/av1_common_forward_decls/; @@ -65,22 +90,24 @@ if ($opts{arch} eq "x86_64") { add_proto qw/void av1_convolve_horiz_rs/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn"; specialize qw/av1_convolve_horiz_rs sse4_1/; -add_proto qw/void av1_highbd_convolve_horiz_rs/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn, int bd"; -specialize qw/av1_highbd_convolve_horiz_rs sse4_1/; - -add_proto qw/void av1_wiener_convolve_add_src/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params"; +if(aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { + add_proto 
qw/void av1_highbd_convolve_horiz_rs/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn, int bd"; + specialize qw/av1_highbd_convolve_horiz_rs sse4_1/; -add_proto qw/void av1_highbd_wiener_convolve_add_src/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params, int bps"; + add_proto qw/void av1_highbd_wiener_convolve_add_src/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params, int bd"; + specialize qw/av1_highbd_wiener_convolve_add_src ssse3 avx2/; +} +add_proto qw/void av1_wiener_convolve_add_src/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params"; specialize qw/av1_wiener_convolve_add_src sse2 avx2 neon/; -specialize qw/av1_highbd_wiener_convolve_add_src ssse3/; -specialize qw/av1_highbd_wiener_convolve_add_src avx2/; - # directional intra predictor functions add_proto qw/void av1_dr_prediction_z1/, "uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_above, int dx, int dy"; +specialize qw/av1_dr_prediction_z1 avx2/; add_proto qw/void av1_dr_prediction_z2/, "uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_above, int upsample_left, int dx, int dy"; +specialize qw/av1_dr_prediction_z2 avx2/; add_proto qw/void av1_dr_prediction_z3/, "uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_left, int dx, int dy"; +specialize qw/av1_dr_prediction_z3 avx2/; # FILTER_INTRA 
predictor functions add_proto qw/void av1_filter_intra_predictor/, "uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint8_t *above, const uint8_t *left, int mode"; @@ -108,21 +135,21 @@ specialize qw/av1_highbd_convolve8_vert/, "$sse2_x86_64"; add_proto qw/void av1_inv_txfm_add/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param"; specialize qw/av1_inv_txfm_add ssse3 avx2 neon/; -add_proto qw/void av1_highbd_inv_txfm_add/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param"; +add_proto qw/void av1_highbd_inv_txfm_add/, "const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param"; specialize qw/av1_highbd_inv_txfm_add sse4_1 avx2/; -add_proto qw/void av1_highbd_inv_txfm_add_4x4/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param"; +add_proto qw/void av1_highbd_inv_txfm_add_4x4/, "const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param"; specialize qw/av1_highbd_inv_txfm_add_4x4 sse4_1/; -add_proto qw/void av1_highbd_inv_txfm_add_8x8/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param"; +add_proto qw/void av1_highbd_inv_txfm_add_8x8/, "const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param"; specialize qw/av1_highbd_inv_txfm_add_8x8 sse4_1/; -add_proto qw/void av1_highbd_inv_txfm_add_16x8/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param"; -specialize qw/av1_highbd_inv_txfm_add_16x8 sse4_1/; -add_proto qw/void av1_highbd_inv_txfm_add_8x16/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param"; -specialize qw/av1_highbd_inv_txfm_add_8x16 sse4_1/; -add_proto qw/void av1_highbd_inv_txfm_add_16x16/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param"; -specialize qw/av1_highbd_inv_txfm_add_16x16 sse4_1/; -add_proto qw/void av1_highbd_inv_txfm_add_32x32/, "const tran_low_t 
*dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param"; -specialize qw/av1_highbd_inv_txfm_add_32x32 sse4_1 avx2/; +add_proto qw/void av1_highbd_inv_txfm_add_4x8/, "const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param"; +specialize qw/av1_highbd_inv_txfm_add_4x8 sse4_1/; +add_proto qw/void av1_highbd_inv_txfm_add_8x4/, "const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param"; +specialize qw/av1_highbd_inv_txfm_add_8x4 sse4_1/; +add_proto qw/void av1_highbd_inv_txfm_add_4x16/, "const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param"; +specialize qw/av1_highbd_inv_txfm_add_4x16 sse4_1/; +add_proto qw/void av1_highbd_inv_txfm_add_16x4/, "const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param"; +specialize qw/av1_highbd_inv_txfm_add_16x4 sse4_1/; add_proto qw/void av1_highbd_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; add_proto qw/void av1_highbd_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; @@ -151,10 +178,15 @@ add_proto qw/void av1_inv_txfm2d_add_16x4/, "const int32_t *input, uint16_t *out add_proto qw/void av1_inv_txfm2d_add_8x32/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd"; add_proto qw/void av1_inv_txfm2d_add_32x8/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd"; -# directional intra predictor functions -add_proto qw/void av1_highbd_dr_prediction_z1/, "uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_above, int dx, int dy, int bd"; -add_proto qw/void av1_highbd_dr_prediction_z2/, "uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_above, int upsample_left, int dx, int dy, int bd"; -add_proto qw/void av1_highbd_dr_prediction_z3/, "uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, 
const uint16_t *left, int upsample_left, int dx, int dy, int bd"; +if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { + # directional intra predictor functions + add_proto qw/void av1_highbd_dr_prediction_z1/, "uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_above, int dx, int dy, int bd"; + specialize qw/av1_highbd_dr_prediction_z1 avx2/; + add_proto qw/void av1_highbd_dr_prediction_z2/, "uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_above, int upsample_left, int dx, int dy, int bd"; + specialize qw/av1_highbd_dr_prediction_z2 avx2/; + add_proto qw/void av1_highbd_dr_prediction_z3/, "uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_left, int dx, int dy, int bd"; + specialize qw/av1_highbd_dr_prediction_z3 avx2/; +} # build compound seg mask functions add_proto qw/void av1_build_compound_diffwtd_mask/, "uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w"; @@ -166,6 +198,10 @@ specialize qw/av1_build_compound_diffwtd_mask_highbd ssse3 avx2/; add_proto qw/void av1_build_compound_diffwtd_mask_d16/, "uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0, int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, ConvolveParams *conv_params, int bd"; specialize qw/av1_build_compound_diffwtd_mask_d16 sse4_1 avx2 neon/; +# Helper functions. +add_proto qw/void av1_round_shift_array/, "int32_t *arr, int size, int bit"; +specialize "av1_round_shift_array", qw/sse4_1 neon/; + # # Encoder functions below this point. # @@ -176,10 +212,17 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { # the transform coefficients are held in 32-bit # values, so the assembler code for av1_block_error can no longer be used. 
add_proto qw/int64_t av1_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz"; - specialize qw/av1_block_error avx2/; + specialize qw/av1_block_error sse2 avx2 neon/; + + add_proto qw/int64_t av1_block_error_lp/, "const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size"; + specialize qw/av1_block_error_lp avx2 neon/; add_proto qw/void av1_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; - specialize qw/av1_quantize_fp sse2 avx2/; + specialize qw/av1_quantize_fp sse2 avx2 neon/; + + add_proto qw/void av1_quantize_lp/, "const int16_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan"; + specialize qw/av1_quantize_lp avx2 neon/; + add_proto qw/void av1_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/av1_quantize_fp_32x32 avx2/; @@ -196,54 +239,71 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { specialize qw/av1_lowbd_fwd_txfm sse2 sse4_1 avx2/; add_proto qw/void av1_fwd_txfm2d_4x8/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd"; + specialize qw/av1_fwd_txfm2d_4x8 sse4_1/; add_proto qw/void av1_fwd_txfm2d_8x4/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd"; + specialize qw/av1_fwd_txfm2d_8x4 sse4_1/; add_proto qw/void av1_fwd_txfm2d_8x16/, "const int16_t *input, int32_t 
*output, int stride, TX_TYPE tx_type, int bd"; - specialize qw/av1_fwd_txfm2d_8x16 sse4_1/; + specialize qw/av1_fwd_txfm2d_8x16 sse4_1 avx2/; add_proto qw/void av1_fwd_txfm2d_16x8/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd"; - specialize qw/av1_fwd_txfm2d_16x8 sse4_1/; + specialize qw/av1_fwd_txfm2d_16x8 sse4_1 avx2/; add_proto qw/void av1_fwd_txfm2d_16x32/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd"; + specialize qw/av1_fwd_txfm2d_16x32 sse4_1/; add_proto qw/void av1_fwd_txfm2d_32x16/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd"; + specialize qw/av1_fwd_txfm2d_32x16 sse4_1/; add_proto qw/void av1_fwd_txfm2d_4x16/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd"; + specialize qw/av1_fwd_txfm2d_4x16 sse4_1/; add_proto qw/void av1_fwd_txfm2d_16x4/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd"; + specialize qw/av1_fwd_txfm2d_16x4 sse4_1/; add_proto qw/void av1_fwd_txfm2d_8x32/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd"; + specialize qw/av1_fwd_txfm2d_8x32 sse4_1/; add_proto qw/void av1_fwd_txfm2d_32x8/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd"; + specialize qw/av1_fwd_txfm2d_32x8 sse4_1/; add_proto qw/void av1_fwd_txfm2d_4x4/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd"; specialize qw/av1_fwd_txfm2d_4x4 sse4_1/; add_proto qw/void av1_fwd_txfm2d_8x8/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd"; - specialize qw/av1_fwd_txfm2d_8x8 sse4_1/; + specialize qw/av1_fwd_txfm2d_8x8 sse4_1 avx2/; add_proto qw/void av1_fwd_txfm2d_16x16/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd"; - specialize qw/av1_fwd_txfm2d_16x16 sse4_1/; + specialize qw/av1_fwd_txfm2d_16x16 sse4_1 avx2/; add_proto qw/void av1_fwd_txfm2d_32x32/, "const int16_t *input, int32_t 
*output, int stride, TX_TYPE tx_type, int bd"; - specialize qw/av1_fwd_txfm2d_32x32 sse4_1/; + specialize qw/av1_fwd_txfm2d_32x32 sse4_1 avx2/; add_proto qw/void av1_fwd_txfm2d_64x64/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd"; - specialize qw/av1_fwd_txfm2d_64x64 sse4_1/; + specialize qw/av1_fwd_txfm2d_64x64 sse4_1 avx2/; add_proto qw/void av1_fwd_txfm2d_32x64/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd"; + specialize qw/av1_fwd_txfm2d_32x64 sse4_1/; add_proto qw/void av1_fwd_txfm2d_64x32/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd"; + specialize qw/av1_fwd_txfm2d_64x32 sse4_1/; add_proto qw/void av1_fwd_txfm2d_16x64/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd"; + specialize qw/av1_fwd_txfm2d_16x64 sse4_1/; add_proto qw/void av1_fwd_txfm2d_64x16/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd"; + specialize qw/av1_fwd_txfm2d_64x16 sse4_1/; # # Motion search # - add_proto qw/int av1_diamond_search_sad/, "struct macroblock *x, const struct search_site_config *cfg, MV *ref_mv, MV *best_mv, int search_param, int sad_per_bit, int *num00, const struct aom_variance_vtable *fn_ptr, const MV *center_mv"; - add_proto qw/int av1_full_range_search/, "const struct macroblock *x, const struct search_site_config *cfg, MV *ref_mv, MV *best_mv, int search_param, int sad_per_bit, int *num00, const struct aom_variance_vtable *fn_ptr, const MV *center_mv"; - add_proto qw/void av1_temporal_filter_apply/, "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count"; - specialize qw/av1_temporal_filter_apply sse2 msa/; + if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") { + add_proto qw/void av1_apply_temporal_filter_yuv/, "const struct yv12_buffer_config *ref_frame, const struct macroblockd *mbd, 
const BLOCK_SIZE block_size, const int mb_row, const int mb_col, const int num_planes, const int strength, const int use_subblock, const int *subblock_filter_weights, const uint8_t *pred, uint32_t *accum, uint16_t *count"; + specialize qw/av1_apply_temporal_filter_yuv sse4_1/; + } + if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") { + add_proto qw/void av1_apply_temporal_filter_planewise/, "const struct yv12_buffer_config *ref_frame, const struct macroblockd *mbd, const BLOCK_SIZE block_size, const int mb_row, const int mb_col, const int num_planes, const double *noise_levels, const int use_subblock, const int block_mse, const int *subblock_mses, const int q_factor, const uint8_t *pred, uint32_t *accum, uint16_t *count"; + specialize qw/av1_apply_temporal_filter_planewise sse2 avx2/; + } add_proto qw/void av1_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr, int log_scale"; # ENCODEMB INVOKE + if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { + add_proto qw/int64_t av1_highbd_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd"; + specialize qw/av1_highbd_block_error sse2 avx2/; + } - add_proto qw/int64_t av1_highbd_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd"; - specialize qw/av1_highbd_block_error sse2/; - - add_proto qw/void av1_highbd_temporal_filter_apply/, "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count"; - - add_proto qw/void av1_highbd_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t 
n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, int log_scale"; - specialize qw/av1_highbd_quantize_fp sse4_1 avx2/; + if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { + add_proto qw/void av1_highbd_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, int log_scale"; + specialize qw/av1_highbd_quantize_fp sse4_1 avx2/; + } add_proto qw/void av1_highbd_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride"; @@ -257,30 +317,57 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { add_proto qw/uint64_t av1_wedge_sse_from_residuals/, "const int16_t *r1, const int16_t *d, const uint8_t *m, int N"; specialize qw/av1_wedge_sse_from_residuals sse2 avx2/; - add_proto qw/int av1_wedge_sign_from_residuals/, "const int16_t *ds, const uint8_t *m, int N, int64_t limit"; + add_proto qw/int8_t av1_wedge_sign_from_residuals/, "const int16_t *ds, const uint8_t *m, int N, int64_t limit"; specialize qw/av1_wedge_sign_from_residuals sse2 avx2/; add_proto qw/void av1_wedge_compute_delta_squares/, "int16_t *d, const int16_t *a, const int16_t *b, int N"; specialize qw/av1_wedge_compute_delta_squares sse2 avx2/; # hash - add_proto qw/uint32_t av1_get_crc32c_value/, "void *crc_calculator, uint8_t *p, int length"; + add_proto qw/uint32_t av1_get_crc32c_value/, "void *crc_calculator, uint8_t *p, size_t length"; specialize qw/av1_get_crc32c_value sse4_2/; - add_proto qw/void av1_compute_stats/, "int wiener_win, const uint8_t *dgd8, const uint8_t *src8, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int 
src_stride, double *M, double *H"; + add_proto qw/void av1_compute_stats/, "int wiener_win, const uint8_t *dgd8, const uint8_t *src8, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H"; specialize qw/av1_compute_stats sse4_1 avx2/; + if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { + add_proto qw/void av1_compute_stats_highbd/, "int wiener_win, const uint8_t *dgd8, const uint8_t *src8, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H, aom_bit_depth_t bit_depth"; + specialize qw/av1_compute_stats_highbd sse4_1 avx2/; + } + + add_proto qw/void av1_calc_proj_params/, " const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2], const sgr_params_type *params"; + specialize qw/av1_calc_proj_params avx2/; + add_proto qw/int64_t av1_lowbd_pixel_proj_error/, " const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params"; specialize qw/av1_lowbd_pixel_proj_error sse4_1 avx2/; + + if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { + add_proto qw/int64_t av1_highbd_pixel_proj_error/, " const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params"; + specialize qw/av1_highbd_pixel_proj_error sse4_1 avx2/; + } + add_proto qw/void av1_get_horver_correlation_full/, " const int16_t *diff, int stride, int w, int h, float *hcorr, float *vcorr"; + specialize qw/av1_get_horver_correlation_full sse4_1 avx2/; + + add_proto qw/void av1_nn_predict/, " const float *input_nodes, const NN_CONFIG *const nn_config, int reduce_prec, float *const output"; + specialize 
qw/av1_nn_predict sse3/; } # end encoder functions +# CNN functions + +add_proto qw/void av1_cnn_activate/, " float **input, int channels, int width, int height, int stride, ACTIVATION layer_activation"; +add_proto qw/void av1_cnn_add/, " float **input, int channels, int width, int height, int stride, const float **add"; +add_proto qw/void av1_cnn_predict/, " const float **input, int in_width, int in_height, int in_stride, const CNN_CONFIG *cnn_config, const CNN_THREAD_DATA *thread_data, CNN_MULTI_OUT *output_struct"; +add_proto qw/void av1_cnn_convolve/, " const float **input, int in_width, int in_height, int in_stride, const CNN_LAYER_CONFIG *layer_config, float **output, int out_stride, int start_idx, int step"; +add_proto qw/void av1_cnn_deconvolve/, " const float **input, int in_width, int in_height, int in_stride, const CNN_LAYER_CONFIG *layer_config, float **output, int out_stride"; +add_proto qw/void av1_cnn_batchnorm/, "float **image, int channels, int width, int height, int stride, const float *gamma, const float *beta, const float *mean, const float *std"; + # Deringing Functions add_proto qw/int cdef_find_dir/, "const uint16_t *img, int stride, int32_t *var, int coeff_shift"; -add_proto qw/void cdef_filter_block/, "uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int max, int coeff_shift"; +add_proto qw/void cdef_filter_block/, "uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int coeff_shift"; -add_proto qw/void copy_rect8_8bit_to_16bit/, "uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h"; -add_proto qw/void copy_rect8_16bit_to_16bit/, "uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h"; +add_proto qw/void cdef_copy_rect8_8bit_to_16bit/, "uint16_t *dst, int dstride, const uint8_t *src, int 
sstride, int v, int h"; +add_proto qw/void cdef_copy_rect8_16bit_to_16bit/, "uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h"; # VS compiling for 32 bit targets does not support vector types in # structs as arguments, which makes the v256 type of the intrinsics @@ -288,27 +375,32 @@ add_proto qw/void copy_rect8_16bit_to_16bit/, "uint16_t *dst, int dstride, const if ($opts{config} !~ /libs-x86-win32-vs.*/) { specialize qw/cdef_find_dir sse2 ssse3 sse4_1 avx2 neon/; specialize qw/cdef_filter_block sse2 ssse3 sse4_1 avx2 neon/; - specialize qw/copy_rect8_8bit_to_16bit sse2 ssse3 sse4_1 avx2 neon/; - specialize qw/copy_rect8_16bit_to_16bit sse2 ssse3 sse4_1 avx2 neon/; + specialize qw/cdef_copy_rect8_8bit_to_16bit sse2 ssse3 sse4_1 avx2 neon/; + specialize qw/cdef_copy_rect8_16bit_to_16bit sse2 ssse3 sse4_1 avx2 neon/; } # WARPED_MOTION / GLOBAL_MOTION functions add_proto qw/void av1_warp_affine/, "const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta"; -specialize qw/av1_warp_affine sse4_1 neon/; +specialize qw/av1_warp_affine sse4_1 avx2 neon/; + +if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { + add_proto qw/void av1_highbd_warp_affine/, "const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta"; + specialize qw/av1_highbd_warp_affine sse4_1/; +} -add_proto qw/void av1_highbd_warp_affine/, "const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, 
ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta"; -specialize qw/av1_highbd_warp_affine sse4_1/; +add_proto qw/int64_t av1_calc_frame_error/, "const uint8_t *const ref, int stride, const uint8_t *const dst, int p_width, int p_height, int p_stride"; +specialize qw/av1_calc_frame_error sse2 avx2/; if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { - add_proto qw/double compute_cross_correlation/, "unsigned char *im1, int stride1, int x1, int y1, unsigned char *im2, int stride2, int x2, int y2"; - specialize qw/compute_cross_correlation sse4_1/; + add_proto qw/double av1_compute_cross_correlation/, "unsigned char *im1, int stride1, int x1, int y1, unsigned char *im2, int stride2, int x2, int y2"; + specialize qw/av1_compute_cross_correlation sse4_1 avx2/; } # LOOP_RESTORATION functions -add_proto qw/void apply_selfguided_restoration/, "const uint8_t *dat, int width, int height, int stride, int eps, const int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf, int bit_depth, int highbd"; -specialize qw/apply_selfguided_restoration sse4_1 avx2 neon/; +add_proto qw/void av1_apply_selfguided_restoration/, "const uint8_t *dat, int width, int height, int stride, int eps, const int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf, int bit_depth, int highbd"; +specialize qw/av1_apply_selfguided_restoration sse4_1 avx2 neon/; add_proto qw/int av1_selfguided_restoration/, "const uint8_t *dgd8, int width, int height, int dgd_stride, int32_t *flt0, int32_t *flt1, int flt_stride, @@ -317,44 +409,48 @@ specialize qw/av1_selfguided_restoration sse4_1 avx2 neon/; # CONVOLVE_ROUND/COMPOUND_ROUND functions -add_proto qw/void av1_convolve_2d_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params"; -add_proto qw/void av1_convolve_2d_copy_sr/, "const uint8_t 
*src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params"; -add_proto qw/void av1_convolve_x_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params"; -add_proto qw/void av1_convolve_y_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params"; -add_proto qw/void av1_jnt_convolve_2d/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params"; -add_proto qw/void av1_jnt_convolve_2d_copy/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params"; -add_proto qw/void av1_jnt_convolve_x/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params"; -add_proto qw/void av1_jnt_convolve_y/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params"; -add_proto qw/void av1_highbd_convolve_2d_copy_sr/, "const uint16_t 
*src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd"; -add_proto qw/void av1_highbd_convolve_2d_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd"; -add_proto qw/void av1_highbd_convolve_x_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd"; -add_proto qw/void av1_highbd_convolve_y_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd"; -add_proto qw/void av1_highbd_jnt_convolve_2d/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd"; -add_proto qw/void av1_highbd_jnt_convolve_x/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd"; -add_proto qw/void av1_highbd_jnt_convolve_y/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, 
ConvolveParams *conv_params, int bd"; -add_proto qw/void av1_highbd_jnt_convolve_2d_copy/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd"; - - add_proto qw/void av1_convolve_2d_scale/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params"; - add_proto qw/void av1_highbd_convolve_2d_scale/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd"; +add_proto qw/void av1_convolve_2d_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params"; +add_proto qw/void av1_convolve_2d_copy_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params"; +add_proto qw/void av1_convolve_x_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params"; +add_proto qw/void av1_convolve_y_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const 
InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params"; +add_proto qw/void av1_dist_wtd_convolve_2d/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params"; +add_proto qw/void av1_dist_wtd_convolve_2d_copy/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params"; +add_proto qw/void av1_dist_wtd_convolve_x/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params"; +add_proto qw/void av1_dist_wtd_convolve_y/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params"; +if(aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { + add_proto qw/void av1_highbd_convolve_2d_copy_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd"; + add_proto qw/void av1_highbd_convolve_2d_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd"; + add_proto qw/void 
av1_highbd_convolve_x_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd"; + add_proto qw/void av1_highbd_convolve_y_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd"; + add_proto qw/void av1_highbd_dist_wtd_convolve_2d/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd"; + add_proto qw/void av1_highbd_dist_wtd_convolve_x/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd"; + add_proto qw/void av1_highbd_dist_wtd_convolve_y/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd"; + add_proto qw/void av1_highbd_dist_wtd_convolve_2d_copy/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd"; + add_proto qw/void av1_highbd_convolve_2d_scale/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const 
InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_qn, const int y_step_qn, ConvolveParams *conv_params, int bd"; +} + + add_proto qw/void av1_convolve_2d_scale/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_qn, const int y_step_qn, ConvolveParams *conv_params"; specialize qw/av1_convolve_2d_sr sse2 avx2 neon/; specialize qw/av1_convolve_2d_copy_sr sse2 avx2 neon/; specialize qw/av1_convolve_x_sr sse2 avx2 neon/; specialize qw/av1_convolve_y_sr sse2 avx2 neon/; specialize qw/av1_convolve_2d_scale sse4_1/; - specialize qw/av1_jnt_convolve_2d ssse3 avx2 neon/; - specialize qw/av1_jnt_convolve_2d_copy sse2 avx2 neon/; - specialize qw/av1_jnt_convolve_x sse2 avx2 neon/; - specialize qw/av1_jnt_convolve_y sse2 avx2 neon/; - specialize qw/av1_highbd_convolve_2d_copy_sr sse2 avx2/; - specialize qw/av1_highbd_convolve_2d_sr ssse3 avx2/; - specialize qw/av1_highbd_convolve_x_sr ssse3 avx2/; - specialize qw/av1_highbd_convolve_y_sr ssse3 avx2/; - specialize qw/av1_highbd_convolve_2d_scale sse4_1/; - specialize qw/av1_highbd_jnt_convolve_2d sse4_1 avx2/; - specialize qw/av1_highbd_jnt_convolve_x sse4_1 avx2/; - specialize qw/av1_highbd_jnt_convolve_y sse4_1 avx2/; - specialize qw/av1_highbd_jnt_convolve_2d_copy sse4_1 avx2/; + specialize qw/av1_dist_wtd_convolve_2d sse2 ssse3 avx2 neon/; + specialize qw/av1_dist_wtd_convolve_2d_copy sse2 avx2 neon/; + specialize qw/av1_dist_wtd_convolve_x sse2 avx2 neon/; + specialize qw/av1_dist_wtd_convolve_y sse2 avx2 neon/; + if(aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { + specialize qw/av1_highbd_dist_wtd_convolve_2d sse4_1 avx2/; + specialize qw/av1_highbd_dist_wtd_convolve_x sse4_1 avx2/; + specialize qw/av1_highbd_dist_wtd_convolve_y sse4_1 avx2/; + specialize 
qw/av1_highbd_dist_wtd_convolve_2d_copy sse4_1 avx2/; + specialize qw/av1_highbd_convolve_2d_copy_sr sse2 avx2/; + specialize qw/av1_highbd_convolve_2d_sr ssse3 avx2/; + specialize qw/av1_highbd_convolve_x_sr ssse3 avx2/; + specialize qw/av1_highbd_convolve_y_sr ssse3 avx2/; + specialize qw/av1_highbd_convolve_2d_scale sse4_1/; + } # INTRA_EDGE functions add_proto qw/void av1_filter_intra_edge/, "uint8_t *p, int sz, int strength"; @@ -368,8 +464,8 @@ add_proto qw/void av1_upsample_intra_edge_high/, "uint16_t *p, int sz, int bd"; specialize qw/av1_upsample_intra_edge_high sse4_1/; # CFL -add_proto qw/cfl_subtract_average_fn get_subtract_average_fn/, "TX_SIZE tx_size"; -specialize qw/get_subtract_average_fn sse2 avx2 neon vsx/; +add_proto qw/cfl_subtract_average_fn cfl_get_subtract_average_fn/, "TX_SIZE tx_size"; +specialize qw/cfl_get_subtract_average_fn sse2 avx2 neon vsx/; add_proto qw/cfl_subsample_lbd_fn cfl_get_luma_subsampling_420_lbd/, "TX_SIZE tx_size"; specialize qw/cfl_get_luma_subsampling_420_lbd ssse3 avx2 neon/; @@ -380,19 +476,21 @@ specialize qw/cfl_get_luma_subsampling_422_lbd ssse3 avx2 neon/; add_proto qw/cfl_subsample_lbd_fn cfl_get_luma_subsampling_444_lbd/, "TX_SIZE tx_size"; specialize qw/cfl_get_luma_subsampling_444_lbd ssse3 avx2 neon/; -add_proto qw/cfl_subsample_hbd_fn cfl_get_luma_subsampling_420_hbd/, "TX_SIZE tx_size"; -specialize qw/cfl_get_luma_subsampling_420_hbd ssse3 avx2 neon/; +if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { + add_proto qw/cfl_subsample_hbd_fn cfl_get_luma_subsampling_420_hbd/, "TX_SIZE tx_size"; + specialize qw/cfl_get_luma_subsampling_420_hbd ssse3 avx2 neon/; -add_proto qw/cfl_subsample_hbd_fn cfl_get_luma_subsampling_422_hbd/, "TX_SIZE tx_size"; -specialize qw/cfl_get_luma_subsampling_422_hbd ssse3 avx2 neon/; + add_proto qw/cfl_subsample_hbd_fn cfl_get_luma_subsampling_422_hbd/, "TX_SIZE tx_size"; + specialize qw/cfl_get_luma_subsampling_422_hbd ssse3 avx2 neon/; -add_proto qw/cfl_subsample_hbd_fn 
cfl_get_luma_subsampling_444_hbd/, "TX_SIZE tx_size"; -specialize qw/cfl_get_luma_subsampling_444_hbd ssse3 avx2 neon/; + add_proto qw/cfl_subsample_hbd_fn cfl_get_luma_subsampling_444_hbd/, "TX_SIZE tx_size"; + specialize qw/cfl_get_luma_subsampling_444_hbd ssse3 avx2 neon/; -add_proto qw/cfl_predict_lbd_fn get_predict_lbd_fn/, "TX_SIZE tx_size"; -specialize qw/get_predict_lbd_fn ssse3 avx2 neon/; + add_proto qw/cfl_predict_hbd_fn cfl_get_predict_hbd_fn/, "TX_SIZE tx_size"; + specialize qw/cfl_get_predict_hbd_fn ssse3 avx2 neon/; +} -add_proto qw/cfl_predict_hbd_fn get_predict_hbd_fn/, "TX_SIZE tx_size"; -specialize qw/get_predict_hbd_fn ssse3 avx2 neon/; +add_proto qw/cfl_predict_lbd_fn cfl_get_predict_lbd_fn/, "TX_SIZE tx_size"; +specialize qw/cfl_get_predict_lbd_fn ssse3 avx2 neon/; 1; diff --git a/media/libaom/src/av1/common/av1_txfm.c b/media/libaom/src/av1/common/av1_txfm.c index bb70eab70..ac43402f4 100644 --- a/media/libaom/src/av1/common/av1_txfm.c +++ b/media/libaom/src/av1/common/av1_txfm.c @@ -10,10 +10,11 @@ */ #include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" #include "av1/common/av1_txfm.h" -// av1_cospi_arr[i][j] = (int)round(cos(M_PI*j/128) * (1<<(cos_bit_min+i))); +// av1_cospi_arr[i][j] = (int)round(cos(PI*j/128) * (1<<(cos_bit_min+i))); const int32_t av1_cospi_arr_data[7][64] = { { 1024, 1024, 1023, 1021, 1019, 1016, 1013, 1009, 1004, 999, 993, 987, 980, 972, 964, 955, 946, 936, 926, 915, 903, 891, 878, 865, 851, 837, diff --git a/media/libaom/src/av1/common/av1_txfm.h b/media/libaom/src/av1/common/av1_txfm.h index 59d64ca4a..20049b680 100644 --- a/media/libaom/src/av1/common/av1_txfm.h +++ b/media/libaom/src/av1/common/av1_txfm.h @@ -59,7 +59,9 @@ static INLINE int32_t range_check_value(int32_t value, int8_t bit) { const int64_t min_value = -(1LL << (bit - 1)); if (value < min_value || value > max_value) { fprintf(stderr, "coeff out of bit range, value: %d bit %d\n", value, bit); +#if !CONFIG_AV1_ENCODER assert(0); +#endif } 
#endif // CONFIG_COEFFICIENT_RANGE_CHECKING #if DO_RANGE_CHECK_CLAMP @@ -110,7 +112,7 @@ typedef void (*TxfmFunc)(const int32_t *input, int32_t *output, int8_t cos_bit, typedef void (*FwdTxfm2dFunc)(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd); -typedef enum TXFM_TYPE { +enum { TXFM_TYPE_DCT4, TXFM_TYPE_DCT8, TXFM_TYPE_DCT16, @@ -125,7 +127,7 @@ typedef enum TXFM_TYPE { TXFM_TYPE_IDENTITY32, TXFM_TYPES, TXFM_TYPE_INVALID, -} TXFM_TYPE; +} UENUM1BYTE(TXFM_TYPE); typedef struct TXFM_2D_FLIP_CFG { TX_SIZE tx_size; diff --git a/media/libaom/src/av1/common/blockd.c b/media/libaom/src/av1/common/blockd.c index 2e796b656..00725ea2d 100644 --- a/media/libaom/src/av1/common/blockd.c +++ b/media/libaom/src/av1/common/blockd.c @@ -13,8 +13,8 @@ #include "aom_ports/system_state.h" +#include "av1/common/av1_common_int.h" #include "av1/common/blockd.h" -#include "av1/common/onyxc_int.h" PREDICTION_MODE av1_left_block_mode(const MB_MODE_INFO *left_mi) { if (!left_mi) return DC_PRED; @@ -28,11 +28,12 @@ PREDICTION_MODE av1_above_block_mode(const MB_MODE_INFO *above_mi) { return above_mi->mode; } -void av1_set_contexts(const MACROBLOCKD *xd, struct macroblockd_plane *pd, - int plane, BLOCK_SIZE plane_bsize, TX_SIZE tx_size, - int has_eob, int aoff, int loff) { - ENTROPY_CONTEXT *const a = pd->above_context + aoff; - ENTROPY_CONTEXT *const l = pd->left_context + loff; +void av1_set_entropy_contexts(const MACROBLOCKD *xd, + struct macroblockd_plane *pd, int plane, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, + int has_eob, int aoff, int loff) { + ENTROPY_CONTEXT *const a = pd->above_entropy_context + aoff; + ENTROPY_CONTEXT *const l = pd->left_entropy_context + loff; const int txs_wide = tx_size_wide_unit[tx_size]; const int txs_high = tx_size_high_unit[tx_size]; @@ -56,23 +57,18 @@ void av1_set_contexts(const MACROBLOCKD *xd, struct macroblockd_plane *pd, memset(l, has_eob, sizeof(*l) * txs_high); } } -void av1_reset_skip_context(MACROBLOCKD *xd, int 
mi_row, int mi_col, - BLOCK_SIZE bsize, const int num_planes) { - int i; - int nplanes; - int chroma_ref; - chroma_ref = - is_chroma_reference(mi_row, mi_col, bsize, xd->plane[1].subsampling_x, - xd->plane[1].subsampling_y); - nplanes = 1 + (num_planes - 1) * chroma_ref; - for (i = 0; i < nplanes; i++) { +void av1_reset_entropy_context(MACROBLOCKD *xd, BLOCK_SIZE bsize, + const int num_planes) { + assert(bsize < BLOCK_SIZES_ALL); + const int nplanes = 1 + (num_planes - 1) * xd->is_chroma_ref; + for (int i = 0; i < nplanes; i++) { struct macroblockd_plane *const pd = &xd->plane[i]; const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y); - const int txs_wide = block_size_wide[plane_bsize] >> tx_size_wide_log2[0]; - const int txs_high = block_size_high[plane_bsize] >> tx_size_high_log2[0]; - memset(pd->above_context, 0, sizeof(ENTROPY_CONTEXT) * txs_wide); - memset(pd->left_context, 0, sizeof(ENTROPY_CONTEXT) * txs_high); + const int txs_wide = mi_size_wide[plane_bsize]; + const int txs_high = mi_size_high[plane_bsize]; + memset(pd->above_entropy_context, 0, sizeof(ENTROPY_CONTEXT) * txs_wide); + memset(pd->left_entropy_context, 0, sizeof(ENTROPY_CONTEXT) * txs_high); } } @@ -104,37 +100,3 @@ void av1_setup_block_planes(MACROBLOCKD *xd, int ss_x, int ss_y, xd->plane[i].subsampling_y = 1; } } - -const int16_t dr_intra_derivative[90] = { - // More evenly spread out angles and limited to 10-bit - // Values that are 0 will never be used - // Approx angle - 0, 0, 0, // - 1023, 0, 0, // 3, ... - 547, 0, 0, // 6, ... - 372, 0, 0, 0, 0, // 9, ... - 273, 0, 0, // 14, ... - 215, 0, 0, // 17, ... - 178, 0, 0, // 20, ... - 151, 0, 0, // 23, ... (113 & 203 are base angles) - 132, 0, 0, // 26, ... - 116, 0, 0, // 29, ... - 102, 0, 0, 0, // 32, ... - 90, 0, 0, // 36, ... - 80, 0, 0, // 39, ... - 71, 0, 0, // 42, ... - 64, 0, 0, // 45, ... (45 & 135 are base angles) - 57, 0, 0, // 48, ... - 51, 0, 0, // 51, ... - 45, 0, 0, 0, // 54, ... 
- 40, 0, 0, // 58, ... - 35, 0, 0, // 61, ... - 31, 0, 0, // 64, ... - 27, 0, 0, // 67, ... (67 & 157 are base angles) - 23, 0, 0, // 70, ... - 19, 0, 0, // 73, ... - 15, 0, 0, 0, 0, // 76, ... - 11, 0, 0, // 81, ... - 7, 0, 0, // 84, ... - 3, 0, 0, // 87, ... -}; diff --git a/media/libaom/src/av1/common/blockd.h b/media/libaom/src/av1/common/blockd.h index a2311c1b0..47597bc83 100644 --- a/media/libaom/src/av1/common/blockd.h +++ b/media/libaom/src/av1/common/blockd.h @@ -37,20 +37,22 @@ extern "C" { #define MAX_DIFFWTD_MASK_BITS 1 +#define INTERINTRA_WEDGE_SIGN 0 + // DIFFWTD_MASK_TYPES should not surpass 1 << MAX_DIFFWTD_MASK_BITS -typedef enum ATTRIBUTE_PACKED { +enum { DIFFWTD_38 = 0, DIFFWTD_38_INV, DIFFWTD_MASK_TYPES, -} DIFFWTD_MASK_TYPE; +} UENUM1BYTE(DIFFWTD_MASK_TYPE); -typedef enum ATTRIBUTE_PACKED { +enum { KEY_FRAME = 0, INTER_FRAME = 1, INTRA_ONLY_FRAME = 2, // replaces intra-only S_FRAME = 3, FRAME_TYPES, -} FRAME_TYPE; +} UENUM1BYTE(FRAME_TYPE); static INLINE int is_comp_ref_allowed(BLOCK_SIZE bsize) { return AOMMIN(block_size_wide[bsize], block_size_high[bsize]) >= 8; @@ -73,24 +75,24 @@ static INLINE int is_inter_compound_mode(PREDICTION_MODE mode) { } static INLINE PREDICTION_MODE compound_ref0_mode(PREDICTION_MODE mode) { - static PREDICTION_MODE lut[] = { - MB_MODE_COUNT, // DC_PRED - MB_MODE_COUNT, // V_PRED - MB_MODE_COUNT, // H_PRED - MB_MODE_COUNT, // D45_PRED - MB_MODE_COUNT, // D135_PRED - MB_MODE_COUNT, // D113_PRED - MB_MODE_COUNT, // D157_PRED - MB_MODE_COUNT, // D203_PRED - MB_MODE_COUNT, // D67_PRED - MB_MODE_COUNT, // SMOOTH_PRED - MB_MODE_COUNT, // SMOOTH_V_PRED - MB_MODE_COUNT, // SMOOTH_H_PRED - MB_MODE_COUNT, // PAETH_PRED - MB_MODE_COUNT, // NEARESTMV - MB_MODE_COUNT, // NEARMV - MB_MODE_COUNT, // GLOBALMV - MB_MODE_COUNT, // NEWMV + static const PREDICTION_MODE lut[] = { + DC_PRED, // DC_PRED + V_PRED, // V_PRED + H_PRED, // H_PRED + D45_PRED, // D45_PRED + D135_PRED, // D135_PRED + D113_PRED, // D113_PRED + D157_PRED, // 
D157_PRED + D203_PRED, // D203_PRED + D67_PRED, // D67_PRED + SMOOTH_PRED, // SMOOTH_PRED + SMOOTH_V_PRED, // SMOOTH_V_PRED + SMOOTH_H_PRED, // SMOOTH_H_PRED + PAETH_PRED, // PAETH_PRED + NEARESTMV, // NEARESTMV + NEARMV, // NEARMV + GLOBALMV, // GLOBALMV + NEWMV, // NEWMV NEARESTMV, // NEAREST_NEARESTMV NEARMV, // NEAR_NEARMV NEARESTMV, // NEAREST_NEWMV @@ -101,12 +103,12 @@ static INLINE PREDICTION_MODE compound_ref0_mode(PREDICTION_MODE mode) { NEWMV, // NEW_NEWMV }; assert(NELEMENTS(lut) == MB_MODE_COUNT); - assert(is_inter_compound_mode(mode)); + assert(is_inter_compound_mode(mode) || is_inter_singleref_mode(mode)); return lut[mode]; } static INLINE PREDICTION_MODE compound_ref1_mode(PREDICTION_MODE mode) { - static PREDICTION_MODE lut[] = { + static const PREDICTION_MODE lut[] = { MB_MODE_COUNT, // DC_PRED MB_MODE_COUNT, // V_PRED MB_MODE_COUNT, // H_PRED @@ -156,18 +158,16 @@ static INLINE int is_masked_compound_type(COMPOUND_TYPE type) { modes for the Y blocks to the left and above us; for interframes, there is a single probability table. 
*/ -typedef int8_t MV_REFERENCE_FRAME; - typedef struct { - // Number of base colors for Y (0) and UV (1) - uint8_t palette_size[2]; // Value of base colors for Y, U, and V uint16_t palette_colors[3 * PALETTE_MAX_SIZE]; + // Number of base colors for Y (0) and UV (1) + uint8_t palette_size[2]; } PALETTE_MODE_INFO; typedef struct { - uint8_t use_filter_intra; FILTER_INTRA_MODE filter_intra_mode; + uint8_t use_filter_intra; } FILTER_INTRA_MODE_INFO; static const PREDICTION_MODE fimode_to_intradir[FILTER_INTRA_MODES] = { @@ -189,23 +189,24 @@ typedef struct RD_STATS { int64_t rdcost; int64_t sse; int skip; // sse should equal to dist when skip == 1 - int64_t ref_rdcost; int zero_rate; - uint8_t invalid_rate; #if CONFIG_RD_DEBUG int txb_coeff_cost[MAX_MB_PLANE]; - int txb_coeff_cost_map[MAX_MB_PLANE][TXB_COEFF_COST_MAP_SIZE] - [TXB_COEFF_COST_MAP_SIZE]; + // TODO(jingning): Temporary solution to silence stack over-size warning + // in handle_inter_mode. This should be fixed after rate-distortion + // optimization refactoring. + int16_t txb_coeff_cost_map[MAX_MB_PLANE][TXB_COEFF_COST_MAP_SIZE] + [TXB_COEFF_COST_MAP_SIZE]; #endif // CONFIG_RD_DEBUG } RD_STATS; // This struct is used to group function args that are commonly // sent together in functions related to interinter compound modes typedef struct { - int wedge_index; - int wedge_sign; - DIFFWTD_MASK_TYPE mask_type; uint8_t *seg_mask; + int8_t wedge_index; + int8_t wedge_sign; + DIFFWTD_MASK_TYPE mask_type; COMPOUND_TYPE type; } INTERINTER_COMPOUND_DATA; @@ -213,66 +214,60 @@ typedef struct { #define TXK_TYPE_BUF_LEN 64 // This structure now relates to 4x4 block regions. 
typedef struct MB_MODE_INFO { + // interinter members + INTERINTER_COMPOUND_DATA interinter_comp; + WarpedMotionParams wm_params; + int_mv mv[2]; + int current_qindex; + // Only for INTER blocks + int_interpfilters interp_filters; + // TODO(debargha): Consolidate these flags +#if CONFIG_RD_DEBUG + RD_STATS rd_stats; + int mi_row; + int mi_col; +#endif +#if CONFIG_INSPECTION + int16_t tx_skip[TXK_TYPE_BUF_LEN]; +#endif + PALETTE_MODE_INFO palette_mode_info; // Common for both INTER and INTRA blocks BLOCK_SIZE sb_type; PREDICTION_MODE mode; - TX_SIZE tx_size; - uint8_t inter_tx_size[INTER_TX_SIZE_BUF_LEN]; - int8_t skip; - int8_t skip_mode; - int8_t segment_id; - int8_t seg_id_predicted; // valid only when temporal_update is enabled - // Only for INTRA blocks UV_PREDICTION_MODE uv_mode; - - PALETTE_MODE_INFO palette_mode_info; - uint8_t use_intrabc; - - // Only for INTER blocks - InterpFilters interp_filters; - MV_REFERENCE_FRAME ref_frame[2]; - - TX_TYPE txk_type[TXK_TYPE_BUF_LEN]; - - FILTER_INTRA_MODE_INFO filter_intra_mode_info; - - // The actual prediction angle is the base angle + (angle_delta * step). - int8_t angle_delta[PLANE_TYPES]; - // interintra members INTERINTRA_MODE interintra_mode; - // TODO(debargha): Consolidate these flags - int use_wedge_interintra; - int interintra_wedge_index; - int interintra_wedge_sign; - // interinter members - INTERINTER_COMPOUND_DATA interinter_comp; MOTION_MODE motion_mode; - int overlappable_neighbors[2]; - int_mv mv[2]; - uint8_t ref_mv_idx; PARTITION_TYPE partition; + MV_REFERENCE_FRAME ref_frame[2]; + FILTER_INTRA_MODE_INFO filter_intra_mode_info; + int8_t skip; + uint8_t inter_tx_size[INTER_TX_SIZE_BUF_LEN]; + TX_SIZE tx_size; + int8_t delta_lf_from_base; + int8_t delta_lf[FRAME_LF_COUNT]; + int8_t interintra_wedge_index; + // The actual prediction angle is the base angle + (angle_delta * step). 
+ int8_t angle_delta[PLANE_TYPES]; /* deringing gain *per-superblock* */ - int8_t cdef_strength; - int current_qindex; - int delta_lf_from_base; - int delta_lf[FRAME_LF_COUNT]; -#if CONFIG_RD_DEBUG - RD_STATS rd_stats; - int mi_row; - int mi_col; -#endif - int num_proj_ref; - WarpedMotionParams wm_params; - - // Index of the alpha Cb and alpha Cr combination - int cfl_alpha_idx; // Joint sign of alpha Cb and alpha Cr - int cfl_alpha_signs; - - int compound_idx; - int comp_group_idx; + int8_t cfl_alpha_signs; + // Index of the alpha Cb and alpha Cr combination + uint8_t cfl_alpha_idx; + uint8_t num_proj_ref; + uint8_t overlappable_neighbors[2]; + // If comp_group_idx=0, indicate if dist_wtd_comp(0) or avg_comp(1) is used. + uint8_t compound_idx; + uint8_t use_wedge_interintra : 1; + uint8_t segment_id : 3; + uint8_t seg_id_predicted : 1; // valid only when temporal_update is enabled + uint8_t skip_mode : 1; + uint8_t use_intrabc : 1; + uint8_t ref_mv_idx : 2; + // Indicate if masked compound is used(1) or not(0). 
+ uint8_t comp_group_idx : 1; + int8_t cdef_strength : 4; } MB_MODE_INFO; static INLINE int is_intrabc_block(const MB_MODE_INFO *mbmi) { @@ -366,13 +361,13 @@ static INLINE void mi_to_pixel_loc(int *pixel_c, int *pixel_r, int mi_col, int mi_row, int tx_blk_col, int tx_blk_row, int subsampling_x, int subsampling_y) { *pixel_c = ((mi_col >> subsampling_x) << MI_SIZE_LOG2) + - (tx_blk_col << tx_size_wide_log2[0]); + (tx_blk_col << MI_SIZE_LOG2); *pixel_r = ((mi_row >> subsampling_y) << MI_SIZE_LOG2) + - (tx_blk_row << tx_size_high_log2[0]); + (tx_blk_row << MI_SIZE_LOG2); } #endif -enum ATTRIBUTE_PACKED mv_precision { MV_PRECISION_Q3, MV_PRECISION_Q4 }; +enum { MV_PRECISION_Q3, MV_PRECISION_Q4 } UENUM1BYTE(mv_precision); struct buf_2d { uint8_t *buf; @@ -403,10 +398,10 @@ typedef struct macroblockd_plane { int subsampling_y; struct buf_2d dst; struct buf_2d pre[2]; - ENTROPY_CONTEXT *above_context; - ENTROPY_CONTEXT *left_context; + ENTROPY_CONTEXT *above_entropy_context; + ENTROPY_CONTEXT *left_entropy_context; - // The dequantizers below are true dequntizers used only in the + // The dequantizers below are true dequantizers used only in the // dequantization process. They have the same coefficient // shift/scale as TX. int16_t seg_dequant_QTX[MAX_SEGMENTS][2]; @@ -417,23 +412,9 @@ typedef struct macroblockd_plane { qm_val_t *seg_iqmatrix[MAX_SEGMENTS][TX_SIZES_ALL]; qm_val_t *seg_qmatrix[MAX_SEGMENTS][TX_SIZES_ALL]; - - // the 'dequantizers' below are not literal dequantizer values. - // They're used by encoder RDO to generate ad-hoc lambda values. - // They use a hardwired Q3 coeff shift and do not necessarily match - // the TX scale in use. 
- const int16_t *dequant_Q3; } MACROBLOCKD_PLANE; -#define BLOCK_OFFSET(x, i) \ - ((x) + (i) * (1 << (tx_size_wide_log2[0] + tx_size_high_log2[0]))) - -typedef struct RefBuffer { - int idx; // frame buf idx - int map_idx; // frame map idx - YV12_BUFFER_CONFIG *buf; - struct scale_factors sf; -} RefBuffer; +#define BLOCK_OFFSET(i) ((i) << 4) typedef struct { DECLARE_ALIGNED(16, InterpKernel, vfilter); @@ -478,74 +459,148 @@ typedef struct cfl_ctx { // Chroma subsampling int subsampling_x, subsampling_y; - int mi_row, mi_col; - // Whether the reconstructed luma pixels need to be stored int store_y; #if CONFIG_DEBUG int rate; #endif // CONFIG_DEBUG - - int is_chroma_reference; } CFL_CTX; -typedef struct jnt_comp_params { - int use_jnt_comp_avg; +typedef struct dist_wtd_comp_params { + int use_dist_wtd_comp_avg; int fwd_offset; int bck_offset; -} JNT_COMP_PARAMS; +} DIST_WTD_COMP_PARAMS; + +struct scale_factors; // Most/all of the pointers are mere pointers to actual arrays are allocated // elsewhere. This is mostly for coding convenience. typedef struct macroblockd { + // Row and column position of current macroblock in mi units. + int mi_row; + int mi_col; + // Same as cm->mi_params.mi_stride, copied here for convenience. + int mi_stride; + + // True if current block transmits chroma information. + // More detail: + // Smallest supported block size for both luma and chroma plane is 4x4. Hence, + // in case of subsampled chroma plane (YUV 4:2:0 or YUV 4:2:2), multiple luma + // blocks smaller than 8x8 maybe combined into one chroma block. + // For example, for YUV 4:2:0, let's say an 8x8 area is split into four 4x4 + // luma blocks. Then, a single chroma block of size 4x4 will cover the area of + // these four luma blocks. This is implemented in bitstream as follows: + // - There are four MB_MODE_INFO structs for the four luma blocks. + // - First 3 MB_MODE_INFO have is_chroma_ref = false, and so do not transmit + // any information for chroma planes. 
+ // - Last block will have is_chroma_ref = true and transmits chroma + // information for the 4x4 chroma block that covers whole 8x8 area covered by + // four luma blocks. + // Similar logic applies for chroma blocks that cover 2 or 3 luma blocks. + bool is_chroma_ref; + struct macroblockd_plane plane[MAX_MB_PLANE]; TileInfo tile; - int mi_stride; - + // Appropriate offset inside cm->mi_params.mi_grid_base based on current + // mi_row and mi_col. MB_MODE_INFO **mi; + + // True if 4x4 block above the current block is available. + bool up_available; + // True if 4x4 block to the left of the current block is available. + bool left_available; + // True if the above chrome reference block is available. + bool chroma_up_available; + // True if the left chrome reference block is available. + bool chroma_left_available; + + // MB_MODE_INFO for 4x4 block to the left of the current block, if + // left_available == true; otherwise NULL. MB_MODE_INFO *left_mbmi; + // MB_MODE_INFO for 4x4 block above the current block, if + // up_available == true; otherwise NULL. MB_MODE_INFO *above_mbmi; + // Above chroma reference block if is_chroma_ref == true for the current block + // and chroma_up_available == true; otherwise NULL. + // See also: the special case logic when current chroma block covers more than + // one luma blocks in set_mi_row_col(). MB_MODE_INFO *chroma_left_mbmi; + // Left chroma reference block if is_chroma_ref == true for the current block + // and chroma_left_available == true; otherwise NULL. + // See also: the special case logic when current chroma block covers more than + // one luma blocks in set_mi_row_col(). MB_MODE_INFO *chroma_above_mbmi; - int up_available; - int left_available; - int chroma_up_available; - int chroma_left_available; + // Appropriate offset based on current 'mi_row' and 'mi_col', inside + // 'tx_type_map' in one of 'CommonModeInfoParams', 'PICK_MODE_CONTEXT' or + // 'MACROBLOCK' structs. 
+ uint8_t *tx_type_map; + // Stride for 'tx_type_map'. Note that this may / may not be same as + // 'mi_stride', depending on which actual array 'tx_type_map' points to. + int tx_type_map_stride; - /* Distance of MB away from frame edges in subpixels (1/8th pixel) */ + // Distance of this macroblock from frame edges in 1/8th pixel units. int mb_to_left_edge; int mb_to_right_edge; int mb_to_top_edge; int mb_to_bottom_edge; - /* pointers to reference frames */ - const RefBuffer *block_refs[2]; + // Scale factors for reference frames of the current block. + // These are pointers into 'cm->ref_scale_factors'. + const struct scale_factors *block_ref_scale_factors[2]; - /* pointer to current frame */ const YV12_BUFFER_CONFIG *cur_buf; - ENTROPY_CONTEXT *above_context[MAX_MB_PLANE]; - ENTROPY_CONTEXT left_context[MAX_MB_PLANE][MAX_MIB_SIZE]; - - PARTITION_CONTEXT *above_seg_context; - PARTITION_CONTEXT left_seg_context[MAX_MIB_SIZE]; - + // Entropy contexts for the above blocks. + // above_entropy_context[i][j] corresponds to above entropy context for ith + // plane and jth mi column of this *frame*, wrt current 'mi_row'. + // These are pointers into 'cm->above_contexts.entropy'. + ENTROPY_CONTEXT *above_entropy_context[MAX_MB_PLANE]; + // Entropy contexts for the left blocks. + // left_entropy_context[i][j] corresponds to left entropy context for ith + // plane and jth mi row of this *superblock*, wrt current 'mi_col'. + // Note: These contain actual data, NOT pointers. + ENTROPY_CONTEXT left_entropy_context[MAX_MB_PLANE][MAX_MIB_SIZE]; + + // Partition contexts for the above blocks. + // above_partition_context[i] corresponds to above partition context for ith + // mi column of this *frame*, wrt current 'mi_row'. + // These are pointers into 'cm->above_contexts.partition'. + PARTITION_CONTEXT *above_partition_context; + // Partition contexts for the left blocks. 
+ // left_partition_context[i] corresponds to left partition context for ith + // mi row of this *superblock*, wrt current 'mi_col'. + // Note: These contain actual data, NOT pointers. + PARTITION_CONTEXT left_partition_context[MAX_MIB_SIZE]; + + // Transform contexts for the above blocks. + // TODO(urvang): Indexed two different ways from cm->above_contexts.txfm in + // code currently. Need to make it consistent / document why. TXFM_CONTEXT *above_txfm_context; + // Transform contexts for the left blocks. TXFM_CONTEXT *left_txfm_context; + // TODO(urvang): 'left_txfm_context' points to 'left_txfm_context_buffer'. + // Can we remove this indirection? TXFM_CONTEXT left_txfm_context_buffer[MAX_MIB_SIZE]; + // Default values for the two restoration filters for each plane. + // These values are used as reference values when writing the bitstream. That + // is, we transmit the delta between the actual values in + // cm->rst_info[plane].unit_info[unit_idx] and these reference values. WienerInfo wiener_info[MAX_MB_PLANE]; SgrprojInfo sgrproj_info[MAX_MB_PLANE]; - // block dimension in the unit of mode_info. - uint8_t n4_w, n4_h; + // Block dimensions in MB_MODE_INFO units. + uint8_t width; + uint8_t height; uint8_t ref_mv_count[MODE_CTX_REF_FRAMES]; CANDIDATE_MV ref_mv_stack[MODE_CTX_REF_FRAMES][MAX_REF_MV_STACK_SIZE]; + uint16_t weight[MODE_CTX_REF_FRAMES][MAX_REF_MV_STACK_SIZE]; uint8_t is_sec_rect; // Counts of each reference frame in the above and left neighboring blocks. @@ -553,15 +608,18 @@ typedef struct macroblockd { uint8_t neighbors_ref_counts[REF_FRAMES]; FRAME_CONTEXT *tile_ctx; - /* Bit depth: 8, 10, 12 */ + // Bit depth: copied from cm->seq_params.bit_depth for convenience. int bd; int qindex[MAX_SEGMENTS]; int lossless[MAX_SEGMENTS]; + // TODO(urvang): Move to decoder. int corrupted; + // Same as cm->features.cur_frame_force_integer_mv. int cur_frame_force_integer_mv; - // same with that in AV1_COMMON + // Pointer to cm->error. 
struct aom_internal_error_info *error_info; + // Same as cm->global_motion. const WarpedMotionParams *global_motion; int delta_qindex; int current_qindex; @@ -571,7 +629,7 @@ typedef struct macroblockd { // filtering level) and code the delta between previous superblock's delta // lf and current delta lf. It is equivalent to the delta between previous // superblock's actual lf and current lf. - int delta_lf_from_base; + int8_t delta_lf_from_base; // For this experiment, we have four frame filter levels for different plane // and direction. So, to support the per superblock update, we need to add // a few more params as below. @@ -585,14 +643,27 @@ typedef struct macroblockd { // SEG_LVL_ALT_LF_Y_H = 2; // SEG_LVL_ALT_LF_U = 3; // SEG_LVL_ALT_LF_V = 4; - int delta_lf[FRAME_LF_COUNT]; - int cdef_preset[4]; + int8_t delta_lf[FRAME_LF_COUNT]; + // cdef_transmitted[i] is true if CDEF strength for ith CDEF unit in the + // current superblock has already been read from (decoder) / written to + // (encoder) the bitstream; and false otherwise. + // More detail: + // (1) CDEF strength is transmitted only once per CDEF unit, in the 1st + // non-skip coding block. So, we need this array to keep track of whether CDEF + // strengths for the given CDEF units have been transmitted yet or not. + // (2) Superblock size can be either 128x128 or 64x64, but CDEF unit size is + // fixed to be 64x64. So, there may be 4 CDEF units within a superblock (if + // superblock size is 128x128). Hence the array size is 4. + // (3) In the current implementation, CDEF strength for this CDEF unit is + // stored in the MB_MODE_INFO of the 1st block in this CDEF unit (inside + // cm->mi_params.mi_grid_base). 
+ bool cdef_transmitted[4]; DECLARE_ALIGNED(16, uint8_t, seg_mask[2 * MAX_SB_SQUARE]); uint8_t *mc_buf[2]; CFL_CTX cfl; - JNT_COMP_PARAMS jcp_param; + DIST_WTD_COMP_PARAMS jcp_param; uint16_t cb_offset[MAX_MB_PLANE]; uint16_t txb_offset[MAX_MB_PLANE]; @@ -602,7 +673,7 @@ typedef struct macroblockd { uint8_t *tmp_obmc_bufs[2]; } MACROBLOCKD; -static INLINE int get_bitdepth_data_path_index(const MACROBLOCKD *xd) { +static INLINE int is_cur_buf_hbd(const MACROBLOCKD *xd) { return xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH ? 1 : 0; } @@ -646,19 +717,19 @@ static INLINE BLOCK_SIZE get_partition_subsize(BLOCK_SIZE bsize, static TX_TYPE intra_mode_to_tx_type(const MB_MODE_INFO *mbmi, PLANE_TYPE plane_type) { static const TX_TYPE _intra_mode_to_tx_type[INTRA_MODES] = { - DCT_DCT, // DC - ADST_DCT, // V - DCT_ADST, // H - DCT_DCT, // D45 - ADST_ADST, // D135 - ADST_DCT, // D117 - DCT_ADST, // D153 - DCT_ADST, // D207 - ADST_DCT, // D63 - ADST_ADST, // SMOOTH - ADST_DCT, // SMOOTH_V - DCT_ADST, // SMOOTH_H - ADST_ADST, // PAETH + DCT_DCT, // DC_PRED + ADST_DCT, // V_PRED + DCT_ADST, // H_PRED + DCT_DCT, // D45_PRED + ADST_ADST, // D135_PRED + ADST_DCT, // D113_PRED + DCT_ADST, // D157_PRED + DCT_ADST, // D203_PRED + ADST_DCT, // D67_PRED + ADST_ADST, // SMOOTH_PRED + ADST_DCT, // SMOOTH_V_PRED + DCT_ADST, // SMOOTH_H_PRED + ADST_ADST, // PAETH_PRED }; const PREDICTION_MODE mode = (plane_type == PLANE_TYPE_Y) ? 
mbmi->mode : get_uv_mode(mbmi->uv_mode); @@ -686,6 +757,22 @@ static const int av1_ext_tx_used[EXT_TX_SET_TYPES][TX_TYPES] = { { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, }; +static const uint16_t av1_reduced_intra_tx_used_flag[INTRA_MODES] = { + 0x080F, // DC_PRED: 0000 1000 0000 1111 + 0x040F, // V_PRED: 0000 0100 0000 1111 + 0x080F, // H_PRED: 0000 1000 0000 1111 + 0x020F, // D45_PRED: 0000 0010 0000 1111 + 0x080F, // D135_PRED: 0000 1000 0000 1111 + 0x040F, // D113_PRED: 0000 0100 0000 1111 + 0x080F, // D157_PRED: 0000 1000 0000 1111 + 0x080F, // D203_PRED: 0000 1000 0000 1111 + 0x040F, // D67_PRED: 0000 0100 0000 1111 + 0x080F, // SMOOTH_PRED: 0000 1000 0000 1111 + 0x040F, // SMOOTH_V_PRED: 0000 0100 0000 1111 + 0x080F, // SMOOTH_H_PRED: 0000 1000 0000 1111 + 0x0C0E, // PAETH_PRED: 0000 1100 0000 1110 +}; + static const uint16_t av1_ext_tx_used_flag[EXT_TX_SET_TYPES] = { 0x0001, // 0000 0000 0000 0001 0x0201, // 0000 0010 0000 0001 @@ -695,6 +782,11 @@ static const uint16_t av1_ext_tx_used_flag[EXT_TX_SET_TYPES] = { 0xFFFF, // 1111 1111 1111 1111 }; +static const TxSetType av1_ext_tx_set_lookup[2][2] = { + { EXT_TX_SET_DTT4_IDTX_1DDCT, EXT_TX_SET_DTT4_IDTX }, + { EXT_TX_SET_ALL16, EXT_TX_SET_DTT9_IDTX_1DDCT }, +}; + static INLINE TxSetType av1_get_ext_tx_set_type(TX_SIZE tx_size, int is_inter, int use_reduced_set) { const TX_SIZE tx_size_sqr_up = txsize_sqr_up_map[tx_size]; @@ -704,13 +796,7 @@ static INLINE TxSetType av1_get_ext_tx_set_type(TX_SIZE tx_size, int is_inter, if (use_reduced_set) return is_inter ? EXT_TX_SET_DCT_IDTX : EXT_TX_SET_DTT4_IDTX; const TX_SIZE tx_size_sqr = txsize_sqr_map[tx_size]; - if (is_inter) { - return (tx_size_sqr == TX_16X16 ? EXT_TX_SET_DTT9_IDTX_1DDCT - : EXT_TX_SET_ALL16); - } else { - return (tx_size_sqr == TX_16X16 ? EXT_TX_SET_DTT4_IDTX - : EXT_TX_SET_DTT4_IDTX_1DDCT); - } + return av1_ext_tx_set_lookup[is_inter][tx_size_sqr == TX_16X16]; } // Maps tx set types to the indices. 
@@ -749,7 +835,6 @@ static INLINE TX_SIZE tx_size_from_tx_mode(BLOCK_SIZE bsize, TX_MODE tx_mode) { return largest_tx_size; } -extern const int16_t dr_intra_derivative[90]; static const uint8_t mode_to_angle_map[] = { 0, 90, 180, 45, 135, 113, 157, 203, 67, 0, 0, 0, 0, }; @@ -777,11 +862,13 @@ static INLINE int av1_raster_order_to_block_index(TX_SIZE tx_size, static INLINE TX_TYPE get_default_tx_type(PLANE_TYPE plane_type, const MACROBLOCKD *xd, - TX_SIZE tx_size) { + TX_SIZE tx_size, + int is_screen_content_type) { const MB_MODE_INFO *const mbmi = xd->mi[0]; if (is_inter_block(mbmi) || plane_type != PLANE_TYPE_Y || - xd->lossless[mbmi->segment_id] || tx_size >= TX_32X32) + xd->lossless[mbmi->segment_id] || tx_size >= TX_32X32 || + is_screen_content_type) return DCT_DCT; return intra_mode_to_tx_type(mbmi, plane_type); @@ -792,45 +879,77 @@ static INLINE TX_TYPE get_default_tx_type(PLANE_TYPE plane_type, static INLINE BLOCK_SIZE get_plane_block_size(BLOCK_SIZE bsize, int subsampling_x, int subsampling_y) { - if (bsize == BLOCK_INVALID) return BLOCK_INVALID; + assert(bsize < BLOCK_SIZES_ALL); + assert(subsampling_x >= 0 && subsampling_x < 2); + assert(subsampling_y >= 0 && subsampling_y < 2); return ss_size_lookup[bsize][subsampling_x][subsampling_y]; } +/* + * Logic to generate the lookup tables: + * + * TX_SIZE txs = max_txsize_rect_lookup[bsize]; + * for (int level = 0; level < MAX_VARTX_DEPTH - 1; ++level) + * txs = sub_tx_size_map[txs]; + * const int tx_w_log2 = tx_size_wide_log2[txs] - MI_SIZE_LOG2; + * const int tx_h_log2 = tx_size_high_log2[txs] - MI_SIZE_LOG2; + * const int bw_uint_log2 = mi_size_wide_log2[bsize]; + * const int stride_log2 = bw_uint_log2 - tx_w_log2; + */ static INLINE int av1_get_txb_size_index(BLOCK_SIZE bsize, int blk_row, int blk_col) { - TX_SIZE txs = max_txsize_rect_lookup[bsize]; - for (int level = 0; level < MAX_VARTX_DEPTH - 1; ++level) - txs = sub_tx_size_map[txs]; - const int tx_w_log2 = tx_size_wide_log2[txs] - MI_SIZE_LOG2; - 
const int tx_h_log2 = tx_size_high_log2[txs] - MI_SIZE_LOG2; - const int bw_log2 = mi_size_wide_log2[bsize]; - const int stride_log2 = bw_log2 - tx_w_log2; + static const uint8_t tw_w_log2_table[BLOCK_SIZES_ALL] = { + 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 3, 0, 1, 1, 2, 2, 3, + }; + static const uint8_t tw_h_log2_table[BLOCK_SIZES_ALL] = { + 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 3, 1, 0, 2, 1, 3, 2, + }; + static const uint8_t stride_log2_table[BLOCK_SIZES_ALL] = { + 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 2, 2, 0, 1, 0, 1, 0, 1, + }; const int index = - ((blk_row >> tx_h_log2) << stride_log2) + (blk_col >> tx_w_log2); + ((blk_row >> tw_h_log2_table[bsize]) << stride_log2_table[bsize]) + + (blk_col >> tw_w_log2_table[bsize]); assert(index < INTER_TX_SIZE_BUF_LEN); return index; } +#if CONFIG_INSPECTION +/* + * Here is the logic to generate the lookup tables: + * + * TX_SIZE txs = max_txsize_rect_lookup[bsize]; + * for (int level = 0; level < MAX_VARTX_DEPTH; ++level) + * txs = sub_tx_size_map[txs]; + * const int tx_w_log2 = tx_size_wide_log2[txs] - MI_SIZE_LOG2; + * const int tx_h_log2 = tx_size_high_log2[txs] - MI_SIZE_LOG2; + * const int bw_uint_log2 = mi_size_wide_log2[bsize]; + * const int stride_log2 = bw_uint_log2 - tx_w_log2; + */ static INLINE int av1_get_txk_type_index(BLOCK_SIZE bsize, int blk_row, int blk_col) { - TX_SIZE txs = max_txsize_rect_lookup[bsize]; - for (int level = 0; level < MAX_VARTX_DEPTH; ++level) - txs = sub_tx_size_map[txs]; - const int tx_w_log2 = tx_size_wide_log2[txs] - MI_SIZE_LOG2; - const int tx_h_log2 = tx_size_high_log2[txs] - MI_SIZE_LOG2; - const int bw_uint_log2 = mi_size_wide_log2[bsize]; - const int stride_log2 = bw_uint_log2 - tx_w_log2; + static const uint8_t tw_w_log2_table[BLOCK_SIZES_ALL] = { + 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 2, 2, 2, 0, 0, 1, 1, 2, 2, + }; + static const uint8_t tw_h_log2_table[BLOCK_SIZES_ALL] = { + 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 2, 2, 2, 0, 0, 1, 1, 2, 2, + }; + static 
const uint8_t stride_log2_table[BLOCK_SIZES_ALL] = { + 0, 0, 1, 1, 1, 2, 2, 1, 2, 2, 1, 2, 2, 2, 3, 3, 0, 2, 0, 2, 0, 2, + }; const int index = - ((blk_row >> tx_h_log2) << stride_log2) + (blk_col >> tx_w_log2); + ((blk_row >> tw_h_log2_table[bsize]) << stride_log2_table[bsize]) + + (blk_col >> tw_w_log2_table[bsize]); assert(index < TXK_TYPE_BUF_LEN); return index; } +#endif // CONFIG_INSPECTION -static INLINE void update_txk_array(TX_TYPE *txk_type, BLOCK_SIZE bsize, - int blk_row, int blk_col, TX_SIZE tx_size, +static INLINE void update_txk_array(MACROBLOCKD *const xd, int blk_row, + int blk_col, TX_SIZE tx_size, TX_TYPE tx_type) { - const int txk_type_idx = av1_get_txk_type_index(bsize, blk_row, blk_col); - txk_type[txk_type_idx] = tx_type; + const int stride = xd->tx_type_map_stride; + xd->tx_type_map[blk_row * stride + blk_col] = tx_type; const int txw = tx_size_wide_unit[tx_size]; const int txh = tx_size_high_unit[tx_size]; @@ -843,71 +962,84 @@ static INLINE void update_txk_array(TX_TYPE *txk_type, BLOCK_SIZE bsize, const int tx_unit = tx_size_wide_unit[TX_16X16]; for (int idy = 0; idy < txh; idy += tx_unit) { for (int idx = 0; idx < txw; idx += tx_unit) { - const int this_index = - av1_get_txk_type_index(bsize, blk_row + idy, blk_col + idx); - txk_type[this_index] = tx_type; + xd->tx_type_map[(blk_row + idy) * stride + blk_col + idx] = tx_type; } } } } -static INLINE TX_TYPE av1_get_tx_type(PLANE_TYPE plane_type, - const MACROBLOCKD *xd, int blk_row, +static INLINE TX_TYPE av1_get_tx_type(const MACROBLOCKD *xd, + PLANE_TYPE plane_type, int blk_row, int blk_col, TX_SIZE tx_size, int reduced_tx_set) { const MB_MODE_INFO *const mbmi = xd->mi[0]; - const struct macroblockd_plane *const pd = &xd->plane[plane_type]; - const TxSetType tx_set_type = - av1_get_ext_tx_set_type(tx_size, is_inter_block(mbmi), reduced_tx_set); + if (xd->lossless[mbmi->segment_id] || txsize_sqr_up_map[tx_size] > TX_32X32) { + return DCT_DCT; + } TX_TYPE tx_type; - if 
(xd->lossless[mbmi->segment_id] || txsize_sqr_up_map[tx_size] > TX_32X32) { - tx_type = DCT_DCT; + if (plane_type == PLANE_TYPE_Y) { + tx_type = xd->tx_type_map[blk_row * xd->tx_type_map_stride + blk_col]; } else { - if (plane_type == PLANE_TYPE_Y) { - const int txk_type_idx = - av1_get_txk_type_index(mbmi->sb_type, blk_row, blk_col); - tx_type = mbmi->txk_type[txk_type_idx]; - } else if (is_inter_block(mbmi)) { + if (is_inter_block(mbmi)) { // scale back to y plane's coordinate + const struct macroblockd_plane *const pd = &xd->plane[plane_type]; blk_row <<= pd->subsampling_y; blk_col <<= pd->subsampling_x; - const int txk_type_idx = - av1_get_txk_type_index(mbmi->sb_type, blk_row, blk_col); - tx_type = mbmi->txk_type[txk_type_idx]; + tx_type = xd->tx_type_map[blk_row * xd->tx_type_map_stride + blk_col]; } else { // In intra mode, uv planes don't share the same prediction mode as y // plane, so the tx_type should not be shared tx_type = intra_mode_to_tx_type(mbmi, PLANE_TYPE_UV); } + const TxSetType tx_set_type = + av1_get_ext_tx_set_type(tx_size, is_inter_block(mbmi), reduced_tx_set); + if (!av1_ext_tx_used[tx_set_type][tx_type]) tx_type = DCT_DCT; } assert(tx_type < TX_TYPES); - if (!av1_ext_tx_used[tx_set_type][tx_type]) return DCT_DCT; + assert(av1_ext_tx_used[av1_get_ext_tx_set_type(tx_size, is_inter_block(mbmi), + reduced_tx_set)][tx_type]); return tx_type; } void av1_setup_block_planes(MACROBLOCKD *xd, int ss_x, int ss_y, const int num_planes); +/* + * Logic to generate the lookup table: + * + * TX_SIZE tx_size = max_txsize_rect_lookup[bsize]; + * int depth = 0; + * while (depth < MAX_TX_DEPTH && tx_size != TX_4X4) { + * depth++; + * tx_size = sub_tx_size_map[tx_size]; + * } + */ static INLINE int bsize_to_max_depth(BLOCK_SIZE bsize) { - TX_SIZE tx_size = max_txsize_rect_lookup[bsize]; - int depth = 0; - while (depth < MAX_TX_DEPTH && tx_size != TX_4X4) { - depth++; - tx_size = sub_tx_size_map[tx_size]; - } - return depth; + static const uint8_t 
bsize_to_max_depth_table[BLOCK_SIZES_ALL] = { + 0, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + }; + return bsize_to_max_depth_table[bsize]; } +/* + * Logic to generate the lookup table: + * + * TX_SIZE tx_size = max_txsize_rect_lookup[bsize]; + * assert(tx_size != TX_4X4); + * int depth = 0; + * while (tx_size != TX_4X4) { + * depth++; + * tx_size = sub_tx_size_map[tx_size]; + * } + * assert(depth < 10); + */ static INLINE int bsize_to_tx_size_cat(BLOCK_SIZE bsize) { - TX_SIZE tx_size = max_txsize_rect_lookup[bsize]; - assert(tx_size != TX_4X4); - int depth = 0; - while (tx_size != TX_4X4) { - depth++; - tx_size = sub_tx_size_map[tx_size]; - assert(depth < 10); - } + assert(bsize < BLOCK_SIZES_ALL); + static const uint8_t bsize_to_tx_size_depth_table[BLOCK_SIZES_ALL] = { + 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 2, 2, 3, 3, 4, 4, + }; + const int depth = bsize_to_tx_size_depth_table[bsize]; assert(depth <= MAX_TX_CATS); return depth - 1; } @@ -948,8 +1080,8 @@ static INLINE TX_SIZE av1_get_tx_size(int plane, const MACROBLOCKD *xd) { pd->subsampling_y); } -void av1_reset_skip_context(MACROBLOCKD *xd, int mi_row, int mi_col, - BLOCK_SIZE bsize, const int num_planes); +void av1_reset_entropy_context(MACROBLOCKD *xd, BLOCK_SIZE bsize, + const int num_planes); void av1_reset_loop_filter_delta(MACROBLOCKD *xd, int num_planes); @@ -960,9 +1092,10 @@ typedef void (*foreach_transformed_block_visitor)(int plane, int block, BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg); -void av1_set_contexts(const MACROBLOCKD *xd, struct macroblockd_plane *pd, - int plane, BLOCK_SIZE plane_bsize, TX_SIZE tx_size, - int has_eob, int aoff, int loff); +void av1_set_entropy_contexts(const MACROBLOCKD *xd, + struct macroblockd_plane *pd, int plane, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, + int has_eob, int aoff, int loff); #define MAX_INTERINTRA_SB_SQUARE 32 * 32 static INLINE int is_interintra_mode(const MB_MODE_INFO *mbmi) { @@ -1013,15 +1146,13 @@ static 
INLINE int get_vartx_max_txsize(const MACROBLOCKD *xd, BLOCK_SIZE bsize, } static INLINE int is_motion_variation_allowed_bsize(BLOCK_SIZE bsize) { + assert(bsize < BLOCK_SIZES_ALL); return AOMMIN(block_size_wide[bsize], block_size_high[bsize]) >= 8; } static INLINE int is_motion_variation_allowed_compound( const MB_MODE_INFO *mbmi) { - if (!has_second_ref(mbmi)) - return 1; - else - return 0; + return !has_second_ref(mbmi); } // input: log2 of length, 0(4), 1(8), ... @@ -1045,7 +1176,8 @@ motion_mode_allowed(const WarpedMotionParams *gm_params, const MACROBLOCKD *xd, if (!check_num_overlappable_neighbors(mbmi)) return SIMPLE_TRANSLATION; assert(!has_second_ref(mbmi)); if (mbmi->num_proj_ref >= 1 && - (allow_warped_motion && !av1_is_scaled(&(xd->block_refs[0]->sf)))) { + (allow_warped_motion && + !av1_is_scaled(xd->block_ref_scale_factors[0]))) { if (xd->cur_frame_force_integer_mv) { return OBMC_CAUSAL; } @@ -1057,25 +1189,13 @@ motion_mode_allowed(const WarpedMotionParams *gm_params, const MACROBLOCKD *xd, } } -static INLINE void assert_motion_mode_valid(MOTION_MODE mode, - const WarpedMotionParams *gm_params, - const MACROBLOCKD *xd, - const MB_MODE_INFO *mbmi, - int allow_warped_motion) { - const MOTION_MODE last_motion_mode_allowed = - motion_mode_allowed(gm_params, xd, mbmi, allow_warped_motion); - - // Check that the input mode is not illegal - if (last_motion_mode_allowed < mode) - assert(0 && "Illegal motion mode selected"); -} - static INLINE int is_neighbor_overlappable(const MB_MODE_INFO *mbmi) { return (is_inter_block(mbmi)); } static INLINE int av1_allow_palette(int allow_screen_content_tools, BLOCK_SIZE sb_type) { + assert(sb_type < BLOCK_SIZES_ALL); return allow_screen_content_tools && block_size_wide[sb_type] <= 64 && block_size_high[sb_type] <= 64 && sb_type >= BLOCK_8X8; } diff --git a/media/libaom/src/av1/common/cdef.c b/media/libaom/src/av1/common/cdef.c index e9e2b0e42..ef7b866b5 100644 --- a/media/libaom/src/av1/common/cdef.c +++ 
b/media/libaom/src/av1/common/cdef.c @@ -16,45 +16,29 @@ #include "config/aom_scale_rtcd.h" #include "aom/aom_integer.h" +#include "av1/common/av1_common_int.h" #include "av1/common/cdef.h" #include "av1/common/cdef_block.h" -#include "av1/common/onyxc_int.h" #include "av1/common/reconinter.h" -int sb_all_skip(const AV1_COMMON *const cm, int mi_row, int mi_col) { - int maxc, maxr; - int skip = 1; - maxc = cm->mi_cols - mi_col; - maxr = cm->mi_rows - mi_row; - - maxr = AOMMIN(maxr, MI_SIZE_64X64); - maxc = AOMMIN(maxc, MI_SIZE_64X64); - - for (int r = 0; r < maxr; r++) { - for (int c = 0; c < maxc; c++) { - skip = - skip && - cm->mi_grid_visible[(mi_row + r) * cm->mi_stride + mi_col + c]->skip; - } - } - return skip; -} - static int is_8x8_block_skip(MB_MODE_INFO **grid, int mi_row, int mi_col, int mi_stride) { - int is_skip = 1; - for (int r = 0; r < mi_size_high[BLOCK_8X8]; ++r) - for (int c = 0; c < mi_size_wide[BLOCK_8X8]; ++c) - is_skip &= grid[(mi_row + r) * mi_stride + (mi_col + c)]->skip; + MB_MODE_INFO **mbmi = grid + mi_row * mi_stride + mi_col; + for (int r = 0; r < mi_size_high[BLOCK_8X8]; ++r, mbmi += mi_stride) { + for (int c = 0; c < mi_size_wide[BLOCK_8X8]; ++c) { + if (!mbmi[c]->skip) return 0; + } + } - return is_skip; + return 1; } -int sb_compute_cdef_list(const AV1_COMMON *const cm, int mi_row, int mi_col, - cdef_list *dlist, BLOCK_SIZE bs) { - MB_MODE_INFO **grid = cm->mi_grid_visible; - int maxc = cm->mi_cols - mi_col; - int maxr = cm->mi_rows - mi_row; +int av1_cdef_compute_sb_list(const CommonModeInfoParams *const mi_params, + int mi_row, int mi_col, cdef_list *dlist, + BLOCK_SIZE bs) { + MB_MODE_INFO **grid = mi_params->mi_grid_base; + int maxc = mi_params->mi_cols - mi_col; + int maxr = mi_params->mi_rows - mi_row; if (bs == BLOCK_128X128 || bs == BLOCK_128X64) maxc = AOMMIN(maxc, MI_SIZE_128X128); @@ -65,22 +49,17 @@ int sb_compute_cdef_list(const AV1_COMMON *const cm, int mi_row, int mi_col, else maxr = AOMMIN(maxr, MI_SIZE_64X64); - 
const int r_step = mi_size_high[BLOCK_8X8]; - const int c_step = mi_size_wide[BLOCK_8X8]; - const int r_shift = (r_step == 2); - const int c_shift = (c_step == 2); - - assert(r_step == 1 || r_step == 2); - assert(c_step == 1 || c_step == 2); - + const int r_step = 2; // mi_size_high[BLOCK_8X8] + const int c_step = 2; // mi_size_wide[BLOCK_8X8] + const int r_shift = 1; + const int c_shift = 1; int count = 0; - for (int r = 0; r < maxr; r += r_step) { for (int c = 0; c < maxc; c += c_step) { - if (!is_8x8_block_skip(grid, mi_row + r, mi_col + c, cm->mi_stride)) { + if (!is_8x8_block_skip(grid, mi_row + r, mi_col + c, + mi_params->mi_stride)) { dlist[count].by = r >> r_shift; dlist[count].bx = c >> c_shift; - dlist[count].skip = 0; count++; } } @@ -88,8 +67,9 @@ int sb_compute_cdef_list(const AV1_COMMON *const cm, int mi_row, int mi_col, return count; } -void copy_rect8_8bit_to_16bit_c(uint16_t *dst, int dstride, const uint8_t *src, - int sstride, int v, int h) { +void cdef_copy_rect8_8bit_to_16bit_c(uint16_t *dst, int dstride, + const uint8_t *src, int sstride, int v, + int h) { for (int i = 0; i < v; i++) { for (int j = 0; j < h; j++) { dst[i * dstride + j] = src[i * sstride + j]; @@ -97,9 +77,9 @@ void copy_rect8_8bit_to_16bit_c(uint16_t *dst, int dstride, const uint8_t *src, } } -void copy_rect8_16bit_to_16bit_c(uint16_t *dst, int dstride, - const uint16_t *src, int sstride, int v, - int h) { +void cdef_copy_rect8_16bit_to_16bit_c(uint16_t *dst, int dstride, + const uint16_t *src, int sstride, int v, + int h) { for (int i = 0; i < v; i++) { for (int j = 0; j < h; j++) { dst[i * dstride + j] = src[i * sstride + j]; @@ -107,16 +87,16 @@ void copy_rect8_16bit_to_16bit_c(uint16_t *dst, int dstride, } } -static void copy_sb8_16(AOM_UNUSED AV1_COMMON *cm, uint16_t *dst, int dstride, +static void copy_sb8_16(AV1_COMMON *cm, uint16_t *dst, int dstride, const uint8_t *src, int src_voffset, int src_hoffset, int sstride, int vsize, int hsize) { if 
(cm->seq_params.use_highbitdepth) { const uint16_t *base = &CONVERT_TO_SHORTPTR(src)[src_voffset * sstride + src_hoffset]; - copy_rect8_16bit_to_16bit(dst, dstride, base, sstride, vsize, hsize); + cdef_copy_rect8_16bit_to_16bit(dst, dstride, base, sstride, vsize, hsize); } else { const uint8_t *base = &src[src_voffset * sstride + src_hoffset]; - copy_rect8_8bit_to_16bit(dst, dstride, base, sstride, vsize, hsize); + cdef_copy_rect8_8bit_to_16bit(dst, dstride, base, sstride, vsize, hsize); } } @@ -140,6 +120,8 @@ static INLINE void copy_rect(uint16_t *dst, int dstride, const uint16_t *src, void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, MACROBLOCKD *xd) { + const CdefInfo *const cdef_info = &cm->cdef_info; + const CommonModeInfoParams *const mi_params = &cm->mi_params; const int num_planes = av1_num_planes(cm); DECLARE_ALIGNED(16, uint16_t, src[CDEF_INBUF_SIZE]); uint16_t *linebuf[3]; @@ -154,8 +136,8 @@ void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, int xdec[3]; int ydec[3]; int coeff_shift = AOMMAX(cm->seq_params.bit_depth - 8, 0); - const int nvfb = (cm->mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64; - const int nhfb = (cm->mi_cols + MI_SIZE_64X64 - 1) / MI_SIZE_64X64; + const int nvfb = (mi_params->mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64; + const int nhfb = (mi_params->mi_cols + MI_SIZE_64X64 - 1) / MI_SIZE_64X64; av1_setup_dst_planes(xd->plane, cm->seq_params.sb_size, frame, 0, 0, 0, num_planes); row_cdef = aom_malloc(sizeof(*row_cdef) * (nhfb + 2) * 2); @@ -168,7 +150,7 @@ void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, mi_wide_l2[pli] = MI_SIZE_LOG2 - xd->plane[pli].subsampling_x; mi_high_l2[pli] = MI_SIZE_LOG2 - xd->plane[pli].subsampling_y; } - const int stride = (cm->mi_cols << MI_SIZE_LOG2) + 2 * CDEF_HBORDER; + const int stride = (mi_params->mi_cols << MI_SIZE_LOG2) + 2 * CDEF_HBORDER; for (int pli = 0; pli < num_planes; pli++) { linebuf[pli] = aom_malloc(sizeof(*linebuf) * CDEF_VBORDER * stride); 
colbuf[pli] = @@ -190,17 +172,18 @@ void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, int nhb, nvb; int cstart = 0; curr_row_cdef[fbc] = 0; - if (cm->mi_grid_visible[MI_SIZE_64X64 * fbr * cm->mi_stride + - MI_SIZE_64X64 * fbc] == NULL || - cm->mi_grid_visible[MI_SIZE_64X64 * fbr * cm->mi_stride + - MI_SIZE_64X64 * fbc] + if (mi_params->mi_grid_base[MI_SIZE_64X64 * fbr * mi_params->mi_stride + + MI_SIZE_64X64 * fbc] == NULL || + mi_params + ->mi_grid_base[MI_SIZE_64X64 * fbr * mi_params->mi_stride + + MI_SIZE_64X64 * fbc] ->cdef_strength == -1) { cdef_left = 0; continue; } if (!cdef_left) cstart = -CDEF_HBORDER; - nhb = AOMMIN(MI_SIZE_64X64, cm->mi_cols - MI_SIZE_64X64 * fbc); - nvb = AOMMIN(MI_SIZE_64X64, cm->mi_rows - MI_SIZE_64X64 * fbr); + nhb = AOMMIN(MI_SIZE_64X64, mi_params->mi_cols - MI_SIZE_64X64 * fbc); + nvb = AOMMIN(MI_SIZE_64X64, mi_params->mi_rows - MI_SIZE_64X64 * fbr); int frame_top, frame_left, frame_bottom, frame_right; int mi_row = MI_SIZE_64X64 * fbr; @@ -218,32 +201,35 @@ void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, frame_left = (mi_col == 0) ? 1 : 0; if (fbr != nvfb - 1) - frame_bottom = (mi_row + MI_SIZE_64X64 == cm->mi_rows) ? 1 : 0; + frame_bottom = (mi_row + MI_SIZE_64X64 == mi_params->mi_rows) ? 1 : 0; else frame_bottom = 1; if (fbc != nhfb - 1) - frame_right = (mi_col + MI_SIZE_64X64 == cm->mi_cols) ? 1 : 0; + frame_right = (mi_col + MI_SIZE_64X64 == mi_params->mi_cols) ? 
1 : 0; else frame_right = 1; const int mbmi_cdef_strength = - cm->mi_grid_visible[MI_SIZE_64X64 * fbr * cm->mi_stride + - MI_SIZE_64X64 * fbc] + mi_params + ->mi_grid_base[MI_SIZE_64X64 * fbr * mi_params->mi_stride + + MI_SIZE_64X64 * fbc] ->cdef_strength; - level = cm->cdef_strengths[mbmi_cdef_strength] / CDEF_SEC_STRENGTHS; + level = + cdef_info->cdef_strengths[mbmi_cdef_strength] / CDEF_SEC_STRENGTHS; sec_strength = - cm->cdef_strengths[mbmi_cdef_strength] % CDEF_SEC_STRENGTHS; + cdef_info->cdef_strengths[mbmi_cdef_strength] % CDEF_SEC_STRENGTHS; sec_strength += sec_strength == 3; - uv_level = cm->cdef_uv_strengths[mbmi_cdef_strength] / CDEF_SEC_STRENGTHS; + uv_level = + cdef_info->cdef_uv_strengths[mbmi_cdef_strength] / CDEF_SEC_STRENGTHS; uv_sec_strength = - cm->cdef_uv_strengths[mbmi_cdef_strength] % CDEF_SEC_STRENGTHS; + cdef_info->cdef_uv_strengths[mbmi_cdef_strength] % CDEF_SEC_STRENGTHS; uv_sec_strength += uv_sec_strength == 3; if ((level == 0 && sec_strength == 0 && uv_level == 0 && uv_sec_strength == 0) || - (cdef_count = sb_compute_cdef_list(cm, fbr * MI_SIZE_64X64, - fbc * MI_SIZE_64X64, dlist, - BLOCK_64X64)) == 0) { + (cdef_count = av1_cdef_compute_sb_list(mi_params, fbr * MI_SIZE_64X64, + fbc * MI_SIZE_64X64, dlist, + BLOCK_64X64)) == 0) { cdef_left = 0; continue; } @@ -252,8 +238,7 @@ void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, for (int pli = 0; pli < num_planes; pli++) { int coffset; int rend, cend; - int pri_damping = cm->cdef_pri_damping; - int sec_damping = cm->cdef_sec_damping; + int damping = cdef_info->cdef_damping; int hsize = nhb << mi_wide_l2[pli]; int vsize = nvb << mi_high_l2[pli]; @@ -364,7 +349,7 @@ void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, } if (cm->seq_params.use_highbitdepth) { - cdef_filter_fb( + av1_cdef_filter_fb( NULL, &CONVERT_TO_SHORTPTR( xd->plane[pli] @@ -374,9 +359,9 @@ void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, xd->plane[pli].dst.stride, &src[CDEF_VBORDER * 
CDEF_BSTRIDE + CDEF_HBORDER], xdec[pli], ydec[pli], dir, NULL, var, pli, dlist, cdef_count, level, - sec_strength, pri_damping, sec_damping, coeff_shift); + sec_strength, damping, coeff_shift); } else { - cdef_filter_fb( + av1_cdef_filter_fb( &xd->plane[pli] .dst.buf[xd->plane[pli].dst.stride * (MI_SIZE_64X64 * fbr << mi_high_l2[pli]) + @@ -384,7 +369,7 @@ void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, NULL, xd->plane[pli].dst.stride, &src[CDEF_VBORDER * CDEF_BSTRIDE + CDEF_HBORDER], xdec[pli], ydec[pli], dir, NULL, var, pli, dlist, cdef_count, level, - sec_strength, pri_damping, sec_damping, coeff_shift); + sec_strength, damping, coeff_shift); } } cdef_left = 1; diff --git a/media/libaom/src/av1/common/cdef.h b/media/libaom/src/av1/common/cdef.h index 3b2eac8a5..c36fd135a 100644 --- a/media/libaom/src/av1/common/cdef.h +++ b/media/libaom/src/av1/common/cdef.h @@ -20,8 +20,8 @@ #include "aom/aom_integer.h" #include "aom_ports/mem.h" +#include "av1/common/av1_common_int.h" #include "av1/common/cdef_block.h" -#include "av1/common/onyxc_int.h" static INLINE int sign(int i) { return i < 0 ? 
-1 : 1; } @@ -37,13 +37,14 @@ static INLINE int constrain(int diff, int threshold, int damping) { extern "C" { #endif -int sb_all_skip(const AV1_COMMON *const cm, int mi_row, int mi_col); -int sb_compute_cdef_list(const AV1_COMMON *const cm, int mi_row, int mi_col, - cdef_list *dlist, BLOCK_SIZE bsize); +int av1_cdef_compute_sb_list(const CommonModeInfoParams *const mi_params, + int mi_row, int mi_col, cdef_list *dlist, + BLOCK_SIZE bsize); void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, MACROBLOCKD *xd); void av1_cdef_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref, - AV1_COMMON *cm, MACROBLOCKD *xd, int fast); + AV1_COMMON *cm, MACROBLOCKD *xd, int pick_method, + int rdmult); #ifdef __cplusplus } // extern "C" diff --git a/media/libaom/src/av1/common/cdef_block.c b/media/libaom/src/av1/common/cdef_block.c index df1de89be..7120705d3 100644 --- a/media/libaom/src/av1/common/cdef_block.c +++ b/media/libaom/src/av1/common/cdef_block.c @@ -108,17 +108,17 @@ int cdef_find_dir_c(const uint16_t *img, int stride, int32_t *var, } const int cdef_pri_taps[2][2] = { { 4, 2 }, { 3, 3 } }; -const int cdef_sec_taps[2][2] = { { 2, 1 }, { 2, 1 } }; +const int cdef_sec_taps[2] = { 2, 1 }; /* Smooth in the direction detected. */ void cdef_filter_block_c(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, - AOM_UNUSED int max_unused, int coeff_shift) { + int coeff_shift) { int i, j, k; const int s = CDEF_BSTRIDE; const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1]; - const int *sec_taps = cdef_sec_taps[(pri_strength >> coeff_shift) & 1]; + const int *sec_taps = cdef_sec_taps; for (i = 0; i < 4 << (bsize == BLOCK_8X8 || bsize == BLOCK_4X8); i++) { for (j = 0; j < 4 << (bsize == BLOCK_8X8 || bsize == BLOCK_8X4); j++) { int16_t sum = 0; @@ -173,25 +173,20 @@ static INLINE int adjust_strength(int strength, int32_t var) { return var ? 
(strength * (4 + i) + 8) >> 4 : 0; } -void cdef_filter_fb(uint8_t *dst8, uint16_t *dst16, int dstride, uint16_t *in, - int xdec, int ydec, int dir[CDEF_NBLOCKS][CDEF_NBLOCKS], - int *dirinit, int var[CDEF_NBLOCKS][CDEF_NBLOCKS], int pli, - cdef_list *dlist, int cdef_count, int level, - int sec_strength, int pri_damping, int sec_damping, - int coeff_shift) { +void av1_cdef_filter_fb(uint8_t *dst8, uint16_t *dst16, int dstride, + uint16_t *in, int xdec, int ydec, + int dir[CDEF_NBLOCKS][CDEF_NBLOCKS], int *dirinit, + int var[CDEF_NBLOCKS][CDEF_NBLOCKS], int pli, + cdef_list *dlist, int cdef_count, int level, + int sec_strength, int damping, int coeff_shift) { int bi; int bx; int by; - int bsize, bsizex, bsizey; - - int pri_strength = level << coeff_shift; + const int pri_strength = level << coeff_shift; sec_strength <<= coeff_shift; - sec_damping += coeff_shift - (pli != AOM_PLANE_Y); - pri_damping += coeff_shift - (pli != AOM_PLANE_Y); - bsize = - ydec ? (xdec ? BLOCK_4X4 : BLOCK_8X4) : (xdec ? 
BLOCK_4X8 : BLOCK_8X8); - bsizex = 3 - xdec; - bsizey = 3 - ydec; + damping += coeff_shift - (pli != AOM_PLANE_Y); + const int bw_log2 = 3 - xdec; + const int bh_log2 = 3 - ydec; if (dirinit && pri_strength == 0 && sec_strength == 0) { // If we're here, both primary and secondary strengths are 0, and // we still haven't written anything to y[] yet, so we just copy @@ -200,12 +195,12 @@ void cdef_filter_fb(uint8_t *dst8, uint16_t *dst16, int dstride, uint16_t *in, for (bi = 0; bi < cdef_count; bi++) { by = dlist[bi].by; bx = dlist[bi].bx; - int iy, ix; // TODO(stemidts/jmvalin): SIMD optimisations - for (iy = 0; iy < 1 << bsizey; iy++) - for (ix = 0; ix < 1 << bsizex; ix++) - dst16[(bi << (bsizex + bsizey)) + (iy << bsizex) + ix] = - in[((by << bsizey) + iy) * CDEF_BSTRIDE + (bx << bsizex) + ix]; + for (int iy = 0; iy < 1 << bh_log2; iy++) { + memcpy(&dst16[(bi << (bw_log2 + bh_log2)) + (iy << bw_log2)], + &in[((by << bh_log2) + iy) * CDEF_BSTRIDE + (bx << bw_log2)], + ((size_t)1 << bw_log2) * sizeof(*dst16)); + } } return; } @@ -231,27 +226,28 @@ void cdef_filter_fb(uint8_t *dst8, uint16_t *dst16, int dstride, uint16_t *in, } } + const int bsize = + ydec ? (xdec ? BLOCK_4X4 : BLOCK_8X4) : (xdec ? BLOCK_4X8 : BLOCK_8X8); + const int t = pri_strength; + const int s = sec_strength; for (bi = 0; bi < cdef_count; bi++) { - int t = dlist[bi].skip ? 0 : pri_strength; - int s = dlist[bi].skip ? 0 : sec_strength; by = dlist[bi].by; bx = dlist[bi].bx; - if (dst8) - cdef_filter_block(&dst8[(by << bsizey) * dstride + (bx << bsizex)], NULL, - dstride, - &in[(by * CDEF_BSTRIDE << bsizey) + (bx << bsizex)], - (pli ? t : adjust_strength(t, var[by][bx])), s, - t ? dir[by][bx] : 0, pri_damping, sec_damping, bsize, - (256 << coeff_shift) - 1, coeff_shift); - else + if (dst8) { + cdef_filter_block( + &dst8[(by << bh_log2) * dstride + (bx << bw_log2)], NULL, dstride, + &in[(by * CDEF_BSTRIDE << bh_log2) + (bx << bw_log2)], + (pli ? t : adjust_strength(t, var[by][bx])), s, t ? 
dir[by][bx] : 0, + damping, damping, bsize, coeff_shift); + } else { cdef_filter_block( NULL, - &dst16[dirinit ? bi << (bsizex + bsizey) - : (by << bsizey) * dstride + (bx << bsizex)], - dirinit ? 1 << bsizex : dstride, - &in[(by * CDEF_BSTRIDE << bsizey) + (bx << bsizex)], + &dst16[dirinit ? bi << (bw_log2 + bh_log2) + : (by << bh_log2) * dstride + (bx << bw_log2)], + dirinit ? 1 << bw_log2 : dstride, + &in[(by * CDEF_BSTRIDE << bh_log2) + (bx << bw_log2)], (pli ? t : adjust_strength(t, var[by][bx])), s, t ? dir[by][bx] : 0, - pri_damping, sec_damping, bsize, (256 << coeff_shift) - 1, - coeff_shift); + damping, damping, bsize, coeff_shift); + } } } diff --git a/media/libaom/src/av1/common/cdef_block.h b/media/libaom/src/av1/common/cdef_block.h index 6b4452cd6..6b0ae0a9d 100644 --- a/media/libaom/src/av1/common/cdef_block.h +++ b/media/libaom/src/av1/common/cdef_block.h @@ -32,28 +32,27 @@ (CDEF_BSTRIDE * ((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_VBORDER)) extern const int cdef_pri_taps[2][2]; -extern const int cdef_sec_taps[2][2]; +extern const int cdef_sec_taps[2]; DECLARE_ALIGNED(16, extern const int, cdef_directions[8][2]); typedef struct { uint8_t by; uint8_t bx; - uint8_t skip; } cdef_list; typedef void (*cdef_filter_block_func)(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, - int sec_damping, int bsize, int max, + int sec_damping, int bsize, int coeff_shift); void copy_cdef_16bit_to_16bit(uint16_t *dst, int dstride, uint16_t *src, cdef_list *dlist, int cdef_count, int bsize); -void cdef_filter_fb(uint8_t *dst8, uint16_t *dst16, int dstride, uint16_t *in, - int xdec, int ydec, int dir[CDEF_NBLOCKS][CDEF_NBLOCKS], - int *dirinit, int var[CDEF_NBLOCKS][CDEF_NBLOCKS], int pli, - cdef_list *dlist, int cdef_count, int level, - int sec_strength, int pri_damping, int sec_damping, - int coeff_shift); +void av1_cdef_filter_fb(uint8_t *dst8, uint16_t *dst16, int dstride, + uint16_t *in, int xdec, 
int ydec, + int dir[CDEF_NBLOCKS][CDEF_NBLOCKS], int *dirinit, + int var[CDEF_NBLOCKS][CDEF_NBLOCKS], int pli, + cdef_list *dlist, int cdef_count, int level, + int sec_strength, int damping, int coeff_shift); #endif // AOM_AV1_COMMON_CDEF_BLOCK_H_ diff --git a/media/libaom/src/av1/common/cdef_block_simd.h b/media/libaom/src/av1/common/cdef_block_simd.h index 14587a023..5a52bc1e4 100644 --- a/media/libaom/src/av1/common/cdef_block_simd.h +++ b/media/libaom/src/av1/common/cdef_block_simd.h @@ -226,7 +226,6 @@ void SIMD_FUNC(cdef_filter_block_4x4_8)(uint8_t *dst, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, - AOM_UNUSED int max_unused, int coeff_shift) { v128 p0, p1, p2, p3; v256 sum, row, tap, res; @@ -239,7 +238,7 @@ void SIMD_FUNC(cdef_filter_block_4x4_8)(uint8_t *dst, int dstride, int s2o2 = cdef_directions[(dir + 6) & 7][1]; const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1]; - const int *sec_taps = cdef_sec_taps[(pri_strength >> coeff_shift) & 1]; + const int *sec_taps = cdef_sec_taps; if (pri_strength) pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength)); @@ -393,7 +392,6 @@ void SIMD_FUNC(cdef_filter_block_8x8_8)(uint8_t *dst, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, - AOM_UNUSED int max_unused, int coeff_shift) { int i; v128 p0, p1, p2, p3; @@ -407,7 +405,7 @@ void SIMD_FUNC(cdef_filter_block_8x8_8)(uint8_t *dst, int dstride, int s2o2 = cdef_directions[(dir + 6) & 7][1]; const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1]; - const int *sec_taps = cdef_sec_taps[(pri_strength >> coeff_shift) & 1]; + const int *sec_taps = cdef_sec_taps; if (pri_strength) pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength)); @@ -541,7 +539,6 @@ void SIMD_FUNC(cdef_filter_block_4x4_16)(uint16_t *dst, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int 
pri_damping, int sec_damping, - AOM_UNUSED int max_unused, int coeff_shift) { int i; v256 p0, p1, p2, p3, sum, row, res; @@ -554,7 +551,7 @@ void SIMD_FUNC(cdef_filter_block_4x4_16)(uint16_t *dst, int dstride, int s2o2 = cdef_directions[(dir + 6) & 7][1]; const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1]; - const int *sec_taps = cdef_sec_taps[(pri_strength >> coeff_shift) & 1]; + const int *sec_taps = cdef_sec_taps; if (pri_strength) pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength)); @@ -699,7 +696,6 @@ void SIMD_FUNC(cdef_filter_block_8x8_16)(uint16_t *dst, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, - AOM_UNUSED int max_unused, int coeff_shift) { int i; v256 sum, p0, p1, p2, p3, row, res; @@ -712,7 +708,7 @@ void SIMD_FUNC(cdef_filter_block_8x8_16)(uint16_t *dst, int dstride, int s2o2 = cdef_directions[(dir + 6) & 7][1]; const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1]; - const int *sec_taps = cdef_sec_taps[(pri_strength >> coeff_shift) & 1]; + const int *sec_taps = cdef_sec_taps; if (pri_strength) pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength)); @@ -833,63 +829,62 @@ void SIMD_FUNC(cdef_filter_block_8x8_16)(uint16_t *dst, int dstride, void SIMD_FUNC(cdef_filter_block)(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, - int sec_damping, int bsize, int max, - int coeff_shift) { + int sec_damping, int bsize, int coeff_shift) { if (dst8) { if (bsize == BLOCK_8X8) { SIMD_FUNC(cdef_filter_block_8x8_8) (dst8, dstride, in, pri_strength, sec_strength, dir, pri_damping, - sec_damping, max, coeff_shift); + sec_damping, coeff_shift); } else if (bsize == BLOCK_4X8) { SIMD_FUNC(cdef_filter_block_4x4_8) (dst8, dstride, in, pri_strength, sec_strength, dir, pri_damping, - sec_damping, max, coeff_shift); + sec_damping, coeff_shift); SIMD_FUNC(cdef_filter_block_4x4_8) 
(dst8 + 4 * dstride, dstride, in + 4 * CDEF_BSTRIDE, pri_strength, - sec_strength, dir, pri_damping, sec_damping, max, coeff_shift); + sec_strength, dir, pri_damping, sec_damping, coeff_shift); } else if (bsize == BLOCK_8X4) { SIMD_FUNC(cdef_filter_block_4x4_8) (dst8, dstride, in, pri_strength, sec_strength, dir, pri_damping, - sec_damping, max, coeff_shift); + sec_damping, coeff_shift); SIMD_FUNC(cdef_filter_block_4x4_8) (dst8 + 4, dstride, in + 4, pri_strength, sec_strength, dir, pri_damping, - sec_damping, max, coeff_shift); + sec_damping, coeff_shift); } else { SIMD_FUNC(cdef_filter_block_4x4_8) (dst8, dstride, in, pri_strength, sec_strength, dir, pri_damping, - sec_damping, max, coeff_shift); + sec_damping, coeff_shift); } } else { if (bsize == BLOCK_8X8) { SIMD_FUNC(cdef_filter_block_8x8_16) (dst16, dstride, in, pri_strength, sec_strength, dir, pri_damping, - sec_damping, max, coeff_shift); + sec_damping, coeff_shift); } else if (bsize == BLOCK_4X8) { SIMD_FUNC(cdef_filter_block_4x4_16) (dst16, dstride, in, pri_strength, sec_strength, dir, pri_damping, - sec_damping, max, coeff_shift); + sec_damping, coeff_shift); SIMD_FUNC(cdef_filter_block_4x4_16) (dst16 + 4 * dstride, dstride, in + 4 * CDEF_BSTRIDE, pri_strength, - sec_strength, dir, pri_damping, sec_damping, max, coeff_shift); + sec_strength, dir, pri_damping, sec_damping, coeff_shift); } else if (bsize == BLOCK_8X4) { SIMD_FUNC(cdef_filter_block_4x4_16) (dst16, dstride, in, pri_strength, sec_strength, dir, pri_damping, - sec_damping, max, coeff_shift); + sec_damping, coeff_shift); SIMD_FUNC(cdef_filter_block_4x4_16) (dst16 + 4, dstride, in + 4, pri_strength, sec_strength, dir, pri_damping, - sec_damping, max, coeff_shift); + sec_damping, coeff_shift); } else { assert(bsize == BLOCK_4X4); SIMD_FUNC(cdef_filter_block_4x4_16) (dst16, dstride, in, pri_strength, sec_strength, dir, pri_damping, - sec_damping, max, coeff_shift); + sec_damping, coeff_shift); } } } -void 
SIMD_FUNC(copy_rect8_8bit_to_16bit)(uint16_t *dst, int dstride, - const uint8_t *src, int sstride, int v, - int h) { +void SIMD_FUNC(cdef_copy_rect8_8bit_to_16bit)(uint16_t *dst, int dstride, + const uint8_t *src, int sstride, + int v, int h) { int i, j; for (i = 0; i < v; i++) { for (j = 0; j < (h & ~0x7); j += 8) { @@ -902,9 +897,9 @@ void SIMD_FUNC(copy_rect8_8bit_to_16bit)(uint16_t *dst, int dstride, } } -void SIMD_FUNC(copy_rect8_16bit_to_16bit)(uint16_t *dst, int dstride, - const uint16_t *src, int sstride, - int v, int h) { +void SIMD_FUNC(cdef_copy_rect8_16bit_to_16bit)(uint16_t *dst, int dstride, + const uint16_t *src, int sstride, + int v, int h) { int i, j; for (i = 0; i < v; i++) { for (j = 0; j < (h & ~0x7); j += 8) { diff --git a/media/libaom/src/av1/common/cfl.c b/media/libaom/src/av1/common/cfl.c index ccc59b4eb..98199cb95 100644 --- a/media/libaom/src/av1/common/cfl.c +++ b/media/libaom/src/av1/common/cfl.c @@ -9,9 +9,9 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ +#include "av1/common/av1_common_int.h" #include "av1/common/cfl.h" #include "av1/common/common_data.h" -#include "av1/common/onyxc_int.h" #include "config/av1_rtcd.h" @@ -37,7 +37,7 @@ void cfl_store_dc_pred(MACROBLOCKD *const xd, const uint8_t *input, assert(pred_plane < CFL_PRED_PLANES); assert(width <= CFL_BUF_LINE); - if (get_bitdepth_data_path_index(xd)) { + if (is_cur_buf_hbd(xd)) { uint16_t *const input_16 = CONVERT_TO_SHORTPTR(input); memcpy(xd->cfl.dc_pred_cache[pred_plane], input_16, width << 1); return; @@ -69,7 +69,7 @@ void cfl_load_dc_pred(MACROBLOCKD *const xd, uint8_t *dst, int dst_stride, assert(pred_plane < CFL_PRED_PLANES); assert(width <= CFL_BUF_LINE); assert(height <= CFL_BUF_LINE); - if (get_bitdepth_data_path_index(xd)) { + if (is_cur_buf_hbd(xd)) { uint16_t *dst_16 = CONVERT_TO_SHORTPTR(dst); cfl_load_dc_pred_hbd(xd->cfl.dc_pred_cache[pred_plane], dst_16, dst_stride, width, height); @@ -136,7 +136,7 @@ static void subtract_average_c(const uint16_t *src, int16_t *dst, int width, CFL_SUB_AVG_FN(c) -static INLINE int cfl_idx_to_alpha(int alpha_idx, int joint_sign, +static INLINE int cfl_idx_to_alpha(uint8_t alpha_idx, int8_t joint_sign, CFL_PRED_TYPE pred_type) { const int alpha_sign = (pred_type == CFL_PRED_U) ? 
CFL_SIGN_U(joint_sign) : CFL_SIGN_V(joint_sign); @@ -158,18 +158,9 @@ static INLINE void cfl_predict_lbd_c(const int16_t *ac_buf_q3, uint8_t *dst, } } -// Null function used for invalid tx_sizes -void cfl_predict_lbd_null(const int16_t *ac_buf_q3, uint8_t *dst, - int dst_stride, int alpha_q3) { - (void)ac_buf_q3; - (void)dst; - (void)dst_stride; - (void)alpha_q3; - assert(0); -} - CFL_PREDICT_FN(c, lbd) +#if CONFIG_AV1_HIGHBITDEPTH void cfl_predict_hbd_c(const int16_t *ac_buf_q3, uint16_t *dst, int dst_stride, int alpha_q3, int bit_depth, int width, int height) { for (int j = 0; j < height; j++) { @@ -182,18 +173,8 @@ void cfl_predict_hbd_c(const int16_t *ac_buf_q3, uint16_t *dst, int dst_stride, } } -// Null function used for invalid tx_sizes -void cfl_predict_hbd_null(const int16_t *ac_buf_q3, uint16_t *dst, - int dst_stride, int alpha_q3, int bd) { - (void)ac_buf_q3; - (void)dst; - (void)dst_stride; - (void)alpha_q3; - (void)bd; - assert(0); -} - CFL_PREDICT_FN(c, hbd) +#endif static void cfl_compute_parameters(MACROBLOCKD *const xd, TX_SIZE tx_size) { CFL_CTX *const cfl = &xd->cfl; @@ -201,7 +182,7 @@ static void cfl_compute_parameters(MACROBLOCKD *const xd, TX_SIZE tx_size) { assert(cfl->are_parameters_computed == 0); cfl_pad(cfl, tx_size_wide[tx_size], tx_size_high[tx_size]); - get_subtract_average_fn(tx_size)(cfl->recon_buf_q3, cfl->ac_buf_q3); + cfl_get_subtract_average_fn(tx_size)(cfl->recon_buf_q3, cfl->ac_buf_q3); cfl->are_parameters_computed = 1; } @@ -217,31 +198,15 @@ void cfl_predict_block(MACROBLOCKD *const xd, uint8_t *dst, int dst_stride, cfl_idx_to_alpha(mbmi->cfl_alpha_idx, mbmi->cfl_alpha_signs, plane - 1); assert((tx_size_high[tx_size] - 1) * CFL_BUF_LINE + tx_size_wide[tx_size] <= CFL_BUF_SQUARE); - if (get_bitdepth_data_path_index(xd)) { +#if CONFIG_AV1_HIGHBITDEPTH + if (is_cur_buf_hbd(xd)) { uint16_t *dst_16 = CONVERT_TO_SHORTPTR(dst); - get_predict_hbd_fn(tx_size)(cfl->ac_buf_q3, dst_16, dst_stride, alpha_q3, - xd->bd); + 
cfl_get_predict_hbd_fn(tx_size)(cfl->ac_buf_q3, dst_16, dst_stride, + alpha_q3, xd->bd); return; } - get_predict_lbd_fn(tx_size)(cfl->ac_buf_q3, dst, dst_stride, alpha_q3); -} - -// Null function used for invalid tx_sizes -void cfl_subsample_lbd_null(const uint8_t *input, int input_stride, - uint16_t *output_q3) { - (void)input; - (void)input_stride; - (void)output_q3; - assert(0); -} - -// Null function used for invalid tx_sizes -void cfl_subsample_hbd_null(const uint16_t *input, int input_stride, - uint16_t *output_q3) { - (void)input; - (void)input_stride; - (void)output_q3; - assert(0); +#endif + cfl_get_predict_lbd_fn(tx_size)(cfl->ac_buf_q3, dst, dst_stride, alpha_q3); } static void cfl_luma_subsampling_420_lbd_c(const uint8_t *input, @@ -287,6 +252,7 @@ static void cfl_luma_subsampling_444_lbd_c(const uint8_t *input, } } +#if CONFIG_AV1_HIGHBITDEPTH static void cfl_luma_subsampling_420_hbd_c(const uint16_t *input, int input_stride, uint16_t *output_q3, int width, @@ -329,9 +295,11 @@ static void cfl_luma_subsampling_444_hbd_c(const uint16_t *input, output_q3 += CFL_BUF_LINE; } } +#endif CFL_GET_SUBSAMPLE_FUNCTION(c) +#if CONFIG_AV1_HIGHBITDEPTH static INLINE cfl_subsample_hbd_fn cfl_subsampling_hbd(TX_SIZE tx_size, int sub_x, int sub_y) { if (sub_x == 1) { @@ -342,6 +310,7 @@ static INLINE cfl_subsample_hbd_fn cfl_subsampling_hbd(TX_SIZE tx_size, } return cfl_get_luma_subsampling_444_hbd(tx_size); } +#endif static INLINE cfl_subsample_lbd_fn cfl_subsampling_lbd(TX_SIZE tx_size, int sub_x, int sub_y) { @@ -358,7 +327,7 @@ static void cfl_store(CFL_CTX *cfl, const uint8_t *input, int input_stride, int row, int col, TX_SIZE tx_size, int use_hbd) { const int width = tx_size_wide[tx_size]; const int height = tx_size_high[tx_size]; - const int tx_off_log2 = tx_size_wide_log2[0]; + const int tx_off_log2 = MI_SIZE_LOG2; const int sub_x = cfl->subsampling_x; const int sub_y = cfl->subsampling_y; const int store_row = row << (tx_off_log2 - sub_y); @@ -387,7 +356,7 @@ 
static void cfl_store(CFL_CTX *cfl, const uint8_t *input, int input_stride, // Store the input into the CfL pixel buffer uint16_t *recon_buf_q3 = cfl->recon_buf_q3 + (store_row * CFL_BUF_LINE + store_col); - +#if CONFIG_AV1_HIGHBITDEPTH if (use_hbd) { cfl_subsampling_hbd(tx_size, sub_x, sub_y)(CONVERT_TO_SHORTPTR(input), input_stride, recon_buf_q3); @@ -395,20 +364,25 @@ static void cfl_store(CFL_CTX *cfl, const uint8_t *input, int input_stride, cfl_subsampling_lbd(tx_size, sub_x, sub_y)(input, input_stride, recon_buf_q3); } +#else + (void)use_hbd; + cfl_subsampling_lbd(tx_size, sub_x, sub_y)(input, input_stride, recon_buf_q3); +#endif } // Adjust the row and column of blocks smaller than 8X8, as chroma-referenced // and non-chroma-referenced blocks are stored together in the CfL buffer. -static INLINE void sub8x8_adjust_offset(const CFL_CTX *cfl, int *row_out, +static INLINE void sub8x8_adjust_offset(const CFL_CTX *cfl, int mi_row, + int mi_col, int *row_out, int *col_out) { // Increment row index for bottom: 8x4, 16x4 or both bottom 4x4s. - if ((cfl->mi_row & 0x01) && cfl->subsampling_y) { + if ((mi_row & 0x01) && cfl->subsampling_y) { assert(*row_out == 0); (*row_out)++; } // Increment col index for right: 4x8, 4x16 or both right 4x4s. - if ((cfl->mi_col & 0x01) && cfl->subsampling_x) { + if ((mi_col & 0x01) && cfl->subsampling_x) { assert(*col_out == 0); (*col_out)++; } @@ -418,17 +392,31 @@ void cfl_store_tx(MACROBLOCKD *const xd, int row, int col, TX_SIZE tx_size, BLOCK_SIZE bsize) { CFL_CTX *const cfl = &xd->cfl; struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_Y]; - uint8_t *dst = - &pd->dst.buf[(row * pd->dst.stride + col) << tx_size_wide_log2[0]]; + uint8_t *dst = &pd->dst.buf[(row * pd->dst.stride + col) << MI_SIZE_LOG2]; if (block_size_high[bsize] == 4 || block_size_wide[bsize] == 4) { // Only dimensions of size 4 can have an odd offset. 
assert(!((col & 1) && tx_size_wide[tx_size] != 4)); assert(!((row & 1) && tx_size_high[tx_size] != 4)); - sub8x8_adjust_offset(cfl, &row, &col); + sub8x8_adjust_offset(cfl, xd->mi_row, xd->mi_col, &row, &col); } - cfl_store(cfl, dst, pd->dst.stride, row, col, tx_size, - get_bitdepth_data_path_index(xd)); + cfl_store(cfl, dst, pd->dst.stride, row, col, tx_size, is_cur_buf_hbd(xd)); +} + +static INLINE int max_intra_block_width(const MACROBLOCKD *xd, + BLOCK_SIZE plane_bsize, int plane, + TX_SIZE tx_size) { + const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane) + << MI_SIZE_LOG2; + return ALIGN_POWER_OF_TWO(max_blocks_wide, tx_size_wide_log2[tx_size]); +} + +static INLINE int max_intra_block_height(const MACROBLOCKD *xd, + BLOCK_SIZE plane_bsize, int plane, + TX_SIZE tx_size) { + const int max_blocks_high = max_block_high(xd, plane_bsize, plane) + << MI_SIZE_LOG2; + return ALIGN_POWER_OF_TWO(max_blocks_high, tx_size_high_log2[tx_size]); } void cfl_store_block(MACROBLOCKD *const xd, BLOCK_SIZE bsize, TX_SIZE tx_size) { @@ -438,11 +426,11 @@ void cfl_store_block(MACROBLOCKD *const xd, BLOCK_SIZE bsize, TX_SIZE tx_size) { int col = 0; if (block_size_high[bsize] == 4 || block_size_wide[bsize] == 4) { - sub8x8_adjust_offset(cfl, &row, &col); + sub8x8_adjust_offset(cfl, xd->mi_row, xd->mi_col, &row, &col); } const int width = max_intra_block_width(xd, bsize, AOM_PLANE_Y, tx_size); const int height = max_intra_block_height(xd, bsize, AOM_PLANE_Y, tx_size); tx_size = get_tx_size(width, height); cfl_store(cfl, pd->dst.buf, pd->dst.stride, row, col, tx_size, - get_bitdepth_data_path_index(xd)); + is_cur_buf_hbd(xd)); } diff --git a/media/libaom/src/av1/common/cfl.h b/media/libaom/src/av1/common/cfl.h index d627891bf..a1d6dc2ea 100644 --- a/media/libaom/src/av1/common/cfl.h +++ b/media/libaom/src/av1/common/cfl.h @@ -12,8 +12,8 @@ #ifndef AOM_AV1_COMMON_CFL_H_ #define AOM_AV1_COMMON_CFL_H_ +#include "av1/common/av1_common_int.h" #include "av1/common/blockd.h" 
-#include "av1/common/onyxc_int.h" // Can we use CfL for the current block? static INLINE CFL_ALLOWED_TYPE is_cfl_allowed(const MACROBLOCKD *xd) { @@ -41,7 +41,7 @@ static INLINE CFL_ALLOWED_TYPE store_cfl_required(const AV1_COMMON *cm, if (cm->seq_params.monochrome) return CFL_DISALLOWED; - if (!xd->cfl.is_chroma_reference) { + if (!xd->is_chroma_ref) { // For non-chroma-reference blocks, we should always store the luma pixels, // in case the corresponding chroma-reference block uses CfL. // Note that this can only happen for block sizes which are <8 on @@ -80,14 +80,6 @@ void cfl_store_dc_pred(MACROBLOCKD *const xd, const uint8_t *input, void cfl_load_dc_pred(MACROBLOCKD *const xd, uint8_t *dst, int dst_stride, TX_SIZE tx_size, CFL_PRED_TYPE pred_plane); -// Null function used for invalid tx_sizes -void cfl_subsample_lbd_null(const uint8_t *input, int input_stride, - uint16_t *output_q3); - -// Null function used for invalid tx_sizes -void cfl_subsample_hbd_null(const uint16_t *input, int input_stride, - uint16_t *output_q3); - // Allows the CFL_SUBSAMPLE function to switch types depending on the bitdepth. #define CFL_lbd_TYPE uint8_t *cfl_type #define CFL_hbd_TYPE uint16_t *cfl_type @@ -97,7 +89,7 @@ void cfl_subsample_hbd_null(const uint16_t *input, int input_stride, // will be constant allowing for loop unrolling and other constant propagated // goodness. #define CFL_SUBSAMPLE(arch, sub, bd, width, height) \ - void subsample_##bd##_##sub##_##width##x##height##_##arch( \ + void cfl_subsample_##bd##_##sub##_##width##x##height##_##arch( \ const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) { \ cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride, \ output_q3, width, height); \ @@ -127,31 +119,32 @@ void cfl_subsample_hbd_null(const uint16_t *input, int input_stride, // Declare an architecture-specific array of function pointers for size-specific // wrappers. 
-#define CFL_SUBSAMPLE_FUNCTION_ARRAY(arch, sub, bd) \ - static const cfl_subsample_##bd##_fn subfn_##sub[TX_SIZES_ALL] = { \ - subsample_##bd##_##sub##_4x4_##arch, /* 4x4 */ \ - subsample_##bd##_##sub##_8x8_##arch, /* 8x8 */ \ - subsample_##bd##_##sub##_16x16_##arch, /* 16x16 */ \ - subsample_##bd##_##sub##_32x32_##arch, /* 32x32 */ \ - cfl_subsample_##bd##_null, /* 64x64 (invalid CFL size) */ \ - subsample_##bd##_##sub##_4x8_##arch, /* 4x8 */ \ - subsample_##bd##_##sub##_8x4_##arch, /* 8x4 */ \ - subsample_##bd##_##sub##_8x16_##arch, /* 8x16 */ \ - subsample_##bd##_##sub##_16x8_##arch, /* 16x8 */ \ - subsample_##bd##_##sub##_16x32_##arch, /* 16x32 */ \ - subsample_##bd##_##sub##_32x16_##arch, /* 32x16 */ \ - cfl_subsample_##bd##_null, /* 32x64 (invalid CFL size) */ \ - cfl_subsample_##bd##_null, /* 64x32 (invalid CFL size) */ \ - subsample_##bd##_##sub##_4x16_##arch, /* 4x16 */ \ - subsample_##bd##_##sub##_16x4_##arch, /* 16x4 */ \ - subsample_##bd##_##sub##_8x32_##arch, /* 8x32 */ \ - subsample_##bd##_##sub##_32x8_##arch, /* 32x8 */ \ - cfl_subsample_##bd##_null, /* 16x64 (invalid CFL size) */ \ - cfl_subsample_##bd##_null, /* 64x16 (invalid CFL size) */ \ +#define CFL_SUBSAMPLE_FUNCTION_ARRAY(arch, sub, bd) \ + static const cfl_subsample_##bd##_fn subfn_##sub[TX_SIZES_ALL] = { \ + cfl_subsample_##bd##_##sub##_4x4_##arch, /* 4x4 */ \ + cfl_subsample_##bd##_##sub##_8x8_##arch, /* 8x8 */ \ + cfl_subsample_##bd##_##sub##_16x16_##arch, /* 16x16 */ \ + cfl_subsample_##bd##_##sub##_32x32_##arch, /* 32x32 */ \ + NULL, /* 64x64 (invalid CFL size) */ \ + cfl_subsample_##bd##_##sub##_4x8_##arch, /* 4x8 */ \ + cfl_subsample_##bd##_##sub##_8x4_##arch, /* 8x4 */ \ + cfl_subsample_##bd##_##sub##_8x16_##arch, /* 8x16 */ \ + cfl_subsample_##bd##_##sub##_16x8_##arch, /* 16x8 */ \ + cfl_subsample_##bd##_##sub##_16x32_##arch, /* 16x32 */ \ + cfl_subsample_##bd##_##sub##_32x16_##arch, /* 32x16 */ \ + NULL, /* 32x64 (invalid CFL size) */ \ + NULL, /* 64x32 (invalid CFL size) */ \ + 
cfl_subsample_##bd##_##sub##_4x16_##arch, /* 4x16 */ \ + cfl_subsample_##bd##_##sub##_16x4_##arch, /* 16x4 */ \ + cfl_subsample_##bd##_##sub##_8x32_##arch, /* 8x32 */ \ + cfl_subsample_##bd##_##sub##_32x8_##arch, /* 32x8 */ \ + NULL, /* 16x64 (invalid CFL size) */ \ + NULL, /* 64x16 (invalid CFL size) */ \ }; // The RTCD script does not support passing in an array, so we wrap it in this // function. +#if CONFIG_AV1_HIGHBITDEPTH #define CFL_GET_SUBSAMPLE_FUNCTION(arch) \ CFL_SUBSAMPLE_FUNCTIONS(arch, 420, lbd) \ CFL_SUBSAMPLE_FUNCTIONS(arch, 422, lbd) \ @@ -159,144 +152,137 @@ void cfl_subsample_hbd_null(const uint16_t *input, int input_stride, CFL_SUBSAMPLE_FUNCTIONS(arch, 420, hbd) \ CFL_SUBSAMPLE_FUNCTIONS(arch, 422, hbd) \ CFL_SUBSAMPLE_FUNCTIONS(arch, 444, hbd) - -// Null function used for invalid tx_sizes -static INLINE void cfl_subtract_average_null(const uint16_t *src, - int16_t *dst) { - (void)dst; - (void)src; - assert(0); -} +#else +#define CFL_GET_SUBSAMPLE_FUNCTION(arch) \ + CFL_SUBSAMPLE_FUNCTIONS(arch, 420, lbd) \ + CFL_SUBSAMPLE_FUNCTIONS(arch, 422, lbd) \ + CFL_SUBSAMPLE_FUNCTIONS(arch, 444, lbd) +#endif // Declare a size-specific wrapper for the size-generic function. The compiler // will inline the size generic function in here, the advantage is that the size // will be constant allowing for loop unrolling and other constant propagated // goodness. -#define CFL_SUB_AVG_X(arch, width, height, round_offset, num_pel_log2) \ - void subtract_average_##width##x##height##_##arch(const uint16_t *src, \ - int16_t *dst) { \ - subtract_average_##arch(src, dst, width, height, round_offset, \ - num_pel_log2); \ +#define CFL_SUB_AVG_X(arch, width, height, round_offset, num_pel_log2) \ + void cfl_subtract_average_##width##x##height##_##arch(const uint16_t *src, \ + int16_t *dst) { \ + subtract_average_##arch(src, dst, width, height, round_offset, \ + num_pel_log2); \ } // Declare size-specific wrappers for all valid CfL sizes. 
-#define CFL_SUB_AVG_FN(arch) \ - CFL_SUB_AVG_X(arch, 4, 4, 8, 4) \ - CFL_SUB_AVG_X(arch, 4, 8, 16, 5) \ - CFL_SUB_AVG_X(arch, 4, 16, 32, 6) \ - CFL_SUB_AVG_X(arch, 8, 4, 16, 5) \ - CFL_SUB_AVG_X(arch, 8, 8, 32, 6) \ - CFL_SUB_AVG_X(arch, 8, 16, 64, 7) \ - CFL_SUB_AVG_X(arch, 8, 32, 128, 8) \ - CFL_SUB_AVG_X(arch, 16, 4, 32, 6) \ - CFL_SUB_AVG_X(arch, 16, 8, 64, 7) \ - CFL_SUB_AVG_X(arch, 16, 16, 128, 8) \ - CFL_SUB_AVG_X(arch, 16, 32, 256, 9) \ - CFL_SUB_AVG_X(arch, 32, 8, 128, 8) \ - CFL_SUB_AVG_X(arch, 32, 16, 256, 9) \ - CFL_SUB_AVG_X(arch, 32, 32, 512, 10) \ - cfl_subtract_average_fn get_subtract_average_fn_##arch(TX_SIZE tx_size) { \ - static const cfl_subtract_average_fn sub_avg[TX_SIZES_ALL] = { \ - subtract_average_4x4_##arch, /* 4x4 */ \ - subtract_average_8x8_##arch, /* 8x8 */ \ - subtract_average_16x16_##arch, /* 16x16 */ \ - subtract_average_32x32_##arch, /* 32x32 */ \ - cfl_subtract_average_null, /* 64x64 (invalid CFL size) */ \ - subtract_average_4x8_##arch, /* 4x8 */ \ - subtract_average_8x4_##arch, /* 8x4 */ \ - subtract_average_8x16_##arch, /* 8x16 */ \ - subtract_average_16x8_##arch, /* 16x8 */ \ - subtract_average_16x32_##arch, /* 16x32 */ \ - subtract_average_32x16_##arch, /* 32x16 */ \ - cfl_subtract_average_null, /* 32x64 (invalid CFL size) */ \ - cfl_subtract_average_null, /* 64x32 (invalid CFL size) */ \ - subtract_average_4x16_##arch, /* 4x16 (invalid CFL size) */ \ - subtract_average_16x4_##arch, /* 16x4 (invalid CFL size) */ \ - subtract_average_8x32_##arch, /* 8x32 (invalid CFL size) */ \ - subtract_average_32x8_##arch, /* 32x8 (invalid CFL size) */ \ - cfl_subtract_average_null, /* 16x64 (invalid CFL size) */ \ - cfl_subtract_average_null, /* 64x16 (invalid CFL size) */ \ - }; \ - /* Modulo TX_SIZES_ALL to ensure that an attacker won't be able to */ \ - /* index the function pointer array out of bounds. 
*/ \ - return sub_avg[tx_size % TX_SIZES_ALL]; \ +#define CFL_SUB_AVG_FN(arch) \ + CFL_SUB_AVG_X(arch, 4, 4, 8, 4) \ + CFL_SUB_AVG_X(arch, 4, 8, 16, 5) \ + CFL_SUB_AVG_X(arch, 4, 16, 32, 6) \ + CFL_SUB_AVG_X(arch, 8, 4, 16, 5) \ + CFL_SUB_AVG_X(arch, 8, 8, 32, 6) \ + CFL_SUB_AVG_X(arch, 8, 16, 64, 7) \ + CFL_SUB_AVG_X(arch, 8, 32, 128, 8) \ + CFL_SUB_AVG_X(arch, 16, 4, 32, 6) \ + CFL_SUB_AVG_X(arch, 16, 8, 64, 7) \ + CFL_SUB_AVG_X(arch, 16, 16, 128, 8) \ + CFL_SUB_AVG_X(arch, 16, 32, 256, 9) \ + CFL_SUB_AVG_X(arch, 32, 8, 128, 8) \ + CFL_SUB_AVG_X(arch, 32, 16, 256, 9) \ + CFL_SUB_AVG_X(arch, 32, 32, 512, 10) \ + cfl_subtract_average_fn cfl_get_subtract_average_fn_##arch( \ + TX_SIZE tx_size) { \ + static const cfl_subtract_average_fn sub_avg[TX_SIZES_ALL] = { \ + cfl_subtract_average_4x4_##arch, /* 4x4 */ \ + cfl_subtract_average_8x8_##arch, /* 8x8 */ \ + cfl_subtract_average_16x16_##arch, /* 16x16 */ \ + cfl_subtract_average_32x32_##arch, /* 32x32 */ \ + NULL, /* 64x64 (invalid CFL size) */ \ + cfl_subtract_average_4x8_##arch, /* 4x8 */ \ + cfl_subtract_average_8x4_##arch, /* 8x4 */ \ + cfl_subtract_average_8x16_##arch, /* 8x16 */ \ + cfl_subtract_average_16x8_##arch, /* 16x8 */ \ + cfl_subtract_average_16x32_##arch, /* 16x32 */ \ + cfl_subtract_average_32x16_##arch, /* 32x16 */ \ + NULL, /* 32x64 (invalid CFL size) */ \ + NULL, /* 64x32 (invalid CFL size) */ \ + cfl_subtract_average_4x16_##arch, /* 4x16 (invalid CFL size) */ \ + cfl_subtract_average_16x4_##arch, /* 16x4 (invalid CFL size) */ \ + cfl_subtract_average_8x32_##arch, /* 8x32 (invalid CFL size) */ \ + cfl_subtract_average_32x8_##arch, /* 32x8 (invalid CFL size) */ \ + NULL, /* 16x64 (invalid CFL size) */ \ + NULL, /* 64x16 (invalid CFL size) */ \ + }; \ + /* Modulo TX_SIZES_ALL to ensure that an attacker won't be able to */ \ + /* index the function pointer array out of bounds. 
*/ \ + return sub_avg[tx_size % TX_SIZES_ALL]; \ } // For VSX SIMD optimization, the C versions of width == 4 subtract are // faster than the VSX. As such, the VSX code calls the C versions. -void subtract_average_4x4_c(const uint16_t *src, int16_t *dst); -void subtract_average_4x8_c(const uint16_t *src, int16_t *dst); -void subtract_average_4x16_c(const uint16_t *src, int16_t *dst); - -#define CFL_PREDICT_lbd(arch, width, height) \ - void predict_lbd_##width##x##height##_##arch(const int16_t *pred_buf_q3, \ - uint8_t *dst, int dst_stride, \ - int alpha_q3) { \ - cfl_predict_lbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, width, \ - height); \ +void cfl_subtract_average_4x4_c(const uint16_t *src, int16_t *dst); +void cfl_subtract_average_4x8_c(const uint16_t *src, int16_t *dst); +void cfl_subtract_average_4x16_c(const uint16_t *src, int16_t *dst); + +#define CFL_PREDICT_lbd(arch, width, height) \ + void cfl_predict_lbd_##width##x##height##_##arch( \ + const int16_t *pred_buf_q3, uint8_t *dst, int dst_stride, \ + int alpha_q3) { \ + cfl_predict_lbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, width, \ + height); \ } -#define CFL_PREDICT_hbd(arch, width, height) \ - void predict_hbd_##width##x##height##_##arch(const int16_t *pred_buf_q3, \ - uint16_t *dst, int dst_stride, \ - int alpha_q3, int bd) { \ - cfl_predict_hbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, bd, width, \ - height); \ +#if CONFIG_AV1_HIGHBITDEPTH +#define CFL_PREDICT_hbd(arch, width, height) \ + void cfl_predict_hbd_##width##x##height##_##arch( \ + const int16_t *pred_buf_q3, uint16_t *dst, int dst_stride, int alpha_q3, \ + int bd) { \ + cfl_predict_hbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, bd, width, \ + height); \ } +#endif // This wrapper exists because clang format does not like calling macros with // lowercase letters. 
#define CFL_PREDICT_X(arch, width, height, bd) \ CFL_PREDICT_##bd(arch, width, height) -// Null function used for invalid tx_sizes -void cfl_predict_lbd_null(const int16_t *pred_buf_q3, uint8_t *dst, - int dst_stride, int alpha_q3); - -// Null function used for invalid tx_sizes -void cfl_predict_hbd_null(const int16_t *pred_buf_q3, uint16_t *dst, - int dst_stride, int alpha_q3, int bd); - -#define CFL_PREDICT_FN(arch, bd) \ - CFL_PREDICT_X(arch, 4, 4, bd) \ - CFL_PREDICT_X(arch, 4, 8, bd) \ - CFL_PREDICT_X(arch, 4, 16, bd) \ - CFL_PREDICT_X(arch, 8, 4, bd) \ - CFL_PREDICT_X(arch, 8, 8, bd) \ - CFL_PREDICT_X(arch, 8, 16, bd) \ - CFL_PREDICT_X(arch, 8, 32, bd) \ - CFL_PREDICT_X(arch, 16, 4, bd) \ - CFL_PREDICT_X(arch, 16, 8, bd) \ - CFL_PREDICT_X(arch, 16, 16, bd) \ - CFL_PREDICT_X(arch, 16, 32, bd) \ - CFL_PREDICT_X(arch, 32, 8, bd) \ - CFL_PREDICT_X(arch, 32, 16, bd) \ - CFL_PREDICT_X(arch, 32, 32, bd) \ - cfl_predict_##bd##_fn get_predict_##bd##_fn_##arch(TX_SIZE tx_size) { \ - static const cfl_predict_##bd##_fn pred[TX_SIZES_ALL] = { \ - predict_##bd##_4x4_##arch, /* 4x4 */ \ - predict_##bd##_8x8_##arch, /* 8x8 */ \ - predict_##bd##_16x16_##arch, /* 16x16 */ \ - predict_##bd##_32x32_##arch, /* 32x32 */ \ - cfl_predict_##bd##_null, /* 64x64 (invalid CFL size) */ \ - predict_##bd##_4x8_##arch, /* 4x8 */ \ - predict_##bd##_8x4_##arch, /* 8x4 */ \ - predict_##bd##_8x16_##arch, /* 8x16 */ \ - predict_##bd##_16x8_##arch, /* 16x8 */ \ - predict_##bd##_16x32_##arch, /* 16x32 */ \ - predict_##bd##_32x16_##arch, /* 32x16 */ \ - cfl_predict_##bd##_null, /* 32x64 (invalid CFL size) */ \ - cfl_predict_##bd##_null, /* 64x32 (invalid CFL size) */ \ - predict_##bd##_4x16_##arch, /* 4x16 */ \ - predict_##bd##_16x4_##arch, /* 16x4 */ \ - predict_##bd##_8x32_##arch, /* 8x32 */ \ - predict_##bd##_32x8_##arch, /* 32x8 */ \ - cfl_predict_##bd##_null, /* 16x64 (invalid CFL size) */ \ - cfl_predict_##bd##_null, /* 64x16 (invalid CFL size) */ \ - }; \ - /* Modulo TX_SIZES_ALL to ensure 
that an attacker won't be able to */ \ - /* index the function pointer array out of bounds. */ \ - return pred[tx_size % TX_SIZES_ALL]; \ +#define CFL_PREDICT_FN(arch, bd) \ + CFL_PREDICT_X(arch, 4, 4, bd) \ + CFL_PREDICT_X(arch, 4, 8, bd) \ + CFL_PREDICT_X(arch, 4, 16, bd) \ + CFL_PREDICT_X(arch, 8, 4, bd) \ + CFL_PREDICT_X(arch, 8, 8, bd) \ + CFL_PREDICT_X(arch, 8, 16, bd) \ + CFL_PREDICT_X(arch, 8, 32, bd) \ + CFL_PREDICT_X(arch, 16, 4, bd) \ + CFL_PREDICT_X(arch, 16, 8, bd) \ + CFL_PREDICT_X(arch, 16, 16, bd) \ + CFL_PREDICT_X(arch, 16, 32, bd) \ + CFL_PREDICT_X(arch, 32, 8, bd) \ + CFL_PREDICT_X(arch, 32, 16, bd) \ + CFL_PREDICT_X(arch, 32, 32, bd) \ + cfl_predict_##bd##_fn cfl_get_predict_##bd##_fn_##arch(TX_SIZE tx_size) { \ + static const cfl_predict_##bd##_fn pred[TX_SIZES_ALL] = { \ + cfl_predict_##bd##_4x4_##arch, /* 4x4 */ \ + cfl_predict_##bd##_8x8_##arch, /* 8x8 */ \ + cfl_predict_##bd##_16x16_##arch, /* 16x16 */ \ + cfl_predict_##bd##_32x32_##arch, /* 32x32 */ \ + NULL, /* 64x64 (invalid CFL size) */ \ + cfl_predict_##bd##_4x8_##arch, /* 4x8 */ \ + cfl_predict_##bd##_8x4_##arch, /* 8x4 */ \ + cfl_predict_##bd##_8x16_##arch, /* 8x16 */ \ + cfl_predict_##bd##_16x8_##arch, /* 16x8 */ \ + cfl_predict_##bd##_16x32_##arch, /* 16x32 */ \ + cfl_predict_##bd##_32x16_##arch, /* 32x16 */ \ + NULL, /* 32x64 (invalid CFL size) */ \ + NULL, /* 64x32 (invalid CFL size) */ \ + cfl_predict_##bd##_4x16_##arch, /* 4x16 */ \ + cfl_predict_##bd##_16x4_##arch, /* 16x4 */ \ + cfl_predict_##bd##_8x32_##arch, /* 8x32 */ \ + cfl_predict_##bd##_32x8_##arch, /* 32x8 */ \ + NULL, /* 16x64 (invalid CFL size) */ \ + NULL, /* 64x16 (invalid CFL size) */ \ + }; \ + /* Modulo TX_SIZES_ALL to ensure that an attacker won't be able to */ \ + /* index the function pointer array out of bounds. 
*/ \ + return pred[tx_size % TX_SIZES_ALL]; \ } #endif // AOM_AV1_COMMON_CFL_H_ diff --git a/media/libaom/src/av1/common/common_data.h b/media/libaom/src/av1/common/common_data.h index 46e455fdb..402845caf 100644 --- a/media/libaom/src/av1/common/common_data.h +++ b/media/libaom/src/av1/common/common_data.h @@ -82,16 +82,16 @@ static const BLOCK_SIZE subsize_lookup[EXT_PARTITION_TYPES][SQR_BLOCK_SIZES] = { BLOCK_INVALID, BLOCK_4X4, BLOCK_8X8, BLOCK_16X16, BLOCK_32X32, BLOCK_64X64 }, { // PARTITION_HORZ_A - BLOCK_INVALID, BLOCK_8X4, BLOCK_16X8, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X8, BLOCK_32X16, BLOCK_64X32, BLOCK_128X64 }, { // PARTITION_HORZ_B - BLOCK_INVALID, BLOCK_8X4, BLOCK_16X8, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X8, BLOCK_32X16, BLOCK_64X32, BLOCK_128X64 }, { // PARTITION_VERT_A - BLOCK_INVALID, BLOCK_4X8, BLOCK_8X16, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_8X16, BLOCK_16X32, BLOCK_32X64, BLOCK_64X128 }, { // PARTITION_VERT_B - BLOCK_INVALID, BLOCK_4X8, BLOCK_8X16, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_8X16, BLOCK_16X32, BLOCK_32X64, BLOCK_64X128 }, { // PARTITION_HORZ_4 BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X4, diff --git a/media/libaom/src/av1/common/convolve.c b/media/libaom/src/av1/common/convolve.c index 1f11126fc..e177e3cad 100644 --- a/media/libaom/src/av1/common/convolve.c +++ b/media/libaom/src/av1/common/convolve.c @@ -15,10 +15,10 @@ #include "config/aom_dsp_rtcd.h" #include "config/av1_rtcd.h" +#include "av1/common/av1_common_int.h" #include "av1/common/blockd.h" #include "av1/common/convolve.h" #include "av1/common/filter.h" -#include "av1/common/onyxc_int.h" #include "av1/common/resize.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_ports/mem.h" @@ -73,15 +73,55 @@ void av1_highbd_convolve_horiz_rs_c(const uint16_t *src, int src_stride, } } +void av1_convolve_2d_sobel_y_c(const uint8_t *src, int src_stride, double *dst, + int dst_stride, int w, int h, int dir, + double norm) { + int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * 
MAX_SB_SIZE]; + DECLARE_ALIGNED(256, static const int16_t, sobel_a[3]) = { 1, 0, -1 }; + DECLARE_ALIGNED(256, static const int16_t, sobel_b[3]) = { 1, 2, 1 }; + const int taps = 3; + int im_h = h + taps - 1; + int im_stride = w; + const int fo_vert = 1; + const int fo_horiz = 1; + + // horizontal filter + const uint8_t *src_horiz = src - fo_vert * src_stride; + const int16_t *x_filter = dir ? sobel_a : sobel_b; + for (int y = 0; y < im_h; ++y) { + for (int x = 0; x < w; ++x) { + int16_t sum = 0; + for (int k = 0; k < taps; ++k) { + sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k]; + } + im_block[y * im_stride + x] = sum; + } + } + + // vertical filter + int16_t *src_vert = im_block + fo_vert * im_stride; + const int16_t *y_filter = dir ? sobel_b : sobel_a; + for (int y = 0; y < h; ++y) { + for (int x = 0; x < w; ++x) { + int16_t sum = 0; + for (int k = 0; k < taps; ++k) { + sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x]; + } + dst[y * dst_stride + x] = sum * norm; + } + } +} + void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, + const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params) { int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]; int im_h = h + filter_params_y->taps - 1; int im_stride = w; + assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE); const int fo_vert = filter_params_y->taps / 2 - 1; const int fo_horiz = filter_params_x->taps / 2 - 1; const int bd = 8; @@ -91,7 +131,7 @@ void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, // horizontal filter const uint8_t *src_horiz = src - fo_vert * src_stride; const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( - filter_params_x, subpel_x_q4 & SUBPEL_MASK); + filter_params_x, subpel_x_qn & SUBPEL_MASK); for (int y = 0; y < im_h; 
++y) { for (int x = 0; x < w; ++x) { int32_t sum = (1 << (bd + FILTER_BITS - 1)); @@ -107,7 +147,7 @@ void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, // vertical filter int16_t *src_vert = im_block + fo_vert * im_stride; const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( - filter_params_y, subpel_y_q4 & SUBPEL_MASK); + filter_params_y, subpel_y_qn & SUBPEL_MASK); const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; for (int y = 0; y < h; ++y) { for (int x = 0; x < w; ++x) { @@ -128,11 +168,11 @@ void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, + const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params) { const int fo_vert = filter_params_y->taps / 2 - 1; (void)filter_params_x; - (void)subpel_x_q4; + (void)subpel_x_qn; (void)conv_params; assert(conv_params->round_0 <= FILTER_BITS); @@ -141,7 +181,7 @@ void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, // vertical filter const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( - filter_params_y, subpel_y_q4 & SUBPEL_MASK); + filter_params_y, subpel_y_qn & SUBPEL_MASK); for (int y = 0; y < h; ++y) { for (int x = 0; x < w; ++x) { int32_t res = 0; @@ -158,12 +198,12 @@ void av1_convolve_x_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, + const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params) { const int fo_horiz = filter_params_x->taps / 2 - 1; const int bits = FILTER_BITS - conv_params->round_0; (void)filter_params_y; - (void)subpel_y_q4; + (void)subpel_y_qn; (void)conv_params; assert(bits >= 0); @@ -172,7 +212,7 @@ void 
av1_convolve_x_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, // horizontal filter const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( - filter_params_x, subpel_x_q4 & SUBPEL_MASK); + filter_params_x, subpel_x_qn & SUBPEL_MASK); for (int y = 0; y < h; ++y) { for (int x = 0; x < w; ++x) { @@ -190,27 +230,27 @@ void av1_convolve_2d_copy_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, + const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params) { (void)filter_params_x; (void)filter_params_y; - (void)subpel_x_q4; - (void)subpel_y_q4; + (void)subpel_x_qn; + (void)subpel_y_qn; (void)conv_params; for (int y = 0; y < h; ++y) { - memcpy(dst + y * dst_stride, src + y * src_stride, w * sizeof(src[0])); + memmove(dst + y * dst_stride, src + y * src_stride, w * sizeof(src[0])); } } -void av1_jnt_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst8, - int dst8_stride, int w, int h, - const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, - ConvolveParams *conv_params) { - CONV_BUF_TYPE *dst = conv_params->dst; - int dst_stride = conv_params->dst_stride; +void av1_dist_wtd_convolve_2d_c(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params) { + CONV_BUF_TYPE *dst16 = conv_params->dst; + int dst16_stride = conv_params->dst_stride; int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]; int im_h = h + filter_params_y->taps - 1; int im_stride = w; @@ -223,7 +263,7 @@ void av1_jnt_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst8, // horizontal filter const uint8_t 
*src_horiz = src - fo_vert * src_stride; const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( - filter_params_x, subpel_x_q4 & SUBPEL_MASK); + filter_params_x, subpel_x_qn & SUBPEL_MASK); for (int y = 0; y < im_h; ++y) { for (int x = 0; x < w; ++x) { int32_t sum = (1 << (bd + FILTER_BITS - 1)); @@ -239,7 +279,7 @@ void av1_jnt_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst8, // vertical filter int16_t *src_vert = im_block + fo_vert * im_stride; const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( - filter_params_y, subpel_y_q4 & SUBPEL_MASK); + filter_params_y, subpel_y_qn & SUBPEL_MASK); const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; for (int y = 0; y < h; ++y) { for (int x = 0; x < w; ++x) { @@ -250,8 +290,8 @@ void av1_jnt_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst8, assert(0 <= sum && sum < (1 << (offset_bits + 2))); CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1); if (conv_params->do_average) { - int32_t tmp = dst[y * dst_stride + x]; - if (conv_params->use_jnt_comp_avg) { + int32_t tmp = dst16[y * dst16_stride + x]; + if (conv_params->use_dist_wtd_comp_avg) { tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; tmp = tmp >> DIST_PRECISION_BITS; } else { @@ -260,23 +300,23 @@ void av1_jnt_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst8, } tmp -= (1 << (offset_bits - conv_params->round_1)) + (1 << (offset_bits - conv_params->round_1 - 1)); - dst8[y * dst8_stride + x] = + dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, round_bits)); } else { - dst[y * dst_stride + x] = res; + dst16[y * dst16_stride + x] = res; } } } } -void av1_jnt_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst8, - int dst8_stride, int w, int h, - const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, - ConvolveParams *conv_params) { - CONV_BUF_TYPE *dst = 
conv_params->dst; - int dst_stride = conv_params->dst_stride; +void av1_dist_wtd_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params) { + CONV_BUF_TYPE *dst16 = conv_params->dst; + int dst16_stride = conv_params->dst_stride; const int fo_vert = filter_params_y->taps / 2 - 1; const int bits = FILTER_BITS - conv_params->round_0; const int bd = 8; @@ -286,11 +326,11 @@ void av1_jnt_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst8, const int round_bits = 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; (void)filter_params_x; - (void)subpel_x_q4; + (void)subpel_x_qn; // vertical filter const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( - filter_params_y, subpel_y_q4 & SUBPEL_MASK); + filter_params_y, subpel_y_qn & SUBPEL_MASK); for (int y = 0; y < h; ++y) { for (int x = 0; x < w; ++x) { int32_t res = 0; @@ -301,8 +341,8 @@ void av1_jnt_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst8, res = ROUND_POWER_OF_TWO(res, conv_params->round_1) + round_offset; if (conv_params->do_average) { - int32_t tmp = dst[y * dst_stride + x]; - if (conv_params->use_jnt_comp_avg) { + int32_t tmp = dst16[y * dst16_stride + x]; + if (conv_params->use_dist_wtd_comp_avg) { tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; tmp = tmp >> DIST_PRECISION_BITS; } else { @@ -310,23 +350,23 @@ void av1_jnt_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst8, tmp = tmp >> 1; } tmp -= round_offset; - dst8[y * dst8_stride + x] = + dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, round_bits)); } else { - dst[y * dst_stride + x] = res; + dst16[y * dst16_stride + x] = res; } } } } -void av1_jnt_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst8, - int dst8_stride, int w, int h, - const 
InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, - ConvolveParams *conv_params) { - CONV_BUF_TYPE *dst = conv_params->dst; - int dst_stride = conv_params->dst_stride; +void av1_dist_wtd_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params) { + CONV_BUF_TYPE *dst16 = conv_params->dst; + int dst16_stride = conv_params->dst_stride; const int fo_horiz = filter_params_x->taps / 2 - 1; const int bits = FILTER_BITS - conv_params->round_1; const int bd = 8; @@ -336,11 +376,11 @@ void av1_jnt_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst8, const int round_bits = 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; (void)filter_params_y; - (void)subpel_y_q4; + (void)subpel_y_qn; // horizontal filter const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( - filter_params_x, subpel_x_q4 & SUBPEL_MASK); + filter_params_x, subpel_x_qn & SUBPEL_MASK); for (int y = 0; y < h; ++y) { for (int x = 0; x < w; ++x) { int32_t res = 0; @@ -351,8 +391,8 @@ void av1_jnt_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst8, res += round_offset; if (conv_params->do_average) { - int32_t tmp = dst[y * dst_stride + x]; - if (conv_params->use_jnt_comp_avg) { + int32_t tmp = dst16[y * dst16_stride + x]; + if (conv_params->use_dist_wtd_comp_avg) { tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; tmp = tmp >> DIST_PRECISION_BITS; } else { @@ -360,23 +400,24 @@ void av1_jnt_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst8, tmp = tmp >> 1; } tmp -= round_offset; - dst8[y * dst8_stride + x] = + dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, round_bits)); } else { - dst[y * dst_stride + x] = res; + dst16[y * dst16_stride + x] = 
res; } } } } -void av1_jnt_convolve_2d_copy_c(const uint8_t *src, int src_stride, - uint8_t *dst8, int dst8_stride, int w, int h, - const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, - ConvolveParams *conv_params) { - CONV_BUF_TYPE *dst = conv_params->dst; - int dst_stride = conv_params->dst_stride; +void av1_dist_wtd_convolve_2d_copy_c(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, + const int subpel_y_qn, + ConvolveParams *conv_params) { + CONV_BUF_TYPE *dst16 = conv_params->dst; + int dst16_stride = conv_params->dst_stride; const int bits = FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0; const int bd = 8; @@ -385,8 +426,8 @@ void av1_jnt_convolve_2d_copy_c(const uint8_t *src, int src_stride, (1 << (offset_bits - conv_params->round_1 - 1)); (void)filter_params_x; (void)filter_params_y; - (void)subpel_x_q4; - (void)subpel_y_q4; + (void)subpel_x_qn; + (void)subpel_y_qn; for (int y = 0; y < h; ++y) { for (int x = 0; x < w; ++x) { @@ -394,8 +435,8 @@ void av1_jnt_convolve_2d_copy_c(const uint8_t *src, int src_stride, res += round_offset; if (conv_params->do_average) { - int32_t tmp = dst[y * dst_stride + x]; - if (conv_params->use_jnt_comp_avg) { + int32_t tmp = dst16[y * dst16_stride + x]; + if (conv_params->use_dist_wtd_comp_avg) { tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; tmp = tmp >> DIST_PRECISION_BITS; } else { @@ -403,16 +444,16 @@ void av1_jnt_convolve_2d_copy_c(const uint8_t *src, int src_stride, tmp = tmp >> 1; } tmp -= round_offset; - dst8[y * dst8_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits)); + dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits)); } else { - dst[y * dst_stride + x] = res; + dst16[y * dst16_stride + x] = res; } } } } -void 
av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst8, - int dst8_stride, int w, int h, +void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, @@ -472,7 +513,7 @@ void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst8, if (conv_params->is_compound) { if (conv_params->do_average) { int32_t tmp = dst16[y * dst16_stride + x]; - if (conv_params->use_jnt_comp_avg) { + if (conv_params->use_dist_wtd_comp_avg) { tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; tmp = tmp >> DIST_PRECISION_BITS; } else { @@ -482,7 +523,7 @@ void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst8, /* Subtract round offset and convolve round */ tmp = tmp - ((1 << (offset_bits - conv_params->round_1)) + (1 << (offset_bits - conv_params->round_1 - 1))); - dst8[y * dst8_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits)); + dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits)); } else { dst16[y * dst16_stride + x] = res; } @@ -490,7 +531,7 @@ void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst8, /* Subtract round offset and convolve round */ int32_t tmp = res - ((1 << (offset_bits - conv_params->round_1)) + (1 << (offset_bits - conv_params->round_1 - 1))); - dst8[y * dst8_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits)); + dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits)); } } src_vert++; @@ -511,89 +552,71 @@ static void convolve_2d_scale_wrapper( y_step_qn, conv_params); } -// TODO(huisu@google.com): bilinear filtering only needs 2 taps in general. So -// we may create optimized code to do 2-tap filtering for all bilinear filtering -// usages, not just IntraBC. 
-static void convolve_2d_for_intrabc(const uint8_t *src, int src_stride, - uint8_t *dst, int dst_stride, int w, int h, - int subpel_x_q4, int subpel_y_q4, - ConvolveParams *conv_params) { - const InterpFilterParams *filter_params_x = - subpel_x_q4 ? &av1_intrabc_filter_params : NULL; - const InterpFilterParams *filter_params_y = - subpel_y_q4 ? &av1_intrabc_filter_params : NULL; - if (subpel_x_q4 != 0 && subpel_y_q4 != 0) { - av1_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h, - filter_params_x, filter_params_y, 0, 0, conv_params); - } else if (subpel_x_q4 != 0) { - av1_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_x, - filter_params_y, 0, 0, conv_params); - } else { - av1_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_x, - filter_params_y, 0, 0, conv_params); - } -} - void av1_convolve_2d_facade(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, - InterpFilters interp_filters, const int subpel_x_q4, - int x_step_q4, const int subpel_y_q4, int y_step_q4, - int scaled, ConvolveParams *conv_params, - const struct scale_factors *sf, int is_intrabc) { - assert(IMPLIES(is_intrabc, !scaled)); + const InterpFilterParams *interp_filters[2], + const int subpel_x_qn, int x_step_q4, + const int subpel_y_qn, int y_step_q4, int scaled, + ConvolveParams *conv_params, + const struct scale_factors *sf) { (void)x_step_q4; (void)y_step_q4; (void)dst; (void)dst_stride; - if (is_intrabc && (subpel_x_q4 != 0 || subpel_y_q4 != 0)) { - convolve_2d_for_intrabc(src, src_stride, dst, dst_stride, w, h, subpel_x_q4, - subpel_y_q4, conv_params); - return; + const InterpFilterParams *filter_params_x = interp_filters[0]; + const InterpFilterParams *filter_params_y = interp_filters[1]; + + // TODO(jingning, yunqing): Add SIMD support to 2-tap filter case. + // Do we have SIMD support to 4-tap case? + // 2-tap filter indicates that it is for IntraBC. 
+ if (filter_params_x->taps == 2 || filter_params_y->taps == 2) { + assert(filter_params_x->taps == 2 && filter_params_y->taps == 2); + assert(!scaled); + if (subpel_x_qn && subpel_y_qn) { + av1_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h, + filter_params_x, filter_params_y, subpel_x_qn, + subpel_y_qn, conv_params); + return; + } else if (subpel_x_qn) { + av1_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h, + filter_params_x, filter_params_y, subpel_x_qn, + subpel_y_qn, conv_params); + return; + } else if (subpel_y_qn) { + av1_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h, + filter_params_x, filter_params_y, subpel_x_qn, + subpel_y_qn, conv_params); + return; + } } - InterpFilter filter_x = 0; - InterpFilter filter_y = 0; - const int need_filter_params_x = (subpel_x_q4 != 0) | scaled; - const int need_filter_params_y = (subpel_y_q4 != 0) | scaled; - if (need_filter_params_x) - filter_x = av1_extract_interp_filter(interp_filters, 1); - if (need_filter_params_y) - filter_y = av1_extract_interp_filter(interp_filters, 0); - const InterpFilterParams *filter_params_x = - need_filter_params_x - ? av1_get_interp_filter_params_with_block_size(filter_x, w) - : NULL; - const InterpFilterParams *filter_params_y = - need_filter_params_y - ? 
av1_get_interp_filter_params_with_block_size(filter_y, h) - : NULL; - if (scaled) { convolve_2d_scale_wrapper(src, src_stride, dst, dst_stride, w, h, - filter_params_x, filter_params_y, subpel_x_q4, - x_step_q4, subpel_y_q4, y_step_q4, conv_params); + filter_params_x, filter_params_y, subpel_x_qn, + x_step_q4, subpel_y_qn, y_step_q4, conv_params); } else { - sf->convolve[subpel_x_q4 != 0][subpel_y_q4 != 0][conv_params->is_compound]( + sf->convolve[subpel_x_qn != 0][subpel_y_qn != 0][conv_params->is_compound]( src, src_stride, dst, dst_stride, w, h, filter_params_x, - filter_params_y, subpel_x_q4, subpel_y_q4, conv_params); + filter_params_y, subpel_x_qn, subpel_y_qn, conv_params); } } +#if CONFIG_AV1_HIGHBITDEPTH void av1_highbd_convolve_2d_copy_sr_c( const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, const int subpel_x_q4, - const int subpel_y_q4, ConvolveParams *conv_params, int bd) { + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params, int bd) { (void)filter_params_x; (void)filter_params_y; - (void)subpel_x_q4; - (void)subpel_y_q4; + (void)subpel_x_qn; + (void)subpel_y_qn; (void)conv_params; (void)bd; for (int y = 0; y < h; ++y) { - memcpy(dst + y * dst_stride, src + y * src_stride, w * sizeof(src[0])); + memmove(dst + y * dst_stride, src + y * src_stride, w * sizeof(src[0])); } } @@ -601,12 +624,12 @@ void av1_highbd_convolve_x_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, + const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd) { const int fo_horiz = filter_params_x->taps / 2 - 1; const int bits = FILTER_BITS - conv_params->round_0; (void)filter_params_y; - (void)subpel_y_q4; 
+ (void)subpel_y_qn; assert(bits >= 0); assert((FILTER_BITS - conv_params->round_1) >= 0 || @@ -614,7 +637,7 @@ void av1_highbd_convolve_x_sr_c(const uint16_t *src, int src_stride, // horizontal filter const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( - filter_params_x, subpel_x_q4 & SUBPEL_MASK); + filter_params_x, subpel_x_qn & SUBPEL_MASK); for (int y = 0; y < h; ++y) { for (int x = 0; x < w; ++x) { int32_t res = 0; @@ -632,11 +655,11 @@ void av1_highbd_convolve_y_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, + const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd) { const int fo_vert = filter_params_y->taps / 2 - 1; (void)filter_params_x; - (void)subpel_x_q4; + (void)subpel_x_qn; (void)conv_params; assert(conv_params->round_0 <= FILTER_BITS); @@ -644,7 +667,7 @@ void av1_highbd_convolve_y_sr_c(const uint16_t *src, int src_stride, ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS))); // vertical filter const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( - filter_params_y, subpel_y_q4 & SUBPEL_MASK); + filter_params_y, subpel_y_qn & SUBPEL_MASK); for (int y = 0; y < h; ++y) { for (int x = 0; x < w; ++x) { int32_t res = 0; @@ -661,11 +684,12 @@ void av1_highbd_convolve_2d_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, + const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd) { int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]; int im_h = h + filter_params_y->taps - 1; int im_stride = w; + assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE); const int fo_vert = filter_params_y->taps / 2 - 1; const int fo_horiz = 
filter_params_x->taps / 2 - 1; const int bits = @@ -675,7 +699,7 @@ void av1_highbd_convolve_2d_sr_c(const uint16_t *src, int src_stride, // horizontal filter const uint16_t *src_horiz = src - fo_vert * src_stride; const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( - filter_params_x, subpel_x_q4 & SUBPEL_MASK); + filter_params_x, subpel_x_qn & SUBPEL_MASK); for (int y = 0; y < im_h; ++y) { for (int x = 0; x < w; ++x) { int32_t sum = (1 << (bd + FILTER_BITS - 1)); @@ -691,7 +715,7 @@ void av1_highbd_convolve_2d_sr_c(const uint16_t *src, int src_stride, // vertical filter int16_t *src_vert = im_block + fo_vert * im_stride; const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( - filter_params_y, subpel_y_q4 & SUBPEL_MASK); + filter_params_y, subpel_y_qn & SUBPEL_MASK); const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; for (int y = 0; y < h; ++y) { for (int x = 0; x < w; ++x) { @@ -709,17 +733,15 @@ void av1_highbd_convolve_2d_sr_c(const uint16_t *src, int src_stride, } } -void av1_highbd_jnt_convolve_2d_c(const uint16_t *src, int src_stride, - uint16_t *dst16, int dst16_stride, int w, - int h, - const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, - ConvolveParams *conv_params, int bd) { +void av1_highbd_dist_wtd_convolve_2d_c( + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params, int bd) { int x, y, k; int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]; - CONV_BUF_TYPE *dst = conv_params->dst; - int dst_stride = conv_params->dst_stride; + CONV_BUF_TYPE *dst16 = conv_params->dst; + int dst16_stride = conv_params->dst_stride; int im_h = h + filter_params_y->taps - 1; int im_stride = w; const int fo_vert = filter_params_y->taps / 2 - 1; @@ 
-731,7 +753,7 @@ void av1_highbd_jnt_convolve_2d_c(const uint16_t *src, int src_stride, // horizontal filter const uint16_t *src_horiz = src - fo_vert * src_stride; const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( - filter_params_x, subpel_x_q4 & SUBPEL_MASK); + filter_params_x, subpel_x_qn & SUBPEL_MASK); for (y = 0; y < im_h; ++y) { for (x = 0; x < w; ++x) { int32_t sum = (1 << (bd + FILTER_BITS - 1)); @@ -749,7 +771,7 @@ void av1_highbd_jnt_convolve_2d_c(const uint16_t *src, int src_stride, int16_t *src_vert = im_block + fo_vert * im_stride; const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( - filter_params_y, subpel_y_q4 & SUBPEL_MASK); + filter_params_y, subpel_y_qn & SUBPEL_MASK); for (y = 0; y < h; ++y) { for (x = 0; x < w; ++x) { int32_t sum = 1 << offset_bits; @@ -759,8 +781,8 @@ void av1_highbd_jnt_convolve_2d_c(const uint16_t *src, int src_stride, assert(0 <= sum && sum < (1 << (offset_bits + 2))); CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1); if (conv_params->do_average) { - int32_t tmp = dst[y * dst_stride + x]; - if (conv_params->use_jnt_comp_avg) { + int32_t tmp = dst16[y * dst16_stride + x]; + if (conv_params->use_dist_wtd_comp_avg) { tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; tmp = tmp >> DIST_PRECISION_BITS; } else { @@ -769,24 +791,22 @@ void av1_highbd_jnt_convolve_2d_c(const uint16_t *src, int src_stride, } tmp -= (1 << (offset_bits - conv_params->round_1)) + (1 << (offset_bits - conv_params->round_1 - 1)); - dst16[y * dst16_stride + x] = + dst[y * dst_stride + x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, round_bits), bd); } else { - dst[y * dst_stride + x] = res; + dst16[y * dst16_stride + x] = res; } } } } -void av1_highbd_jnt_convolve_x_c(const uint16_t *src, int src_stride, - uint16_t *dst16, int dst16_stride, int w, - int h, - const InterpFilterParams *filter_params_x, - const InterpFilterParams 
*filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, - ConvolveParams *conv_params, int bd) { - CONV_BUF_TYPE *dst = conv_params->dst; - int dst_stride = conv_params->dst_stride; +void av1_highbd_dist_wtd_convolve_x_c( + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params, int bd) { + CONV_BUF_TYPE *dst16 = conv_params->dst; + int dst16_stride = conv_params->dst_stride; const int fo_horiz = filter_params_x->taps / 2 - 1; const int bits = FILTER_BITS - conv_params->round_1; const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; @@ -796,11 +816,11 @@ void av1_highbd_jnt_convolve_x_c(const uint16_t *src, int src_stride, 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; assert(round_bits >= 0); (void)filter_params_y; - (void)subpel_y_q4; + (void)subpel_y_qn; assert(bits >= 0); // horizontal filter const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( - filter_params_x, subpel_x_q4 & SUBPEL_MASK); + filter_params_x, subpel_x_qn & SUBPEL_MASK); for (int y = 0; y < h; ++y) { for (int x = 0; x < w; ++x) { int32_t res = 0; @@ -811,8 +831,8 @@ void av1_highbd_jnt_convolve_x_c(const uint16_t *src, int src_stride, res += round_offset; if (conv_params->do_average) { - int32_t tmp = dst[y * dst_stride + x]; - if (conv_params->use_jnt_comp_avg) { + int32_t tmp = dst16[y * dst16_stride + x]; + if (conv_params->use_dist_wtd_comp_avg) { tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; tmp = tmp >> DIST_PRECISION_BITS; } else { @@ -820,24 +840,22 @@ void av1_highbd_jnt_convolve_x_c(const uint16_t *src, int src_stride, tmp = tmp >> 1; } tmp -= round_offset; - dst16[y * dst16_stride + x] = + dst[y * dst_stride + x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, round_bits), bd); } else { - dst[y * dst_stride + x] = res; + 
dst16[y * dst16_stride + x] = res; } } } } -void av1_highbd_jnt_convolve_y_c(const uint16_t *src, int src_stride, - uint16_t *dst16, int dst16_stride, int w, - int h, - const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, - ConvolveParams *conv_params, int bd) { - CONV_BUF_TYPE *dst = conv_params->dst; - int dst_stride = conv_params->dst_stride; +void av1_highbd_dist_wtd_convolve_y_c( + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params, int bd) { + CONV_BUF_TYPE *dst16 = conv_params->dst; + int dst16_stride = conv_params->dst_stride; const int fo_vert = filter_params_y->taps / 2 - 1; const int bits = FILTER_BITS - conv_params->round_0; const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; @@ -847,11 +865,11 @@ void av1_highbd_jnt_convolve_y_c(const uint16_t *src, int src_stride, 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; assert(round_bits >= 0); (void)filter_params_x; - (void)subpel_x_q4; + (void)subpel_x_qn; assert(bits >= 0); // vertical filter const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( - filter_params_y, subpel_y_q4 & SUBPEL_MASK); + filter_params_y, subpel_y_qn & SUBPEL_MASK); for (int y = 0; y < h; ++y) { for (int x = 0; x < w; ++x) { int32_t res = 0; @@ -862,8 +880,8 @@ void av1_highbd_jnt_convolve_y_c(const uint16_t *src, int src_stride, res = ROUND_POWER_OF_TWO(res, conv_params->round_1) + round_offset; if (conv_params->do_average) { - int32_t tmp = dst[y * dst_stride + x]; - if (conv_params->use_jnt_comp_avg) { + int32_t tmp = dst16[y * dst16_stride + x]; + if (conv_params->use_dist_wtd_comp_avg) { tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; tmp = tmp >> DIST_PRECISION_BITS; } else { @@ -871,22 +889,22 
@@ void av1_highbd_jnt_convolve_y_c(const uint16_t *src, int src_stride, tmp = tmp >> 1; } tmp -= round_offset; - dst16[y * dst16_stride + x] = + dst[y * dst_stride + x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, round_bits), bd); } else { - dst[y * dst_stride + x] = res; + dst16[y * dst16_stride + x] = res; } } } } -void av1_highbd_jnt_convolve_2d_copy_c( - const uint16_t *src, int src_stride, uint16_t *dst16, int dst16_stride, - int w, int h, const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, const int subpel_x_q4, - const int subpel_y_q4, ConvolveParams *conv_params, int bd) { - CONV_BUF_TYPE *dst = conv_params->dst; - int dst_stride = conv_params->dst_stride; +void av1_highbd_dist_wtd_convolve_2d_copy_c( + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params, int bd) { + CONV_BUF_TYPE *dst16 = conv_params->dst; + int dst16_stride = conv_params->dst_stride; const int bits = FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0; const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; @@ -895,16 +913,16 @@ void av1_highbd_jnt_convolve_2d_copy_c( assert(bits >= 0); (void)filter_params_x; (void)filter_params_y; - (void)subpel_x_q4; - (void)subpel_y_q4; + (void)subpel_x_qn; + (void)subpel_y_qn; for (int y = 0; y < h; ++y) { for (int x = 0; x < w; ++x) { CONV_BUF_TYPE res = src[y * src_stride + x] << bits; res += round_offset; if (conv_params->do_average) { - int32_t tmp = dst[y * dst_stride + x]; - if (conv_params->use_jnt_comp_avg) { + int32_t tmp = dst16[y * dst16_stride + x]; + if (conv_params->use_dist_wtd_comp_avg) { tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; tmp = tmp >> DIST_PRECISION_BITS; } else { @@ -912,10 +930,10 @@ void av1_highbd_jnt_convolve_2d_copy_c( tmp = tmp >> 1; } tmp -= 
round_offset; - dst16[y * dst16_stride + x] = + dst[y * dst_stride + x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, bits), bd); } else { - dst[y * dst_stride + x] = res; + dst16[y * dst16_stride + x] = res; } } } @@ -980,7 +998,7 @@ void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride, if (conv_params->is_compound) { if (conv_params->do_average) { int32_t tmp = dst16[y * dst16_stride + x]; - if (conv_params->use_jnt_comp_avg) { + if (conv_params->use_dist_wtd_comp_avg) { tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; tmp = tmp >> DIST_PRECISION_BITS; } else { @@ -1007,68 +1025,24 @@ void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride, } } -static void highbd_convolve_2d_for_intrabc(const uint16_t *src, int src_stride, - uint16_t *dst, int dst_stride, int w, - int h, int subpel_x_q4, - int subpel_y_q4, - ConvolveParams *conv_params, - int bd) { - const InterpFilterParams *filter_params_x = - subpel_x_q4 ? &av1_intrabc_filter_params : NULL; - const InterpFilterParams *filter_params_y = - subpel_y_q4 ? 
&av1_intrabc_filter_params : NULL; - if (subpel_x_q4 != 0 && subpel_y_q4 != 0) { - av1_highbd_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h, - filter_params_x, filter_params_y, 0, 0, - conv_params, bd); - } else if (subpel_x_q4 != 0) { - av1_highbd_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h, - filter_params_x, filter_params_y, 0, 0, - conv_params, bd); - } else { - av1_highbd_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h, - filter_params_x, filter_params_y, 0, 0, - conv_params, bd); - } -} - void av1_highbd_convolve_2d_facade(const uint8_t *src8, int src_stride, uint8_t *dst8, int dst_stride, int w, int h, - InterpFilters interp_filters, - const int subpel_x_q4, int x_step_q4, - const int subpel_y_q4, int y_step_q4, + const InterpFilterParams *interp_filters[2], + const int subpel_x_qn, int x_step_q4, + const int subpel_y_qn, int y_step_q4, int scaled, ConvolveParams *conv_params, - const struct scale_factors *sf, - int is_intrabc, int bd) { - assert(IMPLIES(is_intrabc, !scaled)); + const struct scale_factors *sf, int bd) { (void)x_step_q4; (void)y_step_q4; (void)dst_stride; const uint16_t *src = CONVERT_TO_SHORTPTR(src8); - if (is_intrabc && (subpel_x_q4 != 0 || subpel_y_q4 != 0)) { - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); - highbd_convolve_2d_for_intrabc(src, src_stride, dst, dst_stride, w, h, - subpel_x_q4, subpel_y_q4, conv_params, bd); - return; - } - - InterpFilter filter_x = 0; - InterpFilter filter_y = 0; - const int need_filter_params_x = (subpel_x_q4 != 0) | scaled; - const int need_filter_params_y = (subpel_y_q4 != 0) | scaled; - if (need_filter_params_x) - filter_x = av1_extract_interp_filter(interp_filters, 1); - if (need_filter_params_y) - filter_y = av1_extract_interp_filter(interp_filters, 0); + const int need_filter_params_x = (subpel_x_qn != 0) | scaled; + const int need_filter_params_y = (subpel_y_qn != 0) | scaled; const InterpFilterParams *filter_params_x = - need_filter_params_x - ? 
av1_get_interp_filter_params_with_block_size(filter_x, w) - : NULL; + need_filter_params_x ? interp_filters[0] : NULL; const InterpFilterParams *filter_params_y = - need_filter_params_y - ? av1_get_interp_filter_params_with_block_size(filter_y, h) - : NULL; + need_filter_params_y ? interp_filters[1] : NULL; if (scaled) { uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); @@ -1076,18 +1050,19 @@ void av1_highbd_convolve_2d_facade(const uint8_t *src8, int src_stride, assert(conv_params->dst != NULL); } av1_highbd_convolve_2d_scale(src, src_stride, dst, dst_stride, w, h, - filter_params_x, filter_params_y, subpel_x_q4, - x_step_q4, subpel_y_q4, y_step_q4, conv_params, + filter_params_x, filter_params_y, subpel_x_qn, + x_step_q4, subpel_y_qn, y_step_q4, conv_params, bd); } else { uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); - sf->highbd_convolve[subpel_x_q4 != 0][subpel_y_q4 != + sf->highbd_convolve[subpel_x_qn != 0][subpel_y_qn != 0][conv_params->is_compound]( src, src_stride, dst, dst_stride, w, h, filter_params_x, - filter_params_y, subpel_x_q4, subpel_y_q4, conv_params, bd); + filter_params_y, subpel_x_qn, subpel_y_qn, conv_params, bd); } } +#endif // CONFIG_AV1_HIGHBITDEPTH // Note: Fixed size intermediate buffers, place limits on parameters // of some functions. 
2d filtering proceeds in 2 steps: @@ -1109,12 +1084,14 @@ static INLINE int horz_scalar_product(const uint8_t *a, const int16_t *b) { return sum; } +#if CONFIG_AV1_HIGHBITDEPTH static INLINE int highbd_horz_scalar_product(const uint16_t *a, const int16_t *b) { int sum = 0; for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k] * b[k]; return sum; } +#endif static INLINE int highbd_vert_scalar_product(const uint16_t *a, ptrdiff_t a_stride, @@ -1215,6 +1192,7 @@ void av1_wiener_convolve_add_src_c(const uint8_t *src, ptrdiff_t src_stride, y_step_q4, w, h, conv_params->round_1); } +#if CONFIG_AV1_HIGHBITDEPTH static void highbd_convolve_add_src_horiz_hip( const uint8_t *src8, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *x_filters, int x0_q4, @@ -1293,3 +1271,4 @@ void av1_highbd_wiener_convolve_add_src_c( temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_SB_SIZE, dst, dst_stride, filters_y, y0_q4, y_step_q4, w, h, conv_params->round_1, bd); } +#endif // CONFIG_AV1_HIGHBITDEPTH diff --git a/media/libaom/src/av1/common/convolve.h b/media/libaom/src/av1/common/convolve.h index 4109dd843..04df86c42 100644 --- a/media/libaom/src/av1/common/convolve.h +++ b/media/libaom/src/av1/common/convolve.h @@ -26,7 +26,8 @@ typedef struct ConvolveParams { int round_1; int plane; int is_compound; - int use_jnt_comp_avg; + int compound_index; // 0: the first single in compound mode, 1: the second. 
+ int use_dist_wtd_comp_avg; int fwd_offset; int bck_offset; } ConvolveParams; @@ -41,32 +42,34 @@ typedef void (*aom_convolve_fn_t)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, + const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params); typedef void (*aom_highbd_convolve_fn_t)( const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, const int subpel_x_q4, - const int subpel_y_q4, ConvolveParams *conv_params, int bd); + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params, int bd); struct AV1Common; struct scale_factors; void av1_convolve_2d_facade(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, - InterpFilters interp_filters, const int subpel_x_q4, - int x_step_q4, const int subpel_y_q4, int y_step_q4, - int scaled, ConvolveParams *conv_params, - const struct scale_factors *sf, int is_intrabc); + const InterpFilterParams *interp_filters[2], + const int subpel_x_qn, int x_step_q4, + const int subpel_y_qn, int y_step_q4, int scaled, + ConvolveParams *conv_params, + const struct scale_factors *sf); -static INLINE ConvolveParams get_conv_params_no_round(int do_average, int plane, +static INLINE ConvolveParams get_conv_params_no_round(int cmp_index, int plane, CONV_BUF_TYPE *dst, int dst_stride, int is_compound, int bd) { ConvolveParams conv_params; - conv_params.do_average = do_average; - assert(IMPLIES(do_average, is_compound)); + conv_params.compound_index = cmp_index; + assert(IMPLIES(cmp_index, is_compound)); + conv_params.is_compound = is_compound; conv_params.round_0 = ROUND0_BITS; conv_params.round_1 = is_compound ? 
COMPOUND_ROUND1_BITS @@ -82,6 +85,10 @@ static INLINE ConvolveParams get_conv_params_no_round(int do_average, int plane, conv_params.dst = dst; conv_params.dst_stride = dst_stride; conv_params.plane = plane; + + // By default, set do average to 1 if this is the second single prediction + // in a compound mode. + conv_params.do_average = cmp_index; return conv_params; } @@ -111,12 +118,16 @@ static INLINE ConvolveParams get_conv_params_wiener(int bd) { void av1_highbd_convolve_2d_facade(const uint8_t *src8, int src_stride, uint8_t *dst, int dst_stride, int w, int h, - InterpFilters interp_filters, - const int subpel_x_q4, int x_step_q4, - const int subpel_y_q4, int y_step_q4, + const InterpFilterParams *interp_filters[2], + const int subpel_x_qn, int x_step_q4, + const int subpel_y_qn, int y_step_q4, int scaled, ConvolveParams *conv_params, - const struct scale_factors *sf, - int is_intrabc, int bd); + const struct scale_factors *sf, int bd); + +// TODO(sarahparker) This will need to be integerized and optimized +void av1_convolve_2d_sobel_y_c(const uint8_t *src, int src_stride, double *dst, + int dst_stride, int w, int h, int dir, + double norm); #ifdef __cplusplus } // extern "C" diff --git a/media/libaom/src/av1/common/debugmodes.c b/media/libaom/src/av1/common/debugmodes.c index 868f341b5..ff02ddde0 100644 --- a/media/libaom/src/av1/common/debugmodes.c +++ b/media/libaom/src/av1/common/debugmodes.c @@ -11,14 +11,14 @@ #include <stdio.h> +#include "av1/common/av1_common_int.h" #include "av1/common/blockd.h" #include "av1/common/enums.h" -#include "av1/common/onyxc_int.h" static void log_frame_info(AV1_COMMON *cm, const char *str, FILE *f) { fprintf(f, "%s", str); - fprintf(f, "(Frame %d, Show:%d, Q:%d): \n", cm->current_video_frame, - cm->show_frame, cm->base_qindex); + fprintf(f, "(Frame %d, Show:%d, Q:%d): \n", cm->current_frame.frame_number, + cm->show_frame, cm->quant_params.base_qindex); } /* This function dereferences a pointer to the mbmi structure * and 
uses the passed in member offset to print out the value of an integer @@ -26,32 +26,31 @@ static void log_frame_info(AV1_COMMON *cm, const char *str, FILE *f) { */ static void print_mi_data(AV1_COMMON *cm, FILE *file, const char *descriptor, size_t member_offset) { - int mi_row, mi_col; - MB_MODE_INFO **mi = cm->mi_grid_visible; - int rows = cm->mi_rows; - int cols = cm->mi_cols; + const CommonModeInfoParams *const mi_params = &cm->mi_params; + MB_MODE_INFO **mi = mi_params->mi_grid_base; + int rows = mi_params->mi_rows; + int cols = mi_params->mi_cols; char prefix = descriptor[0]; log_frame_info(cm, descriptor, file); - for (mi_row = 0; mi_row < rows; mi_row++) { + for (int mi_row = 0; mi_row < rows; mi_row++) { fprintf(file, "%c ", prefix); - for (mi_col = 0; mi_col < cols; mi_col++) { + for (int mi_col = 0; mi_col < cols; mi_col++) { fprintf(file, "%2d ", *((char *)((char *)(mi[0]) + member_offset))); mi++; } fprintf(file, "\n"); - mi += MAX_MIB_SIZE; + mi += mi_params->mi_stride - cols; } fprintf(file, "\n"); } void av1_print_modes_and_motion_vectors(AV1_COMMON *cm, const char *file) { - int mi_row; - int mi_col; + CommonModeInfoParams *mi_params = &cm->mi_params; FILE *mvs = fopen(file, "a"); - MB_MODE_INFO **mi = cm->mi_grid_visible; - int rows = cm->mi_rows; - int cols = cm->mi_cols; + MB_MODE_INFO **mi = mi_params->mi_grid_base; + const int rows = mi_params->mi_rows; + const int cols = mi_params->mi_cols; print_mi_data(cm, mvs, "Partitions:", offsetof(MB_MODE_INFO, sb_type)); print_mi_data(cm, mvs, "Modes:", offsetof(MB_MODE_INFO, mode)); @@ -61,28 +60,28 @@ void av1_print_modes_and_motion_vectors(AV1_COMMON *cm, const char *file) { // output skip infomation. 
log_frame_info(cm, "Skips:", mvs); - for (mi_row = 0; mi_row < rows; mi_row++) { + for (int mi_row = 0; mi_row < rows; mi_row++) { fprintf(mvs, "S "); - for (mi_col = 0; mi_col < cols; mi_col++) { + for (int mi_col = 0; mi_col < cols; mi_col++) { fprintf(mvs, "%2d ", mi[0]->skip); mi++; } fprintf(mvs, "\n"); - mi += MAX_MIB_SIZE; + mi += mi_params->mi_stride - cols; } fprintf(mvs, "\n"); // output motion vectors. log_frame_info(cm, "Vectors ", mvs); - mi = cm->mi_grid_visible; - for (mi_row = 0; mi_row < rows; mi_row++) { + mi = mi_params->mi_grid_base; + for (int mi_row = 0; mi_row < rows; mi_row++) { fprintf(mvs, "V "); - for (mi_col = 0; mi_col < cols; mi_col++) { + for (int mi_col = 0; mi_col < cols; mi_col++) { fprintf(mvs, "%4d:%4d ", mi[0]->mv[0].as_mv.row, mi[0]->mv[0].as_mv.col); mi++; } fprintf(mvs, "\n"); - mi += MAX_MIB_SIZE; + mi += mi_params->mi_stride - cols; } fprintf(mvs, "\n"); @@ -93,6 +92,13 @@ void av1_print_uncompressed_frame_header(const uint8_t *data, int size, const char *filename) { FILE *hdrFile = fopen(filename, "w"); fwrite(data, size, sizeof(uint8_t), hdrFile); + + // Reset order hints(7bit + a previous bit) to 0, so that all camera frame + // headers are identical in large scale coding. + uint8_t zero = 0; + fseek(hdrFile, 1, SEEK_SET); + // Reset second byte. 
+ fwrite(&zero, 1, sizeof(uint8_t), hdrFile); fclose(hdrFile); } diff --git a/media/libaom/src/av1/common/entropy.c b/media/libaom/src/av1/common/entropy.c index 4f95ef69b..1f7a0efe0 100644 --- a/media/libaom/src/av1/common/entropy.c +++ b/media/libaom/src/av1/common/entropy.c @@ -13,10 +13,10 @@ #include "aom/aom_integer.h" #include "aom_mem/aom_mem.h" +#include "av1/common/av1_common_int.h" #include "av1/common/blockd.h" #include "av1/common/entropy.h" #include "av1/common/entropymode.h" -#include "av1/common/onyxc_int.h" #include "av1/common/scan.h" #include "av1/common/token_cdfs.h" #include "av1/common/txb_common.h" @@ -29,7 +29,7 @@ static int get_q_ctx(int q) { } void av1_default_coef_probs(AV1_COMMON *cm) { - const int index = get_q_ctx(cm->base_qindex); + const int index = get_q_ctx(cm->quant_params.base_qindex); #if CONFIG_ENTROPY_STATS cm->coef_cdf_category = index; #endif @@ -50,8 +50,9 @@ void av1_default_coef_probs(AV1_COMMON *cm) { av1_copy(cm->fc->eob_flag_cdf1024, av1_default_eob_multi1024_cdfs[index]); } -static void reset_cdf_symbol_counter(aom_cdf_prob *cdf_ptr, int num_cdfs, - int cdf_stride, int nsymbs) { +static AOM_INLINE void reset_cdf_symbol_counter(aom_cdf_prob *cdf_ptr, + int num_cdfs, int cdf_stride, + int nsymbs) { for (int i = 0; i < num_cdfs; i++) { cdf_ptr[i * cdf_stride + nsymbs] = 0; } @@ -68,7 +69,7 @@ static void reset_cdf_symbol_counter(aom_cdf_prob *cdf_ptr, int num_cdfs, reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \ } while (0) -static void reset_nmv_counter(nmv_context *nmv) { +static AOM_INLINE void reset_nmv_counter(nmv_context *nmv) { RESET_CDF_COUNTER(nmv->joints_cdf, 4); for (int i = 0; i < 2; i++) { RESET_CDF_COUNTER(nmv->comps[i].classes_cdf, MV_CLASSES); @@ -101,7 +102,7 @@ void av1_reset_cdf_symbol_counters(FRAME_CONTEXT *fc) { RESET_CDF_COUNTER(fc->refmv_cdf, 2); RESET_CDF_COUNTER(fc->drl_cdf, 2); RESET_CDF_COUNTER(fc->inter_compound_mode_cdf, INTER_COMPOUND_MODES); - 
RESET_CDF_COUNTER(fc->compound_type_cdf, COMPOUND_TYPES - 1); + RESET_CDF_COUNTER(fc->compound_type_cdf, MASKED_COMPOUND_TYPES); RESET_CDF_COUNTER(fc->wedge_idx_cdf, 16); RESET_CDF_COUNTER(fc->interintra_cdf, 2); RESET_CDF_COUNTER(fc->wedge_interintra_cdf, 2); diff --git a/media/libaom/src/av1/common/entropy.h b/media/libaom/src/av1/common/entropy.h index 991692c2f..ee78f56a3 100644 --- a/media/libaom/src/av1/common/entropy.h +++ b/media/libaom/src/av1/common/entropy.h @@ -48,18 +48,18 @@ extern "C" { #define BR_CDF_SIZE (4) #define COEFF_BASE_RANGE (4 * (BR_CDF_SIZE - 1)) -#define COEFF_CONTEXT_BITS 6 +#define COEFF_CONTEXT_BITS 3 #define COEFF_CONTEXT_MASK ((1 << COEFF_CONTEXT_BITS) - 1) #define MAX_BASE_BR_RANGE (COEFF_BASE_RANGE + NUM_BASE_LEVELS + 1) #define BASE_CONTEXT_POSITION_NUM 12 -typedef enum TX_CLASS { +enum { TX_CLASS_2D = 0, TX_CLASS_HORIZ = 1, TX_CLASS_VERT = 2, TX_CLASSES = 3, -} TX_CLASS; +} UENUM1BYTE(TX_CLASS); #define DCT_MAX_VALUE 16384 #define DCT_MAX_VALUE_HIGH10 65536 diff --git a/media/libaom/src/av1/common/entropymode.c b/media/libaom/src/av1/common/entropymode.c index 41dc30ddb..5f061be35 100644 --- a/media/libaom/src/av1/common/entropymode.c +++ b/media/libaom/src/av1/common/entropymode.c @@ -11,9 +11,9 @@ #include "aom_mem/aom_mem.h" +#include "av1/common/av1_common_int.h" #include "av1/common/reconinter.h" #include "av1/common/scan.h" -#include "av1/common/onyxc_int.h" #include "av1/common/seg_common.h" #include "av1/common/txb_common.h" @@ -435,16 +435,16 @@ static const aom_cdf_prob { AOM_CDF3(601, 943) }, { AOM_CDF3(14969, 21398) } }; -static const aom_cdf_prob default_newmv_cdf[NEWMV_MODE_CONTEXTS][CDF_SIZE(2)] = - { { AOM_CDF2(24035) }, { AOM_CDF2(16630) }, { AOM_CDF2(15339) }, - { AOM_CDF2(8386) }, { AOM_CDF2(12222) }, { AOM_CDF2(4676) } }; +static const aom_cdf_prob default_newmv_cdf[NEWMV_MODE_CONTEXTS][CDF_SIZE( + 2)] = { { AOM_CDF2(24035) }, { AOM_CDF2(16630) }, { AOM_CDF2(15339) }, + { AOM_CDF2(8386) }, { AOM_CDF2(12222) 
}, { AOM_CDF2(4676) } }; static const aom_cdf_prob default_zeromv_cdf[GLOBALMV_MODE_CONTEXTS][CDF_SIZE( 2)] = { { AOM_CDF2(2175) }, { AOM_CDF2(1054) } }; -static const aom_cdf_prob default_refmv_cdf[REFMV_MODE_CONTEXTS][CDF_SIZE(2)] = - { { AOM_CDF2(23974) }, { AOM_CDF2(24188) }, { AOM_CDF2(17848) }, - { AOM_CDF2(28622) }, { AOM_CDF2(24312) }, { AOM_CDF2(19923) } }; +static const aom_cdf_prob default_refmv_cdf[REFMV_MODE_CONTEXTS][CDF_SIZE( + 2)] = { { AOM_CDF2(23974) }, { AOM_CDF2(24188) }, { AOM_CDF2(17848) }, + { AOM_CDF2(28622) }, { AOM_CDF2(24312) }, { AOM_CDF2(19923) } }; static const aom_cdf_prob default_drl_cdf[DRL_MODE_CONTEXTS][CDF_SIZE(2)] = { { AOM_CDF2(13104) }, { AOM_CDF2(24560) }, { AOM_CDF2(18945) } @@ -470,11 +470,11 @@ static const aom_cdf_prob default_interintra_cdf[BLOCK_SIZE_GROUPS][CDF_SIZE( { AOM_CDF2(30237) } }; static const aom_cdf_prob - default_interintra_mode_cdf[BLOCK_SIZE_GROUPS][CDF_SIZE(INTERINTRA_MODES)] = - { { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(1875, 11082, 27332) }, - { AOM_CDF4(2473, 9996, 26388) }, - { AOM_CDF4(4238, 11537, 25926) } }; + default_interintra_mode_cdf[BLOCK_SIZE_GROUPS][CDF_SIZE( + INTERINTRA_MODES)] = { { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(1875, 11082, 27332) }, + { AOM_CDF4(2473, 9996, 26388) }, + { AOM_CDF4(4238, 11537, 25926) } }; static const aom_cdf_prob default_wedge_interintra_cdf[BLOCK_SIZES_ALL][CDF_SIZE(2)] = { @@ -488,63 +488,63 @@ static const aom_cdf_prob { AOM_CDF2(16384) } }; -static const aom_cdf_prob - default_compound_type_cdf[BLOCK_SIZES_ALL][CDF_SIZE(COMPOUND_TYPES - 1)] = { - { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, - { AOM_CDF2(23431) }, { AOM_CDF2(13171) }, { AOM_CDF2(11470) }, - { AOM_CDF2(9770) }, { AOM_CDF2(9100) }, { AOM_CDF2(8233) }, - { AOM_CDF2(6172) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, - { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, - { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, - { AOM_CDF2(11820) 
}, { AOM_CDF2(7701) }, { AOM_CDF2(16384) }, - { AOM_CDF2(16384) } - }; +static const aom_cdf_prob default_compound_type_cdf[BLOCK_SIZES_ALL][CDF_SIZE( + MASKED_COMPOUND_TYPES)] = { + { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, + { AOM_CDF2(23431) }, { AOM_CDF2(13171) }, { AOM_CDF2(11470) }, + { AOM_CDF2(9770) }, { AOM_CDF2(9100) }, { AOM_CDF2(8233) }, + { AOM_CDF2(6172) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, + { AOM_CDF2(11820) }, { AOM_CDF2(7701) }, { AOM_CDF2(16384) }, + { AOM_CDF2(16384) } +}; -static const aom_cdf_prob default_wedge_idx_cdf[BLOCK_SIZES_ALL][CDF_SIZE(16)] = - { { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432, - 20480, 22528, 24576, 26624, 28672, 30720) }, - { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432, - 20480, 22528, 24576, 26624, 28672, 30720) }, - { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432, - 20480, 22528, 24576, 26624, 28672, 30720) }, - { AOM_CDF16(2438, 4440, 6599, 8663, 11005, 12874, 15751, 18094, 20359, - 22362, 24127, 25702, 27752, 29450, 31171) }, - { AOM_CDF16(806, 3266, 6005, 6738, 7218, 7367, 7771, 14588, 16323, 17367, - 18452, 19422, 22839, 26127, 29629) }, - { AOM_CDF16(2779, 3738, 4683, 7213, 7775, 8017, 8655, 14357, 17939, 21332, - 24520, 27470, 29456, 30529, 31656) }, - { AOM_CDF16(1684, 3625, 5675, 7108, 9302, 11274, 14429, 17144, 19163, - 20961, 22884, 24471, 26719, 28714, 30877) }, - { AOM_CDF16(1142, 3491, 6277, 7314, 8089, 8355, 9023, 13624, 15369, 16730, - 18114, 19313, 22521, 26012, 29550) }, - { AOM_CDF16(2742, 4195, 5727, 8035, 8980, 9336, 10146, 14124, 17270, - 20533, 23434, 25972, 27944, 29570, 31416) }, - { AOM_CDF16(1727, 3948, 6101, 7796, 9841, 12344, 15766, 18944, 20638, - 22038, 23963, 25311, 26988, 28766, 31012) }, - { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432, 
- 20480, 22528, 24576, 26624, 28672, 30720) }, - { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432, - 20480, 22528, 24576, 26624, 28672, 30720) }, - { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432, - 20480, 22528, 24576, 26624, 28672, 30720) }, - { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432, - 20480, 22528, 24576, 26624, 28672, 30720) }, - { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432, - 20480, 22528, 24576, 26624, 28672, 30720) }, - { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432, - 20480, 22528, 24576, 26624, 28672, 30720) }, - { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432, - 20480, 22528, 24576, 26624, 28672, 30720) }, - { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432, - 20480, 22528, 24576, 26624, 28672, 30720) }, - { AOM_CDF16(154, 987, 1925, 2051, 2088, 2111, 2151, 23033, 23703, 24284, - 24985, 25684, 27259, 28883, 30911) }, - { AOM_CDF16(1135, 1322, 1493, 2635, 2696, 2737, 2770, 21016, 22935, 25057, - 27251, 29173, 30089, 30960, 31933) }, - { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432, - 20480, 22528, 24576, 26624, 28672, 30720) }, - { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432, - 20480, 22528, 24576, 26624, 28672, 30720) } }; +static const aom_cdf_prob default_wedge_idx_cdf[BLOCK_SIZES_ALL][CDF_SIZE( + 16)] = { { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, + 18432, 20480, 22528, 24576, 26624, 28672, 30720) }, + { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, + 18432, 20480, 22528, 24576, 26624, 28672, 30720) }, + { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, + 18432, 20480, 22528, 24576, 26624, 28672, 30720) }, + { AOM_CDF16(2438, 4440, 6599, 8663, 11005, 12874, 15751, 18094, + 20359, 22362, 24127, 25702, 27752, 29450, 31171) }, + { AOM_CDF16(806, 3266, 6005, 6738, 7218, 7367, 7771, 14588, 16323, + 
17367, 18452, 19422, 22839, 26127, 29629) }, + { AOM_CDF16(2779, 3738, 4683, 7213, 7775, 8017, 8655, 14357, 17939, + 21332, 24520, 27470, 29456, 30529, 31656) }, + { AOM_CDF16(1684, 3625, 5675, 7108, 9302, 11274, 14429, 17144, + 19163, 20961, 22884, 24471, 26719, 28714, 30877) }, + { AOM_CDF16(1142, 3491, 6277, 7314, 8089, 8355, 9023, 13624, 15369, + 16730, 18114, 19313, 22521, 26012, 29550) }, + { AOM_CDF16(2742, 4195, 5727, 8035, 8980, 9336, 10146, 14124, + 17270, 20533, 23434, 25972, 27944, 29570, 31416) }, + { AOM_CDF16(1727, 3948, 6101, 7796, 9841, 12344, 15766, 18944, + 20638, 22038, 23963, 25311, 26988, 28766, 31012) }, + { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, + 18432, 20480, 22528, 24576, 26624, 28672, 30720) }, + { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, + 18432, 20480, 22528, 24576, 26624, 28672, 30720) }, + { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, + 18432, 20480, 22528, 24576, 26624, 28672, 30720) }, + { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, + 18432, 20480, 22528, 24576, 26624, 28672, 30720) }, + { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, + 18432, 20480, 22528, 24576, 26624, 28672, 30720) }, + { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, + 18432, 20480, 22528, 24576, 26624, 28672, 30720) }, + { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, + 18432, 20480, 22528, 24576, 26624, 28672, 30720) }, + { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, + 18432, 20480, 22528, 24576, 26624, 28672, 30720) }, + { AOM_CDF16(154, 987, 1925, 2051, 2088, 2111, 2151, 23033, 23703, + 24284, 24985, 25684, 27259, 28883, 30911) }, + { AOM_CDF16(1135, 1322, 1493, 2635, 2696, 2737, 2770, 21016, 22935, + 25057, 27251, 29173, 30089, 30960, 31933) }, + { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, + 18432, 20480, 22528, 24576, 26624, 28672, 30720) }, + { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 
14336, 16384, + 18432, 20480, 22528, 24576, 26624, 28672, 30720) } }; static const aom_cdf_prob default_motion_mode_cdf[BLOCK_SIZES_ALL][CDF_SIZE( MOTION_MODES)] = { { AOM_CDF3(10923, 21845) }, { AOM_CDF3(10923, 21845) }, @@ -1068,9 +1068,16 @@ void av1_setup_frame_contexts(AV1_COMMON *cm) { // This function must ONLY be called when cm->fc has been initialized with // default probs, either by av1_setup_past_independence or after manually // initializing them - cm->frame_contexts[FRAME_CONTEXT_DEFAULTS] = *cm->fc; - if (cm->large_scale_tile) { - for (int i = 0; i < FRAME_CONTEXTS; ++i) cm->frame_contexts[i] = *cm->fc; + *cm->default_frame_context = *cm->fc; + // TODO(jack.haughton@argondesign.com): don't think this should be necessary, + // but could do with fuller testing + if (cm->tiles.large_scale) { + for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i) { + RefCntBuffer *const buf = get_ref_frame_buf(cm, i); + if (buf != NULL) buf->frame_context = *cm->fc; + } + for (int i = 0; i < FRAME_BUFFERS; ++i) + cm->buffer_pool->frame_bufs[i].frame_context = *cm->fc; } } @@ -1079,10 +1086,9 @@ void av1_setup_past_independence(AV1_COMMON *cm) { // Features disabled, 0, with delta coding (Default state). av1_clearall_segfeatures(&cm->seg); - cm->current_frame_seg_map = cm->cur_frame->seg_map; - - if (cm->current_frame_seg_map) - memset(cm->current_frame_seg_map, 0, (cm->mi_rows * cm->mi_cols)); + if (cm->cur_frame->seg_map) + memset(cm->cur_frame->seg_map, 0, + (cm->mi_params.mi_rows * cm->mi_params.mi_cols)); // reset mode ref deltas av1_set_default_ref_deltas(cm->cur_frame->ref_deltas); @@ -1092,12 +1098,6 @@ void av1_setup_past_independence(AV1_COMMON *cm) { av1_default_coef_probs(cm); init_mode_probs(cm->fc); av1_init_mv_probs(cm); - av1_init_lv_map(cm); cm->fc->initialized = 1; av1_setup_frame_contexts(cm); - - // prev_mip will only be allocated in encoder. 
- if (frame_is_intra_only(cm) && cm->prev_mip) - memset(cm->prev_mip, 0, - cm->mi_stride * cm->mi_rows * sizeof(*cm->prev_mip)); } diff --git a/media/libaom/src/av1/common/entropymode.h b/media/libaom/src/av1/common/entropymode.h index 7047f34d2..bbbf55dc8 100644 --- a/media/libaom/src/av1/common/entropymode.h +++ b/media/libaom/src/av1/common/entropymode.h @@ -63,7 +63,6 @@ struct AV1Common; typedef struct { const int16_t *scan; const int16_t *iscan; - const int16_t *neighbors; } SCAN_ORDER; typedef struct frame_contexts { @@ -92,7 +91,8 @@ typedef struct frame_contexts { aom_cdf_prob inter_compound_mode_cdf[INTER_MODE_CONTEXTS] [CDF_SIZE(INTER_COMPOUND_MODES)]; - aom_cdf_prob compound_type_cdf[BLOCK_SIZES_ALL][CDF_SIZE(COMPOUND_TYPES - 1)]; + aom_cdf_prob compound_type_cdf[BLOCK_SIZES_ALL] + [CDF_SIZE(MASKED_COMPOUND_TYPES)]; aom_cdf_prob wedge_idx_cdf[BLOCK_SIZES_ALL][CDF_SIZE(16)]; aom_cdf_prob interintra_cdf[BLOCK_SIZE_GROUPS][CDF_SIZE(2)]; aom_cdf_prob wedge_interintra_cdf[BLOCK_SIZES_ALL][CDF_SIZE(2)]; diff --git a/media/libaom/src/av1/common/entropymv.c b/media/libaom/src/av1/common/entropymv.c index 491337387..e1e42f2f1 100644 --- a/media/libaom/src/av1/common/entropymv.c +++ b/media/libaom/src/av1/common/entropymv.c @@ -9,7 +9,7 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ -#include "av1/common/onyxc_int.h" +#include "av1/common/av1_common_int.h" #include "av1/common/entropymv.h" static const nmv_context default_nmv_context = { diff --git a/media/libaom/src/av1/common/entropymv.h b/media/libaom/src/av1/common/entropymv.h index fa818a2c1..cddc80768 100644 --- a/media/libaom/src/av1/common/entropymv.h +++ b/media/libaom/src/av1/common/entropymv.h @@ -30,12 +30,12 @@ void av1_init_mv_probs(struct AV1Common *cm); /* Symbols for coding which components are zero jointly */ #define MV_JOINTS 4 -typedef enum { +enum { MV_JOINT_ZERO = 0, /* Zero vector */ MV_JOINT_HNZVZ = 1, /* Vert zero, hor nonzero */ MV_JOINT_HZVNZ = 2, /* Hor zero, vert nonzero */ MV_JOINT_HNZVNZ = 3, /* Both components nonzero */ -} MV_JOINT_TYPE; +} UENUM1BYTE(MV_JOINT_TYPE); static INLINE int mv_joint_vertical(MV_JOINT_TYPE type) { return type == MV_JOINT_HZVNZ || type == MV_JOINT_HNZVNZ; @@ -47,7 +47,7 @@ static INLINE int mv_joint_horizontal(MV_JOINT_TYPE type) { /* Symbols for coding magnitude class of nonzero components */ #define MV_CLASSES 11 -typedef enum { +enum { MV_CLASS_0 = 0, /* (0, 2] integer pel */ MV_CLASS_1 = 1, /* (2, 4] integer pel */ MV_CLASS_2 = 2, /* (4, 8] integer pel */ @@ -59,7 +59,7 @@ typedef enum { MV_CLASS_8 = 8, /* (256, 512] integer pel */ MV_CLASS_9 = 9, /* (512, 1024] integer pel */ MV_CLASS_10 = 10, /* (1024,2048] integer pel */ -} MV_CLASS_TYPE; +} UENUM1BYTE(MV_CLASS_TYPE); #define CLASS0_BITS 1 /* bits at integer precision for class 0 */ #define CLASS0_SIZE (1 << CLASS0_BITS) @@ -91,11 +91,11 @@ typedef struct { nmv_component comps[2]; } nmv_context; -typedef enum { +enum { MV_SUBPEL_NONE = -1, MV_SUBPEL_LOW_PRECISION = 0, MV_SUBPEL_HIGH_PRECISION, -} MvSubpelPrecision; +} SENUM1BYTE(MvSubpelPrecision); #ifdef __cplusplus } // extern "C" diff --git a/media/libaom/src/av1/common/enums.h b/media/libaom/src/av1/common/enums.h index 869c06ef2..0c09a1bc7 100644 --- a/media/libaom/src/av1/common/enums.h +++ 
b/media/libaom/src/av1/common/enums.h @@ -16,6 +16,7 @@ #include "aom/aom_codec.h" #include "aom/aom_integer.h" +#include "aom_ports/mem.h" #ifdef __cplusplus extern "C" { @@ -63,17 +64,6 @@ extern "C" { #define FRAME_OFFSET_BITS 5 #define MAX_FRAME_DISTANCE ((1 << FRAME_OFFSET_BITS) - 1) -#define REF_FRAMES_LOG2 3 -#define REF_FRAMES (1 << REF_FRAMES_LOG2) - -// 4 scratch frames for the new frames to support a maximum of 4 cores decoding -// in parallel, 3 for scaled references on the encoder. -// TODO(hkuang): Add ondemand frame buffers instead of hardcoding the number -// of framebuffers. -// TODO(jkoleszar): These 3 extra references could probably come from the -// normal reference pool. -#define FRAME_BUFFERS (REF_FRAMES + 7) - // 4 frame filter levels: y plane vertical, y plane horizontal, // u plane, and v plane #define FRAME_LF_COUNT 4 @@ -83,11 +73,6 @@ extern "C" { #define DIST_PRECISION_BITS 4 #define DIST_PRECISION (1 << DIST_PRECISION_BITS) // 16 -// TODO(chengchen): Temporal flag serve as experimental flag for WIP -// bitmask construction. -// Shall be removed when bitmask code is completely checkedin -#define LOOP_FILTER_BITMASK 0 - #define PROFILE_BITS 3 // The following three profiles are currently defined. // Profile 0. 8-bit and 10-bit 4:2:0 and 4:0:0 only. @@ -95,21 +80,12 @@ extern "C" { // Profile 2. 8-bit and 10-bit 4:2:2 // 12-bit 4:0:0, 4:2:2 and 4:4:4 // Since we have three bits for the profiles, it can be extended later. 
-typedef enum BITSTREAM_PROFILE { +enum { PROFILE_0, PROFILE_1, PROFILE_2, MAX_PROFILES, -} BITSTREAM_PROFILE; - -#define LEVEL_MAJOR_BITS 3 -#define LEVEL_MINOR_BITS 2 -#define LEVEL_BITS (LEVEL_MAJOR_BITS + LEVEL_MINOR_BITS) - -#define LEVEL_MAJOR_MIN 2 -#define LEVEL_MAJOR_MAX ((1 << LEVEL_MAJOR_BITS) - 1 + LEVEL_MAJOR_MIN) -#define LEVEL_MINOR_MIN 0 -#define LEVEL_MINOR_MAX ((1 << LEVEL_MINOR_BITS) - 1) +} SENUM1BYTE(BITSTREAM_PROFILE); #define OP_POINTS_CNT_MINUS_1_BITS 5 #define OP_POINTS_IDC_BITS 12 @@ -149,7 +125,28 @@ typedef enum ATTRIBUTE_PACKED { // 4X4, 8X8, 16X16, 32X32, 64X64, 128X128 #define SQR_BLOCK_SIZES 6 -typedef enum ATTRIBUTE_PACKED { +// Partition types. R: Recursive +// +// NONE HORZ VERT SPLIT +// +-------+ +-------+ +---+---+ +---+---+ +// | | | | | | | | R | R | +// | | +-------+ | | | +---+---+ +// | | | | | | | | R | R | +// +-------+ +-------+ +---+---+ +---+---+ +// +// HORZ_A HORZ_B VERT_A VERT_B +// +---+---+ +-------+ +---+---+ +---+---+ +// | | | | | | | | | | | +// +---+---+ +---+---+ +---+ | | +---+ +// | | | | | | | | | | | +// +-------+ +---+---+ +---+---+ +---+---+ +// +// HORZ_4 VERT_4 +// +-----+ +-+-+-+ +// +-----+ | | | | +// +-----+ | | | | +// +-----+ +-+-+-+ +enum { PARTITION_NONE, PARTITION_HORZ, PARTITION_VERT, @@ -163,7 +160,7 @@ typedef enum ATTRIBUTE_PACKED { EXT_PARTITION_TYPES, PARTITION_TYPES = PARTITION_SPLIT + 1, PARTITION_INVALID = 255 -} PARTITION_TYPE; +} UENUM1BYTE(PARTITION_TYPE); typedef char PARTITION_CONTEXT; #define PARTITION_PLOFFSET 4 // number of probability models per block size @@ -171,12 +168,7 @@ typedef char PARTITION_CONTEXT; #define PARTITION_CONTEXTS (PARTITION_BLOCK_SIZES * PARTITION_PLOFFSET) // block transform size -#if defined(_MSC_VER) -typedef uint8_t TX_SIZE; -enum ATTRIBUTE_PACKED { -#else -typedef enum ATTRIBUTE_PACKED { -#endif +enum { TX_4X4, // 4x4 transform TX_8X8, // 8x8 transform TX_16X16, // 16x16 transform @@ -200,11 +192,7 @@ typedef enum ATTRIBUTE_PACKED { TX_SIZES = 
TX_4X8, // Does NOT include rectangular transforms TX_SIZES_LARGEST = TX_64X64, TX_INVALID = 255 // Invalid transform size -#if defined(_MSC_VER) -}; -#else -} TX_SIZE; -#endif +} UENUM1BYTE(TX_SIZE); #define TX_SIZE_LUMA_MIN (TX_4X4) /* We don't need to code a transform size unless the allowed size is at least @@ -226,7 +214,7 @@ typedef enum ATTRIBUTE_PACKED { #define TX_PAD_HOR 4 // Pad 6 extra rows (2 on top and 4 on bottom) to remove vertical availability // check. -#define TX_PAD_TOP 2 +#define TX_PAD_TOP 0 #define TX_PAD_BOTTOM 4 #define TX_PAD_VER (TX_PAD_TOP + TX_PAD_BOTTOM) // Pad 16 extra bytes to avoid reading overflow in SIMD optimization. @@ -238,43 +226,44 @@ typedef enum ATTRIBUTE_PACKED { #define MAX_TX_BLOCKS_IN_MAX_SB (1 << MAX_TX_BLOCKS_IN_MAX_SB_LOG2) // frame transform mode -typedef enum ATTRIBUTE_PACKED { +enum { ONLY_4X4, // use only 4x4 transform TX_MODE_LARGEST, // transform size is the largest possible for pu size TX_MODE_SELECT, // transform specified for each block TX_MODES, -} TX_MODE; +} UENUM1BYTE(TX_MODE); // 1D tx types -typedef enum ATTRIBUTE_PACKED { +enum { DCT_1D, ADST_1D, FLIPADST_1D, IDTX_1D, TX_TYPES_1D, -} TX_TYPE_1D; - -typedef enum ATTRIBUTE_PACKED { - DCT_DCT, // DCT in both horizontal and vertical - ADST_DCT, // ADST in vertical, DCT in horizontal - DCT_ADST, // DCT in vertical, ADST in horizontal - ADST_ADST, // ADST in both directions - FLIPADST_DCT, - DCT_FLIPADST, - FLIPADST_FLIPADST, - ADST_FLIPADST, - FLIPADST_ADST, - IDTX, - V_DCT, - H_DCT, - V_ADST, - H_ADST, - V_FLIPADST, - H_FLIPADST, +} UENUM1BYTE(TX_TYPE_1D); + +enum { + DCT_DCT, // DCT in both horizontal and vertical + ADST_DCT, // ADST in vertical, DCT in horizontal + DCT_ADST, // DCT in vertical, ADST in horizontal + ADST_ADST, // ADST in both directions + FLIPADST_DCT, // FLIPADST in vertical, DCT in horizontal + DCT_FLIPADST, // DCT in vertical, FLIPADST in horizontal + FLIPADST_FLIPADST, // FLIPADST in both directions + ADST_FLIPADST, // ADST in 
vertical, FLIPADST in horizontal + FLIPADST_ADST, // FLIPADST in vertical, ADST in horizontal + IDTX, // Identity in both directions + V_DCT, // DCT in vertical, identity in horizontal + H_DCT, // Identity in vertical, DCT in horizontal + V_ADST, // ADST in vertical, identity in horizontal + H_ADST, // Identity in vertical, ADST in horizontal + V_FLIPADST, // FLIPADST in vertical, identity in horizontal + H_FLIPADST, // Identity in vertical, FLIPADST in horizontal TX_TYPES, -} TX_TYPE; + DCT_ADST_TX_MASK = 0x000F, // Either DCT or ADST in each direction +} UENUM1BYTE(TX_TYPE); -typedef enum ATTRIBUTE_PACKED { +enum { REG_REG, REG_SMOOTH, REG_SHARP, @@ -284,9 +273,9 @@ typedef enum ATTRIBUTE_PACKED { SHARP_REG, SHARP_SMOOTH, SHARP_SHARP, -} DUAL_FILTER_TYPE; +} UENUM1BYTE(DUAL_FILTER_TYPE); -typedef enum ATTRIBUTE_PACKED { +enum { // DCT only EXT_TX_SET_DCTONLY, // DCT + Identity only @@ -300,15 +289,13 @@ typedef enum ATTRIBUTE_PACKED { // Discrete Trig transforms w/ flip (9) + Identity (1) + 1D Hor/Ver (6) EXT_TX_SET_ALL16, EXT_TX_SET_TYPES -} TxSetType; - -#define IS_2D_TRANSFORM(tx_type) (tx_type < IDTX) +} UENUM1BYTE(TxSetType); #define EXT_TX_SIZES 4 // number of sizes that use extended transforms #define EXT_TX_SETS_INTER 4 // Sets of transform selections for INTER #define EXT_TX_SETS_INTRA 3 // Sets of transform selections for INTRA -typedef enum ATTRIBUTE_PACKED { +enum { AOM_LAST_FLAG = 1 << 0, AOM_LAST2_FLAG = 1 << 1, AOM_LAST3_FLAG = 1 << 2, @@ -317,19 +304,15 @@ typedef enum ATTRIBUTE_PACKED { AOM_ALT2_FLAG = 1 << 5, AOM_ALT_FLAG = 1 << 6, AOM_REFFRAME_ALL = (1 << 7) - 1 -} AOM_REFFRAME; +} UENUM1BYTE(AOM_REFFRAME); -typedef enum ATTRIBUTE_PACKED { +enum { UNIDIR_COMP_REFERENCE, BIDIR_COMP_REFERENCE, COMP_REFERENCE_TYPES, -} COMP_REFERENCE_TYPE; +} UENUM1BYTE(COMP_REFERENCE_TYPE); -typedef enum ATTRIBUTE_PACKED { - PLANE_TYPE_Y, - PLANE_TYPE_UV, - PLANE_TYPES -} PLANE_TYPE; +enum { PLANE_TYPE_Y, PLANE_TYPE_UV, PLANE_TYPES } UENUM1BYTE(PLANE_TYPE); 
#define CFL_ALPHABET_SIZE_LOG2 4 #define CFL_ALPHABET_SIZE (1 << CFL_ALPHABET_SIZE_LOG2) @@ -337,24 +320,20 @@ typedef enum ATTRIBUTE_PACKED { #define CFL_IDX_U(idx) (idx >> CFL_ALPHABET_SIZE_LOG2) #define CFL_IDX_V(idx) (idx & (CFL_ALPHABET_SIZE - 1)) -typedef enum ATTRIBUTE_PACKED { - CFL_PRED_U, - CFL_PRED_V, - CFL_PRED_PLANES -} CFL_PRED_TYPE; +enum { CFL_PRED_U, CFL_PRED_V, CFL_PRED_PLANES } UENUM1BYTE(CFL_PRED_TYPE); -typedef enum ATTRIBUTE_PACKED { +enum { CFL_SIGN_ZERO, CFL_SIGN_NEG, CFL_SIGN_POS, CFL_SIGNS -} CFL_SIGN_TYPE; +} UENUM1BYTE(CFL_SIGN_TYPE); -typedef enum ATTRIBUTE_PACKED { +enum { CFL_DISALLOWED, CFL_ALLOWED, CFL_ALLOWED_TYPES -} CFL_ALLOWED_TYPE; +} UENUM1BYTE(CFL_ALLOWED_TYPE); // CFL_SIGN_ZERO,CFL_SIGN_ZERO is invalid #define CFL_JOINT_SIGNS (CFL_SIGNS * CFL_SIGNS - 1) @@ -371,12 +350,12 @@ typedef enum ATTRIBUTE_PACKED { #define CFL_CONTEXT_V(js) \ (CFL_SIGN_V(js) * CFL_SIGNS + CFL_SIGN_U(js) - CFL_SIGNS) -typedef enum ATTRIBUTE_PACKED { +enum { PALETTE_MAP, COLOR_MAP_TYPES, -} COLOR_MAP_TYPE; +} UENUM1BYTE(COLOR_MAP_TYPE); -typedef enum ATTRIBUTE_PACKED { +enum { TWO_COLORS, THREE_COLORS, FOUR_COLORS, @@ -385,9 +364,9 @@ typedef enum ATTRIBUTE_PACKED { SEVEN_COLORS, EIGHT_COLORS, PALETTE_SIZES -} PALETTE_SIZE; +} UENUM1BYTE(PALETTE_SIZE); -typedef enum ATTRIBUTE_PACKED { +enum { PALETTE_COLOR_ONE, PALETTE_COLOR_TWO, PALETTE_COLOR_THREE, @@ -397,11 +376,11 @@ typedef enum ATTRIBUTE_PACKED { PALETTE_COLOR_SEVEN, PALETTE_COLOR_EIGHT, PALETTE_COLORS -} PALETTE_COLOR; +} UENUM1BYTE(PALETTE_COLOR); // Note: All directional predictors must be between V_PRED and D67_PRED (both // inclusive). 
-typedef enum ATTRIBUTE_PACKED { +enum { DC_PRED, // Average of above and left pixels V_PRED, // Vertical H_PRED, // Horizontal @@ -431,6 +410,8 @@ typedef enum ATTRIBUTE_PACKED { MB_MODE_COUNT, INTRA_MODE_START = DC_PRED, INTRA_MODE_END = NEARESTMV, + DIR_MODE_START = V_PRED, + DIR_MODE_END = D67_PRED + 1, INTRA_MODE_NUM = INTRA_MODE_END - INTRA_MODE_START, SINGLE_INTER_MODE_START = NEARESTMV, SINGLE_INTER_MODE_END = NEAREST_NEARESTMV, @@ -442,11 +423,11 @@ typedef enum ATTRIBUTE_PACKED { INTER_MODE_END = MB_MODE_COUNT, INTRA_MODES = PAETH_PRED + 1, // PAETH_PRED has to be the last intra mode. INTRA_INVALID = MB_MODE_COUNT // For uv_mode in inter blocks -} PREDICTION_MODE; +} UENUM1BYTE(PREDICTION_MODE); // TODO(ltrudeau) Do we really want to pack this? // TODO(ltrudeau) Do we match with PREDICTION_MODE? -typedef enum ATTRIBUTE_PACKED { +enum { UV_DC_PRED, // Average of above and left pixels UV_V_PRED, // Vertical UV_H_PRED, // Horizontal @@ -463,38 +444,71 @@ typedef enum ATTRIBUTE_PACKED { UV_CFL_PRED, // Chroma-from-Luma UV_INTRA_MODES, UV_MODE_INVALID, // For uv_mode in inter blocks -} UV_PREDICTION_MODE; +} UENUM1BYTE(UV_PREDICTION_MODE); -typedef enum ATTRIBUTE_PACKED { +enum { SIMPLE_TRANSLATION, OBMC_CAUSAL, // 2-sided OBMC WARPED_CAUSAL, // 2-sided WARPED MOTION_MODES -} MOTION_MODE; +} UENUM1BYTE(MOTION_MODE); -typedef enum ATTRIBUTE_PACKED { +enum { II_DC_PRED, II_V_PRED, II_H_PRED, II_SMOOTH_PRED, INTERINTRA_MODES -} INTERINTRA_MODE; +} UENUM1BYTE(INTERINTRA_MODE); -typedef enum ATTRIBUTE_PACKED { +enum { COMPOUND_AVERAGE, + COMPOUND_DISTWTD, COMPOUND_WEDGE, COMPOUND_DIFFWTD, COMPOUND_TYPES, -} COMPOUND_TYPE; + MASKED_COMPOUND_TYPES = 2, +} UENUM1BYTE(COMPOUND_TYPE); -typedef enum ATTRIBUTE_PACKED { +enum { FILTER_DC_PRED, FILTER_V_PRED, FILTER_H_PRED, FILTER_D157_PRED, FILTER_PAETH_PRED, FILTER_INTRA_MODES, -} FILTER_INTRA_MODE; +} UENUM1BYTE(FILTER_INTRA_MODE); + +enum { + SEQ_LEVEL_2_0, + SEQ_LEVEL_2_1, + SEQ_LEVEL_2_2, + SEQ_LEVEL_2_3, + 
SEQ_LEVEL_3_0, + SEQ_LEVEL_3_1, + SEQ_LEVEL_3_2, + SEQ_LEVEL_3_3, + SEQ_LEVEL_4_0, + SEQ_LEVEL_4_1, + SEQ_LEVEL_4_2, + SEQ_LEVEL_4_3, + SEQ_LEVEL_5_0, + SEQ_LEVEL_5_1, + SEQ_LEVEL_5_2, + SEQ_LEVEL_5_3, + SEQ_LEVEL_6_0, + SEQ_LEVEL_6_1, + SEQ_LEVEL_6_2, + SEQ_LEVEL_6_3, + SEQ_LEVEL_7_0, + SEQ_LEVEL_7_1, + SEQ_LEVEL_7_2, + SEQ_LEVEL_7_3, + SEQ_LEVELS, + SEQ_LEVEL_MAX = 31 +} UENUM1BYTE(AV1_LEVEL); + +#define LEVEL_BITS 5 #define DIRECTIONAL_MODES 8 #define MAX_ANGLE_DELTA 3 @@ -529,7 +543,9 @@ typedef enum ATTRIBUTE_PACKED { #define DELTA_Q_SMALL 3 #define DELTA_Q_PROBS (DELTA_Q_SMALL) -#define DEFAULT_DELTA_Q_RES 4 +#define DEFAULT_DELTA_Q_RES_PERCEPTUAL 4 +#define DEFAULT_DELTA_Q_RES_OBJECTIVE 4 + #define DELTA_LF_SMALL 3 #define DELTA_LF_PROBS (DELTA_LF_SMALL) #define DEFAULT_DELTA_LF_RES 2 @@ -538,6 +554,7 @@ typedef enum ATTRIBUTE_PACKED { #define MAX_MV_REF_CANDIDATES 2 #define MAX_REF_MV_STACK_SIZE 8 +#define USABLE_REF_MV_STACK_SIZE 4 #define REF_CAT_LEVEL 640 #define INTRA_INTER_CONTEXTS 4 @@ -550,28 +567,47 @@ typedef enum ATTRIBUTE_PACKED { #define TXFM_PARTITION_CONTEXTS ((TX_SIZES - TX_8X8) * 6 - 3) typedef uint8_t TXFM_CONTEXT; -#define NONE_FRAME -1 -#define INTRA_FRAME 0 -#define LAST_FRAME 1 -#define LAST2_FRAME 2 -#define LAST3_FRAME 3 -#define GOLDEN_FRAME 4 -#define BWDREF_FRAME 5 -#define ALTREF2_FRAME 6 -#define ALTREF_FRAME 7 -#define EXTREF_FRAME REF_FRAMES -#define LAST_REF_FRAMES (LAST3_FRAME - LAST_FRAME + 1) - -#define INTER_REFS_PER_FRAME (ALTREF_FRAME - LAST_FRAME + 1) - -#define FWD_REFS (GOLDEN_FRAME - LAST_FRAME + 1) +// An enum for single reference types (and some derived values). +enum { + NONE_FRAME = -1, + INTRA_FRAME, + LAST_FRAME, + LAST2_FRAME, + LAST3_FRAME, + GOLDEN_FRAME, + BWDREF_FRAME, + ALTREF2_FRAME, + ALTREF_FRAME, + REF_FRAMES, + + // Extra/scratch reference frame. 
It may be: + // - used to update the ALTREF2_FRAME ref (see lshift_bwd_ref_frames()), or + // - updated from ALTREF2_FRAME ref (see rshift_bwd_ref_frames()). + EXTREF_FRAME = REF_FRAMES, + + // Number of inter (non-intra) reference types. + INTER_REFS_PER_FRAME = ALTREF_FRAME - LAST_FRAME + 1, + + // Number of forward (aka past) reference types. + FWD_REFS = GOLDEN_FRAME - LAST_FRAME + 1, + + // Number of backward (aka future) reference types. + BWD_REFS = ALTREF_FRAME - BWDREF_FRAME + 1, + + SINGLE_REFS = FWD_REFS + BWD_REFS, +}; + +#define REF_FRAMES_LOG2 3 + +// REF_FRAMES for the cm->ref_frame_map array, 1 scratch frame for the new +// frame in cm->cur_frame, INTER_REFS_PER_FRAME for scaled references on the +// encoder in the cpi->scaled_ref_buf array. +#define FRAME_BUFFERS (REF_FRAMES + 1 + INTER_REFS_PER_FRAME) + #define FWD_RF_OFFSET(ref) (ref - LAST_FRAME) -#define BWD_REFS (ALTREF_FRAME - BWDREF_FRAME + 1) #define BWD_RF_OFFSET(ref) (ref - BWDREF_FRAME) -#define SINGLE_REFS (FWD_REFS + BWD_REFS) - -typedef enum ATTRIBUTE_PACKED { +enum { LAST_LAST2_FRAMES, // { LAST_FRAME, LAST2_FRAME } LAST_LAST3_FRAMES, // { LAST_FRAME, LAST3_FRAME } LAST_GOLDEN_FRAMES, // { LAST_FRAME, GOLDEN_FRAME } @@ -585,7 +621,7 @@ typedef enum ATTRIBUTE_PACKED { // NOTE: UNIDIR_COMP_REFS is the number of uni-directional reference pairs // that are explicitly signaled. UNIDIR_COMP_REFS = BWDREF_ALTREF_FRAMES + 1, -} UNIDIR_COMP_REF; +} UENUM1BYTE(UNIDIR_COMP_REF); #define TOTAL_COMP_REFS (FWD_REFS * BWD_REFS + TOTAL_UNIDIR_COMP_REFS) @@ -596,14 +632,37 @@ typedef enum ATTRIBUTE_PACKED { // possible to have a reference pair not listed for explicit signaling. #define MODE_CTX_REF_FRAMES (REF_FRAMES + TOTAL_COMP_REFS) -typedef enum ATTRIBUTE_PACKED { +// Note: It includes single and compound references. So, it can take values from +// NONE_FRAME to (MODE_CTX_REF_FRAMES - 1). Hence, it is not defined as an enum. 
+typedef int8_t MV_REFERENCE_FRAME; + +enum { RESTORE_NONE, RESTORE_WIENER, RESTORE_SGRPROJ, RESTORE_SWITCHABLE, RESTORE_SWITCHABLE_TYPES = RESTORE_SWITCHABLE, RESTORE_TYPES = 4, -} RestorationType; +} UENUM1BYTE(RestorationType); + +// Picture prediction structures (0-12 are predefined) in scalability metadata. +enum { + SCALABILITY_L1T2 = 0, + SCALABILITY_L1T3 = 1, + SCALABILITY_L2T1 = 2, + SCALABILITY_L2T2 = 3, + SCALABILITY_L2T3 = 4, + SCALABILITY_S2T1 = 5, + SCALABILITY_S2T2 = 6, + SCALABILITY_S2T3 = 7, + SCALABILITY_L2T1h = 8, + SCALABILITY_L2T2h = 9, + SCALABILITY_L2T3h = 10, + SCALABILITY_S2T1h = 11, + SCALABILITY_S2T2h = 12, + SCALABILITY_S2T3h = 13, + SCALABILITY_SS = 14 +} UENUM1BYTE(SCALABILITY_STRUCTURES); #define SUPERRES_SCALE_BITS 3 #define SUPERRES_SCALE_DENOMINATOR_MIN (SCALE_NUMERATOR + 1) diff --git a/media/libaom/src/av1/common/filter.h b/media/libaom/src/av1/common/filter.h index 571422d11..91791d3dc 100644 --- a/media/libaom/src/av1/common/filter.h +++ b/media/libaom/src/av1/common/filter.h @@ -19,6 +19,7 @@ #include "aom/aom_integer.h" #include "aom_dsp/aom_filter.h" #include "aom_ports/mem.h" +#include "av1/common/enums.h" #ifdef __cplusplus extern "C" { @@ -35,29 +36,55 @@ typedef enum ATTRIBUTE_PACKED { SWITCHABLE_FILTERS = BILINEAR, SWITCHABLE = SWITCHABLE_FILTERS + 1, /* the last switchable one */ EXTRA_FILTERS = INTERP_FILTERS_ALL - SWITCHABLE_FILTERS, + INTERP_INVALID = 0xff, } InterpFilter; -// With CONFIG_DUAL_FILTER, pack two InterpFilter's into a uint32_t: since -// there are at most 10 filters, we can use 16 bits for each and have more than -// enough space. This reduces argument passing and unifies the operation of -// setting a (pair of) filters. -// -// Without CONFIG_DUAL_FILTER, -typedef uint32_t InterpFilters; -static INLINE InterpFilter av1_extract_interp_filter(InterpFilters filters, - int x_filter) { - return (InterpFilter)((filters >> (x_filter ? 
16 : 0)) & 0xf); -} +enum { + USE_2_TAPS_ORIG = 0, // This is used in temporal filtering. + USE_2_TAPS, + USE_4_TAPS, + USE_8_TAPS, +} UENUM1BYTE(SUBPEL_SEARCH_TYPE); + +enum { + INTERP_EVAL_LUMA_EVAL_CHROMA = 0, + INTERP_SKIP_LUMA_EVAL_CHROMA, + INTERP_EVAL_INVALID, + INTERP_SKIP_LUMA_SKIP_CHROMA, +} UENUM1BYTE(INTERP_EVAL_PLANE); + +enum { + INTERP_HORZ_NEQ_VERT_NEQ = 0, + INTERP_HORZ_EQ_VERT_NEQ, + INTERP_HORZ_NEQ_VERT_EQ, + INTERP_HORZ_EQ_VERT_EQ, + INTERP_PRED_TYPE_ALL, +} UENUM1BYTE(INTERP_PRED_TYPE); +// Pack two InterpFilter's into a uint32_t: since there are at most 10 filters, +// we can use 16 bits for each and have more than enough space. This reduces +// argument passing and unifies the operation of setting a (pair of) filters. +typedef struct InterpFilters { + uint16_t y_filter; + uint16_t x_filter; +} InterpFilters; -static INLINE InterpFilters av1_make_interp_filters(InterpFilter y_filter, - InterpFilter x_filter) { - uint16_t y16 = y_filter & 0xf; - uint16_t x16 = x_filter & 0xf; - return y16 | ((uint32_t)x16 << 16); +typedef union int_interpfilters { + uint32_t as_int; + InterpFilters as_filters; +} int_interpfilters; + +static INLINE InterpFilter av1_extract_interp_filter(int_interpfilters filters, + int dir) { + return (InterpFilter)((dir) ? 
filters.as_filters.x_filter + : filters.as_filters.y_filter); } -static INLINE InterpFilters av1_broadcast_interp_filter(InterpFilter filter) { - return av1_make_interp_filters(filter, filter); +static INLINE int_interpfilters +av1_broadcast_interp_filter(InterpFilter filter) { + int_interpfilters filters; + filters.as_filters.x_filter = filter; + filters.as_filters.y_filter = filter; + return filters; } static INLINE InterpFilter av1_unswitchable_filter(InterpFilter filter) { @@ -67,10 +94,10 @@ static INLINE InterpFilter av1_unswitchable_filter(InterpFilter filter) { /* (1 << LOG_SWITCHABLE_FILTERS) > SWITCHABLE_FILTERS */ #define LOG_SWITCHABLE_FILTERS 2 -#define MAX_SUBPEL_TAPS 12 #define SWITCHABLE_FILTER_CONTEXTS ((SWITCHABLE_FILTERS + 1) * 4) #define INTER_FILTER_COMP_OFFSET (SWITCHABLE_FILTERS + 1) #define INTER_FILTER_DIR_OFFSET ((SWITCHABLE_FILTERS + 1) * 2) +#define ALLOW_ALL_INTERP_FILT_MASK (0x01ff) typedef struct InterpFilterParams { const int16_t *filter_ptr; @@ -141,9 +168,10 @@ static const InterpFilterParams // A special 2-tap bilinear filter for IntraBC chroma. IntraBC uses full pixel // MV for luma. If sub-sampling exists, chroma may possibly use half-pel MV. 
-DECLARE_ALIGNED(256, static const int16_t, av1_intrabc_bilinear_filter[2]) = { - 64, - 64, +DECLARE_ALIGNED(256, static const int16_t, + av1_intrabc_bilinear_filter[2 * SUBPEL_SHIFTS]) = { + 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 64, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }; static const InterpFilterParams av1_intrabc_filter_params = { @@ -173,6 +201,16 @@ DECLARE_ALIGNED(256, static const InterpKernel, { 0, 0, 4, 36, 62, 26, 0, 0 }, { 0, 0, 2, 34, 62, 30, 0, 0 } }; +static const uint16_t + av1_interp_dual_filt_mask[INTERP_PRED_TYPE_ALL - 2][SWITCHABLE_FILTERS] = { + { (1 << REG_REG) | (1 << SMOOTH_REG) | (1 << SHARP_REG), + (1 << REG_SMOOTH) | (1 << SMOOTH_SMOOTH) | (1 << SHARP_SMOOTH), + (1 << REG_SHARP) | (1 << SMOOTH_SHARP) | (1 << SHARP_SHARP) }, + { (1 << REG_REG) | (1 << REG_SMOOTH) | (1 << REG_SHARP), + (1 << SMOOTH_REG) | (1 << SMOOTH_SMOOTH) | (1 << SMOOTH_SHARP), + (1 << SHARP_REG) | (1 << SHARP_SMOOTH) | (1 << SHARP_SHARP) } + }; + // For w<=4, MULTITAP_SHARP is the same as EIGHTTAP_REGULAR static const InterpFilterParams av1_interp_4tap[SWITCHABLE_FILTERS + 1] = { { (const int16_t *)av1_sub_pel_filters_4, SUBPEL_TAPS, SUBPEL_SHIFTS, @@ -192,14 +230,14 @@ av1_get_interp_filter_params_with_block_size(const InterpFilter interp_filter, return &av1_interp_filter_params_list[interp_filter]; } -static INLINE const InterpFilterParams *av1_get_4tap_interp_filter_params( - const InterpFilter interp_filter) { - return &av1_interp_4tap[interp_filter]; -} - static INLINE const int16_t *av1_get_interp_filter_kernel( - const InterpFilter interp_filter) { - return av1_interp_filter_params_list[interp_filter].filter_ptr; + const InterpFilter interp_filter, int subpel_search) { + assert(subpel_search >= USE_2_TAPS); + return (subpel_search == USE_2_TAPS) + ? av1_interp_4tap[BILINEAR].filter_ptr + : ((subpel_search == USE_4_TAPS) + ? 
av1_interp_4tap[interp_filter].filter_ptr + : av1_interp_filter_params_list[interp_filter].filter_ptr); } static INLINE const int16_t *av1_get_interp_filter_subpel_kernel( @@ -207,6 +245,33 @@ static INLINE const int16_t *av1_get_interp_filter_subpel_kernel( return filter_params->filter_ptr + filter_params->taps * subpel; } +static INLINE const InterpFilterParams *av1_get_filter(int subpel_search) { + assert(subpel_search >= USE_2_TAPS); + + switch (subpel_search) { + case USE_2_TAPS: return &av1_interp_4tap[BILINEAR]; + case USE_4_TAPS: return &av1_interp_4tap[EIGHTTAP_REGULAR]; + case USE_8_TAPS: return &av1_interp_filter_params_list[EIGHTTAP_REGULAR]; + default: assert(0); return NULL; + } +} + +static INLINE void reset_interp_filter_allowed_mask( + uint16_t *allow_interp_mask, DUAL_FILTER_TYPE filt_type) { + uint16_t tmp = (~(1 << filt_type)) & 0xffff; + *allow_interp_mask &= (tmp & ALLOW_ALL_INTERP_FILT_MASK); +} + +static INLINE void set_interp_filter_allowed_mask(uint16_t *allow_interp_mask, + DUAL_FILTER_TYPE filt_type) { + *allow_interp_mask |= (1 << filt_type); +} + +static INLINE uint8_t get_interp_filter_allowed_mask( + uint16_t allow_interp_mask, DUAL_FILTER_TYPE filt_type) { + return (allow_interp_mask >> filt_type) & 1; +} + #ifdef __cplusplus } // extern "C" #endif diff --git a/media/libaom/src/av1/common/frame_buffers.c b/media/libaom/src/av1/common/frame_buffers.c index fd6c4bc79..f10ccd594 100644 --- a/media/libaom/src/av1/common/frame_buffers.c +++ b/media/libaom/src/av1/common/frame_buffers.c @@ -22,7 +22,11 @@ int av1_alloc_internal_frame_buffers(InternalFrameBufferList *list) { AOM_MAXIMUM_REF_BUFFERS + AOM_MAXIMUM_WORK_BUFFERS; list->int_fb = (InternalFrameBuffer *)aom_calloc( list->num_internal_frame_buffers, sizeof(*list->int_fb)); - return (list->int_fb == NULL); + if (list->int_fb == NULL) { + list->num_internal_frame_buffers = 0; + return 1; + } + return 0; } void av1_free_internal_frame_buffers(InternalFrameBufferList *list) { @@ -36,6 
+40,7 @@ void av1_free_internal_frame_buffers(InternalFrameBufferList *list) { } aom_free(list->int_fb); list->int_fb = NULL; + list->num_internal_frame_buffers = 0; } void av1_zero_unused_internal_frame_buffers(InternalFrameBufferList *list) { @@ -69,7 +74,10 @@ int av1_get_frame_buffer(void *cb_priv, size_t min_size, // due to access uninitialized memory in frame border. It could be // skipped if border were totally removed. int_fb_list->int_fb[i].data = (uint8_t *)aom_calloc(1, min_size); - if (!int_fb_list->int_fb[i].data) return -1; + if (!int_fb_list->int_fb[i].data) { + int_fb_list->int_fb[i].size = 0; + return -1; + } int_fb_list->int_fb[i].size = min_size; } @@ -86,6 +94,5 @@ int av1_release_frame_buffer(void *cb_priv, aom_codec_frame_buffer_t *fb) { InternalFrameBuffer *const int_fb = (InternalFrameBuffer *)fb->priv; (void)cb_priv; if (int_fb) int_fb->in_use = 0; - fb->priv = NULL; return 0; } diff --git a/media/libaom/src/av1/common/idct.c b/media/libaom/src/av1/common/idct.c index 2c1cb9827..bff438f3c 100644 --- a/media/libaom/src/av1/common/idct.c +++ b/media/libaom/src/av1/common/idct.c @@ -56,87 +56,87 @@ void av1_highbd_inv_txfm_add_4x4_c(const tran_low_t *input, uint8_t *dest, av1_inv_txfm2d_add_4x4_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, bd); } -void av1_highbd_inv_txfm_add_4x8(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { +void av1_highbd_inv_txfm_add_4x8_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]); const int32_t *src = cast_to_int32(input); av1_inv_txfm2d_add_4x8_c(src, CONVERT_TO_SHORTPTR(dest), stride, txfm_param->tx_type, txfm_param->bd); } -void av1_highbd_inv_txfm_add_8x4(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { +void av1_highbd_inv_txfm_add_8x4_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { 
assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]); const int32_t *src = cast_to_int32(input); av1_inv_txfm2d_add_8x4_c(src, CONVERT_TO_SHORTPTR(dest), stride, txfm_param->tx_type, txfm_param->bd); } -void av1_highbd_inv_txfm_add_16x32(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { +void av1_highbd_inv_txfm_add_16x32_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { const int32_t *src = cast_to_int32(input); av1_inv_txfm2d_add_16x32_c(src, CONVERT_TO_SHORTPTR(dest), stride, txfm_param->tx_type, txfm_param->bd); } -void av1_highbd_inv_txfm_add_32x16(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { +void av1_highbd_inv_txfm_add_32x16_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { const int32_t *src = cast_to_int32(input); av1_inv_txfm2d_add_32x16_c(src, CONVERT_TO_SHORTPTR(dest), stride, txfm_param->tx_type, txfm_param->bd); } -void av1_highbd_inv_txfm_add_16x4(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { +void av1_highbd_inv_txfm_add_16x4_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { const int32_t *src = cast_to_int32(input); av1_inv_txfm2d_add_16x4_c(src, CONVERT_TO_SHORTPTR(dest), stride, txfm_param->tx_type, txfm_param->bd); } -void av1_highbd_inv_txfm_add_4x16(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { +void av1_highbd_inv_txfm_add_4x16_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { const int32_t *src = cast_to_int32(input); av1_inv_txfm2d_add_4x16_c(src, CONVERT_TO_SHORTPTR(dest), stride, txfm_param->tx_type, txfm_param->bd); } -void av1_highbd_inv_txfm_add_32x8(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { +void av1_highbd_inv_txfm_add_32x8_c(const tran_low_t *input, uint8_t *dest, + int stride, 
const TxfmParam *txfm_param) { const int32_t *src = cast_to_int32(input); av1_inv_txfm2d_add_32x8_c(src, CONVERT_TO_SHORTPTR(dest), stride, txfm_param->tx_type, txfm_param->bd); } -void av1_highbd_inv_txfm_add_8x32(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { +void av1_highbd_inv_txfm_add_8x32_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { const int32_t *src = cast_to_int32(input); av1_inv_txfm2d_add_8x32_c(src, CONVERT_TO_SHORTPTR(dest), stride, txfm_param->tx_type, txfm_param->bd); } -void av1_highbd_inv_txfm_add_32x64(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { +void av1_highbd_inv_txfm_add_32x64_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { const int32_t *src = cast_to_int32(input); av1_inv_txfm2d_add_32x64_c(src, CONVERT_TO_SHORTPTR(dest), stride, txfm_param->tx_type, txfm_param->bd); } -void av1_highbd_inv_txfm_add_64x32(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { +void av1_highbd_inv_txfm_add_64x32_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { const int32_t *src = cast_to_int32(input); av1_inv_txfm2d_add_64x32_c(src, CONVERT_TO_SHORTPTR(dest), stride, txfm_param->tx_type, txfm_param->bd); } -void av1_highbd_inv_txfm_add_16x64(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { +void av1_highbd_inv_txfm_add_16x64_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { const int32_t *src = cast_to_int32(input); av1_inv_txfm2d_add_16x64_c(src, CONVERT_TO_SHORTPTR(dest), stride, txfm_param->tx_type, txfm_param->bd); } -void av1_highbd_inv_txfm_add_64x16(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { +void av1_highbd_inv_txfm_add_64x16_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { 
const int32_t *src = cast_to_int32(input); av1_inv_txfm2d_add_64x16_c(src, CONVERT_TO_SHORTPTR(dest), stride, txfm_param->tx_type, txfm_param->bd); @@ -204,7 +204,7 @@ static void init_txfm_param(const MACROBLOCKD *xd, int plane, TX_SIZE tx_size, txfm_param->eob = eob; txfm_param->lossless = xd->lossless[xd->mi[0]->segment_id]; txfm_param->bd = xd->bd; - txfm_param->is_hbd = get_bitdepth_data_path_index(xd); + txfm_param->is_hbd = is_cur_buf_hbd(xd); txfm_param->tx_set_type = av1_get_ext_tx_set_type( txfm_param->tx_size, is_inter_block(xd->mi[0]), reduced_tx_set); } @@ -224,10 +224,10 @@ void av1_highbd_inv_txfm_add_c(const tran_low_t *input, uint8_t *dest, av1_highbd_inv_txfm_add_8x8_c(input, dest, stride, txfm_param); break; case TX_4X8: - av1_highbd_inv_txfm_add_4x8(input, dest, stride, txfm_param); + av1_highbd_inv_txfm_add_4x8_c(input, dest, stride, txfm_param); break; case TX_8X4: - av1_highbd_inv_txfm_add_8x4(input, dest, stride, txfm_param); + av1_highbd_inv_txfm_add_8x4_c(input, dest, stride, txfm_param); break; case TX_8X16: av1_highbd_inv_txfm_add_8x16_c(input, dest, stride, txfm_param); @@ -236,25 +236,25 @@ void av1_highbd_inv_txfm_add_c(const tran_low_t *input, uint8_t *dest, av1_highbd_inv_txfm_add_16x8_c(input, dest, stride, txfm_param); break; case TX_16X32: - av1_highbd_inv_txfm_add_16x32(input, dest, stride, txfm_param); + av1_highbd_inv_txfm_add_16x32_c(input, dest, stride, txfm_param); break; case TX_32X16: - av1_highbd_inv_txfm_add_32x16(input, dest, stride, txfm_param); + av1_highbd_inv_txfm_add_32x16_c(input, dest, stride, txfm_param); break; case TX_64X64: av1_highbd_inv_txfm_add_64x64_c(input, dest, stride, txfm_param); break; case TX_32X64: - av1_highbd_inv_txfm_add_32x64(input, dest, stride, txfm_param); + av1_highbd_inv_txfm_add_32x64_c(input, dest, stride, txfm_param); break; case TX_64X32: - av1_highbd_inv_txfm_add_64x32(input, dest, stride, txfm_param); + av1_highbd_inv_txfm_add_64x32_c(input, dest, stride, txfm_param); break; case 
TX_16X64: - av1_highbd_inv_txfm_add_16x64(input, dest, stride, txfm_param); + av1_highbd_inv_txfm_add_16x64_c(input, dest, stride, txfm_param); break; case TX_64X16: - av1_highbd_inv_txfm_add_64x16(input, dest, stride, txfm_param); + av1_highbd_inv_txfm_add_64x16_c(input, dest, stride, txfm_param); break; case TX_4X4: // this is like av1_short_idct4x4 but has a special case around eob<=1 @@ -263,16 +263,16 @@ void av1_highbd_inv_txfm_add_c(const tran_low_t *input, uint8_t *dest, av1_highbd_inv_txfm_add_4x4_c(input, dest, stride, txfm_param); break; case TX_16X4: - av1_highbd_inv_txfm_add_16x4(input, dest, stride, txfm_param); + av1_highbd_inv_txfm_add_16x4_c(input, dest, stride, txfm_param); break; case TX_4X16: - av1_highbd_inv_txfm_add_4x16(input, dest, stride, txfm_param); + av1_highbd_inv_txfm_add_4x16_c(input, dest, stride, txfm_param); break; case TX_8X32: - av1_highbd_inv_txfm_add_8x32(input, dest, stride, txfm_param); + av1_highbd_inv_txfm_add_8x32_c(input, dest, stride, txfm_param); break; case TX_32X8: - av1_highbd_inv_txfm_add_32x8(input, dest, stride, txfm_param); + av1_highbd_inv_txfm_add_32x8_c(input, dest, stride, txfm_param); break; default: assert(0 && "Invalid transform size"); break; } diff --git a/media/libaom/src/av1/common/idct.h b/media/libaom/src/av1/common/idct.h index d9454e73f..004d25d49 100644 --- a/media/libaom/src/av1/common/idct.h +++ b/media/libaom/src/av1/common/idct.h @@ -44,22 +44,6 @@ static INLINE const int32_t *cast_to_int32(const tran_low_t *input) { return (const int32_t *)input; } -typedef void(highbd_inv_txfm_add)(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *param); - -highbd_inv_txfm_add av1_highbd_inv_txfm_add_4x8; -highbd_inv_txfm_add av1_highbd_inv_txfm_add_8x4; -highbd_inv_txfm_add av1_highbd_inv_txfm_add_16x32; -highbd_inv_txfm_add av1_highbd_inv_txfm_add_32x16; -highbd_inv_txfm_add av1_highbd_inv_txfm_add_32x64; -highbd_inv_txfm_add av1_highbd_inv_txfm_add_64x32; -highbd_inv_txfm_add 
av1_highbd_inv_txfm_add_16x64; -highbd_inv_txfm_add av1_highbd_inv_txfm_add_64x16; -highbd_inv_txfm_add av1_highbd_inv_txfm_add_16x4; -highbd_inv_txfm_add av1_highbd_inv_txfm_add_4x16; -highbd_inv_txfm_add av1_highbd_inv_txfm_add_8x32; -highbd_inv_txfm_add av1_highbd_inv_txfm_add_32x8; - #ifdef __cplusplus } // extern "C" #endif diff --git a/media/libaom/src/av1/common/loopfiltermask.c b/media/libaom/src/av1/common/loopfiltermask.c new file mode 100644 index 000000000..157310f2d --- /dev/null +++ b/media/libaom/src/av1/common/loopfiltermask.c @@ -0,0 +1,1458 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <math.h> + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_mem/aom_mem.h" +#include "aom_ports/mem.h" +#include "av1/common/av1_common_int.h" +#include "av1/common/av1_loopfilter.h" +#include "av1/common/reconinter.h" +#include "av1/common/seg_common.h" + +// 256 bit masks (64x64 / 4x4) for left transform size for Y plane. +// We use 4 uint64_t to represent the 256 bit. +// Each 1 represents a position where we should apply a loop filter +// across the left border of an 4x4 block boundary. 
+// +// In the case of TX_8x8-> ( in low order byte first we end up with +// a mask that looks like this (-- and | are used for better view) +// +// 10101010|10101010 +// 10101010|10101010 +// 10101010|10101010 +// 10101010|10101010 +// 10101010|10101010 +// 10101010|10101010 +// 10101010|10101010 +// 10101010|10101010 +// ----------------- +// 10101010|10101010 +// 10101010|10101010 +// 10101010|10101010 +// 10101010|10101010 +// 10101010|10101010 +// 10101010|10101010 +// 10101010|10101010 +// 10101010|10101010 +// +// A loopfilter should be applied to every other 4x4 horizontally. + +// 256 bit masks (64x64 / 4x4) for above transform size for Y plane. +// We use 4 uint64_t to represent the 256 bit. +// Each 1 represents a position where we should apply a loop filter +// across the top border of an 4x4 block boundary. +// +// In the case of TX_8x8-> ( in low order byte first we end up with +// a mask that looks like this +// +// 11111111|11111111 +// 00000000|00000000 +// 11111111|11111111 +// 00000000|00000000 +// 11111111|11111111 +// 00000000|00000000 +// 11111111|11111111 +// 00000000|00000000 +// ----------------- +// 11111111|11111111 +// 00000000|00000000 +// 11111111|11111111 +// 00000000|00000000 +// 11111111|11111111 +// 00000000|00000000 +// 11111111|11111111 +// 00000000|00000000 +// +// A loopfilter should be applied to every other 4x4 horizontally. 
+#if CONFIG_LPF_MASK +static const int mask_id_table_tx_4x4[BLOCK_SIZES_ALL] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, -1, -1, -1, 13, 14, 15, 16, 17, 18 +}; + +static const int mask_id_table_tx_8x8[BLOCK_SIZES_ALL] = { + -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, 10, 11, 12, 13 +}; + +static const int mask_id_table_tx_16x16[BLOCK_SIZES_ALL] = { + -1, -1, -1, -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, -1, -1, -1, -1, -1, -1, -1, 7, 8 +}; + +static const int mask_id_table_tx_32x32[BLOCK_SIZES_ALL] = { + -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 1, + 2, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1 +}; +static const int mask_id_table_vert_border[BLOCK_SIZES_ALL] = { + 0, 47, 49, 19, 51, 53, 33, 55, 57, 42, 59, + 60, 46, -1, -1, -1, 61, 62, 63, 64, 65, 66 +}; + +static const FilterMask left_mask_univariant_reordered[67] = { + // TX_4X4 + { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 4X4, TX_4X4 + { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 4X8, TX_4X4 + { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X4, TX_4X4 + { { 0x0000000000030003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X8, TX_4X4 + { { 0x0003000300030003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X16, TX_4X4 + { { 0x00000000000f000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X8, TX_4X4 + { { 0x000f000f000f000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X16, TX_4X4 + { { 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X32, TX_4X4 + { { 0x00ff00ff00ff00ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, 
// block size 32X16, TX_4X4 + { { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X32, TX_4X4 + { { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, + 0x00ff00ff00ff00ffULL } }, // block size 32X64, TX_4X4 + { { 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X32, TX_4X4 + { { 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL, + 0xffffffffffffffffULL } }, // block size 64X64, TX_4X4 + { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 4X16, TX_4X4 + { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X4, TX_4X4 + { { 0x0003000300030003ULL, 0x0003000300030003ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X32, TX_4X4 + { { 0x0000000000ff00ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X8, TX_4X4 + { { 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x000f000f000f000fULL, + 0x000f000f000f000fULL } }, // block size 16X64, TX_4X4 + { { 0xffffffffffffffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X16, TX_4X4 + // TX_8X8 + { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X8, TX_8X8 + { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X16, TX_8X8 + { { 0x0000000000050005ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X8, TX_8X8 + { { 0x0005000500050005ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X16, TX_8X8 + { { 0x0005000500050005ULL, 0x0005000500050005ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 
16X32, TX_8X8 + { { 0x0055005500550055ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X16, TX_8X8 + { { 0x0055005500550055ULL, 0x0055005500550055ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X32, TX_8X8 + { { 0x0055005500550055ULL, 0x0055005500550055ULL, 0x0055005500550055ULL, + 0x0055005500550055ULL } }, // block size 32X64, TX_8X8 + { { 0x5555555555555555ULL, 0x5555555555555555ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X32, TX_8X8 + { { 0x5555555555555555ULL, 0x5555555555555555ULL, 0x5555555555555555ULL, + 0x5555555555555555ULL } }, // block size 64X64, TX_8X8 + { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X32, TX_8X8 + { { 0x0000000000550055ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X8, TX_8X8 + { { 0x0005000500050005ULL, 0x0005000500050005ULL, 0x0005000500050005ULL, + 0x0005000500050005ULL } }, // block size 16X64, TX_8X8 + { { 0x5555555555555555ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X16, TX_8X8 + // TX_16X16 + { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X16, TX_16X16 + { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X32, TX_16X16 + { { 0x0011001100110011ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X16, TX_16X16 + { { 0x0011001100110011ULL, 0x0011001100110011ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X32, TX_16X16 + { { 0x0011001100110011ULL, 0x0011001100110011ULL, 0x0011001100110011ULL, + 0x0011001100110011ULL } }, // block size 32X64, TX_16X16 + { { 0x1111111111111111ULL, 0x1111111111111111ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block 
size 64X32, TX_16X16 + { { 0x1111111111111111ULL, 0x1111111111111111ULL, 0x1111111111111111ULL, + 0x1111111111111111ULL } }, // block size 64X64, TX_16X16 + { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL, + 0x0001000100010001ULL } }, // block size 16X64, TX_16X16 + { { 0x1111111111111111ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X16, TX_16X16 + // TX_32X32 + { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X32, TX_32X32 + { { 0x0101010101010101ULL, 0x0101010101010101ULL, 0x0101010101010101ULL, + 0x0101010101010101ULL } }, // block size 32X64, TX_32X32 + { { 0x0101010101010101ULL, 0x0101010101010101ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X32, TX_32X32 + { { 0x0101010101010101ULL, 0x0101010101010101ULL, 0x0101010101010101ULL, + 0x0101010101010101ULL } }, // block size 64X64, TX_32X32 + // TX_64X64 + { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL, + 0x0001000100010001ULL } }, // block size 64X64, TX_64X64 + // 2:1, 1:2 transform sizes. 
+ { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 4X8, TX_4X8 + { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 4X16, TX_4X8 + { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X4, TX_8X4 + { { 0x0000000000000005ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X4, TX_8X4 + { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X16, TX_8X16 + { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X32, TX_8X16 + { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X8, TX_16X8 + { { 0x0000000000110011ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X8, TX_16X8 + { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X32, TX_16X32 + { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL, + 0x0001000100010001ULL } }, // block size 16X64, TX_16X32 + { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X16, TX_32X16 + { { 0x0101010101010101ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X16, TX_32X16 + { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL, + 0x0001000100010001ULL } }, // block size 32X64, TX_32X64 + { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X32, TX_64X32 + // 4:1, 1:4 transform sizes. 
+ { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 4X16, TX_4X16 + { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X4, TX_16X4 + { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X32, TX_8X32 + { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X8, TX_32X8 + { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL, + 0x0001000100010001ULL } }, // block size 16X64, TX_16X64 + { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X16, TX_64X16 +}; + +static const FilterMask above_mask_univariant_reordered[67] = { + // TX_4X4 + { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 4X4, TX_4X4 + { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 4X8, TX_4X4 + { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X4, TX_4X4 + { { 0x0000000000030003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X8, TX_4X4 + { { 0x0003000300030003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X16, TX_4X4 + { { 0x00000000000f000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X8, TX_4X4 + { { 0x000f000f000f000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X16, TX_4X4 + { { 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X32, TX_4X4 + { { 0x00ff00ff00ff00ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, 
+ 0x0000000000000000ULL } }, // block size 32X16, TX_4X4 + { { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X32, TX_4X4 + { { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, + 0x00ff00ff00ff00ffULL } }, // block size 32X64, TX_4X4 + { { 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X32, TX_4X4 + { { 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL, + 0xffffffffffffffffULL } }, // block size 64X64, TX_4x4 + { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 4X16, TX_4X4 + { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X4, TX_4X4 + { { 0x0003000300030003ULL, 0x0003000300030003ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X32, TX_4X4 + { { 0x0000000000ff00ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X8, TX_4X4 + { { 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x000f000f000f000fULL, + 0x000f000f000f000fULL } }, // block size 16X64, TX_4X4 + { { 0xffffffffffffffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X16, TX_4X4 + // TX_8X8 + { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X8, TX_8X8 + { { 0x0000000300000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X16, TX_8X8 + { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X8, TX_8X8 + { { 0x0000000f0000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X16, TX_8X8 + { { 0x0000000f0000000fULL, 0x0000000f0000000fULL, 0x0000000000000000ULL, + 
0x0000000000000000ULL } }, // block size 16X32, TX_8X8 + { { 0x000000ff000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X16, TX_8X8 + { { 0x000000ff000000ffULL, 0x000000ff000000ffULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X32, TX_8X8 + { { 0x000000ff000000ffULL, 0x000000ff000000ffULL, 0x000000ff000000ffULL, + 0x000000ff000000ffULL } }, // block size 32X64, TX_8X8 + { { 0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X32, TX_8X8 + { { 0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL, + 0x0000ffff0000ffffULL } }, // block size 64X64, TX_8X8 + { { 0x0000000300000003ULL, 0x0000000300000003ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X32, TX_8X8 + { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X8, TX_8X8 + { { 0x0000000f0000000fULL, 0x0000000f0000000fULL, 0x0000000f0000000fULL, + 0x0000000f0000000fULL } }, // block size 16X64, TX_8X8 + { { 0x0000ffff0000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X16, TX_8X8 + // TX_16X16 + { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X16, TX_16X16 + { { 0x000000000000000fULL, 0x000000000000000fULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X32, TX_16X16 + { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X16, TX_16X16 + { { 0x00000000000000ffULL, 0x00000000000000ffULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X32, TX_16X16 + { { 0x00000000000000ffULL, 0x00000000000000ffULL, 0x00000000000000ffULL, + 0x00000000000000ffULL } }, // block size 32X64, TX_16X16 + { { 0x000000000000ffffULL, 0x000000000000ffffULL, 0x0000000000000000ULL, 
+ 0x0000000000000000ULL } }, // block size 64X32, TX_16X16 + { { 0x000000000000ffffULL, 0x000000000000ffffULL, 0x000000000000ffffULL, + 0x000000000000ffffULL } }, // block size 64X64, TX_16X16 + { { 0x000000000000000fULL, 0x000000000000000fULL, 0x000000000000000fULL, + 0x000000000000000fULL } }, // block size 16X64, TX_16X16 + { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X16, TX_16X16 + // TX_32X32 + { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X32, TX_32X32 + { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x00000000000000ffULL, + 0x0000000000000000ULL } }, // block size 32X64, TX_32X32 + { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X32, TX_32X32 + { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x000000000000ffffULL, + 0x0000000000000000ULL } }, // block size 64X64, TX_32X32 + // TX_64X64 + { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X64, TX_64X64 + // 2:1, 1:2 transform sizes. 
+ { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 4X8, TX_4X8 + { { 0x0000000100000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 4X16, TX_4X8 + { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X4, TX_8X4 + { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X4, TX_8X4 + { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X16, TX_8X16 + { { 0x0000000000000003ULL, 0x0000000000000003ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X32, TX_8X16 + { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X8, TX_16X8 + { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X8, TX_16X8 + { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X32, TX_16X32 + { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x000000000000000fULL, + 0x0000000000000000ULL } }, // block size 16X64, TX_16X32 + { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X16, TX_32X16 + { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X16, TX_32X16 + { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X64, TX_32X64 + { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X32, TX_64X32 + // 4:1, 1:4 transform sizes. 
+ { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 4X16, TX_4X16 + { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X4, TX_16X4 + { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X32, TX_8X32 + { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X8, TX_32X8 + { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X64, TX_16X64 + { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X16, TX_64X16 +}; + +static LoopFilterMask *get_loop_filter_mask(const AV1_COMMON *const cm, + int mi_row, int mi_col) { + assert(cm->lf.lfm != NULL); + const int row = mi_row >> MIN_MIB_SIZE_LOG2; // 64x64 + const int col = mi_col >> MIN_MIB_SIZE_LOG2; + return &cm->lf.lfm[row * cm->lf.lfm_stride + col]; +} + +typedef void (*LpfFunc)(uint8_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh); + +typedef void (*LpfDualFunc)(uint8_t *s, int p, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, + const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1); + +typedef void (*HbdLpfFunc)(uint16_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh, int bd); + +typedef void (*HbdLpfDualFunc)(uint16_t *s, int p, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, + const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd); +// A 64x64 tx block requires 256 bits to represent each 4x4 tx block. +// Every 4 rows is represented by one uint64_t mask. Hence, +// there are 4 uint64_t bitmask[4] to represent the 64x64 block. 
+// +// Given a location by (mi_col, mi_row), This function returns the index +// 0, 1, 2, 3 to select which bitmask[] to use, and the shift value. +// +// For example, mi_row is the offset of pixels in mi size (4), +// (mi_row / 4) returns which uint64_t. +// After locating which uint64_t, mi_row % 4 is the +// row offset, and each row has 16 = 1 << stride_log2 4x4 units. +// Therefore, shift = (row << stride_log2) + mi_col; +int get_index_shift(int mi_col, int mi_row, int *index) { + // *index = mi_row >> 2; + // rows = mi_row % 4; + // stride_log2 = 4; + // shift = (rows << stride_log2) + mi_col; + *index = mi_row >> 2; + return ((mi_row & 3) << 4) | mi_col; +} + +static void filter_selectively_vert_row2( + int subsampling_factor, uint8_t *s, int pitch, int plane, + uint64_t mask_16x16_0, uint64_t mask_8x8_0, uint64_t mask_4x4_0, + uint64_t mask_16x16_1, uint64_t mask_8x8_1, uint64_t mask_4x4_1, + const loop_filter_info_n *lfi_n, uint8_t *lfl, uint8_t *lfl2) { + uint64_t mask; + const int step = 1 << subsampling_factor; + + for (mask = mask_16x16_0 | mask_8x8_0 | mask_4x4_0 | mask_16x16_1 | + mask_8x8_1 | mask_4x4_1; + mask; mask >>= step) { + const loop_filter_thresh *lfi0 = lfi_n->lfthr + *lfl; + const loop_filter_thresh *lfi1 = lfi_n->lfthr + *lfl2; + + if (mask & 1) { + if ((mask_16x16_0 | mask_16x16_1) & 1) { + // chroma plane filters less pixels introduced in deblock_13tap + // experiment + LpfFunc lpf_vertical = plane ? 
aom_lpf_vertical_6 : aom_lpf_vertical_14; + + if ((mask_16x16_0 & mask_16x16_1) & 1) { + if (plane) { + aom_lpf_vertical_6_dual(s, pitch, lfi0->mblim, lfi0->lim, + lfi0->hev_thr, lfi1->mblim, lfi1->lim, + lfi1->hev_thr); + } else { + aom_lpf_vertical_14_dual(s, pitch, lfi0->mblim, lfi0->lim, + lfi0->hev_thr, lfi1->mblim, lfi1->lim, + lfi1->hev_thr); + } + } else if (mask_16x16_0 & 1) { + lpf_vertical(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr); + } else { + lpf_vertical(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim, + lfi1->hev_thr); + } + } + + if ((mask_8x8_0 | mask_8x8_1) & 1) { + // chroma plane filters less pixels introduced in deblock_13tap + // experiment + LpfFunc lpf_vertical = plane ? aom_lpf_vertical_6 : aom_lpf_vertical_8; + + if ((mask_8x8_0 & mask_8x8_1) & 1) { + if (plane) { + aom_lpf_vertical_6_dual(s, pitch, lfi0->mblim, lfi0->lim, + lfi0->hev_thr, lfi1->mblim, lfi1->lim, + lfi1->hev_thr); + } else { + aom_lpf_vertical_8_dual(s, pitch, lfi0->mblim, lfi0->lim, + lfi0->hev_thr, lfi1->mblim, lfi1->lim, + lfi1->hev_thr); + } + } else if (mask_8x8_0 & 1) { + lpf_vertical(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr); + } else { + lpf_vertical(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim, + lfi1->hev_thr); + } + } + + if ((mask_4x4_0 | mask_4x4_1) & 1) { + if ((mask_4x4_0 & mask_4x4_1) & 1) { + aom_lpf_vertical_4_dual(s, pitch, lfi0->mblim, lfi0->lim, + lfi0->hev_thr, lfi1->mblim, lfi1->lim, + lfi1->hev_thr); + } else if (mask_4x4_0 & 1) { + aom_lpf_vertical_4(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr); + } else { + aom_lpf_vertical_4(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim, + lfi1->hev_thr); + } + } + } + + s += 4; + lfl += step; + lfl2 += step; + mask_16x16_0 >>= step; + mask_8x8_0 >>= step; + mask_4x4_0 >>= step; + mask_16x16_1 >>= step; + mask_8x8_1 >>= step; + mask_4x4_1 >>= step; + } +} + +#if CONFIG_AV1_HIGHBITDEPTH +static void highbd_filter_selectively_vert_row2( + int subsampling_factor, uint16_t *s, int pitch, int plane, + 
uint64_t mask_16x16_0, uint64_t mask_8x8_0, uint64_t mask_4x4_0, + uint64_t mask_16x16_1, uint64_t mask_8x8_1, uint64_t mask_4x4_1, + const loop_filter_info_n *lfi_n, uint8_t *lfl, uint8_t *lfl2, int bd) { + uint64_t mask; + const int step = 1 << subsampling_factor; + + for (mask = mask_16x16_0 | mask_8x8_0 | mask_4x4_0 | mask_16x16_1 | + mask_8x8_1 | mask_4x4_1; + mask; mask >>= step) { + const loop_filter_thresh *lfi0 = lfi_n->lfthr + *lfl; + const loop_filter_thresh *lfi1 = lfi_n->lfthr + *lfl2; + + if (mask & 1) { + if ((mask_16x16_0 | mask_16x16_1) & 1) { + // chroma plane filters less pixels introduced in deblock_13tap + // experiment + HbdLpfFunc highbd_lpf_vertical = + plane ? aom_highbd_lpf_vertical_6 : aom_highbd_lpf_vertical_14; + + if ((mask_16x16_0 & mask_16x16_1) & 1) { + if (plane) { + aom_highbd_lpf_vertical_6_dual(s, pitch, lfi0->mblim, lfi0->lim, + lfi0->hev_thr, lfi1->mblim, + lfi1->lim, lfi1->hev_thr, bd); + } else { + aom_highbd_lpf_vertical_14_dual(s, pitch, lfi0->mblim, lfi0->lim, + lfi0->hev_thr, lfi1->mblim, + lfi1->lim, lfi1->hev_thr, bd); + } + } else if (mask_16x16_0 & 1) { + highbd_lpf_vertical(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr, + bd); + } else { + highbd_lpf_vertical(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim, + lfi1->hev_thr, bd); + } + } + + if ((mask_8x8_0 | mask_8x8_1) & 1) { + HbdLpfFunc highbd_lpf_vertical = + plane ? 
aom_highbd_lpf_vertical_6 : aom_highbd_lpf_vertical_8; + + if ((mask_8x8_0 & mask_8x8_1) & 1) { + if (plane) { + aom_highbd_lpf_vertical_6_dual(s, pitch, lfi0->mblim, lfi0->lim, + lfi0->hev_thr, lfi1->mblim, + lfi1->lim, lfi1->hev_thr, bd); + } else { + aom_highbd_lpf_vertical_8_dual(s, pitch, lfi0->mblim, lfi0->lim, + lfi0->hev_thr, lfi1->mblim, + lfi1->lim, lfi1->hev_thr, bd); + } + } else if (mask_8x8_0 & 1) { + highbd_lpf_vertical(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr, + bd); + } else { + highbd_lpf_vertical(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim, + lfi1->hev_thr, bd); + } + } + + if ((mask_4x4_0 | mask_4x4_1) & 1) { + if ((mask_4x4_0 & mask_4x4_1) & 1) { + aom_highbd_lpf_vertical_4_dual(s, pitch, lfi0->mblim, lfi0->lim, + lfi0->hev_thr, lfi1->mblim, lfi1->lim, + lfi1->hev_thr, bd); + } else if (mask_4x4_0 & 1) { + aom_highbd_lpf_vertical_4(s, pitch, lfi0->mblim, lfi0->lim, + lfi0->hev_thr, bd); + } else { + aom_highbd_lpf_vertical_4(s + 4 * pitch, pitch, lfi1->mblim, + lfi1->lim, lfi1->hev_thr, bd); + } + } + } + + s += 4; + lfl += step; + lfl2 += step; + mask_16x16_0 >>= step; + mask_8x8_0 >>= step; + mask_4x4_0 >>= step; + mask_16x16_1 >>= step; + mask_8x8_1 >>= step; + mask_4x4_1 >>= step; + } +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +static void filter_selectively_horiz(uint8_t *s, int pitch, int plane, + int subsampling, uint64_t mask_16x16, + uint64_t mask_8x8, uint64_t mask_4x4, + const loop_filter_info_n *lfi_n, + const uint8_t *lfl) { + uint64_t mask; + int count; + const int step = 1 << subsampling; + const unsigned int two_block_mask = subsampling ? 5 : 3; + int offset = 0; + + for (mask = mask_16x16 | mask_8x8 | mask_4x4; mask; mask >>= step * count) { + const loop_filter_thresh *lfi = lfi_n->lfthr + *lfl; + // Next block's thresholds, when it is within current 64x64 block. + // If it is out of bound, its mask is zero, and it points to current edge's + // filter parameters, instead of next edge's. 
+ int next_edge = step; + if (offset + next_edge >= MI_SIZE_64X64) next_edge = 0; + const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + next_edge); + + count = 1; + if (mask & 1) { + if (mask_16x16 & 1) { + // chroma plane filters less pixels introduced in deblock_13tap + // experiment + LpfFunc lpf_horizontal = + plane ? aom_lpf_horizontal_6 : aom_lpf_horizontal_14; + + if ((mask_16x16 & two_block_mask) == two_block_mask) { + if (plane) { + aom_lpf_horizontal_6_dual(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, lfin->mblim, lfin->lim, + lfin->hev_thr); + } else { + aom_lpf_horizontal_14_dual(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, lfin->mblim, lfin->lim, + lfin->hev_thr); + } + count = 2; + } else { + lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr); + } + } else if (mask_8x8 & 1) { + // chroma plane filters less pixels introduced in deblock_13tap + // experiment + LpfFunc lpf_horizontal = + plane ? aom_lpf_horizontal_6 : aom_lpf_horizontal_8; + + if ((mask_8x8 & two_block_mask) == two_block_mask) { + if (plane) { + aom_lpf_horizontal_6_dual(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, lfin->mblim, lfin->lim, + lfin->hev_thr); + } else { + aom_lpf_horizontal_8_dual(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, lfin->mblim, lfin->lim, + lfin->hev_thr); + } + count = 2; + } else { + lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr); + } + } else if (mask_4x4 & 1) { + if ((mask_4x4 & two_block_mask) == two_block_mask) { + aom_lpf_horizontal_4_dual(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, lfin->mblim, lfin->lim, + lfin->hev_thr); + count = 2; + } else { + aom_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr); + } + } + } + + s += 4 * count; + lfl += step * count; + mask_16x16 >>= step * count; + mask_8x8 >>= step * count; + mask_4x4 >>= step * count; + offset += step * count; + } +} + +#if CONFIG_AV1_HIGHBITDEPTH +static void highbd_filter_selectively_horiz( + uint16_t *s, int pitch, int plane, int 
subsampling, uint64_t mask_16x16, + uint64_t mask_8x8, uint64_t mask_4x4, const loop_filter_info_n *lfi_n, + uint8_t *lfl, int bd) { + uint64_t mask; + int count; + const int step = 1 << subsampling; + const unsigned int two_block_mask = subsampling ? 5 : 3; + int offset = 0; + + for (mask = mask_16x16 | mask_8x8 | mask_4x4; mask; mask >>= step * count) { + const loop_filter_thresh *lfi = lfi_n->lfthr + *lfl; + // Next block's thresholds, when it is within current 64x64 block. + // If it is out of bound, its mask is zero, and it points to current edge's + // filter parameters, instead of next edge's. + int next_edge = step; + if (offset + next_edge >= MI_SIZE_64X64) next_edge = 0; + const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + next_edge); + + count = 1; + if (mask & 1) { + if (mask_16x16 & 1) { + HbdLpfFunc highbd_lpf_horizontal = + plane ? aom_highbd_lpf_horizontal_6 : aom_highbd_lpf_horizontal_14; + + if ((mask_16x16 & two_block_mask) == two_block_mask) { + if (plane) { + aom_highbd_lpf_horizontal_6_dual_c(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, lfin->mblim, + lfin->lim, lfin->hev_thr, bd); + } else { + aom_highbd_lpf_horizontal_14_dual(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, lfin->mblim, + lfin->lim, lfin->hev_thr, bd); + } + count = 2; + } else { + highbd_lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, + bd); + } + } else if (mask_8x8 & 1) { + HbdLpfFunc highbd_lpf_horizontal = + plane ? 
aom_highbd_lpf_horizontal_6 : aom_highbd_lpf_horizontal_8; + + if ((mask_8x8 & two_block_mask) == two_block_mask) { + if (plane) { + aom_highbd_lpf_horizontal_6_dual_c(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, lfin->mblim, + lfin->lim, lfin->hev_thr, bd); + } else { + aom_highbd_lpf_horizontal_8_dual_c(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, lfin->mblim, + lfin->lim, lfin->hev_thr, bd); + } + count = 2; + } else { + highbd_lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, + bd); + } + } else if (mask_4x4 & 1) { + if ((mask_4x4 & two_block_mask) == two_block_mask) { + aom_highbd_lpf_horizontal_4_dual_c(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, lfin->mblim, + lfin->lim, lfin->hev_thr, bd); + count = 2; + } else { + aom_highbd_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, bd); + } + } + } + + s += 4 * count; + lfl += step * count; + mask_16x16 >>= step * count; + mask_8x8 >>= step * count; + mask_4x4 >>= step * count; + offset += step * count; + } +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +void av1_build_bitmask_vert_info( + AV1_COMMON *const cm, const struct macroblockd_plane *const plane_ptr, + int plane) { + const int subsampling_x = plane_ptr->subsampling_x; + const int subsampling_y = plane_ptr->subsampling_y; + const int is_uv = plane > 0; + TX_SIZE tx_size = TX_16X16, prev_tx_size = TX_16X16; + uint8_t level, prev_level = 1; + uint64_t skip, prev_skip = 0; + uint64_t is_coding_block_border; + + for (int r = 0; (r << MI_SIZE_LOG2) < plane_ptr->dst.height; r++) { + const int mi_row = r << subsampling_y; + const int row = mi_row % MI_SIZE_64X64; + const int row_uv = row | subsampling_y; + int index = 0; + const int shift = get_index_shift(0, row, &index); + + for (int c = 0; (c << MI_SIZE_LOG2) < plane_ptr->dst.width; + c += (tx_size_wide_unit[TX_64X64] >> subsampling_x)) { + const int mi_col = c << subsampling_x; + LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col); + + for (int col_in_unit = 0; + 
col_in_unit < (tx_size_wide_unit[TX_64X64] >> subsampling_x);) { + const int x = (c + col_in_unit) << MI_SIZE_LOG2; + if (x >= plane_ptr->dst.width) break; + const int col = col_in_unit << subsampling_x; + const int col_uv = col | subsampling_x; + const uint64_t mask = ((uint64_t)1 << (shift | col)); + skip = lfm->skip.bits[index] & mask; + is_coding_block_border = lfm->is_vert_border.bits[index] & mask; + switch (plane) { + case 0: level = lfm->lfl_y_ver[row_uv][col_uv]; break; + case 1: level = lfm->lfl_u_ver[row_uv][col_uv]; break; + case 2: level = lfm->lfl_v_ver[row_uv][col_uv]; break; + default: assert(plane >= 0 && plane <= 2); return; + } + for (TX_SIZE ts = TX_4X4; ts <= TX_64X64; ++ts) { + if (is_uv && ts == TX_64X64) continue; + if (lfm->tx_size_ver[is_uv][ts].bits[index] & mask) { + tx_size = ts; + break; + } + } + if ((c + col_in_unit > 0) && (level || prev_level) && + (!prev_skip || !skip || is_coding_block_border)) { + const TX_SIZE min_tx_size = + AOMMIN(TX_16X16, AOMMIN(tx_size, prev_tx_size)); + const int shift_1 = get_index_shift(col_uv, row_uv, &index); + const uint64_t mask_1 = ((uint64_t)1 << shift_1); + switch (plane) { + case 0: lfm->left_y[min_tx_size].bits[index] |= mask_1; break; + case 1: lfm->left_u[min_tx_size].bits[index] |= mask_1; break; + case 2: lfm->left_v[min_tx_size].bits[index] |= mask_1; break; + default: assert(plane >= 0 && plane <= 2); return; + } + if (level == 0 && prev_level != 0) { + switch (plane) { + case 0: lfm->lfl_y_ver[row_uv][col_uv] = prev_level; break; + case 1: lfm->lfl_u_ver[row_uv][col_uv] = prev_level; break; + case 2: lfm->lfl_v_ver[row_uv][col_uv] = prev_level; break; + default: assert(plane >= 0 && plane <= 2); return; + } + } + } + + // update prev info + prev_level = level; + prev_skip = skip; + prev_tx_size = tx_size; + // advance + col_in_unit += tx_size_wide_unit[tx_size]; + } + } + } +} + +void av1_build_bitmask_horz_info( + AV1_COMMON *const cm, const struct macroblockd_plane *const plane_ptr, + 
int plane) { + const int subsampling_x = plane_ptr->subsampling_x; + const int subsampling_y = plane_ptr->subsampling_y; + const int is_uv = plane > 0; + TX_SIZE tx_size = TX_16X16, prev_tx_size = TX_16X16; + uint8_t level, prev_level = 1; + uint64_t skip, prev_skip = 0; + uint64_t is_coding_block_border; + + for (int c = 0; (c << MI_SIZE_LOG2) < plane_ptr->dst.width; c++) { + const int mi_col = c << subsampling_x; + const int col = mi_col % MI_SIZE_64X64; + const int col_uv = col | subsampling_x; + + for (int r = 0; (r << MI_SIZE_LOG2) < plane_ptr->dst.height; + r += (tx_size_high_unit[TX_64X64] >> subsampling_y)) { + const int mi_row = r << subsampling_y; + LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col); + + for (int r_in_unit = 0; + r_in_unit < (tx_size_high_unit[TX_64X64] >> subsampling_y);) { + const int y = (r + r_in_unit) << MI_SIZE_LOG2; + if (y >= plane_ptr->dst.height) break; + const int row = r_in_unit << subsampling_y; + const int row_uv = row | subsampling_y; + int index = 0; + const int shift = get_index_shift(col, row, &index); + const uint64_t mask = ((uint64_t)1 << shift); + skip = lfm->skip.bits[index] & mask; + is_coding_block_border = lfm->is_horz_border.bits[index] & mask; + switch (plane) { + case 0: level = lfm->lfl_y_hor[row_uv][col_uv]; break; + case 1: level = lfm->lfl_u_hor[row_uv][col_uv]; break; + case 2: level = lfm->lfl_v_hor[row_uv][col_uv]; break; + default: assert(plane >= 0 && plane <= 2); return; + } + for (TX_SIZE ts = TX_4X4; ts <= TX_64X64; ++ts) { + if (is_uv && ts == TX_64X64) continue; + if (lfm->tx_size_hor[is_uv][ts].bits[index] & mask) { + tx_size = ts; + break; + } + } + if ((r + r_in_unit > 0) && (level || prev_level) && + (!prev_skip || !skip || is_coding_block_border)) { + const TX_SIZE min_tx_size = + AOMMIN(TX_16X16, AOMMIN(tx_size, prev_tx_size)); + const int shift_1 = get_index_shift(col_uv, row_uv, &index); + const uint64_t mask_1 = ((uint64_t)1 << shift_1); + + switch (plane) { + case 0: 
lfm->above_y[min_tx_size].bits[index] |= mask_1; break; + case 1: lfm->above_u[min_tx_size].bits[index] |= mask_1; break; + case 2: lfm->above_v[min_tx_size].bits[index] |= mask_1; break; + default: assert(plane >= 0 && plane <= 2); return; + } + if (level == 0 && prev_level != 0) { + switch (plane) { + case 0: lfm->lfl_y_hor[row_uv][col_uv] = prev_level; break; + case 1: lfm->lfl_u_hor[row_uv][col_uv] = prev_level; break; + case 2: lfm->lfl_v_hor[row_uv][col_uv] = prev_level; break; + default: assert(plane >= 0 && plane <= 2); return; + } + } + } + + // update prev info + prev_level = level; + prev_skip = skip; + prev_tx_size = tx_size; + // advance + r_in_unit += tx_size_high_unit[tx_size]; + } + } + } +} + +void av1_filter_block_plane_bitmask_vert( + AV1_COMMON *const cm, struct macroblockd_plane *const plane_ptr, int pl, + int mi_row, int mi_col) { + struct buf_2d *const dst = &plane_ptr->dst; + uint8_t *const buf0 = dst->buf; + const int ssx = plane_ptr->subsampling_x; + const int ssy = plane_ptr->subsampling_y; + const int mask_cutoff = 0xffff; + const int row_step = 1 << ssy; + const int two_row_step = 2 << ssy; + const int row_stride = dst->stride << MI_SIZE_LOG2; + const int two_row_stride = row_stride << 1; + uint64_t mask_16x16 = 0; + uint64_t mask_8x8 = 0; + uint64_t mask_4x4 = 0; + uint8_t *lfl; + uint8_t *lfl2; + LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col); + assert(lfm); + + // 1. vertical filtering. 
filter two rows at a time + for (int r = 0; + ((mi_row + r) << MI_SIZE_LOG2) < cm->height && r < MI_SIZE_64X64; + r += two_row_step) { + const int row = r | ssy; + const int row_next = row + row_step; + const int col = ssx; + int index = 0; + const int shift = get_index_shift(col, row, &index); + int index_next = 0; + const int shift_next = get_index_shift(col, row_next, &index_next); + const int has_next_row = row_next < cm->mi_params.mi_rows; + switch (pl) { + case 0: + mask_16x16 = lfm->left_y[TX_16X16].bits[index]; + mask_8x8 = lfm->left_y[TX_8X8].bits[index]; + mask_4x4 = lfm->left_y[TX_4X4].bits[index]; + lfl = &lfm->lfl_y_ver[row][col]; + lfl2 = &lfm->lfl_y_ver[row_next][col]; + break; + case 1: + mask_16x16 = lfm->left_u[TX_16X16].bits[index]; + mask_8x8 = lfm->left_u[TX_8X8].bits[index]; + mask_4x4 = lfm->left_u[TX_4X4].bits[index]; + lfl = &lfm->lfl_u_ver[row][col]; + lfl2 = &lfm->lfl_u_ver[row_next][col]; + break; + case 2: + mask_16x16 = lfm->left_v[TX_16X16].bits[index]; + mask_8x8 = lfm->left_v[TX_8X8].bits[index]; + mask_4x4 = lfm->left_v[TX_4X4].bits[index]; + lfl = &lfm->lfl_v_ver[row][col]; + lfl2 = &lfm->lfl_v_ver[row_next][col]; + break; + default: assert(pl >= 0 && pl <= 2); return; + } + uint64_t mask_16x16_0 = (mask_16x16 >> shift) & mask_cutoff; + uint64_t mask_8x8_0 = (mask_8x8 >> shift) & mask_cutoff; + uint64_t mask_4x4_0 = (mask_4x4 >> shift) & mask_cutoff; + uint64_t mask_16x16_1 = (mask_16x16 >> shift_next) & mask_cutoff; + uint64_t mask_8x8_1 = (mask_8x8 >> shift_next) & mask_cutoff; + uint64_t mask_4x4_1 = (mask_4x4 >> shift_next) & mask_cutoff; + if (!has_next_row) { + mask_16x16_1 = 0; + mask_8x8_1 = 0; + mask_4x4_1 = 0; + } + +#if CONFIG_AV1_HIGHBITDEPTH + if (cm->seq_params.use_highbitdepth) + highbd_filter_selectively_vert_row2( + ssx, CONVERT_TO_SHORTPTR(dst->buf), dst->stride, pl, mask_16x16_0, + mask_8x8_0, mask_4x4_0, mask_16x16_1, mask_8x8_1, mask_4x4_1, + &cm->lf_info, lfl, lfl2, (int)cm->seq_params.bit_depth); + else + 
filter_selectively_vert_row2( + ssx, dst->buf, dst->stride, pl, mask_16x16_0, mask_8x8_0, mask_4x4_0, + mask_16x16_1, mask_8x8_1, mask_4x4_1, &cm->lf_info, lfl, lfl2); +#else + filter_selectively_vert_row2( + ssx, dst->buf, dst->stride, pl, mask_16x16_0, mask_8x8_0, mask_4x4_0, + mask_16x16_1, mask_8x8_1, mask_4x4_1, &cm->lf_info, lfl, lfl2); +#endif + dst->buf += two_row_stride; + } + // reset buf pointer for horizontal filtering + dst->buf = buf0; +} + +void av1_filter_block_plane_bitmask_horz( + AV1_COMMON *const cm, struct macroblockd_plane *const plane_ptr, int pl, + int mi_row, int mi_col) { + struct buf_2d *const dst = &plane_ptr->dst; + uint8_t *const buf0 = dst->buf; + const int ssx = plane_ptr->subsampling_x; + const int ssy = plane_ptr->subsampling_y; + const int mask_cutoff = 0xffff; + const int row_step = 1 << ssy; + const int row_stride = dst->stride << MI_SIZE_LOG2; + uint64_t mask_16x16 = 0; + uint64_t mask_8x8 = 0; + uint64_t mask_4x4 = 0; + uint8_t *lfl; + LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col); + assert(lfm); + for (int r = 0; + ((mi_row + r) << MI_SIZE_LOG2) < cm->height && r < MI_SIZE_64X64; + r += row_step) { + if (mi_row + r == 0) { + dst->buf += row_stride; + continue; + } + const int row = r | ssy; + const int col = ssx; + int index = 0; + const int shift = get_index_shift(col, row, &index); + switch (pl) { + case 0: + mask_16x16 = lfm->above_y[TX_16X16].bits[index]; + mask_8x8 = lfm->above_y[TX_8X8].bits[index]; + mask_4x4 = lfm->above_y[TX_4X4].bits[index]; + lfl = &lfm->lfl_y_hor[row][col]; + break; + case 1: + mask_16x16 = lfm->above_u[TX_16X16].bits[index]; + mask_8x8 = lfm->above_u[TX_8X8].bits[index]; + mask_4x4 = lfm->above_u[TX_4X4].bits[index]; + lfl = &lfm->lfl_u_hor[row][col]; + break; + case 2: + mask_16x16 = lfm->above_v[TX_16X16].bits[index]; + mask_8x8 = lfm->above_v[TX_8X8].bits[index]; + mask_4x4 = lfm->above_v[TX_4X4].bits[index]; + lfl = &lfm->lfl_v_hor[row][col]; + break; + default: assert(pl >= 
0 && pl <= 2); return; + } + mask_16x16 = (mask_16x16 >> shift) & mask_cutoff; + mask_8x8 = (mask_8x8 >> shift) & mask_cutoff; + mask_4x4 = (mask_4x4 >> shift) & mask_cutoff; + +#if CONFIG_AV1_HIGHBITDEPTH + if (cm->seq_params.use_highbitdepth) + highbd_filter_selectively_horiz( + CONVERT_TO_SHORTPTR(dst->buf), dst->stride, pl, ssx, mask_16x16, + mask_8x8, mask_4x4, &cm->lf_info, lfl, (int)cm->seq_params.bit_depth); + else + filter_selectively_horiz(dst->buf, dst->stride, pl, ssx, mask_16x16, + mask_8x8, mask_4x4, &cm->lf_info, lfl); +#else + filter_selectively_horiz(dst->buf, dst->stride, pl, ssx, mask_16x16, + mask_8x8, mask_4x4, &cm->lf_info, lfl); +#endif + dst->buf += row_stride; + } + // reset buf pointer for next block + dst->buf = buf0; +} + +void av1_filter_block_plane_ver(AV1_COMMON *const cm, + struct macroblockd_plane *const plane_ptr, + int pl, int mi_row, int mi_col) { + struct buf_2d *const dst = &plane_ptr->dst; + int r, c; + const int ssx = plane_ptr->subsampling_x; + const int ssy = plane_ptr->subsampling_y; + const int mask_cutoff = 0xffff; + const int single_step = 1 << ssy; + const int r_step = 2 << ssy; + uint64_t mask_16x16 = 0; + uint64_t mask_8x8 = 0; + uint64_t mask_4x4 = 0; + uint8_t *lfl; + uint8_t *lfl2; + + // filter two rows at a time + for (r = 0; r < cm->seq_params.mib_size && + ((mi_row + r) << MI_SIZE_LOG2 < cm->height); + r += r_step) { + for (c = 0; c < cm->seq_params.mib_size && + ((mi_col + c) << MI_SIZE_LOG2 < cm->width); + c += MI_SIZE_64X64) { + dst->buf += ((c << MI_SIZE_LOG2) >> ssx); + LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row + r, mi_col + c); + assert(lfm); + const int row = ((mi_row + r) | ssy) % MI_SIZE_64X64; + const int col = ((mi_col + c) | ssx) % MI_SIZE_64X64; + int index = 0; + const int shift = get_index_shift(col, row, &index); + // current and next row should belong to the same mask_idx and index + // next row's shift + const int row_next = row + single_step; + int index_next = 0; + const int 
shift_next = get_index_shift(col, row_next, &index_next); + switch (pl) { + case 0: + mask_16x16 = lfm->left_y[TX_16X16].bits[index]; + mask_8x8 = lfm->left_y[TX_8X8].bits[index]; + mask_4x4 = lfm->left_y[TX_4X4].bits[index]; + lfl = &lfm->lfl_y_ver[row][col]; + lfl2 = &lfm->lfl_y_ver[row_next][col]; + break; + case 1: + mask_16x16 = lfm->left_u[TX_16X16].bits[index]; + mask_8x8 = lfm->left_u[TX_8X8].bits[index]; + mask_4x4 = lfm->left_u[TX_4X4].bits[index]; + lfl = &lfm->lfl_u_ver[row][col]; + lfl2 = &lfm->lfl_u_ver[row_next][col]; + break; + case 2: + mask_16x16 = lfm->left_v[TX_16X16].bits[index]; + mask_8x8 = lfm->left_v[TX_8X8].bits[index]; + mask_4x4 = lfm->left_v[TX_4X4].bits[index]; + lfl = &lfm->lfl_v_ver[row][col]; + lfl2 = &lfm->lfl_v_ver[row_next][col]; + break; + default: assert(pl >= 0 && pl <= 2); return; + } + uint64_t mask_16x16_0 = (mask_16x16 >> shift) & mask_cutoff; + uint64_t mask_8x8_0 = (mask_8x8 >> shift) & mask_cutoff; + uint64_t mask_4x4_0 = (mask_4x4 >> shift) & mask_cutoff; + uint64_t mask_16x16_1 = (mask_16x16 >> shift_next) & mask_cutoff; + uint64_t mask_8x8_1 = (mask_8x8 >> shift_next) & mask_cutoff; + uint64_t mask_4x4_1 = (mask_4x4 >> shift_next) & mask_cutoff; + +#if CONFIG_AV1_HIGHBITDEPTH + if (cm->seq_params.use_highbitdepth) + highbd_filter_selectively_vert_row2( + ssx, CONVERT_TO_SHORTPTR(dst->buf), dst->stride, pl, mask_16x16_0, + mask_8x8_0, mask_4x4_0, mask_16x16_1, mask_8x8_1, mask_4x4_1, + &cm->lf_info, lfl, lfl2, (int)cm->seq_params.bit_depth); + else + filter_selectively_vert_row2(ssx, dst->buf, dst->stride, pl, + mask_16x16_0, mask_8x8_0, mask_4x4_0, + mask_16x16_1, mask_8x8_1, mask_4x4_1, + &cm->lf_info, lfl, lfl2); +#else + filter_selectively_vert_row2( + ssx, dst->buf, dst->stride, pl, mask_16x16_0, mask_8x8_0, mask_4x4_0, + mask_16x16_1, mask_8x8_1, mask_4x4_1, &cm->lf_info, lfl, lfl2); +#endif + dst->buf -= ((c << MI_SIZE_LOG2) >> ssx); + } + dst->buf += 2 * MI_SIZE * dst->stride; + } +} + +void 
av1_filter_block_plane_hor(AV1_COMMON *const cm, + struct macroblockd_plane *const plane_ptr, + int pl, int mi_row, int mi_col) { + struct buf_2d *const dst = &plane_ptr->dst; + int r, c; + const int ssx = plane_ptr->subsampling_x; + const int ssy = plane_ptr->subsampling_y; + const int mask_cutoff = 0xffff; + const int r_step = 1 << ssy; + uint64_t mask_16x16 = 0; + uint64_t mask_8x8 = 0; + uint64_t mask_4x4 = 0; + uint8_t *lfl; + + for (r = 0; r < cm->seq_params.mib_size && + ((mi_row + r) << MI_SIZE_LOG2 < cm->height); + r += r_step) { + for (c = 0; c < cm->seq_params.mib_size && + ((mi_col + c) << MI_SIZE_LOG2 < cm->width); + c += MI_SIZE_64X64) { + if (mi_row + r == 0) continue; + + dst->buf += ((c << MI_SIZE_LOG2) >> ssx); + LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row + r, mi_col + c); + assert(lfm); + const int row = ((mi_row + r) | ssy) % MI_SIZE_64X64; + const int col = ((mi_col + c) | ssx) % MI_SIZE_64X64; + int index = 0; + const int shift = get_index_shift(col, row, &index); + switch (pl) { + case 0: + mask_16x16 = lfm->above_y[TX_16X16].bits[index]; + mask_8x8 = lfm->above_y[TX_8X8].bits[index]; + mask_4x4 = lfm->above_y[TX_4X4].bits[index]; + lfl = &lfm->lfl_y_hor[row][col]; + break; + case 1: + mask_16x16 = lfm->above_u[TX_16X16].bits[index]; + mask_8x8 = lfm->above_u[TX_8X8].bits[index]; + mask_4x4 = lfm->above_u[TX_4X4].bits[index]; + lfl = &lfm->lfl_u_hor[row][col]; + break; + case 2: + mask_16x16 = lfm->above_v[TX_16X16].bits[index]; + mask_8x8 = lfm->above_v[TX_8X8].bits[index]; + mask_4x4 = lfm->above_v[TX_4X4].bits[index]; + lfl = &lfm->lfl_v_hor[row][col]; + break; + default: assert(pl >= 0 && pl <= 2); return; + } + mask_16x16 = (mask_16x16 >> shift) & mask_cutoff; + mask_8x8 = (mask_8x8 >> shift) & mask_cutoff; + mask_4x4 = (mask_4x4 >> shift) & mask_cutoff; + +#if CONFIG_AV1_HIGHBITDEPTH + if (cm->seq_params.use_highbitdepth) + highbd_filter_selectively_horiz(CONVERT_TO_SHORTPTR(dst->buf), + dst->stride, pl, ssx, mask_16x16, + 
mask_8x8, mask_4x4, &cm->lf_info, lfl, + (int)cm->seq_params.bit_depth); + else + filter_selectively_horiz(dst->buf, dst->stride, pl, ssx, mask_16x16, + mask_8x8, mask_4x4, &cm->lf_info, lfl); +#else + filter_selectively_horiz(dst->buf, dst->stride, pl, ssx, mask_16x16, + mask_8x8, mask_4x4, &cm->lf_info, lfl); +#endif + dst->buf -= ((c << MI_SIZE_LOG2) >> ssx); + } + dst->buf += MI_SIZE * dst->stride; + } +} + +void av1_store_bitmask_vartx(AV1_COMMON *cm, int mi_row, int mi_col, + BLOCK_SIZE bsize, TX_SIZE tx_size, + MB_MODE_INFO *mbmi) { + LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col); + const TX_SIZE tx_size_y_vert = txsize_vert_map[tx_size]; + const TX_SIZE tx_size_y_horz = txsize_horz_map[tx_size]; + const TX_SIZE tx_size_uv_vert = txsize_vert_map[av1_get_max_uv_txsize( + mbmi->sb_type, cm->seq_params.subsampling_x, + cm->seq_params.subsampling_y)]; + const TX_SIZE tx_size_uv_horz = txsize_horz_map[av1_get_max_uv_txsize( + mbmi->sb_type, cm->seq_params.subsampling_x, + cm->seq_params.subsampling_y)]; + const int is_square_transform_size = tx_size <= TX_64X64; + int mask_id = 0; + int offset = 0; + const int half_ratio_tx_size_max32 = + (tx_size > TX_64X64) & (tx_size <= TX_32X16); + if (is_square_transform_size) { + switch (tx_size) { + case TX_4X4: mask_id = mask_id_table_tx_4x4[bsize]; break; + case TX_8X8: + mask_id = mask_id_table_tx_8x8[bsize]; + offset = 19; + break; + case TX_16X16: + mask_id = mask_id_table_tx_16x16[bsize]; + offset = 33; + break; + case TX_32X32: + mask_id = mask_id_table_tx_32x32[bsize]; + offset = 42; + break; + case TX_64X64: mask_id = 46; break; + default: assert(!is_square_transform_size); return; + } + mask_id += offset; + } else if (half_ratio_tx_size_max32) { + int tx_size_equal_block_size = bsize == txsize_to_bsize[tx_size]; + mask_id = 47 + 2 * (tx_size - TX_4X8) + (tx_size_equal_block_size ? 
0 : 1); + } else if (tx_size == TX_32X64) { + mask_id = 59; + } else if (tx_size == TX_64X32) { + mask_id = 60; + } else { // quarter ratio tx size + mask_id = 61 + (tx_size - TX_4X16); + } + int index = 0; + const int row = mi_row % MI_SIZE_64X64; + const int col = mi_col % MI_SIZE_64X64; + const int shift = get_index_shift(col, row, &index); + const int vert_shift = tx_size_y_vert <= TX_8X8 ? shift : col; + for (int i = 0; i + index < 4; ++i) { + // y vertical. + lfm->tx_size_ver[0][tx_size_y_horz].bits[i + index] |= + (left_mask_univariant_reordered[mask_id].bits[i] << vert_shift); + // y horizontal. + lfm->tx_size_hor[0][tx_size_y_vert].bits[i + index] |= + (above_mask_univariant_reordered[mask_id].bits[i] << shift); + // u/v vertical. + lfm->tx_size_ver[1][tx_size_uv_horz].bits[i + index] |= + (left_mask_univariant_reordered[mask_id].bits[i] << vert_shift); + // u/v horizontal. + lfm->tx_size_hor[1][tx_size_uv_vert].bits[i + index] |= + (above_mask_univariant_reordered[mask_id].bits[i] << shift); + } +} + +void av1_store_bitmask_univariant_tx(AV1_COMMON *cm, int mi_row, int mi_col, + BLOCK_SIZE bsize, MB_MODE_INFO *mbmi) { + // Use a lookup table that provides one bitmask for a given block size and + // a univariant transform size. 
+ int index; + int shift; + int row; + int col; + LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col); + const TX_SIZE tx_size_y_vert = txsize_vert_map[mbmi->tx_size]; + const TX_SIZE tx_size_y_horz = txsize_horz_map[mbmi->tx_size]; + const TX_SIZE tx_size_uv_vert = txsize_vert_map[av1_get_max_uv_txsize( + mbmi->sb_type, cm->seq_params.subsampling_x, + cm->seq_params.subsampling_y)]; + const TX_SIZE tx_size_uv_horz = txsize_horz_map[av1_get_max_uv_txsize( + mbmi->sb_type, cm->seq_params.subsampling_x, + cm->seq_params.subsampling_y)]; + const int is_square_transform_size = mbmi->tx_size <= TX_64X64; + int mask_id = 0; + int offset = 0; + const int half_ratio_tx_size_max32 = + (mbmi->tx_size > TX_64X64) & (mbmi->tx_size <= TX_32X16); + if (is_square_transform_size) { + switch (mbmi->tx_size) { + case TX_4X4: mask_id = mask_id_table_tx_4x4[bsize]; break; + case TX_8X8: + mask_id = mask_id_table_tx_8x8[bsize]; + offset = 19; + break; + case TX_16X16: + mask_id = mask_id_table_tx_16x16[bsize]; + offset = 33; + break; + case TX_32X32: + mask_id = mask_id_table_tx_32x32[bsize]; + offset = 42; + break; + case TX_64X64: mask_id = 46; break; + default: assert(!is_square_transform_size); return; + } + mask_id += offset; + } else if (half_ratio_tx_size_max32) { + int tx_size_equal_block_size = bsize == txsize_to_bsize[mbmi->tx_size]; + mask_id = + 47 + 2 * (mbmi->tx_size - TX_4X8) + (tx_size_equal_block_size ? 0 : 1); + } else if (mbmi->tx_size == TX_32X64) { + mask_id = 59; + } else if (mbmi->tx_size == TX_64X32) { + mask_id = 60; + } else { // quarter ratio tx size + mask_id = 61 + (mbmi->tx_size - TX_4X16); + } + row = mi_row % MI_SIZE_64X64; + col = mi_col % MI_SIZE_64X64; + shift = get_index_shift(col, row, &index); + const int vert_shift = tx_size_y_vert <= TX_8X8 ? shift : col; + for (int i = 0; i + index < 4; ++i) { + // y vertical. 
+ lfm->tx_size_ver[0][tx_size_y_horz].bits[i + index] |= + (left_mask_univariant_reordered[mask_id].bits[i] << vert_shift); + // y horizontal. + lfm->tx_size_hor[0][tx_size_y_vert].bits[i + index] |= + (above_mask_univariant_reordered[mask_id].bits[i] << shift); + // u/v vertical. + lfm->tx_size_ver[1][tx_size_uv_horz].bits[i + index] |= + (left_mask_univariant_reordered[mask_id].bits[i] << vert_shift); + // u/v horizontal. + lfm->tx_size_hor[1][tx_size_uv_vert].bits[i + index] |= + (above_mask_univariant_reordered[mask_id].bits[i] << shift); + } +} + +void av1_store_bitmask_other_info(AV1_COMMON *cm, int mi_row, int mi_col, + BLOCK_SIZE bsize, MB_MODE_INFO *mbmi, + int is_horz_coding_block_border, + int is_vert_coding_block_border) { + int index; + int shift; + int row; + LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col); + const int row_start = mi_row % MI_SIZE_64X64; + const int col_start = mi_col % MI_SIZE_64X64; + shift = get_index_shift(col_start, row_start, &index); + if (is_horz_coding_block_border) { + const int block_shift = shift + mi_size_wide[bsize]; + assert(block_shift <= 64); + const uint64_t right_edge_shift = + (block_shift == 64) ? 0xffffffffffffffff : ((uint64_t)1 << block_shift); + const uint64_t left_edge_shift = (block_shift == 64) + ? (((uint64_t)1 << shift) - 1) + : ((uint64_t)1 << shift); + assert(right_edge_shift > left_edge_shift); + const uint64_t top_edge_mask = right_edge_shift - left_edge_shift; + lfm->is_horz_border.bits[index] |= top_edge_mask; + } + if (is_vert_coding_block_border) { + const int is_vert_border = mask_id_table_vert_border[bsize]; + const int vert_shift = block_size_high[bsize] <= 8 ? 
shift : col_start; + for (int i = 0; i + index < 4; ++i) { + lfm->is_vert_border.bits[i + index] |= + (left_mask_univariant_reordered[is_vert_border].bits[i] + << vert_shift); + } + } + const int is_skip = mbmi->skip && is_inter_block(mbmi); + if (is_skip) { + const int is_skip_mask = mask_id_table_tx_4x4[bsize]; + for (int i = 0; i + index < 4; ++i) { + lfm->skip.bits[i + index] |= + (above_mask_univariant_reordered[is_skip_mask].bits[i] << shift); + } + } + const uint8_t level_vert_y = + av1_get_filter_level(cm, &cm->lf_info, 0, 0, mbmi); + const uint8_t level_horz_y = + av1_get_filter_level(cm, &cm->lf_info, 1, 0, mbmi); + const uint8_t level_u = av1_get_filter_level(cm, &cm->lf_info, 0, 1, mbmi); + const uint8_t level_v = av1_get_filter_level(cm, &cm->lf_info, 0, 2, mbmi); + for (int r = mi_row; r < mi_row + mi_size_high[bsize]; r++) { + index = 0; + row = r % MI_SIZE_64X64; + memset(&lfm->lfl_y_ver[row][col_start], level_vert_y, + sizeof(uint8_t) * mi_size_wide[bsize]); + memset(&lfm->lfl_y_hor[row][col_start], level_horz_y, + sizeof(uint8_t) * mi_size_wide[bsize]); + memset(&lfm->lfl_u_ver[row][col_start], level_u, + sizeof(uint8_t) * mi_size_wide[bsize]); + memset(&lfm->lfl_u_hor[row][col_start], level_u, + sizeof(uint8_t) * mi_size_wide[bsize]); + memset(&lfm->lfl_v_ver[row][col_start], level_v, + sizeof(uint8_t) * mi_size_wide[bsize]); + memset(&lfm->lfl_v_hor[row][col_start], level_v, + sizeof(uint8_t) * mi_size_wide[bsize]); + } +} +#endif // CONFIG_LPF_MASK diff --git a/media/libaom/src/av1/common/mv.h b/media/libaom/src/av1/common/mv.h index 5b0225192..be539e820 100644 --- a/media/libaom/src/av1/common/mv.h +++ b/media/libaom/src/av1/common/mv.h @@ -21,17 +21,34 @@ extern "C" { #endif #define INVALID_MV 0x80008000 +#define GET_MV_RAWPEL(x) (((x) + 3 + ((x) >= 0)) >> 3) +#define GET_MV_SUBPEL(x) ((x)*8) +#define MARK_MV_INVALID(mv) \ + do { \ + ((int_mv *)(mv))->as_int = INVALID_MV; \ + } while (0); +#define CHECK_MV_EQUAL(x, y) (((x).row == (y).row) && 
((x).col == (y).col)) + +// The motion vector in units of full pixel +typedef struct fullpel_mv { + int16_t row; + int16_t col; +} FULLPEL_MV; + +// The motion vector in units of 1/8-pel typedef struct mv { int16_t row; int16_t col; } MV; static const MV kZeroMv = { 0, 0 }; +static const FULLPEL_MV kZeroFullMv = { 0, 0 }; typedef union int_mv { uint32_t as_int; MV as_mv; + FULLPEL_MV as_fullmv; } int_mv; /* facilitates faster equality tests and copies */ typedef struct mv32 { @@ -39,6 +56,38 @@ typedef struct mv32 { int32_t col; } MV32; +// The mv limit for fullpel mvs +typedef struct { + int col_min; + int col_max; + int row_min; + int row_max; +} FullMvLimits; + +// The mv limit for subpel mvs +typedef struct { + int col_min; + int col_max; + int row_min; + int row_max; +} SubpelMvLimits; + +static AOM_INLINE FULLPEL_MV get_fullmv_from_mv(const MV *subpel_mv) { + const FULLPEL_MV full_mv = { (int16_t)GET_MV_RAWPEL(subpel_mv->row), + (int16_t)GET_MV_RAWPEL(subpel_mv->col) }; + return full_mv; +} + +static AOM_INLINE MV get_mv_from_fullmv(const FULLPEL_MV *full_mv) { + const MV subpel_mv = { (int16_t)GET_MV_SUBPEL(full_mv->row), + (int16_t)GET_MV_SUBPEL(full_mv->col) }; + return subpel_mv; +} + +static AOM_INLINE void convert_fullmv_to_mv(int_mv *mv) { + mv->as_mv = get_mv_from_fullmv(&mv->as_fullmv); +} + // Bits of precision used for the model #define WARPEDMODEL_PREC_BITS 16 #define WARPEDMODEL_ROW3HOMO_PREC_BITS 16 @@ -56,13 +105,13 @@ typedef struct mv32 { #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS) /* clang-format off */ -typedef enum ATTRIBUTE_PACKED { +enum { IDENTITY = 0, // identity transformation, 0-parameter TRANSLATION = 1, // translational motion 2-parameter ROTZOOM = 2, // simplified affine with rotation + zoom only, 4-parameter AFFINE = 3, // affine, 6-parameter TRANS_TYPES, -} TransformationType; +} UENUM1BYTE(TransformationType); /* clang-format on */ // Number of types used for global motion (must be >= 3 and <= 
TRANS_TYPES) @@ -87,18 +136,18 @@ static const int trans_model_params[TRANS_TYPES] = { 0, 2, 4, 6 }; // z . y' = m4 m5 m1 * y // 1] m6 m7 1) 1] typedef struct { - TransformationType wmtype; int32_t wmmat[8]; int16_t alpha, beta, gamma, delta; + TransformationType wmtype; int8_t invalid; } WarpedMotionParams; /* clang-format off */ static const WarpedMotionParams default_warp_params = { - IDENTITY, { 0, 0, (1 << WARPEDMODEL_PREC_BITS), 0, 0, (1 << WARPEDMODEL_PREC_BITS), 0, 0 }, 0, 0, 0, 0, + IDENTITY, 0, }; /* clang-format on */ @@ -225,7 +274,8 @@ static INLINE int_mv gm_get_motion_vector(const WarpedMotionParams *gm, // All global motion vectors are stored with WARPEDMODEL_PREC_BITS (16) // bits of fractional precision. The offset for a translation is stored in // entries 0 and 1. For translations, all but the top three (two if - // cm->allow_high_precision_mv is false) fractional bits are always zero. + // cm->features.allow_high_precision_mv is false) fractional bits are always + // zero. // // After the right shifts, there are 3 fractional bits of precision. If // allow_hp is false, the bottom bit is always zero (so we don't need a @@ -263,7 +313,7 @@ static INLINE int_mv gm_get_motion_vector(const WarpedMotionParams *gm, return res; } -static INLINE TransformationType get_gmtype(const WarpedMotionParams *gm) { +static INLINE TransformationType get_wmtype(const WarpedMotionParams *gm) { if (gm->wmmat[5] == (1 << WARPEDMODEL_PREC_BITS) && !gm->wmmat[4] && gm->wmmat[2] == (1 << WARPEDMODEL_PREC_BITS) && !gm->wmmat[3]) { return ((!gm->wmmat[1] && !gm->wmmat[0]) ? 
IDENTITY : TRANSLATION); @@ -277,7 +327,6 @@ static INLINE TransformationType get_gmtype(const WarpedMotionParams *gm) { typedef struct candidate_mv { int_mv this_mv; int_mv comp_mv; - int weight; } CANDIDATE_MV; static INLINE int is_zero_mv(const MV *mv) { @@ -288,10 +337,14 @@ static INLINE int is_equal_mv(const MV *a, const MV *b) { return *((const uint32_t *)a) == *((const uint32_t *)b); } -static INLINE void clamp_mv(MV *mv, int min_col, int max_col, int min_row, - int max_row) { - mv->col = clamp(mv->col, min_col, max_col); - mv->row = clamp(mv->row, min_row, max_row); +static INLINE void clamp_mv(MV *mv, const SubpelMvLimits *mv_limits) { + mv->col = clamp(mv->col, mv_limits->col_min, mv_limits->col_max); + mv->row = clamp(mv->row, mv_limits->row_min, mv_limits->row_max); +} + +static INLINE void clamp_fullmv(FULLPEL_MV *mv, const FullMvLimits *mv_limits) { + mv->col = clamp(mv->col, mv_limits->col_min, mv_limits->col_max); + mv->row = clamp(mv->row, mv_limits->row_min, mv_limits->row_max); } #ifdef __cplusplus diff --git a/media/libaom/src/av1/common/mvref_common.c b/media/libaom/src/av1/common/mvref_common.c index 7f24ab4e6..db3098cc0 100644 --- a/media/libaom/src/av1/common/mvref_common.c +++ b/media/libaom/src/av1/common/mvref_common.c @@ -23,7 +23,7 @@ static int div_mult[32] = { 0, 16384, 8192, 5461, 4096, 3276, 2730, 2340, // TODO(jingning): Consider the use of lookup table for (num / den) // altogether. -static void get_mv_projection(MV *output, MV ref, int num, int den) { +static AOM_INLINE void get_mv_projection(MV *output, MV ref, int num, int den) { den = AOMMIN(den, MAX_FRAME_DISTANCE); num = num > 0 ? 
AOMMIN(num, MAX_FRAME_DISTANCE) : AOMMAX(num, -MAX_FRAME_DISTANCE); @@ -40,7 +40,7 @@ static void get_mv_projection(MV *output, MV ref, int num, int den) { void av1_copy_frame_mvs(const AV1_COMMON *const cm, const MB_MODE_INFO *const mi, int mi_row, int mi_col, int x_mis, int y_mis) { - const int frame_mvs_stride = ROUND_POWER_OF_TWO(cm->mi_cols, 1); + const int frame_mvs_stride = ROUND_POWER_OF_TWO(cm->mi_params.mi_cols, 1); MV_REF *frame_mvs = cm->cur_frame->mvs + (mi_row >> 1) * frame_mvs_stride + (mi_col >> 1); x_mis = ROUND_POWER_OF_TWO(x_mis, 1); @@ -71,34 +71,35 @@ void av1_copy_frame_mvs(const AV1_COMMON *const cm, } } -static void add_ref_mv_candidate( +static AOM_INLINE void add_ref_mv_candidate( const MB_MODE_INFO *const candidate, const MV_REFERENCE_FRAME rf[2], uint8_t *refmv_count, uint8_t *ref_match_count, uint8_t *newmv_count, - CANDIDATE_MV *ref_mv_stack, int_mv *gm_mv_candidates, - const WarpedMotionParams *gm_params, int col, int weight) { - if (!is_inter_block(candidate)) return; // for intrabc - int index = 0, ref; + CANDIDATE_MV *ref_mv_stack, uint16_t *ref_mv_weight, + int_mv *gm_mv_candidates, const WarpedMotionParams *gm_params, + uint16_t weight) { + if (!is_inter_block(candidate)) return; assert(weight % 2 == 0); + int index, ref; if (rf[1] == NONE_FRAME) { // single reference frame for (ref = 0; ref < 2; ++ref) { if (candidate->ref_frame[ref] == rf[0]) { - int_mv this_refmv; - if (is_global_mv_block(candidate, gm_params[rf[0]].wmtype)) - this_refmv = gm_mv_candidates[0]; - else - this_refmv = get_sub_block_mv(candidate, ref, col); - - for (index = 0; index < *refmv_count; ++index) - if (ref_mv_stack[index].this_mv.as_int == this_refmv.as_int) break; - - if (index < *refmv_count) ref_mv_stack[index].weight += weight; + const int is_gm_block = + is_global_mv_block(candidate, gm_params[rf[0]].wmtype); + const int_mv this_refmv = + is_gm_block ? 
gm_mv_candidates[0] : get_block_mv(candidate, ref); + for (index = 0; index < *refmv_count; ++index) { + if (ref_mv_stack[index].this_mv.as_int == this_refmv.as_int) { + ref_mv_weight[index] += weight; + break; + } + } // Add a new item to the list. if (index == *refmv_count && *refmv_count < MAX_REF_MV_STACK_SIZE) { ref_mv_stack[index].this_mv = this_refmv; - ref_mv_stack[index].weight = weight; + ref_mv_weight[index] = weight; ++(*refmv_count); } if (have_newmv_in_inter_mode(candidate->mode)) ++*newmv_count; @@ -114,21 +115,22 @@ static void add_ref_mv_candidate( if (is_global_mv_block(candidate, gm_params[rf[ref]].wmtype)) this_refmv[ref] = gm_mv_candidates[ref]; else - this_refmv[ref] = get_sub_block_mv(candidate, ref, col); + this_refmv[ref] = get_block_mv(candidate, ref); } - for (index = 0; index < *refmv_count; ++index) + for (index = 0; index < *refmv_count; ++index) { if ((ref_mv_stack[index].this_mv.as_int == this_refmv[0].as_int) && - (ref_mv_stack[index].comp_mv.as_int == this_refmv[1].as_int)) + (ref_mv_stack[index].comp_mv.as_int == this_refmv[1].as_int)) { + ref_mv_weight[index] += weight; break; - - if (index < *refmv_count) ref_mv_stack[index].weight += weight; + } + } // Add a new item to the list. 
if (index == *refmv_count && *refmv_count < MAX_REF_MV_STACK_SIZE) { ref_mv_stack[index].this_mv = this_refmv[0]; ref_mv_stack[index].comp_mv = this_refmv[1]; - ref_mv_stack[index].weight = weight; + ref_mv_weight[index] = weight; ++(*refmv_count); } if (have_newmv_in_inter_mode(candidate->mode)) ++*newmv_count; @@ -137,42 +139,39 @@ static void add_ref_mv_candidate( } } -static void scan_row_mbmi(const AV1_COMMON *cm, const MACROBLOCKD *xd, - int mi_row, int mi_col, - const MV_REFERENCE_FRAME rf[2], int row_offset, - CANDIDATE_MV *ref_mv_stack, uint8_t *refmv_count, - uint8_t *ref_match_count, uint8_t *newmv_count, - int_mv *gm_mv_candidates, int max_row_offset, - int *processed_rows) { - int end_mi = AOMMIN(xd->n4_w, cm->mi_cols - mi_col); +static AOM_INLINE void scan_row_mbmi( + const AV1_COMMON *cm, const MACROBLOCKD *xd, int mi_col, + const MV_REFERENCE_FRAME rf[2], int row_offset, CANDIDATE_MV *ref_mv_stack, + uint16_t *ref_mv_weight, uint8_t *refmv_count, uint8_t *ref_match_count, + uint8_t *newmv_count, int_mv *gm_mv_candidates, int max_row_offset, + int *processed_rows) { + int end_mi = AOMMIN(xd->width, cm->mi_params.mi_cols - mi_col); end_mi = AOMMIN(end_mi, mi_size_wide[BLOCK_64X64]); - const int n8_w_8 = mi_size_wide[BLOCK_8X8]; - const int n8_w_16 = mi_size_wide[BLOCK_16X16]; - int i; + const int width_8x8 = mi_size_wide[BLOCK_8X8]; + const int width_16x16 = mi_size_wide[BLOCK_16X16]; int col_offset = 0; // TODO(jingning): Revisit this part after cb4x4 is stable. 
if (abs(row_offset) > 1) { col_offset = 1; - if ((mi_col & 0x01) && xd->n4_w < n8_w_8) --col_offset; + if ((mi_col & 0x01) && xd->width < width_8x8) --col_offset; } - const int use_step_16 = (xd->n4_w >= 16); + const int use_step_16 = (xd->width >= 16); MB_MODE_INFO **const candidate_mi0 = xd->mi + row_offset * xd->mi_stride; - (void)mi_row; - for (i = 0; i < end_mi;) { + for (int i = 0; i < end_mi;) { const MB_MODE_INFO *const candidate = candidate_mi0[col_offset + i]; const int candidate_bsize = candidate->sb_type; const int n4_w = mi_size_wide[candidate_bsize]; - int len = AOMMIN(xd->n4_w, n4_w); + int len = AOMMIN(xd->width, n4_w); if (use_step_16) - len = AOMMAX(n8_w_16, len); + len = AOMMAX(width_16x16, len); else if (abs(row_offset) > 1) - len = AOMMAX(len, n8_w_8); + len = AOMMAX(len, width_8x8); - int weight = 2; - if (xd->n4_w >= n8_w_8 && xd->n4_w <= n4_w) { - int inc = AOMMIN(-max_row_offset + row_offset + 1, - mi_size_high[candidate_bsize]); + uint16_t weight = 2; + if (xd->width >= width_8x8 && xd->width <= n4_w) { + uint16_t inc = AOMMIN(-max_row_offset + row_offset + 1, + mi_size_high[candidate_bsize]); // Obtain range used in weight calculation. weight = AOMMAX(weight, inc); // Update processed rows. 
@@ -180,21 +179,20 @@ static void scan_row_mbmi(const AV1_COMMON *cm, const MACROBLOCKD *xd, } add_ref_mv_candidate(candidate, rf, refmv_count, ref_match_count, - newmv_count, ref_mv_stack, gm_mv_candidates, - cm->global_motion, col_offset + i, len * weight); + newmv_count, ref_mv_stack, ref_mv_weight, + gm_mv_candidates, cm->global_motion, len * weight); i += len; } } -static void scan_col_mbmi(const AV1_COMMON *cm, const MACROBLOCKD *xd, - int mi_row, int mi_col, - const MV_REFERENCE_FRAME rf[2], int col_offset, - CANDIDATE_MV *ref_mv_stack, uint8_t *refmv_count, - uint8_t *ref_match_count, uint8_t *newmv_count, - int_mv *gm_mv_candidates, int max_col_offset, - int *processed_cols) { - int end_mi = AOMMIN(xd->n4_h, cm->mi_rows - mi_row); +static AOM_INLINE void scan_col_mbmi( + const AV1_COMMON *cm, const MACROBLOCKD *xd, int mi_row, + const MV_REFERENCE_FRAME rf[2], int col_offset, CANDIDATE_MV *ref_mv_stack, + uint16_t *ref_mv_weight, uint8_t *refmv_count, uint8_t *ref_match_count, + uint8_t *newmv_count, int_mv *gm_mv_candidates, int max_col_offset, + int *processed_cols) { + int end_mi = AOMMIN(xd->height, cm->mi_params.mi_rows - mi_row); end_mi = AOMMIN(end_mi, mi_size_high[BLOCK_64X64]); const int n8_h_8 = mi_size_high[BLOCK_8X8]; const int n8_h_16 = mi_size_high[BLOCK_16X16]; @@ -202,24 +200,23 @@ static void scan_col_mbmi(const AV1_COMMON *cm, const MACROBLOCKD *xd, int row_offset = 0; if (abs(col_offset) > 1) { row_offset = 1; - if ((mi_row & 0x01) && xd->n4_h < n8_h_8) --row_offset; + if ((mi_row & 0x01) && xd->height < n8_h_8) --row_offset; } - const int use_step_16 = (xd->n4_h >= 16); - (void)mi_col; + const int use_step_16 = (xd->height >= 16); for (i = 0; i < end_mi;) { const MB_MODE_INFO *const candidate = xd->mi[(row_offset + i) * xd->mi_stride + col_offset]; const int candidate_bsize = candidate->sb_type; const int n4_h = mi_size_high[candidate_bsize]; - int len = AOMMIN(xd->n4_h, n4_h); + int len = AOMMIN(xd->height, n4_h); if (use_step_16) len 
= AOMMAX(n8_h_16, len); else if (abs(col_offset) > 1) len = AOMMAX(len, n8_h_8); int weight = 2; - if (xd->n4_h >= n8_h_8 && xd->n4_h <= n4_h) { + if (xd->height >= n8_h_8 && xd->height <= n4_h) { int inc = AOMMIN(-max_col_offset + col_offset + 1, mi_size_wide[candidate_bsize]); // Obtain range used in weight calculation. @@ -229,20 +226,19 @@ static void scan_col_mbmi(const AV1_COMMON *cm, const MACROBLOCKD *xd, } add_ref_mv_candidate(candidate, rf, refmv_count, ref_match_count, - newmv_count, ref_mv_stack, gm_mv_candidates, - cm->global_motion, col_offset, len * weight); + newmv_count, ref_mv_stack, ref_mv_weight, + gm_mv_candidates, cm->global_motion, len * weight); i += len; } } -static void scan_blk_mbmi(const AV1_COMMON *cm, const MACROBLOCKD *xd, - const int mi_row, const int mi_col, - const MV_REFERENCE_FRAME rf[2], int row_offset, - int col_offset, CANDIDATE_MV *ref_mv_stack, - uint8_t *ref_match_count, uint8_t *newmv_count, - int_mv *gm_mv_candidates, - uint8_t refmv_count[MODE_CTX_REF_FRAMES]) { +static AOM_INLINE void scan_blk_mbmi( + const AV1_COMMON *cm, const MACROBLOCKD *xd, const int mi_row, + const int mi_col, const MV_REFERENCE_FRAME rf[2], int row_offset, + int col_offset, CANDIDATE_MV *ref_mv_stack, uint16_t *ref_mv_weight, + uint8_t *ref_match_count, uint8_t *newmv_count, int_mv *gm_mv_candidates, + uint8_t *refmv_count) { const TileInfo *const tile = &xd->tile; POSITION mi_pos; @@ -255,8 +251,8 @@ static void scan_blk_mbmi(const AV1_COMMON *cm, const MACROBLOCKD *xd, const int len = mi_size_wide[BLOCK_8X8]; add_ref_mv_candidate(candidate, rf, refmv_count, ref_match_count, - newmv_count, ref_mv_stack, gm_mv_candidates, - cm->global_motion, mi_pos.col, 2 * len); + newmv_count, ref_mv_stack, ref_mv_weight, + gm_mv_candidates, cm->global_motion, 2 * len); } // Analyze a single 8x8 block motion information. 
} @@ -291,19 +287,19 @@ static int has_top_right(const AV1_COMMON *cm, const MACROBLOCKD *xd, // The left hand of two vertical rectangles always has a top right (as the // block above will have been decoded) - if (xd->n4_w < xd->n4_h) + if (xd->width < xd->height) if (!xd->is_sec_rect) has_tr = 1; // The bottom of two horizontal rectangles never has a top right (as the block // to the right won't have been decoded) - if (xd->n4_w > xd->n4_h) + if (xd->width > xd->height) if (xd->is_sec_rect) has_tr = 0; // The bottom left square of a Vertical A (in the old format) does // not have a top right as it is decoded before the right hand // rectangle of the partition if (xd->mi[0]->partition == PARTITION_VERT_A) { - if (xd->n4_w == xd->n4_h) + if (xd->width == xd->height) if (mask_row & bs) has_tr = 0; } @@ -326,112 +322,98 @@ static int check_sb_border(const int mi_row, const int mi_col, static int add_tpl_ref_mv(const AV1_COMMON *cm, const MACROBLOCKD *xd, int mi_row, int mi_col, MV_REFERENCE_FRAME ref_frame, int blk_row, int blk_col, int_mv *gm_mv_candidates, - uint8_t refmv_count[MODE_CTX_REF_FRAMES], - CANDIDATE_MV ref_mv_stacks[][MAX_REF_MV_STACK_SIZE], + uint8_t *const refmv_count, + CANDIDATE_MV ref_mv_stack[MAX_REF_MV_STACK_SIZE], + uint16_t ref_mv_weight[MAX_REF_MV_STACK_SIZE], int16_t *mode_context) { POSITION mi_pos; - int idx; - const int weight_unit = 1; // mi_size_wide[BLOCK_8X8]; - mi_pos.row = (mi_row & 0x01) ? blk_row : blk_row + 1; mi_pos.col = (mi_col & 0x01) ? 
blk_col : blk_col + 1; if (!is_inside(&xd->tile, mi_col, mi_row, &mi_pos)) return 0; const TPL_MV_REF *prev_frame_mvs = - cm->tpl_mvs + ((mi_row + mi_pos.row) >> 1) * (cm->mi_stride >> 1) + + cm->tpl_mvs + + ((mi_row + mi_pos.row) >> 1) * (cm->mi_params.mi_stride >> 1) + ((mi_col + mi_pos.col) >> 1); + if (prev_frame_mvs->mfmv0.as_int == INVALID_MV) return 0; MV_REFERENCE_FRAME rf[2]; av1_set_ref_frame(rf, ref_frame); + const uint16_t weight_unit = 1; // mi_size_wide[BLOCK_8X8]; + const int cur_frame_index = cm->cur_frame->order_hint; + const RefCntBuffer *const buf_0 = get_ref_frame_buf(cm, rf[0]); + const int frame0_index = buf_0->order_hint; + const int cur_offset_0 = get_relative_dist(&cm->seq_params.order_hint_info, + cur_frame_index, frame0_index); + int idx; + const int allow_high_precision_mv = cm->features.allow_high_precision_mv; + const int force_integer_mv = cm->features.cur_frame_force_integer_mv; + + int_mv this_refmv; + get_mv_projection(&this_refmv.as_mv, prev_frame_mvs->mfmv0.as_mv, + cur_offset_0, prev_frame_mvs->ref_frame_offset); + lower_mv_precision(&this_refmv.as_mv, allow_high_precision_mv, + force_integer_mv); + if (rf[1] == NONE_FRAME) { - int cur_frame_index = cm->cur_frame->cur_frame_offset; - int buf_idx_0 = cm->frame_refs[FWD_RF_OFFSET(rf[0])].idx; - int frame0_index = cm->buffer_pool->frame_bufs[buf_idx_0].cur_frame_offset; - int cur_offset_0 = get_relative_dist(cm, cur_frame_index, frame0_index); - CANDIDATE_MV *ref_mv_stack = ref_mv_stacks[rf[0]]; - - if (prev_frame_mvs->mfmv0.as_int != INVALID_MV) { - int_mv this_refmv; - - get_mv_projection(&this_refmv.as_mv, prev_frame_mvs->mfmv0.as_mv, - cur_offset_0, prev_frame_mvs->ref_frame_offset); - lower_mv_precision(&this_refmv.as_mv, cm->allow_high_precision_mv, - cm->cur_frame_force_integer_mv); - - if (blk_row == 0 && blk_col == 0) - if (abs(this_refmv.as_mv.row - gm_mv_candidates[0].as_mv.row) >= 16 || - abs(this_refmv.as_mv.col - gm_mv_candidates[0].as_mv.col) >= 16) - 
mode_context[ref_frame] |= (1 << GLOBALMV_OFFSET); - - for (idx = 0; idx < refmv_count[rf[0]]; ++idx) - if (this_refmv.as_int == ref_mv_stack[idx].this_mv.as_int) break; - - if (idx < refmv_count[rf[0]]) ref_mv_stack[idx].weight += 2 * weight_unit; - - if (idx == refmv_count[rf[0]] && - refmv_count[rf[0]] < MAX_REF_MV_STACK_SIZE) { - ref_mv_stack[idx].this_mv.as_int = this_refmv.as_int; - ref_mv_stack[idx].weight = 2 * weight_unit; - ++(refmv_count[rf[0]]); - } - return 1; + if (blk_row == 0 && blk_col == 0) { + if (abs(this_refmv.as_mv.row - gm_mv_candidates[0].as_mv.row) >= 16 || + abs(this_refmv.as_mv.col - gm_mv_candidates[0].as_mv.col) >= 16) + mode_context[ref_frame] |= (1 << GLOBALMV_OFFSET); + } + + for (idx = 0; idx < *refmv_count; ++idx) + if (this_refmv.as_int == ref_mv_stack[idx].this_mv.as_int) break; + + if (idx < *refmv_count) ref_mv_weight[idx] += 2 * weight_unit; + + if (idx == *refmv_count && *refmv_count < MAX_REF_MV_STACK_SIZE) { + ref_mv_stack[idx].this_mv.as_int = this_refmv.as_int; + ref_mv_weight[idx] = 2 * weight_unit; + ++(*refmv_count); } } else { // Process compound inter mode - int cur_frame_index = cm->cur_frame->cur_frame_offset; - int buf_idx_0 = cm->frame_refs[FWD_RF_OFFSET(rf[0])].idx; - int frame0_index = cm->buffer_pool->frame_bufs[buf_idx_0].cur_frame_offset; - - int cur_offset_0 = get_relative_dist(cm, cur_frame_index, frame0_index); - int buf_idx_1 = cm->frame_refs[FWD_RF_OFFSET(rf[1])].idx; - int frame1_index = cm->buffer_pool->frame_bufs[buf_idx_1].cur_frame_offset; - int cur_offset_1 = get_relative_dist(cm, cur_frame_index, frame1_index); - CANDIDATE_MV *ref_mv_stack = ref_mv_stacks[ref_frame]; - - if (prev_frame_mvs->mfmv0.as_int != INVALID_MV) { - int_mv this_refmv; - int_mv comp_refmv; - get_mv_projection(&this_refmv.as_mv, prev_frame_mvs->mfmv0.as_mv, - cur_offset_0, prev_frame_mvs->ref_frame_offset); - get_mv_projection(&comp_refmv.as_mv, prev_frame_mvs->mfmv0.as_mv, - cur_offset_1, prev_frame_mvs->ref_frame_offset); - 
- lower_mv_precision(&this_refmv.as_mv, cm->allow_high_precision_mv, - cm->cur_frame_force_integer_mv); - lower_mv_precision(&comp_refmv.as_mv, cm->allow_high_precision_mv, - cm->cur_frame_force_integer_mv); - - if (blk_row == 0 && blk_col == 0) - if (abs(this_refmv.as_mv.row - gm_mv_candidates[0].as_mv.row) >= 16 || - abs(this_refmv.as_mv.col - gm_mv_candidates[0].as_mv.col) >= 16 || - abs(comp_refmv.as_mv.row - gm_mv_candidates[1].as_mv.row) >= 16 || - abs(comp_refmv.as_mv.col - gm_mv_candidates[1].as_mv.col) >= 16) - mode_context[ref_frame] |= (1 << GLOBALMV_OFFSET); - - for (idx = 0; idx < refmv_count[ref_frame]; ++idx) - if (this_refmv.as_int == ref_mv_stack[idx].this_mv.as_int && - comp_refmv.as_int == ref_mv_stack[idx].comp_mv.as_int) - break; + const RefCntBuffer *const buf_1 = get_ref_frame_buf(cm, rf[1]); + const int frame1_index = buf_1->order_hint; + const int cur_offset_1 = get_relative_dist(&cm->seq_params.order_hint_info, + cur_frame_index, frame1_index); + int_mv comp_refmv; + get_mv_projection(&comp_refmv.as_mv, prev_frame_mvs->mfmv0.as_mv, + cur_offset_1, prev_frame_mvs->ref_frame_offset); + lower_mv_precision(&comp_refmv.as_mv, allow_high_precision_mv, + force_integer_mv); + + if (blk_row == 0 && blk_col == 0) { + if (abs(this_refmv.as_mv.row - gm_mv_candidates[0].as_mv.row) >= 16 || + abs(this_refmv.as_mv.col - gm_mv_candidates[0].as_mv.col) >= 16 || + abs(comp_refmv.as_mv.row - gm_mv_candidates[1].as_mv.row) >= 16 || + abs(comp_refmv.as_mv.col - gm_mv_candidates[1].as_mv.col) >= 16) + mode_context[ref_frame] |= (1 << GLOBALMV_OFFSET); + } + + for (idx = 0; idx < *refmv_count; ++idx) { + if (this_refmv.as_int == ref_mv_stack[idx].this_mv.as_int && + comp_refmv.as_int == ref_mv_stack[idx].comp_mv.as_int) + break; + } - if (idx < refmv_count[ref_frame]) - ref_mv_stack[idx].weight += 2 * weight_unit; + if (idx < *refmv_count) ref_mv_weight[idx] += 2 * weight_unit; - if (idx == refmv_count[ref_frame] && - refmv_count[ref_frame] < 
MAX_REF_MV_STACK_SIZE) { - ref_mv_stack[idx].this_mv.as_int = this_refmv.as_int; - ref_mv_stack[idx].comp_mv.as_int = comp_refmv.as_int; - ref_mv_stack[idx].weight = 2 * weight_unit; - ++(refmv_count[ref_frame]); - } - return 1; + if (idx == *refmv_count && *refmv_count < MAX_REF_MV_STACK_SIZE) { + ref_mv_stack[idx].this_mv.as_int = this_refmv.as_int; + ref_mv_stack[idx].comp_mv.as_int = comp_refmv.as_int; + ref_mv_weight[idx] = 2 * weight_unit; + ++(*refmv_count); } } - return 0; + + return 1; } -static void process_compound_ref_mv_candidate( +static AOM_INLINE void process_compound_ref_mv_candidate( const MB_MODE_INFO *const candidate, const AV1_COMMON *const cm, const MV_REFERENCE_FRAME *const rf, int_mv ref_id[2][2], int ref_id_count[2], int_mv ref_diff[2][2], int ref_diff_count[2]) { @@ -456,10 +438,11 @@ static void process_compound_ref_mv_candidate( } } -static void process_single_ref_mv_candidate( +static AOM_INLINE void process_single_ref_mv_candidate( const MB_MODE_INFO *const candidate, const AV1_COMMON *const cm, - MV_REFERENCE_FRAME ref_frame, uint8_t refmv_count[MODE_CTX_REF_FRAMES], - CANDIDATE_MV ref_mv_stack[][MAX_REF_MV_STACK_SIZE]) { + MV_REFERENCE_FRAME ref_frame, uint8_t *const refmv_count, + CANDIDATE_MV ref_mv_stack[MAX_REF_MV_STACK_SIZE], + uint16_t ref_mv_weight[MAX_REF_MV_STACK_SIZE]) { for (int rf_idx = 0; rf_idx < 2; ++rf_idx) { if (candidate->ref_frame[rf_idx] > INTRA_FRAME) { int_mv this_mv = candidate->mv[rf_idx]; @@ -469,49 +452,50 @@ static void process_single_ref_mv_candidate( this_mv.as_mv.col = -this_mv.as_mv.col; } int stack_idx; - for (stack_idx = 0; stack_idx < refmv_count[ref_frame]; ++stack_idx) { - const int_mv stack_mv = ref_mv_stack[ref_frame][stack_idx].this_mv; + for (stack_idx = 0; stack_idx < *refmv_count; ++stack_idx) { + const int_mv stack_mv = ref_mv_stack[stack_idx].this_mv; if (this_mv.as_int == stack_mv.as_int) break; } - if (stack_idx == refmv_count[ref_frame]) { - ref_mv_stack[ref_frame][stack_idx].this_mv = 
this_mv; + if (stack_idx == *refmv_count) { + ref_mv_stack[stack_idx].this_mv = this_mv; // TODO(jingning): Set an arbitrary small number here. The weight // doesn't matter as long as it is properly initialized. - ref_mv_stack[ref_frame][stack_idx].weight = 2; - ++refmv_count[ref_frame]; + ref_mv_weight[stack_idx] = 2; + ++(*refmv_count); } } } } -static void setup_ref_mv_list( +static AOM_INLINE void setup_ref_mv_list( const AV1_COMMON *cm, const MACROBLOCKD *xd, MV_REFERENCE_FRAME ref_frame, - uint8_t refmv_count[MODE_CTX_REF_FRAMES], - CANDIDATE_MV ref_mv_stack[][MAX_REF_MV_STACK_SIZE], - int_mv mv_ref_list[][MAX_MV_REF_CANDIDATES], int_mv *gm_mv_candidates, + uint8_t *const refmv_count, + CANDIDATE_MV ref_mv_stack[MAX_REF_MV_STACK_SIZE], + uint16_t ref_mv_weight[MAX_REF_MV_STACK_SIZE], + int_mv mv_ref_list[MAX_MV_REF_CANDIDATES], int_mv *gm_mv_candidates, int mi_row, int mi_col, int16_t *mode_context) { - const int bs = AOMMAX(xd->n4_w, xd->n4_h); + const int bs = AOMMAX(xd->width, xd->height); const int has_tr = has_top_right(cm, xd, mi_row, mi_col, bs); MV_REFERENCE_FRAME rf[2]; const TileInfo *const tile = &xd->tile; int max_row_offset = 0, max_col_offset = 0; - const int row_adj = (xd->n4_h < mi_size_high[BLOCK_8X8]) && (mi_row & 0x01); - const int col_adj = (xd->n4_w < mi_size_wide[BLOCK_8X8]) && (mi_col & 0x01); + const int row_adj = (xd->height < mi_size_high[BLOCK_8X8]) && (mi_row & 0x01); + const int col_adj = (xd->width < mi_size_wide[BLOCK_8X8]) && (mi_col & 0x01); int processed_rows = 0; int processed_cols = 0; av1_set_ref_frame(rf, ref_frame); mode_context[ref_frame] = 0; - refmv_count[ref_frame] = 0; + *refmv_count = 0; // Find valid maximum row/col offset. 
if (xd->up_available) { max_row_offset = -(MVREF_ROW_COLS << 1) + row_adj; - if (xd->n4_h < mi_size_high[BLOCK_8X8]) + if (xd->height < mi_size_high[BLOCK_8X8]) max_row_offset = -(2 << 1) + row_adj; max_row_offset = find_valid_row_offset(tile, mi_row, max_row_offset); @@ -520,7 +504,7 @@ static void setup_ref_mv_list( if (xd->left_available) { max_col_offset = -(MVREF_ROW_COLS << 1) + col_adj; - if (xd->n4_w < mi_size_wide[BLOCK_8X8]) + if (xd->width < mi_size_wide[BLOCK_8X8]) max_col_offset = -(2 << 1) + col_adj; max_col_offset = find_valid_col_offset(tile, mi_col, max_col_offset); @@ -532,48 +516,48 @@ static void setup_ref_mv_list( // Scan the first above row mode info. row_offset = -1; if (abs(max_row_offset) >= 1) - scan_row_mbmi(cm, xd, mi_row, mi_col, rf, -1, ref_mv_stack[ref_frame], - &refmv_count[ref_frame], &row_match_count, &newmv_count, - gm_mv_candidates, max_row_offset, &processed_rows); + scan_row_mbmi(cm, xd, mi_col, rf, -1, ref_mv_stack, ref_mv_weight, + refmv_count, &row_match_count, &newmv_count, gm_mv_candidates, + max_row_offset, &processed_rows); // Scan the first left column mode info. 
col_offset = -1; if (abs(max_col_offset) >= 1) - scan_col_mbmi(cm, xd, mi_row, mi_col, rf, -1, ref_mv_stack[ref_frame], - &refmv_count[ref_frame], &col_match_count, &newmv_count, - gm_mv_candidates, max_col_offset, &processed_cols); + scan_col_mbmi(cm, xd, mi_row, rf, -1, ref_mv_stack, ref_mv_weight, + refmv_count, &col_match_count, &newmv_count, gm_mv_candidates, + max_col_offset, &processed_cols); // Check top-right boundary if (has_tr) - scan_blk_mbmi(cm, xd, mi_row, mi_col, rf, -1, xd->n4_w, - ref_mv_stack[ref_frame], &row_match_count, &newmv_count, - gm_mv_candidates, &refmv_count[ref_frame]); + scan_blk_mbmi(cm, xd, mi_row, mi_col, rf, -1, xd->width, ref_mv_stack, + ref_mv_weight, &row_match_count, &newmv_count, + gm_mv_candidates, refmv_count); const uint8_t nearest_match = (row_match_count > 0) + (col_match_count > 0); - const uint8_t nearest_refmv_count = refmv_count[ref_frame]; + const uint8_t nearest_refmv_count = *refmv_count; // TODO(yunqing): for comp_search, do it for all 3 cases. 
for (int idx = 0; idx < nearest_refmv_count; ++idx) - ref_mv_stack[ref_frame][idx].weight += REF_CAT_LEVEL; + ref_mv_weight[idx] += REF_CAT_LEVEL; - if (cm->allow_ref_frame_mvs) { + if (cm->features.allow_ref_frame_mvs) { int is_available = 0; - const int voffset = AOMMAX(mi_size_high[BLOCK_8X8], xd->n4_h); - const int hoffset = AOMMAX(mi_size_wide[BLOCK_8X8], xd->n4_w); - const int blk_row_end = AOMMIN(xd->n4_h, mi_size_high[BLOCK_64X64]); - const int blk_col_end = AOMMIN(xd->n4_w, mi_size_wide[BLOCK_64X64]); + const int voffset = AOMMAX(mi_size_high[BLOCK_8X8], xd->height); + const int hoffset = AOMMAX(mi_size_wide[BLOCK_8X8], xd->width); + const int blk_row_end = AOMMIN(xd->height, mi_size_high[BLOCK_64X64]); + const int blk_col_end = AOMMIN(xd->width, mi_size_wide[BLOCK_64X64]); const int tpl_sample_pos[3][2] = { { voffset, -2 }, { voffset, hoffset }, { voffset - 2, hoffset }, }; - const int allow_extension = (xd->n4_h >= mi_size_high[BLOCK_8X8]) && - (xd->n4_h < mi_size_high[BLOCK_64X64]) && - (xd->n4_w >= mi_size_wide[BLOCK_8X8]) && - (xd->n4_w < mi_size_wide[BLOCK_64X64]); + const int allow_extension = (xd->height >= mi_size_high[BLOCK_8X8]) && + (xd->height < mi_size_high[BLOCK_64X64]) && + (xd->width >= mi_size_wide[BLOCK_8X8]) && + (xd->width < mi_size_wide[BLOCK_64X64]); - const int step_h = (xd->n4_h >= mi_size_high[BLOCK_64X64]) + const int step_h = (xd->height >= mi_size_high[BLOCK_64X64]) ? mi_size_high[BLOCK_16X16] : mi_size_high[BLOCK_8X8]; - const int step_w = (xd->n4_w >= mi_size_wide[BLOCK_64X64]) + const int step_w = (xd->width >= mi_size_wide[BLOCK_64X64]) ? 
mi_size_wide[BLOCK_16X16] : mi_size_wide[BLOCK_8X8]; @@ -581,7 +565,7 @@ static void setup_ref_mv_list( for (int blk_col = 0; blk_col < blk_col_end; blk_col += step_w) { int ret = add_tpl_ref_mv(cm, xd, mi_row, mi_col, ref_frame, blk_row, blk_col, gm_mv_candidates, refmv_count, - ref_mv_stack, mode_context); + ref_mv_stack, ref_mv_weight, mode_context); if (blk_row == 0 && blk_col == 0) is_available = ret; } } @@ -594,16 +578,17 @@ static void setup_ref_mv_list( if (!check_sb_border(mi_row, mi_col, blk_row, blk_col)) continue; add_tpl_ref_mv(cm, xd, mi_row, mi_col, ref_frame, blk_row, blk_col, - gm_mv_candidates, refmv_count, ref_mv_stack, mode_context); + gm_mv_candidates, refmv_count, ref_mv_stack, ref_mv_weight, + mode_context); } } uint8_t dummy_newmv_count = 0; // Scan the second outer area. - scan_blk_mbmi(cm, xd, mi_row, mi_col, rf, -1, -1, ref_mv_stack[ref_frame], + scan_blk_mbmi(cm, xd, mi_row, mi_col, rf, -1, -1, ref_mv_stack, ref_mv_weight, &row_match_count, &dummy_newmv_count, gm_mv_candidates, - &refmv_count[ref_frame]); + refmv_count); for (int idx = 2; idx <= MVREF_ROW_COLS; ++idx) { const int row_offset = -(idx << 1) + 1 + row_adj; @@ -611,24 +596,21 @@ static void setup_ref_mv_list( if (abs(row_offset) <= abs(max_row_offset) && abs(row_offset) > processed_rows) - scan_row_mbmi(cm, xd, mi_row, mi_col, rf, row_offset, - ref_mv_stack[ref_frame], &refmv_count[ref_frame], - &row_match_count, &dummy_newmv_count, gm_mv_candidates, - max_row_offset, &processed_rows); + scan_row_mbmi(cm, xd, mi_col, rf, row_offset, ref_mv_stack, ref_mv_weight, + refmv_count, &row_match_count, &dummy_newmv_count, + gm_mv_candidates, max_row_offset, &processed_rows); if (abs(col_offset) <= abs(max_col_offset) && abs(col_offset) > processed_cols) - scan_col_mbmi(cm, xd, mi_row, mi_col, rf, col_offset, - ref_mv_stack[ref_frame], &refmv_count[ref_frame], - &col_match_count, &dummy_newmv_count, gm_mv_candidates, - max_col_offset, &processed_cols); + scan_col_mbmi(cm, xd, mi_row, 
rf, col_offset, ref_mv_stack, ref_mv_weight, + refmv_count, &col_match_count, &dummy_newmv_count, + gm_mv_candidates, max_col_offset, &processed_cols); } const uint8_t ref_match_count = (row_match_count > 0) + (col_match_count > 0); switch (nearest_match) { case 0: - mode_context[ref_frame] |= 0; if (ref_match_count >= 1) mode_context[ref_frame] |= 1; if (ref_match_count == 1) mode_context[ref_frame] |= (1 << REFMV_OFFSET); @@ -658,45 +640,48 @@ static void setup_ref_mv_list( while (len > 0) { int nr_len = 0; for (int idx = 1; idx < len; ++idx) { - if (ref_mv_stack[ref_frame][idx - 1].weight < - ref_mv_stack[ref_frame][idx].weight) { - CANDIDATE_MV tmp_mv = ref_mv_stack[ref_frame][idx - 1]; - ref_mv_stack[ref_frame][idx - 1] = ref_mv_stack[ref_frame][idx]; - ref_mv_stack[ref_frame][idx] = tmp_mv; + if (ref_mv_weight[idx - 1] < ref_mv_weight[idx]) { + const CANDIDATE_MV tmp_mv = ref_mv_stack[idx - 1]; + const uint16_t tmp_ref_mv_weight = ref_mv_weight[idx - 1]; + ref_mv_stack[idx - 1] = ref_mv_stack[idx]; + ref_mv_stack[idx] = tmp_mv; + ref_mv_weight[idx - 1] = ref_mv_weight[idx]; + ref_mv_weight[idx] = tmp_ref_mv_weight; nr_len = idx; } } len = nr_len; } - len = refmv_count[ref_frame]; + len = *refmv_count; while (len > nearest_refmv_count) { int nr_len = nearest_refmv_count; for (int idx = nearest_refmv_count + 1; idx < len; ++idx) { - if (ref_mv_stack[ref_frame][idx - 1].weight < - ref_mv_stack[ref_frame][idx].weight) { - CANDIDATE_MV tmp_mv = ref_mv_stack[ref_frame][idx - 1]; - ref_mv_stack[ref_frame][idx - 1] = ref_mv_stack[ref_frame][idx]; - ref_mv_stack[ref_frame][idx] = tmp_mv; + if (ref_mv_weight[idx - 1] < ref_mv_weight[idx]) { + const CANDIDATE_MV tmp_mv = ref_mv_stack[idx - 1]; + const uint16_t tmp_ref_mv_weight = ref_mv_weight[idx - 1]; + ref_mv_stack[idx - 1] = ref_mv_stack[idx]; + ref_mv_stack[idx] = tmp_mv; + ref_mv_weight[idx - 1] = ref_mv_weight[idx]; + ref_mv_weight[idx] = tmp_ref_mv_weight; nr_len = idx; } } len = nr_len; } + int mi_width = 
AOMMIN(mi_size_wide[BLOCK_64X64], xd->width); + mi_width = AOMMIN(mi_width, cm->mi_params.mi_cols - mi_col); + int mi_height = AOMMIN(mi_size_high[BLOCK_64X64], xd->height); + mi_height = AOMMIN(mi_height, cm->mi_params.mi_rows - mi_row); + const int mi_size = AOMMIN(mi_width, mi_height); if (rf[1] > NONE_FRAME) { // TODO(jingning, yunqing): Refactor and consolidate the compound and // single reference frame modes. Reduce unnecessary redundancy. - if (refmv_count[ref_frame] < MAX_MV_REF_CANDIDATES) { + if (*refmv_count < MAX_MV_REF_CANDIDATES) { int_mv ref_id[2][2], ref_diff[2][2]; int ref_id_count[2] = { 0 }, ref_diff_count[2] = { 0 }; - int mi_width = AOMMIN(mi_size_wide[BLOCK_64X64], xd->n4_w); - mi_width = AOMMIN(mi_width, cm->mi_cols - mi_col); - int mi_height = AOMMIN(mi_size_high[BLOCK_64X64], xd->n4_h); - mi_height = AOMMIN(mi_height, cm->mi_rows - mi_row); - int mi_size = AOMMIN(mi_width, mi_height); - for (int idx = 0; abs(max_row_offset) >= 1 && idx < mi_size;) { const MB_MODE_INFO *const candidate = xd->mi[-xd->mi_stride + idx]; process_compound_ref_mv_candidate( @@ -712,95 +697,82 @@ static void setup_ref_mv_list( } // Build up the compound mv predictor - int_mv comp_list[3][2]; + int_mv comp_list[MAX_MV_REF_CANDIDATES][2]; for (int idx = 0; idx < 2; ++idx) { int comp_idx = 0; - for (int list_idx = 0; list_idx < ref_id_count[idx] && comp_idx < 2; + for (int list_idx = 0; + list_idx < ref_id_count[idx] && comp_idx < MAX_MV_REF_CANDIDATES; ++list_idx, ++comp_idx) comp_list[comp_idx][idx] = ref_id[idx][list_idx]; - for (int list_idx = 0; list_idx < ref_diff_count[idx] && comp_idx < 2; + for (int list_idx = 0; + list_idx < ref_diff_count[idx] && comp_idx < MAX_MV_REF_CANDIDATES; ++list_idx, ++comp_idx) comp_list[comp_idx][idx] = ref_diff[idx][list_idx]; - for (; comp_idx < 3; ++comp_idx) + for (; comp_idx < MAX_MV_REF_CANDIDATES; ++comp_idx) comp_list[comp_idx][idx] = gm_mv_candidates[idx]; } - if (refmv_count[ref_frame]) { - assert(refmv_count[ref_frame] 
== 1); - if (comp_list[0][0].as_int == - ref_mv_stack[ref_frame][0].this_mv.as_int && - comp_list[0][1].as_int == - ref_mv_stack[ref_frame][0].comp_mv.as_int) { - ref_mv_stack[ref_frame][refmv_count[ref_frame]].this_mv = - comp_list[1][0]; - ref_mv_stack[ref_frame][refmv_count[ref_frame]].comp_mv = - comp_list[1][1]; + if (*refmv_count) { + assert(*refmv_count == 1); + if (comp_list[0][0].as_int == ref_mv_stack[0].this_mv.as_int && + comp_list[0][1].as_int == ref_mv_stack[0].comp_mv.as_int) { + ref_mv_stack[*refmv_count].this_mv = comp_list[1][0]; + ref_mv_stack[*refmv_count].comp_mv = comp_list[1][1]; } else { - ref_mv_stack[ref_frame][refmv_count[ref_frame]].this_mv = - comp_list[0][0]; - ref_mv_stack[ref_frame][refmv_count[ref_frame]].comp_mv = - comp_list[0][1]; + ref_mv_stack[*refmv_count].this_mv = comp_list[0][0]; + ref_mv_stack[*refmv_count].comp_mv = comp_list[0][1]; } - ref_mv_stack[ref_frame][refmv_count[ref_frame]].weight = 2; - ++refmv_count[ref_frame]; + ref_mv_weight[*refmv_count] = 2; + ++*refmv_count; } else { for (int idx = 0; idx < MAX_MV_REF_CANDIDATES; ++idx) { - ref_mv_stack[ref_frame][refmv_count[ref_frame]].this_mv = - comp_list[idx][0]; - ref_mv_stack[ref_frame][refmv_count[ref_frame]].comp_mv = - comp_list[idx][1]; - ref_mv_stack[ref_frame][refmv_count[ref_frame]].weight = 2; - ++refmv_count[ref_frame]; + ref_mv_stack[*refmv_count].this_mv = comp_list[idx][0]; + ref_mv_stack[*refmv_count].comp_mv = comp_list[idx][1]; + ref_mv_weight[*refmv_count] = 2; + ++*refmv_count; } } } - assert(refmv_count[ref_frame] >= 2); + assert(*refmv_count >= 2); - for (int idx = 0; idx < refmv_count[ref_frame]; ++idx) { - clamp_mv_ref(&ref_mv_stack[ref_frame][idx].this_mv.as_mv, - xd->n4_w << MI_SIZE_LOG2, xd->n4_h << MI_SIZE_LOG2, xd); - clamp_mv_ref(&ref_mv_stack[ref_frame][idx].comp_mv.as_mv, - xd->n4_w << MI_SIZE_LOG2, xd->n4_h << MI_SIZE_LOG2, xd); + for (int idx = 0; idx < *refmv_count; ++idx) { + clamp_mv_ref(&ref_mv_stack[idx].this_mv.as_mv, xd->width 
<< MI_SIZE_LOG2, + xd->height << MI_SIZE_LOG2, xd); + clamp_mv_ref(&ref_mv_stack[idx].comp_mv.as_mv, xd->width << MI_SIZE_LOG2, + xd->height << MI_SIZE_LOG2, xd); } } else { // Handle single reference frame extension - int mi_width = AOMMIN(mi_size_wide[BLOCK_64X64], xd->n4_w); - mi_width = AOMMIN(mi_width, cm->mi_cols - mi_col); - int mi_height = AOMMIN(mi_size_high[BLOCK_64X64], xd->n4_h); - mi_height = AOMMIN(mi_height, cm->mi_rows - mi_row); - int mi_size = AOMMIN(mi_width, mi_height); - for (int idx = 0; abs(max_row_offset) >= 1 && idx < mi_size && - refmv_count[ref_frame] < MAX_MV_REF_CANDIDATES;) { + *refmv_count < MAX_MV_REF_CANDIDATES;) { const MB_MODE_INFO *const candidate = xd->mi[-xd->mi_stride + idx]; process_single_ref_mv_candidate(candidate, cm, ref_frame, refmv_count, - ref_mv_stack); + ref_mv_stack, ref_mv_weight); idx += mi_size_wide[candidate->sb_type]; } for (int idx = 0; abs(max_col_offset) >= 1 && idx < mi_size && - refmv_count[ref_frame] < MAX_MV_REF_CANDIDATES;) { + *refmv_count < MAX_MV_REF_CANDIDATES;) { const MB_MODE_INFO *const candidate = xd->mi[idx * xd->mi_stride - 1]; process_single_ref_mv_candidate(candidate, cm, ref_frame, refmv_count, - ref_mv_stack); + ref_mv_stack, ref_mv_weight); idx += mi_size_high[candidate->sb_type]; } - for (int idx = 0; idx < refmv_count[ref_frame]; ++idx) { - clamp_mv_ref(&ref_mv_stack[ref_frame][idx].this_mv.as_mv, - xd->n4_w << MI_SIZE_LOG2, xd->n4_h << MI_SIZE_LOG2, xd); + for (int idx = 0; idx < *refmv_count; ++idx) { + clamp_mv_ref(&ref_mv_stack[idx].this_mv.as_mv, xd->width << MI_SIZE_LOG2, + xd->height << MI_SIZE_LOG2, xd); } if (mv_ref_list != NULL) { - for (int idx = refmv_count[ref_frame]; idx < MAX_MV_REF_CANDIDATES; ++idx) - mv_ref_list[rf[0]][idx].as_int = gm_mv_candidates[0].as_int; + for (int idx = *refmv_count; idx < MAX_MV_REF_CANDIDATES; ++idx) + mv_ref_list[idx].as_int = gm_mv_candidates[0].as_int; - for (int idx = 0; - idx < AOMMIN(MAX_MV_REF_CANDIDATES, refmv_count[ref_frame]); ++idx) 
{ - mv_ref_list[rf[0]][idx].as_int = - ref_mv_stack[ref_frame][idx].this_mv.as_int; + for (int idx = 0; idx < AOMMIN(MAX_MV_REF_CANDIDATES, *refmv_count); + ++idx) { + mv_ref_list[idx].as_int = ref_mv_stack[idx].this_mv.as_int; } } } @@ -810,43 +782,44 @@ void av1_find_mv_refs(const AV1_COMMON *cm, const MACROBLOCKD *xd, MB_MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame, uint8_t ref_mv_count[MODE_CTX_REF_FRAMES], CANDIDATE_MV ref_mv_stack[][MAX_REF_MV_STACK_SIZE], + uint16_t ref_mv_weight[][MAX_REF_MV_STACK_SIZE], int_mv mv_ref_list[][MAX_MV_REF_CANDIDATES], - int_mv *global_mvs, int mi_row, int mi_col, - int16_t *mode_context) { - int_mv zeromv[2]; - BLOCK_SIZE bsize = mi->sb_type; - MV_REFERENCE_FRAME rf[2]; - av1_set_ref_frame(rf, ref_frame); - - if (ref_frame < REF_FRAMES) { - if (ref_frame != INTRA_FRAME) { - global_mvs[ref_frame] = gm_get_motion_vector( - &cm->global_motion[ref_frame], cm->allow_high_precision_mv, bsize, - mi_col, mi_row, cm->cur_frame_force_integer_mv); - } else { + int_mv *global_mvs, int16_t *mode_context) { + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + int_mv gm_mv[2]; + + if (ref_frame == INTRA_FRAME) { + gm_mv[0].as_int = gm_mv[1].as_int = 0; + if (global_mvs != NULL) { global_mvs[ref_frame].as_int = INVALID_MV; } - } - - if (ref_frame != INTRA_FRAME) { - zeromv[0].as_int = - gm_get_motion_vector(&cm->global_motion[rf[0]], - cm->allow_high_precision_mv, bsize, mi_col, mi_row, - cm->cur_frame_force_integer_mv) - .as_int; - zeromv[1].as_int = - (rf[1] != NONE_FRAME) - ? 
gm_get_motion_vector(&cm->global_motion[rf[1]], - cm->allow_high_precision_mv, bsize, mi_col, - mi_row, cm->cur_frame_force_integer_mv) - .as_int - : 0; } else { - zeromv[0].as_int = zeromv[1].as_int = 0; + const BLOCK_SIZE bsize = mi->sb_type; + const int allow_high_precision_mv = cm->features.allow_high_precision_mv; + const int force_integer_mv = cm->features.cur_frame_force_integer_mv; + if (ref_frame < REF_FRAMES) { + gm_mv[0] = gm_get_motion_vector(&cm->global_motion[ref_frame], + allow_high_precision_mv, bsize, mi_col, + mi_row, force_integer_mv); + gm_mv[1].as_int = 0; + if (global_mvs != NULL) global_mvs[ref_frame] = gm_mv[0]; + } else { + MV_REFERENCE_FRAME rf[2]; + av1_set_ref_frame(rf, ref_frame); + gm_mv[0] = gm_get_motion_vector(&cm->global_motion[rf[0]], + allow_high_precision_mv, bsize, mi_col, + mi_row, force_integer_mv); + gm_mv[1] = gm_get_motion_vector(&cm->global_motion[rf[1]], + allow_high_precision_mv, bsize, mi_col, + mi_row, force_integer_mv); + } } - setup_ref_mv_list(cm, xd, ref_frame, ref_mv_count, ref_mv_stack, mv_ref_list, - zeromv, mi_row, mi_col, mode_context); + setup_ref_mv_list(cm, xd, ref_frame, &ref_mv_count[ref_frame], + ref_mv_stack[ref_frame], ref_mv_weight[ref_frame], + mv_ref_list ? 
mv_ref_list[ref_frame] : NULL, gm_mv, mi_row, + mi_col, mode_context); } void av1_find_best_ref_mvs(int allow_hp, int_mv *mvlist, int_mv *nearest_mv, @@ -861,26 +834,29 @@ void av1_find_best_ref_mvs(int allow_hp, int_mv *mvlist, int_mv *nearest_mv, } void av1_setup_frame_buf_refs(AV1_COMMON *cm) { - cm->cur_frame->cur_frame_offset = cm->frame_offset; + cm->cur_frame->order_hint = cm->current_frame.order_hint; + cm->cur_frame->display_order_hint = cm->current_frame.display_order_hint; MV_REFERENCE_FRAME ref_frame; for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { - const int buf_idx = cm->frame_refs[ref_frame - LAST_FRAME].idx; - if (buf_idx >= 0) - cm->cur_frame->ref_frame_offset[ref_frame - LAST_FRAME] = - cm->buffer_pool->frame_bufs[buf_idx].cur_frame_offset; + const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame); + if (buf != NULL) { + cm->cur_frame->ref_order_hints[ref_frame - LAST_FRAME] = buf->order_hint; + cm->cur_frame->ref_display_order_hint[ref_frame - LAST_FRAME] = + buf->display_order_hint; + } } } void av1_setup_frame_sign_bias(AV1_COMMON *cm) { MV_REFERENCE_FRAME ref_frame; for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { - const int buf_idx = cm->frame_refs[ref_frame - LAST_FRAME].idx; - if (cm->seq_params.enable_order_hint && buf_idx != INVALID_IDX) { - const int ref_frame_offset = - cm->buffer_pool->frame_bufs[buf_idx].cur_frame_offset; + const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame); + if (cm->seq_params.order_hint_info.enable_order_hint && buf != NULL) { + const int ref_order_hint = buf->order_hint; cm->ref_frame_sign_bias[ref_frame] = - (get_relative_dist(cm, ref_frame_offset, (int)cm->frame_offset) <= 0) + (get_relative_dist(&cm->seq_params.order_hint_info, ref_order_hint, + (int)cm->current_frame.order_hint) <= 0) ? 0 : 1; } else { @@ -908,8 +884,8 @@ static int get_block_position(AV1_COMMON *cm, int *mi_r, int *mi_c, int blk_row, const int col = (sign_bias == 1) ? 
blk_col - col_offset : blk_col + col_offset; - if (row < 0 || row >= (cm->mi_rows >> 1) || col < 0 || - col >= (cm->mi_cols >> 1)) + if (row < 0 || row >= (cm->mi_params.mi_rows >> 1) || col < 0 || + col >= (cm->mi_params.mi_cols >> 1)) return 0; if (row < base_blk_row - (MAX_OFFSET_HEIGHT >> 3) || @@ -935,35 +911,36 @@ static int motion_field_projection(AV1_COMMON *cm, TPL_MV_REF *tpl_mvs_base = cm->tpl_mvs; int ref_offset[REF_FRAMES] = { 0 }; - (void)dir; - - const int start_frame_idx = cm->frame_refs[FWD_RF_OFFSET(start_frame)].idx; - if (start_frame_idx < 0) return 0; + const RefCntBuffer *const start_frame_buf = + get_ref_frame_buf(cm, start_frame); + if (start_frame_buf == NULL) return 0; - if (cm->buffer_pool->frame_bufs[start_frame_idx].intra_only) return 0; + if (start_frame_buf->frame_type == KEY_FRAME || + start_frame_buf->frame_type == INTRA_ONLY_FRAME) + return 0; - if (cm->buffer_pool->frame_bufs[start_frame_idx].mi_rows != cm->mi_rows || - cm->buffer_pool->frame_bufs[start_frame_idx].mi_cols != cm->mi_cols) + if (start_frame_buf->mi_rows != cm->mi_params.mi_rows || + start_frame_buf->mi_cols != cm->mi_params.mi_cols) return 0; - const int start_frame_offset = - cm->buffer_pool->frame_bufs[start_frame_idx].cur_frame_offset; - const unsigned int *const ref_frame_offsets = - &cm->buffer_pool->frame_bufs[start_frame_idx].ref_frame_offset[0]; - const int cur_frame_offset = cm->cur_frame->cur_frame_offset; - int start_to_current_frame_offset = - get_relative_dist(cm, start_frame_offset, cur_frame_offset); + const int start_frame_order_hint = start_frame_buf->order_hint; + const unsigned int *const ref_order_hints = + &start_frame_buf->ref_order_hints[0]; + const int cur_order_hint = cm->cur_frame->order_hint; + int start_to_current_frame_offset = get_relative_dist( + &cm->seq_params.order_hint_info, start_frame_order_hint, cur_order_hint); for (MV_REFERENCE_FRAME rf = LAST_FRAME; rf <= INTER_REFS_PER_FRAME; ++rf) { - ref_offset[rf] = get_relative_dist(cm, 
start_frame_offset, - ref_frame_offsets[rf - LAST_FRAME]); + ref_offset[rf] = get_relative_dist(&cm->seq_params.order_hint_info, + start_frame_order_hint, + ref_order_hints[rf - LAST_FRAME]); } if (dir == 2) start_to_current_frame_offset = -start_to_current_frame_offset; - MV_REF *mv_ref_base = cm->buffer_pool->frame_bufs[start_frame_idx].mvs; - const int mvs_rows = (cm->mi_rows + 1) >> 1; - const int mvs_cols = (cm->mi_cols + 1) >> 1; + MV_REF *mv_ref_base = start_frame_buf->mvs; + const int mvs_rows = (cm->mi_params.mi_rows + 1) >> 1; + const int mvs_cols = (cm->mi_params.mi_cols + 1) >> 1; for (int blk_row = 0; blk_row < mvs_rows; ++blk_row) { for (int blk_col = 0; blk_col < mvs_cols; ++blk_col) { @@ -988,7 +965,7 @@ static int motion_field_projection(AV1_COMMON *cm, } if (pos_valid) { - const int mi_offset = mi_r * (cm->mi_stride >> 1) + mi_c; + const int mi_offset = mi_r * (cm->mi_params.mi_stride >> 1) + mi_c; tpl_mvs_base[mi_offset].mfmv0.as_mv.row = fwd_mv.row; tpl_mvs_base[mi_offset].mfmv0.as_mv.col = fwd_mv.col; @@ -1002,33 +979,35 @@ static int motion_field_projection(AV1_COMMON *cm, } void av1_setup_motion_field(AV1_COMMON *cm) { + const OrderHintInfo *const order_hint_info = &cm->seq_params.order_hint_info; + memset(cm->ref_frame_side, 0, sizeof(cm->ref_frame_side)); - if (!cm->seq_params.enable_order_hint) return; + if (!order_hint_info->enable_order_hint) return; TPL_MV_REF *tpl_mvs_base = cm->tpl_mvs; - int size = ((cm->mi_rows + MAX_MIB_SIZE) >> 1) * (cm->mi_stride >> 1); + int size = ((cm->mi_params.mi_rows + MAX_MIB_SIZE) >> 1) * + (cm->mi_params.mi_stride >> 1); for (int idx = 0; idx < size; ++idx) { tpl_mvs_base[idx].mfmv0.as_int = INVALID_MV; tpl_mvs_base[idx].ref_frame_offset = 0; } - const int cur_order_hint = cm->cur_frame->cur_frame_offset; - RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs; + const int cur_order_hint = cm->cur_frame->order_hint; - int ref_buf_idx[INTER_REFS_PER_FRAME]; + const RefCntBuffer 
*ref_buf[INTER_REFS_PER_FRAME]; int ref_order_hint[INTER_REFS_PER_FRAME]; for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) { const int ref_idx = ref_frame - LAST_FRAME; - const int buf_idx = cm->frame_refs[ref_idx].idx; + const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame); int order_hint = 0; - if (buf_idx >= 0) order_hint = frame_bufs[buf_idx].cur_frame_offset; + if (buf != NULL) order_hint = buf->order_hint; - ref_buf_idx[ref_idx] = buf_idx; + ref_buf[ref_idx] = buf; ref_order_hint[ref_idx] = order_hint; - if (get_relative_dist(cm, order_hint, cur_order_hint) > 0) + if (get_relative_dist(order_hint_info, order_hint, cur_order_hint) > 0) cm->ref_frame_side[ref_frame] = 1; else if (order_hint == cur_order_hint) cm->ref_frame_side[ref_frame] = -1; @@ -1036,10 +1015,10 @@ void av1_setup_motion_field(AV1_COMMON *cm) { int ref_stamp = MFMV_STACK_SIZE - 1; - if (ref_buf_idx[LAST_FRAME - LAST_FRAME] >= 0) { + if (ref_buf[LAST_FRAME - LAST_FRAME] != NULL) { const int alt_of_lst_order_hint = - frame_bufs[ref_buf_idx[LAST_FRAME - LAST_FRAME]] - .ref_frame_offset[ALTREF_FRAME - LAST_FRAME]; + ref_buf[LAST_FRAME - LAST_FRAME] + ->ref_order_hints[ALTREF_FRAME - LAST_FRAME]; const int is_lst_overlay = (alt_of_lst_order_hint == ref_order_hint[GOLDEN_FRAME - LAST_FRAME]); @@ -1047,47 +1026,50 @@ void av1_setup_motion_field(AV1_COMMON *cm) { --ref_stamp; } - if (get_relative_dist(cm, ref_order_hint[BWDREF_FRAME - LAST_FRAME], + if (get_relative_dist(order_hint_info, + ref_order_hint[BWDREF_FRAME - LAST_FRAME], cur_order_hint) > 0) { if (motion_field_projection(cm, BWDREF_FRAME, 0)) --ref_stamp; } - if (get_relative_dist(cm, ref_order_hint[ALTREF2_FRAME - LAST_FRAME], + if (get_relative_dist(order_hint_info, + ref_order_hint[ALTREF2_FRAME - LAST_FRAME], cur_order_hint) > 0) { if (motion_field_projection(cm, ALTREF2_FRAME, 0)) --ref_stamp; } - if (get_relative_dist(cm, ref_order_hint[ALTREF_FRAME - LAST_FRAME], + if 
(get_relative_dist(order_hint_info, + ref_order_hint[ALTREF_FRAME - LAST_FRAME], cur_order_hint) > 0 && ref_stamp >= 0) if (motion_field_projection(cm, ALTREF_FRAME, 0)) --ref_stamp; - if (ref_stamp >= 0 && ref_buf_idx[LAST2_FRAME - LAST_FRAME] >= 0) - if (motion_field_projection(cm, LAST2_FRAME, 2)) --ref_stamp; + if (ref_stamp >= 0) motion_field_projection(cm, LAST2_FRAME, 2); } -static INLINE void record_samples(MB_MODE_INFO *mbmi, int *pts, int *pts_inref, - int row_offset, int sign_r, int col_offset, - int sign_c) { +static INLINE void record_samples(const MB_MODE_INFO *mbmi, int *pts, + int *pts_inref, int row_offset, int sign_r, + int col_offset, int sign_c) { int bw = block_size_wide[mbmi->sb_type]; int bh = block_size_high[mbmi->sb_type]; int x = col_offset * MI_SIZE + sign_c * AOMMAX(bw, MI_SIZE) / 2 - 1; int y = row_offset * MI_SIZE + sign_r * AOMMAX(bh, MI_SIZE) / 2 - 1; - pts[0] = (x * 8); - pts[1] = (y * 8); - pts_inref[0] = (x * 8) + mbmi->mv[0].as_mv.col; - pts_inref[1] = (y * 8) + mbmi->mv[0].as_mv.row; + pts[0] = GET_MV_SUBPEL(x); + pts[1] = GET_MV_SUBPEL(y); + pts_inref[0] = GET_MV_SUBPEL(x) + mbmi->mv[0].as_mv.col; + pts_inref[1] = GET_MV_SUBPEL(y) + mbmi->mv[0].as_mv.row; } // Select samples according to the motion vector difference. -int selectSamples(MV *mv, int *pts, int *pts_inref, int len, BLOCK_SIZE bsize) { +uint8_t av1_selectSamples(MV *mv, int *pts, int *pts_inref, int len, + BLOCK_SIZE bsize) { const int bw = block_size_wide[bsize]; const int bh = block_size_high[bsize]; const int thresh = clamp(AOMMAX(bw, bh), 16, 112); int pts_mvd[SAMPLES_ARRAY_SIZE] = { 0 }; int i, j, k, l = len; - int ret = 0; + uint8_t ret = 0; assert(len <= LEAST_SQUARES_SAMPLES_MAX); // Obtain the motion vector difference. 
@@ -1128,30 +1110,32 @@ int selectSamples(MV *mv, int *pts, int *pts_inref, int len, BLOCK_SIZE bsize) { // Note: Samples returned are at 1/8-pel precision // Sample are the neighbor block center point's coordinates relative to the // left-top pixel of current block. -int findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col, - int *pts, int *pts_inref) { - MB_MODE_INFO *const mbmi0 = xd->mi[0]; - int ref_frame = mbmi0->ref_frame[0]; - int up_available = xd->up_available; - int left_available = xd->left_available; - int i, mi_step = 1, np = 0; - - const TileInfo *const tile = &xd->tile; +uint8_t av1_findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int *pts, + int *pts_inref) { + const MB_MODE_INFO *const mbmi0 = xd->mi[0]; + const int ref_frame = mbmi0->ref_frame[0]; + const int up_available = xd->up_available; + const int left_available = xd->left_available; + int i, mi_step; + uint8_t np = 0; int do_tl = 1; int do_tr = 1; + const int mi_stride = xd->mi_stride; + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; // scan the nearest above rows if (up_available) { - int mi_row_offset = -1; - MB_MODE_INFO *mbmi = xd->mi[mi_row_offset * xd->mi_stride]; - uint8_t n4_w = mi_size_wide[mbmi->sb_type]; + const int mi_row_offset = -1; + const MB_MODE_INFO *mbmi = xd->mi[mi_row_offset * mi_stride]; + uint8_t superblock_width = mi_size_wide[mbmi->sb_type]; - if (xd->n4_w <= n4_w) { + if (xd->width <= superblock_width) { // Handle "current block width <= above block width" case. 
- int col_offset = -mi_col % n4_w; + const int col_offset = -mi_col % superblock_width; if (col_offset < 0) do_tl = 0; - if (col_offset + n4_w > xd->n4_w) do_tr = 0; + if (col_offset + superblock_width > xd->width) do_tr = 0; if (mbmi->ref_frame[0] == ref_frame && mbmi->ref_frame[1] == NONE_FRAME) { record_samples(mbmi, pts, pts_inref, 0, -1, col_offset, 1); @@ -1162,11 +1146,11 @@ int findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col, } } else { // Handle "current block width > above block width" case. - for (i = 0; i < AOMMIN(xd->n4_w, cm->mi_cols - mi_col); i += mi_step) { - int mi_col_offset = i; - mbmi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride]; - n4_w = mi_size_wide[mbmi->sb_type]; - mi_step = AOMMIN(xd->n4_w, n4_w); + for (i = 0; i < AOMMIN(xd->width, cm->mi_params.mi_cols - mi_col); + i += mi_step) { + mbmi = xd->mi[i + mi_row_offset * mi_stride]; + superblock_width = mi_size_wide[mbmi->sb_type]; + mi_step = AOMMIN(xd->width, superblock_width); if (mbmi->ref_frame[0] == ref_frame && mbmi->ref_frame[1] == NONE_FRAME) { @@ -1183,14 +1167,13 @@ int findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col, // scan the nearest left columns if (left_available) { - int mi_col_offset = -1; - - MB_MODE_INFO *mbmi = xd->mi[mi_col_offset]; - uint8_t n4_h = mi_size_high[mbmi->sb_type]; + const int mi_col_offset = -1; + const MB_MODE_INFO *mbmi = xd->mi[mi_col_offset]; + uint8_t superblock_height = mi_size_high[mbmi->sb_type]; - if (xd->n4_h <= n4_h) { + if (xd->height <= superblock_height) { // Handle "current block height <= above block height" case. - int row_offset = -mi_row % n4_h; + const int row_offset = -mi_row % superblock_height; if (row_offset < 0) do_tl = 0; @@ -1203,11 +1186,11 @@ int findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col, } } else { // Handle "current block height > above block height" case. 
- for (i = 0; i < AOMMIN(xd->n4_h, cm->mi_rows - mi_row); i += mi_step) { - int mi_row_offset = i; - mbmi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride]; - n4_h = mi_size_high[mbmi->sb_type]; - mi_step = AOMMIN(xd->n4_h, n4_h); + for (i = 0; i < AOMMIN(xd->height, cm->mi_params.mi_rows - mi_row); + i += mi_step) { + mbmi = xd->mi[mi_col_offset + i * mi_stride]; + superblock_height = mi_size_high[mbmi->sb_type]; + mi_step = AOMMIN(xd->height, superblock_height); if (mbmi->ref_frame[0] == ref_frame && mbmi->ref_frame[1] == NONE_FRAME) { @@ -1224,10 +1207,9 @@ int findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col, // Top-left block if (do_tl && left_available && up_available) { - int mi_row_offset = -1; - int mi_col_offset = -1; - - MB_MODE_INFO *mbmi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride]; + const int mi_row_offset = -1; + const int mi_col_offset = -1; + MB_MODE_INFO *mbmi = xd->mi[mi_col_offset + mi_row_offset * mi_stride]; if (mbmi->ref_frame[0] == ref_frame && mbmi->ref_frame[1] == NONE_FRAME) { record_samples(mbmi, pts, pts_inref, 0, -1, 0, -1); @@ -1241,18 +1223,17 @@ int findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col, // Top-right block if (do_tr && - has_top_right(cm, xd, mi_row, mi_col, AOMMAX(xd->n4_w, xd->n4_h))) { - POSITION trb_pos = { -1, xd->n4_w }; - + has_top_right(cm, xd, mi_row, mi_col, AOMMAX(xd->width, xd->height))) { + const POSITION trb_pos = { -1, xd->width }; + const TileInfo *const tile = &xd->tile; if (is_inside(tile, mi_col, mi_row, &trb_pos)) { - int mi_row_offset = -1; - int mi_col_offset = xd->n4_w; - - MB_MODE_INFO *mbmi = - xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride]; + const int mi_row_offset = -1; + const int mi_col_offset = xd->width; + const MB_MODE_INFO *mbmi = + xd->mi[mi_col_offset + mi_row_offset * mi_stride]; if (mbmi->ref_frame[0] == ref_frame && mbmi->ref_frame[1] == NONE_FRAME) { - record_samples(mbmi, pts, pts_inref, 0, -1, xd->n4_w, 1); 
+ record_samples(mbmi, pts, pts_inref, 0, -1, xd->width, 1); np++; if (np >= LEAST_SQUARES_SAMPLES_MAX) return LEAST_SQUARES_SAMPLES_MAX; } @@ -1264,36 +1245,43 @@ int findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col, } void av1_setup_skip_mode_allowed(AV1_COMMON *cm) { - cm->is_skip_mode_allowed = 0; - cm->ref_frame_idx_0 = cm->ref_frame_idx_1 = INVALID_IDX; + const OrderHintInfo *const order_hint_info = &cm->seq_params.order_hint_info; + SkipModeInfo *const skip_mode_info = &cm->current_frame.skip_mode_info; - if (!cm->seq_params.enable_order_hint || frame_is_intra_only(cm) || - cm->reference_mode == SINGLE_REFERENCE) + skip_mode_info->skip_mode_allowed = 0; + skip_mode_info->ref_frame_idx_0 = INVALID_IDX; + skip_mode_info->ref_frame_idx_1 = INVALID_IDX; + + if (!order_hint_info->enable_order_hint || frame_is_intra_only(cm) || + cm->current_frame.reference_mode == SINGLE_REFERENCE) return; - RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs; - const int cur_frame_offset = cm->frame_offset; - int ref_frame_offset[2] = { -1, INT_MAX }; + const int cur_order_hint = cm->current_frame.order_hint; + int ref_order_hints[2] = { -1, INT_MAX }; int ref_idx[2] = { INVALID_IDX, INVALID_IDX }; // Identify the nearest forward and backward references. 
for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) { - const int buf_idx = cm->frame_refs[i].idx; - if (buf_idx == INVALID_IDX) continue; + const RefCntBuffer *const buf = get_ref_frame_buf(cm, LAST_FRAME + i); + if (buf == NULL) continue; - const int ref_offset = frame_bufs[buf_idx].cur_frame_offset; - if (get_relative_dist(cm, ref_offset, cur_frame_offset) < 0) { + const int ref_order_hint = buf->order_hint; + if (get_relative_dist(order_hint_info, ref_order_hint, cur_order_hint) < + 0) { // Forward reference - if (ref_frame_offset[0] == -1 || - get_relative_dist(cm, ref_offset, ref_frame_offset[0]) > 0) { - ref_frame_offset[0] = ref_offset; + if (ref_order_hints[0] == -1 || + get_relative_dist(order_hint_info, ref_order_hint, + ref_order_hints[0]) > 0) { + ref_order_hints[0] = ref_order_hint; ref_idx[0] = i; } - } else if (get_relative_dist(cm, ref_offset, cur_frame_offset) > 0) { + } else if (get_relative_dist(order_hint_info, ref_order_hint, + cur_order_hint) > 0) { // Backward reference - if (ref_frame_offset[1] == INT_MAX || - get_relative_dist(cm, ref_offset, ref_frame_offset[1]) < 0) { - ref_frame_offset[1] = ref_offset; + if (ref_order_hints[1] == INT_MAX || + get_relative_dist(order_hint_info, ref_order_hint, + ref_order_hints[1]) < 0) { + ref_order_hints[1] = ref_order_hint; ref_idx[1] = i; } } @@ -1301,75 +1289,71 @@ void av1_setup_skip_mode_allowed(AV1_COMMON *cm) { if (ref_idx[0] != INVALID_IDX && ref_idx[1] != INVALID_IDX) { // == Bi-directional prediction == - cm->is_skip_mode_allowed = 1; - cm->ref_frame_idx_0 = AOMMIN(ref_idx[0], ref_idx[1]); - cm->ref_frame_idx_1 = AOMMAX(ref_idx[0], ref_idx[1]); + skip_mode_info->skip_mode_allowed = 1; + skip_mode_info->ref_frame_idx_0 = AOMMIN(ref_idx[0], ref_idx[1]); + skip_mode_info->ref_frame_idx_1 = AOMMAX(ref_idx[0], ref_idx[1]); } else if (ref_idx[0] != INVALID_IDX && ref_idx[1] == INVALID_IDX) { // == Forward prediction only == // Identify the second nearest forward reference. 
- ref_frame_offset[1] = -1; + ref_order_hints[1] = -1; for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) { - const int buf_idx = cm->frame_refs[i].idx; - if (buf_idx == INVALID_IDX) continue; - - const int ref_offset = frame_bufs[buf_idx].cur_frame_offset; - if ((ref_frame_offset[0] != -1 && - get_relative_dist(cm, ref_offset, ref_frame_offset[0]) < 0) && - (ref_frame_offset[1] == -1 || - get_relative_dist(cm, ref_offset, ref_frame_offset[1]) > 0)) { + const RefCntBuffer *const buf = get_ref_frame_buf(cm, LAST_FRAME + i); + if (buf == NULL) continue; + + const int ref_order_hint = buf->order_hint; + if ((ref_order_hints[0] != -1 && + get_relative_dist(order_hint_info, ref_order_hint, + ref_order_hints[0]) < 0) && + (ref_order_hints[1] == -1 || + get_relative_dist(order_hint_info, ref_order_hint, + ref_order_hints[1]) > 0)) { // Second closest forward reference - ref_frame_offset[1] = ref_offset; + ref_order_hints[1] = ref_order_hint; ref_idx[1] = i; } } - if (ref_frame_offset[1] != -1) { - cm->is_skip_mode_allowed = 1; - cm->ref_frame_idx_0 = AOMMIN(ref_idx[0], ref_idx[1]); - cm->ref_frame_idx_1 = AOMMAX(ref_idx[0], ref_idx[1]); + if (ref_order_hints[1] != -1) { + skip_mode_info->skip_mode_allowed = 1; + skip_mode_info->ref_frame_idx_0 = AOMMIN(ref_idx[0], ref_idx[1]); + skip_mode_info->ref_frame_idx_1 = AOMMAX(ref_idx[0], ref_idx[1]); } } } typedef struct { - int map_idx; // frame map index - int buf_idx; // frame buffer index - int sort_idx; // index based on the offset to be used for sorting + int map_idx; // frame map index + RefCntBuffer *buf; // frame buffer + int sort_idx; // index based on the offset to be used for sorting } REF_FRAME_INFO; +// Compares the sort_idx fields. If they are equal, then compares the map_idx +// fields to break the tie. This ensures a stable sort. 
static int compare_ref_frame_info(const void *arg_a, const void *arg_b) { const REF_FRAME_INFO *info_a = (REF_FRAME_INFO *)arg_a; const REF_FRAME_INFO *info_b = (REF_FRAME_INFO *)arg_b; - if (info_a->sort_idx < info_b->sort_idx) return -1; - if (info_a->sort_idx > info_b->sort_idx) return 1; - return (info_a->map_idx < info_b->map_idx) - ? -1 - : ((info_a->map_idx > info_b->map_idx) ? 1 : 0); + const int sort_idx_diff = info_a->sort_idx - info_b->sort_idx; + if (sort_idx_diff != 0) return sort_idx_diff; + return info_a->map_idx - info_b->map_idx; } -static void set_ref_frame_info(AV1_COMMON *const cm, int frame_idx, - REF_FRAME_INFO *ref_info) { +static AOM_INLINE void set_ref_frame_info(int *remapped_ref_idx, int frame_idx, + REF_FRAME_INFO *ref_info) { assert(frame_idx >= 0 && frame_idx < INTER_REFS_PER_FRAME); - const int buf_idx = ref_info->buf_idx; - - cm->frame_refs[frame_idx].idx = buf_idx; - cm->frame_refs[frame_idx].buf = &cm->buffer_pool->frame_bufs[buf_idx].buf; - cm->frame_refs[frame_idx].map_idx = ref_info->map_idx; + remapped_ref_idx[frame_idx] = ref_info->map_idx; } -void av1_set_frame_refs(AV1_COMMON *const cm, int lst_map_idx, - int gld_map_idx) { - BufferPool *const pool = cm->buffer_pool; - RefCntBuffer *const frame_bufs = pool->frame_bufs; - +void av1_set_frame_refs(AV1_COMMON *const cm, int *remapped_ref_idx, + int lst_map_idx, int gld_map_idx) { int lst_frame_sort_idx = -1; int gld_frame_sort_idx = -1; - assert(cm->seq_params.enable_order_hint); - assert(cm->seq_params.order_hint_bits_minus_1 >= 0); - const int cur_frame_offset = (int)cm->frame_offset; - const int cur_frame_sort_idx = 1 << cm->seq_params.order_hint_bits_minus_1; + assert(cm->seq_params.order_hint_info.enable_order_hint); + assert(cm->seq_params.order_hint_info.order_hint_bits_minus_1 >= 0); + const int cur_order_hint = (int)cm->current_frame.order_hint; + const int cur_frame_sort_idx = + 1 << cm->seq_params.order_hint_info.order_hint_bits_minus_1; REF_FRAME_INFO 
ref_frame_info[REF_FRAMES]; int ref_flag_list[INTER_REFS_PER_FRAME] = { 0, 0, 0, 0, 0, 0, 0 }; @@ -1380,18 +1364,19 @@ void av1_set_frame_refs(AV1_COMMON *const cm, int lst_map_idx, ref_frame_info[i].map_idx = map_idx; ref_frame_info[i].sort_idx = -1; - const int buf_idx = cm->ref_frame_map[map_idx]; - ref_frame_info[i].buf_idx = buf_idx; + RefCntBuffer *const buf = cm->ref_frame_map[map_idx]; + ref_frame_info[i].buf = buf; - if (buf_idx < 0 || buf_idx >= FRAME_BUFFERS) continue; - // TODO(zoeliu@google.com): To verify the checking on ref_count. - if (frame_bufs[buf_idx].ref_count <= 0) continue; + if (buf == NULL) continue; + // If this assertion fails, there is a reference leak. + assert(buf->ref_count > 0); - const int offset = (int)frame_bufs[buf_idx].cur_frame_offset; + const int offset = (int)buf->order_hint; ref_frame_info[i].sort_idx = (offset == -1) ? -1 : cur_frame_sort_idx + - get_relative_dist(cm, offset, cur_frame_offset); + get_relative_dist(&cm->seq_params.order_hint_info, + offset, cur_order_hint); assert(ref_frame_info[i].sort_idx >= -1); if (map_idx == lst_map_idx) lst_frame_sort_idx = ref_frame_info[i].sort_idx; @@ -1414,8 +1399,8 @@ void av1_set_frame_refs(AV1_COMMON *const cm, int lst_map_idx, compare_ref_frame_info); // Identify forward and backward reference frames. 
- // Forward reference: offset < cur_frame_offset - // Backward reference: offset >= cur_frame_offset + // Forward reference: offset < order_hint + // Backward reference: offset >= order_hint int fwd_start_idx = 0, fwd_end_idx = REF_FRAMES - 1; for (int i = 0; i < REF_FRAMES; i++) { @@ -1437,7 +1422,7 @@ void av1_set_frame_refs(AV1_COMMON *const cm, int lst_map_idx, // == ALTREF_FRAME == if (bwd_start_idx <= bwd_end_idx) { - set_ref_frame_info(cm, ALTREF_FRAME - LAST_FRAME, + set_ref_frame_info(remapped_ref_idx, ALTREF_FRAME - LAST_FRAME, &ref_frame_info[bwd_end_idx]); ref_flag_list[ALTREF_FRAME - LAST_FRAME] = 1; bwd_end_idx--; @@ -1445,7 +1430,7 @@ void av1_set_frame_refs(AV1_COMMON *const cm, int lst_map_idx, // == BWDREF_FRAME == if (bwd_start_idx <= bwd_end_idx) { - set_ref_frame_info(cm, BWDREF_FRAME - LAST_FRAME, + set_ref_frame_info(remapped_ref_idx, BWDREF_FRAME - LAST_FRAME, &ref_frame_info[bwd_start_idx]); ref_flag_list[BWDREF_FRAME - LAST_FRAME] = 1; bwd_start_idx++; @@ -1453,7 +1438,7 @@ void av1_set_frame_refs(AV1_COMMON *const cm, int lst_map_idx, // == ALTREF2_FRAME == if (bwd_start_idx <= bwd_end_idx) { - set_ref_frame_info(cm, ALTREF2_FRAME - LAST_FRAME, + set_ref_frame_info(remapped_ref_idx, ALTREF2_FRAME - LAST_FRAME, &ref_frame_info[bwd_start_idx]); ref_flag_list[ALTREF2_FRAME - LAST_FRAME] = 1; } @@ -1463,13 +1448,15 @@ void av1_set_frame_refs(AV1_COMMON *const cm, int lst_map_idx, for (int i = fwd_start_idx; i <= fwd_end_idx; ++i) { // == LAST_FRAME == if (ref_frame_info[i].map_idx == lst_map_idx) { - set_ref_frame_info(cm, LAST_FRAME - LAST_FRAME, &ref_frame_info[i]); + set_ref_frame_info(remapped_ref_idx, LAST_FRAME - LAST_FRAME, + &ref_frame_info[i]); ref_flag_list[LAST_FRAME - LAST_FRAME] = 1; } // == GOLDEN_FRAME == if (ref_frame_info[i].map_idx == gld_map_idx) { - set_ref_frame_info(cm, GOLDEN_FRAME - LAST_FRAME, &ref_frame_info[i]); + set_ref_frame_info(remapped_ref_idx, GOLDEN_FRAME - LAST_FRAME, + &ref_frame_info[i]); 
ref_flag_list[GOLDEN_FRAME - LAST_FRAME] = 1; } } @@ -1501,18 +1488,19 @@ void av1_set_frame_refs(AV1_COMMON *const cm, int lst_map_idx, } if (fwd_start_idx > fwd_end_idx) break; - set_ref_frame_info(cm, ref_frame - LAST_FRAME, + set_ref_frame_info(remapped_ref_idx, ref_frame - LAST_FRAME, &ref_frame_info[fwd_end_idx]); ref_flag_list[ref_frame - LAST_FRAME] = 1; fwd_end_idx--; } - // Assign all the remaining frame(s), if any, to the earliest reference frame. + // Assign all the remaining frame(s), if any, to the earliest reference + // frame. for (; ref_idx < (INTER_REFS_PER_FRAME - 2); ref_idx++) { const MV_REFERENCE_FRAME ref_frame = ref_frame_list[ref_idx]; if (ref_flag_list[ref_frame - LAST_FRAME] == 1) continue; - set_ref_frame_info(cm, ref_frame - LAST_FRAME, + set_ref_frame_info(remapped_ref_idx, ref_frame - LAST_FRAME, &ref_frame_info[fwd_start_idx]); ref_flag_list[ref_frame - LAST_FRAME] = 1; } diff --git a/media/libaom/src/av1/common/mvref_common.h b/media/libaom/src/av1/common/mvref_common.h index 83f7a1ac0..05a0dbc04 100644 --- a/media/libaom/src/av1/common/mvref_common.h +++ b/media/libaom/src/av1/common/mvref_common.h @@ -11,7 +11,7 @@ #ifndef AOM_AV1_COMMON_MVREF_COMMON_H_ #define AOM_AV1_COMMON_MVREF_COMMON_H_ -#include "av1/common/onyxc_int.h" +#include "av1/common/av1_common_int.h" #include "av1/common/blockd.h" #ifdef __cplusplus @@ -34,10 +34,10 @@ typedef struct position { // clamp_mv_ref #define MV_BORDER (16 << 3) // Allow 16 pels in 1/8th pel units -static INLINE int get_relative_dist(const AV1_COMMON *cm, int a, int b) { - if (!cm->seq_params.enable_order_hint) return 0; +static INLINE int get_relative_dist(const OrderHintInfo *oh, int a, int b) { + if (!oh->enable_order_hint) return 0; - const int bits = cm->seq_params.order_hint_bits_minus_1 + 1; + const int bits = oh->order_hint_bits_minus_1 + 1; assert(bits >= 1); assert(a >= 0 && a < (1 << bits)); @@ -50,38 +50,19 @@ static INLINE int get_relative_dist(const AV1_COMMON *cm, int a, int 
b) { } static INLINE void clamp_mv_ref(MV *mv, int bw, int bh, const MACROBLOCKD *xd) { - clamp_mv(mv, xd->mb_to_left_edge - bw * 8 - MV_BORDER, - xd->mb_to_right_edge + bw * 8 + MV_BORDER, - xd->mb_to_top_edge - bh * 8 - MV_BORDER, - xd->mb_to_bottom_edge + bh * 8 + MV_BORDER); + const SubpelMvLimits mv_limits = { + xd->mb_to_left_edge - GET_MV_SUBPEL(bw) - MV_BORDER, + xd->mb_to_right_edge + GET_MV_SUBPEL(bw) + MV_BORDER, + xd->mb_to_top_edge - GET_MV_SUBPEL(bh) - MV_BORDER, + xd->mb_to_bottom_edge + GET_MV_SUBPEL(bh) + MV_BORDER + }; + clamp_mv(mv, &mv_limits); } -// This function returns either the appropriate sub block or block's mv -// on whether the block_size < 8x8 and we have check_sub_blocks set. -static INLINE int_mv get_sub_block_mv(const MB_MODE_INFO *candidate, - int which_mv, int search_col) { - (void)search_col; +static INLINE int_mv get_block_mv(const MB_MODE_INFO *candidate, int which_mv) { return candidate->mv[which_mv]; } -static INLINE int_mv get_sub_block_pred_mv(const MB_MODE_INFO *candidate, - int which_mv, int search_col) { - (void)search_col; - return candidate->mv[which_mv]; -} - -// Performs mv sign inversion if indicated by the reference frame combination. -static INLINE int_mv scale_mv(const MB_MODE_INFO *mbmi, int ref, - const MV_REFERENCE_FRAME this_ref_frame, - const int *ref_sign_bias) { - int_mv mv = mbmi->mv[ref]; - if (ref_sign_bias[mbmi->ref_frame[ref]] != ref_sign_bias[this_ref_frame]) { - mv.as_mv.row *= -1; - mv.as_mv.col *= -1; - } - return mv; -} - // Checks that the given mi_row, mi_col and search point // are inside the borders of the tile. 
static INLINE int is_inside(const TileInfo *const tile, int mi_col, int mi_row, @@ -169,14 +150,14 @@ static MV_REFERENCE_FRAME ref_frame_map[TOTAL_COMP_REFS][2] = { // clang-format on static INLINE void av1_set_ref_frame(MV_REFERENCE_FRAME *rf, - int8_t ref_frame_type) { + MV_REFERENCE_FRAME ref_frame_type) { if (ref_frame_type >= REF_FRAMES) { rf[0] = ref_frame_map[ref_frame_type - REF_FRAMES][0]; rf[1] = ref_frame_map[ref_frame_type - REF_FRAMES][1]; } else { + assert(ref_frame_type > NONE_FRAME); rf[0] = ref_frame_type; rf[1] = NONE_FRAME; - assert(ref_frame_type > NONE_FRAME); } } @@ -201,18 +182,17 @@ static INLINE int16_t av1_mode_context_analyzer( return comp_ctx; } -static INLINE uint8_t av1_drl_ctx(const CANDIDATE_MV *ref_mv_stack, - int ref_idx) { - if (ref_mv_stack[ref_idx].weight >= REF_CAT_LEVEL && - ref_mv_stack[ref_idx + 1].weight >= REF_CAT_LEVEL) +static INLINE uint8_t av1_drl_ctx(const uint16_t *ref_mv_weight, int ref_idx) { + if (ref_mv_weight[ref_idx] >= REF_CAT_LEVEL && + ref_mv_weight[ref_idx + 1] >= REF_CAT_LEVEL) return 0; - if (ref_mv_stack[ref_idx].weight >= REF_CAT_LEVEL && - ref_mv_stack[ref_idx + 1].weight < REF_CAT_LEVEL) + if (ref_mv_weight[ref_idx] >= REF_CAT_LEVEL && + ref_mv_weight[ref_idx + 1] < REF_CAT_LEVEL) return 1; - if (ref_mv_stack[ref_idx].weight < REF_CAT_LEVEL && - ref_mv_stack[ref_idx + 1].weight < REF_CAT_LEVEL) + if (ref_mv_weight[ref_idx] < REF_CAT_LEVEL && + ref_mv_weight[ref_idx + 1] < REF_CAT_LEVEL) return 2; return 0; @@ -222,7 +202,8 @@ void av1_setup_frame_buf_refs(AV1_COMMON *cm); void av1_setup_frame_sign_bias(AV1_COMMON *cm); void av1_setup_skip_mode_allowed(AV1_COMMON *cm); void av1_setup_motion_field(AV1_COMMON *cm); -void av1_set_frame_refs(AV1_COMMON *const cm, int lst_map_idx, int gld_map_idx); +void av1_set_frame_refs(AV1_COMMON *const cm, int *remapped_ref_idx, + int lst_map_idx, int gld_map_idx); static INLINE void av1_collect_neighbors_ref_counts(MACROBLOCKD *const xd) { 
av1_zero(xd->neighbors_ref_counts); @@ -255,13 +236,16 @@ void av1_copy_frame_mvs(const AV1_COMMON *const cm, const MB_MODE_INFO *const mi, int mi_row, int mi_col, int x_mis, int y_mis); +// The global_mvs output parameter points to an array of REF_FRAMES elements. +// The caller may pass a null global_mvs if it does not need the global_mvs +// output. void av1_find_mv_refs(const AV1_COMMON *cm, const MACROBLOCKD *xd, MB_MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame, uint8_t ref_mv_count[MODE_CTX_REF_FRAMES], CANDIDATE_MV ref_mv_stack[][MAX_REF_MV_STACK_SIZE], + uint16_t ref_mv_weight[][MAX_REF_MV_STACK_SIZE], int_mv mv_ref_list[][MAX_MV_REF_CANDIDATES], - int_mv *global_mvs, int mi_row, int mi_col, - int16_t *mode_context); + int_mv *global_mvs, int16_t *mode_context); // check a list of motion vectors by sad score using a number rows of pixels // above and a number cols of pixels in the left to select the one with best @@ -269,25 +253,24 @@ void av1_find_mv_refs(const AV1_COMMON *cm, const MACROBLOCKD *xd, void av1_find_best_ref_mvs(int allow_hp, int_mv *mvlist, int_mv *nearest_mv, int_mv *near_mv, int is_integer); -int selectSamples(MV *mv, int *pts, int *pts_inref, int len, BLOCK_SIZE bsize); -int findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col, - int *pts, int *pts_inref); +uint8_t av1_selectSamples(MV *mv, int *pts, int *pts_inref, int len, + BLOCK_SIZE bsize); +uint8_t av1_findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int *pts, + int *pts_inref); #define INTRABC_DELAY_PIXELS 256 // Delay of 256 pixels #define INTRABC_DELAY_SB64 (INTRABC_DELAY_PIXELS / 64) static INLINE void av1_find_ref_dv(int_mv *ref_dv, const TileInfo *const tile, - int mib_size, int mi_row, int mi_col) { - (void)mi_col; + int mib_size, int mi_row) { if (mi_row - mib_size < tile->mi_row_start) { - ref_dv->as_mv.row = 0; - ref_dv->as_mv.col = -MI_SIZE * mib_size - INTRABC_DELAY_PIXELS; + ref_dv->as_fullmv.row = 0; + ref_dv->as_fullmv.col = -MI_SIZE * mib_size - 
INTRABC_DELAY_PIXELS; } else { - ref_dv->as_mv.row = -MI_SIZE * mib_size; - ref_dv->as_mv.col = 0; + ref_dv->as_fullmv.row = -MI_SIZE * mib_size; + ref_dv->as_fullmv.col = 0; } - ref_dv->as_mv.row *= 8; - ref_dv->as_mv.col *= 8; + convert_fullmv_to_mv(ref_dv); } static INLINE int av1_is_dv_valid(const MV dv, const AV1_COMMON *cm, @@ -319,15 +302,12 @@ static INLINE int av1_is_dv_valid(const MV dv, const AV1_COMMON *cm, // Special case for sub 8x8 chroma cases, to prevent referring to chroma // pixels outside current tile. - for (int plane = 1; plane < av1_num_planes(cm); ++plane) { - const struct macroblockd_plane *const pd = &xd->plane[plane]; - if (is_chroma_reference(mi_row, mi_col, bsize, pd->subsampling_x, - pd->subsampling_y)) { - if (bw < 8 && pd->subsampling_x) - if (src_left_edge < tile_left_edge + 4 * SCALE_PX_TO_MV) return 0; - if (bh < 8 && pd->subsampling_y) - if (src_top_edge < tile_top_edge + 4 * SCALE_PX_TO_MV) return 0; - } + if (xd->is_chroma_ref && av1_num_planes(cm) > 1) { + const struct macroblockd_plane *const pd = &xd->plane[1]; + if (bw < 8 && pd->subsampling_x) + if (src_left_edge < tile_left_edge + 4 * SCALE_PX_TO_MV) return 0; + if (bh < 8 && pd->subsampling_y) + if (src_top_edge < tile_top_edge + 4 * SCALE_PX_TO_MV) return 0; } // Is the bottom right within an already coded SB? 
Also consider additional diff --git a/media/libaom/src/av1/common/obmc.h b/media/libaom/src/av1/common/obmc.h index 1c90cd93f..cc97b6bb1 100644 --- a/media/libaom/src/av1/common/obmc.h +++ b/media/libaom/src/av1/common/obmc.h @@ -12,25 +12,24 @@ #ifndef AOM_AV1_COMMON_OBMC_H_ #define AOM_AV1_COMMON_OBMC_H_ -typedef void (*overlappable_nb_visitor_t)(MACROBLOCKD *xd, int rel_mi_pos, - uint8_t nb_mi_size, - MB_MODE_INFO *nb_mi, void *fun_ctxt, - const int num_planes); +typedef void (*overlappable_nb_visitor_t)(MACROBLOCKD *xd, int rel_mi_row, + int rel_mi_col, uint8_t op_mi_size, + int dir, MB_MODE_INFO *nb_mi, + void *fun_ctxt, const int num_planes); static INLINE void foreach_overlappable_nb_above(const AV1_COMMON *cm, - MACROBLOCKD *xd, int mi_col, - int nb_max, + MACROBLOCKD *xd, int nb_max, overlappable_nb_visitor_t fun, void *fun_ctxt) { - const int num_planes = av1_num_planes(cm); if (!xd->up_available) return; + const int num_planes = av1_num_planes(cm); int nb_count = 0; - + const int mi_col = xd->mi_col; // prev_row_mi points into the mi array, starting at the beginning of the // previous row. 
MB_MODE_INFO **prev_row_mi = xd->mi - mi_col - 1 * xd->mi_stride; - const int end_col = AOMMIN(mi_col + xd->n4_w, cm->mi_cols); + const int end_col = AOMMIN(mi_col + xd->width, cm->mi_params.mi_cols); uint8_t mi_step; for (int above_mi_col = mi_col; above_mi_col < end_col && nb_count < nb_max; above_mi_col += mi_step) { @@ -49,26 +48,25 @@ static INLINE void foreach_overlappable_nb_above(const AV1_COMMON *cm, } if (is_neighbor_overlappable(*above_mi)) { ++nb_count; - fun(xd, above_mi_col - mi_col, AOMMIN(xd->n4_w, mi_step), *above_mi, - fun_ctxt, num_planes); + fun(xd, 0, above_mi_col - mi_col, AOMMIN(xd->width, mi_step), 0, + *above_mi, fun_ctxt, num_planes); } } } static INLINE void foreach_overlappable_nb_left(const AV1_COMMON *cm, - MACROBLOCKD *xd, int mi_row, - int nb_max, + MACROBLOCKD *xd, int nb_max, overlappable_nb_visitor_t fun, void *fun_ctxt) { - const int num_planes = av1_num_planes(cm); if (!xd->left_available) return; + const int num_planes = av1_num_planes(cm); int nb_count = 0; - // prev_col_mi points into the mi array, starting at the top of the // previous column + const int mi_row = xd->mi_row; MB_MODE_INFO **prev_col_mi = xd->mi - 1 - mi_row * xd->mi_stride; - const int end_row = AOMMIN(mi_row + xd->n4_h, cm->mi_rows); + const int end_row = AOMMIN(mi_row + xd->height, cm->mi_params.mi_rows); uint8_t mi_step; for (int left_mi_row = mi_row; left_mi_row < end_row && nb_count < nb_max; left_mi_row += mi_step) { @@ -82,7 +80,7 @@ static INLINE void foreach_overlappable_nb_left(const AV1_COMMON *cm, } if (is_neighbor_overlappable(*left_mi)) { ++nb_count; - fun(xd, left_mi_row - mi_row, AOMMIN(xd->n4_h, mi_step), *left_mi, + fun(xd, left_mi_row - mi_row, 0, AOMMIN(xd->height, mi_step), 1, *left_mi, fun_ctxt, num_planes); } } diff --git a/media/libaom/src/av1/common/obu_util.c b/media/libaom/src/av1/common/obu_util.c index 823b700b1..7d2694b89 100644 --- a/media/libaom/src/av1/common/obu_util.c +++ b/media/libaom/src/av1/common/obu_util.c @@ -8,6 +8,8 
@@ * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ +#include <assert.h> + #include "av1/common/obu_util.h" #include "aom_dsp/bitreader_buffer.h" @@ -112,36 +114,41 @@ aom_codec_err_t aom_read_obu_header_and_size(const uint8_t *data, ObuHeader *obu_header, size_t *const payload_size, size_t *const bytes_read) { - size_t length_field_size = 0, obu_size = 0; + size_t length_field_size_obu = 0; + size_t length_field_size_payload = 0; + size_t obu_size = 0; aom_codec_err_t status; if (is_annexb) { // Size field comes before the OBU header, and includes the OBU header status = - read_obu_size(data, bytes_available, &obu_size, &length_field_size); + read_obu_size(data, bytes_available, &obu_size, &length_field_size_obu); if (status != AOM_CODEC_OK) return status; } - struct aom_read_bit_buffer rb = { data + length_field_size, + struct aom_read_bit_buffer rb = { data + length_field_size_obu, data + bytes_available, 0, NULL, NULL }; status = read_obu_header(&rb, is_annexb, obu_header); if (status != AOM_CODEC_OK) return status; - if (is_annexb) { + if (!obu_header->has_size_field) { + assert(is_annexb); // Derive the payload size from the data we've already read if (obu_size < obu_header->size) return AOM_CODEC_CORRUPT_FRAME; *payload_size = obu_size - obu_header->size; } else { // Size field comes after the OBU header, and is just the payload size - status = read_obu_size(data + obu_header->size, - bytes_available - obu_header->size, payload_size, - &length_field_size); + status = read_obu_size( + data + length_field_size_obu + obu_header->size, + bytes_available - length_field_size_obu - obu_header->size, + payload_size, &length_field_size_payload); if (status != AOM_CODEC_OK) return status; } - *bytes_read = length_field_size + obu_header->size; + *bytes_read = + length_field_size_obu + obu_header->size + length_field_size_payload; return AOM_CODEC_OK; } diff --git 
a/media/libaom/src/av1/common/onyxc_int.h b/media/libaom/src/av1/common/onyxc_int.h deleted file mode 100644 index ff011c89e..000000000 --- a/media/libaom/src/av1/common/onyxc_int.h +++ /dev/null @@ -1,1342 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#ifndef AOM_AV1_COMMON_ONYXC_INT_H_ -#define AOM_AV1_COMMON_ONYXC_INT_H_ - -#include "config/aom_config.h" -#include "config/av1_rtcd.h" - -#include "aom/internal/aom_codec_internal.h" -#include "aom_util/aom_thread.h" -#include "av1/common/alloccommon.h" -#include "av1/common/av1_loopfilter.h" -#include "av1/common/entropy.h" -#include "av1/common/entropymode.h" -#include "av1/common/entropymv.h" -#include "av1/common/enums.h" -#include "av1/common/frame_buffers.h" -#include "av1/common/mv.h" -#include "av1/common/quant_common.h" -#include "av1/common/restoration.h" -#include "av1/common/tile_common.h" -#include "av1/common/timing.h" -#include "av1/common/odintrin.h" -#include "av1/encoder/hash_motion.h" -#include "aom_dsp/grain_synthesis.h" -#include "aom_dsp/grain_table.h" -#ifdef __cplusplus -extern "C" { -#endif - -#if defined(__clang__) && defined(__has_warning) -#if __has_feature(cxx_attributes) && __has_warning("-Wimplicit-fallthrough") -#define AOM_FALLTHROUGH_INTENDED [[clang::fallthrough]] // NOLINT -#endif -#elif defined(__GNUC__) && __GNUC__ >= 7 -#define AOM_FALLTHROUGH_INTENDED __attribute__((fallthrough)) // NOLINT -#endif - -#ifndef AOM_FALLTHROUGH_INTENDED -#define AOM_FALLTHROUGH_INTENDED \ - do { \ - } 
while (0) -#endif - -#define CDEF_MAX_STRENGTHS 16 - -/* Constant values while waiting for the sequence header */ -#define FRAME_ID_LENGTH 15 -#define DELTA_FRAME_ID_LENGTH 14 - -#define FRAME_CONTEXTS (FRAME_BUFFERS + 1) -// Extra frame context which is always kept at default values -#define FRAME_CONTEXT_DEFAULTS (FRAME_CONTEXTS - 1) -#define PRIMARY_REF_BITS 3 -#define PRIMARY_REF_NONE 7 - -#define NUM_PING_PONG_BUFFERS 2 - -#define MAX_NUM_TEMPORAL_LAYERS 8 -#define MAX_NUM_SPATIAL_LAYERS 4 -/* clang-format off */ -// clang-format seems to think this is a pointer dereference and not a -// multiplication. -#define MAX_NUM_OPERATING_POINTS \ - MAX_NUM_TEMPORAL_LAYERS * MAX_NUM_SPATIAL_LAYERS -/* clang-format on*/ - -// TODO(jingning): Turning this on to set up transform coefficient -// processing timer. -#define TXCOEFF_TIMER 0 -#define TXCOEFF_COST_TIMER 0 - -typedef enum { - SINGLE_REFERENCE = 0, - COMPOUND_REFERENCE = 1, - REFERENCE_MODE_SELECT = 2, - REFERENCE_MODES = 3, -} REFERENCE_MODE; - -typedef enum { - /** - * Frame context updates are disabled - */ - REFRESH_FRAME_CONTEXT_DISABLED, - /** - * Update frame context to values resulting from backward probability - * updates based on entropy/counts in the decoded frame - */ - REFRESH_FRAME_CONTEXT_BACKWARD, -} REFRESH_FRAME_CONTEXT_MODE; - -#define MFMV_STACK_SIZE 3 -typedef struct { - int_mv mfmv0; - uint8_t ref_frame_offset; -} TPL_MV_REF; - -typedef struct { - int_mv mv; - MV_REFERENCE_FRAME ref_frame; -} MV_REF; - -typedef struct { - int ref_count; - - unsigned int cur_frame_offset; - unsigned int ref_frame_offset[INTER_REFS_PER_FRAME]; - - MV_REF *mvs; - uint8_t *seg_map; - struct segmentation seg; - int mi_rows; - int mi_cols; - // Width and height give the size of the buffer (before any upscaling, unlike - // the sizes that can be derived from the buf structure) - int width; - int height; - WarpedMotionParams global_motion[REF_FRAMES]; - int showable_frame; // frame can be used as show existing frame 
in future - int film_grain_params_present; - aom_film_grain_t film_grain_params; - aom_codec_frame_buffer_t raw_frame_buffer; - YV12_BUFFER_CONFIG buf; - hash_table hash_table; - uint8_t intra_only; - FRAME_TYPE frame_type; - // The Following variables will only be used in frame parallel decode. - - // frame_worker_owner indicates which FrameWorker owns this buffer. NULL means - // that no FrameWorker owns, or is decoding, this buffer. - AVxWorker *frame_worker_owner; - - // row and col indicate which position frame has been decoded to in real - // pixel unit. They are reset to -1 when decoding begins and set to INT_MAX - // when the frame is fully decoded. - int row; - int col; - - // Inter frame reference frame delta for loop filter - int8_t ref_deltas[REF_FRAMES]; - - // 0 = ZERO_MV, MV - int8_t mode_deltas[MAX_MODE_LF_DELTAS]; -} RefCntBuffer; - -typedef struct BufferPool { -// Protect BufferPool from being accessed by several FrameWorkers at -// the same time during frame parallel decode. -// TODO(hkuang): Try to use atomic variable instead of locking the whole pool. -#if CONFIG_MULTITHREAD - pthread_mutex_t pool_mutex; -#endif - - // Private data associated with the frame buffer callbacks. - void *cb_priv; - - aom_get_frame_buffer_cb_fn_t get_fb_cb; - aom_release_frame_buffer_cb_fn_t release_fb_cb; - - RefCntBuffer frame_bufs[FRAME_BUFFERS]; - - // Frame buffers allocated internally by the codec. - InternalFrameBufferList int_frame_buffers; -} BufferPool; - -typedef struct { - int base_ctx_table[2 /*row*/][2 /*col*/][3 /*sig_map*/] - [BASE_CONTEXT_POSITION_NUM + 1]; -} LV_MAP_CTX_TABLE; -typedef int BASE_CTX_TABLE[2 /*col*/][3 /*sig_map*/] - [BASE_CONTEXT_POSITION_NUM + 1]; - -typedef struct BitstreamLevel { - uint8_t major; - uint8_t minor; -} BitstreamLevel; - -// Sequence header structure. 
-// Note: All syntax elements of sequence_header_obu that need to be -// bit-identical across multiple sequence headers must be part of this struct, -// so that consistency is checked by are_seq_headers_consistent() function. -typedef struct SequenceHeader { - int num_bits_width; - int num_bits_height; - int max_frame_width; - int max_frame_height; - int frame_id_numbers_present_flag; - int frame_id_length; - int delta_frame_id_length; - BLOCK_SIZE sb_size; // Size of the superblock used for this frame - int mib_size; // Size of the superblock in units of MI blocks - int mib_size_log2; // Log 2 of above. - int order_hint_bits_minus_1; - int force_screen_content_tools; // 0 - force off - // 1 - force on - // 2 - adaptive - int force_integer_mv; // 0 - Not to force. MV can be in 1/4 or 1/8 - // 1 - force to integer - // 2 - adaptive - int still_picture; // Video is a single frame still picture - int reduced_still_picture_hdr; // Use reduced header for still picture - int enable_filter_intra; // enables/disables filterintra - int enable_intra_edge_filter; // enables/disables corner/edge/upsampling - int enable_interintra_compound; // enables/disables interintra_compound - int enable_masked_compound; // enables/disables masked compound - int enable_dual_filter; // 0 - disable dual interpolation filter - // 1 - enable vert/horiz filter selection - int enable_order_hint; // 0 - disable order hint, and related tools - // jnt_comp, ref_frame_mvs, frame_sign_bias - // if 0, enable_jnt_comp and - // enable_ref_frame_mvs must be set zs 0. - int enable_jnt_comp; // 0 - disable joint compound modes - // 1 - enable it - int enable_ref_frame_mvs; // 0 - disable ref frame mvs - // 1 - enable it - int enable_warped_motion; // 0 - disable warped motion for sequence - // 1 - enable it for the sequence - int enable_superres; // 0 - Disable superres for the sequence, and disable - // transmitting per-frame superres enabled flag. 
- // 1 - Enable superres for the sequence, and also - // enable per-frame flag to denote if superres is - // enabled for that frame. - int enable_cdef; // To turn on/off CDEF - int enable_restoration; // To turn on/off loop restoration - BITSTREAM_PROFILE profile; - - // Operating point info. - int operating_points_cnt_minus_1; - int operating_point_idc[MAX_NUM_OPERATING_POINTS]; - int display_model_info_present_flag; - int decoder_model_info_present_flag; - BitstreamLevel level[MAX_NUM_OPERATING_POINTS]; - uint8_t tier[MAX_NUM_OPERATING_POINTS]; // seq_tier in the spec. One bit: 0 - // or 1. - - // Color config. - aom_bit_depth_t bit_depth; // AOM_BITS_8 in profile 0 or 1, - // AOM_BITS_10 or AOM_BITS_12 in profile 2 or 3. - int use_highbitdepth; // If true, we need to use 16bit frame buffers. - int monochrome; // Monochorme video - aom_color_primaries_t color_primaries; - aom_transfer_characteristics_t transfer_characteristics; - aom_matrix_coefficients_t matrix_coefficients; - int color_range; - int subsampling_x; // Chroma subsampling for x - int subsampling_y; // Chroma subsampling for y - aom_chroma_sample_position_t chroma_sample_position; - int separate_uv_delta_q; - - int film_grain_params_present; -} SequenceHeader; - -typedef struct AV1Common { - struct aom_internal_error_info error; - int width; - int height; - int render_width; - int render_height; - int last_width; - int last_height; - int timing_info_present; - aom_timing_info_t timing_info; - int buffer_removal_time_present; - aom_dec_model_info_t buffer_model; - aom_dec_model_op_parameters_t op_params[MAX_NUM_OPERATING_POINTS + 1]; - aom_op_timing_info_t op_frame_timing[MAX_NUM_OPERATING_POINTS + 1]; - uint32_t frame_presentation_time; - - int largest_tile_id; - size_t largest_tile_size; - int context_update_tile_id; - - // Scale of the current frame with respect to itself. 
- struct scale_factors sf_identity; - - YV12_BUFFER_CONFIG *frame_to_show; - RefCntBuffer *prev_frame; - - // TODO(hkuang): Combine this with cur_buf in macroblockd. - RefCntBuffer *cur_frame; - - int ref_frame_map[REF_FRAMES]; /* maps fb_idx to reference slot */ - - // Prepare ref_frame_map for the next frame. - // Only used in frame parallel decode. - int next_ref_frame_map[REF_FRAMES]; - - // TODO(jkoleszar): could expand active_ref_idx to 4, with 0 as intra, and - // roll new_fb_idx into it. - - // Each Inter frame can reference INTER_REFS_PER_FRAME buffers - RefBuffer frame_refs[INTER_REFS_PER_FRAME]; - int is_skip_mode_allowed; - int skip_mode_flag; - int ref_frame_idx_0; - int ref_frame_idx_1; - - int new_fb_idx; - - FRAME_TYPE last_frame_type; /* last frame's frame type for motion search.*/ - FRAME_TYPE frame_type; - - int show_frame; - int showable_frame; // frame can be used as show existing frame in future - int last_show_frame; - int show_existing_frame; - // Flag for a frame used as a reference - not written to the bitstream - int is_reference_frame; - int reset_decoder_state; - - // Flag signaling that the frame is encoded using only INTRA modes. - uint8_t intra_only; - uint8_t last_intra_only; - uint8_t disable_cdf_update; - int allow_high_precision_mv; - int cur_frame_force_integer_mv; // 0 the default in AOM, 1 only integer - - int allow_screen_content_tools; - int allow_intrabc; - int allow_warped_motion; - - // MBs, mb_rows/cols is in 16-pixel units; mi_rows/cols is in - // MB_MODE_INFO (8-pixel) units. - int MBs; - int mb_rows, mi_rows; - int mb_cols, mi_cols; - int mi_stride; - - /* profile settings */ - TX_MODE tx_mode; - -#if CONFIG_ENTROPY_STATS - int coef_cdf_category; -#endif - - int base_qindex; - int y_dc_delta_q; - int u_dc_delta_q; - int v_dc_delta_q; - int u_ac_delta_q; - int v_ac_delta_q; - - // The dequantizers below are true dequntizers used only in the - // dequantization process. 
They have the same coefficient - // shift/scale as TX. - int16_t y_dequant_QTX[MAX_SEGMENTS][2]; - int16_t u_dequant_QTX[MAX_SEGMENTS][2]; - int16_t v_dequant_QTX[MAX_SEGMENTS][2]; - - // Global quant matrix tables - const qm_val_t *giqmatrix[NUM_QM_LEVELS][3][TX_SIZES_ALL]; - const qm_val_t *gqmatrix[NUM_QM_LEVELS][3][TX_SIZES_ALL]; - - // Local quant matrix tables for each frame - const qm_val_t *y_iqmatrix[MAX_SEGMENTS][TX_SIZES_ALL]; - const qm_val_t *u_iqmatrix[MAX_SEGMENTS][TX_SIZES_ALL]; - const qm_val_t *v_iqmatrix[MAX_SEGMENTS][TX_SIZES_ALL]; - - // Encoder - int using_qmatrix; - int qm_y; - int qm_u; - int qm_v; - int min_qmlevel; - int max_qmlevel; - - /* We allocate a MB_MODE_INFO struct for each macroblock, together with - an extra row on top and column on the left to simplify prediction. */ - int mi_alloc_size; - MB_MODE_INFO *mip; /* Base of allocated array */ - MB_MODE_INFO *mi; /* Corresponds to upper left visible macroblock */ - - // TODO(agrange): Move prev_mi into encoder structure. - // prev_mip and prev_mi will only be allocated in encoder. - MB_MODE_INFO *prev_mip; /* MB_MODE_INFO array 'mip' from last decoded frame */ - MB_MODE_INFO *prev_mi; /* 'mi' from last frame (points into prev_mip) */ - - // Separate mi functions between encoder and decoder. - int (*alloc_mi)(struct AV1Common *cm, int mi_size); - void (*free_mi)(struct AV1Common *cm); - void (*setup_mi)(struct AV1Common *cm); - - // Grid of pointers to 8x8 MB_MODE_INFO structs. Any 8x8 not in the visible - // area will be NULL. - MB_MODE_INFO **mi_grid_base; - MB_MODE_INFO **mi_grid_visible; - MB_MODE_INFO **prev_mi_grid_base; - MB_MODE_INFO **prev_mi_grid_visible; - - // Whether to use previous frames' motion vectors for prediction. 
- int allow_ref_frame_mvs; - - uint8_t *last_frame_seg_map; - uint8_t *current_frame_seg_map; - int seg_map_alloc_size; - - InterpFilter interp_filter; - - int switchable_motion_mode; - - loop_filter_info_n lf_info; - // The denominator of the superres scale; the numerator is fixed. - uint8_t superres_scale_denominator; - int superres_upscaled_width; - int superres_upscaled_height; - RestorationInfo rst_info[MAX_MB_PLANE]; - - // rst_end_stripe[i] is one more than the index of the bottom stripe - // for tile row i. - int rst_end_stripe[MAX_TILE_ROWS]; - - // Pointer to a scratch buffer used by self-guided restoration - int32_t *rst_tmpbuf; - RestorationLineBuffers *rlbs; - - // Output of loop restoration - YV12_BUFFER_CONFIG rst_frame; - - // Flag signaling how frame contexts should be updated at the end of - // a frame decode - REFRESH_FRAME_CONTEXT_MODE refresh_frame_context; - - int ref_frame_sign_bias[REF_FRAMES]; /* Two state 0, 1 */ - - struct loopfilter lf; - struct segmentation seg; - int coded_lossless; // frame is fully lossless at the coded resolution. - int all_lossless; // frame is fully lossless at the upscaled resolution. 
- - int reduced_tx_set_used; - - // Context probabilities for reference frame prediction - MV_REFERENCE_FRAME comp_fwd_ref[FWD_REFS]; - MV_REFERENCE_FRAME comp_bwd_ref[BWD_REFS]; - REFERENCE_MODE reference_mode; - - FRAME_CONTEXT *fc; /* this frame entropy */ - FRAME_CONTEXT *frame_contexts; // FRAME_CONTEXTS - unsigned int frame_context_idx; /* Context to use/update */ - int fb_of_context_type[REF_FRAMES]; - int primary_ref_frame; - - unsigned int frame_offset; - - unsigned int current_video_frame; - - aom_bit_depth_t dequant_bit_depth; // bit_depth of current dequantizer - - int error_resilient_mode; - int force_primary_ref_none; - - int tile_cols, tile_rows; - int last_tile_cols, last_tile_rows; - - int max_tile_width_sb; - int min_log2_tile_cols; - int max_log2_tile_cols; - int max_log2_tile_rows; - int min_log2_tile_rows; - int min_log2_tiles; - int max_tile_height_sb; - int uniform_tile_spacing_flag; - int log2_tile_cols; // only valid for uniform tiles - int log2_tile_rows; // only valid for uniform tiles - int tile_col_start_sb[MAX_TILE_COLS + 1]; // valid for 0 <= i <= tile_cols - int tile_row_start_sb[MAX_TILE_ROWS + 1]; // valid for 0 <= i <= tile_rows - int tile_width, tile_height; // In MI units - - unsigned int large_scale_tile; - unsigned int single_tile_decoding; - - int byte_alignment; - int skip_loop_filter; - int skip_film_grain; - - // Private data associated with the frame buffer callbacks. - void *cb_priv; - aom_get_frame_buffer_cb_fn_t get_fb_cb; - aom_release_frame_buffer_cb_fn_t release_fb_cb; - - // Handles memory for the codec. - InternalFrameBufferList int_frame_buffers; - - // External BufferPool passed from outside. 
- BufferPool *buffer_pool; - - PARTITION_CONTEXT **above_seg_context; - ENTROPY_CONTEXT **above_context[MAX_MB_PLANE]; - TXFM_CONTEXT **above_txfm_context; - WarpedMotionParams global_motion[REF_FRAMES]; - aom_film_grain_t film_grain_params; - - int cdef_pri_damping; - int cdef_sec_damping; - int nb_cdef_strengths; - int cdef_strengths[CDEF_MAX_STRENGTHS]; - int cdef_uv_strengths[CDEF_MAX_STRENGTHS]; - int cdef_bits; - - int delta_q_present_flag; - // Resolution of delta quant - int delta_q_res; - int delta_lf_present_flag; - // Resolution of delta lf level - int delta_lf_res; - // This is a flag for number of deltas of loop filter level - // 0: use 1 delta, for y_vertical, y_horizontal, u, and v - // 1: use separate deltas for each filter level - int delta_lf_multi; - int num_tg; - SequenceHeader seq_params; - int current_frame_id; - int ref_frame_id[REF_FRAMES]; - int valid_for_referencing[REF_FRAMES]; - int invalid_delta_frame_id_minus_1; - LV_MAP_CTX_TABLE coeff_ctx_table; - TPL_MV_REF *tpl_mvs; - int tpl_mvs_mem_size; - // TODO(jingning): This can be combined with sign_bias later. - int8_t ref_frame_side[REF_FRAMES]; - - int is_annexb; - - int frame_refs_short_signaling; - int temporal_layer_id; - int spatial_layer_id; - unsigned int number_temporal_layers; - unsigned int number_spatial_layers; - int num_allocated_above_context_mi_col; - int num_allocated_above_contexts; - int num_allocated_above_context_planes; - -#if TXCOEFF_TIMER - int64_t cum_txcoeff_timer; - int64_t txcoeff_timer; - int txb_count; -#endif - -#if TXCOEFF_COST_TIMER - int64_t cum_txcoeff_cost_timer; - int64_t txcoeff_cost_timer; - int64_t txcoeff_cost_count; -#endif - const cfg_options_t *options; -} AV1_COMMON; - -// TODO(hkuang): Don't need to lock the whole pool after implementing atomic -// frame reference count. 
-static void lock_buffer_pool(BufferPool *const pool) { -#if CONFIG_MULTITHREAD - pthread_mutex_lock(&pool->pool_mutex); -#else - (void)pool; -#endif -} - -static void unlock_buffer_pool(BufferPool *const pool) { -#if CONFIG_MULTITHREAD - pthread_mutex_unlock(&pool->pool_mutex); -#else - (void)pool; -#endif -} - -static INLINE YV12_BUFFER_CONFIG *get_ref_frame(AV1_COMMON *cm, int index) { - if (index < 0 || index >= REF_FRAMES) return NULL; - if (cm->ref_frame_map[index] < 0) return NULL; - assert(cm->ref_frame_map[index] < FRAME_BUFFERS); - return &cm->buffer_pool->frame_bufs[cm->ref_frame_map[index]].buf; -} - -static INLINE YV12_BUFFER_CONFIG *get_frame_new_buffer( - const AV1_COMMON *const cm) { - return &cm->buffer_pool->frame_bufs[cm->new_fb_idx].buf; -} - -static INLINE int get_free_fb(AV1_COMMON *cm) { - RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs; - int i; - - lock_buffer_pool(cm->buffer_pool); - for (i = 0; i < FRAME_BUFFERS; ++i) - if (frame_bufs[i].ref_count == 0) break; - - if (i != FRAME_BUFFERS) { - if (frame_bufs[i].buf.use_external_reference_buffers) { - // If this frame buffer's y_buffer, u_buffer, and v_buffer point to the - // external reference buffers. Restore the buffer pointers to point to the - // internally allocated memory. - YV12_BUFFER_CONFIG *ybf = &frame_bufs[i].buf; - ybf->y_buffer = ybf->store_buf_adr[0]; - ybf->u_buffer = ybf->store_buf_adr[1]; - ybf->v_buffer = ybf->store_buf_adr[2]; - ybf->use_external_reference_buffers = 0; - } - - frame_bufs[i].ref_count = 1; - } else { - // Reset i to be INVALID_IDX to indicate no free buffer found. 
- i = INVALID_IDX; - } - - unlock_buffer_pool(cm->buffer_pool); - return i; -} - -static INLINE void ref_cnt_fb(RefCntBuffer *bufs, int *idx, int new_idx) { - const int ref_index = *idx; - - if (ref_index >= 0 && bufs[ref_index].ref_count > 0) - bufs[ref_index].ref_count--; - - *idx = new_idx; - - bufs[new_idx].ref_count++; -} - -static INLINE int frame_is_intra_only(const AV1_COMMON *const cm) { - return cm->frame_type == KEY_FRAME || cm->intra_only; -} - -static INLINE int frame_is_sframe(const AV1_COMMON *cm) { - return cm->frame_type == S_FRAME; -} - -static INLINE RefCntBuffer *get_prev_frame(const AV1_COMMON *const cm) { - if (cm->primary_ref_frame == PRIMARY_REF_NONE || - cm->frame_refs[cm->primary_ref_frame].idx == INVALID_IDX) { - return NULL; - } else { - return &cm->buffer_pool - ->frame_bufs[cm->frame_refs[cm->primary_ref_frame].idx]; - } -} - -// Returns 1 if this frame might allow mvs from some reference frame. -static INLINE int frame_might_allow_ref_frame_mvs(const AV1_COMMON *cm) { - return !cm->error_resilient_mode && cm->seq_params.enable_ref_frame_mvs && - cm->seq_params.enable_order_hint && !frame_is_intra_only(cm); -} - -// Returns 1 if this frame might use warped_motion -static INLINE int frame_might_allow_warped_motion(const AV1_COMMON *cm) { - return !cm->error_resilient_mode && !frame_is_intra_only(cm) && - cm->seq_params.enable_warped_motion; -} - -static INLINE void ensure_mv_buffer(RefCntBuffer *buf, AV1_COMMON *cm) { - const int buf_rows = buf->mi_rows; - const int buf_cols = buf->mi_cols; - - if (buf->mvs == NULL || buf_rows != cm->mi_rows || buf_cols != cm->mi_cols) { - aom_free(buf->mvs); - buf->mi_rows = cm->mi_rows; - buf->mi_cols = cm->mi_cols; - CHECK_MEM_ERROR(cm, buf->mvs, - (MV_REF *)aom_calloc( - ((cm->mi_rows + 1) >> 1) * ((cm->mi_cols + 1) >> 1), - sizeof(*buf->mvs))); - aom_free(buf->seg_map); - CHECK_MEM_ERROR(cm, buf->seg_map, - (uint8_t *)aom_calloc(cm->mi_rows * cm->mi_cols, - sizeof(*buf->seg_map))); - } - - const 
int mem_size = - ((cm->mi_rows + MAX_MIB_SIZE) >> 1) * (cm->mi_stride >> 1); - int realloc = cm->tpl_mvs == NULL; - if (cm->tpl_mvs) realloc |= cm->tpl_mvs_mem_size < mem_size; - - if (realloc) { - aom_free(cm->tpl_mvs); - CHECK_MEM_ERROR(cm, cm->tpl_mvs, - (TPL_MV_REF *)aom_calloc(mem_size, sizeof(*cm->tpl_mvs))); - cm->tpl_mvs_mem_size = mem_size; - } -} - -void cfl_init(CFL_CTX *cfl, const SequenceHeader *seq_params); - -static INLINE int av1_num_planes(const AV1_COMMON *cm) { - return cm->seq_params.monochrome ? 1 : MAX_MB_PLANE; -} - -static INLINE void av1_init_above_context(AV1_COMMON *cm, MACROBLOCKD *xd, - const int tile_row) { - const int num_planes = av1_num_planes(cm); - for (int i = 0; i < num_planes; ++i) { - xd->above_context[i] = cm->above_context[i][tile_row]; - } - xd->above_seg_context = cm->above_seg_context[tile_row]; - xd->above_txfm_context = cm->above_txfm_context[tile_row]; -} - -static INLINE void av1_init_macroblockd(AV1_COMMON *cm, MACROBLOCKD *xd, - tran_low_t *dqcoeff) { - const int num_planes = av1_num_planes(cm); - for (int i = 0; i < num_planes; ++i) { - xd->plane[i].dqcoeff = dqcoeff; - - if (xd->plane[i].plane_type == PLANE_TYPE_Y) { - memcpy(xd->plane[i].seg_dequant_QTX, cm->y_dequant_QTX, - sizeof(cm->y_dequant_QTX)); - memcpy(xd->plane[i].seg_iqmatrix, cm->y_iqmatrix, sizeof(cm->y_iqmatrix)); - - } else { - if (i == AOM_PLANE_U) { - memcpy(xd->plane[i].seg_dequant_QTX, cm->u_dequant_QTX, - sizeof(cm->u_dequant_QTX)); - memcpy(xd->plane[i].seg_iqmatrix, cm->u_iqmatrix, - sizeof(cm->u_iqmatrix)); - } else { - memcpy(xd->plane[i].seg_dequant_QTX, cm->v_dequant_QTX, - sizeof(cm->v_dequant_QTX)); - memcpy(xd->plane[i].seg_iqmatrix, cm->v_iqmatrix, - sizeof(cm->v_iqmatrix)); - } - } - } - xd->mi_stride = cm->mi_stride; - xd->error_info = &cm->error; - cfl_init(&xd->cfl, &cm->seq_params); -} - -static INLINE void set_skip_context(MACROBLOCKD *xd, int mi_row, int mi_col, - const int num_planes) { - int i; - int row_offset = mi_row; - 
int col_offset = mi_col; - for (i = 0; i < num_planes; ++i) { - struct macroblockd_plane *const pd = &xd->plane[i]; - // Offset the buffer pointer - const BLOCK_SIZE bsize = xd->mi[0]->sb_type; - if (pd->subsampling_y && (mi_row & 0x01) && (mi_size_high[bsize] == 1)) - row_offset = mi_row - 1; - if (pd->subsampling_x && (mi_col & 0x01) && (mi_size_wide[bsize] == 1)) - col_offset = mi_col - 1; - int above_idx = col_offset; - int left_idx = row_offset & MAX_MIB_MASK; - pd->above_context = &xd->above_context[i][above_idx >> pd->subsampling_x]; - pd->left_context = &xd->left_context[i][left_idx >> pd->subsampling_y]; - } -} - -static INLINE int calc_mi_size(int len) { - // len is in mi units. Align to a multiple of SBs. - return ALIGN_POWER_OF_TWO(len, MAX_MIB_SIZE_LOG2); -} - -static INLINE void set_plane_n4(MACROBLOCKD *const xd, int bw, int bh, - const int num_planes) { - int i; - for (i = 0; i < num_planes; i++) { - xd->plane[i].width = (bw * MI_SIZE) >> xd->plane[i].subsampling_x; - xd->plane[i].height = (bh * MI_SIZE) >> xd->plane[i].subsampling_y; - - xd->plane[i].width = AOMMAX(xd->plane[i].width, 4); - xd->plane[i].height = AOMMAX(xd->plane[i].height, 4); - } -} - -static INLINE void set_mi_row_col(MACROBLOCKD *xd, const TileInfo *const tile, - int mi_row, int bh, int mi_col, int bw, - int mi_rows, int mi_cols) { - xd->mb_to_top_edge = -((mi_row * MI_SIZE) * 8); - xd->mb_to_bottom_edge = ((mi_rows - bh - mi_row) * MI_SIZE) * 8; - xd->mb_to_left_edge = -((mi_col * MI_SIZE) * 8); - xd->mb_to_right_edge = ((mi_cols - bw - mi_col) * MI_SIZE) * 8; - - // Are edges available for intra prediction? 
- xd->up_available = (mi_row > tile->mi_row_start); - - const int ss_x = xd->plane[1].subsampling_x; - const int ss_y = xd->plane[1].subsampling_y; - - xd->left_available = (mi_col > tile->mi_col_start); - xd->chroma_up_available = xd->up_available; - xd->chroma_left_available = xd->left_available; - if (ss_x && bw < mi_size_wide[BLOCK_8X8]) - xd->chroma_left_available = (mi_col - 1) > tile->mi_col_start; - if (ss_y && bh < mi_size_high[BLOCK_8X8]) - xd->chroma_up_available = (mi_row - 1) > tile->mi_row_start; - if (xd->up_available) { - xd->above_mbmi = xd->mi[-xd->mi_stride]; - } else { - xd->above_mbmi = NULL; - } - - if (xd->left_available) { - xd->left_mbmi = xd->mi[-1]; - } else { - xd->left_mbmi = NULL; - } - - const int chroma_ref = ((mi_row & 0x01) || !(bh & 0x01) || !ss_y) && - ((mi_col & 0x01) || !(bw & 0x01) || !ss_x); - if (chroma_ref) { - // To help calculate the "above" and "left" chroma blocks, note that the - // current block may cover multiple luma blocks (eg, if partitioned into - // 4x4 luma blocks). - // First, find the top-left-most luma block covered by this chroma block - MB_MODE_INFO **base_mi = - &xd->mi[-(mi_row & ss_y) * xd->mi_stride - (mi_col & ss_x)]; - - // Then, we consider the luma region covered by the left or above 4x4 chroma - // prediction. We want to point to the chroma reference block in that - // region, which is the bottom-right-most mi unit. - // This leads to the following offsets: - MB_MODE_INFO *chroma_above_mi = - xd->chroma_up_available ? base_mi[-xd->mi_stride + ss_x] : NULL; - xd->chroma_above_mbmi = chroma_above_mi; - - MB_MODE_INFO *chroma_left_mi = - xd->chroma_left_available ? base_mi[ss_y * xd->mi_stride - 1] : NULL; - xd->chroma_left_mbmi = chroma_left_mi; - } - - xd->n4_h = bh; - xd->n4_w = bw; - xd->is_sec_rect = 0; - if (xd->n4_w < xd->n4_h) { - // Only mark is_sec_rect as 1 for the last block. - // For PARTITION_VERT_4, it would be (0, 0, 0, 1); - // For other partitions, it would be (0, 1). 
- if (!((mi_col + xd->n4_w) & (xd->n4_h - 1))) xd->is_sec_rect = 1; - } - - if (xd->n4_w > xd->n4_h) - if (mi_row & (xd->n4_w - 1)) xd->is_sec_rect = 1; -} - -static INLINE aom_cdf_prob *get_y_mode_cdf(FRAME_CONTEXT *tile_ctx, - const MB_MODE_INFO *above_mi, - const MB_MODE_INFO *left_mi) { - const PREDICTION_MODE above = av1_above_block_mode(above_mi); - const PREDICTION_MODE left = av1_left_block_mode(left_mi); - const int above_ctx = intra_mode_context[above]; - const int left_ctx = intra_mode_context[left]; - return tile_ctx->kf_y_cdf[above_ctx][left_ctx]; -} - -static INLINE void update_partition_context(MACROBLOCKD *xd, int mi_row, - int mi_col, BLOCK_SIZE subsize, - BLOCK_SIZE bsize) { - PARTITION_CONTEXT *const above_ctx = xd->above_seg_context + mi_col; - PARTITION_CONTEXT *const left_ctx = - xd->left_seg_context + (mi_row & MAX_MIB_MASK); - - const int bw = mi_size_wide[bsize]; - const int bh = mi_size_high[bsize]; - memset(above_ctx, partition_context_lookup[subsize].above, bw); - memset(left_ctx, partition_context_lookup[subsize].left, bh); -} - -static INLINE int is_chroma_reference(int mi_row, int mi_col, BLOCK_SIZE bsize, - int subsampling_x, int subsampling_y) { - const int bw = mi_size_wide[bsize]; - const int bh = mi_size_high[bsize]; - int ref_pos = ((mi_row & 0x01) || !(bh & 0x01) || !subsampling_y) && - ((mi_col & 0x01) || !(bw & 0x01) || !subsampling_x); - return ref_pos; -} - -static INLINE BLOCK_SIZE scale_chroma_bsize(BLOCK_SIZE bsize, int subsampling_x, - int subsampling_y) { - BLOCK_SIZE bs = bsize; - switch (bsize) { - case BLOCK_4X4: - if (subsampling_x == 1 && subsampling_y == 1) - bs = BLOCK_8X8; - else if (subsampling_x == 1) - bs = BLOCK_8X4; - else if (subsampling_y == 1) - bs = BLOCK_4X8; - break; - case BLOCK_4X8: - if (subsampling_x == 1 && subsampling_y == 1) - bs = BLOCK_8X8; - else if (subsampling_x == 1) - bs = BLOCK_8X8; - else if (subsampling_y == 1) - bs = BLOCK_4X8; - break; - case BLOCK_8X4: - if (subsampling_x == 1 && 
subsampling_y == 1) - bs = BLOCK_8X8; - else if (subsampling_x == 1) - bs = BLOCK_8X4; - else if (subsampling_y == 1) - bs = BLOCK_8X8; - break; - case BLOCK_4X16: - if (subsampling_x == 1 && subsampling_y == 1) - bs = BLOCK_8X16; - else if (subsampling_x == 1) - bs = BLOCK_8X16; - else if (subsampling_y == 1) - bs = BLOCK_4X16; - break; - case BLOCK_16X4: - if (subsampling_x == 1 && subsampling_y == 1) - bs = BLOCK_16X8; - else if (subsampling_x == 1) - bs = BLOCK_16X4; - else if (subsampling_y == 1) - bs = BLOCK_16X8; - break; - default: break; - } - return bs; -} - -static INLINE aom_cdf_prob cdf_element_prob(const aom_cdf_prob *cdf, - size_t element) { - assert(cdf != NULL); - return (element > 0 ? cdf[element - 1] : CDF_PROB_TOP) - cdf[element]; -} - -static INLINE void partition_gather_horz_alike(aom_cdf_prob *out, - const aom_cdf_prob *const in, - BLOCK_SIZE bsize) { - (void)bsize; - out[0] = CDF_PROB_TOP; - out[0] -= cdf_element_prob(in, PARTITION_HORZ); - out[0] -= cdf_element_prob(in, PARTITION_SPLIT); - out[0] -= cdf_element_prob(in, PARTITION_HORZ_A); - out[0] -= cdf_element_prob(in, PARTITION_HORZ_B); - out[0] -= cdf_element_prob(in, PARTITION_VERT_A); - if (bsize != BLOCK_128X128) out[0] -= cdf_element_prob(in, PARTITION_HORZ_4); - out[0] = AOM_ICDF(out[0]); - out[1] = AOM_ICDF(CDF_PROB_TOP); -} - -static INLINE void partition_gather_vert_alike(aom_cdf_prob *out, - const aom_cdf_prob *const in, - BLOCK_SIZE bsize) { - (void)bsize; - out[0] = CDF_PROB_TOP; - out[0] -= cdf_element_prob(in, PARTITION_VERT); - out[0] -= cdf_element_prob(in, PARTITION_SPLIT); - out[0] -= cdf_element_prob(in, PARTITION_HORZ_A); - out[0] -= cdf_element_prob(in, PARTITION_VERT_A); - out[0] -= cdf_element_prob(in, PARTITION_VERT_B); - if (bsize != BLOCK_128X128) out[0] -= cdf_element_prob(in, PARTITION_VERT_4); - out[0] = AOM_ICDF(out[0]); - out[1] = AOM_ICDF(CDF_PROB_TOP); -} - -static INLINE void update_ext_partition_context(MACROBLOCKD *xd, int mi_row, - int mi_col, 
BLOCK_SIZE subsize, - BLOCK_SIZE bsize, - PARTITION_TYPE partition) { - if (bsize >= BLOCK_8X8) { - const int hbs = mi_size_wide[bsize] / 2; - BLOCK_SIZE bsize2 = get_partition_subsize(bsize, PARTITION_SPLIT); - switch (partition) { - case PARTITION_SPLIT: - if (bsize != BLOCK_8X8) break; - AOM_FALLTHROUGH_INTENDED; - case PARTITION_NONE: - case PARTITION_HORZ: - case PARTITION_VERT: - case PARTITION_HORZ_4: - case PARTITION_VERT_4: - update_partition_context(xd, mi_row, mi_col, subsize, bsize); - break; - case PARTITION_HORZ_A: - update_partition_context(xd, mi_row, mi_col, bsize2, subsize); - update_partition_context(xd, mi_row + hbs, mi_col, subsize, subsize); - break; - case PARTITION_HORZ_B: - update_partition_context(xd, mi_row, mi_col, subsize, subsize); - update_partition_context(xd, mi_row + hbs, mi_col, bsize2, subsize); - break; - case PARTITION_VERT_A: - update_partition_context(xd, mi_row, mi_col, bsize2, subsize); - update_partition_context(xd, mi_row, mi_col + hbs, subsize, subsize); - break; - case PARTITION_VERT_B: - update_partition_context(xd, mi_row, mi_col, subsize, subsize); - update_partition_context(xd, mi_row, mi_col + hbs, bsize2, subsize); - break; - default: assert(0 && "Invalid partition type"); - } - } -} - -static INLINE int partition_plane_context(const MACROBLOCKD *xd, int mi_row, - int mi_col, BLOCK_SIZE bsize) { - const PARTITION_CONTEXT *above_ctx = xd->above_seg_context + mi_col; - const PARTITION_CONTEXT *left_ctx = - xd->left_seg_context + (mi_row & MAX_MIB_MASK); - // Minimum partition point is 8x8. Offset the bsl accordingly. 
- const int bsl = mi_size_wide_log2[bsize] - mi_size_wide_log2[BLOCK_8X8]; - int above = (*above_ctx >> bsl) & 1, left = (*left_ctx >> bsl) & 1; - - assert(mi_size_wide_log2[bsize] == mi_size_high_log2[bsize]); - assert(bsl >= 0); - - return (left * 2 + above) + bsl * PARTITION_PLOFFSET; -} - -// Return the number of elements in the partition CDF when -// partitioning the (square) block with luma block size of bsize. -static INLINE int partition_cdf_length(BLOCK_SIZE bsize) { - if (bsize <= BLOCK_8X8) - return PARTITION_TYPES; - else if (bsize == BLOCK_128X128) - return EXT_PARTITION_TYPES - 2; - else - return EXT_PARTITION_TYPES; -} - -static INLINE int max_block_wide(const MACROBLOCKD *xd, BLOCK_SIZE bsize, - int plane) { - int max_blocks_wide = block_size_wide[bsize]; - const struct macroblockd_plane *const pd = &xd->plane[plane]; - - if (xd->mb_to_right_edge < 0) - max_blocks_wide += xd->mb_to_right_edge >> (3 + pd->subsampling_x); - - // Scale the width in the transform block unit. - return max_blocks_wide >> tx_size_wide_log2[0]; -} - -static INLINE int max_block_high(const MACROBLOCKD *xd, BLOCK_SIZE bsize, - int plane) { - int max_blocks_high = block_size_high[bsize]; - const struct macroblockd_plane *const pd = &xd->plane[plane]; - - if (xd->mb_to_bottom_edge < 0) - max_blocks_high += xd->mb_to_bottom_edge >> (3 + pd->subsampling_y); - - // Scale the height in the transform block unit. 
- return max_blocks_high >> tx_size_high_log2[0]; -} - -static INLINE int max_intra_block_width(const MACROBLOCKD *xd, - BLOCK_SIZE plane_bsize, int plane, - TX_SIZE tx_size) { - const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane) - << tx_size_wide_log2[0]; - return ALIGN_POWER_OF_TWO(max_blocks_wide, tx_size_wide_log2[tx_size]); -} - -static INLINE int max_intra_block_height(const MACROBLOCKD *xd, - BLOCK_SIZE plane_bsize, int plane, - TX_SIZE tx_size) { - const int max_blocks_high = max_block_high(xd, plane_bsize, plane) - << tx_size_high_log2[0]; - return ALIGN_POWER_OF_TWO(max_blocks_high, tx_size_high_log2[tx_size]); -} - -static INLINE void av1_zero_above_context(AV1_COMMON *const cm, const MACROBLOCKD *xd, - int mi_col_start, int mi_col_end, const int tile_row) { - const SequenceHeader *const seq_params = &cm->seq_params; - const int num_planes = av1_num_planes(cm); - const int width = mi_col_end - mi_col_start; - const int aligned_width = - ALIGN_POWER_OF_TWO(width, seq_params->mib_size_log2); - - const int offset_y = mi_col_start; - const int width_y = aligned_width; - const int offset_uv = offset_y >> seq_params->subsampling_x; - const int width_uv = width_y >> seq_params->subsampling_x; - - av1_zero_array(cm->above_context[0][tile_row] + offset_y, width_y); - if (num_planes > 1) { - if (cm->above_context[1][tile_row] && cm->above_context[2][tile_row]) { - av1_zero_array(cm->above_context[1][tile_row] + offset_uv, width_uv); - av1_zero_array(cm->above_context[2][tile_row] + offset_uv, width_uv); - } else { - aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME, - "Invalid value of planes"); - } - } - - av1_zero_array(cm->above_seg_context[tile_row] + mi_col_start, aligned_width); - - memset(cm->above_txfm_context[tile_row] + mi_col_start, - tx_size_wide[TX_SIZES_LARGEST], - aligned_width * sizeof(TXFM_CONTEXT)); -} - -static INLINE void av1_zero_left_context(MACROBLOCKD *const xd) { - av1_zero(xd->left_context); - 
av1_zero(xd->left_seg_context); - - memset(xd->left_txfm_context_buffer, tx_size_high[TX_SIZES_LARGEST], - sizeof(xd->left_txfm_context_buffer)); -} - -// Disable array-bounds checks as the TX_SIZE enum contains values larger than -// TX_SIZES_ALL (TX_INVALID) which make extending the array as a workaround -// infeasible. The assert is enough for static analysis and this or other tools -// asan, valgrind would catch oob access at runtime. -#if defined(__GNUC__) && __GNUC__ >= 4 -#pragma GCC diagnostic ignored "-Warray-bounds" -#endif - -#if defined(__GNUC__) && __GNUC__ >= 4 -#pragma GCC diagnostic warning "-Warray-bounds" -#endif - -static INLINE void set_txfm_ctx(TXFM_CONTEXT *txfm_ctx, uint8_t txs, int len) { - int i; - for (i = 0; i < len; ++i) txfm_ctx[i] = txs; -} - -static INLINE void set_txfm_ctxs(TX_SIZE tx_size, int n4_w, int n4_h, int skip, - const MACROBLOCKD *xd) { - uint8_t bw = tx_size_wide[tx_size]; - uint8_t bh = tx_size_high[tx_size]; - - if (skip) { - bw = n4_w * MI_SIZE; - bh = n4_h * MI_SIZE; - } - - set_txfm_ctx(xd->above_txfm_context, bw, n4_w); - set_txfm_ctx(xd->left_txfm_context, bh, n4_h); -} - -static INLINE void txfm_partition_update(TXFM_CONTEXT *above_ctx, - TXFM_CONTEXT *left_ctx, - TX_SIZE tx_size, TX_SIZE txb_size) { - BLOCK_SIZE bsize = txsize_to_bsize[txb_size]; - int bh = mi_size_high[bsize]; - int bw = mi_size_wide[bsize]; - uint8_t txw = tx_size_wide[tx_size]; - uint8_t txh = tx_size_high[tx_size]; - int i; - for (i = 0; i < bh; ++i) left_ctx[i] = txh; - for (i = 0; i < bw; ++i) above_ctx[i] = txw; -} - -static INLINE TX_SIZE get_sqr_tx_size(int tx_dim) { - switch (tx_dim) { - case 128: - case 64: return TX_64X64; break; - case 32: return TX_32X32; break; - case 16: return TX_16X16; break; - case 8: return TX_8X8; break; - default: return TX_4X4; - } -} - -static INLINE TX_SIZE get_tx_size(int width, int height) { - if (width == height) { - return get_sqr_tx_size(width); - } - if (width < height) { - if (width + width == 
height) { - switch (width) { - case 4: return TX_4X8; break; - case 8: return TX_8X16; break; - case 16: return TX_16X32; break; - case 32: return TX_32X64; break; - } - } else { - switch (width) { - case 4: return TX_4X16; break; - case 8: return TX_8X32; break; - case 16: return TX_16X64; break; - } - } - } else { - if (height + height == width) { - switch (height) { - case 4: return TX_8X4; break; - case 8: return TX_16X8; break; - case 16: return TX_32X16; break; - case 32: return TX_64X32; break; - } - } else { - switch (height) { - case 4: return TX_16X4; break; - case 8: return TX_32X8; break; - case 16: return TX_64X16; break; - } - } - } - assert(0); - return TX_4X4; -} - -static INLINE int txfm_partition_context(TXFM_CONTEXT *above_ctx, - TXFM_CONTEXT *left_ctx, - BLOCK_SIZE bsize, TX_SIZE tx_size) { - const uint8_t txw = tx_size_wide[tx_size]; - const uint8_t txh = tx_size_high[tx_size]; - const int above = *above_ctx < txw; - const int left = *left_ctx < txh; - int category = TXFM_PARTITION_CONTEXTS; - - // dummy return, not used by others. - if (tx_size <= TX_4X4) return 0; - - TX_SIZE max_tx_size = - get_sqr_tx_size(AOMMAX(block_size_wide[bsize], block_size_high[bsize])); - - if (max_tx_size >= TX_8X8) { - category = - (txsize_sqr_up_map[tx_size] != max_tx_size && max_tx_size > TX_8X8) + - (TX_SIZES - 1 - max_tx_size) * 2; - } - assert(category != TXFM_PARTITION_CONTEXTS); - return category * 3 + above + left; -} - -// Compute the next partition in the direction of the sb_type stored in the mi -// array, starting with bsize. 
-static INLINE PARTITION_TYPE get_partition(const AV1_COMMON *const cm, - int mi_row, int mi_col, - BLOCK_SIZE bsize) { - if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return PARTITION_INVALID; - - const int offset = mi_row * cm->mi_stride + mi_col; - MB_MODE_INFO **mi = cm->mi_grid_visible + offset; - const BLOCK_SIZE subsize = mi[0]->sb_type; - - if (subsize == bsize) return PARTITION_NONE; - - const int bhigh = mi_size_high[bsize]; - const int bwide = mi_size_wide[bsize]; - const int sshigh = mi_size_high[subsize]; - const int sswide = mi_size_wide[subsize]; - - if (bsize > BLOCK_8X8 && mi_row + bwide / 2 < cm->mi_rows && - mi_col + bhigh / 2 < cm->mi_cols) { - // In this case, the block might be using an extended partition - // type. - const MB_MODE_INFO *const mbmi_right = mi[bwide / 2]; - const MB_MODE_INFO *const mbmi_below = mi[bhigh / 2 * cm->mi_stride]; - - if (sswide == bwide) { - // Smaller height but same width. Is PARTITION_HORZ_4, PARTITION_HORZ or - // PARTITION_HORZ_B. To distinguish the latter two, check if the lower - // half was split. - if (sshigh * 4 == bhigh) return PARTITION_HORZ_4; - assert(sshigh * 2 == bhigh); - - if (mbmi_below->sb_type == subsize) - return PARTITION_HORZ; - else - return PARTITION_HORZ_B; - } else if (sshigh == bhigh) { - // Smaller width but same height. Is PARTITION_VERT_4, PARTITION_VERT or - // PARTITION_VERT_B. To distinguish the latter two, check if the right - // half was split. - if (sswide * 4 == bwide) return PARTITION_VERT_4; - assert(sswide * 2 == bhigh); - - if (mbmi_right->sb_type == subsize) - return PARTITION_VERT; - else - return PARTITION_VERT_B; - } else { - // Smaller width and smaller height. Might be PARTITION_SPLIT or could be - // PARTITION_HORZ_A or PARTITION_VERT_A. If subsize isn't halved in both - // dimensions, we immediately know this is a split (which will recurse to - // get to subsize). Otherwise look down and to the right. 
With - // PARTITION_VERT_A, the right block will have height bhigh; with - // PARTITION_HORZ_A, the lower block with have width bwide. Otherwise - // it's PARTITION_SPLIT. - if (sswide * 2 != bwide || sshigh * 2 != bhigh) return PARTITION_SPLIT; - - if (mi_size_wide[mbmi_below->sb_type] == bwide) return PARTITION_HORZ_A; - if (mi_size_high[mbmi_right->sb_type] == bhigh) return PARTITION_VERT_A; - - return PARTITION_SPLIT; - } - } - const int vert_split = sswide < bwide; - const int horz_split = sshigh < bhigh; - const int split_idx = (vert_split << 1) | horz_split; - assert(split_idx != 0); - - static const PARTITION_TYPE base_partitions[4] = { - PARTITION_INVALID, PARTITION_HORZ, PARTITION_VERT, PARTITION_SPLIT - }; - - return base_partitions[split_idx]; -} - -static INLINE void set_use_reference_buffer(AV1_COMMON *const cm, int use) { - cm->seq_params.frame_id_numbers_present_flag = use; -} - -static INLINE void set_sb_size(SequenceHeader *const seq_params, - BLOCK_SIZE sb_size) { - seq_params->sb_size = sb_size; - seq_params->mib_size = mi_size_wide[seq_params->sb_size]; - seq_params->mib_size_log2 = mi_size_wide_log2[seq_params->sb_size]; -} - -// Returns true if the frame is fully lossless at the coded resolution. -// Note: If super-resolution is used, such a frame will still NOT be lossless at -// the upscaled resolution. 
-static INLINE int is_coded_lossless(const AV1_COMMON *cm, - const MACROBLOCKD *xd) { - int coded_lossless = 1; - if (cm->seg.enabled) { - for (int i = 0; i < MAX_SEGMENTS; ++i) { - if (!xd->lossless[i]) { - coded_lossless = 0; - break; - } - } - } else { - coded_lossless = xd->lossless[0]; - } - return coded_lossless; -} - -static INLINE int is_valid_seq_level_idx(uint8_t seq_level_idx) { - return seq_level_idx < 24 || seq_level_idx == 31; -} - -static INLINE uint8_t major_minor_to_seq_level_idx(BitstreamLevel bl) { - assert(bl.major >= LEVEL_MAJOR_MIN && bl.major <= LEVEL_MAJOR_MAX); - // Since bl.minor is unsigned a comparison will return a warning: - // comparison is always true due to limited range of data type - assert(LEVEL_MINOR_MIN == 0); - assert(bl.minor <= LEVEL_MINOR_MAX); - return ((bl.major - LEVEL_MAJOR_MIN) << LEVEL_MINOR_BITS) + bl.minor; -} - -#ifdef __cplusplus -} // extern "C" -#endif - -#endif // AOM_AV1_COMMON_ONYXC_INT_H_ diff --git a/media/libaom/src/av1/common/ppc/cfl_ppc.c b/media/libaom/src/av1/common/ppc/cfl_ppc.c index 026a07809..6f88768f2 100644 --- a/media/libaom/src/av1/common/ppc/cfl_ppc.c +++ b/media/libaom/src/av1/common/ppc/cfl_ppc.c @@ -124,27 +124,27 @@ CFL_SUB_AVG_X(vsx, 32, 32, 512, 10) // Based on observation, for small blocks VSX does not outperform C (no 64bit // load and store intrinsics). So we call the C code for block widths 4. 
-cfl_subtract_average_fn get_subtract_average_fn_vsx(TX_SIZE tx_size) { +cfl_subtract_average_fn cfl_get_subtract_average_fn_vsx(TX_SIZE tx_size) { static const cfl_subtract_average_fn sub_avg[TX_SIZES_ALL] = { - subtract_average_4x4_c, /* 4x4 */ - subtract_average_8x8_vsx, /* 8x8 */ - subtract_average_16x16_vsx, /* 16x16 */ - subtract_average_32x32_vsx, /* 32x32 */ - cfl_subtract_average_null, /* 64x64 (invalid CFL size) */ - subtract_average_4x8_c, /* 4x8 */ - subtract_average_8x4_vsx, /* 8x4 */ - subtract_average_8x16_vsx, /* 8x16 */ - subtract_average_16x8_vsx, /* 16x8 */ - subtract_average_16x32_vsx, /* 16x32 */ - subtract_average_32x16_vsx, /* 32x16 */ - cfl_subtract_average_null, /* 32x64 (invalid CFL size) */ - cfl_subtract_average_null, /* 64x32 (invalid CFL size) */ - subtract_average_4x16_c, /* 4x16 */ - subtract_average_16x4_vsx, /* 16x4 */ - subtract_average_8x32_vsx, /* 8x32 */ - subtract_average_32x8_vsx, /* 32x8 */ - cfl_subtract_average_null, /* 16x64 (invalid CFL size) */ - cfl_subtract_average_null, /* 64x16 (invalid CFL size) */ + cfl_subtract_average_4x4_c, /* 4x4 */ + cfl_subtract_average_8x8_vsx, /* 8x8 */ + cfl_subtract_average_16x16_vsx, /* 16x16 */ + cfl_subtract_average_32x32_vsx, /* 32x32 */ + NULL, /* 64x64 (invalid CFL size) */ + cfl_subtract_average_4x8_c, /* 4x8 */ + cfl_subtract_average_8x4_vsx, /* 8x4 */ + cfl_subtract_average_8x16_vsx, /* 8x16 */ + cfl_subtract_average_16x8_vsx, /* 16x8 */ + cfl_subtract_average_16x32_vsx, /* 16x32 */ + cfl_subtract_average_32x16_vsx, /* 32x16 */ + NULL, /* 32x64 (invalid CFL size) */ + NULL, /* 64x32 (invalid CFL size) */ + cfl_subtract_average_4x16_c, /* 4x16 */ + cfl_subtract_average_16x4_vsx, /* 16x4 */ + cfl_subtract_average_8x32_vsx, /* 8x32 */ + cfl_subtract_average_32x8_vsx, /* 32x8 */ + NULL, /* 16x64 (invalid CFL size) */ + NULL, /* 64x16 (invalid CFL size) */ }; // Modulo TX_SIZES_ALL to ensure that an attacker won't be able to // index the function pointer array out of bounds. 
diff --git a/media/libaom/src/av1/common/pred_common.h b/media/libaom/src/av1/common/pred_common.h index 6dba2322d..d1dab97e7 100644 --- a/media/libaom/src/av1/common/pred_common.h +++ b/media/libaom/src/av1/common/pred_common.h @@ -12,29 +12,31 @@ #ifndef AOM_AV1_COMMON_PRED_COMMON_H_ #define AOM_AV1_COMMON_PRED_COMMON_H_ +#include "av1/common/av1_common_int.h" #include "av1/common/blockd.h" #include "av1/common/mvref_common.h" -#include "av1/common/onyxc_int.h" #include "aom_dsp/aom_dsp_common.h" #ifdef __cplusplus extern "C" { #endif -static INLINE int get_segment_id(const AV1_COMMON *const cm, +static INLINE int get_segment_id(const CommonModeInfoParams *const mi_params, const uint8_t *segment_ids, BLOCK_SIZE bsize, int mi_row, int mi_col) { - const int mi_offset = mi_row * cm->mi_cols + mi_col; + const int mi_offset = mi_row * mi_params->mi_cols + mi_col; const int bw = mi_size_wide[bsize]; const int bh = mi_size_high[bsize]; - const int xmis = AOMMIN(cm->mi_cols - mi_col, bw); - const int ymis = AOMMIN(cm->mi_rows - mi_row, bh); - int x, y, segment_id = MAX_SEGMENTS; - - for (y = 0; y < ymis; ++y) - for (x = 0; x < xmis; ++x) - segment_id = - AOMMIN(segment_id, segment_ids[mi_offset + y * cm->mi_cols + x]); + const int xmis = AOMMIN(mi_params->mi_cols - mi_col, bw); + const int ymis = AOMMIN(mi_params->mi_rows - mi_row, bh); + int segment_id = MAX_SEGMENTS; + + for (int y = 0; y < ymis; ++y) { + for (int x = 0; x < xmis; ++x) { + segment_id = AOMMIN(segment_id, + segment_ids[mi_offset + y * mi_params->mi_cols + x]); + } + } assert(segment_id >= 0 && segment_id < MAX_SEGMENTS); return segment_id; @@ -42,26 +44,33 @@ static INLINE int get_segment_id(const AV1_COMMON *const cm, static INLINE int av1_get_spatial_seg_pred(const AV1_COMMON *const cm, const MACROBLOCKD *const xd, - int mi_row, int mi_col, int *cdf_index) { int prev_ul = -1; // top left segment_id int prev_l = -1; // left segment_id int prev_u = -1; // top segment_id + const int mi_row = xd->mi_row; 
+ const int mi_col = xd->mi_col; + const CommonModeInfoParams *const mi_params = &cm->mi_params; + const uint8_t *seg_map = cm->cur_frame->seg_map; if ((xd->up_available) && (xd->left_available)) { - prev_ul = get_segment_id(cm, cm->current_frame_seg_map, BLOCK_4X4, - mi_row - 1, mi_col - 1); + prev_ul = + get_segment_id(mi_params, seg_map, BLOCK_4X4, mi_row - 1, mi_col - 1); } if (xd->up_available) { - prev_u = get_segment_id(cm, cm->current_frame_seg_map, BLOCK_4X4, - mi_row - 1, mi_col - 0); + prev_u = + get_segment_id(mi_params, seg_map, BLOCK_4X4, mi_row - 1, mi_col - 0); } if (xd->left_available) { - prev_l = get_segment_id(cm, cm->current_frame_seg_map, BLOCK_4X4, - mi_row - 0, mi_col - 1); + prev_l = + get_segment_id(mi_params, seg_map, BLOCK_4X4, mi_row - 0, mi_col - 1); } + // This property follows from the fact that get_segment_id() returns a + // nonnegative value. This allows us to test for all edge cases with a simple + // prev_ul < 0 check. + assert(IMPLIES(prev_ul >= 0, prev_u >= 0 && prev_l >= 0)); // Pick CDF index based on number of matching/out-of-bounds segment IDs. 
- if (prev_ul < 0 || prev_u < 0 || prev_l < 0) /* Edge case */ + if (prev_ul < 0) /* Edge cases */ *cdf_index = 0; else if ((prev_ul == prev_u) && (prev_ul == prev_l)) *cdf_index = 2; @@ -90,18 +99,18 @@ static INLINE int av1_get_pred_context_seg_id(const MACROBLOCKD *xd) { static INLINE int get_comp_index_context(const AV1_COMMON *cm, const MACROBLOCKD *xd) { MB_MODE_INFO *mbmi = xd->mi[0]; - int bck_idx = cm->frame_refs[mbmi->ref_frame[0] - LAST_FRAME].idx; - int fwd_idx = cm->frame_refs[mbmi->ref_frame[1] - LAST_FRAME].idx; + const RefCntBuffer *const bck_buf = get_ref_frame_buf(cm, mbmi->ref_frame[0]); + const RefCntBuffer *const fwd_buf = get_ref_frame_buf(cm, mbmi->ref_frame[1]); int bck_frame_index = 0, fwd_frame_index = 0; - int cur_frame_index = cm->cur_frame->cur_frame_offset; + int cur_frame_index = cm->cur_frame->order_hint; - if (bck_idx >= 0) - bck_frame_index = cm->buffer_pool->frame_bufs[bck_idx].cur_frame_offset; + if (bck_buf != NULL) bck_frame_index = bck_buf->order_hint; + if (fwd_buf != NULL) fwd_frame_index = fwd_buf->order_hint; - if (fwd_idx >= 0) - fwd_frame_index = cm->buffer_pool->frame_bufs[fwd_idx].cur_frame_offset; - int fwd = abs(get_relative_dist(cm, fwd_frame_index, cur_frame_index)); - int bck = abs(get_relative_dist(cm, cur_frame_index, bck_frame_index)); + int fwd = abs(get_relative_dist(&cm->seq_params.order_hint_info, + fwd_frame_index, cur_frame_index)); + int bck = abs(get_relative_dist(&cm->seq_params.order_hint_info, + cur_frame_index, bck_frame_index)); const MB_MODE_INFO *const above_mi = xd->above_mbmi; const MB_MODE_INFO *const left_mi = xd->left_mbmi; @@ -109,14 +118,14 @@ static INLINE int get_comp_index_context(const AV1_COMMON *cm, int above_ctx = 0, left_ctx = 0; const int offset = (fwd == bck); - if (above_mi) { + if (above_mi != NULL) { if (has_second_ref(above_mi)) above_ctx = above_mi->compound_idx; else if (above_mi->ref_frame[0] == ALTREF_FRAME) above_ctx = 1; } - if (left_mi) { + if (left_mi != NULL) { if 
(has_second_ref(left_mi)) left_ctx = left_mi->compound_idx; else if (left_mi->ref_frame[0] == ALTREF_FRAME) @@ -178,6 +187,7 @@ int av1_get_palette_cache(const MACROBLOCKD *const xd, int plane, uint16_t *cache); static INLINE int av1_get_palette_bsize_ctx(BLOCK_SIZE bsize) { + assert(bsize < BLOCK_SIZES_ALL); return num_pels_log2_lookup[bsize] - num_pels_log2_lookup[BLOCK_8X8]; } @@ -198,6 +208,10 @@ static INLINE aom_cdf_prob *av1_get_reference_mode_cdf(const MACROBLOCKD *xd) { return xd->tile_ctx->comp_inter_cdf[av1_get_reference_mode_context(xd)]; } +static INLINE aom_cdf_prob *av1_get_skip_cdf(const MACROBLOCKD *xd) { + return xd->tile_ctx->skip_cdfs[av1_get_skip_context(xd)]; +} + int av1_get_comp_reference_type_context(const MACROBLOCKD *xd); // == Uni-directional contexts == diff --git a/media/libaom/src/av1/common/quant_common.c b/media/libaom/src/av1/common/quant_common.c index 0e14da7a3..e96d71a3b 100644 --- a/media/libaom/src/av1/common/quant_common.c +++ b/media/libaom/src/av1/common/quant_common.c @@ -9,14 +9,14 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ +#include "av1/common/av1_common_int.h" +#include "av1/common/blockd.h" #include "av1/common/common.h" -#include "av1/common/onyxc_int.h" #include "av1/common/entropy.h" #include "av1/common/quant_common.h" #include "av1/common/seg_common.h" -#include "av1/common/blockd.h" -static const int16_t dc_qlookup_Q3[QINDEX_RANGE] = { +static const int16_t dc_qlookup_QTX[QINDEX_RANGE] = { 4, 8, 8, 9, 10, 11, 12, 12, 13, 14, 15, 16, 17, 18, 19, 19, 20, 21, 22, 23, 24, 25, 26, 26, 27, 28, 29, 30, 31, 32, 32, 33, 34, 35, 36, 37, 38, 38, 39, 40, 41, 42, @@ -38,7 +38,7 @@ static const int16_t dc_qlookup_Q3[QINDEX_RANGE] = { 1184, 1232, 1282, 1336, }; -static const int16_t dc_qlookup_10_Q3[QINDEX_RANGE] = { +static const int16_t dc_qlookup_10_QTX[QINDEX_RANGE] = { 4, 9, 10, 13, 15, 17, 20, 22, 25, 28, 31, 34, 37, 40, 43, 47, 50, 53, 57, 60, 64, 68, 71, 75, 78, 82, 86, 90, 93, 97, 101, 105, 109, 113, 116, 120, 124, 128, 132, @@ -61,7 +61,7 @@ static const int16_t dc_qlookup_10_Q3[QINDEX_RANGE] = { 3953, 4089, 4236, 4394, 4559, 4737, 4929, 5130, 5347, }; -static const int16_t dc_qlookup_12_Q3[QINDEX_RANGE] = { +static const int16_t dc_qlookup_12_QTX[QINDEX_RANGE] = { 4, 12, 18, 25, 33, 41, 50, 60, 70, 80, 91, 103, 115, 127, 140, 153, 166, 180, 194, 208, 222, 237, 251, 266, 281, 296, 312, 327, 343, 358, 374, 390, 405, @@ -88,7 +88,7 @@ static const int16_t dc_qlookup_12_Q3[QINDEX_RANGE] = { 19718, 20521, 21387, }; -static const int16_t ac_qlookup_Q3[QINDEX_RANGE] = { +static const int16_t ac_qlookup_QTX[QINDEX_RANGE] = { 4, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, @@ -111,7 +111,7 @@ static const int16_t ac_qlookup_Q3[QINDEX_RANGE] = { 1567, 1597, 1628, 1660, 1692, 1725, 1759, 1793, 1828, }; -static const int16_t ac_qlookup_10_Q3[QINDEX_RANGE] = { +static const int16_t ac_qlookup_10_QTX[QINDEX_RANGE] = { 4, 9, 11, 13, 16, 18, 21, 24, 27, 30, 33, 37, 40, 44, 48, 51, 55, 59, 
63, 67, 71, 75, 79, 83, 88, 92, 96, 100, 105, 109, 114, 118, 122, 127, 131, 136, 140, 145, 149, @@ -134,7 +134,7 @@ static const int16_t ac_qlookup_10_Q3[QINDEX_RANGE] = { 6268, 6388, 6512, 6640, 6768, 6900, 7036, 7172, 7312, }; -static const int16_t ac_qlookup_12_Q3[QINDEX_RANGE] = { +static const int16_t ac_qlookup_12_QTX[QINDEX_RANGE] = { 4, 13, 19, 27, 35, 44, 54, 64, 75, 87, 99, 112, 126, 139, 154, 168, 183, 199, 214, 230, 247, 263, 280, 297, 314, 331, 349, 366, 384, 402, 420, 438, 456, @@ -190,39 +190,30 @@ static const int16_t ac_qlookup_12_Q3[QINDEX_RANGE] = { // addition, the minimum allowable quantizer is 4; smaller values will // underflow to 0 in the actual quantization routines. -int16_t av1_dc_quant_Q3(int qindex, int delta, aom_bit_depth_t bit_depth) { +int16_t av1_dc_quant_QTX(int qindex, int delta, aom_bit_depth_t bit_depth) { + const int q_clamped = clamp(qindex + delta, 0, MAXQ); switch (bit_depth) { - case AOM_BITS_8: return dc_qlookup_Q3[clamp(qindex + delta, 0, MAXQ)]; - case AOM_BITS_10: return dc_qlookup_10_Q3[clamp(qindex + delta, 0, MAXQ)]; - case AOM_BITS_12: return dc_qlookup_12_Q3[clamp(qindex + delta, 0, MAXQ)]; + case AOM_BITS_8: return dc_qlookup_QTX[q_clamped]; + case AOM_BITS_10: return dc_qlookup_10_QTX[q_clamped]; + case AOM_BITS_12: return dc_qlookup_12_QTX[q_clamped]; default: assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12"); return -1; } } -int16_t av1_ac_quant_Q3(int qindex, int delta, aom_bit_depth_t bit_depth) { +int16_t av1_ac_quant_QTX(int qindex, int delta, aom_bit_depth_t bit_depth) { + const int q_clamped = clamp(qindex + delta, 0, MAXQ); switch (bit_depth) { - case AOM_BITS_8: return ac_qlookup_Q3[clamp(qindex + delta, 0, MAXQ)]; - case AOM_BITS_10: return ac_qlookup_10_Q3[clamp(qindex + delta, 0, MAXQ)]; - case AOM_BITS_12: return ac_qlookup_12_Q3[clamp(qindex + delta, 0, MAXQ)]; + case AOM_BITS_8: return ac_qlookup_QTX[q_clamped]; + case AOM_BITS_10: return ac_qlookup_10_QTX[q_clamped]; + 
case AOM_BITS_12: return ac_qlookup_12_QTX[q_clamped]; default: assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12"); return -1; } } -// In AV1 TX, the coefficients are always scaled up a factor of 8 (3 -// bits), so QTX == Q3. - -int16_t av1_dc_quant_QTX(int qindex, int delta, aom_bit_depth_t bit_depth) { - return av1_dc_quant_Q3(qindex, delta, bit_depth); -} - -int16_t av1_ac_quant_QTX(int qindex, int delta, aom_bit_depth_t bit_depth) { - return av1_ac_quant_Q3(qindex, delta, bit_depth); -} - int av1_get_qindex(const struct segmentation *seg, int segment_id, int base_qindex) { if (segfeature_active(seg, segment_id, SEG_LVL_ALT_Q)) { @@ -234,39 +225,82 @@ int av1_get_qindex(const struct segmentation *seg, int segment_id, } } -const qm_val_t *av1_iqmatrix(AV1_COMMON *cm, int qmlevel, int plane, - TX_SIZE tx_size) { - return &cm->giqmatrix[qmlevel][plane][tx_size][0]; +bool av1_use_qmatrix(const CommonQuantParams *quant_params, + const struct macroblockd *xd, int segment_id) { + // True if explicit Q matrix levels and this is not a lossless segment. + return quant_params->using_qmatrix && !xd->lossless[segment_id]; } -const qm_val_t *av1_qmatrix(AV1_COMMON *cm, int qmlevel, int plane, - TX_SIZE tx_size) { - return &cm->gqmatrix[qmlevel][plane][tx_size][0]; + +const qm_val_t *av1_iqmatrix(const CommonQuantParams *quant_params, int qmlevel, + int plane, TX_SIZE tx_size) { + assert(quant_params->giqmatrix[qmlevel][plane][tx_size] != NULL || + qmlevel == NUM_QM_LEVELS - 1); + return quant_params->giqmatrix[qmlevel][plane][tx_size]; +} +const qm_val_t *av1_qmatrix(const CommonQuantParams *quant_params, int qmlevel, + int plane, TX_SIZE tx_size) { + assert(quant_params->gqmatrix[qmlevel][plane][tx_size] != NULL || + qmlevel == NUM_QM_LEVELS - 1); + return quant_params->gqmatrix[qmlevel][plane][tx_size]; +} + +// Returns true if the tx_type corresponds to non-identity transform in both +// horizontal and vertical directions. 
+static INLINE bool is_2d_transform(TX_TYPE tx_type) { return (tx_type < IDTX); } + +const qm_val_t *av1_get_iqmatrix(const CommonQuantParams *quant_params, + const MACROBLOCKD *xd, int plane, + TX_SIZE tx_size, TX_TYPE tx_type) { + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const MB_MODE_INFO *const mbmi = xd->mi[0]; + const int seg_id = mbmi->segment_id; + const TX_SIZE qm_tx_size = av1_get_adjusted_tx_size(tx_size); + // Use a flat matrix (i.e. no weighting) for 1D and Identity transforms + return is_2d_transform(tx_type) + ? pd->seg_iqmatrix[seg_id][qm_tx_size] + : quant_params->giqmatrix[NUM_QM_LEVELS - 1][0][qm_tx_size]; +} + +const qm_val_t *av1_get_qmatrix(const CommonQuantParams *quant_params, + const MACROBLOCKD *xd, int plane, + TX_SIZE tx_size, TX_TYPE tx_type) { + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const MB_MODE_INFO *const mbmi = xd->mi[0]; + const int seg_id = mbmi->segment_id; + const TX_SIZE qm_tx_size = av1_get_adjusted_tx_size(tx_size); + // Use a flat matrix (i.e. no weighting) for 1D and Identity transforms + return is_2d_transform(tx_type) + ? pd->seg_qmatrix[seg_id][qm_tx_size] + : quant_params->gqmatrix[NUM_QM_LEVELS - 1][0][qm_tx_size]; } #define QM_TOTAL_SIZE 3344 -static const qm_val_t wt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE]; -static const qm_val_t iwt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE]; +// We only use wt_matrix_ref[q] and iwt_matrix_ref[q] +// for q = 0, ..., NUM_QM_LEVELS - 2. 
+static const qm_val_t wt_matrix_ref[NUM_QM_LEVELS - 1][2][QM_TOTAL_SIZE]; +static const qm_val_t iwt_matrix_ref[NUM_QM_LEVELS - 1][2][QM_TOTAL_SIZE]; -void av1_qm_init(AV1_COMMON *cm) { - const int num_planes = av1_num_planes(cm); - int q, c, t; - int current; - for (q = 0; q < NUM_QM_LEVELS; ++q) { - for (c = 0; c < num_planes; ++c) { - current = 0; - for (t = 0; t < TX_SIZES_ALL; ++t) { +void av1_qm_init(CommonQuantParams *quant_params, int num_planes) { + for (int q = 0; q < NUM_QM_LEVELS; ++q) { + for (int c = 0; c < num_planes; ++c) { + int current = 0; + for (int t = 0; t < TX_SIZES_ALL; ++t) { const int size = tx_size_2d[t]; const int qm_tx_size = av1_get_adjusted_tx_size(t); if (q == NUM_QM_LEVELS - 1) { - cm->gqmatrix[q][c][t] = NULL; - cm->giqmatrix[q][c][t] = NULL; + quant_params->gqmatrix[q][c][t] = NULL; + quant_params->giqmatrix[q][c][t] = NULL; } else if (t != qm_tx_size) { // Reuse matrices for 'qm_tx_size' - cm->gqmatrix[q][c][t] = cm->gqmatrix[q][c][qm_tx_size]; - cm->giqmatrix[q][c][t] = cm->giqmatrix[q][c][qm_tx_size]; + assert(t > qm_tx_size); + quant_params->gqmatrix[q][c][t] = + quant_params->gqmatrix[q][c][qm_tx_size]; + quant_params->giqmatrix[q][c][t] = + quant_params->giqmatrix[q][c][qm_tx_size]; } else { assert(current + size <= QM_TOTAL_SIZE); - cm->gqmatrix[q][c][t] = &wt_matrix_ref[q][c >= 1][current]; - cm->giqmatrix[q][c][t] = &iwt_matrix_ref[q][c >= 1][current]; + quant_params->gqmatrix[q][c][t] = &wt_matrix_ref[q][c >= 1][current]; + quant_params->giqmatrix[q][c][t] = + &iwt_matrix_ref[q][c >= 1][current]; current += size; } } @@ -274,7 +308,7 @@ void av1_qm_init(AV1_COMMON *cm) { } } -/* Provide 16 sets of quantization matrices for chroma and luma +/* Provide 15 sets of quantization matrices for chroma and luma and each TX size. Matrices for different TX sizes are in fact sub-sampled from the 32x32 and 16x16 sizes, but explicitly defined here for convenience. 
Intra and inter matrix sets are the @@ -283,9 +317,10 @@ void av1_qm_init(AV1_COMMON *cm) { frame. Matrices for different QM levels have been rescaled in the frequency domain according to different nominal viewing - distances. + distances. Matrices for QM level 15 are omitted because they are + not used. */ -static const qm_val_t iwt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = { +static const qm_val_t iwt_matrix_ref[NUM_QM_LEVELS - 1][2][QM_TOTAL_SIZE] = { { { /* Luma */ /* Size 4x4 */ @@ -6633,427 +6668,9 @@ static const qm_val_t iwt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = { 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32 }, }, - { - { /* Luma */ - /* Size 4x4 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - /* Size 8x8 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - /* Size 16x16 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 
32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, - /* Size 32x32 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 
32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 
32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - /* Size 4x8 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - /* Size 8x4 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - /* Size 8x16 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, - /* Size 16x8 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, - /* Size 16x32 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 
32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, - /* Size 32x16 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 
32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 
32, 32, 32, 32, 32, - /* Size 4x16 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - /* Size 16x4 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - /* Size 8x32 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, - /* Size 32x8 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 
32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32 }, - { /* Chroma */ - /* Size 4x4 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - /* Size 8x8 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - /* Size 16x16 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 
32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, - /* Size 32x32 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 
32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 
32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - /* Size 4x8 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - /* Size 8x4 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - /* Size 8x16 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, - /* Size 16x8 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, - /* Size 16x32 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 
32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, - /* Size 32x16 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 
32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 
32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, - /* Size 4x16 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - /* Size 16x4 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - /* Size 8x32 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, - /* Size 32x8 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 
32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32 }, - }, }; -static const qm_val_t wt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = { +static const qm_val_t wt_matrix_ref[NUM_QM_LEVELS - 1][2][QM_TOTAL_SIZE] = { { { /* Luma */ /* Size 4x4 */ @@ -13255,422 +12872,4 @@ static const qm_val_t wt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = { 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32 }, }, - { - { /* Luma */ - /* Size 4x4 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - /* Size 8x8 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - /* Size 16x16 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 
32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, - /* Size 32x32 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 
32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 
32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - /* Size 4x8 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - /* Size 8x4 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - /* Size 8x16 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, - /* Size 16x8 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 
32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, - /* Size 16x32 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 
32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, - /* Size 32x16 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 
32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, - /* Size 4x16 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - /* Size 16x4 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - /* Size 8x32 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 
32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, - /* Size 32x8 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32 }, - { /* Chroma */ - /* Size 4x4 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - /* Size 8x8 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - /* Size 16x16 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 
32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, - /* Size 32x32 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 
32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 
32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - /* Size 4x8 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - /* Size 8x4 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - /* Size 8x16 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, - /* Size 16x8 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 
32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, - /* Size 16x32 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 
32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, - /* Size 32x16 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 
32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, - /* Size 4x16 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - /* Size 16x4 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - /* Size 8x32 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 
32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, - /* Size 32x8 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32 }, - }, }; diff --git a/media/libaom/src/av1/common/quant_common.h b/media/libaom/src/av1/common/quant_common.h index d1f52a660..9c30204ff 100644 --- a/media/libaom/src/av1/common/quant_common.h +++ b/media/libaom/src/av1/common/quant_common.h @@ -12,6 +12,7 @@ #ifndef AOM_AV1_COMMON_QUANT_COMMON_H_ #define AOM_AV1_COMMON_QUANT_COMMON_H_ +#include <stdbool.h> #include "aom/aom_codec.h" #include "av1/common/seg_common.h" #include "av1/common/enums.h" @@ -37,24 +38,43 @@ extern "C" { #define DEFAULT_QM_LAST 9 struct AV1Common; +struct CommonQuantParams; +struct macroblockd; -int16_t av1_dc_quant_Q3(int qindex, int delta, aom_bit_depth_t bit_depth); -int16_t av1_ac_quant_Q3(int qindex, int delta, aom_bit_depth_t bit_depth); int16_t 
av1_dc_quant_QTX(int qindex, int delta, aom_bit_depth_t bit_depth); int16_t av1_ac_quant_QTX(int qindex, int delta, aom_bit_depth_t bit_depth); int av1_get_qindex(const struct segmentation *seg, int segment_id, int base_qindex); + +// Returns true if we are using quantization matrix. +bool av1_use_qmatrix(const struct CommonQuantParams *quant_params, + const struct macroblockd *xd, int segment_id); + // Reduce the large number of quantizers to a smaller number of levels for which // different matrices may be defined static INLINE int aom_get_qmlevel(int qindex, int first, int last) { return first + (qindex * (last + 1 - first)) / QINDEX_RANGE; } -void av1_qm_init(struct AV1Common *cm); -const qm_val_t *av1_iqmatrix(struct AV1Common *cm, int qindex, int comp, - TX_SIZE tx_size); -const qm_val_t *av1_qmatrix(struct AV1Common *cm, int qindex, int comp, - TX_SIZE tx_size); + +// Initialize all global quant/dequant matrices. +void av1_qm_init(struct CommonQuantParams *quant_params, int num_planes); + +// Get global dequant matrix. +const qm_val_t *av1_iqmatrix(const struct CommonQuantParams *quant_params, + int qmlevel, int plane, TX_SIZE tx_size); +// Get global quant matrix. +const qm_val_t *av1_qmatrix(const struct CommonQuantParams *quant_params, + int qmlevel, int plane, TX_SIZE tx_size); + +// Get either local / global dequant matrix as appropriate. +const qm_val_t *av1_get_iqmatrix(const struct CommonQuantParams *quant_params, + const struct macroblockd *xd, int plane, + TX_SIZE tx_size, TX_TYPE tx_type); +// Get either local / global quant matrix as appropriate. 
+const qm_val_t *av1_get_qmatrix(const struct CommonQuantParams *quant_params, + const struct macroblockd *xd, int plane, + TX_SIZE tx_size, TX_TYPE tx_type); #ifdef __cplusplus } // extern "C" diff --git a/media/libaom/src/av1/common/reconinter.c b/media/libaom/src/av1/common/reconinter.c index 3203efce4..287adddcc 100644 --- a/media/libaom/src/av1/common/reconinter.c +++ b/media/libaom/src/av1/common/reconinter.c @@ -20,25 +20,24 @@ #include "aom/aom_integer.h" #include "aom_dsp/blend.h" +#include "av1/common/av1_common_int.h" #include "av1/common/blockd.h" #include "av1/common/mvref_common.h" +#include "av1/common/obmc.h" #include "av1/common/reconinter.h" #include "av1/common/reconintra.h" -#include "av1/common/onyxc_int.h" -#include "av1/common/obmc.h" - -#define USE_PRECOMPUTED_WEDGE_MASK 1 -#define USE_PRECOMPUTED_WEDGE_SIGN 1 // This function will determine whether or not to create a warped // prediction. int av1_allow_warp(const MB_MODE_INFO *const mbmi, const WarpTypesAllowed *const warp_types, const WarpedMotionParams *const gm_params, - int build_for_obmc, int x_scale, int y_scale, + int build_for_obmc, const struct scale_factors *const sf, WarpedMotionParams *final_warp_params) { - if (x_scale != SCALE_SUBPEL_SHIFTS || y_scale != SCALE_SUBPEL_SHIFTS) - return 0; + // Note: As per the spec, we must test the fixed point scales here, which are + // at a higher precision (1 << 14) than the xs and ys in subpel_params (that + // have 1 << 10 precision). 
+ if (av1_is_scaled(sf)) return 0; if (final_warp_params != NULL) *final_warp_params = default_warp_params; @@ -57,48 +56,114 @@ int av1_allow_warp(const MB_MODE_INFO *const mbmi, return 0; } -void av1_make_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst, - int dst_stride, const SubpelParams *subpel_params, - const struct scale_factors *sf, int w, int h, - ConvolveParams *conv_params, - InterpFilters interp_filters, - const WarpTypesAllowed *warp_types, int p_col, - int p_row, int plane, int ref, - const MB_MODE_INFO *mi, int build_for_obmc, - const MACROBLOCKD *xd, int can_use_previous) { - // Make sure the selected motion mode is valid for this configuration - assert_motion_mode_valid(mi->motion_mode, xd->global_motion, xd, mi, - can_use_previous); - assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL)); - - WarpedMotionParams final_warp_params; - const int do_warp = - (w >= 8 && h >= 8 && - av1_allow_warp(mi, warp_types, &xd->global_motion[mi->ref_frame[ref]], - build_for_obmc, subpel_params->xs, subpel_params->ys, - &final_warp_params)); - const int is_intrabc = mi->use_intrabc; - assert(IMPLIES(is_intrabc, !do_warp)); - - if (do_warp && xd->cur_frame_force_integer_mv == 0) { - const struct macroblockd_plane *const pd = &xd->plane[plane]; - const struct buf_2d *const pre_buf = &pd->pre[ref]; - av1_warp_plane(&final_warp_params, - xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH, xd->bd, - pre_buf->buf0, pre_buf->width, pre_buf->height, - pre_buf->stride, dst, p_col, p_row, w, h, dst_stride, - pd->subsampling_x, pd->subsampling_y, conv_params); - } else if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - highbd_inter_predictor(src, src_stride, dst, dst_stride, subpel_params, sf, - w, h, conv_params, interp_filters, is_intrabc, - xd->bd); +void av1_init_inter_params(InterPredParams *inter_pred_params, int block_width, + int block_height, int pix_row, int pix_col, + int subsampling_x, int subsampling_y, int bit_depth, + int use_hbd_buf, 
int is_intrabc, + const struct scale_factors *sf, + const struct buf_2d *ref_buf, + int_interpfilters interp_filters) { + inter_pred_params->block_width = block_width; + inter_pred_params->block_height = block_height; + inter_pred_params->pix_row = pix_row; + inter_pred_params->pix_col = pix_col; + inter_pred_params->subsampling_x = subsampling_x; + inter_pred_params->subsampling_y = subsampling_y; + inter_pred_params->bit_depth = bit_depth; + inter_pred_params->use_hbd_buf = use_hbd_buf; + inter_pred_params->is_intrabc = is_intrabc; + inter_pred_params->scale_factors = sf; + inter_pred_params->ref_frame_buf = *ref_buf; + inter_pred_params->mode = TRANSLATION_PRED; + inter_pred_params->comp_mode = UNIFORM_SINGLE; + + if (is_intrabc) { + inter_pred_params->interp_filter_params[0] = &av1_intrabc_filter_params; + inter_pred_params->interp_filter_params[1] = &av1_intrabc_filter_params; } else { - inter_predictor(src, src_stride, dst, dst_stride, subpel_params, sf, w, h, - conv_params, interp_filters, is_intrabc); + inter_pred_params->interp_filter_params[0] = + av1_get_interp_filter_params_with_block_size( + interp_filters.as_filters.x_filter, block_width); + inter_pred_params->interp_filter_params[1] = + av1_get_interp_filter_params_with_block_size( + interp_filters.as_filters.y_filter, block_height); + } +} + +void av1_init_comp_mode(InterPredParams *inter_pred_params) { + inter_pred_params->comp_mode = UNIFORM_COMP; +} + +void av1_init_warp_params(InterPredParams *inter_pred_params, + const WarpTypesAllowed *warp_types, int ref, + const MACROBLOCKD *xd, const MB_MODE_INFO *mi) { + if (inter_pred_params->block_height < 8 || inter_pred_params->block_width < 8) + return; + + if (xd->cur_frame_force_integer_mv) return; + + if (av1_allow_warp(mi, warp_types, &xd->global_motion[mi->ref_frame[ref]], 0, + inter_pred_params->scale_factors, + &inter_pred_params->warp_params)) + inter_pred_params->mode = WARP_PRED; +} + +void av1_init_mask_comp(InterPredParams 
*inter_pred_params, BLOCK_SIZE bsize, + const INTERINTER_COMPOUND_DATA *mask_comp) { + inter_pred_params->sb_type = bsize; + inter_pred_params->mask_comp = *mask_comp; + + if (inter_pred_params->conv_params.compound_index == 1) { + inter_pred_params->conv_params.do_average = 0; + inter_pred_params->comp_mode = MASK_COMP; + } +} + +void av1_make_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, + InterPredParams *inter_pred_params, + const SubpelParams *subpel_params) { + assert(IMPLIES(inter_pred_params->conv_params.is_compound, + inter_pred_params->conv_params.dst != NULL)); + + // TODO(jingning): av1_warp_plane() can be further cleaned up. + if (inter_pred_params->mode == WARP_PRED) { + av1_warp_plane( + &inter_pred_params->warp_params, inter_pred_params->use_hbd_buf, + inter_pred_params->bit_depth, inter_pred_params->ref_frame_buf.buf0, + inter_pred_params->ref_frame_buf.width, + inter_pred_params->ref_frame_buf.height, + inter_pred_params->ref_frame_buf.stride, dst, + inter_pred_params->pix_col, inter_pred_params->pix_row, + inter_pred_params->block_width, inter_pred_params->block_height, + dst_stride, inter_pred_params->subsampling_x, + inter_pred_params->subsampling_y, &inter_pred_params->conv_params); + } else if (inter_pred_params->mode == TRANSLATION_PRED) { +#if CONFIG_AV1_HIGHBITDEPTH + if (inter_pred_params->use_hbd_buf) { + highbd_inter_predictor( + src, src_stride, dst, dst_stride, subpel_params, + inter_pred_params->scale_factors, inter_pred_params->block_width, + inter_pred_params->block_height, &inter_pred_params->conv_params, + inter_pred_params->interp_filter_params, + inter_pred_params->bit_depth); + } else { + inter_predictor( + src, src_stride, dst, dst_stride, subpel_params, + inter_pred_params->scale_factors, inter_pred_params->block_width, + inter_pred_params->block_height, &inter_pred_params->conv_params, + inter_pred_params->interp_filter_params); + } +#else + inter_predictor( + src, src_stride, dst, 
dst_stride, subpel_params, + inter_pred_params->scale_factors, inter_pred_params->block_width, + inter_pred_params->block_height, &inter_pred_params->conv_params, + inter_pred_params->interp_filter_params); +#endif } } -#if USE_PRECOMPUTED_WEDGE_MASK static const uint8_t wedge_master_oblique_odd[MASK_MASTER_SIZE] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 6, 18, @@ -118,7 +183,8 @@ static const uint8_t wedge_master_vertical[MASK_MASTER_SIZE] = { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, }; -static void shift_copy(const uint8_t *src, uint8_t *dst, int shift, int width) { +static AOM_INLINE void shift_copy(const uint8_t *src, uint8_t *dst, int shift, + int width) { if (shift >= 0) { memcpy(dst + shift, src, width - shift); memset(dst, src[0], shift); @@ -128,9 +194,7 @@ static void shift_copy(const uint8_t *src, uint8_t *dst, int shift, int width) { memset(dst + width - shift, src[width - 1], shift); } } -#endif // USE_PRECOMPUTED_WEDGE_MASK -#if USE_PRECOMPUTED_WEDGE_SIGN /* clang-format off */ DECLARE_ALIGNED(16, static uint8_t, wedge_signflip_lookup[BLOCK_SIZES_ALL][MAX_WEDGE_TYPES]) = { @@ -158,10 +222,6 @@ DECLARE_ALIGNED(16, static uint8_t, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, // not used }; /* clang-format on */ -#else -DECLARE_ALIGNED(16, static uint8_t, - wedge_signflip_lookup[BLOCK_SIZES_ALL][MAX_WEDGE_TYPES]); -#endif // USE_PRECOMPUTED_WEDGE_SIGN // [negative][direction] DECLARE_ALIGNED( @@ -173,6 +233,10 @@ DECLARE_ALIGNED( DECLARE_ALIGNED(16, static uint8_t, wedge_mask_buf[2 * MAX_WEDGE_TYPES * 4 * MAX_WEDGE_SQUARE]); +DECLARE_ALIGNED(16, static uint8_t, + smooth_interintra_mask_buf[INTERINTRA_MODES][BLOCK_SIZES_ALL] + [MAX_WEDGE_SQUARE]); + static wedge_masks_type wedge_masks[BLOCK_SIZES_ALL][2]; static const wedge_code_type wedge_codebook_16_hgtw[16] = { @@ -208,23 +272,23 @@ static const wedge_code_type wedge_codebook_16_heqw[16] = { { WEDGE_OBLIQUE117, 2, 4 }, { 
WEDGE_OBLIQUE117, 6, 4 }, }; -const wedge_params_type wedge_params_lookup[BLOCK_SIZES_ALL] = { +const wedge_params_type av1_wedge_params_lookup[BLOCK_SIZES_ALL] = { { 0, NULL, NULL, NULL }, { 0, NULL, NULL, NULL }, { 0, NULL, NULL, NULL }, - { 4, wedge_codebook_16_heqw, wedge_signflip_lookup[BLOCK_8X8], + { MAX_WEDGE_TYPES, wedge_codebook_16_heqw, wedge_signflip_lookup[BLOCK_8X8], wedge_masks[BLOCK_8X8] }, - { 4, wedge_codebook_16_hgtw, wedge_signflip_lookup[BLOCK_8X16], + { MAX_WEDGE_TYPES, wedge_codebook_16_hgtw, wedge_signflip_lookup[BLOCK_8X16], wedge_masks[BLOCK_8X16] }, - { 4, wedge_codebook_16_hltw, wedge_signflip_lookup[BLOCK_16X8], + { MAX_WEDGE_TYPES, wedge_codebook_16_hltw, wedge_signflip_lookup[BLOCK_16X8], wedge_masks[BLOCK_16X8] }, - { 4, wedge_codebook_16_heqw, wedge_signflip_lookup[BLOCK_16X16], + { MAX_WEDGE_TYPES, wedge_codebook_16_heqw, wedge_signflip_lookup[BLOCK_16X16], wedge_masks[BLOCK_16X16] }, - { 4, wedge_codebook_16_hgtw, wedge_signflip_lookup[BLOCK_16X32], + { MAX_WEDGE_TYPES, wedge_codebook_16_hgtw, wedge_signflip_lookup[BLOCK_16X32], wedge_masks[BLOCK_16X32] }, - { 4, wedge_codebook_16_hltw, wedge_signflip_lookup[BLOCK_32X16], + { MAX_WEDGE_TYPES, wedge_codebook_16_hltw, wedge_signflip_lookup[BLOCK_32X16], wedge_masks[BLOCK_32X16] }, - { 4, wedge_codebook_16_heqw, wedge_signflip_lookup[BLOCK_32X32], + { MAX_WEDGE_TYPES, wedge_codebook_16_heqw, wedge_signflip_lookup[BLOCK_32X32], wedge_masks[BLOCK_32X32] }, { 0, NULL, NULL, NULL }, { 0, NULL, NULL, NULL }, @@ -234,9 +298,9 @@ const wedge_params_type wedge_params_lookup[BLOCK_SIZES_ALL] = { { 0, NULL, NULL, NULL }, { 0, NULL, NULL, NULL }, { 0, NULL, NULL, NULL }, - { 4, wedge_codebook_16_hgtw, wedge_signflip_lookup[BLOCK_8X32], + { MAX_WEDGE_TYPES, wedge_codebook_16_hgtw, wedge_signflip_lookup[BLOCK_8X32], wedge_masks[BLOCK_8X32] }, - { 4, wedge_codebook_16_hltw, wedge_signflip_lookup[BLOCK_32X8], + { MAX_WEDGE_TYPES, wedge_codebook_16_hltw, wedge_signflip_lookup[BLOCK_32X8], 
wedge_masks[BLOCK_32X8] }, { 0, NULL, NULL, NULL }, { 0, NULL, NULL, NULL }, @@ -248,12 +312,12 @@ static const uint8_t *get_wedge_mask_inplace(int wedge_index, int neg, const int bh = block_size_high[sb_type]; const int bw = block_size_wide[sb_type]; const wedge_code_type *a = - wedge_params_lookup[sb_type].codebook + wedge_index; + av1_wedge_params_lookup[sb_type].codebook + wedge_index; int woff, hoff; - const uint8_t wsignflip = wedge_params_lookup[sb_type].signflip[wedge_index]; + const uint8_t wsignflip = + av1_wedge_params_lookup[sb_type].signflip[wedge_index]; - assert(wedge_index >= 0 && - wedge_index < (1 << get_wedge_bits_lookup(sb_type))); + assert(wedge_index >= 0 && wedge_index < get_wedge_types_lookup(sb_type)); woff = (a->x_offset * bw) >> 3; hoff = (a->y_offset * bh) >> 3; master = wedge_mask_obl[neg ^ wsignflip][a->direction] + @@ -275,10 +339,10 @@ const uint8_t *av1_get_compound_type_mask( } } -static void diffwtd_mask_d16(uint8_t *mask, int which_inverse, int mask_base, - const CONV_BUF_TYPE *src0, int src0_stride, - const CONV_BUF_TYPE *src1, int src1_stride, int h, - int w, ConvolveParams *conv_params, int bd) { +static AOM_INLINE void diffwtd_mask_d16( + uint8_t *mask, int which_inverse, int mask_base, const CONV_BUF_TYPE *src0, + int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, + ConvolveParams *conv_params, int bd) { int round = 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1 + (bd - 8); int i, j, m, diff; @@ -309,9 +373,10 @@ void av1_build_compound_diffwtd_mask_d16_c( } } -static void diffwtd_mask(uint8_t *mask, int which_inverse, int mask_base, - const uint8_t *src0, int src0_stride, - const uint8_t *src1, int src1_stride, int h, int w) { +static AOM_INLINE void diffwtd_mask(uint8_t *mask, int which_inverse, + int mask_base, const uint8_t *src0, + int src0_stride, const uint8_t *src1, + int src1_stride, int h, int w) { int i, j, m, diff; for (i = 0; i < h; ++i) { for (j = 0; j < w; ++j) { @@ 
-419,13 +484,12 @@ void av1_build_compound_diffwtd_mask_highbd_c( } } -static void init_wedge_master_masks() { +static AOM_INLINE void init_wedge_master_masks() { int i, j; const int w = MASK_MASTER_SIZE; const int h = MASK_MASTER_SIZE; const int stride = MASK_MASTER_STRIDE; -// Note: index [0] stores the masters, and [1] its complement. -#if USE_PRECOMPUTED_WEDGE_MASK + // Note: index [0] stores the masters, and [1] its complement. // Generate prototype by shifting the masters int shift = h / 4; for (i = 0; i < h; i += 2) { @@ -443,22 +507,7 @@ static void init_wedge_master_masks() { wedge_master_vertical, MASK_MASTER_SIZE * sizeof(wedge_master_vertical[0])); } -#else - static const double smoother_param = 2.85; - const int a[2] = { 2, 1 }; - const double asqrt = sqrt(a[0] * a[0] + a[1] * a[1]); - for (i = 0; i < h; i++) { - for (j = 0; j < w; ++j) { - int x = (2 * j + 1 - w); - int y = (2 * i + 1 - h); - double d = (a[0] * x + a[1] * y) / asqrt; - const int msk = (int)rint((1.0 + tanh(d / smoother_param)) * 32); - wedge_mask_obl[0][WEDGE_OBLIQUE63][i * stride + j] = msk; - const int mskx = (int)rint((1.0 + tanh(x / smoother_param)) * 32); - wedge_mask_obl[0][WEDGE_VERTICAL][i * stride + j] = mskx; - } - } -#endif // USE_PRECOMPUTED_WEDGE_MASK + for (i = 0; i < h; ++i) { for (j = 0; j < w; ++j) { const int msk = wedge_mask_obl[0][WEDGE_OBLIQUE63][i * stride + j]; @@ -480,57 +529,18 @@ static void init_wedge_master_masks() { } } -#if !USE_PRECOMPUTED_WEDGE_SIGN -// If the signs for the wedges for various blocksizes are -// inconsistent flip the sign flag. Do it only once for every -// wedge codebook. 
-static void init_wedge_signs() { - BLOCK_SIZE sb_type; - memset(wedge_signflip_lookup, 0, sizeof(wedge_signflip_lookup)); - for (sb_type = BLOCK_4X4; sb_type < BLOCK_SIZES_ALL; ++sb_type) { - const int bw = block_size_wide[sb_type]; - const int bh = block_size_high[sb_type]; - const wedge_params_type wedge_params = wedge_params_lookup[sb_type]; - const int wbits = wedge_params.bits; - const int wtypes = 1 << wbits; - int i, w; - if (wbits) { - for (w = 0; w < wtypes; ++w) { - // Get the mask master, i.e. index [0] - const uint8_t *mask = get_wedge_mask_inplace(w, 0, sb_type); - int avg = 0; - for (i = 0; i < bw; ++i) avg += mask[i]; - for (i = 1; i < bh; ++i) avg += mask[i * MASK_MASTER_STRIDE]; - avg = (avg + (bw + bh - 1) / 2) / (bw + bh - 1); - // Default sign of this wedge is 1 if the average < 32, 0 otherwise. - // If default sign is 1: - // If sign requested is 0, we need to flip the sign and return - // the complement i.e. index [1] instead. If sign requested is 1 - // we need to flip the sign and return index [0] instead. - // If default sign is 0: - // If sign requested is 0, we need to return index [0] the master - // if sign requested is 1, we need to return the complement index [1] - // instead. 
- wedge_params.signflip[w] = (avg < 32); - } - } - } -} -#endif // !USE_PRECOMPUTED_WEDGE_SIGN - -static void init_wedge_masks() { +static AOM_INLINE void init_wedge_masks() { uint8_t *dst = wedge_mask_buf; BLOCK_SIZE bsize; memset(wedge_masks, 0, sizeof(wedge_masks)); for (bsize = BLOCK_4X4; bsize < BLOCK_SIZES_ALL; ++bsize) { + const wedge_params_type *wedge_params = &av1_wedge_params_lookup[bsize]; + const int wtypes = wedge_params->wedge_types; + if (wtypes == 0) continue; const uint8_t *mask; const int bw = block_size_wide[bsize]; const int bh = block_size_high[bsize]; - const wedge_params_type *wedge_params = &wedge_params_lookup[bsize]; - const int wbits = wedge_params->bits; - const int wtypes = 1 << wbits; int w; - if (wbits == 0) continue; for (w = 0; w < wtypes; ++w) { mask = get_wedge_mask_inplace(w, 0, bsize); aom_convolve_copy(mask, MASK_MASTER_STRIDE, dst, bw, NULL, 0, NULL, 0, bw, @@ -548,109 +558,383 @@ static void init_wedge_masks() { } } +/* clang-format off */ +static const uint8_t ii_weights1d[MAX_SB_SIZE] = { + 60, 58, 56, 54, 52, 50, 48, 47, 45, 44, 42, 41, 39, 38, 37, 35, 34, 33, 32, + 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 22, 21, 20, 19, 19, 18, 18, 17, 16, + 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 10, 10, 10, 9, 9, 9, 8, + 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 4, 4, + 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 +}; +static uint8_t ii_size_scales[BLOCK_SIZES_ALL] = { + 32, 16, 16, 16, 8, 8, 8, 4, + 4, 4, 2, 2, 2, 1, 1, 1, + 8, 8, 4, 4, 2, 2 +}; +/* clang-format on */ + +static AOM_INLINE void build_smooth_interintra_mask(uint8_t *mask, int stride, + BLOCK_SIZE plane_bsize, + INTERINTRA_MODE mode) { + int i, j; + const int bw = block_size_wide[plane_bsize]; + const int bh = block_size_high[plane_bsize]; + const int size_scale = ii_size_scales[plane_bsize]; + + switch (mode) { + case II_V_PRED: + for (i = 
0; i < bh; ++i) { + memset(mask, ii_weights1d[i * size_scale], bw * sizeof(mask[0])); + mask += stride; + } + break; + + case II_H_PRED: + for (i = 0; i < bh; ++i) { + for (j = 0; j < bw; ++j) mask[j] = ii_weights1d[j * size_scale]; + mask += stride; + } + break; + + case II_SMOOTH_PRED: + for (i = 0; i < bh; ++i) { + for (j = 0; j < bw; ++j) + mask[j] = ii_weights1d[(i < j ? i : j) * size_scale]; + mask += stride; + } + break; + + case II_DC_PRED: + default: + for (i = 0; i < bh; ++i) { + memset(mask, 32, bw * sizeof(mask[0])); + mask += stride; + } + break; + } +} + +static AOM_INLINE void init_smooth_interintra_masks() { + for (int m = 0; m < INTERINTRA_MODES; ++m) { + for (int bs = 0; bs < BLOCK_SIZES_ALL; ++bs) { + const int bw = block_size_wide[bs]; + const int bh = block_size_high[bs]; + if (bw > MAX_WEDGE_SIZE || bh > MAX_WEDGE_SIZE) continue; + build_smooth_interintra_mask(smooth_interintra_mask_buf[m][bs], bw, bs, + m); + } + } +} + // Equation of line: f(x, y) = a[0]*(x - a[2]*w/8) + a[1]*(y - a[3]*h/8) = 0 void av1_init_wedge_masks() { init_wedge_master_masks(); -#if !USE_PRECOMPUTED_WEDGE_SIGN - init_wedge_signs(); -#endif // !USE_PRECOMPUTED_WEDGE_SIGN init_wedge_masks(); + init_smooth_interintra_masks(); } -static void build_masked_compound_no_round( +static AOM_INLINE void build_masked_compound_no_round( uint8_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE sb_type, int h, - int w, ConvolveParams *conv_params, MACROBLOCKD *xd) { - // Derive subsampling from h and w passed in. May be refactored to - // pass in subsampling factors directly. 
- const int subh = (2 << mi_size_high_log2[sb_type]) == h; - const int subw = (2 << mi_size_wide_log2[sb_type]) == w; + int w, InterPredParams *inter_pred_params) { + const int ssy = inter_pred_params->subsampling_y; + const int ssx = inter_pred_params->subsampling_x; const uint8_t *mask = av1_get_compound_type_mask(comp_data, sb_type); - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) + const int mask_stride = block_size_wide[sb_type]; +#if CONFIG_AV1_HIGHBITDEPTH + if (inter_pred_params->use_hbd_buf) { aom_highbd_blend_a64_d16_mask(dst, dst_stride, src0, src0_stride, src1, - src1_stride, mask, block_size_wide[sb_type], - w, h, subw, subh, conv_params, xd->bd); - else + src1_stride, mask, mask_stride, w, h, ssx, + ssy, &inter_pred_params->conv_params, + inter_pred_params->bit_depth); + } else { aom_lowbd_blend_a64_d16_mask(dst, dst_stride, src0, src0_stride, src1, - src1_stride, mask, block_size_wide[sb_type], w, - h, subw, subh, conv_params); + src1_stride, mask, mask_stride, w, h, ssx, ssy, + &inter_pred_params->conv_params); + } +#else + aom_lowbd_blend_a64_d16_mask(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h, ssx, ssy, + &inter_pred_params->conv_params); +#endif } -void av1_make_masked_inter_predictor( - const uint8_t *pre, int pre_stride, uint8_t *dst, int dst_stride, - const SubpelParams *subpel_params, const struct scale_factors *sf, int w, - int h, ConvolveParams *conv_params, InterpFilters interp_filters, int plane, - const WarpTypesAllowed *warp_types, int p_col, int p_row, int ref, - MACROBLOCKD *xd, int can_use_previous) { - MB_MODE_INFO *mi = xd->mi[0]; - (void)dst; - (void)dst_stride; - mi->interinter_comp.seg_mask = xd->seg_mask; - const INTERINTER_COMPOUND_DATA *comp_data = &mi->interinter_comp; - -// We're going to call av1_make_inter_predictor to generate a prediction into -// a temporary buffer, then will blend that temporary buffer with that from -// the other reference. 
-// -#define INTER_PRED_BYTES_PER_PIXEL 2 - - DECLARE_ALIGNED(32, uint8_t, - tmp_buf[INTER_PRED_BYTES_PER_PIXEL * MAX_SB_SQUARE]); -#undef INTER_PRED_BYTES_PER_PIXEL - - uint8_t *tmp_dst = get_buf_by_bd(xd, tmp_buf); +void av1_make_masked_inter_predictor(const uint8_t *pre, int pre_stride, + uint8_t *dst, int dst_stride, + InterPredParams *inter_pred_params, + const SubpelParams *subpel_params) { + const INTERINTER_COMPOUND_DATA *comp_data = &inter_pred_params->mask_comp; + BLOCK_SIZE sb_type = inter_pred_params->sb_type; + + // We're going to call av1_make_inter_predictor to generate a prediction into + // a temporary buffer, then will blend that temporary buffer with that from + // the other reference. + DECLARE_ALIGNED(32, uint8_t, tmp_buf[2 * MAX_SB_SQUARE]); + uint8_t *tmp_dst = + inter_pred_params->use_hbd_buf ? CONVERT_TO_BYTEPTR(tmp_buf) : tmp_buf; const int tmp_buf_stride = MAX_SB_SIZE; - CONV_BUF_TYPE *org_dst = conv_params->dst; - int org_dst_stride = conv_params->dst_stride; + CONV_BUF_TYPE *org_dst = inter_pred_params->conv_params.dst; + int org_dst_stride = inter_pred_params->conv_params.dst_stride; CONV_BUF_TYPE *tmp_buf16 = (CONV_BUF_TYPE *)tmp_buf; - conv_params->dst = tmp_buf16; - conv_params->dst_stride = tmp_buf_stride; - assert(conv_params->do_average == 0); + inter_pred_params->conv_params.dst = tmp_buf16; + inter_pred_params->conv_params.dst_stride = tmp_buf_stride; + assert(inter_pred_params->conv_params.do_average == 0); // This will generate a prediction in tmp_buf for the second reference - av1_make_inter_predictor(pre, pre_stride, tmp_dst, MAX_SB_SIZE, subpel_params, - sf, w, h, conv_params, interp_filters, warp_types, - p_col, p_row, plane, ref, mi, 0, xd, - can_use_previous); + av1_make_inter_predictor(pre, pre_stride, tmp_dst, MAX_SB_SIZE, + inter_pred_params, subpel_params); - if (!plane && comp_data->type == COMPOUND_DIFFWTD) { + if (!inter_pred_params->conv_params.plane && + comp_data->type == COMPOUND_DIFFWTD) { 
av1_build_compound_diffwtd_mask_d16( comp_data->seg_mask, comp_data->mask_type, org_dst, org_dst_stride, - tmp_buf16, tmp_buf_stride, h, w, conv_params, xd->bd); + tmp_buf16, tmp_buf_stride, inter_pred_params->block_height, + inter_pred_params->block_width, &inter_pred_params->conv_params, + inter_pred_params->bit_depth); + } + build_masked_compound_no_round( + dst, dst_stride, org_dst, org_dst_stride, tmp_buf16, tmp_buf_stride, + comp_data, sb_type, inter_pred_params->block_height, + inter_pred_params->block_width, inter_pred_params); +} + +void av1_build_one_inter_predictor( + uint8_t *dst, int dst_stride, const MV *const src_mv, + InterPredParams *inter_pred_params, MACROBLOCKD *xd, int mi_x, int mi_y, + int ref, CalcSubpelParamsFunc calc_subpel_params_func) { + SubpelParams subpel_params; + uint8_t *src; + int src_stride; + calc_subpel_params_func(src_mv, inter_pred_params, xd, mi_x, mi_y, ref, &src, + &subpel_params, &src_stride); + + if (inter_pred_params->comp_mode == UNIFORM_SINGLE || + inter_pred_params->comp_mode == UNIFORM_COMP) { + av1_make_inter_predictor(src, src_stride, dst, dst_stride, + inter_pred_params, &subpel_params); + } else { + av1_make_masked_inter_predictor(src, src_stride, dst, dst_stride, + inter_pred_params, &subpel_params); } - build_masked_compound_no_round(dst, dst_stride, org_dst, org_dst_stride, - tmp_buf16, tmp_buf_stride, comp_data, - mi->sb_type, h, w, conv_params, xd); } -void av1_jnt_comp_weight_assign(const AV1_COMMON *cm, const MB_MODE_INFO *mbmi, - int order_idx, int *fwd_offset, int *bck_offset, - int *use_jnt_comp_avg, int is_compound) { +// True if the following hold: +// 1. Not intrabc and not build_for_obmc +// 2. A U or V plane +// 3. If the block size differs from the base block size +// 4. 
If sub-sampled, none of the previous blocks around the sub-sample +// are intrabc or inter-blocks +static bool is_sub8x8_inter(const MACROBLOCKD *xd, int plane, BLOCK_SIZE bsize, + int is_intrabc, int build_for_obmc) { + if (is_intrabc || build_for_obmc) { + return false; + } + + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const int ss_x = pd->subsampling_x; + const int ss_y = pd->subsampling_y; + if ((block_size_wide[bsize] >= 8 || !ss_x) && + (block_size_high[bsize] >= 8 || !ss_y)) { + return false; + } + + // For sub8x8 chroma blocks, we may be covering more than one luma block's + // worth of pixels. Thus (mi_x, mi_y) may not be the correct coordinates for + // the top-left corner of the prediction source - the correct top-left corner + // is at (pre_x, pre_y). + const int row_start = (block_size_high[bsize] == 4) && ss_y ? -1 : 0; + const int col_start = (block_size_wide[bsize] == 4) && ss_x ? -1 : 0; + + for (int row = row_start; row <= 0; ++row) { + for (int col = col_start; col <= 0; ++col) { + const MB_MODE_INFO *this_mbmi = xd->mi[row * xd->mi_stride + col]; + if (!is_inter_block(this_mbmi)) return false; + if (is_intrabc_block(this_mbmi)) return false; + } + } + return true; +} + +static void build_inter_predictors_sub8x8( + const AV1_COMMON *cm, MACROBLOCKD *xd, int plane, const MB_MODE_INFO *mi, + int bw, int bh, int mi_x, int mi_y, + CalcSubpelParamsFunc calc_subpel_params_func) { + const BLOCK_SIZE bsize = mi->sb_type; + struct macroblockd_plane *const pd = &xd->plane[plane]; + const bool ss_x = pd->subsampling_x; + const bool ss_y = pd->subsampling_y; + const int b4_w = block_size_wide[bsize] >> ss_x; + const int b4_h = block_size_high[bsize] >> ss_y; + const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, ss_x, ss_y); + const int b8_w = block_size_wide[plane_bsize]; + const int b8_h = block_size_high[plane_bsize]; + const int is_compound = has_second_ref(mi); + assert(!is_compound); + assert(!is_intrabc_block(mi)); + + // 
For sub8x8 chroma blocks, we may be covering more than one luma block's + // worth of pixels. Thus (mi_x, mi_y) may not be the correct coordinates for + // the top-left corner of the prediction source - the correct top-left corner + // is at (pre_x, pre_y). + const int row_start = (block_size_high[bsize] == 4) && ss_y ? -1 : 0; + const int col_start = (block_size_wide[bsize] == 4) && ss_x ? -1 : 0; + const int pre_x = (mi_x + MI_SIZE * col_start) >> ss_x; + const int pre_y = (mi_y + MI_SIZE * row_start) >> ss_y; + + int row = row_start; + for (int y = 0; y < b8_h; y += b4_h) { + int col = col_start; + for (int x = 0; x < b8_w; x += b4_w) { + MB_MODE_INFO *this_mbmi = xd->mi[row * xd->mi_stride + col]; + int tmp_dst_stride = 8; + assert(bw < 8 || bh < 8); + (void)bw; + (void)bh; + struct buf_2d *const dst_buf = &pd->dst; + uint8_t *dst = dst_buf->buf + dst_buf->stride * y + x; + int ref = 0; + const RefCntBuffer *ref_buf = + get_ref_frame_buf(cm, this_mbmi->ref_frame[ref]); + const struct scale_factors *ref_scale_factors = + get_ref_scale_factors_const(cm, this_mbmi->ref_frame[ref]); + const struct scale_factors *const sf = ref_scale_factors; + const struct buf_2d pre_buf = { + NULL, + (plane == 1) ? 
ref_buf->buf.u_buffer : ref_buf->buf.v_buffer, + ref_buf->buf.uv_crop_width, + ref_buf->buf.uv_crop_height, + ref_buf->buf.uv_stride, + }; + + const MV mv = this_mbmi->mv[ref].as_mv; + + InterPredParams inter_pred_params; + av1_init_inter_params(&inter_pred_params, b4_w, b4_h, pre_y + y, + pre_x + x, pd->subsampling_x, pd->subsampling_y, + xd->bd, is_cur_buf_hbd(xd), mi->use_intrabc, sf, + &pre_buf, this_mbmi->interp_filters); + inter_pred_params.conv_params = get_conv_params_no_round( + ref, plane, xd->tmp_conv_dst, tmp_dst_stride, is_compound, xd->bd); + inter_pred_params.conv_params.use_dist_wtd_comp_avg = 0; + + av1_build_one_inter_predictor(dst, dst_buf->stride, &mv, + &inter_pred_params, xd, mi_x + x, mi_y + y, + ref, calc_subpel_params_func); + + ++col; + } + ++row; + } +} + +static void build_inter_predictors_8x8_and_bigger( + const AV1_COMMON *cm, MACROBLOCKD *xd, int plane, const MB_MODE_INFO *mi, + int build_for_obmc, int bw, int bh, int mi_x, int mi_y, + CalcSubpelParamsFunc calc_subpel_params_func) { + const int is_compound = has_second_ref(mi); + const int is_intrabc = is_intrabc_block(mi); + assert(IMPLIES(is_intrabc, !is_compound)); + struct macroblockd_plane *const pd = &xd->plane[plane]; + struct buf_2d *const dst_buf = &pd->dst; + uint8_t *const dst = dst_buf->buf; + + int is_global[2] = { 0, 0 }; + for (int ref = 0; ref < 1 + is_compound; ++ref) { + const WarpedMotionParams *const wm = &xd->global_motion[mi->ref_frame[ref]]; + is_global[ref] = is_global_mv_block(mi, wm->wmtype); + } + + const BLOCK_SIZE bsize = mi->sb_type; + const int ss_x = pd->subsampling_x; + const int ss_y = pd->subsampling_y; + const int row_start = + (block_size_high[bsize] == 4) && ss_y && !build_for_obmc ? -1 : 0; + const int col_start = + (block_size_wide[bsize] == 4) && ss_x && !build_for_obmc ? 
-1 : 0; + const int pre_x = (mi_x + MI_SIZE * col_start) >> ss_x; + const int pre_y = (mi_y + MI_SIZE * row_start) >> ss_y; + + for (int ref = 0; ref < 1 + is_compound; ++ref) { + const struct scale_factors *const sf = + is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref]; + struct buf_2d *const pre_buf = is_intrabc ? dst_buf : &pd->pre[ref]; + const MV mv = mi->mv[ref].as_mv; + const WarpTypesAllowed warp_types = { is_global[ref], + mi->motion_mode == WARPED_CAUSAL }; + + InterPredParams inter_pred_params; + av1_init_inter_params(&inter_pred_params, bw, bh, pre_y, pre_x, + pd->subsampling_x, pd->subsampling_y, xd->bd, + is_cur_buf_hbd(xd), mi->use_intrabc, sf, pre_buf, + mi->interp_filters); + if (is_compound) av1_init_comp_mode(&inter_pred_params); + inter_pred_params.conv_params = get_conv_params_no_round( + ref, plane, xd->tmp_conv_dst, MAX_SB_SIZE, is_compound, xd->bd); + + av1_dist_wtd_comp_weight_assign( + cm, mi, 0, &inter_pred_params.conv_params.fwd_offset, + &inter_pred_params.conv_params.bck_offset, + &inter_pred_params.conv_params.use_dist_wtd_comp_avg, is_compound); + + if (!build_for_obmc) + av1_init_warp_params(&inter_pred_params, &warp_types, ref, xd, mi); + + if (is_masked_compound_type(mi->interinter_comp.type)) { + av1_init_mask_comp(&inter_pred_params, mi->sb_type, &mi->interinter_comp); + // Assign physical buffer. 
+ inter_pred_params.mask_comp.seg_mask = xd->seg_mask; + } + + av1_build_one_inter_predictor(dst, dst_buf->stride, &mv, &inter_pred_params, + xd, mi_x, mi_y, ref, calc_subpel_params_func); + } +} + +void av1_build_inter_predictors(const AV1_COMMON *cm, MACROBLOCKD *xd, + int plane, const MB_MODE_INFO *mi, + int build_for_obmc, int bw, int bh, int mi_x, + int mi_y, + CalcSubpelParamsFunc calc_subpel_params_func) { + if (is_sub8x8_inter(xd, plane, mi->sb_type, is_intrabc_block(mi), + build_for_obmc)) { + build_inter_predictors_sub8x8(cm, xd, plane, mi, bw, bh, mi_x, mi_y, + calc_subpel_params_func); + } else { + build_inter_predictors_8x8_and_bigger(cm, xd, plane, mi, build_for_obmc, bw, + bh, mi_x, mi_y, + calc_subpel_params_func); + } +} + +void av1_dist_wtd_comp_weight_assign(const AV1_COMMON *cm, + const MB_MODE_INFO *mbmi, int order_idx, + int *fwd_offset, int *bck_offset, + int *use_dist_wtd_comp_avg, + int is_compound) { assert(fwd_offset != NULL && bck_offset != NULL); if (!is_compound || mbmi->compound_idx) { - *use_jnt_comp_avg = 0; + *use_dist_wtd_comp_avg = 0; return; } - *use_jnt_comp_avg = 1; - const int bck_idx = cm->frame_refs[mbmi->ref_frame[0] - LAST_FRAME].idx; - const int fwd_idx = cm->frame_refs[mbmi->ref_frame[1] - LAST_FRAME].idx; - const int cur_frame_index = cm->cur_frame->cur_frame_offset; + *use_dist_wtd_comp_avg = 1; + const RefCntBuffer *const bck_buf = get_ref_frame_buf(cm, mbmi->ref_frame[0]); + const RefCntBuffer *const fwd_buf = get_ref_frame_buf(cm, mbmi->ref_frame[1]); + const int cur_frame_index = cm->cur_frame->order_hint; int bck_frame_index = 0, fwd_frame_index = 0; - if (bck_idx >= 0) { - bck_frame_index = cm->buffer_pool->frame_bufs[bck_idx].cur_frame_offset; - } - - if (fwd_idx >= 0) { - fwd_frame_index = cm->buffer_pool->frame_bufs[fwd_idx].cur_frame_offset; - } + if (bck_buf != NULL) bck_frame_index = bck_buf->order_hint; + if (fwd_buf != NULL) fwd_frame_index = fwd_buf->order_hint; - int d0 = 
clamp(abs(get_relative_dist(cm, fwd_frame_index, cur_frame_index)), + int d0 = clamp(abs(get_relative_dist(&cm->seq_params.order_hint_info, + fwd_frame_index, cur_frame_index)), 0, MAX_FRAME_DISTANCE); - int d1 = clamp(abs(get_relative_dist(cm, cur_frame_index, bck_frame_index)), + int d1 = clamp(abs(get_relative_dist(&cm->seq_params.order_hint_info, + cur_frame_index, bck_frame_index)), 0, MAX_FRAME_DISTANCE); const int order = d0 <= d1; @@ -708,10 +992,9 @@ void av1_setup_pre_planes(MACROBLOCKD *xd, int idx, // obmc_mask_N[overlap_position] static const uint8_t obmc_mask_1[1] = { 64 }; +DECLARE_ALIGNED(2, static const uint8_t, obmc_mask_2[2]) = { 45, 64 }; -static const uint8_t obmc_mask_2[2] = { 45, 64 }; - -static const uint8_t obmc_mask_4[4] = { 39, 50, 59, 64 }; +DECLARE_ALIGNED(4, static const uint8_t, obmc_mask_4[4]) = { 39, 50, 59, 64 }; static const uint8_t obmc_mask_8[8] = { 36, 42, 48, 53, 57, 61, 64, 64 }; @@ -743,19 +1026,21 @@ const uint8_t *av1_get_obmc_mask(int length) { } } -static INLINE void increment_int_ptr(MACROBLOCKD *xd, int rel_mi_rc, - uint8_t mi_hw, MB_MODE_INFO *mi, - void *fun_ctxt, const int num_planes) { +static INLINE void increment_int_ptr(MACROBLOCKD *xd, int rel_mi_row, + int rel_mi_col, uint8_t op_mi_size, + int dir, MB_MODE_INFO *mi, void *fun_ctxt, + const int num_planes) { (void)xd; - (void)rel_mi_rc; - (void)mi_hw; + (void)rel_mi_row; + (void)rel_mi_col; + (void)op_mi_size; + (void)dir; (void)mi; ++*(int *)fun_ctxt; (void)num_planes; } -void av1_count_overlappable_neighbors(const AV1_COMMON *cm, MACROBLOCKD *xd, - int mi_row, int mi_col) { +void av1_count_overlappable_neighbors(const AV1_COMMON *cm, MACROBLOCKD *xd) { MB_MODE_INFO *mbmi = xd->mi[0]; mbmi->overlappable_neighbors[0] = 0; @@ -763,9 +1048,9 @@ void av1_count_overlappable_neighbors(const AV1_COMMON *cm, MACROBLOCKD *xd, if (!is_motion_variation_allowed_bsize(mbmi->sb_type)) return; - foreach_overlappable_nb_above(cm, xd, mi_col, INT_MAX, increment_int_ptr, + 
foreach_overlappable_nb_above(cm, xd, INT_MAX, increment_int_ptr, &mbmi->overlappable_neighbors[0]); - foreach_overlappable_nb_left(cm, xd, mi_row, INT_MAX, increment_int_ptr, + foreach_overlappable_nb_left(cm, xd, INT_MAX, increment_int_ptr, &mbmi->overlappable_neighbors[1]); } @@ -806,21 +1091,20 @@ struct obmc_inter_pred_ctxt { int *adjacent_stride; }; -static INLINE void build_obmc_inter_pred_above(MACROBLOCKD *xd, int rel_mi_col, - uint8_t above_mi_width, - MB_MODE_INFO *above_mi, - void *fun_ctxt, - const int num_planes) { +static INLINE void build_obmc_inter_pred_above( + MACROBLOCKD *xd, int rel_mi_row, int rel_mi_col, uint8_t op_mi_size, + int dir, MB_MODE_INFO *above_mi, void *fun_ctxt, const int num_planes) { (void)above_mi; + (void)rel_mi_row; + (void)dir; struct obmc_inter_pred_ctxt *ctxt = (struct obmc_inter_pred_ctxt *)fun_ctxt; const BLOCK_SIZE bsize = xd->mi[0]->sb_type; - const int is_hbd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0; const int overlap = AOMMIN(block_size_high[bsize], block_size_high[BLOCK_64X64]) >> 1; for (int plane = 0; plane < num_planes; ++plane) { const struct macroblockd_plane *pd = &xd->plane[plane]; - const int bw = (above_mi_width * MI_SIZE) >> pd->subsampling_x; + const int bw = (op_mi_size * MI_SIZE) >> pd->subsampling_x; const int bh = overlap >> pd->subsampling_y; const int plane_col = (rel_mi_col * MI_SIZE) >> pd->subsampling_x; @@ -831,32 +1115,36 @@ static INLINE void build_obmc_inter_pred_above(MACROBLOCKD *xd, int rel_mi_col, const int tmp_stride = ctxt->adjacent_stride[plane]; const uint8_t *const tmp = &ctxt->adjacent[plane][plane_col]; const uint8_t *const mask = av1_get_obmc_mask(bh); - +#if CONFIG_AV1_HIGHBITDEPTH + const int is_hbd = is_cur_buf_hbd(xd); if (is_hbd) aom_highbd_blend_a64_vmask(dst, dst_stride, dst, dst_stride, tmp, tmp_stride, mask, bw, bh, xd->bd); else aom_blend_a64_vmask(dst, dst_stride, dst, dst_stride, tmp, tmp_stride, mask, bw, bh); +#else + aom_blend_a64_vmask(dst, 
dst_stride, dst, dst_stride, tmp, tmp_stride, mask, + bw, bh); +#endif } } -static INLINE void build_obmc_inter_pred_left(MACROBLOCKD *xd, int rel_mi_row, - uint8_t left_mi_height, - MB_MODE_INFO *left_mi, - void *fun_ctxt, - const int num_planes) { +static INLINE void build_obmc_inter_pred_left( + MACROBLOCKD *xd, int rel_mi_row, int rel_mi_col, uint8_t op_mi_size, + int dir, MB_MODE_INFO *left_mi, void *fun_ctxt, const int num_planes) { (void)left_mi; + (void)rel_mi_col; + (void)dir; struct obmc_inter_pred_ctxt *ctxt = (struct obmc_inter_pred_ctxt *)fun_ctxt; const BLOCK_SIZE bsize = xd->mi[0]->sb_type; const int overlap = AOMMIN(block_size_wide[bsize], block_size_wide[BLOCK_64X64]) >> 1; - const int is_hbd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0; for (int plane = 0; plane < num_planes; ++plane) { const struct macroblockd_plane *pd = &xd->plane[plane]; const int bw = overlap >> pd->subsampling_x; - const int bh = (left_mi_height * MI_SIZE) >> pd->subsampling_y; + const int bh = (op_mi_size * MI_SIZE) >> pd->subsampling_y; const int plane_row = (rel_mi_row * MI_SIZE) >> pd->subsampling_y; if (av1_skip_u4x4_pred_in_obmc(bsize, pd, 1)) continue; @@ -867,12 +1155,18 @@ static INLINE void build_obmc_inter_pred_left(MACROBLOCKD *xd, int rel_mi_row, const uint8_t *const tmp = &ctxt->adjacent[plane][plane_row * tmp_stride]; const uint8_t *const mask = av1_get_obmc_mask(bw); +#if CONFIG_AV1_HIGHBITDEPTH + const int is_hbd = is_cur_buf_hbd(xd); if (is_hbd) aom_highbd_blend_a64_hmask(dst, dst_stride, dst, dst_stride, tmp, tmp_stride, mask, bw, bh, xd->bd); else aom_blend_a64_hmask(dst, dst_stride, dst, dst_stride, tmp, tmp_stride, mask, bw, bh); +#else + aom_blend_a64_hmask(dst, dst_stride, dst, dst_stride, tmp, tmp_stride, mask, + bw, bh); +#endif } } @@ -881,7 +1175,6 @@ static INLINE void build_obmc_inter_pred_left(MACROBLOCKD *xd, int rel_mi_row, // prediction. 
We assume the original prediction (bmc) is stored in // xd->plane[].dst.buf void av1_build_obmc_inter_prediction(const AV1_COMMON *cm, MACROBLOCKD *xd, - int mi_row, int mi_col, uint8_t *above[MAX_MB_PLANE], int above_stride[MAX_MB_PLANE], uint8_t *left[MAX_MB_PLANE], @@ -890,23 +1183,54 @@ void av1_build_obmc_inter_prediction(const AV1_COMMON *cm, MACROBLOCKD *xd, // handle above row struct obmc_inter_pred_ctxt ctxt_above = { above, above_stride }; - foreach_overlappable_nb_above(cm, xd, mi_col, + foreach_overlappable_nb_above(cm, xd, max_neighbor_obmc[mi_size_wide_log2[bsize]], build_obmc_inter_pred_above, &ctxt_above); // handle left column struct obmc_inter_pred_ctxt ctxt_left = { left, left_stride }; - foreach_overlappable_nb_left(cm, xd, mi_row, + foreach_overlappable_nb_left(cm, xd, max_neighbor_obmc[mi_size_high_log2[bsize]], build_obmc_inter_pred_left, &ctxt_left); } +void av1_setup_address_for_obmc(MACROBLOCKD *xd, int mi_row_offset, + int mi_col_offset, MB_MODE_INFO *ref_mbmi, + struct build_prediction_ctxt *ctxt, + const int num_planes) { + const BLOCK_SIZE ref_bsize = AOMMAX(BLOCK_8X8, ref_mbmi->sb_type); + const int ref_mi_row = xd->mi_row + mi_row_offset; + const int ref_mi_col = xd->mi_col + mi_col_offset; + + for (int plane = 0; plane < num_planes; ++plane) { + struct macroblockd_plane *const pd = &xd->plane[plane]; + setup_pred_plane(&pd->dst, ref_bsize, ctxt->tmp_buf[plane], + ctxt->tmp_width[plane], ctxt->tmp_height[plane], + ctxt->tmp_stride[plane], mi_row_offset, mi_col_offset, + NULL, pd->subsampling_x, pd->subsampling_y); + } + + const MV_REFERENCE_FRAME frame = ref_mbmi->ref_frame[0]; + + const RefCntBuffer *const ref_buf = get_ref_frame_buf(ctxt->cm, frame); + const struct scale_factors *const sf = + get_ref_scale_factors_const(ctxt->cm, frame); + + xd->block_ref_scale_factors[0] = sf; + if ((!av1_is_valid_scale(sf))) + aom_internal_error(xd->error_info, AOM_CODEC_UNSUP_BITSTREAM, + "Reference frame has invalid dimensions"); + + 
av1_setup_pre_planes(xd, 0, &ref_buf->buf, ref_mi_row, ref_mi_col, sf, + num_planes); +} + void av1_setup_build_prediction_by_above_pred( MACROBLOCKD *xd, int rel_mi_col, uint8_t above_mi_width, MB_MODE_INFO *above_mbmi, struct build_prediction_ctxt *ctxt, const int num_planes) { const BLOCK_SIZE a_bsize = AOMMAX(BLOCK_8X8, above_mbmi->sb_type); - const int above_mi_col = ctxt->mi_col + rel_mi_col; + const int above_mi_col = xd->mi_col + rel_mi_col; av1_modify_neighbor_predictor_for_obmc(above_mbmi); @@ -922,19 +1246,21 @@ void av1_setup_build_prediction_by_above_pred( for (int ref = 0; ref < num_refs; ++ref) { const MV_REFERENCE_FRAME frame = above_mbmi->ref_frame[ref]; - const RefBuffer *const ref_buf = &ctxt->cm->frame_refs[frame - LAST_FRAME]; - - xd->block_refs[ref] = ref_buf; - if ((!av1_is_valid_scale(&ref_buf->sf))) + const RefCntBuffer *const ref_buf = get_ref_frame_buf(ctxt->cm, frame); + const struct scale_factors *const sf = + get_ref_scale_factors_const(ctxt->cm, frame); + xd->block_ref_scale_factors[ref] = sf; + if ((!av1_is_valid_scale(sf))) aom_internal_error(xd->error_info, AOM_CODEC_UNSUP_BITSTREAM, "Reference frame has invalid dimensions"); - av1_setup_pre_planes(xd, ref, ref_buf->buf, ctxt->mi_row, above_mi_col, - &ref_buf->sf, num_planes); + av1_setup_pre_planes(xd, ref, &ref_buf->buf, xd->mi_row, above_mi_col, sf, + num_planes); } xd->mb_to_left_edge = 8 * MI_SIZE * (-above_mi_col); - xd->mb_to_right_edge = ctxt->mb_to_far_edge + - (xd->n4_w - rel_mi_col - above_mi_width) * MI_SIZE * 8; + xd->mb_to_right_edge = + ctxt->mb_to_far_edge + + (xd->width - rel_mi_col - above_mi_width) * MI_SIZE * 8; } void av1_setup_build_prediction_by_left_pred(MACROBLOCKD *xd, int rel_mi_row, @@ -943,7 +1269,7 @@ void av1_setup_build_prediction_by_left_pred(MACROBLOCKD *xd, int rel_mi_row, struct build_prediction_ctxt *ctxt, const int num_planes) { const BLOCK_SIZE l_bsize = AOMMAX(BLOCK_8X8, left_mbmi->sb_type); - const int left_mi_row = ctxt->mi_row + 
rel_mi_row; + const int left_mi_row = xd->mi_row + rel_mi_row; av1_modify_neighbor_predictor_for_obmc(left_mbmi); @@ -959,91 +1285,34 @@ void av1_setup_build_prediction_by_left_pred(MACROBLOCKD *xd, int rel_mi_row, for (int ref = 0; ref < num_refs; ++ref) { const MV_REFERENCE_FRAME frame = left_mbmi->ref_frame[ref]; - const RefBuffer *const ref_buf = &ctxt->cm->frame_refs[frame - LAST_FRAME]; + const RefCntBuffer *const ref_buf = get_ref_frame_buf(ctxt->cm, frame); + const struct scale_factors *const ref_scale_factors = + get_ref_scale_factors_const(ctxt->cm, frame); - xd->block_refs[ref] = ref_buf; - if ((!av1_is_valid_scale(&ref_buf->sf))) + xd->block_ref_scale_factors[ref] = ref_scale_factors; + if ((!av1_is_valid_scale(ref_scale_factors))) aom_internal_error(xd->error_info, AOM_CODEC_UNSUP_BITSTREAM, "Reference frame has invalid dimensions"); - av1_setup_pre_planes(xd, ref, ref_buf->buf, left_mi_row, ctxt->mi_col, - &ref_buf->sf, num_planes); + av1_setup_pre_planes(xd, ref, &ref_buf->buf, left_mi_row, xd->mi_col, + ref_scale_factors, num_planes); } - xd->mb_to_top_edge = 8 * MI_SIZE * (-left_mi_row); + xd->mb_to_top_edge = GET_MV_SUBPEL(MI_SIZE * (-left_mi_row)); xd->mb_to_bottom_edge = ctxt->mb_to_far_edge + - (xd->n4_h - rel_mi_row - left_mi_height) * MI_SIZE * 8; + GET_MV_SUBPEL((xd->height - rel_mi_row - left_mi_height) * MI_SIZE); } -/* clang-format off */ -static const uint8_t ii_weights1d[MAX_SB_SIZE] = { - 60, 58, 56, 54, 52, 50, 48, 47, 45, 44, 42, 41, 39, 38, 37, 35, 34, 33, 32, - 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 22, 21, 20, 19, 19, 18, 18, 17, 16, - 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 10, 10, 10, 9, 9, 9, 8, - 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 4, 4, - 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 -}; -static uint8_t ii_size_scales[BLOCK_SIZES_ALL] = { - 32, 16, 16, 16, 8, 8, 8, 4, - 4, 4, 2, 2, 2, 1, 1, 
1, - 8, 8, 4, 4, 2, 2 -}; -/* clang-format on */ - -static void build_smooth_interintra_mask(uint8_t *mask, int stride, - BLOCK_SIZE plane_bsize, - INTERINTRA_MODE mode) { - int i, j; - const int bw = block_size_wide[plane_bsize]; - const int bh = block_size_high[plane_bsize]; - const int size_scale = ii_size_scales[plane_bsize]; - - switch (mode) { - case II_V_PRED: - for (i = 0; i < bh; ++i) { - memset(mask, ii_weights1d[i * size_scale], bw * sizeof(mask[0])); - mask += stride; - } - break; - - case II_H_PRED: - for (i = 0; i < bh; ++i) { - for (j = 0; j < bw; ++j) mask[j] = ii_weights1d[j * size_scale]; - mask += stride; - } - break; - - case II_SMOOTH_PRED: - for (i = 0; i < bh; ++i) { - for (j = 0; j < bw; ++j) - mask[j] = ii_weights1d[(i < j ? i : j) * size_scale]; - mask += stride; - } - break; - - case II_DC_PRED: - default: - for (i = 0; i < bh; ++i) { - memset(mask, 32, bw * sizeof(mask[0])); - mask += stride; - } - break; - } -} - -static void combine_interintra(INTERINTRA_MODE mode, int use_wedge_interintra, - int wedge_index, int wedge_sign, - BLOCK_SIZE bsize, BLOCK_SIZE plane_bsize, - uint8_t *comppred, int compstride, - const uint8_t *interpred, int interstride, - const uint8_t *intrapred, int intrastride) { +static AOM_INLINE void combine_interintra( + INTERINTRA_MODE mode, int8_t use_wedge_interintra, int8_t wedge_index, + int8_t wedge_sign, BLOCK_SIZE bsize, BLOCK_SIZE plane_bsize, + uint8_t *comppred, int compstride, const uint8_t *interpred, + int interstride, const uint8_t *intrapred, int intrastride) { const int bw = block_size_wide[plane_bsize]; const int bh = block_size_high[plane_bsize]; if (use_wedge_interintra) { - if (is_interintra_wedge_used(bsize)) { + if (av1_is_wedge_used(bsize)) { const uint8_t *mask = av1_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize); const int subw = 2 * mi_size_wide[bsize] == bw; @@ -1055,22 +1324,22 @@ static void combine_interintra(INTERINTRA_MODE mode, int use_wedge_interintra, return; } - uint8_t 
mask[MAX_SB_SQUARE]; - build_smooth_interintra_mask(mask, bw, plane_bsize, mode); + const uint8_t *mask = smooth_interintra_mask_buf[mode][plane_bsize]; aom_blend_a64_mask(comppred, compstride, intrapred, intrastride, interpred, interstride, mask, bw, bw, bh, 0, 0); } -static void combine_interintra_highbd( - INTERINTRA_MODE mode, int use_wedge_interintra, int wedge_index, - int wedge_sign, BLOCK_SIZE bsize, BLOCK_SIZE plane_bsize, +#if CONFIG_AV1_HIGHBITDEPTH +static AOM_INLINE void combine_interintra_highbd( + INTERINTRA_MODE mode, int8_t use_wedge_interintra, int8_t wedge_index, + int8_t wedge_sign, BLOCK_SIZE bsize, BLOCK_SIZE plane_bsize, uint8_t *comppred8, int compstride, const uint8_t *interpred8, int interstride, const uint8_t *intrapred8, int intrastride, int bd) { const int bw = block_size_wide[plane_bsize]; const int bh = block_size_high[plane_bsize]; if (use_wedge_interintra) { - if (is_interintra_wedge_used(bsize)) { + if (av1_is_wedge_used(bsize)) { const uint8_t *mask = av1_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize); const int subh = 2 * mi_size_high[bsize] == bh; @@ -1088,12 +1357,13 @@ static void combine_interintra_highbd( interpred8, interstride, mask, bw, bw, bh, 0, 0, bd); } +#endif void av1_build_intra_predictors_for_interintra(const AV1_COMMON *cm, MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane, - BUFFER_SET *ctx, uint8_t *dst, - int dst_stride) { + const BUFFER_SET *ctx, + uint8_t *dst, int dst_stride) { struct macroblockd_plane *const pd = &xd->plane[plane]; const int ssx = xd->plane[plane].subsampling_x; const int ssy = xd->plane[plane].subsampling_y; @@ -1116,28 +1386,30 @@ void av1_combine_interintra(MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane, const int ssx = xd->plane[plane].subsampling_x; const int ssy = xd->plane[plane].subsampling_y; const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, ssx, ssy); - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { +#if CONFIG_AV1_HIGHBITDEPTH + if (is_cur_buf_hbd(xd)) { 
combine_interintra_highbd( xd->mi[0]->interintra_mode, xd->mi[0]->use_wedge_interintra, - xd->mi[0]->interintra_wedge_index, xd->mi[0]->interintra_wedge_sign, - bsize, plane_bsize, xd->plane[plane].dst.buf, - xd->plane[plane].dst.stride, inter_pred, inter_stride, intra_pred, - intra_stride, xd->bd); + xd->mi[0]->interintra_wedge_index, INTERINTRA_WEDGE_SIGN, bsize, + plane_bsize, xd->plane[plane].dst.buf, xd->plane[plane].dst.stride, + inter_pred, inter_stride, intra_pred, intra_stride, xd->bd); return; } +#endif combine_interintra( xd->mi[0]->interintra_mode, xd->mi[0]->use_wedge_interintra, - xd->mi[0]->interintra_wedge_index, xd->mi[0]->interintra_wedge_sign, - bsize, plane_bsize, xd->plane[plane].dst.buf, xd->plane[plane].dst.stride, + xd->mi[0]->interintra_wedge_index, INTERINTRA_WEDGE_SIGN, bsize, + plane_bsize, xd->plane[plane].dst.buf, xd->plane[plane].dst.stride, inter_pred, inter_stride, intra_pred, intra_stride); } // build interintra_predictors for one plane -void av1_build_interintra_predictors_sbp(const AV1_COMMON *cm, MACROBLOCKD *xd, - uint8_t *pred, int stride, - BUFFER_SET *ctx, int plane, - BLOCK_SIZE bsize) { - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { +void av1_build_interintra_predictor(const AV1_COMMON *cm, MACROBLOCKD *xd, + uint8_t *pred, int stride, + const BUFFER_SET *ctx, int plane, + BLOCK_SIZE bsize) { + assert(bsize < BLOCK_SIZES_ALL); + if (is_cur_buf_hbd(xd)) { DECLARE_ALIGNED(16, uint16_t, intrapredictor[MAX_SB_SQUARE]); av1_build_intra_predictors_for_interintra( cm, xd, bsize, plane, ctx, CONVERT_TO_BYTEPTR(intrapredictor), @@ -1152,11 +1424,3 @@ void av1_build_interintra_predictors_sbp(const AV1_COMMON *cm, MACROBLOCKD *xd, MAX_SB_SIZE); } } - -void av1_build_interintra_predictors_sbuv(const AV1_COMMON *cm, MACROBLOCKD *xd, - uint8_t *upred, uint8_t *vpred, - int ustride, int vstride, - BUFFER_SET *ctx, BLOCK_SIZE bsize) { - av1_build_interintra_predictors_sbp(cm, xd, upred, ustride, ctx, 1, bsize); - 
av1_build_interintra_predictors_sbp(cm, xd, vpred, vstride, ctx, 2, bsize); -} diff --git a/media/libaom/src/av1/common/reconinter.h b/media/libaom/src/av1/common/reconinter.h index db86c777e..fe3c6a621 100644 --- a/media/libaom/src/av1/common/reconinter.h +++ b/media/libaom/src/av1/common/reconinter.h @@ -12,9 +12,9 @@ #ifndef AOM_AV1_COMMON_RECONINTER_H_ #define AOM_AV1_COMMON_RECONINTER_H_ -#include "av1/common/filter.h" -#include "av1/common/onyxc_int.h" +#include "av1/common/av1_common_int.h" #include "av1/common/convolve.h" +#include "av1/common/filter.h" #include "av1/common/warped_motion.h" #include "aom/aom_integer.h" @@ -35,8 +35,7 @@ extern "C" { #endif -// Set to (1 << 5) if the 32-ary codebooks are used for any bock size -#define MAX_WEDGE_TYPES (1 << 4) +#define MAX_WEDGE_TYPES 16 #define MAX_WEDGE_SIZE_LOG2 5 // 32x32 #define MAX_WEDGE_SIZE (1 << MAX_WEDGE_SIZE_LOG2) @@ -47,7 +46,7 @@ extern "C" { #define WEDGE_NONE -1 // Angles are with respect to horizontal anti-clockwise -typedef enum { +enum { WEDGE_HORIZONTAL = 0, WEDGE_VERTICAL = 1, WEDGE_OBLIQUE27 = 2, @@ -55,7 +54,7 @@ typedef enum { WEDGE_OBLIQUE117 = 4, WEDGE_OBLIQUE153 = 5, WEDGE_DIRECTIONS -} WedgeDirectionType; +} UENUM1BYTE(WedgeDirectionType); // 3-tuple: {direction, x_offset, y_offset} typedef struct { @@ -67,13 +66,13 @@ typedef struct { typedef uint8_t *wedge_masks_type[MAX_WEDGE_TYPES]; typedef struct { - int bits; + int wedge_types; const wedge_code_type *codebook; uint8_t *signflip; wedge_masks_type *masks; } wedge_params_type; -extern const wedge_params_type wedge_params_lookup[BLOCK_SIZES_ALL]; +extern const wedge_params_type av1_wedge_params_lookup[BLOCK_SIZES_ALL]; typedef struct SubpelParams { int xs; @@ -84,8 +83,6 @@ typedef struct SubpelParams { struct build_prediction_ctxt { const AV1_COMMON *cm; - int mi_row; - int mi_col; uint8_t **tmp_buf; int *tmp_width; int *tmp_height; @@ -93,6 +90,55 @@ struct build_prediction_ctxt { int mb_to_far_edge; }; +typedef enum 
InterPredMode { + TRANSLATION_PRED, + WARP_PRED, +} InterPredMode; + +typedef enum InterCompMode { + UNIFORM_SINGLE, + UNIFORM_COMP, + MASK_COMP, +} InterCompMode; + +typedef struct InterPredParams { + InterPredMode mode; + InterCompMode comp_mode; + WarpedMotionParams warp_params; + ConvolveParams conv_params; + const InterpFilterParams *interp_filter_params[2]; + int block_width; + int block_height; + int pix_row; + int pix_col; + struct buf_2d ref_frame_buf; + int subsampling_x; + int subsampling_y; + const struct scale_factors *scale_factors; + int bit_depth; + int use_hbd_buf; + INTERINTER_COMPOUND_DATA mask_comp; + BLOCK_SIZE sb_type; + int is_intrabc; +} InterPredParams; + +void av1_init_inter_params(InterPredParams *inter_pred_params, int block_width, + int block_height, int pix_row, int pix_col, + int subsampling_x, int subsampling_y, int bit_depth, + int use_hbd_buf, int is_intrabc, + const struct scale_factors *sf, + const struct buf_2d *ref_buf, + int_interpfilters interp_filters); + +void av1_init_comp_mode(InterPredParams *inter_pred_params); + +void av1_init_warp_params(InterPredParams *inter_pred_params, + const WarpTypesAllowed *warp_types, int ref, + const MACROBLOCKD *xd, const MB_MODE_INFO *mi); + +void av1_init_mask_comp(InterPredParams *inter_pred_params, BLOCK_SIZE bsize, + const INTERINTER_COMPOUND_DATA *mask_comp); + static INLINE int has_scale(int xs, int ys) { return xs != SCALE_SUBPEL_SHIFTS || ys != SCALE_SUBPEL_SHIFTS; } @@ -108,53 +154,47 @@ static INLINE void revert_scale_extra_bits(SubpelParams *sp) { assert(sp->ys <= SUBPEL_SHIFTS); } -static INLINE void inter_predictor(const uint8_t *src, int src_stride, - uint8_t *dst, int dst_stride, - const SubpelParams *subpel_params, - const struct scale_factors *sf, int w, int h, - ConvolveParams *conv_params, - InterpFilters interp_filters, - int is_intrabc) { +static INLINE void inter_predictor( + const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, + const SubpelParams 
*subpel_params, const struct scale_factors *sf, int w, + int h, ConvolveParams *conv_params, + const InterpFilterParams *interp_filters[2]) { assert(conv_params->do_average == 0 || conv_params->do_average == 1); assert(sf); const int is_scaled = has_scale(subpel_params->xs, subpel_params->ys); - assert(IMPLIES(is_intrabc, !is_scaled)); if (is_scaled) { av1_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h, interp_filters, subpel_params->subpel_x, subpel_params->xs, subpel_params->subpel_y, - subpel_params->ys, 1, conv_params, sf, is_intrabc); + subpel_params->ys, 1, conv_params, sf); } else { SubpelParams sp = *subpel_params; revert_scale_extra_bits(&sp); av1_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h, interp_filters, sp.subpel_x, sp.xs, sp.subpel_y, - sp.ys, 0, conv_params, sf, is_intrabc); + sp.ys, 0, conv_params, sf); } } -static INLINE void highbd_inter_predictor(const uint8_t *src, int src_stride, - uint8_t *dst, int dst_stride, - const SubpelParams *subpel_params, - const struct scale_factors *sf, int w, - int h, ConvolveParams *conv_params, - InterpFilters interp_filters, - int is_intrabc, int bd) { +static INLINE void highbd_inter_predictor( + const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, + const SubpelParams *subpel_params, const struct scale_factors *sf, int w, + int h, ConvolveParams *conv_params, + const InterpFilterParams *interp_filters[2], int bd) { assert(conv_params->do_average == 0 || conv_params->do_average == 1); assert(sf); const int is_scaled = has_scale(subpel_params->xs, subpel_params->ys); - assert(IMPLIES(is_intrabc, !is_scaled)); if (is_scaled) { - av1_highbd_convolve_2d_facade( - src, src_stride, dst, dst_stride, w, h, interp_filters, - subpel_params->subpel_x, subpel_params->xs, subpel_params->subpel_y, - subpel_params->ys, 1, conv_params, sf, is_intrabc, bd); + av1_highbd_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h, + interp_filters, subpel_params->subpel_x, + subpel_params->xs, 
subpel_params->subpel_y, + subpel_params->ys, 1, conv_params, sf, bd); } else { SubpelParams sp = *subpel_params; revert_scale_extra_bits(&sp); - av1_highbd_convolve_2d_facade( - src, src_stride, dst, dst_stride, w, h, interp_filters, sp.subpel_x, - sp.xs, sp.subpel_y, sp.ys, 0, conv_params, sf, is_intrabc, bd); + av1_highbd_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h, + interp_filters, sp.subpel_x, sp.xs, + sp.subpel_y, sp.ys, 0, conv_params, sf, bd); } } @@ -167,9 +207,10 @@ static INLINE int is_interinter_compound_used(COMPOUND_TYPE type, const int comp_allowed = is_comp_ref_allowed(sb_type); switch (type) { case COMPOUND_AVERAGE: + case COMPOUND_DISTWTD: case COMPOUND_DIFFWTD: return comp_allowed; case COMPOUND_WEDGE: - return comp_allowed && wedge_params_lookup[sb_type].bits > 0; + return comp_allowed && av1_wedge_params_lookup[sb_type].wedge_types > 0; default: assert(0); return 0; } } @@ -187,39 +228,41 @@ static INLINE int is_any_masked_compound_used(BLOCK_SIZE sb_type) { return 0; } -static INLINE int get_wedge_bits_lookup(BLOCK_SIZE sb_type) { - return wedge_params_lookup[sb_type].bits; -} - -static INLINE int get_interinter_wedge_bits(BLOCK_SIZE sb_type) { - const int wbits = wedge_params_lookup[sb_type].bits; - return (wbits > 0) ? 
wbits + 1 : 0; -} - -static INLINE int is_interintra_wedge_used(BLOCK_SIZE sb_type) { - return wedge_params_lookup[sb_type].bits > 0; +static INLINE int get_wedge_types_lookup(BLOCK_SIZE sb_type) { + return av1_wedge_params_lookup[sb_type].wedge_types; } -static INLINE int get_interintra_wedge_bits(BLOCK_SIZE sb_type) { - return wedge_params_lookup[sb_type].bits; +static INLINE int av1_is_wedge_used(BLOCK_SIZE sb_type) { + return av1_wedge_params_lookup[sb_type].wedge_types > 0; } void av1_make_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst, - int dst_stride, const SubpelParams *subpel_params, - const struct scale_factors *sf, int w, int h, - ConvolveParams *conv_params, - InterpFilters interp_filters, - const WarpTypesAllowed *warp_types, int p_col, - int p_row, int plane, int ref, - const MB_MODE_INFO *mi, int build_for_obmc, - const MACROBLOCKD *xd, int can_use_previous); - -void av1_make_masked_inter_predictor( - const uint8_t *pre, int pre_stride, uint8_t *dst, int dst_stride, - const SubpelParams *subpel_params, const struct scale_factors *sf, int w, - int h, ConvolveParams *conv_params, InterpFilters interp_filters, int plane, - const WarpTypesAllowed *warp_types, int p_col, int p_row, int ref, - MACROBLOCKD *xd, int can_use_previous); + int dst_stride, + InterPredParams *inter_pred_params, + const SubpelParams *subpel_params); + +void av1_make_masked_inter_predictor(const uint8_t *pre, int pre_stride, + uint8_t *dst, int dst_stride, + InterPredParams *inter_pred_params, + const SubpelParams *subpel_params); + +typedef void (*CalcSubpelParamsFunc)(const MV *const src_mv, + InterPredParams *const inter_pred_params, + MACROBLOCKD *xd, int mi_x, int mi_y, + int ref, uint8_t **pre, + SubpelParams *subpel_params, + int *src_stride); + +void av1_build_one_inter_predictor( + uint8_t *dst, int dst_stride, const MV *const src_mv, + InterPredParams *inter_pred_params, MACROBLOCKD *xd, int mi_x, int mi_y, + int ref, CalcSubpelParamsFunc 
calc_subpel_params_func); + +void av1_build_inter_predictors(const AV1_COMMON *cm, MACROBLOCKD *xd, + int plane, const MB_MODE_INFO *mi, + int build_for_obmc, int bw, int bh, int mi_x, + int mi_y, + CalcSubpelParamsFunc calc_subpel_params_func); // TODO(jkoleszar): yet another mv clamping function :-( static INLINE MV clamp_mv_to_umv_border_sb(const MACROBLOCKD *xd, @@ -236,22 +279,26 @@ static INLINE MV clamp_mv_to_umv_border_sb(const MACROBLOCKD *xd, (int16_t)(src_mv->col * (1 << (1 - ss_x))) }; assert(ss_x <= 1); assert(ss_y <= 1); + const SubpelMvLimits mv_limits = { + xd->mb_to_left_edge * (1 << (1 - ss_x)) - spel_left, + xd->mb_to_right_edge * (1 << (1 - ss_x)) + spel_right, + xd->mb_to_top_edge * (1 << (1 - ss_y)) - spel_top, + xd->mb_to_bottom_edge * (1 << (1 - ss_y)) + spel_bottom + }; - clamp_mv(&clamped_mv, xd->mb_to_left_edge * (1 << (1 - ss_x)) - spel_left, - xd->mb_to_right_edge * (1 << (1 - ss_x)) + spel_right, - xd->mb_to_top_edge * (1 << (1 - ss_y)) - spel_top, - xd->mb_to_bottom_edge * (1 << (1 - ss_y)) + spel_bottom); + clamp_mv(&clamped_mv, &mv_limits); return clamped_mv; } -static INLINE int scaled_buffer_offset(int x_offset, int y_offset, int stride, - const struct scale_factors *sf) { +static INLINE int64_t scaled_buffer_offset(int x_offset, int y_offset, + int stride, + const struct scale_factors *sf) { const int x = sf ? sf->scale_value_x(x_offset, sf) >> SCALE_EXTRA_BITS : x_offset; const int y = sf ? 
sf->scale_value_y(y_offset, sf) >> SCALE_EXTRA_BITS : y_offset; - return y * stride + x; + return (int64_t)y * stride + x; } static INLINE void setup_pred_plane(struct buf_2d *dst, BLOCK_SIZE bsize, @@ -296,6 +343,11 @@ static INLINE int av1_is_interp_needed(const MACROBLOCKD *const xd) { return 1; } +void av1_setup_address_for_obmc(MACROBLOCKD *xd, int mi_row_offset, + int mi_col_offset, MB_MODE_INFO *ref_mbmi, + struct build_prediction_ctxt *ctxt, + const int num_planes); + void av1_setup_build_prediction_by_above_pred( MACROBLOCKD *xd, int rel_mi_col, uint8_t above_mi_width, MB_MODE_INFO *above_mbmi, struct build_prediction_ctxt *ctxt, @@ -306,56 +358,53 @@ void av1_setup_build_prediction_by_left_pred(MACROBLOCKD *xd, int rel_mi_row, struct build_prediction_ctxt *ctxt, const int num_planes); void av1_build_obmc_inter_prediction(const AV1_COMMON *cm, MACROBLOCKD *xd, - int mi_row, int mi_col, uint8_t *above[MAX_MB_PLANE], int above_stride[MAX_MB_PLANE], uint8_t *left[MAX_MB_PLANE], int left_stride[MAX_MB_PLANE]); const uint8_t *av1_get_obmc_mask(int length); -void av1_count_overlappable_neighbors(const AV1_COMMON *cm, MACROBLOCKD *xd, - int mi_row, int mi_col); +void av1_count_overlappable_neighbors(const AV1_COMMON *cm, MACROBLOCKD *xd); #define MASK_MASTER_SIZE ((MAX_WEDGE_SIZE) << 1) #define MASK_MASTER_STRIDE (MASK_MASTER_SIZE) void av1_init_wedge_masks(); -static INLINE const uint8_t *av1_get_contiguous_soft_mask(int wedge_index, - int wedge_sign, +static INLINE const uint8_t *av1_get_contiguous_soft_mask(int8_t wedge_index, + int8_t wedge_sign, BLOCK_SIZE sb_type) { - return wedge_params_lookup[sb_type].masks[wedge_sign][wedge_index]; + return av1_wedge_params_lookup[sb_type].masks[wedge_sign][wedge_index]; } const uint8_t *av1_get_compound_type_mask( const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE sb_type); // build interintra_predictors for one plane -void av1_build_interintra_predictors_sbp(const AV1_COMMON *cm, MACROBLOCKD *xd, - uint8_t 
*pred, int stride, - BUFFER_SET *ctx, int plane, - BLOCK_SIZE bsize); - -void av1_build_interintra_predictors_sbuv(const AV1_COMMON *cm, MACROBLOCKD *xd, - uint8_t *upred, uint8_t *vpred, - int ustride, int vstride, - BUFFER_SET *ctx, BLOCK_SIZE bsize); +void av1_build_interintra_predictor(const AV1_COMMON *cm, MACROBLOCKD *xd, + uint8_t *pred, int stride, + const BUFFER_SET *ctx, int plane, + BLOCK_SIZE bsize); -void av1_build_intra_predictors_for_interintra( - const AV1_COMMON *cm, MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane, - BUFFER_SET *ctx, uint8_t *intra_pred, int intra_stride); +void av1_build_intra_predictors_for_interintra(const AV1_COMMON *cm, + MACROBLOCKD *xd, + BLOCK_SIZE bsize, int plane, + const BUFFER_SET *ctx, + uint8_t *dst, int dst_stride); void av1_combine_interintra(MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane, const uint8_t *inter_pred, int inter_stride, const uint8_t *intra_pred, int intra_stride); -void av1_jnt_comp_weight_assign(const AV1_COMMON *cm, const MB_MODE_INFO *mbmi, - int order_idx, int *fwd_offset, int *bck_offset, - int *use_jnt_comp_avg, int is_compound); +void av1_dist_wtd_comp_weight_assign(const AV1_COMMON *cm, + const MB_MODE_INFO *mbmi, int order_idx, + int *fwd_offset, int *bck_offset, + int *use_dist_wtd_comp_avg, + int is_compound); int av1_allow_warp(const MB_MODE_INFO *const mbmi, const WarpTypesAllowed *const warp_types, const WarpedMotionParams *const gm_params, - int build_for_obmc, int x_scale, int y_scale, + int build_for_obmc, const struct scale_factors *const sf, WarpedMotionParams *final_warp_params); #ifdef __cplusplus diff --git a/media/libaom/src/av1/common/reconintra.c b/media/libaom/src/av1/common/reconintra.c index 71a52e73e..1307a0313 100644 --- a/media/libaom/src/av1/common/reconintra.c +++ b/media/libaom/src/av1/common/reconintra.c @@ -20,9 +20,9 @@ #include "aom_ports/aom_once.h" #include "aom_ports/mem.h" #include "aom_ports/system_state.h" -#include "av1/common/reconintra.h" -#include 
"av1/common/onyxc_int.h" +#include "av1/common/av1_common_int.h" #include "av1/common/cfl.h" +#include "av1/common/reconintra.h" enum { NEED_LEFT = 1 << 1, @@ -198,7 +198,7 @@ static int has_top_right(const AV1_COMMON *cm, BLOCK_SIZE bsize, int mi_row, int col_off, int ss_x, int ss_y) { if (!top_available || !right_available) return 0; - const int bw_unit = block_size_wide[bsize] >> tx_size_wide_log2[0]; + const int bw_unit = mi_size_wide[bsize]; const int plane_bw_unit = AOMMAX(bw_unit >> ss_x, 1); const int top_right_count_unit = tx_size_wide_unit[txsz]; @@ -405,7 +405,7 @@ static int has_bottom_left(const AV1_COMMON *cm, BLOCK_SIZE bsize, int mi_row, // Bottom-left pixels are in the bottom-left block, which is not available. return 0; } else { - const int bh_unit = block_size_high[bsize] >> tx_size_high_log2[0]; + const int bh_unit = mi_size_high[bsize]; const int plane_bh_unit = AOMMAX(bh_unit >> ss_y, 1); const int bottom_left_count_unit = tx_size_high_unit[txsz]; @@ -422,10 +422,9 @@ static int has_bottom_left(const AV1_COMMON *cm, BLOCK_SIZE bsize, int mi_row, // and/or bottom-left superblocks. But only the left superblock is // available, so check if all required pixels fall in that superblock. 
if (blk_col_in_sb == 0) { - const int blk_start_row_off = blk_row_in_sb - << (bh_in_mi_log2 + MI_SIZE_LOG2 - - tx_size_wide_log2[0]) >> - ss_y; + const int blk_start_row_off = + blk_row_in_sb << (bh_in_mi_log2 + MI_SIZE_LOG2 - MI_SIZE_LOG2) >> + ss_y; const int row_off_in_sb = blk_start_row_off + row_off; const int sb_height_unit = sb_mi_size >> ss_y; return row_off_in_sb + bottom_left_count_unit < sb_height_unit; @@ -453,11 +452,13 @@ typedef void (*intra_pred_fn)(uint8_t *dst, ptrdiff_t stride, static intra_pred_fn pred[INTRA_MODES][TX_SIZES_ALL]; static intra_pred_fn dc_pred[2][2][TX_SIZES_ALL]; +#if CONFIG_AV1_HIGHBITDEPTH typedef void (*intra_high_pred_fn)(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); static intra_high_pred_fn pred_high[INTRA_MODES][TX_SIZES_ALL]; static intra_high_pred_fn dc_pred_high[2][2][TX_SIZES_ALL]; +#endif static void init_intra_predictors_internal(void) { assert(NELEMENTS(mode_to_angle_map) == INTRA_MODES); @@ -499,7 +500,7 @@ static void init_intra_predictors_internal(void) { INIT_ALL_SIZES(dc_pred[0][1], dc_top); INIT_ALL_SIZES(dc_pred[1][0], dc_left); INIT_ALL_SIZES(dc_pred[1][1], dc); - +#if CONFIG_AV1_HIGHBITDEPTH INIT_ALL_SIZES(pred_high[V_PRED], highbd_v); INIT_ALL_SIZES(pred_high[H_PRED], highbd_h); INIT_ALL_SIZES(pred_high[PAETH_PRED], highbd_paeth); @@ -510,6 +511,7 @@ static void init_intra_predictors_internal(void) { INIT_ALL_SIZES(dc_pred_high[0][1], highbd_dc_top); INIT_ALL_SIZES(dc_pred_high[1][0], highbd_dc_left); INIT_ALL_SIZES(dc_pred_high[1][1], highbd_dc); +#endif #undef intra_pred_allsizes } @@ -556,33 +558,37 @@ void av1_dr_prediction_z2_c(uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_above, int upsample_left, int dx, int dy) { - int r, c, x, y, shift1, shift2, val, base1, base2; - assert(dx > 0); assert(dy > 0); const int min_base_x = -(1 << upsample_above); + const int min_base_y = -(1 << upsample_left); + 
(void)min_base_y; const int frac_bits_x = 6 - upsample_above; const int frac_bits_y = 6 - upsample_left; - const int base_inc_x = 1 << upsample_above; - x = -dx; - for (r = 0; r < bh; ++r, x -= dx, dst += stride) { - base1 = x >> frac_bits_x; - y = (r << 6) - dy; - for (c = 0; c < bw; ++c, base1 += base_inc_x, y -= dy) { - if (base1 >= min_base_x) { - shift1 = ((x * (1 << upsample_above)) & 0x3F) >> 1; - val = above[base1] * (32 - shift1) + above[base1 + 1] * shift1; + + for (int r = 0; r < bh; ++r) { + for (int c = 0; c < bw; ++c) { + int val; + int y = r + 1; + int x = (c << 6) - y * dx; + const int base_x = x >> frac_bits_x; + if (base_x >= min_base_x) { + const int shift = ((x * (1 << upsample_above)) & 0x3F) >> 1; + val = above[base_x] * (32 - shift) + above[base_x + 1] * shift; val = ROUND_POWER_OF_TWO(val, 5); } else { - base2 = y >> frac_bits_y; - assert(base2 >= -(1 << upsample_left)); - shift2 = ((y * (1 << upsample_left)) & 0x3F) >> 1; - val = left[base2] * (32 - shift2) + left[base2 + 1] * shift2; + x = c + 1; + y = (r << 6) - x * dy; + const int base_y = y >> frac_bits_y; + assert(base_y >= min_base_y); + const int shift = ((y * (1 << upsample_left)) & 0x3F) >> 1; + val = left[base_y] * (32 - shift) + left[base_y + 1] * shift; val = ROUND_POWER_OF_TWO(val, 5); } dst[c] = val; } + dst += stride; } } @@ -643,6 +649,7 @@ static void dr_predictor(uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size, } } +#if CONFIG_AV1_HIGHBITDEPTH // Directional prediction, zone 1: 0 < angle < 90 void av1_highbd_dr_prediction_z1_c(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, @@ -688,30 +695,33 @@ void av1_highbd_dr_prediction_z2_c(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_above, int upsample_left, int dx, int dy, int bd) { - int r, c, x, y, shift, val, base; - (void)bd; assert(dx > 0); assert(dy > 0); const int min_base_x = -(1 << upsample_above); + const int min_base_y = -(1 << 
upsample_left); + (void)min_base_y; const int frac_bits_x = 6 - upsample_above; const int frac_bits_y = 6 - upsample_left; - for (r = 0; r < bh; ++r) { - for (c = 0; c < bw; ++c) { - y = r + 1; - x = (c << 6) - y * dx; - base = x >> frac_bits_x; - if (base >= min_base_x) { - shift = ((x * (1 << upsample_above)) & 0x3F) >> 1; - val = above[base] * (32 - shift) + above[base + 1] * shift; + + for (int r = 0; r < bh; ++r) { + for (int c = 0; c < bw; ++c) { + int val; + int y = r + 1; + int x = (c << 6) - y * dx; + const int base_x = x >> frac_bits_x; + if (base_x >= min_base_x) { + const int shift = ((x * (1 << upsample_above)) & 0x3F) >> 1; + val = above[base_x] * (32 - shift) + above[base_x + 1] * shift; val = ROUND_POWER_OF_TWO(val, 5); } else { x = c + 1; y = (r << 6) - x * dy; - base = y >> frac_bits_y; - shift = ((y * (1 << upsample_left)) & 0x3F) >> 1; - val = left[base] * (32 - shift) + left[base + 1] * shift; + const int base_y = y >> frac_bits_y; + assert(base_y >= min_base_y); + const int shift = ((y * (1 << upsample_left)) & 0x3F) >> 1; + val = left[base_y] * (32 - shift) + left[base_y + 1] * shift; val = ROUND_POWER_OF_TWO(val, 5); } dst[c] = val; @@ -778,6 +788,7 @@ static void highbd_dr_predictor(uint16_t *dst, ptrdiff_t stride, pred_high[H_PRED][tx_size](dst, stride, above, left, bd); } } +#endif // CONFIG_AV1_HIGHBITDEPTH DECLARE_ALIGNED(16, const int8_t, av1_filter_intra_taps[FILTER_INTRA_MODES][8][8]) = { @@ -843,10 +854,6 @@ void av1_filter_intra_predictor_c(uint8_t *dst, ptrdiff_t stride, assert(bw <= 32 && bh <= 32); - // The initialization is just for silencing Jenkins static analysis warnings - for (r = 0; r < bh + 1; ++r) - memset(buffer[r], 0, (bw + 1) * sizeof(buffer[0][0])); - for (r = 0; r < bh; ++r) buffer[r + 1][0] = left[r]; memcpy(buffer[0], &above[-1], (bw + 1) * sizeof(uint8_t)); @@ -881,6 +888,7 @@ void av1_filter_intra_predictor_c(uint8_t *dst, ptrdiff_t stride, } } +#if CONFIG_AV1_HIGHBITDEPTH static void 
highbd_filter_intra_predictor(uint16_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint16_t *above, @@ -893,10 +901,6 @@ static void highbd_filter_intra_predictor(uint16_t *dst, ptrdiff_t stride, assert(bw <= 32 && bh <= 32); - // The initialization is just for silencing Jenkins static analysis warnings - for (r = 0; r < bh + 1; ++r) - memset(buffer[r], 0, (bw + 1) * sizeof(buffer[0][0])); - for (r = 0; r < bh; ++r) buffer[r + 1][0] = left[r]; memcpy(buffer[0], &above[-1], (bw + 1) * sizeof(buffer[0][0])); @@ -931,6 +935,7 @@ static void highbd_filter_intra_predictor(uint16_t *dst, ptrdiff_t stride, dst += stride; } } +#endif // CONFIG_AV1_HIGHBITDEPTH static int is_smooth(const MB_MODE_INFO *mbmi, int plane) { if (plane == 0) { @@ -1008,9 +1013,9 @@ static int intra_edge_filter_strength(int bs0, int bs1, int delta, int type) { void av1_filter_intra_edge_c(uint8_t *p, int sz, int strength) { if (!strength) return; - const int kernel[INTRA_EDGE_FILT][INTRA_EDGE_TAPS] = { - { 0, 4, 8, 4, 0 }, { 0, 5, 6, 5, 0 }, { 2, 4, 4, 4, 2 } - }; + const int kernel[INTRA_EDGE_FILT][INTRA_EDGE_TAPS] = { { 0, 4, 8, 4, 0 }, + { 0, 5, 6, 5, 0 }, + { 2, 4, 4, 4, 2 } }; const int filt = strength - 1; uint8_t edge[129]; @@ -1041,9 +1046,9 @@ static void filter_intra_edge_corner(uint8_t *p_above, uint8_t *p_left) { void av1_filter_intra_edge_high_c(uint16_t *p, int sz, int strength) { if (!strength) return; - const int kernel[INTRA_EDGE_FILT][INTRA_EDGE_TAPS] = { - { 0, 4, 8, 4, 0 }, { 0, 5, 6, 5, 0 }, { 2, 4, 4, 4, 2 } - }; + const int kernel[INTRA_EDGE_FILT][INTRA_EDGE_TAPS] = { { 0, 4, 8, 4, 0 }, + { 0, 5, 6, 5, 0 }, + { 2, 4, 4, 4, 2 } }; const int filt = strength - 1; uint16_t edge[129]; @@ -1061,6 +1066,7 @@ void av1_filter_intra_edge_high_c(uint16_t *p, int sz, int strength) { } } +#if CONFIG_AV1_HIGHBITDEPTH static void filter_intra_edge_corner_high(uint16_t *p_above, uint16_t *p_left) { const int kernel[3] = { 5, 6, 5 }; @@ -1070,6 +1076,7 @@ static void 
filter_intra_edge_corner_high(uint16_t *p_above, uint16_t *p_left) { p_above[-1] = s; p_left[-1] = s; } +#endif void av1_upsample_intra_edge_c(uint8_t *p, int sz) { // interpolate half-sample positions @@ -1117,7 +1124,7 @@ void av1_upsample_intra_edge_high_c(uint16_t *p, int sz, int bd) { p[2 * i] = in[i + 2]; } } - +#if CONFIG_AV1_HIGHBITDEPTH static void build_intra_predictors_high( const MACROBLOCKD *xd, const uint8_t *ref8, int ref_stride, uint8_t *dst8, int dst_stride, PREDICTION_MODE mode, int angle_delta, @@ -1144,7 +1151,7 @@ static void build_intra_predictors_high( int base = 128 << (xd->bd - 8); // The default values if ref pixels are not available: - // base-1 base-1 base-1 .. base-1 base-1 base-1 base-1 base-1 base-1 + // base base-1 base-1 .. base-1 base-1 base-1 base-1 base-1 base-1 // base+1 A B .. Y Z // base+1 C D .. W X // base+1 E F .. U V @@ -1182,7 +1189,7 @@ static void build_intra_predictors_high( // NEED_LEFT if (need_left) { - int need_bottom = !!(extend_modes[mode] & NEED_BOTTOMLEFT); + int need_bottom = extend_modes[mode] & NEED_BOTTOMLEFT; if (use_filter_intra) need_bottom = 0; if (is_dr_mode) need_bottom = p_angle > 180; const int num_left_pixels_needed = txhpx + (need_bottom ? txwpx : 0); @@ -1207,7 +1214,7 @@ static void build_intra_predictors_high( // NEED_ABOVE if (need_above) { - int need_right = !!(extend_modes[mode] & NEED_ABOVERIGHT); + int need_right = extend_modes[mode] & NEED_ABOVERIGHT; if (use_filter_intra) need_right = 0; if (is_dr_mode) need_right = p_angle < 90; const int num_top_pixels_needed = txwpx + (need_right ? 
txhpx : 0); @@ -1302,6 +1309,7 @@ static void build_intra_predictors_high( pred_high[mode][tx_size](dst, dst_stride, above_row, left_col, xd->bd); } } +#endif // CONFIG_AV1_HIGHBITDEPTH static void build_intra_predictors(const MACROBLOCKD *xd, const uint8_t *ref, int ref_stride, uint8_t *dst, int dst_stride, @@ -1328,7 +1336,7 @@ static void build_intra_predictors(const MACROBLOCKD *xd, const uint8_t *ref, const int use_filter_intra = filter_intra_mode != FILTER_INTRA_MODES; // The default values if ref pixels are not available: - // 127 127 127 .. 127 127 127 127 127 127 + // 128 127 127 .. 127 127 127 127 127 127 // 129 A B .. Y Z // 129 C D .. W X // 129 E F .. U V @@ -1367,10 +1375,13 @@ static void build_intra_predictors(const MACROBLOCKD *xd, const uint8_t *ref, // NEED_LEFT if (need_left) { - int need_bottom = !!(extend_modes[mode] & NEED_BOTTOMLEFT); + int need_bottom = extend_modes[mode] & NEED_BOTTOMLEFT; if (use_filter_intra) need_bottom = 0; if (is_dr_mode) need_bottom = p_angle > 180; - const int num_left_pixels_needed = txhpx + (need_bottom ? txwpx : 0); + // the avx2 dr_prediction_z2 may read at most 3 extra bytes, + // due to the avx2 mask load is with dword granularity. + // so we initialize 3 extra bytes to silence valgrind complain. + const int num_left_pixels_needed = txhpx + (need_bottom ? txwpx : 3); i = 0; if (n_left_px > 0) { for (; i < n_left_px; i++) left_col[i] = left_ref[i * ref_stride]; @@ -1392,7 +1403,7 @@ static void build_intra_predictors(const MACROBLOCKD *xd, const uint8_t *ref, // NEED_ABOVE if (need_above) { - int need_right = !!(extend_modes[mode] & NEED_ABOVERIGHT); + int need_right = extend_modes[mode] & NEED_ABOVERIGHT; if (use_filter_intra) need_right = 0; if (is_dr_mode) need_right = p_angle < 90; const int num_top_pixels_needed = txwpx + (need_right ? 
txhpx : 0); @@ -1486,6 +1497,57 @@ static void build_intra_predictors(const MACROBLOCKD *xd, const uint8_t *ref, } } +static INLINE BLOCK_SIZE scale_chroma_bsize(BLOCK_SIZE bsize, int subsampling_x, + int subsampling_y) { + assert(subsampling_x >= 0 && subsampling_x < 2); + assert(subsampling_y >= 0 && subsampling_y < 2); + BLOCK_SIZE bs = bsize; + switch (bsize) { + case BLOCK_4X4: + if (subsampling_x == 1 && subsampling_y == 1) + bs = BLOCK_8X8; + else if (subsampling_x == 1) + bs = BLOCK_8X4; + else if (subsampling_y == 1) + bs = BLOCK_4X8; + break; + case BLOCK_4X8: + if (subsampling_x == 1 && subsampling_y == 1) + bs = BLOCK_8X8; + else if (subsampling_x == 1) + bs = BLOCK_8X8; + else if (subsampling_y == 1) + bs = BLOCK_4X8; + break; + case BLOCK_8X4: + if (subsampling_x == 1 && subsampling_y == 1) + bs = BLOCK_8X8; + else if (subsampling_x == 1) + bs = BLOCK_8X4; + else if (subsampling_y == 1) + bs = BLOCK_8X8; + break; + case BLOCK_4X16: + if (subsampling_x == 1 && subsampling_y == 1) + bs = BLOCK_8X16; + else if (subsampling_x == 1) + bs = BLOCK_8X16; + else if (subsampling_y == 1) + bs = BLOCK_4X16; + break; + case BLOCK_16X4: + if (subsampling_x == 1 && subsampling_y == 1) + bs = BLOCK_16X8; + else if (subsampling_x == 1) + bs = BLOCK_16X4; + else if (subsampling_y == 1) + bs = BLOCK_16X8; + break; + default: break; + } + return bs; +} + void av1_predict_intra_block( const AV1_COMMON *cm, const MACROBLOCKD *xd, int wpx, int hpx, TX_SIZE tx_size, PREDICTION_MODE mode, int angle_delta, int use_palette, @@ -1494,8 +1556,8 @@ void av1_predict_intra_block( const MB_MODE_INFO *const mbmi = xd->mi[0]; const int txwpx = tx_size_wide[tx_size]; const int txhpx = tx_size_high[tx_size]; - const int x = col_off << tx_size_wide_log2[0]; - const int y = row_off << tx_size_high_log2[0]; + const int x = col_off << MI_SIZE_LOG2; + const int y = row_off << MI_SIZE_LOG2; if (use_palette) { int r, c; @@ -1503,7 +1565,7 @@ void av1_predict_intra_block( 
xd->color_index_map_offset[plane != 0]; const uint16_t *const palette = mbmi->palette_mode_info.palette_colors + plane * PALETTE_MAX_SIZE; - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + if (is_cur_buf_hbd(xd)) { uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst); for (r = 0; r < txhpx; ++r) { for (c = 0; c < txwpx; ++c) { @@ -1521,15 +1583,15 @@ void av1_predict_intra_block( return; } - BLOCK_SIZE bsize = mbmi->sb_type; const struct macroblockd_plane *const pd = &xd->plane[plane]; const int txw = tx_size_wide_unit[tx_size]; const int txh = tx_size_high_unit[tx_size]; - const int have_top = row_off || (pd->subsampling_y ? xd->chroma_up_available - : xd->up_available); + const int ss_x = pd->subsampling_x; + const int ss_y = pd->subsampling_y; + const int have_top = + row_off || (ss_y ? xd->chroma_up_available : xd->up_available); const int have_left = - col_off || - (pd->subsampling_x ? xd->chroma_left_available : xd->left_available); + col_off || (ss_x ? xd->chroma_left_available : xd->left_available); const int mi_row = -xd->mb_to_top_edge >> (3 + MI_SIZE_LOG2); const int mi_col = -xd->mb_to_left_edge >> (3 + MI_SIZE_LOG2); const int xr_chr_offset = 0; @@ -1537,32 +1599,35 @@ void av1_predict_intra_block( // Distance between the right edge of this prediction block to // the frame right edge - const int xr = (xd->mb_to_right_edge >> (3 + pd->subsampling_x)) + - (wpx - x - txwpx) - xr_chr_offset; + const int xr = + (xd->mb_to_right_edge >> (3 + ss_x)) + (wpx - x - txwpx) - xr_chr_offset; // Distance between the bottom edge of this prediction block to // the frame bottom edge - const int yd = (xd->mb_to_bottom_edge >> (3 + pd->subsampling_y)) + - (hpx - y - txhpx) - yd_chr_offset; + const int yd = + (xd->mb_to_bottom_edge >> (3 + ss_y)) + (hpx - y - txhpx) - yd_chr_offset; const int right_available = - mi_col + ((col_off + txw) << pd->subsampling_x) < xd->tile.mi_col_end; + mi_col + ((col_off + txw) << ss_x) < xd->tile.mi_col_end; const int bottom_available = - (yd > 
0) && - (mi_row + ((row_off + txh) << pd->subsampling_y) < xd->tile.mi_row_end); + (yd > 0) && (mi_row + ((row_off + txh) << ss_y) < xd->tile.mi_row_end); const PARTITION_TYPE partition = mbmi->partition; + BLOCK_SIZE bsize = mbmi->sb_type; // force 4x4 chroma component block size. - bsize = scale_chroma_bsize(bsize, pd->subsampling_x, pd->subsampling_y); + if (ss_x || ss_y) { + bsize = scale_chroma_bsize(bsize, ss_x, ss_y); + } - const int have_top_right = has_top_right( - cm, bsize, mi_row, mi_col, have_top, right_available, partition, tx_size, - row_off, col_off, pd->subsampling_x, pd->subsampling_y); - const int have_bottom_left = has_bottom_left( - cm, bsize, mi_row, mi_col, bottom_available, have_left, partition, - tx_size, row_off, col_off, pd->subsampling_x, pd->subsampling_y); + const int have_top_right = + has_top_right(cm, bsize, mi_row, mi_col, have_top, right_available, + partition, tx_size, row_off, col_off, ss_x, ss_y); + const int have_bottom_left = + has_bottom_left(cm, bsize, mi_row, mi_col, bottom_available, have_left, + partition, tx_size, row_off, col_off, ss_x, ss_y); const int disable_edge_filter = !cm->seq_params.enable_intra_edge_filter; - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { +#if CONFIG_AV1_HIGHBITDEPTH + if (is_cur_buf_hbd(xd)) { build_intra_predictors_high( xd, ref, ref_stride, dst, dst_stride, mode, angle_delta, filter_intra_mode, tx_size, disable_edge_filter, @@ -1572,7 +1637,7 @@ void av1_predict_intra_block( have_bottom_left ? 
AOMMIN(txhpx, yd) : 0, plane); return; } - +#endif build_intra_predictors(xd, ref, ref_stride, dst, dst_stride, mode, angle_delta, filter_intra_mode, tx_size, disable_edge_filter, @@ -1588,8 +1653,7 @@ void av1_predict_intra_block_facade(const AV1_COMMON *cm, MACROBLOCKD *xd, const MB_MODE_INFO *const mbmi = xd->mi[0]; struct macroblockd_plane *const pd = &xd->plane[plane]; const int dst_stride = pd->dst.stride; - uint8_t *dst = - &pd->dst.buf[(blk_row * dst_stride + blk_col) << tx_size_wide_log2[0]]; + uint8_t *dst = &pd->dst.buf[(blk_row * dst_stride + blk_col) << MI_SIZE_LOG2]; const PREDICTION_MODE mode = (plane == AOM_PLANE_Y) ? mbmi->mode : get_uv_mode(mbmi->uv_mode); const int use_palette = mbmi->palette_mode_info.palette_size[plane != 0] > 0; diff --git a/media/libaom/src/av1/common/reconintra.h b/media/libaom/src/av1/common/reconintra.h index 07853aba0..9d203569c 100644 --- a/media/libaom/src/av1/common/reconintra.h +++ b/media/libaom/src/av1/common/reconintra.h @@ -15,8 +15,8 @@ #include <stdlib.h> #include "aom/aom_integer.h" +#include "av1/common/av1_common_int.h" #include "av1/common/blockd.h" -#include "av1/common/onyxc_int.h" #ifdef __cplusplus extern "C" { @@ -26,13 +26,11 @@ void av1_init_intra_predictors(void); void av1_predict_intra_block_facade(const AV1_COMMON *cm, MACROBLOCKD *xd, int plane, int blk_col, int blk_row, TX_SIZE tx_size); -void av1_predict_intra_block(const AV1_COMMON *cm, const MACROBLOCKD *xd, - int bw, int bh, TX_SIZE tx_size, - PREDICTION_MODE mode, int angle_delta, - int use_palette, - FILTER_INTRA_MODE filter_intra_mode, - const uint8_t *ref, int ref_stride, uint8_t *dst, - int dst_stride, int aoff, int loff, int plane); +void av1_predict_intra_block( + const AV1_COMMON *cm, const MACROBLOCKD *xd, int wpx, int hpx, + TX_SIZE tx_size, PREDICTION_MODE mode, int angle_delta, int use_palette, + FILTER_INTRA_MODE filter_intra_mode, const uint8_t *ref, int ref_stride, + uint8_t *dst, int dst_stride, int col_off, int row_off, int 
plane); // Mapping of interintra to intra mode for use in the intra component static const PREDICTION_MODE interintra_to_intra_mode[INTERINTRA_MODES] = { @@ -56,8 +54,8 @@ static INLINE int av1_use_angle_delta(BLOCK_SIZE bsize) { } static INLINE int av1_allow_intrabc(const AV1_COMMON *const cm) { - return frame_is_intra_only(cm) && cm->allow_screen_content_tools && - cm->allow_intrabc; + return frame_is_intra_only(cm) && cm->features.allow_screen_content_tools && + cm->features.allow_intrabc; } static INLINE int av1_filter_intra_allowed_bsize(const AV1_COMMON *const cm, @@ -76,6 +74,40 @@ static INLINE int av1_filter_intra_allowed(const AV1_COMMON *const cm, extern const int8_t av1_filter_intra_taps[FILTER_INTRA_MODES][8][8]; +static const int16_t dr_intra_derivative[90] = { + // More evenly spread out angles and limited to 10-bit + // Values that are 0 will never be used + // Approx angle + 0, 0, 0, // + 1023, 0, 0, // 3, ... + 547, 0, 0, // 6, ... + 372, 0, 0, 0, 0, // 9, ... + 273, 0, 0, // 14, ... + 215, 0, 0, // 17, ... + 178, 0, 0, // 20, ... + 151, 0, 0, // 23, ... (113 & 203 are base angles) + 132, 0, 0, // 26, ... + 116, 0, 0, // 29, ... + 102, 0, 0, 0, // 32, ... + 90, 0, 0, // 36, ... + 80, 0, 0, // 39, ... + 71, 0, 0, // 42, ... + 64, 0, 0, // 45, ... (45 & 135 are base angles) + 57, 0, 0, // 48, ... + 51, 0, 0, // 51, ... + 45, 0, 0, 0, // 54, ... + 40, 0, 0, // 58, ... + 35, 0, 0, // 61, ... + 31, 0, 0, // 64, ... + 27, 0, 0, // 67, ... (67 & 157 are base angles) + 23, 0, 0, // 70, ... + 19, 0, 0, // 73, ... + 15, 0, 0, 0, 0, // 76, ... + 11, 0, 0, // 81, ... + 7, 0, 0, // 84, ... + 3, 0, 0, // 87, ... +}; + // Get the shift (up-scaled by 256) in X w.r.t a unit change in Y. 
// If angle > 0 && angle < 90, dx = -((int)(256 / t)); // If angle > 90 && angle < 180, dx = (int)(256 / t); @@ -110,7 +142,7 @@ static INLINE int av1_use_intra_edge_upsample(int bs0, int bs1, int delta, int type) { const int d = abs(delta); const int blk_wh = bs0 + bs1; - if (d <= 0 || d >= 40) return 0; + if (d == 0 || d >= 40) return 0; return type ? (blk_wh <= 8) : (blk_wh <= 16); } #ifdef __cplusplus diff --git a/media/libaom/src/av1/common/resize.c b/media/libaom/src/av1/common/resize.c index d61a20aa2..98f28f7b5 100644 --- a/media/libaom/src/av1/common/resize.c +++ b/media/libaom/src/av1/common/resize.c @@ -313,6 +313,91 @@ static void interpolate_core(const uint8_t *const input, int in_length, } } +static void interpolate_core_double_prec(const double *const input, + int in_length, double *output, + int out_length, + const int16_t *interp_filters, + int interp_taps) { + const int32_t delta = + (((uint32_t)in_length << RS_SCALE_SUBPEL_BITS) + out_length / 2) / + out_length; + const int32_t offset = + in_length > out_length + ? 
(((int32_t)(in_length - out_length) << (RS_SCALE_SUBPEL_BITS - 1)) + + out_length / 2) / + out_length + : -(((int32_t)(out_length - in_length) + << (RS_SCALE_SUBPEL_BITS - 1)) + + out_length / 2) / + out_length; + double *optr = output; + int x, x1, x2, k, int_pel, sub_pel; + double sum; + int32_t y; + + x = 0; + y = offset + RS_SCALE_EXTRA_OFF; + while ((y >> RS_SCALE_SUBPEL_BITS) < (interp_taps / 2 - 1)) { + x++; + y += delta; + } + x1 = x; + x = out_length - 1; + y = delta * x + offset + RS_SCALE_EXTRA_OFF; + while ((y >> RS_SCALE_SUBPEL_BITS) + (int32_t)(interp_taps / 2) >= + in_length) { + x--; + y -= delta; + } + x2 = x; + if (x1 > x2) { + for (x = 0, y = offset + RS_SCALE_EXTRA_OFF; x < out_length; + ++x, y += delta) { + int_pel = y >> RS_SCALE_SUBPEL_BITS; + sub_pel = (y >> RS_SCALE_EXTRA_BITS) & RS_SUBPEL_MASK; + const int16_t *filter = &interp_filters[sub_pel * interp_taps]; + sum = 0; + for (k = 0; k < interp_taps; ++k) { + const int pk = int_pel - interp_taps / 2 + 1 + k; + sum += filter[k] * input[AOMMAX(AOMMIN(pk, in_length - 1), 0)]; + } + *optr++ = sum / (1 << FILTER_BITS); + } + } else { + // Initial part. + for (x = 0, y = offset + RS_SCALE_EXTRA_OFF; x < x1; ++x, y += delta) { + int_pel = y >> RS_SCALE_SUBPEL_BITS; + sub_pel = (y >> RS_SCALE_EXTRA_BITS) & RS_SUBPEL_MASK; + const int16_t *filter = &interp_filters[sub_pel * interp_taps]; + sum = 0; + for (k = 0; k < interp_taps; ++k) + sum += filter[k] * input[AOMMAX(int_pel - interp_taps / 2 + 1 + k, 0)]; + *optr++ = sum / (1 << FILTER_BITS); + } + // Middle part. + for (; x <= x2; ++x, y += delta) { + int_pel = y >> RS_SCALE_SUBPEL_BITS; + sub_pel = (y >> RS_SCALE_EXTRA_BITS) & RS_SUBPEL_MASK; + const int16_t *filter = &interp_filters[sub_pel * interp_taps]; + sum = 0; + for (k = 0; k < interp_taps; ++k) + sum += filter[k] * input[int_pel - interp_taps / 2 + 1 + k]; + *optr++ = sum / (1 << FILTER_BITS); + } + // End part. 
+ for (; x < out_length; ++x, y += delta) { + int_pel = y >> RS_SCALE_SUBPEL_BITS; + sub_pel = (y >> RS_SCALE_EXTRA_BITS) & RS_SUBPEL_MASK; + const int16_t *filter = &interp_filters[sub_pel * interp_taps]; + sum = 0; + for (k = 0; k < interp_taps; ++k) + sum += filter[k] * + input[AOMMIN(int_pel - interp_taps / 2 + 1 + k, in_length - 1)]; + *optr++ = sum / (1 << FILTER_BITS); + } + } +} + static void interpolate(const uint8_t *const input, int in_length, uint8_t *output, int out_length) { const InterpKernel *interp_filters = @@ -322,6 +407,15 @@ static void interpolate(const uint8_t *const input, int in_length, SUBPEL_TAPS); } +static void interpolate_double_prec(const double *const input, int in_length, + double *output, int out_length) { + const InterpKernel *interp_filters = + choose_interp_filter(in_length, out_length); + + interpolate_core_double_prec(input, in_length, output, out_length, + &interp_filters[0][0], SUBPEL_TAPS); +} + int32_t av1_get_upscale_convolve_step(int in_length, int out_length) { return ((in_length << RS_SCALE_SUBPEL_BITS) + out_length / 2) / out_length; } @@ -337,7 +431,6 @@ static int32_t get_upscale_convolve_x0(int in_length, int out_length, return (int32_t)((uint32_t)x0 & RS_SCALE_SUBPEL_MASK); } -#ifndef __clang_analyzer__ static void down2_symeven(const uint8_t *const input, int length, uint8_t *output) { // Actual filter len = 2 * filter_len_half. 
@@ -392,7 +485,6 @@ static void down2_symeven(const uint8_t *const input, int length, } } } -#endif static void down2_symodd(const uint8_t *const input, int length, uint8_t *output) { @@ -505,6 +597,12 @@ static void resize_multistep(const uint8_t *const input, int length, } } +static void upscale_multistep_double_prec(const double *const input, int length, + double *output, int olength) { + assert(length < olength); + interpolate_double_prec(input, length, output, olength); +} + static void fill_col_to_arr(uint8_t *img, int stride, int len, uint8_t *arr) { int i; uint8_t *iptr = img; @@ -523,9 +621,29 @@ static void fill_arr_to_col(uint8_t *img, int stride, int len, uint8_t *arr) { } } -static void resize_plane(const uint8_t *const input, int height, int width, - int in_stride, uint8_t *output, int height2, - int width2, int out_stride) { +static void fill_col_to_arr_double_prec(double *img, int stride, int len, + double *arr) { + int i; + double *iptr = img; + double *aptr = arr; + for (i = 0; i < len; ++i, iptr += stride) { + *aptr++ = *iptr; + } +} + +static void fill_arr_to_col_double_prec(double *img, int stride, int len, + double *arr) { + int i; + double *iptr = img; + double *aptr = arr; + for (i = 0; i < len; ++i, iptr += stride) { + *iptr = *aptr++; + } +} + +void av1_resize_plane(const uint8_t *const input, int height, int width, + int in_stride, uint8_t *output, int height2, int width2, + int out_stride) { int i; uint8_t *intbuf = (uint8_t *)aom_malloc(sizeof(uint8_t) * width2 * height); uint8_t *tmpbuf = @@ -554,6 +672,33 @@ Error: aom_free(arrbuf2); } +void av1_upscale_plane_double_prec(const double *const input, int height, + int width, int in_stride, double *output, + int height2, int width2, int out_stride) { + int i; + double *intbuf = (double *)aom_malloc(sizeof(double) * width2 * height); + double *arrbuf = (double *)aom_malloc(sizeof(double) * height); + double *arrbuf2 = (double *)aom_malloc(sizeof(double) * height2); + if (intbuf == NULL || 
arrbuf == NULL || arrbuf2 == NULL) goto Error; + assert(width > 0); + assert(height > 0); + assert(width2 > 0); + assert(height2 > 0); + for (i = 0; i < height; ++i) + upscale_multistep_double_prec(input + in_stride * i, width, + intbuf + width2 * i, width2); + for (i = 0; i < width2; ++i) { + fill_col_to_arr_double_prec(intbuf + i, width2, height, arrbuf); + upscale_multistep_double_prec(arrbuf, height, arrbuf2, height2); + fill_arr_to_col_double_prec(output + i, out_stride, height2, arrbuf2); + } + +Error: + aom_free(intbuf); + aom_free(arrbuf); + aom_free(arrbuf2); +} + static void upscale_normative_rect(const uint8_t *const input, int height, int width, int in_stride, uint8_t *output, int height2, int width2, int out_stride, @@ -613,6 +758,7 @@ static void upscale_normative_rect(const uint8_t *const input, int height, } } +#if CONFIG_AV1_HIGHBITDEPTH static void highbd_interpolate_core(const uint16_t *const input, int in_length, uint16_t *output, int out_length, int bd, const int16_t *interp_filters, @@ -705,7 +851,6 @@ static void highbd_interpolate(const uint16_t *const input, int in_length, &interp_filters[0][0], SUBPEL_TAPS); } -#ifndef __clang_analyzer__ static void highbd_down2_symeven(const uint16_t *const input, int length, uint16_t *output, int bd) { // Actual filter len = 2 * filter_len_half. 
@@ -813,7 +958,6 @@ static void highbd_down2_symodd(const uint16_t *const input, int length, } } } -#endif static void highbd_resize_multistep(const uint16_t *const input, int length, uint16_t *output, int olength, @@ -871,10 +1015,9 @@ static void highbd_fill_arr_to_col(uint16_t *img, int stride, int len, } } -static void highbd_resize_plane(const uint8_t *const input, int height, - int width, int in_stride, uint8_t *output, - int height2, int width2, int out_stride, - int bd) { +void av1_highbd_resize_plane(const uint8_t *const input, int height, int width, + int in_stride, uint8_t *output, int height2, + int width2, int out_stride, int bd) { int i; uint16_t *intbuf = (uint16_t *)aom_malloc(sizeof(uint16_t) * width2 * height); uint16_t *tmpbuf = @@ -963,17 +1106,18 @@ static void highbd_upscale_normative_rect(const uint8_t *const input, aom_free(tmp_right); } } +#endif // CONFIG_AV1_HIGHBITDEPTH void av1_resize_frame420(const uint8_t *const y, int y_stride, const uint8_t *const u, const uint8_t *const v, int uv_stride, int height, int width, uint8_t *oy, int oy_stride, uint8_t *ou, uint8_t *ov, int ouv_stride, int oheight, int owidth) { - resize_plane(y, height, width, y_stride, oy, oheight, owidth, oy_stride); - resize_plane(u, height / 2, width / 2, uv_stride, ou, oheight / 2, owidth / 2, - ouv_stride); - resize_plane(v, height / 2, width / 2, uv_stride, ov, oheight / 2, owidth / 2, - ouv_stride); + av1_resize_plane(y, height, width, y_stride, oy, oheight, owidth, oy_stride); + av1_resize_plane(u, height / 2, width / 2, uv_stride, ou, oheight / 2, + owidth / 2, ouv_stride); + av1_resize_plane(v, height / 2, width / 2, uv_stride, ov, oheight / 2, + owidth / 2, ouv_stride); } void av1_resize_frame422(const uint8_t *const y, int y_stride, @@ -981,11 +1125,11 @@ void av1_resize_frame422(const uint8_t *const y, int y_stride, int uv_stride, int height, int width, uint8_t *oy, int oy_stride, uint8_t *ou, uint8_t *ov, int ouv_stride, int oheight, int owidth) { - 
resize_plane(y, height, width, y_stride, oy, oheight, owidth, oy_stride); - resize_plane(u, height, width / 2, uv_stride, ou, oheight, owidth / 2, - ouv_stride); - resize_plane(v, height, width / 2, uv_stride, ov, oheight, owidth / 2, - ouv_stride); + av1_resize_plane(y, height, width, y_stride, oy, oheight, owidth, oy_stride); + av1_resize_plane(u, height, width / 2, uv_stride, ou, oheight, owidth / 2, + ouv_stride); + av1_resize_plane(v, height, width / 2, uv_stride, ov, oheight, owidth / 2, + ouv_stride); } void av1_resize_frame444(const uint8_t *const y, int y_stride, @@ -993,23 +1137,26 @@ void av1_resize_frame444(const uint8_t *const y, int y_stride, int uv_stride, int height, int width, uint8_t *oy, int oy_stride, uint8_t *ou, uint8_t *ov, int ouv_stride, int oheight, int owidth) { - resize_plane(y, height, width, y_stride, oy, oheight, owidth, oy_stride); - resize_plane(u, height, width, uv_stride, ou, oheight, owidth, ouv_stride); - resize_plane(v, height, width, uv_stride, ov, oheight, owidth, ouv_stride); + av1_resize_plane(y, height, width, y_stride, oy, oheight, owidth, oy_stride); + av1_resize_plane(u, height, width, uv_stride, ou, oheight, owidth, + ouv_stride); + av1_resize_plane(v, height, width, uv_stride, ov, oheight, owidth, + ouv_stride); } +#if CONFIG_AV1_HIGHBITDEPTH void av1_highbd_resize_frame420(const uint8_t *const y, int y_stride, const uint8_t *const u, const uint8_t *const v, int uv_stride, int height, int width, uint8_t *oy, int oy_stride, uint8_t *ou, uint8_t *ov, int ouv_stride, int oheight, int owidth, int bd) { - highbd_resize_plane(y, height, width, y_stride, oy, oheight, owidth, - oy_stride, bd); - highbd_resize_plane(u, height / 2, width / 2, uv_stride, ou, oheight / 2, - owidth / 2, ouv_stride, bd); - highbd_resize_plane(v, height / 2, width / 2, uv_stride, ov, oheight / 2, - owidth / 2, ouv_stride, bd); + av1_highbd_resize_plane(y, height, width, y_stride, oy, oheight, owidth, + oy_stride, bd); + av1_highbd_resize_plane(u, 
height / 2, width / 2, uv_stride, ou, oheight / 2, + owidth / 2, ouv_stride, bd); + av1_highbd_resize_plane(v, height / 2, width / 2, uv_stride, ov, oheight / 2, + owidth / 2, ouv_stride, bd); } void av1_highbd_resize_frame422(const uint8_t *const y, int y_stride, @@ -1018,12 +1165,12 @@ void av1_highbd_resize_frame422(const uint8_t *const y, int y_stride, uint8_t *oy, int oy_stride, uint8_t *ou, uint8_t *ov, int ouv_stride, int oheight, int owidth, int bd) { - highbd_resize_plane(y, height, width, y_stride, oy, oheight, owidth, - oy_stride, bd); - highbd_resize_plane(u, height, width / 2, uv_stride, ou, oheight, owidth / 2, - ouv_stride, bd); - highbd_resize_plane(v, height, width / 2, uv_stride, ov, oheight, owidth / 2, - ouv_stride, bd); + av1_highbd_resize_plane(y, height, width, y_stride, oy, oheight, owidth, + oy_stride, bd); + av1_highbd_resize_plane(u, height, width / 2, uv_stride, ou, oheight, + owidth / 2, ouv_stride, bd); + av1_highbd_resize_plane(v, height, width / 2, uv_stride, ov, oheight, + owidth / 2, ouv_stride, bd); } void av1_highbd_resize_frame444(const uint8_t *const y, int y_stride, @@ -1032,13 +1179,14 @@ void av1_highbd_resize_frame444(const uint8_t *const y, int y_stride, uint8_t *oy, int oy_stride, uint8_t *ou, uint8_t *ov, int ouv_stride, int oheight, int owidth, int bd) { - highbd_resize_plane(y, height, width, y_stride, oy, oheight, owidth, - oy_stride, bd); - highbd_resize_plane(u, height, width, uv_stride, ou, oheight, owidth, - ouv_stride, bd); - highbd_resize_plane(v, height, width, uv_stride, ov, oheight, owidth, - ouv_stride, bd); + av1_highbd_resize_plane(y, height, width, y_stride, oy, oheight, owidth, + oy_stride, bd); + av1_highbd_resize_plane(u, height, width, uv_stride, ou, oheight, owidth, + ouv_stride, bd); + av1_highbd_resize_plane(v, height, width, uv_stride, ov, oheight, owidth, + ouv_stride, bd); } +#endif // CONFIG_AV1_HIGHBITDEPTH void av1_resize_and_extend_frame(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG 
*dst, int bd, @@ -1049,16 +1197,24 @@ void av1_resize_and_extend_frame(const YV12_BUFFER_CONFIG *src, // the static analysis warnings. for (int i = 0; i < AOMMIN(num_planes, MAX_MB_PLANE); ++i) { const int is_uv = i > 0; +#if CONFIG_AV1_HIGHBITDEPTH if (src->flags & YV12_FLAG_HIGHBITDEPTH) - highbd_resize_plane(src->buffers[i], src->crop_heights[is_uv], - src->crop_widths[is_uv], src->strides[is_uv], - dst->buffers[i], dst->crop_heights[is_uv], - dst->crop_widths[is_uv], dst->strides[is_uv], bd); + av1_highbd_resize_plane(src->buffers[i], src->crop_heights[is_uv], + src->crop_widths[is_uv], src->strides[is_uv], + dst->buffers[i], dst->crop_heights[is_uv], + dst->crop_widths[is_uv], dst->strides[is_uv], bd); else - resize_plane(src->buffers[i], src->crop_heights[is_uv], - src->crop_widths[is_uv], src->strides[is_uv], - dst->buffers[i], dst->crop_heights[is_uv], - dst->crop_widths[is_uv], dst->strides[is_uv]); + av1_resize_plane(src->buffers[i], src->crop_heights[is_uv], + src->crop_widths[is_uv], src->strides[is_uv], + dst->buffers[i], dst->crop_heights[is_uv], + dst->crop_widths[is_uv], dst->strides[is_uv]); +#else + (void)bd; + av1_resize_plane(src->buffers[i], src->crop_heights[is_uv], + src->crop_widths[is_uv], src->strides[is_uv], + dst->buffers[i], dst->crop_heights[is_uv], + dst->crop_widths[is_uv], dst->strides[is_uv]); +#endif } aom_extend_frame_borders(dst, num_planes); } @@ -1079,7 +1235,7 @@ void av1_upscale_normative_rows(const AV1_COMMON *cm, const uint8_t *src, int32_t x0_qn = get_upscale_convolve_x0(downscaled_plane_width, upscaled_plane_width, x_step_qn); - for (int j = 0; j < cm->tile_cols; j++) { + for (int j = 0; j < cm->tiles.cols; j++) { av1_tile_set_col(&tile_col, cm, j); // Determine the limits of this tile column in both the source // and destination images. 
@@ -1092,7 +1248,7 @@ void av1_upscale_normative_rows(const AV1_COMMON *cm, const uint8_t *src, const int upscaled_x0 = (downscaled_x0 * superres_denom) / SCALE_NUMERATOR; int upscaled_x1; - if (j == cm->tile_cols - 1) { + if (j == cm->tiles.cols - 1) { // Note that we can't just use AOMMIN here - due to rounding, // (downscaled_x1 * superres_denom) / SCALE_NUMERATOR may be less than // upscaled_plane_width. @@ -1106,8 +1262,9 @@ void av1_upscale_normative_rows(const AV1_COMMON *cm, const uint8_t *src, const int dst_width = upscaled_x1 - upscaled_x0; const int pad_left = (j == 0); - const int pad_right = (j == cm->tile_cols - 1); + const int pad_right = (j == cm->tiles.cols - 1); +#if CONFIG_AV1_HIGHBITDEPTH if (cm->seq_params.use_highbitdepth) highbd_upscale_normative_rect(src_ptr, rows, src_width, src_stride, dst_ptr, rows, dst_width, dst_stride, @@ -1117,7 +1274,11 @@ void av1_upscale_normative_rows(const AV1_COMMON *cm, const uint8_t *src, upscale_normative_rect(src_ptr, rows, src_width, src_stride, dst_ptr, rows, dst_width, dst_stride, x_step_qn, x0_qn, pad_left, pad_right); - +#else + upscale_normative_rect(src_ptr, rows, src_width, src_stride, dst_ptr, rows, + dst_width, dst_stride, x_step_qn, x0_qn, pad_left, + pad_right); +#endif // Update the fractional pixel offset to prepare for the next tile column. x0_qn += (dst_width * x_step_qn) - (src_width << RS_SCALE_SUBPEL_BITS); } @@ -1155,10 +1316,19 @@ YV12_BUFFER_CONFIG *av1_scale_if_required(AV1_COMMON *cm, // denominator. static void calculate_scaled_size_helper(int *dim, int denom) { if (denom != SCALE_NUMERATOR) { + // We need to ensure the constraint in "Appendix A" of the spec: + // * FrameWidth is greater than or equal to 16 + // * FrameHeight is greater than or equal to 16 + // For this, we clamp the downscaled dimension to at least 16. 
One + // exception: if original dimension itself was < 16, then we keep the + // downscaled dimension to be same as the original, to ensure that resizing + // is valid. + const int min_dim = AOMMIN(16, *dim); // Use this version if we need *dim to be even // *width = (*width * SCALE_NUMERATOR + denom) / (2 * denom); // *width <<= 1; *dim = (*dim * SCALE_NUMERATOR + denom / 2) / (denom); + *dim = AOMMAX(*dim, min_dim); } } @@ -1201,17 +1371,18 @@ void av1_superres_upscale(AV1_COMMON *cm, BufferPool *const pool) { const int num_planes = av1_num_planes(cm); if (!av1_superres_scaled(cm)) return; const SequenceHeader *const seq_params = &cm->seq_params; + const int byte_alignment = cm->features.byte_alignment; YV12_BUFFER_CONFIG copy_buffer; memset(©_buffer, 0, sizeof(copy_buffer)); - YV12_BUFFER_CONFIG *const frame_to_show = get_frame_new_buffer(cm); + YV12_BUFFER_CONFIG *const frame_to_show = &cm->cur_frame->buf; const int aligned_width = ALIGN_POWER_OF_TWO(cm->width, 3); if (aom_alloc_frame_buffer( ©_buffer, aligned_width, cm->height, seq_params->subsampling_x, seq_params->subsampling_y, seq_params->use_highbitdepth, - AOM_BORDER_IN_PIXELS, cm->byte_alignment)) + AOM_BORDER_IN_PIXELS, byte_alignment)) aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate copy buffer for superres upscaling"); @@ -1225,27 +1396,31 @@ void av1_superres_upscale(AV1_COMMON *cm, BufferPool *const pool) { // Realloc the current frame buffer at a higher resolution in place. if (pool != NULL) { // Use callbacks if on the decoder. - aom_codec_frame_buffer_t *fb = - &pool->frame_bufs[cm->new_fb_idx].raw_frame_buffer; + aom_codec_frame_buffer_t *fb = &cm->cur_frame->raw_frame_buffer; aom_release_frame_buffer_cb_fn_t release_fb_cb = pool->release_fb_cb; aom_get_frame_buffer_cb_fn_t cb = pool->get_fb_cb; void *cb_priv = pool->cb_priv; + lock_buffer_pool(pool); // Realloc with callback does not release the frame buffer - release first. 
- if (release_fb_cb(cb_priv, fb)) + if (release_fb_cb(cb_priv, fb)) { + unlock_buffer_pool(pool); aom_internal_error( &cm->error, AOM_CODEC_MEM_ERROR, "Failed to free current frame buffer before superres upscaling"); - + } // aom_realloc_frame_buffer() leaves config data for frame_to_show intact if (aom_realloc_frame_buffer( frame_to_show, cm->superres_upscaled_width, cm->superres_upscaled_height, seq_params->subsampling_x, seq_params->subsampling_y, seq_params->use_highbitdepth, - AOM_BORDER_IN_PIXELS, cm->byte_alignment, fb, cb, cb_priv)) + AOM_BORDER_IN_PIXELS, byte_alignment, fb, cb, cb_priv)) { + unlock_buffer_pool(pool); aom_internal_error( &cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate current frame buffer for superres upscaling"); + } + unlock_buffer_pool(pool); } else { // Make a copy of the config data for frame_to_show in copy_buffer copy_buffer_config(frame_to_show, ©_buffer); @@ -1256,7 +1431,7 @@ void av1_superres_upscale(AV1_COMMON *cm, BufferPool *const pool) { frame_to_show, cm->superres_upscaled_width, cm->superres_upscaled_height, seq_params->subsampling_x, seq_params->subsampling_y, seq_params->use_highbitdepth, - AOM_BORDER_IN_PIXELS, cm->byte_alignment)) + AOM_BORDER_IN_PIXELS, byte_alignment)) aom_internal_error( &cm->error, AOM_CODEC_MEM_ERROR, "Failed to reallocate current frame buffer for superres upscaling"); diff --git a/media/libaom/src/av1/common/resize.h b/media/libaom/src/av1/common/resize.h index 9a59a8d63..8ee859e5c 100644 --- a/media/libaom/src/av1/common/resize.h +++ b/media/libaom/src/av1/common/resize.h @@ -14,7 +14,7 @@ #include <stdio.h> #include "aom/aom_integer.h" -#include "av1/common/onyxc_int.h" +#include "av1/common/av1_common_int.h" #ifdef __cplusplus extern "C" { @@ -23,6 +23,9 @@ extern "C" { void av1_resize_plane(const uint8_t *const input, int height, int width, int in_stride, uint8_t *output, int height2, int width2, int out_stride); +void av1_upscale_plane_double_prec(const double *const input, int height, 
+ int width, int in_stride, double *output, + int height2, int width2, int out_stride); void av1_resize_frame420(const uint8_t *const y, int y_stride, const uint8_t *const u, const uint8_t *const v, int uv_stride, int height, int width, uint8_t *oy, diff --git a/media/libaom/src/av1/common/restoration.c b/media/libaom/src/av1/common/restoration.c index d276a915b..a0f37ad63 100644 --- a/media/libaom/src/av1/common/restoration.c +++ b/media/libaom/src/av1/common/restoration.c @@ -17,7 +17,7 @@ #include "config/aom_scale_rtcd.h" #include "aom_mem/aom_mem.h" -#include "av1/common/onyxc_int.h" +#include "av1/common/av1_common_int.h" #include "av1/common/resize.h" #include "av1/common/restoration.h" #include "aom_dsp/aom_dsp_common.h" @@ -28,7 +28,7 @@ // The 's' values are calculated based on original 'r' and 'e' values in the // spec using GenSgrprojVtable(). // Note: Setting r = 0 skips the filter; with corresponding s = -1 (invalid). -const sgr_params_type sgr_params[SGRPROJ_PARAMS] = { +const sgr_params_type av1_sgr_params[SGRPROJ_PARAMS] = { { { 2, 1 }, { 140, 3236 } }, { { 2, 1 }, { 112, 2158 } }, { { 2, 1 }, { 93, 1618 } }, { { 2, 1 }, { 80, 1438 } }, { { 2, 1 }, { 70, 1295 } }, { { 2, 1 }, { 58, 1177 } }, @@ -111,7 +111,7 @@ int sgrproj_mtable[SGRPROJ_PARAMS][2]; static void GenSgrprojVtable() { for (int i = 0; i < SGRPROJ_PARAMS; ++i) { - const sgr_params_type *const params = &sgr_params[i]; + const sgr_params_type *const params = &av1_sgr_params[i]; for (int j = 0; j < 2; ++j) { const int e = params->e[j]; const int r = params->r[j]; @@ -153,6 +153,7 @@ static void extend_frame_lowbd(uint8_t *data, int width, int height, int stride, } } +#if CONFIG_AV1_HIGHBITDEPTH static void extend_frame_highbd(uint16_t *data, int width, int height, int stride, int border_horz, int border_vert) { uint16_t *data_p; @@ -173,13 +174,24 @@ static void extend_frame_highbd(uint16_t *data, int width, int height, } } -void extend_frame(uint8_t *data, int width, int height, int 
stride, - int border_horz, int border_vert, int highbd) { - if (highbd) +static void copy_tile_highbd(int width, int height, const uint16_t *src, + int src_stride, uint16_t *dst, int dst_stride) { + for (int i = 0; i < height; ++i) + memcpy(dst + i * dst_stride, src + i * src_stride, width * sizeof(*dst)); +} +#endif + +void av1_extend_frame(uint8_t *data, int width, int height, int stride, + int border_horz, int border_vert, int highbd) { +#if CONFIG_AV1_HIGHBITDEPTH + if (highbd) { extend_frame_highbd(CONVERT_TO_SHORTPTR(data), width, height, stride, border_horz, border_vert); - else - extend_frame_lowbd(data, width, height, stride, border_horz, border_vert); + return; + } +#endif + (void)highbd; + extend_frame_lowbd(data, width, height, stride, border_horz, border_vert); } static void copy_tile_lowbd(int width, int height, const uint8_t *src, @@ -188,19 +200,17 @@ static void copy_tile_lowbd(int width, int height, const uint8_t *src, memcpy(dst + i * dst_stride, src + i * src_stride, width); } -static void copy_tile_highbd(int width, int height, const uint16_t *src, - int src_stride, uint16_t *dst, int dst_stride) { - for (int i = 0; i < height; ++i) - memcpy(dst + i * dst_stride, src + i * src_stride, width * sizeof(*dst)); -} - static void copy_tile(int width, int height, const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int highbd) { - if (highbd) +#if CONFIG_AV1_HIGHBITDEPTH + if (highbd) { copy_tile_highbd(width, height, CONVERT_TO_SHORTPTR(src), src_stride, CONVERT_TO_SHORTPTR(dst), dst_stride); - else - copy_tile_lowbd(width, height, src, src_stride, dst, dst_stride); + return; + } +#endif + (void)highbd; + copy_tile_lowbd(width, height, src, src_stride, dst, dst_stride); } #define REAL_PTR(hbd, d) ((hbd) ? (uint8_t *)CONVERT_TO_SHORTPTR(d) : (d)) @@ -212,11 +222,10 @@ static void copy_tile(int width, int height, const uint8_t *src, int src_stride, // rules: // // * At a frame boundary, we copy the outermost row of CDEF pixels three times. 
-// This extension is done by a call to extend_frame() at the start of the loop -// restoration process, so the value of copy_above/copy_below doesn't strictly -// matter. -// However, by setting *copy_above = *copy_below = 1 whenever loop filtering -// across tiles is disabled, we can allow +// This extension is done by a call to av1_extend_frame() at the start of the +// loop restoration process, so the value of copy_above/copy_below doesn't +// strictly matter. However, by setting *copy_above = *copy_below = 1 whenever +// loop filtering across tiles is disabled, we can allow // {setup,restore}_processing_stripe_boundary to assume that the top/bottom // data has always been copied, simplifying the behaviour at the left and // right edges of tiles. @@ -620,7 +629,7 @@ static void boxsum(int32_t *src, int width, int height, int src_stride, int r, assert(0 && "Invalid value of r in self-guided filter"); } -void decode_xq(const int *xqd, int *xq, const sgr_params_type *params) { +void av1_decode_xq(const int *xqd, int *xq, const sgr_params_type *params) { if (params->r[0] == 0) { xq[0] = 0; xq[1] = (1 << SGRPROJ_PRJ_BITS) - xqd[1]; @@ -633,7 +642,7 @@ void decode_xq(const int *xqd, int *xq, const sgr_params_type *params) { } } -const int32_t x_by_xplus1[256] = { +const int32_t av1_x_by_xplus1[256] = { // Special case: Map 0 -> 1 (corresponding to a value of 1/256) // instead of 0. 
See comments in selfguided_restoration_internal() for why 1, 128, 171, 192, 205, 213, 219, 224, 228, 230, 233, 235, 236, 238, 239, @@ -656,7 +665,7 @@ const int32_t x_by_xplus1[256] = { 256, }; -const int32_t one_by_x[MAX_NELEM] = { +const int32_t av1_one_by_x[MAX_NELEM] = { 4096, 2048, 1365, 1024, 819, 683, 585, 512, 455, 410, 372, 341, 315, 293, 273, 256, 241, 228, 216, 205, 195, 186, 178, 171, 164, }; @@ -665,7 +674,7 @@ static void calculate_intermediate_result(int32_t *dgd, int width, int height, int dgd_stride, int bit_depth, int sgr_params_idx, int radius_idx, int pass, int32_t *A, int32_t *B) { - const sgr_params_type *const params = &sgr_params[sgr_params_idx]; + const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx]; const int r = params->r[radius_idx]; const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ; const int height_ext = height + 2 * SGRPROJ_BORDER_VERT; @@ -726,7 +735,7 @@ static void calculate_intermediate_result(int32_t *dgd, int width, int height, // Further, in the calculation of B[k] below, if z == 0 and r == 2, // then A[k] "should be" 0. But then we can end up setting B[k] to a value // slightly above 2^(8 + bit depth), due to rounding in the value of - // one_by_x[25-1]. + // av1_one_by_x[25-1]. // // Thus we saturate so that, when z == 0, A[k] is set to 1 instead of 0. // This fixes the above issues (256 - A[k] fits in a uint8, and we can't @@ -738,17 +747,17 @@ static void calculate_intermediate_result(int32_t *dgd, int width, int height, // would be a bad idea, as that corresponds to the case where the image // is very variable, when we want to preserve the local pixel value as // much as possible. 
- A[k] = x_by_xplus1[AOMMIN(z, 255)]; // in range [1, 256] + A[k] = av1_x_by_xplus1[AOMMIN(z, 255)]; // in range [1, 256] // SGRPROJ_SGR - A[k] < 2^8 (from above), B[k] < 2^(bit_depth) * n, - // one_by_x[n - 1] = round(2^12 / n) + // av1_one_by_x[n - 1] = round(2^12 / n) // => the product here is < 2^(20 + bit_depth) <= 2^32, // and B[k] is set to a value < 2^(8 + bit depth) - // This holds even with the rounding in one_by_x and in the overall + // This holds even with the rounding in av1_one_by_x and in the overall // result, as long as SGRPROJ_SGR - A[k] is strictly less than 2^8. B[k] = (int32_t)ROUND_POWER_OF_TWO((uint32_t)(SGRPROJ_SGR - A[k]) * (uint32_t)B[k] * - (uint32_t)one_by_x[n - 1], + (uint32_t)av1_one_by_x[n - 1], SGRPROJ_RECIP_BITS); } } @@ -757,7 +766,7 @@ static void calculate_intermediate_result(int32_t *dgd, int width, int height, static void selfguided_restoration_fast_internal( int32_t *dgd, int width, int height, int dgd_stride, int32_t *dst, int dst_stride, int bit_depth, int sgr_params_idx, int radius_idx) { - const sgr_params_type *const params = &sgr_params[sgr_params_idx]; + const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx]; const int r = params->r[radius_idx]; const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ; // Adjusting the stride of A and B here appears to avoid bad cache effects, @@ -883,7 +892,7 @@ int av1_selfguided_restoration_c(const uint8_t *dgd8, int width, int height, } } - const sgr_params_type *const params = &sgr_params[sgr_params_idx]; + const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx]; // If params->r == 0 we skip the corresponding filter. We only allow one of // the radii to be 0, as having both equal to 0 would be equivalent to // skipping SGR entirely. 
@@ -899,11 +908,11 @@ int av1_selfguided_restoration_c(const uint8_t *dgd8, int width, int height, return 0; } -void apply_selfguided_restoration_c(const uint8_t *dat8, int width, int height, - int stride, int eps, const int *xqd, - uint8_t *dst8, int dst_stride, - int32_t *tmpbuf, int bit_depth, - int highbd) { +void av1_apply_selfguided_restoration_c(const uint8_t *dat8, int width, + int height, int stride, int eps, + const int *xqd, uint8_t *dst8, + int dst_stride, int32_t *tmpbuf, + int bit_depth, int highbd) { int32_t *flt0 = tmpbuf; int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX; assert(width * height <= RESTORATION_UNITPELS_MAX); @@ -912,9 +921,9 @@ void apply_selfguided_restoration_c(const uint8_t *dat8, int width, int height, dat8, width, height, stride, flt0, flt1, width, eps, bit_depth, highbd); (void)ret; assert(!ret); - const sgr_params_type *const params = &sgr_params[eps]; + const sgr_params_type *const params = &av1_sgr_params[eps]; int xq[2]; - decode_xq(xqd, xq, params); + av1_decode_xq(xqd, xq, params); for (int i = 0; i < height; ++i) { for (int j = 0; j < width; ++j) { const int k = i * width + j; @@ -950,12 +959,13 @@ static void sgrproj_filter_stripe(const RestorationUnitInfo *rui, for (int j = 0; j < stripe_width; j += procunit_width) { int w = AOMMIN(procunit_width, stripe_width - j); - apply_selfguided_restoration(src + j, w, stripe_height, src_stride, - rui->sgrproj_info.ep, rui->sgrproj_info.xqd, - dst + j, dst_stride, tmpbuf, bit_depth, 0); + av1_apply_selfguided_restoration( + src + j, w, stripe_height, src_stride, rui->sgrproj_info.ep, + rui->sgrproj_info.xqd, dst + j, dst_stride, tmpbuf, bit_depth, 0); } } +#if CONFIG_AV1_HIGHBITDEPTH static void wiener_filter_stripe_highbd(const RestorationUnitInfo *rui, int stripe_width, int stripe_height, int procunit_width, const uint8_t *src8, @@ -984,11 +994,12 @@ static void sgrproj_filter_stripe_highbd(const RestorationUnitInfo *rui, int32_t *tmpbuf, int bit_depth) { for (int j = 0; j < 
stripe_width; j += procunit_width) { int w = AOMMIN(procunit_width, stripe_width - j); - apply_selfguided_restoration(src8 + j, w, stripe_height, src_stride, - rui->sgrproj_info.ep, rui->sgrproj_info.xqd, - dst8 + j, dst_stride, tmpbuf, bit_depth, 1); + av1_apply_selfguided_restoration( + src8 + j, w, stripe_height, src_stride, rui->sgrproj_info.ep, + rui->sgrproj_info.xqd, dst8 + j, dst_stride, tmpbuf, bit_depth, 1); } } +#endif // CONFIG_AV1_HIGHBITDEPTH typedef void (*stripe_filter_fun)(const RestorationUnitInfo *rui, int stripe_width, int stripe_height, @@ -996,12 +1007,18 @@ typedef void (*stripe_filter_fun)(const RestorationUnitInfo *rui, int src_stride, uint8_t *dst, int dst_stride, int32_t *tmpbuf, int bit_depth); +#if CONFIG_AV1_HIGHBITDEPTH #define NUM_STRIPE_FILTERS 4 - static const stripe_filter_fun stripe_filters[NUM_STRIPE_FILTERS] = { wiener_filter_stripe, sgrproj_filter_stripe, wiener_filter_stripe_highbd, sgrproj_filter_stripe_highbd }; +#else +#define NUM_STRIPE_FILTERS 2 +static const stripe_filter_fun stripe_filters[NUM_STRIPE_FILTERS] = { + wiener_filter_stripe, sgrproj_filter_stripe +}; +#endif // CONFIG_AV1_HIGHBITDEPTH // Filter one restoration unit void av1_loop_restoration_filter_unit( @@ -1072,13 +1089,6 @@ void av1_loop_restoration_filter_unit( } } -static void filter_frame_on_tile(int tile_row, int tile_col, void *priv, - AV1_COMMON *cm) { - (void)tile_col; - FilterFrameCtxt *ctxt = (FilterFrameCtxt *)priv; - ctxt->tile_stripe0 = (tile_row == 0) ? 
0 : cm->rst_end_stripe[tile_row - 1]; -} - static void filter_frame_on_unit(const RestorationTileLimits *limits, const AV1PixelRect *tile_rect, int rest_unit_idx, void *priv, int32_t *tmpbuf, @@ -1106,8 +1116,8 @@ void av1_loop_restoration_filter_frame_init(AV1LrStruct *lr_ctxt, const int frame_height = frame->crop_heights[0]; if (aom_realloc_frame_buffer( lr_ctxt->dst, frame_width, frame_height, seq_params->subsampling_x, - seq_params->subsampling_y, highbd, AOM_BORDER_IN_PIXELS, - cm->byte_alignment, NULL, NULL, NULL) < 0) + seq_params->subsampling_y, highbd, AOM_RESTORATION_FRAME_BORDER, + cm->features.byte_alignment, NULL, NULL, NULL) < 0) aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate restoration dst buffer"); @@ -1127,9 +1137,9 @@ void av1_loop_restoration_filter_frame_init(AV1LrStruct *lr_ctxt, const int plane_height = frame->crop_heights[is_uv]; FilterFrameCtxt *lr_plane_ctxt = &lr_ctxt->ctxt[plane]; - extend_frame(frame->buffers[plane], plane_width, plane_height, - frame->strides[is_uv], RESTORATION_BORDER, RESTORATION_BORDER, - highbd); + av1_extend_frame(frame->buffers[plane], plane_width, plane_height, + frame->strides[is_uv], RESTORATION_BORDER, + RESTORATION_BORDER, highbd); lr_plane_ctxt->rsi = rsi; lr_plane_ctxt->ss_x = is_uv && seq_params->subsampling_x; @@ -1141,7 +1151,7 @@ void av1_loop_restoration_filter_frame_init(AV1LrStruct *lr_ctxt, lr_plane_ctxt->data_stride = frame->strides[is_uv]; lr_plane_ctxt->dst_stride = lr_ctxt->dst->strides[is_uv]; lr_plane_ctxt->tile_rect = av1_whole_frame_rect(cm, is_uv); - filter_frame_on_tile(LR_TILE_ROW, LR_TILE_COL, lr_plane_ctxt, cm); + lr_plane_ctxt->tile_stripe0 = 0; } } @@ -1150,10 +1160,10 @@ void av1_loop_restoration_copy_planes(AV1LrStruct *loop_rest_ctxt, typedef void (*copy_fun)(const YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int hstart, int hend, int vstart, int vend); - static const copy_fun copy_funs[3] = { - aom_yv12_partial_copy_y, 
aom_yv12_partial_copy_u, aom_yv12_partial_copy_v - }; - + static const copy_fun copy_funs[3] = { aom_yv12_partial_coloc_copy_y, + aom_yv12_partial_coloc_copy_u, + aom_yv12_partial_coloc_copy_v }; + assert(num_planes <= 3); for (int plane = 0; plane < num_planes; ++plane) { if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) continue; AV1PixelRect tile_rect = loop_rest_ctxt->ctxt[plane].tile_rect; @@ -1180,7 +1190,7 @@ static void foreach_rest_unit_in_planes(AV1LrStruct *lr_ctxt, AV1_COMMON *cm, void av1_loop_restoration_filter_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, int optimized_lr, void *lr_ctxt) { - assert(!cm->all_lossless); + assert(!cm->features.all_lossless); const int num_planes = av1_num_planes(cm); AV1LrStruct *loop_rest_ctxt = (AV1LrStruct *)lr_ctxt; @@ -1308,7 +1318,7 @@ int av1_loop_restoration_corners_in_sb(const struct AV1Common *cm, int plane, if (bsize != cm->seq_params.sb_size) return 0; if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) return 0; - assert(!cm->all_lossless); + assert(!cm->features.all_lossless); const int is_uv = plane > 0; diff --git a/media/libaom/src/av1/common/restoration.h b/media/libaom/src/av1/common/restoration.h index d834f9270..3b80dd5a9 100644 --- a/media/libaom/src/av1/common/restoration.h +++ b/media/libaom/src/av1/common/restoration.h @@ -22,6 +22,8 @@ extern "C" { #endif +// Border for Loop restoration buffer +#define AOM_RESTORATION_FRAME_BORDER 32 #define CLIP(x, lo, hi) ((x) < (lo) ? (lo) : (x) > (hi) ? (hi) : (x)) #define RINT(x) ((x) < 0 ? (int)((x)-0.5) : (int)((x) + 0.5)) @@ -120,6 +122,7 @@ extern "C" { // If WIENER_WIN_CHROMA == WIENER_WIN - 2, that implies 5x5 filters are used for // chroma. To use 7x7 for chroma set WIENER_WIN_CHROMA to WIENER_WIN. 
#define WIENER_WIN_CHROMA (WIENER_WIN - 2) +#define WIENER_WIN_REDUCED (WIENER_WIN - 2) #define WIENER_WIN2_CHROMA ((WIENER_WIN_CHROMA) * (WIENER_WIN_CHROMA)) #define WIENER_FILT_PREC_BITS 7 @@ -275,18 +278,18 @@ typedef struct AV1LrStruct { YV12_BUFFER_CONFIG *dst; } AV1LrStruct; -extern const sgr_params_type sgr_params[SGRPROJ_PARAMS]; +extern const sgr_params_type av1_sgr_params[SGRPROJ_PARAMS]; extern int sgrproj_mtable[SGRPROJ_PARAMS][2]; -extern const int32_t x_by_xplus1[256]; -extern const int32_t one_by_x[MAX_NELEM]; +extern const int32_t av1_x_by_xplus1[256]; +extern const int32_t av1_one_by_x[MAX_NELEM]; void av1_alloc_restoration_struct(struct AV1Common *cm, RestorationInfo *rsi, int is_uv); void av1_free_restoration_struct(RestorationInfo *rst_info); -void extend_frame(uint8_t *data, int width, int height, int stride, - int border_horz, int border_vert, int highbd); -void decode_xq(const int *xqd, int *xq, const sgr_params_type *params); +void av1_extend_frame(uint8_t *data, int width, int height, int stride, + int border_horz, int border_vert, int highbd); +void av1_decode_xq(const int *xqd, int *xq, const sgr_params_type *params); // Filter a single loop restoration unit. // diff --git a/media/libaom/src/av1/common/scale.c b/media/libaom/src/av1/common/scale.c index c525fe229..3b14c0a2c 100644 --- a/media/libaom/src/av1/common/scale.c +++ b/media/libaom/src/av1/common/scale.c @@ -37,7 +37,7 @@ static INLINE int scaled_y(int val, const struct scale_factors *sf) { // Note: Expect val to be in q4 precision static int unscaled_value(int val, const struct scale_factors *sf) { (void)sf; - return val << SCALE_EXTRA_BITS; + return val * (1 << SCALE_EXTRA_BITS); } static int get_fixed_point_scale_factor(int other_size, int this_size) { @@ -88,39 +88,41 @@ void av1_setup_scale_factors_for_frame(struct scale_factors *sf, int other_w, // AV1 convolve functions // Special case convolve functions should produce the same result as // av1_convolve_2d. 
- // subpel_x_q4 == 0 && subpel_y_q4 == 0 + // subpel_x_qn == 0 && subpel_y_qn == 0 sf->convolve[0][0][0] = av1_convolve_2d_copy_sr; - // subpel_x_q4 == 0 + // subpel_x_qn == 0 sf->convolve[0][1][0] = av1_convolve_y_sr; - // subpel_y_q4 == 0 + // subpel_y_qn == 0 sf->convolve[1][0][0] = av1_convolve_x_sr; - // subpel_x_q4 != 0 && subpel_y_q4 != 0 + // subpel_x_qn != 0 && subpel_y_qn != 0 sf->convolve[1][1][0] = av1_convolve_2d_sr; - // subpel_x_q4 == 0 && subpel_y_q4 == 0 - sf->convolve[0][0][1] = av1_jnt_convolve_2d_copy; - // subpel_x_q4 == 0 - sf->convolve[0][1][1] = av1_jnt_convolve_y; - // subpel_y_q4 == 0 - sf->convolve[1][0][1] = av1_jnt_convolve_x; - // subpel_x_q4 != 0 && subpel_y_q4 != 0 - sf->convolve[1][1][1] = av1_jnt_convolve_2d; + // subpel_x_qn == 0 && subpel_y_qn == 0 + sf->convolve[0][0][1] = av1_dist_wtd_convolve_2d_copy; + // subpel_x_qn == 0 + sf->convolve[0][1][1] = av1_dist_wtd_convolve_y; + // subpel_y_qn == 0 + sf->convolve[1][0][1] = av1_dist_wtd_convolve_x; + // subpel_x_qn != 0 && subpel_y_qn != 0 + sf->convolve[1][1][1] = av1_dist_wtd_convolve_2d; +#if CONFIG_AV1_HIGHBITDEPTH // AV1 High BD convolve functions // Special case convolve functions should produce the same result as // av1_highbd_convolve_2d. 
- // subpel_x_q4 == 0 && subpel_y_q4 == 0 + // subpel_x_qn == 0 && subpel_y_qn == 0 sf->highbd_convolve[0][0][0] = av1_highbd_convolve_2d_copy_sr; - // subpel_x_q4 == 0 + // subpel_x_qn == 0 sf->highbd_convolve[0][1][0] = av1_highbd_convolve_y_sr; - // subpel_y_q4 == 0 + // subpel_y_qn == 0 sf->highbd_convolve[1][0][0] = av1_highbd_convolve_x_sr; - // subpel_x_q4 != 0 && subpel_y_q4 != 0 + // subpel_x_qn != 0 && subpel_y_qn != 0 sf->highbd_convolve[1][1][0] = av1_highbd_convolve_2d_sr; - // subpel_x_q4 == 0 && subpel_y_q4 == 0 - sf->highbd_convolve[0][0][1] = av1_highbd_jnt_convolve_2d_copy; - // subpel_x_q4 == 0 - sf->highbd_convolve[0][1][1] = av1_highbd_jnt_convolve_y; - // subpel_y_q4 == 0 - sf->highbd_convolve[1][0][1] = av1_highbd_jnt_convolve_x; - // subpel_x_q4 != 0 && subpel_y_q4 != 0 - sf->highbd_convolve[1][1][1] = av1_highbd_jnt_convolve_2d; + // subpel_x_qn == 0 && subpel_y_qn == 0 + sf->highbd_convolve[0][0][1] = av1_highbd_dist_wtd_convolve_2d_copy; + // subpel_x_qn == 0 + sf->highbd_convolve[0][1][1] = av1_highbd_dist_wtd_convolve_y; + // subpel_y_qn == 0 + sf->highbd_convolve[1][0][1] = av1_highbd_dist_wtd_convolve_x; + // subpel_x_qn != 0 && subpel_y_qn != 0 + sf->highbd_convolve[1][1][1] = av1_highbd_dist_wtd_convolve_2d; +#endif } diff --git a/media/libaom/src/av1/common/scale.h b/media/libaom/src/av1/common/scale.h index 748e958c3..16b40bde8 100644 --- a/media/libaom/src/av1/common/scale.h +++ b/media/libaom/src/av1/common/scale.h @@ -45,11 +45,13 @@ void av1_setup_scale_factors_for_frame(struct scale_factors *sf, int other_w, int other_h, int this_w, int this_h); static INLINE int av1_is_valid_scale(const struct scale_factors *sf) { + assert(sf != NULL); return sf->x_scale_fp != REF_INVALID_SCALE && sf->y_scale_fp != REF_INVALID_SCALE; } static INLINE int av1_is_scaled(const struct scale_factors *sf) { + assert(sf != NULL); return av1_is_valid_scale(sf) && (sf->x_scale_fp != REF_NO_SCALE || sf->y_scale_fp != REF_NO_SCALE); } diff --git 
a/media/libaom/src/av1/common/scan.c b/media/libaom/src/av1/common/scan.c index 31a787b53..c1d4f3581 100644 --- a/media/libaom/src/av1/common/scan.c +++ b/media/libaom/src/av1/common/scan.c @@ -14,9 +14,9 @@ #include "av1/common/common_data.h" #include "av1/common/scan.h" -DECLARE_ALIGNED(16, static const int16_t, default_scan_4x4[16]) = { - 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15 -}; +DECLARE_ALIGNED(16, static const int16_t, + default_scan_4x4[16]) = { 0, 1, 4, 8, 5, 2, 3, 6, + 9, 12, 13, 10, 7, 11, 14, 15 }; DECLARE_ALIGNED(16, static const int16_t, mcol_scan_4x4[16]) = { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15, @@ -839,1546 +839,9 @@ DECLARE_ALIGNED(16, static const int16_t, default_scan_32x32[1024]) = { 927, 958, 989, 1020, 1021, 990, 959, 991, 1022, 1023 }; -// Neighborhood 2-tuples for various scans and blocksizes, -// in {top, left} order for each position in corresponding scan order. -DECLARE_ALIGNED(16, static const int16_t, - default_scan_4x4_neighbors[17 * MAX_NEIGHBORS]) = { - 0, 0, 0, 0, 0, 0, 4, 4, 1, 4, 1, 1, 2, 2, 2, 5, 5, - 8, 8, 8, 9, 12, 6, 9, 3, 6, 7, 10, 10, 13, 11, 14, 0, 0 -}; - -DECLARE_ALIGNED(16, static const int16_t, - mcol_scan_4x4_neighbors[17 * MAX_NEIGHBORS]) = { - 0, 0, 0, 0, 4, 4, 8, 8, 0, 0, 1, 4, 5, 8, 9, 12, 1, - 1, 2, 5, 6, 9, 10, 13, 2, 2, 3, 6, 7, 10, 11, 14, 0, 0, -}; - -DECLARE_ALIGNED(16, static const int16_t, - mrow_scan_4x4_neighbors[17 * MAX_NEIGHBORS]) = { - 0, 0, 0, 0, 1, 1, 2, 2, 0, 0, 1, 4, 2, 5, 3, 6, 4, - 4, 5, 8, 6, 9, 7, 10, 8, 8, 9, 12, 10, 13, 11, 14, 0, 0, -}; - -DECLARE_ALIGNED(16, static const int16_t, - default_scan_4x8_neighbors[33 * MAX_NEIGHBORS]) = { - 0, 0, 0, 0, 0, 0, 1, 4, 1, 1, 4, 4, 2, 5, 5, 8, 6, - 9, 2, 2, 8, 8, 3, 6, 9, 12, 7, 10, 10, 13, 12, 12, 13, 16, - 11, 14, 14, 17, 15, 18, 16, 16, 17, 20, 18, 21, 19, 22, 20, 20, 21, - 24, 22, 25, 23, 26, 24, 24, 25, 28, 26, 29, 27, 30, 0, 0 -}; - -DECLARE_ALIGNED(16, static const int16_t, - mcol_scan_4x8_neighbors[33 * 
MAX_NEIGHBORS]) = { - 0, 0, 0, 0, 4, 4, 8, 8, 12, 12, 16, 16, 20, 20, 24, 24, 0, - 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 1, 1, - 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 2, 2, 3, - 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 0, 0 -}; - -DECLARE_ALIGNED(16, static const int16_t, - mrow_scan_4x8_neighbors[33 * MAX_NEIGHBORS]) = { - 0, 0, 0, 0, 1, 1, 2, 2, 0, 0, 1, 4, 2, 5, 3, 6, 4, - 4, 5, 8, 6, 9, 7, 10, 8, 8, 9, 12, 10, 13, 11, 14, 12, 12, - 13, 16, 14, 17, 15, 18, 16, 16, 17, 20, 18, 21, 19, 22, 20, 20, 21, - 24, 22, 25, 23, 26, 24, 24, 25, 28, 26, 29, 27, 30, 0, 0 -}; - -DECLARE_ALIGNED(16, static const int16_t, - default_scan_8x4_neighbors[33 * MAX_NEIGHBORS]) = { - 0, 0, 0, 0, 0, 0, 1, 8, 1, 1, 8, 8, 2, 9, 9, 16, 10, - 17, 2, 2, 16, 16, 3, 10, 17, 24, 11, 18, 18, 25, 3, 3, 4, 11, - 19, 26, 12, 19, 4, 4, 20, 27, 5, 12, 13, 20, 21, 28, 5, 5, 6, - 13, 14, 21, 22, 29, 6, 6, 7, 14, 15, 22, 23, 30, 0, 0 -}; - -DECLARE_ALIGNED(16, static const int16_t, - mcol_scan_8x4_neighbors[33 * MAX_NEIGHBORS]) = { - 0, 0, 0, 0, 8, 8, 16, 16, 0, 0, 1, 8, 9, 16, 17, 24, 1, - 1, 2, 9, 10, 17, 18, 25, 2, 2, 3, 10, 11, 18, 19, 26, 3, 3, - 4, 11, 12, 19, 20, 27, 4, 4, 5, 12, 13, 20, 21, 28, 5, 5, 6, - 13, 14, 21, 22, 29, 6, 6, 7, 14, 15, 22, 23, 30, 0, 0 -}; - -DECLARE_ALIGNED(16, static const int16_t, - mrow_scan_8x4_neighbors[33 * MAX_NEIGHBORS]) = { - 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 0, - 0, 1, 8, 2, 9, 3, 10, 4, 11, 5, 12, 6, 13, 7, 14, 8, 8, - 9, 16, 10, 17, 11, 18, 12, 19, 13, 20, 14, 21, 15, 22, 16, 16, 17, - 24, 18, 25, 19, 26, 20, 27, 21, 28, 22, 29, 23, 30, 0, 0 -}; - -DECLARE_ALIGNED(16, static const int16_t, - default_scan_4x16_neighbors[65 * MAX_NEIGHBORS]) = { - 0, 0, 0, 0, 0, 0, 1, 4, 1, 1, 4, 4, 2, 5, 5, 8, 6, 9, 2, - 2, 8, 8, 3, 6, 9, 12, 7, 10, 10, 13, 12, 12, 13, 16, 11, 14, 14, 17, - 15, 18, 16, 16, 17, 20, 18, 21, 19, 22, 20, 20, 21, 24, 22, 25, 23, 26, 24, - 24, 25, 28, 26, 29, 27, 30, 28, 28, 29, 32, 30, 33, 31, 34, 
32, 32, 33, 36, - 34, 37, 35, 38, 36, 36, 37, 40, 38, 41, 39, 42, 40, 40, 41, 44, 42, 45, 43, - 46, 44, 44, 45, 48, 46, 49, 47, 50, 48, 48, 49, 52, 50, 53, 51, 54, 52, 52, - 53, 56, 54, 57, 55, 58, 56, 56, 57, 60, 58, 61, 59, 62, 0, 0 -}; - -DECLARE_ALIGNED(16, static const int16_t, - default_scan_16x4_neighbors[65 * MAX_NEIGHBORS]) = { - 0, 0, 0, 0, 0, 0, 1, 16, 1, 1, 16, 16, 2, 17, 17, 32, 18, 33, 2, - 2, 32, 32, 3, 18, 33, 48, 19, 34, 34, 49, 3, 3, 4, 19, 35, 50, 20, 35, - 4, 4, 36, 51, 5, 20, 21, 36, 37, 52, 5, 5, 6, 21, 22, 37, 38, 53, 6, - 6, 7, 22, 23, 38, 39, 54, 7, 7, 8, 23, 24, 39, 40, 55, 8, 8, 9, 24, - 25, 40, 41, 56, 9, 9, 10, 25, 26, 41, 42, 57, 10, 10, 11, 26, 27, 42, 43, - 58, 11, 11, 12, 27, 28, 43, 44, 59, 12, 12, 13, 28, 29, 44, 45, 60, 13, 13, - 14, 29, 30, 45, 46, 61, 14, 14, 15, 30, 31, 46, 47, 62, 0, 0 -}; - -DECLARE_ALIGNED(16, static const int16_t, - mrow_scan_4x16_neighbors[65 * MAX_NEIGHBORS]) = { - 0, 0, 0, 0, 1, 1, 2, 2, 0, 0, 1, 4, 2, 5, 3, 6, 4, 4, 5, - 8, 6, 9, 7, 10, 8, 8, 9, 12, 10, 13, 11, 14, 12, 12, 13, 16, 14, 17, - 15, 18, 16, 16, 17, 20, 18, 21, 19, 22, 20, 20, 21, 24, 22, 25, 23, 26, 24, - 24, 25, 28, 26, 29, 27, 30, 28, 28, 29, 32, 30, 33, 31, 34, 32, 32, 33, 36, - 34, 37, 35, 38, 36, 36, 37, 40, 38, 41, 39, 42, 40, 40, 41, 44, 42, 45, 43, - 46, 44, 44, 45, 48, 46, 49, 47, 50, 48, 48, 49, 52, 50, 53, 51, 54, 52, 52, - 53, 56, 54, 57, 55, 58, 56, 56, 57, 60, 58, 61, 59, 62, 0, 0 -}; - -DECLARE_ALIGNED(16, static const int16_t, - mrow_scan_16x4_neighbors[65 * MAX_NEIGHBORS]) = { - 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, - 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 0, 0, 1, 16, 2, 17, - 3, 18, 4, 19, 5, 20, 6, 21, 7, 22, 8, 23, 9, 24, 10, 25, 11, 26, 12, - 27, 13, 28, 14, 29, 15, 30, 16, 16, 17, 32, 18, 33, 19, 34, 20, 35, 21, 36, - 22, 37, 23, 38, 24, 39, 25, 40, 26, 41, 27, 42, 28, 43, 29, 44, 30, 45, 31, - 46, 32, 32, 33, 48, 34, 49, 35, 50, 36, 51, 37, 52, 38, 53, 39, 54, 40, 55, - 41, 56, 42, 57, 
43, 58, 44, 59, 45, 60, 46, 61, 47, 62, 0, 0 -}; - -DECLARE_ALIGNED(16, static const int16_t, - mcol_scan_4x16_neighbors[65 * MAX_NEIGHBORS]) = { - 0, 0, 0, 0, 4, 4, 8, 8, 12, 12, 16, 16, 20, 20, 24, 24, 28, 28, 32, - 32, 36, 36, 40, 40, 44, 44, 48, 48, 52, 52, 56, 56, 0, 0, 1, 4, 5, 8, - 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29, 32, 33, 36, 37, 40, 41, 44, 45, - 48, 49, 52, 53, 56, 57, 60, 1, 1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, - 22, 25, 26, 29, 30, 33, 34, 37, 38, 41, 42, 45, 46, 49, 50, 53, 54, 57, 58, - 61, 2, 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31, 34, - 35, 38, 39, 42, 43, 46, 47, 50, 51, 54, 55, 58, 59, 62, 0, 0 -}; - -DECLARE_ALIGNED(16, static const int16_t, - mcol_scan_16x4_neighbors[65 * MAX_NEIGHBORS]) = { - 0, 0, 0, 0, 16, 16, 32, 32, 0, 0, 1, 16, 17, 32, 33, 48, 1, 1, 2, - 17, 18, 33, 34, 49, 2, 2, 3, 18, 19, 34, 35, 50, 3, 3, 4, 19, 20, 35, - 36, 51, 4, 4, 5, 20, 21, 36, 37, 52, 5, 5, 6, 21, 22, 37, 38, 53, 6, - 6, 7, 22, 23, 38, 39, 54, 7, 7, 8, 23, 24, 39, 40, 55, 8, 8, 9, 24, - 25, 40, 41, 56, 9, 9, 10, 25, 26, 41, 42, 57, 10, 10, 11, 26, 27, 42, 43, - 58, 11, 11, 12, 27, 28, 43, 44, 59, 12, 12, 13, 28, 29, 44, 45, 60, 13, 13, - 14, 29, 30, 45, 46, 61, 14, 14, 15, 30, 31, 46, 47, 62, 0, 0 -}; - -DECLARE_ALIGNED(16, static const int16_t, - default_scan_8x32_neighbors[257 * MAX_NEIGHBORS]) = { - 0, 0, 0, 0, 0, 0, 1, 1, 1, 8, 8, 8, 2, 2, 2, - 9, 9, 16, 16, 16, 3, 3, 3, 10, 10, 17, 17, 24, 24, 24, - 4, 4, 4, 11, 11, 18, 18, 25, 25, 32, 32, 32, 5, 5, 5, - 12, 12, 19, 19, 26, 26, 33, 33, 40, 40, 40, 6, 6, 6, 13, - 13, 20, 20, 27, 27, 34, 34, 41, 41, 48, 48, 48, 7, 14, 14, - 21, 21, 28, 28, 35, 35, 42, 42, 49, 49, 56, 56, 56, 15, 22, - 22, 29, 29, 36, 36, 43, 43, 50, 50, 57, 57, 64, 64, 64, 23, - 30, 30, 37, 37, 44, 44, 51, 51, 58, 58, 65, 65, 72, 72, 72, - 31, 38, 38, 45, 45, 52, 52, 59, 59, 66, 66, 73, 73, 80, 80, - 80, 39, 46, 46, 53, 53, 60, 60, 67, 67, 74, 74, 81, 81, 88, - 88, 88, 47, 54, 54, 61, 61, 68, 68, 75, 75, 82, 82, 
89, 89, - 96, 96, 96, 55, 62, 62, 69, 69, 76, 76, 83, 83, 90, 90, 97, - 97, 104, 104, 104, 63, 70, 70, 77, 77, 84, 84, 91, 91, 98, 98, - 105, 105, 112, 112, 112, 71, 78, 78, 85, 85, 92, 92, 99, 99, 106, - 106, 113, 113, 120, 120, 120, 79, 86, 86, 93, 93, 100, 100, 107, 107, - 114, 114, 121, 121, 128, 128, 128, 87, 94, 94, 101, 101, 108, 108, 115, - 115, 122, 122, 129, 129, 136, 136, 136, 95, 102, 102, 109, 109, 116, 116, - 123, 123, 130, 130, 137, 137, 144, 144, 144, 103, 110, 110, 117, 117, 124, - 124, 131, 131, 138, 138, 145, 145, 152, 152, 152, 111, 118, 118, 125, 125, - 132, 132, 139, 139, 146, 146, 153, 153, 160, 160, 160, 119, 126, 126, 133, - 133, 140, 140, 147, 147, 154, 154, 161, 161, 168, 168, 168, 127, 134, 134, - 141, 141, 148, 148, 155, 155, 162, 162, 169, 169, 176, 176, 176, 135, 142, - 142, 149, 149, 156, 156, 163, 163, 170, 170, 177, 177, 184, 184, 184, 143, - 150, 150, 157, 157, 164, 164, 171, 171, 178, 178, 185, 185, 192, 192, 192, - 151, 158, 158, 165, 165, 172, 172, 179, 179, 186, 186, 193, 193, 200, 200, - 200, 159, 166, 166, 173, 173, 180, 180, 187, 187, 194, 194, 201, 201, 208, - 208, 208, 167, 174, 174, 181, 181, 188, 188, 195, 195, 202, 202, 209, 209, - 216, 216, 216, 175, 182, 182, 189, 189, 196, 196, 203, 203, 210, 210, 217, - 217, 224, 224, 224, 183, 190, 190, 197, 197, 204, 204, 211, 211, 218, 218, - 225, 225, 232, 232, 232, 191, 198, 198, 205, 205, 212, 212, 219, 219, 226, - 226, 233, 233, 240, 240, 240, 199, 206, 206, 213, 213, 220, 220, 227, 227, - 234, 234, 241, 241, 248, 207, 214, 214, 221, 221, 228, 228, 235, 235, 242, - 242, 249, 215, 222, 222, 229, 229, 236, 236, 243, 243, 250, 223, 230, 230, - 237, 237, 244, 244, 251, 231, 238, 238, 245, 245, 252, 239, 246, 246, 253, - 247, 254, 0, 0 -}; - -DECLARE_ALIGNED(16, static const int16_t, - default_scan_32x8_neighbors[257 * MAX_NEIGHBORS]) = { - 0, 0, 0, 0, 0, 0, 1, 1, 1, 32, 32, 32, 2, 2, 2, - 33, 33, 64, 64, 64, 3, 3, 3, 34, 34, 65, 65, 96, 96, 96, - 4, 4, 4, 35, 35, 66, 66, 97, 97, 
128, 128, 128, 5, 5, 5, - 36, 36, 67, 67, 98, 98, 129, 129, 160, 160, 160, 6, 6, 6, 37, - 37, 68, 68, 99, 99, 130, 130, 161, 161, 192, 192, 192, 7, 7, 7, - 38, 38, 69, 69, 100, 100, 131, 131, 162, 162, 193, 193, 224, 8, 8, - 8, 39, 39, 70, 70, 101, 101, 132, 132, 163, 163, 194, 194, 225, 9, - 9, 9, 40, 40, 71, 71, 102, 102, 133, 133, 164, 164, 195, 195, 226, - 10, 10, 10, 41, 41, 72, 72, 103, 103, 134, 134, 165, 165, 196, 196, - 227, 11, 11, 11, 42, 42, 73, 73, 104, 104, 135, 135, 166, 166, 197, - 197, 228, 12, 12, 12, 43, 43, 74, 74, 105, 105, 136, 136, 167, 167, - 198, 198, 229, 13, 13, 13, 44, 44, 75, 75, 106, 106, 137, 137, 168, - 168, 199, 199, 230, 14, 14, 14, 45, 45, 76, 76, 107, 107, 138, 138, - 169, 169, 200, 200, 231, 15, 15, 15, 46, 46, 77, 77, 108, 108, 139, - 139, 170, 170, 201, 201, 232, 16, 16, 16, 47, 47, 78, 78, 109, 109, - 140, 140, 171, 171, 202, 202, 233, 17, 17, 17, 48, 48, 79, 79, 110, - 110, 141, 141, 172, 172, 203, 203, 234, 18, 18, 18, 49, 49, 80, 80, - 111, 111, 142, 142, 173, 173, 204, 204, 235, 19, 19, 19, 50, 50, 81, - 81, 112, 112, 143, 143, 174, 174, 205, 205, 236, 20, 20, 20, 51, 51, - 82, 82, 113, 113, 144, 144, 175, 175, 206, 206, 237, 21, 21, 21, 52, - 52, 83, 83, 114, 114, 145, 145, 176, 176, 207, 207, 238, 22, 22, 22, - 53, 53, 84, 84, 115, 115, 146, 146, 177, 177, 208, 208, 239, 23, 23, - 23, 54, 54, 85, 85, 116, 116, 147, 147, 178, 178, 209, 209, 240, 24, - 24, 24, 55, 55, 86, 86, 117, 117, 148, 148, 179, 179, 210, 210, 241, - 25, 25, 25, 56, 56, 87, 87, 118, 118, 149, 149, 180, 180, 211, 211, - 242, 26, 26, 26, 57, 57, 88, 88, 119, 119, 150, 150, 181, 181, 212, - 212, 243, 27, 27, 27, 58, 58, 89, 89, 120, 120, 151, 151, 182, 182, - 213, 213, 244, 28, 28, 28, 59, 59, 90, 90, 121, 121, 152, 152, 183, - 183, 214, 214, 245, 29, 29, 29, 60, 60, 91, 91, 122, 122, 153, 153, - 184, 184, 215, 215, 246, 30, 30, 30, 61, 61, 92, 92, 123, 123, 154, - 154, 185, 185, 216, 216, 247, 31, 62, 62, 93, 93, 124, 124, 155, 155, - 186, 186, 217, 
217, 248, 63, 94, 94, 125, 125, 156, 156, 187, 187, 218, - 218, 249, 95, 126, 126, 157, 157, 188, 188, 219, 219, 250, 127, 158, 158, - 189, 189, 220, 220, 251, 159, 190, 190, 221, 221, 252, 191, 222, 222, 253, - 223, 254, 0, 0 -}; - -DECLARE_ALIGNED(16, static const int16_t, - mrow_scan_8x32_neighbors[257 * MAX_NEIGHBORS]) = { - 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, - 6, 0, 0, 1, 8, 2, 9, 3, 10, 4, 11, 5, 12, 6, 13, - 7, 14, 8, 8, 9, 16, 10, 17, 11, 18, 12, 19, 13, 20, 14, - 21, 15, 22, 16, 16, 17, 24, 18, 25, 19, 26, 20, 27, 21, 28, - 22, 29, 23, 30, 24, 24, 25, 32, 26, 33, 27, 34, 28, 35, 29, - 36, 30, 37, 31, 38, 32, 32, 33, 40, 34, 41, 35, 42, 36, 43, - 37, 44, 38, 45, 39, 46, 40, 40, 41, 48, 42, 49, 43, 50, 44, - 51, 45, 52, 46, 53, 47, 54, 48, 48, 49, 56, 50, 57, 51, 58, - 52, 59, 53, 60, 54, 61, 55, 62, 56, 56, 57, 64, 58, 65, 59, - 66, 60, 67, 61, 68, 62, 69, 63, 70, 64, 64, 65, 72, 66, 73, - 67, 74, 68, 75, 69, 76, 70, 77, 71, 78, 72, 72, 73, 80, 74, - 81, 75, 82, 76, 83, 77, 84, 78, 85, 79, 86, 80, 80, 81, 88, - 82, 89, 83, 90, 84, 91, 85, 92, 86, 93, 87, 94, 88, 88, 89, - 96, 90, 97, 91, 98, 92, 99, 93, 100, 94, 101, 95, 102, 96, 96, - 97, 104, 98, 105, 99, 106, 100, 107, 101, 108, 102, 109, 103, 110, 104, - 104, 105, 112, 106, 113, 107, 114, 108, 115, 109, 116, 110, 117, 111, 118, - 112, 112, 113, 120, 114, 121, 115, 122, 116, 123, 117, 124, 118, 125, 119, - 126, 120, 120, 121, 128, 122, 129, 123, 130, 124, 131, 125, 132, 126, 133, - 127, 134, 128, 128, 129, 136, 130, 137, 131, 138, 132, 139, 133, 140, 134, - 141, 135, 142, 136, 136, 137, 144, 138, 145, 139, 146, 140, 147, 141, 148, - 142, 149, 143, 150, 144, 144, 145, 152, 146, 153, 147, 154, 148, 155, 149, - 156, 150, 157, 151, 158, 152, 152, 153, 160, 154, 161, 155, 162, 156, 163, - 157, 164, 158, 165, 159, 166, 160, 160, 161, 168, 162, 169, 163, 170, 164, - 171, 165, 172, 166, 173, 167, 174, 168, 168, 169, 176, 170, 177, 171, 178, - 172, 179, 173, 180, 174, 181, 175, 182, 176, 176, 177, 184, 
178, 185, 179, - 186, 180, 187, 181, 188, 182, 189, 183, 190, 184, 184, 185, 192, 186, 193, - 187, 194, 188, 195, 189, 196, 190, 197, 191, 198, 192, 192, 193, 200, 194, - 201, 195, 202, 196, 203, 197, 204, 198, 205, 199, 206, 200, 200, 201, 208, - 202, 209, 203, 210, 204, 211, 205, 212, 206, 213, 207, 214, 208, 208, 209, - 216, 210, 217, 211, 218, 212, 219, 213, 220, 214, 221, 215, 222, 216, 216, - 217, 224, 218, 225, 219, 226, 220, 227, 221, 228, 222, 229, 223, 230, 224, - 224, 225, 232, 226, 233, 227, 234, 228, 235, 229, 236, 230, 237, 231, 238, - 232, 232, 233, 240, 234, 241, 235, 242, 236, 243, 237, 244, 238, 245, 239, - 246, 240, 240, 241, 248, 242, 249, 243, 250, 244, 251, 245, 252, 246, 253, - 247, 254, 0, 0 -}; - -DECLARE_ALIGNED(16, static const int16_t, - mrow_scan_32x8_neighbors[257 * MAX_NEIGHBORS]) = { - 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, - 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, - 14, 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 19, 20, 20, 21, - 21, 22, 22, 23, 23, 24, 24, 25, 25, 26, 26, 27, 27, 28, 28, - 29, 29, 30, 30, 0, 0, 1, 32, 2, 33, 3, 34, 4, 35, 5, - 36, 6, 37, 7, 38, 8, 39, 9, 40, 10, 41, 11, 42, 12, 43, - 13, 44, 14, 45, 15, 46, 16, 47, 17, 48, 18, 49, 19, 50, 20, - 51, 21, 52, 22, 53, 23, 54, 24, 55, 25, 56, 26, 57, 27, 58, - 28, 59, 29, 60, 30, 61, 31, 62, 32, 32, 33, 64, 34, 65, 35, - 66, 36, 67, 37, 68, 38, 69, 39, 70, 40, 71, 41, 72, 42, 73, - 43, 74, 44, 75, 45, 76, 46, 77, 47, 78, 48, 79, 49, 80, 50, - 81, 51, 82, 52, 83, 53, 84, 54, 85, 55, 86, 56, 87, 57, 88, - 58, 89, 59, 90, 60, 91, 61, 92, 62, 93, 63, 94, 64, 64, 65, - 96, 66, 97, 67, 98, 68, 99, 69, 100, 70, 101, 71, 102, 72, 103, - 73, 104, 74, 105, 75, 106, 76, 107, 77, 108, 78, 109, 79, 110, 80, - 111, 81, 112, 82, 113, 83, 114, 84, 115, 85, 116, 86, 117, 87, 118, - 88, 119, 89, 120, 90, 121, 91, 122, 92, 123, 93, 124, 94, 125, 95, - 126, 96, 96, 97, 128, 98, 129, 99, 130, 100, 131, 101, 132, 102, 133, - 103, 134, 104, 135, 105, 136, 106, 137, 107, 138, 
108, 139, 109, 140, 110, - 141, 111, 142, 112, 143, 113, 144, 114, 145, 115, 146, 116, 147, 117, 148, - 118, 149, 119, 150, 120, 151, 121, 152, 122, 153, 123, 154, 124, 155, 125, - 156, 126, 157, 127, 158, 128, 128, 129, 160, 130, 161, 131, 162, 132, 163, - 133, 164, 134, 165, 135, 166, 136, 167, 137, 168, 138, 169, 139, 170, 140, - 171, 141, 172, 142, 173, 143, 174, 144, 175, 145, 176, 146, 177, 147, 178, - 148, 179, 149, 180, 150, 181, 151, 182, 152, 183, 153, 184, 154, 185, 155, - 186, 156, 187, 157, 188, 158, 189, 159, 190, 160, 160, 161, 192, 162, 193, - 163, 194, 164, 195, 165, 196, 166, 197, 167, 198, 168, 199, 169, 200, 170, - 201, 171, 202, 172, 203, 173, 204, 174, 205, 175, 206, 176, 207, 177, 208, - 178, 209, 179, 210, 180, 211, 181, 212, 182, 213, 183, 214, 184, 215, 185, - 216, 186, 217, 187, 218, 188, 219, 189, 220, 190, 221, 191, 222, 192, 192, - 193, 224, 194, 225, 195, 226, 196, 227, 197, 228, 198, 229, 199, 230, 200, - 231, 201, 232, 202, 233, 203, 234, 204, 235, 205, 236, 206, 237, 207, 238, - 208, 239, 209, 240, 210, 241, 211, 242, 212, 243, 213, 244, 214, 245, 215, - 246, 216, 247, 217, 248, 218, 249, 219, 250, 220, 251, 221, 252, 222, 253, - 223, 254, 0, 0 -}; - -DECLARE_ALIGNED(16, static const int16_t, - mcol_scan_8x32_neighbors[257 * MAX_NEIGHBORS]) = { - 0, 0, 0, 0, 8, 8, 16, 16, 24, 24, 32, 32, 40, 40, 48, - 48, 56, 56, 64, 64, 72, 72, 80, 80, 88, 88, 96, 96, 104, 104, - 112, 112, 120, 120, 128, 128, 136, 136, 144, 144, 152, 152, 160, 160, 168, - 168, 176, 176, 184, 184, 192, 192, 200, 200, 208, 208, 216, 216, 224, 224, - 232, 232, 240, 240, 0, 0, 1, 8, 9, 16, 17, 24, 25, 32, 33, - 40, 41, 48, 49, 56, 57, 64, 65, 72, 73, 80, 81, 88, 89, 96, - 97, 104, 105, 112, 113, 120, 121, 128, 129, 136, 137, 144, 145, 152, 153, - 160, 161, 168, 169, 176, 177, 184, 185, 192, 193, 200, 201, 208, 209, 216, - 217, 224, 225, 232, 233, 240, 241, 248, 1, 1, 2, 9, 10, 17, 18, - 25, 26, 33, 34, 41, 42, 49, 50, 57, 58, 65, 66, 73, 74, 81, - 82, 89, 90, 97, 98, 
105, 106, 113, 114, 121, 122, 129, 130, 137, 138, - 145, 146, 153, 154, 161, 162, 169, 170, 177, 178, 185, 186, 193, 194, 201, - 202, 209, 210, 217, 218, 225, 226, 233, 234, 241, 242, 249, 2, 2, 3, - 10, 11, 18, 19, 26, 27, 34, 35, 42, 43, 50, 51, 58, 59, 66, - 67, 74, 75, 82, 83, 90, 91, 98, 99, 106, 107, 114, 115, 122, 123, - 130, 131, 138, 139, 146, 147, 154, 155, 162, 163, 170, 171, 178, 179, 186, - 187, 194, 195, 202, 203, 210, 211, 218, 219, 226, 227, 234, 235, 242, 243, - 250, 3, 3, 4, 11, 12, 19, 20, 27, 28, 35, 36, 43, 44, 51, - 52, 59, 60, 67, 68, 75, 76, 83, 84, 91, 92, 99, 100, 107, 108, - 115, 116, 123, 124, 131, 132, 139, 140, 147, 148, 155, 156, 163, 164, 171, - 172, 179, 180, 187, 188, 195, 196, 203, 204, 211, 212, 219, 220, 227, 228, - 235, 236, 243, 244, 251, 4, 4, 5, 12, 13, 20, 21, 28, 29, 36, - 37, 44, 45, 52, 53, 60, 61, 68, 69, 76, 77, 84, 85, 92, 93, - 100, 101, 108, 109, 116, 117, 124, 125, 132, 133, 140, 141, 148, 149, 156, - 157, 164, 165, 172, 173, 180, 181, 188, 189, 196, 197, 204, 205, 212, 213, - 220, 221, 228, 229, 236, 237, 244, 245, 252, 5, 5, 6, 13, 14, 21, - 22, 29, 30, 37, 38, 45, 46, 53, 54, 61, 62, 69, 70, 77, 78, - 85, 86, 93, 94, 101, 102, 109, 110, 117, 118, 125, 126, 133, 134, 141, - 142, 149, 150, 157, 158, 165, 166, 173, 174, 181, 182, 189, 190, 197, 198, - 205, 206, 213, 214, 221, 222, 229, 230, 237, 238, 245, 246, 253, 6, 6, - 7, 14, 15, 22, 23, 30, 31, 38, 39, 46, 47, 54, 55, 62, 63, - 70, 71, 78, 79, 86, 87, 94, 95, 102, 103, 110, 111, 118, 119, 126, - 127, 134, 135, 142, 143, 150, 151, 158, 159, 166, 167, 174, 175, 182, 183, - 190, 191, 198, 199, 206, 207, 214, 215, 222, 223, 230, 231, 238, 239, 246, - 247, 254, 0, 0 -}; - -DECLARE_ALIGNED(16, static const int16_t, - mcol_scan_32x8_neighbors[257 * MAX_NEIGHBORS]) = { - 0, 0, 0, 0, 32, 32, 64, 64, 96, 96, 128, 128, 160, 160, 192, 192, - 0, 0, 1, 32, 33, 64, 65, 96, 97, 128, 129, 160, 161, 192, 193, 224, - 1, 1, 2, 33, 34, 65, 66, 97, 98, 129, 130, 161, 162, 193, 194, 
225, - 2, 2, 3, 34, 35, 66, 67, 98, 99, 130, 131, 162, 163, 194, 195, 226, - 3, 3, 4, 35, 36, 67, 68, 99, 100, 131, 132, 163, 164, 195, 196, 227, - 4, 4, 5, 36, 37, 68, 69, 100, 101, 132, 133, 164, 165, 196, 197, 228, - 5, 5, 6, 37, 38, 69, 70, 101, 102, 133, 134, 165, 166, 197, 198, 229, - 6, 6, 7, 38, 39, 70, 71, 102, 103, 134, 135, 166, 167, 198, 199, 230, - 7, 7, 8, 39, 40, 71, 72, 103, 104, 135, 136, 167, 168, 199, 200, 231, - 8, 8, 9, 40, 41, 72, 73, 104, 105, 136, 137, 168, 169, 200, 201, 232, - 9, 9, 10, 41, 42, 73, 74, 105, 106, 137, 138, 169, 170, 201, 202, 233, - 10, 10, 11, 42, 43, 74, 75, 106, 107, 138, 139, 170, 171, 202, 203, 234, - 11, 11, 12, 43, 44, 75, 76, 107, 108, 139, 140, 171, 172, 203, 204, 235, - 12, 12, 13, 44, 45, 76, 77, 108, 109, 140, 141, 172, 173, 204, 205, 236, - 13, 13, 14, 45, 46, 77, 78, 109, 110, 141, 142, 173, 174, 205, 206, 237, - 14, 14, 15, 46, 47, 78, 79, 110, 111, 142, 143, 174, 175, 206, 207, 238, - 15, 15, 16, 47, 48, 79, 80, 111, 112, 143, 144, 175, 176, 207, 208, 239, - 16, 16, 17, 48, 49, 80, 81, 112, 113, 144, 145, 176, 177, 208, 209, 240, - 17, 17, 18, 49, 50, 81, 82, 113, 114, 145, 146, 177, 178, 209, 210, 241, - 18, 18, 19, 50, 51, 82, 83, 114, 115, 146, 147, 178, 179, 210, 211, 242, - 19, 19, 20, 51, 52, 83, 84, 115, 116, 147, 148, 179, 180, 211, 212, 243, - 20, 20, 21, 52, 53, 84, 85, 116, 117, 148, 149, 180, 181, 212, 213, 244, - 21, 21, 22, 53, 54, 85, 86, 117, 118, 149, 150, 181, 182, 213, 214, 245, - 22, 22, 23, 54, 55, 86, 87, 118, 119, 150, 151, 182, 183, 214, 215, 246, - 23, 23, 24, 55, 56, 87, 88, 119, 120, 151, 152, 183, 184, 215, 216, 247, - 24, 24, 25, 56, 57, 88, 89, 120, 121, 152, 153, 184, 185, 216, 217, 248, - 25, 25, 26, 57, 58, 89, 90, 121, 122, 153, 154, 185, 186, 217, 218, 249, - 26, 26, 27, 58, 59, 90, 91, 122, 123, 154, 155, 186, 187, 218, 219, 250, - 27, 27, 28, 59, 60, 91, 92, 123, 124, 155, 156, 187, 188, 219, 220, 251, - 28, 28, 29, 60, 61, 92, 93, 124, 125, 156, 157, 188, 189, 220, 221, 
252, - 29, 29, 30, 61, 62, 93, 94, 125, 126, 157, 158, 189, 190, 221, 222, 253, - 30, 30, 31, 62, 63, 94, 95, 126, 127, 158, 159, 190, 191, 222, 223, 254, - 0, 0 -}; - -DECLARE_ALIGNED(16, static const int16_t, - mcol_scan_8x8_neighbors[65 * MAX_NEIGHBORS]) = { - 0, 0, 0, 0, 8, 8, 16, 16, 24, 24, 32, 32, 40, 40, 48, 48, 0, 0, 1, - 8, 9, 16, 17, 24, 25, 32, 33, 40, 41, 48, 49, 56, 1, 1, 2, 9, 10, 17, - 18, 25, 26, 33, 34, 41, 42, 49, 50, 57, 2, 2, 3, 10, 11, 18, 19, 26, 27, - 34, 35, 42, 43, 50, 51, 58, 3, 3, 4, 11, 12, 19, 20, 27, 28, 35, 36, 43, - 44, 51, 52, 59, 4, 4, 5, 12, 13, 20, 21, 28, 29, 36, 37, 44, 45, 52, 53, - 60, 5, 5, 6, 13, 14, 21, 22, 29, 30, 37, 38, 45, 46, 53, 54, 61, 6, 6, - 7, 14, 15, 22, 23, 30, 31, 38, 39, 46, 47, 54, 55, 62, 0, 0, -}; - -DECLARE_ALIGNED(16, static const int16_t, - mrow_scan_8x8_neighbors[65 * MAX_NEIGHBORS]) = { - 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 0, 0, 1, - 8, 2, 9, 3, 10, 4, 11, 5, 12, 6, 13, 7, 14, 8, 8, 9, 16, 10, 17, - 11, 18, 12, 19, 13, 20, 14, 21, 15, 22, 16, 16, 17, 24, 18, 25, 19, 26, 20, - 27, 21, 28, 22, 29, 23, 30, 24, 24, 25, 32, 26, 33, 27, 34, 28, 35, 29, 36, - 30, 37, 31, 38, 32, 32, 33, 40, 34, 41, 35, 42, 36, 43, 37, 44, 38, 45, 39, - 46, 40, 40, 41, 48, 42, 49, 43, 50, 44, 51, 45, 52, 46, 53, 47, 54, 48, 48, - 49, 56, 50, 57, 51, 58, 52, 59, 53, 60, 54, 61, 55, 62, 0, 0, -}; - -DECLARE_ALIGNED(16, static const int16_t, - default_scan_8x8_neighbors[65 * MAX_NEIGHBORS]) = { - 0, 0, 0, 0, 0, 0, 8, 8, 1, 8, 1, 1, 2, 2, 2, 9, 9, 16, 16, - 16, 24, 24, 17, 24, 10, 17, 3, 10, 3, 3, 4, 4, 4, 11, 11, 18, 18, 25, - 25, 32, 32, 32, 40, 40, 33, 40, 26, 33, 19, 26, 12, 19, 5, 12, 5, 5, 6, - 6, 6, 13, 13, 20, 20, 27, 27, 34, 34, 41, 41, 48, 48, 48, 49, 56, 42, 49, - 35, 42, 28, 35, 21, 28, 14, 21, 7, 14, 15, 22, 22, 29, 29, 36, 36, 43, 43, - 50, 50, 57, 51, 58, 44, 51, 37, 44, 30, 37, 23, 30, 31, 38, 38, 45, 45, 52, - 52, 59, 53, 60, 46, 53, 39, 46, 47, 54, 54, 61, 55, 62, 0, 0 -}; - -DECLARE_ALIGNED(16, 
static const int16_t, - default_scan_8x16_neighbors[129 * MAX_NEIGHBORS]) = { - 0, 0, 0, 0, 0, 0, 1, 1, 1, 8, 8, 8, 2, 2, 2, - 9, 9, 16, 16, 16, 3, 3, 3, 10, 10, 17, 17, 24, 24, 24, - 4, 4, 4, 11, 11, 18, 18, 25, 25, 32, 32, 32, 5, 5, 5, - 12, 12, 19, 19, 26, 26, 33, 33, 40, 40, 40, 6, 6, 6, 13, - 13, 20, 20, 27, 27, 34, 34, 41, 41, 48, 48, 48, 7, 14, 14, - 21, 21, 28, 28, 35, 35, 42, 42, 49, 49, 56, 56, 56, 15, 22, - 22, 29, 29, 36, 36, 43, 43, 50, 50, 57, 57, 64, 64, 64, 23, - 30, 30, 37, 37, 44, 44, 51, 51, 58, 58, 65, 65, 72, 72, 72, - 31, 38, 38, 45, 45, 52, 52, 59, 59, 66, 66, 73, 73, 80, 80, - 80, 39, 46, 46, 53, 53, 60, 60, 67, 67, 74, 74, 81, 81, 88, - 88, 88, 47, 54, 54, 61, 61, 68, 68, 75, 75, 82, 82, 89, 89, - 96, 96, 96, 55, 62, 62, 69, 69, 76, 76, 83, 83, 90, 90, 97, - 97, 104, 104, 104, 63, 70, 70, 77, 77, 84, 84, 91, 91, 98, 98, - 105, 105, 112, 112, 112, 71, 78, 78, 85, 85, 92, 92, 99, 99, 106, - 106, 113, 113, 120, 79, 86, 86, 93, 93, 100, 100, 107, 107, 114, 114, - 121, 87, 94, 94, 101, 101, 108, 108, 115, 115, 122, 95, 102, 102, 109, - 109, 116, 116, 123, 103, 110, 110, 117, 117, 124, 111, 118, 118, 125, 119, - 126, 0, 0 -}; - -DECLARE_ALIGNED(16, static const int16_t, - default_scan_16x8_neighbors[129 * MAX_NEIGHBORS]) = { - 0, 0, 0, 0, 0, 0, 1, 1, 1, 16, 16, 16, 2, 2, 2, - 17, 17, 32, 32, 32, 3, 3, 3, 18, 18, 33, 33, 48, 48, 48, - 4, 4, 4, 19, 19, 34, 34, 49, 49, 64, 64, 64, 5, 5, 5, - 20, 20, 35, 35, 50, 50, 65, 65, 80, 80, 80, 6, 6, 6, 21, - 21, 36, 36, 51, 51, 66, 66, 81, 81, 96, 96, 96, 7, 7, 7, - 22, 22, 37, 37, 52, 52, 67, 67, 82, 82, 97, 97, 112, 8, 8, - 8, 23, 23, 38, 38, 53, 53, 68, 68, 83, 83, 98, 98, 113, 9, - 9, 9, 24, 24, 39, 39, 54, 54, 69, 69, 84, 84, 99, 99, 114, - 10, 10, 10, 25, 25, 40, 40, 55, 55, 70, 70, 85, 85, 100, 100, - 115, 11, 11, 11, 26, 26, 41, 41, 56, 56, 71, 71, 86, 86, 101, - 101, 116, 12, 12, 12, 27, 27, 42, 42, 57, 57, 72, 72, 87, 87, - 102, 102, 117, 13, 13, 13, 28, 28, 43, 43, 58, 58, 73, 73, 88, - 88, 103, 
103, 118, 14, 14, 14, 29, 29, 44, 44, 59, 59, 74, 74, - 89, 89, 104, 104, 119, 15, 30, 30, 45, 45, 60, 60, 75, 75, 90, - 90, 105, 105, 120, 31, 46, 46, 61, 61, 76, 76, 91, 91, 106, 106, - 121, 47, 62, 62, 77, 77, 92, 92, 107, 107, 122, 63, 78, 78, 93, - 93, 108, 108, 123, 79, 94, 94, 109, 109, 124, 95, 110, 110, 125, 111, - 126, 0, 0 -}; - -DECLARE_ALIGNED(16, static const int16_t, - mcol_scan_8x16_neighbors[129 * MAX_NEIGHBORS]) = { - 0, 0, 0, 0, 8, 8, 16, 16, 24, 24, 32, 32, 40, 40, 48, 48, - 56, 56, 64, 64, 72, 72, 80, 80, 88, 88, 96, 96, 104, 104, 112, 112, - 0, 0, 1, 8, 9, 16, 17, 24, 25, 32, 33, 40, 41, 48, 49, 56, - 57, 64, 65, 72, 73, 80, 81, 88, 89, 96, 97, 104, 105, 112, 113, 120, - 1, 1, 2, 9, 10, 17, 18, 25, 26, 33, 34, 41, 42, 49, 50, 57, - 58, 65, 66, 73, 74, 81, 82, 89, 90, 97, 98, 105, 106, 113, 114, 121, - 2, 2, 3, 10, 11, 18, 19, 26, 27, 34, 35, 42, 43, 50, 51, 58, - 59, 66, 67, 74, 75, 82, 83, 90, 91, 98, 99, 106, 107, 114, 115, 122, - 3, 3, 4, 11, 12, 19, 20, 27, 28, 35, 36, 43, 44, 51, 52, 59, - 60, 67, 68, 75, 76, 83, 84, 91, 92, 99, 100, 107, 108, 115, 116, 123, - 4, 4, 5, 12, 13, 20, 21, 28, 29, 36, 37, 44, 45, 52, 53, 60, - 61, 68, 69, 76, 77, 84, 85, 92, 93, 100, 101, 108, 109, 116, 117, 124, - 5, 5, 6, 13, 14, 21, 22, 29, 30, 37, 38, 45, 46, 53, 54, 61, - 62, 69, 70, 77, 78, 85, 86, 93, 94, 101, 102, 109, 110, 117, 118, 125, - 6, 6, 7, 14, 15, 22, 23, 30, 31, 38, 39, 46, 47, 54, 55, 62, - 63, 70, 71, 78, 79, 86, 87, 94, 95, 102, 103, 110, 111, 118, 119, 126, - 0, 0 -}; - -DECLARE_ALIGNED(16, static const int16_t, - mcol_scan_16x8_neighbors[129 * MAX_NEIGHBORS]) = { - 0, 0, 0, 0, 16, 16, 32, 32, 48, 48, 64, 64, 80, 80, 96, 96, - 0, 0, 1, 16, 17, 32, 33, 48, 49, 64, 65, 80, 81, 96, 97, 112, - 1, 1, 2, 17, 18, 33, 34, 49, 50, 65, 66, 81, 82, 97, 98, 113, - 2, 2, 3, 18, 19, 34, 35, 50, 51, 66, 67, 82, 83, 98, 99, 114, - 3, 3, 4, 19, 20, 35, 36, 51, 52, 67, 68, 83, 84, 99, 100, 115, - 4, 4, 5, 20, 21, 36, 37, 52, 53, 68, 69, 84, 85, 100, 101, 
116, - 5, 5, 6, 21, 22, 37, 38, 53, 54, 69, 70, 85, 86, 101, 102, 117, - 6, 6, 7, 22, 23, 38, 39, 54, 55, 70, 71, 86, 87, 102, 103, 118, - 7, 7, 8, 23, 24, 39, 40, 55, 56, 71, 72, 87, 88, 103, 104, 119, - 8, 8, 9, 24, 25, 40, 41, 56, 57, 72, 73, 88, 89, 104, 105, 120, - 9, 9, 10, 25, 26, 41, 42, 57, 58, 73, 74, 89, 90, 105, 106, 121, - 10, 10, 11, 26, 27, 42, 43, 58, 59, 74, 75, 90, 91, 106, 107, 122, - 11, 11, 12, 27, 28, 43, 44, 59, 60, 75, 76, 91, 92, 107, 108, 123, - 12, 12, 13, 28, 29, 44, 45, 60, 61, 76, 77, 92, 93, 108, 109, 124, - 13, 13, 14, 29, 30, 45, 46, 61, 62, 77, 78, 93, 94, 109, 110, 125, - 14, 14, 15, 30, 31, 46, 47, 62, 63, 78, 79, 94, 95, 110, 111, 126, - 0, 0 -}; - -DECLARE_ALIGNED(16, static const int16_t, - mrow_scan_8x16_neighbors[129 * MAX_NEIGHBORS]) = { - 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, - 6, 0, 0, 1, 8, 2, 9, 3, 10, 4, 11, 5, 12, 6, 13, - 7, 14, 8, 8, 9, 16, 10, 17, 11, 18, 12, 19, 13, 20, 14, - 21, 15, 22, 16, 16, 17, 24, 18, 25, 19, 26, 20, 27, 21, 28, - 22, 29, 23, 30, 24, 24, 25, 32, 26, 33, 27, 34, 28, 35, 29, - 36, 30, 37, 31, 38, 32, 32, 33, 40, 34, 41, 35, 42, 36, 43, - 37, 44, 38, 45, 39, 46, 40, 40, 41, 48, 42, 49, 43, 50, 44, - 51, 45, 52, 46, 53, 47, 54, 48, 48, 49, 56, 50, 57, 51, 58, - 52, 59, 53, 60, 54, 61, 55, 62, 56, 56, 57, 64, 58, 65, 59, - 66, 60, 67, 61, 68, 62, 69, 63, 70, 64, 64, 65, 72, 66, 73, - 67, 74, 68, 75, 69, 76, 70, 77, 71, 78, 72, 72, 73, 80, 74, - 81, 75, 82, 76, 83, 77, 84, 78, 85, 79, 86, 80, 80, 81, 88, - 82, 89, 83, 90, 84, 91, 85, 92, 86, 93, 87, 94, 88, 88, 89, - 96, 90, 97, 91, 98, 92, 99, 93, 100, 94, 101, 95, 102, 96, 96, - 97, 104, 98, 105, 99, 106, 100, 107, 101, 108, 102, 109, 103, 110, 104, - 104, 105, 112, 106, 113, 107, 114, 108, 115, 109, 116, 110, 117, 111, 118, - 112, 112, 113, 120, 114, 121, 115, 122, 116, 123, 117, 124, 118, 125, 119, - 126, 0, 0 -}; - -DECLARE_ALIGNED(16, static const int16_t, - mrow_scan_16x8_neighbors[129 * MAX_NEIGHBORS]) = { - 0, 0, 0, 0, 1, 1, 2, 2, 
3, 3, 4, 4, 5, 5, 6, - 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, - 14, 14, 0, 0, 1, 16, 2, 17, 3, 18, 4, 19, 5, 20, 6, - 21, 7, 22, 8, 23, 9, 24, 10, 25, 11, 26, 12, 27, 13, 28, - 14, 29, 15, 30, 16, 16, 17, 32, 18, 33, 19, 34, 20, 35, 21, - 36, 22, 37, 23, 38, 24, 39, 25, 40, 26, 41, 27, 42, 28, 43, - 29, 44, 30, 45, 31, 46, 32, 32, 33, 48, 34, 49, 35, 50, 36, - 51, 37, 52, 38, 53, 39, 54, 40, 55, 41, 56, 42, 57, 43, 58, - 44, 59, 45, 60, 46, 61, 47, 62, 48, 48, 49, 64, 50, 65, 51, - 66, 52, 67, 53, 68, 54, 69, 55, 70, 56, 71, 57, 72, 58, 73, - 59, 74, 60, 75, 61, 76, 62, 77, 63, 78, 64, 64, 65, 80, 66, - 81, 67, 82, 68, 83, 69, 84, 70, 85, 71, 86, 72, 87, 73, 88, - 74, 89, 75, 90, 76, 91, 77, 92, 78, 93, 79, 94, 80, 80, 81, - 96, 82, 97, 83, 98, 84, 99, 85, 100, 86, 101, 87, 102, 88, 103, - 89, 104, 90, 105, 91, 106, 92, 107, 93, 108, 94, 109, 95, 110, 96, - 96, 97, 112, 98, 113, 99, 114, 100, 115, 101, 116, 102, 117, 103, 118, - 104, 119, 105, 120, 106, 121, 107, 122, 108, 123, 109, 124, 110, 125, 111, - 126, 0, 0 -}; - -DECLARE_ALIGNED(16, static const int16_t, - default_scan_16x32_neighbors[513 * MAX_NEIGHBORS]) = { - 0, 0, 0, 0, 0, 0, 1, 1, 1, 16, 16, 16, 2, 2, 2, - 17, 17, 32, 32, 32, 3, 3, 3, 18, 18, 33, 33, 48, 48, 48, - 4, 4, 4, 19, 19, 34, 34, 49, 49, 64, 64, 64, 5, 5, 5, - 20, 20, 35, 35, 50, 50, 65, 65, 80, 80, 80, 6, 6, 6, 21, - 21, 36, 36, 51, 51, 66, 66, 81, 81, 96, 96, 96, 7, 7, 7, - 22, 22, 37, 37, 52, 52, 67, 67, 82, 82, 97, 97, 112, 112, 112, - 8, 8, 8, 23, 23, 38, 38, 53, 53, 68, 68, 83, 83, 98, 98, - 113, 113, 128, 128, 128, 9, 9, 9, 24, 24, 39, 39, 54, 54, 69, - 69, 84, 84, 99, 99, 114, 114, 129, 129, 144, 144, 144, 10, 10, 10, - 25, 25, 40, 40, 55, 55, 70, 70, 85, 85, 100, 100, 115, 115, 130, - 130, 145, 145, 160, 160, 160, 11, 11, 11, 26, 26, 41, 41, 56, 56, - 71, 71, 86, 86, 101, 101, 116, 116, 131, 131, 146, 146, 161, 161, 176, - 176, 176, 12, 12, 12, 27, 27, 42, 42, 57, 57, 72, 72, 87, 87, - 102, 102, 117, 117, 132, 132, 147, 
147, 162, 162, 177, 177, 192, 192, 192, - 13, 13, 13, 28, 28, 43, 43, 58, 58, 73, 73, 88, 88, 103, 103, - 118, 118, 133, 133, 148, 148, 163, 163, 178, 178, 193, 193, 208, 208, 208, - 14, 14, 14, 29, 29, 44, 44, 59, 59, 74, 74, 89, 89, 104, 104, - 119, 119, 134, 134, 149, 149, 164, 164, 179, 179, 194, 194, 209, 209, 224, - 224, 224, 15, 30, 30, 45, 45, 60, 60, 75, 75, 90, 90, 105, 105, - 120, 120, 135, 135, 150, 150, 165, 165, 180, 180, 195, 195, 210, 210, 225, - 225, 240, 240, 240, 31, 46, 46, 61, 61, 76, 76, 91, 91, 106, 106, - 121, 121, 136, 136, 151, 151, 166, 166, 181, 181, 196, 196, 211, 211, 226, - 226, 241, 241, 256, 256, 256, 47, 62, 62, 77, 77, 92, 92, 107, 107, - 122, 122, 137, 137, 152, 152, 167, 167, 182, 182, 197, 197, 212, 212, 227, - 227, 242, 242, 257, 257, 272, 272, 272, 63, 78, 78, 93, 93, 108, 108, - 123, 123, 138, 138, 153, 153, 168, 168, 183, 183, 198, 198, 213, 213, 228, - 228, 243, 243, 258, 258, 273, 273, 288, 288, 288, 79, 94, 94, 109, 109, - 124, 124, 139, 139, 154, 154, 169, 169, 184, 184, 199, 199, 214, 214, 229, - 229, 244, 244, 259, 259, 274, 274, 289, 289, 304, 304, 304, 95, 110, 110, - 125, 125, 140, 140, 155, 155, 170, 170, 185, 185, 200, 200, 215, 215, 230, - 230, 245, 245, 260, 260, 275, 275, 290, 290, 305, 305, 320, 320, 320, 111, - 126, 126, 141, 141, 156, 156, 171, 171, 186, 186, 201, 201, 216, 216, 231, - 231, 246, 246, 261, 261, 276, 276, 291, 291, 306, 306, 321, 321, 336, 336, - 336, 127, 142, 142, 157, 157, 172, 172, 187, 187, 202, 202, 217, 217, 232, - 232, 247, 247, 262, 262, 277, 277, 292, 292, 307, 307, 322, 322, 337, 337, - 352, 352, 352, 143, 158, 158, 173, 173, 188, 188, 203, 203, 218, 218, 233, - 233, 248, 248, 263, 263, 278, 278, 293, 293, 308, 308, 323, 323, 338, 338, - 353, 353, 368, 368, 368, 159, 174, 174, 189, 189, 204, 204, 219, 219, 234, - 234, 249, 249, 264, 264, 279, 279, 294, 294, 309, 309, 324, 324, 339, 339, - 354, 354, 369, 369, 384, 384, 384, 175, 190, 190, 205, 205, 220, 220, 235, - 235, 250, 250, 
265, 265, 280, 280, 295, 295, 310, 310, 325, 325, 340, 340, - 355, 355, 370, 370, 385, 385, 400, 400, 400, 191, 206, 206, 221, 221, 236, - 236, 251, 251, 266, 266, 281, 281, 296, 296, 311, 311, 326, 326, 341, 341, - 356, 356, 371, 371, 386, 386, 401, 401, 416, 416, 416, 207, 222, 222, 237, - 237, 252, 252, 267, 267, 282, 282, 297, 297, 312, 312, 327, 327, 342, 342, - 357, 357, 372, 372, 387, 387, 402, 402, 417, 417, 432, 432, 432, 223, 238, - 238, 253, 253, 268, 268, 283, 283, 298, 298, 313, 313, 328, 328, 343, 343, - 358, 358, 373, 373, 388, 388, 403, 403, 418, 418, 433, 433, 448, 448, 448, - 239, 254, 254, 269, 269, 284, 284, 299, 299, 314, 314, 329, 329, 344, 344, - 359, 359, 374, 374, 389, 389, 404, 404, 419, 419, 434, 434, 449, 449, 464, - 464, 464, 255, 270, 270, 285, 285, 300, 300, 315, 315, 330, 330, 345, 345, - 360, 360, 375, 375, 390, 390, 405, 405, 420, 420, 435, 435, 450, 450, 465, - 465, 480, 480, 480, 271, 286, 286, 301, 301, 316, 316, 331, 331, 346, 346, - 361, 361, 376, 376, 391, 391, 406, 406, 421, 421, 436, 436, 451, 451, 466, - 466, 481, 481, 496, 287, 302, 302, 317, 317, 332, 332, 347, 347, 362, 362, - 377, 377, 392, 392, 407, 407, 422, 422, 437, 437, 452, 452, 467, 467, 482, - 482, 497, 303, 318, 318, 333, 333, 348, 348, 363, 363, 378, 378, 393, 393, - 408, 408, 423, 423, 438, 438, 453, 453, 468, 468, 483, 483, 498, 319, 334, - 334, 349, 349, 364, 364, 379, 379, 394, 394, 409, 409, 424, 424, 439, 439, - 454, 454, 469, 469, 484, 484, 499, 335, 350, 350, 365, 365, 380, 380, 395, - 395, 410, 410, 425, 425, 440, 440, 455, 455, 470, 470, 485, 485, 500, 351, - 366, 366, 381, 381, 396, 396, 411, 411, 426, 426, 441, 441, 456, 456, 471, - 471, 486, 486, 501, 367, 382, 382, 397, 397, 412, 412, 427, 427, 442, 442, - 457, 457, 472, 472, 487, 487, 502, 383, 398, 398, 413, 413, 428, 428, 443, - 443, 458, 458, 473, 473, 488, 488, 503, 399, 414, 414, 429, 429, 444, 444, - 459, 459, 474, 474, 489, 489, 504, 415, 430, 430, 445, 445, 460, 460, 475, - 475, 490, 
490, 505, 431, 446, 446, 461, 461, 476, 476, 491, 491, 506, 447, - 462, 462, 477, 477, 492, 492, 507, 463, 478, 478, 493, 493, 508, 479, 494, - 494, 509, 495, 510, 0, 0 -}; - DECLARE_ALIGNED(16, static const int16_t, - default_scan_32x16_neighbors[513 * MAX_NEIGHBORS]) = { - 0, 0, 0, 0, 0, 0, 1, 1, 1, 32, 32, 32, 2, 2, 2, - 33, 33, 64, 64, 64, 3, 3, 3, 34, 34, 65, 65, 96, 96, 96, - 4, 4, 4, 35, 35, 66, 66, 97, 97, 128, 128, 128, 5, 5, 5, - 36, 36, 67, 67, 98, 98, 129, 129, 160, 160, 160, 6, 6, 6, 37, - 37, 68, 68, 99, 99, 130, 130, 161, 161, 192, 192, 192, 7, 7, 7, - 38, 38, 69, 69, 100, 100, 131, 131, 162, 162, 193, 193, 224, 224, 224, - 8, 8, 8, 39, 39, 70, 70, 101, 101, 132, 132, 163, 163, 194, 194, - 225, 225, 256, 256, 256, 9, 9, 9, 40, 40, 71, 71, 102, 102, 133, - 133, 164, 164, 195, 195, 226, 226, 257, 257, 288, 288, 288, 10, 10, 10, - 41, 41, 72, 72, 103, 103, 134, 134, 165, 165, 196, 196, 227, 227, 258, - 258, 289, 289, 320, 320, 320, 11, 11, 11, 42, 42, 73, 73, 104, 104, - 135, 135, 166, 166, 197, 197, 228, 228, 259, 259, 290, 290, 321, 321, 352, - 352, 352, 12, 12, 12, 43, 43, 74, 74, 105, 105, 136, 136, 167, 167, - 198, 198, 229, 229, 260, 260, 291, 291, 322, 322, 353, 353, 384, 384, 384, - 13, 13, 13, 44, 44, 75, 75, 106, 106, 137, 137, 168, 168, 199, 199, - 230, 230, 261, 261, 292, 292, 323, 323, 354, 354, 385, 385, 416, 416, 416, - 14, 14, 14, 45, 45, 76, 76, 107, 107, 138, 138, 169, 169, 200, 200, - 231, 231, 262, 262, 293, 293, 324, 324, 355, 355, 386, 386, 417, 417, 448, - 448, 448, 15, 15, 15, 46, 46, 77, 77, 108, 108, 139, 139, 170, 170, - 201, 201, 232, 232, 263, 263, 294, 294, 325, 325, 356, 356, 387, 387, 418, - 418, 449, 449, 480, 16, 16, 16, 47, 47, 78, 78, 109, 109, 140, 140, - 171, 171, 202, 202, 233, 233, 264, 264, 295, 295, 326, 326, 357, 357, 388, - 388, 419, 419, 450, 450, 481, 17, 17, 17, 48, 48, 79, 79, 110, 110, - 141, 141, 172, 172, 203, 203, 234, 234, 265, 265, 296, 296, 327, 327, 358, - 358, 389, 389, 420, 420, 451, 451, 482, 
18, 18, 18, 49, 49, 80, 80, - 111, 111, 142, 142, 173, 173, 204, 204, 235, 235, 266, 266, 297, 297, 328, - 328, 359, 359, 390, 390, 421, 421, 452, 452, 483, 19, 19, 19, 50, 50, - 81, 81, 112, 112, 143, 143, 174, 174, 205, 205, 236, 236, 267, 267, 298, - 298, 329, 329, 360, 360, 391, 391, 422, 422, 453, 453, 484, 20, 20, 20, - 51, 51, 82, 82, 113, 113, 144, 144, 175, 175, 206, 206, 237, 237, 268, - 268, 299, 299, 330, 330, 361, 361, 392, 392, 423, 423, 454, 454, 485, 21, - 21, 21, 52, 52, 83, 83, 114, 114, 145, 145, 176, 176, 207, 207, 238, - 238, 269, 269, 300, 300, 331, 331, 362, 362, 393, 393, 424, 424, 455, 455, - 486, 22, 22, 22, 53, 53, 84, 84, 115, 115, 146, 146, 177, 177, 208, - 208, 239, 239, 270, 270, 301, 301, 332, 332, 363, 363, 394, 394, 425, 425, - 456, 456, 487, 23, 23, 23, 54, 54, 85, 85, 116, 116, 147, 147, 178, - 178, 209, 209, 240, 240, 271, 271, 302, 302, 333, 333, 364, 364, 395, 395, - 426, 426, 457, 457, 488, 24, 24, 24, 55, 55, 86, 86, 117, 117, 148, - 148, 179, 179, 210, 210, 241, 241, 272, 272, 303, 303, 334, 334, 365, 365, - 396, 396, 427, 427, 458, 458, 489, 25, 25, 25, 56, 56, 87, 87, 118, - 118, 149, 149, 180, 180, 211, 211, 242, 242, 273, 273, 304, 304, 335, 335, - 366, 366, 397, 397, 428, 428, 459, 459, 490, 26, 26, 26, 57, 57, 88, - 88, 119, 119, 150, 150, 181, 181, 212, 212, 243, 243, 274, 274, 305, 305, - 336, 336, 367, 367, 398, 398, 429, 429, 460, 460, 491, 27, 27, 27, 58, - 58, 89, 89, 120, 120, 151, 151, 182, 182, 213, 213, 244, 244, 275, 275, - 306, 306, 337, 337, 368, 368, 399, 399, 430, 430, 461, 461, 492, 28, 28, - 28, 59, 59, 90, 90, 121, 121, 152, 152, 183, 183, 214, 214, 245, 245, - 276, 276, 307, 307, 338, 338, 369, 369, 400, 400, 431, 431, 462, 462, 493, - 29, 29, 29, 60, 60, 91, 91, 122, 122, 153, 153, 184, 184, 215, 215, - 246, 246, 277, 277, 308, 308, 339, 339, 370, 370, 401, 401, 432, 432, 463, - 463, 494, 30, 30, 30, 61, 61, 92, 92, 123, 123, 154, 154, 185, 185, - 216, 216, 247, 247, 278, 278, 309, 309, 340, 340, 
371, 371, 402, 402, 433, - 433, 464, 464, 495, 31, 62, 62, 93, 93, 124, 124, 155, 155, 186, 186, - 217, 217, 248, 248, 279, 279, 310, 310, 341, 341, 372, 372, 403, 403, 434, - 434, 465, 465, 496, 63, 94, 94, 125, 125, 156, 156, 187, 187, 218, 218, - 249, 249, 280, 280, 311, 311, 342, 342, 373, 373, 404, 404, 435, 435, 466, - 466, 497, 95, 126, 126, 157, 157, 188, 188, 219, 219, 250, 250, 281, 281, - 312, 312, 343, 343, 374, 374, 405, 405, 436, 436, 467, 467, 498, 127, 158, - 158, 189, 189, 220, 220, 251, 251, 282, 282, 313, 313, 344, 344, 375, 375, - 406, 406, 437, 437, 468, 468, 499, 159, 190, 190, 221, 221, 252, 252, 283, - 283, 314, 314, 345, 345, 376, 376, 407, 407, 438, 438, 469, 469, 500, 191, - 222, 222, 253, 253, 284, 284, 315, 315, 346, 346, 377, 377, 408, 408, 439, - 439, 470, 470, 501, 223, 254, 254, 285, 285, 316, 316, 347, 347, 378, 378, - 409, 409, 440, 440, 471, 471, 502, 255, 286, 286, 317, 317, 348, 348, 379, - 379, 410, 410, 441, 441, 472, 472, 503, 287, 318, 318, 349, 349, 380, 380, - 411, 411, 442, 442, 473, 473, 504, 319, 350, 350, 381, 381, 412, 412, 443, - 443, 474, 474, 505, 351, 382, 382, 413, 413, 444, 444, 475, 475, 506, 383, - 414, 414, 445, 445, 476, 476, 507, 415, 446, 446, 477, 477, 508, 447, 478, - 478, 509, 479, 510, 0, 0 -}; - -DECLARE_ALIGNED(16, static const int16_t, - mcol_scan_16x32_neighbors[513 * MAX_NEIGHBORS]) = { - 0, 0, 0, 0, 16, 16, 32, 32, 48, 48, 64, 64, 80, 80, 96, - 96, 112, 112, 128, 128, 144, 144, 160, 160, 176, 176, 192, 192, 208, 208, - 224, 224, 240, 240, 256, 256, 272, 272, 288, 288, 304, 304, 320, 320, 336, - 336, 352, 352, 368, 368, 384, 384, 400, 400, 416, 416, 432, 432, 448, 448, - 464, 464, 480, 480, 0, 0, 1, 16, 17, 32, 33, 48, 49, 64, 65, - 80, 81, 96, 97, 112, 113, 128, 129, 144, 145, 160, 161, 176, 177, 192, - 193, 208, 209, 224, 225, 240, 241, 256, 257, 272, 273, 288, 289, 304, 305, - 320, 321, 336, 337, 352, 353, 368, 369, 384, 385, 400, 401, 416, 417, 432, - 433, 448, 449, 464, 465, 480, 481, 496, 
1, 1, 2, 17, 18, 33, 34, - 49, 50, 65, 66, 81, 82, 97, 98, 113, 114, 129, 130, 145, 146, 161, - 162, 177, 178, 193, 194, 209, 210, 225, 226, 241, 242, 257, 258, 273, 274, - 289, 290, 305, 306, 321, 322, 337, 338, 353, 354, 369, 370, 385, 386, 401, - 402, 417, 418, 433, 434, 449, 450, 465, 466, 481, 482, 497, 2, 2, 3, - 18, 19, 34, 35, 50, 51, 66, 67, 82, 83, 98, 99, 114, 115, 130, - 131, 146, 147, 162, 163, 178, 179, 194, 195, 210, 211, 226, 227, 242, 243, - 258, 259, 274, 275, 290, 291, 306, 307, 322, 323, 338, 339, 354, 355, 370, - 371, 386, 387, 402, 403, 418, 419, 434, 435, 450, 451, 466, 467, 482, 483, - 498, 3, 3, 4, 19, 20, 35, 36, 51, 52, 67, 68, 83, 84, 99, - 100, 115, 116, 131, 132, 147, 148, 163, 164, 179, 180, 195, 196, 211, 212, - 227, 228, 243, 244, 259, 260, 275, 276, 291, 292, 307, 308, 323, 324, 339, - 340, 355, 356, 371, 372, 387, 388, 403, 404, 419, 420, 435, 436, 451, 452, - 467, 468, 483, 484, 499, 4, 4, 5, 20, 21, 36, 37, 52, 53, 68, - 69, 84, 85, 100, 101, 116, 117, 132, 133, 148, 149, 164, 165, 180, 181, - 196, 197, 212, 213, 228, 229, 244, 245, 260, 261, 276, 277, 292, 293, 308, - 309, 324, 325, 340, 341, 356, 357, 372, 373, 388, 389, 404, 405, 420, 421, - 436, 437, 452, 453, 468, 469, 484, 485, 500, 5, 5, 6, 21, 22, 37, - 38, 53, 54, 69, 70, 85, 86, 101, 102, 117, 118, 133, 134, 149, 150, - 165, 166, 181, 182, 197, 198, 213, 214, 229, 230, 245, 246, 261, 262, 277, - 278, 293, 294, 309, 310, 325, 326, 341, 342, 357, 358, 373, 374, 389, 390, - 405, 406, 421, 422, 437, 438, 453, 454, 469, 470, 485, 486, 501, 6, 6, - 7, 22, 23, 38, 39, 54, 55, 70, 71, 86, 87, 102, 103, 118, 119, - 134, 135, 150, 151, 166, 167, 182, 183, 198, 199, 214, 215, 230, 231, 246, - 247, 262, 263, 278, 279, 294, 295, 310, 311, 326, 327, 342, 343, 358, 359, - 374, 375, 390, 391, 406, 407, 422, 423, 438, 439, 454, 455, 470, 471, 486, - 487, 502, 7, 7, 8, 23, 24, 39, 40, 55, 56, 71, 72, 87, 88, - 103, 104, 119, 120, 135, 136, 151, 152, 167, 168, 183, 184, 199, 200, 215, - 
216, 231, 232, 247, 248, 263, 264, 279, 280, 295, 296, 311, 312, 327, 328, - 343, 344, 359, 360, 375, 376, 391, 392, 407, 408, 423, 424, 439, 440, 455, - 456, 471, 472, 487, 488, 503, 8, 8, 9, 24, 25, 40, 41, 56, 57, - 72, 73, 88, 89, 104, 105, 120, 121, 136, 137, 152, 153, 168, 169, 184, - 185, 200, 201, 216, 217, 232, 233, 248, 249, 264, 265, 280, 281, 296, 297, - 312, 313, 328, 329, 344, 345, 360, 361, 376, 377, 392, 393, 408, 409, 424, - 425, 440, 441, 456, 457, 472, 473, 488, 489, 504, 9, 9, 10, 25, 26, - 41, 42, 57, 58, 73, 74, 89, 90, 105, 106, 121, 122, 137, 138, 153, - 154, 169, 170, 185, 186, 201, 202, 217, 218, 233, 234, 249, 250, 265, 266, - 281, 282, 297, 298, 313, 314, 329, 330, 345, 346, 361, 362, 377, 378, 393, - 394, 409, 410, 425, 426, 441, 442, 457, 458, 473, 474, 489, 490, 505, 10, - 10, 11, 26, 27, 42, 43, 58, 59, 74, 75, 90, 91, 106, 107, 122, - 123, 138, 139, 154, 155, 170, 171, 186, 187, 202, 203, 218, 219, 234, 235, - 250, 251, 266, 267, 282, 283, 298, 299, 314, 315, 330, 331, 346, 347, 362, - 363, 378, 379, 394, 395, 410, 411, 426, 427, 442, 443, 458, 459, 474, 475, - 490, 491, 506, 11, 11, 12, 27, 28, 43, 44, 59, 60, 75, 76, 91, - 92, 107, 108, 123, 124, 139, 140, 155, 156, 171, 172, 187, 188, 203, 204, - 219, 220, 235, 236, 251, 252, 267, 268, 283, 284, 299, 300, 315, 316, 331, - 332, 347, 348, 363, 364, 379, 380, 395, 396, 411, 412, 427, 428, 443, 444, - 459, 460, 475, 476, 491, 492, 507, 12, 12, 13, 28, 29, 44, 45, 60, - 61, 76, 77, 92, 93, 108, 109, 124, 125, 140, 141, 156, 157, 172, 173, - 188, 189, 204, 205, 220, 221, 236, 237, 252, 253, 268, 269, 284, 285, 300, - 301, 316, 317, 332, 333, 348, 349, 364, 365, 380, 381, 396, 397, 412, 413, - 428, 429, 444, 445, 460, 461, 476, 477, 492, 493, 508, 13, 13, 14, 29, - 30, 45, 46, 61, 62, 77, 78, 93, 94, 109, 110, 125, 126, 141, 142, - 157, 158, 173, 174, 189, 190, 205, 206, 221, 222, 237, 238, 253, 254, 269, - 270, 285, 286, 301, 302, 317, 318, 333, 334, 349, 350, 365, 366, 381, 382, - 
397, 398, 413, 414, 429, 430, 445, 446, 461, 462, 477, 478, 493, 494, 509, - 14, 14, 15, 30, 31, 46, 47, 62, 63, 78, 79, 94, 95, 110, 111, - 126, 127, 142, 143, 158, 159, 174, 175, 190, 191, 206, 207, 222, 223, 238, - 239, 254, 255, 270, 271, 286, 287, 302, 303, 318, 319, 334, 335, 350, 351, - 366, 367, 382, 383, 398, 399, 414, 415, 430, 431, 446, 447, 462, 463, 478, - 479, 494, 495, 510, 0, 0 -}; - -DECLARE_ALIGNED(16, static const int16_t, - mcol_scan_32x16_neighbors[513 * MAX_NEIGHBORS]) = { - 0, 0, 0, 0, 32, 32, 64, 64, 96, 96, 128, 128, 160, 160, 192, - 192, 224, 224, 256, 256, 288, 288, 320, 320, 352, 352, 384, 384, 416, 416, - 448, 448, 0, 0, 1, 32, 33, 64, 65, 96, 97, 128, 129, 160, 161, - 192, 193, 224, 225, 256, 257, 288, 289, 320, 321, 352, 353, 384, 385, 416, - 417, 448, 449, 480, 1, 1, 2, 33, 34, 65, 66, 97, 98, 129, 130, - 161, 162, 193, 194, 225, 226, 257, 258, 289, 290, 321, 322, 353, 354, 385, - 386, 417, 418, 449, 450, 481, 2, 2, 3, 34, 35, 66, 67, 98, 99, - 130, 131, 162, 163, 194, 195, 226, 227, 258, 259, 290, 291, 322, 323, 354, - 355, 386, 387, 418, 419, 450, 451, 482, 3, 3, 4, 35, 36, 67, 68, - 99, 100, 131, 132, 163, 164, 195, 196, 227, 228, 259, 260, 291, 292, 323, - 324, 355, 356, 387, 388, 419, 420, 451, 452, 483, 4, 4, 5, 36, 37, - 68, 69, 100, 101, 132, 133, 164, 165, 196, 197, 228, 229, 260, 261, 292, - 293, 324, 325, 356, 357, 388, 389, 420, 421, 452, 453, 484, 5, 5, 6, - 37, 38, 69, 70, 101, 102, 133, 134, 165, 166, 197, 198, 229, 230, 261, - 262, 293, 294, 325, 326, 357, 358, 389, 390, 421, 422, 453, 454, 485, 6, - 6, 7, 38, 39, 70, 71, 102, 103, 134, 135, 166, 167, 198, 199, 230, - 231, 262, 263, 294, 295, 326, 327, 358, 359, 390, 391, 422, 423, 454, 455, - 486, 7, 7, 8, 39, 40, 71, 72, 103, 104, 135, 136, 167, 168, 199, - 200, 231, 232, 263, 264, 295, 296, 327, 328, 359, 360, 391, 392, 423, 424, - 455, 456, 487, 8, 8, 9, 40, 41, 72, 73, 104, 105, 136, 137, 168, - 169, 200, 201, 232, 233, 264, 265, 296, 297, 328, 329, 360, 361, 
392, 393, - 424, 425, 456, 457, 488, 9, 9, 10, 41, 42, 73, 74, 105, 106, 137, - 138, 169, 170, 201, 202, 233, 234, 265, 266, 297, 298, 329, 330, 361, 362, - 393, 394, 425, 426, 457, 458, 489, 10, 10, 11, 42, 43, 74, 75, 106, - 107, 138, 139, 170, 171, 202, 203, 234, 235, 266, 267, 298, 299, 330, 331, - 362, 363, 394, 395, 426, 427, 458, 459, 490, 11, 11, 12, 43, 44, 75, - 76, 107, 108, 139, 140, 171, 172, 203, 204, 235, 236, 267, 268, 299, 300, - 331, 332, 363, 364, 395, 396, 427, 428, 459, 460, 491, 12, 12, 13, 44, - 45, 76, 77, 108, 109, 140, 141, 172, 173, 204, 205, 236, 237, 268, 269, - 300, 301, 332, 333, 364, 365, 396, 397, 428, 429, 460, 461, 492, 13, 13, - 14, 45, 46, 77, 78, 109, 110, 141, 142, 173, 174, 205, 206, 237, 238, - 269, 270, 301, 302, 333, 334, 365, 366, 397, 398, 429, 430, 461, 462, 493, - 14, 14, 15, 46, 47, 78, 79, 110, 111, 142, 143, 174, 175, 206, 207, - 238, 239, 270, 271, 302, 303, 334, 335, 366, 367, 398, 399, 430, 431, 462, - 463, 494, 15, 15, 16, 47, 48, 79, 80, 111, 112, 143, 144, 175, 176, - 207, 208, 239, 240, 271, 272, 303, 304, 335, 336, 367, 368, 399, 400, 431, - 432, 463, 464, 495, 16, 16, 17, 48, 49, 80, 81, 112, 113, 144, 145, - 176, 177, 208, 209, 240, 241, 272, 273, 304, 305, 336, 337, 368, 369, 400, - 401, 432, 433, 464, 465, 496, 17, 17, 18, 49, 50, 81, 82, 113, 114, - 145, 146, 177, 178, 209, 210, 241, 242, 273, 274, 305, 306, 337, 338, 369, - 370, 401, 402, 433, 434, 465, 466, 497, 18, 18, 19, 50, 51, 82, 83, - 114, 115, 146, 147, 178, 179, 210, 211, 242, 243, 274, 275, 306, 307, 338, - 339, 370, 371, 402, 403, 434, 435, 466, 467, 498, 19, 19, 20, 51, 52, - 83, 84, 115, 116, 147, 148, 179, 180, 211, 212, 243, 244, 275, 276, 307, - 308, 339, 340, 371, 372, 403, 404, 435, 436, 467, 468, 499, 20, 20, 21, - 52, 53, 84, 85, 116, 117, 148, 149, 180, 181, 212, 213, 244, 245, 276, - 277, 308, 309, 340, 341, 372, 373, 404, 405, 436, 437, 468, 469, 500, 21, - 21, 22, 53, 54, 85, 86, 117, 118, 149, 150, 181, 182, 213, 214, 245, - 
246, 277, 278, 309, 310, 341, 342, 373, 374, 405, 406, 437, 438, 469, 470, - 501, 22, 22, 23, 54, 55, 86, 87, 118, 119, 150, 151, 182, 183, 214, - 215, 246, 247, 278, 279, 310, 311, 342, 343, 374, 375, 406, 407, 438, 439, - 470, 471, 502, 23, 23, 24, 55, 56, 87, 88, 119, 120, 151, 152, 183, - 184, 215, 216, 247, 248, 279, 280, 311, 312, 343, 344, 375, 376, 407, 408, - 439, 440, 471, 472, 503, 24, 24, 25, 56, 57, 88, 89, 120, 121, 152, - 153, 184, 185, 216, 217, 248, 249, 280, 281, 312, 313, 344, 345, 376, 377, - 408, 409, 440, 441, 472, 473, 504, 25, 25, 26, 57, 58, 89, 90, 121, - 122, 153, 154, 185, 186, 217, 218, 249, 250, 281, 282, 313, 314, 345, 346, - 377, 378, 409, 410, 441, 442, 473, 474, 505, 26, 26, 27, 58, 59, 90, - 91, 122, 123, 154, 155, 186, 187, 218, 219, 250, 251, 282, 283, 314, 315, - 346, 347, 378, 379, 410, 411, 442, 443, 474, 475, 506, 27, 27, 28, 59, - 60, 91, 92, 123, 124, 155, 156, 187, 188, 219, 220, 251, 252, 283, 284, - 315, 316, 347, 348, 379, 380, 411, 412, 443, 444, 475, 476, 507, 28, 28, - 29, 60, 61, 92, 93, 124, 125, 156, 157, 188, 189, 220, 221, 252, 253, - 284, 285, 316, 317, 348, 349, 380, 381, 412, 413, 444, 445, 476, 477, 508, - 29, 29, 30, 61, 62, 93, 94, 125, 126, 157, 158, 189, 190, 221, 222, - 253, 254, 285, 286, 317, 318, 349, 350, 381, 382, 413, 414, 445, 446, 477, - 478, 509, 30, 30, 31, 62, 63, 94, 95, 126, 127, 158, 159, 190, 191, - 222, 223, 254, 255, 286, 287, 318, 319, 350, 351, 382, 383, 414, 415, 446, - 447, 478, 479, 510, 0, 0 -}; - -DECLARE_ALIGNED(16, static const int16_t, - mrow_scan_16x32_neighbors[513 * MAX_NEIGHBORS]) = { - 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, - 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, - 14, 14, 0, 0, 1, 16, 2, 17, 3, 18, 4, 19, 5, 20, 6, - 21, 7, 22, 8, 23, 9, 24, 10, 25, 11, 26, 12, 27, 13, 28, - 14, 29, 15, 30, 16, 16, 17, 32, 18, 33, 19, 34, 20, 35, 21, - 36, 22, 37, 23, 38, 24, 39, 25, 40, 26, 41, 27, 42, 28, 43, - 29, 44, 30, 45, 31, 46, 32, 32, 33, 48, 34, 49, 35, 
50, 36, - 51, 37, 52, 38, 53, 39, 54, 40, 55, 41, 56, 42, 57, 43, 58, - 44, 59, 45, 60, 46, 61, 47, 62, 48, 48, 49, 64, 50, 65, 51, - 66, 52, 67, 53, 68, 54, 69, 55, 70, 56, 71, 57, 72, 58, 73, - 59, 74, 60, 75, 61, 76, 62, 77, 63, 78, 64, 64, 65, 80, 66, - 81, 67, 82, 68, 83, 69, 84, 70, 85, 71, 86, 72, 87, 73, 88, - 74, 89, 75, 90, 76, 91, 77, 92, 78, 93, 79, 94, 80, 80, 81, - 96, 82, 97, 83, 98, 84, 99, 85, 100, 86, 101, 87, 102, 88, 103, - 89, 104, 90, 105, 91, 106, 92, 107, 93, 108, 94, 109, 95, 110, 96, - 96, 97, 112, 98, 113, 99, 114, 100, 115, 101, 116, 102, 117, 103, 118, - 104, 119, 105, 120, 106, 121, 107, 122, 108, 123, 109, 124, 110, 125, 111, - 126, 112, 112, 113, 128, 114, 129, 115, 130, 116, 131, 117, 132, 118, 133, - 119, 134, 120, 135, 121, 136, 122, 137, 123, 138, 124, 139, 125, 140, 126, - 141, 127, 142, 128, 128, 129, 144, 130, 145, 131, 146, 132, 147, 133, 148, - 134, 149, 135, 150, 136, 151, 137, 152, 138, 153, 139, 154, 140, 155, 141, - 156, 142, 157, 143, 158, 144, 144, 145, 160, 146, 161, 147, 162, 148, 163, - 149, 164, 150, 165, 151, 166, 152, 167, 153, 168, 154, 169, 155, 170, 156, - 171, 157, 172, 158, 173, 159, 174, 160, 160, 161, 176, 162, 177, 163, 178, - 164, 179, 165, 180, 166, 181, 167, 182, 168, 183, 169, 184, 170, 185, 171, - 186, 172, 187, 173, 188, 174, 189, 175, 190, 176, 176, 177, 192, 178, 193, - 179, 194, 180, 195, 181, 196, 182, 197, 183, 198, 184, 199, 185, 200, 186, - 201, 187, 202, 188, 203, 189, 204, 190, 205, 191, 206, 192, 192, 193, 208, - 194, 209, 195, 210, 196, 211, 197, 212, 198, 213, 199, 214, 200, 215, 201, - 216, 202, 217, 203, 218, 204, 219, 205, 220, 206, 221, 207, 222, 208, 208, - 209, 224, 210, 225, 211, 226, 212, 227, 213, 228, 214, 229, 215, 230, 216, - 231, 217, 232, 218, 233, 219, 234, 220, 235, 221, 236, 222, 237, 223, 238, - 224, 224, 225, 240, 226, 241, 227, 242, 228, 243, 229, 244, 230, 245, 231, - 246, 232, 247, 233, 248, 234, 249, 235, 250, 236, 251, 237, 252, 238, 253, - 239, 254, 240, 240, 
241, 256, 242, 257, 243, 258, 244, 259, 245, 260, 246, - 261, 247, 262, 248, 263, 249, 264, 250, 265, 251, 266, 252, 267, 253, 268, - 254, 269, 255, 270, 256, 256, 257, 272, 258, 273, 259, 274, 260, 275, 261, - 276, 262, 277, 263, 278, 264, 279, 265, 280, 266, 281, 267, 282, 268, 283, - 269, 284, 270, 285, 271, 286, 272, 272, 273, 288, 274, 289, 275, 290, 276, - 291, 277, 292, 278, 293, 279, 294, 280, 295, 281, 296, 282, 297, 283, 298, - 284, 299, 285, 300, 286, 301, 287, 302, 288, 288, 289, 304, 290, 305, 291, - 306, 292, 307, 293, 308, 294, 309, 295, 310, 296, 311, 297, 312, 298, 313, - 299, 314, 300, 315, 301, 316, 302, 317, 303, 318, 304, 304, 305, 320, 306, - 321, 307, 322, 308, 323, 309, 324, 310, 325, 311, 326, 312, 327, 313, 328, - 314, 329, 315, 330, 316, 331, 317, 332, 318, 333, 319, 334, 320, 320, 321, - 336, 322, 337, 323, 338, 324, 339, 325, 340, 326, 341, 327, 342, 328, 343, - 329, 344, 330, 345, 331, 346, 332, 347, 333, 348, 334, 349, 335, 350, 336, - 336, 337, 352, 338, 353, 339, 354, 340, 355, 341, 356, 342, 357, 343, 358, - 344, 359, 345, 360, 346, 361, 347, 362, 348, 363, 349, 364, 350, 365, 351, - 366, 352, 352, 353, 368, 354, 369, 355, 370, 356, 371, 357, 372, 358, 373, - 359, 374, 360, 375, 361, 376, 362, 377, 363, 378, 364, 379, 365, 380, 366, - 381, 367, 382, 368, 368, 369, 384, 370, 385, 371, 386, 372, 387, 373, 388, - 374, 389, 375, 390, 376, 391, 377, 392, 378, 393, 379, 394, 380, 395, 381, - 396, 382, 397, 383, 398, 384, 384, 385, 400, 386, 401, 387, 402, 388, 403, - 389, 404, 390, 405, 391, 406, 392, 407, 393, 408, 394, 409, 395, 410, 396, - 411, 397, 412, 398, 413, 399, 414, 400, 400, 401, 416, 402, 417, 403, 418, - 404, 419, 405, 420, 406, 421, 407, 422, 408, 423, 409, 424, 410, 425, 411, - 426, 412, 427, 413, 428, 414, 429, 415, 430, 416, 416, 417, 432, 418, 433, - 419, 434, 420, 435, 421, 436, 422, 437, 423, 438, 424, 439, 425, 440, 426, - 441, 427, 442, 428, 443, 429, 444, 430, 445, 431, 446, 432, 432, 433, 448, - 434, 449, 435, 
450, 436, 451, 437, 452, 438, 453, 439, 454, 440, 455, 441, - 456, 442, 457, 443, 458, 444, 459, 445, 460, 446, 461, 447, 462, 448, 448, - 449, 464, 450, 465, 451, 466, 452, 467, 453, 468, 454, 469, 455, 470, 456, - 471, 457, 472, 458, 473, 459, 474, 460, 475, 461, 476, 462, 477, 463, 478, - 464, 464, 465, 480, 466, 481, 467, 482, 468, 483, 469, 484, 470, 485, 471, - 486, 472, 487, 473, 488, 474, 489, 475, 490, 476, 491, 477, 492, 478, 493, - 479, 494, 480, 480, 481, 496, 482, 497, 483, 498, 484, 499, 485, 500, 486, - 501, 487, 502, 488, 503, 489, 504, 490, 505, 491, 506, 492, 507, 493, 508, - 494, 509, 495, 510, 0, 0 -}; - -DECLARE_ALIGNED(16, static const int16_t, - mrow_scan_32x16_neighbors[513 * MAX_NEIGHBORS]) = { - 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, - 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, - 14, 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 19, 20, 20, 21, - 21, 22, 22, 23, 23, 24, 24, 25, 25, 26, 26, 27, 27, 28, 28, - 29, 29, 30, 30, 0, 0, 1, 32, 2, 33, 3, 34, 4, 35, 5, - 36, 6, 37, 7, 38, 8, 39, 9, 40, 10, 41, 11, 42, 12, 43, - 13, 44, 14, 45, 15, 46, 16, 47, 17, 48, 18, 49, 19, 50, 20, - 51, 21, 52, 22, 53, 23, 54, 24, 55, 25, 56, 26, 57, 27, 58, - 28, 59, 29, 60, 30, 61, 31, 62, 32, 32, 33, 64, 34, 65, 35, - 66, 36, 67, 37, 68, 38, 69, 39, 70, 40, 71, 41, 72, 42, 73, - 43, 74, 44, 75, 45, 76, 46, 77, 47, 78, 48, 79, 49, 80, 50, - 81, 51, 82, 52, 83, 53, 84, 54, 85, 55, 86, 56, 87, 57, 88, - 58, 89, 59, 90, 60, 91, 61, 92, 62, 93, 63, 94, 64, 64, 65, - 96, 66, 97, 67, 98, 68, 99, 69, 100, 70, 101, 71, 102, 72, 103, - 73, 104, 74, 105, 75, 106, 76, 107, 77, 108, 78, 109, 79, 110, 80, - 111, 81, 112, 82, 113, 83, 114, 84, 115, 85, 116, 86, 117, 87, 118, - 88, 119, 89, 120, 90, 121, 91, 122, 92, 123, 93, 124, 94, 125, 95, - 126, 96, 96, 97, 128, 98, 129, 99, 130, 100, 131, 101, 132, 102, 133, - 103, 134, 104, 135, 105, 136, 106, 137, 107, 138, 108, 139, 109, 140, 110, - 141, 111, 142, 112, 143, 113, 144, 114, 145, 115, 146, 116, 147, 117, 
148, - 118, 149, 119, 150, 120, 151, 121, 152, 122, 153, 123, 154, 124, 155, 125, - 156, 126, 157, 127, 158, 128, 128, 129, 160, 130, 161, 131, 162, 132, 163, - 133, 164, 134, 165, 135, 166, 136, 167, 137, 168, 138, 169, 139, 170, 140, - 171, 141, 172, 142, 173, 143, 174, 144, 175, 145, 176, 146, 177, 147, 178, - 148, 179, 149, 180, 150, 181, 151, 182, 152, 183, 153, 184, 154, 185, 155, - 186, 156, 187, 157, 188, 158, 189, 159, 190, 160, 160, 161, 192, 162, 193, - 163, 194, 164, 195, 165, 196, 166, 197, 167, 198, 168, 199, 169, 200, 170, - 201, 171, 202, 172, 203, 173, 204, 174, 205, 175, 206, 176, 207, 177, 208, - 178, 209, 179, 210, 180, 211, 181, 212, 182, 213, 183, 214, 184, 215, 185, - 216, 186, 217, 187, 218, 188, 219, 189, 220, 190, 221, 191, 222, 192, 192, - 193, 224, 194, 225, 195, 226, 196, 227, 197, 228, 198, 229, 199, 230, 200, - 231, 201, 232, 202, 233, 203, 234, 204, 235, 205, 236, 206, 237, 207, 238, - 208, 239, 209, 240, 210, 241, 211, 242, 212, 243, 213, 244, 214, 245, 215, - 246, 216, 247, 217, 248, 218, 249, 219, 250, 220, 251, 221, 252, 222, 253, - 223, 254, 224, 224, 225, 256, 226, 257, 227, 258, 228, 259, 229, 260, 230, - 261, 231, 262, 232, 263, 233, 264, 234, 265, 235, 266, 236, 267, 237, 268, - 238, 269, 239, 270, 240, 271, 241, 272, 242, 273, 243, 274, 244, 275, 245, - 276, 246, 277, 247, 278, 248, 279, 249, 280, 250, 281, 251, 282, 252, 283, - 253, 284, 254, 285, 255, 286, 256, 256, 257, 288, 258, 289, 259, 290, 260, - 291, 261, 292, 262, 293, 263, 294, 264, 295, 265, 296, 266, 297, 267, 298, - 268, 299, 269, 300, 270, 301, 271, 302, 272, 303, 273, 304, 274, 305, 275, - 306, 276, 307, 277, 308, 278, 309, 279, 310, 280, 311, 281, 312, 282, 313, - 283, 314, 284, 315, 285, 316, 286, 317, 287, 318, 288, 288, 289, 320, 290, - 321, 291, 322, 292, 323, 293, 324, 294, 325, 295, 326, 296, 327, 297, 328, - 298, 329, 299, 330, 300, 331, 301, 332, 302, 333, 303, 334, 304, 335, 305, - 336, 306, 337, 307, 338, 308, 339, 309, 340, 310, 341, 311, 342, 
312, 343, - 313, 344, 314, 345, 315, 346, 316, 347, 317, 348, 318, 349, 319, 350, 320, - 320, 321, 352, 322, 353, 323, 354, 324, 355, 325, 356, 326, 357, 327, 358, - 328, 359, 329, 360, 330, 361, 331, 362, 332, 363, 333, 364, 334, 365, 335, - 366, 336, 367, 337, 368, 338, 369, 339, 370, 340, 371, 341, 372, 342, 373, - 343, 374, 344, 375, 345, 376, 346, 377, 347, 378, 348, 379, 349, 380, 350, - 381, 351, 382, 352, 352, 353, 384, 354, 385, 355, 386, 356, 387, 357, 388, - 358, 389, 359, 390, 360, 391, 361, 392, 362, 393, 363, 394, 364, 395, 365, - 396, 366, 397, 367, 398, 368, 399, 369, 400, 370, 401, 371, 402, 372, 403, - 373, 404, 374, 405, 375, 406, 376, 407, 377, 408, 378, 409, 379, 410, 380, - 411, 381, 412, 382, 413, 383, 414, 384, 384, 385, 416, 386, 417, 387, 418, - 388, 419, 389, 420, 390, 421, 391, 422, 392, 423, 393, 424, 394, 425, 395, - 426, 396, 427, 397, 428, 398, 429, 399, 430, 400, 431, 401, 432, 402, 433, - 403, 434, 404, 435, 405, 436, 406, 437, 407, 438, 408, 439, 409, 440, 410, - 441, 411, 442, 412, 443, 413, 444, 414, 445, 415, 446, 416, 416, 417, 448, - 418, 449, 419, 450, 420, 451, 421, 452, 422, 453, 423, 454, 424, 455, 425, - 456, 426, 457, 427, 458, 428, 459, 429, 460, 430, 461, 431, 462, 432, 463, - 433, 464, 434, 465, 435, 466, 436, 467, 437, 468, 438, 469, 439, 470, 440, - 471, 441, 472, 442, 473, 443, 474, 444, 475, 445, 476, 446, 477, 447, 478, - 448, 448, 449, 480, 450, 481, 451, 482, 452, 483, 453, 484, 454, 485, 455, - 486, 456, 487, 457, 488, 458, 489, 459, 490, 460, 491, 461, 492, 462, 493, - 463, 494, 464, 495, 465, 496, 466, 497, 467, 498, 468, 499, 469, 500, 470, - 501, 471, 502, 472, 503, 473, 504, 474, 505, 475, 506, 476, 507, 477, 508, - 478, 509, 479, 510, 0, 0 -}; - -DECLARE_ALIGNED(16, static const int16_t, - mcol_scan_16x16_neighbors[257 * MAX_NEIGHBORS]) = { - 0, 0, 0, 0, 16, 16, 32, 32, 48, 48, 64, 64, 80, 80, 96, - 96, 112, 112, 128, 128, 144, 144, 160, 160, 176, 176, 192, 192, 208, 208, - 224, 224, 0, 0, 1, 16, 17, 
32, 33, 48, 49, 64, 65, 80, 81, - 96, 97, 112, 113, 128, 129, 144, 145, 160, 161, 176, 177, 192, 193, 208, - 209, 224, 225, 240, 1, 1, 2, 17, 18, 33, 34, 49, 50, 65, 66, - 81, 82, 97, 98, 113, 114, 129, 130, 145, 146, 161, 162, 177, 178, 193, - 194, 209, 210, 225, 226, 241, 2, 2, 3, 18, 19, 34, 35, 50, 51, - 66, 67, 82, 83, 98, 99, 114, 115, 130, 131, 146, 147, 162, 163, 178, - 179, 194, 195, 210, 211, 226, 227, 242, 3, 3, 4, 19, 20, 35, 36, - 51, 52, 67, 68, 83, 84, 99, 100, 115, 116, 131, 132, 147, 148, 163, - 164, 179, 180, 195, 196, 211, 212, 227, 228, 243, 4, 4, 5, 20, 21, - 36, 37, 52, 53, 68, 69, 84, 85, 100, 101, 116, 117, 132, 133, 148, - 149, 164, 165, 180, 181, 196, 197, 212, 213, 228, 229, 244, 5, 5, 6, - 21, 22, 37, 38, 53, 54, 69, 70, 85, 86, 101, 102, 117, 118, 133, - 134, 149, 150, 165, 166, 181, 182, 197, 198, 213, 214, 229, 230, 245, 6, - 6, 7, 22, 23, 38, 39, 54, 55, 70, 71, 86, 87, 102, 103, 118, - 119, 134, 135, 150, 151, 166, 167, 182, 183, 198, 199, 214, 215, 230, 231, - 246, 7, 7, 8, 23, 24, 39, 40, 55, 56, 71, 72, 87, 88, 103, - 104, 119, 120, 135, 136, 151, 152, 167, 168, 183, 184, 199, 200, 215, 216, - 231, 232, 247, 8, 8, 9, 24, 25, 40, 41, 56, 57, 72, 73, 88, - 89, 104, 105, 120, 121, 136, 137, 152, 153, 168, 169, 184, 185, 200, 201, - 216, 217, 232, 233, 248, 9, 9, 10, 25, 26, 41, 42, 57, 58, 73, - 74, 89, 90, 105, 106, 121, 122, 137, 138, 153, 154, 169, 170, 185, 186, - 201, 202, 217, 218, 233, 234, 249, 10, 10, 11, 26, 27, 42, 43, 58, - 59, 74, 75, 90, 91, 106, 107, 122, 123, 138, 139, 154, 155, 170, 171, - 186, 187, 202, 203, 218, 219, 234, 235, 250, 11, 11, 12, 27, 28, 43, - 44, 59, 60, 75, 76, 91, 92, 107, 108, 123, 124, 139, 140, 155, 156, - 171, 172, 187, 188, 203, 204, 219, 220, 235, 236, 251, 12, 12, 13, 28, - 29, 44, 45, 60, 61, 76, 77, 92, 93, 108, 109, 124, 125, 140, 141, - 156, 157, 172, 173, 188, 189, 204, 205, 220, 221, 236, 237, 252, 13, 13, - 14, 29, 30, 45, 46, 61, 62, 77, 78, 93, 94, 109, 110, 125, 126, - 141, 142, 
157, 158, 173, 174, 189, 190, 205, 206, 221, 222, 237, 238, 253, - 14, 14, 15, 30, 31, 46, 47, 62, 63, 78, 79, 94, 95, 110, 111, - 126, 127, 142, 143, 158, 159, 174, 175, 190, 191, 206, 207, 222, 223, 238, - 239, 254, 0, 0, -}; - -DECLARE_ALIGNED(16, static const int16_t, - mrow_scan_16x16_neighbors[257 * MAX_NEIGHBORS]) = { - 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, - 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, - 14, 14, 0, 0, 1, 16, 2, 17, 3, 18, 4, 19, 5, 20, 6, - 21, 7, 22, 8, 23, 9, 24, 10, 25, 11, 26, 12, 27, 13, 28, - 14, 29, 15, 30, 16, 16, 17, 32, 18, 33, 19, 34, 20, 35, 21, - 36, 22, 37, 23, 38, 24, 39, 25, 40, 26, 41, 27, 42, 28, 43, - 29, 44, 30, 45, 31, 46, 32, 32, 33, 48, 34, 49, 35, 50, 36, - 51, 37, 52, 38, 53, 39, 54, 40, 55, 41, 56, 42, 57, 43, 58, - 44, 59, 45, 60, 46, 61, 47, 62, 48, 48, 49, 64, 50, 65, 51, - 66, 52, 67, 53, 68, 54, 69, 55, 70, 56, 71, 57, 72, 58, 73, - 59, 74, 60, 75, 61, 76, 62, 77, 63, 78, 64, 64, 65, 80, 66, - 81, 67, 82, 68, 83, 69, 84, 70, 85, 71, 86, 72, 87, 73, 88, - 74, 89, 75, 90, 76, 91, 77, 92, 78, 93, 79, 94, 80, 80, 81, - 96, 82, 97, 83, 98, 84, 99, 85, 100, 86, 101, 87, 102, 88, 103, - 89, 104, 90, 105, 91, 106, 92, 107, 93, 108, 94, 109, 95, 110, 96, - 96, 97, 112, 98, 113, 99, 114, 100, 115, 101, 116, 102, 117, 103, 118, - 104, 119, 105, 120, 106, 121, 107, 122, 108, 123, 109, 124, 110, 125, 111, - 126, 112, 112, 113, 128, 114, 129, 115, 130, 116, 131, 117, 132, 118, 133, - 119, 134, 120, 135, 121, 136, 122, 137, 123, 138, 124, 139, 125, 140, 126, - 141, 127, 142, 128, 128, 129, 144, 130, 145, 131, 146, 132, 147, 133, 148, - 134, 149, 135, 150, 136, 151, 137, 152, 138, 153, 139, 154, 140, 155, 141, - 156, 142, 157, 143, 158, 144, 144, 145, 160, 146, 161, 147, 162, 148, 163, - 149, 164, 150, 165, 151, 166, 152, 167, 153, 168, 154, 169, 155, 170, 156, - 171, 157, 172, 158, 173, 159, 174, 160, 160, 161, 176, 162, 177, 163, 178, - 164, 179, 165, 180, 166, 181, 167, 182, 168, 183, 169, 184, 170, 185, 
171, - 186, 172, 187, 173, 188, 174, 189, 175, 190, 176, 176, 177, 192, 178, 193, - 179, 194, 180, 195, 181, 196, 182, 197, 183, 198, 184, 199, 185, 200, 186, - 201, 187, 202, 188, 203, 189, 204, 190, 205, 191, 206, 192, 192, 193, 208, - 194, 209, 195, 210, 196, 211, 197, 212, 198, 213, 199, 214, 200, 215, 201, - 216, 202, 217, 203, 218, 204, 219, 205, 220, 206, 221, 207, 222, 208, 208, - 209, 224, 210, 225, 211, 226, 212, 227, 213, 228, 214, 229, 215, 230, 216, - 231, 217, 232, 218, 233, 219, 234, 220, 235, 221, 236, 222, 237, 223, 238, - 224, 224, 225, 240, 226, 241, 227, 242, 228, 243, 229, 244, 230, 245, 231, - 246, 232, 247, 233, 248, 234, 249, 235, 250, 236, 251, 237, 252, 238, 253, - 239, 254, 0, 0, -}; - -DECLARE_ALIGNED(16, static const int16_t, - default_scan_16x16_neighbors[257 * MAX_NEIGHBORS]) = { - 0, 0, 0, 0, 0, 0, 16, 16, 1, 16, 1, 1, 2, 2, 2, - 17, 17, 32, 32, 32, 48, 48, 33, 48, 18, 33, 3, 18, 3, 3, - 4, 4, 4, 19, 19, 34, 34, 49, 49, 64, 64, 64, 80, 80, 65, - 80, 50, 65, 35, 50, 20, 35, 5, 20, 5, 5, 6, 6, 6, 21, - 21, 36, 36, 51, 51, 66, 66, 81, 81, 96, 96, 96, 112, 112, 97, - 112, 82, 97, 67, 82, 52, 67, 37, 52, 22, 37, 7, 22, 7, 7, - 8, 8, 8, 23, 23, 38, 38, 53, 53, 68, 68, 83, 83, 98, 98, - 113, 113, 128, 128, 128, 144, 144, 129, 144, 114, 129, 99, 114, 84, 99, - 69, 84, 54, 69, 39, 54, 24, 39, 9, 24, 9, 9, 10, 10, 10, - 25, 25, 40, 40, 55, 55, 70, 70, 85, 85, 100, 100, 115, 115, 130, - 130, 145, 145, 160, 160, 160, 176, 176, 161, 176, 146, 161, 131, 146, 116, - 131, 101, 116, 86, 101, 71, 86, 56, 71, 41, 56, 26, 41, 11, 26, - 11, 11, 12, 12, 12, 27, 27, 42, 42, 57, 57, 72, 72, 87, 87, - 102, 102, 117, 117, 132, 132, 147, 147, 162, 162, 177, 177, 192, 192, 192, - 208, 208, 193, 208, 178, 193, 163, 178, 148, 163, 133, 148, 118, 133, 103, - 118, 88, 103, 73, 88, 58, 73, 43, 58, 28, 43, 13, 28, 13, 13, - 14, 14, 14, 29, 29, 44, 44, 59, 59, 74, 74, 89, 89, 104, 104, - 119, 119, 134, 134, 149, 149, 164, 164, 179, 179, 194, 194, 209, 209, 224, - 224, 
224, 225, 240, 210, 225, 195, 210, 180, 195, 165, 180, 150, 165, 135, - 150, 120, 135, 105, 120, 90, 105, 75, 90, 60, 75, 45, 60, 30, 45, - 15, 30, 31, 46, 46, 61, 61, 76, 76, 91, 91, 106, 106, 121, 121, - 136, 136, 151, 151, 166, 166, 181, 181, 196, 196, 211, 211, 226, 226, 241, - 227, 242, 212, 227, 197, 212, 182, 197, 167, 182, 152, 167, 137, 152, 122, - 137, 107, 122, 92, 107, 77, 92, 62, 77, 47, 62, 63, 78, 78, 93, - 93, 108, 108, 123, 123, 138, 138, 153, 153, 168, 168, 183, 183, 198, 198, - 213, 213, 228, 228, 243, 229, 244, 214, 229, 199, 214, 184, 199, 169, 184, - 154, 169, 139, 154, 124, 139, 109, 124, 94, 109, 79, 94, 95, 110, 110, - 125, 125, 140, 140, 155, 155, 170, 170, 185, 185, 200, 200, 215, 215, 230, - 230, 245, 231, 246, 216, 231, 201, 216, 186, 201, 171, 186, 156, 171, 141, - 156, 126, 141, 111, 126, 127, 142, 142, 157, 157, 172, 172, 187, 187, 202, - 202, 217, 217, 232, 232, 247, 233, 248, 218, 233, 203, 218, 188, 203, 173, - 188, 158, 173, 143, 158, 159, 174, 174, 189, 189, 204, 204, 219, 219, 234, - 234, 249, 235, 250, 220, 235, 205, 220, 190, 205, 175, 190, 191, 206, 206, - 221, 221, 236, 236, 251, 237, 252, 222, 237, 207, 222, 223, 238, 238, 253, - 239, 254, 0, 0 -}; - -DECLARE_ALIGNED(16, static const int16_t, - mcol_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]) = { - 0, 0, 0, 0, 32, 32, 64, 64, 96, 96, 128, 128, 160, 160, - 192, 192, 224, 224, 256, 256, 288, 288, 320, 320, 352, 352, 384, 384, - 416, 416, 448, 448, 480, 480, 512, 512, 544, 544, 576, 576, 608, 608, - 640, 640, 672, 672, 704, 704, 736, 736, 768, 768, 800, 800, 832, 832, - 864, 864, 896, 896, 928, 928, 960, 960, 0, 0, 1, 32, 33, 64, - 65, 96, 97, 128, 129, 160, 161, 192, 193, 224, 225, 256, 257, 288, - 289, 320, 321, 352, 353, 384, 385, 416, 417, 448, 449, 480, 481, 512, - 513, 544, 545, 576, 577, 608, 609, 640, 641, 672, 673, 704, 705, 736, - 737, 768, 769, 800, 801, 832, 833, 864, 865, 896, 897, 928, 929, 960, - 961, 992, 1, 1, 2, 33, 34, 65, 66, 97, 98, 129, 130, 161, - 162, 
193, 194, 225, 226, 257, 258, 289, 290, 321, 322, 353, 354, 385, - 386, 417, 418, 449, 450, 481, 482, 513, 514, 545, 546, 577, 578, 609, - 610, 641, 642, 673, 674, 705, 706, 737, 738, 769, 770, 801, 802, 833, - 834, 865, 866, 897, 898, 929, 930, 961, 962, 993, 2, 2, 3, 34, - 35, 66, 67, 98, 99, 130, 131, 162, 163, 194, 195, 226, 227, 258, - 259, 290, 291, 322, 323, 354, 355, 386, 387, 418, 419, 450, 451, 482, - 483, 514, 515, 546, 547, 578, 579, 610, 611, 642, 643, 674, 675, 706, - 707, 738, 739, 770, 771, 802, 803, 834, 835, 866, 867, 898, 899, 930, - 931, 962, 963, 994, 3, 3, 4, 35, 36, 67, 68, 99, 100, 131, - 132, 163, 164, 195, 196, 227, 228, 259, 260, 291, 292, 323, 324, 355, - 356, 387, 388, 419, 420, 451, 452, 483, 484, 515, 516, 547, 548, 579, - 580, 611, 612, 643, 644, 675, 676, 707, 708, 739, 740, 771, 772, 803, - 804, 835, 836, 867, 868, 899, 900, 931, 932, 963, 964, 995, 4, 4, - 5, 36, 37, 68, 69, 100, 101, 132, 133, 164, 165, 196, 197, 228, - 229, 260, 261, 292, 293, 324, 325, 356, 357, 388, 389, 420, 421, 452, - 453, 484, 485, 516, 517, 548, 549, 580, 581, 612, 613, 644, 645, 676, - 677, 708, 709, 740, 741, 772, 773, 804, 805, 836, 837, 868, 869, 900, - 901, 932, 933, 964, 965, 996, 5, 5, 6, 37, 38, 69, 70, 101, - 102, 133, 134, 165, 166, 197, 198, 229, 230, 261, 262, 293, 294, 325, - 326, 357, 358, 389, 390, 421, 422, 453, 454, 485, 486, 517, 518, 549, - 550, 581, 582, 613, 614, 645, 646, 677, 678, 709, 710, 741, 742, 773, - 774, 805, 806, 837, 838, 869, 870, 901, 902, 933, 934, 965, 966, 997, - 6, 6, 7, 38, 39, 70, 71, 102, 103, 134, 135, 166, 167, 198, - 199, 230, 231, 262, 263, 294, 295, 326, 327, 358, 359, 390, 391, 422, - 423, 454, 455, 486, 487, 518, 519, 550, 551, 582, 583, 614, 615, 646, - 647, 678, 679, 710, 711, 742, 743, 774, 775, 806, 807, 838, 839, 870, - 871, 902, 903, 934, 935, 966, 967, 998, 7, 7, 8, 39, 40, 71, - 72, 103, 104, 135, 136, 167, 168, 199, 200, 231, 232, 263, 264, 295, - 296, 327, 328, 359, 360, 391, 392, 423, 424, 455, 
456, 487, 488, 519, - 520, 551, 552, 583, 584, 615, 616, 647, 648, 679, 680, 711, 712, 743, - 744, 775, 776, 807, 808, 839, 840, 871, 872, 903, 904, 935, 936, 967, - 968, 999, 8, 8, 9, 40, 41, 72, 73, 104, 105, 136, 137, 168, - 169, 200, 201, 232, 233, 264, 265, 296, 297, 328, 329, 360, 361, 392, - 393, 424, 425, 456, 457, 488, 489, 520, 521, 552, 553, 584, 585, 616, - 617, 648, 649, 680, 681, 712, 713, 744, 745, 776, 777, 808, 809, 840, - 841, 872, 873, 904, 905, 936, 937, 968, 969, 1000, 9, 9, 10, 41, - 42, 73, 74, 105, 106, 137, 138, 169, 170, 201, 202, 233, 234, 265, - 266, 297, 298, 329, 330, 361, 362, 393, 394, 425, 426, 457, 458, 489, - 490, 521, 522, 553, 554, 585, 586, 617, 618, 649, 650, 681, 682, 713, - 714, 745, 746, 777, 778, 809, 810, 841, 842, 873, 874, 905, 906, 937, - 938, 969, 970, 1001, 10, 10, 11, 42, 43, 74, 75, 106, 107, 138, - 139, 170, 171, 202, 203, 234, 235, 266, 267, 298, 299, 330, 331, 362, - 363, 394, 395, 426, 427, 458, 459, 490, 491, 522, 523, 554, 555, 586, - 587, 618, 619, 650, 651, 682, 683, 714, 715, 746, 747, 778, 779, 810, - 811, 842, 843, 874, 875, 906, 907, 938, 939, 970, 971, 1002, 11, 11, - 12, 43, 44, 75, 76, 107, 108, 139, 140, 171, 172, 203, 204, 235, - 236, 267, 268, 299, 300, 331, 332, 363, 364, 395, 396, 427, 428, 459, - 460, 491, 492, 523, 524, 555, 556, 587, 588, 619, 620, 651, 652, 683, - 684, 715, 716, 747, 748, 779, 780, 811, 812, 843, 844, 875, 876, 907, - 908, 939, 940, 971, 972, 1003, 12, 12, 13, 44, 45, 76, 77, 108, - 109, 140, 141, 172, 173, 204, 205, 236, 237, 268, 269, 300, 301, 332, - 333, 364, 365, 396, 397, 428, 429, 460, 461, 492, 493, 524, 525, 556, - 557, 588, 589, 620, 621, 652, 653, 684, 685, 716, 717, 748, 749, 780, - 781, 812, 813, 844, 845, 876, 877, 908, 909, 940, 941, 972, 973, 1004, - 13, 13, 14, 45, 46, 77, 78, 109, 110, 141, 142, 173, 174, 205, - 206, 237, 238, 269, 270, 301, 302, 333, 334, 365, 366, 397, 398, 429, - 430, 461, 462, 493, 494, 525, 526, 557, 558, 589, 590, 621, 622, 653, - 
654, 685, 686, 717, 718, 749, 750, 781, 782, 813, 814, 845, 846, 877, - 878, 909, 910, 941, 942, 973, 974, 1005, 14, 14, 15, 46, 47, 78, - 79, 110, 111, 142, 143, 174, 175, 206, 207, 238, 239, 270, 271, 302, - 303, 334, 335, 366, 367, 398, 399, 430, 431, 462, 463, 494, 495, 526, - 527, 558, 559, 590, 591, 622, 623, 654, 655, 686, 687, 718, 719, 750, - 751, 782, 783, 814, 815, 846, 847, 878, 879, 910, 911, 942, 943, 974, - 975, 1006, 15, 15, 16, 47, 48, 79, 80, 111, 112, 143, 144, 175, - 176, 207, 208, 239, 240, 271, 272, 303, 304, 335, 336, 367, 368, 399, - 400, 431, 432, 463, 464, 495, 496, 527, 528, 559, 560, 591, 592, 623, - 624, 655, 656, 687, 688, 719, 720, 751, 752, 783, 784, 815, 816, 847, - 848, 879, 880, 911, 912, 943, 944, 975, 976, 1007, 16, 16, 17, 48, - 49, 80, 81, 112, 113, 144, 145, 176, 177, 208, 209, 240, 241, 272, - 273, 304, 305, 336, 337, 368, 369, 400, 401, 432, 433, 464, 465, 496, - 497, 528, 529, 560, 561, 592, 593, 624, 625, 656, 657, 688, 689, 720, - 721, 752, 753, 784, 785, 816, 817, 848, 849, 880, 881, 912, 913, 944, - 945, 976, 977, 1008, 17, 17, 18, 49, 50, 81, 82, 113, 114, 145, - 146, 177, 178, 209, 210, 241, 242, 273, 274, 305, 306, 337, 338, 369, - 370, 401, 402, 433, 434, 465, 466, 497, 498, 529, 530, 561, 562, 593, - 594, 625, 626, 657, 658, 689, 690, 721, 722, 753, 754, 785, 786, 817, - 818, 849, 850, 881, 882, 913, 914, 945, 946, 977, 978, 1009, 18, 18, - 19, 50, 51, 82, 83, 114, 115, 146, 147, 178, 179, 210, 211, 242, - 243, 274, 275, 306, 307, 338, 339, 370, 371, 402, 403, 434, 435, 466, - 467, 498, 499, 530, 531, 562, 563, 594, 595, 626, 627, 658, 659, 690, - 691, 722, 723, 754, 755, 786, 787, 818, 819, 850, 851, 882, 883, 914, - 915, 946, 947, 978, 979, 1010, 19, 19, 20, 51, 52, 83, 84, 115, - 116, 147, 148, 179, 180, 211, 212, 243, 244, 275, 276, 307, 308, 339, - 340, 371, 372, 403, 404, 435, 436, 467, 468, 499, 500, 531, 532, 563, - 564, 595, 596, 627, 628, 659, 660, 691, 692, 723, 724, 755, 756, 787, - 788, 819, 820, 851, 
852, 883, 884, 915, 916, 947, 948, 979, 980, 1011, - 20, 20, 21, 52, 53, 84, 85, 116, 117, 148, 149, 180, 181, 212, - 213, 244, 245, 276, 277, 308, 309, 340, 341, 372, 373, 404, 405, 436, - 437, 468, 469, 500, 501, 532, 533, 564, 565, 596, 597, 628, 629, 660, - 661, 692, 693, 724, 725, 756, 757, 788, 789, 820, 821, 852, 853, 884, - 885, 916, 917, 948, 949, 980, 981, 1012, 21, 21, 22, 53, 54, 85, - 86, 117, 118, 149, 150, 181, 182, 213, 214, 245, 246, 277, 278, 309, - 310, 341, 342, 373, 374, 405, 406, 437, 438, 469, 470, 501, 502, 533, - 534, 565, 566, 597, 598, 629, 630, 661, 662, 693, 694, 725, 726, 757, - 758, 789, 790, 821, 822, 853, 854, 885, 886, 917, 918, 949, 950, 981, - 982, 1013, 22, 22, 23, 54, 55, 86, 87, 118, 119, 150, 151, 182, - 183, 214, 215, 246, 247, 278, 279, 310, 311, 342, 343, 374, 375, 406, - 407, 438, 439, 470, 471, 502, 503, 534, 535, 566, 567, 598, 599, 630, - 631, 662, 663, 694, 695, 726, 727, 758, 759, 790, 791, 822, 823, 854, - 855, 886, 887, 918, 919, 950, 951, 982, 983, 1014, 23, 23, 24, 55, - 56, 87, 88, 119, 120, 151, 152, 183, 184, 215, 216, 247, 248, 279, - 280, 311, 312, 343, 344, 375, 376, 407, 408, 439, 440, 471, 472, 503, - 504, 535, 536, 567, 568, 599, 600, 631, 632, 663, 664, 695, 696, 727, - 728, 759, 760, 791, 792, 823, 824, 855, 856, 887, 888, 919, 920, 951, - 952, 983, 984, 1015, 24, 24, 25, 56, 57, 88, 89, 120, 121, 152, - 153, 184, 185, 216, 217, 248, 249, 280, 281, 312, 313, 344, 345, 376, - 377, 408, 409, 440, 441, 472, 473, 504, 505, 536, 537, 568, 569, 600, - 601, 632, 633, 664, 665, 696, 697, 728, 729, 760, 761, 792, 793, 824, - 825, 856, 857, 888, 889, 920, 921, 952, 953, 984, 985, 1016, 25, 25, - 26, 57, 58, 89, 90, 121, 122, 153, 154, 185, 186, 217, 218, 249, - 250, 281, 282, 313, 314, 345, 346, 377, 378, 409, 410, 441, 442, 473, - 474, 505, 506, 537, 538, 569, 570, 601, 602, 633, 634, 665, 666, 697, - 698, 729, 730, 761, 762, 793, 794, 825, 826, 857, 858, 889, 890, 921, - 922, 953, 954, 985, 986, 1017, 26, 26, 
27, 58, 59, 90, 91, 122, - 123, 154, 155, 186, 187, 218, 219, 250, 251, 282, 283, 314, 315, 346, - 347, 378, 379, 410, 411, 442, 443, 474, 475, 506, 507, 538, 539, 570, - 571, 602, 603, 634, 635, 666, 667, 698, 699, 730, 731, 762, 763, 794, - 795, 826, 827, 858, 859, 890, 891, 922, 923, 954, 955, 986, 987, 1018, - 27, 27, 28, 59, 60, 91, 92, 123, 124, 155, 156, 187, 188, 219, - 220, 251, 252, 283, 284, 315, 316, 347, 348, 379, 380, 411, 412, 443, - 444, 475, 476, 507, 508, 539, 540, 571, 572, 603, 604, 635, 636, 667, - 668, 699, 700, 731, 732, 763, 764, 795, 796, 827, 828, 859, 860, 891, - 892, 923, 924, 955, 956, 987, 988, 1019, 28, 28, 29, 60, 61, 92, - 93, 124, 125, 156, 157, 188, 189, 220, 221, 252, 253, 284, 285, 316, - 317, 348, 349, 380, 381, 412, 413, 444, 445, 476, 477, 508, 509, 540, - 541, 572, 573, 604, 605, 636, 637, 668, 669, 700, 701, 732, 733, 764, - 765, 796, 797, 828, 829, 860, 861, 892, 893, 924, 925, 956, 957, 988, - 989, 1020, 29, 29, 30, 61, 62, 93, 94, 125, 126, 157, 158, 189, - 190, 221, 222, 253, 254, 285, 286, 317, 318, 349, 350, 381, 382, 413, - 414, 445, 446, 477, 478, 509, 510, 541, 542, 573, 574, 605, 606, 637, - 638, 669, 670, 701, 702, 733, 734, 765, 766, 797, 798, 829, 830, 861, - 862, 893, 894, 925, 926, 957, 958, 989, 990, 1021, 30, 30, 31, 62, - 63, 94, 95, 126, 127, 158, 159, 190, 191, 222, 223, 254, 255, 286, - 287, 318, 319, 350, 351, 382, 383, 414, 415, 446, 447, 478, 479, 510, - 511, 542, 543, 574, 575, 606, 607, 638, 639, 670, 671, 702, 703, 734, - 735, 766, 767, 798, 799, 830, 831, 862, 863, 894, 895, 926, 927, 958, - 959, 990, 991, 1022, 0, 0, -}; - -DECLARE_ALIGNED(16, static const int16_t, - mrow_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]) = { - 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, - 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, - 13, 13, 14, 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 19, - 20, 20, 21, 21, 22, 22, 23, 23, 24, 24, 25, 25, 26, 26, - 27, 27, 28, 28, 29, 29, 30, 30, 0, 0, 1, 32, 2, 33, - 3, 34, 4, 35, 
5, 36, 6, 37, 7, 38, 8, 39, 9, 40, - 10, 41, 11, 42, 12, 43, 13, 44, 14, 45, 15, 46, 16, 47, - 17, 48, 18, 49, 19, 50, 20, 51, 21, 52, 22, 53, 23, 54, - 24, 55, 25, 56, 26, 57, 27, 58, 28, 59, 29, 60, 30, 61, - 31, 62, 32, 32, 33, 64, 34, 65, 35, 66, 36, 67, 37, 68, - 38, 69, 39, 70, 40, 71, 41, 72, 42, 73, 43, 74, 44, 75, - 45, 76, 46, 77, 47, 78, 48, 79, 49, 80, 50, 81, 51, 82, - 52, 83, 53, 84, 54, 85, 55, 86, 56, 87, 57, 88, 58, 89, - 59, 90, 60, 91, 61, 92, 62, 93, 63, 94, 64, 64, 65, 96, - 66, 97, 67, 98, 68, 99, 69, 100, 70, 101, 71, 102, 72, 103, - 73, 104, 74, 105, 75, 106, 76, 107, 77, 108, 78, 109, 79, 110, - 80, 111, 81, 112, 82, 113, 83, 114, 84, 115, 85, 116, 86, 117, - 87, 118, 88, 119, 89, 120, 90, 121, 91, 122, 92, 123, 93, 124, - 94, 125, 95, 126, 96, 96, 97, 128, 98, 129, 99, 130, 100, 131, - 101, 132, 102, 133, 103, 134, 104, 135, 105, 136, 106, 137, 107, 138, - 108, 139, 109, 140, 110, 141, 111, 142, 112, 143, 113, 144, 114, 145, - 115, 146, 116, 147, 117, 148, 118, 149, 119, 150, 120, 151, 121, 152, - 122, 153, 123, 154, 124, 155, 125, 156, 126, 157, 127, 158, 128, 128, - 129, 160, 130, 161, 131, 162, 132, 163, 133, 164, 134, 165, 135, 166, - 136, 167, 137, 168, 138, 169, 139, 170, 140, 171, 141, 172, 142, 173, - 143, 174, 144, 175, 145, 176, 146, 177, 147, 178, 148, 179, 149, 180, - 150, 181, 151, 182, 152, 183, 153, 184, 154, 185, 155, 186, 156, 187, - 157, 188, 158, 189, 159, 190, 160, 160, 161, 192, 162, 193, 163, 194, - 164, 195, 165, 196, 166, 197, 167, 198, 168, 199, 169, 200, 170, 201, - 171, 202, 172, 203, 173, 204, 174, 205, 175, 206, 176, 207, 177, 208, - 178, 209, 179, 210, 180, 211, 181, 212, 182, 213, 183, 214, 184, 215, - 185, 216, 186, 217, 187, 218, 188, 219, 189, 220, 190, 221, 191, 222, - 192, 192, 193, 224, 194, 225, 195, 226, 196, 227, 197, 228, 198, 229, - 199, 230, 200, 231, 201, 232, 202, 233, 203, 234, 204, 235, 205, 236, - 206, 237, 207, 238, 208, 239, 209, 240, 210, 241, 211, 242, 212, 243, - 213, 244, 214, 245, 215, 
246, 216, 247, 217, 248, 218, 249, 219, 250, - 220, 251, 221, 252, 222, 253, 223, 254, 224, 224, 225, 256, 226, 257, - 227, 258, 228, 259, 229, 260, 230, 261, 231, 262, 232, 263, 233, 264, - 234, 265, 235, 266, 236, 267, 237, 268, 238, 269, 239, 270, 240, 271, - 241, 272, 242, 273, 243, 274, 244, 275, 245, 276, 246, 277, 247, 278, - 248, 279, 249, 280, 250, 281, 251, 282, 252, 283, 253, 284, 254, 285, - 255, 286, 256, 256, 257, 288, 258, 289, 259, 290, 260, 291, 261, 292, - 262, 293, 263, 294, 264, 295, 265, 296, 266, 297, 267, 298, 268, 299, - 269, 300, 270, 301, 271, 302, 272, 303, 273, 304, 274, 305, 275, 306, - 276, 307, 277, 308, 278, 309, 279, 310, 280, 311, 281, 312, 282, 313, - 283, 314, 284, 315, 285, 316, 286, 317, 287, 318, 288, 288, 289, 320, - 290, 321, 291, 322, 292, 323, 293, 324, 294, 325, 295, 326, 296, 327, - 297, 328, 298, 329, 299, 330, 300, 331, 301, 332, 302, 333, 303, 334, - 304, 335, 305, 336, 306, 337, 307, 338, 308, 339, 309, 340, 310, 341, - 311, 342, 312, 343, 313, 344, 314, 345, 315, 346, 316, 347, 317, 348, - 318, 349, 319, 350, 320, 320, 321, 352, 322, 353, 323, 354, 324, 355, - 325, 356, 326, 357, 327, 358, 328, 359, 329, 360, 330, 361, 331, 362, - 332, 363, 333, 364, 334, 365, 335, 366, 336, 367, 337, 368, 338, 369, - 339, 370, 340, 371, 341, 372, 342, 373, 343, 374, 344, 375, 345, 376, - 346, 377, 347, 378, 348, 379, 349, 380, 350, 381, 351, 382, 352, 352, - 353, 384, 354, 385, 355, 386, 356, 387, 357, 388, 358, 389, 359, 390, - 360, 391, 361, 392, 362, 393, 363, 394, 364, 395, 365, 396, 366, 397, - 367, 398, 368, 399, 369, 400, 370, 401, 371, 402, 372, 403, 373, 404, - 374, 405, 375, 406, 376, 407, 377, 408, 378, 409, 379, 410, 380, 411, - 381, 412, 382, 413, 383, 414, 384, 384, 385, 416, 386, 417, 387, 418, - 388, 419, 389, 420, 390, 421, 391, 422, 392, 423, 393, 424, 394, 425, - 395, 426, 396, 427, 397, 428, 398, 429, 399, 430, 400, 431, 401, 432, - 402, 433, 403, 434, 404, 435, 405, 436, 406, 437, 407, 438, 408, 439, - 409, 
440, 410, 441, 411, 442, 412, 443, 413, 444, 414, 445, 415, 446, - 416, 416, 417, 448, 418, 449, 419, 450, 420, 451, 421, 452, 422, 453, - 423, 454, 424, 455, 425, 456, 426, 457, 427, 458, 428, 459, 429, 460, - 430, 461, 431, 462, 432, 463, 433, 464, 434, 465, 435, 466, 436, 467, - 437, 468, 438, 469, 439, 470, 440, 471, 441, 472, 442, 473, 443, 474, - 444, 475, 445, 476, 446, 477, 447, 478, 448, 448, 449, 480, 450, 481, - 451, 482, 452, 483, 453, 484, 454, 485, 455, 486, 456, 487, 457, 488, - 458, 489, 459, 490, 460, 491, 461, 492, 462, 493, 463, 494, 464, 495, - 465, 496, 466, 497, 467, 498, 468, 499, 469, 500, 470, 501, 471, 502, - 472, 503, 473, 504, 474, 505, 475, 506, 476, 507, 477, 508, 478, 509, - 479, 510, 480, 480, 481, 512, 482, 513, 483, 514, 484, 515, 485, 516, - 486, 517, 487, 518, 488, 519, 489, 520, 490, 521, 491, 522, 492, 523, - 493, 524, 494, 525, 495, 526, 496, 527, 497, 528, 498, 529, 499, 530, - 500, 531, 501, 532, 502, 533, 503, 534, 504, 535, 505, 536, 506, 537, - 507, 538, 508, 539, 509, 540, 510, 541, 511, 542, 512, 512, 513, 544, - 514, 545, 515, 546, 516, 547, 517, 548, 518, 549, 519, 550, 520, 551, - 521, 552, 522, 553, 523, 554, 524, 555, 525, 556, 526, 557, 527, 558, - 528, 559, 529, 560, 530, 561, 531, 562, 532, 563, 533, 564, 534, 565, - 535, 566, 536, 567, 537, 568, 538, 569, 539, 570, 540, 571, 541, 572, - 542, 573, 543, 574, 544, 544, 545, 576, 546, 577, 547, 578, 548, 579, - 549, 580, 550, 581, 551, 582, 552, 583, 553, 584, 554, 585, 555, 586, - 556, 587, 557, 588, 558, 589, 559, 590, 560, 591, 561, 592, 562, 593, - 563, 594, 564, 595, 565, 596, 566, 597, 567, 598, 568, 599, 569, 600, - 570, 601, 571, 602, 572, 603, 573, 604, 574, 605, 575, 606, 576, 576, - 577, 608, 578, 609, 579, 610, 580, 611, 581, 612, 582, 613, 583, 614, - 584, 615, 585, 616, 586, 617, 587, 618, 588, 619, 589, 620, 590, 621, - 591, 622, 592, 623, 593, 624, 594, 625, 595, 626, 596, 627, 597, 628, - 598, 629, 599, 630, 600, 631, 601, 632, 602, 633, 603, 634, 
604, 635, - 605, 636, 606, 637, 607, 638, 608, 608, 609, 640, 610, 641, 611, 642, - 612, 643, 613, 644, 614, 645, 615, 646, 616, 647, 617, 648, 618, 649, - 619, 650, 620, 651, 621, 652, 622, 653, 623, 654, 624, 655, 625, 656, - 626, 657, 627, 658, 628, 659, 629, 660, 630, 661, 631, 662, 632, 663, - 633, 664, 634, 665, 635, 666, 636, 667, 637, 668, 638, 669, 639, 670, - 640, 640, 641, 672, 642, 673, 643, 674, 644, 675, 645, 676, 646, 677, - 647, 678, 648, 679, 649, 680, 650, 681, 651, 682, 652, 683, 653, 684, - 654, 685, 655, 686, 656, 687, 657, 688, 658, 689, 659, 690, 660, 691, - 661, 692, 662, 693, 663, 694, 664, 695, 665, 696, 666, 697, 667, 698, - 668, 699, 669, 700, 670, 701, 671, 702, 672, 672, 673, 704, 674, 705, - 675, 706, 676, 707, 677, 708, 678, 709, 679, 710, 680, 711, 681, 712, - 682, 713, 683, 714, 684, 715, 685, 716, 686, 717, 687, 718, 688, 719, - 689, 720, 690, 721, 691, 722, 692, 723, 693, 724, 694, 725, 695, 726, - 696, 727, 697, 728, 698, 729, 699, 730, 700, 731, 701, 732, 702, 733, - 703, 734, 704, 704, 705, 736, 706, 737, 707, 738, 708, 739, 709, 740, - 710, 741, 711, 742, 712, 743, 713, 744, 714, 745, 715, 746, 716, 747, - 717, 748, 718, 749, 719, 750, 720, 751, 721, 752, 722, 753, 723, 754, - 724, 755, 725, 756, 726, 757, 727, 758, 728, 759, 729, 760, 730, 761, - 731, 762, 732, 763, 733, 764, 734, 765, 735, 766, 736, 736, 737, 768, - 738, 769, 739, 770, 740, 771, 741, 772, 742, 773, 743, 774, 744, 775, - 745, 776, 746, 777, 747, 778, 748, 779, 749, 780, 750, 781, 751, 782, - 752, 783, 753, 784, 754, 785, 755, 786, 756, 787, 757, 788, 758, 789, - 759, 790, 760, 791, 761, 792, 762, 793, 763, 794, 764, 795, 765, 796, - 766, 797, 767, 798, 768, 768, 769, 800, 770, 801, 771, 802, 772, 803, - 773, 804, 774, 805, 775, 806, 776, 807, 777, 808, 778, 809, 779, 810, - 780, 811, 781, 812, 782, 813, 783, 814, 784, 815, 785, 816, 786, 817, - 787, 818, 788, 819, 789, 820, 790, 821, 791, 822, 792, 823, 793, 824, - 794, 825, 795, 826, 796, 827, 797, 828, 
798, 829, 799, 830, 800, 800, - 801, 832, 802, 833, 803, 834, 804, 835, 805, 836, 806, 837, 807, 838, - 808, 839, 809, 840, 810, 841, 811, 842, 812, 843, 813, 844, 814, 845, - 815, 846, 816, 847, 817, 848, 818, 849, 819, 850, 820, 851, 821, 852, - 822, 853, 823, 854, 824, 855, 825, 856, 826, 857, 827, 858, 828, 859, - 829, 860, 830, 861, 831, 862, 832, 832, 833, 864, 834, 865, 835, 866, - 836, 867, 837, 868, 838, 869, 839, 870, 840, 871, 841, 872, 842, 873, - 843, 874, 844, 875, 845, 876, 846, 877, 847, 878, 848, 879, 849, 880, - 850, 881, 851, 882, 852, 883, 853, 884, 854, 885, 855, 886, 856, 887, - 857, 888, 858, 889, 859, 890, 860, 891, 861, 892, 862, 893, 863, 894, - 864, 864, 865, 896, 866, 897, 867, 898, 868, 899, 869, 900, 870, 901, - 871, 902, 872, 903, 873, 904, 874, 905, 875, 906, 876, 907, 877, 908, - 878, 909, 879, 910, 880, 911, 881, 912, 882, 913, 883, 914, 884, 915, - 885, 916, 886, 917, 887, 918, 888, 919, 889, 920, 890, 921, 891, 922, - 892, 923, 893, 924, 894, 925, 895, 926, 896, 896, 897, 928, 898, 929, - 899, 930, 900, 931, 901, 932, 902, 933, 903, 934, 904, 935, 905, 936, - 906, 937, 907, 938, 908, 939, 909, 940, 910, 941, 911, 942, 912, 943, - 913, 944, 914, 945, 915, 946, 916, 947, 917, 948, 918, 949, 919, 950, - 920, 951, 921, 952, 922, 953, 923, 954, 924, 955, 925, 956, 926, 957, - 927, 958, 928, 928, 929, 960, 930, 961, 931, 962, 932, 963, 933, 964, - 934, 965, 935, 966, 936, 967, 937, 968, 938, 969, 939, 970, 940, 971, - 941, 972, 942, 973, 943, 974, 944, 975, 945, 976, 946, 977, 947, 978, - 948, 979, 949, 980, 950, 981, 951, 982, 952, 983, 953, 984, 954, 985, - 955, 986, 956, 987, 957, 988, 958, 989, 959, 990, 960, 960, 961, 992, - 962, 993, 963, 994, 964, 995, 965, 996, 966, 997, 967, 998, 968, 999, - 969, 1000, 970, 1001, 971, 1002, 972, 1003, 973, 1004, 974, 1005, 975, 1006, - 976, 1007, 977, 1008, 978, 1009, 979, 1010, 980, 1011, 981, 1012, 982, 1013, - 983, 1014, 984, 1015, 985, 1016, 986, 1017, 987, 1018, 988, 1019, 989, 1020, - 
990, 1021, 991, 1022, 0, 0, -}; - -DECLARE_ALIGNED(16, static const int16_t, - default_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]) = { - 0, 0, 0, 0, 0, 0, 32, 32, 1, 32, 1, 1, 2, 2, - 2, 33, 33, 64, 64, 64, 96, 96, 65, 96, 34, 65, 3, 34, - 3, 3, 4, 4, 4, 35, 35, 66, 66, 97, 97, 128, 128, 128, - 160, 160, 129, 160, 98, 129, 67, 98, 36, 67, 5, 36, 5, 5, - 6, 6, 6, 37, 37, 68, 68, 99, 99, 130, 130, 161, 161, 192, - 192, 192, 224, 224, 193, 224, 162, 193, 131, 162, 100, 131, 69, 100, - 38, 69, 7, 38, 7, 7, 8, 8, 8, 39, 39, 70, 70, 101, - 101, 132, 132, 163, 163, 194, 194, 225, 225, 256, 256, 256, 288, 288, - 257, 288, 226, 257, 195, 226, 164, 195, 133, 164, 102, 133, 71, 102, - 40, 71, 9, 40, 9, 9, 10, 10, 10, 41, 41, 72, 72, 103, - 103, 134, 134, 165, 165, 196, 196, 227, 227, 258, 258, 289, 289, 320, - 320, 320, 352, 352, 321, 352, 290, 321, 259, 290, 228, 259, 197, 228, - 166, 197, 135, 166, 104, 135, 73, 104, 42, 73, 11, 42, 11, 11, - 12, 12, 12, 43, 43, 74, 74, 105, 105, 136, 136, 167, 167, 198, - 198, 229, 229, 260, 260, 291, 291, 322, 322, 353, 353, 384, 384, 384, - 416, 416, 385, 416, 354, 385, 323, 354, 292, 323, 261, 292, 230, 261, - 199, 230, 168, 199, 137, 168, 106, 137, 75, 106, 44, 75, 13, 44, - 13, 13, 14, 14, 14, 45, 45, 76, 76, 107, 107, 138, 138, 169, - 169, 200, 200, 231, 231, 262, 262, 293, 293, 324, 324, 355, 355, 386, - 386, 417, 417, 448, 448, 448, 480, 480, 449, 480, 418, 449, 387, 418, - 356, 387, 325, 356, 294, 325, 263, 294, 232, 263, 201, 232, 170, 201, - 139, 170, 108, 139, 77, 108, 46, 77, 15, 46, 15, 15, 16, 16, - 16, 47, 47, 78, 78, 109, 109, 140, 140, 171, 171, 202, 202, 233, - 233, 264, 264, 295, 295, 326, 326, 357, 357, 388, 388, 419, 419, 450, - 450, 481, 481, 512, 512, 512, 544, 544, 513, 544, 482, 513, 451, 482, - 420, 451, 389, 420, 358, 389, 327, 358, 296, 327, 265, 296, 234, 265, - 203, 234, 172, 203, 141, 172, 110, 141, 79, 110, 48, 79, 17, 48, - 17, 17, 18, 18, 18, 49, 49, 80, 80, 111, 111, 142, 142, 173, - 173, 204, 204, 235, 
235, 266, 266, 297, 297, 328, 328, 359, 359, 390, - 390, 421, 421, 452, 452, 483, 483, 514, 514, 545, 545, 576, 576, 576, - 608, 608, 577, 608, 546, 577, 515, 546, 484, 515, 453, 484, 422, 453, - 391, 422, 360, 391, 329, 360, 298, 329, 267, 298, 236, 267, 205, 236, - 174, 205, 143, 174, 112, 143, 81, 112, 50, 81, 19, 50, 19, 19, - 20, 20, 20, 51, 51, 82, 82, 113, 113, 144, 144, 175, 175, 206, - 206, 237, 237, 268, 268, 299, 299, 330, 330, 361, 361, 392, 392, 423, - 423, 454, 454, 485, 485, 516, 516, 547, 547, 578, 578, 609, 609, 640, - 640, 640, 672, 672, 641, 672, 610, 641, 579, 610, 548, 579, 517, 548, - 486, 517, 455, 486, 424, 455, 393, 424, 362, 393, 331, 362, 300, 331, - 269, 300, 238, 269, 207, 238, 176, 207, 145, 176, 114, 145, 83, 114, - 52, 83, 21, 52, 21, 21, 22, 22, 22, 53, 53, 84, 84, 115, - 115, 146, 146, 177, 177, 208, 208, 239, 239, 270, 270, 301, 301, 332, - 332, 363, 363, 394, 394, 425, 425, 456, 456, 487, 487, 518, 518, 549, - 549, 580, 580, 611, 611, 642, 642, 673, 673, 704, 704, 704, 736, 736, - 705, 736, 674, 705, 643, 674, 612, 643, 581, 612, 550, 581, 519, 550, - 488, 519, 457, 488, 426, 457, 395, 426, 364, 395, 333, 364, 302, 333, - 271, 302, 240, 271, 209, 240, 178, 209, 147, 178, 116, 147, 85, 116, - 54, 85, 23, 54, 23, 23, 24, 24, 24, 55, 55, 86, 86, 117, - 117, 148, 148, 179, 179, 210, 210, 241, 241, 272, 272, 303, 303, 334, - 334, 365, 365, 396, 396, 427, 427, 458, 458, 489, 489, 520, 520, 551, - 551, 582, 582, 613, 613, 644, 644, 675, 675, 706, 706, 737, 737, 768, - 768, 768, 800, 800, 769, 800, 738, 769, 707, 738, 676, 707, 645, 676, - 614, 645, 583, 614, 552, 583, 521, 552, 490, 521, 459, 490, 428, 459, - 397, 428, 366, 397, 335, 366, 304, 335, 273, 304, 242, 273, 211, 242, - 180, 211, 149, 180, 118, 149, 87, 118, 56, 87, 25, 56, 25, 25, - 26, 26, 26, 57, 57, 88, 88, 119, 119, 150, 150, 181, 181, 212, - 212, 243, 243, 274, 274, 305, 305, 336, 336, 367, 367, 398, 398, 429, - 429, 460, 460, 491, 491, 522, 522, 553, 553, 584, 584, 615, 
615, 646, - 646, 677, 677, 708, 708, 739, 739, 770, 770, 801, 801, 832, 832, 832, - 864, 864, 833, 864, 802, 833, 771, 802, 740, 771, 709, 740, 678, 709, - 647, 678, 616, 647, 585, 616, 554, 585, 523, 554, 492, 523, 461, 492, - 430, 461, 399, 430, 368, 399, 337, 368, 306, 337, 275, 306, 244, 275, - 213, 244, 182, 213, 151, 182, 120, 151, 89, 120, 58, 89, 27, 58, - 27, 27, 28, 28, 28, 59, 59, 90, 90, 121, 121, 152, 152, 183, - 183, 214, 214, 245, 245, 276, 276, 307, 307, 338, 338, 369, 369, 400, - 400, 431, 431, 462, 462, 493, 493, 524, 524, 555, 555, 586, 586, 617, - 617, 648, 648, 679, 679, 710, 710, 741, 741, 772, 772, 803, 803, 834, - 834, 865, 865, 896, 896, 896, 928, 928, 897, 928, 866, 897, 835, 866, - 804, 835, 773, 804, 742, 773, 711, 742, 680, 711, 649, 680, 618, 649, - 587, 618, 556, 587, 525, 556, 494, 525, 463, 494, 432, 463, 401, 432, - 370, 401, 339, 370, 308, 339, 277, 308, 246, 277, 215, 246, 184, 215, - 153, 184, 122, 153, 91, 122, 60, 91, 29, 60, 29, 29, 30, 30, - 30, 61, 61, 92, 92, 123, 123, 154, 154, 185, 185, 216, 216, 247, - 247, 278, 278, 309, 309, 340, 340, 371, 371, 402, 402, 433, 433, 464, - 464, 495, 495, 526, 526, 557, 557, 588, 588, 619, 619, 650, 650, 681, - 681, 712, 712, 743, 743, 774, 774, 805, 805, 836, 836, 867, 867, 898, - 898, 929, 929, 960, 960, 960, 961, 992, 930, 961, 899, 930, 868, 899, - 837, 868, 806, 837, 775, 806, 744, 775, 713, 744, 682, 713, 651, 682, - 620, 651, 589, 620, 558, 589, 527, 558, 496, 527, 465, 496, 434, 465, - 403, 434, 372, 403, 341, 372, 310, 341, 279, 310, 248, 279, 217, 248, - 186, 217, 155, 186, 124, 155, 93, 124, 62, 93, 31, 62, 63, 94, - 94, 125, 125, 156, 156, 187, 187, 218, 218, 249, 249, 280, 280, 311, - 311, 342, 342, 373, 373, 404, 404, 435, 435, 466, 466, 497, 497, 528, - 528, 559, 559, 590, 590, 621, 621, 652, 652, 683, 683, 714, 714, 745, - 745, 776, 776, 807, 807, 838, 838, 869, 869, 900, 900, 931, 931, 962, - 962, 993, 963, 994, 932, 963, 901, 932, 870, 901, 839, 870, 808, 839, - 777, 
808, 746, 777, 715, 746, 684, 715, 653, 684, 622, 653, 591, 622, - 560, 591, 529, 560, 498, 529, 467, 498, 436, 467, 405, 436, 374, 405, - 343, 374, 312, 343, 281, 312, 250, 281, 219, 250, 188, 219, 157, 188, - 126, 157, 95, 126, 127, 158, 158, 189, 189, 220, 220, 251, 251, 282, - 282, 313, 313, 344, 344, 375, 375, 406, 406, 437, 437, 468, 468, 499, - 499, 530, 530, 561, 561, 592, 592, 623, 623, 654, 654, 685, 685, 716, - 716, 747, 747, 778, 778, 809, 809, 840, 840, 871, 871, 902, 902, 933, - 933, 964, 964, 995, 965, 996, 934, 965, 903, 934, 872, 903, 841, 872, - 810, 841, 779, 810, 748, 779, 717, 748, 686, 717, 655, 686, 624, 655, - 593, 624, 562, 593, 531, 562, 500, 531, 469, 500, 438, 469, 407, 438, - 376, 407, 345, 376, 314, 345, 283, 314, 252, 283, 221, 252, 190, 221, - 159, 190, 191, 222, 222, 253, 253, 284, 284, 315, 315, 346, 346, 377, - 377, 408, 408, 439, 439, 470, 470, 501, 501, 532, 532, 563, 563, 594, - 594, 625, 625, 656, 656, 687, 687, 718, 718, 749, 749, 780, 780, 811, - 811, 842, 842, 873, 873, 904, 904, 935, 935, 966, 966, 997, 967, 998, - 936, 967, 905, 936, 874, 905, 843, 874, 812, 843, 781, 812, 750, 781, - 719, 750, 688, 719, 657, 688, 626, 657, 595, 626, 564, 595, 533, 564, - 502, 533, 471, 502, 440, 471, 409, 440, 378, 409, 347, 378, 316, 347, - 285, 316, 254, 285, 223, 254, 255, 286, 286, 317, 317, 348, 348, 379, - 379, 410, 410, 441, 441, 472, 472, 503, 503, 534, 534, 565, 565, 596, - 596, 627, 627, 658, 658, 689, 689, 720, 720, 751, 751, 782, 782, 813, - 813, 844, 844, 875, 875, 906, 906, 937, 937, 968, 968, 999, 969, 1000, - 938, 969, 907, 938, 876, 907, 845, 876, 814, 845, 783, 814, 752, 783, - 721, 752, 690, 721, 659, 690, 628, 659, 597, 628, 566, 597, 535, 566, - 504, 535, 473, 504, 442, 473, 411, 442, 380, 411, 349, 380, 318, 349, - 287, 318, 319, 350, 350, 381, 381, 412, 412, 443, 443, 474, 474, 505, - 505, 536, 536, 567, 567, 598, 598, 629, 629, 660, 660, 691, 691, 722, - 722, 753, 753, 784, 784, 815, 815, 846, 846, 877, 877, 908, 
908, 939, - 939, 970, 970, 1001, 971, 1002, 940, 971, 909, 940, 878, 909, 847, 878, - 816, 847, 785, 816, 754, 785, 723, 754, 692, 723, 661, 692, 630, 661, - 599, 630, 568, 599, 537, 568, 506, 537, 475, 506, 444, 475, 413, 444, - 382, 413, 351, 382, 383, 414, 414, 445, 445, 476, 476, 507, 507, 538, - 538, 569, 569, 600, 600, 631, 631, 662, 662, 693, 693, 724, 724, 755, - 755, 786, 786, 817, 817, 848, 848, 879, 879, 910, 910, 941, 941, 972, - 972, 1003, 973, 1004, 942, 973, 911, 942, 880, 911, 849, 880, 818, 849, - 787, 818, 756, 787, 725, 756, 694, 725, 663, 694, 632, 663, 601, 632, - 570, 601, 539, 570, 508, 539, 477, 508, 446, 477, 415, 446, 447, 478, - 478, 509, 509, 540, 540, 571, 571, 602, 602, 633, 633, 664, 664, 695, - 695, 726, 726, 757, 757, 788, 788, 819, 819, 850, 850, 881, 881, 912, - 912, 943, 943, 974, 974, 1005, 975, 1006, 944, 975, 913, 944, 882, 913, - 851, 882, 820, 851, 789, 820, 758, 789, 727, 758, 696, 727, 665, 696, - 634, 665, 603, 634, 572, 603, 541, 572, 510, 541, 479, 510, 511, 542, - 542, 573, 573, 604, 604, 635, 635, 666, 666, 697, 697, 728, 728, 759, - 759, 790, 790, 821, 821, 852, 852, 883, 883, 914, 914, 945, 945, 976, - 976, 1007, 977, 1008, 946, 977, 915, 946, 884, 915, 853, 884, 822, 853, - 791, 822, 760, 791, 729, 760, 698, 729, 667, 698, 636, 667, 605, 636, - 574, 605, 543, 574, 575, 606, 606, 637, 637, 668, 668, 699, 699, 730, - 730, 761, 761, 792, 792, 823, 823, 854, 854, 885, 885, 916, 916, 947, - 947, 978, 978, 1009, 979, 1010, 948, 979, 917, 948, 886, 917, 855, 886, - 824, 855, 793, 824, 762, 793, 731, 762, 700, 731, 669, 700, 638, 669, - 607, 638, 639, 670, 670, 701, 701, 732, 732, 763, 763, 794, 794, 825, - 825, 856, 856, 887, 887, 918, 918, 949, 949, 980, 980, 1011, 981, 1012, - 950, 981, 919, 950, 888, 919, 857, 888, 826, 857, 795, 826, 764, 795, - 733, 764, 702, 733, 671, 702, 703, 734, 734, 765, 765, 796, 796, 827, - 827, 858, 858, 889, 889, 920, 920, 951, 951, 982, 982, 1013, 983, 1014, - 952, 983, 921, 952, 890, 921, 
859, 890, 828, 859, 797, 828, 766, 797, - 735, 766, 767, 798, 798, 829, 829, 860, 860, 891, 891, 922, 922, 953, - 953, 984, 984, 1015, 985, 1016, 954, 985, 923, 954, 892, 923, 861, 892, - 830, 861, 799, 830, 831, 862, 862, 893, 893, 924, 924, 955, 955, 986, - 986, 1017, 987, 1018, 956, 987, 925, 956, 894, 925, 863, 894, 895, 926, - 926, 957, 957, 988, 988, 1019, 989, 1020, 958, 989, 927, 958, 959, 990, - 990, 1021, 991, 1022, 0, 0 -}; - -DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_4x4[16]) = { - 0, 1, 5, 6, 2, 4, 7, 12, 3, 8, 11, 13, 9, 10, 14, 15 -}; + av1_default_iscan_4x4[16]) = { 0, 1, 5, 6, 2, 4, 7, 12, + 3, 8, 11, 13, 9, 10, 14, 15 }; DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_4x4[16]) = { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15, @@ -3201,535 +1664,385 @@ DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_32x32[1024]) = { }; const SCAN_ORDER av1_default_scan_orders[TX_SIZES] = { - { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors }, - { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors }, - { default_scan_16x16, av1_default_iscan_16x16, default_scan_16x16_neighbors }, - { default_scan_32x32, av1_default_iscan_32x32, default_scan_32x32_neighbors }, + { default_scan_4x4, av1_default_iscan_4x4 }, + { default_scan_8x8, av1_default_iscan_8x8 }, + { default_scan_16x16, av1_default_iscan_16x16 }, + { default_scan_32x32, av1_default_iscan_32x32 }, // Half of the coefficients of tx64 at higher frequencies are set to // zeros. So tx32's scan order is used. 
- { default_scan_32x32, av1_default_iscan_32x32, default_scan_32x32_neighbors }, + { default_scan_32x32, av1_default_iscan_32x32 }, }; const SCAN_ORDER av1_scan_orders[TX_SIZES_ALL][TX_TYPES] = { { // TX_4X4 - { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors }, - { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors }, - { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors }, - { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors }, - { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors }, - { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors }, - { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors }, - { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors }, - { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors }, - { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors }, - { mrow_scan_4x4, av1_mrow_iscan_4x4, mrow_scan_4x4_neighbors }, - { mcol_scan_4x4, av1_mcol_iscan_4x4, mcol_scan_4x4_neighbors }, - { mrow_scan_4x4, av1_mrow_iscan_4x4, mrow_scan_4x4_neighbors }, - { mcol_scan_4x4, av1_mcol_iscan_4x4, mcol_scan_4x4_neighbors }, - { mrow_scan_4x4, av1_mrow_iscan_4x4, mrow_scan_4x4_neighbors }, - { mcol_scan_4x4, av1_mcol_iscan_4x4, mcol_scan_4x4_neighbors }, + { default_scan_4x4, av1_default_iscan_4x4 }, + { default_scan_4x4, av1_default_iscan_4x4 }, + { default_scan_4x4, av1_default_iscan_4x4 }, + { default_scan_4x4, av1_default_iscan_4x4 }, + { default_scan_4x4, av1_default_iscan_4x4 }, + { default_scan_4x4, av1_default_iscan_4x4 }, + { default_scan_4x4, av1_default_iscan_4x4 }, + { default_scan_4x4, av1_default_iscan_4x4 }, + { default_scan_4x4, av1_default_iscan_4x4 }, + { default_scan_4x4, av1_default_iscan_4x4 }, + { mrow_scan_4x4, av1_mrow_iscan_4x4 }, + { mcol_scan_4x4, av1_mcol_iscan_4x4 }, + { mrow_scan_4x4, av1_mrow_iscan_4x4 }, + { mcol_scan_4x4, av1_mcol_iscan_4x4 }, + { 
mrow_scan_4x4, av1_mrow_iscan_4x4 }, + { mcol_scan_4x4, av1_mcol_iscan_4x4 }, }, { // TX_8X8 - { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors }, - { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors }, - { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors }, - { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors }, - { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors }, - { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors }, - { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors }, - { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors }, - { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors }, - { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors }, - { mrow_scan_8x8, av1_mrow_iscan_8x8, mrow_scan_8x8_neighbors }, - { mcol_scan_8x8, av1_mcol_iscan_8x8, mcol_scan_8x8_neighbors }, - { mrow_scan_8x8, av1_mrow_iscan_8x8, mrow_scan_8x8_neighbors }, - { mcol_scan_8x8, av1_mcol_iscan_8x8, mcol_scan_8x8_neighbors }, - { mrow_scan_8x8, av1_mrow_iscan_8x8, mrow_scan_8x8_neighbors }, - { mcol_scan_8x8, av1_mcol_iscan_8x8, mcol_scan_8x8_neighbors }, + { default_scan_8x8, av1_default_iscan_8x8 }, + { default_scan_8x8, av1_default_iscan_8x8 }, + { default_scan_8x8, av1_default_iscan_8x8 }, + { default_scan_8x8, av1_default_iscan_8x8 }, + { default_scan_8x8, av1_default_iscan_8x8 }, + { default_scan_8x8, av1_default_iscan_8x8 }, + { default_scan_8x8, av1_default_iscan_8x8 }, + { default_scan_8x8, av1_default_iscan_8x8 }, + { default_scan_8x8, av1_default_iscan_8x8 }, + { default_scan_8x8, av1_default_iscan_8x8 }, + { mrow_scan_8x8, av1_mrow_iscan_8x8 }, + { mcol_scan_8x8, av1_mcol_iscan_8x8 }, + { mrow_scan_8x8, av1_mrow_iscan_8x8 }, + { mcol_scan_8x8, av1_mcol_iscan_8x8 }, + { mrow_scan_8x8, av1_mrow_iscan_8x8 }, + { mcol_scan_8x8, av1_mcol_iscan_8x8 }, }, { // TX_16X16 - { default_scan_16x16, 
av1_default_iscan_16x16, - default_scan_16x16_neighbors }, - { default_scan_16x16, av1_default_iscan_16x16, - default_scan_16x16_neighbors }, - { default_scan_16x16, av1_default_iscan_16x16, - default_scan_16x16_neighbors }, - { default_scan_16x16, av1_default_iscan_16x16, - default_scan_16x16_neighbors }, - { default_scan_16x16, av1_default_iscan_16x16, - default_scan_16x16_neighbors }, - { default_scan_16x16, av1_default_iscan_16x16, - default_scan_16x16_neighbors }, - { default_scan_16x16, av1_default_iscan_16x16, - default_scan_16x16_neighbors }, - { default_scan_16x16, av1_default_iscan_16x16, - default_scan_16x16_neighbors }, - { default_scan_16x16, av1_default_iscan_16x16, - default_scan_16x16_neighbors }, - { default_scan_16x16, av1_default_iscan_16x16, - default_scan_16x16_neighbors }, - { mrow_scan_16x16, av1_mrow_iscan_16x16, mrow_scan_16x16_neighbors }, - { mcol_scan_16x16, av1_mcol_iscan_16x16, mcol_scan_16x16_neighbors }, - { mrow_scan_16x16, av1_mrow_iscan_16x16, mrow_scan_16x16_neighbors }, - { mcol_scan_16x16, av1_mcol_iscan_16x16, mcol_scan_16x16_neighbors }, - { mrow_scan_16x16, av1_mrow_iscan_16x16, mrow_scan_16x16_neighbors }, - { mcol_scan_16x16, av1_mcol_iscan_16x16, mcol_scan_16x16_neighbors }, + { default_scan_16x16, av1_default_iscan_16x16 }, + { default_scan_16x16, av1_default_iscan_16x16 }, + { default_scan_16x16, av1_default_iscan_16x16 }, + { default_scan_16x16, av1_default_iscan_16x16 }, + { default_scan_16x16, av1_default_iscan_16x16 }, + { default_scan_16x16, av1_default_iscan_16x16 }, + { default_scan_16x16, av1_default_iscan_16x16 }, + { default_scan_16x16, av1_default_iscan_16x16 }, + { default_scan_16x16, av1_default_iscan_16x16 }, + { default_scan_16x16, av1_default_iscan_16x16 }, + { mrow_scan_16x16, av1_mrow_iscan_16x16 }, + { mcol_scan_16x16, av1_mcol_iscan_16x16 }, + { mrow_scan_16x16, av1_mrow_iscan_16x16 }, + { mcol_scan_16x16, av1_mcol_iscan_16x16 }, + { mrow_scan_16x16, av1_mrow_iscan_16x16 }, + { mcol_scan_16x16, 
av1_mcol_iscan_16x16 }, }, { // TX_32X32 - { default_scan_32x32, av1_default_iscan_32x32, - default_scan_32x32_neighbors }, - { default_scan_32x32, av1_default_iscan_32x32, - default_scan_32x32_neighbors }, - { default_scan_32x32, av1_default_iscan_32x32, - default_scan_32x32_neighbors }, - { default_scan_32x32, av1_default_iscan_32x32, - default_scan_32x32_neighbors }, - { default_scan_32x32, av1_default_iscan_32x32, - default_scan_32x32_neighbors }, - { default_scan_32x32, av1_default_iscan_32x32, - default_scan_32x32_neighbors }, - { default_scan_32x32, av1_default_iscan_32x32, - default_scan_32x32_neighbors }, - { default_scan_32x32, av1_default_iscan_32x32, - default_scan_32x32_neighbors }, - { default_scan_32x32, av1_default_iscan_32x32, - default_scan_32x32_neighbors }, - { default_scan_32x32, av1_default_iscan_32x32, - default_scan_32x32_neighbors }, - { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors }, - { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors }, - { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors }, - { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors }, - { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors }, - { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { mrow_scan_32x32, av1_mrow_iscan_32x32 }, + { mcol_scan_32x32, av1_mcol_iscan_32x32 }, + { mrow_scan_32x32, av1_mrow_iscan_32x32 }, + { mcol_scan_32x32, av1_mcol_iscan_32x32 }, + { 
mrow_scan_32x32, av1_mrow_iscan_32x32 }, + { mcol_scan_32x32, av1_mcol_iscan_32x32 }, }, { // TX_64X64 // Half of the coefficients of tx64 at higher frequencies are set to // zeros. So tx32's scan order is used. - { default_scan_32x32, av1_default_iscan_32x32, - default_scan_32x32_neighbors }, - { default_scan_32x32, av1_default_iscan_32x32, - default_scan_32x32_neighbors }, - { default_scan_32x32, av1_default_iscan_32x32, - default_scan_32x32_neighbors }, - { default_scan_32x32, av1_default_iscan_32x32, - default_scan_32x32_neighbors }, - { default_scan_32x32, av1_default_iscan_32x32, - default_scan_32x32_neighbors }, - { default_scan_32x32, av1_default_iscan_32x32, - default_scan_32x32_neighbors }, - { default_scan_32x32, av1_default_iscan_32x32, - default_scan_32x32_neighbors }, - { default_scan_32x32, av1_default_iscan_32x32, - default_scan_32x32_neighbors }, - { default_scan_32x32, av1_default_iscan_32x32, - default_scan_32x32_neighbors }, - { default_scan_32x32, av1_default_iscan_32x32, - default_scan_32x32_neighbors }, - { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors }, - { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors }, - { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors }, - { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors }, - { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors }, - { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { 
mrow_scan_32x32, av1_mrow_iscan_32x32 }, + { mcol_scan_32x32, av1_mcol_iscan_32x32 }, + { mrow_scan_32x32, av1_mrow_iscan_32x32 }, + { mcol_scan_32x32, av1_mcol_iscan_32x32 }, + { mrow_scan_32x32, av1_mrow_iscan_32x32 }, + { mcol_scan_32x32, av1_mcol_iscan_32x32 }, }, { // TX_4X8 - { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors }, - { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors }, - { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors }, - { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors }, - { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors }, - { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors }, - { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors }, - { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors }, - { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors }, - { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors }, - { mrow_scan_4x8, av1_mrow_iscan_4x8, mrow_scan_4x8_neighbors }, - { mcol_scan_4x8, av1_mcol_iscan_4x8, mcol_scan_4x8_neighbors }, - { mrow_scan_4x8, av1_mrow_iscan_4x8, mrow_scan_4x8_neighbors }, - { mcol_scan_4x8, av1_mcol_iscan_4x8, mcol_scan_4x8_neighbors }, - { mrow_scan_4x8, av1_mrow_iscan_4x8, mrow_scan_4x8_neighbors }, - { mcol_scan_4x8, av1_mcol_iscan_4x8, mcol_scan_4x8_neighbors }, + { default_scan_4x8, av1_default_iscan_4x8 }, + { default_scan_4x8, av1_default_iscan_4x8 }, + { default_scan_4x8, av1_default_iscan_4x8 }, + { default_scan_4x8, av1_default_iscan_4x8 }, + { default_scan_4x8, av1_default_iscan_4x8 }, + { default_scan_4x8, av1_default_iscan_4x8 }, + { default_scan_4x8, av1_default_iscan_4x8 }, + { default_scan_4x8, av1_default_iscan_4x8 }, + { default_scan_4x8, av1_default_iscan_4x8 }, + { default_scan_4x8, av1_default_iscan_4x8 }, + { mrow_scan_4x8, av1_mrow_iscan_4x8 }, + { mcol_scan_4x8, av1_mcol_iscan_4x8 }, + { mrow_scan_4x8, 
av1_mrow_iscan_4x8 }, + { mcol_scan_4x8, av1_mcol_iscan_4x8 }, + { mrow_scan_4x8, av1_mrow_iscan_4x8 }, + { mcol_scan_4x8, av1_mcol_iscan_4x8 }, }, { // TX_8X4 - { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors }, - { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors }, - { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors }, - { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors }, - { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors }, - { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors }, - { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors }, - { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors }, - { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors }, - { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors }, - { mrow_scan_8x4, av1_mrow_iscan_8x4, mrow_scan_8x4_neighbors }, - { mcol_scan_8x4, av1_mcol_iscan_8x4, mcol_scan_8x4_neighbors }, - { mrow_scan_8x4, av1_mrow_iscan_8x4, mrow_scan_8x4_neighbors }, - { mcol_scan_8x4, av1_mcol_iscan_8x4, mcol_scan_8x4_neighbors }, - { mrow_scan_8x4, av1_mrow_iscan_8x4, mrow_scan_8x4_neighbors }, - { mcol_scan_8x4, av1_mcol_iscan_8x4, mcol_scan_8x4_neighbors }, + { default_scan_8x4, av1_default_iscan_8x4 }, + { default_scan_8x4, av1_default_iscan_8x4 }, + { default_scan_8x4, av1_default_iscan_8x4 }, + { default_scan_8x4, av1_default_iscan_8x4 }, + { default_scan_8x4, av1_default_iscan_8x4 }, + { default_scan_8x4, av1_default_iscan_8x4 }, + { default_scan_8x4, av1_default_iscan_8x4 }, + { default_scan_8x4, av1_default_iscan_8x4 }, + { default_scan_8x4, av1_default_iscan_8x4 }, + { default_scan_8x4, av1_default_iscan_8x4 }, + { mrow_scan_8x4, av1_mrow_iscan_8x4 }, + { mcol_scan_8x4, av1_mcol_iscan_8x4 }, + { mrow_scan_8x4, av1_mrow_iscan_8x4 }, + { mcol_scan_8x4, av1_mcol_iscan_8x4 }, + { mrow_scan_8x4, av1_mrow_iscan_8x4 }, + { mcol_scan_8x4, 
av1_mcol_iscan_8x4 }, }, { // TX_8X16 - { default_scan_8x16, av1_default_iscan_8x16, - default_scan_8x16_neighbors }, - { default_scan_8x16, av1_default_iscan_8x16, - default_scan_8x16_neighbors }, - { default_scan_8x16, av1_default_iscan_8x16, - default_scan_8x16_neighbors }, - { default_scan_8x16, av1_default_iscan_8x16, - default_scan_8x16_neighbors }, - { default_scan_8x16, av1_default_iscan_8x16, - default_scan_8x16_neighbors }, - { default_scan_8x16, av1_default_iscan_8x16, - default_scan_8x16_neighbors }, - { default_scan_8x16, av1_default_iscan_8x16, - default_scan_8x16_neighbors }, - { default_scan_8x16, av1_default_iscan_8x16, - default_scan_8x16_neighbors }, - { default_scan_8x16, av1_default_iscan_8x16, - default_scan_8x16_neighbors }, - { default_scan_8x16, av1_default_iscan_8x16, - default_scan_8x16_neighbors }, - { mrow_scan_8x16, av1_mrow_iscan_8x16, mrow_scan_8x16_neighbors }, - { mcol_scan_8x16, av1_mcol_iscan_8x16, mcol_scan_8x16_neighbors }, - { mrow_scan_8x16, av1_mrow_iscan_8x16, mrow_scan_8x16_neighbors }, - { mcol_scan_8x16, av1_mcol_iscan_8x16, mcol_scan_8x16_neighbors }, - { mrow_scan_8x16, av1_mrow_iscan_8x16, mrow_scan_8x16_neighbors }, - { mcol_scan_8x16, av1_mcol_iscan_8x16, mcol_scan_8x16_neighbors }, + { default_scan_8x16, av1_default_iscan_8x16 }, + { default_scan_8x16, av1_default_iscan_8x16 }, + { default_scan_8x16, av1_default_iscan_8x16 }, + { default_scan_8x16, av1_default_iscan_8x16 }, + { default_scan_8x16, av1_default_iscan_8x16 }, + { default_scan_8x16, av1_default_iscan_8x16 }, + { default_scan_8x16, av1_default_iscan_8x16 }, + { default_scan_8x16, av1_default_iscan_8x16 }, + { default_scan_8x16, av1_default_iscan_8x16 }, + { default_scan_8x16, av1_default_iscan_8x16 }, + { mrow_scan_8x16, av1_mrow_iscan_8x16 }, + { mcol_scan_8x16, av1_mcol_iscan_8x16 }, + { mrow_scan_8x16, av1_mrow_iscan_8x16 }, + { mcol_scan_8x16, av1_mcol_iscan_8x16 }, + { mrow_scan_8x16, av1_mrow_iscan_8x16 }, + { mcol_scan_8x16, av1_mcol_iscan_8x16 }, 
}, { // TX_16X8 - { default_scan_16x8, av1_default_iscan_16x8, - default_scan_16x8_neighbors }, - { default_scan_16x8, av1_default_iscan_16x8, - default_scan_16x8_neighbors }, - { default_scan_16x8, av1_default_iscan_16x8, - default_scan_16x8_neighbors }, - { default_scan_16x8, av1_default_iscan_16x8, - default_scan_16x8_neighbors }, - { default_scan_16x8, av1_default_iscan_16x8, - default_scan_16x8_neighbors }, - { default_scan_16x8, av1_default_iscan_16x8, - default_scan_16x8_neighbors }, - { default_scan_16x8, av1_default_iscan_16x8, - default_scan_16x8_neighbors }, - { default_scan_16x8, av1_default_iscan_16x8, - default_scan_16x8_neighbors }, - { default_scan_16x8, av1_default_iscan_16x8, - default_scan_16x8_neighbors }, - { default_scan_16x8, av1_default_iscan_16x8, - default_scan_16x8_neighbors }, - { mrow_scan_16x8, av1_mrow_iscan_16x8, mrow_scan_16x8_neighbors }, - { mcol_scan_16x8, av1_mcol_iscan_16x8, mcol_scan_16x8_neighbors }, - { mrow_scan_16x8, av1_mrow_iscan_16x8, mrow_scan_16x8_neighbors }, - { mcol_scan_16x8, av1_mcol_iscan_16x8, mcol_scan_16x8_neighbors }, - { mrow_scan_16x8, av1_mrow_iscan_16x8, mrow_scan_16x8_neighbors }, - { mcol_scan_16x8, av1_mcol_iscan_16x8, mcol_scan_16x8_neighbors }, + { default_scan_16x8, av1_default_iscan_16x8 }, + { default_scan_16x8, av1_default_iscan_16x8 }, + { default_scan_16x8, av1_default_iscan_16x8 }, + { default_scan_16x8, av1_default_iscan_16x8 }, + { default_scan_16x8, av1_default_iscan_16x8 }, + { default_scan_16x8, av1_default_iscan_16x8 }, + { default_scan_16x8, av1_default_iscan_16x8 }, + { default_scan_16x8, av1_default_iscan_16x8 }, + { default_scan_16x8, av1_default_iscan_16x8 }, + { default_scan_16x8, av1_default_iscan_16x8 }, + { mrow_scan_16x8, av1_mrow_iscan_16x8 }, + { mcol_scan_16x8, av1_mcol_iscan_16x8 }, + { mrow_scan_16x8, av1_mrow_iscan_16x8 }, + { mcol_scan_16x8, av1_mcol_iscan_16x8 }, + { mrow_scan_16x8, av1_mrow_iscan_16x8 }, + { mcol_scan_16x8, av1_mcol_iscan_16x8 }, }, { // TX_16X32 - { 
default_scan_16x32, av1_default_iscan_16x32, - default_scan_16x32_neighbors }, - { default_scan_16x32, av1_default_iscan_16x32, - default_scan_16x32_neighbors }, - { default_scan_16x32, av1_default_iscan_16x32, - default_scan_16x32_neighbors }, - { default_scan_16x32, av1_default_iscan_16x32, - default_scan_16x32_neighbors }, - { default_scan_16x32, av1_default_iscan_16x32, - default_scan_16x32_neighbors }, - { default_scan_16x32, av1_default_iscan_16x32, - default_scan_16x32_neighbors }, - { default_scan_16x32, av1_default_iscan_16x32, - default_scan_16x32_neighbors }, - { default_scan_16x32, av1_default_iscan_16x32, - default_scan_16x32_neighbors }, - { default_scan_16x32, av1_default_iscan_16x32, - default_scan_16x32_neighbors }, - { default_scan_16x32, av1_default_iscan_16x32, - default_scan_16x32_neighbors }, - { mrow_scan_16x32, av1_mrow_iscan_16x32, mrow_scan_16x32_neighbors }, - { mcol_scan_16x32, av1_mcol_iscan_16x32, mcol_scan_16x32_neighbors }, - { mrow_scan_16x32, av1_mrow_iscan_16x32, mrow_scan_16x32_neighbors }, - { mcol_scan_16x32, av1_mcol_iscan_16x32, mcol_scan_16x32_neighbors }, - { mrow_scan_16x32, av1_mrow_iscan_16x32, mrow_scan_16x32_neighbors }, - { mcol_scan_16x32, av1_mcol_iscan_16x32, mcol_scan_16x32_neighbors }, + { default_scan_16x32, av1_default_iscan_16x32 }, + { default_scan_16x32, av1_default_iscan_16x32 }, + { default_scan_16x32, av1_default_iscan_16x32 }, + { default_scan_16x32, av1_default_iscan_16x32 }, + { default_scan_16x32, av1_default_iscan_16x32 }, + { default_scan_16x32, av1_default_iscan_16x32 }, + { default_scan_16x32, av1_default_iscan_16x32 }, + { default_scan_16x32, av1_default_iscan_16x32 }, + { default_scan_16x32, av1_default_iscan_16x32 }, + { default_scan_16x32, av1_default_iscan_16x32 }, + { mrow_scan_16x32, av1_mrow_iscan_16x32 }, + { mcol_scan_16x32, av1_mcol_iscan_16x32 }, + { mrow_scan_16x32, av1_mrow_iscan_16x32 }, + { mcol_scan_16x32, av1_mcol_iscan_16x32 }, + { mrow_scan_16x32, av1_mrow_iscan_16x32 }, + { 
mcol_scan_16x32, av1_mcol_iscan_16x32 }, }, { // TX_32X16 - { default_scan_32x16, av1_default_iscan_32x16, - default_scan_32x16_neighbors }, - { default_scan_32x16, av1_default_iscan_32x16, - default_scan_32x16_neighbors }, - { default_scan_32x16, av1_default_iscan_32x16, - default_scan_32x16_neighbors }, - { default_scan_32x16, av1_default_iscan_32x16, - default_scan_32x16_neighbors }, - { default_scan_32x16, av1_default_iscan_32x16, - default_scan_32x16_neighbors }, - { default_scan_32x16, av1_default_iscan_32x16, - default_scan_32x16_neighbors }, - { default_scan_32x16, av1_default_iscan_32x16, - default_scan_32x16_neighbors }, - { default_scan_32x16, av1_default_iscan_32x16, - default_scan_32x16_neighbors }, - { default_scan_32x16, av1_default_iscan_32x16, - default_scan_32x16_neighbors }, - { default_scan_32x16, av1_default_iscan_32x16, - default_scan_32x16_neighbors }, - { mrow_scan_32x16, av1_mrow_iscan_32x16, mrow_scan_32x16_neighbors }, - { mcol_scan_32x16, av1_mcol_iscan_32x16, mcol_scan_32x16_neighbors }, - { mrow_scan_32x16, av1_mrow_iscan_32x16, mrow_scan_32x16_neighbors }, - { mcol_scan_32x16, av1_mcol_iscan_32x16, mcol_scan_32x16_neighbors }, - { mrow_scan_32x16, av1_mrow_iscan_32x16, mrow_scan_32x16_neighbors }, - { mcol_scan_32x16, av1_mcol_iscan_32x16, mcol_scan_32x16_neighbors }, + { default_scan_32x16, av1_default_iscan_32x16 }, + { default_scan_32x16, av1_default_iscan_32x16 }, + { default_scan_32x16, av1_default_iscan_32x16 }, + { default_scan_32x16, av1_default_iscan_32x16 }, + { default_scan_32x16, av1_default_iscan_32x16 }, + { default_scan_32x16, av1_default_iscan_32x16 }, + { default_scan_32x16, av1_default_iscan_32x16 }, + { default_scan_32x16, av1_default_iscan_32x16 }, + { default_scan_32x16, av1_default_iscan_32x16 }, + { default_scan_32x16, av1_default_iscan_32x16 }, + { mrow_scan_32x16, av1_mrow_iscan_32x16 }, + { mcol_scan_32x16, av1_mcol_iscan_32x16 }, + { mrow_scan_32x16, av1_mrow_iscan_32x16 }, + { mcol_scan_32x16, 
av1_mcol_iscan_32x16 }, + { mrow_scan_32x16, av1_mrow_iscan_32x16 }, + { mcol_scan_32x16, av1_mcol_iscan_32x16 }, }, { // TX_32X64 // Half of the coefficients of tx64 at higher frequencies are set to // zeros. So tx32's scan order is used. - { default_scan_32x32, av1_default_iscan_32x32, - default_scan_32x32_neighbors }, - { default_scan_32x32, av1_default_iscan_32x32, - default_scan_32x32_neighbors }, - { default_scan_32x32, av1_default_iscan_32x32, - default_scan_32x32_neighbors }, - { default_scan_32x32, av1_default_iscan_32x32, - default_scan_32x32_neighbors }, - { default_scan_32x32, av1_default_iscan_32x32, - default_scan_32x32_neighbors }, - { default_scan_32x32, av1_default_iscan_32x32, - default_scan_32x32_neighbors }, - { default_scan_32x32, av1_default_iscan_32x32, - default_scan_32x32_neighbors }, - { default_scan_32x32, av1_default_iscan_32x32, - default_scan_32x32_neighbors }, - { default_scan_32x32, av1_default_iscan_32x32, - default_scan_32x32_neighbors }, - { default_scan_32x32, av1_default_iscan_32x32, - default_scan_32x32_neighbors }, - { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors }, - { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors }, - { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors }, - { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors }, - { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors }, - { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, 
av1_default_iscan_32x32 }, + { mrow_scan_32x32, av1_mrow_iscan_32x32 }, + { mcol_scan_32x32, av1_mcol_iscan_32x32 }, + { mrow_scan_32x32, av1_mrow_iscan_32x32 }, + { mcol_scan_32x32, av1_mcol_iscan_32x32 }, + { mrow_scan_32x32, av1_mrow_iscan_32x32 }, + { mcol_scan_32x32, av1_mcol_iscan_32x32 }, }, { // TX_64X32 // Half of the coefficients of tx64 at higher frequencies are set to // zeros. So tx32's scan order is used. - { default_scan_32x32, av1_default_iscan_32x32, - default_scan_32x32_neighbors }, - { default_scan_32x32, av1_default_iscan_32x32, - default_scan_32x32_neighbors }, - { default_scan_32x32, av1_default_iscan_32x32, - default_scan_32x32_neighbors }, - { default_scan_32x32, av1_default_iscan_32x32, - default_scan_32x32_neighbors }, - { default_scan_32x32, av1_default_iscan_32x32, - default_scan_32x32_neighbors }, - { default_scan_32x32, av1_default_iscan_32x32, - default_scan_32x32_neighbors }, - { default_scan_32x32, av1_default_iscan_32x32, - default_scan_32x32_neighbors }, - { default_scan_32x32, av1_default_iscan_32x32, - default_scan_32x32_neighbors }, - { default_scan_32x32, av1_default_iscan_32x32, - default_scan_32x32_neighbors }, - { default_scan_32x32, av1_default_iscan_32x32, - default_scan_32x32_neighbors }, - { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors }, - { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors }, - { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors }, - { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors }, - { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors }, - { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { 
default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { mrow_scan_32x32, av1_mrow_iscan_32x32 }, + { mcol_scan_32x32, av1_mcol_iscan_32x32 }, + { mrow_scan_32x32, av1_mrow_iscan_32x32 }, + { mcol_scan_32x32, av1_mcol_iscan_32x32 }, + { mrow_scan_32x32, av1_mrow_iscan_32x32 }, + { mcol_scan_32x32, av1_mcol_iscan_32x32 }, }, { // TX_4X16 - { default_scan_4x16, av1_default_iscan_4x16, - default_scan_4x16_neighbors }, - { default_scan_4x16, av1_default_iscan_4x16, - default_scan_4x16_neighbors }, - { default_scan_4x16, av1_default_iscan_4x16, - default_scan_4x16_neighbors }, - { default_scan_4x16, av1_default_iscan_4x16, - default_scan_4x16_neighbors }, - { default_scan_4x16, av1_default_iscan_4x16, - default_scan_4x16_neighbors }, - { default_scan_4x16, av1_default_iscan_4x16, - default_scan_4x16_neighbors }, - { default_scan_4x16, av1_default_iscan_4x16, - default_scan_4x16_neighbors }, - { default_scan_4x16, av1_default_iscan_4x16, - default_scan_4x16_neighbors }, - { default_scan_4x16, av1_default_iscan_4x16, - default_scan_4x16_neighbors }, - { default_scan_4x16, av1_default_iscan_4x16, - default_scan_4x16_neighbors }, - { mrow_scan_4x16, av1_mrow_iscan_4x16, mrow_scan_4x16_neighbors }, - { mcol_scan_4x16, av1_mcol_iscan_4x16, mcol_scan_4x16_neighbors }, - { mrow_scan_4x16, av1_mrow_iscan_4x16, mrow_scan_4x16_neighbors }, - { mcol_scan_4x16, av1_mcol_iscan_4x16, mcol_scan_4x16_neighbors }, - { mrow_scan_4x16, av1_mrow_iscan_4x16, mrow_scan_4x16_neighbors }, - { mcol_scan_4x16, av1_mcol_iscan_4x16, mcol_scan_4x16_neighbors }, + { default_scan_4x16, av1_default_iscan_4x16 }, + { default_scan_4x16, av1_default_iscan_4x16 }, + { default_scan_4x16, av1_default_iscan_4x16 }, + { default_scan_4x16, av1_default_iscan_4x16 }, + { default_scan_4x16, av1_default_iscan_4x16 }, + { default_scan_4x16, av1_default_iscan_4x16 }, + { 
default_scan_4x16, av1_default_iscan_4x16 }, + { default_scan_4x16, av1_default_iscan_4x16 }, + { default_scan_4x16, av1_default_iscan_4x16 }, + { default_scan_4x16, av1_default_iscan_4x16 }, + { mrow_scan_4x16, av1_mrow_iscan_4x16 }, + { mcol_scan_4x16, av1_mcol_iscan_4x16 }, + { mrow_scan_4x16, av1_mrow_iscan_4x16 }, + { mcol_scan_4x16, av1_mcol_iscan_4x16 }, + { mrow_scan_4x16, av1_mrow_iscan_4x16 }, + { mcol_scan_4x16, av1_mcol_iscan_4x16 }, }, { // TX_16X4 - { default_scan_16x4, av1_default_iscan_16x4, - default_scan_16x4_neighbors }, - { default_scan_16x4, av1_default_iscan_16x4, - default_scan_16x4_neighbors }, - { default_scan_16x4, av1_default_iscan_16x4, - default_scan_16x4_neighbors }, - { default_scan_16x4, av1_default_iscan_16x4, - default_scan_16x4_neighbors }, - { default_scan_16x4, av1_default_iscan_16x4, - default_scan_16x4_neighbors }, - { default_scan_16x4, av1_default_iscan_16x4, - default_scan_16x4_neighbors }, - { default_scan_16x4, av1_default_iscan_16x4, - default_scan_16x4_neighbors }, - { default_scan_16x4, av1_default_iscan_16x4, - default_scan_16x4_neighbors }, - { default_scan_16x4, av1_default_iscan_16x4, - default_scan_16x4_neighbors }, - { default_scan_16x4, av1_default_iscan_16x4, - default_scan_16x4_neighbors }, - { mrow_scan_16x4, av1_mrow_iscan_16x4, mrow_scan_16x4_neighbors }, - { mcol_scan_16x4, av1_mcol_iscan_16x4, mcol_scan_16x4_neighbors }, - { mrow_scan_16x4, av1_mrow_iscan_16x4, mrow_scan_16x4_neighbors }, - { mcol_scan_16x4, av1_mcol_iscan_16x4, mcol_scan_16x4_neighbors }, - { mrow_scan_16x4, av1_mrow_iscan_16x4, mrow_scan_16x4_neighbors }, - { mcol_scan_16x4, av1_mcol_iscan_16x4, mcol_scan_16x4_neighbors }, + { default_scan_16x4, av1_default_iscan_16x4 }, + { default_scan_16x4, av1_default_iscan_16x4 }, + { default_scan_16x4, av1_default_iscan_16x4 }, + { default_scan_16x4, av1_default_iscan_16x4 }, + { default_scan_16x4, av1_default_iscan_16x4 }, + { default_scan_16x4, av1_default_iscan_16x4 }, + { default_scan_16x4, 
av1_default_iscan_16x4 }, + { default_scan_16x4, av1_default_iscan_16x4 }, + { default_scan_16x4, av1_default_iscan_16x4 }, + { default_scan_16x4, av1_default_iscan_16x4 }, + { mrow_scan_16x4, av1_mrow_iscan_16x4 }, + { mcol_scan_16x4, av1_mcol_iscan_16x4 }, + { mrow_scan_16x4, av1_mrow_iscan_16x4 }, + { mcol_scan_16x4, av1_mcol_iscan_16x4 }, + { mrow_scan_16x4, av1_mrow_iscan_16x4 }, + { mcol_scan_16x4, av1_mcol_iscan_16x4 }, }, { // TX_8X32 - { default_scan_8x32, av1_default_iscan_8x32, - default_scan_8x32_neighbors }, - { default_scan_8x32, av1_default_iscan_8x32, - default_scan_8x32_neighbors }, - { default_scan_8x32, av1_default_iscan_8x32, - default_scan_8x32_neighbors }, - { default_scan_8x32, av1_default_iscan_8x32, - default_scan_8x32_neighbors }, - { default_scan_8x32, av1_default_iscan_8x32, - default_scan_8x32_neighbors }, - { default_scan_8x32, av1_default_iscan_8x32, - default_scan_8x32_neighbors }, - { default_scan_8x32, av1_default_iscan_8x32, - default_scan_8x32_neighbors }, - { default_scan_8x32, av1_default_iscan_8x32, - default_scan_8x32_neighbors }, - { default_scan_8x32, av1_default_iscan_8x32, - default_scan_8x32_neighbors }, - { default_scan_8x32, av1_default_iscan_8x32, - default_scan_8x32_neighbors }, - { mrow_scan_8x32, av1_mrow_iscan_8x32, mrow_scan_8x32_neighbors }, - { mcol_scan_8x32, av1_mcol_iscan_8x32, mcol_scan_8x32_neighbors }, - { mrow_scan_8x32, av1_mrow_iscan_8x32, mrow_scan_8x32_neighbors }, - { mcol_scan_8x32, av1_mcol_iscan_8x32, mcol_scan_8x32_neighbors }, - { mrow_scan_8x32, av1_mrow_iscan_8x32, mrow_scan_8x32_neighbors }, - { mcol_scan_8x32, av1_mcol_iscan_8x32, mcol_scan_8x32_neighbors }, + { default_scan_8x32, av1_default_iscan_8x32 }, + { default_scan_8x32, av1_default_iscan_8x32 }, + { default_scan_8x32, av1_default_iscan_8x32 }, + { default_scan_8x32, av1_default_iscan_8x32 }, + { default_scan_8x32, av1_default_iscan_8x32 }, + { default_scan_8x32, av1_default_iscan_8x32 }, + { default_scan_8x32, 
av1_default_iscan_8x32 }, + { default_scan_8x32, av1_default_iscan_8x32 }, + { default_scan_8x32, av1_default_iscan_8x32 }, + { default_scan_8x32, av1_default_iscan_8x32 }, + { mrow_scan_8x32, av1_mrow_iscan_8x32 }, + { mcol_scan_8x32, av1_mcol_iscan_8x32 }, + { mrow_scan_8x32, av1_mrow_iscan_8x32 }, + { mcol_scan_8x32, av1_mcol_iscan_8x32 }, + { mrow_scan_8x32, av1_mrow_iscan_8x32 }, + { mcol_scan_8x32, av1_mcol_iscan_8x32 }, }, { // TX_32X8 - { default_scan_32x8, av1_default_iscan_32x8, - default_scan_32x8_neighbors }, - { default_scan_32x8, av1_default_iscan_32x8, - default_scan_32x8_neighbors }, - { default_scan_32x8, av1_default_iscan_32x8, - default_scan_32x8_neighbors }, - { default_scan_32x8, av1_default_iscan_32x8, - default_scan_32x8_neighbors }, - { default_scan_32x8, av1_default_iscan_32x8, - default_scan_32x8_neighbors }, - { default_scan_32x8, av1_default_iscan_32x8, - default_scan_32x8_neighbors }, - { default_scan_32x8, av1_default_iscan_32x8, - default_scan_32x8_neighbors }, - { default_scan_32x8, av1_default_iscan_32x8, - default_scan_32x8_neighbors }, - { default_scan_32x8, av1_default_iscan_32x8, - default_scan_32x8_neighbors }, - { default_scan_32x8, av1_default_iscan_32x8, - default_scan_32x8_neighbors }, - { mrow_scan_32x8, av1_mrow_iscan_32x8, mrow_scan_32x8_neighbors }, - { mcol_scan_32x8, av1_mcol_iscan_32x8, mcol_scan_32x8_neighbors }, - { mrow_scan_32x8, av1_mrow_iscan_32x8, mrow_scan_32x8_neighbors }, - { mcol_scan_32x8, av1_mcol_iscan_32x8, mcol_scan_32x8_neighbors }, - { mrow_scan_32x8, av1_mrow_iscan_32x8, mrow_scan_32x8_neighbors }, - { mcol_scan_32x8, av1_mcol_iscan_32x8, mcol_scan_32x8_neighbors }, + { default_scan_32x8, av1_default_iscan_32x8 }, + { default_scan_32x8, av1_default_iscan_32x8 }, + { default_scan_32x8, av1_default_iscan_32x8 }, + { default_scan_32x8, av1_default_iscan_32x8 }, + { default_scan_32x8, av1_default_iscan_32x8 }, + { default_scan_32x8, av1_default_iscan_32x8 }, + { default_scan_32x8, 
av1_default_iscan_32x8 }, + { default_scan_32x8, av1_default_iscan_32x8 }, + { default_scan_32x8, av1_default_iscan_32x8 }, + { default_scan_32x8, av1_default_iscan_32x8 }, + { mrow_scan_32x8, av1_mrow_iscan_32x8 }, + { mcol_scan_32x8, av1_mcol_iscan_32x8 }, + { mrow_scan_32x8, av1_mrow_iscan_32x8 }, + { mcol_scan_32x8, av1_mcol_iscan_32x8 }, + { mrow_scan_32x8, av1_mrow_iscan_32x8 }, + { mcol_scan_32x8, av1_mcol_iscan_32x8 }, }, { // TX_16X64 // Half of the coefficients of tx64 at higher frequencies are set to // zeros. So tx32's scan order is used. - { default_scan_16x32, av1_default_iscan_16x32, - default_scan_16x32_neighbors }, - { default_scan_16x32, av1_default_iscan_16x32, - default_scan_16x32_neighbors }, - { default_scan_16x32, av1_default_iscan_16x32, - default_scan_16x32_neighbors }, - { default_scan_16x32, av1_default_iscan_16x32, - default_scan_16x32_neighbors }, - { default_scan_16x32, av1_default_iscan_16x32, - default_scan_16x32_neighbors }, - { default_scan_16x32, av1_default_iscan_16x32, - default_scan_16x32_neighbors }, - { default_scan_16x32, av1_default_iscan_16x32, - default_scan_16x32_neighbors }, - { default_scan_16x32, av1_default_iscan_16x32, - default_scan_16x32_neighbors }, - { default_scan_16x32, av1_default_iscan_16x32, - default_scan_16x32_neighbors }, - { default_scan_16x32, av1_default_iscan_16x32, - default_scan_16x32_neighbors }, - { mrow_scan_16x32, av1_mrow_iscan_16x32, mrow_scan_16x32_neighbors }, - { mcol_scan_16x32, av1_mcol_iscan_16x32, mcol_scan_16x32_neighbors }, - { mrow_scan_16x32, av1_mrow_iscan_16x32, mrow_scan_16x32_neighbors }, - { mcol_scan_16x32, av1_mcol_iscan_16x32, mcol_scan_16x32_neighbors }, - { mrow_scan_16x32, av1_mrow_iscan_16x32, mrow_scan_16x32_neighbors }, - { mcol_scan_16x32, av1_mcol_iscan_16x32, mcol_scan_16x32_neighbors }, + { default_scan_16x32, av1_default_iscan_16x32 }, + { default_scan_16x32, av1_default_iscan_16x32 }, + { default_scan_16x32, av1_default_iscan_16x32 }, + { default_scan_16x32, 
av1_default_iscan_16x32 }, + { default_scan_16x32, av1_default_iscan_16x32 }, + { default_scan_16x32, av1_default_iscan_16x32 }, + { default_scan_16x32, av1_default_iscan_16x32 }, + { default_scan_16x32, av1_default_iscan_16x32 }, + { default_scan_16x32, av1_default_iscan_16x32 }, + { default_scan_16x32, av1_default_iscan_16x32 }, + { mrow_scan_16x32, av1_mrow_iscan_16x32 }, + { mcol_scan_16x32, av1_mcol_iscan_16x32 }, + { mrow_scan_16x32, av1_mrow_iscan_16x32 }, + { mcol_scan_16x32, av1_mcol_iscan_16x32 }, + { mrow_scan_16x32, av1_mrow_iscan_16x32 }, + { mcol_scan_16x32, av1_mcol_iscan_16x32 }, }, { // TX_64X16 // Half of the coefficients of tx64 at higher frequencies are set to // zeros. So tx32's scan order is used. - { default_scan_32x16, av1_default_iscan_32x16, - default_scan_32x16_neighbors }, - { default_scan_32x16, av1_default_iscan_32x16, - default_scan_32x16_neighbors }, - { default_scan_32x16, av1_default_iscan_32x16, - default_scan_32x16_neighbors }, - { default_scan_32x16, av1_default_iscan_32x16, - default_scan_32x16_neighbors }, - { default_scan_32x16, av1_default_iscan_32x16, - default_scan_32x16_neighbors }, - { default_scan_32x16, av1_default_iscan_32x16, - default_scan_32x16_neighbors }, - { default_scan_32x16, av1_default_iscan_32x16, - default_scan_32x16_neighbors }, - { default_scan_32x16, av1_default_iscan_32x16, - default_scan_32x16_neighbors }, - { default_scan_32x16, av1_default_iscan_32x16, - default_scan_32x16_neighbors }, - { default_scan_32x16, av1_default_iscan_32x16, - default_scan_32x16_neighbors }, - { mrow_scan_32x16, av1_mrow_iscan_32x16, mrow_scan_32x16_neighbors }, - { mcol_scan_32x16, av1_mcol_iscan_32x16, mcol_scan_32x16_neighbors }, - { mrow_scan_32x16, av1_mrow_iscan_32x16, mrow_scan_32x16_neighbors }, - { mcol_scan_32x16, av1_mcol_iscan_32x16, mcol_scan_32x16_neighbors }, - { mrow_scan_32x16, av1_mrow_iscan_32x16, mrow_scan_32x16_neighbors }, - { mcol_scan_32x16, av1_mcol_iscan_32x16, mcol_scan_32x16_neighbors }, + { 
default_scan_32x16, av1_default_iscan_32x16 }, + { default_scan_32x16, av1_default_iscan_32x16 }, + { default_scan_32x16, av1_default_iscan_32x16 }, + { default_scan_32x16, av1_default_iscan_32x16 }, + { default_scan_32x16, av1_default_iscan_32x16 }, + { default_scan_32x16, av1_default_iscan_32x16 }, + { default_scan_32x16, av1_default_iscan_32x16 }, + { default_scan_32x16, av1_default_iscan_32x16 }, + { default_scan_32x16, av1_default_iscan_32x16 }, + { default_scan_32x16, av1_default_iscan_32x16 }, + { mrow_scan_32x16, av1_mrow_iscan_32x16 }, + { mcol_scan_32x16, av1_mcol_iscan_32x16 }, + { mrow_scan_32x16, av1_mrow_iscan_32x16 }, + { mcol_scan_32x16, av1_mcol_iscan_32x16 }, + { mrow_scan_32x16, av1_mrow_iscan_32x16 }, + { mcol_scan_32x16, av1_mcol_iscan_32x16 }, }, }; diff --git a/media/libaom/src/av1/common/scan.h b/media/libaom/src/av1/common/scan.h index 233dc0efa..d9620e1c5 100644 --- a/media/libaom/src/av1/common/scan.h +++ b/media/libaom/src/av1/common/scan.h @@ -15,9 +15,9 @@ #include "aom/aom_integer.h" #include "aom_ports/mem.h" -#include "av1/common/enums.h" -#include "av1/common/onyxc_int.h" +#include "av1/common/av1_common_int.h" #include "av1/common/blockd.h" +#include "av1/common/enums.h" #ifdef __cplusplus extern "C" { @@ -25,14 +25,14 @@ extern "C" { #define MAX_NEIGHBORS 2 -typedef enum SCAN_MODE { +enum { SCAN_MODE_ZIG_ZAG, SCAN_MODE_COL_DIAG, SCAN_MODE_ROW_DIAG, SCAN_MODE_COL_1D, SCAN_MODE_ROW_1D, SCAN_MODES -} SCAN_MODE; +} UENUM1BYTE(SCAN_MODE); extern const SCAN_ORDER av1_default_scan_orders[TX_SIZES]; extern const SCAN_ORDER av1_scan_orders[TX_SIZES_ALL][TX_TYPES]; diff --git a/media/libaom/src/av1/common/seg_common.c b/media/libaom/src/av1/common/seg_common.c index cd189ad76..60b185161 100644 --- a/media/libaom/src/av1/common/seg_common.c +++ b/media/libaom/src/av1/common/seg_common.c @@ -16,12 +16,19 @@ #include "av1/common/seg_common.h" #include "av1/common/quant_common.h" -static const int seg_feature_data_signed[SEG_LVL_MAX] = { 1, 1, 
1, 1, 1, 0, 0 }; - -static const int seg_feature_data_max[SEG_LVL_MAX] = { - MAXQ, MAX_LOOP_FILTER, MAX_LOOP_FILTER, MAX_LOOP_FILTER, MAX_LOOP_FILTER, 7, 0 +static const int seg_feature_data_signed[SEG_LVL_MAX] = { + 1, 1, 1, 1, 1, 0, 0, 0 }; +static const int seg_feature_data_max[SEG_LVL_MAX] = { MAXQ, + MAX_LOOP_FILTER, + MAX_LOOP_FILTER, + MAX_LOOP_FILTER, + MAX_LOOP_FILTER, + 7, + 0, + 0 }; + // These functions provide access to new segment level features. // Eventually these function may be "optimized out" but for the moment, // the coding mechanism is still subject to change so these provide a @@ -32,7 +39,7 @@ void av1_clearall_segfeatures(struct segmentation *seg) { av1_zero(seg->feature_mask); } -void calculate_segdata(struct segmentation *seg) { +void av1_calculate_segdata(struct segmentation *seg) { seg->segid_preskip = 0; seg->last_active_segid = 0; for (int i = 0; i < MAX_SEGMENTS; i++) { diff --git a/media/libaom/src/av1/common/seg_common.h b/media/libaom/src/av1/common/seg_common.h index 8c35bba86..aeb9c1768 100644 --- a/media/libaom/src/av1/common/seg_common.h +++ b/media/libaom/src/av1/common/seg_common.h @@ -24,7 +24,7 @@ extern "C" { #define SEG_TEMPORAL_PRED_CTXS 3 #define SPATIAL_PREDICTION_PROBS 3 -typedef enum { +enum { SEG_LVL_ALT_Q, // Use alternate Quantizer .... 
SEG_LVL_ALT_LF_Y_V, // Use alternate loop filter value on y plane vertical SEG_LVL_ALT_LF_Y_H, // Use alternate loop filter value on y plane horizontal @@ -34,7 +34,7 @@ typedef enum { SEG_LVL_SKIP, // Optional Segment (0,0) + skip mode SEG_LVL_GLOBALMV, SEG_LVL_MAX -} SEG_LVL_FEATURES; +} UENUM1BYTE(SEG_LVL_FEATURES); struct segmentation { uint8_t enabled; @@ -83,7 +83,7 @@ void av1_clearall_segfeatures(struct segmentation *seg); void av1_enable_segfeature(struct segmentation *seg, int segment_id, SEG_LVL_FEATURES feature_id); -void calculate_segdata(struct segmentation *seg); +void av1_calculate_segdata(struct segmentation *seg); int av1_seg_feature_data_max(SEG_LVL_FEATURES feature_id); diff --git a/media/libaom/src/av1/common/thread_common.c b/media/libaom/src/av1/common/thread_common.c index 8df4c9a09..f3c8795f8 100644 --- a/media/libaom/src/av1/common/thread_common.c +++ b/media/libaom/src/av1/common/thread_common.c @@ -205,7 +205,11 @@ static INLINE void sync_write(AV1LfSync *const lf_sync, int r, int c, } static void enqueue_lf_jobs(AV1LfSync *lf_sync, AV1_COMMON *cm, int start, - int stop, int plane_start, int plane_end) { + int stop, +#if CONFIG_LPF_MASK + int is_decoding, +#endif + int plane_start, int plane_end) { int mi_row, plane, dir; AV1LfMTInfo *lf_job_queue = lf_sync->job_queue; lf_sync->jobs_enqueued = 0; @@ -219,7 +223,16 @@ static void enqueue_lf_jobs(AV1LfSync *lf_sync, AV1_COMMON *cm, int start, continue; else if (plane == 2 && !(cm->lf.filter_level_v)) continue; - for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) { +#if CONFIG_LPF_MASK + int step = MAX_MIB_SIZE; + if (is_decoding) { + step = MI_SIZE_64X64; + } + for (mi_row = start; mi_row < stop; mi_row += step) +#else + for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) +#endif + { lf_job_queue->mi_row = mi_row; lf_job_queue->plane = plane; lf_job_queue->dir = dir; @@ -230,7 +243,7 @@ static void enqueue_lf_jobs(AV1LfSync *lf_sync, AV1_COMMON *cm, int start, } } 
-AV1LfMTInfo *get_lf_job_info(AV1LfSync *lf_sync) { +static AV1LfMTInfo *get_lf_job_info(AV1LfSync *lf_sync) { AV1LfMTInfo *cur_job_info = NULL; #if CONFIG_MULTITHREAD @@ -255,7 +268,8 @@ static INLINE void thread_loop_filter_rows( struct macroblockd_plane *planes, MACROBLOCKD *xd, AV1LfSync *const lf_sync) { const int sb_cols = - ALIGN_POWER_OF_TWO(cm->mi_cols, MAX_MIB_SIZE_LOG2) >> MAX_MIB_SIZE_LOG2; + ALIGN_POWER_OF_TWO(cm->mi_params.mi_cols, MAX_MIB_SIZE_LOG2) >> + MAX_MIB_SIZE_LOG2; int mi_row, mi_col, plane, dir; int r, c; @@ -269,7 +283,8 @@ static INLINE void thread_loop_filter_rows( r = mi_row >> MAX_MIB_SIZE_LOG2; if (dir == 0) { - for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MAX_MIB_SIZE) { + for (mi_col = 0; mi_col < cm->mi_params.mi_cols; + mi_col += MAX_MIB_SIZE) { c = mi_col >> MAX_MIB_SIZE_LOG2; av1_setup_dst_planes(planes, cm->seq_params.sb_size, frame_buffer, @@ -280,7 +295,8 @@ static INLINE void thread_loop_filter_rows( sync_write(lf_sync, r, c, sb_cols, plane); } } else if (dir == 1) { - for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MAX_MIB_SIZE) { + for (mi_col = 0; mi_col < cm->mi_params.mi_cols; + mi_col += MAX_MIB_SIZE) { c = mi_col >> MAX_MIB_SIZE_LOG2; // Wait for vertical edge filtering of the top-right block to be @@ -312,15 +328,98 @@ static int loop_filter_row_worker(void *arg1, void *arg2) { return 1; } +#if CONFIG_LPF_MASK +static INLINE void thread_loop_filter_bitmask_rows( + const YV12_BUFFER_CONFIG *const frame_buffer, AV1_COMMON *const cm, + struct macroblockd_plane *planes, MACROBLOCKD *xd, + AV1LfSync *const lf_sync) { + const int sb_cols = + ALIGN_POWER_OF_TWO(cm->mi_params.mi_cols, MIN_MIB_SIZE_LOG2) >> + MIN_MIB_SIZE_LOG2; + int mi_row, mi_col, plane, dir; + int r, c; + (void)xd; + + while (1) { + AV1LfMTInfo *cur_job_info = get_lf_job_info(lf_sync); + + if (cur_job_info != NULL) { + mi_row = cur_job_info->mi_row; + plane = cur_job_info->plane; + dir = cur_job_info->dir; + r = mi_row >> MIN_MIB_SIZE_LOG2; + + if (dir 
== 0) { + for (mi_col = 0; mi_col < cm->mi_params.mi_cols; + mi_col += MI_SIZE_64X64) { + c = mi_col >> MIN_MIB_SIZE_LOG2; + + av1_setup_dst_planes(planes, BLOCK_64X64, frame_buffer, mi_row, + mi_col, plane, plane + 1); + + av1_filter_block_plane_bitmask_vert(cm, &planes[plane], plane, mi_row, + mi_col); + sync_write(lf_sync, r, c, sb_cols, plane); + } + } else if (dir == 1) { + for (mi_col = 0; mi_col < cm->mi_params.mi_cols; + mi_col += MI_SIZE_64X64) { + c = mi_col >> MIN_MIB_SIZE_LOG2; + + // Wait for vertical edge filtering of the top-right block to be + // completed + sync_read(lf_sync, r, c, plane); + + // Wait for vertical edge filtering of the right block to be + // completed + sync_read(lf_sync, r + 1, c, plane); + + av1_setup_dst_planes(planes, BLOCK_64X64, frame_buffer, mi_row, + mi_col, plane, plane + 1); + av1_filter_block_plane_bitmask_horz(cm, &planes[plane], plane, mi_row, + mi_col); + } + } + } else { + break; + } + } +} + +// Row-based multi-threaded loopfilter hook +static int loop_filter_bitmask_row_worker(void *arg1, void *arg2) { + AV1LfSync *const lf_sync = (AV1LfSync *)arg1; + LFWorkerData *const lf_data = (LFWorkerData *)arg2; + thread_loop_filter_bitmask_rows(lf_data->frame_buffer, lf_data->cm, + lf_data->planes, lf_data->xd, lf_sync); + return 1; +} +#endif // CONFIG_LPF_MASK + static void loop_filter_rows_mt(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, MACROBLOCKD *xd, int start, int stop, int plane_start, int plane_end, +#if CONFIG_LPF_MASK + int is_decoding, +#endif AVxWorker *workers, int nworkers, AV1LfSync *lf_sync) { const AVxWorkerInterface *const winterface = aom_get_worker_interface(); +#if CONFIG_LPF_MASK + int sb_rows; + if (is_decoding) { + sb_rows = ALIGN_POWER_OF_TWO(cm->mi_params.mi_rows, MIN_MIB_SIZE_LOG2) >> + MIN_MIB_SIZE_LOG2; + } else { + sb_rows = ALIGN_POWER_OF_TWO(cm->mi_params.mi_rows, MAX_MIB_SIZE_LOG2) >> + MAX_MIB_SIZE_LOG2; + } +#else // Number of superblock rows and cols const int sb_rows = - 
ALIGN_POWER_OF_TWO(cm->mi_rows, MAX_MIB_SIZE_LOG2) >> MAX_MIB_SIZE_LOG2; + ALIGN_POWER_OF_TWO(cm->mi_params.mi_rows, MAX_MIB_SIZE_LOG2) >> + MAX_MIB_SIZE_LOG2; +#endif const int num_workers = nworkers; int i; @@ -336,14 +435,26 @@ static void loop_filter_rows_mt(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, sizeof(*(lf_sync->cur_sb_col[i])) * sb_rows); } - enqueue_lf_jobs(lf_sync, cm, start, stop, plane_start, plane_end); + enqueue_lf_jobs(lf_sync, cm, start, stop, +#if CONFIG_LPF_MASK + is_decoding, +#endif + plane_start, plane_end); // Set up loopfilter thread data. for (i = 0; i < num_workers; ++i) { AVxWorker *const worker = &workers[i]; LFWorkerData *const lf_data = &lf_sync->lfdata[i]; +#if CONFIG_LPF_MASK + if (is_decoding) { + worker->hook = loop_filter_bitmask_row_worker; + } else { + worker->hook = loop_filter_row_worker; + } +#else worker->hook = loop_filter_row_worker; +#endif worker->data1 = lf_sync; worker->data2 = lf_data; @@ -366,22 +477,55 @@ static void loop_filter_rows_mt(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, void av1_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, MACROBLOCKD *xd, int plane_start, int plane_end, - int partial_frame, AVxWorker *workers, - int num_workers, AV1LfSync *lf_sync) { + int partial_frame, +#if CONFIG_LPF_MASK + int is_decoding, +#endif + AVxWorker *workers, int num_workers, + AV1LfSync *lf_sync) { int start_mi_row, end_mi_row, mi_rows_to_filter; start_mi_row = 0; - mi_rows_to_filter = cm->mi_rows; - if (partial_frame && cm->mi_rows > 8) { - start_mi_row = cm->mi_rows >> 1; + mi_rows_to_filter = cm->mi_params.mi_rows; + if (partial_frame && cm->mi_params.mi_rows > 8) { + start_mi_row = cm->mi_params.mi_rows >> 1; start_mi_row &= 0xfffffff8; - mi_rows_to_filter = AOMMAX(cm->mi_rows / 8, 8); + mi_rows_to_filter = AOMMAX(cm->mi_params.mi_rows / 8, 8); } end_mi_row = start_mi_row + mi_rows_to_filter; av1_loop_filter_frame_init(cm, plane_start, plane_end); +#if CONFIG_LPF_MASK + if (is_decoding) { + 
cm->is_decoding = is_decoding; + // TODO(chengchen): currently use one thread to build bitmasks for the + // frame. Make it support multi-thread later. + for (int plane = plane_start; plane < plane_end; plane++) { + if (plane == 0 && !(cm->lf.filter_level[0]) && !(cm->lf.filter_level[1])) + break; + else if (plane == 1 && !(cm->lf.filter_level_u)) + continue; + else if (plane == 2 && !(cm->lf.filter_level_v)) + continue; + + // TODO(chengchen): can we remove this? + struct macroblockd_plane *pd = xd->plane; + av1_setup_dst_planes(pd, cm->seq_params.sb_size, frame, 0, 0, plane, + plane + 1); + + av1_build_bitmask_vert_info(cm, &pd[plane], plane); + av1_build_bitmask_horz_info(cm, &pd[plane], plane); + } + loop_filter_rows_mt(frame, cm, xd, start_mi_row, end_mi_row, plane_start, + plane_end, 1, workers, num_workers, lf_sync); + } else { + loop_filter_rows_mt(frame, cm, xd, start_mi_row, end_mi_row, plane_start, + plane_end, 0, workers, num_workers, lf_sync); + } +#else loop_filter_rows_mt(frame, cm, xd, start_mi_row, end_mi_row, plane_start, plane_end, workers, num_workers, lf_sync); +#endif } static INLINE void lr_sync_read(void *const lr_sync, int r, int c, int plane) { @@ -630,7 +774,7 @@ static void enqueue_lr_jobs(AV1LrSync *lr_sync, AV1LrStruct *lr_ctxt, } } -AV1LrMTInfo *get_lr_job_info(AV1LrSync *lr_sync) { +static AV1LrMTInfo *get_lr_job_info(AV1LrSync *lr_sync) { AV1LrMTInfo *cur_job_info = NULL; #if CONFIG_MULTITHREAD @@ -664,9 +808,9 @@ static int loop_restoration_row_worker(void *arg1, void *arg2) { typedef void (*copy_fun)(const YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int hstart, int hend, int vstart, int vend); - static const copy_fun copy_funs[3] = { - aom_yv12_partial_copy_y, aom_yv12_partial_copy_u, aom_yv12_partial_copy_v - }; + static const copy_fun copy_funs[3] = { aom_yv12_partial_coloc_copy_y, + aom_yv12_partial_coloc_copy_u, + aom_yv12_partial_coloc_copy_v }; while (1) { AV1LrMTInfo *cur_job_info = get_lr_job_info(lr_sync); 
@@ -772,7 +916,7 @@ void av1_loop_restoration_filter_frame_mt(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, int optimized_lr, AVxWorker *workers, int num_workers, AV1LrSync *lr_sync, void *lr_ctxt) { - assert(!cm->all_lossless); + assert(!cm->features.all_lossless); const int num_planes = av1_num_planes(cm); diff --git a/media/libaom/src/av1/common/thread_common.h b/media/libaom/src/av1/common/thread_common.h index 23d61d72a..7397f1c54 100644 --- a/media/libaom/src/av1/common/thread_common.h +++ b/media/libaom/src/av1/common/thread_common.h @@ -101,8 +101,11 @@ typedef struct AV1LrSyncData { void av1_loop_filter_dealloc(AV1LfSync *lf_sync); void av1_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame, struct AV1Common *cm, - struct macroblockd *mbd, int plane_start, + struct macroblockd *xd, int plane_start, int plane_end, int partial_frame, +#if CONFIG_LPF_MASK + int is_decoding, +#endif AVxWorker *workers, int num_workers, AV1LfSync *lf_sync); void av1_loop_restoration_filter_frame_mt(YV12_BUFFER_CONFIG *frame, diff --git a/media/libaom/src/av1/common/tile_common.c b/media/libaom/src/av1/common/tile_common.c index 1b413487f..1b11bd760 100644 --- a/media/libaom/src/av1/common/tile_common.c +++ b/media/libaom/src/av1/common/tile_common.c @@ -9,9 +9,9 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ -#include "av1/common/tile_common.h" -#include "av1/common/onyxc_int.h" +#include "av1/common/av1_common_int.h" #include "av1/common/resize.h" +#include "av1/common/tile_common.h" #include "aom_dsp/aom_dsp_common.h" void av1_tile_init(TileInfo *tile, const AV1_COMMON *cm, int row, int col) { @@ -28,102 +28,126 @@ static int tile_log2(int blk_size, int target) { } void av1_get_tile_limits(AV1_COMMON *const cm) { - int mi_cols = ALIGN_POWER_OF_TWO(cm->mi_cols, cm->seq_params.mib_size_log2); - int mi_rows = ALIGN_POWER_OF_TWO(cm->mi_rows, cm->seq_params.mib_size_log2); - int sb_cols = mi_cols >> cm->seq_params.mib_size_log2; - int sb_rows = mi_rows >> cm->seq_params.mib_size_log2; - - int sb_size_log2 = cm->seq_params.mib_size_log2 + MI_SIZE_LOG2; - cm->max_tile_width_sb = MAX_TILE_WIDTH >> sb_size_log2; - int max_tile_area_sb = MAX_TILE_AREA >> (2 * sb_size_log2); - - cm->min_log2_tile_cols = tile_log2(cm->max_tile_width_sb, sb_cols); - cm->max_log2_tile_cols = tile_log2(1, AOMMIN(sb_cols, MAX_TILE_COLS)); - cm->max_log2_tile_rows = tile_log2(1, AOMMIN(sb_rows, MAX_TILE_ROWS)); - cm->min_log2_tiles = tile_log2(max_tile_area_sb, sb_cols * sb_rows); - cm->min_log2_tiles = AOMMAX(cm->min_log2_tiles, cm->min_log2_tile_cols); + const SequenceHeader *const seq_params = &cm->seq_params; + CommonTileParams *const tiles = &cm->tiles; + const int mi_cols = + ALIGN_POWER_OF_TWO(cm->mi_params.mi_cols, seq_params->mib_size_log2); + const int mi_rows = + ALIGN_POWER_OF_TWO(cm->mi_params.mi_rows, seq_params->mib_size_log2); + const int sb_cols = mi_cols >> seq_params->mib_size_log2; + const int sb_rows = mi_rows >> seq_params->mib_size_log2; + + const int sb_size_log2 = seq_params->mib_size_log2 + MI_SIZE_LOG2; + tiles->max_width_sb = MAX_TILE_WIDTH >> sb_size_log2; + const int max_tile_area_sb = MAX_TILE_AREA >> (2 * sb_size_log2); + + tiles->min_log2_cols = tile_log2(tiles->max_width_sb, sb_cols); + tiles->max_log2_cols = tile_log2(1, AOMMIN(sb_cols, MAX_TILE_COLS)); + 
tiles->max_log2_rows = tile_log2(1, AOMMIN(sb_rows, MAX_TILE_ROWS)); + tiles->min_log2 = tile_log2(max_tile_area_sb, sb_cols * sb_rows); + tiles->min_log2 = AOMMAX(tiles->min_log2, tiles->min_log2_cols); } -void av1_calculate_tile_cols(AV1_COMMON *const cm) { - int mi_cols = ALIGN_POWER_OF_TWO(cm->mi_cols, cm->seq_params.mib_size_log2); - int mi_rows = ALIGN_POWER_OF_TWO(cm->mi_rows, cm->seq_params.mib_size_log2); - int sb_cols = mi_cols >> cm->seq_params.mib_size_log2; - int sb_rows = mi_rows >> cm->seq_params.mib_size_log2; +void av1_calculate_tile_cols(const SequenceHeader *const seq_params, + int cm_mi_rows, int cm_mi_cols, + CommonTileParams *const tiles) { + int mi_cols = ALIGN_POWER_OF_TWO(cm_mi_cols, seq_params->mib_size_log2); + int mi_rows = ALIGN_POWER_OF_TWO(cm_mi_rows, seq_params->mib_size_log2); + int sb_cols = mi_cols >> seq_params->mib_size_log2; + int sb_rows = mi_rows >> seq_params->mib_size_log2; int i; - if (cm->uniform_tile_spacing_flag) { + // This will be overridden if there is at least two columns of tiles + // (otherwise there is no inner tile width) + tiles->min_inner_width = -1; + + if (tiles->uniform_spacing) { int start_sb; - int size_sb = ALIGN_POWER_OF_TWO(sb_cols, cm->log2_tile_cols); - size_sb >>= cm->log2_tile_cols; + int size_sb = ALIGN_POWER_OF_TWO(sb_cols, tiles->log2_cols); + size_sb >>= tiles->log2_cols; assert(size_sb > 0); for (i = 0, start_sb = 0; start_sb < sb_cols; i++) { - cm->tile_col_start_sb[i] = start_sb; + tiles->col_start_sb[i] = start_sb; start_sb += size_sb; } - cm->tile_cols = i; - cm->tile_col_start_sb[i] = sb_cols; - cm->min_log2_tile_rows = AOMMAX(cm->min_log2_tiles - cm->log2_tile_cols, 0); - cm->max_tile_height_sb = sb_rows >> cm->min_log2_tile_rows; - - cm->tile_width = size_sb << cm->seq_params.mib_size_log2; - cm->tile_width = AOMMIN(cm->tile_width, cm->mi_cols); + tiles->cols = i; + tiles->col_start_sb[i] = sb_cols; + tiles->min_log2_rows = AOMMAX(tiles->min_log2 - tiles->log2_cols, 0); + 
tiles->max_height_sb = sb_rows >> tiles->min_log2_rows; + + tiles->width = size_sb << seq_params->mib_size_log2; + tiles->width = AOMMIN(tiles->width, cm_mi_cols); + if (tiles->cols > 1) { + tiles->min_inner_width = tiles->width; + } } else { int max_tile_area_sb = (sb_rows * sb_cols); int widest_tile_sb = 1; - cm->log2_tile_cols = tile_log2(1, cm->tile_cols); - for (i = 0; i < cm->tile_cols; i++) { - int size_sb = cm->tile_col_start_sb[i + 1] - cm->tile_col_start_sb[i]; + int narrowest_inner_tile_sb = 65536; + tiles->log2_cols = tile_log2(1, tiles->cols); + for (i = 0; i < tiles->cols; i++) { + int size_sb = tiles->col_start_sb[i + 1] - tiles->col_start_sb[i]; widest_tile_sb = AOMMAX(widest_tile_sb, size_sb); + // ignore the rightmost tile in frame for determining the narrowest + if (i < tiles->cols - 1) + narrowest_inner_tile_sb = AOMMIN(narrowest_inner_tile_sb, size_sb); } - if (cm->min_log2_tiles) { - max_tile_area_sb >>= (cm->min_log2_tiles + 1); + if (tiles->min_log2) { + max_tile_area_sb >>= (tiles->min_log2 + 1); + } + tiles->max_height_sb = AOMMAX(max_tile_area_sb / widest_tile_sb, 1); + if (tiles->cols > 1) { + tiles->min_inner_width = narrowest_inner_tile_sb + << seq_params->mib_size_log2; } - cm->max_tile_height_sb = AOMMAX(max_tile_area_sb / widest_tile_sb, 1); } } -void av1_calculate_tile_rows(AV1_COMMON *const cm) { - int mi_rows = ALIGN_POWER_OF_TWO(cm->mi_rows, cm->seq_params.mib_size_log2); - int sb_rows = mi_rows >> cm->seq_params.mib_size_log2; +void av1_calculate_tile_rows(const SequenceHeader *const seq_params, + int cm_mi_rows, CommonTileParams *const tiles) { + int mi_rows = ALIGN_POWER_OF_TWO(cm_mi_rows, seq_params->mib_size_log2); + int sb_rows = mi_rows >> seq_params->mib_size_log2; int start_sb, size_sb, i; - if (cm->uniform_tile_spacing_flag) { - size_sb = ALIGN_POWER_OF_TWO(sb_rows, cm->log2_tile_rows); - size_sb >>= cm->log2_tile_rows; + if (tiles->uniform_spacing) { + size_sb = ALIGN_POWER_OF_TWO(sb_rows, tiles->log2_rows); + size_sb 
>>= tiles->log2_rows; assert(size_sb > 0); for (i = 0, start_sb = 0; start_sb < sb_rows; i++) { - cm->tile_row_start_sb[i] = start_sb; + tiles->row_start_sb[i] = start_sb; start_sb += size_sb; } - cm->tile_rows = i; - cm->tile_row_start_sb[i] = sb_rows; + tiles->rows = i; + tiles->row_start_sb[i] = sb_rows; - cm->tile_height = size_sb << cm->seq_params.mib_size_log2; - cm->tile_height = AOMMIN(cm->tile_height, cm->mi_rows); + tiles->height = size_sb << seq_params->mib_size_log2; + tiles->height = AOMMIN(tiles->height, cm_mi_rows); } else { - cm->log2_tile_rows = tile_log2(1, cm->tile_rows); + tiles->log2_rows = tile_log2(1, tiles->rows); } } void av1_tile_set_row(TileInfo *tile, const AV1_COMMON *cm, int row) { - assert(row < cm->tile_rows); - int mi_row_start = cm->tile_row_start_sb[row] << cm->seq_params.mib_size_log2; - int mi_row_end = cm->tile_row_start_sb[row + 1] + assert(row < cm->tiles.rows); + int mi_row_start = cm->tiles.row_start_sb[row] + << cm->seq_params.mib_size_log2; + int mi_row_end = cm->tiles.row_start_sb[row + 1] << cm->seq_params.mib_size_log2; tile->tile_row = row; tile->mi_row_start = mi_row_start; - tile->mi_row_end = AOMMIN(mi_row_end, cm->mi_rows); + tile->mi_row_end = AOMMIN(mi_row_end, cm->mi_params.mi_rows); assert(tile->mi_row_end > tile->mi_row_start); } void av1_tile_set_col(TileInfo *tile, const AV1_COMMON *cm, int col) { - assert(col < cm->tile_cols); - int mi_col_start = cm->tile_col_start_sb[col] << cm->seq_params.mib_size_log2; - int mi_col_end = cm->tile_col_start_sb[col + 1] + assert(col < cm->tiles.cols); + int mi_col_start = cm->tiles.col_start_sb[col] + << cm->seq_params.mib_size_log2; + int mi_col_end = cm->tiles.col_start_sb[col + 1] << cm->seq_params.mib_size_log2; tile->tile_col = col; tile->mi_col_start = mi_col_start; - tile->mi_col_end = AOMMIN(mi_col_end, cm->mi_cols); + tile->mi_col_end = AOMMIN(mi_col_end, cm->mi_params.mi_cols); assert(tile->mi_col_end > tile->mi_col_start); } @@ -143,30 +167,6 @@ int 
av1_get_sb_cols_in_tile(AV1_COMMON *cm, TileInfo tile) { return sb_cols; } -int get_tile_size(int mi_frame_size, int log2_tile_num, int *ntiles) { - // Round the frame up to a whole number of max superblocks - mi_frame_size = ALIGN_POWER_OF_TWO(mi_frame_size, MAX_MIB_SIZE_LOG2); - - // Divide by the signalled number of tiles, rounding up to the multiple of - // the max superblock size. To do this, shift right (and round up) to get the - // tile size in max super-blocks and then shift left again to convert it to - // mi units. - const int shift = log2_tile_num + MAX_MIB_SIZE_LOG2; - const int max_sb_tile_size = - ALIGN_POWER_OF_TWO(mi_frame_size, shift) >> shift; - const int mi_tile_size = max_sb_tile_size << MAX_MIB_SIZE_LOG2; - - // The actual number of tiles is the ceiling of the frame size in mi units - // divided by mi_size. This is at most 1 << log2_tile_num but might be - // strictly less if max_sb_tile_size got rounded up significantly. - if (ntiles) { - *ntiles = (mi_frame_size + mi_tile_size - 1) / mi_tile_size; - assert(*ntiles <= (1 << log2_tile_num)); - } - - return mi_tile_size; -} - AV1PixelRect av1_get_tile_rect(const TileInfo *tile_info, const AV1_COMMON *cm, int is_uv) { AV1PixelRect r; @@ -205,3 +205,35 @@ AV1PixelRect av1_get_tile_rect(const TileInfo *tile_info, const AV1_COMMON *cm, return r; } + +void av1_get_uniform_tile_size(const AV1_COMMON *cm, int *w, int *h) { + const CommonTileParams *const tiles = &cm->tiles; + if (tiles->uniform_spacing) { + *w = tiles->width; + *h = tiles->height; + } else { + for (int i = 0; i < tiles->cols; ++i) { + const int tile_width_sb = + tiles->col_start_sb[i + 1] - tiles->col_start_sb[i]; + const int tile_w = tile_width_sb * cm->seq_params.mib_size; + assert(i == 0 || tile_w == *w); // ensure all tiles have same dimension + *w = tile_w; + } + + for (int i = 0; i < tiles->rows; ++i) { + const int tile_height_sb = + tiles->row_start_sb[i + 1] - tiles->row_start_sb[i]; + const int tile_h = tile_height_sb * 
cm->seq_params.mib_size; + assert(i == 0 || tile_h == *h); // ensure all tiles have same dimension + *h = tile_h; + } + } +} + +int av1_is_min_tile_width_satisfied(const AV1_COMMON *cm) { + // Disable check if there is a single tile col in the frame + if (cm->tiles.cols == 1) return 1; + + return ((cm->tiles.min_inner_width << MI_SIZE_LOG2) >= + (64 << av1_superres_scaled(cm))); +} diff --git a/media/libaom/src/av1/common/tile_common.h b/media/libaom/src/av1/common/tile_common.h index c03553dc6..ca7c5f496 100644 --- a/media/libaom/src/av1/common/tile_common.h +++ b/media/libaom/src/av1/common/tile_common.h @@ -19,13 +19,14 @@ extern "C" { #include "config/aom_config.h" struct AV1Common; +struct SequenceHeader; +struct CommonTileParams; #define DEFAULT_MAX_NUM_TG 1 typedef struct TileInfo { int mi_row_start, mi_row_end; int mi_col_start, mi_col_end; - int tg_horz_boundary; int tile_row; int tile_col; } TileInfo; @@ -37,12 +38,6 @@ void av1_tile_init(TileInfo *tile, const struct AV1Common *cm, int row, void av1_tile_set_row(TileInfo *tile, const struct AV1Common *cm, int row); void av1_tile_set_col(TileInfo *tile, const struct AV1Common *cm, int col); -void av1_get_tile_n_bits(int mi_cols, int *min_log2_tile_cols, - int *max_log2_tile_cols); - -// Calculate the correct tile size (width or height) for (1 << log2_tile_num) -// tiles horizontally or vertically in the frame. 
-int get_tile_size(int mi_frame_size, int log2_tile_num, int *ntiles); int av1_get_sb_rows_in_tile(struct AV1Common *cm, TileInfo tile); int av1_get_sb_cols_in_tile(struct AV1Common *cm, TileInfo tile); @@ -61,9 +56,17 @@ AV1PixelRect av1_get_tile_rect(const TileInfo *tile_info, #define MAX_TILE_WIDTH (4096) // Max Tile width in pixels #define MAX_TILE_AREA (4096 * 2304) // Maximum tile area in pixels +void av1_get_uniform_tile_size(const struct AV1Common *cm, int *w, int *h); void av1_get_tile_limits(struct AV1Common *const cm); -void av1_calculate_tile_cols(struct AV1Common *const cm); -void av1_calculate_tile_rows(struct AV1Common *const cm); +void av1_calculate_tile_cols(const struct SequenceHeader *const seq_params, + int cm_mi_rows, int cm_mi_cols, + struct CommonTileParams *const tiles); +void av1_calculate_tile_rows(const struct SequenceHeader *const seq_params, + int cm_mi_rows, + struct CommonTileParams *const tiles); + +// Checks if the minimum tile_width requirement is satisfied +int av1_is_min_tile_width_satisfied(const struct AV1Common *cm); #ifdef __cplusplus } // extern "C" diff --git a/media/libaom/src/av1/common/timing.c b/media/libaom/src/av1/common/timing.c index 49dbde78f..a959cdf76 100644 --- a/media/libaom/src/av1/common/timing.c +++ b/media/libaom/src/av1/common/timing.c @@ -15,22 +15,35 @@ * The tables are in Kbps instead of Mbps in the specification. * Note that depending on the profile, a multiplier is needed. */ +#define UNDEFINED_RATE \ + (1 << 21) // Placeholder rate for levels with undefined rate +#define INVALID_RATE \ + (0) // For invalid profile-level configuration, set rate to 0 /* Max Bitrates for levels of Main Tier in kbps. Bitrate in main_kbps [31] */ /* is a dummy value. The decoder model is not applicable for level 31. 
*/ static int32_t main_kbps[1 << LEVEL_BITS] = { - 1500, 3000, 0, 0, 6000, 10000, 0, 0, 12000, 20000, 0, - 0, 30000, 40000, 60000, 60000, 60000, 100000, 160000, 160000, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, (1 << 26) + 1500, 3000, UNDEFINED_RATE, UNDEFINED_RATE, + 6000, 10000, UNDEFINED_RATE, UNDEFINED_RATE, + 12000, 20000, UNDEFINED_RATE, UNDEFINED_RATE, + 30000, 40000, 60000, 60000, + 60000, 100000, 160000, 160000, + UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE, + UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE, + UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE }; /* Max Bitrates for levels of High Tier in kbps. Bitrate in high_kbps [31] */ /* is a dummy value. The decoder model is not applicable for level 31. */ static int32_t high_kbps[1 << LEVEL_BITS] = { - 0, 0, 0, 0, 0, 0, 0, 0, - 30000, 50000, 0, 0, 100000, 160000, 240000, 240000, - 240000, 480000, 800000, 800000, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, (1 << 26) + INVALID_RATE, INVALID_RATE, INVALID_RATE, INVALID_RATE, + INVALID_RATE, INVALID_RATE, INVALID_RATE, INVALID_RATE, + 30000, 50000, UNDEFINED_RATE, UNDEFINED_RATE, + 100000, 160000, 240000, 240000, + 240000, 480000, 800000, 800000, + UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE, + UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE, + UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE }; /* BitrateProfileFactor */ @@ -38,8 +51,8 @@ static int bitrate_profile_factor[1 << PROFILE_BITS] = { 1, 2, 3, 0, 0, 0, 0, 0 }; -int64_t max_level_bitrate(BITSTREAM_PROFILE seq_profile, int seq_level_idx, - int seq_tier) { +int64_t av1_max_level_bitrate(BITSTREAM_PROFILE seq_profile, int seq_level_idx, + int seq_tier) { int64_t bitrate; if (seq_tier) { @@ -51,13 +64,13 @@ int64_t max_level_bitrate(BITSTREAM_PROFILE seq_profile, int seq_level_idx, return bitrate * 1000; } -void set_aom_dec_model_info(aom_dec_model_info_t *decoder_model) { +void 
av1_set_aom_dec_model_info(aom_dec_model_info_t *decoder_model) { decoder_model->encoder_decoder_buffer_delay_length = 16; decoder_model->buffer_removal_time_length = 10; decoder_model->frame_presentation_time_length = 10; } -void set_dec_model_op_parameters(aom_dec_model_op_parameters_t *op_params) { +void av1_set_dec_model_op_parameters(aom_dec_model_op_parameters_t *op_params) { op_params->decoder_model_param_present_flag = 1; op_params->decoder_buffer_delay = 90000 >> 1; // 0.5 s op_params->encoder_buffer_delay = 90000 >> 1; // 0.5 s @@ -66,7 +79,7 @@ void set_dec_model_op_parameters(aom_dec_model_op_parameters_t *op_params) { op_params->initial_display_delay = 8; // 8 frames delay } -void set_resource_availability_parameters( +void av1_set_resource_availability_parameters( aom_dec_model_op_parameters_t *op_params) { op_params->decoder_model_param_present_flag = 0; op_params->decoder_buffer_delay = diff --git a/media/libaom/src/av1/common/timing.h b/media/libaom/src/av1/common/timing.h index 06939ae43..9192124f7 100644 --- a/media/libaom/src/av1/common/timing.h +++ b/media/libaom/src/av1/common/timing.h @@ -42,18 +42,14 @@ typedef struct aom_dec_model_op_parameters { int initial_display_delay; } aom_dec_model_op_parameters_t; -typedef struct aom_op_timing_info_t { - uint32_t buffer_removal_time; -} aom_op_timing_info_t; +void av1_set_aom_dec_model_info(aom_dec_model_info_t *decoder_model); -void set_aom_dec_model_info(aom_dec_model_info_t *decoder_model); +void av1_set_dec_model_op_parameters(aom_dec_model_op_parameters_t *op_params); -void set_dec_model_op_parameters(aom_dec_model_op_parameters_t *op_params); - -void set_resource_availability_parameters( +void av1_set_resource_availability_parameters( aom_dec_model_op_parameters_t *op_params); -int64_t max_level_bitrate(BITSTREAM_PROFILE seq_profile, int seq_level_idx, - int seq_tier); +int64_t av1_max_level_bitrate(BITSTREAM_PROFILE seq_profile, int seq_level_idx, + int seq_tier); #endif // 
AOM_AV1_COMMON_TIMING_H_ diff --git a/media/libaom/src/av1/common/token_cdfs.h b/media/libaom/src/av1/common/token_cdfs.h index 53e956450..f1edda58d 100644 --- a/media/libaom/src/av1/common/token_cdfs.h +++ b/media/libaom/src/av1/common/token_cdfs.h @@ -1707,1687 +1707,1687 @@ static const aom_cdf_prob av1_default_coeff_lps_multi_cdfs static const aom_cdf_prob av1_default_coeff_base_multi_cdfs [TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES][SIG_COEF_CONTEXTS] - [CDF_SIZE(NUM_BASE_LEVELS + 2)] = - { { { { { AOM_CDF4(4034, 8930, 12727) }, - { AOM_CDF4(18082, 29741, 31877) }, - { AOM_CDF4(12596, 26124, 30493) }, - { AOM_CDF4(9446, 21118, 27005) }, - { AOM_CDF4(6308, 15141, 21279) }, - { AOM_CDF4(2463, 6357, 9783) }, - { AOM_CDF4(20667, 30546, 31929) }, - { AOM_CDF4(13043, 26123, 30134) }, - { AOM_CDF4(8151, 18757, 24778) }, - { AOM_CDF4(5255, 12839, 18632) }, - { AOM_CDF4(2820, 7206, 11161) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(15736, 27553, 30604) }, - { AOM_CDF4(11210, 23794, 28787) }, - { AOM_CDF4(5947, 13874, 19701) }, - { AOM_CDF4(4215, 9323, 13891) }, - { AOM_CDF4(2833, 6462, 10059) }, - { AOM_CDF4(19605, 30393, 31582) }, - { AOM_CDF4(13523, 26252, 30248) }, - { AOM_CDF4(8446, 18622, 24512) }, - { AOM_CDF4(3818, 10343, 15974) }, - { AOM_CDF4(1481, 4117, 6796) }, - { AOM_CDF4(22649, 31302, 32190) }, - { AOM_CDF4(14829, 27127, 30449) }, - { AOM_CDF4(8313, 17702, 23304) }, - { AOM_CDF4(3022, 8301, 12786) }, - { AOM_CDF4(1536, 4412, 7184) }, - { AOM_CDF4(22354, 29774, 31372) }, - { AOM_CDF4(14723, 25472, 29214) }, - { AOM_CDF4(6673, 13745, 18662) }, - { AOM_CDF4(2068, 5766, 9322) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 
16384, 24576) } }, - { { AOM_CDF4(6302, 16444, 21761) }, - { AOM_CDF4(23040, 31538, 32475) }, - { AOM_CDF4(15196, 28452, 31496) }, - { AOM_CDF4(10020, 22946, 28514) }, - { AOM_CDF4(6533, 16862, 23501) }, - { AOM_CDF4(3538, 9816, 15076) }, - { AOM_CDF4(24444, 31875, 32525) }, - { AOM_CDF4(15881, 28924, 31635) }, - { AOM_CDF4(9922, 22873, 28466) }, - { AOM_CDF4(6527, 16966, 23691) }, - { AOM_CDF4(4114, 11303, 17220) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(20201, 30770, 32209) }, - { AOM_CDF4(14754, 28071, 31258) }, - { AOM_CDF4(8378, 20186, 26517) }, - { AOM_CDF4(5916, 15299, 21978) }, - { AOM_CDF4(4268, 11583, 17901) }, - { AOM_CDF4(24361, 32025, 32581) }, - { AOM_CDF4(18673, 30105, 31943) }, - { AOM_CDF4(10196, 22244, 27576) }, - { AOM_CDF4(5495, 14349, 20417) }, - { AOM_CDF4(2676, 7415, 11498) }, - { AOM_CDF4(24678, 31958, 32585) }, - { AOM_CDF4(18629, 29906, 31831) }, - { AOM_CDF4(9364, 20724, 26315) }, - { AOM_CDF4(4641, 12318, 18094) }, - { AOM_CDF4(2758, 7387, 11579) }, - { AOM_CDF4(25433, 31842, 32469) }, - { AOM_CDF4(18795, 29289, 31411) }, - { AOM_CDF4(7644, 17584, 23592) }, - { AOM_CDF4(3408, 9014, 15047) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) } } }, - { { { AOM_CDF4(4536, 10072, 14001) }, - { AOM_CDF4(25459, 31416, 32206) }, - { AOM_CDF4(16605, 28048, 30818) }, - { AOM_CDF4(11008, 22857, 27719) }, - { AOM_CDF4(6915, 16268, 22315) }, - { AOM_CDF4(2625, 6812, 10537) }, - { AOM_CDF4(24257, 31788, 32499) }, - { AOM_CDF4(16880, 29454, 31879) }, - { AOM_CDF4(11958, 25054, 29778) }, - { AOM_CDF4(7916, 18718, 25084) }, - { AOM_CDF4(3383, 8777, 13446) }, - { AOM_CDF4(22720, 31603, 32393) }, - { 
AOM_CDF4(14960, 28125, 31335) }, - { AOM_CDF4(9731, 22210, 27928) }, - { AOM_CDF4(6304, 15832, 22277) }, - { AOM_CDF4(2910, 7818, 12166) }, - { AOM_CDF4(20375, 30627, 32131) }, - { AOM_CDF4(13904, 27284, 30887) }, - { AOM_CDF4(9368, 21558, 27144) }, - { AOM_CDF4(5937, 14966, 21119) }, - { AOM_CDF4(2667, 7225, 11319) }, - { AOM_CDF4(23970, 31470, 32378) }, - { AOM_CDF4(17173, 29734, 32018) }, - { AOM_CDF4(12795, 25441, 29965) }, - { AOM_CDF4(8981, 19680, 25893) }, - { AOM_CDF4(4728, 11372, 16902) }, - { AOM_CDF4(24287, 31797, 32439) }, - { AOM_CDF4(16703, 29145, 31696) }, - { AOM_CDF4(10833, 23554, 28725) }, - { AOM_CDF4(6468, 16566, 23057) }, - { AOM_CDF4(2415, 6562, 10278) }, - { AOM_CDF4(26610, 32395, 32659) }, - { AOM_CDF4(18590, 30498, 32117) }, - { AOM_CDF4(12420, 25756, 29950) }, - { AOM_CDF4(7639, 18746, 24710) }, - { AOM_CDF4(3001, 8086, 12347) }, - { AOM_CDF4(25076, 32064, 32580) }, - { AOM_CDF4(17946, 30128, 32028) }, - { AOM_CDF4(12024, 24985, 29378) }, - { AOM_CDF4(7517, 18390, 24304) }, - { AOM_CDF4(3243, 8781, 13331) }, - { AOM_CDF4(8192, 16384, 24576) } }, - { { AOM_CDF4(6037, 16771, 21957) }, - { AOM_CDF4(24774, 31704, 32426) }, - { AOM_CDF4(16830, 28589, 31056) }, - { AOM_CDF4(10602, 22828, 27760) }, - { AOM_CDF4(6733, 16829, 23071) }, - { AOM_CDF4(3250, 8914, 13556) }, - { AOM_CDF4(25582, 32220, 32668) }, - { AOM_CDF4(18659, 30342, 32223) }, - { AOM_CDF4(12546, 26149, 30515) }, - { AOM_CDF4(8420, 20451, 26801) }, - { AOM_CDF4(4636, 12420, 18344) }, - { AOM_CDF4(27581, 32362, 32639) }, - { AOM_CDF4(18987, 30083, 31978) }, - { AOM_CDF4(11327, 24248, 29084) }, - { AOM_CDF4(7264, 17719, 24120) }, - { AOM_CDF4(3995, 10768, 16169) }, - { AOM_CDF4(25893, 31831, 32487) }, - { AOM_CDF4(16577, 28587, 31379) }, - { AOM_CDF4(10189, 22748, 28182) }, - { AOM_CDF4(6832, 17094, 23556) }, - { AOM_CDF4(3708, 10110, 15334) }, - { AOM_CDF4(25904, 32282, 32656) }, - { AOM_CDF4(19721, 30792, 32276) }, - { AOM_CDF4(12819, 26243, 30411) }, - { AOM_CDF4(8572, 20614, 
26891) }, - { AOM_CDF4(5364, 14059, 20467) }, - { AOM_CDF4(26580, 32438, 32677) }, - { AOM_CDF4(20852, 31225, 32340) }, - { AOM_CDF4(12435, 25700, 29967) }, - { AOM_CDF4(8691, 20825, 26976) }, - { AOM_CDF4(4446, 12209, 17269) }, - { AOM_CDF4(27350, 32429, 32696) }, - { AOM_CDF4(21372, 30977, 32272) }, - { AOM_CDF4(12673, 25270, 29853) }, - { AOM_CDF4(9208, 20925, 26640) }, - { AOM_CDF4(5018, 13351, 18732) }, - { AOM_CDF4(27351, 32479, 32713) }, - { AOM_CDF4(21398, 31209, 32387) }, - { AOM_CDF4(12162, 25047, 29842) }, - { AOM_CDF4(7896, 18691, 25319) }, - { AOM_CDF4(4670, 12882, 18881) }, - { AOM_CDF4(8192, 16384, 24576) } } }, - { { { AOM_CDF4(5487, 10460, 13708) }, - { AOM_CDF4(21597, 28303, 30674) }, - { AOM_CDF4(11037, 21953, 26476) }, - { AOM_CDF4(8147, 17962, 22952) }, - { AOM_CDF4(5242, 13061, 18532) }, - { AOM_CDF4(1889, 5208, 8182) }, - { AOM_CDF4(26774, 32133, 32590) }, - { AOM_CDF4(17844, 29564, 31767) }, - { AOM_CDF4(11690, 24438, 29171) }, - { AOM_CDF4(7542, 18215, 24459) }, - { AOM_CDF4(2993, 8050, 12319) }, - { AOM_CDF4(28023, 32328, 32591) }, - { AOM_CDF4(18651, 30126, 31954) }, - { AOM_CDF4(12164, 25146, 29589) }, - { AOM_CDF4(7762, 18530, 24771) }, - { AOM_CDF4(3492, 9183, 13920) }, - { AOM_CDF4(27591, 32008, 32491) }, - { AOM_CDF4(17149, 28853, 31510) }, - { AOM_CDF4(11485, 24003, 28860) }, - { AOM_CDF4(7697, 18086, 24210) }, - { AOM_CDF4(3075, 7999, 12218) }, - { AOM_CDF4(28268, 32482, 32654) }, - { AOM_CDF4(19631, 31051, 32404) }, - { AOM_CDF4(13860, 27260, 31020) }, - { AOM_CDF4(9605, 21613, 27594) }, - { AOM_CDF4(4876, 12162, 17908) }, - { AOM_CDF4(27248, 32316, 32576) }, - { AOM_CDF4(18955, 30457, 32075) }, - { AOM_CDF4(11824, 23997, 28795) }, - { AOM_CDF4(7346, 18196, 24647) }, - { AOM_CDF4(3403, 9247, 14111) }, - { AOM_CDF4(29711, 32655, 32735) }, - { AOM_CDF4(21169, 31394, 32417) }, - { AOM_CDF4(13487, 27198, 30957) }, - { AOM_CDF4(8828, 21683, 27614) }, - { AOM_CDF4(4270, 11451, 17038) }, - { AOM_CDF4(28708, 32578, 32731) }, - { 
AOM_CDF4(20120, 31241, 32482) }, - { AOM_CDF4(13692, 27550, 31321) }, - { AOM_CDF4(9418, 22514, 28439) }, - { AOM_CDF4(4999, 13283, 19462) }, - { AOM_CDF4(8192, 16384, 24576) } }, - { { AOM_CDF4(5673, 14302, 19711) }, - { AOM_CDF4(26251, 30701, 31834) }, - { AOM_CDF4(12782, 23783, 27803) }, - { AOM_CDF4(9127, 20657, 25808) }, - { AOM_CDF4(6368, 16208, 21462) }, - { AOM_CDF4(2465, 7177, 10822) }, - { AOM_CDF4(29961, 32563, 32719) }, - { AOM_CDF4(18318, 29891, 31949) }, - { AOM_CDF4(11361, 24514, 29357) }, - { AOM_CDF4(7900, 19603, 25607) }, - { AOM_CDF4(4002, 10590, 15546) }, - { AOM_CDF4(29637, 32310, 32595) }, - { AOM_CDF4(18296, 29913, 31809) }, - { AOM_CDF4(10144, 21515, 26871) }, - { AOM_CDF4(5358, 14322, 20394) }, - { AOM_CDF4(3067, 8362, 13346) }, - { AOM_CDF4(28652, 32470, 32676) }, - { AOM_CDF4(17538, 30771, 32209) }, - { AOM_CDF4(13924, 26882, 30494) }, - { AOM_CDF4(10496, 22837, 27869) }, - { AOM_CDF4(7236, 16396, 21621) }, - { AOM_CDF4(30743, 32687, 32746) }, - { AOM_CDF4(23006, 31676, 32489) }, - { AOM_CDF4(14494, 27828, 31120) }, - { AOM_CDF4(10174, 22801, 28352) }, - { AOM_CDF4(6242, 15281, 21043) }, - { AOM_CDF4(25817, 32243, 32720) }, - { AOM_CDF4(18618, 31367, 32325) }, - { AOM_CDF4(13997, 28318, 31878) }, - { AOM_CDF4(12255, 26534, 31383) }, - { AOM_CDF4(9561, 21588, 28450) }, - { AOM_CDF4(28188, 32635, 32724) }, - { AOM_CDF4(22060, 32365, 32728) }, - { AOM_CDF4(18102, 30690, 32528) }, - { AOM_CDF4(14196, 28864, 31999) }, - { AOM_CDF4(12262, 25792, 30865) }, - { AOM_CDF4(24176, 32109, 32628) }, - { AOM_CDF4(18280, 29681, 31963) }, - { AOM_CDF4(10205, 23703, 29664) }, - { AOM_CDF4(7889, 20025, 27676) }, - { AOM_CDF4(6060, 16743, 23970) }, - { AOM_CDF4(8192, 16384, 24576) } } }, - { { { AOM_CDF4(5141, 7096, 8260) }, - { AOM_CDF4(27186, 29022, 29789) }, - { AOM_CDF4(6668, 12568, 15682) }, - { AOM_CDF4(2172, 6181, 8638) }, - { AOM_CDF4(1126, 3379, 4531) }, - { AOM_CDF4(443, 1361, 2254) }, - { AOM_CDF4(26083, 31153, 32436) }, - { AOM_CDF4(13486, 24603, 
28483) }, - { AOM_CDF4(6508, 14840, 19910) }, - { AOM_CDF4(3386, 8800, 13286) }, - { AOM_CDF4(1530, 4322, 7054) }, - { AOM_CDF4(29639, 32080, 32548) }, - { AOM_CDF4(15897, 27552, 30290) }, - { AOM_CDF4(8588, 20047, 25383) }, - { AOM_CDF4(4889, 13339, 19269) }, - { AOM_CDF4(2240, 6871, 10498) }, - { AOM_CDF4(28165, 32197, 32517) }, - { AOM_CDF4(20735, 30427, 31568) }, - { AOM_CDF4(14325, 24671, 27692) }, - { AOM_CDF4(5119, 12554, 17805) }, - { AOM_CDF4(1810, 5441, 8261) }, - { AOM_CDF4(31212, 32724, 32748) }, - { AOM_CDF4(23352, 31766, 32545) }, - { AOM_CDF4(14669, 27570, 31059) }, - { AOM_CDF4(8492, 20894, 27272) }, - { AOM_CDF4(3644, 10194, 15204) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) } }, - { { AOM_CDF4(2461, 7013, 9371) }, - { AOM_CDF4(24749, 29600, 30986) }, - { AOM_CDF4(9466, 19037, 22417) }, - { AOM_CDF4(3584, 9280, 14400) }, - { AOM_CDF4(1505, 3929, 5433) }, - { AOM_CDF4(677, 1500, 2736) }, - { AOM_CDF4(23987, 30702, 32117) }, - { AOM_CDF4(13554, 24571, 29263) }, - { AOM_CDF4(6211, 14556, 21155) }, - { AOM_CDF4(3135, 10972, 15625) }, - { AOM_CDF4(2435, 7127, 11427) }, - { AOM_CDF4(31300, 32532, 32550) }, - { AOM_CDF4(14757, 30365, 31954) }, - { AOM_CDF4(4405, 11612, 18553) }, - { AOM_CDF4(580, 4132, 7322) }, - { AOM_CDF4(1695, 10169, 14124) }, - { AOM_CDF4(30008, 32282, 32591) }, - { AOM_CDF4(19244, 30108, 31748) }, - { AOM_CDF4(11180, 24158, 29555) }, - { AOM_CDF4(5650, 14972, 19209) }, - { AOM_CDF4(2114, 5109, 8456) }, - { 
AOM_CDF4(31856, 32716, 32748) }, - { AOM_CDF4(23012, 31664, 32572) }, - { AOM_CDF4(13694, 26656, 30636) }, - { AOM_CDF4(8142, 19508, 26093) }, - { AOM_CDF4(4253, 10955, 16724) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) } } }, - { { { AOM_CDF4(601, 983, 1311) }, - { AOM_CDF4(18725, 23406, 28087) }, - { AOM_CDF4(5461, 8192, 10923) }, - { AOM_CDF4(3781, 15124, 21425) }, - { AOM_CDF4(2587, 7761, 12072) }, - { AOM_CDF4(106, 458, 810) }, - { AOM_CDF4(22282, 29710, 31894) }, - { AOM_CDF4(8508, 20926, 25984) }, - { AOM_CDF4(3726, 12713, 18083) }, - { AOM_CDF4(1620, 7112, 10893) }, - { AOM_CDF4(729, 2236, 3495) }, - { AOM_CDF4(30163, 32474, 32684) }, - { AOM_CDF4(18304, 30464, 32000) }, - { AOM_CDF4(11443, 26526, 29647) }, - { AOM_CDF4(6007, 15292, 21299) }, - { AOM_CDF4(2234, 6703, 8937) }, - { AOM_CDF4(30954, 32177, 32571) }, - { AOM_CDF4(17363, 29562, 31076) }, - { AOM_CDF4(9686, 22464, 27410) }, - { AOM_CDF4(8192, 16384, 21390) }, - { AOM_CDF4(1755, 8046, 11264) }, - { AOM_CDF4(31168, 32734, 32748) }, - { AOM_CDF4(22486, 31441, 32471) }, - { AOM_CDF4(12833, 25627, 29738) }, - { AOM_CDF4(6980, 17379, 23122) }, - { AOM_CDF4(3111, 8887, 13479) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 
16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) } }, - { { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) } } } }, - { { { { AOM_CDF4(6041, 11854, 15927) }, - { AOM_CDF4(20326, 30905, 32251) }, - { AOM_CDF4(14164, 26831, 30725) }, - { AOM_CDF4(9760, 20647, 26585) }, - { AOM_CDF4(6416, 14953, 21219) }, - { AOM_CDF4(2966, 
7151, 10891) }, - { AOM_CDF4(23567, 31374, 32254) }, - { AOM_CDF4(14978, 27416, 30946) }, - { AOM_CDF4(9434, 20225, 26254) }, - { AOM_CDF4(6658, 14558, 20535) }, - { AOM_CDF4(3916, 8677, 12989) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(18088, 29545, 31587) }, - { AOM_CDF4(13062, 25843, 30073) }, - { AOM_CDF4(8940, 16827, 22251) }, - { AOM_CDF4(7654, 13220, 17973) }, - { AOM_CDF4(5733, 10316, 14456) }, - { AOM_CDF4(22879, 31388, 32114) }, - { AOM_CDF4(15215, 27993, 30955) }, - { AOM_CDF4(9397, 19445, 24978) }, - { AOM_CDF4(3442, 9813, 15344) }, - { AOM_CDF4(1368, 3936, 6532) }, - { AOM_CDF4(25494, 32033, 32406) }, - { AOM_CDF4(16772, 27963, 30718) }, - { AOM_CDF4(9419, 18165, 23260) }, - { AOM_CDF4(2677, 7501, 11797) }, - { AOM_CDF4(1516, 4344, 7170) }, - { AOM_CDF4(26556, 31454, 32101) }, - { AOM_CDF4(17128, 27035, 30108) }, - { AOM_CDF4(8324, 15344, 20249) }, - { AOM_CDF4(1903, 5696, 9469) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) } }, - { { AOM_CDF4(8455, 19003, 24368) }, - { AOM_CDF4(23563, 32021, 32604) }, - { AOM_CDF4(16237, 29446, 31935) }, - { AOM_CDF4(10724, 23999, 29358) }, - { AOM_CDF4(6725, 17528, 24416) }, - { AOM_CDF4(3927, 10927, 16825) }, - { AOM_CDF4(26313, 32288, 32634) }, - { AOM_CDF4(17430, 30095, 32095) }, - { AOM_CDF4(11116, 24606, 29679) }, - { AOM_CDF4(7195, 18384, 25269) }, - { AOM_CDF4(4726, 12852, 19315) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 
24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(22822, 31648, 32483) }, - { AOM_CDF4(16724, 29633, 31929) }, - { AOM_CDF4(10261, 23033, 28725) }, - { AOM_CDF4(7029, 17840, 24528) }, - { AOM_CDF4(4867, 13886, 21502) }, - { AOM_CDF4(25298, 31892, 32491) }, - { AOM_CDF4(17809, 29330, 31512) }, - { AOM_CDF4(9668, 21329, 26579) }, - { AOM_CDF4(4774, 12956, 18976) }, - { AOM_CDF4(2322, 7030, 11540) }, - { AOM_CDF4(25472, 31920, 32543) }, - { AOM_CDF4(17957, 29387, 31632) }, - { AOM_CDF4(9196, 20593, 26400) }, - { AOM_CDF4(4680, 12705, 19202) }, - { AOM_CDF4(2917, 8456, 13436) }, - { AOM_CDF4(26471, 32059, 32574) }, - { AOM_CDF4(18458, 29783, 31909) }, - { AOM_CDF4(8400, 19464, 25956) }, - { AOM_CDF4(3812, 10973, 17206) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) } } }, - { { { AOM_CDF4(6779, 13743, 17678) }, - { AOM_CDF4(24806, 31797, 32457) }, - { AOM_CDF4(17616, 29047, 31372) }, - { AOM_CDF4(11063, 23175, 28003) }, - { AOM_CDF4(6521, 16110, 22324) }, - { AOM_CDF4(2764, 7504, 11654) }, - { AOM_CDF4(25266, 32367, 32637) }, - { AOM_CDF4(19054, 30553, 32175) }, - { AOM_CDF4(12139, 25212, 29807) }, - { AOM_CDF4(7311, 18162, 24704) }, - { AOM_CDF4(3397, 9164, 14074) }, - { AOM_CDF4(25988, 32208, 32522) }, - { AOM_CDF4(16253, 28912, 31526) }, - { AOM_CDF4(9151, 21387, 27372) }, - { AOM_CDF4(5688, 14915, 21496) }, - { AOM_CDF4(2717, 7627, 12004) }, - { AOM_CDF4(23144, 31855, 32443) }, - { AOM_CDF4(16070, 28491, 31325) }, - { AOM_CDF4(8702, 20467, 26517) }, - { AOM_CDF4(5243, 13956, 20367) }, - { AOM_CDF4(2621, 7335, 11567) }, - { AOM_CDF4(26636, 32340, 32630) }, - { AOM_CDF4(19990, 31050, 32341) }, - { AOM_CDF4(13243, 26105, 30315) }, - { AOM_CDF4(8588, 19521, 25918) }, - { AOM_CDF4(4717, 11585, 17304) }, - { AOM_CDF4(25844, 32292, 32582) }, - { AOM_CDF4(19090, 30635, 32097) }, - { AOM_CDF4(11963, 24546, 28939) }, - { AOM_CDF4(6218, 16087, 22354) }, - { AOM_CDF4(2340, 6608, 10426) }, - { AOM_CDF4(28046, 
32576, 32694) }, - { AOM_CDF4(21178, 31313, 32296) }, - { AOM_CDF4(13486, 26184, 29870) }, - { AOM_CDF4(7149, 17871, 23723) }, - { AOM_CDF4(2833, 7958, 12259) }, - { AOM_CDF4(27710, 32528, 32686) }, - { AOM_CDF4(20674, 31076, 32268) }, - { AOM_CDF4(12413, 24955, 29243) }, - { AOM_CDF4(6676, 16927, 23097) }, - { AOM_CDF4(2966, 8333, 12919) }, - { AOM_CDF4(8192, 16384, 24576) } }, - { { AOM_CDF4(8639, 19339, 24429) }, - { AOM_CDF4(24404, 31837, 32525) }, - { AOM_CDF4(16997, 29425, 31784) }, - { AOM_CDF4(11253, 24234, 29149) }, - { AOM_CDF4(6751, 17394, 24028) }, - { AOM_CDF4(3490, 9830, 15191) }, - { AOM_CDF4(26283, 32471, 32714) }, - { AOM_CDF4(19599, 31168, 32442) }, - { AOM_CDF4(13146, 26954, 30893) }, - { AOM_CDF4(8214, 20588, 26890) }, - { AOM_CDF4(4699, 13081, 19300) }, - { AOM_CDF4(28212, 32458, 32669) }, - { AOM_CDF4(18594, 30316, 32100) }, - { AOM_CDF4(11219, 24408, 29234) }, - { AOM_CDF4(6865, 17656, 24149) }, - { AOM_CDF4(3678, 10362, 16006) }, - { AOM_CDF4(25825, 32136, 32616) }, - { AOM_CDF4(17313, 29853, 32021) }, - { AOM_CDF4(11197, 24471, 29472) }, - { AOM_CDF4(6947, 17781, 24405) }, - { AOM_CDF4(3768, 10660, 16261) }, - { AOM_CDF4(27352, 32500, 32706) }, - { AOM_CDF4(20850, 31468, 32469) }, - { AOM_CDF4(14021, 27707, 31133) }, - { AOM_CDF4(8964, 21748, 27838) }, - { AOM_CDF4(5437, 14665, 21187) }, - { AOM_CDF4(26304, 32492, 32698) }, - { AOM_CDF4(20409, 31380, 32385) }, - { AOM_CDF4(13682, 27222, 30632) }, - { AOM_CDF4(8974, 21236, 26685) }, - { AOM_CDF4(4234, 11665, 16934) }, - { AOM_CDF4(26273, 32357, 32711) }, - { AOM_CDF4(20672, 31242, 32441) }, - { AOM_CDF4(14172, 27254, 30902) }, - { AOM_CDF4(9870, 21898, 27275) }, - { AOM_CDF4(5164, 13506, 19270) }, - { AOM_CDF4(26725, 32459, 32728) }, - { AOM_CDF4(20991, 31442, 32527) }, - { AOM_CDF4(13071, 26434, 30811) }, - { AOM_CDF4(8184, 20090, 26742) }, - { AOM_CDF4(4803, 13255, 19895) }, - { AOM_CDF4(8192, 16384, 24576) } } }, - { { { AOM_CDF4(7555, 14942, 18501) }, - { AOM_CDF4(24410, 31178, 32287) }, 
- { AOM_CDF4(14394, 26738, 30253) }, - { AOM_CDF4(8413, 19554, 25195) }, - { AOM_CDF4(4766, 12924, 18785) }, - { AOM_CDF4(2029, 5806, 9207) }, - { AOM_CDF4(26776, 32364, 32663) }, - { AOM_CDF4(18732, 29967, 31931) }, - { AOM_CDF4(11005, 23786, 28852) }, - { AOM_CDF4(6466, 16909, 23510) }, - { AOM_CDF4(3044, 8638, 13419) }, - { AOM_CDF4(29208, 32582, 32704) }, - { AOM_CDF4(20068, 30857, 32208) }, - { AOM_CDF4(12003, 25085, 29595) }, - { AOM_CDF4(6947, 17750, 24189) }, - { AOM_CDF4(3245, 9103, 14007) }, - { AOM_CDF4(27359, 32465, 32669) }, - { AOM_CDF4(19421, 30614, 32174) }, - { AOM_CDF4(11915, 25010, 29579) }, - { AOM_CDF4(6950, 17676, 24074) }, - { AOM_CDF4(3007, 8473, 13096) }, - { AOM_CDF4(29002, 32676, 32735) }, - { AOM_CDF4(22102, 31849, 32576) }, - { AOM_CDF4(14408, 28009, 31405) }, - { AOM_CDF4(9027, 21679, 27931) }, - { AOM_CDF4(4694, 12678, 18748) }, - { AOM_CDF4(28216, 32528, 32682) }, - { AOM_CDF4(20849, 31264, 32318) }, - { AOM_CDF4(12756, 25815, 29751) }, - { AOM_CDF4(7565, 18801, 24923) }, - { AOM_CDF4(3509, 9533, 14477) }, - { AOM_CDF4(30133, 32687, 32739) }, - { AOM_CDF4(23063, 31910, 32515) }, - { AOM_CDF4(14588, 28051, 31132) }, - { AOM_CDF4(9085, 21649, 27457) }, - { AOM_CDF4(4261, 11654, 17264) }, - { AOM_CDF4(29518, 32691, 32748) }, - { AOM_CDF4(22451, 31959, 32613) }, - { AOM_CDF4(14864, 28722, 31700) }, - { AOM_CDF4(9695, 22964, 28716) }, - { AOM_CDF4(4932, 13358, 19502) }, - { AOM_CDF4(8192, 16384, 24576) } }, - { { AOM_CDF4(6465, 16958, 21688) }, - { AOM_CDF4(25199, 31514, 32360) }, - { AOM_CDF4(14774, 27149, 30607) }, - { AOM_CDF4(9257, 21438, 26972) }, - { AOM_CDF4(5723, 15183, 21882) }, - { AOM_CDF4(3150, 8879, 13731) }, - { AOM_CDF4(26989, 32262, 32682) }, - { AOM_CDF4(17396, 29937, 32085) }, - { AOM_CDF4(11387, 24901, 29784) }, - { AOM_CDF4(7289, 18821, 25548) }, - { AOM_CDF4(3734, 10577, 16086) }, - { AOM_CDF4(29728, 32501, 32695) }, - { AOM_CDF4(17431, 29701, 31903) }, - { AOM_CDF4(9921, 22826, 28300) }, - { AOM_CDF4(5896, 15434, 
22068) }, - { AOM_CDF4(3430, 9646, 14757) }, - { AOM_CDF4(28614, 32511, 32705) }, - { AOM_CDF4(19364, 30638, 32263) }, - { AOM_CDF4(13129, 26254, 30402) }, - { AOM_CDF4(8754, 20484, 26440) }, - { AOM_CDF4(4378, 11607, 17110) }, - { AOM_CDF4(30292, 32671, 32744) }, - { AOM_CDF4(21780, 31603, 32501) }, - { AOM_CDF4(14314, 27829, 31291) }, - { AOM_CDF4(9611, 22327, 28263) }, - { AOM_CDF4(4890, 13087, 19065) }, - { AOM_CDF4(25862, 32567, 32733) }, - { AOM_CDF4(20794, 32050, 32567) }, - { AOM_CDF4(17243, 30625, 32254) }, - { AOM_CDF4(13283, 27628, 31474) }, - { AOM_CDF4(9669, 22532, 28918) }, - { AOM_CDF4(27435, 32697, 32748) }, - { AOM_CDF4(24922, 32390, 32714) }, - { AOM_CDF4(21449, 31504, 32536) }, - { AOM_CDF4(16392, 29729, 31832) }, - { AOM_CDF4(11692, 24884, 29076) }, - { AOM_CDF4(24193, 32290, 32735) }, - { AOM_CDF4(18909, 31104, 32563) }, - { AOM_CDF4(12236, 26841, 31403) }, - { AOM_CDF4(8171, 21840, 29082) }, - { AOM_CDF4(7224, 17280, 25275) }, - { AOM_CDF4(8192, 16384, 24576) } } }, - { { { AOM_CDF4(3078, 6839, 9890) }, - { AOM_CDF4(13837, 20450, 24479) }, - { AOM_CDF4(5914, 14222, 19328) }, - { AOM_CDF4(3866, 10267, 14762) }, - { AOM_CDF4(2612, 7208, 11042) }, - { AOM_CDF4(1067, 2991, 4776) }, - { AOM_CDF4(25817, 31646, 32529) }, - { AOM_CDF4(13708, 26338, 30385) }, - { AOM_CDF4(7328, 18585, 24870) }, - { AOM_CDF4(4691, 13080, 19276) }, - { AOM_CDF4(1825, 5253, 8352) }, - { AOM_CDF4(29386, 32315, 32624) }, - { AOM_CDF4(17160, 29001, 31360) }, - { AOM_CDF4(9602, 21862, 27396) }, - { AOM_CDF4(5915, 15772, 22148) }, - { AOM_CDF4(2786, 7779, 12047) }, - { AOM_CDF4(29246, 32450, 32663) }, - { AOM_CDF4(18696, 29929, 31818) }, - { AOM_CDF4(10510, 23369, 28560) }, - { AOM_CDF4(6229, 16499, 23125) }, - { AOM_CDF4(2608, 7448, 11705) }, - { AOM_CDF4(30753, 32710, 32748) }, - { AOM_CDF4(21638, 31487, 32503) }, - { AOM_CDF4(12937, 26854, 30870) }, - { AOM_CDF4(8182, 20596, 26970) }, - { AOM_CDF4(3637, 10269, 15497) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 
16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) } }, - { { AOM_CDF4(5244, 12150, 16906) }, - { AOM_CDF4(20486, 26858, 29701) }, - { AOM_CDF4(7756, 18317, 23735) }, - { AOM_CDF4(3452, 9256, 13146) }, - { AOM_CDF4(2020, 5206, 8229) }, - { AOM_CDF4(1801, 4993, 7903) }, - { AOM_CDF4(27051, 31858, 32531) }, - { AOM_CDF4(15988, 27531, 30619) }, - { AOM_CDF4(9188, 21484, 26719) }, - { AOM_CDF4(6273, 17186, 23800) }, - { AOM_CDF4(3108, 9355, 14764) }, - { AOM_CDF4(31076, 32520, 32680) }, - { AOM_CDF4(18119, 30037, 31850) }, - { AOM_CDF4(10244, 22969, 27472) }, - { AOM_CDF4(4692, 14077, 19273) }, - { AOM_CDF4(3694, 11677, 17556) }, - { AOM_CDF4(30060, 32581, 32720) }, - { AOM_CDF4(21011, 30775, 32120) }, - { AOM_CDF4(11931, 24820, 29289) }, - { AOM_CDF4(7119, 17662, 24356) }, - { AOM_CDF4(3833, 10706, 16304) }, - { AOM_CDF4(31954, 32731, 32748) }, - { AOM_CDF4(23913, 31724, 32489) }, - { AOM_CDF4(15520, 28060, 31286) }, - { AOM_CDF4(11517, 23008, 28571) }, - { AOM_CDF4(6193, 14508, 20629) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 
24576) }, - { AOM_CDF4(8192, 16384, 24576) } } }, - { { { AOM_CDF4(1035, 2807, 4156) }, - { AOM_CDF4(13162, 18138, 20939) }, - { AOM_CDF4(2696, 6633, 8755) }, - { AOM_CDF4(1373, 4161, 6853) }, - { AOM_CDF4(1099, 2746, 4716) }, - { AOM_CDF4(340, 1021, 1599) }, - { AOM_CDF4(22826, 30419, 32135) }, - { AOM_CDF4(10395, 21762, 26942) }, - { AOM_CDF4(4726, 12407, 17361) }, - { AOM_CDF4(2447, 7080, 10593) }, - { AOM_CDF4(1227, 3717, 6011) }, - { AOM_CDF4(28156, 31424, 31934) }, - { AOM_CDF4(16915, 27754, 30373) }, - { AOM_CDF4(9148, 20990, 26431) }, - { AOM_CDF4(5950, 15515, 21148) }, - { AOM_CDF4(2492, 7327, 11526) }, - { AOM_CDF4(30602, 32477, 32670) }, - { AOM_CDF4(20026, 29955, 31568) }, - { AOM_CDF4(11220, 23628, 28105) }, - { AOM_CDF4(6652, 17019, 22973) }, - { AOM_CDF4(3064, 8536, 13043) }, - { AOM_CDF4(31769, 32724, 32748) }, - { AOM_CDF4(22230, 30887, 32373) }, - { AOM_CDF4(12234, 25079, 29731) }, - { AOM_CDF4(7326, 18816, 25353) }, - { AOM_CDF4(3933, 10907, 16616) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) } }, - { { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - 
{ AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) } } } }, - { { { { AOM_CDF4(8896, 16227, 20630) }, - { AOM_CDF4(23629, 31782, 32527) }, - { AOM_CDF4(15173, 27755, 31321) }, - { AOM_CDF4(10158, 21233, 27382) }, - { AOM_CDF4(6420, 14857, 21558) }, - { AOM_CDF4(3269, 8155, 12646) }, - { AOM_CDF4(24835, 32009, 32496) }, - { AOM_CDF4(16509, 28421, 31579) }, - { AOM_CDF4(10957, 21514, 27418) }, - { AOM_CDF4(7881, 15930, 22096) }, - { AOM_CDF4(5388, 10960, 15918) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(20745, 30773, 32093) }, - { AOM_CDF4(15200, 27221, 30861) }, - { AOM_CDF4(13032, 20873, 25667) }, - { AOM_CDF4(12285, 18663, 23494) }, 
- { AOM_CDF4(11563, 17481, 21489) }, - { AOM_CDF4(26260, 31982, 32320) }, - { AOM_CDF4(15397, 28083, 31100) }, - { AOM_CDF4(9742, 19217, 24824) }, - { AOM_CDF4(3261, 9629, 15362) }, - { AOM_CDF4(1480, 4322, 7499) }, - { AOM_CDF4(27599, 32256, 32460) }, - { AOM_CDF4(16857, 27659, 30774) }, - { AOM_CDF4(9551, 18290, 23748) }, - { AOM_CDF4(3052, 8933, 14103) }, - { AOM_CDF4(2021, 5910, 9787) }, - { AOM_CDF4(29005, 32015, 32392) }, - { AOM_CDF4(17677, 27694, 30863) }, - { AOM_CDF4(9204, 17356, 23219) }, - { AOM_CDF4(2403, 7516, 12814) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) } }, - { { AOM_CDF4(10808, 22056, 26896) }, - { AOM_CDF4(25739, 32313, 32676) }, - { AOM_CDF4(17288, 30203, 32221) }, - { AOM_CDF4(11359, 24878, 29896) }, - { AOM_CDF4(6949, 17767, 24893) }, - { AOM_CDF4(4287, 11796, 18071) }, - { AOM_CDF4(27880, 32521, 32705) }, - { AOM_CDF4(19038, 31004, 32414) }, - { AOM_CDF4(12564, 26345, 30768) }, - { AOM_CDF4(8269, 19947, 26779) }, - { AOM_CDF4(5674, 14657, 21674) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(25742, 32319, 32671) }, - { AOM_CDF4(19557, 31164, 32454) }, - { AOM_CDF4(13381, 26381, 30755) }, - { AOM_CDF4(10101, 21466, 26722) }, - { AOM_CDF4(9209, 19650, 26825) }, - { AOM_CDF4(27107, 31917, 32432) }, - { AOM_CDF4(18056, 28893, 31203) }, - { AOM_CDF4(10200, 21434, 26764) }, - { AOM_CDF4(4660, 12913, 19502) }, - { AOM_CDF4(2368, 6930, 12504) }, - { AOM_CDF4(26960, 32158, 32613) }, - { AOM_CDF4(18628, 30005, 32031) }, - { AOM_CDF4(10233, 22442, 28232) }, - { AOM_CDF4(5471, 14630, 21516) }, - { AOM_CDF4(3235, 10767, 17109) }, - { AOM_CDF4(27696, 32440, 32692) }, - { AOM_CDF4(20032, 31167, 32438) 
}, - { AOM_CDF4(8700, 21341, 28442) }, - { AOM_CDF4(5662, 14831, 21795) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) } } }, - { { { AOM_CDF4(9704, 17294, 21132) }, - { AOM_CDF4(26762, 32278, 32633) }, - { AOM_CDF4(18382, 29620, 31819) }, - { AOM_CDF4(10891, 23475, 28723) }, - { AOM_CDF4(6358, 16583, 23309) }, - { AOM_CDF4(3248, 9118, 14141) }, - { AOM_CDF4(27204, 32573, 32699) }, - { AOM_CDF4(19818, 30824, 32329) }, - { AOM_CDF4(11772, 25120, 30041) }, - { AOM_CDF4(6995, 18033, 25039) }, - { AOM_CDF4(3752, 10442, 16098) }, - { AOM_CDF4(27222, 32256, 32559) }, - { AOM_CDF4(15356, 28399, 31475) }, - { AOM_CDF4(8821, 20635, 27057) }, - { AOM_CDF4(5511, 14404, 21239) }, - { AOM_CDF4(2935, 8222, 13051) }, - { AOM_CDF4(24875, 32120, 32529) }, - { AOM_CDF4(15233, 28265, 31445) }, - { AOM_CDF4(8605, 20570, 26932) }, - { AOM_CDF4(5431, 14413, 21196) }, - { AOM_CDF4(2994, 8341, 13223) }, - { AOM_CDF4(28201, 32604, 32700) }, - { AOM_CDF4(21041, 31446, 32456) }, - { AOM_CDF4(13221, 26213, 30475) }, - { AOM_CDF4(8255, 19385, 26037) }, - { AOM_CDF4(4930, 12585, 18830) }, - { AOM_CDF4(28768, 32448, 32627) }, - { AOM_CDF4(19705, 30561, 32021) }, - { AOM_CDF4(11572, 23589, 28220) }, - { AOM_CDF4(5532, 15034, 21446) }, - { AOM_CDF4(2460, 7150, 11456) }, - { AOM_CDF4(29874, 32619, 32699) }, - { AOM_CDF4(21621, 31071, 32201) }, - { AOM_CDF4(12511, 24747, 28992) }, - { AOM_CDF4(6281, 16395, 22748) }, - { AOM_CDF4(3246, 9278, 14497) }, - { AOM_CDF4(29715, 32625, 32712) }, - { AOM_CDF4(20958, 31011, 32283) }, - { AOM_CDF4(11233, 23671, 28806) }, - { AOM_CDF4(6012, 16128, 22868) }, - { AOM_CDF4(3427, 9851, 15414) }, - { AOM_CDF4(8192, 16384, 24576) } }, - { { AOM_CDF4(11016, 22111, 26794) }, - { AOM_CDF4(25946, 32357, 32677) }, - { AOM_CDF4(17890, 30452, 32252) }, - { AOM_CDF4(11678, 25142, 29816) }, - { AOM_CDF4(6720, 17534, 24584) }, - { AOM_CDF4(4230, 11665, 17820) }, - { AOM_CDF4(28400, 32623, 32747) }, - { AOM_CDF4(21164, 31668, 32575) }, - { 
AOM_CDF4(13572, 27388, 31182) }, - { AOM_CDF4(8234, 20750, 27358) }, - { AOM_CDF4(5065, 14055, 20897) }, - { AOM_CDF4(28981, 32547, 32705) }, - { AOM_CDF4(18681, 30543, 32239) }, - { AOM_CDF4(10919, 24075, 29286) }, - { AOM_CDF4(6431, 17199, 24077) }, - { AOM_CDF4(3819, 10464, 16618) }, - { AOM_CDF4(26870, 32467, 32693) }, - { AOM_CDF4(19041, 30831, 32347) }, - { AOM_CDF4(11794, 25211, 30016) }, - { AOM_CDF4(6888, 18019, 24970) }, - { AOM_CDF4(4370, 12363, 18992) }, - { AOM_CDF4(29578, 32670, 32744) }, - { AOM_CDF4(23159, 32007, 32613) }, - { AOM_CDF4(15315, 28669, 31676) }, - { AOM_CDF4(9298, 22607, 28782) }, - { AOM_CDF4(6144, 15913, 22968) }, - { AOM_CDF4(28110, 32499, 32669) }, - { AOM_CDF4(21574, 30937, 32015) }, - { AOM_CDF4(12759, 24818, 28727) }, - { AOM_CDF4(6545, 16761, 23042) }, - { AOM_CDF4(3649, 10597, 16833) }, - { AOM_CDF4(28163, 32552, 32728) }, - { AOM_CDF4(22101, 31469, 32464) }, - { AOM_CDF4(13160, 25472, 30143) }, - { AOM_CDF4(7303, 18684, 25468) }, - { AOM_CDF4(5241, 13975, 20955) }, - { AOM_CDF4(28400, 32631, 32744) }, - { AOM_CDF4(22104, 31793, 32603) }, - { AOM_CDF4(13557, 26571, 30846) }, - { AOM_CDF4(7749, 19861, 26675) }, - { AOM_CDF4(4873, 14030, 21234) }, - { AOM_CDF4(8192, 16384, 24576) } } }, - { { { AOM_CDF4(9800, 17635, 21073) }, - { AOM_CDF4(26153, 31885, 32527) }, - { AOM_CDF4(15038, 27852, 31006) }, - { AOM_CDF4(8718, 20564, 26486) }, - { AOM_CDF4(5128, 14076, 20514) }, - { AOM_CDF4(2636, 7566, 11925) }, - { AOM_CDF4(27551, 32504, 32701) }, - { AOM_CDF4(18310, 30054, 32100) }, - { AOM_CDF4(10211, 23420, 29082) }, - { AOM_CDF4(6222, 16876, 23916) }, - { AOM_CDF4(3462, 9954, 15498) }, - { AOM_CDF4(29991, 32633, 32721) }, - { AOM_CDF4(19883, 30751, 32201) }, - { AOM_CDF4(11141, 24184, 29285) }, - { AOM_CDF4(6420, 16940, 23774) }, - { AOM_CDF4(3392, 9753, 15118) }, - { AOM_CDF4(28465, 32616, 32712) }, - { AOM_CDF4(19850, 30702, 32244) }, - { AOM_CDF4(10983, 24024, 29223) }, - { AOM_CDF4(6294, 16770, 23582) }, - { AOM_CDF4(3244, 9283, 
14509) }, - { AOM_CDF4(30023, 32717, 32748) }, - { AOM_CDF4(22940, 32032, 32626) }, - { AOM_CDF4(14282, 27928, 31473) }, - { AOM_CDF4(8562, 21327, 27914) }, - { AOM_CDF4(4846, 13393, 19919) }, - { AOM_CDF4(29981, 32590, 32695) }, - { AOM_CDF4(20465, 30963, 32166) }, - { AOM_CDF4(11479, 23579, 28195) }, - { AOM_CDF4(5916, 15648, 22073) }, - { AOM_CDF4(3031, 8605, 13398) }, - { AOM_CDF4(31146, 32691, 32739) }, - { AOM_CDF4(23106, 31724, 32444) }, - { AOM_CDF4(13783, 26738, 30439) }, - { AOM_CDF4(7852, 19468, 25807) }, - { AOM_CDF4(3860, 11124, 16853) }, - { AOM_CDF4(31014, 32724, 32748) }, - { AOM_CDF4(23629, 32109, 32628) }, - { AOM_CDF4(14747, 28115, 31403) }, - { AOM_CDF4(8545, 21242, 27478) }, - { AOM_CDF4(4574, 12781, 19067) }, - { AOM_CDF4(8192, 16384, 24576) } }, - { { AOM_CDF4(9185, 19694, 24688) }, - { AOM_CDF4(26081, 31985, 32621) }, - { AOM_CDF4(16015, 29000, 31787) }, - { AOM_CDF4(10542, 23690, 29206) }, - { AOM_CDF4(6732, 17945, 24677) }, - { AOM_CDF4(3916, 11039, 16722) }, - { AOM_CDF4(28224, 32566, 32744) }, - { AOM_CDF4(19100, 31138, 32485) }, - { AOM_CDF4(12528, 26620, 30879) }, - { AOM_CDF4(7741, 20277, 26885) }, - { AOM_CDF4(4566, 12845, 18990) }, - { AOM_CDF4(29933, 32593, 32718) }, - { AOM_CDF4(17670, 30333, 32155) }, - { AOM_CDF4(10385, 23600, 28909) }, - { AOM_CDF4(6243, 16236, 22407) }, - { AOM_CDF4(3976, 10389, 16017) }, - { AOM_CDF4(28377, 32561, 32738) }, - { AOM_CDF4(19366, 31175, 32482) }, - { AOM_CDF4(13327, 27175, 31094) }, - { AOM_CDF4(8258, 20769, 27143) }, - { AOM_CDF4(4703, 13198, 19527) }, - { AOM_CDF4(31086, 32706, 32748) }, - { AOM_CDF4(22853, 31902, 32583) }, - { AOM_CDF4(14759, 28186, 31419) }, - { AOM_CDF4(9284, 22382, 28348) }, - { AOM_CDF4(5585, 15192, 21868) }, - { AOM_CDF4(28291, 32652, 32746) }, - { AOM_CDF4(19849, 32107, 32571) }, - { AOM_CDF4(14834, 26818, 29214) }, - { AOM_CDF4(10306, 22594, 28672) }, - { AOM_CDF4(6615, 17384, 23384) }, - { AOM_CDF4(28947, 32604, 32745) }, - { AOM_CDF4(25625, 32289, 32646) }, - { 
AOM_CDF4(18758, 28672, 31403) }, - { AOM_CDF4(10017, 23430, 28523) }, - { AOM_CDF4(6862, 15269, 22131) }, - { AOM_CDF4(23933, 32509, 32739) }, - { AOM_CDF4(19927, 31495, 32631) }, - { AOM_CDF4(11903, 26023, 30621) }, - { AOM_CDF4(7026, 20094, 27252) }, - { AOM_CDF4(5998, 18106, 24437) }, - { AOM_CDF4(8192, 16384, 24576) } } }, - { { { AOM_CDF4(4456, 11274, 15533) }, - { AOM_CDF4(21219, 29079, 31616) }, - { AOM_CDF4(11173, 23774, 28567) }, - { AOM_CDF4(7282, 18293, 24263) }, - { AOM_CDF4(4890, 13286, 19115) }, - { AOM_CDF4(1890, 5508, 8659) }, - { AOM_CDF4(26651, 32136, 32647) }, - { AOM_CDF4(14630, 28254, 31455) }, - { AOM_CDF4(8716, 21287, 27395) }, - { AOM_CDF4(5615, 15331, 22008) }, - { AOM_CDF4(2675, 7700, 12150) }, - { AOM_CDF4(29954, 32526, 32690) }, - { AOM_CDF4(16126, 28982, 31633) }, - { AOM_CDF4(9030, 21361, 27352) }, - { AOM_CDF4(5411, 14793, 21271) }, - { AOM_CDF4(2943, 8422, 13163) }, - { AOM_CDF4(29539, 32601, 32730) }, - { AOM_CDF4(18125, 30385, 32201) }, - { AOM_CDF4(10422, 24090, 29468) }, - { AOM_CDF4(6468, 17487, 24438) }, - { AOM_CDF4(2970, 8653, 13531) }, - { AOM_CDF4(30912, 32715, 32748) }, - { AOM_CDF4(20666, 31373, 32497) }, - { AOM_CDF4(12509, 26640, 30917) }, - { AOM_CDF4(8058, 20629, 27290) }, - { AOM_CDF4(4231, 12006, 18052) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) } }, - { { AOM_CDF4(10202, 20633, 25484) }, - { AOM_CDF4(27336, 31445, 32352) }, - { AOM_CDF4(12420, 24384, 28552) }, - { AOM_CDF4(7648, 18115, 23856) 
}, - { AOM_CDF4(5662, 14341, 19902) }, - { AOM_CDF4(3611, 10328, 15390) }, - { AOM_CDF4(30945, 32616, 32736) }, - { AOM_CDF4(18682, 30505, 32253) }, - { AOM_CDF4(11513, 25336, 30203) }, - { AOM_CDF4(7449, 19452, 26148) }, - { AOM_CDF4(4482, 13051, 18886) }, - { AOM_CDF4(32022, 32690, 32747) }, - { AOM_CDF4(18578, 30501, 32146) }, - { AOM_CDF4(11249, 23368, 28631) }, - { AOM_CDF4(5645, 16958, 22158) }, - { AOM_CDF4(5009, 11444, 16637) }, - { AOM_CDF4(31357, 32710, 32748) }, - { AOM_CDF4(21552, 31494, 32504) }, - { AOM_CDF4(13891, 27677, 31340) }, - { AOM_CDF4(9051, 22098, 28172) }, - { AOM_CDF4(5190, 13377, 19486) }, - { AOM_CDF4(32364, 32740, 32748) }, - { AOM_CDF4(24839, 31907, 32551) }, - { AOM_CDF4(17160, 28779, 31696) }, - { AOM_CDF4(12452, 24137, 29602) }, - { AOM_CDF4(6165, 15389, 22477) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) } } }, - { { { AOM_CDF4(2575, 7281, 11077) }, - { AOM_CDF4(14002, 20866, 25402) }, - { AOM_CDF4(6343, 15056, 19658) }, - { AOM_CDF4(4474, 11858, 17041) }, - { AOM_CDF4(2865, 8299, 12534) }, - { AOM_CDF4(1344, 3949, 6391) }, - { AOM_CDF4(24720, 31239, 32459) }, - { AOM_CDF4(12585, 25356, 29968) }, - { AOM_CDF4(7181, 18246, 24444) }, - { AOM_CDF4(5025, 13667, 19885) }, - { AOM_CDF4(2521, 7304, 11605) }, - { AOM_CDF4(29908, 32252, 32584) }, - { AOM_CDF4(17421, 29156, 31575) }, - { AOM_CDF4(9889, 22188, 27782) }, - { AOM_CDF4(5878, 15647, 22123) }, - { AOM_CDF4(2814, 8665, 13323) }, - { AOM_CDF4(30183, 32568, 
32713) }, - { AOM_CDF4(18528, 30195, 32049) }, - { AOM_CDF4(10982, 24606, 29657) }, - { AOM_CDF4(6957, 18165, 25231) }, - { AOM_CDF4(3508, 10118, 15468) }, - { AOM_CDF4(31761, 32736, 32748) }, - { AOM_CDF4(21041, 31328, 32546) }, - { AOM_CDF4(12568, 26732, 31166) }, - { AOM_CDF4(8052, 20720, 27733) }, - { AOM_CDF4(4336, 12192, 18396) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) } }, - { { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, 
- { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) } } } }, - { { { { AOM_CDF4(7062, 16472, 22319) }, - { AOM_CDF4(24538, 32261, 32674) }, - { AOM_CDF4(13675, 28041, 31779) }, - { AOM_CDF4(8590, 20674, 27631) }, - { AOM_CDF4(5685, 14675, 22013) }, - { AOM_CDF4(3655, 9898, 15731) }, - { AOM_CDF4(26493, 32418, 32658) }, - { AOM_CDF4(16376, 29342, 32090) }, - { AOM_CDF4(10594, 22649, 28970) }, - { AOM_CDF4(8176, 17170, 24303) }, - { AOM_CDF4(5605, 12694, 19139) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(23888, 31902, 32542) }, - { AOM_CDF4(18612, 29687, 31987) }, - { AOM_CDF4(16245, 24852, 29249) }, - { AOM_CDF4(15765, 22608, 27559) }, - { AOM_CDF4(19895, 24699, 27510) }, - { AOM_CDF4(28401, 32212, 32457) }, - { AOM_CDF4(15274, 27825, 30980) }, - { AOM_CDF4(9364, 18128, 24332) }, - { AOM_CDF4(2283, 8193, 15082) }, - { AOM_CDF4(1228, 3972, 7881) }, - { AOM_CDF4(29455, 32469, 32620) }, - { AOM_CDF4(17981, 28245, 31388) }, - { AOM_CDF4(10921, 20098, 26240) }, - { AOM_CDF4(3743, 11829, 18657) }, - { AOM_CDF4(2374, 9593, 15715) }, - { AOM_CDF4(31068, 32466, 32635) }, - { AOM_CDF4(20321, 29572, 31971) }, - { AOM_CDF4(10771, 20255, 27119) }, - { AOM_CDF4(2795, 10410, 17361) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) } }, - { { AOM_CDF4(9320, 22102, 
27840) }, - { AOM_CDF4(27057, 32464, 32724) }, - { AOM_CDF4(16331, 30268, 32309) }, - { AOM_CDF4(10319, 23935, 29720) }, - { AOM_CDF4(6189, 16448, 24106) }, - { AOM_CDF4(3589, 10884, 18808) }, - { AOM_CDF4(29026, 32624, 32748) }, - { AOM_CDF4(19226, 31507, 32587) }, - { AOM_CDF4(12692, 26921, 31203) }, - { AOM_CDF4(7049, 19532, 27635) }, - { AOM_CDF4(7727, 15669, 23252) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(28056, 32625, 32748) }, - { AOM_CDF4(22383, 32075, 32669) }, - { AOM_CDF4(15417, 27098, 31749) }, - { AOM_CDF4(18127, 26493, 27190) }, - { AOM_CDF4(5461, 16384, 21845) }, - { AOM_CDF4(27982, 32091, 32584) }, - { AOM_CDF4(19045, 29868, 31972) }, - { AOM_CDF4(10397, 22266, 27932) }, - { AOM_CDF4(5990, 13697, 21500) }, - { AOM_CDF4(1792, 6912, 15104) }, - { AOM_CDF4(28198, 32501, 32718) }, - { AOM_CDF4(21534, 31521, 32569) }, - { AOM_CDF4(11109, 25217, 30017) }, - { AOM_CDF4(5671, 15124, 26151) }, - { AOM_CDF4(4681, 14043, 18725) }, - { AOM_CDF4(28688, 32580, 32741) }, - { AOM_CDF4(22576, 32079, 32661) }, - { AOM_CDF4(10627, 22141, 28340) }, - { AOM_CDF4(9362, 14043, 28087) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) } } }, - { { { AOM_CDF4(7754, 16948, 22142) }, - { AOM_CDF4(25670, 32330, 32691) }, - { AOM_CDF4(15663, 29225, 31994) }, - { AOM_CDF4(9878, 23288, 29158) }, - { AOM_CDF4(6419, 17088, 24336) }, - { AOM_CDF4(3859, 11003, 17039) }, - { AOM_CDF4(27562, 32595, 32725) }, - { AOM_CDF4(17575, 30588, 32399) }, - { AOM_CDF4(10819, 24838, 30309) }, - { AOM_CDF4(7124, 18686, 25916) }, - { AOM_CDF4(4479, 12688, 19340) }, - { AOM_CDF4(28385, 32476, 32673) }, - { AOM_CDF4(15306, 29005, 31938) }, - { 
AOM_CDF4(8937, 21615, 28322) }, - { AOM_CDF4(5982, 15603, 22786) }, - { AOM_CDF4(3620, 10267, 16136) }, - { AOM_CDF4(27280, 32464, 32667) }, - { AOM_CDF4(15607, 29160, 32004) }, - { AOM_CDF4(9091, 22135, 28740) }, - { AOM_CDF4(6232, 16632, 24020) }, - { AOM_CDF4(4047, 11377, 17672) }, - { AOM_CDF4(29220, 32630, 32718) }, - { AOM_CDF4(19650, 31220, 32462) }, - { AOM_CDF4(13050, 26312, 30827) }, - { AOM_CDF4(9228, 20870, 27468) }, - { AOM_CDF4(6146, 15149, 21971) }, - { AOM_CDF4(30169, 32481, 32623) }, - { AOM_CDF4(17212, 29311, 31554) }, - { AOM_CDF4(9911, 21311, 26882) }, - { AOM_CDF4(4487, 13314, 20372) }, - { AOM_CDF4(2570, 7772, 12889) }, - { AOM_CDF4(30924, 32613, 32708) }, - { AOM_CDF4(19490, 30206, 32107) }, - { AOM_CDF4(11232, 23998, 29276) }, - { AOM_CDF4(6769, 17955, 25035) }, - { AOM_CDF4(4398, 12623, 19214) }, - { AOM_CDF4(30609, 32627, 32722) }, - { AOM_CDF4(19370, 30582, 32287) }, - { AOM_CDF4(10457, 23619, 29409) }, - { AOM_CDF4(6443, 17637, 24834) }, - { AOM_CDF4(4645, 13236, 20106) }, - { AOM_CDF4(8192, 16384, 24576) } }, - { { AOM_CDF4(8626, 20271, 26216) }, - { AOM_CDF4(26707, 32406, 32711) }, - { AOM_CDF4(16999, 30329, 32286) }, - { AOM_CDF4(11445, 25123, 30286) }, - { AOM_CDF4(6411, 18828, 25601) }, - { AOM_CDF4(6801, 12458, 20248) }, - { AOM_CDF4(29918, 32682, 32748) }, - { AOM_CDF4(20649, 31739, 32618) }, - { AOM_CDF4(12879, 27773, 31581) }, - { AOM_CDF4(7896, 21751, 28244) }, - { AOM_CDF4(5260, 14870, 23698) }, - { AOM_CDF4(29252, 32593, 32731) }, - { AOM_CDF4(17072, 30460, 32294) }, - { AOM_CDF4(10653, 24143, 29365) }, - { AOM_CDF4(6536, 17490, 23983) }, - { AOM_CDF4(4929, 13170, 20085) }, - { AOM_CDF4(28137, 32518, 32715) }, - { AOM_CDF4(18171, 30784, 32407) }, - { AOM_CDF4(11437, 25436, 30459) }, - { AOM_CDF4(7252, 18534, 26176) }, - { AOM_CDF4(4126, 13353, 20978) }, - { AOM_CDF4(31162, 32726, 32748) }, - { AOM_CDF4(23017, 32222, 32701) }, - { AOM_CDF4(15629, 29233, 32046) }, - { AOM_CDF4(9387, 22621, 29480) }, - { AOM_CDF4(6922, 17616, 
25010) }, - { AOM_CDF4(28838, 32265, 32614) }, - { AOM_CDF4(19701, 30206, 31920) }, - { AOM_CDF4(11214, 22410, 27933) }, - { AOM_CDF4(5320, 14177, 23034) }, - { AOM_CDF4(5049, 12881, 17827) }, - { AOM_CDF4(27484, 32471, 32734) }, - { AOM_CDF4(21076, 31526, 32561) }, - { AOM_CDF4(12707, 26303, 31211) }, - { AOM_CDF4(8169, 21722, 28219) }, - { AOM_CDF4(6045, 19406, 27042) }, - { AOM_CDF4(27753, 32572, 32745) }, - { AOM_CDF4(20832, 31878, 32653) }, - { AOM_CDF4(13250, 27356, 31674) }, - { AOM_CDF4(7718, 21508, 29858) }, - { AOM_CDF4(7209, 18350, 25559) }, - { AOM_CDF4(8192, 16384, 24576) } } }, - { { { AOM_CDF4(7876, 16901, 21741) }, - { AOM_CDF4(24001, 31898, 32625) }, - { AOM_CDF4(14529, 27959, 31451) }, - { AOM_CDF4(8273, 20818, 27258) }, - { AOM_CDF4(5278, 14673, 21510) }, - { AOM_CDF4(2983, 8843, 14039) }, - { AOM_CDF4(28016, 32574, 32732) }, - { AOM_CDF4(17471, 30306, 32301) }, - { AOM_CDF4(10224, 24063, 29728) }, - { AOM_CDF4(6602, 17954, 25052) }, - { AOM_CDF4(4002, 11585, 17759) }, - { AOM_CDF4(30190, 32634, 32739) }, - { AOM_CDF4(17497, 30282, 32270) }, - { AOM_CDF4(10229, 23729, 29538) }, - { AOM_CDF4(6344, 17211, 24440) }, - { AOM_CDF4(3849, 11189, 17108) }, - { AOM_CDF4(28570, 32583, 32726) }, - { AOM_CDF4(17521, 30161, 32238) }, - { AOM_CDF4(10153, 23565, 29378) }, - { AOM_CDF4(6455, 17341, 24443) }, - { AOM_CDF4(3907, 11042, 17024) }, - { AOM_CDF4(30689, 32715, 32748) }, - { AOM_CDF4(21546, 31840, 32610) }, - { AOM_CDF4(13547, 27581, 31459) }, - { AOM_CDF4(8912, 21757, 28309) }, - { AOM_CDF4(5548, 15080, 22046) }, - { AOM_CDF4(30783, 32540, 32685) }, - { AOM_CDF4(17540, 29528, 31668) }, - { AOM_CDF4(10160, 21468, 26783) }, - { AOM_CDF4(4724, 13393, 20054) }, - { AOM_CDF4(2702, 8174, 13102) }, - { AOM_CDF4(31648, 32686, 32742) }, - { AOM_CDF4(20954, 31094, 32337) }, - { AOM_CDF4(12420, 25698, 30179) }, - { AOM_CDF4(7304, 19320, 26248) }, - { AOM_CDF4(4366, 12261, 18864) }, - { AOM_CDF4(31581, 32723, 32748) }, - { AOM_CDF4(21373, 31586, 32525) }, - { 
AOM_CDF4(12744, 26625, 30885) }, - { AOM_CDF4(7431, 20322, 26950) }, - { AOM_CDF4(4692, 13323, 20111) }, - { AOM_CDF4(8192, 16384, 24576) } }, - { { AOM_CDF4(7833, 18369, 24095) }, - { AOM_CDF4(26650, 32273, 32702) }, - { AOM_CDF4(16371, 29961, 32191) }, - { AOM_CDF4(11055, 24082, 29629) }, - { AOM_CDF4(6892, 18644, 25400) }, - { AOM_CDF4(5006, 13057, 19240) }, - { AOM_CDF4(29834, 32666, 32748) }, - { AOM_CDF4(19577, 31335, 32570) }, - { AOM_CDF4(12253, 26509, 31122) }, - { AOM_CDF4(7991, 20772, 27711) }, - { AOM_CDF4(5677, 15910, 23059) }, - { AOM_CDF4(30109, 32532, 32720) }, - { AOM_CDF4(16747, 30166, 32252) }, - { AOM_CDF4(10134, 23542, 29184) }, - { AOM_CDF4(5791, 16176, 23556) }, - { AOM_CDF4(4362, 10414, 17284) }, - { AOM_CDF4(29492, 32626, 32748) }, - { AOM_CDF4(19894, 31402, 32525) }, - { AOM_CDF4(12942, 27071, 30869) }, - { AOM_CDF4(8346, 21216, 27405) }, - { AOM_CDF4(6572, 17087, 23859) }, - { AOM_CDF4(32035, 32735, 32748) }, - { AOM_CDF4(22957, 31838, 32618) }, - { AOM_CDF4(14724, 28572, 31772) }, - { AOM_CDF4(10364, 23999, 29553) }, - { AOM_CDF4(7004, 18433, 25655) }, - { AOM_CDF4(27528, 32277, 32681) }, - { AOM_CDF4(16959, 31171, 32096) }, - { AOM_CDF4(10486, 23593, 27962) }, - { AOM_CDF4(8192, 16384, 23211) }, - { AOM_CDF4(8937, 17873, 20852) }, - { AOM_CDF4(27715, 32002, 32615) }, - { AOM_CDF4(15073, 29491, 31676) }, - { AOM_CDF4(11264, 24576, 28672) }, - { AOM_CDF4(2341, 18725, 23406) }, - { AOM_CDF4(7282, 18204, 25486) }, - { AOM_CDF4(28547, 32213, 32657) }, - { AOM_CDF4(20788, 29773, 32239) }, - { AOM_CDF4(6780, 21469, 30508) }, - { AOM_CDF4(5958, 14895, 23831) }, - { AOM_CDF4(16384, 21845, 27307) }, - { AOM_CDF4(8192, 16384, 24576) } } }, - { { { AOM_CDF4(5992, 14304, 19765) }, - { AOM_CDF4(22612, 31238, 32456) }, - { AOM_CDF4(13456, 27162, 31087) }, - { AOM_CDF4(8001, 20062, 26504) }, - { AOM_CDF4(5168, 14105, 20764) }, - { AOM_CDF4(2632, 7771, 12385) }, - { AOM_CDF4(27034, 32344, 32709) }, - { AOM_CDF4(15850, 29415, 31997) }, - { AOM_CDF4(9494, 
22776, 28841) }, - { AOM_CDF4(6151, 16830, 23969) }, - { AOM_CDF4(3461, 10039, 15722) }, - { AOM_CDF4(30134, 32569, 32731) }, - { AOM_CDF4(15638, 29422, 31945) }, - { AOM_CDF4(9150, 21865, 28218) }, - { AOM_CDF4(5647, 15719, 22676) }, - { AOM_CDF4(3402, 9772, 15477) }, - { AOM_CDF4(28530, 32586, 32735) }, - { AOM_CDF4(17139, 30298, 32292) }, - { AOM_CDF4(10200, 24039, 29685) }, - { AOM_CDF4(6419, 17674, 24786) }, - { AOM_CDF4(3544, 10225, 15824) }, - { AOM_CDF4(31333, 32726, 32748) }, - { AOM_CDF4(20618, 31487, 32544) }, - { AOM_CDF4(12901, 27217, 31232) }, - { AOM_CDF4(8624, 21734, 28171) }, - { AOM_CDF4(5104, 14191, 20748) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) } }, - { { AOM_CDF4(11206, 21090, 26561) }, - { AOM_CDF4(28759, 32279, 32671) }, - { AOM_CDF4(14171, 27952, 31569) }, - { AOM_CDF4(9743, 22907, 29141) }, - { AOM_CDF4(6871, 17886, 24868) }, - { AOM_CDF4(4960, 13152, 19315) }, - { AOM_CDF4(31077, 32661, 32748) }, - { AOM_CDF4(19400, 31195, 32515) }, - { AOM_CDF4(12752, 26858, 31040) }, - { AOM_CDF4(8370, 22098, 28591) }, - { AOM_CDF4(5457, 15373, 22298) }, - { AOM_CDF4(31697, 32706, 32748) }, - { AOM_CDF4(17860, 30657, 32333) }, - { AOM_CDF4(12510, 24812, 29261) }, - { AOM_CDF4(6180, 19124, 24722) }, - { AOM_CDF4(5041, 13548, 17959) }, - { AOM_CDF4(31552, 32716, 32748) }, - { AOM_CDF4(21908, 31769, 32623) }, - { AOM_CDF4(14470, 28201, 31565) }, - { AOM_CDF4(9493, 22982, 28608) }, - { AOM_CDF4(6858, 17240, 24137) }, - { 
AOM_CDF4(32543, 32752, 32756) }, - { AOM_CDF4(24286, 32097, 32666) }, - { AOM_CDF4(15958, 29217, 32024) }, - { AOM_CDF4(10207, 24234, 29958) }, - { AOM_CDF4(6929, 18305, 25652) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) } } }, - { { { AOM_CDF4(4137, 10847, 15682) }, - { AOM_CDF4(17824, 27001, 30058) }, - { AOM_CDF4(10204, 22796, 28291) }, - { AOM_CDF4(6076, 15935, 22125) }, - { AOM_CDF4(3852, 10937, 16816) }, - { AOM_CDF4(2252, 6324, 10131) }, - { AOM_CDF4(25840, 32016, 32662) }, - { AOM_CDF4(15109, 28268, 31531) }, - { AOM_CDF4(9385, 22231, 28340) }, - { AOM_CDF4(6082, 16672, 23479) }, - { AOM_CDF4(3318, 9427, 14681) }, - { AOM_CDF4(30594, 32574, 32718) }, - { AOM_CDF4(16836, 29552, 31859) }, - { AOM_CDF4(9556, 22542, 28356) }, - { AOM_CDF4(6305, 16725, 23540) }, - { AOM_CDF4(3376, 9895, 15184) }, - { AOM_CDF4(29383, 32617, 32745) }, - { AOM_CDF4(18891, 30809, 32401) }, - { AOM_CDF4(11688, 25942, 30687) }, - { AOM_CDF4(7468, 19469, 26651) }, - { AOM_CDF4(3909, 11358, 17012) }, - { AOM_CDF4(31564, 32736, 32748) }, - { AOM_CDF4(20906, 31611, 32600) }, - { AOM_CDF4(13191, 27621, 31537) }, - { AOM_CDF4(8768, 22029, 28676) }, - { AOM_CDF4(5079, 14109, 20906) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - 
{ AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) } }, - { { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) } } } } }; + [CDF_SIZE(NUM_BASE_LEVELS + + 2)] = { { { { { AOM_CDF4(4034, 8930, 12727) }, + { AOM_CDF4(18082, 29741, 31877) }, + { AOM_CDF4(12596, 26124, 30493) }, + { AOM_CDF4(9446, 21118, 27005) }, 
+ { AOM_CDF4(6308, 15141, 21279) }, + { AOM_CDF4(2463, 6357, 9783) }, + { AOM_CDF4(20667, 30546, 31929) }, + { AOM_CDF4(13043, 26123, 30134) }, + { AOM_CDF4(8151, 18757, 24778) }, + { AOM_CDF4(5255, 12839, 18632) }, + { AOM_CDF4(2820, 7206, 11161) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(15736, 27553, 30604) }, + { AOM_CDF4(11210, 23794, 28787) }, + { AOM_CDF4(5947, 13874, 19701) }, + { AOM_CDF4(4215, 9323, 13891) }, + { AOM_CDF4(2833, 6462, 10059) }, + { AOM_CDF4(19605, 30393, 31582) }, + { AOM_CDF4(13523, 26252, 30248) }, + { AOM_CDF4(8446, 18622, 24512) }, + { AOM_CDF4(3818, 10343, 15974) }, + { AOM_CDF4(1481, 4117, 6796) }, + { AOM_CDF4(22649, 31302, 32190) }, + { AOM_CDF4(14829, 27127, 30449) }, + { AOM_CDF4(8313, 17702, 23304) }, + { AOM_CDF4(3022, 8301, 12786) }, + { AOM_CDF4(1536, 4412, 7184) }, + { AOM_CDF4(22354, 29774, 31372) }, + { AOM_CDF4(14723, 25472, 29214) }, + { AOM_CDF4(6673, 13745, 18662) }, + { AOM_CDF4(2068, 5766, 9322) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(6302, 16444, 21761) }, + { AOM_CDF4(23040, 31538, 32475) }, + { AOM_CDF4(15196, 28452, 31496) }, + { AOM_CDF4(10020, 22946, 28514) }, + { AOM_CDF4(6533, 16862, 23501) }, + { AOM_CDF4(3538, 9816, 15076) }, + { AOM_CDF4(24444, 31875, 32525) }, + { AOM_CDF4(15881, 28924, 31635) }, + { AOM_CDF4(9922, 22873, 28466) }, + { AOM_CDF4(6527, 16966, 23691) }, + { AOM_CDF4(4114, 11303, 17220) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { 
AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(20201, 30770, 32209) }, + { AOM_CDF4(14754, 28071, 31258) }, + { AOM_CDF4(8378, 20186, 26517) }, + { AOM_CDF4(5916, 15299, 21978) }, + { AOM_CDF4(4268, 11583, 17901) }, + { AOM_CDF4(24361, 32025, 32581) }, + { AOM_CDF4(18673, 30105, 31943) }, + { AOM_CDF4(10196, 22244, 27576) }, + { AOM_CDF4(5495, 14349, 20417) }, + { AOM_CDF4(2676, 7415, 11498) }, + { AOM_CDF4(24678, 31958, 32585) }, + { AOM_CDF4(18629, 29906, 31831) }, + { AOM_CDF4(9364, 20724, 26315) }, + { AOM_CDF4(4641, 12318, 18094) }, + { AOM_CDF4(2758, 7387, 11579) }, + { AOM_CDF4(25433, 31842, 32469) }, + { AOM_CDF4(18795, 29289, 31411) }, + { AOM_CDF4(7644, 17584, 23592) }, + { AOM_CDF4(3408, 9014, 15047) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } } }, + { { { AOM_CDF4(4536, 10072, 14001) }, + { AOM_CDF4(25459, 31416, 32206) }, + { AOM_CDF4(16605, 28048, 30818) }, + { AOM_CDF4(11008, 22857, 27719) }, + { AOM_CDF4(6915, 16268, 22315) }, + { AOM_CDF4(2625, 6812, 10537) }, + { AOM_CDF4(24257, 31788, 32499) }, + { AOM_CDF4(16880, 29454, 31879) }, + { AOM_CDF4(11958, 25054, 29778) }, + { AOM_CDF4(7916, 18718, 25084) }, + { AOM_CDF4(3383, 8777, 13446) }, + { AOM_CDF4(22720, 31603, 32393) }, + { AOM_CDF4(14960, 28125, 31335) }, + { AOM_CDF4(9731, 22210, 27928) }, + { AOM_CDF4(6304, 15832, 22277) }, + { AOM_CDF4(2910, 7818, 12166) }, + { AOM_CDF4(20375, 30627, 32131) }, + { AOM_CDF4(13904, 27284, 30887) }, + { AOM_CDF4(9368, 21558, 27144) }, + { AOM_CDF4(5937, 14966, 21119) }, + { AOM_CDF4(2667, 7225, 11319) }, + { AOM_CDF4(23970, 31470, 32378) }, + { AOM_CDF4(17173, 29734, 32018) }, + { AOM_CDF4(12795, 25441, 29965) }, + { AOM_CDF4(8981, 19680, 25893) }, + { AOM_CDF4(4728, 11372, 16902) }, + { AOM_CDF4(24287, 31797, 32439) }, + { AOM_CDF4(16703, 29145, 31696) }, + { AOM_CDF4(10833, 23554, 28725) }, + { AOM_CDF4(6468, 16566, 23057) 
}, + { AOM_CDF4(2415, 6562, 10278) }, + { AOM_CDF4(26610, 32395, 32659) }, + { AOM_CDF4(18590, 30498, 32117) }, + { AOM_CDF4(12420, 25756, 29950) }, + { AOM_CDF4(7639, 18746, 24710) }, + { AOM_CDF4(3001, 8086, 12347) }, + { AOM_CDF4(25076, 32064, 32580) }, + { AOM_CDF4(17946, 30128, 32028) }, + { AOM_CDF4(12024, 24985, 29378) }, + { AOM_CDF4(7517, 18390, 24304) }, + { AOM_CDF4(3243, 8781, 13331) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(6037, 16771, 21957) }, + { AOM_CDF4(24774, 31704, 32426) }, + { AOM_CDF4(16830, 28589, 31056) }, + { AOM_CDF4(10602, 22828, 27760) }, + { AOM_CDF4(6733, 16829, 23071) }, + { AOM_CDF4(3250, 8914, 13556) }, + { AOM_CDF4(25582, 32220, 32668) }, + { AOM_CDF4(18659, 30342, 32223) }, + { AOM_CDF4(12546, 26149, 30515) }, + { AOM_CDF4(8420, 20451, 26801) }, + { AOM_CDF4(4636, 12420, 18344) }, + { AOM_CDF4(27581, 32362, 32639) }, + { AOM_CDF4(18987, 30083, 31978) }, + { AOM_CDF4(11327, 24248, 29084) }, + { AOM_CDF4(7264, 17719, 24120) }, + { AOM_CDF4(3995, 10768, 16169) }, + { AOM_CDF4(25893, 31831, 32487) }, + { AOM_CDF4(16577, 28587, 31379) }, + { AOM_CDF4(10189, 22748, 28182) }, + { AOM_CDF4(6832, 17094, 23556) }, + { AOM_CDF4(3708, 10110, 15334) }, + { AOM_CDF4(25904, 32282, 32656) }, + { AOM_CDF4(19721, 30792, 32276) }, + { AOM_CDF4(12819, 26243, 30411) }, + { AOM_CDF4(8572, 20614, 26891) }, + { AOM_CDF4(5364, 14059, 20467) }, + { AOM_CDF4(26580, 32438, 32677) }, + { AOM_CDF4(20852, 31225, 32340) }, + { AOM_CDF4(12435, 25700, 29967) }, + { AOM_CDF4(8691, 20825, 26976) }, + { AOM_CDF4(4446, 12209, 17269) }, + { AOM_CDF4(27350, 32429, 32696) }, + { AOM_CDF4(21372, 30977, 32272) }, + { AOM_CDF4(12673, 25270, 29853) }, + { AOM_CDF4(9208, 20925, 26640) }, + { AOM_CDF4(5018, 13351, 18732) }, + { AOM_CDF4(27351, 32479, 32713) }, + { AOM_CDF4(21398, 31209, 32387) }, + { AOM_CDF4(12162, 25047, 29842) }, + { AOM_CDF4(7896, 18691, 25319) }, + { AOM_CDF4(4670, 12882, 18881) }, + { AOM_CDF4(8192, 16384, 24576) } } }, + { { { 
AOM_CDF4(5487, 10460, 13708) }, + { AOM_CDF4(21597, 28303, 30674) }, + { AOM_CDF4(11037, 21953, 26476) }, + { AOM_CDF4(8147, 17962, 22952) }, + { AOM_CDF4(5242, 13061, 18532) }, + { AOM_CDF4(1889, 5208, 8182) }, + { AOM_CDF4(26774, 32133, 32590) }, + { AOM_CDF4(17844, 29564, 31767) }, + { AOM_CDF4(11690, 24438, 29171) }, + { AOM_CDF4(7542, 18215, 24459) }, + { AOM_CDF4(2993, 8050, 12319) }, + { AOM_CDF4(28023, 32328, 32591) }, + { AOM_CDF4(18651, 30126, 31954) }, + { AOM_CDF4(12164, 25146, 29589) }, + { AOM_CDF4(7762, 18530, 24771) }, + { AOM_CDF4(3492, 9183, 13920) }, + { AOM_CDF4(27591, 32008, 32491) }, + { AOM_CDF4(17149, 28853, 31510) }, + { AOM_CDF4(11485, 24003, 28860) }, + { AOM_CDF4(7697, 18086, 24210) }, + { AOM_CDF4(3075, 7999, 12218) }, + { AOM_CDF4(28268, 32482, 32654) }, + { AOM_CDF4(19631, 31051, 32404) }, + { AOM_CDF4(13860, 27260, 31020) }, + { AOM_CDF4(9605, 21613, 27594) }, + { AOM_CDF4(4876, 12162, 17908) }, + { AOM_CDF4(27248, 32316, 32576) }, + { AOM_CDF4(18955, 30457, 32075) }, + { AOM_CDF4(11824, 23997, 28795) }, + { AOM_CDF4(7346, 18196, 24647) }, + { AOM_CDF4(3403, 9247, 14111) }, + { AOM_CDF4(29711, 32655, 32735) }, + { AOM_CDF4(21169, 31394, 32417) }, + { AOM_CDF4(13487, 27198, 30957) }, + { AOM_CDF4(8828, 21683, 27614) }, + { AOM_CDF4(4270, 11451, 17038) }, + { AOM_CDF4(28708, 32578, 32731) }, + { AOM_CDF4(20120, 31241, 32482) }, + { AOM_CDF4(13692, 27550, 31321) }, + { AOM_CDF4(9418, 22514, 28439) }, + { AOM_CDF4(4999, 13283, 19462) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(5673, 14302, 19711) }, + { AOM_CDF4(26251, 30701, 31834) }, + { AOM_CDF4(12782, 23783, 27803) }, + { AOM_CDF4(9127, 20657, 25808) }, + { AOM_CDF4(6368, 16208, 21462) }, + { AOM_CDF4(2465, 7177, 10822) }, + { AOM_CDF4(29961, 32563, 32719) }, + { AOM_CDF4(18318, 29891, 31949) }, + { AOM_CDF4(11361, 24514, 29357) }, + { AOM_CDF4(7900, 19603, 25607) }, + { AOM_CDF4(4002, 10590, 15546) }, + { AOM_CDF4(29637, 32310, 32595) }, + { AOM_CDF4(18296, 29913, 31809) 
}, + { AOM_CDF4(10144, 21515, 26871) }, + { AOM_CDF4(5358, 14322, 20394) }, + { AOM_CDF4(3067, 8362, 13346) }, + { AOM_CDF4(28652, 32470, 32676) }, + { AOM_CDF4(17538, 30771, 32209) }, + { AOM_CDF4(13924, 26882, 30494) }, + { AOM_CDF4(10496, 22837, 27869) }, + { AOM_CDF4(7236, 16396, 21621) }, + { AOM_CDF4(30743, 32687, 32746) }, + { AOM_CDF4(23006, 31676, 32489) }, + { AOM_CDF4(14494, 27828, 31120) }, + { AOM_CDF4(10174, 22801, 28352) }, + { AOM_CDF4(6242, 15281, 21043) }, + { AOM_CDF4(25817, 32243, 32720) }, + { AOM_CDF4(18618, 31367, 32325) }, + { AOM_CDF4(13997, 28318, 31878) }, + { AOM_CDF4(12255, 26534, 31383) }, + { AOM_CDF4(9561, 21588, 28450) }, + { AOM_CDF4(28188, 32635, 32724) }, + { AOM_CDF4(22060, 32365, 32728) }, + { AOM_CDF4(18102, 30690, 32528) }, + { AOM_CDF4(14196, 28864, 31999) }, + { AOM_CDF4(12262, 25792, 30865) }, + { AOM_CDF4(24176, 32109, 32628) }, + { AOM_CDF4(18280, 29681, 31963) }, + { AOM_CDF4(10205, 23703, 29664) }, + { AOM_CDF4(7889, 20025, 27676) }, + { AOM_CDF4(6060, 16743, 23970) }, + { AOM_CDF4(8192, 16384, 24576) } } }, + { { { AOM_CDF4(5141, 7096, 8260) }, + { AOM_CDF4(27186, 29022, 29789) }, + { AOM_CDF4(6668, 12568, 15682) }, + { AOM_CDF4(2172, 6181, 8638) }, + { AOM_CDF4(1126, 3379, 4531) }, + { AOM_CDF4(443, 1361, 2254) }, + { AOM_CDF4(26083, 31153, 32436) }, + { AOM_CDF4(13486, 24603, 28483) }, + { AOM_CDF4(6508, 14840, 19910) }, + { AOM_CDF4(3386, 8800, 13286) }, + { AOM_CDF4(1530, 4322, 7054) }, + { AOM_CDF4(29639, 32080, 32548) }, + { AOM_CDF4(15897, 27552, 30290) }, + { AOM_CDF4(8588, 20047, 25383) }, + { AOM_CDF4(4889, 13339, 19269) }, + { AOM_CDF4(2240, 6871, 10498) }, + { AOM_CDF4(28165, 32197, 32517) }, + { AOM_CDF4(20735, 30427, 31568) }, + { AOM_CDF4(14325, 24671, 27692) }, + { AOM_CDF4(5119, 12554, 17805) }, + { AOM_CDF4(1810, 5441, 8261) }, + { AOM_CDF4(31212, 32724, 32748) }, + { AOM_CDF4(23352, 31766, 32545) }, + { AOM_CDF4(14669, 27570, 31059) }, + { AOM_CDF4(8492, 20894, 27272) }, + { AOM_CDF4(3644, 10194, 
15204) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(2461, 7013, 9371) }, + { AOM_CDF4(24749, 29600, 30986) }, + { AOM_CDF4(9466, 19037, 22417) }, + { AOM_CDF4(3584, 9280, 14400) }, + { AOM_CDF4(1505, 3929, 5433) }, + { AOM_CDF4(677, 1500, 2736) }, + { AOM_CDF4(23987, 30702, 32117) }, + { AOM_CDF4(13554, 24571, 29263) }, + { AOM_CDF4(6211, 14556, 21155) }, + { AOM_CDF4(3135, 10972, 15625) }, + { AOM_CDF4(2435, 7127, 11427) }, + { AOM_CDF4(31300, 32532, 32550) }, + { AOM_CDF4(14757, 30365, 31954) }, + { AOM_CDF4(4405, 11612, 18553) }, + { AOM_CDF4(580, 4132, 7322) }, + { AOM_CDF4(1695, 10169, 14124) }, + { AOM_CDF4(30008, 32282, 32591) }, + { AOM_CDF4(19244, 30108, 31748) }, + { AOM_CDF4(11180, 24158, 29555) }, + { AOM_CDF4(5650, 14972, 19209) }, + { AOM_CDF4(2114, 5109, 8456) }, + { AOM_CDF4(31856, 32716, 32748) }, + { AOM_CDF4(23012, 31664, 32572) }, + { AOM_CDF4(13694, 26656, 30636) }, + { AOM_CDF4(8142, 19508, 26093) }, + { AOM_CDF4(4253, 10955, 16724) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { 
AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } } }, + { { { AOM_CDF4(601, 983, 1311) }, + { AOM_CDF4(18725, 23406, 28087) }, + { AOM_CDF4(5461, 8192, 10923) }, + { AOM_CDF4(3781, 15124, 21425) }, + { AOM_CDF4(2587, 7761, 12072) }, + { AOM_CDF4(106, 458, 810) }, + { AOM_CDF4(22282, 29710, 31894) }, + { AOM_CDF4(8508, 20926, 25984) }, + { AOM_CDF4(3726, 12713, 18083) }, + { AOM_CDF4(1620, 7112, 10893) }, + { AOM_CDF4(729, 2236, 3495) }, + { AOM_CDF4(30163, 32474, 32684) }, + { AOM_CDF4(18304, 30464, 32000) }, + { AOM_CDF4(11443, 26526, 29647) }, + { AOM_CDF4(6007, 15292, 21299) }, + { AOM_CDF4(2234, 6703, 8937) }, + { AOM_CDF4(30954, 32177, 32571) }, + { AOM_CDF4(17363, 29562, 31076) }, + { AOM_CDF4(9686, 22464, 27410) }, + { AOM_CDF4(8192, 16384, 21390) }, + { AOM_CDF4(1755, 8046, 11264) }, + { AOM_CDF4(31168, 32734, 32748) }, + { AOM_CDF4(22486, 31441, 32471) }, + { AOM_CDF4(12833, 25627, 29738) }, + { AOM_CDF4(6980, 17379, 23122) }, + { AOM_CDF4(3111, 8887, 13479) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 
16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } } } }, + { { { { AOM_CDF4(6041, 11854, 15927) }, + { AOM_CDF4(20326, 30905, 32251) }, + { AOM_CDF4(14164, 26831, 30725) }, + { AOM_CDF4(9760, 20647, 26585) }, + { AOM_CDF4(6416, 14953, 21219) }, + { AOM_CDF4(2966, 7151, 10891) }, + { AOM_CDF4(23567, 31374, 32254) }, + { AOM_CDF4(14978, 27416, 30946) }, + { AOM_CDF4(9434, 20225, 26254) }, + { AOM_CDF4(6658, 14558, 20535) }, + { AOM_CDF4(3916, 8677, 12989) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(18088, 29545, 31587) }, + { AOM_CDF4(13062, 25843, 30073) }, + { AOM_CDF4(8940, 
16827, 22251) }, + { AOM_CDF4(7654, 13220, 17973) }, + { AOM_CDF4(5733, 10316, 14456) }, + { AOM_CDF4(22879, 31388, 32114) }, + { AOM_CDF4(15215, 27993, 30955) }, + { AOM_CDF4(9397, 19445, 24978) }, + { AOM_CDF4(3442, 9813, 15344) }, + { AOM_CDF4(1368, 3936, 6532) }, + { AOM_CDF4(25494, 32033, 32406) }, + { AOM_CDF4(16772, 27963, 30718) }, + { AOM_CDF4(9419, 18165, 23260) }, + { AOM_CDF4(2677, 7501, 11797) }, + { AOM_CDF4(1516, 4344, 7170) }, + { AOM_CDF4(26556, 31454, 32101) }, + { AOM_CDF4(17128, 27035, 30108) }, + { AOM_CDF4(8324, 15344, 20249) }, + { AOM_CDF4(1903, 5696, 9469) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(8455, 19003, 24368) }, + { AOM_CDF4(23563, 32021, 32604) }, + { AOM_CDF4(16237, 29446, 31935) }, + { AOM_CDF4(10724, 23999, 29358) }, + { AOM_CDF4(6725, 17528, 24416) }, + { AOM_CDF4(3927, 10927, 16825) }, + { AOM_CDF4(26313, 32288, 32634) }, + { AOM_CDF4(17430, 30095, 32095) }, + { AOM_CDF4(11116, 24606, 29679) }, + { AOM_CDF4(7195, 18384, 25269) }, + { AOM_CDF4(4726, 12852, 19315) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(22822, 31648, 32483) }, + { AOM_CDF4(16724, 29633, 31929) }, + { AOM_CDF4(10261, 23033, 28725) }, + { AOM_CDF4(7029, 17840, 24528) }, + { AOM_CDF4(4867, 13886, 21502) }, + { AOM_CDF4(25298, 31892, 32491) }, + { AOM_CDF4(17809, 29330, 31512) }, + { AOM_CDF4(9668, 21329, 26579) }, + { AOM_CDF4(4774, 12956, 18976) }, + { AOM_CDF4(2322, 7030, 11540) }, + { AOM_CDF4(25472, 31920, 32543) }, + { AOM_CDF4(17957, 29387, 31632) }, + { AOM_CDF4(9196, 20593, 26400) }, + { AOM_CDF4(4680, 12705, 19202) }, + { AOM_CDF4(2917, 8456, 13436) }, + { AOM_CDF4(26471, 
32059, 32574) }, + { AOM_CDF4(18458, 29783, 31909) }, + { AOM_CDF4(8400, 19464, 25956) }, + { AOM_CDF4(3812, 10973, 17206) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } } }, + { { { AOM_CDF4(6779, 13743, 17678) }, + { AOM_CDF4(24806, 31797, 32457) }, + { AOM_CDF4(17616, 29047, 31372) }, + { AOM_CDF4(11063, 23175, 28003) }, + { AOM_CDF4(6521, 16110, 22324) }, + { AOM_CDF4(2764, 7504, 11654) }, + { AOM_CDF4(25266, 32367, 32637) }, + { AOM_CDF4(19054, 30553, 32175) }, + { AOM_CDF4(12139, 25212, 29807) }, + { AOM_CDF4(7311, 18162, 24704) }, + { AOM_CDF4(3397, 9164, 14074) }, + { AOM_CDF4(25988, 32208, 32522) }, + { AOM_CDF4(16253, 28912, 31526) }, + { AOM_CDF4(9151, 21387, 27372) }, + { AOM_CDF4(5688, 14915, 21496) }, + { AOM_CDF4(2717, 7627, 12004) }, + { AOM_CDF4(23144, 31855, 32443) }, + { AOM_CDF4(16070, 28491, 31325) }, + { AOM_CDF4(8702, 20467, 26517) }, + { AOM_CDF4(5243, 13956, 20367) }, + { AOM_CDF4(2621, 7335, 11567) }, + { AOM_CDF4(26636, 32340, 32630) }, + { AOM_CDF4(19990, 31050, 32341) }, + { AOM_CDF4(13243, 26105, 30315) }, + { AOM_CDF4(8588, 19521, 25918) }, + { AOM_CDF4(4717, 11585, 17304) }, + { AOM_CDF4(25844, 32292, 32582) }, + { AOM_CDF4(19090, 30635, 32097) }, + { AOM_CDF4(11963, 24546, 28939) }, + { AOM_CDF4(6218, 16087, 22354) }, + { AOM_CDF4(2340, 6608, 10426) }, + { AOM_CDF4(28046, 32576, 32694) }, + { AOM_CDF4(21178, 31313, 32296) }, + { AOM_CDF4(13486, 26184, 29870) }, + { AOM_CDF4(7149, 17871, 23723) }, + { AOM_CDF4(2833, 7958, 12259) }, + { AOM_CDF4(27710, 32528, 32686) }, + { AOM_CDF4(20674, 31076, 32268) }, + { AOM_CDF4(12413, 24955, 29243) }, + { AOM_CDF4(6676, 16927, 23097) }, + { AOM_CDF4(2966, 8333, 12919) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(8639, 19339, 24429) }, + { AOM_CDF4(24404, 31837, 32525) }, + { AOM_CDF4(16997, 29425, 31784) }, + { AOM_CDF4(11253, 24234, 29149) }, + { AOM_CDF4(6751, 17394, 24028) }, + { AOM_CDF4(3490, 9830, 15191) }, + { AOM_CDF4(26283, 32471, 32714) }, + { 
AOM_CDF4(19599, 31168, 32442) }, + { AOM_CDF4(13146, 26954, 30893) }, + { AOM_CDF4(8214, 20588, 26890) }, + { AOM_CDF4(4699, 13081, 19300) }, + { AOM_CDF4(28212, 32458, 32669) }, + { AOM_CDF4(18594, 30316, 32100) }, + { AOM_CDF4(11219, 24408, 29234) }, + { AOM_CDF4(6865, 17656, 24149) }, + { AOM_CDF4(3678, 10362, 16006) }, + { AOM_CDF4(25825, 32136, 32616) }, + { AOM_CDF4(17313, 29853, 32021) }, + { AOM_CDF4(11197, 24471, 29472) }, + { AOM_CDF4(6947, 17781, 24405) }, + { AOM_CDF4(3768, 10660, 16261) }, + { AOM_CDF4(27352, 32500, 32706) }, + { AOM_CDF4(20850, 31468, 32469) }, + { AOM_CDF4(14021, 27707, 31133) }, + { AOM_CDF4(8964, 21748, 27838) }, + { AOM_CDF4(5437, 14665, 21187) }, + { AOM_CDF4(26304, 32492, 32698) }, + { AOM_CDF4(20409, 31380, 32385) }, + { AOM_CDF4(13682, 27222, 30632) }, + { AOM_CDF4(8974, 21236, 26685) }, + { AOM_CDF4(4234, 11665, 16934) }, + { AOM_CDF4(26273, 32357, 32711) }, + { AOM_CDF4(20672, 31242, 32441) }, + { AOM_CDF4(14172, 27254, 30902) }, + { AOM_CDF4(9870, 21898, 27275) }, + { AOM_CDF4(5164, 13506, 19270) }, + { AOM_CDF4(26725, 32459, 32728) }, + { AOM_CDF4(20991, 31442, 32527) }, + { AOM_CDF4(13071, 26434, 30811) }, + { AOM_CDF4(8184, 20090, 26742) }, + { AOM_CDF4(4803, 13255, 19895) }, + { AOM_CDF4(8192, 16384, 24576) } } }, + { { { AOM_CDF4(7555, 14942, 18501) }, + { AOM_CDF4(24410, 31178, 32287) }, + { AOM_CDF4(14394, 26738, 30253) }, + { AOM_CDF4(8413, 19554, 25195) }, + { AOM_CDF4(4766, 12924, 18785) }, + { AOM_CDF4(2029, 5806, 9207) }, + { AOM_CDF4(26776, 32364, 32663) }, + { AOM_CDF4(18732, 29967, 31931) }, + { AOM_CDF4(11005, 23786, 28852) }, + { AOM_CDF4(6466, 16909, 23510) }, + { AOM_CDF4(3044, 8638, 13419) }, + { AOM_CDF4(29208, 32582, 32704) }, + { AOM_CDF4(20068, 30857, 32208) }, + { AOM_CDF4(12003, 25085, 29595) }, + { AOM_CDF4(6947, 17750, 24189) }, + { AOM_CDF4(3245, 9103, 14007) }, + { AOM_CDF4(27359, 32465, 32669) }, + { AOM_CDF4(19421, 30614, 32174) }, + { AOM_CDF4(11915, 25010, 29579) }, + { AOM_CDF4(6950, 
17676, 24074) }, + { AOM_CDF4(3007, 8473, 13096) }, + { AOM_CDF4(29002, 32676, 32735) }, + { AOM_CDF4(22102, 31849, 32576) }, + { AOM_CDF4(14408, 28009, 31405) }, + { AOM_CDF4(9027, 21679, 27931) }, + { AOM_CDF4(4694, 12678, 18748) }, + { AOM_CDF4(28216, 32528, 32682) }, + { AOM_CDF4(20849, 31264, 32318) }, + { AOM_CDF4(12756, 25815, 29751) }, + { AOM_CDF4(7565, 18801, 24923) }, + { AOM_CDF4(3509, 9533, 14477) }, + { AOM_CDF4(30133, 32687, 32739) }, + { AOM_CDF4(23063, 31910, 32515) }, + { AOM_CDF4(14588, 28051, 31132) }, + { AOM_CDF4(9085, 21649, 27457) }, + { AOM_CDF4(4261, 11654, 17264) }, + { AOM_CDF4(29518, 32691, 32748) }, + { AOM_CDF4(22451, 31959, 32613) }, + { AOM_CDF4(14864, 28722, 31700) }, + { AOM_CDF4(9695, 22964, 28716) }, + { AOM_CDF4(4932, 13358, 19502) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(6465, 16958, 21688) }, + { AOM_CDF4(25199, 31514, 32360) }, + { AOM_CDF4(14774, 27149, 30607) }, + { AOM_CDF4(9257, 21438, 26972) }, + { AOM_CDF4(5723, 15183, 21882) }, + { AOM_CDF4(3150, 8879, 13731) }, + { AOM_CDF4(26989, 32262, 32682) }, + { AOM_CDF4(17396, 29937, 32085) }, + { AOM_CDF4(11387, 24901, 29784) }, + { AOM_CDF4(7289, 18821, 25548) }, + { AOM_CDF4(3734, 10577, 16086) }, + { AOM_CDF4(29728, 32501, 32695) }, + { AOM_CDF4(17431, 29701, 31903) }, + { AOM_CDF4(9921, 22826, 28300) }, + { AOM_CDF4(5896, 15434, 22068) }, + { AOM_CDF4(3430, 9646, 14757) }, + { AOM_CDF4(28614, 32511, 32705) }, + { AOM_CDF4(19364, 30638, 32263) }, + { AOM_CDF4(13129, 26254, 30402) }, + { AOM_CDF4(8754, 20484, 26440) }, + { AOM_CDF4(4378, 11607, 17110) }, + { AOM_CDF4(30292, 32671, 32744) }, + { AOM_CDF4(21780, 31603, 32501) }, + { AOM_CDF4(14314, 27829, 31291) }, + { AOM_CDF4(9611, 22327, 28263) }, + { AOM_CDF4(4890, 13087, 19065) }, + { AOM_CDF4(25862, 32567, 32733) }, + { AOM_CDF4(20794, 32050, 32567) }, + { AOM_CDF4(17243, 30625, 32254) }, + { AOM_CDF4(13283, 27628, 31474) }, + { AOM_CDF4(9669, 22532, 28918) }, + { AOM_CDF4(27435, 32697, 32748) }, + { 
AOM_CDF4(24922, 32390, 32714) }, + { AOM_CDF4(21449, 31504, 32536) }, + { AOM_CDF4(16392, 29729, 31832) }, + { AOM_CDF4(11692, 24884, 29076) }, + { AOM_CDF4(24193, 32290, 32735) }, + { AOM_CDF4(18909, 31104, 32563) }, + { AOM_CDF4(12236, 26841, 31403) }, + { AOM_CDF4(8171, 21840, 29082) }, + { AOM_CDF4(7224, 17280, 25275) }, + { AOM_CDF4(8192, 16384, 24576) } } }, + { { { AOM_CDF4(3078, 6839, 9890) }, + { AOM_CDF4(13837, 20450, 24479) }, + { AOM_CDF4(5914, 14222, 19328) }, + { AOM_CDF4(3866, 10267, 14762) }, + { AOM_CDF4(2612, 7208, 11042) }, + { AOM_CDF4(1067, 2991, 4776) }, + { AOM_CDF4(25817, 31646, 32529) }, + { AOM_CDF4(13708, 26338, 30385) }, + { AOM_CDF4(7328, 18585, 24870) }, + { AOM_CDF4(4691, 13080, 19276) }, + { AOM_CDF4(1825, 5253, 8352) }, + { AOM_CDF4(29386, 32315, 32624) }, + { AOM_CDF4(17160, 29001, 31360) }, + { AOM_CDF4(9602, 21862, 27396) }, + { AOM_CDF4(5915, 15772, 22148) }, + { AOM_CDF4(2786, 7779, 12047) }, + { AOM_CDF4(29246, 32450, 32663) }, + { AOM_CDF4(18696, 29929, 31818) }, + { AOM_CDF4(10510, 23369, 28560) }, + { AOM_CDF4(6229, 16499, 23125) }, + { AOM_CDF4(2608, 7448, 11705) }, + { AOM_CDF4(30753, 32710, 32748) }, + { AOM_CDF4(21638, 31487, 32503) }, + { AOM_CDF4(12937, 26854, 30870) }, + { AOM_CDF4(8182, 20596, 26970) }, + { AOM_CDF4(3637, 10269, 15497) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(5244, 12150, 16906) }, + { AOM_CDF4(20486, 26858, 29701) }, + { AOM_CDF4(7756, 18317, 23735) }, + 
{ AOM_CDF4(3452, 9256, 13146) }, + { AOM_CDF4(2020, 5206, 8229) }, + { AOM_CDF4(1801, 4993, 7903) }, + { AOM_CDF4(27051, 31858, 32531) }, + { AOM_CDF4(15988, 27531, 30619) }, + { AOM_CDF4(9188, 21484, 26719) }, + { AOM_CDF4(6273, 17186, 23800) }, + { AOM_CDF4(3108, 9355, 14764) }, + { AOM_CDF4(31076, 32520, 32680) }, + { AOM_CDF4(18119, 30037, 31850) }, + { AOM_CDF4(10244, 22969, 27472) }, + { AOM_CDF4(4692, 14077, 19273) }, + { AOM_CDF4(3694, 11677, 17556) }, + { AOM_CDF4(30060, 32581, 32720) }, + { AOM_CDF4(21011, 30775, 32120) }, + { AOM_CDF4(11931, 24820, 29289) }, + { AOM_CDF4(7119, 17662, 24356) }, + { AOM_CDF4(3833, 10706, 16304) }, + { AOM_CDF4(31954, 32731, 32748) }, + { AOM_CDF4(23913, 31724, 32489) }, + { AOM_CDF4(15520, 28060, 31286) }, + { AOM_CDF4(11517, 23008, 28571) }, + { AOM_CDF4(6193, 14508, 20629) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } } }, + { { { AOM_CDF4(1035, 2807, 4156) }, + { AOM_CDF4(13162, 18138, 20939) }, + { AOM_CDF4(2696, 6633, 8755) }, + { AOM_CDF4(1373, 4161, 6853) }, + { AOM_CDF4(1099, 2746, 4716) }, + { AOM_CDF4(340, 1021, 1599) }, + { AOM_CDF4(22826, 30419, 32135) }, + { AOM_CDF4(10395, 21762, 26942) }, + { AOM_CDF4(4726, 12407, 17361) }, + { AOM_CDF4(2447, 7080, 10593) }, + { AOM_CDF4(1227, 3717, 6011) }, + { AOM_CDF4(28156, 31424, 31934) }, + { AOM_CDF4(16915, 27754, 30373) }, + { AOM_CDF4(9148, 20990, 26431) }, + { AOM_CDF4(5950, 15515, 21148) }, + { AOM_CDF4(2492, 7327, 11526) }, + { 
AOM_CDF4(30602, 32477, 32670) }, + { AOM_CDF4(20026, 29955, 31568) }, + { AOM_CDF4(11220, 23628, 28105) }, + { AOM_CDF4(6652, 17019, 22973) }, + { AOM_CDF4(3064, 8536, 13043) }, + { AOM_CDF4(31769, 32724, 32748) }, + { AOM_CDF4(22230, 30887, 32373) }, + { AOM_CDF4(12234, 25079, 29731) }, + { AOM_CDF4(7326, 18816, 25353) }, + { AOM_CDF4(3933, 10907, 16616) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { 
AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } } } }, + { { { { AOM_CDF4(8896, 16227, 20630) }, + { AOM_CDF4(23629, 31782, 32527) }, + { AOM_CDF4(15173, 27755, 31321) }, + { AOM_CDF4(10158, 21233, 27382) }, + { AOM_CDF4(6420, 14857, 21558) }, + { AOM_CDF4(3269, 8155, 12646) }, + { AOM_CDF4(24835, 32009, 32496) }, + { AOM_CDF4(16509, 28421, 31579) }, + { AOM_CDF4(10957, 21514, 27418) }, + { AOM_CDF4(7881, 15930, 22096) }, + { AOM_CDF4(5388, 10960, 15918) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(20745, 30773, 32093) }, + { AOM_CDF4(15200, 27221, 30861) }, + { AOM_CDF4(13032, 20873, 25667) }, + { AOM_CDF4(12285, 18663, 23494) }, + { AOM_CDF4(11563, 17481, 21489) }, + { AOM_CDF4(26260, 31982, 32320) }, + { AOM_CDF4(15397, 28083, 31100) }, + { AOM_CDF4(9742, 19217, 24824) }, + { AOM_CDF4(3261, 9629, 15362) }, + { AOM_CDF4(1480, 4322, 7499) }, + { AOM_CDF4(27599, 32256, 32460) }, + { AOM_CDF4(16857, 27659, 30774) }, + { AOM_CDF4(9551, 18290, 23748) }, + { AOM_CDF4(3052, 8933, 14103) }, + { AOM_CDF4(2021, 5910, 9787) }, + { AOM_CDF4(29005, 32015, 32392) }, + { AOM_CDF4(17677, 27694, 30863) }, + { AOM_CDF4(9204, 17356, 23219) }, + { AOM_CDF4(2403, 7516, 12814) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } }, 
+ { { AOM_CDF4(10808, 22056, 26896) }, + { AOM_CDF4(25739, 32313, 32676) }, + { AOM_CDF4(17288, 30203, 32221) }, + { AOM_CDF4(11359, 24878, 29896) }, + { AOM_CDF4(6949, 17767, 24893) }, + { AOM_CDF4(4287, 11796, 18071) }, + { AOM_CDF4(27880, 32521, 32705) }, + { AOM_CDF4(19038, 31004, 32414) }, + { AOM_CDF4(12564, 26345, 30768) }, + { AOM_CDF4(8269, 19947, 26779) }, + { AOM_CDF4(5674, 14657, 21674) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(25742, 32319, 32671) }, + { AOM_CDF4(19557, 31164, 32454) }, + { AOM_CDF4(13381, 26381, 30755) }, + { AOM_CDF4(10101, 21466, 26722) }, + { AOM_CDF4(9209, 19650, 26825) }, + { AOM_CDF4(27107, 31917, 32432) }, + { AOM_CDF4(18056, 28893, 31203) }, + { AOM_CDF4(10200, 21434, 26764) }, + { AOM_CDF4(4660, 12913, 19502) }, + { AOM_CDF4(2368, 6930, 12504) }, + { AOM_CDF4(26960, 32158, 32613) }, + { AOM_CDF4(18628, 30005, 32031) }, + { AOM_CDF4(10233, 22442, 28232) }, + { AOM_CDF4(5471, 14630, 21516) }, + { AOM_CDF4(3235, 10767, 17109) }, + { AOM_CDF4(27696, 32440, 32692) }, + { AOM_CDF4(20032, 31167, 32438) }, + { AOM_CDF4(8700, 21341, 28442) }, + { AOM_CDF4(5662, 14831, 21795) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } } }, + { { { AOM_CDF4(9704, 17294, 21132) }, + { AOM_CDF4(26762, 32278, 32633) }, + { AOM_CDF4(18382, 29620, 31819) }, + { AOM_CDF4(10891, 23475, 28723) }, + { AOM_CDF4(6358, 16583, 23309) }, + { AOM_CDF4(3248, 9118, 14141) }, + { AOM_CDF4(27204, 32573, 32699) }, + { AOM_CDF4(19818, 30824, 32329) }, + { AOM_CDF4(11772, 25120, 30041) }, + { AOM_CDF4(6995, 18033, 25039) }, + { AOM_CDF4(3752, 10442, 16098) }, + { AOM_CDF4(27222, 32256, 32559) }, + { AOM_CDF4(15356, 
28399, 31475) }, + { AOM_CDF4(8821, 20635, 27057) }, + { AOM_CDF4(5511, 14404, 21239) }, + { AOM_CDF4(2935, 8222, 13051) }, + { AOM_CDF4(24875, 32120, 32529) }, + { AOM_CDF4(15233, 28265, 31445) }, + { AOM_CDF4(8605, 20570, 26932) }, + { AOM_CDF4(5431, 14413, 21196) }, + { AOM_CDF4(2994, 8341, 13223) }, + { AOM_CDF4(28201, 32604, 32700) }, + { AOM_CDF4(21041, 31446, 32456) }, + { AOM_CDF4(13221, 26213, 30475) }, + { AOM_CDF4(8255, 19385, 26037) }, + { AOM_CDF4(4930, 12585, 18830) }, + { AOM_CDF4(28768, 32448, 32627) }, + { AOM_CDF4(19705, 30561, 32021) }, + { AOM_CDF4(11572, 23589, 28220) }, + { AOM_CDF4(5532, 15034, 21446) }, + { AOM_CDF4(2460, 7150, 11456) }, + { AOM_CDF4(29874, 32619, 32699) }, + { AOM_CDF4(21621, 31071, 32201) }, + { AOM_CDF4(12511, 24747, 28992) }, + { AOM_CDF4(6281, 16395, 22748) }, + { AOM_CDF4(3246, 9278, 14497) }, + { AOM_CDF4(29715, 32625, 32712) }, + { AOM_CDF4(20958, 31011, 32283) }, + { AOM_CDF4(11233, 23671, 28806) }, + { AOM_CDF4(6012, 16128, 22868) }, + { AOM_CDF4(3427, 9851, 15414) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(11016, 22111, 26794) }, + { AOM_CDF4(25946, 32357, 32677) }, + { AOM_CDF4(17890, 30452, 32252) }, + { AOM_CDF4(11678, 25142, 29816) }, + { AOM_CDF4(6720, 17534, 24584) }, + { AOM_CDF4(4230, 11665, 17820) }, + { AOM_CDF4(28400, 32623, 32747) }, + { AOM_CDF4(21164, 31668, 32575) }, + { AOM_CDF4(13572, 27388, 31182) }, + { AOM_CDF4(8234, 20750, 27358) }, + { AOM_CDF4(5065, 14055, 20897) }, + { AOM_CDF4(28981, 32547, 32705) }, + { AOM_CDF4(18681, 30543, 32239) }, + { AOM_CDF4(10919, 24075, 29286) }, + { AOM_CDF4(6431, 17199, 24077) }, + { AOM_CDF4(3819, 10464, 16618) }, + { AOM_CDF4(26870, 32467, 32693) }, + { AOM_CDF4(19041, 30831, 32347) }, + { AOM_CDF4(11794, 25211, 30016) }, + { AOM_CDF4(6888, 18019, 24970) }, + { AOM_CDF4(4370, 12363, 18992) }, + { AOM_CDF4(29578, 32670, 32744) }, + { AOM_CDF4(23159, 32007, 32613) }, + { AOM_CDF4(15315, 28669, 31676) }, + { AOM_CDF4(9298, 22607, 28782) }, + { 
AOM_CDF4(6144, 15913, 22968) }, + { AOM_CDF4(28110, 32499, 32669) }, + { AOM_CDF4(21574, 30937, 32015) }, + { AOM_CDF4(12759, 24818, 28727) }, + { AOM_CDF4(6545, 16761, 23042) }, + { AOM_CDF4(3649, 10597, 16833) }, + { AOM_CDF4(28163, 32552, 32728) }, + { AOM_CDF4(22101, 31469, 32464) }, + { AOM_CDF4(13160, 25472, 30143) }, + { AOM_CDF4(7303, 18684, 25468) }, + { AOM_CDF4(5241, 13975, 20955) }, + { AOM_CDF4(28400, 32631, 32744) }, + { AOM_CDF4(22104, 31793, 32603) }, + { AOM_CDF4(13557, 26571, 30846) }, + { AOM_CDF4(7749, 19861, 26675) }, + { AOM_CDF4(4873, 14030, 21234) }, + { AOM_CDF4(8192, 16384, 24576) } } }, + { { { AOM_CDF4(9800, 17635, 21073) }, + { AOM_CDF4(26153, 31885, 32527) }, + { AOM_CDF4(15038, 27852, 31006) }, + { AOM_CDF4(8718, 20564, 26486) }, + { AOM_CDF4(5128, 14076, 20514) }, + { AOM_CDF4(2636, 7566, 11925) }, + { AOM_CDF4(27551, 32504, 32701) }, + { AOM_CDF4(18310, 30054, 32100) }, + { AOM_CDF4(10211, 23420, 29082) }, + { AOM_CDF4(6222, 16876, 23916) }, + { AOM_CDF4(3462, 9954, 15498) }, + { AOM_CDF4(29991, 32633, 32721) }, + { AOM_CDF4(19883, 30751, 32201) }, + { AOM_CDF4(11141, 24184, 29285) }, + { AOM_CDF4(6420, 16940, 23774) }, + { AOM_CDF4(3392, 9753, 15118) }, + { AOM_CDF4(28465, 32616, 32712) }, + { AOM_CDF4(19850, 30702, 32244) }, + { AOM_CDF4(10983, 24024, 29223) }, + { AOM_CDF4(6294, 16770, 23582) }, + { AOM_CDF4(3244, 9283, 14509) }, + { AOM_CDF4(30023, 32717, 32748) }, + { AOM_CDF4(22940, 32032, 32626) }, + { AOM_CDF4(14282, 27928, 31473) }, + { AOM_CDF4(8562, 21327, 27914) }, + { AOM_CDF4(4846, 13393, 19919) }, + { AOM_CDF4(29981, 32590, 32695) }, + { AOM_CDF4(20465, 30963, 32166) }, + { AOM_CDF4(11479, 23579, 28195) }, + { AOM_CDF4(5916, 15648, 22073) }, + { AOM_CDF4(3031, 8605, 13398) }, + { AOM_CDF4(31146, 32691, 32739) }, + { AOM_CDF4(23106, 31724, 32444) }, + { AOM_CDF4(13783, 26738, 30439) }, + { AOM_CDF4(7852, 19468, 25807) }, + { AOM_CDF4(3860, 11124, 16853) }, + { AOM_CDF4(31014, 32724, 32748) }, + { AOM_CDF4(23629, 32109, 
32628) }, + { AOM_CDF4(14747, 28115, 31403) }, + { AOM_CDF4(8545, 21242, 27478) }, + { AOM_CDF4(4574, 12781, 19067) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(9185, 19694, 24688) }, + { AOM_CDF4(26081, 31985, 32621) }, + { AOM_CDF4(16015, 29000, 31787) }, + { AOM_CDF4(10542, 23690, 29206) }, + { AOM_CDF4(6732, 17945, 24677) }, + { AOM_CDF4(3916, 11039, 16722) }, + { AOM_CDF4(28224, 32566, 32744) }, + { AOM_CDF4(19100, 31138, 32485) }, + { AOM_CDF4(12528, 26620, 30879) }, + { AOM_CDF4(7741, 20277, 26885) }, + { AOM_CDF4(4566, 12845, 18990) }, + { AOM_CDF4(29933, 32593, 32718) }, + { AOM_CDF4(17670, 30333, 32155) }, + { AOM_CDF4(10385, 23600, 28909) }, + { AOM_CDF4(6243, 16236, 22407) }, + { AOM_CDF4(3976, 10389, 16017) }, + { AOM_CDF4(28377, 32561, 32738) }, + { AOM_CDF4(19366, 31175, 32482) }, + { AOM_CDF4(13327, 27175, 31094) }, + { AOM_CDF4(8258, 20769, 27143) }, + { AOM_CDF4(4703, 13198, 19527) }, + { AOM_CDF4(31086, 32706, 32748) }, + { AOM_CDF4(22853, 31902, 32583) }, + { AOM_CDF4(14759, 28186, 31419) }, + { AOM_CDF4(9284, 22382, 28348) }, + { AOM_CDF4(5585, 15192, 21868) }, + { AOM_CDF4(28291, 32652, 32746) }, + { AOM_CDF4(19849, 32107, 32571) }, + { AOM_CDF4(14834, 26818, 29214) }, + { AOM_CDF4(10306, 22594, 28672) }, + { AOM_CDF4(6615, 17384, 23384) }, + { AOM_CDF4(28947, 32604, 32745) }, + { AOM_CDF4(25625, 32289, 32646) }, + { AOM_CDF4(18758, 28672, 31403) }, + { AOM_CDF4(10017, 23430, 28523) }, + { AOM_CDF4(6862, 15269, 22131) }, + { AOM_CDF4(23933, 32509, 32739) }, + { AOM_CDF4(19927, 31495, 32631) }, + { AOM_CDF4(11903, 26023, 30621) }, + { AOM_CDF4(7026, 20094, 27252) }, + { AOM_CDF4(5998, 18106, 24437) }, + { AOM_CDF4(8192, 16384, 24576) } } }, + { { { AOM_CDF4(4456, 11274, 15533) }, + { AOM_CDF4(21219, 29079, 31616) }, + { AOM_CDF4(11173, 23774, 28567) }, + { AOM_CDF4(7282, 18293, 24263) }, + { AOM_CDF4(4890, 13286, 19115) }, + { AOM_CDF4(1890, 5508, 8659) }, + { AOM_CDF4(26651, 32136, 32647) }, + { AOM_CDF4(14630, 28254, 31455) }, + { 
AOM_CDF4(8716, 21287, 27395) }, + { AOM_CDF4(5615, 15331, 22008) }, + { AOM_CDF4(2675, 7700, 12150) }, + { AOM_CDF4(29954, 32526, 32690) }, + { AOM_CDF4(16126, 28982, 31633) }, + { AOM_CDF4(9030, 21361, 27352) }, + { AOM_CDF4(5411, 14793, 21271) }, + { AOM_CDF4(2943, 8422, 13163) }, + { AOM_CDF4(29539, 32601, 32730) }, + { AOM_CDF4(18125, 30385, 32201) }, + { AOM_CDF4(10422, 24090, 29468) }, + { AOM_CDF4(6468, 17487, 24438) }, + { AOM_CDF4(2970, 8653, 13531) }, + { AOM_CDF4(30912, 32715, 32748) }, + { AOM_CDF4(20666, 31373, 32497) }, + { AOM_CDF4(12509, 26640, 30917) }, + { AOM_CDF4(8058, 20629, 27290) }, + { AOM_CDF4(4231, 12006, 18052) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(10202, 20633, 25484) }, + { AOM_CDF4(27336, 31445, 32352) }, + { AOM_CDF4(12420, 24384, 28552) }, + { AOM_CDF4(7648, 18115, 23856) }, + { AOM_CDF4(5662, 14341, 19902) }, + { AOM_CDF4(3611, 10328, 15390) }, + { AOM_CDF4(30945, 32616, 32736) }, + { AOM_CDF4(18682, 30505, 32253) }, + { AOM_CDF4(11513, 25336, 30203) }, + { AOM_CDF4(7449, 19452, 26148) }, + { AOM_CDF4(4482, 13051, 18886) }, + { AOM_CDF4(32022, 32690, 32747) }, + { AOM_CDF4(18578, 30501, 32146) }, + { AOM_CDF4(11249, 23368, 28631) }, + { AOM_CDF4(5645, 16958, 22158) }, + { AOM_CDF4(5009, 11444, 16637) }, + { AOM_CDF4(31357, 32710, 32748) }, + { AOM_CDF4(21552, 31494, 32504) }, + { AOM_CDF4(13891, 27677, 31340) }, + { AOM_CDF4(9051, 22098, 28172) }, + { AOM_CDF4(5190, 13377, 19486) }, + 
{ AOM_CDF4(32364, 32740, 32748) }, + { AOM_CDF4(24839, 31907, 32551) }, + { AOM_CDF4(17160, 28779, 31696) }, + { AOM_CDF4(12452, 24137, 29602) }, + { AOM_CDF4(6165, 15389, 22477) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } } }, + { { { AOM_CDF4(2575, 7281, 11077) }, + { AOM_CDF4(14002, 20866, 25402) }, + { AOM_CDF4(6343, 15056, 19658) }, + { AOM_CDF4(4474, 11858, 17041) }, + { AOM_CDF4(2865, 8299, 12534) }, + { AOM_CDF4(1344, 3949, 6391) }, + { AOM_CDF4(24720, 31239, 32459) }, + { AOM_CDF4(12585, 25356, 29968) }, + { AOM_CDF4(7181, 18246, 24444) }, + { AOM_CDF4(5025, 13667, 19885) }, + { AOM_CDF4(2521, 7304, 11605) }, + { AOM_CDF4(29908, 32252, 32584) }, + { AOM_CDF4(17421, 29156, 31575) }, + { AOM_CDF4(9889, 22188, 27782) }, + { AOM_CDF4(5878, 15647, 22123) }, + { AOM_CDF4(2814, 8665, 13323) }, + { AOM_CDF4(30183, 32568, 32713) }, + { AOM_CDF4(18528, 30195, 32049) }, + { AOM_CDF4(10982, 24606, 29657) }, + { AOM_CDF4(6957, 18165, 25231) }, + { AOM_CDF4(3508, 10118, 15468) }, + { AOM_CDF4(31761, 32736, 32748) }, + { AOM_CDF4(21041, 31328, 32546) }, + { AOM_CDF4(12568, 26732, 31166) }, + { AOM_CDF4(8052, 20720, 27733) }, + { AOM_CDF4(4336, 12192, 18396) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { 
AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } } } }, + { { { { AOM_CDF4(7062, 16472, 22319) }, + { AOM_CDF4(24538, 32261, 32674) }, + { AOM_CDF4(13675, 28041, 31779) }, + { AOM_CDF4(8590, 20674, 27631) }, + { AOM_CDF4(5685, 14675, 22013) }, + { 
AOM_CDF4(3655, 9898, 15731) }, + { AOM_CDF4(26493, 32418, 32658) }, + { AOM_CDF4(16376, 29342, 32090) }, + { AOM_CDF4(10594, 22649, 28970) }, + { AOM_CDF4(8176, 17170, 24303) }, + { AOM_CDF4(5605, 12694, 19139) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(23888, 31902, 32542) }, + { AOM_CDF4(18612, 29687, 31987) }, + { AOM_CDF4(16245, 24852, 29249) }, + { AOM_CDF4(15765, 22608, 27559) }, + { AOM_CDF4(19895, 24699, 27510) }, + { AOM_CDF4(28401, 32212, 32457) }, + { AOM_CDF4(15274, 27825, 30980) }, + { AOM_CDF4(9364, 18128, 24332) }, + { AOM_CDF4(2283, 8193, 15082) }, + { AOM_CDF4(1228, 3972, 7881) }, + { AOM_CDF4(29455, 32469, 32620) }, + { AOM_CDF4(17981, 28245, 31388) }, + { AOM_CDF4(10921, 20098, 26240) }, + { AOM_CDF4(3743, 11829, 18657) }, + { AOM_CDF4(2374, 9593, 15715) }, + { AOM_CDF4(31068, 32466, 32635) }, + { AOM_CDF4(20321, 29572, 31971) }, + { AOM_CDF4(10771, 20255, 27119) }, + { AOM_CDF4(2795, 10410, 17361) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(9320, 22102, 27840) }, + { AOM_CDF4(27057, 32464, 32724) }, + { AOM_CDF4(16331, 30268, 32309) }, + { AOM_CDF4(10319, 23935, 29720) }, + { AOM_CDF4(6189, 16448, 24106) }, + { AOM_CDF4(3589, 10884, 18808) }, + { AOM_CDF4(29026, 32624, 32748) }, + { AOM_CDF4(19226, 31507, 32587) }, + { AOM_CDF4(12692, 26921, 31203) }, + { AOM_CDF4(7049, 19532, 27635) }, + { AOM_CDF4(7727, 15669, 23252) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + 
{ AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(28056, 32625, 32748) }, + { AOM_CDF4(22383, 32075, 32669) }, + { AOM_CDF4(15417, 27098, 31749) }, + { AOM_CDF4(18127, 26493, 27190) }, + { AOM_CDF4(5461, 16384, 21845) }, + { AOM_CDF4(27982, 32091, 32584) }, + { AOM_CDF4(19045, 29868, 31972) }, + { AOM_CDF4(10397, 22266, 27932) }, + { AOM_CDF4(5990, 13697, 21500) }, + { AOM_CDF4(1792, 6912, 15104) }, + { AOM_CDF4(28198, 32501, 32718) }, + { AOM_CDF4(21534, 31521, 32569) }, + { AOM_CDF4(11109, 25217, 30017) }, + { AOM_CDF4(5671, 15124, 26151) }, + { AOM_CDF4(4681, 14043, 18725) }, + { AOM_CDF4(28688, 32580, 32741) }, + { AOM_CDF4(22576, 32079, 32661) }, + { AOM_CDF4(10627, 22141, 28340) }, + { AOM_CDF4(9362, 14043, 28087) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } } }, + { { { AOM_CDF4(7754, 16948, 22142) }, + { AOM_CDF4(25670, 32330, 32691) }, + { AOM_CDF4(15663, 29225, 31994) }, + { AOM_CDF4(9878, 23288, 29158) }, + { AOM_CDF4(6419, 17088, 24336) }, + { AOM_CDF4(3859, 11003, 17039) }, + { AOM_CDF4(27562, 32595, 32725) }, + { AOM_CDF4(17575, 30588, 32399) }, + { AOM_CDF4(10819, 24838, 30309) }, + { AOM_CDF4(7124, 18686, 25916) }, + { AOM_CDF4(4479, 12688, 19340) }, + { AOM_CDF4(28385, 32476, 32673) }, + { AOM_CDF4(15306, 29005, 31938) }, + { AOM_CDF4(8937, 21615, 28322) }, + { AOM_CDF4(5982, 15603, 22786) }, + { AOM_CDF4(3620, 10267, 16136) }, + { AOM_CDF4(27280, 32464, 32667) }, + { AOM_CDF4(15607, 29160, 32004) }, + { AOM_CDF4(9091, 22135, 28740) }, + { AOM_CDF4(6232, 16632, 24020) }, + { AOM_CDF4(4047, 11377, 17672) }, + { AOM_CDF4(29220, 32630, 32718) }, + { AOM_CDF4(19650, 31220, 32462) }, + { AOM_CDF4(13050, 26312, 30827) }, + { AOM_CDF4(9228, 20870, 27468) }, + { AOM_CDF4(6146, 15149, 21971) }, + { AOM_CDF4(30169, 32481, 32623) }, + { AOM_CDF4(17212, 29311, 31554) }, + { AOM_CDF4(9911, 21311, 26882) }, + { AOM_CDF4(4487, 13314, 20372) }, + { AOM_CDF4(2570, 
7772, 12889) }, + { AOM_CDF4(30924, 32613, 32708) }, + { AOM_CDF4(19490, 30206, 32107) }, + { AOM_CDF4(11232, 23998, 29276) }, + { AOM_CDF4(6769, 17955, 25035) }, + { AOM_CDF4(4398, 12623, 19214) }, + { AOM_CDF4(30609, 32627, 32722) }, + { AOM_CDF4(19370, 30582, 32287) }, + { AOM_CDF4(10457, 23619, 29409) }, + { AOM_CDF4(6443, 17637, 24834) }, + { AOM_CDF4(4645, 13236, 20106) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(8626, 20271, 26216) }, + { AOM_CDF4(26707, 32406, 32711) }, + { AOM_CDF4(16999, 30329, 32286) }, + { AOM_CDF4(11445, 25123, 30286) }, + { AOM_CDF4(6411, 18828, 25601) }, + { AOM_CDF4(6801, 12458, 20248) }, + { AOM_CDF4(29918, 32682, 32748) }, + { AOM_CDF4(20649, 31739, 32618) }, + { AOM_CDF4(12879, 27773, 31581) }, + { AOM_CDF4(7896, 21751, 28244) }, + { AOM_CDF4(5260, 14870, 23698) }, + { AOM_CDF4(29252, 32593, 32731) }, + { AOM_CDF4(17072, 30460, 32294) }, + { AOM_CDF4(10653, 24143, 29365) }, + { AOM_CDF4(6536, 17490, 23983) }, + { AOM_CDF4(4929, 13170, 20085) }, + { AOM_CDF4(28137, 32518, 32715) }, + { AOM_CDF4(18171, 30784, 32407) }, + { AOM_CDF4(11437, 25436, 30459) }, + { AOM_CDF4(7252, 18534, 26176) }, + { AOM_CDF4(4126, 13353, 20978) }, + { AOM_CDF4(31162, 32726, 32748) }, + { AOM_CDF4(23017, 32222, 32701) }, + { AOM_CDF4(15629, 29233, 32046) }, + { AOM_CDF4(9387, 22621, 29480) }, + { AOM_CDF4(6922, 17616, 25010) }, + { AOM_CDF4(28838, 32265, 32614) }, + { AOM_CDF4(19701, 30206, 31920) }, + { AOM_CDF4(11214, 22410, 27933) }, + { AOM_CDF4(5320, 14177, 23034) }, + { AOM_CDF4(5049, 12881, 17827) }, + { AOM_CDF4(27484, 32471, 32734) }, + { AOM_CDF4(21076, 31526, 32561) }, + { AOM_CDF4(12707, 26303, 31211) }, + { AOM_CDF4(8169, 21722, 28219) }, + { AOM_CDF4(6045, 19406, 27042) }, + { AOM_CDF4(27753, 32572, 32745) }, + { AOM_CDF4(20832, 31878, 32653) }, + { AOM_CDF4(13250, 27356, 31674) }, + { AOM_CDF4(7718, 21508, 29858) }, + { AOM_CDF4(7209, 18350, 25559) }, + { AOM_CDF4(8192, 16384, 24576) } } }, + { { { AOM_CDF4(7876, 16901, 21741) 
}, + { AOM_CDF4(24001, 31898, 32625) }, + { AOM_CDF4(14529, 27959, 31451) }, + { AOM_CDF4(8273, 20818, 27258) }, + { AOM_CDF4(5278, 14673, 21510) }, + { AOM_CDF4(2983, 8843, 14039) }, + { AOM_CDF4(28016, 32574, 32732) }, + { AOM_CDF4(17471, 30306, 32301) }, + { AOM_CDF4(10224, 24063, 29728) }, + { AOM_CDF4(6602, 17954, 25052) }, + { AOM_CDF4(4002, 11585, 17759) }, + { AOM_CDF4(30190, 32634, 32739) }, + { AOM_CDF4(17497, 30282, 32270) }, + { AOM_CDF4(10229, 23729, 29538) }, + { AOM_CDF4(6344, 17211, 24440) }, + { AOM_CDF4(3849, 11189, 17108) }, + { AOM_CDF4(28570, 32583, 32726) }, + { AOM_CDF4(17521, 30161, 32238) }, + { AOM_CDF4(10153, 23565, 29378) }, + { AOM_CDF4(6455, 17341, 24443) }, + { AOM_CDF4(3907, 11042, 17024) }, + { AOM_CDF4(30689, 32715, 32748) }, + { AOM_CDF4(21546, 31840, 32610) }, + { AOM_CDF4(13547, 27581, 31459) }, + { AOM_CDF4(8912, 21757, 28309) }, + { AOM_CDF4(5548, 15080, 22046) }, + { AOM_CDF4(30783, 32540, 32685) }, + { AOM_CDF4(17540, 29528, 31668) }, + { AOM_CDF4(10160, 21468, 26783) }, + { AOM_CDF4(4724, 13393, 20054) }, + { AOM_CDF4(2702, 8174, 13102) }, + { AOM_CDF4(31648, 32686, 32742) }, + { AOM_CDF4(20954, 31094, 32337) }, + { AOM_CDF4(12420, 25698, 30179) }, + { AOM_CDF4(7304, 19320, 26248) }, + { AOM_CDF4(4366, 12261, 18864) }, + { AOM_CDF4(31581, 32723, 32748) }, + { AOM_CDF4(21373, 31586, 32525) }, + { AOM_CDF4(12744, 26625, 30885) }, + { AOM_CDF4(7431, 20322, 26950) }, + { AOM_CDF4(4692, 13323, 20111) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(7833, 18369, 24095) }, + { AOM_CDF4(26650, 32273, 32702) }, + { AOM_CDF4(16371, 29961, 32191) }, + { AOM_CDF4(11055, 24082, 29629) }, + { AOM_CDF4(6892, 18644, 25400) }, + { AOM_CDF4(5006, 13057, 19240) }, + { AOM_CDF4(29834, 32666, 32748) }, + { AOM_CDF4(19577, 31335, 32570) }, + { AOM_CDF4(12253, 26509, 31122) }, + { AOM_CDF4(7991, 20772, 27711) }, + { AOM_CDF4(5677, 15910, 23059) }, + { AOM_CDF4(30109, 32532, 32720) }, + { AOM_CDF4(16747, 30166, 32252) }, + { AOM_CDF4(10134, 
23542, 29184) }, + { AOM_CDF4(5791, 16176, 23556) }, + { AOM_CDF4(4362, 10414, 17284) }, + { AOM_CDF4(29492, 32626, 32748) }, + { AOM_CDF4(19894, 31402, 32525) }, + { AOM_CDF4(12942, 27071, 30869) }, + { AOM_CDF4(8346, 21216, 27405) }, + { AOM_CDF4(6572, 17087, 23859) }, + { AOM_CDF4(32035, 32735, 32748) }, + { AOM_CDF4(22957, 31838, 32618) }, + { AOM_CDF4(14724, 28572, 31772) }, + { AOM_CDF4(10364, 23999, 29553) }, + { AOM_CDF4(7004, 18433, 25655) }, + { AOM_CDF4(27528, 32277, 32681) }, + { AOM_CDF4(16959, 31171, 32096) }, + { AOM_CDF4(10486, 23593, 27962) }, + { AOM_CDF4(8192, 16384, 23211) }, + { AOM_CDF4(8937, 17873, 20852) }, + { AOM_CDF4(27715, 32002, 32615) }, + { AOM_CDF4(15073, 29491, 31676) }, + { AOM_CDF4(11264, 24576, 28672) }, + { AOM_CDF4(2341, 18725, 23406) }, + { AOM_CDF4(7282, 18204, 25486) }, + { AOM_CDF4(28547, 32213, 32657) }, + { AOM_CDF4(20788, 29773, 32239) }, + { AOM_CDF4(6780, 21469, 30508) }, + { AOM_CDF4(5958, 14895, 23831) }, + { AOM_CDF4(16384, 21845, 27307) }, + { AOM_CDF4(8192, 16384, 24576) } } }, + { { { AOM_CDF4(5992, 14304, 19765) }, + { AOM_CDF4(22612, 31238, 32456) }, + { AOM_CDF4(13456, 27162, 31087) }, + { AOM_CDF4(8001, 20062, 26504) }, + { AOM_CDF4(5168, 14105, 20764) }, + { AOM_CDF4(2632, 7771, 12385) }, + { AOM_CDF4(27034, 32344, 32709) }, + { AOM_CDF4(15850, 29415, 31997) }, + { AOM_CDF4(9494, 22776, 28841) }, + { AOM_CDF4(6151, 16830, 23969) }, + { AOM_CDF4(3461, 10039, 15722) }, + { AOM_CDF4(30134, 32569, 32731) }, + { AOM_CDF4(15638, 29422, 31945) }, + { AOM_CDF4(9150, 21865, 28218) }, + { AOM_CDF4(5647, 15719, 22676) }, + { AOM_CDF4(3402, 9772, 15477) }, + { AOM_CDF4(28530, 32586, 32735) }, + { AOM_CDF4(17139, 30298, 32292) }, + { AOM_CDF4(10200, 24039, 29685) }, + { AOM_CDF4(6419, 17674, 24786) }, + { AOM_CDF4(3544, 10225, 15824) }, + { AOM_CDF4(31333, 32726, 32748) }, + { AOM_CDF4(20618, 31487, 32544) }, + { AOM_CDF4(12901, 27217, 31232) }, + { AOM_CDF4(8624, 21734, 28171) }, + { AOM_CDF4(5104, 14191, 20748) }, + { 
AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(11206, 21090, 26561) }, + { AOM_CDF4(28759, 32279, 32671) }, + { AOM_CDF4(14171, 27952, 31569) }, + { AOM_CDF4(9743, 22907, 29141) }, + { AOM_CDF4(6871, 17886, 24868) }, + { AOM_CDF4(4960, 13152, 19315) }, + { AOM_CDF4(31077, 32661, 32748) }, + { AOM_CDF4(19400, 31195, 32515) }, + { AOM_CDF4(12752, 26858, 31040) }, + { AOM_CDF4(8370, 22098, 28591) }, + { AOM_CDF4(5457, 15373, 22298) }, + { AOM_CDF4(31697, 32706, 32748) }, + { AOM_CDF4(17860, 30657, 32333) }, + { AOM_CDF4(12510, 24812, 29261) }, + { AOM_CDF4(6180, 19124, 24722) }, + { AOM_CDF4(5041, 13548, 17959) }, + { AOM_CDF4(31552, 32716, 32748) }, + { AOM_CDF4(21908, 31769, 32623) }, + { AOM_CDF4(14470, 28201, 31565) }, + { AOM_CDF4(9493, 22982, 28608) }, + { AOM_CDF4(6858, 17240, 24137) }, + { AOM_CDF4(32543, 32752, 32756) }, + { AOM_CDF4(24286, 32097, 32666) }, + { AOM_CDF4(15958, 29217, 32024) }, + { AOM_CDF4(10207, 24234, 29958) }, + { AOM_CDF4(6929, 18305, 25652) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { 
AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } } }, + { { { AOM_CDF4(4137, 10847, 15682) }, + { AOM_CDF4(17824, 27001, 30058) }, + { AOM_CDF4(10204, 22796, 28291) }, + { AOM_CDF4(6076, 15935, 22125) }, + { AOM_CDF4(3852, 10937, 16816) }, + { AOM_CDF4(2252, 6324, 10131) }, + { AOM_CDF4(25840, 32016, 32662) }, + { AOM_CDF4(15109, 28268, 31531) }, + { AOM_CDF4(9385, 22231, 28340) }, + { AOM_CDF4(6082, 16672, 23479) }, + { AOM_CDF4(3318, 9427, 14681) }, + { AOM_CDF4(30594, 32574, 32718) }, + { AOM_CDF4(16836, 29552, 31859) }, + { AOM_CDF4(9556, 22542, 28356) }, + { AOM_CDF4(6305, 16725, 23540) }, + { AOM_CDF4(3376, 9895, 15184) }, + { AOM_CDF4(29383, 32617, 32745) }, + { AOM_CDF4(18891, 30809, 32401) }, + { AOM_CDF4(11688, 25942, 30687) }, + { AOM_CDF4(7468, 19469, 26651) }, + { AOM_CDF4(3909, 11358, 17012) }, + { AOM_CDF4(31564, 32736, 32748) }, + { AOM_CDF4(20906, 31611, 32600) }, + { AOM_CDF4(13191, 27621, 31537) }, + { AOM_CDF4(8768, 22029, 28676) }, + { AOM_CDF4(5079, 14109, 20906) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + 
{ AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } } } } }; static const aom_cdf_prob av1_default_coeff_base_eob_multi_cdfs [TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES][SIG_COEF_CONTEXTS_EOB][CDF_SIZE( diff --git a/media/libaom/src/av1/common/txb_common.c b/media/libaom/src/av1/common/txb_common.c index c96d37cca..4eef319cd 100644 --- a/media/libaom/src/av1/common/txb_common.c +++ b/media/libaom/src/av1/common/txb_common.c @@ -9,7 +9,7 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include "aom/aom_integer.h" -#include "av1/common/onyxc_int.h" +#include "av1/common/av1_common_int.h" #include "av1/common/txb_common.h" const int8_t av1_coeff_band_4x4[16] = { 0, 1, 2, 3, 4, 5, 6, 7, @@ -453,23 +453,6 @@ const int8_t *av1_nz_map_ctx_offset[19] = { av1_nz_map_ctx_offset_64x32, // TX_64x16 }; -void av1_init_lv_map(AV1_COMMON *cm) { - LV_MAP_CTX_TABLE *coeff_ctx_table = &cm->coeff_ctx_table; - for (int row = 0; row < 2; ++row) { - for (int col = 0; col < 2; ++col) { - for (int sig_mag = 0; sig_mag < 3; ++sig_mag) { - for (int count = 0; count < BASE_CONTEXT_POSITION_NUM + 1; ++count) { - if (row == 0 && col == 0 && count > 5) continue; - if ((row == 0 || col == 0) && count > 8) continue; - - coeff_ctx_table->base_ctx_table[row][col][sig_mag][count] = - get_base_ctx_from_count_mag(row, col, count, sig_mag); - } - } - } - } -} - -const int16_t k_eob_group_start[12] = { 0, 1, 2, 3, 5, 9, - 17, 33, 65, 129, 257, 513 }; -const int16_t k_eob_offset_bits[12] = { 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 }; +const int16_t av1_eob_group_start[12] = { 0, 1, 2, 3, 5, 9, + 17, 33, 65, 129, 257, 513 }; +const int16_t av1_eob_offset_bits[12] = { 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 }; diff --git a/media/libaom/src/av1/common/txb_common.h b/media/libaom/src/av1/common/txb_common.h index 1dda51f8b..5a62fa89b 100644 --- a/media/libaom/src/av1/common/txb_common.h +++ b/media/libaom/src/av1/common/txb_common.h @@ -12,8 +12,10 @@ #ifndef AOM_AV1_COMMON_TXB_COMMON_H_ #define AOM_AV1_COMMON_TXB_COMMON_H_ -extern const int16_t k_eob_group_start[12]; -extern const int16_t k_eob_offset_bits[12]; +#include "av1/common/av1_common_int.h" + +extern const int16_t av1_eob_group_start[12]; +extern const int16_t av1_eob_offset_bits[12]; extern const int8_t av1_coeff_band_4x4[16]; @@ -157,6 +159,19 @@ static INLINE int get_br_ctx_2d(const uint8_t *const levels, return mag + 14; } +static AOM_FORCE_INLINE int get_br_ctx_eob(const int c, // raster order + const int bwl, + const TX_CLASS 
tx_class) { + const int row = c >> bwl; + const int col = c - (row << bwl); + if (c == 0) return 0; + if ((tx_class == TX_CLASS_2D && row < 2 && col < 2) || + (tx_class == TX_CLASS_HORIZ && col == 0) || + (tx_class == TX_CLASS_VERT && row == 0)) + return 7; + return 14; +} + static AOM_FORCE_INLINE int get_br_ctx(const uint8_t *const levels, const int c, // raster order const int bwl, const TX_CLASS tx_class) { @@ -270,12 +285,10 @@ static AOM_FORCE_INLINE int get_nz_map_ctx_from_stats( const int row = coeff_idx >> bwl; const int col = coeff_idx - (row << bwl); return ctx + nz_map_ctx_offset_1d[col]; - break; } case TX_CLASS_VERT: { const int row = coeff_idx >> bwl; return ctx + nz_map_ctx_offset_1d[row]; - break; } default: break; } @@ -373,7 +386,9 @@ static INLINE void get_txb_ctx(const BLOCK_SIZE plane_bsize, if (plane_bsize == txsize_to_bsize[tx_size]) { txb_ctx->txb_skip_ctx = 0; } else { - // This is the algorithm to generate table skip_contexts[min][max]. + // This is the algorithm to generate table skip_contexts[top][left]. + // const int max = AOMMIN(top | left, 4); + // const int min = AOMMIN(AOMMIN(top, left), 4); // if (!max) // txb_skip_ctx = 1; // else if (!min) @@ -385,10 +400,15 @@ static INLINE void get_txb_ctx(const BLOCK_SIZE plane_bsize, // else // txb_skip_ctx = 6; static const uint8_t skip_contexts[5][5] = { { 1, 2, 2, 2, 3 }, - { 1, 4, 4, 4, 5 }, - { 1, 4, 4, 4, 5 }, - { 1, 4, 4, 4, 5 }, - { 1, 4, 4, 4, 6 } }; + { 2, 4, 4, 4, 5 }, + { 2, 4, 4, 4, 5 }, + { 2, 4, 4, 4, 5 }, + { 3, 5, 5, 5, 6 } }; + // For top and left, we only care about which of the following three + // categories they belong to: { 0 }, { 1, 2, 3 }, or { 4, 5, ... }. The + // spec calculates top and left with the Max() function. We can calculate + // an approximate max with bitwise OR because the real max and the + // approximate max belong to the same category. 
int top = 0; int left = 0; @@ -397,16 +417,16 @@ static INLINE void get_txb_ctx(const BLOCK_SIZE plane_bsize, top |= a[k]; } while (++k < txb_w_unit); top &= COEFF_CONTEXT_MASK; + top = AOMMIN(top, 4); k = 0; do { left |= l[k]; } while (++k < txb_h_unit); left &= COEFF_CONTEXT_MASK; - const int max = AOMMIN(top | left, 4); - const int min = AOMMIN(AOMMIN(top, left), 4); + left = AOMMIN(left, 4); - txb_ctx->txb_skip_ctx = skip_contexts[min][max]; + txb_ctx->txb_skip_ctx = skip_contexts[top][left]; } } else { const int ctx_base = get_entropy_context(tx_size, a, l); @@ -419,6 +439,4 @@ static INLINE void get_txb_ctx(const BLOCK_SIZE plane_bsize, #undef MAX_TX_SIZE_UNIT } -void av1_init_lv_map(AV1_COMMON *cm); - #endif // AOM_AV1_COMMON_TXB_COMMON_H_ diff --git a/media/libaom/src/av1/common/warped_motion.c b/media/libaom/src/av1/common/warped_motion.c index 4144c4389..4e9fab9bd 100644 --- a/media/libaom/src/av1/common/warped_motion.c +++ b/media/libaom/src/av1/common/warped_motion.c @@ -20,85 +20,13 @@ #include "av1/common/warped_motion.h" #include "av1/common/scale.h" -#define WARP_ERROR_BLOCK 32 - -/* clang-format off */ -static const int error_measure_lut[512] = { - // pow 0.7 - 16384, 16339, 16294, 16249, 16204, 16158, 16113, 16068, - 16022, 15977, 15932, 15886, 15840, 15795, 15749, 15703, - 15657, 15612, 15566, 15520, 15474, 15427, 15381, 15335, - 15289, 15242, 15196, 15149, 15103, 15056, 15010, 14963, - 14916, 14869, 14822, 14775, 14728, 14681, 14634, 14587, - 14539, 14492, 14445, 14397, 14350, 14302, 14254, 14206, - 14159, 14111, 14063, 14015, 13967, 13918, 13870, 13822, - 13773, 13725, 13676, 13628, 13579, 13530, 13481, 13432, - 13383, 13334, 13285, 13236, 13187, 13137, 13088, 13038, - 12988, 12939, 12889, 12839, 12789, 12739, 12689, 12639, - 12588, 12538, 12487, 12437, 12386, 12335, 12285, 12234, - 12183, 12132, 12080, 12029, 11978, 11926, 11875, 11823, - 11771, 11719, 11667, 11615, 11563, 11511, 11458, 11406, - 11353, 11301, 11248, 11195, 11142, 11089, 11036, 
10982, - 10929, 10875, 10822, 10768, 10714, 10660, 10606, 10552, - 10497, 10443, 10388, 10333, 10279, 10224, 10168, 10113, - 10058, 10002, 9947, 9891, 9835, 9779, 9723, 9666, - 9610, 9553, 9497, 9440, 9383, 9326, 9268, 9211, - 9153, 9095, 9037, 8979, 8921, 8862, 8804, 8745, - 8686, 8627, 8568, 8508, 8449, 8389, 8329, 8269, - 8208, 8148, 8087, 8026, 7965, 7903, 7842, 7780, - 7718, 7656, 7593, 7531, 7468, 7405, 7341, 7278, - 7214, 7150, 7086, 7021, 6956, 6891, 6826, 6760, - 6695, 6628, 6562, 6495, 6428, 6361, 6293, 6225, - 6157, 6089, 6020, 5950, 5881, 5811, 5741, 5670, - 5599, 5527, 5456, 5383, 5311, 5237, 5164, 5090, - 5015, 4941, 4865, 4789, 4713, 4636, 4558, 4480, - 4401, 4322, 4242, 4162, 4080, 3998, 3916, 3832, - 3748, 3663, 3577, 3490, 3402, 3314, 3224, 3133, - 3041, 2948, 2854, 2758, 2661, 2562, 2461, 2359, - 2255, 2148, 2040, 1929, 1815, 1698, 1577, 1452, - 1323, 1187, 1045, 894, 731, 550, 339, 0, - 339, 550, 731, 894, 1045, 1187, 1323, 1452, - 1577, 1698, 1815, 1929, 2040, 2148, 2255, 2359, - 2461, 2562, 2661, 2758, 2854, 2948, 3041, 3133, - 3224, 3314, 3402, 3490, 3577, 3663, 3748, 3832, - 3916, 3998, 4080, 4162, 4242, 4322, 4401, 4480, - 4558, 4636, 4713, 4789, 4865, 4941, 5015, 5090, - 5164, 5237, 5311, 5383, 5456, 5527, 5599, 5670, - 5741, 5811, 5881, 5950, 6020, 6089, 6157, 6225, - 6293, 6361, 6428, 6495, 6562, 6628, 6695, 6760, - 6826, 6891, 6956, 7021, 7086, 7150, 7214, 7278, - 7341, 7405, 7468, 7531, 7593, 7656, 7718, 7780, - 7842, 7903, 7965, 8026, 8087, 8148, 8208, 8269, - 8329, 8389, 8449, 8508, 8568, 8627, 8686, 8745, - 8804, 8862, 8921, 8979, 9037, 9095, 9153, 9211, - 9268, 9326, 9383, 9440, 9497, 9553, 9610, 9666, - 9723, 9779, 9835, 9891, 9947, 10002, 10058, 10113, - 10168, 10224, 10279, 10333, 10388, 10443, 10497, 10552, - 10606, 10660, 10714, 10768, 10822, 10875, 10929, 10982, - 11036, 11089, 11142, 11195, 11248, 11301, 11353, 11406, - 11458, 11511, 11563, 11615, 11667, 11719, 11771, 11823, - 11875, 11926, 11978, 12029, 12080, 12132, 12183, 
12234, - 12285, 12335, 12386, 12437, 12487, 12538, 12588, 12639, - 12689, 12739, 12789, 12839, 12889, 12939, 12988, 13038, - 13088, 13137, 13187, 13236, 13285, 13334, 13383, 13432, - 13481, 13530, 13579, 13628, 13676, 13725, 13773, 13822, - 13870, 13918, 13967, 14015, 14063, 14111, 14159, 14206, - 14254, 14302, 14350, 14397, 14445, 14492, 14539, 14587, - 14634, 14681, 14728, 14775, 14822, 14869, 14916, 14963, - 15010, 15056, 15103, 15149, 15196, 15242, 15289, 15335, - 15381, 15427, 15474, 15520, 15566, 15612, 15657, 15703, - 15749, 15795, 15840, 15886, 15932, 15977, 16022, 16068, - 16113, 16158, 16204, 16249, 16294, 16339, 16384, 16384, -}; -/* clang-format on */ - // For warping, we really use a 6-tap filter, but we do blocks of 8 pixels // at a time. The zoom/rotation/shear in the model are applied to the // "fractional" position of each pixel, which therefore varies within // [-1, 2) * WARPEDPIXEL_PREC_SHIFTS. // We need an extra 2 taps to fit this in, for a total of 8 taps. /* clang-format off */ -const int16_t warped_filter[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8] = { +const int16_t av1_warped_filter[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8] = { #if WARPEDPIXEL_PREC_BITS == 6 // [-1, 0) { 0, 0, 127, 1, 0, 0, 0, 0 }, { 0, - 1, 127, 2, 0, 0, 0, 0 }, @@ -345,7 +273,7 @@ static int is_affine_shear_allowed(int16_t alpha, int16_t beta, int16_t gamma, } // Returns 1 on success or 0 on an invalid affine set -int get_shear_params(WarpedMotionParams *wm) { +int av1_get_shear_params(WarpedMotionParams *wm) { const int32_t *mat = wm->wmmat; if (!is_affine_valid(wm)) return 0; wm->alpha = @@ -376,6 +304,7 @@ int get_shear_params(WarpedMotionParams *wm) { return 1; } +#if CONFIG_AV1_HIGHBITDEPTH static INLINE int highbd_error_measure(int err, int bd) { const int b = bd - 8; const int bmask = (1 << b) - 1; @@ -447,7 +376,7 @@ void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref, const int offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) + WARPEDPIXEL_PREC_SHIFTS; 
assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3); - const int16_t *coeffs = warped_filter[offs]; + const int16_t *coeffs = av1_warped_filter[offs]; int32_t sum = 1 << offset_bits_horiz; for (int m = 0; m < 8; ++m) { @@ -468,7 +397,7 @@ void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref, const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) + WARPEDPIXEL_PREC_SHIFTS; assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3); - const int16_t *coeffs = warped_filter[offs]; + const int16_t *coeffs = av1_warped_filter[offs]; int32_t sum = 1 << offset_bits_vert; for (int m = 0; m < 8; ++m) { @@ -485,7 +414,7 @@ void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref, uint16_t *dst16 = &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)]; int32_t tmp32 = *p; - if (conv_params->use_jnt_comp_avg) { + if (conv_params->use_dist_wtd_comp_avg) { tmp32 = tmp32 * conv_params->fwd_offset + sum * conv_params->bck_offset; tmp32 = tmp32 >> DIST_PRECISION_BITS; @@ -514,12 +443,11 @@ void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref, } } -static void highbd_warp_plane(WarpedMotionParams *wm, const uint8_t *const ref8, - int width, int height, int stride, - const uint8_t *const pred8, int p_col, int p_row, - int p_width, int p_height, int p_stride, - int subsampling_x, int subsampling_y, int bd, - ConvolveParams *conv_params) { +void highbd_warp_plane(WarpedMotionParams *wm, const uint16_t *const ref, + int width, int height, int stride, uint16_t *const pred, + int p_col, int p_row, int p_width, int p_height, + int p_stride, int subsampling_x, int subsampling_y, + int bd, ConvolveParams *conv_params) { assert(wm->wmtype <= AFFINE); if (wm->wmtype == ROTZOOM) { wm->wmmat[5] = wm->wmmat[2]; @@ -531,17 +459,15 @@ static void highbd_warp_plane(WarpedMotionParams *wm, const uint8_t *const ref8, const int16_t gamma = wm->gamma; const int16_t delta = wm->delta; - const uint16_t *const ref = CONVERT_TO_SHORTPTR(ref8); - 
uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); av1_highbd_warp_affine(mat, ref, width, height, stride, pred, p_col, p_row, p_width, p_height, p_stride, subsampling_x, subsampling_y, bd, conv_params, alpha, beta, gamma, delta); } -static int64_t highbd_frame_error(const uint16_t *const ref, int stride, - const uint16_t *const dst, int p_width, - int p_height, int p_stride, int bd) { +int64_t av1_calc_highbd_frame_error(const uint16_t *const ref, int stride, + const uint16_t *const dst, int p_width, + int p_height, int p_stride, int bd) { int64_t sum_error = 0; for (int i = 0; i < p_height; ++i) { for (int j = 0; j < p_width; ++j) { @@ -552,41 +478,33 @@ static int64_t highbd_frame_error(const uint16_t *const ref, int stride, return sum_error; } -static int64_t highbd_warp_error( - WarpedMotionParams *wm, const uint8_t *const ref8, int width, int height, - int stride, const uint8_t *const dst8, int p_col, int p_row, int p_width, - int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, - int64_t best_error) { - int64_t gm_sumerr = 0; +static int64_t highbd_segmented_frame_error( + const uint16_t *const ref, int stride, const uint16_t *const dst, + int p_width, int p_height, int p_stride, int bd, uint8_t *segment_map, + int segment_map_stride) { + int patch_w, patch_h; const int error_bsize_w = AOMMIN(p_width, WARP_ERROR_BLOCK); const int error_bsize_h = AOMMIN(p_height, WARP_ERROR_BLOCK); - uint16_t tmp[WARP_ERROR_BLOCK * WARP_ERROR_BLOCK]; - - ConvolveParams conv_params = get_conv_params(0, 0, bd); - conv_params.use_jnt_comp_avg = 0; - for (int i = p_row; i < p_row + p_height; i += WARP_ERROR_BLOCK) { - for (int j = p_col; j < p_col + p_width; j += WARP_ERROR_BLOCK) { - // avoid warping extra 8x8 blocks in the padded region of the frame - // when p_width and p_height are not multiples of WARP_ERROR_BLOCK - const int warp_w = AOMMIN(error_bsize_w, p_col + p_width - j); - const int warp_h = AOMMIN(error_bsize_h, p_row + p_height - i); - 
highbd_warp_plane(wm, ref8, width, height, stride, - CONVERT_TO_BYTEPTR(tmp), j, i, warp_w, warp_h, - WARP_ERROR_BLOCK, subsampling_x, subsampling_y, bd, - &conv_params); - - gm_sumerr += highbd_frame_error( - tmp, WARP_ERROR_BLOCK, CONVERT_TO_SHORTPTR(dst8) + j + i * p_stride, - warp_w, warp_h, p_stride, bd); - if (gm_sumerr > best_error) return gm_sumerr; + int64_t sum_error = 0; + for (int i = 0; i < p_height; i += WARP_ERROR_BLOCK) { + for (int j = 0; j < p_width; j += WARP_ERROR_BLOCK) { + int seg_x = j >> WARP_ERROR_BLOCK_LOG; + int seg_y = i >> WARP_ERROR_BLOCK_LOG; + // Only compute the error if this block contains inliers from the motion + // model + if (!segment_map[seg_y * segment_map_stride + seg_x]) continue; + + // avoid computing error into the frame padding + patch_w = AOMMIN(error_bsize_w, p_width - j); + patch_h = AOMMIN(error_bsize_h, p_height - i); + sum_error += av1_calc_highbd_frame_error(ref + j + i * stride, stride, + dst + j + i * p_stride, patch_w, + patch_h, p_stride, bd); } } - return gm_sumerr; -} - -static INLINE int error_measure(int err) { - return error_measure_lut[255 + err]; + return sum_error; } +#endif // CONFIG_AV1_HIGHBITDEPTH /* The warp filter for ROTZOOM and AFFINE models works as follows: * Split the input into 8x8 blocks @@ -732,7 +650,7 @@ void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width, const int offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) + WARPEDPIXEL_PREC_SHIFTS; assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3); - const int16_t *coeffs = warped_filter[offs]; + const int16_t *coeffs = av1_warped_filter[offs]; int32_t sum = 1 << offset_bits_horiz; for (int m = 0; m < 8; ++m) { @@ -756,7 +674,7 @@ void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width, const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) + WARPEDPIXEL_PREC_SHIFTS; assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3); - const int16_t *coeffs = warped_filter[offs]; + const int16_t 
*coeffs = av1_warped_filter[offs]; int32_t sum = 1 << offset_bits_vert; for (int m = 0; m < 8; ++m) { @@ -773,7 +691,7 @@ void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width, uint8_t *dst8 = &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)]; int32_t tmp32 = *p; - if (conv_params->use_jnt_comp_avg) { + if (conv_params->use_dist_wtd_comp_avg) { tmp32 = tmp32 * conv_params->fwd_offset + sum * conv_params->bck_offset; tmp32 = tmp32 >> DIST_PRECISION_BITS; @@ -801,11 +719,10 @@ void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width, } } -static void warp_plane(WarpedMotionParams *wm, const uint8_t *const ref, - int width, int height, int stride, uint8_t *pred, - int p_col, int p_row, int p_width, int p_height, - int p_stride, int subsampling_x, int subsampling_y, - ConvolveParams *conv_params) { +void warp_plane(WarpedMotionParams *wm, const uint8_t *const ref, int width, + int height, int stride, uint8_t *pred, int p_col, int p_row, + int p_width, int p_height, int p_stride, int subsampling_x, + int subsampling_y, ConvolveParams *conv_params) { assert(wm->wmtype <= AFFINE); if (wm->wmtype == ROTZOOM) { wm->wmmat[5] = wm->wmmat[2]; @@ -821,9 +738,9 @@ static void warp_plane(WarpedMotionParams *wm, const uint8_t *const ref, alpha, beta, gamma, delta); } -static int64_t frame_error(const uint8_t *const ref, int stride, - const uint8_t *const dst, int p_width, int p_height, - int p_stride) { +int64_t av1_calc_frame_error_c(const uint8_t *const ref, int stride, + const uint8_t *const dst, int p_width, + int p_height, int p_stride) { int64_t sum_error = 0; for (int i = 0; i < p_height; ++i) { for (int j = 0; j < p_width; ++j) { @@ -834,61 +751,64 @@ static int64_t frame_error(const uint8_t *const ref, int stride, return sum_error; } -static int64_t warp_error(WarpedMotionParams *wm, const uint8_t *const ref, - int width, int height, int stride, - const uint8_t *const dst, int p_col, int p_row, - int p_width, int p_height, int 
p_stride, - int subsampling_x, int subsampling_y, - int64_t best_error) { - int64_t gm_sumerr = 0; - int warp_w, warp_h; - int error_bsize_w = AOMMIN(p_width, WARP_ERROR_BLOCK); - int error_bsize_h = AOMMIN(p_height, WARP_ERROR_BLOCK); - uint8_t tmp[WARP_ERROR_BLOCK * WARP_ERROR_BLOCK]; - ConvolveParams conv_params = get_conv_params(0, 0, 8); - conv_params.use_jnt_comp_avg = 0; - - for (int i = p_row; i < p_row + p_height; i += WARP_ERROR_BLOCK) { - for (int j = p_col; j < p_col + p_width; j += WARP_ERROR_BLOCK) { - // avoid warping extra 8x8 blocks in the padded region of the frame - // when p_width and p_height are not multiples of WARP_ERROR_BLOCK - warp_w = AOMMIN(error_bsize_w, p_col + p_width - j); - warp_h = AOMMIN(error_bsize_h, p_row + p_height - i); - warp_plane(wm, ref, width, height, stride, tmp, j, i, warp_w, warp_h, - WARP_ERROR_BLOCK, subsampling_x, subsampling_y, &conv_params); - - gm_sumerr += frame_error(tmp, WARP_ERROR_BLOCK, dst + j + i * p_stride, - warp_w, warp_h, p_stride); - if (gm_sumerr > best_error) return gm_sumerr; +static int64_t segmented_frame_error(const uint8_t *const ref, int stride, + const uint8_t *const dst, int p_width, + int p_height, int p_stride, + uint8_t *segment_map, + int segment_map_stride) { + int patch_w, patch_h; + const int error_bsize_w = AOMMIN(p_width, WARP_ERROR_BLOCK); + const int error_bsize_h = AOMMIN(p_height, WARP_ERROR_BLOCK); + int64_t sum_error = 0; + for (int i = 0; i < p_height; i += WARP_ERROR_BLOCK) { + for (int j = 0; j < p_width; j += WARP_ERROR_BLOCK) { + int seg_x = j >> WARP_ERROR_BLOCK_LOG; + int seg_y = i >> WARP_ERROR_BLOCK_LOG; + // Only compute the error if this block contains inliers from the motion + // model + if (!segment_map[seg_y * segment_map_stride + seg_x]) continue; + + // avoid computing error into the frame padding + patch_w = AOMMIN(error_bsize_w, p_width - j); + patch_h = AOMMIN(error_bsize_h, p_height - i); + sum_error += av1_calc_frame_error(ref + j + i * stride, stride, + 
dst + j + i * p_stride, patch_w, + patch_h, p_stride); } } - return gm_sumerr; + return sum_error; } int64_t av1_frame_error(int use_hbd, int bd, const uint8_t *ref, int stride, uint8_t *dst, int p_width, int p_height, int p_stride) { +#if CONFIG_AV1_HIGHBITDEPTH if (use_hbd) { - return highbd_frame_error(CONVERT_TO_SHORTPTR(ref), stride, - CONVERT_TO_SHORTPTR(dst), p_width, p_height, - p_stride, bd); + return av1_calc_highbd_frame_error(CONVERT_TO_SHORTPTR(ref), stride, + CONVERT_TO_SHORTPTR(dst), p_width, + p_height, p_stride, bd); } - return frame_error(ref, stride, dst, p_width, p_height, p_stride); +#endif + (void)use_hbd; + (void)bd; + return av1_calc_frame_error(ref, stride, dst, p_width, p_height, p_stride); } -int64_t av1_warp_error(WarpedMotionParams *wm, int use_hbd, int bd, - const uint8_t *ref, int width, int height, int stride, - uint8_t *dst, int p_col, int p_row, int p_width, - int p_height, int p_stride, int subsampling_x, - int subsampling_y, int64_t best_error) { - if (wm->wmtype <= AFFINE) - if (!get_shear_params(wm)) return 1; - if (use_hbd) - return highbd_warp_error(wm, ref, width, height, stride, dst, p_col, p_row, - p_width, p_height, p_stride, subsampling_x, - subsampling_y, bd, best_error); - return warp_error(wm, ref, width, height, stride, dst, p_col, p_row, p_width, - p_height, p_stride, subsampling_x, subsampling_y, - best_error); +int64_t av1_segmented_frame_error(int use_hbd, int bd, const uint8_t *ref, + int stride, uint8_t *dst, int p_width, + int p_height, int p_stride, + uint8_t *segment_map, + int segment_map_stride) { +#if CONFIG_AV1_HIGHBITDEPTH + if (use_hbd) { + return highbd_segmented_frame_error( + CONVERT_TO_SHORTPTR(ref), stride, CONVERT_TO_SHORTPTR(dst), p_width, + p_height, p_stride, bd, segment_map, segment_map_stride); + } +#endif + (void)use_hbd; + (void)bd; + return segmented_frame_error(ref, stride, dst, p_width, p_height, p_stride, + segment_map, segment_map_stride); } void av1_warp_plane(WarpedMotionParams *wm, 
int use_hbd, int bd, @@ -896,13 +816,21 @@ void av1_warp_plane(WarpedMotionParams *wm, int use_hbd, int bd, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params) { +#if CONFIG_AV1_HIGHBITDEPTH if (use_hbd) - highbd_warp_plane(wm, ref, width, height, stride, pred, p_col, p_row, - p_width, p_height, p_stride, subsampling_x, subsampling_y, - bd, conv_params); + highbd_warp_plane(wm, CONVERT_TO_SHORTPTR(ref), width, height, stride, + CONVERT_TO_SHORTPTR(pred), p_col, p_row, p_width, + p_height, p_stride, subsampling_x, subsampling_y, bd, + conv_params); else warp_plane(wm, ref, width, height, stride, pred, p_col, p_row, p_width, p_height, p_stride, subsampling_x, subsampling_y, conv_params); +#else + (void)use_hbd; + (void)bd; + warp_plane(wm, ref, width, height, stride, pred, p_col, p_row, p_width, + p_height, p_stride, subsampling_x, subsampling_y, conv_params); +#endif } #define LS_MV_MAX 256 // max mv in 1/8-pel @@ -1023,18 +951,15 @@ static int find_affine_int(int np, const int *pts1, const int *pts2, int32_t A[2][2] = { { 0, 0 }, { 0, 0 } }; int32_t Bx[2] = { 0, 0 }; int32_t By[2] = { 0, 0 }; - int i; const int bw = block_size_wide[bsize]; const int bh = block_size_high[bsize]; - const int rsuy = (AOMMAX(bh, MI_SIZE) / 2 - 1); - const int rsux = (AOMMAX(bw, MI_SIZE) / 2 - 1); + const int rsuy = bh / 2 - 1; + const int rsux = bw / 2 - 1; const int suy = rsuy * 8; const int sux = rsux * 8; const int duy = suy + mvy; const int dux = sux + mvx; - const int isuy = (mi_row * MI_SIZE + rsuy); - const int isux = (mi_col * MI_SIZE + rsux); // Assume the center pixel of the block has exactly the same motion vector // as transmitted for the block. 
First shift the origin of the source @@ -1059,7 +984,7 @@ static int find_affine_int(int np, const int *pts1, const int *pts2, // The loop below computes: A = P'P, Bx = P'q, By = P'r // We need to just compute inv(A).Bx and inv(A).By for the solutions. // Contribution from neighbor block - for (i = 0; i < np; i++) { + for (int i = 0; i < np; i++) { const int dx = pts2[i * 2] - dux; const int dy = pts2[i * 2 + 1] - duy; const int sx = pts1[i * 2] - sux; @@ -1087,13 +1012,12 @@ static int find_affine_int(int np, const int *pts1, const int *pts2, assert(By[0] >= LS_MAT_MIN && By[0] <= LS_MAT_MAX); assert(By[1] >= LS_MAT_MIN && By[1] <= LS_MAT_MAX); - int64_t Det; - int16_t iDet, shift; - // Compute Determinant of A - Det = (int64_t)A[0][0] * A[1][1] - (int64_t)A[0][1] * A[0][1]; + const int64_t Det = (int64_t)A[0][0] * A[1][1] - (int64_t)A[0][1] * A[0][1]; if (Det == 0) return 1; - iDet = resolve_divisor_64(llabs(Det), &shift) * (Det < 0 ? -1 : 1); + + int16_t shift; + int16_t iDet = resolve_divisor_64(llabs(Det), &shift) * (Det < 0 ? -1 : 1); shift -= WARPEDMODEL_PREC_BITS; if (shift < 0) { iDet <<= (-shift); @@ -1101,7 +1025,6 @@ static int find_affine_int(int np, const int *pts1, const int *pts2, } int64_t Px[2], Py[2]; - // These divided by the Det, are the least squares solutions Px[0] = (int64_t)A[1][1] * Bx[0] - (int64_t)A[0][1] * Bx[1]; Px[1] = -(int64_t)A[0][1] * Bx[0] + (int64_t)A[0][0] * Bx[1]; @@ -1113,16 +1036,18 @@ static int find_affine_int(int np, const int *pts1, const int *pts2, wm->wmmat[4] = get_mult_shift_ndiag(Py[0], iDet, shift); wm->wmmat[5] = get_mult_shift_diag(Py[1], iDet, shift); + const int isuy = (mi_row * MI_SIZE + rsuy); + const int isux = (mi_col * MI_SIZE + rsux); // Note: In the vx, vy expressions below, the max value of each of the // 2nd and 3rd terms are (2^16 - 1) * (2^13 - 1). That leaves enough room // for the first term so that the overall sum in the worst case fits // within 32 bits overall. 
- int32_t vx = mvx * (1 << (WARPEDMODEL_PREC_BITS - 3)) - - (isux * (wm->wmmat[2] - (1 << WARPEDMODEL_PREC_BITS)) + - isuy * wm->wmmat[3]); - int32_t vy = mvy * (1 << (WARPEDMODEL_PREC_BITS - 3)) - - (isux * wm->wmmat[4] + - isuy * (wm->wmmat[5] - (1 << WARPEDMODEL_PREC_BITS))); + const int32_t vx = mvx * (1 << (WARPEDMODEL_PREC_BITS - 3)) - + (isux * (wm->wmmat[2] - (1 << WARPEDMODEL_PREC_BITS)) + + isuy * wm->wmmat[3]); + const int32_t vy = mvy * (1 << (WARPEDMODEL_PREC_BITS - 3)) - + (isux * wm->wmmat[4] + + isuy * (wm->wmmat[5] - (1 << WARPEDMODEL_PREC_BITS))); wm->wmmat[0] = clamp(vx, -WARPEDMODEL_TRANS_CLAMP, WARPEDMODEL_TRANS_CLAMP - 1); wm->wmmat[1] = @@ -1132,9 +1057,9 @@ static int find_affine_int(int np, const int *pts1, const int *pts2, return 0; } -int find_projection(int np, int *pts1, int *pts2, BLOCK_SIZE bsize, int mvy, - int mvx, WarpedMotionParams *wm_params, int mi_row, - int mi_col) { +int av1_find_projection(int np, const int *pts1, const int *pts2, + BLOCK_SIZE bsize, int mvy, int mvx, + WarpedMotionParams *wm_params, int mi_row, int mi_col) { assert(wm_params->wmtype == AFFINE); if (find_affine_int(np, pts1, pts2, bsize, mvy, mvx, wm_params, mi_row, @@ -1142,7 +1067,7 @@ int find_projection(int np, int *pts1, int *pts2, BLOCK_SIZE bsize, int mvy, return 1; // check compatibility with the fast warp filter - if (!get_shear_params(wm_params)) return 1; + if (!av1_get_shear_params(wm_params)) return 1; return 0; } diff --git a/media/libaom/src/av1/common/warped_motion.h b/media/libaom/src/av1/common/warped_motion.h index a1a4f067d..14dc0fe47 100644 --- a/media/libaom/src/av1/common/warped_motion.h +++ b/media/libaom/src/av1/common/warped_motion.h @@ -31,8 +31,83 @@ #define SAMPLES_ARRAY_SIZE (LEAST_SQUARES_SAMPLES_MAX * 2) #define WARPED_MOTION_DEBUG 0 #define DEFAULT_WMTYPE AFFINE +#define WARP_ERROR_BLOCK_LOG 5 +#define WARP_ERROR_BLOCK (1 << WARP_ERROR_BLOCK_LOG) -extern const int16_t warped_filter[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8]; +extern 
const int16_t av1_warped_filter[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8]; + +DECLARE_ALIGNED(8, extern const int8_t, + av1_filter_8bit[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8]); + +/* clang-format off */ +static const int error_measure_lut[512] = { + // pow 0.7 + 16384, 16339, 16294, 16249, 16204, 16158, 16113, 16068, + 16022, 15977, 15932, 15886, 15840, 15795, 15749, 15703, + 15657, 15612, 15566, 15520, 15474, 15427, 15381, 15335, + 15289, 15242, 15196, 15149, 15103, 15056, 15010, 14963, + 14916, 14869, 14822, 14775, 14728, 14681, 14634, 14587, + 14539, 14492, 14445, 14397, 14350, 14302, 14254, 14206, + 14159, 14111, 14063, 14015, 13967, 13918, 13870, 13822, + 13773, 13725, 13676, 13628, 13579, 13530, 13481, 13432, + 13383, 13334, 13285, 13236, 13187, 13137, 13088, 13038, + 12988, 12939, 12889, 12839, 12789, 12739, 12689, 12639, + 12588, 12538, 12487, 12437, 12386, 12335, 12285, 12234, + 12183, 12132, 12080, 12029, 11978, 11926, 11875, 11823, + 11771, 11719, 11667, 11615, 11563, 11511, 11458, 11406, + 11353, 11301, 11248, 11195, 11142, 11089, 11036, 10982, + 10929, 10875, 10822, 10768, 10714, 10660, 10606, 10552, + 10497, 10443, 10388, 10333, 10279, 10224, 10168, 10113, + 10058, 10002, 9947, 9891, 9835, 9779, 9723, 9666, + 9610, 9553, 9497, 9440, 9383, 9326, 9268, 9211, + 9153, 9095, 9037, 8979, 8921, 8862, 8804, 8745, + 8686, 8627, 8568, 8508, 8449, 8389, 8329, 8269, + 8208, 8148, 8087, 8026, 7965, 7903, 7842, 7780, + 7718, 7656, 7593, 7531, 7468, 7405, 7341, 7278, + 7214, 7150, 7086, 7021, 6956, 6891, 6826, 6760, + 6695, 6628, 6562, 6495, 6428, 6361, 6293, 6225, + 6157, 6089, 6020, 5950, 5881, 5811, 5741, 5670, + 5599, 5527, 5456, 5383, 5311, 5237, 5164, 5090, + 5015, 4941, 4865, 4789, 4713, 4636, 4558, 4480, + 4401, 4322, 4242, 4162, 4080, 3998, 3916, 3832, + 3748, 3663, 3577, 3490, 3402, 3314, 3224, 3133, + 3041, 2948, 2854, 2758, 2661, 2562, 2461, 2359, + 2255, 2148, 2040, 1929, 1815, 1698, 1577, 1452, + 1323, 1187, 1045, 894, 731, 550, 339, 0, + 339, 550, 731, 894, 
1045, 1187, 1323, 1452, + 1577, 1698, 1815, 1929, 2040, 2148, 2255, 2359, + 2461, 2562, 2661, 2758, 2854, 2948, 3041, 3133, + 3224, 3314, 3402, 3490, 3577, 3663, 3748, 3832, + 3916, 3998, 4080, 4162, 4242, 4322, 4401, 4480, + 4558, 4636, 4713, 4789, 4865, 4941, 5015, 5090, + 5164, 5237, 5311, 5383, 5456, 5527, 5599, 5670, + 5741, 5811, 5881, 5950, 6020, 6089, 6157, 6225, + 6293, 6361, 6428, 6495, 6562, 6628, 6695, 6760, + 6826, 6891, 6956, 7021, 7086, 7150, 7214, 7278, + 7341, 7405, 7468, 7531, 7593, 7656, 7718, 7780, + 7842, 7903, 7965, 8026, 8087, 8148, 8208, 8269, + 8329, 8389, 8449, 8508, 8568, 8627, 8686, 8745, + 8804, 8862, 8921, 8979, 9037, 9095, 9153, 9211, + 9268, 9326, 9383, 9440, 9497, 9553, 9610, 9666, + 9723, 9779, 9835, 9891, 9947, 10002, 10058, 10113, + 10168, 10224, 10279, 10333, 10388, 10443, 10497, 10552, + 10606, 10660, 10714, 10768, 10822, 10875, 10929, 10982, + 11036, 11089, 11142, 11195, 11248, 11301, 11353, 11406, + 11458, 11511, 11563, 11615, 11667, 11719, 11771, 11823, + 11875, 11926, 11978, 12029, 12080, 12132, 12183, 12234, + 12285, 12335, 12386, 12437, 12487, 12538, 12588, 12639, + 12689, 12739, 12789, 12839, 12889, 12939, 12988, 13038, + 13088, 13137, 13187, 13236, 13285, 13334, 13383, 13432, + 13481, 13530, 13579, 13628, 13676, 13725, 13773, 13822, + 13870, 13918, 13967, 14015, 14063, 14111, 14159, 14206, + 14254, 14302, 14350, 14397, 14445, 14492, 14539, 14587, + 14634, 14681, 14728, 14775, 14822, 14869, 14916, 14963, + 15010, 15056, 15103, 15149, 15196, 15242, 15289, 15335, + 15381, 15427, 15474, 15520, 15566, 15612, 15657, 15703, + 15749, 15795, 15840, 15886, 15932, 15977, 16022, 16068, + 16113, 16158, 16204, 16249, 16294, 16339, 16384, 16384, +}; +/* clang-format on */ static const uint8_t warp_pad_left[14][16] = { { 1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, @@ -68,28 +143,44 @@ static const uint8_t warp_pad_right[14][16] = { { 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 } }; -// Returns the error between the 
result of applying motion 'wm' to the frame -// described by 'ref' and the frame described by 'dst'. -int64_t av1_warp_error(WarpedMotionParams *wm, int use_hbd, int bd, - const uint8_t *ref, int width, int height, int stride, - uint8_t *dst, int p_col, int p_row, int p_width, - int p_height, int p_stride, int subsampling_x, - int subsampling_y, int64_t best_error); +static INLINE int error_measure(int err) { + return error_measure_lut[255 + err]; +} // Returns the error between the frame described by 'ref' and the frame // described by 'dst'. int64_t av1_frame_error(int use_hbd, int bd, const uint8_t *ref, int stride, uint8_t *dst, int p_width, int p_height, int p_stride); +int64_t av1_segmented_frame_error(int use_hbd, int bd, const uint8_t *ref, + int stride, uint8_t *dst, int p_width, + int p_height, int p_stride, + uint8_t *segment_map, int segment_map_stride); + +int64_t av1_calc_highbd_frame_error(const uint16_t *const ref, int stride, + const uint16_t *const dst, int p_width, + int p_height, int p_stride, int bd); + +void highbd_warp_plane(WarpedMotionParams *wm, const uint16_t *const ref, + int width, int height, int stride, uint16_t *const pred, + int p_col, int p_row, int p_width, int p_height, + int p_stride, int subsampling_x, int subsampling_y, + int bd, ConvolveParams *conv_params); + +void warp_plane(WarpedMotionParams *wm, const uint8_t *const ref, int width, + int height, int stride, uint8_t *pred, int p_col, int p_row, + int p_width, int p_height, int p_stride, int subsampling_x, + int subsampling_y, ConvolveParams *conv_params); + void av1_warp_plane(WarpedMotionParams *wm, int use_hbd, int bd, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params); -int find_projection(int np, int *pts1, int *pts2, BLOCK_SIZE bsize, int mvy, - int mvx, WarpedMotionParams *wm_params, int mi_row, - int mi_col); +int 
av1_find_projection(int np, const int *pts1, const int *pts2, + BLOCK_SIZE bsize, int mvy, int mvx, + WarpedMotionParams *wm_params, int mi_row, int mi_col); -int get_shear_params(WarpedMotionParams *wm); +int av1_get_shear_params(WarpedMotionParams *wm); #endif // AOM_AV1_COMMON_WARPED_MOTION_H_ diff --git a/media/libaom/src/av1/common/x86/av1_convolve_scale_sse4.c b/media/libaom/src/av1/common/x86/av1_convolve_scale_sse4.c index d9fb53785..196618176 100644 --- a/media/libaom/src/av1/common/x86/av1_convolve_scale_sse4.c +++ b/media/libaom/src/av1/common/x86/av1_convolve_scale_sse4.c @@ -129,8 +129,8 @@ static void vfilter8(const int16_t *src, int src_stride, uint8_t *dst, const int w0 = conv_params->fwd_offset; const int w1 = conv_params->bck_offset; - const __m128i wt0 = _mm_set1_epi16(w0); - const __m128i wt1 = _mm_set1_epi16(w1); + const __m128i wt0 = _mm_set1_epi16((short)w0); + const __m128i wt1 = _mm_set1_epi16((short)w1); const __m128i wt = _mm_unpacklo_epi16(wt0, wt1); int y_qn = subpel_y_qn; @@ -175,7 +175,7 @@ static void vfilter8(const int16_t *src, int src_stride, uint8_t *dst, if (conv_params->is_compound) { if (conv_params->do_average) { const __m128i p_16 = _mm_loadl_epi64((__m128i *)dst_16_x); - if (conv_params->use_jnt_comp_avg) { + if (conv_params->use_dist_wtd_comp_avg) { const __m128i p_16_lo = _mm_unpacklo_epi16(p_16, shifted_16); const __m128i wt_res_lo = _mm_madd_epi16(p_16_lo, wt); const __m128i shifted_32 = @@ -207,7 +207,7 @@ static void vfilter8(const int16_t *src, int src_stride, uint8_t *dst, if (conv_params->is_compound) { if (conv_params->do_average) { int32_t tmp = dst16[y * dst16_stride + x]; - if (conv_params->use_jnt_comp_avg) { + if (conv_params->use_dist_wtd_comp_avg) { tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; tmp = tmp >> DIST_PRECISION_BITS; } else { @@ -236,8 +236,7 @@ void av1_convolve_2d_scale_sse4_1(const uint8_t *src, int src_stride, const int subpel_x_qn, const int x_step_qn, const int 
subpel_y_qn, const int y_step_qn, ConvolveParams *conv_params) { - // TODO(yaowu): remove unnecessary initializations - int16_t tmp[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE] = { 0 }; + int16_t tmp[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]; int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) + filter_params_y->taps; @@ -408,7 +407,7 @@ static void highbd_vfilter8(const int16_t *src, int src_stride, uint16_t *dst, __m128i p_32 = _mm_cvtepu16_epi32(_mm_loadl_epi64((__m128i *)dst_16_x)); - if (conv_params->use_jnt_comp_avg) { + if (conv_params->use_dist_wtd_comp_avg) { shifted = _mm_add_epi32(_mm_mullo_epi32(p_32, wt0), _mm_mullo_epi32(shifted, wt1)); shifted = _mm_srai_epi32(shifted, DIST_PRECISION_BITS); @@ -443,7 +442,7 @@ static void highbd_vfilter8(const int16_t *src, int src_stride, uint16_t *dst, if (conv_params->is_compound) { if (conv_params->do_average) { int32_t tmp = dst16[y * dst16_stride + x]; - if (conv_params->use_jnt_comp_avg) { + if (conv_params->use_dist_wtd_comp_avg) { tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; tmp = tmp >> DIST_PRECISION_BITS; } else { diff --git a/media/libaom/src/av1/common/x86/av1_highbd_convolve_sse4.c b/media/libaom/src/av1/common/x86/av1_highbd_convolve_sse4.c deleted file mode 100644 index 212d3bd72..000000000 --- a/media/libaom/src/av1/common/x86/av1_highbd_convolve_sse4.c +++ /dev/null @@ -1,205 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#include <assert.h> -#include <smmintrin.h> - -#include "config/av1_rtcd.h" - -#include "av1/common/filter.h" - -typedef void (*TransposeSave)(int width, int pixelsNum, uint32_t *src, - int src_stride, uint16_t *dst, int dst_stride, - int bd); - -// pixelsNum 0: write all 4 pixels -// 1/2/3: residual pixels 1/2/3 -static void writePixel(__m128i *u, int width, int pixelsNum, uint16_t *dst, - int dst_stride) { - if (2 == width) { - if (0 == pixelsNum) { - *(int *)dst = _mm_cvtsi128_si32(u[0]); - *(int *)(dst + dst_stride) = _mm_cvtsi128_si32(u[1]); - *(int *)(dst + 2 * dst_stride) = _mm_cvtsi128_si32(u[2]); - *(int *)(dst + 3 * dst_stride) = _mm_cvtsi128_si32(u[3]); - } else if (1 == pixelsNum) { - *(int *)dst = _mm_cvtsi128_si32(u[0]); - } else if (2 == pixelsNum) { - *(int *)dst = _mm_cvtsi128_si32(u[0]); - *(int *)(dst + dst_stride) = _mm_cvtsi128_si32(u[1]); - } else if (3 == pixelsNum) { - *(int *)dst = _mm_cvtsi128_si32(u[0]); - *(int *)(dst + dst_stride) = _mm_cvtsi128_si32(u[1]); - *(int *)(dst + 2 * dst_stride) = _mm_cvtsi128_si32(u[2]); - } - } else { - if (0 == pixelsNum) { - _mm_storel_epi64((__m128i *)dst, u[0]); - _mm_storel_epi64((__m128i *)(dst + dst_stride), u[1]); - _mm_storel_epi64((__m128i *)(dst + 2 * dst_stride), u[2]); - _mm_storel_epi64((__m128i *)(dst + 3 * dst_stride), u[3]); - } else if (1 == pixelsNum) { - _mm_storel_epi64((__m128i *)dst, u[0]); - } else if (2 == pixelsNum) { - _mm_storel_epi64((__m128i *)dst, u[0]); - _mm_storel_epi64((__m128i *)(dst + dst_stride), u[1]); - } else if (3 == pixelsNum) { - _mm_storel_epi64((__m128i *)dst, u[0]); - _mm_storel_epi64((__m128i *)(dst + dst_stride), u[1]); - _mm_storel_epi64((__m128i *)(dst + 2 * dst_stride), u[2]); - } - } -} - -// 16-bit pixels clip with bd (10/12) -static void highbd_clip(__m128i *p, int numVecs, int bd) { - const __m128i zero = _mm_setzero_si128(); - const __m128i one = _mm_set1_epi16(1); - const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one); - __m128i 
clamped, mask; - int i; - - for (i = 0; i < numVecs; i++) { - mask = _mm_cmpgt_epi16(p[i], max); - clamped = _mm_andnot_si128(mask, p[i]); - mask = _mm_and_si128(mask, max); - clamped = _mm_or_si128(mask, clamped); - mask = _mm_cmpgt_epi16(clamped, zero); - p[i] = _mm_and_si128(clamped, mask); - } -} - -static void transClipPixel(uint32_t *src, int src_stride, __m128i *u, int bd) { - __m128i v0, v1; - __m128i rnd = _mm_set1_epi32(1 << (FILTER_BITS - 1)); - - u[0] = _mm_loadu_si128((__m128i const *)src); - u[1] = _mm_loadu_si128((__m128i const *)(src + src_stride)); - u[2] = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride)); - u[3] = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride)); - - u[0] = _mm_add_epi32(u[0], rnd); - u[1] = _mm_add_epi32(u[1], rnd); - u[2] = _mm_add_epi32(u[2], rnd); - u[3] = _mm_add_epi32(u[3], rnd); - - u[0] = _mm_srai_epi32(u[0], FILTER_BITS); - u[1] = _mm_srai_epi32(u[1], FILTER_BITS); - u[2] = _mm_srai_epi32(u[2], FILTER_BITS); - u[3] = _mm_srai_epi32(u[3], FILTER_BITS); - - u[0] = _mm_packus_epi32(u[0], u[1]); - u[1] = _mm_packus_epi32(u[2], u[3]); - - highbd_clip(u, 2, bd); - - v0 = _mm_unpacklo_epi16(u[0], u[1]); - v1 = _mm_unpackhi_epi16(u[0], u[1]); - - u[0] = _mm_unpacklo_epi16(v0, v1); - u[2] = _mm_unpackhi_epi16(v0, v1); - - u[1] = _mm_srli_si128(u[0], 8); - u[3] = _mm_srli_si128(u[2], 8); -} - -// pixelsNum = 0 : all 4 rows of pixels will be saved. -// pixelsNum = 1/2/3 : residual 1/2/4 rows of pixels will be saved. 
-void trans_save_4x4(int width, int pixelsNum, uint32_t *src, int src_stride, - uint16_t *dst, int dst_stride, int bd) { - __m128i u[4]; - transClipPixel(src, src_stride, u, bd); - writePixel(u, width, pixelsNum, dst, dst_stride); -} - -void trans_accum_save_4x4(int width, int pixelsNum, uint32_t *src, - int src_stride, uint16_t *dst, int dst_stride, - int bd) { - __m128i u[4], v[4]; - const __m128i ones = _mm_set1_epi16(1); - - transClipPixel(src, src_stride, u, bd); - - v[0] = _mm_loadl_epi64((__m128i const *)dst); - v[1] = _mm_loadl_epi64((__m128i const *)(dst + dst_stride)); - v[2] = _mm_loadl_epi64((__m128i const *)(dst + 2 * dst_stride)); - v[3] = _mm_loadl_epi64((__m128i const *)(dst + 3 * dst_stride)); - - u[0] = _mm_add_epi16(u[0], v[0]); - u[1] = _mm_add_epi16(u[1], v[1]); - u[2] = _mm_add_epi16(u[2], v[2]); - u[3] = _mm_add_epi16(u[3], v[3]); - - u[0] = _mm_add_epi16(u[0], ones); - u[1] = _mm_add_epi16(u[1], ones); - u[2] = _mm_add_epi16(u[2], ones); - u[3] = _mm_add_epi16(u[3], ones); - - u[0] = _mm_srai_epi16(u[0], 1); - u[1] = _mm_srai_epi16(u[1], 1); - u[2] = _mm_srai_epi16(u[2], 1); - u[3] = _mm_srai_epi16(u[3], 1); - - writePixel(u, width, pixelsNum, dst, dst_stride); -} - -// Vertical convolutional filter - -typedef void (*WritePixels)(__m128i *u, int bd, uint16_t *dst); - -static void highbdRndingPacks(__m128i *u) { - __m128i rnd = _mm_set1_epi32(1 << (FILTER_BITS - 1)); - u[0] = _mm_add_epi32(u[0], rnd); - u[0] = _mm_srai_epi32(u[0], FILTER_BITS); - u[0] = _mm_packus_epi32(u[0], u[0]); -} - -static void write2pixelsOnly(__m128i *u, int bd, uint16_t *dst) { - highbdRndingPacks(u); - highbd_clip(u, 1, bd); - *(uint32_t *)dst = _mm_cvtsi128_si32(u[0]); -} - -static void write2pixelsAccum(__m128i *u, int bd, uint16_t *dst) { - __m128i v = _mm_loadl_epi64((__m128i const *)dst); - const __m128i ones = _mm_set1_epi16(1); - - highbdRndingPacks(u); - highbd_clip(u, 1, bd); - - v = _mm_add_epi16(v, u[0]); - v = _mm_add_epi16(v, ones); - v = 
_mm_srai_epi16(v, 1); - *(uint32_t *)dst = _mm_cvtsi128_si32(v); -} - -WritePixels write2pixelsTab[2] = { write2pixelsOnly, write2pixelsAccum }; - -static void write4pixelsOnly(__m128i *u, int bd, uint16_t *dst) { - highbdRndingPacks(u); - highbd_clip(u, 1, bd); - _mm_storel_epi64((__m128i *)dst, u[0]); -} - -static void write4pixelsAccum(__m128i *u, int bd, uint16_t *dst) { - __m128i v = _mm_loadl_epi64((__m128i const *)dst); - const __m128i ones = _mm_set1_epi16(1); - - highbdRndingPacks(u); - highbd_clip(u, 1, bd); - - v = _mm_add_epi16(v, u[0]); - v = _mm_add_epi16(v, ones); - v = _mm_srai_epi16(v, 1); - _mm_storel_epi64((__m128i *)dst, v); -} - -WritePixels write4pixelsTab[2] = { write4pixelsOnly, write4pixelsAccum }; diff --git a/media/libaom/src/av1/common/x86/av1_inv_txfm_avx2.c b/media/libaom/src/av1/common/x86/av1_inv_txfm_avx2.c index 5db2ccf6c..0fbd5eae4 100644 --- a/media/libaom/src/av1/common/x86/av1_inv_txfm_avx2.c +++ b/media/libaom/src/av1/common/x86/av1_inv_txfm_avx2.c @@ -61,8 +61,7 @@ static INLINE void idct16_stage7_avx2(__m256i *output, __m256i *x1) { btf_16_adds_subs_out_avx2(&output[7], &output[8], x1[7], x1[8]); } -static void idct16_new_avx2(const __m256i *input, __m256i *output, - int8_t cos_bit) { +static void idct16_avx2(const __m256i *input, __m256i *output, int8_t cos_bit) { (void)(cos_bit); const int32_t *cospi = cospi_arr(INV_COS_BIT); const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); @@ -133,8 +132,8 @@ static void idct16_new_avx2(const __m256i *input, __m256i *output, idct16_stage7_avx2(output, x1); } -static void idct16_low8_new_avx2(const __m256i *input, __m256i *output, - int8_t cos_bit) { +static void idct16_low8_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { (void)(cos_bit); const int32_t *cospi = cospi_arr(INV_COS_BIT); const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); @@ -181,8 +180,8 @@ static void idct16_low8_new_avx2(const __m256i *input, __m256i *output, 
idct16_stage7_avx2(output, x1); } -static void idct16_low1_new_avx2(const __m256i *input, __m256i *output, - int8_t cos_bit) { +static void idct16_low1_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { (void)(cos_bit); const int32_t *cospi = cospi_arr(INV_COS_BIT); @@ -303,8 +302,8 @@ static INLINE void iadst16_stage9_avx2(__m256i *output, __m256i *x1) { output[15] = _mm256_subs_epi16(__zero, x1[1]); } -static void iadst16_new_avx2(const __m256i *input, __m256i *output, - int8_t cos_bit) { +static void iadst16_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { (void)(cos_bit); const int32_t *cospi = cospi_arr(INV_COS_BIT); @@ -365,8 +364,8 @@ static void iadst16_new_avx2(const __m256i *input, __m256i *output, iadst16_stage9_avx2(output, x1); } -static void iadst16_low8_new_avx2(const __m256i *input, __m256i *output, - int8_t cos_bit) { +static void iadst16_low8_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { (void)(cos_bit); const int32_t *cospi = cospi_arr(INV_COS_BIT); const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); @@ -401,8 +400,8 @@ static void iadst16_low8_new_avx2(const __m256i *input, __m256i *output, iadst16_stage9_avx2(output, x1); } -static void iadst16_low1_new_avx2(const __m256i *input, __m256i *output, - int8_t cos_bit) { +static void iadst16_low1_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { (void)(cos_bit); const int32_t *cospi = cospi_arr(INV_COS_BIT); const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); @@ -568,8 +567,8 @@ static INLINE void idct32_stage9_avx2(__m256i *output, __m256i *x) { btf_16_adds_subs_out_avx2(&output[15], &output[16], x[15], x[16]); } -static void idct32_low1_new_avx2(const __m256i *input, __m256i *output, - int8_t cos_bit) { +static void idct32_low1_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { (void)cos_bit; const int32_t *cospi = cospi_arr(INV_COS_BIT); @@ -621,8 +620,8 @@ static void idct32_low1_new_avx2(const __m256i 
*input, __m256i *output, output[16] = x[0]; } -static void idct32_low8_new_avx2(const __m256i *input, __m256i *output, - int8_t cos_bit) { +static void idct32_low8_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { (void)cos_bit; const int32_t *cospi = cospi_arr(INV_COS_BIT); const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); @@ -679,8 +678,8 @@ static void idct32_low8_new_avx2(const __m256i *input, __m256i *output, idct32_stage9_avx2(output, x); } -static void idct32_low16_new_avx2(const __m256i *input, __m256i *output, - int8_t cos_bit) { +static void idct32_low16_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { (void)cos_bit; const int32_t *cospi = cospi_arr(INV_COS_BIT); const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); @@ -746,8 +745,7 @@ static void idct32_low16_new_avx2(const __m256i *input, __m256i *output, idct32_stage9_avx2(output, x); } -static void idct32_new_avx2(const __m256i *input, __m256i *output, - int8_t cos_bit) { +static void idct32_avx2(const __m256i *input, __m256i *output, int8_t cos_bit) { (void)(cos_bit); const int32_t *cospi = cospi_arr(INV_COS_BIT); const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); @@ -1104,8 +1102,8 @@ static INLINE void idct64_stage11_avx2(__m256i *output, __m256i *x) { btf_16_adds_subs_out_avx2(&output[31], &output[32], x[31], x[32]); } -static void idct64_low1_new_avx2(const __m256i *input, __m256i *output, - int8_t cos_bit) { +static void idct64_low1_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { (void)cos_bit; const int32_t *cospi = cospi_arr(INV_COS_BIT); @@ -1191,8 +1189,8 @@ static void idct64_low1_new_avx2(const __m256i *input, __m256i *output, output[32] = x[0]; } -static void idct64_low8_new_avx2(const __m256i *input, __m256i *output, - int8_t cos_bit) { +static void idct64_low8_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { (void)cos_bit; const int32_t *cospi = cospi_arr(INV_COS_BIT); const __m256i _r = 
_mm256_set1_epi32(1 << (INV_COS_BIT - 1)); @@ -1302,7 +1300,6 @@ static void idct64_low8_new_avx2(const __m256i *input, __m256i *output, x[6] = x[1]; x[5] = x[2]; x[4] = x[3]; - x[9] = x[9]; btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r, cos_bit); btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r, cos_bit); idct64_stage8_high48_avx2(x, cospi, _r, cos_bit); @@ -1312,8 +1309,8 @@ static void idct64_low8_new_avx2(const __m256i *input, __m256i *output, idct64_stage11_avx2(output, x); } -static void idct64_low16_new_avx2(const __m256i *input, __m256i *output, - int8_t cos_bit) { +static void idct64_low16_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { (void)cos_bit; const int32_t *cospi = cospi_arr(INV_COS_BIT); const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); @@ -1428,8 +1425,8 @@ static void idct64_low16_new_avx2(const __m256i *input, __m256i *output, idct64_stage11_avx2(output, x); } -static void idct64_low32_new_avx2(const __m256i *input, __m256i *output, - int8_t cos_bit) { +static void idct64_low32_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { (void)cos_bit; const int32_t *cospi = cospi_arr(INV_COS_BIT); const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); @@ -1577,6 +1574,9 @@ static void idct64_low32_new_avx2(const __m256i *input, __m256i *output, idct64_stage11_avx2(output, x); } +typedef void (*transform_1d_avx2)(const __m256i *input, __m256i *output, + int8_t cos_bit); + // 1D functions process 16 pixels at one time. 
static const transform_1d_avx2 lowbd_txfm_all_1d_zeros_w16_arr[TX_SIZES][ITX_TYPES_1D][4] = { @@ -1589,17 +1589,15 @@ static const transform_1d_avx2 { NULL, NULL, NULL, NULL }, { NULL, NULL, NULL, NULL } }, { - { idct16_low1_new_avx2, idct16_low8_new_avx2, idct16_new_avx2, NULL }, - { iadst16_low1_new_avx2, iadst16_low8_new_avx2, iadst16_new_avx2, - NULL }, + { idct16_low1_avx2, idct16_low8_avx2, idct16_avx2, NULL }, + { iadst16_low1_avx2, iadst16_low8_avx2, iadst16_avx2, NULL }, { NULL, NULL, NULL, NULL }, }, - { { idct32_low1_new_avx2, idct32_low8_new_avx2, idct32_low16_new_avx2, - idct32_new_avx2 }, + { { idct32_low1_avx2, idct32_low8_avx2, idct32_low16_avx2, idct32_avx2 }, { NULL, NULL, NULL, NULL }, { NULL, NULL, NULL, NULL } }, - { { idct64_low1_new_avx2, idct64_low8_new_avx2, idct64_low16_new_avx2, - idct64_low32_new_avx2 }, + { { idct64_low1_avx2, idct64_low8_avx2, idct64_low16_avx2, + idct64_low32_avx2 }, { NULL, NULL, NULL, NULL }, { NULL, NULL, NULL, NULL } } }; @@ -1611,11 +1609,11 @@ static INLINE void lowbd_inv_txfm2d_add_no_identity_avx2( __m256i buf1[64 * 16]; int eobx, eoby; get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob); - const int8_t *shift = inv_txfm_shift_ls[tx_size]; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; const int txw_idx = get_txw_idx(tx_size); const int txh_idx = get_txh_idx(tx_size); - const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx]; - const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx]; + const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx]; const int txfm_size_col = tx_size_wide[tx_size]; const int txfm_size_row = tx_size_high[tx_size]; const int buf_size_w_div16 = txfm_size_col >> 4; @@ -1635,6 +1633,7 @@ static INLINE void lowbd_inv_txfm2d_add_no_identity_avx2( assert(row_txfm != NULL); int ud_flip, lr_flip; get_flip_cfg(tx_type, &ud_flip, &lr_flip); + const __m256i scale0 = _mm256_set1_epi16(1 << (15 + shift[0])); for 
(int i = 0; i < buf_size_nonzero_h_div16; i++) { __m256i buf0[64]; const int32_t *input_row = input + (i << 4) * input_stride; @@ -1649,7 +1648,9 @@ static INLINE void lowbd_inv_txfm2d_add_no_identity_avx2( round_shift_avx2(buf0, buf0, input_stride); // rect special code } row_txfm(buf0, buf0, cos_bit_row); - round_shift_16bit_w16_avx2(buf0, txfm_size_col, shift[0]); + for (int j = 0; j < txfm_size_col; ++j) { + buf0[j] = _mm256_mulhrs_epi16(buf0[j], scale0); + } __m256i *buf1_cur = buf1 + (i << 4); if (lr_flip) { @@ -1665,10 +1666,13 @@ static INLINE void lowbd_inv_txfm2d_add_no_identity_avx2( } } } + const __m256i scale1 = _mm256_set1_epi16(1 << (15 + shift[1])); for (int i = 0; i < buf_size_w_div16; i++) { __m256i *buf1_cur = buf1 + i * txfm_size_row; col_txfm(buf1_cur, buf1_cur, cos_bit_col); - round_shift_16bit_w16_avx2(buf1_cur, txfm_size_row, shift[1]); + for (int j = 0; j < txfm_size_row; ++j) { + buf1_cur[j] = _mm256_mulhrs_epi16(buf1_cur[j], scale1); + } } for (int i = 0; i < buf_size_w_div16; i++) { lowbd_write_buffer_16xn_avx2(buf1 + i * txfm_size_row, output + 16 * i, @@ -1745,7 +1749,7 @@ static INLINE void lowbd_inv_txfm2d_add_idtx_avx2(const int32_t *input, TX_SIZE tx_size, int32_t eob) { (void)eob; - const int8_t *shift = inv_txfm_shift_ls[tx_size]; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; const int txw_idx = get_txw_idx(tx_size); const int txh_idx = get_txh_idx(tx_size); const int txfm_size_col = tx_size_wide[tx_size]; @@ -1767,10 +1771,10 @@ static INLINE void lowbd_inv_txfm2d_add_h_identity_avx2( TX_SIZE tx_size, int eob) { int eobx, eoby; get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob); - const int8_t *shift = inv_txfm_shift_ls[tx_size]; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; const int txw_idx = get_txw_idx(tx_size); const int txh_idx = get_txh_idx(tx_size); - const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx]; const int 
txfm_size_col = tx_size_wide[tx_size]; const int txfm_size_row = tx_size_high[tx_size]; const int txfm_size_col_notzero = AOMMIN(32, txfm_size_col); @@ -1807,10 +1811,10 @@ static INLINE void lowbd_inv_txfm2d_add_v_identity_avx2( __m256i buf1[64]; int eobx, eoby; get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob); - const int8_t *shift = inv_txfm_shift_ls[tx_size]; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; const int txw_idx = get_txw_idx(tx_size); const int txh_idx = get_txh_idx(tx_size); - const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx]; + const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx]; const int txfm_size_col = tx_size_wide[tx_size]; const int txfm_size_row = tx_size_high[tx_size]; const int buf_size_w_div16 = txfm_size_col >> 4; diff --git a/media/libaom/src/av1/common/x86/av1_inv_txfm_ssse3.c b/media/libaom/src/av1/common/x86/av1_inv_txfm_ssse3.c index 995bc3da4..46c051ff8 100644 --- a/media/libaom/src/av1/common/x86/av1_inv_txfm_ssse3.c +++ b/media/libaom/src/av1/common/x86/av1_inv_txfm_ssse3.c @@ -24,8 +24,7 @@ static int32_t NewSqrt2list[TX_SIZES] = { 5793, 2 * 4096, 2 * 5793, 4 * 4096, // TODO(binpengsmail@gmail.com): replace some for loop with do {} while -static void idct4_new_sse2(const __m128i *input, __m128i *output, - int8_t cos_bit) { +static void idct4_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { (void)cos_bit; const int32_t *cospi = cospi_arr(INV_COS_BIT); const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); @@ -51,7 +50,8 @@ static void idct4_new_sse2(const __m128i *input, __m128i *output, btf_16_adds_subs_out_sse2(output[1], output[2], x[1], x[2]); } -void idct4_w4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { +static void idct4_w4_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { (void)cos_bit; const int32_t *cospi = cospi_arr(INV_COS_BIT); const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); @@ -77,8 +77,8 @@ void 
idct4_w4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { btf_16_adds_subs_out_sse2(output[1], output[2], x[1], x[2]); } -void idct8_low1_new_ssse3(const __m128i *input, __m128i *output, - int8_t cos_bit) { +static void idct8_low1_ssse3(const __m128i *input, __m128i *output, + int8_t cos_bit) { (void)cos_bit; const int32_t *cospi = cospi_arr(INV_COS_BIT); @@ -102,7 +102,7 @@ void idct8_low1_new_ssse3(const __m128i *input, __m128i *output, output[4] = x[0]; } -void idct8_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { +static void idct8_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { (void)cos_bit; const int32_t *cospi = cospi_arr(INV_COS_BIT); const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); @@ -150,7 +150,8 @@ void idct8_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { btf_16_adds_subs_out_sse2(output[3], output[4], x[3], x[4]); } -void idct8_w4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { +static void idct8_w4_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { (void)cos_bit; const int32_t *cospi = cospi_arr(INV_COS_BIT); const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); @@ -236,8 +237,8 @@ static INLINE void idct16_stage7_sse2(__m128i *output, __m128i *x) { btf_16_adds_subs_out_sse2(output[7], output[8], x[7], x[8]); } -static void idct16_low1_new_ssse3(const __m128i *input, __m128i *output, - int8_t cos_bit) { +static void idct16_low1_ssse3(const __m128i *input, __m128i *output, + int8_t cos_bit) { (void)cos_bit; const int32_t *cospi = cospi_arr(INV_COS_BIT); @@ -271,8 +272,8 @@ static void idct16_low1_new_ssse3(const __m128i *input, __m128i *output, output[8] = x[0]; } -static void idct16_low8_new_ssse3(const __m128i *input, __m128i *output, - int8_t cos_bit) { +static void idct16_low8_ssse3(const __m128i *input, __m128i *output, + int8_t cos_bit) { (void)cos_bit; const int32_t *cospi = cospi_arr(INV_COS_BIT); const __m128i 
__rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); @@ -318,7 +319,7 @@ static void idct16_low8_new_ssse3(const __m128i *input, __m128i *output, idct16_stage7_sse2(output, x); } -void idct16_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { +static void idct16_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { (void)cos_bit; const int32_t *cospi = cospi_arr(INV_COS_BIT); const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); @@ -390,7 +391,8 @@ void idct16_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { idct16_stage7_sse2(output, x); } -void idct16_w4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { +static void idct16_w4_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { (void)cos_bit; const int32_t *cospi = cospi_arr(INV_COS_BIT); const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); @@ -600,8 +602,8 @@ static INLINE void idct32_stage9_sse2(__m128i *output, __m128i *x) { btf_16_adds_subs_out_sse2(output[15], output[16], x[15], x[16]); } -static void idct32_low1_new_ssse3(const __m128i *input, __m128i *output, - int8_t cos_bit) { +static void idct32_low1_ssse3(const __m128i *input, __m128i *output, + int8_t cos_bit) { (void)cos_bit; const int32_t *cospi = cospi_arr(INV_COS_BIT); @@ -653,8 +655,8 @@ static void idct32_low1_new_ssse3(const __m128i *input, __m128i *output, output[16] = x[0]; } -static void idct32_low8_new_ssse3(const __m128i *input, __m128i *output, - int8_t cos_bit) { +static void idct32_low8_ssse3(const __m128i *input, __m128i *output, + int8_t cos_bit) { (void)cos_bit; const int32_t *cospi = cospi_arr(INV_COS_BIT); const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); @@ -711,8 +713,8 @@ static void idct32_low8_new_ssse3(const __m128i *input, __m128i *output, idct32_stage9_sse2(output, x); } -static void idct32_low16_new_ssse3(const __m128i *input, __m128i *output, - int8_t cos_bit) { +static void idct32_low16_ssse3(const __m128i 
*input, __m128i *output, + int8_t cos_bit) { (void)cos_bit; const int32_t *cospi = cospi_arr(INV_COS_BIT); const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); @@ -778,8 +780,7 @@ static void idct32_low16_new_ssse3(const __m128i *input, __m128i *output, idct32_stage9_sse2(output, x); } -static void idct32_new_sse2(const __m128i *input, __m128i *output, - int8_t cos_bit) { +static void idct32_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { (void)cos_bit; const int32_t *cospi = cospi_arr(INV_COS_BIT); const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); @@ -1138,8 +1139,8 @@ static INLINE void idct64_stage11_sse2(__m128i *output, __m128i *x) { btf_16_adds_subs_out_sse2(output[31], output[32], x[31], x[32]); } -static void idct64_low1_new_ssse3(const __m128i *input, __m128i *output, - int8_t cos_bit) { +static void idct64_low1_ssse3(const __m128i *input, __m128i *output, + int8_t cos_bit) { (void)cos_bit; const int32_t *cospi = cospi_arr(INV_COS_BIT); @@ -1225,8 +1226,8 @@ static void idct64_low1_new_ssse3(const __m128i *input, __m128i *output, output[32] = x[0]; } -static void idct64_low8_new_ssse3(const __m128i *input, __m128i *output, - int8_t cos_bit) { +static void idct64_low8_ssse3(const __m128i *input, __m128i *output, + int8_t cos_bit) { (void)cos_bit; const int32_t *cospi = cospi_arr(INV_COS_BIT); const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); @@ -1336,7 +1337,6 @@ static void idct64_low8_new_ssse3(const __m128i *input, __m128i *output, x[6] = x[1]; x[5] = x[2]; x[4] = x[3]; - x[9] = x[9]; btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]); btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]); idct64_stage8_high48_sse2(x, cospi, __rounding, cos_bit); @@ -1346,8 +1346,8 @@ static void idct64_low8_new_ssse3(const __m128i *input, __m128i *output, idct64_stage11_sse2(output, x); } -static void idct64_low16_new_ssse3(const __m128i *input, __m128i *output, - int8_t 
cos_bit) { +static void idct64_low16_ssse3(const __m128i *input, __m128i *output, + int8_t cos_bit) { (void)cos_bit; const int32_t *cospi = cospi_arr(INV_COS_BIT); const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); @@ -1462,8 +1462,8 @@ static void idct64_low16_new_ssse3(const __m128i *input, __m128i *output, idct64_stage11_sse2(output, x); } -static void idct64_low32_new_ssse3(const __m128i *input, __m128i *output, - int8_t cos_bit) { +static void idct64_low32_ssse3(const __m128i *input, __m128i *output, + int8_t cos_bit) { (void)cos_bit; const int32_t *cospi = cospi_arr(INV_COS_BIT); const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); @@ -1611,7 +1611,7 @@ static void idct64_low32_new_ssse3(const __m128i *input, __m128i *output, idct64_stage11_sse2(output, x); } -void iadst4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { +static void iadst4_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { (void)cos_bit; const int32_t *sinpi = sinpi_arr(INV_COS_BIT); const __m128i sinpi_p01_p04 = pair_set_epi16(sinpi[1], sinpi[4]); @@ -1672,10 +1672,8 @@ void iadst4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { } } -// TODO(binpengsmail@gmail.com): -// To explore the reuse of VP9 versions of corresponding SSE2 functions and -// evaluate whether there is a possibility for further speedup. 
-void iadst4_w4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { +static void iadst4_w4_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { (void)cos_bit; const int32_t *sinpi = sinpi_arr(INV_COS_BIT); const __m128i sinpi_p01_p04 = pair_set_epi16(sinpi[1], sinpi[4]); @@ -1720,8 +1718,8 @@ void iadst4_w4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { } } -static void iadst8_low1_new_ssse3(const __m128i *input, __m128i *output, - int8_t cos_bit) { +static void iadst8_low1_ssse3(const __m128i *input, __m128i *output, + int8_t cos_bit) { (void)cos_bit; const int32_t *cospi = cospi_arr(INV_COS_BIT); const __m128i __zero = _mm_setzero_si128(); @@ -1767,7 +1765,7 @@ static void iadst8_low1_new_ssse3(const __m128i *input, __m128i *output, output[7] = _mm_subs_epi16(__zero, x[1]); } -void iadst8_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { +static void iadst8_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { (void)cos_bit; const int32_t *cospi = cospi_arr(INV_COS_BIT); const __m128i __zero = _mm_setzero_si128(); @@ -1835,7 +1833,8 @@ void iadst8_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { output[7] = _mm_subs_epi16(__zero, x[1]); } -void iadst8_w4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { +static void iadst8_w4_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { (void)cos_bit; const int32_t *cospi = cospi_arr(INV_COS_BIT); const __m128i __zero = _mm_setzero_si128(); @@ -1994,8 +1993,8 @@ static INLINE void iadst16_stage9_ssse3(__m128i *output, __m128i *x) { output[15] = _mm_subs_epi16(__zero, x[1]); } -static void iadst16_low1_new_ssse3(const __m128i *input, __m128i *output, - int8_t cos_bit) { +static void iadst16_low1_ssse3(const __m128i *input, __m128i *output, + int8_t cos_bit) { (void)cos_bit; const int32_t *cospi = cospi_arr(INV_COS_BIT); const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); @@ -2043,8 +2042,8 @@ 
static void iadst16_low1_new_ssse3(const __m128i *input, __m128i *output, iadst16_stage9_ssse3(output, x); } -static void iadst16_low8_new_ssse3(const __m128i *input, __m128i *output, - int8_t cos_bit) { +static void iadst16_low8_ssse3(const __m128i *input, __m128i *output, + int8_t cos_bit) { (void)cos_bit; const int32_t *cospi = cospi_arr(INV_COS_BIT); const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); @@ -2079,7 +2078,8 @@ static void iadst16_low8_new_ssse3(const __m128i *input, __m128i *output, iadst16_stage8_ssse3(x, cospi, __rounding, cos_bit); iadst16_stage9_ssse3(output, x); } -void iadst16_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { +static void iadst16_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { (void)cos_bit; const int32_t *cospi = cospi_arr(INV_COS_BIT); const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); @@ -2139,8 +2139,8 @@ void iadst16_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { iadst16_stage9_ssse3(output, x); } -void iadst16_w4_new_sse2(const __m128i *input, __m128i *output, - int8_t cos_bit) { +static void iadst16_w4_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { (void)cos_bit; const int32_t *cospi = cospi_arr(INV_COS_BIT); const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); @@ -2233,8 +2233,8 @@ void iadst16_w4_new_sse2(const __m128i *input, __m128i *output, iadst16_stage9_ssse3(output, x); } -static void iidentity4_new_ssse3(const __m128i *input, __m128i *output, - int8_t cos_bit) { +static void iidentity4_ssse3(const __m128i *input, __m128i *output, + int8_t cos_bit) { (void)cos_bit; const int16_t scale_fractional = (NewSqrt2 - (1 << NewSqrt2Bits)); const __m128i scale = _mm_set1_epi16(scale_fractional << (15 - NewSqrt2Bits)); @@ -2244,16 +2244,16 @@ static void iidentity4_new_ssse3(const __m128i *input, __m128i *output, } } -static void iidentity8_new_sse2(const __m128i *input, __m128i *output, - int8_t cos_bit) { 
+static void iidentity8_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { (void)cos_bit; for (int i = 0; i < 8; ++i) { output[i] = _mm_adds_epi16(input[i], input[i]); } } -static void iidentity16_new_ssse3(const __m128i *input, __m128i *output, - int8_t cos_bit) { +static void iidentity16_ssse3(const __m128i *input, __m128i *output, + int8_t cos_bit) { (void)cos_bit; const int16_t scale_fractional = 2 * (NewSqrt2 - (1 << NewSqrt2Bits)); const __m128i scale = _mm_set1_epi16(scale_fractional << (15 - NewSqrt2Bits)); @@ -2300,11 +2300,11 @@ static INLINE void lowbd_write_buffer_8xn_sse2(__m128i *in, uint8_t *output, // 1D functions process process 8 pixels at one time. static const transform_1d_ssse3 lowbd_txfm_all_1d_w8_arr[TX_SIZES][ITX_TYPES_1D] = { - { idct4_new_sse2, iadst4_new_sse2, iidentity4_new_ssse3 }, - { idct8_new_sse2, iadst8_new_sse2, iidentity8_new_sse2 }, - { idct16_new_sse2, iadst16_new_sse2, iidentity16_new_ssse3 }, - { idct32_new_sse2, NULL, NULL }, - { idct64_low32_new_ssse3, NULL, NULL }, + { idct4_sse2, iadst4_sse2, iidentity4_ssse3 }, + { idct8_sse2, iadst8_sse2, iidentity8_sse2 }, + { idct16_sse2, iadst16_sse2, iidentity16_ssse3 }, + { idct32_sse2, NULL, NULL }, + { idct64_low32_ssse3, NULL, NULL }, }; // functions for blocks with eob at DC and within @@ -2312,26 +2312,24 @@ static const transform_1d_ssse3 static const transform_1d_ssse3 lowbd_txfm_all_1d_zeros_w8_arr[TX_SIZES][ITX_TYPES_1D][4] = { { - { idct4_new_sse2, idct4_new_sse2, NULL, NULL }, - { iadst4_new_sse2, iadst4_new_sse2, NULL, NULL }, - { iidentity4_new_ssse3, iidentity4_new_ssse3, NULL, NULL }, + { idct4_sse2, idct4_sse2, NULL, NULL }, + { iadst4_sse2, iadst4_sse2, NULL, NULL }, + { iidentity4_ssse3, iidentity4_ssse3, NULL, NULL }, }, - { { idct8_low1_new_ssse3, idct8_new_sse2, NULL, NULL }, - { iadst8_low1_new_ssse3, iadst8_new_sse2, NULL, NULL }, - { iidentity8_new_sse2, iidentity8_new_sse2, NULL, NULL } }, + { { idct8_low1_ssse3, idct8_sse2, NULL, NULL }, + { 
iadst8_low1_ssse3, iadst8_sse2, NULL, NULL }, + { iidentity8_sse2, iidentity8_sse2, NULL, NULL } }, { - { idct16_low1_new_ssse3, idct16_low8_new_ssse3, idct16_new_sse2, - NULL }, - { iadst16_low1_new_ssse3, iadst16_low8_new_ssse3, iadst16_new_sse2, - NULL }, + { idct16_low1_ssse3, idct16_low8_ssse3, idct16_sse2, NULL }, + { iadst16_low1_ssse3, iadst16_low8_ssse3, iadst16_sse2, NULL }, { NULL, NULL, NULL, NULL }, }, - { { idct32_low1_new_ssse3, idct32_low8_new_ssse3, idct32_low16_new_ssse3, - idct32_new_sse2 }, + { { idct32_low1_ssse3, idct32_low8_ssse3, idct32_low16_ssse3, + idct32_sse2 }, { NULL, NULL, NULL, NULL }, { NULL, NULL, NULL, NULL } }, - { { idct64_low1_new_ssse3, idct64_low8_new_ssse3, idct64_low16_new_ssse3, - idct64_low32_new_ssse3 }, + { { idct64_low1_ssse3, idct64_low8_ssse3, idct64_low16_ssse3, + idct64_low32_ssse3 }, { NULL, NULL, NULL, NULL }, { NULL, NULL, NULL, NULL } } }; @@ -2340,9 +2338,9 @@ static const transform_1d_ssse3 // used in 4x4, 4x8, 4x16, 8x4, 16x4 static const transform_1d_ssse3 lowbd_txfm_all_1d_w4_arr[TX_SIZES][ITX_TYPES_1D] = { - { idct4_w4_new_sse2, iadst4_w4_new_sse2, iidentity4_new_ssse3 }, - { idct8_w4_new_sse2, iadst8_w4_new_sse2, iidentity8_new_sse2 }, - { idct16_w4_new_sse2, iadst16_w4_new_sse2, iidentity16_new_ssse3 }, + { idct4_w4_sse2, iadst4_w4_sse2, iidentity4_ssse3 }, + { idct8_w4_sse2, iadst8_w4_sse2, iidentity8_sse2 }, + { idct16_w4_sse2, iadst16_w4_sse2, iidentity16_ssse3 }, { NULL, NULL, NULL }, { NULL, NULL, NULL }, }; @@ -2419,7 +2417,7 @@ static INLINE void iidentity_col_8xn_ssse3(uint8_t *output, int stride, static INLINE void lowbd_inv_txfm2d_add_idtx_ssse3(const int32_t *input, uint8_t *output, int stride, TX_SIZE tx_size) { - const int8_t *shift = inv_txfm_shift_ls[tx_size]; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; const int txw_idx = get_txw_idx(tx_size); const int txh_idx = get_txh_idx(tx_size); const int txfm_size_col = tx_size_wide[tx_size]; @@ -2437,18 +2435,19 @@ static INLINE void 
lowbd_inv_txfm2d_add_idtx_ssse3(const int32_t *input, } } -void lowbd_inv_txfm2d_add_4x4_ssse3(const int32_t *input, uint8_t *output, - int stride, TX_TYPE tx_type, - TX_SIZE tx_size_, int eob) { +static void lowbd_inv_txfm2d_add_4x4_ssse3(const int32_t *input, + uint8_t *output, int stride, + TX_TYPE tx_type, TX_SIZE tx_size_, + int eob) { (void)tx_size_; (void)eob; __m128i buf[4]; const TX_SIZE tx_size = TX_4X4; - const int8_t *shift = inv_txfm_shift_ls[tx_size]; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; const int txw_idx = get_txw_idx(tx_size); const int txh_idx = get_txh_idx(tx_size); - const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx]; - const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx]; + const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx]; const int txfm_size_col = tx_size_wide[tx_size]; const int txfm_size_row = tx_size_high[tx_size]; @@ -2510,11 +2509,11 @@ static INLINE void lowbd_inv_txfm2d_add_no_identity_ssse3( __m128i buf1[64 * 8]; int eobx, eoby; get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob); - const int8_t *shift = inv_txfm_shift_ls[tx_size]; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; const int txw_idx = get_txw_idx(tx_size); const int txh_idx = get_txh_idx(tx_size); - const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx]; - const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx]; + const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx]; const int txfm_size_col = tx_size_wide[tx_size]; const int txfm_size_row = tx_size_high[tx_size]; const int buf_size_w_div8 = txfm_size_col >> 3; @@ -2580,12 +2579,12 @@ static INLINE void lowbd_inv_txfm2d_add_no_identity_ssse3( static INLINE void lowbd_inv_txfm2d_add_h_identity_ssse3( const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, TX_SIZE tx_size, int eob) { - const int8_t *shift = 
inv_txfm_shift_ls[tx_size]; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; int eobx, eoby; get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob); const int txw_idx = get_txw_idx(tx_size); const int txh_idx = get_txh_idx(tx_size); - const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx]; const int txfm_size_col = tx_size_wide[tx_size]; const int txfm_size_row = tx_size_high[tx_size]; const int buf_size_w_div8 = (eobx + 8) >> 3; @@ -2626,10 +2625,10 @@ static INLINE void lowbd_inv_txfm2d_add_v_identity_ssse3( __m128i buf1[64]; int eobx, eoby; get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob); - const int8_t *shift = inv_txfm_shift_ls[tx_size]; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; const int txw_idx = get_txw_idx(tx_size); const int txh_idx = get_txh_idx(tx_size); - const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx]; + const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx]; const int txfm_size_col = tx_size_wide[tx_size]; const int txfm_size_row = tx_size_high[tx_size]; const int buf_size_w_div8 = txfm_size_col >> 3; @@ -2708,18 +2707,19 @@ static INLINE void lowbd_inv_txfm2d_add_universe_ssse3( } } -void lowbd_inv_txfm2d_add_4x8_ssse3(const int32_t *input, uint8_t *output, - int stride, TX_TYPE tx_type, - TX_SIZE tx_size_, int eob) { +static void lowbd_inv_txfm2d_add_4x8_ssse3(const int32_t *input, + uint8_t *output, int stride, + TX_TYPE tx_type, TX_SIZE tx_size_, + int eob) { (void)tx_size_; (void)eob; __m128i buf[8]; const TX_SIZE tx_size = TX_4X8; - const int8_t *shift = inv_txfm_shift_ls[tx_size]; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; const int txw_idx = get_txw_idx(tx_size); const int txh_idx = get_txh_idx(tx_size); - const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx]; - const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx]; + const int cos_bit_col = 
av1_inv_cos_bit_col[txw_idx][txh_idx]; const int txfm_size_col = tx_size_wide[tx_size]; const int txfm_size_row = tx_size_high[tx_size]; @@ -2747,18 +2747,19 @@ void lowbd_inv_txfm2d_add_4x8_ssse3(const int32_t *input, uint8_t *output, lowbd_write_buffer_4xn_sse2(buf, output, stride, ud_flip, txfm_size_row); } -void lowbd_inv_txfm2d_add_8x4_ssse3(const int32_t *input, uint8_t *output, - int stride, TX_TYPE tx_type, - TX_SIZE tx_size_, int eob) { +static void lowbd_inv_txfm2d_add_8x4_ssse3(const int32_t *input, + uint8_t *output, int stride, + TX_TYPE tx_type, TX_SIZE tx_size_, + int eob) { (void)tx_size_; (void)eob; __m128i buf[8]; const TX_SIZE tx_size = TX_8X4; - const int8_t *shift = inv_txfm_shift_ls[tx_size]; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; const int txw_idx = get_txw_idx(tx_size); const int txh_idx = get_txh_idx(tx_size); - const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx]; - const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx]; + const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx]; const int txfm_size_col = tx_size_wide[tx_size]; const int txfm_size_row = tx_size_high[tx_size]; @@ -2786,18 +2787,19 @@ void lowbd_inv_txfm2d_add_8x4_ssse3(const int32_t *input, uint8_t *output, lowbd_write_buffer_8xn_sse2(buf, output, stride, ud_flip, txfm_size_row); } -void lowbd_inv_txfm2d_add_4x16_ssse3(const int32_t *input, uint8_t *output, - int stride, TX_TYPE tx_type, - TX_SIZE tx_size_, int eob) { +static void lowbd_inv_txfm2d_add_4x16_ssse3(const int32_t *input, + uint8_t *output, int stride, + TX_TYPE tx_type, TX_SIZE tx_size_, + int eob) { (void)tx_size_; (void)eob; __m128i buf[16]; const TX_SIZE tx_size = TX_4X16; - const int8_t *shift = inv_txfm_shift_ls[tx_size]; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; const int txw_idx = get_txw_idx(tx_size); const int txh_idx = get_txh_idx(tx_size); - const int cos_bit_row = 
inv_cos_bit_row[txw_idx][txh_idx]; - const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx]; + const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx]; const int txfm_size_col = tx_size_wide[tx_size]; const int txfm_size_row = tx_size_high[tx_size]; @@ -2816,8 +2818,22 @@ void lowbd_inv_txfm2d_add_4x16_ssse3(const int32_t *input, uint8_t *output, load_buffer_32bit_to_16bit_w4(input_cur, txfm_size_col, buf_cur, row_one_loop); transpose_16bit_4x8(buf_cur, buf_cur); - row_txfm(buf_cur, buf_cur, cos_bit_row); - round_shift_16bit_ssse3(buf_cur, row_one_loop, shift[0]); + if (row_txfm == iidentity4_ssse3) { + const __m128i scale = pair_set_epi16(NewSqrt2, 3 << (NewSqrt2Bits - 1)); + const __m128i ones = _mm_set1_epi16(1); + for (int j = 0; j < 4; ++j) { + const __m128i buf_lo = _mm_unpacklo_epi16(buf_cur[j], ones); + const __m128i buf_hi = _mm_unpackhi_epi16(buf_cur[j], ones); + const __m128i buf_32_lo = + _mm_srai_epi32(_mm_madd_epi16(buf_lo, scale), (NewSqrt2Bits + 1)); + const __m128i buf_32_hi = + _mm_srai_epi32(_mm_madd_epi16(buf_hi, scale), (NewSqrt2Bits + 1)); + buf_cur[j] = _mm_packs_epi32(buf_32_lo, buf_32_hi); + } + } else { + row_txfm(buf_cur, buf_cur, cos_bit_row); + round_shift_16bit_ssse3(buf_cur, row_one_loop, shift[0]); + } if (lr_flip) { __m128i temp[8]; flip_buf_sse2(buf_cur, temp, txfm_size_col); @@ -2831,18 +2847,19 @@ void lowbd_inv_txfm2d_add_4x16_ssse3(const int32_t *input, uint8_t *output, lowbd_write_buffer_4xn_sse2(buf, output, stride, ud_flip, txfm_size_row); } -void lowbd_inv_txfm2d_add_16x4_ssse3(const int32_t *input, uint8_t *output, - int stride, TX_TYPE tx_type, - TX_SIZE tx_size_, int eob) { +static void lowbd_inv_txfm2d_add_16x4_ssse3(const int32_t *input, + uint8_t *output, int stride, + TX_TYPE tx_type, TX_SIZE tx_size_, + int eob) { (void)tx_size_; (void)eob; __m128i buf[16]; const TX_SIZE tx_size = TX_16X4; - const int8_t *shift = inv_txfm_shift_ls[tx_size]; + 
const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; const int txw_idx = get_txw_idx(tx_size); const int txh_idx = get_txh_idx(tx_size); - const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx]; - const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx]; + const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx]; const int txfm_size_col = tx_size_wide[tx_size]; const int txfm_size_row = tx_size_high[tx_size]; const int buf_size_w_div8 = txfm_size_col >> 3; @@ -2862,8 +2879,22 @@ void lowbd_inv_txfm2d_add_16x4_ssse3(const int32_t *input, uint8_t *output, txfm_size_row); transpose_16bit_8x4(buf_cur, buf_cur); } - row_txfm(buf, buf, cos_bit_row); - round_shift_16bit_ssse3(buf, txfm_size_col, shift[0]); + if (row_txfm == iidentity16_ssse3) { + const __m128i scale = pair_set_epi16(2 * NewSqrt2, 3 << (NewSqrt2Bits - 1)); + const __m128i ones = _mm_set1_epi16(1); + for (int j = 0; j < 16; ++j) { + const __m128i buf_lo = _mm_unpacklo_epi16(buf[j], ones); + const __m128i buf_hi = _mm_unpackhi_epi16(buf[j], ones); + const __m128i buf_32_lo = + _mm_srai_epi32(_mm_madd_epi16(buf_lo, scale), (NewSqrt2Bits + 1)); + const __m128i buf_32_hi = + _mm_srai_epi32(_mm_madd_epi16(buf_hi, scale), (NewSqrt2Bits + 1)); + buf[j] = _mm_packs_epi32(buf_32_lo, buf_32_hi); + } + } else { + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit_ssse3(buf, txfm_size_col, shift[0]); + } if (lr_flip) { __m128i temp[16]; flip_buf_sse2(buf, temp, 16); @@ -2911,12 +2942,14 @@ void av1_lowbd_inv_txfm2d_add_ssse3(const int32_t *input, uint8_t *output, break; } } + void av1_inv_txfm_add_ssse3(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param) { - const TX_TYPE tx_type = txfm_param->tx_type; if (!txfm_param->lossless) { + const TX_TYPE tx_type = txfm_param->tx_type; av1_lowbd_inv_txfm2d_add_ssse3(dqcoeff, dst, stride, tx_type, txfm_param->tx_size, txfm_param->eob); + } else { av1_inv_txfm_add_c(dqcoeff, 
dst, stride, txfm_param); } diff --git a/media/libaom/src/av1/common/x86/av1_inv_txfm_ssse3.h b/media/libaom/src/av1/common/x86/av1_inv_txfm_ssse3.h index 66bd339d1..7d5055deb 100644 --- a/media/libaom/src/av1/common/x86/av1_inv_txfm_ssse3.h +++ b/media/libaom/src/av1/common/x86/av1_inv_txfm_ssse3.h @@ -72,13 +72,13 @@ static INLINE void round_shift_16bit_ssse3(__m128i *in, int size, int bit) { } // 1D itx types -typedef enum ATTRIBUTE_PACKED { +enum { IDCT_1D, IADST_1D, IFLIPADST_1D = IADST_1D, IIDENTITY_1D, ITX_TYPES_1D, -} ITX_TYPE_1D; +} UENUM1BYTE(ITX_TYPE_1D); static const ITX_TYPE_1D vitx_1d_tab[TX_TYPES] = { IDCT_1D, IADST_1D, IDCT_1D, IADST_1D, diff --git a/media/libaom/src/av1/common/x86/av1_txfm_sse4.c b/media/libaom/src/av1/common/x86/av1_txfm_sse4.c index 90b9879cc..65ccd1952 100644 --- a/media/libaom/src/av1/common/x86/av1_txfm_sse4.c +++ b/media/libaom/src/av1/common/x86/av1_txfm_sse4.c @@ -9,7 +9,7 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" #include "av1/common/av1_txfm.h" #include "av1/common/x86/av1_txfm_sse4.h" diff --git a/media/libaom/src/av1/common/x86/cfl_avx2.c b/media/libaom/src/av1/common/x86/cfl_avx2.c index a8bfdcce6..d9c6f99d5 100644 --- a/media/libaom/src/av1/common/x86/cfl_avx2.c +++ b/media/libaom/src/av1/common/x86/cfl_avx2.c @@ -16,34 +16,34 @@ #include "av1/common/x86/cfl_simd.h" -#define CFL_GET_SUBSAMPLE_FUNCTION_AVX2(sub, bd) \ - CFL_SUBSAMPLE(avx2, sub, bd, 32, 32) \ - CFL_SUBSAMPLE(avx2, sub, bd, 32, 16) \ - CFL_SUBSAMPLE(avx2, sub, bd, 32, 8) \ - cfl_subsample_##bd##_fn cfl_get_luma_subsampling_##sub##_##bd##_avx2( \ - TX_SIZE tx_size) { \ - static const cfl_subsample_##bd##_fn subfn_##sub[TX_SIZES_ALL] = { \ - subsample_##bd##_##sub##_4x4_ssse3, /* 4x4 */ \ - subsample_##bd##_##sub##_8x8_ssse3, /* 8x8 */ \ - subsample_##bd##_##sub##_16x16_ssse3, /* 16x16 */ \ - subsample_##bd##_##sub##_32x32_avx2, /* 32x32 */ \ - 
cfl_subsample_##bd##_null, /* 64x64 (invalid CFL size) */ \ - subsample_##bd##_##sub##_4x8_ssse3, /* 4x8 */ \ - subsample_##bd##_##sub##_8x4_ssse3, /* 8x4 */ \ - subsample_##bd##_##sub##_8x16_ssse3, /* 8x16 */ \ - subsample_##bd##_##sub##_16x8_ssse3, /* 16x8 */ \ - subsample_##bd##_##sub##_16x32_ssse3, /* 16x32 */ \ - subsample_##bd##_##sub##_32x16_avx2, /* 32x16 */ \ - cfl_subsample_##bd##_null, /* 32x64 (invalid CFL size) */ \ - cfl_subsample_##bd##_null, /* 64x32 (invalid CFL size) */ \ - subsample_##bd##_##sub##_4x16_ssse3, /* 4x16 */ \ - subsample_##bd##_##sub##_16x4_ssse3, /* 16x4 */ \ - subsample_##bd##_##sub##_8x32_ssse3, /* 8x32 */ \ - subsample_##bd##_##sub##_32x8_avx2, /* 32x8 */ \ - cfl_subsample_##bd##_null, /* 16x64 (invalid CFL size) */ \ - cfl_subsample_##bd##_null, /* 64x16 (invalid CFL size) */ \ - }; \ - return subfn_##sub[tx_size]; \ +#define CFL_GET_SUBSAMPLE_FUNCTION_AVX2(sub, bd) \ + CFL_SUBSAMPLE(avx2, sub, bd, 32, 32) \ + CFL_SUBSAMPLE(avx2, sub, bd, 32, 16) \ + CFL_SUBSAMPLE(avx2, sub, bd, 32, 8) \ + cfl_subsample_##bd##_fn cfl_get_luma_subsampling_##sub##_##bd##_avx2( \ + TX_SIZE tx_size) { \ + static const cfl_subsample_##bd##_fn subfn_##sub[TX_SIZES_ALL] = { \ + cfl_subsample_##bd##_##sub##_4x4_ssse3, /* 4x4 */ \ + cfl_subsample_##bd##_##sub##_8x8_ssse3, /* 8x8 */ \ + cfl_subsample_##bd##_##sub##_16x16_ssse3, /* 16x16 */ \ + cfl_subsample_##bd##_##sub##_32x32_avx2, /* 32x32 */ \ + NULL, /* 64x64 (invalid CFL size) */ \ + cfl_subsample_##bd##_##sub##_4x8_ssse3, /* 4x8 */ \ + cfl_subsample_##bd##_##sub##_8x4_ssse3, /* 8x4 */ \ + cfl_subsample_##bd##_##sub##_8x16_ssse3, /* 8x16 */ \ + cfl_subsample_##bd##_##sub##_16x8_ssse3, /* 16x8 */ \ + cfl_subsample_##bd##_##sub##_16x32_ssse3, /* 16x32 */ \ + cfl_subsample_##bd##_##sub##_32x16_avx2, /* 32x16 */ \ + NULL, /* 32x64 (invalid CFL size) */ \ + NULL, /* 64x32 (invalid CFL size) */ \ + cfl_subsample_##bd##_##sub##_4x16_ssse3, /* 4x16 */ \ + cfl_subsample_##bd##_##sub##_16x4_ssse3, /* 16x4 */ 
\ + cfl_subsample_##bd##_##sub##_8x32_ssse3, /* 8x32 */ \ + cfl_subsample_##bd##_##sub##_32x8_avx2, /* 32x8 */ \ + NULL, /* 16x64 (invalid CFL size) */ \ + NULL, /* 64x16 (invalid CFL size) */ \ + }; \ + return subfn_##sub[tx_size]; \ } /** @@ -147,6 +147,7 @@ static void cfl_luma_subsampling_444_lbd_avx2(const uint8_t *input, CFL_GET_SUBSAMPLE_FUNCTION_AVX2(444, lbd) +#if CONFIG_AV1_HIGHBITDEPTH /** * Adds 4 pixels (in a 2x2 grid) and multiplies them by 2. Resulting in a more * precise version of a box filter 4:2:0 pixel subsampling in Q3. @@ -238,6 +239,7 @@ static void cfl_luma_subsampling_444_hbd_avx2(const uint16_t *input, } CFL_GET_SUBSAMPLE_FUNCTION_AVX2(444, hbd) +#endif // CONFIG_AV1_HIGHBITDEPTH static INLINE __m256i predict_unclipped(const __m256i *input, __m256i alpha_q12, __m256i alpha_sign, __m256i dc_q0) { @@ -273,33 +275,34 @@ CFL_PREDICT_X(avx2, 32, 8, lbd); CFL_PREDICT_X(avx2, 32, 16, lbd); CFL_PREDICT_X(avx2, 32, 32, lbd); -cfl_predict_lbd_fn get_predict_lbd_fn_avx2(TX_SIZE tx_size) { +cfl_predict_lbd_fn cfl_get_predict_lbd_fn_avx2(TX_SIZE tx_size) { static const cfl_predict_lbd_fn pred[TX_SIZES_ALL] = { - predict_lbd_4x4_ssse3, /* 4x4 */ - predict_lbd_8x8_ssse3, /* 8x8 */ - predict_lbd_16x16_ssse3, /* 16x16 */ - predict_lbd_32x32_avx2, /* 32x32 */ - cfl_predict_lbd_null, /* 64x64 (invalid CFL size) */ - predict_lbd_4x8_ssse3, /* 4x8 */ - predict_lbd_8x4_ssse3, /* 8x4 */ - predict_lbd_8x16_ssse3, /* 8x16 */ - predict_lbd_16x8_ssse3, /* 16x8 */ - predict_lbd_16x32_ssse3, /* 16x32 */ - predict_lbd_32x16_avx2, /* 32x16 */ - cfl_predict_lbd_null, /* 32x64 (invalid CFL size) */ - cfl_predict_lbd_null, /* 64x32 (invalid CFL size) */ - predict_lbd_4x16_ssse3, /* 4x16 */ - predict_lbd_16x4_ssse3, /* 16x4 */ - predict_lbd_8x32_ssse3, /* 8x32 */ - predict_lbd_32x8_avx2, /* 32x8 */ - cfl_predict_lbd_null, /* 16x64 (invalid CFL size) */ - cfl_predict_lbd_null, /* 64x16 (invalid CFL size) */ + cfl_predict_lbd_4x4_ssse3, /* 4x4 */ + cfl_predict_lbd_8x8_ssse3, 
/* 8x8 */ + cfl_predict_lbd_16x16_ssse3, /* 16x16 */ + cfl_predict_lbd_32x32_avx2, /* 32x32 */ + NULL, /* 64x64 (invalid CFL size) */ + cfl_predict_lbd_4x8_ssse3, /* 4x8 */ + cfl_predict_lbd_8x4_ssse3, /* 8x4 */ + cfl_predict_lbd_8x16_ssse3, /* 8x16 */ + cfl_predict_lbd_16x8_ssse3, /* 16x8 */ + cfl_predict_lbd_16x32_ssse3, /* 16x32 */ + cfl_predict_lbd_32x16_avx2, /* 32x16 */ + NULL, /* 32x64 (invalid CFL size) */ + NULL, /* 64x32 (invalid CFL size) */ + cfl_predict_lbd_4x16_ssse3, /* 4x16 */ + cfl_predict_lbd_16x4_ssse3, /* 16x4 */ + cfl_predict_lbd_8x32_ssse3, /* 8x32 */ + cfl_predict_lbd_32x8_avx2, /* 32x8 */ + NULL, /* 16x64 (invalid CFL size) */ + NULL, /* 64x16 (invalid CFL size) */ }; // Modulo TX_SIZES_ALL to ensure that an attacker won't be able to index the // function pointer array out of bounds. return pred[tx_size % TX_SIZES_ALL]; } +#if CONFIG_AV1_HIGHBITDEPTH static __m256i highbd_max_epi16(int bd) { const __m256i neg_one = _mm256_set1_epi16(-1); // (1 << bd) - 1 => -(-1 << bd) -1 => -1 - (-1 << bd) => -1 ^ (-1 << bd) @@ -346,32 +349,33 @@ CFL_PREDICT_X(avx2, 32, 8, hbd) CFL_PREDICT_X(avx2, 32, 16, hbd) CFL_PREDICT_X(avx2, 32, 32, hbd) -cfl_predict_hbd_fn get_predict_hbd_fn_avx2(TX_SIZE tx_size) { +cfl_predict_hbd_fn cfl_get_predict_hbd_fn_avx2(TX_SIZE tx_size) { static const cfl_predict_hbd_fn pred[TX_SIZES_ALL] = { - predict_hbd_4x4_ssse3, /* 4x4 */ - predict_hbd_8x8_ssse3, /* 8x8 */ - predict_hbd_16x16_avx2, /* 16x16 */ - predict_hbd_32x32_avx2, /* 32x32 */ - cfl_predict_hbd_null, /* 64x64 (invalid CFL size) */ - predict_hbd_4x8_ssse3, /* 4x8 */ - predict_hbd_8x4_ssse3, /* 8x4 */ - predict_hbd_8x16_ssse3, /* 8x16 */ - predict_hbd_16x8_avx2, /* 16x8 */ - predict_hbd_16x32_avx2, /* 16x32 */ - predict_hbd_32x16_avx2, /* 32x16 */ - cfl_predict_hbd_null, /* 32x64 (invalid CFL size) */ - cfl_predict_hbd_null, /* 64x32 (invalid CFL size) */ - predict_hbd_4x16_ssse3, /* 4x16 */ - predict_hbd_16x4_avx2, /* 16x4 */ - predict_hbd_8x32_ssse3, /* 8x32 */ - 
predict_hbd_32x8_avx2, /* 32x8 */ - cfl_predict_hbd_null, /* 16x64 (invalid CFL size) */ - cfl_predict_hbd_null, /* 64x16 (invalid CFL size) */ + cfl_predict_hbd_4x4_ssse3, /* 4x4 */ + cfl_predict_hbd_8x8_ssse3, /* 8x8 */ + cfl_predict_hbd_16x16_avx2, /* 16x16 */ + cfl_predict_hbd_32x32_avx2, /* 32x32 */ + NULL, /* 64x64 (invalid CFL size) */ + cfl_predict_hbd_4x8_ssse3, /* 4x8 */ + cfl_predict_hbd_8x4_ssse3, /* 8x4 */ + cfl_predict_hbd_8x16_ssse3, /* 8x16 */ + cfl_predict_hbd_16x8_avx2, /* 16x8 */ + cfl_predict_hbd_16x32_avx2, /* 16x32 */ + cfl_predict_hbd_32x16_avx2, /* 32x16 */ + NULL, /* 32x64 (invalid CFL size) */ + NULL, /* 64x32 (invalid CFL size) */ + cfl_predict_hbd_4x16_ssse3, /* 4x16 */ + cfl_predict_hbd_16x4_avx2, /* 16x4 */ + cfl_predict_hbd_8x32_ssse3, /* 8x32 */ + cfl_predict_hbd_32x8_avx2, /* 32x8 */ + NULL, /* 16x64 (invalid CFL size) */ + NULL, /* 64x16 (invalid CFL size) */ }; // Modulo TX_SIZES_ALL to ensure that an attacker won't be able to index the // function pointer array out of bounds. return pred[tx_size % TX_SIZES_ALL]; } +#endif // CONFIG_AV1_HIGHBITDEPTH // Returns a vector where all the (32-bits) elements are the sum of all the // lanes in a. @@ -463,27 +467,27 @@ CFL_SUB_AVG_X(avx2, 32, 32, 512, 10) // Based on the observation that for small blocks AVX2 does not outperform // SSE2, we call the SSE2 code for block widths 4 and 8. 
-cfl_subtract_average_fn get_subtract_average_fn_avx2(TX_SIZE tx_size) { +cfl_subtract_average_fn cfl_get_subtract_average_fn_avx2(TX_SIZE tx_size) { static const cfl_subtract_average_fn sub_avg[TX_SIZES_ALL] = { - subtract_average_4x4_sse2, /* 4x4 */ - subtract_average_8x8_sse2, /* 8x8 */ - subtract_average_16x16_avx2, /* 16x16 */ - subtract_average_32x32_avx2, /* 32x32 */ - cfl_subtract_average_null, /* 64x64 (invalid CFL size) */ - subtract_average_4x8_sse2, /* 4x8 */ - subtract_average_8x4_sse2, /* 8x4 */ - subtract_average_8x16_sse2, /* 8x16 */ - subtract_average_16x8_avx2, /* 16x8 */ - subtract_average_16x32_avx2, /* 16x32 */ - subtract_average_32x16_avx2, /* 32x16 */ - cfl_subtract_average_null, /* 32x64 (invalid CFL size) */ - cfl_subtract_average_null, /* 64x32 (invalid CFL size) */ - subtract_average_4x16_sse2, /* 4x16 */ - subtract_average_16x4_avx2, /* 16x4 */ - subtract_average_8x32_sse2, /* 8x32 */ - subtract_average_32x8_avx2, /* 32x8 */ - cfl_subtract_average_null, /* 16x64 (invalid CFL size) */ - cfl_subtract_average_null, /* 64x16 (invalid CFL size) */ + cfl_subtract_average_4x4_sse2, /* 4x4 */ + cfl_subtract_average_8x8_sse2, /* 8x8 */ + cfl_subtract_average_16x16_avx2, /* 16x16 */ + cfl_subtract_average_32x32_avx2, /* 32x32 */ + NULL, /* 64x64 (invalid CFL size) */ + cfl_subtract_average_4x8_sse2, /* 4x8 */ + cfl_subtract_average_8x4_sse2, /* 8x4 */ + cfl_subtract_average_8x16_sse2, /* 8x16 */ + cfl_subtract_average_16x8_avx2, /* 16x8 */ + cfl_subtract_average_16x32_avx2, /* 16x32 */ + cfl_subtract_average_32x16_avx2, /* 32x16 */ + NULL, /* 32x64 (invalid CFL size) */ + NULL, /* 64x32 (invalid CFL size) */ + cfl_subtract_average_4x16_sse2, /* 4x16 */ + cfl_subtract_average_16x4_avx2, /* 16x4 */ + cfl_subtract_average_8x32_sse2, /* 8x32 */ + cfl_subtract_average_32x8_avx2, /* 32x8 */ + NULL, /* 16x64 (invalid CFL size) */ + NULL, /* 64x16 (invalid CFL size) */ }; // Modulo TX_SIZES_ALL to ensure that an attacker won't be able to // index the 
function pointer array out of bounds. diff --git a/media/libaom/src/av1/common/x86/cfl_simd.h b/media/libaom/src/av1/common/x86/cfl_simd.h index 3b342cd4e..03ae02a92 100644 --- a/media/libaom/src/av1/common/x86/cfl_simd.h +++ b/media/libaom/src/av1/common/x86/cfl_simd.h @@ -15,229 +15,232 @@ #include "av1/common/blockd.h" // SSSE3 version is optimal for with == 4, we reuse them in AVX2 -void subsample_lbd_420_4x4_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); -void subsample_lbd_420_4x8_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); -void subsample_lbd_420_4x16_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); +void cfl_subsample_lbd_420_4x4_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_420_4x8_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_420_4x16_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); // SSSE3 version is optimal for with == 8, we reuse it in AVX2 -void subsample_lbd_420_8x4_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); -void subsample_lbd_420_8x8_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); -void subsample_lbd_420_8x16_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); -void subsample_lbd_420_8x32_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); +void cfl_subsample_lbd_420_8x4_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_420_8x8_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_420_8x16_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_420_8x32_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); // SSSE3 version is optimal for with == 16, we reuse it in AVX2 -void subsample_lbd_420_16x4_ssse3(const uint8_t *input, 
int input_stride, - uint16_t *output_q3); -void subsample_lbd_420_16x8_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); -void subsample_lbd_420_16x16_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); -void subsample_lbd_420_16x32_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); +void cfl_subsample_lbd_420_16x4_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_420_16x8_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_420_16x16_ssse3(const uint8_t *cfl_type, + int input_stride, uint16_t *output_q3); +void cfl_subsample_lbd_420_16x32_ssse3(const uint8_t *cfl_type, + int input_stride, uint16_t *output_q3); // SSSE3 version is optimal for with == 4, we reuse them in AVX2 -void subsample_lbd_422_4x4_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); -void subsample_lbd_422_4x8_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); -void subsample_lbd_422_4x16_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); +void cfl_subsample_lbd_422_4x4_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_422_4x8_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_422_4x16_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); // SSSE3 version is optimal for with == 8, we reuse it in AVX2 -void subsample_lbd_422_8x4_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); -void subsample_lbd_422_8x8_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); -void subsample_lbd_422_8x16_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); -void subsample_lbd_422_8x32_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); +void cfl_subsample_lbd_422_8x4_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t 
*output_q3); +void cfl_subsample_lbd_422_8x8_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_422_8x16_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_422_8x32_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); // SSSE3 version is optimal for with == 16, we reuse it in AVX2 -void subsample_lbd_422_16x4_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); -void subsample_lbd_422_16x8_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); -void subsample_lbd_422_16x16_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); -void subsample_lbd_422_16x32_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); +void cfl_subsample_lbd_422_16x4_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_422_16x8_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_422_16x16_ssse3(const uint8_t *cfl_type, + int input_stride, uint16_t *output_q3); +void cfl_subsample_lbd_422_16x32_ssse3(const uint8_t *cfl_type, + int input_stride, uint16_t *output_q3); // SSSE3 version is optimal for with == 4, we reuse them in AVX2 -void subsample_lbd_444_4x4_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); -void subsample_lbd_444_4x8_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); -void subsample_lbd_444_4x16_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); +void cfl_subsample_lbd_444_4x4_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_444_4x8_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_444_4x16_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); // SSSE3 version is optimal for with == 8, we reuse it in AVX2 -void subsample_lbd_444_8x4_ssse3(const 
uint8_t *input, int input_stride, - uint16_t *output_q3); -void subsample_lbd_444_8x8_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); -void subsample_lbd_444_8x16_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); -void subsample_lbd_444_8x32_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); +void cfl_subsample_lbd_444_8x4_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_444_8x8_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_444_8x16_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_444_8x32_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); // SSSE3 version is optimal for with == 16, we reuse it in AVX2 -void subsample_lbd_444_16x4_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); -void subsample_lbd_444_16x8_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); -void subsample_lbd_444_16x16_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); -void subsample_lbd_444_16x32_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); - -void subsample_hbd_420_4x4_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); -void subsample_hbd_420_4x8_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); -void subsample_hbd_420_4x16_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); +void cfl_subsample_lbd_444_16x4_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_444_16x8_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_444_16x16_ssse3(const uint8_t *cfl_type, + int input_stride, uint16_t *output_q3); +void cfl_subsample_lbd_444_16x32_ssse3(const uint8_t *cfl_type, + int input_stride, uint16_t *output_q3); + +#if CONFIG_AV1_HIGHBITDEPTH 
+void cfl_subsample_hbd_420_4x4_ssse3(const uint16_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_hbd_420_4x8_ssse3(const uint16_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_hbd_420_4x16_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); // SSSE3 version is optimal for with == 8, we reuse it in AVX2 -void subsample_hbd_420_8x4_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); -void subsample_hbd_420_8x8_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); -void subsample_hbd_420_8x16_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); -void subsample_hbd_420_8x32_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); +void cfl_subsample_hbd_420_8x4_ssse3(const uint16_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_hbd_420_8x8_ssse3(const uint16_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_hbd_420_8x16_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); +void cfl_subsample_hbd_420_8x32_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); // SSSE3 version is faster for with == 16, we reuse it in AVX2 -void subsample_hbd_420_16x4_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); -void subsample_hbd_420_16x8_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); -void subsample_hbd_420_16x16_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); -void subsample_hbd_420_16x32_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); - -void subsample_hbd_422_4x4_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); -void subsample_hbd_422_4x8_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); -void subsample_hbd_422_4x16_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); +void 
cfl_subsample_hbd_420_16x4_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); +void cfl_subsample_hbd_420_16x8_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); +void cfl_subsample_hbd_420_16x16_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); +void cfl_subsample_hbd_420_16x32_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); + +void cfl_subsample_hbd_422_4x4_ssse3(const uint16_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_hbd_422_4x8_ssse3(const uint16_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_hbd_422_4x16_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); // SSSE3 version is optimal for with == 8, we reuse it in AVX2 -void subsample_hbd_422_8x4_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); -void subsample_hbd_422_8x8_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); -void subsample_hbd_422_8x16_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); -void subsample_hbd_422_8x32_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); +void cfl_subsample_hbd_422_8x4_ssse3(const uint16_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_hbd_422_8x8_ssse3(const uint16_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_hbd_422_8x16_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); +void cfl_subsample_hbd_422_8x32_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); // SSSE3 version is faster for with == 16, we reuse it in AVX2 -void subsample_hbd_422_16x4_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); -void subsample_hbd_422_16x8_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); -void subsample_hbd_422_16x16_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); 
-void subsample_hbd_422_16x32_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); - -void subsample_hbd_444_4x4_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); -void subsample_hbd_444_4x8_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); -void subsample_hbd_444_4x16_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); +void cfl_subsample_hbd_422_16x4_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); +void cfl_subsample_hbd_422_16x8_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); +void cfl_subsample_hbd_422_16x16_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); +void cfl_subsample_hbd_422_16x32_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); + +void cfl_subsample_hbd_444_4x4_ssse3(const uint16_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_hbd_444_4x8_ssse3(const uint16_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_hbd_444_4x16_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); // SSSE3 version is optimal for with == 8, we reuse it in AVX2 -void subsample_hbd_444_8x4_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); -void subsample_hbd_444_8x8_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); -void subsample_hbd_444_8x16_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); -void subsample_hbd_444_8x32_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); +void cfl_subsample_hbd_444_8x4_ssse3(const uint16_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_hbd_444_8x8_ssse3(const uint16_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_hbd_444_8x16_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); +void cfl_subsample_hbd_444_8x32_ssse3(const uint16_t *cfl_type, + 
int input_stride, uint16_t *output_q3); // SSSE3 version is faster for with == 16, we reuse it in AVX2 -void subsample_hbd_444_16x4_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); -void subsample_hbd_444_16x8_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); -void subsample_hbd_444_16x16_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); -void subsample_hbd_444_16x32_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); +void cfl_subsample_hbd_444_16x4_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); +void cfl_subsample_hbd_444_16x8_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); +void cfl_subsample_hbd_444_16x16_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); +void cfl_subsample_hbd_444_16x32_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); +#endif // CONFIG_AV1_HIGHBITDEPTH // SSE2 version is optimal for with == 4, we reuse them in AVX2 -void subtract_average_4x4_sse2(const uint16_t *src, int16_t *dst); -void subtract_average_4x8_sse2(const uint16_t *src, int16_t *dst); -void subtract_average_4x16_sse2(const uint16_t *src, int16_t *dst); +void cfl_subtract_average_4x4_sse2(const uint16_t *src, int16_t *dst); +void cfl_subtract_average_4x8_sse2(const uint16_t *src, int16_t *dst); +void cfl_subtract_average_4x16_sse2(const uint16_t *src, int16_t *dst); // SSE2 version is optimal for with == 8, we reuse them in AVX2 -void subtract_average_8x4_sse2(const uint16_t *src, int16_t *dst); -void subtract_average_8x8_sse2(const uint16_t *src, int16_t *dst); -void subtract_average_8x16_sse2(const uint16_t *src, int16_t *dst); -void subtract_average_8x32_sse2(const uint16_t *src, int16_t *dst); - -void predict_lbd_4x4_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, - int dst_stride, int alpha_q3); -void predict_lbd_4x8_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, - int dst_stride, int 
alpha_q3); -void predict_lbd_4x16_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, - int dst_stride, int alpha_q3); - -void predict_lbd_8x4_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, - int dst_stride, int alpha_q3); -void predict_lbd_8x8_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, - int dst_stride, int alpha_q3); -void predict_lbd_8x16_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, - int dst_stride, int alpha_q3); -void predict_lbd_8x32_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, - int dst_stride, int alpha_q3); - -void predict_lbd_16x4_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, - int dst_stride, int alpha_q3); -void predict_lbd_16x8_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, - int dst_stride, int alpha_q3); -void predict_lbd_16x16_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, - int dst_stride, int alpha_q3); -void predict_lbd_16x32_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, - int dst_stride, int alpha_q3); - -void predict_hbd_4x4_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, - int dst_stride, int alpha_q3, int bd); -void predict_hbd_4x8_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, - int dst_stride, int alpha_q3, int bd); -void predict_hbd_4x16_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, - int dst_stride, int alpha_q3, int bd); - -void predict_hbd_8x4_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, - int dst_stride, int alpha_q3, int bd); -void predict_hbd_8x8_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, - int dst_stride, int alpha_q3, int bd); -void predict_hbd_8x16_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, - int dst_stride, int alpha_q3, int bd); -void predict_hbd_8x32_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, - int dst_stride, int alpha_q3, int bd); - -void predict_hbd_16x4_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, - int dst_stride, int alpha_q3, int bd); -void predict_hbd_16x8_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, - int dst_stride, int alpha_q3, int bd); -void 
predict_hbd_16x16_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, - int dst_stride, int alpha_q3, int bd); -void predict_hbd_16x32_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, - int dst_stride, int alpha_q3, int bd); - +void cfl_subtract_average_8x4_sse2(const uint16_t *src, int16_t *dst); +void cfl_subtract_average_8x8_sse2(const uint16_t *src, int16_t *dst); +void cfl_subtract_average_8x16_sse2(const uint16_t *src, int16_t *dst); +void cfl_subtract_average_8x32_sse2(const uint16_t *src, int16_t *dst); + +void cfl_predict_lbd_4x4_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, + int dst_stride, int alpha_q3); +void cfl_predict_lbd_4x8_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, + int dst_stride, int alpha_q3); +void cfl_predict_lbd_4x16_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, + int dst_stride, int alpha_q3); + +void cfl_predict_lbd_8x4_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, + int dst_stride, int alpha_q3); +void cfl_predict_lbd_8x8_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, + int dst_stride, int alpha_q3); +void cfl_predict_lbd_8x16_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, + int dst_stride, int alpha_q3); +void cfl_predict_lbd_8x32_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, + int dst_stride, int alpha_q3); + +void cfl_predict_lbd_16x4_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, + int dst_stride, int alpha_q3); +void cfl_predict_lbd_16x8_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, + int dst_stride, int alpha_q3); +void cfl_predict_lbd_16x16_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, + int dst_stride, int alpha_q3); +void cfl_predict_lbd_16x32_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, + int dst_stride, int alpha_q3); + +#if CONFIG_AV1_HIGHBITDEPTH +void cfl_predict_hbd_4x4_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, + int dst_stride, int alpha_q3, int bd); +void cfl_predict_hbd_4x8_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, + int dst_stride, int alpha_q3, int bd); +void 
cfl_predict_hbd_4x16_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, + int dst_stride, int alpha_q3, int bd); + +void cfl_predict_hbd_8x4_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, + int dst_stride, int alpha_q3, int bd); +void cfl_predict_hbd_8x8_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, + int dst_stride, int alpha_q3, int bd); +void cfl_predict_hbd_8x16_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, + int dst_stride, int alpha_q3, int bd); +void cfl_predict_hbd_8x32_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, + int dst_stride, int alpha_q3, int bd); + +void cfl_predict_hbd_16x4_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, + int dst_stride, int alpha_q3, int bd); +void cfl_predict_hbd_16x8_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, + int dst_stride, int alpha_q3, int bd); +void cfl_predict_hbd_16x16_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, + int dst_stride, int alpha_q3, int bd); +void cfl_predict_hbd_16x32_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, + int dst_stride, int alpha_q3, int bd); +#endif // CONFIG_AV1_HIGHBITDEPTH #endif // AOM_AV1_COMMON_X86_CFL_SIMD_H_ diff --git a/media/libaom/src/av1/common/x86/cfl_ssse3.c b/media/libaom/src/av1/common/x86/cfl_ssse3.c index bbf007295..476b6609a 100644 --- a/media/libaom/src/av1/common/x86/cfl_ssse3.c +++ b/media/libaom/src/av1/common/x86/cfl_ssse3.c @@ -168,6 +168,7 @@ static INLINE void cfl_luma_subsampling_444_lbd_ssse3(const uint8_t *input, } while (pred_buf_m128i < end); } +#if CONFIG_AV1_HIGHBITDEPTH /** * Adds 4 pixels (in a 2x2 grid) and multiplies them by 2. Resulting in a more * precise version of a box filter 4:2:0 pixel subsampling in Q3. 
@@ -296,6 +297,7 @@ static INLINE void cfl_luma_subsampling_444_hbd_ssse3(const uint16_t *input, pred_buf_q3 += CFL_BUF_LINE; } while (pred_buf_q3 < end); } +#endif // CONFIG_AV1_HIGHBITDEPTH CFL_GET_SUBSAMPLE_FUNCTION(ssse3) @@ -341,6 +343,7 @@ static INLINE void cfl_predict_lbd_ssse3(const int16_t *pred_buf_q3, CFL_PREDICT_FN(ssse3, lbd) +#if CONFIG_AV1_HIGHBITDEPTH static INLINE __m128i highbd_max_epi16(int bd) { const __m128i neg_one = _mm_set1_epi16(-1); // (1 << bd) - 1 => -(-1 << bd) -1 => -1 - (-1 << bd) => -1 ^ (-1 << bd) @@ -391,3 +394,4 @@ static INLINE void cfl_predict_hbd_ssse3(const int16_t *pred_buf_q3, } CFL_PREDICT_FN(ssse3, hbd) +#endif // CONFIG_AV1_HIGHBITDEPTH diff --git a/media/libaom/src/av1/common/x86/convolve_2d_avx2.c b/media/libaom/src/av1/common/x86/convolve_2d_avx2.c index 0acafd044..e19575d72 100644 --- a/media/libaom/src/av1/common/x86/convolve_2d_avx2.c +++ b/media/libaom/src/av1/common/x86/convolve_2d_avx2.c @@ -24,34 +24,18 @@ void av1_convolve_2d_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, + const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params) { const int bd = 8; - - DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]); - int im_h = h + filter_params_y->taps - 1; int im_stride = 8; - int i, j; - const int fo_vert = filter_params_y->taps / 2 - 1; - const int fo_horiz = filter_params_x->taps / 2 - 1; - const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; - + int i, is_horiz_4tap = 0, is_vert_4tap = 0; + DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]); const int bits = FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1; const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; - __m256i filt[4], coeffs_h[4], coeffs_v[4]; - assert(conv_params->round_0 
> 0); - filt[0] = _mm256_load_si256((__m256i const *)filt_global_avx2); - filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32)); - filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); - filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3)); - - prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs_h); - prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_v); - const __m256i round_const_h = _mm256_set1_epi16( ((1 << (conv_params->round_0 - 1)) >> 1) + (1 << (bd + FILTER_BITS - 2))); const __m128i round_shift_h = _mm_cvtsi32_si128(conv_params->round_0 - 1); @@ -65,58 +49,96 @@ void av1_convolve_2d_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, ((1 << (offset_bits - conv_params->round_1)) >> 1)); const __m128i round_shift_v = _mm_cvtsi32_si128(bits); - for (j = 0; j < w; j += 8) { - for (i = 0; i < im_h; i += 2) { - __m256i data = _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j])); + __m256i filt[4], coeffs_h[4], coeffs_v[4]; + + filt[0] = _mm256_load_si256((__m256i const *)(filt_global_avx2)); + filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32)); + + prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs_h); + prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_v); + + // Condition for checking valid horz_filt taps + if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs_h[0], coeffs_h[3]), 0))) + is_horiz_4tap = 1; + + // Condition for checking valid vert_filt taps + if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs_v[0], coeffs_v[3]), 0))) + is_vert_4tap = 1; - // Load the next line - if (i + 1 < im_h) + // horz_filt as 4 tap and vert_filt as 8 tap + if (is_horiz_4tap) { + int im_h = h + filter_params_y->taps - 1; + const int fo_vert = filter_params_y->taps / 2 - 1; + const int fo_horiz = 1; + const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; + + // horz-filter + for (int j = 0; j < w; j += 8) { + for (i = 0; i < (im_h - 
2); i += 2) { + __m256i data = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j])); + + // Load the next line data = _mm256_inserti128_si256( data, _mm_loadu_si128( (__m128i *)&src_ptr[(i * src_stride) + j + src_stride]), 1); + __m256i res = convolve_lowbd_x_4tap(data, coeffs_h + 1, filt); + + res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), + round_shift_h); + _mm256_store_si256((__m256i *)&im_block[i * im_stride], res); + } - __m256i res = convolve_lowbd_x(data, coeffs_h, filt); + __m256i data_1 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j])); + __m256i res = convolve_lowbd_x_4tap(data_1, coeffs_h + 1, filt); res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h); - _mm256_store_si256((__m256i *)&im_block[i * im_stride], res); - } - /* Vertical filter */ - { + // vert filter + CONVOLVE_SR_VERTICAL_FILTER_8TAP; + } + } else if (is_vert_4tap) { + int im_h = h + 3; + const int fo_vert = 1; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; + + filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); + filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3)); + + for (int j = 0; j < w; j += 8) { + // horz_filter + CONVOLVE_SR_HORIZONTAL_FILTER_8TAP; + // vert_filter + __m256i s[6]; __m256i src_0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride)); __m256i src_1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride)); __m256i src_2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride)); __m256i src_3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride)); - __m256i src_4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride)); - __m256i src_5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride)); - __m256i s[8]; s[0] = _mm256_unpacklo_epi16(src_0, src_1); s[1] = _mm256_unpacklo_epi16(src_2, src_3); - s[2] = 
_mm256_unpacklo_epi16(src_4, src_5); - - s[4] = _mm256_unpackhi_epi16(src_0, src_1); - s[5] = _mm256_unpackhi_epi16(src_2, src_3); - s[6] = _mm256_unpackhi_epi16(src_4, src_5); + s[3] = _mm256_unpackhi_epi16(src_0, src_1); + s[4] = _mm256_unpackhi_epi16(src_2, src_3); for (i = 0; i < h; i += 2) { const int16_t *data = &im_block[i * im_stride]; - const __m256i s6 = - _mm256_loadu_si256((__m256i *)(data + 6 * im_stride)); - const __m256i s7 = - _mm256_loadu_si256((__m256i *)(data + 7 * im_stride)); + const __m256i s4 = + _mm256_loadu_si256((__m256i *)(data + 4 * im_stride)); + const __m256i s5 = + _mm256_loadu_si256((__m256i *)(data + 5 * im_stride)); - s[3] = _mm256_unpacklo_epi16(s6, s7); - s[7] = _mm256_unpackhi_epi16(s6, s7); + s[2] = _mm256_unpacklo_epi16(s4, s5); + s[5] = _mm256_unpackhi_epi16(s4, s5); - __m256i res_a = convolve(s, coeffs_v); - __m256i res_b = convolve(s + 4, coeffs_v); + __m256i res_a = convolve_4tap(s, coeffs_v + 1); + __m256i res_b = convolve_4tap(s + 3, coeffs_v + 1); // Combine V round and 2F-H-V round into a single rounding res_a = @@ -154,13 +176,25 @@ void av1_convolve_2d_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, s[0] = s[1]; s[1] = s[2]; - s[2] = s[3]; - + s[3] = s[4]; s[4] = s[5]; - s[5] = s[6]; - s[6] = s[7]; } } + } else { + int j; + int im_h = h + filter_params_y->taps - 1; + const int fo_vert = filter_params_y->taps / 2 - 1; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; + + filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); + filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3)); + + for (j = 0; j < w; j += 8) { + CONVOLVE_SR_HORIZONTAL_FILTER_8TAP; + + CONVOLVE_SR_VERTICAL_FILTER_8TAP; + } } } @@ -180,12 +214,12 @@ void av1_convolve_2d_copy_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const 
InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, + const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params) { (void)filter_params_x; (void)filter_params_y; - (void)subpel_x_q4; - (void)subpel_y_q4; + (void)subpel_x_qn; + (void)subpel_y_qn; (void)conv_params; if (w >= 16) { @@ -195,20 +229,20 @@ void av1_convolve_2d_copy_sr_avx2(const uint8_t *src, int src_stride, if (w == 2) { do { - memcpy(dst, src, 2 * sizeof(*src)); + memmove(dst, src, 2 * sizeof(*src)); src += src_stride; dst += dst_stride; - memcpy(dst, src, 2 * sizeof(*src)); + memmove(dst, src, 2 * sizeof(*src)); src += src_stride; dst += dst_stride; h -= 2; } while (h); } else if (w == 4) { do { - memcpy(dst, src, 4 * sizeof(*src)); + memmove(dst, src, 4 * sizeof(*src)); src += src_stride; dst += dst_stride; - memcpy(dst, src, 4 * sizeof(*src)); + memmove(dst, src, 4 * sizeof(*src)); src += src_stride; dst += dst_stride; h -= 2; diff --git a/media/libaom/src/av1/common/x86/convolve_2d_sse2.c b/media/libaom/src/av1/common/x86/convolve_2d_sse2.c index b1a62a4f6..5376ea79b 100644 --- a/media/libaom/src/av1/common/x86/convolve_2d_sse2.c +++ b/media/libaom/src/av1/common/x86/convolve_2d_sse2.c @@ -22,7 +22,7 @@ void av1_convolve_2d_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, + const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params) { const int bd = 8; @@ -45,7 +45,7 @@ void av1_convolve_2d_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, /* Horizontal filter */ { const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( - filter_params_x, subpel_x_q4 & SUBPEL_MASK); + filter_params_x, subpel_x_qn & SUBPEL_MASK); const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter); // coeffs 0 1 0 1 2 3 2 3 @@ -111,7 +111,7 @@ void 
av1_convolve_2d_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, /* Vertical filter */ { const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( - filter_params_y, subpel_y_q4 & SUBPEL_MASK); + filter_params_y, subpel_y_qn & SUBPEL_MASK); const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter); // coeffs 0 1 0 1 2 3 2 3 @@ -205,7 +205,7 @@ void av1_convolve_2d_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, __m128i *const p = (__m128i *)&dst[i * dst_stride + j]; if (w == 2) { - *(uint16_t *)p = _mm_cvtsi128_si32(res); + *(uint16_t *)p = (uint16_t)_mm_cvtsi128_si32(res); } else if (w == 4) { *(uint32_t *)p = _mm_cvtsi128_si32(res); } else { @@ -240,12 +240,12 @@ void av1_convolve_2d_copy_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, + const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params) { (void)filter_params_x; (void)filter_params_y; - (void)subpel_x_q4; - (void)subpel_y_q4; + (void)subpel_x_qn; + (void)subpel_y_qn; (void)conv_params; if (w >= 16) { @@ -255,20 +255,20 @@ void av1_convolve_2d_copy_sr_sse2(const uint8_t *src, int src_stride, if (w == 2) { do { - memcpy(dst, src, 2 * sizeof(*src)); + memmove(dst, src, 2 * sizeof(*src)); src += src_stride; dst += dst_stride; - memcpy(dst, src, 2 * sizeof(*src)); + memmove(dst, src, 2 * sizeof(*src)); src += src_stride; dst += dst_stride; h -= 2; } while (h); } else if (w == 4) { do { - memcpy(dst, src, 4 * sizeof(*src)); + memmove(dst, src, 4 * sizeof(*src)); src += src_stride; dst += dst_stride; - memcpy(dst, src, 4 * sizeof(*src)); + memmove(dst, src, 4 * sizeof(*src)); src += src_stride; dst += dst_stride; h -= 2; @@ -354,24 +354,23 @@ void av1_convolve_2d_copy_sr_sse2(const uint8_t *src, int src_stride, } } -void av1_jnt_convolve_2d_copy_sse2(const uint8_t *src, int src_stride, - uint8_t 
*dst0, int dst_stride0, int w, int h, - const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, - ConvolveParams *conv_params) { +void av1_dist_wtd_convolve_2d_copy_sse2( + const uint8_t *src, int src_stride, uint8_t *dst0, int dst_stride0, int w, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params) { const int bd = 8; CONV_BUF_TYPE *dst = conv_params->dst; int dst_stride = conv_params->dst_stride; (void)filter_params_x; (void)filter_params_y; - (void)subpel_x_q4; - (void)subpel_y_q4; + (void)subpel_x_qn; + (void)subpel_y_qn; const int bits = FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0; const int do_average = conv_params->do_average; - const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; const __m128i zero = _mm_setzero_si128(); const __m128i left_shift = _mm_cvtsi32_si128(bits); int i, j; @@ -411,14 +410,14 @@ void av1_jnt_convolve_2d_copy_sse2(const uint8_t *src, int src_stride, const __m128i data_ref_0_hi = _mm_loadu_si128((__m128i *)(&dst[j + 8])); - const __m128i comp_avg_res_lo = - comp_avg(&data_ref_0_lo, &res_unsigned_lo, &wt, use_jnt_comp_avg); + const __m128i comp_avg_res_lo = comp_avg( + &data_ref_0_lo, &res_unsigned_lo, &wt, use_dist_wtd_comp_avg); const __m128i round_result_lo = convolve_rounding( &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift); - const __m128i comp_avg_res_hi = - comp_avg(&data_ref_0_hi, &res_unsigned_hi, &wt, use_jnt_comp_avg); + const __m128i comp_avg_res_hi = comp_avg( + &data_ref_0_hi, &res_unsigned_hi, &wt, use_dist_wtd_comp_avg); const __m128i round_result_hi = convolve_rounding( &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift); @@ -449,7 +448,7 @@ void av1_jnt_convolve_2d_copy_sse2(const uint8_t *src, 
int src_stride, const __m128i data_ref_0 = _mm_loadu_si128((__m128i *)(&dst[j])); const __m128i comp_avg_res = - comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg); + comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); const __m128i round_result = convolve_rounding( &comp_avg_res, &offset_const, &rounding_const, rounding_shift); diff --git a/media/libaom/src/av1/common/x86/convolve_avx2.c b/media/libaom/src/av1/common/x86/convolve_avx2.c index 0e91ea947..1d5bc6fbd 100644 --- a/media/libaom/src/av1/common/x86/convolve_avx2.c +++ b/media/libaom/src/av1/common/x86/convolve_avx2.c @@ -21,155 +21,241 @@ void av1_convolve_y_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, + const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params) { - int i, j; - const int fo_vert = filter_params_y->taps / 2 - 1; - const uint8_t *const src_ptr = src - fo_vert * src_stride; - + int i, j, is_vert_4tap = 0; // right shift is F-1 because we are already dividing // filter co-efficients by 2 const int right_shift_bits = (FILTER_BITS - 1); const __m128i right_shift = _mm_cvtsi32_si128(right_shift_bits); const __m256i right_shift_const = _mm256_set1_epi16((1 << right_shift_bits) >> 1); - __m256i coeffs[4], s[8]; assert(conv_params->round_0 <= FILTER_BITS); assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) || ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS))); - prepare_coeffs_lowbd(filter_params_y, subpel_y_q4, coeffs); - (void)filter_params_x; - (void)subpel_x_q4; + (void)subpel_x_qn; (void)conv_params; + __m256i coeffs[4], s[8]; + __m128i d[6]; - for (j = 0; j < w; j += 16) { - const uint8_t *data = &src_ptr[j]; - __m256i src6; - - // Load lines a and b. 
Line a to lower 128, line b to upper 128 - const __m256i src_01a = _mm256_permute2x128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 0 * src_stride))), - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 1 * src_stride))), - 0x20); - - const __m256i src_12a = _mm256_permute2x128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 1 * src_stride))), - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 2 * src_stride))), - 0x20); - - const __m256i src_23a = _mm256_permute2x128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 2 * src_stride))), - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 3 * src_stride))), - 0x20); - - const __m256i src_34a = _mm256_permute2x128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 3 * src_stride))), - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 4 * src_stride))), - 0x20); - - const __m256i src_45a = _mm256_permute2x128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 4 * src_stride))), - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 5 * src_stride))), - 0x20); - - src6 = _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 6 * src_stride))); - const __m256i src_56a = _mm256_permute2x128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 5 * src_stride))), - src6, 0x20); - - s[0] = _mm256_unpacklo_epi8(src_01a, src_12a); - s[1] = _mm256_unpacklo_epi8(src_23a, src_34a); - s[2] = _mm256_unpacklo_epi8(src_45a, src_56a); - - s[4] = _mm256_unpackhi_epi8(src_01a, src_12a); - s[5] = _mm256_unpackhi_epi8(src_23a, src_34a); - s[6] = _mm256_unpackhi_epi8(src_45a, src_56a); - - for (i = 0; i < h; i += 2) { - data = &src_ptr[i * src_stride + j]; - const __m256i src_67a = _mm256_permute2x128_si256( - src6, - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 7 * src_stride))), - 0x20); + prepare_coeffs_lowbd(filter_params_y, subpel_y_qn, 
coeffs); - src6 = _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 8 * src_stride))); - const __m256i src_78a = _mm256_permute2x128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 7 * src_stride))), - src6, 0x20); + // Condition for checking valid vert_filt taps + if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs[0], coeffs[3]), 0))) + is_vert_4tap = 1; + + // vert_filt as 4 tap + if (is_vert_4tap) { + const int fo_vert = 1; + const uint8_t *const src_ptr = src - fo_vert * src_stride; + for (j = 0; j < w; j += 16) { + const uint8_t *data = &src_ptr[j]; + d[0] = _mm_loadu_si128((__m128i *)(data + 0 * src_stride)); + d[1] = _mm_loadu_si128((__m128i *)(data + 1 * src_stride)); + d[2] = _mm_loadu_si128((__m128i *)(data + 2 * src_stride)); + d[3] = _mm_loadu_si128((__m128i *)(data + 3 * src_stride)); + d[4] = _mm_loadu_si128((__m128i *)(data + 4 * src_stride)); + + // Load lines a and b. Line a to lower 128, line b to upper 128 + const __m256i src_01a = _mm256_permute2x128_si256( + _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20); + + const __m256i src_12a = _mm256_permute2x128_si256( + _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20); + + const __m256i src_23a = _mm256_permute2x128_si256( + _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20); + + const __m256i src_34a = _mm256_permute2x128_si256( + _mm256_castsi128_si256(d[3]), _mm256_castsi128_si256(d[4]), 0x20); - s[3] = _mm256_unpacklo_epi8(src_67a, src_78a); - s[7] = _mm256_unpackhi_epi8(src_67a, src_78a); + s[0] = _mm256_unpacklo_epi8(src_01a, src_12a); + s[1] = _mm256_unpacklo_epi8(src_23a, src_34a); - const __m256i res_lo = convolve_lowbd(s, coeffs); + s[3] = _mm256_unpackhi_epi8(src_01a, src_12a); + s[4] = _mm256_unpackhi_epi8(src_23a, src_34a); - /* rounding code */ - // shift by F - 1 - const __m256i res_16b_lo = _mm256_sra_epi16( - _mm256_add_epi16(res_lo, right_shift_const), right_shift); - // 8 bit conversion and saturation 
to uint8 - __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo); + for (i = 0; i < h; i += 2) { + data = &src_ptr[i * src_stride + j]; + d[5] = _mm_loadu_si128((__m128i *)(data + 5 * src_stride)); + const __m256i src_45a = _mm256_permute2x128_si256( + _mm256_castsi128_si256(d[4]), _mm256_castsi128_si256(d[5]), 0x20); - if (w - j > 8) { - const __m256i res_hi = convolve_lowbd(s + 4, coeffs); + d[4] = _mm_loadu_si128((__m128i *)(data + 6 * src_stride)); + const __m256i src_56a = _mm256_permute2x128_si256( + _mm256_castsi128_si256(d[5]), _mm256_castsi128_si256(d[4]), 0x20); + s[2] = _mm256_unpacklo_epi8(src_45a, src_56a); + s[5] = _mm256_unpackhi_epi8(src_45a, src_56a); + + const __m256i res_lo = convolve_lowbd_4tap(s, coeffs + 1); /* rounding code */ // shift by F - 1 - const __m256i res_16b_hi = _mm256_sra_epi16( - _mm256_add_epi16(res_hi, right_shift_const), right_shift); + const __m256i res_16b_lo = _mm256_sra_epi16( + _mm256_add_epi16(res_lo, right_shift_const), right_shift); // 8 bit conversion and saturation to uint8 - __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi); - - __m256i res_a = _mm256_unpacklo_epi64(res_8b_lo, res_8b_hi); - - const __m128i res_0 = _mm256_castsi256_si128(res_a); - const __m128i res_1 = _mm256_extracti128_si256(res_a, 1); - - _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_0); - _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride], - res_1); - } else { - const __m128i res_0 = _mm256_castsi256_si128(res_8b_lo); - const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1); - if (w - j > 4) { - _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0); - _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride], + __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo); + + if (w - j > 8) { + const __m256i res_hi = convolve_lowbd_4tap(s + 3, coeffs + 1); + + /* rounding code */ + // shift by F - 1 + const __m256i res_16b_hi = _mm256_sra_epi16( + _mm256_add_epi16(res_hi, 
right_shift_const), right_shift); + // 8 bit conversion and saturation to uint8 + __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi); + + __m256i res_a = _mm256_unpacklo_epi64(res_8b_lo, res_8b_hi); + + const __m128i res_0 = _mm256_castsi256_si128(res_a); + const __m128i res_1 = _mm256_extracti128_si256(res_a, 1); + + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_0); + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride], res_1); - } else if (w - j > 2) { - xx_storel_32(&dst[i * dst_stride + j], res_0); - xx_storel_32(&dst[i * dst_stride + j + dst_stride], res_1); } else { - __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j]; - __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + j + dst_stride]; - *(uint16_t *)p_0 = _mm_cvtsi128_si32(res_0); - *(uint16_t *)p_1 = _mm_cvtsi128_si32(res_1); + const __m128i res_0 = _mm256_castsi256_si128(res_8b_lo); + const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1); + if (w - j > 4) { + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0); + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride], + res_1); + } else if (w - j > 2) { + xx_storel_32(&dst[i * dst_stride + j], res_0); + xx_storel_32(&dst[i * dst_stride + j + dst_stride], res_1); + } else { + __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j]; + __m128i *const p_1 = + (__m128i *)&dst[i * dst_stride + j + dst_stride]; + *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0); + *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1); + } } + s[0] = s[1]; + s[1] = s[2]; + + s[3] = s[4]; + s[4] = s[5]; } + } + } else { + const int fo_vert = filter_params_y->taps / 2 - 1; + const uint8_t *const src_ptr = src - fo_vert * src_stride; + + for (j = 0; j < w; j += 16) { + const uint8_t *data = &src_ptr[j]; + __m256i src6; + + d[0] = _mm_loadu_si128((__m128i *)(data + 0 * src_stride)); + d[1] = _mm_loadu_si128((__m128i *)(data + 1 * src_stride)); + d[2] = _mm_loadu_si128((__m128i *)(data + 2 * 
src_stride)); + d[3] = _mm_loadu_si128((__m128i *)(data + 3 * src_stride)); + d[4] = _mm_loadu_si128((__m128i *)(data + 4 * src_stride)); + d[5] = _mm_loadu_si128((__m128i *)(data + 5 * src_stride)); + // Load lines a and b. Line a to lower 128, line b to upper 128 + const __m256i src_01a = _mm256_permute2x128_si256( + _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20); + + const __m256i src_12a = _mm256_permute2x128_si256( + _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20); + + const __m256i src_23a = _mm256_permute2x128_si256( + _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20); + + const __m256i src_34a = _mm256_permute2x128_si256( + _mm256_castsi128_si256(d[3]), _mm256_castsi128_si256(d[4]), 0x20); + + const __m256i src_45a = _mm256_permute2x128_si256( + _mm256_castsi128_si256(d[4]), _mm256_castsi128_si256(d[5]), 0x20); + + src6 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 6 * src_stride))); + const __m256i src_56a = + _mm256_permute2x128_si256(_mm256_castsi128_si256(d[5]), src6, 0x20); + + s[0] = _mm256_unpacklo_epi8(src_01a, src_12a); + s[1] = _mm256_unpacklo_epi8(src_23a, src_34a); + s[2] = _mm256_unpacklo_epi8(src_45a, src_56a); + + s[4] = _mm256_unpackhi_epi8(src_01a, src_12a); + s[5] = _mm256_unpackhi_epi8(src_23a, src_34a); + s[6] = _mm256_unpackhi_epi8(src_45a, src_56a); + + for (i = 0; i < h; i += 2) { + data = &src_ptr[i * src_stride + j]; + const __m256i src_67a = _mm256_permute2x128_si256( + src6, + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 7 * src_stride))), + 0x20); + + src6 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 8 * src_stride))); + const __m256i src_78a = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 7 * src_stride))), + src6, 0x20); + + s[3] = _mm256_unpacklo_epi8(src_67a, src_78a); + s[7] = _mm256_unpackhi_epi8(src_67a, src_78a); + + const __m256i res_lo = convolve_lowbd(s, coeffs); + + /* 
rounding code */ + // shift by F - 1 + const __m256i res_16b_lo = _mm256_sra_epi16( + _mm256_add_epi16(res_lo, right_shift_const), right_shift); + // 8 bit conversion and saturation to uint8 + __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo); - s[0] = s[1]; - s[1] = s[2]; - s[2] = s[3]; + if (w - j > 8) { + const __m256i res_hi = convolve_lowbd(s + 4, coeffs); - s[4] = s[5]; - s[5] = s[6]; - s[6] = s[7]; + /* rounding code */ + // shift by F - 1 + const __m256i res_16b_hi = _mm256_sra_epi16( + _mm256_add_epi16(res_hi, right_shift_const), right_shift); + // 8 bit conversion and saturation to uint8 + __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi); + + __m256i res_a = _mm256_unpacklo_epi64(res_8b_lo, res_8b_hi); + + const __m128i res_0 = _mm256_castsi256_si128(res_a); + const __m128i res_1 = _mm256_extracti128_si256(res_a, 1); + + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_0); + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride], + res_1); + } else { + const __m128i res_0 = _mm256_castsi256_si128(res_8b_lo); + const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1); + if (w - j > 4) { + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0); + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride], + res_1); + } else if (w - j > 2) { + xx_storel_32(&dst[i * dst_stride + j], res_0); + xx_storel_32(&dst[i * dst_stride + j + dst_stride], res_1); + } else { + __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j]; + __m128i *const p_1 = + (__m128i *)&dst[i * dst_stride + j + dst_stride]; + *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0); + *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1); + } + } + s[0] = s[1]; + s[1] = s[2]; + s[2] = s[3]; + + s[4] = s[5]; + s[5] = s[6]; + s[6] = s[7]; + } } } } @@ -178,81 +264,119 @@ void av1_convolve_x_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const 
InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, + const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params) { - int i, j; - const int fo_horiz = filter_params_x->taps / 2 - 1; - const uint8_t *const src_ptr = src - fo_horiz; const int bits = FILTER_BITS - conv_params->round_0; - __m256i filt[4], coeffs[4]; - - filt[0] = _mm256_load_si256((__m256i const *)filt_global_avx2); - filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32)); - filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); - filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3)); - - prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs); - const __m256i round_0_const = _mm256_set1_epi16((1 << (conv_params->round_0 - 1)) >> 1); const __m128i round_0_shift = _mm_cvtsi32_si128(conv_params->round_0 - 1); const __m256i round_const = _mm256_set1_epi16((1 << bits) >> 1); const __m128i round_shift = _mm_cvtsi32_si128(bits); - + int i, is_horiz_4tap = 0; (void)filter_params_y; - (void)subpel_y_q4; + (void)subpel_y_qn; assert(bits >= 0); assert((FILTER_BITS - conv_params->round_1) >= 0 || ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS)); assert(conv_params->round_0 > 0); - if (w <= 8) { - for (i = 0; i < h; i += 2) { - const __m256i data = _mm256_permute2x128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))), - _mm256_castsi128_si256(_mm_loadu_si128( - (__m128i *)(&src_ptr[i * src_stride + src_stride]))), - 0x20); - - __m256i res_16b = convolve_lowbd_x(data, coeffs, filt); - - res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const), - round_0_shift); - - res_16b = - _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const), round_shift); - - /* rounding code */ - // 8 bit conversion and saturation to uint8 - __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b); - - const __m128i res_0 = _mm256_castsi256_si128(res_8b); - const 
__m128i res_1 = _mm256_extracti128_si256(res_8b, 1); - if (w > 4) { - _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0); - _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1); - } else if (w > 2) { - xx_storel_32(&dst[i * dst_stride], res_0); - xx_storel_32(&dst[i * dst_stride + dst_stride], res_1); - } else { - __m128i *const p_0 = (__m128i *)&dst[i * dst_stride]; - __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + dst_stride]; - *(uint16_t *)p_0 = _mm_cvtsi128_si32(res_0); - *(uint16_t *)p_1 = _mm_cvtsi128_si32(res_1); + __m256i coeffs[4], filt[4]; + filt[0] = _mm256_load_si256((__m256i const *)(filt_global_avx2)); + filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32)); + + prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs); + + // Condition for checking valid horz_filt taps + if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs[0], coeffs[3]), 0))) + is_horiz_4tap = 1; + + // horz_filt as 4 tap + if (is_horiz_4tap) { + const int fo_horiz = 1; + const uint8_t *const src_ptr = src - fo_horiz; + if (w <= 8) { + for (i = 0; i < h; i += 2) { + const __m256i data = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))), + _mm256_castsi128_si256(_mm_loadu_si128( + (__m128i *)(&src_ptr[i * src_stride + src_stride]))), + 0x20); + + __m256i res_16b = convolve_lowbd_x_4tap(data, coeffs + 1, filt); + + res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const), + round_0_shift); + + res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const), + round_shift); + + /* rounding code */ + // 8 bit conversion and saturation to uint8 + __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b); + + const __m128i res_0 = _mm256_castsi256_si128(res_8b); + const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1); + + if (w > 4) { + _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0); + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1); + } 
else if (w > 2) { + xx_storel_32(&dst[i * dst_stride], res_0); + xx_storel_32(&dst[i * dst_stride + dst_stride], res_1); + } else { + __m128i *const p_0 = (__m128i *)&dst[i * dst_stride]; + __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + dst_stride]; + *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0); + *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1); + } + } + } else { + for (i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 16) { + // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 16 17 + // 18 19 20 21 22 23 + const __m256i data = _mm256_inserti128_si256( + _mm256_loadu_si256((__m256i *)&src_ptr[(i * src_stride) + j]), + _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + (j + 8)]), + 1); + + __m256i res_16b = convolve_lowbd_x_4tap(data, coeffs + 1, filt); + + res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const), + round_0_shift); + + res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const), + round_shift); + + /* rounding code */ + // 8 bit conversion and saturation to uint8 + __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b); + + // Store values into the destination buffer + // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + res_8b = _mm256_permute4x64_epi64(res_8b, 216); + __m128i res = _mm256_castsi256_si128(res_8b); + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res); + } } } } else { - for (i = 0; i < h; ++i) { - for (j = 0; j < w; j += 16) { - // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 16 17 18 - // 19 20 21 22 23 - const __m256i data = _mm256_inserti128_si256( - _mm256_loadu_si256((__m256i *)&src_ptr[(i * src_stride) + j]), - _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + (j + 8)]), - 1); + const int fo_horiz = filter_params_x->taps / 2 - 1; + const uint8_t *const src_ptr = src - fo_horiz; + filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); + filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3)); + + if (w <= 8) { + for 
(i = 0; i < h; i += 2) { + const __m256i data = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))), + _mm256_castsi128_si256(_mm_loadu_si128( + (__m128i *)(&src_ptr[i * src_stride + src_stride]))), + 0x20); __m256i res_16b = convolve_lowbd_x(data, coeffs, filt); @@ -266,11 +390,49 @@ void av1_convolve_x_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, // 8 bit conversion and saturation to uint8 __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b); - // Store values into the destination buffer - // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 - res_8b = _mm256_permute4x64_epi64(res_8b, 216); - __m128i res = _mm256_castsi256_si128(res_8b); - _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res); + const __m128i res_0 = _mm256_castsi256_si128(res_8b); + const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1); + if (w > 4) { + _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0); + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1); + } else if (w > 2) { + xx_storel_32(&dst[i * dst_stride], res_0); + xx_storel_32(&dst[i * dst_stride + dst_stride], res_1); + } else { + __m128i *const p_0 = (__m128i *)&dst[i * dst_stride]; + __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + dst_stride]; + *(uint16_t *)p_0 = _mm_cvtsi128_si32(res_0); + *(uint16_t *)p_1 = _mm_cvtsi128_si32(res_1); + } + } + } else { + for (i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 16) { + // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 16 17 + // 18 19 20 21 22 23 + const __m256i data = _mm256_inserti128_si256( + _mm256_loadu_si256((__m256i *)&src_ptr[(i * src_stride) + j]), + _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + (j + 8)]), + 1); + + __m256i res_16b = convolve_lowbd_x(data, coeffs, filt); + + res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const), + round_0_shift); + + res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const), + round_shift); + + 
/* rounding code */ + // 8 bit conversion and saturation to uint8 + __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b); + + // Store values into the destination buffer + // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + res_8b = _mm256_permute4x64_epi64(res_8b, 216); + __m128i res = _mm256_castsi256_si128(res_8b); + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res); + } } } } diff --git a/media/libaom/src/av1/common/x86/convolve_sse2.c b/media/libaom/src/av1/common/x86/convolve_sse2.c index 5016642de..4323ac4d1 100644 --- a/media/libaom/src/av1/common/x86/convolve_sse2.c +++ b/media/libaom/src/av1/common/x86/convolve_sse2.c @@ -79,7 +79,7 @@ void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, + const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params) { const int fo_vert = filter_params_y->taps / 2 - 1; const uint8_t *src_ptr = src - fo_vert * src_stride; @@ -88,14 +88,14 @@ void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, __m128i coeffs[4]; (void)filter_params_x; - (void)subpel_x_q4; + (void)subpel_x_qn; (void)conv_params; assert(conv_params->round_0 <= FILTER_BITS); assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) || ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS))); - prepare_coeffs(filter_params_y, subpel_y_q4, coeffs); + prepare_coeffs(filter_params_y, subpel_y_qn, coeffs); if (w <= 4) { __m128i s[8], src6, res, res_round, res16; @@ -132,7 +132,7 @@ void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, res_int = _mm_cvtsi128_si32(_mm_packus_epi16(res16, res16)); if (w == 2) - *(uint16_t *)dst = res_int; + *(uint16_t *)dst = (uint16_t)res_int; else *(uint32_t *)dst = res_int; @@ -145,7 +145,7 @@ void av1_convolve_y_sr_sse2(const uint8_t *src, int 
src_stride, uint8_t *dst, res_int = _mm_cvtsi128_si32(_mm_packus_epi16(res16, res16)); if (w == 2) - *(uint16_t *)dst = res_int; + *(uint16_t *)dst = (uint16_t)res_int; else *(uint32_t *)dst = res_int; @@ -240,7 +240,7 @@ void av1_convolve_x_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, + const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params) { const int fo_horiz = filter_params_x->taps / 2 - 1; const uint8_t *src_ptr = src - fo_horiz; @@ -253,13 +253,13 @@ void av1_convolve_x_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, __m128i coeffs[4]; (void)filter_params_y; - (void)subpel_y_q4; + (void)subpel_y_qn; assert(bits >= 0); assert((FILTER_BITS - conv_params->round_1) >= 0 || ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS)); - prepare_coeffs(filter_params_x, subpel_x_q4, coeffs); + prepare_coeffs(filter_params_x, subpel_x_qn, coeffs); if (w <= 4) { do { @@ -284,7 +284,7 @@ void av1_convolve_x_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, uint32_t r = _mm_cvtsi128_si32(res); if (w == 2) - *(uint16_t *)dst = r; + *(uint16_t *)dst = (uint16_t)r; else *(uint32_t *)dst = r; diff --git a/media/libaom/src/av1/common/x86/filterintra_sse4.c b/media/libaom/src/av1/common/x86/filterintra_sse4.c index c11edc1d4..99f4d9967 100644 --- a/media/libaom/src/av1/common/x86/filterintra_sse4.c +++ b/media/libaom/src/av1/common/x86/filterintra_sse4.c @@ -27,10 +27,6 @@ void av1_filter_intra_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, assert(bw <= 32 && bh <= 32); - // The initialization is just for silencing Jenkins static analysis warnings - for (r = 0; r < bh + 1; ++r) - memset(buffer[r], 0, (bw + 1) * sizeof(buffer[0][0])); - for (r = 0; r < bh; ++r) buffer[r + 1][0] = left[r]; memcpy(buffer[0], &above[-1], (bw + 1) * sizeof(uint8_t)); diff --git 
a/media/libaom/src/av1/common/x86/highbd_convolve_2d_avx2.c b/media/libaom/src/av1/common/x86/highbd_convolve_2d_avx2.c index ae68f0bbb..396aed01b 100644 --- a/media/libaom/src/av1/common/x86/highbd_convolve_2d_avx2.c +++ b/media/libaom/src/av1/common/x86/highbd_convolve_2d_avx2.c @@ -24,8 +24,8 @@ void av1_highbd_convolve_2d_sr_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, - const int subpel_x_q4, - const int subpel_y_q4, + const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params, int bd) { DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]); int im_h = h + filter_params_y->taps - 1; @@ -58,8 +58,8 @@ void av1_highbd_convolve_2d_sr_avx2(const uint16_t *src, int src_stride, _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255)); const __m256i zero = _mm256_setzero_si256(); - prepare_coeffs(filter_params_x, subpel_x_q4, coeffs_x); - prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y); + prepare_coeffs(filter_params_x, subpel_x_qn, coeffs_x); + prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y); for (j = 0; j < w; j += 8) { /* Horizontal filter */ @@ -222,12 +222,12 @@ static INLINE void copy_128(const uint16_t *src, uint16_t *dst) { void av1_highbd_convolve_2d_copy_sr_avx2( const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, const int subpel_x_q4, - const int subpel_y_q4, ConvolveParams *conv_params, int bd) { + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params, int bd) { (void)filter_params_x; (void)filter_params_y; - (void)subpel_x_q4; - (void)subpel_y_q4; + (void)subpel_x_qn; + (void)subpel_y_qn; (void)conv_params; (void)bd; @@ -238,10 +238,10 @@ void av1_highbd_convolve_2d_copy_sr_avx2( if (w == 2) { 
do { - memcpy(dst, src, 2 * sizeof(*src)); + memmove(dst, src, 2 * sizeof(*src)); src += src_stride; dst += dst_stride; - memcpy(dst, src, 2 * sizeof(*src)); + memmove(dst, src, 2 * sizeof(*src)); src += src_stride; dst += dst_stride; h -= 2; diff --git a/media/libaom/src/av1/common/x86/highbd_convolve_2d_sse2.c b/media/libaom/src/av1/common/x86/highbd_convolve_2d_sse2.c index 15f8872c1..f758775ee 100644 --- a/media/libaom/src/av1/common/x86/highbd_convolve_2d_sse2.c +++ b/media/libaom/src/av1/common/x86/highbd_convolve_2d_sse2.c @@ -74,12 +74,12 @@ static INLINE void copy_128(const uint16_t *src, uint16_t *dst) { void av1_highbd_convolve_2d_copy_sr_sse2( const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, const int subpel_x_q4, - const int subpel_y_q4, ConvolveParams *conv_params, int bd) { + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params, int bd) { (void)filter_params_x; (void)filter_params_y; - (void)subpel_x_q4; - (void)subpel_y_q4; + (void)subpel_x_qn; + (void)subpel_y_qn; (void)conv_params; (void)bd; if (w >= 16) { diff --git a/media/libaom/src/av1/common/x86/highbd_convolve_2d_sse4.c b/media/libaom/src/av1/common/x86/highbd_convolve_2d_sse4.c index 3f8dafb4b..d2ff47c1f 100644 --- a/media/libaom/src/av1/common/x86/highbd_convolve_2d_sse4.c +++ b/media/libaom/src/av1/common/x86/highbd_convolve_2d_sse4.c @@ -21,23 +21,23 @@ #include "aom_dsp/x86/convolve_sse4_1.h" #include "av1/common/convolve.h" -void av1_highbd_jnt_convolve_2d_copy_sse4_1( +void av1_highbd_dist_wtd_convolve_2d_copy_sse4_1( const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w, int h, const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, const int subpel_x_q4, - const int subpel_y_q4, ConvolveParams *conv_params, int bd) { + const InterpFilterParams 
*filter_params_y, const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params, int bd) { CONV_BUF_TYPE *dst = conv_params->dst; int dst_stride = conv_params->dst_stride; (void)filter_params_x; (void)filter_params_y; - (void)subpel_x_q4; - (void)subpel_y_q4; + (void)subpel_x_qn; + (void)subpel_y_qn; const int bits = FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0; const __m128i left_shift = _mm_cvtsi32_si128(bits); const int do_average = conv_params->do_average; - const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; const int w0 = conv_params->fwd_offset; const int w1 = conv_params->bck_offset; const __m128i wt0 = _mm_set1_epi32(w0); @@ -75,15 +75,17 @@ void av1_highbd_jnt_convolve_2d_copy_sse4_1( const __m128i res_unsigned_lo = _mm_add_epi32(res_32b_lo, offset_const); - const __m128i comp_avg_res_lo = highbd_comp_avg_sse4_1( - &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg); + const __m128i comp_avg_res_lo = + highbd_comp_avg_sse4_1(&data_ref_0_lo, &res_unsigned_lo, &wt0, + &wt1, use_dist_wtd_comp_avg); const __m128i res_32b_hi = _mm_unpackhi_epi16(res, zero); const __m128i res_unsigned_hi = _mm_add_epi32(res_32b_hi, offset_const); - const __m128i comp_avg_res_hi = highbd_comp_avg_sse4_1( - &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg); + const __m128i comp_avg_res_hi = + highbd_comp_avg_sse4_1(&data_ref_0_hi, &res_unsigned_hi, &wt0, + &wt1, use_dist_wtd_comp_avg); const __m128i round_result_lo = highbd_convolve_rounding_sse2( &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift); @@ -132,9 +134,9 @@ void av1_highbd_jnt_convolve_2d_copy_sse4_1( _mm_add_epi32(res_32b_hi, offset_const); const __m128i comp_avg_res_lo = highbd_comp_avg_sse4_1( - &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg); + &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_dist_wtd_comp_avg); const __m128i comp_avg_res_hi = 
highbd_comp_avg_sse4_1( - &data_ref_1, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg); + &data_ref_1, &res_unsigned_hi, &wt0, &wt1, use_dist_wtd_comp_avg); const __m128i round_result_lo = highbd_convolve_rounding_sse2( &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift); @@ -166,11 +168,11 @@ void av1_highbd_jnt_convolve_2d_copy_sse4_1( } } -void av1_highbd_jnt_convolve_2d_sse4_1( +void av1_highbd_dist_wtd_convolve_2d_sse4_1( const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w, int h, const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, const int subpel_x_q4, - const int subpel_y_q4, ConvolveParams *conv_params, int bd) { + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params, int bd) { DECLARE_ALIGNED(16, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]); CONV_BUF_TYPE *dst = conv_params->dst; @@ -179,7 +181,7 @@ void av1_highbd_jnt_convolve_2d_sse4_1( int im_stride = MAX_SB_SIZE; int i, j; const int do_average = conv_params->do_average; - const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; const int fo_vert = filter_params_y->taps / 2 - 1; const int fo_horiz = filter_params_x->taps / 2 - 1; const uint16_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; @@ -206,7 +208,7 @@ void av1_highbd_jnt_convolve_2d_sse4_1( /* Horizontal filter */ { const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( - filter_params_x, subpel_x_q4 & SUBPEL_MASK); + filter_params_x, subpel_x_qn & SUBPEL_MASK); const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter); // coeffs 0 1 0 1 2 3 2 3 @@ -273,7 +275,7 @@ void av1_highbd_jnt_convolve_2d_sse4_1( /* Vertical filter */ { const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( - filter_params_y, subpel_y_q4 & SUBPEL_MASK); + filter_params_y, subpel_y_qn & SUBPEL_MASK); const 
__m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter); // coeffs 0 1 0 1 2 3 2 3 @@ -359,8 +361,9 @@ void av1_highbd_jnt_convolve_2d_sse4_1( const __m128i data_ref_0 = _mm_cvtepu16_epi32(data_0); - const __m128i comp_avg_res = highbd_comp_avg_sse4_1( - &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg); + const __m128i comp_avg_res = + highbd_comp_avg_sse4_1(&data_ref_0, &res_unsigned_lo, &wt0, + &wt1, use_dist_wtd_comp_avg); const __m128i round_result = highbd_convolve_rounding_sse2( &comp_avg_res, &offset_const, &rounding_const, rounding_shift); @@ -391,10 +394,12 @@ void av1_highbd_jnt_convolve_2d_sse4_1( const __m128i data_ref_0_lo = _mm_cvtepu16_epi32(data_lo); const __m128i data_ref_0_hi = _mm_cvtepu16_epi32(data_hi); - const __m128i comp_avg_res_lo = highbd_comp_avg_sse4_1( - &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg); - const __m128i comp_avg_res_hi = highbd_comp_avg_sse4_1( - &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg); + const __m128i comp_avg_res_lo = + highbd_comp_avg_sse4_1(&data_ref_0_lo, &res_unsigned_lo, &wt0, + &wt1, use_dist_wtd_comp_avg); + const __m128i comp_avg_res_hi = + highbd_comp_avg_sse4_1(&data_ref_0_hi, &res_unsigned_hi, &wt0, + &wt1, use_dist_wtd_comp_avg); const __m128i round_result_lo = highbd_convolve_rounding_sse2(&comp_avg_res_lo, &offset_const, diff --git a/media/libaom/src/av1/common/x86/highbd_convolve_2d_ssse3.c b/media/libaom/src/av1/common/x86/highbd_convolve_2d_ssse3.c index 1d029db39..5318fcaa8 100644 --- a/media/libaom/src/av1/common/x86/highbd_convolve_2d_ssse3.c +++ b/media/libaom/src/av1/common/x86/highbd_convolve_2d_ssse3.c @@ -22,8 +22,8 @@ void av1_highbd_convolve_2d_sr_ssse3( const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, const int subpel_x_q4, - const int subpel_y_q4, ConvolveParams *conv_params, int bd) { + const InterpFilterParams 
*filter_params_y, const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params, int bd) { DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]); int im_h = h + filter_params_y->taps - 1; int im_stride = 8; @@ -54,8 +54,8 @@ void av1_highbd_convolve_2d_sr_ssse3( _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255)); const __m128i zero = _mm_setzero_si128(); - prepare_coeffs(filter_params_x, subpel_x_q4, coeffs_x); - prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y); + prepare_coeffs(filter_params_x, subpel_x_qn, coeffs_x); + prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y); for (j = 0; j < w; j += 8) { /* Horizontal filter */ diff --git a/media/libaom/src/av1/common/x86/highbd_inv_txfm_avx2.c b/media/libaom/src/av1/common/x86/highbd_inv_txfm_avx2.c index ade2af03e..93e98e4b3 100644 --- a/media/libaom/src/av1/common/x86/highbd_inv_txfm_avx2.c +++ b/media/libaom/src/av1/common/x86/highbd_inv_txfm_avx2.c @@ -18,6 +18,7 @@ #include "av1/common/idct.h" #include "av1/common/x86/av1_inv_txfm_ssse3.h" #include "av1/common/x86/highbd_txfm_utility_sse4.h" +#include "aom_dsp/x86/txfm_common_avx2.h" // Note: // Total 32x4 registers to represent 32x32 block coefficients. 
@@ -46,6 +47,47 @@ static INLINE __m256i highbd_clamp_epi16_avx2(__m256i u, int bd) { return clamped; } +static INLINE void round_shift_4x4_avx2(__m256i *in, int shift) { + if (shift != 0) { + __m256i rnding = _mm256_set1_epi32(1 << (shift - 1)); + in[0] = _mm256_add_epi32(in[0], rnding); + in[1] = _mm256_add_epi32(in[1], rnding); + in[2] = _mm256_add_epi32(in[2], rnding); + in[3] = _mm256_add_epi32(in[3], rnding); + + in[0] = _mm256_srai_epi32(in[0], shift); + in[1] = _mm256_srai_epi32(in[1], shift); + in[2] = _mm256_srai_epi32(in[2], shift); + in[3] = _mm256_srai_epi32(in[3], shift); + } +} + +static INLINE void round_shift_8x8_avx2(__m256i *in, int shift) { + round_shift_4x4_avx2(in, shift); + round_shift_4x4_avx2(in + 4, shift); + round_shift_4x4_avx2(in + 8, shift); + round_shift_4x4_avx2(in + 12, shift); +} + +static void highbd_clamp_epi32_avx2(__m256i *in, __m256i *out, + const __m256i *clamp_lo, + const __m256i *clamp_hi, int size) { + __m256i a0, a1; + for (int i = 0; i < size; i += 4) { + a0 = _mm256_max_epi32(in[i], *clamp_lo); + out[i] = _mm256_min_epi32(a0, *clamp_hi); + + a1 = _mm256_max_epi32(in[i + 1], *clamp_lo); + out[i + 1] = _mm256_min_epi32(a1, *clamp_hi); + + a0 = _mm256_max_epi32(in[i + 2], *clamp_lo); + out[i + 2] = _mm256_min_epi32(a0, *clamp_hi); + + a1 = _mm256_max_epi32(in[i + 3], *clamp_lo); + out[i + 3] = _mm256_min_epi32(a1, *clamp_hi); + } +} + static INLINE __m256i highbd_get_recon_16x8_avx2(const __m256i pred, __m256i res0, __m256i res1, const int bd) { @@ -72,30 +114,48 @@ static INLINE void highbd_write_buffer_16xn_avx2(__m256i *in, uint16_t *output, _mm256_storeu_si256((__m256i *)(output + i * stride), u); } } - -static INLINE __m256i av1_round_shift_32_avx2(__m256i vec, int bit) { - __m256i tmp, round; - round = _mm256_set1_epi32(1 << (bit - 1)); - tmp = _mm256_add_epi32(vec, round); - return _mm256_srai_epi32(tmp, bit); +static INLINE __m256i highbd_get_recon_8x8_avx2(const __m256i pred, __m256i res, + const int bd) { + 
__m256i x0 = pred; + x0 = _mm256_add_epi32(res, x0); + x0 = _mm256_packus_epi32(x0, x0); + x0 = _mm256_permute4x64_epi64(x0, 0xd8); + x0 = highbd_clamp_epi16_avx2(x0, bd); + return x0; } -static INLINE void av1_round_shift_array_32_avx2(__m256i *input, - __m256i *output, - const int size, - const int bit) { - if (bit > 0) { - int i; - for (i = 0; i < size; i++) { - output[i] = av1_round_shift_32_avx2(input[i], bit); - } - } else { - int i; - for (i = 0; i < size; i++) { - output[i] = _mm256_slli_epi32(input[i], -bit); - } +static INLINE void highbd_write_buffer_8xn_avx2(__m256i *in, uint16_t *output, + int stride, int flipud, + int height, const int bd) { + int j = flipud ? (height - 1) : 0; + __m128i temp; + const int step = flipud ? -1 : 1; + for (int i = 0; i < height; ++i, j += step) { + temp = _mm_loadu_si128((__m128i const *)(output + i * stride)); + __m256i v = _mm256_cvtepi16_epi32(temp); + __m256i u = highbd_get_recon_8x8_avx2(v, in[j], bd); + __m128i u1 = _mm256_castsi256_si128(u); + _mm_storeu_si128((__m128i *)(output + i * stride), u1); } } +static void neg_shift_avx2(const __m256i in0, const __m256i in1, __m256i *out0, + __m256i *out1, const __m256i *clamp_lo, + const __m256i *clamp_hi, int shift) { + __m256i offset = _mm256_set1_epi32((1 << shift) >> 1); + __m256i a0 = _mm256_add_epi32(offset, in0); + __m256i a1 = _mm256_sub_epi32(offset, in1); + + a0 = _mm256_sra_epi32(a0, _mm_cvtsi32_si128(shift)); + a1 = _mm256_sra_epi32(a1, _mm_cvtsi32_si128(shift)); + + a0 = _mm256_max_epi32(a0, *clamp_lo); + a0 = _mm256_min_epi32(a0, *clamp_hi); + a1 = _mm256_max_epi32(a1, *clamp_lo); + a1 = _mm256_min_epi32(a1, *clamp_hi); + + *out0 = a0; + *out1 = a1; +} static void transpose_8x8_avx2(const __m256i *in, __m256i *out) { __m256i u0, u1, u2, u3, u4, u5, u6, u7; @@ -134,6 +194,43 @@ static void transpose_8x8_avx2(const __m256i *in, __m256i *out) { out[7] = _mm256_permute2f128_si256(x0, x1, 0x31); } +static void transpose_8x8_flip_avx2(const __m256i *in, __m256i 
*out) { + __m256i u0, u1, u2, u3, u4, u5, u6, u7; + __m256i x0, x1; + + u0 = _mm256_unpacklo_epi32(in[7], in[6]); + u1 = _mm256_unpackhi_epi32(in[7], in[6]); + + u2 = _mm256_unpacklo_epi32(in[5], in[4]); + u3 = _mm256_unpackhi_epi32(in[5], in[4]); + + u4 = _mm256_unpacklo_epi32(in[3], in[2]); + u5 = _mm256_unpackhi_epi32(in[3], in[2]); + + u6 = _mm256_unpacklo_epi32(in[1], in[0]); + u7 = _mm256_unpackhi_epi32(in[1], in[0]); + + x0 = _mm256_unpacklo_epi64(u0, u2); + x1 = _mm256_unpacklo_epi64(u4, u6); + out[0] = _mm256_permute2f128_si256(x0, x1, 0x20); + out[4] = _mm256_permute2f128_si256(x0, x1, 0x31); + + x0 = _mm256_unpackhi_epi64(u0, u2); + x1 = _mm256_unpackhi_epi64(u4, u6); + out[1] = _mm256_permute2f128_si256(x0, x1, 0x20); + out[5] = _mm256_permute2f128_si256(x0, x1, 0x31); + + x0 = _mm256_unpacklo_epi64(u1, u3); + x1 = _mm256_unpacklo_epi64(u5, u7); + out[2] = _mm256_permute2f128_si256(x0, x1, 0x20); + out[6] = _mm256_permute2f128_si256(x0, x1, 0x31); + + x0 = _mm256_unpackhi_epi64(u1, u3); + x1 = _mm256_unpackhi_epi64(u5, u7); + out[3] = _mm256_permute2f128_si256(x0, x1, 0x20); + out[7] = _mm256_permute2f128_si256(x0, x1, 0x31); +} + static void load_buffer_32x32(const int32_t *coeff, __m256i *in, int input_stiride, int size) { int i; @@ -179,36 +276,6 @@ static void addsub_avx2(const __m256i in0, const __m256i in1, __m256i *out0, *out1 = a1; } -static void addsub_no_clamp_avx2(const __m256i in0, const __m256i in1, - __m256i *out0, __m256i *out1) { - __m256i a0 = _mm256_add_epi32(in0, in1); - __m256i a1 = _mm256_sub_epi32(in0, in1); - - *out0 = a0; - *out1 = a1; -} - -static void addsub_shift_avx2(const __m256i in0, const __m256i in1, - __m256i *out0, __m256i *out1, - const __m256i *clamp_lo, const __m256i *clamp_hi, - int shift) { - __m256i offset = _mm256_set1_epi32((1 << shift) >> 1); - __m256i in0_w_offset = _mm256_add_epi32(in0, offset); - __m256i a0 = _mm256_add_epi32(in0_w_offset, in1); - __m256i a1 = _mm256_sub_epi32(in0_w_offset, in1); - - a0 = 
_mm256_sra_epi32(a0, _mm_cvtsi32_si128(shift)); - a1 = _mm256_sra_epi32(a1, _mm_cvtsi32_si128(shift)); - - a0 = _mm256_max_epi32(a0, *clamp_lo); - a0 = _mm256_min_epi32(a0, *clamp_hi); - a1 = _mm256_max_epi32(a1, *clamp_lo); - a1 = _mm256_min_epi32(a1, *clamp_hi); - - *out0 = a0; - *out1 = a1; -} - static INLINE void idct32_stage4_avx2( __m256i *bf1, const __m256i *cospim8, const __m256i *cospi56, const __m256i *cospi8, const __m256i *cospim56, const __m256i *cospim40, @@ -344,63 +411,32 @@ static INLINE void idct32_stage8_avx2(__m256i *bf1, const __m256i *cospim32, static INLINE void idct32_stage9_avx2(__m256i *bf1, __m256i *out, const int do_cols, const int bd, const int out_shift, - const int log_range) { - if (do_cols) { - addsub_no_clamp_avx2(bf1[0], bf1[31], out + 0, out + 31); - addsub_no_clamp_avx2(bf1[1], bf1[30], out + 1, out + 30); - addsub_no_clamp_avx2(bf1[2], bf1[29], out + 2, out + 29); - addsub_no_clamp_avx2(bf1[3], bf1[28], out + 3, out + 28); - addsub_no_clamp_avx2(bf1[4], bf1[27], out + 4, out + 27); - addsub_no_clamp_avx2(bf1[5], bf1[26], out + 5, out + 26); - addsub_no_clamp_avx2(bf1[6], bf1[25], out + 6, out + 25); - addsub_no_clamp_avx2(bf1[7], bf1[24], out + 7, out + 24); - addsub_no_clamp_avx2(bf1[8], bf1[23], out + 8, out + 23); - addsub_no_clamp_avx2(bf1[9], bf1[22], out + 9, out + 22); - addsub_no_clamp_avx2(bf1[10], bf1[21], out + 10, out + 21); - addsub_no_clamp_avx2(bf1[11], bf1[20], out + 11, out + 20); - addsub_no_clamp_avx2(bf1[12], bf1[19], out + 12, out + 19); - addsub_no_clamp_avx2(bf1[13], bf1[18], out + 13, out + 18); - addsub_no_clamp_avx2(bf1[14], bf1[17], out + 14, out + 17); - addsub_no_clamp_avx2(bf1[15], bf1[16], out + 15, out + 16); - } else { + const __m256i *clamp_lo, + const __m256i *clamp_hi) { + addsub_avx2(bf1[0], bf1[31], out + 0, out + 31, clamp_lo, clamp_hi); + addsub_avx2(bf1[1], bf1[30], out + 1, out + 30, clamp_lo, clamp_hi); + addsub_avx2(bf1[2], bf1[29], out + 2, out + 29, clamp_lo, clamp_hi); + 
addsub_avx2(bf1[3], bf1[28], out + 3, out + 28, clamp_lo, clamp_hi); + addsub_avx2(bf1[4], bf1[27], out + 4, out + 27, clamp_lo, clamp_hi); + addsub_avx2(bf1[5], bf1[26], out + 5, out + 26, clamp_lo, clamp_hi); + addsub_avx2(bf1[6], bf1[25], out + 6, out + 25, clamp_lo, clamp_hi); + addsub_avx2(bf1[7], bf1[24], out + 7, out + 24, clamp_lo, clamp_hi); + addsub_avx2(bf1[8], bf1[23], out + 8, out + 23, clamp_lo, clamp_hi); + addsub_avx2(bf1[9], bf1[22], out + 9, out + 22, clamp_lo, clamp_hi); + addsub_avx2(bf1[10], bf1[21], out + 10, out + 21, clamp_lo, clamp_hi); + addsub_avx2(bf1[11], bf1[20], out + 11, out + 20, clamp_lo, clamp_hi); + addsub_avx2(bf1[12], bf1[19], out + 12, out + 19, clamp_lo, clamp_hi); + addsub_avx2(bf1[13], bf1[18], out + 13, out + 18, clamp_lo, clamp_hi); + addsub_avx2(bf1[14], bf1[17], out + 14, out + 17, clamp_lo, clamp_hi); + addsub_avx2(bf1[15], bf1[16], out + 15, out + 16, clamp_lo, clamp_hi); + if (!do_cols) { const int log_range_out = AOMMAX(16, bd + 6); - const __m256i clamp_lo_out = _mm256_set1_epi32(AOMMAX( - -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift)))); - const __m256i clamp_hi_out = _mm256_set1_epi32(AOMMIN( - (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift)))); - - addsub_shift_avx2(bf1[0], bf1[31], out + 0, out + 31, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_avx2(bf1[1], bf1[30], out + 1, out + 30, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_avx2(bf1[2], bf1[29], out + 2, out + 29, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_avx2(bf1[3], bf1[28], out + 3, out + 28, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_avx2(bf1[4], bf1[27], out + 4, out + 27, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_avx2(bf1[5], bf1[26], out + 5, out + 26, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_avx2(bf1[6], bf1[25], out + 6, out + 25, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_avx2(bf1[7], bf1[24], out + 
7, out + 24, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_avx2(bf1[8], bf1[23], out + 8, out + 23, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_avx2(bf1[9], bf1[22], out + 9, out + 22, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_avx2(bf1[10], bf1[21], out + 10, out + 21, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_avx2(bf1[11], bf1[20], out + 11, out + 20, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_avx2(bf1[12], bf1[19], out + 12, out + 19, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_avx2(bf1[13], bf1[18], out + 13, out + 18, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_avx2(bf1[14], bf1[17], out + 14, out + 17, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_avx2(bf1[15], bf1[16], out + 15, out + 16, &clamp_lo_out, - &clamp_hi_out, out_shift); + const __m256i clamp_lo_out = _mm256_set1_epi32(-(1 << (log_range_out - 1))); + const __m256i clamp_hi_out = + _mm256_set1_epi32((1 << (log_range_out - 1)) - 1); + round_shift_8x8_avx2(out, out_shift); + round_shift_8x8_avx2(out + 16, out_shift); + highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 32); } } @@ -410,8 +446,8 @@ static void idct32_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols, const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1)); const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); - const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1))); - const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1); + __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1))); + __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1); __m256i x; // stage 0 // stage 1 @@ -427,22 +463,16 @@ static void idct32_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols, // stage 7 // stage 8 // stage 9 - if (do_cols) { - x = _mm256_max_epi32(x, clamp_lo); - x = _mm256_min_epi32(x, clamp_hi); - } else { + if (!do_cols) { const int log_range_out = AOMMAX(16, bd + 6); - const __m256i clamp_lo_out = _mm256_set1_epi32(AOMMAX( - -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift)))); - const __m256i clamp_hi_out = _mm256_set1_epi32(AOMMIN( - (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift)))); __m256i offset = _mm256_set1_epi32((1 << out_shift) >> 1); + clamp_lo = _mm256_set1_epi32(-(1 << (log_range_out - 1))); + clamp_hi = _mm256_set1_epi32((1 << (log_range_out - 1)) - 1); x = _mm256_add_epi32(offset, x); x = _mm256_sra_epi32(x, _mm_cvtsi32_si128(out_shift)); - x = _mm256_max_epi32(x, clamp_lo_out); - x = _mm256_min_epi32(x, clamp_hi_out); } - + x = _mm256_max_epi32(x, clamp_lo); + x = _mm256_min_epi32(x, clamp_hi); out[0] = x; out[1] = x; out[2] = x; @@ -586,7 +616,7 @@ static void idct32_low8_avx2(__m256i *in, __m256i *out, int bit, int do_cols, &rounding, bit); // stage 9 - idct32_stage9_avx2(bf1, out, do_cols, bd, out_shift, log_range); + idct32_stage9_avx2(bf1, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi); } } @@ -736,7 +766,7 @@ static void idct32_low16_avx2(__m256i *in, __m256i *out, int bit, int do_cols, &rounding, bit); // stage 9 - idct32_stage9_avx2(bf1, out, do_cols, bd, out_shift, log_range); + idct32_stage9_avx2(bf1, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi); } } @@ -1094,66 +1124,2958 @@ static void idct32_avx2(__m256i *in, __m256i *out, int bit, int do_cols, int 
bd, bf0[31] = bf1[31]; // stage 9 + addsub_avx2(bf0[0], bf0[31], out + 0, out + 31, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[1], bf0[30], out + 1, out + 30, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[2], bf0[29], out + 2, out + 29, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[3], bf0[28], out + 3, out + 28, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[4], bf0[27], out + 4, out + 27, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[5], bf0[26], out + 5, out + 26, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[6], bf0[25], out + 6, out + 25, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[7], bf0[24], out + 7, out + 24, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[8], bf0[23], out + 8, out + 23, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[9], bf0[22], out + 9, out + 22, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[10], bf0[21], out + 10, out + 21, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[11], bf0[20], out + 11, out + 20, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[12], bf0[19], out + 12, out + 19, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[13], bf0[18], out + 13, out + 18, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[14], bf0[17], out + 14, out + 17, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[15], bf0[16], out + 15, out + 16, &clamp_lo, &clamp_hi); + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + const __m256i clamp_lo_out = + _mm256_set1_epi32(-(1 << (log_range_out - 1))); + const __m256i clamp_hi_out = + _mm256_set1_epi32((1 << (log_range_out - 1)) - 1); + round_shift_8x8_avx2(out, out_shift); + round_shift_8x8_avx2(out + 16, out_shift); + highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 32); + } + } +} +static void idct16_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); + const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); + __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1))); + __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1); + + { + // stage 0 + // stage 1 + // stage 2 + // stage 3 + // stage 4 + in[0] = _mm256_mullo_epi32(in[0], cospi32); + in[0] = _mm256_add_epi32(in[0], rnding); + in[0] = _mm256_srai_epi32(in[0], bit); + + // stage 5 + // stage 6 + // stage 7 + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + clamp_lo = _mm256_set1_epi32(-(1 << (log_range_out - 1))); + clamp_hi = _mm256_set1_epi32((1 << (log_range_out - 1)) - 1); + __m256i offset = _mm256_set1_epi32((1 << out_shift) >> 1); + in[0] = _mm256_add_epi32(in[0], offset); + in[0] = _mm256_sra_epi32(in[0], _mm_cvtsi32_si128(out_shift)); + } + in[0] = _mm256_max_epi32(in[0], clamp_lo); + in[0] = _mm256_min_epi32(in[0], clamp_hi); + out[0] = in[0]; + out[1] = in[0]; + out[2] = in[0]; + out[3] = in[0]; + out[4] = in[0]; + out[5] = in[0]; + out[6] = in[0]; + out[7] = in[0]; + out[8] = in[0]; + out[9] = in[0]; + out[10] = in[0]; + out[11] = in[0]; + out[12] = in[0]; + out[13] = in[0]; + out[14] = in[0]; + out[15] = in[0]; + } +} + +static void idct16_low8_avx2(__m256i *in, __m256i *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m256i cospi60 = _mm256_set1_epi32(cospi[60]); + const __m256i cospi28 = _mm256_set1_epi32(cospi[28]); + const __m256i cospi44 = _mm256_set1_epi32(cospi[44]); + const __m256i cospi20 = _mm256_set1_epi32(cospi[20]); + const __m256i cospi12 = _mm256_set1_epi32(cospi[12]); + const __m256i cospi4 = _mm256_set1_epi32(cospi[4]); + const __m256i cospi56 = _mm256_set1_epi32(cospi[56]); + const __m256i cospi24 = _mm256_set1_epi32(cospi[24]); + const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]); + const __m256i cospi8 = _mm256_set1_epi32(cospi[8]); + const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); + const __m256i cospi48 = _mm256_set1_epi32(cospi[48]); + const __m256i cospi16 = 
_mm256_set1_epi32(cospi[16]); + const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]); + const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]); + const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]); + const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]); + const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1))); + const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1); + __m256i u[16], x, y; + + { + // stage 0 + // stage 1 + u[0] = in[0]; + u[2] = in[4]; + u[4] = in[2]; + u[6] = in[6]; + u[8] = in[1]; + u[10] = in[5]; + u[12] = in[3]; + u[14] = in[7]; + + // stage 2 + u[15] = half_btf_0_avx2(&cospi4, &u[8], &rnding, bit); + u[8] = half_btf_0_avx2(&cospi60, &u[8], &rnding, bit); + + u[9] = half_btf_0_avx2(&cospim36, &u[14], &rnding, bit); + u[14] = half_btf_0_avx2(&cospi28, &u[14], &rnding, bit); + + u[13] = half_btf_0_avx2(&cospi20, &u[10], &rnding, bit); + u[10] = half_btf_0_avx2(&cospi44, &u[10], &rnding, bit); + + u[11] = half_btf_0_avx2(&cospim52, &u[12], &rnding, bit); + u[12] = half_btf_0_avx2(&cospi12, &u[12], &rnding, bit); + + // stage 3 + u[7] = half_btf_0_avx2(&cospi8, &u[4], &rnding, bit); + u[4] = half_btf_0_avx2(&cospi56, &u[4], &rnding, bit); + u[5] = half_btf_0_avx2(&cospim40, &u[6], &rnding, bit); + u[6] = half_btf_0_avx2(&cospi24, &u[6], &rnding, bit); + + addsub_avx2(u[8], u[9], &u[8], &u[9], &clamp_lo, &clamp_hi); + addsub_avx2(u[11], u[10], &u[11], &u[10], &clamp_lo, &clamp_hi); + addsub_avx2(u[12], u[13], &u[12], &u[13], &clamp_lo, &clamp_hi); + addsub_avx2(u[15], u[14], &u[15], &u[14], &clamp_lo, &clamp_hi); + + // stage 4 + x = _mm256_mullo_epi32(u[0], cospi32); + u[0] = _mm256_add_epi32(x, rnding); + u[0] = _mm256_srai_epi32(u[0], bit); + u[1] = u[0]; + + u[3] = half_btf_0_avx2(&cospi16, &u[2], &rnding, bit); + u[2] = half_btf_0_avx2(&cospi48, &u[2], &rnding, bit); + + addsub_avx2(u[4], 
u[5], &u[4], &u[5], &clamp_lo, &clamp_hi); + addsub_avx2(u[7], u[6], &u[7], &u[6], &clamp_lo, &clamp_hi); + + x = half_btf_avx2(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit); + u[14] = half_btf_avx2(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit); + u[9] = x; + y = half_btf_avx2(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit); + u[13] = half_btf_avx2(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit); + u[10] = y; + + // stage 5 + addsub_avx2(u[0], u[3], &u[0], &u[3], &clamp_lo, &clamp_hi); + addsub_avx2(u[1], u[2], &u[1], &u[2], &clamp_lo, &clamp_hi); + + x = _mm256_mullo_epi32(u[5], cospi32); + y = _mm256_mullo_epi32(u[6], cospi32); + u[5] = _mm256_sub_epi32(y, x); + u[5] = _mm256_add_epi32(u[5], rnding); + u[5] = _mm256_srai_epi32(u[5], bit); + + u[6] = _mm256_add_epi32(y, x); + u[6] = _mm256_add_epi32(u[6], rnding); + u[6] = _mm256_srai_epi32(u[6], bit); + + addsub_avx2(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi); + addsub_avx2(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi); + addsub_avx2(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi); + addsub_avx2(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi); + + // stage 6 + addsub_avx2(u[0], u[7], &u[0], &u[7], &clamp_lo, &clamp_hi); + addsub_avx2(u[1], u[6], &u[1], &u[6], &clamp_lo, &clamp_hi); + addsub_avx2(u[2], u[5], &u[2], &u[5], &clamp_lo, &clamp_hi); + addsub_avx2(u[3], u[4], &u[3], &u[4], &clamp_lo, &clamp_hi); + + x = _mm256_mullo_epi32(u[10], cospi32); + y = _mm256_mullo_epi32(u[13], cospi32); + u[10] = _mm256_sub_epi32(y, x); + u[10] = _mm256_add_epi32(u[10], rnding); + u[10] = _mm256_srai_epi32(u[10], bit); + + u[13] = _mm256_add_epi32(x, y); + u[13] = _mm256_add_epi32(u[13], rnding); + u[13] = _mm256_srai_epi32(u[13], bit); + + x = _mm256_mullo_epi32(u[11], cospi32); + y = _mm256_mullo_epi32(u[12], cospi32); + u[11] = _mm256_sub_epi32(y, x); + u[11] = _mm256_add_epi32(u[11], rnding); + u[11] = _mm256_srai_epi32(u[11], bit); + + u[12] = _mm256_add_epi32(x, y); + u[12] = 
_mm256_add_epi32(u[12], rnding); + u[12] = _mm256_srai_epi32(u[12], bit); + // stage 7 + addsub_avx2(u[0], u[15], out + 0, out + 15, &clamp_lo, &clamp_hi); + addsub_avx2(u[1], u[14], out + 1, out + 14, &clamp_lo, &clamp_hi); + addsub_avx2(u[2], u[13], out + 2, out + 13, &clamp_lo, &clamp_hi); + addsub_avx2(u[3], u[12], out + 3, out + 12, &clamp_lo, &clamp_hi); + addsub_avx2(u[4], u[11], out + 4, out + 11, &clamp_lo, &clamp_hi); + addsub_avx2(u[5], u[10], out + 5, out + 10, &clamp_lo, &clamp_hi); + addsub_avx2(u[6], u[9], out + 6, out + 9, &clamp_lo, &clamp_hi); + addsub_avx2(u[7], u[8], out + 7, out + 8, &clamp_lo, &clamp_hi); + + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + const __m256i clamp_lo_out = + _mm256_set1_epi32(-(1 << (log_range_out - 1))); + const __m256i clamp_hi_out = + _mm256_set1_epi32((1 << (log_range_out - 1)) - 1); + round_shift_8x8_avx2(out, out_shift); + highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 16); + } + } +} + +static void idct16_avx2(__m256i *in, __m256i *out, int bit, int do_cols, int bd, + int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m256i cospi60 = _mm256_set1_epi32(cospi[60]); + const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]); + const __m256i cospi28 = _mm256_set1_epi32(cospi[28]); + const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]); + const __m256i cospi44 = _mm256_set1_epi32(cospi[44]); + const __m256i cospi20 = _mm256_set1_epi32(cospi[20]); + const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]); + const __m256i cospi12 = _mm256_set1_epi32(cospi[12]); + const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]); + const __m256i cospi52 = _mm256_set1_epi32(cospi[52]); + const __m256i cospi36 = _mm256_set1_epi32(cospi[36]); + const __m256i cospi4 = _mm256_set1_epi32(cospi[4]); + const __m256i cospi56 = _mm256_set1_epi32(cospi[56]); + const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]); + const __m256i cospi24 = _mm256_set1_epi32(cospi[24]); + const __m256i 
cospim40 = _mm256_set1_epi32(-cospi[40]); + const __m256i cospi40 = _mm256_set1_epi32(cospi[40]); + const __m256i cospi8 = _mm256_set1_epi32(cospi[8]); + const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); + const __m256i cospi48 = _mm256_set1_epi32(cospi[48]); + const __m256i cospi16 = _mm256_set1_epi32(cospi[16]); + const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]); + const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]); + const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1))); + const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1); + __m256i u[16], v[16], x, y; + + { + // stage 0 + // stage 1 + u[0] = in[0]; + u[1] = in[8]; + u[2] = in[4]; + u[3] = in[12]; + u[4] = in[2]; + u[5] = in[10]; + u[6] = in[6]; + u[7] = in[14]; + u[8] = in[1]; + u[9] = in[9]; + u[10] = in[5]; + u[11] = in[13]; + u[12] = in[3]; + u[13] = in[11]; + u[14] = in[7]; + u[15] = in[15]; + + // stage 2 + v[0] = u[0]; + v[1] = u[1]; + v[2] = u[2]; + v[3] = u[3]; + v[4] = u[4]; + v[5] = u[5]; + v[6] = u[6]; + v[7] = u[7]; + + v[8] = half_btf_avx2(&cospi60, &u[8], &cospim4, &u[15], &rnding, bit); + v[9] = half_btf_avx2(&cospi28, &u[9], &cospim36, &u[14], &rnding, bit); + v[10] = half_btf_avx2(&cospi44, &u[10], &cospim20, &u[13], &rnding, bit); + v[11] = half_btf_avx2(&cospi12, &u[11], &cospim52, &u[12], &rnding, bit); + v[12] = half_btf_avx2(&cospi52, &u[11], &cospi12, &u[12], &rnding, bit); + v[13] = half_btf_avx2(&cospi20, &u[10], &cospi44, &u[13], &rnding, bit); + v[14] = half_btf_avx2(&cospi36, &u[9], &cospi28, &u[14], &rnding, bit); + v[15] = half_btf_avx2(&cospi4, &u[8], &cospi60, &u[15], &rnding, bit); + + // stage 3 + u[0] = v[0]; + u[1] = v[1]; + u[2] = v[2]; + u[3] = v[3]; + u[4] = half_btf_avx2(&cospi56, &v[4], &cospim8, &v[7], &rnding, bit); + u[5] = half_btf_avx2(&cospi24, &v[5], &cospim40, &v[6], &rnding, bit); + u[6] = 
half_btf_avx2(&cospi40, &v[5], &cospi24, &v[6], &rnding, bit); + u[7] = half_btf_avx2(&cospi8, &v[4], &cospi56, &v[7], &rnding, bit); + addsub_avx2(v[8], v[9], &u[8], &u[9], &clamp_lo, &clamp_hi); + addsub_avx2(v[11], v[10], &u[11], &u[10], &clamp_lo, &clamp_hi); + addsub_avx2(v[12], v[13], &u[12], &u[13], &clamp_lo, &clamp_hi); + addsub_avx2(v[15], v[14], &u[15], &u[14], &clamp_lo, &clamp_hi); + + // stage 4 + x = _mm256_mullo_epi32(u[0], cospi32); + y = _mm256_mullo_epi32(u[1], cospi32); + v[0] = _mm256_add_epi32(x, y); + v[0] = _mm256_add_epi32(v[0], rnding); + v[0] = _mm256_srai_epi32(v[0], bit); + + v[1] = _mm256_sub_epi32(x, y); + v[1] = _mm256_add_epi32(v[1], rnding); + v[1] = _mm256_srai_epi32(v[1], bit); + + v[2] = half_btf_avx2(&cospi48, &u[2], &cospim16, &u[3], &rnding, bit); + v[3] = half_btf_avx2(&cospi16, &u[2], &cospi48, &u[3], &rnding, bit); + addsub_avx2(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi); + addsub_avx2(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi); + v[8] = u[8]; + v[9] = half_btf_avx2(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit); + v[10] = half_btf_avx2(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit); + v[11] = u[11]; + v[12] = u[12]; + v[13] = half_btf_avx2(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit); + v[14] = half_btf_avx2(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit); + v[15] = u[15]; + + // stage 5 + addsub_avx2(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi); + addsub_avx2(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi); + u[4] = v[4]; + + x = _mm256_mullo_epi32(v[5], cospi32); + y = _mm256_mullo_epi32(v[6], cospi32); + u[5] = _mm256_sub_epi32(y, x); + u[5] = _mm256_add_epi32(u[5], rnding); + u[5] = _mm256_srai_epi32(u[5], bit); + + u[6] = _mm256_add_epi32(y, x); + u[6] = _mm256_add_epi32(u[6], rnding); + u[6] = _mm256_srai_epi32(u[6], bit); + + u[7] = v[7]; + addsub_avx2(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi); + addsub_avx2(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi); + addsub_avx2(v[15], 
v[12], &u[15], &u[12], &clamp_lo, &clamp_hi); + addsub_avx2(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi); + + // stage 6 + addsub_avx2(u[0], u[7], &v[0], &v[7], &clamp_lo, &clamp_hi); + addsub_avx2(u[1], u[6], &v[1], &v[6], &clamp_lo, &clamp_hi); + addsub_avx2(u[2], u[5], &v[2], &v[5], &clamp_lo, &clamp_hi); + addsub_avx2(u[3], u[4], &v[3], &v[4], &clamp_lo, &clamp_hi); + v[8] = u[8]; + v[9] = u[9]; + + x = _mm256_mullo_epi32(u[10], cospi32); + y = _mm256_mullo_epi32(u[13], cospi32); + v[10] = _mm256_sub_epi32(y, x); + v[10] = _mm256_add_epi32(v[10], rnding); + v[10] = _mm256_srai_epi32(v[10], bit); + + v[13] = _mm256_add_epi32(x, y); + v[13] = _mm256_add_epi32(v[13], rnding); + v[13] = _mm256_srai_epi32(v[13], bit); + + x = _mm256_mullo_epi32(u[11], cospi32); + y = _mm256_mullo_epi32(u[12], cospi32); + v[11] = _mm256_sub_epi32(y, x); + v[11] = _mm256_add_epi32(v[11], rnding); + v[11] = _mm256_srai_epi32(v[11], bit); + + v[12] = _mm256_add_epi32(x, y); + v[12] = _mm256_add_epi32(v[12], rnding); + v[12] = _mm256_srai_epi32(v[12], bit); + + v[14] = u[14]; + v[15] = u[15]; + + // stage 7 + addsub_avx2(v[0], v[15], out + 0, out + 15, &clamp_lo, &clamp_hi); + addsub_avx2(v[1], v[14], out + 1, out + 14, &clamp_lo, &clamp_hi); + addsub_avx2(v[2], v[13], out + 2, out + 13, &clamp_lo, &clamp_hi); + addsub_avx2(v[3], v[12], out + 3, out + 12, &clamp_lo, &clamp_hi); + addsub_avx2(v[4], v[11], out + 4, out + 11, &clamp_lo, &clamp_hi); + addsub_avx2(v[5], v[10], out + 5, out + 10, &clamp_lo, &clamp_hi); + addsub_avx2(v[6], v[9], out + 6, out + 9, &clamp_lo, &clamp_hi); + addsub_avx2(v[7], v[8], out + 7, out + 8, &clamp_lo, &clamp_hi); + + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + const __m256i clamp_lo_out = + _mm256_set1_epi32(-(1 << (log_range_out - 1))); + const __m256i clamp_hi_out = + _mm256_set1_epi32((1 << (log_range_out - 1)) - 1); + round_shift_8x8_avx2(out, out_shift); + highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 
16); + } + } +} + +static void iadst16_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m256i cospi2 = _mm256_set1_epi32(cospi[2]); + const __m256i cospi62 = _mm256_set1_epi32(cospi[62]); + const __m256i cospi8 = _mm256_set1_epi32(cospi[8]); + const __m256i cospi56 = _mm256_set1_epi32(cospi[56]); + const __m256i cospi48 = _mm256_set1_epi32(cospi[48]); + const __m256i cospi16 = _mm256_set1_epi32(cospi[16]); + const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); + const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1)); + const __m256i zero = _mm256_setzero_si256(); + __m256i v[16], x, y, temp1, temp2; + + // Calculate the column 0, 1, 2, 3 + { + // stage 0 + // stage 1 + // stage 2 + x = _mm256_mullo_epi32(in[0], cospi62); + v[0] = _mm256_add_epi32(x, rnding); + v[0] = _mm256_srai_epi32(v[0], bit); + + x = _mm256_mullo_epi32(in[0], cospi2); + v[1] = _mm256_sub_epi32(zero, x); + v[1] = _mm256_add_epi32(v[1], rnding); + v[1] = _mm256_srai_epi32(v[1], bit); + + // stage 3 + v[8] = v[0]; + v[9] = v[1]; + + // stage 4 + temp1 = _mm256_mullo_epi32(v[8], cospi8); + x = _mm256_mullo_epi32(v[9], cospi56); + temp1 = _mm256_add_epi32(temp1, x); + temp1 = _mm256_add_epi32(temp1, rnding); + temp1 = _mm256_srai_epi32(temp1, bit); + + temp2 = _mm256_mullo_epi32(v[8], cospi56); + x = _mm256_mullo_epi32(v[9], cospi8); + temp2 = _mm256_sub_epi32(temp2, x); + temp2 = _mm256_add_epi32(temp2, rnding); + temp2 = _mm256_srai_epi32(temp2, bit); + v[8] = temp1; + v[9] = temp2; + + // stage 5 + v[4] = v[0]; + v[5] = v[1]; + v[12] = v[8]; + v[13] = v[9]; + + // stage 6 + temp1 = _mm256_mullo_epi32(v[4], cospi16); + x = _mm256_mullo_epi32(v[5], cospi48); + temp1 = _mm256_add_epi32(temp1, x); + temp1 = _mm256_add_epi32(temp1, rnding); + temp1 = _mm256_srai_epi32(temp1, bit); + + temp2 = _mm256_mullo_epi32(v[4], cospi48); + x = _mm256_mullo_epi32(v[5], cospi16); + temp2 = _mm256_sub_epi32(temp2, x); + temp2 
= _mm256_add_epi32(temp2, rnding); + temp2 = _mm256_srai_epi32(temp2, bit); + v[4] = temp1; + v[5] = temp2; + + temp1 = _mm256_mullo_epi32(v[12], cospi16); + x = _mm256_mullo_epi32(v[13], cospi48); + temp1 = _mm256_add_epi32(temp1, x); + temp1 = _mm256_add_epi32(temp1, rnding); + temp1 = _mm256_srai_epi32(temp1, bit); + + temp2 = _mm256_mullo_epi32(v[12], cospi48); + x = _mm256_mullo_epi32(v[13], cospi16); + temp2 = _mm256_sub_epi32(temp2, x); + temp2 = _mm256_add_epi32(temp2, rnding); + temp2 = _mm256_srai_epi32(temp2, bit); + v[12] = temp1; + v[13] = temp2; + + // stage 7 + v[2] = v[0]; + v[3] = v[1]; + v[6] = v[4]; + v[7] = v[5]; + v[10] = v[8]; + v[11] = v[9]; + v[14] = v[12]; + v[15] = v[13]; + + // stage 8 + y = _mm256_mullo_epi32(v[2], cospi32); + x = _mm256_mullo_epi32(v[3], cospi32); + v[2] = _mm256_add_epi32(y, x); + v[2] = _mm256_add_epi32(v[2], rnding); + v[2] = _mm256_srai_epi32(v[2], bit); + + v[3] = _mm256_sub_epi32(y, x); + v[3] = _mm256_add_epi32(v[3], rnding); + v[3] = _mm256_srai_epi32(v[3], bit); + + y = _mm256_mullo_epi32(v[6], cospi32); + x = _mm256_mullo_epi32(v[7], cospi32); + v[6] = _mm256_add_epi32(y, x); + v[6] = _mm256_add_epi32(v[6], rnding); + v[6] = _mm256_srai_epi32(v[6], bit); + + v[7] = _mm256_sub_epi32(y, x); + v[7] = _mm256_add_epi32(v[7], rnding); + v[7] = _mm256_srai_epi32(v[7], bit); + + y = _mm256_mullo_epi32(v[10], cospi32); + x = _mm256_mullo_epi32(v[11], cospi32); + v[10] = _mm256_add_epi32(y, x); + v[10] = _mm256_add_epi32(v[10], rnding); + v[10] = _mm256_srai_epi32(v[10], bit); + + v[11] = _mm256_sub_epi32(y, x); + v[11] = _mm256_add_epi32(v[11], rnding); + v[11] = _mm256_srai_epi32(v[11], bit); + + y = _mm256_mullo_epi32(v[14], cospi32); + x = _mm256_mullo_epi32(v[15], cospi32); + v[14] = _mm256_add_epi32(y, x); + v[14] = _mm256_add_epi32(v[14], rnding); + v[14] = _mm256_srai_epi32(v[14], bit); + + v[15] = _mm256_sub_epi32(y, x); + v[15] = _mm256_add_epi32(v[15], rnding); + v[15] = _mm256_srai_epi32(v[15], bit); + + // 
stage 9 + if (do_cols) { + out[0] = v[0]; + out[1] = _mm256_sub_epi32(_mm256_setzero_si256(), v[8]); + out[2] = v[12]; + out[3] = _mm256_sub_epi32(_mm256_setzero_si256(), v[4]); + out[4] = v[6]; + out[5] = _mm256_sub_epi32(_mm256_setzero_si256(), v[14]); + out[6] = v[10]; + out[7] = _mm256_sub_epi32(_mm256_setzero_si256(), v[2]); + out[8] = v[3]; + out[9] = _mm256_sub_epi32(_mm256_setzero_si256(), v[11]); + out[10] = v[15]; + out[11] = _mm256_sub_epi32(_mm256_setzero_si256(), v[7]); + out[12] = v[5]; + out[13] = _mm256_sub_epi32(_mm256_setzero_si256(), v[13]); + out[14] = v[9]; + out[15] = _mm256_sub_epi32(_mm256_setzero_si256(), v[1]); + } else { + const int log_range_out = AOMMAX(16, bd + 6); + const __m256i clamp_lo_out = + _mm256_set1_epi32(-(1 << (log_range_out - 1))); + const __m256i clamp_hi_out = + _mm256_set1_epi32((1 << (log_range_out - 1)) - 1); + + neg_shift_avx2(v[0], v[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_avx2(v[12], v[4], out + 2, out + 3, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_avx2(v[6], v[14], out + 4, out + 5, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_avx2(v[10], v[2], out + 6, out + 7, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_avx2(v[3], v[11], out + 8, out + 9, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_avx2(v[15], v[7], out + 10, out + 11, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_avx2(v[5], v[13], out + 12, out + 13, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_avx2(v[9], v[1], out + 14, out + 15, &clamp_lo_out, + &clamp_hi_out, out_shift); + } + } +} + +static void iadst16_low8_avx2(__m256i *in, __m256i *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m256i cospi2 = _mm256_set1_epi32(cospi[2]); + const __m256i cospi62 = _mm256_set1_epi32(cospi[62]); + const __m256i cospi10 = _mm256_set1_epi32(cospi[10]); + const __m256i cospi54 = _mm256_set1_epi32(cospi[54]); + 
const __m256i cospi18 = _mm256_set1_epi32(cospi[18]); + const __m256i cospi46 = _mm256_set1_epi32(cospi[46]); + const __m256i cospi26 = _mm256_set1_epi32(cospi[26]); + const __m256i cospi38 = _mm256_set1_epi32(cospi[38]); + const __m256i cospi34 = _mm256_set1_epi32(cospi[34]); + const __m256i cospi30 = _mm256_set1_epi32(cospi[30]); + const __m256i cospi42 = _mm256_set1_epi32(cospi[42]); + const __m256i cospi22 = _mm256_set1_epi32(cospi[22]); + const __m256i cospi50 = _mm256_set1_epi32(cospi[50]); + const __m256i cospi14 = _mm256_set1_epi32(cospi[14]); + const __m256i cospi58 = _mm256_set1_epi32(cospi[58]); + const __m256i cospi6 = _mm256_set1_epi32(cospi[6]); + const __m256i cospi8 = _mm256_set1_epi32(cospi[8]); + const __m256i cospi56 = _mm256_set1_epi32(cospi[56]); + const __m256i cospi40 = _mm256_set1_epi32(cospi[40]); + const __m256i cospi24 = _mm256_set1_epi32(cospi[24]); + const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]); + const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]); + const __m256i cospi48 = _mm256_set1_epi32(cospi[48]); + const __m256i cospi16 = _mm256_set1_epi32(cospi[16]); + const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]); + const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); + const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); + const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1))); + const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1); + __m256i u[16], x, y; + + { + // stage 0 + // stage 1 + // stage 2 + __m256i zero = _mm256_setzero_si256(); + x = _mm256_mullo_epi32(in[0], cospi62); + u[0] = _mm256_add_epi32(x, rnding); + u[0] = _mm256_srai_epi32(u[0], bit); + + x = _mm256_mullo_epi32(in[0], cospi2); + u[1] = _mm256_sub_epi32(zero, x); + u[1] = _mm256_add_epi32(u[1], rnding); + u[1] = _mm256_srai_epi32(u[1], bit); + + x = _mm256_mullo_epi32(in[2], cospi54); + u[2] = _mm256_add_epi32(x, rnding); + u[2] = _mm256_srai_epi32(u[2], bit); + + x = _mm256_mullo_epi32(in[2], cospi10); + u[3] = _mm256_sub_epi32(zero, x); + u[3] = _mm256_add_epi32(u[3], rnding); + u[3] = _mm256_srai_epi32(u[3], bit); + + x = _mm256_mullo_epi32(in[4], cospi46); + u[4] = _mm256_add_epi32(x, rnding); + u[4] = _mm256_srai_epi32(u[4], bit); + + x = _mm256_mullo_epi32(in[4], cospi18); + u[5] = _mm256_sub_epi32(zero, x); + u[5] = _mm256_add_epi32(u[5], rnding); + u[5] = _mm256_srai_epi32(u[5], bit); + + x = _mm256_mullo_epi32(in[6], cospi38); + u[6] = _mm256_add_epi32(x, rnding); + u[6] = _mm256_srai_epi32(u[6], bit); + + x = _mm256_mullo_epi32(in[6], cospi26); + u[7] = _mm256_sub_epi32(zero, x); + u[7] = _mm256_add_epi32(u[7], rnding); + u[7] = _mm256_srai_epi32(u[7], bit); + + u[8] = _mm256_mullo_epi32(in[7], cospi34); + u[8] = _mm256_add_epi32(u[8], rnding); + u[8] = _mm256_srai_epi32(u[8], bit); + + u[9] = _mm256_mullo_epi32(in[7], cospi30); + u[9] = _mm256_add_epi32(u[9], rnding); + u[9] = _mm256_srai_epi32(u[9], bit); + + u[10] = _mm256_mullo_epi32(in[5], cospi42); + u[10] = _mm256_add_epi32(u[10], rnding); + u[10] = _mm256_srai_epi32(u[10], bit); + + u[11] = _mm256_mullo_epi32(in[5], cospi22); + u[11] = _mm256_add_epi32(u[11], rnding); + u[11] = _mm256_srai_epi32(u[11], bit); + + u[12] = _mm256_mullo_epi32(in[3], cospi50); + u[12] = _mm256_add_epi32(u[12], rnding); + 
u[12] = _mm256_srai_epi32(u[12], bit); + + u[13] = _mm256_mullo_epi32(in[3], cospi14); + u[13] = _mm256_add_epi32(u[13], rnding); + u[13] = _mm256_srai_epi32(u[13], bit); + + u[14] = _mm256_mullo_epi32(in[1], cospi58); + u[14] = _mm256_add_epi32(u[14], rnding); + u[14] = _mm256_srai_epi32(u[14], bit); + + u[15] = _mm256_mullo_epi32(in[1], cospi6); + u[15] = _mm256_add_epi32(u[15], rnding); + u[15] = _mm256_srai_epi32(u[15], bit); + + // stage 3 + addsub_avx2(u[0], u[8], &u[0], &u[8], &clamp_lo, &clamp_hi); + addsub_avx2(u[1], u[9], &u[1], &u[9], &clamp_lo, &clamp_hi); + addsub_avx2(u[2], u[10], &u[2], &u[10], &clamp_lo, &clamp_hi); + addsub_avx2(u[3], u[11], &u[3], &u[11], &clamp_lo, &clamp_hi); + addsub_avx2(u[4], u[12], &u[4], &u[12], &clamp_lo, &clamp_hi); + addsub_avx2(u[5], u[13], &u[5], &u[13], &clamp_lo, &clamp_hi); + addsub_avx2(u[6], u[14], &u[6], &u[14], &clamp_lo, &clamp_hi); + addsub_avx2(u[7], u[15], &u[7], &u[15], &clamp_lo, &clamp_hi); + + // stage 4 + y = _mm256_mullo_epi32(u[8], cospi56); + x = _mm256_mullo_epi32(u[9], cospi56); + u[8] = _mm256_mullo_epi32(u[8], cospi8); + u[8] = _mm256_add_epi32(u[8], x); + u[8] = _mm256_add_epi32(u[8], rnding); + u[8] = _mm256_srai_epi32(u[8], bit); + + x = _mm256_mullo_epi32(u[9], cospi8); + u[9] = _mm256_sub_epi32(y, x); + u[9] = _mm256_add_epi32(u[9], rnding); + u[9] = _mm256_srai_epi32(u[9], bit); + + x = _mm256_mullo_epi32(u[11], cospi24); + y = _mm256_mullo_epi32(u[10], cospi24); + u[10] = _mm256_mullo_epi32(u[10], cospi40); + u[10] = _mm256_add_epi32(u[10], x); + u[10] = _mm256_add_epi32(u[10], rnding); + u[10] = _mm256_srai_epi32(u[10], bit); + + x = _mm256_mullo_epi32(u[11], cospi40); + u[11] = _mm256_sub_epi32(y, x); + u[11] = _mm256_add_epi32(u[11], rnding); + u[11] = _mm256_srai_epi32(u[11], bit); + + x = _mm256_mullo_epi32(u[13], cospi8); + y = _mm256_mullo_epi32(u[12], cospi8); + u[12] = _mm256_mullo_epi32(u[12], cospim56); + u[12] = _mm256_add_epi32(u[12], x); + u[12] = _mm256_add_epi32(u[12], 
rnding); + u[12] = _mm256_srai_epi32(u[12], bit); + + x = _mm256_mullo_epi32(u[13], cospim56); + u[13] = _mm256_sub_epi32(y, x); + u[13] = _mm256_add_epi32(u[13], rnding); + u[13] = _mm256_srai_epi32(u[13], bit); + + x = _mm256_mullo_epi32(u[15], cospi40); + y = _mm256_mullo_epi32(u[14], cospi40); + u[14] = _mm256_mullo_epi32(u[14], cospim24); + u[14] = _mm256_add_epi32(u[14], x); + u[14] = _mm256_add_epi32(u[14], rnding); + u[14] = _mm256_srai_epi32(u[14], bit); + + x = _mm256_mullo_epi32(u[15], cospim24); + u[15] = _mm256_sub_epi32(y, x); + u[15] = _mm256_add_epi32(u[15], rnding); + u[15] = _mm256_srai_epi32(u[15], bit); + + // stage 5 + addsub_avx2(u[0], u[4], &u[0], &u[4], &clamp_lo, &clamp_hi); + addsub_avx2(u[1], u[5], &u[1], &u[5], &clamp_lo, &clamp_hi); + addsub_avx2(u[2], u[6], &u[2], &u[6], &clamp_lo, &clamp_hi); + addsub_avx2(u[3], u[7], &u[3], &u[7], &clamp_lo, &clamp_hi); + addsub_avx2(u[8], u[12], &u[8], &u[12], &clamp_lo, &clamp_hi); + addsub_avx2(u[9], u[13], &u[9], &u[13], &clamp_lo, &clamp_hi); + addsub_avx2(u[10], u[14], &u[10], &u[14], &clamp_lo, &clamp_hi); + addsub_avx2(u[11], u[15], &u[11], &u[15], &clamp_lo, &clamp_hi); + + // stage 6 + x = _mm256_mullo_epi32(u[5], cospi48); + y = _mm256_mullo_epi32(u[4], cospi48); + u[4] = _mm256_mullo_epi32(u[4], cospi16); + u[4] = _mm256_add_epi32(u[4], x); + u[4] = _mm256_add_epi32(u[4], rnding); + u[4] = _mm256_srai_epi32(u[4], bit); + + x = _mm256_mullo_epi32(u[5], cospi16); + u[5] = _mm256_sub_epi32(y, x); + u[5] = _mm256_add_epi32(u[5], rnding); + u[5] = _mm256_srai_epi32(u[5], bit); + + x = _mm256_mullo_epi32(u[7], cospi16); + y = _mm256_mullo_epi32(u[6], cospi16); + u[6] = _mm256_mullo_epi32(u[6], cospim48); + u[6] = _mm256_add_epi32(u[6], x); + u[6] = _mm256_add_epi32(u[6], rnding); + u[6] = _mm256_srai_epi32(u[6], bit); + + x = _mm256_mullo_epi32(u[7], cospim48); + u[7] = _mm256_sub_epi32(y, x); + u[7] = _mm256_add_epi32(u[7], rnding); + u[7] = _mm256_srai_epi32(u[7], bit); + + x = 
_mm256_mullo_epi32(u[13], cospi48); + y = _mm256_mullo_epi32(u[12], cospi48); + u[12] = _mm256_mullo_epi32(u[12], cospi16); + u[12] = _mm256_add_epi32(u[12], x); + u[12] = _mm256_add_epi32(u[12], rnding); + u[12] = _mm256_srai_epi32(u[12], bit); + + x = _mm256_mullo_epi32(u[13], cospi16); + u[13] = _mm256_sub_epi32(y, x); + u[13] = _mm256_add_epi32(u[13], rnding); + u[13] = _mm256_srai_epi32(u[13], bit); + + x = _mm256_mullo_epi32(u[15], cospi16); + y = _mm256_mullo_epi32(u[14], cospi16); + u[14] = _mm256_mullo_epi32(u[14], cospim48); + u[14] = _mm256_add_epi32(u[14], x); + u[14] = _mm256_add_epi32(u[14], rnding); + u[14] = _mm256_srai_epi32(u[14], bit); + + x = _mm256_mullo_epi32(u[15], cospim48); + u[15] = _mm256_sub_epi32(y, x); + u[15] = _mm256_add_epi32(u[15], rnding); + u[15] = _mm256_srai_epi32(u[15], bit); + + // stage 7 + addsub_avx2(u[0], u[2], &u[0], &u[2], &clamp_lo, &clamp_hi); + addsub_avx2(u[1], u[3], &u[1], &u[3], &clamp_lo, &clamp_hi); + addsub_avx2(u[4], u[6], &u[4], &u[6], &clamp_lo, &clamp_hi); + addsub_avx2(u[5], u[7], &u[5], &u[7], &clamp_lo, &clamp_hi); + addsub_avx2(u[8], u[10], &u[8], &u[10], &clamp_lo, &clamp_hi); + addsub_avx2(u[9], u[11], &u[9], &u[11], &clamp_lo, &clamp_hi); + addsub_avx2(u[12], u[14], &u[12], &u[14], &clamp_lo, &clamp_hi); + addsub_avx2(u[13], u[15], &u[13], &u[15], &clamp_lo, &clamp_hi); + + // stage 8 + y = _mm256_mullo_epi32(u[2], cospi32); + x = _mm256_mullo_epi32(u[3], cospi32); + u[2] = _mm256_add_epi32(y, x); + u[2] = _mm256_add_epi32(u[2], rnding); + u[2] = _mm256_srai_epi32(u[2], bit); + + u[3] = _mm256_sub_epi32(y, x); + u[3] = _mm256_add_epi32(u[3], rnding); + u[3] = _mm256_srai_epi32(u[3], bit); + y = _mm256_mullo_epi32(u[6], cospi32); + x = _mm256_mullo_epi32(u[7], cospi32); + u[6] = _mm256_add_epi32(y, x); + u[6] = _mm256_add_epi32(u[6], rnding); + u[6] = _mm256_srai_epi32(u[6], bit); + + u[7] = _mm256_sub_epi32(y, x); + u[7] = _mm256_add_epi32(u[7], rnding); + u[7] = _mm256_srai_epi32(u[7], bit); + + y = 
_mm256_mullo_epi32(u[10], cospi32); + x = _mm256_mullo_epi32(u[11], cospi32); + u[10] = _mm256_add_epi32(y, x); + u[10] = _mm256_add_epi32(u[10], rnding); + u[10] = _mm256_srai_epi32(u[10], bit); + + u[11] = _mm256_sub_epi32(y, x); + u[11] = _mm256_add_epi32(u[11], rnding); + u[11] = _mm256_srai_epi32(u[11], bit); + + y = _mm256_mullo_epi32(u[14], cospi32); + x = _mm256_mullo_epi32(u[15], cospi32); + u[14] = _mm256_add_epi32(y, x); + u[14] = _mm256_add_epi32(u[14], rnding); + u[14] = _mm256_srai_epi32(u[14], bit); + + u[15] = _mm256_sub_epi32(y, x); + u[15] = _mm256_add_epi32(u[15], rnding); + u[15] = _mm256_srai_epi32(u[15], bit); + + // stage 9 + if (do_cols) { + out[0] = u[0]; + out[1] = _mm256_sub_epi32(_mm256_setzero_si256(), u[8]); + out[2] = u[12]; + out[3] = _mm256_sub_epi32(_mm256_setzero_si256(), u[4]); + out[4] = u[6]; + out[5] = _mm256_sub_epi32(_mm256_setzero_si256(), u[14]); + out[6] = u[10]; + out[7] = _mm256_sub_epi32(_mm256_setzero_si256(), u[2]); + out[8] = u[3]; + out[9] = _mm256_sub_epi32(_mm256_setzero_si256(), u[11]); + out[10] = u[15]; + out[11] = _mm256_sub_epi32(_mm256_setzero_si256(), u[7]); + out[12] = u[5]; + out[13] = _mm256_sub_epi32(_mm256_setzero_si256(), u[13]); + out[14] = u[9]; + out[15] = _mm256_sub_epi32(_mm256_setzero_si256(), u[1]); + } else { + const int log_range_out = AOMMAX(16, bd + 6); + const __m256i clamp_lo_out = + _mm256_set1_epi32(-(1 << (log_range_out - 1))); + const __m256i clamp_hi_out = + _mm256_set1_epi32((1 << (log_range_out - 1)) - 1); + + neg_shift_avx2(u[0], u[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_avx2(u[12], u[4], out + 2, out + 3, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_avx2(u[6], u[14], out + 4, out + 5, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_avx2(u[10], u[2], out + 6, out + 7, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_avx2(u[3], u[11], out + 8, out + 9, &clamp_lo_out, + &clamp_hi_out, out_shift); + 
neg_shift_avx2(u[15], u[7], out + 10, out + 11, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_avx2(u[5], u[13], out + 12, out + 13, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_avx2(u[9], u[1], out + 14, out + 15, &clamp_lo_out, + &clamp_hi_out, out_shift); + } + } +} + +static void iadst16_avx2(__m256i *in, __m256i *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m256i cospi2 = _mm256_set1_epi32(cospi[2]); + const __m256i cospi62 = _mm256_set1_epi32(cospi[62]); + const __m256i cospi10 = _mm256_set1_epi32(cospi[10]); + const __m256i cospi54 = _mm256_set1_epi32(cospi[54]); + const __m256i cospi18 = _mm256_set1_epi32(cospi[18]); + const __m256i cospi46 = _mm256_set1_epi32(cospi[46]); + const __m256i cospi26 = _mm256_set1_epi32(cospi[26]); + const __m256i cospi38 = _mm256_set1_epi32(cospi[38]); + const __m256i cospi34 = _mm256_set1_epi32(cospi[34]); + const __m256i cospi30 = _mm256_set1_epi32(cospi[30]); + const __m256i cospi42 = _mm256_set1_epi32(cospi[42]); + const __m256i cospi22 = _mm256_set1_epi32(cospi[22]); + const __m256i cospi50 = _mm256_set1_epi32(cospi[50]); + const __m256i cospi14 = _mm256_set1_epi32(cospi[14]); + const __m256i cospi58 = _mm256_set1_epi32(cospi[58]); + const __m256i cospi6 = _mm256_set1_epi32(cospi[6]); + const __m256i cospi8 = _mm256_set1_epi32(cospi[8]); + const __m256i cospi56 = _mm256_set1_epi32(cospi[56]); + const __m256i cospi40 = _mm256_set1_epi32(cospi[40]); + const __m256i cospi24 = _mm256_set1_epi32(cospi[24]); + const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]); + const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]); + const __m256i cospi48 = _mm256_set1_epi32(cospi[48]); + const __m256i cospi16 = _mm256_set1_epi32(cospi[16]); + const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]); + const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); + const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols 
? 6 : 8)); + const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1))); + const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1); + __m256i u[16], v[16], x, y; + + { + // stage 0 + // stage 1 + // stage 2 + v[0] = _mm256_mullo_epi32(in[15], cospi2); + x = _mm256_mullo_epi32(in[0], cospi62); + v[0] = _mm256_add_epi32(v[0], x); + v[0] = _mm256_add_epi32(v[0], rnding); + v[0] = _mm256_srai_epi32(v[0], bit); + + v[1] = _mm256_mullo_epi32(in[15], cospi62); + x = _mm256_mullo_epi32(in[0], cospi2); + v[1] = _mm256_sub_epi32(v[1], x); + v[1] = _mm256_add_epi32(v[1], rnding); + v[1] = _mm256_srai_epi32(v[1], bit); + + v[2] = _mm256_mullo_epi32(in[13], cospi10); + x = _mm256_mullo_epi32(in[2], cospi54); + v[2] = _mm256_add_epi32(v[2], x); + v[2] = _mm256_add_epi32(v[2], rnding); + v[2] = _mm256_srai_epi32(v[2], bit); + + v[3] = _mm256_mullo_epi32(in[13], cospi54); + x = _mm256_mullo_epi32(in[2], cospi10); + v[3] = _mm256_sub_epi32(v[3], x); + v[3] = _mm256_add_epi32(v[3], rnding); + v[3] = _mm256_srai_epi32(v[3], bit); + + v[4] = _mm256_mullo_epi32(in[11], cospi18); + x = _mm256_mullo_epi32(in[4], cospi46); + v[4] = _mm256_add_epi32(v[4], x); + v[4] = _mm256_add_epi32(v[4], rnding); + v[4] = _mm256_srai_epi32(v[4], bit); + + v[5] = _mm256_mullo_epi32(in[11], cospi46); + x = _mm256_mullo_epi32(in[4], cospi18); + v[5] = _mm256_sub_epi32(v[5], x); + v[5] = _mm256_add_epi32(v[5], rnding); + v[5] = _mm256_srai_epi32(v[5], bit); + + v[6] = _mm256_mullo_epi32(in[9], cospi26); + x = _mm256_mullo_epi32(in[6], cospi38); + v[6] = _mm256_add_epi32(v[6], x); + v[6] = _mm256_add_epi32(v[6], rnding); + v[6] = _mm256_srai_epi32(v[6], bit); + + v[7] = _mm256_mullo_epi32(in[9], cospi38); + x = _mm256_mullo_epi32(in[6], cospi26); + v[7] = _mm256_sub_epi32(v[7], x); + v[7] = _mm256_add_epi32(v[7], rnding); + v[7] = _mm256_srai_epi32(v[7], bit); + + v[8] = _mm256_mullo_epi32(in[7], cospi34); + x = _mm256_mullo_epi32(in[8], cospi30); + v[8] = _mm256_add_epi32(v[8], x); + 
v[8] = _mm256_add_epi32(v[8], rnding); + v[8] = _mm256_srai_epi32(v[8], bit); + + v[9] = _mm256_mullo_epi32(in[7], cospi30); + x = _mm256_mullo_epi32(in[8], cospi34); + v[9] = _mm256_sub_epi32(v[9], x); + v[9] = _mm256_add_epi32(v[9], rnding); + v[9] = _mm256_srai_epi32(v[9], bit); + + v[10] = _mm256_mullo_epi32(in[5], cospi42); + x = _mm256_mullo_epi32(in[10], cospi22); + v[10] = _mm256_add_epi32(v[10], x); + v[10] = _mm256_add_epi32(v[10], rnding); + v[10] = _mm256_srai_epi32(v[10], bit); + + v[11] = _mm256_mullo_epi32(in[5], cospi22); + x = _mm256_mullo_epi32(in[10], cospi42); + v[11] = _mm256_sub_epi32(v[11], x); + v[11] = _mm256_add_epi32(v[11], rnding); + v[11] = _mm256_srai_epi32(v[11], bit); + + v[12] = _mm256_mullo_epi32(in[3], cospi50); + x = _mm256_mullo_epi32(in[12], cospi14); + v[12] = _mm256_add_epi32(v[12], x); + v[12] = _mm256_add_epi32(v[12], rnding); + v[12] = _mm256_srai_epi32(v[12], bit); + + v[13] = _mm256_mullo_epi32(in[3], cospi14); + x = _mm256_mullo_epi32(in[12], cospi50); + v[13] = _mm256_sub_epi32(v[13], x); + v[13] = _mm256_add_epi32(v[13], rnding); + v[13] = _mm256_srai_epi32(v[13], bit); + + v[14] = _mm256_mullo_epi32(in[1], cospi58); + x = _mm256_mullo_epi32(in[14], cospi6); + v[14] = _mm256_add_epi32(v[14], x); + v[14] = _mm256_add_epi32(v[14], rnding); + v[14] = _mm256_srai_epi32(v[14], bit); + + v[15] = _mm256_mullo_epi32(in[1], cospi6); + x = _mm256_mullo_epi32(in[14], cospi58); + v[15] = _mm256_sub_epi32(v[15], x); + v[15] = _mm256_add_epi32(v[15], rnding); + v[15] = _mm256_srai_epi32(v[15], bit); + + // stage 3 + addsub_avx2(v[0], v[8], &u[0], &u[8], &clamp_lo, &clamp_hi); + addsub_avx2(v[1], v[9], &u[1], &u[9], &clamp_lo, &clamp_hi); + addsub_avx2(v[2], v[10], &u[2], &u[10], &clamp_lo, &clamp_hi); + addsub_avx2(v[3], v[11], &u[3], &u[11], &clamp_lo, &clamp_hi); + addsub_avx2(v[4], v[12], &u[4], &u[12], &clamp_lo, &clamp_hi); + addsub_avx2(v[5], v[13], &u[5], &u[13], &clamp_lo, &clamp_hi); + addsub_avx2(v[6], v[14], &u[6], 
&u[14], &clamp_lo, &clamp_hi); + addsub_avx2(v[7], v[15], &u[7], &u[15], &clamp_lo, &clamp_hi); + + // stage 4 + v[0] = u[0]; + v[1] = u[1]; + v[2] = u[2]; + v[3] = u[3]; + v[4] = u[4]; + v[5] = u[5]; + v[6] = u[6]; + v[7] = u[7]; + + v[8] = _mm256_mullo_epi32(u[8], cospi8); + x = _mm256_mullo_epi32(u[9], cospi56); + v[8] = _mm256_add_epi32(v[8], x); + v[8] = _mm256_add_epi32(v[8], rnding); + v[8] = _mm256_srai_epi32(v[8], bit); + + v[9] = _mm256_mullo_epi32(u[8], cospi56); + x = _mm256_mullo_epi32(u[9], cospi8); + v[9] = _mm256_sub_epi32(v[9], x); + v[9] = _mm256_add_epi32(v[9], rnding); + v[9] = _mm256_srai_epi32(v[9], bit); + + v[10] = _mm256_mullo_epi32(u[10], cospi40); + x = _mm256_mullo_epi32(u[11], cospi24); + v[10] = _mm256_add_epi32(v[10], x); + v[10] = _mm256_add_epi32(v[10], rnding); + v[10] = _mm256_srai_epi32(v[10], bit); + + v[11] = _mm256_mullo_epi32(u[10], cospi24); + x = _mm256_mullo_epi32(u[11], cospi40); + v[11] = _mm256_sub_epi32(v[11], x); + v[11] = _mm256_add_epi32(v[11], rnding); + v[11] = _mm256_srai_epi32(v[11], bit); + + v[12] = _mm256_mullo_epi32(u[12], cospim56); + x = _mm256_mullo_epi32(u[13], cospi8); + v[12] = _mm256_add_epi32(v[12], x); + v[12] = _mm256_add_epi32(v[12], rnding); + v[12] = _mm256_srai_epi32(v[12], bit); + + v[13] = _mm256_mullo_epi32(u[12], cospi8); + x = _mm256_mullo_epi32(u[13], cospim56); + v[13] = _mm256_sub_epi32(v[13], x); + v[13] = _mm256_add_epi32(v[13], rnding); + v[13] = _mm256_srai_epi32(v[13], bit); + + v[14] = _mm256_mullo_epi32(u[14], cospim24); + x = _mm256_mullo_epi32(u[15], cospi40); + v[14] = _mm256_add_epi32(v[14], x); + v[14] = _mm256_add_epi32(v[14], rnding); + v[14] = _mm256_srai_epi32(v[14], bit); + + v[15] = _mm256_mullo_epi32(u[14], cospi40); + x = _mm256_mullo_epi32(u[15], cospim24); + v[15] = _mm256_sub_epi32(v[15], x); + v[15] = _mm256_add_epi32(v[15], rnding); + v[15] = _mm256_srai_epi32(v[15], bit); + + // stage 5 + addsub_avx2(v[0], v[4], &u[0], &u[4], &clamp_lo, &clamp_hi); + 
addsub_avx2(v[1], v[5], &u[1], &u[5], &clamp_lo, &clamp_hi); + addsub_avx2(v[2], v[6], &u[2], &u[6], &clamp_lo, &clamp_hi); + addsub_avx2(v[3], v[7], &u[3], &u[7], &clamp_lo, &clamp_hi); + addsub_avx2(v[8], v[12], &u[8], &u[12], &clamp_lo, &clamp_hi); + addsub_avx2(v[9], v[13], &u[9], &u[13], &clamp_lo, &clamp_hi); + addsub_avx2(v[10], v[14], &u[10], &u[14], &clamp_lo, &clamp_hi); + addsub_avx2(v[11], v[15], &u[11], &u[15], &clamp_lo, &clamp_hi); + + // stage 6 + v[0] = u[0]; + v[1] = u[1]; + v[2] = u[2]; + v[3] = u[3]; + + v[4] = _mm256_mullo_epi32(u[4], cospi16); + x = _mm256_mullo_epi32(u[5], cospi48); + v[4] = _mm256_add_epi32(v[4], x); + v[4] = _mm256_add_epi32(v[4], rnding); + v[4] = _mm256_srai_epi32(v[4], bit); + + v[5] = _mm256_mullo_epi32(u[4], cospi48); + x = _mm256_mullo_epi32(u[5], cospi16); + v[5] = _mm256_sub_epi32(v[5], x); + v[5] = _mm256_add_epi32(v[5], rnding); + v[5] = _mm256_srai_epi32(v[5], bit); + + v[6] = _mm256_mullo_epi32(u[6], cospim48); + x = _mm256_mullo_epi32(u[7], cospi16); + v[6] = _mm256_add_epi32(v[6], x); + v[6] = _mm256_add_epi32(v[6], rnding); + v[6] = _mm256_srai_epi32(v[6], bit); + + v[7] = _mm256_mullo_epi32(u[6], cospi16); + x = _mm256_mullo_epi32(u[7], cospim48); + v[7] = _mm256_sub_epi32(v[7], x); + v[7] = _mm256_add_epi32(v[7], rnding); + v[7] = _mm256_srai_epi32(v[7], bit); + + v[8] = u[8]; + v[9] = u[9]; + v[10] = u[10]; + v[11] = u[11]; + + v[12] = _mm256_mullo_epi32(u[12], cospi16); + x = _mm256_mullo_epi32(u[13], cospi48); + v[12] = _mm256_add_epi32(v[12], x); + v[12] = _mm256_add_epi32(v[12], rnding); + v[12] = _mm256_srai_epi32(v[12], bit); + + v[13] = _mm256_mullo_epi32(u[12], cospi48); + x = _mm256_mullo_epi32(u[13], cospi16); + v[13] = _mm256_sub_epi32(v[13], x); + v[13] = _mm256_add_epi32(v[13], rnding); + v[13] = _mm256_srai_epi32(v[13], bit); + + v[14] = _mm256_mullo_epi32(u[14], cospim48); + x = _mm256_mullo_epi32(u[15], cospi16); + v[14] = _mm256_add_epi32(v[14], x); + v[14] = _mm256_add_epi32(v[14], 
rnding); + v[14] = _mm256_srai_epi32(v[14], bit); + + v[15] = _mm256_mullo_epi32(u[14], cospi16); + x = _mm256_mullo_epi32(u[15], cospim48); + v[15] = _mm256_sub_epi32(v[15], x); + v[15] = _mm256_add_epi32(v[15], rnding); + v[15] = _mm256_srai_epi32(v[15], bit); + + // stage 7 + addsub_avx2(v[0], v[2], &u[0], &u[2], &clamp_lo, &clamp_hi); + addsub_avx2(v[1], v[3], &u[1], &u[3], &clamp_lo, &clamp_hi); + addsub_avx2(v[4], v[6], &u[4], &u[6], &clamp_lo, &clamp_hi); + addsub_avx2(v[5], v[7], &u[5], &u[7], &clamp_lo, &clamp_hi); + addsub_avx2(v[8], v[10], &u[8], &u[10], &clamp_lo, &clamp_hi); + addsub_avx2(v[9], v[11], &u[9], &u[11], &clamp_lo, &clamp_hi); + addsub_avx2(v[12], v[14], &u[12], &u[14], &clamp_lo, &clamp_hi); + addsub_avx2(v[13], v[15], &u[13], &u[15], &clamp_lo, &clamp_hi); + + // stage 8 + v[0] = u[0]; + v[1] = u[1]; + + y = _mm256_mullo_epi32(u[2], cospi32); + x = _mm256_mullo_epi32(u[3], cospi32); + v[2] = _mm256_add_epi32(y, x); + v[2] = _mm256_add_epi32(v[2], rnding); + v[2] = _mm256_srai_epi32(v[2], bit); + + v[3] = _mm256_sub_epi32(y, x); + v[3] = _mm256_add_epi32(v[3], rnding); + v[3] = _mm256_srai_epi32(v[3], bit); + + v[4] = u[4]; + v[5] = u[5]; + + y = _mm256_mullo_epi32(u[6], cospi32); + x = _mm256_mullo_epi32(u[7], cospi32); + v[6] = _mm256_add_epi32(y, x); + v[6] = _mm256_add_epi32(v[6], rnding); + v[6] = _mm256_srai_epi32(v[6], bit); + + v[7] = _mm256_sub_epi32(y, x); + v[7] = _mm256_add_epi32(v[7], rnding); + v[7] = _mm256_srai_epi32(v[7], bit); + + v[8] = u[8]; + v[9] = u[9]; + + y = _mm256_mullo_epi32(u[10], cospi32); + x = _mm256_mullo_epi32(u[11], cospi32); + v[10] = _mm256_add_epi32(y, x); + v[10] = _mm256_add_epi32(v[10], rnding); + v[10] = _mm256_srai_epi32(v[10], bit); + + v[11] = _mm256_sub_epi32(y, x); + v[11] = _mm256_add_epi32(v[11], rnding); + v[11] = _mm256_srai_epi32(v[11], bit); + + v[12] = u[12]; + v[13] = u[13]; + + y = _mm256_mullo_epi32(u[14], cospi32); + x = _mm256_mullo_epi32(u[15], cospi32); + v[14] = 
_mm256_add_epi32(y, x); + v[14] = _mm256_add_epi32(v[14], rnding); + v[14] = _mm256_srai_epi32(v[14], bit); + + v[15] = _mm256_sub_epi32(y, x); + v[15] = _mm256_add_epi32(v[15], rnding); + v[15] = _mm256_srai_epi32(v[15], bit); + + // stage 9 if (do_cols) { - addsub_no_clamp_avx2(bf0[0], bf0[31], out + 0, out + 31); - addsub_no_clamp_avx2(bf0[1], bf0[30], out + 1, out + 30); - addsub_no_clamp_avx2(bf0[2], bf0[29], out + 2, out + 29); - addsub_no_clamp_avx2(bf0[3], bf0[28], out + 3, out + 28); - addsub_no_clamp_avx2(bf0[4], bf0[27], out + 4, out + 27); - addsub_no_clamp_avx2(bf0[5], bf0[26], out + 5, out + 26); - addsub_no_clamp_avx2(bf0[6], bf0[25], out + 6, out + 25); - addsub_no_clamp_avx2(bf0[7], bf0[24], out + 7, out + 24); - addsub_no_clamp_avx2(bf0[8], bf0[23], out + 8, out + 23); - addsub_no_clamp_avx2(bf0[9], bf0[22], out + 9, out + 22); - addsub_no_clamp_avx2(bf0[10], bf0[21], out + 10, out + 21); - addsub_no_clamp_avx2(bf0[11], bf0[20], out + 11, out + 20); - addsub_no_clamp_avx2(bf0[12], bf0[19], out + 12, out + 19); - addsub_no_clamp_avx2(bf0[13], bf0[18], out + 13, out + 18); - addsub_no_clamp_avx2(bf0[14], bf0[17], out + 14, out + 17); - addsub_no_clamp_avx2(bf0[15], bf0[16], out + 15, out + 16); + out[0] = v[0]; + out[1] = _mm256_sub_epi32(_mm256_setzero_si256(), v[8]); + out[2] = v[12]; + out[3] = _mm256_sub_epi32(_mm256_setzero_si256(), v[4]); + out[4] = v[6]; + out[5] = _mm256_sub_epi32(_mm256_setzero_si256(), v[14]); + out[6] = v[10]; + out[7] = _mm256_sub_epi32(_mm256_setzero_si256(), v[2]); + out[8] = v[3]; + out[9] = _mm256_sub_epi32(_mm256_setzero_si256(), v[11]); + out[10] = v[15]; + out[11] = _mm256_sub_epi32(_mm256_setzero_si256(), v[7]); + out[12] = v[5]; + out[13] = _mm256_sub_epi32(_mm256_setzero_si256(), v[13]); + out[14] = v[9]; + out[15] = _mm256_sub_epi32(_mm256_setzero_si256(), v[1]); } else { const int log_range_out = AOMMAX(16, bd + 6); - const __m256i clamp_lo_out = _mm256_set1_epi32(AOMMAX( - -(1 << (log_range_out - 1)), -(1 << 
(log_range - 1 - out_shift)))); - const __m256i clamp_hi_out = _mm256_set1_epi32(AOMMIN( - (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift)))); - - addsub_shift_avx2(bf0[0], bf0[31], out + 0, out + 31, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_avx2(bf0[1], bf0[30], out + 1, out + 30, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_avx2(bf0[2], bf0[29], out + 2, out + 29, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_avx2(bf0[3], bf0[28], out + 3, out + 28, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_avx2(bf0[4], bf0[27], out + 4, out + 27, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_avx2(bf0[5], bf0[26], out + 5, out + 26, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_avx2(bf0[6], bf0[25], out + 6, out + 25, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_avx2(bf0[7], bf0[24], out + 7, out + 24, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_avx2(bf0[8], bf0[23], out + 8, out + 23, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_avx2(bf0[9], bf0[22], out + 9, out + 22, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_avx2(bf0[10], bf0[21], out + 10, out + 21, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_avx2(bf0[11], bf0[20], out + 11, out + 20, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_avx2(bf0[12], bf0[19], out + 12, out + 19, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_avx2(bf0[13], bf0[18], out + 13, out + 18, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_avx2(bf0[14], bf0[17], out + 14, out + 17, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_avx2(bf0[15], bf0[16], out + 15, out + 16, &clamp_lo_out, - &clamp_hi_out, out_shift); + const __m256i clamp_lo_out = + _mm256_set1_epi32(-(1 << (log_range_out - 1))); + const __m256i clamp_hi_out = + _mm256_set1_epi32((1 << (log_range_out - 1)) - 1); + + neg_shift_avx2(v[0], v[8], out + 0, out + 1, 
&clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_avx2(v[12], v[4], out + 2, out + 3, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_avx2(v[6], v[14], out + 4, out + 5, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_avx2(v[10], v[2], out + 6, out + 7, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_avx2(v[3], v[11], out + 8, out + 9, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_avx2(v[15], v[7], out + 10, out + 11, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_avx2(v[5], v[13], out + 12, out + 13, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_avx2(v[9], v[1], out + 14, out + 15, &clamp_lo_out, + &clamp_hi_out, out_shift); + } + } +} +static void idct8x8_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); + const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); + __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1))); + __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1); + __m256i x; + + // stage 0 + // stage 1 + // stage 2 + // stage 3 + x = _mm256_mullo_epi32(in[0], cospi32); + x = _mm256_add_epi32(x, rnding); + x = _mm256_srai_epi32(x, bit); + + // stage 4 + // stage 5 + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + __m256i offset = _mm256_set1_epi32((1 << out_shift) >> 1); + clamp_lo = _mm256_set1_epi32(-(1 << (log_range_out - 1))); + clamp_hi = _mm256_set1_epi32((1 << (log_range_out - 1)) - 1); + x = _mm256_add_epi32(x, offset); + x = _mm256_sra_epi32(x, _mm_cvtsi32_si128(out_shift)); + } + x = _mm256_max_epi32(x, clamp_lo); + x = _mm256_min_epi32(x, clamp_hi); + out[0] = x; + out[1] = x; + out[2] = x; + out[3] = x; + out[4] = x; + out[5] = x; + out[6] = x; + out[7] = x; +} +static void idct8x8_avx2(__m256i *in, __m256i *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m256i cospi56 = _mm256_set1_epi32(cospi[56]); + const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]); + const __m256i cospi24 = _mm256_set1_epi32(cospi[24]); + const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]); + const __m256i cospi40 = _mm256_set1_epi32(cospi[40]); + const __m256i cospi8 = _mm256_set1_epi32(cospi[8]); + const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); + const __m256i cospi48 = _mm256_set1_epi32(cospi[48]); + const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]); + const __m256i cospi16 = _mm256_set1_epi32(cospi[16]); + const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); + const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1))); + const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1); + __m256i u0, u1, u2, u3, u4, u5, u6, u7; + __m256i v0, v1, v2, v3, v4, v5, v6, v7; + __m256i x, y; + + // stage 0 + // stage 1 + // stage 2 + u0 = in[0]; + u1 = in[4]; + u2 = in[2]; + u3 = in[6]; + + x = _mm256_mullo_epi32(in[1], cospi56); + y = _mm256_mullo_epi32(in[7], cospim8); + u4 = _mm256_add_epi32(x, y); + u4 = _mm256_add_epi32(u4, rnding); + u4 = _mm256_srai_epi32(u4, bit); + + x = _mm256_mullo_epi32(in[1], cospi8); + y = _mm256_mullo_epi32(in[7], cospi56); + u7 = _mm256_add_epi32(x, y); + u7 = _mm256_add_epi32(u7, rnding); + u7 = _mm256_srai_epi32(u7, bit); + + x = _mm256_mullo_epi32(in[5], cospi24); + y = _mm256_mullo_epi32(in[3], cospim40); + u5 = _mm256_add_epi32(x, y); + u5 = _mm256_add_epi32(u5, rnding); + u5 = _mm256_srai_epi32(u5, bit); + + x = _mm256_mullo_epi32(in[5], cospi40); + y = _mm256_mullo_epi32(in[3], cospi24); + u6 = _mm256_add_epi32(x, y); + u6 = _mm256_add_epi32(u6, rnding); + u6 = _mm256_srai_epi32(u6, bit); + + // stage 3 + x = _mm256_mullo_epi32(u0, cospi32); + y = _mm256_mullo_epi32(u1, cospi32); + v0 = _mm256_add_epi32(x, y); + v0 = _mm256_add_epi32(v0, rnding); + v0 = _mm256_srai_epi32(v0, bit); + + v1 = _mm256_sub_epi32(x, y); + v1 = _mm256_add_epi32(v1, rnding); + v1 = _mm256_srai_epi32(v1, bit); + + x = _mm256_mullo_epi32(u2, cospi48); + y = _mm256_mullo_epi32(u3, cospim16); + v2 = _mm256_add_epi32(x, y); + v2 = _mm256_add_epi32(v2, rnding); + v2 = _mm256_srai_epi32(v2, bit); + + x = _mm256_mullo_epi32(u2, cospi16); + y = _mm256_mullo_epi32(u3, cospi48); + v3 = _mm256_add_epi32(x, y); + v3 = _mm256_add_epi32(v3, rnding); + v3 = _mm256_srai_epi32(v3, bit); + + addsub_avx2(u4, u5, &v4, &v5, &clamp_lo, &clamp_hi); + addsub_avx2(u7, u6, &v7, &v6, &clamp_lo, &clamp_hi); + + // stage 4 + addsub_avx2(v0, v3, &u0, &u3, &clamp_lo, &clamp_hi); + addsub_avx2(v1, v2, &u1, &u2, &clamp_lo, 
&clamp_hi); + u4 = v4; + u7 = v7; + + x = _mm256_mullo_epi32(v5, cospi32); + y = _mm256_mullo_epi32(v6, cospi32); + u6 = _mm256_add_epi32(y, x); + u6 = _mm256_add_epi32(u6, rnding); + u6 = _mm256_srai_epi32(u6, bit); + + u5 = _mm256_sub_epi32(y, x); + u5 = _mm256_add_epi32(u5, rnding); + u5 = _mm256_srai_epi32(u5, bit); + + addsub_avx2(u0, u7, out + 0, out + 7, &clamp_lo, &clamp_hi); + addsub_avx2(u1, u6, out + 1, out + 6, &clamp_lo, &clamp_hi); + addsub_avx2(u2, u5, out + 2, out + 5, &clamp_lo, &clamp_hi); + addsub_avx2(u3, u4, out + 3, out + 4, &clamp_lo, &clamp_hi); + // stage 5 + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + const __m256i clamp_lo_out = _mm256_set1_epi32(-(1 << (log_range_out - 1))); + const __m256i clamp_hi_out = + _mm256_set1_epi32((1 << (log_range_out - 1)) - 1); + + round_shift_4x4_avx2(out, out_shift); + round_shift_4x4_avx2(out + 4, out_shift); + highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 8); + } +} +static void iadst8x8_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m256i cospi4 = _mm256_set1_epi32(cospi[4]); + const __m256i cospi60 = _mm256_set1_epi32(cospi[60]); + const __m256i cospi16 = _mm256_set1_epi32(cospi[16]); + const __m256i cospi48 = _mm256_set1_epi32(cospi[48]); + const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); + const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1)); + const __m256i kZero = _mm256_setzero_si256(); + __m256i u[8], x; + + // stage 0 + // stage 1 + // stage 2 + + x = _mm256_mullo_epi32(in[0], cospi60); + u[0] = _mm256_add_epi32(x, rnding); + u[0] = _mm256_srai_epi32(u[0], bit); + + x = _mm256_mullo_epi32(in[0], cospi4); + u[1] = _mm256_sub_epi32(kZero, x); + u[1] = _mm256_add_epi32(u[1], rnding); + u[1] = _mm256_srai_epi32(u[1], bit); + + // stage 3 + // stage 4 + __m256i temp1, temp2; + temp1 = _mm256_mullo_epi32(u[0], cospi16); + x = _mm256_mullo_epi32(u[1], cospi48); + 
temp1 = _mm256_add_epi32(temp1, x); + temp1 = _mm256_add_epi32(temp1, rnding); + temp1 = _mm256_srai_epi32(temp1, bit); + u[4] = temp1; + + temp2 = _mm256_mullo_epi32(u[0], cospi48); + x = _mm256_mullo_epi32(u[1], cospi16); + u[5] = _mm256_sub_epi32(temp2, x); + u[5] = _mm256_add_epi32(u[5], rnding); + u[5] = _mm256_srai_epi32(u[5], bit); + + // stage 5 + // stage 6 + temp1 = _mm256_mullo_epi32(u[0], cospi32); + x = _mm256_mullo_epi32(u[1], cospi32); + u[2] = _mm256_add_epi32(temp1, x); + u[2] = _mm256_add_epi32(u[2], rnding); + u[2] = _mm256_srai_epi32(u[2], bit); + + u[3] = _mm256_sub_epi32(temp1, x); + u[3] = _mm256_add_epi32(u[3], rnding); + u[3] = _mm256_srai_epi32(u[3], bit); + + temp1 = _mm256_mullo_epi32(u[4], cospi32); + x = _mm256_mullo_epi32(u[5], cospi32); + u[6] = _mm256_add_epi32(temp1, x); + u[6] = _mm256_add_epi32(u[6], rnding); + u[6] = _mm256_srai_epi32(u[6], bit); + + u[7] = _mm256_sub_epi32(temp1, x); + u[7] = _mm256_add_epi32(u[7], rnding); + u[7] = _mm256_srai_epi32(u[7], bit); + + // stage 7 + if (do_cols) { + out[0] = u[0]; + out[1] = _mm256_sub_epi32(kZero, u[4]); + out[2] = u[6]; + out[3] = _mm256_sub_epi32(kZero, u[2]); + out[4] = u[3]; + out[5] = _mm256_sub_epi32(kZero, u[7]); + out[6] = u[5]; + out[7] = _mm256_sub_epi32(kZero, u[1]); + } else { + const int log_range_out = AOMMAX(16, bd + 6); + const __m256i clamp_lo_out = _mm256_set1_epi32(-(1 << (log_range_out - 1))); + const __m256i clamp_hi_out = + _mm256_set1_epi32((1 << (log_range_out - 1)) - 1); + + neg_shift_avx2(u[0], u[4], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_avx2(u[6], u[2], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_avx2(u[3], u[7], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_avx2(u[5], u[1], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out, + out_shift); + } +} + +static void iadst8x8_avx2(__m256i *in, __m256i *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t 
*cospi = cospi_arr(bit); + const __m256i cospi4 = _mm256_set1_epi32(cospi[4]); + const __m256i cospi60 = _mm256_set1_epi32(cospi[60]); + const __m256i cospi20 = _mm256_set1_epi32(cospi[20]); + const __m256i cospi44 = _mm256_set1_epi32(cospi[44]); + const __m256i cospi36 = _mm256_set1_epi32(cospi[36]); + const __m256i cospi28 = _mm256_set1_epi32(cospi[28]); + const __m256i cospi52 = _mm256_set1_epi32(cospi[52]); + const __m256i cospi12 = _mm256_set1_epi32(cospi[12]); + const __m256i cospi16 = _mm256_set1_epi32(cospi[16]); + const __m256i cospi48 = _mm256_set1_epi32(cospi[48]); + const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]); + const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); + const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1)); + const __m256i kZero = _mm256_setzero_si256(); + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1))); + const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1); + __m256i u[8], v[8], x; + + // stage 0 + // stage 1 + // stage 2 + + u[0] = _mm256_mullo_epi32(in[7], cospi4); + x = _mm256_mullo_epi32(in[0], cospi60); + u[0] = _mm256_add_epi32(u[0], x); + u[0] = _mm256_add_epi32(u[0], rnding); + u[0] = _mm256_srai_epi32(u[0], bit); + + u[1] = _mm256_mullo_epi32(in[7], cospi60); + x = _mm256_mullo_epi32(in[0], cospi4); + u[1] = _mm256_sub_epi32(u[1], x); + u[1] = _mm256_add_epi32(u[1], rnding); + u[1] = _mm256_srai_epi32(u[1], bit); + + u[2] = _mm256_mullo_epi32(in[5], cospi20); + x = _mm256_mullo_epi32(in[2], cospi44); + u[2] = _mm256_add_epi32(u[2], x); + u[2] = _mm256_add_epi32(u[2], rnding); + u[2] = _mm256_srai_epi32(u[2], bit); + + u[3] = _mm256_mullo_epi32(in[5], cospi44); + x = _mm256_mullo_epi32(in[2], cospi20); + u[3] = _mm256_sub_epi32(u[3], x); + u[3] = _mm256_add_epi32(u[3], rnding); + u[3] = _mm256_srai_epi32(u[3], bit); + + u[4] = _mm256_mullo_epi32(in[3], cospi36); + x = _mm256_mullo_epi32(in[4], cospi28); + u[4] = 
_mm256_add_epi32(u[4], x); + u[4] = _mm256_add_epi32(u[4], rnding); + u[4] = _mm256_srai_epi32(u[4], bit); + + u[5] = _mm256_mullo_epi32(in[3], cospi28); + x = _mm256_mullo_epi32(in[4], cospi36); + u[5] = _mm256_sub_epi32(u[5], x); + u[5] = _mm256_add_epi32(u[5], rnding); + u[5] = _mm256_srai_epi32(u[5], bit); + + u[6] = _mm256_mullo_epi32(in[1], cospi52); + x = _mm256_mullo_epi32(in[6], cospi12); + u[6] = _mm256_add_epi32(u[6], x); + u[6] = _mm256_add_epi32(u[6], rnding); + u[6] = _mm256_srai_epi32(u[6], bit); + + u[7] = _mm256_mullo_epi32(in[1], cospi12); + x = _mm256_mullo_epi32(in[6], cospi52); + u[7] = _mm256_sub_epi32(u[7], x); + u[7] = _mm256_add_epi32(u[7], rnding); + u[7] = _mm256_srai_epi32(u[7], bit); + + // stage 3 + addsub_avx2(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi); + addsub_avx2(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi); + addsub_avx2(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi); + addsub_avx2(u[3], u[7], &v[3], &v[7], &clamp_lo, &clamp_hi); + + // stage 4 + u[0] = v[0]; + u[1] = v[1]; + u[2] = v[2]; + u[3] = v[3]; + + u[4] = _mm256_mullo_epi32(v[4], cospi16); + x = _mm256_mullo_epi32(v[5], cospi48); + u[4] = _mm256_add_epi32(u[4], x); + u[4] = _mm256_add_epi32(u[4], rnding); + u[4] = _mm256_srai_epi32(u[4], bit); + + u[5] = _mm256_mullo_epi32(v[4], cospi48); + x = _mm256_mullo_epi32(v[5], cospi16); + u[5] = _mm256_sub_epi32(u[5], x); + u[5] = _mm256_add_epi32(u[5], rnding); + u[5] = _mm256_srai_epi32(u[5], bit); + + u[6] = _mm256_mullo_epi32(v[6], cospim48); + x = _mm256_mullo_epi32(v[7], cospi16); + u[6] = _mm256_add_epi32(u[6], x); + u[6] = _mm256_add_epi32(u[6], rnding); + u[6] = _mm256_srai_epi32(u[6], bit); + + u[7] = _mm256_mullo_epi32(v[6], cospi16); + x = _mm256_mullo_epi32(v[7], cospim48); + u[7] = _mm256_sub_epi32(u[7], x); + u[7] = _mm256_add_epi32(u[7], rnding); + u[7] = _mm256_srai_epi32(u[7], bit); + + // stage 5 + addsub_avx2(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi); + addsub_avx2(u[1], u[3], &v[1], &v[3], 
&clamp_lo, &clamp_hi); + addsub_avx2(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi); + addsub_avx2(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi); + + // stage 6 + u[0] = v[0]; + u[1] = v[1]; + u[4] = v[4]; + u[5] = v[5]; + + v[0] = _mm256_mullo_epi32(v[2], cospi32); + x = _mm256_mullo_epi32(v[3], cospi32); + u[2] = _mm256_add_epi32(v[0], x); + u[2] = _mm256_add_epi32(u[2], rnding); + u[2] = _mm256_srai_epi32(u[2], bit); + + u[3] = _mm256_sub_epi32(v[0], x); + u[3] = _mm256_add_epi32(u[3], rnding); + u[3] = _mm256_srai_epi32(u[3], bit); + + v[0] = _mm256_mullo_epi32(v[6], cospi32); + x = _mm256_mullo_epi32(v[7], cospi32); + u[6] = _mm256_add_epi32(v[0], x); + u[6] = _mm256_add_epi32(u[6], rnding); + u[6] = _mm256_srai_epi32(u[6], bit); + + u[7] = _mm256_sub_epi32(v[0], x); + u[7] = _mm256_add_epi32(u[7], rnding); + u[7] = _mm256_srai_epi32(u[7], bit); + + // stage 7 + if (do_cols) { + out[0] = u[0]; + out[1] = _mm256_sub_epi32(kZero, u[4]); + out[2] = u[6]; + out[3] = _mm256_sub_epi32(kZero, u[2]); + out[4] = u[3]; + out[5] = _mm256_sub_epi32(kZero, u[7]); + out[6] = u[5]; + out[7] = _mm256_sub_epi32(kZero, u[1]); + } else { + const int log_range_out = AOMMAX(16, bd + 6); + const __m256i clamp_lo_out = _mm256_set1_epi32(-(1 << (log_range_out - 1))); + const __m256i clamp_hi_out = + _mm256_set1_epi32((1 << (log_range_out - 1)) - 1); + + neg_shift_avx2(u[0], u[4], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_avx2(u[6], u[2], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_avx2(u[3], u[7], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_avx2(u[5], u[1], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out, + out_shift); + } +} +static INLINE void idct64_stage8_avx2( + __m256i *u, const __m256i *cospim32, const __m256i *cospi32, + const __m256i *cospim16, const __m256i *cospi48, const __m256i *cospi16, + const __m256i *cospim48, const __m256i *clamp_lo, const __m256i *clamp_hi, + const __m256i 
*rnding, int bit) { + int i; + __m256i temp1, temp2, temp3, temp4; + temp1 = half_btf_avx2(cospim32, &u[10], cospi32, &u[13], rnding, bit); + u[13] = half_btf_avx2(cospi32, &u[10], cospi32, &u[13], rnding, bit); + u[10] = temp1; + temp2 = half_btf_avx2(cospim32, &u[11], cospi32, &u[12], rnding, bit); + u[12] = half_btf_avx2(cospi32, &u[11], cospi32, &u[12], rnding, bit); + u[11] = temp2; + + for (i = 16; i < 20; ++i) { + addsub_avx2(u[i], u[i ^ 7], &u[i], &u[i ^ 7], clamp_lo, clamp_hi); + addsub_avx2(u[i ^ 15], u[i ^ 8], &u[i ^ 15], &u[i ^ 8], clamp_lo, clamp_hi); + } + + temp1 = half_btf_avx2(cospim16, &u[36], cospi48, &u[59], rnding, bit); + temp2 = half_btf_avx2(cospim16, &u[37], cospi48, &u[58], rnding, bit); + temp3 = half_btf_avx2(cospim16, &u[38], cospi48, &u[57], rnding, bit); + temp4 = half_btf_avx2(cospim16, &u[39], cospi48, &u[56], rnding, bit); + u[56] = half_btf_avx2(cospi48, &u[39], cospi16, &u[56], rnding, bit); + u[57] = half_btf_avx2(cospi48, &u[38], cospi16, &u[57], rnding, bit); + u[58] = half_btf_avx2(cospi48, &u[37], cospi16, &u[58], rnding, bit); + u[59] = half_btf_avx2(cospi48, &u[36], cospi16, &u[59], rnding, bit); + u[36] = temp1; + u[37] = temp2; + u[38] = temp3; + u[39] = temp4; + + temp1 = half_btf_avx2(cospim48, &u[40], cospim16, &u[55], rnding, bit); + temp2 = half_btf_avx2(cospim48, &u[41], cospim16, &u[54], rnding, bit); + temp3 = half_btf_avx2(cospim48, &u[42], cospim16, &u[53], rnding, bit); + temp4 = half_btf_avx2(cospim48, &u[43], cospim16, &u[52], rnding, bit); + u[52] = half_btf_avx2(cospim16, &u[43], cospi48, &u[52], rnding, bit); + u[53] = half_btf_avx2(cospim16, &u[42], cospi48, &u[53], rnding, bit); + u[54] = half_btf_avx2(cospim16, &u[41], cospi48, &u[54], rnding, bit); + u[55] = half_btf_avx2(cospim16, &u[40], cospi48, &u[55], rnding, bit); + u[40] = temp1; + u[41] = temp2; + u[42] = temp3; + u[43] = temp4; +} + +static INLINE void idct64_stage9_avx2(__m256i *u, const __m256i *cospim32, + const __m256i *cospi32, + const 
__m256i *clamp_lo, + const __m256i *clamp_hi, + const __m256i *rnding, int bit) { + int i; + __m256i temp1, temp2, temp3, temp4; + for (i = 0; i < 8; ++i) { + addsub_avx2(u[i], u[15 - i], &u[i], &u[15 - i], clamp_lo, clamp_hi); + } + + temp1 = half_btf_avx2(cospim32, &u[20], cospi32, &u[27], rnding, bit); + temp2 = half_btf_avx2(cospim32, &u[21], cospi32, &u[26], rnding, bit); + temp3 = half_btf_avx2(cospim32, &u[22], cospi32, &u[25], rnding, bit); + temp4 = half_btf_avx2(cospim32, &u[23], cospi32, &u[24], rnding, bit); + u[24] = half_btf_avx2(cospi32, &u[23], cospi32, &u[24], rnding, bit); + u[25] = half_btf_avx2(cospi32, &u[22], cospi32, &u[25], rnding, bit); + u[26] = half_btf_avx2(cospi32, &u[21], cospi32, &u[26], rnding, bit); + u[27] = half_btf_avx2(cospi32, &u[20], cospi32, &u[27], rnding, bit); + u[20] = temp1; + u[21] = temp2; + u[22] = temp3; + u[23] = temp4; + for (i = 32; i < 40; i++) { + addsub_avx2(u[i], u[i ^ 15], &u[i], &u[i ^ 15], clamp_lo, clamp_hi); + } + + for (i = 48; i < 56; i++) { + addsub_avx2(u[i ^ 15], u[i], &u[i ^ 15], &u[i], clamp_lo, clamp_hi); + } +} + +static INLINE void idct64_stage10_avx2(__m256i *u, const __m256i *cospim32, + const __m256i *cospi32, + const __m256i *clamp_lo, + const __m256i *clamp_hi, + const __m256i *rnding, int bit) { + __m256i temp1, temp2, temp3, temp4; + for (int i = 0; i < 16; i++) { + addsub_avx2(u[i], u[31 - i], &u[i], &u[31 - i], clamp_lo, clamp_hi); + } + + temp1 = half_btf_avx2(cospim32, &u[40], cospi32, &u[55], rnding, bit); + temp2 = half_btf_avx2(cospim32, &u[41], cospi32, &u[54], rnding, bit); + temp3 = half_btf_avx2(cospim32, &u[42], cospi32, &u[53], rnding, bit); + temp4 = half_btf_avx2(cospim32, &u[43], cospi32, &u[52], rnding, bit); + u[52] = half_btf_avx2(cospi32, &u[43], cospi32, &u[52], rnding, bit); + u[53] = half_btf_avx2(cospi32, &u[42], cospi32, &u[53], rnding, bit); + u[54] = half_btf_avx2(cospi32, &u[41], cospi32, &u[54], rnding, bit); + u[55] = half_btf_avx2(cospi32, &u[40], cospi32, 
&u[55], rnding, bit); + u[40] = temp1; + u[41] = temp2; + u[42] = temp3; + u[43] = temp4; + + temp1 = half_btf_avx2(cospim32, &u[44], cospi32, &u[51], rnding, bit); + temp2 = half_btf_avx2(cospim32, &u[45], cospi32, &u[50], rnding, bit); + temp3 = half_btf_avx2(cospim32, &u[46], cospi32, &u[49], rnding, bit); + temp4 = half_btf_avx2(cospim32, &u[47], cospi32, &u[48], rnding, bit); + u[48] = half_btf_avx2(cospi32, &u[47], cospi32, &u[48], rnding, bit); + u[49] = half_btf_avx2(cospi32, &u[46], cospi32, &u[49], rnding, bit); + u[50] = half_btf_avx2(cospi32, &u[45], cospi32, &u[50], rnding, bit); + u[51] = half_btf_avx2(cospi32, &u[44], cospi32, &u[51], rnding, bit); + u[44] = temp1; + u[45] = temp2; + u[46] = temp3; + u[47] = temp4; +} + +static INLINE void idct64_stage11_avx2(__m256i *u, __m256i *out, int do_cols, + int bd, int out_shift, + const __m256i *clamp_lo, + const __m256i *clamp_hi) { + for (int i = 0; i < 32; i++) { + addsub_avx2(u[i], u[63 - i], &out[(i)], &out[(63 - i)], clamp_lo, clamp_hi); + } + + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + const __m256i clamp_lo_out = _mm256_set1_epi32(-(1 << (log_range_out - 1))); + const __m256i clamp_hi_out = + _mm256_set1_epi32((1 << (log_range_out - 1)) - 1); + + round_shift_8x8_avx2(out, out_shift); + round_shift_8x8_avx2(out + 16, out_shift); + round_shift_8x8_avx2(out + 32, out_shift); + round_shift_8x8_avx2(out + 48, out_shift); + highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 64); + } +} + +static void idct64_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); + __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1))); + __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1); + + const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); + + { + __m256i x; + + // stage 1 + // stage 2 + // stage 3 + // stage 4 + // stage 5 + // stage 6 + x = half_btf_0_avx2(&cospi32, &in[0], &rnding, bit); + + // stage 8 + // stage 9 + // stage 10 + // stage 11 + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + clamp_lo = _mm256_set1_epi32(-(1 << (log_range_out - 1))); + clamp_hi = _mm256_set1_epi32((1 << (log_range_out - 1)) - 1); + if (out_shift != 0) { + __m256i offset = _mm256_set1_epi32((1 << out_shift) >> 1); + x = _mm256_add_epi32(x, offset); + x = _mm256_sra_epi32(x, _mm_cvtsi32_si128(out_shift)); + } } + x = _mm256_max_epi32(x, clamp_lo); + x = _mm256_min_epi32(x, clamp_hi); + out[0] = x; + out[1] = x; + out[2] = x; + out[3] = x; + out[4] = x; + out[5] = x; + out[6] = x; + out[7] = x; + out[8] = x; + out[9] = x; + out[10] = x; + out[11] = x; + out[12] = x; + out[13] = x; + out[14] = x; + out[15] = x; + out[16] = x; + out[17] = x; + out[18] = x; + out[19] = x; + out[20] = x; + out[21] = x; + out[22] = x; + out[23] = x; + out[24] = x; + out[25] = x; + out[26] = x; + out[27] = x; + out[28] = x; + out[29] = x; + out[30] = x; + out[31] = x; + out[32] = x; + out[33] = x; + out[34] = x; + out[35] = x; + out[36] = x; + out[37] = x; + out[38] = x; + out[39] = x; + out[40] = x; + out[41] = x; + out[42] = x; + out[43] = x; + out[44] = x; + out[45] = x; + out[46] = x; + out[47] = x; + out[48] = x; + out[49] = x; + out[50] = x; + out[51] = x; + out[52] = x; + out[53] = x; + out[54] = x; + out[55] = x; + out[56] = x; + out[57] = x; + out[58] = x; + out[59] = x; + out[60] = x; + out[61] = x; + out[62] = x; + out[63] = x; } } +static void idct64_low8_avx2(__m256i *in, __m256i *out, int bit, int do_cols, + int bd, int out_shift) { + int i, j; + const int32_t *cospi = cospi_arr(bit); + const __m256i rnding = 
_mm256_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1))); + const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1); + + const __m256i cospi1 = _mm256_set1_epi32(cospi[1]); + const __m256i cospi2 = _mm256_set1_epi32(cospi[2]); + const __m256i cospi3 = _mm256_set1_epi32(cospi[3]); + const __m256i cospi4 = _mm256_set1_epi32(cospi[4]); + const __m256i cospi6 = _mm256_set1_epi32(cospi[6]); + const __m256i cospi8 = _mm256_set1_epi32(cospi[8]); + const __m256i cospi12 = _mm256_set1_epi32(cospi[12]); + const __m256i cospi16 = _mm256_set1_epi32(cospi[16]); + const __m256i cospi20 = _mm256_set1_epi32(cospi[20]); + const __m256i cospi24 = _mm256_set1_epi32(cospi[24]); + const __m256i cospi28 = _mm256_set1_epi32(cospi[28]); + const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); + const __m256i cospi40 = _mm256_set1_epi32(cospi[40]); + const __m256i cospi44 = _mm256_set1_epi32(cospi[44]); + const __m256i cospi48 = _mm256_set1_epi32(cospi[48]); + const __m256i cospi56 = _mm256_set1_epi32(cospi[56]); + const __m256i cospi60 = _mm256_set1_epi32(cospi[60]); + const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]); + const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]); + const __m256i cospim12 = _mm256_set1_epi32(-cospi[12]); + const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]); + const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]); + const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]); + const __m256i cospim28 = _mm256_set1_epi32(-cospi[28]); + const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]); + const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]); + const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]); + const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]); + const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]); + const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]); + const __m256i cospi63 = _mm256_set1_epi32(cospi[63]); + const 
__m256i cospim57 = _mm256_set1_epi32(-cospi[57]); + const __m256i cospi7 = _mm256_set1_epi32(cospi[7]); + const __m256i cospi5 = _mm256_set1_epi32(cospi[5]); + const __m256i cospi59 = _mm256_set1_epi32(cospi[59]); + const __m256i cospim61 = _mm256_set1_epi32(-cospi[61]); + const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]); + const __m256i cospi62 = _mm256_set1_epi32(cospi[62]); + + { + __m256i u[64]; + + // stage 1 + u[0] = in[0]; + u[8] = in[4]; + u[16] = in[2]; + u[24] = in[6]; + u[32] = in[1]; + u[40] = in[5]; + u[48] = in[3]; + u[56] = in[7]; + + // stage 2 + u[63] = half_btf_0_avx2(&cospi1, &u[32], &rnding, bit); + u[32] = half_btf_0_avx2(&cospi63, &u[32], &rnding, bit); + u[39] = half_btf_0_avx2(&cospim57, &u[56], &rnding, bit); + u[56] = half_btf_0_avx2(&cospi7, &u[56], &rnding, bit); + u[55] = half_btf_0_avx2(&cospi5, &u[40], &rnding, bit); + u[40] = half_btf_0_avx2(&cospi59, &u[40], &rnding, bit); + u[47] = half_btf_0_avx2(&cospim61, &u[48], &rnding, bit); + u[48] = half_btf_0_avx2(&cospi3, &u[48], &rnding, bit); + + // stage 3 + u[31] = half_btf_0_avx2(&cospi2, &u[16], &rnding, bit); + u[16] = half_btf_0_avx2(&cospi62, &u[16], &rnding, bit); + u[23] = half_btf_0_avx2(&cospim58, &u[24], &rnding, bit); + u[24] = half_btf_0_avx2(&cospi6, &u[24], &rnding, bit); + u[33] = u[32]; + u[38] = u[39]; + u[41] = u[40]; + u[46] = u[47]; + u[49] = u[48]; + u[54] = u[55]; + u[57] = u[56]; + u[62] = u[63]; + + // stage 4 + __m256i temp1, temp2; + u[15] = half_btf_0_avx2(&cospi4, &u[8], &rnding, bit); + u[8] = half_btf_0_avx2(&cospi60, &u[8], &rnding, bit); + u[17] = u[16]; + u[22] = u[23]; + u[25] = u[24]; + u[30] = u[31]; + + temp1 = half_btf_avx2(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit); + u[62] = half_btf_avx2(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit); + u[33] = temp1; + + temp2 = half_btf_avx2(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit); + u[38] = half_btf_avx2(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit); + u[57] = temp2; + + temp1 
= half_btf_avx2(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit); + u[54] = half_btf_avx2(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit); + u[41] = temp1; + + temp2 = half_btf_avx2(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit); + u[49] = half_btf_avx2(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit); + u[46] = temp2; + + // stage 5 + u[9] = u[8]; + u[14] = u[15]; + + temp1 = half_btf_avx2(&cospim8, &u[17], &cospi56, &u[30], &rnding, bit); + u[30] = half_btf_avx2(&cospi56, &u[17], &cospi8, &u[30], &rnding, bit); + u[17] = temp1; + + temp2 = half_btf_avx2(&cospim24, &u[22], &cospim40, &u[25], &rnding, bit); + u[25] = half_btf_avx2(&cospim40, &u[22], &cospi24, &u[25], &rnding, bit); + u[22] = temp2; + + u[35] = u[32]; + u[34] = u[33]; + u[36] = u[39]; + u[37] = u[38]; + u[43] = u[40]; + u[42] = u[41]; + u[44] = u[47]; + u[45] = u[46]; + u[51] = u[48]; + u[50] = u[49]; + u[52] = u[55]; + u[53] = u[54]; + u[59] = u[56]; + u[58] = u[57]; + u[60] = u[63]; + u[61] = u[62]; + // stage 6 + temp1 = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit); + u[1] = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit); + u[0] = temp1; + + temp2 = half_btf_avx2(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit); + u[14] = half_btf_avx2(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit); + u[9] = temp2; + u[19] = u[16]; + u[18] = u[17]; + u[20] = u[23]; + u[21] = u[22]; + u[27] = u[24]; + u[26] = u[25]; + u[28] = u[31]; + u[29] = u[30]; + + temp1 = half_btf_avx2(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit); + u[61] = half_btf_avx2(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit); + u[34] = temp1; + temp2 = half_btf_avx2(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit); + u[60] = half_btf_avx2(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit); + u[35] = temp2; + temp1 = half_btf_avx2(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit); + u[59] = half_btf_avx2(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit); + u[36] = temp1; + temp2 = half_btf_avx2(&cospim56, &u[37], &cospim8, 
&u[58], &rnding, bit); + u[58] = half_btf_avx2(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit); + u[37] = temp2; + temp1 = half_btf_avx2(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit); + u[53] = half_btf_avx2(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit); + u[42] = temp1; + temp2 = half_btf_avx2(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit); + u[52] = half_btf_avx2(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit); + u[43] = temp2; + temp1 = half_btf_avx2(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit); + u[51] = half_btf_avx2(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit); + u[44] = temp1; + temp2 = half_btf_avx2(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit); + u[50] = half_btf_avx2(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit); + u[45] = temp2; + + // stage 7 + u[3] = u[0]; + u[2] = u[1]; + u[11] = u[8]; + u[10] = u[9]; + u[12] = u[15]; + u[13] = u[14]; + + temp1 = half_btf_avx2(&cospim16, &u[18], &cospi48, &u[29], &rnding, bit); + u[29] = half_btf_avx2(&cospi48, &u[18], &cospi16, &u[29], &rnding, bit); + u[18] = temp1; + temp2 = half_btf_avx2(&cospim16, &u[19], &cospi48, &u[28], &rnding, bit); + u[28] = half_btf_avx2(&cospi48, &u[19], &cospi16, &u[28], &rnding, bit); + u[19] = temp2; + temp1 = half_btf_avx2(&cospim48, &u[20], &cospim16, &u[27], &rnding, bit); + u[27] = half_btf_avx2(&cospim16, &u[20], &cospi48, &u[27], &rnding, bit); + u[20] = temp1; + temp2 = half_btf_avx2(&cospim48, &u[21], &cospim16, &u[26], &rnding, bit); + u[26] = half_btf_avx2(&cospim16, &u[21], &cospi48, &u[26], &rnding, bit); + u[21] = temp2; + for (i = 32; i < 64; i += 16) { + for (j = i; j < i + 4; j++) { + addsub_avx2(u[j], u[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi); + addsub_avx2(u[j ^ 15], u[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo, + &clamp_hi); + } + } + + // stage 8 + u[7] = u[0]; + u[6] = u[1]; + u[5] = u[2]; + u[4] = u[3]; + + idct64_stage8_avx2(u, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16, + &cospim48, &clamp_lo, &clamp_hi, 
&rnding, bit); + + // stage 9 + idct64_stage9_avx2(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding, + bit); + + // stage 10 + idct64_stage10_avx2(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding, + bit); + + // stage 11 + idct64_stage11_avx2(u, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi); + } +} +static void idct64_low16_avx2(__m256i *in, __m256i *out, int bit, int do_cols, + int bd, int out_shift) { + int i, j; + const int32_t *cospi = cospi_arr(bit); + const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1))); + const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1); + + const __m256i cospi1 = _mm256_set1_epi32(cospi[1]); + const __m256i cospi2 = _mm256_set1_epi32(cospi[2]); + const __m256i cospi3 = _mm256_set1_epi32(cospi[3]); + const __m256i cospi4 = _mm256_set1_epi32(cospi[4]); + const __m256i cospi5 = _mm256_set1_epi32(cospi[5]); + const __m256i cospi6 = _mm256_set1_epi32(cospi[6]); + const __m256i cospi7 = _mm256_set1_epi32(cospi[7]); + const __m256i cospi8 = _mm256_set1_epi32(cospi[8]); + const __m256i cospi9 = _mm256_set1_epi32(cospi[9]); + const __m256i cospi10 = _mm256_set1_epi32(cospi[10]); + const __m256i cospi11 = _mm256_set1_epi32(cospi[11]); + const __m256i cospi12 = _mm256_set1_epi32(cospi[12]); + const __m256i cospi13 = _mm256_set1_epi32(cospi[13]); + const __m256i cospi14 = _mm256_set1_epi32(cospi[14]); + const __m256i cospi15 = _mm256_set1_epi32(cospi[15]); + const __m256i cospi16 = _mm256_set1_epi32(cospi[16]); + const __m256i cospi20 = _mm256_set1_epi32(cospi[20]); + const __m256i cospi24 = _mm256_set1_epi32(cospi[24]); + const __m256i cospi28 = _mm256_set1_epi32(cospi[28]); + const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); + const __m256i cospi36 = _mm256_set1_epi32(cospi[36]); + const __m256i cospi40 = _mm256_set1_epi32(cospi[40]); + const __m256i cospi44 = 
_mm256_set1_epi32(cospi[44]); + const __m256i cospi48 = _mm256_set1_epi32(cospi[48]); + const __m256i cospi51 = _mm256_set1_epi32(cospi[51]); + const __m256i cospi52 = _mm256_set1_epi32(cospi[52]); + const __m256i cospi54 = _mm256_set1_epi32(cospi[54]); + const __m256i cospi55 = _mm256_set1_epi32(cospi[55]); + const __m256i cospi56 = _mm256_set1_epi32(cospi[56]); + const __m256i cospi59 = _mm256_set1_epi32(cospi[59]); + const __m256i cospi60 = _mm256_set1_epi32(cospi[60]); + const __m256i cospi62 = _mm256_set1_epi32(cospi[62]); + const __m256i cospi63 = _mm256_set1_epi32(cospi[63]); + + const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]); + const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]); + const __m256i cospim12 = _mm256_set1_epi32(-cospi[12]); + const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]); + const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]); + const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]); + const __m256i cospim28 = _mm256_set1_epi32(-cospi[28]); + const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]); + const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]); + const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]); + const __m256i cospim44 = _mm256_set1_epi32(-cospi[44]); + const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]); + const __m256i cospim49 = _mm256_set1_epi32(-cospi[49]); + const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]); + const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]); + const __m256i cospim53 = _mm256_set1_epi32(-cospi[53]); + const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]); + const __m256i cospim57 = _mm256_set1_epi32(-cospi[57]); + const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]); + const __m256i cospim60 = _mm256_set1_epi32(-cospi[60]); + const __m256i cospim61 = _mm256_set1_epi32(-cospi[61]); + + { + __m256i u[64]; + __m256i tmp1, tmp2, tmp3, tmp4; + // stage 1 + u[0] = in[0]; + u[32] = in[1]; + u[36] = in[9]; + u[40] = in[5]; + u[44] = in[13]; + u[48] = in[3]; + u[52] = in[11]; + 
u[56] = in[7]; + u[60] = in[15]; + u[16] = in[2]; + u[20] = in[10]; + u[24] = in[6]; + u[28] = in[14]; + u[4] = in[8]; + u[8] = in[4]; + u[12] = in[12]; + + // stage 2 + u[63] = half_btf_0_avx2(&cospi1, &u[32], &rnding, bit); + u[32] = half_btf_0_avx2(&cospi63, &u[32], &rnding, bit); + u[35] = half_btf_0_avx2(&cospim49, &u[60], &rnding, bit); + u[60] = half_btf_0_avx2(&cospi15, &u[60], &rnding, bit); + u[59] = half_btf_0_avx2(&cospi9, &u[36], &rnding, bit); + u[36] = half_btf_0_avx2(&cospi55, &u[36], &rnding, bit); + u[39] = half_btf_0_avx2(&cospim57, &u[56], &rnding, bit); + u[56] = half_btf_0_avx2(&cospi7, &u[56], &rnding, bit); + u[55] = half_btf_0_avx2(&cospi5, &u[40], &rnding, bit); + u[40] = half_btf_0_avx2(&cospi59, &u[40], &rnding, bit); + u[43] = half_btf_0_avx2(&cospim53, &u[52], &rnding, bit); + u[52] = half_btf_0_avx2(&cospi11, &u[52], &rnding, bit); + u[47] = half_btf_0_avx2(&cospim61, &u[48], &rnding, bit); + u[48] = half_btf_0_avx2(&cospi3, &u[48], &rnding, bit); + u[51] = half_btf_0_avx2(&cospi13, &u[44], &rnding, bit); + u[44] = half_btf_0_avx2(&cospi51, &u[44], &rnding, bit); + + // stage 3 + u[31] = half_btf_0_avx2(&cospi2, &u[16], &rnding, bit); + u[16] = half_btf_0_avx2(&cospi62, &u[16], &rnding, bit); + u[19] = half_btf_0_avx2(&cospim50, &u[28], &rnding, bit); + u[28] = half_btf_0_avx2(&cospi14, &u[28], &rnding, bit); + u[27] = half_btf_0_avx2(&cospi10, &u[20], &rnding, bit); + u[20] = half_btf_0_avx2(&cospi54, &u[20], &rnding, bit); + u[23] = half_btf_0_avx2(&cospim58, &u[24], &rnding, bit); + u[24] = half_btf_0_avx2(&cospi6, &u[24], &rnding, bit); + u[33] = u[32]; + u[34] = u[35]; + u[37] = u[36]; + u[38] = u[39]; + u[41] = u[40]; + u[42] = u[43]; + u[45] = u[44]; + u[46] = u[47]; + u[49] = u[48]; + u[50] = u[51]; + u[53] = u[52]; + u[54] = u[55]; + u[57] = u[56]; + u[58] = u[59]; + u[61] = u[60]; + u[62] = u[63]; + + // stage 4 + u[15] = half_btf_0_avx2(&cospi4, &u[8], &rnding, bit); + u[8] = half_btf_0_avx2(&cospi60, &u[8], &rnding, bit); 
+ u[11] = half_btf_0_avx2(&cospim52, &u[12], &rnding, bit); + u[12] = half_btf_0_avx2(&cospi12, &u[12], &rnding, bit); + + u[17] = u[16]; + u[18] = u[19]; + u[21] = u[20]; + u[22] = u[23]; + u[25] = u[24]; + u[26] = u[27]; + u[29] = u[28]; + u[30] = u[31]; + + tmp1 = half_btf_avx2(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit); + tmp2 = half_btf_avx2(&cospim60, &u[34], &cospim4, &u[61], &rnding, bit); + tmp3 = half_btf_avx2(&cospim36, &u[37], &cospi28, &u[58], &rnding, bit); + tmp4 = half_btf_avx2(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit); + u[57] = half_btf_avx2(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit); + u[58] = half_btf_avx2(&cospi28, &u[37], &cospi36, &u[58], &rnding, bit); + u[61] = half_btf_avx2(&cospim4, &u[34], &cospi60, &u[61], &rnding, bit); + u[62] = half_btf_avx2(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit); + u[33] = tmp1; + u[34] = tmp2; + u[37] = tmp3; + u[38] = tmp4; + + tmp1 = half_btf_avx2(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit); + tmp2 = half_btf_avx2(&cospim44, &u[42], &cospim20, &u[53], &rnding, bit); + tmp3 = half_btf_avx2(&cospim52, &u[45], &cospi12, &u[50], &rnding, bit); + tmp4 = half_btf_avx2(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit); + u[49] = half_btf_avx2(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit); + u[50] = half_btf_avx2(&cospi12, &u[45], &cospi52, &u[50], &rnding, bit); + u[53] = half_btf_avx2(&cospim20, &u[42], &cospi44, &u[53], &rnding, bit); + u[54] = half_btf_avx2(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit); + u[41] = tmp1; + u[42] = tmp2; + u[45] = tmp3; + u[46] = tmp4; + + // stage 5 + u[7] = half_btf_0_avx2(&cospi8, &u[4], &rnding, bit); + u[4] = half_btf_0_avx2(&cospi56, &u[4], &rnding, bit); + + u[9] = u[8]; + u[10] = u[11]; + u[13] = u[12]; + u[14] = u[15]; + + tmp1 = half_btf_avx2(&cospim8, &u[17], &cospi56, &u[30], &rnding, bit); + tmp2 = half_btf_avx2(&cospim56, &u[18], &cospim8, &u[29], &rnding, bit); + tmp3 = half_btf_avx2(&cospim40, &u[21], &cospi24, &u[26], 
&rnding, bit); + tmp4 = half_btf_avx2(&cospim24, &u[22], &cospim40, &u[25], &rnding, bit); + u[25] = half_btf_avx2(&cospim40, &u[22], &cospi24, &u[25], &rnding, bit); + u[26] = half_btf_avx2(&cospi24, &u[21], &cospi40, &u[26], &rnding, bit); + u[29] = half_btf_avx2(&cospim8, &u[18], &cospi56, &u[29], &rnding, bit); + u[30] = half_btf_avx2(&cospi56, &u[17], &cospi8, &u[30], &rnding, bit); + u[17] = tmp1; + u[18] = tmp2; + u[21] = tmp3; + u[22] = tmp4; + + for (i = 32; i < 64; i += 8) { + addsub_avx2(u[i + 0], u[i + 3], &u[i + 0], &u[i + 3], &clamp_lo, + &clamp_hi); + addsub_avx2(u[i + 1], u[i + 2], &u[i + 1], &u[i + 2], &clamp_lo, + &clamp_hi); + + addsub_avx2(u[i + 7], u[i + 4], &u[i + 7], &u[i + 4], &clamp_lo, + &clamp_hi); + addsub_avx2(u[i + 6], u[i + 5], &u[i + 6], &u[i + 5], &clamp_lo, + &clamp_hi); + } + + // stage 6 + tmp1 = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit); + u[1] = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit); + u[0] = tmp1; + u[5] = u[4]; + u[6] = u[7]; + + tmp1 = half_btf_avx2(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit); + u[14] = half_btf_avx2(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit); + u[9] = tmp1; + tmp2 = half_btf_avx2(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit); + u[13] = half_btf_avx2(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit); + u[10] = tmp2; + + for (i = 16; i < 32; i += 8) { + addsub_avx2(u[i + 0], u[i + 3], &u[i + 0], &u[i + 3], &clamp_lo, + &clamp_hi); + addsub_avx2(u[i + 1], u[i + 2], &u[i + 1], &u[i + 2], &clamp_lo, + &clamp_hi); + + addsub_avx2(u[i + 7], u[i + 4], &u[i + 7], &u[i + 4], &clamp_lo, + &clamp_hi); + addsub_avx2(u[i + 6], u[i + 5], &u[i + 6], &u[i + 5], &clamp_lo, + &clamp_hi); + } + + tmp1 = half_btf_avx2(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit); + tmp2 = half_btf_avx2(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit); + tmp3 = half_btf_avx2(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit); + tmp4 = half_btf_avx2(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit); + u[58] 
= half_btf_avx2(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit); + u[59] = half_btf_avx2(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit); + u[60] = half_btf_avx2(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit); + u[61] = half_btf_avx2(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit); + u[34] = tmp1; + u[35] = tmp2; + u[36] = tmp3; + u[37] = tmp4; + + tmp1 = half_btf_avx2(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit); + tmp2 = half_btf_avx2(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit); + tmp3 = half_btf_avx2(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit); + tmp4 = half_btf_avx2(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit); + u[50] = half_btf_avx2(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit); + u[51] = half_btf_avx2(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit); + u[52] = half_btf_avx2(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit); + u[53] = half_btf_avx2(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit); + u[42] = tmp1; + u[43] = tmp2; + u[44] = tmp3; + u[45] = tmp4; + + // stage 7 + u[3] = u[0]; + u[2] = u[1]; + tmp1 = half_btf_avx2(&cospim32, &u[5], &cospi32, &u[6], &rnding, bit); + u[6] = half_btf_avx2(&cospi32, &u[5], &cospi32, &u[6], &rnding, bit); + u[5] = tmp1; + addsub_avx2(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi); + addsub_avx2(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi); + addsub_avx2(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi); + addsub_avx2(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi); + + tmp1 = half_btf_avx2(&cospim16, &u[18], &cospi48, &u[29], &rnding, bit); + tmp2 = half_btf_avx2(&cospim16, &u[19], &cospi48, &u[28], &rnding, bit); + tmp3 = half_btf_avx2(&cospim48, &u[20], &cospim16, &u[27], &rnding, bit); + tmp4 = half_btf_avx2(&cospim48, &u[21], &cospim16, &u[26], &rnding, bit); + u[26] = half_btf_avx2(&cospim16, &u[21], &cospi48, &u[26], &rnding, bit); + u[27] = half_btf_avx2(&cospim16, &u[20], &cospi48, &u[27], &rnding, bit); + u[28] = half_btf_avx2(&cospi48, &u[19], 
&cospi16, &u[28], &rnding, bit); + u[29] = half_btf_avx2(&cospi48, &u[18], &cospi16, &u[29], &rnding, bit); + u[18] = tmp1; + u[19] = tmp2; + u[20] = tmp3; + u[21] = tmp4; + + for (i = 32; i < 64; i += 16) { + for (j = i; j < i + 4; j++) { + addsub_avx2(u[j], u[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi); + addsub_avx2(u[j ^ 15], u[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo, + &clamp_hi); + } + } + + // stage 8 + for (i = 0; i < 4; ++i) { + addsub_avx2(u[i], u[7 - i], &u[i], &u[7 - i], &clamp_lo, &clamp_hi); + } + + idct64_stage8_avx2(u, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16, + &cospim48, &clamp_lo, &clamp_hi, &rnding, bit); + + // stage 9 + idct64_stage9_avx2(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding, + bit); + + // stage 10 + idct64_stage10_avx2(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding, + bit); + + // stage 11 + idct64_stage11_avx2(u, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi); + } +} +static void idct64_avx2(__m256i *in, __m256i *out, int bit, int do_cols, int bd, + int out_shift) { + int i, j; + const int32_t *cospi = cospi_arr(bit); + const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); + const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1))); + const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1); + + const __m256i cospi1 = _mm256_set1_epi32(cospi[1]); + const __m256i cospi2 = _mm256_set1_epi32(cospi[2]); + const __m256i cospi3 = _mm256_set1_epi32(cospi[3]); + const __m256i cospi4 = _mm256_set1_epi32(cospi[4]); + const __m256i cospi5 = _mm256_set1_epi32(cospi[5]); + const __m256i cospi6 = _mm256_set1_epi32(cospi[6]); + const __m256i cospi7 = _mm256_set1_epi32(cospi[7]); + const __m256i cospi8 = _mm256_set1_epi32(cospi[8]); + const __m256i cospi9 = _mm256_set1_epi32(cospi[9]); + const __m256i cospi10 = _mm256_set1_epi32(cospi[10]); + const __m256i cospi11 = _mm256_set1_epi32(cospi[11]); + const __m256i cospi12 = _mm256_set1_epi32(cospi[12]); + const __m256i cospi13 = _mm256_set1_epi32(cospi[13]); + const __m256i cospi14 = _mm256_set1_epi32(cospi[14]); + const __m256i cospi15 = _mm256_set1_epi32(cospi[15]); + const __m256i cospi16 = _mm256_set1_epi32(cospi[16]); + const __m256i cospi17 = _mm256_set1_epi32(cospi[17]); + const __m256i cospi18 = _mm256_set1_epi32(cospi[18]); + const __m256i cospi19 = _mm256_set1_epi32(cospi[19]); + const __m256i cospi20 = _mm256_set1_epi32(cospi[20]); + const __m256i cospi21 = _mm256_set1_epi32(cospi[21]); + const __m256i cospi22 = _mm256_set1_epi32(cospi[22]); + const __m256i cospi23 = _mm256_set1_epi32(cospi[23]); + const __m256i cospi24 = _mm256_set1_epi32(cospi[24]); + const __m256i cospi25 = _mm256_set1_epi32(cospi[25]); + const __m256i cospi26 = _mm256_set1_epi32(cospi[26]); + const __m256i cospi27 = _mm256_set1_epi32(cospi[27]); + const __m256i cospi28 = _mm256_set1_epi32(cospi[28]); + const __m256i cospi29 = _mm256_set1_epi32(cospi[29]); + const __m256i cospi30 = _mm256_set1_epi32(cospi[30]); + const __m256i cospi31 = _mm256_set1_epi32(cospi[31]); + const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); + const __m256i cospi35 = _mm256_set1_epi32(cospi[35]); + const 
__m256i cospi36 = _mm256_set1_epi32(cospi[36]); + const __m256i cospi38 = _mm256_set1_epi32(cospi[38]); + const __m256i cospi39 = _mm256_set1_epi32(cospi[39]); + const __m256i cospi40 = _mm256_set1_epi32(cospi[40]); + const __m256i cospi43 = _mm256_set1_epi32(cospi[43]); + const __m256i cospi44 = _mm256_set1_epi32(cospi[44]); + const __m256i cospi46 = _mm256_set1_epi32(cospi[46]); + const __m256i cospi47 = _mm256_set1_epi32(cospi[47]); + const __m256i cospi48 = _mm256_set1_epi32(cospi[48]); + const __m256i cospi51 = _mm256_set1_epi32(cospi[51]); + const __m256i cospi52 = _mm256_set1_epi32(cospi[52]); + const __m256i cospi54 = _mm256_set1_epi32(cospi[54]); + const __m256i cospi55 = _mm256_set1_epi32(cospi[55]); + const __m256i cospi56 = _mm256_set1_epi32(cospi[56]); + const __m256i cospi59 = _mm256_set1_epi32(cospi[59]); + const __m256i cospi60 = _mm256_set1_epi32(cospi[60]); + const __m256i cospi62 = _mm256_set1_epi32(cospi[62]); + const __m256i cospi63 = _mm256_set1_epi32(cospi[63]); + + const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]); + const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]); + const __m256i cospim12 = _mm256_set1_epi32(-cospi[12]); + const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]); + const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]); + const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]); + const __m256i cospim28 = _mm256_set1_epi32(-cospi[28]); + const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]); + const __m256i cospim33 = _mm256_set1_epi32(-cospi[33]); + const __m256i cospim34 = _mm256_set1_epi32(-cospi[34]); + const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]); + const __m256i cospim37 = _mm256_set1_epi32(-cospi[37]); + const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]); + const __m256i cospim41 = _mm256_set1_epi32(-cospi[41]); + const __m256i cospim42 = _mm256_set1_epi32(-cospi[42]); + const __m256i cospim44 = _mm256_set1_epi32(-cospi[44]); + const __m256i cospim45 = _mm256_set1_epi32(-cospi[45]); + const __m256i 
cospim48 = _mm256_set1_epi32(-cospi[48]); + const __m256i cospim49 = _mm256_set1_epi32(-cospi[49]); + const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]); + const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]); + const __m256i cospim53 = _mm256_set1_epi32(-cospi[53]); + const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]); + const __m256i cospim57 = _mm256_set1_epi32(-cospi[57]); + const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]); + const __m256i cospim60 = _mm256_set1_epi32(-cospi[60]); + const __m256i cospim61 = _mm256_set1_epi32(-cospi[61]); + + { + __m256i u[64], v[64]; + + // stage 1 + u[32] = in[1]; + u[34] = in[17]; + u[36] = in[9]; + u[38] = in[25]; + u[40] = in[5]; + u[42] = in[21]; + u[44] = in[13]; + u[46] = in[29]; + u[48] = in[3]; + u[50] = in[19]; + u[52] = in[11]; + u[54] = in[27]; + u[56] = in[7]; + u[58] = in[23]; + u[60] = in[15]; + u[62] = in[31]; + + v[16] = in[2]; + v[18] = in[18]; + v[20] = in[10]; + v[22] = in[26]; + v[24] = in[6]; + v[26] = in[22]; + v[28] = in[14]; + v[30] = in[30]; + + u[8] = in[4]; + u[10] = in[20]; + u[12] = in[12]; + u[14] = in[28]; + + v[4] = in[8]; + v[6] = in[24]; + + u[0] = in[0]; + u[2] = in[16]; + + // stage 2 + v[32] = half_btf_0_avx2(&cospi63, &u[32], &rnding, bit); + v[33] = half_btf_0_avx2(&cospim33, &u[62], &rnding, bit); + v[34] = half_btf_0_avx2(&cospi47, &u[34], &rnding, bit); + v[35] = half_btf_0_avx2(&cospim49, &u[60], &rnding, bit); + v[36] = half_btf_0_avx2(&cospi55, &u[36], &rnding, bit); + v[37] = half_btf_0_avx2(&cospim41, &u[58], &rnding, bit); + v[38] = half_btf_0_avx2(&cospi39, &u[38], &rnding, bit); + v[39] = half_btf_0_avx2(&cospim57, &u[56], &rnding, bit); + v[40] = half_btf_0_avx2(&cospi59, &u[40], &rnding, bit); + v[41] = half_btf_0_avx2(&cospim37, &u[54], &rnding, bit); + v[42] = half_btf_0_avx2(&cospi43, &u[42], &rnding, bit); + v[43] = half_btf_0_avx2(&cospim53, &u[52], &rnding, bit); + v[44] = half_btf_0_avx2(&cospi51, &u[44], &rnding, bit); + v[45] = 
half_btf_0_avx2(&cospim45, &u[50], &rnding, bit); + v[46] = half_btf_0_avx2(&cospi35, &u[46], &rnding, bit); + v[47] = half_btf_0_avx2(&cospim61, &u[48], &rnding, bit); + v[48] = half_btf_0_avx2(&cospi3, &u[48], &rnding, bit); + v[49] = half_btf_0_avx2(&cospi29, &u[46], &rnding, bit); + v[50] = half_btf_0_avx2(&cospi19, &u[50], &rnding, bit); + v[51] = half_btf_0_avx2(&cospi13, &u[44], &rnding, bit); + v[52] = half_btf_0_avx2(&cospi11, &u[52], &rnding, bit); + v[53] = half_btf_0_avx2(&cospi21, &u[42], &rnding, bit); + v[54] = half_btf_0_avx2(&cospi27, &u[54], &rnding, bit); + v[55] = half_btf_0_avx2(&cospi5, &u[40], &rnding, bit); + v[56] = half_btf_0_avx2(&cospi7, &u[56], &rnding, bit); + v[57] = half_btf_0_avx2(&cospi25, &u[38], &rnding, bit); + v[58] = half_btf_0_avx2(&cospi23, &u[58], &rnding, bit); + v[59] = half_btf_0_avx2(&cospi9, &u[36], &rnding, bit); + v[60] = half_btf_0_avx2(&cospi15, &u[60], &rnding, bit); + v[61] = half_btf_0_avx2(&cospi17, &u[34], &rnding, bit); + v[62] = half_btf_0_avx2(&cospi31, &u[62], &rnding, bit); + v[63] = half_btf_0_avx2(&cospi1, &u[32], &rnding, bit); + + // stage 3 + u[16] = half_btf_0_avx2(&cospi62, &v[16], &rnding, bit); + u[17] = half_btf_0_avx2(&cospim34, &v[30], &rnding, bit); + u[18] = half_btf_0_avx2(&cospi46, &v[18], &rnding, bit); + u[19] = half_btf_0_avx2(&cospim50, &v[28], &rnding, bit); + u[20] = half_btf_0_avx2(&cospi54, &v[20], &rnding, bit); + u[21] = half_btf_0_avx2(&cospim42, &v[26], &rnding, bit); + u[22] = half_btf_0_avx2(&cospi38, &v[22], &rnding, bit); + u[23] = half_btf_0_avx2(&cospim58, &v[24], &rnding, bit); + u[24] = half_btf_0_avx2(&cospi6, &v[24], &rnding, bit); + u[25] = half_btf_0_avx2(&cospi26, &v[22], &rnding, bit); + u[26] = half_btf_0_avx2(&cospi22, &v[26], &rnding, bit); + u[27] = half_btf_0_avx2(&cospi10, &v[20], &rnding, bit); + u[28] = half_btf_0_avx2(&cospi14, &v[28], &rnding, bit); + u[29] = half_btf_0_avx2(&cospi18, &v[18], &rnding, bit); + u[30] = half_btf_0_avx2(&cospi30, &v[30], 
&rnding, bit); + u[31] = half_btf_0_avx2(&cospi2, &v[16], &rnding, bit); + + for (i = 32; i < 64; i += 4) { + addsub_avx2(v[i + 0], v[i + 1], &u[i + 0], &u[i + 1], &clamp_lo, + &clamp_hi); + addsub_avx2(v[i + 3], v[i + 2], &u[i + 3], &u[i + 2], &clamp_lo, + &clamp_hi); + } + + // stage 4 + v[8] = half_btf_0_avx2(&cospi60, &u[8], &rnding, bit); + v[9] = half_btf_0_avx2(&cospim36, &u[14], &rnding, bit); + v[10] = half_btf_0_avx2(&cospi44, &u[10], &rnding, bit); + v[11] = half_btf_0_avx2(&cospim52, &u[12], &rnding, bit); + v[12] = half_btf_0_avx2(&cospi12, &u[12], &rnding, bit); + v[13] = half_btf_0_avx2(&cospi20, &u[10], &rnding, bit); + v[14] = half_btf_0_avx2(&cospi28, &u[14], &rnding, bit); + v[15] = half_btf_0_avx2(&cospi4, &u[8], &rnding, bit); + + for (i = 16; i < 32; i += 4) { + addsub_avx2(u[i + 0], u[i + 1], &v[i + 0], &v[i + 1], &clamp_lo, + &clamp_hi); + addsub_avx2(u[i + 3], u[i + 2], &v[i + 3], &v[i + 2], &clamp_lo, + &clamp_hi); + } + + for (i = 32; i < 64; i += 4) { + v[i + 0] = u[i + 0]; + v[i + 3] = u[i + 3]; + } + + v[33] = half_btf_avx2(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit); + v[34] = half_btf_avx2(&cospim60, &u[34], &cospim4, &u[61], &rnding, bit); + v[37] = half_btf_avx2(&cospim36, &u[37], &cospi28, &u[58], &rnding, bit); + v[38] = half_btf_avx2(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit); + v[41] = half_btf_avx2(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit); + v[42] = half_btf_avx2(&cospim44, &u[42], &cospim20, &u[53], &rnding, bit); + v[45] = half_btf_avx2(&cospim52, &u[45], &cospi12, &u[50], &rnding, bit); + v[46] = half_btf_avx2(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit); + v[49] = half_btf_avx2(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit); + v[50] = half_btf_avx2(&cospi12, &u[45], &cospi52, &u[50], &rnding, bit); + v[53] = half_btf_avx2(&cospim20, &u[42], &cospi44, &u[53], &rnding, bit); + v[54] = half_btf_avx2(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit); + v[57] = half_btf_avx2(&cospim36, 
&u[38], &cospi28, &u[57], &rnding, bit); + v[58] = half_btf_avx2(&cospi28, &u[37], &cospi36, &u[58], &rnding, bit); + v[61] = half_btf_avx2(&cospim4, &u[34], &cospi60, &u[61], &rnding, bit); + v[62] = half_btf_avx2(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit); + + // stage 5 + u[4] = half_btf_0_avx2(&cospi56, &v[4], &rnding, bit); + u[5] = half_btf_0_avx2(&cospim40, &v[6], &rnding, bit); + u[6] = half_btf_0_avx2(&cospi24, &v[6], &rnding, bit); + u[7] = half_btf_0_avx2(&cospi8, &v[4], &rnding, bit); + + for (i = 8; i < 16; i += 4) { + addsub_avx2(v[i + 0], v[i + 1], &u[i + 0], &u[i + 1], &clamp_lo, + &clamp_hi); + addsub_avx2(v[i + 3], v[i + 2], &u[i + 3], &u[i + 2], &clamp_lo, + &clamp_hi); + } + + for (i = 16; i < 32; i += 4) { + u[i + 0] = v[i + 0]; + u[i + 3] = v[i + 3]; + } + + u[17] = half_btf_avx2(&cospim8, &v[17], &cospi56, &v[30], &rnding, bit); + u[18] = half_btf_avx2(&cospim56, &v[18], &cospim8, &v[29], &rnding, bit); + u[21] = half_btf_avx2(&cospim40, &v[21], &cospi24, &v[26], &rnding, bit); + u[22] = half_btf_avx2(&cospim24, &v[22], &cospim40, &v[25], &rnding, bit); + u[25] = half_btf_avx2(&cospim40, &v[22], &cospi24, &v[25], &rnding, bit); + u[26] = half_btf_avx2(&cospi24, &v[21], &cospi40, &v[26], &rnding, bit); + u[29] = half_btf_avx2(&cospim8, &v[18], &cospi56, &v[29], &rnding, bit); + u[30] = half_btf_avx2(&cospi56, &v[17], &cospi8, &v[30], &rnding, bit); + + for (i = 32; i < 64; i += 8) { + addsub_avx2(v[i + 0], v[i + 3], &u[i + 0], &u[i + 3], &clamp_lo, + &clamp_hi); + addsub_avx2(v[i + 1], v[i + 2], &u[i + 1], &u[i + 2], &clamp_lo, + &clamp_hi); + + addsub_avx2(v[i + 7], v[i + 4], &u[i + 7], &u[i + 4], &clamp_lo, + &clamp_hi); + addsub_avx2(v[i + 6], v[i + 5], &u[i + 6], &u[i + 5], &clamp_lo, + &clamp_hi); + } + + // stage 6 + v[0] = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit); + v[1] = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit); + v[2] = half_btf_0_avx2(&cospi48, &u[2], &rnding, bit); + v[3] = half_btf_0_avx2(&cospi16, &u[2], 
&rnding, bit); + + addsub_avx2(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi); + addsub_avx2(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi); + + for (i = 8; i < 16; i += 4) { + v[i + 0] = u[i + 0]; + v[i + 3] = u[i + 3]; + } + + v[9] = half_btf_avx2(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit); + v[10] = half_btf_avx2(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit); + v[13] = half_btf_avx2(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit); + v[14] = half_btf_avx2(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit); + + for (i = 16; i < 32; i += 8) { + addsub_avx2(u[i + 0], u[i + 3], &v[i + 0], &v[i + 3], &clamp_lo, + &clamp_hi); + addsub_avx2(u[i + 1], u[i + 2], &v[i + 1], &v[i + 2], &clamp_lo, + &clamp_hi); + + addsub_avx2(u[i + 7], u[i + 4], &v[i + 7], &v[i + 4], &clamp_lo, + &clamp_hi); + addsub_avx2(u[i + 6], u[i + 5], &v[i + 6], &v[i + 5], &clamp_lo, + &clamp_hi); + } + + for (i = 32; i < 64; i += 8) { + v[i + 0] = u[i + 0]; + v[i + 1] = u[i + 1]; + v[i + 6] = u[i + 6]; + v[i + 7] = u[i + 7]; + } + + v[34] = half_btf_avx2(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit); + v[35] = half_btf_avx2(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit); + v[36] = half_btf_avx2(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit); + v[37] = half_btf_avx2(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit); + v[42] = half_btf_avx2(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit); + v[43] = half_btf_avx2(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit); + v[44] = half_btf_avx2(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit); + v[45] = half_btf_avx2(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit); + v[50] = half_btf_avx2(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit); + v[51] = half_btf_avx2(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit); + v[52] = half_btf_avx2(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit); + v[53] = half_btf_avx2(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit); + v[58] = half_btf_avx2(&cospim8, &u[37], &cospi56, &u[58], 
&rnding, bit); + v[59] = half_btf_avx2(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit); + v[60] = half_btf_avx2(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit); + v[61] = half_btf_avx2(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit); + + // stage 7 + addsub_avx2(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi); + addsub_avx2(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi); + + u[4] = v[4]; + u[7] = v[7]; + u[5] = half_btf_avx2(&cospim32, &v[5], &cospi32, &v[6], &rnding, bit); + u[6] = half_btf_avx2(&cospi32, &v[5], &cospi32, &v[6], &rnding, bit); + + addsub_avx2(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi); + addsub_avx2(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi); + addsub_avx2(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi); + addsub_avx2(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi); + + for (i = 16; i < 32; i += 8) { + u[i + 0] = v[i + 0]; + u[i + 1] = v[i + 1]; + u[i + 6] = v[i + 6]; + u[i + 7] = v[i + 7]; + } + + u[18] = half_btf_avx2(&cospim16, &v[18], &cospi48, &v[29], &rnding, bit); + u[19] = half_btf_avx2(&cospim16, &v[19], &cospi48, &v[28], &rnding, bit); + u[20] = half_btf_avx2(&cospim48, &v[20], &cospim16, &v[27], &rnding, bit); + u[21] = half_btf_avx2(&cospim48, &v[21], &cospim16, &v[26], &rnding, bit); + u[26] = half_btf_avx2(&cospim16, &v[21], &cospi48, &v[26], &rnding, bit); + u[27] = half_btf_avx2(&cospim16, &v[20], &cospi48, &v[27], &rnding, bit); + u[28] = half_btf_avx2(&cospi48, &v[19], &cospi16, &v[28], &rnding, bit); + u[29] = half_btf_avx2(&cospi48, &v[18], &cospi16, &v[29], &rnding, bit); + + for (i = 32; i < 64; i += 16) { + for (j = i; j < i + 4; j++) { + addsub_avx2(v[j], v[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi); + addsub_avx2(v[j ^ 15], v[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo, + &clamp_hi); + } + } + + // stage 8 + for (i = 0; i < 4; ++i) { + addsub_avx2(u[i], u[7 - i], &v[i], &v[7 - i], &clamp_lo, &clamp_hi); + } + + v[8] = u[8]; + v[9] = u[9]; + v[14] = u[14]; + v[15] = u[15]; + + v[10] = 
half_btf_avx2(&cospim32, &u[10], &cospi32, &u[13], &rnding, bit); + v[11] = half_btf_avx2(&cospim32, &u[11], &cospi32, &u[12], &rnding, bit); + v[12] = half_btf_avx2(&cospi32, &u[11], &cospi32, &u[12], &rnding, bit); + v[13] = half_btf_avx2(&cospi32, &u[10], &cospi32, &u[13], &rnding, bit); + + for (i = 16; i < 20; ++i) { + addsub_avx2(u[i], u[i ^ 7], &v[i], &v[i ^ 7], &clamp_lo, &clamp_hi); + addsub_avx2(u[i ^ 15], u[i ^ 8], &v[i ^ 15], &v[i ^ 8], &clamp_lo, + &clamp_hi); + } + + for (i = 32; i < 36; ++i) { + v[i] = u[i]; + v[i + 12] = u[i + 12]; + v[i + 16] = u[i + 16]; + v[i + 28] = u[i + 28]; + } + + v[36] = half_btf_avx2(&cospim16, &u[36], &cospi48, &u[59], &rnding, bit); + v[37] = half_btf_avx2(&cospim16, &u[37], &cospi48, &u[58], &rnding, bit); + v[38] = half_btf_avx2(&cospim16, &u[38], &cospi48, &u[57], &rnding, bit); + v[39] = half_btf_avx2(&cospim16, &u[39], &cospi48, &u[56], &rnding, bit); + v[40] = half_btf_avx2(&cospim48, &u[40], &cospim16, &u[55], &rnding, bit); + v[41] = half_btf_avx2(&cospim48, &u[41], &cospim16, &u[54], &rnding, bit); + v[42] = half_btf_avx2(&cospim48, &u[42], &cospim16, &u[53], &rnding, bit); + v[43] = half_btf_avx2(&cospim48, &u[43], &cospim16, &u[52], &rnding, bit); + v[52] = half_btf_avx2(&cospim16, &u[43], &cospi48, &u[52], &rnding, bit); + v[53] = half_btf_avx2(&cospim16, &u[42], &cospi48, &u[53], &rnding, bit); + v[54] = half_btf_avx2(&cospim16, &u[41], &cospi48, &u[54], &rnding, bit); + v[55] = half_btf_avx2(&cospim16, &u[40], &cospi48, &u[55], &rnding, bit); + v[56] = half_btf_avx2(&cospi48, &u[39], &cospi16, &u[56], &rnding, bit); + v[57] = half_btf_avx2(&cospi48, &u[38], &cospi16, &u[57], &rnding, bit); + v[58] = half_btf_avx2(&cospi48, &u[37], &cospi16, &u[58], &rnding, bit); + v[59] = half_btf_avx2(&cospi48, &u[36], &cospi16, &u[59], &rnding, bit); + + // stage 9 + for (i = 0; i < 8; ++i) { + addsub_avx2(v[i], v[15 - i], &u[i], &u[15 - i], &clamp_lo, &clamp_hi); + } + + for (i = 16; i < 20; ++i) { + u[i] = v[i]; + u[i 
+ 12] = v[i + 12]; + } + + u[20] = half_btf_avx2(&cospim32, &v[20], &cospi32, &v[27], &rnding, bit); + u[21] = half_btf_avx2(&cospim32, &v[21], &cospi32, &v[26], &rnding, bit); + u[22] = half_btf_avx2(&cospim32, &v[22], &cospi32, &v[25], &rnding, bit); + u[23] = half_btf_avx2(&cospim32, &v[23], &cospi32, &v[24], &rnding, bit); + u[24] = half_btf_avx2(&cospi32, &v[23], &cospi32, &v[24], &rnding, bit); + u[25] = half_btf_avx2(&cospi32, &v[22], &cospi32, &v[25], &rnding, bit); + u[26] = half_btf_avx2(&cospi32, &v[21], &cospi32, &v[26], &rnding, bit); + u[27] = half_btf_avx2(&cospi32, &v[20], &cospi32, &v[27], &rnding, bit); + + for (i = 32; i < 40; i++) { + addsub_avx2(v[i], v[i ^ 15], &u[i], &u[i ^ 15], &clamp_lo, &clamp_hi); + } + + for (i = 48; i < 56; i++) { + addsub_avx2(v[i ^ 15], v[i], &u[i ^ 15], &u[i], &clamp_lo, &clamp_hi); + } + + // stage 10 + for (i = 0; i < 16; i++) { + addsub_avx2(u[i], u[31 - i], &v[i], &v[31 - i], &clamp_lo, &clamp_hi); + } + + for (i = 32; i < 40; i++) v[i] = u[i]; + + v[40] = half_btf_avx2(&cospim32, &u[40], &cospi32, &u[55], &rnding, bit); + v[41] = half_btf_avx2(&cospim32, &u[41], &cospi32, &u[54], &rnding, bit); + v[42] = half_btf_avx2(&cospim32, &u[42], &cospi32, &u[53], &rnding, bit); + v[43] = half_btf_avx2(&cospim32, &u[43], &cospi32, &u[52], &rnding, bit); + v[44] = half_btf_avx2(&cospim32, &u[44], &cospi32, &u[51], &rnding, bit); + v[45] = half_btf_avx2(&cospim32, &u[45], &cospi32, &u[50], &rnding, bit); + v[46] = half_btf_avx2(&cospim32, &u[46], &cospi32, &u[49], &rnding, bit); + v[47] = half_btf_avx2(&cospim32, &u[47], &cospi32, &u[48], &rnding, bit); + v[48] = half_btf_avx2(&cospi32, &u[47], &cospi32, &u[48], &rnding, bit); + v[49] = half_btf_avx2(&cospi32, &u[46], &cospi32, &u[49], &rnding, bit); + v[50] = half_btf_avx2(&cospi32, &u[45], &cospi32, &u[50], &rnding, bit); + v[51] = half_btf_avx2(&cospi32, &u[44], &cospi32, &u[51], &rnding, bit); + v[52] = half_btf_avx2(&cospi32, &u[43], &cospi32, &u[52], &rnding, bit); + 
v[53] = half_btf_avx2(&cospi32, &u[42], &cospi32, &u[53], &rnding, bit); + v[54] = half_btf_avx2(&cospi32, &u[41], &cospi32, &u[54], &rnding, bit); + v[55] = half_btf_avx2(&cospi32, &u[40], &cospi32, &u[55], &rnding, bit); + + for (i = 56; i < 64; i++) v[i] = u[i]; + + // stage 11 + for (i = 0; i < 32; i++) { + addsub_avx2(v[i], v[63 - i], &out[(i)], &out[(63 - i)], &clamp_lo, + &clamp_hi); + } + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + const __m256i clamp_lo_out = + _mm256_set1_epi32(-(1 << (log_range_out - 1))); + const __m256i clamp_hi_out = + _mm256_set1_epi32((1 << (log_range_out - 1)) - 1); + + round_shift_8x8_avx2(out, out_shift); + round_shift_8x8_avx2(out + 16, out_shift); + round_shift_8x8_avx2(out + 32, out_shift); + round_shift_8x8_avx2(out + 48, out_shift); + highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 64); + } + } +} typedef void (*transform_1d_avx2)(__m256i *in, __m256i *out, int bit, int do_cols, int bd, int out_shift); @@ -1164,19 +4086,21 @@ static const transform_1d_avx2 { NULL, NULL, NULL, NULL }, { NULL, NULL, NULL, NULL }, }, - { { NULL, NULL, NULL, NULL }, - { NULL, NULL, NULL, NULL }, - { NULL, NULL, NULL, NULL } }, { + { idct8x8_low1_avx2, idct8x8_avx2, NULL, NULL }, + { iadst8x8_low1_avx2, iadst8x8_avx2, NULL, NULL }, { NULL, NULL, NULL, NULL }, - { NULL, NULL, NULL, NULL }, + }, + { + { idct16_low1_avx2, idct16_low8_avx2, idct16_avx2, NULL }, + { iadst16_low1_avx2, iadst16_low8_avx2, iadst16_avx2, NULL }, { NULL, NULL, NULL, NULL }, }, { { idct32_low1_avx2, idct32_low8_avx2, idct32_low16_avx2, idct32_avx2 }, { NULL, NULL, NULL, NULL }, { NULL, NULL, NULL, NULL } }, - { { NULL, NULL, NULL, NULL }, + { { idct64_low1_avx2, idct64_low8_avx2, idct64_low16_avx2, idct64_avx2 }, { NULL, NULL, NULL, NULL }, { NULL, NULL, NULL, NULL } } }; @@ -1186,10 +4110,10 @@ static void highbd_inv_txfm2d_add_no_identity_avx2(const int32_t *input, TX_TYPE tx_type, TX_SIZE tx_size, int eob, const int bd) { - __m256i 
buf1[64 * 2]; + __m256i buf1[64 * 8]; int eobx, eoby; get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob); - const int8_t *shift = inv_txfm_shift_ls[tx_size]; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; const int txw_idx = get_txw_idx(tx_size); const int txh_idx = get_txh_idx(tx_size); const int txfm_size_col = tx_size_wide[tx_size]; @@ -1198,7 +4122,7 @@ static void highbd_inv_txfm2d_add_no_identity_avx2(const int32_t *input, const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3; const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3; const int input_stride = AOMMIN(32, txfm_size_col); - + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx]; const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby]; const transform_1d_avx2 row_txfm = @@ -1213,7 +4137,7 @@ static void highbd_inv_txfm2d_add_no_identity_avx2(const int32_t *input, // 1st stage: column transform for (int i = 0; i < buf_size_nonzero_h_div8; i++) { - __m256i buf0[32]; + __m256i buf0[64]; const int32_t *input_row = input + i * input_stride * 8; for (int j = 0; j < buf_size_nonzero_w_div8; ++j) { __m256i *buf0_cur = buf0 + j * 8; @@ -1221,18 +4145,29 @@ static void highbd_inv_txfm2d_add_no_identity_avx2(const int32_t *input, transpose_8x8_avx2(&buf0_cur[0], &buf0_cur[0]); } - - row_txfm(buf0, buf0, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]); + if (rect_type == 1 || rect_type == -1) { + av1_round_shift_rect_array_32_avx2( + buf0, buf0, buf_size_nonzero_w_div8 << 3, 0, NewInvSqrt2); + } + row_txfm(buf0, buf0, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, + -shift[0]); __m256i *_buf1 = buf1 + i * 8; - for (int j = 0; j < buf_size_w_div8; ++j) { - transpose_8x8_avx2(&buf0[j * 8], &_buf1[j * txfm_size_row]); + if (lr_flip) { + for (int j = 0; j < buf_size_w_div8; ++j) { + transpose_8x8_flip_avx2( + &buf0[j * 8], &_buf1[(buf_size_w_div8 - 1 - j) * txfm_size_row]); + } + } else { + for (int j = 0; j < buf_size_w_div8; 
++j) { + transpose_8x8_avx2(&buf0[j * 8], &_buf1[j * txfm_size_row]); + } } } // 2nd stage: column transform for (int i = 0; i < buf_size_w_div8; i++) { col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, - inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); av1_round_shift_array_32_avx2(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, txfm_size_row, @@ -1240,12 +4175,15 @@ static void highbd_inv_txfm2d_add_no_identity_avx2(const int32_t *input, } // write to buffer - { + if (txfm_size_col >= 16) { for (int i = 0; i < (txfm_size_col >> 4); i++) { highbd_write_buffer_16xn_avx2(buf1 + i * txfm_size_row * 2, output + 16 * i, stride, ud_flip, txfm_size_row, bd); } + } else if (txfm_size_col == 8) { + highbd_write_buffer_8xn_avx2(buf1, output, stride, ud_flip, txfm_size_row, + bd); } } @@ -1255,95 +4193,54 @@ void av1_highbd_inv_txfm2d_add_universe_avx2(const int32_t *input, int eob, const int bd) { switch (tx_type) { case DCT_DCT: + case ADST_DCT: + case DCT_ADST: + case ADST_ADST: + case FLIPADST_DCT: + case DCT_FLIPADST: + case FLIPADST_FLIPADST: + case ADST_FLIPADST: + case FLIPADST_ADST: highbd_inv_txfm2d_add_no_identity_avx2(input, CONVERT_TO_SHORTPTR(output), stride, tx_type, tx_size, eob, bd); break; - default: assert(0); break; - } -} - -void av1_highbd_inv_txfm_add_32x32_avx2(const tran_low_t *input, uint8_t *dest, - int stride, - const TxfmParam *txfm_param) { - const int bd = txfm_param->bd; - const TX_TYPE tx_type = txfm_param->tx_type; - const int32_t *src = cast_to_int32(input); - switch (tx_type) { - case DCT_DCT: - av1_highbd_inv_txfm2d_add_universe_avx2(input, dest, stride, tx_type, - txfm_param->tx_size, - txfm_param->eob, bd); - break; - // Assembly version doesn't support IDTX, so use C version for it. 
case IDTX: - av1_inv_txfm2d_add_32x32_c(src, CONVERT_TO_SHORTPTR(dest), stride, - tx_type, bd); + case H_DCT: + case H_ADST: + case H_FLIPADST: + case V_DCT: + case V_ADST: + case V_FLIPADST: + av1_highbd_inv_txfm2d_add_universe_sse4_1(input, output, stride, tx_type, + tx_size, eob, bd); break; - - default: assert(0); + default: assert(0); break; } } - void av1_highbd_inv_txfm_add_avx2(const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param) { assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]); const TX_SIZE tx_size = txfm_param->tx_size; switch (tx_size) { - case TX_32X32: - av1_highbd_inv_txfm_add_32x32_avx2(input, dest, stride, txfm_param); - break; - case TX_16X16: - av1_highbd_inv_txfm_add_16x16_sse4_1(input, dest, stride, txfm_param); - break; - case TX_8X8: - av1_highbd_inv_txfm_add_8x8_sse4_1(input, dest, stride, txfm_param); - break; case TX_4X8: - av1_highbd_inv_txfm_add_4x8(input, dest, stride, txfm_param); + av1_highbd_inv_txfm_add_4x8_sse4_1(input, dest, stride, txfm_param); break; case TX_8X4: - av1_highbd_inv_txfm_add_8x4(input, dest, stride, txfm_param); - break; - case TX_8X16: - av1_highbd_inv_txfm_add_8x16_sse4_1(input, dest, stride, txfm_param); - break; - case TX_16X8: - av1_highbd_inv_txfm_add_16x8_sse4_1(input, dest, stride, txfm_param); - break; - case TX_16X32: - av1_highbd_inv_txfm_add_16x32(input, dest, stride, txfm_param); - break; - case TX_32X16: - av1_highbd_inv_txfm_add_32x16(input, dest, stride, txfm_param); - break; - case TX_32X64: - av1_highbd_inv_txfm_add_32x64(input, dest, stride, txfm_param); - break; - case TX_64X32: - av1_highbd_inv_txfm_add_64x32(input, dest, stride, txfm_param); + av1_highbd_inv_txfm_add_8x4_sse4_1(input, dest, stride, txfm_param); break; case TX_4X4: av1_highbd_inv_txfm_add_4x4_sse4_1(input, dest, stride, txfm_param); break; case TX_16X4: - av1_highbd_inv_txfm_add_16x4(input, dest, stride, txfm_param); + av1_highbd_inv_txfm_add_16x4_sse4_1(input, dest, stride, 
txfm_param); break; case TX_4X16: - av1_highbd_inv_txfm_add_4x16(input, dest, stride, txfm_param); - break; - case TX_8X32: - av1_highbd_inv_txfm_add_8x32(input, dest, stride, txfm_param); - break; - case TX_32X8: - av1_highbd_inv_txfm_add_32x8(input, dest, stride, txfm_param); + av1_highbd_inv_txfm_add_4x16_sse4_1(input, dest, stride, txfm_param); break; - case TX_64X64: - case TX_16X64: - case TX_64X16: - av1_highbd_inv_txfm2d_add_universe_sse4_1( + default: + av1_highbd_inv_txfm2d_add_universe_avx2( input, dest, stride, txfm_param->tx_type, txfm_param->tx_size, txfm_param->eob, txfm_param->bd); break; - default: assert(0 && "Invalid transform size"); break; } } diff --git a/media/libaom/src/av1/common/x86/highbd_inv_txfm_sse4.c b/media/libaom/src/av1/common/x86/highbd_inv_txfm_sse4.c index e29e0baf5..03eaef832 100644 --- a/media/libaom/src/av1/common/x86/highbd_inv_txfm_sse4.c +++ b/media/libaom/src/av1/common/x86/highbd_inv_txfm_sse4.c @@ -17,6 +17,7 @@ #include "av1/common/av1_inv_txfm1d_cfg.h" #include "av1/common/idct.h" #include "av1/common/x86/av1_inv_txfm_ssse3.h" +#include "av1/common/x86/av1_txfm_sse2.h" #include "av1/common/x86/av1_txfm_sse4.h" #include "av1/common/x86/highbd_txfm_utility_sse4.h" @@ -36,19 +37,87 @@ static INLINE __m128i highbd_clamp_epi16(__m128i u, int bd) { return clamped; } +static INLINE void round_shift_4x4(__m128i *in, int shift) { + if (shift != 0) { + __m128i rnding = _mm_set1_epi32(1 << (shift - 1)); + in[0] = _mm_add_epi32(in[0], rnding); + in[1] = _mm_add_epi32(in[1], rnding); + in[2] = _mm_add_epi32(in[2], rnding); + in[3] = _mm_add_epi32(in[3], rnding); + + in[0] = _mm_srai_epi32(in[0], shift); + in[1] = _mm_srai_epi32(in[1], shift); + in[2] = _mm_srai_epi32(in[2], shift); + in[3] = _mm_srai_epi32(in[3], shift); + } +} + +static void round_shift_8x8(__m128i *in, int shift) { + round_shift_4x4(&in[0], shift); + round_shift_4x4(&in[4], shift); + round_shift_4x4(&in[8], shift); + round_shift_4x4(&in[12], shift); +} + +static 
void highbd_clamp_epi32_sse4_1(__m128i *in, __m128i *out, + const __m128i *clamp_lo, + const __m128i *clamp_hi, int size) { + __m128i a0, a1; + for (int i = 0; i < size; i += 4) { + a0 = _mm_max_epi32(in[i], *clamp_lo); + out[i] = _mm_min_epi32(a0, *clamp_hi); + + a1 = _mm_max_epi32(in[i + 1], *clamp_lo); + out[i + 1] = _mm_min_epi32(a1, *clamp_hi); + + a0 = _mm_max_epi32(in[i + 2], *clamp_lo); + out[i + 2] = _mm_min_epi32(a0, *clamp_hi); + + a1 = _mm_max_epi32(in[i + 3], *clamp_lo); + out[i + 3] = _mm_min_epi32(a1, *clamp_hi); + } +} + static INLINE __m128i highbd_get_recon_8x8_sse4_1(const __m128i pred, __m128i res0, __m128i res1, const int bd) { __m128i x0 = _mm_cvtepi16_epi32(pred); __m128i x1 = _mm_cvtepi16_epi32(_mm_srli_si128(pred, 8)); - + __m128i min_clip_val = _mm_setzero_si128(); + __m128i max_clip_val = _mm_set1_epi32((1 << bd) - 1); x0 = _mm_add_epi32(res0, x0); x1 = _mm_add_epi32(res1, x1); + x0 = _mm_max_epi32(x0, min_clip_val); + x0 = _mm_min_epi32(x0, max_clip_val); + x1 = _mm_max_epi32(x1, min_clip_val); + x1 = _mm_min_epi32(x1, max_clip_val); x0 = _mm_packus_epi32(x0, x1); + return x0; +} + +static INLINE __m128i highbd_get_recon_4xn_sse4_1(const __m128i pred, + __m128i res0, const int bd) { + __m128i x0 = _mm_cvtepi16_epi32(pred); + + x0 = _mm_add_epi32(res0, x0); + x0 = _mm_packus_epi32(x0, x0); x0 = highbd_clamp_epi16(x0, bd); return x0; } +static INLINE void highbd_write_buffer_4xn_sse4_1(__m128i *in, uint16_t *output, + int stride, int flipud, + int height, const int bd) { + int j = flipud ? (height - 1) : 0; + const int step = flipud ? 
-1 : 1; + for (int i = 0; i < height; ++i, j += step) { + __m128i v = _mm_loadl_epi64((__m128i const *)(output + i * stride)); + __m128i u = highbd_get_recon_4xn_sse4_1(v, in[j], bd); + + _mm_storel_epi64((__m128i *)(output + i * stride), u); + } +} + static INLINE void highbd_write_buffer_8xn_sse4_1(__m128i *in, uint16_t *output, int stride, int flipud, int height, const int bd) { @@ -91,34 +160,23 @@ static void addsub_sse4_1(const __m128i in0, const __m128i in1, __m128i *out0, *out1 = a1; } -static void addsub_no_clamp_sse4_1(const __m128i in0, const __m128i in1, - __m128i *out0, __m128i *out1) { - __m128i a0 = _mm_add_epi32(in0, in1); - __m128i a1 = _mm_sub_epi32(in0, in1); - - *out0 = a0; - *out1 = a1; -} - -static void addsub_shift_sse4_1(const __m128i in0, const __m128i in1, - __m128i *out0, __m128i *out1, - const __m128i *clamp_lo, - const __m128i *clamp_hi, int shift) { +static void shift_and_clamp_sse4_1(__m128i *in0, __m128i *in1, + const __m128i *clamp_lo, + const __m128i *clamp_hi, int shift) { __m128i offset = _mm_set1_epi32((1 << shift) >> 1); - __m128i in0_w_offset = _mm_add_epi32(in0, offset); - __m128i a0 = _mm_add_epi32(in0_w_offset, in1); - __m128i a1 = _mm_sub_epi32(in0_w_offset, in1); + __m128i in0_w_offset = _mm_add_epi32(*in0, offset); + __m128i in1_w_offset = _mm_add_epi32(*in1, offset); - a0 = _mm_sra_epi32(a0, _mm_cvtsi32_si128(shift)); - a1 = _mm_sra_epi32(a1, _mm_cvtsi32_si128(shift)); + in0_w_offset = _mm_sra_epi32(in0_w_offset, _mm_cvtsi32_si128(shift)); + in1_w_offset = _mm_sra_epi32(in1_w_offset, _mm_cvtsi32_si128(shift)); - a0 = _mm_max_epi32(a0, *clamp_lo); - a0 = _mm_min_epi32(a0, *clamp_hi); - a1 = _mm_max_epi32(a1, *clamp_lo); - a1 = _mm_min_epi32(a1, *clamp_hi); + in0_w_offset = _mm_max_epi32(in0_w_offset, *clamp_lo); + in0_w_offset = _mm_min_epi32(in0_w_offset, *clamp_hi); + in1_w_offset = _mm_max_epi32(in1_w_offset, *clamp_lo); + in1_w_offset = _mm_min_epi32(in1_w_offset, *clamp_hi); - *out0 = a0; - *out1 = a1; + *in0 = 
in0_w_offset; + *in1 = in1_w_offset; } static INLINE void idct32_stage4_sse4_1( @@ -274,63 +332,34 @@ static INLINE void idct32_stage8_sse4_1(__m128i *bf1, const __m128i *cospim32, static INLINE void idct32_stage9_sse4_1(__m128i *bf1, __m128i *out, const int do_cols, const int bd, const int out_shift, - const int log_range) { - if (do_cols) { - addsub_no_clamp_sse4_1(bf1[0], bf1[31], out + 0, out + 31); - addsub_no_clamp_sse4_1(bf1[1], bf1[30], out + 1, out + 30); - addsub_no_clamp_sse4_1(bf1[2], bf1[29], out + 2, out + 29); - addsub_no_clamp_sse4_1(bf1[3], bf1[28], out + 3, out + 28); - addsub_no_clamp_sse4_1(bf1[4], bf1[27], out + 4, out + 27); - addsub_no_clamp_sse4_1(bf1[5], bf1[26], out + 5, out + 26); - addsub_no_clamp_sse4_1(bf1[6], bf1[25], out + 6, out + 25); - addsub_no_clamp_sse4_1(bf1[7], bf1[24], out + 7, out + 24); - addsub_no_clamp_sse4_1(bf1[8], bf1[23], out + 8, out + 23); - addsub_no_clamp_sse4_1(bf1[9], bf1[22], out + 9, out + 22); - addsub_no_clamp_sse4_1(bf1[10], bf1[21], out + 10, out + 21); - addsub_no_clamp_sse4_1(bf1[11], bf1[20], out + 11, out + 20); - addsub_no_clamp_sse4_1(bf1[12], bf1[19], out + 12, out + 19); - addsub_no_clamp_sse4_1(bf1[13], bf1[18], out + 13, out + 18); - addsub_no_clamp_sse4_1(bf1[14], bf1[17], out + 14, out + 17); - addsub_no_clamp_sse4_1(bf1[15], bf1[16], out + 15, out + 16); - } else { + const __m128i *clamp_lo, + const __m128i *clamp_hi) { + addsub_sse4_1(bf1[0], bf1[31], out + 0, out + 31, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[1], bf1[30], out + 1, out + 30, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[2], bf1[29], out + 2, out + 29, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[3], bf1[28], out + 3, out + 28, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[4], bf1[27], out + 4, out + 27, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[5], bf1[26], out + 5, out + 26, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[6], bf1[25], out + 6, out + 25, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[7], bf1[24], out + 7, out + 24, clamp_lo, clamp_hi); 
+ addsub_sse4_1(bf1[8], bf1[23], out + 8, out + 23, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[9], bf1[22], out + 9, out + 22, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[10], bf1[21], out + 10, out + 21, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[11], bf1[20], out + 11, out + 20, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[12], bf1[19], out + 12, out + 19, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[13], bf1[18], out + 13, out + 18, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[14], bf1[17], out + 14, out + 17, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[15], bf1[16], out + 15, out + 16, clamp_lo, clamp_hi); + + if (!do_cols) { const int log_range_out = AOMMAX(16, bd + 6); - const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX( - -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift)))); - const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN( - (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift)))); - - addsub_shift_sse4_1(bf1[0], bf1[31], out + 0, out + 31, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(bf1[1], bf1[30], out + 1, out + 30, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(bf1[2], bf1[29], out + 2, out + 29, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(bf1[3], bf1[28], out + 3, out + 28, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(bf1[4], bf1[27], out + 4, out + 27, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(bf1[5], bf1[26], out + 5, out + 26, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(bf1[6], bf1[25], out + 6, out + 25, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(bf1[7], bf1[24], out + 7, out + 24, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(bf1[8], bf1[23], out + 8, out + 23, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(bf1[9], bf1[22], out + 9, out + 22, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(bf1[10], bf1[21], out + 10, out + 21, 
&clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(bf1[11], bf1[20], out + 11, out + 20, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(bf1[12], bf1[19], out + 12, out + 19, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(bf1[13], bf1[18], out + 13, out + 18, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(bf1[14], bf1[17], out + 14, out + 17, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(bf1[15], bf1[16], out + 15, out + 16, &clamp_lo_out, - &clamp_hi_out, out_shift); + const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); + const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + for (int i = 0; i < 32; i += 8) { + round_shift_4x4(out + i, out_shift); + round_shift_4x4(out + i + 4, out_shift); + } + highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 32); } } @@ -354,17 +383,23 @@ static void neg_shift_sse4_1(const __m128i in0, const __m128i in1, *out1 = a1; } -static void idct4x4_sse4_1(__m128i *in, int bit, int do_cols, int bd) { +static void idct4x4_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, + int bd, int out_shift) { const int32_t *cospi = cospi_arr(bit); const __m128i cospi32 = _mm_set1_epi32(cospi[32]); const __m128i cospi48 = _mm_set1_epi32(cospi[48]); const __m128i cospi16 = _mm_set1_epi32(cospi[16]); const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); - + int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); + __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); __m128i u0, u1, u2, u3; __m128i v0, v1, v2, v3, x, y; + // Stage 0 + // Stage 1 + // Stage 2 v0 = _mm_unpacklo_epi32(in[0], in[1]); v1 = _mm_unpackhi_epi32(in[0], in[1]); v2 = _mm_unpacklo_epi32(in[2], in[3]); @@ -397,21 +432,27 @@ static void idct4x4_sse4_1(__m128i *in, int bit, int do_cols, int bd) { v3 = _mm_add_epi32(v3, rnding); v3 = _mm_srai_epi32(v3, bit); - if (do_cols) { - addsub_no_clamp_sse4_1(v0, v3, in + 0, in + 3); - addsub_no_clamp_sse4_1(v1, v2, in + 1, in + 2); - } else { - const int log_range = AOMMAX(16, bd + 6); - const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); - const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); - addsub_sse4_1(v0, v3, in + 0, in + 3, &clamp_lo, &clamp_hi); - addsub_sse4_1(v1, v2, in + 1, in + 2, &clamp_lo, &clamp_hi); + // Stage 3 + addsub_sse4_1(v0, v3, out + 0, out + 3, &clamp_lo, &clamp_hi); + addsub_sse4_1(v1, v2, out + 1, out + 2, &clamp_lo, &clamp_hi); + + if (!do_cols) { + log_range = AOMMAX(16, bd + 6); + clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + + shift_and_clamp_sse4_1(out + 0, out + 3, &clamp_lo, &clamp_hi, out_shift); + shift_and_clamp_sse4_1(out + 1, out + 2, &clamp_lo, &clamp_hi, out_shift); } } -static void iadst4x4_sse4_1(__m128i *in, int bit, int do_cols, int bd) { +static void iadst4x4_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, + int bd, int out_shift) { const int32_t *sinpi = sinpi_arr(bit); - const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + const __m128i zero = _mm_set1_epi32(0); + __m128i rnding = _mm_set1_epi32(1 << (bit + 4 - 1)); + rnding = _mm_unpacklo_epi32(rnding, zero); + const __m128i mul = _mm_set1_epi32(1 << 4); const __m128i sinpi1 = _mm_set1_epi32((int)sinpi[1]); const __m128i sinpi2 = _mm_set1_epi32((int)sinpi[2]); const __m128i sinpi3 
= _mm_set1_epi32((int)sinpi[3]); @@ -421,6 +462,8 @@ static void iadst4x4_sse4_1(__m128i *in, int bit, int do_cols, int bd) { __m128i x0, x1, x2, x3; __m128i u0, u1, u2, u3; __m128i v0, v1, v2, v3; + __m128i u0_low, u1_low, u2_low, u3_low; + __m128i u0_high, u1_high, u2_high, u3_high; v0 = _mm_unpacklo_epi32(in[0], in[1]); v1 = _mm_unpackhi_epi32(in[0], in[1]); @@ -455,51 +498,78 @@ static void iadst4x4_sse4_1(__m128i *in, int bit, int do_cols, int bd) { t = _mm_add_epi32(s0, s1); u3 = _mm_sub_epi32(t, s3); - u0 = _mm_add_epi32(u0, rnding); - u0 = _mm_srai_epi32(u0, bit); + // u0 + u0_low = _mm_mul_epi32(u0, mul); + u0_low = _mm_add_epi64(u0_low, rnding); + + u0 = _mm_srli_si128(u0, 4); + u0_high = _mm_mul_epi32(u0, mul); + u0_high = _mm_add_epi64(u0_high, rnding); + + u0_low = _mm_srli_si128(u0_low, 2); + u0_high = _mm_srli_si128(u0_high, 2); + + u0 = _mm_unpacklo_epi32(u0_low, u0_high); + u0_high = _mm_unpackhi_epi32(u0_low, u0_high); + u0 = _mm_unpacklo_epi64(u0, u0_high); + + // u1 + u1_low = _mm_mul_epi32(u1, mul); + u1_low = _mm_add_epi64(u1_low, rnding); + + u1 = _mm_srli_si128(u1, 4); + u1_high = _mm_mul_epi32(u1, mul); + u1_high = _mm_add_epi64(u1_high, rnding); + + u1_low = _mm_srli_si128(u1_low, 2); + u1_high = _mm_srli_si128(u1_high, 2); + + u1 = _mm_unpacklo_epi32(u1_low, u1_high); + u1_high = _mm_unpackhi_epi32(u1_low, u1_high); + u1 = _mm_unpacklo_epi64(u1, u1_high); + + // u2 + u2_low = _mm_mul_epi32(u2, mul); + u2_low = _mm_add_epi64(u2_low, rnding); + + u2 = _mm_srli_si128(u2, 4); + u2_high = _mm_mul_epi32(u2, mul); + u2_high = _mm_add_epi64(u2_high, rnding); + + u2_low = _mm_srli_si128(u2_low, 2); + u2_high = _mm_srli_si128(u2_high, 2); + + u2 = _mm_unpacklo_epi32(u2_low, u2_high); + u2_high = _mm_unpackhi_epi32(u2_low, u2_high); + u2 = _mm_unpacklo_epi64(u2, u2_high); + + // u3 + u3_low = _mm_mul_epi32(u3, mul); + u3_low = _mm_add_epi64(u3_low, rnding); + + u3 = _mm_srli_si128(u3, 4); + u3_high = _mm_mul_epi32(u3, mul); + u3_high = 
_mm_add_epi64(u3_high, rnding); - u1 = _mm_add_epi32(u1, rnding); - u1 = _mm_srai_epi32(u1, bit); + u3_low = _mm_srli_si128(u3_low, 2); + u3_high = _mm_srli_si128(u3_high, 2); - u2 = _mm_add_epi32(u2, rnding); - u2 = _mm_srai_epi32(u2, bit); + u3 = _mm_unpacklo_epi32(u3_low, u3_high); + u3_high = _mm_unpackhi_epi32(u3_low, u3_high); + u3 = _mm_unpacklo_epi64(u3, u3_high); - u3 = _mm_add_epi32(u3, rnding); - u3 = _mm_srai_epi32(u3, bit); + out[0] = u0; + out[1] = u1; + out[2] = u2; + out[3] = u3; if (!do_cols) { const int log_range = AOMMAX(16, bd + 6); const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); - - u0 = _mm_max_epi32(u0, clamp_lo); - u0 = _mm_min_epi32(u0, clamp_hi); - u1 = _mm_max_epi32(u1, clamp_lo); - u1 = _mm_min_epi32(u1, clamp_hi); - u2 = _mm_max_epi32(u2, clamp_lo); - u2 = _mm_min_epi32(u2, clamp_hi); - u3 = _mm_max_epi32(u3, clamp_lo); - u3 = _mm_min_epi32(u3, clamp_hi); + round_shift_4x4(out, out_shift); + highbd_clamp_epi32_sse4_1(out, out, &clamp_lo, &clamp_hi, 4); } - - in[0] = u0; - in[1] = u1; - in[2] = u2; - in[3] = u3; -} - -static INLINE void round_shift_4x4(__m128i *in, int shift) { - __m128i rnding = _mm_set1_epi32(1 << (shift - 1)); - - in[0] = _mm_add_epi32(in[0], rnding); - in[1] = _mm_add_epi32(in[1], rnding); - in[2] = _mm_add_epi32(in[2], rnding); - in[3] = _mm_add_epi32(in[3], rnding); - - in[0] = _mm_srai_epi32(in[0], shift); - in[1] = _mm_srai_epi32(in[1], shift); - in[2] = _mm_srai_epi32(in[2], shift); - in[3] = _mm_srai_epi32(in[3], shift); } static void write_buffer_4x4(__m128i *in, uint16_t *output, int stride, @@ -556,68 +626,164 @@ static void write_buffer_4x4(__m128i *in, uint16_t *output, int stride, _mm_storel_epi64((__m128i *)(output + 3 * stride), v3); } -void av1_inv_txfm2d_add_4x4_sse4_1(const int32_t *coeff, uint16_t *output, +static void iidentity4_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, + int bd, int out_shift) { + 
(void)bit; + __m128i v[4]; + __m128i zero = _mm_set1_epi32(0); + __m128i fact = _mm_set1_epi32(NewSqrt2); + __m128i offset = _mm_set1_epi32(1 << (NewSqrt2Bits - 1)); + __m128i a0_low, a1_low; + __m128i a0_high, a1_high; + + offset = _mm_unpacklo_epi32(offset, zero); + + for (int i = 0; i < 4; i++) { + a0_low = _mm_mul_epi32(in[i], fact); + a0_low = _mm_add_epi32(a0_low, offset); + a0_low = _mm_srli_epi64(a0_low, NewSqrt2Bits); + + a0_high = _mm_srli_si128(in[i], 4); + a0_high = _mm_mul_epi32(a0_high, fact); + a0_high = _mm_add_epi32(a0_high, offset); + a0_high = _mm_srli_epi64(a0_high, NewSqrt2Bits); + + a1_low = _mm_unpacklo_epi32(a0_low, a0_high); + a1_high = _mm_unpackhi_epi32(a0_low, a0_high); + out[i] = _mm_unpacklo_epi64(a1_low, a1_high); + } + + if (!do_cols) { + const int log_range = AOMMAX(16, bd + 6); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + round_shift_4x4(out, out_shift); + highbd_clamp_epi32_sse4_1(out, out, &clamp_lo, &clamp_hi, 4); + } + + // Transpose for 4x4 + v[0] = _mm_unpacklo_epi32(out[0], out[1]); + v[1] = _mm_unpackhi_epi32(out[0], out[1]); + v[2] = _mm_unpacklo_epi32(out[2], out[3]); + v[3] = _mm_unpackhi_epi32(out[2], out[3]); + + out[0] = _mm_unpacklo_epi64(v[0], v[2]); + out[1] = _mm_unpackhi_epi64(v[0], v[2]); + out[2] = _mm_unpacklo_epi64(v[1], v[3]); + out[3] = _mm_unpackhi_epi64(v[1], v[3]); +} +void av1_inv_txfm2d_add_4x4_sse4_1(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd) { __m128i in[4]; - const int8_t *shift = inv_txfm_shift_ls[TX_4X4]; + const int8_t *shift = av1_inv_txfm_shift_ls[TX_4X4]; const int txw_idx = get_txw_idx(TX_4X4); const int txh_idx = get_txh_idx(TX_4X4); switch (tx_type) { case DCT_DCT: - load_buffer_4x4(coeff, in); - idct4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd); - idct4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd); + load_buffer_4x4(input, in); + 
idct4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0); + idct4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); break; case ADST_DCT: - load_buffer_4x4(coeff, in); - idct4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd); - iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd); + load_buffer_4x4(input, in); + idct4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0); + iadst4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); break; case DCT_ADST: - load_buffer_4x4(coeff, in); - iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd); - idct4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd); + load_buffer_4x4(input, in); + iadst4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0); + idct4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); break; case ADST_ADST: - load_buffer_4x4(coeff, in); - iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd); - iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd); + load_buffer_4x4(input, in); + iadst4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0); + iadst4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); break; case FLIPADST_DCT: - load_buffer_4x4(coeff, in); - idct4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd); - iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd); + load_buffer_4x4(input, in); + idct4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0); + iadst4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd); break; case DCT_FLIPADST: - load_buffer_4x4(coeff, in); - iadst4x4_sse4_1(in, 
inv_cos_bit_row[txw_idx][txh_idx], 0, bd); - idct4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd); + load_buffer_4x4(input, in); + iadst4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0); + idct4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd); break; case FLIPADST_FLIPADST: - load_buffer_4x4(coeff, in); - iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd); - iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd); + load_buffer_4x4(input, in); + iadst4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0); + iadst4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); write_buffer_4x4(in, output, stride, 1, 1, -shift[1], bd); break; case ADST_FLIPADST: - load_buffer_4x4(coeff, in); - iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd); - iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd); + load_buffer_4x4(input, in); + iadst4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0); + iadst4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd); break; case FLIPADST_ADST: - load_buffer_4x4(coeff, in); - iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd); - iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd); + load_buffer_4x4(input, in); + iadst4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0); + iadst4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd); + break; + case IDTX: + load_buffer_4x4(input, in); + iidentity4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, + 0); + iidentity4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, + 0); + write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); + break; + case V_DCT: + load_buffer_4x4(input, in); + iidentity4_sse4_1(in, in, 
av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, + 0); + idct4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); + break; + case H_DCT: + load_buffer_4x4(input, in); + idct4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0); + iidentity4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, + 0); + write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); + break; + case V_ADST: + load_buffer_4x4(input, in); + iidentity4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, + 0); + iadst4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); + break; + case H_ADST: + load_buffer_4x4(input, in); + iadst4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0); + iidentity4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, + 0); + write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); + break; + case V_FLIPADST: + load_buffer_4x4(input, in); + iidentity4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, + 0); + iadst4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd); break; + case H_FLIPADST: + load_buffer_4x4(input, in); + iadst4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0); + iidentity4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, + 0); + write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd); + break; default: assert(0); } } @@ -745,26 +911,22 @@ static void idct8x8_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, u5 = _mm_srai_epi32(u5, bit); // stage 5 - if (do_cols) { - addsub_no_clamp_sse4_1(u0, u7, out + 0 * 2 + col, out + 7 * 2 + col); - addsub_no_clamp_sse4_1(u1, u6, out + 1 * 2 + col, out + 6 * 2 + col); - addsub_no_clamp_sse4_1(u2, u5, out + 2 * 2 + col, out + 5 * 2 + col); - addsub_no_clamp_sse4_1(u3, u4, out + 3 * 2 + 
col, out + 4 * 2 + col); - } else { - const int log_range_out = AOMMAX(16, bd + 6); - const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX( - -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift)))); - const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN( - (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift)))); - addsub_shift_sse4_1(u0, u7, out + 0 * 2 + col, out + 7 * 2 + col, - &clamp_lo_out, &clamp_hi_out, out_shift); - addsub_shift_sse4_1(u1, u6, out + 1 * 2 + col, out + 6 * 2 + col, - &clamp_lo_out, &clamp_hi_out, out_shift); - addsub_shift_sse4_1(u2, u5, out + 2 * 2 + col, out + 5 * 2 + col, - &clamp_lo_out, &clamp_hi_out, out_shift); - addsub_shift_sse4_1(u3, u4, out + 3 * 2 + col, out + 4 * 2 + col, - &clamp_lo_out, &clamp_hi_out, out_shift); - } + addsub_sse4_1(u0, u7, out + 0 * 2 + col, out + 7 * 2 + col, &clamp_lo, + &clamp_hi); + addsub_sse4_1(u1, u6, out + 1 * 2 + col, out + 6 * 2 + col, &clamp_lo, + &clamp_hi); + addsub_sse4_1(u2, u5, out + 2 * 2 + col, out + 5 * 2 + col, &clamp_lo, + &clamp_hi); + addsub_sse4_1(u3, u4, out + 3 * 2 + col, out + 4 * 2 + col, &clamp_lo, + &clamp_hi); + } + + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); + const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + round_shift_8x8(out, out_shift); + highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 16); } } @@ -1089,11 +1251,26 @@ static void iadst8x8_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, } } -static void round_shift_8x8(__m128i *in, int shift) { - round_shift_4x4(&in[0], shift); - round_shift_4x4(&in[4], shift); - round_shift_4x4(&in[8], shift); - round_shift_4x4(&in[12], shift); +static void iidentity8_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, + int bd, int out_shift) { + (void)bit; + out[0] = _mm_add_epi32(in[0], in[0]); + out[1] = _mm_add_epi32(in[1], in[1]); + out[2] = 
_mm_add_epi32(in[2], in[2]); + out[3] = _mm_add_epi32(in[3], in[3]); + out[4] = _mm_add_epi32(in[4], in[4]); + out[5] = _mm_add_epi32(in[5], in[5]); + out[6] = _mm_add_epi32(in[6], in[6]); + out[7] = _mm_add_epi32(in[7], in[7]); + + if (!do_cols) { + const int log_range = AOMMAX(16, bd + 6); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + round_shift_4x4(out, out_shift); + round_shift_4x4(out + 4, out_shift); + highbd_clamp_epi32_sse4_1(out, out, &clamp_lo, &clamp_hi, 8); + } } static __m128i get_recon_8x8(const __m128i pred, __m128i res_lo, __m128i res_hi, @@ -1165,93 +1342,93 @@ static void write_buffer_8x8(__m128i *in, uint16_t *output, int stride, _mm_store_si128((__m128i *)(output + 7 * stride), u7); } -void av1_inv_txfm2d_add_8x8_sse4_1(const int32_t *coeff, uint16_t *output, +void av1_inv_txfm2d_add_8x8_sse4_1(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd) { __m128i in[16], out[16]; - const int8_t *shift = inv_txfm_shift_ls[TX_8X8]; + const int8_t *shift = av1_inv_txfm_shift_ls[TX_8X8]; const int txw_idx = get_txw_idx(TX_8X8); const int txh_idx = get_txh_idx(TX_8X8); switch (tx_type) { case DCT_DCT: - load_buffer_8x8(coeff, in); + load_buffer_8x8(input, in); transpose_8x8(in, out); - idct8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, + idct8x8_sse4_1(out, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]); transpose_8x8(in, out); - idct8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + idct8x8_sse4_1(out, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); write_buffer_8x8(in, output, stride, 0, 0, -shift[1], bd); break; case DCT_ADST: - load_buffer_8x8(coeff, in); + load_buffer_8x8(input, in); transpose_8x8(in, out); - iadst8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, + iadst8x8_sse4_1(out, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]); transpose_8x8(in, out); - 
idct8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + idct8x8_sse4_1(out, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); write_buffer_8x8(in, output, stride, 0, 0, -shift[1], bd); break; case ADST_DCT: - load_buffer_8x8(coeff, in); + load_buffer_8x8(input, in); transpose_8x8(in, out); - idct8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, + idct8x8_sse4_1(out, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]); transpose_8x8(in, out); - iadst8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + iadst8x8_sse4_1(out, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); write_buffer_8x8(in, output, stride, 0, 0, -shift[1], bd); break; case ADST_ADST: - load_buffer_8x8(coeff, in); + load_buffer_8x8(input, in); transpose_8x8(in, out); - iadst8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, + iadst8x8_sse4_1(out, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]); transpose_8x8(in, out); - iadst8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + iadst8x8_sse4_1(out, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); write_buffer_8x8(in, output, stride, 0, 0, -shift[1], bd); break; case FLIPADST_DCT: - load_buffer_8x8(coeff, in); + load_buffer_8x8(input, in); transpose_8x8(in, out); - idct8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, + idct8x8_sse4_1(out, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]); transpose_8x8(in, out); - iadst8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + iadst8x8_sse4_1(out, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); write_buffer_8x8(in, output, stride, 0, 1, -shift[1], bd); break; case DCT_FLIPADST: - load_buffer_8x8(coeff, in); + load_buffer_8x8(input, in); transpose_8x8(in, out); - iadst8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, + iadst8x8_sse4_1(out, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]); transpose_8x8(in, out); - idct8x8_sse4_1(out, in, 
inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + idct8x8_sse4_1(out, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); write_buffer_8x8(in, output, stride, 1, 0, -shift[1], bd); break; case ADST_FLIPADST: - load_buffer_8x8(coeff, in); + load_buffer_8x8(input, in); transpose_8x8(in, out); - iadst8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, + iadst8x8_sse4_1(out, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]); transpose_8x8(in, out); - iadst8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + iadst8x8_sse4_1(out, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); write_buffer_8x8(in, output, stride, 1, 0, -shift[1], bd); break; case FLIPADST_FLIPADST: - load_buffer_8x8(coeff, in); + load_buffer_8x8(input, in); transpose_8x8(in, out); - iadst8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, + iadst8x8_sse4_1(out, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]); transpose_8x8(in, out); - iadst8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + iadst8x8_sse4_1(out, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); write_buffer_8x8(in, output, stride, 1, 1, -shift[1], bd); break; case FLIPADST_ADST: - load_buffer_8x8(coeff, in); + load_buffer_8x8(input, in); transpose_8x8(in, out); - iadst8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, + iadst8x8_sse4_1(out, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]); transpose_8x8(in, out); - iadst8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + iadst8x8_sse4_1(out, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); write_buffer_8x8(in, output, stride, 0, 1, -shift[1], bd); break; default: assert(0); @@ -1264,6 +1441,8 @@ static void idct8x8_low1_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, const __m128i cospi32 = _mm_set1_epi32(cospi[32]); const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); + __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); __m128i x; // stage 0 @@ -1278,18 +1457,16 @@ static void idct8x8_low1_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, // stage 5 if (!do_cols) { const int log_range_out = AOMMAX(16, bd + 6); - const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX( - -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift)))); - const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN( - (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift)))); + clamp_lo = _mm_set1_epi32(-(1 << (log_range_out - 1))); + clamp_hi = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1); x = _mm_add_epi32(x, offset); x = _mm_sra_epi32(x, _mm_cvtsi32_si128(out_shift)); - x = _mm_max_epi32(x, clamp_lo_out); - x = _mm_min_epi32(x, clamp_hi_out); } + x = _mm_max_epi32(x, clamp_lo); + x = _mm_min_epi32(x, clamp_hi); out[0] = x; out[1] = x; out[2] = x; @@ -1396,25 +1573,19 @@ static void idct8x8_new_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, u5 = _mm_srai_epi32(u5, bit); // stage 5 - if (do_cols) { - addsub_no_clamp_sse4_1(u0, u7, out + 0, out + 7); - addsub_no_clamp_sse4_1(u1, u6, out + 1, out + 6); - addsub_no_clamp_sse4_1(u2, u5, out + 2, out + 5); - addsub_no_clamp_sse4_1(u3, u4, out + 3, out + 4); - } else { + addsub_sse4_1(u0, u7, out + 0, out + 7, &clamp_lo, &clamp_hi); + addsub_sse4_1(u1, u6, out + 1, out + 6, &clamp_lo, &clamp_hi); + addsub_sse4_1(u2, u5, out + 2, out + 5, &clamp_lo, &clamp_hi); + addsub_sse4_1(u3, u4, out + 3, out + 4, &clamp_lo, &clamp_hi); + + if (!do_cols) { const int log_range_out = AOMMAX(16, bd + 6); - const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX( - -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift)))); - const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN( - (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift)))); - 
addsub_shift_sse4_1(u0, u7, out + 0, out + 7, &clamp_lo_out, &clamp_hi_out, - out_shift); - addsub_shift_sse4_1(u1, u6, out + 1, out + 6, &clamp_lo_out, &clamp_hi_out, - out_shift); - addsub_shift_sse4_1(u2, u5, out + 2, out + 5, &clamp_lo_out, &clamp_hi_out, - out_shift); - addsub_shift_sse4_1(u3, u4, out + 3, out + 4, &clamp_lo_out, &clamp_hi_out, - out_shift); + const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); + const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + + round_shift_4x4(out, out_shift); + round_shift_4x4(out + 4, out_shift); + highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 8); } } @@ -1683,56 +1854,50 @@ static void idct16x16_low1_sse4_1(__m128i *in, __m128i *out, int bit, const int32_t *cospi = cospi_arr(bit); const __m128i cospi32 = _mm_set1_epi32(cospi[32]); const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); - const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); - const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); - const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); - - { - // stage 0 - // stage 1 - // stage 2 - // stage 3 - // stage 4 - in[0] = _mm_mullo_epi32(in[0], cospi32); - in[0] = _mm_add_epi32(in[0], rnding); - in[0] = _mm_srai_epi32(in[0], bit); + int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); + __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + // stage 0 + // stage 1 + // stage 2 + // stage 3 + // stage 4 + in[0] = _mm_mullo_epi32(in[0], cospi32); + in[0] = _mm_add_epi32(in[0], rnding); + in[0] = _mm_srai_epi32(in[0], bit); - // stage 5 - // stage 6 - // stage 7 - if (do_cols) { - in[0] = _mm_max_epi32(in[0], clamp_lo); - in[0] = _mm_min_epi32(in[0], clamp_hi); - } else { - const int log_range_out = AOMMAX(16, bd + 6); - const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX( - -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift)))); - const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN( - (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift)))); + // stage 5 + // stage 6 + // stage 7 + if (!do_cols) { + log_range = AOMMAX(16, bd + 6); + clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + if (out_shift != 0) { __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1); in[0] = _mm_add_epi32(in[0], offset); in[0] = _mm_sra_epi32(in[0], _mm_cvtsi32_si128(out_shift)); - in[0] = _mm_max_epi32(in[0], clamp_lo_out); - in[0] = _mm_min_epi32(in[0], clamp_hi_out); } - - out[0] = in[0]; - out[1] = in[0]; - out[2] = in[0]; - out[3] = in[0]; - out[4] = in[0]; - out[5] = in[0]; - out[6] = in[0]; - out[7] = in[0]; - out[8] = in[0]; - out[9] = in[0]; - out[10] = in[0]; - out[11] = in[0]; - out[12] = in[0]; - out[13] = in[0]; - out[14] = in[0]; - out[15] = in[0]; } + + in[0] = _mm_max_epi32(in[0], clamp_lo); + in[0] = _mm_min_epi32(in[0], clamp_hi); + out[0] = in[0]; + out[1] = in[0]; + out[2] = in[0]; + out[3] = in[0]; + out[4] = in[0]; + out[5] = in[0]; + out[6] = in[0]; + out[7] = in[0]; + out[8] = in[0]; + out[9] = in[0]; + out[10] = in[0]; + out[11] = in[0]; + out[12] = in[0]; + out[13] = in[0]; + out[14] = in[0]; + out[15] = in[0]; } static void idct16x16_low8_sse4_1(__m128i *in, __m128i *out, int bit, @@ 
-1760,140 +1925,120 @@ static void idct16x16_low8_sse4_1(__m128i *in, __m128i *out, int bit, const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); __m128i u[16], x, y; + // stage 0 + // stage 1 + u[0] = in[0]; + u[2] = in[4]; + u[4] = in[2]; + u[6] = in[6]; + u[8] = in[1]; + u[10] = in[5]; + u[12] = in[3]; + u[14] = in[7]; - { - // stage 0 - // stage 1 - u[0] = in[0]; - u[2] = in[4]; - u[4] = in[2]; - u[6] = in[6]; - u[8] = in[1]; - u[10] = in[5]; - u[12] = in[3]; - u[14] = in[7]; - - // stage 2 - u[15] = half_btf_0_sse4_1(&cospi4, &u[8], &rnding, bit); - u[8] = half_btf_0_sse4_1(&cospi60, &u[8], &rnding, bit); - - u[9] = half_btf_0_sse4_1(&cospim36, &u[14], &rnding, bit); - u[14] = half_btf_0_sse4_1(&cospi28, &u[14], &rnding, bit); - - u[13] = half_btf_0_sse4_1(&cospi20, &u[10], &rnding, bit); - u[10] = half_btf_0_sse4_1(&cospi44, &u[10], &rnding, bit); - - u[11] = half_btf_0_sse4_1(&cospim52, &u[12], &rnding, bit); - u[12] = half_btf_0_sse4_1(&cospi12, &u[12], &rnding, bit); + // stage 2 + u[15] = half_btf_0_sse4_1(&cospi4, &u[8], &rnding, bit); + u[8] = half_btf_0_sse4_1(&cospi60, &u[8], &rnding, bit); - // stage 3 - u[7] = half_btf_0_sse4_1(&cospi8, &u[4], &rnding, bit); - u[4] = half_btf_0_sse4_1(&cospi56, &u[4], &rnding, bit); - u[5] = half_btf_0_sse4_1(&cospim40, &u[6], &rnding, bit); - u[6] = half_btf_0_sse4_1(&cospi24, &u[6], &rnding, bit); + u[9] = half_btf_0_sse4_1(&cospim36, &u[14], &rnding, bit); + u[14] = half_btf_0_sse4_1(&cospi28, &u[14], &rnding, bit); - addsub_sse4_1(u[8], u[9], &u[8], &u[9], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[11], u[10], &u[11], &u[10], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[12], u[13], &u[12], &u[13], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[15], u[14], &u[15], &u[14], &clamp_lo, &clamp_hi); + u[13] = half_btf_0_sse4_1(&cospi20, &u[10], &rnding, bit); + u[10] = half_btf_0_sse4_1(&cospi44, &u[10], &rnding, bit); - // stage 4 - x = 
_mm_mullo_epi32(u[0], cospi32); - u[0] = _mm_add_epi32(x, rnding); - u[0] = _mm_srai_epi32(u[0], bit); - u[1] = u[0]; + u[11] = half_btf_0_sse4_1(&cospim52, &u[12], &rnding, bit); + u[12] = half_btf_0_sse4_1(&cospi12, &u[12], &rnding, bit); - u[3] = half_btf_0_sse4_1(&cospi16, &u[2], &rnding, bit); - u[2] = half_btf_0_sse4_1(&cospi48, &u[2], &rnding, bit); + // stage 3 + u[7] = half_btf_0_sse4_1(&cospi8, &u[4], &rnding, bit); + u[4] = half_btf_0_sse4_1(&cospi56, &u[4], &rnding, bit); + u[5] = half_btf_0_sse4_1(&cospim40, &u[6], &rnding, bit); + u[6] = half_btf_0_sse4_1(&cospi24, &u[6], &rnding, bit); - addsub_sse4_1(u[4], u[5], &u[4], &u[5], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[7], u[6], &u[7], &u[6], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[8], u[9], &u[8], &u[9], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[11], u[10], &u[11], &u[10], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[12], u[13], &u[12], &u[13], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[15], u[14], &u[15], &u[14], &clamp_lo, &clamp_hi); - x = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit); - u[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit); - u[9] = x; - y = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit); - u[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit); - u[10] = y; + // stage 4 + x = _mm_mullo_epi32(u[0], cospi32); + u[0] = _mm_add_epi32(x, rnding); + u[0] = _mm_srai_epi32(u[0], bit); + u[1] = u[0]; - // stage 5 - addsub_sse4_1(u[0], u[3], &u[0], &u[3], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[1], u[2], &u[1], &u[2], &clamp_lo, &clamp_hi); + u[3] = half_btf_0_sse4_1(&cospi16, &u[2], &rnding, bit); + u[2] = half_btf_0_sse4_1(&cospi48, &u[2], &rnding, bit); - x = _mm_mullo_epi32(u[5], cospi32); - y = _mm_mullo_epi32(u[6], cospi32); - u[5] = _mm_sub_epi32(y, x); - u[5] = _mm_add_epi32(u[5], rnding); - u[5] = _mm_srai_epi32(u[5], bit); + addsub_sse4_1(u[4], u[5], &u[4], &u[5], &clamp_lo, &clamp_hi); + 
addsub_sse4_1(u[7], u[6], &u[7], &u[6], &clamp_lo, &clamp_hi); - u[6] = _mm_add_epi32(y, x); - u[6] = _mm_add_epi32(u[6], rnding); - u[6] = _mm_srai_epi32(u[6], bit); + x = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit); + u[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit); + u[9] = x; + y = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit); + u[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit); + u[10] = y; - addsub_sse4_1(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi); + // stage 5 + addsub_sse4_1(u[0], u[3], &u[0], &u[3], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[1], u[2], &u[1], &u[2], &clamp_lo, &clamp_hi); - // stage 6 - addsub_sse4_1(u[0], u[7], &u[0], &u[7], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[1], u[6], &u[1], &u[6], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[2], u[5], &u[2], &u[5], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[3], u[4], &u[3], &u[4], &clamp_lo, &clamp_hi); + x = _mm_mullo_epi32(u[5], cospi32); + y = _mm_mullo_epi32(u[6], cospi32); + u[5] = _mm_sub_epi32(y, x); + u[5] = _mm_add_epi32(u[5], rnding); + u[5] = _mm_srai_epi32(u[5], bit); - x = _mm_mullo_epi32(u[10], cospi32); - y = _mm_mullo_epi32(u[13], cospi32); - u[10] = _mm_sub_epi32(y, x); - u[10] = _mm_add_epi32(u[10], rnding); - u[10] = _mm_srai_epi32(u[10], bit); + u[6] = _mm_add_epi32(y, x); + u[6] = _mm_add_epi32(u[6], rnding); + u[6] = _mm_srai_epi32(u[6], bit); - u[13] = _mm_add_epi32(x, y); - u[13] = _mm_add_epi32(u[13], rnding); - u[13] = _mm_srai_epi32(u[13], bit); + addsub_sse4_1(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[14], u[13], 
&u[14], &u[13], &clamp_lo, &clamp_hi); - x = _mm_mullo_epi32(u[11], cospi32); - y = _mm_mullo_epi32(u[12], cospi32); - u[11] = _mm_sub_epi32(y, x); - u[11] = _mm_add_epi32(u[11], rnding); - u[11] = _mm_srai_epi32(u[11], bit); + // stage 6 + addsub_sse4_1(u[0], u[7], &u[0], &u[7], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[1], u[6], &u[1], &u[6], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[2], u[5], &u[2], &u[5], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[3], u[4], &u[3], &u[4], &clamp_lo, &clamp_hi); + + x = _mm_mullo_epi32(u[10], cospi32); + y = _mm_mullo_epi32(u[13], cospi32); + u[10] = _mm_sub_epi32(y, x); + u[10] = _mm_add_epi32(u[10], rnding); + u[10] = _mm_srai_epi32(u[10], bit); + + u[13] = _mm_add_epi32(x, y); + u[13] = _mm_add_epi32(u[13], rnding); + u[13] = _mm_srai_epi32(u[13], bit); + + x = _mm_mullo_epi32(u[11], cospi32); + y = _mm_mullo_epi32(u[12], cospi32); + u[11] = _mm_sub_epi32(y, x); + u[11] = _mm_add_epi32(u[11], rnding); + u[11] = _mm_srai_epi32(u[11], bit); + + u[12] = _mm_add_epi32(x, y); + u[12] = _mm_add_epi32(u[12], rnding); + u[12] = _mm_srai_epi32(u[12], bit); + // stage 7 + addsub_sse4_1(u[0], u[15], out + 0, out + 15, &clamp_lo, &clamp_hi); + addsub_sse4_1(u[1], u[14], out + 1, out + 14, &clamp_lo, &clamp_hi); + addsub_sse4_1(u[2], u[13], out + 2, out + 13, &clamp_lo, &clamp_hi); + addsub_sse4_1(u[3], u[12], out + 3, out + 12, &clamp_lo, &clamp_hi); + addsub_sse4_1(u[4], u[11], out + 4, out + 11, &clamp_lo, &clamp_hi); + addsub_sse4_1(u[5], u[10], out + 5, out + 10, &clamp_lo, &clamp_hi); + addsub_sse4_1(u[6], u[9], out + 6, out + 9, &clamp_lo, &clamp_hi); + addsub_sse4_1(u[7], u[8], out + 7, out + 8, &clamp_lo, &clamp_hi); - u[12] = _mm_add_epi32(x, y); - u[12] = _mm_add_epi32(u[12], rnding); - u[12] = _mm_srai_epi32(u[12], bit); - // stage 7 - if (do_cols) { - addsub_no_clamp_sse4_1(u[0], u[15], out + 0, out + 15); - addsub_no_clamp_sse4_1(u[1], u[14], out + 1, out + 14); - addsub_no_clamp_sse4_1(u[2], u[13], out + 2, out + 13); - 
addsub_no_clamp_sse4_1(u[3], u[12], out + 3, out + 12); - addsub_no_clamp_sse4_1(u[4], u[11], out + 4, out + 11); - addsub_no_clamp_sse4_1(u[5], u[10], out + 5, out + 10); - addsub_no_clamp_sse4_1(u[6], u[9], out + 6, out + 9); - addsub_no_clamp_sse4_1(u[7], u[8], out + 7, out + 8); - } else { - const int log_range_out = AOMMAX(16, bd + 6); - const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX( - -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift)))); - const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN( - (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift)))); - - addsub_shift_sse4_1(u[0], u[15], out + 0, out + 15, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(u[1], u[14], out + 1, out + 14, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(u[2], u[13], out + 2, out + 13, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(u[3], u[12], out + 3, out + 12, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(u[4], u[11], out + 4, out + 11, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(u[5], u[10], out + 5, out + 10, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(u[6], u[9], out + 6, out + 9, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(u[7], u[8], out + 7, out + 8, &clamp_lo_out, - &clamp_hi_out, out_shift); - } + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); + const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + round_shift_8x8(out, out_shift); + highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 16); } } @@ -1910,167 +2055,162 @@ static void iadst16x16_low1_sse4_1(__m128i *in, __m128i *out, int bit, const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); const __m128i zero = _mm_setzero_si128(); __m128i v[16], x, y, temp1, temp2; + // stage 0 + // stage 1 + // stage 2 + x = 
_mm_mullo_epi32(in[0], cospi62); + v[0] = _mm_add_epi32(x, rnding); + v[0] = _mm_srai_epi32(v[0], bit); - // Calculate the column 0, 1, 2, 3 - { - // stage 0 - // stage 1 - // stage 2 - x = _mm_mullo_epi32(in[0], cospi62); - v[0] = _mm_add_epi32(x, rnding); - v[0] = _mm_srai_epi32(v[0], bit); - - x = _mm_mullo_epi32(in[0], cospi2); - v[1] = _mm_sub_epi32(zero, x); - v[1] = _mm_add_epi32(v[1], rnding); - v[1] = _mm_srai_epi32(v[1], bit); + x = _mm_mullo_epi32(in[0], cospi2); + v[1] = _mm_sub_epi32(zero, x); + v[1] = _mm_add_epi32(v[1], rnding); + v[1] = _mm_srai_epi32(v[1], bit); - // stage 3 - v[8] = v[0]; - v[9] = v[1]; + // stage 3 + v[8] = v[0]; + v[9] = v[1]; - // stage 4 - temp1 = _mm_mullo_epi32(v[8], cospi8); - x = _mm_mullo_epi32(v[9], cospi56); - temp1 = _mm_add_epi32(temp1, x); - temp1 = _mm_add_epi32(temp1, rnding); - temp1 = _mm_srai_epi32(temp1, bit); - - temp2 = _mm_mullo_epi32(v[8], cospi56); - x = _mm_mullo_epi32(v[9], cospi8); - temp2 = _mm_sub_epi32(temp2, x); - temp2 = _mm_add_epi32(temp2, rnding); - temp2 = _mm_srai_epi32(temp2, bit); - v[8] = temp1; - v[9] = temp2; + // stage 4 + temp1 = _mm_mullo_epi32(v[8], cospi8); + x = _mm_mullo_epi32(v[9], cospi56); + temp1 = _mm_add_epi32(temp1, x); + temp1 = _mm_add_epi32(temp1, rnding); + temp1 = _mm_srai_epi32(temp1, bit); - // stage 5 - v[4] = v[0]; - v[5] = v[1]; - v[12] = v[8]; - v[13] = v[9]; + temp2 = _mm_mullo_epi32(v[8], cospi56); + x = _mm_mullo_epi32(v[9], cospi8); + temp2 = _mm_sub_epi32(temp2, x); + temp2 = _mm_add_epi32(temp2, rnding); + temp2 = _mm_srai_epi32(temp2, bit); + v[8] = temp1; + v[9] = temp2; - // stage 6 - temp1 = _mm_mullo_epi32(v[4], cospi16); - x = _mm_mullo_epi32(v[5], cospi48); - temp1 = _mm_add_epi32(temp1, x); - temp1 = _mm_add_epi32(temp1, rnding); - temp1 = _mm_srai_epi32(temp1, bit); - - temp2 = _mm_mullo_epi32(v[4], cospi48); - x = _mm_mullo_epi32(v[5], cospi16); - temp2 = _mm_sub_epi32(temp2, x); - temp2 = _mm_add_epi32(temp2, rnding); - temp2 = 
_mm_srai_epi32(temp2, bit); - v[4] = temp1; - v[5] = temp2; - - temp1 = _mm_mullo_epi32(v[12], cospi16); - x = _mm_mullo_epi32(v[13], cospi48); - temp1 = _mm_add_epi32(temp1, x); - temp1 = _mm_add_epi32(temp1, rnding); - temp1 = _mm_srai_epi32(temp1, bit); - - temp2 = _mm_mullo_epi32(v[12], cospi48); - x = _mm_mullo_epi32(v[13], cospi16); - temp2 = _mm_sub_epi32(temp2, x); - temp2 = _mm_add_epi32(temp2, rnding); - temp2 = _mm_srai_epi32(temp2, bit); - v[12] = temp1; - v[13] = temp2; + // stage 5 + v[4] = v[0]; + v[5] = v[1]; + v[12] = v[8]; + v[13] = v[9]; - // stage 7 - v[2] = v[0]; - v[3] = v[1]; - v[6] = v[4]; - v[7] = v[5]; - v[10] = v[8]; - v[11] = v[9]; - v[14] = v[12]; - v[15] = v[13]; + // stage 6 + temp1 = _mm_mullo_epi32(v[4], cospi16); + x = _mm_mullo_epi32(v[5], cospi48); + temp1 = _mm_add_epi32(temp1, x); + temp1 = _mm_add_epi32(temp1, rnding); + temp1 = _mm_srai_epi32(temp1, bit); - // stage 8 - y = _mm_mullo_epi32(v[2], cospi32); - x = _mm_mullo_epi32(v[3], cospi32); - v[2] = _mm_add_epi32(y, x); - v[2] = _mm_add_epi32(v[2], rnding); - v[2] = _mm_srai_epi32(v[2], bit); + temp2 = _mm_mullo_epi32(v[4], cospi48); + x = _mm_mullo_epi32(v[5], cospi16); + temp2 = _mm_sub_epi32(temp2, x); + temp2 = _mm_add_epi32(temp2, rnding); + temp2 = _mm_srai_epi32(temp2, bit); + v[4] = temp1; + v[5] = temp2; + + temp1 = _mm_mullo_epi32(v[12], cospi16); + x = _mm_mullo_epi32(v[13], cospi48); + temp1 = _mm_add_epi32(temp1, x); + temp1 = _mm_add_epi32(temp1, rnding); + temp1 = _mm_srai_epi32(temp1, bit); - v[3] = _mm_sub_epi32(y, x); - v[3] = _mm_add_epi32(v[3], rnding); - v[3] = _mm_srai_epi32(v[3], bit); + temp2 = _mm_mullo_epi32(v[12], cospi48); + x = _mm_mullo_epi32(v[13], cospi16); + temp2 = _mm_sub_epi32(temp2, x); + temp2 = _mm_add_epi32(temp2, rnding); + temp2 = _mm_srai_epi32(temp2, bit); + v[12] = temp1; + v[13] = temp2; - y = _mm_mullo_epi32(v[6], cospi32); - x = _mm_mullo_epi32(v[7], cospi32); - v[6] = _mm_add_epi32(y, x); - v[6] = _mm_add_epi32(v[6], rnding); 
- v[6] = _mm_srai_epi32(v[6], bit); - - v[7] = _mm_sub_epi32(y, x); - v[7] = _mm_add_epi32(v[7], rnding); - v[7] = _mm_srai_epi32(v[7], bit); - - y = _mm_mullo_epi32(v[10], cospi32); - x = _mm_mullo_epi32(v[11], cospi32); - v[10] = _mm_add_epi32(y, x); - v[10] = _mm_add_epi32(v[10], rnding); - v[10] = _mm_srai_epi32(v[10], bit); + // stage 7 + v[2] = v[0]; + v[3] = v[1]; + v[6] = v[4]; + v[7] = v[5]; + v[10] = v[8]; + v[11] = v[9]; + v[14] = v[12]; + v[15] = v[13]; - v[11] = _mm_sub_epi32(y, x); - v[11] = _mm_add_epi32(v[11], rnding); - v[11] = _mm_srai_epi32(v[11], bit); + // stage 8 + y = _mm_mullo_epi32(v[2], cospi32); + x = _mm_mullo_epi32(v[3], cospi32); + v[2] = _mm_add_epi32(y, x); + v[2] = _mm_add_epi32(v[2], rnding); + v[2] = _mm_srai_epi32(v[2], bit); - y = _mm_mullo_epi32(v[14], cospi32); - x = _mm_mullo_epi32(v[15], cospi32); - v[14] = _mm_add_epi32(y, x); - v[14] = _mm_add_epi32(v[14], rnding); - v[14] = _mm_srai_epi32(v[14], bit); + v[3] = _mm_sub_epi32(y, x); + v[3] = _mm_add_epi32(v[3], rnding); + v[3] = _mm_srai_epi32(v[3], bit); - v[15] = _mm_sub_epi32(y, x); - v[15] = _mm_add_epi32(v[15], rnding); - v[15] = _mm_srai_epi32(v[15], bit); + y = _mm_mullo_epi32(v[6], cospi32); + x = _mm_mullo_epi32(v[7], cospi32); + v[6] = _mm_add_epi32(y, x); + v[6] = _mm_add_epi32(v[6], rnding); + v[6] = _mm_srai_epi32(v[6], bit); + + v[7] = _mm_sub_epi32(y, x); + v[7] = _mm_add_epi32(v[7], rnding); + v[7] = _mm_srai_epi32(v[7], bit); + + y = _mm_mullo_epi32(v[10], cospi32); + x = _mm_mullo_epi32(v[11], cospi32); + v[10] = _mm_add_epi32(y, x); + v[10] = _mm_add_epi32(v[10], rnding); + v[10] = _mm_srai_epi32(v[10], bit); + + v[11] = _mm_sub_epi32(y, x); + v[11] = _mm_add_epi32(v[11], rnding); + v[11] = _mm_srai_epi32(v[11], bit); + + y = _mm_mullo_epi32(v[14], cospi32); + x = _mm_mullo_epi32(v[15], cospi32); + v[14] = _mm_add_epi32(y, x); + v[14] = _mm_add_epi32(v[14], rnding); + v[14] = _mm_srai_epi32(v[14], bit); + + v[15] = _mm_sub_epi32(y, x); + v[15] = 
_mm_add_epi32(v[15], rnding); + v[15] = _mm_srai_epi32(v[15], bit); - // stage 9 - if (do_cols) { - out[0] = v[0]; - out[1] = _mm_sub_epi32(_mm_setzero_si128(), v[8]); - out[2] = v[12]; - out[3] = _mm_sub_epi32(_mm_setzero_si128(), v[4]); - out[4] = v[6]; - out[5] = _mm_sub_epi32(_mm_setzero_si128(), v[14]); - out[6] = v[10]; - out[7] = _mm_sub_epi32(_mm_setzero_si128(), v[2]); - out[8] = v[3]; - out[9] = _mm_sub_epi32(_mm_setzero_si128(), v[11]); - out[10] = v[15]; - out[11] = _mm_sub_epi32(_mm_setzero_si128(), v[7]); - out[12] = v[5]; - out[13] = _mm_sub_epi32(_mm_setzero_si128(), v[13]); - out[14] = v[9]; - out[15] = _mm_sub_epi32(_mm_setzero_si128(), v[1]); - } else { - const int log_range_out = AOMMAX(16, bd + 6); - const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); - const __m128i clamp_hi_out = - _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + // stage 9 + if (do_cols) { + out[0] = v[0]; + out[1] = _mm_sub_epi32(zero, v[8]); + out[2] = v[12]; + out[3] = _mm_sub_epi32(zero, v[4]); + out[4] = v[6]; + out[5] = _mm_sub_epi32(zero, v[14]); + out[6] = v[10]; + out[7] = _mm_sub_epi32(zero, v[2]); + out[8] = v[3]; + out[9] = _mm_sub_epi32(zero, v[11]); + out[10] = v[15]; + out[11] = _mm_sub_epi32(zero, v[7]); + out[12] = v[5]; + out[13] = _mm_sub_epi32(zero, v[13]); + out[14] = v[9]; + out[15] = _mm_sub_epi32(zero, v[1]); + } else { + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); + const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); - neg_shift_sse4_1(v[0], v[8], out + 0, out + 1, &clamp_lo_out, - &clamp_hi_out, out_shift); - neg_shift_sse4_1(v[12], v[4], out + 2, out + 3, &clamp_lo_out, - &clamp_hi_out, out_shift); - neg_shift_sse4_1(v[6], v[14], out + 4, out + 5, &clamp_lo_out, - &clamp_hi_out, out_shift); - neg_shift_sse4_1(v[10], v[2], out + 6, out + 7, &clamp_lo_out, - &clamp_hi_out, out_shift); - neg_shift_sse4_1(v[3], v[11], out + 8, 
out + 9, &clamp_lo_out, - &clamp_hi_out, out_shift); - neg_shift_sse4_1(v[15], v[7], out + 10, out + 11, &clamp_lo_out, - &clamp_hi_out, out_shift); - neg_shift_sse4_1(v[5], v[13], out + 12, out + 13, &clamp_lo_out, - &clamp_hi_out, out_shift); - neg_shift_sse4_1(v[9], v[1], out + 14, out + 15, &clamp_lo_out, - &clamp_hi_out, out_shift); - } + neg_shift_sse4_1(v[0], v[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_sse4_1(v[12], v[4], out + 2, out + 3, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(v[6], v[14], out + 4, out + 5, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(v[10], v[2], out + 6, out + 7, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(v[3], v[11], out + 8, out + 9, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(v[15], v[7], out + 10, out + 11, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(v[5], v[13], out + 12, out + 13, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(v[9], v[1], out + 14, out + 15, &clamp_lo_out, + &clamp_hi_out, out_shift); } } @@ -2107,291 +2247,287 @@ static void iadst16x16_low8_sse4_1(__m128i *in, __m128i *out, int bit, const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + __m128i zero = _mm_setzero_si128(); __m128i u[16], x, y; - // Calculate the column 0, 1, 2, 3 - { - // stage 0 - // stage 1 - // stage 2 - __m128i zero = _mm_setzero_si128(); - x = _mm_mullo_epi32(in[0], cospi62); - u[0] = _mm_add_epi32(x, rnding); - u[0] = _mm_srai_epi32(u[0], bit); - - x = _mm_mullo_epi32(in[0], cospi2); - u[1] = _mm_sub_epi32(zero, x); - u[1] = _mm_add_epi32(u[1], rnding); - u[1] = _mm_srai_epi32(u[1], bit); - - x = _mm_mullo_epi32(in[2], cospi54); - u[2] = _mm_add_epi32(x, rnding); - u[2] = _mm_srai_epi32(u[2], bit); - - x = _mm_mullo_epi32(in[2], cospi10); - u[3] = _mm_sub_epi32(zero, x); - u[3] = _mm_add_epi32(u[3], rnding); - u[3] = _mm_srai_epi32(u[3], bit); - - x = _mm_mullo_epi32(in[4], cospi46); - u[4] = _mm_add_epi32(x, rnding); - u[4] = _mm_srai_epi32(u[4], bit); - - x = _mm_mullo_epi32(in[4], cospi18); - u[5] = _mm_sub_epi32(zero, x); - u[5] = _mm_add_epi32(u[5], rnding); - u[5] = _mm_srai_epi32(u[5], bit); + // stage 0 + // stage 1 + // stage 2 + x = _mm_mullo_epi32(in[0], cospi62); + u[0] = _mm_add_epi32(x, rnding); + u[0] = _mm_srai_epi32(u[0], bit); - x = _mm_mullo_epi32(in[6], cospi38); - u[6] = _mm_add_epi32(x, rnding); - u[6] = _mm_srai_epi32(u[6], bit); + x = _mm_mullo_epi32(in[0], cospi2); + u[1] = _mm_sub_epi32(zero, x); + u[1] = _mm_add_epi32(u[1], rnding); + u[1] = _mm_srai_epi32(u[1], bit); - x = _mm_mullo_epi32(in[6], cospi26); - u[7] = _mm_sub_epi32(zero, x); - u[7] = _mm_add_epi32(u[7], rnding); - u[7] = _mm_srai_epi32(u[7], bit); + x = _mm_mullo_epi32(in[2], cospi54); + u[2] = _mm_add_epi32(x, rnding); + u[2] = _mm_srai_epi32(u[2], bit); - u[8] = _mm_mullo_epi32(in[7], cospi34); - u[8] = _mm_add_epi32(u[8], rnding); - u[8] = _mm_srai_epi32(u[8], bit); + x = _mm_mullo_epi32(in[2], cospi10); + u[3] = _mm_sub_epi32(zero, x); + u[3] = _mm_add_epi32(u[3], rnding); + u[3] = 
_mm_srai_epi32(u[3], bit); - u[9] = _mm_mullo_epi32(in[7], cospi30); - u[9] = _mm_add_epi32(u[9], rnding); - u[9] = _mm_srai_epi32(u[9], bit); + x = _mm_mullo_epi32(in[4], cospi46); + u[4] = _mm_add_epi32(x, rnding); + u[4] = _mm_srai_epi32(u[4], bit); - u[10] = _mm_mullo_epi32(in[5], cospi42); - u[10] = _mm_add_epi32(u[10], rnding); - u[10] = _mm_srai_epi32(u[10], bit); + x = _mm_mullo_epi32(in[4], cospi18); + u[5] = _mm_sub_epi32(zero, x); + u[5] = _mm_add_epi32(u[5], rnding); + u[5] = _mm_srai_epi32(u[5], bit); - u[11] = _mm_mullo_epi32(in[5], cospi22); - u[11] = _mm_add_epi32(u[11], rnding); - u[11] = _mm_srai_epi32(u[11], bit); + x = _mm_mullo_epi32(in[6], cospi38); + u[6] = _mm_add_epi32(x, rnding); + u[6] = _mm_srai_epi32(u[6], bit); - u[12] = _mm_mullo_epi32(in[3], cospi50); - u[12] = _mm_add_epi32(u[12], rnding); - u[12] = _mm_srai_epi32(u[12], bit); + x = _mm_mullo_epi32(in[6], cospi26); + u[7] = _mm_sub_epi32(zero, x); + u[7] = _mm_add_epi32(u[7], rnding); + u[7] = _mm_srai_epi32(u[7], bit); - u[13] = _mm_mullo_epi32(in[3], cospi14); - u[13] = _mm_add_epi32(u[13], rnding); - u[13] = _mm_srai_epi32(u[13], bit); + u[8] = _mm_mullo_epi32(in[7], cospi34); + u[8] = _mm_add_epi32(u[8], rnding); + u[8] = _mm_srai_epi32(u[8], bit); - u[14] = _mm_mullo_epi32(in[1], cospi58); - u[14] = _mm_add_epi32(u[14], rnding); - u[14] = _mm_srai_epi32(u[14], bit); + u[9] = _mm_mullo_epi32(in[7], cospi30); + u[9] = _mm_add_epi32(u[9], rnding); + u[9] = _mm_srai_epi32(u[9], bit); - u[15] = _mm_mullo_epi32(in[1], cospi6); - u[15] = _mm_add_epi32(u[15], rnding); - u[15] = _mm_srai_epi32(u[15], bit); + u[10] = _mm_mullo_epi32(in[5], cospi42); + u[10] = _mm_add_epi32(u[10], rnding); + u[10] = _mm_srai_epi32(u[10], bit); - // stage 3 - addsub_sse4_1(u[0], u[8], &u[0], &u[8], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[1], u[9], &u[1], &u[9], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[2], u[10], &u[2], &u[10], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[3], u[11], &u[3], &u[11], &clamp_lo, 
&clamp_hi); - addsub_sse4_1(u[4], u[12], &u[4], &u[12], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[5], u[13], &u[5], &u[13], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[6], u[14], &u[6], &u[14], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[7], u[15], &u[7], &u[15], &clamp_lo, &clamp_hi); + u[11] = _mm_mullo_epi32(in[5], cospi22); + u[11] = _mm_add_epi32(u[11], rnding); + u[11] = _mm_srai_epi32(u[11], bit); - // stage 4 - y = _mm_mullo_epi32(u[8], cospi56); - x = _mm_mullo_epi32(u[9], cospi56); - u[8] = _mm_mullo_epi32(u[8], cospi8); - u[8] = _mm_add_epi32(u[8], x); - u[8] = _mm_add_epi32(u[8], rnding); - u[8] = _mm_srai_epi32(u[8], bit); - - x = _mm_mullo_epi32(u[9], cospi8); - u[9] = _mm_sub_epi32(y, x); - u[9] = _mm_add_epi32(u[9], rnding); - u[9] = _mm_srai_epi32(u[9], bit); - - x = _mm_mullo_epi32(u[11], cospi24); - y = _mm_mullo_epi32(u[10], cospi24); - u[10] = _mm_mullo_epi32(u[10], cospi40); - u[10] = _mm_add_epi32(u[10], x); - u[10] = _mm_add_epi32(u[10], rnding); - u[10] = _mm_srai_epi32(u[10], bit); - - x = _mm_mullo_epi32(u[11], cospi40); - u[11] = _mm_sub_epi32(y, x); - u[11] = _mm_add_epi32(u[11], rnding); - u[11] = _mm_srai_epi32(u[11], bit); - - x = _mm_mullo_epi32(u[13], cospi8); - y = _mm_mullo_epi32(u[12], cospi8); - u[12] = _mm_mullo_epi32(u[12], cospim56); - u[12] = _mm_add_epi32(u[12], x); - u[12] = _mm_add_epi32(u[12], rnding); - u[12] = _mm_srai_epi32(u[12], bit); - - x = _mm_mullo_epi32(u[13], cospim56); - u[13] = _mm_sub_epi32(y, x); - u[13] = _mm_add_epi32(u[13], rnding); - u[13] = _mm_srai_epi32(u[13], bit); - - x = _mm_mullo_epi32(u[15], cospi40); - y = _mm_mullo_epi32(u[14], cospi40); - u[14] = _mm_mullo_epi32(u[14], cospim24); - u[14] = _mm_add_epi32(u[14], x); - u[14] = _mm_add_epi32(u[14], rnding); - u[14] = _mm_srai_epi32(u[14], bit); - - x = _mm_mullo_epi32(u[15], cospim24); - u[15] = _mm_sub_epi32(y, x); - u[15] = _mm_add_epi32(u[15], rnding); - u[15] = _mm_srai_epi32(u[15], bit); + u[12] = _mm_mullo_epi32(in[3], cospi50); + u[12] = 
_mm_add_epi32(u[12], rnding); + u[12] = _mm_srai_epi32(u[12], bit); - // stage 5 - addsub_sse4_1(u[0], u[4], &u[0], &u[4], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[1], u[5], &u[1], &u[5], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[2], u[6], &u[2], &u[6], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[3], u[7], &u[3], &u[7], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[8], u[12], &u[8], &u[12], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[9], u[13], &u[9], &u[13], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[10], u[14], &u[10], &u[14], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[11], u[15], &u[11], &u[15], &clamp_lo, &clamp_hi); + u[13] = _mm_mullo_epi32(in[3], cospi14); + u[13] = _mm_add_epi32(u[13], rnding); + u[13] = _mm_srai_epi32(u[13], bit); - // stage 6 - x = _mm_mullo_epi32(u[5], cospi48); - y = _mm_mullo_epi32(u[4], cospi48); - u[4] = _mm_mullo_epi32(u[4], cospi16); - u[4] = _mm_add_epi32(u[4], x); - u[4] = _mm_add_epi32(u[4], rnding); - u[4] = _mm_srai_epi32(u[4], bit); - - x = _mm_mullo_epi32(u[5], cospi16); - u[5] = _mm_sub_epi32(y, x); - u[5] = _mm_add_epi32(u[5], rnding); - u[5] = _mm_srai_epi32(u[5], bit); + u[14] = _mm_mullo_epi32(in[1], cospi58); + u[14] = _mm_add_epi32(u[14], rnding); + u[14] = _mm_srai_epi32(u[14], bit); - x = _mm_mullo_epi32(u[7], cospi16); - y = _mm_mullo_epi32(u[6], cospi16); - u[6] = _mm_mullo_epi32(u[6], cospim48); - u[6] = _mm_add_epi32(u[6], x); - u[6] = _mm_add_epi32(u[6], rnding); - u[6] = _mm_srai_epi32(u[6], bit); + u[15] = _mm_mullo_epi32(in[1], cospi6); + u[15] = _mm_add_epi32(u[15], rnding); + u[15] = _mm_srai_epi32(u[15], bit); - x = _mm_mullo_epi32(u[7], cospim48); - u[7] = _mm_sub_epi32(y, x); - u[7] = _mm_add_epi32(u[7], rnding); - u[7] = _mm_srai_epi32(u[7], bit); - - x = _mm_mullo_epi32(u[13], cospi48); - y = _mm_mullo_epi32(u[12], cospi48); - u[12] = _mm_mullo_epi32(u[12], cospi16); - u[12] = _mm_add_epi32(u[12], x); - u[12] = _mm_add_epi32(u[12], rnding); - u[12] = _mm_srai_epi32(u[12], bit); - - x = _mm_mullo_epi32(u[13], 
cospi16); - u[13] = _mm_sub_epi32(y, x); - u[13] = _mm_add_epi32(u[13], rnding); - u[13] = _mm_srai_epi32(u[13], bit); - - x = _mm_mullo_epi32(u[15], cospi16); - y = _mm_mullo_epi32(u[14], cospi16); - u[14] = _mm_mullo_epi32(u[14], cospim48); - u[14] = _mm_add_epi32(u[14], x); - u[14] = _mm_add_epi32(u[14], rnding); - u[14] = _mm_srai_epi32(u[14], bit); - - x = _mm_mullo_epi32(u[15], cospim48); - u[15] = _mm_sub_epi32(y, x); - u[15] = _mm_add_epi32(u[15], rnding); - u[15] = _mm_srai_epi32(u[15], bit); + // stage 3 + addsub_sse4_1(u[0], u[8], &u[0], &u[8], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[1], u[9], &u[1], &u[9], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[2], u[10], &u[2], &u[10], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[3], u[11], &u[3], &u[11], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[4], u[12], &u[4], &u[12], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[5], u[13], &u[5], &u[13], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[6], u[14], &u[6], &u[14], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[7], u[15], &u[7], &u[15], &clamp_lo, &clamp_hi); - // stage 7 - addsub_sse4_1(u[0], u[2], &u[0], &u[2], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[1], u[3], &u[1], &u[3], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[4], u[6], &u[4], &u[6], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[5], u[7], &u[5], &u[7], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[8], u[10], &u[8], &u[10], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[9], u[11], &u[9], &u[11], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[12], u[14], &u[12], &u[14], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[13], u[15], &u[13], &u[15], &clamp_lo, &clamp_hi); + // stage 4 + y = _mm_mullo_epi32(u[8], cospi56); + x = _mm_mullo_epi32(u[9], cospi56); + u[8] = _mm_mullo_epi32(u[8], cospi8); + u[8] = _mm_add_epi32(u[8], x); + u[8] = _mm_add_epi32(u[8], rnding); + u[8] = _mm_srai_epi32(u[8], bit); + + x = _mm_mullo_epi32(u[9], cospi8); + u[9] = _mm_sub_epi32(y, x); + u[9] = _mm_add_epi32(u[9], rnding); + u[9] = _mm_srai_epi32(u[9], bit); + + x = 
_mm_mullo_epi32(u[11], cospi24); + y = _mm_mullo_epi32(u[10], cospi24); + u[10] = _mm_mullo_epi32(u[10], cospi40); + u[10] = _mm_add_epi32(u[10], x); + u[10] = _mm_add_epi32(u[10], rnding); + u[10] = _mm_srai_epi32(u[10], bit); + + x = _mm_mullo_epi32(u[11], cospi40); + u[11] = _mm_sub_epi32(y, x); + u[11] = _mm_add_epi32(u[11], rnding); + u[11] = _mm_srai_epi32(u[11], bit); + + x = _mm_mullo_epi32(u[13], cospi8); + y = _mm_mullo_epi32(u[12], cospi8); + u[12] = _mm_mullo_epi32(u[12], cospim56); + u[12] = _mm_add_epi32(u[12], x); + u[12] = _mm_add_epi32(u[12], rnding); + u[12] = _mm_srai_epi32(u[12], bit); + + x = _mm_mullo_epi32(u[13], cospim56); + u[13] = _mm_sub_epi32(y, x); + u[13] = _mm_add_epi32(u[13], rnding); + u[13] = _mm_srai_epi32(u[13], bit); + + x = _mm_mullo_epi32(u[15], cospi40); + y = _mm_mullo_epi32(u[14], cospi40); + u[14] = _mm_mullo_epi32(u[14], cospim24); + u[14] = _mm_add_epi32(u[14], x); + u[14] = _mm_add_epi32(u[14], rnding); + u[14] = _mm_srai_epi32(u[14], bit); + + x = _mm_mullo_epi32(u[15], cospim24); + u[15] = _mm_sub_epi32(y, x); + u[15] = _mm_add_epi32(u[15], rnding); + u[15] = _mm_srai_epi32(u[15], bit); - // stage 8 - y = _mm_mullo_epi32(u[2], cospi32); - x = _mm_mullo_epi32(u[3], cospi32); - u[2] = _mm_add_epi32(y, x); - u[2] = _mm_add_epi32(u[2], rnding); - u[2] = _mm_srai_epi32(u[2], bit); - - u[3] = _mm_sub_epi32(y, x); - u[3] = _mm_add_epi32(u[3], rnding); - u[3] = _mm_srai_epi32(u[3], bit); - y = _mm_mullo_epi32(u[6], cospi32); - x = _mm_mullo_epi32(u[7], cospi32); - u[6] = _mm_add_epi32(y, x); - u[6] = _mm_add_epi32(u[6], rnding); - u[6] = _mm_srai_epi32(u[6], bit); + // stage 5 + addsub_sse4_1(u[0], u[4], &u[0], &u[4], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[1], u[5], &u[1], &u[5], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[2], u[6], &u[2], &u[6], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[3], u[7], &u[3], &u[7], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[8], u[12], &u[8], &u[12], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[9], u[13], 
&u[9], &u[13], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[10], u[14], &u[10], &u[14], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[11], u[15], &u[11], &u[15], &clamp_lo, &clamp_hi); - u[7] = _mm_sub_epi32(y, x); - u[7] = _mm_add_epi32(u[7], rnding); - u[7] = _mm_srai_epi32(u[7], bit); + // stage 6 + x = _mm_mullo_epi32(u[5], cospi48); + y = _mm_mullo_epi32(u[4], cospi48); + u[4] = _mm_mullo_epi32(u[4], cospi16); + u[4] = _mm_add_epi32(u[4], x); + u[4] = _mm_add_epi32(u[4], rnding); + u[4] = _mm_srai_epi32(u[4], bit); - y = _mm_mullo_epi32(u[10], cospi32); - x = _mm_mullo_epi32(u[11], cospi32); - u[10] = _mm_add_epi32(y, x); - u[10] = _mm_add_epi32(u[10], rnding); - u[10] = _mm_srai_epi32(u[10], bit); + x = _mm_mullo_epi32(u[5], cospi16); + u[5] = _mm_sub_epi32(y, x); + u[5] = _mm_add_epi32(u[5], rnding); + u[5] = _mm_srai_epi32(u[5], bit); + + x = _mm_mullo_epi32(u[7], cospi16); + y = _mm_mullo_epi32(u[6], cospi16); + u[6] = _mm_mullo_epi32(u[6], cospim48); + u[6] = _mm_add_epi32(u[6], x); + u[6] = _mm_add_epi32(u[6], rnding); + u[6] = _mm_srai_epi32(u[6], bit); - u[11] = _mm_sub_epi32(y, x); - u[11] = _mm_add_epi32(u[11], rnding); - u[11] = _mm_srai_epi32(u[11], bit); + x = _mm_mullo_epi32(u[7], cospim48); + u[7] = _mm_sub_epi32(y, x); + u[7] = _mm_add_epi32(u[7], rnding); + u[7] = _mm_srai_epi32(u[7], bit); - y = _mm_mullo_epi32(u[14], cospi32); - x = _mm_mullo_epi32(u[15], cospi32); - u[14] = _mm_add_epi32(y, x); - u[14] = _mm_add_epi32(u[14], rnding); - u[14] = _mm_srai_epi32(u[14], bit); + x = _mm_mullo_epi32(u[13], cospi48); + y = _mm_mullo_epi32(u[12], cospi48); + u[12] = _mm_mullo_epi32(u[12], cospi16); + u[12] = _mm_add_epi32(u[12], x); + u[12] = _mm_add_epi32(u[12], rnding); + u[12] = _mm_srai_epi32(u[12], bit); + + x = _mm_mullo_epi32(u[13], cospi16); + u[13] = _mm_sub_epi32(y, x); + u[13] = _mm_add_epi32(u[13], rnding); + u[13] = _mm_srai_epi32(u[13], bit); + + x = _mm_mullo_epi32(u[15], cospi16); + y = _mm_mullo_epi32(u[14], cospi16); + u[14] = 
_mm_mullo_epi32(u[14], cospim48); + u[14] = _mm_add_epi32(u[14], x); + u[14] = _mm_add_epi32(u[14], rnding); + u[14] = _mm_srai_epi32(u[14], bit); + + x = _mm_mullo_epi32(u[15], cospim48); + u[15] = _mm_sub_epi32(y, x); + u[15] = _mm_add_epi32(u[15], rnding); + u[15] = _mm_srai_epi32(u[15], bit); - u[15] = _mm_sub_epi32(y, x); - u[15] = _mm_add_epi32(u[15], rnding); - u[15] = _mm_srai_epi32(u[15], bit); + // stage 7 + addsub_sse4_1(u[0], u[2], &u[0], &u[2], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[1], u[3], &u[1], &u[3], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[4], u[6], &u[4], &u[6], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[5], u[7], &u[5], &u[7], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[8], u[10], &u[8], &u[10], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[9], u[11], &u[9], &u[11], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[12], u[14], &u[12], &u[14], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[13], u[15], &u[13], &u[15], &clamp_lo, &clamp_hi); - // stage 9 - if (do_cols) { - out[0] = u[0]; - out[1] = _mm_sub_epi32(_mm_setzero_si128(), u[8]); - out[2] = u[12]; - out[3] = _mm_sub_epi32(_mm_setzero_si128(), u[4]); - out[4] = u[6]; - out[5] = _mm_sub_epi32(_mm_setzero_si128(), u[14]); - out[6] = u[10]; - out[7] = _mm_sub_epi32(_mm_setzero_si128(), u[2]); - out[8] = u[3]; - out[9] = _mm_sub_epi32(_mm_setzero_si128(), u[11]); - out[10] = u[15]; - out[11] = _mm_sub_epi32(_mm_setzero_si128(), u[7]); - out[12] = u[5]; - out[13] = _mm_sub_epi32(_mm_setzero_si128(), u[13]); - out[14] = u[9]; - out[15] = _mm_sub_epi32(_mm_setzero_si128(), u[1]); - } else { - const int log_range_out = AOMMAX(16, bd + 6); - const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); - const __m128i clamp_hi_out = - _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + // stage 8 + y = _mm_mullo_epi32(u[2], cospi32); + x = _mm_mullo_epi32(u[3], cospi32); + u[2] = _mm_add_epi32(y, x); + u[2] = _mm_add_epi32(u[2], rnding); + u[2] = _mm_srai_epi32(u[2], bit); - neg_shift_sse4_1(u[0], u[8], out + 
0, out + 1, &clamp_lo_out, - &clamp_hi_out, out_shift); - neg_shift_sse4_1(u[12], u[4], out + 2, out + 3, &clamp_lo_out, - &clamp_hi_out, out_shift); - neg_shift_sse4_1(u[6], u[14], out + 4, out + 5, &clamp_lo_out, - &clamp_hi_out, out_shift); - neg_shift_sse4_1(u[10], u[2], out + 6, out + 7, &clamp_lo_out, - &clamp_hi_out, out_shift); - neg_shift_sse4_1(u[3], u[11], out + 8, out + 9, &clamp_lo_out, - &clamp_hi_out, out_shift); - neg_shift_sse4_1(u[15], u[7], out + 10, out + 11, &clamp_lo_out, - &clamp_hi_out, out_shift); - neg_shift_sse4_1(u[5], u[13], out + 12, out + 13, &clamp_lo_out, - &clamp_hi_out, out_shift); - neg_shift_sse4_1(u[9], u[1], out + 14, out + 15, &clamp_lo_out, - &clamp_hi_out, out_shift); - } + u[3] = _mm_sub_epi32(y, x); + u[3] = _mm_add_epi32(u[3], rnding); + u[3] = _mm_srai_epi32(u[3], bit); + y = _mm_mullo_epi32(u[6], cospi32); + x = _mm_mullo_epi32(u[7], cospi32); + u[6] = _mm_add_epi32(y, x); + u[6] = _mm_add_epi32(u[6], rnding); + u[6] = _mm_srai_epi32(u[6], bit); + + u[7] = _mm_sub_epi32(y, x); + u[7] = _mm_add_epi32(u[7], rnding); + u[7] = _mm_srai_epi32(u[7], bit); + + y = _mm_mullo_epi32(u[10], cospi32); + x = _mm_mullo_epi32(u[11], cospi32); + u[10] = _mm_add_epi32(y, x); + u[10] = _mm_add_epi32(u[10], rnding); + u[10] = _mm_srai_epi32(u[10], bit); + + u[11] = _mm_sub_epi32(y, x); + u[11] = _mm_add_epi32(u[11], rnding); + u[11] = _mm_srai_epi32(u[11], bit); + + y = _mm_mullo_epi32(u[14], cospi32); + x = _mm_mullo_epi32(u[15], cospi32); + u[14] = _mm_add_epi32(y, x); + u[14] = _mm_add_epi32(u[14], rnding); + u[14] = _mm_srai_epi32(u[14], bit); + + u[15] = _mm_sub_epi32(y, x); + u[15] = _mm_add_epi32(u[15], rnding); + u[15] = _mm_srai_epi32(u[15], bit); + + // stage 9 + if (do_cols) { + out[0] = u[0]; + out[1] = _mm_sub_epi32(zero, u[8]); + out[2] = u[12]; + out[3] = _mm_sub_epi32(zero, u[4]); + out[4] = u[6]; + out[5] = _mm_sub_epi32(zero, u[14]); + out[6] = u[10]; + out[7] = _mm_sub_epi32(zero, u[2]); + out[8] = u[3]; + out[9] = 
_mm_sub_epi32(zero, u[11]); + out[10] = u[15]; + out[11] = _mm_sub_epi32(zero, u[7]); + out[12] = u[5]; + out[13] = _mm_sub_epi32(zero, u[13]); + out[14] = u[9]; + out[15] = _mm_sub_epi32(zero, u[1]); + } else { + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); + const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + + neg_shift_sse4_1(u[0], u[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_sse4_1(u[12], u[4], out + 2, out + 3, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(u[6], u[14], out + 4, out + 5, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(u[10], u[2], out + 6, out + 7, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(u[3], u[11], out + 8, out + 9, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(u[15], u[7], out + 10, out + 11, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(u[5], u[13], out + 12, out + 13, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(u[9], u[1], out + 14, out + 15, &clamp_lo_out, + &clamp_hi_out, out_shift); } } @@ -2557,38 +2693,22 @@ static void idct16x16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, v[15] = u[15]; // stage 7 - if (do_cols) { - addsub_no_clamp_sse4_1(v[0], v[15], out + 0, out + 15); - addsub_no_clamp_sse4_1(v[1], v[14], out + 1, out + 14); - addsub_no_clamp_sse4_1(v[2], v[13], out + 2, out + 13); - addsub_no_clamp_sse4_1(v[3], v[12], out + 3, out + 12); - addsub_no_clamp_sse4_1(v[4], v[11], out + 4, out + 11); - addsub_no_clamp_sse4_1(v[5], v[10], out + 5, out + 10); - addsub_no_clamp_sse4_1(v[6], v[9], out + 6, out + 9); - addsub_no_clamp_sse4_1(v[7], v[8], out + 7, out + 8); - } else { + addsub_sse4_1(v[0], v[15], out + 0, out + 15, &clamp_lo, &clamp_hi); + addsub_sse4_1(v[1], v[14], out + 1, out + 14, &clamp_lo, &clamp_hi); + addsub_sse4_1(v[2], v[13], out + 2, out + 13, &clamp_lo, 
&clamp_hi); + addsub_sse4_1(v[3], v[12], out + 3, out + 12, &clamp_lo, &clamp_hi); + addsub_sse4_1(v[4], v[11], out + 4, out + 11, &clamp_lo, &clamp_hi); + addsub_sse4_1(v[5], v[10], out + 5, out + 10, &clamp_lo, &clamp_hi); + addsub_sse4_1(v[6], v[9], out + 6, out + 9, &clamp_lo, &clamp_hi); + addsub_sse4_1(v[7], v[8], out + 7, out + 8, &clamp_lo, &clamp_hi); + + if (!do_cols) { const int log_range_out = AOMMAX(16, bd + 6); - const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX( - -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift)))); - const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN( - (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift)))); - - addsub_shift_sse4_1(v[0], v[15], out + 0, out + 15, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(v[1], v[14], out + 1, out + 14, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(v[2], v[13], out + 2, out + 13, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(v[3], v[12], out + 3, out + 12, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(v[4], v[11], out + 4, out + 11, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(v[5], v[10], out + 5, out + 10, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(v[6], v[9], out + 6, out + 9, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(v[7], v[8], out + 7, out + 8, &clamp_lo_out, - &clamp_hi_out, out_shift); + const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); + const __m128i clamp_hi_out = + _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + round_shift_8x8(out, out_shift); + highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 16); } } } @@ -2626,353 +2746,381 @@ static void iadst16x16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + const __m128i zero = _mm_setzero_si128(); __m128i u[16], v[16], x, y; - // Calculate the column 0, 1, 2, 3 - { - // stage 0 - // stage 1 - // stage 2 - v[0] = _mm_mullo_epi32(in[15], cospi2); - x = _mm_mullo_epi32(in[0], cospi62); - v[0] = _mm_add_epi32(v[0], x); - v[0] = _mm_add_epi32(v[0], rnding); - v[0] = _mm_srai_epi32(v[0], bit); - - v[1] = _mm_mullo_epi32(in[15], cospi62); - x = _mm_mullo_epi32(in[0], cospi2); - v[1] = _mm_sub_epi32(v[1], x); - v[1] = _mm_add_epi32(v[1], rnding); - v[1] = _mm_srai_epi32(v[1], bit); - - v[2] = _mm_mullo_epi32(in[13], cospi10); - x = _mm_mullo_epi32(in[2], cospi54); - v[2] = _mm_add_epi32(v[2], x); - v[2] = _mm_add_epi32(v[2], rnding); - v[2] = _mm_srai_epi32(v[2], bit); - - v[3] = _mm_mullo_epi32(in[13], cospi54); - x = _mm_mullo_epi32(in[2], cospi10); - v[3] = _mm_sub_epi32(v[3], x); - v[3] = _mm_add_epi32(v[3], rnding); - v[3] = _mm_srai_epi32(v[3], bit); - - v[4] = _mm_mullo_epi32(in[11], cospi18); - x = _mm_mullo_epi32(in[4], cospi46); - v[4] = _mm_add_epi32(v[4], x); - v[4] = _mm_add_epi32(v[4], rnding); - v[4] = _mm_srai_epi32(v[4], bit); - - v[5] = _mm_mullo_epi32(in[11], cospi46); - x = _mm_mullo_epi32(in[4], cospi18); - v[5] = _mm_sub_epi32(v[5], x); - v[5] = _mm_add_epi32(v[5], rnding); - v[5] = _mm_srai_epi32(v[5], bit); - - v[6] = _mm_mullo_epi32(in[9], cospi26); - x = _mm_mullo_epi32(in[6], cospi38); - v[6] = _mm_add_epi32(v[6], x); - v[6] = _mm_add_epi32(v[6], rnding); - v[6] = _mm_srai_epi32(v[6], bit); - - v[7] = _mm_mullo_epi32(in[9], cospi38); - x = _mm_mullo_epi32(in[6], cospi26); - v[7] = _mm_sub_epi32(v[7], x); - v[7] = _mm_add_epi32(v[7], rnding); - v[7] = _mm_srai_epi32(v[7], bit); - - v[8] = _mm_mullo_epi32(in[7], cospi34); - x = _mm_mullo_epi32(in[8], cospi30); - v[8] = _mm_add_epi32(v[8], x); - v[8] = _mm_add_epi32(v[8], rnding); - v[8] = 
_mm_srai_epi32(v[8], bit); - - v[9] = _mm_mullo_epi32(in[7], cospi30); - x = _mm_mullo_epi32(in[8], cospi34); - v[9] = _mm_sub_epi32(v[9], x); - v[9] = _mm_add_epi32(v[9], rnding); - v[9] = _mm_srai_epi32(v[9], bit); - - v[10] = _mm_mullo_epi32(in[5], cospi42); - x = _mm_mullo_epi32(in[10], cospi22); - v[10] = _mm_add_epi32(v[10], x); - v[10] = _mm_add_epi32(v[10], rnding); - v[10] = _mm_srai_epi32(v[10], bit); - - v[11] = _mm_mullo_epi32(in[5], cospi22); - x = _mm_mullo_epi32(in[10], cospi42); - v[11] = _mm_sub_epi32(v[11], x); - v[11] = _mm_add_epi32(v[11], rnding); - v[11] = _mm_srai_epi32(v[11], bit); - - v[12] = _mm_mullo_epi32(in[3], cospi50); - x = _mm_mullo_epi32(in[12], cospi14); - v[12] = _mm_add_epi32(v[12], x); - v[12] = _mm_add_epi32(v[12], rnding); - v[12] = _mm_srai_epi32(v[12], bit); - - v[13] = _mm_mullo_epi32(in[3], cospi14); - x = _mm_mullo_epi32(in[12], cospi50); - v[13] = _mm_sub_epi32(v[13], x); - v[13] = _mm_add_epi32(v[13], rnding); - v[13] = _mm_srai_epi32(v[13], bit); - - v[14] = _mm_mullo_epi32(in[1], cospi58); - x = _mm_mullo_epi32(in[14], cospi6); - v[14] = _mm_add_epi32(v[14], x); - v[14] = _mm_add_epi32(v[14], rnding); - v[14] = _mm_srai_epi32(v[14], bit); - - v[15] = _mm_mullo_epi32(in[1], cospi6); - x = _mm_mullo_epi32(in[14], cospi58); - v[15] = _mm_sub_epi32(v[15], x); - v[15] = _mm_add_epi32(v[15], rnding); - v[15] = _mm_srai_epi32(v[15], bit); - - // stage 3 - addsub_sse4_1(v[0], v[8], &u[0], &u[8], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[1], v[9], &u[1], &u[9], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[2], v[10], &u[2], &u[10], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[3], v[11], &u[3], &u[11], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[4], v[12], &u[4], &u[12], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[5], v[13], &u[5], &u[13], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[6], v[14], &u[6], &u[14], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[7], v[15], &u[7], &u[15], &clamp_lo, &clamp_hi); - - // stage 4 - v[0] = u[0]; - v[1] = u[1]; - 
v[2] = u[2]; - v[3] = u[3]; - v[4] = u[4]; - v[5] = u[5]; - v[6] = u[6]; - v[7] = u[7]; - - v[8] = _mm_mullo_epi32(u[8], cospi8); - x = _mm_mullo_epi32(u[9], cospi56); - v[8] = _mm_add_epi32(v[8], x); - v[8] = _mm_add_epi32(v[8], rnding); - v[8] = _mm_srai_epi32(v[8], bit); - - v[9] = _mm_mullo_epi32(u[8], cospi56); - x = _mm_mullo_epi32(u[9], cospi8); - v[9] = _mm_sub_epi32(v[9], x); - v[9] = _mm_add_epi32(v[9], rnding); - v[9] = _mm_srai_epi32(v[9], bit); - - v[10] = _mm_mullo_epi32(u[10], cospi40); - x = _mm_mullo_epi32(u[11], cospi24); - v[10] = _mm_add_epi32(v[10], x); - v[10] = _mm_add_epi32(v[10], rnding); - v[10] = _mm_srai_epi32(v[10], bit); - - v[11] = _mm_mullo_epi32(u[10], cospi24); - x = _mm_mullo_epi32(u[11], cospi40); - v[11] = _mm_sub_epi32(v[11], x); - v[11] = _mm_add_epi32(v[11], rnding); - v[11] = _mm_srai_epi32(v[11], bit); - - v[12] = _mm_mullo_epi32(u[12], cospim56); - x = _mm_mullo_epi32(u[13], cospi8); - v[12] = _mm_add_epi32(v[12], x); - v[12] = _mm_add_epi32(v[12], rnding); - v[12] = _mm_srai_epi32(v[12], bit); - - v[13] = _mm_mullo_epi32(u[12], cospi8); - x = _mm_mullo_epi32(u[13], cospim56); - v[13] = _mm_sub_epi32(v[13], x); - v[13] = _mm_add_epi32(v[13], rnding); - v[13] = _mm_srai_epi32(v[13], bit); - - v[14] = _mm_mullo_epi32(u[14], cospim24); - x = _mm_mullo_epi32(u[15], cospi40); - v[14] = _mm_add_epi32(v[14], x); - v[14] = _mm_add_epi32(v[14], rnding); - v[14] = _mm_srai_epi32(v[14], bit); - - v[15] = _mm_mullo_epi32(u[14], cospi40); - x = _mm_mullo_epi32(u[15], cospim24); - v[15] = _mm_sub_epi32(v[15], x); - v[15] = _mm_add_epi32(v[15], rnding); - v[15] = _mm_srai_epi32(v[15], bit); - - // stage 5 - addsub_sse4_1(v[0], v[4], &u[0], &u[4], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[1], v[5], &u[1], &u[5], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[2], v[6], &u[2], &u[6], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[3], v[7], &u[3], &u[7], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[8], v[12], &u[8], &u[12], &clamp_lo, &clamp_hi); - 
addsub_sse4_1(v[9], v[13], &u[9], &u[13], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[10], v[14], &u[10], &u[14], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[11], v[15], &u[11], &u[15], &clamp_lo, &clamp_hi); - - // stage 6 - v[0] = u[0]; - v[1] = u[1]; - v[2] = u[2]; - v[3] = u[3]; - - v[4] = _mm_mullo_epi32(u[4], cospi16); - x = _mm_mullo_epi32(u[5], cospi48); - v[4] = _mm_add_epi32(v[4], x); - v[4] = _mm_add_epi32(v[4], rnding); - v[4] = _mm_srai_epi32(v[4], bit); - - v[5] = _mm_mullo_epi32(u[4], cospi48); - x = _mm_mullo_epi32(u[5], cospi16); - v[5] = _mm_sub_epi32(v[5], x); - v[5] = _mm_add_epi32(v[5], rnding); - v[5] = _mm_srai_epi32(v[5], bit); - - v[6] = _mm_mullo_epi32(u[6], cospim48); - x = _mm_mullo_epi32(u[7], cospi16); - v[6] = _mm_add_epi32(v[6], x); - v[6] = _mm_add_epi32(v[6], rnding); - v[6] = _mm_srai_epi32(v[6], bit); - - v[7] = _mm_mullo_epi32(u[6], cospi16); - x = _mm_mullo_epi32(u[7], cospim48); - v[7] = _mm_sub_epi32(v[7], x); - v[7] = _mm_add_epi32(v[7], rnding); - v[7] = _mm_srai_epi32(v[7], bit); - - v[8] = u[8]; - v[9] = u[9]; - v[10] = u[10]; - v[11] = u[11]; - - v[12] = _mm_mullo_epi32(u[12], cospi16); - x = _mm_mullo_epi32(u[13], cospi48); - v[12] = _mm_add_epi32(v[12], x); - v[12] = _mm_add_epi32(v[12], rnding); - v[12] = _mm_srai_epi32(v[12], bit); - - v[13] = _mm_mullo_epi32(u[12], cospi48); - x = _mm_mullo_epi32(u[13], cospi16); - v[13] = _mm_sub_epi32(v[13], x); - v[13] = _mm_add_epi32(v[13], rnding); - v[13] = _mm_srai_epi32(v[13], bit); - - v[14] = _mm_mullo_epi32(u[14], cospim48); - x = _mm_mullo_epi32(u[15], cospi16); - v[14] = _mm_add_epi32(v[14], x); - v[14] = _mm_add_epi32(v[14], rnding); - v[14] = _mm_srai_epi32(v[14], bit); - - v[15] = _mm_mullo_epi32(u[14], cospi16); - x = _mm_mullo_epi32(u[15], cospim48); - v[15] = _mm_sub_epi32(v[15], x); - v[15] = _mm_add_epi32(v[15], rnding); - v[15] = _mm_srai_epi32(v[15], bit); - - // stage 7 - addsub_sse4_1(v[0], v[2], &u[0], &u[2], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[1], v[3], 
&u[1], &u[3], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[4], v[6], &u[4], &u[6], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[5], v[7], &u[5], &u[7], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[8], v[10], &u[8], &u[10], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[9], v[11], &u[9], &u[11], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[12], v[14], &u[12], &u[14], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[13], v[15], &u[13], &u[15], &clamp_lo, &clamp_hi); - - // stage 8 - v[0] = u[0]; - v[1] = u[1]; - - y = _mm_mullo_epi32(u[2], cospi32); - x = _mm_mullo_epi32(u[3], cospi32); - v[2] = _mm_add_epi32(y, x); - v[2] = _mm_add_epi32(v[2], rnding); - v[2] = _mm_srai_epi32(v[2], bit); - - v[3] = _mm_sub_epi32(y, x); - v[3] = _mm_add_epi32(v[3], rnding); - v[3] = _mm_srai_epi32(v[3], bit); - - v[4] = u[4]; - v[5] = u[5]; + // stage 0 + // stage 1 + // stage 2 + v[0] = _mm_mullo_epi32(in[15], cospi2); + x = _mm_mullo_epi32(in[0], cospi62); + v[0] = _mm_add_epi32(v[0], x); + v[0] = _mm_add_epi32(v[0], rnding); + v[0] = _mm_srai_epi32(v[0], bit); + + v[1] = _mm_mullo_epi32(in[15], cospi62); + x = _mm_mullo_epi32(in[0], cospi2); + v[1] = _mm_sub_epi32(v[1], x); + v[1] = _mm_add_epi32(v[1], rnding); + v[1] = _mm_srai_epi32(v[1], bit); + + v[2] = _mm_mullo_epi32(in[13], cospi10); + x = _mm_mullo_epi32(in[2], cospi54); + v[2] = _mm_add_epi32(v[2], x); + v[2] = _mm_add_epi32(v[2], rnding); + v[2] = _mm_srai_epi32(v[2], bit); + + v[3] = _mm_mullo_epi32(in[13], cospi54); + x = _mm_mullo_epi32(in[2], cospi10); + v[3] = _mm_sub_epi32(v[3], x); + v[3] = _mm_add_epi32(v[3], rnding); + v[3] = _mm_srai_epi32(v[3], bit); + + v[4] = _mm_mullo_epi32(in[11], cospi18); + x = _mm_mullo_epi32(in[4], cospi46); + v[4] = _mm_add_epi32(v[4], x); + v[4] = _mm_add_epi32(v[4], rnding); + v[4] = _mm_srai_epi32(v[4], bit); + + v[5] = _mm_mullo_epi32(in[11], cospi46); + x = _mm_mullo_epi32(in[4], cospi18); + v[5] = _mm_sub_epi32(v[5], x); + v[5] = _mm_add_epi32(v[5], rnding); + v[5] = _mm_srai_epi32(v[5], bit); + + v[6] = 
_mm_mullo_epi32(in[9], cospi26); + x = _mm_mullo_epi32(in[6], cospi38); + v[6] = _mm_add_epi32(v[6], x); + v[6] = _mm_add_epi32(v[6], rnding); + v[6] = _mm_srai_epi32(v[6], bit); + + v[7] = _mm_mullo_epi32(in[9], cospi38); + x = _mm_mullo_epi32(in[6], cospi26); + v[7] = _mm_sub_epi32(v[7], x); + v[7] = _mm_add_epi32(v[7], rnding); + v[7] = _mm_srai_epi32(v[7], bit); + + v[8] = _mm_mullo_epi32(in[7], cospi34); + x = _mm_mullo_epi32(in[8], cospi30); + v[8] = _mm_add_epi32(v[8], x); + v[8] = _mm_add_epi32(v[8], rnding); + v[8] = _mm_srai_epi32(v[8], bit); + + v[9] = _mm_mullo_epi32(in[7], cospi30); + x = _mm_mullo_epi32(in[8], cospi34); + v[9] = _mm_sub_epi32(v[9], x); + v[9] = _mm_add_epi32(v[9], rnding); + v[9] = _mm_srai_epi32(v[9], bit); + + v[10] = _mm_mullo_epi32(in[5], cospi42); + x = _mm_mullo_epi32(in[10], cospi22); + v[10] = _mm_add_epi32(v[10], x); + v[10] = _mm_add_epi32(v[10], rnding); + v[10] = _mm_srai_epi32(v[10], bit); + + v[11] = _mm_mullo_epi32(in[5], cospi22); + x = _mm_mullo_epi32(in[10], cospi42); + v[11] = _mm_sub_epi32(v[11], x); + v[11] = _mm_add_epi32(v[11], rnding); + v[11] = _mm_srai_epi32(v[11], bit); + + v[12] = _mm_mullo_epi32(in[3], cospi50); + x = _mm_mullo_epi32(in[12], cospi14); + v[12] = _mm_add_epi32(v[12], x); + v[12] = _mm_add_epi32(v[12], rnding); + v[12] = _mm_srai_epi32(v[12], bit); + + v[13] = _mm_mullo_epi32(in[3], cospi14); + x = _mm_mullo_epi32(in[12], cospi50); + v[13] = _mm_sub_epi32(v[13], x); + v[13] = _mm_add_epi32(v[13], rnding); + v[13] = _mm_srai_epi32(v[13], bit); + + v[14] = _mm_mullo_epi32(in[1], cospi58); + x = _mm_mullo_epi32(in[14], cospi6); + v[14] = _mm_add_epi32(v[14], x); + v[14] = _mm_add_epi32(v[14], rnding); + v[14] = _mm_srai_epi32(v[14], bit); + + v[15] = _mm_mullo_epi32(in[1], cospi6); + x = _mm_mullo_epi32(in[14], cospi58); + v[15] = _mm_sub_epi32(v[15], x); + v[15] = _mm_add_epi32(v[15], rnding); + v[15] = _mm_srai_epi32(v[15], bit); - y = _mm_mullo_epi32(u[6], cospi32); - x = 
_mm_mullo_epi32(u[7], cospi32); - v[6] = _mm_add_epi32(y, x); - v[6] = _mm_add_epi32(v[6], rnding); - v[6] = _mm_srai_epi32(v[6], bit); + // stage 3 + addsub_sse4_1(v[0], v[8], &u[0], &u[8], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[1], v[9], &u[1], &u[9], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[2], v[10], &u[2], &u[10], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[3], v[11], &u[3], &u[11], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[4], v[12], &u[4], &u[12], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[5], v[13], &u[5], &u[13], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[6], v[14], &u[6], &u[14], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[7], v[15], &u[7], &u[15], &clamp_lo, &clamp_hi); - v[7] = _mm_sub_epi32(y, x); - v[7] = _mm_add_epi32(v[7], rnding); - v[7] = _mm_srai_epi32(v[7], bit); + // stage 4 + v[0] = u[0]; + v[1] = u[1]; + v[2] = u[2]; + v[3] = u[3]; + v[4] = u[4]; + v[5] = u[5]; + v[6] = u[6]; + v[7] = u[7]; + + v[8] = _mm_mullo_epi32(u[8], cospi8); + x = _mm_mullo_epi32(u[9], cospi56); + v[8] = _mm_add_epi32(v[8], x); + v[8] = _mm_add_epi32(v[8], rnding); + v[8] = _mm_srai_epi32(v[8], bit); + + v[9] = _mm_mullo_epi32(u[8], cospi56); + x = _mm_mullo_epi32(u[9], cospi8); + v[9] = _mm_sub_epi32(v[9], x); + v[9] = _mm_add_epi32(v[9], rnding); + v[9] = _mm_srai_epi32(v[9], bit); + + v[10] = _mm_mullo_epi32(u[10], cospi40); + x = _mm_mullo_epi32(u[11], cospi24); + v[10] = _mm_add_epi32(v[10], x); + v[10] = _mm_add_epi32(v[10], rnding); + v[10] = _mm_srai_epi32(v[10], bit); + + v[11] = _mm_mullo_epi32(u[10], cospi24); + x = _mm_mullo_epi32(u[11], cospi40); + v[11] = _mm_sub_epi32(v[11], x); + v[11] = _mm_add_epi32(v[11], rnding); + v[11] = _mm_srai_epi32(v[11], bit); + + v[12] = _mm_mullo_epi32(u[12], cospim56); + x = _mm_mullo_epi32(u[13], cospi8); + v[12] = _mm_add_epi32(v[12], x); + v[12] = _mm_add_epi32(v[12], rnding); + v[12] = _mm_srai_epi32(v[12], bit); + + v[13] = _mm_mullo_epi32(u[12], cospi8); + x = _mm_mullo_epi32(u[13], cospim56); + v[13] = _mm_sub_epi32(v[13], 
x); + v[13] = _mm_add_epi32(v[13], rnding); + v[13] = _mm_srai_epi32(v[13], bit); + + v[14] = _mm_mullo_epi32(u[14], cospim24); + x = _mm_mullo_epi32(u[15], cospi40); + v[14] = _mm_add_epi32(v[14], x); + v[14] = _mm_add_epi32(v[14], rnding); + v[14] = _mm_srai_epi32(v[14], bit); + + v[15] = _mm_mullo_epi32(u[14], cospi40); + x = _mm_mullo_epi32(u[15], cospim24); + v[15] = _mm_sub_epi32(v[15], x); + v[15] = _mm_add_epi32(v[15], rnding); + v[15] = _mm_srai_epi32(v[15], bit); - v[8] = u[8]; - v[9] = u[9]; + // stage 5 + addsub_sse4_1(v[0], v[4], &u[0], &u[4], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[1], v[5], &u[1], &u[5], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[2], v[6], &u[2], &u[6], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[3], v[7], &u[3], &u[7], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[8], v[12], &u[8], &u[12], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[9], v[13], &u[9], &u[13], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[10], v[14], &u[10], &u[14], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[11], v[15], &u[11], &u[15], &clamp_lo, &clamp_hi); - y = _mm_mullo_epi32(u[10], cospi32); - x = _mm_mullo_epi32(u[11], cospi32); - v[10] = _mm_add_epi32(y, x); - v[10] = _mm_add_epi32(v[10], rnding); - v[10] = _mm_srai_epi32(v[10], bit); + // stage 6 + v[0] = u[0]; + v[1] = u[1]; + v[2] = u[2]; + v[3] = u[3]; + + v[4] = _mm_mullo_epi32(u[4], cospi16); + x = _mm_mullo_epi32(u[5], cospi48); + v[4] = _mm_add_epi32(v[4], x); + v[4] = _mm_add_epi32(v[4], rnding); + v[4] = _mm_srai_epi32(v[4], bit); + + v[5] = _mm_mullo_epi32(u[4], cospi48); + x = _mm_mullo_epi32(u[5], cospi16); + v[5] = _mm_sub_epi32(v[5], x); + v[5] = _mm_add_epi32(v[5], rnding); + v[5] = _mm_srai_epi32(v[5], bit); + + v[6] = _mm_mullo_epi32(u[6], cospim48); + x = _mm_mullo_epi32(u[7], cospi16); + v[6] = _mm_add_epi32(v[6], x); + v[6] = _mm_add_epi32(v[6], rnding); + v[6] = _mm_srai_epi32(v[6], bit); + + v[7] = _mm_mullo_epi32(u[6], cospi16); + x = _mm_mullo_epi32(u[7], cospim48); + v[7] = _mm_sub_epi32(v[7], x); + v[7] 
= _mm_add_epi32(v[7], rnding); + v[7] = _mm_srai_epi32(v[7], bit); + + v[8] = u[8]; + v[9] = u[9]; + v[10] = u[10]; + v[11] = u[11]; + + v[12] = _mm_mullo_epi32(u[12], cospi16); + x = _mm_mullo_epi32(u[13], cospi48); + v[12] = _mm_add_epi32(v[12], x); + v[12] = _mm_add_epi32(v[12], rnding); + v[12] = _mm_srai_epi32(v[12], bit); + + v[13] = _mm_mullo_epi32(u[12], cospi48); + x = _mm_mullo_epi32(u[13], cospi16); + v[13] = _mm_sub_epi32(v[13], x); + v[13] = _mm_add_epi32(v[13], rnding); + v[13] = _mm_srai_epi32(v[13], bit); + + v[14] = _mm_mullo_epi32(u[14], cospim48); + x = _mm_mullo_epi32(u[15], cospi16); + v[14] = _mm_add_epi32(v[14], x); + v[14] = _mm_add_epi32(v[14], rnding); + v[14] = _mm_srai_epi32(v[14], bit); + + v[15] = _mm_mullo_epi32(u[14], cospi16); + x = _mm_mullo_epi32(u[15], cospim48); + v[15] = _mm_sub_epi32(v[15], x); + v[15] = _mm_add_epi32(v[15], rnding); + v[15] = _mm_srai_epi32(v[15], bit); - v[11] = _mm_sub_epi32(y, x); - v[11] = _mm_add_epi32(v[11], rnding); - v[11] = _mm_srai_epi32(v[11], bit); + // stage 7 + addsub_sse4_1(v[0], v[2], &u[0], &u[2], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[1], v[3], &u[1], &u[3], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[4], v[6], &u[4], &u[6], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[5], v[7], &u[5], &u[7], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[8], v[10], &u[8], &u[10], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[9], v[11], &u[9], &u[11], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[12], v[14], &u[12], &u[14], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[13], v[15], &u[13], &u[15], &clamp_lo, &clamp_hi); - v[12] = u[12]; - v[13] = u[13]; + // stage 8 + v[0] = u[0]; + v[1] = u[1]; + + y = _mm_mullo_epi32(u[2], cospi32); + x = _mm_mullo_epi32(u[3], cospi32); + v[2] = _mm_add_epi32(y, x); + v[2] = _mm_add_epi32(v[2], rnding); + v[2] = _mm_srai_epi32(v[2], bit); + + v[3] = _mm_sub_epi32(y, x); + v[3] = _mm_add_epi32(v[3], rnding); + v[3] = _mm_srai_epi32(v[3], bit); + + v[4] = u[4]; + v[5] = u[5]; + + y = 
_mm_mullo_epi32(u[6], cospi32); + x = _mm_mullo_epi32(u[7], cospi32); + v[6] = _mm_add_epi32(y, x); + v[6] = _mm_add_epi32(v[6], rnding); + v[6] = _mm_srai_epi32(v[6], bit); + + v[7] = _mm_sub_epi32(y, x); + v[7] = _mm_add_epi32(v[7], rnding); + v[7] = _mm_srai_epi32(v[7], bit); + + v[8] = u[8]; + v[9] = u[9]; + + y = _mm_mullo_epi32(u[10], cospi32); + x = _mm_mullo_epi32(u[11], cospi32); + v[10] = _mm_add_epi32(y, x); + v[10] = _mm_add_epi32(v[10], rnding); + v[10] = _mm_srai_epi32(v[10], bit); + + v[11] = _mm_sub_epi32(y, x); + v[11] = _mm_add_epi32(v[11], rnding); + v[11] = _mm_srai_epi32(v[11], bit); + + v[12] = u[12]; + v[13] = u[13]; + + y = _mm_mullo_epi32(u[14], cospi32); + x = _mm_mullo_epi32(u[15], cospi32); + v[14] = _mm_add_epi32(y, x); + v[14] = _mm_add_epi32(v[14], rnding); + v[14] = _mm_srai_epi32(v[14], bit); + + v[15] = _mm_sub_epi32(y, x); + v[15] = _mm_add_epi32(v[15], rnding); + v[15] = _mm_srai_epi32(v[15], bit); - y = _mm_mullo_epi32(u[14], cospi32); - x = _mm_mullo_epi32(u[15], cospi32); - v[14] = _mm_add_epi32(y, x); - v[14] = _mm_add_epi32(v[14], rnding); - v[14] = _mm_srai_epi32(v[14], bit); + // stage 9 + if (do_cols) { + out[0] = v[0]; + out[1] = _mm_sub_epi32(zero, v[8]); + out[2] = v[12]; + out[3] = _mm_sub_epi32(zero, v[4]); + out[4] = v[6]; + out[5] = _mm_sub_epi32(zero, v[14]); + out[6] = v[10]; + out[7] = _mm_sub_epi32(zero, v[2]); + out[8] = v[3]; + out[9] = _mm_sub_epi32(zero, v[11]); + out[10] = v[15]; + out[11] = _mm_sub_epi32(zero, v[7]); + out[12] = v[5]; + out[13] = _mm_sub_epi32(zero, v[13]); + out[14] = v[9]; + out[15] = _mm_sub_epi32(zero, v[1]); + } else { + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); + const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); - v[15] = _mm_sub_epi32(y, x); - v[15] = _mm_add_epi32(v[15], rnding); - v[15] = _mm_srai_epi32(v[15], bit); + neg_shift_sse4_1(v[0], v[8], out + 0, out + 1, 
&clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_sse4_1(v[12], v[4], out + 2, out + 3, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(v[6], v[14], out + 4, out + 5, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(v[10], v[2], out + 6, out + 7, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(v[3], v[11], out + 8, out + 9, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(v[15], v[7], out + 10, out + 11, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(v[5], v[13], out + 12, out + 13, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(v[9], v[1], out + 14, out + 15, &clamp_lo_out, + &clamp_hi_out, out_shift); + } +} +static void iidentity16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, + int bd, int out_shift) { + (void)bit; + __m128i fact = _mm_set1_epi32(2 * NewSqrt2); + __m128i offset = _mm_set1_epi32(1 << (NewSqrt2Bits - 1)); + __m128i a0_low, a0_high, a1_low, a1_high; + __m128i zero = _mm_set1_epi32(0); + offset = _mm_unpacklo_epi32(offset, zero); - // stage 9 - if (do_cols) { - out[0] = v[0]; - out[1] = _mm_sub_epi32(_mm_setzero_si128(), v[8]); - out[2] = v[12]; - out[3] = _mm_sub_epi32(_mm_setzero_si128(), v[4]); - out[4] = v[6]; - out[5] = _mm_sub_epi32(_mm_setzero_si128(), v[14]); - out[6] = v[10]; - out[7] = _mm_sub_epi32(_mm_setzero_si128(), v[2]); - out[8] = v[3]; - out[9] = _mm_sub_epi32(_mm_setzero_si128(), v[11]); - out[10] = v[15]; - out[11] = _mm_sub_epi32(_mm_setzero_si128(), v[7]); - out[12] = v[5]; - out[13] = _mm_sub_epi32(_mm_setzero_si128(), v[13]); - out[14] = v[9]; - out[15] = _mm_sub_epi32(_mm_setzero_si128(), v[1]); - } else { - const int log_range_out = AOMMAX(16, bd + 6); - const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); - const __m128i clamp_hi_out = - _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + for (int i = 0; i < 16; i++) { + a0_low = _mm_mul_epi32(in[i], fact); + a0_low = _mm_add_epi32(a0_low, offset); 
+ a0_low = _mm_srli_epi64(a0_low, NewSqrt2Bits); + + a0_high = _mm_srli_si128(in[i], 4); + a0_high = _mm_mul_epi32(a0_high, fact); + a0_high = _mm_add_epi32(a0_high, offset); + a0_high = _mm_srli_epi64(a0_high, NewSqrt2Bits); + + a1_low = _mm_unpacklo_epi32(a0_low, a0_high); + a1_high = _mm_unpackhi_epi32(a0_low, a0_high); + out[i] = _mm_unpacklo_epi64(a1_low, a1_high); + } - neg_shift_sse4_1(v[0], v[8], out + 0, out + 1, &clamp_lo_out, - &clamp_hi_out, out_shift); - neg_shift_sse4_1(v[12], v[4], out + 2, out + 3, &clamp_lo_out, - &clamp_hi_out, out_shift); - neg_shift_sse4_1(v[6], v[14], out + 4, out + 5, &clamp_lo_out, - &clamp_hi_out, out_shift); - neg_shift_sse4_1(v[10], v[2], out + 6, out + 7, &clamp_lo_out, - &clamp_hi_out, out_shift); - neg_shift_sse4_1(v[3], v[11], out + 8, out + 9, &clamp_lo_out, - &clamp_hi_out, out_shift); - neg_shift_sse4_1(v[15], v[7], out + 10, out + 11, &clamp_lo_out, - &clamp_hi_out, out_shift); - neg_shift_sse4_1(v[5], v[13], out + 12, out + 13, &clamp_lo_out, - &clamp_hi_out, out_shift); - neg_shift_sse4_1(v[9], v[1], out + 14, out + 15, &clamp_lo_out, - &clamp_hi_out, out_shift); - } + if (!do_cols) { + const int log_range = AOMMAX(16, bd + 6); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + round_shift_8x8(out, out_shift); + highbd_clamp_epi32_sse4_1(out, out, &clamp_lo, &clamp_hi, 16); } } - static INLINE void idct64_stage8_sse4_1( __m128i *u, const __m128i *cospim32, const __m128i *cospi32, const __m128i *cospim16, const __m128i *cospi48, const __m128i *cospi16, @@ -3091,21 +3239,21 @@ static INLINE void idct64_stage10_sse4_1(__m128i *u, const __m128i *cospim32, static INLINE void idct64_stage11_sse4_1(__m128i *u, __m128i *out, int do_cols, int bd, int out_shift, - const int log_range) { - if (do_cols) { - for (int i = 0; i < 32; i++) { - addsub_no_clamp_sse4_1(u[i], u[63 - i], &out[(i)], &out[(63 - i)]); - } - } else { + const __m128i 
*clamp_lo, + const __m128i *clamp_hi) { + for (int i = 0; i < 32; i++) { + addsub_sse4_1(u[i], u[63 - i], out + i, out + 63 - i, clamp_lo, clamp_hi); + } + + if (!do_cols) { const int log_range_out = AOMMAX(16, bd + 6); - const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX( - -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift)))); - const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN( - (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift)))); - - for (int i = 0; i < 32; i++) { - addsub_shift_sse4_1(u[i], u[63 - i], &out[(i)], &out[(63 - i)], - &clamp_lo_out, &clamp_hi_out, out_shift); + const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); + const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + + for (int i = 0; i < 64; i += 4) { + round_shift_4x4(out + i, out_shift); + highbd_clamp_epi32_sse4_1(out + i, out + i, &clamp_lo_out, &clamp_hi_out, + 4); } } } @@ -3115,8 +3263,8 @@ static void idct64x64_low1_sse4_1(__m128i *in, __m128i *out, int bit, const int32_t *cospi = cospi_arr(bit); const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); - const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); - const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); const __m128i cospi32 = _mm_set1_epi32(cospi[32]); @@ -3135,88 +3283,82 @@ static void idct64x64_low1_sse4_1(__m128i *in, __m128i *out, int bit, // stage 9 // stage 10 // stage 11 - if (do_cols) { - x = _mm_max_epi32(x, clamp_lo); - x = _mm_min_epi32(x, clamp_hi); - } else { + if (!do_cols) { const int log_range_out = AOMMAX(16, bd + 6); - const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX( - -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift)))); - const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN( - (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift)))); - - __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1); - x = _mm_add_epi32(x, offset); - x = _mm_sra_epi32(x, _mm_cvtsi32_si128(out_shift)); - - x = _mm_max_epi32(x, clamp_lo_out); - x = _mm_min_epi32(x, clamp_hi_out); + clamp_lo = _mm_set1_epi32(-(1 << (log_range_out - 1))); + clamp_hi = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + if (out_shift != 0) { + __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1); + x = _mm_add_epi32(x, offset); + x = _mm_sra_epi32(x, _mm_cvtsi32_si128(out_shift)); + } } - + x = _mm_max_epi32(x, clamp_lo); + x = _mm_min_epi32(x, clamp_hi); out[0] = x; - out[63] = x; out[1] = x; - out[62] = x; out[2] = x; - out[61] = x; out[3] = x; - out[60] = x; out[4] = x; - out[59] = x; out[5] = x; - out[58] = x; out[6] = x; - out[57] = x; out[7] = x; - out[56] = x; out[8] = x; - out[55] = x; out[9] = x; - out[54] = x; out[10] = x; - out[53] = x; out[11] = x; - out[52] = x; out[12] = x; - out[51] = x; out[13] = x; - out[50] = x; out[14] = x; - out[49] = x; out[15] = x; - out[48] = x; out[16] = x; - out[47] = x; out[17] = x; - out[46] = x; out[18] = x; - out[45] = x; out[19] = x; - out[44] = 
x; out[20] = x; - out[43] = x; out[21] = x; - out[42] = x; out[22] = x; - out[41] = x; out[23] = x; - out[40] = x; out[24] = x; - out[39] = x; out[25] = x; - out[38] = x; out[26] = x; - out[37] = x; out[27] = x; - out[36] = x; out[28] = x; - out[35] = x; out[29] = x; - out[34] = x; out[30] = x; - out[33] = x; out[31] = x; out[32] = x; + out[33] = x; + out[34] = x; + out[35] = x; + out[36] = x; + out[37] = x; + out[38] = x; + out[39] = x; + out[40] = x; + out[41] = x; + out[42] = x; + out[43] = x; + out[44] = x; + out[45] = x; + out[46] = x; + out[47] = x; + out[48] = x; + out[49] = x; + out[50] = x; + out[51] = x; + out[52] = x; + out[53] = x; + out[54] = x; + out[55] = x; + out[56] = x; + out[57] = x; + out[58] = x; + out[59] = x; + out[60] = x; + out[61] = x; + out[62] = x; + out[63] = x; } } @@ -3434,7 +3576,6 @@ static void idct64x64_low8_sse4_1(__m128i *in, __m128i *out, int bit, u[6] = u[1]; u[5] = u[2]; u[4] = u[3]; - u[9] = u[9]; idct64_stage8_sse4_1(u, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16, &cospim48, &clamp_lo, &clamp_hi, &rnding, bit); @@ -3448,7 +3589,7 @@ static void idct64x64_low8_sse4_1(__m128i *in, __m128i *out, int bit, bit); // stage 11 - idct64_stage11_sse4_1(u, out, do_cols, bd, out_shift, log_range); + idct64_stage11_sse4_1(u, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi); } } @@ -3758,7 +3899,7 @@ static void idct64x64_low16_sse4_1(__m128i *in, __m128i *out, int bit, bit); // stage 11 - idct64_stage11_sse4_1(u, out, do_cols, bd, out_shift, log_range); + idct64_stage11_sse4_1(u, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi); } } @@ -4221,20 +4362,20 @@ static void idct64x64_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, for (i = 56; i < 64; i++) v[i] = u[i]; // stage 11 - if (do_cols) { - for (i = 0; i < 32; i++) { - addsub_no_clamp_sse4_1(v[i], v[63 - i], &out[(i)], &out[(63 - i)]); - } - } else { + for (i = 0; i < 32; i++) { + addsub_sse4_1(v[i], v[63 - i], &out[(i)], &out[(63 - i)], &clamp_lo, + &clamp_hi); + 
} + + if (!do_cols) { const int log_range_out = AOMMAX(16, bd + 6); - const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX( - -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift)))); - const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN( - (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift)))); - - for (i = 0; i < 32; i++) { - addsub_shift_sse4_1(v[i], v[63 - i], &out[(i)], &out[(63 - i)], - &clamp_lo_out, &clamp_hi_out, out_shift); + const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); + const __m128i clamp_hi_out = + _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + for (i = 0; i < 64; i += 4) { + round_shift_4x4(out + i, out_shift); + highbd_clamp_epi32_sse4_1(out + i, out + i, &clamp_lo_out, + &clamp_hi_out, 4); } } } @@ -4246,8 +4387,8 @@ static void idct32x32_low1_sse4_1(__m128i *in, __m128i *out, int bit, const __m128i cospi32 = _mm_set1_epi32(cospi[32]); const __m128i rounding = _mm_set1_epi32(1 << (bit - 1)); const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); - const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); - const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); __m128i bf1; // stage 0 @@ -4269,17 +4410,17 @@ static void idct32x32_low1_sse4_1(__m128i *in, __m128i *out, int bit, bf1 = _mm_min_epi32(bf1, clamp_hi); } else { const int log_range_out = AOMMAX(16, bd + 6); - const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX( - -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift)))); - const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN( - (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift)))); - - __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1); - bf1 = _mm_add_epi32(bf1, offset); - bf1 = _mm_sra_epi32(bf1, _mm_cvtsi32_si128(out_shift)); - bf1 = _mm_max_epi32(bf1, clamp_lo_out); - bf1 = _mm_min_epi32(bf1, clamp_hi_out); + clamp_lo = _mm_set1_epi32(-(1 << (log_range_out - 1))); + clamp_hi = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + if (out_shift != 0) { + __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1); + bf1 = _mm_add_epi32(bf1, offset); + bf1 = _mm_sra_epi32(bf1, _mm_cvtsi32_si128(out_shift)); + } } + + bf1 = _mm_max_epi32(bf1, clamp_lo); + bf1 = _mm_min_epi32(bf1, clamp_hi); out[0] = bf1; out[1] = bf1; out[2] = bf1; @@ -4422,7 +4563,7 @@ static void idct32x32_low8_sse4_1(__m128i *in, __m128i *out, int bit, &rounding, bit); // stage 9 - idct32_stage9_sse4_1(bf1, out, do_cols, bd, out_shift, log_range); + idct32_stage9_sse4_1(bf1, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi); } static void idct32x32_low16_sse4_1(__m128i *in, __m128i *out, int bit, @@ -4568,9 +4709,8 @@ static void idct32x32_low16_sse4_1(__m128i *in, __m128i *out, int bit, // stage 8 idct32_stage8_sse4_1(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rounding, bit); - // stage 9 - idct32_stage9_sse4_1(bf1, out, do_cols, bd, out_shift, 
log_range); + idct32_stage9_sse4_1(bf1, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi); } static void idct32x32_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, @@ -4926,62 +5066,30 @@ static void idct32x32_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, bf0[31] = bf1[31]; // stage 9 - if (do_cols) { - addsub_no_clamp_sse4_1(bf0[0], bf0[31], out + 0, out + 31); - addsub_no_clamp_sse4_1(bf0[1], bf0[30], out + 1, out + 30); - addsub_no_clamp_sse4_1(bf0[2], bf0[29], out + 2, out + 29); - addsub_no_clamp_sse4_1(bf0[3], bf0[28], out + 3, out + 28); - addsub_no_clamp_sse4_1(bf0[4], bf0[27], out + 4, out + 27); - addsub_no_clamp_sse4_1(bf0[5], bf0[26], out + 5, out + 26); - addsub_no_clamp_sse4_1(bf0[6], bf0[25], out + 6, out + 25); - addsub_no_clamp_sse4_1(bf0[7], bf0[24], out + 7, out + 24); - addsub_no_clamp_sse4_1(bf0[8], bf0[23], out + 8, out + 23); - addsub_no_clamp_sse4_1(bf0[9], bf0[22], out + 9, out + 22); - addsub_no_clamp_sse4_1(bf0[10], bf0[21], out + 10, out + 21); - addsub_no_clamp_sse4_1(bf0[11], bf0[20], out + 11, out + 20); - addsub_no_clamp_sse4_1(bf0[12], bf0[19], out + 12, out + 19); - addsub_no_clamp_sse4_1(bf0[13], bf0[18], out + 13, out + 18); - addsub_no_clamp_sse4_1(bf0[14], bf0[17], out + 14, out + 17); - addsub_no_clamp_sse4_1(bf0[15], bf0[16], out + 15, out + 16); - } else { + addsub_sse4_1(bf0[0], bf0[31], out + 0, out + 31, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[1], bf0[30], out + 1, out + 30, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[2], bf0[29], out + 2, out + 29, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[3], bf0[28], out + 3, out + 28, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[4], bf0[27], out + 4, out + 27, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[5], bf0[26], out + 5, out + 26, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[6], bf0[25], out + 6, out + 25, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[7], bf0[24], out + 7, out + 24, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[8], bf0[23], out + 8, out + 23, &clamp_lo, 
&clamp_hi); + addsub_sse4_1(bf0[9], bf0[22], out + 9, out + 22, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[10], bf0[21], out + 10, out + 21, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[11], bf0[20], out + 11, out + 20, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[12], bf0[19], out + 12, out + 19, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[13], bf0[18], out + 13, out + 18, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[14], bf0[17], out + 14, out + 17, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[15], bf0[16], out + 15, out + 16, &clamp_lo, &clamp_hi); + + if (!do_cols) { const int log_range_out = AOMMAX(16, bd + 6); - const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX( - -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift)))); - const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN( - (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift)))); - - addsub_shift_sse4_1(bf0[0], bf0[31], out + 0, out + 31, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(bf0[1], bf0[30], out + 1, out + 30, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(bf0[2], bf0[29], out + 2, out + 29, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(bf0[3], bf0[28], out + 3, out + 28, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(bf0[4], bf0[27], out + 4, out + 27, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(bf0[5], bf0[26], out + 5, out + 26, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(bf0[6], bf0[25], out + 6, out + 25, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(bf0[7], bf0[24], out + 7, out + 24, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(bf0[8], bf0[23], out + 8, out + 23, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(bf0[9], bf0[22], out + 9, out + 22, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(bf0[10], bf0[21], out + 10, out + 21, &clamp_lo_out, - &clamp_hi_out, out_shift); - 
addsub_shift_sse4_1(bf0[11], bf0[20], out + 11, out + 20, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(bf0[12], bf0[19], out + 12, out + 19, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(bf0[13], bf0[18], out + 13, out + 18, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(bf0[14], bf0[17], out + 14, out + 17, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(bf0[15], bf0[16], out + 15, out + 16, &clamp_lo_out, - &clamp_hi_out, out_shift); + const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); + const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + round_shift_8x8(out, out_shift); + round_shift_8x8(out + 16, out_shift); + highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 32); } } @@ -4992,127 +5100,23 @@ void av1_highbd_inv_txfm_add_8x8_sse4_1(const tran_low_t *input, uint8_t *dest, const TX_TYPE tx_type = txfm_param->tx_type; const int32_t *src = cast_to_int32(input); switch (tx_type) { - // Assembly version doesn't support some transform types, so use C version - // for those. - case V_DCT: - case H_DCT: - case V_ADST: - case H_ADST: - case V_FLIPADST: - case H_FLIPADST: case IDTX: - av1_inv_txfm2d_add_8x8_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, - bd); - break; - default: - av1_inv_txfm2d_add_8x8_sse4_1(src, CONVERT_TO_SHORTPTR(dest), stride, - tx_type, bd); - break; - } -} - -void av1_highbd_inv_txfm_add_16x8_sse4_1(const tran_low_t *input, uint8_t *dest, - int stride, - const TxfmParam *txfm_param) { - int bd = txfm_param->bd; - const TX_TYPE tx_type = txfm_param->tx_type; - const int32_t *src = cast_to_int32(input); - switch (tx_type) { - // Assembly version doesn't support some transform types, so use C version - // for those. 
- case V_DCT: case H_DCT: - case V_ADST: case H_ADST: - case V_FLIPADST: case H_FLIPADST: - case IDTX: - av1_inv_txfm2d_add_16x8_c(src, CONVERT_TO_SHORTPTR(dest), stride, - txfm_param->tx_type, txfm_param->bd); - break; - default: - av1_highbd_inv_txfm2d_add_universe_sse4_1(input, dest, stride, tx_type, - txfm_param->tx_size, - txfm_param->eob, bd); - break; - } -} - -void av1_highbd_inv_txfm_add_8x16_sse4_1(const tran_low_t *input, uint8_t *dest, - int stride, - const TxfmParam *txfm_param) { - int bd = txfm_param->bd; - const TX_TYPE tx_type = txfm_param->tx_type; - const int32_t *src = cast_to_int32(input); - switch (tx_type) { - // Assembly version doesn't support some transform types, so use C version - // for those. case V_DCT: - case H_DCT: case V_ADST: - case H_ADST: case V_FLIPADST: - case H_FLIPADST: - case IDTX: - av1_inv_txfm2d_add_8x16_c(src, CONVERT_TO_SHORTPTR(dest), stride, - txfm_param->tx_type, txfm_param->bd); - break; - default: av1_highbd_inv_txfm2d_add_universe_sse4_1(input, dest, stride, tx_type, txfm_param->tx_size, txfm_param->eob, bd); break; - } -} - -void av1_highbd_inv_txfm_add_16x16_sse4_1(const tran_low_t *input, - uint8_t *dest, int stride, - const TxfmParam *txfm_param) { - int bd = txfm_param->bd; - const TX_TYPE tx_type = txfm_param->tx_type; - const int32_t *src = cast_to_int32(input); - switch (tx_type) { - // Assembly version doesn't support some transform types, so use C version - // for those. 
- case V_DCT: - case H_DCT: - case V_ADST: - case H_ADST: - case V_FLIPADST: - case H_FLIPADST: - case IDTX: - av1_inv_txfm2d_add_16x16_c(src, CONVERT_TO_SHORTPTR(dest), stride, - tx_type, bd); - break; default: - av1_highbd_inv_txfm2d_add_universe_sse4_1(input, dest, stride, tx_type, - txfm_param->tx_size, - txfm_param->eob, bd); - break; - } -} - -void av1_highbd_inv_txfm_add_32x32_sse4_1(const tran_low_t *input, - uint8_t *dest, int stride, - const TxfmParam *txfm_param) { - int bd = txfm_param->bd; - const TX_TYPE tx_type = txfm_param->tx_type; - const int32_t *src = cast_to_int32(input); - switch (tx_type) { - case DCT_DCT: - av1_highbd_inv_txfm2d_add_universe_sse4_1(input, dest, stride, tx_type, - txfm_param->tx_size, - txfm_param->eob, bd); - break; - // Assembly version doesn't support IDTX, so use C version for it. - case IDTX: - av1_inv_txfm2d_add_32x32_c(src, CONVERT_TO_SHORTPTR(dest), stride, - tx_type, bd); + av1_inv_txfm2d_add_8x8_sse4_1(src, CONVERT_TO_SHORTPTR(dest), stride, + tx_type, bd); break; - default: assert(0); } } - void av1_highbd_inv_txfm_add_4x4_sse4_1(const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param) { @@ -5127,53 +5131,268 @@ void av1_highbd_inv_txfm_add_4x4_sse4_1(const tran_low_t *input, uint8_t *dest, av1_highbd_iwht4x4_add(input, dest, stride, eob, bd); return; } - switch (tx_type) { - // Assembly version doesn't support some transform types, so use C version - // for those. 
- case V_DCT: - case H_DCT: - case V_ADST: - case H_ADST: - case V_FLIPADST: - case H_FLIPADST: - case IDTX: - av1_inv_txfm2d_add_4x4_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, - bd); - break; - default: - av1_inv_txfm2d_add_4x4_sse4_1(src, CONVERT_TO_SHORTPTR(dest), stride, - tx_type, bd); - break; - } + av1_inv_txfm2d_add_4x4_sse4_1(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, + bd); } +static void iidentity32_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, + int bd, int out_shift) { + (void)bit; + for (int i = 0; i < 32; i += 16) { + out[i] = _mm_slli_epi32(in[i], 2); + out[i + 1] = _mm_slli_epi32(in[i + 1], 2); + out[i + 2] = _mm_slli_epi32(in[i + 2], 2); + out[i + 3] = _mm_slli_epi32(in[i + 3], 2); + out[i + 4] = _mm_slli_epi32(in[i + 4], 2); + out[i + 5] = _mm_slli_epi32(in[i + 5], 2); + out[i + 6] = _mm_slli_epi32(in[i + 6], 2); + out[i + 7] = _mm_slli_epi32(in[i + 7], 2); + out[i + 8] = _mm_slli_epi32(in[i + 8], 2); + out[i + 9] = _mm_slli_epi32(in[i + 9], 2); + out[i + 10] = _mm_slli_epi32(in[i + 10], 2); + out[i + 11] = _mm_slli_epi32(in[i + 11], 2); + out[i + 12] = _mm_slli_epi32(in[i + 12], 2); + out[i + 13] = _mm_slli_epi32(in[i + 13], 2); + out[i + 14] = _mm_slli_epi32(in[i + 14], 2); + out[i + 15] = _mm_slli_epi32(in[i + 15], 2); + } + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); + const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + round_shift_8x8(out, out_shift); + round_shift_8x8(out + 16, out_shift); + highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 32); + } +} static const transform_1d_sse4_1 highbd_txfm_all_1d_zeros_w8_arr[TX_SIZES][ITX_TYPES_1D][4] = { { - { NULL, NULL, NULL, NULL }, - { NULL, NULL, NULL, NULL }, - { NULL, NULL, NULL, NULL }, + { idct4x4_sse4_1, NULL, NULL, NULL }, + { iadst4x4_sse4_1, NULL, NULL, NULL }, + { iidentity4_sse4_1, iidentity4_sse4_1, iidentity4_sse4_1, NULL 
}, }, { { idct8x8_low1_sse4_1, idct8x8_new_sse4_1, NULL, NULL }, { iadst8x8_low1_sse4_1, iadst8x8_new_sse4_1, NULL, NULL }, - { NULL, NULL, NULL, NULL } }, + { iidentity8_sse4_1, iidentity8_sse4_1, NULL, NULL } }, { { idct16x16_low1_sse4_1, idct16x16_low8_sse4_1, idct16x16_sse4_1, NULL }, { iadst16x16_low1_sse4_1, iadst16x16_low8_sse4_1, iadst16x16_sse4_1, NULL }, - { NULL, NULL, NULL, NULL }, + { iidentity16_sse4_1, NULL, iidentity16_sse4_1, NULL }, }, { { idct32x32_low1_sse4_1, idct32x32_low8_sse4_1, idct32x32_low16_sse4_1, idct32x32_sse4_1 }, { NULL, NULL, NULL, NULL }, - { NULL, NULL, NULL, NULL } }, + { iidentity32_sse4_1, NULL, NULL, NULL } }, { { idct64x64_low1_sse4_1, idct64x64_low8_sse4_1, idct64x64_low16_sse4_1, idct64x64_sse4_1 }, { NULL, NULL, NULL, NULL }, { NULL, NULL, NULL, NULL } } }; +static void highbd_inv_txfm2d_add_h_identity_ssse41(const int32_t *input, + uint16_t *output, + int stride, TX_TYPE tx_type, + TX_SIZE tx_size, int eob, + const int bd) { + __m128i buf1[64]; + int eobx, eoby; + get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob); + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int input_stride = AOMMIN(32, txfm_size_col); + const int buf_size_w_div4 = input_stride >> 2; + const int buf_size_h_div8 = (eoby + 8) >> 3; + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eoby]; + const transform_1d_sse4_1 row_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0]; + const transform_1d_sse4_1 col_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx]; + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + for (int i = 0; i < (buf_size_h_div8 << 1); ++i) { + __m128i buf0[16]; + const int32_t 
*input_row = input + i * input_stride * 4; + for (int j = 0; j < buf_size_w_div4; ++j) { + __m128i *buf0_cur = buf0 + j * 4; + load_buffer_32bit_input(input_row + j * 4, input_stride, buf0_cur, 4); + } + if (rect_type == 1 || rect_type == -1) { + av1_round_shift_rect_array_32_sse4_1(buf0, buf0, input_stride, 0, + NewInvSqrt2); + } + row_txfm(buf0, buf0, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, + -shift[0]); + + __m128i *_buf1 = buf1 + i * 4; + + for (int j = 0; j < buf_size_w_div4; ++j) { + _buf1[j * txfm_size_row + 0] = buf0[j * 4 + 0]; + _buf1[j * txfm_size_row + 1] = buf0[j * 4 + 1]; + _buf1[j * txfm_size_row + 2] = buf0[j * 4 + 2]; + _buf1[j * txfm_size_row + 3] = buf0[j * 4 + 3]; + } + } + for (int i = 0; i < buf_size_w_div4; i++) { + col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, + av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + + av1_round_shift_array_32_sse4_1(buf1 + i * txfm_size_row, + buf1 + i * txfm_size_row, txfm_size_row, + -shift[1]); + } + // write to buffer + for (int i = 0; i < (txfm_size_col >> 3); i++) { + highbd_write_buffer_8xn_sse4_1(buf1 + i * txfm_size_row * 2, output + 8 * i, + stride, ud_flip, txfm_size_row, bd); + } +} +static void highbd_inv_txfm2d_add_v_identity_ssse41(const int32_t *input, + uint16_t *output, + int stride, TX_TYPE tx_type, + TX_SIZE tx_size, int eob, + const int bd) { + __m128i buf1[64]; + int eobx, eoby; + get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob); + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int input_stride = AOMMIN(32, txfm_size_col); + const int buf_size_w_div8 = input_stride >> 2; + const int row_max = AOMMIN(32, txfm_size_row); + const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3; + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + const 
int fun_idx = lowbd_txfm_all_1d_zeros_idx[eobx]; + const transform_1d_sse4_1 row_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx]; + const transform_1d_sse4_1 col_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0]; + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + for (int i = 0; i < (row_max >> 2); ++i) { + __m128i buf0[16]; + const int32_t *input_row = input + i * input_stride * 4; + for (int j = 0; j < (buf_size_nonzero_w_div8 << 1); ++j) { + __m128i *buf0_cur = buf0 + j * 4; + load_buffer_32bit_input(input_row + j * 4, input_stride, buf0_cur, 4); + + TRANSPOSE_4X4(buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3], + buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3]); + } + if (rect_type == 1 || rect_type == -1) { + av1_round_shift_rect_array_32_sse4_1( + buf0, buf0, (buf_size_nonzero_w_div8 << 3), 0, NewInvSqrt2); + } + row_txfm(buf0, buf0, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, + -shift[0]); + + __m128i *_buf1 = buf1 + i * 4; + if (lr_flip) { + for (int j = 0; j < buf_size_w_div8; ++j) { + TRANSPOSE_4X4(buf0[4 * j + 3], buf0[4 * j + 2], buf0[4 * j + 1], + buf0[4 * j], + _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 0], + _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 1], + _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 2], + _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 3]); + } + } else { + for (int j = 0; j < buf_size_w_div8; ++j) { + TRANSPOSE_4X4( + buf0[j * 4 + 0], buf0[j * 4 + 1], buf0[j * 4 + 2], buf0[j * 4 + 3], + _buf1[j * txfm_size_row + 0], _buf1[j * txfm_size_row + 1], + _buf1[j * txfm_size_row + 2], _buf1[j * txfm_size_row + 3]); + } + } + } + for (int i = 0; i < buf_size_w_div8; i++) { + col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, + av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + + av1_round_shift_array_32_sse4_1(buf1 + i * txfm_size_row, + buf1 + i * txfm_size_row, txfm_size_row, + -shift[1]); + } + + // write to buffer + { 
+ for (int i = 0; i < (txfm_size_col >> 3); i++) { + highbd_write_buffer_8xn_sse4_1(buf1 + i * txfm_size_row * 2, + output + 8 * i, stride, ud_flip, + txfm_size_row, bd); + } + } +} +static void highbd_inv_txfm2d_add_idtx_ssse41(const int32_t *input, + uint16_t *output, int stride, + TX_TYPE tx_type, TX_SIZE tx_size, + int eob, const int bd) { + (void)eob; + __m128i buf1[64 * 4]; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int input_stride = AOMMIN(32, txfm_size_col); + const int row_max = AOMMIN(32, txfm_size_row); + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + const transform_1d_sse4_1 row_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0]; + const transform_1d_sse4_1 col_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0]; + + for (int i = 0; i < (row_max >> 2); ++i) { + __m128i buf0[32]; + const int32_t *input_row = input + i * input_stride * 4; + for (int j = 0; j < (input_stride >> 2); ++j) { + __m128i *buf0_cur = buf0 + j * 4; + load_buffer_32bit_input(input_row + j * 4, input_stride, buf0_cur, 4); + } + if (rect_type == 1 || rect_type == -1) { + av1_round_shift_rect_array_32_sse4_1(buf0, buf0, input_stride, 0, + NewInvSqrt2); + } + row_txfm(buf0, buf0, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, + -shift[0]); + + __m128i *_buf1 = buf1 + i * 4; + for (int j = 0; j < (input_stride >> 2); ++j) { + _buf1[j * txfm_size_row + 0] = buf0[j * 4 + 0]; + _buf1[j * txfm_size_row + 1] = buf0[j * 4 + 1]; + _buf1[j * txfm_size_row + 2] = buf0[j * 4 + 2]; + _buf1[j * txfm_size_row + 3] = buf0[j * 4 + 3]; + } + } + for (int i = 0; i < (input_stride >> 2); i++) { + col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, + av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + + 
av1_round_shift_array_32_sse4_1(buf1 + i * txfm_size_row, + buf1 + i * txfm_size_row, txfm_size_row, + -shift[1]); + } + + // write to buffer + { + for (int i = 0; i < (txfm_size_col >> 3); i++) { + highbd_write_buffer_8xn_sse4_1(buf1 + i * txfm_size_row * 2, + output + 8 * i, stride, 0, txfm_size_row, + bd); + } + } +} static void highbd_inv_txfm2d_add_no_identity_sse41(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, @@ -5182,7 +5401,7 @@ static void highbd_inv_txfm2d_add_no_identity_sse41(const int32_t *input, __m128i buf1[64 * 16]; int eobx, eoby; get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob); - const int8_t *shift = inv_txfm_shift_ls[tx_size]; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; const int txw_idx = get_txw_idx(tx_size); const int txh_idx = get_txh_idx(tx_size); const int txfm_size_col = tx_size_wide[tx_size]; @@ -5220,7 +5439,8 @@ static void highbd_inv_txfm2d_add_no_identity_sse41(const int32_t *input, av1_round_shift_rect_array_32_sse4_1( buf0, buf0, buf_size_nonzero_w_div8 << 3, 0, NewInvSqrt2); } - row_txfm(buf0, buf0, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]); + row_txfm(buf0, buf0, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, + -shift[0]); __m128i *_buf1 = buf1 + i * 4; if (lr_flip) { @@ -5244,7 +5464,7 @@ static void highbd_inv_txfm2d_add_no_identity_sse41(const int32_t *input, // 2nd stage: column transform for (int i = 0; i < buf_size_w_div8; i++) { col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, - inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); av1_round_shift_array_32_sse4_1(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, txfm_size_row, @@ -5261,6 +5481,230 @@ static void highbd_inv_txfm2d_add_no_identity_sse41(const int32_t *input, } } +static void highbd_inv_txfm2d_add_4x8_sse41(const int32_t *input, + uint16_t *output, int stride, + TX_TYPE tx_type, TX_SIZE tx_size, + int eob, const int bd) { + (void)eob; + __m128i 
buf1[8]; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const transform_1d_sse4_1 row_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0]; + const transform_1d_sse4_1 col_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][1]; + const int input_stride = AOMMIN(32, txfm_size_col); + + assert(col_txfm != NULL); + assert(row_txfm != NULL); + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + // 1st stage: column transform + __m128i buf0[8]; + const int32_t *input_row = input; + __m128i *buf0_cur = buf0; + load_buffer_32bit_input(input_row, input_stride, buf0_cur, txfm_size_row); + av1_round_shift_rect_array_32_sse4_1(buf0, buf0, txfm_size_row, 0, + NewInvSqrt2); + row_txfm(buf0, buf0, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]); + row_txfm(buf0 + 4, buf0 + 4, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, + -shift[0]); + + if (lr_flip) { + TRANSPOSE_4X4(buf0[3], buf0[2], buf0[1], buf0[0], buf1[0], buf1[1], buf1[2], + buf1[3]); + + TRANSPOSE_4X4(buf0[7], buf0[6], buf0[5], buf0[4], buf1[4], buf1[5], buf1[6], + buf1[7]); + } else { + TRANSPOSE_4X4(buf0[0], buf0[1], buf0[2], buf0[3], buf1[0], buf1[1], buf1[2], + buf1[3]); + + TRANSPOSE_4X4(buf0[4], buf0[5], buf0[6], buf0[7], buf1[4], buf1[5], buf1[6], + buf1[7]); + } + + // 2nd stage: column transform + col_txfm(buf1, buf1, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + + av1_round_shift_array_32_sse4_1(buf1, buf1, txfm_size_row, -shift[1]); + + // write to buffer + highbd_write_buffer_4xn_sse4_1(buf1, output, stride, ud_flip, txfm_size_row, + bd); +} + +static void highbd_inv_txfm2d_add_8x4_sse41(const int32_t *input, + uint16_t *output, int stride, + TX_TYPE tx_type, TX_SIZE tx_size, + int eob, const int bd) { + (void)eob; + __m128i buf1[8]; + 
const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const transform_1d_sse4_1 row_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][1]; + const transform_1d_sse4_1 col_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0]; + + assert(col_txfm != NULL); + assert(row_txfm != NULL); + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + // 1st stage: column transform + __m128i buf0[8]; + const int32_t *input_row = input; + load_buffer_32bit_input(input_row, 4, buf0, txfm_size_col); + + TRANSPOSE_4X4(buf0[0], buf0[2], buf0[4], buf0[6], buf1[0], buf1[1], buf1[2], + buf1[3]); + TRANSPOSE_4X4(buf0[1], buf0[3], buf0[5], buf0[7], buf1[4], buf1[5], buf1[6], + buf1[7]); + + av1_round_shift_rect_array_32_sse4_1(buf1, buf0, txfm_size_col, 0, + NewInvSqrt2); + row_txfm(buf0, buf0, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]); + + __m128i *buf1_ptr; + if (lr_flip) { + flip_buf_sse2(buf0, buf1, txfm_size_col); + buf1_ptr = buf1; + } else { + buf1_ptr = buf0; + } + + // 2nd stage: column transform + for (int i = 0; i < 2; i++) { + col_txfm(buf1_ptr + i * txfm_size_row, buf1_ptr + i * txfm_size_row, + av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + } + av1_round_shift_array_32_sse4_1(buf1_ptr, buf1_ptr, txfm_size_col, -shift[1]); + // write to buffer + highbd_write_buffer_8xn_sse4_1(buf1_ptr, output, stride, ud_flip, + txfm_size_row, bd); +} + +static void highbd_inv_txfm2d_add_4x16_sse4_1(const int32_t *input, + uint16_t *output, int stride, + TX_TYPE tx_type, TX_SIZE tx_size, + int eob, const int bd) { + (void)eob; + __m128i buf1[16]; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = 
tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int buf_size_h_div8 = txfm_size_row >> 2; + const transform_1d_sse4_1 row_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0]; + const transform_1d_sse4_1 col_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][2]; + const int input_stride = AOMMIN(32, txfm_size_col); + + assert(col_txfm != NULL); + assert(row_txfm != NULL); + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + // 1st stage: column transform + __m128i buf0[16]; + const int32_t *input_row = input; + __m128i *buf0_cur = buf0; + load_buffer_32bit_input(input_row, input_stride, buf0_cur, txfm_size_row); + for (int i = 0; i < (txfm_size_row >> 2); i++) { + row_txfm(buf0 + (i << 2), buf0 + (i << 2), + av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]); + } + + if (lr_flip) { + for (int j = 0; j < buf_size_h_div8; ++j) { + TRANSPOSE_4X4(buf0[4 * j + 3], buf0[4 * j + 2], buf0[4 * j + 1], + buf0[4 * j], buf1[4 * j], buf1[4 * j + 1], buf1[4 * j + 2], + buf1[4 * j + 3]); + } + } else { + for (int j = 0; j < buf_size_h_div8; ++j) { + TRANSPOSE_4X4(buf0[4 * j], buf0[4 * j + 1], buf0[4 * j + 2], + buf0[4 * j + 3], buf1[4 * j], buf1[4 * j + 1], + buf1[4 * j + 2], buf1[4 * j + 3]); + } + } + + // 2nd stage: column transform + col_txfm(buf1, buf1, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + + av1_round_shift_array_32_sse4_1(buf1, buf1, txfm_size_row, -shift[1]); + + // write to buffer + highbd_write_buffer_4xn_sse4_1(buf1, output, stride, ud_flip, txfm_size_row, + bd); +} + +static void highbd_inv_txfm2d_add_16x4_sse4_1(const int32_t *input, + uint16_t *output, int stride, + TX_TYPE tx_type, TX_SIZE tx_size, + int eob, const int bd) { + (void)eob; + __m128i buf1[16]; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + 
const int txfm_size_row = tx_size_high[tx_size]; + const int buf_size_w_div8 = txfm_size_col >> 2; + const transform_1d_sse4_1 row_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][2]; + const transform_1d_sse4_1 col_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0]; + + assert(col_txfm != NULL); + assert(row_txfm != NULL); + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + // 1st stage: column transform + __m128i buf0[16]; + const int32_t *input_row = input; + load_buffer_32bit_input(input_row, 4, buf0, txfm_size_col); + + for (int j = 0; j < buf_size_w_div8; j++) { + TRANSPOSE_4X4(buf0[j], buf0[j + 4], buf0[j + 8], buf0[j + 12], buf1[4 * j], + buf1[4 * j + 1], buf1[4 * j + 2], buf1[4 * j + 3]); + } + row_txfm(buf1, buf0, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]); + + __m128i *buf1_ptr; + if (lr_flip) { + flip_buf_sse2(buf0, buf1, txfm_size_col); + buf1_ptr = buf1; + } else { + buf1_ptr = buf0; + } + + // 2nd stage: column transform + for (int i = 0; i < buf_size_w_div8; i++) { + col_txfm(buf1_ptr + i * txfm_size_row, buf1_ptr + i * txfm_size_row, + av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + } + av1_round_shift_array_32_sse4_1(buf1_ptr, buf1_ptr, txfm_size_col, -shift[1]); + + // write to buffer + for (int i = 0; i < (txfm_size_col >> 3); i++) { + highbd_write_buffer_8xn_sse4_1(buf1_ptr + i * txfm_size_row * 2, + output + 8 * i, stride, ud_flip, + txfm_size_row, bd); + } +} + void av1_highbd_inv_txfm2d_add_universe_sse4_1(const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, TX_SIZE tx_size, @@ -5279,70 +5723,99 @@ void av1_highbd_inv_txfm2d_add_universe_sse4_1(const int32_t *input, input, CONVERT_TO_SHORTPTR(output), stride, tx_type, tx_size, eob, bd); break; + case V_DCT: + case V_ADST: + case V_FLIPADST: + highbd_inv_txfm2d_add_h_identity_ssse41( + input, CONVERT_TO_SHORTPTR(output), stride, tx_type, tx_size, eob, + bd); + break; + case H_DCT: + case 
H_ADST: + case H_FLIPADST: + highbd_inv_txfm2d_add_v_identity_ssse41( + input, CONVERT_TO_SHORTPTR(output), stride, tx_type, tx_size, eob, + bd); + break; + case IDTX: + highbd_inv_txfm2d_add_idtx_ssse41(input, CONVERT_TO_SHORTPTR(output), + stride, tx_type, tx_size, eob, bd); + break; default: assert(0); break; } } +void av1_highbd_inv_txfm_add_4x8_sse4_1(const tran_low_t *input, uint8_t *dest, + int stride, + const TxfmParam *txfm_param) { + int bd = txfm_param->bd; + const TX_TYPE tx_type = txfm_param->tx_type; + const TX_SIZE tx_size = txfm_param->tx_size; + int eob = txfm_param->eob; + highbd_inv_txfm2d_add_4x8_sse41(input, CONVERT_TO_SHORTPTR(dest), stride, + tx_type, tx_size, eob, bd); +} + +void av1_highbd_inv_txfm_add_8x4_sse4_1(const tran_low_t *input, uint8_t *dest, + int stride, + const TxfmParam *txfm_param) { + int bd = txfm_param->bd; + const TX_TYPE tx_type = txfm_param->tx_type; + const TX_SIZE tx_size = txfm_param->tx_size; + int eob = txfm_param->eob; + highbd_inv_txfm2d_add_8x4_sse41(input, CONVERT_TO_SHORTPTR(dest), stride, + tx_type, tx_size, eob, bd); +} + +void av1_highbd_inv_txfm_add_4x16_sse4_1(const tran_low_t *input, uint8_t *dest, + int stride, + const TxfmParam *txfm_param) { + int bd = txfm_param->bd; + const TX_TYPE tx_type = txfm_param->tx_type; + const TX_SIZE tx_size = txfm_param->tx_size; + int eob = txfm_param->eob; + highbd_inv_txfm2d_add_4x16_sse4_1(input, CONVERT_TO_SHORTPTR(dest), stride, + tx_type, tx_size, eob, bd); +} + +void av1_highbd_inv_txfm_add_16x4_sse4_1(const tran_low_t *input, uint8_t *dest, + int stride, + const TxfmParam *txfm_param) { + int bd = txfm_param->bd; + const TX_TYPE tx_type = txfm_param->tx_type; + const TX_SIZE tx_size = txfm_param->tx_size; + int eob = txfm_param->eob; + highbd_inv_txfm2d_add_16x4_sse4_1(input, CONVERT_TO_SHORTPTR(dest), stride, + tx_type, tx_size, eob, bd); +} + void av1_highbd_inv_txfm_add_sse4_1(const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param) { 
assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]); const TX_SIZE tx_size = txfm_param->tx_size; switch (tx_size) { - case TX_32X32: - av1_highbd_inv_txfm_add_32x32_sse4_1(input, dest, stride, txfm_param); - break; - case TX_16X16: - av1_highbd_inv_txfm_add_16x16_sse4_1(input, dest, stride, txfm_param); - break; case TX_8X8: av1_highbd_inv_txfm_add_8x8_sse4_1(input, dest, stride, txfm_param); break; case TX_4X8: - av1_highbd_inv_txfm_add_4x8(input, dest, stride, txfm_param); + av1_highbd_inv_txfm_add_4x8_sse4_1(input, dest, stride, txfm_param); break; case TX_8X4: - av1_highbd_inv_txfm_add_8x4(input, dest, stride, txfm_param); - break; - case TX_8X16: - av1_highbd_inv_txfm_add_8x16_sse4_1(input, dest, stride, txfm_param); - break; - case TX_16X8: - av1_highbd_inv_txfm_add_16x8_sse4_1(input, dest, stride, txfm_param); - break; - case TX_16X32: - av1_highbd_inv_txfm_add_16x32(input, dest, stride, txfm_param); - break; - case TX_32X16: - av1_highbd_inv_txfm_add_32x16(input, dest, stride, txfm_param); - break; - case TX_32X64: - av1_highbd_inv_txfm_add_32x64(input, dest, stride, txfm_param); - break; - case TX_64X32: - av1_highbd_inv_txfm_add_64x32(input, dest, stride, txfm_param); + av1_highbd_inv_txfm_add_8x4_sse4_1(input, dest, stride, txfm_param); break; case TX_4X4: av1_highbd_inv_txfm_add_4x4_sse4_1(input, dest, stride, txfm_param); break; case TX_16X4: - av1_highbd_inv_txfm_add_16x4(input, dest, stride, txfm_param); + av1_highbd_inv_txfm_add_16x4_sse4_1(input, dest, stride, txfm_param); break; case TX_4X16: - av1_highbd_inv_txfm_add_4x16(input, dest, stride, txfm_param); - break; - case TX_8X32: - av1_highbd_inv_txfm_add_8x32(input, dest, stride, txfm_param); + av1_highbd_inv_txfm_add_4x16_sse4_1(input, dest, stride, txfm_param); break; - case TX_32X8: - av1_highbd_inv_txfm_add_32x8(input, dest, stride, txfm_param); - break; - case TX_64X64: - case TX_16X64: - case TX_64X16: + default: av1_highbd_inv_txfm2d_add_universe_sse4_1( - input, dest, 
stride, txfm_param->tx_type, txfm_param->tx_size, - txfm_param->eob, txfm_param->bd); + input, dest, stride, txfm_param->tx_type, tx_size, txfm_param->eob, + txfm_param->bd); break; - default: assert(0 && "Invalid transform size"); break; } } diff --git a/media/libaom/src/av1/common/x86/highbd_jnt_convolve_avx2.c b/media/libaom/src/av1/common/x86/highbd_jnt_convolve_avx2.c index e298cf653..70f1ec709 100644 --- a/media/libaom/src/av1/common/x86/highbd_jnt_convolve_avx2.c +++ b/media/libaom/src/av1/common/x86/highbd_jnt_convolve_avx2.c @@ -22,23 +22,23 @@ #include "aom_dsp/aom_filter.h" #include "av1/common/convolve.h" -void av1_highbd_jnt_convolve_2d_copy_avx2( +void av1_highbd_dist_wtd_convolve_2d_copy_avx2( const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w, int h, const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, const int subpel_x_q4, - const int subpel_y_q4, ConvolveParams *conv_params, int bd) { + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params, int bd) { CONV_BUF_TYPE *dst = conv_params->dst; int dst_stride = conv_params->dst_stride; (void)filter_params_x; (void)filter_params_y; - (void)subpel_x_q4; - (void)subpel_y_q4; + (void)subpel_x_qn; + (void)subpel_y_qn; const int bits = FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0; const __m128i left_shift = _mm_cvtsi32_si128(bits); const int do_average = conv_params->do_average; - const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; const int w0 = conv_params->fwd_offset; const int w1 = conv_params->bck_offset; const __m256i wt0 = _mm256_set1_epi32(w0); @@ -78,15 +78,17 @@ void av1_highbd_jnt_convolve_2d_copy_avx2( const __m256i res_unsigned_lo = _mm256_add_epi32(res_32b_lo, offset_const); - const __m256i comp_avg_res_lo = highbd_comp_avg( - &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, 
use_jnt_comp_avg); + const __m256i comp_avg_res_lo = + highbd_comp_avg(&data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, + use_dist_wtd_comp_avg); const __m256i res_32b_hi = _mm256_unpackhi_epi16(res, zero); const __m256i res_unsigned_hi = _mm256_add_epi32(res_32b_hi, offset_const); - const __m256i comp_avg_res_hi = highbd_comp_avg( - &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg); + const __m256i comp_avg_res_hi = + highbd_comp_avg(&data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, + use_dist_wtd_comp_avg); const __m256i round_result_lo = highbd_convolve_rounding( &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift); @@ -135,8 +137,9 @@ void av1_highbd_jnt_convolve_2d_copy_avx2( const __m256i res_unsigned_lo = _mm256_add_epi32(res_32b, offset_const); - const __m256i comp_avg_res = highbd_comp_avg( - &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg); + const __m256i comp_avg_res = + highbd_comp_avg(&data_ref_0, &res_unsigned_lo, &wt0, &wt1, + use_dist_wtd_comp_avg); const __m256i round_result = highbd_convolve_rounding( &comp_avg_res, &offset_const, &rounding_const, rounding_shift); @@ -179,15 +182,17 @@ void av1_highbd_jnt_convolve_2d_copy_avx2( const __m256i res_unsigned_lo = _mm256_add_epi32(res_32b_lo, offset_const); - const __m256i comp_avg_res_lo = highbd_comp_avg( - &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg); + const __m256i comp_avg_res_lo = + highbd_comp_avg(&data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, + use_dist_wtd_comp_avg); const __m256i res_32b_hi = _mm256_unpackhi_epi16(res, zero); const __m256i res_unsigned_hi = _mm256_add_epi32(res_32b_hi, offset_const); - const __m256i comp_avg_res_hi = highbd_comp_avg( - &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg); + const __m256i comp_avg_res_hi = + highbd_comp_avg(&data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, + use_dist_wtd_comp_avg); const __m256i round_result_lo = highbd_convolve_rounding(&comp_avg_res_lo, &offset_const, @@ -223,11 
+228,11 @@ void av1_highbd_jnt_convolve_2d_copy_avx2( } } -void av1_highbd_jnt_convolve_2d_avx2( +void av1_highbd_dist_wtd_convolve_2d_avx2( const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w, int h, const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, const int subpel_x_q4, - const int subpel_y_q4, ConvolveParams *conv_params, int bd) { + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params, int bd) { DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]); CONV_BUF_TYPE *dst = conv_params->dst; int dst_stride = conv_params->dst_stride; @@ -244,7 +249,7 @@ void av1_highbd_jnt_convolve_2d_avx2( __m256i s[8], coeffs_y[4], coeffs_x[4]; const int do_average = conv_params->do_average; - const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; const int w0 = conv_params->fwd_offset; const int w1 = conv_params->bck_offset; @@ -272,8 +277,8 @@ void av1_highbd_jnt_convolve_2d_avx2( const __m256i clip_pixel_to_bd = _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 
4095 : 255)); - prepare_coeffs(filter_params_x, subpel_x_q4, coeffs_x); - prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y); + prepare_coeffs(filter_params_x, subpel_x_qn, coeffs_x); + prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y); for (j = 0; j < w; j += 8) { /* Horizontal filter */ @@ -364,8 +369,9 @@ void av1_highbd_jnt_convolve_2d_avx2( const __m256i data_ref_0 = _mm256_unpacklo_epi16(data_01, zero); - const __m256i comp_avg_res = highbd_comp_avg( - &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg); + const __m256i comp_avg_res = + highbd_comp_avg(&data_ref_0, &res_unsigned_lo, &wt0, &wt1, + use_dist_wtd_comp_avg); const __m256i round_result = highbd_convolve_rounding( &comp_avg_res, &offset_const, &rounding_const, rounding_shift); @@ -409,10 +415,12 @@ void av1_highbd_jnt_convolve_2d_avx2( const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_01, zero); const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_01, zero); - const __m256i comp_avg_res_lo = highbd_comp_avg( - &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg); - const __m256i comp_avg_res_hi = highbd_comp_avg( - &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg); + const __m256i comp_avg_res_lo = + highbd_comp_avg(&data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, + use_dist_wtd_comp_avg); + const __m256i comp_avg_res_hi = + highbd_comp_avg(&data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, + use_dist_wtd_comp_avg); const __m256i round_result_lo = highbd_convolve_rounding(&comp_avg_res_lo, &offset_const, @@ -456,24 +464,24 @@ void av1_highbd_jnt_convolve_2d_avx2( } } -void av1_highbd_jnt_convolve_x_avx2( +void av1_highbd_dist_wtd_convolve_x_avx2( const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w, int h, const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, const int subpel_x_q4, - const int subpel_y_q4, ConvolveParams *conv_params, int bd) { + const InterpFilterParams *filter_params_y, const int 
subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params, int bd) { CONV_BUF_TYPE *dst = conv_params->dst; int dst_stride = conv_params->dst_stride; const int fo_horiz = filter_params_x->taps / 2 - 1; const uint16_t *const src_ptr = src - fo_horiz; const int bits = FILTER_BITS - conv_params->round_1; (void)filter_params_y; - (void)subpel_y_q4; + (void)subpel_y_qn; int i, j; __m256i s[4], coeffs_x[4]; const int do_average = conv_params->do_average; - const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; const int w0 = conv_params->fwd_offset; const int w1 = conv_params->bck_offset; const __m256i wt0 = _mm256_set1_epi32(w0); @@ -496,7 +504,7 @@ void av1_highbd_jnt_convolve_x_avx2( _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255)); assert(bits >= 0); - prepare_coeffs(filter_params_x, subpel_x_q4, coeffs_x); + prepare_coeffs(filter_params_x, subpel_x_qn, coeffs_x); for (j = 0; j < w; j += 8) { /* Horizontal filter */ @@ -548,7 +556,7 @@ void av1_highbd_jnt_convolve_x_avx2( const __m256i data_ref_0 = _mm256_unpacklo_epi16(data_01, zero); const __m256i comp_avg_res = highbd_comp_avg( - &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg); + &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_dist_wtd_comp_avg); const __m256i round_result = highbd_convolve_rounding( &comp_avg_res, &offset_const, &rounding_const, rounding_shift); @@ -588,10 +596,12 @@ void av1_highbd_jnt_convolve_x_avx2( const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_01, zero); const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_01, zero); - const __m256i comp_avg_res_lo = highbd_comp_avg( - &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg); - const __m256i comp_avg_res_hi = highbd_comp_avg( - &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg); + const __m256i comp_avg_res_lo = + highbd_comp_avg(&data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, + use_dist_wtd_comp_avg); 
+ const __m256i comp_avg_res_hi = + highbd_comp_avg(&data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, + use_dist_wtd_comp_avg); const __m256i round_result_lo = highbd_convolve_rounding( &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift); @@ -623,24 +633,24 @@ void av1_highbd_jnt_convolve_x_avx2( } } -void av1_highbd_jnt_convolve_y_avx2( +void av1_highbd_dist_wtd_convolve_y_avx2( const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w, int h, const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, const int subpel_x_q4, - const int subpel_y_q4, ConvolveParams *conv_params, int bd) { + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params, int bd) { CONV_BUF_TYPE *dst = conv_params->dst; int dst_stride = conv_params->dst_stride; const int fo_vert = filter_params_y->taps / 2 - 1; const uint16_t *const src_ptr = src - fo_vert * src_stride; const int bits = FILTER_BITS - conv_params->round_0; (void)filter_params_x; - (void)subpel_x_q4; + (void)subpel_x_qn; assert(bits >= 0); int i, j; __m256i s[8], coeffs_y[4]; const int do_average = conv_params->do_average; - const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; const int w0 = conv_params->fwd_offset; const int w1 = conv_params->bck_offset; @@ -662,7 +672,7 @@ void av1_highbd_jnt_convolve_y_avx2( _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 
4095 : 255)); const __m256i zero = _mm256_setzero_si256(); - prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y); + prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y); for (j = 0; j < w; j += 8) { const uint16_t *data = &src_ptr[j]; @@ -753,8 +763,9 @@ void av1_highbd_jnt_convolve_y_avx2( const __m256i data_ref_0 = _mm256_unpacklo_epi16(data_01, zero); - const __m256i comp_avg_res = highbd_comp_avg( - &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg); + const __m256i comp_avg_res = + highbd_comp_avg(&data_ref_0, &res_unsigned_lo, &wt0, &wt1, + use_dist_wtd_comp_avg); const __m256i round_result = highbd_convolve_rounding( &comp_avg_res, &offset_const, &rounding_const, rounding_shift); @@ -799,10 +810,12 @@ void av1_highbd_jnt_convolve_y_avx2( const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_01, zero); const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_01, zero); - const __m256i comp_avg_res_lo = highbd_comp_avg( - &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg); - const __m256i comp_avg_res_hi = highbd_comp_avg( - &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg); + const __m256i comp_avg_res_lo = + highbd_comp_avg(&data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, + use_dist_wtd_comp_avg); + const __m256i comp_avg_res_hi = + highbd_comp_avg(&data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, + use_dist_wtd_comp_avg); const __m256i round_result_lo = highbd_convolve_rounding(&comp_avg_res_lo, &offset_const, diff --git a/media/libaom/src/av1/common/x86/highbd_jnt_convolve_sse4.c b/media/libaom/src/av1/common/x86/highbd_jnt_convolve_sse4.c index 1a29985b5..f033a6f94 100644 --- a/media/libaom/src/av1/common/x86/highbd_jnt_convolve_sse4.c +++ b/media/libaom/src/av1/common/x86/highbd_jnt_convolve_sse4.c @@ -17,23 +17,23 @@ #include "aom_dsp/x86/convolve_sse2.h" #include "aom_dsp/x86/convolve_sse4_1.h" -void av1_highbd_jnt_convolve_y_sse4_1( +void av1_highbd_dist_wtd_convolve_y_sse4_1( const uint16_t *src, int src_stride, 
uint16_t *dst0, int dst_stride0, int w, int h, const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, const int subpel_x_q4, - const int subpel_y_q4, ConvolveParams *conv_params, int bd) { + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params, int bd) { CONV_BUF_TYPE *dst = conv_params->dst; int dst_stride = conv_params->dst_stride; const int fo_vert = filter_params_y->taps / 2 - 1; const uint16_t *const src_ptr = src - fo_vert * src_stride; const int bits = FILTER_BITS - conv_params->round_0; (void)filter_params_x; - (void)subpel_x_q4; + (void)subpel_x_qn; assert(bits >= 0); int i, j; const int do_average = conv_params->do_average; - const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; const int w0 = conv_params->fwd_offset; const int w1 = conv_params->bck_offset; @@ -56,7 +56,7 @@ void av1_highbd_jnt_convolve_y_sse4_1( const __m128i zero = _mm_setzero_si128(); __m128i s[16], coeffs_y[4]; - prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y); + prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y); for (j = 0; j < w; j += 8) { const uint16_t *data = &src_ptr[j]; @@ -121,10 +121,12 @@ void av1_highbd_jnt_convolve_y_sse4_1( const __m128i data_ref_0 = _mm_unpacklo_epi16(data_0, zero); const __m128i data_ref_1 = _mm_unpacklo_epi16(data_1, zero); - const __m128i comp_avg_res_0 = highbd_comp_avg_sse4_1( - &data_ref_0, &res_unsigned_lo_0, &wt0, &wt1, use_jnt_comp_avg); - const __m128i comp_avg_res_1 = highbd_comp_avg_sse4_1( - &data_ref_1, &res_unsigned_lo_1, &wt0, &wt1, use_jnt_comp_avg); + const __m128i comp_avg_res_0 = + highbd_comp_avg_sse4_1(&data_ref_0, &res_unsigned_lo_0, &wt0, + &wt1, use_dist_wtd_comp_avg); + const __m128i comp_avg_res_1 = + highbd_comp_avg_sse4_1(&data_ref_1, &res_unsigned_lo_1, &wt0, + &wt1, use_dist_wtd_comp_avg); const __m128i round_result_0 = 
highbd_convolve_rounding_sse2(&comp_avg_res_0, &offset_const, @@ -186,16 +188,16 @@ void av1_highbd_jnt_convolve_y_sse4_1( const __m128i comp_avg_res_lo_0 = highbd_comp_avg_sse4_1(&data_ref_0_lo_0, &res_unsigned_lo_0, - &wt0, &wt1, use_jnt_comp_avg); + &wt0, &wt1, use_dist_wtd_comp_avg); const __m128i comp_avg_res_lo_1 = highbd_comp_avg_sse4_1(&data_ref_0_lo_1, &res_unsigned_lo_1, - &wt0, &wt1, use_jnt_comp_avg); + &wt0, &wt1, use_dist_wtd_comp_avg); const __m128i comp_avg_res_hi_0 = highbd_comp_avg_sse4_1(&data_ref_0_hi_0, &res_unsigned_hi_0, - &wt0, &wt1, use_jnt_comp_avg); + &wt0, &wt1, use_dist_wtd_comp_avg); const __m128i comp_avg_res_hi_1 = highbd_comp_avg_sse4_1(&data_ref_0_hi_1, &res_unsigned_hi_1, - &wt0, &wt1, use_jnt_comp_avg); + &wt0, &wt1, use_dist_wtd_comp_avg); const __m128i round_result_lo_0 = highbd_convolve_rounding_sse2(&comp_avg_res_lo_0, &offset_const, @@ -257,24 +259,24 @@ void av1_highbd_jnt_convolve_y_sse4_1( } } -void av1_highbd_jnt_convolve_x_sse4_1( +void av1_highbd_dist_wtd_convolve_x_sse4_1( const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w, int h, const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, const int subpel_x_q4, - const int subpel_y_q4, ConvolveParams *conv_params, int bd) { + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params, int bd) { CONV_BUF_TYPE *dst = conv_params->dst; int dst_stride = conv_params->dst_stride; const int fo_horiz = filter_params_x->taps / 2 - 1; const uint16_t *const src_ptr = src - fo_horiz; const int bits = FILTER_BITS - conv_params->round_1; (void)filter_params_y; - (void)subpel_y_q4; + (void)subpel_y_qn; int i, j; __m128i s[4], coeffs_x[4]; const int do_average = conv_params->do_average; - const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; const int w0 = conv_params->fwd_offset; const int w1 = 
conv_params->bck_offset; const __m128i wt0 = _mm_set1_epi32(w0); @@ -297,7 +299,7 @@ void av1_highbd_jnt_convolve_x_sse4_1( _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255)); assert(bits >= 0); - prepare_coeffs(filter_params_x, subpel_x_q4, coeffs_x); + prepare_coeffs(filter_params_x, subpel_x_qn, coeffs_x); for (j = 0; j < w; j += 8) { /* Horizontal filter */ @@ -339,7 +341,7 @@ void av1_highbd_jnt_convolve_x_sse4_1( const __m128i data_ref_0 = _mm_unpacklo_epi16(data_0, zero); const __m128i comp_avg_res = highbd_comp_avg_sse4_1( - &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg); + &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_dist_wtd_comp_avg); const __m128i round_result = highbd_convolve_rounding_sse2( &comp_avg_res, &offset_const, &rounding_const, rounding_shift); @@ -359,10 +361,12 @@ void av1_highbd_jnt_convolve_x_sse4_1( const __m128i data_ref_0_lo = _mm_unpacklo_epi16(data_0, zero); const __m128i data_ref_0_hi = _mm_unpackhi_epi16(data_0, zero); - const __m128i comp_avg_res_lo = highbd_comp_avg_sse4_1( - &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg); - const __m128i comp_avg_res_hi = highbd_comp_avg_sse4_1( - &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg); + const __m128i comp_avg_res_lo = + highbd_comp_avg_sse4_1(&data_ref_0_lo, &res_unsigned_lo, &wt0, + &wt1, use_dist_wtd_comp_avg); + const __m128i comp_avg_res_hi = + highbd_comp_avg_sse4_1(&data_ref_0_hi, &res_unsigned_hi, &wt0, + &wt1, use_dist_wtd_comp_avg); const __m128i round_result_lo = highbd_convolve_rounding_sse2( &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift); diff --git a/media/libaom/src/av1/common/x86/highbd_txfm_utility_sse4.h b/media/libaom/src/av1/common/x86/highbd_txfm_utility_sse4.h index 6f24e5948..5734810f5 100644 --- a/media/libaom/src/av1/common/x86/highbd_txfm_utility_sse4.h +++ b/media/libaom/src/av1/common/x86/highbd_txfm_utility_sse4.h @@ -75,13 +75,20 @@ static INLINE void transpose_16x16(const __m128i 
*in, __m128i *out) { out[63]); } -static INLINE void transpose_32x32(const __m128i *input, __m128i *output) { - for (int j = 0; j < 8; j++) { - for (int i = 0; i < 8; i++) { - TRANSPOSE_4X4(input[i * 32 + j + 0], input[i * 32 + j + 8], - input[i * 32 + j + 16], input[i * 32 + j + 24], - output[j * 32 + i + 0], output[j * 32 + i + 8], - output[j * 32 + i + 16], output[j * 32 + i + 24]); +static INLINE void transpose_8nx8n(const __m128i *input, __m128i *output, + const int width, const int height) { + const int numcol = height >> 2; + const int numrow = width >> 2; + for (int j = 0; j < numrow; j++) { + for (int i = 0; i < numcol; i++) { + TRANSPOSE_4X4(input[i * width + j + (numrow * 0)], + input[i * width + j + (numrow * 1)], + input[i * width + j + (numrow * 2)], + input[i * width + j + (numrow * 3)], + output[j * height + i + (numcol * 0)], + output[j * height + i + (numcol * 1)], + output[j * height + i + (numcol * 2)], + output[j * height + i + (numcol * 3)]); } } } diff --git a/media/libaom/src/av1/common/x86/highbd_warp_plane_sse4.c b/media/libaom/src/av1/common/x86/highbd_warp_plane_sse4.c index 4bcab0564..60a819308 100644 --- a/media/libaom/src/av1/common/x86/highbd_warp_plane_sse4.c +++ b/media/libaom/src/av1/common/x86/highbd_warp_plane_sse4.c @@ -15,9 +15,9 @@ #include "av1/common/warped_motion.h" -static const uint8_t warp_highbd_arrange_bytes[16] = { - 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 -}; +static const uint8_t warp_highbd_arrange_bytes[16] = { 0, 2, 4, 6, 8, 10, + 12, 14, 1, 3, 5, 7, + 9, 11, 13, 15 }; static const uint8_t highbd_shuffle_alpha0_mask0[16] = { 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 @@ -25,24 +25,28 @@ static const uint8_t highbd_shuffle_alpha0_mask0[16] = { static const uint8_t highbd_shuffle_alpha0_mask1[16] = { 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7 }; -static const uint8_t highbd_shuffle_alpha0_mask2[16] = { - 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11 -}; -static const uint8_t 
highbd_shuffle_alpha0_mask3[16] = { - 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15 -}; +static const uint8_t highbd_shuffle_alpha0_mask2[16] = { 8, 9, 10, 11, 8, 9, + 10, 11, 8, 9, 10, 11, + 8, 9, 10, 11 }; +static const uint8_t highbd_shuffle_alpha0_mask3[16] = { 12, 13, 14, 15, 12, 13, + 14, 15, 12, 13, 14, 15, + 12, 13, 14, 15 }; static INLINE void highbd_prepare_horizontal_filter_coeff(int alpha, int sx, __m128i *coeff) { // Filter even-index pixels - const __m128i tmp_0 = _mm_loadu_si128( - (__m128i *)(warped_filter + ((sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS))); - const __m128i tmp_2 = _mm_loadu_si128( - (__m128i *)(warped_filter + ((sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS))); - const __m128i tmp_4 = _mm_loadu_si128( - (__m128i *)(warped_filter + ((sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS))); - const __m128i tmp_6 = _mm_loadu_si128( - (__m128i *)(warped_filter + ((sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_0 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_2 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_4 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_6 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS))); // coeffs 0 1 0 1 2 3 2 3 for pixels 0, 2 const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2); @@ -63,14 +67,18 @@ static INLINE void highbd_prepare_horizontal_filter_coeff(int alpha, int sx, coeff[6] = _mm_unpackhi_epi64(tmp_12, tmp_14); // Filter odd-index pixels - const __m128i tmp_1 = _mm_loadu_si128( - (__m128i *)(warped_filter + ((sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS))); - const __m128i tmp_3 = _mm_loadu_si128( - (__m128i *)(warped_filter + ((sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS))); - const __m128i tmp_5 = _mm_loadu_si128( - (__m128i 
*)(warped_filter + ((sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS))); - const __m128i tmp_7 = _mm_loadu_si128( - (__m128i *)(warped_filter + ((sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_1 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_3 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_5 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_7 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS))); const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3); const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7); @@ -87,7 +95,7 @@ static INLINE void highbd_prepare_horizontal_filter_coeff_alpha0( int sx, __m128i *coeff) { // Filter coeff const __m128i tmp_0 = _mm_loadu_si128( - (__m128i *)(warped_filter + (sx >> WARPEDDIFF_PREC_BITS))); + (__m128i *)(av1_warped_filter + (sx >> WARPEDDIFF_PREC_BITS))); coeff[0] = _mm_shuffle_epi8( tmp_0, _mm_loadu_si128((__m128i *)highbd_shuffle_alpha0_mask0)); @@ -454,16 +462,16 @@ void av1_highbd_warp_affine_sse4_1(const int32_t *mat, const uint16_t *ref, // Filter even-index pixels const __m128i tmp_0 = _mm_loadu_si128( - (__m128i *)(warped_filter + + (__m128i *)(av1_warped_filter + ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS))); const __m128i tmp_2 = _mm_loadu_si128( - (__m128i *)(warped_filter + + (__m128i *)(av1_warped_filter + ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS))); const __m128i tmp_4 = _mm_loadu_si128( - (__m128i *)(warped_filter + + (__m128i *)(av1_warped_filter + ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS))); const __m128i tmp_6 = _mm_loadu_si128( - (__m128i *)(warped_filter + + (__m128i *)(av1_warped_filter + ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS))); const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2); @@ -491,16 +499,16 @@ void 
av1_highbd_warp_affine_sse4_1(const int32_t *mat, const uint16_t *ref, const __m128i src_7 = _mm_unpackhi_epi16(src[6], src[7]); const __m128i tmp_1 = _mm_loadu_si128( - (__m128i *)(warped_filter + + (__m128i *)(av1_warped_filter + ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS))); const __m128i tmp_3 = _mm_loadu_si128( - (__m128i *)(warped_filter + + (__m128i *)(av1_warped_filter + ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS))); const __m128i tmp_5 = _mm_loadu_si128( - (__m128i *)(warped_filter + + (__m128i *)(av1_warped_filter + ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS))); const __m128i tmp_7 = _mm_loadu_si128( - (__m128i *)(warped_filter + + (__m128i *)(av1_warped_filter + ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS))); const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3); @@ -537,7 +545,7 @@ void av1_highbd_warp_affine_sse4_1(const int32_t *mat, const uint16_t *ref, __m128i *const dst16 = (__m128i *)&pred[(i + k + 4) * p_stride + j]; __m128i p_32 = _mm_cvtepu16_epi32(_mm_loadl_epi64(p)); - if (conv_params->use_jnt_comp_avg) { + if (conv_params->use_dist_wtd_comp_avg) { res_lo = _mm_add_epi32(_mm_mullo_epi32(p_32, wt0), _mm_mullo_epi32(res_lo, wt1)); res_lo = _mm_srai_epi32(res_lo, DIST_PRECISION_BITS); @@ -570,7 +578,7 @@ void av1_highbd_warp_affine_sse4_1(const int32_t *mat, const uint16_t *ref, (__m128i *)&pred[(i + k + 4) * p_stride + j + 4]; __m128i p4_32 = _mm_cvtepu16_epi32(_mm_loadl_epi64(p4)); - if (conv_params->use_jnt_comp_avg) { + if (conv_params->use_dist_wtd_comp_avg) { res_hi = _mm_add_epi32(_mm_mullo_epi32(p4_32, wt0), _mm_mullo_epi32(res_hi, wt1)); res_hi = _mm_srai_epi32(res_hi, DIST_PRECISION_BITS); diff --git a/media/libaom/src/av1/common/x86/intra_edge_sse4.c b/media/libaom/src/av1/common/x86/intra_edge_sse4.c index 0c857b583..fc69f41d7 100644 --- a/media/libaom/src/av1/common/x86/intra_edge_sse4.c +++ b/media/libaom/src/av1/common/x86/intra_edge_sse4.c @@ -212,10 +212,10 @@ void av1_upsample_intra_edge_sse4_1(uint8_t *p, int sz) { { -1, 9, 9, -1, 
-1, 9, 9, -1, -1, 9, 9, -1, -1, 9, 9, -1 } }; - DECLARE_ALIGNED(16, static const int8_t, v_const[2][16]) = { - { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }, - { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } - }; + DECLARE_ALIGNED( + 16, static const int8_t, + v_const[2][16]) = { { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }, + { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } }; // Extend first/last samples (upper-left p[-1], last p[sz-1]) // to support 4-tap filter diff --git a/media/libaom/src/av1/common/x86/jnt_convolve_avx2.c b/media/libaom/src/av1/common/x86/jnt_convolve_avx2.c index 9f2e2b457..6de61573e 100644 --- a/media/libaom/src/av1/common/x86/jnt_convolve_avx2.c +++ b/media/libaom/src/av1/common/x86/jnt_convolve_avx2.c @@ -23,8 +23,8 @@ static INLINE __m256i unpack_weights_avx2(ConvolveParams *conv_params) { const int w0 = conv_params->fwd_offset; const int w1 = conv_params->bck_offset; - const __m256i wt0 = _mm256_set1_epi16(w0); - const __m256i wt1 = _mm256_set1_epi16(w1); + const __m256i wt0 = _mm256_set1_epi16((int16_t)w0); + const __m256i wt1 = _mm256_set1_epi16((int16_t)w1); const __m256i wt = _mm256_unpacklo_epi16(wt0, wt1); return wt; } @@ -35,22 +35,20 @@ static INLINE __m256i load_line2_avx2(const void *a, const void *b) { _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)b)), 0x20); } -void av1_jnt_convolve_x_avx2(const uint8_t *src, int src_stride, uint8_t *dst0, - int dst_stride0, int w, int h, - const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, - ConvolveParams *conv_params) { +void av1_dist_wtd_convolve_x_avx2(const uint8_t *src, int src_stride, + uint8_t *dst0, int dst_stride0, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params) { CONV_BUF_TYPE *dst = conv_params->dst; int dst_stride = 
conv_params->dst_stride; const int bd = 8; - int i, j; - const int fo_horiz = filter_params_x->taps / 2 - 1; - const uint8_t *const src_ptr = src - fo_horiz; + int i, j, is_horiz_4tap = 0; const int bits = FILTER_BITS - conv_params->round_1; const __m256i wt = unpack_weights_avx2(conv_params); const int do_average = conv_params->do_average; - const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; const int offset_0 = bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); @@ -58,87 +56,147 @@ void av1_jnt_convolve_x_avx2(const uint8_t *src, int src_stride, uint8_t *dst0, const int rounding_shift = 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; const __m256i rounding_const = _mm256_set1_epi16((1 << rounding_shift) >> 1); - __m256i filt[4], coeffs[4]; assert(bits >= 0); assert(conv_params->round_0 > 0); - filt[0] = _mm256_load_si256((__m256i const *)filt_global_avx2); - filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32)); - filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); - filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3)); - - prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs); - const __m256i round_const = _mm256_set1_epi16((1 << (conv_params->round_0 - 1)) >> 1); const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0 - 1); (void)filter_params_y; - (void)subpel_y_q4; + (void)subpel_y_qn; - for (i = 0; i < h; i += 2) { - const uint8_t *src_data = src_ptr + i * src_stride; - CONV_BUF_TYPE *dst_data = dst + i * dst_stride; - for (j = 0; j < w; j += 8) { - const __m256i data = - load_line2_avx2(&src_data[j], &src_data[j + src_stride]); + __m256i filt[4], coeffs[4]; - __m256i res = convolve_lowbd_x(data, coeffs, filt); + filt[0] = _mm256_load_si256((__m256i const *)filt_global_avx2); + filt[1] = _mm256_load_si256((__m256i 
const *)(filt_global_avx2 + 32)); - res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const), round_shift); + prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs); - res = _mm256_slli_epi16(res, bits); + // Condition for checking valid horz_filt taps + if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs[0], coeffs[3]), 0))) + is_horiz_4tap = 1; - const __m256i res_unsigned = _mm256_add_epi16(res, offset_const); + // horz_filt as 4 tap + if (is_horiz_4tap) { + const int fo_horiz = 1; + const uint8_t *const src_ptr = src - fo_horiz; + for (i = 0; i < h; i += 2) { + const uint8_t *src_data = src_ptr + i * src_stride; + CONV_BUF_TYPE *dst_data = dst + i * dst_stride; + for (j = 0; j < w; j += 8) { + const __m256i data = + load_line2_avx2(&src_data[j], &src_data[j + src_stride]); - // Accumulate values into the destination buffer - if (do_average) { - const __m256i data_ref_0 = - load_line2_avx2(&dst_data[j], &dst_data[j + dst_stride]); - const __m256i comp_avg_res = - comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg); + __m256i res = convolve_lowbd_x_4tap(data, coeffs + 1, filt); + res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const), round_shift); + res = _mm256_slli_epi16(res, bits); - const __m256i round_result = convolve_rounding( - &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + const __m256i res_unsigned = _mm256_add_epi16(res, offset_const); - const __m256i res_8 = _mm256_packus_epi16(round_result, round_result); - const __m128i res_0 = _mm256_castsi256_si128(res_8); - const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); + // Accumulate values into the destination buffer + if (do_average) { + const __m256i data_ref_0 = + load_line2_avx2(&dst_data[j], &dst_data[j + dst_stride]); + const __m256i comp_avg_res = + comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); - if (w > 4) { - _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); - _mm_storel_epi64( - (__m128i *)((&dst0[i * dst_stride0 + j + 
dst_stride0])), res_1); + const __m256i round_result = convolve_rounding( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + + const __m256i res_8 = _mm256_packus_epi16(round_result, round_result); + const __m128i res_0 = _mm256_castsi256_si128(res_8); + const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); + + if (w > 4) { + _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); + _mm_storel_epi64( + (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1); + } else { + *(uint32_t *)(&dst0[i * dst_stride0 + j]) = + _mm_cvtsi128_si32(res_0); + *(uint32_t *)(&dst0[i * dst_stride0 + j + dst_stride0]) = + _mm_cvtsi128_si32(res_1); + } } else { - *(uint32_t *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_0); - *(uint32_t *)(&dst0[i * dst_stride0 + j + dst_stride0]) = - _mm_cvtsi128_si32(res_1); + const __m128i res_0 = _mm256_castsi256_si128(res_unsigned); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); + + const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), + res_1); } - } else { - const __m128i res_0 = _mm256_castsi256_si128(res_unsigned); - _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); + } + } + } else { + const int fo_horiz = filter_params_x->taps / 2 - 1; + const uint8_t *const src_ptr = src - fo_horiz; + + filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); + filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3)); + for (i = 0; i < h; i += 2) { + const uint8_t *src_data = src_ptr + i * src_stride; + CONV_BUF_TYPE *dst_data = dst + i * dst_stride; + for (j = 0; j < w; j += 8) { + const __m256i data = + load_line2_avx2(&src_data[j], &src_data[j + src_stride]); + + __m256i res = convolve_lowbd_x(data, coeffs, filt); + + res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const), round_shift); + + res = _mm256_slli_epi16(res, bits); + + const __m256i res_unsigned = 
_mm256_add_epi16(res, offset_const); + + // Accumulate values into the destination buffer + if (do_average) { + const __m256i data_ref_0 = + load_line2_avx2(&dst_data[j], &dst_data[j + dst_stride]); + const __m256i comp_avg_res = + comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); + + const __m256i round_result = convolve_rounding( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + + const __m256i res_8 = _mm256_packus_epi16(round_result, round_result); + const __m128i res_0 = _mm256_castsi256_si128(res_8); + const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); - const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1); - _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), - res_1); + if (w > 4) { + _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); + _mm_storel_epi64( + (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1); + } else { + *(uint32_t *)(&dst0[i * dst_stride0 + j]) = + _mm_cvtsi128_si32(res_0); + *(uint32_t *)(&dst0[i * dst_stride0 + j + dst_stride0]) = + _mm_cvtsi128_si32(res_1); + } + } else { + const __m128i res_0 = _mm256_castsi256_si128(res_unsigned); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); + + const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), + res_1); + } } } } } -void av1_jnt_convolve_y_avx2(const uint8_t *src, int src_stride, uint8_t *dst0, - int dst_stride0, int w, int h, - const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, - ConvolveParams *conv_params) { +void av1_dist_wtd_convolve_y_avx2(const uint8_t *src, int src_stride, + uint8_t *dst0, int dst_stride0, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params) { CONV_BUF_TYPE *dst = 
conv_params->dst; int dst_stride = conv_params->dst_stride; const int bd = 8; - int i, j; - const int fo_vert = filter_params_y->taps / 2 - 1; - const uint8_t *const src_ptr = src - fo_vert * src_stride; + int i, j, is_vert_4tap = 0; // +1 to compensate for dividing the filter coeffs by 2 const int left_shift = FILTER_BITS - conv_params->round_0 + 1; const __m256i round_const = @@ -146,7 +204,7 @@ void av1_jnt_convolve_y_avx2(const uint8_t *src, int src_stride, uint8_t *dst0, const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1); const __m256i wt = unpack_weights_avx2(conv_params); const int do_average = conv_params->do_average; - const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; const int offset_0 = bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); @@ -162,201 +220,395 @@ void av1_jnt_convolve_y_avx2(const uint8_t *src, int src_stride, uint8_t *dst0, assert((FILTER_BITS - conv_params->round_0) >= 0); - prepare_coeffs_lowbd(filter_params_y, subpel_y_q4, coeffs); + prepare_coeffs_lowbd(filter_params_y, subpel_y_qn, coeffs); (void)conv_params; (void)filter_params_x; - (void)subpel_x_q4; - - for (j = 0; j < w; j += 16) { - const uint8_t *data = &src_ptr[j]; - __m256i src6; - // Load lines a and b. 
Line a to lower 128, line b to upper 128 - { - __m256i src_ab[7]; - __m256i src_a[7]; - src_a[0] = _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data)); - for (int kk = 0; kk < 6; ++kk) { - data += src_stride; - src_a[kk + 1] = - _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data)); - src_ab[kk] = _mm256_permute2x128_si256(src_a[kk], src_a[kk + 1], 0x20); + (void)subpel_x_qn; + + // Condition for checking valid vert_filt taps + if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs[0], coeffs[3]), 0))) + is_vert_4tap = 1; + + if (is_vert_4tap) { + const int fo_vert = 1; + const uint8_t *const src_ptr = src - fo_vert * src_stride; + for (j = 0; j < w; j += 16) { + const uint8_t *data = &src_ptr[j]; + __m256i src4; + // Load lines a and b. Line a to lower 128, line b to upper 128 + { + __m256i src_ab[4]; + __m256i src_a[5]; + src_a[0] = _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data)); + for (int kk = 0; kk < 4; ++kk) { + data += src_stride; + src_a[kk + 1] = + _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data)); + src_ab[kk] = + _mm256_permute2x128_si256(src_a[kk], src_a[kk + 1], 0x20); + } + src4 = src_a[4]; + s[0] = _mm256_unpacklo_epi8(src_ab[0], src_ab[1]); + s[1] = _mm256_unpacklo_epi8(src_ab[2], src_ab[3]); + + s[3] = _mm256_unpackhi_epi8(src_ab[0], src_ab[1]); + s[4] = _mm256_unpackhi_epi8(src_ab[2], src_ab[3]); } - src6 = src_a[6]; - s[0] = _mm256_unpacklo_epi8(src_ab[0], src_ab[1]); - s[1] = _mm256_unpacklo_epi8(src_ab[2], src_ab[3]); - s[2] = _mm256_unpacklo_epi8(src_ab[4], src_ab[5]); - s[4] = _mm256_unpackhi_epi8(src_ab[0], src_ab[1]); - s[5] = _mm256_unpackhi_epi8(src_ab[2], src_ab[3]); - s[6] = _mm256_unpackhi_epi8(src_ab[4], src_ab[5]); - } - for (i = 0; i < h; i += 2) { - data = &src_ptr[(i + 7) * src_stride + j]; - const __m256i src7 = - _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data)); - const __m256i src_67a = _mm256_permute2x128_si256(src6, src7, 0x20); + for (i = 0; i < h; i += 2) { + data = &src_ptr[(i + 5) * src_stride 
+ j]; + const __m256i src5 = + _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data)); + const __m256i src_45a = _mm256_permute2x128_si256(src4, src5, 0x20); - src6 = _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + src_stride))); - const __m256i src_78a = _mm256_permute2x128_si256(src7, src6, 0x20); + src4 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + src_stride))); + const __m256i src_56a = _mm256_permute2x128_si256(src5, src4, 0x20); - s[3] = _mm256_unpacklo_epi8(src_67a, src_78a); - s[7] = _mm256_unpackhi_epi8(src_67a, src_78a); + s[2] = _mm256_unpacklo_epi8(src_45a, src_56a); + s[5] = _mm256_unpackhi_epi8(src_45a, src_56a); - __m256i res_lo = convolve_lowbd(s, coeffs); + __m256i res_lo = convolve_lowbd_4tap(s, coeffs + 1); - res_lo = _mm256_add_epi16(res_lo, offset_const_1); + res_lo = _mm256_add_epi16(res_lo, offset_const_1); - const __m256i res_lo_0_32b = _mm256_unpacklo_epi16(res_lo, zero); - const __m256i res_lo_0_shift = - _mm256_slli_epi32(res_lo_0_32b, left_shift); - const __m256i res_lo_0_round = _mm256_sra_epi32( - _mm256_add_epi32(res_lo_0_shift, round_const), round_shift); + const __m256i res_lo_0_32b = _mm256_unpacklo_epi16(res_lo, zero); + const __m256i res_lo_0_shift = + _mm256_slli_epi32(res_lo_0_32b, left_shift); + const __m256i res_lo_0_round = _mm256_sra_epi32( + _mm256_add_epi32(res_lo_0_shift, round_const), round_shift); - const __m256i res_lo_1_32b = _mm256_unpackhi_epi16(res_lo, zero); - const __m256i res_lo_1_shift = - _mm256_slli_epi32(res_lo_1_32b, left_shift); - const __m256i res_lo_1_round = _mm256_sra_epi32( - _mm256_add_epi32(res_lo_1_shift, round_const), round_shift); + const __m256i res_lo_1_32b = _mm256_unpackhi_epi16(res_lo, zero); + const __m256i res_lo_1_shift = + _mm256_slli_epi32(res_lo_1_32b, left_shift); + const __m256i res_lo_1_round = _mm256_sra_epi32( + _mm256_add_epi32(res_lo_1_shift, round_const), round_shift); - const __m256i res_lo_round = - _mm256_packs_epi32(res_lo_0_round, 
res_lo_1_round); + const __m256i res_lo_round = + _mm256_packs_epi32(res_lo_0_round, res_lo_1_round); - const __m256i res_lo_unsigned = - _mm256_add_epi16(res_lo_round, offset_const_2); + const __m256i res_lo_unsigned = + _mm256_add_epi16(res_lo_round, offset_const_2); - if (w - j < 16) { - if (do_average) { - const __m256i data_ref_0 = load_line2_avx2( - &dst[i * dst_stride + j], &dst[i * dst_stride + j + dst_stride]); - const __m256i comp_avg_res = - comp_avg(&data_ref_0, &res_lo_unsigned, &wt, use_jnt_comp_avg); + if (w - j < 16) { + if (do_average) { + const __m256i data_ref_0 = + load_line2_avx2(&dst[i * dst_stride + j], + &dst[i * dst_stride + j + dst_stride]); + const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_lo_unsigned, + &wt, use_dist_wtd_comp_avg); - const __m256i round_result = convolve_rounding( - &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + const __m256i round_result = convolve_rounding( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); - const __m256i res_8 = _mm256_packus_epi16(round_result, round_result); - const __m128i res_0 = _mm256_castsi256_si128(res_8); - const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); + const __m256i res_8 = + _mm256_packus_epi16(round_result, round_result); + const __m128i res_0 = _mm256_castsi256_si128(res_8); + const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); - if (w - j > 4) { - _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); - _mm_storel_epi64( - (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1); + if (w - j > 4) { + _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); + _mm_storel_epi64( + (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), + res_1); + } else { + *(uint32_t *)(&dst0[i * dst_stride0 + j]) = + _mm_cvtsi128_si32(res_0); + *(uint32_t *)(&dst0[i * dst_stride0 + j + dst_stride0]) = + _mm_cvtsi128_si32(res_1); + } } else { - *(uint32_t *)(&dst0[i * dst_stride0 + j]) = - _mm_cvtsi128_si32(res_0); - 
*(uint32_t *)(&dst0[i * dst_stride0 + j + dst_stride0]) = - _mm_cvtsi128_si32(res_1); + const __m128i res_0 = _mm256_castsi256_si128(res_lo_unsigned); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); + + const __m128i res_1 = _mm256_extracti128_si256(res_lo_unsigned, 1); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), + res_1); } } else { - const __m128i res_0 = _mm256_castsi256_si128(res_lo_unsigned); - _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); + __m256i res_hi = convolve_lowbd_4tap(s + 3, coeffs + 1); - const __m128i res_1 = _mm256_extracti128_si256(res_lo_unsigned, 1); - _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), - res_1); + res_hi = _mm256_add_epi16(res_hi, offset_const_1); + + const __m256i res_hi_0_32b = _mm256_unpacklo_epi16(res_hi, zero); + const __m256i res_hi_0_shift = + _mm256_slli_epi32(res_hi_0_32b, left_shift); + const __m256i res_hi_0_round = _mm256_sra_epi32( + _mm256_add_epi32(res_hi_0_shift, round_const), round_shift); + + const __m256i res_hi_1_32b = _mm256_unpackhi_epi16(res_hi, zero); + const __m256i res_hi_1_shift = + _mm256_slli_epi32(res_hi_1_32b, left_shift); + const __m256i res_hi_1_round = _mm256_sra_epi32( + _mm256_add_epi32(res_hi_1_shift, round_const), round_shift); + + const __m256i res_hi_round = + _mm256_packs_epi32(res_hi_0_round, res_hi_1_round); + + const __m256i res_hi_unsigned = + _mm256_add_epi16(res_hi_round, offset_const_2); + + if (do_average) { + const __m256i data_ref_0_lo = + load_line2_avx2(&dst[i * dst_stride + j], + &dst[i * dst_stride + j + dst_stride]); + + const __m256i data_ref_0_hi = + load_line2_avx2(&dst[i * dst_stride + j + 8], + &dst[i * dst_stride + j + 8 + dst_stride]); + + const __m256i comp_avg_res_lo = comp_avg( + &data_ref_0_lo, &res_lo_unsigned, &wt, use_dist_wtd_comp_avg); + + const __m256i comp_avg_res_hi = comp_avg( + &data_ref_0_hi, &res_hi_unsigned, &wt, use_dist_wtd_comp_avg); + + const __m256i round_result_lo = + 
convolve_rounding(&comp_avg_res_lo, &offset_const, + &rounding_const, rounding_shift); + + const __m256i round_result_hi = + convolve_rounding(&comp_avg_res_hi, &offset_const, + &rounding_const, rounding_shift); + + const __m256i res_8 = + _mm256_packus_epi16(round_result_lo, round_result_hi); + const __m128i res_0 = _mm256_castsi256_si128(res_8); + const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); + + _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); + _mm_store_si128( + (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1); + + } else { + const __m128i res_lo_0 = _mm256_castsi256_si128(res_lo_unsigned); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_lo_0); + + const __m128i res_lo_1 = + _mm256_extracti128_si256(res_lo_unsigned, 1); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), + res_lo_1); + + const __m128i res_hi_0 = _mm256_castsi256_si128(res_hi_unsigned); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + 8]), + res_hi_0); + + const __m128i res_hi_1 = + _mm256_extracti128_si256(res_hi_unsigned, 1); + _mm_store_si128( + (__m128i *)(&dst[i * dst_stride + j + 8 + dst_stride]), + res_hi_1); + } } - } else { - __m256i res_hi = convolve_lowbd(s + 4, coeffs); + s[0] = s[1]; + s[1] = s[2]; - res_hi = _mm256_add_epi16(res_hi, offset_const_1); + s[3] = s[4]; + s[4] = s[5]; + } + } + } else { + const int fo_vert = filter_params_y->taps / 2 - 1; + const uint8_t *const src_ptr = src - fo_vert * src_stride; + for (j = 0; j < w; j += 16) { + const uint8_t *data = &src_ptr[j]; + __m256i src6; + // Load lines a and b. 
Line a to lower 128, line b to upper 128 + { + __m256i src_ab[7]; + __m256i src_a[7]; + src_a[0] = _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data)); + for (int kk = 0; kk < 6; ++kk) { + data += src_stride; + src_a[kk + 1] = + _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data)); + src_ab[kk] = + _mm256_permute2x128_si256(src_a[kk], src_a[kk + 1], 0x20); + } + src6 = src_a[6]; + s[0] = _mm256_unpacklo_epi8(src_ab[0], src_ab[1]); + s[1] = _mm256_unpacklo_epi8(src_ab[2], src_ab[3]); + s[2] = _mm256_unpacklo_epi8(src_ab[4], src_ab[5]); + s[4] = _mm256_unpackhi_epi8(src_ab[0], src_ab[1]); + s[5] = _mm256_unpackhi_epi8(src_ab[2], src_ab[3]); + s[6] = _mm256_unpackhi_epi8(src_ab[4], src_ab[5]); + } - const __m256i res_hi_0_32b = _mm256_unpacklo_epi16(res_hi, zero); - const __m256i res_hi_0_shift = - _mm256_slli_epi32(res_hi_0_32b, left_shift); - const __m256i res_hi_0_round = _mm256_sra_epi32( - _mm256_add_epi32(res_hi_0_shift, round_const), round_shift); + for (i = 0; i < h; i += 2) { + data = &src_ptr[(i + 7) * src_stride + j]; + const __m256i src7 = + _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data)); + const __m256i src_67a = _mm256_permute2x128_si256(src6, src7, 0x20); - const __m256i res_hi_1_32b = _mm256_unpackhi_epi16(res_hi, zero); - const __m256i res_hi_1_shift = - _mm256_slli_epi32(res_hi_1_32b, left_shift); - const __m256i res_hi_1_round = _mm256_sra_epi32( - _mm256_add_epi32(res_hi_1_shift, round_const), round_shift); + src6 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + src_stride))); + const __m256i src_78a = _mm256_permute2x128_si256(src7, src6, 0x20); - const __m256i res_hi_round = - _mm256_packs_epi32(res_hi_0_round, res_hi_1_round); + s[3] = _mm256_unpacklo_epi8(src_67a, src_78a); + s[7] = _mm256_unpackhi_epi8(src_67a, src_78a); - const __m256i res_hi_unsigned = - _mm256_add_epi16(res_hi_round, offset_const_2); + __m256i res_lo = convolve_lowbd(s, coeffs); - if (do_average) { - const __m256i data_ref_0_lo = 
load_line2_avx2( - &dst[i * dst_stride + j], &dst[i * dst_stride + j + dst_stride]); + res_lo = _mm256_add_epi16(res_lo, offset_const_1); - const __m256i data_ref_0_hi = - load_line2_avx2(&dst[i * dst_stride + j + 8], - &dst[i * dst_stride + j + 8 + dst_stride]); + const __m256i res_lo_0_32b = _mm256_unpacklo_epi16(res_lo, zero); + const __m256i res_lo_0_shift = + _mm256_slli_epi32(res_lo_0_32b, left_shift); + const __m256i res_lo_0_round = _mm256_sra_epi32( + _mm256_add_epi32(res_lo_0_shift, round_const), round_shift); - const __m256i comp_avg_res_lo = - comp_avg(&data_ref_0_lo, &res_lo_unsigned, &wt, use_jnt_comp_avg); + const __m256i res_lo_1_32b = _mm256_unpackhi_epi16(res_lo, zero); + const __m256i res_lo_1_shift = + _mm256_slli_epi32(res_lo_1_32b, left_shift); + const __m256i res_lo_1_round = _mm256_sra_epi32( + _mm256_add_epi32(res_lo_1_shift, round_const), round_shift); - const __m256i comp_avg_res_hi = - comp_avg(&data_ref_0_hi, &res_hi_unsigned, &wt, use_jnt_comp_avg); + const __m256i res_lo_round = + _mm256_packs_epi32(res_lo_0_round, res_lo_1_round); - const __m256i round_result_lo = convolve_rounding( - &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift); + const __m256i res_lo_unsigned = + _mm256_add_epi16(res_lo_round, offset_const_2); - const __m256i round_result_hi = convolve_rounding( - &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift); + if (w - j < 16) { + if (do_average) { + const __m256i data_ref_0 = + load_line2_avx2(&dst[i * dst_stride + j], + &dst[i * dst_stride + j + dst_stride]); + const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_lo_unsigned, + &wt, use_dist_wtd_comp_avg); - const __m256i res_8 = - _mm256_packus_epi16(round_result_lo, round_result_hi); - const __m128i res_0 = _mm256_castsi256_si128(res_8); - const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); + const __m256i round_result = convolve_rounding( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + + const __m256i res_8 
= + _mm256_packus_epi16(round_result, round_result); + const __m128i res_0 = _mm256_castsi256_si128(res_8); + const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); - _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); - _mm_store_si128( - (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1); + if (w - j > 4) { + _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); + _mm_storel_epi64( + (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), + res_1); + } else { + *(uint32_t *)(&dst0[i * dst_stride0 + j]) = + _mm_cvtsi128_si32(res_0); + *(uint32_t *)(&dst0[i * dst_stride0 + j + dst_stride0]) = + _mm_cvtsi128_si32(res_1); + } + } else { + const __m128i res_0 = _mm256_castsi256_si128(res_lo_unsigned); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); + const __m128i res_1 = _mm256_extracti128_si256(res_lo_unsigned, 1); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), + res_1); + } } else { - const __m128i res_lo_0 = _mm256_castsi256_si128(res_lo_unsigned); - _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_lo_0); + __m256i res_hi = convolve_lowbd(s + 4, coeffs); - const __m128i res_lo_1 = _mm256_extracti128_si256(res_lo_unsigned, 1); - _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), - res_lo_1); + res_hi = _mm256_add_epi16(res_hi, offset_const_1); + + const __m256i res_hi_0_32b = _mm256_unpacklo_epi16(res_hi, zero); + const __m256i res_hi_0_shift = + _mm256_slli_epi32(res_hi_0_32b, left_shift); + const __m256i res_hi_0_round = _mm256_sra_epi32( + _mm256_add_epi32(res_hi_0_shift, round_const), round_shift); + + const __m256i res_hi_1_32b = _mm256_unpackhi_epi16(res_hi, zero); + const __m256i res_hi_1_shift = + _mm256_slli_epi32(res_hi_1_32b, left_shift); + const __m256i res_hi_1_round = _mm256_sra_epi32( + _mm256_add_epi32(res_hi_1_shift, round_const), round_shift); + + const __m256i res_hi_round = + _mm256_packs_epi32(res_hi_0_round, res_hi_1_round); - const 
__m128i res_hi_0 = _mm256_castsi256_si128(res_hi_unsigned); - _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + 8]), res_hi_0); + const __m256i res_hi_unsigned = + _mm256_add_epi16(res_hi_round, offset_const_2); - const __m128i res_hi_1 = _mm256_extracti128_si256(res_hi_unsigned, 1); - _mm_store_si128( - (__m128i *)(&dst[i * dst_stride + j + 8 + dst_stride]), res_hi_1); + if (do_average) { + const __m256i data_ref_0_lo = + load_line2_avx2(&dst[i * dst_stride + j], + &dst[i * dst_stride + j + dst_stride]); + + const __m256i data_ref_0_hi = + load_line2_avx2(&dst[i * dst_stride + j + 8], + &dst[i * dst_stride + j + 8 + dst_stride]); + + const __m256i comp_avg_res_lo = comp_avg( + &data_ref_0_lo, &res_lo_unsigned, &wt, use_dist_wtd_comp_avg); + + const __m256i comp_avg_res_hi = comp_avg( + &data_ref_0_hi, &res_hi_unsigned, &wt, use_dist_wtd_comp_avg); + + const __m256i round_result_lo = + convolve_rounding(&comp_avg_res_lo, &offset_const, + &rounding_const, rounding_shift); + + const __m256i round_result_hi = + convolve_rounding(&comp_avg_res_hi, &offset_const, + &rounding_const, rounding_shift); + + const __m256i res_8 = + _mm256_packus_epi16(round_result_lo, round_result_hi); + const __m128i res_0 = _mm256_castsi256_si128(res_8); + const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); + + _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); + _mm_store_si128( + (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1); + + } else { + const __m128i res_lo_0 = _mm256_castsi256_si128(res_lo_unsigned); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_lo_0); + + const __m128i res_lo_1 = + _mm256_extracti128_si256(res_lo_unsigned, 1); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), + res_lo_1); + + const __m128i res_hi_0 = _mm256_castsi256_si128(res_hi_unsigned); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + 8]), + res_hi_0); + + const __m128i res_hi_1 = + _mm256_extracti128_si256(res_hi_unsigned, 1); + 
_mm_store_si128( + (__m128i *)(&dst[i * dst_stride + j + 8 + dst_stride]), + res_hi_1); + } } - } - s[0] = s[1]; - s[1] = s[2]; - s[2] = s[3]; + s[0] = s[1]; + s[1] = s[2]; + s[2] = s[3]; - s[4] = s[5]; - s[5] = s[6]; - s[6] = s[7]; + s[4] = s[5]; + s[5] = s[6]; + s[6] = s[7]; + } } } } -void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst0, - int dst_stride0, int w, int h, - const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, - ConvolveParams *conv_params) { +void av1_dist_wtd_convolve_2d_avx2(const uint8_t *src, int src_stride, + uint8_t *dst0, int dst_stride0, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params) { CONV_BUF_TYPE *dst = conv_params->dst; int dst_stride = conv_params->dst_stride; const int bd = 8; DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]); - int im_h = h + filter_params_y->taps - 1; + int im_stride = 8; - int i, j; - const int fo_vert = filter_params_y->taps / 2 - 1; - const int fo_horiz = filter_params_x->taps / 2 - 1; - const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; + int i, is_horiz_4tap = 0, is_vert_4tap = 0; const __m256i wt = unpack_weights_avx2(conv_params); const int do_average = conv_params->do_average; - const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; const int offset_0 = bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); @@ -364,18 +616,9 @@ void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst0, const int rounding_shift = 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; const __m256i rounding_const = _mm256_set1_epi16((1 << rounding_shift) >> 1); - 
__m256i filt[4], s[8], coeffs_x[4], coeffs_y[4]; assert(conv_params->round_0 > 0); - filt[0] = _mm256_load_si256((__m256i const *)filt_global_avx2); - filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32)); - filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); - filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3)); - - prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs_x); - prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y); - const __m256i round_const_h = _mm256_set1_epi16( ((1 << (conv_params->round_0 - 1)) >> 1) + (1 << (bd + FILTER_BITS - 2))); const __m128i round_shift_h = _mm_cvtsi32_si128(conv_params->round_0 - 1); @@ -385,9 +628,29 @@ void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst0, (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1))); const __m128i round_shift_v = _mm_cvtsi32_si128(conv_params->round_1); - for (j = 0; j < w; j += 8) { - /* Horizontal filter */ - { + __m256i filt[4], coeffs_x[4], coeffs_y[4]; + + filt[0] = _mm256_load_si256((__m256i const *)filt_global_avx2); + filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32)); + + prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs_x); + prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y); + + // Condition for checking valid horz_filt taps + if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs_x[0], coeffs_x[3]), 0))) + is_horiz_4tap = 1; + + // Condition for checking valid vert_filt taps + if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs_y[0], coeffs_y[3]), 0))) + is_vert_4tap = 1; + + if (is_horiz_4tap) { + int im_h = h + filter_params_y->taps - 1; + const int fo_vert = filter_params_y->taps / 2 - 1; + const int fo_horiz = 1; + const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; + for (int j = 0; j < w; j += 8) { + /* Horizontal filter */ const uint8_t *src_h = src_ptr + j; for (i = 0; i < im_h; i += 2) { __m256i data = @@ -396,49 +659,59 @@ void 
av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst0, data = _mm256_inserti128_si256( data, _mm_loadu_si128((__m128i *)(src_h + src_stride)), 1); src_h += (src_stride << 1); - __m256i res = convolve_lowbd_x(data, coeffs_x, filt); + __m256i res = convolve_lowbd_x_4tap(data, coeffs_x + 1, filt); res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h); _mm256_store_si256((__m256i *)&im_block[i * im_stride], res); } + DIST_WTD_CONVOLVE_VERTICAL_FILTER_8TAP; } + } else if (is_vert_4tap) { + int im_h = h + 3; + const int fo_vert = 1; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; + + filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); + filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3)); + + for (int j = 0; j < w; j += 8) { + /* Horizontal filter */ + const uint8_t *src_h = src_ptr + j; + DIST_WTD_CONVOLVE_HORIZONTAL_FILTER_8TAP; - /* Vertical filter */ - { + /* Vertical filter */ + __m256i s[6]; __m256i s0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride)); __m256i s1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride)); __m256i s2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride)); __m256i s3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride)); - __m256i s4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride)); - __m256i s5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride)); s[0] = _mm256_unpacklo_epi16(s0, s1); s[1] = _mm256_unpacklo_epi16(s2, s3); - s[2] = _mm256_unpacklo_epi16(s4, s5); - s[4] = _mm256_unpackhi_epi16(s0, s1); - s[5] = _mm256_unpackhi_epi16(s2, s3); - s[6] = _mm256_unpackhi_epi16(s4, s5); + s[3] = _mm256_unpackhi_epi16(s0, s1); + s[4] = _mm256_unpackhi_epi16(s2, s3); for (i = 0; i < h; i += 2) { const int16_t *data = &im_block[i * im_stride]; - const __m256i s6 = - _mm256_loadu_si256((__m256i *)(data + 6 * im_stride)); - const __m256i s7 = 
- _mm256_loadu_si256((__m256i *)(data + 7 * im_stride)); + const __m256i s4 = + _mm256_loadu_si256((__m256i *)(data + 4 * im_stride)); + const __m256i s5 = + _mm256_loadu_si256((__m256i *)(data + 5 * im_stride)); - s[3] = _mm256_unpacklo_epi16(s6, s7); - s[7] = _mm256_unpackhi_epi16(s6, s7); + s[2] = _mm256_unpacklo_epi16(s4, s5); + s[5] = _mm256_unpackhi_epi16(s4, s5); - const __m256i res_a = convolve(s, coeffs_y); + const __m256i res_a = convolve_4tap(s, coeffs_y + 1); const __m256i res_a_round = _mm256_sra_epi32( _mm256_add_epi32(res_a, round_const_v), round_shift_v); if (w - j > 4) { - const __m256i res_b = convolve(s + 4, coeffs_y); + const __m256i res_b = convolve_4tap(s + 3, coeffs_y + 1); const __m256i res_b_round = _mm256_sra_epi32( _mm256_add_epi32(res_b, round_const_v), round_shift_v); const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_b_round); @@ -448,8 +721,8 @@ void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst0, const __m256i data_ref_0 = load_line2_avx2(&dst[i * dst_stride + j], &dst[i * dst_stride + j + dst_stride]); - const __m256i comp_avg_res = - comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg); + const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_unsigned, + &wt, use_dist_wtd_comp_avg); const __m256i round_result = convolve_rounding( &comp_avg_res, &offset_const, &rounding_const, rounding_shift); @@ -479,8 +752,8 @@ void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst0, load_line2_avx2(&dst[i * dst_stride + j], &dst[i * dst_stride + j + dst_stride]); - const __m256i comp_avg_res = - comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg); + const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_unsigned, + &wt, use_dist_wtd_comp_avg); const __m256i round_result = convolve_rounding( &comp_avg_res, &offset_const, &rounding_const, rounding_shift); @@ -504,38 +777,49 @@ void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst0, res_1); } } - 
s[0] = s[1]; s[1] = s[2]; - s[2] = s[3]; - + s[3] = s[4]; s[4] = s[5]; - s[5] = s[6]; - s[6] = s[7]; } } + } else { + int im_h = h + filter_params_y->taps - 1; + const int fo_vert = filter_params_y->taps / 2 - 1; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; + + filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); + filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3)); + + for (int j = 0; j < w; j += 8) { + /* Horizontal filter */ + const uint8_t *src_h = src_ptr + j; + DIST_WTD_CONVOLVE_HORIZONTAL_FILTER_8TAP; + + DIST_WTD_CONVOLVE_VERTICAL_FILTER_8TAP; + } } } -void av1_jnt_convolve_2d_copy_avx2(const uint8_t *src, int src_stride, - uint8_t *dst0, int dst_stride0, int w, int h, - const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, - ConvolveParams *conv_params) { +void av1_dist_wtd_convolve_2d_copy_avx2( + const uint8_t *src, int src_stride, uint8_t *dst0, int dst_stride0, int w, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params) { const int bd = 8; CONV_BUF_TYPE *dst = conv_params->dst; int dst_stride = conv_params->dst_stride; (void)filter_params_x; (void)filter_params_y; - (void)subpel_x_q4; - (void)subpel_y_q4; + (void)subpel_x_qn; + (void)subpel_y_qn; const int bits = FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0; const __m128i left_shift = _mm_cvtsi32_si128(bits); const int do_average = conv_params->do_average; - const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; const __m256i wt = unpack_weights_avx2(conv_params); const __m256i zero = _mm256_setzero_si256(); @@ -562,7 +846,7 @@ void av1_jnt_convolve_2d_copy_avx2(const uint8_t *src, int 
src_stride, _mm256_loadu_si256((__m256i *)(&dst[i * dst_stride + j])); const __m256i comp_avg_res = - comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg); + comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); const __m256i round_result = convolve_rounding( &comp_avg_res, &offset_const, &rounding_const, rounding_shift); @@ -600,7 +884,7 @@ void av1_jnt_convolve_2d_copy_avx2(const uint8_t *src, int src_stride, const __m256i data_ref_0 = load_line2_avx2( &dst[i * dst_stride + j], &dst[i * dst_stride + j + dst_stride]); const __m256i comp_avg_res = - comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg); + comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); const __m256i round_result = convolve_rounding( &comp_avg_res, &offset_const, &rounding_const, rounding_shift); diff --git a/media/libaom/src/av1/common/x86/jnt_convolve_sse2.c b/media/libaom/src/av1/common/x86/jnt_convolve_sse2.c index 87dc3242e..f8f640a11 100644 --- a/media/libaom/src/av1/common/x86/jnt_convolve_sse2.c +++ b/media/libaom/src/av1/common/x86/jnt_convolve_sse2.c @@ -16,12 +16,12 @@ #include "aom_dsp/aom_filter.h" #include "aom_dsp/x86/convolve_sse2.h" -void av1_jnt_convolve_x_sse2(const uint8_t *src, int src_stride, uint8_t *dst0, - int dst_stride0, int w, int h, - const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, - ConvolveParams *conv_params) { +void av1_dist_wtd_convolve_x_sse2(const uint8_t *src, int src_stride, + uint8_t *dst0, int dst_stride0, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params) { const int bd = 8; CONV_BUF_TYPE *dst = conv_params->dst; const int dst_stride = conv_params->dst_stride; @@ -37,7 +37,7 @@ void av1_jnt_convolve_x_sse2(const uint8_t *src, int src_stride, uint8_t *dst0, const __m128i wt1 = _mm_set1_epi16(w1); const 
__m128i wt = _mm_unpacklo_epi16(wt0, wt1); const int do_average = conv_params->do_average; - const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; const int offset_0 = bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); @@ -48,9 +48,9 @@ void av1_jnt_convolve_x_sse2(const uint8_t *src, int src_stride, uint8_t *dst0, __m128i coeffs[4]; (void)filter_params_y; - (void)subpel_y_q4; + (void)subpel_y_qn; - prepare_coeffs(filter_params_x, subpel_x_q4, coeffs); + prepare_coeffs(filter_params_x, subpel_x_qn, coeffs); if (w == 4) { do { @@ -77,7 +77,7 @@ void av1_jnt_convolve_x_sse2(const uint8_t *src, int src_stride, uint8_t *dst0, const __m128i data_ref_0 = _mm_loadu_si128((__m128i *)dst); const __m128i comp_avg_res = - comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg); + comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); const __m128i round_result = convolve_rounding( &comp_avg_res, &offset_const, &rounding_const, rounding_shift); @@ -134,7 +134,7 @@ void av1_jnt_convolve_x_sse2(const uint8_t *src, int src_stride, uint8_t *dst0, _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])); const __m128i comp_avg_res = - comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg); + comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); const __m128i round_result = convolve_rounding( &comp_avg_res, &offset_const, &rounding_const, rounding_shift); @@ -150,12 +150,12 @@ void av1_jnt_convolve_x_sse2(const uint8_t *src, int src_stride, uint8_t *dst0, } } -void av1_jnt_convolve_y_sse2(const uint8_t *src, int src_stride, uint8_t *dst0, - int dst_stride0, int w, int h, - const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, - ConvolveParams *conv_params) { +void av1_dist_wtd_convolve_y_sse2(const uint8_t *src, int 
src_stride, + uint8_t *dst0, int dst_stride0, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params) { const int bd = 8; CONV_BUF_TYPE *dst = conv_params->dst; const int dst_stride = conv_params->dst_stride; @@ -167,7 +167,7 @@ void av1_jnt_convolve_y_sse2(const uint8_t *src, int src_stride, uint8_t *dst0, const __m128i wt1 = _mm_set1_epi16(conv_params->bck_offset); const __m128i wt = _mm_unpacklo_epi16(wt0, wt1); const int do_average = conv_params->do_average; - const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; const int offset_0 = bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); @@ -180,9 +180,9 @@ void av1_jnt_convolve_y_sse2(const uint8_t *src, int src_stride, uint8_t *dst0, __m128i coeffs[4]; (void)filter_params_x; - (void)subpel_x_q4; + (void)subpel_x_qn; - prepare_coeffs(filter_params_y, subpel_y_q4, coeffs); + prepare_coeffs(filter_params_y, subpel_y_qn, coeffs); if (w == 4) { __m128i s[8], src6, res, res_shift; @@ -225,7 +225,7 @@ void av1_jnt_convolve_y_sse2(const uint8_t *src, int src_stride, uint8_t *dst0, const __m128i data_ref_0 = _mm_loadu_si128((__m128i *)dst); const __m128i comp_avg_res = - comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg); + comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); const __m128i round_result = convolve_rounding( &comp_avg_res, &offset_const, &rounding_const, rounding_shift); @@ -254,7 +254,7 @@ void av1_jnt_convolve_y_sse2(const uint8_t *src, int src_stride, uint8_t *dst0, const __m128i data_ref_0 = _mm_loadu_si128((__m128i *)dst); const __m128i comp_avg_res = - comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg); + comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); const __m128i 
round_result = convolve_rounding( &comp_avg_res, &offset_const, &rounding_const, rounding_shift); @@ -331,7 +331,7 @@ void av1_jnt_convolve_y_sse2(const uint8_t *src, int src_stride, uint8_t *dst0, _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])); const __m128i comp_avg_res = - comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg); + comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); const __m128i round_result = convolve_rounding( &comp_avg_res, &offset_const, &rounding_const, rounding_shift); @@ -360,7 +360,7 @@ void av1_jnt_convolve_y_sse2(const uint8_t *src, int src_stride, uint8_t *dst0, _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])); const __m128i comp_avg_res = - comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg); + comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); const __m128i round_result = convolve_rounding( &comp_avg_res, &offset_const, &rounding_const, rounding_shift); @@ -383,3 +383,233 @@ void av1_jnt_convolve_y_sse2(const uint8_t *src, int src_stride, uint8_t *dst0, } while (j < w); } } + +void av1_dist_wtd_convolve_2d_sse2(const uint8_t *src, int src_stride, + uint8_t *dst0, int dst_stride0, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params) { + CONV_BUF_TYPE *dst = conv_params->dst; + int dst_stride = conv_params->dst_stride; + const int bd = 8; + + DECLARE_ALIGNED(16, int16_t, + im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]); + int im_h = h + filter_params_y->taps - 1; + int im_stride = MAX_SB_SIZE; + int i, j; + const int fo_vert = filter_params_y->taps / 2 - 1; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const int do_average = conv_params->do_average; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; + const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; + + const __m128i zero = 
_mm_setzero_si128(); + + const int w0 = conv_params->fwd_offset; + const int w1 = conv_params->bck_offset; + const __m128i wt0 = _mm_set1_epi16(w0); + const __m128i wt1 = _mm_set1_epi16(w1); + const __m128i wt = _mm_unpacklo_epi16(wt0, wt1); + + const int offset_0 = + bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); + const __m128i offset_const = _mm_set1_epi16(offset); + const int rounding_shift = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const __m128i rounding_const = _mm_set1_epi16((1 << rounding_shift) >> 1); + + /* Horizontal filter */ + { + const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( + filter_params_x, subpel_x_qn & SUBPEL_MASK); + const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x); + + // coeffs 0 1 0 1 0 1 0 1 + const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); + // coeffs 2 3 2 3 2 3 2 3 + const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); + // coeffs 4 5 4 5 4 5 4 5 + const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); + // coeffs 6 7 6 7 6 7 6 7 + const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); + + const __m128i round_const = _mm_set1_epi32( + ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1))); + const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0); + + for (i = 0; i < im_h; ++i) { + for (j = 0; j < w; j += 8) { + __m128i temp_lo, temp_hi; + const __m128i data = + _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]); + + const __m128i src_lo = _mm_unpacklo_epi8(data, zero); + const __m128i src_hi = _mm_unpackhi_epi8(data, zero); + + // Filter even-index pixels + const __m128i res_0 = _mm_madd_epi16(src_lo, coeff_01); + temp_lo = _mm_srli_si128(src_lo, 4); + temp_hi = 
_mm_slli_si128(src_hi, 12); + const __m128i src_2 = _mm_or_si128(temp_hi, temp_lo); + const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23); + temp_lo = _mm_srli_si128(src_lo, 8); + temp_hi = _mm_slli_si128(src_hi, 8); + const __m128i src_4 = _mm_or_si128(temp_hi, temp_lo); + const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45); + temp_lo = _mm_srli_si128(src_lo, 12); + temp_hi = _mm_slli_si128(src_hi, 4); + const __m128i src_6 = _mm_or_si128(temp_hi, temp_lo); + const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67); + + __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4), + _mm_add_epi32(res_2, res_6)); + res_even = + _mm_sra_epi32(_mm_add_epi32(res_even, round_const), round_shift); + + // Filter odd-index pixels + temp_lo = _mm_srli_si128(src_lo, 2); + temp_hi = _mm_slli_si128(src_hi, 14); + const __m128i src_1 = _mm_or_si128(temp_hi, temp_lo); + const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01); + temp_lo = _mm_srli_si128(src_lo, 6); + temp_hi = _mm_slli_si128(src_hi, 10); + const __m128i src_3 = _mm_or_si128(temp_hi, temp_lo); + const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23); + temp_lo = _mm_srli_si128(src_lo, 10); + temp_hi = _mm_slli_si128(src_hi, 6); + const __m128i src_5 = _mm_or_si128(temp_hi, temp_lo); + const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45); + temp_lo = _mm_srli_si128(src_lo, 14); + temp_hi = _mm_slli_si128(src_hi, 2); + const __m128i src_7 = _mm_or_si128(temp_hi, temp_lo); + const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67); + + __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5), + _mm_add_epi32(res_3, res_7)); + res_odd = + _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), round_shift); + + // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7 + __m128i res = _mm_packs_epi32(res_even, res_odd); + _mm_store_si128((__m128i *)&im_block[i * im_stride + j], res); + } + } + } + + /* Vertical filter */ + { + const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( + filter_params_y, subpel_y_qn & 
SUBPEL_MASK); + const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y); + + // coeffs 0 1 0 1 0 1 0 1 + const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); + // coeffs 2 3 2 3 2 3 2 3 + const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); + // coeffs 4 5 4 5 4 5 4 5 + const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); + // coeffs 6 7 6 7 6 7 6 7 + const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); + + const __m128i round_const = _mm_set1_epi32( + ((1 << conv_params->round_1) >> 1) - + (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1))); + const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1); + + for (i = 0; i < h; ++i) { + for (j = 0; j < w; j += 8) { + // Filter even-index pixels + const int16_t *data = &im_block[i * im_stride + j]; + const __m128i src_0 = + _mm_unpacklo_epi16(*(__m128i *)(data + 0 * im_stride), + *(__m128i *)(data + 1 * im_stride)); + const __m128i src_2 = + _mm_unpacklo_epi16(*(__m128i *)(data + 2 * im_stride), + *(__m128i *)(data + 3 * im_stride)); + const __m128i src_4 = + _mm_unpacklo_epi16(*(__m128i *)(data + 4 * im_stride), + *(__m128i *)(data + 5 * im_stride)); + const __m128i src_6 = + _mm_unpacklo_epi16(*(__m128i *)(data + 6 * im_stride), + *(__m128i *)(data + 7 * im_stride)); + + const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01); + const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23); + const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45); + const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67); + + const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2), + _mm_add_epi32(res_4, res_6)); + + // Filter odd-index pixels + const __m128i src_1 = + _mm_unpackhi_epi16(*(__m128i *)(data + 0 * im_stride), + *(__m128i *)(data + 1 * im_stride)); + const __m128i src_3 = + 
_mm_unpackhi_epi16(*(__m128i *)(data + 2 * im_stride), + *(__m128i *)(data + 3 * im_stride)); + const __m128i src_5 = + _mm_unpackhi_epi16(*(__m128i *)(data + 4 * im_stride), + *(__m128i *)(data + 5 * im_stride)); + const __m128i src_7 = + _mm_unpackhi_epi16(*(__m128i *)(data + 6 * im_stride), + *(__m128i *)(data + 7 * im_stride)); + + const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01); + const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23); + const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45); + const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67); + + const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3), + _mm_add_epi32(res_5, res_7)); + + // Rearrange pixels back into the order 0 ... 7 + const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); + const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); + + const __m128i res_lo_round = + _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift); + const __m128i res_hi_round = + _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift); + + const __m128i res_16b = _mm_packs_epi32(res_lo_round, res_hi_round); + const __m128i res_unsigned = _mm_add_epi16(res_16b, offset_const); + + // Accumulate values into the destination buffer + if (do_average) { + const __m128i data_ref_0 = + _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])); + + const __m128i comp_avg_res = + comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); + + const __m128i round_result = convolve_rounding( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + + const __m128i res_8 = _mm_packus_epi16(round_result, round_result); + + if (w > 4) + _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_8); + else + *(uint32_t *)(&dst0[i * dst_stride0 + j]) = + _mm_cvtsi128_si32(res_8); + } else { + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_unsigned); + } + } + } + } +} diff --git a/media/libaom/src/av1/common/x86/jnt_convolve_ssse3.c 
b/media/libaom/src/av1/common/x86/jnt_convolve_ssse3.c index 822772782..f45e3b267 100644 --- a/media/libaom/src/av1/common/x86/jnt_convolve_ssse3.c +++ b/media/libaom/src/av1/common/x86/jnt_convolve_ssse3.c @@ -16,12 +16,11 @@ #include "aom_dsp/aom_filter.h" #include "aom_dsp/x86/convolve_sse2.h" -void av1_jnt_convolve_2d_ssse3(const uint8_t *src, int src_stride, - uint8_t *dst0, int dst_stride0, int w, int h, - const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, - ConvolveParams *conv_params) { +void av1_dist_wtd_convolve_2d_ssse3( + const uint8_t *src, int src_stride, uint8_t *dst0, int dst_stride0, int w, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params) { CONV_BUF_TYPE *dst = conv_params->dst; int dst_stride = conv_params->dst_stride; const int bd = 8; @@ -34,7 +33,7 @@ void av1_jnt_convolve_2d_ssse3(const uint8_t *src, int src_stride, const int fo_vert = filter_params_y->taps / 2 - 1; const int fo_horiz = filter_params_x->taps / 2 - 1; const int do_average = conv_params->do_average; - const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; const __m128i zero = _mm_setzero_si128(); @@ -56,7 +55,7 @@ void av1_jnt_convolve_2d_ssse3(const uint8_t *src, int src_stride, /* Horizontal filter */ { const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( - filter_params_x, subpel_x_q4 & SUBPEL_MASK); + filter_params_x, subpel_x_qn & SUBPEL_MASK); const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter); // coeffs 0 1 0 1 2 3 2 3 @@ -124,7 +123,7 @@ void av1_jnt_convolve_2d_ssse3(const uint8_t *src, int src_stride, /* Vertical filter */ { const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( - 
filter_params_y, subpel_y_q4 & SUBPEL_MASK); + filter_params_y, subpel_y_qn & SUBPEL_MASK); const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter); // coeffs 0 1 0 1 2 3 2 3 @@ -211,7 +210,7 @@ void av1_jnt_convolve_2d_ssse3(const uint8_t *src, int src_stride, _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])); const __m128i comp_avg_res = - comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg); + comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); const __m128i round_result = convolve_rounding( &comp_avg_res, &offset_const, &rounding_const, rounding_shift); diff --git a/media/libaom/src/av1/common/x86/reconinter_avx2.c b/media/libaom/src/av1/common/x86/reconinter_avx2.c index f645e0454..a38bd8317 100644 --- a/media/libaom/src/av1/common/x86/reconinter_avx2.c +++ b/media/libaom/src/av1/common/x86/reconinter_avx2.c @@ -28,8 +28,8 @@ static INLINE __m256i calc_mask_avx2(const __m256i mask_base, const __m256i s0, } void av1_build_compound_diffwtd_mask_avx2(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, - const uint8_t *src0, int stride0, - const uint8_t *src1, int stride1, + const uint8_t *src0, int src0_stride, + const uint8_t *src1, int src1_stride, int h, int w) { const int mb = (mask_type == DIFFWTD_38_INV) ? 
AOM_BLEND_A64_MAX_ALPHA : 0; const __m256i y_mask_base = _mm256_set1_epi16(38 - mb); @@ -37,18 +37,18 @@ void av1_build_compound_diffwtd_mask_avx2(uint8_t *mask, if (4 == w) { do { const __m128i s0A = xx_loadl_32(src0); - const __m128i s0B = xx_loadl_32(src0 + stride0); - const __m128i s0C = xx_loadl_32(src0 + stride0 * 2); - const __m128i s0D = xx_loadl_32(src0 + stride0 * 3); + const __m128i s0B = xx_loadl_32(src0 + src0_stride); + const __m128i s0C = xx_loadl_32(src0 + src0_stride * 2); + const __m128i s0D = xx_loadl_32(src0 + src0_stride * 3); const __m128i s0AB = _mm_unpacklo_epi32(s0A, s0B); const __m128i s0CD = _mm_unpacklo_epi32(s0C, s0D); const __m128i s0ABCD = _mm_unpacklo_epi64(s0AB, s0CD); const __m256i s0ABCD_w = _mm256_cvtepu8_epi16(s0ABCD); const __m128i s1A = xx_loadl_32(src1); - const __m128i s1B = xx_loadl_32(src1 + stride1); - const __m128i s1C = xx_loadl_32(src1 + stride1 * 2); - const __m128i s1D = xx_loadl_32(src1 + stride1 * 3); + const __m128i s1B = xx_loadl_32(src1 + src1_stride); + const __m128i s1C = xx_loadl_32(src1 + src1_stride * 2); + const __m128i s1D = xx_loadl_32(src1 + src1_stride * 3); const __m128i s1AB = _mm_unpacklo_epi32(s1A, s1B); const __m128i s1CD = _mm_unpacklo_epi32(s1C, s1D); const __m128i s1ABCD = _mm_unpacklo_epi64(s1AB, s1CD); @@ -58,40 +58,40 @@ void av1_build_compound_diffwtd_mask_avx2(uint8_t *mask, const __m128i x_m8 = _mm256_castsi256_si128(_mm256_permute4x64_epi64(m8, 0xd8)); xx_storeu_128(mask, x_m8); - src0 += (stride0 << 2); - src1 += (stride1 << 2); + src0 += (src0_stride << 2); + src1 += (src1_stride << 2); mask += 16; i += 4; } while (i < h); } else if (8 == w) { do { const __m128i s0A = xx_loadl_64(src0); - const __m128i s0B = xx_loadl_64(src0 + stride0); - const __m128i s0C = xx_loadl_64(src0 + stride0 * 2); - const __m128i s0D = xx_loadl_64(src0 + stride0 * 3); + const __m128i s0B = xx_loadl_64(src0 + src0_stride); + const __m128i s0C = xx_loadl_64(src0 + src0_stride * 2); + const __m128i s0D = 
xx_loadl_64(src0 + src0_stride * 3); const __m256i s0AC_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(s0A, s0C)); const __m256i s0BD_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(s0B, s0D)); const __m128i s1A = xx_loadl_64(src1); - const __m128i s1B = xx_loadl_64(src1 + stride1); - const __m128i s1C = xx_loadl_64(src1 + stride1 * 2); - const __m128i s1D = xx_loadl_64(src1 + stride1 * 3); + const __m128i s1B = xx_loadl_64(src1 + src1_stride); + const __m128i s1C = xx_loadl_64(src1 + src1_stride * 2); + const __m128i s1D = xx_loadl_64(src1 + src1_stride * 3); const __m256i s1AB_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(s1A, s1C)); const __m256i s1CD_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(s1B, s1D)); const __m256i m16AC = calc_mask_avx2(y_mask_base, s0AC_w, s1AB_w); const __m256i m16BD = calc_mask_avx2(y_mask_base, s0BD_w, s1CD_w); const __m256i m8 = _mm256_packus_epi16(m16AC, m16BD); yy_storeu_256(mask, m8); - src0 += stride0 << 2; - src1 += stride1 << 2; + src0 += src0_stride << 2; + src1 += src1_stride << 2; mask += 32; i += 4; } while (i < h); } else if (16 == w) { do { const __m128i s0A = xx_load_128(src0); - const __m128i s0B = xx_load_128(src0 + stride0); + const __m128i s0B = xx_load_128(src0 + src0_stride); const __m128i s1A = xx_load_128(src1); - const __m128i s1B = xx_load_128(src1 + stride1); + const __m128i s1B = xx_load_128(src1 + src1_stride); const __m256i s0AL = _mm256_cvtepu8_epi16(s0A); const __m256i s0BL = _mm256_cvtepu8_epi16(s0B); const __m256i s1AL = _mm256_cvtepu8_epi16(s1A); @@ -103,8 +103,8 @@ void av1_build_compound_diffwtd_mask_avx2(uint8_t *mask, const __m256i m8 = _mm256_permute4x64_epi64(_mm256_packus_epi16(m16AL, m16BL), 0xd8); yy_storeu_256(mask, m8); - src0 += stride0 << 1; - src1 += stride1 << 1; + src0 += src0_stride << 1; + src1 += src1_stride << 1; mask += 32; i += 2; } while (i < h); @@ -127,8 +127,8 @@ void av1_build_compound_diffwtd_mask_avx2(uint8_t *mask, yy_storeu_256(mask + j, m8); j += 32; } while (j < w); - src0 += 
stride0; - src1 += stride1; + src0 += src0_stride; + src1 += src1_stride; mask += w; i += 1; } while (i < h); diff --git a/media/libaom/src/av1/common/x86/selfguided_avx2.c b/media/libaom/src/av1/common/x86/selfguided_avx2.c index 0aaf1f454..3c5558dda 100644 --- a/media/libaom/src/av1/common/x86/selfguided_avx2.c +++ b/media/libaom/src/av1/common/x86/selfguided_avx2.c @@ -219,12 +219,12 @@ static __m256i compute_p(__m256i sum1, __m256i sum2, int bit_depth, int n) { static void calc_ab(int32_t *A, int32_t *B, const int32_t *C, const int32_t *D, int width, int height, int buf_stride, int bit_depth, int sgr_params_idx, int radius_idx) { - const sgr_params_type *const params = &sgr_params[sgr_params_idx]; + const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx]; const int r = params->r[radius_idx]; const int n = (2 * r + 1) * (2 * r + 1); const __m256i s = _mm256_set1_epi32(params->s[radius_idx]); // one_over_n[n-1] is 2^12/n, so easily fits in an int16 - const __m256i one_over_n = _mm256_set1_epi32(one_by_x[n - 1]); + const __m256i one_over_n = _mm256_set1_epi32(av1_one_by_x[n - 1]); const __m256i rnd_z = round_for_shift(SGRPROJ_MTABLE_BITS); const __m256i rnd_res = round_for_shift(SGRPROJ_RECIP_BITS); @@ -263,7 +263,7 @@ static void calc_ab(int32_t *A, int32_t *B, const int32_t *C, const int32_t *D, SGRPROJ_MTABLE_BITS), _mm256_set1_epi32(255)); - const __m256i a_res = _mm256_i32gather_epi32(x_by_xplus1, z, 4); + const __m256i a_res = _mm256_i32gather_epi32(av1_x_by_xplus1, z, 4); yy_storeu_256(A + i * buf_stride + j, a_res); @@ -356,12 +356,12 @@ static void calc_ab_fast(int32_t *A, int32_t *B, const int32_t *C, const int32_t *D, int width, int height, int buf_stride, int bit_depth, int sgr_params_idx, int radius_idx) { - const sgr_params_type *const params = &sgr_params[sgr_params_idx]; + const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx]; const int r = params->r[radius_idx]; const int n = (2 * r + 1) * (2 * r + 1); const __m256i s 
= _mm256_set1_epi32(params->s[radius_idx]); // one_over_n[n-1] is 2^12/n, so easily fits in an int16 - const __m256i one_over_n = _mm256_set1_epi32(one_by_x[n - 1]); + const __m256i one_over_n = _mm256_set1_epi32(av1_one_by_x[n - 1]); const __m256i rnd_z = round_for_shift(SGRPROJ_MTABLE_BITS); const __m256i rnd_res = round_for_shift(SGRPROJ_RECIP_BITS); @@ -400,7 +400,7 @@ static void calc_ab_fast(int32_t *A, int32_t *B, const int32_t *C, SGRPROJ_MTABLE_BITS), _mm256_set1_epi32(255)); - const __m256i a_res = _mm256_i32gather_epi32(x_by_xplus1, z, 4); + const __m256i a_res = _mm256_i32gather_epi32(av1_x_by_xplus1, z, 4); yy_storeu_256(A + i * buf_stride + j, a_res); @@ -604,7 +604,7 @@ int av1_selfguided_restoration_avx2(const uint8_t *dgd8, int width, int height, integral_images(dgd0, dgd_stride, width_ext, height_ext, Ctl, Dtl, buf_stride); - const sgr_params_type *const params = &sgr_params[sgr_params_idx]; + const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx]; // Write to flt0 and flt1 // If params->r == 0 we skip the corresponding filter. 
We only allow one of // the radii to be 0, as having both equal to 0 would be equivalent to @@ -630,11 +630,11 @@ int av1_selfguided_restoration_avx2(const uint8_t *dgd8, int width, int height, return 0; } -void apply_selfguided_restoration_avx2(const uint8_t *dat8, int width, - int height, int stride, int eps, - const int *xqd, uint8_t *dst8, - int dst_stride, int32_t *tmpbuf, - int bit_depth, int highbd) { +void av1_apply_selfguided_restoration_avx2(const uint8_t *dat8, int width, + int height, int stride, int eps, + const int *xqd, uint8_t *dst8, + int dst_stride, int32_t *tmpbuf, + int bit_depth, int highbd) { int32_t *flt0 = tmpbuf; int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX; assert(width * height <= RESTORATION_UNITPELS_MAX); @@ -642,9 +642,9 @@ void apply_selfguided_restoration_avx2(const uint8_t *dat8, int width, dat8, width, height, stride, flt0, flt1, width, eps, bit_depth, highbd); (void)ret; assert(!ret); - const sgr_params_type *const params = &sgr_params[eps]; + const sgr_params_type *const params = &av1_sgr_params[eps]; int xq[2]; - decode_xq(xqd, xq, params); + av1_decode_xq(xqd, xq, params); __m256i xq0 = _mm256_set1_epi32(xq[0]); __m256i xq1 = _mm256_set1_epi32(xq[1]); diff --git a/media/libaom/src/av1/common/x86/selfguided_sse4.c b/media/libaom/src/av1/common/x86/selfguided_sse4.c index ea3f6d942..72c7708f1 100644 --- a/media/libaom/src/av1/common/x86/selfguided_sse4.c +++ b/media/libaom/src/av1/common/x86/selfguided_sse4.c @@ -170,12 +170,12 @@ static __m128i compute_p(__m128i sum1, __m128i sum2, int bit_depth, int n) { static void calc_ab(int32_t *A, int32_t *B, const int32_t *C, const int32_t *D, int width, int height, int buf_stride, int bit_depth, int sgr_params_idx, int radius_idx) { - const sgr_params_type *const params = &sgr_params[sgr_params_idx]; + const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx]; const int r = params->r[radius_idx]; const int n = (2 * r + 1) * (2 * r + 1); const __m128i s = 
_mm_set1_epi32(params->s[radius_idx]); // one_over_n[n-1] is 2^12/n, so easily fits in an int16 - const __m128i one_over_n = _mm_set1_epi32(one_by_x[n - 1]); + const __m128i one_over_n = _mm_set1_epi32(av1_one_by_x[n - 1]); const __m128i rnd_z = round_for_shift(SGRPROJ_MTABLE_BITS); const __m128i rnd_res = round_for_shift(SGRPROJ_RECIP_BITS); @@ -216,10 +216,11 @@ static void calc_ab(int32_t *A, int32_t *B, const int32_t *C, const int32_t *D, // 'Gather' type instructions are not available pre-AVX2, so synthesize a // gather using scalar loads. - const __m128i a_res = _mm_set_epi32(x_by_xplus1[_mm_extract_epi32(z, 3)], - x_by_xplus1[_mm_extract_epi32(z, 2)], - x_by_xplus1[_mm_extract_epi32(z, 1)], - x_by_xplus1[_mm_extract_epi32(z, 0)]); + const __m128i a_res = + _mm_set_epi32(av1_x_by_xplus1[_mm_extract_epi32(z, 3)], + av1_x_by_xplus1[_mm_extract_epi32(z, 2)], + av1_x_by_xplus1[_mm_extract_epi32(z, 1)], + av1_x_by_xplus1[_mm_extract_epi32(z, 0)]); xx_storeu_128(A + i * buf_stride + j, a_res); @@ -310,12 +311,12 @@ static void calc_ab_fast(int32_t *A, int32_t *B, const int32_t *C, const int32_t *D, int width, int height, int buf_stride, int bit_depth, int sgr_params_idx, int radius_idx) { - const sgr_params_type *const params = &sgr_params[sgr_params_idx]; + const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx]; const int r = params->r[radius_idx]; const int n = (2 * r + 1) * (2 * r + 1); const __m128i s = _mm_set1_epi32(params->s[radius_idx]); // one_over_n[n-1] is 2^12/n, so easily fits in an int16 - const __m128i one_over_n = _mm_set1_epi32(one_by_x[n - 1]); + const __m128i one_over_n = _mm_set1_epi32(av1_one_by_x[n - 1]); const __m128i rnd_z = round_for_shift(SGRPROJ_MTABLE_BITS); const __m128i rnd_res = round_for_shift(SGRPROJ_RECIP_BITS); @@ -356,10 +357,11 @@ static void calc_ab_fast(int32_t *A, int32_t *B, const int32_t *C, // 'Gather' type instructions are not available pre-AVX2, so synthesize a // gather using scalar loads. 
- const __m128i a_res = _mm_set_epi32(x_by_xplus1[_mm_extract_epi32(z, 3)], - x_by_xplus1[_mm_extract_epi32(z, 2)], - x_by_xplus1[_mm_extract_epi32(z, 1)], - x_by_xplus1[_mm_extract_epi32(z, 0)]); + const __m128i a_res = + _mm_set_epi32(av1_x_by_xplus1[_mm_extract_epi32(z, 3)], + av1_x_by_xplus1[_mm_extract_epi32(z, 2)], + av1_x_by_xplus1[_mm_extract_epi32(z, 1)], + av1_x_by_xplus1[_mm_extract_epi32(z, 0)]); xx_storeu_128(A + i * buf_stride + j, a_res); @@ -554,7 +556,7 @@ int av1_selfguided_restoration_sse4_1(const uint8_t *dgd8, int width, integral_images(dgd0, dgd_stride, width_ext, height_ext, Ctl, Dtl, buf_stride); - const sgr_params_type *const params = &sgr_params[sgr_params_idx]; + const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx]; // Write to flt0 and flt1 // If params->r == 0 we skip the corresponding filter. We only allow one of // the radii to be 0, as having both equal to 0 would be equivalent to @@ -580,11 +582,11 @@ int av1_selfguided_restoration_sse4_1(const uint8_t *dgd8, int width, return 0; } -void apply_selfguided_restoration_sse4_1(const uint8_t *dat8, int width, - int height, int stride, int eps, - const int *xqd, uint8_t *dst8, - int dst_stride, int32_t *tmpbuf, - int bit_depth, int highbd) { +void av1_apply_selfguided_restoration_sse4_1(const uint8_t *dat8, int width, + int height, int stride, int eps, + const int *xqd, uint8_t *dst8, + int dst_stride, int32_t *tmpbuf, + int bit_depth, int highbd) { int32_t *flt0 = tmpbuf; int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX; assert(width * height <= RESTORATION_UNITPELS_MAX); @@ -592,9 +594,9 @@ void apply_selfguided_restoration_sse4_1(const uint8_t *dat8, int width, dat8, width, height, stride, flt0, flt1, width, eps, bit_depth, highbd); (void)ret; assert(!ret); - const sgr_params_type *const params = &sgr_params[eps]; + const sgr_params_type *const params = &av1_sgr_params[eps]; int xq[2]; - decode_xq(xqd, xq, params); + av1_decode_xq(xqd, xq, params); __m128i xq0 = 
_mm_set1_epi32(xq[0]); __m128i xq1 = _mm_set1_epi32(xq[1]); diff --git a/media/libaom/src/av1/common/x86/warp_plane_avx2.c b/media/libaom/src/av1/common/x86/warp_plane_avx2.c new file mode 100644 index 000000000..53a928d76 --- /dev/null +++ b/media/libaom/src/av1/common/x86/warp_plane_avx2.c @@ -0,0 +1,1318 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <immintrin.h> +#include "config/av1_rtcd.h" +#include "av1/common/warped_motion.h" +#include "aom_dsp/x86/synonyms.h" + +DECLARE_ALIGNED(32, static const uint8_t, shuffle_alpha0_mask01_avx2[32]) = { + 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, + 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 +}; + +DECLARE_ALIGNED(32, static const uint8_t, shuffle_alpha0_mask23_avx2[32]) = { + 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, + 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3 +}; + +DECLARE_ALIGNED(32, static const uint8_t, shuffle_alpha0_mask45_avx2[32]) = { + 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, + 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5 +}; + +DECLARE_ALIGNED(32, static const uint8_t, shuffle_alpha0_mask67_avx2[32]) = { + 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, + 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7 +}; + +DECLARE_ALIGNED(32, static const uint8_t, shuffle_gamma0_mask0_avx2[32]) = { + 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, + 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 +}; + +DECLARE_ALIGNED(32, static const uint8_t, shuffle_gamma0_mask1_avx2[32]) = { + 4, 5, 6, 7, 4, 5, 
6, 7, 4, 5, 6, 7, 4, 5, 6, 7, + 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7 +}; + +DECLARE_ALIGNED(32, static const uint8_t, shuffle_gamma0_mask2_avx2[32]) = { + 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, + 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11 +}; + +DECLARE_ALIGNED(32, static const uint8_t, shuffle_gamma0_mask3_avx2[32]) = { + 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, + 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15 +}; + +DECLARE_ALIGNED(32, static const uint8_t, + shuffle_src0[32]) = { 0, 2, 2, 4, 4, 6, 6, 8, 1, 3, 3, + 5, 5, 7, 7, 9, 0, 2, 2, 4, 4, 6, + 6, 8, 1, 3, 3, 5, 5, 7, 7, 9 }; + +DECLARE_ALIGNED(32, static const uint8_t, + shuffle_src1[32]) = { 4, 6, 6, 8, 8, 10, 10, 12, 5, 7, 7, + 9, 9, 11, 11, 13, 4, 6, 6, 8, 8, 10, + 10, 12, 5, 7, 7, 9, 9, 11, 11, 13 }; + +DECLARE_ALIGNED(32, static const uint8_t, + shuffle_src2[32]) = { 1, 3, 3, 5, 5, 7, 7, 9, 2, 4, 4, + 6, 6, 8, 8, 10, 1, 3, 3, 5, 5, 7, + 7, 9, 2, 4, 4, 6, 6, 8, 8, 10 }; + +DECLARE_ALIGNED(32, static const uint8_t, + shuffle_src3[32]) = { 5, 7, 7, 9, 9, 11, 11, 13, 6, 8, 8, + 10, 10, 12, 12, 14, 5, 7, 7, 9, 9, 11, + 11, 13, 6, 8, 8, 10, 10, 12, 12, 14 }; + +static INLINE void filter_src_pixels_avx2(const __m256i src, __m256i *horz_out, + __m256i *coeff, + const __m256i *shuffle_src, + const __m256i *round_const, + const __m128i *shift, int row) { + const __m256i src_0 = _mm256_shuffle_epi8(src, shuffle_src[0]); + const __m256i src_1 = _mm256_shuffle_epi8(src, shuffle_src[1]); + const __m256i src_2 = _mm256_shuffle_epi8(src, shuffle_src[2]); + const __m256i src_3 = _mm256_shuffle_epi8(src, shuffle_src[3]); + + const __m256i res_02 = _mm256_maddubs_epi16(src_0, coeff[0]); + const __m256i res_46 = _mm256_maddubs_epi16(src_1, coeff[1]); + const __m256i res_13 = _mm256_maddubs_epi16(src_2, coeff[2]); + const __m256i res_57 = _mm256_maddubs_epi16(src_3, coeff[3]); + + const __m256i res_even = _mm256_add_epi16(res_02, res_46); + const 
__m256i res_odd = _mm256_add_epi16(res_13, res_57); + const __m256i res = + _mm256_add_epi16(_mm256_add_epi16(res_even, res_odd), *round_const); + horz_out[row] = _mm256_srl_epi16(res, *shift); +} + +static INLINE void prepare_horizontal_filter_coeff_avx2(int alpha, int beta, + int sx, + __m256i *coeff) { + __m128i tmp_0 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[((unsigned)(sx + 0 * alpha)) >> + WARPEDDIFF_PREC_BITS]); + __m128i tmp_1 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[((unsigned)(sx + 1 * alpha)) >> + WARPEDDIFF_PREC_BITS]); + __m128i tmp_2 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[((unsigned)(sx + 2 * alpha)) >> + WARPEDDIFF_PREC_BITS]); + __m128i tmp_3 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[((unsigned)(sx + 3 * alpha)) >> + WARPEDDIFF_PREC_BITS]); + + __m128i tmp_4 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[((unsigned)(sx + 4 * alpha)) >> + WARPEDDIFF_PREC_BITS]); + __m128i tmp_5 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[((unsigned)(sx + 5 * alpha)) >> + WARPEDDIFF_PREC_BITS]); + __m128i tmp_6 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[((unsigned)(sx + 6 * alpha)) >> + WARPEDDIFF_PREC_BITS]); + __m128i tmp_7 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[((unsigned)(sx + 7 * alpha)) >> + WARPEDDIFF_PREC_BITS]); + + __m256i tmp0_256 = _mm256_castsi128_si256(tmp_0); + __m256i tmp2_256 = _mm256_castsi128_si256(tmp_2); + __m256i tmp1_256 = _mm256_castsi128_si256(tmp_1); + __m256i tmp3_256 = _mm256_castsi128_si256(tmp_3); + + __m256i tmp4_256 = _mm256_castsi128_si256(tmp_4); + __m256i tmp6_256 = _mm256_castsi128_si256(tmp_6); + __m256i tmp5_256 = _mm256_castsi128_si256(tmp_5); + __m256i tmp7_256 = _mm256_castsi128_si256(tmp_7); + + __m128i tmp_8 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 0 * alpha) >> + WARPEDDIFF_PREC_BITS]); + tmp0_256 = _mm256_inserti128_si256(tmp0_256, tmp_8, 1); + + __m128i tmp_9 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 1 * 
alpha) >> + WARPEDDIFF_PREC_BITS]); + tmp1_256 = _mm256_inserti128_si256(tmp1_256, tmp_9, 1); + + __m128i tmp_10 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 2 * alpha) >> + WARPEDDIFF_PREC_BITS]); + tmp2_256 = _mm256_inserti128_si256(tmp2_256, tmp_10, 1); + + __m128i tmp_11 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 3 * alpha) >> + WARPEDDIFF_PREC_BITS]); + tmp3_256 = _mm256_inserti128_si256(tmp3_256, tmp_11, 1); + + tmp_2 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 4 * alpha) >> + WARPEDDIFF_PREC_BITS]); + tmp4_256 = _mm256_inserti128_si256(tmp4_256, tmp_2, 1); + + tmp_3 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 5 * alpha) >> + WARPEDDIFF_PREC_BITS]); + tmp5_256 = _mm256_inserti128_si256(tmp5_256, tmp_3, 1); + + tmp_6 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 6 * alpha) >> + WARPEDDIFF_PREC_BITS]); + tmp6_256 = _mm256_inserti128_si256(tmp6_256, tmp_6, 1); + + tmp_7 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 7 * alpha) >> + WARPEDDIFF_PREC_BITS]); + tmp7_256 = _mm256_inserti128_si256(tmp7_256, tmp_7, 1); + + const __m256i tmp_12 = _mm256_unpacklo_epi16(tmp0_256, tmp2_256); + const __m256i tmp_13 = _mm256_unpacklo_epi16(tmp1_256, tmp3_256); + const __m256i tmp_14 = _mm256_unpacklo_epi16(tmp4_256, tmp6_256); + const __m256i tmp_15 = _mm256_unpacklo_epi16(tmp5_256, tmp7_256); + + const __m256i res_0 = _mm256_unpacklo_epi32(tmp_12, tmp_14); + const __m256i res_1 = _mm256_unpackhi_epi32(tmp_12, tmp_14); + const __m256i res_2 = _mm256_unpacklo_epi32(tmp_13, tmp_15); + const __m256i res_3 = _mm256_unpackhi_epi32(tmp_13, tmp_15); + + coeff[0] = _mm256_unpacklo_epi64(res_0, res_2); + coeff[1] = _mm256_unpackhi_epi64(res_0, res_2); + coeff[2] = _mm256_unpacklo_epi64(res_1, res_3); + coeff[3] = _mm256_unpackhi_epi64(res_1, res_3); +} + +static INLINE void 
prepare_horizontal_filter_coeff_beta0_avx2(int alpha, int sx, + __m256i *coeff) { + __m128i tmp_0 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS]); + __m128i tmp_1 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS]); + __m128i tmp_2 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS]); + __m128i tmp_3 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS]); + __m128i tmp_4 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS]); + __m128i tmp_5 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS]); + __m128i tmp_6 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS]); + __m128i tmp_7 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS]); + + tmp_0 = _mm_unpacklo_epi16(tmp_0, tmp_2); + tmp_1 = _mm_unpacklo_epi16(tmp_1, tmp_3); + tmp_4 = _mm_unpacklo_epi16(tmp_4, tmp_6); + tmp_5 = _mm_unpacklo_epi16(tmp_5, tmp_7); + + const __m256i tmp_12 = _mm256_broadcastsi128_si256(tmp_0); + const __m256i tmp_13 = _mm256_broadcastsi128_si256(tmp_1); + const __m256i tmp_14 = _mm256_broadcastsi128_si256(tmp_4); + const __m256i tmp_15 = _mm256_broadcastsi128_si256(tmp_5); + + const __m256i res_0 = _mm256_unpacklo_epi32(tmp_12, tmp_14); + const __m256i res_1 = _mm256_unpackhi_epi32(tmp_12, tmp_14); + const __m256i res_2 = _mm256_unpacklo_epi32(tmp_13, tmp_15); + const __m256i res_3 = _mm256_unpackhi_epi32(tmp_13, tmp_15); + + coeff[0] = _mm256_unpacklo_epi64(res_0, res_2); + coeff[1] = _mm256_unpackhi_epi64(res_0, res_2); + coeff[2] = _mm256_unpacklo_epi64(res_1, res_3); + coeff[3] = _mm256_unpackhi_epi64(res_1, res_3); +} + +static INLINE void prepare_horizontal_filter_coeff_alpha0_avx2(int beta, int sx, + __m256i *coeff) { + const __m128i tmp_0 = + _mm_loadl_epi64((__m128i 
*)&av1_filter_8bit[sx >> WARPEDDIFF_PREC_BITS]); + const __m128i tmp_1 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + beta) >> WARPEDDIFF_PREC_BITS]); + + const __m256i res_0 = + _mm256_inserti128_si256(_mm256_castsi128_si256(tmp_0), tmp_1, 0x1); + + coeff[0] = _mm256_shuffle_epi8( + res_0, _mm256_load_si256((__m256i *)shuffle_alpha0_mask01_avx2)); + coeff[1] = _mm256_shuffle_epi8( + res_0, _mm256_load_si256((__m256i *)shuffle_alpha0_mask23_avx2)); + coeff[2] = _mm256_shuffle_epi8( + res_0, _mm256_load_si256((__m256i *)shuffle_alpha0_mask45_avx2)); + coeff[3] = _mm256_shuffle_epi8( + res_0, _mm256_load_si256((__m256i *)shuffle_alpha0_mask67_avx2)); +} + +static INLINE void horizontal_filter_avx2(const __m256i src, __m256i *horz_out, + int sx, int alpha, int beta, int row, + const __m256i *shuffle_src, + const __m256i *round_const, + const __m128i *shift) { + __m256i coeff[4]; + prepare_horizontal_filter_coeff_avx2(alpha, beta, sx, coeff); + filter_src_pixels_avx2(src, horz_out, coeff, shuffle_src, round_const, shift, + row); +} +static INLINE void prepare_horizontal_filter_coeff(int alpha, int sx, + __m256i *coeff) { + const __m128i tmp_0 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS]); + const __m128i tmp_1 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS]); + const __m128i tmp_2 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS]); + const __m128i tmp_3 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS]); + const __m128i tmp_4 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS]); + const __m128i tmp_5 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS]); + const __m128i tmp_6 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS]); + const __m128i tmp_7 = _mm_loadl_epi64( + (__m128i 
      *)&av1_filter_8bit[(sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS]);

  // 8x8 16-bit transpose of the loaded taps so each coeff[] register holds
  // one tap pair for all eight output pixels (fed to _mm256_madd_epi16).
  const __m128i tmp_8 = _mm_unpacklo_epi16(tmp_0, tmp_2);
  const __m128i tmp_9 = _mm_unpacklo_epi16(tmp_1, tmp_3);
  const __m128i tmp_10 = _mm_unpacklo_epi16(tmp_4, tmp_6);
  const __m128i tmp_11 = _mm_unpacklo_epi16(tmp_5, tmp_7);

  const __m128i tmp_12 = _mm_unpacklo_epi32(tmp_8, tmp_10);
  const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_8, tmp_10);
  const __m128i tmp_14 = _mm_unpacklo_epi32(tmp_9, tmp_11);
  const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_9, tmp_11);

  coeff[0] = _mm256_castsi128_si256(_mm_unpacklo_epi64(tmp_12, tmp_14));
  coeff[1] = _mm256_castsi128_si256(_mm_unpackhi_epi64(tmp_12, tmp_14));
  coeff[2] = _mm256_castsi128_si256(_mm_unpacklo_epi64(tmp_13, tmp_15));
  coeff[3] = _mm256_castsi128_si256(_mm_unpackhi_epi64(tmp_13, tmp_15));
}

// Horizontal (first) pass of the warp filter, generic case (alpha != 0 and
// beta != 0).  Two source rows (k and k + 1) are packed into the two 128-bit
// lanes of one 256-bit register and filtered together; the final odd row is
// handled after the loop on its own.
static INLINE void warp_horizontal_filter_avx2(
    const uint8_t *ref, __m256i *horz_out, int stride, int32_t ix4, int32_t iy4,
    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
    const __m256i *round_const, const __m128i *shift,
    const __m256i *shuffle_src) {
  int k, iy, sx, row = 0;
  __m256i coeff[4];
  for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) {
    // Row k, clamped to the valid range of the reference frame.
    iy = iy4 + k;
    iy = clamp(iy, 0, height - 1);
    const __m128i src_0 =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
    // Row k + 1.
    iy = iy4 + k + 1;
    iy = clamp(iy, 0, height - 1);
    const __m128i src_1 =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
    const __m256i src_01 =
        _mm256_inserti128_si256(_mm256_castsi128_si256(src_0), src_1, 0x1);
    // sx advances by beta per row; (k + 4) indexes rows from the block top.
    sx = sx4 + beta * (k + 4);
    horizontal_filter_avx2(src_01, horz_out, sx, alpha, beta, row, shuffle_src,
                           round_const, shift);
    row += 1;
  }
  // Last (odd) row: only one row remains, filter it by itself.
  iy = iy4 + k;
  iy = clamp(iy, 0, height - 1);
  const __m256i src_01 = _mm256_castsi128_si256(
      _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)));
  sx = sx4 + beta * (k + 4);
  prepare_horizontal_filter_coeff(alpha, sx, coeff);
  filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const,
                         shift, row);
}

// Horizontal pass, alpha == 0 special case: the taps do not vary across a
// row, so one coefficient set (derived from beta only) is prepared per row
// pair instead of per pixel.
static INLINE void warp_horizontal_filter_alpha0_avx2(
    const uint8_t *ref, __m256i *horz_out, int stride, int32_t ix4, int32_t iy4,
    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
    const __m256i *round_const, const __m128i *shift,
    const __m256i *shuffle_src) {
  (void)alpha;
  int k, iy, sx, row = 0;
  __m256i coeff[4];
  for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) {
    iy = iy4 + k;
    iy = clamp(iy, 0, height - 1);
    const __m128i src_0 =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
    iy = iy4 + k + 1;
    iy = clamp(iy, 0, height - 1);
    const __m128i src_1 =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
    const __m256i src_01 =
        _mm256_inserti128_si256(_mm256_castsi128_si256(src_0), src_1, 0x1);
    sx = sx4 + beta * (k + 4);
    prepare_horizontal_filter_coeff_alpha0_avx2(beta, sx, coeff);
    filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const,
                           shift, row);
    row += 1;
  }
  // Final odd row.
  iy = iy4 + k;
  iy = clamp(iy, 0, height - 1);
  const __m256i src_01 = _mm256_castsi128_si256(
      _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)));
  sx = sx4 + beta * (k + 4);
  prepare_horizontal_filter_coeff_alpha0_avx2(beta, sx, coeff);
  filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const,
                         shift, row);
}

// Horizontal pass, beta == 0 special case: sx does not change from row to
// row, so the coefficients are prepared once, before the loop.
static INLINE void warp_horizontal_filter_beta0_avx2(
    const uint8_t *ref, __m256i *horz_out, int stride, int32_t ix4, int32_t iy4,
    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
    const __m256i *round_const, const __m128i *shift,
    const __m256i *shuffle_src) {
  (void)beta;
  int k, iy, row = 0;
  __m256i coeff[4];
  prepare_horizontal_filter_coeff_beta0_avx2(alpha, sx4, coeff);
  for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) {
    iy = iy4 + k;
    iy = clamp(iy, 0, height - 1);
    const __m128i src_0 =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
    iy = iy4 + k + 1;
    iy = clamp(iy, 0, height - 1);
    const __m128i src_1 =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
    const __m256i src_01 =
        _mm256_inserti128_si256(_mm256_castsi128_si256(src_0), src_1, 0x1);
    filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const,
                           shift, row);
    row += 1;
  }
  // Final odd row.
  iy = iy4 + k;
  iy = clamp(iy, 0, height - 1);
  const __m256i src_01 = _mm256_castsi128_si256(
      _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)));
  filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const,
                         shift, row);
}

// Horizontal pass, alpha == 0 && beta == 0: a single coefficient set is
// valid for the whole block, prepared once before the loop.
static INLINE void warp_horizontal_filter_alpha0_beta0_avx2(
    const uint8_t *ref, __m256i *horz_out, int stride, int32_t ix4, int32_t iy4,
    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
    const __m256i *round_const, const __m128i *shift,
    const __m256i *shuffle_src) {
  (void)alpha;
  int k, iy, row = 0;
  __m256i coeff[4];
  prepare_horizontal_filter_coeff_alpha0_avx2(beta, sx4, coeff);
  for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) {
    iy = iy4 + k;
    iy = clamp(iy, 0, height - 1);
    const __m128i src0 =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
    iy = iy4 + k + 1;
    iy = clamp(iy, 0, height - 1);
    const __m128i src1 =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
    const __m256i src_01 =
        _mm256_inserti128_si256(_mm256_castsi128_si256(src0), src1, 0x1);
    filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const,
                           shift, row);
    row += 1;
  }
  // Final odd row.
  iy = iy4 + k;
  iy = clamp(iy, 0, height - 1);
  const __m256i src_01 = _mm256_castsi128_si256(
      _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)));
  filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const,
                         shift, row);
}

// Build the constants used by the compound (dist-weighted) averaging path.
static INLINE void unpack_weights_and_set_round_const_avx2(
    ConvolveParams *conv_params, const int round_bits, const int offset_bits,
    __m256i *res_sub_const, __m256i
*round_bits_const, __m256i *wt) {
  // Constant subtracted after averaging to remove the intermediate offset,
  // and the rounding constant for the final right shift by round_bits.
  *res_sub_const =
      _mm256_set1_epi16(-(1 << (offset_bits - conv_params->round_1)) -
                        (1 << (offset_bits - conv_params->round_1 - 1)));
  *round_bits_const = _mm256_set1_epi16(((1 << round_bits) >> 1));

  // Interleave the forward/backward weights so a single madd computes the
  // weighted sum of the two predictions.
  const int w0 = conv_params->fwd_offset;
  const int w1 = conv_params->bck_offset;
  const __m256i wt0 = _mm256_set1_epi16((short)w0);
  const __m256i wt1 = _mm256_set1_epi16((short)w1);
  *wt = _mm256_unpacklo_epi16(wt0, wt1);
}

// Load and transpose vertical filter taps for two consecutive output rows:
// lane 0 of every 256-bit register serves row sy, lane 1 serves row
// sy + delta.  coeffs[0..3] hold the even-position taps, coeffs[4..7] the
// odd-position taps.
static INLINE void prepare_vertical_filter_coeffs_avx2(int gamma, int delta,
                                                       int sy,
                                                       __m256i *coeffs) {
  __m128i filt_00 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
  __m128i filt_01 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
  __m128i filt_02 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
  __m128i filt_03 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));

  __m128i filt_10 = _mm_loadu_si128(
      (__m128i *)(av1_warped_filter +
                  (((sy + delta) + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
  __m128i filt_11 = _mm_loadu_si128(
      (__m128i *)(av1_warped_filter +
                  (((sy + delta) + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
  __m128i filt_12 = _mm_loadu_si128(
      (__m128i *)(av1_warped_filter +
                  (((sy + delta) + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
  __m128i filt_13 = _mm_loadu_si128(
      (__m128i *)(av1_warped_filter +
                  (((sy + delta) + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));

  // Pair row-sy taps (low lane) with row-(sy + delta) taps (high lane).
  __m256i filt_0 =
      _mm256_inserti128_si256(_mm256_castsi128_si256(filt_00), filt_10, 0x1);
  __m256i filt_1 =
      _mm256_inserti128_si256(_mm256_castsi128_si256(filt_01), filt_11, 0x1);
  __m256i filt_2 =
      _mm256_inserti128_si256(_mm256_castsi128_si256(filt_02), filt_12, 0x1);
  __m256i filt_3 =
      _mm256_inserti128_si256(_mm256_castsi128_si256(filt_03), filt_13, 0x1);

  __m256i res_0 = _mm256_unpacklo_epi32(filt_0, filt_1);
  __m256i res_1 = _mm256_unpacklo_epi32(filt_2, filt_3);
  __m256i res_2 = _mm256_unpackhi_epi32(filt_0, filt_1);
  __m256i res_3 = _mm256_unpackhi_epi32(filt_2, filt_3);

  coeffs[0] = _mm256_unpacklo_epi64(res_0, res_1);
  coeffs[1] = _mm256_unpackhi_epi64(res_0, res_1);
  coeffs[2] = _mm256_unpacklo_epi64(res_2, res_3);
  coeffs[3] = _mm256_unpackhi_epi64(res_2, res_3);

  // Same transpose for the odd output positions (1, 3, 5, 7).
  filt_00 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
  filt_01 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
  filt_02 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
  filt_03 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));

  filt_10 = _mm_loadu_si128(
      (__m128i *)(av1_warped_filter +
                  (((sy + delta) + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
  filt_11 = _mm_loadu_si128(
      (__m128i *)(av1_warped_filter +
                  (((sy + delta) + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
  filt_12 = _mm_loadu_si128(
      (__m128i *)(av1_warped_filter +
                  (((sy + delta) + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
  filt_13 = _mm_loadu_si128(
      (__m128i *)(av1_warped_filter +
                  (((sy + delta) + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));

  filt_0 =
      _mm256_inserti128_si256(_mm256_castsi128_si256(filt_00), filt_10, 0x1);
  filt_1 =
      _mm256_inserti128_si256(_mm256_castsi128_si256(filt_01), filt_11, 0x1);
  filt_2 =
      _mm256_inserti128_si256(_mm256_castsi128_si256(filt_02), filt_12, 0x1);
  filt_3 =
      _mm256_inserti128_si256(_mm256_castsi128_si256(filt_03), filt_13, 0x1);

  res_0 = _mm256_unpacklo_epi32(filt_0, filt_1);
  res_1 = _mm256_unpacklo_epi32(filt_2, filt_3);
  res_2 = _mm256_unpackhi_epi32(filt_0, filt_1);
  res_3 = _mm256_unpackhi_epi32(filt_2, filt_3);

  coeffs[4] = _mm256_unpacklo_epi64(res_0, res_1);
  coeffs[5] = _mm256_unpackhi_epi64(res_0, res_1);
  coeffs[6] = _mm256_unpacklo_epi64(res_2, res_3);
  coeffs[7] = _mm256_unpackhi_epi64(res_2, res_3);
}

// delta == 0 variant: both output rows use the same taps, so each 128-bit
// tap set is broadcast to both lanes instead of loading two rows' worth.
static INLINE void prepare_vertical_filter_coeffs_delta0_avx2(int gamma, int sy,
                                                              __m256i *coeffs) {
  __m128i filt_00 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
  __m128i filt_01 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
  __m128i filt_02 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
  __m128i filt_03 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));

  __m256i filt_0 = _mm256_broadcastsi128_si256(filt_00);
  __m256i filt_1 = _mm256_broadcastsi128_si256(filt_01);
  __m256i filt_2 = _mm256_broadcastsi128_si256(filt_02);
  __m256i filt_3 = _mm256_broadcastsi128_si256(filt_03);

  __m256i res_0 = _mm256_unpacklo_epi32(filt_0, filt_1);
  __m256i res_1 = _mm256_unpacklo_epi32(filt_2, filt_3);
  __m256i res_2 = _mm256_unpackhi_epi32(filt_0, filt_1);
  __m256i res_3 = _mm256_unpackhi_epi32(filt_2, filt_3);

  coeffs[0] = _mm256_unpacklo_epi64(res_0, res_1);
  coeffs[1] = _mm256_unpackhi_epi64(res_0, res_1);
  coeffs[2] = _mm256_unpacklo_epi64(res_2, res_3);
  coeffs[3] = _mm256_unpackhi_epi64(res_2, res_3);

  filt_00 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
  filt_01 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
  filt_02 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
  filt_03 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));

  filt_0 = _mm256_broadcastsi128_si256(filt_00);
  filt_1 = _mm256_broadcastsi128_si256(filt_01);
  filt_2 = _mm256_broadcastsi128_si256(filt_02);
  filt_3 = _mm256_broadcastsi128_si256(filt_03);

  res_0 = _mm256_unpacklo_epi32(filt_0, filt_1);
  res_1 = _mm256_unpacklo_epi32(filt_2, filt_3);
  res_2 = _mm256_unpackhi_epi32(filt_0, filt_1);
  res_3 = _mm256_unpackhi_epi32(filt_2, filt_3);

  coeffs[4] = _mm256_unpacklo_epi64(res_0, res_1);
  coeffs[5] = _mm256_unpackhi_epi64(res_0, res_1);
  coeffs[6] = _mm256_unpacklo_epi64(res_2, res_3);
  coeffs[7] = _mm256_unpackhi_epi64(res_2, res_3);
}

// gamma == 0 variant: one tap set per row; replicate it across all pixel
// positions with precomputed shuffle masks, then mirror even taps into the
// odd slots (they are identical when gamma is zero).
static INLINE void prepare_vertical_filter_coeffs_gamma0_avx2(int delta, int sy,
                                                              __m256i *coeffs) {
  const __m128i filt_0 = _mm_loadu_si128(
      (__m128i *)(av1_warped_filter + (sy >> WARPEDDIFF_PREC_BITS)));
  const __m128i filt_1 = _mm_loadu_si128(
      (__m128i *)(av1_warped_filter + ((sy + delta) >> WARPEDDIFF_PREC_BITS)));

  __m256i res_0 =
      _mm256_inserti128_si256(_mm256_castsi128_si256(filt_0), filt_1, 0x1);

  coeffs[0] = _mm256_shuffle_epi8(
      res_0, _mm256_load_si256((__m256i *)shuffle_gamma0_mask0_avx2));
  coeffs[1] = _mm256_shuffle_epi8(
      res_0, _mm256_load_si256((__m256i *)shuffle_gamma0_mask1_avx2));
  coeffs[2] = _mm256_shuffle_epi8(
      res_0, _mm256_load_si256((__m256i *)shuffle_gamma0_mask2_avx2));
  coeffs[3] = _mm256_shuffle_epi8(
      res_0, _mm256_load_si256((__m256i *)shuffle_gamma0_mask3_avx2));

  coeffs[4] = coeffs[0];
  coeffs[5] = coeffs[1];
  coeffs[6] = coeffs[2];
  coeffs[7] = coeffs[3];
}

// Apply the 8-tap vertical filter to the interleaved horizontal-pass output.
// src[0..5] are carried in by the caller; src[6]/src[7] are built here from
// horz_out.  Results for pixels 0-3 go to *res_lo, pixels 4-7 to *res_hi.
static INLINE void filter_src_pixels_vertical_avx2(__m256i *horz_out,
                                                   __m256i *src,
                                                   __m256i *coeffs,
                                                   __m256i *res_lo,
                                                   __m256i *res_hi, int row) {
  const __m256i src_6 = horz_out[row + 3];
  const __m256i src_7 =
      _mm256_permute2x128_si256(horz_out[row + 3], horz_out[row + 4], 0x21);

  src[6] = _mm256_unpacklo_epi16(src_6, src_7);

  const __m256i res_0 = _mm256_madd_epi16(src[0], coeffs[0]);
  const __m256i res_2 = _mm256_madd_epi16(src[2], coeffs[1]);
  const __m256i res_4 = _mm256_madd_epi16(src[4], coeffs[2]);
  const __m256i res_6 = _mm256_madd_epi16(src[6], coeffs[3]);

  const __m256i res_even = _mm256_add_epi32(_mm256_add_epi32(res_0,
                                                            res_2),
                                           _mm256_add_epi32(res_4, res_6));

  src[7] = _mm256_unpackhi_epi16(src_6, src_7);

  const __m256i res_1 = _mm256_madd_epi16(src[1], coeffs[4]);
  const __m256i res_3 = _mm256_madd_epi16(src[3], coeffs[5]);
  const __m256i res_5 = _mm256_madd_epi16(src[5], coeffs[6]);
  const __m256i res_7 = _mm256_madd_epi16(src[7], coeffs[7]);

  const __m256i res_odd = _mm256_add_epi32(_mm256_add_epi32(res_1, res_3),
                                           _mm256_add_epi32(res_5, res_7));

  // Rearrange pixels back into the order 0 ... 7
  *res_lo = _mm256_unpacklo_epi32(res_even, res_odd);
  *res_hi = _mm256_unpackhi_epi32(res_even, res_odd);
}

// Round the vertical-pass accumulators and store two output rows (k and
// k + 1).  In the compound path the result is written to conv_params->dst
// as 16-bit, optionally averaged (plain or distance-weighted) with the
// existing prediction into 8-bit 'pred'; otherwise it is rounded straight
// to 8-bit 'pred'.  res_lo covers pixels 0-3, res_hi pixels 4-7.
static INLINE void store_vertical_filter_output_avx2(
    const __m256i *res_lo, const __m256i *res_hi, const __m256i *res_add_const,
    const __m256i *wt, const __m256i *res_sub_const,
    const __m256i *round_bits_const, uint8_t *pred, ConvolveParams *conv_params,
    int i, int j, int k, const int reduce_bits_vert, int p_stride, int p_width,
    const int round_bits) {
  __m256i res_lo_1 = *res_lo;
  __m256i res_hi_1 = *res_hi;

  if (conv_params->is_compound) {
    // 16-bit intermediate destinations for rows k and k + 1.
    __m128i *const p_0 =
        (__m128i *)&conv_params->dst[(i + k + 4) * conv_params->dst_stride + j];
    __m128i *const p_1 =
        (__m128i *)&conv_params
            ->dst[(i + (k + 1) + 4) * conv_params->dst_stride + j];

    res_lo_1 = _mm256_srai_epi32(_mm256_add_epi32(res_lo_1, *res_add_const),
                                 reduce_bits_vert);

    const __m256i temp_lo_16 = _mm256_packus_epi32(res_lo_1, res_lo_1);
    __m256i res_lo_16;
    if (conv_params->do_average) {
      __m128i *const dst8_0 = (__m128i *)&pred[(i + k + 4) * p_stride + j];
      __m128i *const dst8_1 =
          (__m128i *)&pred[(i + (k + 1) + 4) * p_stride + j];
      const __m128i p_16_0 = _mm_loadl_epi64(p_0);
      const __m128i p_16_1 = _mm_loadl_epi64(p_1);
      const __m256i p_16 =
          _mm256_inserti128_si256(_mm256_castsi128_si256(p_16_0), p_16_1, 1);
      if (conv_params->use_dist_wtd_comp_avg) {
        // Distance-weighted average: (p * w0 + cur * w1) >> DIST_PRECISION_BITS
        const __m256i p_16_lo = _mm256_unpacklo_epi16(p_16, temp_lo_16);
        const __m256i wt_res_lo = _mm256_madd_epi16(p_16_lo, *wt);
        const __m256i shifted_32 =
            _mm256_srai_epi32(wt_res_lo, DIST_PRECISION_BITS);
        res_lo_16 = _mm256_packus_epi32(shifted_32, shifted_32);
      } else {
        // Plain average of the two predictions.
        res_lo_16 = _mm256_srai_epi16(_mm256_add_epi16(p_16, temp_lo_16), 1);
      }
      // Remove the intermediate offset and round down to 8-bit range.
      res_lo_16 = _mm256_add_epi16(res_lo_16, *res_sub_const);
      res_lo_16 = _mm256_srai_epi16(
          _mm256_add_epi16(res_lo_16, *round_bits_const), round_bits);
      const __m256i res_8_lo = _mm256_packus_epi16(res_lo_16, res_lo_16);
      const __m128i res_8_lo_0 = _mm256_castsi256_si128(res_8_lo);
      const __m128i res_8_lo_1 = _mm256_extracti128_si256(res_8_lo, 1);
      *(uint32_t *)dst8_0 = _mm_cvtsi128_si32(res_8_lo_0);
      *(uint32_t *)dst8_1 = _mm_cvtsi128_si32(res_8_lo_1);
    } else {
      const __m128i temp_lo_16_0 = _mm256_castsi256_si128(temp_lo_16);
      const __m128i temp_lo_16_1 = _mm256_extracti128_si256(temp_lo_16, 1);
      _mm_storel_epi64(p_0, temp_lo_16_0);
      _mm_storel_epi64(p_1, temp_lo_16_1);
    }
    if (p_width > 4) {
      // Same processing for pixels 4-7.
      __m128i *const p4_0 =
          (__m128i *)&conv_params
              ->dst[(i + k + 4) * conv_params->dst_stride + j + 4];
      __m128i *const p4_1 =
          (__m128i *)&conv_params
              ->dst[(i + (k + 1) + 4) * conv_params->dst_stride + j + 4];
      res_hi_1 = _mm256_srai_epi32(_mm256_add_epi32(res_hi_1, *res_add_const),
                                   reduce_bits_vert);
      const __m256i temp_hi_16 = _mm256_packus_epi32(res_hi_1, res_hi_1);
      __m256i res_hi_16;
      if (conv_params->do_average) {
        __m128i *const dst8_4_0 =
            (__m128i *)&pred[(i + k + 4) * p_stride + j + 4];
        __m128i *const dst8_4_1 =
            (__m128i *)&pred[(i + (k + 1) + 4) * p_stride + j + 4];
        const __m128i p4_16_0 = _mm_loadl_epi64(p4_0);
        const __m128i p4_16_1 = _mm_loadl_epi64(p4_1);
        const __m256i p4_16 = _mm256_inserti128_si256(
            _mm256_castsi128_si256(p4_16_0), p4_16_1, 1);
        if (conv_params->use_dist_wtd_comp_avg) {
          const __m256i p_16_hi = _mm256_unpacklo_epi16(p4_16, temp_hi_16);
          const __m256i wt_res_hi = _mm256_madd_epi16(p_16_hi, *wt);
          const __m256i shifted_32 =
              _mm256_srai_epi32(wt_res_hi, DIST_PRECISION_BITS);
          res_hi_16 = _mm256_packus_epi32(shifted_32, shifted_32);
        } else {
          res_hi_16 = _mm256_srai_epi16(_mm256_add_epi16(p4_16, temp_hi_16), 1);
        }
        res_hi_16 = _mm256_add_epi16(res_hi_16, *res_sub_const);
        res_hi_16 = _mm256_srai_epi16(
            _mm256_add_epi16(res_hi_16, *round_bits_const), round_bits);
        __m256i res_8_hi = _mm256_packus_epi16(res_hi_16, res_hi_16);
        const __m128i res_8_hi_0 = _mm256_castsi256_si128(res_8_hi);
        const __m128i res_8_hi_1 = _mm256_extracti128_si256(res_8_hi, 1);
        *(uint32_t *)dst8_4_0 = _mm_cvtsi128_si32(res_8_hi_0);
        *(uint32_t *)dst8_4_1 = _mm_cvtsi128_si32(res_8_hi_1);
      } else {
        const __m128i temp_hi_16_0 = _mm256_castsi256_si128(temp_hi_16);
        const __m128i temp_hi_16_1 = _mm256_extracti128_si256(temp_hi_16, 1);
        _mm_storel_epi64(p4_0, temp_hi_16_0);
        _mm_storel_epi64(p4_1, temp_hi_16_1);
      }
    }
  } else {
    // Non-compound path: round directly to 8-bit output.
    const __m256i res_lo_round = _mm256_srai_epi32(
        _mm256_add_epi32(res_lo_1, *res_add_const), reduce_bits_vert);
    const __m256i res_hi_round = _mm256_srai_epi32(
        _mm256_add_epi32(res_hi_1, *res_add_const), reduce_bits_vert);

    const __m256i res_16bit = _mm256_packs_epi32(res_lo_round, res_hi_round);
    const __m256i res_8bit = _mm256_packus_epi16(res_16bit, res_16bit);
    const __m128i res_8bit0 = _mm256_castsi256_si128(res_8bit);
    const __m128i res_8bit1 = _mm256_extracti128_si256(res_8bit, 1);

    // Store, blending with 'pred' if needed
    __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j];
    __m128i *const p1 = (__m128i *)&pred[(i + (k + 1) + 4) * p_stride + j];

    if (p_width == 4) {
      *(uint32_t *)p = _mm_cvtsi128_si32(res_8bit0);
      *(uint32_t *)p1 = _mm_cvtsi128_si32(res_8bit1);
    } else {
      _mm_storel_epi64(p, res_8bit0);
      _mm_storel_epi64(p1, res_8bit1);
    }
  }
}

// Vertical (second) pass of the warp filter, generic case: recomputes the
// coefficients for every row pair since both gamma and delta are non-zero.
static INLINE void warp_vertical_filter_avx2(
    uint8_t *pred, __m256i *horz_out, ConvolveParams *conv_params,
    int16_t gamma, int16_t delta, int p_height, int p_stride, int
                                                                  p_width,
    int i, int j, int sy4, const int reduce_bits_vert,
    const __m256i *res_add_const, const int round_bits,
    const __m256i *res_sub_const, const __m256i *round_bits_const,
    const __m256i *wt) {
  int k, row = 0;
  __m256i src[8];
  // Interleave consecutive horizontal-pass rows pairwise; the permute pulls
  // in the next row so each src[] holds a (row, row + 1) 16-bit pair.
  const __m256i src_0 = horz_out[0];
  const __m256i src_1 =
      _mm256_permute2x128_si256(horz_out[0], horz_out[1], 0x21);
  const __m256i src_2 = horz_out[1];
  const __m256i src_3 =
      _mm256_permute2x128_si256(horz_out[1], horz_out[2], 0x21);
  const __m256i src_4 = horz_out[2];
  const __m256i src_5 =
      _mm256_permute2x128_si256(horz_out[2], horz_out[3], 0x21);

  src[0] = _mm256_unpacklo_epi16(src_0, src_1);
  src[2] = _mm256_unpacklo_epi16(src_2, src_3);
  src[4] = _mm256_unpacklo_epi16(src_4, src_5);

  src[1] = _mm256_unpackhi_epi16(src_0, src_1);
  src[3] = _mm256_unpackhi_epi16(src_2, src_3);
  src[5] = _mm256_unpackhi_epi16(src_4, src_5);

  for (k = -4; k < AOMMIN(4, p_height - i - 4); k += 2) {
    int sy = sy4 + delta * (k + 4);
    __m256i coeffs[8];
    prepare_vertical_filter_coeffs_avx2(gamma, delta, sy, coeffs);
    __m256i res_lo, res_hi;
    filter_src_pixels_vertical_avx2(horz_out, src, coeffs, &res_lo, &res_hi,
                                    row);
    store_vertical_filter_output_avx2(&res_lo, &res_hi, res_add_const, wt,
                                      res_sub_const, round_bits_const, pred,
                                      conv_params, i, j, k, reduce_bits_vert,
                                      p_stride, p_width, round_bits);
    // Slide the source window down by two rows for the next iteration.
    src[0] = src[2];
    src[2] = src[4];
    src[4] = src[6];
    src[1] = src[3];
    src[3] = src[5];
    src[5] = src[7];

    row += 1;
  }
}

// Vertical pass, gamma == 0: taps do not vary across the row, so the cheap
// gamma0 coefficient preparation is used (still recomputed per row pair).
static INLINE void warp_vertical_filter_gamma0_avx2(
    uint8_t *pred, __m256i *horz_out, ConvolveParams *conv_params,
    int16_t gamma, int16_t delta, int p_height, int p_stride, int p_width,
    int i, int j, int sy4, const int reduce_bits_vert,
    const __m256i *res_add_const, const int round_bits,
    const __m256i *res_sub_const, const __m256i *round_bits_const,
    const __m256i *wt) {
  (void)gamma;
  int k, row = 0;
  __m256i src[8];
  const __m256i src_0 = horz_out[0];
  const __m256i src_1 =
      _mm256_permute2x128_si256(horz_out[0], horz_out[1], 0x21);
  const __m256i src_2 = horz_out[1];
  const __m256i src_3 =
      _mm256_permute2x128_si256(horz_out[1], horz_out[2], 0x21);
  const __m256i src_4 = horz_out[2];
  const __m256i src_5 =
      _mm256_permute2x128_si256(horz_out[2], horz_out[3], 0x21);

  src[0] = _mm256_unpacklo_epi16(src_0, src_1);
  src[2] = _mm256_unpacklo_epi16(src_2, src_3);
  src[4] = _mm256_unpacklo_epi16(src_4, src_5);

  src[1] = _mm256_unpackhi_epi16(src_0, src_1);
  src[3] = _mm256_unpackhi_epi16(src_2, src_3);
  src[5] = _mm256_unpackhi_epi16(src_4, src_5);

  for (k = -4; k < AOMMIN(4, p_height - i - 4); k += 2) {
    int sy = sy4 + delta * (k + 4);
    __m256i coeffs[8];
    prepare_vertical_filter_coeffs_gamma0_avx2(delta, sy, coeffs);
    __m256i res_lo, res_hi;
    filter_src_pixels_vertical_avx2(horz_out, src, coeffs, &res_lo, &res_hi,
                                    row);
    store_vertical_filter_output_avx2(&res_lo, &res_hi, res_add_const, wt,
                                      res_sub_const, round_bits_const, pred,
                                      conv_params, i, j, k, reduce_bits_vert,
                                      p_stride, p_width, round_bits);
    src[0] = src[2];
    src[2] = src[4];
    src[4] = src[6];
    src[1] = src[3];
    src[3] = src[5];
    src[5] = src[7];
    row += 1;
  }
}

// Vertical pass, delta == 0: sy is constant across rows, so coefficients are
// prepared once before the loop.
static INLINE void warp_vertical_filter_delta0_avx2(
    uint8_t *pred, __m256i *horz_out, ConvolveParams *conv_params,
    int16_t gamma, int16_t delta, int p_height, int p_stride, int p_width,
    int i, int j, int sy4, const int reduce_bits_vert,
    const __m256i *res_add_const, const int round_bits,
    const __m256i *res_sub_const, const __m256i *round_bits_const,
    const __m256i *wt) {
  (void)delta;
  int k, row = 0;
  __m256i src[8], coeffs[8];
  const __m256i src_0 = horz_out[0];
  const __m256i src_1 =
      _mm256_permute2x128_si256(horz_out[0], horz_out[1], 0x21);
  const __m256i src_2 = horz_out[1];
  const __m256i src_3 =
      _mm256_permute2x128_si256(horz_out[1], horz_out[2], 0x21);
  const __m256i src_4 = horz_out[2];
  const __m256i src_5 =
      _mm256_permute2x128_si256(horz_out[2], horz_out[3], 0x21);

  src[0] = _mm256_unpacklo_epi16(src_0, src_1);
  src[2] = _mm256_unpacklo_epi16(src_2, src_3);
  src[4] = _mm256_unpacklo_epi16(src_4, src_5);

  src[1] = _mm256_unpackhi_epi16(src_0, src_1);
  src[3] = _mm256_unpackhi_epi16(src_2, src_3);
  src[5] = _mm256_unpackhi_epi16(src_4, src_5);

  prepare_vertical_filter_coeffs_delta0_avx2(gamma, sy4, coeffs);

  for (k = -4; k < AOMMIN(4, p_height - i - 4); k += 2) {
    __m256i res_lo, res_hi;
    filter_src_pixels_vertical_avx2(horz_out, src, coeffs, &res_lo, &res_hi,
                                    row);
    store_vertical_filter_output_avx2(&res_lo, &res_hi, res_add_const, wt,
                                      res_sub_const, round_bits_const, pred,
                                      conv_params, i, j, k, reduce_bits_vert,
                                      p_stride, p_width, round_bits);
    src[0] = src[2];
    src[2] = src[4];
    src[4] = src[6];
    src[1] = src[3];
    src[3] = src[5];
    src[5] = src[7];
    row += 1;
  }
}

// Vertical pass, gamma == 0 && delta == 0: one coefficient set serves the
// whole block, prepared once with the gamma0 path.
static INLINE void warp_vertical_filter_gamma0_delta0_avx2(
    uint8_t *pred, __m256i *horz_out, ConvolveParams *conv_params,
    int16_t gamma, int16_t delta, int p_height, int p_stride, int p_width,
    int i, int j, int sy4, const int reduce_bits_vert,
    const __m256i *res_add_const, const int round_bits,
    const __m256i *res_sub_const, const __m256i *round_bits_const,
    const __m256i *wt) {
  (void)gamma;
  int k, row = 0;
  __m256i src[8], coeffs[8];
  const __m256i src_0 = horz_out[0];
  const __m256i src_1 =
      _mm256_permute2x128_si256(horz_out[0], horz_out[1], 0x21);
  const __m256i src_2 = horz_out[1];
  const __m256i src_3 =
      _mm256_permute2x128_si256(horz_out[1], horz_out[2], 0x21);
  const __m256i src_4 = horz_out[2];
  const __m256i src_5 =
      _mm256_permute2x128_si256(horz_out[2], horz_out[3], 0x21);

  src[0] = _mm256_unpacklo_epi16(src_0, src_1);
  src[2] = _mm256_unpacklo_epi16(src_2, src_3);
  src[4] = _mm256_unpacklo_epi16(src_4, src_5);

  src[1] = _mm256_unpackhi_epi16(src_0, src_1);
  src[3] = _mm256_unpackhi_epi16(src_2, src_3);
  src[5] = _mm256_unpackhi_epi16(src_4, src_5);

  prepare_vertical_filter_coeffs_gamma0_avx2(delta, sy4, coeffs);

  for (k = -4; k < AOMMIN(4, p_height - i - 4); k += 2) {
    __m256i res_lo, res_hi;
    filter_src_pixels_vertical_avx2(horz_out, src, coeffs, &res_lo, &res_hi,
                                    row);
    store_vertical_filter_output_avx2(&res_lo, &res_hi, res_add_const, wt,
                                      res_sub_const, round_bits_const, pred,
                                      conv_params, i, j, k, reduce_bits_vert,
                                      p_stride, p_width, round_bits);
    src[0] = src[2];
    src[2] = src[4];
    src[4] = src[6];
    src[1] = src[3];
    src[3] = src[5];
    src[5] = src[7];
    row += 1;
  }
}

// Dispatch to the vertical-filter specialization matching which of gamma
// and delta are zero.
static INLINE void prepare_warp_vertical_filter_avx2(
    uint8_t *pred, __m256i *horz_out, ConvolveParams *conv_params,
    int16_t gamma, int16_t delta, int p_height, int p_stride, int p_width,
    int i, int j, int sy4, const int reduce_bits_vert,
    const __m256i *res_add_const, const int round_bits,
    const __m256i *res_sub_const, const __m256i *round_bits_const,
    const __m256i *wt) {
  if (gamma == 0 && delta == 0)
    warp_vertical_filter_gamma0_delta0_avx2(
        pred, horz_out, conv_params, gamma, delta, p_height, p_stride, p_width,
        i, j, sy4, reduce_bits_vert, res_add_const, round_bits, res_sub_const,
        round_bits_const, wt);
  else if (gamma == 0 && delta != 0)
    warp_vertical_filter_gamma0_avx2(
        pred, horz_out, conv_params, gamma, delta, p_height, p_stride, p_width,
        i, j, sy4, reduce_bits_vert, res_add_const, round_bits, res_sub_const,
        round_bits_const, wt);
  else if (gamma != 0 && delta == 0)
    warp_vertical_filter_delta0_avx2(
        pred, horz_out, conv_params, gamma, delta, p_height, p_stride, p_width,
        i, j, sy4, reduce_bits_vert, res_add_const, round_bits, res_sub_const,
        round_bits_const, wt);
  else
    warp_vertical_filter_avx2(pred, horz_out, conv_params, gamma, delta,
                              p_height, p_stride, p_width, i, j, sy4,
                              reduce_bits_vert, res_add_const, round_bits,
                              res_sub_const, round_bits_const, wt);
}

// Dispatch to the horizontal-filter specialization matching which of alpha
// and beta are zero.
static INLINE void prepare_warp_horizontal_filter_avx2(
    const uint8_t *ref, __m256i *horz_out, int stride, int32_t ix4, int32_t iy4,
    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
    const __m256i *round_const, const __m128i *shift,
    const __m256i *shuffle_src) {
  if (alpha == 0 && beta == 0)
    warp_horizontal_filter_alpha0_beta0_avx2(
        ref, horz_out, stride, ix4, iy4, sx4, alpha, beta, p_height, height, i,
        round_const, shift, shuffle_src);
  else if (alpha == 0 && beta != 0)
    warp_horizontal_filter_alpha0_avx2(ref, horz_out, stride, ix4, iy4, sx4,
                                       alpha, beta, p_height, height, i,
                                       round_const, shift, shuffle_src);
  else if (alpha != 0 && beta == 0)
    warp_horizontal_filter_beta0_avx2(ref, horz_out, stride, ix4, iy4, sx4,
                                      alpha, beta, p_height, height, i,
                                      round_const, shift, shuffle_src);
  else
    warp_horizontal_filter_avx2(ref, horz_out, stride, ix4, iy4, sx4, alpha,
                                beta, p_height, height, i, round_const, shift,
                                shuffle_src);
}

// Sum of per-pixel errors between 'ref' and 'dst' over a p_width x p_height
// region.  Processes 4 rows x 16 pixels per iteration; per-pixel differences
// are biased by 255 to index error_measure_lut (range of dst - ref is
// [-255, 255]).  Width/height remainders fall back to the scalar
// error_measure().  NOTE(review): the 16-byte loads are aligned
// (_mm_load_si128) — presumably ref/dst rows are 16-byte aligned here;
// confirm against callers.
int64_t av1_calc_frame_error_avx2(const uint8_t *const ref, int ref_stride,
                                  const uint8_t *const dst, int p_width,
                                  int p_height, int dst_stride) {
  int64_t sum_error = 0;
  int i, j;
  __m256i row_error, col_error;
  __m256i zero = _mm256_set1_epi16(0);
  __m256i dup_255 = _mm256_set1_epi16(255);
  col_error = zero;

  for (i = 0; i < (p_height / 4); i++) {
    row_error = _mm256_set1_epi16(0);
    for (j = 0; j < (p_width / 16); j++) {
      __m256i ref_1_16 = _mm256_cvtepu8_epi16(_mm_load_si128(
          (__m128i *)(ref + (j * 16) + (((i * 4) + 0) * ref_stride))));
      __m256i dst_1_16 = _mm256_cvtepu8_epi16(_mm_load_si128(
          (__m128i *)(dst + (j * 16) + (((i * 4) + 0) * dst_stride))));
      __m256i ref_2_16 = _mm256_cvtepu8_epi16(_mm_load_si128(
          (__m128i *)(ref + (j * 16) + (((i * 4) + 1) * ref_stride))));
      __m256i dst_2_16 = _mm256_cvtepu8_epi16(_mm_load_si128(
          (__m128i *)(dst + (j * 16) + (((i * 4) + 1) * dst_stride))));
      __m256i ref_3_16 = _mm256_cvtepu8_epi16(_mm_load_si128(
          (__m128i *)(ref + (j * 16) + (((i * 4) + 2) * ref_stride))));
      __m256i dst_3_16 = _mm256_cvtepu8_epi16(_mm_load_si128(
          (__m128i *)(dst + (j * 16) + (((i * 4) + 2) * dst_stride))));
      __m256i ref_4_16 = _mm256_cvtepu8_epi16(_mm_load_si128(
          (__m128i *)(ref + (j * 16) + (((i * 4) + 3) * ref_stride))));
      __m256i dst_4_16 = _mm256_cvtepu8_epi16(_mm_load_si128(
          (__m128i *)(dst + (j * 16) + (((i * 4) + 3) * dst_stride))));

      // diff + 255 maps the signed difference into the LUT index range.
      __m256i diff_1 =
          _mm256_add_epi16(_mm256_sub_epi16(dst_1_16, ref_1_16), dup_255);
      __m256i diff_2 =
          _mm256_add_epi16(_mm256_sub_epi16(dst_2_16, ref_2_16), dup_255);
      __m256i diff_3 =
          _mm256_add_epi16(_mm256_sub_epi16(dst_3_16, ref_3_16), dup_255);
      __m256i diff_4 =
          _mm256_add_epi16(_mm256_sub_epi16(dst_4_16, ref_4_16), dup_255);

      // Widen 16-bit indices to 32-bit for the gather.
      __m256i diff_1_lo = _mm256_unpacklo_epi16(diff_1, zero);
      __m256i diff_1_hi = _mm256_unpackhi_epi16(diff_1, zero);
      __m256i diff_2_lo = _mm256_unpacklo_epi16(diff_2, zero);
      __m256i diff_2_hi = _mm256_unpackhi_epi16(diff_2, zero);
      __m256i diff_3_lo = _mm256_unpacklo_epi16(diff_3, zero);
      __m256i diff_3_hi = _mm256_unpackhi_epi16(diff_3, zero);
      __m256i diff_4_lo = _mm256_unpacklo_epi16(diff_4, zero);
      __m256i diff_4_hi = _mm256_unpackhi_epi16(diff_4, zero);

      __m256i error_1_lo =
          _mm256_i32gather_epi32(error_measure_lut, diff_1_lo, 4);
      __m256i error_1_hi =
          _mm256_i32gather_epi32(error_measure_lut, diff_1_hi, 4);
      __m256i error_2_lo =
          _mm256_i32gather_epi32(error_measure_lut, diff_2_lo, 4);
      __m256i error_2_hi =
          _mm256_i32gather_epi32(error_measure_lut, diff_2_hi, 4);
      __m256i error_3_lo =
          _mm256_i32gather_epi32(error_measure_lut, diff_3_lo, 4);
      __m256i error_3_hi =
          _mm256_i32gather_epi32(error_measure_lut, diff_3_hi, 4);
      __m256i error_4_lo =
          _mm256_i32gather_epi32(error_measure_lut, diff_4_lo, 4);
      __m256i error_4_hi =
          _mm256_i32gather_epi32(error_measure_lut, diff_4_hi, 4);

      __m256i error_1 = _mm256_add_epi32(error_1_lo, error_1_hi);
      __m256i error_2 = _mm256_add_epi32(error_2_lo, error_2_hi);
      __m256i error_3 = _mm256_add_epi32(error_3_lo, error_3_hi);
      __m256i error_4 = _mm256_add_epi32(error_4_lo, error_4_hi);

      __m256i error_1_2 = _mm256_add_epi32(error_1, error_2);
      __m256i error_3_4 = _mm256_add_epi32(error_3, error_4);

      __m256i error_1_2_3_4 = _mm256_add_epi32(error_1_2, error_3_4);
      row_error = _mm256_add_epi32(row_error, error_1_2_3_4);
    }
    // Widen the 32-bit row accumulator to 64-bit before it can overflow.
    __m256i col_error_lo = _mm256_unpacklo_epi32(row_error, zero);
    __m256i col_error_hi = _mm256_unpackhi_epi32(row_error, zero);
    __m256i col_error_temp = _mm256_add_epi64(col_error_lo, col_error_hi);
    col_error = _mm256_add_epi64(col_error, col_error_temp);
    // Error summation for remaining width, which is not multiple of 16
    if (p_width & 0xf) {
      for (int k = 0; k < 4; ++k) {
        for (int l = j * 16; l < p_width; ++l) {
          sum_error +=
              (int64_t)error_measure(dst[l + ((i * 4) + k) * dst_stride] -
                                     ref[l + ((i * 4) + k) * ref_stride]);
        }
      }
    }
  }
  // Horizontal reduction of the four 64-bit partial sums.
  __m128i sum_error_q_0 = _mm256_castsi256_si128(col_error);
  __m128i sum_error_q_1 = _mm256_extracti128_si256(col_error, 1);
  sum_error_q_0 = _mm_add_epi64(sum_error_q_0, sum_error_q_1);
  int64_t sum_error_d_0, sum_error_d_1;
  xx_storel_64(&sum_error_d_0, sum_error_q_0);
  xx_storel_64(&sum_error_d_1, _mm_srli_si128(sum_error_q_0, 8));
  sum_error = (sum_error + sum_error_d_0 + sum_error_d_1);
  // Error summation for remaining height, which is not multiple of 4
  if (p_height & 0x3) {
    for (int k = i * 4; k < p_height; ++k) {
      for (int l = 0; l < p_width; ++l) {
        sum_error += (int64_t)error_measure(dst[l + k * dst_stride] -
                                            ref[l + k * ref_stride]);
      }
    }
  }
  return sum_error;
}

// AVX2 implementation of the affine warp prediction (8-bit path).
// Continues past the end of this view.
void av1_warp_affine_avx2(const int32_t *mat, const uint8_t *ref, int width,
                          int height, int stride, uint8_t *pred, int p_col,
                          int p_row, int p_width, int p_height, int p_stride,
                          int subsampling_x, int subsampling_y,
                          ConvolveParams *conv_params, int16_t alpha,
                          int16_t beta, int16_t gamma, int16_t delta) {
  __m256i horz_out[8];
  int i, j, k;
  const int bd = 8;
const int reduce_bits_horiz = conv_params->round_0; + const int reduce_bits_vert = conv_params->is_compound + ? conv_params->round_1 + : 2 * FILTER_BITS - reduce_bits_horiz; + const int offset_bits_horiz = bd + FILTER_BITS - 1; + assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL)); + + const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz; + const __m256i reduce_bits_vert_const = + _mm256_set1_epi32(((1 << reduce_bits_vert) >> 1)); + const __m256i res_add_const = _mm256_set1_epi32(1 << offset_bits_vert); + const int round_bits = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + assert(IMPLIES(conv_params->do_average, conv_params->is_compound)); + + const __m256i round_const = _mm256_set1_epi16( + (1 << offset_bits_horiz) + ((1 << reduce_bits_horiz) >> 1)); + const __m128i shift = _mm_cvtsi32_si128(reduce_bits_horiz); + + __m256i res_sub_const, round_bits_const, wt; + unpack_weights_and_set_round_const_avx2(conv_params, round_bits, offset_bits, + &res_sub_const, &round_bits_const, + &wt); + + __m256i res_add_const_1; + if (conv_params->is_compound == 1) { + res_add_const_1 = _mm256_add_epi32(reduce_bits_vert_const, res_add_const); + } else { + res_add_const_1 = _mm256_set1_epi32(-(1 << (bd + reduce_bits_vert - 1)) + + ((1 << reduce_bits_vert) >> 1)); + } + const int32_t const1 = alpha * (-4) + beta * (-4) + + (1 << (WARPEDDIFF_PREC_BITS - 1)) + + (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS); + const int32_t const2 = gamma * (-4) + delta * (-4) + + (1 << (WARPEDDIFF_PREC_BITS - 1)) + + (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS); + const int32_t const3 = ((1 << WARP_PARAM_REDUCE_BITS) - 1); + const int16_t const4 = (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)); + const int16_t const5 = (1 << (FILTER_BITS - reduce_bits_horiz)); + + __m256i shuffle_src[4]; + shuffle_src[0] = _mm256_load_si256((__m256i *)shuffle_src0); + shuffle_src[1] = 
_mm256_load_si256((__m256i *)shuffle_src1); + shuffle_src[2] = _mm256_load_si256((__m256i *)shuffle_src2); + shuffle_src[3] = _mm256_load_si256((__m256i *)shuffle_src3); + + for (i = 0; i < p_height; i += 8) { + for (j = 0; j < p_width; j += 8) { + const int32_t src_x = (p_col + j + 4) << subsampling_x; + const int32_t src_y = (p_row + i + 4) << subsampling_y; + const int32_t dst_x = mat[2] * src_x + mat[3] * src_y + mat[0]; + const int32_t dst_y = mat[4] * src_x + mat[5] * src_y + mat[1]; + const int32_t x4 = dst_x >> subsampling_x; + const int32_t y4 = dst_y >> subsampling_y; + + int32_t ix4 = x4 >> WARPEDMODEL_PREC_BITS; + int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); + int32_t iy4 = y4 >> WARPEDMODEL_PREC_BITS; + int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); + + // Add in all the constant terms, including rounding and offset + sx4 += const1; + sy4 += const2; + + sx4 &= ~const3; + sy4 &= ~const3; + + // Horizontal filter + // If the block is aligned such that, after clamping, every sample + // would be taken from the leftmost/rightmost column, then we can + // skip the expensive horizontal filter. 
+ + if (ix4 <= -7) { + int iy, row = 0; + for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) { + iy = iy4 + k; + iy = clamp(iy, 0, height - 1); + const __m256i temp_0 = + _mm256_set1_epi16(const4 + ref[iy * stride] * const5); + iy = iy4 + k + 1; + iy = clamp(iy, 0, height - 1); + const __m256i temp_1 = + _mm256_set1_epi16(const4 + ref[iy * stride] * const5); + horz_out[row] = _mm256_blend_epi32(temp_0, temp_1, 0xf0); + row += 1; + } + iy = iy4 + k; + iy = clamp(iy, 0, height - 1); + horz_out[row] = _mm256_set1_epi16(const4 + ref[iy * stride] * const5); + } else if (ix4 >= width + 6) { + int iy, row = 0; + for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) { + iy = iy4 + k; + iy = clamp(iy, 0, height - 1); + const __m256i temp_0 = _mm256_set1_epi16( + const4 + ref[iy * stride + (width - 1)] * const5); + iy = iy4 + k + 1; + iy = clamp(iy, 0, height - 1); + const __m256i temp_1 = _mm256_set1_epi16( + const4 + ref[iy * stride + (width - 1)] * const5); + horz_out[row] = _mm256_blend_epi32(temp_0, temp_1, 0xf0); + row += 1; + } + iy = iy4 + k; + iy = clamp(iy, 0, height - 1); + horz_out[row] = + _mm256_set1_epi16(const4 + ref[iy * stride + (width - 1)] * const5); + } else if (((ix4 - 7) < 0) || ((ix4 + 9) > width)) { + const int out_of_boundary_left = -(ix4 - 6); + const int out_of_boundary_right = (ix4 + 8) - width; + int iy, sx, row = 0; + for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) { + iy = iy4 + k; + iy = clamp(iy, 0, height - 1); + __m128i src0 = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); + iy = iy4 + k + 1; + iy = clamp(iy, 0, height - 1); + __m128i src1 = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); + + if (out_of_boundary_left >= 0) { + const __m128i shuffle_reg_left = + _mm_loadu_si128((__m128i *)warp_pad_left[out_of_boundary_left]); + src0 = _mm_shuffle_epi8(src0, shuffle_reg_left); + src1 = _mm_shuffle_epi8(src1, shuffle_reg_left); + } + if (out_of_boundary_right >= 0) { + const __m128i 
shuffle_reg_right = _mm_loadu_si128( + (__m128i *)warp_pad_right[out_of_boundary_right]); + src0 = _mm_shuffle_epi8(src0, shuffle_reg_right); + src1 = _mm_shuffle_epi8(src1, shuffle_reg_right); + } + sx = sx4 + beta * (k + 4); + const __m256i src_01 = + _mm256_inserti128_si256(_mm256_castsi128_si256(src0), src1, 0x1); + horizontal_filter_avx2(src_01, horz_out, sx, alpha, beta, row, + shuffle_src, &round_const, &shift); + row += 1; + } + iy = iy4 + k; + iy = clamp(iy, 0, height - 1); + __m128i src = _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); + if (out_of_boundary_left >= 0) { + const __m128i shuffle_reg_left = + _mm_loadu_si128((__m128i *)warp_pad_left[out_of_boundary_left]); + src = _mm_shuffle_epi8(src, shuffle_reg_left); + } + if (out_of_boundary_right >= 0) { + const __m128i shuffle_reg_right = + _mm_loadu_si128((__m128i *)warp_pad_right[out_of_boundary_right]); + src = _mm_shuffle_epi8(src, shuffle_reg_right); + } + sx = sx4 + beta * (k + 4); + const __m256i src_01 = _mm256_castsi128_si256(src); + __m256i coeff[4]; + prepare_horizontal_filter_coeff(alpha, sx, coeff); + filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, + &round_const, &shift, row); + } else { + prepare_warp_horizontal_filter_avx2( + ref, horz_out, stride, ix4, iy4, sx4, alpha, beta, p_height, height, + i, &round_const, &shift, shuffle_src); + } + + // Vertical filter + prepare_warp_vertical_filter_avx2( + pred, horz_out, conv_params, gamma, delta, p_height, p_stride, + p_width, i, j, sy4, reduce_bits_vert, &res_add_const_1, round_bits, + &res_sub_const, &round_bits_const, &wt); + } + } +} diff --git a/media/libaom/src/av1/common/x86/warp_plane_sse2.c b/media/libaom/src/av1/common/x86/warp_plane_sse2.c new file mode 100644 index 000000000..6ff666518 --- /dev/null +++ b/media/libaom/src/av1/common/x86/warp_plane_sse2.c @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <emmintrin.h> + +#include "aom_dsp/x86/synonyms.h" +#include "av1/common/warped_motion.h" +#include "config/av1_rtcd.h" + +int64_t av1_calc_frame_error_sse2(const uint8_t *const ref, int ref_stride, + const uint8_t *const dst, int p_width, + int p_height, int dst_stride) { + int64_t sum_error = 0; + int i, j; + __m128i row_error, col_error; + __m128i zero = _mm_set1_epi16(0); + __m128i dup_255 = _mm_set1_epi16(255); + col_error = zero; + for (i = 0; i < (p_height); i++) { + row_error = zero; + for (j = 0; j < (p_width / 16); j++) { + __m128i ref_8 = + _mm_load_si128((__m128i *)(ref + (j * 16) + (i * ref_stride))); + __m128i dst_8 = + _mm_load_si128((__m128i *)(dst + (j * 16) + (i * dst_stride))); + __m128i ref_16_lo = _mm_unpacklo_epi8(ref_8, zero); + __m128i ref_16_hi = _mm_unpackhi_epi8(ref_8, zero); + __m128i dst_16_lo = _mm_unpacklo_epi8(dst_8, zero); + __m128i dst_16_hi = _mm_unpackhi_epi8(dst_8, zero); + + __m128i diff_1 = + _mm_add_epi16(_mm_sub_epi16(dst_16_lo, ref_16_lo), dup_255); + __m128i diff_2 = + _mm_add_epi16(_mm_sub_epi16(dst_16_hi, ref_16_hi), dup_255); + + __m128i error_1_lo = + _mm_set_epi32(error_measure_lut[_mm_extract_epi16(diff_1, 3)], + error_measure_lut[_mm_extract_epi16(diff_1, 2)], + error_measure_lut[_mm_extract_epi16(diff_1, 1)], + error_measure_lut[_mm_extract_epi16(diff_1, 0)]); + __m128i error_1_hi = + _mm_set_epi32(error_measure_lut[_mm_extract_epi16(diff_1, 7)], + error_measure_lut[_mm_extract_epi16(diff_1, 6)], + 
error_measure_lut[_mm_extract_epi16(diff_1, 5)], + error_measure_lut[_mm_extract_epi16(diff_1, 4)]); + __m128i error_2_lo = + _mm_set_epi32(error_measure_lut[_mm_extract_epi16(diff_2, 3)], + error_measure_lut[_mm_extract_epi16(diff_2, 2)], + error_measure_lut[_mm_extract_epi16(diff_2, 1)], + error_measure_lut[_mm_extract_epi16(diff_2, 0)]); + __m128i error_2_hi = + _mm_set_epi32(error_measure_lut[_mm_extract_epi16(diff_2, 7)], + error_measure_lut[_mm_extract_epi16(diff_2, 6)], + error_measure_lut[_mm_extract_epi16(diff_2, 5)], + error_measure_lut[_mm_extract_epi16(diff_2, 4)]); + + __m128i error_1 = _mm_add_epi32(error_1_lo, error_1_hi); + __m128i error_2 = _mm_add_epi32(error_2_lo, error_2_hi); + __m128i error_1_2 = _mm_add_epi32(error_1, error_2); + + row_error = _mm_add_epi32(row_error, error_1_2); + } + __m128i col_error_lo = _mm_unpacklo_epi32(row_error, zero); + __m128i col_error_hi = _mm_unpackhi_epi32(row_error, zero); + __m128i col_error_temp = _mm_add_epi64(col_error_lo, col_error_hi); + col_error = _mm_add_epi64(col_error, col_error_temp); + // Error summation for remaining width, which is not multiple of 16 + if (p_width & 0xf) { + for (int l = j * 16; l < p_width; ++l) { + sum_error += (int64_t)error_measure(dst[l + i * dst_stride] - + ref[l + i * ref_stride]); + } + } + } + int64_t sum_error_d_0, sum_error_d_1; + xx_storel_64(&sum_error_d_0, col_error); + xx_storel_64(&sum_error_d_1, _mm_srli_si128(col_error, 8)); + sum_error = (sum_error + sum_error_d_0 + sum_error_d_1); + return sum_error; +} diff --git a/media/libaom/src/av1/common/x86/warp_plane_sse4.c b/media/libaom/src/av1/common/x86/warp_plane_sse4.c index b810cea2e..10ddf92d0 100644 --- a/media/libaom/src/av1/common/x86/warp_plane_sse4.c +++ b/media/libaom/src/av1/common/x86/warp_plane_sse4.c @@ -16,7 +16,7 @@ #include "av1/common/warped_motion.h" -/* This is a modified version of 'warped_filter' from warped_motion.c: +/* This is a modified version of 'av1_warped_filter' from warped_motion.c: 
* Each coefficient is stored in 8 bits instead of 16 bits * The coefficients are rearranged in the column order 0, 2, 4, 6, 1, 3, 5, 7 @@ -31,8 +31,8 @@ coefficients into the correct order more quickly. */ /* clang-format off */ -DECLARE_ALIGNED(8, static const int8_t, - filter_8bit[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8]) = { +DECLARE_ALIGNED(8, const int8_t, + av1_filter_8bit[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8]) = { #if WARPEDPIXEL_PREC_BITS == 6 // [-1, 0) { 0, 127, 0, 0, 0, 1, 0, 0}, { 0, 127, 0, 0, -1, 2, 0, 0}, @@ -198,40 +198,53 @@ DECLARE_ALIGNED(8, static const int8_t, // in an SSE register into two sequences: // 0, 2, 2, 4, ..., 12, 12, 14, <don't care> // 1, 3, 3, 5, ..., 13, 13, 15, <don't care> -static const uint8_t even_mask[16] = { 0, 2, 2, 4, 4, 6, 6, 8, - 8, 10, 10, 12, 12, 14, 14, 0 }; -static const uint8_t odd_mask[16] = { 1, 3, 3, 5, 5, 7, 7, 9, - 9, 11, 11, 13, 13, 15, 15, 0 }; - -static const uint8_t shuffle_alpha0_mask01[16] = { 0, 1, 0, 1, 0, 1, 0, 1, - 0, 1, 0, 1, 0, 1, 0, 1 }; - -static const uint8_t shuffle_alpha0_mask23[16] = { 2, 3, 2, 3, 2, 3, 2, 3, - 2, 3, 2, 3, 2, 3, 2, 3 }; - -static const uint8_t shuffle_alpha0_mask45[16] = { 4, 5, 4, 5, 4, 5, 4, 5, - 4, 5, 4, 5, 4, 5, 4, 5 }; - -static const uint8_t shuffle_alpha0_mask67[16] = { 6, 7, 6, 7, 6, 7, 6, 7, - 6, 7, 6, 7, 6, 7, 6, 7 }; - -static const uint8_t shuffle_gamma0_mask0[16] = { 0, 1, 2, 3, 0, 1, 2, 3, - 0, 1, 2, 3, 0, 1, 2, 3 }; -static const uint8_t shuffle_gamma0_mask1[16] = { 4, 5, 6, 7, 4, 5, 6, 7, - 4, 5, 6, 7, 4, 5, 6, 7 }; -static const uint8_t shuffle_gamma0_mask2[16] = { 8, 9, 10, 11, 8, 9, 10, 11, - 8, 9, 10, 11, 8, 9, 10, 11 }; -static const uint8_t shuffle_gamma0_mask3[16] = { - 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15 -}; +DECLARE_ALIGNED(16, static const uint8_t, + even_mask[16]) = { 0, 2, 2, 4, 4, 6, 6, 8, + 8, 10, 10, 12, 12, 14, 14, 0 }; + +DECLARE_ALIGNED(16, static const uint8_t, + odd_mask[16]) = { 1, 3, 3, 5, 5, 7, 7, 9, + 9, 11, 11, 13, 
13, 15, 15, 0 }; + +DECLARE_ALIGNED(16, static const uint8_t, + shuffle_alpha0_mask01[16]) = { 0, 1, 0, 1, 0, 1, 0, 1, + 0, 1, 0, 1, 0, 1, 0, 1 }; + +DECLARE_ALIGNED(16, static const uint8_t, + shuffle_alpha0_mask23[16]) = { 2, 3, 2, 3, 2, 3, 2, 3, + 2, 3, 2, 3, 2, 3, 2, 3 }; + +DECLARE_ALIGNED(16, static const uint8_t, + shuffle_alpha0_mask45[16]) = { 4, 5, 4, 5, 4, 5, 4, 5, + 4, 5, 4, 5, 4, 5, 4, 5 }; + +DECLARE_ALIGNED(16, static const uint8_t, + shuffle_alpha0_mask67[16]) = { 6, 7, 6, 7, 6, 7, 6, 7, + 6, 7, 6, 7, 6, 7, 6, 7 }; + +DECLARE_ALIGNED(16, static const uint8_t, + shuffle_gamma0_mask0[16]) = { 0, 1, 2, 3, 0, 1, 2, 3, + 0, 1, 2, 3, 0, 1, 2, 3 }; + +DECLARE_ALIGNED(16, static const uint8_t, + shuffle_gamma0_mask1[16]) = { 4, 5, 6, 7, 4, 5, 6, 7, + 4, 5, 6, 7, 4, 5, 6, 7 }; + +DECLARE_ALIGNED(16, static const uint8_t, + shuffle_gamma0_mask2[16]) = { 8, 9, 10, 11, 8, 9, 10, 11, + 8, 9, 10, 11, 8, 9, 10, 11 }; + +DECLARE_ALIGNED(16, static const uint8_t, + shuffle_gamma0_mask3[16]) = { 12, 13, 14, 15, 12, 13, 14, 15, + 12, 13, 14, 15, 12, 13, 14, 15 }; static INLINE void filter_src_pixels(__m128i src, __m128i *tmp, __m128i *coeff, const int offset_bits_horiz, const int reduce_bits_horiz, int k) { const __m128i src_even = - _mm_shuffle_epi8(src, _mm_loadu_si128((__m128i *)even_mask)); + _mm_shuffle_epi8(src, _mm_load_si128((__m128i *)even_mask)); const __m128i src_odd = - _mm_shuffle_epi8(src, _mm_loadu_si128((__m128i *)odd_mask)); + _mm_shuffle_epi8(src, _mm_load_si128((__m128i *)odd_mask)); // The pixel order we need for 'src' is: // 0 2 2 4 4 6 6 8 1 3 3 5 5 7 7 9 const __m128i src_02 = _mm_unpacklo_epi64(src_even, src_odd); @@ -271,21 +284,21 @@ static INLINE void prepare_horizontal_filter_coeff(int alpha, int sx, __m128i *coeff) { // Filter even-index pixels const __m128i tmp_0 = _mm_loadl_epi64( - (__m128i *)&filter_8bit[(sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS]); + (__m128i *)&av1_filter_8bit[(sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS]); const __m128i 
tmp_1 = _mm_loadl_epi64( - (__m128i *)&filter_8bit[(sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS]); + (__m128i *)&av1_filter_8bit[(sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS]); const __m128i tmp_2 = _mm_loadl_epi64( - (__m128i *)&filter_8bit[(sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS]); + (__m128i *)&av1_filter_8bit[(sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS]); const __m128i tmp_3 = _mm_loadl_epi64( - (__m128i *)&filter_8bit[(sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS]); + (__m128i *)&av1_filter_8bit[(sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS]); const __m128i tmp_4 = _mm_loadl_epi64( - (__m128i *)&filter_8bit[(sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS]); + (__m128i *)&av1_filter_8bit[(sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS]); const __m128i tmp_5 = _mm_loadl_epi64( - (__m128i *)&filter_8bit[(sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS]); + (__m128i *)&av1_filter_8bit[(sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS]); const __m128i tmp_6 = _mm_loadl_epi64( - (__m128i *)&filter_8bit[(sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS]); + (__m128i *)&av1_filter_8bit[(sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS]); const __m128i tmp_7 = _mm_loadl_epi64( - (__m128i *)&filter_8bit[(sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS]); + (__m128i *)&av1_filter_8bit[(sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS]); // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 0 2 const __m128i tmp_8 = _mm_unpacklo_epi16(tmp_0, tmp_2); @@ -319,20 +332,20 @@ static INLINE void prepare_horizontal_filter_coeff_alpha0(int sx, __m128i *coeff) { // Filter even-index pixels const __m128i tmp_0 = - _mm_loadl_epi64((__m128i *)&filter_8bit[sx >> WARPEDDIFF_PREC_BITS]); + _mm_loadl_epi64((__m128i *)&av1_filter_8bit[sx >> WARPEDDIFF_PREC_BITS]); // Coeffs 0 2 for pixels 0 2 4 6 1 3 5 7 - coeff[0] = _mm_shuffle_epi8( - tmp_0, _mm_loadu_si128((__m128i *)shuffle_alpha0_mask01)); + coeff[0] = + _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_alpha0_mask01)); // Coeffs 4 6 for pixels 0 2 4 6 1 3 5 7 - coeff[1] = _mm_shuffle_epi8( - tmp_0, 
_mm_loadu_si128((__m128i *)shuffle_alpha0_mask23)); + coeff[1] = + _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_alpha0_mask23)); // Coeffs 1 3 for pixels 0 2 4 6 1 3 5 7 - coeff[2] = _mm_shuffle_epi8( - tmp_0, _mm_loadu_si128((__m128i *)shuffle_alpha0_mask45)); + coeff[2] = + _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_alpha0_mask45)); // Coeffs 5 7 for pixels 0 2 4 6 1 3 5 7 - coeff[3] = _mm_shuffle_epi8( - tmp_0, _mm_loadu_si128((__m128i *)shuffle_alpha0_mask67)); + coeff[3] = + _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_alpha0_mask67)); } static INLINE void horizontal_filter(__m128i src, __m128i *tmp, int sx, @@ -449,21 +462,25 @@ static INLINE void unpack_weights_and_set_round_const( const int w0 = conv_params->fwd_offset; const int w1 = conv_params->bck_offset; - const __m128i wt0 = _mm_set1_epi16(w0); - const __m128i wt1 = _mm_set1_epi16(w1); + const __m128i wt0 = _mm_set1_epi16((int16_t)w0); + const __m128i wt1 = _mm_set1_epi16((int16_t)w1); *wt = _mm_unpacklo_epi16(wt0, wt1); } static INLINE void prepare_vertical_filter_coeffs(int gamma, int sy, __m128i *coeffs) { - const __m128i tmp_0 = _mm_loadu_si128( - (__m128i *)(warped_filter + ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS))); - const __m128i tmp_2 = _mm_loadu_si128( - (__m128i *)(warped_filter + ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS))); - const __m128i tmp_4 = _mm_loadu_si128( - (__m128i *)(warped_filter + ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS))); - const __m128i tmp_6 = _mm_loadu_si128( - (__m128i *)(warped_filter + ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_0 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_2 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_4 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_6 = + 
_mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS))); const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2); const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6); @@ -476,14 +493,18 @@ static INLINE void prepare_vertical_filter_coeffs(int gamma, int sy, coeffs[2] = _mm_unpacklo_epi64(tmp_12, tmp_14); coeffs[3] = _mm_unpackhi_epi64(tmp_12, tmp_14); - const __m128i tmp_1 = _mm_loadu_si128( - (__m128i *)(warped_filter + ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS))); - const __m128i tmp_3 = _mm_loadu_si128( - (__m128i *)(warped_filter + ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS))); - const __m128i tmp_5 = _mm_loadu_si128( - (__m128i *)(warped_filter + ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS))); - const __m128i tmp_7 = _mm_loadu_si128( - (__m128i *)(warped_filter + ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_1 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_3 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_5 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_7 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS))); const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3); const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7); @@ -500,17 +521,17 @@ static INLINE void prepare_vertical_filter_coeffs(int gamma, int sy, static INLINE void prepare_vertical_filter_coeffs_gamma0(int sy, __m128i *coeffs) { const __m128i tmp_0 = _mm_loadu_si128( - (__m128i *)(warped_filter + (sy >> WARPEDDIFF_PREC_BITS))); + (__m128i *)(av1_warped_filter + (sy >> WARPEDDIFF_PREC_BITS))); // even coeffs coeffs[0] = - _mm_shuffle_epi8(tmp_0, _mm_loadu_si128((__m128i *)shuffle_gamma0_mask0)); + _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_gamma0_mask0)); coeffs[1] = 
- _mm_shuffle_epi8(tmp_0, _mm_loadu_si128((__m128i *)shuffle_gamma0_mask1)); + _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_gamma0_mask1)); coeffs[2] = - _mm_shuffle_epi8(tmp_0, _mm_loadu_si128((__m128i *)shuffle_gamma0_mask2)); + _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_gamma0_mask2)); coeffs[3] = - _mm_shuffle_epi8(tmp_0, _mm_loadu_si128((__m128i *)shuffle_gamma0_mask3)); + _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_gamma0_mask3)); // odd coeffs coeffs[4] = coeffs[0]; @@ -577,7 +598,7 @@ static INLINE void store_vertical_filter_output( __m128i *const dst8 = (__m128i *)&pred[(i + k + 4) * p_stride + j]; const __m128i p_16 = _mm_loadl_epi64(p); - if (conv_params->use_jnt_comp_avg) { + if (conv_params->use_dist_wtd_comp_avg) { const __m128i p_16_lo = _mm_unpacklo_epi16(p_16, temp_lo_16); const __m128i wt_res_lo = _mm_madd_epi16(p_16_lo, *wt); const __m128i shifted_32 = @@ -610,7 +631,7 @@ static INLINE void store_vertical_filter_output( (__m128i *)&pred[(i + k + 4) * p_stride + j + 4]; const __m128i p4_16 = _mm_loadl_epi64(p4); - if (conv_params->use_jnt_comp_avg) { + if (conv_params->use_dist_wtd_comp_avg) { const __m128i p_16_hi = _mm_unpacklo_epi16(p4_16, temp_hi_16); const __m128i wt_res_hi = _mm_madd_epi16(p_16_hi, *wt); const __m128i shifted_32 = diff --git a/media/libaom/src/av1/common/x86/wiener_convolve_avx2.c b/media/libaom/src/av1/common/x86/wiener_convolve_avx2.c index 87a6e1239..b7ac68383 100644 --- a/media/libaom/src/av1/common/x86/wiener_convolve_avx2.c +++ b/media/libaom/src/av1/common/x86/wiener_convolve_avx2.c @@ -17,6 +17,7 @@ #include "av1/common/convolve.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/aom_filter.h" +#include "aom_dsp/x86/convolve_avx2.h" #include "aom_dsp/x86/synonyms.h" #include "aom_dsp/x86/synonyms_avx2.h" @@ -25,6 +26,20 @@ // on the left. // A row of, say, 8-bit pixels with values p0, p1, p2, ..., p30, p31 will be // loaded and stored as [ p31 ... p17 p16 ][ p15 ... 
p1 p0 ]. + +// Exploiting the range of wiener filter coefficients, +// horizontal filtering can be done in 16 bit intermediate precision. +// The details are as follows : +// Consider the horizontal wiener filter coefficients of the following form : +// [C0, C1, C2, 2^(FILTER_BITS) -2 * (C0 + C1 + C2), C2, C1, C0] +// Subtracting 2^(FILTER_BITS) from the centre tap we get the following : +// [C0, C1, C2, -2 * (C0 + C1 + C2), C2, C1, C0] +// The sum of the product "C0 * p0 + C1 * p1 + C2 * p2 -2 * (C0 + C1 + C2) * p3 +// + C2 * p4 + C1 * p5 + C0 * p6" would be in the range of signed 16 bit +// precision. Finally, after rounding the above result by round_0, we multiply +// the centre pixel by 2^(FILTER_BITS - round_0) and add it to get the +// horizontal filter output. + void av1_wiener_convolve_add_src_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, @@ -37,224 +52,190 @@ void av1_wiener_convolve_add_src_avx2(const uint8_t *src, ptrdiff_t src_stride, (void)x_step_q4; (void)y_step_q4; - DECLARE_ALIGNED(32, uint16_t, - temp[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]); - int intermediate_height = h + SUBPEL_TAPS - 2; - memset(temp + (intermediate_height * MAX_SB_SIZE), 0, MAX_SB_SIZE); - const int center_tap = ((SUBPEL_TAPS - 1) / 2); + DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + SUBPEL_TAPS) * 8]); + int im_h = h + SUBPEL_TAPS - 2; + int im_stride = 8; + memset(im_block + (im_h * im_stride), 0, MAX_SB_SIZE); + int i, j; + const int center_tap = (SUBPEL_TAPS - 1) / 2; const uint8_t *const src_ptr = src - center_tap * src_stride - center_tap; - const __m128i zero_128 = _mm_setzero_si128(); - const __m256i zero_256 = _mm256_setzero_si256(); - - // Add an offset to account for the "add_src" part of the convolve function. 
- const __m128i offset = _mm_insert_epi16(zero_128, 1 << FILTER_BITS, 3); - - const __m256i clamp_low = zero_256; + __m256i filt[4], coeffs_h[4], coeffs_v[4], filt_center; + + assert(conv_params->round_0 > 0); + + filt[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2); + filt[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2); + filt[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2); + filt[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2); + + filt_center = _mm256_load_si256((__m256i const *)filt_center_global_avx2); + + const __m128i coeffs_x = _mm_loadu_si128((__m128i *)filter_x); + const __m256i filter_coeffs_x = _mm256_broadcastsi128_si256(coeffs_x); + + // coeffs 0 1 0 1 0 1 0 1 + coeffs_h[0] = + _mm256_shuffle_epi8(filter_coeffs_x, _mm256_set1_epi16(0x0200u)); + // coeffs 2 3 2 3 2 3 2 3 + coeffs_h[1] = + _mm256_shuffle_epi8(filter_coeffs_x, _mm256_set1_epi16(0x0604u)); + // coeffs 4 5 4 5 4 5 4 5 + coeffs_h[2] = + _mm256_shuffle_epi8(filter_coeffs_x, _mm256_set1_epi16(0x0a08u)); + // coeffs 6 7 6 7 6 7 6 7 + coeffs_h[3] = + _mm256_shuffle_epi8(filter_coeffs_x, _mm256_set1_epi16(0x0e0cu)); + + const __m256i round_const_h = + _mm256_set1_epi16((1 << (conv_params->round_0 - 1))); + const __m256i round_const_horz = + _mm256_set1_epi16((1 << (bd + FILTER_BITS - conv_params->round_0 - 1))); + const __m256i clamp_low = _mm256_setzero_si256(); const __m256i clamp_high = _mm256_set1_epi16(WIENER_CLAMP_LIMIT(conv_params->round_0, bd) - 1); + const __m128i round_shift_h = _mm_cvtsi32_si128(conv_params->round_0); - /* Horizontal filter */ - { - // coeffs [ f7 f6 f5 f4 f3 f2 f1 f0 ] - const __m128i coeffs_x = _mm_add_epi16(xx_loadu_128(filter_x), offset); - - // coeffs [ f3 f2 f3 f2 f1 f0 f1 f0 ] - const __m128i coeffs_0123 = _mm_unpacklo_epi32(coeffs_x, coeffs_x); - // coeffs [ f7 f6 f7 f6 f5 f4 f5 f4 ] - const __m128i coeffs_4567 = _mm_unpackhi_epi32(coeffs_x, coeffs_x); - - // coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ] - const __m128i 
coeffs_01_128 = _mm_unpacklo_epi64(coeffs_0123, coeffs_0123); - // coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ] - const __m128i coeffs_23_128 = _mm_unpackhi_epi64(coeffs_0123, coeffs_0123); - // coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ] - const __m128i coeffs_45_128 = _mm_unpacklo_epi64(coeffs_4567, coeffs_4567); - // coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ] - const __m128i coeffs_67_128 = _mm_unpackhi_epi64(coeffs_4567, coeffs_4567); - - // coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ][ f1 f0 f1 f0 f1 f0 f1 f0 ] - const __m256i coeffs_01 = yy_set_m128i(coeffs_01_128, coeffs_01_128); - // coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ][ f3 f2 f3 f2 f3 f2 f3 f2 ] - const __m256i coeffs_23 = yy_set_m128i(coeffs_23_128, coeffs_23_128); - // coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ][ f5 f4 f5 f4 f5 f4 f5 f4 ] - const __m256i coeffs_45 = yy_set_m128i(coeffs_45_128, coeffs_45_128); - // coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ][ f7 f6 f7 f6 f7 f6 f7 f6 ] - const __m256i coeffs_67 = yy_set_m128i(coeffs_67_128, coeffs_67_128); - - const __m256i round_const = _mm256_set1_epi32( - (1 << (conv_params->round_0 - 1)) + (1 << (bd + FILTER_BITS - 1))); - - for (int i = 0; i < intermediate_height; ++i) { - for (int j = 0; j < w; j += 16) { - const uint8_t *data_ij = src_ptr + i * src_stride + j; - - // Load 8-bit src data - const __m128i data_0 = xx_loadu_128(data_ij + 0); - const __m128i data_1 = xx_loadu_128(data_ij + 1); - const __m128i data_2 = xx_loadu_128(data_ij + 2); - const __m128i data_3 = xx_loadu_128(data_ij + 3); - const __m128i data_4 = xx_loadu_128(data_ij + 4); - const __m128i data_5 = xx_loadu_128(data_ij + 5); - const __m128i data_6 = xx_loadu_128(data_ij + 6); - const __m128i data_7 = xx_loadu_128(data_ij + 7); - - // (Zero-)Extend 8-bit data to 16-bit data - const __m256i src_0 = _mm256_cvtepu8_epi16(data_0); - const __m256i src_1 = _mm256_cvtepu8_epi16(data_1); - const __m256i src_2 = _mm256_cvtepu8_epi16(data_2); - const __m256i src_3 = _mm256_cvtepu8_epi16(data_3); - const __m256i src_4 = _mm256_cvtepu8_epi16(data_4); 
- const __m256i src_5 = _mm256_cvtepu8_epi16(data_5); - const __m256i src_6 = _mm256_cvtepu8_epi16(data_6); - const __m256i src_7 = _mm256_cvtepu8_epi16(data_7); - - // Multiply src data by filter coeffs and sum pairs - const __m256i res_0 = _mm256_madd_epi16(src_0, coeffs_01); - const __m256i res_1 = _mm256_madd_epi16(src_1, coeffs_01); - const __m256i res_2 = _mm256_madd_epi16(src_2, coeffs_23); - const __m256i res_3 = _mm256_madd_epi16(src_3, coeffs_23); - const __m256i res_4 = _mm256_madd_epi16(src_4, coeffs_45); - const __m256i res_5 = _mm256_madd_epi16(src_5, coeffs_45); - const __m256i res_6 = _mm256_madd_epi16(src_6, coeffs_67); - const __m256i res_7 = _mm256_madd_epi16(src_7, coeffs_67); - - // Calculate scalar product for even- and odd-indices separately, - // increasing to 32-bit precision - const __m256i res_even_sum = _mm256_add_epi32( - _mm256_add_epi32(res_0, res_4), _mm256_add_epi32(res_2, res_6)); - const __m256i res_odd_sum = _mm256_add_epi32( - _mm256_add_epi32(res_1, res_5), _mm256_add_epi32(res_3, res_7)); - - const __m256i res_even = _mm256_srai_epi32( - _mm256_add_epi32(res_even_sum, round_const), conv_params->round_0); - const __m256i res_odd = _mm256_srai_epi32( - _mm256_add_epi32(res_odd_sum, round_const), conv_params->round_0); - - // Reduce to 16-bit precision and pack even- and odd-index results - // back into one register. The _mm256_packs_epi32 intrinsic returns - // a register with the pixels ordered as follows: - // [ 15 13 11 9 14 12 10 8 ] [ 7 5 3 1 6 4 2 0 ] - const __m256i res = _mm256_packs_epi32(res_even, res_odd); - const __m256i res_clamped = - _mm256_min_epi16(_mm256_max_epi16(res, clamp_low), clamp_high); - - // Store in a temporary array - yy_storeu_256(temp + i * MAX_SB_SIZE + j, res_clamped); - } + // Add an offset to account for the "add_src" part of the convolve function. 
+ const __m128i zero_128 = _mm_setzero_si128(); + const __m128i offset_0 = _mm_insert_epi16(zero_128, 1 << FILTER_BITS, 3); + const __m128i coeffs_y = _mm_add_epi16(xx_loadu_128(filter_y), offset_0); + + const __m256i filter_coeffs_y = _mm256_broadcastsi128_si256(coeffs_y); + + // coeffs 0 1 0 1 0 1 0 1 + coeffs_v[0] = _mm256_shuffle_epi32(filter_coeffs_y, 0x00); + // coeffs 2 3 2 3 2 3 2 3 + coeffs_v[1] = _mm256_shuffle_epi32(filter_coeffs_y, 0x55); + // coeffs 4 5 4 5 4 5 4 5 + coeffs_v[2] = _mm256_shuffle_epi32(filter_coeffs_y, 0xaa); + // coeffs 6 7 6 7 6 7 6 7 + coeffs_v[3] = _mm256_shuffle_epi32(filter_coeffs_y, 0xff); + + const __m256i round_const_v = + _mm256_set1_epi32((1 << (conv_params->round_1 - 1)) - + (1 << (bd + conv_params->round_1 - 1))); + const __m128i round_shift_v = _mm_cvtsi32_si128(conv_params->round_1); + + for (j = 0; j < w; j += 8) { + for (i = 0; i < im_h; i += 2) { + __m256i data = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j])); + + // Load the next line + if (i + 1 < im_h) + data = _mm256_inserti128_si256( + data, + _mm_loadu_si128( + (__m128i *)&src_ptr[(i * src_stride) + j + src_stride]), + 1); + + __m256i res = convolve_lowbd_x(data, coeffs_h, filt); + + res = + _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h); + + __m256i data_0 = _mm256_shuffle_epi8(data, filt_center); + + // multiply the center pixel by 2^(FILTER_BITS - round_0) and add it to + // the result + data_0 = _mm256_slli_epi16(data_0, FILTER_BITS - conv_params->round_0); + res = _mm256_add_epi16(res, data_0); + res = _mm256_add_epi16(res, round_const_horz); + const __m256i res_clamped = + _mm256_min_epi16(_mm256_max_epi16(res, clamp_low), clamp_high); + _mm256_store_si256((__m256i *)&im_block[i * im_stride], res_clamped); } - } - /* Vertical filter */ - { - // coeffs [ g7 g6 g5 g4 g3 g2 g1 g0 ] - const __m128i coeffs_y = _mm_add_epi16(xx_loadu_128(filter_y), offset); - - // coeffs [ g3 g2 g3 g2 g1 g0 g1 g0 ] - 
const __m128i coeffs_0123 = _mm_unpacklo_epi32(coeffs_y, coeffs_y); - // coeffs [ g7 g6 g7 g6 g5 g4 g5 g4 ] - const __m128i coeffs_4567 = _mm_unpackhi_epi32(coeffs_y, coeffs_y); - - // coeffs [ g1 g0 g1 g0 g1 g0 g1 g0 ] - const __m128i coeffs_01_128 = _mm_unpacklo_epi64(coeffs_0123, coeffs_0123); - // coeffs [ g3 g2 g3 g2 g3 g2 g3 g2 ] - const __m128i coeffs_23_128 = _mm_unpackhi_epi64(coeffs_0123, coeffs_0123); - // coeffs [ g5 g4 g5 g4 g5 g4 g5 g4 ] - const __m128i coeffs_45_128 = _mm_unpacklo_epi64(coeffs_4567, coeffs_4567); - // coeffs [ g7 g6 g7 g6 g7 g6 g7 g6 ] - const __m128i coeffs_67_128 = _mm_unpackhi_epi64(coeffs_4567, coeffs_4567); - - // coeffs [ g1 g0 g1 g0 g1 g0 g1 g0 ][ g1 g0 g1 g0 g1 g0 g1 g0 ] - const __m256i coeffs_01 = yy_set_m128i(coeffs_01_128, coeffs_01_128); - // coeffs [ g3 g2 g3 g2 g3 g2 g3 g2 ][ g3 g2 g3 g2 g3 g2 g3 g2 ] - const __m256i coeffs_23 = yy_set_m128i(coeffs_23_128, coeffs_23_128); - // coeffs [ g5 g4 g5 g4 g5 g4 g5 g4 ][ g5 g4 g5 g4 g5 g4 g5 g4 ] - const __m256i coeffs_45 = yy_set_m128i(coeffs_45_128, coeffs_45_128); - // coeffs [ g7 g6 g7 g6 g7 g6 g7 g6 ][ g7 g6 g7 g6 g7 g6 g7 g6 ] - const __m256i coeffs_67 = yy_set_m128i(coeffs_67_128, coeffs_67_128); - - const __m256i round_const = - _mm256_set1_epi32((1 << (conv_params->round_1 - 1)) - - (1 << (bd + conv_params->round_1 - 1))); - - for (int i = 0; i < h; ++i) { - for (int j = 0; j < w; j += 16) { - const uint16_t *data_ij = temp + i * MAX_SB_SIZE + j; - - // Load 16-bit data from the output of the horizontal filter in - // which the pixels are ordered as follows: - // [ 15 13 11 9 14 12 10 8 ] [ 7 5 3 1 6 4 2 0 ] - const __m256i data_0 = yy_loadu_256(data_ij + 0 * MAX_SB_SIZE); - const __m256i data_1 = yy_loadu_256(data_ij + 1 * MAX_SB_SIZE); - const __m256i data_2 = yy_loadu_256(data_ij + 2 * MAX_SB_SIZE); - const __m256i data_3 = yy_loadu_256(data_ij + 3 * MAX_SB_SIZE); - const __m256i data_4 = yy_loadu_256(data_ij + 4 * MAX_SB_SIZE); - const __m256i data_5 = 
yy_loadu_256(data_ij + 5 * MAX_SB_SIZE); - const __m256i data_6 = yy_loadu_256(data_ij + 6 * MAX_SB_SIZE); - const __m256i data_7 = yy_loadu_256(data_ij + 7 * MAX_SB_SIZE); - - // Filter the even-indices, increasing to 32-bit precision - const __m256i src_0 = _mm256_unpacklo_epi16(data_0, data_1); - const __m256i src_2 = _mm256_unpacklo_epi16(data_2, data_3); - const __m256i src_4 = _mm256_unpacklo_epi16(data_4, data_5); - const __m256i src_6 = _mm256_unpacklo_epi16(data_6, data_7); - - const __m256i res_0 = _mm256_madd_epi16(src_0, coeffs_01); - const __m256i res_2 = _mm256_madd_epi16(src_2, coeffs_23); - const __m256i res_4 = _mm256_madd_epi16(src_4, coeffs_45); - const __m256i res_6 = _mm256_madd_epi16(src_6, coeffs_67); - - const __m256i res_even = _mm256_add_epi32( - _mm256_add_epi32(res_0, res_2), _mm256_add_epi32(res_4, res_6)); - - // Filter the odd-indices, increasing to 32-bit precision - const __m256i src_1 = _mm256_unpackhi_epi16(data_0, data_1); - const __m256i src_3 = _mm256_unpackhi_epi16(data_2, data_3); - const __m256i src_5 = _mm256_unpackhi_epi16(data_4, data_5); - const __m256i src_7 = _mm256_unpackhi_epi16(data_6, data_7); - - const __m256i res_1 = _mm256_madd_epi16(src_1, coeffs_01); - const __m256i res_3 = _mm256_madd_epi16(src_3, coeffs_23); - const __m256i res_5 = _mm256_madd_epi16(src_5, coeffs_45); - const __m256i res_7 = _mm256_madd_epi16(src_7, coeffs_67); - - const __m256i res_odd = _mm256_add_epi32( - _mm256_add_epi32(res_1, res_3), _mm256_add_epi32(res_5, res_7)); - - // Pixels are currently in the following order: - // res_even order: [ 14 12 10 8 ] [ 6 4 2 0 ] - // res_odd order: [ 15 13 11 9 ] [ 7 5 3 1 ] - // - // Rearrange the pixels into the following order: - // res_lo order: [ 11 10 9 8 ] [ 3 2 1 0 ] - // res_hi order: [ 15 14 13 12 ] [ 7 6 5 4 ] - const __m256i res_lo = _mm256_unpacklo_epi32(res_even, res_odd); - const __m256i res_hi = _mm256_unpackhi_epi32(res_even, res_odd); - - const __m256i res_lo_round = 
_mm256_srai_epi32( - _mm256_add_epi32(res_lo, round_const), conv_params->round_1); - const __m256i res_hi_round = _mm256_srai_epi32( - _mm256_add_epi32(res_hi, round_const), conv_params->round_1); - - // Reduce to 16-bit precision and pack into the correct order: - // [ 15 14 13 12 11 10 9 8 ][ 7 6 5 4 3 2 1 0 ] - const __m256i res_16bit = - _mm256_packs_epi32(res_lo_round, res_hi_round); - - // Reduce to 8-bit precision. This messes up the order: - // [ - - - - - - - - 15 14 13 12 11 10 9 8 ] - // [ - - - - - - - - 7 6 5 4 3 2 1 0 ] - const __m256i res_8bit = - _mm256_packus_epi16(res_16bit, zero_256 /* don't care value */); - - // Swap the two central 32-bit values to get the order: - // [ - - - - - - - - - - - - - - - - ] - // [ 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 ] - const __m256i res_8bit2 = _mm256_permute4x64_epi64(res_8bit, 0xd8); - - // Store the lower 128-bit lane in the dst array - xx_storeu_128(dst + i * dst_stride + j, - _mm256_castsi256_si128(res_8bit2)); + /* Vertical filter */ + { + __m256i src_0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride)); + __m256i src_1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride)); + __m256i src_2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride)); + __m256i src_3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride)); + __m256i src_4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride)); + __m256i src_5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride)); + + __m256i s[8]; + s[0] = _mm256_unpacklo_epi16(src_0, src_1); + s[1] = _mm256_unpacklo_epi16(src_2, src_3); + s[2] = _mm256_unpacklo_epi16(src_4, src_5); + + s[4] = _mm256_unpackhi_epi16(src_0, src_1); + s[5] = _mm256_unpackhi_epi16(src_2, src_3); + s[6] = _mm256_unpackhi_epi16(src_4, src_5); + + for (i = 0; i < h - 1; i += 2) { + const int16_t *data = &im_block[i * im_stride]; + + const __m256i s6 = + _mm256_loadu_si256((__m256i *)(data + 6 * im_stride)); + const __m256i s7 = + _mm256_loadu_si256((__m256i *)(data + 
7 * im_stride)); + + s[3] = _mm256_unpacklo_epi16(s6, s7); + s[7] = _mm256_unpackhi_epi16(s6, s7); + + __m256i res_a = convolve(s, coeffs_v); + __m256i res_b = convolve(s + 4, coeffs_v); + + const __m256i res_a_round = _mm256_sra_epi32( + _mm256_add_epi32(res_a, round_const_v), round_shift_v); + const __m256i res_b_round = _mm256_sra_epi32( + _mm256_add_epi32(res_b, round_const_v), round_shift_v); + + /* rounding code */ + // 16 bit conversion + const __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round); + // 8 bit conversion and saturation to uint8 + const __m256i res_8b = _mm256_packus_epi16(res_16bit, res_16bit); + + const __m128i res_0 = _mm256_castsi256_si128(res_8b); + const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1); + + // Store values into the destination buffer + __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j]; + __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + j + dst_stride]; + + _mm_storel_epi64(p_0, res_0); + _mm_storel_epi64(p_1, res_1); + + s[0] = s[1]; + s[1] = s[2]; + s[2] = s[3]; + + s[4] = s[5]; + s[5] = s[6]; + s[6] = s[7]; + } + if (h - i) { + s[0] = _mm256_permute2x128_si256(s[0], s[4], 0x20); + s[1] = _mm256_permute2x128_si256(s[1], s[5], 0x20); + s[2] = _mm256_permute2x128_si256(s[2], s[6], 0x20); + + const int16_t *data = &im_block[i * im_stride]; + const __m128i s6_ = _mm_loadu_si128((__m128i *)(data + 6 * im_stride)); + const __m128i s7_ = _mm_loadu_si128((__m128i *)(data + 7 * im_stride)); + + __m128i s3 = _mm_unpacklo_epi16(s6_, s7_); + __m128i s7 = _mm_unpackhi_epi16(s6_, s7_); + + s[3] = _mm256_inserti128_si256(_mm256_castsi128_si256(s3), s7, 1); + __m256i convolveres = convolve(s, coeffs_v); + + const __m256i res_round = _mm256_sra_epi32( + _mm256_add_epi32(convolveres, round_const_v), round_shift_v); + + /* rounding code */ + // 16 bit conversion + __m128i reslo = _mm256_castsi256_si128(res_round); + __m128i reshi = _mm256_extracti128_si256(res_round, 1); + const __m128i res_16bit = 
_mm_packus_epi32(reslo, reshi); + + // 8 bit conversion and saturation to uint8 + const __m128i res_8b = _mm_packus_epi16(res_16bit, res_16bit); + __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j]; + _mm_storel_epi64(p_0, res_8b); } } } |