author     Moonchild <moonchild@palemoon.org>  2021-03-03 18:48:48 +0000
committer  Moonchild <moonchild@palemoon.org>  2021-03-04 00:03:46 +0000
commit     44d2b4a86e3d862eb1b68db3d9a29b9dbf3da746 (patch)
tree       9d9cc4d21c93ae3e1a88ab5c160c3be5f6af0ca9 /media/libaom/src/av1/common
parent     353943d1a48086a39ff5f4365b22f8f058d5f66e (diff)
download   aura-central-44d2b4a86e3d862eb1b68db3d9a29b9dbf3da746.tar.gz
Issue mcp-graveyard/UXP%1737 - Import libaom 2.0.2 source
Diffstat (limited to 'media/libaom/src/av1/common')
-rw-r--r--  media/libaom/src/av1/common/alloccommon.c | 253
-rw-r--r--  media/libaom/src/av1/common/alloccommon.h | 20
-rw-r--r--  media/libaom/src/av1/common/arm/av1_inv_txfm_neon.c | 2628
-rw-r--r--  media/libaom/src/av1/common/arm/av1_txfm_neon.c | 2
-rw-r--r--  media/libaom/src/av1/common/arm/cfl_neon.c | 4
-rw-r--r--  media/libaom/src/av1/common/arm/convolve_neon.c | 320
-rw-r--r--  media/libaom/src/av1/common/arm/convolve_neon.h | 4
-rw-r--r--  media/libaom/src/av1/common/arm/jnt_convolve_neon.c | 185
-rw-r--r--  media/libaom/src/av1/common/arm/mem_neon.h | 45
-rw-r--r--  media/libaom/src/av1/common/arm/selfguided_neon.c | 140
-rw-r--r--  media/libaom/src/av1/common/arm/transpose_neon.h | 75
-rw-r--r--  media/libaom/src/av1/common/arm/warp_plane_neon.c | 38
-rw-r--r--  media/libaom/src/av1/common/av1_common_int.h | 1557
-rw-r--r--  media/libaom/src/av1/common/av1_inv_txfm1d.c | 37
-rw-r--r--  media/libaom/src/av1/common/av1_inv_txfm1d.h | 32
-rw-r--r--  media/libaom/src/av1/common/av1_inv_txfm1d_cfg.h | 8
-rw-r--r--  media/libaom/src/av1/common/av1_inv_txfm2d.c | 35
-rw-r--r--  media/libaom/src/av1/common/av1_loopfilter.c | 1955
-rw-r--r--  media/libaom/src/av1/common/av1_loopfilter.h | 99
-rw-r--r--  media/libaom/src/av1/common/av1_rtcd_defs.pl | 304
-rw-r--r--  media/libaom/src/av1/common/av1_txfm.c | 3
-rw-r--r--  media/libaom/src/av1/common/av1_txfm.h | 6
-rw-r--r--  media/libaom/src/av1/common/blockd.c | 70
-rw-r--r--  media/libaom/src/av1/common/blockd.h | 598
-rw-r--r--  media/libaom/src/av1/common/cdef.c | 135
-rw-r--r--  media/libaom/src/av1/common/cdef.h | 11
-rw-r--r--  media/libaom/src/av1/common/cdef_block.c | 74
-rw-r--r--  media/libaom/src/av1/common/cdef_block.h | 17
-rw-r--r--  media/libaom/src/av1/common/cdef_block_simd.h | 51
-rw-r--r--  media/libaom/src/av1/common/cfl.c | 108
-rw-r--r--  media/libaom/src/av1/common/cfl.h | 284
-rw-r--r--  media/libaom/src/av1/common/common_data.h | 8
-rw-r--r--  media/libaom/src/av1/common/convolve.c | 493
-rw-r--r--  media/libaom/src/av1/common/convolve.h | 43
-rw-r--r--  media/libaom/src/av1/common/debugmodes.c | 50
-rw-r--r--  media/libaom/src/av1/common/entropy.c | 13
-rw-r--r--  media/libaom/src/av1/common/entropy.h | 6
-rw-r--r--  media/libaom/src/av1/common/entropymode.c | 162
-rw-r--r--  media/libaom/src/av1/common/entropymode.h | 4
-rw-r--r--  media/libaom/src/av1/common/entropymv.c | 2
-rw-r--r--  media/libaom/src/av1/common/entropymv.h | 12
-rw-r--r--  media/libaom/src/av1/common/enums.h | 317
-rw-r--r--  media/libaom/src/av1/common/filter.h | 123
-rw-r--r--  media/libaom/src/av1/common/frame_buffers.c | 13
-rw-r--r--  media/libaom/src/av1/common/idct.c | 74
-rw-r--r--  media/libaom/src/av1/common/idct.h | 16
-rw-r--r--  media/libaom/src/av1/common/loopfiltermask.c | 1458
-rw-r--r--  media/libaom/src/av1/common/mv.h | 75
-rw-r--r--  media/libaom/src/av1/common/mvref_common.c | 968
-rw-r--r--  media/libaom/src/av1/common/mvref_common.h | 108
-rw-r--r--  media/libaom/src/av1/common/obmc.h | 32
-rw-r--r--  media/libaom/src/av1/common/obu_util.c | 23
-rw-r--r--  media/libaom/src/av1/common/onyxc_int.h | 1342
-rw-r--r--  media/libaom/src/av1/common/ppc/cfl_ppc.c | 40
-rw-r--r--  media/libaom/src/av1/common/pred_common.h | 74
-rw-r--r--  media/libaom/src/av1/common/quant_common.c | 977
-rw-r--r--  media/libaom/src/av1/common/quant_common.h | 34
-rw-r--r--  media/libaom/src/av1/common/reconinter.c | 954
-rw-r--r--  media/libaom/src/av1/common/reconinter.h | 235
-rw-r--r--  media/libaom/src/av1/common/reconintra.c | 234
-rw-r--r--  media/libaom/src/av1/common/reconintra.h | 54
-rw-r--r--  media/libaom/src/av1/common/resize.c | 299
-rw-r--r--  media/libaom/src/av1/common/resize.h | 5
-rw-r--r--  media/libaom/src/av1/common/restoration.c | 142
-rw-r--r--  media/libaom/src/av1/common/restoration.h | 15
-rw-r--r--  media/libaom/src/av1/common/scale.c | 52
-rw-r--r--  media/libaom/src/av1/common/scale.h | 2
-rw-r--r--  media/libaom/src/av1/common/scan.c | 2315
-rw-r--r--  media/libaom/src/av1/common/scan.h | 8
-rw-r--r--  media/libaom/src/av1/common/seg_common.c | 17
-rw-r--r--  media/libaom/src/av1/common/seg_common.h | 6
-rw-r--r--  media/libaom/src/av1/common/thread_common.c | 182
-rw-r--r--  media/libaom/src/av1/common/thread_common.h | 5
-rw-r--r--  media/libaom/src/av1/common/tile_common.c | 196
-rw-r--r--  media/libaom/src/av1/common/tile_common.h | 21
-rw-r--r--  media/libaom/src/av1/common/timing.c | 37
-rw-r--r--  media/libaom/src/av1/common/timing.h | 14
-rw-r--r--  media/libaom/src/av1/common/token_cdfs.h | 3362
-rw-r--r--  media/libaom/src/av1/common/txb_common.c | 25
-rw-r--r--  media/libaom/src/av1/common/txb_common.h | 46
-rw-r--r--  media/libaom/src/av1/common/warped_motion.c | 323
-rw-r--r--  media/libaom/src/av1/common/warped_motion.h | 115
-rw-r--r--  media/libaom/src/av1/common/x86/av1_convolve_scale_sse4.c | 15
-rw-r--r--  media/libaom/src/av1/common/x86/av1_highbd_convolve_sse4.c | 205
-rw-r--r--  media/libaom/src/av1/common/x86/av1_inv_txfm_avx2.c | 96
-rw-r--r--  media/libaom/src/av1/common/x86/av1_inv_txfm_ssse3.c | 267
-rw-r--r--  media/libaom/src/av1/common/x86/av1_inv_txfm_ssse3.h | 4
-rw-r--r--  media/libaom/src/av1/common/x86/av1_txfm_sse4.c | 2
-rw-r--r--  media/libaom/src/av1/common/x86/cfl_avx2.c | 180
-rw-r--r--  media/libaom/src/av1/common/x86/cfl_simd.h | 389
-rw-r--r--  media/libaom/src/av1/common/x86/cfl_ssse3.c | 4
-rw-r--r--  media/libaom/src/av1/common/x86/convolve_2d_avx2.c | 148
-rw-r--r--  media/libaom/src/av1/common/x86/convolve_2d_sse2.c | 49
-rw-r--r--  media/libaom/src/av1/common/x86/convolve_avx2.c | 524
-rw-r--r--  media/libaom/src/av1/common/x86/convolve_sse2.c | 18
-rw-r--r--  media/libaom/src/av1/common/x86/filterintra_sse4.c | 4
-rw-r--r--  media/libaom/src/av1/common/x86/highbd_convolve_2d_avx2.c | 20
-rw-r--r--  media/libaom/src/av1/common/x86/highbd_convolve_2d_sse2.c | 8
-rw-r--r--  media/libaom/src/av1/common/x86/highbd_convolve_2d_sse4.c | 53
-rw-r--r--  media/libaom/src/av1/common/x86/highbd_convolve_2d_ssse3.c | 8
-rw-r--r--  media/libaom/src/av1/common/x86/highbd_inv_txfm_avx2.c | 3403
-rw-r--r--  media/libaom/src/av1/common/x86/highbd_inv_txfm_sse4.c | 3453
-rw-r--r--  media/libaom/src/av1/common/x86/highbd_jnt_convolve_avx2.c | 115
-rw-r--r--  media/libaom/src/av1/common/x86/highbd_jnt_convolve_sse4.c | 54
-rw-r--r--  media/libaom/src/av1/common/x86/highbd_txfm_utility_sse4.h | 21
-rw-r--r--  media/libaom/src/av1/common/x86/highbd_warp_plane_sse4.c | 80
-rw-r--r--  media/libaom/src/av1/common/x86/intra_edge_sse4.c | 8
-rw-r--r--  media/libaom/src/av1/common/x86/jnt_convolve_avx2.c | 794
-rw-r--r--  media/libaom/src/av1/common/x86/jnt_convolve_sse2.c | 278
-rw-r--r--  media/libaom/src/av1/common/x86/jnt_convolve_ssse3.c | 19
-rw-r--r--  media/libaom/src/av1/common/x86/reconinter_avx2.c | 48
-rw-r--r--  media/libaom/src/av1/common/x86/selfguided_avx2.c | 28
-rw-r--r--  media/libaom/src/av1/common/x86/selfguided_sse4.c | 42
-rw-r--r--  media/libaom/src/av1/common/x86/warp_plane_avx2.c | 1318
-rw-r--r--  media/libaom/src/av1/common/x86/warp_plane_sse2.c | 88
-rw-r--r--  media/libaom/src/av1/common/x86/warp_plane_sse4.c | 167
-rw-r--r--  media/libaom/src/av1/common/x86/wiener_convolve_avx2.c | 403
117 files changed, 20997 insertions, 15545 deletions
diff --git a/media/libaom/src/av1/common/alloccommon.c b/media/libaom/src/av1/common/alloccommon.c
index 1bf81c91d..badee3df9 100644
--- a/media/libaom/src/av1/common/alloccommon.c
+++ b/media/libaom/src/av1/common/alloccommon.c
@@ -15,10 +15,10 @@
#include "aom_mem/aom_mem.h"
#include "av1/common/alloccommon.h"
+#include "av1/common/av1_common_int.h"
#include "av1/common/blockd.h"
#include "av1/common/entropymode.h"
#include "av1/common/entropymv.h"
-#include "av1/common/onyxc_int.h"
int av1_get_MBs(int width, int height) {
const int aligned_width = ALIGN_POWER_OF_TWO(width, 3);
@@ -31,60 +31,6 @@ int av1_get_MBs(int width, int height) {
return mb_rows * mb_cols;
}
-#if LOOP_FILTER_BITMASK
-static int alloc_loop_filter_mask(AV1_COMMON *cm) {
- aom_free(cm->lf.lfm);
- cm->lf.lfm = NULL;
-
- // Each lfm holds bit masks for all the 4x4 blocks in a max
- // 64x64 (128x128 for ext_partitions) region. The stride
- // and rows are rounded up / truncated to a multiple of 16
- // (32 for ext_partition).
- cm->lf.lfm_stride = (cm->mi_cols + (MI_SIZE_64X64 - 1)) >> MIN_MIB_SIZE_LOG2;
- cm->lf.lfm_num = ((cm->mi_rows + (MI_SIZE_64X64 - 1)) >> MIN_MIB_SIZE_LOG2) *
- cm->lf.lfm_stride;
- cm->lf.lfm =
- (LoopFilterMask *)aom_calloc(cm->lf.lfm_num, sizeof(*cm->lf.lfm));
- if (!cm->lf.lfm) return 1;
-
- unsigned int i;
- for (i = 0; i < cm->lf.lfm_num; ++i) av1_zero(cm->lf.lfm[i]);
-
- return 0;
-}
-
-static void free_loop_filter_mask(AV1_COMMON *cm) {
- if (cm->lf.lfm == NULL) return;
-
- aom_free(cm->lf.lfm);
- cm->lf.lfm = NULL;
- cm->lf.lfm_num = 0;
- cm->lf.lfm_stride = 0;
-}
-#endif
-
-void av1_set_mb_mi(AV1_COMMON *cm, int width, int height) {
- // Ensure that the decoded width and height are both multiples of
- // 8 luma pixels (note: this may only be a multiple of 4 chroma pixels if
- // subsampling is used).
- // This simplifies the implementation of various experiments,
- // eg. cdef, which operates on units of 8x8 luma pixels.
- const int aligned_width = ALIGN_POWER_OF_TWO(width, 3);
- const int aligned_height = ALIGN_POWER_OF_TWO(height, 3);
-
- cm->mi_cols = aligned_width >> MI_SIZE_LOG2;
- cm->mi_rows = aligned_height >> MI_SIZE_LOG2;
- cm->mi_stride = calc_mi_size(cm->mi_cols);
-
- cm->mb_cols = (cm->mi_cols + 2) >> 2;
- cm->mb_rows = (cm->mi_rows + 2) >> 2;
- cm->MBs = cm->mb_rows * cm->mb_cols;
-
-#if LOOP_FILTER_BITMASK
- alloc_loop_filter_mask(cm);
-#endif
-}
-
void av1_free_ref_frame_buffers(BufferPool *pool) {
int i;
@@ -92,6 +38,9 @@ void av1_free_ref_frame_buffers(BufferPool *pool) {
if (pool->frame_bufs[i].ref_count > 0 &&
pool->frame_bufs[i].raw_frame_buffer.data != NULL) {
pool->release_fb_cb(pool->cb_priv, &pool->frame_bufs[i].raw_frame_buffer);
+ pool->frame_bufs[i].raw_frame_buffer.data = NULL;
+ pool->frame_bufs[i].raw_frame_buffer.size = 0;
+ pool->frame_bufs[i].raw_frame_buffer.priv = NULL;
pool->frame_bufs[i].ref_count = 0;
}
aom_free(pool->frame_bufs[i].mvs);
@@ -124,20 +73,19 @@ void av1_alloc_restoration_buffers(AV1_COMMON *cm) {
// able to quickly answer the question "Where is the <n>'th stripe for tile
// row <m>?" To make that efficient, we generate the rst_last_stripe array.
int num_stripes = 0;
- for (int i = 0; i < cm->tile_rows; ++i) {
+ for (int i = 0; i < cm->tiles.rows; ++i) {
TileInfo tile_info;
av1_tile_set_row(&tile_info, cm, i);
const int mi_h = tile_info.mi_row_end - tile_info.mi_row_start;
const int ext_h = RESTORATION_UNIT_OFFSET + (mi_h << MI_SIZE_LOG2);
const int tile_stripes = (ext_h + 63) / 64;
num_stripes += tile_stripes;
- cm->rst_end_stripe[i] = num_stripes;
}
// Now we need to allocate enough space to store the line buffers for the
// stripes
const int frame_w = cm->superres_upscaled_width;
- const int use_highbd = cm->seq_params.use_highbitdepth ? 1 : 0;
+ const int use_highbd = cm->seq_params.use_highbitdepth;
for (int p = 0; p < num_planes; ++p) {
const int is_uv = p > 0;
@@ -184,106 +132,131 @@ void av1_free_restoration_buffers(AV1_COMMON *cm) {
aom_free_frame_buffer(&cm->rst_frame);
}
-void av1_free_above_context_buffers(AV1_COMMON *cm,
- int num_free_above_contexts) {
+void av1_free_above_context_buffers(CommonContexts *above_contexts) {
int i;
- const int num_planes = cm->num_allocated_above_context_planes;
+ const int num_planes = above_contexts->num_planes;
- for (int tile_row = 0; tile_row < num_free_above_contexts; tile_row++) {
+ for (int tile_row = 0; tile_row < above_contexts->num_tile_rows; tile_row++) {
for (i = 0; i < num_planes; i++) {
- aom_free(cm->above_context[i][tile_row]);
- cm->above_context[i][tile_row] = NULL;
+ aom_free(above_contexts->entropy[i][tile_row]);
+ above_contexts->entropy[i][tile_row] = NULL;
}
- aom_free(cm->above_seg_context[tile_row]);
- cm->above_seg_context[tile_row] = NULL;
+ aom_free(above_contexts->partition[tile_row]);
+ above_contexts->partition[tile_row] = NULL;
- aom_free(cm->above_txfm_context[tile_row]);
- cm->above_txfm_context[tile_row] = NULL;
+ aom_free(above_contexts->txfm[tile_row]);
+ above_contexts->txfm[tile_row] = NULL;
}
for (i = 0; i < num_planes; i++) {
- aom_free(cm->above_context[i]);
- cm->above_context[i] = NULL;
+ aom_free(above_contexts->entropy[i]);
+ above_contexts->entropy[i] = NULL;
}
- aom_free(cm->above_seg_context);
- cm->above_seg_context = NULL;
+ aom_free(above_contexts->partition);
+ above_contexts->partition = NULL;
- aom_free(cm->above_txfm_context);
- cm->above_txfm_context = NULL;
+ aom_free(above_contexts->txfm);
+ above_contexts->txfm = NULL;
- cm->num_allocated_above_contexts = 0;
- cm->num_allocated_above_context_mi_col = 0;
- cm->num_allocated_above_context_planes = 0;
+ above_contexts->num_tile_rows = 0;
+ above_contexts->num_mi_cols = 0;
+ above_contexts->num_planes = 0;
}
void av1_free_context_buffers(AV1_COMMON *cm) {
- cm->free_mi(cm);
+ cm->mi_params.free_mi(&cm->mi_params);
- av1_free_above_context_buffers(cm, cm->num_allocated_above_contexts);
+ av1_free_above_context_buffers(&cm->above_contexts);
-#if LOOP_FILTER_BITMASK
- free_loop_filter_mask(cm);
+#if CONFIG_LPF_MASK
+ av1_free_loop_filter_mask(cm);
#endif
}
-int av1_alloc_above_context_buffers(AV1_COMMON *cm,
- int num_alloc_above_contexts) {
- const int num_planes = av1_num_planes(cm);
- int plane_idx;
+int av1_alloc_above_context_buffers(CommonContexts *above_contexts,
+ int num_tile_rows, int num_mi_cols,
+ int num_planes) {
const int aligned_mi_cols =
- ALIGN_POWER_OF_TWO(cm->mi_cols, MAX_MIB_SIZE_LOG2);
+ ALIGN_POWER_OF_TWO(num_mi_cols, MAX_MIB_SIZE_LOG2);
// Allocate above context buffers
- cm->num_allocated_above_contexts = num_alloc_above_contexts;
- cm->num_allocated_above_context_mi_col = aligned_mi_cols;
- cm->num_allocated_above_context_planes = num_planes;
- for (plane_idx = 0; plane_idx < num_planes; plane_idx++) {
- cm->above_context[plane_idx] = (ENTROPY_CONTEXT **)aom_calloc(
- num_alloc_above_contexts, sizeof(cm->above_context[0]));
- if (!cm->above_context[plane_idx]) return 1;
+ above_contexts->num_tile_rows = num_tile_rows;
+ above_contexts->num_mi_cols = aligned_mi_cols;
+ above_contexts->num_planes = num_planes;
+ for (int plane_idx = 0; plane_idx < num_planes; plane_idx++) {
+ above_contexts->entropy[plane_idx] = (ENTROPY_CONTEXT **)aom_calloc(
+ num_tile_rows, sizeof(above_contexts->entropy[0]));
+ if (!above_contexts->entropy[plane_idx]) return 1;
}
- cm->above_seg_context = (PARTITION_CONTEXT **)aom_calloc(
- num_alloc_above_contexts, sizeof(cm->above_seg_context));
- if (!cm->above_seg_context) return 1;
+ above_contexts->partition = (PARTITION_CONTEXT **)aom_calloc(
+ num_tile_rows, sizeof(above_contexts->partition));
+ if (!above_contexts->partition) return 1;
- cm->above_txfm_context = (TXFM_CONTEXT **)aom_calloc(
- num_alloc_above_contexts, sizeof(cm->above_txfm_context));
- if (!cm->above_txfm_context) return 1;
+ above_contexts->txfm =
+ (TXFM_CONTEXT **)aom_calloc(num_tile_rows, sizeof(above_contexts->txfm));
+ if (!above_contexts->txfm) return 1;
- for (int tile_row = 0; tile_row < num_alloc_above_contexts; tile_row++) {
- for (plane_idx = 0; plane_idx < num_planes; plane_idx++) {
- cm->above_context[plane_idx][tile_row] = (ENTROPY_CONTEXT *)aom_calloc(
- aligned_mi_cols, sizeof(*cm->above_context[0][tile_row]));
- if (!cm->above_context[plane_idx][tile_row]) return 1;
+ for (int tile_row = 0; tile_row < num_tile_rows; tile_row++) {
+ for (int plane_idx = 0; plane_idx < num_planes; plane_idx++) {
+ above_contexts->entropy[plane_idx][tile_row] =
+ (ENTROPY_CONTEXT *)aom_calloc(
+ aligned_mi_cols, sizeof(*above_contexts->entropy[0][tile_row]));
+ if (!above_contexts->entropy[plane_idx][tile_row]) return 1;
}
- cm->above_seg_context[tile_row] = (PARTITION_CONTEXT *)aom_calloc(
- aligned_mi_cols, sizeof(*cm->above_seg_context[tile_row]));
- if (!cm->above_seg_context[tile_row]) return 1;
+ above_contexts->partition[tile_row] = (PARTITION_CONTEXT *)aom_calloc(
+ aligned_mi_cols, sizeof(*above_contexts->partition[tile_row]));
+ if (!above_contexts->partition[tile_row]) return 1;
- cm->above_txfm_context[tile_row] = (TXFM_CONTEXT *)aom_calloc(
- aligned_mi_cols, sizeof(*cm->above_txfm_context[tile_row]));
- if (!cm->above_txfm_context[tile_row]) return 1;
+ above_contexts->txfm[tile_row] = (TXFM_CONTEXT *)aom_calloc(
+ aligned_mi_cols, sizeof(*above_contexts->txfm[tile_row]));
+ if (!above_contexts->txfm[tile_row]) return 1;
}
return 0;
}
-int av1_alloc_context_buffers(AV1_COMMON *cm, int width, int height) {
- int new_mi_size;
-
- av1_set_mb_mi(cm, width, height);
- new_mi_size = cm->mi_stride * calc_mi_size(cm->mi_rows);
- if (cm->mi_alloc_size < new_mi_size) {
- cm->free_mi(cm);
- if (cm->alloc_mi(cm, new_mi_size)) goto fail;
+// Allocate the dynamically allocated arrays in 'mi_params' assuming
+// 'mi_params->set_mb_mi()' was already called earlier to initialize the rest of
+// the struct members.
+static int alloc_mi(CommonModeInfoParams *mi_params) {
+ const int aligned_mi_rows = calc_mi_size(mi_params->mi_rows);
+ const int mi_grid_size = mi_params->mi_stride * aligned_mi_rows;
+ const int alloc_size_1d = mi_size_wide[mi_params->mi_alloc_bsize];
+ const int alloc_mi_size =
+ mi_params->mi_alloc_stride * (aligned_mi_rows / alloc_size_1d);
+
+ if (mi_params->mi_alloc_size < alloc_mi_size ||
+ mi_params->mi_grid_size < mi_grid_size) {
+ mi_params->free_mi(mi_params);
+
+ mi_params->mi_alloc =
+ aom_calloc(alloc_mi_size, sizeof(*mi_params->mi_alloc));
+ if (!mi_params->mi_alloc) return 1;
+ mi_params->mi_alloc_size = alloc_mi_size;
+
+ mi_params->mi_grid_base = (MB_MODE_INFO **)aom_calloc(
+ mi_grid_size, sizeof(*mi_params->mi_grid_base));
+ if (!mi_params->mi_grid_base) return 1;
+ mi_params->mi_grid_size = mi_grid_size;
+
+ mi_params->tx_type_map =
+ aom_calloc(mi_grid_size, sizeof(*mi_params->tx_type_map));
+ if (!mi_params->tx_type_map) return 1;
}
return 0;
+}
+
+int av1_alloc_context_buffers(AV1_COMMON *cm, int width, int height) {
+ CommonModeInfoParams *const mi_params = &cm->mi_params;
+ mi_params->set_mb_mi(mi_params, width, height);
+ if (alloc_mi(mi_params)) goto fail;
+ return 0;
fail:
// clear the mi_* values to force a realloc on resync
- av1_set_mb_mi(cm, 0, 0);
+ mi_params->set_mb_mi(mi_params, 0, 0);
av1_free_context_buffers(cm);
return 1;
}
@@ -293,8 +266,44 @@ void av1_remove_common(AV1_COMMON *cm) {
aom_free(cm->fc);
cm->fc = NULL;
- aom_free(cm->frame_contexts);
- cm->frame_contexts = NULL;
+ aom_free(cm->default_frame_context);
+ cm->default_frame_context = NULL;
+}
+
+void av1_init_mi_buffers(CommonModeInfoParams *mi_params) {
+ mi_params->setup_mi(mi_params);
+}
+
+#if CONFIG_LPF_MASK
+int av1_alloc_loop_filter_mask(AV1_COMMON *cm) {
+ aom_free(cm->lf.lfm);
+ cm->lf.lfm = NULL;
+
+ // Each lfm holds bit masks for all the 4x4 blocks in a max
+ // 64x64 (128x128 for ext_partitions) region. The stride
+ // and rows are rounded up / truncated to a multiple of 16
+ // (32 for ext_partition).
+ cm->lf.lfm_stride =
+ (cm->mi_params.mi_cols + (MI_SIZE_64X64 - 1)) >> MIN_MIB_SIZE_LOG2;
+ cm->lf.lfm_num =
+ ((cm->mi_params.mi_rows + (MI_SIZE_64X64 - 1)) >> MIN_MIB_SIZE_LOG2) *
+ cm->lf.lfm_stride;
+ cm->lf.lfm =
+ (LoopFilterMask *)aom_calloc(cm->lf.lfm_num, sizeof(*cm->lf.lfm));
+ if (!cm->lf.lfm) return 1;
+
+ unsigned int i;
+ for (i = 0; i < cm->lf.lfm_num; ++i) av1_zero(cm->lf.lfm[i]);
+
+ return 0;
}
-void av1_init_context_buffers(AV1_COMMON *cm) { cm->setup_mi(cm); }
+void av1_free_loop_filter_mask(AV1_COMMON *cm) {
+ if (cm->lf.lfm == NULL) return;
+
+ aom_free(cm->lf.lfm);
+ cm->lf.lfm = NULL;
+ cm->lf.lfm_num = 0;
+ cm->lf.lfm_stride = 0;
+}
+#endif
diff --git a/media/libaom/src/av1/common/alloccommon.h b/media/libaom/src/av1/common/alloccommon.h
index 8e5896981..fe8e0c530 100644
--- a/media/libaom/src/av1/common/alloccommon.h
+++ b/media/libaom/src/av1/common/alloccommon.h
@@ -14,21 +14,25 @@
#define INVALID_IDX -1 // Invalid buffer index.
+#include "config/aom_config.h"
+
#ifdef __cplusplus
extern "C" {
#endif
struct AV1Common;
struct BufferPool;
+struct CommonContexts;
+struct CommonModeInfoParams;
void av1_remove_common(struct AV1Common *cm);
-int av1_alloc_above_context_buffers(struct AV1Common *cm,
- int num_alloc_above_contexts);
-void av1_free_above_context_buffers(struct AV1Common *cm,
- int num_free_above_contexts);
+int av1_alloc_above_context_buffers(struct CommonContexts *above_contexts,
+ int num_tile_rows, int num_mi_cols,
+ int num_planes);
+void av1_free_above_context_buffers(struct CommonContexts *above_contexts);
int av1_alloc_context_buffers(struct AV1Common *cm, int width, int height);
-void av1_init_context_buffers(struct AV1Common *cm);
+void av1_init_mi_buffers(struct CommonModeInfoParams *mi_params);
void av1_free_context_buffers(struct AV1Common *cm);
void av1_free_ref_frame_buffers(struct BufferPool *pool);
@@ -38,9 +42,13 @@ void av1_free_restoration_buffers(struct AV1Common *cm);
int av1_alloc_state_buffers(struct AV1Common *cm, int width, int height);
void av1_free_state_buffers(struct AV1Common *cm);
-void av1_set_mb_mi(struct AV1Common *cm, int width, int height);
int av1_get_MBs(int width, int height);
+#if CONFIG_LPF_MASK
+int av1_alloc_loop_filter_mask(struct AV1Common *cm);
+void av1_free_loop_filter_mask(struct AV1Common *cm);
+#endif
+
#ifdef __cplusplus
} // extern "C"
#endif
diff --git a/media/libaom/src/av1/common/arm/av1_inv_txfm_neon.c b/media/libaom/src/av1/common/arm/av1_inv_txfm_neon.c
index bad411743..2f3567aea 100644
--- a/media/libaom/src/av1/common/arm/av1_inv_txfm_neon.c
+++ b/media/libaom/src/av1/common/arm/av1_inv_txfm_neon.c
@@ -48,11 +48,11 @@ static const ITX_TYPE_1D hitx_1d_tab[TX_TYPES] = {
// 1D functions
static const transform_1d_neon lowbd_txfm_all_1d_arr[TX_SIZES][ITX_TYPES_1D] = {
- { av1_idct4_new, av1_iadst4_new, av1_iidentity4_c },
- { av1_idct8_new, av1_iadst8_new, av1_iidentity8_c },
- { av1_idct16_new, av1_iadst16_new, av1_iidentity16_c },
- { av1_idct32_new, NULL, NULL },
- { av1_idct64_new, NULL, NULL },
+ { av1_idct4, av1_iadst4, av1_iidentity4_c },
+ { av1_idct8, av1_iadst8, av1_iidentity8_c },
+ { av1_idct16, av1_iadst16, av1_iidentity16_c },
+ { av1_idct32, NULL, NULL },
+ { av1_idct64, NULL, NULL },
};
static INLINE void lowbd_add_flip_buffer_8xn_neon(int16x8_t *in,
@@ -248,31 +248,27 @@ static INLINE void btf_16_half_neon(int16x8_t *const x, const int16x4_t c) {
x[1] = vcombine_s16(v1[0], v1[1]);
}
-static INLINE int16x4_t create_s16x4_neon(int16_t *const c0, int16_t *const c1,
- int16_t *const c2,
- int16_t *const c3) {
+static INLINE int16x4_t set_s16x4_neon(const int16_t c0, const int16_t c1,
+ const int16_t c2, const int16_t c3) {
int16x4_t val = vdup_n_s16((int16_t)0);
- val = vld1_lane_s16(c0, val, 0);
- val = vld1_lane_s16(c1, val, 1);
- val = vld1_lane_s16(c2, val, 2);
- val = vld1_lane_s16(c3, val, 3);
+ val = vset_lane_s16(c0, val, 0);
+ val = vset_lane_s16(c1, val, 1);
+ val = vset_lane_s16(c2, val, 2);
+ val = vset_lane_s16(c3, val, 3);
return val;
}
-static INLINE void iadst8_new_neon(int16x8_t *const in, int16x8_t *out,
- int8_t cos_bit, int bit) {
+static INLINE void iadst8_neon(int16x8_t *const in, int16x8_t *out,
+ int8_t cos_bit, int bit) {
(void)bit;
const int32_t *cospi = cospi_arr(cos_bit);
- const int16x4_t c0 =
- create_s16x4_neon((int16_t *)(cospi + 4), (int16_t *)(cospi + 60),
- (int16_t *)(cospi + 20), (int16_t *)(cospi + 44));
- const int16x4_t c1 =
- create_s16x4_neon((int16_t *)(cospi + 36), (int16_t *)(cospi + 28),
- (int16_t *)(cospi + 52), (int16_t *)(cospi + 12));
- const int16x4_t c2 =
- create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32),
- (int16_t *)(cospi + 16), (int16_t *)(cospi + 48));
+ const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[4], (int16_t)cospi[60],
+ (int16_t)cospi[20], (int16_t)cospi[44]);
+ const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[36], (int16_t)cospi[28],
+ (int16_t)cospi[52], (int16_t)cospi[12]);
+ const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
+ (int16_t)cospi[16], (int16_t)cospi[48]);
int16x8_t x[8];
int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
@@ -327,22 +323,21 @@ static INLINE void iadst8_new_neon(int16x8_t *const in, int16x8_t *out,
// Stage 7
out[0] = x[0];
- out[1] = vnegq_s16(x[4]);
+ out[1] = vqnegq_s16(x[4]);
out[2] = x[6];
- out[3] = vnegq_s16(x[2]);
+ out[3] = vqnegq_s16(x[2]);
out[4] = x[3];
- out[5] = vnegq_s16(x[7]);
+ out[5] = vqnegq_s16(x[7]);
out[6] = x[5];
- out[7] = vnegq_s16(x[1]);
+ out[7] = vqnegq_s16(x[1]);
}
-static INLINE void iadst8_low1_new_neon(int16x8_t *const in, int16x8_t *out,
- int8_t cos_bit, int bit) {
+static INLINE void iadst8_low1_neon(int16x8_t *const in, int16x8_t *out,
+ int8_t cos_bit, int bit) {
(void)bit;
const int32_t *cospi = cospi_arr(cos_bit);
- const int16x4_t c2 =
- create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32),
- (int16_t *)(cospi + 16), (int16_t *)(cospi + 48));
+ const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
+ (int16_t)cospi[16], (int16_t)cospi[48]);
int16x8_t x[8];
int16x8_t s0, s1, s4, s5;
@@ -381,34 +376,32 @@ static INLINE void iadst8_low1_new_neon(int16x8_t *const in, int16x8_t *out,
// Stage 7
out[0] = x[0];
- out[1] = vnegq_s16(x[4]);
+ out[1] = vqnegq_s16(x[4]);
out[2] = x[6];
- out[3] = vnegq_s16(x[2]);
+ out[3] = vqnegq_s16(x[2]);
out[4] = x[3];
- out[5] = vnegq_s16(x[7]);
+ out[5] = vqnegq_s16(x[7]);
out[6] = x[5];
- out[7] = vnegq_s16(x[1]);
+ out[7] = vqnegq_s16(x[1]);
}
-static INLINE void idct8_new_neon(int16x8_t *in, int16x8_t *out, int8_t cos_bit,
- int bit) {
+static INLINE void idct8_neon(int16x8_t *in, int16x8_t *out, int8_t cos_bit,
+ int bit) {
(void)bit;
const int32_t *cospi = cospi_arr(cos_bit);
int16x8_t step1[8], step2[8];
- const int16x4_t c0 =
- create_s16x4_neon((int16_t *)(cospi + 8), (int16_t *)(cospi + 56),
- (int16_t *)(cospi + 40), (int16_t *)(cospi + 24));
- const int16x4_t c2 =
- create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32),
- (int16_t *)(cospi + 16), (int16_t *)(cospi + 48));
+ const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56],
+ (int16_t)cospi[40], (int16_t)cospi[24]);
+ const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
+ (int16_t)cospi[16], (int16_t)cospi[48]);
// stage 2
btf_16_lane_0_1_neon(in[1], in[7], c0, &step1[7], &step1[4]);
btf_16_lane_2_3_neon(in[5], in[3], c0, &step1[6], &step1[5]);
// stage 3
- btf_16_lane_0_1_neon(in[0], in[4], c2, &step2[0], &step2[1]);
- btf_16_lane_2_3_neon(in[2], in[6], c2, &step2[3], &step2[2]);
+ btf_16_lane_0_1_neon(in[0], in[4], c1, &step2[0], &step2[1]);
+ btf_16_lane_2_3_neon(in[2], in[6], c1, &step2[3], &step2[2]);
step2[4] = vqaddq_s16(step1[4], step1[5]);
step2[5] = vqsubq_s16(step1[4], step1[5]);
step2[6] = vqsubq_s16(step1[7], step1[6]);
@@ -419,7 +412,7 @@ static INLINE void idct8_new_neon(int16x8_t *in, int16x8_t *out, int8_t cos_bit,
step1[1] = vqaddq_s16(step2[1], step2[2]);
step1[2] = vqsubq_s16(step2[1], step2[2]);
step1[3] = vqsubq_s16(step2[0], step2[3]);
- btf_16_lane_0_1_neon(step2[6], step2[5], c2, &step1[6], &step1[5]);
+ btf_16_lane_0_1_neon(step2[6], step2[5], c1, &step1[6], &step1[5]);
// stage 5
out[0] = vqaddq_s16(step1[0], step2[7]);
@@ -432,8 +425,8 @@ static INLINE void idct8_new_neon(int16x8_t *in, int16x8_t *out, int8_t cos_bit,
out[7] = vqsubq_s16(step1[0], step2[7]);
}
-static INLINE void idct8_low1_new_neon(int16x8_t *in, int16x8_t *out,
- int8_t cos_bit, int bit) {
+static INLINE void idct8_low1_neon(int16x8_t *in, int16x8_t *out,
+ int8_t cos_bit, int bit) {
(void)bit;
const int32_t *cospi = cospi_arr(cos_bit);
int16x8_t step1;
@@ -489,19 +482,24 @@ static INLINE void load_buffer_32bit_to_16bit_neon(const int32_t *input,
}
}
-static INLINE void identity8_new_neon(int16x8_t *input, int16x8_t *output,
- int8_t cos_bit, int bit) {
- (void)bit;
- (void)cos_bit;
-
- output[0] = vmulq_n_s16(input[0], (int16_t)2);
- output[1] = vmulq_n_s16(input[1], (int16_t)2);
- output[2] = vmulq_n_s16(input[2], (int16_t)2);
- output[3] = vmulq_n_s16(input[3], (int16_t)2);
- output[4] = vmulq_n_s16(input[4], (int16_t)2);
- output[5] = vmulq_n_s16(input[5], (int16_t)2);
- output[6] = vmulq_n_s16(input[6], (int16_t)2);
- output[7] = vmulq_n_s16(input[7], (int16_t)2);
+static int16_t sqrt_2_list[TX_SIZES] = { 5793, 2 * 4096, 2 * 5793, 4 * 4096,
+ 4 * 5793 };
+
+static INLINE void identity_txfm_round_neon(int16x8_t *input, int16x8_t *output,
+ int txw_idx, int8_t size, int bit) {
+ const int32x4_t dup_bits_n_32x4 = vdupq_n_s32((int32_t)(-bit));
+ int16x4_t scale = vdup_n_s16(sqrt_2_list[txw_idx]);
+ int16x4_t low_i16, high_i16;
+ int32x4_t low_i32, high_i32;
+ for (int i = 0; i < size; i++) {
+ int32x4_t temp_out_low = vmull_s16(vget_low_s16(input[i]), scale);
+ int32x4_t temp_out_high = vmull_s16(vget_high_s16(input[i]), scale);
+ low_i32 = vrshlq_s32(vrshrq_n_s32(temp_out_low, 12), dup_bits_n_32x4);
+ high_i32 = vrshlq_s32(vrshrq_n_s32(temp_out_high, 12), dup_bits_n_32x4);
+ low_i16 = vqmovn_s32(low_i32);
+ high_i16 = vqmovn_s32(high_i32);
+ output[i] = vcombine_s16(low_i16, high_i16);
+ }
}
static INLINE void round_shift_for_rect(int16x8_t *input, int16x8_t *output,
@@ -520,38 +518,8 @@ static INLINE void round_shift_for_rect(int16x8_t *input, int16x8_t *output,
}
}
-static INLINE void identity16_new_neon(int16x8_t *input, int16x8_t *output,
- int8_t cos_bit, int bit) {
- (void)bit;
- (void)cos_bit;
-
- int32x4_t out_low, out_high;
- int16x4_t low, high;
- int16_t scale = (int16_t)(2 * NewSqrt2);
-
- for (int z = 0; z < 16; ++z) {
- out_low = vmull_n_s16(vget_low_s16(input[z]), scale);
- out_high = vmull_n_s16(vget_high_s16(input[z]), scale);
-
- low = vqrshrn_n_s32(out_low, (int32_t)NewSqrt2Bits);
- high = vqrshrn_n_s32(out_high, (int32_t)NewSqrt2Bits);
-
- output[z] = vcombine_s16(low, high);
- }
-}
-
-static INLINE void identity32_new_neon(int16x8_t *input, int16x8_t *output,
- int8_t cos_bit, int bit) {
- (void)bit;
- (void)cos_bit;
-
- for (int z = 0; z < 32; ++z) {
- output[z] = vmulq_n_s16(input[z], (int16_t)4);
- }
-}
-
-static INLINE void idct16_low1_new_neon(int16x8_t *in, int16x8_t *out,
- int8_t cos_bit, int bit) {
+static INLINE void idct16_low1_neon(int16x8_t *in, int16x8_t *out,
+ int8_t cos_bit, int bit) {
(void)bit;
const int32_t *cospi = cospi_arr(cos_bit);
int16x8_t step1;
@@ -584,25 +552,23 @@ static INLINE void idct16_low1_new_neon(int16x8_t *in, int16x8_t *out,
out[15] = step1;
}
-static INLINE void idct16_new_neon(int16x8_t *in, int16x8_t *out,
- int8_t cos_bit, int bit) {
+static INLINE void idct16_neon(int16x8_t *in, int16x8_t *out, int8_t cos_bit,
+ int bit) {
(void)bit;
const int32_t *cospi = cospi_arr(cos_bit);
int16x8_t step1[16], step2[16];
- const int16x4_t c0 =
- create_s16x4_neon((int16_t *)(cospi + 4), (int16_t *)(cospi + 60),
- (int16_t *)(cospi + 36), (int16_t *)(cospi + 28));
- const int16x4_t c1 =
- create_s16x4_neon((int16_t *)(cospi + 20), (int16_t *)(cospi + 44),
- (int16_t *)(cospi + 52), (int16_t *)(cospi + 12));
- const int16x4_t c2 =
- create_s16x4_neon((int16_t *)(cospi + 8), (int16_t *)(cospi + 56),
- (int16_t *)(cospi + 40), (int16_t *)(cospi + 24));
- const int16x4_t c3 =
- create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32),
- (int16_t *)(cospi + 16), (int16_t *)(cospi + 48));
-
+ const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[4], (int16_t)cospi[60],
+ (int16_t)cospi[36], (int16_t)cospi[28]);
+ const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[20], (int16_t)cospi[44],
+ (int16_t)cospi[52], (int16_t)cospi[12]);
+ const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56],
+ (int16_t)cospi[40], (int16_t)cospi[24]);
+ const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
+ (int16_t)cospi[16], (int16_t)cospi[48]);
+ const int16x4_t c4 =
+ set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]),
+ (int16_t)(-cospi[16]), (int16_t)(-cospi[48]));
// stage 2
btf_16_lane_0_1_neon(in[1], in[15], c0, &step2[15], &step2[8]);
@@ -642,8 +608,7 @@ static INLINE void idct16_new_neon(int16x8_t *in, int16x8_t *out,
btf_16_lane_0_1_neon(step1[0], step1[1], c3, &step2[0], &step2[1]);
btf_16_lane_2_3_neon(step1[2], step1[3], c3, &step2[3], &step2[2]);
btf_16_lane_2_3_neon(step1[14], step1[9], c3, &step2[14], &step2[9]);
- btf_16_lane_3_2_neon(vnegq_s16(step1[10]), vnegq_s16(step1[13]), c3,
- &step2[10], &step2[13]);
+ btf_16_lane_3_2_neon(step1[10], step1[13], c4, &step2[10], &step2[13]);
step2[4] = vqaddq_s16(step1[4], step1[5]);
step2[5] = vqsubq_s16(step1[4], step1[5]);
@@ -710,14 +675,16 @@ static INLINE void idct16_new_neon(int16x8_t *in, int16x8_t *out,
out[15] = vqsubq_s16(step2[0], step2[15]);
}
-static INLINE void idct16_low8_new_neon(int16x8_t *in, int16x8_t *out,
- int8_t cos_bit, int bit) {
+static INLINE void idct16_low8_neon(int16x8_t *in, int16x8_t *out,
+ int8_t cos_bit, int bit) {
(void)bit;
const int32_t *cospi = cospi_arr(cos_bit);
int16x8_t step1[16], step2[16];
- const int16x4_t c0 =
- create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32),
- (int16_t *)(cospi + 16), (int16_t *)(cospi + 48));
+ const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
+ (int16_t)cospi[16], (int16_t)cospi[48]);
+ const int16x4_t c1 =
+ set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]),
+ (int16_t)(-cospi[16]), (int16_t)(-cospi[48]));
// stage 1
// stage 2
@@ -753,8 +720,7 @@ static INLINE void idct16_low8_new_neon(int16x8_t *in, int16x8_t *out,
btf_16_neon(step1[0], cospi[32], cospi[32], &step2[0], &step2[1]);
btf_16_neon(step1[2], cospi[48], cospi[16], &step2[2], &step2[3]);
btf_16_lane_2_3_neon(step1[14], step1[9], c0, &step2[14], &step2[9]);
- btf_16_lane_3_2_neon(vnegq_s16(step1[10]), vnegq_s16(step1[13]), c0,
- &step2[10], &step2[13]);
+ btf_16_lane_3_2_neon(step1[10], step1[13], c1, &step2[10], &step2[13]);
step2[4] = vqaddq_s16(step1[4], step1[5]);
step2[5] = vqsubq_s16(step1[4], step1[5]);
@@ -820,30 +786,23 @@ static INLINE void idct16_low8_new_neon(int16x8_t *in, int16x8_t *out,
out[15] = vqsubq_s16(step2[0], step2[15]);
}
-static INLINE void iadst16_new_neon(int16x8_t *const in, int16x8_t *out,
- int8_t cos_bit, int bit) {
+static INLINE void iadst16_neon(int16x8_t *const in, int16x8_t *out,
+ int8_t cos_bit, int bit) {
(void)bit;
const int32_t *cospi = cospi_arr(cos_bit);
- const int16x4_t c0 =
- create_s16x4_neon((int16_t *)(cospi + 2), (int16_t *)(cospi + 62),
- (int16_t *)(cospi + 10), (int16_t *)(cospi + 54));
- const int16x4_t c1 =
- create_s16x4_neon((int16_t *)(cospi + 18), (int16_t *)(cospi + 46),
- (int16_t *)(cospi + 26), (int16_t *)(cospi + 38));
- const int16x4_t c2 =
- create_s16x4_neon((int16_t *)(cospi + 34), (int16_t *)(cospi + 30),
- (int16_t *)(cospi + 42), (int16_t *)(cospi + 22));
- const int16x4_t c3 =
- create_s16x4_neon((int16_t *)(cospi + 50), (int16_t *)(cospi + 14),
- (int16_t *)(cospi + 58), (int16_t *)(cospi + 6));
- const int16x4_t c4 =
- create_s16x4_neon((int16_t *)(cospi + 8), (int16_t *)(cospi + 56),
- (int16_t *)(cospi + 40), (int16_t *)(cospi + 24));
-
- const int16x4_t c =
- create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32),
- (int16_t *)(cospi + 16), (int16_t *)(cospi + 48));
+ const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[2], (int16_t)cospi[62],
+ (int16_t)cospi[10], (int16_t)cospi[54]);
+ const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[18], (int16_t)cospi[46],
+ (int16_t)cospi[26], (int16_t)cospi[38]);
+ const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[34], (int16_t)cospi[30],
+ (int16_t)cospi[42], (int16_t)cospi[22]);
+ const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[50], (int16_t)cospi[14],
+ (int16_t)cospi[58], (int16_t)cospi[6]);
+ const int16x4_t c4 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56],
+ (int16_t)cospi[40], (int16_t)cospi[24]);
+ const int16x4_t c5 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
+ (int16_t)cospi[16], (int16_t)cospi[48]);
int16x8_t x[16];
int16x8_t t[14];
@@ -933,14 +892,14 @@ static INLINE void iadst16_new_neon(int16x8_t *const in, int16x8_t *out,
t[1] = x[1];
t[2] = x[2];
t[3] = x[3];
- btf_16_lane_2_3_neon(x[4], x[5], c, &s4, &s5);
- btf_16_lane_3_2_neon(x[7], x[6], c, &s7, &s6);
+ btf_16_lane_2_3_neon(x[4], x[5], c5, &s4, &s5);
+ btf_16_lane_3_2_neon(x[7], x[6], c5, &s7, &s6);
t[8] = x[8];
t[9] = x[9];
t[10] = x[10];
t[11] = x[11];
- btf_16_lane_2_3_neon(x[12], x[13], c, &s12, &s13);
- btf_16_lane_3_2_neon(x[15], x[14], c, &s15, &s14);
+ btf_16_lane_2_3_neon(x[12], x[13], c5, &s12, &s13);
+ btf_16_lane_3_2_neon(x[15], x[14], c5, &s15, &s14);
// Stage 7
x[0] = vqaddq_s16(t[0], t[2]);
@@ -961,40 +920,38 @@ static INLINE void iadst16_new_neon(int16x8_t *const in, int16x8_t *out,
x[15] = vqsubq_s16(s13, s15);
// Stage 8
- btf_16_half_neon(x + 2, c);
- btf_16_half_neon(x + 6, c);
- btf_16_half_neon(x + 10, c);
- btf_16_half_neon(x + 14, c);
+ btf_16_half_neon(x + 2, c5);
+ btf_16_half_neon(x + 6, c5);
+ btf_16_half_neon(x + 10, c5);
+ btf_16_half_neon(x + 14, c5);
// Stage 9
out[0] = x[0];
- out[1] = vnegq_s16(x[8]);
+ out[1] = vqnegq_s16(x[8]);
out[2] = x[12];
- out[3] = vnegq_s16(x[4]);
+ out[3] = vqnegq_s16(x[4]);
out[4] = x[6];
- out[5] = vnegq_s16(x[14]);
+ out[5] = vqnegq_s16(x[14]);
out[6] = x[10];
- out[7] = vnegq_s16(x[2]);
+ out[7] = vqnegq_s16(x[2]);
out[8] = x[3];
- out[9] = vnegq_s16(x[11]);
+ out[9] = vqnegq_s16(x[11]);
out[10] = x[15];
- out[11] = vnegq_s16(x[7]);
+ out[11] = vqnegq_s16(x[7]);
out[12] = x[5];
- out[13] = vnegq_s16(x[13]);
+ out[13] = vqnegq_s16(x[13]);
out[14] = x[9];
- out[15] = vnegq_s16(x[1]);
+ out[15] = vqnegq_s16(x[1]);
}
-static INLINE void iadst16_low1_new_neon(int16x8_t *const in, int16x8_t *out,
- int8_t cos_bit, int bit) {
+static INLINE void iadst16_low1_neon(int16x8_t *const in, int16x8_t *out,
+ int8_t cos_bit, int bit) {
(void)bit;
const int32_t *cospi = cospi_arr(cos_bit);
- const int16x4_t c4 =
- create_s16x4_neon((int16_t *)(cospi + 8), (int16_t *)(cospi + 56),
- (int16_t *)(cospi + 40), (int16_t *)(cospi + 24));
- const int16x4_t c =
- create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32),
- (int16_t *)(cospi + 16), (int16_t *)(cospi + 48));
+ const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56],
+ (int16_t)cospi[40], (int16_t)cospi[24]);
+ const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
+ (int16_t)cospi[16], (int16_t)cospi[48]);
int16x8_t x[16];
int16x8_t t[10];
@@ -1016,7 +973,7 @@ static INLINE void iadst16_low1_new_neon(int16x8_t *const in, int16x8_t *out,
// Stage 4
t[0] = x[0];
t[1] = x[1];
- btf_16_lane_0_1_neon(x[8], x[9], c4, &s8, &s9);
+ btf_16_lane_0_1_neon(x[8], x[9], c0, &s8, &s9);
// Stage 5
x[0] = t[0];
@@ -1031,10 +988,10 @@ static INLINE void iadst16_low1_new_neon(int16x8_t *const in, int16x8_t *out,
// stage 6
t[0] = x[0];
t[1] = x[1];
- btf_16_lane_2_3_neon(x[4], x[5], c, &s4, &s5);
+ btf_16_lane_2_3_neon(x[4], x[5], c1, &s4, &s5);
t[8] = x[8];
t[9] = x[9];
- btf_16_lane_2_3_neon(x[12], x[13], c, &s12, &s13);
+ btf_16_lane_2_3_neon(x[12], x[13], c1, &s12, &s13);
// Stage 7
x[0] = t[0];
@@ -1055,41 +1012,39 @@ static INLINE void iadst16_low1_new_neon(int16x8_t *const in, int16x8_t *out,
x[15] = s13;
// Stage 8
- btf_16_half_neon(x + 2, c);
- btf_16_half_neon(x + 6, c);
- btf_16_half_neon(x + 10, c);
- btf_16_half_neon(x + 14, c);
+ btf_16_half_neon(x + 2, c1);
+ btf_16_half_neon(x + 6, c1);
+ btf_16_half_neon(x + 10, c1);
+ btf_16_half_neon(x + 14, c1);
// Stage 9
out[0] = x[0];
- out[1] = vnegq_s16(x[8]);
+ out[1] = vqnegq_s16(x[8]);
out[2] = x[12];
- out[3] = vnegq_s16(x[4]);
+ out[3] = vqnegq_s16(x[4]);
out[4] = x[6];
- out[5] = vnegq_s16(x[14]);
+ out[5] = vqnegq_s16(x[14]);
out[6] = x[10];
- out[7] = vnegq_s16(x[2]);
+ out[7] = vqnegq_s16(x[2]);
out[8] = x[3];
- out[9] = vnegq_s16(x[11]);
+ out[9] = vqnegq_s16(x[11]);
out[10] = x[15];
- out[11] = vnegq_s16(x[7]);
+ out[11] = vqnegq_s16(x[7]);
out[12] = x[5];
- out[13] = vnegq_s16(x[13]);
+ out[13] = vqnegq_s16(x[13]);
out[14] = x[9];
- out[15] = vnegq_s16(x[1]);
+ out[15] = vqnegq_s16(x[1]);
}
-static INLINE void iadst16_low8_new_neon(int16x8_t *const in, int16x8_t *out,
- int8_t cos_bit, int bit) {
+static INLINE void iadst16_low8_neon(int16x8_t *const in, int16x8_t *out,
+ int8_t cos_bit, int bit) {
(void)bit;
const int32_t *cospi = cospi_arr(cos_bit);
- const int16x4_t c4 =
- create_s16x4_neon((int16_t *)(cospi + 8), (int16_t *)(cospi + 56),
- (int16_t *)(cospi + 40), (int16_t *)(cospi + 24));
- const int16x4_t c =
- create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32),
- (int16_t *)(cospi + 16), (int16_t *)(cospi + 48));
+ const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56],
+ (int16_t)cospi[40], (int16_t)cospi[24]);
+ const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
+ (int16_t)cospi[16], (int16_t)cospi[48]);
int16x8_t x[16];
int16x8_t t[14];
@@ -1144,10 +1099,10 @@ static INLINE void iadst16_low8_new_neon(int16x8_t *const in, int16x8_t *out,
t[5] = x[5];
t[6] = x[6];
t[7] = x[7];
- btf_16_lane_0_1_neon(x[8], x[9], c4, &s8, &s9);
- btf_16_lane_2_3_neon(x[10], x[11], c4, &s10, &s11);
- btf_16_lane_1_0_neon(x[13], x[12], c4, &s13, &s12);
- btf_16_lane_3_2_neon(x[15], x[14], c4, &s15, &s14);
+ btf_16_lane_0_1_neon(x[8], x[9], c0, &s8, &s9);
+ btf_16_lane_2_3_neon(x[10], x[11], c0, &s10, &s11);
+ btf_16_lane_1_0_neon(x[13], x[12], c0, &s13, &s12);
+ btf_16_lane_3_2_neon(x[15], x[14], c0, &s15, &s14);
// Stage 5
x[0] = vqaddq_s16(t[0], t[4]);
@@ -1172,14 +1127,14 @@ static INLINE void iadst16_low8_new_neon(int16x8_t *const in, int16x8_t *out,
t[1] = x[1];
t[2] = x[2];
t[3] = x[3];
- btf_16_lane_2_3_neon(x[4], x[5], c, &s4, &s5);
- btf_16_lane_3_2_neon(x[7], x[6], c, &s7, &s6);
+ btf_16_lane_2_3_neon(x[4], x[5], c1, &s4, &s5);
+ btf_16_lane_3_2_neon(x[7], x[6], c1, &s7, &s6);
t[8] = x[8];
t[9] = x[9];
t[10] = x[10];
t[11] = x[11];
- btf_16_lane_2_3_neon(x[12], x[13], c, &s12, &s13);
- btf_16_lane_3_2_neon(x[15], x[14], c, &s15, &s14);
+ btf_16_lane_2_3_neon(x[12], x[13], c1, &s12, &s13);
+ btf_16_lane_3_2_neon(x[15], x[14], c1, &s15, &s14);
// Stage 7
x[0] = vqaddq_s16(t[0], t[2]);
@@ -1200,60 +1155,58 @@ static INLINE void iadst16_low8_new_neon(int16x8_t *const in, int16x8_t *out,
x[15] = vqsubq_s16(s13, s15);
// Stage 8
- btf_16_half_neon(x + 2, c);
- btf_16_half_neon(x + 6, c);
- btf_16_half_neon(x + 10, c);
- btf_16_half_neon(x + 14, c);
+ btf_16_half_neon(x + 2, c1);
+ btf_16_half_neon(x + 6, c1);
+ btf_16_half_neon(x + 10, c1);
+ btf_16_half_neon(x + 14, c1);
// Stage 9
out[0] = x[0];
- out[1] = vnegq_s16(x[8]);
+ out[1] = vqnegq_s16(x[8]);
out[2] = x[12];
- out[3] = vnegq_s16(x[4]);
+ out[3] = vqnegq_s16(x[4]);
out[4] = x[6];
- out[5] = vnegq_s16(x[14]);
+ out[5] = vqnegq_s16(x[14]);
out[6] = x[10];
- out[7] = vnegq_s16(x[2]);
+ out[7] = vqnegq_s16(x[2]);
out[8] = x[3];
- out[9] = vnegq_s16(x[11]);
+ out[9] = vqnegq_s16(x[11]);
out[10] = x[15];
- out[11] = vnegq_s16(x[7]);
+ out[11] = vqnegq_s16(x[7]);
out[12] = x[5];
- out[13] = vnegq_s16(x[13]);
+ out[13] = vqnegq_s16(x[13]);
out[14] = x[9];
- out[15] = vnegq_s16(x[1]);
+ out[15] = vqnegq_s16(x[1]);
}
-static INLINE void idct32_new_neon(int16x8_t *in, int16x8_t *out,
- int8_t cos_bit, int bit) {
+static INLINE void idct32_neon(int16x8_t *in, int16x8_t *out, int8_t cos_bit,
+ int bit) {
(void)bit;
const int32_t *cospi = cospi_arr(cos_bit);
int16x8_t step1[32], step2[32];
- const int16x4_t c0 =
- create_s16x4_neon((int16_t *)(cospi + 2), (int16_t *)(cospi + 62),
- (int16_t *)(cospi + 34), (int16_t *)(cospi + 30));
- const int16x4_t c1 =
- create_s16x4_neon((int16_t *)(cospi + 18), (int16_t *)(cospi + 46),
- (int16_t *)(cospi + 50), (int16_t *)(cospi + 14));
- const int16x4_t c2 =
- create_s16x4_neon((int16_t *)(cospi + 10), (int16_t *)(cospi + 54),
- (int16_t *)(cospi + 42), (int16_t *)(cospi + 22));
- const int16x4_t c3 =
- create_s16x4_neon((int16_t *)(cospi + 26), (int16_t *)(cospi + 38),
- (int16_t *)(cospi + 58), (int16_t *)(cospi + 6));
- const int16x4_t c4 =
- create_s16x4_neon((int16_t *)(cospi + 4), (int16_t *)(cospi + 60),
- (int16_t *)(cospi + 36), (int16_t *)(cospi + 28));
- const int16x4_t c5 =
- create_s16x4_neon((int16_t *)(cospi + 20), (int16_t *)(cospi + 44),
- (int16_t *)(cospi + 52), (int16_t *)(cospi + 12));
- const int16x4_t c6 =
- create_s16x4_neon((int16_t *)(cospi + 8), (int16_t *)(cospi + 56),
- (int16_t *)(cospi + 40), (int16_t *)(cospi + 24));
- const int16x4_t c7 =
- create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32),
- (int16_t *)(cospi + 16), (int16_t *)(cospi + 48));
+ const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[2], (int16_t)cospi[62],
+ (int16_t)cospi[34], (int16_t)cospi[30]);
+ const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[18], (int16_t)cospi[46],
+ (int16_t)cospi[50], (int16_t)cospi[14]);
+ const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[10], (int16_t)cospi[54],
+ (int16_t)cospi[42], (int16_t)cospi[22]);
+ const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[26], (int16_t)cospi[38],
+ (int16_t)cospi[58], (int16_t)cospi[6]);
+ const int16x4_t c4 = set_s16x4_neon((int16_t)cospi[4], (int16_t)cospi[60],
+ (int16_t)cospi[36], (int16_t)cospi[28]);
+ const int16x4_t c5 = set_s16x4_neon((int16_t)cospi[20], (int16_t)cospi[44],
+ (int16_t)cospi[52], (int16_t)cospi[12]);
+ const int16x4_t c6 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56],
+ (int16_t)cospi[40], (int16_t)cospi[24]);
+ const int16x4_t c7 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
+ (int16_t)cospi[16], (int16_t)cospi[48]);
+ const int16x4_t c8 =
+ set_s16x4_neon((int16_t)(-cospi[8]), (int16_t)(-cospi[56]),
+ (int16_t)(-cospi[40]), (int16_t)(-cospi[24]));
+ const int16x4_t c9 =
+ set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]),
+ (int16_t)(-cospi[16]), (int16_t)(-cospi[48]));
// stage 2
@@ -1321,11 +1274,9 @@ static INLINE void idct32_new_neon(int16x8_t *in, int16x8_t *out,
btf_16_lane_0_1_neon(step1[4], step1[7], c6, &step2[7], &step2[4]);
btf_16_lane_2_3_neon(step1[5], step1[6], c6, &step2[6], &step2[5]);
btf_16_lane_0_1_neon(step1[30], step1[17], c6, &step2[30], &step2[17]);
- btf_16_lane_1_0_neon(vnegq_s16(step1[18]), vnegq_s16(step1[29]), c6,
- &step2[18], &step2[29]);
+ btf_16_lane_1_0_neon(step1[18], step1[29], c8, &step2[18], &step2[29]);
btf_16_lane_2_3_neon(step1[26], step1[21], c6, &step2[26], &step2[21]);
- btf_16_lane_3_2_neon(vnegq_s16(step1[22]), vnegq_s16(step1[25]), c6,
- &step2[22], &step2[25]);
+ btf_16_lane_3_2_neon(step1[22], step1[25], c8, &step2[22], &step2[25]);
step2[0] = step1[0];
step2[1] = step1[1];
@@ -1353,8 +1304,7 @@ static INLINE void idct32_new_neon(int16x8_t *in, int16x8_t *out,
btf_16_lane_0_1_neon(step2[0], step2[1], c7, &step1[0], &step1[1]);
btf_16_lane_2_3_neon(step2[2], step2[3], c7, &step1[3], &step1[2]);
btf_16_lane_2_3_neon(step2[14], step2[9], c7, &step1[14], &step1[9]);
- btf_16_lane_3_2_neon(vnegq_s16(step2[10]), vnegq_s16(step2[13]), c7,
- &step1[10], &step1[13]);
+ btf_16_lane_3_2_neon(step2[10], step2[13], c9, &step1[10], &step1[13]);
step1[4] = vqaddq_s16(step2[4], step2[5]);
step1[5] = vqsubq_s16(step2[4], step2[5]);
@@ -1386,10 +1336,8 @@ static INLINE void idct32_new_neon(int16x8_t *in, int16x8_t *out,
btf_16_lane_0_1_neon(step1[6], step1[5], c7, &step2[6], &step2[5]);
btf_16_lane_2_3_neon(step1[29], step1[18], c7, &step2[29], &step2[18]);
btf_16_lane_2_3_neon(step1[28], step1[19], c7, &step2[28], &step2[19]);
- btf_16_lane_3_2_neon(vnegq_s16(step1[20]), vnegq_s16(step1[27]), c7,
- &step2[20], &step2[27]);
- btf_16_lane_3_2_neon(vnegq_s16(step1[21]), vnegq_s16(step1[26]), c7,
- &step2[21], &step2[26]);
+ btf_16_lane_3_2_neon(step1[20], step1[27], c9, &step2[20], &step2[27]);
+ btf_16_lane_3_2_neon(step1[21], step1[26], c9, &step2[21], &step2[26]);
step2[0] = vqaddq_s16(step1[0], step1[3]);
step2[1] = vqaddq_s16(step1[1], step1[2]);
@@ -1516,8 +1464,8 @@ static INLINE void idct32_new_neon(int16x8_t *in, int16x8_t *out,
out[31] = vqsubq_s16(step2[0], step2[31]);
}
-static INLINE void idct32_low1_new_neon(int16x8_t *in, int16x8_t *out,
- int8_t cos_bit, int bit) {
+static INLINE void idct32_low1_neon(int16x8_t *in, int16x8_t *out,
+ int8_t cos_bit, int bit) {
(void)bit;
const int32_t *cospi = cospi_arr(cos_bit);
int16x8_t step1;
@@ -1573,19 +1521,22 @@ static INLINE void idct32_low1_new_neon(int16x8_t *in, int16x8_t *out,
out[31] = step1;
}
-static INLINE void idct32_low8_new_neon(int16x8_t *in, int16x8_t *out,
- int8_t cos_bit, int bit) {
+static INLINE void idct32_low8_neon(int16x8_t *in, int16x8_t *out,
+ int8_t cos_bit, int bit) {
(void)bit;
const int32_t *cospi = cospi_arr(cos_bit);
int16x8_t step1[32], step2[32];
int32x4_t t32[16];
- const int16x4_t c0 =
- create_s16x4_neon((int16_t *)(cospi + 8), (int16_t *)(cospi + 56),
- (int16_t *)(cospi + 40), (int16_t *)(cospi + 24));
- const int16x4_t c1 =
- create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32),
- (int16_t *)(cospi + 16), (int16_t *)(cospi + 48));
-
+ const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56],
+ (int16_t)cospi[40], (int16_t)cospi[24]);
+ const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
+ (int16_t)cospi[16], cospi[48]);
+ const int16x4_t c2 =
+ set_s16x4_neon((int16_t)(-cospi[8]), (int16_t)(-cospi[56]),
+ (int16_t)(-cospi[40]), (int16_t)(-cospi[24]));
+ const int16x4_t c3 =
+ set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]),
+ (int16_t)(-cospi[16]), (int16_t)(-cospi[48]));
// stage 1
// stage 2
@@ -1627,11 +1578,9 @@ static INLINE void idct32_low8_new_neon(int16x8_t *in, int16x8_t *out,
btf_16_neon(step1[4], cospi[56], cospi[8], &step2[4], &step2[7]);
btf_16_lane_0_1_neon(step1[30], step1[17], c0, &step2[30], &step2[17]);
- btf_16_lane_1_0_neon(vnegq_s16(step1[18]), vnegq_s16(step1[29]), c0,
- &step2[18], &step2[29]);
+ btf_16_lane_1_0_neon(step1[18], step1[29], c2, &step2[18], &step2[29]);
btf_16_lane_2_3_neon(step1[26], step1[21], c0, &step2[26], &step2[21]);
- btf_16_lane_3_2_neon(vnegq_s16(step1[22]), vnegq_s16(step1[25]), c0,
- &step2[22], &step2[25]);
+ btf_16_lane_3_2_neon(step1[22], step1[25], c2, &step2[22], &step2[25]);
step2[0] = step1[0];
step2[8] = step1[8];
@@ -1659,8 +1608,7 @@ static INLINE void idct32_low8_new_neon(int16x8_t *in, int16x8_t *out,
vrshrn_n_s32(t32[1], INV_COS_BIT));
btf_16_lane_2_3_neon(step2[14], step2[9], c1, &step1[14], &step1[9]);
- btf_16_lane_3_2_neon(vnegq_s16(step2[10]), vnegq_s16(step2[13]), c1,
- &step1[10], &step1[13]);
+ btf_16_lane_3_2_neon(step2[10], step2[13], c3, &step1[10], &step1[13]);
step1[4] = step2[4];
step1[5] = step2[4];
@@ -1692,10 +1640,8 @@ static INLINE void idct32_low8_new_neon(int16x8_t *in, int16x8_t *out,
btf_16_lane_0_1_neon(step1[6], step1[5], c1, &step2[6], &step2[5]);
btf_16_lane_2_3_neon(step1[29], step1[18], c1, &step2[29], &step2[18]);
btf_16_lane_2_3_neon(step1[28], step1[19], c1, &step2[28], &step2[19]);
- btf_16_lane_3_2_neon(vnegq_s16(step1[20]), vnegq_s16(step1[27]), c1,
- &step2[20], &step2[27]);
- btf_16_lane_3_2_neon(vnegq_s16(step1[21]), vnegq_s16(step1[26]), c1,
- &step2[21], &step2[26]);
+ btf_16_lane_3_2_neon(step1[20], step1[27], c3, &step2[20], &step2[27]);
+ btf_16_lane_3_2_neon(step1[21], step1[26], c3, &step2[21], &step2[26]);
step2[0] = step1[0];
step2[1] = step1[0];
@@ -1822,18 +1768,22 @@ static INLINE void idct32_low8_new_neon(int16x8_t *in, int16x8_t *out,
out[31] = vqsubq_s16(step2[0], step2[31]);
}
-static INLINE void idct32_low16_new_neon(int16x8_t *in, int16x8_t *out,
- int8_t cos_bit, int bit) {
+static INLINE void idct32_low16_neon(int16x8_t *in, int16x8_t *out,
+ int8_t cos_bit, int bit) {
(void)bit;
const int32_t *cospi = cospi_arr(cos_bit);
int16x8_t step1[32], step2[32];
int32x4_t t32[16];
- const int16x4_t c0 =
- create_s16x4_neon((int16_t *)(cospi + 8), (int16_t *)(cospi + 56),
- (int16_t *)(cospi + 40), (int16_t *)(cospi + 24));
- const int16x4_t c1 =
- create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32),
- (int16_t *)(cospi + 16), (int16_t *)(cospi + 48));
+ const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56],
+ (int16_t)cospi[40], (int16_t)cospi[24]);
+ const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
+ (int16_t)cospi[16], (int16_t)cospi[48]);
+ const int16x4_t c2 =
+ set_s16x4_neon((int16_t)(-cospi[8]), (int16_t)(-cospi[56]),
+ (int16_t)(-cospi[40]), (int16_t)(-cospi[24]));
+ const int16x4_t c3 =
+ set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]),
+ (int16_t)(-cospi[16]), (int16_t)(-cospi[48]));
// stage 1
// stage 2
@@ -1889,11 +1839,9 @@ static INLINE void idct32_low16_new_neon(int16x8_t *in, int16x8_t *out,
btf_16_neon(step1[4], cospi[56], cospi[8], &step2[4], &step2[7]);
btf_16_neon(step1[6], -cospi[40], cospi[24], &step2[5], &step2[6]);
btf_16_lane_0_1_neon(step1[30], step1[17], c0, &step2[30], &step2[17]);
- btf_16_lane_1_0_neon(vnegq_s16(step1[18]), vnegq_s16(step1[29]), c0,
- &step2[18], &step2[29]);
+ btf_16_lane_1_0_neon(step1[18], step1[29], c2, &step2[18], &step2[29]);
btf_16_lane_2_3_neon(step1[26], step1[21], c0, &step2[26], &step2[21]);
- btf_16_lane_3_2_neon(vnegq_s16(step1[22]), vnegq_s16(step1[25]), c0,
- &step2[22], &step2[25]);
+ btf_16_lane_3_2_neon(step1[22], step1[25], c2, &step2[22], &step2[25]);
step2[0] = step1[0];
step2[2] = step1[2];
@@ -1924,8 +1872,7 @@ static INLINE void idct32_low16_new_neon(int16x8_t *in, int16x8_t *out,
btf_16_neon(step2[2], cospi[48], cospi[16], &step1[2], &step1[3]);
btf_16_lane_2_3_neon(step2[14], step2[9], c1, &step1[14], &step1[9]);
- btf_16_lane_3_2_neon(vnegq_s16(step2[10]), vnegq_s16(step2[13]), c1,
- &step1[10], &step1[13]);
+ btf_16_lane_3_2_neon(step2[10], step2[13], c3, &step1[10], &step1[13]);
step1[4] = vqaddq_s16(step2[4], step2[5]);
step1[5] = vqsubq_s16(step2[4], step2[5]);
@@ -1957,10 +1904,8 @@ static INLINE void idct32_low16_new_neon(int16x8_t *in, int16x8_t *out,
btf_16_lane_0_1_neon(step1[6], step1[5], c1, &step2[6], &step2[5]);
btf_16_lane_2_3_neon(step1[29], step1[18], c1, &step2[29], &step2[18]);
btf_16_lane_2_3_neon(step1[28], step1[19], c1, &step2[28], &step2[19]);
- btf_16_lane_3_2_neon(vnegq_s16(step1[20]), vnegq_s16(step1[27]), c1,
- &step2[20], &step2[27]);
- btf_16_lane_3_2_neon(vnegq_s16(step1[21]), vnegq_s16(step1[26]), c1,
- &step2[21], &step2[26]);
+ btf_16_lane_3_2_neon(step1[20], step1[27], c3, &step2[20], &step2[27]);
+ btf_16_lane_3_2_neon(step1[21], step1[26], c3, &step2[21], &step2[26]);
step2[0] = vqaddq_s16(step1[0], step1[3]);
step2[1] = vqaddq_s16(step1[0], step1[2]);
@@ -2086,33 +2031,1542 @@ static INLINE void idct32_low16_new_neon(int16x8_t *in, int16x8_t *out,
out[30] = vqsubq_s16(step2[1], step2[30]);
out[31] = vqsubq_s16(step2[0], step2[31]);
}
+static INLINE void idct64_stage9_neon(int16x8_t *step2, int16x8_t *step1,
+ int8_t cos_bit) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
+ (int16_t)cospi[16], (int16_t)cospi[48]);
+
+ btf_16_lane_0_1_neon(step2[27], step2[20], c3, &step1[27], &step1[20]);
+ btf_16_lane_0_1_neon(step2[26], step2[21], c3, &step1[26], &step1[21]);
+ btf_16_lane_0_1_neon(step2[25], step2[22], c3, &step1[25], &step1[22]);
+ btf_16_lane_0_1_neon(step2[24], step2[23], c3, &step1[24], &step1[23]);
+
+ step1[0] = vqaddq_s16(step2[0], step2[15]);
+ step1[1] = vqaddq_s16(step2[1], step2[14]);
+ step1[2] = vqaddq_s16(step2[2], step2[13]);
+ step1[3] = vqaddq_s16(step2[3], step2[12]);
+ step1[4] = vqaddq_s16(step2[4], step2[11]);
+ step1[5] = vqaddq_s16(step2[5], step2[10]);
+ step1[6] = vqaddq_s16(step2[6], step2[9]);
+ step1[7] = vqaddq_s16(step2[7], step2[8]);
+ step1[8] = vqsubq_s16(step2[7], step2[8]);
+ step1[9] = vqsubq_s16(step2[6], step2[9]);
+ step1[10] = vqsubq_s16(step2[5], step2[10]);
+ step1[11] = vqsubq_s16(step2[4], step2[11]);
+ step1[12] = vqsubq_s16(step2[3], step2[12]);
+ step1[13] = vqsubq_s16(step2[2], step2[13]);
+ step1[14] = vqsubq_s16(step2[1], step2[14]);
+ step1[15] = vqsubq_s16(step2[0], step2[15]);
+ step1[16] = step2[16];
+ step1[17] = step2[17];
+ step1[18] = step2[18];
+ step1[19] = step2[19];
+ step1[28] = step2[28];
+ step1[29] = step2[29];
+ step1[30] = step2[30];
+ step1[31] = step2[31];
+ step1[32] = vqaddq_s16(step2[32], step2[47]);
+ step1[33] = vqaddq_s16(step2[33], step2[46]);
+ step1[34] = vqaddq_s16(step2[34], step2[45]);
+ step1[35] = vqaddq_s16(step2[35], step2[44]);
+ step1[36] = vqaddq_s16(step2[36], step2[43]);
+ step1[37] = vqaddq_s16(step2[37], step2[42]);
+ step1[38] = vqaddq_s16(step2[38], step2[41]);
+ step1[39] = vqaddq_s16(step2[39], step2[40]);
+ step1[40] = vqsubq_s16(step2[39], step2[40]);
+ step1[41] = vqsubq_s16(step2[38], step2[41]);
+ step1[42] = vqsubq_s16(step2[37], step2[42]);
+ step1[43] = vqsubq_s16(step2[36], step2[43]);
+ step1[44] = vqsubq_s16(step2[35], step2[44]);
+ step1[45] = vqsubq_s16(step2[34], step2[45]);
+ step1[46] = vqsubq_s16(step2[33], step2[46]);
+ step1[47] = vqsubq_s16(step2[32], step2[47]);
+ step1[48] = vqsubq_s16(step2[63], step2[48]);
+ step1[49] = vqsubq_s16(step2[62], step2[49]);
+ step1[50] = vqsubq_s16(step2[61], step2[50]);
+ step1[51] = vqsubq_s16(step2[60], step2[51]);
+ step1[52] = vqsubq_s16(step2[59], step2[52]);
+ step1[53] = vqsubq_s16(step2[58], step2[53]);
+ step1[54] = vqsubq_s16(step2[57], step2[54]);
+ step1[55] = vqsubq_s16(step2[56], step2[55]);
+ step1[56] = vqaddq_s16(step2[56], step2[55]);
+ step1[57] = vqaddq_s16(step2[57], step2[54]);
+ step1[58] = vqaddq_s16(step2[58], step2[53]);
+ step1[59] = vqaddq_s16(step2[59], step2[52]);
+ step1[60] = vqaddq_s16(step2[60], step2[51]);
+ step1[61] = vqaddq_s16(step2[61], step2[50]);
+ step1[62] = vqaddq_s16(step2[62], step2[49]);
+ step1[63] = vqaddq_s16(step2[63], step2[48]);
+}
+
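+// Stage 10 of the 64-point inverse DCT: cospi[32] rotations on rows 40-55,
+// add/sub butterflies on rows 0-31, pass-through elsewhere.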
+static INLINE void idct64_stage10_neon(int16x8_t *step1, int16x8_t *step2,
+ int8_t cos_bit) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
+ (int16_t)cospi[16], (int16_t)cospi[48]);
+
+ btf_16_lane_0_1_neon(step1[55], step1[40], c3, &step2[55], &step2[40]);
+ btf_16_lane_0_1_neon(step1[54], step1[41], c3, &step2[54], &step2[41]);
+ btf_16_lane_0_1_neon(step1[53], step1[42], c3, &step2[53], &step2[42]);
+ btf_16_lane_0_1_neon(step1[52], step1[43], c3, &step2[52], &step2[43]);
+ btf_16_lane_0_1_neon(step1[51], step1[44], c3, &step2[51], &step2[44]);
+ btf_16_lane_0_1_neon(step1[50], step1[45], c3, &step2[50], &step2[45]);
+ btf_16_lane_0_1_neon(step1[49], step1[46], c3, &step2[49], &step2[46]);
+ btf_16_lane_0_1_neon(step1[48], step1[47], c3, &step2[48], &step2[47]);
+
+ step2[0] = vqaddq_s16(step1[0], step1[31]);
+ step2[1] = vqaddq_s16(step1[1], step1[30]);
+ step2[2] = vqaddq_s16(step1[2], step1[29]);
+ step2[3] = vqaddq_s16(step1[3], step1[28]);
+ step2[4] = vqaddq_s16(step1[4], step1[27]);
+ step2[5] = vqaddq_s16(step1[5], step1[26]);
+ step2[6] = vqaddq_s16(step1[6], step1[25]);
+ step2[7] = vqaddq_s16(step1[7], step1[24]);
+ step2[8] = vqaddq_s16(step1[8], step1[23]);
+ step2[9] = vqaddq_s16(step1[9], step1[22]);
+ step2[10] = vqaddq_s16(step1[10], step1[21]);
+ step2[11] = vqaddq_s16(step1[11], step1[20]);
+ step2[12] = vqaddq_s16(step1[12], step1[19]);
+ step2[13] = vqaddq_s16(step1[13], step1[18]);
+ step2[14] = vqaddq_s16(step1[14], step1[17]);
+ step2[15] = vqaddq_s16(step1[15], step1[16]);
+ step2[16] = vqsubq_s16(step1[15], step1[16]);
+ step2[17] = vqsubq_s16(step1[14], step1[17]);
+ step2[18] = vqsubq_s16(step1[13], step1[18]);
+ step2[19] = vqsubq_s16(step1[12], step1[19]);
+ step2[20] = vqsubq_s16(step1[11], step1[20]);
+ step2[21] = vqsubq_s16(step1[10], step1[21]);
+ step2[22] = vqsubq_s16(step1[9], step1[22]);
+ step2[23] = vqsubq_s16(step1[8], step1[23]);
+ step2[24] = vqsubq_s16(step1[7], step1[24]);
+ step2[25] = vqsubq_s16(step1[6], step1[25]);
+ step2[26] = vqsubq_s16(step1[5], step1[26]);
+ step2[27] = vqsubq_s16(step1[4], step1[27]);
+ step2[28] = vqsubq_s16(step1[3], step1[28]);
+ step2[29] = vqsubq_s16(step1[2], step1[29]);
+ step2[30] = vqsubq_s16(step1[1], step1[30]);
+ step2[31] = vqsubq_s16(step1[0], step1[31]);
+ step2[32] = step1[32];
+ step2[33] = step1[33];
+ step2[34] = step1[34];
+ step2[35] = step1[35];
+ step2[36] = step1[36];
+ step2[37] = step1[37];
+ step2[38] = step1[38];
+ step2[39] = step1[39];
+ step2[56] = step1[56];
+ step2[57] = step1[57];
+ step2[58] = step1[58];
+ step2[59] = step1[59];
+ step2[60] = step1[60];
+ step2[61] = step1[61];
+ step2[62] = step1[62];
+ step2[63] = step1[63];
+}
+
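+// 64-point inverse DCT for blocks whose nonzero coefficients all lie in the
+// first 32 input rows (in[0]..in[31]); rows 32-63 are treated as zero.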
+static INLINE void idct64_low32_neon(int16x8_t *in, int16x8_t *out,
+ int8_t cos_bit, int bit) {
+ (void)bit;
+ const int32_t *cospi = cospi_arr(cos_bit);
+ int16x8_t step2[64], step1[64];
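+  // c0-c3 pack the cosine pairs consumed by the lane-indexed butterflies;
+  // c4-c7 are their negations, used where a rotation needs the opposite sign.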
+ const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[4], (int16_t)cospi[60],
+ (int16_t)cospi[36], (int16_t)cospi[28]);
+ const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[20], (int16_t)cospi[44],
+ (int16_t)cospi[52], (int16_t)cospi[12]);
+ const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56],
+ (int16_t)cospi[40], (int16_t)cospi[24]);
+ const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
+ (int16_t)cospi[16], (int16_t)cospi[48]);
+ const int16x4_t c4 =
+ set_s16x4_neon((int16_t)(-cospi[4]), (int16_t)(-cospi[60]),
+ (int16_t)(-cospi[36]), (int16_t)(-cospi[28]));
+ const int16x4_t c5 =
+ set_s16x4_neon((int16_t)(-cospi[20]), (int16_t)(-cospi[44]),
+ (int16_t)(-cospi[52]), (int16_t)(-cospi[12]));
+ const int16x4_t c6 =
+ set_s16x4_neon((int16_t)(-cospi[8]), (int16_t)(-cospi[56]),
+ (int16_t)(-cospi[40]), (int16_t)(-cospi[24]));
+ const int16x4_t c7 =
+ set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]),
+ (int16_t)(-cospi[16]), (int16_t)(-cospi[48]));
+
+ // stage 1
+ // stage 2
+
+ step2[0] = in[0];
+ step2[2] = in[16];
+ step2[4] = in[8];
+ step2[6] = in[24];
+ step2[8] = in[4];
+ step2[10] = in[20];
+ step2[12] = in[12];
+ step2[14] = in[28];
+ step2[16] = in[2];
+ step2[18] = in[18];
+ step2[20] = in[10];
+ step2[22] = in[26];
+ step2[24] = in[6];
+ step2[26] = in[22];
+ step2[28] = in[14];
+ step2[30] = in[30];
+
+ btf_16_neon(in[1], cospi[63], cospi[1], &step2[32], &step2[63]);
+ btf_16_neon(in[31], -cospi[33], cospi[31], &step2[33], &step2[62]);
+ btf_16_neon(in[17], cospi[47], cospi[17], &step2[34], &step2[61]);
+ btf_16_neon(in[15], -cospi[49], cospi[15], &step2[35], &step2[60]);
+ btf_16_neon(in[9], cospi[55], cospi[9], &step2[36], &step2[59]);
+ btf_16_neon(in[23], -cospi[41], cospi[23], &step2[37], &step2[58]);
+ btf_16_neon(in[25], cospi[39], cospi[25], &step2[38], &step2[57]);
+ btf_16_neon(in[7], -cospi[57], cospi[7], &step2[39], &step2[56]);
+ btf_16_neon(in[5], cospi[59], cospi[5], &step2[40], &step2[55]);
+ btf_16_neon(in[27], -cospi[37], cospi[27], &step2[41], &step2[54]);
+ btf_16_neon(in[21], cospi[43], cospi[21], &step2[42], &step2[53]);
+ btf_16_neon(in[11], -cospi[53], cospi[11], &step2[43], &step2[52]);
+ btf_16_neon(in[13], cospi[51], cospi[13], &step2[44], &step2[51]);
+ btf_16_neon(in[19], -cospi[45], cospi[19], &step2[45], &step2[50]);
+ btf_16_neon(in[29], cospi[35], cospi[29], &step2[46], &step2[49]);
+ btf_16_neon(in[3], -cospi[61], cospi[3], &step2[47], &step2[48]);
+
+ // stage 3
+
+ step1[0] = step2[0];
+ step1[2] = step2[2];
+ step1[4] = step2[4];
+ step1[6] = step2[6];
+ step1[8] = step2[8];
+ step1[10] = step2[10];
+ step1[12] = step2[12];
+ step1[14] = step2[14];
+
+ btf_16_neon(step2[16], cospi[62], cospi[2], &step1[16], &step1[31]);
+ btf_16_neon(step2[30], -cospi[34], cospi[30], &step1[17], &step1[30]);
+ btf_16_neon(step2[18], cospi[46], cospi[18], &step1[18], &step1[29]);
+ btf_16_neon(step2[28], -cospi[50], cospi[14], &step1[19], &step1[28]);
+ btf_16_neon(step2[20], cospi[54], cospi[10], &step1[20], &step1[27]);
+ btf_16_neon(step2[26], -cospi[42], cospi[22], &step1[21], &step1[26]);
+ btf_16_neon(step2[22], cospi[38], cospi[26], &step1[22], &step1[25]);
+ btf_16_neon(step2[24], -cospi[58], cospi[6], &step1[23], &step1[24]);
+
+ step1[32] = vqaddq_s16(step2[32], step2[33]);
+ step1[33] = vqsubq_s16(step2[32], step2[33]);
+ step1[34] = vqsubq_s16(step2[35], step2[34]);
+ step1[35] = vqaddq_s16(step2[35], step2[34]);
+ step1[36] = vqaddq_s16(step2[36], step2[37]);
+ step1[37] = vqsubq_s16(step2[36], step2[37]);
+ step1[38] = vqsubq_s16(step2[39], step2[38]);
+ step1[39] = vqaddq_s16(step2[39], step2[38]);
+ step1[40] = vqaddq_s16(step2[40], step2[41]);
+ step1[41] = vqsubq_s16(step2[40], step2[41]);
+ step1[42] = vqsubq_s16(step2[43], step2[42]);
+ step1[43] = vqaddq_s16(step2[43], step2[42]);
+ step1[44] = vqaddq_s16(step2[44], step2[45]);
+ step1[45] = vqsubq_s16(step2[44], step2[45]);
+ step1[46] = vqsubq_s16(step2[47], step2[46]);
+ step1[47] = vqaddq_s16(step2[47], step2[46]);
+ step1[48] = vqaddq_s16(step2[48], step2[49]);
+ step1[49] = vqsubq_s16(step2[48], step2[49]);
+ step1[50] = vqsubq_s16(step2[51], step2[50]);
+ step1[51] = vqaddq_s16(step2[51], step2[50]);
+ step1[52] = vqaddq_s16(step2[52], step2[53]);
+ step1[53] = vqsubq_s16(step2[52], step2[53]);
+ step1[54] = vqsubq_s16(step2[55], step2[54]);
+ step1[55] = vqaddq_s16(step2[55], step2[54]);
+ step1[56] = vqaddq_s16(step2[56], step2[57]);
+ step1[57] = vqsubq_s16(step2[56], step2[57]);
+ step1[58] = vqsubq_s16(step2[59], step2[58]);
+ step1[59] = vqaddq_s16(step2[59], step2[58]);
+ step1[60] = vqaddq_s16(step2[60], step2[61]);
+ step1[61] = vqsubq_s16(step2[60], step2[61]);
+ step1[62] = vqsubq_s16(step2[63], step2[62]);
+ step1[63] = vqaddq_s16(step2[63], step2[62]);
+
+ // stage 4
+
+ step2[0] = step1[0];
+ step2[2] = step1[2];
+ step2[4] = step1[4];
+ step2[6] = step1[6];
+
+ btf_16_neon(step1[8], cospi[60], cospi[4], &step2[8], &step2[15]);
+ btf_16_neon(step1[14], -cospi[36], cospi[28], &step2[9], &step2[14]);
+ btf_16_neon(step1[10], cospi[44], cospi[20], &step2[10], &step2[13]);
+ btf_16_neon(step1[12], -cospi[52], cospi[12], &step2[11], &step2[12]);
+ btf_16_lane_0_1_neon(step1[62], step1[33], c0, &step2[62], &step2[33]);
+ btf_16_lane_1_0_neon(step1[34], step1[61], c4, &step2[34], &step2[61]);
+ btf_16_lane_2_3_neon(step1[58], step1[37], c0, &step2[58], &step2[37]);
+ btf_16_lane_3_2_neon(step1[38], step1[57], c4, &step2[38], &step2[57]);
+ btf_16_lane_0_1_neon(step1[54], step1[41], c1, &step2[54], &step2[41]);
+ btf_16_lane_1_0_neon(step1[42], step1[53], c5, &step2[42], &step2[53]);
+ btf_16_lane_2_3_neon(step1[50], step1[45], c1, &step2[50], &step2[45]);
+ btf_16_lane_3_2_neon(step1[46], step1[49], c5, &step2[46], &step2[49]);
+
+ step2[16] = vqaddq_s16(step1[16], step1[17]);
+ step2[17] = vqsubq_s16(step1[16], step1[17]);
+ step2[18] = vqsubq_s16(step1[19], step1[18]);
+ step2[19] = vqaddq_s16(step1[19], step1[18]);
+ step2[20] = vqaddq_s16(step1[20], step1[21]);
+ step2[21] = vqsubq_s16(step1[20], step1[21]);
+ step2[22] = vqsubq_s16(step1[23], step1[22]);
+ step2[23] = vqaddq_s16(step1[23], step1[22]);
+ step2[24] = vqaddq_s16(step1[24], step1[25]);
+ step2[25] = vqsubq_s16(step1[24], step1[25]);
+ step2[26] = vqsubq_s16(step1[27], step1[26]);
+ step2[27] = vqaddq_s16(step1[27], step1[26]);
+ step2[28] = vqaddq_s16(step1[28], step1[29]);
+ step2[29] = vqsubq_s16(step1[28], step1[29]);
+ step2[30] = vqsubq_s16(step1[31], step1[30]);
+ step2[31] = vqaddq_s16(step1[31], step1[30]);
+ step2[32] = step1[32];
+ step2[35] = step1[35];
+ step2[36] = step1[36];
+ step2[39] = step1[39];
+ step2[40] = step1[40];
+ step2[43] = step1[43];
+ step2[44] = step1[44];
+ step2[47] = step1[47];
+ step2[48] = step1[48];
+ step2[51] = step1[51];
+ step2[52] = step1[52];
+ step2[55] = step1[55];
+ step2[56] = step1[56];
+ step2[59] = step1[59];
+ step2[60] = step1[60];
+ step2[63] = step1[63];
+
+ // stage 5
+
+ step1[0] = step2[0];
+ step1[2] = step2[2];
+
+ btf_16_neon(step2[4], cospi[56], cospi[8], &step1[4], &step1[7]);
+ btf_16_neon(step2[6], -cospi[40], cospi[24], &step1[5], &step1[6]);
+ btf_16_lane_0_1_neon(step2[30], step2[17], c2, &step1[30], &step1[17]);
+ btf_16_lane_1_0_neon(step2[18], step2[29], c6, &step1[18], &step1[29]);
+ btf_16_lane_2_3_neon(step2[26], step2[21], c2, &step1[26], &step1[21]);
+ btf_16_lane_3_2_neon(step2[22], step2[25], c6, &step1[22], &step1[25]);
+
+ step1[8] = vqaddq_s16(step2[8], step2[9]);
+ step1[9] = vqsubq_s16(step2[8], step2[9]);
+ step1[10] = vqsubq_s16(step2[11], step2[10]);
+ step1[11] = vqaddq_s16(step2[11], step2[10]);
+ step1[12] = vqaddq_s16(step2[12], step2[13]);
+ step1[13] = vqsubq_s16(step2[12], step2[13]);
+ step1[14] = vqsubq_s16(step2[15], step2[14]);
+ step1[15] = vqaddq_s16(step2[15], step2[14]);
+ step1[16] = step2[16];
+ step1[19] = step2[19];
+ step1[20] = step2[20];
+ step1[23] = step2[23];
+ step1[24] = step2[24];
+ step1[27] = step2[27];
+ step1[28] = step2[28];
+ step1[31] = step2[31];
+ step1[32] = vqaddq_s16(step2[32], step2[35]);
+ step1[33] = vqaddq_s16(step2[33], step2[34]);
+ step1[34] = vqsubq_s16(step2[33], step2[34]);
+ step1[35] = vqsubq_s16(step2[32], step2[35]);
+ step1[36] = vqsubq_s16(step2[39], step2[36]);
+ step1[37] = vqsubq_s16(step2[38], step2[37]);
+ step1[38] = vqaddq_s16(step2[38], step2[37]);
+ step1[39] = vqaddq_s16(step2[39], step2[36]);
+ step1[40] = vqaddq_s16(step2[40], step2[43]);
+ step1[41] = vqaddq_s16(step2[41], step2[42]);
+ step1[42] = vqsubq_s16(step2[41], step2[42]);
+ step1[43] = vqsubq_s16(step2[40], step2[43]);
+ step1[44] = vqsubq_s16(step2[47], step2[44]);
+ step1[45] = vqsubq_s16(step2[46], step2[45]);
+ step1[46] = vqaddq_s16(step2[46], step2[45]);
+ step1[47] = vqaddq_s16(step2[47], step2[44]);
+ step1[48] = vqaddq_s16(step2[48], step2[51]);
+ step1[49] = vqaddq_s16(step2[49], step2[50]);
+ step1[50] = vqsubq_s16(step2[49], step2[50]);
+ step1[51] = vqsubq_s16(step2[48], step2[51]);
+ step1[52] = vqsubq_s16(step2[55], step2[52]);
+ step1[53] = vqsubq_s16(step2[54], step2[53]);
+ step1[54] = vqaddq_s16(step2[54], step2[53]);
+ step1[55] = vqaddq_s16(step2[55], step2[52]);
+ step1[56] = vqaddq_s16(step2[56], step2[59]);
+ step1[57] = vqaddq_s16(step2[57], step2[58]);
+ step1[58] = vqsubq_s16(step2[57], step2[58]);
+ step1[59] = vqsubq_s16(step2[56], step2[59]);
+ step1[60] = vqsubq_s16(step2[63], step2[60]);
+ step1[61] = vqsubq_s16(step2[62], step2[61]);
+ step1[62] = vqaddq_s16(step2[62], step2[61]);
+ step1[63] = vqaddq_s16(step2[63], step2[60]);
+
+ // stage 6
+
+ btf_16_neon(step1[0], cospi[32], cospi[32], &step2[0], &step2[1]);
+ btf_16_neon(step1[2], cospi[48], cospi[16], &step2[2], &step2[3]);
+ btf_16_lane_2_3_neon(step1[14], step1[9], c3, &step2[14], &step2[9]);
+ btf_16_lane_3_2_neon(step1[10], step1[13], c7, &step2[10], &step2[13]);
+ btf_16_lane_0_1_neon(step1[61], step1[34], c2, &step2[61], &step2[34]);
+ btf_16_lane_0_1_neon(step1[60], step1[35], c2, &step2[60], &step2[35]);
+ btf_16_lane_1_0_neon(step1[36], step1[59], c6, &step2[36], &step2[59]);
+ btf_16_lane_1_0_neon(step1[37], step1[58], c6, &step2[37], &step2[58]);
+ btf_16_lane_2_3_neon(step1[53], step1[42], c2, &step2[53], &step2[42]);
+ btf_16_lane_2_3_neon(step1[52], step1[43], c2, &step2[52], &step2[43]);
+ btf_16_lane_3_2_neon(step1[44], step1[51], c6, &step2[44], &step2[51]);
+ btf_16_lane_3_2_neon(step1[45], step1[50], c6, &step2[45], &step2[50]);
+
+ step2[4] = vqaddq_s16(step1[4], step1[5]);
+ step2[5] = vqsubq_s16(step1[4], step1[5]);
+ step2[6] = vqsubq_s16(step1[7], step1[6]);
+ step2[7] = vqaddq_s16(step1[7], step1[6]);
+ step2[8] = step1[8];
+ step2[11] = step1[11];
+ step2[12] = step1[12];
+ step2[15] = step1[15];
+ step2[16] = vqaddq_s16(step1[16], step1[19]);
+ step2[17] = vqaddq_s16(step1[17], step1[18]);
+ step2[18] = vqsubq_s16(step1[17], step1[18]);
+ step2[19] = vqsubq_s16(step1[16], step1[19]);
+ step2[20] = vqsubq_s16(step1[23], step1[20]);
+ step2[21] = vqsubq_s16(step1[22], step1[21]);
+ step2[22] = vqaddq_s16(step1[22], step1[21]);
+ step2[23] = vqaddq_s16(step1[23], step1[20]);
+ step2[24] = vqaddq_s16(step1[24], step1[27]);
+ step2[25] = vqaddq_s16(step1[25], step1[26]);
+ step2[26] = vqsubq_s16(step1[25], step1[26]);
+ step2[27] = vqsubq_s16(step1[24], step1[27]);
+ step2[28] = vqsubq_s16(step1[31], step1[28]);
+ step2[29] = vqsubq_s16(step1[30], step1[29]);
+ step2[30] = vqaddq_s16(step1[30], step1[29]);
+ step2[31] = vqaddq_s16(step1[31], step1[28]);
+ step2[32] = step1[32];
+ step2[33] = step1[33];
+ step2[38] = step1[38];
+ step2[39] = step1[39];
+ step2[40] = step1[40];
+ step2[41] = step1[41];
+ step2[46] = step1[46];
+ step2[47] = step1[47];
+ step2[48] = step1[48];
+ step2[49] = step1[49];
+ step2[54] = step1[54];
+ step2[55] = step1[55];
+ step2[56] = step1[56];
+ step2[57] = step1[57];
+ step2[62] = step1[62];
+ step2[63] = step1[63];
+
+ // stage 7
+
+ btf_16_lane_0_1_neon(step2[6], step2[5], c3, &step1[6], &step1[5]);
+ btf_16_lane_2_3_neon(step2[29], step2[18], c3, &step1[29], &step1[18]);
+ btf_16_lane_2_3_neon(step2[28], step2[19], c3, &step1[28], &step1[19]);
+ btf_16_lane_3_2_neon(step2[20], step2[27], c7, &step1[20], &step1[27]);
+ btf_16_lane_3_2_neon(step2[21], step2[26], c7, &step1[21], &step1[26]);
+
+ step1[0] = vqaddq_s16(step2[0], step2[3]);
+ step1[1] = vqaddq_s16(step2[1], step2[2]);
+ step1[2] = vqsubq_s16(step2[1], step2[2]);
+ step1[3] = vqsubq_s16(step2[0], step2[3]);
+ step1[4] = step2[4];
+ step1[7] = step2[7];
+ step1[8] = vqaddq_s16(step2[8], step2[11]);
+ step1[9] = vqaddq_s16(step2[9], step2[10]);
+ step1[10] = vqsubq_s16(step2[9], step2[10]);
+ step1[11] = vqsubq_s16(step2[8], step2[11]);
+ step1[12] = vqsubq_s16(step2[15], step2[12]);
+ step1[13] = vqsubq_s16(step2[14], step2[13]);
+ step1[14] = vqaddq_s16(step2[14], step2[13]);
+ step1[15] = vqaddq_s16(step2[15], step2[12]);
+ step1[16] = step2[16];
+ step1[17] = step2[17];
+ step1[22] = step2[22];
+ step1[23] = step2[23];
+ step1[24] = step2[24];
+ step1[25] = step2[25];
+ step1[30] = step2[30];
+ step1[31] = step2[31];
+ step1[32] = vqaddq_s16(step2[32], step2[39]);
+ step1[33] = vqaddq_s16(step2[33], step2[38]);
+ step1[34] = vqaddq_s16(step2[34], step2[37]);
+ step1[35] = vqaddq_s16(step2[35], step2[36]);
+ step1[36] = vqsubq_s16(step2[35], step2[36]);
+ step1[37] = vqsubq_s16(step2[34], step2[37]);
+ step1[38] = vqsubq_s16(step2[33], step2[38]);
+ step1[39] = vqsubq_s16(step2[32], step2[39]);
+ step1[40] = vqsubq_s16(step2[47], step2[40]);
+ step1[41] = vqsubq_s16(step2[46], step2[41]);
+ step1[42] = vqsubq_s16(step2[45], step2[42]);
+ step1[43] = vqsubq_s16(step2[44], step2[43]);
+ step1[44] = vqaddq_s16(step2[43], step2[44]);
+ step1[45] = vqaddq_s16(step2[42], step2[45]);
+ step1[46] = vqaddq_s16(step2[41], step2[46]);
+ step1[47] = vqaddq_s16(step2[40], step2[47]);
+ step1[48] = vqaddq_s16(step2[48], step2[55]);
+ step1[49] = vqaddq_s16(step2[49], step2[54]);
+ step1[50] = vqaddq_s16(step2[50], step2[53]);
+ step1[51] = vqaddq_s16(step2[51], step2[52]);
+ step1[52] = vqsubq_s16(step2[51], step2[52]);
+ step1[53] = vqsubq_s16(step2[50], step2[53]);
+ step1[54] = vqsubq_s16(step2[49], step2[54]);
+ step1[55] = vqsubq_s16(step2[48], step2[55]);
+ step1[56] = vqsubq_s16(step2[63], step2[56]);
+ step1[57] = vqsubq_s16(step2[62], step2[57]);
+ step1[58] = vqsubq_s16(step2[61], step2[58]);
+ step1[59] = vqsubq_s16(step2[60], step2[59]);
+ step1[60] = vqaddq_s16(step2[59], step2[60]);
+ step1[61] = vqaddq_s16(step2[58], step2[61]);
+ step1[62] = vqaddq_s16(step2[57], step2[62]);
+ step1[63] = vqaddq_s16(step2[56], step2[63]);
+
+ // stage 8
+
+ btf_16_lane_0_1_neon(step1[13], step1[10], c3, &step2[13], &step2[10]);
+ btf_16_lane_0_1_neon(step1[12], step1[11], c3, &step2[12], &step2[11]);
+ btf_16_lane_2_3_neon(step1[59], step1[36], c3, &step2[59], &step2[36]);
+ btf_16_lane_2_3_neon(step1[58], step1[37], c3, &step2[58], &step2[37]);
+ btf_16_lane_2_3_neon(step1[57], step1[38], c3, &step2[57], &step2[38]);
+ btf_16_lane_2_3_neon(step1[56], step1[39], c3, &step2[56], &step2[39]);
+ btf_16_lane_3_2_neon(step1[40], step1[55], c7, &step2[40], &step2[55]);
+ btf_16_lane_3_2_neon(step1[41], step1[54], c7, &step2[41], &step2[54]);
+ btf_16_lane_3_2_neon(step1[42], step1[53], c7, &step2[42], &step2[53]);
+ btf_16_lane_3_2_neon(step1[43], step1[52], c7, &step2[43], &step2[52]);
+
+ step2[0] = vqaddq_s16(step1[0], step1[7]);
+ step2[1] = vqaddq_s16(step1[1], step1[6]);
+ step2[2] = vqaddq_s16(step1[2], step1[5]);
+ step2[3] = vqaddq_s16(step1[3], step1[4]);
+ step2[4] = vqsubq_s16(step1[3], step1[4]);
+ step2[5] = vqsubq_s16(step1[2], step1[5]);
+ step2[6] = vqsubq_s16(step1[1], step1[6]);
+ step2[7] = vqsubq_s16(step1[0], step1[7]);
+ step2[8] = step1[8];
+ step2[9] = step1[9];
+ step2[14] = step1[14];
+ step2[15] = step1[15];
+ step2[16] = vqaddq_s16(step1[16], step1[23]);
+ step2[17] = vqaddq_s16(step1[17], step1[22]);
+ step2[18] = vqaddq_s16(step1[18], step1[21]);
+ step2[19] = vqaddq_s16(step1[19], step1[20]);
+ step2[20] = vqsubq_s16(step1[19], step1[20]);
+ step2[21] = vqsubq_s16(step1[18], step1[21]);
+ step2[22] = vqsubq_s16(step1[17], step1[22]);
+ step2[23] = vqsubq_s16(step1[16], step1[23]);
+ step2[24] = vqsubq_s16(step1[31], step1[24]);
+ step2[25] = vqsubq_s16(step1[30], step1[25]);
+ step2[26] = vqsubq_s16(step1[29], step1[26]);
+ step2[27] = vqsubq_s16(step1[28], step1[27]);
+ step2[28] = vqaddq_s16(step1[28], step1[27]);
+ step2[29] = vqaddq_s16(step1[29], step1[26]);
+ step2[30] = vqaddq_s16(step1[30], step1[25]);
+ step2[31] = vqaddq_s16(step1[31], step1[24]);
+ step2[32] = step1[32];
+ step2[33] = step1[33];
+ step2[34] = step1[34];
+ step2[35] = step1[35];
+ step2[44] = step1[44];
+ step2[45] = step1[45];
+ step2[46] = step1[46];
+ step2[47] = step1[47];
+ step2[48] = step1[48];
+ step2[49] = step1[49];
+ step2[50] = step1[50];
+ step2[51] = step1[51];
+ step2[60] = step1[60];
+ step2[61] = step1[61];
+ step2[62] = step1[62];
+ step2[63] = step1[63];
+
+ // stage 9
+ idct64_stage9_neon(step2, step1, cos_bit);
+
+ // stage 10
+ idct64_stage10_neon(step1, step2, cos_bit);
+
+ // stage 11
+
+ out[0] = vqaddq_s16(step2[0], step2[63]);
+ out[1] = vqaddq_s16(step2[1], step2[62]);
+ out[2] = vqaddq_s16(step2[2], step2[61]);
+ out[3] = vqaddq_s16(step2[3], step2[60]);
+ out[4] = vqaddq_s16(step2[4], step2[59]);
+ out[5] = vqaddq_s16(step2[5], step2[58]);
+ out[6] = vqaddq_s16(step2[6], step2[57]);
+ out[7] = vqaddq_s16(step2[7], step2[56]);
+ out[8] = vqaddq_s16(step2[8], step2[55]);
+ out[9] = vqaddq_s16(step2[9], step2[54]);
+ out[10] = vqaddq_s16(step2[10], step2[53]);
+ out[11] = vqaddq_s16(step2[11], step2[52]);
+ out[12] = vqaddq_s16(step2[12], step2[51]);
+ out[13] = vqaddq_s16(step2[13], step2[50]);
+ out[14] = vqaddq_s16(step2[14], step2[49]);
+ out[15] = vqaddq_s16(step2[15], step2[48]);
+ out[16] = vqaddq_s16(step2[16], step2[47]);
+ out[17] = vqaddq_s16(step2[17], step2[46]);
+ out[18] = vqaddq_s16(step2[18], step2[45]);
+ out[19] = vqaddq_s16(step2[19], step2[44]);
+ out[20] = vqaddq_s16(step2[20], step2[43]);
+ out[21] = vqaddq_s16(step2[21], step2[42]);
+ out[22] = vqaddq_s16(step2[22], step2[41]);
+ out[23] = vqaddq_s16(step2[23], step2[40]);
+ out[24] = vqaddq_s16(step2[24], step2[39]);
+ out[25] = vqaddq_s16(step2[25], step2[38]);
+ out[26] = vqaddq_s16(step2[26], step2[37]);
+ out[27] = vqaddq_s16(step2[27], step2[36]);
+ out[28] = vqaddq_s16(step2[28], step2[35]);
+ out[29] = vqaddq_s16(step2[29], step2[34]);
+ out[30] = vqaddq_s16(step2[30], step2[33]);
+ out[31] = vqaddq_s16(step2[31], step2[32]);
+ out[32] = vqsubq_s16(step2[31], step2[32]);
+ out[33] = vqsubq_s16(step2[30], step2[33]);
+ out[34] = vqsubq_s16(step2[29], step2[34]);
+ out[35] = vqsubq_s16(step2[28], step2[35]);
+ out[36] = vqsubq_s16(step2[27], step2[36]);
+ out[37] = vqsubq_s16(step2[26], step2[37]);
+ out[38] = vqsubq_s16(step2[25], step2[38]);
+ out[39] = vqsubq_s16(step2[24], step2[39]);
+ out[40] = vqsubq_s16(step2[23], step2[40]);
+ out[41] = vqsubq_s16(step2[22], step2[41]);
+ out[42] = vqsubq_s16(step2[21], step2[42]);
+ out[43] = vqsubq_s16(step2[20], step2[43]);
+ out[44] = vqsubq_s16(step2[19], step2[44]);
+ out[45] = vqsubq_s16(step2[18], step2[45]);
+ out[46] = vqsubq_s16(step2[17], step2[46]);
+ out[47] = vqsubq_s16(step2[16], step2[47]);
+ out[48] = vqsubq_s16(step2[15], step2[48]);
+ out[49] = vqsubq_s16(step2[14], step2[49]);
+ out[50] = vqsubq_s16(step2[13], step2[50]);
+ out[51] = vqsubq_s16(step2[12], step2[51]);
+ out[52] = vqsubq_s16(step2[11], step2[52]);
+ out[53] = vqsubq_s16(step2[10], step2[53]);
+ out[54] = vqsubq_s16(step2[9], step2[54]);
+ out[55] = vqsubq_s16(step2[8], step2[55]);
+ out[56] = vqsubq_s16(step2[7], step2[56]);
+ out[57] = vqsubq_s16(step2[6], step2[57]);
+ out[58] = vqsubq_s16(step2[5], step2[58]);
+ out[59] = vqsubq_s16(step2[4], step2[59]);
+ out[60] = vqsubq_s16(step2[3], step2[60]);
+ out[61] = vqsubq_s16(step2[2], step2[61]);
+ out[62] = vqsubq_s16(step2[1], step2[62]);
+ out[63] = vqsubq_s16(step2[0], step2[63]);
+}
+
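+// DC-only 64-point inverse DCT: with only in[0] nonzero, the transform
+// reduces to scaling the DC term by cospi[32] and replicating it to all 64
+// output rows.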
+static INLINE void idct64_low1_neon(int16x8_t *input, int16x8_t *out,
+ int8_t cos_bit, int bit) {
+ (void)bit;
+ const int32_t *cospi = cospi_arr(cos_bit);
+ int16x8_t step1;
+ int32x4_t t32[2];
+
+ // stage 1
+ // stage 2
+ // stage 3
+ // stage 4
+ // stage 5
+ // stage 6
+
+ t32[0] = vmull_n_s16(vget_low_s16(input[0]), cospi[32]);
+ t32[1] = vmull_n_s16(vget_high_s16(input[0]), cospi[32]);
+
+ step1 = vcombine_s16(vrshrn_n_s32(t32[0], INV_COS_BIT),
+ vrshrn_n_s32(t32[1], INV_COS_BIT));
+ // stage 7
+ // stage 8
+ // stage 9
+ // stage 10
+ // stage 11
+ out[0] = step1;
+ out[1] = step1;
+ out[2] = step1;
+ out[3] = step1;
+ out[4] = step1;
+ out[5] = step1;
+ out[6] = step1;
+ out[7] = step1;
+ out[8] = step1;
+ out[9] = step1;
+ out[10] = step1;
+ out[11] = step1;
+ out[12] = step1;
+ out[13] = step1;
+ out[14] = step1;
+ out[15] = step1;
+ out[16] = step1;
+ out[17] = step1;
+ out[18] = step1;
+ out[19] = step1;
+ out[20] = step1;
+ out[21] = step1;
+ out[22] = step1;
+ out[23] = step1;
+ out[24] = step1;
+ out[25] = step1;
+ out[26] = step1;
+ out[27] = step1;
+ out[28] = step1;
+ out[29] = step1;
+ out[30] = step1;
+ out[31] = step1;
+ out[32] = step1;
+ out[33] = step1;
+ out[34] = step1;
+ out[35] = step1;
+ out[36] = step1;
+ out[37] = step1;
+ out[38] = step1;
+ out[39] = step1;
+ out[40] = step1;
+ out[41] = step1;
+ out[42] = step1;
+ out[43] = step1;
+ out[44] = step1;
+ out[45] = step1;
+ out[46] = step1;
+ out[47] = step1;
+ out[48] = step1;
+ out[49] = step1;
+ out[50] = step1;
+ out[51] = step1;
+ out[52] = step1;
+ out[53] = step1;
+ out[54] = step1;
+ out[55] = step1;
+ out[56] = step1;
+ out[57] = step1;
+ out[58] = step1;
+ out[59] = step1;
+ out[60] = step1;
+ out[61] = step1;
+ out[62] = step1;
+ out[63] = step1;
+}
+
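+// 64-point inverse DCT for blocks whose nonzero coefficients all lie in the
+// first 8 input rows; the implied zero rows let most butterflies collapse to
+// plain copies.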
+static INLINE void idct64_low8_neon(int16x8_t *in, int16x8_t *out,
+ int8_t cos_bit, int bit) {
+ (void)bit;
+ const int32_t *cospi = cospi_arr(cos_bit);
+ int16x8_t step2[64], step1[64];
+
+ const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[4], (int16_t)cospi[60],
+ (int16_t)cospi[36], (int16_t)cospi[28]);
+ const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[20], (int16_t)cospi[44],
+ (int16_t)cospi[52], (int16_t)cospi[12]);
+ const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56],
+ (int16_t)cospi[40], (int16_t)cospi[24]);
+ const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
+ (int16_t)cospi[16], (int16_t)cospi[48]);
+ const int16x4_t c4 =
+ set_s16x4_neon((int16_t)(-cospi[36]), (int16_t)(-cospi[28]),
+ (int16_t)(-cospi[52]), (int16_t)(-cospi[12]));
+ const int16x4_t c5 =
+ set_s16x4_neon((int16_t)(-cospi[8]), (int16_t)(-cospi[56]),
+ (int16_t)(-cospi[40]), (int16_t)(-cospi[24]));
+ const int16x4_t c6 =
+ set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]),
+ (int16_t)(-cospi[16]), (int16_t)(-cospi[48]));
+
+ // stage 1
+ // stage 2
+
+ step2[0] = in[0];
+ step2[8] = in[4];
+ step2[16] = in[2];
+ step2[24] = in[6];
+
+ btf_16_neon(in[1], cospi[63], cospi[1], &step2[32], &step2[63]);
+ btf_16_neon(in[7], -cospi[57], cospi[7], &step2[39], &step2[56]);
+ btf_16_neon(in[5], cospi[59], cospi[5], &step2[40], &step2[55]);
+ btf_16_neon(in[3], -cospi[61], cospi[3], &step2[47], &step2[48]);
+
+ // stage 3
+
+ step1[0] = step2[0];
+ step1[8] = step2[8];
+
+ btf_16_neon(step2[16], cospi[62], cospi[2], &step1[16], &step1[31]);
+ btf_16_neon(step2[24], -cospi[58], cospi[6], &step1[23], &step1[24]);
+
+ step1[32] = step2[32];
+ step1[33] = step2[32];
+ step1[38] = step2[39];
+ step1[39] = step2[39];
+ step1[40] = step2[40];
+ step1[41] = step2[40];
+ step1[46] = step2[47];
+ step1[47] = step2[47];
+ step1[48] = step2[48];
+ step1[49] = step2[48];
+ step1[54] = step2[55];
+ step1[55] = step2[55];
+ step1[56] = step2[56];
+ step1[57] = step2[56];
+ step1[62] = step2[63];
+ step1[63] = step2[63];
+
+ // stage 4
+
+ step2[0] = step1[0];
+
+ btf_16_neon(step1[8], cospi[60], cospi[4], &step2[8], &step2[15]);
+ btf_16_lane_0_1_neon(step1[62], step1[33], c0, &step2[62], &step2[33]);
+ btf_16_lane_1_0_neon(step1[38], step1[57], c4, &step2[38], &step2[57]);
+ btf_16_lane_0_1_neon(step1[54], step1[41], c1, &step2[54], &step2[41]);
+ btf_16_lane_3_2_neon(step1[46], step1[49], c4, &step2[46], &step2[49]);
+
+ step2[16] = step1[16];
+ step2[17] = step1[16];
+ step2[22] = step1[23];
+ step2[23] = step1[23];
+ step2[24] = step1[24];
+ step2[25] = step1[24];
+ step2[30] = step1[31];
+ step2[31] = step1[31];
+ step2[32] = step1[32];
+ step2[39] = step1[39];
+ step2[40] = step1[40];
+ step2[47] = step1[47];
+ step2[48] = step1[48];
+ step2[55] = step1[55];
+ step2[56] = step1[56];
+ step2[63] = step1[63];
+
+ // stage 5
+
+ step1[0] = step2[0];
+
+ btf_16_lane_0_1_neon(step2[30], step2[17], c2, &step1[30], &step1[17]);
+ btf_16_lane_3_2_neon(step2[22], step2[25], c5, &step1[22], &step1[25]);
+
+ step1[8] = step2[8];
+ step1[9] = step2[8];
+ step1[14] = step2[15];
+ step1[15] = step2[15];
+
+ step1[16] = step2[16];
+ step1[23] = step2[23];
+ step1[24] = step2[24];
+ step1[31] = step2[31];
+ step1[32] = step2[32];
+ step1[33] = step2[33];
+ step1[34] = step2[33];
+ step1[35] = step2[32];
+ step1[36] = step2[39];
+ step1[37] = step2[38];
+ step1[38] = step2[38];
+ step1[39] = step2[39];
+ step1[40] = step2[40];
+ step1[41] = step2[41];
+ step1[42] = step2[41];
+ step1[43] = step2[40];
+ step1[44] = step2[47];
+ step1[45] = step2[46];
+ step1[46] = step2[46];
+ step1[47] = step2[47];
+ step1[48] = step2[48];
+ step1[49] = step2[49];
+ step1[50] = step2[49];
+ step1[51] = step2[48];
+ step1[52] = step2[55];
+ step1[53] = step2[54];
+ step1[54] = step2[54];
+ step1[55] = step2[55];
+ step1[56] = step2[56];
+ step1[57] = step2[57];
+ step1[58] = step2[57];
+ step1[59] = step2[56];
+ step1[60] = step2[63];
+ step1[61] = step2[62];
+ step1[62] = step2[62];
+ step1[63] = step2[63];
+
+ // stage 6
+
+ btf_16_neon(step1[0], cospi[32], cospi[32], &step2[0], &step2[1]);
+ btf_16_lane_2_3_neon(step1[14], step1[9], c3, &step2[14], &step2[9]);
+ btf_16_lane_0_1_neon(step1[61], step1[34], c2, &step2[61], &step2[34]);
+ btf_16_lane_0_1_neon(step1[60], step1[35], c2, &step2[60], &step2[35]);
+ btf_16_lane_1_0_neon(step1[36], step1[59], c5, &step2[36], &step2[59]);
+ btf_16_lane_1_0_neon(step1[37], step1[58], c5, &step2[37], &step2[58]);
+ btf_16_lane_2_3_neon(step1[53], step1[42], c2, &step2[53], &step2[42]);
+ btf_16_lane_2_3_neon(step1[52], step1[43], c2, &step2[52], &step2[43]);
+ btf_16_lane_3_2_neon(step1[44], step1[51], c5, &step2[44], &step2[51]);
+ btf_16_lane_3_2_neon(step1[45], step1[50], c5, &step2[45], &step2[50]);
+
+ step2[8] = step1[8];
+ step2[15] = step1[15];
+ step2[16] = step1[16];
+ step2[17] = step1[17];
+ step2[18] = step1[17];
+ step2[19] = step1[16];
+ step2[20] = step1[23];
+ step2[21] = step1[22];
+ step2[22] = step1[22];
+ step2[23] = step1[23];
+ step2[24] = step1[24];
+ step2[25] = step1[25];
+ step2[26] = step1[25];
+ step2[27] = step1[24];
+ step2[28] = step1[31];
+ step2[29] = step1[30];
+ step2[30] = step1[30];
+ step2[31] = step1[31];
+ step2[32] = step1[32];
+ step2[33] = step1[33];
+ step2[38] = step1[38];
+ step2[39] = step1[39];
+ step2[40] = step1[40];
+ step2[41] = step1[41];
+ step2[46] = step1[46];
+ step2[47] = step1[47];
+ step2[48] = step1[48];
+ step2[49] = step1[49];
+ step2[54] = step1[54];
+ step2[55] = step1[55];
+ step2[56] = step1[56];
+ step2[57] = step1[57];
+ step2[62] = step1[62];
+ step2[63] = step1[63];
+
+ // stage 7
+
+ btf_16_lane_2_3_neon(step2[29], step2[18], c3, &step1[29], &step1[18]);
+ btf_16_lane_2_3_neon(step2[28], step2[19], c3, &step1[28], &step1[19]);
+ btf_16_lane_3_2_neon(step2[20], step2[27], c6, &step1[20], &step1[27]);
+ btf_16_lane_3_2_neon(step2[21], step2[26], c6, &step1[21], &step1[26]);
+
+ step1[0] = step2[0];
+ step1[1] = step2[1];
+ step1[2] = step2[1];
+ step1[3] = step2[0];
+ step1[8] = step2[8];
+ step1[9] = step2[9];
+ step1[10] = step2[9];
+ step1[11] = step2[8];
+ step1[12] = step2[15];
+ step1[13] = step2[14];
+ step1[14] = step2[14];
+ step1[15] = step2[15];
+ step1[16] = step2[16];
+ step1[17] = step2[17];
+ step1[22] = step2[22];
+ step1[23] = step2[23];
+ step1[24] = step2[24];
+ step1[25] = step2[25];
+ step1[30] = step2[30];
+ step1[31] = step2[31];
+ step1[32] = vqaddq_s16(step2[32], step2[39]);
+ step1[33] = vqaddq_s16(step2[33], step2[38]);
+ step1[34] = vqaddq_s16(step2[34], step2[37]);
+ step1[35] = vqaddq_s16(step2[35], step2[36]);
+ step1[36] = vqsubq_s16(step2[35], step2[36]);
+ step1[37] = vqsubq_s16(step2[34], step2[37]);
+ step1[38] = vqsubq_s16(step2[33], step2[38]);
+ step1[39] = vqsubq_s16(step2[32], step2[39]);
+ step1[40] = vqsubq_s16(step2[47], step2[40]);
+ step1[41] = vqsubq_s16(step2[46], step2[41]);
+ step1[42] = vqsubq_s16(step2[45], step2[42]);
+ step1[43] = vqsubq_s16(step2[44], step2[43]);
+ step1[44] = vqaddq_s16(step2[43], step2[44]);
+ step1[45] = vqaddq_s16(step2[42], step2[45]);
+ step1[46] = vqaddq_s16(step2[41], step2[46]);
+ step1[47] = vqaddq_s16(step2[40], step2[47]);
+ step1[48] = vqaddq_s16(step2[48], step2[55]);
+ step1[49] = vqaddq_s16(step2[49], step2[54]);
+ step1[50] = vqaddq_s16(step2[50], step2[53]);
+ step1[51] = vqaddq_s16(step2[51], step2[52]);
+ step1[52] = vqsubq_s16(step2[51], step2[52]);
+ step1[53] = vqsubq_s16(step2[50], step2[53]);
+ step1[54] = vqsubq_s16(step2[49], step2[54]);
+ step1[55] = vqsubq_s16(step2[48], step2[55]);
+ step1[56] = vqsubq_s16(step2[63], step2[56]);
+ step1[57] = vqsubq_s16(step2[62], step2[57]);
+ step1[58] = vqsubq_s16(step2[61], step2[58]);
+ step1[59] = vqsubq_s16(step2[60], step2[59]);
+ step1[60] = vqaddq_s16(step2[59], step2[60]);
+ step1[61] = vqaddq_s16(step2[58], step2[61]);
+ step1[62] = vqaddq_s16(step2[57], step2[62]);
+ step1[63] = vqaddq_s16(step2[56], step2[63]);
+
+ // stage 8
+
+ btf_16_lane_0_1_neon(step1[13], step1[10], c3, &step2[13], &step2[10]);
+ btf_16_lane_0_1_neon(step1[12], step1[11], c3, &step2[12], &step2[11]);
+ btf_16_lane_2_3_neon(step1[59], step1[36], c3, &step2[59], &step2[36]);
+ btf_16_lane_2_3_neon(step1[58], step1[37], c3, &step2[58], &step2[37]);
+ btf_16_lane_2_3_neon(step1[57], step1[38], c3, &step2[57], &step2[38]);
+ btf_16_lane_2_3_neon(step1[56], step1[39], c3, &step2[56], &step2[39]);
+ btf_16_lane_3_2_neon(step1[40], step1[55], c6, &step2[40], &step2[55]);
+ btf_16_lane_3_2_neon(step1[41], step1[54], c6, &step2[41], &step2[54]);
+ btf_16_lane_3_2_neon(step1[42], step1[53], c6, &step2[42], &step2[53]);
+ btf_16_lane_3_2_neon(step1[43], step1[52], c6, &step2[43], &step2[52]);
+
+ step2[0] = step1[0];
+ step2[1] = step1[1];
+ step2[2] = step1[2];
+ step2[3] = step1[3];
+ step2[4] = step1[3];
+ step2[5] = step1[2];
+ step2[6] = step1[1];
+ step2[7] = step1[0];
+ step2[8] = step1[8];
+ step2[9] = step1[9];
+ step2[14] = step1[14];
+ step2[15] = step1[15];
+ step2[16] = vqaddq_s16(step1[16], step1[23]);
+ step2[17] = vqaddq_s16(step1[17], step1[22]);
+ step2[18] = vqaddq_s16(step1[18], step1[21]);
+ step2[19] = vqaddq_s16(step1[19], step1[20]);
+ step2[20] = vqsubq_s16(step1[19], step1[20]);
+ step2[21] = vqsubq_s16(step1[18], step1[21]);
+ step2[22] = vqsubq_s16(step1[17], step1[22]);
+ step2[23] = vqsubq_s16(step1[16], step1[23]);
+ step2[24] = vqsubq_s16(step1[31], step1[24]);
+ step2[25] = vqsubq_s16(step1[30], step1[25]);
+ step2[26] = vqsubq_s16(step1[29], step1[26]);
+ step2[27] = vqsubq_s16(step1[28], step1[27]);
+ step2[28] = vqaddq_s16(step1[28], step1[27]);
+ step2[29] = vqaddq_s16(step1[29], step1[26]);
+ step2[30] = vqaddq_s16(step1[30], step1[25]);
+ step2[31] = vqaddq_s16(step1[31], step1[24]);
+ step2[32] = step1[32];
+ step2[33] = step1[33];
+ step2[34] = step1[34];
+ step2[35] = step1[35];
+ step2[44] = step1[44];
+ step2[45] = step1[45];
+ step2[46] = step1[46];
+ step2[47] = step1[47];
+ step2[48] = step1[48];
+ step2[49] = step1[49];
+ step2[50] = step1[50];
+ step2[51] = step1[51];
+ step2[60] = step1[60];
+ step2[61] = step1[61];
+ step2[62] = step1[62];
+ step2[63] = step1[63];
+
+ // stage 9
+ idct64_stage9_neon(step2, step1, cos_bit);
+
+ // stage 10
+ idct64_stage10_neon(step1, step2, cos_bit);
+
+ // stage 11
+
+ out[0] = vqaddq_s16(step2[0], step2[63]);
+ out[1] = vqaddq_s16(step2[1], step2[62]);
+ out[2] = vqaddq_s16(step2[2], step2[61]);
+ out[3] = vqaddq_s16(step2[3], step2[60]);
+ out[4] = vqaddq_s16(step2[4], step2[59]);
+ out[5] = vqaddq_s16(step2[5], step2[58]);
+ out[6] = vqaddq_s16(step2[6], step2[57]);
+ out[7] = vqaddq_s16(step2[7], step2[56]);
+ out[8] = vqaddq_s16(step2[8], step2[55]);
+ out[9] = vqaddq_s16(step2[9], step2[54]);
+ out[10] = vqaddq_s16(step2[10], step2[53]);
+ out[11] = vqaddq_s16(step2[11], step2[52]);
+ out[12] = vqaddq_s16(step2[12], step2[51]);
+ out[13] = vqaddq_s16(step2[13], step2[50]);
+ out[14] = vqaddq_s16(step2[14], step2[49]);
+ out[15] = vqaddq_s16(step2[15], step2[48]);
+ out[16] = vqaddq_s16(step2[16], step2[47]);
+ out[17] = vqaddq_s16(step2[17], step2[46]);
+ out[18] = vqaddq_s16(step2[18], step2[45]);
+ out[19] = vqaddq_s16(step2[19], step2[44]);
+ out[20] = vqaddq_s16(step2[20], step2[43]);
+ out[21] = vqaddq_s16(step2[21], step2[42]);
+ out[22] = vqaddq_s16(step2[22], step2[41]);
+ out[23] = vqaddq_s16(step2[23], step2[40]);
+ out[24] = vqaddq_s16(step2[24], step2[39]);
+ out[25] = vqaddq_s16(step2[25], step2[38]);
+ out[26] = vqaddq_s16(step2[26], step2[37]);
+ out[27] = vqaddq_s16(step2[27], step2[36]);
+ out[28] = vqaddq_s16(step2[28], step2[35]);
+ out[29] = vqaddq_s16(step2[29], step2[34]);
+ out[30] = vqaddq_s16(step2[30], step2[33]);
+ out[31] = vqaddq_s16(step2[31], step2[32]);
+ out[32] = vqsubq_s16(step2[31], step2[32]);
+ out[33] = vqsubq_s16(step2[30], step2[33]);
+ out[34] = vqsubq_s16(step2[29], step2[34]);
+ out[35] = vqsubq_s16(step2[28], step2[35]);
+ out[36] = vqsubq_s16(step2[27], step2[36]);
+ out[37] = vqsubq_s16(step2[26], step2[37]);
+ out[38] = vqsubq_s16(step2[25], step2[38]);
+ out[39] = vqsubq_s16(step2[24], step2[39]);
+ out[40] = vqsubq_s16(step2[23], step2[40]);
+ out[41] = vqsubq_s16(step2[22], step2[41]);
+ out[42] = vqsubq_s16(step2[21], step2[42]);
+ out[43] = vqsubq_s16(step2[20], step2[43]);
+ out[44] = vqsubq_s16(step2[19], step2[44]);
+ out[45] = vqsubq_s16(step2[18], step2[45]);
+ out[46] = vqsubq_s16(step2[17], step2[46]);
+ out[47] = vqsubq_s16(step2[16], step2[47]);
+ out[48] = vqsubq_s16(step2[15], step2[48]);
+ out[49] = vqsubq_s16(step2[14], step2[49]);
+ out[50] = vqsubq_s16(step2[13], step2[50]);
+ out[51] = vqsubq_s16(step2[12], step2[51]);
+ out[52] = vqsubq_s16(step2[11], step2[52]);
+ out[53] = vqsubq_s16(step2[10], step2[53]);
+ out[54] = vqsubq_s16(step2[9], step2[54]);
+ out[55] = vqsubq_s16(step2[8], step2[55]);
+ out[56] = vqsubq_s16(step2[7], step2[56]);
+ out[57] = vqsubq_s16(step2[6], step2[57]);
+ out[58] = vqsubq_s16(step2[5], step2[58]);
+ out[59] = vqsubq_s16(step2[4], step2[59]);
+ out[60] = vqsubq_s16(step2[3], step2[60]);
+ out[61] = vqsubq_s16(step2[2], step2[61]);
+ out[62] = vqsubq_s16(step2[1], step2[62]);
+ out[63] = vqsubq_s16(step2[0], step2[63]);
+}
+
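+// 64-point inverse DCT for blocks whose nonzero coefficients all lie in the
+// first 16 input rows.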
+static INLINE void idct64_low16_neon(int16x8_t *in, int16x8_t *out,
+ int8_t cos_bit, int bit) {
+ (void)bit;
+ const int32_t *cospi = cospi_arr(cos_bit);
+ int16x8_t step2[64], step1[64];
+
+ const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[4], (int16_t)cospi[60],
+ (int16_t)cospi[36], (int16_t)cospi[28]);
+ const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[20], (int16_t)cospi[44],
+ (int16_t)cospi[52], (int16_t)cospi[12]);
+ const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56],
+ (int16_t)cospi[40], (int16_t)cospi[24]);
+ const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
+ (int16_t)cospi[16], (int16_t)cospi[48]);
+ const int16x4_t c4 =
+ set_s16x4_neon((int16_t)(-cospi[4]), (int16_t)(-cospi[60]),
+ (int16_t)(-cospi[36]), (int16_t)(-cospi[28]));
+ const int16x4_t c5 =
+ set_s16x4_neon((int16_t)(-cospi[20]), (int16_t)(-cospi[44]),
+ (int16_t)(-cospi[52]), (int16_t)(-cospi[12]));
+ const int16x4_t c6 =
+ set_s16x4_neon((int16_t)(-cospi[8]), (int16_t)(-cospi[56]),
+ (int16_t)(-cospi[40]), (int16_t)(-cospi[24]));
+ const int16x4_t c7 =
+ set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]),
+ (int16_t)(-cospi[16]), (int16_t)(-cospi[48]));
+
+ // stage 1
+ // stage 2
+
+ step2[0] = in[0];
+ step2[4] = in[8];
+ step2[8] = in[4];
+ step2[12] = in[12];
+ step2[16] = in[2];
+ step2[20] = in[10];
+ step2[24] = in[6];
+ step2[28] = in[14];
+
+ btf_16_neon(in[1], cospi[63], cospi[1], &step2[32], &step2[63]);
+ btf_16_neon(in[15], -cospi[49], cospi[15], &step2[35], &step2[60]);
+ btf_16_neon(in[9], cospi[55], cospi[9], &step2[36], &step2[59]);
+ btf_16_neon(in[7], -cospi[57], cospi[7], &step2[39], &step2[56]);
+ btf_16_neon(in[5], cospi[59], cospi[5], &step2[40], &step2[55]);
+ btf_16_neon(in[11], -cospi[53], cospi[11], &step2[43], &step2[52]);
+ btf_16_neon(in[13], cospi[51], cospi[13], &step2[44], &step2[51]);
+ btf_16_neon(in[3], -cospi[61], cospi[3], &step2[47], &step2[48]);
+
+ // stage 3
+
+ step1[0] = step2[0];
+ step1[4] = step2[4];
+ step1[8] = step2[8];
+ step1[12] = step2[12];
+
+ btf_16_neon(step2[16], cospi[62], cospi[2], &step1[16], &step1[31]);
+ btf_16_neon(step2[20], cospi[54], cospi[10], &step1[20], &step1[27]);
+ btf_16_neon(step2[24], -cospi[58], cospi[6], &step1[23], &step1[24]);
+ btf_16_neon(step2[28], -cospi[50], cospi[14], &step1[19], &step1[28]);
+
+ step1[32] = step2[32];
+ step1[33] = step2[32];
+ step1[34] = step2[35];
+ step1[35] = step2[35];
+ step1[36] = step2[36];
+ step1[37] = step2[36];
+ step1[38] = step2[39];
+ step1[39] = step2[39];
+ step1[40] = step2[40];
+ step1[41] = step2[40];
+ step1[42] = step2[43];
+ step1[43] = step2[43];
+ step1[44] = step2[44];
+ step1[45] = step2[44];
+ step1[46] = step2[47];
+ step1[47] = step2[47];
+ step1[48] = step2[48];
+ step1[49] = step2[48];
+ step1[50] = step2[51];
+ step1[51] = step2[51];
+ step1[52] = step2[52];
+ step1[53] = step2[52];
+ step1[54] = step2[55];
+ step1[55] = step2[55];
+ step1[56] = step2[56];
+ step1[57] = step2[56];
+ step1[58] = step2[59];
+ step1[59] = step2[59];
+ step1[60] = step2[60];
+ step1[61] = step2[60];
+ step1[62] = step2[63];
+ step1[63] = step2[63];
+
+ // stage 4
+
+ step2[0] = step1[0];
+ step2[4] = step1[4];
+
+ btf_16_neon(step1[8], cospi[60], cospi[4], &step2[8], &step2[15]);
+ btf_16_neon(step1[12], -cospi[52], cospi[12], &step2[11], &step2[12]);
+ btf_16_lane_0_1_neon(step1[62], step1[33], c0, &step2[62], &step2[33]);
+ btf_16_lane_1_0_neon(step1[34], step1[61], c4, &step2[34], &step2[61]);
+ btf_16_lane_2_3_neon(step1[58], step1[37], c0, &step2[58], &step2[37]);
+ btf_16_lane_3_2_neon(step1[38], step1[57], c4, &step2[38], &step2[57]);
+ btf_16_lane_0_1_neon(step1[54], step1[41], c1, &step2[54], &step2[41]);
+ btf_16_lane_1_0_neon(step1[42], step1[53], c5, &step2[42], &step2[53]);
+ btf_16_lane_2_3_neon(step1[50], step1[45], c1, &step2[50], &step2[45]);
+ btf_16_lane_3_2_neon(step1[46], step1[49], c5, &step2[46], &step2[49]);
+
+ step2[16] = step1[16];
+ step2[17] = step1[16];
+ step2[18] = step1[19];
+ step2[19] = step1[19];
+ step2[20] = step1[20];
+ step2[21] = step1[20];
+ step2[22] = step1[23];
+ step2[23] = step1[23];
+ step2[24] = step1[24];
+ step2[25] = step1[24];
+ step2[26] = step1[27];
+ step2[27] = step1[27];
+ step2[28] = step1[28];
+ step2[29] = step1[28];
+ step2[30] = step1[31];
+ step2[31] = step1[31];
+ step2[32] = step1[32];
+ step2[35] = step1[35];
+ step2[36] = step1[36];
+ step2[39] = step1[39];
+ step2[40] = step1[40];
+ step2[43] = step1[43];
+ step2[44] = step1[44];
+ step2[47] = step1[47];
+ step2[48] = step1[48];
+ step2[51] = step1[51];
+ step2[52] = step1[52];
+ step2[55] = step1[55];
+ step2[56] = step1[56];
+ step2[59] = step1[59];
+ step2[60] = step1[60];
+ step2[63] = step1[63];
+
+ // stage 5
+
+ step1[0] = step2[0];
+
+ btf_16_neon(step2[4], cospi[56], cospi[8], &step1[4], &step1[7]);
+ btf_16_lane_0_1_neon(step2[30], step2[17], c2, &step1[30], &step1[17]);
+ btf_16_lane_1_0_neon(step2[18], step2[29], c6, &step1[18], &step1[29]);
+ btf_16_lane_2_3_neon(step2[26], step2[21], c2, &step1[26], &step1[21]);
+ btf_16_lane_3_2_neon(step2[22], step2[25], c6, &step1[22], &step1[25]);
+
+ step1[8] = step2[8];
+ step1[9] = step2[8];
+ step1[10] = step2[11];
+ step1[11] = step2[11];
+ step1[12] = step2[12];
+ step1[13] = step2[12];
+ step1[14] = step2[15];
+ step1[15] = step2[15];
+ step1[16] = step2[16];
+ step1[19] = step2[19];
+ step1[20] = step2[20];
+ step1[23] = step2[23];
+ step1[24] = step2[24];
+ step1[27] = step2[27];
+ step1[28] = step2[28];
+ step1[31] = step2[31];
+ step1[32] = vqaddq_s16(step2[32], step2[35]);
+ step1[33] = vqaddq_s16(step2[33], step2[34]);
+ step1[34] = vqsubq_s16(step2[33], step2[34]);
+ step1[35] = vqsubq_s16(step2[32], step2[35]);
+ step1[36] = vqsubq_s16(step2[39], step2[36]);
+ step1[37] = vqsubq_s16(step2[38], step2[37]);
+ step1[38] = vqaddq_s16(step2[38], step2[37]);
+ step1[39] = vqaddq_s16(step2[39], step2[36]);
+ step1[40] = vqaddq_s16(step2[40], step2[43]);
+ step1[41] = vqaddq_s16(step2[41], step2[42]);
+ step1[42] = vqsubq_s16(step2[41], step2[42]);
+ step1[43] = vqsubq_s16(step2[40], step2[43]);
+ step1[44] = vqsubq_s16(step2[47], step2[44]);
+ step1[45] = vqsubq_s16(step2[46], step2[45]);
+ step1[46] = vqaddq_s16(step2[46], step2[45]);
+ step1[47] = vqaddq_s16(step2[47], step2[44]);
+ step1[48] = vqaddq_s16(step2[48], step2[51]);
+ step1[49] = vqaddq_s16(step2[49], step2[50]);
+ step1[50] = vqsubq_s16(step2[49], step2[50]);
+ step1[51] = vqsubq_s16(step2[48], step2[51]);
+ step1[52] = vqsubq_s16(step2[55], step2[52]);
+ step1[53] = vqsubq_s16(step2[54], step2[53]);
+ step1[54] = vqaddq_s16(step2[54], step2[53]);
+ step1[55] = vqaddq_s16(step2[55], step2[52]);
+ step1[56] = vqaddq_s16(step2[56], step2[59]);
+ step1[57] = vqaddq_s16(step2[57], step2[58]);
+ step1[58] = vqsubq_s16(step2[57], step2[58]);
+ step1[59] = vqsubq_s16(step2[56], step2[59]);
+ step1[60] = vqsubq_s16(step2[63], step2[60]);
+ step1[61] = vqsubq_s16(step2[62], step2[61]);
+ step1[62] = vqaddq_s16(step2[62], step2[61]);
+ step1[63] = vqaddq_s16(step2[63], step2[60]);
+
+ // stage 6
+
+ btf_16_neon(step1[0], cospi[32], cospi[32], &step2[0], &step2[1]);
+ btf_16_lane_2_3_neon(step1[14], step1[9], c3, &step2[14], &step2[9]);
+ btf_16_lane_3_2_neon(step1[10], step1[13], c7, &step2[10], &step2[13]);
+ btf_16_lane_0_1_neon(step1[61], step1[34], c2, &step2[61], &step2[34]);
+ btf_16_lane_0_1_neon(step1[60], step1[35], c2, &step2[60], &step2[35]);
+ btf_16_lane_1_0_neon(step1[36], step1[59], c6, &step2[36], &step2[59]);
+ btf_16_lane_1_0_neon(step1[37], step1[58], c6, &step2[37], &step2[58]);
+ btf_16_lane_2_3_neon(step1[53], step1[42], c2, &step2[53], &step2[42]);
+ btf_16_lane_2_3_neon(step1[52], step1[43], c2, &step2[52], &step2[43]);
+ btf_16_lane_3_2_neon(step1[44], step1[51], c6, &step2[44], &step2[51]);
+ btf_16_lane_3_2_neon(step1[45], step1[50], c6, &step2[45], &step2[50]);
+
+ step2[4] = step1[4];
+ step2[5] = step1[4];
+ step2[6] = step1[7];
+ step2[7] = step1[7];
+ step2[8] = step1[8];
+ step2[11] = step1[11];
+ step2[12] = step1[12];
+ step2[15] = step1[15];
+ step2[16] = vqaddq_s16(step1[16], step1[19]);
+ step2[17] = vqaddq_s16(step1[17], step1[18]);
+ step2[18] = vqsubq_s16(step1[17], step1[18]);
+ step2[19] = vqsubq_s16(step1[16], step1[19]);
+ step2[20] = vqsubq_s16(step1[23], step1[20]);
+ step2[21] = vqsubq_s16(step1[22], step1[21]);
+ step2[22] = vqaddq_s16(step1[22], step1[21]);
+ step2[23] = vqaddq_s16(step1[23], step1[20]);
+ step2[24] = vqaddq_s16(step1[24], step1[27]);
+ step2[25] = vqaddq_s16(step1[25], step1[26]);
+ step2[26] = vqsubq_s16(step1[25], step1[26]);
+ step2[27] = vqsubq_s16(step1[24], step1[27]);
+ step2[28] = vqsubq_s16(step1[31], step1[28]);
+ step2[29] = vqsubq_s16(step1[30], step1[29]);
+ step2[30] = vqaddq_s16(step1[30], step1[29]);
+ step2[31] = vqaddq_s16(step1[31], step1[28]);
+ step2[32] = step1[32];
+ step2[33] = step1[33];
+ step2[38] = step1[38];
+ step2[39] = step1[39];
+ step2[40] = step1[40];
+ step2[41] = step1[41];
+ step2[46] = step1[46];
+ step2[47] = step1[47];
+ step2[48] = step1[48];
+ step2[49] = step1[49];
+ step2[54] = step1[54];
+ step2[55] = step1[55];
+ step2[56] = step1[56];
+ step2[57] = step1[57];
+ step2[62] = step1[62];
+ step2[63] = step1[63];
+
+ // stage 7
+
+ btf_16_lane_0_1_neon(step2[6], step2[5], c3, &step1[6], &step1[5]);
+ btf_16_lane_2_3_neon(step2[29], step2[18], c3, &step1[29], &step1[18]);
+ btf_16_lane_2_3_neon(step2[28], step2[19], c3, &step1[28], &step1[19]);
+ btf_16_lane_3_2_neon(step2[20], step2[27], c7, &step1[20], &step1[27]);
+ btf_16_lane_3_2_neon(step2[21], step2[26], c7, &step1[21], &step1[26]);
+
+ step1[0] = step2[0];
+ step1[1] = step2[1];
+ step1[2] = step2[1];
+ step1[3] = step2[0];
+ step1[4] = step2[4];
+ step1[7] = step2[7];
+ step1[8] = vqaddq_s16(step2[8], step2[11]);
+ step1[9] = vqaddq_s16(step2[9], step2[10]);
+ step1[10] = vqsubq_s16(step2[9], step2[10]);
+ step1[11] = vqsubq_s16(step2[8], step2[11]);
+ step1[12] = vqsubq_s16(step2[15], step2[12]);
+ step1[13] = vqsubq_s16(step2[14], step2[13]);
+ step1[14] = vqaddq_s16(step2[14], step2[13]);
+ step1[15] = vqaddq_s16(step2[15], step2[12]);
+ step1[16] = step2[16];
+ step1[17] = step2[17];
+ step1[22] = step2[22];
+ step1[23] = step2[23];
+ step1[24] = step2[24];
+ step1[25] = step2[25];
+ step1[30] = step2[30];
+ step1[31] = step2[31];
+ step1[32] = vqaddq_s16(step2[32], step2[39]);
+ step1[33] = vqaddq_s16(step2[33], step2[38]);
+ step1[34] = vqaddq_s16(step2[34], step2[37]);
+ step1[35] = vqaddq_s16(step2[35], step2[36]);
+ step1[36] = vqsubq_s16(step2[35], step2[36]);
+ step1[37] = vqsubq_s16(step2[34], step2[37]);
+ step1[38] = vqsubq_s16(step2[33], step2[38]);
+ step1[39] = vqsubq_s16(step2[32], step2[39]);
+ step1[40] = vqsubq_s16(step2[47], step2[40]);
+ step1[41] = vqsubq_s16(step2[46], step2[41]);
+ step1[42] = vqsubq_s16(step2[45], step2[42]);
+ step1[43] = vqsubq_s16(step2[44], step2[43]);
+ step1[44] = vqaddq_s16(step2[43], step2[44]);
+ step1[45] = vqaddq_s16(step2[42], step2[45]);
+ step1[46] = vqaddq_s16(step2[41], step2[46]);
+ step1[47] = vqaddq_s16(step2[40], step2[47]);
+ step1[48] = vqaddq_s16(step2[48], step2[55]);
+ step1[49] = vqaddq_s16(step2[49], step2[54]);
+ step1[50] = vqaddq_s16(step2[50], step2[53]);
+ step1[51] = vqaddq_s16(step2[51], step2[52]);
+ step1[52] = vqsubq_s16(step2[51], step2[52]);
+ step1[53] = vqsubq_s16(step2[50], step2[53]);
+ step1[54] = vqsubq_s16(step2[49], step2[54]);
+ step1[55] = vqsubq_s16(step2[48], step2[55]);
+ step1[56] = vqsubq_s16(step2[63], step2[56]);
+ step1[57] = vqsubq_s16(step2[62], step2[57]);
+ step1[58] = vqsubq_s16(step2[61], step2[58]);
+ step1[59] = vqsubq_s16(step2[60], step2[59]);
+ step1[60] = vqaddq_s16(step2[59], step2[60]);
+ step1[61] = vqaddq_s16(step2[58], step2[61]);
+ step1[62] = vqaddq_s16(step2[57], step2[62]);
+ step1[63] = vqaddq_s16(step2[56], step2[63]);
+
+ // stage 8
+
+ btf_16_lane_0_1_neon(step1[13], step1[10], c3, &step2[13], &step2[10]);
+ btf_16_lane_0_1_neon(step1[12], step1[11], c3, &step2[12], &step2[11]);
+ btf_16_lane_2_3_neon(step1[59], step1[36], c3, &step2[59], &step2[36]);
+ btf_16_lane_2_3_neon(step1[58], step1[37], c3, &step2[58], &step2[37]);
+ btf_16_lane_2_3_neon(step1[57], step1[38], c3, &step2[57], &step2[38]);
+ btf_16_lane_2_3_neon(step1[56], step1[39], c3, &step2[56], &step2[39]);
+ btf_16_lane_3_2_neon(step1[40], step1[55], c7, &step2[40], &step2[55]);
+ btf_16_lane_3_2_neon(step1[41], step1[54], c7, &step2[41], &step2[54]);
+ btf_16_lane_3_2_neon(step1[42], step1[53], c7, &step2[42], &step2[53]);
+ btf_16_lane_3_2_neon(step1[43], step1[52], c7, &step2[43], &step2[52]);
+
+ step2[0] = vqaddq_s16(step1[0], step1[7]);
+ step2[1] = vqaddq_s16(step1[1], step1[6]);
+ step2[2] = vqaddq_s16(step1[2], step1[5]);
+ step2[3] = vqaddq_s16(step1[3], step1[4]);
+ step2[4] = vqsubq_s16(step1[3], step1[4]);
+ step2[5] = vqsubq_s16(step1[2], step1[5]);
+ step2[6] = vqsubq_s16(step1[1], step1[6]);
+ step2[7] = vqsubq_s16(step1[0], step1[7]);
+ step2[8] = step1[8];
+ step2[9] = step1[9];
+ step2[14] = step1[14];
+ step2[15] = step1[15];
+ step2[16] = vqaddq_s16(step1[16], step1[23]);
+ step2[17] = vqaddq_s16(step1[17], step1[22]);
+ step2[18] = vqaddq_s16(step1[18], step1[21]);
+ step2[19] = vqaddq_s16(step1[19], step1[20]);
+ step2[20] = vqsubq_s16(step1[19], step1[20]);
+ step2[21] = vqsubq_s16(step1[18], step1[21]);
+ step2[22] = vqsubq_s16(step1[17], step1[22]);
+ step2[23] = vqsubq_s16(step1[16], step1[23]);
+ step2[24] = vqsubq_s16(step1[31], step1[24]);
+ step2[25] = vqsubq_s16(step1[30], step1[25]);
+ step2[26] = vqsubq_s16(step1[29], step1[26]);
+ step2[27] = vqsubq_s16(step1[28], step1[27]);
+ step2[28] = vqaddq_s16(step1[28], step1[27]);
+ step2[29] = vqaddq_s16(step1[29], step1[26]);
+ step2[30] = vqaddq_s16(step1[30], step1[25]);
+ step2[31] = vqaddq_s16(step1[31], step1[24]);
+ step2[32] = step1[32];
+ step2[33] = step1[33];
+ step2[34] = step1[34];
+ step2[35] = step1[35];
+ step2[44] = step1[44];
+ step2[45] = step1[45];
+ step2[46] = step1[46];
+ step2[47] = step1[47];
+ step2[48] = step1[48];
+ step2[49] = step1[49];
+ step2[50] = step1[50];
+ step2[51] = step1[51];
+ step2[60] = step1[60];
+ step2[61] = step1[61];
+ step2[62] = step1[62];
+ step2[63] = step1[63];
+
+ // stage 9
+ idct64_stage9_neon(step2, step1, cos_bit);
+
+ // stage 10
+ idct64_stage10_neon(step1, step2, cos_bit);
+
+ // stage 11
+
+ out[0] = vqaddq_s16(step2[0], step2[63]);
+ out[1] = vqaddq_s16(step2[1], step2[62]);
+ out[2] = vqaddq_s16(step2[2], step2[61]);
+ out[3] = vqaddq_s16(step2[3], step2[60]);
+ out[4] = vqaddq_s16(step2[4], step2[59]);
+ out[5] = vqaddq_s16(step2[5], step2[58]);
+ out[6] = vqaddq_s16(step2[6], step2[57]);
+ out[7] = vqaddq_s16(step2[7], step2[56]);
+ out[8] = vqaddq_s16(step2[8], step2[55]);
+ out[9] = vqaddq_s16(step2[9], step2[54]);
+ out[10] = vqaddq_s16(step2[10], step2[53]);
+ out[11] = vqaddq_s16(step2[11], step2[52]);
+ out[12] = vqaddq_s16(step2[12], step2[51]);
+ out[13] = vqaddq_s16(step2[13], step2[50]);
+ out[14] = vqaddq_s16(step2[14], step2[49]);
+ out[15] = vqaddq_s16(step2[15], step2[48]);
+ out[16] = vqaddq_s16(step2[16], step2[47]);
+ out[17] = vqaddq_s16(step2[17], step2[46]);
+ out[18] = vqaddq_s16(step2[18], step2[45]);
+ out[19] = vqaddq_s16(step2[19], step2[44]);
+ out[20] = vqaddq_s16(step2[20], step2[43]);
+ out[21] = vqaddq_s16(step2[21], step2[42]);
+ out[22] = vqaddq_s16(step2[22], step2[41]);
+ out[23] = vqaddq_s16(step2[23], step2[40]);
+ out[24] = vqaddq_s16(step2[24], step2[39]);
+ out[25] = vqaddq_s16(step2[25], step2[38]);
+ out[26] = vqaddq_s16(step2[26], step2[37]);
+ out[27] = vqaddq_s16(step2[27], step2[36]);
+ out[28] = vqaddq_s16(step2[28], step2[35]);
+ out[29] = vqaddq_s16(step2[29], step2[34]);
+ out[30] = vqaddq_s16(step2[30], step2[33]);
+ out[31] = vqaddq_s16(step2[31], step2[32]);
+ out[32] = vqsubq_s16(step2[31], step2[32]);
+ out[33] = vqsubq_s16(step2[30], step2[33]);
+ out[34] = vqsubq_s16(step2[29], step2[34]);
+ out[35] = vqsubq_s16(step2[28], step2[35]);
+ out[36] = vqsubq_s16(step2[27], step2[36]);
+ out[37] = vqsubq_s16(step2[26], step2[37]);
+ out[38] = vqsubq_s16(step2[25], step2[38]);
+ out[39] = vqsubq_s16(step2[24], step2[39]);
+ out[40] = vqsubq_s16(step2[23], step2[40]);
+ out[41] = vqsubq_s16(step2[22], step2[41]);
+ out[42] = vqsubq_s16(step2[21], step2[42]);
+ out[43] = vqsubq_s16(step2[20], step2[43]);
+ out[44] = vqsubq_s16(step2[19], step2[44]);
+ out[45] = vqsubq_s16(step2[18], step2[45]);
+ out[46] = vqsubq_s16(step2[17], step2[46]);
+ out[47] = vqsubq_s16(step2[16], step2[47]);
+ out[48] = vqsubq_s16(step2[15], step2[48]);
+ out[49] = vqsubq_s16(step2[14], step2[49]);
+ out[50] = vqsubq_s16(step2[13], step2[50]);
+ out[51] = vqsubq_s16(step2[12], step2[51]);
+ out[52] = vqsubq_s16(step2[11], step2[52]);
+ out[53] = vqsubq_s16(step2[10], step2[53]);
+ out[54] = vqsubq_s16(step2[9], step2[54]);
+ out[55] = vqsubq_s16(step2[8], step2[55]);
+ out[56] = vqsubq_s16(step2[7], step2[56]);
+ out[57] = vqsubq_s16(step2[6], step2[57]);
+ out[58] = vqsubq_s16(step2[5], step2[58]);
+ out[59] = vqsubq_s16(step2[4], step2[59]);
+ out[60] = vqsubq_s16(step2[3], step2[60]);
+ out[61] = vqsubq_s16(step2[2], step2[61]);
+ out[62] = vqsubq_s16(step2[1], step2[62]);
+ out[63] = vqsubq_s16(step2[0], step2[63]);
+}
// Functions for blocks with eob at DC and within
// topleft 8x8, 16x16, 32x32 corner
-static const transform_1d_neon
- lowbd_txfm_all_1d_zeros_w8_arr[TX_SIZES][ITX_TYPES_1D][4] = {
- {
- { av1_idct4_new, av1_idct4_new, NULL, NULL },
- { av1_iadst4_new, av1_iadst4_new, NULL, NULL },
- { av1_iidentity4_c, av1_iidentity4_c, NULL, NULL },
- },
- { { av1_idct8_new, av1_idct8_new, NULL, NULL },
- { av1_iadst8_new, av1_iadst8_new, NULL, NULL },
- { av1_iidentity8_c, av1_iidentity8_c, NULL, NULL } },
- {
- { av1_idct16_new, av1_idct16_new, av1_idct16_new, NULL },
- { av1_iadst16_new, av1_iadst16_new, av1_iadst16_new, NULL },
- { av1_iidentity16_c, av1_iidentity16_c, av1_iidentity16_c, NULL },
- },
- { { av1_idct32_new, av1_idct32_new, av1_idct32_new, av1_idct32_new },
- { NULL, NULL, NULL, NULL },
- { av1_iidentity32_c, av1_iidentity32_c, av1_iidentity32_c,
- av1_iidentity32_c } },
- { { av1_idct64_new, av1_idct64_new, av1_idct64_new, av1_idct64_new },
- { NULL, NULL, NULL, NULL },
- { NULL, NULL, NULL, NULL } }
- };
-
static const transform_neon
lowbd_txfm_all_1d_zeros_w_arr[TX_SIZES][ITX_TYPES_1D][4] = {
{
@@ -2120,108 +3574,35 @@ static const transform_neon
{ NULL, NULL, NULL, NULL },
{ NULL, NULL, NULL, NULL },
},
- { { idct8_low1_new_neon, idct8_new_neon, NULL, NULL },
- { iadst8_low1_new_neon, iadst8_new_neon, NULL, NULL },
- { identity8_new_neon, identity8_new_neon, NULL, NULL } },
+ { { idct8_low1_neon, idct8_neon, NULL, NULL },
+ { iadst8_low1_neon, iadst8_neon, NULL, NULL },
+ { NULL, NULL, NULL, NULL } },
{
- { idct16_low1_new_neon, idct16_low8_new_neon, idct16_new_neon, NULL },
- { iadst16_low1_new_neon, iadst16_low8_new_neon, iadst16_new_neon,
- NULL },
- { identity16_new_neon, identity16_new_neon, identity16_new_neon,
- NULL },
+ { idct16_low1_neon, idct16_low8_neon, idct16_neon, NULL },
+ { iadst16_low1_neon, iadst16_low8_neon, iadst16_neon, NULL },
+ { NULL, NULL, NULL, NULL },
},
- { { idct32_low1_new_neon, idct32_low8_new_neon, idct32_low16_new_neon,
- idct32_new_neon },
+ { { idct32_low1_neon, idct32_low8_neon, idct32_low16_neon, idct32_neon },
{ NULL, NULL, NULL, NULL },
- { identity32_new_neon, identity32_new_neon, identity32_new_neon,
- identity32_new_neon } },
- { { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL } },
+ { { idct64_low1_neon, idct64_low8_neon, idct64_low16_neon,
+ idct64_low32_neon },
{ NULL, NULL, NULL, NULL },
{ NULL, NULL, NULL, NULL } }
};
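
The table above is indexed by transform size, 1-D transform type, and a third index derived from the end-of-block position, so a reduced-input kernel such as idct64_low8_neon can be chosen when only a small top-left corner of coefficients is non-zero. A hedged sketch of how that third index could be derived (the real mapping lives in lowbd_txfm_all_1d_zeros_idx; the thresholds below are an assumption for illustration):

/* Hypothetical eob-to-variant mapping: pick the low1, low8, low16 or
 * full-length kernel column of the table above. */
static int zeros_idx_from_eob(int eob_1d) {
  if (eob_1d <= 1) return 0;   /* only the DC coefficient is non-zero */
  if (eob_1d <= 8) return 1;   /* first 8 inputs */
  if (eob_1d <= 16) return 2;  /* first 16 inputs */
  return 3;                    /* full-length kernel */
}

With an eob mapping to index 1 for a 64-point row, for example, the dispatch would land on idct64_low8_neon and only the first eight inputs per row need to be transformed.
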
-static INLINE void lowbd_inv_txfm2d_add_wxh_idtx_neon(
- const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
- TX_SIZE tx_size, int eob) {
- DECLARE_ALIGNED(32, int, txfm_buf[32 * 32 + 32 + 32]);
- int32_t *temp_in = txfm_buf;
-
- int eobx, eoby;
- get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
- const int8_t *shift = inv_txfm_shift_ls[tx_size];
- const int txw_idx = get_txw_idx(tx_size);
- const int txh_idx = get_txh_idx(tx_size);
- const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
- const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
- const int txfm_size_col = tx_size_wide[tx_size];
- const int txfm_size_row = tx_size_high[tx_size];
- const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
-
- const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
- const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col);
-
- int32_t *temp_out = temp_in + buf_offset;
- int32_t *buf = temp_out + buf_offset;
- int32_t *buf_ptr = buf;
- const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16 };
- int r, bd = 8;
-
- const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
- const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
- const transform_1d_neon row_txfm =
- lowbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
- const transform_1d_neon col_txfm =
- lowbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
-
- assert(col_txfm != NULL);
- assert(row_txfm != NULL);
-
- // row tx
- int row_start = (buf_size_nonzero_h_div8 * 8);
- for (int i = 0; i < row_start; i++) {
- if (abs(rect_type) == 1) {
- for (int j = 0; j < txfm_size_col; j++)
- temp_in[j] = round_shift((int64_t)input[j] * NewInvSqrt2, NewSqrt2Bits);
- row_txfm(temp_in, buf_ptr, cos_bit_row, stage_range);
- } else {
- row_txfm(input, buf_ptr, cos_bit_row, stage_range);
- }
- av1_round_shift_array(buf_ptr, txfm_size_col, -shift[0]);
- input += txfm_size_col;
- buf_ptr += txfm_size_col;
- }
-
- // Doing memset for the rows which are not processed in row transform.
- memset(buf_ptr, 0,
- sizeof(int32_t) * txfm_size_col * (txfm_size_row - row_start));
-
- // col tx
- for (int c = 0; c < txfm_size_col; c++) {
- for (r = 0; r < txfm_size_row; ++r) temp_in[r] = buf[r * txfm_size_col + c];
-
- col_txfm(temp_in, temp_out, cos_bit_col, stage_range);
- av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
-
- for (r = 0; r < txfm_size_row; ++r) {
- output[r * stride + c] =
- highbd_clip_pixel_add(output[r * stride + c], temp_out[r], bd);
- }
- }
-}
-
static INLINE void lowbd_inv_txfm2d_add_idtx_neon(const int32_t *input,
uint8_t *output, int stride,
TX_TYPE tx_type,
TX_SIZE tx_size, int eob) {
+ (void)tx_type;
int16x8_t a[32 * 4];
int16x8_t b[32 * 4];
int eobx, eoby;
get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
- const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
const int txw_idx = get_txw_idx(tx_size);
const int txh_idx = get_txh_idx(tx_size);
- const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
- const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
const int txfm_size_col = tx_size_wide[tx_size];
const int txfm_size_row = tx_size_high[tx_size];
lowbd_inv_txfm2d_memset_neon(&a[0], (txfm_size_col * (txfm_size_row) >> 3),
@@ -2232,17 +3613,8 @@ static INLINE void lowbd_inv_txfm2d_add_idtx_neon(const int32_t *input,
const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3;
- const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
- const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
const int32_t *input_1;
int temp_b = 0;
- const transform_neon row_txfm =
- lowbd_txfm_all_1d_zeros_w_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
- const transform_neon col_txfm =
- lowbd_txfm_all_1d_zeros_w_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
-
- assert(col_txfm != NULL);
- assert(row_txfm != NULL);
for (int i = 0; i < buf_size_nonzero_h_div8; i++) {
input_1 = input;
@@ -2257,9 +3629,8 @@ static INLINE void lowbd_inv_txfm2d_add_idtx_neon(const int32_t *input,
int y = i * txfm_size_col;
round_shift_for_rect(&a[y], &a[y], txfm_size_col);
}
- row_txfm(&a[i * txfm_size_col], &a[i * txfm_size_col], cos_bit_row, 0);
- av1_round_shift_array_16_neon(&a[i * txfm_size_col], txfm_size_col,
- -shift[0]);
+ identity_txfm_round_neon(&a[i * txfm_size_col], &a[i * txfm_size_col],
+ txw_idx, txfm_size_col, -shift[0]);
for (int j = 0; j < buf_size_w_div8; ++j) {
int k = j * 8 + i * txfm_size_col;
transpose_s16_8x8q(&a[k], &b[temp_b + txfm_size_row * j]);
@@ -2267,9 +3638,8 @@ static INLINE void lowbd_inv_txfm2d_add_idtx_neon(const int32_t *input,
temp_b += 8;
}
for (int j = 0; j < buf_size_w_div8; ++j) {
- col_txfm(&b[j * txfm_size_row], &b[j * txfm_size_row], cos_bit_col, 0);
- av1_round_shift_array_16_neon(&b[j * txfm_size_row], txfm_size_row,
- -shift[1]);
+ identity_txfm_round_neon(&b[j * txfm_size_row], &b[j * txfm_size_row],
+ txh_idx, txfm_size_row, -shift[1]);
}
if (txfm_size_col >= 16) {
for (int i = 0; i < (txfm_size_col >> 4); i++) {
@@ -2281,90 +3651,6 @@ static INLINE void lowbd_inv_txfm2d_add_idtx_neon(const int32_t *input,
}
}
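
In the pure IDTX case both 1-D passes are identity transforms, which is why the tx_type argument is now unused and the per-row and per-column work reduces to the scaling-plus-rounding done by identity_txfm_round_neon. A scalar sketch of what such a pass amounts to (the fixed-point scale factor and its Q12 precision are assumptions for illustration):

#include <stdint.h>

/* Sketch of an identity 1-D pass: multiply each coefficient by a fixed-point
 * scale (Q12 here, by assumption) and round the product down by `shift` bits
 * in one step.  No butterflies are needed because the transform is identity. */
static void identity_round_scalar(const int32_t *in, int32_t *out, int n,
                                  int32_t scale_q12, int shift) {
  const int64_t rounding = (int64_t)1 << (12 + shift - 1);
  for (int i = 0; i < n; ++i)
    out[i] = (int32_t)(((int64_t)in[i] * scale_q12 + rounding) >> (12 + shift));
}
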
-static INLINE void lowbd_inv_txfm2d_add_v_wxh_identity_neon(
- const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
- TX_SIZE tx_size, int eob) {
- DECLARE_ALIGNED(32, int, txfm_buf[32 * 32 + 32 + 32]);
- int32_t *temp_in = txfm_buf;
-
- int eobx, eoby;
- get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob);
- const int8_t *shift = inv_txfm_shift_ls[tx_size];
- const int txw_idx = get_txw_idx(tx_size);
- const int txh_idx = get_txh_idx(tx_size);
- const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
- const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
- const int txfm_size_col = tx_size_wide[tx_size];
- const int txfm_size_row = tx_size_high[tx_size];
- const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
-
- const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
- const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col);
-
- int32_t *temp_out = temp_in + buf_offset;
- int32_t *buf = temp_out + buf_offset;
- int32_t *buf_ptr = buf;
- const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16 };
- int r, bd = 8;
-
- const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
- const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
- const transform_1d_neon row_txfm =
- lowbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
- const transform_1d_neon col_txfm =
- lowbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
-
- assert(col_txfm != NULL);
- assert(row_txfm != NULL);
- int ud_flip, lr_flip;
- get_flip_cfg(tx_type, &ud_flip, &lr_flip);
-
- // row tx
- int row_start = (buf_size_nonzero_h_div8 * 8);
- for (int i = 0; i < row_start; i++) {
- if (abs(rect_type) == 1) {
- for (int j = 0; j < txfm_size_col; j++)
- temp_in[j] = round_shift((int64_t)input[j] * NewInvSqrt2, NewSqrt2Bits);
- row_txfm(temp_in, buf_ptr, cos_bit_row, stage_range);
- } else {
- row_txfm(input, buf_ptr, cos_bit_row, stage_range);
- }
- av1_round_shift_array(buf_ptr, txfm_size_col, -shift[0]);
- input += txfm_size_col;
- buf_ptr += txfm_size_col;
- }
- // Doing memset for the rows which are not processed in row transform.
- memset(buf_ptr, 0,
- sizeof(int32_t) * txfm_size_col * (txfm_size_row - row_start));
-
- // col tx
- for (int c = 0; c < txfm_size_col; c++) {
- if (lr_flip == 0) {
- for (r = 0; r < txfm_size_row; ++r)
- temp_in[r] = buf[r * txfm_size_col + c];
- } else {
- // flip left right
- for (r = 0; r < txfm_size_row; ++r)
- temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)];
- }
- col_txfm(temp_in, temp_out, cos_bit_col, stage_range);
- av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
-
- if (ud_flip == 0) {
- for (r = 0; r < txfm_size_row; ++r) {
- output[r * stride + c] =
- highbd_clip_pixel_add(output[r * stride + c], temp_out[r], bd);
- }
- } else {
- // flip upside down
- for (r = 0; r < txfm_size_row; ++r) {
- output[r * stride + c] = highbd_clip_pixel_add(
- output[r * stride + c], temp_out[txfm_size_row - r - 1], bd);
- }
- }
- }
-}
-
static INLINE void lowbd_inv_txfm2d_add_v_identity_neon(
const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
TX_SIZE tx_size, int eob) {
@@ -2372,11 +3658,10 @@ static INLINE void lowbd_inv_txfm2d_add_v_identity_neon(
int16x8_t b[16 * 2];
int eobx, eoby, ud_flip, lr_flip;
get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob);
- const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
const int txw_idx = get_txw_idx(tx_size);
const int txh_idx = get_txh_idx(tx_size);
- const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
- const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+ const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx];
const int txfm_size_col = tx_size_wide[tx_size];
const int txfm_size_row = tx_size_high[tx_size];
lowbd_inv_txfm2d_memset_neon(&b[0], (txfm_size_col * (txfm_size_row) >> 3),
@@ -2386,15 +3671,11 @@ static INLINE void lowbd_inv_txfm2d_add_v_identity_neon(
const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3;
const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
- const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
const int32_t *input_1;
int temp_b = 0;
const transform_neon row_txfm =
lowbd_txfm_all_1d_zeros_w_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
- const transform_neon col_txfm =
- lowbd_txfm_all_1d_zeros_w_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
- assert(col_txfm != NULL);
assert(row_txfm != NULL);
get_flip_cfg(tx_type, &ud_flip, &lr_flip);
@@ -2432,9 +3713,8 @@ static INLINE void lowbd_inv_txfm2d_add_v_identity_neon(
}
}
for (int j = 0; j < buf_size_w_div8; ++j) {
- col_txfm(&b[j * txfm_size_row], &b[j * txfm_size_row], cos_bit_col, 0);
- av1_round_shift_array_16_neon(&b[j * txfm_size_row], txfm_size_row,
- -shift[1]);
+ identity_txfm_round_neon(&b[j * txfm_size_row], &b[j * txfm_size_row],
+ txh_idx, txfm_size_row, -shift[1]);
}
if (txfm_size_col >= 16) {
for (int i = 0; i < (txfm_size_col >> 4); i++) {
@@ -2446,90 +3726,6 @@ static INLINE void lowbd_inv_txfm2d_add_v_identity_neon(
}
}
-static INLINE void lowbd_inv_txfm2d_add_h_wxh_identity_neon(
- const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
- TX_SIZE tx_size, int eob) {
- DECLARE_ALIGNED(32, int, txfm_buf[32 * 32 + 32 + 32]);
- int32_t *temp_in = txfm_buf;
-
- int eobx, eoby;
- get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob);
- const int8_t *shift = inv_txfm_shift_ls[tx_size];
- const int txw_idx = get_txw_idx(tx_size);
- const int txh_idx = get_txh_idx(tx_size);
- const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
- const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
- const int txfm_size_col = tx_size_wide[tx_size];
- const int txfm_size_row = tx_size_high[tx_size];
- const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
-
- const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
- const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col);
-
- int32_t *temp_out = temp_in + buf_offset;
- int32_t *buf = temp_out + buf_offset;
- int32_t *buf_ptr = buf;
- const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16 };
- int r, bd = 8;
-
- const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
- const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
- const transform_1d_neon row_txfm =
- lowbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
- const transform_1d_neon col_txfm =
- lowbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
-
- assert(col_txfm != NULL);
- assert(row_txfm != NULL);
- int ud_flip, lr_flip;
- get_flip_cfg(tx_type, &ud_flip, &lr_flip);
-
- // row tx
- int row_start = (buf_size_nonzero_h_div8 * 8);
- for (int i = 0; i < row_start; i++) {
- if (abs(rect_type) == 1) {
- for (int j = 0; j < txfm_size_col; j++)
- temp_in[j] = round_shift((int64_t)input[j] * NewInvSqrt2, NewSqrt2Bits);
- row_txfm(temp_in, buf_ptr, cos_bit_row, stage_range);
- } else {
- row_txfm(input, buf_ptr, cos_bit_row, stage_range);
- }
- av1_round_shift_array(buf_ptr, txfm_size_col, -shift[0]);
- input += txfm_size_col;
- buf_ptr += txfm_size_col;
- }
- // Doing memset for the rows which are not processed in row transform.
- memset(buf_ptr, 0,
- sizeof(int32_t) * txfm_size_col * (txfm_size_row - row_start));
-
- // col tx
- for (int c = 0; c < txfm_size_col; c++) {
- if (lr_flip == 0) {
- for (r = 0; r < txfm_size_row; ++r)
- temp_in[r] = buf[r * txfm_size_col + c];
- } else {
- // flip left right
- for (r = 0; r < txfm_size_row; ++r)
- temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)];
- }
- col_txfm(temp_in, temp_out, cos_bit_col, stage_range);
- av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
-
- if (ud_flip == 0) {
- for (r = 0; r < txfm_size_row; ++r) {
- output[r * stride + c] =
- highbd_clip_pixel_add(output[r * stride + c], temp_out[r], bd);
- }
- } else {
- // flip upside down
- for (r = 0; r < txfm_size_row; ++r) {
- output[r * stride + c] = highbd_clip_pixel_add(
- output[r * stride + c], temp_out[txfm_size_row - r - 1], bd);
- }
- }
- }
-}
-
static INLINE void lowbd_inv_txfm2d_add_h_identity_neon(
const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
TX_SIZE tx_size, int eob) {
@@ -2537,11 +3733,10 @@ static INLINE void lowbd_inv_txfm2d_add_h_identity_neon(
int16x8_t b[16 * 2];
int eobx, eoby, ud_flip, lr_flip;
get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob);
- const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
const int txw_idx = get_txw_idx(tx_size);
const int txh_idx = get_txh_idx(tx_size);
- const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
- const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+ const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx];
const int txfm_size_col = tx_size_wide[tx_size];
const int txfm_size_row = tx_size_high[tx_size];
lowbd_inv_txfm2d_memset_neon(&a[0], (txfm_size_col * (txfm_size_row) >> 3),
@@ -2550,17 +3745,13 @@ static INLINE void lowbd_inv_txfm2d_add_h_identity_neon(
const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3;
- const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
const int32_t *input_1;
int temp_b = 0;
- const transform_neon row_txfm =
- lowbd_txfm_all_1d_zeros_w_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
const transform_neon col_txfm =
lowbd_txfm_all_1d_zeros_w_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
assert(col_txfm != NULL);
- assert(row_txfm != NULL);
get_flip_cfg(tx_type, &ud_flip, &lr_flip);
@@ -2577,9 +3768,8 @@ static INLINE void lowbd_inv_txfm2d_add_h_identity_neon(
int y = i * txfm_size_col;
round_shift_for_rect(&a[y], &a[y], txfm_size_col);
}
- row_txfm(&a[i * txfm_size_col], &a[i * txfm_size_col], cos_bit_row, 0);
- av1_round_shift_array_16_neon(&a[i * txfm_size_col], txfm_size_col,
- -shift[0]);
+ identity_txfm_round_neon(&a[i * txfm_size_col], &a[i * txfm_size_col],
+ txw_idx, txfm_size_col, -shift[0]);
for (int j = 0; j < buf_size_w_div8; ++j) {
int k = j * 8 + i * txfm_size_col;
transpose_s16_8x8q(&a[k], &b[temp_b + txfm_size_row * j]);
@@ -2604,24 +3794,24 @@ static INLINE void lowbd_inv_txfm2d_add_h_identity_neon(
static INLINE void lowbd_inv_txfm2d_add_4x4_neon(const int32_t *input,
uint8_t *output, int stride,
- TX_TYPE tx_type,
- TX_SIZE tx_size, int eob) {
+ TX_TYPE tx_type, int eob) {
(void)eob;
+ TX_SIZE tx_size = TX_4X4;
DECLARE_ALIGNED(32, int, txfm_buf[4 * 4 + 8 + 8]);
int32_t *temp_in = txfm_buf;
- const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
const int txw_idx = get_txw_idx(tx_size);
const int txh_idx = get_txh_idx(tx_size);
- const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
- const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx];
+ const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx];
const int txfm_size_col = tx_size_wide[tx_size];
const int txfm_size_row = tx_size_high[tx_size];
const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col);
int32_t *temp_out = temp_in + buf_offset;
int32_t *buf = temp_out + buf_offset;
int32_t *buf_ptr = buf;
- const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16 };
+ const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16, 16, 16, 16, 16, 16, 16 };
int r, bd = 8;
const transform_1d_neon row_txfm =
lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]];
@@ -2647,6 +3837,7 @@ static INLINE void lowbd_inv_txfm2d_add_4x4_neon(const int32_t *input,
for (r = 0; r < txfm_size_row; ++r)
temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)];
}
+ clamp_buf(temp_in, txfm_size_row, bd + 8);
col_txfm(temp_in, temp_out, cos_bit_col, stage_range);
av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
@@ -2666,24 +3857,25 @@ static INLINE void lowbd_inv_txfm2d_add_4x4_neon(const int32_t *input,
}
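
The clamp_buf(temp_in, txfm_size_row, bd + 8) call added before each column transform, together with the widened stage_range initialisation, keeps the intermediate column inputs inside a 16-bit signed range (bd + 8 = 16 for 8-bit content). A minimal sketch of such a clamp, assuming a symmetric signed `bit`-bit range (the library's exact range convention is not shown in this hunk):

#include <stdint.h>

/* Clamp every intermediate value to the signed `bit`-bit range before the
 * column transform runs; for bit == 16 that is [-32768, 32767]. */
static void clamp_buf_sketch(int32_t *buf, int size, int bit) {
  const int32_t max_value = (1 << (bit - 1)) - 1;
  const int32_t min_value = -(1 << (bit - 1));
  for (int i = 0; i < size; ++i) {
    if (buf[i] > max_value) buf[i] = max_value;
    else if (buf[i] < min_value) buf[i] = min_value;
  }
}
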
void lowbd_inv_txfm2d_add_4x8_neon(const int32_t *input, uint8_t *output,
- int stride, TX_TYPE tx_type, TX_SIZE tx_size,
- int eob) {
+ int stride, TX_TYPE tx_type, int eob) {
(void)eob;
+ TX_SIZE tx_size = TX_4X8;
DECLARE_ALIGNED(32, int, txfm_buf[4 * 8 + 8 + 8]);
int32_t *temp_in = txfm_buf;
- const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
const int txw_idx = get_txw_idx(tx_size);
const int txh_idx = get_txh_idx(tx_size);
- const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
- const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx];
+ const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx];
const int txfm_size_col = tx_size_wide[tx_size];
const int txfm_size_row = tx_size_high[tx_size];
const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col);
int32_t *temp_out = temp_in + buf_offset;
int32_t *buf = temp_out + buf_offset;
int32_t *buf_ptr = buf;
- const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16 };
+ const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16, 16, 16, 16,
+ 16, 16, 16, 16 };
int r, bd = 8;
const transform_1d_neon row_txfm =
lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]];
@@ -2711,6 +3903,7 @@ void lowbd_inv_txfm2d_add_4x8_neon(const int32_t *input, uint8_t *output,
for (r = 0; r < txfm_size_row; ++r)
temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)];
}
+ clamp_buf(temp_in, txfm_size_row, bd + 8);
col_txfm(temp_in, temp_out, cos_bit_col, stage_range);
av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
@@ -2730,24 +3923,25 @@ void lowbd_inv_txfm2d_add_4x8_neon(const int32_t *input, uint8_t *output,
}
void lowbd_inv_txfm2d_add_8x4_neon(const int32_t *input, uint8_t *output,
- int stride, TX_TYPE tx_type, TX_SIZE tx_size,
- int eob) {
+ int stride, TX_TYPE tx_type, int eob) {
(void)eob;
+ TX_SIZE tx_size = TX_8X4;
DECLARE_ALIGNED(32, int, txfm_buf[8 * 4 + 8 + 8]);
int32_t *temp_in = txfm_buf;
- const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
const int txw_idx = get_txw_idx(tx_size);
const int txh_idx = get_txh_idx(tx_size);
- const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
- const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx];
+ const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx];
const int txfm_size_col = tx_size_wide[tx_size];
const int txfm_size_row = tx_size_high[tx_size];
const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col);
int32_t *temp_out = temp_in + buf_offset;
int32_t *buf = temp_out + buf_offset;
int32_t *buf_ptr = buf;
- const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16 };
+ const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16, 16, 16, 16,
+ 16, 16, 16, 16 };
int r, bd = 8;
const transform_1d_neon row_txfm =
lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]];
@@ -2775,6 +3969,7 @@ void lowbd_inv_txfm2d_add_8x4_neon(const int32_t *input, uint8_t *output,
for (r = 0; r < txfm_size_row; ++r)
temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)];
}
+ clamp_buf(temp_in, txfm_size_row, bd + 8);
col_txfm(temp_in, temp_out, cos_bit_col, stage_range);
av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
@@ -2794,24 +3989,25 @@ void lowbd_inv_txfm2d_add_8x4_neon(const int32_t *input, uint8_t *output,
}
void lowbd_inv_txfm2d_add_4x16_neon(const int32_t *input, uint8_t *output,
- int stride, TX_TYPE tx_type,
- TX_SIZE tx_size, int eob) {
+ int stride, TX_TYPE tx_type, int eob) {
(void)eob;
+ TX_SIZE tx_size = TX_4X16;
DECLARE_ALIGNED(32, int, txfm_buf[4 * 16 + 16 + 16]);
int32_t *temp_in = txfm_buf;
- const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
const int txw_idx = get_txw_idx(tx_size);
const int txh_idx = get_txh_idx(tx_size);
- const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
- const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx];
+ const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx];
const int txfm_size_col = tx_size_wide[tx_size];
const int txfm_size_row = tx_size_high[tx_size];
const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col);
int32_t *temp_out = temp_in + buf_offset;
int32_t *buf = temp_out + buf_offset;
int32_t *buf_ptr = buf;
- const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16 };
+ const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16 };
int r, bd = 8;
const transform_1d_neon row_txfm =
lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]];
@@ -2837,6 +4033,7 @@ void lowbd_inv_txfm2d_add_4x16_neon(const int32_t *input, uint8_t *output,
for (r = 0; r < txfm_size_row; ++r)
temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)];
}
+ clamp_buf(temp_in, txfm_size_row, bd + 8);
col_txfm(temp_in, temp_out, cos_bit_col, stage_range);
av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
@@ -2856,25 +4053,25 @@ void lowbd_inv_txfm2d_add_4x16_neon(const int32_t *input, uint8_t *output,
}
void lowbd_inv_txfm2d_add_16x4_neon(const int32_t *input, uint8_t *output,
- int stride, TX_TYPE tx_type,
- TX_SIZE tx_size, int eob) {
+ int stride, TX_TYPE tx_type, int eob) {
(void)eob;
-
+ TX_SIZE tx_size = TX_16X4;
DECLARE_ALIGNED(32, int, txfm_buf[16 * 4 + 16 + 16]);
int32_t *temp_in = txfm_buf;
- const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
const int txw_idx = get_txw_idx(tx_size);
const int txh_idx = get_txh_idx(tx_size);
- const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
- const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx];
+ const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx];
const int txfm_size_col = tx_size_wide[tx_size];
const int txfm_size_row = tx_size_high[tx_size];
const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col);
int32_t *temp_out = temp_in + buf_offset;
int32_t *buf = temp_out + buf_offset;
int32_t *buf_ptr = buf;
- const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16 };
+ const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16 };
int r, bd = 8;
const transform_1d_neon row_txfm =
lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]];
@@ -2900,89 +4097,7 @@ void lowbd_inv_txfm2d_add_16x4_neon(const int32_t *input, uint8_t *output,
for (r = 0; r < txfm_size_row; ++r)
temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)];
}
- col_txfm(temp_in, temp_out, cos_bit_col, stage_range);
- av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
-
- if (ud_flip == 0) {
- for (r = 0; r < txfm_size_row; ++r) {
- output[r * stride + c] =
- highbd_clip_pixel_add(output[r * stride + c], temp_out[r], bd);
- }
- } else {
- // flip upside down
- for (r = 0; r < txfm_size_row; ++r) {
- output[r * stride + c] = highbd_clip_pixel_add(
- output[r * stride + c], temp_out[txfm_size_row - r - 1], bd);
- }
- }
- }
-}
-
-static INLINE void lowbd_inv_txfm2d_add_wxh_no_identity_neon(
- const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
- TX_SIZE tx_size, int eob) {
- DECLARE_ALIGNED(32, int, txfm_buf[64 * 64 + 64 + 64]);
- int32_t *temp_in = txfm_buf;
-
- int eobx, eoby, ud_flip, lr_flip, row_start;
- get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
- const int8_t *shift = inv_txfm_shift_ls[tx_size];
- const int txw_idx = get_txw_idx(tx_size);
- const int txh_idx = get_txh_idx(tx_size);
- const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
- const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
- const int txfm_size_col = tx_size_wide[tx_size];
- const int txfm_size_row = tx_size_high[tx_size];
- const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
- const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
- const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col);
-
- int32_t *temp_out = temp_in + buf_offset;
- int32_t *buf = temp_out + buf_offset;
- int32_t *buf_ptr = buf;
- const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16 };
- const int bd = 8;
- int r;
-
- const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
- const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
- const transform_1d_neon row_txfm =
- lowbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
- const transform_1d_neon col_txfm =
- lowbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
-
- assert(col_txfm != NULL);
- assert(row_txfm != NULL);
-
- get_flip_cfg(tx_type, &ud_flip, &lr_flip);
- row_start = (buf_size_nonzero_h_div8 << 3);
-
- for (int i = 0; i < row_start; i++) {
- if (abs(rect_type) == 1) {
- for (int j = 0; j < txfm_size_col; j++)
- temp_in[j] = round_shift((int64_t)input[j] * NewInvSqrt2, NewSqrt2Bits);
- row_txfm(temp_in, buf_ptr, cos_bit_row, stage_range);
- } else {
- row_txfm(input, buf_ptr, cos_bit_row, stage_range);
- }
- av1_round_shift_array(buf_ptr, txfm_size_col, -shift[0]);
- input += txfm_size_col;
- buf_ptr += txfm_size_col;
- }
-
- // Doing memset for the rows which are not processed in row transform.
- memset(buf_ptr, 0,
- sizeof(int32_t) * txfm_size_col * (txfm_size_row - row_start));
-
- for (int c = 0; c < txfm_size_col; c++) {
- if (lr_flip == 0) {
- for (r = 0; r < txfm_size_row; ++r)
- temp_in[r] = buf[r * txfm_size_col + c];
- } else {
- // flip left right
- for (r = 0; r < txfm_size_row; ++r)
- temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)];
- }
+ clamp_buf(temp_in, txfm_size_row, bd + 8);
col_txfm(temp_in, temp_out, cos_bit_col, stage_range);
av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
@@ -3008,17 +4123,18 @@ static INLINE void lowbd_inv_txfm2d_add_no_identity_neon(
int16x8_t b[64 * 8];
int eobx, eoby, ud_flip, lr_flip;
get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
- const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
const int txw_idx = get_txw_idx(tx_size);
const int txh_idx = get_txh_idx(tx_size);
- const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
- const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+ const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx];
const int txfm_size_col = tx_size_wide[tx_size];
const int txfm_size_row = tx_size_high[tx_size];
const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
const int buf_size_w_div8 = txfm_size_col >> 3;
const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3;
+ const int input_stride = AOMMIN(32, txfm_size_col);
const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
const int32_t *input_1;
@@ -3038,14 +4154,14 @@ static INLINE void lowbd_inv_txfm2d_add_no_identity_neon(
input_1 = input;
for (int j = 0; j < buf_size_nonzero_w_div8; ++j) {
int k = j * 8 + i * txfm_size_col;
- load_buffer_32bit_to_16bit_neon(input_1, &a[k], txfm_size_col);
+ load_buffer_32bit_to_16bit_neon(input_1, &a[k], input_stride);
transpose_s16_8x8q(&a[k], &a[k]);
input_1 += 8;
}
- input += (txfm_size_col * 8);
+ input += (input_stride * 8);
if (abs(rect_type) == 1) {
int y = i * txfm_size_col;
- round_shift_for_rect(&a[y], &a[y], txfm_size_col);
+ round_shift_for_rect(&a[y], &a[y], input_stride);
}
row_txfm(&a[i * txfm_size_col], &a[i * txfm_size_col], cos_bit_row, 0);
av1_round_shift_array_16_neon(&a[i * txfm_size_col], txfm_size_col,
@@ -3083,36 +4199,6 @@ static INLINE void lowbd_inv_txfm2d_add_no_identity_neon(
}
}
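
The new input_stride = AOMMIN(32, txfm_size_col) in the default path reflects the coefficient layout for 64-point transforms: only the first 32 columns are ever coded, so a 64-wide row advances the input pointer by 32 entries rather than 64, and the rectangular round-shift is applied over that stored width as well. A small illustration of the addressing (the helper name is hypothetical):

#include <stdint.h>

/* Hypothetical illustration: row r of a block whose stored coefficient
 * width is min(32, tx_width) starts r * input_stride entries into `input`. */
static const int32_t *row_coeffs(const int32_t *input, int r, int tx_width) {
  const int input_stride = tx_width < 32 ? tx_width : 32;  /* AOMMIN(32, w) */
  return input + r * input_stride;
}
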
-static INLINE void lowbd_inv_txfm2d_add_wxh_universe_neon(
- const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
- TX_SIZE tx_size, int eob) {
- switch (tx_type) {
- case IDTX:
- lowbd_inv_txfm2d_add_wxh_idtx_neon(input, output, stride, tx_type,
- tx_size, eob);
- break;
-
- case H_DCT:
- case H_ADST:
- case H_FLIPADST:
- lowbd_inv_txfm2d_add_v_wxh_identity_neon(input, output, stride, tx_type,
- tx_size, eob);
- break;
-
- case V_DCT:
- case V_ADST:
- case V_FLIPADST:
- lowbd_inv_txfm2d_add_h_wxh_identity_neon(input, output, stride, tx_type,
- tx_size, eob);
- break;
-
- default:
- lowbd_inv_txfm2d_add_wxh_no_identity_neon(input, output, stride, tx_type,
- tx_size, eob);
- break;
- }
-}
-
static INLINE void lowbd_inv_txfm2d_add_universe_neon(
const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
TX_SIZE tx_size, int eob) {
@@ -3146,73 +4232,27 @@ static INLINE void lowbd_inv_txfm2d_add_universe_neon(
void av1_lowbd_inv_txfm2d_add_neon(const int32_t *input, uint8_t *output,
int stride, TX_TYPE tx_type, TX_SIZE tx_size,
int eob) {
- int row;
switch (tx_size) {
case TX_4X4:
- lowbd_inv_txfm2d_add_4x4_neon(input, output, stride, tx_type, tx_size,
- eob);
+ lowbd_inv_txfm2d_add_4x4_neon(input, output, stride, tx_type, eob);
break;
case TX_4X8:
- lowbd_inv_txfm2d_add_4x8_neon(input, output, stride, tx_type, tx_size,
- eob);
+ lowbd_inv_txfm2d_add_4x8_neon(input, output, stride, tx_type, eob);
break;
case TX_8X4:
- lowbd_inv_txfm2d_add_8x4_neon(input, output, stride, tx_type, tx_size,
- eob);
+ lowbd_inv_txfm2d_add_8x4_neon(input, output, stride, tx_type, eob);
break;
case TX_4X16:
- lowbd_inv_txfm2d_add_4x16_neon(input, output, stride, tx_type, tx_size,
- eob);
+ lowbd_inv_txfm2d_add_4x16_neon(input, output, stride, tx_type, eob);
break;
case TX_16X4:
- lowbd_inv_txfm2d_add_16x4_neon(input, output, stride, tx_type, tx_size,
- eob);
+ lowbd_inv_txfm2d_add_16x4_neon(input, output, stride, tx_type, eob);
break;
- case TX_16X64: {
- lowbd_inv_txfm2d_add_wxh_universe_neon(input, output, stride, tx_type,
- tx_size, eob);
- } break;
-
- case TX_64X16: {
- int32_t mod_input[64 * 16];
- for (row = 0; row < 16; ++row) {
- memcpy(mod_input + row * 64, input + row * 32, 32 * sizeof(*mod_input));
- memset(mod_input + row * 64 + 32, 0, 32 * sizeof(*mod_input));
- }
- lowbd_inv_txfm2d_add_wxh_universe_neon(mod_input, output, stride, tx_type,
- tx_size, eob);
- } break;
-
- case TX_32X64: {
- lowbd_inv_txfm2d_add_wxh_universe_neon(input, output, stride, tx_type,
- tx_size, eob);
- } break;
-
- case TX_64X32: {
- int32_t mod_input[64 * 32];
- for (row = 0; row < 32; ++row) {
- memcpy(mod_input + row * 64, input + row * 32, 32 * sizeof(*mod_input));
- memset(mod_input + row * 64 + 32, 0, 32 * sizeof(*mod_input));
- }
- lowbd_inv_txfm2d_add_wxh_universe_neon(mod_input, output, stride, tx_type,
- tx_size, eob);
- } break;
-
- case TX_64X64: {
- int32_t mod_input[64 * 64];
- for (row = 0; row < 32; ++row) {
- memcpy(mod_input + row * 64, input + row * 32, 32 * sizeof(*mod_input));
- memset(mod_input + row * 64 + 32, 0, 32 * sizeof(*mod_input));
- }
- lowbd_inv_txfm2d_add_wxh_universe_neon(mod_input, output, stride, tx_type,
- tx_size, eob);
- } break;
-
default:
lowbd_inv_txfm2d_add_universe_neon(input, output, stride, tx_type,
tx_size, eob);
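
Because the 64-point kernels are now entries in the zeros table, TX_16X64 through TX_64X64 fall through to lowbd_inv_txfm2d_add_universe_neon like every other size, and the per-case zero padding of the upper 32 coefficient columns is no longer needed. For reference, this is the padding the removed TX_64X32 branch performed (lifted directly from the deleted code):

#include <stdint.h>
#include <string.h>

/* Widen 32-column coefficient rows to a full 64-column buffer, zeroing the
 * columns that are never coded; the retained path avoids this copy by using
 * the smaller input stride instead. */
static void pad_64_wide_rows(int32_t *mod_input, const int32_t *input,
                             int rows) {
  for (int row = 0; row < rows; ++row) {
    memcpy(mod_input + row * 64, input + row * 32, 32 * sizeof(*mod_input));
    memset(mod_input + row * 64 + 32, 0, 32 * sizeof(*mod_input));
  }
}
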
diff --git a/media/libaom/src/av1/common/arm/av1_txfm_neon.c b/media/libaom/src/av1/common/arm/av1_txfm_neon.c
index de3c54724..7e3a05ab7 100644
--- a/media/libaom/src/av1/common/arm/av1_txfm_neon.c
+++ b/media/libaom/src/av1/common/arm/av1_txfm_neon.c
@@ -12,6 +12,8 @@
#include <arm_neon.h>
#include <assert.h>
+#include "config/av1_rtcd.h"
+
#include "aom_ports/mem.h"
#include "av1/common/arm/mem_neon.h"
diff --git a/media/libaom/src/av1/common/arm/cfl_neon.c b/media/libaom/src/av1/common/arm/cfl_neon.c
index 39025b5e5..371be5f0e 100644
--- a/media/libaom/src/av1/common/arm/cfl_neon.c
+++ b/media/libaom/src/av1/common/arm/cfl_neon.c
@@ -131,6 +131,7 @@ static void cfl_luma_subsampling_444_lbd_neon(const uint8_t *input,
} while ((pred_buf_q3 += CFL_BUF_LINE) < end);
}
+#if CONFIG_AV1_HIGHBITDEPTH
#ifndef __aarch64__
uint16x8_t vpaddq_u16(uint16x8_t a, uint16x8_t b) {
return vcombine_u16(vpadd_u16(vget_low_u16(a), vget_high_u16(a)),
@@ -247,6 +248,7 @@ static void cfl_luma_subsampling_444_hbd_neon(const uint16_t *input,
input += input_stride;
} while ((pred_buf_q3 += CFL_BUF_LINE) < end);
}
+#endif // CONFIG_AV1_HIGHBITDEPTH
CFL_GET_SUBSAMPLE_FUNCTION(neon)
@@ -511,6 +513,7 @@ static INLINE void cfl_predict_lbd_neon(const int16_t *pred_buf_q3,
CFL_PREDICT_FN(neon, lbd)
+#if CONFIG_AV1_HIGHBITDEPTH
static INLINE uint16x4_t clamp_s16(int16x4_t a, int16x4_t max) {
return vreinterpret_u16_s16(vmax_s16(vmin_s16(a, max), vdup_n_s16(0)));
}
@@ -582,3 +585,4 @@ static INLINE void cfl_predict_hbd_neon(const int16_t *pred_buf_q3,
}
CFL_PREDICT_FN(neon, hbd)
+#endif // CONFIG_AV1_HIGHBITDEPTH
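
The new CONFIG_AV1_HIGHBITDEPTH guards compile the high-bit-depth CfL subsampling and prediction helpers only when the build supports more than 8 bits. The clamp_s16 helper above restricts each predicted sample to [0, max]; a scalar counterpart, with the maximum taken as the largest value of a bd-bit pixel (an assumption about how `max` is derived):

#include <stdint.h>

/* Scalar counterpart of clamp_s16: keep a predicted sample inside the valid
 * pixel range [0, (1 << bd) - 1]. */
static uint16_t clamp_pixel_hbd(int32_t v, int bd) {
  const int32_t max = (1 << bd) - 1;
  if (v < 0) return 0;
  if (v > max) return (uint16_t)max;
  return (uint16_t)v;
}
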
diff --git a/media/libaom/src/av1/common/arm/convolve_neon.c b/media/libaom/src/av1/common/arm/convolve_neon.c
index d0c4f8ff6..51c96961c 100644
--- a/media/libaom/src/av1/common/arm/convolve_neon.c
+++ b/media/libaom/src/av1/common/arm/convolve_neon.c
@@ -195,12 +195,12 @@ void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
int dst_stride, int w, int h,
const InterpFilterParams *filter_params_x,
const InterpFilterParams *filter_params_y,
- const int subpel_x_q4, const int subpel_y_q4,
+ const int subpel_x_qn, const int subpel_y_qn,
ConvolveParams *conv_params) {
const uint8_t horiz_offset = filter_params_x->taps / 2 - 1;
const int8_t bits = FILTER_BITS - conv_params->round_0;
- (void)subpel_y_q4;
+ (void)subpel_y_qn;
(void)conv_params;
(void)filter_params_y;
@@ -214,7 +214,7 @@ void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
- filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+ filter_params_x, subpel_x_qn & SUBPEL_MASK);
const int16x8_t shift_round_0 = vdupq_n_s16(-conv_params->round_0);
const int16x8_t shift_by_bits = vdupq_n_s16(-bits);
@@ -603,14 +603,14 @@ void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
int dst_stride, int w, int h,
const InterpFilterParams *filter_params_x,
const InterpFilterParams *filter_params_y,
- const int subpel_x_q4, const int subpel_y_q4,
+ const int subpel_x_qn, const int subpel_y_qn,
ConvolveParams *conv_params) {
const int vert_offset = filter_params_y->taps / 2 - 1;
src -= vert_offset * src_stride;
(void)filter_params_x;
- (void)subpel_x_q4;
+ (void)subpel_x_qn;
(void)conv_params;
assert(conv_params->round_0 <= FILTER_BITS);
@@ -618,7 +618,7 @@ void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS)));
const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
- filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+ filter_params_y, subpel_y_qn & SUBPEL_MASK);
if (w <= 4) {
uint8x8_t d01;
@@ -844,17 +844,110 @@ void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
}
}
+// Horizontal filtering for convolve_2d_sr for width multiple of 8
+// Processes one row at a time
+static INLINE void horiz_filter_w8_single_row(
+ const uint8_t *src_ptr, int src_stride, int16_t *dst_ptr,
+ const int dst_stride, int width, int height, const int16_t *x_filter,
+ const int16x8_t horiz_const, const int16x8_t shift_round_0) {
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
+ do {
+ uint8x8_t t0 = vld1_u8(src_ptr);
+ s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); // a0 a1 a2 a3 a4 a5 a6 a7
+
+ int width_tmp = width;
+ const uint8_t *s = src_ptr + 8;
+ int16_t *dst_tmp = dst_ptr;
+
+ __builtin_prefetch(dst_ptr);
+
+ do {
+ t0 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15
+ s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ int16x8_t sum = s0;
+ s0 = s7;
+
+ s1 = vextq_s16(sum, s7, 1); // a1 a2 a3 a4 a5 a6 a7 a8
+ s2 = vextq_s16(sum, s7, 2); // a2 a3 a4 a5 a6 a7 a8 a9
+ s3 = vextq_s16(sum, s7, 3); // a3 a4 a5 a6 a7 a8 a9 a10
+ s4 = vextq_s16(sum, s7, 4); // a4 a5 a6 a7 a8 a9 a10 a11
+ s5 = vextq_s16(sum, s7, 5); // a5 a6 a7 a8 a9 a10 a11 a12
+ s6 = vextq_s16(sum, s7, 6); // a6 a7 a8 a9 a10 a11 a12 a13
+ s7 = vextq_s16(sum, s7, 7); // a7 a8 a9 a10 a11 a12 a13 a14
+
+ int16x8_t res0 = convolve8_8x8_s16(sum, s1, s2, s3, s4, s5, s6, s7,
+ x_filter, horiz_const, shift_round_0);
+
+ vst1q_s16(dst_tmp, res0);
+
+ s += 8;
+ dst_tmp += 8;
+ width_tmp -= 8;
+ } while (width_tmp > 0);
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ height--;
+ } while (height > 0);
+}
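
horiz_filter_w8_single_row keeps eight widened source samples in a register and uses vextq_s16 to build the seven shifted windows an 8-tap filter needs, so each inner iteration produces eight filtered outputs. The scalar operation being vectorised is the ordinary 8-tap horizontal convolution; a minimal sketch (the rounding behaviour of vqrshlq is omitted, so the exact scaling is an assumption):

#include <stdint.h>

/* One row of plain 8-tap horizontal filtering: each output is a dot product
 * of eight neighbouring input samples with the filter, plus an offset, then
 * shifted down.  `src` must provide width + 7 readable samples. */
static void horiz_filter_row_scalar(const uint8_t *src, int16_t *dst,
                                    int width, const int16_t filter[8],
                                    int32_t offset, int shift) {
  for (int x = 0; x < width; ++x) {
    int32_t sum = offset;
    for (int k = 0; k < 8; ++k) sum += (int32_t)src[x + k] * filter[k];
    dst[x] = (int16_t)(sum >> shift);
  }
}
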
+
+// Horizontal filtering for convolve_2d_sr for width <= 4
+// Processes one row at a time
+static INLINE void horiz_filter_w4_single_row(
+ const uint8_t *src_ptr, int src_stride, int16_t *dst_ptr,
+ const int dst_stride, int width, int height, const int16_t *x_filter,
+ const int16x4_t horiz_const, const int16x4_t shift_round_0) {
+ int16x4_t s0, s1, s2, s3, s4, s5, s6, s7;
+ do {
+ const uint8_t *s = src_ptr;
+
+ __builtin_prefetch(s);
+
+ uint8x8_t t0 = vld1_u8(s); // a0 a1 a2 a3 a4 a5 a6 a7
+ int16x8_t tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ s0 = vget_low_s16(tt0);
+ s4 = vget_high_s16(tt0);
+
+ __builtin_prefetch(dst_ptr);
+ s += 8;
+
+ t0 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15
+ s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+
+ s1 = vext_s16(s0, s4, 1); // a1 a2 a3 a4
+ s2 = vext_s16(s0, s4, 2); // a2 a3 a4 a5
+ s3 = vext_s16(s0, s4, 3); // a3 a4 a5 a6
+ s5 = vext_s16(s4, s7, 1); // a5 a6 a7 a8
+ s6 = vext_s16(s4, s7, 2); // a6 a7 a8 a9
+ s7 = vext_s16(s4, s7, 3); // a7 a8 a9 a10
+
+ int16x4_t d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
+ horiz_const, shift_round_0);
+
+ if (width == 4) {
+ vst1_s16(dst_ptr, d0);
+ dst_ptr += dst_stride;
+ } else if (width == 2) {
+ vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_s16(d0), 0);
+ dst_ptr += dst_stride;
+ }
+
+ src_ptr += src_stride;
+ height--;
+ } while (height > 0);
+}
+
void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
int dst_stride, int w, int h,
const InterpFilterParams *filter_params_x,
const InterpFilterParams *filter_params_y,
- const int subpel_x_q4, const int subpel_y_q4,
+ const int subpel_x_qn, const int subpel_y_qn,
ConvolveParams *conv_params) {
int im_dst_stride;
int width, height;
- uint8x8_t t0;
#if defined(__aarch64__)
+ uint8x8_t t0;
uint8x8_t t1, t2, t3, t4, t5, t6, t7;
+ const uint8_t *s;
#endif
DECLARE_ALIGNED(16, int16_t,
@@ -867,7 +960,7 @@ void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
const int horiz_offset = filter_params_x->taps / 2 - 1;
const uint8_t *src_ptr = src - vert_offset * src_stride - horiz_offset;
- const uint8_t *s;
+
int16_t *dst_ptr;
dst_ptr = im_block;
@@ -880,7 +973,7 @@ void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
const int16x8_t vec_round_bits = vdupq_n_s16(-round_bits);
const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
- filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+ filter_params_x, subpel_x_qn & SUBPEL_MASK);
int16_t x_filter_tmp[8];
int16x8_t filter_x_coef = vld1q_s16(x_filter);
@@ -893,18 +986,14 @@ void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
assert(conv_params->round_0 > 0);
if (w <= 4) {
- int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, d0;
-#if defined(__aarch64__)
- int16x4_t s8, s9, s10, d1, d2, d3;
-#endif
-
const int16x4_t horiz_const = vdup_n_s16((1 << (bd + FILTER_BITS - 2)));
const int16x4_t shift_round_0 = vdup_n_s16(-(conv_params->round_0 - 1));
+#if defined(__aarch64__)
+ int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
do {
+ assert(height >= 4);
s = src_ptr;
-
-#if defined(__aarch64__)
__builtin_prefetch(s + 0 * src_stride);
__builtin_prefetch(s + 1 * src_stride);
__builtin_prefetch(s + 2 * src_stride);
@@ -963,57 +1052,30 @@ void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
src_ptr += 4 * src_stride;
dst_ptr += 4 * im_dst_stride;
height -= 4;
-#else
- int16x8_t tt0;
-
- __builtin_prefetch(s);
-
- t0 = vld1_u8(s); // a0 a1 a2 a3 a4 a5 a6 a7
- tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
- s0 = vget_low_s16(tt0);
- s4 = vget_high_s16(tt0);
-
- __builtin_prefetch(dst_ptr);
- s += 8;
+ } while (height >= 4);
- t0 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15
- s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
-
- s1 = vext_s16(s0, s4, 1); // a1 a2 a3 a4
- s2 = vext_s16(s0, s4, 2); // a2 a3 a4 a5
- s3 = vext_s16(s0, s4, 3); // a3 a4 a5 a6
- s5 = vext_s16(s4, s7, 1); // a5 a6 a7 a8
- s6 = vext_s16(s4, s7, 2); // a6 a7 a8 a9
- s7 = vext_s16(s4, s7, 3); // a7 a8 a9 a10
-
- d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp,
- horiz_const, shift_round_0);
-
- if (w == 4) {
- vst1_s16(dst_ptr, d0);
- dst_ptr += im_dst_stride;
- } else if (w == 2) {
- vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_s16(d0), 0);
- dst_ptr += im_dst_stride;
- }
-
- src_ptr += src_stride;
- height -= 1;
-#endif
- } while (height > 0);
- } else {
- int16_t *d_tmp;
- int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, res0;
-#if defined(__aarch64__)
- int16x8_t s8, s9, s10, res1, res2, res3, res4, res5, res6, res7;
- int16x8_t s11, s12, s13, s14;
+ if (height) {
+ assert(height < 4);
+ horiz_filter_w4_single_row(src_ptr, src_stride, dst_ptr, im_dst_stride, w,
+ height, x_filter_tmp, horiz_const,
+ shift_round_0);
+ }
+#else
+ horiz_filter_w4_single_row(src_ptr, src_stride, dst_ptr, im_dst_stride, w,
+ height, x_filter_tmp, horiz_const,
+ shift_round_0);
#endif
+ } else {
const int16x8_t horiz_const = vdupq_n_s16((1 << (bd + FILTER_BITS - 2)));
const int16x8_t shift_round_0 = vdupq_n_s16(-(conv_params->round_0 - 1));
#if defined(__aarch64__)
+ int16_t *d_tmp;
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14;
+ int16x8_t res0, res1, res2, res3, res4, res5, res6, res7;
do {
+ assert(height >= 8);
__builtin_prefetch(src_ptr + 0 * src_stride);
__builtin_prefetch(src_ptr + 1 * src_stride);
__builtin_prefetch(src_ptr + 2 * src_stride);
@@ -1099,45 +1161,121 @@ void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
src_ptr += 8 * src_stride;
dst_ptr += 8 * im_dst_stride;
height -= 8;
- } while (height > 0);
-#else
- do {
- t0 = vld1_u8(src_ptr);
- s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); // a0 a1 a2 a3 a4 a5 a6 a7
+ } while (height >= 8);
- width = w;
- s = src_ptr + 8;
- d_tmp = dst_ptr;
+ if (height >= 4) {
+ assert(height < 8);
+ int16x4_t reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9,
+ reg10, reg11, reg12, reg13, reg14;
+ int16x4_t d0, d1, d2, d3, d4, d5, d6, d7;
+ int16x8_t out0, out1, out2, out3;
+
+ __builtin_prefetch(src_ptr + 0 * src_stride);
+ __builtin_prefetch(src_ptr + 1 * src_stride);
+ __builtin_prefetch(src_ptr + 2 * src_stride);
+ __builtin_prefetch(src_ptr + 3 * src_stride);
+
+ load_u8_8x4(src_ptr, src_stride, &t0, &t1, &t2, &t3);
+ transpose_u8_8x4(&t0, &t1, &t2, &t3);
- __builtin_prefetch(dst_ptr);
+ reg0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ reg1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+ reg2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+ reg3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+ reg4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ reg5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+ reg6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+
+ __builtin_prefetch(dst_ptr + 0 * dst_stride);
+ __builtin_prefetch(dst_ptr + 1 * dst_stride);
+ __builtin_prefetch(dst_ptr + 2 * dst_stride);
+ __builtin_prefetch(dst_ptr + 3 * dst_stride);
+
+ s = src_ptr + 7;
+ d_tmp = dst_ptr;
+ width = w;
do {
- t0 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15
- s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
- int16x8_t sum = s0;
- s0 = s7;
+ load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
+ transpose_u8_8x4(&t0, &t1, &t2, &t3);
- s1 = vextq_s16(sum, s7, 1); // a1 a2 a3 a4 a5 a6 a7 a8
- s2 = vextq_s16(sum, s7, 2); // a2 a3 a4 a5 a6 a7 a8 a9
- s3 = vextq_s16(sum, s7, 3); // a3 a4 a5 a6 a7 a8 a9 a10
- s4 = vextq_s16(sum, s7, 4); // a4 a5 a6 a7 a8 a9 a10 a11
- s5 = vextq_s16(sum, s7, 5); // a5 a6 a7 a8 a9 a10 a11 a12
- s6 = vextq_s16(sum, s7, 6); // a6 a7 a8 a9 a10 a11 a12 a13
- s7 = vextq_s16(sum, s7, 7); // a7 a8 a9 a10 a11 a12 a13 a14
+ reg7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ reg8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+ reg9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+ reg10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+ reg11 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ reg12 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+ reg13 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+ reg14 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
- res0 = convolve8_8x8_s16(sum, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp,
- horiz_const, shift_round_0);
+ d0 = convolve8_4x4(reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7,
+ x_filter_tmp);
+
+ d1 = convolve8_4x4(reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8,
+ x_filter_tmp);
+
+ d2 = convolve8_4x4(reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9,
+ x_filter_tmp);
+
+ d3 = convolve8_4x4(reg3, reg4, reg5, reg6, reg7, reg8, reg9, reg10,
+ x_filter_tmp);
+
+ d4 = convolve8_4x4(reg4, reg5, reg6, reg7, reg8, reg9, reg10, reg11,
+ x_filter_tmp);
+
+ d5 = convolve8_4x4(reg5, reg6, reg7, reg8, reg9, reg10, reg11, reg12,
+ x_filter_tmp);
+
+ d6 = convolve8_4x4(reg6, reg7, reg8, reg9, reg10, reg11, reg12, reg13,
+ x_filter_tmp);
+
+ d7 = convolve8_4x4(reg7, reg8, reg9, reg10, reg11, reg12, reg13, reg14,
+ x_filter_tmp);
+
+ transpose_s16_4x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7, &out0, &out1,
+ &out2, &out3);
- vst1q_s16(d_tmp, res0);
+ out0 = vaddq_s16(out0, horiz_const);
+ out0 = vqrshlq_s16(out0, shift_round_0);
+ out1 = vaddq_s16(out1, horiz_const);
+ out1 = vqrshlq_s16(out1, shift_round_0);
+
+ out2 = vaddq_s16(out2, horiz_const);
+ out2 = vqrshlq_s16(out2, shift_round_0);
+
+ out3 = vaddq_s16(out3, horiz_const);
+ out3 = vqrshlq_s16(out3, shift_round_0);
+
+ store_s16_8x4(d_tmp, im_dst_stride, out0, out1, out2, out3);
+
+ reg0 = reg8;
+ reg1 = reg9;
+ reg2 = reg10;
+ reg3 = reg11;
+ reg4 = reg12;
+ reg5 = reg13;
+ reg6 = reg14;
s += 8;
d_tmp += 8;
width -= 8;
} while (width > 0);
- src_ptr += src_stride;
- dst_ptr += im_dst_stride;
- height -= 1;
- } while (height > 0);
+ src_ptr += 4 * src_stride;
+ dst_ptr += 4 * im_dst_stride;
+ height -= 4;
+ }
+
+ if (height) {
+ assert(height < 4);
+ horiz_filter_w8_single_row(src_ptr, src_stride, dst_ptr, im_stride, w,
+ height, x_filter_tmp, horiz_const,
+ shift_round_0);
+ }
+#else
+
+ horiz_filter_w8_single_row(src_ptr, src_stride, dst_ptr, im_stride, w,
+ height, x_filter_tmp, horiz_const,
+ shift_round_0);
#endif
}
@@ -1149,7 +1287,7 @@ void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
const int32_t sub_const = (1 << (offset_bits - conv_params->round_1)) +
(1 << (offset_bits - conv_params->round_1 - 1));
const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
- filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+ filter_params_y, subpel_y_qn & SUBPEL_MASK);
const int32x4_t round_shift_vec = vdupq_n_s32(-(conv_params->round_1));
const int32x4_t offset_const = vdupq_n_s32(1 << offset_bits);
@@ -1409,12 +1547,12 @@ void av1_convolve_2d_copy_sr_neon(const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride, int w, int h,
const InterpFilterParams *filter_params_x,
const InterpFilterParams *filter_params_y,
- const int subpel_x_q4, const int subpel_y_q4,
+ const int subpel_x_qn, const int subpel_y_qn,
ConvolveParams *conv_params) {
(void)filter_params_x;
(void)filter_params_y;
- (void)subpel_x_q4;
- (void)subpel_y_q4;
+ (void)subpel_x_qn;
+ (void)subpel_y_qn;
(void)conv_params;
const uint8_t *src1;
diff --git a/media/libaom/src/av1/common/arm/convolve_neon.h b/media/libaom/src/av1/common/arm/convolve_neon.h
index f382984f2..dbcfab631 100644
--- a/media/libaom/src/av1/common/arm/convolve_neon.h
+++ b/media/libaom/src/av1/common/arm/convolve_neon.h
@@ -73,7 +73,7 @@ static INLINE uint16x8_t wiener_convolve8_horiz_8x8(
int32x4_t sum_0, sum_1;
int32x4_t s3_0, s3_1;
const int32_t round_const_0 = (1 << (bd + FILTER_BITS - 1));
- const int32_t round_const_1 = (1 << ((bd) + 1 + FILTER_BITS - round0_bits));
+ const int32_t round_const_1 = (1 << (bd + 1 + FILTER_BITS - round0_bits)) - 1;
/* for the purpose of right shift by { conv_params->round_0 } */
const int32x4_t round_bits = vdupq_n_s32(-round0_bits);
@@ -124,7 +124,7 @@ static INLINE uint16x4_t wiener_convolve8_horiz_4x8(
int16x4_t sum, temp0, temp1, temp2;
const int32_t round_const_0 = (1 << (bd + FILTER_BITS - 1));
- const int32_t round_const_1 = (1 << ((bd) + 1 + FILTER_BITS - round0_bits));
+ const int32_t round_const_1 = (1 << (bd + 1 + FILTER_BITS - round0_bits)) - 1;
const int32x4_t round_bits = vdupq_n_s32(-round0_bits);
const int32x4_t zero = vdupq_n_s32(0);
const int32x4_t round_vec_0 = vdupq_n_s32(round_const_0);
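
The corrected round_const_1 caps the intermediate output of the horizontal Wiener pass (the clamp itself is outside this hunk), and subtracting one makes the cap the largest representable intermediate value rather than one past it. For example, with bd = 8, FILTER_BITS = 7 and round0_bits = 3 (example values only) the bound becomes 8191 instead of 8192:

#include <assert.h>

enum { kBd = 8, kFilterBits = 7, kRound0Bits = 3 };  /* example values only */

static void check_round_const_1(void) {
  const int old_bound = 1 << (kBd + 1 + kFilterBits - kRound0Bits);
  const int new_bound = (1 << (kBd + 1 + kFilterBits - kRound0Bits)) - 1;
  assert(old_bound == 8192);
  assert(new_bound == 8191);
}
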
diff --git a/media/libaom/src/av1/common/arm/jnt_convolve_neon.c b/media/libaom/src/av1/common/arm/jnt_convolve_neon.c
index e5674ef7c..92112fb85 100644
--- a/media/libaom/src/av1/common/arm/jnt_convolve_neon.c
+++ b/media/libaom/src/av1/common/arm/jnt_convolve_neon.c
@@ -23,19 +23,17 @@
#include "av1/common/arm/transpose_neon.h"
#if !defined(__aarch64__)
-static INLINE void compute_avg_4x1(uint16x4_t res0, uint16x4_t d0,
- const uint16_t fwd_offset,
- const uint16_t bck_offset,
- const int16x4_t sub_const_vec,
- const int16_t round_bits,
- const int use_jnt_comp_avg, uint8x8_t *t0) {
+static INLINE void compute_avg_4x1(
+ uint16x4_t res0, uint16x4_t d0, const uint16_t fwd_offset,
+ const uint16_t bck_offset, const int16x4_t sub_const_vec,
+ const int16_t round_bits, const int use_dist_wtd_comp_avg, uint8x8_t *t0) {
int16x4_t tmp0;
uint16x4_t tmp_u0;
uint32x4_t sum0;
int32x4_t dst0;
int16x8_t tmp4;
- if (use_jnt_comp_avg) {
+ if (use_dist_wtd_comp_avg) {
const int32x4_t round_bits_vec = vdupq_n_s32((int32_t)(-round_bits));
sum0 = vmull_n_u16(res0, fwd_offset);
@@ -65,12 +63,10 @@ static INLINE void compute_avg_4x1(uint16x4_t res0, uint16x4_t d0,
}
}
-static INLINE void compute_avg_8x1(uint16x8_t res0, uint16x8_t d0,
- const uint16_t fwd_offset,
- const uint16_t bck_offset,
- const int16x4_t sub_const,
- const int16_t round_bits,
- const int use_jnt_comp_avg, uint8x8_t *t0) {
+static INLINE void compute_avg_8x1(
+ uint16x8_t res0, uint16x8_t d0, const uint16_t fwd_offset,
+ const uint16_t bck_offset, const int16x4_t sub_const,
+ const int16_t round_bits, const int use_dist_wtd_comp_avg, uint8x8_t *t0) {
int16x4_t tmp0, tmp2;
int16x8_t f0;
uint32x4_t sum0, sum2;
@@ -78,7 +74,7 @@ static INLINE void compute_avg_8x1(uint16x8_t res0, uint16x8_t d0,
uint16x8_t tmp_u0;
- if (use_jnt_comp_avg) {
+ if (use_dist_wtd_comp_avg) {
const int32x4_t sub_const_vec = vmovl_s16(sub_const);
const int32x4_t round_bits_vec = vdupq_n_s32(-(int32_t)round_bits);
@@ -123,7 +119,7 @@ static INLINE void compute_avg_4x4(
uint16x4_t d0, uint16x4_t d1, uint16x4_t d2, uint16x4_t d3,
const uint16_t fwd_offset, const uint16_t bck_offset,
const int16x4_t sub_const_vec, const int16_t round_bits,
- const int use_jnt_comp_avg, uint8x8_t *t0, uint8x8_t *t1) {
+ const int use_dist_wtd_comp_avg, uint8x8_t *t0, uint8x8_t *t1) {
int16x4_t tmp0, tmp1, tmp2, tmp3;
uint16x4_t tmp_u0, tmp_u1, tmp_u2, tmp_u3;
uint32x4_t sum0, sum1, sum2, sum3;
@@ -132,7 +128,7 @@ static INLINE void compute_avg_4x4(
int16x8_t tmp4, tmp5;
const int16x8_t zero = vdupq_n_s16(0);
- if (use_jnt_comp_avg) {
+ if (use_dist_wtd_comp_avg) {
const int32x4_t round_bits_vec = vdupq_n_s32((int32_t)(-round_bits));
const int32x4_t const_vec = vmovl_s16(sub_const_vec);
@@ -203,8 +199,8 @@ static INLINE void compute_avg_8x4(
uint16x8_t d0, uint16x8_t d1, uint16x8_t d2, uint16x8_t d3,
const uint16_t fwd_offset, const uint16_t bck_offset,
const int16x4_t sub_const, const int16_t round_bits,
- const int use_jnt_comp_avg, uint8x8_t *t0, uint8x8_t *t1, uint8x8_t *t2,
- uint8x8_t *t3) {
+ const int use_dist_wtd_comp_avg, uint8x8_t *t0, uint8x8_t *t1,
+ uint8x8_t *t2, uint8x8_t *t3) {
int16x4_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
int16x8_t f0, f1, f2, f3;
uint32x4_t sum0, sum1, sum2, sum3;
@@ -214,7 +210,7 @@ static INLINE void compute_avg_8x4(
uint16x8_t tmp_u0, tmp_u1, tmp_u2, tmp_u3;
const int16x8_t zero = vdupq_n_s16(0);
- if (use_jnt_comp_avg) {
+ if (use_dist_wtd_comp_avg) {
const int32x4_t sub_const_vec = vmovl_s16(sub_const);
const int32x4_t round_bits_vec = vdupq_n_s32(-(int32_t)round_bits);
@@ -319,7 +315,7 @@ static INLINE void compute_avg_8x4(
}
}
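
The renamed use_dist_wtd_comp_avg flag chooses between a plain average of the two compound predictions and a distance-weighted one, where each prediction is scaled by its offset before the sum is normalised. A scalar sketch of both branches, assuming the usual 4-bit weight precision in which fwd_offset + bck_offset == 16:

#include <stdint.h>

/* Compound average of two stored predictions p0 and p1.  The result still
 * carries the intermediate offset, which the caller subtracts and rounds
 * away afterwards (as compute_avg_* above does with sub_const/round_bits). */
static int32_t compound_avg(uint16_t p0, uint16_t p1, uint16_t fwd_offset,
                            uint16_t bck_offset, int use_dist_wtd_comp_avg) {
  if (use_dist_wtd_comp_avg)  /* assumes DIST_PRECISION_BITS == 4 */
    return ((int32_t)p0 * fwd_offset + (int32_t)p1 * bck_offset) >> 4;
  return ((int32_t)p0 + (int32_t)p1) >> 1;
}
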
-static INLINE void jnt_convolve_2d_horiz_neon(
+static INLINE void dist_wtd_convolve_2d_horiz_neon(
const uint8_t *src, int src_stride, int16_t *im_block, const int im_stride,
int16_t *x_filter_tmp, const int im_h, int w, const int round_0) {
const int bd = 8;
@@ -563,7 +559,7 @@ static INLINE void jnt_convolve_2d_horiz_neon(
}
}
-static INLINE void jnt_convolve_2d_vert_neon(
+static INLINE void dist_wtd_convolve_2d_vert_neon(
int16_t *im_block, const int im_stride, uint8_t *dst8, int dst8_stride,
ConvolveParams *conv_params, const int16_t *y_filter, int h, int w) {
uint8_t *dst_u8_ptr, *d_u8;
@@ -587,7 +583,7 @@ static INLINE void jnt_convolve_2d_vert_neon(
const uint16_t fwd_offset = conv_params->fwd_offset;
const uint16_t bck_offset = conv_params->bck_offset;
const int do_average = conv_params->do_average;
- const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+ const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
int16x4_t s0, s1, s2, s3, s4, s5, s6, s7;
uint16x4_t res4, d0;
@@ -652,8 +648,8 @@ static INLINE void jnt_convolve_2d_vert_neon(
d += (dst_stride << 2);
compute_avg_4x4(res4, res5, res6, res7, d0, d1, d2, d3, fwd_offset,
- bck_offset, sub_const_vec, round_bits, use_jnt_comp_avg,
- &t0, &t1);
+ bck_offset, sub_const_vec, round_bits,
+ use_dist_wtd_comp_avg, &t0, &t1);
vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), 0);
d_u8 += dst8_stride;
@@ -691,7 +687,7 @@ static INLINE void jnt_convolve_2d_vert_neon(
d += (dst_stride);
compute_avg_4x1(res4, d0, fwd_offset, bck_offset, sub_const_vec,
- round_bits, use_jnt_comp_avg, &t0);
+ round_bits, use_dist_wtd_comp_avg, &t0);
vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), 0);
d_u8 += dst8_stride;
@@ -717,12 +713,12 @@ static INLINE void jnt_convolve_2d_vert_neon(
} while (w > 0);
}
-void av1_jnt_convolve_2d_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
- int dst8_stride, int w, int h,
- const InterpFilterParams *filter_params_x,
- const InterpFilterParams *filter_params_y,
- const int subpel_x_q4, const int subpel_y_q4,
- ConvolveParams *conv_params) {
+void av1_dist_wtd_convolve_2d_neon(const uint8_t *src, int src_stride,
+ uint8_t *dst8, int dst8_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_qn, const int subpel_y_qn,
+ ConvolveParams *conv_params) {
assert(!(w % 4));
assert(!(h % 4));
@@ -736,9 +732,9 @@ void av1_jnt_convolve_2d_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
const int round_0 = conv_params->round_0 - 1;
const uint8_t *src_ptr = src - vert_offset * src_stride - horiz_offset;
const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
- filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+ filter_params_x, subpel_x_qn & SUBPEL_MASK);
const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
- filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+ filter_params_y, subpel_y_qn & SUBPEL_MASK);
int16_t x_filter_tmp[8];
int16x8_t filter_x_coef = vld1q_s16(x_filter);
@@ -748,19 +744,18 @@ void av1_jnt_convolve_2d_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
filter_x_coef = vshrq_n_s16(filter_x_coef, 1);
vst1q_s16(&x_filter_tmp[0], filter_x_coef);
- jnt_convolve_2d_horiz_neon(src_ptr, src_stride, im_block, im_stride,
- x_filter_tmp, im_h, w, round_0);
+ dist_wtd_convolve_2d_horiz_neon(src_ptr, src_stride, im_block, im_stride,
+ x_filter_tmp, im_h, w, round_0);
- jnt_convolve_2d_vert_neon(im_block, im_stride, dst8, dst8_stride, conv_params,
- y_filter, h, w);
+ dist_wtd_convolve_2d_vert_neon(im_block, im_stride, dst8, dst8_stride,
+ conv_params, y_filter, h, w);
}
-void av1_jnt_convolve_2d_copy_neon(const uint8_t *src, int src_stride,
- uint8_t *dst8, int dst8_stride, int w, int h,
- const InterpFilterParams *filter_params_x,
- const InterpFilterParams *filter_params_y,
- const int subpel_x_q4, const int subpel_y_q4,
- ConvolveParams *conv_params) {
+void av1_dist_wtd_convolve_2d_copy_neon(
+ const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w,
+ int h, const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y, const int subpel_x_qn,
+ const int subpel_y_qn, ConvolveParams *conv_params) {
uint8x8_t res0_8, res1_8, res2_8, res3_8, tmp_shift0, tmp_shift1, tmp_shift2,
tmp_shift3;
uint16x8_t res_q0, res_q1, res_q2, res_q3, tmp_q0, tmp_q1, tmp_q2, tmp_q3;
@@ -783,8 +778,8 @@ void av1_jnt_convolve_2d_copy_neon(const uint8_t *src, int src_stride,
(void)filter_params_x;
(void)filter_params_y;
- (void)subpel_x_q4;
- (void)subpel_y_q4;
+ (void)subpel_x_qn;
+ (void)subpel_y_qn;
if (!(w & 0x07)) {
for (y = 0; y < (h >> 2); ++y) {
@@ -811,7 +806,7 @@ void av1_jnt_convolve_2d_copy_neon(const uint8_t *src, int src_stride,
compute_avg_8x4(tmp_q0, tmp_q1, tmp_q2, tmp_q3, res_q0, res_q1,
res_q2, res_q3, conv_params->fwd_offset,
conv_params->bck_offset, sub_const_vec, bits,
- conv_params->use_jnt_comp_avg, &tmp_shift0,
+ conv_params->use_dist_wtd_comp_avg, &tmp_shift0,
&tmp_shift1, &tmp_shift2, &tmp_shift3);
vst1_u8(dst8_1 + (0 * dst8_stride), tmp_shift0);
@@ -854,7 +849,7 @@ void av1_jnt_convolve_2d_copy_neon(const uint8_t *src, int src_stride,
compute_avg_4x4(tmp4, tmp5, tmp6, tmp7, res4, res5, res6, res7,
conv_params->fwd_offset, conv_params->bck_offset,
- sub_const_vec, bits, conv_params->use_jnt_comp_avg,
+ sub_const_vec, bits, conv_params->use_dist_wtd_comp_avg,
&tmp_shift0, &tmp_shift1);
vst1_lane_u32((uint32_t *)(dst8_1), vreinterpret_u32_u8(tmp_shift0), 0);
@@ -881,12 +876,12 @@ void av1_jnt_convolve_2d_copy_neon(const uint8_t *src, int src_stride,
}
}
-void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
- int dst8_stride, int w, int h,
- const InterpFilterParams *filter_params_x,
- const InterpFilterParams *filter_params_y,
- const int subpel_x_q4, const int subpel_y_q4,
- ConvolveParams *conv_params) {
+void av1_dist_wtd_convolve_x_neon(const uint8_t *src, int src_stride,
+ uint8_t *dst8, int dst8_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_qn, const int subpel_y_qn,
+ ConvolveParams *conv_params) {
assert(!(w % 4));
assert(!(h % 4));
@@ -902,14 +897,14 @@ void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
const uint16_t fwd_offset = conv_params->fwd_offset;
const uint16_t bck_offset = conv_params->bck_offset;
- const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+ const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
(void)filter_params_y;
- (void)subpel_y_q4;
+ (void)subpel_y_qn;
// horizontal filter
const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
- filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+ filter_params_x, subpel_x_qn & SUBPEL_MASK);
const uint8_t *src_ptr = src - horiz_offset;
@@ -1031,8 +1026,8 @@ void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
compute_avg_4x4(res4, res5, res6, res7, vreinterpret_u16_s16(d0),
vreinterpret_u16_s16(d1), vreinterpret_u16_s16(d2),
vreinterpret_u16_s16(d3), fwd_offset, bck_offset,
- round_offset_vec, round_bits, use_jnt_comp_avg, &t0,
- &t1);
+ round_offset_vec, round_bits, use_dist_wtd_comp_avg,
+ &t0, &t1);
vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0),
0); // 00 01 02 03
@@ -1103,7 +1098,7 @@ void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
compute_avg_4x1(res4, vreinterpret_u16_s16(d0), fwd_offset,
bck_offset, round_offset_vec, round_bits,
- use_jnt_comp_avg, &t0);
+ use_dist_wtd_comp_avg, &t0);
vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0),
0); // 00 01 02 03
@@ -1231,11 +1226,12 @@ void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
load_u16_8x4(d_tmp, dst_stride, &res8, &res9, &res10, &res11);
d_tmp += (dst_stride << 2);
- compute_avg_8x4(
- res8, res9, res10, res11, vreinterpretq_u16_s16(res0),
- vreinterpretq_u16_s16(res1), vreinterpretq_u16_s16(res2),
- vreinterpretq_u16_s16(res3), fwd_offset, bck_offset,
- round_offset64, round_bits, use_jnt_comp_avg, &t0, &t1, &t2, &t3);
+ compute_avg_8x4(res8, res9, res10, res11, vreinterpretq_u16_s16(res0),
+ vreinterpretq_u16_s16(res1),
+ vreinterpretq_u16_s16(res2),
+ vreinterpretq_u16_s16(res3), fwd_offset, bck_offset,
+ round_offset64, round_bits, use_dist_wtd_comp_avg,
+ &t0, &t1, &t2, &t3);
store_u8_8x4(d_u8, dst8_stride, t0, t1, t2, t3);
d_u8 += (dst8_stride << 2);
@@ -1243,11 +1239,12 @@ void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
load_u16_8x4(d_tmp, dst_stride, &res8, &res9, &res10, &res11);
d_tmp += (dst_stride << 2);
- compute_avg_8x4(
- res8, res9, res10, res11, vreinterpretq_u16_s16(res4),
- vreinterpretq_u16_s16(res5), vreinterpretq_u16_s16(res6),
- vreinterpretq_u16_s16(res7), fwd_offset, bck_offset,
- round_offset64, round_bits, use_jnt_comp_avg, &t0, &t1, &t2, &t3);
+ compute_avg_8x4(res8, res9, res10, res11, vreinterpretq_u16_s16(res4),
+ vreinterpretq_u16_s16(res5),
+ vreinterpretq_u16_s16(res6),
+ vreinterpretq_u16_s16(res7), fwd_offset, bck_offset,
+ round_offset64, round_bits, use_dist_wtd_comp_avg,
+ &t0, &t1, &t2, &t3);
store_u8_8x4(d_u8, dst8_stride, t0, t1, t2, t3);
d_u8 += (dst8_stride << 2);
@@ -1319,7 +1316,7 @@ void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
compute_avg_8x1(res8, vreinterpretq_u16_s16(res0), fwd_offset,
bck_offset, round_offset64, round_bits,
- use_jnt_comp_avg, &t0);
+ use_dist_wtd_comp_avg, &t0);
vst1_u8(d_u8, t0);
d_u8 += (dst8_stride);
@@ -1342,12 +1339,12 @@ void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
}
}
-void av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
- int dst8_stride, int w, int h,
- const InterpFilterParams *filter_params_x,
- const InterpFilterParams *filter_params_y,
- const int subpel_x_q4, const int subpel_y_q4,
- ConvolveParams *conv_params) {
+void av1_dist_wtd_convolve_y_neon(const uint8_t *src, int src_stride,
+ uint8_t *dst8, int dst8_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_qn, const int subpel_y_qn,
+ ConvolveParams *conv_params) {
assert(!(w % 4));
assert(!(h % 4));
@@ -1363,15 +1360,15 @@ void av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
const uint16_t fwd_offset = conv_params->fwd_offset;
const uint16_t bck_offset = conv_params->bck_offset;
- const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+ const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
const int shift_value = (conv_params->round_1 - 1 - bits);
(void)filter_params_x;
- (void)subpel_x_q4;
+ (void)subpel_x_qn;
// vertical filter
const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
- filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+ filter_params_y, subpel_y_qn & SUBPEL_MASK);
const uint8_t *src_ptr = src - (vert_offset * src_stride);
@@ -1489,8 +1486,8 @@ void av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
compute_avg_4x4(res4, res5, res6, res7, vreinterpret_u16_s16(d0),
vreinterpret_u16_s16(d1), vreinterpret_u16_s16(d2),
vreinterpret_u16_s16(d3), fwd_offset, bck_offset,
- round_offset64, round_bits, use_jnt_comp_avg, &t0,
- &t1);
+ round_offset64, round_bits, use_dist_wtd_comp_avg,
+ &t0, &t1);
vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), 0);
d_u8 += dst8_stride;
@@ -1535,7 +1532,7 @@ void av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
compute_avg_4x1(res4, vreinterpret_u16_s16(d0), fwd_offset,
bck_offset, round_offset64, round_bits,
- use_jnt_comp_avg, &t0);
+ use_dist_wtd_comp_avg, &t0);
vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), 0);
d_u8 += dst8_stride;
@@ -1654,11 +1651,12 @@ void av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
load_u16_8x4(d_tmp, dst_stride, &res8, &res9, &res10, &res11);
d_tmp += (dst_stride << 2);
- compute_avg_8x4(
- res8, res9, res10, res11, vreinterpretq_u16_s16(res0),
- vreinterpretq_u16_s16(res1), vreinterpretq_u16_s16(res2),
- vreinterpretq_u16_s16(res3), fwd_offset, bck_offset,
- round_offset64, round_bits, use_jnt_comp_avg, &t0, &t1, &t2, &t3);
+ compute_avg_8x4(res8, res9, res10, res11, vreinterpretq_u16_s16(res0),
+ vreinterpretq_u16_s16(res1),
+ vreinterpretq_u16_s16(res2),
+ vreinterpretq_u16_s16(res3), fwd_offset, bck_offset,
+ round_offset64, round_bits, use_dist_wtd_comp_avg,
+ &t0, &t1, &t2, &t3);
store_u8_8x4(d_u8, dst8_stride, t0, t1, t2, t3);
d_u8 += (dst8_stride << 2);
@@ -1666,11 +1664,12 @@ void av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
load_u16_8x4(d_tmp, dst_stride, &res8, &res9, &res10, &res11);
d_tmp += (dst_stride << 2);
- compute_avg_8x4(
- res8, res9, res10, res11, vreinterpretq_u16_s16(res4),
- vreinterpretq_u16_s16(res5), vreinterpretq_u16_s16(res6),
- vreinterpretq_u16_s16(res7), fwd_offset, bck_offset,
- round_offset64, round_bits, use_jnt_comp_avg, &t0, &t1, &t2, &t3);
+ compute_avg_8x4(res8, res9, res10, res11, vreinterpretq_u16_s16(res4),
+ vreinterpretq_u16_s16(res5),
+ vreinterpretq_u16_s16(res6),
+ vreinterpretq_u16_s16(res7), fwd_offset, bck_offset,
+ round_offset64, round_bits, use_dist_wtd_comp_avg,
+ &t0, &t1, &t2, &t3);
store_u8_8x4(d_u8, dst8_stride, t0, t1, t2, t3);
d_u8 += (dst8_stride << 2);
@@ -1718,7 +1717,7 @@ void av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
compute_avg_8x1(res8, vreinterpretq_u16_s16(res0), fwd_offset,
bck_offset, round_offset64, round_bits,
- use_jnt_comp_avg, &t0);
+ use_dist_wtd_comp_avg, &t0);
vst1_u8(d_u8, t0);
d_u8 += (dst8_stride);
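The renames in this file track libaom's switch from the old "jnt" naming to "dist_wtd" (distance-weighted) for compound prediction; the arithmetic itself is unchanged. For reference, the compute_avg_* helpers vectorize a per-pixel weighted average along the following lines. This is a minimal scalar sketch, assuming libaom's usual DIST_PRECISION_BITS of 4 and an 8-bit output clamp; those constants are assumptions, not taken from this hunk.

static int dist_wtd_avg_pixel(int pred0, int pred1, int fwd_offset,
                              int bck_offset, int sub_const, int round_bits) {
  /* Weighted sum of the two intermediate predictions; the two weights are
   * assumed to sum to 1 << DIST_PRECISION_BITS (i.e. 16). */
  int avg = (pred0 * fwd_offset + pred1 * bck_offset) >> 4;
  /* Remove the compound rounding offset, then apply the final rounding
   * shift, mirroring the vqrshl with a negative shift amount above. */
  int out = (avg - sub_const + (1 << (round_bits - 1))) >> round_bits;
  /* Clamp to the 8-bit pixel range, as the saturating narrows do. */
  if (out < 0) out = 0;
  if (out > 255) out = 255;
  return out;
}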
diff --git a/media/libaom/src/av1/common/arm/mem_neon.h b/media/libaom/src/av1/common/arm/mem_neon.h
index c4ae2e784..171055fe1 100644
--- a/media/libaom/src/av1/common/arm/mem_neon.h
+++ b/media/libaom/src/av1/common/arm/mem_neon.h
@@ -13,6 +13,7 @@
#include <arm_neon.h>
#include <string.h>
+#include "aom_dsp/aom_dsp_common.h"
static INLINE void store_row2_u8_8x8(uint8_t *s, int p, const uint8x8_t s0,
const uint8x8_t s1) {
@@ -315,6 +316,26 @@ static INLINE void load_s16_8x4(const int16_t *s, ptrdiff_t p,
*s3 = vld1q_s16(s);
}
+// Load 4 sets of 4 bytes when alignment is not guaranteed.
+static INLINE uint8x16_t load_unaligned_u8q(const uint8_t *buf, int stride) {
+ uint32_t a;
+ uint32x4_t a_u32 = vdupq_n_u32(0);
+ if (stride == 4) return vld1q_u8(buf);
+ memcpy(&a, buf, 4);
+ buf += stride;
+ a_u32 = vsetq_lane_u32(a, a_u32, 0);
+ memcpy(&a, buf, 4);
+ buf += stride;
+ a_u32 = vsetq_lane_u32(a, a_u32, 1);
+ memcpy(&a, buf, 4);
+ buf += stride;
+ a_u32 = vsetq_lane_u32(a, a_u32, 2);
+ memcpy(&a, buf, 4);
+ buf += stride;
+ a_u32 = vsetq_lane_u32(a, a_u32, 3);
+ return vreinterpretq_u8_u32(a_u32);
+}
+
static INLINE void load_unaligned_u8_4x8(const uint8_t *buf, int stride,
uint32x2_t *tu0, uint32x2_t *tu1,
uint32x2_t *tu2, uint32x2_t *tu3) {
@@ -383,6 +404,15 @@ static INLINE void load_unaligned_u8_4x2(const uint8_t *buf, int stride,
*tu0 = vset_lane_u32(a, *tu0, 1);
}
+/* These intrinsics require immediate values, so we must use #defines
+ to enforce that. */
+#define store_unaligned_u8_4x1(dst, src, lane) \
+ do { \
+ uint32_t a; \
+ a = vget_lane_u32(vreinterpret_u32_u8(src), lane); \
+ memcpy(dst, &a, 4); \
+ } while (0)
+
static INLINE void load_unaligned_u8_2x2(const uint8_t *buf, int stride,
uint16x4_t *tu0) {
uint16_t a;
@@ -491,4 +521,19 @@ static INLINE void store_u32_4x4(uint32_t *s, int32_t p, uint32x4_t s1,
vst1q_u32(s, s4);
}
+static INLINE int16x8_t load_tran_low_to_s16q(const tran_low_t *buf) {
+ const int32x4_t v0 = vld1q_s32(buf);
+ const int32x4_t v1 = vld1q_s32(buf + 4);
+ const int16x4_t s0 = vmovn_s32(v0);
+ const int16x4_t s1 = vmovn_s32(v1);
+ return vcombine_s16(s0, s1);
+}
+
+static INLINE void store_s16q_to_tran_low(tran_low_t *buf, const int16x8_t a) {
+ const int32x4_t v0 = vmovl_s16(vget_low_s16(a));
+ const int32x4_t v1 = vmovl_s16(vget_high_s16(a));
+ vst1q_s32(buf, v0);
+ vst1q_s32(buf + 4, v1);
+}
+
#endif // AOM_AV1_COMMON_ARM_MEM_NEON_H_
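The new unaligned helpers above are self-contained; here is a small usage sketch (the function and buffer names are hypothetical) that copies a 4x4 block between two strided, possibly unaligned byte buffers.

static void copy_4x4_strided(const uint8_t *src, int src_stride, uint8_t *dst,
                             int dst_stride) {
  /* Four rows of four pixels gathered into one 128-bit register. */
  const uint8x16_t rows = load_unaligned_u8q(src, src_stride);
  const uint8x8_t lo = vget_low_u8(rows);  /* rows 0 and 1 */
  const uint8x8_t hi = vget_high_u8(rows); /* rows 2 and 3 */
  store_unaligned_u8_4x1(dst + 0 * dst_stride, lo, 0);
  store_unaligned_u8_4x1(dst + 1 * dst_stride, lo, 1);
  store_unaligned_u8_4x1(dst + 2 * dst_stride, hi, 0);
  store_unaligned_u8_4x1(dst + 3 * dst_stride, hi, 1);
}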
diff --git a/media/libaom/src/av1/common/arm/selfguided_neon.c b/media/libaom/src/av1/common/arm/selfguided_neon.c
index b3a37c4cb..fc404a64a 100644
--- a/media/libaom/src/av1/common/arm/selfguided_neon.c
+++ b/media/libaom/src/av1/common/arm/selfguided_neon.c
@@ -19,8 +19,8 @@
#include "aom_dsp/txfm_common.h"
#include "aom_mem/aom_mem.h"
#include "aom_ports/mem.h"
+#include "av1/common/av1_common_int.h"
#include "av1/common/common.h"
-#include "av1/common/onyxc_int.h"
#include "av1/common/resize.h"
#include "av1/common/restoration.h"
#include "av1/common/arm/mem_neon.h"
@@ -86,7 +86,7 @@ static INLINE void calc_ab_fast_internal_common(
for (int x = 0; x < 4; x++) {
for (int y = 0; y < 4; y++) {
- dst_A16[x * buf_stride + y] = x_by_xplus1[src1[x * buf_stride + y]];
+ dst_A16[x * buf_stride + y] = av1_x_by_xplus1[src1[x * buf_stride + y]];
}
}
load_u16_4x4(dst_A16, buf_stride, &d0, &d1, &d2, &d3);
@@ -214,7 +214,7 @@ static INLINE void calc_ab_internal_common(
for (int x = 0; x < 4; x++) {
for (int y = 0; y < 8; y++) {
- dst_A16[x * buf_stride + y] = x_by_xplus1[src1[x * buf_stride + y]];
+ dst_A16[x * buf_stride + y] = av1_x_by_xplus1[src1[x * buf_stride + y]];
}
}
load_u16_8x4(dst_A16, buf_stride, &s16_4, &s16_5, &s16_6, &s16_7);
@@ -376,6 +376,21 @@ static INLINE void boxsum2(int16_t *src, const int src_stride, int16_t *dst16,
w -= 8;
count++;
} while (w > 0);
+
+ // memset needed for row pixels: the 2nd stage of the boxsum filter uses the
+ // first 2 rows of the dst16 and dst2 buffers, which are not filled in the
+ // first stage.
+ for (int x = 0; x < 2; x++) {
+ memset(dst16 + x * dst_stride, 0, (width + 4) * sizeof(*dst16));
+ memset(dst2 + x * dst_stride, 0, (width + 4) * sizeof(*dst2));
+ }
+
+ // memset needed for extra columns: the 2nd stage of the boxsum filter uses
+ // the last 2 columns of the dst16 and dst2 buffers, which are not filled in
+ // the first stage.
+ for (int x = 2; x < height + 2; x++) {
+ int dst_offset = x * dst_stride + width + 2;
+ memset(dst16 + dst_offset, 0, 3 * sizeof(*dst16));
+ memset(dst2 + dst_offset, 0, 3 * sizeof(*dst2));
+ }
}
{
@@ -467,7 +482,7 @@ static INLINE void calc_ab_internal_lbd(int32_t *A, uint16_t *A16,
const uint32_t n = (2 * r + 1) * (2 * r + 1);
const uint32x4_t const_n_val = vdupq_n_u32(n);
const uint16x8_t sgrproj_sgr = vdupq_n_u16(SGRPROJ_SGR);
- const uint16x4_t one_by_n_minus_1_vec = vdup_n_u16(one_by_x[n - 1]);
+ const uint16x4_t one_by_n_minus_1_vec = vdup_n_u16(av1_one_by_x[n - 1]);
const uint32x4_t const_val = vdupq_n_u32(255);
uint16x8_t s16_0, s16_1, s16_2, s16_3, s16_4, s16_5, s16_6, s16_7;
@@ -509,6 +524,7 @@ static INLINE void calc_ab_internal_lbd(int32_t *A, uint16_t *A16,
} while (h > 0);
}
+#if CONFIG_AV1_HIGHBITDEPTH
static INLINE void calc_ab_internal_hbd(int32_t *A, uint16_t *A16,
uint16_t *B16, int32_t *B,
const int buf_stride, const int width,
@@ -522,7 +538,7 @@ static INLINE void calc_ab_internal_hbd(int32_t *A, uint16_t *A16,
const int32x4_t bd_min_1_vec = vdupq_n_s32(-((bit_depth - 8) << 1));
const uint32x4_t const_n_val = vdupq_n_u32(n);
const uint16x8_t sgrproj_sgr = vdupq_n_u16(SGRPROJ_SGR);
- const uint16x4_t one_by_n_minus_1_vec = vdup_n_u16(one_by_x[n - 1]);
+ const uint16x4_t one_by_n_minus_1_vec = vdup_n_u16(av1_one_by_x[n - 1]);
const uint32x4_t const_val = vdupq_n_u32(255);
int32x4_t sr0, sr1, sr2, sr3, sr4, sr5, sr6, sr7;
@@ -573,6 +589,7 @@ static INLINE void calc_ab_internal_hbd(int32_t *A, uint16_t *A16,
h -= (ht_inc * 4);
} while (h > 0);
}
+#endif // CONFIG_AV1_HIGHBITDEPTH
static INLINE void calc_ab_fast_internal_lbd(int32_t *A, uint16_t *A16,
int32_t *B, const int buf_stride,
@@ -584,7 +601,7 @@ static INLINE void calc_ab_fast_internal_lbd(int32_t *A, uint16_t *A16,
const uint32_t n = (2 * r + 1) * (2 * r + 1);
const uint32x4_t const_n_val = vdupq_n_u32(n);
const uint16x4_t sgrproj_sgr = vdup_n_u16(SGRPROJ_SGR);
- const uint32x4_t one_by_n_minus_1_vec = vdupq_n_u32(one_by_x[n - 1]);
+ const uint32x4_t one_by_n_minus_1_vec = vdupq_n_u32(av1_one_by_x[n - 1]);
const uint32x4_t const_val = vdupq_n_u32(255);
int32x4_t sr0, sr1, sr2, sr3, sr4, sr5, sr6, sr7;
@@ -626,6 +643,7 @@ static INLINE void calc_ab_fast_internal_lbd(int32_t *A, uint16_t *A16,
} while (h > 0);
}
+#if CONFIG_AV1_HIGHBITDEPTH
static INLINE void calc_ab_fast_internal_hbd(int32_t *A, uint16_t *A16,
int32_t *B, const int buf_stride,
const int width, const int height,
@@ -638,7 +656,7 @@ static INLINE void calc_ab_fast_internal_hbd(int32_t *A, uint16_t *A16,
const int32x4_t bd_min_1_vec = vdupq_n_s32(-((bit_depth - 8) << 1));
const uint32x4_t const_n_val = vdupq_n_u32(n);
const uint16x4_t sgrproj_sgr = vdup_n_u16(SGRPROJ_SGR);
- const uint32x4_t one_by_n_minus_1_vec = vdupq_n_u32(one_by_x[n - 1]);
+ const uint32x4_t one_by_n_minus_1_vec = vdupq_n_u32(av1_one_by_x[n - 1]);
const uint32x4_t const_val = vdupq_n_u32(255);
int32x4_t sr0, sr1, sr2, sr3, sr4, sr5, sr6, sr7;
@@ -679,6 +697,7 @@ static INLINE void calc_ab_fast_internal_hbd(int32_t *A, uint16_t *A16,
h -= (ht_inc * 4);
} while (h > 0);
}
+#endif // CONFIG_AV1_HIGHBITDEPTH
static INLINE void boxsum1(int16_t *src, const int src_stride, uint16_t *dst1,
int32_t *dst2, const int dst_stride, const int width,
@@ -788,6 +807,21 @@ static INLINE void boxsum1(int16_t *src, const int src_stride, uint16_t *dst1,
w -= 8;
count++;
} while (w > 0);
+
+ // memset needed for row pixels: the 2nd stage of the boxsum filter uses the
+ // first 2 rows of the dst1 and dst2 buffers, which are not filled in the
+ // first stage.
+ for (int x = 0; x < 2; x++) {
+ memset(dst1 + x * dst_stride, 0, (width + 4) * sizeof(*dst1));
+ memset(dst2 + x * dst_stride, 0, (width + 4) * sizeof(*dst2));
+ }
+
+ // memset needed for extra columns: the 2nd stage of the boxsum filter uses
+ // the last 2 columns of the dst1 and dst2 buffers, which are not filled in
+ // the first stage.
+ for (int x = 2; x < height + 2; x++) {
+ int dst_offset = x * dst_stride + width + 2;
+ memset(dst1 + dst_offset, 0, 3 * sizeof(*dst1));
+ memset(dst2 + dst_offset, 0, 3 * sizeof(*dst2));
+ }
}
{
@@ -1145,7 +1179,7 @@ static INLINE void restoration_fast_internal(uint16_t *dgd16, int width,
int32_t *dst, int dst_stride,
int bit_depth, int sgr_params_idx,
int radius_idx) {
- const sgr_params_type *const params = &sgr_params[sgr_params_idx];
+ const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
const int r = params->r[radius_idx];
const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
@@ -1181,17 +1215,25 @@ static INLINE void restoration_fast_internal(uint16_t *dgd16, int width,
// Calculation of a, b. a output is in 16bit tmp_buf which is in range of
// [1, 256] for all bit depths. b output is kept in 32bit buffer.
- if (8 == bit_depth) {
- calc_ab_fast_internal_lbd(
- (square_sum_buf - buf_stride - 1), (tmp16_buf - buf_stride - 1),
- (sum_buf - buf_stride - 1), buf_stride * 2, width + 2, height + 2, r,
- params->s[radius_idx], 2);
- } else {
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (bit_depth > 8) {
calc_ab_fast_internal_hbd(
(square_sum_buf - buf_stride - 1), (tmp16_buf - buf_stride - 1),
(sum_buf - buf_stride - 1), buf_stride * 2, width + 2, height + 2,
bit_depth, r, params->s[radius_idx], 2);
+ } else {
+ calc_ab_fast_internal_lbd(
+ (square_sum_buf - buf_stride - 1), (tmp16_buf - buf_stride - 1),
+ (sum_buf - buf_stride - 1), buf_stride * 2, width + 2, height + 2, r,
+ params->s[radius_idx], 2);
}
+#else
+ (void)bit_depth;
+ calc_ab_fast_internal_lbd((square_sum_buf - buf_stride - 1),
+ (tmp16_buf - buf_stride - 1),
+ (sum_buf - buf_stride - 1), buf_stride * 2,
+ width + 2, height + 2, r, params->s[radius_idx], 2);
+#endif
final_filter_fast_internal(tmp16_buf, sum_buf, buf_stride, (int16_t *)dgd16,
dgd_stride, dst, dst_stride, width, height);
}
@@ -1200,7 +1242,7 @@ static INLINE void restoration_internal(uint16_t *dgd16, int width, int height,
int dgd_stride, int32_t *dst,
int dst_stride, int bit_depth,
int sgr_params_idx, int radius_idx) {
- const sgr_params_type *const params = &sgr_params[sgr_params_idx];
+ const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
const int r = params->r[radius_idx];
const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
@@ -1235,19 +1277,27 @@ static INLINE void restoration_internal(uint16_t *dgd16, int width, int height,
A16 += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
sum_buf += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
+#if CONFIG_AV1_HIGHBITDEPTH
// Calculation of a, b. a output is in 16bit tmp_buf which is in range of
// [1, 256] for all bit depths. b output is kept in 32bit buffer.
- if (8 == bit_depth) {
- calc_ab_internal_lbd((square_sum_buf - buf_stride - 1),
+ if (bit_depth > 8) {
+ calc_ab_internal_hbd((square_sum_buf - buf_stride - 1),
(A16 - buf_stride - 1), (sum_buf - buf_stride - 1),
(B - buf_stride - 1), buf_stride, width + 2,
- height + 2, r, params->s[radius_idx], 1);
+ height + 2, bit_depth, r, params->s[radius_idx], 1);
} else {
- calc_ab_internal_hbd((square_sum_buf - buf_stride - 1),
+ calc_ab_internal_lbd((square_sum_buf - buf_stride - 1),
(A16 - buf_stride - 1), (sum_buf - buf_stride - 1),
(B - buf_stride - 1), buf_stride, width + 2,
- height + 2, bit_depth, r, params->s[radius_idx], 1);
+ height + 2, r, params->s[radius_idx], 1);
}
+#else
+ (void)bit_depth;
+ calc_ab_internal_lbd((square_sum_buf - buf_stride - 1),
+ (A16 - buf_stride - 1), (sum_buf - buf_stride - 1),
+ (B - buf_stride - 1), buf_stride, width + 2, height + 2,
+ r, params->s[radius_idx], 1);
+#endif
final_filter_internal(A16, B, buf_stride, (int16_t *)dgd16, dgd_stride, dst,
dst_stride, width, height);
}
@@ -1299,8 +1349,14 @@ static INLINE void src_convert_u8_to_u16(const uint8_t *src,
dst_ptr[y + x * dst_stride] = src_ptr[y + x * src_stride];
}
}
+
+ // memset uninitialized rows of src buffer as they are needed for the
+ // boxsum filter calculation.
+ for (int x = height; x < height + 5; x++)
+ memset(dst + x * dst_stride, 0, (width + 2) * sizeof(*dst));
}
+#if CONFIG_AV1_HIGHBITDEPTH
static INLINE void src_convert_hbd_copy(const uint16_t *src, int src_stride,
uint16_t *dst, const int dst_stride,
int width, int height) {
@@ -1339,13 +1395,18 @@ static INLINE void src_convert_hbd_copy(const uint16_t *src, int src_stride,
memcpy((dst_ptr + x * dst_stride), (src_ptr + x * src_stride),
sizeof(uint16_t) * width);
}
+ // memset uninitialized rows of src buffer as they are needed for the
+ // boxsum filter calculation.
+ for (int x = height; x < height + 5; x++)
+ memset(dst + x * dst_stride, 0, (width + 2) * sizeof(*dst));
}
+#endif // CONFIG_AV1_HIGHBITDEPTH
int av1_selfguided_restoration_neon(const uint8_t *dat8, int width, int height,
int stride, int32_t *flt0, int32_t *flt1,
int flt_stride, int sgr_params_idx,
int bit_depth, int highbd) {
- const sgr_params_type *const params = &sgr_params[sgr_params_idx];
+ const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
assert(!(params->r[0] == 0 && params->r[1] == 0));
uint16_t dgd16_[RESTORATION_PROC_UNIT_PELS];
@@ -1356,6 +1417,7 @@ int av1_selfguided_restoration_neon(const uint8_t *dat8, int width, int height,
const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
const int dgd_stride = stride;
+#if CONFIG_AV1_HIGHBITDEPTH
if (highbd) {
const uint16_t *dgd16_tmp = CONVERT_TO_SHORTPTR(dat8);
src_convert_hbd_copy(
@@ -1370,6 +1432,13 @@ int av1_selfguided_restoration_neon(const uint8_t *dat8, int width, int height,
dgd16 - SGRPROJ_BORDER_VERT * dgd16_stride - SGRPROJ_BORDER_HORZ,
dgd16_stride, width_ext, height_ext);
}
+#else
+ (void)highbd;
+ src_convert_u8_to_u16(
+ dat8 - SGRPROJ_BORDER_VERT * dgd_stride - SGRPROJ_BORDER_HORZ, dgd_stride,
+ dgd16 - SGRPROJ_BORDER_VERT * dgd16_stride - SGRPROJ_BORDER_HORZ,
+ dgd16_stride, width_ext, height_ext);
+#endif
if (params->r[0] > 0)
restoration_fast_internal(dgd16, width, height, dgd16_stride, flt0,
@@ -1380,11 +1449,11 @@ int av1_selfguided_restoration_neon(const uint8_t *dat8, int width, int height,
return 0;
}
-void apply_selfguided_restoration_neon(const uint8_t *dat8, int width,
- int height, int stride, int eps,
- const int *xqd, uint8_t *dst8,
- int dst_stride, int32_t *tmpbuf,
- int bit_depth, int highbd) {
+void av1_apply_selfguided_restoration_neon(const uint8_t *dat8, int width,
+ int height, int stride, int eps,
+ const int *xqd, uint8_t *dst8,
+ int dst_stride, int32_t *tmpbuf,
+ int bit_depth, int highbd) {
int32_t *flt0 = tmpbuf;
int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX;
assert(width * height <= RESTORATION_UNITPELS_MAX);
@@ -1395,11 +1464,12 @@ void apply_selfguided_restoration_neon(const uint8_t *dat8, int width,
const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
const int dgd_stride = stride;
- const sgr_params_type *const params = &sgr_params[eps];
+ const sgr_params_type *const params = &av1_sgr_params[eps];
int xq[2];
assert(!(params->r[0] == 0 && params->r[1] == 0));
+#if CONFIG_AV1_HIGHBITDEPTH
if (highbd) {
const uint16_t *dgd16_tmp = CONVERT_TO_SHORTPTR(dat8);
src_convert_hbd_copy(
@@ -1414,7 +1484,13 @@ void apply_selfguided_restoration_neon(const uint8_t *dat8, int width,
dgd16 - SGRPROJ_BORDER_VERT * dgd16_stride - SGRPROJ_BORDER_HORZ,
dgd16_stride, width_ext, height_ext);
}
-
+#else
+ (void)highbd;
+ src_convert_u8_to_u16(
+ dat8 - SGRPROJ_BORDER_VERT * dgd_stride - SGRPROJ_BORDER_HORZ, dgd_stride,
+ dgd16 - SGRPROJ_BORDER_VERT * dgd16_stride - SGRPROJ_BORDER_HORZ,
+ dgd16_stride, width_ext, height_ext);
+#endif
if (params->r[0] > 0)
restoration_fast_internal(dgd16, width, height, dgd16_stride, flt0, width,
bit_depth, eps, 0);
@@ -1422,7 +1498,7 @@ void apply_selfguided_restoration_neon(const uint8_t *dat8, int width,
restoration_internal(dgd16, width, height, dgd16_stride, flt1, width,
bit_depth, eps, 1);
- decode_xq(xqd, xq, params);
+ av1_decode_xq(xqd, xq, params);
{
int16_t *src_ptr;
@@ -1485,6 +1561,7 @@ void apply_selfguided_restoration_neon(const uint8_t *dat8, int width,
r4 = vreinterpretq_u16_s16(vmaxq_s16(r0, zero));
+#if CONFIG_AV1_HIGHBITDEPTH
if (highbd) {
r4 = vminq_u16(r4, max);
vst1q_u16(dst16_ptr, r4);
@@ -1492,6 +1569,11 @@ void apply_selfguided_restoration_neon(const uint8_t *dat8, int width,
t0 = vqmovn_u16(r4);
vst1_u8(dst_ptr, t0);
}
+#else
+ (void)max;
+ t0 = vqmovn_u16(r4);
+ vst1_u8(dst_ptr, t0);
+#endif
w -= 8;
count += 8;
dst_ptr += 8;
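For context on what the restoration path computes once flt0 and flt1 are available: av1_decode_xq turns the signalled xqd pair into the projection weights xq[0] and xq[1], and the final loop projects the two filtered planes back onto the source. Below is a scalar sketch of that per-pixel projection, assuming libaom's SGRPROJ_RST_BITS of 4 and SGRPROJ_PRJ_BITS of 7; both constants are assumptions and do not appear in this hunk.

static int sgrproj_project_pixel(int src, int32_t f0, int32_t f1,
                                 const int xq[2], int r0_on, int r1_on) {
  const int u = src << 4;       /* source raised to SGRPROJ_RST_BITS precision */
  int32_t v = (int32_t)u << 7;  /* SGRPROJ_PRJ_BITS */
  if (r0_on) v += xq[0] * (f0 - u);  /* first box-filter pass contribution */
  if (r1_on) v += xq[1] * (f1 - u);  /* second box-filter pass contribution */
  /* Rounded shift back to pixel precision, then clamp to 8 bits. */
  int out = (v + (1 << (7 + 4 - 1))) >> (7 + 4);
  if (out < 0) out = 0;
  if (out > 255) out = 255;
  return out;
}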
diff --git a/media/libaom/src/av1/common/arm/transpose_neon.h b/media/libaom/src/av1/common/arm/transpose_neon.h
index 8a3d9f07f..91d89b43f 100644
--- a/media/libaom/src/av1/common/arm/transpose_neon.h
+++ b/media/libaom/src/av1/common/arm/transpose_neon.h
@@ -250,6 +250,71 @@ static INLINE void transpose_u16_4x8(uint16x4_t *a0, uint16x4_t *a1,
vreinterpret_u16_u32(c3.val[1]));
}
+static INLINE void transpose_s16_4x8(int16x4_t *a0, int16x4_t *a1,
+ int16x4_t *a2, int16x4_t *a3,
+ int16x4_t *a4, int16x4_t *a5,
+ int16x4_t *a6, int16x4_t *a7,
+ int16x8_t *o0, int16x8_t *o1,
+ int16x8_t *o2, int16x8_t *o3) {
+ // Swap 16 bit elements. Goes from:
+ // a0: 00 01 02 03
+ // a1: 10 11 12 13
+ // a2: 20 21 22 23
+ // a3: 30 31 32 33
+ // a4: 40 41 42 43
+ // a5: 50 51 52 53
+ // a6: 60 61 62 63
+ // a7: 70 71 72 73
+ // to:
+ // b0.val[0]: 00 10 02 12
+ // b0.val[1]: 01 11 03 13
+ // b1.val[0]: 20 30 22 32
+ // b1.val[1]: 21 31 23 33
+ // b2.val[0]: 40 50 42 52
+ // b2.val[1]: 41 51 43 53
+ // b3.val[0]: 60 70 62 72
+ // b3.val[1]: 61 71 63 73
+
+ int16x4x2_t b0 = vtrn_s16(*a0, *a1);
+ int16x4x2_t b1 = vtrn_s16(*a2, *a3);
+ int16x4x2_t b2 = vtrn_s16(*a4, *a5);
+ int16x4x2_t b3 = vtrn_s16(*a6, *a7);
+
+ // Swap 32 bit elements resulting in:
+ // c0.val[0]: 00 10 20 30
+ // c0.val[1]: 02 12 22 32
+ // c1.val[0]: 01 11 21 31
+ // c1.val[1]: 03 13 23 33
+ // c2.val[0]: 40 50 60 70
+ // c2.val[1]: 42 52 62 72
+ // c3.val[0]: 41 51 61 71
+ // c3.val[1]: 43 53 63 73
+
+ int32x2x2_t c0 = vtrn_s32(vreinterpret_s32_s16(b0.val[0]),
+ vreinterpret_s32_s16(b1.val[0]));
+ int32x2x2_t c1 = vtrn_s32(vreinterpret_s32_s16(b0.val[1]),
+ vreinterpret_s32_s16(b1.val[1]));
+ int32x2x2_t c2 = vtrn_s32(vreinterpret_s32_s16(b2.val[0]),
+ vreinterpret_s32_s16(b3.val[0]));
+ int32x2x2_t c3 = vtrn_s32(vreinterpret_s32_s16(b2.val[1]),
+ vreinterpret_s32_s16(b3.val[1]));
+
+ // Swap 64 bit elements resulting in:
+ // o0: 00 10 20 30 40 50 60 70
+ // o1: 01 11 21 31 41 51 61 71
+ // o2: 02 12 22 32 42 52 62 72
+ // o3: 03 13 23 33 43 53 63 73
+
+ *o0 = vcombine_s16(vreinterpret_s16_s32(c0.val[0]),
+ vreinterpret_s16_s32(c2.val[0]));
+ *o1 = vcombine_s16(vreinterpret_s16_s32(c1.val[0]),
+ vreinterpret_s16_s32(c3.val[0]));
+ *o2 = vcombine_s16(vreinterpret_s16_s32(c0.val[1]),
+ vreinterpret_s16_s32(c2.val[1]));
+ *o3 = vcombine_s16(vreinterpret_s16_s32(c1.val[1]),
+ vreinterpret_s16_s32(c3.val[1]));
+}
+
static INLINE void transpose_u16_8x8(uint16x8_t *a0, uint16x8_t *a1,
uint16x8_t *a2, uint16x8_t *a3,
uint16x8_t *a4, uint16x8_t *a5,
@@ -386,7 +451,7 @@ static INLINE void transpose_s16_8x8(int16x8_t *a0, int16x8_t *a1,
vget_high_s16(vreinterpretq_s16_s32(c3.val[1])));
}
-static INLINE int16x8x2_t vpx_vtrnq_s64_to_s16(int32x4_t a0, int32x4_t a1) {
+static INLINE int16x8x2_t aom_vtrnq_s64_to_s16(int32x4_t a0, int32x4_t a1) {
int16x8x2_t b0;
b0.val[0] = vcombine_s16(vreinterpret_s16_s32(vget_low_s32(a0)),
vreinterpret_s16_s32(vget_low_s32(a1)));
@@ -448,10 +513,10 @@ static INLINE void transpose_s16_8x8q(int16x8_t *a0, int16x8_t *out) {
// d2.val[1]: 06 16 26 36 46 56 66 76
// d3.val[0]: 03 13 23 33 43 53 63 73
// d3.val[1]: 07 17 27 37 47 57 67 77
- const int16x8x2_t d0 = vpx_vtrnq_s64_to_s16(c0.val[0], c2.val[0]);
- const int16x8x2_t d1 = vpx_vtrnq_s64_to_s16(c1.val[0], c3.val[0]);
- const int16x8x2_t d2 = vpx_vtrnq_s64_to_s16(c0.val[1], c2.val[1]);
- const int16x8x2_t d3 = vpx_vtrnq_s64_to_s16(c1.val[1], c3.val[1]);
+ const int16x8x2_t d0 = aom_vtrnq_s64_to_s16(c0.val[0], c2.val[0]);
+ const int16x8x2_t d1 = aom_vtrnq_s64_to_s16(c1.val[0], c3.val[0]);
+ const int16x8x2_t d2 = aom_vtrnq_s64_to_s16(c0.val[1], c2.val[1]);
+ const int16x8x2_t d3 = aom_vtrnq_s64_to_s16(c1.val[1], c3.val[1]);
*out = d0.val[0];
*(out + 1) = d1.val[0];
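A brief usage sketch for the new transpose_s16_4x8 (names are illustrative): eight 4-wide input rows become four 8-wide output rows, matching the lane diagrams in its comments.

static void transpose_block_4x8(const int16_t *in /* 8 rows, stride 4 */,
                                int16x8_t out[4]) {
  int16x4_t r0 = vld1_s16(in + 0 * 4), r1 = vld1_s16(in + 1 * 4);
  int16x4_t r2 = vld1_s16(in + 2 * 4), r3 = vld1_s16(in + 3 * 4);
  int16x4_t r4 = vld1_s16(in + 4 * 4), r5 = vld1_s16(in + 5 * 4);
  int16x4_t r6 = vld1_s16(in + 6 * 4), r7 = vld1_s16(in + 7 * 4);
  transpose_s16_4x8(&r0, &r1, &r2, &r3, &r4, &r5, &r6, &r7,
                    &out[0], &out[1], &out[2], &out[3]);
}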
diff --git a/media/libaom/src/av1/common/arm/warp_plane_neon.c b/media/libaom/src/av1/common/arm/warp_plane_neon.c
index 7f02d42a7..c10a34fcd 100644
--- a/media/libaom/src/av1/common/arm/warp_plane_neon.c
+++ b/media/libaom/src/av1/common/arm/warp_plane_neon.c
@@ -20,7 +20,7 @@
#include "av1/common/warped_motion.h"
#include "av1/common/scale.h"
-/* This is a modified version of 'warped_filter' from warped_motion.c:
+/* This is a modified version of 'av1_warped_filter' from warped_motion.c:
* Each coefficient is stored in 8 bits instead of 16 bits
* The coefficients are rearranged in the column order 0, 2, 4, 6, 1, 3, 5, 7
@@ -333,22 +333,22 @@ static INLINE void vertical_filter_neon(const int16x8_t *src,
c3 = vtrnq_s32(vreinterpretq_s32_s16(b3.val[0]),
vreinterpretq_s32_s16(b3.val[1]));
- f0 = vld1q_s16(
- (int16_t *)(warped_filter + ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
- f1 = vld1q_s16(
- (int16_t *)(warped_filter + ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
- f2 = vld1q_s16(
- (int16_t *)(warped_filter + ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
- f3 = vld1q_s16(
- (int16_t *)(warped_filter + ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
- f4 = vld1q_s16(
- (int16_t *)(warped_filter + ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
- f5 = vld1q_s16(
- (int16_t *)(warped_filter + ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
- f6 = vld1q_s16(
- (int16_t *)(warped_filter + ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));
- f7 = vld1q_s16(
- (int16_t *)(warped_filter + ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ f0 = vld1q_s16((int16_t *)(av1_warped_filter +
+ ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ f1 = vld1q_s16((int16_t *)(av1_warped_filter +
+ ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ f2 = vld1q_s16((int16_t *)(av1_warped_filter +
+ ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ f3 = vld1q_s16((int16_t *)(av1_warped_filter +
+ ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ f4 = vld1q_s16((int16_t *)(av1_warped_filter +
+ ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ f5 = vld1q_s16((int16_t *)(av1_warped_filter +
+ ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ f6 = vld1q_s16((int16_t *)(av1_warped_filter +
+ ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ f7 = vld1q_s16((int16_t *)(av1_warped_filter +
+ ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));
d0 = vtrnq_s32(vreinterpretq_s32_s16(f0), vreinterpretq_s32_s16(f2));
d1 = vtrnq_s32(vreinterpretq_s32_s16(f4), vreinterpretq_s32_s16(f6));
@@ -640,7 +640,7 @@ void av1_warp_affine_neon(const int32_t *mat, const uint8_t *ref, int width,
uint16x4_t tmp16_lo = vld1_u16(p);
int32x4_t tmp32_lo = vreinterpretq_s32_u32(vmovl_u16(tmp16_lo));
int16x4_t tmp16_low;
- if (conv_params->use_jnt_comp_avg) {
+ if (conv_params->use_dist_wtd_comp_avg) {
res_lo = vmulq_s32(res_lo, bwd);
tmp32_lo = vmulq_s32(tmp32_lo, fwd);
tmp32_lo = vaddq_s32(tmp32_lo, res_lo);
@@ -671,7 +671,7 @@ void av1_warp_affine_neon(const int32_t *mat, const uint8_t *ref, int width,
uint16x4_t tmp16_hi = vld1_u16(p4);
int32x4_t tmp32_hi = vreinterpretq_s32_u32(vmovl_u16(tmp16_hi));
int16x4_t tmp16_high;
- if (conv_params->use_jnt_comp_avg) {
+ if (conv_params->use_dist_wtd_comp_avg) {
res_hi = vmulq_s32(res_hi, bwd);
tmp32_hi = vmulq_s32(tmp32_hi, fwd);
tmp32_hi = vaddq_s32(tmp32_hi, res_hi);
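The comment at the top of this file notes that the NEON path keeps its own copy of av1_warped_filter with coefficients narrowed to 8 bits and reordered column-wise as 0, 2, 4, 6, 1, 3, 5, 7. A hedged sketch of that repacking for a single 8-tap filter follows; the surrounding table layout is an assumption and is not shown in this hunk.

static void repack_warp_filter_8tap(const int16_t src[8], int8_t dst[8]) {
  static const int order[8] = { 0, 2, 4, 6, 1, 3, 5, 7 };
  for (int i = 0; i < 8; ++i) {
    /* Narrow each coefficient to 8 bits in the interleaved column order. */
    dst[i] = (int8_t)src[order[i]];
  }
}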
diff --git a/media/libaom/src/av1/common/av1_common_int.h b/media/libaom/src/av1/common/av1_common_int.h
new file mode 100644
index 000000000..0403405e9
--- /dev/null
+++ b/media/libaom/src/av1/common/av1_common_int.h
@@ -0,0 +1,1557 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_AV1_COMMON_INT_H_
+#define AOM_AV1_COMMON_AV1_COMMON_INT_H_
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom/internal/aom_codec_internal.h"
+#include "aom_util/aom_thread.h"
+#include "av1/common/alloccommon.h"
+#include "av1/common/av1_loopfilter.h"
+#include "av1/common/entropy.h"
+#include "av1/common/entropymode.h"
+#include "av1/common/entropymv.h"
+#include "av1/common/enums.h"
+#include "av1/common/frame_buffers.h"
+#include "av1/common/mv.h"
+#include "av1/common/quant_common.h"
+#include "av1/common/restoration.h"
+#include "av1/common/tile_common.h"
+#include "av1/common/timing.h"
+#include "av1/common/odintrin.h"
+#include "av1/encoder/hash_motion.h"
+#include "aom_dsp/grain_synthesis.h"
+#include "aom_dsp/grain_table.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if defined(__clang__) && defined(__has_warning)
+#if __has_feature(cxx_attributes) && __has_warning("-Wimplicit-fallthrough")
+#define AOM_FALLTHROUGH_INTENDED [[clang::fallthrough]] // NOLINT
+#endif
+#elif defined(__GNUC__) && __GNUC__ >= 7
+#define AOM_FALLTHROUGH_INTENDED __attribute__((fallthrough)) // NOLINT
+#endif
+
+#ifndef AOM_FALLTHROUGH_INTENDED
+#define AOM_FALLTHROUGH_INTENDED \
+ do { \
+ } while (0)
+#endif
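For illustration, the macro defined above annotates intentional switch fall-through so -Wimplicit-fallthrough stays quiet on compilers that support the attribute; the switch below is a made-up example, not code from this header.

static int bytes_for_size_class(int size_class) {
  int bytes = 0;
  switch (size_class) {
    case 2:
      bytes += 2;
      AOM_FALLTHROUGH_INTENDED;  /* deliberately continues into case 1 */
    case 1: bytes += 1; break;
    default: break;
  }
  return bytes;
}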
+
+#define CDEF_MAX_STRENGTHS 16
+
+/* Constant values while waiting for the sequence header */
+#define FRAME_ID_LENGTH 15
+#define DELTA_FRAME_ID_LENGTH 14
+
+#define FRAME_CONTEXTS (FRAME_BUFFERS + 1)
+// Extra frame context which is always kept at default values
+#define FRAME_CONTEXT_DEFAULTS (FRAME_CONTEXTS - 1)
+#define PRIMARY_REF_BITS 3
+#define PRIMARY_REF_NONE 7
+
+#define NUM_PING_PONG_BUFFERS 2
+
+#define MAX_NUM_TEMPORAL_LAYERS 8
+#define MAX_NUM_SPATIAL_LAYERS 4
+/* clang-format off */
+// clang-format seems to think this is a pointer dereference and not a
+// multiplication.
+#define MAX_NUM_OPERATING_POINTS \
+ (MAX_NUM_TEMPORAL_LAYERS * MAX_NUM_SPATIAL_LAYERS)
+/* clang-format on */
+
+// TODO(jingning): Turning this on to set up transform coefficient
+// processing timer.
+#define TXCOEFF_TIMER 0
+#define TXCOEFF_COST_TIMER 0
+
+enum {
+ SINGLE_REFERENCE = 0,
+ COMPOUND_REFERENCE = 1,
+ REFERENCE_MODE_SELECT = 2,
+ REFERENCE_MODES = 3,
+} UENUM1BYTE(REFERENCE_MODE);
+
+enum {
+ /**
+ * Frame context updates are disabled
+ */
+ REFRESH_FRAME_CONTEXT_DISABLED,
+ /**
+ * Update frame context to values resulting from backward probability
+ * updates based on entropy/counts in the decoded frame
+ */
+ REFRESH_FRAME_CONTEXT_BACKWARD,
+} UENUM1BYTE(REFRESH_FRAME_CONTEXT_MODE);
+
+#define MFMV_STACK_SIZE 3
+typedef struct {
+ int_mv mfmv0;
+ uint8_t ref_frame_offset;
+} TPL_MV_REF;
+
+typedef struct {
+ int_mv mv;
+ MV_REFERENCE_FRAME ref_frame;
+} MV_REF;
+
+typedef struct RefCntBuffer {
+ // For a RefCntBuffer, the following are reference-holding variables:
+ // - cm->ref_frame_map[]
+ // - cm->cur_frame
+ // - cm->scaled_ref_buf[] (encoder only)
+ // - pbi->output_frame_index[] (decoder only)
+ // With that definition, 'ref_count' is the number of reference-holding
+ // variables that are currently referencing this buffer.
+ // For example:
+ // - suppose this buffer is at index 'k' in the buffer pool, and
+ // - Total 'n' of the variables / array elements above have value 'k' (that
+ // is, they are pointing to buffer at index 'k').
+ // Then, pool->frame_bufs[k].ref_count = n.
+ int ref_count;
+
+ unsigned int order_hint;
+ unsigned int ref_order_hints[INTER_REFS_PER_FRAME];
+
+ // These variables are used only in encoder and compare the absolute
+ // display order hint to compute the relative distance and overcome
+ // the limitation of get_relative_dist() which returns incorrect
+ // distance when a very old frame is used as a reference.
+ unsigned int display_order_hint;
+ unsigned int ref_display_order_hint[INTER_REFS_PER_FRAME];
+
+ MV_REF *mvs;
+ uint8_t *seg_map;
+ struct segmentation seg;
+ int mi_rows;
+ int mi_cols;
+ // Width and height give the size of the buffer (before any upscaling, unlike
+ // the sizes that can be derived from the buf structure)
+ int width;
+ int height;
+ WarpedMotionParams global_motion[REF_FRAMES];
+ int showable_frame; // frame can be used as show existing frame in future
+ uint8_t film_grain_params_present;
+ aom_film_grain_t film_grain_params;
+ aom_codec_frame_buffer_t raw_frame_buffer;
+ YV12_BUFFER_CONFIG buf;
+ FRAME_TYPE frame_type;
+
+ // This is only used in the encoder but needs to be indexed per ref frame
+ // so it's extremely convenient to keep it here.
+ int interp_filter_selected[SWITCHABLE];
+
+ // Inter frame reference frame delta for loop filter
+ int8_t ref_deltas[REF_FRAMES];
+
+ // 0 = ZERO_MV, MV
+ int8_t mode_deltas[MAX_MODE_LF_DELTAS];
+
+ FRAME_CONTEXT frame_context;
+} RefCntBuffer;
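The ref_count bookkeeping described above is easiest to see as a small helper that repoints one of the reference-holding variables; this is a sketch consistent with that comment, and the helper name is illustrative rather than one defined in this hunk.

static INLINE void reassign_ref_cnt_buffer(RefCntBuffer **lhs_ptr,
                                           RefCntBuffer *rhs_ptr) {
  RefCntBuffer *const old_buf = *lhs_ptr;
  /* The holding variable stops referencing its old buffer... */
  if (old_buf != NULL) --old_buf->ref_count;
  /* ...and starts referencing the new one. */
  *lhs_ptr = rhs_ptr;
  ++rhs_ptr->ref_count;
}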
+
+typedef struct BufferPool {
+// Protect BufferPool from being accessed by several FrameWorkers at
+// the same time during frame parallel decode.
+// TODO(hkuang): Try to use atomic variable instead of locking the whole pool.
+// TODO(wtc): Remove this. See
+// https://chromium-review.googlesource.com/c/webm/libvpx/+/560630.
+#if CONFIG_MULTITHREAD
+ pthread_mutex_t pool_mutex;
+#endif
+
+ // Private data associated with the frame buffer callbacks.
+ void *cb_priv;
+
+ aom_get_frame_buffer_cb_fn_t get_fb_cb;
+ aom_release_frame_buffer_cb_fn_t release_fb_cb;
+
+ RefCntBuffer frame_bufs[FRAME_BUFFERS];
+
+ // Frame buffers allocated internally by the codec.
+ InternalFrameBufferList int_frame_buffers;
+} BufferPool;
+
+typedef struct {
+ int cdef_damping;
+ int nb_cdef_strengths;
+ int cdef_strengths[CDEF_MAX_STRENGTHS];
+ int cdef_uv_strengths[CDEF_MAX_STRENGTHS];
+ int cdef_bits;
+} CdefInfo;
+
+typedef struct {
+ int delta_q_present_flag;
+ // Resolution of delta quant
+ int delta_q_res;
+ int delta_lf_present_flag;
+ // Resolution of delta lf level
+ int delta_lf_res;
+ // This is a flag for number of deltas of loop filter level
+ // 0: use 1 delta, for y_vertical, y_horizontal, u, and v
+ // 1: use separate deltas for each filter level
+ int delta_lf_multi;
+} DeltaQInfo;
+
+typedef struct {
+ int enable_order_hint; // 0 - disable order hint, and related tools
+ int order_hint_bits_minus_1; // dist_wtd_comp, ref_frame_mvs,
+ // frame_sign_bias
+ // if 0, enable_dist_wtd_comp and
+ // enable_ref_frame_mvs must be set as 0.
+ int enable_dist_wtd_comp; // 0 - disable dist-wtd compound modes
+ // 1 - enable it
+ int enable_ref_frame_mvs; // 0 - disable ref frame mvs
+ // 1 - enable it
+} OrderHintInfo;
+
+// Sequence header structure.
+// Note: All syntax elements of sequence_header_obu that need to be
+// bit-identical across multiple sequence headers must be part of this struct,
+// so that consistency is checked by are_seq_headers_consistent() function.
+// One exception is the last member 'op_params' that is ignored by
+// are_seq_headers_consistent() function.
+typedef struct SequenceHeader {
+ int num_bits_width;
+ int num_bits_height;
+ int max_frame_width;
+ int max_frame_height;
+ uint8_t frame_id_numbers_present_flag;
+ int frame_id_length;
+ int delta_frame_id_length;
+ BLOCK_SIZE sb_size; // Size of the superblock used for this frame
+ int mib_size; // Size of the superblock in units of MI blocks
+ int mib_size_log2; // Log 2 of above.
+
+ OrderHintInfo order_hint_info;
+
+ uint8_t force_screen_content_tools; // 0 - force off
+ // 1 - force on
+ // 2 - adaptive
+ uint8_t still_picture; // Video is a single frame still picture
+ uint8_t reduced_still_picture_hdr; // Use reduced header for still picture
+ uint8_t force_integer_mv; // 0 - Don't force. MV can use subpel
+ // 1 - force to integer
+ // 2 - adaptive
+ uint8_t enable_filter_intra; // enables/disables filterintra
+ uint8_t enable_intra_edge_filter; // enables/disables edge upsampling
+ uint8_t enable_interintra_compound; // enables/disables interintra_compound
+ uint8_t enable_masked_compound; // enables/disables masked compound
+ uint8_t enable_dual_filter; // 0 - disable dual interpolation filter
+ // 1 - enable vert/horz filter selection
+ uint8_t enable_warped_motion; // 0 - disable warp for the sequence
+ // 1 - enable warp for the sequence
+ uint8_t enable_superres; // 0 - Disable superres for the sequence
+ // and no frame level superres flag
+ // 1 - Enable superres for the sequence
+ // enable per-frame superres flag
+ uint8_t enable_cdef; // To turn on/off CDEF
+ uint8_t enable_restoration; // To turn on/off loop restoration
+ BITSTREAM_PROFILE profile;
+
+ // Color config.
+ aom_bit_depth_t bit_depth; // AOM_BITS_8 in profile 0 or 1,
+ // AOM_BITS_10 or AOM_BITS_12 in profile 2 or 3.
+ uint8_t use_highbitdepth; // If true, we need to use 16bit frame buffers.
+ uint8_t monochrome; // Monochrome video
+ aom_color_primaries_t color_primaries;
+ aom_transfer_characteristics_t transfer_characteristics;
+ aom_matrix_coefficients_t matrix_coefficients;
+ int color_range;
+ int subsampling_x; // Chroma subsampling for x
+ int subsampling_y; // Chroma subsampling for y
+ aom_chroma_sample_position_t chroma_sample_position;
+ uint8_t separate_uv_delta_q;
+ uint8_t film_grain_params_present;
+
+ // Operating point info.
+ int operating_points_cnt_minus_1;
+ int operating_point_idc[MAX_NUM_OPERATING_POINTS];
+ int timing_info_present;
+ aom_timing_info_t timing_info;
+ uint8_t decoder_model_info_present_flag;
+ aom_dec_model_info_t decoder_model_info;
+ uint8_t display_model_info_present_flag;
+ AV1_LEVEL seq_level_idx[MAX_NUM_OPERATING_POINTS];
+ uint8_t tier[MAX_NUM_OPERATING_POINTS]; // seq_tier in spec. One bit: 0 or 1.
+
+ // IMPORTANT: the op_params member must be at the end of the struct so that
+ // are_seq_headers_consistent() can be implemented with a memcmp() call.
+ // TODO(urvang): We probably don't need the +1 here.
+ aom_dec_model_op_parameters_t op_params[MAX_NUM_OPERATING_POINTS + 1];
+} SequenceHeader;
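Given the note that op_params must stay the last member, the consistency check can plausibly reduce to a single memcmp over the leading members. A sketch under that assumption (the real are_seq_headers_consistent() lives elsewhere in the codebase):

/* Assumes <string.h> and <stddef.h> are available for memcmp/offsetof. */
static INLINE int seq_headers_match(const SequenceHeader *a,
                                    const SequenceHeader *b) {
  /* Compare everything up to, but not including, op_params. */
  return memcmp(a, b, offsetof(SequenceHeader, op_params)) == 0;
}

This only works reliably if both structs are zero-initialized before being filled in, so that padding bytes compare equal.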
+
+typedef struct {
+ int skip_mode_allowed;
+ int skip_mode_flag;
+ int ref_frame_idx_0;
+ int ref_frame_idx_1;
+} SkipModeInfo;
+
+typedef struct {
+ FRAME_TYPE frame_type;
+ REFERENCE_MODE reference_mode;
+
+ unsigned int order_hint;
+ unsigned int display_order_hint;
+ unsigned int frame_number;
+ SkipModeInfo skip_mode_info;
+ int refresh_frame_flags; // Which ref frames are overwritten by this frame
+ int frame_refs_short_signaling;
+} CurrentFrame;
+
+// Struct containing some frame level features.
+typedef struct {
+ bool disable_cdf_update;
+ bool allow_high_precision_mv;
+ bool cur_frame_force_integer_mv; // 0 the default in AOM, 1 only integer
+ bool allow_screen_content_tools;
+ bool allow_intrabc;
+ bool allow_warped_motion;
+ // Whether to use previous frames' motion vectors for prediction.
+ bool allow_ref_frame_mvs;
+ bool coded_lossless; // frame is fully lossless at the coded resolution.
+ bool all_lossless; // frame is fully lossless at the upscaled resolution.
+ bool reduced_tx_set_used;
+ bool error_resilient_mode;
+ bool switchable_motion_mode;
+ TX_MODE tx_mode;
+ InterpFilter interp_filter;
+ int primary_ref_frame;
+ int byte_alignment;
+ // Flag signaling how frame contexts should be updated at the end of
+ // a frame decode
+ REFRESH_FRAME_CONTEXT_MODE refresh_frame_context;
+} FeatureFlags;
+
+// Struct containing params related to tiles.
+typedef struct CommonTileParams {
+ int cols; // number of tile columns that frame is divided into
+ int rows; // number of tile rows that frame is divided into
+ int max_width_sb; // maximum tile width in superblock units.
+ int max_height_sb; // maximum tile height in superblock units.
+ // Min width of non-rightmost tile in MI units. Only valid if cols > 1.
+ int min_inner_width;
+
+ // If true, tiles are uniformly spaced with power-of-two number of rows and
+ // columns.
+ // If false, tiles have explicitly configured widths and heights.
+ int uniform_spacing;
+
+ // Following members are only valid when uniform_spacing == 1
+ int log2_cols; // log2 of 'cols'.
+ int log2_rows; // log2 of 'rows'.
+ int width; // tile width in MI units
+ int height; // tile height in MI units
+ // End of members that are only valid when uniform_spacing == 1
+
+ // Min num of tile columns possible based on 'max_width_sb' and frame width.
+ int min_log2_cols;
+ // Min num of tile rows possible based on 'max_height_sb' and frame height.
+ int min_log2_rows;
+ // Max num of tile columns possible based on frame width.
+ int max_log2_cols;
+ // Max num of tile rows possible based on frame height.
+ int max_log2_rows;
+ // log2 of min number of tiles (same as min_log2_cols + min_log2_rows).
+ int min_log2;
+ // col_start_sb[i] is the start position of tile column i in superblock units.
+ // valid for 0 <= i <= cols
+ int col_start_sb[MAX_TILE_COLS + 1];
+ // row_start_sb[i] is the start position of tile row i in superblock units.
+ // valid for 0 <= i <= rows
+ int row_start_sb[MAX_TILE_ROWS + 1];
+ // If true, we are using large scale tile mode.
+ unsigned int large_scale;
+ // Only relevant when large_scale == 1.
+ // If true, the independent decoding of a single tile or a section of a frame
+ // is allowed.
+ unsigned int single_tile_decoding;
+} CommonTileParams;
+
+// Struct containing params related to MB_MODE_INFO arrays and related info.
+typedef struct CommonModeInfoParams CommonModeInfoParams;
+struct CommonModeInfoParams {
+ // Number of rows/cols in the frame in 16 pixel units.
+ // This is computed from frame width and height aligned to a multiple of 8.
+ int mb_rows;
+ int mb_cols;
+ // Total MBs = mb_rows * mb_cols.
+ int MBs;
+
+ // Number of rows/cols in the frame in 4 pixel (MB_MODE_INFO) units.
+ // This is computed from frame width and height aligned to a multiple of 8.
+ int mi_rows;
+ int mi_cols;
+
+ // An array of MB_MODE_INFO structs for every 'mi_alloc_bsize' sized block
+ // in the frame.
+ // Note: This array should be treated as scratch memory, and should NOT be
+ // accessed directly in most cases. Please use the 'mi_grid_base' array instead.
+ MB_MODE_INFO *mi_alloc;
+ // Number of allocated elements in 'mi_alloc'.
+ int mi_alloc_size;
+ // Stride for 'mi_alloc' array.
+ int mi_alloc_stride;
+ // The minimum block size that each element in 'mi_alloc' can correspond to.
+ // For decoder, this is always BLOCK_4X4.
+ // For encoder, this is currently set to BLOCK_4X4 for resolution < 4k,
+ // and BLOCK_8X8 for resolution >= 4k.
+ BLOCK_SIZE mi_alloc_bsize;
+
+ // Grid of pointers to 4x4 MB_MODE_INFO structs allocated in 'mi_alloc'.
+ // It's possible that:
+ // - Multiple pointers in the grid point to the same element in 'mi_alloc'
+ // (for example, for all 4x4 blocks that belong to the same partition block).
+ // - Some pointers can be NULL (for example, for blocks outside visible area).
+ MB_MODE_INFO **mi_grid_base;
+ // Number of allocated elements in 'mi_grid_base' (and 'tx_type_map' also).
+ int mi_grid_size;
+ // Stride for 'mi_grid_base' (and 'tx_type_map' also).
+ int mi_stride;
+
+ // An array of tx types for each 4x4 block in the frame.
+ // Number of allocated elements is same as 'mi_grid_size', and stride is
+ // same as 'mi_grid_size'. So, indexing into 'tx_type_map' is same as that of
+ // 'mi_grid_base'.
+ TX_TYPE *tx_type_map;
+
+ // Function pointers to allow separate logic for encoder and decoder.
+ void (*free_mi)(struct CommonModeInfoParams *mi_params);
+ void (*setup_mi)(struct CommonModeInfoParams *mi_params);
+ void (*set_mb_mi)(struct CommonModeInfoParams *mi_params, int width,
+ int height);
+};
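A sketch of what the set_mb_mi hook computes, following the comments above (dimensions aligned to a multiple of 8 pixels, 4-pixel MI units, 16-pixel MB units). The actual encoder and decoder implementations are not part of this hunk, so the exact formulas below are assumptions.

static void set_mb_mi_sketch(struct CommonModeInfoParams *mi_params, int width,
                             int height) {
  const int aligned_width = (width + 7) & ~7;   /* round up to multiple of 8 */
  const int aligned_height = (height + 7) & ~7;
  mi_params->mi_cols = aligned_width >> 2;      /* 4-pixel MI units */
  mi_params->mi_rows = aligned_height >> 2;
  mi_params->mb_cols = (mi_params->mi_cols + 2) >> 2; /* 16-pixel MB units */
  mi_params->mb_rows = (mi_params->mi_rows + 2) >> 2;
  mi_params->MBs = mi_params->mb_rows * mi_params->mb_cols;
}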
+
+// Parameters related to quantization at the frame level.
+typedef struct CommonQuantParams CommonQuantParams;
+struct CommonQuantParams {
+ // Base qindex of the frame in the range 0 to 255.
+ int base_qindex;
+
+ // Delta of qindex (from base_qindex) for Y plane DC coefficient.
+ // Note: y_ac_delta_q is implicitly 0.
+ int y_dc_delta_q;
+
+ // Delta of qindex (from base_qindex) for U plane DC and AC coefficients.
+ int u_dc_delta_q;
+ int v_dc_delta_q;
+
+ // Delta of qindex (from base_qindex) for V plane DC and AC coefficients.
+ // Same as those for U plane if cm->seq_params.separate_uv_delta_q == 0.
+ int u_ac_delta_q;
+ int v_ac_delta_q;
+
+ // Note: The qindex per superblock may have a delta from the qindex obtained
+ // at frame level from parameters above, based on 'cm->delta_q_info'.
+
+ // The dequantizers below are true dequantizers used only in the
+ // dequantization process. They have the same coefficient
+ // shift/scale as TX.
+ int16_t y_dequant_QTX[MAX_SEGMENTS][2];
+ int16_t u_dequant_QTX[MAX_SEGMENTS][2];
+ int16_t v_dequant_QTX[MAX_SEGMENTS][2];
+
+ // Global quant matrix tables
+ const qm_val_t *giqmatrix[NUM_QM_LEVELS][3][TX_SIZES_ALL];
+ const qm_val_t *gqmatrix[NUM_QM_LEVELS][3][TX_SIZES_ALL];
+
+ // Local quant matrix tables for each frame
+ const qm_val_t *y_iqmatrix[MAX_SEGMENTS][TX_SIZES_ALL];
+ const qm_val_t *u_iqmatrix[MAX_SEGMENTS][TX_SIZES_ALL];
+ const qm_val_t *v_iqmatrix[MAX_SEGMENTS][TX_SIZES_ALL];
+
+ // Flag indicating whether quantization matrices are being used:
+ // - If true, qm_level_y, qm_level_u and qm_level_v indicate the level
+ // indices to be used to access appropriate global quant matrix tables.
+ // - If false, we implicitly use level index 'NUM_QM_LEVELS - 1'.
+ bool using_qmatrix;
+ int qmatrix_level_y;
+ int qmatrix_level_u;
+ int qmatrix_level_v;
+};
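Per the comment on using_qmatrix, a hypothetical helper for resolving the effective level index into the global quant matrix tables might look like this (the name and plane handling are illustrative):

static INLINE int effective_qm_level(const struct CommonQuantParams *q,
                                     int plane /* 0 = Y, 1 = U, 2 = V */) {
  /* Per the comment above: without qmatrix, level NUM_QM_LEVELS - 1 is implied. */
  if (!q->using_qmatrix) return NUM_QM_LEVELS - 1;
  return plane == 0 ? q->qmatrix_level_y
                    : (plane == 1 ? q->qmatrix_level_u : q->qmatrix_level_v);
}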
+
+// Context used for transmitting various symbols in the bitstream.
+typedef struct CommonContexts CommonContexts;
+struct CommonContexts {
+ // Context used by 'FRAME_CONTEXT.partition_cdf' to transmit partition type.
+ // partition[i][j] is the context for ith tile row, jth mi_col.
+ PARTITION_CONTEXT **partition;
+
+ // Context used to derive context for multiple symbols:
+ // - 'TXB_CTX.txb_skip_ctx' used by 'FRAME_CONTEXT.txb_skip_cdf' to transmit
+ // the skip_txfm flag.
+ // - 'TXB_CTX.dc_sign_ctx' used by 'FRAME_CONTEXT.dc_sign_cdf' to transmit
+ // sign.
+ // entropy[i][j][k] is the context for ith plane, jth tile row, kth mi_col.
+ ENTROPY_CONTEXT **entropy[MAX_MB_PLANE];
+
+ // Context used to derive context for 'FRAME_CONTEXT.txfm_partition_cdf' to
+ // transmit 'is_split' flag to indicate if this transform block should be
+ // split into smaller sub-blocks.
+ // txfm[i][j] is the context for ith tile row, jth mi_col.
+ TXFM_CONTEXT **txfm;
+
+ // Dimensions that were used to allocate the arrays above.
+ // If these dimensions change, the arrays may have to be re-allocated.
+ int num_planes; // Corresponds to av1_num_planes(cm)
+ int num_tile_rows; // Corresponds to cm->tiles.row
+ int num_mi_cols; // Corresponds to cm->mi_params.mi_cols
+};
+
+typedef struct AV1Common {
+ // Information about the current frame that is being coded.
+ CurrentFrame current_frame;
+ // Code and details about current error status.
+ struct aom_internal_error_info error;
+
+ // AV1 allows two types of frame scaling operations:
+ // (1) Frame super-resolution: allows coding a frame at a lower resolution
+ // and, after decoding, normatively upscales and restores the frame -- inside
+ // the coding loop.
+ // (2) Frame resize: allows coding a frame at a lower/higher resolution, and
+ // then non-normatively upscales the frame at the time of rendering -- outside
+ // the coding loop.
+ // Hence, the need for 3 types of dimensions.
+
+ // Coded frame dimensions.
+ int width;
+ int height;
+
+ // Rendered frame dimensions, after applying both super-resolution and resize
+ // to the coded frame.
+ // Different from coded dimensions if super-resolution and/or resize are
+ // being used for this frame.
+ int render_width;
+ int render_height;
+
+ // Frame dimensions after applying super-resolution to the coded frame (if
+ // present), but before applying resize.
+ // Larger than the coded dimensions if super-resolution is being used for
+ // this frame.
+ // Different from rendered dimensions if resize is being used for this frame.
+ int superres_upscaled_width;
+ int superres_upscaled_height;
+
+ // The denominator of the superres scale used by this frame.
+ // Note: The numerator is fixed to be SCALE_NUMERATOR.
+ uint8_t superres_scale_denominator;
+
+ // If true, buffer removal times are present.
+ bool buffer_removal_time_present;
+ // buffer_removal_times[op_num] specifies the frame removal time in units of
+ // DecCT clock ticks counted from the removal time of the last random access
+ // point for operating point op_num.
+ // TODO(urvang): We probably don't need the +1 here.
+ uint32_t buffer_removal_times[MAX_NUM_OPERATING_POINTS + 1];
+ // Presentation time of the frame in clock ticks DispCT counted from the
+ // removal time of the last random access point for the operating point that
+ // is being decoded.
+ uint32_t frame_presentation_time;
+
+ // Buffer where previous frame is stored.
+ RefCntBuffer *prev_frame;
+
+ // Buffer into which the current frame will be stored and other related info.
+ // TODO(hkuang): Combine this with cur_buf in macroblockd.
+ RefCntBuffer *cur_frame;
+
+ // For encoder, we have a two-level mapping from reference frame type to the
+ // corresponding buffer in the buffer pool:
+ // * 'remapped_ref_idx[i - 1]' maps reference type 'i' (range: LAST_FRAME ...
+ // EXTREF_FRAME) to a remapped index 'j' (in range: 0 ... REF_FRAMES - 1)
+ // * Later, 'cm->ref_frame_map[j]' maps the remapped index 'j' to a pointer to
+ // the reference counted buffer structure RefCntBuffer, taken from the buffer
+ // pool cm->buffer_pool->frame_bufs.
+ //
+ // LAST_FRAME, ..., EXTREF_FRAME
+ // | |
+ // v v
+ // remapped_ref_idx[LAST_FRAME - 1], ..., remapped_ref_idx[EXTREF_FRAME - 1]
+ // | |
+ // v v
+ // ref_frame_map[], ..., ref_frame_map[]
+ //
+ // Note: INTRA_FRAME always refers to the current frame, so there's no need to
+ // have a remapped index for the same.
+ int remapped_ref_idx[REF_FRAMES];
+
+ // Scale of the current frame with respect to itself.
+ // This is currently used for intra block copy, which behaves like an inter
+ // prediction mode, where the reference frame is the current frame itself.
+ struct scale_factors sf_identity;
+
+ // Scale factors of the reference frame with respect to the current frame.
+ // This is required for generating inter prediction and will be non-identity
+ // for a reference frame, if it has different dimensions than the coded
+ // dimensions of the current frame.
+ struct scale_factors ref_scale_factors[REF_FRAMES];
+
+ // For decoder, ref_frame_map[i] maps reference type 'i' to a pointer to
+ // the buffer in the buffer pool 'cm->buffer_pool.frame_bufs'.
+ // For encoder, ref_frame_map[j] (where j = remapped_ref_idx[i]) maps
+ // remapped reference index 'j' (that is, original reference type 'i') to
+ // a pointer to the buffer in the buffer pool 'cm->buffer_pool.frame_bufs'.
+ RefCntBuffer *ref_frame_map[REF_FRAMES];
+
+ // If true, this frame is actually shown after decoding.
+ // If false, this frame is coded in the bitstream, but not shown. It is only
+ // used as a reference for other frames coded later.
+ int show_frame;
+
+ // If true, this frame can be used as a show-existing frame for other frames
+ // coded later.
+ // When 'show_frame' is true, this is always true for all non-keyframes.
+ // When 'show_frame' is false, this value is transmitted in the bitstream.
+ int showable_frame;
+
+ // If true, show an existing frame coded before, instead of actually coding a
+ // frame. The existing frame comes from one of the existing reference buffers,
+ // as signaled in the bitstream.
+ int show_existing_frame;
+
+ // Whether some features are allowed or not.
+ FeatureFlags features;
+
+ // Params related to MB_MODE_INFO arrays and associated info.
+ CommonModeInfoParams mi_params;
+
+#if CONFIG_ENTROPY_STATS
+ int coef_cdf_category;
+#endif
+ // Quantization params.
+ CommonQuantParams quant_params;
+
+ // Segmentation info for current frame.
+ struct segmentation seg;
+
+ // Segmentation map for previous frame.
+ uint8_t *last_frame_seg_map;
+
+ // Deblocking filter parameters.
+ loop_filter_info_n lf_info;
+ struct loopfilter lf;
+
+ // Loop Restoration filter parameters.
+ RestorationInfo rst_info[MAX_MB_PLANE]; // Loop Restoration filter info.
+ int32_t *rst_tmpbuf; // Scratch buffer for self-guided restoration filter.
+ RestorationLineBuffers *rlbs; // Line buffers required by loop restoration.
+ YV12_BUFFER_CONFIG rst_frame; // Stores the output of loop restoration.
+
+ // CDEF (Constrained Directional Enhancement Filter) parameters.
+ CdefInfo cdef_info;
+
+ // Parameters for film grain synthesis.
+ aom_film_grain_t film_grain_params;
+
+ // Parameters for delta quantization and delta loop filter level.
+ DeltaQInfo delta_q_info;
+
+ // Global motion parameters for each reference frame.
+ WarpedMotionParams global_motion[REF_FRAMES];
+
+ // Elements of the sequence header that apply to all frames in the video.
+ SequenceHeader seq_params;
+
+ // Current CDFs of all the symbols for the current frame.
+ FRAME_CONTEXT *fc;
+ // Default CDFs used when features.primary_ref_frame = PRIMARY_REF_NONE
+ // (e.g. for a keyframe). These default CDFs are defined by the bitstream and
+ // copied from default CDF tables for each symbol.
+ FRAME_CONTEXT *default_frame_context;
+
+ // Parameters related to tiling.
+ CommonTileParams tiles;
+
+ // External BufferPool passed from outside.
+ BufferPool *buffer_pool;
+
+ // Above context buffers and their sizes.
+ // Note: above contexts are allocated in this struct, as their size is
+ // dependent on frame width, while left contexts are declared and allocated in
+ // MACROBLOCKD struct, as they have a fixed size.
+ CommonContexts above_contexts;
+
+ // When cm->seq_params.frame_id_numbers_present_flag == 1, current and
+ // reference frame IDs are signaled in the bitstream.
+ int current_frame_id;
+ int ref_frame_id[REF_FRAMES];
+
+ // Motion vectors provided by motion field estimation.
+ // tpl_mvs[row * stride + col] stores MV for block at [mi_row, mi_col] where:
+ // mi_row = 2 * row,
+ // mi_col = 2 * col, and
+ // stride = cm->mi_params.mi_stride / 2
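+ // E.g. (derived from the above), the MV for the 2x2-mi (8x8 luma) unit
+ // covering (mi_row, mi_col) is
+ // tpl_mvs[(mi_row >> 1) * (cm->mi_params.mi_stride >> 1) + (mi_col >> 1)].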
+ TPL_MV_REF *tpl_mvs;
+ // Allocated size of 'tpl_mvs' array. Refer to 'ensure_mv_buffer()' function.
+ int tpl_mvs_mem_size;
+ // ref_frame_sign_bias[k] is 1 if relative distance between reference 'k' and
+ // current frame is positive; and 0 otherwise.
+ int ref_frame_sign_bias[REF_FRAMES];
+ // ref_frame_side[k] is 1 if relative distance between reference 'k' and
+ // current frame is positive, -1 if relative distance is 0; and 0 otherwise.
+ // TODO(jingning): This can be combined with sign_bias later.
+ int8_t ref_frame_side[REF_FRAMES];
+
+ // Number of temporal layers: may be > 1 for SVC (scalable video coding).
+ unsigned int number_temporal_layers;
+ // Temporal layer ID of this frame
+ // (in the range 0 ... (number_temporal_layers - 1)).
+ int temporal_layer_id;
+
+ // Number of spatial layers: may be > 1 for SVC (scalable video coding).
+ unsigned int number_spatial_layers;
+ // Spatial layer ID of this frame
+ // (in the range 0 ... (number_spatial_layers - 1)).
+ int spatial_layer_id;
+
+#if TXCOEFF_TIMER
+ int64_t cum_txcoeff_timer;
+ int64_t txcoeff_timer;
+ int txb_count;
+#endif // TXCOEFF_TIMER
+
+#if TXCOEFF_COST_TIMER
+ int64_t cum_txcoeff_cost_timer;
+ int64_t txcoeff_cost_timer;
+ int64_t txcoeff_cost_count;
+#endif // TXCOEFF_COST_TIMER
+
+#if CONFIG_LPF_MASK
+ int is_decoding;
+#endif // CONFIG_LPF_MASK
+} AV1_COMMON;
+
+// TODO(hkuang): Don't need to lock the whole pool after implementing atomic
+// frame reference count.
+static void lock_buffer_pool(BufferPool *const pool) {
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(&pool->pool_mutex);
+#else
+ (void)pool;
+#endif
+}
+
+static void unlock_buffer_pool(BufferPool *const pool) {
+#if CONFIG_MULTITHREAD
+ pthread_mutex_unlock(&pool->pool_mutex);
+#else
+ (void)pool;
+#endif
+}
+
+static INLINE YV12_BUFFER_CONFIG *get_ref_frame(AV1_COMMON *cm, int index) {
+ if (index < 0 || index >= REF_FRAMES) return NULL;
+ if (cm->ref_frame_map[index] == NULL) return NULL;
+ return &cm->ref_frame_map[index]->buf;
+}
+
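+// Returns the index of a free frame buffer in the buffer pool, marking it as
+// having one reference, or INVALID_IDX if no free buffer is available.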
+static INLINE int get_free_fb(AV1_COMMON *cm) {
+ RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
+ int i;
+
+ lock_buffer_pool(cm->buffer_pool);
+ for (i = 0; i < FRAME_BUFFERS; ++i)
+ if (frame_bufs[i].ref_count == 0) break;
+
+ if (i != FRAME_BUFFERS) {
+ if (frame_bufs[i].buf.use_external_reference_buffers) {
+ // This frame buffer's y_buffer, u_buffer, and v_buffer point to external
+ // reference buffers. Restore the buffer pointers to point to the
+ // internally allocated memory.
+ YV12_BUFFER_CONFIG *ybf = &frame_bufs[i].buf;
+ ybf->y_buffer = ybf->store_buf_adr[0];
+ ybf->u_buffer = ybf->store_buf_adr[1];
+ ybf->v_buffer = ybf->store_buf_adr[2];
+ ybf->use_external_reference_buffers = 0;
+ }
+
+ frame_bufs[i].ref_count = 1;
+ } else {
+ // We should never run out of free buffers. If this assertion fails, there
+ // is a reference leak.
+ assert(0 && "Ran out of free frame buffers. Likely a reference leak.");
+ // Reset i to be INVALID_IDX to indicate no free buffer found.
+ i = INVALID_IDX;
+ }
+
+ unlock_buffer_pool(cm->buffer_pool);
+ return i;
+}
+
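+// Releases the current frame buffer (if any) and assigns a freshly acquired
+// buffer from the pool as cm->cur_frame. Returns NULL if no free buffer is
+// available.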
+static INLINE RefCntBuffer *assign_cur_frame_new_fb(AV1_COMMON *const cm) {
+ // Release the previously-used frame-buffer
+ if (cm->cur_frame != NULL) {
+ --cm->cur_frame->ref_count;
+ cm->cur_frame = NULL;
+ }
+
+ // Assign a new framebuffer
+ const int new_fb_idx = get_free_fb(cm);
+ if (new_fb_idx == INVALID_IDX) return NULL;
+
+ cm->cur_frame = &cm->buffer_pool->frame_bufs[new_fb_idx];
+ cm->cur_frame->buf.buf_8bit_valid = 0;
+ av1_zero(cm->cur_frame->interp_filter_selected);
+ return cm->cur_frame;
+}
+
+// Modify 'lhs_ptr' to reference the buffer at 'rhs_ptr', and update the ref
+// counts accordingly.
+static INLINE void assign_frame_buffer_p(RefCntBuffer **lhs_ptr,
+ RefCntBuffer *rhs_ptr) {
+ RefCntBuffer *const old_ptr = *lhs_ptr;
+ if (old_ptr != NULL) {
+ assert(old_ptr->ref_count > 0);
+ // One less reference to the buffer at 'old_ptr', so decrease ref count.
+ --old_ptr->ref_count;
+ }
+
+ *lhs_ptr = rhs_ptr;
+ // One more reference to the buffer at 'rhs_ptr', so increase ref count.
+ ++rhs_ptr->ref_count;
+}
+
+static INLINE int frame_is_intra_only(const AV1_COMMON *const cm) {
+ return cm->current_frame.frame_type == KEY_FRAME ||
+ cm->current_frame.frame_type == INTRA_ONLY_FRAME;
+}
+
+static INLINE int frame_is_sframe(const AV1_COMMON *cm) {
+ return cm->current_frame.frame_type == S_FRAME;
+}
+
+// These functions take a reference frame label between LAST_FRAME and
+// EXTREF_FRAME inclusive. Note that this is different to the indexing
+// previously used by the frame_refs[] array.
+static INLINE int get_ref_frame_map_idx(const AV1_COMMON *const cm,
+ const MV_REFERENCE_FRAME ref_frame) {
+ return (ref_frame >= LAST_FRAME && ref_frame <= EXTREF_FRAME)
+ ? cm->remapped_ref_idx[ref_frame - LAST_FRAME]
+ : INVALID_IDX;
+}
+
+static INLINE RefCntBuffer *get_ref_frame_buf(
+ const AV1_COMMON *const cm, const MV_REFERENCE_FRAME ref_frame) {
+ const int map_idx = get_ref_frame_map_idx(cm, ref_frame);
+ return (map_idx != INVALID_IDX) ? cm->ref_frame_map[map_idx] : NULL;
+}
+
+// Both const and non-const versions of this function are provided so that it
+// can be used with a const AV1_COMMON if needed.
+static INLINE const struct scale_factors *get_ref_scale_factors_const(
+ const AV1_COMMON *const cm, const MV_REFERENCE_FRAME ref_frame) {
+ const int map_idx = get_ref_frame_map_idx(cm, ref_frame);
+ return (map_idx != INVALID_IDX) ? &cm->ref_scale_factors[map_idx] : NULL;
+}
+
+static INLINE struct scale_factors *get_ref_scale_factors(
+ AV1_COMMON *const cm, const MV_REFERENCE_FRAME ref_frame) {
+ const int map_idx = get_ref_frame_map_idx(cm, ref_frame);
+ return (map_idx != INVALID_IDX) ? &cm->ref_scale_factors[map_idx] : NULL;
+}
+
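+// Returns the buffer of the frame's primary reference, or NULL if there is
+// none. Note: 'primary_ref_frame' is 0-based relative to LAST_FRAME, hence
+// the '+ 1' when converting it to a reference frame label below.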
+static INLINE RefCntBuffer *get_primary_ref_frame_buf(
+ const AV1_COMMON *const cm) {
+ const int primary_ref_frame = cm->features.primary_ref_frame;
+ if (primary_ref_frame == PRIMARY_REF_NONE) return NULL;
+ const int map_idx = get_ref_frame_map_idx(cm, primary_ref_frame + 1);
+ return (map_idx != INVALID_IDX) ? cm->ref_frame_map[map_idx] : NULL;
+}
+
+// Returns 1 if this frame might allow mvs from some reference frame.
+static INLINE int frame_might_allow_ref_frame_mvs(const AV1_COMMON *cm) {
+ return !cm->features.error_resilient_mode &&
+ cm->seq_params.order_hint_info.enable_ref_frame_mvs &&
+ cm->seq_params.order_hint_info.enable_order_hint &&
+ !frame_is_intra_only(cm);
+}
+
+// Returns 1 if this frame might use warped_motion
+static INLINE int frame_might_allow_warped_motion(const AV1_COMMON *cm) {
+ return !cm->features.error_resilient_mode && !frame_is_intra_only(cm) &&
+ cm->seq_params.enable_warped_motion;
+}
+
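+// (Re)allocates the motion-vector and segmentation-map buffers of 'buf' when
+// the mi dimensions have changed, and grows the frame-level 'cm->tpl_mvs'
+// buffer if it is smaller than required for the current frame size.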
+static INLINE void ensure_mv_buffer(RefCntBuffer *buf, AV1_COMMON *cm) {
+ const int buf_rows = buf->mi_rows;
+ const int buf_cols = buf->mi_cols;
+ const CommonModeInfoParams *const mi_params = &cm->mi_params;
+
+ if (buf->mvs == NULL || buf_rows != mi_params->mi_rows ||
+ buf_cols != mi_params->mi_cols) {
+ aom_free(buf->mvs);
+ buf->mi_rows = mi_params->mi_rows;
+ buf->mi_cols = mi_params->mi_cols;
+ CHECK_MEM_ERROR(cm, buf->mvs,
+ (MV_REF *)aom_calloc(((mi_params->mi_rows + 1) >> 1) *
+ ((mi_params->mi_cols + 1) >> 1),
+ sizeof(*buf->mvs)));
+ aom_free(buf->seg_map);
+ CHECK_MEM_ERROR(
+ cm, buf->seg_map,
+ (uint8_t *)aom_calloc(mi_params->mi_rows * mi_params->mi_cols,
+ sizeof(*buf->seg_map)));
+ }
+
+ const int mem_size =
+ ((mi_params->mi_rows + MAX_MIB_SIZE) >> 1) * (mi_params->mi_stride >> 1);
+ int realloc = cm->tpl_mvs == NULL;
+ if (cm->tpl_mvs) realloc |= cm->tpl_mvs_mem_size < mem_size;
+
+ if (realloc) {
+ aom_free(cm->tpl_mvs);
+ CHECK_MEM_ERROR(cm, cm->tpl_mvs,
+ (TPL_MV_REF *)aom_calloc(mem_size, sizeof(*cm->tpl_mvs)));
+ cm->tpl_mvs_mem_size = mem_size;
+ }
+}
+
+void cfl_init(CFL_CTX *cfl, const SequenceHeader *seq_params);
+
+static INLINE int av1_num_planes(const AV1_COMMON *cm) {
+ return cm->seq_params.monochrome ? 1 : MAX_MB_PLANE;
+}
+
+static INLINE void av1_init_above_context(CommonContexts *above_contexts,
+ int num_planes, int tile_row,
+ MACROBLOCKD *xd) {
+ for (int i = 0; i < num_planes; ++i) {
+ xd->above_entropy_context[i] = above_contexts->entropy[i][tile_row];
+ }
+ xd->above_partition_context = above_contexts->partition[tile_row];
+ xd->above_txfm_context = above_contexts->txfm[tile_row];
+}
+
+static INLINE void av1_init_macroblockd(AV1_COMMON *cm, MACROBLOCKD *xd,
+ tran_low_t *dqcoeff) {
+ const int num_planes = av1_num_planes(cm);
+ const CommonQuantParams *const quant_params = &cm->quant_params;
+
+ for (int i = 0; i < num_planes; ++i) {
+ xd->plane[i].dqcoeff = dqcoeff;
+
+ if (xd->plane[i].plane_type == PLANE_TYPE_Y) {
+ memcpy(xd->plane[i].seg_dequant_QTX, quant_params->y_dequant_QTX,
+ sizeof(quant_params->y_dequant_QTX));
+ memcpy(xd->plane[i].seg_iqmatrix, quant_params->y_iqmatrix,
+ sizeof(quant_params->y_iqmatrix));
+
+ } else {
+ if (i == AOM_PLANE_U) {
+ memcpy(xd->plane[i].seg_dequant_QTX, quant_params->u_dequant_QTX,
+ sizeof(quant_params->u_dequant_QTX));
+ memcpy(xd->plane[i].seg_iqmatrix, quant_params->u_iqmatrix,
+ sizeof(quant_params->u_iqmatrix));
+ } else {
+ memcpy(xd->plane[i].seg_dequant_QTX, quant_params->v_dequant_QTX,
+ sizeof(quant_params->v_dequant_QTX));
+ memcpy(xd->plane[i].seg_iqmatrix, quant_params->v_iqmatrix,
+ sizeof(quant_params->v_iqmatrix));
+ }
+ }
+ }
+ xd->mi_stride = cm->mi_params.mi_stride;
+ xd->error_info = &cm->error;
+ cfl_init(&xd->cfl, &cm->seq_params);
+}
+
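+// Sets each plane's above/left entropy context pointers for the block at
+// (mi_row, mi_col), adjusting single-mi-wide/high blocks at odd positions to
+// account for chroma subsampling.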
+static INLINE void set_entropy_context(MACROBLOCKD *xd, int mi_row, int mi_col,
+ const int num_planes) {
+ int i;
+ int row_offset = mi_row;
+ int col_offset = mi_col;
+ for (i = 0; i < num_planes; ++i) {
+ struct macroblockd_plane *const pd = &xd->plane[i];
+ // Offset the buffer pointer
+ const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
+ if (pd->subsampling_y && (mi_row & 0x01) && (mi_size_high[bsize] == 1))
+ row_offset = mi_row - 1;
+ if (pd->subsampling_x && (mi_col & 0x01) && (mi_size_wide[bsize] == 1))
+ col_offset = mi_col - 1;
+ int above_idx = col_offset;
+ int left_idx = row_offset & MAX_MIB_MASK;
+ pd->above_entropy_context =
+ &xd->above_entropy_context[i][above_idx >> pd->subsampling_x];
+ pd->left_entropy_context =
+ &xd->left_entropy_context[i][left_idx >> pd->subsampling_y];
+ }
+}
+
+static INLINE int calc_mi_size(int len) {
+ // len is in mi units. Align to a multiple of SBs.
+ return ALIGN_POWER_OF_TWO(len, MAX_MIB_SIZE_LOG2);
+}
+
+static INLINE void set_plane_n4(MACROBLOCKD *const xd, int bw, int bh,
+ const int num_planes) {
+ int i;
+ for (i = 0; i < num_planes; i++) {
+ xd->plane[i].width = (bw * MI_SIZE) >> xd->plane[i].subsampling_x;
+ xd->plane[i].height = (bh * MI_SIZE) >> xd->plane[i].subsampling_y;
+
+ xd->plane[i].width = AOMMAX(xd->plane[i].width, 4);
+ xd->plane[i].height = AOMMAX(xd->plane[i].height, 4);
+ }
+}
+
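+// Records the block position (mi_row, mi_col) and size (bw x bh, in mi units)
+// in 'xd', computes its distances to the frame edges (converted to subpel
+// units via GET_MV_SUBPEL), and sets up the above/left luma and chroma
+// neighbor availability and pointers for the current tile.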
+static INLINE void set_mi_row_col(MACROBLOCKD *xd, const TileInfo *const tile,
+ int mi_row, int bh, int mi_col, int bw,
+ int mi_rows, int mi_cols) {
+ xd->mb_to_top_edge = -GET_MV_SUBPEL(mi_row * MI_SIZE);
+ xd->mb_to_bottom_edge = GET_MV_SUBPEL((mi_rows - bh - mi_row) * MI_SIZE);
+ xd->mb_to_left_edge = -GET_MV_SUBPEL((mi_col * MI_SIZE));
+ xd->mb_to_right_edge = GET_MV_SUBPEL((mi_cols - bw - mi_col) * MI_SIZE);
+
+ xd->mi_row = mi_row;
+ xd->mi_col = mi_col;
+
+ // Are edges available for intra prediction?
+ xd->up_available = (mi_row > tile->mi_row_start);
+
+ const int ss_x = xd->plane[1].subsampling_x;
+ const int ss_y = xd->plane[1].subsampling_y;
+
+ xd->left_available = (mi_col > tile->mi_col_start);
+ xd->chroma_up_available = xd->up_available;
+ xd->chroma_left_available = xd->left_available;
+ if (ss_x && bw < mi_size_wide[BLOCK_8X8])
+ xd->chroma_left_available = (mi_col - 1) > tile->mi_col_start;
+ if (ss_y && bh < mi_size_high[BLOCK_8X8])
+ xd->chroma_up_available = (mi_row - 1) > tile->mi_row_start;
+ if (xd->up_available) {
+ xd->above_mbmi = xd->mi[-xd->mi_stride];
+ } else {
+ xd->above_mbmi = NULL;
+ }
+
+ if (xd->left_available) {
+ xd->left_mbmi = xd->mi[-1];
+ } else {
+ xd->left_mbmi = NULL;
+ }
+
+ const int chroma_ref = ((mi_row & 0x01) || !(bh & 0x01) || !ss_y) &&
+ ((mi_col & 0x01) || !(bw & 0x01) || !ss_x);
+ xd->is_chroma_ref = chroma_ref;
+ if (chroma_ref) {
+ // To help calculate the "above" and "left" chroma blocks, note that the
+ // current block may cover multiple luma blocks (e.g., if partitioned into
+ // 4x4 luma blocks).
+ // First, find the top-left-most luma block covered by this chroma block
+ MB_MODE_INFO **base_mi =
+ &xd->mi[-(mi_row & ss_y) * xd->mi_stride - (mi_col & ss_x)];
+
+ // Then, we consider the luma region covered by the left or above 4x4 chroma
+ // prediction. We want to point to the chroma reference block in that
+ // region, which is the bottom-right-most mi unit.
+ // This leads to the following offsets:
+ MB_MODE_INFO *chroma_above_mi =
+ xd->chroma_up_available ? base_mi[-xd->mi_stride + ss_x] : NULL;
+ xd->chroma_above_mbmi = chroma_above_mi;
+
+ MB_MODE_INFO *chroma_left_mi =
+ xd->chroma_left_available ? base_mi[ss_y * xd->mi_stride - 1] : NULL;
+ xd->chroma_left_mbmi = chroma_left_mi;
+ }
+
+ xd->height = bh;
+ xd->width = bw;
+ xd->is_sec_rect = 0;
+ if (xd->width < xd->height) {
+ // Only mark is_sec_rect as 1 for the last block.
+ // For PARTITION_VERT_4, it would be (0, 0, 0, 1);
+ // For other partitions, it would be (0, 1).
+ if (!((mi_col + xd->width) & (xd->height - 1))) xd->is_sec_rect = 1;
+ }
+
+ if (xd->width > xd->height)
+ if (mi_row & (xd->width - 1)) xd->is_sec_rect = 1;
+}
+
+static INLINE aom_cdf_prob *get_y_mode_cdf(FRAME_CONTEXT *tile_ctx,
+ const MB_MODE_INFO *above_mi,
+ const MB_MODE_INFO *left_mi) {
+ const PREDICTION_MODE above = av1_above_block_mode(above_mi);
+ const PREDICTION_MODE left = av1_left_block_mode(left_mi);
+ const int above_ctx = intra_mode_context[above];
+ const int left_ctx = intra_mode_context[left];
+ return tile_ctx->kf_y_cdf[above_ctx][left_ctx];
+}
+
+static INLINE void update_partition_context(MACROBLOCKD *xd, int mi_row,
+ int mi_col, BLOCK_SIZE subsize,
+ BLOCK_SIZE bsize) {
+ PARTITION_CONTEXT *const above_ctx = xd->above_partition_context + mi_col;
+ PARTITION_CONTEXT *const left_ctx =
+ xd->left_partition_context + (mi_row & MAX_MIB_MASK);
+
+ const int bw = mi_size_wide[bsize];
+ const int bh = mi_size_high[bsize];
+ memset(above_ctx, partition_context_lookup[subsize].above, bw);
+ memset(left_ctx, partition_context_lookup[subsize].left, bh);
+}
+
+static INLINE int is_chroma_reference(int mi_row, int mi_col, BLOCK_SIZE bsize,
+ int subsampling_x, int subsampling_y) {
+ assert(bsize < BLOCK_SIZES_ALL);
+ const int bw = mi_size_wide[bsize];
+ const int bh = mi_size_high[bsize];
+ int ref_pos = ((mi_row & 0x01) || !(bh & 0x01) || !subsampling_y) &&
+ ((mi_col & 0x01) || !(bw & 0x01) || !subsampling_x);
+ return ref_pos;
+}
+
+static INLINE aom_cdf_prob cdf_element_prob(const aom_cdf_prob *cdf,
+ size_t element) {
+ assert(cdf != NULL);
+ return (element > 0 ? cdf[element - 1] : CDF_PROB_TOP) - cdf[element];
+}
+
+static INLINE void partition_gather_horz_alike(aom_cdf_prob *out,
+ const aom_cdf_prob *const in,
+ BLOCK_SIZE bsize) {
+ (void)bsize;
+ out[0] = CDF_PROB_TOP;
+ out[0] -= cdf_element_prob(in, PARTITION_HORZ);
+ out[0] -= cdf_element_prob(in, PARTITION_SPLIT);
+ out[0] -= cdf_element_prob(in, PARTITION_HORZ_A);
+ out[0] -= cdf_element_prob(in, PARTITION_HORZ_B);
+ out[0] -= cdf_element_prob(in, PARTITION_VERT_A);
+ if (bsize != BLOCK_128X128) out[0] -= cdf_element_prob(in, PARTITION_HORZ_4);
+ out[0] = AOM_ICDF(out[0]);
+ out[1] = AOM_ICDF(CDF_PROB_TOP);
+}
+
+static INLINE void partition_gather_vert_alike(aom_cdf_prob *out,
+ const aom_cdf_prob *const in,
+ BLOCK_SIZE bsize) {
+ (void)bsize;
+ out[0] = CDF_PROB_TOP;
+ out[0] -= cdf_element_prob(in, PARTITION_VERT);
+ out[0] -= cdf_element_prob(in, PARTITION_SPLIT);
+ out[0] -= cdf_element_prob(in, PARTITION_HORZ_A);
+ out[0] -= cdf_element_prob(in, PARTITION_VERT_A);
+ out[0] -= cdf_element_prob(in, PARTITION_VERT_B);
+ if (bsize != BLOCK_128X128) out[0] -= cdf_element_prob(in, PARTITION_VERT_4);
+ out[0] = AOM_ICDF(out[0]);
+ out[1] = AOM_ICDF(CDF_PROB_TOP);
+}
+
+static INLINE void update_ext_partition_context(MACROBLOCKD *xd, int mi_row,
+ int mi_col, BLOCK_SIZE subsize,
+ BLOCK_SIZE bsize,
+ PARTITION_TYPE partition) {
+ if (bsize >= BLOCK_8X8) {
+ const int hbs = mi_size_wide[bsize] / 2;
+ BLOCK_SIZE bsize2 = get_partition_subsize(bsize, PARTITION_SPLIT);
+ switch (partition) {
+ case PARTITION_SPLIT:
+ if (bsize != BLOCK_8X8) break;
+ AOM_FALLTHROUGH_INTENDED;
+ case PARTITION_NONE:
+ case PARTITION_HORZ:
+ case PARTITION_VERT:
+ case PARTITION_HORZ_4:
+ case PARTITION_VERT_4:
+ update_partition_context(xd, mi_row, mi_col, subsize, bsize);
+ break;
+ case PARTITION_HORZ_A:
+ update_partition_context(xd, mi_row, mi_col, bsize2, subsize);
+ update_partition_context(xd, mi_row + hbs, mi_col, subsize, subsize);
+ break;
+ case PARTITION_HORZ_B:
+ update_partition_context(xd, mi_row, mi_col, subsize, subsize);
+ update_partition_context(xd, mi_row + hbs, mi_col, bsize2, subsize);
+ break;
+ case PARTITION_VERT_A:
+ update_partition_context(xd, mi_row, mi_col, bsize2, subsize);
+ update_partition_context(xd, mi_row, mi_col + hbs, subsize, subsize);
+ break;
+ case PARTITION_VERT_B:
+ update_partition_context(xd, mi_row, mi_col, subsize, subsize);
+ update_partition_context(xd, mi_row, mi_col + hbs, bsize2, subsize);
+ break;
+ default: assert(0 && "Invalid partition type");
+ }
+ }
+}
+
+static INLINE int partition_plane_context(const MACROBLOCKD *xd, int mi_row,
+ int mi_col, BLOCK_SIZE bsize) {
+ const PARTITION_CONTEXT *above_ctx = xd->above_partition_context + mi_col;
+ const PARTITION_CONTEXT *left_ctx =
+ xd->left_partition_context + (mi_row & MAX_MIB_MASK);
+ // Minimum partition point is 8x8. Offset the bsl accordingly.
+ const int bsl = mi_size_wide_log2[bsize] - mi_size_wide_log2[BLOCK_8X8];
+ int above = (*above_ctx >> bsl) & 1, left = (*left_ctx >> bsl) & 1;
+
+ assert(mi_size_wide_log2[bsize] == mi_size_high_log2[bsize]);
+ assert(bsl >= 0);
+
+ return (left * 2 + above) + bsl * PARTITION_PLOFFSET;
+}
+
+// Return the number of elements in the partition CDF when
+// partitioning the (square) block with luma block size of bsize.
+static INLINE int partition_cdf_length(BLOCK_SIZE bsize) {
+ if (bsize <= BLOCK_8X8)
+ return PARTITION_TYPES;
+ else if (bsize == BLOCK_128X128)
+ return EXT_PARTITION_TYPES - 2;
+ else
+ return EXT_PARTITION_TYPES;
+}
+
+static INLINE int max_block_wide(const MACROBLOCKD *xd, BLOCK_SIZE bsize,
+ int plane) {
+ assert(bsize < BLOCK_SIZES_ALL);
+ int max_blocks_wide = block_size_wide[bsize];
+
+ if (xd->mb_to_right_edge < 0) {
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ max_blocks_wide += xd->mb_to_right_edge >> (3 + pd->subsampling_x);
+ }
+
+ // Scale the width in the transform block unit.
+ return max_blocks_wide >> MI_SIZE_LOG2;
+}
+
+static INLINE int max_block_high(const MACROBLOCKD *xd, BLOCK_SIZE bsize,
+ int plane) {
+ int max_blocks_high = block_size_high[bsize];
+
+ if (xd->mb_to_bottom_edge < 0) {
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ max_blocks_high += xd->mb_to_bottom_edge >> (3 + pd->subsampling_y);
+ }
+
+ // Scale the height in the transform block unit.
+ return max_blocks_high >> MI_SIZE_LOG2;
+}
+
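+// Resets the above-context buffers of the given tile row over the column
+// range [mi_col_start, mi_col_end): entropy and partition contexts are
+// zeroed, and the transform-size contexts are reset to the largest size.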
+static INLINE void av1_zero_above_context(AV1_COMMON *const cm,
+ const MACROBLOCKD *xd,
+ int mi_col_start, int mi_col_end,
+ const int tile_row) {
+ const SequenceHeader *const seq_params = &cm->seq_params;
+ const int num_planes = av1_num_planes(cm);
+ const int width = mi_col_end - mi_col_start;
+ const int aligned_width =
+ ALIGN_POWER_OF_TWO(width, seq_params->mib_size_log2);
+ const int offset_y = mi_col_start;
+ const int width_y = aligned_width;
+ const int offset_uv = offset_y >> seq_params->subsampling_x;
+ const int width_uv = width_y >> seq_params->subsampling_x;
+ CommonContexts *const above_contexts = &cm->above_contexts;
+
+ av1_zero_array(above_contexts->entropy[0][tile_row] + offset_y, width_y);
+ if (num_planes > 1) {
+ if (above_contexts->entropy[1][tile_row] &&
+ above_contexts->entropy[2][tile_row]) {
+ av1_zero_array(above_contexts->entropy[1][tile_row] + offset_uv,
+ width_uv);
+ av1_zero_array(above_contexts->entropy[2][tile_row] + offset_uv,
+ width_uv);
+ } else {
+ aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME,
+ "Invalid value of planes");
+ }
+ }
+
+ av1_zero_array(above_contexts->partition[tile_row] + mi_col_start,
+ aligned_width);
+
+ memset(above_contexts->txfm[tile_row] + mi_col_start,
+ tx_size_wide[TX_SIZES_LARGEST], aligned_width * sizeof(TXFM_CONTEXT));
+}
+
+static INLINE void av1_zero_left_context(MACROBLOCKD *const xd) {
+ av1_zero(xd->left_entropy_context);
+ av1_zero(xd->left_partition_context);
+
+ memset(xd->left_txfm_context_buffer, tx_size_high[TX_SIZES_LARGEST],
+ sizeof(xd->left_txfm_context_buffer));
+}
+
+// Disable array-bounds checks, as the TX_SIZE enum contains values larger
+// than TX_SIZES_ALL (TX_INVALID), which makes extending the arrays as a
+// workaround infeasible. The assert is enough for static analysis, and tools
+// such as ASan or Valgrind would catch out-of-bounds accesses at runtime.
+#if defined(__GNUC__) && __GNUC__ >= 4
+#pragma GCC diagnostic ignored "-Warray-bounds"
+#endif
+
+#if defined(__GNUC__) && __GNUC__ >= 4
+#pragma GCC diagnostic warning "-Warray-bounds"
+#endif
+
+static INLINE void set_txfm_ctx(TXFM_CONTEXT *txfm_ctx, uint8_t txs, int len) {
+ int i;
+ for (i = 0; i < len; ++i) txfm_ctx[i] = txs;
+}
+
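+// Sets the above/left transform-size contexts for an n4_w x n4_h (in mi
+// units) block; for skipped blocks, the whole block dimensions are used
+// instead of the transform dimensions.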
+static INLINE void set_txfm_ctxs(TX_SIZE tx_size, int n4_w, int n4_h, int skip,
+ const MACROBLOCKD *xd) {
+ uint8_t bw = tx_size_wide[tx_size];
+ uint8_t bh = tx_size_high[tx_size];
+
+ if (skip) {
+ bw = n4_w * MI_SIZE;
+ bh = n4_h * MI_SIZE;
+ }
+
+ set_txfm_ctx(xd->above_txfm_context, bw, n4_w);
+ set_txfm_ctx(xd->left_txfm_context, bh, n4_h);
+}
+
+static INLINE int get_mi_grid_idx(const CommonModeInfoParams *const mi_params,
+ int mi_row, int mi_col) {
+ return mi_row * mi_params->mi_stride + mi_col;
+}
+
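+// Returns the index into the 'mi_alloc' array of the unit covering
+// (mi_row, mi_col); MB_MODE_INFO structs are allocated at the coarser
+// 'mi_alloc_bsize' granularity.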
+static INLINE int get_alloc_mi_idx(const CommonModeInfoParams *const mi_params,
+ int mi_row, int mi_col) {
+ const int mi_alloc_size_1d = mi_size_wide[mi_params->mi_alloc_bsize];
+ const int mi_alloc_row = mi_row / mi_alloc_size_1d;
+ const int mi_alloc_col = mi_col / mi_alloc_size_1d;
+
+ return mi_alloc_row * mi_params->mi_alloc_stride + mi_alloc_col;
+}
+
+// For this partition block, set pointers in mi_params->mi_grid_base and xd->mi.
+static INLINE void set_mi_offsets(const CommonModeInfoParams *const mi_params,
+ MACROBLOCKD *const xd, int mi_row,
+ int mi_col) {
+ // 'mi_grid_base' should point to appropriate memory in 'mi'.
+ const int mi_grid_idx = get_mi_grid_idx(mi_params, mi_row, mi_col);
+ const int mi_alloc_idx = get_alloc_mi_idx(mi_params, mi_row, mi_col);
+ mi_params->mi_grid_base[mi_grid_idx] = &mi_params->mi_alloc[mi_alloc_idx];
+ // 'xd->mi' should point to an offset in 'mi_grid_base';
+ xd->mi = mi_params->mi_grid_base + mi_grid_idx;
+ // 'xd->tx_type_map' should point to an offset in 'mi_params->tx_type_map'.
+ xd->tx_type_map = mi_params->tx_type_map + mi_grid_idx;
+ xd->tx_type_map_stride = mi_params->mi_stride;
+}
+
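+// Records the transform size 'tx_size' chosen for a transform block of size
+// 'txb_size' into the above/left transform-size context arrays.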
+static INLINE void txfm_partition_update(TXFM_CONTEXT *above_ctx,
+ TXFM_CONTEXT *left_ctx,
+ TX_SIZE tx_size, TX_SIZE txb_size) {
+ BLOCK_SIZE bsize = txsize_to_bsize[txb_size];
+ int bh = mi_size_high[bsize];
+ int bw = mi_size_wide[bsize];
+ uint8_t txw = tx_size_wide[tx_size];
+ uint8_t txh = tx_size_high[tx_size];
+ int i;
+ for (i = 0; i < bh; ++i) left_ctx[i] = txh;
+ for (i = 0; i < bw; ++i) above_ctx[i] = txw;
+}
+
+static INLINE TX_SIZE get_sqr_tx_size(int tx_dim) {
+ switch (tx_dim) {
+ case 128:
+ case 64: return TX_64X64; break;
+ case 32: return TX_32X32; break;
+ case 16: return TX_16X16; break;
+ case 8: return TX_8X8; break;
+ default: return TX_4X4;
+ }
+}
+
+static INLINE TX_SIZE get_tx_size(int width, int height) {
+ if (width == height) {
+ return get_sqr_tx_size(width);
+ }
+ if (width < height) {
+ if (width + width == height) {
+ switch (width) {
+ case 4: return TX_4X8; break;
+ case 8: return TX_8X16; break;
+ case 16: return TX_16X32; break;
+ case 32: return TX_32X64; break;
+ }
+ } else {
+ switch (width) {
+ case 4: return TX_4X16; break;
+ case 8: return TX_8X32; break;
+ case 16: return TX_16X64; break;
+ }
+ }
+ } else {
+ if (height + height == width) {
+ switch (height) {
+ case 4: return TX_8X4; break;
+ case 8: return TX_16X8; break;
+ case 16: return TX_32X16; break;
+ case 32: return TX_64X32; break;
+ }
+ } else {
+ switch (height) {
+ case 4: return TX_16X4; break;
+ case 8: return TX_32X8; break;
+ case 16: return TX_64X16; break;
+ }
+ }
+ }
+ assert(0);
+ return TX_4X4;
+}
+
+static INLINE int txfm_partition_context(const TXFM_CONTEXT *const above_ctx,
+ const TXFM_CONTEXT *const left_ctx,
+ BLOCK_SIZE bsize, TX_SIZE tx_size) {
+ const uint8_t txw = tx_size_wide[tx_size];
+ const uint8_t txh = tx_size_high[tx_size];
+ const int above = *above_ctx < txw;
+ const int left = *left_ctx < txh;
+ int category = TXFM_PARTITION_CONTEXTS;
+
+ // dummy return, not used by others.
+ if (tx_size <= TX_4X4) return 0;
+
+ TX_SIZE max_tx_size =
+ get_sqr_tx_size(AOMMAX(block_size_wide[bsize], block_size_high[bsize]));
+
+ if (max_tx_size >= TX_8X8) {
+ category =
+ (txsize_sqr_up_map[tx_size] != max_tx_size && max_tx_size > TX_8X8) +
+ (TX_SIZES - 1 - max_tx_size) * 2;
+ }
+ assert(category != TXFM_PARTITION_CONTEXTS);
+ return category * 3 + above + left;
+}
+
+// Returns the partition type that was used at (mi_row, mi_col) to split the
+// (square) block of size 'bsize', derived from the sb_type values stored in
+// the mi array.
+static INLINE PARTITION_TYPE get_partition(const AV1_COMMON *const cm,
+ int mi_row, int mi_col,
+ BLOCK_SIZE bsize) {
+ const CommonModeInfoParams *const mi_params = &cm->mi_params;
+ if (mi_row >= mi_params->mi_rows || mi_col >= mi_params->mi_cols)
+ return PARTITION_INVALID;
+
+ const int offset = mi_row * mi_params->mi_stride + mi_col;
+ MB_MODE_INFO **mi = mi_params->mi_grid_base + offset;
+ const BLOCK_SIZE subsize = mi[0]->sb_type;
+
+ if (subsize == bsize) return PARTITION_NONE;
+
+ const int bhigh = mi_size_high[bsize];
+ const int bwide = mi_size_wide[bsize];
+ const int sshigh = mi_size_high[subsize];
+ const int sswide = mi_size_wide[subsize];
+
+ if (bsize > BLOCK_8X8 && mi_row + bwide / 2 < mi_params->mi_rows &&
+ mi_col + bhigh / 2 < mi_params->mi_cols) {
+ // In this case, the block might be using an extended partition
+ // type.
+ const MB_MODE_INFO *const mbmi_right = mi[bwide / 2];
+ const MB_MODE_INFO *const mbmi_below = mi[bhigh / 2 * mi_params->mi_stride];
+
+ if (sswide == bwide) {
+ // Smaller height but same width. Is PARTITION_HORZ_4, PARTITION_HORZ or
+ // PARTITION_HORZ_B. To distinguish the latter two, check if the lower
+ // half was split.
+ if (sshigh * 4 == bhigh) return PARTITION_HORZ_4;
+ assert(sshigh * 2 == bhigh);
+
+ if (mbmi_below->sb_type == subsize)
+ return PARTITION_HORZ;
+ else
+ return PARTITION_HORZ_B;
+ } else if (sshigh == bhigh) {
+ // Smaller width but same height. Is PARTITION_VERT_4, PARTITION_VERT or
+ // PARTITION_VERT_B. To distinguish the latter two, check if the right
+ // half was split.
+ if (sswide * 4 == bwide) return PARTITION_VERT_4;
+ assert(sswide * 2 == bhigh);
+
+ if (mbmi_right->sb_type == subsize)
+ return PARTITION_VERT;
+ else
+ return PARTITION_VERT_B;
+ } else {
+ // Smaller width and smaller height. Might be PARTITION_SPLIT or could be
+ // PARTITION_HORZ_A or PARTITION_VERT_A. If subsize isn't halved in both
+ // dimensions, we immediately know this is a split (which will recurse to
+ // get to subsize). Otherwise look down and to the right. With
+ // PARTITION_VERT_A, the right block will have height bhigh; with
+ // PARTITION_HORZ_A, the lower block will have width bwide. Otherwise
+ // it's PARTITION_SPLIT.
+ if (sswide * 2 != bwide || sshigh * 2 != bhigh) return PARTITION_SPLIT;
+
+ if (mi_size_wide[mbmi_below->sb_type] == bwide) return PARTITION_HORZ_A;
+ if (mi_size_high[mbmi_right->sb_type] == bhigh) return PARTITION_VERT_A;
+
+ return PARTITION_SPLIT;
+ }
+ }
+ const int vert_split = sswide < bwide;
+ const int horz_split = sshigh < bhigh;
+ const int split_idx = (vert_split << 1) | horz_split;
+ assert(split_idx != 0);
+
+ static const PARTITION_TYPE base_partitions[4] = {
+ PARTITION_INVALID, PARTITION_HORZ, PARTITION_VERT, PARTITION_SPLIT
+ };
+
+ return base_partitions[split_idx];
+}
+
+static INLINE void set_sb_size(SequenceHeader *const seq_params,
+ BLOCK_SIZE sb_size) {
+ seq_params->sb_size = sb_size;
+ seq_params->mib_size = mi_size_wide[seq_params->sb_size];
+ seq_params->mib_size_log2 = mi_size_wide_log2[seq_params->sb_size];
+}
+
+// Returns true if the frame is fully lossless at the coded resolution.
+// Note: If super-resolution is used, such a frame will still NOT be lossless at
+// the upscaled resolution.
+static INLINE int is_coded_lossless(const AV1_COMMON *cm,
+ const MACROBLOCKD *xd) {
+ int coded_lossless = 1;
+ if (cm->seg.enabled) {
+ for (int i = 0; i < MAX_SEGMENTS; ++i) {
+ if (!xd->lossless[i]) {
+ coded_lossless = 0;
+ break;
+ }
+ }
+ } else {
+ coded_lossless = xd->lossless[0];
+ }
+ return coded_lossless;
+}
+
+static INLINE int is_valid_seq_level_idx(AV1_LEVEL seq_level_idx) {
+ return seq_level_idx == SEQ_LEVEL_MAX ||
+ (seq_level_idx < SEQ_LEVELS &&
+ // The following levels are currently undefined.
+ seq_level_idx != SEQ_LEVEL_2_2 && seq_level_idx != SEQ_LEVEL_2_3 &&
+ seq_level_idx != SEQ_LEVEL_3_2 && seq_level_idx != SEQ_LEVEL_3_3 &&
+ seq_level_idx != SEQ_LEVEL_4_2 && seq_level_idx != SEQ_LEVEL_4_3 &&
+ seq_level_idx != SEQ_LEVEL_7_0 && seq_level_idx != SEQ_LEVEL_7_1 &&
+ seq_level_idx != SEQ_LEVEL_7_2 && seq_level_idx != SEQ_LEVEL_7_3);
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_COMMON_AV1_COMMON_INT_H_
diff --git a/media/libaom/src/av1/common/av1_inv_txfm1d.c b/media/libaom/src/av1/common/av1_inv_txfm1d.c
index 7ef2d6d7f..8d69efcd2 100644
--- a/media/libaom/src/av1/common/av1_inv_txfm1d.c
+++ b/media/libaom/src/av1/common/av1_inv_txfm1d.c
@@ -13,11 +13,8 @@
#include "av1/common/av1_inv_txfm1d.h"
#include "av1/common/av1_txfm.h"
-// TODO(angiebird): Make 1-d txfm functions static
-//
-
-void av1_idct4_new(const int32_t *input, int32_t *output, int8_t cos_bit,
- const int8_t *stage_range) {
+void av1_idct4(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range) {
assert(output != input);
const int32_t size = 4;
const int32_t *cospi = cospi_arr(cos_bit);
@@ -57,8 +54,8 @@ void av1_idct4_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[3] = clamp_value(bf0[0] - bf0[3], stage_range[stage]);
}
-void av1_idct8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
- const int8_t *stage_range) {
+void av1_idct8(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range) {
assert(output != input);
const int32_t size = 8;
const int32_t *cospi = cospi_arr(cos_bit);
@@ -138,8 +135,8 @@ void av1_idct8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[7] = clamp_value(bf0[0] - bf0[7], stage_range[stage]);
}
-void av1_idct16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
- const int8_t *stage_range) {
+void av1_idct16(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range) {
assert(output != input);
const int32_t size = 16;
const int32_t *cospi = cospi_arr(cos_bit);
@@ -303,8 +300,8 @@ void av1_idct16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[15] = clamp_value(bf0[0] - bf0[15], stage_range[stage]);
}
-void av1_idct32_new(const int32_t *input, int32_t *output, int8_t cos_bit,
- const int8_t *stage_range) {
+void av1_idct32(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range) {
assert(output != input);
const int32_t size = 32;
const int32_t *cospi = cospi_arr(cos_bit);
@@ -656,8 +653,8 @@ void av1_idct32_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[31] = clamp_value(bf0[0] - bf0[31], stage_range[stage]);
}
-void av1_iadst4_new(const int32_t *input, int32_t *output, int8_t cos_bit,
- const int8_t *stage_range) {
+void av1_iadst4(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range) {
int bit = cos_bit;
const int32_t *sinpi = sinpi_arr(bit);
int32_t s0, s1, s2, s3, s4, s5, s6, s7;
@@ -713,8 +710,8 @@ void av1_iadst4_new(const int32_t *input, int32_t *output, int8_t cos_bit,
output[3] = round_shift(x3, bit);
}
-void av1_iadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
- const int8_t *stage_range) {
+void av1_iadst8(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range) {
assert(output != input);
const int32_t size = 8;
const int32_t *cospi = cospi_arr(cos_bit);
@@ -809,7 +806,6 @@ void av1_iadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 7
- stage++;
bf0 = step;
bf1 = output;
bf1[0] = bf0[0];
@@ -822,8 +818,8 @@ void av1_iadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[7] = -bf0[1];
}
-void av1_iadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
- const int8_t *stage_range) {
+void av1_iadst16(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range) {
assert(output != input);
const int32_t size = 16;
const int32_t *cospi = cospi_arr(cos_bit);
@@ -1010,7 +1006,6 @@ void av1_iadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 9
- stage++;
bf0 = step;
bf1 = output;
bf1[0] = bf0[0];
@@ -1064,8 +1059,8 @@ void av1_iidentity32_c(const int32_t *input, int32_t *output, int8_t cos_bit,
for (int i = 0; i < 32; ++i) output[i] = (int32_t)((int64_t)input[i] * 4);
}
-void av1_idct64_new(const int32_t *input, int32_t *output, int8_t cos_bit,
- const int8_t *stage_range) {
+void av1_idct64(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range) {
assert(output != input);
const int32_t size = 64;
const int32_t *cospi = cospi_arr(cos_bit);
diff --git a/media/libaom/src/av1/common/av1_inv_txfm1d.h b/media/libaom/src/av1/common/av1_inv_txfm1d.h
index c31c019aa..e1d5d98d1 100644
--- a/media/libaom/src/av1/common/av1_inv_txfm1d.h
+++ b/media/libaom/src/av1/common/av1_inv_txfm1d.h
@@ -29,22 +29,22 @@ static INLINE void clamp_buf(int32_t *buf, int32_t size, int8_t bit) {
for (int i = 0; i < size; ++i) buf[i] = clamp_value(buf[i], bit);
}
-void av1_idct4_new(const int32_t *input, int32_t *output, int8_t cos_bit,
- const int8_t *stage_range);
-void av1_idct8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
- const int8_t *stage_range);
-void av1_idct16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
- const int8_t *stage_range);
-void av1_idct32_new(const int32_t *input, int32_t *output, int8_t cos_bit,
- const int8_t *stage_range);
-void av1_idct64_new(const int32_t *input, int32_t *output, int8_t cos_bit,
- const int8_t *stage_range);
-void av1_iadst4_new(const int32_t *input, int32_t *output, int8_t cos_bit,
- const int8_t *stage_range);
-void av1_iadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
- const int8_t *stage_range);
-void av1_iadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
- const int8_t *stage_range);
+void av1_idct4(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range);
+void av1_idct8(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range);
+void av1_idct16(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range);
+void av1_idct32(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range);
+void av1_idct64(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range);
+void av1_iadst4(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range);
+void av1_iadst8(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range);
+void av1_iadst16(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range);
void av1_iidentity4_c(const int32_t *input, int32_t *output, int8_t cos_bit,
const int8_t *stage_range);
void av1_iidentity8_c(const int32_t *input, int32_t *output, int8_t cos_bit,
diff --git a/media/libaom/src/av1/common/av1_inv_txfm1d_cfg.h b/media/libaom/src/av1/common/av1_inv_txfm1d_cfg.h
index 7d80a0099..47fedbd2a 100644
--- a/media/libaom/src/av1/common/av1_inv_txfm1d_cfg.h
+++ b/media/libaom/src/av1/common/av1_inv_txfm1d_cfg.h
@@ -36,12 +36,12 @@ static const int8_t inv_start_range[TX_SIZES_ALL] = {
7, // 64x16 transform
};
-extern const int8_t *inv_txfm_shift_ls[TX_SIZES_ALL];
+extern const int8_t *av1_inv_txfm_shift_ls[TX_SIZES_ALL];
-// Values in both inv_cos_bit_col and inv_cos_bit_row are always 12
+// Values in both av1_inv_cos_bit_col and av1_inv_cos_bit_row are always 12
// for each valid row and col combination
#define INV_COS_BIT 12
-extern const int8_t inv_cos_bit_col[5 /*row*/][5 /*col*/];
-extern const int8_t inv_cos_bit_row[5 /*row*/][5 /*col*/];
+extern const int8_t av1_inv_cos_bit_col[5 /*row*/][5 /*col*/];
+extern const int8_t av1_inv_cos_bit_row[5 /*row*/][5 /*col*/];
#endif // AOM_AV1_COMMON_AV1_INV_TXFM1D_CFG_H_
diff --git a/media/libaom/src/av1/common/av1_inv_txfm2d.c b/media/libaom/src/av1/common/av1_inv_txfm2d.c
index 4e6944314..559d12129 100644
--- a/media/libaom/src/av1/common/av1_inv_txfm2d.c
+++ b/media/libaom/src/av1/common/av1_inv_txfm2d.c
@@ -113,14 +113,14 @@ void av1_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8,
static INLINE TxfmFunc inv_txfm_type_to_func(TXFM_TYPE txfm_type) {
switch (txfm_type) {
- case TXFM_TYPE_DCT4: return av1_idct4_new;
- case TXFM_TYPE_DCT8: return av1_idct8_new;
- case TXFM_TYPE_DCT16: return av1_idct16_new;
- case TXFM_TYPE_DCT32: return av1_idct32_new;
- case TXFM_TYPE_DCT64: return av1_idct64_new;
- case TXFM_TYPE_ADST4: return av1_iadst4_new;
- case TXFM_TYPE_ADST8: return av1_iadst8_new;
- case TXFM_TYPE_ADST16: return av1_iadst16_new;
+ case TXFM_TYPE_DCT4: return av1_idct4;
+ case TXFM_TYPE_DCT8: return av1_idct8;
+ case TXFM_TYPE_DCT16: return av1_idct16;
+ case TXFM_TYPE_DCT32: return av1_idct32;
+ case TXFM_TYPE_DCT64: return av1_idct64;
+ case TXFM_TYPE_ADST4: return av1_iadst4;
+ case TXFM_TYPE_ADST8: return av1_iadst8;
+ case TXFM_TYPE_ADST16: return av1_iadst16;
case TXFM_TYPE_IDENTITY4: return av1_iidentity4_c;
case TXFM_TYPE_IDENTITY8: return av1_iidentity8_c;
case TXFM_TYPE_IDENTITY16: return av1_iidentity16_c;
@@ -149,7 +149,7 @@ static const int8_t inv_shift_32x8[2] = { -2, -4 };
static const int8_t inv_shift_16x64[2] = { -2, -4 };
static const int8_t inv_shift_64x16[2] = { -2, -4 };
-const int8_t *inv_txfm_shift_ls[TX_SIZES_ALL] = {
+const int8_t *av1_inv_txfm_shift_ls[TX_SIZES_ALL] = {
inv_shift_4x4, inv_shift_8x8, inv_shift_16x16, inv_shift_32x32,
inv_shift_64x64, inv_shift_4x8, inv_shift_8x4, inv_shift_8x16,
inv_shift_16x8, inv_shift_16x32, inv_shift_32x16, inv_shift_32x64,
@@ -158,7 +158,7 @@ const int8_t *inv_txfm_shift_ls[TX_SIZES_ALL] = {
};
/* clang-format off */
-const int8_t inv_cos_bit_col[MAX_TXWH_IDX] // txw_idx
+const int8_t av1_inv_cos_bit_col[MAX_TXWH_IDX] // txw_idx
[MAX_TXWH_IDX] = { // txh_idx
{ INV_COS_BIT, INV_COS_BIT, INV_COS_BIT, 0, 0 },
{ INV_COS_BIT, INV_COS_BIT, INV_COS_BIT, INV_COS_BIT, 0 },
@@ -167,7 +167,7 @@ const int8_t inv_cos_bit_col[MAX_TXWH_IDX] // txw_idx
{ 0, 0, INV_COS_BIT, INV_COS_BIT, INV_COS_BIT }
};
-const int8_t inv_cos_bit_row[MAX_TXWH_IDX] // txw_idx
+const int8_t av1_inv_cos_bit_row[MAX_TXWH_IDX] // txw_idx
[MAX_TXWH_IDX] = { // txh_idx
{ INV_COS_BIT, INV_COS_BIT, INV_COS_BIT, 0, 0 },
{ INV_COS_BIT, INV_COS_BIT, INV_COS_BIT, INV_COS_BIT, 0 },
@@ -177,23 +177,22 @@ const int8_t inv_cos_bit_row[MAX_TXWH_IDX] // txw_idx
};
/* clang-format on */
-const int8_t iadst4_range[7] = { 0, 1, 0, 0, 0, 0, 0 };
+static const int8_t iadst4_range[7] = { 0, 1, 0, 0, 0, 0, 0 };
void av1_get_inv_txfm_cfg(TX_TYPE tx_type, TX_SIZE tx_size,
TXFM_2D_FLIP_CFG *cfg) {
assert(cfg != NULL);
cfg->tx_size = tx_size;
- set_flip_cfg(tx_type, cfg);
av1_zero(cfg->stage_range_col);
av1_zero(cfg->stage_range_row);
set_flip_cfg(tx_type, cfg);
const TX_TYPE_1D tx_type_1d_col = vtx_tab[tx_type];
const TX_TYPE_1D tx_type_1d_row = htx_tab[tx_type];
- cfg->shift = inv_txfm_shift_ls[tx_size];
+ cfg->shift = av1_inv_txfm_shift_ls[tx_size];
const int txw_idx = get_txw_idx(tx_size);
const int txh_idx = get_txh_idx(tx_size);
- cfg->cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
- cfg->cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+ cfg->cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx];
+ cfg->cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx];
cfg->txfm_type_col = av1_txfm_type_ls[txh_idx][tx_type_1d_col];
if (cfg->txfm_type_col == TXFM_TYPE_ADST4) {
memcpy(cfg->stage_range_col, iadst4_range, sizeof(iadst4_range));
@@ -229,7 +228,7 @@ void av1_gen_inv_stage_range(int8_t *stage_range_col, int8_t *stage_range_row,
(void)real_range_row;
if (cfg->txfm_type_row == TXFM_TYPE_ADST4 && i == 1) {
// the adst4 may use 1 extra bit on top of opt_range_row at stage 1
- // so opt_range_col >= real_range_col will not hold
+ // so opt_range_row >= real_range_row will not hold
stage_range_row[i] = opt_range_row;
} else {
assert(opt_range_row >= real_range_row);
@@ -242,7 +241,7 @@ void av1_gen_inv_stage_range(int8_t *stage_range_col, int8_t *stage_range_row,
cfg->stage_range_col[i] + fwd_shift + shift[0] + bd + 1;
(void)real_range_col;
if (cfg->txfm_type_col == TXFM_TYPE_ADST4 && i == 1) {
- // the adst4 may use 1 extra bit on top of opt_range_row at stage 1
+ // the adst4 may use 1 extra bit on top of opt_range_col at stage 1
// so opt_range_col >= real_range_col will not hold
stage_range_col[i] = opt_range_col;
} else {
diff --git a/media/libaom/src/av1/common/av1_loopfilter.c b/media/libaom/src/av1/common/av1_loopfilter.c
index 537d8dfe9..c756760de 100644
--- a/media/libaom/src/av1/common/av1_loopfilter.c
+++ b/media/libaom/src/av1/common/av1_loopfilter.c
@@ -17,8 +17,8 @@
#include "aom_dsp/aom_dsp_common.h"
#include "aom_mem/aom_mem.h"
#include "aom_ports/mem.h"
+#include "av1/common/av1_common_int.h"
#include "av1/common/av1_loopfilter.h"
-#include "av1/common/onyxc_int.h"
#include "av1/common/reconinter.h"
#include "av1/common/seg_common.h"
@@ -28,11 +28,9 @@ static const SEG_LVL_FEATURES seg_lvl_lf_lut[MAX_MB_PLANE][2] = {
{ SEG_LVL_ALT_LF_V, SEG_LVL_ALT_LF_V }
};
-static const int delta_lf_id_lut[MAX_MB_PLANE][2] = {
- { 0, 1 }, { 2, 2 }, { 3, 3 }
-};
-
-typedef enum EDGE_DIR { VERT_EDGE = 0, HORZ_EDGE = 1, NUM_EDGE_DIRS } EDGE_DIR;
+static const int delta_lf_id_lut[MAX_MB_PLANE][2] = { { 0, 1 },
+ { 2, 2 },
+ { 3, 3 } };
static const int mode_lf_lut[] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // INTRA_MODES
@@ -40,393 +38,6 @@ static const int mode_lf_lut[] = {
1, 1, 1, 1, 1, 1, 0, 1 // INTER_COMPOUND_MODES (GLOBAL_GLOBALMV == 0)
};
-#if LOOP_FILTER_BITMASK
-// 256 bit masks (64x64 / 4x4) for left transform size for Y plane.
-// We use 4 uint64_t to represent the 256 bit.
-// Each 1 represents a position where we should apply a loop filter
-// across the left border of an 4x4 block boundary.
-//
-// In the case of TX_8x8-> ( in low order byte first we end up with
-// a mask that looks like this (-- and | are used for better view)
-//
-// 10101010|10101010
-// 10101010|10101010
-// 10101010|10101010
-// 10101010|10101010
-// 10101010|10101010
-// 10101010|10101010
-// 10101010|10101010
-// 10101010|10101010
-// -----------------
-// 10101010|10101010
-// 10101010|10101010
-// 10101010|10101010
-// 10101010|10101010
-// 10101010|10101010
-// 10101010|10101010
-// 10101010|10101010
-// 10101010|10101010
-//
-// A loopfilter should be applied to every other 4x4 horizontally.
-
-// 256 bit masks (64x64 / 4x4) for above transform size for Y plane.
-// We use 4 uint64_t to represent the 256 bit.
-// Each 1 represents a position where we should apply a loop filter
-// across the top border of an 4x4 block boundary.
-//
-// In the case of TX_8x8-> ( in low order byte first we end up with
-// a mask that looks like this
-//
-// 11111111|11111111
-// 00000000|00000000
-// 11111111|11111111
-// 00000000|00000000
-// 11111111|11111111
-// 00000000|00000000
-// 11111111|11111111
-// 00000000|00000000
-// -----------------
-// 11111111|11111111
-// 00000000|00000000
-// 11111111|11111111
-// 00000000|00000000
-// 11111111|11111111
-// 00000000|00000000
-// 11111111|11111111
-// 00000000|00000000
-//
-// A loopfilter should be applied to every other 4x4 horizontally.
-
-const int mask_id_table_tx_4x4[BLOCK_SIZES_ALL] = {
- 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, -1, -1, -1, 13, 14, 15, 16, 17, 18
-};
-
-const int mask_id_table_tx_8x8[BLOCK_SIZES_ALL] = {
- -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, 10, 11, 12, 13
-};
-
-const int mask_id_table_tx_16x16[BLOCK_SIZES_ALL] = {
- -1, -1, -1, -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, -1, -1, -1, -1, -1, -1, -1, 7, 8
-};
-
-const int mask_id_table_tx_32x32[BLOCK_SIZES_ALL] = { -1, -1, -1, -1, -1, -1,
- -1, -1, -1, 0, 1, 2,
- 3, -1, -1, -1, -1, -1,
- -1, -1, -1, -1 };
-
-const FilterMask left_mask_univariant_reordered[67] = {
- // TX_4X4
- { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL } }, // block size 4X4, TX_4X4
- { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL } }, // block size 4X8, TX_4X4
- { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL } }, // block size 8X4, TX_4X4
- { { 0x0000000000030003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL } }, // block size 8X8, TX_4X4
- { { 0x0003000300030003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL } }, // block size 8X16, TX_4X4
- { { 0x00000000000f000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL } }, // block size 16X8, TX_4X4
- { { 0x000f000f000f000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL } }, // block size 16X16, TX_4X4
- { { 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL } }, // block size 16X32, TX_4X4
- { { 0x00ff00ff00ff00ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL } }, // block size 32X16, TX_4X4
- { { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL } }, // block size 32X32, TX_4X4
- { { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL,
- 0x00ff00ff00ff00ffULL } }, // block size 32X64, TX_4X4
- { { 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL } }, // block size 64X32, TX_4X4
- { { 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL,
- 0xffffffffffffffffULL } }, // block size 64X64, TX_4X4
- { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL } }, // block size 4X16, TX_4X4
- { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL } }, // block size 16X4, TX_4X4
- { { 0x0003000300030003ULL, 0x0003000300030003ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL } }, // block size 8X32, TX_4X4
- { { 0x0000000000ff00ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL } }, // block size 32X8, TX_4X4
- { { 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x000f000f000f000fULL,
- 0x000f000f000f000fULL } }, // block size 16X64, TX_4X4
- { { 0xffffffffffffffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL } }, // block size 64X16, TX_4X4
- // TX_8X8
- { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL } }, // block size 8X8, TX_8X8
- { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL } }, // block size 8X16, TX_8X8
- { { 0x0000000000050005ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL } }, // block size 16X8, TX_8X8
- { { 0x0005000500050005ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL } }, // block size 16X16, TX_8X8
- { { 0x0005000500050005ULL, 0x0005000500050005ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL } }, // block size 16X32, TX_8X8
- { { 0x0055005500550055ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL } }, // block size 32X16, TX_8X8
- { { 0x0055005500550055ULL, 0x0055005500550055ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL } }, // block size 32X32, TX_8X8
- { { 0x0055005500550055ULL, 0x0055005500550055ULL, 0x0055005500550055ULL,
- 0x0055005500550055ULL } }, // block size 32X64, TX_8X8
- { { 0x5555555555555555ULL, 0x5555555555555555ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL } }, // block size 64X32, TX_8X8
- { { 0x5555555555555555ULL, 0x5555555555555555ULL, 0x5555555555555555ULL,
- 0x5555555555555555ULL } }, // block size 64X64, TX_8X8
- { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL } }, // block size 8X32, TX_8X8
- { { 0x0000000000550055ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL } }, // block size 32X8, TX_8X8
- { { 0x0005000500050005ULL, 0x0005000500050005ULL, 0x0005000500050005ULL,
- 0x0005000500050005ULL } }, // block size 16X64, TX_8X8
- { { 0x5555555555555555ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL } }, // block size 64X16, TX_8X8
- // TX_16X16
- { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL } }, // block size 16X16, TX_16X16
- { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL } }, // block size 16X32, TX_16X16
- { { 0x0011001100110011ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL } }, // block size 32X16, TX_16X16
- { { 0x0011001100110011ULL, 0x0011001100110011ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL } }, // block size 32X32, TX_16X16
- { { 0x0011001100110011ULL, 0x0011001100110011ULL, 0x0011001100110011ULL,
- 0x0011001100110011ULL } }, // block size 32X64, TX_16X16
- { { 0x1111111111111111ULL, 0x1111111111111111ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL } }, // block size 64X32, TX_16X16
- { { 0x1111111111111111ULL, 0x1111111111111111ULL, 0x1111111111111111ULL,
- 0x1111111111111111ULL } }, // block size 64X64, TX_16X16
- { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL,
- 0x0001000100010001ULL } }, // block size 16X64, TX_16X16
- { { 0x1111111111111111ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL } }, // block size 64X16, TX_16X16
- // TX_32X32
- { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL } }, // block size 32X32, TX_32X32
- { { 0x0101010101010101ULL, 0x0101010101010101ULL, 0x0101010101010101ULL,
- 0x0101010101010101ULL } }, // block size 32X64, TX_32X32
- { { 0x0101010101010101ULL, 0x0101010101010101ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL } }, // block size 64X32, TX_32X32
- { { 0x0101010101010101ULL, 0x0101010101010101ULL, 0x0101010101010101ULL,
- 0x0101010101010101ULL } }, // block size 64X64, TX_32X32
- // TX_64X64
- { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL,
- 0x0001000100010001ULL } }, // block size 64X64, TX_64X64
- // 2:1, 1:2 transform sizes.
- { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL } }, // block size 4X8, TX_4X8
- { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL } }, // block size 4X16, TX_4X8
- { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL } }, // block size 8X4, TX_8X4
- { { 0x0000000000000005ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL } }, // block size 16X4, TX_8X4
- { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL } }, // block size 8X16, TX_8X16
- { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL } }, // block size 8X32, TX_8X16
- { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL } }, // block size 16X8, TX_16X8
- { { 0x0000000000110011ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL } }, // block size 32X8, TX_16X8
- { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL } }, // block size 16X32, TX_16X32
- { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL,
- 0x0001000100010001ULL } }, // block size 16X64, TX_16X32
- { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL } }, // block size 32X16, TX_32X16
- { { 0x0101010101010101ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL } }, // block size 64X16, TX_32X16
- { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL,
- 0x0001000100010001ULL } }, // block size 32X64, TX_32X64
- { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL } }, // block size 64X32, TX_64X32
- // 4:1, 1:4 transform sizes.
- { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL } }, // block size 4X16, TX_4X16
- { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL } }, // block size 16X4, TX_16X4
- { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL } }, // block size 8X32, TX_8X32
- { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL } }, // block size 32X8, TX_32X8
- { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL,
- 0x0001000100010001ULL } }, // block size 16X64, TX_16X64
- { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL } }, // block size 64X16, TX_64X16
-};
-
-const FilterMask above_mask_univariant_reordered[67] = {
- // TX_4X4
- { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL } }, // block size 4X4, TX_4X4
- { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL } }, // block size 4X8, TX_4X4
- { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL } }, // block size 8X4, TX_4X4
- { { 0x0000000000030003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL } }, // block size 8X8, TX_4X4
- { { 0x0003000300030003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL } }, // block size 8X16, TX_4X4
- { { 0x00000000000f000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL } }, // block size 16X8, TX_4X4
- { { 0x000f000f000f000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL } }, // block size 16X16, TX_4X4
- { { 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL } }, // block size 16X32, TX_4X4
- { { 0x00ff00ff00ff00ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL } }, // block size 32X16, TX_4X4
- { { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL } }, // block size 32X32, TX_4X4
- { { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL,
- 0x00ff00ff00ff00ffULL } }, // block size 32X64, TX_4X4
- { { 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL } }, // block size 64X32, TX_4X4
- { { 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL,
-     0xffffffffffffffffULL } }, // block size 64X64, TX_4X4
- { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL } }, // block size 4X16, TX_4X4
- { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL } }, // block size 16X4, TX_4X4
- { { 0x0003000300030003ULL, 0x0003000300030003ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL } }, // block size 8X32, TX_4X4
- { { 0x0000000000ff00ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL } }, // block size 32X8, TX_4X4
- { { 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x000f000f000f000fULL,
- 0x000f000f000f000fULL } }, // block size 16X64, TX_4X4
- { { 0xffffffffffffffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL } }, // block size 64X16, TX_4X4
- // TX_8X8
- { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL } }, // block size 8X8, TX_8X8
- { { 0x0000000300000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL } }, // block size 8X16, TX_8X8
- { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL } }, // block size 16X8, TX_8X8
- { { 0x0000000f0000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL } }, // block size 16X16, TX_8X8
- { { 0x0000000f0000000fULL, 0x0000000f0000000fULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL } }, // block size 16X32, TX_8X8
- { { 0x000000ff000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL } }, // block size 32X16, TX_8X8
- { { 0x000000ff000000ffULL, 0x000000ff000000ffULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL } }, // block size 32X32, TX_8X8
- { { 0x000000ff000000ffULL, 0x000000ff000000ffULL, 0x000000ff000000ffULL,
- 0x000000ff000000ffULL } }, // block size 32X64, TX_8X8
- { { 0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL } }, // block size 64X32, TX_8X8
- { { 0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL,
- 0x0000ffff0000ffffULL } }, // block size 64X64, TX_8X8
- { { 0x0000000300000003ULL, 0x0000000300000003ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL } }, // block size 8X32, TX_8X8
- { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL } }, // block size 32X8, TX_8X8
- { { 0x0000000f0000000fULL, 0x0000000f0000000fULL, 0x0000000f0000000fULL,
- 0x0000000f0000000fULL } }, // block size 16X64, TX_8X8
- { { 0x0000ffff0000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL } }, // block size 64X16, TX_8X8
- // TX_16X16
- { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL } }, // block size 16X16, TX_16X16
- { { 0x000000000000000fULL, 0x000000000000000fULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL } }, // block size 16X32, TX_16X16
- { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL } }, // block size 32X16, TX_16X16
- { { 0x00000000000000ffULL, 0x00000000000000ffULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL } }, // block size 32X32, TX_16X16
- { { 0x00000000000000ffULL, 0x00000000000000ffULL, 0x00000000000000ffULL,
- 0x00000000000000ffULL } }, // block size 32X64, TX_16X16
- { { 0x000000000000ffffULL, 0x000000000000ffffULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL } }, // block size 64X32, TX_16X16
- { { 0x000000000000ffffULL, 0x000000000000ffffULL, 0x000000000000ffffULL,
- 0x000000000000ffffULL } }, // block size 64X64, TX_16X16
- { { 0x000000000000000fULL, 0x000000000000000fULL, 0x000000000000000fULL,
- 0x000000000000000fULL } }, // block size 16X64, TX_16X16
- { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL } }, // block size 64X16, TX_16X16
- // TX_32X32
- { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL } }, // block size 32X32, TX_32X32
- { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x00000000000000ffULL,
- 0x0000000000000000ULL } }, // block size 32X64, TX_32X32
- { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL } }, // block size 64X32, TX_32X32
- { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x000000000000ffffULL,
- 0x0000000000000000ULL } }, // block size 64X64, TX_32X32
- // TX_64X64
- { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL } }, // block size 64X64, TX_64X64
- // 2:1, 1:2 transform sizes.
- { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL } }, // block size 4X8, TX_4X8
- { { 0x0000000100000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL } }, // block size 4X16, TX_4X8
- { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL } }, // block size 8X4, TX_8X4
- { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL } }, // block size 16X4, TX_8X4
- { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL } }, // block size 8X16, TX_8X16
- { { 0x0000000000000003ULL, 0x0000000000000003ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL } }, // block size 8X32, TX_8X16
- { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL } }, // block size 16X8, TX_16X8
- { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL } }, // block size 32X8, TX_16X8
- { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL } }, // block size 16X32, TX_16X32
- { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x000000000000000fULL,
- 0x0000000000000000ULL } }, // block size 16X64, TX_16X32
- { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL } }, // block size 32X16, TX_32X16
- { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL } }, // block size 64X16, TX_32X16
- { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL } }, // block size 32X64, TX_32X64
- { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL } }, // block size 64X32, TX_64X32
- // 4:1, 1:4 transform sizes.
- { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL } }, // block size 4X16, TX_4X16
- { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL } }, // block size 16X4, TX_16X4
- { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL } }, // block size 8X32, TX_8X32
- { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL } }, // block size 32X8, TX_32X8
- { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL } }, // block size 16X64, TX_16X64
- { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL } }, // block size 64X16, TX_64X16
-};
-
-LoopFilterMask *get_loop_filter_mask(const AV1_COMMON *const cm, int mi_row,
- int mi_col) {
- assert(cm->lf.lfm != NULL);
- const int row = mi_row >> MIN_MIB_SIZE_LOG2; // 64x64
- const int col = mi_col >> MIN_MIB_SIZE_LOG2;
- return &cm->lf.lfm[row * cm->lf.lfm_stride + col];
-}
-
-typedef void (*LpfFunc)(uint8_t *s, int p, const uint8_t *blimit,
- const uint8_t *limit, const uint8_t *thresh);
-
-typedef void (*LpfDualFunc)(uint8_t *s, int p, const uint8_t *blimit0,
- const uint8_t *limit0, const uint8_t *thresh0,
- const uint8_t *blimit1, const uint8_t *limit1,
- const uint8_t *thresh1);
-
-typedef void (*HbdLpfFunc)(uint16_t *s, int p, const uint8_t *blimit,
- const uint8_t *limit, const uint8_t *thresh, int bd);
-
-typedef void (*HbdLpfDualFunc)(uint16_t *s, int p, const uint8_t *blimit0,
- const uint8_t *limit0, const uint8_t *thresh0,
- const uint8_t *blimit1, const uint8_t *limit1,
- const uint8_t *thresh1, int bd);
-#endif // LOOP_FILTER_BITMASK
-
static void update_sharpness(loop_filter_info_n *lfi, int sharpness_lvl) {
int lvl;
@@ -448,13 +59,13 @@ static void update_sharpness(loop_filter_info_n *lfi, int sharpness_lvl) {
}
}
-uint8_t get_filter_level(const AV1_COMMON *cm, const loop_filter_info_n *lfi_n,
- const int dir_idx, int plane,
- const MB_MODE_INFO *mbmi) {
+uint8_t av1_get_filter_level(const AV1_COMMON *cm,
+ const loop_filter_info_n *lfi_n, const int dir_idx,
+ int plane, const MB_MODE_INFO *mbmi) {
const int segment_id = mbmi->segment_id;
- if (cm->delta_lf_present_flag) {
- int delta_lf;
- if (cm->delta_lf_multi) {
+ if (cm->delta_q_info.delta_lf_present_flag) {
+ int8_t delta_lf;
+ if (cm->delta_q_info.delta_lf_multi) {
const int delta_lf_idx = delta_lf_id_lut[plane][dir_idx];
delta_lf = mbmi->delta_lf[delta_lf_idx];
} else {
@@ -531,6 +142,9 @@ void av1_loop_filter_frame_init(AV1_COMMON *cm, int plane_start,
filt_lvl_r[1] = cm->lf.filter_level_u;
filt_lvl_r[2] = cm->lf.filter_level_v;
+ assert(plane_start >= AOM_PLANE_Y);
+ assert(plane_end <= MAX_MB_PLANE);
+
for (plane = plane_start; plane < plane_end; plane++) {
if (plane == 0 && !filt_lvl[0] && !filt_lvl_r[0])
break;
@@ -542,7 +156,6 @@ void av1_loop_filter_frame_init(AV1_COMMON *cm, int plane_start,
for (seg_id = 0; seg_id < MAX_SEGMENTS; seg_id++) {
for (int dir = 0; dir < 2; ++dir) {
int lvl_seg = (dir == 0) ? filt_lvl[plane] : filt_lvl_r[plane];
- assert(plane >= 0 && plane <= 2);
const int seg_lf_feature_id = seg_lvl_lf_lut[plane][dir];
if (segfeature_active(seg, seg_id, seg_lf_feature_id)) {
const int data = get_segdata(&cm->seg, seg_id, seg_lf_feature_id);
@@ -575,1321 +188,6 @@ void av1_loop_filter_frame_init(AV1_COMMON *cm, int plane_start,
}
}
-#if LOOP_FILTER_BITMASK
-// A 64x64 block requires 256 bits, one for each of its 4x4 tx blocks.
-// Every 4 rows are represented by one uint64_t mask, so four
-// uint64_t bitmask[4] cover the whole 64x64 block.
-//
-// Given a location (mi_col, mi_row), this function returns the index
-// 0, 1, 2, 3 selecting which bitmask[] to use, and the shift value.
-//
-// For example, mi_row is the offset in mi units (each 4 pixels), so
-// (mi_row / 4) selects which uint64_t to use. Within that uint64_t,
-// mi_row % 4 is the row offset, and each row holds
-// 16 = 1 << stride_log2 4x4 units.
-// Therefore, shift = (row << stride_log2) + mi_col;
-int get_index_shift(int mi_col, int mi_row, int *index) {
- // *index = mi_row >> 2;
- // rows = mi_row % 4;
- // stride_log2 = 4;
- // shift = (rows << stride_log2) + mi_col;
- *index = mi_row >> 2;
- return ((mi_row & 3) << 4) | mi_col;
-}
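
For reference, a minimal standalone sketch of the bit layout this helper assumes: a 64x64 luma area is a 16x16 grid of 4x4 units, packed into four uint64_t words of 4 rows by 16 columns each. The index_shift() helper and the main() driver below are illustrative stand-ins, not libaom symbols.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Mirror of the mapping above: which 64-bit word and which bit inside it. */
static int index_shift(int mi_col, int mi_row, int *index) {
  *index = mi_row >> 2;                /* which of the four 64-bit words */
  return ((mi_row & 3) << 4) | mi_col; /* bit position inside that word */
}

int main(void) {
  uint64_t bits[4] = { 0, 0, 0, 0 };
  int index;
  /* Mark the 4x4 unit at mi_row = 9, mi_col = 5 within the 64x64 area. */
  const int shift = index_shift(5, 9, &index);
  bits[index] |= (uint64_t)1 << shift;
  assert(index == 2 && shift == 21); /* row 9 -> word 2, row-in-word 1 */
  printf("word %d, bit %d -> 0x%016llx\n", index, shift,
         (unsigned long long)bits[index]);
  return 0;
}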
-
-static void check_mask(const FilterMask *lfm) {
-#ifndef NDEBUG
- for (int i = 0; i < 4; ++i) {
- assert(!(lfm[TX_4X4].bits[i] & lfm[TX_8X8].bits[i]));
- assert(!(lfm[TX_4X4].bits[i] & lfm[TX_16X16].bits[i]));
- assert(!(lfm[TX_4X4].bits[i] & lfm[TX_32X32].bits[i]));
- assert(!(lfm[TX_4X4].bits[i] & lfm[TX_64X64].bits[i]));
- assert(!(lfm[TX_8X8].bits[i] & lfm[TX_16X16].bits[i]));
- assert(!(lfm[TX_8X8].bits[i] & lfm[TX_32X32].bits[i]));
- assert(!(lfm[TX_8X8].bits[i] & lfm[TX_64X64].bits[i]));
- assert(!(lfm[TX_16X16].bits[i] & lfm[TX_32X32].bits[i]));
- assert(!(lfm[TX_16X16].bits[i] & lfm[TX_64X64].bits[i]));
- assert(!(lfm[TX_32X32].bits[i] & lfm[TX_64X64].bits[i]));
- }
-#else
- (void)lfm;
-#endif
-}
-
-static void check_loop_filter_masks(const LoopFilterMask *lfm, int plane) {
- if (plane == 0) {
- // Assert if we try to apply 2 different loop filters at the same
- // position.
- check_mask(lfm->left_y);
- check_mask(lfm->above_y);
- } else if (plane == 1) {
- check_mask(lfm->left_u);
- check_mask(lfm->above_u);
- } else {
- check_mask(lfm->left_v);
- check_mask(lfm->above_v);
- }
-}
-
-static void update_masks(EDGE_DIR dir, int plane, uint64_t *mask,
- TX_SIZE sqr_tx_size, LoopFilterMask *lfm) {
- if (dir == VERT_EDGE) {
- switch (plane) {
- case 0:
- for (int i = 0; i < 4; ++i) lfm->left_y[sqr_tx_size].bits[i] |= mask[i];
- break;
- case 1:
- for (int i = 0; i < 4; ++i) lfm->left_u[sqr_tx_size].bits[i] |= mask[i];
- break;
- case 2:
- for (int i = 0; i < 4; ++i) lfm->left_v[sqr_tx_size].bits[i] |= mask[i];
- break;
- default: assert(plane <= 2);
- }
- } else {
- switch (plane) {
- case 0:
- for (int i = 0; i < 4; ++i)
- lfm->above_y[sqr_tx_size].bits[i] |= mask[i];
- break;
- case 1:
- for (int i = 0; i < 4; ++i)
- lfm->above_u[sqr_tx_size].bits[i] |= mask[i];
- break;
- case 2:
- for (int i = 0; i < 4; ++i)
- lfm->above_v[sqr_tx_size].bits[i] |= mask[i];
- break;
- default: assert(plane <= 2);
- }
- }
-}
-
-static int is_frame_boundary(AV1_COMMON *const cm, int plane, int mi_row,
- int mi_col, int ssx, int ssy, EDGE_DIR dir) {
- if (plane && (ssx || ssy)) {
- if (ssx && ssy) { // format 420
- if ((mi_row << MI_SIZE_LOG2) > cm->height ||
- (mi_col << MI_SIZE_LOG2) > cm->width)
- return 1;
- } else if (ssx) { // format 422
- if ((mi_row << MI_SIZE_LOG2) >= cm->height ||
- (mi_col << MI_SIZE_LOG2) > cm->width)
- return 1;
- }
- } else {
- if ((mi_row << MI_SIZE_LOG2) >= cm->height ||
- (mi_col << MI_SIZE_LOG2) >= cm->width)
- return 1;
- }
-
- int row_or_col;
- if (plane == 0) {
- row_or_col = dir == VERT_EDGE ? mi_col : mi_row;
- } else {
- // chroma sub8x8 block uses bottom/right mi of co-located 8x8 luma block.
- // So if mi_col == 1, it is actually the frame boundary.
- if (dir == VERT_EDGE) {
- row_or_col = ssx ? (mi_col & 0x0FFFFFFE) : mi_col;
- } else {
- row_or_col = ssy ? (mi_row & 0x0FFFFFFE) : mi_row;
- }
- }
- return row_or_col == 0;
-}
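
A compact sketch of the boundary test above, assuming MI_SIZE_LOG2 == 2 (one mi unit covers 4 luma pixels); outside_frame() is an illustrative helper, not a libaom function. It shows why subsampled planes compare with '>' while luma compares with '>=': a chroma sub-8x8 block borrows the odd (bottom/right) mi of the co-located 8x8 luma block, and when the frame is not a multiple of 8 pixels that odd position lands exactly on the frame dimension yet must still be filtered.

#include <stdbool.h>
#include <stdio.h>

#define MI_SIZE_LOG2 2

/* '>' for subsampled chroma, '>=' for luma, as in is_frame_boundary(). */
static bool outside_frame(int mi_pos, int frame_dim, bool subsampled) {
  const int px = mi_pos << MI_SIZE_LOG2;
  return subsampled ? px > frame_dim : px >= frame_dim;
}

int main(void) {
  /* Frame height 124 = 31 mi rows. The last chroma row uses mi 30 | 1 = 31,
   * i.e. pixel offset 124: outside for luma, still filtered for chroma. */
  printf("luma   outside: %d\n", outside_frame(31, 124, false)); /* 1 */
  printf("chroma outside: %d\n", outside_frame(31, 124, true));  /* 0 */
  return 0;
}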
-
-static void setup_masks(AV1_COMMON *const cm, int mi_row, int mi_col, int plane,
- int ssx, int ssy, TX_SIZE tx_size) {
- LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col);
- const int x = (mi_col << (MI_SIZE_LOG2 - ssx));
- const int y = (mi_row << (MI_SIZE_LOG2 - ssy));
- // decide whether current vertical/horizontal edge needs loop filtering
- for (EDGE_DIR dir = VERT_EDGE; dir <= HORZ_EDGE; ++dir) {
- // chroma sub8x8 block uses bottom/right mi of co-located 8x8 luma block.
- mi_row |= ssy;
- mi_col |= ssx;
-
- MB_MODE_INFO **mi = cm->mi_grid_visible + mi_row * cm->mi_stride + mi_col;
- const MB_MODE_INFO *const mbmi = mi[0];
- const int curr_skip = mbmi->skip && is_inter_block(mbmi);
- const BLOCK_SIZE bsize = mbmi->sb_type;
- const BLOCK_SIZE bsizec = scale_chroma_bsize(bsize, ssx, ssy);
- const BLOCK_SIZE plane_bsize = ss_size_lookup[bsizec][ssx][ssy];
- const uint8_t level = get_filter_level(cm, &cm->lf_info, dir, plane, mbmi);
- const int prediction_masks = dir == VERT_EDGE
- ? block_size_wide[plane_bsize] - 1
- : block_size_high[plane_bsize] - 1;
- const int is_coding_block_border =
- dir == VERT_EDGE ? !(x & prediction_masks) : !(y & prediction_masks);
-
- // TODO(chengchen): step can be optimized.
- const int row_step = mi_size_high[TX_4X4] << ssy;
- const int col_step = mi_size_wide[TX_4X4] << ssx;
- const int mi_height =
- dir == VERT_EDGE ? tx_size_high_unit[tx_size] << ssy : row_step;
- const int mi_width =
- dir == VERT_EDGE ? col_step : tx_size_wide_unit[tx_size] << ssx;
-
- // assign filter levels
- for (int r = mi_row; r < mi_row + mi_height; r += row_step) {
- for (int c = mi_col; c < mi_col + mi_width; c += col_step) {
-        // do not filter frame boundary
-        // Note: when the chroma planes are half the size of the luma plane,
-        // a chroma-plane mi corresponds to an even luma position.
-        // If the frame size is not even, we still need to filter this chroma
-        // position. Therefore the boundary condition check is split into
-        // two cases.
- if (plane && (ssx || ssy)) {
- if (ssx && ssy) { // format 420
- if ((r << MI_SIZE_LOG2) > cm->height ||
- (c << MI_SIZE_LOG2) > cm->width)
- continue;
- } else if (ssx) { // format 422
- if ((r << MI_SIZE_LOG2) >= cm->height ||
- (c << MI_SIZE_LOG2) > cm->width)
- continue;
- }
- } else {
- if ((r << MI_SIZE_LOG2) >= cm->height ||
- (c << MI_SIZE_LOG2) >= cm->width)
- continue;
- }
-
- const int row = r % MI_SIZE_64X64;
- const int col = c % MI_SIZE_64X64;
- if (plane == 0) {
- if (dir == VERT_EDGE)
- lfm->lfl_y_ver[row][col] = level;
- else
- lfm->lfl_y_hor[row][col] = level;
- } else if (plane == 1) {
- lfm->lfl_u[row][col] = level;
- } else {
- lfm->lfl_v[row][col] = level;
- }
- }
- }
-
- for (int r = mi_row; r < mi_row + mi_height; r += row_step) {
- for (int c = mi_col; c < mi_col + mi_width; c += col_step) {
- // do not filter frame boundary
- if (is_frame_boundary(cm, plane, r, c, ssx, ssy, dir)) continue;
-
- uint64_t mask[4] = { 0 };
- const int prev_row = dir == VERT_EDGE ? r : r - (1 << ssy);
- const int prev_col = dir == VERT_EDGE ? c - (1 << ssx) : c;
- MB_MODE_INFO **mi_prev =
- cm->mi_grid_visible + prev_row * cm->mi_stride + prev_col;
- const MB_MODE_INFO *const mbmi_prev = mi_prev[0];
- const int prev_skip = mbmi_prev->skip && is_inter_block(mbmi_prev);
- const uint8_t level_prev =
- get_filter_level(cm, &cm->lf_info, dir, plane, mbmi_prev);
- const int is_edge =
- (level || level_prev) &&
- (!curr_skip || !prev_skip || is_coding_block_border);
-
- if (is_edge) {
- const TX_SIZE prev_tx_size =
- plane ? av1_get_max_uv_txsize(mbmi_prev->sb_type, ssx, ssy)
- : mbmi_prev->tx_size;
- TX_SIZE min_tx_size = (dir == VERT_EDGE)
- ? AOMMIN(txsize_horz_map[tx_size],
- txsize_horz_map[prev_tx_size])
- : AOMMIN(txsize_vert_map[tx_size],
- txsize_vert_map[prev_tx_size]);
- min_tx_size = AOMMIN(min_tx_size, TX_16X16);
- assert(min_tx_size < TX_SIZES);
- const int row = r % MI_SIZE_64X64;
- const int col = c % MI_SIZE_64X64;
- int index = 0;
- const int shift = get_index_shift(col, row, &index);
- assert(index < 4 && index >= 0);
- mask[index] |= ((uint64_t)1 << shift);
- // set mask on corresponding bit
- update_masks(dir, plane, mask, min_tx_size, lfm);
- }
- }
- }
- }
-}
-
-static void setup_tx_block_mask(AV1_COMMON *const cm, int mi_row, int mi_col,
- int blk_row, int blk_col,
- BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
- int plane, int ssx, int ssy) {
- blk_row <<= ssy;
- blk_col <<= ssx;
- if (((mi_row + blk_row) << MI_SIZE_LOG2) >= cm->height ||
- ((mi_col + blk_col) << MI_SIZE_LOG2) >= cm->width)
- return;
-
- // U/V plane, tx_size is always the largest size
- if (plane) {
- assert(tx_size_wide[tx_size] <= 32 && tx_size_high[tx_size] <= 32);
- setup_masks(cm, mi_row + blk_row, mi_col + blk_col, plane, ssx, ssy,
- tx_size);
- return;
- }
-
- MB_MODE_INFO **mi = cm->mi_grid_visible + mi_row * cm->mi_stride + mi_col;
- const MB_MODE_INFO *const mbmi = mi[0];
-  // For Y plane:
-  // If intra block, the tx size is uniform across the block.
-  // If inter block, the tx size follows inter_tx_size.
- TX_SIZE plane_tx_size = tx_size;
- const int is_inter = is_inter_block(mbmi);
-
- if (plane == 0) {
- if (is_inter) {
- if (mbmi->skip) {
-        // TODO(chengchen): change av1_get_transform_size() to be consistent.
- // plane_tx_size = get_max_rect_tx_size(plane_bsize);
- plane_tx_size = mbmi->tx_size;
- } else {
- plane_tx_size = mbmi->inter_tx_size[av1_get_txb_size_index(
- plane_bsize, blk_row, blk_col)];
- }
- } else {
- MB_MODE_INFO **mi_this = cm->mi_grid_visible +
- (mi_row + blk_row) * cm->mi_stride + mi_col +
- blk_col;
- const MB_MODE_INFO *const mbmi_this = mi_this[0];
- plane_tx_size = mbmi_this->tx_size;
- }
- }
-
- assert(txsize_to_bsize[plane_tx_size] <= plane_bsize);
-
- if (plane || plane_tx_size == tx_size) {
- setup_masks(cm, mi_row + blk_row, mi_col + blk_col, plane, ssx, ssy,
- tx_size);
- } else {
- const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
- const int bsw = tx_size_wide_unit[sub_txs];
- const int bsh = tx_size_high_unit[sub_txs];
- for (int row = 0; row < tx_size_high_unit[tx_size]; row += bsh) {
- for (int col = 0; col < tx_size_wide_unit[tx_size]; col += bsw) {
- const int offsetr = blk_row + row;
- const int offsetc = blk_col + col;
- setup_tx_block_mask(cm, mi_row, mi_col, offsetr, offsetc, plane_bsize,
- sub_txs, plane, ssx, ssy);
- }
- }
- }
-}
-
-static void setup_fix_block_mask(AV1_COMMON *const cm, int mi_row, int mi_col,
- int plane, int ssx, int ssy) {
- MB_MODE_INFO **mi =
- cm->mi_grid_visible + (mi_row | ssy) * cm->mi_stride + (mi_col | ssx);
- const MB_MODE_INFO *const mbmi = mi[0];
-
- const BLOCK_SIZE bsize = mbmi->sb_type;
- const BLOCK_SIZE bsizec = scale_chroma_bsize(bsize, ssx, ssy);
- const BLOCK_SIZE plane_bsize = ss_size_lookup[bsizec][ssx][ssy];
-
- const int block_width = mi_size_wide[plane_bsize];
- const int block_height = mi_size_high[plane_bsize];
-
- TX_SIZE max_txsize = max_txsize_rect_lookup[plane_bsize];
- // The decoder is designed so that it can process 64x64 luma pixels at a
- // time. If this is a chroma plane with subsampling and bsize corresponds to
- // a subsampled BLOCK_128X128 then the lookup above will give TX_64X64. That
- // mustn't be used for the subsampled plane (because it would be bigger than
- // a 64x64 luma block) so we round down to TX_32X32.
- if (plane && txsize_sqr_up_map[max_txsize] == TX_64X64) {
- if (max_txsize == TX_16X64)
- max_txsize = TX_16X32;
- else if (max_txsize == TX_64X16)
- max_txsize = TX_32X16;
- else
- max_txsize = TX_32X32;
- }
-
- const BLOCK_SIZE txb_size = txsize_to_bsize[max_txsize];
- const int bw = block_size_wide[txb_size] >> tx_size_wide_log2[0];
- const int bh = block_size_high[txb_size] >> tx_size_wide_log2[0];
- const BLOCK_SIZE max_unit_bsize = ss_size_lookup[BLOCK_64X64][ssx][ssy];
- int mu_blocks_wide = block_size_wide[max_unit_bsize] >> tx_size_wide_log2[0];
- int mu_blocks_high = block_size_high[max_unit_bsize] >> tx_size_high_log2[0];
-
- mu_blocks_wide = AOMMIN(block_width, mu_blocks_wide);
- mu_blocks_high = AOMMIN(block_height, mu_blocks_high);
-
- // Y: Largest tx_size is 64x64, while superblock size can be 128x128.
-  // Here we ensure that setup_tx_block_mask processes at most a 64x64 block.
- // U/V: largest tx size is 32x32.
- for (int idy = 0; idy < block_height; idy += mu_blocks_high) {
- for (int idx = 0; idx < block_width; idx += mu_blocks_wide) {
- const int unit_height = AOMMIN(mu_blocks_high + idy, block_height);
- const int unit_width = AOMMIN(mu_blocks_wide + idx, block_width);
- for (int blk_row = idy; blk_row < unit_height; blk_row += bh) {
- for (int blk_col = idx; blk_col < unit_width; blk_col += bw) {
- setup_tx_block_mask(cm, mi_row, mi_col, blk_row, blk_col, plane_bsize,
- max_txsize, plane, ssx, ssy);
- }
- }
- }
- }
-}
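
A small sketch of the chroma tx-size capping performed in setup_fix_block_mask() above: on a subsampled plane, any transform whose square upscale would be 64x64 is reduced so that neither dimension exceeds 32 (16X64 -> 16X32, 64X16 -> 32X16, otherwise 32X32). The TxDim type and cap_chroma_tx() below are illustrative, not libaom names.

#include <stdio.h>

typedef struct {
  int w, h; /* transform width/height in pixels */
} TxDim;

/* A subsampled plane must never use a transform wider or taller than 32
 * pixels, so any 64-sized dimension is halved. */
static TxDim cap_chroma_tx(TxDim t) {
  if (t.w > 32) t.w = 32;
  if (t.h > 32) t.h = 32;
  return t;
}

int main(void) {
  const TxDim in[] = { { 16, 64 }, { 64, 16 }, { 64, 64 }, { 32, 16 } };
  for (int i = 0; i < 4; ++i) {
    const TxDim out = cap_chroma_tx(in[i]);
    printf("%dx%d -> %dx%d\n", in[i].w, in[i].h, out.w, out.h);
  }
  return 0;
}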
-
-static void setup_block_mask(AV1_COMMON *const cm, int mi_row, int mi_col,
- BLOCK_SIZE bsize, int plane, int ssx, int ssy) {
- if ((mi_row << MI_SIZE_LOG2) >= cm->height ||
- (mi_col << MI_SIZE_LOG2) >= cm->width)
- return;
-
- const PARTITION_TYPE partition = get_partition(cm, mi_row, mi_col, bsize);
- const BLOCK_SIZE subsize = get_partition_subsize(bsize, partition);
- const int hbs = mi_size_wide[bsize] / 2;
- const int quarter_step = mi_size_wide[bsize] / 4;
- const int allow_sub8x8 = (ssx || ssy) ? bsize > BLOCK_8X8 : 1;
- const int has_next_row =
- (((mi_row + hbs) << MI_SIZE_LOG2) < cm->height) & allow_sub8x8;
- const int has_next_col =
- (((mi_col + hbs) << MI_SIZE_LOG2) < cm->width) & allow_sub8x8;
- int i;
-
- switch (partition) {
- case PARTITION_NONE:
- setup_fix_block_mask(cm, mi_row, mi_col, plane, ssx, ssy);
- break;
- case PARTITION_HORZ:
- setup_fix_block_mask(cm, mi_row, mi_col, plane, ssx, ssy);
- if (has_next_row)
- setup_fix_block_mask(cm, mi_row + hbs, mi_col, plane, ssx, ssy);
- break;
- case PARTITION_VERT:
- setup_fix_block_mask(cm, mi_row, mi_col, plane, ssx, ssy);
- if (has_next_col)
- setup_fix_block_mask(cm, mi_row, mi_col + hbs, plane, ssx, ssy);
- break;
- case PARTITION_SPLIT:
- setup_block_mask(cm, mi_row, mi_col, subsize, plane, ssx, ssy);
- if (has_next_col)
- setup_block_mask(cm, mi_row, mi_col + hbs, subsize, plane, ssx, ssy);
- if (has_next_row)
- setup_block_mask(cm, mi_row + hbs, mi_col, subsize, plane, ssx, ssy);
- if (has_next_col & has_next_row)
- setup_block_mask(cm, mi_row + hbs, mi_col + hbs, subsize, plane, ssx,
- ssy);
- break;
- case PARTITION_HORZ_A:
- setup_fix_block_mask(cm, mi_row, mi_col, plane, ssx, ssy);
- if (has_next_col)
- setup_fix_block_mask(cm, mi_row, mi_col + hbs, plane, ssx, ssy);
- if (has_next_row)
- setup_fix_block_mask(cm, mi_row + hbs, mi_col, plane, ssx, ssy);
- break;
- case PARTITION_HORZ_B:
- setup_fix_block_mask(cm, mi_row, mi_col, plane, ssx, ssy);
- if (has_next_row)
- setup_fix_block_mask(cm, mi_row + hbs, mi_col, plane, ssx, ssy);
- if (has_next_col & has_next_row)
- setup_fix_block_mask(cm, mi_row + hbs, mi_col + hbs, plane, ssx, ssy);
- break;
- case PARTITION_VERT_A:
- setup_fix_block_mask(cm, mi_row, mi_col, plane, ssx, ssy);
- if (has_next_row)
- setup_fix_block_mask(cm, mi_row + hbs, mi_col, plane, ssx, ssy);
- if (has_next_col)
- setup_fix_block_mask(cm, mi_row, mi_col + hbs, plane, ssx, ssy);
- break;
- case PARTITION_VERT_B:
- setup_fix_block_mask(cm, mi_row, mi_col, plane, ssx, ssy);
- if (has_next_col)
- setup_fix_block_mask(cm, mi_row, mi_col + hbs, plane, ssx, ssy);
- if (has_next_row)
- setup_fix_block_mask(cm, mi_row + hbs, mi_col + hbs, plane, ssx, ssy);
- break;
- case PARTITION_HORZ_4:
- for (i = 0; i < 4; ++i) {
- int this_mi_row = mi_row + i * quarter_step;
- if (i > 0 && (this_mi_row << MI_SIZE_LOG2) >= cm->height) break;
- // chroma plane filter the odd location
- if (plane && bsize == BLOCK_16X16 && (i & 0x01)) continue;
-
- setup_fix_block_mask(cm, this_mi_row, mi_col, plane, ssx, ssy);
- }
- break;
- case PARTITION_VERT_4:
- for (i = 0; i < 4; ++i) {
- int this_mi_col = mi_col + i * quarter_step;
- if (i > 0 && this_mi_col >= cm->mi_cols) break;
- // chroma plane filter the odd location
- if (plane && bsize == BLOCK_16X16 && (i & 0x01)) continue;
-
- setup_fix_block_mask(cm, mi_row, this_mi_col, plane, ssx, ssy);
- }
- break;
- default: assert(0);
- }
-}
-
-// TODO(chengchen): if lossless, there is no need to set up the mask. But when
-// segments are enabled, each segment can have different lossless settings.
-void av1_setup_bitmask(AV1_COMMON *const cm, int mi_row, int mi_col, int plane,
- int subsampling_x, int subsampling_y, int row_end,
- int col_end) {
- const int num_64x64 = cm->seq_params.mib_size >> MIN_MIB_SIZE_LOG2;
- for (int y = 0; y < num_64x64; ++y) {
- for (int x = 0; x < num_64x64; ++x) {
- const int row = mi_row + y * MI_SIZE_64X64;
- const int col = mi_col + x * MI_SIZE_64X64;
- if (row >= row_end || col >= col_end) continue;
- if ((row << MI_SIZE_LOG2) >= cm->height ||
- (col << MI_SIZE_LOG2) >= cm->width)
- continue;
-
- LoopFilterMask *lfm = get_loop_filter_mask(cm, row, col);
- if (lfm == NULL) return;
-
- // init mask to zero
- if (plane == 0) {
- av1_zero(lfm->left_y);
- av1_zero(lfm->above_y);
- av1_zero(lfm->lfl_y_ver);
- av1_zero(lfm->lfl_y_hor);
- } else if (plane == 1) {
- av1_zero(lfm->left_u);
- av1_zero(lfm->above_u);
- av1_zero(lfm->lfl_u);
- } else {
- av1_zero(lfm->left_v);
- av1_zero(lfm->above_v);
- av1_zero(lfm->lfl_v);
- }
- }
- }
-
- // set up bitmask for each superblock
- setup_block_mask(cm, mi_row, mi_col, cm->seq_params.sb_size, plane,
- subsampling_x, subsampling_y);
-
- for (int y = 0; y < num_64x64; ++y) {
- for (int x = 0; x < num_64x64; ++x) {
- const int row = mi_row + y * MI_SIZE_64X64;
- const int col = mi_col + x * MI_SIZE_64X64;
- if (row >= row_end || col >= col_end) continue;
- if ((row << MI_SIZE_LOG2) >= cm->height ||
- (col << MI_SIZE_LOG2) >= cm->width)
- continue;
-
- LoopFilterMask *lfm = get_loop_filter_mask(cm, row, col);
- if (lfm == NULL) return;
-
- // check if the mask is valid
- check_loop_filter_masks(lfm, plane);
-
- {
-        // Let 16x16 hold 32x32 (Y/U/V) and 64x64 (Y only).
-        // Even if the tx size is greater, we only apply the maximum-length
-        // filter, which is 16 pixels.
- if (plane == 0) {
- for (int j = 0; j < 4; ++j) {
- lfm->left_y[TX_16X16].bits[j] |= lfm->left_y[TX_32X32].bits[j];
- lfm->left_y[TX_16X16].bits[j] |= lfm->left_y[TX_64X64].bits[j];
- lfm->above_y[TX_16X16].bits[j] |= lfm->above_y[TX_32X32].bits[j];
- lfm->above_y[TX_16X16].bits[j] |= lfm->above_y[TX_64X64].bits[j];
-
- // set 32x32 and 64x64 to 0
- lfm->left_y[TX_32X32].bits[j] = 0;
- lfm->left_y[TX_64X64].bits[j] = 0;
- lfm->above_y[TX_32X32].bits[j] = 0;
- lfm->above_y[TX_64X64].bits[j] = 0;
- }
- } else if (plane == 1) {
- for (int j = 0; j < 4; ++j) {
- lfm->left_u[TX_16X16].bits[j] |= lfm->left_u[TX_32X32].bits[j];
- lfm->above_u[TX_16X16].bits[j] |= lfm->above_u[TX_32X32].bits[j];
-
- // set 32x32 to 0
- lfm->left_u[TX_32X32].bits[j] = 0;
- lfm->above_u[TX_32X32].bits[j] = 0;
- }
- } else {
- for (int j = 0; j < 4; ++j) {
- lfm->left_v[TX_16X16].bits[j] |= lfm->left_v[TX_32X32].bits[j];
- lfm->above_v[TX_16X16].bits[j] |= lfm->above_v[TX_32X32].bits[j];
-
- // set 32x32 to 0
- lfm->left_v[TX_32X32].bits[j] = 0;
- lfm->above_v[TX_32X32].bits[j] = 0;
- }
- }
- }
-
- // check if the mask is valid
- check_loop_filter_masks(lfm, plane);
- }
- }
-}
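
The folding step at the end of av1_setup_bitmask() above can be summarised with a short sketch: because the longest deblocking filter spans 16 pixels, edges recorded under TX_32X32 and TX_64X64 are OR'd into the TX_16X16 mask and then cleared. Mask256 and fold_large_tx() are illustrative stand-ins for the FilterMask plumbing, not libaom names.

#include <stdint.h>
#include <stdio.h>

enum { TX_4X4, TX_8X8, TX_16X16, TX_32X32, TX_64X64, TX_SIZES };
typedef struct {
  uint64_t bits[4]; /* 256 bits: one per 4x4 unit of a 64x64 area */
} Mask256;

/* Fold the large-transform masks into the 16x16 mask and clear them. */
static void fold_large_tx(Mask256 mask[TX_SIZES]) {
  for (int j = 0; j < 4; ++j) {
    mask[TX_16X16].bits[j] |= mask[TX_32X32].bits[j] | mask[TX_64X64].bits[j];
    mask[TX_32X32].bits[j] = 0;
    mask[TX_64X64].bits[j] = 0;
  }
}

int main(void) {
  Mask256 left[TX_SIZES] = { { { 0 } } };
  left[TX_32X32].bits[0] = 0x00000000000000ffULL;
  left[TX_64X64].bits[1] = 0x000000000000ffffULL;
  fold_large_tx(left);
  printf("16x16 word0 = %016llx\n",
         (unsigned long long)left[TX_16X16].bits[0]);
  printf("16x16 word1 = %016llx\n",
         (unsigned long long)left[TX_16X16].bits[1]);
  return 0;
}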
-
-static void filter_selectively_vert_row2(
- int subsampling_factor, uint8_t *s, int pitch, int plane,
- uint64_t mask_16x16_0, uint64_t mask_8x8_0, uint64_t mask_4x4_0,
- uint64_t mask_16x16_1, uint64_t mask_8x8_1, uint64_t mask_4x4_1,
- const loop_filter_info_n *lfi_n, uint8_t *lfl, uint8_t *lfl2) {
- uint64_t mask;
- const int step = 1 << subsampling_factor;
-
- for (mask = mask_16x16_0 | mask_8x8_0 | mask_4x4_0 | mask_16x16_1 |
- mask_8x8_1 | mask_4x4_1;
- mask; mask >>= step) {
- const loop_filter_thresh *lfi0 = lfi_n->lfthr + *lfl;
- const loop_filter_thresh *lfi1 = lfi_n->lfthr + *lfl2;
-
- if (mask & 1) {
- if ((mask_16x16_0 | mask_16x16_1) & 1) {
-        // the chroma plane filters fewer pixels, as introduced in the
-        // deblock_13tap experiment
- LpfFunc lpf_vertical = plane ? aom_lpf_vertical_6 : aom_lpf_vertical_14;
-
- if ((mask_16x16_0 & mask_16x16_1) & 1) {
- if (plane) {
- aom_lpf_vertical_6_dual(s, pitch, lfi0->mblim, lfi0->lim,
- lfi0->hev_thr, lfi1->mblim, lfi1->lim,
- lfi1->hev_thr);
- } else {
- aom_lpf_vertical_14_dual(s, pitch, lfi0->mblim, lfi0->lim,
- lfi0->hev_thr, lfi1->mblim, lfi1->lim,
- lfi1->hev_thr);
- }
- } else if (mask_16x16_0 & 1) {
- lpf_vertical(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr);
- } else {
- lpf_vertical(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim,
- lfi1->hev_thr);
- }
- }
-
- if ((mask_8x8_0 | mask_8x8_1) & 1) {
-        // the chroma plane filters fewer pixels, as introduced in the
-        // deblock_13tap experiment
- LpfFunc lpf_vertical = plane ? aom_lpf_vertical_6 : aom_lpf_vertical_8;
-
- if ((mask_8x8_0 & mask_8x8_1) & 1) {
- if (plane) {
- aom_lpf_vertical_6_dual(s, pitch, lfi0->mblim, lfi0->lim,
- lfi0->hev_thr, lfi1->mblim, lfi1->lim,
- lfi1->hev_thr);
- } else {
- aom_lpf_vertical_8_dual(s, pitch, lfi0->mblim, lfi0->lim,
- lfi0->hev_thr, lfi1->mblim, lfi1->lim,
- lfi1->hev_thr);
- }
- } else if (mask_8x8_0 & 1) {
- lpf_vertical(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr);
- } else {
- lpf_vertical(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim,
- lfi1->hev_thr);
- }
- }
-
- if ((mask_4x4_0 | mask_4x4_1) & 1) {
- if ((mask_4x4_0 & mask_4x4_1) & 1) {
- aom_lpf_vertical_4_dual(s, pitch, lfi0->mblim, lfi0->lim,
- lfi0->hev_thr, lfi1->mblim, lfi1->lim,
- lfi1->hev_thr);
- } else if (mask_4x4_0 & 1) {
- aom_lpf_vertical_4(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr);
- } else {
- aom_lpf_vertical_4(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim,
- lfi1->hev_thr);
- }
- }
- }
-
- s += 4;
- lfl += step;
- lfl2 += step;
- mask_16x16_0 >>= step;
- mask_8x8_0 >>= step;
- mask_4x4_0 >>= step;
- mask_16x16_1 >>= step;
- mask_8x8_1 >>= step;
- mask_4x4_1 >>= step;
- }
-}
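
The filter_selectively_* mask-walking helpers share one bit-walk pattern; a reduced sketch follows, assuming each mask bit covers one 4x4 unit (4 pixels along the edge) and that a subsampled plane advances two bits per unit. apply_edge() stands in for the aom_lpf_* calls and is purely illustrative.

#include <stdint.h>
#include <stdio.h>

static void apply_edge(int px_offset, int mask_size) {
  printf("edge at pixel %d from the %dx%d mask\n", px_offset, mask_size,
         mask_size);
}

/* OR the per-size masks together, then consume them one 4x4 unit at a time,
 * picking the widest filter whose bit is set for that unit. */
static void walk_masks(uint64_t mask_16x16, uint64_t mask_8x8,
                       uint64_t mask_4x4, int subsampling) {
  const int step = 1 << subsampling;
  int px = 0;
  for (uint64_t mask = mask_16x16 | mask_8x8 | mask_4x4; mask; mask >>= step) {
    if (mask & 1) {
      if (mask_16x16 & 1)
        apply_edge(px, 16);
      else if (mask_8x8 & 1)
        apply_edge(px, 8);
      else
        apply_edge(px, 4);
    }
    px += 4; /* each bit covers one 4x4 unit, i.e. 4 pixels */
    mask_16x16 >>= step;
    mask_8x8 >>= step;
    mask_4x4 >>= step;
  }
}

int main(void) {
  /* Units 0 and 2 carry 8x8 edges, unit 1 a 4x4 edge, no subsampling. */
  walk_masks(0, 0x5, 0x2, 0);
  return 0;
}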
-
-static void highbd_filter_selectively_vert_row2(
- int subsampling_factor, uint16_t *s, int pitch, int plane,
- uint64_t mask_16x16_0, uint64_t mask_8x8_0, uint64_t mask_4x4_0,
- uint64_t mask_16x16_1, uint64_t mask_8x8_1, uint64_t mask_4x4_1,
- const loop_filter_info_n *lfi_n, uint8_t *lfl, uint8_t *lfl2, int bd) {
- uint64_t mask;
- const int step = 1 << subsampling_factor;
-
- for (mask = mask_16x16_0 | mask_8x8_0 | mask_4x4_0 | mask_16x16_1 |
- mask_8x8_1 | mask_4x4_1;
- mask; mask >>= step) {
- const loop_filter_thresh *lfi0 = lfi_n->lfthr + *lfl;
- const loop_filter_thresh *lfi1 = lfi_n->lfthr + *lfl2;
-
- if (mask & 1) {
- if ((mask_16x16_0 | mask_16x16_1) & 1) {
-        // the chroma plane filters fewer pixels, as introduced in the
-        // deblock_13tap experiment
- HbdLpfFunc highbd_lpf_vertical =
- plane ? aom_highbd_lpf_vertical_6 : aom_highbd_lpf_vertical_14;
-
- if ((mask_16x16_0 & mask_16x16_1) & 1) {
- if (plane) {
- aom_highbd_lpf_vertical_6_dual(s, pitch, lfi0->mblim, lfi0->lim,
- lfi0->hev_thr, lfi1->mblim,
- lfi1->lim, lfi1->hev_thr, bd);
- } else {
- aom_highbd_lpf_vertical_14_dual(s, pitch, lfi0->mblim, lfi0->lim,
- lfi0->hev_thr, lfi1->mblim,
- lfi1->lim, lfi1->hev_thr, bd);
- }
- } else if (mask_16x16_0 & 1) {
- highbd_lpf_vertical(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr,
- bd);
- } else {
- highbd_lpf_vertical(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim,
- lfi1->hev_thr, bd);
- }
- }
-
- if ((mask_8x8_0 | mask_8x8_1) & 1) {
- HbdLpfFunc highbd_lpf_vertical =
- plane ? aom_highbd_lpf_vertical_6 : aom_highbd_lpf_vertical_8;
-
- if ((mask_8x8_0 & mask_8x8_1) & 1) {
- if (plane) {
- aom_highbd_lpf_vertical_6_dual(s, pitch, lfi0->mblim, lfi0->lim,
- lfi0->hev_thr, lfi1->mblim,
- lfi1->lim, lfi1->hev_thr, bd);
- } else {
- aom_highbd_lpf_vertical_8_dual(s, pitch, lfi0->mblim, lfi0->lim,
- lfi0->hev_thr, lfi1->mblim,
- lfi1->lim, lfi1->hev_thr, bd);
- }
- } else if (mask_8x8_0 & 1) {
- highbd_lpf_vertical(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr,
- bd);
- } else {
- highbd_lpf_vertical(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim,
- lfi1->hev_thr, bd);
- }
- }
-
- if ((mask_4x4_0 | mask_4x4_1) & 1) {
- if ((mask_4x4_0 & mask_4x4_1) & 1) {
- aom_highbd_lpf_vertical_4_dual(s, pitch, lfi0->mblim, lfi0->lim,
- lfi0->hev_thr, lfi1->mblim, lfi1->lim,
- lfi1->hev_thr, bd);
- } else if (mask_4x4_0 & 1) {
- aom_highbd_lpf_vertical_4(s, pitch, lfi0->mblim, lfi0->lim,
- lfi0->hev_thr, bd);
- } else {
- aom_highbd_lpf_vertical_4(s + 4 * pitch, pitch, lfi1->mblim,
- lfi1->lim, lfi1->hev_thr, bd);
- }
- }
- }
-
- s += 4;
- lfl += step;
- lfl2 += step;
- mask_16x16_0 >>= step;
- mask_8x8_0 >>= step;
- mask_4x4_0 >>= step;
- mask_16x16_1 >>= step;
- mask_8x8_1 >>= step;
- mask_4x4_1 >>= step;
- }
-}
-
-static void filter_selectively_horiz(uint8_t *s, int pitch, int plane,
- int subsampling, uint64_t mask_16x16,
- uint64_t mask_8x8, uint64_t mask_4x4,
- const loop_filter_info_n *lfi_n,
- const uint8_t *lfl) {
- uint64_t mask;
- int count;
- const int step = 1 << subsampling;
- const unsigned int two_block_mask = subsampling ? 5 : 3;
-
- for (mask = mask_16x16 | mask_8x8 | mask_4x4; mask; mask >>= step * count) {
- const loop_filter_thresh *lfi = lfi_n->lfthr + *lfl;
- // Next block's thresholds.
- const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + step);
-
- count = 1;
- if (mask & 1) {
- if (mask_16x16 & 1) {
-        // the chroma plane filters fewer pixels, as introduced in the
-        // deblock_13tap experiment
- LpfFunc lpf_horizontal =
- plane ? aom_lpf_horizontal_6 : aom_lpf_horizontal_14;
-
- if ((mask_16x16 & two_block_mask) == two_block_mask) {
- if (plane) {
- aom_lpf_horizontal_6_dual(s, pitch, lfi->mblim, lfi->lim,
- lfi->hev_thr, lfin->mblim, lfin->lim,
- lfin->hev_thr);
- } else {
- aom_lpf_horizontal_14_dual(s, pitch, lfi->mblim, lfi->lim,
- lfi->hev_thr, lfin->mblim, lfin->lim,
- lfin->hev_thr);
- }
- count = 2;
- } else {
- lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);
- }
- } else if (mask_8x8 & 1) {
-        // the chroma plane filters fewer pixels, as introduced in the
-        // deblock_13tap experiment
- LpfFunc lpf_horizontal =
- plane ? aom_lpf_horizontal_6 : aom_lpf_horizontal_8;
-
- if ((mask_8x8 & two_block_mask) == two_block_mask) {
- if (plane) {
- aom_lpf_horizontal_6_dual(s, pitch, lfi->mblim, lfi->lim,
- lfi->hev_thr, lfin->mblim, lfin->lim,
- lfin->hev_thr);
- } else {
- aom_lpf_horizontal_8_dual(s, pitch, lfi->mblim, lfi->lim,
- lfi->hev_thr, lfin->mblim, lfin->lim,
- lfin->hev_thr);
- }
- count = 2;
- } else {
- lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);
- }
- } else if (mask_4x4 & 1) {
- if ((mask_4x4 & two_block_mask) == two_block_mask) {
- aom_lpf_horizontal_4_dual(s, pitch, lfi->mblim, lfi->lim,
- lfi->hev_thr, lfin->mblim, lfin->lim,
- lfin->hev_thr);
- count = 2;
- } else {
- aom_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);
- }
- }
- }
-
- s += 4 * count;
- lfl += step * count;
- mask_16x16 >>= step * count;
- mask_8x8 >>= step * count;
- mask_4x4 >>= step * count;
- }
-}
-
-static void highbd_filter_selectively_horiz(
- uint16_t *s, int pitch, int plane, int subsampling, uint64_t mask_16x16,
- uint64_t mask_8x8, uint64_t mask_4x4, const loop_filter_info_n *lfi_n,
- uint8_t *lfl, int bd) {
- uint64_t mask;
- int count;
- const int step = 1 << subsampling;
- const unsigned int two_block_mask = subsampling ? 5 : 3;
-
- for (mask = mask_16x16 | mask_8x8 | mask_4x4; mask; mask >>= step * count) {
- const loop_filter_thresh *lfi = lfi_n->lfthr + *lfl;
- // Next block's thresholds.
- const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + step);
-
- count = 1;
- if (mask & 1) {
- if (mask_16x16 & 1) {
- HbdLpfFunc highbd_lpf_horizontal =
- plane ? aom_highbd_lpf_horizontal_6 : aom_highbd_lpf_horizontal_14;
-
- if ((mask_16x16 & two_block_mask) == two_block_mask) {
- if (plane) {
- aom_highbd_lpf_horizontal_6_dual(s, pitch, lfi->mblim, lfi->lim,
- lfi->hev_thr, lfin->mblim,
- lfin->lim, lfin->hev_thr, bd);
- } else {
- aom_highbd_lpf_horizontal_14_dual(s, pitch, lfi->mblim, lfi->lim,
- lfi->hev_thr, lfin->mblim,
- lfin->lim, lfin->hev_thr, bd);
- }
- count = 2;
- } else {
- highbd_lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr,
- bd);
- }
- } else if (mask_8x8 & 1) {
- HbdLpfFunc highbd_lpf_horizontal =
- plane ? aom_highbd_lpf_horizontal_6 : aom_highbd_lpf_horizontal_8;
-
- if ((mask_8x8 & two_block_mask) == two_block_mask) {
- if (plane) {
- aom_highbd_lpf_horizontal_6_dual(s, pitch, lfi->mblim, lfi->lim,
- lfi->hev_thr, lfin->mblim,
- lfin->lim, lfin->hev_thr, bd);
- } else {
- aom_highbd_lpf_horizontal_8_dual(s, pitch, lfi->mblim, lfi->lim,
- lfi->hev_thr, lfin->mblim,
- lfin->lim, lfin->hev_thr, bd);
- }
- count = 2;
- } else {
- highbd_lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr,
- bd);
- }
- } else if (mask_4x4 & 1) {
- if ((mask_4x4 & two_block_mask) == two_block_mask) {
- aom_highbd_lpf_horizontal_4_dual(s, pitch, lfi->mblim, lfi->lim,
- lfi->hev_thr, lfin->mblim, lfin->lim,
- lfin->hev_thr, bd);
- count = 2;
- } else {
- aom_highbd_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim,
- lfi->hev_thr, bd);
- }
- }
- }
-
- s += 4 * count;
- lfl += step * count;
- mask_16x16 >>= step * count;
- mask_8x8 >>= step * count;
- mask_4x4 >>= step * count;
- }
-}
-
-void av1_build_bitmask_vert_info(
- AV1_COMMON *const cm, const struct macroblockd_plane *const plane_ptr,
- int plane) {
- const int subsampling_x = plane_ptr->subsampling_x;
- const int subsampling_y = plane_ptr->subsampling_y;
- const int row_step = (MI_SIZE >> MI_SIZE_LOG2);
- const int is_uv = plane > 0;
- TX_SIZE tx_size = TX_16X16, prev_tx_size = TX_16X16;
- uint8_t level, prev_level = 1;
- int skip, prev_skip = 0;
- int is_coding_block_border;
-
- for (int r = 0; (r << MI_SIZE_LOG2) < plane_ptr->dst.height; r += row_step) {
- const int mi_row = r << subsampling_y;
- const int row = mi_row % MI_SIZE_64X64;
- int index = 0;
- const int shift = get_index_shift(0, row, &index);
-
- for (int c = 0; (c << MI_SIZE_LOG2) < plane_ptr->dst.width;
- c += (tx_size_wide_unit[TX_64X64] >> subsampling_x)) {
- const int mi_col = c << subsampling_x;
- LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col);
-
- for (int col_in_unit = 0;
- col_in_unit < (tx_size_wide_unit[TX_64X64] >> subsampling_x);) {
- const int x = (c + col_in_unit) << MI_SIZE_LOG2;
- if (x >= plane_ptr->dst.width) break;
- const int col = col_in_unit << subsampling_x;
- const uint64_t mask = ((uint64_t)1 << (shift | col));
- skip = lfm->skip.bits[index] & mask;
- is_coding_block_border = lfm->is_vert_border.bits[index] & mask;
- switch (plane) {
- case 0: level = lfm->lfl_y_ver[row][col]; break;
- case 1: level = lfm->lfl_u[row][col]; break;
- case 2: level = lfm->lfl_v[row][col]; break;
- default: assert(plane >= 0 && plane <= 2); return;
- }
- for (TX_SIZE ts = TX_4X4; ts <= TX_64X64; ++ts) {
- if (is_uv && ts == TX_64X64) continue;
- if (lfm->tx_size_ver[is_uv][ts].bits[index] & mask) {
- tx_size = ts;
- break;
- }
- }
- if ((c + col_in_unit > 0) && (level || prev_level) &&
- (!prev_skip || !skip || is_coding_block_border)) {
- const TX_SIZE min_tx_size =
- AOMMIN(TX_16X16, AOMMIN(tx_size, prev_tx_size));
- const int tmp_row = (mi_row | subsampling_y) % MI_SIZE_64X64;
- const int tmp_col = (col | subsampling_x) % MI_SIZE_64X64;
- const int shift_1 = get_index_shift(tmp_col, tmp_row, &index);
- const uint64_t mask_1 = ((uint64_t)1 << shift_1);
- switch (plane) {
- case 0: lfm->left_y[min_tx_size].bits[index] |= mask_1; break;
- case 1: lfm->left_u[min_tx_size].bits[index] |= mask_1; break;
- case 2: lfm->left_v[min_tx_size].bits[index] |= mask_1; break;
- default: assert(plane >= 0 && plane <= 2); return;
- }
- }
-
- // update prev info
- prev_level = level;
- prev_skip = skip;
- prev_tx_size = tx_size;
- // advance
- col_in_unit += tx_size_wide_unit[tx_size];
- }
- }
- }
-}
-
-void av1_build_bitmask_horz_info(
- AV1_COMMON *const cm, const struct macroblockd_plane *const plane_ptr,
- int plane) {
- const int subsampling_x = plane_ptr->subsampling_x;
- const int subsampling_y = plane_ptr->subsampling_y;
- const int col_step = (MI_SIZE >> MI_SIZE_LOG2);
- const int is_uv = plane > 0;
- TX_SIZE tx_size = TX_16X16, prev_tx_size = TX_16X16;
- uint8_t level, prev_level = 1;
- int skip, prev_skip = 0;
- int is_coding_block_border;
-
- for (int c = 0; (c << MI_SIZE_LOG2) < plane_ptr->dst.width; c += col_step) {
- const int mi_col = c << subsampling_x;
- const int col = mi_col % MI_SIZE_64X64;
-
- for (int r = 0; (r << MI_SIZE_LOG2) < plane_ptr->dst.height;
- r += (tx_size_high_unit[TX_64X64] >> subsampling_y)) {
- const int mi_row = r << subsampling_y;
- LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col);
-
- for (int r_in_unit = 0;
- r_in_unit < (tx_size_high_unit[TX_64X64] >> subsampling_y);) {
- const int y = (r + r_in_unit) << MI_SIZE_LOG2;
- if (y >= plane_ptr->dst.height) break;
- const int row = r_in_unit << subsampling_y;
- int index = 0;
- const int shift = get_index_shift(col, row, &index);
- const uint64_t mask = ((uint64_t)1 << shift);
- skip = lfm->skip.bits[index] & mask;
- is_coding_block_border = lfm->is_horz_border.bits[index] & mask;
- switch (plane) {
- case 0: level = lfm->lfl_y_hor[row][col]; break;
- case 1: level = lfm->lfl_u[row][col]; break;
- case 2: level = lfm->lfl_v[row][col]; break;
- default: assert(plane >= 0 && plane <= 2); return;
- }
- for (TX_SIZE ts = TX_4X4; ts <= TX_64X64; ++ts) {
- if (is_uv && ts == TX_64X64) continue;
- if (lfm->tx_size_hor[is_uv][ts].bits[index] & mask) {
- tx_size = ts;
- break;
- }
- }
- if ((r + r_in_unit > 0) && (level || prev_level) &&
- (!prev_skip || !skip || is_coding_block_border)) {
- const TX_SIZE min_tx_size =
- AOMMIN(TX_16X16, AOMMIN(tx_size, prev_tx_size));
- const int tmp_row = (row | subsampling_y) % MI_SIZE_64X64;
- const int tmp_col = (mi_col | subsampling_x) % MI_SIZE_64X64;
- const int shift_1 = get_index_shift(tmp_col, tmp_row, &index);
- const uint64_t mask_1 = ((uint64_t)1 << shift_1);
-
- switch (plane) {
- case 0: lfm->above_y[min_tx_size].bits[index] |= mask_1; break;
- case 1: lfm->above_u[min_tx_size].bits[index] |= mask_1; break;
- case 2: lfm->above_v[min_tx_size].bits[index] |= mask_1; break;
- default: assert(plane >= 0 && plane <= 2); return;
- }
- }
-
- // update prev info
- prev_level = level;
- prev_skip = skip;
- prev_tx_size = tx_size;
- // advance
- r_in_unit += tx_size_high_unit[tx_size];
- }
- }
- }
-}
-
-void av1_filter_block_plane_bitmask_vert(
- AV1_COMMON *const cm, struct macroblockd_plane *const plane_ptr, int pl,
- int mi_row, int mi_col) {
- struct buf_2d *const dst = &plane_ptr->dst;
- uint8_t *const buf0 = dst->buf;
- const int ssx = plane_ptr->subsampling_x;
- const int ssy = plane_ptr->subsampling_y;
- const int mask_cutoff = 0xffff;
- const int row_step = 1 << ssy;
- const int two_row_step = 2 << ssy;
- const int row_stride = dst->stride << MI_SIZE_LOG2;
- const int two_row_stride = row_stride << 1;
- uint64_t mask_16x16 = 0;
- uint64_t mask_8x8 = 0;
- uint64_t mask_4x4 = 0;
- uint8_t *lfl;
- uint8_t *lfl2;
- LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col);
- assert(lfm);
-
- // 1. vertical filtering. filter two rows at a time
- for (int r = 0;
- ((mi_row + r) << MI_SIZE_LOG2) < cm->height && r < MI_SIZE_64X64;
- r += two_row_step) {
- const int row = r | ssy;
- const int row_next = row + row_step;
- const int col = ssx;
- int index = 0;
- const int shift = get_index_shift(col, row, &index);
- int index_next = 0;
- const int shift_next = get_index_shift(col, row_next, &index_next);
- switch (pl) {
- case 0:
- mask_16x16 = lfm->left_y[TX_16X16].bits[index];
- mask_8x8 = lfm->left_y[TX_8X8].bits[index];
- mask_4x4 = lfm->left_y[TX_4X4].bits[index];
- lfl = &lfm->lfl_y_ver[row][col];
- lfl2 = &lfm->lfl_y_ver[row_next][col];
- break;
- case 1:
- mask_16x16 = lfm->left_u[TX_16X16].bits[index];
- mask_8x8 = lfm->left_u[TX_8X8].bits[index];
- mask_4x4 = lfm->left_u[TX_4X4].bits[index];
- lfl = &lfm->lfl_u[row][col];
- lfl2 = &lfm->lfl_u[row_next][col];
- break;
- case 2:
- mask_16x16 = lfm->left_v[TX_16X16].bits[index];
- mask_8x8 = lfm->left_v[TX_8X8].bits[index];
- mask_4x4 = lfm->left_v[TX_4X4].bits[index];
- lfl = &lfm->lfl_v[row][col];
- lfl2 = &lfm->lfl_v[row_next][col];
- break;
- default: assert(pl >= 0 && pl <= 2); return;
- }
- uint64_t mask_16x16_0 = (mask_16x16 >> shift) & mask_cutoff;
- uint64_t mask_8x8_0 = (mask_8x8 >> shift) & mask_cutoff;
- uint64_t mask_4x4_0 = (mask_4x4 >> shift) & mask_cutoff;
- uint64_t mask_16x16_1 = (mask_16x16 >> shift_next) & mask_cutoff;
- uint64_t mask_8x8_1 = (mask_8x8 >> shift_next) & mask_cutoff;
- uint64_t mask_4x4_1 = (mask_4x4 >> shift_next) & mask_cutoff;
-
- if (cm->seq_params.use_highbitdepth)
- highbd_filter_selectively_vert_row2(
- ssx, CONVERT_TO_SHORTPTR(dst->buf), dst->stride, pl, mask_16x16_0,
- mask_8x8_0, mask_4x4_0, mask_16x16_1, mask_8x8_1, mask_4x4_1,
- &cm->lf_info, lfl, lfl2, (int)cm->seq_params.bit_depth);
- else
- filter_selectively_vert_row2(
- ssx, dst->buf, dst->stride, pl, mask_16x16_0, mask_8x8_0, mask_4x4_0,
- mask_16x16_1, mask_8x8_1, mask_4x4_1, &cm->lf_info, lfl, lfl2);
- dst->buf += two_row_stride;
- }
- // reset buf pointer for horizontal filtering
- dst->buf = buf0;
-}
-
-void av1_filter_block_plane_bitmask_horz(
- AV1_COMMON *const cm, struct macroblockd_plane *const plane_ptr, int pl,
- int mi_row, int mi_col) {
- struct buf_2d *const dst = &plane_ptr->dst;
- uint8_t *const buf0 = dst->buf;
- const int ssx = plane_ptr->subsampling_x;
- const int ssy = plane_ptr->subsampling_y;
- const int mask_cutoff = 0xffff;
- const int row_step = 1 << ssy;
- const int row_stride = dst->stride << MI_SIZE_LOG2;
- uint64_t mask_16x16 = 0;
- uint64_t mask_8x8 = 0;
- uint64_t mask_4x4 = 0;
- uint8_t *lfl;
- LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col);
- assert(lfm);
- for (int r = 0;
- ((mi_row + r) << MI_SIZE_LOG2) < cm->height && r < MI_SIZE_64X64;
- r += row_step) {
- if (mi_row + r == 0) {
- dst->buf += row_stride;
- continue;
- }
- const int row = r | ssy;
- const int col = ssx;
- int index = 0;
- const int shift = get_index_shift(col, row, &index);
- switch (pl) {
- case 0:
- mask_16x16 = lfm->above_y[TX_16X16].bits[index];
- mask_8x8 = lfm->above_y[TX_8X8].bits[index];
- mask_4x4 = lfm->above_y[TX_4X4].bits[index];
- lfl = &lfm->lfl_y_hor[row][col];
- break;
- case 1:
- mask_16x16 = lfm->above_u[TX_16X16].bits[index];
- mask_8x8 = lfm->above_u[TX_8X8].bits[index];
- mask_4x4 = lfm->above_u[TX_4X4].bits[index];
- lfl = &lfm->lfl_u[row][col];
- break;
- case 2:
- mask_16x16 = lfm->above_v[TX_16X16].bits[index];
- mask_8x8 = lfm->above_v[TX_8X8].bits[index];
- mask_4x4 = lfm->above_v[TX_4X4].bits[index];
- lfl = &lfm->lfl_v[row][col];
- break;
- default: assert(pl >= 0 && pl <= 2); return;
- }
- mask_16x16 = (mask_16x16 >> shift) & mask_cutoff;
- mask_8x8 = (mask_8x8 >> shift) & mask_cutoff;
- mask_4x4 = (mask_4x4 >> shift) & mask_cutoff;
-
- if (cm->seq_params.use_highbitdepth)
- highbd_filter_selectively_horiz(
- CONVERT_TO_SHORTPTR(dst->buf), dst->stride, pl, ssx, mask_16x16,
- mask_8x8, mask_4x4, &cm->lf_info, lfl, (int)cm->seq_params.bit_depth);
- else
- filter_selectively_horiz(dst->buf, dst->stride, pl, ssx, mask_16x16,
- mask_8x8, mask_4x4, &cm->lf_info, lfl);
- dst->buf += row_stride;
- }
- // reset buf pointer for next block
- dst->buf = buf0;
-}
-
-void av1_filter_block_plane_ver(AV1_COMMON *const cm,
- struct macroblockd_plane *const plane_ptr,
- int pl, int mi_row, int mi_col) {
- struct buf_2d *const dst = &plane_ptr->dst;
- int r, c;
- const int ssx = plane_ptr->subsampling_x;
- const int ssy = plane_ptr->subsampling_y;
- const int mask_cutoff = 0xffff;
- const int single_step = 1 << ssy;
- const int r_step = 2 << ssy;
- uint64_t mask_16x16 = 0;
- uint64_t mask_8x8 = 0;
- uint64_t mask_4x4 = 0;
- uint8_t *lfl;
- uint8_t *lfl2;
-
- // filter two rows at a time
- for (r = 0; r < cm->seq_params.mib_size &&
- ((mi_row + r) << MI_SIZE_LOG2 < cm->height);
- r += r_step) {
- for (c = 0; c < cm->seq_params.mib_size &&
- ((mi_col + c) << MI_SIZE_LOG2 < cm->width);
- c += MI_SIZE_64X64) {
- dst->buf += ((c << MI_SIZE_LOG2) >> ssx);
- LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row + r, mi_col + c);
- assert(lfm);
- const int row = ((mi_row + r) | ssy) % MI_SIZE_64X64;
- const int col = ((mi_col + c) | ssx) % MI_SIZE_64X64;
- int index = 0;
- const int shift = get_index_shift(col, row, &index);
- // current and next row should belong to the same mask_idx and index
- // next row's shift
- const int row_next = row + single_step;
- int index_next = 0;
- const int shift_next = get_index_shift(col, row_next, &index_next);
- switch (pl) {
- case 0:
- mask_16x16 = lfm->left_y[TX_16X16].bits[index];
- mask_8x8 = lfm->left_y[TX_8X8].bits[index];
- mask_4x4 = lfm->left_y[TX_4X4].bits[index];
- lfl = &lfm->lfl_y_ver[row][col];
- lfl2 = &lfm->lfl_y_ver[row_next][col];
- break;
- case 1:
- mask_16x16 = lfm->left_u[TX_16X16].bits[index];
- mask_8x8 = lfm->left_u[TX_8X8].bits[index];
- mask_4x4 = lfm->left_u[TX_4X4].bits[index];
- lfl = &lfm->lfl_u[row][col];
- lfl2 = &lfm->lfl_u[row_next][col];
- break;
- case 2:
- mask_16x16 = lfm->left_v[TX_16X16].bits[index];
- mask_8x8 = lfm->left_v[TX_8X8].bits[index];
- mask_4x4 = lfm->left_v[TX_4X4].bits[index];
- lfl = &lfm->lfl_v[row][col];
- lfl2 = &lfm->lfl_v[row_next][col];
- break;
- default: assert(pl >= 0 && pl <= 2); return;
- }
- uint64_t mask_16x16_0 = (mask_16x16 >> shift) & mask_cutoff;
- uint64_t mask_8x8_0 = (mask_8x8 >> shift) & mask_cutoff;
- uint64_t mask_4x4_0 = (mask_4x4 >> shift) & mask_cutoff;
- uint64_t mask_16x16_1 = (mask_16x16 >> shift_next) & mask_cutoff;
- uint64_t mask_8x8_1 = (mask_8x8 >> shift_next) & mask_cutoff;
- uint64_t mask_4x4_1 = (mask_4x4 >> shift_next) & mask_cutoff;
-
- if (cm->seq_params.use_highbitdepth)
- highbd_filter_selectively_vert_row2(
- ssx, CONVERT_TO_SHORTPTR(dst->buf), dst->stride, pl, mask_16x16_0,
- mask_8x8_0, mask_4x4_0, mask_16x16_1, mask_8x8_1, mask_4x4_1,
- &cm->lf_info, lfl, lfl2, (int)cm->seq_params.bit_depth);
- else
- filter_selectively_vert_row2(ssx, dst->buf, dst->stride, pl,
- mask_16x16_0, mask_8x8_0, mask_4x4_0,
- mask_16x16_1, mask_8x8_1, mask_4x4_1,
- &cm->lf_info, lfl, lfl2);
- dst->buf -= ((c << MI_SIZE_LOG2) >> ssx);
- }
- dst->buf += 2 * MI_SIZE * dst->stride;
- }
-}
-
-void av1_filter_block_plane_hor(AV1_COMMON *const cm,
- struct macroblockd_plane *const plane_ptr,
- int pl, int mi_row, int mi_col) {
- struct buf_2d *const dst = &plane_ptr->dst;
- int r, c;
- const int ssx = plane_ptr->subsampling_x;
- const int ssy = plane_ptr->subsampling_y;
- const int mask_cutoff = 0xffff;
- const int r_step = 1 << ssy;
- uint64_t mask_16x16 = 0;
- uint64_t mask_8x8 = 0;
- uint64_t mask_4x4 = 0;
- uint8_t *lfl;
-
- for (r = 0; r < cm->seq_params.mib_size &&
- ((mi_row + r) << MI_SIZE_LOG2 < cm->height);
- r += r_step) {
- for (c = 0; c < cm->seq_params.mib_size &&
- ((mi_col + c) << MI_SIZE_LOG2 < cm->width);
- c += MI_SIZE_64X64) {
- if (mi_row + r == 0) continue;
-
- dst->buf += ((c << MI_SIZE_LOG2) >> ssx);
- LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row + r, mi_col + c);
- assert(lfm);
- const int row = ((mi_row + r) | ssy) % MI_SIZE_64X64;
- const int col = ((mi_col + c) | ssx) % MI_SIZE_64X64;
- int index = 0;
- const int shift = get_index_shift(col, row, &index);
- switch (pl) {
- case 0:
- mask_16x16 = lfm->above_y[TX_16X16].bits[index];
- mask_8x8 = lfm->above_y[TX_8X8].bits[index];
- mask_4x4 = lfm->above_y[TX_4X4].bits[index];
- lfl = &lfm->lfl_y_hor[row][col];
- break;
- case 1:
- mask_16x16 = lfm->above_u[TX_16X16].bits[index];
- mask_8x8 = lfm->above_u[TX_8X8].bits[index];
- mask_4x4 = lfm->above_u[TX_4X4].bits[index];
- lfl = &lfm->lfl_u[row][col];
- break;
- case 2:
- mask_16x16 = lfm->above_v[TX_16X16].bits[index];
- mask_8x8 = lfm->above_v[TX_8X8].bits[index];
- mask_4x4 = lfm->above_v[TX_4X4].bits[index];
- lfl = &lfm->lfl_v[row][col];
- break;
- default: assert(pl >= 0 && pl <= 2); return;
- }
- mask_16x16 = (mask_16x16 >> shift) & mask_cutoff;
- mask_8x8 = (mask_8x8 >> shift) & mask_cutoff;
- mask_4x4 = (mask_4x4 >> shift) & mask_cutoff;
-
- if (cm->seq_params.use_highbitdepth)
- highbd_filter_selectively_horiz(CONVERT_TO_SHORTPTR(dst->buf),
- dst->stride, pl, ssx, mask_16x16,
- mask_8x8, mask_4x4, &cm->lf_info, lfl,
- (int)cm->seq_params.bit_depth);
- else
- filter_selectively_horiz(dst->buf, dst->stride, pl, ssx, mask_16x16,
- mask_8x8, mask_4x4, &cm->lf_info, lfl);
- dst->buf -= ((c << MI_SIZE_LOG2) >> ssx);
- }
- dst->buf += MI_SIZE * dst->stride;
- }
-}
-#endif // LOOP_FILTER_BITMASK
-
static TX_SIZE get_transform_size(const MACROBLOCKD *const xd,
const MB_MODE_INFO *const mbmi,
const EDGE_DIR edge_dir, const int mi_row,
@@ -1914,7 +212,7 @@ static TX_SIZE get_transform_size(const MACROBLOCKD *const xd,
tx_size = mb_tx_size;
}
- // since in case of chrominance or non-square transorm need to convert
+ // since in case of chrominance or non-square transform need to convert
// transform size into transform size in particular direction.
// for vertical edge, filter direction is horizontal, for horizontal
// edge, filter direction is vertical.
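The comment in this hunk is about collapsing a 2-D (possibly non-square, possibly chroma-subsampled) transform into the one dimension the deblocking filter actually cares about. A minimal sketch of that mapping, assuming the txsize_horz_map[]/txsize_vert_map[] tables from common_data.h behave as in this tree; the helper name is illustrative only, and EDGE_DIR/VERT_EDGE are the enums this patch moves into av1_loopfilter.h:

    #include "av1/common/av1_loopfilter.h"  /* EDGE_DIR                         */
    #include "av1/common/common_data.h"     /* txsize_horz_map, txsize_vert_map */

    /* Sketch, not part of the imported source: for a vertical edge the filter
     * taps run horizontally, so the transform's width picks the filter length;
     * for a horizontal edge its height does. */
    static TX_SIZE tx_size_in_filter_direction(TX_SIZE tx_size,
                                               EDGE_DIR edge_dir) {
      return (edge_dir == VERT_EDGE) ? txsize_horz_map[tx_size]
                                     : txsize_vert_map[tx_size];
    }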
@@ -1933,7 +231,7 @@ typedef struct AV1_DEBLOCKING_PARAMETERS {
} AV1_DEBLOCKING_PARAMETERS;
// Return TX_SIZE from get_transform_size(), so it is plane and direction
-// awared
+// aware
static TX_SIZE set_lpf_parameters(
AV1_DEBLOCKING_PARAMETERS *const params, const ptrdiff_t mode_step,
const AV1_COMMON *const cm, const MACROBLOCKD *const xd,
@@ -1958,7 +256,8 @@ static TX_SIZE set_lpf_parameters(
// and mi_col should be odd number for chroma plane.
const int mi_row = scale_vert | ((y << scale_vert) >> MI_SIZE_LOG2);
const int mi_col = scale_horz | ((x << scale_horz) >> MI_SIZE_LOG2);
- MB_MODE_INFO **mi = cm->mi_grid_visible + mi_row * cm->mi_stride + mi_col;
+ MB_MODE_INFO **mi =
+ cm->mi_params.mi_grid_base + mi_row * cm->mi_params.mi_stride + mi_col;
const MB_MODE_INFO *mbmi = mi[0];
// If current mbmi is not correctly setup, return an invalid value to stop
// filtering. One example is that if this tile is not coded, then its mbmi
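The mi_row/mi_col expressions in this hunk encode a convention worth spelling out: for a subsampled chroma plane the mode-info coordinate is forced to be odd, so the chroma block reads the mode info stored at the bottom-right luma position it covers. A tiny standalone check with concrete numbers (MI_SIZE_LOG2 is assumed to be 2, i.e. 4x4 mode-info units; the input values are only an illustration):

    #include <assert.h>

    int main(void) {
      const int MI_SIZE_LOG2 = 2; /* 4x4 mode-info units (assumed) */
      const int scale_vert = 1;   /* 4:2:0 chroma plane            */
      const int y = 8;            /* chroma pixel row              */
      /* 8 << 1 = luma pixel row 16, >> 2 = mi row 4, | 1 = odd row 5 */
      const int mi_row = scale_vert | ((y << scale_vert) >> MI_SIZE_LOG2);
      assert(mi_row == 5);
      return 0;
    }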
@@ -1979,7 +278,7 @@ static TX_SIZE set_lpf_parameters(
// prepare outer edge parameters. deblock the edge if it's an edge of a TU
{
const uint32_t curr_level =
- get_filter_level(cm, &cm->lf_info, edge_dir, plane, mbmi);
+ av1_get_filter_level(cm, &cm->lf_info, edge_dir, plane, mbmi);
const int curr_skipped = mbmi->skip && is_inter_block(mbmi);
uint32_t level = curr_level;
if (coord) {
@@ -1994,12 +293,13 @@ static TX_SIZE set_lpf_parameters(
xd, mi_prev, edge_dir, pv_row, pv_col, plane, plane_ptr);
const uint32_t pv_lvl =
- get_filter_level(cm, &cm->lf_info, edge_dir, plane, mi_prev);
+ av1_get_filter_level(cm, &cm->lf_info, edge_dir, plane, mi_prev);
const int pv_skip = mi_prev->skip && is_inter_block(mi_prev);
const BLOCK_SIZE bsize =
get_plane_block_size(mbmi->sb_type, plane_ptr->subsampling_x,
plane_ptr->subsampling_y);
+ assert(bsize < BLOCK_SIZES_ALL);
const int prediction_masks = edge_dir == VERT_EDGE
? block_size_wide[bsize] - 1
: block_size_high[bsize] - 1;
@@ -2047,21 +347,18 @@ void av1_filter_block_plane_vert(const AV1_COMMON *const cm,
const MACROBLOCKD *const xd, const int plane,
const MACROBLOCKD_PLANE *const plane_ptr,
const uint32_t mi_row, const uint32_t mi_col) {
- const int row_step = MI_SIZE >> MI_SIZE_LOG2;
const uint32_t scale_horz = plane_ptr->subsampling_x;
const uint32_t scale_vert = plane_ptr->subsampling_y;
uint8_t *const dst_ptr = plane_ptr->dst.buf;
const int dst_stride = plane_ptr->dst.stride;
const int y_range = (MAX_MIB_SIZE >> scale_vert);
const int x_range = (MAX_MIB_SIZE >> scale_horz);
- const int use_highbitdepth = cm->seq_params.use_highbitdepth;
- const aom_bit_depth_t bit_depth = cm->seq_params.bit_depth;
- for (int y = 0; y < y_range; y += row_step) {
+ for (int y = 0; y < y_range; y++) {
uint8_t *p = dst_ptr + y * MI_SIZE * dst_stride;
for (int x = 0; x < x_range;) {
// inner loop always filter vertical edges in a MI block. If MI size
// is 8x8, it will filter the vertical edge aligned with a 8x8 block.
- // If 4x4 trasnform is used, it will then filter the internal edge
+ // If 4x4 transform is used, it will then filter the internal edge
// aligned with a 4x4 block
const uint32_t curr_x = ((mi_col * MI_SIZE) >> scale_horz) + x * MI_SIZE;
const uint32_t curr_y = ((mi_row * MI_SIZE) >> scale_vert) + y * MI_SIZE;
@@ -2078,6 +375,9 @@ void av1_filter_block_plane_vert(const AV1_COMMON *const cm,
tx_size = TX_4X4;
}
+#if CONFIG_AV1_HIGHBITDEPTH
+ const int use_highbitdepth = cm->seq_params.use_highbitdepth;
+ const aom_bit_depth_t bit_depth = cm->seq_params.bit_depth;
switch (params.filter_length) {
// apply 4-tap filtering
case 4:
@@ -2122,6 +422,32 @@ void av1_filter_block_plane_vert(const AV1_COMMON *const cm,
// no filtering
default: break;
}
+#else
+ switch (params.filter_length) {
+ // apply 4-tap filtering
+ case 4:
+ aom_lpf_vertical_4(p, dst_stride, params.mblim, params.lim,
+ params.hev_thr);
+ break;
+ case 6: // apply 6-tap filter for chroma plane only
+ assert(plane != 0);
+ aom_lpf_vertical_6(p, dst_stride, params.mblim, params.lim,
+ params.hev_thr);
+ break;
+ // apply 8-tap filtering
+ case 8:
+ aom_lpf_vertical_8(p, dst_stride, params.mblim, params.lim,
+ params.hev_thr);
+ break;
+ // apply 14-tap filtering
+ case 14:
+ aom_lpf_vertical_14(p, dst_stride, params.mblim, params.lim,
+ params.hev_thr);
+ break;
+ // no filtering
+ default: break;
+ }
+#endif // CONFIG_AV1_HIGHBITDEPTH
// advance the destination pointer
advance_units = tx_size_wide_unit[tx_size];
x += advance_units;
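Advancing by tx_size_wide_unit[] is what lets the vertical pass jump straight to the next transform boundary instead of visiting every 4x4 column. A small sketch of that step, assuming MI_SIZE = 4 and tx_size_wide_unit[TX_16X16] = 4 as in this tree; the helper name is illustrative only:

    #include <stdint.h>

    /* Illustration only: move the column index (in 4x4 units) and the 8-bit
     * pixel pointer past one transform block.  For a 16x16 transform this
     * skips 4 units, i.e. 16 pixels, matching the loop above. */
    static uint8_t *advance_past_tx(uint8_t *p, int *x, int tx_wide_units) {
      enum { MI_SIZE = 4 };
      *x += tx_wide_units;
      return p + tx_wide_units * MI_SIZE;
    }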
@@ -2134,21 +460,18 @@ void av1_filter_block_plane_horz(const AV1_COMMON *const cm,
const MACROBLOCKD *const xd, const int plane,
const MACROBLOCKD_PLANE *const plane_ptr,
const uint32_t mi_row, const uint32_t mi_col) {
- const int col_step = MI_SIZE >> MI_SIZE_LOG2;
const uint32_t scale_horz = plane_ptr->subsampling_x;
const uint32_t scale_vert = plane_ptr->subsampling_y;
uint8_t *const dst_ptr = plane_ptr->dst.buf;
const int dst_stride = plane_ptr->dst.stride;
const int y_range = (MAX_MIB_SIZE >> scale_vert);
const int x_range = (MAX_MIB_SIZE >> scale_horz);
- const int use_highbitdepth = cm->seq_params.use_highbitdepth;
- const aom_bit_depth_t bit_depth = cm->seq_params.bit_depth;
- for (int x = 0; x < x_range; x += col_step) {
+ for (int x = 0; x < x_range; x++) {
uint8_t *p = dst_ptr + x * MI_SIZE;
for (int y = 0; y < y_range;) {
// inner loop always filter vertical edges in a MI block. If MI size
// is 8x8, it will first filter the vertical edge aligned with a 8x8
- // block. If 4x4 trasnform is used, it will then filter the internal
+ // block. If 4x4 transform is used, it will then filter the internal
// edge aligned with a 4x4 block
const uint32_t curr_x = ((mi_col * MI_SIZE) >> scale_horz) + x * MI_SIZE;
const uint32_t curr_y = ((mi_row * MI_SIZE) >> scale_vert) + y * MI_SIZE;
@@ -2157,14 +480,17 @@ void av1_filter_block_plane_horz(const AV1_COMMON *const cm,
AV1_DEBLOCKING_PARAMETERS params;
memset(&params, 0, sizeof(params));
- tx_size =
- set_lpf_parameters(&params, (cm->mi_stride << scale_vert), cm, xd,
- HORZ_EDGE, curr_x, curr_y, plane, plane_ptr);
+ tx_size = set_lpf_parameters(
+ &params, (cm->mi_params.mi_stride << scale_vert), cm, xd, HORZ_EDGE,
+ curr_x, curr_y, plane, plane_ptr);
if (tx_size == TX_INVALID) {
params.filter_length = 0;
tx_size = TX_4X4;
}
+#if CONFIG_AV1_HIGHBITDEPTH
+ const int use_highbitdepth = cm->seq_params.use_highbitdepth;
+ const aom_bit_depth_t bit_depth = cm->seq_params.bit_depth;
switch (params.filter_length) {
// apply 4-tap filtering
case 4:
@@ -2210,6 +536,117 @@ void av1_filter_block_plane_horz(const AV1_COMMON *const cm,
// no filtering
default: break;
}
+#else
+ switch (params.filter_length) {
+ // apply 4-tap filtering
+ case 4:
+ aom_lpf_horizontal_4(p, dst_stride, params.mblim, params.lim,
+ params.hev_thr);
+ break;
+ // apply 6-tap filtering
+ case 6:
+ assert(plane != 0);
+ aom_lpf_horizontal_6(p, dst_stride, params.mblim, params.lim,
+ params.hev_thr);
+ break;
+ // apply 8-tap filtering
+ case 8:
+ aom_lpf_horizontal_8(p, dst_stride, params.mblim, params.lim,
+ params.hev_thr);
+ break;
+ // apply 14-tap filtering
+ case 14:
+ aom_lpf_horizontal_14(p, dst_stride, params.mblim, params.lim,
+ params.hev_thr);
+ break;
+ // no filtering
+ default: break;
+ }
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+ // advance the destination pointer
+ advance_units = tx_size_high_unit[tx_size];
+ y += advance_units;
+ p += advance_units * dst_stride * MI_SIZE;
+ }
+ }
+}
+
+void av1_filter_block_plane_vert_test(const AV1_COMMON *const cm,
+ const MACROBLOCKD *const xd,
+ const int plane,
+ const MACROBLOCKD_PLANE *const plane_ptr,
+ const uint32_t mi_row,
+ const uint32_t mi_col) {
+ const uint32_t scale_horz = plane_ptr->subsampling_x;
+ const uint32_t scale_vert = plane_ptr->subsampling_y;
+ uint8_t *const dst_ptr = plane_ptr->dst.buf;
+ const int dst_stride = plane_ptr->dst.stride;
+ const int y_range = cm->mi_params.mi_rows >> scale_vert;
+ const int x_range = cm->mi_params.mi_cols >> scale_horz;
+ for (int y = 0; y < y_range; y++) {
+ uint8_t *p = dst_ptr + y * MI_SIZE * dst_stride;
+ for (int x = 0; x < x_range;) {
+      // The inner loop always filters vertical edges in an MI block. If the
+      // MI size is 8x8, it filters the vertical edge aligned with the 8x8
+      // block. If a 4x4 transform is used, it then filters the internal edge
+      // aligned with a 4x4 block.

+ const uint32_t curr_x = ((mi_col * MI_SIZE) >> scale_horz) + x * MI_SIZE;
+ const uint32_t curr_y = ((mi_row * MI_SIZE) >> scale_vert) + y * MI_SIZE;
+ uint32_t advance_units;
+ TX_SIZE tx_size;
+ AV1_DEBLOCKING_PARAMETERS params;
+ memset(&params, 0, sizeof(params));
+
+ tx_size =
+ set_lpf_parameters(&params, ((ptrdiff_t)1 << scale_horz), cm, xd,
+ VERT_EDGE, curr_x, curr_y, plane, plane_ptr);
+ if (tx_size == TX_INVALID) {
+ params.filter_length = 0;
+ tx_size = TX_4X4;
+ }
+
+ // advance the destination pointer
+ advance_units = tx_size_wide_unit[tx_size];
+ x += advance_units;
+ p += advance_units * MI_SIZE;
+ }
+ }
+}
+
+void av1_filter_block_plane_horz_test(const AV1_COMMON *const cm,
+ const MACROBLOCKD *const xd,
+ const int plane,
+ const MACROBLOCKD_PLANE *const plane_ptr,
+ const uint32_t mi_row,
+ const uint32_t mi_col) {
+ const uint32_t scale_horz = plane_ptr->subsampling_x;
+ const uint32_t scale_vert = plane_ptr->subsampling_y;
+ uint8_t *const dst_ptr = plane_ptr->dst.buf;
+ const int dst_stride = plane_ptr->dst.stride;
+ const int y_range = cm->mi_params.mi_rows >> scale_vert;
+ const int x_range = cm->mi_params.mi_cols >> scale_horz;
+ for (int x = 0; x < x_range; x++) {
+ uint8_t *p = dst_ptr + x * MI_SIZE;
+ for (int y = 0; y < y_range;) {
+      // The inner loop always filters horizontal edges in an MI block. If the
+      // MI size is 8x8, it first filters the horizontal edge aligned with the
+      // 8x8 block. If a 4x4 transform is used, it then filters the internal
+      // edge aligned with a 4x4 block.
+ const uint32_t curr_x = ((mi_col * MI_SIZE) >> scale_horz) + x * MI_SIZE;
+ const uint32_t curr_y = ((mi_row * MI_SIZE) >> scale_vert) + y * MI_SIZE;
+ uint32_t advance_units;
+ TX_SIZE tx_size;
+ AV1_DEBLOCKING_PARAMETERS params;
+ memset(&params, 0, sizeof(params));
+
+ tx_size = set_lpf_parameters(
+ &params, (cm->mi_params.mi_stride << scale_vert), cm, xd, HORZ_EDGE,
+ curr_x, curr_y, plane, plane_ptr);
+ if (tx_size == TX_INVALID) {
+ params.filter_length = 0;
+ tx_size = TX_4X4;
+ }
// advance the destination pointer
advance_units = tx_size_high_unit[tx_size];
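Note the two different mode_step values passed to set_lpf_parameters() in these test helpers: vertical edges use a one-column step (doubled when the chroma plane is subsampled horizontally), horizontal edges use one row of the mode-info grid. A sketch of the lookup this implies, under the assumption that set_lpf_parameters() reaches the neighbouring block by stepping back mode_step entries in the row-major mi grid; the helper name is illustrative only:

    #include <stddef.h>

    typedef struct MB_MODE_INFO MB_MODE_INFO; /* opaque for this sketch */

    /* Illustration only: mi_cur points into the frame's mi grid; the block on
     * the far side of the edge being filtered sits mode_step entries earlier
     * (left neighbour for VERT_EDGE, above neighbour for HORZ_EDGE). */
    static MB_MODE_INFO *neighbor_across_edge(MB_MODE_INFO **mi_cur,
                                              ptrdiff_t mode_step) {
      return *(mi_cur - mode_step);
    }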
@@ -2221,18 +658,19 @@ void av1_filter_block_plane_horz(const AV1_COMMON *const cm,
static void loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer, AV1_COMMON *cm,
MACROBLOCKD *xd, int start, int stop,
-#if LOOP_FILTER_BITMASK
+#if CONFIG_LPF_MASK
int is_decoding,
#endif
int plane_start, int plane_end) {
struct macroblockd_plane *pd = xd->plane;
const int col_start = 0;
- const int col_end = cm->mi_cols;
+ const int col_end = cm->mi_params.mi_cols;
int mi_row, mi_col;
int plane;
-#if LOOP_FILTER_BITMASK
+#if CONFIG_LPF_MASK
if (is_decoding) {
+ cm->is_decoding = is_decoding;
for (plane = plane_start; plane < plane_end; plane++) {
if (plane == 0 && !(cm->lf.filter_level[0]) && !(cm->lf.filter_level[1]))
break;
@@ -2243,24 +681,25 @@ static void loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer, AV1_COMMON *cm,
av1_setup_dst_planes(pd, cm->seq_params.sb_size, frame_buffer, 0, 0,
plane, plane + 1);
+
av1_build_bitmask_vert_info(cm, &pd[plane], plane);
av1_build_bitmask_horz_info(cm, &pd[plane], plane);
// apply loop filtering which only goes through buffer once
for (mi_row = start; mi_row < stop; mi_row += MI_SIZE_64X64) {
for (mi_col = col_start; mi_col < col_end; mi_col += MI_SIZE_64X64) {
- av1_setup_dst_planes(pd, MI_SIZE_64X64, frame_buffer, mi_row, mi_col,
+ av1_setup_dst_planes(pd, BLOCK_64X64, frame_buffer, mi_row, mi_col,
plane, plane + 1);
av1_filter_block_plane_bitmask_vert(cm, &pd[plane], plane, mi_row,
mi_col);
if (mi_col - MI_SIZE_64X64 >= 0) {
- av1_setup_dst_planes(pd, MI_SIZE_64X64, frame_buffer, mi_row,
+ av1_setup_dst_planes(pd, BLOCK_64X64, frame_buffer, mi_row,
mi_col - MI_SIZE_64X64, plane, plane + 1);
av1_filter_block_plane_bitmask_horz(cm, &pd[plane], plane, mi_row,
mi_col - MI_SIZE_64X64);
}
}
- av1_setup_dst_planes(pd, MI_SIZE_64X64, frame_buffer, mi_row,
+ av1_setup_dst_planes(pd, BLOCK_64X64, frame_buffer, mi_row,
mi_col - MI_SIZE_64X64, plane, plane + 1);
av1_filter_block_plane_bitmask_horz(cm, &pd[plane], plane, mi_row,
mi_col - MI_SIZE_64X64);
@@ -2278,31 +717,6 @@ static void loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer, AV1_COMMON *cm,
else if (plane == 2 && !(cm->lf.filter_level_v))
continue;
-#if LOOP_FILTER_BITMASK
- // filter all vertical edges every superblock (could be 128x128 or 64x64)
- for (mi_row = start; mi_row < stop; mi_row += cm->seq_params.mib_size) {
- for (mi_col = col_start; mi_col < col_end;
- mi_col += cm->seq_params.mib_size) {
- av1_setup_dst_planes(pd, cm->seq_params.sb_size, frame_buffer, mi_row,
- mi_col, plane, plane + 1);
-
- av1_setup_bitmask(cm, mi_row, mi_col, plane, pd[plane].subsampling_x,
- pd[plane].subsampling_y, stop, col_end);
- av1_filter_block_plane_ver(cm, &pd[plane], plane, mi_row, mi_col);
- }
- }
-
- // filter all horizontal edges every superblock
- for (mi_row = start; mi_row < stop; mi_row += cm->seq_params.mib_size) {
- for (mi_col = col_start; mi_col < col_end;
- mi_col += cm->seq_params.mib_size) {
- av1_setup_dst_planes(pd, cm->seq_params.sb_size, frame_buffer, mi_row,
- mi_col, plane, plane + 1);
-
- av1_filter_block_plane_hor(cm, &pd[plane], plane, mi_row, mi_col);
- }
- }
-#else
if (cm->lf.combine_vert_horz_lf) {
// filter all vertical and horizontal edges in every 128x128 super block
for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) {
@@ -2348,29 +762,28 @@ static void loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer, AV1_COMMON *cm,
}
}
}
-#endif // LOOP_FILTER_BITMASK
}
}
void av1_loop_filter_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
MACROBLOCKD *xd,
-#if LOOP_FILTER_BITMASK
+#if CONFIG_LPF_MASK
int is_decoding,
#endif
int plane_start, int plane_end, int partial_frame) {
int start_mi_row, end_mi_row, mi_rows_to_filter;
start_mi_row = 0;
- mi_rows_to_filter = cm->mi_rows;
- if (partial_frame && cm->mi_rows > 8) {
- start_mi_row = cm->mi_rows >> 1;
+ mi_rows_to_filter = cm->mi_params.mi_rows;
+ if (partial_frame && cm->mi_params.mi_rows > 8) {
+ start_mi_row = cm->mi_params.mi_rows >> 1;
start_mi_row &= 0xfffffff8;
- mi_rows_to_filter = AOMMAX(cm->mi_rows / 8, 8);
+ mi_rows_to_filter = AOMMAX(cm->mi_params.mi_rows / 8, 8);
}
end_mi_row = start_mi_row + mi_rows_to_filter;
av1_loop_filter_frame_init(cm, plane_start, plane_end);
loop_filter_rows(frame, cm, xd, start_mi_row, end_mi_row,
-#if LOOP_FILTER_BITMASK
+#if CONFIG_LPF_MASK
is_decoding,
#endif
plane_start, plane_end);
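The partial_frame branch above deblocks only a band around the middle of the picture (used by the encoder's filter-level search). A quick numeric check of the mi_params-based arithmetic, assuming a hypothetical frame with 100 mode-info rows:

    #include <assert.h>

    #define AOMMAX(a, b) ((a) > (b) ? (a) : (b))

    int main(void) {
      const int mi_rows = 100;
      int start_mi_row = mi_rows >> 1;                       /* 50            */
      start_mi_row &= 0xfffffff8;                            /* round to 48   */
      const int mi_rows_to_filter = AOMMAX(mi_rows / 8, 8);  /* 12            */
      assert(start_mi_row == 48 && mi_rows_to_filter == 12);
      assert(start_mi_row + mi_rows_to_filter == 60);        /* rows [48, 60) */
      return 0;
    }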
diff --git a/media/libaom/src/av1/common/av1_loopfilter.h b/media/libaom/src/av1/common/av1_loopfilter.h
index 80ac61178..ce26d1647 100644
--- a/media/libaom/src/av1/common/av1_loopfilter.h
+++ b/media/libaom/src/av1/common/av1_loopfilter.h
@@ -33,11 +33,12 @@ enum lf_path {
LF_PATH_SLOW,
};
-#if LOOP_FILTER_BITMASK
+enum { VERT_EDGE = 0, HORZ_EDGE = 1, NUM_EDGE_DIRS } UENUM1BYTE(EDGE_DIR);
typedef struct {
uint64_t bits[4];
} FilterMask;
+#if CONFIG_LPF_MASK
// This structure holds bit masks for all 4x4 blocks in a 64x64 region.
// Each 1 bit represents a position in which we want to apply the loop filter.
// For Y plane, 4x4 in 64x64 requires 16x16 = 256 bit, therefore we use 4
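A worked example of the bit layout described above: each 64x64 luma region holds 16x16 = 256 4x4 positions, packed into four 64-bit words of four rows each. The (index, shift) arithmetic below mirrors what the get_index_shift() helper appears to compute, judging from its call sites in this patch (an assumption: shift = (row % 4) * 16 + col); the set_bit helper is illustrative only:

    #include <assert.h>
    #include <stdint.h>

    typedef struct { uint64_t bits[4]; } FilterMask; /* as declared above */

    /* Illustration only: set the mask bit for the 4x4 block at (row, col)
     * inside a 64x64 region, rows/cols in 4x4 units (0..15). */
    static void set_bit(FilterMask *m, int row, int col) {
      const int index = row >> 2;               /* which 64-bit word   */
      const int shift = ((row & 3) << 4) | col; /* bit inside the word */
      m->bits[index] |= (uint64_t)1 << shift;
    }

    int main(void) {
      FilterMask m = { { 0, 0, 0, 0 } };
      set_bit(&m, /*row=*/5, /*col=*/3);        /* word 1, bit 19 */
      assert(m.bits[1] == ((uint64_t)1 << 19));
      return 0;
    }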
@@ -61,10 +62,12 @@ typedef struct {
uint8_t lfl_y_ver[MI_SIZE_64X64][MI_SIZE_64X64];
// U plane filter level
- uint8_t lfl_u[MI_SIZE_64X64][MI_SIZE_64X64];
+ uint8_t lfl_u_ver[MI_SIZE_64X64][MI_SIZE_64X64];
+ uint8_t lfl_u_hor[MI_SIZE_64X64][MI_SIZE_64X64];
// V plane filter level
- uint8_t lfl_v[MI_SIZE_64X64][MI_SIZE_64X64];
+ uint8_t lfl_v_ver[MI_SIZE_64X64][MI_SIZE_64X64];
+ uint8_t lfl_v_hor[MI_SIZE_64X64][MI_SIZE_64X64];
// other info
FilterMask skip;
@@ -74,7 +77,7 @@ typedef struct {
FilterMask tx_size_ver[2][5];
FilterMask tx_size_hor[2][5];
} LoopFilterMask;
-#endif // LOOP_FILTER_BITMASK
+#endif // CONFIG_LPF_MASK
struct loopfilter {
int filter_level[2];
@@ -95,11 +98,11 @@ struct loopfilter {
int combine_vert_horz_lf;
-#if LOOP_FILTER_BITMASK
+#if CONFIG_LPF_MASK
LoopFilterMask *lfm;
size_t lfm_num;
int lfm_stride;
-#endif // LOOP_FILTER_BITMASK
+#endif // CONFIG_LPF_MASK
};
// Need to align this structure so when it is declared and
@@ -125,13 +128,13 @@ void av1_loop_filter_init(struct AV1Common *cm);
void av1_loop_filter_frame_init(struct AV1Common *cm, int plane_start,
int plane_end);
-#if LOOP_FILTER_BITMASK
+#if CONFIG_LPF_MASK
void av1_loop_filter_frame(YV12_BUFFER_CONFIG *frame, struct AV1Common *cm,
- struct macroblockd *mbd, int is_decoding,
+ struct macroblockd *xd, int is_decoding,
int plane_start, int plane_end, int partial_frame);
#else
void av1_loop_filter_frame(YV12_BUFFER_CONFIG *frame, struct AV1Common *cm,
- struct macroblockd *mbd, int plane_start,
+ struct macroblockd *xd, int plane_start,
int plane_end, int partial_frame);
#endif
@@ -154,14 +157,10 @@ typedef struct LoopFilterWorkerData {
MACROBLOCKD *xd;
} LFWorkerData;
-uint8_t get_filter_level(const struct AV1Common *cm,
- const loop_filter_info_n *lfi_n, const int dir_idx,
- int plane, const MB_MODE_INFO *mbmi);
-#if LOOP_FILTER_BITMASK
-void av1_setup_bitmask(struct AV1Common *const cm, int mi_row, int mi_col,
- int plane, int subsampling_x, int subsampling_y,
- int row_end, int col_end);
-
+uint8_t av1_get_filter_level(const struct AV1Common *cm,
+ const loop_filter_info_n *lfi_n, const int dir_idx,
+ int plane, const MB_MODE_INFO *mbmi);
+#if CONFIG_LPF_MASK
void av1_filter_block_plane_ver(struct AV1Common *const cm,
struct macroblockd_plane *const plane_ptr,
int pl, int mi_row, int mi_col);
@@ -169,56 +168,38 @@ void av1_filter_block_plane_ver(struct AV1Common *const cm,
void av1_filter_block_plane_hor(struct AV1Common *const cm,
struct macroblockd_plane *const plane, int pl,
int mi_row, int mi_col);
-LoopFilterMask *get_loop_filter_mask(const struct AV1Common *const cm,
- int mi_row, int mi_col);
-int get_index_shift(int mi_col, int mi_row, int *index);
-
-static const FilterMask left_txform_mask[TX_SIZES] = {
- { { 0x0000000000000001ULL, // TX_4X4,
- 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
-
- { { 0x0000000000010001ULL, // TX_8X8,
- 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
-
- { { 0x0001000100010001ULL, // TX_16X16,
- 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
- { { 0x0001000100010001ULL, // TX_32X32,
- 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
+int get_index_shift(int mi_col, int mi_row, int *index);
- { { 0x0001000100010001ULL, // TX_64X64,
- 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL } },
-};
+void av1_build_bitmask_vert_info(
+ struct AV1Common *const cm, const struct macroblockd_plane *const plane_ptr,
+ int plane);
-static const uint64_t above_txform_mask[2][TX_SIZES] = {
- {
- 0x0000000000000001ULL, // TX_4X4
- 0x0000000000000003ULL, // TX_8X8
- 0x000000000000000fULL, // TX_16X16
- 0x00000000000000ffULL, // TX_32X32
- 0x000000000000ffffULL, // TX_64X64
- },
- {
- 0x0000000000000001ULL, // TX_4X4
- 0x0000000000000005ULL, // TX_8X8
- 0x0000000000000055ULL, // TX_16X16
- 0x0000000000005555ULL, // TX_32X32
- 0x0000000055555555ULL, // TX_64X64
- },
-};
+void av1_build_bitmask_horz_info(
+ struct AV1Common *const cm, const struct macroblockd_plane *const plane_ptr,
+ int plane);
-extern const int mask_id_table_tx_4x4[BLOCK_SIZES_ALL];
+void av1_filter_block_plane_bitmask_vert(
+ struct AV1Common *const cm, struct macroblockd_plane *const plane_ptr,
+ int pl, int mi_row, int mi_col);
-extern const int mask_id_table_tx_8x8[BLOCK_SIZES_ALL];
+void av1_filter_block_plane_bitmask_horz(
+ struct AV1Common *const cm, struct macroblockd_plane *const plane_ptr,
+ int pl, int mi_row, int mi_col);
-extern const int mask_id_table_tx_16x16[BLOCK_SIZES_ALL];
+void av1_store_bitmask_univariant_tx(struct AV1Common *cm, int mi_row,
+ int mi_col, BLOCK_SIZE bsize,
+ MB_MODE_INFO *mbmi);
-extern const int mask_id_table_tx_32x32[BLOCK_SIZES_ALL];
+void av1_store_bitmask_other_info(struct AV1Common *cm, int mi_row, int mi_col,
+ BLOCK_SIZE bsize, MB_MODE_INFO *mbmi,
+ int is_horz_coding_block_border,
+ int is_vert_coding_block_border);
-extern const FilterMask left_mask_univariant_reordered[67];
-
-extern const FilterMask above_mask_univariant_reordered[67];
-#endif
+void av1_store_bitmask_vartx(struct AV1Common *cm, int mi_row, int mi_col,
+ BLOCK_SIZE bsize, TX_SIZE tx_size,
+ MB_MODE_INFO *mbmi);
+#endif // CONFIG_LPF_MASK
#ifdef __cplusplus
} // extern "C"
diff --git a/media/libaom/src/av1/common/av1_rtcd_defs.pl b/media/libaom/src/av1/common/av1_rtcd_defs.pl
index dee1f1c79..296c6c572 100644
--- a/media/libaom/src/av1/common/av1_rtcd_defs.pl
+++ b/media/libaom/src/av1/common/av1_rtcd_defs.pl
@@ -33,21 +33,46 @@ struct txfm_param;
struct aom_variance_vtable;
struct search_site_config;
struct yv12_buffer_config;
+struct NN_CONFIG;
+typedef struct NN_CONFIG NN_CONFIG;
+
+enum { NONE, RELU, SOFTSIGN, SIGMOID } UENUM1BYTE(ACTIVATION);
+#if CONFIG_NN_V2
+enum { SOFTMAX_CROSS_ENTROPY } UENUM1BYTE(LOSS);
+struct NN_CONFIG_V2;
+typedef struct NN_CONFIG_V2 NN_CONFIG_V2;
+struct FC_LAYER;
+typedef struct FC_LAYER FC_LAYER;
+#endif // CONFIG_NN_V2
+
+struct CNN_CONFIG;
+typedef struct CNN_CONFIG CNN_CONFIG;
+struct CNN_LAYER_CONFIG;
+typedef struct CNN_LAYER_CONFIG CNN_LAYER_CONFIG;
+struct CNN_THREAD_DATA;
+typedef struct CNN_THREAD_DATA CNN_THREAD_DATA;
+struct CNN_BRANCH_CONFIG;
+typedef struct CNN_BRANCH_CONFIG CNN_BRANCH_CONFIG;
+struct CNN_MULTI_OUT;
+typedef struct CNN_MULTI_OUT CNN_MULTI_OUT;
/* Function pointers return by CfL functions */
typedef void (*cfl_subsample_lbd_fn)(const uint8_t *input, int input_stride,
uint16_t *output_q3);
+#if CONFIG_AV1_HIGHBITDEPTH
typedef void (*cfl_subsample_hbd_fn)(const uint16_t *input, int input_stride,
uint16_t *output_q3);
+typedef void (*cfl_predict_hbd_fn)(const int16_t *src, uint16_t *dst,
+ int dst_stride, int alpha_q3, int bd);
+#endif
+
typedef void (*cfl_subtract_average_fn)(const uint16_t *src, int16_t *dst);
typedef void (*cfl_predict_lbd_fn)(const int16_t *src, uint8_t *dst,
int dst_stride, int alpha_q3);
-typedef void (*cfl_predict_hbd_fn)(const int16_t *src, uint16_t *dst,
- int dst_stride, int alpha_q3, int bd);
EOF
}
forward_decls qw/av1_common_forward_decls/;
@@ -65,22 +90,24 @@ if ($opts{arch} eq "x86_64") {
add_proto qw/void av1_convolve_horiz_rs/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn";
specialize qw/av1_convolve_horiz_rs sse4_1/;
-add_proto qw/void av1_highbd_convolve_horiz_rs/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn, int bd";
-specialize qw/av1_highbd_convolve_horiz_rs sse4_1/;
-
-add_proto qw/void av1_wiener_convolve_add_src/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params";
+if(aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
+ add_proto qw/void av1_highbd_convolve_horiz_rs/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn, int bd";
+ specialize qw/av1_highbd_convolve_horiz_rs sse4_1/;
-add_proto qw/void av1_highbd_wiener_convolve_add_src/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params, int bps";
+ add_proto qw/void av1_highbd_wiener_convolve_add_src/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params, int bd";
+ specialize qw/av1_highbd_wiener_convolve_add_src ssse3 avx2/;
+}
+add_proto qw/void av1_wiener_convolve_add_src/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params";
specialize qw/av1_wiener_convolve_add_src sse2 avx2 neon/;
-specialize qw/av1_highbd_wiener_convolve_add_src ssse3/;
-specialize qw/av1_highbd_wiener_convolve_add_src avx2/;
-
# directional intra predictor functions
add_proto qw/void av1_dr_prediction_z1/, "uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_above, int dx, int dy";
+specialize qw/av1_dr_prediction_z1 avx2/;
add_proto qw/void av1_dr_prediction_z2/, "uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_above, int upsample_left, int dx, int dy";
+specialize qw/av1_dr_prediction_z2 avx2/;
add_proto qw/void av1_dr_prediction_z3/, "uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_left, int dx, int dy";
+specialize qw/av1_dr_prediction_z3 avx2/;
# FILTER_INTRA predictor functions
add_proto qw/void av1_filter_intra_predictor/, "uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint8_t *above, const uint8_t *left, int mode";
@@ -108,21 +135,21 @@ specialize qw/av1_highbd_convolve8_vert/, "$sse2_x86_64";
add_proto qw/void av1_inv_txfm_add/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param";
specialize qw/av1_inv_txfm_add ssse3 avx2 neon/;
-add_proto qw/void av1_highbd_inv_txfm_add/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param";
+add_proto qw/void av1_highbd_inv_txfm_add/, "const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param";
specialize qw/av1_highbd_inv_txfm_add sse4_1 avx2/;
-add_proto qw/void av1_highbd_inv_txfm_add_4x4/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param";
+add_proto qw/void av1_highbd_inv_txfm_add_4x4/, "const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param";
specialize qw/av1_highbd_inv_txfm_add_4x4 sse4_1/;
-add_proto qw/void av1_highbd_inv_txfm_add_8x8/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param";
+add_proto qw/void av1_highbd_inv_txfm_add_8x8/, "const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param";
specialize qw/av1_highbd_inv_txfm_add_8x8 sse4_1/;
-add_proto qw/void av1_highbd_inv_txfm_add_16x8/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param";
-specialize qw/av1_highbd_inv_txfm_add_16x8 sse4_1/;
-add_proto qw/void av1_highbd_inv_txfm_add_8x16/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param";
-specialize qw/av1_highbd_inv_txfm_add_8x16 sse4_1/;
-add_proto qw/void av1_highbd_inv_txfm_add_16x16/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param";
-specialize qw/av1_highbd_inv_txfm_add_16x16 sse4_1/;
-add_proto qw/void av1_highbd_inv_txfm_add_32x32/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param";
-specialize qw/av1_highbd_inv_txfm_add_32x32 sse4_1 avx2/;
+add_proto qw/void av1_highbd_inv_txfm_add_4x8/, "const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param";
+specialize qw/av1_highbd_inv_txfm_add_4x8 sse4_1/;
+add_proto qw/void av1_highbd_inv_txfm_add_8x4/, "const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param";
+specialize qw/av1_highbd_inv_txfm_add_8x4 sse4_1/;
+add_proto qw/void av1_highbd_inv_txfm_add_4x16/, "const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param";
+specialize qw/av1_highbd_inv_txfm_add_4x16 sse4_1/;
+add_proto qw/void av1_highbd_inv_txfm_add_16x4/, "const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param";
+specialize qw/av1_highbd_inv_txfm_add_16x4 sse4_1/;
add_proto qw/void av1_highbd_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
add_proto qw/void av1_highbd_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
@@ -151,10 +178,15 @@ add_proto qw/void av1_inv_txfm2d_add_16x4/, "const int32_t *input, uint16_t *out
add_proto qw/void av1_inv_txfm2d_add_8x32/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
add_proto qw/void av1_inv_txfm2d_add_32x8/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
-# directional intra predictor functions
-add_proto qw/void av1_highbd_dr_prediction_z1/, "uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_above, int dx, int dy, int bd";
-add_proto qw/void av1_highbd_dr_prediction_z2/, "uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_above, int upsample_left, int dx, int dy, int bd";
-add_proto qw/void av1_highbd_dr_prediction_z3/, "uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_left, int dx, int dy, int bd";
+if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
+ # directional intra predictor functions
+ add_proto qw/void av1_highbd_dr_prediction_z1/, "uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_above, int dx, int dy, int bd";
+ specialize qw/av1_highbd_dr_prediction_z1 avx2/;
+ add_proto qw/void av1_highbd_dr_prediction_z2/, "uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_above, int upsample_left, int dx, int dy, int bd";
+ specialize qw/av1_highbd_dr_prediction_z2 avx2/;
+ add_proto qw/void av1_highbd_dr_prediction_z3/, "uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_left, int dx, int dy, int bd";
+ specialize qw/av1_highbd_dr_prediction_z3 avx2/;
+}
# build compound seg mask functions
add_proto qw/void av1_build_compound_diffwtd_mask/, "uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w";
@@ -166,6 +198,10 @@ specialize qw/av1_build_compound_diffwtd_mask_highbd ssse3 avx2/;
add_proto qw/void av1_build_compound_diffwtd_mask_d16/, "uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0, int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, ConvolveParams *conv_params, int bd";
specialize qw/av1_build_compound_diffwtd_mask_d16 sse4_1 avx2 neon/;
+# Helper functions.
+add_proto qw/void av1_round_shift_array/, "int32_t *arr, int size, int bit";
+specialize "av1_round_shift_array", qw/sse4_1 neon/;
+
#
# Encoder functions below this point.
#
@@ -176,10 +212,17 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
# the transform coefficients are held in 32-bit
# values, so the assembler code for av1_block_error can no longer be used.
add_proto qw/int64_t av1_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
- specialize qw/av1_block_error avx2/;
+ specialize qw/av1_block_error sse2 avx2 neon/;
+
+ add_proto qw/int64_t av1_block_error_lp/, "const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size";
+ specialize qw/av1_block_error_lp avx2 neon/;
add_proto qw/void av1_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
- specialize qw/av1_quantize_fp sse2 avx2/;
+ specialize qw/av1_quantize_fp sse2 avx2 neon/;
+
+ add_proto qw/void av1_quantize_lp/, "const int16_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan";
+ specialize qw/av1_quantize_lp avx2 neon/;
+
add_proto qw/void av1_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/av1_quantize_fp_32x32 avx2/;
@@ -196,54 +239,71 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
specialize qw/av1_lowbd_fwd_txfm sse2 sse4_1 avx2/;
add_proto qw/void av1_fwd_txfm2d_4x8/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+ specialize qw/av1_fwd_txfm2d_4x8 sse4_1/;
add_proto qw/void av1_fwd_txfm2d_8x4/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+ specialize qw/av1_fwd_txfm2d_8x4 sse4_1/;
add_proto qw/void av1_fwd_txfm2d_8x16/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
- specialize qw/av1_fwd_txfm2d_8x16 sse4_1/;
+ specialize qw/av1_fwd_txfm2d_8x16 sse4_1 avx2/;
add_proto qw/void av1_fwd_txfm2d_16x8/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
- specialize qw/av1_fwd_txfm2d_16x8 sse4_1/;
+ specialize qw/av1_fwd_txfm2d_16x8 sse4_1 avx2/;
add_proto qw/void av1_fwd_txfm2d_16x32/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+ specialize qw/av1_fwd_txfm2d_16x32 sse4_1/;
add_proto qw/void av1_fwd_txfm2d_32x16/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+ specialize qw/av1_fwd_txfm2d_32x16 sse4_1/;
add_proto qw/void av1_fwd_txfm2d_4x16/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+ specialize qw/av1_fwd_txfm2d_4x16 sse4_1/;
add_proto qw/void av1_fwd_txfm2d_16x4/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+ specialize qw/av1_fwd_txfm2d_16x4 sse4_1/;
add_proto qw/void av1_fwd_txfm2d_8x32/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+ specialize qw/av1_fwd_txfm2d_8x32 sse4_1/;
add_proto qw/void av1_fwd_txfm2d_32x8/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+ specialize qw/av1_fwd_txfm2d_32x8 sse4_1/;
add_proto qw/void av1_fwd_txfm2d_4x4/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
specialize qw/av1_fwd_txfm2d_4x4 sse4_1/;
add_proto qw/void av1_fwd_txfm2d_8x8/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
- specialize qw/av1_fwd_txfm2d_8x8 sse4_1/;
+ specialize qw/av1_fwd_txfm2d_8x8 sse4_1 avx2/;
add_proto qw/void av1_fwd_txfm2d_16x16/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
- specialize qw/av1_fwd_txfm2d_16x16 sse4_1/;
+ specialize qw/av1_fwd_txfm2d_16x16 sse4_1 avx2/;
add_proto qw/void av1_fwd_txfm2d_32x32/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
- specialize qw/av1_fwd_txfm2d_32x32 sse4_1/;
+ specialize qw/av1_fwd_txfm2d_32x32 sse4_1 avx2/;
add_proto qw/void av1_fwd_txfm2d_64x64/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
- specialize qw/av1_fwd_txfm2d_64x64 sse4_1/;
+ specialize qw/av1_fwd_txfm2d_64x64 sse4_1 avx2/;
add_proto qw/void av1_fwd_txfm2d_32x64/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+ specialize qw/av1_fwd_txfm2d_32x64 sse4_1/;
add_proto qw/void av1_fwd_txfm2d_64x32/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+ specialize qw/av1_fwd_txfm2d_64x32 sse4_1/;
add_proto qw/void av1_fwd_txfm2d_16x64/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+ specialize qw/av1_fwd_txfm2d_16x64 sse4_1/;
add_proto qw/void av1_fwd_txfm2d_64x16/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+ specialize qw/av1_fwd_txfm2d_64x16 sse4_1/;
#
# Motion search
#
- add_proto qw/int av1_diamond_search_sad/, "struct macroblock *x, const struct search_site_config *cfg, MV *ref_mv, MV *best_mv, int search_param, int sad_per_bit, int *num00, const struct aom_variance_vtable *fn_ptr, const MV *center_mv";
-
add_proto qw/int av1_full_range_search/, "const struct macroblock *x, const struct search_site_config *cfg, MV *ref_mv, MV *best_mv, int search_param, int sad_per_bit, int *num00, const struct aom_variance_vtable *fn_ptr, const MV *center_mv";
- add_proto qw/void av1_temporal_filter_apply/, "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count";
- specialize qw/av1_temporal_filter_apply sse2 msa/;
+ if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") {
+ add_proto qw/void av1_apply_temporal_filter_yuv/, "const struct yv12_buffer_config *ref_frame, const struct macroblockd *mbd, const BLOCK_SIZE block_size, const int mb_row, const int mb_col, const int num_planes, const int strength, const int use_subblock, const int *subblock_filter_weights, const uint8_t *pred, uint32_t *accum, uint16_t *count";
+ specialize qw/av1_apply_temporal_filter_yuv sse4_1/;
+ }
+ if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") {
+ add_proto qw/void av1_apply_temporal_filter_planewise/, "const struct yv12_buffer_config *ref_frame, const struct macroblockd *mbd, const BLOCK_SIZE block_size, const int mb_row, const int mb_col, const int num_planes, const double *noise_levels, const int use_subblock, const int block_mse, const int *subblock_mses, const int q_factor, const uint8_t *pred, uint32_t *accum, uint16_t *count";
+ specialize qw/av1_apply_temporal_filter_planewise sse2 avx2/;
+ }
add_proto qw/void av1_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr, int log_scale";
# ENCODEMB INVOKE
+ if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
+ add_proto qw/int64_t av1_highbd_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd";
+ specialize qw/av1_highbd_block_error sse2 avx2/;
+ }
- add_proto qw/int64_t av1_highbd_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd";
- specialize qw/av1_highbd_block_error sse2/;
-
- add_proto qw/void av1_highbd_temporal_filter_apply/, "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count";
-
- add_proto qw/void av1_highbd_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, int log_scale";
- specialize qw/av1_highbd_quantize_fp sse4_1 avx2/;
+ if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
+ add_proto qw/void av1_highbd_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, int log_scale";
+ specialize qw/av1_highbd_quantize_fp sse4_1 avx2/;
+ }
add_proto qw/void av1_highbd_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
@@ -257,30 +317,57 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
add_proto qw/uint64_t av1_wedge_sse_from_residuals/, "const int16_t *r1, const int16_t *d, const uint8_t *m, int N";
specialize qw/av1_wedge_sse_from_residuals sse2 avx2/;
- add_proto qw/int av1_wedge_sign_from_residuals/, "const int16_t *ds, const uint8_t *m, int N, int64_t limit";
+ add_proto qw/int8_t av1_wedge_sign_from_residuals/, "const int16_t *ds, const uint8_t *m, int N, int64_t limit";
specialize qw/av1_wedge_sign_from_residuals sse2 avx2/;
add_proto qw/void av1_wedge_compute_delta_squares/, "int16_t *d, const int16_t *a, const int16_t *b, int N";
specialize qw/av1_wedge_compute_delta_squares sse2 avx2/;
# hash
- add_proto qw/uint32_t av1_get_crc32c_value/, "void *crc_calculator, uint8_t *p, int length";
+ add_proto qw/uint32_t av1_get_crc32c_value/, "void *crc_calculator, uint8_t *p, size_t length";
specialize qw/av1_get_crc32c_value sse4_2/;
- add_proto qw/void av1_compute_stats/, "int wiener_win, const uint8_t *dgd8, const uint8_t *src8, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int src_stride, double *M, double *H";
+ add_proto qw/void av1_compute_stats/, "int wiener_win, const uint8_t *dgd8, const uint8_t *src8, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H";
specialize qw/av1_compute_stats sse4_1 avx2/;
+ if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
+ add_proto qw/void av1_compute_stats_highbd/, "int wiener_win, const uint8_t *dgd8, const uint8_t *src8, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H, aom_bit_depth_t bit_depth";
+ specialize qw/av1_compute_stats_highbd sse4_1 avx2/;
+ }
+
+ add_proto qw/void av1_calc_proj_params/, " const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2], const sgr_params_type *params";
+ specialize qw/av1_calc_proj_params avx2/;
+
add_proto qw/int64_t av1_lowbd_pixel_proj_error/, " const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params";
specialize qw/av1_lowbd_pixel_proj_error sse4_1 avx2/;
+
+ if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
+ add_proto qw/int64_t av1_highbd_pixel_proj_error/, " const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params";
+ specialize qw/av1_highbd_pixel_proj_error sse4_1 avx2/;
+ }
+ add_proto qw/void av1_get_horver_correlation_full/, " const int16_t *diff, int stride, int w, int h, float *hcorr, float *vcorr";
+ specialize qw/av1_get_horver_correlation_full sse4_1 avx2/;
+
+ add_proto qw/void av1_nn_predict/, " const float *input_nodes, const NN_CONFIG *const nn_config, int reduce_prec, float *const output";
+ specialize qw/av1_nn_predict sse3/;
}
# end encoder functions
+# CNN functions
+
+add_proto qw/void av1_cnn_activate/, " float **input, int channels, int width, int height, int stride, ACTIVATION layer_activation";
+add_proto qw/void av1_cnn_add/, " float **input, int channels, int width, int height, int stride, const float **add";
+add_proto qw/void av1_cnn_predict/, " const float **input, int in_width, int in_height, int in_stride, const CNN_CONFIG *cnn_config, const CNN_THREAD_DATA *thread_data, CNN_MULTI_OUT *output_struct";
+add_proto qw/void av1_cnn_convolve/, " const float **input, int in_width, int in_height, int in_stride, const CNN_LAYER_CONFIG *layer_config, float **output, int out_stride, int start_idx, int step";
+add_proto qw/void av1_cnn_deconvolve/, " const float **input, int in_width, int in_height, int in_stride, const CNN_LAYER_CONFIG *layer_config, float **output, int out_stride";
+add_proto qw/void av1_cnn_batchnorm/, "float **image, int channels, int width, int height, int stride, const float *gamma, const float *beta, const float *mean, const float *std";
+
# Deringing Functions
add_proto qw/int cdef_find_dir/, "const uint16_t *img, int stride, int32_t *var, int coeff_shift";
-add_proto qw/void cdef_filter_block/, "uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int max, int coeff_shift";
+add_proto qw/void cdef_filter_block/, "uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int coeff_shift";
-add_proto qw/void copy_rect8_8bit_to_16bit/, "uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h";
-add_proto qw/void copy_rect8_16bit_to_16bit/, "uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h";
+add_proto qw/void cdef_copy_rect8_8bit_to_16bit/, "uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h";
+add_proto qw/void cdef_copy_rect8_16bit_to_16bit/, "uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h";
# VS compiling for 32 bit targets does not support vector types in
# structs as arguments, which makes the v256 type of the intrinsics
@@ -288,27 +375,32 @@ add_proto qw/void copy_rect8_16bit_to_16bit/, "uint16_t *dst, int dstride, const
if ($opts{config} !~ /libs-x86-win32-vs.*/) {
specialize qw/cdef_find_dir sse2 ssse3 sse4_1 avx2 neon/;
specialize qw/cdef_filter_block sse2 ssse3 sse4_1 avx2 neon/;
- specialize qw/copy_rect8_8bit_to_16bit sse2 ssse3 sse4_1 avx2 neon/;
- specialize qw/copy_rect8_16bit_to_16bit sse2 ssse3 sse4_1 avx2 neon/;
+ specialize qw/cdef_copy_rect8_8bit_to_16bit sse2 ssse3 sse4_1 avx2 neon/;
+ specialize qw/cdef_copy_rect8_16bit_to_16bit sse2 ssse3 sse4_1 avx2 neon/;
}
# WARPED_MOTION / GLOBAL_MOTION functions
add_proto qw/void av1_warp_affine/, "const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta";
-specialize qw/av1_warp_affine sse4_1 neon/;
+specialize qw/av1_warp_affine sse4_1 avx2 neon/;
+
+if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
+ add_proto qw/void av1_highbd_warp_affine/, "const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta";
+ specialize qw/av1_highbd_warp_affine sse4_1/;
+}
-add_proto qw/void av1_highbd_warp_affine/, "const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta";
-specialize qw/av1_highbd_warp_affine sse4_1/;
+add_proto qw/int64_t av1_calc_frame_error/, "const uint8_t *const ref, int stride, const uint8_t *const dst, int p_width, int p_height, int p_stride";
+specialize qw/av1_calc_frame_error sse2 avx2/;
if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
- add_proto qw/double compute_cross_correlation/, "unsigned char *im1, int stride1, int x1, int y1, unsigned char *im2, int stride2, int x2, int y2";
- specialize qw/compute_cross_correlation sse4_1/;
+ add_proto qw/double av1_compute_cross_correlation/, "unsigned char *im1, int stride1, int x1, int y1, unsigned char *im2, int stride2, int x2, int y2";
+ specialize qw/av1_compute_cross_correlation sse4_1 avx2/;
}
# LOOP_RESTORATION functions
-add_proto qw/void apply_selfguided_restoration/, "const uint8_t *dat, int width, int height, int stride, int eps, const int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf, int bit_depth, int highbd";
-specialize qw/apply_selfguided_restoration sse4_1 avx2 neon/;
+add_proto qw/void av1_apply_selfguided_restoration/, "const uint8_t *dat, int width, int height, int stride, int eps, const int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf, int bit_depth, int highbd";
+specialize qw/av1_apply_selfguided_restoration sse4_1 avx2 neon/;
add_proto qw/int av1_selfguided_restoration/, "const uint8_t *dgd8, int width, int height,
int dgd_stride, int32_t *flt0, int32_t *flt1, int flt_stride,
@@ -317,44 +409,48 @@ specialize qw/av1_selfguided_restoration sse4_1 avx2 neon/;
# CONVOLVE_ROUND/COMPOUND_ROUND functions
-add_proto qw/void av1_convolve_2d_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
-add_proto qw/void av1_convolve_2d_copy_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
-add_proto qw/void av1_convolve_x_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
-add_proto qw/void av1_convolve_y_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
-add_proto qw/void av1_jnt_convolve_2d/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
-add_proto qw/void av1_jnt_convolve_2d_copy/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
-add_proto qw/void av1_jnt_convolve_x/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
-add_proto qw/void av1_jnt_convolve_y/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
-add_proto qw/void av1_highbd_convolve_2d_copy_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
-add_proto qw/void av1_highbd_convolve_2d_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
-add_proto qw/void av1_highbd_convolve_x_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
-add_proto qw/void av1_highbd_convolve_y_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
-add_proto qw/void av1_highbd_jnt_convolve_2d/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
-add_proto qw/void av1_highbd_jnt_convolve_x/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
-add_proto qw/void av1_highbd_jnt_convolve_y/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
-add_proto qw/void av1_highbd_jnt_convolve_2d_copy/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
-
- add_proto qw/void av1_convolve_2d_scale/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params";
- add_proto qw/void av1_highbd_convolve_2d_scale/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd";
+add_proto qw/void av1_convolve_2d_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params";
+add_proto qw/void av1_convolve_2d_copy_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params";
+add_proto qw/void av1_convolve_x_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params";
+add_proto qw/void av1_convolve_y_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params";
+add_proto qw/void av1_dist_wtd_convolve_2d/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params";
+add_proto qw/void av1_dist_wtd_convolve_2d_copy/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params";
+add_proto qw/void av1_dist_wtd_convolve_x/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params";
+add_proto qw/void av1_dist_wtd_convolve_y/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params";
+if(aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
+ add_proto qw/void av1_highbd_convolve_2d_copy_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd";
+ add_proto qw/void av1_highbd_convolve_2d_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd";
+ add_proto qw/void av1_highbd_convolve_x_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd";
+ add_proto qw/void av1_highbd_convolve_y_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd";
+ add_proto qw/void av1_highbd_dist_wtd_convolve_2d/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd";
+ add_proto qw/void av1_highbd_dist_wtd_convolve_x/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd";
+ add_proto qw/void av1_highbd_dist_wtd_convolve_y/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd";
+ add_proto qw/void av1_highbd_dist_wtd_convolve_2d_copy/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd";
+ add_proto qw/void av1_highbd_convolve_2d_scale/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_qn, const int y_step_qn, ConvolveParams *conv_params, int bd";
+}
+
+ add_proto qw/void av1_convolve_2d_scale/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_qn, const int y_step_qn, ConvolveParams *conv_params";
specialize qw/av1_convolve_2d_sr sse2 avx2 neon/;
specialize qw/av1_convolve_2d_copy_sr sse2 avx2 neon/;
specialize qw/av1_convolve_x_sr sse2 avx2 neon/;
specialize qw/av1_convolve_y_sr sse2 avx2 neon/;
specialize qw/av1_convolve_2d_scale sse4_1/;
- specialize qw/av1_jnt_convolve_2d ssse3 avx2 neon/;
- specialize qw/av1_jnt_convolve_2d_copy sse2 avx2 neon/;
- specialize qw/av1_jnt_convolve_x sse2 avx2 neon/;
- specialize qw/av1_jnt_convolve_y sse2 avx2 neon/;
- specialize qw/av1_highbd_convolve_2d_copy_sr sse2 avx2/;
- specialize qw/av1_highbd_convolve_2d_sr ssse3 avx2/;
- specialize qw/av1_highbd_convolve_x_sr ssse3 avx2/;
- specialize qw/av1_highbd_convolve_y_sr ssse3 avx2/;
- specialize qw/av1_highbd_convolve_2d_scale sse4_1/;
- specialize qw/av1_highbd_jnt_convolve_2d sse4_1 avx2/;
- specialize qw/av1_highbd_jnt_convolve_x sse4_1 avx2/;
- specialize qw/av1_highbd_jnt_convolve_y sse4_1 avx2/;
- specialize qw/av1_highbd_jnt_convolve_2d_copy sse4_1 avx2/;
+ specialize qw/av1_dist_wtd_convolve_2d sse2 ssse3 avx2 neon/;
+ specialize qw/av1_dist_wtd_convolve_2d_copy sse2 avx2 neon/;
+ specialize qw/av1_dist_wtd_convolve_x sse2 avx2 neon/;
+ specialize qw/av1_dist_wtd_convolve_y sse2 avx2 neon/;
+ if(aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
+ specialize qw/av1_highbd_dist_wtd_convolve_2d sse4_1 avx2/;
+ specialize qw/av1_highbd_dist_wtd_convolve_x sse4_1 avx2/;
+ specialize qw/av1_highbd_dist_wtd_convolve_y sse4_1 avx2/;
+ specialize qw/av1_highbd_dist_wtd_convolve_2d_copy sse4_1 avx2/;
+ specialize qw/av1_highbd_convolve_2d_copy_sr sse2 avx2/;
+ specialize qw/av1_highbd_convolve_2d_sr ssse3 avx2/;
+ specialize qw/av1_highbd_convolve_x_sr ssse3 avx2/;
+ specialize qw/av1_highbd_convolve_y_sr ssse3 avx2/;
+ specialize qw/av1_highbd_convolve_2d_scale sse4_1/;
+ }
# INTRA_EDGE functions
add_proto qw/void av1_filter_intra_edge/, "uint8_t *p, int sz, int strength";
@@ -368,8 +464,8 @@ add_proto qw/void av1_upsample_intra_edge_high/, "uint16_t *p, int sz, int bd";
specialize qw/av1_upsample_intra_edge_high sse4_1/;
# CFL
-add_proto qw/cfl_subtract_average_fn get_subtract_average_fn/, "TX_SIZE tx_size";
-specialize qw/get_subtract_average_fn sse2 avx2 neon vsx/;
+add_proto qw/cfl_subtract_average_fn cfl_get_subtract_average_fn/, "TX_SIZE tx_size";
+specialize qw/cfl_get_subtract_average_fn sse2 avx2 neon vsx/;
add_proto qw/cfl_subsample_lbd_fn cfl_get_luma_subsampling_420_lbd/, "TX_SIZE tx_size";
specialize qw/cfl_get_luma_subsampling_420_lbd ssse3 avx2 neon/;
@@ -380,19 +476,21 @@ specialize qw/cfl_get_luma_subsampling_422_lbd ssse3 avx2 neon/;
add_proto qw/cfl_subsample_lbd_fn cfl_get_luma_subsampling_444_lbd/, "TX_SIZE tx_size";
specialize qw/cfl_get_luma_subsampling_444_lbd ssse3 avx2 neon/;
-add_proto qw/cfl_subsample_hbd_fn cfl_get_luma_subsampling_420_hbd/, "TX_SIZE tx_size";
-specialize qw/cfl_get_luma_subsampling_420_hbd ssse3 avx2 neon/;
+if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
+ add_proto qw/cfl_subsample_hbd_fn cfl_get_luma_subsampling_420_hbd/, "TX_SIZE tx_size";
+ specialize qw/cfl_get_luma_subsampling_420_hbd ssse3 avx2 neon/;
-add_proto qw/cfl_subsample_hbd_fn cfl_get_luma_subsampling_422_hbd/, "TX_SIZE tx_size";
-specialize qw/cfl_get_luma_subsampling_422_hbd ssse3 avx2 neon/;
+ add_proto qw/cfl_subsample_hbd_fn cfl_get_luma_subsampling_422_hbd/, "TX_SIZE tx_size";
+ specialize qw/cfl_get_luma_subsampling_422_hbd ssse3 avx2 neon/;
-add_proto qw/cfl_subsample_hbd_fn cfl_get_luma_subsampling_444_hbd/, "TX_SIZE tx_size";
-specialize qw/cfl_get_luma_subsampling_444_hbd ssse3 avx2 neon/;
+ add_proto qw/cfl_subsample_hbd_fn cfl_get_luma_subsampling_444_hbd/, "TX_SIZE tx_size";
+ specialize qw/cfl_get_luma_subsampling_444_hbd ssse3 avx2 neon/;
-add_proto qw/cfl_predict_lbd_fn get_predict_lbd_fn/, "TX_SIZE tx_size";
-specialize qw/get_predict_lbd_fn ssse3 avx2 neon/;
+ add_proto qw/cfl_predict_hbd_fn cfl_get_predict_hbd_fn/, "TX_SIZE tx_size";
+ specialize qw/cfl_get_predict_hbd_fn ssse3 avx2 neon/;
+}
-add_proto qw/cfl_predict_hbd_fn get_predict_hbd_fn/, "TX_SIZE tx_size";
-specialize qw/get_predict_hbd_fn ssse3 avx2 neon/;
+add_proto qw/cfl_predict_lbd_fn cfl_get_predict_lbd_fn/, "TX_SIZE tx_size";
+specialize qw/cfl_get_predict_lbd_fn ssse3 avx2 neon/;
1;
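
Each add_proto line above registers a C prototype with the rtcd generator, and each specialize line names the SIMD variants that may replace the plain C implementation at run time; the generator emits per-ISA declarations plus a setup routine that installs the best available function pointer. A minimal, self-contained sketch of that dispatch pattern follows (the names and the toy signature are illustrative only, not generated output):

    #include <stdio.h>

    typedef void (*convolve_fn)(const unsigned char *src, unsigned char *dst,
                                int w, int h);

    static void convolve_c(const unsigned char *src, unsigned char *dst,
                           int w, int h) {
      (void)src; (void)dst; (void)w; (void)h;
      puts("C path");
    }

    static void convolve_avx2(const unsigned char *src, unsigned char *dst,
                              int w, int h) {
      (void)src; (void)dst; (void)w; (void)h;
      puts("AVX2 path");
    }

    /* Stand-in for a generated function pointer such as av1_convolve_2d_sr. */
    static convolve_fn av1_convolve_2d_sr_ptr;

    static void setup_rtcd(int has_avx2) {
      av1_convolve_2d_sr_ptr = convolve_c;        /* C fallback always exists */
      if (has_avx2) av1_convolve_2d_sr_ptr = convolve_avx2;
    }

    int main(void) {
      setup_rtcd(/*has_avx2=*/1);
      av1_convolve_2d_sr_ptr(NULL, NULL, 64, 64); /* prints "AVX2 path" */
      return 0;
    }
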
diff --git a/media/libaom/src/av1/common/av1_txfm.c b/media/libaom/src/av1/common/av1_txfm.c
index bb70eab70..ac43402f4 100644
--- a/media/libaom/src/av1/common/av1_txfm.c
+++ b/media/libaom/src/av1/common/av1_txfm.c
@@ -10,10 +10,11 @@
*/
#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
#include "av1/common/av1_txfm.h"
-// av1_cospi_arr[i][j] = (int)round(cos(M_PI*j/128) * (1<<(cos_bit_min+i)));
+// av1_cospi_arr[i][j] = (int)round(cos(PI*j/128) * (1<<(cos_bit_min+i)));
const int32_t av1_cospi_arr_data[7][64] = {
{ 1024, 1024, 1023, 1021, 1019, 1016, 1013, 1009, 1004, 999, 993, 987, 980,
972, 964, 955, 946, 936, 926, 915, 903, 891, 878, 865, 851, 837,
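
The renamed comment still describes how the table is built. As a quick sanity check, the first few entries of the 10-bit row can be reproduced directly from the stated formula (assuming cos_bit_min is 10, as declared in av1_txfm.h):

    #include <assert.h>
    #include <math.h>
    #include <stdio.h>

    int main(void) {
      const double pi = 3.141592653589793;
      /* First entries of av1_cospi_arr_data[0], copied from the table above. */
      const int expected[4] = { 1024, 1024, 1023, 1021 };
      for (int j = 0; j < 4; j++) {
        const int v = (int)round(cos(pi * j / 128) * (1 << 10));
        assert(v == expected[j]);
      }
      printf("first entries of av1_cospi_arr_data[0] match the formula\n");
      return 0;
    }
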
diff --git a/media/libaom/src/av1/common/av1_txfm.h b/media/libaom/src/av1/common/av1_txfm.h
index 59d64ca4a..20049b680 100644
--- a/media/libaom/src/av1/common/av1_txfm.h
+++ b/media/libaom/src/av1/common/av1_txfm.h
@@ -59,7 +59,9 @@ static INLINE int32_t range_check_value(int32_t value, int8_t bit) {
const int64_t min_value = -(1LL << (bit - 1));
if (value < min_value || value > max_value) {
fprintf(stderr, "coeff out of bit range, value: %d bit %d\n", value, bit);
+#if !CONFIG_AV1_ENCODER
assert(0);
+#endif
}
#endif // CONFIG_COEFFICIENT_RANGE_CHECKING
#if DO_RANGE_CHECK_CLAMP
@@ -110,7 +112,7 @@ typedef void (*TxfmFunc)(const int32_t *input, int32_t *output, int8_t cos_bit,
typedef void (*FwdTxfm2dFunc)(const int16_t *input, int32_t *output, int stride,
TX_TYPE tx_type, int bd);
-typedef enum TXFM_TYPE {
+enum {
TXFM_TYPE_DCT4,
TXFM_TYPE_DCT8,
TXFM_TYPE_DCT16,
@@ -125,7 +127,7 @@ typedef enum TXFM_TYPE {
TXFM_TYPE_IDENTITY32,
TXFM_TYPES,
TXFM_TYPE_INVALID,
-} TXFM_TYPE;
+} UENUM1BYTE(TXFM_TYPE);
typedef struct TXFM_2D_FLIP_CFG {
TX_SIZE tx_size;
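
Replacing the packed typedef'd enum with enum { ... } UENUM1BYTE(TXFM_TYPE) leaves the enumerators in an anonymous enum and typedefs the named type to uint8_t, so members of that type occupy one byte without relying on ATTRIBUTE_PACKED. A sketch of the pattern, using a shortened stand-in for the macro defined in av1/common/enums.h:

    #include <assert.h>
    #include <stdint.h>

    /* Assumed equivalent of the libaom macro; see av1/common/enums.h. */
    #define UENUM1BYTE(enumvar) \
      ;                         \
      typedef uint8_t enumvar

    enum { DEMO_TYPE_A, DEMO_TYPE_B } UENUM1BYTE(DEMO_TYPE);

    int main(void) {
      DEMO_TYPE t = DEMO_TYPE_B;
      assert(sizeof(t) == 1); /* one byte, regardless of the enum's natural size */
      return (int)t - 1;
    }
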
diff --git a/media/libaom/src/av1/common/blockd.c b/media/libaom/src/av1/common/blockd.c
index 2e796b656..00725ea2d 100644
--- a/media/libaom/src/av1/common/blockd.c
+++ b/media/libaom/src/av1/common/blockd.c
@@ -13,8 +13,8 @@
#include "aom_ports/system_state.h"
+#include "av1/common/av1_common_int.h"
#include "av1/common/blockd.h"
-#include "av1/common/onyxc_int.h"
PREDICTION_MODE av1_left_block_mode(const MB_MODE_INFO *left_mi) {
if (!left_mi) return DC_PRED;
@@ -28,11 +28,12 @@ PREDICTION_MODE av1_above_block_mode(const MB_MODE_INFO *above_mi) {
return above_mi->mode;
}
-void av1_set_contexts(const MACROBLOCKD *xd, struct macroblockd_plane *pd,
- int plane, BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
- int has_eob, int aoff, int loff) {
- ENTROPY_CONTEXT *const a = pd->above_context + aoff;
- ENTROPY_CONTEXT *const l = pd->left_context + loff;
+void av1_set_entropy_contexts(const MACROBLOCKD *xd,
+ struct macroblockd_plane *pd, int plane,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+ int has_eob, int aoff, int loff) {
+ ENTROPY_CONTEXT *const a = pd->above_entropy_context + aoff;
+ ENTROPY_CONTEXT *const l = pd->left_entropy_context + loff;
const int txs_wide = tx_size_wide_unit[tx_size];
const int txs_high = tx_size_high_unit[tx_size];
@@ -56,23 +57,18 @@ void av1_set_contexts(const MACROBLOCKD *xd, struct macroblockd_plane *pd,
memset(l, has_eob, sizeof(*l) * txs_high);
}
}
-void av1_reset_skip_context(MACROBLOCKD *xd, int mi_row, int mi_col,
- BLOCK_SIZE bsize, const int num_planes) {
- int i;
- int nplanes;
- int chroma_ref;
- chroma_ref =
- is_chroma_reference(mi_row, mi_col, bsize, xd->plane[1].subsampling_x,
- xd->plane[1].subsampling_y);
- nplanes = 1 + (num_planes - 1) * chroma_ref;
- for (i = 0; i < nplanes; i++) {
+void av1_reset_entropy_context(MACROBLOCKD *xd, BLOCK_SIZE bsize,
+ const int num_planes) {
+ assert(bsize < BLOCK_SIZES_ALL);
+ const int nplanes = 1 + (num_planes - 1) * xd->is_chroma_ref;
+ for (int i = 0; i < nplanes; i++) {
struct macroblockd_plane *const pd = &xd->plane[i];
const BLOCK_SIZE plane_bsize =
get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
- const int txs_wide = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
- const int txs_high = block_size_high[plane_bsize] >> tx_size_high_log2[0];
- memset(pd->above_context, 0, sizeof(ENTROPY_CONTEXT) * txs_wide);
- memset(pd->left_context, 0, sizeof(ENTROPY_CONTEXT) * txs_high);
+ const int txs_wide = mi_size_wide[plane_bsize];
+ const int txs_high = mi_size_high[plane_bsize];
+ memset(pd->above_entropy_context, 0, sizeof(ENTROPY_CONTEXT) * txs_wide);
+ memset(pd->left_entropy_context, 0, sizeof(ENTROPY_CONTEXT) * txs_high);
}
}
@@ -104,37 +100,3 @@ void av1_setup_block_planes(MACROBLOCKD *xd, int ss_x, int ss_y,
xd->plane[i].subsampling_y = 1;
}
}
-
-const int16_t dr_intra_derivative[90] = {
- // More evenly spread out angles and limited to 10-bit
- // Values that are 0 will never be used
- // Approx angle
- 0, 0, 0, //
- 1023, 0, 0, // 3, ...
- 547, 0, 0, // 6, ...
- 372, 0, 0, 0, 0, // 9, ...
- 273, 0, 0, // 14, ...
- 215, 0, 0, // 17, ...
- 178, 0, 0, // 20, ...
- 151, 0, 0, // 23, ... (113 & 203 are base angles)
- 132, 0, 0, // 26, ...
- 116, 0, 0, // 29, ...
- 102, 0, 0, 0, // 32, ...
- 90, 0, 0, // 36, ...
- 80, 0, 0, // 39, ...
- 71, 0, 0, // 42, ...
- 64, 0, 0, // 45, ... (45 & 135 are base angles)
- 57, 0, 0, // 48, ...
- 51, 0, 0, // 51, ...
- 45, 0, 0, 0, // 54, ...
- 40, 0, 0, // 58, ...
- 35, 0, 0, // 61, ...
- 31, 0, 0, // 64, ...
- 27, 0, 0, // 67, ... (67 & 157 are base angles)
- 23, 0, 0, // 70, ...
- 19, 0, 0, // 73, ...
- 15, 0, 0, 0, 0, // 76, ...
- 11, 0, 0, // 81, ...
- 7, 0, 0, // 84, ...
- 3, 0, 0, // 87, ...
-};
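
The rewritten av1_reset_entropy_context() above sizes its memsets with mi_size_wide/mi_size_high, which give the plane block dimensions directly in 4-pixel MI units (the old shifts by tx_size_wide_log2[0] and tx_size_high_log2[0] computed the same counts). A minimal stand-alone sketch of the per-plane reset, assuming an 8x4 chroma plane block (2 MI units wide, 1 high):

    #include <assert.h>
    #include <string.h>

    typedef char ENTROPY_CONTEXT;

    /* One context byte per 4-pixel column above the block and one per
     * 4-pixel row to its left, all cleared when the block is reset. */
    static void reset_plane(ENTROPY_CONTEXT *above, ENTROPY_CONTEXT *left,
                            int mi_wide, int mi_high) {
      memset(above, 0, sizeof(*above) * mi_wide);
      memset(left, 0, sizeof(*left) * mi_high);
    }

    int main(void) {
      ENTROPY_CONTEXT above[2] = { 5, 5 }, left[1] = { 5 };
      reset_plane(above, left, /*mi_wide=*/2, /*mi_high=*/1);
      assert(above[0] == 0 && above[1] == 0 && left[0] == 0);
      return 0;
    }
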
diff --git a/media/libaom/src/av1/common/blockd.h b/media/libaom/src/av1/common/blockd.h
index a2311c1b0..47597bc83 100644
--- a/media/libaom/src/av1/common/blockd.h
+++ b/media/libaom/src/av1/common/blockd.h
@@ -37,20 +37,22 @@ extern "C" {
#define MAX_DIFFWTD_MASK_BITS 1
+#define INTERINTRA_WEDGE_SIGN 0
+
// DIFFWTD_MASK_TYPES should not surpass 1 << MAX_DIFFWTD_MASK_BITS
-typedef enum ATTRIBUTE_PACKED {
+enum {
DIFFWTD_38 = 0,
DIFFWTD_38_INV,
DIFFWTD_MASK_TYPES,
-} DIFFWTD_MASK_TYPE;
+} UENUM1BYTE(DIFFWTD_MASK_TYPE);
-typedef enum ATTRIBUTE_PACKED {
+enum {
KEY_FRAME = 0,
INTER_FRAME = 1,
INTRA_ONLY_FRAME = 2, // replaces intra-only
S_FRAME = 3,
FRAME_TYPES,
-} FRAME_TYPE;
+} UENUM1BYTE(FRAME_TYPE);
static INLINE int is_comp_ref_allowed(BLOCK_SIZE bsize) {
return AOMMIN(block_size_wide[bsize], block_size_high[bsize]) >= 8;
@@ -73,24 +75,24 @@ static INLINE int is_inter_compound_mode(PREDICTION_MODE mode) {
}
static INLINE PREDICTION_MODE compound_ref0_mode(PREDICTION_MODE mode) {
- static PREDICTION_MODE lut[] = {
- MB_MODE_COUNT, // DC_PRED
- MB_MODE_COUNT, // V_PRED
- MB_MODE_COUNT, // H_PRED
- MB_MODE_COUNT, // D45_PRED
- MB_MODE_COUNT, // D135_PRED
- MB_MODE_COUNT, // D113_PRED
- MB_MODE_COUNT, // D157_PRED
- MB_MODE_COUNT, // D203_PRED
- MB_MODE_COUNT, // D67_PRED
- MB_MODE_COUNT, // SMOOTH_PRED
- MB_MODE_COUNT, // SMOOTH_V_PRED
- MB_MODE_COUNT, // SMOOTH_H_PRED
- MB_MODE_COUNT, // PAETH_PRED
- MB_MODE_COUNT, // NEARESTMV
- MB_MODE_COUNT, // NEARMV
- MB_MODE_COUNT, // GLOBALMV
- MB_MODE_COUNT, // NEWMV
+ static const PREDICTION_MODE lut[] = {
+ DC_PRED, // DC_PRED
+ V_PRED, // V_PRED
+ H_PRED, // H_PRED
+ D45_PRED, // D45_PRED
+ D135_PRED, // D135_PRED
+ D113_PRED, // D113_PRED
+ D157_PRED, // D157_PRED
+ D203_PRED, // D203_PRED
+ D67_PRED, // D67_PRED
+ SMOOTH_PRED, // SMOOTH_PRED
+ SMOOTH_V_PRED, // SMOOTH_V_PRED
+ SMOOTH_H_PRED, // SMOOTH_H_PRED
+ PAETH_PRED, // PAETH_PRED
+ NEARESTMV, // NEARESTMV
+ NEARMV, // NEARMV
+ GLOBALMV, // GLOBALMV
+ NEWMV, // NEWMV
NEARESTMV, // NEAREST_NEARESTMV
NEARMV, // NEAR_NEARMV
NEARESTMV, // NEAREST_NEWMV
@@ -101,12 +103,12 @@ static INLINE PREDICTION_MODE compound_ref0_mode(PREDICTION_MODE mode) {
NEWMV, // NEW_NEWMV
};
assert(NELEMENTS(lut) == MB_MODE_COUNT);
- assert(is_inter_compound_mode(mode));
+ assert(is_inter_compound_mode(mode) || is_inter_singleref_mode(mode));
return lut[mode];
}
static INLINE PREDICTION_MODE compound_ref1_mode(PREDICTION_MODE mode) {
- static PREDICTION_MODE lut[] = {
+ static const PREDICTION_MODE lut[] = {
MB_MODE_COUNT, // DC_PRED
MB_MODE_COUNT, // V_PRED
MB_MODE_COUNT, // H_PRED
@@ -156,18 +158,16 @@ static INLINE int is_masked_compound_type(COMPOUND_TYPE type) {
modes for the Y blocks to the left and above us; for interframes, there
is a single probability table. */
-typedef int8_t MV_REFERENCE_FRAME;
-
typedef struct {
- // Number of base colors for Y (0) and UV (1)
- uint8_t palette_size[2];
// Value of base colors for Y, U, and V
uint16_t palette_colors[3 * PALETTE_MAX_SIZE];
+ // Number of base colors for Y (0) and UV (1)
+ uint8_t palette_size[2];
} PALETTE_MODE_INFO;
typedef struct {
- uint8_t use_filter_intra;
FILTER_INTRA_MODE filter_intra_mode;
+ uint8_t use_filter_intra;
} FILTER_INTRA_MODE_INFO;
static const PREDICTION_MODE fimode_to_intradir[FILTER_INTRA_MODES] = {
@@ -189,23 +189,24 @@ typedef struct RD_STATS {
int64_t rdcost;
int64_t sse;
int skip; // sse should equal dist when skip == 1
- int64_t ref_rdcost;
int zero_rate;
- uint8_t invalid_rate;
#if CONFIG_RD_DEBUG
int txb_coeff_cost[MAX_MB_PLANE];
- int txb_coeff_cost_map[MAX_MB_PLANE][TXB_COEFF_COST_MAP_SIZE]
- [TXB_COEFF_COST_MAP_SIZE];
+ // TODO(jingning): Temporary solution to silence stack over-size warning
+ // in handle_inter_mode. This should be fixed after rate-distortion
+ // optimization refactoring.
+ int16_t txb_coeff_cost_map[MAX_MB_PLANE][TXB_COEFF_COST_MAP_SIZE]
+ [TXB_COEFF_COST_MAP_SIZE];
#endif // CONFIG_RD_DEBUG
} RD_STATS;
// This struct is used to group function args that are commonly
// sent together in functions related to interinter compound modes
typedef struct {
- int wedge_index;
- int wedge_sign;
- DIFFWTD_MASK_TYPE mask_type;
uint8_t *seg_mask;
+ int8_t wedge_index;
+ int8_t wedge_sign;
+ DIFFWTD_MASK_TYPE mask_type;
COMPOUND_TYPE type;
} INTERINTER_COMPOUND_DATA;
@@ -213,66 +214,60 @@ typedef struct {
#define TXK_TYPE_BUF_LEN 64
// This structure now relates to 4x4 block regions.
typedef struct MB_MODE_INFO {
+ // interinter members
+ INTERINTER_COMPOUND_DATA interinter_comp;
+ WarpedMotionParams wm_params;
+ int_mv mv[2];
+ int current_qindex;
+ // Only for INTER blocks
+ int_interpfilters interp_filters;
+ // TODO(debargha): Consolidate these flags
+#if CONFIG_RD_DEBUG
+ RD_STATS rd_stats;
+ int mi_row;
+ int mi_col;
+#endif
+#if CONFIG_INSPECTION
+ int16_t tx_skip[TXK_TYPE_BUF_LEN];
+#endif
+ PALETTE_MODE_INFO palette_mode_info;
// Common for both INTER and INTRA blocks
BLOCK_SIZE sb_type;
PREDICTION_MODE mode;
- TX_SIZE tx_size;
- uint8_t inter_tx_size[INTER_TX_SIZE_BUF_LEN];
- int8_t skip;
- int8_t skip_mode;
- int8_t segment_id;
- int8_t seg_id_predicted; // valid only when temporal_update is enabled
-
// Only for INTRA blocks
UV_PREDICTION_MODE uv_mode;
-
- PALETTE_MODE_INFO palette_mode_info;
- uint8_t use_intrabc;
-
- // Only for INTER blocks
- InterpFilters interp_filters;
- MV_REFERENCE_FRAME ref_frame[2];
-
- TX_TYPE txk_type[TXK_TYPE_BUF_LEN];
-
- FILTER_INTRA_MODE_INFO filter_intra_mode_info;
-
- // The actual prediction angle is the base angle + (angle_delta * step).
- int8_t angle_delta[PLANE_TYPES];
-
// interintra members
INTERINTRA_MODE interintra_mode;
- // TODO(debargha): Consolidate these flags
- int use_wedge_interintra;
- int interintra_wedge_index;
- int interintra_wedge_sign;
- // interinter members
- INTERINTER_COMPOUND_DATA interinter_comp;
MOTION_MODE motion_mode;
- int overlappable_neighbors[2];
- int_mv mv[2];
- uint8_t ref_mv_idx;
PARTITION_TYPE partition;
+ MV_REFERENCE_FRAME ref_frame[2];
+ FILTER_INTRA_MODE_INFO filter_intra_mode_info;
+ int8_t skip;
+ uint8_t inter_tx_size[INTER_TX_SIZE_BUF_LEN];
+ TX_SIZE tx_size;
+ int8_t delta_lf_from_base;
+ int8_t delta_lf[FRAME_LF_COUNT];
+ int8_t interintra_wedge_index;
+ // The actual prediction angle is the base angle + (angle_delta * step).
+ int8_t angle_delta[PLANE_TYPES];
/* deringing gain *per-superblock* */
- int8_t cdef_strength;
- int current_qindex;
- int delta_lf_from_base;
- int delta_lf[FRAME_LF_COUNT];
-#if CONFIG_RD_DEBUG
- RD_STATS rd_stats;
- int mi_row;
- int mi_col;
-#endif
- int num_proj_ref;
- WarpedMotionParams wm_params;
-
- // Index of the alpha Cb and alpha Cr combination
- int cfl_alpha_idx;
// Joint sign of alpha Cb and alpha Cr
- int cfl_alpha_signs;
-
- int compound_idx;
- int comp_group_idx;
+ int8_t cfl_alpha_signs;
+ // Index of the alpha Cb and alpha Cr combination
+ uint8_t cfl_alpha_idx;
+ uint8_t num_proj_ref;
+ uint8_t overlappable_neighbors[2];
+ // If comp_group_idx=0, indicates whether dist_wtd_comp(0) or avg_comp(1) is used.
+ uint8_t compound_idx;
+ uint8_t use_wedge_interintra : 1;
+ uint8_t segment_id : 3;
+ uint8_t seg_id_predicted : 1; // valid only when temporal_update is enabled
+ uint8_t skip_mode : 1;
+ uint8_t use_intrabc : 1;
+ uint8_t ref_mv_idx : 2;
+ // Indicates whether masked compound is used (1) or not (0).
+ uint8_t comp_group_idx : 1;
+ int8_t cdef_strength : 4;
} MB_MODE_INFO;
static INLINE int is_intrabc_block(const MB_MODE_INFO *mbmi) {
@@ -366,13 +361,13 @@ static INLINE void mi_to_pixel_loc(int *pixel_c, int *pixel_r, int mi_col,
int mi_row, int tx_blk_col, int tx_blk_row,
int subsampling_x, int subsampling_y) {
*pixel_c = ((mi_col >> subsampling_x) << MI_SIZE_LOG2) +
- (tx_blk_col << tx_size_wide_log2[0]);
+ (tx_blk_col << MI_SIZE_LOG2);
*pixel_r = ((mi_row >> subsampling_y) << MI_SIZE_LOG2) +
- (tx_blk_row << tx_size_high_log2[0]);
+ (tx_blk_row << MI_SIZE_LOG2);
}
#endif
-enum ATTRIBUTE_PACKED mv_precision { MV_PRECISION_Q3, MV_PRECISION_Q4 };
+enum { MV_PRECISION_Q3, MV_PRECISION_Q4 } UENUM1BYTE(mv_precision);
struct buf_2d {
uint8_t *buf;
@@ -403,10 +398,10 @@ typedef struct macroblockd_plane {
int subsampling_y;
struct buf_2d dst;
struct buf_2d pre[2];
- ENTROPY_CONTEXT *above_context;
- ENTROPY_CONTEXT *left_context;
+ ENTROPY_CONTEXT *above_entropy_context;
+ ENTROPY_CONTEXT *left_entropy_context;
- // The dequantizers below are true dequntizers used only in the
+ // The dequantizers below are true dequantizers used only in the
// dequantization process. They have the same coefficient
// shift/scale as TX.
int16_t seg_dequant_QTX[MAX_SEGMENTS][2];
@@ -417,23 +412,9 @@ typedef struct macroblockd_plane {
qm_val_t *seg_iqmatrix[MAX_SEGMENTS][TX_SIZES_ALL];
qm_val_t *seg_qmatrix[MAX_SEGMENTS][TX_SIZES_ALL];
-
- // the 'dequantizers' below are not literal dequantizer values.
- // They're used by encoder RDO to generate ad-hoc lambda values.
- // They use a hardwired Q3 coeff shift and do not necessarily match
- // the TX scale in use.
- const int16_t *dequant_Q3;
} MACROBLOCKD_PLANE;
-#define BLOCK_OFFSET(x, i) \
- ((x) + (i) * (1 << (tx_size_wide_log2[0] + tx_size_high_log2[0])))
-
-typedef struct RefBuffer {
- int idx; // frame buf idx
- int map_idx; // frame map idx
- YV12_BUFFER_CONFIG *buf;
- struct scale_factors sf;
-} RefBuffer;
+#define BLOCK_OFFSET(i) ((i) << 4)
typedef struct {
DECLARE_ALIGNED(16, InterpKernel, vfilter);
@@ -478,74 +459,148 @@ typedef struct cfl_ctx {
// Chroma subsampling
int subsampling_x, subsampling_y;
- int mi_row, mi_col;
-
// Whether the reconstructed luma pixels need to be stored
int store_y;
#if CONFIG_DEBUG
int rate;
#endif // CONFIG_DEBUG
-
- int is_chroma_reference;
} CFL_CTX;
-typedef struct jnt_comp_params {
- int use_jnt_comp_avg;
+typedef struct dist_wtd_comp_params {
+ int use_dist_wtd_comp_avg;
int fwd_offset;
int bck_offset;
-} JNT_COMP_PARAMS;
+} DIST_WTD_COMP_PARAMS;
+
+struct scale_factors;
// Most/all of the pointers are mere pointers to actual arrays allocated
// elsewhere. This is mostly for coding convenience.
typedef struct macroblockd {
+ // Row and column position of current macroblock in mi units.
+ int mi_row;
+ int mi_col;
+ // Same as cm->mi_params.mi_stride, copied here for convenience.
+ int mi_stride;
+
+ // True if current block transmits chroma information.
+ // More detail:
+ // Smallest supported block size for both luma and chroma plane is 4x4. Hence,
+ // in case of subsampled chroma plane (YUV 4:2:0 or YUV 4:2:2), multiple luma
+ // blocks smaller than 8x8 may be combined into one chroma block.
+ // For example, for YUV 4:2:0, let's say an 8x8 area is split into four 4x4
+ // luma blocks. Then, a single chroma block of size 4x4 will cover the area of
+ // these four luma blocks. This is implemented in bitstream as follows:
+ // - There are four MB_MODE_INFO structs for the four luma blocks.
+ // - First 3 MB_MODE_INFO have is_chroma_ref = false, and so do not transmit
+ // any information for chroma planes.
+ // - Last block will have is_chroma_ref = true and transmits chroma
+ // information for the 4x4 chroma block that covers whole 8x8 area covered by
+ // four luma blocks.
+ // Similar logic applies for chroma blocks that cover 2 or 3 luma blocks.
+ bool is_chroma_ref;
+
struct macroblockd_plane plane[MAX_MB_PLANE];
TileInfo tile;
- int mi_stride;
-
+ // Appropriate offset inside cm->mi_params.mi_grid_base based on current
+ // mi_row and mi_col.
MB_MODE_INFO **mi;
+
+ // True if 4x4 block above the current block is available.
+ bool up_available;
+ // True if 4x4 block to the left of the current block is available.
+ bool left_available;
+ // True if the above chroma reference block is available.
+ bool chroma_up_available;
+ // True if the left chroma reference block is available.
+ bool chroma_left_available;
+
+ // MB_MODE_INFO for 4x4 block to the left of the current block, if
+ // left_available == true; otherwise NULL.
MB_MODE_INFO *left_mbmi;
+ // MB_MODE_INFO for 4x4 block above the current block, if
+ // up_available == true; otherwise NULL.
MB_MODE_INFO *above_mbmi;
+ // Left chroma reference block if is_chroma_ref == true for the current block
+ // and chroma_left_available == true; otherwise NULL.
+ // See also: the special case logic when current chroma block covers more than
+ // one luma block in set_mi_row_col().
MB_MODE_INFO *chroma_left_mbmi;
+ // Above chroma reference block if is_chroma_ref == true for the current block
+ // and chroma_up_available == true; otherwise NULL.
+ // See also: the special case logic when current chroma block covers more than
+ // one luma block in set_mi_row_col().
MB_MODE_INFO *chroma_above_mbmi;
- int up_available;
- int left_available;
- int chroma_up_available;
- int chroma_left_available;
+ // Appropriate offset based on current 'mi_row' and 'mi_col', inside
+ // 'tx_type_map' in one of 'CommonModeInfoParams', 'PICK_MODE_CONTEXT' or
+ // 'MACROBLOCK' structs.
+ uint8_t *tx_type_map;
+ // Stride for 'tx_type_map'. Note that this may / may not be same as
+ // 'mi_stride', depending on which actual array 'tx_type_map' points to.
+ int tx_type_map_stride;
- /* Distance of MB away from frame edges in subpixels (1/8th pixel) */
+ // Distance of this macroblock from frame edges in 1/8th pixel units.
int mb_to_left_edge;
int mb_to_right_edge;
int mb_to_top_edge;
int mb_to_bottom_edge;
- /* pointers to reference frames */
- const RefBuffer *block_refs[2];
+ // Scale factors for reference frames of the current block.
+ // These are pointers into 'cm->ref_scale_factors'.
+ const struct scale_factors *block_ref_scale_factors[2];
- /* pointer to current frame */
const YV12_BUFFER_CONFIG *cur_buf;
- ENTROPY_CONTEXT *above_context[MAX_MB_PLANE];
- ENTROPY_CONTEXT left_context[MAX_MB_PLANE][MAX_MIB_SIZE];
-
- PARTITION_CONTEXT *above_seg_context;
- PARTITION_CONTEXT left_seg_context[MAX_MIB_SIZE];
-
+ // Entropy contexts for the above blocks.
+ // above_entropy_context[i][j] corresponds to above entropy context for ith
+ // plane and jth mi column of this *frame*, wrt current 'mi_row'.
+ // These are pointers into 'cm->above_contexts.entropy'.
+ ENTROPY_CONTEXT *above_entropy_context[MAX_MB_PLANE];
+ // Entropy contexts for the left blocks.
+ // left_entropy_context[i][j] corresponds to left entropy context for ith
+ // plane and jth mi row of this *superblock*, wrt current 'mi_col'.
+ // Note: These contain actual data, NOT pointers.
+ ENTROPY_CONTEXT left_entropy_context[MAX_MB_PLANE][MAX_MIB_SIZE];
+
+ // Partition contexts for the above blocks.
+ // above_partition_context[i] corresponds to above partition context for ith
+ // mi column of this *frame*, wrt current 'mi_row'.
+ // These are pointers into 'cm->above_contexts.partition'.
+ PARTITION_CONTEXT *above_partition_context;
+ // Partition contexts for the left blocks.
+ // left_partition_context[i] corresponds to left partition context for ith
+ // mi row of this *superblock*, wrt current 'mi_col'.
+ // Note: These contain actual data, NOT pointers.
+ PARTITION_CONTEXT left_partition_context[MAX_MIB_SIZE];
+
+ // Transform contexts for the above blocks.
+ // TODO(urvang): Indexed two different ways from cm->above_contexts.txfm in
+ // code currently. Need to make it consistent / document why.
TXFM_CONTEXT *above_txfm_context;
+ // Transform contexts for the left blocks.
TXFM_CONTEXT *left_txfm_context;
+ // TODO(urvang): 'left_txfm_context' points to 'left_txfm_context_buffer'.
+ // Can we remove this indirection?
TXFM_CONTEXT left_txfm_context_buffer[MAX_MIB_SIZE];
+ // Default values for the two restoration filters for each plane.
+ // These values are used as reference values when writing the bitstream. That
+ // is, we transmit the delta between the actual values in
+ // cm->rst_info[plane].unit_info[unit_idx] and these reference values.
WienerInfo wiener_info[MAX_MB_PLANE];
SgrprojInfo sgrproj_info[MAX_MB_PLANE];
- // block dimension in the unit of mode_info.
- uint8_t n4_w, n4_h;
+ // Block dimensions in MB_MODE_INFO units.
+ uint8_t width;
+ uint8_t height;
uint8_t ref_mv_count[MODE_CTX_REF_FRAMES];
CANDIDATE_MV ref_mv_stack[MODE_CTX_REF_FRAMES][MAX_REF_MV_STACK_SIZE];
+ uint16_t weight[MODE_CTX_REF_FRAMES][MAX_REF_MV_STACK_SIZE];
uint8_t is_sec_rect;
// Counts of each reference frame in the above and left neighboring blocks.
@@ -553,15 +608,18 @@ typedef struct macroblockd {
uint8_t neighbors_ref_counts[REF_FRAMES];
FRAME_CONTEXT *tile_ctx;
- /* Bit depth: 8, 10, 12 */
+ // Bit depth: copied from cm->seq_params.bit_depth for convenience.
int bd;
int qindex[MAX_SEGMENTS];
int lossless[MAX_SEGMENTS];
+ // TODO(urvang): Move to decoder.
int corrupted;
+ // Same as cm->features.cur_frame_force_integer_mv.
int cur_frame_force_integer_mv;
- // same with that in AV1_COMMON
+ // Pointer to cm->error.
struct aom_internal_error_info *error_info;
+ // Same as cm->global_motion.
const WarpedMotionParams *global_motion;
int delta_qindex;
int current_qindex;
@@ -571,7 +629,7 @@ typedef struct macroblockd {
// filtering level) and code the delta between previous superblock's delta
// lf and current delta lf. It is equivalent to the delta between previous
// superblock's actual lf and current lf.
- int delta_lf_from_base;
+ int8_t delta_lf_from_base;
// For this experiment, we have four frame filter levels for different plane
// and direction. So, to support the per superblock update, we need to add
// a few more params as below.
@@ -585,14 +643,27 @@ typedef struct macroblockd {
// SEG_LVL_ALT_LF_Y_H = 2;
// SEG_LVL_ALT_LF_U = 3;
// SEG_LVL_ALT_LF_V = 4;
- int delta_lf[FRAME_LF_COUNT];
- int cdef_preset[4];
+ int8_t delta_lf[FRAME_LF_COUNT];
+ // cdef_transmitted[i] is true if CDEF strength for ith CDEF unit in the
+ // current superblock has already been read from (decoder) / written to
+ // (encoder) the bitstream; and false otherwise.
+ // More detail:
+ // (1) CDEF strength is transmitted only once per CDEF unit, in the 1st
+ // non-skip coding block. So, we need this array to keep track of whether CDEF
+ // strengths for the given CDEF units have been transmitted yet or not.
+ // (2) Superblock size can be either 128x128 or 64x64, but CDEF unit size is
+ // fixed to be 64x64. So, there may be 4 CDEF units within a superblock (if
+ // superblock size is 128x128). Hence the array size is 4.
+ // (3) In the current implementation, CDEF strength for this CDEF unit is
+ // stored in the MB_MODE_INFO of the 1st block in this CDEF unit (inside
+ // cm->mi_params.mi_grid_base).
+ bool cdef_transmitted[4];
DECLARE_ALIGNED(16, uint8_t, seg_mask[2 * MAX_SB_SQUARE]);
uint8_t *mc_buf[2];
CFL_CTX cfl;
- JNT_COMP_PARAMS jcp_param;
+ DIST_WTD_COMP_PARAMS jcp_param;
uint16_t cb_offset[MAX_MB_PLANE];
uint16_t txb_offset[MAX_MB_PLANE];
@@ -602,7 +673,7 @@ typedef struct macroblockd {
uint8_t *tmp_obmc_bufs[2];
} MACROBLOCKD;
-static INLINE int get_bitdepth_data_path_index(const MACROBLOCKD *xd) {
+static INLINE int is_cur_buf_hbd(const MACROBLOCKD *xd) {
return xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH ? 1 : 0;
}
@@ -646,19 +717,19 @@ static INLINE BLOCK_SIZE get_partition_subsize(BLOCK_SIZE bsize,
static TX_TYPE intra_mode_to_tx_type(const MB_MODE_INFO *mbmi,
PLANE_TYPE plane_type) {
static const TX_TYPE _intra_mode_to_tx_type[INTRA_MODES] = {
- DCT_DCT, // DC
- ADST_DCT, // V
- DCT_ADST, // H
- DCT_DCT, // D45
- ADST_ADST, // D135
- ADST_DCT, // D117
- DCT_ADST, // D153
- DCT_ADST, // D207
- ADST_DCT, // D63
- ADST_ADST, // SMOOTH
- ADST_DCT, // SMOOTH_V
- DCT_ADST, // SMOOTH_H
- ADST_ADST, // PAETH
+ DCT_DCT, // DC_PRED
+ ADST_DCT, // V_PRED
+ DCT_ADST, // H_PRED
+ DCT_DCT, // D45_PRED
+ ADST_ADST, // D135_PRED
+ ADST_DCT, // D113_PRED
+ DCT_ADST, // D157_PRED
+ DCT_ADST, // D203_PRED
+ ADST_DCT, // D67_PRED
+ ADST_ADST, // SMOOTH_PRED
+ ADST_DCT, // SMOOTH_V_PRED
+ DCT_ADST, // SMOOTH_H_PRED
+ ADST_ADST, // PAETH_PRED
};
const PREDICTION_MODE mode =
(plane_type == PLANE_TYPE_Y) ? mbmi->mode : get_uv_mode(mbmi->uv_mode);
@@ -686,6 +757,22 @@ static const int av1_ext_tx_used[EXT_TX_SET_TYPES][TX_TYPES] = {
{ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
};
+static const uint16_t av1_reduced_intra_tx_used_flag[INTRA_MODES] = {
+ 0x080F, // DC_PRED: 0000 1000 0000 1111
+ 0x040F, // V_PRED: 0000 0100 0000 1111
+ 0x080F, // H_PRED: 0000 1000 0000 1111
+ 0x020F, // D45_PRED: 0000 0010 0000 1111
+ 0x080F, // D135_PRED: 0000 1000 0000 1111
+ 0x040F, // D113_PRED: 0000 0100 0000 1111
+ 0x080F, // D157_PRED: 0000 1000 0000 1111
+ 0x080F, // D203_PRED: 0000 1000 0000 1111
+ 0x040F, // D67_PRED: 0000 0100 0000 1111
+ 0x080F, // SMOOTH_PRED: 0000 1000 0000 1111
+ 0x040F, // SMOOTH_V_PRED: 0000 0100 0000 1111
+ 0x080F, // SMOOTH_H_PRED: 0000 1000 0000 1111
+ 0x0C0E, // PAETH_PRED: 0000 1100 0000 1110
+};
+
static const uint16_t av1_ext_tx_used_flag[EXT_TX_SET_TYPES] = {
0x0001, // 0000 0000 0000 0001
0x0201, // 0000 0010 0000 0001
@@ -695,6 +782,11 @@ static const uint16_t av1_ext_tx_used_flag[EXT_TX_SET_TYPES] = {
0xFFFF, // 1111 1111 1111 1111
};
+static const TxSetType av1_ext_tx_set_lookup[2][2] = {
+ { EXT_TX_SET_DTT4_IDTX_1DDCT, EXT_TX_SET_DTT4_IDTX },
+ { EXT_TX_SET_ALL16, EXT_TX_SET_DTT9_IDTX_1DDCT },
+};
+
static INLINE TxSetType av1_get_ext_tx_set_type(TX_SIZE tx_size, int is_inter,
int use_reduced_set) {
const TX_SIZE tx_size_sqr_up = txsize_sqr_up_map[tx_size];
@@ -704,13 +796,7 @@ static INLINE TxSetType av1_get_ext_tx_set_type(TX_SIZE tx_size, int is_inter,
if (use_reduced_set)
return is_inter ? EXT_TX_SET_DCT_IDTX : EXT_TX_SET_DTT4_IDTX;
const TX_SIZE tx_size_sqr = txsize_sqr_map[tx_size];
- if (is_inter) {
- return (tx_size_sqr == TX_16X16 ? EXT_TX_SET_DTT9_IDTX_1DDCT
- : EXT_TX_SET_ALL16);
- } else {
- return (tx_size_sqr == TX_16X16 ? EXT_TX_SET_DTT4_IDTX
- : EXT_TX_SET_DTT4_IDTX_1DDCT);
- }
+ return av1_ext_tx_set_lookup[is_inter][tx_size_sqr == TX_16X16];
}
// Maps tx set types to the indices.
@@ -749,7 +835,6 @@ static INLINE TX_SIZE tx_size_from_tx_mode(BLOCK_SIZE bsize, TX_MODE tx_mode) {
return largest_tx_size;
}
-extern const int16_t dr_intra_derivative[90];
static const uint8_t mode_to_angle_map[] = {
0, 90, 180, 45, 135, 113, 157, 203, 67, 0, 0, 0, 0,
};
@@ -777,11 +862,13 @@ static INLINE int av1_raster_order_to_block_index(TX_SIZE tx_size,
static INLINE TX_TYPE get_default_tx_type(PLANE_TYPE plane_type,
const MACROBLOCKD *xd,
- TX_SIZE tx_size) {
+ TX_SIZE tx_size,
+ int is_screen_content_type) {
const MB_MODE_INFO *const mbmi = xd->mi[0];
if (is_inter_block(mbmi) || plane_type != PLANE_TYPE_Y ||
- xd->lossless[mbmi->segment_id] || tx_size >= TX_32X32)
+ xd->lossless[mbmi->segment_id] || tx_size >= TX_32X32 ||
+ is_screen_content_type)
return DCT_DCT;
return intra_mode_to_tx_type(mbmi, plane_type);
@@ -792,45 +879,77 @@ static INLINE TX_TYPE get_default_tx_type(PLANE_TYPE plane_type,
static INLINE BLOCK_SIZE get_plane_block_size(BLOCK_SIZE bsize,
int subsampling_x,
int subsampling_y) {
- if (bsize == BLOCK_INVALID) return BLOCK_INVALID;
+ assert(bsize < BLOCK_SIZES_ALL);
+ assert(subsampling_x >= 0 && subsampling_x < 2);
+ assert(subsampling_y >= 0 && subsampling_y < 2);
return ss_size_lookup[bsize][subsampling_x][subsampling_y];
}
+/*
+ * Logic to generate the lookup tables:
+ *
+ * TX_SIZE txs = max_txsize_rect_lookup[bsize];
+ * for (int level = 0; level < MAX_VARTX_DEPTH - 1; ++level)
+ * txs = sub_tx_size_map[txs];
+ * const int tx_w_log2 = tx_size_wide_log2[txs] - MI_SIZE_LOG2;
+ * const int tx_h_log2 = tx_size_high_log2[txs] - MI_SIZE_LOG2;
+ * const int bw_uint_log2 = mi_size_wide_log2[bsize];
+ * const int stride_log2 = bw_uint_log2 - tx_w_log2;
+ */
static INLINE int av1_get_txb_size_index(BLOCK_SIZE bsize, int blk_row,
int blk_col) {
- TX_SIZE txs = max_txsize_rect_lookup[bsize];
- for (int level = 0; level < MAX_VARTX_DEPTH - 1; ++level)
- txs = sub_tx_size_map[txs];
- const int tx_w_log2 = tx_size_wide_log2[txs] - MI_SIZE_LOG2;
- const int tx_h_log2 = tx_size_high_log2[txs] - MI_SIZE_LOG2;
- const int bw_log2 = mi_size_wide_log2[bsize];
- const int stride_log2 = bw_log2 - tx_w_log2;
+ static const uint8_t tw_w_log2_table[BLOCK_SIZES_ALL] = {
+ 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 3, 0, 1, 1, 2, 2, 3,
+ };
+ static const uint8_t tw_h_log2_table[BLOCK_SIZES_ALL] = {
+ 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 3, 1, 0, 2, 1, 3, 2,
+ };
+ static const uint8_t stride_log2_table[BLOCK_SIZES_ALL] = {
+ 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 2, 2, 0, 1, 0, 1, 0, 1,
+ };
const int index =
- ((blk_row >> tx_h_log2) << stride_log2) + (blk_col >> tx_w_log2);
+ ((blk_row >> tw_h_log2_table[bsize]) << stride_log2_table[bsize]) +
+ (blk_col >> tw_w_log2_table[bsize]);
assert(index < INTER_TX_SIZE_BUF_LEN);
return index;
}
+#if CONFIG_INSPECTION
+/*
+ * Here is the logic to generate the lookup tables:
+ *
+ * TX_SIZE txs = max_txsize_rect_lookup[bsize];
+ * for (int level = 0; level < MAX_VARTX_DEPTH; ++level)
+ * txs = sub_tx_size_map[txs];
+ * const int tx_w_log2 = tx_size_wide_log2[txs] - MI_SIZE_LOG2;
+ * const int tx_h_log2 = tx_size_high_log2[txs] - MI_SIZE_LOG2;
+ * const int bw_uint_log2 = mi_size_wide_log2[bsize];
+ * const int stride_log2 = bw_uint_log2 - tx_w_log2;
+ */
static INLINE int av1_get_txk_type_index(BLOCK_SIZE bsize, int blk_row,
int blk_col) {
- TX_SIZE txs = max_txsize_rect_lookup[bsize];
- for (int level = 0; level < MAX_VARTX_DEPTH; ++level)
- txs = sub_tx_size_map[txs];
- const int tx_w_log2 = tx_size_wide_log2[txs] - MI_SIZE_LOG2;
- const int tx_h_log2 = tx_size_high_log2[txs] - MI_SIZE_LOG2;
- const int bw_uint_log2 = mi_size_wide_log2[bsize];
- const int stride_log2 = bw_uint_log2 - tx_w_log2;
+ static const uint8_t tw_w_log2_table[BLOCK_SIZES_ALL] = {
+ 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 2, 2, 2, 0, 0, 1, 1, 2, 2,
+ };
+ static const uint8_t tw_h_log2_table[BLOCK_SIZES_ALL] = {
+ 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 2, 2, 2, 0, 0, 1, 1, 2, 2,
+ };
+ static const uint8_t stride_log2_table[BLOCK_SIZES_ALL] = {
+ 0, 0, 1, 1, 1, 2, 2, 1, 2, 2, 1, 2, 2, 2, 3, 3, 0, 2, 0, 2, 0, 2,
+ };
const int index =
- ((blk_row >> tx_h_log2) << stride_log2) + (blk_col >> tx_w_log2);
+ ((blk_row >> tw_h_log2_table[bsize]) << stride_log2_table[bsize]) +
+ (blk_col >> tw_w_log2_table[bsize]);
assert(index < TXK_TYPE_BUF_LEN);
return index;
}
+#endif // CONFIG_INSPECTION
-static INLINE void update_txk_array(TX_TYPE *txk_type, BLOCK_SIZE bsize,
- int blk_row, int blk_col, TX_SIZE tx_size,
+static INLINE void update_txk_array(MACROBLOCKD *const xd, int blk_row,
+ int blk_col, TX_SIZE tx_size,
TX_TYPE tx_type) {
- const int txk_type_idx = av1_get_txk_type_index(bsize, blk_row, blk_col);
- txk_type[txk_type_idx] = tx_type;
+ const int stride = xd->tx_type_map_stride;
+ xd->tx_type_map[blk_row * stride + blk_col] = tx_type;
const int txw = tx_size_wide_unit[tx_size];
const int txh = tx_size_high_unit[tx_size];
@@ -843,71 +962,84 @@ static INLINE void update_txk_array(TX_TYPE *txk_type, BLOCK_SIZE bsize,
const int tx_unit = tx_size_wide_unit[TX_16X16];
for (int idy = 0; idy < txh; idy += tx_unit) {
for (int idx = 0; idx < txw; idx += tx_unit) {
- const int this_index =
- av1_get_txk_type_index(bsize, blk_row + idy, blk_col + idx);
- txk_type[this_index] = tx_type;
+ xd->tx_type_map[(blk_row + idy) * stride + blk_col + idx] = tx_type;
}
}
}
}
-static INLINE TX_TYPE av1_get_tx_type(PLANE_TYPE plane_type,
- const MACROBLOCKD *xd, int blk_row,
+static INLINE TX_TYPE av1_get_tx_type(const MACROBLOCKD *xd,
+ PLANE_TYPE plane_type, int blk_row,
int blk_col, TX_SIZE tx_size,
int reduced_tx_set) {
const MB_MODE_INFO *const mbmi = xd->mi[0];
- const struct macroblockd_plane *const pd = &xd->plane[plane_type];
- const TxSetType tx_set_type =
- av1_get_ext_tx_set_type(tx_size, is_inter_block(mbmi), reduced_tx_set);
+ if (xd->lossless[mbmi->segment_id] || txsize_sqr_up_map[tx_size] > TX_32X32) {
+ return DCT_DCT;
+ }
TX_TYPE tx_type;
- if (xd->lossless[mbmi->segment_id] || txsize_sqr_up_map[tx_size] > TX_32X32) {
- tx_type = DCT_DCT;
+ if (plane_type == PLANE_TYPE_Y) {
+ tx_type = xd->tx_type_map[blk_row * xd->tx_type_map_stride + blk_col];
} else {
- if (plane_type == PLANE_TYPE_Y) {
- const int txk_type_idx =
- av1_get_txk_type_index(mbmi->sb_type, blk_row, blk_col);
- tx_type = mbmi->txk_type[txk_type_idx];
- } else if (is_inter_block(mbmi)) {
+ if (is_inter_block(mbmi)) {
// scale back to y plane's coordinate
+ const struct macroblockd_plane *const pd = &xd->plane[plane_type];
blk_row <<= pd->subsampling_y;
blk_col <<= pd->subsampling_x;
- const int txk_type_idx =
- av1_get_txk_type_index(mbmi->sb_type, blk_row, blk_col);
- tx_type = mbmi->txk_type[txk_type_idx];
+ tx_type = xd->tx_type_map[blk_row * xd->tx_type_map_stride + blk_col];
} else {
// In intra mode, uv planes don't share the same prediction mode as y
// plane, so the tx_type should not be shared
tx_type = intra_mode_to_tx_type(mbmi, PLANE_TYPE_UV);
}
+ const TxSetType tx_set_type =
+ av1_get_ext_tx_set_type(tx_size, is_inter_block(mbmi), reduced_tx_set);
+ if (!av1_ext_tx_used[tx_set_type][tx_type]) tx_type = DCT_DCT;
}
assert(tx_type < TX_TYPES);
- if (!av1_ext_tx_used[tx_set_type][tx_type]) return DCT_DCT;
+ assert(av1_ext_tx_used[av1_get_ext_tx_set_type(tx_size, is_inter_block(mbmi),
+ reduced_tx_set)][tx_type]);
return tx_type;
}
void av1_setup_block_planes(MACROBLOCKD *xd, int ss_x, int ss_y,
const int num_planes);
+/*
+ * Logic to generate the lookup table:
+ *
+ * TX_SIZE tx_size = max_txsize_rect_lookup[bsize];
+ * int depth = 0;
+ * while (depth < MAX_TX_DEPTH && tx_size != TX_4X4) {
+ * depth++;
+ * tx_size = sub_tx_size_map[tx_size];
+ * }
+ */
static INLINE int bsize_to_max_depth(BLOCK_SIZE bsize) {
- TX_SIZE tx_size = max_txsize_rect_lookup[bsize];
- int depth = 0;
- while (depth < MAX_TX_DEPTH && tx_size != TX_4X4) {
- depth++;
- tx_size = sub_tx_size_map[tx_size];
- }
- return depth;
+ static const uint8_t bsize_to_max_depth_table[BLOCK_SIZES_ALL] = {
+ 0, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ };
+ return bsize_to_max_depth_table[bsize];
}
+/*
+ * Logic to generate the lookup table:
+ *
+ * TX_SIZE tx_size = max_txsize_rect_lookup[bsize];
+ * assert(tx_size != TX_4X4);
+ * int depth = 0;
+ * while (tx_size != TX_4X4) {
+ * depth++;
+ * tx_size = sub_tx_size_map[tx_size];
+ * }
+ * assert(depth < 10);
+ */
static INLINE int bsize_to_tx_size_cat(BLOCK_SIZE bsize) {
- TX_SIZE tx_size = max_txsize_rect_lookup[bsize];
- assert(tx_size != TX_4X4);
- int depth = 0;
- while (tx_size != TX_4X4) {
- depth++;
- tx_size = sub_tx_size_map[tx_size];
- assert(depth < 10);
- }
+ assert(bsize < BLOCK_SIZES_ALL);
+ static const uint8_t bsize_to_tx_size_depth_table[BLOCK_SIZES_ALL] = {
+ 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 2, 2, 3, 3, 4, 4,
+ };
+ const int depth = bsize_to_tx_size_depth_table[bsize];
assert(depth <= MAX_TX_CATS);
return depth - 1;
}
@@ -948,8 +1080,8 @@ static INLINE TX_SIZE av1_get_tx_size(int plane, const MACROBLOCKD *xd) {
pd->subsampling_y);
}
-void av1_reset_skip_context(MACROBLOCKD *xd, int mi_row, int mi_col,
- BLOCK_SIZE bsize, const int num_planes);
+void av1_reset_entropy_context(MACROBLOCKD *xd, BLOCK_SIZE bsize,
+ const int num_planes);
void av1_reset_loop_filter_delta(MACROBLOCKD *xd, int num_planes);
@@ -960,9 +1092,10 @@ typedef void (*foreach_transformed_block_visitor)(int plane, int block,
BLOCK_SIZE plane_bsize,
TX_SIZE tx_size, void *arg);
-void av1_set_contexts(const MACROBLOCKD *xd, struct macroblockd_plane *pd,
- int plane, BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
- int has_eob, int aoff, int loff);
+void av1_set_entropy_contexts(const MACROBLOCKD *xd,
+ struct macroblockd_plane *pd, int plane,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+ int has_eob, int aoff, int loff);
#define MAX_INTERINTRA_SB_SQUARE 32 * 32
static INLINE int is_interintra_mode(const MB_MODE_INFO *mbmi) {
@@ -1013,15 +1146,13 @@ static INLINE int get_vartx_max_txsize(const MACROBLOCKD *xd, BLOCK_SIZE bsize,
}
static INLINE int is_motion_variation_allowed_bsize(BLOCK_SIZE bsize) {
+ assert(bsize < BLOCK_SIZES_ALL);
return AOMMIN(block_size_wide[bsize], block_size_high[bsize]) >= 8;
}
static INLINE int is_motion_variation_allowed_compound(
const MB_MODE_INFO *mbmi) {
- if (!has_second_ref(mbmi))
- return 1;
- else
- return 0;
+ return !has_second_ref(mbmi);
}
// input: log2 of length, 0(4), 1(8), ...
@@ -1045,7 +1176,8 @@ motion_mode_allowed(const WarpedMotionParams *gm_params, const MACROBLOCKD *xd,
if (!check_num_overlappable_neighbors(mbmi)) return SIMPLE_TRANSLATION;
assert(!has_second_ref(mbmi));
if (mbmi->num_proj_ref >= 1 &&
- (allow_warped_motion && !av1_is_scaled(&(xd->block_refs[0]->sf)))) {
+ (allow_warped_motion &&
+ !av1_is_scaled(xd->block_ref_scale_factors[0]))) {
if (xd->cur_frame_force_integer_mv) {
return OBMC_CAUSAL;
}
@@ -1057,25 +1189,13 @@ motion_mode_allowed(const WarpedMotionParams *gm_params, const MACROBLOCKD *xd,
}
}
-static INLINE void assert_motion_mode_valid(MOTION_MODE mode,
- const WarpedMotionParams *gm_params,
- const MACROBLOCKD *xd,
- const MB_MODE_INFO *mbmi,
- int allow_warped_motion) {
- const MOTION_MODE last_motion_mode_allowed =
- motion_mode_allowed(gm_params, xd, mbmi, allow_warped_motion);
-
- // Check that the input mode is not illegal
- if (last_motion_mode_allowed < mode)
- assert(0 && "Illegal motion mode selected");
-}
-
static INLINE int is_neighbor_overlappable(const MB_MODE_INFO *mbmi) {
return (is_inter_block(mbmi));
}
static INLINE int av1_allow_palette(int allow_screen_content_tools,
BLOCK_SIZE sb_type) {
+ assert(sb_type < BLOCK_SIZES_ALL);
return allow_screen_content_tools && block_size_wide[sb_type] <= 64 &&
block_size_high[sb_type] <= 64 && sb_type >= BLOCK_8X8;
}
diff --git a/media/libaom/src/av1/common/cdef.c b/media/libaom/src/av1/common/cdef.c
index e9e2b0e42..ef7b866b5 100644
--- a/media/libaom/src/av1/common/cdef.c
+++ b/media/libaom/src/av1/common/cdef.c
@@ -16,45 +16,29 @@
#include "config/aom_scale_rtcd.h"
#include "aom/aom_integer.h"
+#include "av1/common/av1_common_int.h"
#include "av1/common/cdef.h"
#include "av1/common/cdef_block.h"
-#include "av1/common/onyxc_int.h"
#include "av1/common/reconinter.h"
-int sb_all_skip(const AV1_COMMON *const cm, int mi_row, int mi_col) {
- int maxc, maxr;
- int skip = 1;
- maxc = cm->mi_cols - mi_col;
- maxr = cm->mi_rows - mi_row;
-
- maxr = AOMMIN(maxr, MI_SIZE_64X64);
- maxc = AOMMIN(maxc, MI_SIZE_64X64);
-
- for (int r = 0; r < maxr; r++) {
- for (int c = 0; c < maxc; c++) {
- skip =
- skip &&
- cm->mi_grid_visible[(mi_row + r) * cm->mi_stride + mi_col + c]->skip;
- }
- }
- return skip;
-}
-
static int is_8x8_block_skip(MB_MODE_INFO **grid, int mi_row, int mi_col,
int mi_stride) {
- int is_skip = 1;
- for (int r = 0; r < mi_size_high[BLOCK_8X8]; ++r)
- for (int c = 0; c < mi_size_wide[BLOCK_8X8]; ++c)
- is_skip &= grid[(mi_row + r) * mi_stride + (mi_col + c)]->skip;
+ MB_MODE_INFO **mbmi = grid + mi_row * mi_stride + mi_col;
+ for (int r = 0; r < mi_size_high[BLOCK_8X8]; ++r, mbmi += mi_stride) {
+ for (int c = 0; c < mi_size_wide[BLOCK_8X8]; ++c) {
+ if (!mbmi[c]->skip) return 0;
+ }
+ }
- return is_skip;
+ return 1;
}
-int sb_compute_cdef_list(const AV1_COMMON *const cm, int mi_row, int mi_col,
- cdef_list *dlist, BLOCK_SIZE bs) {
- MB_MODE_INFO **grid = cm->mi_grid_visible;
- int maxc = cm->mi_cols - mi_col;
- int maxr = cm->mi_rows - mi_row;
+int av1_cdef_compute_sb_list(const CommonModeInfoParams *const mi_params,
+ int mi_row, int mi_col, cdef_list *dlist,
+ BLOCK_SIZE bs) {
+ MB_MODE_INFO **grid = mi_params->mi_grid_base;
+ int maxc = mi_params->mi_cols - mi_col;
+ int maxr = mi_params->mi_rows - mi_row;
if (bs == BLOCK_128X128 || bs == BLOCK_128X64)
maxc = AOMMIN(maxc, MI_SIZE_128X128);
@@ -65,22 +49,17 @@ int sb_compute_cdef_list(const AV1_COMMON *const cm, int mi_row, int mi_col,
else
maxr = AOMMIN(maxr, MI_SIZE_64X64);
- const int r_step = mi_size_high[BLOCK_8X8];
- const int c_step = mi_size_wide[BLOCK_8X8];
- const int r_shift = (r_step == 2);
- const int c_shift = (c_step == 2);
-
- assert(r_step == 1 || r_step == 2);
- assert(c_step == 1 || c_step == 2);
-
+ const int r_step = 2; // mi_size_high[BLOCK_8X8]
+ const int c_step = 2; // mi_size_wide[BLOCK_8X8]
+ const int r_shift = 1;
+ const int c_shift = 1;
int count = 0;
-
for (int r = 0; r < maxr; r += r_step) {
for (int c = 0; c < maxc; c += c_step) {
- if (!is_8x8_block_skip(grid, mi_row + r, mi_col + c, cm->mi_stride)) {
+ if (!is_8x8_block_skip(grid, mi_row + r, mi_col + c,
+ mi_params->mi_stride)) {
dlist[count].by = r >> r_shift;
dlist[count].bx = c >> c_shift;
- dlist[count].skip = 0;
count++;
}
}
@@ -88,8 +67,9 @@ int sb_compute_cdef_list(const AV1_COMMON *const cm, int mi_row, int mi_col,
return count;
}
-void copy_rect8_8bit_to_16bit_c(uint16_t *dst, int dstride, const uint8_t *src,
- int sstride, int v, int h) {
+void cdef_copy_rect8_8bit_to_16bit_c(uint16_t *dst, int dstride,
+ const uint8_t *src, int sstride, int v,
+ int h) {
for (int i = 0; i < v; i++) {
for (int j = 0; j < h; j++) {
dst[i * dstride + j] = src[i * sstride + j];
@@ -97,9 +77,9 @@ void copy_rect8_8bit_to_16bit_c(uint16_t *dst, int dstride, const uint8_t *src,
}
}
-void copy_rect8_16bit_to_16bit_c(uint16_t *dst, int dstride,
- const uint16_t *src, int sstride, int v,
- int h) {
+void cdef_copy_rect8_16bit_to_16bit_c(uint16_t *dst, int dstride,
+ const uint16_t *src, int sstride, int v,
+ int h) {
for (int i = 0; i < v; i++) {
for (int j = 0; j < h; j++) {
dst[i * dstride + j] = src[i * sstride + j];
@@ -107,16 +87,16 @@ void copy_rect8_16bit_to_16bit_c(uint16_t *dst, int dstride,
}
}
-static void copy_sb8_16(AOM_UNUSED AV1_COMMON *cm, uint16_t *dst, int dstride,
+static void copy_sb8_16(AV1_COMMON *cm, uint16_t *dst, int dstride,
const uint8_t *src, int src_voffset, int src_hoffset,
int sstride, int vsize, int hsize) {
if (cm->seq_params.use_highbitdepth) {
const uint16_t *base =
&CONVERT_TO_SHORTPTR(src)[src_voffset * sstride + src_hoffset];
- copy_rect8_16bit_to_16bit(dst, dstride, base, sstride, vsize, hsize);
+ cdef_copy_rect8_16bit_to_16bit(dst, dstride, base, sstride, vsize, hsize);
} else {
const uint8_t *base = &src[src_voffset * sstride + src_hoffset];
- copy_rect8_8bit_to_16bit(dst, dstride, base, sstride, vsize, hsize);
+ cdef_copy_rect8_8bit_to_16bit(dst, dstride, base, sstride, vsize, hsize);
}
}
@@ -140,6 +120,8 @@ static INLINE void copy_rect(uint16_t *dst, int dstride, const uint16_t *src,
void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
MACROBLOCKD *xd) {
+ const CdefInfo *const cdef_info = &cm->cdef_info;
+ const CommonModeInfoParams *const mi_params = &cm->mi_params;
const int num_planes = av1_num_planes(cm);
DECLARE_ALIGNED(16, uint16_t, src[CDEF_INBUF_SIZE]);
uint16_t *linebuf[3];
@@ -154,8 +136,8 @@ void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
int xdec[3];
int ydec[3];
int coeff_shift = AOMMAX(cm->seq_params.bit_depth - 8, 0);
- const int nvfb = (cm->mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
- const int nhfb = (cm->mi_cols + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
+ const int nvfb = (mi_params->mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
+ const int nhfb = (mi_params->mi_cols + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
av1_setup_dst_planes(xd->plane, cm->seq_params.sb_size, frame, 0, 0, 0,
num_planes);
row_cdef = aom_malloc(sizeof(*row_cdef) * (nhfb + 2) * 2);
@@ -168,7 +150,7 @@ void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
mi_wide_l2[pli] = MI_SIZE_LOG2 - xd->plane[pli].subsampling_x;
mi_high_l2[pli] = MI_SIZE_LOG2 - xd->plane[pli].subsampling_y;
}
- const int stride = (cm->mi_cols << MI_SIZE_LOG2) + 2 * CDEF_HBORDER;
+ const int stride = (mi_params->mi_cols << MI_SIZE_LOG2) + 2 * CDEF_HBORDER;
for (int pli = 0; pli < num_planes; pli++) {
linebuf[pli] = aom_malloc(sizeof(*linebuf) * CDEF_VBORDER * stride);
colbuf[pli] =
@@ -190,17 +172,18 @@ void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
int nhb, nvb;
int cstart = 0;
curr_row_cdef[fbc] = 0;
- if (cm->mi_grid_visible[MI_SIZE_64X64 * fbr * cm->mi_stride +
- MI_SIZE_64X64 * fbc] == NULL ||
- cm->mi_grid_visible[MI_SIZE_64X64 * fbr * cm->mi_stride +
- MI_SIZE_64X64 * fbc]
+ if (mi_params->mi_grid_base[MI_SIZE_64X64 * fbr * mi_params->mi_stride +
+ MI_SIZE_64X64 * fbc] == NULL ||
+ mi_params
+ ->mi_grid_base[MI_SIZE_64X64 * fbr * mi_params->mi_stride +
+ MI_SIZE_64X64 * fbc]
->cdef_strength == -1) {
cdef_left = 0;
continue;
}
if (!cdef_left) cstart = -CDEF_HBORDER;
- nhb = AOMMIN(MI_SIZE_64X64, cm->mi_cols - MI_SIZE_64X64 * fbc);
- nvb = AOMMIN(MI_SIZE_64X64, cm->mi_rows - MI_SIZE_64X64 * fbr);
+ nhb = AOMMIN(MI_SIZE_64X64, mi_params->mi_cols - MI_SIZE_64X64 * fbc);
+ nvb = AOMMIN(MI_SIZE_64X64, mi_params->mi_rows - MI_SIZE_64X64 * fbr);
int frame_top, frame_left, frame_bottom, frame_right;
int mi_row = MI_SIZE_64X64 * fbr;
@@ -218,32 +201,35 @@ void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
frame_left = (mi_col == 0) ? 1 : 0;
if (fbr != nvfb - 1)
- frame_bottom = (mi_row + MI_SIZE_64X64 == cm->mi_rows) ? 1 : 0;
+ frame_bottom = (mi_row + MI_SIZE_64X64 == mi_params->mi_rows) ? 1 : 0;
else
frame_bottom = 1;
if (fbc != nhfb - 1)
- frame_right = (mi_col + MI_SIZE_64X64 == cm->mi_cols) ? 1 : 0;
+ frame_right = (mi_col + MI_SIZE_64X64 == mi_params->mi_cols) ? 1 : 0;
else
frame_right = 1;
const int mbmi_cdef_strength =
- cm->mi_grid_visible[MI_SIZE_64X64 * fbr * cm->mi_stride +
- MI_SIZE_64X64 * fbc]
+ mi_params
+ ->mi_grid_base[MI_SIZE_64X64 * fbr * mi_params->mi_stride +
+ MI_SIZE_64X64 * fbc]
->cdef_strength;
- level = cm->cdef_strengths[mbmi_cdef_strength] / CDEF_SEC_STRENGTHS;
+ level =
+ cdef_info->cdef_strengths[mbmi_cdef_strength] / CDEF_SEC_STRENGTHS;
sec_strength =
- cm->cdef_strengths[mbmi_cdef_strength] % CDEF_SEC_STRENGTHS;
+ cdef_info->cdef_strengths[mbmi_cdef_strength] % CDEF_SEC_STRENGTHS;
sec_strength += sec_strength == 3;
- uv_level = cm->cdef_uv_strengths[mbmi_cdef_strength] / CDEF_SEC_STRENGTHS;
+ uv_level =
+ cdef_info->cdef_uv_strengths[mbmi_cdef_strength] / CDEF_SEC_STRENGTHS;
uv_sec_strength =
- cm->cdef_uv_strengths[mbmi_cdef_strength] % CDEF_SEC_STRENGTHS;
+ cdef_info->cdef_uv_strengths[mbmi_cdef_strength] % CDEF_SEC_STRENGTHS;
uv_sec_strength += uv_sec_strength == 3;
if ((level == 0 && sec_strength == 0 && uv_level == 0 &&
uv_sec_strength == 0) ||
- (cdef_count = sb_compute_cdef_list(cm, fbr * MI_SIZE_64X64,
- fbc * MI_SIZE_64X64, dlist,
- BLOCK_64X64)) == 0) {
+ (cdef_count = av1_cdef_compute_sb_list(mi_params, fbr * MI_SIZE_64X64,
+ fbc * MI_SIZE_64X64, dlist,
+ BLOCK_64X64)) == 0) {
cdef_left = 0;
continue;
}
@@ -252,8 +238,7 @@ void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
for (int pli = 0; pli < num_planes; pli++) {
int coffset;
int rend, cend;
- int pri_damping = cm->cdef_pri_damping;
- int sec_damping = cm->cdef_sec_damping;
+ int damping = cdef_info->cdef_damping;
int hsize = nhb << mi_wide_l2[pli];
int vsize = nvb << mi_high_l2[pli];
@@ -364,7 +349,7 @@ void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
}
if (cm->seq_params.use_highbitdepth) {
- cdef_filter_fb(
+ av1_cdef_filter_fb(
NULL,
&CONVERT_TO_SHORTPTR(
xd->plane[pli]
@@ -374,9 +359,9 @@ void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
xd->plane[pli].dst.stride,
&src[CDEF_VBORDER * CDEF_BSTRIDE + CDEF_HBORDER], xdec[pli],
ydec[pli], dir, NULL, var, pli, dlist, cdef_count, level,
- sec_strength, pri_damping, sec_damping, coeff_shift);
+ sec_strength, damping, coeff_shift);
} else {
- cdef_filter_fb(
+ av1_cdef_filter_fb(
&xd->plane[pli]
.dst.buf[xd->plane[pli].dst.stride *
(MI_SIZE_64X64 * fbr << mi_high_l2[pli]) +
@@ -384,7 +369,7 @@ void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
NULL, xd->plane[pli].dst.stride,
&src[CDEF_VBORDER * CDEF_BSTRIDE + CDEF_HBORDER], xdec[pli],
ydec[pli], dir, NULL, var, pli, dlist, cdef_count, level,
- sec_strength, pri_damping, sec_damping, coeff_shift);
+ sec_strength, damping, coeff_shift);
}
}
cdef_left = 1;
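
For reference, the strength decode performed in the rewritten loop above: each entry of cdef_strengths[] / cdef_uv_strengths[] packs a primary filter level and a secondary strength into a single value. A minimal standalone sketch of that split, assuming CDEF_SEC_STRENGTHS is 4 as in libaom (a remainder of 3 is bumped to 4 because 3 is not a usable secondary strength):

#include <stdio.h>

#define CDEF_SEC_STRENGTHS 4

static void unpack_cdef_strength(int packed, int *level, int *sec_strength) {
  *level = packed / CDEF_SEC_STRENGTHS;          /* primary filter level */
  *sec_strength = packed % CDEF_SEC_STRENGTHS;   /* secondary strength */
  *sec_strength += *sec_strength == 3;           /* map 3 -> 4 */
}

int main(void) {
  int level, sec;
  unpack_cdef_strength(23, &level, &sec);        /* 23 == 5 * 4 + 3 */
  printf("level=%d sec=%d\n", level, sec);       /* prints level=5 sec=4 */
  return 0;
}
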
diff --git a/media/libaom/src/av1/common/cdef.h b/media/libaom/src/av1/common/cdef.h
index 3b2eac8a5..c36fd135a 100644
--- a/media/libaom/src/av1/common/cdef.h
+++ b/media/libaom/src/av1/common/cdef.h
@@ -20,8 +20,8 @@
#include "aom/aom_integer.h"
#include "aom_ports/mem.h"
+#include "av1/common/av1_common_int.h"
#include "av1/common/cdef_block.h"
-#include "av1/common/onyxc_int.h"
static INLINE int sign(int i) { return i < 0 ? -1 : 1; }
@@ -37,13 +37,14 @@ static INLINE int constrain(int diff, int threshold, int damping) {
extern "C" {
#endif
-int sb_all_skip(const AV1_COMMON *const cm, int mi_row, int mi_col);
-int sb_compute_cdef_list(const AV1_COMMON *const cm, int mi_row, int mi_col,
- cdef_list *dlist, BLOCK_SIZE bsize);
+int av1_cdef_compute_sb_list(const CommonModeInfoParams *const mi_params,
+ int mi_row, int mi_col, cdef_list *dlist,
+ BLOCK_SIZE bsize);
void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, MACROBLOCKD *xd);
void av1_cdef_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref,
- AV1_COMMON *cm, MACROBLOCKD *xd, int fast);
+ AV1_COMMON *cm, MACROBLOCKD *xd, int pick_method,
+ int rdmult);
#ifdef __cplusplus
} // extern "C"
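
The single damping value produced by the caller ultimately feeds the constrain() helper declared in this header's context, which clamps how much each neighbouring tap may contribute. A sketch of that damped clamp, following the AV1 specification; the inline in cdef.h may differ in small details such as how the most significant bit is obtained:

#include <stdlib.h>

static int msb(int v) {                    /* index of highest set bit, v > 0 */
  int r = 0;
  while (v >>= 1) r++;
  return r;
}

static int constrain_sketch(int diff, int threshold, int damping) {
  if (!threshold) return 0;
  int shift = damping - msb(threshold);
  if (shift < 0) shift = 0;
  const int mag = abs(diff);
  int limit = threshold - (mag >> shift);  /* large differences get damped */
  if (limit < 0) limit = 0;
  const int clamped = mag < limit ? mag : limit;
  return diff < 0 ? -clamped : clamped;    /* keep the sign of the difference */
}
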
diff --git a/media/libaom/src/av1/common/cdef_block.c b/media/libaom/src/av1/common/cdef_block.c
index df1de89be..7120705d3 100644
--- a/media/libaom/src/av1/common/cdef_block.c
+++ b/media/libaom/src/av1/common/cdef_block.c
@@ -108,17 +108,17 @@ int cdef_find_dir_c(const uint16_t *img, int stride, int32_t *var,
}
const int cdef_pri_taps[2][2] = { { 4, 2 }, { 3, 3 } };
-const int cdef_sec_taps[2][2] = { { 2, 1 }, { 2, 1 } };
+const int cdef_sec_taps[2] = { 2, 1 };
/* Smooth in the direction detected. */
void cdef_filter_block_c(uint8_t *dst8, uint16_t *dst16, int dstride,
const uint16_t *in, int pri_strength, int sec_strength,
int dir, int pri_damping, int sec_damping, int bsize,
- AOM_UNUSED int max_unused, int coeff_shift) {
+ int coeff_shift) {
int i, j, k;
const int s = CDEF_BSTRIDE;
const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1];
- const int *sec_taps = cdef_sec_taps[(pri_strength >> coeff_shift) & 1];
+ const int *sec_taps = cdef_sec_taps;
for (i = 0; i < 4 << (bsize == BLOCK_8X8 || bsize == BLOCK_4X8); i++) {
for (j = 0; j < 4 << (bsize == BLOCK_8X8 || bsize == BLOCK_8X4); j++) {
int16_t sum = 0;
@@ -173,25 +173,20 @@ static INLINE int adjust_strength(int strength, int32_t var) {
return var ? (strength * (4 + i) + 8) >> 4 : 0;
}
-void cdef_filter_fb(uint8_t *dst8, uint16_t *dst16, int dstride, uint16_t *in,
- int xdec, int ydec, int dir[CDEF_NBLOCKS][CDEF_NBLOCKS],
- int *dirinit, int var[CDEF_NBLOCKS][CDEF_NBLOCKS], int pli,
- cdef_list *dlist, int cdef_count, int level,
- int sec_strength, int pri_damping, int sec_damping,
- int coeff_shift) {
+void av1_cdef_filter_fb(uint8_t *dst8, uint16_t *dst16, int dstride,
+ uint16_t *in, int xdec, int ydec,
+ int dir[CDEF_NBLOCKS][CDEF_NBLOCKS], int *dirinit,
+ int var[CDEF_NBLOCKS][CDEF_NBLOCKS], int pli,
+ cdef_list *dlist, int cdef_count, int level,
+ int sec_strength, int damping, int coeff_shift) {
int bi;
int bx;
int by;
- int bsize, bsizex, bsizey;
-
- int pri_strength = level << coeff_shift;
+ const int pri_strength = level << coeff_shift;
sec_strength <<= coeff_shift;
- sec_damping += coeff_shift - (pli != AOM_PLANE_Y);
- pri_damping += coeff_shift - (pli != AOM_PLANE_Y);
- bsize =
- ydec ? (xdec ? BLOCK_4X4 : BLOCK_8X4) : (xdec ? BLOCK_4X8 : BLOCK_8X8);
- bsizex = 3 - xdec;
- bsizey = 3 - ydec;
+ damping += coeff_shift - (pli != AOM_PLANE_Y);
+ const int bw_log2 = 3 - xdec;
+ const int bh_log2 = 3 - ydec;
if (dirinit && pri_strength == 0 && sec_strength == 0) {
// If we're here, both primary and secondary strengths are 0, and
// we still haven't written anything to y[] yet, so we just copy
@@ -200,12 +195,12 @@ void cdef_filter_fb(uint8_t *dst8, uint16_t *dst16, int dstride, uint16_t *in,
for (bi = 0; bi < cdef_count; bi++) {
by = dlist[bi].by;
bx = dlist[bi].bx;
- int iy, ix;
// TODO(stemidts/jmvalin): SIMD optimisations
- for (iy = 0; iy < 1 << bsizey; iy++)
- for (ix = 0; ix < 1 << bsizex; ix++)
- dst16[(bi << (bsizex + bsizey)) + (iy << bsizex) + ix] =
- in[((by << bsizey) + iy) * CDEF_BSTRIDE + (bx << bsizex) + ix];
+ for (int iy = 0; iy < 1 << bh_log2; iy++) {
+ memcpy(&dst16[(bi << (bw_log2 + bh_log2)) + (iy << bw_log2)],
+ &in[((by << bh_log2) + iy) * CDEF_BSTRIDE + (bx << bw_log2)],
+ ((size_t)1 << bw_log2) * sizeof(*dst16));
+ }
}
return;
}
@@ -231,27 +226,28 @@ void cdef_filter_fb(uint8_t *dst8, uint16_t *dst16, int dstride, uint16_t *in,
}
}
+ const int bsize =
+ ydec ? (xdec ? BLOCK_4X4 : BLOCK_8X4) : (xdec ? BLOCK_4X8 : BLOCK_8X8);
+ const int t = pri_strength;
+ const int s = sec_strength;
for (bi = 0; bi < cdef_count; bi++) {
- int t = dlist[bi].skip ? 0 : pri_strength;
- int s = dlist[bi].skip ? 0 : sec_strength;
by = dlist[bi].by;
bx = dlist[bi].bx;
- if (dst8)
- cdef_filter_block(&dst8[(by << bsizey) * dstride + (bx << bsizex)], NULL,
- dstride,
- &in[(by * CDEF_BSTRIDE << bsizey) + (bx << bsizex)],
- (pli ? t : adjust_strength(t, var[by][bx])), s,
- t ? dir[by][bx] : 0, pri_damping, sec_damping, bsize,
- (256 << coeff_shift) - 1, coeff_shift);
- else
+ if (dst8) {
+ cdef_filter_block(
+ &dst8[(by << bh_log2) * dstride + (bx << bw_log2)], NULL, dstride,
+ &in[(by * CDEF_BSTRIDE << bh_log2) + (bx << bw_log2)],
+ (pli ? t : adjust_strength(t, var[by][bx])), s, t ? dir[by][bx] : 0,
+ damping, damping, bsize, coeff_shift);
+ } else {
cdef_filter_block(
NULL,
- &dst16[dirinit ? bi << (bsizex + bsizey)
- : (by << bsizey) * dstride + (bx << bsizex)],
- dirinit ? 1 << bsizex : dstride,
- &in[(by * CDEF_BSTRIDE << bsizey) + (bx << bsizex)],
+ &dst16[dirinit ? bi << (bw_log2 + bh_log2)
+ : (by << bh_log2) * dstride + (bx << bw_log2)],
+ dirinit ? 1 << bw_log2 : dstride,
+ &in[(by * CDEF_BSTRIDE << bh_log2) + (bx << bw_log2)],
(pli ? t : adjust_strength(t, var[by][bx])), s, t ? dir[by][bx] : 0,
- pri_damping, sec_damping, bsize, (256 << coeff_shift) - 1,
- coeff_shift);
+ damping, damping, bsize, coeff_shift);
+ }
}
}
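
The copy path rewritten in this file replaces a per-pixel double loop with one memcpy per row, addressing blocks by their log2 width and height. A self-contained sketch of the same pattern, copying one (1 << bw_log2) x (1 << bh_log2) block from a strided source into a packed per-block destination (names here are illustrative):

#include <stdint.h>
#include <string.h>

static void copy_block_rows(uint16_t *dst, const uint16_t *src, int src_stride,
                            int bw_log2, int bh_log2) {
  for (int iy = 0; iy < (1 << bh_log2); iy++) {
    /* one contiguous row at a time, as in the memcpy added above */
    memcpy(&dst[iy << bw_log2], &src[iy * src_stride],
           ((size_t)1 << bw_log2) * sizeof(*dst));
  }
}
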
diff --git a/media/libaom/src/av1/common/cdef_block.h b/media/libaom/src/av1/common/cdef_block.h
index 6b4452cd6..6b0ae0a9d 100644
--- a/media/libaom/src/av1/common/cdef_block.h
+++ b/media/libaom/src/av1/common/cdef_block.h
@@ -32,28 +32,27 @@
(CDEF_BSTRIDE * ((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_VBORDER))
extern const int cdef_pri_taps[2][2];
-extern const int cdef_sec_taps[2][2];
+extern const int cdef_sec_taps[2];
DECLARE_ALIGNED(16, extern const int, cdef_directions[8][2]);
typedef struct {
uint8_t by;
uint8_t bx;
- uint8_t skip;
} cdef_list;
typedef void (*cdef_filter_block_func)(uint8_t *dst8, uint16_t *dst16,
int dstride, const uint16_t *in,
int pri_strength, int sec_strength,
int dir, int pri_damping,
- int sec_damping, int bsize, int max,
+ int sec_damping, int bsize,
int coeff_shift);
void copy_cdef_16bit_to_16bit(uint16_t *dst, int dstride, uint16_t *src,
cdef_list *dlist, int cdef_count, int bsize);
-void cdef_filter_fb(uint8_t *dst8, uint16_t *dst16, int dstride, uint16_t *in,
- int xdec, int ydec, int dir[CDEF_NBLOCKS][CDEF_NBLOCKS],
- int *dirinit, int var[CDEF_NBLOCKS][CDEF_NBLOCKS], int pli,
- cdef_list *dlist, int cdef_count, int level,
- int sec_strength, int pri_damping, int sec_damping,
- int coeff_shift);
+void av1_cdef_filter_fb(uint8_t *dst8, uint16_t *dst16, int dstride,
+ uint16_t *in, int xdec, int ydec,
+ int dir[CDEF_NBLOCKS][CDEF_NBLOCKS], int *dirinit,
+ int var[CDEF_NBLOCKS][CDEF_NBLOCKS], int pli,
+ cdef_list *dlist, int cdef_count, int level,
+ int sec_strength, int damping, int coeff_shift);
#endif // AOM_AV1_COMMON_CDEF_BLOCK_H_
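
The tap tables declared above reflect that the secondary taps no longer depend on the primary strength: cdef_sec_taps shrinks from a 2x2 table to a single pair, while the primary taps are still selected by the parity of the shifted primary strength. A small sketch of that selection, using the tap values from this patch:

static const int pri_taps_sketch[2][2] = { { 4, 2 }, { 3, 3 } };
static const int sec_taps_sketch[2] = { 2, 1 };

static const int *select_pri_taps(int pri_strength, int coeff_shift) {
  /* parity of the shifted primary strength still picks the tap set */
  return pri_taps_sketch[(pri_strength >> coeff_shift) & 1];
}

static const int *select_sec_taps(void) {
  return sec_taps_sketch;  /* now a fixed pair, independent of strength */
}
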
diff --git a/media/libaom/src/av1/common/cdef_block_simd.h b/media/libaom/src/av1/common/cdef_block_simd.h
index 14587a023..5a52bc1e4 100644
--- a/media/libaom/src/av1/common/cdef_block_simd.h
+++ b/media/libaom/src/av1/common/cdef_block_simd.h
@@ -226,7 +226,6 @@ void SIMD_FUNC(cdef_filter_block_4x4_8)(uint8_t *dst, int dstride,
const uint16_t *in, int pri_strength,
int sec_strength, int dir,
int pri_damping, int sec_damping,
- AOM_UNUSED int max_unused,
int coeff_shift) {
v128 p0, p1, p2, p3;
v256 sum, row, tap, res;
@@ -239,7 +238,7 @@ void SIMD_FUNC(cdef_filter_block_4x4_8)(uint8_t *dst, int dstride,
int s2o2 = cdef_directions[(dir + 6) & 7][1];
const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1];
- const int *sec_taps = cdef_sec_taps[(pri_strength >> coeff_shift) & 1];
+ const int *sec_taps = cdef_sec_taps;
if (pri_strength)
pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength));
@@ -393,7 +392,6 @@ void SIMD_FUNC(cdef_filter_block_8x8_8)(uint8_t *dst, int dstride,
const uint16_t *in, int pri_strength,
int sec_strength, int dir,
int pri_damping, int sec_damping,
- AOM_UNUSED int max_unused,
int coeff_shift) {
int i;
v128 p0, p1, p2, p3;
@@ -407,7 +405,7 @@ void SIMD_FUNC(cdef_filter_block_8x8_8)(uint8_t *dst, int dstride,
int s2o2 = cdef_directions[(dir + 6) & 7][1];
const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1];
- const int *sec_taps = cdef_sec_taps[(pri_strength >> coeff_shift) & 1];
+ const int *sec_taps = cdef_sec_taps;
if (pri_strength)
pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength));
@@ -541,7 +539,6 @@ void SIMD_FUNC(cdef_filter_block_4x4_16)(uint16_t *dst, int dstride,
const uint16_t *in, int pri_strength,
int sec_strength, int dir,
int pri_damping, int sec_damping,
- AOM_UNUSED int max_unused,
int coeff_shift) {
int i;
v256 p0, p1, p2, p3, sum, row, res;
@@ -554,7 +551,7 @@ void SIMD_FUNC(cdef_filter_block_4x4_16)(uint16_t *dst, int dstride,
int s2o2 = cdef_directions[(dir + 6) & 7][1];
const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1];
- const int *sec_taps = cdef_sec_taps[(pri_strength >> coeff_shift) & 1];
+ const int *sec_taps = cdef_sec_taps;
if (pri_strength)
pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength));
@@ -699,7 +696,6 @@ void SIMD_FUNC(cdef_filter_block_8x8_16)(uint16_t *dst, int dstride,
const uint16_t *in, int pri_strength,
int sec_strength, int dir,
int pri_damping, int sec_damping,
- AOM_UNUSED int max_unused,
int coeff_shift) {
int i;
v256 sum, p0, p1, p2, p3, row, res;
@@ -712,7 +708,7 @@ void SIMD_FUNC(cdef_filter_block_8x8_16)(uint16_t *dst, int dstride,
int s2o2 = cdef_directions[(dir + 6) & 7][1];
const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1];
- const int *sec_taps = cdef_sec_taps[(pri_strength >> coeff_shift) & 1];
+ const int *sec_taps = cdef_sec_taps;
if (pri_strength)
pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength));
@@ -833,63 +829,62 @@ void SIMD_FUNC(cdef_filter_block_8x8_16)(uint16_t *dst, int dstride,
void SIMD_FUNC(cdef_filter_block)(uint8_t *dst8, uint16_t *dst16, int dstride,
const uint16_t *in, int pri_strength,
int sec_strength, int dir, int pri_damping,
- int sec_damping, int bsize, int max,
- int coeff_shift) {
+ int sec_damping, int bsize, int coeff_shift) {
if (dst8) {
if (bsize == BLOCK_8X8) {
SIMD_FUNC(cdef_filter_block_8x8_8)
(dst8, dstride, in, pri_strength, sec_strength, dir, pri_damping,
- sec_damping, max, coeff_shift);
+ sec_damping, coeff_shift);
} else if (bsize == BLOCK_4X8) {
SIMD_FUNC(cdef_filter_block_4x4_8)
(dst8, dstride, in, pri_strength, sec_strength, dir, pri_damping,
- sec_damping, max, coeff_shift);
+ sec_damping, coeff_shift);
SIMD_FUNC(cdef_filter_block_4x4_8)
(dst8 + 4 * dstride, dstride, in + 4 * CDEF_BSTRIDE, pri_strength,
- sec_strength, dir, pri_damping, sec_damping, max, coeff_shift);
+ sec_strength, dir, pri_damping, sec_damping, coeff_shift);
} else if (bsize == BLOCK_8X4) {
SIMD_FUNC(cdef_filter_block_4x4_8)
(dst8, dstride, in, pri_strength, sec_strength, dir, pri_damping,
- sec_damping, max, coeff_shift);
+ sec_damping, coeff_shift);
SIMD_FUNC(cdef_filter_block_4x4_8)
(dst8 + 4, dstride, in + 4, pri_strength, sec_strength, dir, pri_damping,
- sec_damping, max, coeff_shift);
+ sec_damping, coeff_shift);
} else {
SIMD_FUNC(cdef_filter_block_4x4_8)
(dst8, dstride, in, pri_strength, sec_strength, dir, pri_damping,
- sec_damping, max, coeff_shift);
+ sec_damping, coeff_shift);
}
} else {
if (bsize == BLOCK_8X8) {
SIMD_FUNC(cdef_filter_block_8x8_16)
(dst16, dstride, in, pri_strength, sec_strength, dir, pri_damping,
- sec_damping, max, coeff_shift);
+ sec_damping, coeff_shift);
} else if (bsize == BLOCK_4X8) {
SIMD_FUNC(cdef_filter_block_4x4_16)
(dst16, dstride, in, pri_strength, sec_strength, dir, pri_damping,
- sec_damping, max, coeff_shift);
+ sec_damping, coeff_shift);
SIMD_FUNC(cdef_filter_block_4x4_16)
(dst16 + 4 * dstride, dstride, in + 4 * CDEF_BSTRIDE, pri_strength,
- sec_strength, dir, pri_damping, sec_damping, max, coeff_shift);
+ sec_strength, dir, pri_damping, sec_damping, coeff_shift);
} else if (bsize == BLOCK_8X4) {
SIMD_FUNC(cdef_filter_block_4x4_16)
(dst16, dstride, in, pri_strength, sec_strength, dir, pri_damping,
- sec_damping, max, coeff_shift);
+ sec_damping, coeff_shift);
SIMD_FUNC(cdef_filter_block_4x4_16)
(dst16 + 4, dstride, in + 4, pri_strength, sec_strength, dir, pri_damping,
- sec_damping, max, coeff_shift);
+ sec_damping, coeff_shift);
} else {
assert(bsize == BLOCK_4X4);
SIMD_FUNC(cdef_filter_block_4x4_16)
(dst16, dstride, in, pri_strength, sec_strength, dir, pri_damping,
- sec_damping, max, coeff_shift);
+ sec_damping, coeff_shift);
}
}
}
-void SIMD_FUNC(copy_rect8_8bit_to_16bit)(uint16_t *dst, int dstride,
- const uint8_t *src, int sstride, int v,
- int h) {
+void SIMD_FUNC(cdef_copy_rect8_8bit_to_16bit)(uint16_t *dst, int dstride,
+ const uint8_t *src, int sstride,
+ int v, int h) {
int i, j;
for (i = 0; i < v; i++) {
for (j = 0; j < (h & ~0x7); j += 8) {
@@ -902,9 +897,9 @@ void SIMD_FUNC(copy_rect8_8bit_to_16bit)(uint16_t *dst, int dstride,
}
}
-void SIMD_FUNC(copy_rect8_16bit_to_16bit)(uint16_t *dst, int dstride,
- const uint16_t *src, int sstride,
- int v, int h) {
+void SIMD_FUNC(cdef_copy_rect8_16bit_to_16bit)(uint16_t *dst, int dstride,
+ const uint16_t *src, int sstride,
+ int v, int h) {
int i, j;
for (i = 0; i < v; i++) {
for (j = 0; j < (h & ~0x7); j += 8) {
diff --git a/media/libaom/src/av1/common/cfl.c b/media/libaom/src/av1/common/cfl.c
index ccc59b4eb..98199cb95 100644
--- a/media/libaom/src/av1/common/cfl.c
+++ b/media/libaom/src/av1/common/cfl.c
@@ -9,9 +9,9 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
+#include "av1/common/av1_common_int.h"
#include "av1/common/cfl.h"
#include "av1/common/common_data.h"
-#include "av1/common/onyxc_int.h"
#include "config/av1_rtcd.h"
@@ -37,7 +37,7 @@ void cfl_store_dc_pred(MACROBLOCKD *const xd, const uint8_t *input,
assert(pred_plane < CFL_PRED_PLANES);
assert(width <= CFL_BUF_LINE);
- if (get_bitdepth_data_path_index(xd)) {
+ if (is_cur_buf_hbd(xd)) {
uint16_t *const input_16 = CONVERT_TO_SHORTPTR(input);
memcpy(xd->cfl.dc_pred_cache[pred_plane], input_16, width << 1);
return;
@@ -69,7 +69,7 @@ void cfl_load_dc_pred(MACROBLOCKD *const xd, uint8_t *dst, int dst_stride,
assert(pred_plane < CFL_PRED_PLANES);
assert(width <= CFL_BUF_LINE);
assert(height <= CFL_BUF_LINE);
- if (get_bitdepth_data_path_index(xd)) {
+ if (is_cur_buf_hbd(xd)) {
uint16_t *dst_16 = CONVERT_TO_SHORTPTR(dst);
cfl_load_dc_pred_hbd(xd->cfl.dc_pred_cache[pred_plane], dst_16, dst_stride,
width, height);
@@ -136,7 +136,7 @@ static void subtract_average_c(const uint16_t *src, int16_t *dst, int width,
CFL_SUB_AVG_FN(c)
-static INLINE int cfl_idx_to_alpha(int alpha_idx, int joint_sign,
+static INLINE int cfl_idx_to_alpha(uint8_t alpha_idx, int8_t joint_sign,
CFL_PRED_TYPE pred_type) {
const int alpha_sign = (pred_type == CFL_PRED_U) ? CFL_SIGN_U(joint_sign)
: CFL_SIGN_V(joint_sign);
@@ -158,18 +158,9 @@ static INLINE void cfl_predict_lbd_c(const int16_t *ac_buf_q3, uint8_t *dst,
}
}
-// Null function used for invalid tx_sizes
-void cfl_predict_lbd_null(const int16_t *ac_buf_q3, uint8_t *dst,
- int dst_stride, int alpha_q3) {
- (void)ac_buf_q3;
- (void)dst;
- (void)dst_stride;
- (void)alpha_q3;
- assert(0);
-}
-
CFL_PREDICT_FN(c, lbd)
+#if CONFIG_AV1_HIGHBITDEPTH
void cfl_predict_hbd_c(const int16_t *ac_buf_q3, uint16_t *dst, int dst_stride,
int alpha_q3, int bit_depth, int width, int height) {
for (int j = 0; j < height; j++) {
@@ -182,18 +173,8 @@ void cfl_predict_hbd_c(const int16_t *ac_buf_q3, uint16_t *dst, int dst_stride,
}
}
-// Null function used for invalid tx_sizes
-void cfl_predict_hbd_null(const int16_t *ac_buf_q3, uint16_t *dst,
- int dst_stride, int alpha_q3, int bd) {
- (void)ac_buf_q3;
- (void)dst;
- (void)dst_stride;
- (void)alpha_q3;
- (void)bd;
- assert(0);
-}
-
CFL_PREDICT_FN(c, hbd)
+#endif
static void cfl_compute_parameters(MACROBLOCKD *const xd, TX_SIZE tx_size) {
CFL_CTX *const cfl = &xd->cfl;
@@ -201,7 +182,7 @@ static void cfl_compute_parameters(MACROBLOCKD *const xd, TX_SIZE tx_size) {
assert(cfl->are_parameters_computed == 0);
cfl_pad(cfl, tx_size_wide[tx_size], tx_size_high[tx_size]);
- get_subtract_average_fn(tx_size)(cfl->recon_buf_q3, cfl->ac_buf_q3);
+ cfl_get_subtract_average_fn(tx_size)(cfl->recon_buf_q3, cfl->ac_buf_q3);
cfl->are_parameters_computed = 1;
}
@@ -217,31 +198,15 @@ void cfl_predict_block(MACROBLOCKD *const xd, uint8_t *dst, int dst_stride,
cfl_idx_to_alpha(mbmi->cfl_alpha_idx, mbmi->cfl_alpha_signs, plane - 1);
assert((tx_size_high[tx_size] - 1) * CFL_BUF_LINE + tx_size_wide[tx_size] <=
CFL_BUF_SQUARE);
- if (get_bitdepth_data_path_index(xd)) {
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (is_cur_buf_hbd(xd)) {
uint16_t *dst_16 = CONVERT_TO_SHORTPTR(dst);
- get_predict_hbd_fn(tx_size)(cfl->ac_buf_q3, dst_16, dst_stride, alpha_q3,
- xd->bd);
+ cfl_get_predict_hbd_fn(tx_size)(cfl->ac_buf_q3, dst_16, dst_stride,
+ alpha_q3, xd->bd);
return;
}
- get_predict_lbd_fn(tx_size)(cfl->ac_buf_q3, dst, dst_stride, alpha_q3);
-}
-
-// Null function used for invalid tx_sizes
-void cfl_subsample_lbd_null(const uint8_t *input, int input_stride,
- uint16_t *output_q3) {
- (void)input;
- (void)input_stride;
- (void)output_q3;
- assert(0);
-}
-
-// Null function used for invalid tx_sizes
-void cfl_subsample_hbd_null(const uint16_t *input, int input_stride,
- uint16_t *output_q3) {
- (void)input;
- (void)input_stride;
- (void)output_q3;
- assert(0);
+#endif
+ cfl_get_predict_lbd_fn(tx_size)(cfl->ac_buf_q3, dst, dst_stride, alpha_q3);
}
static void cfl_luma_subsampling_420_lbd_c(const uint8_t *input,
@@ -287,6 +252,7 @@ static void cfl_luma_subsampling_444_lbd_c(const uint8_t *input,
}
}
+#if CONFIG_AV1_HIGHBITDEPTH
static void cfl_luma_subsampling_420_hbd_c(const uint16_t *input,
int input_stride,
uint16_t *output_q3, int width,
@@ -329,9 +295,11 @@ static void cfl_luma_subsampling_444_hbd_c(const uint16_t *input,
output_q3 += CFL_BUF_LINE;
}
}
+#endif
CFL_GET_SUBSAMPLE_FUNCTION(c)
+#if CONFIG_AV1_HIGHBITDEPTH
static INLINE cfl_subsample_hbd_fn cfl_subsampling_hbd(TX_SIZE tx_size,
int sub_x, int sub_y) {
if (sub_x == 1) {
@@ -342,6 +310,7 @@ static INLINE cfl_subsample_hbd_fn cfl_subsampling_hbd(TX_SIZE tx_size,
}
return cfl_get_luma_subsampling_444_hbd(tx_size);
}
+#endif
static INLINE cfl_subsample_lbd_fn cfl_subsampling_lbd(TX_SIZE tx_size,
int sub_x, int sub_y) {
@@ -358,7 +327,7 @@ static void cfl_store(CFL_CTX *cfl, const uint8_t *input, int input_stride,
int row, int col, TX_SIZE tx_size, int use_hbd) {
const int width = tx_size_wide[tx_size];
const int height = tx_size_high[tx_size];
- const int tx_off_log2 = tx_size_wide_log2[0];
+ const int tx_off_log2 = MI_SIZE_LOG2;
const int sub_x = cfl->subsampling_x;
const int sub_y = cfl->subsampling_y;
const int store_row = row << (tx_off_log2 - sub_y);
@@ -387,7 +356,7 @@ static void cfl_store(CFL_CTX *cfl, const uint8_t *input, int input_stride,
// Store the input into the CfL pixel buffer
uint16_t *recon_buf_q3 =
cfl->recon_buf_q3 + (store_row * CFL_BUF_LINE + store_col);
-
+#if CONFIG_AV1_HIGHBITDEPTH
if (use_hbd) {
cfl_subsampling_hbd(tx_size, sub_x, sub_y)(CONVERT_TO_SHORTPTR(input),
input_stride, recon_buf_q3);
@@ -395,20 +364,25 @@ static void cfl_store(CFL_CTX *cfl, const uint8_t *input, int input_stride,
cfl_subsampling_lbd(tx_size, sub_x, sub_y)(input, input_stride,
recon_buf_q3);
}
+#else
+ (void)use_hbd;
+ cfl_subsampling_lbd(tx_size, sub_x, sub_y)(input, input_stride, recon_buf_q3);
+#endif
}
// Adjust the row and column of blocks smaller than 8X8, as chroma-referenced
// and non-chroma-referenced blocks are stored together in the CfL buffer.
-static INLINE void sub8x8_adjust_offset(const CFL_CTX *cfl, int *row_out,
+static INLINE void sub8x8_adjust_offset(const CFL_CTX *cfl, int mi_row,
+ int mi_col, int *row_out,
int *col_out) {
// Increment row index for bottom: 8x4, 16x4 or both bottom 4x4s.
- if ((cfl->mi_row & 0x01) && cfl->subsampling_y) {
+ if ((mi_row & 0x01) && cfl->subsampling_y) {
assert(*row_out == 0);
(*row_out)++;
}
// Increment col index for right: 4x8, 4x16 or both right 4x4s.
- if ((cfl->mi_col & 0x01) && cfl->subsampling_x) {
+ if ((mi_col & 0x01) && cfl->subsampling_x) {
assert(*col_out == 0);
(*col_out)++;
}
@@ -418,17 +392,31 @@ void cfl_store_tx(MACROBLOCKD *const xd, int row, int col, TX_SIZE tx_size,
BLOCK_SIZE bsize) {
CFL_CTX *const cfl = &xd->cfl;
struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_Y];
- uint8_t *dst =
- &pd->dst.buf[(row * pd->dst.stride + col) << tx_size_wide_log2[0]];
+ uint8_t *dst = &pd->dst.buf[(row * pd->dst.stride + col) << MI_SIZE_LOG2];
if (block_size_high[bsize] == 4 || block_size_wide[bsize] == 4) {
// Only dimensions of size 4 can have an odd offset.
assert(!((col & 1) && tx_size_wide[tx_size] != 4));
assert(!((row & 1) && tx_size_high[tx_size] != 4));
- sub8x8_adjust_offset(cfl, &row, &col);
+ sub8x8_adjust_offset(cfl, xd->mi_row, xd->mi_col, &row, &col);
}
- cfl_store(cfl, dst, pd->dst.stride, row, col, tx_size,
- get_bitdepth_data_path_index(xd));
+ cfl_store(cfl, dst, pd->dst.stride, row, col, tx_size, is_cur_buf_hbd(xd));
+}
+
+static INLINE int max_intra_block_width(const MACROBLOCKD *xd,
+ BLOCK_SIZE plane_bsize, int plane,
+ TX_SIZE tx_size) {
+ const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane)
+ << MI_SIZE_LOG2;
+ return ALIGN_POWER_OF_TWO(max_blocks_wide, tx_size_wide_log2[tx_size]);
+}
+
+static INLINE int max_intra_block_height(const MACROBLOCKD *xd,
+ BLOCK_SIZE plane_bsize, int plane,
+ TX_SIZE tx_size) {
+ const int max_blocks_high = max_block_high(xd, plane_bsize, plane)
+ << MI_SIZE_LOG2;
+ return ALIGN_POWER_OF_TWO(max_blocks_high, tx_size_high_log2[tx_size]);
}
void cfl_store_block(MACROBLOCKD *const xd, BLOCK_SIZE bsize, TX_SIZE tx_size) {
@@ -438,11 +426,11 @@ void cfl_store_block(MACROBLOCKD *const xd, BLOCK_SIZE bsize, TX_SIZE tx_size) {
int col = 0;
if (block_size_high[bsize] == 4 || block_size_wide[bsize] == 4) {
- sub8x8_adjust_offset(cfl, &row, &col);
+ sub8x8_adjust_offset(cfl, xd->mi_row, xd->mi_col, &row, &col);
}
const int width = max_intra_block_width(xd, bsize, AOM_PLANE_Y, tx_size);
const int height = max_intra_block_height(xd, bsize, AOM_PLANE_Y, tx_size);
tx_size = get_tx_size(width, height);
cfl_store(cfl, pd->dst.buf, pd->dst.stride, row, col, tx_size,
- get_bitdepth_data_path_index(xd));
+ is_cur_buf_hbd(xd));
}
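
Behind the renamed dispatch helpers, the combine step that cfl_predict_block() ends up running is unchanged: the DC-predicted chroma sample already in dst is offset by the scaled luma AC value. A sketch of one such sample for the 8-bit path, assuming the usual Q-format convention (alpha_q3 and ac_q3 are Q3, so their product is Q6 and is rounded back to Q0 with signed rounding); the high-bitdepth path clips to the frame bit depth instead of 255:

#include <stdint.h>

static int round_q6_signed(int v) {
  return v < 0 ? -((-v + 32) >> 6) : (v + 32) >> 6;
}

static uint8_t cfl_combine_sketch(uint8_t dc_pred, int alpha_q3, int16_t ac_q3) {
  int px = dc_pred + round_q6_signed(alpha_q3 * ac_q3);
  if (px < 0) px = 0;
  if (px > 255) px = 255;   /* 8-bit clip */
  return (uint8_t)px;
}
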
diff --git a/media/libaom/src/av1/common/cfl.h b/media/libaom/src/av1/common/cfl.h
index d627891bf..a1d6dc2ea 100644
--- a/media/libaom/src/av1/common/cfl.h
+++ b/media/libaom/src/av1/common/cfl.h
@@ -12,8 +12,8 @@
#ifndef AOM_AV1_COMMON_CFL_H_
#define AOM_AV1_COMMON_CFL_H_
+#include "av1/common/av1_common_int.h"
#include "av1/common/blockd.h"
-#include "av1/common/onyxc_int.h"
// Can we use CfL for the current block?
static INLINE CFL_ALLOWED_TYPE is_cfl_allowed(const MACROBLOCKD *xd) {
@@ -41,7 +41,7 @@ static INLINE CFL_ALLOWED_TYPE store_cfl_required(const AV1_COMMON *cm,
if (cm->seq_params.monochrome) return CFL_DISALLOWED;
- if (!xd->cfl.is_chroma_reference) {
+ if (!xd->is_chroma_ref) {
// For non-chroma-reference blocks, we should always store the luma pixels,
// in case the corresponding chroma-reference block uses CfL.
// Note that this can only happen for block sizes which are <8 on
@@ -80,14 +80,6 @@ void cfl_store_dc_pred(MACROBLOCKD *const xd, const uint8_t *input,
void cfl_load_dc_pred(MACROBLOCKD *const xd, uint8_t *dst, int dst_stride,
TX_SIZE tx_size, CFL_PRED_TYPE pred_plane);
-// Null function used for invalid tx_sizes
-void cfl_subsample_lbd_null(const uint8_t *input, int input_stride,
- uint16_t *output_q3);
-
-// Null function used for invalid tx_sizes
-void cfl_subsample_hbd_null(const uint16_t *input, int input_stride,
- uint16_t *output_q3);
-
// Allows the CFL_SUBSAMPLE function to switch types depending on the bitdepth.
#define CFL_lbd_TYPE uint8_t *cfl_type
#define CFL_hbd_TYPE uint16_t *cfl_type
@@ -97,7 +89,7 @@ void cfl_subsample_hbd_null(const uint16_t *input, int input_stride,
// will be constant allowing for loop unrolling and other constant propagated
// goodness.
#define CFL_SUBSAMPLE(arch, sub, bd, width, height) \
- void subsample_##bd##_##sub##_##width##x##height##_##arch( \
+ void cfl_subsample_##bd##_##sub##_##width##x##height##_##arch( \
const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) { \
cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride, \
output_q3, width, height); \
@@ -127,31 +119,32 @@ void cfl_subsample_hbd_null(const uint16_t *input, int input_stride,
// Declare an architecture-specific array of function pointers for size-specific
// wrappers.
-#define CFL_SUBSAMPLE_FUNCTION_ARRAY(arch, sub, bd) \
- static const cfl_subsample_##bd##_fn subfn_##sub[TX_SIZES_ALL] = { \
- subsample_##bd##_##sub##_4x4_##arch, /* 4x4 */ \
- subsample_##bd##_##sub##_8x8_##arch, /* 8x8 */ \
- subsample_##bd##_##sub##_16x16_##arch, /* 16x16 */ \
- subsample_##bd##_##sub##_32x32_##arch, /* 32x32 */ \
- cfl_subsample_##bd##_null, /* 64x64 (invalid CFL size) */ \
- subsample_##bd##_##sub##_4x8_##arch, /* 4x8 */ \
- subsample_##bd##_##sub##_8x4_##arch, /* 8x4 */ \
- subsample_##bd##_##sub##_8x16_##arch, /* 8x16 */ \
- subsample_##bd##_##sub##_16x8_##arch, /* 16x8 */ \
- subsample_##bd##_##sub##_16x32_##arch, /* 16x32 */ \
- subsample_##bd##_##sub##_32x16_##arch, /* 32x16 */ \
- cfl_subsample_##bd##_null, /* 32x64 (invalid CFL size) */ \
- cfl_subsample_##bd##_null, /* 64x32 (invalid CFL size) */ \
- subsample_##bd##_##sub##_4x16_##arch, /* 4x16 */ \
- subsample_##bd##_##sub##_16x4_##arch, /* 16x4 */ \
- subsample_##bd##_##sub##_8x32_##arch, /* 8x32 */ \
- subsample_##bd##_##sub##_32x8_##arch, /* 32x8 */ \
- cfl_subsample_##bd##_null, /* 16x64 (invalid CFL size) */ \
- cfl_subsample_##bd##_null, /* 64x16 (invalid CFL size) */ \
+#define CFL_SUBSAMPLE_FUNCTION_ARRAY(arch, sub, bd) \
+ static const cfl_subsample_##bd##_fn subfn_##sub[TX_SIZES_ALL] = { \
+ cfl_subsample_##bd##_##sub##_4x4_##arch, /* 4x4 */ \
+ cfl_subsample_##bd##_##sub##_8x8_##arch, /* 8x8 */ \
+ cfl_subsample_##bd##_##sub##_16x16_##arch, /* 16x16 */ \
+ cfl_subsample_##bd##_##sub##_32x32_##arch, /* 32x32 */ \
+ NULL, /* 64x64 (invalid CFL size) */ \
+ cfl_subsample_##bd##_##sub##_4x8_##arch, /* 4x8 */ \
+ cfl_subsample_##bd##_##sub##_8x4_##arch, /* 8x4 */ \
+ cfl_subsample_##bd##_##sub##_8x16_##arch, /* 8x16 */ \
+ cfl_subsample_##bd##_##sub##_16x8_##arch, /* 16x8 */ \
+ cfl_subsample_##bd##_##sub##_16x32_##arch, /* 16x32 */ \
+ cfl_subsample_##bd##_##sub##_32x16_##arch, /* 32x16 */ \
+ NULL, /* 32x64 (invalid CFL size) */ \
+ NULL, /* 64x32 (invalid CFL size) */ \
+ cfl_subsample_##bd##_##sub##_4x16_##arch, /* 4x16 */ \
+ cfl_subsample_##bd##_##sub##_16x4_##arch, /* 16x4 */ \
+ cfl_subsample_##bd##_##sub##_8x32_##arch, /* 8x32 */ \
+ cfl_subsample_##bd##_##sub##_32x8_##arch, /* 32x8 */ \
+ NULL, /* 16x64 (invalid CFL size) */ \
+ NULL, /* 64x16 (invalid CFL size) */ \
};
// The RTCD script does not support passing in an array, so we wrap it in this
// function.
+#if CONFIG_AV1_HIGHBITDEPTH
#define CFL_GET_SUBSAMPLE_FUNCTION(arch) \
CFL_SUBSAMPLE_FUNCTIONS(arch, 420, lbd) \
CFL_SUBSAMPLE_FUNCTIONS(arch, 422, lbd) \
@@ -159,144 +152,137 @@ void cfl_subsample_hbd_null(const uint16_t *input, int input_stride,
CFL_SUBSAMPLE_FUNCTIONS(arch, 420, hbd) \
CFL_SUBSAMPLE_FUNCTIONS(arch, 422, hbd) \
CFL_SUBSAMPLE_FUNCTIONS(arch, 444, hbd)
-
-// Null function used for invalid tx_sizes
-static INLINE void cfl_subtract_average_null(const uint16_t *src,
- int16_t *dst) {
- (void)dst;
- (void)src;
- assert(0);
-}
+#else
+#define CFL_GET_SUBSAMPLE_FUNCTION(arch) \
+ CFL_SUBSAMPLE_FUNCTIONS(arch, 420, lbd) \
+ CFL_SUBSAMPLE_FUNCTIONS(arch, 422, lbd) \
+ CFL_SUBSAMPLE_FUNCTIONS(arch, 444, lbd)
+#endif
// Declare a size-specific wrapper for the size-generic function. The compiler
// will inline the size generic function in here, the advantage is that the size
// will be constant allowing for loop unrolling and other constant propagated
// goodness.
-#define CFL_SUB_AVG_X(arch, width, height, round_offset, num_pel_log2) \
- void subtract_average_##width##x##height##_##arch(const uint16_t *src, \
- int16_t *dst) { \
- subtract_average_##arch(src, dst, width, height, round_offset, \
- num_pel_log2); \
+#define CFL_SUB_AVG_X(arch, width, height, round_offset, num_pel_log2) \
+ void cfl_subtract_average_##width##x##height##_##arch(const uint16_t *src, \
+ int16_t *dst) { \
+ subtract_average_##arch(src, dst, width, height, round_offset, \
+ num_pel_log2); \
}
// Declare size-specific wrappers for all valid CfL sizes.
-#define CFL_SUB_AVG_FN(arch) \
- CFL_SUB_AVG_X(arch, 4, 4, 8, 4) \
- CFL_SUB_AVG_X(arch, 4, 8, 16, 5) \
- CFL_SUB_AVG_X(arch, 4, 16, 32, 6) \
- CFL_SUB_AVG_X(arch, 8, 4, 16, 5) \
- CFL_SUB_AVG_X(arch, 8, 8, 32, 6) \
- CFL_SUB_AVG_X(arch, 8, 16, 64, 7) \
- CFL_SUB_AVG_X(arch, 8, 32, 128, 8) \
- CFL_SUB_AVG_X(arch, 16, 4, 32, 6) \
- CFL_SUB_AVG_X(arch, 16, 8, 64, 7) \
- CFL_SUB_AVG_X(arch, 16, 16, 128, 8) \
- CFL_SUB_AVG_X(arch, 16, 32, 256, 9) \
- CFL_SUB_AVG_X(arch, 32, 8, 128, 8) \
- CFL_SUB_AVG_X(arch, 32, 16, 256, 9) \
- CFL_SUB_AVG_X(arch, 32, 32, 512, 10) \
- cfl_subtract_average_fn get_subtract_average_fn_##arch(TX_SIZE tx_size) { \
- static const cfl_subtract_average_fn sub_avg[TX_SIZES_ALL] = { \
- subtract_average_4x4_##arch, /* 4x4 */ \
- subtract_average_8x8_##arch, /* 8x8 */ \
- subtract_average_16x16_##arch, /* 16x16 */ \
- subtract_average_32x32_##arch, /* 32x32 */ \
- cfl_subtract_average_null, /* 64x64 (invalid CFL size) */ \
- subtract_average_4x8_##arch, /* 4x8 */ \
- subtract_average_8x4_##arch, /* 8x4 */ \
- subtract_average_8x16_##arch, /* 8x16 */ \
- subtract_average_16x8_##arch, /* 16x8 */ \
- subtract_average_16x32_##arch, /* 16x32 */ \
- subtract_average_32x16_##arch, /* 32x16 */ \
- cfl_subtract_average_null, /* 32x64 (invalid CFL size) */ \
- cfl_subtract_average_null, /* 64x32 (invalid CFL size) */ \
- subtract_average_4x16_##arch, /* 4x16 (invalid CFL size) */ \
- subtract_average_16x4_##arch, /* 16x4 (invalid CFL size) */ \
- subtract_average_8x32_##arch, /* 8x32 (invalid CFL size) */ \
- subtract_average_32x8_##arch, /* 32x8 (invalid CFL size) */ \
- cfl_subtract_average_null, /* 16x64 (invalid CFL size) */ \
- cfl_subtract_average_null, /* 64x16 (invalid CFL size) */ \
- }; \
- /* Modulo TX_SIZES_ALL to ensure that an attacker won't be able to */ \
- /* index the function pointer array out of bounds. */ \
- return sub_avg[tx_size % TX_SIZES_ALL]; \
+#define CFL_SUB_AVG_FN(arch) \
+ CFL_SUB_AVG_X(arch, 4, 4, 8, 4) \
+ CFL_SUB_AVG_X(arch, 4, 8, 16, 5) \
+ CFL_SUB_AVG_X(arch, 4, 16, 32, 6) \
+ CFL_SUB_AVG_X(arch, 8, 4, 16, 5) \
+ CFL_SUB_AVG_X(arch, 8, 8, 32, 6) \
+ CFL_SUB_AVG_X(arch, 8, 16, 64, 7) \
+ CFL_SUB_AVG_X(arch, 8, 32, 128, 8) \
+ CFL_SUB_AVG_X(arch, 16, 4, 32, 6) \
+ CFL_SUB_AVG_X(arch, 16, 8, 64, 7) \
+ CFL_SUB_AVG_X(arch, 16, 16, 128, 8) \
+ CFL_SUB_AVG_X(arch, 16, 32, 256, 9) \
+ CFL_SUB_AVG_X(arch, 32, 8, 128, 8) \
+ CFL_SUB_AVG_X(arch, 32, 16, 256, 9) \
+ CFL_SUB_AVG_X(arch, 32, 32, 512, 10) \
+ cfl_subtract_average_fn cfl_get_subtract_average_fn_##arch( \
+ TX_SIZE tx_size) { \
+ static const cfl_subtract_average_fn sub_avg[TX_SIZES_ALL] = { \
+ cfl_subtract_average_4x4_##arch, /* 4x4 */ \
+ cfl_subtract_average_8x8_##arch, /* 8x8 */ \
+ cfl_subtract_average_16x16_##arch, /* 16x16 */ \
+ cfl_subtract_average_32x32_##arch, /* 32x32 */ \
+ NULL, /* 64x64 (invalid CFL size) */ \
+ cfl_subtract_average_4x8_##arch, /* 4x8 */ \
+ cfl_subtract_average_8x4_##arch, /* 8x4 */ \
+ cfl_subtract_average_8x16_##arch, /* 8x16 */ \
+ cfl_subtract_average_16x8_##arch, /* 16x8 */ \
+ cfl_subtract_average_16x32_##arch, /* 16x32 */ \
+ cfl_subtract_average_32x16_##arch, /* 32x16 */ \
+ NULL, /* 32x64 (invalid CFL size) */ \
+ NULL, /* 64x32 (invalid CFL size) */ \
+ cfl_subtract_average_4x16_##arch, /* 4x16 (invalid CFL size) */ \
+ cfl_subtract_average_16x4_##arch, /* 16x4 (invalid CFL size) */ \
+ cfl_subtract_average_8x32_##arch, /* 8x32 (invalid CFL size) */ \
+ cfl_subtract_average_32x8_##arch, /* 32x8 (invalid CFL size) */ \
+ NULL, /* 16x64 (invalid CFL size) */ \
+ NULL, /* 64x16 (invalid CFL size) */ \
+ }; \
+ /* Modulo TX_SIZES_ALL to ensure that an attacker won't be able to */ \
+ /* index the function pointer array out of bounds. */ \
+ return sub_avg[tx_size % TX_SIZES_ALL]; \
}
// For VSX SIMD optimization, the C versions of width == 4 subtract are
// faster than the VSX. As such, the VSX code calls the C versions.
-void subtract_average_4x4_c(const uint16_t *src, int16_t *dst);
-void subtract_average_4x8_c(const uint16_t *src, int16_t *dst);
-void subtract_average_4x16_c(const uint16_t *src, int16_t *dst);
-
-#define CFL_PREDICT_lbd(arch, width, height) \
- void predict_lbd_##width##x##height##_##arch(const int16_t *pred_buf_q3, \
- uint8_t *dst, int dst_stride, \
- int alpha_q3) { \
- cfl_predict_lbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, width, \
- height); \
+void cfl_subtract_average_4x4_c(const uint16_t *src, int16_t *dst);
+void cfl_subtract_average_4x8_c(const uint16_t *src, int16_t *dst);
+void cfl_subtract_average_4x16_c(const uint16_t *src, int16_t *dst);
+
+#define CFL_PREDICT_lbd(arch, width, height) \
+ void cfl_predict_lbd_##width##x##height##_##arch( \
+ const int16_t *pred_buf_q3, uint8_t *dst, int dst_stride, \
+ int alpha_q3) { \
+ cfl_predict_lbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, width, \
+ height); \
}
-#define CFL_PREDICT_hbd(arch, width, height) \
- void predict_hbd_##width##x##height##_##arch(const int16_t *pred_buf_q3, \
- uint16_t *dst, int dst_stride, \
- int alpha_q3, int bd) { \
- cfl_predict_hbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, bd, width, \
- height); \
+#if CONFIG_AV1_HIGHBITDEPTH
+#define CFL_PREDICT_hbd(arch, width, height) \
+ void cfl_predict_hbd_##width##x##height##_##arch( \
+ const int16_t *pred_buf_q3, uint16_t *dst, int dst_stride, int alpha_q3, \
+ int bd) { \
+ cfl_predict_hbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, bd, width, \
+ height); \
}
+#endif
// This wrapper exists because clang format does not like calling macros with
// lowercase letters.
#define CFL_PREDICT_X(arch, width, height, bd) \
CFL_PREDICT_##bd(arch, width, height)
-// Null function used for invalid tx_sizes
-void cfl_predict_lbd_null(const int16_t *pred_buf_q3, uint8_t *dst,
- int dst_stride, int alpha_q3);
-
-// Null function used for invalid tx_sizes
-void cfl_predict_hbd_null(const int16_t *pred_buf_q3, uint16_t *dst,
- int dst_stride, int alpha_q3, int bd);
-
-#define CFL_PREDICT_FN(arch, bd) \
- CFL_PREDICT_X(arch, 4, 4, bd) \
- CFL_PREDICT_X(arch, 4, 8, bd) \
- CFL_PREDICT_X(arch, 4, 16, bd) \
- CFL_PREDICT_X(arch, 8, 4, bd) \
- CFL_PREDICT_X(arch, 8, 8, bd) \
- CFL_PREDICT_X(arch, 8, 16, bd) \
- CFL_PREDICT_X(arch, 8, 32, bd) \
- CFL_PREDICT_X(arch, 16, 4, bd) \
- CFL_PREDICT_X(arch, 16, 8, bd) \
- CFL_PREDICT_X(arch, 16, 16, bd) \
- CFL_PREDICT_X(arch, 16, 32, bd) \
- CFL_PREDICT_X(arch, 32, 8, bd) \
- CFL_PREDICT_X(arch, 32, 16, bd) \
- CFL_PREDICT_X(arch, 32, 32, bd) \
- cfl_predict_##bd##_fn get_predict_##bd##_fn_##arch(TX_SIZE tx_size) { \
- static const cfl_predict_##bd##_fn pred[TX_SIZES_ALL] = { \
- predict_##bd##_4x4_##arch, /* 4x4 */ \
- predict_##bd##_8x8_##arch, /* 8x8 */ \
- predict_##bd##_16x16_##arch, /* 16x16 */ \
- predict_##bd##_32x32_##arch, /* 32x32 */ \
- cfl_predict_##bd##_null, /* 64x64 (invalid CFL size) */ \
- predict_##bd##_4x8_##arch, /* 4x8 */ \
- predict_##bd##_8x4_##arch, /* 8x4 */ \
- predict_##bd##_8x16_##arch, /* 8x16 */ \
- predict_##bd##_16x8_##arch, /* 16x8 */ \
- predict_##bd##_16x32_##arch, /* 16x32 */ \
- predict_##bd##_32x16_##arch, /* 32x16 */ \
- cfl_predict_##bd##_null, /* 32x64 (invalid CFL size) */ \
- cfl_predict_##bd##_null, /* 64x32 (invalid CFL size) */ \
- predict_##bd##_4x16_##arch, /* 4x16 */ \
- predict_##bd##_16x4_##arch, /* 16x4 */ \
- predict_##bd##_8x32_##arch, /* 8x32 */ \
- predict_##bd##_32x8_##arch, /* 32x8 */ \
- cfl_predict_##bd##_null, /* 16x64 (invalid CFL size) */ \
- cfl_predict_##bd##_null, /* 64x16 (invalid CFL size) */ \
- }; \
- /* Modulo TX_SIZES_ALL to ensure that an attacker won't be able to */ \
- /* index the function pointer array out of bounds. */ \
- return pred[tx_size % TX_SIZES_ALL]; \
+#define CFL_PREDICT_FN(arch, bd) \
+ CFL_PREDICT_X(arch, 4, 4, bd) \
+ CFL_PREDICT_X(arch, 4, 8, bd) \
+ CFL_PREDICT_X(arch, 4, 16, bd) \
+ CFL_PREDICT_X(arch, 8, 4, bd) \
+ CFL_PREDICT_X(arch, 8, 8, bd) \
+ CFL_PREDICT_X(arch, 8, 16, bd) \
+ CFL_PREDICT_X(arch, 8, 32, bd) \
+ CFL_PREDICT_X(arch, 16, 4, bd) \
+ CFL_PREDICT_X(arch, 16, 8, bd) \
+ CFL_PREDICT_X(arch, 16, 16, bd) \
+ CFL_PREDICT_X(arch, 16, 32, bd) \
+ CFL_PREDICT_X(arch, 32, 8, bd) \
+ CFL_PREDICT_X(arch, 32, 16, bd) \
+ CFL_PREDICT_X(arch, 32, 32, bd) \
+ cfl_predict_##bd##_fn cfl_get_predict_##bd##_fn_##arch(TX_SIZE tx_size) { \
+ static const cfl_predict_##bd##_fn pred[TX_SIZES_ALL] = { \
+ cfl_predict_##bd##_4x4_##arch, /* 4x4 */ \
+ cfl_predict_##bd##_8x8_##arch, /* 8x8 */ \
+ cfl_predict_##bd##_16x16_##arch, /* 16x16 */ \
+ cfl_predict_##bd##_32x32_##arch, /* 32x32 */ \
+ NULL, /* 64x64 (invalid CFL size) */ \
+ cfl_predict_##bd##_4x8_##arch, /* 4x8 */ \
+ cfl_predict_##bd##_8x4_##arch, /* 8x4 */ \
+ cfl_predict_##bd##_8x16_##arch, /* 8x16 */ \
+ cfl_predict_##bd##_16x8_##arch, /* 16x8 */ \
+ cfl_predict_##bd##_16x32_##arch, /* 16x32 */ \
+ cfl_predict_##bd##_32x16_##arch, /* 32x16 */ \
+ NULL, /* 32x64 (invalid CFL size) */ \
+ NULL, /* 64x32 (invalid CFL size) */ \
+ cfl_predict_##bd##_4x16_##arch, /* 4x16 */ \
+ cfl_predict_##bd##_16x4_##arch, /* 16x4 */ \
+ cfl_predict_##bd##_8x32_##arch, /* 8x32 */ \
+ cfl_predict_##bd##_32x8_##arch, /* 32x8 */ \
+ NULL, /* 16x64 (invalid CFL size) */ \
+ NULL, /* 64x16 (invalid CFL size) */ \
+ }; \
+ /* Modulo TX_SIZES_ALL to ensure that an attacker won't be able to */ \
+ /* index the function pointer array out of bounds. */ \
+ return pred[tx_size % TX_SIZES_ALL]; \
}
#endif // AOM_AV1_COMMON_CFL_H_
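
The rewritten tables above drop the old *_null stubs (which only called assert(0)) in favour of plain NULL entries, and keep the tx_size % TX_SIZES_ALL guard so an out-of-range index cannot escape the array. A hedged usage sketch: the cfl_get_predict_lbd_fn() dispatch and the cfl_predict_lbd_fn pointer type are assumed to come from the generated RTCD header, and real callers never pass a 64-wide or 64-high tx_size, so the explicit NULL assert below is illustrative rather than something the library does itself:

#include <assert.h>
#include <stdint.h>

#include "config/av1_rtcd.h"   /* cfl_predict_lbd_fn, cfl_get_predict_lbd_fn */
#include "av1/common/cfl.h"

static void predict_chroma_from_luma(TX_SIZE tx_size, const int16_t *ac_buf_q3,
                                     uint8_t *dst, int dst_stride,
                                     int alpha_q3) {
  cfl_predict_lbd_fn fn = cfl_get_predict_lbd_fn(tx_size);
  assert(fn != NULL);  /* NULL marks the 64-wide/-high sizes where CfL is invalid */
  fn(ac_buf_q3, dst, dst_stride, alpha_q3);
}
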
diff --git a/media/libaom/src/av1/common/common_data.h b/media/libaom/src/av1/common/common_data.h
index 46e455fdb..402845caf 100644
--- a/media/libaom/src/av1/common/common_data.h
+++ b/media/libaom/src/av1/common/common_data.h
@@ -82,16 +82,16 @@ static const BLOCK_SIZE subsize_lookup[EXT_PARTITION_TYPES][SQR_BLOCK_SIZES] = {
BLOCK_INVALID, BLOCK_4X4, BLOCK_8X8,
BLOCK_16X16, BLOCK_32X32, BLOCK_64X64
}, { // PARTITION_HORZ_A
- BLOCK_INVALID, BLOCK_8X4, BLOCK_16X8,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X8,
BLOCK_32X16, BLOCK_64X32, BLOCK_128X64
}, { // PARTITION_HORZ_B
- BLOCK_INVALID, BLOCK_8X4, BLOCK_16X8,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X8,
BLOCK_32X16, BLOCK_64X32, BLOCK_128X64
}, { // PARTITION_VERT_A
- BLOCK_INVALID, BLOCK_4X8, BLOCK_8X16,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_8X16,
BLOCK_16X32, BLOCK_32X64, BLOCK_64X128
}, { // PARTITION_VERT_B
- BLOCK_INVALID, BLOCK_4X8, BLOCK_8X16,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_8X16,
BLOCK_16X32, BLOCK_32X64, BLOCK_64X128
}, { // PARTITION_HORZ_4
BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X4,
diff --git a/media/libaom/src/av1/common/convolve.c b/media/libaom/src/av1/common/convolve.c
index 1f11126fc..e177e3cad 100644
--- a/media/libaom/src/av1/common/convolve.c
+++ b/media/libaom/src/av1/common/convolve.c
@@ -15,10 +15,10 @@
#include "config/aom_dsp_rtcd.h"
#include "config/av1_rtcd.h"
+#include "av1/common/av1_common_int.h"
#include "av1/common/blockd.h"
#include "av1/common/convolve.h"
#include "av1/common/filter.h"
-#include "av1/common/onyxc_int.h"
#include "av1/common/resize.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_ports/mem.h"
@@ -73,15 +73,55 @@ void av1_highbd_convolve_horiz_rs_c(const uint16_t *src, int src_stride,
}
}
+void av1_convolve_2d_sobel_y_c(const uint8_t *src, int src_stride, double *dst,
+ int dst_stride, int w, int h, int dir,
+ double norm) {
+ int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
+ DECLARE_ALIGNED(256, static const int16_t, sobel_a[3]) = { 1, 0, -1 };
+ DECLARE_ALIGNED(256, static const int16_t, sobel_b[3]) = { 1, 2, 1 };
+ const int taps = 3;
+ int im_h = h + taps - 1;
+ int im_stride = w;
+ const int fo_vert = 1;
+ const int fo_horiz = 1;
+
+ // horizontal filter
+ const uint8_t *src_horiz = src - fo_vert * src_stride;
+ const int16_t *x_filter = dir ? sobel_a : sobel_b;
+ for (int y = 0; y < im_h; ++y) {
+ for (int x = 0; x < w; ++x) {
+ int16_t sum = 0;
+ for (int k = 0; k < taps; ++k) {
+ sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k];
+ }
+ im_block[y * im_stride + x] = sum;
+ }
+ }
+
+ // vertical filter
+ int16_t *src_vert = im_block + fo_vert * im_stride;
+ const int16_t *y_filter = dir ? sobel_b : sobel_a;
+ for (int y = 0; y < h; ++y) {
+ for (int x = 0; x < w; ++x) {
+ int16_t sum = 0;
+ for (int k = 0; k < taps; ++k) {
+ sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x];
+ }
+ dst[y * dst_stride + x] = sum * norm;
+ }
+ }
+}
+
void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
int dst_stride, int w, int h,
const InterpFilterParams *filter_params_x,
const InterpFilterParams *filter_params_y,
- const int subpel_x_q4, const int subpel_y_q4,
+ const int subpel_x_qn, const int subpel_y_qn,
ConvolveParams *conv_params) {
int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
int im_h = h + filter_params_y->taps - 1;
int im_stride = w;
+ assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE);
const int fo_vert = filter_params_y->taps / 2 - 1;
const int fo_horiz = filter_params_x->taps / 2 - 1;
const int bd = 8;
@@ -91,7 +131,7 @@ void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
// horizontal filter
const uint8_t *src_horiz = src - fo_vert * src_stride;
const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
- filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+ filter_params_x, subpel_x_qn & SUBPEL_MASK);
for (int y = 0; y < im_h; ++y) {
for (int x = 0; x < w; ++x) {
int32_t sum = (1 << (bd + FILTER_BITS - 1));
@@ -107,7 +147,7 @@ void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
// vertical filter
int16_t *src_vert = im_block + fo_vert * im_stride;
const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
- filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+ filter_params_y, subpel_y_qn & SUBPEL_MASK);
const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
for (int y = 0; y < h; ++y) {
for (int x = 0; x < w; ++x) {
@@ -128,11 +168,11 @@ void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
int dst_stride, int w, int h,
const InterpFilterParams *filter_params_x,
const InterpFilterParams *filter_params_y,
- const int subpel_x_q4, const int subpel_y_q4,
+ const int subpel_x_qn, const int subpel_y_qn,
ConvolveParams *conv_params) {
const int fo_vert = filter_params_y->taps / 2 - 1;
(void)filter_params_x;
- (void)subpel_x_q4;
+ (void)subpel_x_qn;
(void)conv_params;
assert(conv_params->round_0 <= FILTER_BITS);
@@ -141,7 +181,7 @@ void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
// vertical filter
const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
- filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+ filter_params_y, subpel_y_qn & SUBPEL_MASK);
for (int y = 0; y < h; ++y) {
for (int x = 0; x < w; ++x) {
int32_t res = 0;
@@ -158,12 +198,12 @@ void av1_convolve_x_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
int dst_stride, int w, int h,
const InterpFilterParams *filter_params_x,
const InterpFilterParams *filter_params_y,
- const int subpel_x_q4, const int subpel_y_q4,
+ const int subpel_x_qn, const int subpel_y_qn,
ConvolveParams *conv_params) {
const int fo_horiz = filter_params_x->taps / 2 - 1;
const int bits = FILTER_BITS - conv_params->round_0;
(void)filter_params_y;
- (void)subpel_y_q4;
+ (void)subpel_y_qn;
(void)conv_params;
assert(bits >= 0);
@@ -172,7 +212,7 @@ void av1_convolve_x_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
// horizontal filter
const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
- filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+ filter_params_x, subpel_x_qn & SUBPEL_MASK);
for (int y = 0; y < h; ++y) {
for (int x = 0; x < w; ++x) {
@@ -190,27 +230,27 @@ void av1_convolve_2d_copy_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
int dst_stride, int w, int h,
const InterpFilterParams *filter_params_x,
const InterpFilterParams *filter_params_y,
- const int subpel_x_q4, const int subpel_y_q4,
+ const int subpel_x_qn, const int subpel_y_qn,
ConvolveParams *conv_params) {
(void)filter_params_x;
(void)filter_params_y;
- (void)subpel_x_q4;
- (void)subpel_y_q4;
+ (void)subpel_x_qn;
+ (void)subpel_y_qn;
(void)conv_params;
for (int y = 0; y < h; ++y) {
- memcpy(dst + y * dst_stride, src + y * src_stride, w * sizeof(src[0]));
+ memmove(dst + y * dst_stride, src + y * src_stride, w * sizeof(src[0]));
}
}
-void av1_jnt_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst8,
- int dst8_stride, int w, int h,
- const InterpFilterParams *filter_params_x,
- const InterpFilterParams *filter_params_y,
- const int subpel_x_q4, const int subpel_y_q4,
- ConvolveParams *conv_params) {
- CONV_BUF_TYPE *dst = conv_params->dst;
- int dst_stride = conv_params->dst_stride;
+void av1_dist_wtd_convolve_2d_c(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_qn, const int subpel_y_qn,
+ ConvolveParams *conv_params) {
+ CONV_BUF_TYPE *dst16 = conv_params->dst;
+ int dst16_stride = conv_params->dst_stride;
int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
int im_h = h + filter_params_y->taps - 1;
int im_stride = w;
@@ -223,7 +263,7 @@ void av1_jnt_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst8,
// horizontal filter
const uint8_t *src_horiz = src - fo_vert * src_stride;
const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
- filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+ filter_params_x, subpel_x_qn & SUBPEL_MASK);
for (int y = 0; y < im_h; ++y) {
for (int x = 0; x < w; ++x) {
int32_t sum = (1 << (bd + FILTER_BITS - 1));
@@ -239,7 +279,7 @@ void av1_jnt_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst8,
// vertical filter
int16_t *src_vert = im_block + fo_vert * im_stride;
const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
- filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+ filter_params_y, subpel_y_qn & SUBPEL_MASK);
const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
for (int y = 0; y < h; ++y) {
for (int x = 0; x < w; ++x) {
@@ -250,8 +290,8 @@ void av1_jnt_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst8,
assert(0 <= sum && sum < (1 << (offset_bits + 2)));
CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
if (conv_params->do_average) {
- int32_t tmp = dst[y * dst_stride + x];
- if (conv_params->use_jnt_comp_avg) {
+ int32_t tmp = dst16[y * dst16_stride + x];
+ if (conv_params->use_dist_wtd_comp_avg) {
tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
tmp = tmp >> DIST_PRECISION_BITS;
} else {
@@ -260,23 +300,23 @@ void av1_jnt_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst8,
}
tmp -= (1 << (offset_bits - conv_params->round_1)) +
(1 << (offset_bits - conv_params->round_1 - 1));
- dst8[y * dst8_stride + x] =
+ dst[y * dst_stride + x] =
clip_pixel(ROUND_POWER_OF_TWO(tmp, round_bits));
} else {
- dst[y * dst_stride + x] = res;
+ dst16[y * dst16_stride + x] = res;
}
}
}
}
-void av1_jnt_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst8,
- int dst8_stride, int w, int h,
- const InterpFilterParams *filter_params_x,
- const InterpFilterParams *filter_params_y,
- const int subpel_x_q4, const int subpel_y_q4,
- ConvolveParams *conv_params) {
- CONV_BUF_TYPE *dst = conv_params->dst;
- int dst_stride = conv_params->dst_stride;
+void av1_dist_wtd_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst,
+ int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_qn, const int subpel_y_qn,
+ ConvolveParams *conv_params) {
+ CONV_BUF_TYPE *dst16 = conv_params->dst;
+ int dst16_stride = conv_params->dst_stride;
const int fo_vert = filter_params_y->taps / 2 - 1;
const int bits = FILTER_BITS - conv_params->round_0;
const int bd = 8;
@@ -286,11 +326,11 @@ void av1_jnt_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst8,
const int round_bits =
2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
(void)filter_params_x;
- (void)subpel_x_q4;
+ (void)subpel_x_qn;
// vertical filter
const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
- filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+ filter_params_y, subpel_y_qn & SUBPEL_MASK);
for (int y = 0; y < h; ++y) {
for (int x = 0; x < w; ++x) {
int32_t res = 0;
@@ -301,8 +341,8 @@ void av1_jnt_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst8,
res = ROUND_POWER_OF_TWO(res, conv_params->round_1) + round_offset;
if (conv_params->do_average) {
- int32_t tmp = dst[y * dst_stride + x];
- if (conv_params->use_jnt_comp_avg) {
+ int32_t tmp = dst16[y * dst16_stride + x];
+ if (conv_params->use_dist_wtd_comp_avg) {
tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
tmp = tmp >> DIST_PRECISION_BITS;
} else {
@@ -310,23 +350,23 @@ void av1_jnt_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst8,
tmp = tmp >> 1;
}
tmp -= round_offset;
- dst8[y * dst8_stride + x] =
+ dst[y * dst_stride + x] =
clip_pixel(ROUND_POWER_OF_TWO(tmp, round_bits));
} else {
- dst[y * dst_stride + x] = res;
+ dst16[y * dst16_stride + x] = res;
}
}
}
}
-void av1_jnt_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst8,
- int dst8_stride, int w, int h,
- const InterpFilterParams *filter_params_x,
- const InterpFilterParams *filter_params_y,
- const int subpel_x_q4, const int subpel_y_q4,
- ConvolveParams *conv_params) {
- CONV_BUF_TYPE *dst = conv_params->dst;
- int dst_stride = conv_params->dst_stride;
+void av1_dist_wtd_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst,
+ int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_qn, const int subpel_y_qn,
+ ConvolveParams *conv_params) {
+ CONV_BUF_TYPE *dst16 = conv_params->dst;
+ int dst16_stride = conv_params->dst_stride;
const int fo_horiz = filter_params_x->taps / 2 - 1;
const int bits = FILTER_BITS - conv_params->round_1;
const int bd = 8;
@@ -336,11 +376,11 @@ void av1_jnt_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst8,
const int round_bits =
2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
(void)filter_params_y;
- (void)subpel_y_q4;
+ (void)subpel_y_qn;
// horizontal filter
const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
- filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+ filter_params_x, subpel_x_qn & SUBPEL_MASK);
for (int y = 0; y < h; ++y) {
for (int x = 0; x < w; ++x) {
int32_t res = 0;
@@ -351,8 +391,8 @@ void av1_jnt_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst8,
res += round_offset;
if (conv_params->do_average) {
- int32_t tmp = dst[y * dst_stride + x];
- if (conv_params->use_jnt_comp_avg) {
+ int32_t tmp = dst16[y * dst16_stride + x];
+ if (conv_params->use_dist_wtd_comp_avg) {
tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
tmp = tmp >> DIST_PRECISION_BITS;
} else {
@@ -360,23 +400,24 @@ void av1_jnt_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst8,
tmp = tmp >> 1;
}
tmp -= round_offset;
- dst8[y * dst8_stride + x] =
+ dst[y * dst_stride + x] =
clip_pixel(ROUND_POWER_OF_TWO(tmp, round_bits));
} else {
- dst[y * dst_stride + x] = res;
+ dst16[y * dst16_stride + x] = res;
}
}
}
}
-void av1_jnt_convolve_2d_copy_c(const uint8_t *src, int src_stride,
- uint8_t *dst8, int dst8_stride, int w, int h,
- const InterpFilterParams *filter_params_x,
- const InterpFilterParams *filter_params_y,
- const int subpel_x_q4, const int subpel_y_q4,
- ConvolveParams *conv_params) {
- CONV_BUF_TYPE *dst = conv_params->dst;
- int dst_stride = conv_params->dst_stride;
+void av1_dist_wtd_convolve_2d_copy_c(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_qn,
+ const int subpel_y_qn,
+ ConvolveParams *conv_params) {
+ CONV_BUF_TYPE *dst16 = conv_params->dst;
+ int dst16_stride = conv_params->dst_stride;
const int bits =
FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
const int bd = 8;
@@ -385,8 +426,8 @@ void av1_jnt_convolve_2d_copy_c(const uint8_t *src, int src_stride,
(1 << (offset_bits - conv_params->round_1 - 1));
(void)filter_params_x;
(void)filter_params_y;
- (void)subpel_x_q4;
- (void)subpel_y_q4;
+ (void)subpel_x_qn;
+ (void)subpel_y_qn;
for (int y = 0; y < h; ++y) {
for (int x = 0; x < w; ++x) {
@@ -394,8 +435,8 @@ void av1_jnt_convolve_2d_copy_c(const uint8_t *src, int src_stride,
res += round_offset;
if (conv_params->do_average) {
- int32_t tmp = dst[y * dst_stride + x];
- if (conv_params->use_jnt_comp_avg) {
+ int32_t tmp = dst16[y * dst16_stride + x];
+ if (conv_params->use_dist_wtd_comp_avg) {
tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
tmp = tmp >> DIST_PRECISION_BITS;
} else {
@@ -403,16 +444,16 @@ void av1_jnt_convolve_2d_copy_c(const uint8_t *src, int src_stride,
tmp = tmp >> 1;
}
tmp -= round_offset;
- dst8[y * dst8_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits));
+ dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits));
} else {
- dst[y * dst_stride + x] = res;
+ dst16[y * dst16_stride + x] = res;
}
}
}
}
-void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst8,
- int dst8_stride, int w, int h,
+void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst,
+ int dst_stride, int w, int h,
const InterpFilterParams *filter_params_x,
const InterpFilterParams *filter_params_y,
const int subpel_x_qn, const int x_step_qn,
@@ -472,7 +513,7 @@ void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst8,
if (conv_params->is_compound) {
if (conv_params->do_average) {
int32_t tmp = dst16[y * dst16_stride + x];
- if (conv_params->use_jnt_comp_avg) {
+ if (conv_params->use_dist_wtd_comp_avg) {
tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
tmp = tmp >> DIST_PRECISION_BITS;
} else {
@@ -482,7 +523,7 @@ void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst8,
/* Subtract round offset and convolve round */
tmp = tmp - ((1 << (offset_bits - conv_params->round_1)) +
(1 << (offset_bits - conv_params->round_1 - 1)));
- dst8[y * dst8_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits));
+ dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits));
} else {
dst16[y * dst16_stride + x] = res;
}
@@ -490,7 +531,7 @@ void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst8,
/* Subtract round offset and convolve round */
int32_t tmp = res - ((1 << (offset_bits - conv_params->round_1)) +
(1 << (offset_bits - conv_params->round_1 - 1)));
- dst8[y * dst8_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits));
+ dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits));
}
}
src_vert++;
@@ -511,89 +552,71 @@ static void convolve_2d_scale_wrapper(
y_step_qn, conv_params);
}
-// TODO(huisu@google.com): bilinear filtering only needs 2 taps in general. So
-// we may create optimized code to do 2-tap filtering for all bilinear filtering
-// usages, not just IntraBC.
-static void convolve_2d_for_intrabc(const uint8_t *src, int src_stride,
- uint8_t *dst, int dst_stride, int w, int h,
- int subpel_x_q4, int subpel_y_q4,
- ConvolveParams *conv_params) {
- const InterpFilterParams *filter_params_x =
- subpel_x_q4 ? &av1_intrabc_filter_params : NULL;
- const InterpFilterParams *filter_params_y =
- subpel_y_q4 ? &av1_intrabc_filter_params : NULL;
- if (subpel_x_q4 != 0 && subpel_y_q4 != 0) {
- av1_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h,
- filter_params_x, filter_params_y, 0, 0, conv_params);
- } else if (subpel_x_q4 != 0) {
- av1_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_x,
- filter_params_y, 0, 0, conv_params);
- } else {
- av1_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_x,
- filter_params_y, 0, 0, conv_params);
- }
-}
-
void av1_convolve_2d_facade(const uint8_t *src, int src_stride, uint8_t *dst,
int dst_stride, int w, int h,
- InterpFilters interp_filters, const int subpel_x_q4,
- int x_step_q4, const int subpel_y_q4, int y_step_q4,
- int scaled, ConvolveParams *conv_params,
- const struct scale_factors *sf, int is_intrabc) {
- assert(IMPLIES(is_intrabc, !scaled));
+ const InterpFilterParams *interp_filters[2],
+ const int subpel_x_qn, int x_step_q4,
+ const int subpel_y_qn, int y_step_q4, int scaled,
+ ConvolveParams *conv_params,
+ const struct scale_factors *sf) {
(void)x_step_q4;
(void)y_step_q4;
(void)dst;
(void)dst_stride;
- if (is_intrabc && (subpel_x_q4 != 0 || subpel_y_q4 != 0)) {
- convolve_2d_for_intrabc(src, src_stride, dst, dst_stride, w, h, subpel_x_q4,
- subpel_y_q4, conv_params);
- return;
+ const InterpFilterParams *filter_params_x = interp_filters[0];
+ const InterpFilterParams *filter_params_y = interp_filters[1];
+
+ // TODO(jingning, yunqing): Add SIMD support to 2-tap filter case.
+ // Do we have SIMD support to 4-tap case?
+ // 2-tap filter indicates that it is for IntraBC.
+ if (filter_params_x->taps == 2 || filter_params_y->taps == 2) {
+ assert(filter_params_x->taps == 2 && filter_params_y->taps == 2);
+ assert(!scaled);
+ if (subpel_x_qn && subpel_y_qn) {
+ av1_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h,
+ filter_params_x, filter_params_y, subpel_x_qn,
+ subpel_y_qn, conv_params);
+ return;
+ } else if (subpel_x_qn) {
+ av1_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h,
+ filter_params_x, filter_params_y, subpel_x_qn,
+ subpel_y_qn, conv_params);
+ return;
+ } else if (subpel_y_qn) {
+ av1_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h,
+ filter_params_x, filter_params_y, subpel_x_qn,
+ subpel_y_qn, conv_params);
+ return;
+ }
}
- InterpFilter filter_x = 0;
- InterpFilter filter_y = 0;
- const int need_filter_params_x = (subpel_x_q4 != 0) | scaled;
- const int need_filter_params_y = (subpel_y_q4 != 0) | scaled;
- if (need_filter_params_x)
- filter_x = av1_extract_interp_filter(interp_filters, 1);
- if (need_filter_params_y)
- filter_y = av1_extract_interp_filter(interp_filters, 0);
- const InterpFilterParams *filter_params_x =
- need_filter_params_x
- ? av1_get_interp_filter_params_with_block_size(filter_x, w)
- : NULL;
- const InterpFilterParams *filter_params_y =
- need_filter_params_y
- ? av1_get_interp_filter_params_with_block_size(filter_y, h)
- : NULL;
-
if (scaled) {
convolve_2d_scale_wrapper(src, src_stride, dst, dst_stride, w, h,
- filter_params_x, filter_params_y, subpel_x_q4,
- x_step_q4, subpel_y_q4, y_step_q4, conv_params);
+ filter_params_x, filter_params_y, subpel_x_qn,
+ x_step_q4, subpel_y_qn, y_step_q4, conv_params);
} else {
- sf->convolve[subpel_x_q4 != 0][subpel_y_q4 != 0][conv_params->is_compound](
+ sf->convolve[subpel_x_qn != 0][subpel_y_qn != 0][conv_params->is_compound](
src, src_stride, dst, dst_stride, w, h, filter_params_x,
- filter_params_y, subpel_x_q4, subpel_y_q4, conv_params);
+ filter_params_y, subpel_x_qn, subpel_y_qn, conv_params);
}
}
+#if CONFIG_AV1_HIGHBITDEPTH
void av1_highbd_convolve_2d_copy_sr_c(
const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
int h, const InterpFilterParams *filter_params_x,
- const InterpFilterParams *filter_params_y, const int subpel_x_q4,
- const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
+ const InterpFilterParams *filter_params_y, const int subpel_x_qn,
+ const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
(void)filter_params_x;
(void)filter_params_y;
- (void)subpel_x_q4;
- (void)subpel_y_q4;
+ (void)subpel_x_qn;
+ (void)subpel_y_qn;
(void)conv_params;
(void)bd;
for (int y = 0; y < h; ++y) {
- memcpy(dst + y * dst_stride, src + y * src_stride, w * sizeof(src[0]));
+ memmove(dst + y * dst_stride, src + y * src_stride, w * sizeof(src[0]));
}
}
@@ -601,12 +624,12 @@ void av1_highbd_convolve_x_sr_c(const uint16_t *src, int src_stride,
uint16_t *dst, int dst_stride, int w, int h,
const InterpFilterParams *filter_params_x,
const InterpFilterParams *filter_params_y,
- const int subpel_x_q4, const int subpel_y_q4,
+ const int subpel_x_qn, const int subpel_y_qn,
ConvolveParams *conv_params, int bd) {
const int fo_horiz = filter_params_x->taps / 2 - 1;
const int bits = FILTER_BITS - conv_params->round_0;
(void)filter_params_y;
- (void)subpel_y_q4;
+ (void)subpel_y_qn;
assert(bits >= 0);
assert((FILTER_BITS - conv_params->round_1) >= 0 ||
@@ -614,7 +637,7 @@ void av1_highbd_convolve_x_sr_c(const uint16_t *src, int src_stride,
// horizontal filter
const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
- filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+ filter_params_x, subpel_x_qn & SUBPEL_MASK);
for (int y = 0; y < h; ++y) {
for (int x = 0; x < w; ++x) {
int32_t res = 0;
@@ -632,11 +655,11 @@ void av1_highbd_convolve_y_sr_c(const uint16_t *src, int src_stride,
uint16_t *dst, int dst_stride, int w, int h,
const InterpFilterParams *filter_params_x,
const InterpFilterParams *filter_params_y,
- const int subpel_x_q4, const int subpel_y_q4,
+ const int subpel_x_qn, const int subpel_y_qn,
ConvolveParams *conv_params, int bd) {
const int fo_vert = filter_params_y->taps / 2 - 1;
(void)filter_params_x;
- (void)subpel_x_q4;
+ (void)subpel_x_qn;
(void)conv_params;
assert(conv_params->round_0 <= FILTER_BITS);
@@ -644,7 +667,7 @@ void av1_highbd_convolve_y_sr_c(const uint16_t *src, int src_stride,
((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS)));
// vertical filter
const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
- filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+ filter_params_y, subpel_y_qn & SUBPEL_MASK);
for (int y = 0; y < h; ++y) {
for (int x = 0; x < w; ++x) {
int32_t res = 0;
@@ -661,11 +684,12 @@ void av1_highbd_convolve_2d_sr_c(const uint16_t *src, int src_stride,
uint16_t *dst, int dst_stride, int w, int h,
const InterpFilterParams *filter_params_x,
const InterpFilterParams *filter_params_y,
- const int subpel_x_q4, const int subpel_y_q4,
+ const int subpel_x_qn, const int subpel_y_qn,
ConvolveParams *conv_params, int bd) {
int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
int im_h = h + filter_params_y->taps - 1;
int im_stride = w;
+ assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE);
const int fo_vert = filter_params_y->taps / 2 - 1;
const int fo_horiz = filter_params_x->taps / 2 - 1;
const int bits =
@@ -675,7 +699,7 @@ void av1_highbd_convolve_2d_sr_c(const uint16_t *src, int src_stride,
// horizontal filter
const uint16_t *src_horiz = src - fo_vert * src_stride;
const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
- filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+ filter_params_x, subpel_x_qn & SUBPEL_MASK);
for (int y = 0; y < im_h; ++y) {
for (int x = 0; x < w; ++x) {
int32_t sum = (1 << (bd + FILTER_BITS - 1));
@@ -691,7 +715,7 @@ void av1_highbd_convolve_2d_sr_c(const uint16_t *src, int src_stride,
// vertical filter
int16_t *src_vert = im_block + fo_vert * im_stride;
const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
- filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+ filter_params_y, subpel_y_qn & SUBPEL_MASK);
const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
for (int y = 0; y < h; ++y) {
for (int x = 0; x < w; ++x) {
@@ -709,17 +733,15 @@ void av1_highbd_convolve_2d_sr_c(const uint16_t *src, int src_stride,
}
}
-void av1_highbd_jnt_convolve_2d_c(const uint16_t *src, int src_stride,
- uint16_t *dst16, int dst16_stride, int w,
- int h,
- const InterpFilterParams *filter_params_x,
- const InterpFilterParams *filter_params_y,
- const int subpel_x_q4, const int subpel_y_q4,
- ConvolveParams *conv_params, int bd) {
+void av1_highbd_dist_wtd_convolve_2d_c(
+ const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
+ int h, const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y, const int subpel_x_qn,
+ const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
int x, y, k;
int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
- CONV_BUF_TYPE *dst = conv_params->dst;
- int dst_stride = conv_params->dst_stride;
+ CONV_BUF_TYPE *dst16 = conv_params->dst;
+ int dst16_stride = conv_params->dst_stride;
int im_h = h + filter_params_y->taps - 1;
int im_stride = w;
const int fo_vert = filter_params_y->taps / 2 - 1;
@@ -731,7 +753,7 @@ void av1_highbd_jnt_convolve_2d_c(const uint16_t *src, int src_stride,
// horizontal filter
const uint16_t *src_horiz = src - fo_vert * src_stride;
const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
- filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+ filter_params_x, subpel_x_qn & SUBPEL_MASK);
for (y = 0; y < im_h; ++y) {
for (x = 0; x < w; ++x) {
int32_t sum = (1 << (bd + FILTER_BITS - 1));
@@ -749,7 +771,7 @@ void av1_highbd_jnt_convolve_2d_c(const uint16_t *src, int src_stride,
int16_t *src_vert = im_block + fo_vert * im_stride;
const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
- filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+ filter_params_y, subpel_y_qn & SUBPEL_MASK);
for (y = 0; y < h; ++y) {
for (x = 0; x < w; ++x) {
int32_t sum = 1 << offset_bits;
@@ -759,8 +781,8 @@ void av1_highbd_jnt_convolve_2d_c(const uint16_t *src, int src_stride,
assert(0 <= sum && sum < (1 << (offset_bits + 2)));
CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
if (conv_params->do_average) {
- int32_t tmp = dst[y * dst_stride + x];
- if (conv_params->use_jnt_comp_avg) {
+ int32_t tmp = dst16[y * dst16_stride + x];
+ if (conv_params->use_dist_wtd_comp_avg) {
tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
tmp = tmp >> DIST_PRECISION_BITS;
} else {
@@ -769,24 +791,22 @@ void av1_highbd_jnt_convolve_2d_c(const uint16_t *src, int src_stride,
}
tmp -= (1 << (offset_bits - conv_params->round_1)) +
(1 << (offset_bits - conv_params->round_1 - 1));
- dst16[y * dst16_stride + x] =
+ dst[y * dst_stride + x] =
clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, round_bits), bd);
} else {
- dst[y * dst_stride + x] = res;
+ dst16[y * dst16_stride + x] = res;
}
}
}
}
-void av1_highbd_jnt_convolve_x_c(const uint16_t *src, int src_stride,
- uint16_t *dst16, int dst16_stride, int w,
- int h,
- const InterpFilterParams *filter_params_x,
- const InterpFilterParams *filter_params_y,
- const int subpel_x_q4, const int subpel_y_q4,
- ConvolveParams *conv_params, int bd) {
- CONV_BUF_TYPE *dst = conv_params->dst;
- int dst_stride = conv_params->dst_stride;
+void av1_highbd_dist_wtd_convolve_x_c(
+ const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
+ int h, const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y, const int subpel_x_qn,
+ const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
+ CONV_BUF_TYPE *dst16 = conv_params->dst;
+ int dst16_stride = conv_params->dst_stride;
const int fo_horiz = filter_params_x->taps / 2 - 1;
const int bits = FILTER_BITS - conv_params->round_1;
const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
@@ -796,11 +816,11 @@ void av1_highbd_jnt_convolve_x_c(const uint16_t *src, int src_stride,
2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
assert(round_bits >= 0);
(void)filter_params_y;
- (void)subpel_y_q4;
+ (void)subpel_y_qn;
assert(bits >= 0);
// horizontal filter
const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
- filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+ filter_params_x, subpel_x_qn & SUBPEL_MASK);
for (int y = 0; y < h; ++y) {
for (int x = 0; x < w; ++x) {
int32_t res = 0;
@@ -811,8 +831,8 @@ void av1_highbd_jnt_convolve_x_c(const uint16_t *src, int src_stride,
res += round_offset;
if (conv_params->do_average) {
- int32_t tmp = dst[y * dst_stride + x];
- if (conv_params->use_jnt_comp_avg) {
+ int32_t tmp = dst16[y * dst16_stride + x];
+ if (conv_params->use_dist_wtd_comp_avg) {
tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
tmp = tmp >> DIST_PRECISION_BITS;
} else {
@@ -820,24 +840,22 @@ void av1_highbd_jnt_convolve_x_c(const uint16_t *src, int src_stride,
tmp = tmp >> 1;
}
tmp -= round_offset;
- dst16[y * dst16_stride + x] =
+ dst[y * dst_stride + x] =
clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, round_bits), bd);
} else {
- dst[y * dst_stride + x] = res;
+ dst16[y * dst16_stride + x] = res;
}
}
}
}
-void av1_highbd_jnt_convolve_y_c(const uint16_t *src, int src_stride,
- uint16_t *dst16, int dst16_stride, int w,
- int h,
- const InterpFilterParams *filter_params_x,
- const InterpFilterParams *filter_params_y,
- const int subpel_x_q4, const int subpel_y_q4,
- ConvolveParams *conv_params, int bd) {
- CONV_BUF_TYPE *dst = conv_params->dst;
- int dst_stride = conv_params->dst_stride;
+void av1_highbd_dist_wtd_convolve_y_c(
+ const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
+ int h, const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y, const int subpel_x_qn,
+ const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
+ CONV_BUF_TYPE *dst16 = conv_params->dst;
+ int dst16_stride = conv_params->dst_stride;
const int fo_vert = filter_params_y->taps / 2 - 1;
const int bits = FILTER_BITS - conv_params->round_0;
const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
@@ -847,11 +865,11 @@ void av1_highbd_jnt_convolve_y_c(const uint16_t *src, int src_stride,
2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
assert(round_bits >= 0);
(void)filter_params_x;
- (void)subpel_x_q4;
+ (void)subpel_x_qn;
assert(bits >= 0);
// vertical filter
const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
- filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+ filter_params_y, subpel_y_qn & SUBPEL_MASK);
for (int y = 0; y < h; ++y) {
for (int x = 0; x < w; ++x) {
int32_t res = 0;
@@ -862,8 +880,8 @@ void av1_highbd_jnt_convolve_y_c(const uint16_t *src, int src_stride,
res = ROUND_POWER_OF_TWO(res, conv_params->round_1) + round_offset;
if (conv_params->do_average) {
- int32_t tmp = dst[y * dst_stride + x];
- if (conv_params->use_jnt_comp_avg) {
+ int32_t tmp = dst16[y * dst16_stride + x];
+ if (conv_params->use_dist_wtd_comp_avg) {
tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
tmp = tmp >> DIST_PRECISION_BITS;
} else {
@@ -871,22 +889,22 @@ void av1_highbd_jnt_convolve_y_c(const uint16_t *src, int src_stride,
tmp = tmp >> 1;
}
tmp -= round_offset;
- dst16[y * dst16_stride + x] =
+ dst[y * dst_stride + x] =
clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, round_bits), bd);
} else {
- dst[y * dst_stride + x] = res;
+ dst16[y * dst16_stride + x] = res;
}
}
}
}
-void av1_highbd_jnt_convolve_2d_copy_c(
- const uint16_t *src, int src_stride, uint16_t *dst16, int dst16_stride,
- int w, int h, const InterpFilterParams *filter_params_x,
- const InterpFilterParams *filter_params_y, const int subpel_x_q4,
- const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
- CONV_BUF_TYPE *dst = conv_params->dst;
- int dst_stride = conv_params->dst_stride;
+void av1_highbd_dist_wtd_convolve_2d_copy_c(
+ const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
+ int h, const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y, const int subpel_x_qn,
+ const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
+ CONV_BUF_TYPE *dst16 = conv_params->dst;
+ int dst16_stride = conv_params->dst_stride;
const int bits =
FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
@@ -895,16 +913,16 @@ void av1_highbd_jnt_convolve_2d_copy_c(
assert(bits >= 0);
(void)filter_params_x;
(void)filter_params_y;
- (void)subpel_x_q4;
- (void)subpel_y_q4;
+ (void)subpel_x_qn;
+ (void)subpel_y_qn;
for (int y = 0; y < h; ++y) {
for (int x = 0; x < w; ++x) {
CONV_BUF_TYPE res = src[y * src_stride + x] << bits;
res += round_offset;
if (conv_params->do_average) {
- int32_t tmp = dst[y * dst_stride + x];
- if (conv_params->use_jnt_comp_avg) {
+ int32_t tmp = dst16[y * dst16_stride + x];
+ if (conv_params->use_dist_wtd_comp_avg) {
tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
tmp = tmp >> DIST_PRECISION_BITS;
} else {
@@ -912,10 +930,10 @@ void av1_highbd_jnt_convolve_2d_copy_c(
tmp = tmp >> 1;
}
tmp -= round_offset;
- dst16[y * dst16_stride + x] =
+ dst[y * dst_stride + x] =
clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, bits), bd);
} else {
- dst[y * dst_stride + x] = res;
+ dst16[y * dst16_stride + x] = res;
}
}
}
@@ -980,7 +998,7 @@ void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride,
if (conv_params->is_compound) {
if (conv_params->do_average) {
int32_t tmp = dst16[y * dst16_stride + x];
- if (conv_params->use_jnt_comp_avg) {
+ if (conv_params->use_dist_wtd_comp_avg) {
tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
tmp = tmp >> DIST_PRECISION_BITS;
} else {
@@ -1007,68 +1025,24 @@ void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride,
}
}
-static void highbd_convolve_2d_for_intrabc(const uint16_t *src, int src_stride,
- uint16_t *dst, int dst_stride, int w,
- int h, int subpel_x_q4,
- int subpel_y_q4,
- ConvolveParams *conv_params,
- int bd) {
- const InterpFilterParams *filter_params_x =
- subpel_x_q4 ? &av1_intrabc_filter_params : NULL;
- const InterpFilterParams *filter_params_y =
- subpel_y_q4 ? &av1_intrabc_filter_params : NULL;
- if (subpel_x_q4 != 0 && subpel_y_q4 != 0) {
- av1_highbd_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h,
- filter_params_x, filter_params_y, 0, 0,
- conv_params, bd);
- } else if (subpel_x_q4 != 0) {
- av1_highbd_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h,
- filter_params_x, filter_params_y, 0, 0,
- conv_params, bd);
- } else {
- av1_highbd_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h,
- filter_params_x, filter_params_y, 0, 0,
- conv_params, bd);
- }
-}
-
void av1_highbd_convolve_2d_facade(const uint8_t *src8, int src_stride,
uint8_t *dst8, int dst_stride, int w, int h,
- InterpFilters interp_filters,
- const int subpel_x_q4, int x_step_q4,
- const int subpel_y_q4, int y_step_q4,
+ const InterpFilterParams *interp_filters[2],
+ const int subpel_x_qn, int x_step_q4,
+ const int subpel_y_qn, int y_step_q4,
int scaled, ConvolveParams *conv_params,
- const struct scale_factors *sf,
- int is_intrabc, int bd) {
- assert(IMPLIES(is_intrabc, !scaled));
+ const struct scale_factors *sf, int bd) {
(void)x_step_q4;
(void)y_step_q4;
(void)dst_stride;
const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
- if (is_intrabc && (subpel_x_q4 != 0 || subpel_y_q4 != 0)) {
- uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
- highbd_convolve_2d_for_intrabc(src, src_stride, dst, dst_stride, w, h,
- subpel_x_q4, subpel_y_q4, conv_params, bd);
- return;
- }
-
- InterpFilter filter_x = 0;
- InterpFilter filter_y = 0;
- const int need_filter_params_x = (subpel_x_q4 != 0) | scaled;
- const int need_filter_params_y = (subpel_y_q4 != 0) | scaled;
- if (need_filter_params_x)
- filter_x = av1_extract_interp_filter(interp_filters, 1);
- if (need_filter_params_y)
- filter_y = av1_extract_interp_filter(interp_filters, 0);
+ const int need_filter_params_x = (subpel_x_qn != 0) | scaled;
+ const int need_filter_params_y = (subpel_y_qn != 0) | scaled;
const InterpFilterParams *filter_params_x =
- need_filter_params_x
- ? av1_get_interp_filter_params_with_block_size(filter_x, w)
- : NULL;
+ need_filter_params_x ? interp_filters[0] : NULL;
const InterpFilterParams *filter_params_y =
- need_filter_params_y
- ? av1_get_interp_filter_params_with_block_size(filter_y, h)
- : NULL;
+ need_filter_params_y ? interp_filters[1] : NULL;
if (scaled) {
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
@@ -1076,18 +1050,19 @@ void av1_highbd_convolve_2d_facade(const uint8_t *src8, int src_stride,
assert(conv_params->dst != NULL);
}
av1_highbd_convolve_2d_scale(src, src_stride, dst, dst_stride, w, h,
- filter_params_x, filter_params_y, subpel_x_q4,
- x_step_q4, subpel_y_q4, y_step_q4, conv_params,
+ filter_params_x, filter_params_y, subpel_x_qn,
+ x_step_q4, subpel_y_qn, y_step_q4, conv_params,
bd);
} else {
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
- sf->highbd_convolve[subpel_x_q4 != 0][subpel_y_q4 !=
+ sf->highbd_convolve[subpel_x_qn != 0][subpel_y_qn !=
0][conv_params->is_compound](
src, src_stride, dst, dst_stride, w, h, filter_params_x,
- filter_params_y, subpel_x_q4, subpel_y_q4, conv_params, bd);
+ filter_params_y, subpel_x_qn, subpel_y_qn, conv_params, bd);
}
}
+#endif // CONFIG_AV1_HIGHBITDEPTH
// Note: Fixed size intermediate buffers, place limits on parameters
// of some functions. 2d filtering proceeds in 2 steps:
@@ -1109,12 +1084,14 @@ static INLINE int horz_scalar_product(const uint8_t *a, const int16_t *b) {
return sum;
}
+#if CONFIG_AV1_HIGHBITDEPTH
static INLINE int highbd_horz_scalar_product(const uint16_t *a,
const int16_t *b) {
int sum = 0;
for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k] * b[k];
return sum;
}
+#endif
static INLINE int highbd_vert_scalar_product(const uint16_t *a,
ptrdiff_t a_stride,
@@ -1215,6 +1192,7 @@ void av1_wiener_convolve_add_src_c(const uint8_t *src, ptrdiff_t src_stride,
y_step_q4, w, h, conv_params->round_1);
}
+#if CONFIG_AV1_HIGHBITDEPTH
static void highbd_convolve_add_src_horiz_hip(
const uint8_t *src8, ptrdiff_t src_stride, uint16_t *dst,
ptrdiff_t dst_stride, const InterpKernel *x_filters, int x0_q4,
@@ -1293,3 +1271,4 @@ void av1_highbd_wiener_convolve_add_src_c(
temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_SB_SIZE, dst, dst_stride,
filters_y, y0_q4, y_step_q4, w, h, conv_params->round_1, bd);
}
+#endif // CONFIG_AV1_HIGHBITDEPTH
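
The renamed av1_dist_wtd_convolve_* kernels above all share one compound blend: the buffered first-pass value is read back from the CONV_BUF_TYPE buffer and combined with the second pass, either with explicit fwd_offset/bck_offset weights (use_dist_wtd_comp_avg) or as a plain average. Below is a minimal standalone sketch of that blend, assuming DIST_PRECISION_BITS == 4 as used by the hunks above; the helper name and the demo values are hypothetical.

#include <stdint.h>
#include <stdio.h>

#define DIST_PRECISION_BITS 4 /* same shift the kernels above apply */

/* Hypothetical helper mirroring the do_average branch of the kernels:
 * blend the buffered first-pass value `tmp` with the second-pass result
 * `res`, weighted when use_dist_wtd_comp_avg is set, averaged otherwise. */
static int32_t dist_wtd_comp_avg(int32_t tmp, int32_t res, int32_t fwd_offset,
                                 int32_t bck_offset,
                                 int use_dist_wtd_comp_avg) {
  if (use_dist_wtd_comp_avg) {
    tmp = tmp * fwd_offset + res * bck_offset; /* weighted sum of both passes */
    return tmp >> DIST_PRECISION_BITS;         /* back to the working range */
  }
  return (tmp + res) >> 1; /* equal-weight average */
}

int main(void) {
  /* Demo weights only; they sum to 1 << DIST_PRECISION_BITS. */
  printf("%d\n", dist_wtd_comp_avg(100, 200, 9, 7, 1)); /* (100*9+200*7)>>4 = 143 */
  printf("%d\n", dist_wtd_comp_avg(100, 200, 9, 7, 0)); /* (100+200)>>1 = 150 */
  return 0;
}
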
diff --git a/media/libaom/src/av1/common/convolve.h b/media/libaom/src/av1/common/convolve.h
index 4109dd843..04df86c42 100644
--- a/media/libaom/src/av1/common/convolve.h
+++ b/media/libaom/src/av1/common/convolve.h
@@ -26,7 +26,8 @@ typedef struct ConvolveParams {
int round_1;
int plane;
int is_compound;
- int use_jnt_comp_avg;
+ int compound_index; // 0: the first single in compound mode, 1: the second.
+ int use_dist_wtd_comp_avg;
int fwd_offset;
int bck_offset;
} ConvolveParams;
@@ -41,32 +42,34 @@ typedef void (*aom_convolve_fn_t)(const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride, int w, int h,
const InterpFilterParams *filter_params_x,
const InterpFilterParams *filter_params_y,
- const int subpel_x_q4, const int subpel_y_q4,
+ const int subpel_x_qn, const int subpel_y_qn,
ConvolveParams *conv_params);
typedef void (*aom_highbd_convolve_fn_t)(
const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
int h, const InterpFilterParams *filter_params_x,
- const InterpFilterParams *filter_params_y, const int subpel_x_q4,
- const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+ const InterpFilterParams *filter_params_y, const int subpel_x_qn,
+ const int subpel_y_qn, ConvolveParams *conv_params, int bd);
struct AV1Common;
struct scale_factors;
void av1_convolve_2d_facade(const uint8_t *src, int src_stride, uint8_t *dst,
int dst_stride, int w, int h,
- InterpFilters interp_filters, const int subpel_x_q4,
- int x_step_q4, const int subpel_y_q4, int y_step_q4,
- int scaled, ConvolveParams *conv_params,
- const struct scale_factors *sf, int is_intrabc);
+ const InterpFilterParams *interp_filters[2],
+ const int subpel_x_qn, int x_step_q4,
+ const int subpel_y_qn, int y_step_q4, int scaled,
+ ConvolveParams *conv_params,
+ const struct scale_factors *sf);
-static INLINE ConvolveParams get_conv_params_no_round(int do_average, int plane,
+static INLINE ConvolveParams get_conv_params_no_round(int cmp_index, int plane,
CONV_BUF_TYPE *dst,
int dst_stride,
int is_compound, int bd) {
ConvolveParams conv_params;
- conv_params.do_average = do_average;
- assert(IMPLIES(do_average, is_compound));
+ conv_params.compound_index = cmp_index;
+ assert(IMPLIES(cmp_index, is_compound));
+
conv_params.is_compound = is_compound;
conv_params.round_0 = ROUND0_BITS;
conv_params.round_1 = is_compound ? COMPOUND_ROUND1_BITS
@@ -82,6 +85,10 @@ static INLINE ConvolveParams get_conv_params_no_round(int do_average, int plane,
conv_params.dst = dst;
conv_params.dst_stride = dst_stride;
conv_params.plane = plane;
+
+ // By default, set do average to 1 if this is the second single prediction
+ // in a compound mode.
+ conv_params.do_average = cmp_index;
return conv_params;
}
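
Since do_average is now derived from compound_index, the intended call pattern for a compound prediction is two passes over the same CONV_BUF_TYPE scratch buffer: the first pass (cmp_index 0) stores raw intermediate values, the second (cmp_index 1) blends against them. A sketch of that usage under assumed conditions (hypothetical caller, 8-bit build, illustrative 128x128 scratch buffer):

/* Hypothetical caller sketch; buffer size, plane and bit depth are
 * illustrative, not taken from this header. */
CONV_BUF_TYPE tmp_dst[128 * 128];

/* First single prediction of the compound pair: do_average == 0, so the
 * convolve kernels write raw rounded values into tmp_dst. */
ConvolveParams conv0 =
    get_conv_params_no_round(/*cmp_index=*/0, /*plane=*/0, tmp_dst,
                             /*dst_stride=*/128, /*is_compound=*/1, /*bd=*/8);

/* Second single prediction: do_average == 1, so the kernels read tmp_dst
 * back, apply the (distance-weighted) average and emit final pixels. */
ConvolveParams conv1 =
    get_conv_params_no_round(/*cmp_index=*/1, /*plane=*/0, tmp_dst,
                             /*dst_stride=*/128, /*is_compound=*/1, /*bd=*/8);
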
@@ -111,12 +118,16 @@ static INLINE ConvolveParams get_conv_params_wiener(int bd) {
void av1_highbd_convolve_2d_facade(const uint8_t *src8, int src_stride,
uint8_t *dst, int dst_stride, int w, int h,
- InterpFilters interp_filters,
- const int subpel_x_q4, int x_step_q4,
- const int subpel_y_q4, int y_step_q4,
+ const InterpFilterParams *interp_filters[2],
+ const int subpel_x_qn, int x_step_q4,
+ const int subpel_y_qn, int y_step_q4,
int scaled, ConvolveParams *conv_params,
- const struct scale_factors *sf,
- int is_intrabc, int bd);
+ const struct scale_factors *sf, int bd);
+
+// TODO(sarahparker) This will need to be integerized and optimized
+void av1_convolve_2d_sobel_y_c(const uint8_t *src, int src_stride, double *dst,
+ int dst_stride, int w, int h, int dir,
+ double norm);
#ifdef __cplusplus
} // extern "C"
diff --git a/media/libaom/src/av1/common/debugmodes.c b/media/libaom/src/av1/common/debugmodes.c
index 868f341b5..ff02ddde0 100644
--- a/media/libaom/src/av1/common/debugmodes.c
+++ b/media/libaom/src/av1/common/debugmodes.c
@@ -11,14 +11,14 @@
#include <stdio.h>
+#include "av1/common/av1_common_int.h"
#include "av1/common/blockd.h"
#include "av1/common/enums.h"
-#include "av1/common/onyxc_int.h"
static void log_frame_info(AV1_COMMON *cm, const char *str, FILE *f) {
fprintf(f, "%s", str);
- fprintf(f, "(Frame %d, Show:%d, Q:%d): \n", cm->current_video_frame,
- cm->show_frame, cm->base_qindex);
+ fprintf(f, "(Frame %d, Show:%d, Q:%d): \n", cm->current_frame.frame_number,
+ cm->show_frame, cm->quant_params.base_qindex);
}
/* This function dereferences a pointer to the mbmi structure
* and uses the passed in member offset to print out the value of an integer
@@ -26,32 +26,31 @@ static void log_frame_info(AV1_COMMON *cm, const char *str, FILE *f) {
*/
static void print_mi_data(AV1_COMMON *cm, FILE *file, const char *descriptor,
size_t member_offset) {
- int mi_row, mi_col;
- MB_MODE_INFO **mi = cm->mi_grid_visible;
- int rows = cm->mi_rows;
- int cols = cm->mi_cols;
+ const CommonModeInfoParams *const mi_params = &cm->mi_params;
+ MB_MODE_INFO **mi = mi_params->mi_grid_base;
+ int rows = mi_params->mi_rows;
+ int cols = mi_params->mi_cols;
char prefix = descriptor[0];
log_frame_info(cm, descriptor, file);
- for (mi_row = 0; mi_row < rows; mi_row++) {
+ for (int mi_row = 0; mi_row < rows; mi_row++) {
fprintf(file, "%c ", prefix);
- for (mi_col = 0; mi_col < cols; mi_col++) {
+ for (int mi_col = 0; mi_col < cols; mi_col++) {
fprintf(file, "%2d ", *((char *)((char *)(mi[0]) + member_offset)));
mi++;
}
fprintf(file, "\n");
- mi += MAX_MIB_SIZE;
+ mi += mi_params->mi_stride - cols;
}
fprintf(file, "\n");
}
void av1_print_modes_and_motion_vectors(AV1_COMMON *cm, const char *file) {
- int mi_row;
- int mi_col;
+ CommonModeInfoParams *mi_params = &cm->mi_params;
FILE *mvs = fopen(file, "a");
- MB_MODE_INFO **mi = cm->mi_grid_visible;
- int rows = cm->mi_rows;
- int cols = cm->mi_cols;
+ MB_MODE_INFO **mi = mi_params->mi_grid_base;
+ const int rows = mi_params->mi_rows;
+ const int cols = mi_params->mi_cols;
print_mi_data(cm, mvs, "Partitions:", offsetof(MB_MODE_INFO, sb_type));
print_mi_data(cm, mvs, "Modes:", offsetof(MB_MODE_INFO, mode));
@@ -61,28 +60,28 @@ void av1_print_modes_and_motion_vectors(AV1_COMMON *cm, const char *file) {
// output skip information.
log_frame_info(cm, "Skips:", mvs);
- for (mi_row = 0; mi_row < rows; mi_row++) {
+ for (int mi_row = 0; mi_row < rows; mi_row++) {
fprintf(mvs, "S ");
- for (mi_col = 0; mi_col < cols; mi_col++) {
+ for (int mi_col = 0; mi_col < cols; mi_col++) {
fprintf(mvs, "%2d ", mi[0]->skip);
mi++;
}
fprintf(mvs, "\n");
- mi += MAX_MIB_SIZE;
+ mi += mi_params->mi_stride - cols;
}
fprintf(mvs, "\n");
// output motion vectors.
log_frame_info(cm, "Vectors ", mvs);
- mi = cm->mi_grid_visible;
- for (mi_row = 0; mi_row < rows; mi_row++) {
+ mi = mi_params->mi_grid_base;
+ for (int mi_row = 0; mi_row < rows; mi_row++) {
fprintf(mvs, "V ");
- for (mi_col = 0; mi_col < cols; mi_col++) {
+ for (int mi_col = 0; mi_col < cols; mi_col++) {
fprintf(mvs, "%4d:%4d ", mi[0]->mv[0].as_mv.row, mi[0]->mv[0].as_mv.col);
mi++;
}
fprintf(mvs, "\n");
- mi += MAX_MIB_SIZE;
+ mi += mi_params->mi_stride - cols;
}
fprintf(mvs, "\n");
@@ -93,6 +92,13 @@ void av1_print_uncompressed_frame_header(const uint8_t *data, int size,
const char *filename) {
FILE *hdrFile = fopen(filename, "w");
fwrite(data, size, sizeof(uint8_t), hdrFile);
+
+ // Reset order hints(7bit+1bit) to 0, so that all camera frame headers are
+ // identical in large scale coding.
+ uint8_t zero = 0;
+ fseek(hdrFile, 1, SEEK_SET);
+ // Reset second byte.
+ fwrite(&zero, 1, sizeof(uint8_t), hdrFile);
fclose(hdrFile);
}
diff --git a/media/libaom/src/av1/common/entropy.c b/media/libaom/src/av1/common/entropy.c
index 4f95ef69b..1f7a0efe0 100644
--- a/media/libaom/src/av1/common/entropy.c
+++ b/media/libaom/src/av1/common/entropy.c
@@ -13,10 +13,10 @@
#include "aom/aom_integer.h"
#include "aom_mem/aom_mem.h"
+#include "av1/common/av1_common_int.h"
#include "av1/common/blockd.h"
#include "av1/common/entropy.h"
#include "av1/common/entropymode.h"
-#include "av1/common/onyxc_int.h"
#include "av1/common/scan.h"
#include "av1/common/token_cdfs.h"
#include "av1/common/txb_common.h"
@@ -29,7 +29,7 @@ static int get_q_ctx(int q) {
}
void av1_default_coef_probs(AV1_COMMON *cm) {
- const int index = get_q_ctx(cm->base_qindex);
+ const int index = get_q_ctx(cm->quant_params.base_qindex);
#if CONFIG_ENTROPY_STATS
cm->coef_cdf_category = index;
#endif
@@ -50,8 +50,9 @@ void av1_default_coef_probs(AV1_COMMON *cm) {
av1_copy(cm->fc->eob_flag_cdf1024, av1_default_eob_multi1024_cdfs[index]);
}
-static void reset_cdf_symbol_counter(aom_cdf_prob *cdf_ptr, int num_cdfs,
- int cdf_stride, int nsymbs) {
+static AOM_INLINE void reset_cdf_symbol_counter(aom_cdf_prob *cdf_ptr,
+ int num_cdfs, int cdf_stride,
+ int nsymbs) {
for (int i = 0; i < num_cdfs; i++) {
cdf_ptr[i * cdf_stride + nsymbs] = 0;
}
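
For context on what reset_cdf_symbol_counter clears: each CDF row is sized CDF_SIZE(nsymbs) = nsymbs + 1, with the trailing slot holding the per-row symbol count used for adaptation, and only that trailing slot is zeroed here. A small illustration of that layout (the type alias and the values are illustrative, not taken from this file):

#include <stdint.h>

typedef uint16_t cdf_prob; /* stand-in for aom_cdf_prob */

/* One CDF row for a 3-symbol alphabet: three cumulative entries followed by
 * the symbol counter that reset_cdf_symbol_counter zeroes. */
static cdf_prob row[3 + 1] = { 10923, 21845, 0, 5 }; /* dummy values */

static void reset_counter(cdf_prob *cdf_row, int nsymbs) {
  cdf_row[nsymbs] = 0; /* clear only the counter; the probabilities stay */
}
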
@@ -68,7 +69,7 @@ static void reset_cdf_symbol_counter(aom_cdf_prob *cdf_ptr, int num_cdfs,
reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
} while (0)
-static void reset_nmv_counter(nmv_context *nmv) {
+static AOM_INLINE void reset_nmv_counter(nmv_context *nmv) {
RESET_CDF_COUNTER(nmv->joints_cdf, 4);
for (int i = 0; i < 2; i++) {
RESET_CDF_COUNTER(nmv->comps[i].classes_cdf, MV_CLASSES);
@@ -101,7 +102,7 @@ void av1_reset_cdf_symbol_counters(FRAME_CONTEXT *fc) {
RESET_CDF_COUNTER(fc->refmv_cdf, 2);
RESET_CDF_COUNTER(fc->drl_cdf, 2);
RESET_CDF_COUNTER(fc->inter_compound_mode_cdf, INTER_COMPOUND_MODES);
- RESET_CDF_COUNTER(fc->compound_type_cdf, COMPOUND_TYPES - 1);
+ RESET_CDF_COUNTER(fc->compound_type_cdf, MASKED_COMPOUND_TYPES);
RESET_CDF_COUNTER(fc->wedge_idx_cdf, 16);
RESET_CDF_COUNTER(fc->interintra_cdf, 2);
RESET_CDF_COUNTER(fc->wedge_interintra_cdf, 2);
diff --git a/media/libaom/src/av1/common/entropy.h b/media/libaom/src/av1/common/entropy.h
index 991692c2f..ee78f56a3 100644
--- a/media/libaom/src/av1/common/entropy.h
+++ b/media/libaom/src/av1/common/entropy.h
@@ -48,18 +48,18 @@ extern "C" {
#define BR_CDF_SIZE (4)
#define COEFF_BASE_RANGE (4 * (BR_CDF_SIZE - 1))
-#define COEFF_CONTEXT_BITS 6
+#define COEFF_CONTEXT_BITS 3
#define COEFF_CONTEXT_MASK ((1 << COEFF_CONTEXT_BITS) - 1)
#define MAX_BASE_BR_RANGE (COEFF_BASE_RANGE + NUM_BASE_LEVELS + 1)
#define BASE_CONTEXT_POSITION_NUM 12
-typedef enum TX_CLASS {
+enum {
TX_CLASS_2D = 0,
TX_CLASS_HORIZ = 1,
TX_CLASS_VERT = 2,
TX_CLASSES = 3,
-} TX_CLASS;
+} UENUM1BYTE(TX_CLASS);
#define DCT_MAX_VALUE 16384
#define DCT_MAX_VALUE_HIGH10 65536
diff --git a/media/libaom/src/av1/common/entropymode.c b/media/libaom/src/av1/common/entropymode.c
index 41dc30ddb..5f061be35 100644
--- a/media/libaom/src/av1/common/entropymode.c
+++ b/media/libaom/src/av1/common/entropymode.c
@@ -11,9 +11,9 @@
#include "aom_mem/aom_mem.h"
+#include "av1/common/av1_common_int.h"
#include "av1/common/reconinter.h"
#include "av1/common/scan.h"
-#include "av1/common/onyxc_int.h"
#include "av1/common/seg_common.h"
#include "av1/common/txb_common.h"
@@ -435,16 +435,16 @@ static const aom_cdf_prob
{ AOM_CDF3(601, 943) }, { AOM_CDF3(14969, 21398) }
};
-static const aom_cdf_prob default_newmv_cdf[NEWMV_MODE_CONTEXTS][CDF_SIZE(2)] =
- { { AOM_CDF2(24035) }, { AOM_CDF2(16630) }, { AOM_CDF2(15339) },
- { AOM_CDF2(8386) }, { AOM_CDF2(12222) }, { AOM_CDF2(4676) } };
+static const aom_cdf_prob default_newmv_cdf[NEWMV_MODE_CONTEXTS][CDF_SIZE(
+ 2)] = { { AOM_CDF2(24035) }, { AOM_CDF2(16630) }, { AOM_CDF2(15339) },
+ { AOM_CDF2(8386) }, { AOM_CDF2(12222) }, { AOM_CDF2(4676) } };
static const aom_cdf_prob default_zeromv_cdf[GLOBALMV_MODE_CONTEXTS][CDF_SIZE(
2)] = { { AOM_CDF2(2175) }, { AOM_CDF2(1054) } };
-static const aom_cdf_prob default_refmv_cdf[REFMV_MODE_CONTEXTS][CDF_SIZE(2)] =
- { { AOM_CDF2(23974) }, { AOM_CDF2(24188) }, { AOM_CDF2(17848) },
- { AOM_CDF2(28622) }, { AOM_CDF2(24312) }, { AOM_CDF2(19923) } };
+static const aom_cdf_prob default_refmv_cdf[REFMV_MODE_CONTEXTS][CDF_SIZE(
+ 2)] = { { AOM_CDF2(23974) }, { AOM_CDF2(24188) }, { AOM_CDF2(17848) },
+ { AOM_CDF2(28622) }, { AOM_CDF2(24312) }, { AOM_CDF2(19923) } };
static const aom_cdf_prob default_drl_cdf[DRL_MODE_CONTEXTS][CDF_SIZE(2)] = {
{ AOM_CDF2(13104) }, { AOM_CDF2(24560) }, { AOM_CDF2(18945) }
@@ -470,11 +470,11 @@ static const aom_cdf_prob default_interintra_cdf[BLOCK_SIZE_GROUPS][CDF_SIZE(
{ AOM_CDF2(30237) } };
static const aom_cdf_prob
- default_interintra_mode_cdf[BLOCK_SIZE_GROUPS][CDF_SIZE(INTERINTRA_MODES)] =
- { { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(1875, 11082, 27332) },
- { AOM_CDF4(2473, 9996, 26388) },
- { AOM_CDF4(4238, 11537, 25926) } };
+ default_interintra_mode_cdf[BLOCK_SIZE_GROUPS][CDF_SIZE(
+ INTERINTRA_MODES)] = { { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(1875, 11082, 27332) },
+ { AOM_CDF4(2473, 9996, 26388) },
+ { AOM_CDF4(4238, 11537, 25926) } };
static const aom_cdf_prob
default_wedge_interintra_cdf[BLOCK_SIZES_ALL][CDF_SIZE(2)] = {
@@ -488,63 +488,63 @@ static const aom_cdf_prob
{ AOM_CDF2(16384) }
};
-static const aom_cdf_prob
- default_compound_type_cdf[BLOCK_SIZES_ALL][CDF_SIZE(COMPOUND_TYPES - 1)] = {
- { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) },
- { AOM_CDF2(23431) }, { AOM_CDF2(13171) }, { AOM_CDF2(11470) },
- { AOM_CDF2(9770) }, { AOM_CDF2(9100) }, { AOM_CDF2(8233) },
- { AOM_CDF2(6172) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) },
- { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) },
- { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) },
- { AOM_CDF2(11820) }, { AOM_CDF2(7701) }, { AOM_CDF2(16384) },
- { AOM_CDF2(16384) }
- };
+static const aom_cdf_prob default_compound_type_cdf[BLOCK_SIZES_ALL][CDF_SIZE(
+ MASKED_COMPOUND_TYPES)] = {
+ { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) },
+ { AOM_CDF2(23431) }, { AOM_CDF2(13171) }, { AOM_CDF2(11470) },
+ { AOM_CDF2(9770) }, { AOM_CDF2(9100) }, { AOM_CDF2(8233) },
+ { AOM_CDF2(6172) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) },
+ { AOM_CDF2(11820) }, { AOM_CDF2(7701) }, { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) }
+};
-static const aom_cdf_prob default_wedge_idx_cdf[BLOCK_SIZES_ALL][CDF_SIZE(16)] =
- { { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432,
- 20480, 22528, 24576, 26624, 28672, 30720) },
- { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432,
- 20480, 22528, 24576, 26624, 28672, 30720) },
- { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432,
- 20480, 22528, 24576, 26624, 28672, 30720) },
- { AOM_CDF16(2438, 4440, 6599, 8663, 11005, 12874, 15751, 18094, 20359,
- 22362, 24127, 25702, 27752, 29450, 31171) },
- { AOM_CDF16(806, 3266, 6005, 6738, 7218, 7367, 7771, 14588, 16323, 17367,
- 18452, 19422, 22839, 26127, 29629) },
- { AOM_CDF16(2779, 3738, 4683, 7213, 7775, 8017, 8655, 14357, 17939, 21332,
- 24520, 27470, 29456, 30529, 31656) },
- { AOM_CDF16(1684, 3625, 5675, 7108, 9302, 11274, 14429, 17144, 19163,
- 20961, 22884, 24471, 26719, 28714, 30877) },
- { AOM_CDF16(1142, 3491, 6277, 7314, 8089, 8355, 9023, 13624, 15369, 16730,
- 18114, 19313, 22521, 26012, 29550) },
- { AOM_CDF16(2742, 4195, 5727, 8035, 8980, 9336, 10146, 14124, 17270,
- 20533, 23434, 25972, 27944, 29570, 31416) },
- { AOM_CDF16(1727, 3948, 6101, 7796, 9841, 12344, 15766, 18944, 20638,
- 22038, 23963, 25311, 26988, 28766, 31012) },
- { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432,
- 20480, 22528, 24576, 26624, 28672, 30720) },
- { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432,
- 20480, 22528, 24576, 26624, 28672, 30720) },
- { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432,
- 20480, 22528, 24576, 26624, 28672, 30720) },
- { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432,
- 20480, 22528, 24576, 26624, 28672, 30720) },
- { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432,
- 20480, 22528, 24576, 26624, 28672, 30720) },
- { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432,
- 20480, 22528, 24576, 26624, 28672, 30720) },
- { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432,
- 20480, 22528, 24576, 26624, 28672, 30720) },
- { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432,
- 20480, 22528, 24576, 26624, 28672, 30720) },
- { AOM_CDF16(154, 987, 1925, 2051, 2088, 2111, 2151, 23033, 23703, 24284,
- 24985, 25684, 27259, 28883, 30911) },
- { AOM_CDF16(1135, 1322, 1493, 2635, 2696, 2737, 2770, 21016, 22935, 25057,
- 27251, 29173, 30089, 30960, 31933) },
- { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432,
- 20480, 22528, 24576, 26624, 28672, 30720) },
- { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432,
- 20480, 22528, 24576, 26624, 28672, 30720) } };
+static const aom_cdf_prob default_wedge_idx_cdf[BLOCK_SIZES_ALL][CDF_SIZE(
+ 16)] = { { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384,
+ 18432, 20480, 22528, 24576, 26624, 28672, 30720) },
+ { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384,
+ 18432, 20480, 22528, 24576, 26624, 28672, 30720) },
+ { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384,
+ 18432, 20480, 22528, 24576, 26624, 28672, 30720) },
+ { AOM_CDF16(2438, 4440, 6599, 8663, 11005, 12874, 15751, 18094,
+ 20359, 22362, 24127, 25702, 27752, 29450, 31171) },
+ { AOM_CDF16(806, 3266, 6005, 6738, 7218, 7367, 7771, 14588, 16323,
+ 17367, 18452, 19422, 22839, 26127, 29629) },
+ { AOM_CDF16(2779, 3738, 4683, 7213, 7775, 8017, 8655, 14357, 17939,
+ 21332, 24520, 27470, 29456, 30529, 31656) },
+ { AOM_CDF16(1684, 3625, 5675, 7108, 9302, 11274, 14429, 17144,
+ 19163, 20961, 22884, 24471, 26719, 28714, 30877) },
+ { AOM_CDF16(1142, 3491, 6277, 7314, 8089, 8355, 9023, 13624, 15369,
+ 16730, 18114, 19313, 22521, 26012, 29550) },
+ { AOM_CDF16(2742, 4195, 5727, 8035, 8980, 9336, 10146, 14124,
+ 17270, 20533, 23434, 25972, 27944, 29570, 31416) },
+ { AOM_CDF16(1727, 3948, 6101, 7796, 9841, 12344, 15766, 18944,
+ 20638, 22038, 23963, 25311, 26988, 28766, 31012) },
+ { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384,
+ 18432, 20480, 22528, 24576, 26624, 28672, 30720) },
+ { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384,
+ 18432, 20480, 22528, 24576, 26624, 28672, 30720) },
+ { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384,
+ 18432, 20480, 22528, 24576, 26624, 28672, 30720) },
+ { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384,
+ 18432, 20480, 22528, 24576, 26624, 28672, 30720) },
+ { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384,
+ 18432, 20480, 22528, 24576, 26624, 28672, 30720) },
+ { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384,
+ 18432, 20480, 22528, 24576, 26624, 28672, 30720) },
+ { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384,
+ 18432, 20480, 22528, 24576, 26624, 28672, 30720) },
+ { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384,
+ 18432, 20480, 22528, 24576, 26624, 28672, 30720) },
+ { AOM_CDF16(154, 987, 1925, 2051, 2088, 2111, 2151, 23033, 23703,
+ 24284, 24985, 25684, 27259, 28883, 30911) },
+ { AOM_CDF16(1135, 1322, 1493, 2635, 2696, 2737, 2770, 21016, 22935,
+ 25057, 27251, 29173, 30089, 30960, 31933) },
+ { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384,
+ 18432, 20480, 22528, 24576, 26624, 28672, 30720) },
+ { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384,
+ 18432, 20480, 22528, 24576, 26624, 28672, 30720) } };
static const aom_cdf_prob default_motion_mode_cdf[BLOCK_SIZES_ALL][CDF_SIZE(
MOTION_MODES)] = { { AOM_CDF3(10923, 21845) }, { AOM_CDF3(10923, 21845) },
@@ -1068,9 +1068,16 @@ void av1_setup_frame_contexts(AV1_COMMON *cm) {
// This function must ONLY be called when cm->fc has been initialized with
// default probs, either by av1_setup_past_independence or after manually
// initializing them
- cm->frame_contexts[FRAME_CONTEXT_DEFAULTS] = *cm->fc;
- if (cm->large_scale_tile) {
- for (int i = 0; i < FRAME_CONTEXTS; ++i) cm->frame_contexts[i] = *cm->fc;
+ *cm->default_frame_context = *cm->fc;
+ // TODO(jack.haughton@argondesign.com): don't think this should be necessary,
+ // but could do with fuller testing
+ if (cm->tiles.large_scale) {
+ for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
+ RefCntBuffer *const buf = get_ref_frame_buf(cm, i);
+ if (buf != NULL) buf->frame_context = *cm->fc;
+ }
+ for (int i = 0; i < FRAME_BUFFERS; ++i)
+ cm->buffer_pool->frame_bufs[i].frame_context = *cm->fc;
}
}
@@ -1079,10 +1086,9 @@ void av1_setup_past_independence(AV1_COMMON *cm) {
// Features disabled, 0, with delta coding (Default state).
av1_clearall_segfeatures(&cm->seg);
- cm->current_frame_seg_map = cm->cur_frame->seg_map;
-
- if (cm->current_frame_seg_map)
- memset(cm->current_frame_seg_map, 0, (cm->mi_rows * cm->mi_cols));
+ if (cm->cur_frame->seg_map)
+ memset(cm->cur_frame->seg_map, 0,
+ (cm->mi_params.mi_rows * cm->mi_params.mi_cols));
// reset mode ref deltas
av1_set_default_ref_deltas(cm->cur_frame->ref_deltas);
@@ -1092,12 +1098,6 @@ void av1_setup_past_independence(AV1_COMMON *cm) {
av1_default_coef_probs(cm);
init_mode_probs(cm->fc);
av1_init_mv_probs(cm);
- av1_init_lv_map(cm);
cm->fc->initialized = 1;
av1_setup_frame_contexts(cm);
-
- // prev_mip will only be allocated in encoder.
- if (frame_is_intra_only(cm) && cm->prev_mip)
- memset(cm->prev_mip, 0,
- cm->mi_stride * cm->mi_rows * sizeof(*cm->prev_mip));
}
diff --git a/media/libaom/src/av1/common/entropymode.h b/media/libaom/src/av1/common/entropymode.h
index 7047f34d2..bbbf55dc8 100644
--- a/media/libaom/src/av1/common/entropymode.h
+++ b/media/libaom/src/av1/common/entropymode.h
@@ -63,7 +63,6 @@ struct AV1Common;
typedef struct {
const int16_t *scan;
const int16_t *iscan;
- const int16_t *neighbors;
} SCAN_ORDER;
typedef struct frame_contexts {
@@ -92,7 +91,8 @@ typedef struct frame_contexts {
aom_cdf_prob inter_compound_mode_cdf[INTER_MODE_CONTEXTS]
[CDF_SIZE(INTER_COMPOUND_MODES)];
- aom_cdf_prob compound_type_cdf[BLOCK_SIZES_ALL][CDF_SIZE(COMPOUND_TYPES - 1)];
+ aom_cdf_prob compound_type_cdf[BLOCK_SIZES_ALL]
+ [CDF_SIZE(MASKED_COMPOUND_TYPES)];
aom_cdf_prob wedge_idx_cdf[BLOCK_SIZES_ALL][CDF_SIZE(16)];
aom_cdf_prob interintra_cdf[BLOCK_SIZE_GROUPS][CDF_SIZE(2)];
aom_cdf_prob wedge_interintra_cdf[BLOCK_SIZES_ALL][CDF_SIZE(2)];
diff --git a/media/libaom/src/av1/common/entropymv.c b/media/libaom/src/av1/common/entropymv.c
index 491337387..e1e42f2f1 100644
--- a/media/libaom/src/av1/common/entropymv.c
+++ b/media/libaom/src/av1/common/entropymv.c
@@ -9,7 +9,7 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#include "av1/common/onyxc_int.h"
+#include "av1/common/av1_common_int.h"
#include "av1/common/entropymv.h"
static const nmv_context default_nmv_context = {
diff --git a/media/libaom/src/av1/common/entropymv.h b/media/libaom/src/av1/common/entropymv.h
index fa818a2c1..cddc80768 100644
--- a/media/libaom/src/av1/common/entropymv.h
+++ b/media/libaom/src/av1/common/entropymv.h
@@ -30,12 +30,12 @@ void av1_init_mv_probs(struct AV1Common *cm);
/* Symbols for coding which components are zero jointly */
#define MV_JOINTS 4
-typedef enum {
+enum {
MV_JOINT_ZERO = 0, /* Zero vector */
MV_JOINT_HNZVZ = 1, /* Vert zero, hor nonzero */
MV_JOINT_HZVNZ = 2, /* Hor zero, vert nonzero */
MV_JOINT_HNZVNZ = 3, /* Both components nonzero */
-} MV_JOINT_TYPE;
+} UENUM1BYTE(MV_JOINT_TYPE);
static INLINE int mv_joint_vertical(MV_JOINT_TYPE type) {
return type == MV_JOINT_HZVNZ || type == MV_JOINT_HNZVNZ;
@@ -47,7 +47,7 @@ static INLINE int mv_joint_horizontal(MV_JOINT_TYPE type) {
/* Symbols for coding magnitude class of nonzero components */
#define MV_CLASSES 11
-typedef enum {
+enum {
MV_CLASS_0 = 0, /* (0, 2] integer pel */
MV_CLASS_1 = 1, /* (2, 4] integer pel */
MV_CLASS_2 = 2, /* (4, 8] integer pel */
@@ -59,7 +59,7 @@ typedef enum {
MV_CLASS_8 = 8, /* (256, 512] integer pel */
MV_CLASS_9 = 9, /* (512, 1024] integer pel */
MV_CLASS_10 = 10, /* (1024,2048] integer pel */
-} MV_CLASS_TYPE;
+} UENUM1BYTE(MV_CLASS_TYPE);
#define CLASS0_BITS 1 /* bits at integer precision for class 0 */
#define CLASS0_SIZE (1 << CLASS0_BITS)
@@ -91,11 +91,11 @@ typedef struct {
nmv_component comps[2];
} nmv_context;
-typedef enum {
+enum {
MV_SUBPEL_NONE = -1,
MV_SUBPEL_LOW_PRECISION = 0,
MV_SUBPEL_HIGH_PRECISION,
-} MvSubpelPrecision;
+} SENUM1BYTE(MvSubpelPrecision);
#ifdef __cplusplus
} // extern "C"
diff --git a/media/libaom/src/av1/common/enums.h b/media/libaom/src/av1/common/enums.h
index 869c06ef2..0c09a1bc7 100644
--- a/media/libaom/src/av1/common/enums.h
+++ b/media/libaom/src/av1/common/enums.h
@@ -16,6 +16,7 @@
#include "aom/aom_codec.h"
#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
#ifdef __cplusplus
extern "C" {
@@ -63,17 +64,6 @@ extern "C" {
#define FRAME_OFFSET_BITS 5
#define MAX_FRAME_DISTANCE ((1 << FRAME_OFFSET_BITS) - 1)
-#define REF_FRAMES_LOG2 3
-#define REF_FRAMES (1 << REF_FRAMES_LOG2)
-
-// 4 scratch frames for the new frames to support a maximum of 4 cores decoding
-// in parallel, 3 for scaled references on the encoder.
-// TODO(hkuang): Add ondemand frame buffers instead of hardcoding the number
-// of framebuffers.
-// TODO(jkoleszar): These 3 extra references could probably come from the
-// normal reference pool.
-#define FRAME_BUFFERS (REF_FRAMES + 7)
-
// 4 frame filter levels: y plane vertical, y plane horizontal,
// u plane, and v plane
#define FRAME_LF_COUNT 4
@@ -83,11 +73,6 @@ extern "C" {
#define DIST_PRECISION_BITS 4
#define DIST_PRECISION (1 << DIST_PRECISION_BITS) // 16
-// TODO(chengchen): Temporal flag serve as experimental flag for WIP
-// bitmask construction.
-// Shall be removed when bitmask code is completely checkedin
-#define LOOP_FILTER_BITMASK 0
-
#define PROFILE_BITS 3
// The following three profiles are currently defined.
// Profile 0. 8-bit and 10-bit 4:2:0 and 4:0:0 only.
@@ -95,21 +80,12 @@ extern "C" {
// Profile 2. 8-bit and 10-bit 4:2:2
// 12-bit 4:0:0, 4:2:2 and 4:4:4
// Since we have three bits for the profiles, it can be extended later.
-typedef enum BITSTREAM_PROFILE {
+enum {
PROFILE_0,
PROFILE_1,
PROFILE_2,
MAX_PROFILES,
-} BITSTREAM_PROFILE;
-
-#define LEVEL_MAJOR_BITS 3
-#define LEVEL_MINOR_BITS 2
-#define LEVEL_BITS (LEVEL_MAJOR_BITS + LEVEL_MINOR_BITS)
-
-#define LEVEL_MAJOR_MIN 2
-#define LEVEL_MAJOR_MAX ((1 << LEVEL_MAJOR_BITS) - 1 + LEVEL_MAJOR_MIN)
-#define LEVEL_MINOR_MIN 0
-#define LEVEL_MINOR_MAX ((1 << LEVEL_MINOR_BITS) - 1)
+} SENUM1BYTE(BITSTREAM_PROFILE);
#define OP_POINTS_CNT_MINUS_1_BITS 5
#define OP_POINTS_IDC_BITS 12
@@ -149,7 +125,28 @@ typedef enum ATTRIBUTE_PACKED {
// 4X4, 8X8, 16X16, 32X32, 64X64, 128X128
#define SQR_BLOCK_SIZES 6
-typedef enum ATTRIBUTE_PACKED {
+// Partition types. R: Recursive
+//
+// NONE HORZ VERT SPLIT
+// +-------+ +-------+ +---+---+ +---+---+
+// | | | | | | | | R | R |
+// | | +-------+ | | | +---+---+
+// | | | | | | | | R | R |
+// +-------+ +-------+ +---+---+ +---+---+
+//
+// HORZ_A HORZ_B VERT_A VERT_B
+// +---+---+ +-------+ +---+---+ +---+---+
+// | | | | | | | | | | |
+// +---+---+ +---+---+ +---+ | | +---+
+// | | | | | | | | | | |
+// +-------+ +---+---+ +---+---+ +---+---+
+//
+// HORZ_4 VERT_4
+// +-----+ +-+-+-+
+// +-----+ | | | |
+// +-----+ | | | |
+// +-----+ +-+-+-+
+enum {
PARTITION_NONE,
PARTITION_HORZ,
PARTITION_VERT,
@@ -163,7 +160,7 @@ typedef enum ATTRIBUTE_PACKED {
EXT_PARTITION_TYPES,
PARTITION_TYPES = PARTITION_SPLIT + 1,
PARTITION_INVALID = 255
-} PARTITION_TYPE;
+} UENUM1BYTE(PARTITION_TYPE);
typedef char PARTITION_CONTEXT;
#define PARTITION_PLOFFSET 4 // number of probability models per block size
@@ -171,12 +168,7 @@ typedef char PARTITION_CONTEXT;
#define PARTITION_CONTEXTS (PARTITION_BLOCK_SIZES * PARTITION_PLOFFSET)
// block transform size
-#if defined(_MSC_VER)
-typedef uint8_t TX_SIZE;
-enum ATTRIBUTE_PACKED {
-#else
-typedef enum ATTRIBUTE_PACKED {
-#endif
+enum {
TX_4X4, // 4x4 transform
TX_8X8, // 8x8 transform
TX_16X16, // 16x16 transform
@@ -200,11 +192,7 @@ typedef enum ATTRIBUTE_PACKED {
TX_SIZES = TX_4X8, // Does NOT include rectangular transforms
TX_SIZES_LARGEST = TX_64X64,
TX_INVALID = 255 // Invalid transform size
-#if defined(_MSC_VER)
-};
-#else
-} TX_SIZE;
-#endif
+} UENUM1BYTE(TX_SIZE);
#define TX_SIZE_LUMA_MIN (TX_4X4)
/* We don't need to code a transform size unless the allowed size is at least
@@ -226,7 +214,7 @@ typedef enum ATTRIBUTE_PACKED {
#define TX_PAD_HOR 4
// Pad 6 extra rows (2 on top and 4 on bottom) to remove vertical availability
// check.
-#define TX_PAD_TOP 2
+#define TX_PAD_TOP 0
#define TX_PAD_BOTTOM 4
#define TX_PAD_VER (TX_PAD_TOP + TX_PAD_BOTTOM)
// Pad 16 extra bytes to avoid reading overflow in SIMD optimization.
@@ -238,43 +226,44 @@ typedef enum ATTRIBUTE_PACKED {
#define MAX_TX_BLOCKS_IN_MAX_SB (1 << MAX_TX_BLOCKS_IN_MAX_SB_LOG2)
// frame transform mode
-typedef enum ATTRIBUTE_PACKED {
+enum {
ONLY_4X4, // use only 4x4 transform
TX_MODE_LARGEST, // transform size is the largest possible for pu size
TX_MODE_SELECT, // transform specified for each block
TX_MODES,
-} TX_MODE;
+} UENUM1BYTE(TX_MODE);
// 1D tx types
-typedef enum ATTRIBUTE_PACKED {
+enum {
DCT_1D,
ADST_1D,
FLIPADST_1D,
IDTX_1D,
TX_TYPES_1D,
-} TX_TYPE_1D;
-
-typedef enum ATTRIBUTE_PACKED {
- DCT_DCT, // DCT in both horizontal and vertical
- ADST_DCT, // ADST in vertical, DCT in horizontal
- DCT_ADST, // DCT in vertical, ADST in horizontal
- ADST_ADST, // ADST in both directions
- FLIPADST_DCT,
- DCT_FLIPADST,
- FLIPADST_FLIPADST,
- ADST_FLIPADST,
- FLIPADST_ADST,
- IDTX,
- V_DCT,
- H_DCT,
- V_ADST,
- H_ADST,
- V_FLIPADST,
- H_FLIPADST,
+} UENUM1BYTE(TX_TYPE_1D);
+
+enum {
+ DCT_DCT, // DCT in both horizontal and vertical
+ ADST_DCT, // ADST in vertical, DCT in horizontal
+ DCT_ADST, // DCT in vertical, ADST in horizontal
+ ADST_ADST, // ADST in both directions
+ FLIPADST_DCT, // FLIPADST in vertical, DCT in horizontal
+ DCT_FLIPADST, // DCT in vertical, FLIPADST in horizontal
+ FLIPADST_FLIPADST, // FLIPADST in both directions
+ ADST_FLIPADST, // ADST in vertical, FLIPADST in horizontal
+ FLIPADST_ADST, // FLIPADST in vertical, ADST in horizontal
+ IDTX, // Identity in both directions
+ V_DCT, // DCT in vertical, identity in horizontal
+ H_DCT, // Identity in vertical, DCT in horizontal
+ V_ADST, // ADST in vertical, identity in horizontal
+ H_ADST, // Identity in vertical, ADST in horizontal
+ V_FLIPADST, // FLIPADST in vertical, identity in horizontal
+ H_FLIPADST, // Identity in vertical, FLIPADST in horizontal
TX_TYPES,
-} TX_TYPE;
+ DCT_ADST_TX_MASK = 0x000F, // Either DCT or ADST in each direction
+} UENUM1BYTE(TX_TYPE);
-typedef enum ATTRIBUTE_PACKED {
+enum {
REG_REG,
REG_SMOOTH,
REG_SHARP,
@@ -284,9 +273,9 @@ typedef enum ATTRIBUTE_PACKED {
SHARP_REG,
SHARP_SMOOTH,
SHARP_SHARP,
-} DUAL_FILTER_TYPE;
+} UENUM1BYTE(DUAL_FILTER_TYPE);
-typedef enum ATTRIBUTE_PACKED {
+enum {
// DCT only
EXT_TX_SET_DCTONLY,
// DCT + Identity only
@@ -300,15 +289,13 @@ typedef enum ATTRIBUTE_PACKED {
// Discrete Trig transforms w/ flip (9) + Identity (1) + 1D Hor/Ver (6)
EXT_TX_SET_ALL16,
EXT_TX_SET_TYPES
-} TxSetType;
-
-#define IS_2D_TRANSFORM(tx_type) (tx_type < IDTX)
+} UENUM1BYTE(TxSetType);
#define EXT_TX_SIZES 4 // number of sizes that use extended transforms
#define EXT_TX_SETS_INTER 4 // Sets of transform selections for INTER
#define EXT_TX_SETS_INTRA 3 // Sets of transform selections for INTRA
-typedef enum ATTRIBUTE_PACKED {
+enum {
AOM_LAST_FLAG = 1 << 0,
AOM_LAST2_FLAG = 1 << 1,
AOM_LAST3_FLAG = 1 << 2,
@@ -317,19 +304,15 @@ typedef enum ATTRIBUTE_PACKED {
AOM_ALT2_FLAG = 1 << 5,
AOM_ALT_FLAG = 1 << 6,
AOM_REFFRAME_ALL = (1 << 7) - 1
-} AOM_REFFRAME;
+} UENUM1BYTE(AOM_REFFRAME);
-typedef enum ATTRIBUTE_PACKED {
+enum {
UNIDIR_COMP_REFERENCE,
BIDIR_COMP_REFERENCE,
COMP_REFERENCE_TYPES,
-} COMP_REFERENCE_TYPE;
+} UENUM1BYTE(COMP_REFERENCE_TYPE);
-typedef enum ATTRIBUTE_PACKED {
- PLANE_TYPE_Y,
- PLANE_TYPE_UV,
- PLANE_TYPES
-} PLANE_TYPE;
+enum { PLANE_TYPE_Y, PLANE_TYPE_UV, PLANE_TYPES } UENUM1BYTE(PLANE_TYPE);
#define CFL_ALPHABET_SIZE_LOG2 4
#define CFL_ALPHABET_SIZE (1 << CFL_ALPHABET_SIZE_LOG2)
@@ -337,24 +320,20 @@ typedef enum ATTRIBUTE_PACKED {
#define CFL_IDX_U(idx) (idx >> CFL_ALPHABET_SIZE_LOG2)
#define CFL_IDX_V(idx) (idx & (CFL_ALPHABET_SIZE - 1))
-typedef enum ATTRIBUTE_PACKED {
- CFL_PRED_U,
- CFL_PRED_V,
- CFL_PRED_PLANES
-} CFL_PRED_TYPE;
+enum { CFL_PRED_U, CFL_PRED_V, CFL_PRED_PLANES } UENUM1BYTE(CFL_PRED_TYPE);
-typedef enum ATTRIBUTE_PACKED {
+enum {
CFL_SIGN_ZERO,
CFL_SIGN_NEG,
CFL_SIGN_POS,
CFL_SIGNS
-} CFL_SIGN_TYPE;
+} UENUM1BYTE(CFL_SIGN_TYPE);
-typedef enum ATTRIBUTE_PACKED {
+enum {
CFL_DISALLOWED,
CFL_ALLOWED,
CFL_ALLOWED_TYPES
-} CFL_ALLOWED_TYPE;
+} UENUM1BYTE(CFL_ALLOWED_TYPE);
// CFL_SIGN_ZERO,CFL_SIGN_ZERO is invalid
#define CFL_JOINT_SIGNS (CFL_SIGNS * CFL_SIGNS - 1)
@@ -371,12 +350,12 @@ typedef enum ATTRIBUTE_PACKED {
#define CFL_CONTEXT_V(js) \
(CFL_SIGN_V(js) * CFL_SIGNS + CFL_SIGN_U(js) - CFL_SIGNS)
-typedef enum ATTRIBUTE_PACKED {
+enum {
PALETTE_MAP,
COLOR_MAP_TYPES,
-} COLOR_MAP_TYPE;
+} UENUM1BYTE(COLOR_MAP_TYPE);
-typedef enum ATTRIBUTE_PACKED {
+enum {
TWO_COLORS,
THREE_COLORS,
FOUR_COLORS,
@@ -385,9 +364,9 @@ typedef enum ATTRIBUTE_PACKED {
SEVEN_COLORS,
EIGHT_COLORS,
PALETTE_SIZES
-} PALETTE_SIZE;
+} UENUM1BYTE(PALETTE_SIZE);
-typedef enum ATTRIBUTE_PACKED {
+enum {
PALETTE_COLOR_ONE,
PALETTE_COLOR_TWO,
PALETTE_COLOR_THREE,
@@ -397,11 +376,11 @@ typedef enum ATTRIBUTE_PACKED {
PALETTE_COLOR_SEVEN,
PALETTE_COLOR_EIGHT,
PALETTE_COLORS
-} PALETTE_COLOR;
+} UENUM1BYTE(PALETTE_COLOR);
// Note: All directional predictors must be between V_PRED and D67_PRED (both
// inclusive).
-typedef enum ATTRIBUTE_PACKED {
+enum {
DC_PRED, // Average of above and left pixels
V_PRED, // Vertical
H_PRED, // Horizontal
@@ -431,6 +410,8 @@ typedef enum ATTRIBUTE_PACKED {
MB_MODE_COUNT,
INTRA_MODE_START = DC_PRED,
INTRA_MODE_END = NEARESTMV,
+ DIR_MODE_START = V_PRED,
+ DIR_MODE_END = D67_PRED + 1,
INTRA_MODE_NUM = INTRA_MODE_END - INTRA_MODE_START,
SINGLE_INTER_MODE_START = NEARESTMV,
SINGLE_INTER_MODE_END = NEAREST_NEARESTMV,
@@ -442,11 +423,11 @@ typedef enum ATTRIBUTE_PACKED {
INTER_MODE_END = MB_MODE_COUNT,
INTRA_MODES = PAETH_PRED + 1, // PAETH_PRED has to be the last intra mode.
INTRA_INVALID = MB_MODE_COUNT // For uv_mode in inter blocks
-} PREDICTION_MODE;
+} UENUM1BYTE(PREDICTION_MODE);
// TODO(ltrudeau) Do we really want to pack this?
// TODO(ltrudeau) Do we match with PREDICTION_MODE?
-typedef enum ATTRIBUTE_PACKED {
+enum {
UV_DC_PRED, // Average of above and left pixels
UV_V_PRED, // Vertical
UV_H_PRED, // Horizontal
@@ -463,38 +444,71 @@ typedef enum ATTRIBUTE_PACKED {
UV_CFL_PRED, // Chroma-from-Luma
UV_INTRA_MODES,
UV_MODE_INVALID, // For uv_mode in inter blocks
-} UV_PREDICTION_MODE;
+} UENUM1BYTE(UV_PREDICTION_MODE);
-typedef enum ATTRIBUTE_PACKED {
+enum {
SIMPLE_TRANSLATION,
OBMC_CAUSAL, // 2-sided OBMC
WARPED_CAUSAL, // 2-sided WARPED
MOTION_MODES
-} MOTION_MODE;
+} UENUM1BYTE(MOTION_MODE);
-typedef enum ATTRIBUTE_PACKED {
+enum {
II_DC_PRED,
II_V_PRED,
II_H_PRED,
II_SMOOTH_PRED,
INTERINTRA_MODES
-} INTERINTRA_MODE;
+} UENUM1BYTE(INTERINTRA_MODE);
-typedef enum ATTRIBUTE_PACKED {
+enum {
COMPOUND_AVERAGE,
+ COMPOUND_DISTWTD,
COMPOUND_WEDGE,
COMPOUND_DIFFWTD,
COMPOUND_TYPES,
-} COMPOUND_TYPE;
+ MASKED_COMPOUND_TYPES = 2,
+} UENUM1BYTE(COMPOUND_TYPE);
-typedef enum ATTRIBUTE_PACKED {
+enum {
FILTER_DC_PRED,
FILTER_V_PRED,
FILTER_H_PRED,
FILTER_D157_PRED,
FILTER_PAETH_PRED,
FILTER_INTRA_MODES,
-} FILTER_INTRA_MODE;
+} UENUM1BYTE(FILTER_INTRA_MODE);
+
+enum {
+ SEQ_LEVEL_2_0,
+ SEQ_LEVEL_2_1,
+ SEQ_LEVEL_2_2,
+ SEQ_LEVEL_2_3,
+ SEQ_LEVEL_3_0,
+ SEQ_LEVEL_3_1,
+ SEQ_LEVEL_3_2,
+ SEQ_LEVEL_3_3,
+ SEQ_LEVEL_4_0,
+ SEQ_LEVEL_4_1,
+ SEQ_LEVEL_4_2,
+ SEQ_LEVEL_4_3,
+ SEQ_LEVEL_5_0,
+ SEQ_LEVEL_5_1,
+ SEQ_LEVEL_5_2,
+ SEQ_LEVEL_5_3,
+ SEQ_LEVEL_6_0,
+ SEQ_LEVEL_6_1,
+ SEQ_LEVEL_6_2,
+ SEQ_LEVEL_6_3,
+ SEQ_LEVEL_7_0,
+ SEQ_LEVEL_7_1,
+ SEQ_LEVEL_7_2,
+ SEQ_LEVEL_7_3,
+ SEQ_LEVELS,
+ SEQ_LEVEL_MAX = 31
+} UENUM1BYTE(AV1_LEVEL);
+
+#define LEVEL_BITS 5
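A quick consistency check derived from the enum and LEVEL_BITS above:

  // 24 predefined levels: SEQ_LEVEL_2_0 (0) .. SEQ_LEVEL_7_3 (23)  =>  SEQ_LEVELS == 24
  // 5-bit level field:    (1 << LEVEL_BITS) - 1 == 31              =>  SEQ_LEVEL_MAX == 31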
#define DIRECTIONAL_MODES 8
#define MAX_ANGLE_DELTA 3
@@ -529,7 +543,9 @@ typedef enum ATTRIBUTE_PACKED {
#define DELTA_Q_SMALL 3
#define DELTA_Q_PROBS (DELTA_Q_SMALL)
-#define DEFAULT_DELTA_Q_RES 4
+#define DEFAULT_DELTA_Q_RES_PERCEPTUAL 4
+#define DEFAULT_DELTA_Q_RES_OBJECTIVE 4
+
#define DELTA_LF_SMALL 3
#define DELTA_LF_PROBS (DELTA_LF_SMALL)
#define DEFAULT_DELTA_LF_RES 2
@@ -538,6 +554,7 @@ typedef enum ATTRIBUTE_PACKED {
#define MAX_MV_REF_CANDIDATES 2
#define MAX_REF_MV_STACK_SIZE 8
+#define USABLE_REF_MV_STACK_SIZE 4
#define REF_CAT_LEVEL 640
#define INTRA_INTER_CONTEXTS 4
@@ -550,28 +567,47 @@ typedef enum ATTRIBUTE_PACKED {
#define TXFM_PARTITION_CONTEXTS ((TX_SIZES - TX_8X8) * 6 - 3)
typedef uint8_t TXFM_CONTEXT;
-#define NONE_FRAME -1
-#define INTRA_FRAME 0
-#define LAST_FRAME 1
-#define LAST2_FRAME 2
-#define LAST3_FRAME 3
-#define GOLDEN_FRAME 4
-#define BWDREF_FRAME 5
-#define ALTREF2_FRAME 6
-#define ALTREF_FRAME 7
-#define EXTREF_FRAME REF_FRAMES
-#define LAST_REF_FRAMES (LAST3_FRAME - LAST_FRAME + 1)
-
-#define INTER_REFS_PER_FRAME (ALTREF_FRAME - LAST_FRAME + 1)
-
-#define FWD_REFS (GOLDEN_FRAME - LAST_FRAME + 1)
+// An enum for single reference types (and some derived values).
+enum {
+ NONE_FRAME = -1,
+ INTRA_FRAME,
+ LAST_FRAME,
+ LAST2_FRAME,
+ LAST3_FRAME,
+ GOLDEN_FRAME,
+ BWDREF_FRAME,
+ ALTREF2_FRAME,
+ ALTREF_FRAME,
+ REF_FRAMES,
+
+ // Extra/scratch reference frame. It may be:
+ // - used to update the ALTREF2_FRAME ref (see lshift_bwd_ref_frames()), or
+ // - updated from ALTREF2_FRAME ref (see rshift_bwd_ref_frames()).
+ EXTREF_FRAME = REF_FRAMES,
+
+ // Number of inter (non-intra) reference types.
+ INTER_REFS_PER_FRAME = ALTREF_FRAME - LAST_FRAME + 1,
+
+ // Number of forward (aka past) reference types.
+ FWD_REFS = GOLDEN_FRAME - LAST_FRAME + 1,
+
+ // Number of backward (aka future) reference types.
+ BWD_REFS = ALTREF_FRAME - BWDREF_FRAME + 1,
+
+ SINGLE_REFS = FWD_REFS + BWD_REFS,
+};
+
+#define REF_FRAMES_LOG2 3
+
+// REF_FRAMES for the cm->ref_frame_map array, 1 scratch frame for the new
+// frame in cm->cur_frame, INTER_REFS_PER_FRAME for scaled references on the
+// encoder in the cpi->scaled_ref_buf array.
+#define FRAME_BUFFERS (REF_FRAMES + 1 + INTER_REFS_PER_FRAME)
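Plugging the enum values above into these macros gives, as a quick sanity check:

  // INTER_REFS_PER_FRAME = ALTREF_FRAME - LAST_FRAME   + 1 = 7 - 1 + 1 = 7
  // FWD_REFS             = GOLDEN_FRAME - LAST_FRAME   + 1 = 4 - 1 + 1 = 4
  // BWD_REFS             = ALTREF_FRAME - BWDREF_FRAME + 1 = 7 - 5 + 1 = 3
  // SINGLE_REFS          = FWD_REFS + BWD_REFS             = 4 + 3     = 7
  // FRAME_BUFFERS        = REF_FRAMES + 1 + INTER_REFS_PER_FRAME = 8 + 1 + 7 = 16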
+
#define FWD_RF_OFFSET(ref) (ref - LAST_FRAME)
-#define BWD_REFS (ALTREF_FRAME - BWDREF_FRAME + 1)
#define BWD_RF_OFFSET(ref) (ref - BWDREF_FRAME)
-#define SINGLE_REFS (FWD_REFS + BWD_REFS)
-
-typedef enum ATTRIBUTE_PACKED {
+enum {
LAST_LAST2_FRAMES, // { LAST_FRAME, LAST2_FRAME }
LAST_LAST3_FRAMES, // { LAST_FRAME, LAST3_FRAME }
LAST_GOLDEN_FRAMES, // { LAST_FRAME, GOLDEN_FRAME }
@@ -585,7 +621,7 @@ typedef enum ATTRIBUTE_PACKED {
// NOTE: UNIDIR_COMP_REFS is the number of uni-directional reference pairs
// that are explicitly signaled.
UNIDIR_COMP_REFS = BWDREF_ALTREF_FRAMES + 1,
-} UNIDIR_COMP_REF;
+} UENUM1BYTE(UNIDIR_COMP_REF);
#define TOTAL_COMP_REFS (FWD_REFS * BWD_REFS + TOTAL_UNIDIR_COMP_REFS)
@@ -596,14 +632,37 @@ typedef enum ATTRIBUTE_PACKED {
// possible to have a reference pair not listed for explicit signaling.
#define MODE_CTX_REF_FRAMES (REF_FRAMES + TOTAL_COMP_REFS)
-typedef enum ATTRIBUTE_PACKED {
+// Note: This includes both single and compound references, so it can take
+// values from NONE_FRAME to (MODE_CTX_REF_FRAMES - 1); hence it is not defined
+// as an enum.
+typedef int8_t MV_REFERENCE_FRAME;
+
+enum {
RESTORE_NONE,
RESTORE_WIENER,
RESTORE_SGRPROJ,
RESTORE_SWITCHABLE,
RESTORE_SWITCHABLE_TYPES = RESTORE_SWITCHABLE,
RESTORE_TYPES = 4,
-} RestorationType;
+} UENUM1BYTE(RestorationType);
+
+// Picture prediction structures (0-13 are predefined) in scalability metadata.
+enum {
+ SCALABILITY_L1T2 = 0,
+ SCALABILITY_L1T3 = 1,
+ SCALABILITY_L2T1 = 2,
+ SCALABILITY_L2T2 = 3,
+ SCALABILITY_L2T3 = 4,
+ SCALABILITY_S2T1 = 5,
+ SCALABILITY_S2T2 = 6,
+ SCALABILITY_S2T3 = 7,
+ SCALABILITY_L2T1h = 8,
+ SCALABILITY_L2T2h = 9,
+ SCALABILITY_L2T3h = 10,
+ SCALABILITY_S2T1h = 11,
+ SCALABILITY_S2T2h = 12,
+ SCALABILITY_S2T3h = 13,
+ SCALABILITY_SS = 14
+} UENUM1BYTE(SCALABILITY_STRUCTURES);
#define SUPERRES_SCALE_BITS 3
#define SUPERRES_SCALE_DENOMINATOR_MIN (SCALE_NUMERATOR + 1)
diff --git a/media/libaom/src/av1/common/filter.h b/media/libaom/src/av1/common/filter.h
index 571422d11..91791d3dc 100644
--- a/media/libaom/src/av1/common/filter.h
+++ b/media/libaom/src/av1/common/filter.h
@@ -19,6 +19,7 @@
#include "aom/aom_integer.h"
#include "aom_dsp/aom_filter.h"
#include "aom_ports/mem.h"
+#include "av1/common/enums.h"
#ifdef __cplusplus
extern "C" {
@@ -35,29 +36,55 @@ typedef enum ATTRIBUTE_PACKED {
SWITCHABLE_FILTERS = BILINEAR,
SWITCHABLE = SWITCHABLE_FILTERS + 1, /* the last switchable one */
EXTRA_FILTERS = INTERP_FILTERS_ALL - SWITCHABLE_FILTERS,
+ INTERP_INVALID = 0xff,
} InterpFilter;
-// With CONFIG_DUAL_FILTER, pack two InterpFilter's into a uint32_t: since
-// there are at most 10 filters, we can use 16 bits for each and have more than
-// enough space. This reduces argument passing and unifies the operation of
-// setting a (pair of) filters.
-//
-// Without CONFIG_DUAL_FILTER,
-typedef uint32_t InterpFilters;
-static INLINE InterpFilter av1_extract_interp_filter(InterpFilters filters,
- int x_filter) {
- return (InterpFilter)((filters >> (x_filter ? 16 : 0)) & 0xf);
-}
+enum {
+ USE_2_TAPS_ORIG = 0, // This is used in temporal filtering.
+ USE_2_TAPS,
+ USE_4_TAPS,
+ USE_8_TAPS,
+} UENUM1BYTE(SUBPEL_SEARCH_TYPE);
+
+enum {
+ INTERP_EVAL_LUMA_EVAL_CHROMA = 0,
+ INTERP_SKIP_LUMA_EVAL_CHROMA,
+ INTERP_EVAL_INVALID,
+ INTERP_SKIP_LUMA_SKIP_CHROMA,
+} UENUM1BYTE(INTERP_EVAL_PLANE);
+
+enum {
+ INTERP_HORZ_NEQ_VERT_NEQ = 0,
+ INTERP_HORZ_EQ_VERT_NEQ,
+ INTERP_HORZ_NEQ_VERT_EQ,
+ INTERP_HORZ_EQ_VERT_EQ,
+ INTERP_PRED_TYPE_ALL,
+} UENUM1BYTE(INTERP_PRED_TYPE);
+// Pack two InterpFilter's into a uint32_t: since there are at most 10 filters,
+// we can use 16 bits for each and have more than enough space. This reduces
+// argument passing and unifies the operation of setting a (pair of) filters.
+typedef struct InterpFilters {
+ uint16_t y_filter;
+ uint16_t x_filter;
+} InterpFilters;
-static INLINE InterpFilters av1_make_interp_filters(InterpFilter y_filter,
- InterpFilter x_filter) {
- uint16_t y16 = y_filter & 0xf;
- uint16_t x16 = x_filter & 0xf;
- return y16 | ((uint32_t)x16 << 16);
+typedef union int_interpfilters {
+ uint32_t as_int;
+ InterpFilters as_filters;
+} int_interpfilters;
+
+static INLINE InterpFilter av1_extract_interp_filter(int_interpfilters filters,
+ int dir) {
+ return (InterpFilter)((dir) ? filters.as_filters.x_filter
+ : filters.as_filters.y_filter);
}
-static INLINE InterpFilters av1_broadcast_interp_filter(InterpFilter filter) {
- return av1_make_interp_filters(filter, filter);
+static INLINE int_interpfilters
+av1_broadcast_interp_filter(InterpFilter filter) {
+ int_interpfilters filters;
+ filters.as_filters.x_filter = filter;
+ filters.as_filters.y_filter = filter;
+ return filters;
}
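As a minimal usage sketch of the union above (illustrative only; all identifiers are taken from this header), a filter pair travels as a single 32-bit value and is unpacked per direction:

  int_interpfilters pair = av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
  uint32_t packed = pair.as_int;   // pass or store as one 32-bit value
  int_interpfilters rt;
  rt.as_int = packed;              // reinterpret on the receiving side
  InterpFilter y_f = av1_extract_interp_filter(rt, /*dir=*/0);  // y_filter
  InterpFilter x_f = av1_extract_interp_filter(rt, /*dir=*/1);  // x_filter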
static INLINE InterpFilter av1_unswitchable_filter(InterpFilter filter) {
@@ -67,10 +94,10 @@ static INLINE InterpFilter av1_unswitchable_filter(InterpFilter filter) {
/* (1 << LOG_SWITCHABLE_FILTERS) > SWITCHABLE_FILTERS */
#define LOG_SWITCHABLE_FILTERS 2
-#define MAX_SUBPEL_TAPS 12
#define SWITCHABLE_FILTER_CONTEXTS ((SWITCHABLE_FILTERS + 1) * 4)
#define INTER_FILTER_COMP_OFFSET (SWITCHABLE_FILTERS + 1)
#define INTER_FILTER_DIR_OFFSET ((SWITCHABLE_FILTERS + 1) * 2)
+#define ALLOW_ALL_INTERP_FILT_MASK (0x01ff)
typedef struct InterpFilterParams {
const int16_t *filter_ptr;
@@ -141,9 +168,10 @@ static const InterpFilterParams
// A special 2-tap bilinear filter for IntraBC chroma. IntraBC uses full pixel
// MV for luma. If sub-sampling exists, chroma may possibly use half-pel MV.
-DECLARE_ALIGNED(256, static const int16_t, av1_intrabc_bilinear_filter[2]) = {
- 64,
- 64,
+DECLARE_ALIGNED(256, static const int16_t,
+ av1_intrabc_bilinear_filter[2 * SUBPEL_SHIFTS]) = {
+ 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 64, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
};
static const InterpFilterParams av1_intrabc_filter_params = {
@@ -173,6 +201,16 @@ DECLARE_ALIGNED(256, static const InterpKernel,
{ 0, 0, 4, 36, 62, 26, 0, 0 }, { 0, 0, 2, 34, 62, 30, 0, 0 }
};
+static const uint16_t
+ av1_interp_dual_filt_mask[INTERP_PRED_TYPE_ALL - 2][SWITCHABLE_FILTERS] = {
+ { (1 << REG_REG) | (1 << SMOOTH_REG) | (1 << SHARP_REG),
+ (1 << REG_SMOOTH) | (1 << SMOOTH_SMOOTH) | (1 << SHARP_SMOOTH),
+ (1 << REG_SHARP) | (1 << SMOOTH_SHARP) | (1 << SHARP_SHARP) },
+ { (1 << REG_REG) | (1 << REG_SMOOTH) | (1 << REG_SHARP),
+ (1 << SMOOTH_REG) | (1 << SMOOTH_SMOOTH) | (1 << SMOOTH_SHARP),
+ (1 << SHARP_REG) | (1 << SHARP_SMOOTH) | (1 << SHARP_SHARP) }
+ };
+
// For w<=4, MULTITAP_SHARP is the same as EIGHTTAP_REGULAR
static const InterpFilterParams av1_interp_4tap[SWITCHABLE_FILTERS + 1] = {
{ (const int16_t *)av1_sub_pel_filters_4, SUBPEL_TAPS, SUBPEL_SHIFTS,
@@ -192,14 +230,14 @@ av1_get_interp_filter_params_with_block_size(const InterpFilter interp_filter,
return &av1_interp_filter_params_list[interp_filter];
}
-static INLINE const InterpFilterParams *av1_get_4tap_interp_filter_params(
- const InterpFilter interp_filter) {
- return &av1_interp_4tap[interp_filter];
-}
-
static INLINE const int16_t *av1_get_interp_filter_kernel(
- const InterpFilter interp_filter) {
- return av1_interp_filter_params_list[interp_filter].filter_ptr;
+ const InterpFilter interp_filter, int subpel_search) {
+ assert(subpel_search >= USE_2_TAPS);
+ return (subpel_search == USE_2_TAPS)
+ ? av1_interp_4tap[BILINEAR].filter_ptr
+ : ((subpel_search == USE_4_TAPS)
+ ? av1_interp_4tap[interp_filter].filter_ptr
+ : av1_interp_filter_params_list[interp_filter].filter_ptr);
}
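For illustration (a sketch using EIGHTTAP_REGULAR as the example filter), the subpel_search argument selects progressively shorter kernels:

  const int16_t *k8 = av1_get_interp_filter_kernel(EIGHTTAP_REGULAR, USE_8_TAPS);  // full-length table
  const int16_t *k4 = av1_get_interp_filter_kernel(EIGHTTAP_REGULAR, USE_4_TAPS);  // 4-tap table for the same filter
  const int16_t *k2 = av1_get_interp_filter_kernel(EIGHTTAP_REGULAR, USE_2_TAPS);  // bilinear kernel, regardless of interp_filter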
static INLINE const int16_t *av1_get_interp_filter_subpel_kernel(
@@ -207,6 +245,33 @@ static INLINE const int16_t *av1_get_interp_filter_subpel_kernel(
return filter_params->filter_ptr + filter_params->taps * subpel;
}
+static INLINE const InterpFilterParams *av1_get_filter(int subpel_search) {
+ assert(subpel_search >= USE_2_TAPS);
+
+ switch (subpel_search) {
+ case USE_2_TAPS: return &av1_interp_4tap[BILINEAR];
+ case USE_4_TAPS: return &av1_interp_4tap[EIGHTTAP_REGULAR];
+ case USE_8_TAPS: return &av1_interp_filter_params_list[EIGHTTAP_REGULAR];
+ default: assert(0); return NULL;
+ }
+}
+
+static INLINE void reset_interp_filter_allowed_mask(
+ uint16_t *allow_interp_mask, DUAL_FILTER_TYPE filt_type) {
+ uint16_t tmp = (~(1 << filt_type)) & 0xffff;
+ *allow_interp_mask &= (tmp & ALLOW_ALL_INTERP_FILT_MASK);
+}
+
+static INLINE void set_interp_filter_allowed_mask(uint16_t *allow_interp_mask,
+ DUAL_FILTER_TYPE filt_type) {
+ *allow_interp_mask |= (1 << filt_type);
+}
+
+static INLINE uint8_t get_interp_filter_allowed_mask(
+ uint16_t allow_interp_mask, DUAL_FILTER_TYPE filt_type) {
+ return (allow_interp_mask >> filt_type) & 1;
+}
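A short sketch of how the three mask helpers above compose over the 9-bit dual-filter mask (illustrative only; assert() assumed available as elsewhere in this header):

  uint16_t allowed = ALLOW_ALL_INTERP_FILT_MASK;             // all 9 REG/SMOOTH/SHARP pairs
  reset_interp_filter_allowed_mask(&allowed, SHARP_SHARP);   // prune one combination
  assert(get_interp_filter_allowed_mask(allowed, SHARP_SHARP) == 0);
  set_interp_filter_allowed_mask(&allowed, SHARP_SHARP);     // re-enable it
  assert(get_interp_filter_allowed_mask(allowed, SHARP_SHARP) == 1);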
+
#ifdef __cplusplus
} // extern "C"
#endif
diff --git a/media/libaom/src/av1/common/frame_buffers.c b/media/libaom/src/av1/common/frame_buffers.c
index fd6c4bc79..f10ccd594 100644
--- a/media/libaom/src/av1/common/frame_buffers.c
+++ b/media/libaom/src/av1/common/frame_buffers.c
@@ -22,7 +22,11 @@ int av1_alloc_internal_frame_buffers(InternalFrameBufferList *list) {
AOM_MAXIMUM_REF_BUFFERS + AOM_MAXIMUM_WORK_BUFFERS;
list->int_fb = (InternalFrameBuffer *)aom_calloc(
list->num_internal_frame_buffers, sizeof(*list->int_fb));
- return (list->int_fb == NULL);
+ if (list->int_fb == NULL) {
+ list->num_internal_frame_buffers = 0;
+ return 1;
+ }
+ return 0;
}
void av1_free_internal_frame_buffers(InternalFrameBufferList *list) {
@@ -36,6 +40,7 @@ void av1_free_internal_frame_buffers(InternalFrameBufferList *list) {
}
aom_free(list->int_fb);
list->int_fb = NULL;
+ list->num_internal_frame_buffers = 0;
}
void av1_zero_unused_internal_frame_buffers(InternalFrameBufferList *list) {
@@ -69,7 +74,10 @@ int av1_get_frame_buffer(void *cb_priv, size_t min_size,
// due to access uninitialized memory in frame border. It could be
// skipped if border were totally removed.
int_fb_list->int_fb[i].data = (uint8_t *)aom_calloc(1, min_size);
- if (!int_fb_list->int_fb[i].data) return -1;
+ if (!int_fb_list->int_fb[i].data) {
+ int_fb_list->int_fb[i].size = 0;
+ return -1;
+ }
int_fb_list->int_fb[i].size = min_size;
}
@@ -86,6 +94,5 @@ int av1_release_frame_buffer(void *cb_priv, aom_codec_frame_buffer_t *fb) {
InternalFrameBuffer *const int_fb = (InternalFrameBuffer *)fb->priv;
(void)cb_priv;
if (int_fb) int_fb->in_use = 0;
- fb->priv = NULL;
return 0;
}
diff --git a/media/libaom/src/av1/common/idct.c b/media/libaom/src/av1/common/idct.c
index 2c1cb9827..bff438f3c 100644
--- a/media/libaom/src/av1/common/idct.c
+++ b/media/libaom/src/av1/common/idct.c
@@ -56,87 +56,87 @@ void av1_highbd_inv_txfm_add_4x4_c(const tran_low_t *input, uint8_t *dest,
av1_inv_txfm2d_add_4x4_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, bd);
}
-void av1_highbd_inv_txfm_add_4x8(const tran_low_t *input, uint8_t *dest,
- int stride, const TxfmParam *txfm_param) {
+void av1_highbd_inv_txfm_add_4x8_c(const tran_low_t *input, uint8_t *dest,
+ int stride, const TxfmParam *txfm_param) {
assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
const int32_t *src = cast_to_int32(input);
av1_inv_txfm2d_add_4x8_c(src, CONVERT_TO_SHORTPTR(dest), stride,
txfm_param->tx_type, txfm_param->bd);
}
-void av1_highbd_inv_txfm_add_8x4(const tran_low_t *input, uint8_t *dest,
- int stride, const TxfmParam *txfm_param) {
+void av1_highbd_inv_txfm_add_8x4_c(const tran_low_t *input, uint8_t *dest,
+ int stride, const TxfmParam *txfm_param) {
assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
const int32_t *src = cast_to_int32(input);
av1_inv_txfm2d_add_8x4_c(src, CONVERT_TO_SHORTPTR(dest), stride,
txfm_param->tx_type, txfm_param->bd);
}
-void av1_highbd_inv_txfm_add_16x32(const tran_low_t *input, uint8_t *dest,
- int stride, const TxfmParam *txfm_param) {
+void av1_highbd_inv_txfm_add_16x32_c(const tran_low_t *input, uint8_t *dest,
+ int stride, const TxfmParam *txfm_param) {
const int32_t *src = cast_to_int32(input);
av1_inv_txfm2d_add_16x32_c(src, CONVERT_TO_SHORTPTR(dest), stride,
txfm_param->tx_type, txfm_param->bd);
}
-void av1_highbd_inv_txfm_add_32x16(const tran_low_t *input, uint8_t *dest,
- int stride, const TxfmParam *txfm_param) {
+void av1_highbd_inv_txfm_add_32x16_c(const tran_low_t *input, uint8_t *dest,
+ int stride, const TxfmParam *txfm_param) {
const int32_t *src = cast_to_int32(input);
av1_inv_txfm2d_add_32x16_c(src, CONVERT_TO_SHORTPTR(dest), stride,
txfm_param->tx_type, txfm_param->bd);
}
-void av1_highbd_inv_txfm_add_16x4(const tran_low_t *input, uint8_t *dest,
- int stride, const TxfmParam *txfm_param) {
+void av1_highbd_inv_txfm_add_16x4_c(const tran_low_t *input, uint8_t *dest,
+ int stride, const TxfmParam *txfm_param) {
const int32_t *src = cast_to_int32(input);
av1_inv_txfm2d_add_16x4_c(src, CONVERT_TO_SHORTPTR(dest), stride,
txfm_param->tx_type, txfm_param->bd);
}
-void av1_highbd_inv_txfm_add_4x16(const tran_low_t *input, uint8_t *dest,
- int stride, const TxfmParam *txfm_param) {
+void av1_highbd_inv_txfm_add_4x16_c(const tran_low_t *input, uint8_t *dest,
+ int stride, const TxfmParam *txfm_param) {
const int32_t *src = cast_to_int32(input);
av1_inv_txfm2d_add_4x16_c(src, CONVERT_TO_SHORTPTR(dest), stride,
txfm_param->tx_type, txfm_param->bd);
}
-void av1_highbd_inv_txfm_add_32x8(const tran_low_t *input, uint8_t *dest,
- int stride, const TxfmParam *txfm_param) {
+void av1_highbd_inv_txfm_add_32x8_c(const tran_low_t *input, uint8_t *dest,
+ int stride, const TxfmParam *txfm_param) {
const int32_t *src = cast_to_int32(input);
av1_inv_txfm2d_add_32x8_c(src, CONVERT_TO_SHORTPTR(dest), stride,
txfm_param->tx_type, txfm_param->bd);
}
-void av1_highbd_inv_txfm_add_8x32(const tran_low_t *input, uint8_t *dest,
- int stride, const TxfmParam *txfm_param) {
+void av1_highbd_inv_txfm_add_8x32_c(const tran_low_t *input, uint8_t *dest,
+ int stride, const TxfmParam *txfm_param) {
const int32_t *src = cast_to_int32(input);
av1_inv_txfm2d_add_8x32_c(src, CONVERT_TO_SHORTPTR(dest), stride,
txfm_param->tx_type, txfm_param->bd);
}
-void av1_highbd_inv_txfm_add_32x64(const tran_low_t *input, uint8_t *dest,
- int stride, const TxfmParam *txfm_param) {
+void av1_highbd_inv_txfm_add_32x64_c(const tran_low_t *input, uint8_t *dest,
+ int stride, const TxfmParam *txfm_param) {
const int32_t *src = cast_to_int32(input);
av1_inv_txfm2d_add_32x64_c(src, CONVERT_TO_SHORTPTR(dest), stride,
txfm_param->tx_type, txfm_param->bd);
}
-void av1_highbd_inv_txfm_add_64x32(const tran_low_t *input, uint8_t *dest,
- int stride, const TxfmParam *txfm_param) {
+void av1_highbd_inv_txfm_add_64x32_c(const tran_low_t *input, uint8_t *dest,
+ int stride, const TxfmParam *txfm_param) {
const int32_t *src = cast_to_int32(input);
av1_inv_txfm2d_add_64x32_c(src, CONVERT_TO_SHORTPTR(dest), stride,
txfm_param->tx_type, txfm_param->bd);
}
-void av1_highbd_inv_txfm_add_16x64(const tran_low_t *input, uint8_t *dest,
- int stride, const TxfmParam *txfm_param) {
+void av1_highbd_inv_txfm_add_16x64_c(const tran_low_t *input, uint8_t *dest,
+ int stride, const TxfmParam *txfm_param) {
const int32_t *src = cast_to_int32(input);
av1_inv_txfm2d_add_16x64_c(src, CONVERT_TO_SHORTPTR(dest), stride,
txfm_param->tx_type, txfm_param->bd);
}
-void av1_highbd_inv_txfm_add_64x16(const tran_low_t *input, uint8_t *dest,
- int stride, const TxfmParam *txfm_param) {
+void av1_highbd_inv_txfm_add_64x16_c(const tran_low_t *input, uint8_t *dest,
+ int stride, const TxfmParam *txfm_param) {
const int32_t *src = cast_to_int32(input);
av1_inv_txfm2d_add_64x16_c(src, CONVERT_TO_SHORTPTR(dest), stride,
txfm_param->tx_type, txfm_param->bd);
@@ -204,7 +204,7 @@ static void init_txfm_param(const MACROBLOCKD *xd, int plane, TX_SIZE tx_size,
txfm_param->eob = eob;
txfm_param->lossless = xd->lossless[xd->mi[0]->segment_id];
txfm_param->bd = xd->bd;
- txfm_param->is_hbd = get_bitdepth_data_path_index(xd);
+ txfm_param->is_hbd = is_cur_buf_hbd(xd);
txfm_param->tx_set_type = av1_get_ext_tx_set_type(
txfm_param->tx_size, is_inter_block(xd->mi[0]), reduced_tx_set);
}
@@ -224,10 +224,10 @@ void av1_highbd_inv_txfm_add_c(const tran_low_t *input, uint8_t *dest,
av1_highbd_inv_txfm_add_8x8_c(input, dest, stride, txfm_param);
break;
case TX_4X8:
- av1_highbd_inv_txfm_add_4x8(input, dest, stride, txfm_param);
+ av1_highbd_inv_txfm_add_4x8_c(input, dest, stride, txfm_param);
break;
case TX_8X4:
- av1_highbd_inv_txfm_add_8x4(input, dest, stride, txfm_param);
+ av1_highbd_inv_txfm_add_8x4_c(input, dest, stride, txfm_param);
break;
case TX_8X16:
av1_highbd_inv_txfm_add_8x16_c(input, dest, stride, txfm_param);
@@ -236,25 +236,25 @@ void av1_highbd_inv_txfm_add_c(const tran_low_t *input, uint8_t *dest,
av1_highbd_inv_txfm_add_16x8_c(input, dest, stride, txfm_param);
break;
case TX_16X32:
- av1_highbd_inv_txfm_add_16x32(input, dest, stride, txfm_param);
+ av1_highbd_inv_txfm_add_16x32_c(input, dest, stride, txfm_param);
break;
case TX_32X16:
- av1_highbd_inv_txfm_add_32x16(input, dest, stride, txfm_param);
+ av1_highbd_inv_txfm_add_32x16_c(input, dest, stride, txfm_param);
break;
case TX_64X64:
av1_highbd_inv_txfm_add_64x64_c(input, dest, stride, txfm_param);
break;
case TX_32X64:
- av1_highbd_inv_txfm_add_32x64(input, dest, stride, txfm_param);
+ av1_highbd_inv_txfm_add_32x64_c(input, dest, stride, txfm_param);
break;
case TX_64X32:
- av1_highbd_inv_txfm_add_64x32(input, dest, stride, txfm_param);
+ av1_highbd_inv_txfm_add_64x32_c(input, dest, stride, txfm_param);
break;
case TX_16X64:
- av1_highbd_inv_txfm_add_16x64(input, dest, stride, txfm_param);
+ av1_highbd_inv_txfm_add_16x64_c(input, dest, stride, txfm_param);
break;
case TX_64X16:
- av1_highbd_inv_txfm_add_64x16(input, dest, stride, txfm_param);
+ av1_highbd_inv_txfm_add_64x16_c(input, dest, stride, txfm_param);
break;
case TX_4X4:
// this is like av1_short_idct4x4 but has a special case around eob<=1
@@ -263,16 +263,16 @@ void av1_highbd_inv_txfm_add_c(const tran_low_t *input, uint8_t *dest,
av1_highbd_inv_txfm_add_4x4_c(input, dest, stride, txfm_param);
break;
case TX_16X4:
- av1_highbd_inv_txfm_add_16x4(input, dest, stride, txfm_param);
+ av1_highbd_inv_txfm_add_16x4_c(input, dest, stride, txfm_param);
break;
case TX_4X16:
- av1_highbd_inv_txfm_add_4x16(input, dest, stride, txfm_param);
+ av1_highbd_inv_txfm_add_4x16_c(input, dest, stride, txfm_param);
break;
case TX_8X32:
- av1_highbd_inv_txfm_add_8x32(input, dest, stride, txfm_param);
+ av1_highbd_inv_txfm_add_8x32_c(input, dest, stride, txfm_param);
break;
case TX_32X8:
- av1_highbd_inv_txfm_add_32x8(input, dest, stride, txfm_param);
+ av1_highbd_inv_txfm_add_32x8_c(input, dest, stride, txfm_param);
break;
default: assert(0 && "Invalid transform size"); break;
}
diff --git a/media/libaom/src/av1/common/idct.h b/media/libaom/src/av1/common/idct.h
index d9454e73f..004d25d49 100644
--- a/media/libaom/src/av1/common/idct.h
+++ b/media/libaom/src/av1/common/idct.h
@@ -44,22 +44,6 @@ static INLINE const int32_t *cast_to_int32(const tran_low_t *input) {
return (const int32_t *)input;
}
-typedef void(highbd_inv_txfm_add)(const tran_low_t *input, uint8_t *dest,
- int stride, const TxfmParam *param);
-
-highbd_inv_txfm_add av1_highbd_inv_txfm_add_4x8;
-highbd_inv_txfm_add av1_highbd_inv_txfm_add_8x4;
-highbd_inv_txfm_add av1_highbd_inv_txfm_add_16x32;
-highbd_inv_txfm_add av1_highbd_inv_txfm_add_32x16;
-highbd_inv_txfm_add av1_highbd_inv_txfm_add_32x64;
-highbd_inv_txfm_add av1_highbd_inv_txfm_add_64x32;
-highbd_inv_txfm_add av1_highbd_inv_txfm_add_16x64;
-highbd_inv_txfm_add av1_highbd_inv_txfm_add_64x16;
-highbd_inv_txfm_add av1_highbd_inv_txfm_add_16x4;
-highbd_inv_txfm_add av1_highbd_inv_txfm_add_4x16;
-highbd_inv_txfm_add av1_highbd_inv_txfm_add_8x32;
-highbd_inv_txfm_add av1_highbd_inv_txfm_add_32x8;
-
#ifdef __cplusplus
} // extern "C"
#endif
diff --git a/media/libaom/src/av1/common/loopfiltermask.c b/media/libaom/src/av1/common/loopfiltermask.c
new file mode 100644
index 000000000..157310f2d
--- /dev/null
+++ b/media/libaom/src/av1/common/loopfiltermask.c
@@ -0,0 +1,1458 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+#include "av1/common/av1_common_int.h"
+#include "av1/common/av1_loopfilter.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/seg_common.h"
+
+// 256-bit masks (64x64 / 4x4) for the left transform size for the Y plane.
+// We use 4 uint64_t to represent the 256 bits.
+// Each 1 represents a position where we should apply a loop filter
+// across the left border of a 4x4 block boundary.
+//
+// In the case of TX_8X8, reading low-order byte first, we end up with
+// a mask that looks like this (-- and | are used for better view):
+//
+// 10101010|10101010
+// 10101010|10101010
+// 10101010|10101010
+// 10101010|10101010
+// 10101010|10101010
+// 10101010|10101010
+// 10101010|10101010
+// 10101010|10101010
+// -----------------
+// 10101010|10101010
+// 10101010|10101010
+// 10101010|10101010
+// 10101010|10101010
+// 10101010|10101010
+// 10101010|10101010
+// 10101010|10101010
+// 10101010|10101010
+//
+// A loopfilter should be applied to every other 4x4 horizontally.
+
+// 256-bit masks (64x64 / 4x4) for the above transform size for the Y plane.
+// We use 4 uint64_t to represent the 256 bits.
+// Each 1 represents a position where we should apply a loop filter
+// across the top border of a 4x4 block boundary.
+//
+// In the case of TX_8X8, reading low-order byte first, we end up with
+// a mask that looks like this:
+//
+// 11111111|11111111
+// 00000000|00000000
+// 11111111|11111111
+// 00000000|00000000
+// 11111111|11111111
+// 00000000|00000000
+// 11111111|11111111
+// 00000000|00000000
+// -----------------
+// 11111111|11111111
+// 00000000|00000000
+// 11111111|11111111
+// 00000000|00000000
+// 11111111|11111111
+// 00000000|00000000
+// 11111111|11111111
+// 00000000|00000000
+//
+// A loopfilter should be applied to every other 4x4 row (vertically).
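Tying these patterns to the tables below: each uint64_t covers four 16-column rows, so for TX_8X8 the left-mask rows of 1010...10 pack to 0x5555555555555555 (the 64X64, TX_8X8 entries of left_mask_univariant_reordered), while the above-mask alternation of all-ones and all-zeros rows packs to 0x0000ffff0000ffff (the corresponding entries of above_mask_univariant_reordered).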
+#if CONFIG_LPF_MASK
+static const int mask_id_table_tx_4x4[BLOCK_SIZES_ALL] = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, -1, -1, -1, 13, 14, 15, 16, 17, 18
+};
+
+static const int mask_id_table_tx_8x8[BLOCK_SIZES_ALL] = {
+ -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, 10, 11, 12, 13
+};
+
+static const int mask_id_table_tx_16x16[BLOCK_SIZES_ALL] = {
+ -1, -1, -1, -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, -1, -1, -1, -1, -1, -1, -1, 7, 8
+};
+
+static const int mask_id_table_tx_32x32[BLOCK_SIZES_ALL] = {
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 1,
+ 2, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1
+};
+static const int mask_id_table_vert_border[BLOCK_SIZES_ALL] = {
+ 0, 47, 49, 19, 51, 53, 33, 55, 57, 42, 59,
+ 60, 46, -1, -1, -1, 61, 62, 63, 64, 65, 66
+};
+
+static const FilterMask left_mask_univariant_reordered[67] = {
+ // TX_4X4
+ { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 4X4, TX_4X4
+ { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 4X8, TX_4X4
+ { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 8X4, TX_4X4
+ { { 0x0000000000030003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 8X8, TX_4X4
+ { { 0x0003000300030003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 8X16, TX_4X4
+ { { 0x00000000000f000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X8, TX_4X4
+ { { 0x000f000f000f000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X16, TX_4X4
+ { { 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X32, TX_4X4
+ { { 0x00ff00ff00ff00ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X16, TX_4X4
+ { { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X32, TX_4X4
+ { { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL,
+ 0x00ff00ff00ff00ffULL } }, // block size 32X64, TX_4X4
+ { { 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 64X32, TX_4X4
+ { { 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL,
+ 0xffffffffffffffffULL } }, // block size 64X64, TX_4X4
+ { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 4X16, TX_4X4
+ { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X4, TX_4X4
+ { { 0x0003000300030003ULL, 0x0003000300030003ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 8X32, TX_4X4
+ { { 0x0000000000ff00ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X8, TX_4X4
+ { { 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x000f000f000f000fULL,
+ 0x000f000f000f000fULL } }, // block size 16X64, TX_4X4
+ { { 0xffffffffffffffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 64X16, TX_4X4
+ // TX_8X8
+ { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 8X8, TX_8X8
+ { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 8X16, TX_8X8
+ { { 0x0000000000050005ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X8, TX_8X8
+ { { 0x0005000500050005ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X16, TX_8X8
+ { { 0x0005000500050005ULL, 0x0005000500050005ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X32, TX_8X8
+ { { 0x0055005500550055ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X16, TX_8X8
+ { { 0x0055005500550055ULL, 0x0055005500550055ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X32, TX_8X8
+ { { 0x0055005500550055ULL, 0x0055005500550055ULL, 0x0055005500550055ULL,
+ 0x0055005500550055ULL } }, // block size 32X64, TX_8X8
+ { { 0x5555555555555555ULL, 0x5555555555555555ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 64X32, TX_8X8
+ { { 0x5555555555555555ULL, 0x5555555555555555ULL, 0x5555555555555555ULL,
+ 0x5555555555555555ULL } }, // block size 64X64, TX_8X8
+ { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 8X32, TX_8X8
+ { { 0x0000000000550055ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X8, TX_8X8
+ { { 0x0005000500050005ULL, 0x0005000500050005ULL, 0x0005000500050005ULL,
+ 0x0005000500050005ULL } }, // block size 16X64, TX_8X8
+ { { 0x5555555555555555ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 64X16, TX_8X8
+ // TX_16X16
+ { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X16, TX_16X16
+ { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X32, TX_16X16
+ { { 0x0011001100110011ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X16, TX_16X16
+ { { 0x0011001100110011ULL, 0x0011001100110011ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X32, TX_16X16
+ { { 0x0011001100110011ULL, 0x0011001100110011ULL, 0x0011001100110011ULL,
+ 0x0011001100110011ULL } }, // block size 32X64, TX_16X16
+ { { 0x1111111111111111ULL, 0x1111111111111111ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 64X32, TX_16X16
+ { { 0x1111111111111111ULL, 0x1111111111111111ULL, 0x1111111111111111ULL,
+ 0x1111111111111111ULL } }, // block size 64X64, TX_16X16
+ { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL,
+ 0x0001000100010001ULL } }, // block size 16X64, TX_16X16
+ { { 0x1111111111111111ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 64X16, TX_16X16
+ // TX_32X32
+ { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X32, TX_32X32
+ { { 0x0101010101010101ULL, 0x0101010101010101ULL, 0x0101010101010101ULL,
+ 0x0101010101010101ULL } }, // block size 32X64, TX_32X32
+ { { 0x0101010101010101ULL, 0x0101010101010101ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 64X32, TX_32X32
+ { { 0x0101010101010101ULL, 0x0101010101010101ULL, 0x0101010101010101ULL,
+ 0x0101010101010101ULL } }, // block size 64X64, TX_32X32
+ // TX_64X64
+ { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL,
+ 0x0001000100010001ULL } }, // block size 64X64, TX_64X64
+ // 2:1, 1:2 transform sizes.
+ { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 4X8, TX_4X8
+ { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 4X16, TX_4X8
+ { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 8X4, TX_8X4
+ { { 0x0000000000000005ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X4, TX_8X4
+ { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 8X16, TX_8X16
+ { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 8X32, TX_8X16
+ { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X8, TX_16X8
+ { { 0x0000000000110011ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X8, TX_16X8
+ { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X32, TX_16X32
+ { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL,
+ 0x0001000100010001ULL } }, // block size 16X64, TX_16X32
+ { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X16, TX_32X16
+ { { 0x0101010101010101ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 64X16, TX_32X16
+ { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL,
+ 0x0001000100010001ULL } }, // block size 32X64, TX_32X64
+ { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 64X32, TX_64X32
+ // 4:1, 1:4 transform sizes.
+ { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 4X16, TX_4X16
+ { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X4, TX_16X4
+ { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 8X32, TX_8X32
+ { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X8, TX_32X8
+ { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL,
+ 0x0001000100010001ULL } }, // block size 16X64, TX_16X64
+ { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 64X16, TX_64X16
+};
+
+static const FilterMask above_mask_univariant_reordered[67] = {
+ // TX_4X4
+ { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 4X4, TX_4X4
+ { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 4X8, TX_4X4
+ { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 8X4, TX_4X4
+ { { 0x0000000000030003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 8X8, TX_4X4
+ { { 0x0003000300030003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 8X16, TX_4X4
+ { { 0x00000000000f000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X8, TX_4X4
+ { { 0x000f000f000f000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X16, TX_4X4
+ { { 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X32, TX_4X4
+ { { 0x00ff00ff00ff00ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X16, TX_4X4
+ { { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X32, TX_4X4
+ { { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL,
+ 0x00ff00ff00ff00ffULL } }, // block size 32X64, TX_4X4
+ { { 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 64X32, TX_4X4
+ { { 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL,
+ 0xffffffffffffffffULL } }, // block size 64X64, TX_4X4
+ { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 4X16, TX_4X4
+ { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X4, TX_4X4
+ { { 0x0003000300030003ULL, 0x0003000300030003ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 8X32, TX_4X4
+ { { 0x0000000000ff00ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X8, TX_4X4
+ { { 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x000f000f000f000fULL,
+ 0x000f000f000f000fULL } }, // block size 16X64, TX_4X4
+ { { 0xffffffffffffffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 64X16, TX_4X4
+ // TX_8X8
+ { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 8X8, TX_8X8
+ { { 0x0000000300000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 8X16, TX_8X8
+ { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X8, TX_8X8
+ { { 0x0000000f0000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X16, TX_8X8
+ { { 0x0000000f0000000fULL, 0x0000000f0000000fULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X32, TX_8X8
+ { { 0x000000ff000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X16, TX_8X8
+ { { 0x000000ff000000ffULL, 0x000000ff000000ffULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X32, TX_8X8
+ { { 0x000000ff000000ffULL, 0x000000ff000000ffULL, 0x000000ff000000ffULL,
+ 0x000000ff000000ffULL } }, // block size 32X64, TX_8X8
+ { { 0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 64X32, TX_8X8
+ { { 0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL,
+ 0x0000ffff0000ffffULL } }, // block size 64X64, TX_8X8
+ { { 0x0000000300000003ULL, 0x0000000300000003ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 8X32, TX_8X8
+ { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X8, TX_8X8
+ { { 0x0000000f0000000fULL, 0x0000000f0000000fULL, 0x0000000f0000000fULL,
+ 0x0000000f0000000fULL } }, // block size 16X64, TX_8X8
+ { { 0x0000ffff0000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 64X16, TX_8X8
+ // TX_16X16
+ { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X16, TX_16X16
+ { { 0x000000000000000fULL, 0x000000000000000fULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X32, TX_16X16
+ { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X16, TX_16X16
+ { { 0x00000000000000ffULL, 0x00000000000000ffULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X32, TX_16X16
+ { { 0x00000000000000ffULL, 0x00000000000000ffULL, 0x00000000000000ffULL,
+ 0x00000000000000ffULL } }, // block size 32X64, TX_16X16
+ { { 0x000000000000ffffULL, 0x000000000000ffffULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 64X32, TX_16X16
+ { { 0x000000000000ffffULL, 0x000000000000ffffULL, 0x000000000000ffffULL,
+ 0x000000000000ffffULL } }, // block size 64X64, TX_16X16
+ { { 0x000000000000000fULL, 0x000000000000000fULL, 0x000000000000000fULL,
+ 0x000000000000000fULL } }, // block size 16X64, TX_16X16
+ { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 64X16, TX_16X16
+ // TX_32X32
+ { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X32, TX_32X32
+ { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x00000000000000ffULL,
+ 0x0000000000000000ULL } }, // block size 32X64, TX_32X32
+ { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 64X32, TX_32X32
+ { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x000000000000ffffULL,
+ 0x0000000000000000ULL } }, // block size 64X64, TX_32X32
+ // TX_64X64
+ { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 64X64, TX_64X64
+ // 2:1, 1:2 transform sizes.
+ { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 4X8, TX_4X8
+ { { 0x0000000100000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 4X16, TX_4X8
+ { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 8X4, TX_8X4
+ { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X4, TX_8X4
+ { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 8X16, TX_8X16
+ { { 0x0000000000000003ULL, 0x0000000000000003ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 8X32, TX_8X16
+ { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X8, TX_16X8
+ { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X8, TX_16X8
+ { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X32, TX_16X32
+ { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x000000000000000fULL,
+ 0x0000000000000000ULL } }, // block size 16X64, TX_16X32
+ { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X16, TX_32X16
+ { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 64X16, TX_32X16
+ { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X64, TX_32X64
+ { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 64X32, TX_64X32
+ // 4:1, 1:4 transform sizes.
+ { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 4X16, TX_4X16
+ { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X4, TX_16X4
+ { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 8X32, TX_8X32
+ { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X8, TX_32X8
+ { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X64, TX_16X64
+ { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 64X16, TX_64X16
+};
+
+static LoopFilterMask *get_loop_filter_mask(const AV1_COMMON *const cm,
+ int mi_row, int mi_col) {
+ assert(cm->lf.lfm != NULL);
+ const int row = mi_row >> MIN_MIB_SIZE_LOG2; // 64x64
+ const int col = mi_col >> MIN_MIB_SIZE_LOG2;
+ return &cm->lf.lfm[row * cm->lf.lfm_stride + col];
+}
+
+typedef void (*LpfFunc)(uint8_t *s, int p, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh);
+
+typedef void (*LpfDualFunc)(uint8_t *s, int p, const uint8_t *blimit0,
+ const uint8_t *limit0, const uint8_t *thresh0,
+ const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1);
+
+typedef void (*HbdLpfFunc)(uint16_t *s, int p, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh, int bd);
+
+typedef void (*HbdLpfDualFunc)(uint16_t *s, int p, const uint8_t *blimit0,
+ const uint8_t *limit0, const uint8_t *thresh0,
+ const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd);
+// A 64x64 area requires 256 bits, one for each of its 4x4 tx blocks.
+// Each group of 4 rows is represented by one uint64_t mask; hence,
+// four uint64_t (bitmask[4]) represent the whole 64x64 block.
+//
+// Given a location (mi_col, mi_row), this function returns the index
+// 0, 1, 2 or 3 selecting which bitmask[] to use, and the shift value.
+//
+// For example, mi_row is the row offset in mi (4-pixel) units:
+// (mi_row / 4) selects which uint64_t, and (mi_row % 4) is the row
+// offset within it. Each row holds 16 = (1 << stride_log2) 4x4 units,
+// so shift = (row << stride_log2) + mi_col.
+int get_index_shift(int mi_col, int mi_row, int *index) {
+ // *index = mi_row >> 2;
+ // rows = mi_row % 4;
+ // stride_log2 = 4;
+ // shift = (rows << stride_log2) + mi_col;
+ *index = mi_row >> 2;
+ return ((mi_row & 3) << 4) | mi_col;
+}
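A minimal sketch of how the returned (index, shift) pair is consumed, assuming FilterMask is the 4 x uint64_t container described above; the field name `bits` is an assumption for illustration (see av1_loopfilter.h for the actual definition):

  // Hypothetical helper: test whether the 4x4 unit at (mi_row, mi_col) within
  // a 64x64 block is set in a 256-bit mask.
  static int is_4x4_bit_set(const FilterMask *mask, int mi_row, int mi_col) {
    int index;
    const int shift = get_index_shift(mi_col, mi_row, &index);
    return (int)((mask->bits[index] >> shift) & 1);
  }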
+
+static void filter_selectively_vert_row2(
+ int subsampling_factor, uint8_t *s, int pitch, int plane,
+ uint64_t mask_16x16_0, uint64_t mask_8x8_0, uint64_t mask_4x4_0,
+ uint64_t mask_16x16_1, uint64_t mask_8x8_1, uint64_t mask_4x4_1,
+ const loop_filter_info_n *lfi_n, uint8_t *lfl, uint8_t *lfl2) {
+ uint64_t mask;
+ const int step = 1 << subsampling_factor;
+
+ for (mask = mask_16x16_0 | mask_8x8_0 | mask_4x4_0 | mask_16x16_1 |
+ mask_8x8_1 | mask_4x4_1;
+ mask; mask >>= step) {
+ const loop_filter_thresh *lfi0 = lfi_n->lfthr + *lfl;
+ const loop_filter_thresh *lfi1 = lfi_n->lfthr + *lfl2;
+
+ if (mask & 1) {
+ if ((mask_16x16_0 | mask_16x16_1) & 1) {
+ // The chroma plane filters fewer pixels; this was introduced in the
+ // deblock_13tap experiment.
+ LpfFunc lpf_vertical = plane ? aom_lpf_vertical_6 : aom_lpf_vertical_14;
+
+ if ((mask_16x16_0 & mask_16x16_1) & 1) {
+ if (plane) {
+ aom_lpf_vertical_6_dual(s, pitch, lfi0->mblim, lfi0->lim,
+ lfi0->hev_thr, lfi1->mblim, lfi1->lim,
+ lfi1->hev_thr);
+ } else {
+ aom_lpf_vertical_14_dual(s, pitch, lfi0->mblim, lfi0->lim,
+ lfi0->hev_thr, lfi1->mblim, lfi1->lim,
+ lfi1->hev_thr);
+ }
+ } else if (mask_16x16_0 & 1) {
+ lpf_vertical(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr);
+ } else {
+ lpf_vertical(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim,
+ lfi1->hev_thr);
+ }
+ }
+
+ if ((mask_8x8_0 | mask_8x8_1) & 1) {
+ // The chroma plane filters fewer pixels; this was introduced in the
+ // deblock_13tap experiment.
+ LpfFunc lpf_vertical = plane ? aom_lpf_vertical_6 : aom_lpf_vertical_8;
+
+ if ((mask_8x8_0 & mask_8x8_1) & 1) {
+ if (plane) {
+ aom_lpf_vertical_6_dual(s, pitch, lfi0->mblim, lfi0->lim,
+ lfi0->hev_thr, lfi1->mblim, lfi1->lim,
+ lfi1->hev_thr);
+ } else {
+ aom_lpf_vertical_8_dual(s, pitch, lfi0->mblim, lfi0->lim,
+ lfi0->hev_thr, lfi1->mblim, lfi1->lim,
+ lfi1->hev_thr);
+ }
+ } else if (mask_8x8_0 & 1) {
+ lpf_vertical(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr);
+ } else {
+ lpf_vertical(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim,
+ lfi1->hev_thr);
+ }
+ }
+
+ if ((mask_4x4_0 | mask_4x4_1) & 1) {
+ if ((mask_4x4_0 & mask_4x4_1) & 1) {
+ aom_lpf_vertical_4_dual(s, pitch, lfi0->mblim, lfi0->lim,
+ lfi0->hev_thr, lfi1->mblim, lfi1->lim,
+ lfi1->hev_thr);
+ } else if (mask_4x4_0 & 1) {
+ aom_lpf_vertical_4(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr);
+ } else {
+ aom_lpf_vertical_4(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim,
+ lfi1->hev_thr);
+ }
+ }
+ }
+
+ s += 4;
+ lfl += step;
+ lfl2 += step;
+ mask_16x16_0 >>= step;
+ mask_8x8_0 >>= step;
+ mask_4x4_0 >>= step;
+ mask_16x16_1 >>= step;
+ mask_8x8_1 >>= step;
+ mask_4x4_1 >>= step;
+ }
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+static void highbd_filter_selectively_vert_row2(
+ int subsampling_factor, uint16_t *s, int pitch, int plane,
+ uint64_t mask_16x16_0, uint64_t mask_8x8_0, uint64_t mask_4x4_0,
+ uint64_t mask_16x16_1, uint64_t mask_8x8_1, uint64_t mask_4x4_1,
+ const loop_filter_info_n *lfi_n, uint8_t *lfl, uint8_t *lfl2, int bd) {
+ uint64_t mask;
+ const int step = 1 << subsampling_factor;
+
+ for (mask = mask_16x16_0 | mask_8x8_0 | mask_4x4_0 | mask_16x16_1 |
+ mask_8x8_1 | mask_4x4_1;
+ mask; mask >>= step) {
+ const loop_filter_thresh *lfi0 = lfi_n->lfthr + *lfl;
+ const loop_filter_thresh *lfi1 = lfi_n->lfthr + *lfl2;
+
+ if (mask & 1) {
+ if ((mask_16x16_0 | mask_16x16_1) & 1) {
+ // The chroma plane filters fewer pixels; this was introduced in the
+ // deblock_13tap experiment.
+ HbdLpfFunc highbd_lpf_vertical =
+ plane ? aom_highbd_lpf_vertical_6 : aom_highbd_lpf_vertical_14;
+
+ if ((mask_16x16_0 & mask_16x16_1) & 1) {
+ if (plane) {
+ aom_highbd_lpf_vertical_6_dual(s, pitch, lfi0->mblim, lfi0->lim,
+ lfi0->hev_thr, lfi1->mblim,
+ lfi1->lim, lfi1->hev_thr, bd);
+ } else {
+ aom_highbd_lpf_vertical_14_dual(s, pitch, lfi0->mblim, lfi0->lim,
+ lfi0->hev_thr, lfi1->mblim,
+ lfi1->lim, lfi1->hev_thr, bd);
+ }
+ } else if (mask_16x16_0 & 1) {
+ highbd_lpf_vertical(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr,
+ bd);
+ } else {
+ highbd_lpf_vertical(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim,
+ lfi1->hev_thr, bd);
+ }
+ }
+
+ if ((mask_8x8_0 | mask_8x8_1) & 1) {
+ HbdLpfFunc highbd_lpf_vertical =
+ plane ? aom_highbd_lpf_vertical_6 : aom_highbd_lpf_vertical_8;
+
+ if ((mask_8x8_0 & mask_8x8_1) & 1) {
+ if (plane) {
+ aom_highbd_lpf_vertical_6_dual(s, pitch, lfi0->mblim, lfi0->lim,
+ lfi0->hev_thr, lfi1->mblim,
+ lfi1->lim, lfi1->hev_thr, bd);
+ } else {
+ aom_highbd_lpf_vertical_8_dual(s, pitch, lfi0->mblim, lfi0->lim,
+ lfi0->hev_thr, lfi1->mblim,
+ lfi1->lim, lfi1->hev_thr, bd);
+ }
+ } else if (mask_8x8_0 & 1) {
+ highbd_lpf_vertical(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr,
+ bd);
+ } else {
+ highbd_lpf_vertical(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim,
+ lfi1->hev_thr, bd);
+ }
+ }
+
+ if ((mask_4x4_0 | mask_4x4_1) & 1) {
+ if ((mask_4x4_0 & mask_4x4_1) & 1) {
+ aom_highbd_lpf_vertical_4_dual(s, pitch, lfi0->mblim, lfi0->lim,
+ lfi0->hev_thr, lfi1->mblim, lfi1->lim,
+ lfi1->hev_thr, bd);
+ } else if (mask_4x4_0 & 1) {
+ aom_highbd_lpf_vertical_4(s, pitch, lfi0->mblim, lfi0->lim,
+ lfi0->hev_thr, bd);
+ } else {
+ aom_highbd_lpf_vertical_4(s + 4 * pitch, pitch, lfi1->mblim,
+ lfi1->lim, lfi1->hev_thr, bd);
+ }
+ }
+ }
+
+ s += 4;
+ lfl += step;
+ lfl2 += step;
+ mask_16x16_0 >>= step;
+ mask_8x8_0 >>= step;
+ mask_4x4_0 >>= step;
+ mask_16x16_1 >>= step;
+ mask_8x8_1 >>= step;
+ mask_4x4_1 >>= step;
+ }
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+static void filter_selectively_horiz(uint8_t *s, int pitch, int plane,
+ int subsampling, uint64_t mask_16x16,
+ uint64_t mask_8x8, uint64_t mask_4x4,
+ const loop_filter_info_n *lfi_n,
+ const uint8_t *lfl) {
+ uint64_t mask;
+ int count;
+ const int step = 1 << subsampling;
+ const unsigned int two_block_mask = subsampling ? 5 : 3;
+ int offset = 0;
+
+ for (mask = mask_16x16 | mask_8x8 | mask_4x4; mask; mask >>= step * count) {
+ const loop_filter_thresh *lfi = lfi_n->lfthr + *lfl;
+ // Next block's thresholds, when it is within the current 64x64 block.
+ // If it is out of bounds, its mask is zero, and lfin points to the current
+ // edge's filter parameters instead of the next edge's.
+ int next_edge = step;
+ if (offset + next_edge >= MI_SIZE_64X64) next_edge = 0;
+ const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + next_edge);
+
+ count = 1;
+ if (mask & 1) {
+ if (mask_16x16 & 1) {
+ // The chroma plane filters fewer pixels; this was introduced in the
+ // deblock_13tap experiment.
+ LpfFunc lpf_horizontal =
+ plane ? aom_lpf_horizontal_6 : aom_lpf_horizontal_14;
+
+ if ((mask_16x16 & two_block_mask) == two_block_mask) {
+ if (plane) {
+ aom_lpf_horizontal_6_dual(s, pitch, lfi->mblim, lfi->lim,
+ lfi->hev_thr, lfin->mblim, lfin->lim,
+ lfin->hev_thr);
+ } else {
+ aom_lpf_horizontal_14_dual(s, pitch, lfi->mblim, lfi->lim,
+ lfi->hev_thr, lfin->mblim, lfin->lim,
+ lfin->hev_thr);
+ }
+ count = 2;
+ } else {
+ lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);
+ }
+ } else if (mask_8x8 & 1) {
+        // The chroma plane filters fewer pixels; this was introduced in the
+        // deblock_13tap experiment.
+ LpfFunc lpf_horizontal =
+ plane ? aom_lpf_horizontal_6 : aom_lpf_horizontal_8;
+
+ if ((mask_8x8 & two_block_mask) == two_block_mask) {
+ if (plane) {
+ aom_lpf_horizontal_6_dual(s, pitch, lfi->mblim, lfi->lim,
+ lfi->hev_thr, lfin->mblim, lfin->lim,
+ lfin->hev_thr);
+ } else {
+ aom_lpf_horizontal_8_dual(s, pitch, lfi->mblim, lfi->lim,
+ lfi->hev_thr, lfin->mblim, lfin->lim,
+ lfin->hev_thr);
+ }
+ count = 2;
+ } else {
+ lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);
+ }
+ } else if (mask_4x4 & 1) {
+ if ((mask_4x4 & two_block_mask) == two_block_mask) {
+ aom_lpf_horizontal_4_dual(s, pitch, lfi->mblim, lfi->lim,
+ lfi->hev_thr, lfin->mblim, lfin->lim,
+ lfin->hev_thr);
+ count = 2;
+ } else {
+ aom_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);
+ }
+ }
+ }
+
+ s += 4 * count;
+ lfl += step * count;
+ mask_16x16 >>= step * count;
+ mask_8x8 >>= step * count;
+ mask_4x4 >>= step * count;
+ offset += step * count;
+ }
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+static void highbd_filter_selectively_horiz(
+ uint16_t *s, int pitch, int plane, int subsampling, uint64_t mask_16x16,
+ uint64_t mask_8x8, uint64_t mask_4x4, const loop_filter_info_n *lfi_n,
+ uint8_t *lfl, int bd) {
+ uint64_t mask;
+ int count;
+ const int step = 1 << subsampling;
+ const unsigned int two_block_mask = subsampling ? 5 : 3;
+ int offset = 0;
+
+ for (mask = mask_16x16 | mask_8x8 | mask_4x4; mask; mask >>= step * count) {
+ const loop_filter_thresh *lfi = lfi_n->lfthr + *lfl;
+    // Next block's thresholds, when it lies within the current 64x64 block.
+    // If it is out of bounds, its mask is zero and lfin points to the current
+    // edge's filter parameters instead of the next edge's.
+ int next_edge = step;
+ if (offset + next_edge >= MI_SIZE_64X64) next_edge = 0;
+ const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + next_edge);
+
+ count = 1;
+ if (mask & 1) {
+ if (mask_16x16 & 1) {
+ HbdLpfFunc highbd_lpf_horizontal =
+ plane ? aom_highbd_lpf_horizontal_6 : aom_highbd_lpf_horizontal_14;
+
+ if ((mask_16x16 & two_block_mask) == two_block_mask) {
+ if (plane) {
+ aom_highbd_lpf_horizontal_6_dual_c(s, pitch, lfi->mblim, lfi->lim,
+ lfi->hev_thr, lfin->mblim,
+ lfin->lim, lfin->hev_thr, bd);
+ } else {
+ aom_highbd_lpf_horizontal_14_dual(s, pitch, lfi->mblim, lfi->lim,
+ lfi->hev_thr, lfin->mblim,
+ lfin->lim, lfin->hev_thr, bd);
+ }
+ count = 2;
+ } else {
+ highbd_lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr,
+ bd);
+ }
+ } else if (mask_8x8 & 1) {
+ HbdLpfFunc highbd_lpf_horizontal =
+ plane ? aom_highbd_lpf_horizontal_6 : aom_highbd_lpf_horizontal_8;
+
+ if ((mask_8x8 & two_block_mask) == two_block_mask) {
+ if (plane) {
+ aom_highbd_lpf_horizontal_6_dual_c(s, pitch, lfi->mblim, lfi->lim,
+ lfi->hev_thr, lfin->mblim,
+ lfin->lim, lfin->hev_thr, bd);
+ } else {
+ aom_highbd_lpf_horizontal_8_dual_c(s, pitch, lfi->mblim, lfi->lim,
+ lfi->hev_thr, lfin->mblim,
+ lfin->lim, lfin->hev_thr, bd);
+ }
+ count = 2;
+ } else {
+ highbd_lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr,
+ bd);
+ }
+ } else if (mask_4x4 & 1) {
+ if ((mask_4x4 & two_block_mask) == two_block_mask) {
+ aom_highbd_lpf_horizontal_4_dual_c(s, pitch, lfi->mblim, lfi->lim,
+ lfi->hev_thr, lfin->mblim,
+ lfin->lim, lfin->hev_thr, bd);
+ count = 2;
+ } else {
+ aom_highbd_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim,
+ lfi->hev_thr, bd);
+ }
+ }
+ }
+
+ s += 4 * count;
+ lfl += step * count;
+ mask_16x16 >>= step * count;
+ mask_8x8 >>= step * count;
+ mask_4x4 >>= step * count;
+ offset += step * count;
+ }
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
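+// Scan the plane and, for every vertical transform edge that needs deblocking
+// (nonzero filter level on either side, and not skipped on both sides unless
+// it is a coding block border), set the corresponding bit in the left_y/u/v
+// bitmask for the smaller of the two adjacent transform sizes, capped at
+// TX_16X16.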
+void av1_build_bitmask_vert_info(
+ AV1_COMMON *const cm, const struct macroblockd_plane *const plane_ptr,
+ int plane) {
+ const int subsampling_x = plane_ptr->subsampling_x;
+ const int subsampling_y = plane_ptr->subsampling_y;
+ const int is_uv = plane > 0;
+ TX_SIZE tx_size = TX_16X16, prev_tx_size = TX_16X16;
+ uint8_t level, prev_level = 1;
+ uint64_t skip, prev_skip = 0;
+ uint64_t is_coding_block_border;
+
+ for (int r = 0; (r << MI_SIZE_LOG2) < plane_ptr->dst.height; r++) {
+ const int mi_row = r << subsampling_y;
+ const int row = mi_row % MI_SIZE_64X64;
+ const int row_uv = row | subsampling_y;
+ int index = 0;
+ const int shift = get_index_shift(0, row, &index);
+
+ for (int c = 0; (c << MI_SIZE_LOG2) < plane_ptr->dst.width;
+ c += (tx_size_wide_unit[TX_64X64] >> subsampling_x)) {
+ const int mi_col = c << subsampling_x;
+ LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col);
+
+ for (int col_in_unit = 0;
+ col_in_unit < (tx_size_wide_unit[TX_64X64] >> subsampling_x);) {
+ const int x = (c + col_in_unit) << MI_SIZE_LOG2;
+ if (x >= plane_ptr->dst.width) break;
+ const int col = col_in_unit << subsampling_x;
+ const int col_uv = col | subsampling_x;
+ const uint64_t mask = ((uint64_t)1 << (shift | col));
+ skip = lfm->skip.bits[index] & mask;
+ is_coding_block_border = lfm->is_vert_border.bits[index] & mask;
+ switch (plane) {
+ case 0: level = lfm->lfl_y_ver[row_uv][col_uv]; break;
+ case 1: level = lfm->lfl_u_ver[row_uv][col_uv]; break;
+ case 2: level = lfm->lfl_v_ver[row_uv][col_uv]; break;
+ default: assert(plane >= 0 && plane <= 2); return;
+ }
+ for (TX_SIZE ts = TX_4X4; ts <= TX_64X64; ++ts) {
+ if (is_uv && ts == TX_64X64) continue;
+ if (lfm->tx_size_ver[is_uv][ts].bits[index] & mask) {
+ tx_size = ts;
+ break;
+ }
+ }
+ if ((c + col_in_unit > 0) && (level || prev_level) &&
+ (!prev_skip || !skip || is_coding_block_border)) {
+ const TX_SIZE min_tx_size =
+ AOMMIN(TX_16X16, AOMMIN(tx_size, prev_tx_size));
+ const int shift_1 = get_index_shift(col_uv, row_uv, &index);
+ const uint64_t mask_1 = ((uint64_t)1 << shift_1);
+ switch (plane) {
+ case 0: lfm->left_y[min_tx_size].bits[index] |= mask_1; break;
+ case 1: lfm->left_u[min_tx_size].bits[index] |= mask_1; break;
+ case 2: lfm->left_v[min_tx_size].bits[index] |= mask_1; break;
+ default: assert(plane >= 0 && plane <= 2); return;
+ }
+ if (level == 0 && prev_level != 0) {
+ switch (plane) {
+ case 0: lfm->lfl_y_ver[row_uv][col_uv] = prev_level; break;
+ case 1: lfm->lfl_u_ver[row_uv][col_uv] = prev_level; break;
+ case 2: lfm->lfl_v_ver[row_uv][col_uv] = prev_level; break;
+ default: assert(plane >= 0 && plane <= 2); return;
+ }
+ }
+ }
+
+ // update prev info
+ prev_level = level;
+ prev_skip = skip;
+ prev_tx_size = tx_size;
+ // advance
+ col_in_unit += tx_size_wide_unit[tx_size];
+ }
+ }
+ }
+}
+
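+// Same as av1_build_bitmask_vert_info(), but for horizontal transform edges:
+// sets bits in the above_y/u/v bitmasks and propagates filter levels down
+// rows instead of across columns.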
+void av1_build_bitmask_horz_info(
+ AV1_COMMON *const cm, const struct macroblockd_plane *const plane_ptr,
+ int plane) {
+ const int subsampling_x = plane_ptr->subsampling_x;
+ const int subsampling_y = plane_ptr->subsampling_y;
+ const int is_uv = plane > 0;
+ TX_SIZE tx_size = TX_16X16, prev_tx_size = TX_16X16;
+ uint8_t level, prev_level = 1;
+ uint64_t skip, prev_skip = 0;
+ uint64_t is_coding_block_border;
+
+ for (int c = 0; (c << MI_SIZE_LOG2) < plane_ptr->dst.width; c++) {
+ const int mi_col = c << subsampling_x;
+ const int col = mi_col % MI_SIZE_64X64;
+ const int col_uv = col | subsampling_x;
+
+ for (int r = 0; (r << MI_SIZE_LOG2) < plane_ptr->dst.height;
+ r += (tx_size_high_unit[TX_64X64] >> subsampling_y)) {
+ const int mi_row = r << subsampling_y;
+ LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col);
+
+ for (int r_in_unit = 0;
+ r_in_unit < (tx_size_high_unit[TX_64X64] >> subsampling_y);) {
+ const int y = (r + r_in_unit) << MI_SIZE_LOG2;
+ if (y >= plane_ptr->dst.height) break;
+ const int row = r_in_unit << subsampling_y;
+ const int row_uv = row | subsampling_y;
+ int index = 0;
+ const int shift = get_index_shift(col, row, &index);
+ const uint64_t mask = ((uint64_t)1 << shift);
+ skip = lfm->skip.bits[index] & mask;
+ is_coding_block_border = lfm->is_horz_border.bits[index] & mask;
+ switch (plane) {
+ case 0: level = lfm->lfl_y_hor[row_uv][col_uv]; break;
+ case 1: level = lfm->lfl_u_hor[row_uv][col_uv]; break;
+ case 2: level = lfm->lfl_v_hor[row_uv][col_uv]; break;
+ default: assert(plane >= 0 && plane <= 2); return;
+ }
+ for (TX_SIZE ts = TX_4X4; ts <= TX_64X64; ++ts) {
+ if (is_uv && ts == TX_64X64) continue;
+ if (lfm->tx_size_hor[is_uv][ts].bits[index] & mask) {
+ tx_size = ts;
+ break;
+ }
+ }
+ if ((r + r_in_unit > 0) && (level || prev_level) &&
+ (!prev_skip || !skip || is_coding_block_border)) {
+ const TX_SIZE min_tx_size =
+ AOMMIN(TX_16X16, AOMMIN(tx_size, prev_tx_size));
+ const int shift_1 = get_index_shift(col_uv, row_uv, &index);
+ const uint64_t mask_1 = ((uint64_t)1 << shift_1);
+
+ switch (plane) {
+ case 0: lfm->above_y[min_tx_size].bits[index] |= mask_1; break;
+ case 1: lfm->above_u[min_tx_size].bits[index] |= mask_1; break;
+ case 2: lfm->above_v[min_tx_size].bits[index] |= mask_1; break;
+ default: assert(plane >= 0 && plane <= 2); return;
+ }
+ if (level == 0 && prev_level != 0) {
+ switch (plane) {
+ case 0: lfm->lfl_y_hor[row_uv][col_uv] = prev_level; break;
+ case 1: lfm->lfl_u_hor[row_uv][col_uv] = prev_level; break;
+ case 2: lfm->lfl_v_hor[row_uv][col_uv] = prev_level; break;
+ default: assert(plane >= 0 && plane <= 2); return;
+ }
+ }
+ }
+
+ // update prev info
+ prev_level = level;
+ prev_skip = skip;
+ prev_tx_size = tx_size;
+ // advance
+ r_in_unit += tx_size_high_unit[tx_size];
+ }
+ }
+ }
+}
+
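+// Apply the vertical loop filter to one 64x64 unit of the plane, two 4x4 rows
+// at a time, using the left_* bitmasks built above. The high bit depth path
+// is taken when the sequence header enables it.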
+void av1_filter_block_plane_bitmask_vert(
+ AV1_COMMON *const cm, struct macroblockd_plane *const plane_ptr, int pl,
+ int mi_row, int mi_col) {
+ struct buf_2d *const dst = &plane_ptr->dst;
+ uint8_t *const buf0 = dst->buf;
+ const int ssx = plane_ptr->subsampling_x;
+ const int ssy = plane_ptr->subsampling_y;
+ const int mask_cutoff = 0xffff;
+ const int row_step = 1 << ssy;
+ const int two_row_step = 2 << ssy;
+ const int row_stride = dst->stride << MI_SIZE_LOG2;
+ const int two_row_stride = row_stride << 1;
+ uint64_t mask_16x16 = 0;
+ uint64_t mask_8x8 = 0;
+ uint64_t mask_4x4 = 0;
+ uint8_t *lfl;
+ uint8_t *lfl2;
+ LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col);
+ assert(lfm);
+
+  // Vertical filtering: filter two rows at a time.
+ for (int r = 0;
+ ((mi_row + r) << MI_SIZE_LOG2) < cm->height && r < MI_SIZE_64X64;
+ r += two_row_step) {
+ const int row = r | ssy;
+ const int row_next = row + row_step;
+ const int col = ssx;
+ int index = 0;
+ const int shift = get_index_shift(col, row, &index);
+ int index_next = 0;
+ const int shift_next = get_index_shift(col, row_next, &index_next);
+ const int has_next_row = row_next < cm->mi_params.mi_rows;
+ switch (pl) {
+ case 0:
+ mask_16x16 = lfm->left_y[TX_16X16].bits[index];
+ mask_8x8 = lfm->left_y[TX_8X8].bits[index];
+ mask_4x4 = lfm->left_y[TX_4X4].bits[index];
+ lfl = &lfm->lfl_y_ver[row][col];
+ lfl2 = &lfm->lfl_y_ver[row_next][col];
+ break;
+ case 1:
+ mask_16x16 = lfm->left_u[TX_16X16].bits[index];
+ mask_8x8 = lfm->left_u[TX_8X8].bits[index];
+ mask_4x4 = lfm->left_u[TX_4X4].bits[index];
+ lfl = &lfm->lfl_u_ver[row][col];
+ lfl2 = &lfm->lfl_u_ver[row_next][col];
+ break;
+ case 2:
+ mask_16x16 = lfm->left_v[TX_16X16].bits[index];
+ mask_8x8 = lfm->left_v[TX_8X8].bits[index];
+ mask_4x4 = lfm->left_v[TX_4X4].bits[index];
+ lfl = &lfm->lfl_v_ver[row][col];
+ lfl2 = &lfm->lfl_v_ver[row_next][col];
+ break;
+ default: assert(pl >= 0 && pl <= 2); return;
+ }
+ uint64_t mask_16x16_0 = (mask_16x16 >> shift) & mask_cutoff;
+ uint64_t mask_8x8_0 = (mask_8x8 >> shift) & mask_cutoff;
+ uint64_t mask_4x4_0 = (mask_4x4 >> shift) & mask_cutoff;
+ uint64_t mask_16x16_1 = (mask_16x16 >> shift_next) & mask_cutoff;
+ uint64_t mask_8x8_1 = (mask_8x8 >> shift_next) & mask_cutoff;
+ uint64_t mask_4x4_1 = (mask_4x4 >> shift_next) & mask_cutoff;
+ if (!has_next_row) {
+ mask_16x16_1 = 0;
+ mask_8x8_1 = 0;
+ mask_4x4_1 = 0;
+ }
+
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (cm->seq_params.use_highbitdepth)
+ highbd_filter_selectively_vert_row2(
+ ssx, CONVERT_TO_SHORTPTR(dst->buf), dst->stride, pl, mask_16x16_0,
+ mask_8x8_0, mask_4x4_0, mask_16x16_1, mask_8x8_1, mask_4x4_1,
+ &cm->lf_info, lfl, lfl2, (int)cm->seq_params.bit_depth);
+ else
+ filter_selectively_vert_row2(
+ ssx, dst->buf, dst->stride, pl, mask_16x16_0, mask_8x8_0, mask_4x4_0,
+ mask_16x16_1, mask_8x8_1, mask_4x4_1, &cm->lf_info, lfl, lfl2);
+#else
+ filter_selectively_vert_row2(
+ ssx, dst->buf, dst->stride, pl, mask_16x16_0, mask_8x8_0, mask_4x4_0,
+ mask_16x16_1, mask_8x8_1, mask_4x4_1, &cm->lf_info, lfl, lfl2);
+#endif
+ dst->buf += two_row_stride;
+ }
+ // reset buf pointer for horizontal filtering
+ dst->buf = buf0;
+}
+
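+// Horizontal counterpart of av1_filter_block_plane_bitmask_vert(): filters one
+// 4x4 row at a time using the above_* bitmasks and never filters the first
+// row of the frame.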
+void av1_filter_block_plane_bitmask_horz(
+ AV1_COMMON *const cm, struct macroblockd_plane *const plane_ptr, int pl,
+ int mi_row, int mi_col) {
+ struct buf_2d *const dst = &plane_ptr->dst;
+ uint8_t *const buf0 = dst->buf;
+ const int ssx = plane_ptr->subsampling_x;
+ const int ssy = plane_ptr->subsampling_y;
+ const int mask_cutoff = 0xffff;
+ const int row_step = 1 << ssy;
+ const int row_stride = dst->stride << MI_SIZE_LOG2;
+ uint64_t mask_16x16 = 0;
+ uint64_t mask_8x8 = 0;
+ uint64_t mask_4x4 = 0;
+ uint8_t *lfl;
+ LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col);
+ assert(lfm);
+ for (int r = 0;
+ ((mi_row + r) << MI_SIZE_LOG2) < cm->height && r < MI_SIZE_64X64;
+ r += row_step) {
+ if (mi_row + r == 0) {
+ dst->buf += row_stride;
+ continue;
+ }
+ const int row = r | ssy;
+ const int col = ssx;
+ int index = 0;
+ const int shift = get_index_shift(col, row, &index);
+ switch (pl) {
+ case 0:
+ mask_16x16 = lfm->above_y[TX_16X16].bits[index];
+ mask_8x8 = lfm->above_y[TX_8X8].bits[index];
+ mask_4x4 = lfm->above_y[TX_4X4].bits[index];
+ lfl = &lfm->lfl_y_hor[row][col];
+ break;
+ case 1:
+ mask_16x16 = lfm->above_u[TX_16X16].bits[index];
+ mask_8x8 = lfm->above_u[TX_8X8].bits[index];
+ mask_4x4 = lfm->above_u[TX_4X4].bits[index];
+ lfl = &lfm->lfl_u_hor[row][col];
+ break;
+ case 2:
+ mask_16x16 = lfm->above_v[TX_16X16].bits[index];
+ mask_8x8 = lfm->above_v[TX_8X8].bits[index];
+ mask_4x4 = lfm->above_v[TX_4X4].bits[index];
+ lfl = &lfm->lfl_v_hor[row][col];
+ break;
+ default: assert(pl >= 0 && pl <= 2); return;
+ }
+ mask_16x16 = (mask_16x16 >> shift) & mask_cutoff;
+ mask_8x8 = (mask_8x8 >> shift) & mask_cutoff;
+ mask_4x4 = (mask_4x4 >> shift) & mask_cutoff;
+
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (cm->seq_params.use_highbitdepth)
+ highbd_filter_selectively_horiz(
+ CONVERT_TO_SHORTPTR(dst->buf), dst->stride, pl, ssx, mask_16x16,
+ mask_8x8, mask_4x4, &cm->lf_info, lfl, (int)cm->seq_params.bit_depth);
+ else
+ filter_selectively_horiz(dst->buf, dst->stride, pl, ssx, mask_16x16,
+ mask_8x8, mask_4x4, &cm->lf_info, lfl);
+#else
+ filter_selectively_horiz(dst->buf, dst->stride, pl, ssx, mask_16x16,
+ mask_8x8, mask_4x4, &cm->lf_info, lfl);
+#endif
+ dst->buf += row_stride;
+ }
+ // reset buf pointer for next block
+ dst->buf = buf0;
+}
+
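+// Per-superblock variant of the vertical bitmask filter: walks the superblock
+// two 4x4 rows at a time and fetches the LoopFilterMask covering each 64x64
+// unit along the way.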
+void av1_filter_block_plane_ver(AV1_COMMON *const cm,
+ struct macroblockd_plane *const plane_ptr,
+ int pl, int mi_row, int mi_col) {
+ struct buf_2d *const dst = &plane_ptr->dst;
+ int r, c;
+ const int ssx = plane_ptr->subsampling_x;
+ const int ssy = plane_ptr->subsampling_y;
+ const int mask_cutoff = 0xffff;
+ const int single_step = 1 << ssy;
+ const int r_step = 2 << ssy;
+ uint64_t mask_16x16 = 0;
+ uint64_t mask_8x8 = 0;
+ uint64_t mask_4x4 = 0;
+ uint8_t *lfl;
+ uint8_t *lfl2;
+
+ // filter two rows at a time
+ for (r = 0; r < cm->seq_params.mib_size &&
+ ((mi_row + r) << MI_SIZE_LOG2 < cm->height);
+ r += r_step) {
+ for (c = 0; c < cm->seq_params.mib_size &&
+ ((mi_col + c) << MI_SIZE_LOG2 < cm->width);
+ c += MI_SIZE_64X64) {
+ dst->buf += ((c << MI_SIZE_LOG2) >> ssx);
+ LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row + r, mi_col + c);
+ assert(lfm);
+ const int row = ((mi_row + r) | ssy) % MI_SIZE_64X64;
+ const int col = ((mi_col + c) | ssx) % MI_SIZE_64X64;
+ int index = 0;
+ const int shift = get_index_shift(col, row, &index);
+      // The current and next rows should belong to the same mask_idx and
+      // index; compute the next row's shift below.
+ const int row_next = row + single_step;
+ int index_next = 0;
+ const int shift_next = get_index_shift(col, row_next, &index_next);
+ switch (pl) {
+ case 0:
+ mask_16x16 = lfm->left_y[TX_16X16].bits[index];
+ mask_8x8 = lfm->left_y[TX_8X8].bits[index];
+ mask_4x4 = lfm->left_y[TX_4X4].bits[index];
+ lfl = &lfm->lfl_y_ver[row][col];
+ lfl2 = &lfm->lfl_y_ver[row_next][col];
+ break;
+ case 1:
+ mask_16x16 = lfm->left_u[TX_16X16].bits[index];
+ mask_8x8 = lfm->left_u[TX_8X8].bits[index];
+ mask_4x4 = lfm->left_u[TX_4X4].bits[index];
+ lfl = &lfm->lfl_u_ver[row][col];
+ lfl2 = &lfm->lfl_u_ver[row_next][col];
+ break;
+ case 2:
+ mask_16x16 = lfm->left_v[TX_16X16].bits[index];
+ mask_8x8 = lfm->left_v[TX_8X8].bits[index];
+ mask_4x4 = lfm->left_v[TX_4X4].bits[index];
+ lfl = &lfm->lfl_v_ver[row][col];
+ lfl2 = &lfm->lfl_v_ver[row_next][col];
+ break;
+ default: assert(pl >= 0 && pl <= 2); return;
+ }
+ uint64_t mask_16x16_0 = (mask_16x16 >> shift) & mask_cutoff;
+ uint64_t mask_8x8_0 = (mask_8x8 >> shift) & mask_cutoff;
+ uint64_t mask_4x4_0 = (mask_4x4 >> shift) & mask_cutoff;
+ uint64_t mask_16x16_1 = (mask_16x16 >> shift_next) & mask_cutoff;
+ uint64_t mask_8x8_1 = (mask_8x8 >> shift_next) & mask_cutoff;
+ uint64_t mask_4x4_1 = (mask_4x4 >> shift_next) & mask_cutoff;
+
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (cm->seq_params.use_highbitdepth)
+ highbd_filter_selectively_vert_row2(
+ ssx, CONVERT_TO_SHORTPTR(dst->buf), dst->stride, pl, mask_16x16_0,
+ mask_8x8_0, mask_4x4_0, mask_16x16_1, mask_8x8_1, mask_4x4_1,
+ &cm->lf_info, lfl, lfl2, (int)cm->seq_params.bit_depth);
+ else
+ filter_selectively_vert_row2(ssx, dst->buf, dst->stride, pl,
+ mask_16x16_0, mask_8x8_0, mask_4x4_0,
+ mask_16x16_1, mask_8x8_1, mask_4x4_1,
+ &cm->lf_info, lfl, lfl2);
+#else
+ filter_selectively_vert_row2(
+ ssx, dst->buf, dst->stride, pl, mask_16x16_0, mask_8x8_0, mask_4x4_0,
+ mask_16x16_1, mask_8x8_1, mask_4x4_1, &cm->lf_info, lfl, lfl2);
+#endif
+ dst->buf -= ((c << MI_SIZE_LOG2) >> ssx);
+ }
+ dst->buf += 2 * MI_SIZE * dst->stride;
+ }
+}
+
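+// Per-superblock variant of the horizontal bitmask filter; as above, the
+// first row of the frame is left unfiltered.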
+void av1_filter_block_plane_hor(AV1_COMMON *const cm,
+ struct macroblockd_plane *const plane_ptr,
+ int pl, int mi_row, int mi_col) {
+ struct buf_2d *const dst = &plane_ptr->dst;
+ int r, c;
+ const int ssx = plane_ptr->subsampling_x;
+ const int ssy = plane_ptr->subsampling_y;
+ const int mask_cutoff = 0xffff;
+ const int r_step = 1 << ssy;
+ uint64_t mask_16x16 = 0;
+ uint64_t mask_8x8 = 0;
+ uint64_t mask_4x4 = 0;
+ uint8_t *lfl;
+
+ for (r = 0; r < cm->seq_params.mib_size &&
+ ((mi_row + r) << MI_SIZE_LOG2 < cm->height);
+ r += r_step) {
+ for (c = 0; c < cm->seq_params.mib_size &&
+ ((mi_col + c) << MI_SIZE_LOG2 < cm->width);
+ c += MI_SIZE_64X64) {
+ if (mi_row + r == 0) continue;
+
+ dst->buf += ((c << MI_SIZE_LOG2) >> ssx);
+ LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row + r, mi_col + c);
+ assert(lfm);
+ const int row = ((mi_row + r) | ssy) % MI_SIZE_64X64;
+ const int col = ((mi_col + c) | ssx) % MI_SIZE_64X64;
+ int index = 0;
+ const int shift = get_index_shift(col, row, &index);
+ switch (pl) {
+ case 0:
+ mask_16x16 = lfm->above_y[TX_16X16].bits[index];
+ mask_8x8 = lfm->above_y[TX_8X8].bits[index];
+ mask_4x4 = lfm->above_y[TX_4X4].bits[index];
+ lfl = &lfm->lfl_y_hor[row][col];
+ break;
+ case 1:
+ mask_16x16 = lfm->above_u[TX_16X16].bits[index];
+ mask_8x8 = lfm->above_u[TX_8X8].bits[index];
+ mask_4x4 = lfm->above_u[TX_4X4].bits[index];
+ lfl = &lfm->lfl_u_hor[row][col];
+ break;
+ case 2:
+ mask_16x16 = lfm->above_v[TX_16X16].bits[index];
+ mask_8x8 = lfm->above_v[TX_8X8].bits[index];
+ mask_4x4 = lfm->above_v[TX_4X4].bits[index];
+ lfl = &lfm->lfl_v_hor[row][col];
+ break;
+ default: assert(pl >= 0 && pl <= 2); return;
+ }
+ mask_16x16 = (mask_16x16 >> shift) & mask_cutoff;
+ mask_8x8 = (mask_8x8 >> shift) & mask_cutoff;
+ mask_4x4 = (mask_4x4 >> shift) & mask_cutoff;
+
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (cm->seq_params.use_highbitdepth)
+ highbd_filter_selectively_horiz(CONVERT_TO_SHORTPTR(dst->buf),
+ dst->stride, pl, ssx, mask_16x16,
+ mask_8x8, mask_4x4, &cm->lf_info, lfl,
+ (int)cm->seq_params.bit_depth);
+ else
+ filter_selectively_horiz(dst->buf, dst->stride, pl, ssx, mask_16x16,
+ mask_8x8, mask_4x4, &cm->lf_info, lfl);
+#else
+ filter_selectively_horiz(dst->buf, dst->stride, pl, ssx, mask_16x16,
+ mask_8x8, mask_4x4, &cm->lf_info, lfl);
+#endif
+ dst->buf -= ((c << MI_SIZE_LOG2) >> ssx);
+ }
+ dst->buf += MI_SIZE * dst->stride;
+ }
+}
+
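+// Record the transform-size bitmasks for a block coded with variable
+// transform sizes. The mask_id_table_* lookups map (block size, tx size) to a
+// precomputed bit pattern that is OR-ed into the per-plane tx_size_ver /
+// tx_size_hor masks at the block's position within the 64x64 unit.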
+void av1_store_bitmask_vartx(AV1_COMMON *cm, int mi_row, int mi_col,
+ BLOCK_SIZE bsize, TX_SIZE tx_size,
+ MB_MODE_INFO *mbmi) {
+ LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col);
+ const TX_SIZE tx_size_y_vert = txsize_vert_map[tx_size];
+ const TX_SIZE tx_size_y_horz = txsize_horz_map[tx_size];
+ const TX_SIZE tx_size_uv_vert = txsize_vert_map[av1_get_max_uv_txsize(
+ mbmi->sb_type, cm->seq_params.subsampling_x,
+ cm->seq_params.subsampling_y)];
+ const TX_SIZE tx_size_uv_horz = txsize_horz_map[av1_get_max_uv_txsize(
+ mbmi->sb_type, cm->seq_params.subsampling_x,
+ cm->seq_params.subsampling_y)];
+ const int is_square_transform_size = tx_size <= TX_64X64;
+ int mask_id = 0;
+ int offset = 0;
+ const int half_ratio_tx_size_max32 =
+ (tx_size > TX_64X64) & (tx_size <= TX_32X16);
+ if (is_square_transform_size) {
+ switch (tx_size) {
+ case TX_4X4: mask_id = mask_id_table_tx_4x4[bsize]; break;
+ case TX_8X8:
+ mask_id = mask_id_table_tx_8x8[bsize];
+ offset = 19;
+ break;
+ case TX_16X16:
+ mask_id = mask_id_table_tx_16x16[bsize];
+ offset = 33;
+ break;
+ case TX_32X32:
+ mask_id = mask_id_table_tx_32x32[bsize];
+ offset = 42;
+ break;
+ case TX_64X64: mask_id = 46; break;
+ default: assert(!is_square_transform_size); return;
+ }
+ mask_id += offset;
+ } else if (half_ratio_tx_size_max32) {
+ int tx_size_equal_block_size = bsize == txsize_to_bsize[tx_size];
+ mask_id = 47 + 2 * (tx_size - TX_4X8) + (tx_size_equal_block_size ? 0 : 1);
+ } else if (tx_size == TX_32X64) {
+ mask_id = 59;
+ } else if (tx_size == TX_64X32) {
+ mask_id = 60;
+ } else { // quarter ratio tx size
+ mask_id = 61 + (tx_size - TX_4X16);
+ }
+ int index = 0;
+ const int row = mi_row % MI_SIZE_64X64;
+ const int col = mi_col % MI_SIZE_64X64;
+ const int shift = get_index_shift(col, row, &index);
+ const int vert_shift = tx_size_y_vert <= TX_8X8 ? shift : col;
+ for (int i = 0; i + index < 4; ++i) {
+ // y vertical.
+ lfm->tx_size_ver[0][tx_size_y_horz].bits[i + index] |=
+ (left_mask_univariant_reordered[mask_id].bits[i] << vert_shift);
+ // y horizontal.
+ lfm->tx_size_hor[0][tx_size_y_vert].bits[i + index] |=
+ (above_mask_univariant_reordered[mask_id].bits[i] << shift);
+ // u/v vertical.
+ lfm->tx_size_ver[1][tx_size_uv_horz].bits[i + index] |=
+ (left_mask_univariant_reordered[mask_id].bits[i] << vert_shift);
+ // u/v horizontal.
+ lfm->tx_size_hor[1][tx_size_uv_vert].bits[i + index] |=
+ (above_mask_univariant_reordered[mask_id].bits[i] << shift);
+ }
+}
+
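+// Same as av1_store_bitmask_vartx(), but for blocks coded with a single
+// (uniform) transform size taken from mbmi->tx_size.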
+void av1_store_bitmask_univariant_tx(AV1_COMMON *cm, int mi_row, int mi_col,
+ BLOCK_SIZE bsize, MB_MODE_INFO *mbmi) {
+ // Use a lookup table that provides one bitmask for a given block size and
+ // a univariant transform size.
+ int index;
+ int shift;
+ int row;
+ int col;
+ LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col);
+ const TX_SIZE tx_size_y_vert = txsize_vert_map[mbmi->tx_size];
+ const TX_SIZE tx_size_y_horz = txsize_horz_map[mbmi->tx_size];
+ const TX_SIZE tx_size_uv_vert = txsize_vert_map[av1_get_max_uv_txsize(
+ mbmi->sb_type, cm->seq_params.subsampling_x,
+ cm->seq_params.subsampling_y)];
+ const TX_SIZE tx_size_uv_horz = txsize_horz_map[av1_get_max_uv_txsize(
+ mbmi->sb_type, cm->seq_params.subsampling_x,
+ cm->seq_params.subsampling_y)];
+ const int is_square_transform_size = mbmi->tx_size <= TX_64X64;
+ int mask_id = 0;
+ int offset = 0;
+ const int half_ratio_tx_size_max32 =
+ (mbmi->tx_size > TX_64X64) & (mbmi->tx_size <= TX_32X16);
+ if (is_square_transform_size) {
+ switch (mbmi->tx_size) {
+ case TX_4X4: mask_id = mask_id_table_tx_4x4[bsize]; break;
+ case TX_8X8:
+ mask_id = mask_id_table_tx_8x8[bsize];
+ offset = 19;
+ break;
+ case TX_16X16:
+ mask_id = mask_id_table_tx_16x16[bsize];
+ offset = 33;
+ break;
+ case TX_32X32:
+ mask_id = mask_id_table_tx_32x32[bsize];
+ offset = 42;
+ break;
+ case TX_64X64: mask_id = 46; break;
+ default: assert(!is_square_transform_size); return;
+ }
+ mask_id += offset;
+ } else if (half_ratio_tx_size_max32) {
+ int tx_size_equal_block_size = bsize == txsize_to_bsize[mbmi->tx_size];
+ mask_id =
+ 47 + 2 * (mbmi->tx_size - TX_4X8) + (tx_size_equal_block_size ? 0 : 1);
+ } else if (mbmi->tx_size == TX_32X64) {
+ mask_id = 59;
+ } else if (mbmi->tx_size == TX_64X32) {
+ mask_id = 60;
+ } else { // quarter ratio tx size
+ mask_id = 61 + (mbmi->tx_size - TX_4X16);
+ }
+ row = mi_row % MI_SIZE_64X64;
+ col = mi_col % MI_SIZE_64X64;
+ shift = get_index_shift(col, row, &index);
+ const int vert_shift = tx_size_y_vert <= TX_8X8 ? shift : col;
+ for (int i = 0; i + index < 4; ++i) {
+ // y vertical.
+ lfm->tx_size_ver[0][tx_size_y_horz].bits[i + index] |=
+ (left_mask_univariant_reordered[mask_id].bits[i] << vert_shift);
+ // y horizontal.
+ lfm->tx_size_hor[0][tx_size_y_vert].bits[i + index] |=
+ (above_mask_univariant_reordered[mask_id].bits[i] << shift);
+ // u/v vertical.
+ lfm->tx_size_ver[1][tx_size_uv_horz].bits[i + index] |=
+ (left_mask_univariant_reordered[mask_id].bits[i] << vert_shift);
+ // u/v horizontal.
+ lfm->tx_size_hor[1][tx_size_uv_vert].bits[i + index] |=
+ (above_mask_univariant_reordered[mask_id].bits[i] << shift);
+ }
+}
+
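+// Record the remaining per-block loop filter information: the horizontal and
+// vertical coding block border bits, the skip bits for skipped inter blocks,
+// and the per-plane filter levels for every 4x4 unit the block covers.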
+void av1_store_bitmask_other_info(AV1_COMMON *cm, int mi_row, int mi_col,
+ BLOCK_SIZE bsize, MB_MODE_INFO *mbmi,
+ int is_horz_coding_block_border,
+ int is_vert_coding_block_border) {
+ int index;
+ int shift;
+ int row;
+ LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col);
+ const int row_start = mi_row % MI_SIZE_64X64;
+ const int col_start = mi_col % MI_SIZE_64X64;
+ shift = get_index_shift(col_start, row_start, &index);
+ if (is_horz_coding_block_border) {
+ const int block_shift = shift + mi_size_wide[bsize];
+ assert(block_shift <= 64);
+ const uint64_t right_edge_shift =
+ (block_shift == 64) ? 0xffffffffffffffff : ((uint64_t)1 << block_shift);
+ const uint64_t left_edge_shift = (block_shift == 64)
+ ? (((uint64_t)1 << shift) - 1)
+ : ((uint64_t)1 << shift);
+ assert(right_edge_shift > left_edge_shift);
+ const uint64_t top_edge_mask = right_edge_shift - left_edge_shift;
+ lfm->is_horz_border.bits[index] |= top_edge_mask;
+ }
+ if (is_vert_coding_block_border) {
+ const int is_vert_border = mask_id_table_vert_border[bsize];
+ const int vert_shift = block_size_high[bsize] <= 8 ? shift : col_start;
+ for (int i = 0; i + index < 4; ++i) {
+ lfm->is_vert_border.bits[i + index] |=
+ (left_mask_univariant_reordered[is_vert_border].bits[i]
+ << vert_shift);
+ }
+ }
+ const int is_skip = mbmi->skip && is_inter_block(mbmi);
+ if (is_skip) {
+ const int is_skip_mask = mask_id_table_tx_4x4[bsize];
+ for (int i = 0; i + index < 4; ++i) {
+ lfm->skip.bits[i + index] |=
+ (above_mask_univariant_reordered[is_skip_mask].bits[i] << shift);
+ }
+ }
+ const uint8_t level_vert_y =
+ av1_get_filter_level(cm, &cm->lf_info, 0, 0, mbmi);
+ const uint8_t level_horz_y =
+ av1_get_filter_level(cm, &cm->lf_info, 1, 0, mbmi);
+ const uint8_t level_u = av1_get_filter_level(cm, &cm->lf_info, 0, 1, mbmi);
+ const uint8_t level_v = av1_get_filter_level(cm, &cm->lf_info, 0, 2, mbmi);
+ for (int r = mi_row; r < mi_row + mi_size_high[bsize]; r++) {
+ index = 0;
+ row = r % MI_SIZE_64X64;
+ memset(&lfm->lfl_y_ver[row][col_start], level_vert_y,
+ sizeof(uint8_t) * mi_size_wide[bsize]);
+ memset(&lfm->lfl_y_hor[row][col_start], level_horz_y,
+ sizeof(uint8_t) * mi_size_wide[bsize]);
+ memset(&lfm->lfl_u_ver[row][col_start], level_u,
+ sizeof(uint8_t) * mi_size_wide[bsize]);
+ memset(&lfm->lfl_u_hor[row][col_start], level_u,
+ sizeof(uint8_t) * mi_size_wide[bsize]);
+ memset(&lfm->lfl_v_ver[row][col_start], level_v,
+ sizeof(uint8_t) * mi_size_wide[bsize]);
+ memset(&lfm->lfl_v_hor[row][col_start], level_v,
+ sizeof(uint8_t) * mi_size_wide[bsize]);
+ }
+}
+#endif // CONFIG_LPF_MASK
diff --git a/media/libaom/src/av1/common/mv.h b/media/libaom/src/av1/common/mv.h
index 5b0225192..be539e820 100644
--- a/media/libaom/src/av1/common/mv.h
+++ b/media/libaom/src/av1/common/mv.h
@@ -21,17 +21,34 @@ extern "C" {
#endif
#define INVALID_MV 0x80008000
+#define GET_MV_RAWPEL(x) (((x) + 3 + ((x) >= 0)) >> 3)
+#define GET_MV_SUBPEL(x) ((x)*8)
+#define MARK_MV_INVALID(mv) \
+ do { \
+ ((int_mv *)(mv))->as_int = INVALID_MV; \
+ } while (0);
+#define CHECK_MV_EQUAL(x, y) (((x).row == (y).row) && ((x).col == (y).col))
+
+// The motion vector in units of full pixel
+typedef struct fullpel_mv {
+ int16_t row;
+ int16_t col;
+} FULLPEL_MV;
+
+// The motion vector in units of 1/8-pel
typedef struct mv {
int16_t row;
int16_t col;
} MV;
static const MV kZeroMv = { 0, 0 };
+static const FULLPEL_MV kZeroFullMv = { 0, 0 };
typedef union int_mv {
uint32_t as_int;
MV as_mv;
+ FULLPEL_MV as_fullmv;
} int_mv; /* facilitates faster equality tests and copies */
typedef struct mv32 {
@@ -39,6 +56,38 @@ typedef struct mv32 {
int32_t col;
} MV32;
+// The mv limit for fullpel mvs
+typedef struct {
+ int col_min;
+ int col_max;
+ int row_min;
+ int row_max;
+} FullMvLimits;
+
+// The mv limit for subpel mvs
+typedef struct {
+ int col_min;
+ int col_max;
+ int row_min;
+ int row_max;
+} SubpelMvLimits;
+
+static AOM_INLINE FULLPEL_MV get_fullmv_from_mv(const MV *subpel_mv) {
+ const FULLPEL_MV full_mv = { (int16_t)GET_MV_RAWPEL(subpel_mv->row),
+ (int16_t)GET_MV_RAWPEL(subpel_mv->col) };
+ return full_mv;
+}
+
+static AOM_INLINE MV get_mv_from_fullmv(const FULLPEL_MV *full_mv) {
+ const MV subpel_mv = { (int16_t)GET_MV_SUBPEL(full_mv->row),
+ (int16_t)GET_MV_SUBPEL(full_mv->col) };
+ return subpel_mv;
+}
+
+static AOM_INLINE void convert_fullmv_to_mv(int_mv *mv) {
+ mv->as_mv = get_mv_from_fullmv(&mv->as_fullmv);
+}
+
// Bits of precision used for the model
#define WARPEDMODEL_PREC_BITS 16
#define WARPEDMODEL_ROW3HOMO_PREC_BITS 16
@@ -56,13 +105,13 @@ typedef struct mv32 {
#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
/* clang-format off */
-typedef enum ATTRIBUTE_PACKED {
+enum {
IDENTITY = 0, // identity transformation, 0-parameter
TRANSLATION = 1, // translational motion 2-parameter
ROTZOOM = 2, // simplified affine with rotation + zoom only, 4-parameter
AFFINE = 3, // affine, 6-parameter
TRANS_TYPES,
-} TransformationType;
+} UENUM1BYTE(TransformationType);
/* clang-format on */
// Number of types used for global motion (must be >= 3 and <= TRANS_TYPES)
@@ -87,18 +136,18 @@ static const int trans_model_params[TRANS_TYPES] = { 0, 2, 4, 6 };
// z . y' = m4 m5 m1 * y
// 1] m6 m7 1) 1]
typedef struct {
- TransformationType wmtype;
int32_t wmmat[8];
int16_t alpha, beta, gamma, delta;
+ TransformationType wmtype;
int8_t invalid;
} WarpedMotionParams;
/* clang-format off */
static const WarpedMotionParams default_warp_params = {
- IDENTITY,
{ 0, 0, (1 << WARPEDMODEL_PREC_BITS), 0, 0, (1 << WARPEDMODEL_PREC_BITS), 0,
0 },
0, 0, 0, 0,
+ IDENTITY,
0,
};
/* clang-format on */
@@ -225,7 +274,8 @@ static INLINE int_mv gm_get_motion_vector(const WarpedMotionParams *gm,
// All global motion vectors are stored with WARPEDMODEL_PREC_BITS (16)
// bits of fractional precision. The offset for a translation is stored in
// entries 0 and 1. For translations, all but the top three (two if
- // cm->allow_high_precision_mv is false) fractional bits are always zero.
+ // cm->features.allow_high_precision_mv is false) fractional bits are always
+ // zero.
//
// After the right shifts, there are 3 fractional bits of precision. If
// allow_hp is false, the bottom bit is always zero (so we don't need a
@@ -263,7 +313,7 @@ static INLINE int_mv gm_get_motion_vector(const WarpedMotionParams *gm,
return res;
}
-static INLINE TransformationType get_gmtype(const WarpedMotionParams *gm) {
+static INLINE TransformationType get_wmtype(const WarpedMotionParams *gm) {
if (gm->wmmat[5] == (1 << WARPEDMODEL_PREC_BITS) && !gm->wmmat[4] &&
gm->wmmat[2] == (1 << WARPEDMODEL_PREC_BITS) && !gm->wmmat[3]) {
return ((!gm->wmmat[1] && !gm->wmmat[0]) ? IDENTITY : TRANSLATION);
@@ -277,7 +327,6 @@ static INLINE TransformationType get_gmtype(const WarpedMotionParams *gm) {
typedef struct candidate_mv {
int_mv this_mv;
int_mv comp_mv;
- int weight;
} CANDIDATE_MV;
static INLINE int is_zero_mv(const MV *mv) {
@@ -288,10 +337,14 @@ static INLINE int is_equal_mv(const MV *a, const MV *b) {
return *((const uint32_t *)a) == *((const uint32_t *)b);
}
-static INLINE void clamp_mv(MV *mv, int min_col, int max_col, int min_row,
- int max_row) {
- mv->col = clamp(mv->col, min_col, max_col);
- mv->row = clamp(mv->row, min_row, max_row);
+static INLINE void clamp_mv(MV *mv, const SubpelMvLimits *mv_limits) {
+ mv->col = clamp(mv->col, mv_limits->col_min, mv_limits->col_max);
+ mv->row = clamp(mv->row, mv_limits->row_min, mv_limits->row_max);
+}
+
+static INLINE void clamp_fullmv(FULLPEL_MV *mv, const FullMvLimits *mv_limits) {
+ mv->col = clamp(mv->col, mv_limits->col_min, mv_limits->col_max);
+ mv->row = clamp(mv->row, mv_limits->row_min, mv_limits->row_max);
}
#ifdef __cplusplus
diff --git a/media/libaom/src/av1/common/mvref_common.c b/media/libaom/src/av1/common/mvref_common.c
index 7f24ab4e6..db3098cc0 100644
--- a/media/libaom/src/av1/common/mvref_common.c
+++ b/media/libaom/src/av1/common/mvref_common.c
@@ -23,7 +23,7 @@ static int div_mult[32] = { 0, 16384, 8192, 5461, 4096, 3276, 2730, 2340,
// TODO(jingning): Consider the use of lookup table for (num / den)
// altogether.
-static void get_mv_projection(MV *output, MV ref, int num, int den) {
+static AOM_INLINE void get_mv_projection(MV *output, MV ref, int num, int den) {
den = AOMMIN(den, MAX_FRAME_DISTANCE);
num = num > 0 ? AOMMIN(num, MAX_FRAME_DISTANCE)
: AOMMAX(num, -MAX_FRAME_DISTANCE);
@@ -40,7 +40,7 @@ static void get_mv_projection(MV *output, MV ref, int num, int den) {
void av1_copy_frame_mvs(const AV1_COMMON *const cm,
const MB_MODE_INFO *const mi, int mi_row, int mi_col,
int x_mis, int y_mis) {
- const int frame_mvs_stride = ROUND_POWER_OF_TWO(cm->mi_cols, 1);
+ const int frame_mvs_stride = ROUND_POWER_OF_TWO(cm->mi_params.mi_cols, 1);
MV_REF *frame_mvs =
cm->cur_frame->mvs + (mi_row >> 1) * frame_mvs_stride + (mi_col >> 1);
x_mis = ROUND_POWER_OF_TWO(x_mis, 1);
@@ -71,34 +71,35 @@ void av1_copy_frame_mvs(const AV1_COMMON *const cm,
}
}
-static void add_ref_mv_candidate(
+static AOM_INLINE void add_ref_mv_candidate(
const MB_MODE_INFO *const candidate, const MV_REFERENCE_FRAME rf[2],
uint8_t *refmv_count, uint8_t *ref_match_count, uint8_t *newmv_count,
- CANDIDATE_MV *ref_mv_stack, int_mv *gm_mv_candidates,
- const WarpedMotionParams *gm_params, int col, int weight) {
- if (!is_inter_block(candidate)) return; // for intrabc
- int index = 0, ref;
+ CANDIDATE_MV *ref_mv_stack, uint16_t *ref_mv_weight,
+ int_mv *gm_mv_candidates, const WarpedMotionParams *gm_params,
+ uint16_t weight) {
+ if (!is_inter_block(candidate)) return;
assert(weight % 2 == 0);
+ int index, ref;
if (rf[1] == NONE_FRAME) {
// single reference frame
for (ref = 0; ref < 2; ++ref) {
if (candidate->ref_frame[ref] == rf[0]) {
- int_mv this_refmv;
- if (is_global_mv_block(candidate, gm_params[rf[0]].wmtype))
- this_refmv = gm_mv_candidates[0];
- else
- this_refmv = get_sub_block_mv(candidate, ref, col);
-
- for (index = 0; index < *refmv_count; ++index)
- if (ref_mv_stack[index].this_mv.as_int == this_refmv.as_int) break;
-
- if (index < *refmv_count) ref_mv_stack[index].weight += weight;
+ const int is_gm_block =
+ is_global_mv_block(candidate, gm_params[rf[0]].wmtype);
+ const int_mv this_refmv =
+ is_gm_block ? gm_mv_candidates[0] : get_block_mv(candidate, ref);
+ for (index = 0; index < *refmv_count; ++index) {
+ if (ref_mv_stack[index].this_mv.as_int == this_refmv.as_int) {
+ ref_mv_weight[index] += weight;
+ break;
+ }
+ }
// Add a new item to the list.
if (index == *refmv_count && *refmv_count < MAX_REF_MV_STACK_SIZE) {
ref_mv_stack[index].this_mv = this_refmv;
- ref_mv_stack[index].weight = weight;
+ ref_mv_weight[index] = weight;
++(*refmv_count);
}
if (have_newmv_in_inter_mode(candidate->mode)) ++*newmv_count;
@@ -114,21 +115,22 @@ static void add_ref_mv_candidate(
if (is_global_mv_block(candidate, gm_params[rf[ref]].wmtype))
this_refmv[ref] = gm_mv_candidates[ref];
else
- this_refmv[ref] = get_sub_block_mv(candidate, ref, col);
+ this_refmv[ref] = get_block_mv(candidate, ref);
}
- for (index = 0; index < *refmv_count; ++index)
+ for (index = 0; index < *refmv_count; ++index) {
if ((ref_mv_stack[index].this_mv.as_int == this_refmv[0].as_int) &&
- (ref_mv_stack[index].comp_mv.as_int == this_refmv[1].as_int))
+ (ref_mv_stack[index].comp_mv.as_int == this_refmv[1].as_int)) {
+ ref_mv_weight[index] += weight;
break;
-
- if (index < *refmv_count) ref_mv_stack[index].weight += weight;
+ }
+ }
// Add a new item to the list.
if (index == *refmv_count && *refmv_count < MAX_REF_MV_STACK_SIZE) {
ref_mv_stack[index].this_mv = this_refmv[0];
ref_mv_stack[index].comp_mv = this_refmv[1];
- ref_mv_stack[index].weight = weight;
+ ref_mv_weight[index] = weight;
++(*refmv_count);
}
if (have_newmv_in_inter_mode(candidate->mode)) ++*newmv_count;
@@ -137,42 +139,39 @@ static void add_ref_mv_candidate(
}
}
-static void scan_row_mbmi(const AV1_COMMON *cm, const MACROBLOCKD *xd,
- int mi_row, int mi_col,
- const MV_REFERENCE_FRAME rf[2], int row_offset,
- CANDIDATE_MV *ref_mv_stack, uint8_t *refmv_count,
- uint8_t *ref_match_count, uint8_t *newmv_count,
- int_mv *gm_mv_candidates, int max_row_offset,
- int *processed_rows) {
- int end_mi = AOMMIN(xd->n4_w, cm->mi_cols - mi_col);
+static AOM_INLINE void scan_row_mbmi(
+ const AV1_COMMON *cm, const MACROBLOCKD *xd, int mi_col,
+ const MV_REFERENCE_FRAME rf[2], int row_offset, CANDIDATE_MV *ref_mv_stack,
+ uint16_t *ref_mv_weight, uint8_t *refmv_count, uint8_t *ref_match_count,
+ uint8_t *newmv_count, int_mv *gm_mv_candidates, int max_row_offset,
+ int *processed_rows) {
+ int end_mi = AOMMIN(xd->width, cm->mi_params.mi_cols - mi_col);
end_mi = AOMMIN(end_mi, mi_size_wide[BLOCK_64X64]);
- const int n8_w_8 = mi_size_wide[BLOCK_8X8];
- const int n8_w_16 = mi_size_wide[BLOCK_16X16];
- int i;
+ const int width_8x8 = mi_size_wide[BLOCK_8X8];
+ const int width_16x16 = mi_size_wide[BLOCK_16X16];
int col_offset = 0;
// TODO(jingning): Revisit this part after cb4x4 is stable.
if (abs(row_offset) > 1) {
col_offset = 1;
- if ((mi_col & 0x01) && xd->n4_w < n8_w_8) --col_offset;
+ if ((mi_col & 0x01) && xd->width < width_8x8) --col_offset;
}
- const int use_step_16 = (xd->n4_w >= 16);
+ const int use_step_16 = (xd->width >= 16);
MB_MODE_INFO **const candidate_mi0 = xd->mi + row_offset * xd->mi_stride;
- (void)mi_row;
- for (i = 0; i < end_mi;) {
+ for (int i = 0; i < end_mi;) {
const MB_MODE_INFO *const candidate = candidate_mi0[col_offset + i];
const int candidate_bsize = candidate->sb_type;
const int n4_w = mi_size_wide[candidate_bsize];
- int len = AOMMIN(xd->n4_w, n4_w);
+ int len = AOMMIN(xd->width, n4_w);
if (use_step_16)
- len = AOMMAX(n8_w_16, len);
+ len = AOMMAX(width_16x16, len);
else if (abs(row_offset) > 1)
- len = AOMMAX(len, n8_w_8);
+ len = AOMMAX(len, width_8x8);
- int weight = 2;
- if (xd->n4_w >= n8_w_8 && xd->n4_w <= n4_w) {
- int inc = AOMMIN(-max_row_offset + row_offset + 1,
- mi_size_high[candidate_bsize]);
+ uint16_t weight = 2;
+ if (xd->width >= width_8x8 && xd->width <= n4_w) {
+ uint16_t inc = AOMMIN(-max_row_offset + row_offset + 1,
+ mi_size_high[candidate_bsize]);
// Obtain range used in weight calculation.
weight = AOMMAX(weight, inc);
// Update processed rows.
@@ -180,21 +179,20 @@ static void scan_row_mbmi(const AV1_COMMON *cm, const MACROBLOCKD *xd,
}
add_ref_mv_candidate(candidate, rf, refmv_count, ref_match_count,
- newmv_count, ref_mv_stack, gm_mv_candidates,
- cm->global_motion, col_offset + i, len * weight);
+ newmv_count, ref_mv_stack, ref_mv_weight,
+ gm_mv_candidates, cm->global_motion, len * weight);
i += len;
}
}
-static void scan_col_mbmi(const AV1_COMMON *cm, const MACROBLOCKD *xd,
- int mi_row, int mi_col,
- const MV_REFERENCE_FRAME rf[2], int col_offset,
- CANDIDATE_MV *ref_mv_stack, uint8_t *refmv_count,
- uint8_t *ref_match_count, uint8_t *newmv_count,
- int_mv *gm_mv_candidates, int max_col_offset,
- int *processed_cols) {
- int end_mi = AOMMIN(xd->n4_h, cm->mi_rows - mi_row);
+static AOM_INLINE void scan_col_mbmi(
+ const AV1_COMMON *cm, const MACROBLOCKD *xd, int mi_row,
+ const MV_REFERENCE_FRAME rf[2], int col_offset, CANDIDATE_MV *ref_mv_stack,
+ uint16_t *ref_mv_weight, uint8_t *refmv_count, uint8_t *ref_match_count,
+ uint8_t *newmv_count, int_mv *gm_mv_candidates, int max_col_offset,
+ int *processed_cols) {
+ int end_mi = AOMMIN(xd->height, cm->mi_params.mi_rows - mi_row);
end_mi = AOMMIN(end_mi, mi_size_high[BLOCK_64X64]);
const int n8_h_8 = mi_size_high[BLOCK_8X8];
const int n8_h_16 = mi_size_high[BLOCK_16X16];
@@ -202,24 +200,23 @@ static void scan_col_mbmi(const AV1_COMMON *cm, const MACROBLOCKD *xd,
int row_offset = 0;
if (abs(col_offset) > 1) {
row_offset = 1;
- if ((mi_row & 0x01) && xd->n4_h < n8_h_8) --row_offset;
+ if ((mi_row & 0x01) && xd->height < n8_h_8) --row_offset;
}
- const int use_step_16 = (xd->n4_h >= 16);
- (void)mi_col;
+ const int use_step_16 = (xd->height >= 16);
for (i = 0; i < end_mi;) {
const MB_MODE_INFO *const candidate =
xd->mi[(row_offset + i) * xd->mi_stride + col_offset];
const int candidate_bsize = candidate->sb_type;
const int n4_h = mi_size_high[candidate_bsize];
- int len = AOMMIN(xd->n4_h, n4_h);
+ int len = AOMMIN(xd->height, n4_h);
if (use_step_16)
len = AOMMAX(n8_h_16, len);
else if (abs(col_offset) > 1)
len = AOMMAX(len, n8_h_8);
int weight = 2;
- if (xd->n4_h >= n8_h_8 && xd->n4_h <= n4_h) {
+ if (xd->height >= n8_h_8 && xd->height <= n4_h) {
int inc = AOMMIN(-max_col_offset + col_offset + 1,
mi_size_wide[candidate_bsize]);
// Obtain range used in weight calculation.
@@ -229,20 +226,19 @@ static void scan_col_mbmi(const AV1_COMMON *cm, const MACROBLOCKD *xd,
}
add_ref_mv_candidate(candidate, rf, refmv_count, ref_match_count,
- newmv_count, ref_mv_stack, gm_mv_candidates,
- cm->global_motion, col_offset, len * weight);
+ newmv_count, ref_mv_stack, ref_mv_weight,
+ gm_mv_candidates, cm->global_motion, len * weight);
i += len;
}
}
-static void scan_blk_mbmi(const AV1_COMMON *cm, const MACROBLOCKD *xd,
- const int mi_row, const int mi_col,
- const MV_REFERENCE_FRAME rf[2], int row_offset,
- int col_offset, CANDIDATE_MV *ref_mv_stack,
- uint8_t *ref_match_count, uint8_t *newmv_count,
- int_mv *gm_mv_candidates,
- uint8_t refmv_count[MODE_CTX_REF_FRAMES]) {
+static AOM_INLINE void scan_blk_mbmi(
+ const AV1_COMMON *cm, const MACROBLOCKD *xd, const int mi_row,
+ const int mi_col, const MV_REFERENCE_FRAME rf[2], int row_offset,
+ int col_offset, CANDIDATE_MV *ref_mv_stack, uint16_t *ref_mv_weight,
+ uint8_t *ref_match_count, uint8_t *newmv_count, int_mv *gm_mv_candidates,
+ uint8_t *refmv_count) {
const TileInfo *const tile = &xd->tile;
POSITION mi_pos;
@@ -255,8 +251,8 @@ static void scan_blk_mbmi(const AV1_COMMON *cm, const MACROBLOCKD *xd,
const int len = mi_size_wide[BLOCK_8X8];
add_ref_mv_candidate(candidate, rf, refmv_count, ref_match_count,
- newmv_count, ref_mv_stack, gm_mv_candidates,
- cm->global_motion, mi_pos.col, 2 * len);
+ newmv_count, ref_mv_stack, ref_mv_weight,
+ gm_mv_candidates, cm->global_motion, 2 * len);
} // Analyze a single 8x8 block motion information.
}
@@ -291,19 +287,19 @@ static int has_top_right(const AV1_COMMON *cm, const MACROBLOCKD *xd,
// The left hand of two vertical rectangles always has a top right (as the
// block above will have been decoded)
- if (xd->n4_w < xd->n4_h)
+ if (xd->width < xd->height)
if (!xd->is_sec_rect) has_tr = 1;
// The bottom of two horizontal rectangles never has a top right (as the block
// to the right won't have been decoded)
- if (xd->n4_w > xd->n4_h)
+ if (xd->width > xd->height)
if (xd->is_sec_rect) has_tr = 0;
// The bottom left square of a Vertical A (in the old format) does
// not have a top right as it is decoded before the right hand
// rectangle of the partition
if (xd->mi[0]->partition == PARTITION_VERT_A) {
- if (xd->n4_w == xd->n4_h)
+ if (xd->width == xd->height)
if (mask_row & bs) has_tr = 0;
}
@@ -326,112 +322,98 @@ static int check_sb_border(const int mi_row, const int mi_col,
static int add_tpl_ref_mv(const AV1_COMMON *cm, const MACROBLOCKD *xd,
int mi_row, int mi_col, MV_REFERENCE_FRAME ref_frame,
int blk_row, int blk_col, int_mv *gm_mv_candidates,
- uint8_t refmv_count[MODE_CTX_REF_FRAMES],
- CANDIDATE_MV ref_mv_stacks[][MAX_REF_MV_STACK_SIZE],
+ uint8_t *const refmv_count,
+ CANDIDATE_MV ref_mv_stack[MAX_REF_MV_STACK_SIZE],
+ uint16_t ref_mv_weight[MAX_REF_MV_STACK_SIZE],
int16_t *mode_context) {
POSITION mi_pos;
- int idx;
- const int weight_unit = 1; // mi_size_wide[BLOCK_8X8];
-
mi_pos.row = (mi_row & 0x01) ? blk_row : blk_row + 1;
mi_pos.col = (mi_col & 0x01) ? blk_col : blk_col + 1;
if (!is_inside(&xd->tile, mi_col, mi_row, &mi_pos)) return 0;
const TPL_MV_REF *prev_frame_mvs =
- cm->tpl_mvs + ((mi_row + mi_pos.row) >> 1) * (cm->mi_stride >> 1) +
+ cm->tpl_mvs +
+ ((mi_row + mi_pos.row) >> 1) * (cm->mi_params.mi_stride >> 1) +
((mi_col + mi_pos.col) >> 1);
+ if (prev_frame_mvs->mfmv0.as_int == INVALID_MV) return 0;
MV_REFERENCE_FRAME rf[2];
av1_set_ref_frame(rf, ref_frame);
+ const uint16_t weight_unit = 1; // mi_size_wide[BLOCK_8X8];
+ const int cur_frame_index = cm->cur_frame->order_hint;
+ const RefCntBuffer *const buf_0 = get_ref_frame_buf(cm, rf[0]);
+ const int frame0_index = buf_0->order_hint;
+ const int cur_offset_0 = get_relative_dist(&cm->seq_params.order_hint_info,
+ cur_frame_index, frame0_index);
+ int idx;
+ const int allow_high_precision_mv = cm->features.allow_high_precision_mv;
+ const int force_integer_mv = cm->features.cur_frame_force_integer_mv;
+
+ int_mv this_refmv;
+ get_mv_projection(&this_refmv.as_mv, prev_frame_mvs->mfmv0.as_mv,
+ cur_offset_0, prev_frame_mvs->ref_frame_offset);
+ lower_mv_precision(&this_refmv.as_mv, allow_high_precision_mv,
+ force_integer_mv);
+
if (rf[1] == NONE_FRAME) {
- int cur_frame_index = cm->cur_frame->cur_frame_offset;
- int buf_idx_0 = cm->frame_refs[FWD_RF_OFFSET(rf[0])].idx;
- int frame0_index = cm->buffer_pool->frame_bufs[buf_idx_0].cur_frame_offset;
- int cur_offset_0 = get_relative_dist(cm, cur_frame_index, frame0_index);
- CANDIDATE_MV *ref_mv_stack = ref_mv_stacks[rf[0]];
-
- if (prev_frame_mvs->mfmv0.as_int != INVALID_MV) {
- int_mv this_refmv;
-
- get_mv_projection(&this_refmv.as_mv, prev_frame_mvs->mfmv0.as_mv,
- cur_offset_0, prev_frame_mvs->ref_frame_offset);
- lower_mv_precision(&this_refmv.as_mv, cm->allow_high_precision_mv,
- cm->cur_frame_force_integer_mv);
-
- if (blk_row == 0 && blk_col == 0)
- if (abs(this_refmv.as_mv.row - gm_mv_candidates[0].as_mv.row) >= 16 ||
- abs(this_refmv.as_mv.col - gm_mv_candidates[0].as_mv.col) >= 16)
- mode_context[ref_frame] |= (1 << GLOBALMV_OFFSET);
-
- for (idx = 0; idx < refmv_count[rf[0]]; ++idx)
- if (this_refmv.as_int == ref_mv_stack[idx].this_mv.as_int) break;
-
- if (idx < refmv_count[rf[0]]) ref_mv_stack[idx].weight += 2 * weight_unit;
-
- if (idx == refmv_count[rf[0]] &&
- refmv_count[rf[0]] < MAX_REF_MV_STACK_SIZE) {
- ref_mv_stack[idx].this_mv.as_int = this_refmv.as_int;
- ref_mv_stack[idx].weight = 2 * weight_unit;
- ++(refmv_count[rf[0]]);
- }
- return 1;
+ if (blk_row == 0 && blk_col == 0) {
+ if (abs(this_refmv.as_mv.row - gm_mv_candidates[0].as_mv.row) >= 16 ||
+ abs(this_refmv.as_mv.col - gm_mv_candidates[0].as_mv.col) >= 16)
+ mode_context[ref_frame] |= (1 << GLOBALMV_OFFSET);
+ }
+
+ for (idx = 0; idx < *refmv_count; ++idx)
+ if (this_refmv.as_int == ref_mv_stack[idx].this_mv.as_int) break;
+
+ if (idx < *refmv_count) ref_mv_weight[idx] += 2 * weight_unit;
+
+ if (idx == *refmv_count && *refmv_count < MAX_REF_MV_STACK_SIZE) {
+ ref_mv_stack[idx].this_mv.as_int = this_refmv.as_int;
+ ref_mv_weight[idx] = 2 * weight_unit;
+ ++(*refmv_count);
}
} else {
// Process compound inter mode
- int cur_frame_index = cm->cur_frame->cur_frame_offset;
- int buf_idx_0 = cm->frame_refs[FWD_RF_OFFSET(rf[0])].idx;
- int frame0_index = cm->buffer_pool->frame_bufs[buf_idx_0].cur_frame_offset;
-
- int cur_offset_0 = get_relative_dist(cm, cur_frame_index, frame0_index);
- int buf_idx_1 = cm->frame_refs[FWD_RF_OFFSET(rf[1])].idx;
- int frame1_index = cm->buffer_pool->frame_bufs[buf_idx_1].cur_frame_offset;
- int cur_offset_1 = get_relative_dist(cm, cur_frame_index, frame1_index);
- CANDIDATE_MV *ref_mv_stack = ref_mv_stacks[ref_frame];
-
- if (prev_frame_mvs->mfmv0.as_int != INVALID_MV) {
- int_mv this_refmv;
- int_mv comp_refmv;
- get_mv_projection(&this_refmv.as_mv, prev_frame_mvs->mfmv0.as_mv,
- cur_offset_0, prev_frame_mvs->ref_frame_offset);
- get_mv_projection(&comp_refmv.as_mv, prev_frame_mvs->mfmv0.as_mv,
- cur_offset_1, prev_frame_mvs->ref_frame_offset);
-
- lower_mv_precision(&this_refmv.as_mv, cm->allow_high_precision_mv,
- cm->cur_frame_force_integer_mv);
- lower_mv_precision(&comp_refmv.as_mv, cm->allow_high_precision_mv,
- cm->cur_frame_force_integer_mv);
-
- if (blk_row == 0 && blk_col == 0)
- if (abs(this_refmv.as_mv.row - gm_mv_candidates[0].as_mv.row) >= 16 ||
- abs(this_refmv.as_mv.col - gm_mv_candidates[0].as_mv.col) >= 16 ||
- abs(comp_refmv.as_mv.row - gm_mv_candidates[1].as_mv.row) >= 16 ||
- abs(comp_refmv.as_mv.col - gm_mv_candidates[1].as_mv.col) >= 16)
- mode_context[ref_frame] |= (1 << GLOBALMV_OFFSET);
-
- for (idx = 0; idx < refmv_count[ref_frame]; ++idx)
- if (this_refmv.as_int == ref_mv_stack[idx].this_mv.as_int &&
- comp_refmv.as_int == ref_mv_stack[idx].comp_mv.as_int)
- break;
+ const RefCntBuffer *const buf_1 = get_ref_frame_buf(cm, rf[1]);
+ const int frame1_index = buf_1->order_hint;
+ const int cur_offset_1 = get_relative_dist(&cm->seq_params.order_hint_info,
+ cur_frame_index, frame1_index);
+ int_mv comp_refmv;
+ get_mv_projection(&comp_refmv.as_mv, prev_frame_mvs->mfmv0.as_mv,
+ cur_offset_1, prev_frame_mvs->ref_frame_offset);
+ lower_mv_precision(&comp_refmv.as_mv, allow_high_precision_mv,
+ force_integer_mv);
+
+ if (blk_row == 0 && blk_col == 0) {
+ if (abs(this_refmv.as_mv.row - gm_mv_candidates[0].as_mv.row) >= 16 ||
+ abs(this_refmv.as_mv.col - gm_mv_candidates[0].as_mv.col) >= 16 ||
+ abs(comp_refmv.as_mv.row - gm_mv_candidates[1].as_mv.row) >= 16 ||
+ abs(comp_refmv.as_mv.col - gm_mv_candidates[1].as_mv.col) >= 16)
+ mode_context[ref_frame] |= (1 << GLOBALMV_OFFSET);
+ }
+
+ for (idx = 0; idx < *refmv_count; ++idx) {
+ if (this_refmv.as_int == ref_mv_stack[idx].this_mv.as_int &&
+ comp_refmv.as_int == ref_mv_stack[idx].comp_mv.as_int)
+ break;
+ }
- if (idx < refmv_count[ref_frame])
- ref_mv_stack[idx].weight += 2 * weight_unit;
+ if (idx < *refmv_count) ref_mv_weight[idx] += 2 * weight_unit;
- if (idx == refmv_count[ref_frame] &&
- refmv_count[ref_frame] < MAX_REF_MV_STACK_SIZE) {
- ref_mv_stack[idx].this_mv.as_int = this_refmv.as_int;
- ref_mv_stack[idx].comp_mv.as_int = comp_refmv.as_int;
- ref_mv_stack[idx].weight = 2 * weight_unit;
- ++(refmv_count[ref_frame]);
- }
- return 1;
+ if (idx == *refmv_count && *refmv_count < MAX_REF_MV_STACK_SIZE) {
+ ref_mv_stack[idx].this_mv.as_int = this_refmv.as_int;
+ ref_mv_stack[idx].comp_mv.as_int = comp_refmv.as_int;
+ ref_mv_weight[idx] = 2 * weight_unit;
+ ++(*refmv_count);
}
}
- return 0;
+
+ return 1;
}
-static void process_compound_ref_mv_candidate(
+static AOM_INLINE void process_compound_ref_mv_candidate(
const MB_MODE_INFO *const candidate, const AV1_COMMON *const cm,
const MV_REFERENCE_FRAME *const rf, int_mv ref_id[2][2],
int ref_id_count[2], int_mv ref_diff[2][2], int ref_diff_count[2]) {
@@ -456,10 +438,11 @@ static void process_compound_ref_mv_candidate(
}
}
-static void process_single_ref_mv_candidate(
+static AOM_INLINE void process_single_ref_mv_candidate(
const MB_MODE_INFO *const candidate, const AV1_COMMON *const cm,
- MV_REFERENCE_FRAME ref_frame, uint8_t refmv_count[MODE_CTX_REF_FRAMES],
- CANDIDATE_MV ref_mv_stack[][MAX_REF_MV_STACK_SIZE]) {
+ MV_REFERENCE_FRAME ref_frame, uint8_t *const refmv_count,
+ CANDIDATE_MV ref_mv_stack[MAX_REF_MV_STACK_SIZE],
+ uint16_t ref_mv_weight[MAX_REF_MV_STACK_SIZE]) {
for (int rf_idx = 0; rf_idx < 2; ++rf_idx) {
if (candidate->ref_frame[rf_idx] > INTRA_FRAME) {
int_mv this_mv = candidate->mv[rf_idx];
@@ -469,49 +452,50 @@ static void process_single_ref_mv_candidate(
this_mv.as_mv.col = -this_mv.as_mv.col;
}
int stack_idx;
- for (stack_idx = 0; stack_idx < refmv_count[ref_frame]; ++stack_idx) {
- const int_mv stack_mv = ref_mv_stack[ref_frame][stack_idx].this_mv;
+ for (stack_idx = 0; stack_idx < *refmv_count; ++stack_idx) {
+ const int_mv stack_mv = ref_mv_stack[stack_idx].this_mv;
if (this_mv.as_int == stack_mv.as_int) break;
}
- if (stack_idx == refmv_count[ref_frame]) {
- ref_mv_stack[ref_frame][stack_idx].this_mv = this_mv;
+ if (stack_idx == *refmv_count) {
+ ref_mv_stack[stack_idx].this_mv = this_mv;
// TODO(jingning): Set an arbitrary small number here. The weight
// doesn't matter as long as it is properly initialized.
- ref_mv_stack[ref_frame][stack_idx].weight = 2;
- ++refmv_count[ref_frame];
+ ref_mv_weight[stack_idx] = 2;
+ ++(*refmv_count);
}
}
}
}
-static void setup_ref_mv_list(
+static AOM_INLINE void setup_ref_mv_list(
const AV1_COMMON *cm, const MACROBLOCKD *xd, MV_REFERENCE_FRAME ref_frame,
- uint8_t refmv_count[MODE_CTX_REF_FRAMES],
- CANDIDATE_MV ref_mv_stack[][MAX_REF_MV_STACK_SIZE],
- int_mv mv_ref_list[][MAX_MV_REF_CANDIDATES], int_mv *gm_mv_candidates,
+ uint8_t *const refmv_count,
+ CANDIDATE_MV ref_mv_stack[MAX_REF_MV_STACK_SIZE],
+ uint16_t ref_mv_weight[MAX_REF_MV_STACK_SIZE],
+ int_mv mv_ref_list[MAX_MV_REF_CANDIDATES], int_mv *gm_mv_candidates,
int mi_row, int mi_col, int16_t *mode_context) {
- const int bs = AOMMAX(xd->n4_w, xd->n4_h);
+ const int bs = AOMMAX(xd->width, xd->height);
const int has_tr = has_top_right(cm, xd, mi_row, mi_col, bs);
MV_REFERENCE_FRAME rf[2];
const TileInfo *const tile = &xd->tile;
int max_row_offset = 0, max_col_offset = 0;
- const int row_adj = (xd->n4_h < mi_size_high[BLOCK_8X8]) && (mi_row & 0x01);
- const int col_adj = (xd->n4_w < mi_size_wide[BLOCK_8X8]) && (mi_col & 0x01);
+ const int row_adj = (xd->height < mi_size_high[BLOCK_8X8]) && (mi_row & 0x01);
+ const int col_adj = (xd->width < mi_size_wide[BLOCK_8X8]) && (mi_col & 0x01);
int processed_rows = 0;
int processed_cols = 0;
av1_set_ref_frame(rf, ref_frame);
mode_context[ref_frame] = 0;
- refmv_count[ref_frame] = 0;
+ *refmv_count = 0;
// Find valid maximum row/col offset.
if (xd->up_available) {
max_row_offset = -(MVREF_ROW_COLS << 1) + row_adj;
- if (xd->n4_h < mi_size_high[BLOCK_8X8])
+ if (xd->height < mi_size_high[BLOCK_8X8])
max_row_offset = -(2 << 1) + row_adj;
max_row_offset = find_valid_row_offset(tile, mi_row, max_row_offset);
@@ -520,7 +504,7 @@ static void setup_ref_mv_list(
if (xd->left_available) {
max_col_offset = -(MVREF_ROW_COLS << 1) + col_adj;
- if (xd->n4_w < mi_size_wide[BLOCK_8X8])
+ if (xd->width < mi_size_wide[BLOCK_8X8])
max_col_offset = -(2 << 1) + col_adj;
max_col_offset = find_valid_col_offset(tile, mi_col, max_col_offset);
@@ -532,48 +516,48 @@ static void setup_ref_mv_list(
// Scan the first above row mode info. row_offset = -1;
if (abs(max_row_offset) >= 1)
- scan_row_mbmi(cm, xd, mi_row, mi_col, rf, -1, ref_mv_stack[ref_frame],
- &refmv_count[ref_frame], &row_match_count, &newmv_count,
- gm_mv_candidates, max_row_offset, &processed_rows);
+ scan_row_mbmi(cm, xd, mi_col, rf, -1, ref_mv_stack, ref_mv_weight,
+ refmv_count, &row_match_count, &newmv_count, gm_mv_candidates,
+ max_row_offset, &processed_rows);
// Scan the first left column mode info. col_offset = -1;
if (abs(max_col_offset) >= 1)
- scan_col_mbmi(cm, xd, mi_row, mi_col, rf, -1, ref_mv_stack[ref_frame],
- &refmv_count[ref_frame], &col_match_count, &newmv_count,
- gm_mv_candidates, max_col_offset, &processed_cols);
+ scan_col_mbmi(cm, xd, mi_row, rf, -1, ref_mv_stack, ref_mv_weight,
+ refmv_count, &col_match_count, &newmv_count, gm_mv_candidates,
+ max_col_offset, &processed_cols);
// Check top-right boundary
if (has_tr)
- scan_blk_mbmi(cm, xd, mi_row, mi_col, rf, -1, xd->n4_w,
- ref_mv_stack[ref_frame], &row_match_count, &newmv_count,
- gm_mv_candidates, &refmv_count[ref_frame]);
+ scan_blk_mbmi(cm, xd, mi_row, mi_col, rf, -1, xd->width, ref_mv_stack,
+ ref_mv_weight, &row_match_count, &newmv_count,
+ gm_mv_candidates, refmv_count);
const uint8_t nearest_match = (row_match_count > 0) + (col_match_count > 0);
- const uint8_t nearest_refmv_count = refmv_count[ref_frame];
+ const uint8_t nearest_refmv_count = *refmv_count;
// TODO(yunqing): for comp_search, do it for all 3 cases.
for (int idx = 0; idx < nearest_refmv_count; ++idx)
- ref_mv_stack[ref_frame][idx].weight += REF_CAT_LEVEL;
+ ref_mv_weight[idx] += REF_CAT_LEVEL;
- if (cm->allow_ref_frame_mvs) {
+ if (cm->features.allow_ref_frame_mvs) {
int is_available = 0;
- const int voffset = AOMMAX(mi_size_high[BLOCK_8X8], xd->n4_h);
- const int hoffset = AOMMAX(mi_size_wide[BLOCK_8X8], xd->n4_w);
- const int blk_row_end = AOMMIN(xd->n4_h, mi_size_high[BLOCK_64X64]);
- const int blk_col_end = AOMMIN(xd->n4_w, mi_size_wide[BLOCK_64X64]);
+ const int voffset = AOMMAX(mi_size_high[BLOCK_8X8], xd->height);
+ const int hoffset = AOMMAX(mi_size_wide[BLOCK_8X8], xd->width);
+ const int blk_row_end = AOMMIN(xd->height, mi_size_high[BLOCK_64X64]);
+ const int blk_col_end = AOMMIN(xd->width, mi_size_wide[BLOCK_64X64]);
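// Temporal MV sample positions as (row, col) offsets in mi units: below-left,
// below-right, and to the right of the current block.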
const int tpl_sample_pos[3][2] = {
{ voffset, -2 },
{ voffset, hoffset },
{ voffset - 2, hoffset },
};
- const int allow_extension = (xd->n4_h >= mi_size_high[BLOCK_8X8]) &&
- (xd->n4_h < mi_size_high[BLOCK_64X64]) &&
- (xd->n4_w >= mi_size_wide[BLOCK_8X8]) &&
- (xd->n4_w < mi_size_wide[BLOCK_64X64]);
+ const int allow_extension = (xd->height >= mi_size_high[BLOCK_8X8]) &&
+ (xd->height < mi_size_high[BLOCK_64X64]) &&
+ (xd->width >= mi_size_wide[BLOCK_8X8]) &&
+ (xd->width < mi_size_wide[BLOCK_64X64]);
- const int step_h = (xd->n4_h >= mi_size_high[BLOCK_64X64])
+ const int step_h = (xd->height >= mi_size_high[BLOCK_64X64])
? mi_size_high[BLOCK_16X16]
: mi_size_high[BLOCK_8X8];
- const int step_w = (xd->n4_w >= mi_size_wide[BLOCK_64X64])
+ const int step_w = (xd->width >= mi_size_wide[BLOCK_64X64])
? mi_size_wide[BLOCK_16X16]
: mi_size_wide[BLOCK_8X8];
@@ -581,7 +565,7 @@ static void setup_ref_mv_list(
for (int blk_col = 0; blk_col < blk_col_end; blk_col += step_w) {
int ret = add_tpl_ref_mv(cm, xd, mi_row, mi_col, ref_frame, blk_row,
blk_col, gm_mv_candidates, refmv_count,
- ref_mv_stack, mode_context);
+ ref_mv_stack, ref_mv_weight, mode_context);
if (blk_row == 0 && blk_col == 0) is_available = ret;
}
}
@@ -594,16 +578,17 @@ static void setup_ref_mv_list(
if (!check_sb_border(mi_row, mi_col, blk_row, blk_col)) continue;
add_tpl_ref_mv(cm, xd, mi_row, mi_col, ref_frame, blk_row, blk_col,
- gm_mv_candidates, refmv_count, ref_mv_stack, mode_context);
+ gm_mv_candidates, refmv_count, ref_mv_stack, ref_mv_weight,
+ mode_context);
}
}
uint8_t dummy_newmv_count = 0;
// Scan the second outer area.
- scan_blk_mbmi(cm, xd, mi_row, mi_col, rf, -1, -1, ref_mv_stack[ref_frame],
+ scan_blk_mbmi(cm, xd, mi_row, mi_col, rf, -1, -1, ref_mv_stack, ref_mv_weight,
&row_match_count, &dummy_newmv_count, gm_mv_candidates,
- &refmv_count[ref_frame]);
+ refmv_count);
for (int idx = 2; idx <= MVREF_ROW_COLS; ++idx) {
const int row_offset = -(idx << 1) + 1 + row_adj;
@@ -611,24 +596,21 @@ static void setup_ref_mv_list(
if (abs(row_offset) <= abs(max_row_offset) &&
abs(row_offset) > processed_rows)
- scan_row_mbmi(cm, xd, mi_row, mi_col, rf, row_offset,
- ref_mv_stack[ref_frame], &refmv_count[ref_frame],
- &row_match_count, &dummy_newmv_count, gm_mv_candidates,
- max_row_offset, &processed_rows);
+ scan_row_mbmi(cm, xd, mi_col, rf, row_offset, ref_mv_stack, ref_mv_weight,
+ refmv_count, &row_match_count, &dummy_newmv_count,
+ gm_mv_candidates, max_row_offset, &processed_rows);
if (abs(col_offset) <= abs(max_col_offset) &&
abs(col_offset) > processed_cols)
- scan_col_mbmi(cm, xd, mi_row, mi_col, rf, col_offset,
- ref_mv_stack[ref_frame], &refmv_count[ref_frame],
- &col_match_count, &dummy_newmv_count, gm_mv_candidates,
- max_col_offset, &processed_cols);
+ scan_col_mbmi(cm, xd, mi_row, rf, col_offset, ref_mv_stack, ref_mv_weight,
+ refmv_count, &col_match_count, &dummy_newmv_count,
+ gm_mv_candidates, max_col_offset, &processed_cols);
}
const uint8_t ref_match_count = (row_match_count > 0) + (col_match_count > 0);
switch (nearest_match) {
case 0:
- mode_context[ref_frame] |= 0;
if (ref_match_count >= 1) mode_context[ref_frame] |= 1;
if (ref_match_count == 1)
mode_context[ref_frame] |= (1 << REFMV_OFFSET);
@@ -658,45 +640,48 @@ static void setup_ref_mv_list(
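// Bubble-sort the candidates by weight: first within the nearest (spatial)
// group, then among the candidates appended after it, so the nearest group
// always stays ahead of the rest of the stack.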
while (len > 0) {
int nr_len = 0;
for (int idx = 1; idx < len; ++idx) {
- if (ref_mv_stack[ref_frame][idx - 1].weight <
- ref_mv_stack[ref_frame][idx].weight) {
- CANDIDATE_MV tmp_mv = ref_mv_stack[ref_frame][idx - 1];
- ref_mv_stack[ref_frame][idx - 1] = ref_mv_stack[ref_frame][idx];
- ref_mv_stack[ref_frame][idx] = tmp_mv;
+ if (ref_mv_weight[idx - 1] < ref_mv_weight[idx]) {
+ const CANDIDATE_MV tmp_mv = ref_mv_stack[idx - 1];
+ const uint16_t tmp_ref_mv_weight = ref_mv_weight[idx - 1];
+ ref_mv_stack[idx - 1] = ref_mv_stack[idx];
+ ref_mv_stack[idx] = tmp_mv;
+ ref_mv_weight[idx - 1] = ref_mv_weight[idx];
+ ref_mv_weight[idx] = tmp_ref_mv_weight;
nr_len = idx;
}
}
len = nr_len;
}
- len = refmv_count[ref_frame];
+ len = *refmv_count;
while (len > nearest_refmv_count) {
int nr_len = nearest_refmv_count;
for (int idx = nearest_refmv_count + 1; idx < len; ++idx) {
- if (ref_mv_stack[ref_frame][idx - 1].weight <
- ref_mv_stack[ref_frame][idx].weight) {
- CANDIDATE_MV tmp_mv = ref_mv_stack[ref_frame][idx - 1];
- ref_mv_stack[ref_frame][idx - 1] = ref_mv_stack[ref_frame][idx];
- ref_mv_stack[ref_frame][idx] = tmp_mv;
+ if (ref_mv_weight[idx - 1] < ref_mv_weight[idx]) {
+ const CANDIDATE_MV tmp_mv = ref_mv_stack[idx - 1];
+ const uint16_t tmp_ref_mv_weight = ref_mv_weight[idx - 1];
+ ref_mv_stack[idx - 1] = ref_mv_stack[idx];
+ ref_mv_stack[idx] = tmp_mv;
+ ref_mv_weight[idx - 1] = ref_mv_weight[idx];
+ ref_mv_weight[idx] = tmp_ref_mv_weight;
nr_len = idx;
}
}
len = nr_len;
}
+ int mi_width = AOMMIN(mi_size_wide[BLOCK_64X64], xd->width);
+ mi_width = AOMMIN(mi_width, cm->mi_params.mi_cols - mi_col);
+ int mi_height = AOMMIN(mi_size_high[BLOCK_64X64], xd->height);
+ mi_height = AOMMIN(mi_height, cm->mi_params.mi_rows - mi_row);
+ const int mi_size = AOMMIN(mi_width, mi_height);
if (rf[1] > NONE_FRAME) {
// TODO(jingning, yunqing): Refactor and consolidate the compound and
// single reference frame modes. Reduce unnecessary redundancy.
- if (refmv_count[ref_frame] < MAX_MV_REF_CANDIDATES) {
+ if (*refmv_count < MAX_MV_REF_CANDIDATES) {
int_mv ref_id[2][2], ref_diff[2][2];
int ref_id_count[2] = { 0 }, ref_diff_count[2] = { 0 };
- int mi_width = AOMMIN(mi_size_wide[BLOCK_64X64], xd->n4_w);
- mi_width = AOMMIN(mi_width, cm->mi_cols - mi_col);
- int mi_height = AOMMIN(mi_size_high[BLOCK_64X64], xd->n4_h);
- mi_height = AOMMIN(mi_height, cm->mi_rows - mi_row);
- int mi_size = AOMMIN(mi_width, mi_height);
-
for (int idx = 0; abs(max_row_offset) >= 1 && idx < mi_size;) {
const MB_MODE_INFO *const candidate = xd->mi[-xd->mi_stride + idx];
process_compound_ref_mv_candidate(
@@ -712,95 +697,82 @@ static void setup_ref_mv_list(
}
// Build up the compound mv predictor
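// Fill order for each list: MVs that use the exact same reference first, then
// MVs taken from other references, and finally the global-motion MV as padding.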
- int_mv comp_list[3][2];
+ int_mv comp_list[MAX_MV_REF_CANDIDATES][2];
for (int idx = 0; idx < 2; ++idx) {
int comp_idx = 0;
- for (int list_idx = 0; list_idx < ref_id_count[idx] && comp_idx < 2;
+ for (int list_idx = 0;
+ list_idx < ref_id_count[idx] && comp_idx < MAX_MV_REF_CANDIDATES;
++list_idx, ++comp_idx)
comp_list[comp_idx][idx] = ref_id[idx][list_idx];
- for (int list_idx = 0; list_idx < ref_diff_count[idx] && comp_idx < 2;
+ for (int list_idx = 0;
+ list_idx < ref_diff_count[idx] && comp_idx < MAX_MV_REF_CANDIDATES;
++list_idx, ++comp_idx)
comp_list[comp_idx][idx] = ref_diff[idx][list_idx];
- for (; comp_idx < 3; ++comp_idx)
+ for (; comp_idx < MAX_MV_REF_CANDIDATES; ++comp_idx)
comp_list[comp_idx][idx] = gm_mv_candidates[idx];
}
- if (refmv_count[ref_frame]) {
- assert(refmv_count[ref_frame] == 1);
- if (comp_list[0][0].as_int ==
- ref_mv_stack[ref_frame][0].this_mv.as_int &&
- comp_list[0][1].as_int ==
- ref_mv_stack[ref_frame][0].comp_mv.as_int) {
- ref_mv_stack[ref_frame][refmv_count[ref_frame]].this_mv =
- comp_list[1][0];
- ref_mv_stack[ref_frame][refmv_count[ref_frame]].comp_mv =
- comp_list[1][1];
+ if (*refmv_count) {
+ assert(*refmv_count == 1);
+ if (comp_list[0][0].as_int == ref_mv_stack[0].this_mv.as_int &&
+ comp_list[0][1].as_int == ref_mv_stack[0].comp_mv.as_int) {
+ ref_mv_stack[*refmv_count].this_mv = comp_list[1][0];
+ ref_mv_stack[*refmv_count].comp_mv = comp_list[1][1];
} else {
- ref_mv_stack[ref_frame][refmv_count[ref_frame]].this_mv =
- comp_list[0][0];
- ref_mv_stack[ref_frame][refmv_count[ref_frame]].comp_mv =
- comp_list[0][1];
+ ref_mv_stack[*refmv_count].this_mv = comp_list[0][0];
+ ref_mv_stack[*refmv_count].comp_mv = comp_list[0][1];
}
- ref_mv_stack[ref_frame][refmv_count[ref_frame]].weight = 2;
- ++refmv_count[ref_frame];
+ ref_mv_weight[*refmv_count] = 2;
+ ++*refmv_count;
} else {
for (int idx = 0; idx < MAX_MV_REF_CANDIDATES; ++idx) {
- ref_mv_stack[ref_frame][refmv_count[ref_frame]].this_mv =
- comp_list[idx][0];
- ref_mv_stack[ref_frame][refmv_count[ref_frame]].comp_mv =
- comp_list[idx][1];
- ref_mv_stack[ref_frame][refmv_count[ref_frame]].weight = 2;
- ++refmv_count[ref_frame];
+ ref_mv_stack[*refmv_count].this_mv = comp_list[idx][0];
+ ref_mv_stack[*refmv_count].comp_mv = comp_list[idx][1];
+ ref_mv_weight[*refmv_count] = 2;
+ ++*refmv_count;
}
}
}
- assert(refmv_count[ref_frame] >= 2);
+ assert(*refmv_count >= 2);
- for (int idx = 0; idx < refmv_count[ref_frame]; ++idx) {
- clamp_mv_ref(&ref_mv_stack[ref_frame][idx].this_mv.as_mv,
- xd->n4_w << MI_SIZE_LOG2, xd->n4_h << MI_SIZE_LOG2, xd);
- clamp_mv_ref(&ref_mv_stack[ref_frame][idx].comp_mv.as_mv,
- xd->n4_w << MI_SIZE_LOG2, xd->n4_h << MI_SIZE_LOG2, xd);
+ for (int idx = 0; idx < *refmv_count; ++idx) {
+ clamp_mv_ref(&ref_mv_stack[idx].this_mv.as_mv, xd->width << MI_SIZE_LOG2,
+ xd->height << MI_SIZE_LOG2, xd);
+ clamp_mv_ref(&ref_mv_stack[idx].comp_mv.as_mv, xd->width << MI_SIZE_LOG2,
+ xd->height << MI_SIZE_LOG2, xd);
}
} else {
// Handle single reference frame extension
- int mi_width = AOMMIN(mi_size_wide[BLOCK_64X64], xd->n4_w);
- mi_width = AOMMIN(mi_width, cm->mi_cols - mi_col);
- int mi_height = AOMMIN(mi_size_high[BLOCK_64X64], xd->n4_h);
- mi_height = AOMMIN(mi_height, cm->mi_rows - mi_row);
- int mi_size = AOMMIN(mi_width, mi_height);
-
for (int idx = 0; abs(max_row_offset) >= 1 && idx < mi_size &&
- refmv_count[ref_frame] < MAX_MV_REF_CANDIDATES;) {
+ *refmv_count < MAX_MV_REF_CANDIDATES;) {
const MB_MODE_INFO *const candidate = xd->mi[-xd->mi_stride + idx];
process_single_ref_mv_candidate(candidate, cm, ref_frame, refmv_count,
- ref_mv_stack);
+ ref_mv_stack, ref_mv_weight);
idx += mi_size_wide[candidate->sb_type];
}
for (int idx = 0; abs(max_col_offset) >= 1 && idx < mi_size &&
- refmv_count[ref_frame] < MAX_MV_REF_CANDIDATES;) {
+ *refmv_count < MAX_MV_REF_CANDIDATES;) {
const MB_MODE_INFO *const candidate = xd->mi[idx * xd->mi_stride - 1];
process_single_ref_mv_candidate(candidate, cm, ref_frame, refmv_count,
- ref_mv_stack);
+ ref_mv_stack, ref_mv_weight);
idx += mi_size_high[candidate->sb_type];
}
- for (int idx = 0; idx < refmv_count[ref_frame]; ++idx) {
- clamp_mv_ref(&ref_mv_stack[ref_frame][idx].this_mv.as_mv,
- xd->n4_w << MI_SIZE_LOG2, xd->n4_h << MI_SIZE_LOG2, xd);
+ for (int idx = 0; idx < *refmv_count; ++idx) {
+ clamp_mv_ref(&ref_mv_stack[idx].this_mv.as_mv, xd->width << MI_SIZE_LOG2,
+ xd->height << MI_SIZE_LOG2, xd);
}
if (mv_ref_list != NULL) {
- for (int idx = refmv_count[ref_frame]; idx < MAX_MV_REF_CANDIDATES; ++idx)
- mv_ref_list[rf[0]][idx].as_int = gm_mv_candidates[0].as_int;
+ for (int idx = *refmv_count; idx < MAX_MV_REF_CANDIDATES; ++idx)
+ mv_ref_list[idx].as_int = gm_mv_candidates[0].as_int;
- for (int idx = 0;
- idx < AOMMIN(MAX_MV_REF_CANDIDATES, refmv_count[ref_frame]); ++idx) {
- mv_ref_list[rf[0]][idx].as_int =
- ref_mv_stack[ref_frame][idx].this_mv.as_int;
+ for (int idx = 0; idx < AOMMIN(MAX_MV_REF_CANDIDATES, *refmv_count);
+ ++idx) {
+ mv_ref_list[idx].as_int = ref_mv_stack[idx].this_mv.as_int;
}
}
}
@@ -810,43 +782,44 @@ void av1_find_mv_refs(const AV1_COMMON *cm, const MACROBLOCKD *xd,
MB_MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame,
uint8_t ref_mv_count[MODE_CTX_REF_FRAMES],
CANDIDATE_MV ref_mv_stack[][MAX_REF_MV_STACK_SIZE],
+ uint16_t ref_mv_weight[][MAX_REF_MV_STACK_SIZE],
int_mv mv_ref_list[][MAX_MV_REF_CANDIDATES],
- int_mv *global_mvs, int mi_row, int mi_col,
- int16_t *mode_context) {
- int_mv zeromv[2];
- BLOCK_SIZE bsize = mi->sb_type;
- MV_REFERENCE_FRAME rf[2];
- av1_set_ref_frame(rf, ref_frame);
-
- if (ref_frame < REF_FRAMES) {
- if (ref_frame != INTRA_FRAME) {
- global_mvs[ref_frame] = gm_get_motion_vector(
- &cm->global_motion[ref_frame], cm->allow_high_precision_mv, bsize,
- mi_col, mi_row, cm->cur_frame_force_integer_mv);
- } else {
+ int_mv *global_mvs, int16_t *mode_context) {
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
+ int_mv gm_mv[2];
+
+ if (ref_frame == INTRA_FRAME) {
+ gm_mv[0].as_int = gm_mv[1].as_int = 0;
+ if (global_mvs != NULL) {
global_mvs[ref_frame].as_int = INVALID_MV;
}
- }
-
- if (ref_frame != INTRA_FRAME) {
- zeromv[0].as_int =
- gm_get_motion_vector(&cm->global_motion[rf[0]],
- cm->allow_high_precision_mv, bsize, mi_col, mi_row,
- cm->cur_frame_force_integer_mv)
- .as_int;
- zeromv[1].as_int =
- (rf[1] != NONE_FRAME)
- ? gm_get_motion_vector(&cm->global_motion[rf[1]],
- cm->allow_high_precision_mv, bsize, mi_col,
- mi_row, cm->cur_frame_force_integer_mv)
- .as_int
- : 0;
} else {
- zeromv[0].as_int = zeromv[1].as_int = 0;
+ const BLOCK_SIZE bsize = mi->sb_type;
+ const int allow_high_precision_mv = cm->features.allow_high_precision_mv;
+ const int force_integer_mv = cm->features.cur_frame_force_integer_mv;
+ if (ref_frame < REF_FRAMES) {
+ gm_mv[0] = gm_get_motion_vector(&cm->global_motion[ref_frame],
+ allow_high_precision_mv, bsize, mi_col,
+ mi_row, force_integer_mv);
+ gm_mv[1].as_int = 0;
+ if (global_mvs != NULL) global_mvs[ref_frame] = gm_mv[0];
+ } else {
+ MV_REFERENCE_FRAME rf[2];
+ av1_set_ref_frame(rf, ref_frame);
+ gm_mv[0] = gm_get_motion_vector(&cm->global_motion[rf[0]],
+ allow_high_precision_mv, bsize, mi_col,
+ mi_row, force_integer_mv);
+ gm_mv[1] = gm_get_motion_vector(&cm->global_motion[rf[1]],
+ allow_high_precision_mv, bsize, mi_col,
+ mi_row, force_integer_mv);
+ }
}
- setup_ref_mv_list(cm, xd, ref_frame, ref_mv_count, ref_mv_stack, mv_ref_list,
- zeromv, mi_row, mi_col, mode_context);
+ setup_ref_mv_list(cm, xd, ref_frame, &ref_mv_count[ref_frame],
+ ref_mv_stack[ref_frame], ref_mv_weight[ref_frame],
+ mv_ref_list ? mv_ref_list[ref_frame] : NULL, gm_mv, mi_row,
+ mi_col, mode_context);
}
void av1_find_best_ref_mvs(int allow_hp, int_mv *mvlist, int_mv *nearest_mv,
@@ -861,26 +834,29 @@ void av1_find_best_ref_mvs(int allow_hp, int_mv *mvlist, int_mv *nearest_mv,
}
void av1_setup_frame_buf_refs(AV1_COMMON *cm) {
- cm->cur_frame->cur_frame_offset = cm->frame_offset;
+ cm->cur_frame->order_hint = cm->current_frame.order_hint;
+ cm->cur_frame->display_order_hint = cm->current_frame.display_order_hint;
MV_REFERENCE_FRAME ref_frame;
for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
- const int buf_idx = cm->frame_refs[ref_frame - LAST_FRAME].idx;
- if (buf_idx >= 0)
- cm->cur_frame->ref_frame_offset[ref_frame - LAST_FRAME] =
- cm->buffer_pool->frame_bufs[buf_idx].cur_frame_offset;
+ const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame);
+ if (buf != NULL) {
+ cm->cur_frame->ref_order_hints[ref_frame - LAST_FRAME] = buf->order_hint;
+ cm->cur_frame->ref_display_order_hint[ref_frame - LAST_FRAME] =
+ buf->display_order_hint;
+ }
}
}
void av1_setup_frame_sign_bias(AV1_COMMON *cm) {
MV_REFERENCE_FRAME ref_frame;
for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
- const int buf_idx = cm->frame_refs[ref_frame - LAST_FRAME].idx;
- if (cm->seq_params.enable_order_hint && buf_idx != INVALID_IDX) {
- const int ref_frame_offset =
- cm->buffer_pool->frame_bufs[buf_idx].cur_frame_offset;
+ const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame);
+ if (cm->seq_params.order_hint_info.enable_order_hint && buf != NULL) {
+ const int ref_order_hint = buf->order_hint;
cm->ref_frame_sign_bias[ref_frame] =
- (get_relative_dist(cm, ref_frame_offset, (int)cm->frame_offset) <= 0)
+ (get_relative_dist(&cm->seq_params.order_hint_info, ref_order_hint,
+ (int)cm->current_frame.order_hint) <= 0)
? 0
: 1;
} else {
@@ -908,8 +884,8 @@ static int get_block_position(AV1_COMMON *cm, int *mi_r, int *mi_c, int blk_row,
const int col =
(sign_bias == 1) ? blk_col - col_offset : blk_col + col_offset;
- if (row < 0 || row >= (cm->mi_rows >> 1) || col < 0 ||
- col >= (cm->mi_cols >> 1))
+ if (row < 0 || row >= (cm->mi_params.mi_rows >> 1) || col < 0 ||
+ col >= (cm->mi_params.mi_cols >> 1))
return 0;
if (row < base_blk_row - (MAX_OFFSET_HEIGHT >> 3) ||
@@ -935,35 +911,36 @@ static int motion_field_projection(AV1_COMMON *cm,
TPL_MV_REF *tpl_mvs_base = cm->tpl_mvs;
int ref_offset[REF_FRAMES] = { 0 };
- (void)dir;
-
- const int start_frame_idx = cm->frame_refs[FWD_RF_OFFSET(start_frame)].idx;
- if (start_frame_idx < 0) return 0;
+ const RefCntBuffer *const start_frame_buf =
+ get_ref_frame_buf(cm, start_frame);
+ if (start_frame_buf == NULL) return 0;
- if (cm->buffer_pool->frame_bufs[start_frame_idx].intra_only) return 0;
+ if (start_frame_buf->frame_type == KEY_FRAME ||
+ start_frame_buf->frame_type == INTRA_ONLY_FRAME)
+ return 0;
- if (cm->buffer_pool->frame_bufs[start_frame_idx].mi_rows != cm->mi_rows ||
- cm->buffer_pool->frame_bufs[start_frame_idx].mi_cols != cm->mi_cols)
+ if (start_frame_buf->mi_rows != cm->mi_params.mi_rows ||
+ start_frame_buf->mi_cols != cm->mi_params.mi_cols)
return 0;
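// The start frame's stored MVs are laid out on its own mi grid, so projection
// is skipped when its dimensions differ from the current frame's.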
- const int start_frame_offset =
- cm->buffer_pool->frame_bufs[start_frame_idx].cur_frame_offset;
- const unsigned int *const ref_frame_offsets =
- &cm->buffer_pool->frame_bufs[start_frame_idx].ref_frame_offset[0];
- const int cur_frame_offset = cm->cur_frame->cur_frame_offset;
- int start_to_current_frame_offset =
- get_relative_dist(cm, start_frame_offset, cur_frame_offset);
+ const int start_frame_order_hint = start_frame_buf->order_hint;
+ const unsigned int *const ref_order_hints =
+ &start_frame_buf->ref_order_hints[0];
+ const int cur_order_hint = cm->cur_frame->order_hint;
+ int start_to_current_frame_offset = get_relative_dist(
+ &cm->seq_params.order_hint_info, start_frame_order_hint, cur_order_hint);
for (MV_REFERENCE_FRAME rf = LAST_FRAME; rf <= INTER_REFS_PER_FRAME; ++rf) {
- ref_offset[rf] = get_relative_dist(cm, start_frame_offset,
- ref_frame_offsets[rf - LAST_FRAME]);
+ ref_offset[rf] = get_relative_dist(&cm->seq_params.order_hint_info,
+ start_frame_order_hint,
+ ref_order_hints[rf - LAST_FRAME]);
}
if (dir == 2) start_to_current_frame_offset = -start_to_current_frame_offset;
- MV_REF *mv_ref_base = cm->buffer_pool->frame_bufs[start_frame_idx].mvs;
- const int mvs_rows = (cm->mi_rows + 1) >> 1;
- const int mvs_cols = (cm->mi_cols + 1) >> 1;
+ MV_REF *mv_ref_base = start_frame_buf->mvs;
+ const int mvs_rows = (cm->mi_params.mi_rows + 1) >> 1;
+ const int mvs_cols = (cm->mi_params.mi_cols + 1) >> 1;
for (int blk_row = 0; blk_row < mvs_rows; ++blk_row) {
for (int blk_col = 0; blk_col < mvs_cols; ++blk_col) {
@@ -988,7 +965,7 @@ static int motion_field_projection(AV1_COMMON *cm,
}
if (pos_valid) {
- const int mi_offset = mi_r * (cm->mi_stride >> 1) + mi_c;
+ const int mi_offset = mi_r * (cm->mi_params.mi_stride >> 1) + mi_c;
tpl_mvs_base[mi_offset].mfmv0.as_mv.row = fwd_mv.row;
tpl_mvs_base[mi_offset].mfmv0.as_mv.col = fwd_mv.col;
@@ -1002,33 +979,35 @@ static int motion_field_projection(AV1_COMMON *cm,
}
void av1_setup_motion_field(AV1_COMMON *cm) {
+ const OrderHintInfo *const order_hint_info = &cm->seq_params.order_hint_info;
+
memset(cm->ref_frame_side, 0, sizeof(cm->ref_frame_side));
- if (!cm->seq_params.enable_order_hint) return;
+ if (!order_hint_info->enable_order_hint) return;
TPL_MV_REF *tpl_mvs_base = cm->tpl_mvs;
- int size = ((cm->mi_rows + MAX_MIB_SIZE) >> 1) * (cm->mi_stride >> 1);
+ int size = ((cm->mi_params.mi_rows + MAX_MIB_SIZE) >> 1) *
+ (cm->mi_params.mi_stride >> 1);
for (int idx = 0; idx < size; ++idx) {
tpl_mvs_base[idx].mfmv0.as_int = INVALID_MV;
tpl_mvs_base[idx].ref_frame_offset = 0;
}
- const int cur_order_hint = cm->cur_frame->cur_frame_offset;
- RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
+ const int cur_order_hint = cm->cur_frame->order_hint;
- int ref_buf_idx[INTER_REFS_PER_FRAME];
+ const RefCntBuffer *ref_buf[INTER_REFS_PER_FRAME];
int ref_order_hint[INTER_REFS_PER_FRAME];
for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
const int ref_idx = ref_frame - LAST_FRAME;
- const int buf_idx = cm->frame_refs[ref_idx].idx;
+ const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame);
int order_hint = 0;
- if (buf_idx >= 0) order_hint = frame_bufs[buf_idx].cur_frame_offset;
+ if (buf != NULL) order_hint = buf->order_hint;
- ref_buf_idx[ref_idx] = buf_idx;
+ ref_buf[ref_idx] = buf;
ref_order_hint[ref_idx] = order_hint;
- if (get_relative_dist(cm, order_hint, cur_order_hint) > 0)
+ if (get_relative_dist(order_hint_info, order_hint, cur_order_hint) > 0)
cm->ref_frame_side[ref_frame] = 1;
else if (order_hint == cur_order_hint)
cm->ref_frame_side[ref_frame] = -1;
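// ref_frame_side: 1 for references later than the current frame in display
// order, -1 for references sharing its order hint, and 0 (from the memset
// above) for past references.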
@@ -1036,10 +1015,10 @@ void av1_setup_motion_field(AV1_COMMON *cm) {
int ref_stamp = MFMV_STACK_SIZE - 1;
- if (ref_buf_idx[LAST_FRAME - LAST_FRAME] >= 0) {
+ if (ref_buf[LAST_FRAME - LAST_FRAME] != NULL) {
const int alt_of_lst_order_hint =
- frame_bufs[ref_buf_idx[LAST_FRAME - LAST_FRAME]]
- .ref_frame_offset[ALTREF_FRAME - LAST_FRAME];
+ ref_buf[LAST_FRAME - LAST_FRAME]
+ ->ref_order_hints[ALTREF_FRAME - LAST_FRAME];
const int is_lst_overlay =
(alt_of_lst_order_hint == ref_order_hint[GOLDEN_FRAME - LAST_FRAME]);
@@ -1047,47 +1026,50 @@ void av1_setup_motion_field(AV1_COMMON *cm) {
--ref_stamp;
}
- if (get_relative_dist(cm, ref_order_hint[BWDREF_FRAME - LAST_FRAME],
+ if (get_relative_dist(order_hint_info,
+ ref_order_hint[BWDREF_FRAME - LAST_FRAME],
cur_order_hint) > 0) {
if (motion_field_projection(cm, BWDREF_FRAME, 0)) --ref_stamp;
}
- if (get_relative_dist(cm, ref_order_hint[ALTREF2_FRAME - LAST_FRAME],
+ if (get_relative_dist(order_hint_info,
+ ref_order_hint[ALTREF2_FRAME - LAST_FRAME],
cur_order_hint) > 0) {
if (motion_field_projection(cm, ALTREF2_FRAME, 0)) --ref_stamp;
}
- if (get_relative_dist(cm, ref_order_hint[ALTREF_FRAME - LAST_FRAME],
+ if (get_relative_dist(order_hint_info,
+ ref_order_hint[ALTREF_FRAME - LAST_FRAME],
cur_order_hint) > 0 &&
ref_stamp >= 0)
if (motion_field_projection(cm, ALTREF_FRAME, 0)) --ref_stamp;
- if (ref_stamp >= 0 && ref_buf_idx[LAST2_FRAME - LAST_FRAME] >= 0)
- if (motion_field_projection(cm, LAST2_FRAME, 2)) --ref_stamp;
+ if (ref_stamp >= 0) motion_field_projection(cm, LAST2_FRAME, 2);
}
-static INLINE void record_samples(MB_MODE_INFO *mbmi, int *pts, int *pts_inref,
- int row_offset, int sign_r, int col_offset,
- int sign_c) {
+static INLINE void record_samples(const MB_MODE_INFO *mbmi, int *pts,
+ int *pts_inref, int row_offset, int sign_r,
+ int col_offset, int sign_c) {
int bw = block_size_wide[mbmi->sb_type];
int bh = block_size_high[mbmi->sb_type];
int x = col_offset * MI_SIZE + sign_c * AOMMAX(bw, MI_SIZE) / 2 - 1;
int y = row_offset * MI_SIZE + sign_r * AOMMAX(bh, MI_SIZE) / 2 - 1;
- pts[0] = (x * 8);
- pts[1] = (y * 8);
- pts_inref[0] = (x * 8) + mbmi->mv[0].as_mv.col;
- pts_inref[1] = (y * 8) + mbmi->mv[0].as_mv.row;
+ pts[0] = GET_MV_SUBPEL(x);
+ pts[1] = GET_MV_SUBPEL(y);
+ pts_inref[0] = GET_MV_SUBPEL(x) + mbmi->mv[0].as_mv.col;
+ pts_inref[1] = GET_MV_SUBPEL(y) + mbmi->mv[0].as_mv.row;
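+ // GET_MV_SUBPEL() converts the full-pel coordinates to the 1/8-pel units
+ // used for motion vectors (a multiplication by 8).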
}
// Select samples according to the motion vector difference.
-int selectSamples(MV *mv, int *pts, int *pts_inref, int len, BLOCK_SIZE bsize) {
+uint8_t av1_selectSamples(MV *mv, int *pts, int *pts_inref, int len,
+ BLOCK_SIZE bsize) {
const int bw = block_size_wide[bsize];
const int bh = block_size_high[bsize];
const int thresh = clamp(AOMMAX(bw, bh), 16, 112);
int pts_mvd[SAMPLES_ARRAY_SIZE] = { 0 };
int i, j, k, l = len;
- int ret = 0;
+ uint8_t ret = 0;
assert(len <= LEAST_SQUARES_SAMPLES_MAX);
// Obtain the motion vector difference.
@@ -1128,30 +1110,32 @@ int selectSamples(MV *mv, int *pts, int *pts_inref, int len, BLOCK_SIZE bsize) {
// Note: Samples returned are at 1/8-pel precision
// Samples are the neighbor block center point's coordinates relative to the
// top-left pixel of the current block.
-int findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col,
- int *pts, int *pts_inref) {
- MB_MODE_INFO *const mbmi0 = xd->mi[0];
- int ref_frame = mbmi0->ref_frame[0];
- int up_available = xd->up_available;
- int left_available = xd->left_available;
- int i, mi_step = 1, np = 0;
-
- const TileInfo *const tile = &xd->tile;
+uint8_t av1_findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int *pts,
+ int *pts_inref) {
+ const MB_MODE_INFO *const mbmi0 = xd->mi[0];
+ const int ref_frame = mbmi0->ref_frame[0];
+ const int up_available = xd->up_available;
+ const int left_available = xd->left_available;
+ int i, mi_step;
+ uint8_t np = 0;
int do_tl = 1;
int do_tr = 1;
+ const int mi_stride = xd->mi_stride;
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
// scan the nearest above rows
if (up_available) {
- int mi_row_offset = -1;
- MB_MODE_INFO *mbmi = xd->mi[mi_row_offset * xd->mi_stride];
- uint8_t n4_w = mi_size_wide[mbmi->sb_type];
+ const int mi_row_offset = -1;
+ const MB_MODE_INFO *mbmi = xd->mi[mi_row_offset * mi_stride];
+ uint8_t superblock_width = mi_size_wide[mbmi->sb_type];
- if (xd->n4_w <= n4_w) {
+ if (xd->width <= superblock_width) {
// Handle "current block width <= above block width" case.
- int col_offset = -mi_col % n4_w;
+ const int col_offset = -mi_col % superblock_width;
if (col_offset < 0) do_tl = 0;
- if (col_offset + n4_w > xd->n4_w) do_tr = 0;
+ if (col_offset + superblock_width > xd->width) do_tr = 0;
if (mbmi->ref_frame[0] == ref_frame && mbmi->ref_frame[1] == NONE_FRAME) {
record_samples(mbmi, pts, pts_inref, 0, -1, col_offset, 1);
@@ -1162,11 +1146,11 @@ int findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col,
}
} else {
// Handle "current block width > above block width" case.
- for (i = 0; i < AOMMIN(xd->n4_w, cm->mi_cols - mi_col); i += mi_step) {
- int mi_col_offset = i;
- mbmi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride];
- n4_w = mi_size_wide[mbmi->sb_type];
- mi_step = AOMMIN(xd->n4_w, n4_w);
+ for (i = 0; i < AOMMIN(xd->width, cm->mi_params.mi_cols - mi_col);
+ i += mi_step) {
+ mbmi = xd->mi[i + mi_row_offset * mi_stride];
+ superblock_width = mi_size_wide[mbmi->sb_type];
+ mi_step = AOMMIN(xd->width, superblock_width);
if (mbmi->ref_frame[0] == ref_frame &&
mbmi->ref_frame[1] == NONE_FRAME) {
@@ -1183,14 +1167,13 @@ int findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col,
// scan the nearest left columns
if (left_available) {
- int mi_col_offset = -1;
-
- MB_MODE_INFO *mbmi = xd->mi[mi_col_offset];
- uint8_t n4_h = mi_size_high[mbmi->sb_type];
+ const int mi_col_offset = -1;
+ const MB_MODE_INFO *mbmi = xd->mi[mi_col_offset];
+ uint8_t superblock_height = mi_size_high[mbmi->sb_type];
- if (xd->n4_h <= n4_h) {
+ if (xd->height <= superblock_height) {
// Handle "current block height <= above block height" case.
- int row_offset = -mi_row % n4_h;
+ const int row_offset = -mi_row % superblock_height;
if (row_offset < 0) do_tl = 0;
@@ -1203,11 +1186,11 @@ int findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col,
}
} else {
// Handle "current block height > above block height" case.
- for (i = 0; i < AOMMIN(xd->n4_h, cm->mi_rows - mi_row); i += mi_step) {
- int mi_row_offset = i;
- mbmi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride];
- n4_h = mi_size_high[mbmi->sb_type];
- mi_step = AOMMIN(xd->n4_h, n4_h);
+ for (i = 0; i < AOMMIN(xd->height, cm->mi_params.mi_rows - mi_row);
+ i += mi_step) {
+ mbmi = xd->mi[mi_col_offset + i * mi_stride];
+ superblock_height = mi_size_high[mbmi->sb_type];
+ mi_step = AOMMIN(xd->height, superblock_height);
if (mbmi->ref_frame[0] == ref_frame &&
mbmi->ref_frame[1] == NONE_FRAME) {
@@ -1224,10 +1207,9 @@ int findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col,
// Top-left block
if (do_tl && left_available && up_available) {
- int mi_row_offset = -1;
- int mi_col_offset = -1;
-
- MB_MODE_INFO *mbmi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride];
+ const int mi_row_offset = -1;
+ const int mi_col_offset = -1;
+ MB_MODE_INFO *mbmi = xd->mi[mi_col_offset + mi_row_offset * mi_stride];
if (mbmi->ref_frame[0] == ref_frame && mbmi->ref_frame[1] == NONE_FRAME) {
record_samples(mbmi, pts, pts_inref, 0, -1, 0, -1);
@@ -1241,18 +1223,17 @@ int findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col,
// Top-right block
if (do_tr &&
- has_top_right(cm, xd, mi_row, mi_col, AOMMAX(xd->n4_w, xd->n4_h))) {
- POSITION trb_pos = { -1, xd->n4_w };
-
+ has_top_right(cm, xd, mi_row, mi_col, AOMMAX(xd->width, xd->height))) {
+ const POSITION trb_pos = { -1, xd->width };
+ const TileInfo *const tile = &xd->tile;
if (is_inside(tile, mi_col, mi_row, &trb_pos)) {
- int mi_row_offset = -1;
- int mi_col_offset = xd->n4_w;
-
- MB_MODE_INFO *mbmi =
- xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride];
+ const int mi_row_offset = -1;
+ const int mi_col_offset = xd->width;
+ const MB_MODE_INFO *mbmi =
+ xd->mi[mi_col_offset + mi_row_offset * mi_stride];
if (mbmi->ref_frame[0] == ref_frame && mbmi->ref_frame[1] == NONE_FRAME) {
- record_samples(mbmi, pts, pts_inref, 0, -1, xd->n4_w, 1);
+ record_samples(mbmi, pts, pts_inref, 0, -1, xd->width, 1);
np++;
if (np >= LEAST_SQUARES_SAMPLES_MAX) return LEAST_SQUARES_SAMPLES_MAX;
}
@@ -1264,36 +1245,43 @@ int findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col,
}
void av1_setup_skip_mode_allowed(AV1_COMMON *cm) {
- cm->is_skip_mode_allowed = 0;
- cm->ref_frame_idx_0 = cm->ref_frame_idx_1 = INVALID_IDX;
+ const OrderHintInfo *const order_hint_info = &cm->seq_params.order_hint_info;
+ SkipModeInfo *const skip_mode_info = &cm->current_frame.skip_mode_info;
- if (!cm->seq_params.enable_order_hint || frame_is_intra_only(cm) ||
- cm->reference_mode == SINGLE_REFERENCE)
+ skip_mode_info->skip_mode_allowed = 0;
+ skip_mode_info->ref_frame_idx_0 = INVALID_IDX;
+ skip_mode_info->ref_frame_idx_1 = INVALID_IDX;
+
+ if (!order_hint_info->enable_order_hint || frame_is_intra_only(cm) ||
+ cm->current_frame.reference_mode == SINGLE_REFERENCE)
return;
- RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
- const int cur_frame_offset = cm->frame_offset;
- int ref_frame_offset[2] = { -1, INT_MAX };
+ const int cur_order_hint = cm->current_frame.order_hint;
+ int ref_order_hints[2] = { -1, INT_MAX };
int ref_idx[2] = { INVALID_IDX, INVALID_IDX };
// Identify the nearest forward and backward references.
for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
- const int buf_idx = cm->frame_refs[i].idx;
- if (buf_idx == INVALID_IDX) continue;
+ const RefCntBuffer *const buf = get_ref_frame_buf(cm, LAST_FRAME + i);
+ if (buf == NULL) continue;
- const int ref_offset = frame_bufs[buf_idx].cur_frame_offset;
- if (get_relative_dist(cm, ref_offset, cur_frame_offset) < 0) {
+ const int ref_order_hint = buf->order_hint;
+ if (get_relative_dist(order_hint_info, ref_order_hint, cur_order_hint) <
+ 0) {
// Forward reference
- if (ref_frame_offset[0] == -1 ||
- get_relative_dist(cm, ref_offset, ref_frame_offset[0]) > 0) {
- ref_frame_offset[0] = ref_offset;
+ if (ref_order_hints[0] == -1 ||
+ get_relative_dist(order_hint_info, ref_order_hint,
+ ref_order_hints[0]) > 0) {
+ ref_order_hints[0] = ref_order_hint;
ref_idx[0] = i;
}
- } else if (get_relative_dist(cm, ref_offset, cur_frame_offset) > 0) {
+ } else if (get_relative_dist(order_hint_info, ref_order_hint,
+ cur_order_hint) > 0) {
// Backward reference
- if (ref_frame_offset[1] == INT_MAX ||
- get_relative_dist(cm, ref_offset, ref_frame_offset[1]) < 0) {
- ref_frame_offset[1] = ref_offset;
+ if (ref_order_hints[1] == INT_MAX ||
+ get_relative_dist(order_hint_info, ref_order_hint,
+ ref_order_hints[1]) < 0) {
+ ref_order_hints[1] = ref_order_hint;
ref_idx[1] = i;
}
}
@@ -1301,75 +1289,71 @@ void av1_setup_skip_mode_allowed(AV1_COMMON *cm) {
if (ref_idx[0] != INVALID_IDX && ref_idx[1] != INVALID_IDX) {
// == Bi-directional prediction ==
- cm->is_skip_mode_allowed = 1;
- cm->ref_frame_idx_0 = AOMMIN(ref_idx[0], ref_idx[1]);
- cm->ref_frame_idx_1 = AOMMAX(ref_idx[0], ref_idx[1]);
+ skip_mode_info->skip_mode_allowed = 1;
+ skip_mode_info->ref_frame_idx_0 = AOMMIN(ref_idx[0], ref_idx[1]);
+ skip_mode_info->ref_frame_idx_1 = AOMMAX(ref_idx[0], ref_idx[1]);
} else if (ref_idx[0] != INVALID_IDX && ref_idx[1] == INVALID_IDX) {
// == Forward prediction only ==
// Identify the second nearest forward reference.
- ref_frame_offset[1] = -1;
+ ref_order_hints[1] = -1;
for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
- const int buf_idx = cm->frame_refs[i].idx;
- if (buf_idx == INVALID_IDX) continue;
-
- const int ref_offset = frame_bufs[buf_idx].cur_frame_offset;
- if ((ref_frame_offset[0] != -1 &&
- get_relative_dist(cm, ref_offset, ref_frame_offset[0]) < 0) &&
- (ref_frame_offset[1] == -1 ||
- get_relative_dist(cm, ref_offset, ref_frame_offset[1]) > 0)) {
+ const RefCntBuffer *const buf = get_ref_frame_buf(cm, LAST_FRAME + i);
+ if (buf == NULL) continue;
+
+ const int ref_order_hint = buf->order_hint;
+ if ((ref_order_hints[0] != -1 &&
+ get_relative_dist(order_hint_info, ref_order_hint,
+ ref_order_hints[0]) < 0) &&
+ (ref_order_hints[1] == -1 ||
+ get_relative_dist(order_hint_info, ref_order_hint,
+ ref_order_hints[1]) > 0)) {
// Second closest forward reference
- ref_frame_offset[1] = ref_offset;
+ ref_order_hints[1] = ref_order_hint;
ref_idx[1] = i;
}
}
- if (ref_frame_offset[1] != -1) {
- cm->is_skip_mode_allowed = 1;
- cm->ref_frame_idx_0 = AOMMIN(ref_idx[0], ref_idx[1]);
- cm->ref_frame_idx_1 = AOMMAX(ref_idx[0], ref_idx[1]);
+ if (ref_order_hints[1] != -1) {
+ skip_mode_info->skip_mode_allowed = 1;
+ skip_mode_info->ref_frame_idx_0 = AOMMIN(ref_idx[0], ref_idx[1]);
+ skip_mode_info->ref_frame_idx_1 = AOMMAX(ref_idx[0], ref_idx[1]);
}
}
}
typedef struct {
- int map_idx; // frame map index
- int buf_idx; // frame buffer index
- int sort_idx; // index based on the offset to be used for sorting
+ int map_idx; // frame map index
+ RefCntBuffer *buf; // frame buffer
+ int sort_idx; // index based on the offset to be used for sorting
} REF_FRAME_INFO;
+// Compares the sort_idx fields. If they are equal, then compares the map_idx
+// fields to break the tie. This ensures a stable sort.
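+// The subtraction-based comparisons are safe here: sort_idx and map_idx are
+// small values, so the differences cannot overflow.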
static int compare_ref_frame_info(const void *arg_a, const void *arg_b) {
const REF_FRAME_INFO *info_a = (REF_FRAME_INFO *)arg_a;
const REF_FRAME_INFO *info_b = (REF_FRAME_INFO *)arg_b;
- if (info_a->sort_idx < info_b->sort_idx) return -1;
- if (info_a->sort_idx > info_b->sort_idx) return 1;
- return (info_a->map_idx < info_b->map_idx)
- ? -1
- : ((info_a->map_idx > info_b->map_idx) ? 1 : 0);
+ const int sort_idx_diff = info_a->sort_idx - info_b->sort_idx;
+ if (sort_idx_diff != 0) return sort_idx_diff;
+ return info_a->map_idx - info_b->map_idx;
}
-static void set_ref_frame_info(AV1_COMMON *const cm, int frame_idx,
- REF_FRAME_INFO *ref_info) {
+static AOM_INLINE void set_ref_frame_info(int *remapped_ref_idx, int frame_idx,
+ REF_FRAME_INFO *ref_info) {
assert(frame_idx >= 0 && frame_idx < INTER_REFS_PER_FRAME);
- const int buf_idx = ref_info->buf_idx;
-
- cm->frame_refs[frame_idx].idx = buf_idx;
- cm->frame_refs[frame_idx].buf = &cm->buffer_pool->frame_bufs[buf_idx].buf;
- cm->frame_refs[frame_idx].map_idx = ref_info->map_idx;
+ remapped_ref_idx[frame_idx] = ref_info->map_idx;
}
-void av1_set_frame_refs(AV1_COMMON *const cm, int lst_map_idx,
- int gld_map_idx) {
- BufferPool *const pool = cm->buffer_pool;
- RefCntBuffer *const frame_bufs = pool->frame_bufs;
-
+void av1_set_frame_refs(AV1_COMMON *const cm, int *remapped_ref_idx,
+ int lst_map_idx, int gld_map_idx) {
int lst_frame_sort_idx = -1;
int gld_frame_sort_idx = -1;
- assert(cm->seq_params.enable_order_hint);
- assert(cm->seq_params.order_hint_bits_minus_1 >= 0);
- const int cur_frame_offset = (int)cm->frame_offset;
- const int cur_frame_sort_idx = 1 << cm->seq_params.order_hint_bits_minus_1;
+ assert(cm->seq_params.order_hint_info.enable_order_hint);
+ assert(cm->seq_params.order_hint_info.order_hint_bits_minus_1 >= 0);
+ const int cur_order_hint = (int)cm->current_frame.order_hint;
+ const int cur_frame_sort_idx =
+ 1 << cm->seq_params.order_hint_info.order_hint_bits_minus_1;
REF_FRAME_INFO ref_frame_info[REF_FRAMES];
int ref_flag_list[INTER_REFS_PER_FRAME] = { 0, 0, 0, 0, 0, 0, 0 };
@@ -1380,18 +1364,19 @@ void av1_set_frame_refs(AV1_COMMON *const cm, int lst_map_idx,
ref_frame_info[i].map_idx = map_idx;
ref_frame_info[i].sort_idx = -1;
- const int buf_idx = cm->ref_frame_map[map_idx];
- ref_frame_info[i].buf_idx = buf_idx;
+ RefCntBuffer *const buf = cm->ref_frame_map[map_idx];
+ ref_frame_info[i].buf = buf;
- if (buf_idx < 0 || buf_idx >= FRAME_BUFFERS) continue;
- // TODO(zoeliu@google.com): To verify the checking on ref_count.
- if (frame_bufs[buf_idx].ref_count <= 0) continue;
+ if (buf == NULL) continue;
+ // If this assertion fails, there is a reference leak.
+ assert(buf->ref_count > 0);
- const int offset = (int)frame_bufs[buf_idx].cur_frame_offset;
+ const int offset = (int)buf->order_hint;
ref_frame_info[i].sort_idx =
(offset == -1) ? -1
: cur_frame_sort_idx +
- get_relative_dist(cm, offset, cur_frame_offset);
+ get_relative_dist(&cm->seq_params.order_hint_info,
+ offset, cur_order_hint);
assert(ref_frame_info[i].sort_idx >= -1);
if (map_idx == lst_map_idx) lst_frame_sort_idx = ref_frame_info[i].sort_idx;
@@ -1414,8 +1399,8 @@ void av1_set_frame_refs(AV1_COMMON *const cm, int lst_map_idx,
compare_ref_frame_info);
// Identify forward and backward reference frames.
- // Forward reference: offset < cur_frame_offset
- // Backward reference: offset >= cur_frame_offset
+ // Forward reference: offset < order_hint
+ // Backward reference: offset >= order_hint
int fwd_start_idx = 0, fwd_end_idx = REF_FRAMES - 1;
for (int i = 0; i < REF_FRAMES; i++) {
@@ -1437,7 +1422,7 @@ void av1_set_frame_refs(AV1_COMMON *const cm, int lst_map_idx,
// == ALTREF_FRAME ==
if (bwd_start_idx <= bwd_end_idx) {
- set_ref_frame_info(cm, ALTREF_FRAME - LAST_FRAME,
+ set_ref_frame_info(remapped_ref_idx, ALTREF_FRAME - LAST_FRAME,
&ref_frame_info[bwd_end_idx]);
ref_flag_list[ALTREF_FRAME - LAST_FRAME] = 1;
bwd_end_idx--;
@@ -1445,7 +1430,7 @@ void av1_set_frame_refs(AV1_COMMON *const cm, int lst_map_idx,
// == BWDREF_FRAME ==
if (bwd_start_idx <= bwd_end_idx) {
- set_ref_frame_info(cm, BWDREF_FRAME - LAST_FRAME,
+ set_ref_frame_info(remapped_ref_idx, BWDREF_FRAME - LAST_FRAME,
&ref_frame_info[bwd_start_idx]);
ref_flag_list[BWDREF_FRAME - LAST_FRAME] = 1;
bwd_start_idx++;
@@ -1453,7 +1438,7 @@ void av1_set_frame_refs(AV1_COMMON *const cm, int lst_map_idx,
// == ALTREF2_FRAME ==
if (bwd_start_idx <= bwd_end_idx) {
- set_ref_frame_info(cm, ALTREF2_FRAME - LAST_FRAME,
+ set_ref_frame_info(remapped_ref_idx, ALTREF2_FRAME - LAST_FRAME,
&ref_frame_info[bwd_start_idx]);
ref_flag_list[ALTREF2_FRAME - LAST_FRAME] = 1;
}
@@ -1463,13 +1448,15 @@ void av1_set_frame_refs(AV1_COMMON *const cm, int lst_map_idx,
for (int i = fwd_start_idx; i <= fwd_end_idx; ++i) {
// == LAST_FRAME ==
if (ref_frame_info[i].map_idx == lst_map_idx) {
- set_ref_frame_info(cm, LAST_FRAME - LAST_FRAME, &ref_frame_info[i]);
+ set_ref_frame_info(remapped_ref_idx, LAST_FRAME - LAST_FRAME,
+ &ref_frame_info[i]);
ref_flag_list[LAST_FRAME - LAST_FRAME] = 1;
}
// == GOLDEN_FRAME ==
if (ref_frame_info[i].map_idx == gld_map_idx) {
- set_ref_frame_info(cm, GOLDEN_FRAME - LAST_FRAME, &ref_frame_info[i]);
+ set_ref_frame_info(remapped_ref_idx, GOLDEN_FRAME - LAST_FRAME,
+ &ref_frame_info[i]);
ref_flag_list[GOLDEN_FRAME - LAST_FRAME] = 1;
}
}
@@ -1501,18 +1488,19 @@ void av1_set_frame_refs(AV1_COMMON *const cm, int lst_map_idx,
}
if (fwd_start_idx > fwd_end_idx) break;
- set_ref_frame_info(cm, ref_frame - LAST_FRAME,
+ set_ref_frame_info(remapped_ref_idx, ref_frame - LAST_FRAME,
&ref_frame_info[fwd_end_idx]);
ref_flag_list[ref_frame - LAST_FRAME] = 1;
fwd_end_idx--;
}
- // Assign all the remaining frame(s), if any, to the earliest reference frame.
+ // Assign all the remaining frame(s), if any, to the earliest reference
+ // frame.
for (; ref_idx < (INTER_REFS_PER_FRAME - 2); ref_idx++) {
const MV_REFERENCE_FRAME ref_frame = ref_frame_list[ref_idx];
if (ref_flag_list[ref_frame - LAST_FRAME] == 1) continue;
- set_ref_frame_info(cm, ref_frame - LAST_FRAME,
+ set_ref_frame_info(remapped_ref_idx, ref_frame - LAST_FRAME,
&ref_frame_info[fwd_start_idx]);
ref_flag_list[ref_frame - LAST_FRAME] = 1;
}
diff --git a/media/libaom/src/av1/common/mvref_common.h b/media/libaom/src/av1/common/mvref_common.h
index 83f7a1ac0..05a0dbc04 100644
--- a/media/libaom/src/av1/common/mvref_common.h
+++ b/media/libaom/src/av1/common/mvref_common.h
@@ -11,7 +11,7 @@
#ifndef AOM_AV1_COMMON_MVREF_COMMON_H_
#define AOM_AV1_COMMON_MVREF_COMMON_H_
-#include "av1/common/onyxc_int.h"
+#include "av1/common/av1_common_int.h"
#include "av1/common/blockd.h"
#ifdef __cplusplus
@@ -34,10 +34,10 @@ typedef struct position {
// clamp_mv_ref
#define MV_BORDER (16 << 3) // Allow 16 pels in 1/8th pel units
-static INLINE int get_relative_dist(const AV1_COMMON *cm, int a, int b) {
- if (!cm->seq_params.enable_order_hint) return 0;
+static INLINE int get_relative_dist(const OrderHintInfo *oh, int a, int b) {
+ if (!oh->enable_order_hint) return 0;
- const int bits = cm->seq_params.order_hint_bits_minus_1 + 1;
+ const int bits = oh->order_hint_bits_minus_1 + 1;
assert(bits >= 1);
assert(a >= 0 && a < (1 << bits));
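// The hints wrap modulo (1 << bits); the result is the signed distance a - b
// reduced to the range [-(1 << (bits - 1)), (1 << (bits - 1)) - 1]. For
// example, with 7-bit hints, a = 2 and b = 126 gives +4: hint 2 is four
// frames after hint 126 once the counter wraps at 128.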
@@ -50,38 +50,19 @@ static INLINE int get_relative_dist(const AV1_COMMON *cm, int a, int b) {
}
static INLINE void clamp_mv_ref(MV *mv, int bw, int bh, const MACROBLOCKD *xd) {
- clamp_mv(mv, xd->mb_to_left_edge - bw * 8 - MV_BORDER,
- xd->mb_to_right_edge + bw * 8 + MV_BORDER,
- xd->mb_to_top_edge - bh * 8 - MV_BORDER,
- xd->mb_to_bottom_edge + bh * 8 + MV_BORDER);
+ const SubpelMvLimits mv_limits = {
+ xd->mb_to_left_edge - GET_MV_SUBPEL(bw) - MV_BORDER,
+ xd->mb_to_right_edge + GET_MV_SUBPEL(bw) + MV_BORDER,
+ xd->mb_to_top_edge - GET_MV_SUBPEL(bh) - MV_BORDER,
+ xd->mb_to_bottom_edge + GET_MV_SUBPEL(bh) + MV_BORDER
+ };
+ clamp_mv(mv, &mv_limits);
}
-// This function returns either the appropriate sub block or block's mv
-// on whether the block_size < 8x8 and we have check_sub_blocks set.
-static INLINE int_mv get_sub_block_mv(const MB_MODE_INFO *candidate,
- int which_mv, int search_col) {
- (void)search_col;
+static INLINE int_mv get_block_mv(const MB_MODE_INFO *candidate, int which_mv) {
return candidate->mv[which_mv];
}
-static INLINE int_mv get_sub_block_pred_mv(const MB_MODE_INFO *candidate,
- int which_mv, int search_col) {
- (void)search_col;
- return candidate->mv[which_mv];
-}
-
-// Performs mv sign inversion if indicated by the reference frame combination.
-static INLINE int_mv scale_mv(const MB_MODE_INFO *mbmi, int ref,
- const MV_REFERENCE_FRAME this_ref_frame,
- const int *ref_sign_bias) {
- int_mv mv = mbmi->mv[ref];
- if (ref_sign_bias[mbmi->ref_frame[ref]] != ref_sign_bias[this_ref_frame]) {
- mv.as_mv.row *= -1;
- mv.as_mv.col *= -1;
- }
- return mv;
-}
-
// Checks that the given mi_row, mi_col and search point
// are inside the borders of the tile.
static INLINE int is_inside(const TileInfo *const tile, int mi_col, int mi_row,
@@ -169,14 +150,14 @@ static MV_REFERENCE_FRAME ref_frame_map[TOTAL_COMP_REFS][2] = {
// clang-format on
static INLINE void av1_set_ref_frame(MV_REFERENCE_FRAME *rf,
- int8_t ref_frame_type) {
+ MV_REFERENCE_FRAME ref_frame_type) {
if (ref_frame_type >= REF_FRAMES) {
rf[0] = ref_frame_map[ref_frame_type - REF_FRAMES][0];
rf[1] = ref_frame_map[ref_frame_type - REF_FRAMES][1];
} else {
+ assert(ref_frame_type > NONE_FRAME);
rf[0] = ref_frame_type;
rf[1] = NONE_FRAME;
- assert(ref_frame_type > NONE_FRAME);
}
}
@@ -201,18 +182,17 @@ static INLINE int16_t av1_mode_context_analyzer(
return comp_ctx;
}
-static INLINE uint8_t av1_drl_ctx(const CANDIDATE_MV *ref_mv_stack,
- int ref_idx) {
- if (ref_mv_stack[ref_idx].weight >= REF_CAT_LEVEL &&
- ref_mv_stack[ref_idx + 1].weight >= REF_CAT_LEVEL)
+static INLINE uint8_t av1_drl_ctx(const uint16_t *ref_mv_weight, int ref_idx) {
+ if (ref_mv_weight[ref_idx] >= REF_CAT_LEVEL &&
+ ref_mv_weight[ref_idx + 1] >= REF_CAT_LEVEL)
return 0;
- if (ref_mv_stack[ref_idx].weight >= REF_CAT_LEVEL &&
- ref_mv_stack[ref_idx + 1].weight < REF_CAT_LEVEL)
+ if (ref_mv_weight[ref_idx] >= REF_CAT_LEVEL &&
+ ref_mv_weight[ref_idx + 1] < REF_CAT_LEVEL)
return 1;
- if (ref_mv_stack[ref_idx].weight < REF_CAT_LEVEL &&
- ref_mv_stack[ref_idx + 1].weight < REF_CAT_LEVEL)
+ if (ref_mv_weight[ref_idx] < REF_CAT_LEVEL &&
+ ref_mv_weight[ref_idx + 1] < REF_CAT_LEVEL)
return 2;
return 0;
@@ -222,7 +202,8 @@ void av1_setup_frame_buf_refs(AV1_COMMON *cm);
void av1_setup_frame_sign_bias(AV1_COMMON *cm);
void av1_setup_skip_mode_allowed(AV1_COMMON *cm);
void av1_setup_motion_field(AV1_COMMON *cm);
-void av1_set_frame_refs(AV1_COMMON *const cm, int lst_map_idx, int gld_map_idx);
+void av1_set_frame_refs(AV1_COMMON *const cm, int *remapped_ref_idx,
+ int lst_map_idx, int gld_map_idx);
static INLINE void av1_collect_neighbors_ref_counts(MACROBLOCKD *const xd) {
av1_zero(xd->neighbors_ref_counts);
@@ -255,13 +236,16 @@ void av1_copy_frame_mvs(const AV1_COMMON *const cm,
const MB_MODE_INFO *const mi, int mi_row, int mi_col,
int x_mis, int y_mis);
+// The global_mvs output parameter points to an array of REF_FRAMES elements.
+// The caller may pass a null global_mvs if it does not need the global_mvs
+// output.
void av1_find_mv_refs(const AV1_COMMON *cm, const MACROBLOCKD *xd,
MB_MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame,
uint8_t ref_mv_count[MODE_CTX_REF_FRAMES],
CANDIDATE_MV ref_mv_stack[][MAX_REF_MV_STACK_SIZE],
+ uint16_t ref_mv_weight[][MAX_REF_MV_STACK_SIZE],
int_mv mv_ref_list[][MAX_MV_REF_CANDIDATES],
- int_mv *global_mvs, int mi_row, int mi_col,
- int16_t *mode_context);
+ int_mv *global_mvs, int16_t *mode_context);
// Check a list of motion vectors by sad score using a number of rows of pixels
// above and a number of cols of pixels to the left to select the one with the
// best score to use as the reference motion vector.
@@ -269,25 +253,24 @@ void av1_find_mv_refs(const AV1_COMMON *cm, const MACROBLOCKD *xd,
void av1_find_best_ref_mvs(int allow_hp, int_mv *mvlist, int_mv *nearest_mv,
int_mv *near_mv, int is_integer);
-int selectSamples(MV *mv, int *pts, int *pts_inref, int len, BLOCK_SIZE bsize);
-int findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col,
- int *pts, int *pts_inref);
+uint8_t av1_selectSamples(MV *mv, int *pts, int *pts_inref, int len,
+ BLOCK_SIZE bsize);
+uint8_t av1_findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int *pts,
+ int *pts_inref);
#define INTRABC_DELAY_PIXELS 256 // Delay of 256 pixels
#define INTRABC_DELAY_SB64 (INTRABC_DELAY_PIXELS / 64)
static INLINE void av1_find_ref_dv(int_mv *ref_dv, const TileInfo *const tile,
- int mib_size, int mi_row, int mi_col) {
- (void)mi_col;
+ int mib_size, int mi_row) {
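// Default to a DV pointing one superblock up; if that superblock lies outside
// the tile, point INTRABC_DELAY_PIXELS (plus one superblock) to the left
// instead.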
if (mi_row - mib_size < tile->mi_row_start) {
- ref_dv->as_mv.row = 0;
- ref_dv->as_mv.col = -MI_SIZE * mib_size - INTRABC_DELAY_PIXELS;
+ ref_dv->as_fullmv.row = 0;
+ ref_dv->as_fullmv.col = -MI_SIZE * mib_size - INTRABC_DELAY_PIXELS;
} else {
- ref_dv->as_mv.row = -MI_SIZE * mib_size;
- ref_dv->as_mv.col = 0;
+ ref_dv->as_fullmv.row = -MI_SIZE * mib_size;
+ ref_dv->as_fullmv.col = 0;
}
- ref_dv->as_mv.row *= 8;
- ref_dv->as_mv.col *= 8;
+ convert_fullmv_to_mv(ref_dv);
}
static INLINE int av1_is_dv_valid(const MV dv, const AV1_COMMON *cm,
@@ -319,15 +302,12 @@ static INLINE int av1_is_dv_valid(const MV dv, const AV1_COMMON *cm,
// Special case for sub 8x8 chroma cases, to prevent referring to chroma
// pixels outside current tile.
- for (int plane = 1; plane < av1_num_planes(cm); ++plane) {
- const struct macroblockd_plane *const pd = &xd->plane[plane];
- if (is_chroma_reference(mi_row, mi_col, bsize, pd->subsampling_x,
- pd->subsampling_y)) {
- if (bw < 8 && pd->subsampling_x)
- if (src_left_edge < tile_left_edge + 4 * SCALE_PX_TO_MV) return 0;
- if (bh < 8 && pd->subsampling_y)
- if (src_top_edge < tile_top_edge + 4 * SCALE_PX_TO_MV) return 0;
- }
+ if (xd->is_chroma_ref && av1_num_planes(cm) > 1) {
+ const struct macroblockd_plane *const pd = &xd->plane[1];
+ if (bw < 8 && pd->subsampling_x)
+ if (src_left_edge < tile_left_edge + 4 * SCALE_PX_TO_MV) return 0;
+ if (bh < 8 && pd->subsampling_y)
+ if (src_top_edge < tile_top_edge + 4 * SCALE_PX_TO_MV) return 0;
}
// Is the bottom right within an already coded SB? Also consider additional
diff --git a/media/libaom/src/av1/common/obmc.h b/media/libaom/src/av1/common/obmc.h
index 1c90cd93f..cc97b6bb1 100644
--- a/media/libaom/src/av1/common/obmc.h
+++ b/media/libaom/src/av1/common/obmc.h
@@ -12,25 +12,24 @@
#ifndef AOM_AV1_COMMON_OBMC_H_
#define AOM_AV1_COMMON_OBMC_H_
-typedef void (*overlappable_nb_visitor_t)(MACROBLOCKD *xd, int rel_mi_pos,
- uint8_t nb_mi_size,
- MB_MODE_INFO *nb_mi, void *fun_ctxt,
- const int num_planes);
+typedef void (*overlappable_nb_visitor_t)(MACROBLOCKD *xd, int rel_mi_row,
+ int rel_mi_col, uint8_t op_mi_size,
+ int dir, MB_MODE_INFO *nb_mi,
+ void *fun_ctxt, const int num_planes);
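+// rel_mi_row/rel_mi_col give the neighbor's offset from the current block in
+// mi units; dir is 0 when visiting the row above and 1 when visiting the
+// column to the left.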
static INLINE void foreach_overlappable_nb_above(const AV1_COMMON *cm,
- MACROBLOCKD *xd, int mi_col,
- int nb_max,
+ MACROBLOCKD *xd, int nb_max,
overlappable_nb_visitor_t fun,
void *fun_ctxt) {
- const int num_planes = av1_num_planes(cm);
if (!xd->up_available) return;
+ const int num_planes = av1_num_planes(cm);
int nb_count = 0;
-
+ const int mi_col = xd->mi_col;
// prev_row_mi points into the mi array, starting at the beginning of the
// previous row.
MB_MODE_INFO **prev_row_mi = xd->mi - mi_col - 1 * xd->mi_stride;
- const int end_col = AOMMIN(mi_col + xd->n4_w, cm->mi_cols);
+ const int end_col = AOMMIN(mi_col + xd->width, cm->mi_params.mi_cols);
uint8_t mi_step;
for (int above_mi_col = mi_col; above_mi_col < end_col && nb_count < nb_max;
above_mi_col += mi_step) {
@@ -49,26 +48,25 @@ static INLINE void foreach_overlappable_nb_above(const AV1_COMMON *cm,
}
if (is_neighbor_overlappable(*above_mi)) {
++nb_count;
- fun(xd, above_mi_col - mi_col, AOMMIN(xd->n4_w, mi_step), *above_mi,
- fun_ctxt, num_planes);
+ fun(xd, 0, above_mi_col - mi_col, AOMMIN(xd->width, mi_step), 0,
+ *above_mi, fun_ctxt, num_planes);
}
}
}
static INLINE void foreach_overlappable_nb_left(const AV1_COMMON *cm,
- MACROBLOCKD *xd, int mi_row,
- int nb_max,
+ MACROBLOCKD *xd, int nb_max,
overlappable_nb_visitor_t fun,
void *fun_ctxt) {
- const int num_planes = av1_num_planes(cm);
if (!xd->left_available) return;
+ const int num_planes = av1_num_planes(cm);
int nb_count = 0;
-
// prev_col_mi points into the mi array, starting at the top of the
// previous column
+ const int mi_row = xd->mi_row;
MB_MODE_INFO **prev_col_mi = xd->mi - 1 - mi_row * xd->mi_stride;
- const int end_row = AOMMIN(mi_row + xd->n4_h, cm->mi_rows);
+ const int end_row = AOMMIN(mi_row + xd->height, cm->mi_params.mi_rows);
uint8_t mi_step;
for (int left_mi_row = mi_row; left_mi_row < end_row && nb_count < nb_max;
left_mi_row += mi_step) {
@@ -82,7 +80,7 @@ static INLINE void foreach_overlappable_nb_left(const AV1_COMMON *cm,
}
if (is_neighbor_overlappable(*left_mi)) {
++nb_count;
- fun(xd, left_mi_row - mi_row, AOMMIN(xd->n4_h, mi_step), *left_mi,
+ fun(xd, left_mi_row - mi_row, 0, AOMMIN(xd->height, mi_step), 1, *left_mi,
fun_ctxt, num_planes);
}
}
diff --git a/media/libaom/src/av1/common/obu_util.c b/media/libaom/src/av1/common/obu_util.c
index 823b700b1..7d2694b89 100644
--- a/media/libaom/src/av1/common/obu_util.c
+++ b/media/libaom/src/av1/common/obu_util.c
@@ -8,6 +8,8 @@
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
+#include <assert.h>
+
#include "av1/common/obu_util.h"
#include "aom_dsp/bitreader_buffer.h"
@@ -112,36 +114,41 @@ aom_codec_err_t aom_read_obu_header_and_size(const uint8_t *data,
ObuHeader *obu_header,
size_t *const payload_size,
size_t *const bytes_read) {
- size_t length_field_size = 0, obu_size = 0;
+ size_t length_field_size_obu = 0;
+ size_t length_field_size_payload = 0;
+ size_t obu_size = 0;
aom_codec_err_t status;
if (is_annexb) {
// Size field comes before the OBU header, and includes the OBU header
status =
- read_obu_size(data, bytes_available, &obu_size, &length_field_size);
+ read_obu_size(data, bytes_available, &obu_size, &length_field_size_obu);
if (status != AOM_CODEC_OK) return status;
}
- struct aom_read_bit_buffer rb = { data + length_field_size,
+ struct aom_read_bit_buffer rb = { data + length_field_size_obu,
data + bytes_available, 0, NULL, NULL };
status = read_obu_header(&rb, is_annexb, obu_header);
if (status != AOM_CODEC_OK) return status;
- if (is_annexb) {
+ if (!obu_header->has_size_field) {
+ assert(is_annexb);
// Derive the payload size from the data we've already read
if (obu_size < obu_header->size) return AOM_CODEC_CORRUPT_FRAME;
*payload_size = obu_size - obu_header->size;
} else {
// Size field comes after the OBU header, and is just the payload size
- status = read_obu_size(data + obu_header->size,
- bytes_available - obu_header->size, payload_size,
- &length_field_size);
+ status = read_obu_size(
+ data + length_field_size_obu + obu_header->size,
+ bytes_available - length_field_size_obu - obu_header->size,
+ payload_size, &length_field_size_payload);
if (status != AOM_CODEC_OK) return status;
}
- *bytes_read = length_field_size + obu_header->size;
+ *bytes_read =
+ length_field_size_obu + obu_header->size + length_field_size_payload;
return AOM_CODEC_OK;
}
diff --git a/media/libaom/src/av1/common/onyxc_int.h b/media/libaom/src/av1/common/onyxc_int.h
deleted file mode 100644
index ff011c89e..000000000
--- a/media/libaom/src/av1/common/onyxc_int.h
+++ /dev/null
@@ -1,1342 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AV1_COMMON_ONYXC_INT_H_
-#define AOM_AV1_COMMON_ONYXC_INT_H_
-
-#include "config/aom_config.h"
-#include "config/av1_rtcd.h"
-
-#include "aom/internal/aom_codec_internal.h"
-#include "aom_util/aom_thread.h"
-#include "av1/common/alloccommon.h"
-#include "av1/common/av1_loopfilter.h"
-#include "av1/common/entropy.h"
-#include "av1/common/entropymode.h"
-#include "av1/common/entropymv.h"
-#include "av1/common/enums.h"
-#include "av1/common/frame_buffers.h"
-#include "av1/common/mv.h"
-#include "av1/common/quant_common.h"
-#include "av1/common/restoration.h"
-#include "av1/common/tile_common.h"
-#include "av1/common/timing.h"
-#include "av1/common/odintrin.h"
-#include "av1/encoder/hash_motion.h"
-#include "aom_dsp/grain_synthesis.h"
-#include "aom_dsp/grain_table.h"
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#if defined(__clang__) && defined(__has_warning)
-#if __has_feature(cxx_attributes) && __has_warning("-Wimplicit-fallthrough")
-#define AOM_FALLTHROUGH_INTENDED [[clang::fallthrough]] // NOLINT
-#endif
-#elif defined(__GNUC__) && __GNUC__ >= 7
-#define AOM_FALLTHROUGH_INTENDED __attribute__((fallthrough)) // NOLINT
-#endif
-
-#ifndef AOM_FALLTHROUGH_INTENDED
-#define AOM_FALLTHROUGH_INTENDED \
- do { \
- } while (0)
-#endif
-
-#define CDEF_MAX_STRENGTHS 16
-
-/* Constant values while waiting for the sequence header */
-#define FRAME_ID_LENGTH 15
-#define DELTA_FRAME_ID_LENGTH 14
-
-#define FRAME_CONTEXTS (FRAME_BUFFERS + 1)
-// Extra frame context which is always kept at default values
-#define FRAME_CONTEXT_DEFAULTS (FRAME_CONTEXTS - 1)
-#define PRIMARY_REF_BITS 3
-#define PRIMARY_REF_NONE 7
-
-#define NUM_PING_PONG_BUFFERS 2
-
-#define MAX_NUM_TEMPORAL_LAYERS 8
-#define MAX_NUM_SPATIAL_LAYERS 4
-/* clang-format off */
-// clang-format seems to think this is a pointer dereference and not a
-// multiplication.
-#define MAX_NUM_OPERATING_POINTS \
- MAX_NUM_TEMPORAL_LAYERS * MAX_NUM_SPATIAL_LAYERS
-/* clang-format on */
-
-// TODO(jingning): Turn this on to set up the transform coefficient
-// processing timer.
-#define TXCOEFF_TIMER 0
-#define TXCOEFF_COST_TIMER 0
-
-typedef enum {
- SINGLE_REFERENCE = 0,
- COMPOUND_REFERENCE = 1,
- REFERENCE_MODE_SELECT = 2,
- REFERENCE_MODES = 3,
-} REFERENCE_MODE;
-
-typedef enum {
- /**
- * Frame context updates are disabled
- */
- REFRESH_FRAME_CONTEXT_DISABLED,
- /**
- * Update frame context to values resulting from backward probability
- * updates based on entropy/counts in the decoded frame
- */
- REFRESH_FRAME_CONTEXT_BACKWARD,
-} REFRESH_FRAME_CONTEXT_MODE;
-
-#define MFMV_STACK_SIZE 3
-typedef struct {
- int_mv mfmv0;
- uint8_t ref_frame_offset;
-} TPL_MV_REF;
-
-typedef struct {
- int_mv mv;
- MV_REFERENCE_FRAME ref_frame;
-} MV_REF;
-
-typedef struct {
- int ref_count;
-
- unsigned int cur_frame_offset;
- unsigned int ref_frame_offset[INTER_REFS_PER_FRAME];
-
- MV_REF *mvs;
- uint8_t *seg_map;
- struct segmentation seg;
- int mi_rows;
- int mi_cols;
- // Width and height give the size of the buffer (before any upscaling, unlike
- // the sizes that can be derived from the buf structure)
- int width;
- int height;
- WarpedMotionParams global_motion[REF_FRAMES];
- int showable_frame; // frame can be used as show existing frame in future
- int film_grain_params_present;
- aom_film_grain_t film_grain_params;
- aom_codec_frame_buffer_t raw_frame_buffer;
- YV12_BUFFER_CONFIG buf;
- hash_table hash_table;
- uint8_t intra_only;
- FRAME_TYPE frame_type;
- // The following variables will only be used in frame parallel decode.
-
- // frame_worker_owner indicates which FrameWorker owns this buffer. NULL means
- // that no FrameWorker owns, or is decoding, this buffer.
- AVxWorker *frame_worker_owner;
-
- // row and col indicate which position frame has been decoded to in real
- // pixel unit. They are reset to -1 when decoding begins and set to INT_MAX
- // when the frame is fully decoded.
- int row;
- int col;
-
- // Inter frame reference frame delta for loop filter
- int8_t ref_deltas[REF_FRAMES];
-
- // 0 = ZERO_MV, MV
- int8_t mode_deltas[MAX_MODE_LF_DELTAS];
-} RefCntBuffer;
-
-typedef struct BufferPool {
-// Protect BufferPool from being accessed by several FrameWorkers at
-// the same time during frame parallel decode.
-// TODO(hkuang): Try to use atomic variable instead of locking the whole pool.
-#if CONFIG_MULTITHREAD
- pthread_mutex_t pool_mutex;
-#endif
-
- // Private data associated with the frame buffer callbacks.
- void *cb_priv;
-
- aom_get_frame_buffer_cb_fn_t get_fb_cb;
- aom_release_frame_buffer_cb_fn_t release_fb_cb;
-
- RefCntBuffer frame_bufs[FRAME_BUFFERS];
-
- // Frame buffers allocated internally by the codec.
- InternalFrameBufferList int_frame_buffers;
-} BufferPool;
-
-typedef struct {
- int base_ctx_table[2 /*row*/][2 /*col*/][3 /*sig_map*/]
- [BASE_CONTEXT_POSITION_NUM + 1];
-} LV_MAP_CTX_TABLE;
-typedef int BASE_CTX_TABLE[2 /*col*/][3 /*sig_map*/]
- [BASE_CONTEXT_POSITION_NUM + 1];
-
-typedef struct BitstreamLevel {
- uint8_t major;
- uint8_t minor;
-} BitstreamLevel;
-
-// Sequence header structure.
-// Note: All syntax elements of sequence_header_obu that need to be
-// bit-identical across multiple sequence headers must be part of this struct,
-// so that consistency is checked by are_seq_headers_consistent() function.
-typedef struct SequenceHeader {
- int num_bits_width;
- int num_bits_height;
- int max_frame_width;
- int max_frame_height;
- int frame_id_numbers_present_flag;
- int frame_id_length;
- int delta_frame_id_length;
- BLOCK_SIZE sb_size; // Size of the superblock used for this frame
- int mib_size; // Size of the superblock in units of MI blocks
- int mib_size_log2; // Log 2 of above.
- int order_hint_bits_minus_1;
- int force_screen_content_tools; // 0 - force off
- // 1 - force on
- // 2 - adaptive
- int force_integer_mv; // 0 - Not to force. MV can be in 1/4 or 1/8
- // 1 - force to integer
- // 2 - adaptive
- int still_picture; // Video is a single frame still picture
- int reduced_still_picture_hdr; // Use reduced header for still picture
- int enable_filter_intra; // enables/disables filterintra
- int enable_intra_edge_filter; // enables/disables corner/edge/upsampling
- int enable_interintra_compound; // enables/disables interintra_compound
- int enable_masked_compound; // enables/disables masked compound
- int enable_dual_filter; // 0 - disable dual interpolation filter
- // 1 - enable vert/horiz filter selection
- int enable_order_hint; // 0 - disable order hint, and related tools
- // jnt_comp, ref_frame_mvs, frame_sign_bias
- // if 0, enable_jnt_comp and
- // enable_ref_frame_mvs must be set to 0.
- int enable_jnt_comp; // 0 - disable joint compound modes
- // 1 - enable it
- int enable_ref_frame_mvs; // 0 - disable ref frame mvs
- // 1 - enable it
- int enable_warped_motion; // 0 - disable warped motion for sequence
- // 1 - enable it for the sequence
- int enable_superres; // 0 - Disable superres for the sequence, and disable
- // transmitting per-frame superres enabled flag.
- // 1 - Enable superres for the sequence, and also
- // enable per-frame flag to denote if superres is
- // enabled for that frame.
- int enable_cdef; // To turn on/off CDEF
- int enable_restoration; // To turn on/off loop restoration
- BITSTREAM_PROFILE profile;
-
- // Operating point info.
- int operating_points_cnt_minus_1;
- int operating_point_idc[MAX_NUM_OPERATING_POINTS];
- int display_model_info_present_flag;
- int decoder_model_info_present_flag;
- BitstreamLevel level[MAX_NUM_OPERATING_POINTS];
- uint8_t tier[MAX_NUM_OPERATING_POINTS]; // seq_tier in the spec. One bit: 0
- // or 1.
-
- // Color config.
- aom_bit_depth_t bit_depth; // AOM_BITS_8 in profile 0 or 1,
- // AOM_BITS_10 or AOM_BITS_12 in profile 2 or 3.
- int use_highbitdepth; // If true, we need to use 16bit frame buffers.
- int monochrome; // Monochrome video
- aom_color_primaries_t color_primaries;
- aom_transfer_characteristics_t transfer_characteristics;
- aom_matrix_coefficients_t matrix_coefficients;
- int color_range;
- int subsampling_x; // Chroma subsampling for x
- int subsampling_y; // Chroma subsampling for y
- aom_chroma_sample_position_t chroma_sample_position;
- int separate_uv_delta_q;
-
- int film_grain_params_present;
-} SequenceHeader;
-
-typedef struct AV1Common {
- struct aom_internal_error_info error;
- int width;
- int height;
- int render_width;
- int render_height;
- int last_width;
- int last_height;
- int timing_info_present;
- aom_timing_info_t timing_info;
- int buffer_removal_time_present;
- aom_dec_model_info_t buffer_model;
- aom_dec_model_op_parameters_t op_params[MAX_NUM_OPERATING_POINTS + 1];
- aom_op_timing_info_t op_frame_timing[MAX_NUM_OPERATING_POINTS + 1];
- uint32_t frame_presentation_time;
-
- int largest_tile_id;
- size_t largest_tile_size;
- int context_update_tile_id;
-
- // Scale of the current frame with respect to itself.
- struct scale_factors sf_identity;
-
- YV12_BUFFER_CONFIG *frame_to_show;
- RefCntBuffer *prev_frame;
-
- // TODO(hkuang): Combine this with cur_buf in macroblockd.
- RefCntBuffer *cur_frame;
-
- int ref_frame_map[REF_FRAMES]; /* maps fb_idx to reference slot */
-
- // Prepare ref_frame_map for the next frame.
- // Only used in frame parallel decode.
- int next_ref_frame_map[REF_FRAMES];
-
- // TODO(jkoleszar): could expand active_ref_idx to 4, with 0 as intra, and
- // roll new_fb_idx into it.
-
- // Each Inter frame can reference INTER_REFS_PER_FRAME buffers
- RefBuffer frame_refs[INTER_REFS_PER_FRAME];
- int is_skip_mode_allowed;
- int skip_mode_flag;
- int ref_frame_idx_0;
- int ref_frame_idx_1;
-
- int new_fb_idx;
-
- FRAME_TYPE last_frame_type; /* last frame's frame type for motion search.*/
- FRAME_TYPE frame_type;
-
- int show_frame;
- int showable_frame; // frame can be used as show existing frame in future
- int last_show_frame;
- int show_existing_frame;
- // Flag for a frame used as a reference - not written to the bitstream
- int is_reference_frame;
- int reset_decoder_state;
-
- // Flag signaling that the frame is encoded using only INTRA modes.
- uint8_t intra_only;
- uint8_t last_intra_only;
- uint8_t disable_cdf_update;
- int allow_high_precision_mv;
- int cur_frame_force_integer_mv; // 0 the default in AOM, 1 only integer
-
- int allow_screen_content_tools;
- int allow_intrabc;
- int allow_warped_motion;
-
- // MBs, mb_rows/cols is in 16-pixel units; mi_rows/cols is in
- // MB_MODE_INFO (8-pixel) units.
- int MBs;
- int mb_rows, mi_rows;
- int mb_cols, mi_cols;
- int mi_stride;
-
- /* profile settings */
- TX_MODE tx_mode;
-
-#if CONFIG_ENTROPY_STATS
- int coef_cdf_category;
-#endif
-
- int base_qindex;
- int y_dc_delta_q;
- int u_dc_delta_q;
- int v_dc_delta_q;
- int u_ac_delta_q;
- int v_ac_delta_q;
-
- // The dequantizers below are true dequantizers used only in the
- // dequantization process. They have the same coefficient
- // shift/scale as TX.
- int16_t y_dequant_QTX[MAX_SEGMENTS][2];
- int16_t u_dequant_QTX[MAX_SEGMENTS][2];
- int16_t v_dequant_QTX[MAX_SEGMENTS][2];
-
- // Global quant matrix tables
- const qm_val_t *giqmatrix[NUM_QM_LEVELS][3][TX_SIZES_ALL];
- const qm_val_t *gqmatrix[NUM_QM_LEVELS][3][TX_SIZES_ALL];
-
- // Local quant matrix tables for each frame
- const qm_val_t *y_iqmatrix[MAX_SEGMENTS][TX_SIZES_ALL];
- const qm_val_t *u_iqmatrix[MAX_SEGMENTS][TX_SIZES_ALL];
- const qm_val_t *v_iqmatrix[MAX_SEGMENTS][TX_SIZES_ALL];
-
- // Encoder
- int using_qmatrix;
- int qm_y;
- int qm_u;
- int qm_v;
- int min_qmlevel;
- int max_qmlevel;
-
- /* We allocate a MB_MODE_INFO struct for each macroblock, together with
- an extra row on top and column on the left to simplify prediction. */
- int mi_alloc_size;
- MB_MODE_INFO *mip; /* Base of allocated array */
- MB_MODE_INFO *mi; /* Corresponds to upper left visible macroblock */
-
- // TODO(agrange): Move prev_mi into encoder structure.
- // prev_mip and prev_mi will only be allocated in encoder.
- MB_MODE_INFO *prev_mip; /* MB_MODE_INFO array 'mip' from last decoded frame */
- MB_MODE_INFO *prev_mi; /* 'mi' from last frame (points into prev_mip) */
-
- // Separate mi functions between encoder and decoder.
- int (*alloc_mi)(struct AV1Common *cm, int mi_size);
- void (*free_mi)(struct AV1Common *cm);
- void (*setup_mi)(struct AV1Common *cm);
-
- // Grid of pointers to 8x8 MB_MODE_INFO structs. Any 8x8 not in the visible
- // area will be NULL.
- MB_MODE_INFO **mi_grid_base;
- MB_MODE_INFO **mi_grid_visible;
- MB_MODE_INFO **prev_mi_grid_base;
- MB_MODE_INFO **prev_mi_grid_visible;
-
- // Whether to use previous frames' motion vectors for prediction.
- int allow_ref_frame_mvs;
-
- uint8_t *last_frame_seg_map;
- uint8_t *current_frame_seg_map;
- int seg_map_alloc_size;
-
- InterpFilter interp_filter;
-
- int switchable_motion_mode;
-
- loop_filter_info_n lf_info;
- // The denominator of the superres scale; the numerator is fixed.
- uint8_t superres_scale_denominator;
- int superres_upscaled_width;
- int superres_upscaled_height;
- RestorationInfo rst_info[MAX_MB_PLANE];
-
- // rst_end_stripe[i] is one more than the index of the bottom stripe
- // for tile row i.
- int rst_end_stripe[MAX_TILE_ROWS];
-
- // Pointer to a scratch buffer used by self-guided restoration
- int32_t *rst_tmpbuf;
- RestorationLineBuffers *rlbs;
-
- // Output of loop restoration
- YV12_BUFFER_CONFIG rst_frame;
-
- // Flag signaling how frame contexts should be updated at the end of
- // a frame decode
- REFRESH_FRAME_CONTEXT_MODE refresh_frame_context;
-
- int ref_frame_sign_bias[REF_FRAMES]; /* Two state 0, 1 */
-
- struct loopfilter lf;
- struct segmentation seg;
- int coded_lossless; // frame is fully lossless at the coded resolution.
- int all_lossless; // frame is fully lossless at the upscaled resolution.
-
- int reduced_tx_set_used;
-
- // Context probabilities for reference frame prediction
- MV_REFERENCE_FRAME comp_fwd_ref[FWD_REFS];
- MV_REFERENCE_FRAME comp_bwd_ref[BWD_REFS];
- REFERENCE_MODE reference_mode;
-
- FRAME_CONTEXT *fc; /* this frame entropy */
- FRAME_CONTEXT *frame_contexts; // FRAME_CONTEXTS
- unsigned int frame_context_idx; /* Context to use/update */
- int fb_of_context_type[REF_FRAMES];
- int primary_ref_frame;
-
- unsigned int frame_offset;
-
- unsigned int current_video_frame;
-
- aom_bit_depth_t dequant_bit_depth; // bit_depth of current dequantizer
-
- int error_resilient_mode;
- int force_primary_ref_none;
-
- int tile_cols, tile_rows;
- int last_tile_cols, last_tile_rows;
-
- int max_tile_width_sb;
- int min_log2_tile_cols;
- int max_log2_tile_cols;
- int max_log2_tile_rows;
- int min_log2_tile_rows;
- int min_log2_tiles;
- int max_tile_height_sb;
- int uniform_tile_spacing_flag;
- int log2_tile_cols; // only valid for uniform tiles
- int log2_tile_rows; // only valid for uniform tiles
- int tile_col_start_sb[MAX_TILE_COLS + 1]; // valid for 0 <= i <= tile_cols
- int tile_row_start_sb[MAX_TILE_ROWS + 1]; // valid for 0 <= i <= tile_rows
- int tile_width, tile_height; // In MI units
-
- unsigned int large_scale_tile;
- unsigned int single_tile_decoding;
-
- int byte_alignment;
- int skip_loop_filter;
- int skip_film_grain;
-
- // Private data associated with the frame buffer callbacks.
- void *cb_priv;
- aom_get_frame_buffer_cb_fn_t get_fb_cb;
- aom_release_frame_buffer_cb_fn_t release_fb_cb;
-
- // Handles memory for the codec.
- InternalFrameBufferList int_frame_buffers;
-
- // External BufferPool passed from outside.
- BufferPool *buffer_pool;
-
- PARTITION_CONTEXT **above_seg_context;
- ENTROPY_CONTEXT **above_context[MAX_MB_PLANE];
- TXFM_CONTEXT **above_txfm_context;
- WarpedMotionParams global_motion[REF_FRAMES];
- aom_film_grain_t film_grain_params;
-
- int cdef_pri_damping;
- int cdef_sec_damping;
- int nb_cdef_strengths;
- int cdef_strengths[CDEF_MAX_STRENGTHS];
- int cdef_uv_strengths[CDEF_MAX_STRENGTHS];
- int cdef_bits;
-
- int delta_q_present_flag;
- // Resolution of delta quant
- int delta_q_res;
- int delta_lf_present_flag;
- // Resolution of delta lf level
- int delta_lf_res;
- // This is a flag for number of deltas of loop filter level
- // 0: use 1 delta, for y_vertical, y_horizontal, u, and v
- // 1: use separate deltas for each filter level
- int delta_lf_multi;
- int num_tg;
- SequenceHeader seq_params;
- int current_frame_id;
- int ref_frame_id[REF_FRAMES];
- int valid_for_referencing[REF_FRAMES];
- int invalid_delta_frame_id_minus_1;
- LV_MAP_CTX_TABLE coeff_ctx_table;
- TPL_MV_REF *tpl_mvs;
- int tpl_mvs_mem_size;
- // TODO(jingning): This can be combined with sign_bias later.
- int8_t ref_frame_side[REF_FRAMES];
-
- int is_annexb;
-
- int frame_refs_short_signaling;
- int temporal_layer_id;
- int spatial_layer_id;
- unsigned int number_temporal_layers;
- unsigned int number_spatial_layers;
- int num_allocated_above_context_mi_col;
- int num_allocated_above_contexts;
- int num_allocated_above_context_planes;
-
-#if TXCOEFF_TIMER
- int64_t cum_txcoeff_timer;
- int64_t txcoeff_timer;
- int txb_count;
-#endif
-
-#if TXCOEFF_COST_TIMER
- int64_t cum_txcoeff_cost_timer;
- int64_t txcoeff_cost_timer;
- int64_t txcoeff_cost_count;
-#endif
- const cfg_options_t *options;
-} AV1_COMMON;
-
-// TODO(hkuang): Don't need to lock the whole pool after implementing atomic
-// frame reference count.
-static void lock_buffer_pool(BufferPool *const pool) {
-#if CONFIG_MULTITHREAD
- pthread_mutex_lock(&pool->pool_mutex);
-#else
- (void)pool;
-#endif
-}
-
-static void unlock_buffer_pool(BufferPool *const pool) {
-#if CONFIG_MULTITHREAD
- pthread_mutex_unlock(&pool->pool_mutex);
-#else
- (void)pool;
-#endif
-}
-
-static INLINE YV12_BUFFER_CONFIG *get_ref_frame(AV1_COMMON *cm, int index) {
- if (index < 0 || index >= REF_FRAMES) return NULL;
- if (cm->ref_frame_map[index] < 0) return NULL;
- assert(cm->ref_frame_map[index] < FRAME_BUFFERS);
- return &cm->buffer_pool->frame_bufs[cm->ref_frame_map[index]].buf;
-}
-
-static INLINE YV12_BUFFER_CONFIG *get_frame_new_buffer(
- const AV1_COMMON *const cm) {
- return &cm->buffer_pool->frame_bufs[cm->new_fb_idx].buf;
-}
-
-static INLINE int get_free_fb(AV1_COMMON *cm) {
- RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
- int i;
-
- lock_buffer_pool(cm->buffer_pool);
- for (i = 0; i < FRAME_BUFFERS; ++i)
- if (frame_bufs[i].ref_count == 0) break;
-
- if (i != FRAME_BUFFERS) {
- if (frame_bufs[i].buf.use_external_reference_buffers) {
- // If this frame buffer's y_buffer, u_buffer, and v_buffer point to the
- // external reference buffers, restore the buffer pointers to point to the
- // internally allocated memory.
- YV12_BUFFER_CONFIG *ybf = &frame_bufs[i].buf;
- ybf->y_buffer = ybf->store_buf_adr[0];
- ybf->u_buffer = ybf->store_buf_adr[1];
- ybf->v_buffer = ybf->store_buf_adr[2];
- ybf->use_external_reference_buffers = 0;
- }
-
- frame_bufs[i].ref_count = 1;
- } else {
- // Reset i to be INVALID_IDX to indicate no free buffer found.
- i = INVALID_IDX;
- }
-
- unlock_buffer_pool(cm->buffer_pool);
- return i;
-}
-
-static INLINE void ref_cnt_fb(RefCntBuffer *bufs, int *idx, int new_idx) {
- const int ref_index = *idx;
-
- if (ref_index >= 0 && bufs[ref_index].ref_count > 0)
- bufs[ref_index].ref_count--;
-
- *idx = new_idx;
-
- bufs[new_idx].ref_count++;
-}
-
-static INLINE int frame_is_intra_only(const AV1_COMMON *const cm) {
- return cm->frame_type == KEY_FRAME || cm->intra_only;
-}
-
-static INLINE int frame_is_sframe(const AV1_COMMON *cm) {
- return cm->frame_type == S_FRAME;
-}
-
-static INLINE RefCntBuffer *get_prev_frame(const AV1_COMMON *const cm) {
- if (cm->primary_ref_frame == PRIMARY_REF_NONE ||
- cm->frame_refs[cm->primary_ref_frame].idx == INVALID_IDX) {
- return NULL;
- } else {
- return &cm->buffer_pool
- ->frame_bufs[cm->frame_refs[cm->primary_ref_frame].idx];
- }
-}
-
-// Returns 1 if this frame might allow mvs from some reference frame.
-static INLINE int frame_might_allow_ref_frame_mvs(const AV1_COMMON *cm) {
- return !cm->error_resilient_mode && cm->seq_params.enable_ref_frame_mvs &&
- cm->seq_params.enable_order_hint && !frame_is_intra_only(cm);
-}
-
-// Returns 1 if this frame might use warped_motion
-static INLINE int frame_might_allow_warped_motion(const AV1_COMMON *cm) {
- return !cm->error_resilient_mode && !frame_is_intra_only(cm) &&
- cm->seq_params.enable_warped_motion;
-}
-
-static INLINE void ensure_mv_buffer(RefCntBuffer *buf, AV1_COMMON *cm) {
- const int buf_rows = buf->mi_rows;
- const int buf_cols = buf->mi_cols;
-
- if (buf->mvs == NULL || buf_rows != cm->mi_rows || buf_cols != cm->mi_cols) {
- aom_free(buf->mvs);
- buf->mi_rows = cm->mi_rows;
- buf->mi_cols = cm->mi_cols;
- CHECK_MEM_ERROR(cm, buf->mvs,
- (MV_REF *)aom_calloc(
- ((cm->mi_rows + 1) >> 1) * ((cm->mi_cols + 1) >> 1),
- sizeof(*buf->mvs)));
- aom_free(buf->seg_map);
- CHECK_MEM_ERROR(cm, buf->seg_map,
- (uint8_t *)aom_calloc(cm->mi_rows * cm->mi_cols,
- sizeof(*buf->seg_map)));
- }
-
- const int mem_size =
- ((cm->mi_rows + MAX_MIB_SIZE) >> 1) * (cm->mi_stride >> 1);
- int realloc = cm->tpl_mvs == NULL;
- if (cm->tpl_mvs) realloc |= cm->tpl_mvs_mem_size < mem_size;
-
- if (realloc) {
- aom_free(cm->tpl_mvs);
- CHECK_MEM_ERROR(cm, cm->tpl_mvs,
- (TPL_MV_REF *)aom_calloc(mem_size, sizeof(*cm->tpl_mvs)));
- cm->tpl_mvs_mem_size = mem_size;
- }
-}
-
-void cfl_init(CFL_CTX *cfl, const SequenceHeader *seq_params);
-
-static INLINE int av1_num_planes(const AV1_COMMON *cm) {
- return cm->seq_params.monochrome ? 1 : MAX_MB_PLANE;
-}
-
-static INLINE void av1_init_above_context(AV1_COMMON *cm, MACROBLOCKD *xd,
- const int tile_row) {
- const int num_planes = av1_num_planes(cm);
- for (int i = 0; i < num_planes; ++i) {
- xd->above_context[i] = cm->above_context[i][tile_row];
- }
- xd->above_seg_context = cm->above_seg_context[tile_row];
- xd->above_txfm_context = cm->above_txfm_context[tile_row];
-}
-
-static INLINE void av1_init_macroblockd(AV1_COMMON *cm, MACROBLOCKD *xd,
- tran_low_t *dqcoeff) {
- const int num_planes = av1_num_planes(cm);
- for (int i = 0; i < num_planes; ++i) {
- xd->plane[i].dqcoeff = dqcoeff;
-
- if (xd->plane[i].plane_type == PLANE_TYPE_Y) {
- memcpy(xd->plane[i].seg_dequant_QTX, cm->y_dequant_QTX,
- sizeof(cm->y_dequant_QTX));
- memcpy(xd->plane[i].seg_iqmatrix, cm->y_iqmatrix, sizeof(cm->y_iqmatrix));
-
- } else {
- if (i == AOM_PLANE_U) {
- memcpy(xd->plane[i].seg_dequant_QTX, cm->u_dequant_QTX,
- sizeof(cm->u_dequant_QTX));
- memcpy(xd->plane[i].seg_iqmatrix, cm->u_iqmatrix,
- sizeof(cm->u_iqmatrix));
- } else {
- memcpy(xd->plane[i].seg_dequant_QTX, cm->v_dequant_QTX,
- sizeof(cm->v_dequant_QTX));
- memcpy(xd->plane[i].seg_iqmatrix, cm->v_iqmatrix,
- sizeof(cm->v_iqmatrix));
- }
- }
- }
- xd->mi_stride = cm->mi_stride;
- xd->error_info = &cm->error;
- cfl_init(&xd->cfl, &cm->seq_params);
-}
-
-static INLINE void set_skip_context(MACROBLOCKD *xd, int mi_row, int mi_col,
- const int num_planes) {
- int i;
- int row_offset = mi_row;
- int col_offset = mi_col;
- for (i = 0; i < num_planes; ++i) {
- struct macroblockd_plane *const pd = &xd->plane[i];
- // Offset the buffer pointer
- const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
- if (pd->subsampling_y && (mi_row & 0x01) && (mi_size_high[bsize] == 1))
- row_offset = mi_row - 1;
- if (pd->subsampling_x && (mi_col & 0x01) && (mi_size_wide[bsize] == 1))
- col_offset = mi_col - 1;
- int above_idx = col_offset;
- int left_idx = row_offset & MAX_MIB_MASK;
- pd->above_context = &xd->above_context[i][above_idx >> pd->subsampling_x];
- pd->left_context = &xd->left_context[i][left_idx >> pd->subsampling_y];
- }
-}
-
-static INLINE int calc_mi_size(int len) {
- // len is in mi units. Align to a multiple of SBs.
- return ALIGN_POWER_OF_TWO(len, MAX_MIB_SIZE_LOG2);
-}
-
-static INLINE void set_plane_n4(MACROBLOCKD *const xd, int bw, int bh,
- const int num_planes) {
- int i;
- for (i = 0; i < num_planes; i++) {
- xd->plane[i].width = (bw * MI_SIZE) >> xd->plane[i].subsampling_x;
- xd->plane[i].height = (bh * MI_SIZE) >> xd->plane[i].subsampling_y;
-
- xd->plane[i].width = AOMMAX(xd->plane[i].width, 4);
- xd->plane[i].height = AOMMAX(xd->plane[i].height, 4);
- }
-}
-
-static INLINE void set_mi_row_col(MACROBLOCKD *xd, const TileInfo *const tile,
- int mi_row, int bh, int mi_col, int bw,
- int mi_rows, int mi_cols) {
- xd->mb_to_top_edge = -((mi_row * MI_SIZE) * 8);
- xd->mb_to_bottom_edge = ((mi_rows - bh - mi_row) * MI_SIZE) * 8;
- xd->mb_to_left_edge = -((mi_col * MI_SIZE) * 8);
- xd->mb_to_right_edge = ((mi_cols - bw - mi_col) * MI_SIZE) * 8;
-
- // Are edges available for intra prediction?
- xd->up_available = (mi_row > tile->mi_row_start);
-
- const int ss_x = xd->plane[1].subsampling_x;
- const int ss_y = xd->plane[1].subsampling_y;
-
- xd->left_available = (mi_col > tile->mi_col_start);
- xd->chroma_up_available = xd->up_available;
- xd->chroma_left_available = xd->left_available;
- if (ss_x && bw < mi_size_wide[BLOCK_8X8])
- xd->chroma_left_available = (mi_col - 1) > tile->mi_col_start;
- if (ss_y && bh < mi_size_high[BLOCK_8X8])
- xd->chroma_up_available = (mi_row - 1) > tile->mi_row_start;
- if (xd->up_available) {
- xd->above_mbmi = xd->mi[-xd->mi_stride];
- } else {
- xd->above_mbmi = NULL;
- }
-
- if (xd->left_available) {
- xd->left_mbmi = xd->mi[-1];
- } else {
- xd->left_mbmi = NULL;
- }
-
- const int chroma_ref = ((mi_row & 0x01) || !(bh & 0x01) || !ss_y) &&
- ((mi_col & 0x01) || !(bw & 0x01) || !ss_x);
- if (chroma_ref) {
- // To help calculate the "above" and "left" chroma blocks, note that the
- // current block may cover multiple luma blocks (e.g., if partitioned into
- // 4x4 luma blocks).
- // First, find the top-left-most luma block covered by this chroma block
- MB_MODE_INFO **base_mi =
- &xd->mi[-(mi_row & ss_y) * xd->mi_stride - (mi_col & ss_x)];
-
- // Then, we consider the luma region covered by the left or above 4x4 chroma
- // prediction. We want to point to the chroma reference block in that
- // region, which is the bottom-right-most mi unit.
- // This leads to the following offsets:
- MB_MODE_INFO *chroma_above_mi =
- xd->chroma_up_available ? base_mi[-xd->mi_stride + ss_x] : NULL;
- xd->chroma_above_mbmi = chroma_above_mi;
-
- MB_MODE_INFO *chroma_left_mi =
- xd->chroma_left_available ? base_mi[ss_y * xd->mi_stride - 1] : NULL;
- xd->chroma_left_mbmi = chroma_left_mi;
- }
-
- xd->n4_h = bh;
- xd->n4_w = bw;
- xd->is_sec_rect = 0;
- if (xd->n4_w < xd->n4_h) {
- // Only mark is_sec_rect as 1 for the last block.
- // For PARTITION_VERT_4, it would be (0, 0, 0, 1);
- // For other partitions, it would be (0, 1).
- if (!((mi_col + xd->n4_w) & (xd->n4_h - 1))) xd->is_sec_rect = 1;
- }
-
- if (xd->n4_w > xd->n4_h)
- if (mi_row & (xd->n4_w - 1)) xd->is_sec_rect = 1;
-}
-
-static INLINE aom_cdf_prob *get_y_mode_cdf(FRAME_CONTEXT *tile_ctx,
- const MB_MODE_INFO *above_mi,
- const MB_MODE_INFO *left_mi) {
- const PREDICTION_MODE above = av1_above_block_mode(above_mi);
- const PREDICTION_MODE left = av1_left_block_mode(left_mi);
- const int above_ctx = intra_mode_context[above];
- const int left_ctx = intra_mode_context[left];
- return tile_ctx->kf_y_cdf[above_ctx][left_ctx];
-}
-
-static INLINE void update_partition_context(MACROBLOCKD *xd, int mi_row,
- int mi_col, BLOCK_SIZE subsize,
- BLOCK_SIZE bsize) {
- PARTITION_CONTEXT *const above_ctx = xd->above_seg_context + mi_col;
- PARTITION_CONTEXT *const left_ctx =
- xd->left_seg_context + (mi_row & MAX_MIB_MASK);
-
- const int bw = mi_size_wide[bsize];
- const int bh = mi_size_high[bsize];
- memset(above_ctx, partition_context_lookup[subsize].above, bw);
- memset(left_ctx, partition_context_lookup[subsize].left, bh);
-}
-
-static INLINE int is_chroma_reference(int mi_row, int mi_col, BLOCK_SIZE bsize,
- int subsampling_x, int subsampling_y) {
- const int bw = mi_size_wide[bsize];
- const int bh = mi_size_high[bsize];
- int ref_pos = ((mi_row & 0x01) || !(bh & 0x01) || !subsampling_y) &&
- ((mi_col & 0x01) || !(bw & 0x01) || !subsampling_x);
- return ref_pos;
-}
-
-static INLINE BLOCK_SIZE scale_chroma_bsize(BLOCK_SIZE bsize, int subsampling_x,
- int subsampling_y) {
- BLOCK_SIZE bs = bsize;
- switch (bsize) {
- case BLOCK_4X4:
- if (subsampling_x == 1 && subsampling_y == 1)
- bs = BLOCK_8X8;
- else if (subsampling_x == 1)
- bs = BLOCK_8X4;
- else if (subsampling_y == 1)
- bs = BLOCK_4X8;
- break;
- case BLOCK_4X8:
- if (subsampling_x == 1 && subsampling_y == 1)
- bs = BLOCK_8X8;
- else if (subsampling_x == 1)
- bs = BLOCK_8X8;
- else if (subsampling_y == 1)
- bs = BLOCK_4X8;
- break;
- case BLOCK_8X4:
- if (subsampling_x == 1 && subsampling_y == 1)
- bs = BLOCK_8X8;
- else if (subsampling_x == 1)
- bs = BLOCK_8X4;
- else if (subsampling_y == 1)
- bs = BLOCK_8X8;
- break;
- case BLOCK_4X16:
- if (subsampling_x == 1 && subsampling_y == 1)
- bs = BLOCK_8X16;
- else if (subsampling_x == 1)
- bs = BLOCK_8X16;
- else if (subsampling_y == 1)
- bs = BLOCK_4X16;
- break;
- case BLOCK_16X4:
- if (subsampling_x == 1 && subsampling_y == 1)
- bs = BLOCK_16X8;
- else if (subsampling_x == 1)
- bs = BLOCK_16X4;
- else if (subsampling_y == 1)
- bs = BLOCK_16X8;
- break;
- default: break;
- }
- return bs;
-}
-
-static INLINE aom_cdf_prob cdf_element_prob(const aom_cdf_prob *cdf,
- size_t element) {
- assert(cdf != NULL);
- return (element > 0 ? cdf[element - 1] : CDF_PROB_TOP) - cdf[element];
-}
-
-static INLINE void partition_gather_horz_alike(aom_cdf_prob *out,
- const aom_cdf_prob *const in,
- BLOCK_SIZE bsize) {
- (void)bsize;
- out[0] = CDF_PROB_TOP;
- out[0] -= cdf_element_prob(in, PARTITION_HORZ);
- out[0] -= cdf_element_prob(in, PARTITION_SPLIT);
- out[0] -= cdf_element_prob(in, PARTITION_HORZ_A);
- out[0] -= cdf_element_prob(in, PARTITION_HORZ_B);
- out[0] -= cdf_element_prob(in, PARTITION_VERT_A);
- if (bsize != BLOCK_128X128) out[0] -= cdf_element_prob(in, PARTITION_HORZ_4);
- out[0] = AOM_ICDF(out[0]);
- out[1] = AOM_ICDF(CDF_PROB_TOP);
-}
-
-static INLINE void partition_gather_vert_alike(aom_cdf_prob *out,
- const aom_cdf_prob *const in,
- BLOCK_SIZE bsize) {
- (void)bsize;
- out[0] = CDF_PROB_TOP;
- out[0] -= cdf_element_prob(in, PARTITION_VERT);
- out[0] -= cdf_element_prob(in, PARTITION_SPLIT);
- out[0] -= cdf_element_prob(in, PARTITION_HORZ_A);
- out[0] -= cdf_element_prob(in, PARTITION_VERT_A);
- out[0] -= cdf_element_prob(in, PARTITION_VERT_B);
- if (bsize != BLOCK_128X128) out[0] -= cdf_element_prob(in, PARTITION_VERT_4);
- out[0] = AOM_ICDF(out[0]);
- out[1] = AOM_ICDF(CDF_PROB_TOP);
-}
-
-static INLINE void update_ext_partition_context(MACROBLOCKD *xd, int mi_row,
- int mi_col, BLOCK_SIZE subsize,
- BLOCK_SIZE bsize,
- PARTITION_TYPE partition) {
- if (bsize >= BLOCK_8X8) {
- const int hbs = mi_size_wide[bsize] / 2;
- BLOCK_SIZE bsize2 = get_partition_subsize(bsize, PARTITION_SPLIT);
- switch (partition) {
- case PARTITION_SPLIT:
- if (bsize != BLOCK_8X8) break;
- AOM_FALLTHROUGH_INTENDED;
- case PARTITION_NONE:
- case PARTITION_HORZ:
- case PARTITION_VERT:
- case PARTITION_HORZ_4:
- case PARTITION_VERT_4:
- update_partition_context(xd, mi_row, mi_col, subsize, bsize);
- break;
- case PARTITION_HORZ_A:
- update_partition_context(xd, mi_row, mi_col, bsize2, subsize);
- update_partition_context(xd, mi_row + hbs, mi_col, subsize, subsize);
- break;
- case PARTITION_HORZ_B:
- update_partition_context(xd, mi_row, mi_col, subsize, subsize);
- update_partition_context(xd, mi_row + hbs, mi_col, bsize2, subsize);
- break;
- case PARTITION_VERT_A:
- update_partition_context(xd, mi_row, mi_col, bsize2, subsize);
- update_partition_context(xd, mi_row, mi_col + hbs, subsize, subsize);
- break;
- case PARTITION_VERT_B:
- update_partition_context(xd, mi_row, mi_col, subsize, subsize);
- update_partition_context(xd, mi_row, mi_col + hbs, bsize2, subsize);
- break;
- default: assert(0 && "Invalid partition type");
- }
- }
-}
-
-static INLINE int partition_plane_context(const MACROBLOCKD *xd, int mi_row,
- int mi_col, BLOCK_SIZE bsize) {
- const PARTITION_CONTEXT *above_ctx = xd->above_seg_context + mi_col;
- const PARTITION_CONTEXT *left_ctx =
- xd->left_seg_context + (mi_row & MAX_MIB_MASK);
- // Minimum partition point is 8x8. Offset the bsl accordingly.
- const int bsl = mi_size_wide_log2[bsize] - mi_size_wide_log2[BLOCK_8X8];
- int above = (*above_ctx >> bsl) & 1, left = (*left_ctx >> bsl) & 1;
-
- assert(mi_size_wide_log2[bsize] == mi_size_high_log2[bsize]);
- assert(bsl >= 0);
-
- return (left * 2 + above) + bsl * PARTITION_PLOFFSET;
-}
-
-// Return the number of elements in the partition CDF when
-// partitioning the (square) block with luma block size of bsize.
-static INLINE int partition_cdf_length(BLOCK_SIZE bsize) {
- if (bsize <= BLOCK_8X8)
- return PARTITION_TYPES;
- else if (bsize == BLOCK_128X128)
- return EXT_PARTITION_TYPES - 2;
- else
- return EXT_PARTITION_TYPES;
-}
-
-static INLINE int max_block_wide(const MACROBLOCKD *xd, BLOCK_SIZE bsize,
- int plane) {
- int max_blocks_wide = block_size_wide[bsize];
- const struct macroblockd_plane *const pd = &xd->plane[plane];
-
- if (xd->mb_to_right_edge < 0)
- max_blocks_wide += xd->mb_to_right_edge >> (3 + pd->subsampling_x);
-
- // Scale the width in the transform block unit.
- return max_blocks_wide >> tx_size_wide_log2[0];
-}
-
-static INLINE int max_block_high(const MACROBLOCKD *xd, BLOCK_SIZE bsize,
- int plane) {
- int max_blocks_high = block_size_high[bsize];
- const struct macroblockd_plane *const pd = &xd->plane[plane];
-
- if (xd->mb_to_bottom_edge < 0)
- max_blocks_high += xd->mb_to_bottom_edge >> (3 + pd->subsampling_y);
-
- // Scale the height in the transform block unit.
- return max_blocks_high >> tx_size_high_log2[0];
-}
-
-static INLINE int max_intra_block_width(const MACROBLOCKD *xd,
- BLOCK_SIZE plane_bsize, int plane,
- TX_SIZE tx_size) {
- const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane)
- << tx_size_wide_log2[0];
- return ALIGN_POWER_OF_TWO(max_blocks_wide, tx_size_wide_log2[tx_size]);
-}
-
-static INLINE int max_intra_block_height(const MACROBLOCKD *xd,
- BLOCK_SIZE plane_bsize, int plane,
- TX_SIZE tx_size) {
- const int max_blocks_high = max_block_high(xd, plane_bsize, plane)
- << tx_size_high_log2[0];
- return ALIGN_POWER_OF_TWO(max_blocks_high, tx_size_high_log2[tx_size]);
-}
-
-static INLINE void av1_zero_above_context(AV1_COMMON *const cm, const MACROBLOCKD *xd,
- int mi_col_start, int mi_col_end, const int tile_row) {
- const SequenceHeader *const seq_params = &cm->seq_params;
- const int num_planes = av1_num_planes(cm);
- const int width = mi_col_end - mi_col_start;
- const int aligned_width =
- ALIGN_POWER_OF_TWO(width, seq_params->mib_size_log2);
-
- const int offset_y = mi_col_start;
- const int width_y = aligned_width;
- const int offset_uv = offset_y >> seq_params->subsampling_x;
- const int width_uv = width_y >> seq_params->subsampling_x;
-
- av1_zero_array(cm->above_context[0][tile_row] + offset_y, width_y);
- if (num_planes > 1) {
- if (cm->above_context[1][tile_row] && cm->above_context[2][tile_row]) {
- av1_zero_array(cm->above_context[1][tile_row] + offset_uv, width_uv);
- av1_zero_array(cm->above_context[2][tile_row] + offset_uv, width_uv);
- } else {
- aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME,
- "Invalid value of planes");
- }
- }
-
- av1_zero_array(cm->above_seg_context[tile_row] + mi_col_start, aligned_width);
-
- memset(cm->above_txfm_context[tile_row] + mi_col_start,
- tx_size_wide[TX_SIZES_LARGEST],
- aligned_width * sizeof(TXFM_CONTEXT));
-}
-
-static INLINE void av1_zero_left_context(MACROBLOCKD *const xd) {
- av1_zero(xd->left_context);
- av1_zero(xd->left_seg_context);
-
- memset(xd->left_txfm_context_buffer, tx_size_high[TX_SIZES_LARGEST],
- sizeof(xd->left_txfm_context_buffer));
-}
-
-// Disable array-bounds checks as the TX_SIZE enum contains values larger than
-// TX_SIZES_ALL (TX_INVALID) which make extending the array as a workaround
-// infeasible. The assert is enough for static analysis, and tools such as
-// asan or valgrind would catch out-of-bounds accesses at runtime.
-#if defined(__GNUC__) && __GNUC__ >= 4
-#pragma GCC diagnostic ignored "-Warray-bounds"
-#endif
-
-#if defined(__GNUC__) && __GNUC__ >= 4
-#pragma GCC diagnostic warning "-Warray-bounds"
-#endif
-
-static INLINE void set_txfm_ctx(TXFM_CONTEXT *txfm_ctx, uint8_t txs, int len) {
- int i;
- for (i = 0; i < len; ++i) txfm_ctx[i] = txs;
-}
-
-static INLINE void set_txfm_ctxs(TX_SIZE tx_size, int n4_w, int n4_h, int skip,
- const MACROBLOCKD *xd) {
- uint8_t bw = tx_size_wide[tx_size];
- uint8_t bh = tx_size_high[tx_size];
-
- if (skip) {
- bw = n4_w * MI_SIZE;
- bh = n4_h * MI_SIZE;
- }
-
- set_txfm_ctx(xd->above_txfm_context, bw, n4_w);
- set_txfm_ctx(xd->left_txfm_context, bh, n4_h);
-}
-
-static INLINE void txfm_partition_update(TXFM_CONTEXT *above_ctx,
- TXFM_CONTEXT *left_ctx,
- TX_SIZE tx_size, TX_SIZE txb_size) {
- BLOCK_SIZE bsize = txsize_to_bsize[txb_size];
- int bh = mi_size_high[bsize];
- int bw = mi_size_wide[bsize];
- uint8_t txw = tx_size_wide[tx_size];
- uint8_t txh = tx_size_high[tx_size];
- int i;
- for (i = 0; i < bh; ++i) left_ctx[i] = txh;
- for (i = 0; i < bw; ++i) above_ctx[i] = txw;
-}
-
-static INLINE TX_SIZE get_sqr_tx_size(int tx_dim) {
- switch (tx_dim) {
- case 128:
- case 64: return TX_64X64; break;
- case 32: return TX_32X32; break;
- case 16: return TX_16X16; break;
- case 8: return TX_8X8; break;
- default: return TX_4X4;
- }
-}
-
-static INLINE TX_SIZE get_tx_size(int width, int height) {
- if (width == height) {
- return get_sqr_tx_size(width);
- }
- if (width < height) {
- if (width + width == height) {
- switch (width) {
- case 4: return TX_4X8; break;
- case 8: return TX_8X16; break;
- case 16: return TX_16X32; break;
- case 32: return TX_32X64; break;
- }
- } else {
- switch (width) {
- case 4: return TX_4X16; break;
- case 8: return TX_8X32; break;
- case 16: return TX_16X64; break;
- }
- }
- } else {
- if (height + height == width) {
- switch (height) {
- case 4: return TX_8X4; break;
- case 8: return TX_16X8; break;
- case 16: return TX_32X16; break;
- case 32: return TX_64X32; break;
- }
- } else {
- switch (height) {
- case 4: return TX_16X4; break;
- case 8: return TX_32X8; break;
- case 16: return TX_64X16; break;
- }
- }
- }
- assert(0);
- return TX_4X4;
-}
-
-static INLINE int txfm_partition_context(TXFM_CONTEXT *above_ctx,
- TXFM_CONTEXT *left_ctx,
- BLOCK_SIZE bsize, TX_SIZE tx_size) {
- const uint8_t txw = tx_size_wide[tx_size];
- const uint8_t txh = tx_size_high[tx_size];
- const int above = *above_ctx < txw;
- const int left = *left_ctx < txh;
- int category = TXFM_PARTITION_CONTEXTS;
-
- // dummy return, not used by others.
- if (tx_size <= TX_4X4) return 0;
-
- TX_SIZE max_tx_size =
- get_sqr_tx_size(AOMMAX(block_size_wide[bsize], block_size_high[bsize]));
-
- if (max_tx_size >= TX_8X8) {
- category =
- (txsize_sqr_up_map[tx_size] != max_tx_size && max_tx_size > TX_8X8) +
- (TX_SIZES - 1 - max_tx_size) * 2;
- }
- assert(category != TXFM_PARTITION_CONTEXTS);
- return category * 3 + above + left;
-}
-
-// Compute the next partition in the direction of the sb_type stored in the mi
-// array, starting with bsize.
-static INLINE PARTITION_TYPE get_partition(const AV1_COMMON *const cm,
- int mi_row, int mi_col,
- BLOCK_SIZE bsize) {
- if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return PARTITION_INVALID;
-
- const int offset = mi_row * cm->mi_stride + mi_col;
- MB_MODE_INFO **mi = cm->mi_grid_visible + offset;
- const BLOCK_SIZE subsize = mi[0]->sb_type;
-
- if (subsize == bsize) return PARTITION_NONE;
-
- const int bhigh = mi_size_high[bsize];
- const int bwide = mi_size_wide[bsize];
- const int sshigh = mi_size_high[subsize];
- const int sswide = mi_size_wide[subsize];
-
- if (bsize > BLOCK_8X8 && mi_row + bwide / 2 < cm->mi_rows &&
- mi_col + bhigh / 2 < cm->mi_cols) {
- // In this case, the block might be using an extended partition
- // type.
- const MB_MODE_INFO *const mbmi_right = mi[bwide / 2];
- const MB_MODE_INFO *const mbmi_below = mi[bhigh / 2 * cm->mi_stride];
-
- if (sswide == bwide) {
- // Smaller height but same width. Is PARTITION_HORZ_4, PARTITION_HORZ or
- // PARTITION_HORZ_B. To distinguish the latter two, check if the lower
- // half was split.
- if (sshigh * 4 == bhigh) return PARTITION_HORZ_4;
- assert(sshigh * 2 == bhigh);
-
- if (mbmi_below->sb_type == subsize)
- return PARTITION_HORZ;
- else
- return PARTITION_HORZ_B;
- } else if (sshigh == bhigh) {
- // Smaller width but same height. Is PARTITION_VERT_4, PARTITION_VERT or
- // PARTITION_VERT_B. To distinguish the latter two, check if the right
- // half was split.
- if (sswide * 4 == bwide) return PARTITION_VERT_4;
- assert(sswide * 2 == bwide);
-
- if (mbmi_right->sb_type == subsize)
- return PARTITION_VERT;
- else
- return PARTITION_VERT_B;
- } else {
- // Smaller width and smaller height. Might be PARTITION_SPLIT or could be
- // PARTITION_HORZ_A or PARTITION_VERT_A. If subsize isn't halved in both
- // dimensions, we immediately know this is a split (which will recurse to
- // get to subsize). Otherwise look down and to the right. With
- // PARTITION_VERT_A, the right block will have height bhigh; with
- // PARTITION_HORZ_A, the lower block will have width bwide. Otherwise
- // it's PARTITION_SPLIT.
- if (sswide * 2 != bwide || sshigh * 2 != bhigh) return PARTITION_SPLIT;
-
- if (mi_size_wide[mbmi_below->sb_type] == bwide) return PARTITION_HORZ_A;
- if (mi_size_high[mbmi_right->sb_type] == bhigh) return PARTITION_VERT_A;
-
- return PARTITION_SPLIT;
- }
- }
- const int vert_split = sswide < bwide;
- const int horz_split = sshigh < bhigh;
- const int split_idx = (vert_split << 1) | horz_split;
- assert(split_idx != 0);
-
- static const PARTITION_TYPE base_partitions[4] = {
- PARTITION_INVALID, PARTITION_HORZ, PARTITION_VERT, PARTITION_SPLIT
- };
-
- return base_partitions[split_idx];
-}
-
-static INLINE void set_use_reference_buffer(AV1_COMMON *const cm, int use) {
- cm->seq_params.frame_id_numbers_present_flag = use;
-}
-
-static INLINE void set_sb_size(SequenceHeader *const seq_params,
- BLOCK_SIZE sb_size) {
- seq_params->sb_size = sb_size;
- seq_params->mib_size = mi_size_wide[seq_params->sb_size];
- seq_params->mib_size_log2 = mi_size_wide_log2[seq_params->sb_size];
-}
-
-// Returns true if the frame is fully lossless at the coded resolution.
-// Note: If super-resolution is used, such a frame will still NOT be lossless at
-// the upscaled resolution.
-static INLINE int is_coded_lossless(const AV1_COMMON *cm,
- const MACROBLOCKD *xd) {
- int coded_lossless = 1;
- if (cm->seg.enabled) {
- for (int i = 0; i < MAX_SEGMENTS; ++i) {
- if (!xd->lossless[i]) {
- coded_lossless = 0;
- break;
- }
- }
- } else {
- coded_lossless = xd->lossless[0];
- }
- return coded_lossless;
-}
-
-static INLINE int is_valid_seq_level_idx(uint8_t seq_level_idx) {
- return seq_level_idx < 24 || seq_level_idx == 31;
-}
-
-static INLINE uint8_t major_minor_to_seq_level_idx(BitstreamLevel bl) {
- assert(bl.major >= LEVEL_MAJOR_MIN && bl.major <= LEVEL_MAJOR_MAX);
- // Since bl.minor is unsigned a comparison will return a warning:
- // comparison is always true due to limited range of data type
- assert(LEVEL_MINOR_MIN == 0);
- assert(bl.minor <= LEVEL_MINOR_MAX);
- return ((bl.major - LEVEL_MAJOR_MIN) << LEVEL_MINOR_BITS) + bl.minor;
-}
-
-#ifdef __cplusplus
-} // extern "C"
-#endif
-
-#endif // AOM_AV1_COMMON_ONYXC_INT_H_
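
For reference, a minimal usage sketch of the reference-counting helpers deleted above (get_free_fb and ref_cnt_fb); the wrapper name, the slot index, and the error handling are illustrative only:

static void retarget_slot0(AV1_COMMON *cm) {
  const int new_idx = get_free_fb(cm);  /* locks the pool, sets ref_count = 1 */
  if (new_idx == INVALID_IDX) return;   /* no free buffer available */
  /* Drop slot 0's old reference and point it at the new buffer;
   * ref_cnt_fb() decrements the old ref_count and increments the new one. */
  ref_cnt_fb(cm->buffer_pool->frame_bufs, &cm->ref_frame_map[0], new_idx);
}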
diff --git a/media/libaom/src/av1/common/ppc/cfl_ppc.c b/media/libaom/src/av1/common/ppc/cfl_ppc.c
index 026a07809..6f88768f2 100644
--- a/media/libaom/src/av1/common/ppc/cfl_ppc.c
+++ b/media/libaom/src/av1/common/ppc/cfl_ppc.c
@@ -124,27 +124,27 @@ CFL_SUB_AVG_X(vsx, 32, 32, 512, 10)
// Based on observation, for small blocks VSX does not outperform C (no 64bit
// load and store intrinsics). So we call the C code for blocks of width 4.
-cfl_subtract_average_fn get_subtract_average_fn_vsx(TX_SIZE tx_size) {
+cfl_subtract_average_fn cfl_get_subtract_average_fn_vsx(TX_SIZE tx_size) {
static const cfl_subtract_average_fn sub_avg[TX_SIZES_ALL] = {
- subtract_average_4x4_c, /* 4x4 */
- subtract_average_8x8_vsx, /* 8x8 */
- subtract_average_16x16_vsx, /* 16x16 */
- subtract_average_32x32_vsx, /* 32x32 */
- cfl_subtract_average_null, /* 64x64 (invalid CFL size) */
- subtract_average_4x8_c, /* 4x8 */
- subtract_average_8x4_vsx, /* 8x4 */
- subtract_average_8x16_vsx, /* 8x16 */
- subtract_average_16x8_vsx, /* 16x8 */
- subtract_average_16x32_vsx, /* 16x32 */
- subtract_average_32x16_vsx, /* 32x16 */
- cfl_subtract_average_null, /* 32x64 (invalid CFL size) */
- cfl_subtract_average_null, /* 64x32 (invalid CFL size) */
- subtract_average_4x16_c, /* 4x16 */
- subtract_average_16x4_vsx, /* 16x4 */
- subtract_average_8x32_vsx, /* 8x32 */
- subtract_average_32x8_vsx, /* 32x8 */
- cfl_subtract_average_null, /* 16x64 (invalid CFL size) */
- cfl_subtract_average_null, /* 64x16 (invalid CFL size) */
+ cfl_subtract_average_4x4_c, /* 4x4 */
+ cfl_subtract_average_8x8_vsx, /* 8x8 */
+ cfl_subtract_average_16x16_vsx, /* 16x16 */
+ cfl_subtract_average_32x32_vsx, /* 32x32 */
+ NULL, /* 64x64 (invalid CFL size) */
+ cfl_subtract_average_4x8_c, /* 4x8 */
+ cfl_subtract_average_8x4_vsx, /* 8x4 */
+ cfl_subtract_average_8x16_vsx, /* 8x16 */
+ cfl_subtract_average_16x8_vsx, /* 16x8 */
+ cfl_subtract_average_16x32_vsx, /* 16x32 */
+ cfl_subtract_average_32x16_vsx, /* 32x16 */
+ NULL, /* 32x64 (invalid CFL size) */
+ NULL, /* 64x32 (invalid CFL size) */
+ cfl_subtract_average_4x16_c, /* 4x16 */
+ cfl_subtract_average_16x4_vsx, /* 16x4 */
+ cfl_subtract_average_8x32_vsx, /* 8x32 */
+ cfl_subtract_average_32x8_vsx, /* 32x8 */
+ NULL, /* 16x64 (invalid CFL size) */
+ NULL, /* 64x16 (invalid CFL size) */
};
// Modulo TX_SIZES_ALL to ensure that an attacker won't be able to
// index the function pointer array out of bounds.
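
The return statement this comment guards sits outside the hunk; the pattern it describes is presumably along these lines (sketch only):

  /* tx_size is reduced modulo TX_SIZES_ALL before indexing, so a corrupt
   * value cannot read past the end of sub_avg[]. Invalid CFL sizes map to
   * NULL entries instead of out-of-bounds slots. */
  return sub_avg[tx_size % TX_SIZES_ALL];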
diff --git a/media/libaom/src/av1/common/pred_common.h b/media/libaom/src/av1/common/pred_common.h
index 6dba2322d..d1dab97e7 100644
--- a/media/libaom/src/av1/common/pred_common.h
+++ b/media/libaom/src/av1/common/pred_common.h
@@ -12,29 +12,31 @@
#ifndef AOM_AV1_COMMON_PRED_COMMON_H_
#define AOM_AV1_COMMON_PRED_COMMON_H_
+#include "av1/common/av1_common_int.h"
#include "av1/common/blockd.h"
#include "av1/common/mvref_common.h"
-#include "av1/common/onyxc_int.h"
#include "aom_dsp/aom_dsp_common.h"
#ifdef __cplusplus
extern "C" {
#endif
-static INLINE int get_segment_id(const AV1_COMMON *const cm,
+static INLINE int get_segment_id(const CommonModeInfoParams *const mi_params,
const uint8_t *segment_ids, BLOCK_SIZE bsize,
int mi_row, int mi_col) {
- const int mi_offset = mi_row * cm->mi_cols + mi_col;
+ const int mi_offset = mi_row * mi_params->mi_cols + mi_col;
const int bw = mi_size_wide[bsize];
const int bh = mi_size_high[bsize];
- const int xmis = AOMMIN(cm->mi_cols - mi_col, bw);
- const int ymis = AOMMIN(cm->mi_rows - mi_row, bh);
- int x, y, segment_id = MAX_SEGMENTS;
-
- for (y = 0; y < ymis; ++y)
- for (x = 0; x < xmis; ++x)
- segment_id =
- AOMMIN(segment_id, segment_ids[mi_offset + y * cm->mi_cols + x]);
+ const int xmis = AOMMIN(mi_params->mi_cols - mi_col, bw);
+ const int ymis = AOMMIN(mi_params->mi_rows - mi_row, bh);
+ int segment_id = MAX_SEGMENTS;
+
+ for (int y = 0; y < ymis; ++y) {
+ for (int x = 0; x < xmis; ++x) {
+ segment_id = AOMMIN(segment_id,
+ segment_ids[mi_offset + y * mi_params->mi_cols + x]);
+ }
+ }
assert(segment_id >= 0 && segment_id < MAX_SEGMENTS);
return segment_id;
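
Illustrative call of the reworked helper (mi_row and mi_col assumed valid); the AOMMIN clamps above shrink the scan at the right and bottom frame edges instead of reading out of bounds:

  const int seg_id = get_segment_id(&cm->mi_params, cm->cur_frame->seg_map,
                                    BLOCK_16X16, mi_row, mi_col);
  /* For BLOCK_16X16, bw = bh = 4, so this takes the minimum segment id over
   * up to a 4x4 region of mode-info units. */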
@@ -42,26 +44,33 @@ static INLINE int get_segment_id(const AV1_COMMON *const cm,
static INLINE int av1_get_spatial_seg_pred(const AV1_COMMON *const cm,
const MACROBLOCKD *const xd,
- int mi_row, int mi_col,
int *cdf_index) {
int prev_ul = -1; // top left segment_id
int prev_l = -1; // left segment_id
int prev_u = -1; // top segment_id
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
+ const CommonModeInfoParams *const mi_params = &cm->mi_params;
+ const uint8_t *seg_map = cm->cur_frame->seg_map;
if ((xd->up_available) && (xd->left_available)) {
- prev_ul = get_segment_id(cm, cm->current_frame_seg_map, BLOCK_4X4,
- mi_row - 1, mi_col - 1);
+ prev_ul =
+ get_segment_id(mi_params, seg_map, BLOCK_4X4, mi_row - 1, mi_col - 1);
}
if (xd->up_available) {
- prev_u = get_segment_id(cm, cm->current_frame_seg_map, BLOCK_4X4,
- mi_row - 1, mi_col - 0);
+ prev_u =
+ get_segment_id(mi_params, seg_map, BLOCK_4X4, mi_row - 1, mi_col - 0);
}
if (xd->left_available) {
- prev_l = get_segment_id(cm, cm->current_frame_seg_map, BLOCK_4X4,
- mi_row - 0, mi_col - 1);
+ prev_l =
+ get_segment_id(mi_params, seg_map, BLOCK_4X4, mi_row - 0, mi_col - 1);
}
+ // This property follows from the fact that get_segment_id() returns a
+ // nonnegative value. This allows us to test for all edge cases with a simple
+ // prev_ul < 0 check.
+ assert(IMPLIES(prev_ul >= 0, prev_u >= 0 && prev_l >= 0));
// Pick CDF index based on number of matching/out-of-bounds segment IDs.
- if (prev_ul < 0 || prev_u < 0 || prev_l < 0) /* Edge case */
+ if (prev_ul < 0) /* Edge cases */
*cdf_index = 0;
else if ((prev_ul == prev_u) && (prev_ul == prev_l))
*cdf_index = 2;
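
Worked examples for the two visible branches (the remaining partial-match cases fall outside this hunk):

  /* Block at the tile's top or left edge: prev_ul stays -1, so *cdf_index = 0.
   * All three neighbours carry the same id (e.g. prev_ul == prev_u ==
   * prev_l == 3): *cdf_index = 2. */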
@@ -90,18 +99,18 @@ static INLINE int av1_get_pred_context_seg_id(const MACROBLOCKD *xd) {
static INLINE int get_comp_index_context(const AV1_COMMON *cm,
const MACROBLOCKD *xd) {
MB_MODE_INFO *mbmi = xd->mi[0];
- int bck_idx = cm->frame_refs[mbmi->ref_frame[0] - LAST_FRAME].idx;
- int fwd_idx = cm->frame_refs[mbmi->ref_frame[1] - LAST_FRAME].idx;
+ const RefCntBuffer *const bck_buf = get_ref_frame_buf(cm, mbmi->ref_frame[0]);
+ const RefCntBuffer *const fwd_buf = get_ref_frame_buf(cm, mbmi->ref_frame[1]);
int bck_frame_index = 0, fwd_frame_index = 0;
- int cur_frame_index = cm->cur_frame->cur_frame_offset;
+ int cur_frame_index = cm->cur_frame->order_hint;
- if (bck_idx >= 0)
- bck_frame_index = cm->buffer_pool->frame_bufs[bck_idx].cur_frame_offset;
+ if (bck_buf != NULL) bck_frame_index = bck_buf->order_hint;
+ if (fwd_buf != NULL) fwd_frame_index = fwd_buf->order_hint;
- if (fwd_idx >= 0)
- fwd_frame_index = cm->buffer_pool->frame_bufs[fwd_idx].cur_frame_offset;
- int fwd = abs(get_relative_dist(cm, fwd_frame_index, cur_frame_index));
- int bck = abs(get_relative_dist(cm, cur_frame_index, bck_frame_index));
+ int fwd = abs(get_relative_dist(&cm->seq_params.order_hint_info,
+ fwd_frame_index, cur_frame_index));
+ int bck = abs(get_relative_dist(&cm->seq_params.order_hint_info,
+ cur_frame_index, bck_frame_index));
const MB_MODE_INFO *const above_mi = xd->above_mbmi;
const MB_MODE_INFO *const left_mi = xd->left_mbmi;
@@ -109,14 +118,14 @@ static INLINE int get_comp_index_context(const AV1_COMMON *cm,
int above_ctx = 0, left_ctx = 0;
const int offset = (fwd == bck);
- if (above_mi) {
+ if (above_mi != NULL) {
if (has_second_ref(above_mi))
above_ctx = above_mi->compound_idx;
else if (above_mi->ref_frame[0] == ALTREF_FRAME)
above_ctx = 1;
}
- if (left_mi) {
+ if (left_mi != NULL) {
if (has_second_ref(left_mi))
left_ctx = left_mi->compound_idx;
else if (left_mi->ref_frame[0] == ALTREF_FRAME)
@@ -178,6 +187,7 @@ int av1_get_palette_cache(const MACROBLOCKD *const xd, int plane,
uint16_t *cache);
static INLINE int av1_get_palette_bsize_ctx(BLOCK_SIZE bsize) {
+ assert(bsize < BLOCK_SIZES_ALL);
return num_pels_log2_lookup[bsize] - num_pels_log2_lookup[BLOCK_8X8];
}
@@ -198,6 +208,10 @@ static INLINE aom_cdf_prob *av1_get_reference_mode_cdf(const MACROBLOCKD *xd) {
return xd->tile_ctx->comp_inter_cdf[av1_get_reference_mode_context(xd)];
}
+static INLINE aom_cdf_prob *av1_get_skip_cdf(const MACROBLOCKD *xd) {
+ return xd->tile_ctx->skip_cdfs[av1_get_skip_context(xd)];
+}
+
int av1_get_comp_reference_type_context(const MACROBLOCKD *xd);
// == Uni-directional contexts ==
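
Minimal usage of the accessor added above (the above/left skip context comes from xd, as in the other av1_get_*_cdf helpers):

  aom_cdf_prob *skip_cdf = av1_get_skip_cdf(xd);  /* CDF for the skip flag */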
diff --git a/media/libaom/src/av1/common/quant_common.c b/media/libaom/src/av1/common/quant_common.c
index 0e14da7a3..e96d71a3b 100644
--- a/media/libaom/src/av1/common/quant_common.c
+++ b/media/libaom/src/av1/common/quant_common.c
@@ -9,14 +9,14 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
+#include "av1/common/av1_common_int.h"
+#include "av1/common/blockd.h"
#include "av1/common/common.h"
-#include "av1/common/onyxc_int.h"
#include "av1/common/entropy.h"
#include "av1/common/quant_common.h"
#include "av1/common/seg_common.h"
-#include "av1/common/blockd.h"
-static const int16_t dc_qlookup_Q3[QINDEX_RANGE] = {
+static const int16_t dc_qlookup_QTX[QINDEX_RANGE] = {
4, 8, 8, 9, 10, 11, 12, 12, 13, 14, 15, 16, 17, 18,
19, 19, 20, 21, 22, 23, 24, 25, 26, 26, 27, 28, 29, 30,
31, 32, 32, 33, 34, 35, 36, 37, 38, 38, 39, 40, 41, 42,
@@ -38,7 +38,7 @@ static const int16_t dc_qlookup_Q3[QINDEX_RANGE] = {
1184, 1232, 1282, 1336,
};
-static const int16_t dc_qlookup_10_Q3[QINDEX_RANGE] = {
+static const int16_t dc_qlookup_10_QTX[QINDEX_RANGE] = {
4, 9, 10, 13, 15, 17, 20, 22, 25, 28, 31, 34, 37,
40, 43, 47, 50, 53, 57, 60, 64, 68, 71, 75, 78, 82,
86, 90, 93, 97, 101, 105, 109, 113, 116, 120, 124, 128, 132,
@@ -61,7 +61,7 @@ static const int16_t dc_qlookup_10_Q3[QINDEX_RANGE] = {
3953, 4089, 4236, 4394, 4559, 4737, 4929, 5130, 5347,
};
-static const int16_t dc_qlookup_12_Q3[QINDEX_RANGE] = {
+static const int16_t dc_qlookup_12_QTX[QINDEX_RANGE] = {
4, 12, 18, 25, 33, 41, 50, 60, 70, 80, 91,
103, 115, 127, 140, 153, 166, 180, 194, 208, 222, 237,
251, 266, 281, 296, 312, 327, 343, 358, 374, 390, 405,
@@ -88,7 +88,7 @@ static const int16_t dc_qlookup_12_Q3[QINDEX_RANGE] = {
19718, 20521, 21387,
};
-static const int16_t ac_qlookup_Q3[QINDEX_RANGE] = {
+static const int16_t ac_qlookup_QTX[QINDEX_RANGE] = {
4, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45,
@@ -111,7 +111,7 @@ static const int16_t ac_qlookup_Q3[QINDEX_RANGE] = {
1567, 1597, 1628, 1660, 1692, 1725, 1759, 1793, 1828,
};
-static const int16_t ac_qlookup_10_Q3[QINDEX_RANGE] = {
+static const int16_t ac_qlookup_10_QTX[QINDEX_RANGE] = {
4, 9, 11, 13, 16, 18, 21, 24, 27, 30, 33, 37, 40,
44, 48, 51, 55, 59, 63, 67, 71, 75, 79, 83, 88, 92,
96, 100, 105, 109, 114, 118, 122, 127, 131, 136, 140, 145, 149,
@@ -134,7 +134,7 @@ static const int16_t ac_qlookup_10_Q3[QINDEX_RANGE] = {
6268, 6388, 6512, 6640, 6768, 6900, 7036, 7172, 7312,
};
-static const int16_t ac_qlookup_12_Q3[QINDEX_RANGE] = {
+static const int16_t ac_qlookup_12_QTX[QINDEX_RANGE] = {
4, 13, 19, 27, 35, 44, 54, 64, 75, 87, 99,
112, 126, 139, 154, 168, 183, 199, 214, 230, 247, 263,
280, 297, 314, 331, 349, 366, 384, 402, 420, 438, 456,
@@ -190,39 +190,30 @@ static const int16_t ac_qlookup_12_Q3[QINDEX_RANGE] = {
// addition, the minimum allowable quantizer is 4; smaller values will
// underflow to 0 in the actual quantization routines.
-int16_t av1_dc_quant_Q3(int qindex, int delta, aom_bit_depth_t bit_depth) {
+int16_t av1_dc_quant_QTX(int qindex, int delta, aom_bit_depth_t bit_depth) {
+ const int q_clamped = clamp(qindex + delta, 0, MAXQ);
switch (bit_depth) {
- case AOM_BITS_8: return dc_qlookup_Q3[clamp(qindex + delta, 0, MAXQ)];
- case AOM_BITS_10: return dc_qlookup_10_Q3[clamp(qindex + delta, 0, MAXQ)];
- case AOM_BITS_12: return dc_qlookup_12_Q3[clamp(qindex + delta, 0, MAXQ)];
+ case AOM_BITS_8: return dc_qlookup_QTX[q_clamped];
+ case AOM_BITS_10: return dc_qlookup_10_QTX[q_clamped];
+ case AOM_BITS_12: return dc_qlookup_12_QTX[q_clamped];
default:
assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12");
return -1;
}
}
-int16_t av1_ac_quant_Q3(int qindex, int delta, aom_bit_depth_t bit_depth) {
+int16_t av1_ac_quant_QTX(int qindex, int delta, aom_bit_depth_t bit_depth) {
+ const int q_clamped = clamp(qindex + delta, 0, MAXQ);
switch (bit_depth) {
- case AOM_BITS_8: return ac_qlookup_Q3[clamp(qindex + delta, 0, MAXQ)];
- case AOM_BITS_10: return ac_qlookup_10_Q3[clamp(qindex + delta, 0, MAXQ)];
- case AOM_BITS_12: return ac_qlookup_12_Q3[clamp(qindex + delta, 0, MAXQ)];
+ case AOM_BITS_8: return ac_qlookup_QTX[q_clamped];
+ case AOM_BITS_10: return ac_qlookup_10_QTX[q_clamped];
+ case AOM_BITS_12: return ac_qlookup_12_QTX[q_clamped];
default:
assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12");
return -1;
}
}
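
A quick worked example of the clamped lookup, using the last entries of the 8-bit tables shown above:

  /* qindex = 250, delta = 10, AOM_BITS_8:
   *   q_clamped = clamp(260, 0, MAXQ) = 255
   *   av1_dc_quant_QTX(250, 10, AOM_BITS_8) == dc_qlookup_QTX[255] == 1336
   *   av1_ac_quant_QTX(250, 10, AOM_BITS_8) == ac_qlookup_QTX[255] == 1828
   */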
-// In AV1 TX, the coefficients are always scaled up a factor of 8 (3
-// bits), so QTX == Q3.
-
-int16_t av1_dc_quant_QTX(int qindex, int delta, aom_bit_depth_t bit_depth) {
- return av1_dc_quant_Q3(qindex, delta, bit_depth);
-}
-
-int16_t av1_ac_quant_QTX(int qindex, int delta, aom_bit_depth_t bit_depth) {
- return av1_ac_quant_Q3(qindex, delta, bit_depth);
-}
-
int av1_get_qindex(const struct segmentation *seg, int segment_id,
int base_qindex) {
if (segfeature_active(seg, segment_id, SEG_LVL_ALT_Q)) {
@@ -234,39 +225,82 @@ int av1_get_qindex(const struct segmentation *seg, int segment_id,
}
}
-const qm_val_t *av1_iqmatrix(AV1_COMMON *cm, int qmlevel, int plane,
- TX_SIZE tx_size) {
- return &cm->giqmatrix[qmlevel][plane][tx_size][0];
+bool av1_use_qmatrix(const CommonQuantParams *quant_params,
+ const struct macroblockd *xd, int segment_id) {
+ // True if explicit Q matrix levels and this is not a lossless segment.
+ return quant_params->using_qmatrix && !xd->lossless[segment_id];
}
-const qm_val_t *av1_qmatrix(AV1_COMMON *cm, int qmlevel, int plane,
- TX_SIZE tx_size) {
- return &cm->gqmatrix[qmlevel][plane][tx_size][0];
+
+const qm_val_t *av1_iqmatrix(const CommonQuantParams *quant_params, int qmlevel,
+ int plane, TX_SIZE tx_size) {
+ assert(quant_params->giqmatrix[qmlevel][plane][tx_size] != NULL ||
+ qmlevel == NUM_QM_LEVELS - 1);
+ return quant_params->giqmatrix[qmlevel][plane][tx_size];
+}
+const qm_val_t *av1_qmatrix(const CommonQuantParams *quant_params, int qmlevel,
+ int plane, TX_SIZE tx_size) {
+ assert(quant_params->gqmatrix[qmlevel][plane][tx_size] != NULL ||
+ qmlevel == NUM_QM_LEVELS - 1);
+ return quant_params->gqmatrix[qmlevel][plane][tx_size];
+}
+
+// Returns true if the tx_type corresponds to non-identity transform in both
+// horizontal and vertical directions.
+static INLINE bool is_2d_transform(TX_TYPE tx_type) { return (tx_type < IDTX); }
+
+const qm_val_t *av1_get_iqmatrix(const CommonQuantParams *quant_params,
+ const MACROBLOCKD *xd, int plane,
+ TX_SIZE tx_size, TX_TYPE tx_type) {
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const MB_MODE_INFO *const mbmi = xd->mi[0];
+ const int seg_id = mbmi->segment_id;
+ const TX_SIZE qm_tx_size = av1_get_adjusted_tx_size(tx_size);
+ // Use a flat matrix (i.e. no weighting) for 1D and Identity transforms
+ return is_2d_transform(tx_type)
+ ? pd->seg_iqmatrix[seg_id][qm_tx_size]
+ : quant_params->giqmatrix[NUM_QM_LEVELS - 1][0][qm_tx_size];
+}
+
+const qm_val_t *av1_get_qmatrix(const CommonQuantParams *quant_params,
+ const MACROBLOCKD *xd, int plane,
+ TX_SIZE tx_size, TX_TYPE tx_type) {
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const MB_MODE_INFO *const mbmi = xd->mi[0];
+ const int seg_id = mbmi->segment_id;
+ const TX_SIZE qm_tx_size = av1_get_adjusted_tx_size(tx_size);
+ // Use a flat matrix (i.e. no weighting) for 1D and Identity transforms
+ return is_2d_transform(tx_type)
+ ? pd->seg_qmatrix[seg_id][qm_tx_size]
+ : quant_params->gqmatrix[NUM_QM_LEVELS - 1][0][qm_tx_size];
}
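The accessor pair above, together with av1_use_qmatrix(), forms the whole per-block matrix selection path. A hedged sketch of a caller, using only types and functions that appear in this hunk (the wrapper name itself is illustrative):

/* Sketch only: return the inverse quant matrix for one block, or NULL when
   quantization matrices are disabled or the segment is lossless.  Real
   callers in the codec handle the no-matrix case differently. */
static const qm_val_t *example_pick_iqmatrix(
    const CommonQuantParams *quant_params, const MACROBLOCKD *xd, int plane,
    TX_SIZE tx_size, TX_TYPE tx_type) {
  const int segment_id = xd->mi[0]->segment_id;
  if (!av1_use_qmatrix(quant_params, xd, segment_id)) return NULL;
  return av1_get_iqmatrix(quant_params, xd, plane, tx_size, tx_type);
}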
#define QM_TOTAL_SIZE 3344
-static const qm_val_t wt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE];
-static const qm_val_t iwt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE];
+// We only use wt_matrix_ref[q] and iwt_matrix_ref[q]
+// for q = 0, ..., NUM_QM_LEVELS - 2.
+static const qm_val_t wt_matrix_ref[NUM_QM_LEVELS - 1][2][QM_TOTAL_SIZE];
+static const qm_val_t iwt_matrix_ref[NUM_QM_LEVELS - 1][2][QM_TOTAL_SIZE];
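As an aside, the 3344 in QM_TOTAL_SIZE matches the sum of the 2-D areas of the distinct adjusted TX sizes that actually get stored (sizes wider or taller than 32 reuse the 32-limited matrices): 16 + 64 + 256 + 1024 for the squares, plus 2*(32 + 64 + 128 + 256 + 512) for the rectangular pairs, i.e. 1360 + 1984 = 3344. This is a back-of-the-envelope check against the loop in av1_qm_init() below, not text carried over from the patch.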
-void av1_qm_init(AV1_COMMON *cm) {
- const int num_planes = av1_num_planes(cm);
- int q, c, t;
- int current;
- for (q = 0; q < NUM_QM_LEVELS; ++q) {
- for (c = 0; c < num_planes; ++c) {
- current = 0;
- for (t = 0; t < TX_SIZES_ALL; ++t) {
+void av1_qm_init(CommonQuantParams *quant_params, int num_planes) {
+ for (int q = 0; q < NUM_QM_LEVELS; ++q) {
+ for (int c = 0; c < num_planes; ++c) {
+ int current = 0;
+ for (int t = 0; t < TX_SIZES_ALL; ++t) {
const int size = tx_size_2d[t];
const int qm_tx_size = av1_get_adjusted_tx_size(t);
if (q == NUM_QM_LEVELS - 1) {
- cm->gqmatrix[q][c][t] = NULL;
- cm->giqmatrix[q][c][t] = NULL;
+ quant_params->gqmatrix[q][c][t] = NULL;
+ quant_params->giqmatrix[q][c][t] = NULL;
} else if (t != qm_tx_size) { // Reuse matrices for 'qm_tx_size'
- cm->gqmatrix[q][c][t] = cm->gqmatrix[q][c][qm_tx_size];
- cm->giqmatrix[q][c][t] = cm->giqmatrix[q][c][qm_tx_size];
+ assert(t > qm_tx_size);
+ quant_params->gqmatrix[q][c][t] =
+ quant_params->gqmatrix[q][c][qm_tx_size];
+ quant_params->giqmatrix[q][c][t] =
+ quant_params->giqmatrix[q][c][qm_tx_size];
} else {
assert(current + size <= QM_TOTAL_SIZE);
- cm->gqmatrix[q][c][t] = &wt_matrix_ref[q][c >= 1][current];
- cm->giqmatrix[q][c][t] = &iwt_matrix_ref[q][c >= 1][current];
+ quant_params->gqmatrix[q][c][t] = &wt_matrix_ref[q][c >= 1][current];
+ quant_params->giqmatrix[q][c][t] =
+ &iwt_matrix_ref[q][c >= 1][current];
current += size;
}
}
@@ -274,7 +308,7 @@ void av1_qm_init(AV1_COMMON *cm) {
}
}
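The initializer only needs to run once per codec instance; after it, every (level, plane, tx_size) triple points at the right slice of the reference tables. A hedged setup sketch; MAX_MB_PLANE is assumed to be the usual three-plane constant from the common headers and is not part of this hunk:

/* Sketch only: one-time wiring of the global QM pointer tables.  Monochrome
   streams would pass a single plane. */
static void example_qm_setup(CommonQuantParams *quant_params, int monochrome) {
  const int num_planes = monochrome ? 1 : MAX_MB_PLANE;
  av1_qm_init(quant_params, num_planes);
}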
-/* Provide 16 sets of quantization matrices for chroma and luma
+/* Provide 15 sets of quantization matrices for chroma and luma
and each TX size. Matrices for different TX sizes are in fact
sub-sampled from the 32x32 and 16x16 sizes, but explicitly
defined here for convenience. Intra and inter matrix sets are the
@@ -283,9 +317,10 @@ void av1_qm_init(AV1_COMMON *cm) {
frame.
Matrices for different QM levels have been rescaled in the
frequency domain according to different nominal viewing
- distances.
+ distances. Matrices for QM level 15 are omitted because they are
+ not used.
*/
-static const qm_val_t iwt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = {
+static const qm_val_t iwt_matrix_ref[NUM_QM_LEVELS - 1][2][QM_TOTAL_SIZE] = {
{
{ /* Luma */
/* Size 4x4 */
@@ -6633,427 +6668,9 @@ static const qm_val_t iwt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = {
31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32,
32, 32, 32, 32 },
},
- {
- { /* Luma */
- /* Size 4x4 */
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- /* Size 8x8 */
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- /* Size 16x16 */
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32,
- /* Size 32x32 */
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- /* Size 4x8 */
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- /* Size 8x4 */
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- /* Size 8x16 */
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32,
- /* Size 16x8 */
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32,
- /* Size 16x32 */
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32,
- /* Size 32x16 */
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32,
- /* Size 4x16 */
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- /* Size 16x4 */
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- /* Size 8x32 */
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32,
- /* Size 32x8 */
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32 },
- { /* Chroma */
- /* Size 4x4 */
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- /* Size 8x8 */
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- /* Size 16x16 */
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32,
- /* Size 32x32 */
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- /* Size 4x8 */
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- /* Size 8x4 */
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- /* Size 8x16 */
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32,
- /* Size 16x8 */
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32,
- /* Size 16x32 */
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32,
- /* Size 32x16 */
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32,
- /* Size 4x16 */
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- /* Size 16x4 */
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- /* Size 8x32 */
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32,
- /* Size 32x8 */
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32 },
- },
};
-static const qm_val_t wt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = {
+static const qm_val_t wt_matrix_ref[NUM_QM_LEVELS - 1][2][QM_TOTAL_SIZE] = {
{
{ /* Luma */
/* Size 4x4 */
@@ -13255,422 +12872,4 @@ static const qm_val_t wt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = {
33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32,
32, 32, 32, 32 },
},
- {
- { /* Luma */
- /* Size 4x4 */
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- /* Size 8x8 */
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- /* Size 16x16 */
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32,
- /* Size 32x32 */
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- /* Size 4x8 */
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- /* Size 8x4 */
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- /* Size 8x16 */
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32,
- /* Size 16x8 */
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32,
- /* Size 16x32 */
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32,
- /* Size 32x16 */
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32,
- /* Size 4x16 */
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- /* Size 16x4 */
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- /* Size 8x32 */
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32,
- /* Size 32x8 */
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32 },
- { /* Chroma */
- /* Size 4x4 */
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- /* Size 8x8 */
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- /* Size 16x16 */
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32,
- /* Size 32x32 */
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- /* Size 4x8 */
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- /* Size 8x4 */
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- /* Size 8x16 */
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32,
- /* Size 16x8 */
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32,
- /* Size 16x32 */
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32,
- /* Size 32x16 */
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32,
- /* Size 4x16 */
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- /* Size 16x4 */
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- /* Size 8x32 */
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32,
- /* Size 32x8 */
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32 },
- },
};
diff --git a/media/libaom/src/av1/common/quant_common.h b/media/libaom/src/av1/common/quant_common.h
index d1f52a660..9c30204ff 100644
--- a/media/libaom/src/av1/common/quant_common.h
+++ b/media/libaom/src/av1/common/quant_common.h
@@ -12,6 +12,7 @@
#ifndef AOM_AV1_COMMON_QUANT_COMMON_H_
#define AOM_AV1_COMMON_QUANT_COMMON_H_
+#include <stdbool.h>
#include "aom/aom_codec.h"
#include "av1/common/seg_common.h"
#include "av1/common/enums.h"
@@ -37,24 +38,43 @@ extern "C" {
#define DEFAULT_QM_LAST 9
struct AV1Common;
+struct CommonQuantParams;
+struct macroblockd;
-int16_t av1_dc_quant_Q3(int qindex, int delta, aom_bit_depth_t bit_depth);
-int16_t av1_ac_quant_Q3(int qindex, int delta, aom_bit_depth_t bit_depth);
int16_t av1_dc_quant_QTX(int qindex, int delta, aom_bit_depth_t bit_depth);
int16_t av1_ac_quant_QTX(int qindex, int delta, aom_bit_depth_t bit_depth);
int av1_get_qindex(const struct segmentation *seg, int segment_id,
int base_qindex);
+
+// Returns true if we are using a quantization matrix.
+bool av1_use_qmatrix(const struct CommonQuantParams *quant_params,
+ const struct macroblockd *xd, int segment_id);
+
// Reduce the large number of quantizers to a smaller number of levels for which
// different matrices may be defined
static INLINE int aom_get_qmlevel(int qindex, int first, int last) {
return first + (qindex * (last + 1 - first)) / QINDEX_RANGE;
}
-void av1_qm_init(struct AV1Common *cm);
-const qm_val_t *av1_iqmatrix(struct AV1Common *cm, int qindex, int comp,
- TX_SIZE tx_size);
-const qm_val_t *av1_qmatrix(struct AV1Common *cm, int qindex, int comp,
- TX_SIZE tx_size);
+
+// Initialize all global quant/dequant matrices.
+void av1_qm_init(struct CommonQuantParams *quant_params, int num_planes);
+
+// Get global dequant matrix.
+const qm_val_t *av1_iqmatrix(const struct CommonQuantParams *quant_params,
+ int qmlevel, int plane, TX_SIZE tx_size);
+// Get global quant matrix.
+const qm_val_t *av1_qmatrix(const struct CommonQuantParams *quant_params,
+ int qmlevel, int plane, TX_SIZE tx_size);
+
+// Get either local / global dequant matrix as appropriate.
+const qm_val_t *av1_get_iqmatrix(const struct CommonQuantParams *quant_params,
+ const struct macroblockd *xd, int plane,
+ TX_SIZE tx_size, TX_TYPE tx_type);
+// Get either local / global quant matrix as appropriate.
+const qm_val_t *av1_get_qmatrix(const struct CommonQuantParams *quant_params,
+ const struct macroblockd *xd, int plane,
+ TX_SIZE tx_size, TX_TYPE tx_type);
#ifdef __cplusplus
} // extern "C"
diff --git a/media/libaom/src/av1/common/reconinter.c b/media/libaom/src/av1/common/reconinter.c
index 3203efce4..287adddcc 100644
--- a/media/libaom/src/av1/common/reconinter.c
+++ b/media/libaom/src/av1/common/reconinter.c
@@ -20,25 +20,24 @@
#include "aom/aom_integer.h"
#include "aom_dsp/blend.h"
+#include "av1/common/av1_common_int.h"
#include "av1/common/blockd.h"
#include "av1/common/mvref_common.h"
+#include "av1/common/obmc.h"
#include "av1/common/reconinter.h"
#include "av1/common/reconintra.h"
-#include "av1/common/onyxc_int.h"
-#include "av1/common/obmc.h"
-
-#define USE_PRECOMPUTED_WEDGE_MASK 1
-#define USE_PRECOMPUTED_WEDGE_SIGN 1
// This function will determine whether or not to create a warped
// prediction.
int av1_allow_warp(const MB_MODE_INFO *const mbmi,
const WarpTypesAllowed *const warp_types,
const WarpedMotionParams *const gm_params,
- int build_for_obmc, int x_scale, int y_scale,
+ int build_for_obmc, const struct scale_factors *const sf,
WarpedMotionParams *final_warp_params) {
- if (x_scale != SCALE_SUBPEL_SHIFTS || y_scale != SCALE_SUBPEL_SHIFTS)
- return 0;
+ // Note: As per the spec, we must test the fixed point scales here, which are
+ // at a higher precision (1 << 14) than the xs and ys in subpel_params (that
+ // have 1 << 10 precision).
+ if (av1_is_scaled(sf)) return 0;
if (final_warp_params != NULL) *final_warp_params = default_warp_params;
@@ -57,48 +56,114 @@ int av1_allow_warp(const MB_MODE_INFO *const mbmi,
return 0;
}
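
For context (not part of the patch), the precision the comment above refers to: libaom stores reference scale factors as fixed-point fractions with REF_SCALE_SHIFT (14) fractional bits, while the xs/ys fields of SubpelParams only carry SCALE_SUBPEL_BITS (10). A sketch of what av1_is_scaled() effectively tests, with the constant and field names assumed to match av1/common/scale.h:

/* Sketch only; the real av1_is_scaled() also validates the scale factors.
 * kNoScale assumes REF_SCALE_SHIFT == 14. */
static int is_scaled_sketch(const struct scale_factors *sf) {
  const int kNoScale = 1 << 14;  /* an unscaled reference has both factors here */
  return sf->x_scale_fp != kNoScale || sf->y_scale_fp != kNoScale;
}
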
-void av1_make_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst,
- int dst_stride, const SubpelParams *subpel_params,
- const struct scale_factors *sf, int w, int h,
- ConvolveParams *conv_params,
- InterpFilters interp_filters,
- const WarpTypesAllowed *warp_types, int p_col,
- int p_row, int plane, int ref,
- const MB_MODE_INFO *mi, int build_for_obmc,
- const MACROBLOCKD *xd, int can_use_previous) {
- // Make sure the selected motion mode is valid for this configuration
- assert_motion_mode_valid(mi->motion_mode, xd->global_motion, xd, mi,
- can_use_previous);
- assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));
-
- WarpedMotionParams final_warp_params;
- const int do_warp =
- (w >= 8 && h >= 8 &&
- av1_allow_warp(mi, warp_types, &xd->global_motion[mi->ref_frame[ref]],
- build_for_obmc, subpel_params->xs, subpel_params->ys,
- &final_warp_params));
- const int is_intrabc = mi->use_intrabc;
- assert(IMPLIES(is_intrabc, !do_warp));
-
- if (do_warp && xd->cur_frame_force_integer_mv == 0) {
- const struct macroblockd_plane *const pd = &xd->plane[plane];
- const struct buf_2d *const pre_buf = &pd->pre[ref];
- av1_warp_plane(&final_warp_params,
- xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH, xd->bd,
- pre_buf->buf0, pre_buf->width, pre_buf->height,
- pre_buf->stride, dst, p_col, p_row, w, h, dst_stride,
- pd->subsampling_x, pd->subsampling_y, conv_params);
- } else if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- highbd_inter_predictor(src, src_stride, dst, dst_stride, subpel_params, sf,
- w, h, conv_params, interp_filters, is_intrabc,
- xd->bd);
+void av1_init_inter_params(InterPredParams *inter_pred_params, int block_width,
+ int block_height, int pix_row, int pix_col,
+ int subsampling_x, int subsampling_y, int bit_depth,
+ int use_hbd_buf, int is_intrabc,
+ const struct scale_factors *sf,
+ const struct buf_2d *ref_buf,
+ int_interpfilters interp_filters) {
+ inter_pred_params->block_width = block_width;
+ inter_pred_params->block_height = block_height;
+ inter_pred_params->pix_row = pix_row;
+ inter_pred_params->pix_col = pix_col;
+ inter_pred_params->subsampling_x = subsampling_x;
+ inter_pred_params->subsampling_y = subsampling_y;
+ inter_pred_params->bit_depth = bit_depth;
+ inter_pred_params->use_hbd_buf = use_hbd_buf;
+ inter_pred_params->is_intrabc = is_intrabc;
+ inter_pred_params->scale_factors = sf;
+ inter_pred_params->ref_frame_buf = *ref_buf;
+ inter_pred_params->mode = TRANSLATION_PRED;
+ inter_pred_params->comp_mode = UNIFORM_SINGLE;
+
+ if (is_intrabc) {
+ inter_pred_params->interp_filter_params[0] = &av1_intrabc_filter_params;
+ inter_pred_params->interp_filter_params[1] = &av1_intrabc_filter_params;
} else {
- inter_predictor(src, src_stride, dst, dst_stride, subpel_params, sf, w, h,
- conv_params, interp_filters, is_intrabc);
+ inter_pred_params->interp_filter_params[0] =
+ av1_get_interp_filter_params_with_block_size(
+ interp_filters.as_filters.x_filter, block_width);
+ inter_pred_params->interp_filter_params[1] =
+ av1_get_interp_filter_params_with_block_size(
+ interp_filters.as_filters.y_filter, block_height);
+ }
+}
+
+void av1_init_comp_mode(InterPredParams *inter_pred_params) {
+ inter_pred_params->comp_mode = UNIFORM_COMP;
+}
+
+void av1_init_warp_params(InterPredParams *inter_pred_params,
+ const WarpTypesAllowed *warp_types, int ref,
+ const MACROBLOCKD *xd, const MB_MODE_INFO *mi) {
+ if (inter_pred_params->block_height < 8 || inter_pred_params->block_width < 8)
+ return;
+
+ if (xd->cur_frame_force_integer_mv) return;
+
+ if (av1_allow_warp(mi, warp_types, &xd->global_motion[mi->ref_frame[ref]], 0,
+ inter_pred_params->scale_factors,
+ &inter_pred_params->warp_params))
+ inter_pred_params->mode = WARP_PRED;
+}
+
+void av1_init_mask_comp(InterPredParams *inter_pred_params, BLOCK_SIZE bsize,
+ const INTERINTER_COMPOUND_DATA *mask_comp) {
+ inter_pred_params->sb_type = bsize;
+ inter_pred_params->mask_comp = *mask_comp;
+
+ if (inter_pred_params->conv_params.compound_index == 1) {
+ inter_pred_params->conv_params.do_average = 0;
+ inter_pred_params->comp_mode = MASK_COMP;
+ }
+}
+
+void av1_make_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst,
+ int dst_stride,
+ InterPredParams *inter_pred_params,
+ const SubpelParams *subpel_params) {
+ assert(IMPLIES(inter_pred_params->conv_params.is_compound,
+ inter_pred_params->conv_params.dst != NULL));
+
+ // TODO(jingning): av1_warp_plane() can be further cleaned up.
+ if (inter_pred_params->mode == WARP_PRED) {
+ av1_warp_plane(
+ &inter_pred_params->warp_params, inter_pred_params->use_hbd_buf,
+ inter_pred_params->bit_depth, inter_pred_params->ref_frame_buf.buf0,
+ inter_pred_params->ref_frame_buf.width,
+ inter_pred_params->ref_frame_buf.height,
+ inter_pred_params->ref_frame_buf.stride, dst,
+ inter_pred_params->pix_col, inter_pred_params->pix_row,
+ inter_pred_params->block_width, inter_pred_params->block_height,
+ dst_stride, inter_pred_params->subsampling_x,
+ inter_pred_params->subsampling_y, &inter_pred_params->conv_params);
+ } else if (inter_pred_params->mode == TRANSLATION_PRED) {
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (inter_pred_params->use_hbd_buf) {
+ highbd_inter_predictor(
+ src, src_stride, dst, dst_stride, subpel_params,
+ inter_pred_params->scale_factors, inter_pred_params->block_width,
+ inter_pred_params->block_height, &inter_pred_params->conv_params,
+ inter_pred_params->interp_filter_params,
+ inter_pred_params->bit_depth);
+ } else {
+ inter_predictor(
+ src, src_stride, dst, dst_stride, subpel_params,
+ inter_pred_params->scale_factors, inter_pred_params->block_width,
+ inter_pred_params->block_height, &inter_pred_params->conv_params,
+ inter_pred_params->interp_filter_params);
+ }
+#else
+ inter_predictor(
+ src, src_stride, dst, dst_stride, subpel_params,
+ inter_pred_params->scale_factors, inter_pred_params->block_width,
+ inter_pred_params->block_height, &inter_pred_params->conv_params,
+ inter_pred_params->interp_filter_params);
+#endif
}
}
-#if USE_PRECOMPUTED_WEDGE_MASK
static const uint8_t wedge_master_oblique_odd[MASK_MASTER_SIZE] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 6, 18,
@@ -118,7 +183,8 @@ static const uint8_t wedge_master_vertical[MASK_MASTER_SIZE] = {
64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
};
-static void shift_copy(const uint8_t *src, uint8_t *dst, int shift, int width) {
+static AOM_INLINE void shift_copy(const uint8_t *src, uint8_t *dst, int shift,
+ int width) {
if (shift >= 0) {
memcpy(dst + shift, src, width - shift);
memset(dst, src[0], shift);
@@ -128,9 +194,7 @@ static void shift_copy(const uint8_t *src, uint8_t *dst, int shift, int width) {
memset(dst + width - shift, src[width - 1], shift);
}
}
-#endif // USE_PRECOMPUTED_WEDGE_MASK
-#if USE_PRECOMPUTED_WEDGE_SIGN
/* clang-format off */
DECLARE_ALIGNED(16, static uint8_t,
wedge_signflip_lookup[BLOCK_SIZES_ALL][MAX_WEDGE_TYPES]) = {
@@ -158,10 +222,6 @@ DECLARE_ALIGNED(16, static uint8_t,
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, // not used
};
/* clang-format on */
-#else
-DECLARE_ALIGNED(16, static uint8_t,
- wedge_signflip_lookup[BLOCK_SIZES_ALL][MAX_WEDGE_TYPES]);
-#endif // USE_PRECOMPUTED_WEDGE_SIGN
// [negative][direction]
DECLARE_ALIGNED(
@@ -173,6 +233,10 @@ DECLARE_ALIGNED(
DECLARE_ALIGNED(16, static uint8_t,
wedge_mask_buf[2 * MAX_WEDGE_TYPES * 4 * MAX_WEDGE_SQUARE]);
+DECLARE_ALIGNED(16, static uint8_t,
+ smooth_interintra_mask_buf[INTERINTRA_MODES][BLOCK_SIZES_ALL]
+ [MAX_WEDGE_SQUARE]);
+
static wedge_masks_type wedge_masks[BLOCK_SIZES_ALL][2];
static const wedge_code_type wedge_codebook_16_hgtw[16] = {
@@ -208,23 +272,23 @@ static const wedge_code_type wedge_codebook_16_heqw[16] = {
{ WEDGE_OBLIQUE117, 2, 4 }, { WEDGE_OBLIQUE117, 6, 4 },
};
-const wedge_params_type wedge_params_lookup[BLOCK_SIZES_ALL] = {
+const wedge_params_type av1_wedge_params_lookup[BLOCK_SIZES_ALL] = {
{ 0, NULL, NULL, NULL },
{ 0, NULL, NULL, NULL },
{ 0, NULL, NULL, NULL },
- { 4, wedge_codebook_16_heqw, wedge_signflip_lookup[BLOCK_8X8],
+ { MAX_WEDGE_TYPES, wedge_codebook_16_heqw, wedge_signflip_lookup[BLOCK_8X8],
wedge_masks[BLOCK_8X8] },
- { 4, wedge_codebook_16_hgtw, wedge_signflip_lookup[BLOCK_8X16],
+ { MAX_WEDGE_TYPES, wedge_codebook_16_hgtw, wedge_signflip_lookup[BLOCK_8X16],
wedge_masks[BLOCK_8X16] },
- { 4, wedge_codebook_16_hltw, wedge_signflip_lookup[BLOCK_16X8],
+ { MAX_WEDGE_TYPES, wedge_codebook_16_hltw, wedge_signflip_lookup[BLOCK_16X8],
wedge_masks[BLOCK_16X8] },
- { 4, wedge_codebook_16_heqw, wedge_signflip_lookup[BLOCK_16X16],
+ { MAX_WEDGE_TYPES, wedge_codebook_16_heqw, wedge_signflip_lookup[BLOCK_16X16],
wedge_masks[BLOCK_16X16] },
- { 4, wedge_codebook_16_hgtw, wedge_signflip_lookup[BLOCK_16X32],
+ { MAX_WEDGE_TYPES, wedge_codebook_16_hgtw, wedge_signflip_lookup[BLOCK_16X32],
wedge_masks[BLOCK_16X32] },
- { 4, wedge_codebook_16_hltw, wedge_signflip_lookup[BLOCK_32X16],
+ { MAX_WEDGE_TYPES, wedge_codebook_16_hltw, wedge_signflip_lookup[BLOCK_32X16],
wedge_masks[BLOCK_32X16] },
- { 4, wedge_codebook_16_heqw, wedge_signflip_lookup[BLOCK_32X32],
+ { MAX_WEDGE_TYPES, wedge_codebook_16_heqw, wedge_signflip_lookup[BLOCK_32X32],
wedge_masks[BLOCK_32X32] },
{ 0, NULL, NULL, NULL },
{ 0, NULL, NULL, NULL },
@@ -234,9 +298,9 @@ const wedge_params_type wedge_params_lookup[BLOCK_SIZES_ALL] = {
{ 0, NULL, NULL, NULL },
{ 0, NULL, NULL, NULL },
{ 0, NULL, NULL, NULL },
- { 4, wedge_codebook_16_hgtw, wedge_signflip_lookup[BLOCK_8X32],
+ { MAX_WEDGE_TYPES, wedge_codebook_16_hgtw, wedge_signflip_lookup[BLOCK_8X32],
wedge_masks[BLOCK_8X32] },
- { 4, wedge_codebook_16_hltw, wedge_signflip_lookup[BLOCK_32X8],
+ { MAX_WEDGE_TYPES, wedge_codebook_16_hltw, wedge_signflip_lookup[BLOCK_32X8],
wedge_masks[BLOCK_32X8] },
{ 0, NULL, NULL, NULL },
{ 0, NULL, NULL, NULL },
@@ -248,12 +312,12 @@ static const uint8_t *get_wedge_mask_inplace(int wedge_index, int neg,
const int bh = block_size_high[sb_type];
const int bw = block_size_wide[sb_type];
const wedge_code_type *a =
- wedge_params_lookup[sb_type].codebook + wedge_index;
+ av1_wedge_params_lookup[sb_type].codebook + wedge_index;
int woff, hoff;
- const uint8_t wsignflip = wedge_params_lookup[sb_type].signflip[wedge_index];
+ const uint8_t wsignflip =
+ av1_wedge_params_lookup[sb_type].signflip[wedge_index];
- assert(wedge_index >= 0 &&
- wedge_index < (1 << get_wedge_bits_lookup(sb_type)));
+ assert(wedge_index >= 0 && wedge_index < get_wedge_types_lookup(sb_type));
woff = (a->x_offset * bw) >> 3;
hoff = (a->y_offset * bh) >> 3;
master = wedge_mask_obl[neg ^ wsignflip][a->direction] +
@@ -275,10 +339,10 @@ const uint8_t *av1_get_compound_type_mask(
}
}
-static void diffwtd_mask_d16(uint8_t *mask, int which_inverse, int mask_base,
- const CONV_BUF_TYPE *src0, int src0_stride,
- const CONV_BUF_TYPE *src1, int src1_stride, int h,
- int w, ConvolveParams *conv_params, int bd) {
+static AOM_INLINE void diffwtd_mask_d16(
+ uint8_t *mask, int which_inverse, int mask_base, const CONV_BUF_TYPE *src0,
+ int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w,
+ ConvolveParams *conv_params, int bd) {
int round =
2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1 + (bd - 8);
int i, j, m, diff;
@@ -309,9 +373,10 @@ void av1_build_compound_diffwtd_mask_d16_c(
}
}
-static void diffwtd_mask(uint8_t *mask, int which_inverse, int mask_base,
- const uint8_t *src0, int src0_stride,
- const uint8_t *src1, int src1_stride, int h, int w) {
+static AOM_INLINE void diffwtd_mask(uint8_t *mask, int which_inverse,
+ int mask_base, const uint8_t *src0,
+ int src0_stride, const uint8_t *src1,
+ int src1_stride, int h, int w) {
int i, j, m, diff;
for (i = 0; i < h; ++i) {
for (j = 0; j < w; ++j) {
@@ -419,13 +484,12 @@ void av1_build_compound_diffwtd_mask_highbd_c(
}
}
-static void init_wedge_master_masks() {
+static AOM_INLINE void init_wedge_master_masks() {
int i, j;
const int w = MASK_MASTER_SIZE;
const int h = MASK_MASTER_SIZE;
const int stride = MASK_MASTER_STRIDE;
-// Note: index [0] stores the masters, and [1] its complement.
-#if USE_PRECOMPUTED_WEDGE_MASK
+ // Note: index [0] stores the masters, and [1] its complement.
// Generate prototype by shifting the masters
int shift = h / 4;
for (i = 0; i < h; i += 2) {
@@ -443,22 +507,7 @@ static void init_wedge_master_masks() {
wedge_master_vertical,
MASK_MASTER_SIZE * sizeof(wedge_master_vertical[0]));
}
-#else
- static const double smoother_param = 2.85;
- const int a[2] = { 2, 1 };
- const double asqrt = sqrt(a[0] * a[0] + a[1] * a[1]);
- for (i = 0; i < h; i++) {
- for (j = 0; j < w; ++j) {
- int x = (2 * j + 1 - w);
- int y = (2 * i + 1 - h);
- double d = (a[0] * x + a[1] * y) / asqrt;
- const int msk = (int)rint((1.0 + tanh(d / smoother_param)) * 32);
- wedge_mask_obl[0][WEDGE_OBLIQUE63][i * stride + j] = msk;
- const int mskx = (int)rint((1.0 + tanh(x / smoother_param)) * 32);
- wedge_mask_obl[0][WEDGE_VERTICAL][i * stride + j] = mskx;
- }
- }
-#endif // USE_PRECOMPUTED_WEDGE_MASK
+
for (i = 0; i < h; ++i) {
for (j = 0; j < w; ++j) {
const int msk = wedge_mask_obl[0][WEDGE_OBLIQUE63][i * stride + j];
@@ -480,57 +529,18 @@ static void init_wedge_master_masks() {
}
}
-#if !USE_PRECOMPUTED_WEDGE_SIGN
-// If the signs for the wedges for various blocksizes are
-// inconsistent flip the sign flag. Do it only once for every
-// wedge codebook.
-static void init_wedge_signs() {
- BLOCK_SIZE sb_type;
- memset(wedge_signflip_lookup, 0, sizeof(wedge_signflip_lookup));
- for (sb_type = BLOCK_4X4; sb_type < BLOCK_SIZES_ALL; ++sb_type) {
- const int bw = block_size_wide[sb_type];
- const int bh = block_size_high[sb_type];
- const wedge_params_type wedge_params = wedge_params_lookup[sb_type];
- const int wbits = wedge_params.bits;
- const int wtypes = 1 << wbits;
- int i, w;
- if (wbits) {
- for (w = 0; w < wtypes; ++w) {
- // Get the mask master, i.e. index [0]
- const uint8_t *mask = get_wedge_mask_inplace(w, 0, sb_type);
- int avg = 0;
- for (i = 0; i < bw; ++i) avg += mask[i];
- for (i = 1; i < bh; ++i) avg += mask[i * MASK_MASTER_STRIDE];
- avg = (avg + (bw + bh - 1) / 2) / (bw + bh - 1);
- // Default sign of this wedge is 1 if the average < 32, 0 otherwise.
- // If default sign is 1:
- // If sign requested is 0, we need to flip the sign and return
- // the complement i.e. index [1] instead. If sign requested is 1
- // we need to flip the sign and return index [0] instead.
- // If default sign is 0:
- // If sign requested is 0, we need to return index [0] the master
- // if sign requested is 1, we need to return the complement index [1]
- // instead.
- wedge_params.signflip[w] = (avg < 32);
- }
- }
- }
-}
-#endif // !USE_PRECOMPUTED_WEDGE_SIGN
-
-static void init_wedge_masks() {
+static AOM_INLINE void init_wedge_masks() {
uint8_t *dst = wedge_mask_buf;
BLOCK_SIZE bsize;
memset(wedge_masks, 0, sizeof(wedge_masks));
for (bsize = BLOCK_4X4; bsize < BLOCK_SIZES_ALL; ++bsize) {
+ const wedge_params_type *wedge_params = &av1_wedge_params_lookup[bsize];
+ const int wtypes = wedge_params->wedge_types;
+ if (wtypes == 0) continue;
const uint8_t *mask;
const int bw = block_size_wide[bsize];
const int bh = block_size_high[bsize];
- const wedge_params_type *wedge_params = &wedge_params_lookup[bsize];
- const int wbits = wedge_params->bits;
- const int wtypes = 1 << wbits;
int w;
- if (wbits == 0) continue;
for (w = 0; w < wtypes; ++w) {
mask = get_wedge_mask_inplace(w, 0, bsize);
aom_convolve_copy(mask, MASK_MASTER_STRIDE, dst, bw, NULL, 0, NULL, 0, bw,
@@ -548,109 +558,383 @@ static void init_wedge_masks() {
}
}
+/* clang-format off */
+static const uint8_t ii_weights1d[MAX_SB_SIZE] = {
+ 60, 58, 56, 54, 52, 50, 48, 47, 45, 44, 42, 41, 39, 38, 37, 35, 34, 33, 32,
+ 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 22, 21, 20, 19, 19, 18, 18, 17, 16,
+ 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 10, 10, 10, 9, 9, 9, 8,
+ 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 4, 4,
+ 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
+};
+static uint8_t ii_size_scales[BLOCK_SIZES_ALL] = {
+ 32, 16, 16, 16, 8, 8, 8, 4,
+ 4, 4, 2, 2, 2, 1, 1, 1,
+ 8, 8, 4, 4, 2, 2
+};
+/* clang-format on */
+
+static AOM_INLINE void build_smooth_interintra_mask(uint8_t *mask, int stride,
+ BLOCK_SIZE plane_bsize,
+ INTERINTRA_MODE mode) {
+ int i, j;
+ const int bw = block_size_wide[plane_bsize];
+ const int bh = block_size_high[plane_bsize];
+ const int size_scale = ii_size_scales[plane_bsize];
+
+ switch (mode) {
+ case II_V_PRED:
+ for (i = 0; i < bh; ++i) {
+ memset(mask, ii_weights1d[i * size_scale], bw * sizeof(mask[0]));
+ mask += stride;
+ }
+ break;
+
+ case II_H_PRED:
+ for (i = 0; i < bh; ++i) {
+ for (j = 0; j < bw; ++j) mask[j] = ii_weights1d[j * size_scale];
+ mask += stride;
+ }
+ break;
+
+ case II_SMOOTH_PRED:
+ for (i = 0; i < bh; ++i) {
+ for (j = 0; j < bw; ++j)
+ mask[j] = ii_weights1d[(i < j ? i : j) * size_scale];
+ mask += stride;
+ }
+ break;
+
+ case II_DC_PRED:
+ default:
+ for (i = 0; i < bh; ++i) {
+ memset(mask, 32, bw * sizeof(mask[0]));
+ mask += stride;
+ }
+ break;
+ }
+}
+
+static AOM_INLINE void init_smooth_interintra_masks() {
+ for (int m = 0; m < INTERINTRA_MODES; ++m) {
+ for (int bs = 0; bs < BLOCK_SIZES_ALL; ++bs) {
+ const int bw = block_size_wide[bs];
+ const int bh = block_size_high[bs];
+ if (bw > MAX_WEDGE_SIZE || bh > MAX_WEDGE_SIZE) continue;
+ build_smooth_interintra_mask(smooth_interintra_mask_buf[m][bs], bw, bs,
+ m);
+ }
+ }
+}
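
A sketch (not part of the patch) of the per-pixel blend these masks later feed into: aom_blend_a64_mask() combines the intra and inter predictors as a 6-bit alpha blend, so entries near 64 at the start of ii_weights1d favour the intra pixels close to the predicted edge and tail off towards the inter prediction.

#include <stdint.h>

/* Illustrative only: the AOM_BLEND_A64 arithmetic used by aom_blend_a64_mask(),
 * with m taken from the interintra masks built above (range 0..64). */
static uint8_t blend_one_pixel_sketch(uint8_t intra_px, uint8_t inter_px,
                                      uint8_t m) {
  return (uint8_t)((m * intra_px + (64 - m) * inter_px + 32) >> 6);
}
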
+
// Equation of line: f(x, y) = a[0]*(x - a[2]*w/8) + a[1]*(y - a[3]*h/8) = 0
void av1_init_wedge_masks() {
init_wedge_master_masks();
-#if !USE_PRECOMPUTED_WEDGE_SIGN
- init_wedge_signs();
-#endif // !USE_PRECOMPUTED_WEDGE_SIGN
init_wedge_masks();
+ init_smooth_interintra_masks();
}
-static void build_masked_compound_no_round(
+static AOM_INLINE void build_masked_compound_no_round(
uint8_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride,
const CONV_BUF_TYPE *src1, int src1_stride,
const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE sb_type, int h,
- int w, ConvolveParams *conv_params, MACROBLOCKD *xd) {
- // Derive subsampling from h and w passed in. May be refactored to
- // pass in subsampling factors directly.
- const int subh = (2 << mi_size_high_log2[sb_type]) == h;
- const int subw = (2 << mi_size_wide_log2[sb_type]) == w;
+ int w, InterPredParams *inter_pred_params) {
+ const int ssy = inter_pred_params->subsampling_y;
+ const int ssx = inter_pred_params->subsampling_x;
const uint8_t *mask = av1_get_compound_type_mask(comp_data, sb_type);
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+ const int mask_stride = block_size_wide[sb_type];
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (inter_pred_params->use_hbd_buf) {
aom_highbd_blend_a64_d16_mask(dst, dst_stride, src0, src0_stride, src1,
- src1_stride, mask, block_size_wide[sb_type],
- w, h, subw, subh, conv_params, xd->bd);
- else
+ src1_stride, mask, mask_stride, w, h, ssx,
+ ssy, &inter_pred_params->conv_params,
+ inter_pred_params->bit_depth);
+ } else {
aom_lowbd_blend_a64_d16_mask(dst, dst_stride, src0, src0_stride, src1,
- src1_stride, mask, block_size_wide[sb_type], w,
- h, subw, subh, conv_params);
+ src1_stride, mask, mask_stride, w, h, ssx, ssy,
+ &inter_pred_params->conv_params);
+ }
+#else
+ aom_lowbd_blend_a64_d16_mask(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, w, h, ssx, ssy,
+ &inter_pred_params->conv_params);
+#endif
}
-void av1_make_masked_inter_predictor(
- const uint8_t *pre, int pre_stride, uint8_t *dst, int dst_stride,
- const SubpelParams *subpel_params, const struct scale_factors *sf, int w,
- int h, ConvolveParams *conv_params, InterpFilters interp_filters, int plane,
- const WarpTypesAllowed *warp_types, int p_col, int p_row, int ref,
- MACROBLOCKD *xd, int can_use_previous) {
- MB_MODE_INFO *mi = xd->mi[0];
- (void)dst;
- (void)dst_stride;
- mi->interinter_comp.seg_mask = xd->seg_mask;
- const INTERINTER_COMPOUND_DATA *comp_data = &mi->interinter_comp;
-
-// We're going to call av1_make_inter_predictor to generate a prediction into
-// a temporary buffer, then will blend that temporary buffer with that from
-// the other reference.
-//
-#define INTER_PRED_BYTES_PER_PIXEL 2
-
- DECLARE_ALIGNED(32, uint8_t,
- tmp_buf[INTER_PRED_BYTES_PER_PIXEL * MAX_SB_SQUARE]);
-#undef INTER_PRED_BYTES_PER_PIXEL
-
- uint8_t *tmp_dst = get_buf_by_bd(xd, tmp_buf);
+void av1_make_masked_inter_predictor(const uint8_t *pre, int pre_stride,
+ uint8_t *dst, int dst_stride,
+ InterPredParams *inter_pred_params,
+ const SubpelParams *subpel_params) {
+ const INTERINTER_COMPOUND_DATA *comp_data = &inter_pred_params->mask_comp;
+ BLOCK_SIZE sb_type = inter_pred_params->sb_type;
+
+ // We're going to call av1_make_inter_predictor to generate a prediction into
+ // a temporary buffer, then will blend that temporary buffer with that from
+ // the other reference.
+ DECLARE_ALIGNED(32, uint8_t, tmp_buf[2 * MAX_SB_SQUARE]);
+ uint8_t *tmp_dst =
+ inter_pred_params->use_hbd_buf ? CONVERT_TO_BYTEPTR(tmp_buf) : tmp_buf;
const int tmp_buf_stride = MAX_SB_SIZE;
- CONV_BUF_TYPE *org_dst = conv_params->dst;
- int org_dst_stride = conv_params->dst_stride;
+ CONV_BUF_TYPE *org_dst = inter_pred_params->conv_params.dst;
+ int org_dst_stride = inter_pred_params->conv_params.dst_stride;
CONV_BUF_TYPE *tmp_buf16 = (CONV_BUF_TYPE *)tmp_buf;
- conv_params->dst = tmp_buf16;
- conv_params->dst_stride = tmp_buf_stride;
- assert(conv_params->do_average == 0);
+ inter_pred_params->conv_params.dst = tmp_buf16;
+ inter_pred_params->conv_params.dst_stride = tmp_buf_stride;
+ assert(inter_pred_params->conv_params.do_average == 0);
// This will generate a prediction in tmp_buf for the second reference
- av1_make_inter_predictor(pre, pre_stride, tmp_dst, MAX_SB_SIZE, subpel_params,
- sf, w, h, conv_params, interp_filters, warp_types,
- p_col, p_row, plane, ref, mi, 0, xd,
- can_use_previous);
+ av1_make_inter_predictor(pre, pre_stride, tmp_dst, MAX_SB_SIZE,
+ inter_pred_params, subpel_params);
- if (!plane && comp_data->type == COMPOUND_DIFFWTD) {
+ if (!inter_pred_params->conv_params.plane &&
+ comp_data->type == COMPOUND_DIFFWTD) {
av1_build_compound_diffwtd_mask_d16(
comp_data->seg_mask, comp_data->mask_type, org_dst, org_dst_stride,
- tmp_buf16, tmp_buf_stride, h, w, conv_params, xd->bd);
+ tmp_buf16, tmp_buf_stride, inter_pred_params->block_height,
+ inter_pred_params->block_width, &inter_pred_params->conv_params,
+ inter_pred_params->bit_depth);
+ }
+ build_masked_compound_no_round(
+ dst, dst_stride, org_dst, org_dst_stride, tmp_buf16, tmp_buf_stride,
+ comp_data, sb_type, inter_pred_params->block_height,
+ inter_pred_params->block_width, inter_pred_params);
+}
+
+void av1_build_one_inter_predictor(
+ uint8_t *dst, int dst_stride, const MV *const src_mv,
+ InterPredParams *inter_pred_params, MACROBLOCKD *xd, int mi_x, int mi_y,
+ int ref, CalcSubpelParamsFunc calc_subpel_params_func) {
+ SubpelParams subpel_params;
+ uint8_t *src;
+ int src_stride;
+ calc_subpel_params_func(src_mv, inter_pred_params, xd, mi_x, mi_y, ref, &src,
+ &subpel_params, &src_stride);
+
+ if (inter_pred_params->comp_mode == UNIFORM_SINGLE ||
+ inter_pred_params->comp_mode == UNIFORM_COMP) {
+ av1_make_inter_predictor(src, src_stride, dst, dst_stride,
+ inter_pred_params, &subpel_params);
+ } else {
+ av1_make_masked_inter_predictor(src, src_stride, dst, dst_stride,
+ inter_pred_params, &subpel_params);
}
- build_masked_compound_no_round(dst, dst_stride, org_dst, org_dst_stride,
- tmp_buf16, tmp_buf_stride, comp_data,
- mi->sb_type, h, w, conv_params, xd);
}
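
The calc_subpel_params_func callback used above lets the encoder and decoder each supply their own motion-vector-to-source resolution. Its actual typedef (CalcSubpelParamsFunc) sits in reconinter.h outside the hunks shown here; the following is only a sketch inferred from the call site, with illustrative parameter names:

/* Sketch of the callback contract: resolve src_mv for one reference into a
 * source pointer, its stride, and the subpel filtering parameters. */
typedef void (*CalcSubpelParamsFuncSketch)(
    const MV *const src_mv, InterPredParams *const inter_pred_params,
    MACROBLOCKD *xd, int mi_x, int mi_y, int ref, uint8_t **pre,
    SubpelParams *subpel_params, int *src_stride);
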
-void av1_jnt_comp_weight_assign(const AV1_COMMON *cm, const MB_MODE_INFO *mbmi,
- int order_idx, int *fwd_offset, int *bck_offset,
- int *use_jnt_comp_avg, int is_compound) {
+// True if the following hold:
+//  1. Not intrabc and not build_for_obmc
+//  2. A U or V plane
+//  3. The plane block size, after subsampling, is smaller than 8x8 in width
+//     or height
+//  4. All the neighboring mi units covered by the sub-sampled block are
+//     inter blocks and not intrabc blocks
+static bool is_sub8x8_inter(const MACROBLOCKD *xd, int plane, BLOCK_SIZE bsize,
+ int is_intrabc, int build_for_obmc) {
+ if (is_intrabc || build_for_obmc) {
+ return false;
+ }
+
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const int ss_x = pd->subsampling_x;
+ const int ss_y = pd->subsampling_y;
+ if ((block_size_wide[bsize] >= 8 || !ss_x) &&
+ (block_size_high[bsize] >= 8 || !ss_y)) {
+ return false;
+ }
+
+ // For sub8x8 chroma blocks, we may be covering more than one luma block's
+ // worth of pixels. Thus (mi_x, mi_y) may not be the correct coordinates for
+ // the top-left corner of the prediction source - the correct top-left corner
+ // is at (pre_x, pre_y).
+ const int row_start = (block_size_high[bsize] == 4) && ss_y ? -1 : 0;
+ const int col_start = (block_size_wide[bsize] == 4) && ss_x ? -1 : 0;
+
+ for (int row = row_start; row <= 0; ++row) {
+ for (int col = col_start; col <= 0; ++col) {
+ const MB_MODE_INFO *this_mbmi = xd->mi[row * xd->mi_stride + col];
+ if (!is_inter_block(this_mbmi)) return false;
+ if (is_intrabc_block(this_mbmi)) return false;
+ }
+ }
+ return true;
+}
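
As a concrete example (not part of the patch): a BLOCK_4X4 luma block under 4:2:0 subsampling gives row_start = col_start = -1, so the loop above inspects the 2x2 group of mi units whose chroma pixels share one prediction block. Only when every one of them is a plain inter block with no intrabc does the caller take the per-sub-block path in build_inter_predictors_sub8x8() below; otherwise it falls through to build_inter_predictors_8x8_and_bigger().
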
+
+static void build_inter_predictors_sub8x8(
+ const AV1_COMMON *cm, MACROBLOCKD *xd, int plane, const MB_MODE_INFO *mi,
+ int bw, int bh, int mi_x, int mi_y,
+ CalcSubpelParamsFunc calc_subpel_params_func) {
+ const BLOCK_SIZE bsize = mi->sb_type;
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ const bool ss_x = pd->subsampling_x;
+ const bool ss_y = pd->subsampling_y;
+ const int b4_w = block_size_wide[bsize] >> ss_x;
+ const int b4_h = block_size_high[bsize] >> ss_y;
+ const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, ss_x, ss_y);
+ const int b8_w = block_size_wide[plane_bsize];
+ const int b8_h = block_size_high[plane_bsize];
+ const int is_compound = has_second_ref(mi);
+ assert(!is_compound);
+ assert(!is_intrabc_block(mi));
+
+ // For sub8x8 chroma blocks, we may be covering more than one luma block's
+ // worth of pixels. Thus (mi_x, mi_y) may not be the correct coordinates for
+ // the top-left corner of the prediction source - the correct top-left corner
+ // is at (pre_x, pre_y).
+ const int row_start = (block_size_high[bsize] == 4) && ss_y ? -1 : 0;
+ const int col_start = (block_size_wide[bsize] == 4) && ss_x ? -1 : 0;
+ const int pre_x = (mi_x + MI_SIZE * col_start) >> ss_x;
+ const int pre_y = (mi_y + MI_SIZE * row_start) >> ss_y;
+
+ int row = row_start;
+ for (int y = 0; y < b8_h; y += b4_h) {
+ int col = col_start;
+ for (int x = 0; x < b8_w; x += b4_w) {
+ MB_MODE_INFO *this_mbmi = xd->mi[row * xd->mi_stride + col];
+ int tmp_dst_stride = 8;
+ assert(bw < 8 || bh < 8);
+ (void)bw;
+ (void)bh;
+ struct buf_2d *const dst_buf = &pd->dst;
+ uint8_t *dst = dst_buf->buf + dst_buf->stride * y + x;
+ int ref = 0;
+ const RefCntBuffer *ref_buf =
+ get_ref_frame_buf(cm, this_mbmi->ref_frame[ref]);
+ const struct scale_factors *ref_scale_factors =
+ get_ref_scale_factors_const(cm, this_mbmi->ref_frame[ref]);
+ const struct scale_factors *const sf = ref_scale_factors;
+ const struct buf_2d pre_buf = {
+ NULL,
+ (plane == 1) ? ref_buf->buf.u_buffer : ref_buf->buf.v_buffer,
+ ref_buf->buf.uv_crop_width,
+ ref_buf->buf.uv_crop_height,
+ ref_buf->buf.uv_stride,
+ };
+
+ const MV mv = this_mbmi->mv[ref].as_mv;
+
+ InterPredParams inter_pred_params;
+ av1_init_inter_params(&inter_pred_params, b4_w, b4_h, pre_y + y,
+ pre_x + x, pd->subsampling_x, pd->subsampling_y,
+ xd->bd, is_cur_buf_hbd(xd), mi->use_intrabc, sf,
+ &pre_buf, this_mbmi->interp_filters);
+ inter_pred_params.conv_params = get_conv_params_no_round(
+ ref, plane, xd->tmp_conv_dst, tmp_dst_stride, is_compound, xd->bd);
+ inter_pred_params.conv_params.use_dist_wtd_comp_avg = 0;
+
+ av1_build_one_inter_predictor(dst, dst_buf->stride, &mv,
+ &inter_pred_params, xd, mi_x + x, mi_y + y,
+ ref, calc_subpel_params_func);
+
+ ++col;
+ }
+ ++row;
+ }
+}
+
+static void build_inter_predictors_8x8_and_bigger(
+ const AV1_COMMON *cm, MACROBLOCKD *xd, int plane, const MB_MODE_INFO *mi,
+ int build_for_obmc, int bw, int bh, int mi_x, int mi_y,
+ CalcSubpelParamsFunc calc_subpel_params_func) {
+ const int is_compound = has_second_ref(mi);
+ const int is_intrabc = is_intrabc_block(mi);
+ assert(IMPLIES(is_intrabc, !is_compound));
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ struct buf_2d *const dst_buf = &pd->dst;
+ uint8_t *const dst = dst_buf->buf;
+
+ int is_global[2] = { 0, 0 };
+ for (int ref = 0; ref < 1 + is_compound; ++ref) {
+ const WarpedMotionParams *const wm = &xd->global_motion[mi->ref_frame[ref]];
+ is_global[ref] = is_global_mv_block(mi, wm->wmtype);
+ }
+
+ const BLOCK_SIZE bsize = mi->sb_type;
+ const int ss_x = pd->subsampling_x;
+ const int ss_y = pd->subsampling_y;
+ const int row_start =
+ (block_size_high[bsize] == 4) && ss_y && !build_for_obmc ? -1 : 0;
+ const int col_start =
+ (block_size_wide[bsize] == 4) && ss_x && !build_for_obmc ? -1 : 0;
+ const int pre_x = (mi_x + MI_SIZE * col_start) >> ss_x;
+ const int pre_y = (mi_y + MI_SIZE * row_start) >> ss_y;
+
+ for (int ref = 0; ref < 1 + is_compound; ++ref) {
+ const struct scale_factors *const sf =
+ is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref];
+ struct buf_2d *const pre_buf = is_intrabc ? dst_buf : &pd->pre[ref];
+ const MV mv = mi->mv[ref].as_mv;
+ const WarpTypesAllowed warp_types = { is_global[ref],
+ mi->motion_mode == WARPED_CAUSAL };
+
+ InterPredParams inter_pred_params;
+ av1_init_inter_params(&inter_pred_params, bw, bh, pre_y, pre_x,
+ pd->subsampling_x, pd->subsampling_y, xd->bd,
+ is_cur_buf_hbd(xd), mi->use_intrabc, sf, pre_buf,
+ mi->interp_filters);
+ if (is_compound) av1_init_comp_mode(&inter_pred_params);
+ inter_pred_params.conv_params = get_conv_params_no_round(
+ ref, plane, xd->tmp_conv_dst, MAX_SB_SIZE, is_compound, xd->bd);
+
+ av1_dist_wtd_comp_weight_assign(
+ cm, mi, 0, &inter_pred_params.conv_params.fwd_offset,
+ &inter_pred_params.conv_params.bck_offset,
+ &inter_pred_params.conv_params.use_dist_wtd_comp_avg, is_compound);
+
+ if (!build_for_obmc)
+ av1_init_warp_params(&inter_pred_params, &warp_types, ref, xd, mi);
+
+ if (is_masked_compound_type(mi->interinter_comp.type)) {
+ av1_init_mask_comp(&inter_pred_params, mi->sb_type, &mi->interinter_comp);
+ // Assign physical buffer.
+ inter_pred_params.mask_comp.seg_mask = xd->seg_mask;
+ }
+
+ av1_build_one_inter_predictor(dst, dst_buf->stride, &mv, &inter_pred_params,
+ xd, mi_x, mi_y, ref, calc_subpel_params_func);
+ }
+}
+
+void av1_build_inter_predictors(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ int plane, const MB_MODE_INFO *mi,
+ int build_for_obmc, int bw, int bh, int mi_x,
+ int mi_y,
+ CalcSubpelParamsFunc calc_subpel_params_func) {
+ if (is_sub8x8_inter(xd, plane, mi->sb_type, is_intrabc_block(mi),
+ build_for_obmc)) {
+ build_inter_predictors_sub8x8(cm, xd, plane, mi, bw, bh, mi_x, mi_y,
+ calc_subpel_params_func);
+ } else {
+ build_inter_predictors_8x8_and_bigger(cm, xd, plane, mi, build_for_obmc, bw,
+ bh, mi_x, mi_y,
+ calc_subpel_params_func);
+ }
+}
+
+void av1_dist_wtd_comp_weight_assign(const AV1_COMMON *cm,
+ const MB_MODE_INFO *mbmi, int order_idx,
+ int *fwd_offset, int *bck_offset,
+ int *use_dist_wtd_comp_avg,
+ int is_compound) {
assert(fwd_offset != NULL && bck_offset != NULL);
if (!is_compound || mbmi->compound_idx) {
- *use_jnt_comp_avg = 0;
+ *use_dist_wtd_comp_avg = 0;
return;
}
- *use_jnt_comp_avg = 1;
- const int bck_idx = cm->frame_refs[mbmi->ref_frame[0] - LAST_FRAME].idx;
- const int fwd_idx = cm->frame_refs[mbmi->ref_frame[1] - LAST_FRAME].idx;
- const int cur_frame_index = cm->cur_frame->cur_frame_offset;
+ *use_dist_wtd_comp_avg = 1;
+ const RefCntBuffer *const bck_buf = get_ref_frame_buf(cm, mbmi->ref_frame[0]);
+ const RefCntBuffer *const fwd_buf = get_ref_frame_buf(cm, mbmi->ref_frame[1]);
+ const int cur_frame_index = cm->cur_frame->order_hint;
int bck_frame_index = 0, fwd_frame_index = 0;
- if (bck_idx >= 0) {
- bck_frame_index = cm->buffer_pool->frame_bufs[bck_idx].cur_frame_offset;
- }
-
- if (fwd_idx >= 0) {
- fwd_frame_index = cm->buffer_pool->frame_bufs[fwd_idx].cur_frame_offset;
- }
+ if (bck_buf != NULL) bck_frame_index = bck_buf->order_hint;
+ if (fwd_buf != NULL) fwd_frame_index = fwd_buf->order_hint;
- int d0 = clamp(abs(get_relative_dist(cm, fwd_frame_index, cur_frame_index)),
+ int d0 = clamp(abs(get_relative_dist(&cm->seq_params.order_hint_info,
+ fwd_frame_index, cur_frame_index)),
0, MAX_FRAME_DISTANCE);
- int d1 = clamp(abs(get_relative_dist(cm, cur_frame_index, bck_frame_index)),
+ int d1 = clamp(abs(get_relative_dist(&cm->seq_params.order_hint_info,
+ cur_frame_index, bck_frame_index)),
0, MAX_FRAME_DISTANCE);
const int order = d0 <= d1;
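
A worked example of the arithmetic above (not part of the patch): with order hints cur_frame == 8, backward reference == 5 and forward reference == 10, d0 = |10 - 8| = 2 and d1 = |8 - 5| = 3, both already within MAX_FRAME_DISTANCE, so order = (d0 <= d1) = 1. The forward and backward offsets are then picked from libaom's distance-weighted lookup table (not shown in this hunk) using those clamped distances.
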
@@ -708,10 +992,9 @@ void av1_setup_pre_planes(MACROBLOCKD *xd, int idx,
// obmc_mask_N[overlap_position]
static const uint8_t obmc_mask_1[1] = { 64 };
+DECLARE_ALIGNED(2, static const uint8_t, obmc_mask_2[2]) = { 45, 64 };
-static const uint8_t obmc_mask_2[2] = { 45, 64 };
-
-static const uint8_t obmc_mask_4[4] = { 39, 50, 59, 64 };
+DECLARE_ALIGNED(4, static const uint8_t, obmc_mask_4[4]) = { 39, 50, 59, 64 };
static const uint8_t obmc_mask_8[8] = { 36, 42, 48, 53, 57, 61, 64, 64 };
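
For reference (not part of the patch, and assuming the aom_blend_a64_* helpers weight their first source by the mask, as in libaom's blend definitions): these masks are indexed by distance from the shared edge and weight the current (BMC) prediction in the 6-bit blends below. With obmc_mask_8, the row or column touching the neighbour keeps 36/64 of the current prediction and takes 28/64 from the neighbour's, ramping to 64/64 (no mixing) by the far end of the overlap.
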
@@ -743,19 +1026,21 @@ const uint8_t *av1_get_obmc_mask(int length) {
}
}
-static INLINE void increment_int_ptr(MACROBLOCKD *xd, int rel_mi_rc,
- uint8_t mi_hw, MB_MODE_INFO *mi,
- void *fun_ctxt, const int num_planes) {
+static INLINE void increment_int_ptr(MACROBLOCKD *xd, int rel_mi_row,
+ int rel_mi_col, uint8_t op_mi_size,
+ int dir, MB_MODE_INFO *mi, void *fun_ctxt,
+ const int num_planes) {
(void)xd;
- (void)rel_mi_rc;
- (void)mi_hw;
+ (void)rel_mi_row;
+ (void)rel_mi_col;
+ (void)op_mi_size;
+ (void)dir;
(void)mi;
++*(int *)fun_ctxt;
(void)num_planes;
}
-void av1_count_overlappable_neighbors(const AV1_COMMON *cm, MACROBLOCKD *xd,
- int mi_row, int mi_col) {
+void av1_count_overlappable_neighbors(const AV1_COMMON *cm, MACROBLOCKD *xd) {
MB_MODE_INFO *mbmi = xd->mi[0];
mbmi->overlappable_neighbors[0] = 0;
@@ -763,9 +1048,9 @@ void av1_count_overlappable_neighbors(const AV1_COMMON *cm, MACROBLOCKD *xd,
if (!is_motion_variation_allowed_bsize(mbmi->sb_type)) return;
- foreach_overlappable_nb_above(cm, xd, mi_col, INT_MAX, increment_int_ptr,
+ foreach_overlappable_nb_above(cm, xd, INT_MAX, increment_int_ptr,
&mbmi->overlappable_neighbors[0]);
- foreach_overlappable_nb_left(cm, xd, mi_row, INT_MAX, increment_int_ptr,
+ foreach_overlappable_nb_left(cm, xd, INT_MAX, increment_int_ptr,
&mbmi->overlappable_neighbors[1]);
}
@@ -806,21 +1091,20 @@ struct obmc_inter_pred_ctxt {
int *adjacent_stride;
};
-static INLINE void build_obmc_inter_pred_above(MACROBLOCKD *xd, int rel_mi_col,
- uint8_t above_mi_width,
- MB_MODE_INFO *above_mi,
- void *fun_ctxt,
- const int num_planes) {
+static INLINE void build_obmc_inter_pred_above(
+ MACROBLOCKD *xd, int rel_mi_row, int rel_mi_col, uint8_t op_mi_size,
+ int dir, MB_MODE_INFO *above_mi, void *fun_ctxt, const int num_planes) {
(void)above_mi;
+ (void)rel_mi_row;
+ (void)dir;
struct obmc_inter_pred_ctxt *ctxt = (struct obmc_inter_pred_ctxt *)fun_ctxt;
const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
- const int is_hbd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0;
const int overlap =
AOMMIN(block_size_high[bsize], block_size_high[BLOCK_64X64]) >> 1;
for (int plane = 0; plane < num_planes; ++plane) {
const struct macroblockd_plane *pd = &xd->plane[plane];
- const int bw = (above_mi_width * MI_SIZE) >> pd->subsampling_x;
+ const int bw = (op_mi_size * MI_SIZE) >> pd->subsampling_x;
const int bh = overlap >> pd->subsampling_y;
const int plane_col = (rel_mi_col * MI_SIZE) >> pd->subsampling_x;
@@ -831,32 +1115,36 @@ static INLINE void build_obmc_inter_pred_above(MACROBLOCKD *xd, int rel_mi_col,
const int tmp_stride = ctxt->adjacent_stride[plane];
const uint8_t *const tmp = &ctxt->adjacent[plane][plane_col];
const uint8_t *const mask = av1_get_obmc_mask(bh);
-
+#if CONFIG_AV1_HIGHBITDEPTH
+ const int is_hbd = is_cur_buf_hbd(xd);
if (is_hbd)
aom_highbd_blend_a64_vmask(dst, dst_stride, dst, dst_stride, tmp,
tmp_stride, mask, bw, bh, xd->bd);
else
aom_blend_a64_vmask(dst, dst_stride, dst, dst_stride, tmp, tmp_stride,
mask, bw, bh);
+#else
+ aom_blend_a64_vmask(dst, dst_stride, dst, dst_stride, tmp, tmp_stride, mask,
+ bw, bh);
+#endif
}
}
-static INLINE void build_obmc_inter_pred_left(MACROBLOCKD *xd, int rel_mi_row,
- uint8_t left_mi_height,
- MB_MODE_INFO *left_mi,
- void *fun_ctxt,
- const int num_planes) {
+static INLINE void build_obmc_inter_pred_left(
+ MACROBLOCKD *xd, int rel_mi_row, int rel_mi_col, uint8_t op_mi_size,
+ int dir, MB_MODE_INFO *left_mi, void *fun_ctxt, const int num_planes) {
(void)left_mi;
+ (void)rel_mi_col;
+ (void)dir;
struct obmc_inter_pred_ctxt *ctxt = (struct obmc_inter_pred_ctxt *)fun_ctxt;
const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
const int overlap =
AOMMIN(block_size_wide[bsize], block_size_wide[BLOCK_64X64]) >> 1;
- const int is_hbd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0;
for (int plane = 0; plane < num_planes; ++plane) {
const struct macroblockd_plane *pd = &xd->plane[plane];
const int bw = overlap >> pd->subsampling_x;
- const int bh = (left_mi_height * MI_SIZE) >> pd->subsampling_y;
+ const int bh = (op_mi_size * MI_SIZE) >> pd->subsampling_y;
const int plane_row = (rel_mi_row * MI_SIZE) >> pd->subsampling_y;
if (av1_skip_u4x4_pred_in_obmc(bsize, pd, 1)) continue;
@@ -867,12 +1155,18 @@ static INLINE void build_obmc_inter_pred_left(MACROBLOCKD *xd, int rel_mi_row,
const uint8_t *const tmp = &ctxt->adjacent[plane][plane_row * tmp_stride];
const uint8_t *const mask = av1_get_obmc_mask(bw);
+#if CONFIG_AV1_HIGHBITDEPTH
+ const int is_hbd = is_cur_buf_hbd(xd);
if (is_hbd)
aom_highbd_blend_a64_hmask(dst, dst_stride, dst, dst_stride, tmp,
tmp_stride, mask, bw, bh, xd->bd);
else
aom_blend_a64_hmask(dst, dst_stride, dst, dst_stride, tmp, tmp_stride,
mask, bw, bh);
+#else
+ aom_blend_a64_hmask(dst, dst_stride, dst, dst_stride, tmp, tmp_stride, mask,
+ bw, bh);
+#endif
}
}
@@ -881,7 +1175,6 @@ static INLINE void build_obmc_inter_pred_left(MACROBLOCKD *xd, int rel_mi_row,
// prediction. We assume the original prediction (bmc) is stored in
// xd->plane[].dst.buf
void av1_build_obmc_inter_prediction(const AV1_COMMON *cm, MACROBLOCKD *xd,
- int mi_row, int mi_col,
uint8_t *above[MAX_MB_PLANE],
int above_stride[MAX_MB_PLANE],
uint8_t *left[MAX_MB_PLANE],
@@ -890,23 +1183,54 @@ void av1_build_obmc_inter_prediction(const AV1_COMMON *cm, MACROBLOCKD *xd,
// handle above row
struct obmc_inter_pred_ctxt ctxt_above = { above, above_stride };
- foreach_overlappable_nb_above(cm, xd, mi_col,
+ foreach_overlappable_nb_above(cm, xd,
max_neighbor_obmc[mi_size_wide_log2[bsize]],
build_obmc_inter_pred_above, &ctxt_above);
// handle left column
struct obmc_inter_pred_ctxt ctxt_left = { left, left_stride };
- foreach_overlappable_nb_left(cm, xd, mi_row,
+ foreach_overlappable_nb_left(cm, xd,
max_neighbor_obmc[mi_size_high_log2[bsize]],
build_obmc_inter_pred_left, &ctxt_left);
}
+void av1_setup_address_for_obmc(MACROBLOCKD *xd, int mi_row_offset,
+ int mi_col_offset, MB_MODE_INFO *ref_mbmi,
+ struct build_prediction_ctxt *ctxt,
+ const int num_planes) {
+ const BLOCK_SIZE ref_bsize = AOMMAX(BLOCK_8X8, ref_mbmi->sb_type);
+ const int ref_mi_row = xd->mi_row + mi_row_offset;
+ const int ref_mi_col = xd->mi_col + mi_col_offset;
+
+ for (int plane = 0; plane < num_planes; ++plane) {
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ setup_pred_plane(&pd->dst, ref_bsize, ctxt->tmp_buf[plane],
+ ctxt->tmp_width[plane], ctxt->tmp_height[plane],
+ ctxt->tmp_stride[plane], mi_row_offset, mi_col_offset,
+ NULL, pd->subsampling_x, pd->subsampling_y);
+ }
+
+ const MV_REFERENCE_FRAME frame = ref_mbmi->ref_frame[0];
+
+ const RefCntBuffer *const ref_buf = get_ref_frame_buf(ctxt->cm, frame);
+ const struct scale_factors *const sf =
+ get_ref_scale_factors_const(ctxt->cm, frame);
+
+ xd->block_ref_scale_factors[0] = sf;
+ if ((!av1_is_valid_scale(sf)))
+ aom_internal_error(xd->error_info, AOM_CODEC_UNSUP_BITSTREAM,
+ "Reference frame has invalid dimensions");
+
+ av1_setup_pre_planes(xd, 0, &ref_buf->buf, ref_mi_row, ref_mi_col, sf,
+ num_planes);
+}
+
void av1_setup_build_prediction_by_above_pred(
MACROBLOCKD *xd, int rel_mi_col, uint8_t above_mi_width,
MB_MODE_INFO *above_mbmi, struct build_prediction_ctxt *ctxt,
const int num_planes) {
const BLOCK_SIZE a_bsize = AOMMAX(BLOCK_8X8, above_mbmi->sb_type);
- const int above_mi_col = ctxt->mi_col + rel_mi_col;
+ const int above_mi_col = xd->mi_col + rel_mi_col;
av1_modify_neighbor_predictor_for_obmc(above_mbmi);
@@ -922,19 +1246,21 @@ void av1_setup_build_prediction_by_above_pred(
for (int ref = 0; ref < num_refs; ++ref) {
const MV_REFERENCE_FRAME frame = above_mbmi->ref_frame[ref];
- const RefBuffer *const ref_buf = &ctxt->cm->frame_refs[frame - LAST_FRAME];
-
- xd->block_refs[ref] = ref_buf;
- if ((!av1_is_valid_scale(&ref_buf->sf)))
+ const RefCntBuffer *const ref_buf = get_ref_frame_buf(ctxt->cm, frame);
+ const struct scale_factors *const sf =
+ get_ref_scale_factors_const(ctxt->cm, frame);
+ xd->block_ref_scale_factors[ref] = sf;
+ if ((!av1_is_valid_scale(sf)))
aom_internal_error(xd->error_info, AOM_CODEC_UNSUP_BITSTREAM,
"Reference frame has invalid dimensions");
- av1_setup_pre_planes(xd, ref, ref_buf->buf, ctxt->mi_row, above_mi_col,
- &ref_buf->sf, num_planes);
+ av1_setup_pre_planes(xd, ref, &ref_buf->buf, xd->mi_row, above_mi_col, sf,
+ num_planes);
}
xd->mb_to_left_edge = 8 * MI_SIZE * (-above_mi_col);
- xd->mb_to_right_edge = ctxt->mb_to_far_edge +
- (xd->n4_w - rel_mi_col - above_mi_width) * MI_SIZE * 8;
+ xd->mb_to_right_edge =
+ ctxt->mb_to_far_edge +
+ (xd->width - rel_mi_col - above_mi_width) * MI_SIZE * 8;
}
void av1_setup_build_prediction_by_left_pred(MACROBLOCKD *xd, int rel_mi_row,
@@ -943,7 +1269,7 @@ void av1_setup_build_prediction_by_left_pred(MACROBLOCKD *xd, int rel_mi_row,
struct build_prediction_ctxt *ctxt,
const int num_planes) {
const BLOCK_SIZE l_bsize = AOMMAX(BLOCK_8X8, left_mbmi->sb_type);
- const int left_mi_row = ctxt->mi_row + rel_mi_row;
+ const int left_mi_row = xd->mi_row + rel_mi_row;
av1_modify_neighbor_predictor_for_obmc(left_mbmi);
@@ -959,91 +1285,34 @@ void av1_setup_build_prediction_by_left_pred(MACROBLOCKD *xd, int rel_mi_row,
for (int ref = 0; ref < num_refs; ++ref) {
const MV_REFERENCE_FRAME frame = left_mbmi->ref_frame[ref];
- const RefBuffer *const ref_buf = &ctxt->cm->frame_refs[frame - LAST_FRAME];
+ const RefCntBuffer *const ref_buf = get_ref_frame_buf(ctxt->cm, frame);
+ const struct scale_factors *const ref_scale_factors =
+ get_ref_scale_factors_const(ctxt->cm, frame);
- xd->block_refs[ref] = ref_buf;
- if ((!av1_is_valid_scale(&ref_buf->sf)))
+ xd->block_ref_scale_factors[ref] = ref_scale_factors;
+ if ((!av1_is_valid_scale(ref_scale_factors)))
aom_internal_error(xd->error_info, AOM_CODEC_UNSUP_BITSTREAM,
"Reference frame has invalid dimensions");
- av1_setup_pre_planes(xd, ref, ref_buf->buf, left_mi_row, ctxt->mi_col,
- &ref_buf->sf, num_planes);
+ av1_setup_pre_planes(xd, ref, &ref_buf->buf, left_mi_row, xd->mi_col,
+ ref_scale_factors, num_planes);
}
- xd->mb_to_top_edge = 8 * MI_SIZE * (-left_mi_row);
+ xd->mb_to_top_edge = GET_MV_SUBPEL(MI_SIZE * (-left_mi_row));
xd->mb_to_bottom_edge =
ctxt->mb_to_far_edge +
- (xd->n4_h - rel_mi_row - left_mi_height) * MI_SIZE * 8;
+ GET_MV_SUBPEL((xd->height - rel_mi_row - left_mi_height) * MI_SIZE);
}
-/* clang-format off */
-static const uint8_t ii_weights1d[MAX_SB_SIZE] = {
- 60, 58, 56, 54, 52, 50, 48, 47, 45, 44, 42, 41, 39, 38, 37, 35, 34, 33, 32,
- 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 22, 21, 20, 19, 19, 18, 18, 17, 16,
- 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 10, 10, 10, 9, 9, 9, 8,
- 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 4, 4,
- 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
-};
-static uint8_t ii_size_scales[BLOCK_SIZES_ALL] = {
- 32, 16, 16, 16, 8, 8, 8, 4,
- 4, 4, 2, 2, 2, 1, 1, 1,
- 8, 8, 4, 4, 2, 2
-};
-/* clang-format on */
-
-static void build_smooth_interintra_mask(uint8_t *mask, int stride,
- BLOCK_SIZE plane_bsize,
- INTERINTRA_MODE mode) {
- int i, j;
- const int bw = block_size_wide[plane_bsize];
- const int bh = block_size_high[plane_bsize];
- const int size_scale = ii_size_scales[plane_bsize];
-
- switch (mode) {
- case II_V_PRED:
- for (i = 0; i < bh; ++i) {
- memset(mask, ii_weights1d[i * size_scale], bw * sizeof(mask[0]));
- mask += stride;
- }
- break;
-
- case II_H_PRED:
- for (i = 0; i < bh; ++i) {
- for (j = 0; j < bw; ++j) mask[j] = ii_weights1d[j * size_scale];
- mask += stride;
- }
- break;
-
- case II_SMOOTH_PRED:
- for (i = 0; i < bh; ++i) {
- for (j = 0; j < bw; ++j)
- mask[j] = ii_weights1d[(i < j ? i : j) * size_scale];
- mask += stride;
- }
- break;
-
- case II_DC_PRED:
- default:
- for (i = 0; i < bh; ++i) {
- memset(mask, 32, bw * sizeof(mask[0]));
- mask += stride;
- }
- break;
- }
-}
-
-static void combine_interintra(INTERINTRA_MODE mode, int use_wedge_interintra,
- int wedge_index, int wedge_sign,
- BLOCK_SIZE bsize, BLOCK_SIZE plane_bsize,
- uint8_t *comppred, int compstride,
- const uint8_t *interpred, int interstride,
- const uint8_t *intrapred, int intrastride) {
+static AOM_INLINE void combine_interintra(
+ INTERINTRA_MODE mode, int8_t use_wedge_interintra, int8_t wedge_index,
+ int8_t wedge_sign, BLOCK_SIZE bsize, BLOCK_SIZE plane_bsize,
+ uint8_t *comppred, int compstride, const uint8_t *interpred,
+ int interstride, const uint8_t *intrapred, int intrastride) {
const int bw = block_size_wide[plane_bsize];
const int bh = block_size_high[plane_bsize];
if (use_wedge_interintra) {
- if (is_interintra_wedge_used(bsize)) {
+ if (av1_is_wedge_used(bsize)) {
const uint8_t *mask =
av1_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize);
const int subw = 2 * mi_size_wide[bsize] == bw;
@@ -1055,22 +1324,22 @@ static void combine_interintra(INTERINTRA_MODE mode, int use_wedge_interintra,
return;
}
- uint8_t mask[MAX_SB_SQUARE];
- build_smooth_interintra_mask(mask, bw, plane_bsize, mode);
+ const uint8_t *mask = smooth_interintra_mask_buf[mode][plane_bsize];
aom_blend_a64_mask(comppred, compstride, intrapred, intrastride, interpred,
interstride, mask, bw, bw, bh, 0, 0);
}
-static void combine_interintra_highbd(
- INTERINTRA_MODE mode, int use_wedge_interintra, int wedge_index,
- int wedge_sign, BLOCK_SIZE bsize, BLOCK_SIZE plane_bsize,
+#if CONFIG_AV1_HIGHBITDEPTH
+static AOM_INLINE void combine_interintra_highbd(
+ INTERINTRA_MODE mode, int8_t use_wedge_interintra, int8_t wedge_index,
+ int8_t wedge_sign, BLOCK_SIZE bsize, BLOCK_SIZE plane_bsize,
uint8_t *comppred8, int compstride, const uint8_t *interpred8,
int interstride, const uint8_t *intrapred8, int intrastride, int bd) {
const int bw = block_size_wide[plane_bsize];
const int bh = block_size_high[plane_bsize];
if (use_wedge_interintra) {
- if (is_interintra_wedge_used(bsize)) {
+ if (av1_is_wedge_used(bsize)) {
const uint8_t *mask =
av1_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize);
const int subh = 2 * mi_size_high[bsize] == bh;
@@ -1088,12 +1357,13 @@ static void combine_interintra_highbd(
interpred8, interstride, mask, bw, bw, bh, 0, 0,
bd);
}
+#endif
void av1_build_intra_predictors_for_interintra(const AV1_COMMON *cm,
MACROBLOCKD *xd,
BLOCK_SIZE bsize, int plane,
- BUFFER_SET *ctx, uint8_t *dst,
- int dst_stride) {
+ const BUFFER_SET *ctx,
+ uint8_t *dst, int dst_stride) {
struct macroblockd_plane *const pd = &xd->plane[plane];
const int ssx = xd->plane[plane].subsampling_x;
const int ssy = xd->plane[plane].subsampling_y;
@@ -1116,28 +1386,30 @@ void av1_combine_interintra(MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane,
const int ssx = xd->plane[plane].subsampling_x;
const int ssy = xd->plane[plane].subsampling_y;
const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, ssx, ssy);
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (is_cur_buf_hbd(xd)) {
combine_interintra_highbd(
xd->mi[0]->interintra_mode, xd->mi[0]->use_wedge_interintra,
- xd->mi[0]->interintra_wedge_index, xd->mi[0]->interintra_wedge_sign,
- bsize, plane_bsize, xd->plane[plane].dst.buf,
- xd->plane[plane].dst.stride, inter_pred, inter_stride, intra_pred,
- intra_stride, xd->bd);
+ xd->mi[0]->interintra_wedge_index, INTERINTRA_WEDGE_SIGN, bsize,
+ plane_bsize, xd->plane[plane].dst.buf, xd->plane[plane].dst.stride,
+ inter_pred, inter_stride, intra_pred, intra_stride, xd->bd);
return;
}
+#endif
combine_interintra(
xd->mi[0]->interintra_mode, xd->mi[0]->use_wedge_interintra,
- xd->mi[0]->interintra_wedge_index, xd->mi[0]->interintra_wedge_sign,
- bsize, plane_bsize, xd->plane[plane].dst.buf, xd->plane[plane].dst.stride,
+ xd->mi[0]->interintra_wedge_index, INTERINTRA_WEDGE_SIGN, bsize,
+ plane_bsize, xd->plane[plane].dst.buf, xd->plane[plane].dst.stride,
inter_pred, inter_stride, intra_pred, intra_stride);
}
// build interintra_predictors for one plane
-void av1_build_interintra_predictors_sbp(const AV1_COMMON *cm, MACROBLOCKD *xd,
- uint8_t *pred, int stride,
- BUFFER_SET *ctx, int plane,
- BLOCK_SIZE bsize) {
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+void av1_build_interintra_predictor(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ uint8_t *pred, int stride,
+ const BUFFER_SET *ctx, int plane,
+ BLOCK_SIZE bsize) {
+ assert(bsize < BLOCK_SIZES_ALL);
+ if (is_cur_buf_hbd(xd)) {
DECLARE_ALIGNED(16, uint16_t, intrapredictor[MAX_SB_SQUARE]);
av1_build_intra_predictors_for_interintra(
cm, xd, bsize, plane, ctx, CONVERT_TO_BYTEPTR(intrapredictor),
@@ -1152,11 +1424,3 @@ void av1_build_interintra_predictors_sbp(const AV1_COMMON *cm, MACROBLOCKD *xd,
MAX_SB_SIZE);
}
}
-
-void av1_build_interintra_predictors_sbuv(const AV1_COMMON *cm, MACROBLOCKD *xd,
- uint8_t *upred, uint8_t *vpred,
- int ustride, int vstride,
- BUFFER_SET *ctx, BLOCK_SIZE bsize) {
- av1_build_interintra_predictors_sbp(cm, xd, upred, ustride, ctx, 1, bsize);
- av1_build_interintra_predictors_sbp(cm, xd, vpred, vstride, ctx, 2, bsize);
-}
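
The two removed _sbuv entry points above are folded into the renamed per-plane builder. A minimal sketch of how the old two-plane call maps onto the new API (the wrapper below is hypothetical and only restates the deleted function with the new name; it is not a call site from libaom and needs the libaom headers to compile):

    /* Hypothetical helper: what the removed av1_build_interintra_predictors_sbuv
       did, expressed with the renamed per-plane entry point. */
    #include "av1/common/reconinter.h"

    static void build_interintra_chroma(const AV1_COMMON *cm, MACROBLOCKD *xd,
                                        uint8_t *upred, uint8_t *vpred,
                                        int ustride, int vstride,
                                        const BUFFER_SET *ctx, BLOCK_SIZE bsize) {
      av1_build_interintra_predictor(cm, xd, upred, ustride, ctx, 1, bsize); /* U */
      av1_build_interintra_predictor(cm, xd, vpred, vstride, ctx, 2, bsize); /* V */
    }
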
diff --git a/media/libaom/src/av1/common/reconinter.h b/media/libaom/src/av1/common/reconinter.h
index db86c777e..fe3c6a621 100644
--- a/media/libaom/src/av1/common/reconinter.h
+++ b/media/libaom/src/av1/common/reconinter.h
@@ -12,9 +12,9 @@
#ifndef AOM_AV1_COMMON_RECONINTER_H_
#define AOM_AV1_COMMON_RECONINTER_H_
-#include "av1/common/filter.h"
-#include "av1/common/onyxc_int.h"
+#include "av1/common/av1_common_int.h"
#include "av1/common/convolve.h"
+#include "av1/common/filter.h"
#include "av1/common/warped_motion.h"
#include "aom/aom_integer.h"
@@ -35,8 +35,7 @@
extern "C" {
#endif
-// Set to (1 << 5) if the 32-ary codebooks are used for any bock size
-#define MAX_WEDGE_TYPES (1 << 4)
+#define MAX_WEDGE_TYPES 16
#define MAX_WEDGE_SIZE_LOG2 5 // 32x32
#define MAX_WEDGE_SIZE (1 << MAX_WEDGE_SIZE_LOG2)
@@ -47,7 +46,7 @@ extern "C" {
#define WEDGE_NONE -1
// Angles are with respect to horizontal anti-clockwise
-typedef enum {
+enum {
WEDGE_HORIZONTAL = 0,
WEDGE_VERTICAL = 1,
WEDGE_OBLIQUE27 = 2,
@@ -55,7 +54,7 @@ typedef enum {
WEDGE_OBLIQUE117 = 4,
WEDGE_OBLIQUE153 = 5,
WEDGE_DIRECTIONS
-} WedgeDirectionType;
+} UENUM1BYTE(WedgeDirectionType);
// 3-tuple: {direction, x_offset, y_offset}
typedef struct {
@@ -67,13 +66,13 @@ typedef struct {
typedef uint8_t *wedge_masks_type[MAX_WEDGE_TYPES];
typedef struct {
- int bits;
+ int wedge_types;
const wedge_code_type *codebook;
uint8_t *signflip;
wedge_masks_type *masks;
} wedge_params_type;
-extern const wedge_params_type wedge_params_lookup[BLOCK_SIZES_ALL];
+extern const wedge_params_type av1_wedge_params_lookup[BLOCK_SIZES_ALL];
typedef struct SubpelParams {
int xs;
@@ -84,8 +83,6 @@ typedef struct SubpelParams {
struct build_prediction_ctxt {
const AV1_COMMON *cm;
- int mi_row;
- int mi_col;
uint8_t **tmp_buf;
int *tmp_width;
int *tmp_height;
@@ -93,6 +90,55 @@ struct build_prediction_ctxt {
int mb_to_far_edge;
};
+typedef enum InterPredMode {
+ TRANSLATION_PRED,
+ WARP_PRED,
+} InterPredMode;
+
+typedef enum InterCompMode {
+ UNIFORM_SINGLE,
+ UNIFORM_COMP,
+ MASK_COMP,
+} InterCompMode;
+
+typedef struct InterPredParams {
+ InterPredMode mode;
+ InterCompMode comp_mode;
+ WarpedMotionParams warp_params;
+ ConvolveParams conv_params;
+ const InterpFilterParams *interp_filter_params[2];
+ int block_width;
+ int block_height;
+ int pix_row;
+ int pix_col;
+ struct buf_2d ref_frame_buf;
+ int subsampling_x;
+ int subsampling_y;
+ const struct scale_factors *scale_factors;
+ int bit_depth;
+ int use_hbd_buf;
+ INTERINTER_COMPOUND_DATA mask_comp;
+ BLOCK_SIZE sb_type;
+ int is_intrabc;
+} InterPredParams;
+
+void av1_init_inter_params(InterPredParams *inter_pred_params, int block_width,
+ int block_height, int pix_row, int pix_col,
+ int subsampling_x, int subsampling_y, int bit_depth,
+ int use_hbd_buf, int is_intrabc,
+ const struct scale_factors *sf,
+ const struct buf_2d *ref_buf,
+ int_interpfilters interp_filters);
+
+void av1_init_comp_mode(InterPredParams *inter_pred_params);
+
+void av1_init_warp_params(InterPredParams *inter_pred_params,
+ const WarpTypesAllowed *warp_types, int ref,
+ const MACROBLOCKD *xd, const MB_MODE_INFO *mi);
+
+void av1_init_mask_comp(InterPredParams *inter_pred_params, BLOCK_SIZE bsize,
+ const INTERINTER_COMPOUND_DATA *mask_comp);
+
static INLINE int has_scale(int xs, int ys) {
return xs != SCALE_SUBPEL_SHIFTS || ys != SCALE_SUBPEL_SHIFTS;
}
@@ -108,53 +154,47 @@ static INLINE void revert_scale_extra_bits(SubpelParams *sp) {
assert(sp->ys <= SUBPEL_SHIFTS);
}
-static INLINE void inter_predictor(const uint8_t *src, int src_stride,
- uint8_t *dst, int dst_stride,
- const SubpelParams *subpel_params,
- const struct scale_factors *sf, int w, int h,
- ConvolveParams *conv_params,
- InterpFilters interp_filters,
- int is_intrabc) {
+static INLINE void inter_predictor(
+ const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride,
+ const SubpelParams *subpel_params, const struct scale_factors *sf, int w,
+ int h, ConvolveParams *conv_params,
+ const InterpFilterParams *interp_filters[2]) {
assert(conv_params->do_average == 0 || conv_params->do_average == 1);
assert(sf);
const int is_scaled = has_scale(subpel_params->xs, subpel_params->ys);
- assert(IMPLIES(is_intrabc, !is_scaled));
if (is_scaled) {
av1_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h,
interp_filters, subpel_params->subpel_x,
subpel_params->xs, subpel_params->subpel_y,
- subpel_params->ys, 1, conv_params, sf, is_intrabc);
+ subpel_params->ys, 1, conv_params, sf);
} else {
SubpelParams sp = *subpel_params;
revert_scale_extra_bits(&sp);
av1_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h,
interp_filters, sp.subpel_x, sp.xs, sp.subpel_y,
- sp.ys, 0, conv_params, sf, is_intrabc);
+ sp.ys, 0, conv_params, sf);
}
}
-static INLINE void highbd_inter_predictor(const uint8_t *src, int src_stride,
- uint8_t *dst, int dst_stride,
- const SubpelParams *subpel_params,
- const struct scale_factors *sf, int w,
- int h, ConvolveParams *conv_params,
- InterpFilters interp_filters,
- int is_intrabc, int bd) {
+static INLINE void highbd_inter_predictor(
+ const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride,
+ const SubpelParams *subpel_params, const struct scale_factors *sf, int w,
+ int h, ConvolveParams *conv_params,
+ const InterpFilterParams *interp_filters[2], int bd) {
assert(conv_params->do_average == 0 || conv_params->do_average == 1);
assert(sf);
const int is_scaled = has_scale(subpel_params->xs, subpel_params->ys);
- assert(IMPLIES(is_intrabc, !is_scaled));
if (is_scaled) {
- av1_highbd_convolve_2d_facade(
- src, src_stride, dst, dst_stride, w, h, interp_filters,
- subpel_params->subpel_x, subpel_params->xs, subpel_params->subpel_y,
- subpel_params->ys, 1, conv_params, sf, is_intrabc, bd);
+ av1_highbd_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h,
+ interp_filters, subpel_params->subpel_x,
+ subpel_params->xs, subpel_params->subpel_y,
+ subpel_params->ys, 1, conv_params, sf, bd);
} else {
SubpelParams sp = *subpel_params;
revert_scale_extra_bits(&sp);
- av1_highbd_convolve_2d_facade(
- src, src_stride, dst, dst_stride, w, h, interp_filters, sp.subpel_x,
- sp.xs, sp.subpel_y, sp.ys, 0, conv_params, sf, is_intrabc, bd);
+ av1_highbd_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h,
+ interp_filters, sp.subpel_x, sp.xs,
+ sp.subpel_y, sp.ys, 0, conv_params, sf, bd);
}
}
@@ -167,9 +207,10 @@ static INLINE int is_interinter_compound_used(COMPOUND_TYPE type,
const int comp_allowed = is_comp_ref_allowed(sb_type);
switch (type) {
case COMPOUND_AVERAGE:
+ case COMPOUND_DISTWTD:
case COMPOUND_DIFFWTD: return comp_allowed;
case COMPOUND_WEDGE:
- return comp_allowed && wedge_params_lookup[sb_type].bits > 0;
+ return comp_allowed && av1_wedge_params_lookup[sb_type].wedge_types > 0;
default: assert(0); return 0;
}
}
@@ -187,39 +228,41 @@ static INLINE int is_any_masked_compound_used(BLOCK_SIZE sb_type) {
return 0;
}
-static INLINE int get_wedge_bits_lookup(BLOCK_SIZE sb_type) {
- return wedge_params_lookup[sb_type].bits;
-}
-
-static INLINE int get_interinter_wedge_bits(BLOCK_SIZE sb_type) {
- const int wbits = wedge_params_lookup[sb_type].bits;
- return (wbits > 0) ? wbits + 1 : 0;
-}
-
-static INLINE int is_interintra_wedge_used(BLOCK_SIZE sb_type) {
- return wedge_params_lookup[sb_type].bits > 0;
+static INLINE int get_wedge_types_lookup(BLOCK_SIZE sb_type) {
+ return av1_wedge_params_lookup[sb_type].wedge_types;
}
-static INLINE int get_interintra_wedge_bits(BLOCK_SIZE sb_type) {
- return wedge_params_lookup[sb_type].bits;
+static INLINE int av1_is_wedge_used(BLOCK_SIZE sb_type) {
+ return av1_wedge_params_lookup[sb_type].wedge_types > 0;
}
void av1_make_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst,
- int dst_stride, const SubpelParams *subpel_params,
- const struct scale_factors *sf, int w, int h,
- ConvolveParams *conv_params,
- InterpFilters interp_filters,
- const WarpTypesAllowed *warp_types, int p_col,
- int p_row, int plane, int ref,
- const MB_MODE_INFO *mi, int build_for_obmc,
- const MACROBLOCKD *xd, int can_use_previous);
-
-void av1_make_masked_inter_predictor(
- const uint8_t *pre, int pre_stride, uint8_t *dst, int dst_stride,
- const SubpelParams *subpel_params, const struct scale_factors *sf, int w,
- int h, ConvolveParams *conv_params, InterpFilters interp_filters, int plane,
- const WarpTypesAllowed *warp_types, int p_col, int p_row, int ref,
- MACROBLOCKD *xd, int can_use_previous);
+ int dst_stride,
+ InterPredParams *inter_pred_params,
+ const SubpelParams *subpel_params);
+
+void av1_make_masked_inter_predictor(const uint8_t *pre, int pre_stride,
+ uint8_t *dst, int dst_stride,
+ InterPredParams *inter_pred_params,
+ const SubpelParams *subpel_params);
+
+typedef void (*CalcSubpelParamsFunc)(const MV *const src_mv,
+ InterPredParams *const inter_pred_params,
+ MACROBLOCKD *xd, int mi_x, int mi_y,
+ int ref, uint8_t **pre,
+ SubpelParams *subpel_params,
+ int *src_stride);
+
+void av1_build_one_inter_predictor(
+ uint8_t *dst, int dst_stride, const MV *const src_mv,
+ InterPredParams *inter_pred_params, MACROBLOCKD *xd, int mi_x, int mi_y,
+ int ref, CalcSubpelParamsFunc calc_subpel_params_func);
+
+void av1_build_inter_predictors(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ int plane, const MB_MODE_INFO *mi,
+ int build_for_obmc, int bw, int bh, int mi_x,
+ int mi_y,
+ CalcSubpelParamsFunc calc_subpel_params_func);
// TODO(jkoleszar): yet another mv clamping function :-(
static INLINE MV clamp_mv_to_umv_border_sb(const MACROBLOCKD *xd,
@@ -236,22 +279,26 @@ static INLINE MV clamp_mv_to_umv_border_sb(const MACROBLOCKD *xd,
(int16_t)(src_mv->col * (1 << (1 - ss_x))) };
assert(ss_x <= 1);
assert(ss_y <= 1);
+ const SubpelMvLimits mv_limits = {
+ xd->mb_to_left_edge * (1 << (1 - ss_x)) - spel_left,
+ xd->mb_to_right_edge * (1 << (1 - ss_x)) + spel_right,
+ xd->mb_to_top_edge * (1 << (1 - ss_y)) - spel_top,
+ xd->mb_to_bottom_edge * (1 << (1 - ss_y)) + spel_bottom
+ };
- clamp_mv(&clamped_mv, xd->mb_to_left_edge * (1 << (1 - ss_x)) - spel_left,
- xd->mb_to_right_edge * (1 << (1 - ss_x)) + spel_right,
- xd->mb_to_top_edge * (1 << (1 - ss_y)) - spel_top,
- xd->mb_to_bottom_edge * (1 << (1 - ss_y)) + spel_bottom);
+ clamp_mv(&clamped_mv, &mv_limits);
return clamped_mv;
}
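
The clamp now goes through a SubpelMvLimits aggregate instead of four loose bounds. A standalone sketch of that pattern with local stand-in types (the struct layout and helper names below are illustrative, not the definitions from av1/common/mv.h):

    #include <stdio.h>

    typedef struct { int col_min, col_max, row_min, row_max; } Limits;
    typedef struct { int row, col; } Mv;

    static int clamp_int(int v, int lo, int hi) {
      return v < lo ? lo : (v > hi ? hi : v);
    }

    static void clamp_mv_to(Mv *mv, const Limits *lim) {
      mv->col = clamp_int(mv->col, lim->col_min, lim->col_max);
      mv->row = clamp_int(mv->row, lim->row_min, lim->row_max);
    }

    int main(void) {
      Mv mv = { .row = 900, .col = -500 };          /* 1/8-pel units */
      const Limits lim = { -256, 256, -256, 256 };  /* hypothetical borders */
      clamp_mv_to(&mv, &lim);
      printf("row=%d col=%d\n", mv.row, mv.col);    /* row=256 col=-256 */
      return 0;
    }
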
-static INLINE int scaled_buffer_offset(int x_offset, int y_offset, int stride,
- const struct scale_factors *sf) {
+static INLINE int64_t scaled_buffer_offset(int x_offset, int y_offset,
+ int stride,
+ const struct scale_factors *sf) {
const int x =
sf ? sf->scale_value_x(x_offset, sf) >> SCALE_EXTRA_BITS : x_offset;
const int y =
sf ? sf->scale_value_y(y_offset, sf) >> SCALE_EXTRA_BITS : y_offset;
- return y * stride + x;
+ return (int64_t)y * stride + x;
}
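
Widening scaled_buffer_offset to int64_t protects the y * stride product, which can exceed 32 bits once scaled offsets meet large strides. A standalone sketch with hypothetical values (not taken from libaom) of the arithmetic the cast guards:

    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
      const int y = 40000;      /* hypothetical scaled row offset */
      const int stride = 65536; /* hypothetical stride in bytes */
      const int x = 12;
      /* Formed in 64 bits, exactly as the new return statement does. */
      const int64_t offset = (int64_t)y * stride + x;
      printf("offset = %lld, INT32_MAX = %d\n", (long long)offset, INT32_MAX);
      return 0;
    }
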
static INLINE void setup_pred_plane(struct buf_2d *dst, BLOCK_SIZE bsize,
@@ -296,6 +343,11 @@ static INLINE int av1_is_interp_needed(const MACROBLOCKD *const xd) {
return 1;
}
+void av1_setup_address_for_obmc(MACROBLOCKD *xd, int mi_row_offset,
+ int mi_col_offset, MB_MODE_INFO *ref_mbmi,
+ struct build_prediction_ctxt *ctxt,
+ const int num_planes);
+
void av1_setup_build_prediction_by_above_pred(
MACROBLOCKD *xd, int rel_mi_col, uint8_t above_mi_width,
MB_MODE_INFO *above_mbmi, struct build_prediction_ctxt *ctxt,
@@ -306,56 +358,53 @@ void av1_setup_build_prediction_by_left_pred(MACROBLOCKD *xd, int rel_mi_row,
struct build_prediction_ctxt *ctxt,
const int num_planes);
void av1_build_obmc_inter_prediction(const AV1_COMMON *cm, MACROBLOCKD *xd,
- int mi_row, int mi_col,
uint8_t *above[MAX_MB_PLANE],
int above_stride[MAX_MB_PLANE],
uint8_t *left[MAX_MB_PLANE],
int left_stride[MAX_MB_PLANE]);
const uint8_t *av1_get_obmc_mask(int length);
-void av1_count_overlappable_neighbors(const AV1_COMMON *cm, MACROBLOCKD *xd,
- int mi_row, int mi_col);
+void av1_count_overlappable_neighbors(const AV1_COMMON *cm, MACROBLOCKD *xd);
#define MASK_MASTER_SIZE ((MAX_WEDGE_SIZE) << 1)
#define MASK_MASTER_STRIDE (MASK_MASTER_SIZE)
void av1_init_wedge_masks();
-static INLINE const uint8_t *av1_get_contiguous_soft_mask(int wedge_index,
- int wedge_sign,
+static INLINE const uint8_t *av1_get_contiguous_soft_mask(int8_t wedge_index,
+ int8_t wedge_sign,
BLOCK_SIZE sb_type) {
- return wedge_params_lookup[sb_type].masks[wedge_sign][wedge_index];
+ return av1_wedge_params_lookup[sb_type].masks[wedge_sign][wedge_index];
}
const uint8_t *av1_get_compound_type_mask(
const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE sb_type);
// build interintra_predictors for one plane
-void av1_build_interintra_predictors_sbp(const AV1_COMMON *cm, MACROBLOCKD *xd,
- uint8_t *pred, int stride,
- BUFFER_SET *ctx, int plane,
- BLOCK_SIZE bsize);
-
-void av1_build_interintra_predictors_sbuv(const AV1_COMMON *cm, MACROBLOCKD *xd,
- uint8_t *upred, uint8_t *vpred,
- int ustride, int vstride,
- BUFFER_SET *ctx, BLOCK_SIZE bsize);
+void av1_build_interintra_predictor(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ uint8_t *pred, int stride,
+ const BUFFER_SET *ctx, int plane,
+ BLOCK_SIZE bsize);
-void av1_build_intra_predictors_for_interintra(
- const AV1_COMMON *cm, MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane,
- BUFFER_SET *ctx, uint8_t *intra_pred, int intra_stride);
+void av1_build_intra_predictors_for_interintra(const AV1_COMMON *cm,
+ MACROBLOCKD *xd,
+ BLOCK_SIZE bsize, int plane,
+ const BUFFER_SET *ctx,
+ uint8_t *dst, int dst_stride);
void av1_combine_interintra(MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane,
const uint8_t *inter_pred, int inter_stride,
const uint8_t *intra_pred, int intra_stride);
-void av1_jnt_comp_weight_assign(const AV1_COMMON *cm, const MB_MODE_INFO *mbmi,
- int order_idx, int *fwd_offset, int *bck_offset,
- int *use_jnt_comp_avg, int is_compound);
+void av1_dist_wtd_comp_weight_assign(const AV1_COMMON *cm,
+ const MB_MODE_INFO *mbmi, int order_idx,
+ int *fwd_offset, int *bck_offset,
+ int *use_dist_wtd_comp_avg,
+ int is_compound);
int av1_allow_warp(const MB_MODE_INFO *const mbmi,
const WarpTypesAllowed *const warp_types,
const WarpedMotionParams *const gm_params,
- int build_for_obmc, int x_scale, int y_scale,
+ int build_for_obmc, const struct scale_factors *const sf,
WarpedMotionParams *final_warp_params);
#ifdef __cplusplus
diff --git a/media/libaom/src/av1/common/reconintra.c b/media/libaom/src/av1/common/reconintra.c
index 71a52e73e..1307a0313 100644
--- a/media/libaom/src/av1/common/reconintra.c
+++ b/media/libaom/src/av1/common/reconintra.c
@@ -20,9 +20,9 @@
#include "aom_ports/aom_once.h"
#include "aom_ports/mem.h"
#include "aom_ports/system_state.h"
-#include "av1/common/reconintra.h"
-#include "av1/common/onyxc_int.h"
+#include "av1/common/av1_common_int.h"
#include "av1/common/cfl.h"
+#include "av1/common/reconintra.h"
enum {
NEED_LEFT = 1 << 1,
@@ -198,7 +198,7 @@ static int has_top_right(const AV1_COMMON *cm, BLOCK_SIZE bsize, int mi_row,
int col_off, int ss_x, int ss_y) {
if (!top_available || !right_available) return 0;
- const int bw_unit = block_size_wide[bsize] >> tx_size_wide_log2[0];
+ const int bw_unit = mi_size_wide[bsize];
const int plane_bw_unit = AOMMAX(bw_unit >> ss_x, 1);
const int top_right_count_unit = tx_size_wide_unit[txsz];
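
The switch to mi_size_wide here (and mi_size_high in has_bottom_left below) is behavior-preserving because the smallest transform is 4x4, so tx_size_wide_log2[0] equals MI_SIZE_LOG2 (both are 2). A tiny standalone check of that identity with a hypothetical 16-pixel-wide block:

    #include <assert.h>

    int main(void) {
      const int block_size_wide = 16;    /* pixels, e.g. a 16x8 block */
      const int mi_size_wide = 4;        /* the same width in 4x4 mi units */
      const int tx_size_wide_log2_0 = 2; /* log2 of the smallest (4x4) tx */
      assert((block_size_wide >> tx_size_wide_log2_0) == mi_size_wide);
      return 0;
    }
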
@@ -405,7 +405,7 @@ static int has_bottom_left(const AV1_COMMON *cm, BLOCK_SIZE bsize, int mi_row,
// Bottom-left pixels are in the bottom-left block, which is not available.
return 0;
} else {
- const int bh_unit = block_size_high[bsize] >> tx_size_high_log2[0];
+ const int bh_unit = mi_size_high[bsize];
const int plane_bh_unit = AOMMAX(bh_unit >> ss_y, 1);
const int bottom_left_count_unit = tx_size_high_unit[txsz];
@@ -422,10 +422,9 @@ static int has_bottom_left(const AV1_COMMON *cm, BLOCK_SIZE bsize, int mi_row,
// and/or bottom-left superblocks. But only the left superblock is
// available, so check if all required pixels fall in that superblock.
if (blk_col_in_sb == 0) {
- const int blk_start_row_off = blk_row_in_sb
- << (bh_in_mi_log2 + MI_SIZE_LOG2 -
- tx_size_wide_log2[0]) >>
- ss_y;
+ const int blk_start_row_off =
+ blk_row_in_sb << (bh_in_mi_log2 + MI_SIZE_LOG2 - MI_SIZE_LOG2) >>
+ ss_y;
const int row_off_in_sb = blk_start_row_off + row_off;
const int sb_height_unit = sb_mi_size >> ss_y;
return row_off_in_sb + bottom_left_count_unit < sb_height_unit;
@@ -453,11 +452,13 @@ typedef void (*intra_pred_fn)(uint8_t *dst, ptrdiff_t stride,
static intra_pred_fn pred[INTRA_MODES][TX_SIZES_ALL];
static intra_pred_fn dc_pred[2][2][TX_SIZES_ALL];
+#if CONFIG_AV1_HIGHBITDEPTH
typedef void (*intra_high_pred_fn)(uint16_t *dst, ptrdiff_t stride,
const uint16_t *above, const uint16_t *left,
int bd);
static intra_high_pred_fn pred_high[INTRA_MODES][TX_SIZES_ALL];
static intra_high_pred_fn dc_pred_high[2][2][TX_SIZES_ALL];
+#endif
static void init_intra_predictors_internal(void) {
assert(NELEMENTS(mode_to_angle_map) == INTRA_MODES);
@@ -499,7 +500,7 @@ static void init_intra_predictors_internal(void) {
INIT_ALL_SIZES(dc_pred[0][1], dc_top);
INIT_ALL_SIZES(dc_pred[1][0], dc_left);
INIT_ALL_SIZES(dc_pred[1][1], dc);
-
+#if CONFIG_AV1_HIGHBITDEPTH
INIT_ALL_SIZES(pred_high[V_PRED], highbd_v);
INIT_ALL_SIZES(pred_high[H_PRED], highbd_h);
INIT_ALL_SIZES(pred_high[PAETH_PRED], highbd_paeth);
@@ -510,6 +511,7 @@ static void init_intra_predictors_internal(void) {
INIT_ALL_SIZES(dc_pred_high[0][1], highbd_dc_top);
INIT_ALL_SIZES(dc_pred_high[1][0], highbd_dc_left);
INIT_ALL_SIZES(dc_pred_high[1][1], highbd_dc);
+#endif
#undef intra_pred_allsizes
}
@@ -556,33 +558,37 @@ void av1_dr_prediction_z2_c(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
const uint8_t *above, const uint8_t *left,
int upsample_above, int upsample_left, int dx,
int dy) {
- int r, c, x, y, shift1, shift2, val, base1, base2;
-
assert(dx > 0);
assert(dy > 0);
const int min_base_x = -(1 << upsample_above);
+ const int min_base_y = -(1 << upsample_left);
+ (void)min_base_y;
const int frac_bits_x = 6 - upsample_above;
const int frac_bits_y = 6 - upsample_left;
- const int base_inc_x = 1 << upsample_above;
- x = -dx;
- for (r = 0; r < bh; ++r, x -= dx, dst += stride) {
- base1 = x >> frac_bits_x;
- y = (r << 6) - dy;
- for (c = 0; c < bw; ++c, base1 += base_inc_x, y -= dy) {
- if (base1 >= min_base_x) {
- shift1 = ((x * (1 << upsample_above)) & 0x3F) >> 1;
- val = above[base1] * (32 - shift1) + above[base1 + 1] * shift1;
+
+ for (int r = 0; r < bh; ++r) {
+ for (int c = 0; c < bw; ++c) {
+ int val;
+ int y = r + 1;
+ int x = (c << 6) - y * dx;
+ const int base_x = x >> frac_bits_x;
+ if (base_x >= min_base_x) {
+ const int shift = ((x * (1 << upsample_above)) & 0x3F) >> 1;
+ val = above[base_x] * (32 - shift) + above[base_x + 1] * shift;
val = ROUND_POWER_OF_TWO(val, 5);
} else {
- base2 = y >> frac_bits_y;
- assert(base2 >= -(1 << upsample_left));
- shift2 = ((y * (1 << upsample_left)) & 0x3F) >> 1;
- val = left[base2] * (32 - shift2) + left[base2 + 1] * shift2;
+ x = c + 1;
+ y = (r << 6) - x * dy;
+ const int base_y = y >> frac_bits_y;
+ assert(base_y >= min_base_y);
+ const int shift = ((y * (1 << upsample_left)) & 0x3F) >> 1;
+ val = left[base_y] * (32 - shift) + left[base_y + 1] * shift;
val = ROUND_POWER_OF_TWO(val, 5);
}
dst[c] = val;
}
+ dst += stride;
}
}
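
Both the old and new zone-2 loops blend two reference pixels with 5-bit weights; the rewrite indexes the above row at x = (c << 6) - (r + 1) * dx and falls back to the left column at y = (r << 6) - (c + 1) * dy. A standalone sketch of that per-pixel decision for the no-upsampling case (the helper name and the convention that index -1 is the top-left corner are assumptions for illustration):

    #include <stdint.h>
    #include <stdio.h>

    /* Mirrors the new av1_dr_prediction_z2_c inner loop with
       upsample_above == upsample_left == 0.  'above' and 'left' point one
       pixel past the top-left corner, so index -1 is valid. */
    static uint8_t dr_z2_pixel(const uint8_t *above, const uint8_t *left,
                               int r, int c, int dx, int dy) {
      int y = r + 1;
      int x = (c << 6) - y * dx;            /* 64ths of a pixel along the top */
      const int base_x = x >> 6;
      if (base_x >= -1) {                   /* top reference is usable */
        const int shift = (x & 0x3F) >> 1;  /* blend weight, 0..31 */
        const int val =
            above[base_x] * (32 - shift) + above[base_x + 1] * shift;
        return (uint8_t)((val + 16) >> 5);  /* ROUND_POWER_OF_TWO(val, 5) */
      }
      x = c + 1;                            /* otherwise use the left column */
      y = (r << 6) - x * dy;
      const int base_y = y >> 6;
      const int shift = (y & 0x3F) >> 1;
      const int val = left[base_y] * (32 - shift) + left[base_y + 1] * shift;
      return (uint8_t)((val + 16) >> 5);
    }

    int main(void) {
      /* Toy edges; element 0 is the shared top-left corner (index -1 above). */
      const uint8_t top[5] = { 128, 100, 110, 120, 130 };
      const uint8_t left[5] = { 128, 90, 80, 70, 60 };
      printf("%u\n", dr_z2_pixel(top + 1, left + 1, 1, 1, 64, 64)); /* 128 */
      return 0;
    }
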
@@ -643,6 +649,7 @@ static void dr_predictor(uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size,
}
}
+#if CONFIG_AV1_HIGHBITDEPTH
// Directional prediction, zone 1: 0 < angle < 90
void av1_highbd_dr_prediction_z1_c(uint16_t *dst, ptrdiff_t stride, int bw,
int bh, const uint16_t *above,
@@ -688,30 +695,33 @@ void av1_highbd_dr_prediction_z2_c(uint16_t *dst, ptrdiff_t stride, int bw,
int bh, const uint16_t *above,
const uint16_t *left, int upsample_above,
int upsample_left, int dx, int dy, int bd) {
- int r, c, x, y, shift, val, base;
-
(void)bd;
assert(dx > 0);
assert(dy > 0);
const int min_base_x = -(1 << upsample_above);
+ const int min_base_y = -(1 << upsample_left);
+ (void)min_base_y;
const int frac_bits_x = 6 - upsample_above;
const int frac_bits_y = 6 - upsample_left;
- for (r = 0; r < bh; ++r) {
- for (c = 0; c < bw; ++c) {
- y = r + 1;
- x = (c << 6) - y * dx;
- base = x >> frac_bits_x;
- if (base >= min_base_x) {
- shift = ((x * (1 << upsample_above)) & 0x3F) >> 1;
- val = above[base] * (32 - shift) + above[base + 1] * shift;
+
+ for (int r = 0; r < bh; ++r) {
+ for (int c = 0; c < bw; ++c) {
+ int val;
+ int y = r + 1;
+ int x = (c << 6) - y * dx;
+ const int base_x = x >> frac_bits_x;
+ if (base_x >= min_base_x) {
+ const int shift = ((x * (1 << upsample_above)) & 0x3F) >> 1;
+ val = above[base_x] * (32 - shift) + above[base_x + 1] * shift;
val = ROUND_POWER_OF_TWO(val, 5);
} else {
x = c + 1;
y = (r << 6) - x * dy;
- base = y >> frac_bits_y;
- shift = ((y * (1 << upsample_left)) & 0x3F) >> 1;
- val = left[base] * (32 - shift) + left[base + 1] * shift;
+ const int base_y = y >> frac_bits_y;
+ assert(base_y >= min_base_y);
+ const int shift = ((y * (1 << upsample_left)) & 0x3F) >> 1;
+ val = left[base_y] * (32 - shift) + left[base_y + 1] * shift;
val = ROUND_POWER_OF_TWO(val, 5);
}
dst[c] = val;
@@ -778,6 +788,7 @@ static void highbd_dr_predictor(uint16_t *dst, ptrdiff_t stride,
pred_high[H_PRED][tx_size](dst, stride, above, left, bd);
}
}
+#endif // CONFIG_AV1_HIGHBITDEPTH
DECLARE_ALIGNED(16, const int8_t,
av1_filter_intra_taps[FILTER_INTRA_MODES][8][8]) = {
@@ -843,10 +854,6 @@ void av1_filter_intra_predictor_c(uint8_t *dst, ptrdiff_t stride,
assert(bw <= 32 && bh <= 32);
- // The initialization is just for silencing Jenkins static analysis warnings
- for (r = 0; r < bh + 1; ++r)
- memset(buffer[r], 0, (bw + 1) * sizeof(buffer[0][0]));
-
for (r = 0; r < bh; ++r) buffer[r + 1][0] = left[r];
memcpy(buffer[0], &above[-1], (bw + 1) * sizeof(uint8_t));
@@ -881,6 +888,7 @@ void av1_filter_intra_predictor_c(uint8_t *dst, ptrdiff_t stride,
}
}
+#if CONFIG_AV1_HIGHBITDEPTH
static void highbd_filter_intra_predictor(uint16_t *dst, ptrdiff_t stride,
TX_SIZE tx_size,
const uint16_t *above,
@@ -893,10 +901,6 @@ static void highbd_filter_intra_predictor(uint16_t *dst, ptrdiff_t stride,
assert(bw <= 32 && bh <= 32);
- // The initialization is just for silencing Jenkins static analysis warnings
- for (r = 0; r < bh + 1; ++r)
- memset(buffer[r], 0, (bw + 1) * sizeof(buffer[0][0]));
-
for (r = 0; r < bh; ++r) buffer[r + 1][0] = left[r];
memcpy(buffer[0], &above[-1], (bw + 1) * sizeof(buffer[0][0]));
@@ -931,6 +935,7 @@ static void highbd_filter_intra_predictor(uint16_t *dst, ptrdiff_t stride,
dst += stride;
}
}
+#endif // CONFIG_AV1_HIGHBITDEPTH
static int is_smooth(const MB_MODE_INFO *mbmi, int plane) {
if (plane == 0) {
@@ -1008,9 +1013,9 @@ static int intra_edge_filter_strength(int bs0, int bs1, int delta, int type) {
void av1_filter_intra_edge_c(uint8_t *p, int sz, int strength) {
if (!strength) return;
- const int kernel[INTRA_EDGE_FILT][INTRA_EDGE_TAPS] = {
- { 0, 4, 8, 4, 0 }, { 0, 5, 6, 5, 0 }, { 2, 4, 4, 4, 2 }
- };
+ const int kernel[INTRA_EDGE_FILT][INTRA_EDGE_TAPS] = { { 0, 4, 8, 4, 0 },
+ { 0, 5, 6, 5, 0 },
+ { 2, 4, 4, 4, 2 } };
const int filt = strength - 1;
uint8_t edge[129];
@@ -1041,9 +1046,9 @@ static void filter_intra_edge_corner(uint8_t *p_above, uint8_t *p_left) {
void av1_filter_intra_edge_high_c(uint16_t *p, int sz, int strength) {
if (!strength) return;
- const int kernel[INTRA_EDGE_FILT][INTRA_EDGE_TAPS] = {
- { 0, 4, 8, 4, 0 }, { 0, 5, 6, 5, 0 }, { 2, 4, 4, 4, 2 }
- };
+ const int kernel[INTRA_EDGE_FILT][INTRA_EDGE_TAPS] = { { 0, 4, 8, 4, 0 },
+ { 0, 5, 6, 5, 0 },
+ { 2, 4, 4, 4, 2 } };
const int filt = strength - 1;
uint16_t edge[129];
@@ -1061,6 +1066,7 @@ void av1_filter_intra_edge_high_c(uint16_t *p, int sz, int strength) {
}
}
+#if CONFIG_AV1_HIGHBITDEPTH
static void filter_intra_edge_corner_high(uint16_t *p_above, uint16_t *p_left) {
const int kernel[3] = { 5, 6, 5 };
@@ -1070,6 +1076,7 @@ static void filter_intra_edge_corner_high(uint16_t *p_above, uint16_t *p_left) {
p_above[-1] = s;
p_left[-1] = s;
}
+#endif
void av1_upsample_intra_edge_c(uint8_t *p, int sz) {
// interpolate half-sample positions
@@ -1117,7 +1124,7 @@ void av1_upsample_intra_edge_high_c(uint16_t *p, int sz, int bd) {
p[2 * i] = in[i + 2];
}
}
-
+#if CONFIG_AV1_HIGHBITDEPTH
static void build_intra_predictors_high(
const MACROBLOCKD *xd, const uint8_t *ref8, int ref_stride, uint8_t *dst8,
int dst_stride, PREDICTION_MODE mode, int angle_delta,
@@ -1144,7 +1151,7 @@ static void build_intra_predictors_high(
int base = 128 << (xd->bd - 8);
// The default values if ref pixels are not available:
- // base-1 base-1 base-1 .. base-1 base-1 base-1 base-1 base-1 base-1
+ // base base-1 base-1 .. base-1 base-1 base-1 base-1 base-1 base-1
// base+1 A B .. Y Z
// base+1 C D .. W X
// base+1 E F .. U V
@@ -1182,7 +1189,7 @@ static void build_intra_predictors_high(
// NEED_LEFT
if (need_left) {
- int need_bottom = !!(extend_modes[mode] & NEED_BOTTOMLEFT);
+ int need_bottom = extend_modes[mode] & NEED_BOTTOMLEFT;
if (use_filter_intra) need_bottom = 0;
if (is_dr_mode) need_bottom = p_angle > 180;
const int num_left_pixels_needed = txhpx + (need_bottom ? txwpx : 0);
@@ -1207,7 +1214,7 @@ static void build_intra_predictors_high(
// NEED_ABOVE
if (need_above) {
- int need_right = !!(extend_modes[mode] & NEED_ABOVERIGHT);
+ int need_right = extend_modes[mode] & NEED_ABOVERIGHT;
if (use_filter_intra) need_right = 0;
if (is_dr_mode) need_right = p_angle < 90;
const int num_top_pixels_needed = txwpx + (need_right ? txhpx : 0);
@@ -1302,6 +1309,7 @@ static void build_intra_predictors_high(
pred_high[mode][tx_size](dst, dst_stride, above_row, left_col, xd->bd);
}
}
+#endif // CONFIG_AV1_HIGHBITDEPTH
static void build_intra_predictors(const MACROBLOCKD *xd, const uint8_t *ref,
int ref_stride, uint8_t *dst, int dst_stride,
@@ -1328,7 +1336,7 @@ static void build_intra_predictors(const MACROBLOCKD *xd, const uint8_t *ref,
const int use_filter_intra = filter_intra_mode != FILTER_INTRA_MODES;
// The default values if ref pixels are not available:
- // 127 127 127 .. 127 127 127 127 127 127
+ // 128 127 127 .. 127 127 127 127 127 127
// 129 A B .. Y Z
// 129 C D .. W X
// 129 E F .. U V
@@ -1367,10 +1375,13 @@ static void build_intra_predictors(const MACROBLOCKD *xd, const uint8_t *ref,
// NEED_LEFT
if (need_left) {
- int need_bottom = !!(extend_modes[mode] & NEED_BOTTOMLEFT);
+ int need_bottom = extend_modes[mode] & NEED_BOTTOMLEFT;
if (use_filter_intra) need_bottom = 0;
if (is_dr_mode) need_bottom = p_angle > 180;
- const int num_left_pixels_needed = txhpx + (need_bottom ? txwpx : 0);
+ // The AVX2 dr_prediction_z2 may read up to 3 extra bytes because the
+ // AVX2 mask load works at dword granularity, so initialize 3 extra
+ // bytes to silence the valgrind complaint.
+ const int num_left_pixels_needed = txhpx + (need_bottom ? txwpx : 3);
i = 0;
if (n_left_px > 0) {
for (; i < n_left_px; i++) left_col[i] = left_ref[i * ref_stride];
@@ -1392,7 +1403,7 @@ static void build_intra_predictors(const MACROBLOCKD *xd, const uint8_t *ref,
// NEED_ABOVE
if (need_above) {
- int need_right = !!(extend_modes[mode] & NEED_ABOVERIGHT);
+ int need_right = extend_modes[mode] & NEED_ABOVERIGHT;
if (use_filter_intra) need_right = 0;
if (is_dr_mode) need_right = p_angle < 90;
const int num_top_pixels_needed = txwpx + (need_right ? txhpx : 0);
@@ -1486,6 +1497,57 @@ static void build_intra_predictors(const MACROBLOCKD *xd, const uint8_t *ref,
}
}
+static INLINE BLOCK_SIZE scale_chroma_bsize(BLOCK_SIZE bsize, int subsampling_x,
+ int subsampling_y) {
+ assert(subsampling_x >= 0 && subsampling_x < 2);
+ assert(subsampling_y >= 0 && subsampling_y < 2);
+ BLOCK_SIZE bs = bsize;
+ switch (bsize) {
+ case BLOCK_4X4:
+ if (subsampling_x == 1 && subsampling_y == 1)
+ bs = BLOCK_8X8;
+ else if (subsampling_x == 1)
+ bs = BLOCK_8X4;
+ else if (subsampling_y == 1)
+ bs = BLOCK_4X8;
+ break;
+ case BLOCK_4X8:
+ if (subsampling_x == 1 && subsampling_y == 1)
+ bs = BLOCK_8X8;
+ else if (subsampling_x == 1)
+ bs = BLOCK_8X8;
+ else if (subsampling_y == 1)
+ bs = BLOCK_4X8;
+ break;
+ case BLOCK_8X4:
+ if (subsampling_x == 1 && subsampling_y == 1)
+ bs = BLOCK_8X8;
+ else if (subsampling_x == 1)
+ bs = BLOCK_8X4;
+ else if (subsampling_y == 1)
+ bs = BLOCK_8X8;
+ break;
+ case BLOCK_4X16:
+ if (subsampling_x == 1 && subsampling_y == 1)
+ bs = BLOCK_8X16;
+ else if (subsampling_x == 1)
+ bs = BLOCK_8X16;
+ else if (subsampling_y == 1)
+ bs = BLOCK_4X16;
+ break;
+ case BLOCK_16X4:
+ if (subsampling_x == 1 && subsampling_y == 1)
+ bs = BLOCK_16X8;
+ else if (subsampling_x == 1)
+ bs = BLOCK_16X4;
+ else if (subsampling_y == 1)
+ bs = BLOCK_16X8;
+ break;
+ default: break;
+ }
+ return bs;
+}
+
void av1_predict_intra_block(
const AV1_COMMON *cm, const MACROBLOCKD *xd, int wpx, int hpx,
TX_SIZE tx_size, PREDICTION_MODE mode, int angle_delta, int use_palette,
@@ -1494,8 +1556,8 @@ void av1_predict_intra_block(
const MB_MODE_INFO *const mbmi = xd->mi[0];
const int txwpx = tx_size_wide[tx_size];
const int txhpx = tx_size_high[tx_size];
- const int x = col_off << tx_size_wide_log2[0];
- const int y = row_off << tx_size_high_log2[0];
+ const int x = col_off << MI_SIZE_LOG2;
+ const int y = row_off << MI_SIZE_LOG2;
if (use_palette) {
int r, c;
@@ -1503,7 +1565,7 @@ void av1_predict_intra_block(
xd->color_index_map_offset[plane != 0];
const uint16_t *const palette =
mbmi->palette_mode_info.palette_colors + plane * PALETTE_MAX_SIZE;
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ if (is_cur_buf_hbd(xd)) {
uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst);
for (r = 0; r < txhpx; ++r) {
for (c = 0; c < txwpx; ++c) {
@@ -1521,15 +1583,15 @@ void av1_predict_intra_block(
return;
}
- BLOCK_SIZE bsize = mbmi->sb_type;
const struct macroblockd_plane *const pd = &xd->plane[plane];
const int txw = tx_size_wide_unit[tx_size];
const int txh = tx_size_high_unit[tx_size];
- const int have_top = row_off || (pd->subsampling_y ? xd->chroma_up_available
- : xd->up_available);
+ const int ss_x = pd->subsampling_x;
+ const int ss_y = pd->subsampling_y;
+ const int have_top =
+ row_off || (ss_y ? xd->chroma_up_available : xd->up_available);
const int have_left =
- col_off ||
- (pd->subsampling_x ? xd->chroma_left_available : xd->left_available);
+ col_off || (ss_x ? xd->chroma_left_available : xd->left_available);
const int mi_row = -xd->mb_to_top_edge >> (3 + MI_SIZE_LOG2);
const int mi_col = -xd->mb_to_left_edge >> (3 + MI_SIZE_LOG2);
const int xr_chr_offset = 0;
@@ -1537,32 +1599,35 @@ void av1_predict_intra_block(
// Distance between the right edge of this prediction block to
// the frame right edge
- const int xr = (xd->mb_to_right_edge >> (3 + pd->subsampling_x)) +
- (wpx - x - txwpx) - xr_chr_offset;
+ const int xr =
+ (xd->mb_to_right_edge >> (3 + ss_x)) + (wpx - x - txwpx) - xr_chr_offset;
// Distance between the bottom edge of this prediction block to
// the frame bottom edge
- const int yd = (xd->mb_to_bottom_edge >> (3 + pd->subsampling_y)) +
- (hpx - y - txhpx) - yd_chr_offset;
+ const int yd =
+ (xd->mb_to_bottom_edge >> (3 + ss_y)) + (hpx - y - txhpx) - yd_chr_offset;
const int right_available =
- mi_col + ((col_off + txw) << pd->subsampling_x) < xd->tile.mi_col_end;
+ mi_col + ((col_off + txw) << ss_x) < xd->tile.mi_col_end;
const int bottom_available =
- (yd > 0) &&
- (mi_row + ((row_off + txh) << pd->subsampling_y) < xd->tile.mi_row_end);
+ (yd > 0) && (mi_row + ((row_off + txh) << ss_y) < xd->tile.mi_row_end);
const PARTITION_TYPE partition = mbmi->partition;
+ BLOCK_SIZE bsize = mbmi->sb_type;
// force 4x4 chroma component block size.
- bsize = scale_chroma_bsize(bsize, pd->subsampling_x, pd->subsampling_y);
+ if (ss_x || ss_y) {
+ bsize = scale_chroma_bsize(bsize, ss_x, ss_y);
+ }
- const int have_top_right = has_top_right(
- cm, bsize, mi_row, mi_col, have_top, right_available, partition, tx_size,
- row_off, col_off, pd->subsampling_x, pd->subsampling_y);
- const int have_bottom_left = has_bottom_left(
- cm, bsize, mi_row, mi_col, bottom_available, have_left, partition,
- tx_size, row_off, col_off, pd->subsampling_x, pd->subsampling_y);
+ const int have_top_right =
+ has_top_right(cm, bsize, mi_row, mi_col, have_top, right_available,
+ partition, tx_size, row_off, col_off, ss_x, ss_y);
+ const int have_bottom_left =
+ has_bottom_left(cm, bsize, mi_row, mi_col, bottom_available, have_left,
+ partition, tx_size, row_off, col_off, ss_x, ss_y);
const int disable_edge_filter = !cm->seq_params.enable_intra_edge_filter;
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (is_cur_buf_hbd(xd)) {
build_intra_predictors_high(
xd, ref, ref_stride, dst, dst_stride, mode, angle_delta,
filter_intra_mode, tx_size, disable_edge_filter,
@@ -1572,7 +1637,7 @@ void av1_predict_intra_block(
have_bottom_left ? AOMMIN(txhpx, yd) : 0, plane);
return;
}
-
+#endif
build_intra_predictors(xd, ref, ref_stride, dst, dst_stride, mode,
angle_delta, filter_intra_mode, tx_size,
disable_edge_filter,
@@ -1588,8 +1653,7 @@ void av1_predict_intra_block_facade(const AV1_COMMON *cm, MACROBLOCKD *xd,
const MB_MODE_INFO *const mbmi = xd->mi[0];
struct macroblockd_plane *const pd = &xd->plane[plane];
const int dst_stride = pd->dst.stride;
- uint8_t *dst =
- &pd->dst.buf[(blk_row * dst_stride + blk_col) << tx_size_wide_log2[0]];
+ uint8_t *dst = &pd->dst.buf[(blk_row * dst_stride + blk_col) << MI_SIZE_LOG2];
const PREDICTION_MODE mode =
(plane == AOM_PLANE_Y) ? mbmi->mode : get_uv_mode(mbmi->uv_mode);
const int use_palette = mbmi->palette_mode_info.palette_size[plane != 0] > 0;
diff --git a/media/libaom/src/av1/common/reconintra.h b/media/libaom/src/av1/common/reconintra.h
index 07853aba0..9d203569c 100644
--- a/media/libaom/src/av1/common/reconintra.h
+++ b/media/libaom/src/av1/common/reconintra.h
@@ -15,8 +15,8 @@
#include <stdlib.h>
#include "aom/aom_integer.h"
+#include "av1/common/av1_common_int.h"
#include "av1/common/blockd.h"
-#include "av1/common/onyxc_int.h"
#ifdef __cplusplus
extern "C" {
@@ -26,13 +26,11 @@ void av1_init_intra_predictors(void);
void av1_predict_intra_block_facade(const AV1_COMMON *cm, MACROBLOCKD *xd,
int plane, int blk_col, int blk_row,
TX_SIZE tx_size);
-void av1_predict_intra_block(const AV1_COMMON *cm, const MACROBLOCKD *xd,
- int bw, int bh, TX_SIZE tx_size,
- PREDICTION_MODE mode, int angle_delta,
- int use_palette,
- FILTER_INTRA_MODE filter_intra_mode,
- const uint8_t *ref, int ref_stride, uint8_t *dst,
- int dst_stride, int aoff, int loff, int plane);
+void av1_predict_intra_block(
+ const AV1_COMMON *cm, const MACROBLOCKD *xd, int wpx, int hpx,
+ TX_SIZE tx_size, PREDICTION_MODE mode, int angle_delta, int use_palette,
+ FILTER_INTRA_MODE filter_intra_mode, const uint8_t *ref, int ref_stride,
+ uint8_t *dst, int dst_stride, int col_off, int row_off, int plane);
// Mapping of interintra to intra mode for use in the intra component
static const PREDICTION_MODE interintra_to_intra_mode[INTERINTRA_MODES] = {
@@ -56,8 +54,8 @@ static INLINE int av1_use_angle_delta(BLOCK_SIZE bsize) {
}
static INLINE int av1_allow_intrabc(const AV1_COMMON *const cm) {
- return frame_is_intra_only(cm) && cm->allow_screen_content_tools &&
- cm->allow_intrabc;
+ return frame_is_intra_only(cm) && cm->features.allow_screen_content_tools &&
+ cm->features.allow_intrabc;
}
static INLINE int av1_filter_intra_allowed_bsize(const AV1_COMMON *const cm,
@@ -76,6 +74,40 @@ static INLINE int av1_filter_intra_allowed(const AV1_COMMON *const cm,
extern const int8_t av1_filter_intra_taps[FILTER_INTRA_MODES][8][8];
+static const int16_t dr_intra_derivative[90] = {
+ // More evenly spread out angles and limited to 10-bit
+ // Values that are 0 will never be used
+ // Approx angle
+ 0, 0, 0, //
+ 1023, 0, 0, // 3, ...
+ 547, 0, 0, // 6, ...
+ 372, 0, 0, 0, 0, // 9, ...
+ 273, 0, 0, // 14, ...
+ 215, 0, 0, // 17, ...
+ 178, 0, 0, // 20, ...
+ 151, 0, 0, // 23, ... (113 & 203 are base angles)
+ 132, 0, 0, // 26, ...
+ 116, 0, 0, // 29, ...
+ 102, 0, 0, 0, // 32, ...
+ 90, 0, 0, // 36, ...
+ 80, 0, 0, // 39, ...
+ 71, 0, 0, // 42, ...
+ 64, 0, 0, // 45, ... (45 & 135 are base angles)
+ 57, 0, 0, // 48, ...
+ 51, 0, 0, // 51, ...
+ 45, 0, 0, 0, // 54, ...
+ 40, 0, 0, // 58, ...
+ 35, 0, 0, // 61, ...
+ 31, 0, 0, // 64, ...
+ 27, 0, 0, // 67, ... (67 & 157 are base angles)
+ 23, 0, 0, // 70, ...
+ 19, 0, 0, // 73, ...
+ 15, 0, 0, 0, 0, // 76, ...
+ 11, 0, 0, // 81, ...
+ 7, 0, 0, // 84, ...
+ 3, 0, 0, // 87, ...
+};
+
// Get the shift (up-scaled by 256) in X w.r.t a unit change in Y.
// If angle > 0 && angle < 90, dx = -((int)(256 / t));
// If angle > 90 && angle < 180, dx = (int)(256 / t);
@@ -110,7 +142,7 @@ static INLINE int av1_use_intra_edge_upsample(int bs0, int bs1, int delta,
int type) {
const int d = abs(delta);
const int blk_wh = bs0 + bs1;
- if (d <= 0 || d >= 40) return 0;
+ if (d == 0 || d >= 40) return 0;
return type ? (blk_wh <= 8) : (blk_wh <= 16);
}
#ifdef __cplusplus
diff --git a/media/libaom/src/av1/common/resize.c b/media/libaom/src/av1/common/resize.c
index d61a20aa2..98f28f7b5 100644
--- a/media/libaom/src/av1/common/resize.c
+++ b/media/libaom/src/av1/common/resize.c
@@ -313,6 +313,91 @@ static void interpolate_core(const uint8_t *const input, int in_length,
}
}
+static void interpolate_core_double_prec(const double *const input,
+ int in_length, double *output,
+ int out_length,
+ const int16_t *interp_filters,
+ int interp_taps) {
+ const int32_t delta =
+ (((uint32_t)in_length << RS_SCALE_SUBPEL_BITS) + out_length / 2) /
+ out_length;
+ const int32_t offset =
+ in_length > out_length
+ ? (((int32_t)(in_length - out_length) << (RS_SCALE_SUBPEL_BITS - 1)) +
+ out_length / 2) /
+ out_length
+ : -(((int32_t)(out_length - in_length)
+ << (RS_SCALE_SUBPEL_BITS - 1)) +
+ out_length / 2) /
+ out_length;
+ double *optr = output;
+ int x, x1, x2, k, int_pel, sub_pel;
+ double sum;
+ int32_t y;
+
+ x = 0;
+ y = offset + RS_SCALE_EXTRA_OFF;
+ while ((y >> RS_SCALE_SUBPEL_BITS) < (interp_taps / 2 - 1)) {
+ x++;
+ y += delta;
+ }
+ x1 = x;
+ x = out_length - 1;
+ y = delta * x + offset + RS_SCALE_EXTRA_OFF;
+ while ((y >> RS_SCALE_SUBPEL_BITS) + (int32_t)(interp_taps / 2) >=
+ in_length) {
+ x--;
+ y -= delta;
+ }
+ x2 = x;
+ if (x1 > x2) {
+ for (x = 0, y = offset + RS_SCALE_EXTRA_OFF; x < out_length;
+ ++x, y += delta) {
+ int_pel = y >> RS_SCALE_SUBPEL_BITS;
+ sub_pel = (y >> RS_SCALE_EXTRA_BITS) & RS_SUBPEL_MASK;
+ const int16_t *filter = &interp_filters[sub_pel * interp_taps];
+ sum = 0;
+ for (k = 0; k < interp_taps; ++k) {
+ const int pk = int_pel - interp_taps / 2 + 1 + k;
+ sum += filter[k] * input[AOMMAX(AOMMIN(pk, in_length - 1), 0)];
+ }
+ *optr++ = sum / (1 << FILTER_BITS);
+ }
+ } else {
+ // Initial part.
+ for (x = 0, y = offset + RS_SCALE_EXTRA_OFF; x < x1; ++x, y += delta) {
+ int_pel = y >> RS_SCALE_SUBPEL_BITS;
+ sub_pel = (y >> RS_SCALE_EXTRA_BITS) & RS_SUBPEL_MASK;
+ const int16_t *filter = &interp_filters[sub_pel * interp_taps];
+ sum = 0;
+ for (k = 0; k < interp_taps; ++k)
+ sum += filter[k] * input[AOMMAX(int_pel - interp_taps / 2 + 1 + k, 0)];
+ *optr++ = sum / (1 << FILTER_BITS);
+ }
+ // Middle part.
+ for (; x <= x2; ++x, y += delta) {
+ int_pel = y >> RS_SCALE_SUBPEL_BITS;
+ sub_pel = (y >> RS_SCALE_EXTRA_BITS) & RS_SUBPEL_MASK;
+ const int16_t *filter = &interp_filters[sub_pel * interp_taps];
+ sum = 0;
+ for (k = 0; k < interp_taps; ++k)
+ sum += filter[k] * input[int_pel - interp_taps / 2 + 1 + k];
+ *optr++ = sum / (1 << FILTER_BITS);
+ }
+ // End part.
+ for (; x < out_length; ++x, y += delta) {
+ int_pel = y >> RS_SCALE_SUBPEL_BITS;
+ sub_pel = (y >> RS_SCALE_EXTRA_BITS) & RS_SUBPEL_MASK;
+ const int16_t *filter = &interp_filters[sub_pel * interp_taps];
+ sum = 0;
+ for (k = 0; k < interp_taps; ++k)
+ sum += filter[k] *
+ input[AOMMIN(int_pel - interp_taps / 2 + 1 + k, in_length - 1)];
+ *optr++ = sum / (1 << FILTER_BITS);
+ }
+ }
+}
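
interpolate_core_double_prec steps a fixed-point source position by a constant delta per output sample and evaluates the selected filter phase at each stop. A heavily reduced standalone sketch of the same stepping with a 2-tap linear kernel (the 14-bit precision constant and the helper name are local assumptions, not libaom definitions):

    #include <stdio.h>

    #define SUBPEL_BITS 14
    #define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)

    static void upscale_linear(const double *in, int in_len, double *out,
                               int out_len) {
      /* Per-output step through the input, in 1/(1 << SUBPEL_BITS) units. */
      const int delta =
          (int)((((long long)in_len << SUBPEL_BITS) + out_len / 2) / out_len);
      int pos = 0;
      for (int x = 0; x < out_len; ++x, pos += delta) {
        const int int_pel = pos >> SUBPEL_BITS;
        const double frac = (double)(pos & SUBPEL_MASK) / (1 << SUBPEL_BITS);
        const int i0 = int_pel < in_len - 1 ? int_pel : in_len - 1;
        const int i1 = i0 + 1 < in_len ? i0 + 1 : in_len - 1;
        out[x] = (1.0 - frac) * in[i0] + frac * in[i1];
      }
    }

    int main(void) {
      const double in[4] = { 0.0, 1.0, 2.0, 3.0 };
      double out[8];
      upscale_linear(in, 4, out, 8);
      for (int i = 0; i < 8; ++i) printf("%.2f ", out[i]);
      printf("\n"); /* 0.00 0.50 1.00 1.50 2.00 2.50 3.00 3.00 */
      return 0;
    }
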
+
static void interpolate(const uint8_t *const input, int in_length,
uint8_t *output, int out_length) {
const InterpKernel *interp_filters =
@@ -322,6 +407,15 @@ static void interpolate(const uint8_t *const input, int in_length,
SUBPEL_TAPS);
}
+static void interpolate_double_prec(const double *const input, int in_length,
+ double *output, int out_length) {
+ const InterpKernel *interp_filters =
+ choose_interp_filter(in_length, out_length);
+
+ interpolate_core_double_prec(input, in_length, output, out_length,
+ &interp_filters[0][0], SUBPEL_TAPS);
+}
+
int32_t av1_get_upscale_convolve_step(int in_length, int out_length) {
return ((in_length << RS_SCALE_SUBPEL_BITS) + out_length / 2) / out_length;
}
@@ -337,7 +431,6 @@ static int32_t get_upscale_convolve_x0(int in_length, int out_length,
return (int32_t)((uint32_t)x0 & RS_SCALE_SUBPEL_MASK);
}
-#ifndef __clang_analyzer__
static void down2_symeven(const uint8_t *const input, int length,
uint8_t *output) {
// Actual filter len = 2 * filter_len_half.
@@ -392,7 +485,6 @@ static void down2_symeven(const uint8_t *const input, int length,
}
}
}
-#endif
static void down2_symodd(const uint8_t *const input, int length,
uint8_t *output) {
@@ -505,6 +597,12 @@ static void resize_multistep(const uint8_t *const input, int length,
}
}
+static void upscale_multistep_double_prec(const double *const input, int length,
+ double *output, int olength) {
+ assert(length < olength);
+ interpolate_double_prec(input, length, output, olength);
+}
+
static void fill_col_to_arr(uint8_t *img, int stride, int len, uint8_t *arr) {
int i;
uint8_t *iptr = img;
@@ -523,9 +621,29 @@ static void fill_arr_to_col(uint8_t *img, int stride, int len, uint8_t *arr) {
}
}
-static void resize_plane(const uint8_t *const input, int height, int width,
- int in_stride, uint8_t *output, int height2,
- int width2, int out_stride) {
+static void fill_col_to_arr_double_prec(double *img, int stride, int len,
+ double *arr) {
+ int i;
+ double *iptr = img;
+ double *aptr = arr;
+ for (i = 0; i < len; ++i, iptr += stride) {
+ *aptr++ = *iptr;
+ }
+}
+
+static void fill_arr_to_col_double_prec(double *img, int stride, int len,
+ double *arr) {
+ int i;
+ double *iptr = img;
+ double *aptr = arr;
+ for (i = 0; i < len; ++i, iptr += stride) {
+ *iptr = *aptr++;
+ }
+}
+
+void av1_resize_plane(const uint8_t *const input, int height, int width,
+ int in_stride, uint8_t *output, int height2, int width2,
+ int out_stride) {
int i;
uint8_t *intbuf = (uint8_t *)aom_malloc(sizeof(uint8_t) * width2 * height);
uint8_t *tmpbuf =
@@ -554,6 +672,33 @@ Error:
aom_free(arrbuf2);
}
+void av1_upscale_plane_double_prec(const double *const input, int height,
+ int width, int in_stride, double *output,
+ int height2, int width2, int out_stride) {
+ int i;
+ double *intbuf = (double *)aom_malloc(sizeof(double) * width2 * height);
+ double *arrbuf = (double *)aom_malloc(sizeof(double) * height);
+ double *arrbuf2 = (double *)aom_malloc(sizeof(double) * height2);
+ if (intbuf == NULL || arrbuf == NULL || arrbuf2 == NULL) goto Error;
+ assert(width > 0);
+ assert(height > 0);
+ assert(width2 > 0);
+ assert(height2 > 0);
+ for (i = 0; i < height; ++i)
+ upscale_multistep_double_prec(input + in_stride * i, width,
+ intbuf + width2 * i, width2);
+ for (i = 0; i < width2; ++i) {
+ fill_col_to_arr_double_prec(intbuf + i, width2, height, arrbuf);
+ upscale_multistep_double_prec(arrbuf, height, arrbuf2, height2);
+ fill_arr_to_col_double_prec(output + i, out_stride, height2, arrbuf2);
+ }
+
+Error:
+ aom_free(intbuf);
+ aom_free(arrbuf);
+ aom_free(arrbuf2);
+}
+
static void upscale_normative_rect(const uint8_t *const input, int height,
int width, int in_stride, uint8_t *output,
int height2, int width2, int out_stride,
@@ -613,6 +758,7 @@ static void upscale_normative_rect(const uint8_t *const input, int height,
}
}
+#if CONFIG_AV1_HIGHBITDEPTH
static void highbd_interpolate_core(const uint16_t *const input, int in_length,
uint16_t *output, int out_length, int bd,
const int16_t *interp_filters,
@@ -705,7 +851,6 @@ static void highbd_interpolate(const uint16_t *const input, int in_length,
&interp_filters[0][0], SUBPEL_TAPS);
}
-#ifndef __clang_analyzer__
static void highbd_down2_symeven(const uint16_t *const input, int length,
uint16_t *output, int bd) {
// Actual filter len = 2 * filter_len_half.
@@ -813,7 +958,6 @@ static void highbd_down2_symodd(const uint16_t *const input, int length,
}
}
}
-#endif
static void highbd_resize_multistep(const uint16_t *const input, int length,
uint16_t *output, int olength,
@@ -871,10 +1015,9 @@ static void highbd_fill_arr_to_col(uint16_t *img, int stride, int len,
}
}
-static void highbd_resize_plane(const uint8_t *const input, int height,
- int width, int in_stride, uint8_t *output,
- int height2, int width2, int out_stride,
- int bd) {
+void av1_highbd_resize_plane(const uint8_t *const input, int height, int width,
+ int in_stride, uint8_t *output, int height2,
+ int width2, int out_stride, int bd) {
int i;
uint16_t *intbuf = (uint16_t *)aom_malloc(sizeof(uint16_t) * width2 * height);
uint16_t *tmpbuf =
@@ -963,17 +1106,18 @@ static void highbd_upscale_normative_rect(const uint8_t *const input,
aom_free(tmp_right);
}
}
+#endif // CONFIG_AV1_HIGHBITDEPTH
void av1_resize_frame420(const uint8_t *const y, int y_stride,
const uint8_t *const u, const uint8_t *const v,
int uv_stride, int height, int width, uint8_t *oy,
int oy_stride, uint8_t *ou, uint8_t *ov,
int ouv_stride, int oheight, int owidth) {
- resize_plane(y, height, width, y_stride, oy, oheight, owidth, oy_stride);
- resize_plane(u, height / 2, width / 2, uv_stride, ou, oheight / 2, owidth / 2,
- ouv_stride);
- resize_plane(v, height / 2, width / 2, uv_stride, ov, oheight / 2, owidth / 2,
- ouv_stride);
+ av1_resize_plane(y, height, width, y_stride, oy, oheight, owidth, oy_stride);
+ av1_resize_plane(u, height / 2, width / 2, uv_stride, ou, oheight / 2,
+ owidth / 2, ouv_stride);
+ av1_resize_plane(v, height / 2, width / 2, uv_stride, ov, oheight / 2,
+ owidth / 2, ouv_stride);
}
void av1_resize_frame422(const uint8_t *const y, int y_stride,
@@ -981,11 +1125,11 @@ void av1_resize_frame422(const uint8_t *const y, int y_stride,
int uv_stride, int height, int width, uint8_t *oy,
int oy_stride, uint8_t *ou, uint8_t *ov,
int ouv_stride, int oheight, int owidth) {
- resize_plane(y, height, width, y_stride, oy, oheight, owidth, oy_stride);
- resize_plane(u, height, width / 2, uv_stride, ou, oheight, owidth / 2,
- ouv_stride);
- resize_plane(v, height, width / 2, uv_stride, ov, oheight, owidth / 2,
- ouv_stride);
+ av1_resize_plane(y, height, width, y_stride, oy, oheight, owidth, oy_stride);
+ av1_resize_plane(u, height, width / 2, uv_stride, ou, oheight, owidth / 2,
+ ouv_stride);
+ av1_resize_plane(v, height, width / 2, uv_stride, ov, oheight, owidth / 2,
+ ouv_stride);
}
void av1_resize_frame444(const uint8_t *const y, int y_stride,
@@ -993,23 +1137,26 @@ void av1_resize_frame444(const uint8_t *const y, int y_stride,
int uv_stride, int height, int width, uint8_t *oy,
int oy_stride, uint8_t *ou, uint8_t *ov,
int ouv_stride, int oheight, int owidth) {
- resize_plane(y, height, width, y_stride, oy, oheight, owidth, oy_stride);
- resize_plane(u, height, width, uv_stride, ou, oheight, owidth, ouv_stride);
- resize_plane(v, height, width, uv_stride, ov, oheight, owidth, ouv_stride);
+ av1_resize_plane(y, height, width, y_stride, oy, oheight, owidth, oy_stride);
+ av1_resize_plane(u, height, width, uv_stride, ou, oheight, owidth,
+ ouv_stride);
+ av1_resize_plane(v, height, width, uv_stride, ov, oheight, owidth,
+ ouv_stride);
}
+#if CONFIG_AV1_HIGHBITDEPTH
void av1_highbd_resize_frame420(const uint8_t *const y, int y_stride,
const uint8_t *const u, const uint8_t *const v,
int uv_stride, int height, int width,
uint8_t *oy, int oy_stride, uint8_t *ou,
uint8_t *ov, int ouv_stride, int oheight,
int owidth, int bd) {
- highbd_resize_plane(y, height, width, y_stride, oy, oheight, owidth,
- oy_stride, bd);
- highbd_resize_plane(u, height / 2, width / 2, uv_stride, ou, oheight / 2,
- owidth / 2, ouv_stride, bd);
- highbd_resize_plane(v, height / 2, width / 2, uv_stride, ov, oheight / 2,
- owidth / 2, ouv_stride, bd);
+ av1_highbd_resize_plane(y, height, width, y_stride, oy, oheight, owidth,
+ oy_stride, bd);
+ av1_highbd_resize_plane(u, height / 2, width / 2, uv_stride, ou, oheight / 2,
+ owidth / 2, ouv_stride, bd);
+ av1_highbd_resize_plane(v, height / 2, width / 2, uv_stride, ov, oheight / 2,
+ owidth / 2, ouv_stride, bd);
}
void av1_highbd_resize_frame422(const uint8_t *const y, int y_stride,
@@ -1018,12 +1165,12 @@ void av1_highbd_resize_frame422(const uint8_t *const y, int y_stride,
uint8_t *oy, int oy_stride, uint8_t *ou,
uint8_t *ov, int ouv_stride, int oheight,
int owidth, int bd) {
- highbd_resize_plane(y, height, width, y_stride, oy, oheight, owidth,
- oy_stride, bd);
- highbd_resize_plane(u, height, width / 2, uv_stride, ou, oheight, owidth / 2,
- ouv_stride, bd);
- highbd_resize_plane(v, height, width / 2, uv_stride, ov, oheight, owidth / 2,
- ouv_stride, bd);
+ av1_highbd_resize_plane(y, height, width, y_stride, oy, oheight, owidth,
+ oy_stride, bd);
+ av1_highbd_resize_plane(u, height, width / 2, uv_stride, ou, oheight,
+ owidth / 2, ouv_stride, bd);
+ av1_highbd_resize_plane(v, height, width / 2, uv_stride, ov, oheight,
+ owidth / 2, ouv_stride, bd);
}
void av1_highbd_resize_frame444(const uint8_t *const y, int y_stride,
@@ -1032,13 +1179,14 @@ void av1_highbd_resize_frame444(const uint8_t *const y, int y_stride,
uint8_t *oy, int oy_stride, uint8_t *ou,
uint8_t *ov, int ouv_stride, int oheight,
int owidth, int bd) {
- highbd_resize_plane(y, height, width, y_stride, oy, oheight, owidth,
- oy_stride, bd);
- highbd_resize_plane(u, height, width, uv_stride, ou, oheight, owidth,
- ouv_stride, bd);
- highbd_resize_plane(v, height, width, uv_stride, ov, oheight, owidth,
- ouv_stride, bd);
+ av1_highbd_resize_plane(y, height, width, y_stride, oy, oheight, owidth,
+ oy_stride, bd);
+ av1_highbd_resize_plane(u, height, width, uv_stride, ou, oheight, owidth,
+ ouv_stride, bd);
+ av1_highbd_resize_plane(v, height, width, uv_stride, ov, oheight, owidth,
+ ouv_stride, bd);
}
+#endif // CONFIG_AV1_HIGHBITDEPTH
void av1_resize_and_extend_frame(const YV12_BUFFER_CONFIG *src,
YV12_BUFFER_CONFIG *dst, int bd,
@@ -1049,16 +1197,24 @@ void av1_resize_and_extend_frame(const YV12_BUFFER_CONFIG *src,
// the static analysis warnings.
for (int i = 0; i < AOMMIN(num_planes, MAX_MB_PLANE); ++i) {
const int is_uv = i > 0;
+#if CONFIG_AV1_HIGHBITDEPTH
if (src->flags & YV12_FLAG_HIGHBITDEPTH)
- highbd_resize_plane(src->buffers[i], src->crop_heights[is_uv],
- src->crop_widths[is_uv], src->strides[is_uv],
- dst->buffers[i], dst->crop_heights[is_uv],
- dst->crop_widths[is_uv], dst->strides[is_uv], bd);
+ av1_highbd_resize_plane(src->buffers[i], src->crop_heights[is_uv],
+ src->crop_widths[is_uv], src->strides[is_uv],
+ dst->buffers[i], dst->crop_heights[is_uv],
+ dst->crop_widths[is_uv], dst->strides[is_uv], bd);
else
- resize_plane(src->buffers[i], src->crop_heights[is_uv],
- src->crop_widths[is_uv], src->strides[is_uv],
- dst->buffers[i], dst->crop_heights[is_uv],
- dst->crop_widths[is_uv], dst->strides[is_uv]);
+ av1_resize_plane(src->buffers[i], src->crop_heights[is_uv],
+ src->crop_widths[is_uv], src->strides[is_uv],
+ dst->buffers[i], dst->crop_heights[is_uv],
+ dst->crop_widths[is_uv], dst->strides[is_uv]);
+#else
+ (void)bd;
+ av1_resize_plane(src->buffers[i], src->crop_heights[is_uv],
+ src->crop_widths[is_uv], src->strides[is_uv],
+ dst->buffers[i], dst->crop_heights[is_uv],
+ dst->crop_widths[is_uv], dst->strides[is_uv]);
+#endif
}
aom_extend_frame_borders(dst, num_planes);
}
@@ -1079,7 +1235,7 @@ void av1_upscale_normative_rows(const AV1_COMMON *cm, const uint8_t *src,
int32_t x0_qn = get_upscale_convolve_x0(downscaled_plane_width,
upscaled_plane_width, x_step_qn);
- for (int j = 0; j < cm->tile_cols; j++) {
+ for (int j = 0; j < cm->tiles.cols; j++) {
av1_tile_set_col(&tile_col, cm, j);
// Determine the limits of this tile column in both the source
// and destination images.
@@ -1092,7 +1248,7 @@ void av1_upscale_normative_rows(const AV1_COMMON *cm, const uint8_t *src,
const int upscaled_x0 = (downscaled_x0 * superres_denom) / SCALE_NUMERATOR;
int upscaled_x1;
- if (j == cm->tile_cols - 1) {
+ if (j == cm->tiles.cols - 1) {
// Note that we can't just use AOMMIN here - due to rounding,
// (downscaled_x1 * superres_denom) / SCALE_NUMERATOR may be less than
// upscaled_plane_width.
@@ -1106,8 +1262,9 @@ void av1_upscale_normative_rows(const AV1_COMMON *cm, const uint8_t *src,
const int dst_width = upscaled_x1 - upscaled_x0;
const int pad_left = (j == 0);
- const int pad_right = (j == cm->tile_cols - 1);
+ const int pad_right = (j == cm->tiles.cols - 1);
+#if CONFIG_AV1_HIGHBITDEPTH
if (cm->seq_params.use_highbitdepth)
highbd_upscale_normative_rect(src_ptr, rows, src_width, src_stride,
dst_ptr, rows, dst_width, dst_stride,
@@ -1117,7 +1274,11 @@ void av1_upscale_normative_rows(const AV1_COMMON *cm, const uint8_t *src,
upscale_normative_rect(src_ptr, rows, src_width, src_stride, dst_ptr,
rows, dst_width, dst_stride, x_step_qn, x0_qn,
pad_left, pad_right);
-
+#else
+ upscale_normative_rect(src_ptr, rows, src_width, src_stride, dst_ptr, rows,
+ dst_width, dst_stride, x_step_qn, x0_qn, pad_left,
+ pad_right);
+#endif
// Update the fractional pixel offset to prepare for the next tile column.
x0_qn += (dst_width * x_step_qn) - (src_width << RS_SCALE_SUBPEL_BITS);
}
@@ -1155,10 +1316,19 @@ YV12_BUFFER_CONFIG *av1_scale_if_required(AV1_COMMON *cm,
// denominator.
static void calculate_scaled_size_helper(int *dim, int denom) {
if (denom != SCALE_NUMERATOR) {
+ // We need to ensure the constraint in "Appendix A" of the spec:
+ // * FrameWidth is greater than or equal to 16
+ // * FrameHeight is greater than or equal to 16
+ // For this, we clamp the downscaled dimension to at least 16. One
+ // exception: if original dimension itself was < 16, then we keep the
+ // downscaled dimension to be same as the original, to ensure that resizing
+ // is valid.
+ const int min_dim = AOMMIN(16, *dim);
// Use this version if we need *dim to be even
// *width = (*width * SCALE_NUMERATOR + denom) / (2 * denom);
// *width <<= 1;
*dim = (*dim * SCALE_NUMERATOR + denom / 2) / (denom);
+ *dim = AOMMAX(*dim, min_dim);
}
}
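
The clamp added above keeps any downscaled dimension at or above the 16-pixel floor required by the spec, except when the source dimension is already below 16, in which case it is kept unchanged. A standalone sketch of the rounding plus clamp (SCALE_NUMERATOR is defined locally; 8 matches libaom's resize code, but treat the exact value here as an assumption):

    #include <stdio.h>

    #define SCALE_NUMERATOR 8 /* assumed; see av1/common/resize.h */

    static int scaled_dim(int dim, int denom) {
      if (denom == SCALE_NUMERATOR) return dim;   /* no scaling requested */
      const int min_dim = dim < 16 ? dim : 16;    /* AOMMIN(16, dim) */
      const int scaled = (dim * SCALE_NUMERATOR + denom / 2) / denom;
      return scaled > min_dim ? scaled : min_dim; /* AOMMAX(scaled, min_dim) */
    }

    int main(void) {
      printf("%d\n", scaled_dim(1920, 16)); /* 960: normal half-size scaling */
      printf("%d\n", scaled_dim(24, 16));   /* 16: clamped up to the floor   */
      printf("%d\n", scaled_dim(12, 16));   /* 12: already < 16, kept as-is  */
      return 0;
    }
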
@@ -1201,17 +1371,18 @@ void av1_superres_upscale(AV1_COMMON *cm, BufferPool *const pool) {
const int num_planes = av1_num_planes(cm);
if (!av1_superres_scaled(cm)) return;
const SequenceHeader *const seq_params = &cm->seq_params;
+ const int byte_alignment = cm->features.byte_alignment;
YV12_BUFFER_CONFIG copy_buffer;
memset(&copy_buffer, 0, sizeof(copy_buffer));
- YV12_BUFFER_CONFIG *const frame_to_show = get_frame_new_buffer(cm);
+ YV12_BUFFER_CONFIG *const frame_to_show = &cm->cur_frame->buf;
const int aligned_width = ALIGN_POWER_OF_TWO(cm->width, 3);
if (aom_alloc_frame_buffer(
&copy_buffer, aligned_width, cm->height, seq_params->subsampling_x,
seq_params->subsampling_y, seq_params->use_highbitdepth,
- AOM_BORDER_IN_PIXELS, cm->byte_alignment))
+ AOM_BORDER_IN_PIXELS, byte_alignment))
aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
"Failed to allocate copy buffer for superres upscaling");
@@ -1225,27 +1396,31 @@ void av1_superres_upscale(AV1_COMMON *cm, BufferPool *const pool) {
// Realloc the current frame buffer at a higher resolution in place.
if (pool != NULL) {
// Use callbacks if on the decoder.
- aom_codec_frame_buffer_t *fb =
- &pool->frame_bufs[cm->new_fb_idx].raw_frame_buffer;
+ aom_codec_frame_buffer_t *fb = &cm->cur_frame->raw_frame_buffer;
aom_release_frame_buffer_cb_fn_t release_fb_cb = pool->release_fb_cb;
aom_get_frame_buffer_cb_fn_t cb = pool->get_fb_cb;
void *cb_priv = pool->cb_priv;
+ lock_buffer_pool(pool);
// Realloc with callback does not release the frame buffer - release first.
- if (release_fb_cb(cb_priv, fb))
+ if (release_fb_cb(cb_priv, fb)) {
+ unlock_buffer_pool(pool);
aom_internal_error(
&cm->error, AOM_CODEC_MEM_ERROR,
"Failed to free current frame buffer before superres upscaling");
-
+ }
// aom_realloc_frame_buffer() leaves config data for frame_to_show intact
if (aom_realloc_frame_buffer(
frame_to_show, cm->superres_upscaled_width,
cm->superres_upscaled_height, seq_params->subsampling_x,
seq_params->subsampling_y, seq_params->use_highbitdepth,
- AOM_BORDER_IN_PIXELS, cm->byte_alignment, fb, cb, cb_priv))
+ AOM_BORDER_IN_PIXELS, byte_alignment, fb, cb, cb_priv)) {
+ unlock_buffer_pool(pool);
aom_internal_error(
&cm->error, AOM_CODEC_MEM_ERROR,
"Failed to allocate current frame buffer for superres upscaling");
+ }
+ unlock_buffer_pool(pool);
} else {
// Make a copy of the config data for frame_to_show in copy_buffer
copy_buffer_config(frame_to_show, &copy_buffer);
@@ -1256,7 +1431,7 @@ void av1_superres_upscale(AV1_COMMON *cm, BufferPool *const pool) {
frame_to_show, cm->superres_upscaled_width,
cm->superres_upscaled_height, seq_params->subsampling_x,
seq_params->subsampling_y, seq_params->use_highbitdepth,
- AOM_BORDER_IN_PIXELS, cm->byte_alignment))
+ AOM_BORDER_IN_PIXELS, byte_alignment))
aom_internal_error(
&cm->error, AOM_CODEC_MEM_ERROR,
"Failed to reallocate current frame buffer for superres upscaling");
diff --git a/media/libaom/src/av1/common/resize.h b/media/libaom/src/av1/common/resize.h
index 9a59a8d63..8ee859e5c 100644
--- a/media/libaom/src/av1/common/resize.h
+++ b/media/libaom/src/av1/common/resize.h
@@ -14,7 +14,7 @@
#include <stdio.h>
#include "aom/aom_integer.h"
-#include "av1/common/onyxc_int.h"
+#include "av1/common/av1_common_int.h"
#ifdef __cplusplus
extern "C" {
@@ -23,6 +23,9 @@ extern "C" {
void av1_resize_plane(const uint8_t *const input, int height, int width,
int in_stride, uint8_t *output, int height2, int width2,
int out_stride);
+void av1_upscale_plane_double_prec(const double *const input, int height,
+ int width, int in_stride, double *output,
+ int height2, int width2, int out_stride);
void av1_resize_frame420(const uint8_t *const y, int y_stride,
const uint8_t *const u, const uint8_t *const v,
int uv_stride, int height, int width, uint8_t *oy,
diff --git a/media/libaom/src/av1/common/restoration.c b/media/libaom/src/av1/common/restoration.c
index d276a915b..a0f37ad63 100644
--- a/media/libaom/src/av1/common/restoration.c
+++ b/media/libaom/src/av1/common/restoration.c
@@ -17,7 +17,7 @@
#include "config/aom_scale_rtcd.h"
#include "aom_mem/aom_mem.h"
-#include "av1/common/onyxc_int.h"
+#include "av1/common/av1_common_int.h"
#include "av1/common/resize.h"
#include "av1/common/restoration.h"
#include "aom_dsp/aom_dsp_common.h"
@@ -28,7 +28,7 @@
// The 's' values are calculated based on the original 'r' and 'e' values in
// the spec using GenSgrprojVtable().
// Note: Setting r = 0 skips the filter; the corresponding s is -1 (invalid).
-const sgr_params_type sgr_params[SGRPROJ_PARAMS] = {
+const sgr_params_type av1_sgr_params[SGRPROJ_PARAMS] = {
{ { 2, 1 }, { 140, 3236 } }, { { 2, 1 }, { 112, 2158 } },
{ { 2, 1 }, { 93, 1618 } }, { { 2, 1 }, { 80, 1438 } },
{ { 2, 1 }, { 70, 1295 } }, { { 2, 1 }, { 58, 1177 } },
@@ -111,7 +111,7 @@ int sgrproj_mtable[SGRPROJ_PARAMS][2];
static void GenSgrprojVtable() {
for (int i = 0; i < SGRPROJ_PARAMS; ++i) {
- const sgr_params_type *const params = &sgr_params[i];
+ const sgr_params_type *const params = &av1_sgr_params[i];
for (int j = 0; j < 2; ++j) {
const int e = params->e[j];
const int r = params->r[j];
@@ -153,6 +153,7 @@ static void extend_frame_lowbd(uint8_t *data, int width, int height, int stride,
}
}
+#if CONFIG_AV1_HIGHBITDEPTH
static void extend_frame_highbd(uint16_t *data, int width, int height,
int stride, int border_horz, int border_vert) {
uint16_t *data_p;
@@ -173,13 +174,24 @@ static void extend_frame_highbd(uint16_t *data, int width, int height,
}
}
-void extend_frame(uint8_t *data, int width, int height, int stride,
- int border_horz, int border_vert, int highbd) {
- if (highbd)
+static void copy_tile_highbd(int width, int height, const uint16_t *src,
+ int src_stride, uint16_t *dst, int dst_stride) {
+ for (int i = 0; i < height; ++i)
+ memcpy(dst + i * dst_stride, src + i * src_stride, width * sizeof(*dst));
+}
+#endif
+
+void av1_extend_frame(uint8_t *data, int width, int height, int stride,
+ int border_horz, int border_vert, int highbd) {
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (highbd) {
extend_frame_highbd(CONVERT_TO_SHORTPTR(data), width, height, stride,
border_horz, border_vert);
- else
- extend_frame_lowbd(data, width, height, stride, border_horz, border_vert);
+ return;
+ }
+#endif
+ (void)highbd;
+ extend_frame_lowbd(data, width, height, stride, border_horz, border_vert);
}
static void copy_tile_lowbd(int width, int height, const uint8_t *src,
@@ -188,19 +200,17 @@ static void copy_tile_lowbd(int width, int height, const uint8_t *src,
memcpy(dst + i * dst_stride, src + i * src_stride, width);
}
-static void copy_tile_highbd(int width, int height, const uint16_t *src,
- int src_stride, uint16_t *dst, int dst_stride) {
- for (int i = 0; i < height; ++i)
- memcpy(dst + i * dst_stride, src + i * src_stride, width * sizeof(*dst));
-}
-
static void copy_tile(int width, int height, const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride, int highbd) {
- if (highbd)
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (highbd) {
copy_tile_highbd(width, height, CONVERT_TO_SHORTPTR(src), src_stride,
CONVERT_TO_SHORTPTR(dst), dst_stride);
- else
- copy_tile_lowbd(width, height, src, src_stride, dst, dst_stride);
+ return;
+ }
+#endif
+ (void)highbd;
+ copy_tile_lowbd(width, height, src, src_stride, dst, dst_stride);
}
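
In both av1_extend_frame() and copy_tile() above, the `(void)highbd;` cast exists only to silence the unused-parameter warning when CONFIG_AV1_HIGHBITDEPTH is 0 and the early-return high-bitdepth branch is compiled out; the low-bitdepth path is otherwise unchanged.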
#define REAL_PTR(hbd, d) ((hbd) ? (uint8_t *)CONVERT_TO_SHORTPTR(d) : (d))
@@ -212,11 +222,10 @@ static void copy_tile(int width, int height, const uint8_t *src, int src_stride,
// rules:
//
// * At a frame boundary, we copy the outermost row of CDEF pixels three times.
-// This extension is done by a call to extend_frame() at the start of the loop
-// restoration process, so the value of copy_above/copy_below doesn't strictly
-// matter.
-// However, by setting *copy_above = *copy_below = 1 whenever loop filtering
-// across tiles is disabled, we can allow
+// This extension is done by a call to av1_extend_frame() at the start of the
+// loop restoration process, so the value of copy_above/copy_below doesn't
+// strictly matter. However, by setting *copy_above = *copy_below = 1 whenever
+// loop filtering across tiles is disabled, we can allow
// {setup,restore}_processing_stripe_boundary to assume that the top/bottom
// data has always been copied, simplifying the behaviour at the left and
// right edges of tiles.
@@ -620,7 +629,7 @@ static void boxsum(int32_t *src, int width, int height, int src_stride, int r,
assert(0 && "Invalid value of r in self-guided filter");
}
-void decode_xq(const int *xqd, int *xq, const sgr_params_type *params) {
+void av1_decode_xq(const int *xqd, int *xq, const sgr_params_type *params) {
if (params->r[0] == 0) {
xq[0] = 0;
xq[1] = (1 << SGRPROJ_PRJ_BITS) - xqd[1];
@@ -633,7 +642,7 @@ void decode_xq(const int *xqd, int *xq, const sgr_params_type *params) {
}
}
-const int32_t x_by_xplus1[256] = {
+const int32_t av1_x_by_xplus1[256] = {
// Special case: Map 0 -> 1 (corresponding to a value of 1/256)
// instead of 0. See comments in selfguided_restoration_internal() for why
1, 128, 171, 192, 205, 213, 219, 224, 228, 230, 233, 235, 236, 238, 239,
@@ -656,7 +665,7 @@ const int32_t x_by_xplus1[256] = {
256,
};
-const int32_t one_by_x[MAX_NELEM] = {
+const int32_t av1_one_by_x[MAX_NELEM] = {
4096, 2048, 1365, 1024, 819, 683, 585, 512, 455, 410, 372, 341, 315,
293, 273, 256, 241, 228, 216, 205, 195, 186, 178, 171, 164,
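
For orientation, both renamed tables encode simple reciprocals in fixed point: av1_x_by_xplus1 approximates z / (z + 1) in Q8 (e.g. entry 1 is 128 and entry 3 is 192), with the z == 0 entry forced to 1 as the comment notes, and av1_one_by_x[n - 1] is round(4096 / n), i.e. 1 / n in Q12. A minimal sketch that reproduces the av1_one_by_x values under that assumption (MAX_NELEM is assumed to be 25, the largest (2r + 1)^2 window):

  #include <stdio.h>

  int main(void) {
    /* 1/n in Q12, rounded to nearest: 4096, 2048, 1365, 1024, 819, ... */
    for (int n = 1; n <= 25; ++n) {
      printf("%d%c", (4096 + n / 2) / n, (n == 25) ? '\n' : ' ');
    }
    return 0;
  }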
};
@@ -665,7 +674,7 @@ static void calculate_intermediate_result(int32_t *dgd, int width, int height,
int dgd_stride, int bit_depth,
int sgr_params_idx, int radius_idx,
int pass, int32_t *A, int32_t *B) {
- const sgr_params_type *const params = &sgr_params[sgr_params_idx];
+ const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
const int r = params->r[radius_idx];
const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
@@ -726,7 +735,7 @@ static void calculate_intermediate_result(int32_t *dgd, int width, int height,
// Further, in the calculation of B[k] below, if z == 0 and r == 2,
// then A[k] "should be" 0. But then we can end up setting B[k] to a value
// slightly above 2^(8 + bit depth), due to rounding in the value of
- // one_by_x[25-1].
+ // av1_one_by_x[25-1].
//
// Thus we saturate so that, when z == 0, A[k] is set to 1 instead of 0.
// This fixes the above issues (256 - A[k] fits in a uint8, and we can't
@@ -738,17 +747,17 @@ static void calculate_intermediate_result(int32_t *dgd, int width, int height,
// would be a bad idea, as that corresponds to the case where the image
// is very variable, when we want to preserve the local pixel value as
// much as possible.
- A[k] = x_by_xplus1[AOMMIN(z, 255)]; // in range [1, 256]
+ A[k] = av1_x_by_xplus1[AOMMIN(z, 255)]; // in range [1, 256]
// SGRPROJ_SGR - A[k] < 2^8 (from above), B[k] < 2^(bit_depth) * n,
- // one_by_x[n - 1] = round(2^12 / n)
+ // av1_one_by_x[n - 1] = round(2^12 / n)
// => the product here is < 2^(20 + bit_depth) <= 2^32,
// and B[k] is set to a value < 2^(8 + bit depth)
- // This holds even with the rounding in one_by_x and in the overall
+ // This holds even with the rounding in av1_one_by_x and in the overall
// result, as long as SGRPROJ_SGR - A[k] is strictly less than 2^8.
B[k] = (int32_t)ROUND_POWER_OF_TWO((uint32_t)(SGRPROJ_SGR - A[k]) *
(uint32_t)B[k] *
- (uint32_t)one_by_x[n - 1],
+ (uint32_t)av1_one_by_x[n - 1],
SGRPROJ_RECIP_BITS);
}
}
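
Spelling out the bound sketched in the comments above (with SGRPROJ_RECIP_BITS assumed to be 12, matching the 2^12 scaling of av1_one_by_x): SGRPROJ_SGR - A[k] < 2^8, B[k] < 2^bit_depth * n before this step, and av1_one_by_x[n - 1] is roughly 2^12 / n, so the product is below 2^8 * (2^bit_depth * n) * (2^12 / n) = 2^(20 + bit_depth), which fits in 32 bits for bit_depth <= 12; after ROUND_POWER_OF_TWO by 12 bits, the stored B[k] stays below 2^(8 + bit_depth).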
@@ -757,7 +766,7 @@ static void calculate_intermediate_result(int32_t *dgd, int width, int height,
static void selfguided_restoration_fast_internal(
int32_t *dgd, int width, int height, int dgd_stride, int32_t *dst,
int dst_stride, int bit_depth, int sgr_params_idx, int radius_idx) {
- const sgr_params_type *const params = &sgr_params[sgr_params_idx];
+ const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
const int r = params->r[radius_idx];
const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
// Adjusting the stride of A and B here appears to avoid bad cache effects,
@@ -883,7 +892,7 @@ int av1_selfguided_restoration_c(const uint8_t *dgd8, int width, int height,
}
}
- const sgr_params_type *const params = &sgr_params[sgr_params_idx];
+ const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
// If params->r == 0 we skip the corresponding filter. We only allow one of
// the radii to be 0, as having both equal to 0 would be equivalent to
// skipping SGR entirely.
@@ -899,11 +908,11 @@ int av1_selfguided_restoration_c(const uint8_t *dgd8, int width, int height,
return 0;
}
-void apply_selfguided_restoration_c(const uint8_t *dat8, int width, int height,
- int stride, int eps, const int *xqd,
- uint8_t *dst8, int dst_stride,
- int32_t *tmpbuf, int bit_depth,
- int highbd) {
+void av1_apply_selfguided_restoration_c(const uint8_t *dat8, int width,
+ int height, int stride, int eps,
+ const int *xqd, uint8_t *dst8,
+ int dst_stride, int32_t *tmpbuf,
+ int bit_depth, int highbd) {
int32_t *flt0 = tmpbuf;
int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX;
assert(width * height <= RESTORATION_UNITPELS_MAX);
@@ -912,9 +921,9 @@ void apply_selfguided_restoration_c(const uint8_t *dat8, int width, int height,
dat8, width, height, stride, flt0, flt1, width, eps, bit_depth, highbd);
(void)ret;
assert(!ret);
- const sgr_params_type *const params = &sgr_params[eps];
+ const sgr_params_type *const params = &av1_sgr_params[eps];
int xq[2];
- decode_xq(xqd, xq, params);
+ av1_decode_xq(xqd, xq, params);
for (int i = 0; i < height; ++i) {
for (int j = 0; j < width; ++j) {
const int k = i * width + j;
@@ -950,12 +959,13 @@ static void sgrproj_filter_stripe(const RestorationUnitInfo *rui,
for (int j = 0; j < stripe_width; j += procunit_width) {
int w = AOMMIN(procunit_width, stripe_width - j);
- apply_selfguided_restoration(src + j, w, stripe_height, src_stride,
- rui->sgrproj_info.ep, rui->sgrproj_info.xqd,
- dst + j, dst_stride, tmpbuf, bit_depth, 0);
+ av1_apply_selfguided_restoration(
+ src + j, w, stripe_height, src_stride, rui->sgrproj_info.ep,
+ rui->sgrproj_info.xqd, dst + j, dst_stride, tmpbuf, bit_depth, 0);
}
}
+#if CONFIG_AV1_HIGHBITDEPTH
static void wiener_filter_stripe_highbd(const RestorationUnitInfo *rui,
int stripe_width, int stripe_height,
int procunit_width, const uint8_t *src8,
@@ -984,11 +994,12 @@ static void sgrproj_filter_stripe_highbd(const RestorationUnitInfo *rui,
int32_t *tmpbuf, int bit_depth) {
for (int j = 0; j < stripe_width; j += procunit_width) {
int w = AOMMIN(procunit_width, stripe_width - j);
- apply_selfguided_restoration(src8 + j, w, stripe_height, src_stride,
- rui->sgrproj_info.ep, rui->sgrproj_info.xqd,
- dst8 + j, dst_stride, tmpbuf, bit_depth, 1);
+ av1_apply_selfguided_restoration(
+ src8 + j, w, stripe_height, src_stride, rui->sgrproj_info.ep,
+ rui->sgrproj_info.xqd, dst8 + j, dst_stride, tmpbuf, bit_depth, 1);
}
}
+#endif // CONFIG_AV1_HIGHBITDEPTH
typedef void (*stripe_filter_fun)(const RestorationUnitInfo *rui,
int stripe_width, int stripe_height,
@@ -996,12 +1007,18 @@ typedef void (*stripe_filter_fun)(const RestorationUnitInfo *rui,
int src_stride, uint8_t *dst, int dst_stride,
int32_t *tmpbuf, int bit_depth);
+#if CONFIG_AV1_HIGHBITDEPTH
#define NUM_STRIPE_FILTERS 4
-
static const stripe_filter_fun stripe_filters[NUM_STRIPE_FILTERS] = {
wiener_filter_stripe, sgrproj_filter_stripe, wiener_filter_stripe_highbd,
sgrproj_filter_stripe_highbd
};
+#else
+#define NUM_STRIPE_FILTERS 2
+static const stripe_filter_fun stripe_filters[NUM_STRIPE_FILTERS] = {
+ wiener_filter_stripe, sgrproj_filter_stripe
+};
+#endif // CONFIG_AV1_HIGHBITDEPTH
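
The ordering of the table above suggests the filter for a restoration unit is chosen with a flat index of the form 2 * highbd + (restoration type is SGRPROJ); a hedged sketch of that selection (the helper name is illustrative, and in the low-bitdepth-only build highbd must be 0 since the two highbd entries are compiled out):

  /* Sketch only; assumes stripe_filter_fun, stripe_filters and
   * NUM_STRIPE_FILTERS from this file are in scope. */
  #include <assert.h>

  static stripe_filter_fun pick_stripe_filter(int is_sgrproj, int highbd) {
    const int filter_idx = 2 * highbd + is_sgrproj;
    assert(filter_idx < NUM_STRIPE_FILTERS);
    return stripe_filters[filter_idx];
  }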
// Filter one restoration unit
void av1_loop_restoration_filter_unit(
@@ -1072,13 +1089,6 @@ void av1_loop_restoration_filter_unit(
}
}
-static void filter_frame_on_tile(int tile_row, int tile_col, void *priv,
- AV1_COMMON *cm) {
- (void)tile_col;
- FilterFrameCtxt *ctxt = (FilterFrameCtxt *)priv;
- ctxt->tile_stripe0 = (tile_row == 0) ? 0 : cm->rst_end_stripe[tile_row - 1];
-}
-
static void filter_frame_on_unit(const RestorationTileLimits *limits,
const AV1PixelRect *tile_rect,
int rest_unit_idx, void *priv, int32_t *tmpbuf,
@@ -1106,8 +1116,8 @@ void av1_loop_restoration_filter_frame_init(AV1LrStruct *lr_ctxt,
const int frame_height = frame->crop_heights[0];
if (aom_realloc_frame_buffer(
lr_ctxt->dst, frame_width, frame_height, seq_params->subsampling_x,
- seq_params->subsampling_y, highbd, AOM_BORDER_IN_PIXELS,
- cm->byte_alignment, NULL, NULL, NULL) < 0)
+ seq_params->subsampling_y, highbd, AOM_RESTORATION_FRAME_BORDER,
+ cm->features.byte_alignment, NULL, NULL, NULL) < 0)
aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
"Failed to allocate restoration dst buffer");
@@ -1127,9 +1137,9 @@ void av1_loop_restoration_filter_frame_init(AV1LrStruct *lr_ctxt,
const int plane_height = frame->crop_heights[is_uv];
FilterFrameCtxt *lr_plane_ctxt = &lr_ctxt->ctxt[plane];
- extend_frame(frame->buffers[plane], plane_width, plane_height,
- frame->strides[is_uv], RESTORATION_BORDER, RESTORATION_BORDER,
- highbd);
+ av1_extend_frame(frame->buffers[plane], plane_width, plane_height,
+ frame->strides[is_uv], RESTORATION_BORDER,
+ RESTORATION_BORDER, highbd);
lr_plane_ctxt->rsi = rsi;
lr_plane_ctxt->ss_x = is_uv && seq_params->subsampling_x;
@@ -1141,7 +1151,7 @@ void av1_loop_restoration_filter_frame_init(AV1LrStruct *lr_ctxt,
lr_plane_ctxt->data_stride = frame->strides[is_uv];
lr_plane_ctxt->dst_stride = lr_ctxt->dst->strides[is_uv];
lr_plane_ctxt->tile_rect = av1_whole_frame_rect(cm, is_uv);
- filter_frame_on_tile(LR_TILE_ROW, LR_TILE_COL, lr_plane_ctxt, cm);
+ lr_plane_ctxt->tile_stripe0 = 0;
}
}
@@ -1150,10 +1160,10 @@ void av1_loop_restoration_copy_planes(AV1LrStruct *loop_rest_ctxt,
typedef void (*copy_fun)(const YV12_BUFFER_CONFIG *src_ybc,
YV12_BUFFER_CONFIG *dst_ybc, int hstart, int hend,
int vstart, int vend);
- static const copy_fun copy_funs[3] = {
- aom_yv12_partial_copy_y, aom_yv12_partial_copy_u, aom_yv12_partial_copy_v
- };
-
+ static const copy_fun copy_funs[3] = { aom_yv12_partial_coloc_copy_y,
+ aom_yv12_partial_coloc_copy_u,
+ aom_yv12_partial_coloc_copy_v };
+ assert(num_planes <= 3);
for (int plane = 0; plane < num_planes; ++plane) {
if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) continue;
AV1PixelRect tile_rect = loop_rest_ctxt->ctxt[plane].tile_rect;
@@ -1180,7 +1190,7 @@ static void foreach_rest_unit_in_planes(AV1LrStruct *lr_ctxt, AV1_COMMON *cm,
void av1_loop_restoration_filter_frame(YV12_BUFFER_CONFIG *frame,
AV1_COMMON *cm, int optimized_lr,
void *lr_ctxt) {
- assert(!cm->all_lossless);
+ assert(!cm->features.all_lossless);
const int num_planes = av1_num_planes(cm);
AV1LrStruct *loop_rest_ctxt = (AV1LrStruct *)lr_ctxt;
@@ -1308,7 +1318,7 @@ int av1_loop_restoration_corners_in_sb(const struct AV1Common *cm, int plane,
if (bsize != cm->seq_params.sb_size) return 0;
if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) return 0;
- assert(!cm->all_lossless);
+ assert(!cm->features.all_lossless);
const int is_uv = plane > 0;
diff --git a/media/libaom/src/av1/common/restoration.h b/media/libaom/src/av1/common/restoration.h
index d834f9270..3b80dd5a9 100644
--- a/media/libaom/src/av1/common/restoration.h
+++ b/media/libaom/src/av1/common/restoration.h
@@ -22,6 +22,8 @@
extern "C" {
#endif
+// Border for the loop restoration buffer
+#define AOM_RESTORATION_FRAME_BORDER 32
#define CLIP(x, lo, hi) ((x) < (lo) ? (lo) : (x) > (hi) ? (hi) : (x))
#define RINT(x) ((x) < 0 ? (int)((x)-0.5) : (int)((x) + 0.5))
@@ -120,6 +122,7 @@ extern "C" {
// If WIENER_WIN_CHROMA == WIENER_WIN - 2, that implies 5x5 filters are used for
// chroma. To use 7x7 for chroma set WIENER_WIN_CHROMA to WIENER_WIN.
#define WIENER_WIN_CHROMA (WIENER_WIN - 2)
+#define WIENER_WIN_REDUCED (WIENER_WIN - 2)
#define WIENER_WIN2_CHROMA ((WIENER_WIN_CHROMA) * (WIENER_WIN_CHROMA))
#define WIENER_FILT_PREC_BITS 7
@@ -275,18 +278,18 @@ typedef struct AV1LrStruct {
YV12_BUFFER_CONFIG *dst;
} AV1LrStruct;
-extern const sgr_params_type sgr_params[SGRPROJ_PARAMS];
+extern const sgr_params_type av1_sgr_params[SGRPROJ_PARAMS];
extern int sgrproj_mtable[SGRPROJ_PARAMS][2];
-extern const int32_t x_by_xplus1[256];
-extern const int32_t one_by_x[MAX_NELEM];
+extern const int32_t av1_x_by_xplus1[256];
+extern const int32_t av1_one_by_x[MAX_NELEM];
void av1_alloc_restoration_struct(struct AV1Common *cm, RestorationInfo *rsi,
int is_uv);
void av1_free_restoration_struct(RestorationInfo *rst_info);
-void extend_frame(uint8_t *data, int width, int height, int stride,
- int border_horz, int border_vert, int highbd);
-void decode_xq(const int *xqd, int *xq, const sgr_params_type *params);
+void av1_extend_frame(uint8_t *data, int width, int height, int stride,
+ int border_horz, int border_vert, int highbd);
+void av1_decode_xq(const int *xqd, int *xq, const sgr_params_type *params);
// Filter a single loop restoration unit.
//
diff --git a/media/libaom/src/av1/common/scale.c b/media/libaom/src/av1/common/scale.c
index c525fe229..3b14c0a2c 100644
--- a/media/libaom/src/av1/common/scale.c
+++ b/media/libaom/src/av1/common/scale.c
@@ -37,7 +37,7 @@ static INLINE int scaled_y(int val, const struct scale_factors *sf) {
// Note: Expect val to be in q4 precision
static int unscaled_value(int val, const struct scale_factors *sf) {
(void)sf;
- return val << SCALE_EXTRA_BITS;
+ return val * (1 << SCALE_EXTRA_BITS);
}
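
The switch from `val << SCALE_EXTRA_BITS` to `val * (1 << SCALE_EXTRA_BITS)` matters because val is a signed q4 coordinate that can be negative, and left-shifting a negative signed integer is undefined behaviour in C; the multiplication is well defined whenever the result is representable, and compilers still emit a shift for it. A trivial standalone illustration (the shift amount here is arbitrary, not the actual SCALE_EXTRA_BITS value):

  #include <stdio.h>

  int main(void) {
    const int val = -3;
    /* val << 3 would be undefined behaviour because val is negative. */
    printf("%d\n", val * (1 << 3)); /* well defined: prints -24 */
    return 0;
  }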
static int get_fixed_point_scale_factor(int other_size, int this_size) {
@@ -88,39 +88,41 @@ void av1_setup_scale_factors_for_frame(struct scale_factors *sf, int other_w,
// AV1 convolve functions
// Special case convolve functions should produce the same result as
// av1_convolve_2d.
- // subpel_x_q4 == 0 && subpel_y_q4 == 0
+ // subpel_x_qn == 0 && subpel_y_qn == 0
sf->convolve[0][0][0] = av1_convolve_2d_copy_sr;
- // subpel_x_q4 == 0
+ // subpel_x_qn == 0
sf->convolve[0][1][0] = av1_convolve_y_sr;
- // subpel_y_q4 == 0
+ // subpel_y_qn == 0
sf->convolve[1][0][0] = av1_convolve_x_sr;
- // subpel_x_q4 != 0 && subpel_y_q4 != 0
+ // subpel_x_qn != 0 && subpel_y_qn != 0
sf->convolve[1][1][0] = av1_convolve_2d_sr;
- // subpel_x_q4 == 0 && subpel_y_q4 == 0
- sf->convolve[0][0][1] = av1_jnt_convolve_2d_copy;
- // subpel_x_q4 == 0
- sf->convolve[0][1][1] = av1_jnt_convolve_y;
- // subpel_y_q4 == 0
- sf->convolve[1][0][1] = av1_jnt_convolve_x;
- // subpel_x_q4 != 0 && subpel_y_q4 != 0
- sf->convolve[1][1][1] = av1_jnt_convolve_2d;
+ // subpel_x_qn == 0 && subpel_y_qn == 0
+ sf->convolve[0][0][1] = av1_dist_wtd_convolve_2d_copy;
+ // subpel_x_qn == 0
+ sf->convolve[0][1][1] = av1_dist_wtd_convolve_y;
+ // subpel_y_qn == 0
+ sf->convolve[1][0][1] = av1_dist_wtd_convolve_x;
+ // subpel_x_qn != 0 && subpel_y_qn != 0
+ sf->convolve[1][1][1] = av1_dist_wtd_convolve_2d;
+#if CONFIG_AV1_HIGHBITDEPTH
// AV1 High BD convolve functions
// Special case convolve functions should produce the same result as
// av1_highbd_convolve_2d.
- // subpel_x_q4 == 0 && subpel_y_q4 == 0
+ // subpel_x_qn == 0 && subpel_y_qn == 0
sf->highbd_convolve[0][0][0] = av1_highbd_convolve_2d_copy_sr;
- // subpel_x_q4 == 0
+ // subpel_x_qn == 0
sf->highbd_convolve[0][1][0] = av1_highbd_convolve_y_sr;
- // subpel_y_q4 == 0
+ // subpel_y_qn == 0
sf->highbd_convolve[1][0][0] = av1_highbd_convolve_x_sr;
- // subpel_x_q4 != 0 && subpel_y_q4 != 0
+ // subpel_x_qn != 0 && subpel_y_qn != 0
sf->highbd_convolve[1][1][0] = av1_highbd_convolve_2d_sr;
- // subpel_x_q4 == 0 && subpel_y_q4 == 0
- sf->highbd_convolve[0][0][1] = av1_highbd_jnt_convolve_2d_copy;
- // subpel_x_q4 == 0
- sf->highbd_convolve[0][1][1] = av1_highbd_jnt_convolve_y;
- // subpel_y_q4 == 0
- sf->highbd_convolve[1][0][1] = av1_highbd_jnt_convolve_x;
- // subpel_x_q4 != 0 && subpel_y_q4 != 0
- sf->highbd_convolve[1][1][1] = av1_highbd_jnt_convolve_2d;
+ // subpel_x_qn == 0 && subpel_y_qn == 0
+ sf->highbd_convolve[0][0][1] = av1_highbd_dist_wtd_convolve_2d_copy;
+ // subpel_x_qn == 0
+ sf->highbd_convolve[0][1][1] = av1_highbd_dist_wtd_convolve_y;
+ // subpel_y_qn == 0
+ sf->highbd_convolve[1][0][1] = av1_highbd_dist_wtd_convolve_x;
+ // subpel_x_qn != 0 && subpel_y_qn != 0
+ sf->highbd_convolve[1][1][1] = av1_highbd_dist_wtd_convolve_2d;
+#endif
}
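
Reading the tables as annotated above: sf->convolve[x][y][c] (and sf->highbd_convolve[x][y][c], now only built when CONFIG_AV1_HIGHBITDEPTH is enabled) is selected with x = (subpel_x_qn != 0), y = (subpel_y_qn != 0), and c = 1 for the distance-weighted compound variants versus c = 0 for the single-reference ones, so the renamed av1_dist_wtd_convolve_* functions occupy exactly the slots the old av1_jnt_convolve_* functions did.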
diff --git a/media/libaom/src/av1/common/scale.h b/media/libaom/src/av1/common/scale.h
index 748e958c3..16b40bde8 100644
--- a/media/libaom/src/av1/common/scale.h
+++ b/media/libaom/src/av1/common/scale.h
@@ -45,11 +45,13 @@ void av1_setup_scale_factors_for_frame(struct scale_factors *sf, int other_w,
int other_h, int this_w, int this_h);
static INLINE int av1_is_valid_scale(const struct scale_factors *sf) {
+ assert(sf != NULL);
return sf->x_scale_fp != REF_INVALID_SCALE &&
sf->y_scale_fp != REF_INVALID_SCALE;
}
static INLINE int av1_is_scaled(const struct scale_factors *sf) {
+ assert(sf != NULL);
return av1_is_valid_scale(sf) &&
(sf->x_scale_fp != REF_NO_SCALE || sf->y_scale_fp != REF_NO_SCALE);
}
diff --git a/media/libaom/src/av1/common/scan.c b/media/libaom/src/av1/common/scan.c
index 31a787b53..c1d4f3581 100644
--- a/media/libaom/src/av1/common/scan.c
+++ b/media/libaom/src/av1/common/scan.c
@@ -14,9 +14,9 @@
#include "av1/common/common_data.h"
#include "av1/common/scan.h"
-DECLARE_ALIGNED(16, static const int16_t, default_scan_4x4[16]) = {
- 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
-};
+DECLARE_ALIGNED(16, static const int16_t,
+ default_scan_4x4[16]) = { 0, 1, 4, 8, 5, 2, 3, 6,
+ 9, 12, 13, 10, 7, 11, 14, 15 };
DECLARE_ALIGNED(16, static const int16_t, mcol_scan_4x4[16]) = {
0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
@@ -839,1546 +839,9 @@ DECLARE_ALIGNED(16, static const int16_t, default_scan_32x32[1024]) = {
927, 958, 989, 1020, 1021, 990, 959, 991, 1022, 1023
};
-// Neighborhood 2-tuples for various scans and blocksizes,
-// in {top, left} order for each position in corresponding scan order.
-DECLARE_ALIGNED(16, static const int16_t,
- default_scan_4x4_neighbors[17 * MAX_NEIGHBORS]) = {
- 0, 0, 0, 0, 0, 0, 4, 4, 1, 4, 1, 1, 2, 2, 2, 5, 5,
- 8, 8, 8, 9, 12, 6, 9, 3, 6, 7, 10, 10, 13, 11, 14, 0, 0
-};
-
-DECLARE_ALIGNED(16, static const int16_t,
- mcol_scan_4x4_neighbors[17 * MAX_NEIGHBORS]) = {
- 0, 0, 0, 0, 4, 4, 8, 8, 0, 0, 1, 4, 5, 8, 9, 12, 1,
- 1, 2, 5, 6, 9, 10, 13, 2, 2, 3, 6, 7, 10, 11, 14, 0, 0,
-};
-
-DECLARE_ALIGNED(16, static const int16_t,
- mrow_scan_4x4_neighbors[17 * MAX_NEIGHBORS]) = {
- 0, 0, 0, 0, 1, 1, 2, 2, 0, 0, 1, 4, 2, 5, 3, 6, 4,
- 4, 5, 8, 6, 9, 7, 10, 8, 8, 9, 12, 10, 13, 11, 14, 0, 0,
-};
-
-DECLARE_ALIGNED(16, static const int16_t,
- default_scan_4x8_neighbors[33 * MAX_NEIGHBORS]) = {
- 0, 0, 0, 0, 0, 0, 1, 4, 1, 1, 4, 4, 2, 5, 5, 8, 6,
- 9, 2, 2, 8, 8, 3, 6, 9, 12, 7, 10, 10, 13, 12, 12, 13, 16,
- 11, 14, 14, 17, 15, 18, 16, 16, 17, 20, 18, 21, 19, 22, 20, 20, 21,
- 24, 22, 25, 23, 26, 24, 24, 25, 28, 26, 29, 27, 30, 0, 0
-};
-
-DECLARE_ALIGNED(16, static const int16_t,
- mcol_scan_4x8_neighbors[33 * MAX_NEIGHBORS]) = {
- 0, 0, 0, 0, 4, 4, 8, 8, 12, 12, 16, 16, 20, 20, 24, 24, 0,
- 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 1, 1,
- 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 2, 2, 3,
- 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 0, 0
-};
-
-DECLARE_ALIGNED(16, static const int16_t,
- mrow_scan_4x8_neighbors[33 * MAX_NEIGHBORS]) = {
- 0, 0, 0, 0, 1, 1, 2, 2, 0, 0, 1, 4, 2, 5, 3, 6, 4,
- 4, 5, 8, 6, 9, 7, 10, 8, 8, 9, 12, 10, 13, 11, 14, 12, 12,
- 13, 16, 14, 17, 15, 18, 16, 16, 17, 20, 18, 21, 19, 22, 20, 20, 21,
- 24, 22, 25, 23, 26, 24, 24, 25, 28, 26, 29, 27, 30, 0, 0
-};
-
-DECLARE_ALIGNED(16, static const int16_t,
- default_scan_8x4_neighbors[33 * MAX_NEIGHBORS]) = {
- 0, 0, 0, 0, 0, 0, 1, 8, 1, 1, 8, 8, 2, 9, 9, 16, 10,
- 17, 2, 2, 16, 16, 3, 10, 17, 24, 11, 18, 18, 25, 3, 3, 4, 11,
- 19, 26, 12, 19, 4, 4, 20, 27, 5, 12, 13, 20, 21, 28, 5, 5, 6,
- 13, 14, 21, 22, 29, 6, 6, 7, 14, 15, 22, 23, 30, 0, 0
-};
-
-DECLARE_ALIGNED(16, static const int16_t,
- mcol_scan_8x4_neighbors[33 * MAX_NEIGHBORS]) = {
- 0, 0, 0, 0, 8, 8, 16, 16, 0, 0, 1, 8, 9, 16, 17, 24, 1,
- 1, 2, 9, 10, 17, 18, 25, 2, 2, 3, 10, 11, 18, 19, 26, 3, 3,
- 4, 11, 12, 19, 20, 27, 4, 4, 5, 12, 13, 20, 21, 28, 5, 5, 6,
- 13, 14, 21, 22, 29, 6, 6, 7, 14, 15, 22, 23, 30, 0, 0
-};
-
-DECLARE_ALIGNED(16, static const int16_t,
- mrow_scan_8x4_neighbors[33 * MAX_NEIGHBORS]) = {
- 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 0,
- 0, 1, 8, 2, 9, 3, 10, 4, 11, 5, 12, 6, 13, 7, 14, 8, 8,
- 9, 16, 10, 17, 11, 18, 12, 19, 13, 20, 14, 21, 15, 22, 16, 16, 17,
- 24, 18, 25, 19, 26, 20, 27, 21, 28, 22, 29, 23, 30, 0, 0
-};
-
-DECLARE_ALIGNED(16, static const int16_t,
- default_scan_4x16_neighbors[65 * MAX_NEIGHBORS]) = {
- 0, 0, 0, 0, 0, 0, 1, 4, 1, 1, 4, 4, 2, 5, 5, 8, 6, 9, 2,
- 2, 8, 8, 3, 6, 9, 12, 7, 10, 10, 13, 12, 12, 13, 16, 11, 14, 14, 17,
- 15, 18, 16, 16, 17, 20, 18, 21, 19, 22, 20, 20, 21, 24, 22, 25, 23, 26, 24,
- 24, 25, 28, 26, 29, 27, 30, 28, 28, 29, 32, 30, 33, 31, 34, 32, 32, 33, 36,
- 34, 37, 35, 38, 36, 36, 37, 40, 38, 41, 39, 42, 40, 40, 41, 44, 42, 45, 43,
- 46, 44, 44, 45, 48, 46, 49, 47, 50, 48, 48, 49, 52, 50, 53, 51, 54, 52, 52,
- 53, 56, 54, 57, 55, 58, 56, 56, 57, 60, 58, 61, 59, 62, 0, 0
-};
-
-DECLARE_ALIGNED(16, static const int16_t,
- default_scan_16x4_neighbors[65 * MAX_NEIGHBORS]) = {
- 0, 0, 0, 0, 0, 0, 1, 16, 1, 1, 16, 16, 2, 17, 17, 32, 18, 33, 2,
- 2, 32, 32, 3, 18, 33, 48, 19, 34, 34, 49, 3, 3, 4, 19, 35, 50, 20, 35,
- 4, 4, 36, 51, 5, 20, 21, 36, 37, 52, 5, 5, 6, 21, 22, 37, 38, 53, 6,
- 6, 7, 22, 23, 38, 39, 54, 7, 7, 8, 23, 24, 39, 40, 55, 8, 8, 9, 24,
- 25, 40, 41, 56, 9, 9, 10, 25, 26, 41, 42, 57, 10, 10, 11, 26, 27, 42, 43,
- 58, 11, 11, 12, 27, 28, 43, 44, 59, 12, 12, 13, 28, 29, 44, 45, 60, 13, 13,
- 14, 29, 30, 45, 46, 61, 14, 14, 15, 30, 31, 46, 47, 62, 0, 0
-};
-
-DECLARE_ALIGNED(16, static const int16_t,
- mrow_scan_4x16_neighbors[65 * MAX_NEIGHBORS]) = {
- 0, 0, 0, 0, 1, 1, 2, 2, 0, 0, 1, 4, 2, 5, 3, 6, 4, 4, 5,
- 8, 6, 9, 7, 10, 8, 8, 9, 12, 10, 13, 11, 14, 12, 12, 13, 16, 14, 17,
- 15, 18, 16, 16, 17, 20, 18, 21, 19, 22, 20, 20, 21, 24, 22, 25, 23, 26, 24,
- 24, 25, 28, 26, 29, 27, 30, 28, 28, 29, 32, 30, 33, 31, 34, 32, 32, 33, 36,
- 34, 37, 35, 38, 36, 36, 37, 40, 38, 41, 39, 42, 40, 40, 41, 44, 42, 45, 43,
- 46, 44, 44, 45, 48, 46, 49, 47, 50, 48, 48, 49, 52, 50, 53, 51, 54, 52, 52,
- 53, 56, 54, 57, 55, 58, 56, 56, 57, 60, 58, 61, 59, 62, 0, 0
-};
-
-DECLARE_ALIGNED(16, static const int16_t,
- mrow_scan_16x4_neighbors[65 * MAX_NEIGHBORS]) = {
- 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
- 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 0, 0, 1, 16, 2, 17,
- 3, 18, 4, 19, 5, 20, 6, 21, 7, 22, 8, 23, 9, 24, 10, 25, 11, 26, 12,
- 27, 13, 28, 14, 29, 15, 30, 16, 16, 17, 32, 18, 33, 19, 34, 20, 35, 21, 36,
- 22, 37, 23, 38, 24, 39, 25, 40, 26, 41, 27, 42, 28, 43, 29, 44, 30, 45, 31,
- 46, 32, 32, 33, 48, 34, 49, 35, 50, 36, 51, 37, 52, 38, 53, 39, 54, 40, 55,
- 41, 56, 42, 57, 43, 58, 44, 59, 45, 60, 46, 61, 47, 62, 0, 0
-};
-
-DECLARE_ALIGNED(16, static const int16_t,
- mcol_scan_4x16_neighbors[65 * MAX_NEIGHBORS]) = {
- 0, 0, 0, 0, 4, 4, 8, 8, 12, 12, 16, 16, 20, 20, 24, 24, 28, 28, 32,
- 32, 36, 36, 40, 40, 44, 44, 48, 48, 52, 52, 56, 56, 0, 0, 1, 4, 5, 8,
- 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29, 32, 33, 36, 37, 40, 41, 44, 45,
- 48, 49, 52, 53, 56, 57, 60, 1, 1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21,
- 22, 25, 26, 29, 30, 33, 34, 37, 38, 41, 42, 45, 46, 49, 50, 53, 54, 57, 58,
- 61, 2, 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31, 34,
- 35, 38, 39, 42, 43, 46, 47, 50, 51, 54, 55, 58, 59, 62, 0, 0
-};
-
-DECLARE_ALIGNED(16, static const int16_t,
- mcol_scan_16x4_neighbors[65 * MAX_NEIGHBORS]) = {
- 0, 0, 0, 0, 16, 16, 32, 32, 0, 0, 1, 16, 17, 32, 33, 48, 1, 1, 2,
- 17, 18, 33, 34, 49, 2, 2, 3, 18, 19, 34, 35, 50, 3, 3, 4, 19, 20, 35,
- 36, 51, 4, 4, 5, 20, 21, 36, 37, 52, 5, 5, 6, 21, 22, 37, 38, 53, 6,
- 6, 7, 22, 23, 38, 39, 54, 7, 7, 8, 23, 24, 39, 40, 55, 8, 8, 9, 24,
- 25, 40, 41, 56, 9, 9, 10, 25, 26, 41, 42, 57, 10, 10, 11, 26, 27, 42, 43,
- 58, 11, 11, 12, 27, 28, 43, 44, 59, 12, 12, 13, 28, 29, 44, 45, 60, 13, 13,
- 14, 29, 30, 45, 46, 61, 14, 14, 15, 30, 31, 46, 47, 62, 0, 0
-};
-
-DECLARE_ALIGNED(16, static const int16_t,
- default_scan_8x32_neighbors[257 * MAX_NEIGHBORS]) = {
- 0, 0, 0, 0, 0, 0, 1, 1, 1, 8, 8, 8, 2, 2, 2,
- 9, 9, 16, 16, 16, 3, 3, 3, 10, 10, 17, 17, 24, 24, 24,
- 4, 4, 4, 11, 11, 18, 18, 25, 25, 32, 32, 32, 5, 5, 5,
- 12, 12, 19, 19, 26, 26, 33, 33, 40, 40, 40, 6, 6, 6, 13,
- 13, 20, 20, 27, 27, 34, 34, 41, 41, 48, 48, 48, 7, 14, 14,
- 21, 21, 28, 28, 35, 35, 42, 42, 49, 49, 56, 56, 56, 15, 22,
- 22, 29, 29, 36, 36, 43, 43, 50, 50, 57, 57, 64, 64, 64, 23,
- 30, 30, 37, 37, 44, 44, 51, 51, 58, 58, 65, 65, 72, 72, 72,
- 31, 38, 38, 45, 45, 52, 52, 59, 59, 66, 66, 73, 73, 80, 80,
- 80, 39, 46, 46, 53, 53, 60, 60, 67, 67, 74, 74, 81, 81, 88,
- 88, 88, 47, 54, 54, 61, 61, 68, 68, 75, 75, 82, 82, 89, 89,
- 96, 96, 96, 55, 62, 62, 69, 69, 76, 76, 83, 83, 90, 90, 97,
- 97, 104, 104, 104, 63, 70, 70, 77, 77, 84, 84, 91, 91, 98, 98,
- 105, 105, 112, 112, 112, 71, 78, 78, 85, 85, 92, 92, 99, 99, 106,
- 106, 113, 113, 120, 120, 120, 79, 86, 86, 93, 93, 100, 100, 107, 107,
- 114, 114, 121, 121, 128, 128, 128, 87, 94, 94, 101, 101, 108, 108, 115,
- 115, 122, 122, 129, 129, 136, 136, 136, 95, 102, 102, 109, 109, 116, 116,
- 123, 123, 130, 130, 137, 137, 144, 144, 144, 103, 110, 110, 117, 117, 124,
- 124, 131, 131, 138, 138, 145, 145, 152, 152, 152, 111, 118, 118, 125, 125,
- 132, 132, 139, 139, 146, 146, 153, 153, 160, 160, 160, 119, 126, 126, 133,
- 133, 140, 140, 147, 147, 154, 154, 161, 161, 168, 168, 168, 127, 134, 134,
- 141, 141, 148, 148, 155, 155, 162, 162, 169, 169, 176, 176, 176, 135, 142,
- 142, 149, 149, 156, 156, 163, 163, 170, 170, 177, 177, 184, 184, 184, 143,
- 150, 150, 157, 157, 164, 164, 171, 171, 178, 178, 185, 185, 192, 192, 192,
- 151, 158, 158, 165, 165, 172, 172, 179, 179, 186, 186, 193, 193, 200, 200,
- 200, 159, 166, 166, 173, 173, 180, 180, 187, 187, 194, 194, 201, 201, 208,
- 208, 208, 167, 174, 174, 181, 181, 188, 188, 195, 195, 202, 202, 209, 209,
- 216, 216, 216, 175, 182, 182, 189, 189, 196, 196, 203, 203, 210, 210, 217,
- 217, 224, 224, 224, 183, 190, 190, 197, 197, 204, 204, 211, 211, 218, 218,
- 225, 225, 232, 232, 232, 191, 198, 198, 205, 205, 212, 212, 219, 219, 226,
- 226, 233, 233, 240, 240, 240, 199, 206, 206, 213, 213, 220, 220, 227, 227,
- 234, 234, 241, 241, 248, 207, 214, 214, 221, 221, 228, 228, 235, 235, 242,
- 242, 249, 215, 222, 222, 229, 229, 236, 236, 243, 243, 250, 223, 230, 230,
- 237, 237, 244, 244, 251, 231, 238, 238, 245, 245, 252, 239, 246, 246, 253,
- 247, 254, 0, 0
-};
-
-DECLARE_ALIGNED(16, static const int16_t,
- default_scan_32x8_neighbors[257 * MAX_NEIGHBORS]) = {
- 0, 0, 0, 0, 0, 0, 1, 1, 1, 32, 32, 32, 2, 2, 2,
- 33, 33, 64, 64, 64, 3, 3, 3, 34, 34, 65, 65, 96, 96, 96,
- 4, 4, 4, 35, 35, 66, 66, 97, 97, 128, 128, 128, 5, 5, 5,
- 36, 36, 67, 67, 98, 98, 129, 129, 160, 160, 160, 6, 6, 6, 37,
- 37, 68, 68, 99, 99, 130, 130, 161, 161, 192, 192, 192, 7, 7, 7,
- 38, 38, 69, 69, 100, 100, 131, 131, 162, 162, 193, 193, 224, 8, 8,
- 8, 39, 39, 70, 70, 101, 101, 132, 132, 163, 163, 194, 194, 225, 9,
- 9, 9, 40, 40, 71, 71, 102, 102, 133, 133, 164, 164, 195, 195, 226,
- 10, 10, 10, 41, 41, 72, 72, 103, 103, 134, 134, 165, 165, 196, 196,
- 227, 11, 11, 11, 42, 42, 73, 73, 104, 104, 135, 135, 166, 166, 197,
- 197, 228, 12, 12, 12, 43, 43, 74, 74, 105, 105, 136, 136, 167, 167,
- 198, 198, 229, 13, 13, 13, 44, 44, 75, 75, 106, 106, 137, 137, 168,
- 168, 199, 199, 230, 14, 14, 14, 45, 45, 76, 76, 107, 107, 138, 138,
- 169, 169, 200, 200, 231, 15, 15, 15, 46, 46, 77, 77, 108, 108, 139,
- 139, 170, 170, 201, 201, 232, 16, 16, 16, 47, 47, 78, 78, 109, 109,
- 140, 140, 171, 171, 202, 202, 233, 17, 17, 17, 48, 48, 79, 79, 110,
- 110, 141, 141, 172, 172, 203, 203, 234, 18, 18, 18, 49, 49, 80, 80,
- 111, 111, 142, 142, 173, 173, 204, 204, 235, 19, 19, 19, 50, 50, 81,
- 81, 112, 112, 143, 143, 174, 174, 205, 205, 236, 20, 20, 20, 51, 51,
- 82, 82, 113, 113, 144, 144, 175, 175, 206, 206, 237, 21, 21, 21, 52,
- 52, 83, 83, 114, 114, 145, 145, 176, 176, 207, 207, 238, 22, 22, 22,
- 53, 53, 84, 84, 115, 115, 146, 146, 177, 177, 208, 208, 239, 23, 23,
- 23, 54, 54, 85, 85, 116, 116, 147, 147, 178, 178, 209, 209, 240, 24,
- 24, 24, 55, 55, 86, 86, 117, 117, 148, 148, 179, 179, 210, 210, 241,
- 25, 25, 25, 56, 56, 87, 87, 118, 118, 149, 149, 180, 180, 211, 211,
- 242, 26, 26, 26, 57, 57, 88, 88, 119, 119, 150, 150, 181, 181, 212,
- 212, 243, 27, 27, 27, 58, 58, 89, 89, 120, 120, 151, 151, 182, 182,
- 213, 213, 244, 28, 28, 28, 59, 59, 90, 90, 121, 121, 152, 152, 183,
- 183, 214, 214, 245, 29, 29, 29, 60, 60, 91, 91, 122, 122, 153, 153,
- 184, 184, 215, 215, 246, 30, 30, 30, 61, 61, 92, 92, 123, 123, 154,
- 154, 185, 185, 216, 216, 247, 31, 62, 62, 93, 93, 124, 124, 155, 155,
- 186, 186, 217, 217, 248, 63, 94, 94, 125, 125, 156, 156, 187, 187, 218,
- 218, 249, 95, 126, 126, 157, 157, 188, 188, 219, 219, 250, 127, 158, 158,
- 189, 189, 220, 220, 251, 159, 190, 190, 221, 221, 252, 191, 222, 222, 253,
- 223, 254, 0, 0
-};
-
-DECLARE_ALIGNED(16, static const int16_t,
- mrow_scan_8x32_neighbors[257 * MAX_NEIGHBORS]) = {
- 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6,
- 6, 0, 0, 1, 8, 2, 9, 3, 10, 4, 11, 5, 12, 6, 13,
- 7, 14, 8, 8, 9, 16, 10, 17, 11, 18, 12, 19, 13, 20, 14,
- 21, 15, 22, 16, 16, 17, 24, 18, 25, 19, 26, 20, 27, 21, 28,
- 22, 29, 23, 30, 24, 24, 25, 32, 26, 33, 27, 34, 28, 35, 29,
- 36, 30, 37, 31, 38, 32, 32, 33, 40, 34, 41, 35, 42, 36, 43,
- 37, 44, 38, 45, 39, 46, 40, 40, 41, 48, 42, 49, 43, 50, 44,
- 51, 45, 52, 46, 53, 47, 54, 48, 48, 49, 56, 50, 57, 51, 58,
- 52, 59, 53, 60, 54, 61, 55, 62, 56, 56, 57, 64, 58, 65, 59,
- 66, 60, 67, 61, 68, 62, 69, 63, 70, 64, 64, 65, 72, 66, 73,
- 67, 74, 68, 75, 69, 76, 70, 77, 71, 78, 72, 72, 73, 80, 74,
- 81, 75, 82, 76, 83, 77, 84, 78, 85, 79, 86, 80, 80, 81, 88,
- 82, 89, 83, 90, 84, 91, 85, 92, 86, 93, 87, 94, 88, 88, 89,
- 96, 90, 97, 91, 98, 92, 99, 93, 100, 94, 101, 95, 102, 96, 96,
- 97, 104, 98, 105, 99, 106, 100, 107, 101, 108, 102, 109, 103, 110, 104,
- 104, 105, 112, 106, 113, 107, 114, 108, 115, 109, 116, 110, 117, 111, 118,
- 112, 112, 113, 120, 114, 121, 115, 122, 116, 123, 117, 124, 118, 125, 119,
- 126, 120, 120, 121, 128, 122, 129, 123, 130, 124, 131, 125, 132, 126, 133,
- 127, 134, 128, 128, 129, 136, 130, 137, 131, 138, 132, 139, 133, 140, 134,
- 141, 135, 142, 136, 136, 137, 144, 138, 145, 139, 146, 140, 147, 141, 148,
- 142, 149, 143, 150, 144, 144, 145, 152, 146, 153, 147, 154, 148, 155, 149,
- 156, 150, 157, 151, 158, 152, 152, 153, 160, 154, 161, 155, 162, 156, 163,
- 157, 164, 158, 165, 159, 166, 160, 160, 161, 168, 162, 169, 163, 170, 164,
- 171, 165, 172, 166, 173, 167, 174, 168, 168, 169, 176, 170, 177, 171, 178,
- 172, 179, 173, 180, 174, 181, 175, 182, 176, 176, 177, 184, 178, 185, 179,
- 186, 180, 187, 181, 188, 182, 189, 183, 190, 184, 184, 185, 192, 186, 193,
- 187, 194, 188, 195, 189, 196, 190, 197, 191, 198, 192, 192, 193, 200, 194,
- 201, 195, 202, 196, 203, 197, 204, 198, 205, 199, 206, 200, 200, 201, 208,
- 202, 209, 203, 210, 204, 211, 205, 212, 206, 213, 207, 214, 208, 208, 209,
- 216, 210, 217, 211, 218, 212, 219, 213, 220, 214, 221, 215, 222, 216, 216,
- 217, 224, 218, 225, 219, 226, 220, 227, 221, 228, 222, 229, 223, 230, 224,
- 224, 225, 232, 226, 233, 227, 234, 228, 235, 229, 236, 230, 237, 231, 238,
- 232, 232, 233, 240, 234, 241, 235, 242, 236, 243, 237, 244, 238, 245, 239,
- 246, 240, 240, 241, 248, 242, 249, 243, 250, 244, 251, 245, 252, 246, 253,
- 247, 254, 0, 0
-};
-
-DECLARE_ALIGNED(16, static const int16_t,
- mrow_scan_32x8_neighbors[257 * MAX_NEIGHBORS]) = {
- 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6,
- 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13,
- 14, 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 19, 20, 20, 21,
- 21, 22, 22, 23, 23, 24, 24, 25, 25, 26, 26, 27, 27, 28, 28,
- 29, 29, 30, 30, 0, 0, 1, 32, 2, 33, 3, 34, 4, 35, 5,
- 36, 6, 37, 7, 38, 8, 39, 9, 40, 10, 41, 11, 42, 12, 43,
- 13, 44, 14, 45, 15, 46, 16, 47, 17, 48, 18, 49, 19, 50, 20,
- 51, 21, 52, 22, 53, 23, 54, 24, 55, 25, 56, 26, 57, 27, 58,
- 28, 59, 29, 60, 30, 61, 31, 62, 32, 32, 33, 64, 34, 65, 35,
- 66, 36, 67, 37, 68, 38, 69, 39, 70, 40, 71, 41, 72, 42, 73,
- 43, 74, 44, 75, 45, 76, 46, 77, 47, 78, 48, 79, 49, 80, 50,
- 81, 51, 82, 52, 83, 53, 84, 54, 85, 55, 86, 56, 87, 57, 88,
- 58, 89, 59, 90, 60, 91, 61, 92, 62, 93, 63, 94, 64, 64, 65,
- 96, 66, 97, 67, 98, 68, 99, 69, 100, 70, 101, 71, 102, 72, 103,
- 73, 104, 74, 105, 75, 106, 76, 107, 77, 108, 78, 109, 79, 110, 80,
- 111, 81, 112, 82, 113, 83, 114, 84, 115, 85, 116, 86, 117, 87, 118,
- 88, 119, 89, 120, 90, 121, 91, 122, 92, 123, 93, 124, 94, 125, 95,
- 126, 96, 96, 97, 128, 98, 129, 99, 130, 100, 131, 101, 132, 102, 133,
- 103, 134, 104, 135, 105, 136, 106, 137, 107, 138, 108, 139, 109, 140, 110,
- 141, 111, 142, 112, 143, 113, 144, 114, 145, 115, 146, 116, 147, 117, 148,
- 118, 149, 119, 150, 120, 151, 121, 152, 122, 153, 123, 154, 124, 155, 125,
- 156, 126, 157, 127, 158, 128, 128, 129, 160, 130, 161, 131, 162, 132, 163,
- 133, 164, 134, 165, 135, 166, 136, 167, 137, 168, 138, 169, 139, 170, 140,
- 171, 141, 172, 142, 173, 143, 174, 144, 175, 145, 176, 146, 177, 147, 178,
- 148, 179, 149, 180, 150, 181, 151, 182, 152, 183, 153, 184, 154, 185, 155,
- 186, 156, 187, 157, 188, 158, 189, 159, 190, 160, 160, 161, 192, 162, 193,
- 163, 194, 164, 195, 165, 196, 166, 197, 167, 198, 168, 199, 169, 200, 170,
- 201, 171, 202, 172, 203, 173, 204, 174, 205, 175, 206, 176, 207, 177, 208,
- 178, 209, 179, 210, 180, 211, 181, 212, 182, 213, 183, 214, 184, 215, 185,
- 216, 186, 217, 187, 218, 188, 219, 189, 220, 190, 221, 191, 222, 192, 192,
- 193, 224, 194, 225, 195, 226, 196, 227, 197, 228, 198, 229, 199, 230, 200,
- 231, 201, 232, 202, 233, 203, 234, 204, 235, 205, 236, 206, 237, 207, 238,
- 208, 239, 209, 240, 210, 241, 211, 242, 212, 243, 213, 244, 214, 245, 215,
- 246, 216, 247, 217, 248, 218, 249, 219, 250, 220, 251, 221, 252, 222, 253,
- 223, 254, 0, 0
-};
-
-DECLARE_ALIGNED(16, static const int16_t,
- mcol_scan_8x32_neighbors[257 * MAX_NEIGHBORS]) = {
- 0, 0, 0, 0, 8, 8, 16, 16, 24, 24, 32, 32, 40, 40, 48,
- 48, 56, 56, 64, 64, 72, 72, 80, 80, 88, 88, 96, 96, 104, 104,
- 112, 112, 120, 120, 128, 128, 136, 136, 144, 144, 152, 152, 160, 160, 168,
- 168, 176, 176, 184, 184, 192, 192, 200, 200, 208, 208, 216, 216, 224, 224,
- 232, 232, 240, 240, 0, 0, 1, 8, 9, 16, 17, 24, 25, 32, 33,
- 40, 41, 48, 49, 56, 57, 64, 65, 72, 73, 80, 81, 88, 89, 96,
- 97, 104, 105, 112, 113, 120, 121, 128, 129, 136, 137, 144, 145, 152, 153,
- 160, 161, 168, 169, 176, 177, 184, 185, 192, 193, 200, 201, 208, 209, 216,
- 217, 224, 225, 232, 233, 240, 241, 248, 1, 1, 2, 9, 10, 17, 18,
- 25, 26, 33, 34, 41, 42, 49, 50, 57, 58, 65, 66, 73, 74, 81,
- 82, 89, 90, 97, 98, 105, 106, 113, 114, 121, 122, 129, 130, 137, 138,
- 145, 146, 153, 154, 161, 162, 169, 170, 177, 178, 185, 186, 193, 194, 201,
- 202, 209, 210, 217, 218, 225, 226, 233, 234, 241, 242, 249, 2, 2, 3,
- 10, 11, 18, 19, 26, 27, 34, 35, 42, 43, 50, 51, 58, 59, 66,
- 67, 74, 75, 82, 83, 90, 91, 98, 99, 106, 107, 114, 115, 122, 123,
- 130, 131, 138, 139, 146, 147, 154, 155, 162, 163, 170, 171, 178, 179, 186,
- 187, 194, 195, 202, 203, 210, 211, 218, 219, 226, 227, 234, 235, 242, 243,
- 250, 3, 3, 4, 11, 12, 19, 20, 27, 28, 35, 36, 43, 44, 51,
- 52, 59, 60, 67, 68, 75, 76, 83, 84, 91, 92, 99, 100, 107, 108,
- 115, 116, 123, 124, 131, 132, 139, 140, 147, 148, 155, 156, 163, 164, 171,
- 172, 179, 180, 187, 188, 195, 196, 203, 204, 211, 212, 219, 220, 227, 228,
- 235, 236, 243, 244, 251, 4, 4, 5, 12, 13, 20, 21, 28, 29, 36,
- 37, 44, 45, 52, 53, 60, 61, 68, 69, 76, 77, 84, 85, 92, 93,
- 100, 101, 108, 109, 116, 117, 124, 125, 132, 133, 140, 141, 148, 149, 156,
- 157, 164, 165, 172, 173, 180, 181, 188, 189, 196, 197, 204, 205, 212, 213,
- 220, 221, 228, 229, 236, 237, 244, 245, 252, 5, 5, 6, 13, 14, 21,
- 22, 29, 30, 37, 38, 45, 46, 53, 54, 61, 62, 69, 70, 77, 78,
- 85, 86, 93, 94, 101, 102, 109, 110, 117, 118, 125, 126, 133, 134, 141,
- 142, 149, 150, 157, 158, 165, 166, 173, 174, 181, 182, 189, 190, 197, 198,
- 205, 206, 213, 214, 221, 222, 229, 230, 237, 238, 245, 246, 253, 6, 6,
- 7, 14, 15, 22, 23, 30, 31, 38, 39, 46, 47, 54, 55, 62, 63,
- 70, 71, 78, 79, 86, 87, 94, 95, 102, 103, 110, 111, 118, 119, 126,
- 127, 134, 135, 142, 143, 150, 151, 158, 159, 166, 167, 174, 175, 182, 183,
- 190, 191, 198, 199, 206, 207, 214, 215, 222, 223, 230, 231, 238, 239, 246,
- 247, 254, 0, 0
-};
-
-DECLARE_ALIGNED(16, static const int16_t,
- mcol_scan_32x8_neighbors[257 * MAX_NEIGHBORS]) = {
- 0, 0, 0, 0, 32, 32, 64, 64, 96, 96, 128, 128, 160, 160, 192, 192,
- 0, 0, 1, 32, 33, 64, 65, 96, 97, 128, 129, 160, 161, 192, 193, 224,
- 1, 1, 2, 33, 34, 65, 66, 97, 98, 129, 130, 161, 162, 193, 194, 225,
- 2, 2, 3, 34, 35, 66, 67, 98, 99, 130, 131, 162, 163, 194, 195, 226,
- 3, 3, 4, 35, 36, 67, 68, 99, 100, 131, 132, 163, 164, 195, 196, 227,
- 4, 4, 5, 36, 37, 68, 69, 100, 101, 132, 133, 164, 165, 196, 197, 228,
- 5, 5, 6, 37, 38, 69, 70, 101, 102, 133, 134, 165, 166, 197, 198, 229,
- 6, 6, 7, 38, 39, 70, 71, 102, 103, 134, 135, 166, 167, 198, 199, 230,
- 7, 7, 8, 39, 40, 71, 72, 103, 104, 135, 136, 167, 168, 199, 200, 231,
- 8, 8, 9, 40, 41, 72, 73, 104, 105, 136, 137, 168, 169, 200, 201, 232,
- 9, 9, 10, 41, 42, 73, 74, 105, 106, 137, 138, 169, 170, 201, 202, 233,
- 10, 10, 11, 42, 43, 74, 75, 106, 107, 138, 139, 170, 171, 202, 203, 234,
- 11, 11, 12, 43, 44, 75, 76, 107, 108, 139, 140, 171, 172, 203, 204, 235,
- 12, 12, 13, 44, 45, 76, 77, 108, 109, 140, 141, 172, 173, 204, 205, 236,
- 13, 13, 14, 45, 46, 77, 78, 109, 110, 141, 142, 173, 174, 205, 206, 237,
- 14, 14, 15, 46, 47, 78, 79, 110, 111, 142, 143, 174, 175, 206, 207, 238,
- 15, 15, 16, 47, 48, 79, 80, 111, 112, 143, 144, 175, 176, 207, 208, 239,
- 16, 16, 17, 48, 49, 80, 81, 112, 113, 144, 145, 176, 177, 208, 209, 240,
- 17, 17, 18, 49, 50, 81, 82, 113, 114, 145, 146, 177, 178, 209, 210, 241,
- 18, 18, 19, 50, 51, 82, 83, 114, 115, 146, 147, 178, 179, 210, 211, 242,
- 19, 19, 20, 51, 52, 83, 84, 115, 116, 147, 148, 179, 180, 211, 212, 243,
- 20, 20, 21, 52, 53, 84, 85, 116, 117, 148, 149, 180, 181, 212, 213, 244,
- 21, 21, 22, 53, 54, 85, 86, 117, 118, 149, 150, 181, 182, 213, 214, 245,
- 22, 22, 23, 54, 55, 86, 87, 118, 119, 150, 151, 182, 183, 214, 215, 246,
- 23, 23, 24, 55, 56, 87, 88, 119, 120, 151, 152, 183, 184, 215, 216, 247,
- 24, 24, 25, 56, 57, 88, 89, 120, 121, 152, 153, 184, 185, 216, 217, 248,
- 25, 25, 26, 57, 58, 89, 90, 121, 122, 153, 154, 185, 186, 217, 218, 249,
- 26, 26, 27, 58, 59, 90, 91, 122, 123, 154, 155, 186, 187, 218, 219, 250,
- 27, 27, 28, 59, 60, 91, 92, 123, 124, 155, 156, 187, 188, 219, 220, 251,
- 28, 28, 29, 60, 61, 92, 93, 124, 125, 156, 157, 188, 189, 220, 221, 252,
- 29, 29, 30, 61, 62, 93, 94, 125, 126, 157, 158, 189, 190, 221, 222, 253,
- 30, 30, 31, 62, 63, 94, 95, 126, 127, 158, 159, 190, 191, 222, 223, 254,
- 0, 0
-};
-
-DECLARE_ALIGNED(16, static const int16_t,
- mcol_scan_8x8_neighbors[65 * MAX_NEIGHBORS]) = {
- 0, 0, 0, 0, 8, 8, 16, 16, 24, 24, 32, 32, 40, 40, 48, 48, 0, 0, 1,
- 8, 9, 16, 17, 24, 25, 32, 33, 40, 41, 48, 49, 56, 1, 1, 2, 9, 10, 17,
- 18, 25, 26, 33, 34, 41, 42, 49, 50, 57, 2, 2, 3, 10, 11, 18, 19, 26, 27,
- 34, 35, 42, 43, 50, 51, 58, 3, 3, 4, 11, 12, 19, 20, 27, 28, 35, 36, 43,
- 44, 51, 52, 59, 4, 4, 5, 12, 13, 20, 21, 28, 29, 36, 37, 44, 45, 52, 53,
- 60, 5, 5, 6, 13, 14, 21, 22, 29, 30, 37, 38, 45, 46, 53, 54, 61, 6, 6,
- 7, 14, 15, 22, 23, 30, 31, 38, 39, 46, 47, 54, 55, 62, 0, 0,
-};
-
-DECLARE_ALIGNED(16, static const int16_t,
- mrow_scan_8x8_neighbors[65 * MAX_NEIGHBORS]) = {
- 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 0, 0, 1,
- 8, 2, 9, 3, 10, 4, 11, 5, 12, 6, 13, 7, 14, 8, 8, 9, 16, 10, 17,
- 11, 18, 12, 19, 13, 20, 14, 21, 15, 22, 16, 16, 17, 24, 18, 25, 19, 26, 20,
- 27, 21, 28, 22, 29, 23, 30, 24, 24, 25, 32, 26, 33, 27, 34, 28, 35, 29, 36,
- 30, 37, 31, 38, 32, 32, 33, 40, 34, 41, 35, 42, 36, 43, 37, 44, 38, 45, 39,
- 46, 40, 40, 41, 48, 42, 49, 43, 50, 44, 51, 45, 52, 46, 53, 47, 54, 48, 48,
- 49, 56, 50, 57, 51, 58, 52, 59, 53, 60, 54, 61, 55, 62, 0, 0,
-};
-
-DECLARE_ALIGNED(16, static const int16_t,
- default_scan_8x8_neighbors[65 * MAX_NEIGHBORS]) = {
- 0, 0, 0, 0, 0, 0, 8, 8, 1, 8, 1, 1, 2, 2, 2, 9, 9, 16, 16,
- 16, 24, 24, 17, 24, 10, 17, 3, 10, 3, 3, 4, 4, 4, 11, 11, 18, 18, 25,
- 25, 32, 32, 32, 40, 40, 33, 40, 26, 33, 19, 26, 12, 19, 5, 12, 5, 5, 6,
- 6, 6, 13, 13, 20, 20, 27, 27, 34, 34, 41, 41, 48, 48, 48, 49, 56, 42, 49,
- 35, 42, 28, 35, 21, 28, 14, 21, 7, 14, 15, 22, 22, 29, 29, 36, 36, 43, 43,
- 50, 50, 57, 51, 58, 44, 51, 37, 44, 30, 37, 23, 30, 31, 38, 38, 45, 45, 52,
- 52, 59, 53, 60, 46, 53, 39, 46, 47, 54, 54, 61, 55, 62, 0, 0
-};
-
-DECLARE_ALIGNED(16, static const int16_t,
- default_scan_8x16_neighbors[129 * MAX_NEIGHBORS]) = {
- 0, 0, 0, 0, 0, 0, 1, 1, 1, 8, 8, 8, 2, 2, 2,
- 9, 9, 16, 16, 16, 3, 3, 3, 10, 10, 17, 17, 24, 24, 24,
- 4, 4, 4, 11, 11, 18, 18, 25, 25, 32, 32, 32, 5, 5, 5,
- 12, 12, 19, 19, 26, 26, 33, 33, 40, 40, 40, 6, 6, 6, 13,
- 13, 20, 20, 27, 27, 34, 34, 41, 41, 48, 48, 48, 7, 14, 14,
- 21, 21, 28, 28, 35, 35, 42, 42, 49, 49, 56, 56, 56, 15, 22,
- 22, 29, 29, 36, 36, 43, 43, 50, 50, 57, 57, 64, 64, 64, 23,
- 30, 30, 37, 37, 44, 44, 51, 51, 58, 58, 65, 65, 72, 72, 72,
- 31, 38, 38, 45, 45, 52, 52, 59, 59, 66, 66, 73, 73, 80, 80,
- 80, 39, 46, 46, 53, 53, 60, 60, 67, 67, 74, 74, 81, 81, 88,
- 88, 88, 47, 54, 54, 61, 61, 68, 68, 75, 75, 82, 82, 89, 89,
- 96, 96, 96, 55, 62, 62, 69, 69, 76, 76, 83, 83, 90, 90, 97,
- 97, 104, 104, 104, 63, 70, 70, 77, 77, 84, 84, 91, 91, 98, 98,
- 105, 105, 112, 112, 112, 71, 78, 78, 85, 85, 92, 92, 99, 99, 106,
- 106, 113, 113, 120, 79, 86, 86, 93, 93, 100, 100, 107, 107, 114, 114,
- 121, 87, 94, 94, 101, 101, 108, 108, 115, 115, 122, 95, 102, 102, 109,
- 109, 116, 116, 123, 103, 110, 110, 117, 117, 124, 111, 118, 118, 125, 119,
- 126, 0, 0
-};
-
-DECLARE_ALIGNED(16, static const int16_t,
- default_scan_16x8_neighbors[129 * MAX_NEIGHBORS]) = {
- 0, 0, 0, 0, 0, 0, 1, 1, 1, 16, 16, 16, 2, 2, 2,
- 17, 17, 32, 32, 32, 3, 3, 3, 18, 18, 33, 33, 48, 48, 48,
- 4, 4, 4, 19, 19, 34, 34, 49, 49, 64, 64, 64, 5, 5, 5,
- 20, 20, 35, 35, 50, 50, 65, 65, 80, 80, 80, 6, 6, 6, 21,
- 21, 36, 36, 51, 51, 66, 66, 81, 81, 96, 96, 96, 7, 7, 7,
- 22, 22, 37, 37, 52, 52, 67, 67, 82, 82, 97, 97, 112, 8, 8,
- 8, 23, 23, 38, 38, 53, 53, 68, 68, 83, 83, 98, 98, 113, 9,
- 9, 9, 24, 24, 39, 39, 54, 54, 69, 69, 84, 84, 99, 99, 114,
- 10, 10, 10, 25, 25, 40, 40, 55, 55, 70, 70, 85, 85, 100, 100,
- 115, 11, 11, 11, 26, 26, 41, 41, 56, 56, 71, 71, 86, 86, 101,
- 101, 116, 12, 12, 12, 27, 27, 42, 42, 57, 57, 72, 72, 87, 87,
- 102, 102, 117, 13, 13, 13, 28, 28, 43, 43, 58, 58, 73, 73, 88,
- 88, 103, 103, 118, 14, 14, 14, 29, 29, 44, 44, 59, 59, 74, 74,
- 89, 89, 104, 104, 119, 15, 30, 30, 45, 45, 60, 60, 75, 75, 90,
- 90, 105, 105, 120, 31, 46, 46, 61, 61, 76, 76, 91, 91, 106, 106,
- 121, 47, 62, 62, 77, 77, 92, 92, 107, 107, 122, 63, 78, 78, 93,
- 93, 108, 108, 123, 79, 94, 94, 109, 109, 124, 95, 110, 110, 125, 111,
- 126, 0, 0
-};
-
-DECLARE_ALIGNED(16, static const int16_t,
- mcol_scan_8x16_neighbors[129 * MAX_NEIGHBORS]) = {
- 0, 0, 0, 0, 8, 8, 16, 16, 24, 24, 32, 32, 40, 40, 48, 48,
- 56, 56, 64, 64, 72, 72, 80, 80, 88, 88, 96, 96, 104, 104, 112, 112,
- 0, 0, 1, 8, 9, 16, 17, 24, 25, 32, 33, 40, 41, 48, 49, 56,
- 57, 64, 65, 72, 73, 80, 81, 88, 89, 96, 97, 104, 105, 112, 113, 120,
- 1, 1, 2, 9, 10, 17, 18, 25, 26, 33, 34, 41, 42, 49, 50, 57,
- 58, 65, 66, 73, 74, 81, 82, 89, 90, 97, 98, 105, 106, 113, 114, 121,
- 2, 2, 3, 10, 11, 18, 19, 26, 27, 34, 35, 42, 43, 50, 51, 58,
- 59, 66, 67, 74, 75, 82, 83, 90, 91, 98, 99, 106, 107, 114, 115, 122,
- 3, 3, 4, 11, 12, 19, 20, 27, 28, 35, 36, 43, 44, 51, 52, 59,
- 60, 67, 68, 75, 76, 83, 84, 91, 92, 99, 100, 107, 108, 115, 116, 123,
- 4, 4, 5, 12, 13, 20, 21, 28, 29, 36, 37, 44, 45, 52, 53, 60,
- 61, 68, 69, 76, 77, 84, 85, 92, 93, 100, 101, 108, 109, 116, 117, 124,
- 5, 5, 6, 13, 14, 21, 22, 29, 30, 37, 38, 45, 46, 53, 54, 61,
- 62, 69, 70, 77, 78, 85, 86, 93, 94, 101, 102, 109, 110, 117, 118, 125,
- 6, 6, 7, 14, 15, 22, 23, 30, 31, 38, 39, 46, 47, 54, 55, 62,
- 63, 70, 71, 78, 79, 86, 87, 94, 95, 102, 103, 110, 111, 118, 119, 126,
- 0, 0
-};
-
-DECLARE_ALIGNED(16, static const int16_t,
- mcol_scan_16x8_neighbors[129 * MAX_NEIGHBORS]) = {
- 0, 0, 0, 0, 16, 16, 32, 32, 48, 48, 64, 64, 80, 80, 96, 96,
- 0, 0, 1, 16, 17, 32, 33, 48, 49, 64, 65, 80, 81, 96, 97, 112,
- 1, 1, 2, 17, 18, 33, 34, 49, 50, 65, 66, 81, 82, 97, 98, 113,
- 2, 2, 3, 18, 19, 34, 35, 50, 51, 66, 67, 82, 83, 98, 99, 114,
- 3, 3, 4, 19, 20, 35, 36, 51, 52, 67, 68, 83, 84, 99, 100, 115,
- 4, 4, 5, 20, 21, 36, 37, 52, 53, 68, 69, 84, 85, 100, 101, 116,
- 5, 5, 6, 21, 22, 37, 38, 53, 54, 69, 70, 85, 86, 101, 102, 117,
- 6, 6, 7, 22, 23, 38, 39, 54, 55, 70, 71, 86, 87, 102, 103, 118,
- 7, 7, 8, 23, 24, 39, 40, 55, 56, 71, 72, 87, 88, 103, 104, 119,
- 8, 8, 9, 24, 25, 40, 41, 56, 57, 72, 73, 88, 89, 104, 105, 120,
- 9, 9, 10, 25, 26, 41, 42, 57, 58, 73, 74, 89, 90, 105, 106, 121,
- 10, 10, 11, 26, 27, 42, 43, 58, 59, 74, 75, 90, 91, 106, 107, 122,
- 11, 11, 12, 27, 28, 43, 44, 59, 60, 75, 76, 91, 92, 107, 108, 123,
- 12, 12, 13, 28, 29, 44, 45, 60, 61, 76, 77, 92, 93, 108, 109, 124,
- 13, 13, 14, 29, 30, 45, 46, 61, 62, 77, 78, 93, 94, 109, 110, 125,
- 14, 14, 15, 30, 31, 46, 47, 62, 63, 78, 79, 94, 95, 110, 111, 126,
- 0, 0
-};
-
-DECLARE_ALIGNED(16, static const int16_t,
- mrow_scan_8x16_neighbors[129 * MAX_NEIGHBORS]) = {
- 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6,
- 6, 0, 0, 1, 8, 2, 9, 3, 10, 4, 11, 5, 12, 6, 13,
- 7, 14, 8, 8, 9, 16, 10, 17, 11, 18, 12, 19, 13, 20, 14,
- 21, 15, 22, 16, 16, 17, 24, 18, 25, 19, 26, 20, 27, 21, 28,
- 22, 29, 23, 30, 24, 24, 25, 32, 26, 33, 27, 34, 28, 35, 29,
- 36, 30, 37, 31, 38, 32, 32, 33, 40, 34, 41, 35, 42, 36, 43,
- 37, 44, 38, 45, 39, 46, 40, 40, 41, 48, 42, 49, 43, 50, 44,
- 51, 45, 52, 46, 53, 47, 54, 48, 48, 49, 56, 50, 57, 51, 58,
- 52, 59, 53, 60, 54, 61, 55, 62, 56, 56, 57, 64, 58, 65, 59,
- 66, 60, 67, 61, 68, 62, 69, 63, 70, 64, 64, 65, 72, 66, 73,
- 67, 74, 68, 75, 69, 76, 70, 77, 71, 78, 72, 72, 73, 80, 74,
- 81, 75, 82, 76, 83, 77, 84, 78, 85, 79, 86, 80, 80, 81, 88,
- 82, 89, 83, 90, 84, 91, 85, 92, 86, 93, 87, 94, 88, 88, 89,
- 96, 90, 97, 91, 98, 92, 99, 93, 100, 94, 101, 95, 102, 96, 96,
- 97, 104, 98, 105, 99, 106, 100, 107, 101, 108, 102, 109, 103, 110, 104,
- 104, 105, 112, 106, 113, 107, 114, 108, 115, 109, 116, 110, 117, 111, 118,
- 112, 112, 113, 120, 114, 121, 115, 122, 116, 123, 117, 124, 118, 125, 119,
- 126, 0, 0
-};
-
-DECLARE_ALIGNED(16, static const int16_t,
- mrow_scan_16x8_neighbors[129 * MAX_NEIGHBORS]) = {
- 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6,
- 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13,
- 14, 14, 0, 0, 1, 16, 2, 17, 3, 18, 4, 19, 5, 20, 6,
- 21, 7, 22, 8, 23, 9, 24, 10, 25, 11, 26, 12, 27, 13, 28,
- 14, 29, 15, 30, 16, 16, 17, 32, 18, 33, 19, 34, 20, 35, 21,
- 36, 22, 37, 23, 38, 24, 39, 25, 40, 26, 41, 27, 42, 28, 43,
- 29, 44, 30, 45, 31, 46, 32, 32, 33, 48, 34, 49, 35, 50, 36,
- 51, 37, 52, 38, 53, 39, 54, 40, 55, 41, 56, 42, 57, 43, 58,
- 44, 59, 45, 60, 46, 61, 47, 62, 48, 48, 49, 64, 50, 65, 51,
- 66, 52, 67, 53, 68, 54, 69, 55, 70, 56, 71, 57, 72, 58, 73,
- 59, 74, 60, 75, 61, 76, 62, 77, 63, 78, 64, 64, 65, 80, 66,
- 81, 67, 82, 68, 83, 69, 84, 70, 85, 71, 86, 72, 87, 73, 88,
- 74, 89, 75, 90, 76, 91, 77, 92, 78, 93, 79, 94, 80, 80, 81,
- 96, 82, 97, 83, 98, 84, 99, 85, 100, 86, 101, 87, 102, 88, 103,
- 89, 104, 90, 105, 91, 106, 92, 107, 93, 108, 94, 109, 95, 110, 96,
- 96, 97, 112, 98, 113, 99, 114, 100, 115, 101, 116, 102, 117, 103, 118,
- 104, 119, 105, 120, 106, 121, 107, 122, 108, 123, 109, 124, 110, 125, 111,
- 126, 0, 0
-};
-
-DECLARE_ALIGNED(16, static const int16_t,
- default_scan_16x32_neighbors[513 * MAX_NEIGHBORS]) = {
- 0, 0, 0, 0, 0, 0, 1, 1, 1, 16, 16, 16, 2, 2, 2,
- 17, 17, 32, 32, 32, 3, 3, 3, 18, 18, 33, 33, 48, 48, 48,
- 4, 4, 4, 19, 19, 34, 34, 49, 49, 64, 64, 64, 5, 5, 5,
- 20, 20, 35, 35, 50, 50, 65, 65, 80, 80, 80, 6, 6, 6, 21,
- 21, 36, 36, 51, 51, 66, 66, 81, 81, 96, 96, 96, 7, 7, 7,
- 22, 22, 37, 37, 52, 52, 67, 67, 82, 82, 97, 97, 112, 112, 112,
- 8, 8, 8, 23, 23, 38, 38, 53, 53, 68, 68, 83, 83, 98, 98,
- 113, 113, 128, 128, 128, 9, 9, 9, 24, 24, 39, 39, 54, 54, 69,
- 69, 84, 84, 99, 99, 114, 114, 129, 129, 144, 144, 144, 10, 10, 10,
- 25, 25, 40, 40, 55, 55, 70, 70, 85, 85, 100, 100, 115, 115, 130,
- 130, 145, 145, 160, 160, 160, 11, 11, 11, 26, 26, 41, 41, 56, 56,
- 71, 71, 86, 86, 101, 101, 116, 116, 131, 131, 146, 146, 161, 161, 176,
- 176, 176, 12, 12, 12, 27, 27, 42, 42, 57, 57, 72, 72, 87, 87,
- 102, 102, 117, 117, 132, 132, 147, 147, 162, 162, 177, 177, 192, 192, 192,
- 13, 13, 13, 28, 28, 43, 43, 58, 58, 73, 73, 88, 88, 103, 103,
- 118, 118, 133, 133, 148, 148, 163, 163, 178, 178, 193, 193, 208, 208, 208,
- 14, 14, 14, 29, 29, 44, 44, 59, 59, 74, 74, 89, 89, 104, 104,
- 119, 119, 134, 134, 149, 149, 164, 164, 179, 179, 194, 194, 209, 209, 224,
- 224, 224, 15, 30, 30, 45, 45, 60, 60, 75, 75, 90, 90, 105, 105,
- 120, 120, 135, 135, 150, 150, 165, 165, 180, 180, 195, 195, 210, 210, 225,
- 225, 240, 240, 240, 31, 46, 46, 61, 61, 76, 76, 91, 91, 106, 106,
- 121, 121, 136, 136, 151, 151, 166, 166, 181, 181, 196, 196, 211, 211, 226,
- 226, 241, 241, 256, 256, 256, 47, 62, 62, 77, 77, 92, 92, 107, 107,
- 122, 122, 137, 137, 152, 152, 167, 167, 182, 182, 197, 197, 212, 212, 227,
- 227, 242, 242, 257, 257, 272, 272, 272, 63, 78, 78, 93, 93, 108, 108,
- 123, 123, 138, 138, 153, 153, 168, 168, 183, 183, 198, 198, 213, 213, 228,
- 228, 243, 243, 258, 258, 273, 273, 288, 288, 288, 79, 94, 94, 109, 109,
- 124, 124, 139, 139, 154, 154, 169, 169, 184, 184, 199, 199, 214, 214, 229,
- 229, 244, 244, 259, 259, 274, 274, 289, 289, 304, 304, 304, 95, 110, 110,
- 125, 125, 140, 140, 155, 155, 170, 170, 185, 185, 200, 200, 215, 215, 230,
- 230, 245, 245, 260, 260, 275, 275, 290, 290, 305, 305, 320, 320, 320, 111,
- 126, 126, 141, 141, 156, 156, 171, 171, 186, 186, 201, 201, 216, 216, 231,
- 231, 246, 246, 261, 261, 276, 276, 291, 291, 306, 306, 321, 321, 336, 336,
- 336, 127, 142, 142, 157, 157, 172, 172, 187, 187, 202, 202, 217, 217, 232,
- 232, 247, 247, 262, 262, 277, 277, 292, 292, 307, 307, 322, 322, 337, 337,
- 352, 352, 352, 143, 158, 158, 173, 173, 188, 188, 203, 203, 218, 218, 233,
- 233, 248, 248, 263, 263, 278, 278, 293, 293, 308, 308, 323, 323, 338, 338,
- 353, 353, 368, 368, 368, 159, 174, 174, 189, 189, 204, 204, 219, 219, 234,
- 234, 249, 249, 264, 264, 279, 279, 294, 294, 309, 309, 324, 324, 339, 339,
- 354, 354, 369, 369, 384, 384, 384, 175, 190, 190, 205, 205, 220, 220, 235,
- 235, 250, 250, 265, 265, 280, 280, 295, 295, 310, 310, 325, 325, 340, 340,
- 355, 355, 370, 370, 385, 385, 400, 400, 400, 191, 206, 206, 221, 221, 236,
- 236, 251, 251, 266, 266, 281, 281, 296, 296, 311, 311, 326, 326, 341, 341,
- 356, 356, 371, 371, 386, 386, 401, 401, 416, 416, 416, 207, 222, 222, 237,
- 237, 252, 252, 267, 267, 282, 282, 297, 297, 312, 312, 327, 327, 342, 342,
- 357, 357, 372, 372, 387, 387, 402, 402, 417, 417, 432, 432, 432, 223, 238,
- 238, 253, 253, 268, 268, 283, 283, 298, 298, 313, 313, 328, 328, 343, 343,
- 358, 358, 373, 373, 388, 388, 403, 403, 418, 418, 433, 433, 448, 448, 448,
- 239, 254, 254, 269, 269, 284, 284, 299, 299, 314, 314, 329, 329, 344, 344,
- 359, 359, 374, 374, 389, 389, 404, 404, 419, 419, 434, 434, 449, 449, 464,
- 464, 464, 255, 270, 270, 285, 285, 300, 300, 315, 315, 330, 330, 345, 345,
- 360, 360, 375, 375, 390, 390, 405, 405, 420, 420, 435, 435, 450, 450, 465,
- 465, 480, 480, 480, 271, 286, 286, 301, 301, 316, 316, 331, 331, 346, 346,
- 361, 361, 376, 376, 391, 391, 406, 406, 421, 421, 436, 436, 451, 451, 466,
- 466, 481, 481, 496, 287, 302, 302, 317, 317, 332, 332, 347, 347, 362, 362,
- 377, 377, 392, 392, 407, 407, 422, 422, 437, 437, 452, 452, 467, 467, 482,
- 482, 497, 303, 318, 318, 333, 333, 348, 348, 363, 363, 378, 378, 393, 393,
- 408, 408, 423, 423, 438, 438, 453, 453, 468, 468, 483, 483, 498, 319, 334,
- 334, 349, 349, 364, 364, 379, 379, 394, 394, 409, 409, 424, 424, 439, 439,
- 454, 454, 469, 469, 484, 484, 499, 335, 350, 350, 365, 365, 380, 380, 395,
- 395, 410, 410, 425, 425, 440, 440, 455, 455, 470, 470, 485, 485, 500, 351,
- 366, 366, 381, 381, 396, 396, 411, 411, 426, 426, 441, 441, 456, 456, 471,
- 471, 486, 486, 501, 367, 382, 382, 397, 397, 412, 412, 427, 427, 442, 442,
- 457, 457, 472, 472, 487, 487, 502, 383, 398, 398, 413, 413, 428, 428, 443,
- 443, 458, 458, 473, 473, 488, 488, 503, 399, 414, 414, 429, 429, 444, 444,
- 459, 459, 474, 474, 489, 489, 504, 415, 430, 430, 445, 445, 460, 460, 475,
- 475, 490, 490, 505, 431, 446, 446, 461, 461, 476, 476, 491, 491, 506, 447,
- 462, 462, 477, 477, 492, 492, 507, 463, 478, 478, 493, 493, 508, 479, 494,
- 494, 509, 495, 510, 0, 0
-};
-
-DECLARE_ALIGNED(16, static const int16_t,
- default_scan_32x16_neighbors[513 * MAX_NEIGHBORS]) = {
- 0, 0, 0, 0, 0, 0, 1, 1, 1, 32, 32, 32, 2, 2, 2,
- 33, 33, 64, 64, 64, 3, 3, 3, 34, 34, 65, 65, 96, 96, 96,
- 4, 4, 4, 35, 35, 66, 66, 97, 97, 128, 128, 128, 5, 5, 5,
- 36, 36, 67, 67, 98, 98, 129, 129, 160, 160, 160, 6, 6, 6, 37,
- 37, 68, 68, 99, 99, 130, 130, 161, 161, 192, 192, 192, 7, 7, 7,
- 38, 38, 69, 69, 100, 100, 131, 131, 162, 162, 193, 193, 224, 224, 224,
- 8, 8, 8, 39, 39, 70, 70, 101, 101, 132, 132, 163, 163, 194, 194,
- 225, 225, 256, 256, 256, 9, 9, 9, 40, 40, 71, 71, 102, 102, 133,
- 133, 164, 164, 195, 195, 226, 226, 257, 257, 288, 288, 288, 10, 10, 10,
- 41, 41, 72, 72, 103, 103, 134, 134, 165, 165, 196, 196, 227, 227, 258,
- 258, 289, 289, 320, 320, 320, 11, 11, 11, 42, 42, 73, 73, 104, 104,
- 135, 135, 166, 166, 197, 197, 228, 228, 259, 259, 290, 290, 321, 321, 352,
- 352, 352, 12, 12, 12, 43, 43, 74, 74, 105, 105, 136, 136, 167, 167,
- 198, 198, 229, 229, 260, 260, 291, 291, 322, 322, 353, 353, 384, 384, 384,
- 13, 13, 13, 44, 44, 75, 75, 106, 106, 137, 137, 168, 168, 199, 199,
- 230, 230, 261, 261, 292, 292, 323, 323, 354, 354, 385, 385, 416, 416, 416,
- 14, 14, 14, 45, 45, 76, 76, 107, 107, 138, 138, 169, 169, 200, 200,
- 231, 231, 262, 262, 293, 293, 324, 324, 355, 355, 386, 386, 417, 417, 448,
- 448, 448, 15, 15, 15, 46, 46, 77, 77, 108, 108, 139, 139, 170, 170,
- 201, 201, 232, 232, 263, 263, 294, 294, 325, 325, 356, 356, 387, 387, 418,
- 418, 449, 449, 480, 16, 16, 16, 47, 47, 78, 78, 109, 109, 140, 140,
- 171, 171, 202, 202, 233, 233, 264, 264, 295, 295, 326, 326, 357, 357, 388,
- 388, 419, 419, 450, 450, 481, 17, 17, 17, 48, 48, 79, 79, 110, 110,
- 141, 141, 172, 172, 203, 203, 234, 234, 265, 265, 296, 296, 327, 327, 358,
- 358, 389, 389, 420, 420, 451, 451, 482, 18, 18, 18, 49, 49, 80, 80,
- 111, 111, 142, 142, 173, 173, 204, 204, 235, 235, 266, 266, 297, 297, 328,
- 328, 359, 359, 390, 390, 421, 421, 452, 452, 483, 19, 19, 19, 50, 50,
- 81, 81, 112, 112, 143, 143, 174, 174, 205, 205, 236, 236, 267, 267, 298,
- 298, 329, 329, 360, 360, 391, 391, 422, 422, 453, 453, 484, 20, 20, 20,
- 51, 51, 82, 82, 113, 113, 144, 144, 175, 175, 206, 206, 237, 237, 268,
- 268, 299, 299, 330, 330, 361, 361, 392, 392, 423, 423, 454, 454, 485, 21,
- 21, 21, 52, 52, 83, 83, 114, 114, 145, 145, 176, 176, 207, 207, 238,
- 238, 269, 269, 300, 300, 331, 331, 362, 362, 393, 393, 424, 424, 455, 455,
- 486, 22, 22, 22, 53, 53, 84, 84, 115, 115, 146, 146, 177, 177, 208,
- 208, 239, 239, 270, 270, 301, 301, 332, 332, 363, 363, 394, 394, 425, 425,
- 456, 456, 487, 23, 23, 23, 54, 54, 85, 85, 116, 116, 147, 147, 178,
- 178, 209, 209, 240, 240, 271, 271, 302, 302, 333, 333, 364, 364, 395, 395,
- 426, 426, 457, 457, 488, 24, 24, 24, 55, 55, 86, 86, 117, 117, 148,
- 148, 179, 179, 210, 210, 241, 241, 272, 272, 303, 303, 334, 334, 365, 365,
- 396, 396, 427, 427, 458, 458, 489, 25, 25, 25, 56, 56, 87, 87, 118,
- 118, 149, 149, 180, 180, 211, 211, 242, 242, 273, 273, 304, 304, 335, 335,
- 366, 366, 397, 397, 428, 428, 459, 459, 490, 26, 26, 26, 57, 57, 88,
- 88, 119, 119, 150, 150, 181, 181, 212, 212, 243, 243, 274, 274, 305, 305,
- 336, 336, 367, 367, 398, 398, 429, 429, 460, 460, 491, 27, 27, 27, 58,
- 58, 89, 89, 120, 120, 151, 151, 182, 182, 213, 213, 244, 244, 275, 275,
- 306, 306, 337, 337, 368, 368, 399, 399, 430, 430, 461, 461, 492, 28, 28,
- 28, 59, 59, 90, 90, 121, 121, 152, 152, 183, 183, 214, 214, 245, 245,
- 276, 276, 307, 307, 338, 338, 369, 369, 400, 400, 431, 431, 462, 462, 493,
- 29, 29, 29, 60, 60, 91, 91, 122, 122, 153, 153, 184, 184, 215, 215,
- 246, 246, 277, 277, 308, 308, 339, 339, 370, 370, 401, 401, 432, 432, 463,
- 463, 494, 30, 30, 30, 61, 61, 92, 92, 123, 123, 154, 154, 185, 185,
- 216, 216, 247, 247, 278, 278, 309, 309, 340, 340, 371, 371, 402, 402, 433,
- 433, 464, 464, 495, 31, 62, 62, 93, 93, 124, 124, 155, 155, 186, 186,
- 217, 217, 248, 248, 279, 279, 310, 310, 341, 341, 372, 372, 403, 403, 434,
- 434, 465, 465, 496, 63, 94, 94, 125, 125, 156, 156, 187, 187, 218, 218,
- 249, 249, 280, 280, 311, 311, 342, 342, 373, 373, 404, 404, 435, 435, 466,
- 466, 497, 95, 126, 126, 157, 157, 188, 188, 219, 219, 250, 250, 281, 281,
- 312, 312, 343, 343, 374, 374, 405, 405, 436, 436, 467, 467, 498, 127, 158,
- 158, 189, 189, 220, 220, 251, 251, 282, 282, 313, 313, 344, 344, 375, 375,
- 406, 406, 437, 437, 468, 468, 499, 159, 190, 190, 221, 221, 252, 252, 283,
- 283, 314, 314, 345, 345, 376, 376, 407, 407, 438, 438, 469, 469, 500, 191,
- 222, 222, 253, 253, 284, 284, 315, 315, 346, 346, 377, 377, 408, 408, 439,
- 439, 470, 470, 501, 223, 254, 254, 285, 285, 316, 316, 347, 347, 378, 378,
- 409, 409, 440, 440, 471, 471, 502, 255, 286, 286, 317, 317, 348, 348, 379,
- 379, 410, 410, 441, 441, 472, 472, 503, 287, 318, 318, 349, 349, 380, 380,
- 411, 411, 442, 442, 473, 473, 504, 319, 350, 350, 381, 381, 412, 412, 443,
- 443, 474, 474, 505, 351, 382, 382, 413, 413, 444, 444, 475, 475, 506, 383,
- 414, 414, 445, 445, 476, 476, 507, 415, 446, 446, 477, 477, 508, 447, 478,
- 478, 509, 479, 510, 0, 0
-};
-
-DECLARE_ALIGNED(16, static const int16_t,
- mcol_scan_16x32_neighbors[513 * MAX_NEIGHBORS]) = {
- 0, 0, 0, 0, 16, 16, 32, 32, 48, 48, 64, 64, 80, 80, 96,
- 96, 112, 112, 128, 128, 144, 144, 160, 160, 176, 176, 192, 192, 208, 208,
- 224, 224, 240, 240, 256, 256, 272, 272, 288, 288, 304, 304, 320, 320, 336,
- 336, 352, 352, 368, 368, 384, 384, 400, 400, 416, 416, 432, 432, 448, 448,
- 464, 464, 480, 480, 0, 0, 1, 16, 17, 32, 33, 48, 49, 64, 65,
- 80, 81, 96, 97, 112, 113, 128, 129, 144, 145, 160, 161, 176, 177, 192,
- 193, 208, 209, 224, 225, 240, 241, 256, 257, 272, 273, 288, 289, 304, 305,
- 320, 321, 336, 337, 352, 353, 368, 369, 384, 385, 400, 401, 416, 417, 432,
- 433, 448, 449, 464, 465, 480, 481, 496, 1, 1, 2, 17, 18, 33, 34,
- 49, 50, 65, 66, 81, 82, 97, 98, 113, 114, 129, 130, 145, 146, 161,
- 162, 177, 178, 193, 194, 209, 210, 225, 226, 241, 242, 257, 258, 273, 274,
- 289, 290, 305, 306, 321, 322, 337, 338, 353, 354, 369, 370, 385, 386, 401,
- 402, 417, 418, 433, 434, 449, 450, 465, 466, 481, 482, 497, 2, 2, 3,
- 18, 19, 34, 35, 50, 51, 66, 67, 82, 83, 98, 99, 114, 115, 130,
- 131, 146, 147, 162, 163, 178, 179, 194, 195, 210, 211, 226, 227, 242, 243,
- 258, 259, 274, 275, 290, 291, 306, 307, 322, 323, 338, 339, 354, 355, 370,
- 371, 386, 387, 402, 403, 418, 419, 434, 435, 450, 451, 466, 467, 482, 483,
- 498, 3, 3, 4, 19, 20, 35, 36, 51, 52, 67, 68, 83, 84, 99,
- 100, 115, 116, 131, 132, 147, 148, 163, 164, 179, 180, 195, 196, 211, 212,
- 227, 228, 243, 244, 259, 260, 275, 276, 291, 292, 307, 308, 323, 324, 339,
- 340, 355, 356, 371, 372, 387, 388, 403, 404, 419, 420, 435, 436, 451, 452,
- 467, 468, 483, 484, 499, 4, 4, 5, 20, 21, 36, 37, 52, 53, 68,
- 69, 84, 85, 100, 101, 116, 117, 132, 133, 148, 149, 164, 165, 180, 181,
- 196, 197, 212, 213, 228, 229, 244, 245, 260, 261, 276, 277, 292, 293, 308,
- 309, 324, 325, 340, 341, 356, 357, 372, 373, 388, 389, 404, 405, 420, 421,
- 436, 437, 452, 453, 468, 469, 484, 485, 500, 5, 5, 6, 21, 22, 37,
- 38, 53, 54, 69, 70, 85, 86, 101, 102, 117, 118, 133, 134, 149, 150,
- 165, 166, 181, 182, 197, 198, 213, 214, 229, 230, 245, 246, 261, 262, 277,
- 278, 293, 294, 309, 310, 325, 326, 341, 342, 357, 358, 373, 374, 389, 390,
- 405, 406, 421, 422, 437, 438, 453, 454, 469, 470, 485, 486, 501, 6, 6,
- 7, 22, 23, 38, 39, 54, 55, 70, 71, 86, 87, 102, 103, 118, 119,
- 134, 135, 150, 151, 166, 167, 182, 183, 198, 199, 214, 215, 230, 231, 246,
- 247, 262, 263, 278, 279, 294, 295, 310, 311, 326, 327, 342, 343, 358, 359,
- 374, 375, 390, 391, 406, 407, 422, 423, 438, 439, 454, 455, 470, 471, 486,
- 487, 502, 7, 7, 8, 23, 24, 39, 40, 55, 56, 71, 72, 87, 88,
- 103, 104, 119, 120, 135, 136, 151, 152, 167, 168, 183, 184, 199, 200, 215,
- 216, 231, 232, 247, 248, 263, 264, 279, 280, 295, 296, 311, 312, 327, 328,
- 343, 344, 359, 360, 375, 376, 391, 392, 407, 408, 423, 424, 439, 440, 455,
- 456, 471, 472, 487, 488, 503, 8, 8, 9, 24, 25, 40, 41, 56, 57,
- 72, 73, 88, 89, 104, 105, 120, 121, 136, 137, 152, 153, 168, 169, 184,
- 185, 200, 201, 216, 217, 232, 233, 248, 249, 264, 265, 280, 281, 296, 297,
- 312, 313, 328, 329, 344, 345, 360, 361, 376, 377, 392, 393, 408, 409, 424,
- 425, 440, 441, 456, 457, 472, 473, 488, 489, 504, 9, 9, 10, 25, 26,
- 41, 42, 57, 58, 73, 74, 89, 90, 105, 106, 121, 122, 137, 138, 153,
- 154, 169, 170, 185, 186, 201, 202, 217, 218, 233, 234, 249, 250, 265, 266,
- 281, 282, 297, 298, 313, 314, 329, 330, 345, 346, 361, 362, 377, 378, 393,
- 394, 409, 410, 425, 426, 441, 442, 457, 458, 473, 474, 489, 490, 505, 10,
- 10, 11, 26, 27, 42, 43, 58, 59, 74, 75, 90, 91, 106, 107, 122,
- 123, 138, 139, 154, 155, 170, 171, 186, 187, 202, 203, 218, 219, 234, 235,
- 250, 251, 266, 267, 282, 283, 298, 299, 314, 315, 330, 331, 346, 347, 362,
- 363, 378, 379, 394, 395, 410, 411, 426, 427, 442, 443, 458, 459, 474, 475,
- 490, 491, 506, 11, 11, 12, 27, 28, 43, 44, 59, 60, 75, 76, 91,
- 92, 107, 108, 123, 124, 139, 140, 155, 156, 171, 172, 187, 188, 203, 204,
- 219, 220, 235, 236, 251, 252, 267, 268, 283, 284, 299, 300, 315, 316, 331,
- 332, 347, 348, 363, 364, 379, 380, 395, 396, 411, 412, 427, 428, 443, 444,
- 459, 460, 475, 476, 491, 492, 507, 12, 12, 13, 28, 29, 44, 45, 60,
- 61, 76, 77, 92, 93, 108, 109, 124, 125, 140, 141, 156, 157, 172, 173,
- 188, 189, 204, 205, 220, 221, 236, 237, 252, 253, 268, 269, 284, 285, 300,
- 301, 316, 317, 332, 333, 348, 349, 364, 365, 380, 381, 396, 397, 412, 413,
- 428, 429, 444, 445, 460, 461, 476, 477, 492, 493, 508, 13, 13, 14, 29,
- 30, 45, 46, 61, 62, 77, 78, 93, 94, 109, 110, 125, 126, 141, 142,
- 157, 158, 173, 174, 189, 190, 205, 206, 221, 222, 237, 238, 253, 254, 269,
- 270, 285, 286, 301, 302, 317, 318, 333, 334, 349, 350, 365, 366, 381, 382,
- 397, 398, 413, 414, 429, 430, 445, 446, 461, 462, 477, 478, 493, 494, 509,
- 14, 14, 15, 30, 31, 46, 47, 62, 63, 78, 79, 94, 95, 110, 111,
- 126, 127, 142, 143, 158, 159, 174, 175, 190, 191, 206, 207, 222, 223, 238,
- 239, 254, 255, 270, 271, 286, 287, 302, 303, 318, 319, 334, 335, 350, 351,
- 366, 367, 382, 383, 398, 399, 414, 415, 430, 431, 446, 447, 462, 463, 478,
- 479, 494, 495, 510, 0, 0
-};
-
-DECLARE_ALIGNED(16, static const int16_t,
- mcol_scan_32x16_neighbors[513 * MAX_NEIGHBORS]) = {
- 0, 0, 0, 0, 32, 32, 64, 64, 96, 96, 128, 128, 160, 160, 192,
- 192, 224, 224, 256, 256, 288, 288, 320, 320, 352, 352, 384, 384, 416, 416,
- 448, 448, 0, 0, 1, 32, 33, 64, 65, 96, 97, 128, 129, 160, 161,
- 192, 193, 224, 225, 256, 257, 288, 289, 320, 321, 352, 353, 384, 385, 416,
- 417, 448, 449, 480, 1, 1, 2, 33, 34, 65, 66, 97, 98, 129, 130,
- 161, 162, 193, 194, 225, 226, 257, 258, 289, 290, 321, 322, 353, 354, 385,
- 386, 417, 418, 449, 450, 481, 2, 2, 3, 34, 35, 66, 67, 98, 99,
- 130, 131, 162, 163, 194, 195, 226, 227, 258, 259, 290, 291, 322, 323, 354,
- 355, 386, 387, 418, 419, 450, 451, 482, 3, 3, 4, 35, 36, 67, 68,
- 99, 100, 131, 132, 163, 164, 195, 196, 227, 228, 259, 260, 291, 292, 323,
- 324, 355, 356, 387, 388, 419, 420, 451, 452, 483, 4, 4, 5, 36, 37,
- 68, 69, 100, 101, 132, 133, 164, 165, 196, 197, 228, 229, 260, 261, 292,
- 293, 324, 325, 356, 357, 388, 389, 420, 421, 452, 453, 484, 5, 5, 6,
- 37, 38, 69, 70, 101, 102, 133, 134, 165, 166, 197, 198, 229, 230, 261,
- 262, 293, 294, 325, 326, 357, 358, 389, 390, 421, 422, 453, 454, 485, 6,
- 6, 7, 38, 39, 70, 71, 102, 103, 134, 135, 166, 167, 198, 199, 230,
- 231, 262, 263, 294, 295, 326, 327, 358, 359, 390, 391, 422, 423, 454, 455,
- 486, 7, 7, 8, 39, 40, 71, 72, 103, 104, 135, 136, 167, 168, 199,
- 200, 231, 232, 263, 264, 295, 296, 327, 328, 359, 360, 391, 392, 423, 424,
- 455, 456, 487, 8, 8, 9, 40, 41, 72, 73, 104, 105, 136, 137, 168,
- 169, 200, 201, 232, 233, 264, 265, 296, 297, 328, 329, 360, 361, 392, 393,
- 424, 425, 456, 457, 488, 9, 9, 10, 41, 42, 73, 74, 105, 106, 137,
- 138, 169, 170, 201, 202, 233, 234, 265, 266, 297, 298, 329, 330, 361, 362,
- 393, 394, 425, 426, 457, 458, 489, 10, 10, 11, 42, 43, 74, 75, 106,
- 107, 138, 139, 170, 171, 202, 203, 234, 235, 266, 267, 298, 299, 330, 331,
- 362, 363, 394, 395, 426, 427, 458, 459, 490, 11, 11, 12, 43, 44, 75,
- 76, 107, 108, 139, 140, 171, 172, 203, 204, 235, 236, 267, 268, 299, 300,
- 331, 332, 363, 364, 395, 396, 427, 428, 459, 460, 491, 12, 12, 13, 44,
- 45, 76, 77, 108, 109, 140, 141, 172, 173, 204, 205, 236, 237, 268, 269,
- 300, 301, 332, 333, 364, 365, 396, 397, 428, 429, 460, 461, 492, 13, 13,
- 14, 45, 46, 77, 78, 109, 110, 141, 142, 173, 174, 205, 206, 237, 238,
- 269, 270, 301, 302, 333, 334, 365, 366, 397, 398, 429, 430, 461, 462, 493,
- 14, 14, 15, 46, 47, 78, 79, 110, 111, 142, 143, 174, 175, 206, 207,
- 238, 239, 270, 271, 302, 303, 334, 335, 366, 367, 398, 399, 430, 431, 462,
- 463, 494, 15, 15, 16, 47, 48, 79, 80, 111, 112, 143, 144, 175, 176,
- 207, 208, 239, 240, 271, 272, 303, 304, 335, 336, 367, 368, 399, 400, 431,
- 432, 463, 464, 495, 16, 16, 17, 48, 49, 80, 81, 112, 113, 144, 145,
- 176, 177, 208, 209, 240, 241, 272, 273, 304, 305, 336, 337, 368, 369, 400,
- 401, 432, 433, 464, 465, 496, 17, 17, 18, 49, 50, 81, 82, 113, 114,
- 145, 146, 177, 178, 209, 210, 241, 242, 273, 274, 305, 306, 337, 338, 369,
- 370, 401, 402, 433, 434, 465, 466, 497, 18, 18, 19, 50, 51, 82, 83,
- 114, 115, 146, 147, 178, 179, 210, 211, 242, 243, 274, 275, 306, 307, 338,
- 339, 370, 371, 402, 403, 434, 435, 466, 467, 498, 19, 19, 20, 51, 52,
- 83, 84, 115, 116, 147, 148, 179, 180, 211, 212, 243, 244, 275, 276, 307,
- 308, 339, 340, 371, 372, 403, 404, 435, 436, 467, 468, 499, 20, 20, 21,
- 52, 53, 84, 85, 116, 117, 148, 149, 180, 181, 212, 213, 244, 245, 276,
- 277, 308, 309, 340, 341, 372, 373, 404, 405, 436, 437, 468, 469, 500, 21,
- 21, 22, 53, 54, 85, 86, 117, 118, 149, 150, 181, 182, 213, 214, 245,
- 246, 277, 278, 309, 310, 341, 342, 373, 374, 405, 406, 437, 438, 469, 470,
- 501, 22, 22, 23, 54, 55, 86, 87, 118, 119, 150, 151, 182, 183, 214,
- 215, 246, 247, 278, 279, 310, 311, 342, 343, 374, 375, 406, 407, 438, 439,
- 470, 471, 502, 23, 23, 24, 55, 56, 87, 88, 119, 120, 151, 152, 183,
- 184, 215, 216, 247, 248, 279, 280, 311, 312, 343, 344, 375, 376, 407, 408,
- 439, 440, 471, 472, 503, 24, 24, 25, 56, 57, 88, 89, 120, 121, 152,
- 153, 184, 185, 216, 217, 248, 249, 280, 281, 312, 313, 344, 345, 376, 377,
- 408, 409, 440, 441, 472, 473, 504, 25, 25, 26, 57, 58, 89, 90, 121,
- 122, 153, 154, 185, 186, 217, 218, 249, 250, 281, 282, 313, 314, 345, 346,
- 377, 378, 409, 410, 441, 442, 473, 474, 505, 26, 26, 27, 58, 59, 90,
- 91, 122, 123, 154, 155, 186, 187, 218, 219, 250, 251, 282, 283, 314, 315,
- 346, 347, 378, 379, 410, 411, 442, 443, 474, 475, 506, 27, 27, 28, 59,
- 60, 91, 92, 123, 124, 155, 156, 187, 188, 219, 220, 251, 252, 283, 284,
- 315, 316, 347, 348, 379, 380, 411, 412, 443, 444, 475, 476, 507, 28, 28,
- 29, 60, 61, 92, 93, 124, 125, 156, 157, 188, 189, 220, 221, 252, 253,
- 284, 285, 316, 317, 348, 349, 380, 381, 412, 413, 444, 445, 476, 477, 508,
- 29, 29, 30, 61, 62, 93, 94, 125, 126, 157, 158, 189, 190, 221, 222,
- 253, 254, 285, 286, 317, 318, 349, 350, 381, 382, 413, 414, 445, 446, 477,
- 478, 509, 30, 30, 31, 62, 63, 94, 95, 126, 127, 158, 159, 190, 191,
- 222, 223, 254, 255, 286, 287, 318, 319, 350, 351, 382, 383, 414, 415, 446,
- 447, 478, 479, 510, 0, 0
-};
-
-DECLARE_ALIGNED(16, static const int16_t,
- mrow_scan_16x32_neighbors[513 * MAX_NEIGHBORS]) = {
- 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6,
- 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13,
- 14, 14, 0, 0, 1, 16, 2, 17, 3, 18, 4, 19, 5, 20, 6,
- 21, 7, 22, 8, 23, 9, 24, 10, 25, 11, 26, 12, 27, 13, 28,
- 14, 29, 15, 30, 16, 16, 17, 32, 18, 33, 19, 34, 20, 35, 21,
- 36, 22, 37, 23, 38, 24, 39, 25, 40, 26, 41, 27, 42, 28, 43,
- 29, 44, 30, 45, 31, 46, 32, 32, 33, 48, 34, 49, 35, 50, 36,
- 51, 37, 52, 38, 53, 39, 54, 40, 55, 41, 56, 42, 57, 43, 58,
- 44, 59, 45, 60, 46, 61, 47, 62, 48, 48, 49, 64, 50, 65, 51,
- 66, 52, 67, 53, 68, 54, 69, 55, 70, 56, 71, 57, 72, 58, 73,
- 59, 74, 60, 75, 61, 76, 62, 77, 63, 78, 64, 64, 65, 80, 66,
- 81, 67, 82, 68, 83, 69, 84, 70, 85, 71, 86, 72, 87, 73, 88,
- 74, 89, 75, 90, 76, 91, 77, 92, 78, 93, 79, 94, 80, 80, 81,
- 96, 82, 97, 83, 98, 84, 99, 85, 100, 86, 101, 87, 102, 88, 103,
- 89, 104, 90, 105, 91, 106, 92, 107, 93, 108, 94, 109, 95, 110, 96,
- 96, 97, 112, 98, 113, 99, 114, 100, 115, 101, 116, 102, 117, 103, 118,
- 104, 119, 105, 120, 106, 121, 107, 122, 108, 123, 109, 124, 110, 125, 111,
- 126, 112, 112, 113, 128, 114, 129, 115, 130, 116, 131, 117, 132, 118, 133,
- 119, 134, 120, 135, 121, 136, 122, 137, 123, 138, 124, 139, 125, 140, 126,
- 141, 127, 142, 128, 128, 129, 144, 130, 145, 131, 146, 132, 147, 133, 148,
- 134, 149, 135, 150, 136, 151, 137, 152, 138, 153, 139, 154, 140, 155, 141,
- 156, 142, 157, 143, 158, 144, 144, 145, 160, 146, 161, 147, 162, 148, 163,
- 149, 164, 150, 165, 151, 166, 152, 167, 153, 168, 154, 169, 155, 170, 156,
- 171, 157, 172, 158, 173, 159, 174, 160, 160, 161, 176, 162, 177, 163, 178,
- 164, 179, 165, 180, 166, 181, 167, 182, 168, 183, 169, 184, 170, 185, 171,
- 186, 172, 187, 173, 188, 174, 189, 175, 190, 176, 176, 177, 192, 178, 193,
- 179, 194, 180, 195, 181, 196, 182, 197, 183, 198, 184, 199, 185, 200, 186,
- 201, 187, 202, 188, 203, 189, 204, 190, 205, 191, 206, 192, 192, 193, 208,
- 194, 209, 195, 210, 196, 211, 197, 212, 198, 213, 199, 214, 200, 215, 201,
- 216, 202, 217, 203, 218, 204, 219, 205, 220, 206, 221, 207, 222, 208, 208,
- 209, 224, 210, 225, 211, 226, 212, 227, 213, 228, 214, 229, 215, 230, 216,
- 231, 217, 232, 218, 233, 219, 234, 220, 235, 221, 236, 222, 237, 223, 238,
- 224, 224, 225, 240, 226, 241, 227, 242, 228, 243, 229, 244, 230, 245, 231,
- 246, 232, 247, 233, 248, 234, 249, 235, 250, 236, 251, 237, 252, 238, 253,
- 239, 254, 240, 240, 241, 256, 242, 257, 243, 258, 244, 259, 245, 260, 246,
- 261, 247, 262, 248, 263, 249, 264, 250, 265, 251, 266, 252, 267, 253, 268,
- 254, 269, 255, 270, 256, 256, 257, 272, 258, 273, 259, 274, 260, 275, 261,
- 276, 262, 277, 263, 278, 264, 279, 265, 280, 266, 281, 267, 282, 268, 283,
- 269, 284, 270, 285, 271, 286, 272, 272, 273, 288, 274, 289, 275, 290, 276,
- 291, 277, 292, 278, 293, 279, 294, 280, 295, 281, 296, 282, 297, 283, 298,
- 284, 299, 285, 300, 286, 301, 287, 302, 288, 288, 289, 304, 290, 305, 291,
- 306, 292, 307, 293, 308, 294, 309, 295, 310, 296, 311, 297, 312, 298, 313,
- 299, 314, 300, 315, 301, 316, 302, 317, 303, 318, 304, 304, 305, 320, 306,
- 321, 307, 322, 308, 323, 309, 324, 310, 325, 311, 326, 312, 327, 313, 328,
- 314, 329, 315, 330, 316, 331, 317, 332, 318, 333, 319, 334, 320, 320, 321,
- 336, 322, 337, 323, 338, 324, 339, 325, 340, 326, 341, 327, 342, 328, 343,
- 329, 344, 330, 345, 331, 346, 332, 347, 333, 348, 334, 349, 335, 350, 336,
- 336, 337, 352, 338, 353, 339, 354, 340, 355, 341, 356, 342, 357, 343, 358,
- 344, 359, 345, 360, 346, 361, 347, 362, 348, 363, 349, 364, 350, 365, 351,
- 366, 352, 352, 353, 368, 354, 369, 355, 370, 356, 371, 357, 372, 358, 373,
- 359, 374, 360, 375, 361, 376, 362, 377, 363, 378, 364, 379, 365, 380, 366,
- 381, 367, 382, 368, 368, 369, 384, 370, 385, 371, 386, 372, 387, 373, 388,
- 374, 389, 375, 390, 376, 391, 377, 392, 378, 393, 379, 394, 380, 395, 381,
- 396, 382, 397, 383, 398, 384, 384, 385, 400, 386, 401, 387, 402, 388, 403,
- 389, 404, 390, 405, 391, 406, 392, 407, 393, 408, 394, 409, 395, 410, 396,
- 411, 397, 412, 398, 413, 399, 414, 400, 400, 401, 416, 402, 417, 403, 418,
- 404, 419, 405, 420, 406, 421, 407, 422, 408, 423, 409, 424, 410, 425, 411,
- 426, 412, 427, 413, 428, 414, 429, 415, 430, 416, 416, 417, 432, 418, 433,
- 419, 434, 420, 435, 421, 436, 422, 437, 423, 438, 424, 439, 425, 440, 426,
- 441, 427, 442, 428, 443, 429, 444, 430, 445, 431, 446, 432, 432, 433, 448,
- 434, 449, 435, 450, 436, 451, 437, 452, 438, 453, 439, 454, 440, 455, 441,
- 456, 442, 457, 443, 458, 444, 459, 445, 460, 446, 461, 447, 462, 448, 448,
- 449, 464, 450, 465, 451, 466, 452, 467, 453, 468, 454, 469, 455, 470, 456,
- 471, 457, 472, 458, 473, 459, 474, 460, 475, 461, 476, 462, 477, 463, 478,
- 464, 464, 465, 480, 466, 481, 467, 482, 468, 483, 469, 484, 470, 485, 471,
- 486, 472, 487, 473, 488, 474, 489, 475, 490, 476, 491, 477, 492, 478, 493,
- 479, 494, 480, 480, 481, 496, 482, 497, 483, 498, 484, 499, 485, 500, 486,
- 501, 487, 502, 488, 503, 489, 504, 490, 505, 491, 506, 492, 507, 493, 508,
- 494, 509, 495, 510, 0, 0
-};
-
-DECLARE_ALIGNED(16, static const int16_t,
- mrow_scan_32x16_neighbors[513 * MAX_NEIGHBORS]) = {
- 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6,
- 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13,
- 14, 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 19, 20, 20, 21,
- 21, 22, 22, 23, 23, 24, 24, 25, 25, 26, 26, 27, 27, 28, 28,
- 29, 29, 30, 30, 0, 0, 1, 32, 2, 33, 3, 34, 4, 35, 5,
- 36, 6, 37, 7, 38, 8, 39, 9, 40, 10, 41, 11, 42, 12, 43,
- 13, 44, 14, 45, 15, 46, 16, 47, 17, 48, 18, 49, 19, 50, 20,
- 51, 21, 52, 22, 53, 23, 54, 24, 55, 25, 56, 26, 57, 27, 58,
- 28, 59, 29, 60, 30, 61, 31, 62, 32, 32, 33, 64, 34, 65, 35,
- 66, 36, 67, 37, 68, 38, 69, 39, 70, 40, 71, 41, 72, 42, 73,
- 43, 74, 44, 75, 45, 76, 46, 77, 47, 78, 48, 79, 49, 80, 50,
- 81, 51, 82, 52, 83, 53, 84, 54, 85, 55, 86, 56, 87, 57, 88,
- 58, 89, 59, 90, 60, 91, 61, 92, 62, 93, 63, 94, 64, 64, 65,
- 96, 66, 97, 67, 98, 68, 99, 69, 100, 70, 101, 71, 102, 72, 103,
- 73, 104, 74, 105, 75, 106, 76, 107, 77, 108, 78, 109, 79, 110, 80,
- 111, 81, 112, 82, 113, 83, 114, 84, 115, 85, 116, 86, 117, 87, 118,
- 88, 119, 89, 120, 90, 121, 91, 122, 92, 123, 93, 124, 94, 125, 95,
- 126, 96, 96, 97, 128, 98, 129, 99, 130, 100, 131, 101, 132, 102, 133,
- 103, 134, 104, 135, 105, 136, 106, 137, 107, 138, 108, 139, 109, 140, 110,
- 141, 111, 142, 112, 143, 113, 144, 114, 145, 115, 146, 116, 147, 117, 148,
- 118, 149, 119, 150, 120, 151, 121, 152, 122, 153, 123, 154, 124, 155, 125,
- 156, 126, 157, 127, 158, 128, 128, 129, 160, 130, 161, 131, 162, 132, 163,
- 133, 164, 134, 165, 135, 166, 136, 167, 137, 168, 138, 169, 139, 170, 140,
- 171, 141, 172, 142, 173, 143, 174, 144, 175, 145, 176, 146, 177, 147, 178,
- 148, 179, 149, 180, 150, 181, 151, 182, 152, 183, 153, 184, 154, 185, 155,
- 186, 156, 187, 157, 188, 158, 189, 159, 190, 160, 160, 161, 192, 162, 193,
- 163, 194, 164, 195, 165, 196, 166, 197, 167, 198, 168, 199, 169, 200, 170,
- 201, 171, 202, 172, 203, 173, 204, 174, 205, 175, 206, 176, 207, 177, 208,
- 178, 209, 179, 210, 180, 211, 181, 212, 182, 213, 183, 214, 184, 215, 185,
- 216, 186, 217, 187, 218, 188, 219, 189, 220, 190, 221, 191, 222, 192, 192,
- 193, 224, 194, 225, 195, 226, 196, 227, 197, 228, 198, 229, 199, 230, 200,
- 231, 201, 232, 202, 233, 203, 234, 204, 235, 205, 236, 206, 237, 207, 238,
- 208, 239, 209, 240, 210, 241, 211, 242, 212, 243, 213, 244, 214, 245, 215,
- 246, 216, 247, 217, 248, 218, 249, 219, 250, 220, 251, 221, 252, 222, 253,
- 223, 254, 224, 224, 225, 256, 226, 257, 227, 258, 228, 259, 229, 260, 230,
- 261, 231, 262, 232, 263, 233, 264, 234, 265, 235, 266, 236, 267, 237, 268,
- 238, 269, 239, 270, 240, 271, 241, 272, 242, 273, 243, 274, 244, 275, 245,
- 276, 246, 277, 247, 278, 248, 279, 249, 280, 250, 281, 251, 282, 252, 283,
- 253, 284, 254, 285, 255, 286, 256, 256, 257, 288, 258, 289, 259, 290, 260,
- 291, 261, 292, 262, 293, 263, 294, 264, 295, 265, 296, 266, 297, 267, 298,
- 268, 299, 269, 300, 270, 301, 271, 302, 272, 303, 273, 304, 274, 305, 275,
- 306, 276, 307, 277, 308, 278, 309, 279, 310, 280, 311, 281, 312, 282, 313,
- 283, 314, 284, 315, 285, 316, 286, 317, 287, 318, 288, 288, 289, 320, 290,
- 321, 291, 322, 292, 323, 293, 324, 294, 325, 295, 326, 296, 327, 297, 328,
- 298, 329, 299, 330, 300, 331, 301, 332, 302, 333, 303, 334, 304, 335, 305,
- 336, 306, 337, 307, 338, 308, 339, 309, 340, 310, 341, 311, 342, 312, 343,
- 313, 344, 314, 345, 315, 346, 316, 347, 317, 348, 318, 349, 319, 350, 320,
- 320, 321, 352, 322, 353, 323, 354, 324, 355, 325, 356, 326, 357, 327, 358,
- 328, 359, 329, 360, 330, 361, 331, 362, 332, 363, 333, 364, 334, 365, 335,
- 366, 336, 367, 337, 368, 338, 369, 339, 370, 340, 371, 341, 372, 342, 373,
- 343, 374, 344, 375, 345, 376, 346, 377, 347, 378, 348, 379, 349, 380, 350,
- 381, 351, 382, 352, 352, 353, 384, 354, 385, 355, 386, 356, 387, 357, 388,
- 358, 389, 359, 390, 360, 391, 361, 392, 362, 393, 363, 394, 364, 395, 365,
- 396, 366, 397, 367, 398, 368, 399, 369, 400, 370, 401, 371, 402, 372, 403,
- 373, 404, 374, 405, 375, 406, 376, 407, 377, 408, 378, 409, 379, 410, 380,
- 411, 381, 412, 382, 413, 383, 414, 384, 384, 385, 416, 386, 417, 387, 418,
- 388, 419, 389, 420, 390, 421, 391, 422, 392, 423, 393, 424, 394, 425, 395,
- 426, 396, 427, 397, 428, 398, 429, 399, 430, 400, 431, 401, 432, 402, 433,
- 403, 434, 404, 435, 405, 436, 406, 437, 407, 438, 408, 439, 409, 440, 410,
- 441, 411, 442, 412, 443, 413, 444, 414, 445, 415, 446, 416, 416, 417, 448,
- 418, 449, 419, 450, 420, 451, 421, 452, 422, 453, 423, 454, 424, 455, 425,
- 456, 426, 457, 427, 458, 428, 459, 429, 460, 430, 461, 431, 462, 432, 463,
- 433, 464, 434, 465, 435, 466, 436, 467, 437, 468, 438, 469, 439, 470, 440,
- 471, 441, 472, 442, 473, 443, 474, 444, 475, 445, 476, 446, 477, 447, 478,
- 448, 448, 449, 480, 450, 481, 451, 482, 452, 483, 453, 484, 454, 485, 455,
- 486, 456, 487, 457, 488, 458, 489, 459, 490, 460, 491, 461, 492, 462, 493,
- 463, 494, 464, 495, 465, 496, 466, 497, 467, 498, 468, 499, 469, 500, 470,
- 501, 471, 502, 472, 503, 473, 504, 474, 505, 475, 506, 476, 507, 477, 508,
- 478, 509, 479, 510, 0, 0
-};
-
-DECLARE_ALIGNED(16, static const int16_t,
- mcol_scan_16x16_neighbors[257 * MAX_NEIGHBORS]) = {
- 0, 0, 0, 0, 16, 16, 32, 32, 48, 48, 64, 64, 80, 80, 96,
- 96, 112, 112, 128, 128, 144, 144, 160, 160, 176, 176, 192, 192, 208, 208,
- 224, 224, 0, 0, 1, 16, 17, 32, 33, 48, 49, 64, 65, 80, 81,
- 96, 97, 112, 113, 128, 129, 144, 145, 160, 161, 176, 177, 192, 193, 208,
- 209, 224, 225, 240, 1, 1, 2, 17, 18, 33, 34, 49, 50, 65, 66,
- 81, 82, 97, 98, 113, 114, 129, 130, 145, 146, 161, 162, 177, 178, 193,
- 194, 209, 210, 225, 226, 241, 2, 2, 3, 18, 19, 34, 35, 50, 51,
- 66, 67, 82, 83, 98, 99, 114, 115, 130, 131, 146, 147, 162, 163, 178,
- 179, 194, 195, 210, 211, 226, 227, 242, 3, 3, 4, 19, 20, 35, 36,
- 51, 52, 67, 68, 83, 84, 99, 100, 115, 116, 131, 132, 147, 148, 163,
- 164, 179, 180, 195, 196, 211, 212, 227, 228, 243, 4, 4, 5, 20, 21,
- 36, 37, 52, 53, 68, 69, 84, 85, 100, 101, 116, 117, 132, 133, 148,
- 149, 164, 165, 180, 181, 196, 197, 212, 213, 228, 229, 244, 5, 5, 6,
- 21, 22, 37, 38, 53, 54, 69, 70, 85, 86, 101, 102, 117, 118, 133,
- 134, 149, 150, 165, 166, 181, 182, 197, 198, 213, 214, 229, 230, 245, 6,
- 6, 7, 22, 23, 38, 39, 54, 55, 70, 71, 86, 87, 102, 103, 118,
- 119, 134, 135, 150, 151, 166, 167, 182, 183, 198, 199, 214, 215, 230, 231,
- 246, 7, 7, 8, 23, 24, 39, 40, 55, 56, 71, 72, 87, 88, 103,
- 104, 119, 120, 135, 136, 151, 152, 167, 168, 183, 184, 199, 200, 215, 216,
- 231, 232, 247, 8, 8, 9, 24, 25, 40, 41, 56, 57, 72, 73, 88,
- 89, 104, 105, 120, 121, 136, 137, 152, 153, 168, 169, 184, 185, 200, 201,
- 216, 217, 232, 233, 248, 9, 9, 10, 25, 26, 41, 42, 57, 58, 73,
- 74, 89, 90, 105, 106, 121, 122, 137, 138, 153, 154, 169, 170, 185, 186,
- 201, 202, 217, 218, 233, 234, 249, 10, 10, 11, 26, 27, 42, 43, 58,
- 59, 74, 75, 90, 91, 106, 107, 122, 123, 138, 139, 154, 155, 170, 171,
- 186, 187, 202, 203, 218, 219, 234, 235, 250, 11, 11, 12, 27, 28, 43,
- 44, 59, 60, 75, 76, 91, 92, 107, 108, 123, 124, 139, 140, 155, 156,
- 171, 172, 187, 188, 203, 204, 219, 220, 235, 236, 251, 12, 12, 13, 28,
- 29, 44, 45, 60, 61, 76, 77, 92, 93, 108, 109, 124, 125, 140, 141,
- 156, 157, 172, 173, 188, 189, 204, 205, 220, 221, 236, 237, 252, 13, 13,
- 14, 29, 30, 45, 46, 61, 62, 77, 78, 93, 94, 109, 110, 125, 126,
- 141, 142, 157, 158, 173, 174, 189, 190, 205, 206, 221, 222, 237, 238, 253,
- 14, 14, 15, 30, 31, 46, 47, 62, 63, 78, 79, 94, 95, 110, 111,
- 126, 127, 142, 143, 158, 159, 174, 175, 190, 191, 206, 207, 222, 223, 238,
- 239, 254, 0, 0,
-};
-
-DECLARE_ALIGNED(16, static const int16_t,
- mrow_scan_16x16_neighbors[257 * MAX_NEIGHBORS]) = {
- 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6,
- 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13,
- 14, 14, 0, 0, 1, 16, 2, 17, 3, 18, 4, 19, 5, 20, 6,
- 21, 7, 22, 8, 23, 9, 24, 10, 25, 11, 26, 12, 27, 13, 28,
- 14, 29, 15, 30, 16, 16, 17, 32, 18, 33, 19, 34, 20, 35, 21,
- 36, 22, 37, 23, 38, 24, 39, 25, 40, 26, 41, 27, 42, 28, 43,
- 29, 44, 30, 45, 31, 46, 32, 32, 33, 48, 34, 49, 35, 50, 36,
- 51, 37, 52, 38, 53, 39, 54, 40, 55, 41, 56, 42, 57, 43, 58,
- 44, 59, 45, 60, 46, 61, 47, 62, 48, 48, 49, 64, 50, 65, 51,
- 66, 52, 67, 53, 68, 54, 69, 55, 70, 56, 71, 57, 72, 58, 73,
- 59, 74, 60, 75, 61, 76, 62, 77, 63, 78, 64, 64, 65, 80, 66,
- 81, 67, 82, 68, 83, 69, 84, 70, 85, 71, 86, 72, 87, 73, 88,
- 74, 89, 75, 90, 76, 91, 77, 92, 78, 93, 79, 94, 80, 80, 81,
- 96, 82, 97, 83, 98, 84, 99, 85, 100, 86, 101, 87, 102, 88, 103,
- 89, 104, 90, 105, 91, 106, 92, 107, 93, 108, 94, 109, 95, 110, 96,
- 96, 97, 112, 98, 113, 99, 114, 100, 115, 101, 116, 102, 117, 103, 118,
- 104, 119, 105, 120, 106, 121, 107, 122, 108, 123, 109, 124, 110, 125, 111,
- 126, 112, 112, 113, 128, 114, 129, 115, 130, 116, 131, 117, 132, 118, 133,
- 119, 134, 120, 135, 121, 136, 122, 137, 123, 138, 124, 139, 125, 140, 126,
- 141, 127, 142, 128, 128, 129, 144, 130, 145, 131, 146, 132, 147, 133, 148,
- 134, 149, 135, 150, 136, 151, 137, 152, 138, 153, 139, 154, 140, 155, 141,
- 156, 142, 157, 143, 158, 144, 144, 145, 160, 146, 161, 147, 162, 148, 163,
- 149, 164, 150, 165, 151, 166, 152, 167, 153, 168, 154, 169, 155, 170, 156,
- 171, 157, 172, 158, 173, 159, 174, 160, 160, 161, 176, 162, 177, 163, 178,
- 164, 179, 165, 180, 166, 181, 167, 182, 168, 183, 169, 184, 170, 185, 171,
- 186, 172, 187, 173, 188, 174, 189, 175, 190, 176, 176, 177, 192, 178, 193,
- 179, 194, 180, 195, 181, 196, 182, 197, 183, 198, 184, 199, 185, 200, 186,
- 201, 187, 202, 188, 203, 189, 204, 190, 205, 191, 206, 192, 192, 193, 208,
- 194, 209, 195, 210, 196, 211, 197, 212, 198, 213, 199, 214, 200, 215, 201,
- 216, 202, 217, 203, 218, 204, 219, 205, 220, 206, 221, 207, 222, 208, 208,
- 209, 224, 210, 225, 211, 226, 212, 227, 213, 228, 214, 229, 215, 230, 216,
- 231, 217, 232, 218, 233, 219, 234, 220, 235, 221, 236, 222, 237, 223, 238,
- 224, 224, 225, 240, 226, 241, 227, 242, 228, 243, 229, 244, 230, 245, 231,
- 246, 232, 247, 233, 248, 234, 249, 235, 250, 236, 251, 237, 252, 238, 253,
- 239, 254, 0, 0,
-};
-
-DECLARE_ALIGNED(16, static const int16_t,
- default_scan_16x16_neighbors[257 * MAX_NEIGHBORS]) = {
- 0, 0, 0, 0, 0, 0, 16, 16, 1, 16, 1, 1, 2, 2, 2,
- 17, 17, 32, 32, 32, 48, 48, 33, 48, 18, 33, 3, 18, 3, 3,
- 4, 4, 4, 19, 19, 34, 34, 49, 49, 64, 64, 64, 80, 80, 65,
- 80, 50, 65, 35, 50, 20, 35, 5, 20, 5, 5, 6, 6, 6, 21,
- 21, 36, 36, 51, 51, 66, 66, 81, 81, 96, 96, 96, 112, 112, 97,
- 112, 82, 97, 67, 82, 52, 67, 37, 52, 22, 37, 7, 22, 7, 7,
- 8, 8, 8, 23, 23, 38, 38, 53, 53, 68, 68, 83, 83, 98, 98,
- 113, 113, 128, 128, 128, 144, 144, 129, 144, 114, 129, 99, 114, 84, 99,
- 69, 84, 54, 69, 39, 54, 24, 39, 9, 24, 9, 9, 10, 10, 10,
- 25, 25, 40, 40, 55, 55, 70, 70, 85, 85, 100, 100, 115, 115, 130,
- 130, 145, 145, 160, 160, 160, 176, 176, 161, 176, 146, 161, 131, 146, 116,
- 131, 101, 116, 86, 101, 71, 86, 56, 71, 41, 56, 26, 41, 11, 26,
- 11, 11, 12, 12, 12, 27, 27, 42, 42, 57, 57, 72, 72, 87, 87,
- 102, 102, 117, 117, 132, 132, 147, 147, 162, 162, 177, 177, 192, 192, 192,
- 208, 208, 193, 208, 178, 193, 163, 178, 148, 163, 133, 148, 118, 133, 103,
- 118, 88, 103, 73, 88, 58, 73, 43, 58, 28, 43, 13, 28, 13, 13,
- 14, 14, 14, 29, 29, 44, 44, 59, 59, 74, 74, 89, 89, 104, 104,
- 119, 119, 134, 134, 149, 149, 164, 164, 179, 179, 194, 194, 209, 209, 224,
- 224, 224, 225, 240, 210, 225, 195, 210, 180, 195, 165, 180, 150, 165, 135,
- 150, 120, 135, 105, 120, 90, 105, 75, 90, 60, 75, 45, 60, 30, 45,
- 15, 30, 31, 46, 46, 61, 61, 76, 76, 91, 91, 106, 106, 121, 121,
- 136, 136, 151, 151, 166, 166, 181, 181, 196, 196, 211, 211, 226, 226, 241,
- 227, 242, 212, 227, 197, 212, 182, 197, 167, 182, 152, 167, 137, 152, 122,
- 137, 107, 122, 92, 107, 77, 92, 62, 77, 47, 62, 63, 78, 78, 93,
- 93, 108, 108, 123, 123, 138, 138, 153, 153, 168, 168, 183, 183, 198, 198,
- 213, 213, 228, 228, 243, 229, 244, 214, 229, 199, 214, 184, 199, 169, 184,
- 154, 169, 139, 154, 124, 139, 109, 124, 94, 109, 79, 94, 95, 110, 110,
- 125, 125, 140, 140, 155, 155, 170, 170, 185, 185, 200, 200, 215, 215, 230,
- 230, 245, 231, 246, 216, 231, 201, 216, 186, 201, 171, 186, 156, 171, 141,
- 156, 126, 141, 111, 126, 127, 142, 142, 157, 157, 172, 172, 187, 187, 202,
- 202, 217, 217, 232, 232, 247, 233, 248, 218, 233, 203, 218, 188, 203, 173,
- 188, 158, 173, 143, 158, 159, 174, 174, 189, 189, 204, 204, 219, 219, 234,
- 234, 249, 235, 250, 220, 235, 205, 220, 190, 205, 175, 190, 191, 206, 206,
- 221, 221, 236, 236, 251, 237, 252, 222, 237, 207, 222, 223, 238, 238, 253,
- 239, 254, 0, 0
-};
-
-DECLARE_ALIGNED(16, static const int16_t,
- mcol_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]) = {
- 0, 0, 0, 0, 32, 32, 64, 64, 96, 96, 128, 128, 160, 160,
- 192, 192, 224, 224, 256, 256, 288, 288, 320, 320, 352, 352, 384, 384,
- 416, 416, 448, 448, 480, 480, 512, 512, 544, 544, 576, 576, 608, 608,
- 640, 640, 672, 672, 704, 704, 736, 736, 768, 768, 800, 800, 832, 832,
- 864, 864, 896, 896, 928, 928, 960, 960, 0, 0, 1, 32, 33, 64,
- 65, 96, 97, 128, 129, 160, 161, 192, 193, 224, 225, 256, 257, 288,
- 289, 320, 321, 352, 353, 384, 385, 416, 417, 448, 449, 480, 481, 512,
- 513, 544, 545, 576, 577, 608, 609, 640, 641, 672, 673, 704, 705, 736,
- 737, 768, 769, 800, 801, 832, 833, 864, 865, 896, 897, 928, 929, 960,
- 961, 992, 1, 1, 2, 33, 34, 65, 66, 97, 98, 129, 130, 161,
- 162, 193, 194, 225, 226, 257, 258, 289, 290, 321, 322, 353, 354, 385,
- 386, 417, 418, 449, 450, 481, 482, 513, 514, 545, 546, 577, 578, 609,
- 610, 641, 642, 673, 674, 705, 706, 737, 738, 769, 770, 801, 802, 833,
- 834, 865, 866, 897, 898, 929, 930, 961, 962, 993, 2, 2, 3, 34,
- 35, 66, 67, 98, 99, 130, 131, 162, 163, 194, 195, 226, 227, 258,
- 259, 290, 291, 322, 323, 354, 355, 386, 387, 418, 419, 450, 451, 482,
- 483, 514, 515, 546, 547, 578, 579, 610, 611, 642, 643, 674, 675, 706,
- 707, 738, 739, 770, 771, 802, 803, 834, 835, 866, 867, 898, 899, 930,
- 931, 962, 963, 994, 3, 3, 4, 35, 36, 67, 68, 99, 100, 131,
- 132, 163, 164, 195, 196, 227, 228, 259, 260, 291, 292, 323, 324, 355,
- 356, 387, 388, 419, 420, 451, 452, 483, 484, 515, 516, 547, 548, 579,
- 580, 611, 612, 643, 644, 675, 676, 707, 708, 739, 740, 771, 772, 803,
- 804, 835, 836, 867, 868, 899, 900, 931, 932, 963, 964, 995, 4, 4,
- 5, 36, 37, 68, 69, 100, 101, 132, 133, 164, 165, 196, 197, 228,
- 229, 260, 261, 292, 293, 324, 325, 356, 357, 388, 389, 420, 421, 452,
- 453, 484, 485, 516, 517, 548, 549, 580, 581, 612, 613, 644, 645, 676,
- 677, 708, 709, 740, 741, 772, 773, 804, 805, 836, 837, 868, 869, 900,
- 901, 932, 933, 964, 965, 996, 5, 5, 6, 37, 38, 69, 70, 101,
- 102, 133, 134, 165, 166, 197, 198, 229, 230, 261, 262, 293, 294, 325,
- 326, 357, 358, 389, 390, 421, 422, 453, 454, 485, 486, 517, 518, 549,
- 550, 581, 582, 613, 614, 645, 646, 677, 678, 709, 710, 741, 742, 773,
- 774, 805, 806, 837, 838, 869, 870, 901, 902, 933, 934, 965, 966, 997,
- 6, 6, 7, 38, 39, 70, 71, 102, 103, 134, 135, 166, 167, 198,
- 199, 230, 231, 262, 263, 294, 295, 326, 327, 358, 359, 390, 391, 422,
- 423, 454, 455, 486, 487, 518, 519, 550, 551, 582, 583, 614, 615, 646,
- 647, 678, 679, 710, 711, 742, 743, 774, 775, 806, 807, 838, 839, 870,
- 871, 902, 903, 934, 935, 966, 967, 998, 7, 7, 8, 39, 40, 71,
- 72, 103, 104, 135, 136, 167, 168, 199, 200, 231, 232, 263, 264, 295,
- 296, 327, 328, 359, 360, 391, 392, 423, 424, 455, 456, 487, 488, 519,
- 520, 551, 552, 583, 584, 615, 616, 647, 648, 679, 680, 711, 712, 743,
- 744, 775, 776, 807, 808, 839, 840, 871, 872, 903, 904, 935, 936, 967,
- 968, 999, 8, 8, 9, 40, 41, 72, 73, 104, 105, 136, 137, 168,
- 169, 200, 201, 232, 233, 264, 265, 296, 297, 328, 329, 360, 361, 392,
- 393, 424, 425, 456, 457, 488, 489, 520, 521, 552, 553, 584, 585, 616,
- 617, 648, 649, 680, 681, 712, 713, 744, 745, 776, 777, 808, 809, 840,
- 841, 872, 873, 904, 905, 936, 937, 968, 969, 1000, 9, 9, 10, 41,
- 42, 73, 74, 105, 106, 137, 138, 169, 170, 201, 202, 233, 234, 265,
- 266, 297, 298, 329, 330, 361, 362, 393, 394, 425, 426, 457, 458, 489,
- 490, 521, 522, 553, 554, 585, 586, 617, 618, 649, 650, 681, 682, 713,
- 714, 745, 746, 777, 778, 809, 810, 841, 842, 873, 874, 905, 906, 937,
- 938, 969, 970, 1001, 10, 10, 11, 42, 43, 74, 75, 106, 107, 138,
- 139, 170, 171, 202, 203, 234, 235, 266, 267, 298, 299, 330, 331, 362,
- 363, 394, 395, 426, 427, 458, 459, 490, 491, 522, 523, 554, 555, 586,
- 587, 618, 619, 650, 651, 682, 683, 714, 715, 746, 747, 778, 779, 810,
- 811, 842, 843, 874, 875, 906, 907, 938, 939, 970, 971, 1002, 11, 11,
- 12, 43, 44, 75, 76, 107, 108, 139, 140, 171, 172, 203, 204, 235,
- 236, 267, 268, 299, 300, 331, 332, 363, 364, 395, 396, 427, 428, 459,
- 460, 491, 492, 523, 524, 555, 556, 587, 588, 619, 620, 651, 652, 683,
- 684, 715, 716, 747, 748, 779, 780, 811, 812, 843, 844, 875, 876, 907,
- 908, 939, 940, 971, 972, 1003, 12, 12, 13, 44, 45, 76, 77, 108,
- 109, 140, 141, 172, 173, 204, 205, 236, 237, 268, 269, 300, 301, 332,
- 333, 364, 365, 396, 397, 428, 429, 460, 461, 492, 493, 524, 525, 556,
- 557, 588, 589, 620, 621, 652, 653, 684, 685, 716, 717, 748, 749, 780,
- 781, 812, 813, 844, 845, 876, 877, 908, 909, 940, 941, 972, 973, 1004,
- 13, 13, 14, 45, 46, 77, 78, 109, 110, 141, 142, 173, 174, 205,
- 206, 237, 238, 269, 270, 301, 302, 333, 334, 365, 366, 397, 398, 429,
- 430, 461, 462, 493, 494, 525, 526, 557, 558, 589, 590, 621, 622, 653,
- 654, 685, 686, 717, 718, 749, 750, 781, 782, 813, 814, 845, 846, 877,
- 878, 909, 910, 941, 942, 973, 974, 1005, 14, 14, 15, 46, 47, 78,
- 79, 110, 111, 142, 143, 174, 175, 206, 207, 238, 239, 270, 271, 302,
- 303, 334, 335, 366, 367, 398, 399, 430, 431, 462, 463, 494, 495, 526,
- 527, 558, 559, 590, 591, 622, 623, 654, 655, 686, 687, 718, 719, 750,
- 751, 782, 783, 814, 815, 846, 847, 878, 879, 910, 911, 942, 943, 974,
- 975, 1006, 15, 15, 16, 47, 48, 79, 80, 111, 112, 143, 144, 175,
- 176, 207, 208, 239, 240, 271, 272, 303, 304, 335, 336, 367, 368, 399,
- 400, 431, 432, 463, 464, 495, 496, 527, 528, 559, 560, 591, 592, 623,
- 624, 655, 656, 687, 688, 719, 720, 751, 752, 783, 784, 815, 816, 847,
- 848, 879, 880, 911, 912, 943, 944, 975, 976, 1007, 16, 16, 17, 48,
- 49, 80, 81, 112, 113, 144, 145, 176, 177, 208, 209, 240, 241, 272,
- 273, 304, 305, 336, 337, 368, 369, 400, 401, 432, 433, 464, 465, 496,
- 497, 528, 529, 560, 561, 592, 593, 624, 625, 656, 657, 688, 689, 720,
- 721, 752, 753, 784, 785, 816, 817, 848, 849, 880, 881, 912, 913, 944,
- 945, 976, 977, 1008, 17, 17, 18, 49, 50, 81, 82, 113, 114, 145,
- 146, 177, 178, 209, 210, 241, 242, 273, 274, 305, 306, 337, 338, 369,
- 370, 401, 402, 433, 434, 465, 466, 497, 498, 529, 530, 561, 562, 593,
- 594, 625, 626, 657, 658, 689, 690, 721, 722, 753, 754, 785, 786, 817,
- 818, 849, 850, 881, 882, 913, 914, 945, 946, 977, 978, 1009, 18, 18,
- 19, 50, 51, 82, 83, 114, 115, 146, 147, 178, 179, 210, 211, 242,
- 243, 274, 275, 306, 307, 338, 339, 370, 371, 402, 403, 434, 435, 466,
- 467, 498, 499, 530, 531, 562, 563, 594, 595, 626, 627, 658, 659, 690,
- 691, 722, 723, 754, 755, 786, 787, 818, 819, 850, 851, 882, 883, 914,
- 915, 946, 947, 978, 979, 1010, 19, 19, 20, 51, 52, 83, 84, 115,
- 116, 147, 148, 179, 180, 211, 212, 243, 244, 275, 276, 307, 308, 339,
- 340, 371, 372, 403, 404, 435, 436, 467, 468, 499, 500, 531, 532, 563,
- 564, 595, 596, 627, 628, 659, 660, 691, 692, 723, 724, 755, 756, 787,
- 788, 819, 820, 851, 852, 883, 884, 915, 916, 947, 948, 979, 980, 1011,
- 20, 20, 21, 52, 53, 84, 85, 116, 117, 148, 149, 180, 181, 212,
- 213, 244, 245, 276, 277, 308, 309, 340, 341, 372, 373, 404, 405, 436,
- 437, 468, 469, 500, 501, 532, 533, 564, 565, 596, 597, 628, 629, 660,
- 661, 692, 693, 724, 725, 756, 757, 788, 789, 820, 821, 852, 853, 884,
- 885, 916, 917, 948, 949, 980, 981, 1012, 21, 21, 22, 53, 54, 85,
- 86, 117, 118, 149, 150, 181, 182, 213, 214, 245, 246, 277, 278, 309,
- 310, 341, 342, 373, 374, 405, 406, 437, 438, 469, 470, 501, 502, 533,
- 534, 565, 566, 597, 598, 629, 630, 661, 662, 693, 694, 725, 726, 757,
- 758, 789, 790, 821, 822, 853, 854, 885, 886, 917, 918, 949, 950, 981,
- 982, 1013, 22, 22, 23, 54, 55, 86, 87, 118, 119, 150, 151, 182,
- 183, 214, 215, 246, 247, 278, 279, 310, 311, 342, 343, 374, 375, 406,
- 407, 438, 439, 470, 471, 502, 503, 534, 535, 566, 567, 598, 599, 630,
- 631, 662, 663, 694, 695, 726, 727, 758, 759, 790, 791, 822, 823, 854,
- 855, 886, 887, 918, 919, 950, 951, 982, 983, 1014, 23, 23, 24, 55,
- 56, 87, 88, 119, 120, 151, 152, 183, 184, 215, 216, 247, 248, 279,
- 280, 311, 312, 343, 344, 375, 376, 407, 408, 439, 440, 471, 472, 503,
- 504, 535, 536, 567, 568, 599, 600, 631, 632, 663, 664, 695, 696, 727,
- 728, 759, 760, 791, 792, 823, 824, 855, 856, 887, 888, 919, 920, 951,
- 952, 983, 984, 1015, 24, 24, 25, 56, 57, 88, 89, 120, 121, 152,
- 153, 184, 185, 216, 217, 248, 249, 280, 281, 312, 313, 344, 345, 376,
- 377, 408, 409, 440, 441, 472, 473, 504, 505, 536, 537, 568, 569, 600,
- 601, 632, 633, 664, 665, 696, 697, 728, 729, 760, 761, 792, 793, 824,
- 825, 856, 857, 888, 889, 920, 921, 952, 953, 984, 985, 1016, 25, 25,
- 26, 57, 58, 89, 90, 121, 122, 153, 154, 185, 186, 217, 218, 249,
- 250, 281, 282, 313, 314, 345, 346, 377, 378, 409, 410, 441, 442, 473,
- 474, 505, 506, 537, 538, 569, 570, 601, 602, 633, 634, 665, 666, 697,
- 698, 729, 730, 761, 762, 793, 794, 825, 826, 857, 858, 889, 890, 921,
- 922, 953, 954, 985, 986, 1017, 26, 26, 27, 58, 59, 90, 91, 122,
- 123, 154, 155, 186, 187, 218, 219, 250, 251, 282, 283, 314, 315, 346,
- 347, 378, 379, 410, 411, 442, 443, 474, 475, 506, 507, 538, 539, 570,
- 571, 602, 603, 634, 635, 666, 667, 698, 699, 730, 731, 762, 763, 794,
- 795, 826, 827, 858, 859, 890, 891, 922, 923, 954, 955, 986, 987, 1018,
- 27, 27, 28, 59, 60, 91, 92, 123, 124, 155, 156, 187, 188, 219,
- 220, 251, 252, 283, 284, 315, 316, 347, 348, 379, 380, 411, 412, 443,
- 444, 475, 476, 507, 508, 539, 540, 571, 572, 603, 604, 635, 636, 667,
- 668, 699, 700, 731, 732, 763, 764, 795, 796, 827, 828, 859, 860, 891,
- 892, 923, 924, 955, 956, 987, 988, 1019, 28, 28, 29, 60, 61, 92,
- 93, 124, 125, 156, 157, 188, 189, 220, 221, 252, 253, 284, 285, 316,
- 317, 348, 349, 380, 381, 412, 413, 444, 445, 476, 477, 508, 509, 540,
- 541, 572, 573, 604, 605, 636, 637, 668, 669, 700, 701, 732, 733, 764,
- 765, 796, 797, 828, 829, 860, 861, 892, 893, 924, 925, 956, 957, 988,
- 989, 1020, 29, 29, 30, 61, 62, 93, 94, 125, 126, 157, 158, 189,
- 190, 221, 222, 253, 254, 285, 286, 317, 318, 349, 350, 381, 382, 413,
- 414, 445, 446, 477, 478, 509, 510, 541, 542, 573, 574, 605, 606, 637,
- 638, 669, 670, 701, 702, 733, 734, 765, 766, 797, 798, 829, 830, 861,
- 862, 893, 894, 925, 926, 957, 958, 989, 990, 1021, 30, 30, 31, 62,
- 63, 94, 95, 126, 127, 158, 159, 190, 191, 222, 223, 254, 255, 286,
- 287, 318, 319, 350, 351, 382, 383, 414, 415, 446, 447, 478, 479, 510,
- 511, 542, 543, 574, 575, 606, 607, 638, 639, 670, 671, 702, 703, 734,
- 735, 766, 767, 798, 799, 830, 831, 862, 863, 894, 895, 926, 927, 958,
- 959, 990, 991, 1022, 0, 0,
-};
-
-DECLARE_ALIGNED(16, static const int16_t,
- mrow_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]) = {
- 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5,
- 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12,
- 13, 13, 14, 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 19,
- 20, 20, 21, 21, 22, 22, 23, 23, 24, 24, 25, 25, 26, 26,
- 27, 27, 28, 28, 29, 29, 30, 30, 0, 0, 1, 32, 2, 33,
- 3, 34, 4, 35, 5, 36, 6, 37, 7, 38, 8, 39, 9, 40,
- 10, 41, 11, 42, 12, 43, 13, 44, 14, 45, 15, 46, 16, 47,
- 17, 48, 18, 49, 19, 50, 20, 51, 21, 52, 22, 53, 23, 54,
- 24, 55, 25, 56, 26, 57, 27, 58, 28, 59, 29, 60, 30, 61,
- 31, 62, 32, 32, 33, 64, 34, 65, 35, 66, 36, 67, 37, 68,
- 38, 69, 39, 70, 40, 71, 41, 72, 42, 73, 43, 74, 44, 75,
- 45, 76, 46, 77, 47, 78, 48, 79, 49, 80, 50, 81, 51, 82,
- 52, 83, 53, 84, 54, 85, 55, 86, 56, 87, 57, 88, 58, 89,
- 59, 90, 60, 91, 61, 92, 62, 93, 63, 94, 64, 64, 65, 96,
- 66, 97, 67, 98, 68, 99, 69, 100, 70, 101, 71, 102, 72, 103,
- 73, 104, 74, 105, 75, 106, 76, 107, 77, 108, 78, 109, 79, 110,
- 80, 111, 81, 112, 82, 113, 83, 114, 84, 115, 85, 116, 86, 117,
- 87, 118, 88, 119, 89, 120, 90, 121, 91, 122, 92, 123, 93, 124,
- 94, 125, 95, 126, 96, 96, 97, 128, 98, 129, 99, 130, 100, 131,
- 101, 132, 102, 133, 103, 134, 104, 135, 105, 136, 106, 137, 107, 138,
- 108, 139, 109, 140, 110, 141, 111, 142, 112, 143, 113, 144, 114, 145,
- 115, 146, 116, 147, 117, 148, 118, 149, 119, 150, 120, 151, 121, 152,
- 122, 153, 123, 154, 124, 155, 125, 156, 126, 157, 127, 158, 128, 128,
- 129, 160, 130, 161, 131, 162, 132, 163, 133, 164, 134, 165, 135, 166,
- 136, 167, 137, 168, 138, 169, 139, 170, 140, 171, 141, 172, 142, 173,
- 143, 174, 144, 175, 145, 176, 146, 177, 147, 178, 148, 179, 149, 180,
- 150, 181, 151, 182, 152, 183, 153, 184, 154, 185, 155, 186, 156, 187,
- 157, 188, 158, 189, 159, 190, 160, 160, 161, 192, 162, 193, 163, 194,
- 164, 195, 165, 196, 166, 197, 167, 198, 168, 199, 169, 200, 170, 201,
- 171, 202, 172, 203, 173, 204, 174, 205, 175, 206, 176, 207, 177, 208,
- 178, 209, 179, 210, 180, 211, 181, 212, 182, 213, 183, 214, 184, 215,
- 185, 216, 186, 217, 187, 218, 188, 219, 189, 220, 190, 221, 191, 222,
- 192, 192, 193, 224, 194, 225, 195, 226, 196, 227, 197, 228, 198, 229,
- 199, 230, 200, 231, 201, 232, 202, 233, 203, 234, 204, 235, 205, 236,
- 206, 237, 207, 238, 208, 239, 209, 240, 210, 241, 211, 242, 212, 243,
- 213, 244, 214, 245, 215, 246, 216, 247, 217, 248, 218, 249, 219, 250,
- 220, 251, 221, 252, 222, 253, 223, 254, 224, 224, 225, 256, 226, 257,
- 227, 258, 228, 259, 229, 260, 230, 261, 231, 262, 232, 263, 233, 264,
- 234, 265, 235, 266, 236, 267, 237, 268, 238, 269, 239, 270, 240, 271,
- 241, 272, 242, 273, 243, 274, 244, 275, 245, 276, 246, 277, 247, 278,
- 248, 279, 249, 280, 250, 281, 251, 282, 252, 283, 253, 284, 254, 285,
- 255, 286, 256, 256, 257, 288, 258, 289, 259, 290, 260, 291, 261, 292,
- 262, 293, 263, 294, 264, 295, 265, 296, 266, 297, 267, 298, 268, 299,
- 269, 300, 270, 301, 271, 302, 272, 303, 273, 304, 274, 305, 275, 306,
- 276, 307, 277, 308, 278, 309, 279, 310, 280, 311, 281, 312, 282, 313,
- 283, 314, 284, 315, 285, 316, 286, 317, 287, 318, 288, 288, 289, 320,
- 290, 321, 291, 322, 292, 323, 293, 324, 294, 325, 295, 326, 296, 327,
- 297, 328, 298, 329, 299, 330, 300, 331, 301, 332, 302, 333, 303, 334,
- 304, 335, 305, 336, 306, 337, 307, 338, 308, 339, 309, 340, 310, 341,
- 311, 342, 312, 343, 313, 344, 314, 345, 315, 346, 316, 347, 317, 348,
- 318, 349, 319, 350, 320, 320, 321, 352, 322, 353, 323, 354, 324, 355,
- 325, 356, 326, 357, 327, 358, 328, 359, 329, 360, 330, 361, 331, 362,
- 332, 363, 333, 364, 334, 365, 335, 366, 336, 367, 337, 368, 338, 369,
- 339, 370, 340, 371, 341, 372, 342, 373, 343, 374, 344, 375, 345, 376,
- 346, 377, 347, 378, 348, 379, 349, 380, 350, 381, 351, 382, 352, 352,
- 353, 384, 354, 385, 355, 386, 356, 387, 357, 388, 358, 389, 359, 390,
- 360, 391, 361, 392, 362, 393, 363, 394, 364, 395, 365, 396, 366, 397,
- 367, 398, 368, 399, 369, 400, 370, 401, 371, 402, 372, 403, 373, 404,
- 374, 405, 375, 406, 376, 407, 377, 408, 378, 409, 379, 410, 380, 411,
- 381, 412, 382, 413, 383, 414, 384, 384, 385, 416, 386, 417, 387, 418,
- 388, 419, 389, 420, 390, 421, 391, 422, 392, 423, 393, 424, 394, 425,
- 395, 426, 396, 427, 397, 428, 398, 429, 399, 430, 400, 431, 401, 432,
- 402, 433, 403, 434, 404, 435, 405, 436, 406, 437, 407, 438, 408, 439,
- 409, 440, 410, 441, 411, 442, 412, 443, 413, 444, 414, 445, 415, 446,
- 416, 416, 417, 448, 418, 449, 419, 450, 420, 451, 421, 452, 422, 453,
- 423, 454, 424, 455, 425, 456, 426, 457, 427, 458, 428, 459, 429, 460,
- 430, 461, 431, 462, 432, 463, 433, 464, 434, 465, 435, 466, 436, 467,
- 437, 468, 438, 469, 439, 470, 440, 471, 441, 472, 442, 473, 443, 474,
- 444, 475, 445, 476, 446, 477, 447, 478, 448, 448, 449, 480, 450, 481,
- 451, 482, 452, 483, 453, 484, 454, 485, 455, 486, 456, 487, 457, 488,
- 458, 489, 459, 490, 460, 491, 461, 492, 462, 493, 463, 494, 464, 495,
- 465, 496, 466, 497, 467, 498, 468, 499, 469, 500, 470, 501, 471, 502,
- 472, 503, 473, 504, 474, 505, 475, 506, 476, 507, 477, 508, 478, 509,
- 479, 510, 480, 480, 481, 512, 482, 513, 483, 514, 484, 515, 485, 516,
- 486, 517, 487, 518, 488, 519, 489, 520, 490, 521, 491, 522, 492, 523,
- 493, 524, 494, 525, 495, 526, 496, 527, 497, 528, 498, 529, 499, 530,
- 500, 531, 501, 532, 502, 533, 503, 534, 504, 535, 505, 536, 506, 537,
- 507, 538, 508, 539, 509, 540, 510, 541, 511, 542, 512, 512, 513, 544,
- 514, 545, 515, 546, 516, 547, 517, 548, 518, 549, 519, 550, 520, 551,
- 521, 552, 522, 553, 523, 554, 524, 555, 525, 556, 526, 557, 527, 558,
- 528, 559, 529, 560, 530, 561, 531, 562, 532, 563, 533, 564, 534, 565,
- 535, 566, 536, 567, 537, 568, 538, 569, 539, 570, 540, 571, 541, 572,
- 542, 573, 543, 574, 544, 544, 545, 576, 546, 577, 547, 578, 548, 579,
- 549, 580, 550, 581, 551, 582, 552, 583, 553, 584, 554, 585, 555, 586,
- 556, 587, 557, 588, 558, 589, 559, 590, 560, 591, 561, 592, 562, 593,
- 563, 594, 564, 595, 565, 596, 566, 597, 567, 598, 568, 599, 569, 600,
- 570, 601, 571, 602, 572, 603, 573, 604, 574, 605, 575, 606, 576, 576,
- 577, 608, 578, 609, 579, 610, 580, 611, 581, 612, 582, 613, 583, 614,
- 584, 615, 585, 616, 586, 617, 587, 618, 588, 619, 589, 620, 590, 621,
- 591, 622, 592, 623, 593, 624, 594, 625, 595, 626, 596, 627, 597, 628,
- 598, 629, 599, 630, 600, 631, 601, 632, 602, 633, 603, 634, 604, 635,
- 605, 636, 606, 637, 607, 638, 608, 608, 609, 640, 610, 641, 611, 642,
- 612, 643, 613, 644, 614, 645, 615, 646, 616, 647, 617, 648, 618, 649,
- 619, 650, 620, 651, 621, 652, 622, 653, 623, 654, 624, 655, 625, 656,
- 626, 657, 627, 658, 628, 659, 629, 660, 630, 661, 631, 662, 632, 663,
- 633, 664, 634, 665, 635, 666, 636, 667, 637, 668, 638, 669, 639, 670,
- 640, 640, 641, 672, 642, 673, 643, 674, 644, 675, 645, 676, 646, 677,
- 647, 678, 648, 679, 649, 680, 650, 681, 651, 682, 652, 683, 653, 684,
- 654, 685, 655, 686, 656, 687, 657, 688, 658, 689, 659, 690, 660, 691,
- 661, 692, 662, 693, 663, 694, 664, 695, 665, 696, 666, 697, 667, 698,
- 668, 699, 669, 700, 670, 701, 671, 702, 672, 672, 673, 704, 674, 705,
- 675, 706, 676, 707, 677, 708, 678, 709, 679, 710, 680, 711, 681, 712,
- 682, 713, 683, 714, 684, 715, 685, 716, 686, 717, 687, 718, 688, 719,
- 689, 720, 690, 721, 691, 722, 692, 723, 693, 724, 694, 725, 695, 726,
- 696, 727, 697, 728, 698, 729, 699, 730, 700, 731, 701, 732, 702, 733,
- 703, 734, 704, 704, 705, 736, 706, 737, 707, 738, 708, 739, 709, 740,
- 710, 741, 711, 742, 712, 743, 713, 744, 714, 745, 715, 746, 716, 747,
- 717, 748, 718, 749, 719, 750, 720, 751, 721, 752, 722, 753, 723, 754,
- 724, 755, 725, 756, 726, 757, 727, 758, 728, 759, 729, 760, 730, 761,
- 731, 762, 732, 763, 733, 764, 734, 765, 735, 766, 736, 736, 737, 768,
- 738, 769, 739, 770, 740, 771, 741, 772, 742, 773, 743, 774, 744, 775,
- 745, 776, 746, 777, 747, 778, 748, 779, 749, 780, 750, 781, 751, 782,
- 752, 783, 753, 784, 754, 785, 755, 786, 756, 787, 757, 788, 758, 789,
- 759, 790, 760, 791, 761, 792, 762, 793, 763, 794, 764, 795, 765, 796,
- 766, 797, 767, 798, 768, 768, 769, 800, 770, 801, 771, 802, 772, 803,
- 773, 804, 774, 805, 775, 806, 776, 807, 777, 808, 778, 809, 779, 810,
- 780, 811, 781, 812, 782, 813, 783, 814, 784, 815, 785, 816, 786, 817,
- 787, 818, 788, 819, 789, 820, 790, 821, 791, 822, 792, 823, 793, 824,
- 794, 825, 795, 826, 796, 827, 797, 828, 798, 829, 799, 830, 800, 800,
- 801, 832, 802, 833, 803, 834, 804, 835, 805, 836, 806, 837, 807, 838,
- 808, 839, 809, 840, 810, 841, 811, 842, 812, 843, 813, 844, 814, 845,
- 815, 846, 816, 847, 817, 848, 818, 849, 819, 850, 820, 851, 821, 852,
- 822, 853, 823, 854, 824, 855, 825, 856, 826, 857, 827, 858, 828, 859,
- 829, 860, 830, 861, 831, 862, 832, 832, 833, 864, 834, 865, 835, 866,
- 836, 867, 837, 868, 838, 869, 839, 870, 840, 871, 841, 872, 842, 873,
- 843, 874, 844, 875, 845, 876, 846, 877, 847, 878, 848, 879, 849, 880,
- 850, 881, 851, 882, 852, 883, 853, 884, 854, 885, 855, 886, 856, 887,
- 857, 888, 858, 889, 859, 890, 860, 891, 861, 892, 862, 893, 863, 894,
- 864, 864, 865, 896, 866, 897, 867, 898, 868, 899, 869, 900, 870, 901,
- 871, 902, 872, 903, 873, 904, 874, 905, 875, 906, 876, 907, 877, 908,
- 878, 909, 879, 910, 880, 911, 881, 912, 882, 913, 883, 914, 884, 915,
- 885, 916, 886, 917, 887, 918, 888, 919, 889, 920, 890, 921, 891, 922,
- 892, 923, 893, 924, 894, 925, 895, 926, 896, 896, 897, 928, 898, 929,
- 899, 930, 900, 931, 901, 932, 902, 933, 903, 934, 904, 935, 905, 936,
- 906, 937, 907, 938, 908, 939, 909, 940, 910, 941, 911, 942, 912, 943,
- 913, 944, 914, 945, 915, 946, 916, 947, 917, 948, 918, 949, 919, 950,
- 920, 951, 921, 952, 922, 953, 923, 954, 924, 955, 925, 956, 926, 957,
- 927, 958, 928, 928, 929, 960, 930, 961, 931, 962, 932, 963, 933, 964,
- 934, 965, 935, 966, 936, 967, 937, 968, 938, 969, 939, 970, 940, 971,
- 941, 972, 942, 973, 943, 974, 944, 975, 945, 976, 946, 977, 947, 978,
- 948, 979, 949, 980, 950, 981, 951, 982, 952, 983, 953, 984, 954, 985,
- 955, 986, 956, 987, 957, 988, 958, 989, 959, 990, 960, 960, 961, 992,
- 962, 993, 963, 994, 964, 995, 965, 996, 966, 997, 967, 998, 968, 999,
- 969, 1000, 970, 1001, 971, 1002, 972, 1003, 973, 1004, 974, 1005, 975, 1006,
- 976, 1007, 977, 1008, 978, 1009, 979, 1010, 980, 1011, 981, 1012, 982, 1013,
- 983, 1014, 984, 1015, 985, 1016, 986, 1017, 987, 1018, 988, 1019, 989, 1020,
- 990, 1021, 991, 1022, 0, 0,
-};
-
-DECLARE_ALIGNED(16, static const int16_t,
- default_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]) = {
- 0, 0, 0, 0, 0, 0, 32, 32, 1, 32, 1, 1, 2, 2,
- 2, 33, 33, 64, 64, 64, 96, 96, 65, 96, 34, 65, 3, 34,
- 3, 3, 4, 4, 4, 35, 35, 66, 66, 97, 97, 128, 128, 128,
- 160, 160, 129, 160, 98, 129, 67, 98, 36, 67, 5, 36, 5, 5,
- 6, 6, 6, 37, 37, 68, 68, 99, 99, 130, 130, 161, 161, 192,
- 192, 192, 224, 224, 193, 224, 162, 193, 131, 162, 100, 131, 69, 100,
- 38, 69, 7, 38, 7, 7, 8, 8, 8, 39, 39, 70, 70, 101,
- 101, 132, 132, 163, 163, 194, 194, 225, 225, 256, 256, 256, 288, 288,
- 257, 288, 226, 257, 195, 226, 164, 195, 133, 164, 102, 133, 71, 102,
- 40, 71, 9, 40, 9, 9, 10, 10, 10, 41, 41, 72, 72, 103,
- 103, 134, 134, 165, 165, 196, 196, 227, 227, 258, 258, 289, 289, 320,
- 320, 320, 352, 352, 321, 352, 290, 321, 259, 290, 228, 259, 197, 228,
- 166, 197, 135, 166, 104, 135, 73, 104, 42, 73, 11, 42, 11, 11,
- 12, 12, 12, 43, 43, 74, 74, 105, 105, 136, 136, 167, 167, 198,
- 198, 229, 229, 260, 260, 291, 291, 322, 322, 353, 353, 384, 384, 384,
- 416, 416, 385, 416, 354, 385, 323, 354, 292, 323, 261, 292, 230, 261,
- 199, 230, 168, 199, 137, 168, 106, 137, 75, 106, 44, 75, 13, 44,
- 13, 13, 14, 14, 14, 45, 45, 76, 76, 107, 107, 138, 138, 169,
- 169, 200, 200, 231, 231, 262, 262, 293, 293, 324, 324, 355, 355, 386,
- 386, 417, 417, 448, 448, 448, 480, 480, 449, 480, 418, 449, 387, 418,
- 356, 387, 325, 356, 294, 325, 263, 294, 232, 263, 201, 232, 170, 201,
- 139, 170, 108, 139, 77, 108, 46, 77, 15, 46, 15, 15, 16, 16,
- 16, 47, 47, 78, 78, 109, 109, 140, 140, 171, 171, 202, 202, 233,
- 233, 264, 264, 295, 295, 326, 326, 357, 357, 388, 388, 419, 419, 450,
- 450, 481, 481, 512, 512, 512, 544, 544, 513, 544, 482, 513, 451, 482,
- 420, 451, 389, 420, 358, 389, 327, 358, 296, 327, 265, 296, 234, 265,
- 203, 234, 172, 203, 141, 172, 110, 141, 79, 110, 48, 79, 17, 48,
- 17, 17, 18, 18, 18, 49, 49, 80, 80, 111, 111, 142, 142, 173,
- 173, 204, 204, 235, 235, 266, 266, 297, 297, 328, 328, 359, 359, 390,
- 390, 421, 421, 452, 452, 483, 483, 514, 514, 545, 545, 576, 576, 576,
- 608, 608, 577, 608, 546, 577, 515, 546, 484, 515, 453, 484, 422, 453,
- 391, 422, 360, 391, 329, 360, 298, 329, 267, 298, 236, 267, 205, 236,
- 174, 205, 143, 174, 112, 143, 81, 112, 50, 81, 19, 50, 19, 19,
- 20, 20, 20, 51, 51, 82, 82, 113, 113, 144, 144, 175, 175, 206,
- 206, 237, 237, 268, 268, 299, 299, 330, 330, 361, 361, 392, 392, 423,
- 423, 454, 454, 485, 485, 516, 516, 547, 547, 578, 578, 609, 609, 640,
- 640, 640, 672, 672, 641, 672, 610, 641, 579, 610, 548, 579, 517, 548,
- 486, 517, 455, 486, 424, 455, 393, 424, 362, 393, 331, 362, 300, 331,
- 269, 300, 238, 269, 207, 238, 176, 207, 145, 176, 114, 145, 83, 114,
- 52, 83, 21, 52, 21, 21, 22, 22, 22, 53, 53, 84, 84, 115,
- 115, 146, 146, 177, 177, 208, 208, 239, 239, 270, 270, 301, 301, 332,
- 332, 363, 363, 394, 394, 425, 425, 456, 456, 487, 487, 518, 518, 549,
- 549, 580, 580, 611, 611, 642, 642, 673, 673, 704, 704, 704, 736, 736,
- 705, 736, 674, 705, 643, 674, 612, 643, 581, 612, 550, 581, 519, 550,
- 488, 519, 457, 488, 426, 457, 395, 426, 364, 395, 333, 364, 302, 333,
- 271, 302, 240, 271, 209, 240, 178, 209, 147, 178, 116, 147, 85, 116,
- 54, 85, 23, 54, 23, 23, 24, 24, 24, 55, 55, 86, 86, 117,
- 117, 148, 148, 179, 179, 210, 210, 241, 241, 272, 272, 303, 303, 334,
- 334, 365, 365, 396, 396, 427, 427, 458, 458, 489, 489, 520, 520, 551,
- 551, 582, 582, 613, 613, 644, 644, 675, 675, 706, 706, 737, 737, 768,
- 768, 768, 800, 800, 769, 800, 738, 769, 707, 738, 676, 707, 645, 676,
- 614, 645, 583, 614, 552, 583, 521, 552, 490, 521, 459, 490, 428, 459,
- 397, 428, 366, 397, 335, 366, 304, 335, 273, 304, 242, 273, 211, 242,
- 180, 211, 149, 180, 118, 149, 87, 118, 56, 87, 25, 56, 25, 25,
- 26, 26, 26, 57, 57, 88, 88, 119, 119, 150, 150, 181, 181, 212,
- 212, 243, 243, 274, 274, 305, 305, 336, 336, 367, 367, 398, 398, 429,
- 429, 460, 460, 491, 491, 522, 522, 553, 553, 584, 584, 615, 615, 646,
- 646, 677, 677, 708, 708, 739, 739, 770, 770, 801, 801, 832, 832, 832,
- 864, 864, 833, 864, 802, 833, 771, 802, 740, 771, 709, 740, 678, 709,
- 647, 678, 616, 647, 585, 616, 554, 585, 523, 554, 492, 523, 461, 492,
- 430, 461, 399, 430, 368, 399, 337, 368, 306, 337, 275, 306, 244, 275,
- 213, 244, 182, 213, 151, 182, 120, 151, 89, 120, 58, 89, 27, 58,
- 27, 27, 28, 28, 28, 59, 59, 90, 90, 121, 121, 152, 152, 183,
- 183, 214, 214, 245, 245, 276, 276, 307, 307, 338, 338, 369, 369, 400,
- 400, 431, 431, 462, 462, 493, 493, 524, 524, 555, 555, 586, 586, 617,
- 617, 648, 648, 679, 679, 710, 710, 741, 741, 772, 772, 803, 803, 834,
- 834, 865, 865, 896, 896, 896, 928, 928, 897, 928, 866, 897, 835, 866,
- 804, 835, 773, 804, 742, 773, 711, 742, 680, 711, 649, 680, 618, 649,
- 587, 618, 556, 587, 525, 556, 494, 525, 463, 494, 432, 463, 401, 432,
- 370, 401, 339, 370, 308, 339, 277, 308, 246, 277, 215, 246, 184, 215,
- 153, 184, 122, 153, 91, 122, 60, 91, 29, 60, 29, 29, 30, 30,
- 30, 61, 61, 92, 92, 123, 123, 154, 154, 185, 185, 216, 216, 247,
- 247, 278, 278, 309, 309, 340, 340, 371, 371, 402, 402, 433, 433, 464,
- 464, 495, 495, 526, 526, 557, 557, 588, 588, 619, 619, 650, 650, 681,
- 681, 712, 712, 743, 743, 774, 774, 805, 805, 836, 836, 867, 867, 898,
- 898, 929, 929, 960, 960, 960, 961, 992, 930, 961, 899, 930, 868, 899,
- 837, 868, 806, 837, 775, 806, 744, 775, 713, 744, 682, 713, 651, 682,
- 620, 651, 589, 620, 558, 589, 527, 558, 496, 527, 465, 496, 434, 465,
- 403, 434, 372, 403, 341, 372, 310, 341, 279, 310, 248, 279, 217, 248,
- 186, 217, 155, 186, 124, 155, 93, 124, 62, 93, 31, 62, 63, 94,
- 94, 125, 125, 156, 156, 187, 187, 218, 218, 249, 249, 280, 280, 311,
- 311, 342, 342, 373, 373, 404, 404, 435, 435, 466, 466, 497, 497, 528,
- 528, 559, 559, 590, 590, 621, 621, 652, 652, 683, 683, 714, 714, 745,
- 745, 776, 776, 807, 807, 838, 838, 869, 869, 900, 900, 931, 931, 962,
- 962, 993, 963, 994, 932, 963, 901, 932, 870, 901, 839, 870, 808, 839,
- 777, 808, 746, 777, 715, 746, 684, 715, 653, 684, 622, 653, 591, 622,
- 560, 591, 529, 560, 498, 529, 467, 498, 436, 467, 405, 436, 374, 405,
- 343, 374, 312, 343, 281, 312, 250, 281, 219, 250, 188, 219, 157, 188,
- 126, 157, 95, 126, 127, 158, 158, 189, 189, 220, 220, 251, 251, 282,
- 282, 313, 313, 344, 344, 375, 375, 406, 406, 437, 437, 468, 468, 499,
- 499, 530, 530, 561, 561, 592, 592, 623, 623, 654, 654, 685, 685, 716,
- 716, 747, 747, 778, 778, 809, 809, 840, 840, 871, 871, 902, 902, 933,
- 933, 964, 964, 995, 965, 996, 934, 965, 903, 934, 872, 903, 841, 872,
- 810, 841, 779, 810, 748, 779, 717, 748, 686, 717, 655, 686, 624, 655,
- 593, 624, 562, 593, 531, 562, 500, 531, 469, 500, 438, 469, 407, 438,
- 376, 407, 345, 376, 314, 345, 283, 314, 252, 283, 221, 252, 190, 221,
- 159, 190, 191, 222, 222, 253, 253, 284, 284, 315, 315, 346, 346, 377,
- 377, 408, 408, 439, 439, 470, 470, 501, 501, 532, 532, 563, 563, 594,
- 594, 625, 625, 656, 656, 687, 687, 718, 718, 749, 749, 780, 780, 811,
- 811, 842, 842, 873, 873, 904, 904, 935, 935, 966, 966, 997, 967, 998,
- 936, 967, 905, 936, 874, 905, 843, 874, 812, 843, 781, 812, 750, 781,
- 719, 750, 688, 719, 657, 688, 626, 657, 595, 626, 564, 595, 533, 564,
- 502, 533, 471, 502, 440, 471, 409, 440, 378, 409, 347, 378, 316, 347,
- 285, 316, 254, 285, 223, 254, 255, 286, 286, 317, 317, 348, 348, 379,
- 379, 410, 410, 441, 441, 472, 472, 503, 503, 534, 534, 565, 565, 596,
- 596, 627, 627, 658, 658, 689, 689, 720, 720, 751, 751, 782, 782, 813,
- 813, 844, 844, 875, 875, 906, 906, 937, 937, 968, 968, 999, 969, 1000,
- 938, 969, 907, 938, 876, 907, 845, 876, 814, 845, 783, 814, 752, 783,
- 721, 752, 690, 721, 659, 690, 628, 659, 597, 628, 566, 597, 535, 566,
- 504, 535, 473, 504, 442, 473, 411, 442, 380, 411, 349, 380, 318, 349,
- 287, 318, 319, 350, 350, 381, 381, 412, 412, 443, 443, 474, 474, 505,
- 505, 536, 536, 567, 567, 598, 598, 629, 629, 660, 660, 691, 691, 722,
- 722, 753, 753, 784, 784, 815, 815, 846, 846, 877, 877, 908, 908, 939,
- 939, 970, 970, 1001, 971, 1002, 940, 971, 909, 940, 878, 909, 847, 878,
- 816, 847, 785, 816, 754, 785, 723, 754, 692, 723, 661, 692, 630, 661,
- 599, 630, 568, 599, 537, 568, 506, 537, 475, 506, 444, 475, 413, 444,
- 382, 413, 351, 382, 383, 414, 414, 445, 445, 476, 476, 507, 507, 538,
- 538, 569, 569, 600, 600, 631, 631, 662, 662, 693, 693, 724, 724, 755,
- 755, 786, 786, 817, 817, 848, 848, 879, 879, 910, 910, 941, 941, 972,
- 972, 1003, 973, 1004, 942, 973, 911, 942, 880, 911, 849, 880, 818, 849,
- 787, 818, 756, 787, 725, 756, 694, 725, 663, 694, 632, 663, 601, 632,
- 570, 601, 539, 570, 508, 539, 477, 508, 446, 477, 415, 446, 447, 478,
- 478, 509, 509, 540, 540, 571, 571, 602, 602, 633, 633, 664, 664, 695,
- 695, 726, 726, 757, 757, 788, 788, 819, 819, 850, 850, 881, 881, 912,
- 912, 943, 943, 974, 974, 1005, 975, 1006, 944, 975, 913, 944, 882, 913,
- 851, 882, 820, 851, 789, 820, 758, 789, 727, 758, 696, 727, 665, 696,
- 634, 665, 603, 634, 572, 603, 541, 572, 510, 541, 479, 510, 511, 542,
- 542, 573, 573, 604, 604, 635, 635, 666, 666, 697, 697, 728, 728, 759,
- 759, 790, 790, 821, 821, 852, 852, 883, 883, 914, 914, 945, 945, 976,
- 976, 1007, 977, 1008, 946, 977, 915, 946, 884, 915, 853, 884, 822, 853,
- 791, 822, 760, 791, 729, 760, 698, 729, 667, 698, 636, 667, 605, 636,
- 574, 605, 543, 574, 575, 606, 606, 637, 637, 668, 668, 699, 699, 730,
- 730, 761, 761, 792, 792, 823, 823, 854, 854, 885, 885, 916, 916, 947,
- 947, 978, 978, 1009, 979, 1010, 948, 979, 917, 948, 886, 917, 855, 886,
- 824, 855, 793, 824, 762, 793, 731, 762, 700, 731, 669, 700, 638, 669,
- 607, 638, 639, 670, 670, 701, 701, 732, 732, 763, 763, 794, 794, 825,
- 825, 856, 856, 887, 887, 918, 918, 949, 949, 980, 980, 1011, 981, 1012,
- 950, 981, 919, 950, 888, 919, 857, 888, 826, 857, 795, 826, 764, 795,
- 733, 764, 702, 733, 671, 702, 703, 734, 734, 765, 765, 796, 796, 827,
- 827, 858, 858, 889, 889, 920, 920, 951, 951, 982, 982, 1013, 983, 1014,
- 952, 983, 921, 952, 890, 921, 859, 890, 828, 859, 797, 828, 766, 797,
- 735, 766, 767, 798, 798, 829, 829, 860, 860, 891, 891, 922, 922, 953,
- 953, 984, 984, 1015, 985, 1016, 954, 985, 923, 954, 892, 923, 861, 892,
- 830, 861, 799, 830, 831, 862, 862, 893, 893, 924, 924, 955, 955, 986,
- 986, 1017, 987, 1018, 956, 987, 925, 956, 894, 925, 863, 894, 895, 926,
- 926, 957, 957, 988, 988, 1019, 989, 1020, 958, 989, 927, 958, 959, 990,
- 990, 1021, 991, 1022, 0, 0
-};
-
-DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_4x4[16]) = {
- 0, 1, 5, 6, 2, 4, 7, 12, 3, 8, 11, 13, 9, 10, 14, 15
-};
+DECLARE_ALIGNED(16, static const int16_t,
+                av1_default_iscan_4x4[16]) = { 0, 1, 5, 6, 2, 4, 7, 12,
+                                               3, 8, 11, 13, 9, 10, 14, 15 };
DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_4x4[16]) = {
0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
@@ -3201,535 +1664,385 @@ DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_32x32[1024]) = {
};
const SCAN_ORDER av1_default_scan_orders[TX_SIZES] = {
- { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
- { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
- { default_scan_16x16, av1_default_iscan_16x16, default_scan_16x16_neighbors },
- { default_scan_32x32, av1_default_iscan_32x32, default_scan_32x32_neighbors },
+ { default_scan_4x4, av1_default_iscan_4x4 },
+ { default_scan_8x8, av1_default_iscan_8x8 },
+ { default_scan_16x16, av1_default_iscan_16x16 },
+ { default_scan_32x32, av1_default_iscan_32x32 },
// Half of the coefficients of tx64 at higher frequencies are set to
// zeros. So tx32's scan order is used.
- { default_scan_32x32, av1_default_iscan_32x32, default_scan_32x32_neighbors },
+ { default_scan_32x32, av1_default_iscan_32x32 },
};
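The av1_default_scan_orders entries above now carry just two pointers per TX size; the third member that used to point at the *_neighbors tables (deleted earlier in this patch) is gone. A minimal sketch of the resulting SCAN_ORDER layout, assuming the member names implied by the remaining initializers:

#include <stdint.h>

typedef struct {
  const int16_t *scan;  /* scan position -> raster coefficient index */
  const int16_t *iscan; /* raster coefficient index -> scan position */
} SCAN_ORDER;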
const SCAN_ORDER av1_scan_orders[TX_SIZES_ALL][TX_TYPES] = {
{
// TX_4X4
- { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
- { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
- { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
- { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
- { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
- { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
- { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
- { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
- { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
- { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
- { mrow_scan_4x4, av1_mrow_iscan_4x4, mrow_scan_4x4_neighbors },
- { mcol_scan_4x4, av1_mcol_iscan_4x4, mcol_scan_4x4_neighbors },
- { mrow_scan_4x4, av1_mrow_iscan_4x4, mrow_scan_4x4_neighbors },
- { mcol_scan_4x4, av1_mcol_iscan_4x4, mcol_scan_4x4_neighbors },
- { mrow_scan_4x4, av1_mrow_iscan_4x4, mrow_scan_4x4_neighbors },
- { mcol_scan_4x4, av1_mcol_iscan_4x4, mcol_scan_4x4_neighbors },
+ { default_scan_4x4, av1_default_iscan_4x4 },
+ { default_scan_4x4, av1_default_iscan_4x4 },
+ { default_scan_4x4, av1_default_iscan_4x4 },
+ { default_scan_4x4, av1_default_iscan_4x4 },
+ { default_scan_4x4, av1_default_iscan_4x4 },
+ { default_scan_4x4, av1_default_iscan_4x4 },
+ { default_scan_4x4, av1_default_iscan_4x4 },
+ { default_scan_4x4, av1_default_iscan_4x4 },
+ { default_scan_4x4, av1_default_iscan_4x4 },
+ { default_scan_4x4, av1_default_iscan_4x4 },
+ { mrow_scan_4x4, av1_mrow_iscan_4x4 },
+ { mcol_scan_4x4, av1_mcol_iscan_4x4 },
+ { mrow_scan_4x4, av1_mrow_iscan_4x4 },
+ { mcol_scan_4x4, av1_mcol_iscan_4x4 },
+ { mrow_scan_4x4, av1_mrow_iscan_4x4 },
+ { mcol_scan_4x4, av1_mcol_iscan_4x4 },
},
{
// TX_8X8
- { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
- { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
- { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
- { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
- { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
- { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
- { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
- { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
- { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
- { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
- { mrow_scan_8x8, av1_mrow_iscan_8x8, mrow_scan_8x8_neighbors },
- { mcol_scan_8x8, av1_mcol_iscan_8x8, mcol_scan_8x8_neighbors },
- { mrow_scan_8x8, av1_mrow_iscan_8x8, mrow_scan_8x8_neighbors },
- { mcol_scan_8x8, av1_mcol_iscan_8x8, mcol_scan_8x8_neighbors },
- { mrow_scan_8x8, av1_mrow_iscan_8x8, mrow_scan_8x8_neighbors },
- { mcol_scan_8x8, av1_mcol_iscan_8x8, mcol_scan_8x8_neighbors },
+ { default_scan_8x8, av1_default_iscan_8x8 },
+ { default_scan_8x8, av1_default_iscan_8x8 },
+ { default_scan_8x8, av1_default_iscan_8x8 },
+ { default_scan_8x8, av1_default_iscan_8x8 },
+ { default_scan_8x8, av1_default_iscan_8x8 },
+ { default_scan_8x8, av1_default_iscan_8x8 },
+ { default_scan_8x8, av1_default_iscan_8x8 },
+ { default_scan_8x8, av1_default_iscan_8x8 },
+ { default_scan_8x8, av1_default_iscan_8x8 },
+ { default_scan_8x8, av1_default_iscan_8x8 },
+ { mrow_scan_8x8, av1_mrow_iscan_8x8 },
+ { mcol_scan_8x8, av1_mcol_iscan_8x8 },
+ { mrow_scan_8x8, av1_mrow_iscan_8x8 },
+ { mcol_scan_8x8, av1_mcol_iscan_8x8 },
+ { mrow_scan_8x8, av1_mrow_iscan_8x8 },
+ { mcol_scan_8x8, av1_mcol_iscan_8x8 },
},
{
// TX_16X16
- { default_scan_16x16, av1_default_iscan_16x16,
- default_scan_16x16_neighbors },
- { default_scan_16x16, av1_default_iscan_16x16,
- default_scan_16x16_neighbors },
- { default_scan_16x16, av1_default_iscan_16x16,
- default_scan_16x16_neighbors },
- { default_scan_16x16, av1_default_iscan_16x16,
- default_scan_16x16_neighbors },
- { default_scan_16x16, av1_default_iscan_16x16,
- default_scan_16x16_neighbors },
- { default_scan_16x16, av1_default_iscan_16x16,
- default_scan_16x16_neighbors },
- { default_scan_16x16, av1_default_iscan_16x16,
- default_scan_16x16_neighbors },
- { default_scan_16x16, av1_default_iscan_16x16,
- default_scan_16x16_neighbors },
- { default_scan_16x16, av1_default_iscan_16x16,
- default_scan_16x16_neighbors },
- { default_scan_16x16, av1_default_iscan_16x16,
- default_scan_16x16_neighbors },
- { mrow_scan_16x16, av1_mrow_iscan_16x16, mrow_scan_16x16_neighbors },
- { mcol_scan_16x16, av1_mcol_iscan_16x16, mcol_scan_16x16_neighbors },
- { mrow_scan_16x16, av1_mrow_iscan_16x16, mrow_scan_16x16_neighbors },
- { mcol_scan_16x16, av1_mcol_iscan_16x16, mcol_scan_16x16_neighbors },
- { mrow_scan_16x16, av1_mrow_iscan_16x16, mrow_scan_16x16_neighbors },
- { mcol_scan_16x16, av1_mcol_iscan_16x16, mcol_scan_16x16_neighbors },
+ { default_scan_16x16, av1_default_iscan_16x16 },
+ { default_scan_16x16, av1_default_iscan_16x16 },
+ { default_scan_16x16, av1_default_iscan_16x16 },
+ { default_scan_16x16, av1_default_iscan_16x16 },
+ { default_scan_16x16, av1_default_iscan_16x16 },
+ { default_scan_16x16, av1_default_iscan_16x16 },
+ { default_scan_16x16, av1_default_iscan_16x16 },
+ { default_scan_16x16, av1_default_iscan_16x16 },
+ { default_scan_16x16, av1_default_iscan_16x16 },
+ { default_scan_16x16, av1_default_iscan_16x16 },
+ { mrow_scan_16x16, av1_mrow_iscan_16x16 },
+ { mcol_scan_16x16, av1_mcol_iscan_16x16 },
+ { mrow_scan_16x16, av1_mrow_iscan_16x16 },
+ { mcol_scan_16x16, av1_mcol_iscan_16x16 },
+ { mrow_scan_16x16, av1_mrow_iscan_16x16 },
+ { mcol_scan_16x16, av1_mcol_iscan_16x16 },
},
{
// TX_32X32
- { default_scan_32x32, av1_default_iscan_32x32,
- default_scan_32x32_neighbors },
- { default_scan_32x32, av1_default_iscan_32x32,
- default_scan_32x32_neighbors },
- { default_scan_32x32, av1_default_iscan_32x32,
- default_scan_32x32_neighbors },
- { default_scan_32x32, av1_default_iscan_32x32,
- default_scan_32x32_neighbors },
- { default_scan_32x32, av1_default_iscan_32x32,
- default_scan_32x32_neighbors },
- { default_scan_32x32, av1_default_iscan_32x32,
- default_scan_32x32_neighbors },
- { default_scan_32x32, av1_default_iscan_32x32,
- default_scan_32x32_neighbors },
- { default_scan_32x32, av1_default_iscan_32x32,
- default_scan_32x32_neighbors },
- { default_scan_32x32, av1_default_iscan_32x32,
- default_scan_32x32_neighbors },
- { default_scan_32x32, av1_default_iscan_32x32,
- default_scan_32x32_neighbors },
- { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors },
- { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors },
- { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors },
- { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors },
- { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors },
- { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors },
+ { default_scan_32x32, av1_default_iscan_32x32 },
+ { default_scan_32x32, av1_default_iscan_32x32 },
+ { default_scan_32x32, av1_default_iscan_32x32 },
+ { default_scan_32x32, av1_default_iscan_32x32 },
+ { default_scan_32x32, av1_default_iscan_32x32 },
+ { default_scan_32x32, av1_default_iscan_32x32 },
+ { default_scan_32x32, av1_default_iscan_32x32 },
+ { default_scan_32x32, av1_default_iscan_32x32 },
+ { default_scan_32x32, av1_default_iscan_32x32 },
+ { default_scan_32x32, av1_default_iscan_32x32 },
+ { mrow_scan_32x32, av1_mrow_iscan_32x32 },
+ { mcol_scan_32x32, av1_mcol_iscan_32x32 },
+ { mrow_scan_32x32, av1_mrow_iscan_32x32 },
+ { mcol_scan_32x32, av1_mcol_iscan_32x32 },
+ { mrow_scan_32x32, av1_mrow_iscan_32x32 },
+ { mcol_scan_32x32, av1_mcol_iscan_32x32 },
},
{
// TX_64X64
// Half of the coefficients of tx64 at higher frequencies are set to
// zeros. So tx32's scan order is used.
- { default_scan_32x32, av1_default_iscan_32x32,
- default_scan_32x32_neighbors },
- { default_scan_32x32, av1_default_iscan_32x32,
- default_scan_32x32_neighbors },
- { default_scan_32x32, av1_default_iscan_32x32,
- default_scan_32x32_neighbors },
- { default_scan_32x32, av1_default_iscan_32x32,
- default_scan_32x32_neighbors },
- { default_scan_32x32, av1_default_iscan_32x32,
- default_scan_32x32_neighbors },
- { default_scan_32x32, av1_default_iscan_32x32,
- default_scan_32x32_neighbors },
- { default_scan_32x32, av1_default_iscan_32x32,
- default_scan_32x32_neighbors },
- { default_scan_32x32, av1_default_iscan_32x32,
- default_scan_32x32_neighbors },
- { default_scan_32x32, av1_default_iscan_32x32,
- default_scan_32x32_neighbors },
- { default_scan_32x32, av1_default_iscan_32x32,
- default_scan_32x32_neighbors },
- { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors },
- { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors },
- { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors },
- { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors },
- { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors },
- { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors },
+ { default_scan_32x32, av1_default_iscan_32x32 },
+ { default_scan_32x32, av1_default_iscan_32x32 },
+ { default_scan_32x32, av1_default_iscan_32x32 },
+ { default_scan_32x32, av1_default_iscan_32x32 },
+ { default_scan_32x32, av1_default_iscan_32x32 },
+ { default_scan_32x32, av1_default_iscan_32x32 },
+ { default_scan_32x32, av1_default_iscan_32x32 },
+ { default_scan_32x32, av1_default_iscan_32x32 },
+ { default_scan_32x32, av1_default_iscan_32x32 },
+ { default_scan_32x32, av1_default_iscan_32x32 },
+ { mrow_scan_32x32, av1_mrow_iscan_32x32 },
+ { mcol_scan_32x32, av1_mcol_iscan_32x32 },
+ { mrow_scan_32x32, av1_mrow_iscan_32x32 },
+ { mcol_scan_32x32, av1_mcol_iscan_32x32 },
+ { mrow_scan_32x32, av1_mrow_iscan_32x32 },
+ { mcol_scan_32x32, av1_mcol_iscan_32x32 },
},
{
// TX_4X8
- { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors },
- { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors },
- { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors },
- { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors },
- { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors },
- { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors },
- { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors },
- { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors },
- { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors },
- { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors },
- { mrow_scan_4x8, av1_mrow_iscan_4x8, mrow_scan_4x8_neighbors },
- { mcol_scan_4x8, av1_mcol_iscan_4x8, mcol_scan_4x8_neighbors },
- { mrow_scan_4x8, av1_mrow_iscan_4x8, mrow_scan_4x8_neighbors },
- { mcol_scan_4x8, av1_mcol_iscan_4x8, mcol_scan_4x8_neighbors },
- { mrow_scan_4x8, av1_mrow_iscan_4x8, mrow_scan_4x8_neighbors },
- { mcol_scan_4x8, av1_mcol_iscan_4x8, mcol_scan_4x8_neighbors },
+ { default_scan_4x8, av1_default_iscan_4x8 },
+ { default_scan_4x8, av1_default_iscan_4x8 },
+ { default_scan_4x8, av1_default_iscan_4x8 },
+ { default_scan_4x8, av1_default_iscan_4x8 },
+ { default_scan_4x8, av1_default_iscan_4x8 },
+ { default_scan_4x8, av1_default_iscan_4x8 },
+ { default_scan_4x8, av1_default_iscan_4x8 },
+ { default_scan_4x8, av1_default_iscan_4x8 },
+ { default_scan_4x8, av1_default_iscan_4x8 },
+ { default_scan_4x8, av1_default_iscan_4x8 },
+ { mrow_scan_4x8, av1_mrow_iscan_4x8 },
+ { mcol_scan_4x8, av1_mcol_iscan_4x8 },
+ { mrow_scan_4x8, av1_mrow_iscan_4x8 },
+ { mcol_scan_4x8, av1_mcol_iscan_4x8 },
+ { mrow_scan_4x8, av1_mrow_iscan_4x8 },
+ { mcol_scan_4x8, av1_mcol_iscan_4x8 },
},
{
// TX_8X4
- { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors },
- { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors },
- { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors },
- { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors },
- { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors },
- { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors },
- { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors },
- { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors },
- { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors },
- { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors },
- { mrow_scan_8x4, av1_mrow_iscan_8x4, mrow_scan_8x4_neighbors },
- { mcol_scan_8x4, av1_mcol_iscan_8x4, mcol_scan_8x4_neighbors },
- { mrow_scan_8x4, av1_mrow_iscan_8x4, mrow_scan_8x4_neighbors },
- { mcol_scan_8x4, av1_mcol_iscan_8x4, mcol_scan_8x4_neighbors },
- { mrow_scan_8x4, av1_mrow_iscan_8x4, mrow_scan_8x4_neighbors },
- { mcol_scan_8x4, av1_mcol_iscan_8x4, mcol_scan_8x4_neighbors },
+ { default_scan_8x4, av1_default_iscan_8x4 },
+ { default_scan_8x4, av1_default_iscan_8x4 },
+ { default_scan_8x4, av1_default_iscan_8x4 },
+ { default_scan_8x4, av1_default_iscan_8x4 },
+ { default_scan_8x4, av1_default_iscan_8x4 },
+ { default_scan_8x4, av1_default_iscan_8x4 },
+ { default_scan_8x4, av1_default_iscan_8x4 },
+ { default_scan_8x4, av1_default_iscan_8x4 },
+ { default_scan_8x4, av1_default_iscan_8x4 },
+ { default_scan_8x4, av1_default_iscan_8x4 },
+ { mrow_scan_8x4, av1_mrow_iscan_8x4 },
+ { mcol_scan_8x4, av1_mcol_iscan_8x4 },
+ { mrow_scan_8x4, av1_mrow_iscan_8x4 },
+ { mcol_scan_8x4, av1_mcol_iscan_8x4 },
+ { mrow_scan_8x4, av1_mrow_iscan_8x4 },
+ { mcol_scan_8x4, av1_mcol_iscan_8x4 },
},
{
// TX_8X16
- { default_scan_8x16, av1_default_iscan_8x16,
- default_scan_8x16_neighbors },
- { default_scan_8x16, av1_default_iscan_8x16,
- default_scan_8x16_neighbors },
- { default_scan_8x16, av1_default_iscan_8x16,
- default_scan_8x16_neighbors },
- { default_scan_8x16, av1_default_iscan_8x16,
- default_scan_8x16_neighbors },
- { default_scan_8x16, av1_default_iscan_8x16,
- default_scan_8x16_neighbors },
- { default_scan_8x16, av1_default_iscan_8x16,
- default_scan_8x16_neighbors },
- { default_scan_8x16, av1_default_iscan_8x16,
- default_scan_8x16_neighbors },
- { default_scan_8x16, av1_default_iscan_8x16,
- default_scan_8x16_neighbors },
- { default_scan_8x16, av1_default_iscan_8x16,
- default_scan_8x16_neighbors },
- { default_scan_8x16, av1_default_iscan_8x16,
- default_scan_8x16_neighbors },
- { mrow_scan_8x16, av1_mrow_iscan_8x16, mrow_scan_8x16_neighbors },
- { mcol_scan_8x16, av1_mcol_iscan_8x16, mcol_scan_8x16_neighbors },
- { mrow_scan_8x16, av1_mrow_iscan_8x16, mrow_scan_8x16_neighbors },
- { mcol_scan_8x16, av1_mcol_iscan_8x16, mcol_scan_8x16_neighbors },
- { mrow_scan_8x16, av1_mrow_iscan_8x16, mrow_scan_8x16_neighbors },
- { mcol_scan_8x16, av1_mcol_iscan_8x16, mcol_scan_8x16_neighbors },
+ { default_scan_8x16, av1_default_iscan_8x16 },
+ { default_scan_8x16, av1_default_iscan_8x16 },
+ { default_scan_8x16, av1_default_iscan_8x16 },
+ { default_scan_8x16, av1_default_iscan_8x16 },
+ { default_scan_8x16, av1_default_iscan_8x16 },
+ { default_scan_8x16, av1_default_iscan_8x16 },
+ { default_scan_8x16, av1_default_iscan_8x16 },
+ { default_scan_8x16, av1_default_iscan_8x16 },
+ { default_scan_8x16, av1_default_iscan_8x16 },
+ { default_scan_8x16, av1_default_iscan_8x16 },
+ { mrow_scan_8x16, av1_mrow_iscan_8x16 },
+ { mcol_scan_8x16, av1_mcol_iscan_8x16 },
+ { mrow_scan_8x16, av1_mrow_iscan_8x16 },
+ { mcol_scan_8x16, av1_mcol_iscan_8x16 },
+ { mrow_scan_8x16, av1_mrow_iscan_8x16 },
+ { mcol_scan_8x16, av1_mcol_iscan_8x16 },
},
{
// TX_16X8
- { default_scan_16x8, av1_default_iscan_16x8,
- default_scan_16x8_neighbors },
- { default_scan_16x8, av1_default_iscan_16x8,
- default_scan_16x8_neighbors },
- { default_scan_16x8, av1_default_iscan_16x8,
- default_scan_16x8_neighbors },
- { default_scan_16x8, av1_default_iscan_16x8,
- default_scan_16x8_neighbors },
- { default_scan_16x8, av1_default_iscan_16x8,
- default_scan_16x8_neighbors },
- { default_scan_16x8, av1_default_iscan_16x8,
- default_scan_16x8_neighbors },
- { default_scan_16x8, av1_default_iscan_16x8,
- default_scan_16x8_neighbors },
- { default_scan_16x8, av1_default_iscan_16x8,
- default_scan_16x8_neighbors },
- { default_scan_16x8, av1_default_iscan_16x8,
- default_scan_16x8_neighbors },
- { default_scan_16x8, av1_default_iscan_16x8,
- default_scan_16x8_neighbors },
- { mrow_scan_16x8, av1_mrow_iscan_16x8, mrow_scan_16x8_neighbors },
- { mcol_scan_16x8, av1_mcol_iscan_16x8, mcol_scan_16x8_neighbors },
- { mrow_scan_16x8, av1_mrow_iscan_16x8, mrow_scan_16x8_neighbors },
- { mcol_scan_16x8, av1_mcol_iscan_16x8, mcol_scan_16x8_neighbors },
- { mrow_scan_16x8, av1_mrow_iscan_16x8, mrow_scan_16x8_neighbors },
- { mcol_scan_16x8, av1_mcol_iscan_16x8, mcol_scan_16x8_neighbors },
+ { default_scan_16x8, av1_default_iscan_16x8 },
+ { default_scan_16x8, av1_default_iscan_16x8 },
+ { default_scan_16x8, av1_default_iscan_16x8 },
+ { default_scan_16x8, av1_default_iscan_16x8 },
+ { default_scan_16x8, av1_default_iscan_16x8 },
+ { default_scan_16x8, av1_default_iscan_16x8 },
+ { default_scan_16x8, av1_default_iscan_16x8 },
+ { default_scan_16x8, av1_default_iscan_16x8 },
+ { default_scan_16x8, av1_default_iscan_16x8 },
+ { default_scan_16x8, av1_default_iscan_16x8 },
+ { mrow_scan_16x8, av1_mrow_iscan_16x8 },
+ { mcol_scan_16x8, av1_mcol_iscan_16x8 },
+ { mrow_scan_16x8, av1_mrow_iscan_16x8 },
+ { mcol_scan_16x8, av1_mcol_iscan_16x8 },
+ { mrow_scan_16x8, av1_mrow_iscan_16x8 },
+ { mcol_scan_16x8, av1_mcol_iscan_16x8 },
},
{
// TX_16X32
- { default_scan_16x32, av1_default_iscan_16x32,
- default_scan_16x32_neighbors },
- { default_scan_16x32, av1_default_iscan_16x32,
- default_scan_16x32_neighbors },
- { default_scan_16x32, av1_default_iscan_16x32,
- default_scan_16x32_neighbors },
- { default_scan_16x32, av1_default_iscan_16x32,
- default_scan_16x32_neighbors },
- { default_scan_16x32, av1_default_iscan_16x32,
- default_scan_16x32_neighbors },
- { default_scan_16x32, av1_default_iscan_16x32,
- default_scan_16x32_neighbors },
- { default_scan_16x32, av1_default_iscan_16x32,
- default_scan_16x32_neighbors },
- { default_scan_16x32, av1_default_iscan_16x32,
- default_scan_16x32_neighbors },
- { default_scan_16x32, av1_default_iscan_16x32,
- default_scan_16x32_neighbors },
- { default_scan_16x32, av1_default_iscan_16x32,
- default_scan_16x32_neighbors },
- { mrow_scan_16x32, av1_mrow_iscan_16x32, mrow_scan_16x32_neighbors },
- { mcol_scan_16x32, av1_mcol_iscan_16x32, mcol_scan_16x32_neighbors },
- { mrow_scan_16x32, av1_mrow_iscan_16x32, mrow_scan_16x32_neighbors },
- { mcol_scan_16x32, av1_mcol_iscan_16x32, mcol_scan_16x32_neighbors },
- { mrow_scan_16x32, av1_mrow_iscan_16x32, mrow_scan_16x32_neighbors },
- { mcol_scan_16x32, av1_mcol_iscan_16x32, mcol_scan_16x32_neighbors },
+ { default_scan_16x32, av1_default_iscan_16x32 },
+ { default_scan_16x32, av1_default_iscan_16x32 },
+ { default_scan_16x32, av1_default_iscan_16x32 },
+ { default_scan_16x32, av1_default_iscan_16x32 },
+ { default_scan_16x32, av1_default_iscan_16x32 },
+ { default_scan_16x32, av1_default_iscan_16x32 },
+ { default_scan_16x32, av1_default_iscan_16x32 },
+ { default_scan_16x32, av1_default_iscan_16x32 },
+ { default_scan_16x32, av1_default_iscan_16x32 },
+ { default_scan_16x32, av1_default_iscan_16x32 },
+ { mrow_scan_16x32, av1_mrow_iscan_16x32 },
+ { mcol_scan_16x32, av1_mcol_iscan_16x32 },
+ { mrow_scan_16x32, av1_mrow_iscan_16x32 },
+ { mcol_scan_16x32, av1_mcol_iscan_16x32 },
+ { mrow_scan_16x32, av1_mrow_iscan_16x32 },
+ { mcol_scan_16x32, av1_mcol_iscan_16x32 },
},
{
// TX_32X16
- { default_scan_32x16, av1_default_iscan_32x16,
- default_scan_32x16_neighbors },
- { default_scan_32x16, av1_default_iscan_32x16,
- default_scan_32x16_neighbors },
- { default_scan_32x16, av1_default_iscan_32x16,
- default_scan_32x16_neighbors },
- { default_scan_32x16, av1_default_iscan_32x16,
- default_scan_32x16_neighbors },
- { default_scan_32x16, av1_default_iscan_32x16,
- default_scan_32x16_neighbors },
- { default_scan_32x16, av1_default_iscan_32x16,
- default_scan_32x16_neighbors },
- { default_scan_32x16, av1_default_iscan_32x16,
- default_scan_32x16_neighbors },
- { default_scan_32x16, av1_default_iscan_32x16,
- default_scan_32x16_neighbors },
- { default_scan_32x16, av1_default_iscan_32x16,
- default_scan_32x16_neighbors },
- { default_scan_32x16, av1_default_iscan_32x16,
- default_scan_32x16_neighbors },
- { mrow_scan_32x16, av1_mrow_iscan_32x16, mrow_scan_32x16_neighbors },
- { mcol_scan_32x16, av1_mcol_iscan_32x16, mcol_scan_32x16_neighbors },
- { mrow_scan_32x16, av1_mrow_iscan_32x16, mrow_scan_32x16_neighbors },
- { mcol_scan_32x16, av1_mcol_iscan_32x16, mcol_scan_32x16_neighbors },
- { mrow_scan_32x16, av1_mrow_iscan_32x16, mrow_scan_32x16_neighbors },
- { mcol_scan_32x16, av1_mcol_iscan_32x16, mcol_scan_32x16_neighbors },
+ { default_scan_32x16, av1_default_iscan_32x16 },
+ { default_scan_32x16, av1_default_iscan_32x16 },
+ { default_scan_32x16, av1_default_iscan_32x16 },
+ { default_scan_32x16, av1_default_iscan_32x16 },
+ { default_scan_32x16, av1_default_iscan_32x16 },
+ { default_scan_32x16, av1_default_iscan_32x16 },
+ { default_scan_32x16, av1_default_iscan_32x16 },
+ { default_scan_32x16, av1_default_iscan_32x16 },
+ { default_scan_32x16, av1_default_iscan_32x16 },
+ { default_scan_32x16, av1_default_iscan_32x16 },
+ { mrow_scan_32x16, av1_mrow_iscan_32x16 },
+ { mcol_scan_32x16, av1_mcol_iscan_32x16 },
+ { mrow_scan_32x16, av1_mrow_iscan_32x16 },
+ { mcol_scan_32x16, av1_mcol_iscan_32x16 },
+ { mrow_scan_32x16, av1_mrow_iscan_32x16 },
+ { mcol_scan_32x16, av1_mcol_iscan_32x16 },
},
{
// TX_32X64
// Half of the coefficients of tx64 at higher frequencies are set to
// zeros. So tx32's scan order is used.
- { default_scan_32x32, av1_default_iscan_32x32,
- default_scan_32x32_neighbors },
- { default_scan_32x32, av1_default_iscan_32x32,
- default_scan_32x32_neighbors },
- { default_scan_32x32, av1_default_iscan_32x32,
- default_scan_32x32_neighbors },
- { default_scan_32x32, av1_default_iscan_32x32,
- default_scan_32x32_neighbors },
- { default_scan_32x32, av1_default_iscan_32x32,
- default_scan_32x32_neighbors },
- { default_scan_32x32, av1_default_iscan_32x32,
- default_scan_32x32_neighbors },
- { default_scan_32x32, av1_default_iscan_32x32,
- default_scan_32x32_neighbors },
- { default_scan_32x32, av1_default_iscan_32x32,
- default_scan_32x32_neighbors },
- { default_scan_32x32, av1_default_iscan_32x32,
- default_scan_32x32_neighbors },
- { default_scan_32x32, av1_default_iscan_32x32,
- default_scan_32x32_neighbors },
- { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors },
- { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors },
- { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors },
- { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors },
- { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors },
- { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors },
+ { default_scan_32x32, av1_default_iscan_32x32 },
+ { default_scan_32x32, av1_default_iscan_32x32 },
+ { default_scan_32x32, av1_default_iscan_32x32 },
+ { default_scan_32x32, av1_default_iscan_32x32 },
+ { default_scan_32x32, av1_default_iscan_32x32 },
+ { default_scan_32x32, av1_default_iscan_32x32 },
+ { default_scan_32x32, av1_default_iscan_32x32 },
+ { default_scan_32x32, av1_default_iscan_32x32 },
+ { default_scan_32x32, av1_default_iscan_32x32 },
+ { default_scan_32x32, av1_default_iscan_32x32 },
+ { mrow_scan_32x32, av1_mrow_iscan_32x32 },
+ { mcol_scan_32x32, av1_mcol_iscan_32x32 },
+ { mrow_scan_32x32, av1_mrow_iscan_32x32 },
+ { mcol_scan_32x32, av1_mcol_iscan_32x32 },
+ { mrow_scan_32x32, av1_mrow_iscan_32x32 },
+ { mcol_scan_32x32, av1_mcol_iscan_32x32 },
},
{
// TX_64X32
// Half of the coefficients of tx64 at higher frequencies are set to
// zeros. So tx32's scan order is used.
- { default_scan_32x32, av1_default_iscan_32x32,
- default_scan_32x32_neighbors },
- { default_scan_32x32, av1_default_iscan_32x32,
- default_scan_32x32_neighbors },
- { default_scan_32x32, av1_default_iscan_32x32,
- default_scan_32x32_neighbors },
- { default_scan_32x32, av1_default_iscan_32x32,
- default_scan_32x32_neighbors },
- { default_scan_32x32, av1_default_iscan_32x32,
- default_scan_32x32_neighbors },
- { default_scan_32x32, av1_default_iscan_32x32,
- default_scan_32x32_neighbors },
- { default_scan_32x32, av1_default_iscan_32x32,
- default_scan_32x32_neighbors },
- { default_scan_32x32, av1_default_iscan_32x32,
- default_scan_32x32_neighbors },
- { default_scan_32x32, av1_default_iscan_32x32,
- default_scan_32x32_neighbors },
- { default_scan_32x32, av1_default_iscan_32x32,
- default_scan_32x32_neighbors },
- { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors },
- { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors },
- { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors },
- { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors },
- { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors },
- { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors },
+ { default_scan_32x32, av1_default_iscan_32x32 },
+ { default_scan_32x32, av1_default_iscan_32x32 },
+ { default_scan_32x32, av1_default_iscan_32x32 },
+ { default_scan_32x32, av1_default_iscan_32x32 },
+ { default_scan_32x32, av1_default_iscan_32x32 },
+ { default_scan_32x32, av1_default_iscan_32x32 },
+ { default_scan_32x32, av1_default_iscan_32x32 },
+ { default_scan_32x32, av1_default_iscan_32x32 },
+ { default_scan_32x32, av1_default_iscan_32x32 },
+ { default_scan_32x32, av1_default_iscan_32x32 },
+ { mrow_scan_32x32, av1_mrow_iscan_32x32 },
+ { mcol_scan_32x32, av1_mcol_iscan_32x32 },
+ { mrow_scan_32x32, av1_mrow_iscan_32x32 },
+ { mcol_scan_32x32, av1_mcol_iscan_32x32 },
+ { mrow_scan_32x32, av1_mrow_iscan_32x32 },
+ { mcol_scan_32x32, av1_mcol_iscan_32x32 },
},
{
// TX_4X16
- { default_scan_4x16, av1_default_iscan_4x16,
- default_scan_4x16_neighbors },
- { default_scan_4x16, av1_default_iscan_4x16,
- default_scan_4x16_neighbors },
- { default_scan_4x16, av1_default_iscan_4x16,
- default_scan_4x16_neighbors },
- { default_scan_4x16, av1_default_iscan_4x16,
- default_scan_4x16_neighbors },
- { default_scan_4x16, av1_default_iscan_4x16,
- default_scan_4x16_neighbors },
- { default_scan_4x16, av1_default_iscan_4x16,
- default_scan_4x16_neighbors },
- { default_scan_4x16, av1_default_iscan_4x16,
- default_scan_4x16_neighbors },
- { default_scan_4x16, av1_default_iscan_4x16,
- default_scan_4x16_neighbors },
- { default_scan_4x16, av1_default_iscan_4x16,
- default_scan_4x16_neighbors },
- { default_scan_4x16, av1_default_iscan_4x16,
- default_scan_4x16_neighbors },
- { mrow_scan_4x16, av1_mrow_iscan_4x16, mrow_scan_4x16_neighbors },
- { mcol_scan_4x16, av1_mcol_iscan_4x16, mcol_scan_4x16_neighbors },
- { mrow_scan_4x16, av1_mrow_iscan_4x16, mrow_scan_4x16_neighbors },
- { mcol_scan_4x16, av1_mcol_iscan_4x16, mcol_scan_4x16_neighbors },
- { mrow_scan_4x16, av1_mrow_iscan_4x16, mrow_scan_4x16_neighbors },
- { mcol_scan_4x16, av1_mcol_iscan_4x16, mcol_scan_4x16_neighbors },
+ { default_scan_4x16, av1_default_iscan_4x16 },
+ { default_scan_4x16, av1_default_iscan_4x16 },
+ { default_scan_4x16, av1_default_iscan_4x16 },
+ { default_scan_4x16, av1_default_iscan_4x16 },
+ { default_scan_4x16, av1_default_iscan_4x16 },
+ { default_scan_4x16, av1_default_iscan_4x16 },
+ { default_scan_4x16, av1_default_iscan_4x16 },
+ { default_scan_4x16, av1_default_iscan_4x16 },
+ { default_scan_4x16, av1_default_iscan_4x16 },
+ { default_scan_4x16, av1_default_iscan_4x16 },
+ { mrow_scan_4x16, av1_mrow_iscan_4x16 },
+ { mcol_scan_4x16, av1_mcol_iscan_4x16 },
+ { mrow_scan_4x16, av1_mrow_iscan_4x16 },
+ { mcol_scan_4x16, av1_mcol_iscan_4x16 },
+ { mrow_scan_4x16, av1_mrow_iscan_4x16 },
+ { mcol_scan_4x16, av1_mcol_iscan_4x16 },
},
{
// TX_16X4
- { default_scan_16x4, av1_default_iscan_16x4,
- default_scan_16x4_neighbors },
- { default_scan_16x4, av1_default_iscan_16x4,
- default_scan_16x4_neighbors },
- { default_scan_16x4, av1_default_iscan_16x4,
- default_scan_16x4_neighbors },
- { default_scan_16x4, av1_default_iscan_16x4,
- default_scan_16x4_neighbors },
- { default_scan_16x4, av1_default_iscan_16x4,
- default_scan_16x4_neighbors },
- { default_scan_16x4, av1_default_iscan_16x4,
- default_scan_16x4_neighbors },
- { default_scan_16x4, av1_default_iscan_16x4,
- default_scan_16x4_neighbors },
- { default_scan_16x4, av1_default_iscan_16x4,
- default_scan_16x4_neighbors },
- { default_scan_16x4, av1_default_iscan_16x4,
- default_scan_16x4_neighbors },
- { default_scan_16x4, av1_default_iscan_16x4,
- default_scan_16x4_neighbors },
- { mrow_scan_16x4, av1_mrow_iscan_16x4, mrow_scan_16x4_neighbors },
- { mcol_scan_16x4, av1_mcol_iscan_16x4, mcol_scan_16x4_neighbors },
- { mrow_scan_16x4, av1_mrow_iscan_16x4, mrow_scan_16x4_neighbors },
- { mcol_scan_16x4, av1_mcol_iscan_16x4, mcol_scan_16x4_neighbors },
- { mrow_scan_16x4, av1_mrow_iscan_16x4, mrow_scan_16x4_neighbors },
- { mcol_scan_16x4, av1_mcol_iscan_16x4, mcol_scan_16x4_neighbors },
+ { default_scan_16x4, av1_default_iscan_16x4 },
+ { default_scan_16x4, av1_default_iscan_16x4 },
+ { default_scan_16x4, av1_default_iscan_16x4 },
+ { default_scan_16x4, av1_default_iscan_16x4 },
+ { default_scan_16x4, av1_default_iscan_16x4 },
+ { default_scan_16x4, av1_default_iscan_16x4 },
+ { default_scan_16x4, av1_default_iscan_16x4 },
+ { default_scan_16x4, av1_default_iscan_16x4 },
+ { default_scan_16x4, av1_default_iscan_16x4 },
+ { default_scan_16x4, av1_default_iscan_16x4 },
+ { mrow_scan_16x4, av1_mrow_iscan_16x4 },
+ { mcol_scan_16x4, av1_mcol_iscan_16x4 },
+ { mrow_scan_16x4, av1_mrow_iscan_16x4 },
+ { mcol_scan_16x4, av1_mcol_iscan_16x4 },
+ { mrow_scan_16x4, av1_mrow_iscan_16x4 },
+ { mcol_scan_16x4, av1_mcol_iscan_16x4 },
},
{
// TX_8X32
- { default_scan_8x32, av1_default_iscan_8x32,
- default_scan_8x32_neighbors },
- { default_scan_8x32, av1_default_iscan_8x32,
- default_scan_8x32_neighbors },
- { default_scan_8x32, av1_default_iscan_8x32,
- default_scan_8x32_neighbors },
- { default_scan_8x32, av1_default_iscan_8x32,
- default_scan_8x32_neighbors },
- { default_scan_8x32, av1_default_iscan_8x32,
- default_scan_8x32_neighbors },
- { default_scan_8x32, av1_default_iscan_8x32,
- default_scan_8x32_neighbors },
- { default_scan_8x32, av1_default_iscan_8x32,
- default_scan_8x32_neighbors },
- { default_scan_8x32, av1_default_iscan_8x32,
- default_scan_8x32_neighbors },
- { default_scan_8x32, av1_default_iscan_8x32,
- default_scan_8x32_neighbors },
- { default_scan_8x32, av1_default_iscan_8x32,
- default_scan_8x32_neighbors },
- { mrow_scan_8x32, av1_mrow_iscan_8x32, mrow_scan_8x32_neighbors },
- { mcol_scan_8x32, av1_mcol_iscan_8x32, mcol_scan_8x32_neighbors },
- { mrow_scan_8x32, av1_mrow_iscan_8x32, mrow_scan_8x32_neighbors },
- { mcol_scan_8x32, av1_mcol_iscan_8x32, mcol_scan_8x32_neighbors },
- { mrow_scan_8x32, av1_mrow_iscan_8x32, mrow_scan_8x32_neighbors },
- { mcol_scan_8x32, av1_mcol_iscan_8x32, mcol_scan_8x32_neighbors },
+ { default_scan_8x32, av1_default_iscan_8x32 },
+ { default_scan_8x32, av1_default_iscan_8x32 },
+ { default_scan_8x32, av1_default_iscan_8x32 },
+ { default_scan_8x32, av1_default_iscan_8x32 },
+ { default_scan_8x32, av1_default_iscan_8x32 },
+ { default_scan_8x32, av1_default_iscan_8x32 },
+ { default_scan_8x32, av1_default_iscan_8x32 },
+ { default_scan_8x32, av1_default_iscan_8x32 },
+ { default_scan_8x32, av1_default_iscan_8x32 },
+ { default_scan_8x32, av1_default_iscan_8x32 },
+ { mrow_scan_8x32, av1_mrow_iscan_8x32 },
+ { mcol_scan_8x32, av1_mcol_iscan_8x32 },
+ { mrow_scan_8x32, av1_mrow_iscan_8x32 },
+ { mcol_scan_8x32, av1_mcol_iscan_8x32 },
+ { mrow_scan_8x32, av1_mrow_iscan_8x32 },
+ { mcol_scan_8x32, av1_mcol_iscan_8x32 },
},
{
// TX_32X8
- { default_scan_32x8, av1_default_iscan_32x8,
- default_scan_32x8_neighbors },
- { default_scan_32x8, av1_default_iscan_32x8,
- default_scan_32x8_neighbors },
- { default_scan_32x8, av1_default_iscan_32x8,
- default_scan_32x8_neighbors },
- { default_scan_32x8, av1_default_iscan_32x8,
- default_scan_32x8_neighbors },
- { default_scan_32x8, av1_default_iscan_32x8,
- default_scan_32x8_neighbors },
- { default_scan_32x8, av1_default_iscan_32x8,
- default_scan_32x8_neighbors },
- { default_scan_32x8, av1_default_iscan_32x8,
- default_scan_32x8_neighbors },
- { default_scan_32x8, av1_default_iscan_32x8,
- default_scan_32x8_neighbors },
- { default_scan_32x8, av1_default_iscan_32x8,
- default_scan_32x8_neighbors },
- { default_scan_32x8, av1_default_iscan_32x8,
- default_scan_32x8_neighbors },
- { mrow_scan_32x8, av1_mrow_iscan_32x8, mrow_scan_32x8_neighbors },
- { mcol_scan_32x8, av1_mcol_iscan_32x8, mcol_scan_32x8_neighbors },
- { mrow_scan_32x8, av1_mrow_iscan_32x8, mrow_scan_32x8_neighbors },
- { mcol_scan_32x8, av1_mcol_iscan_32x8, mcol_scan_32x8_neighbors },
- { mrow_scan_32x8, av1_mrow_iscan_32x8, mrow_scan_32x8_neighbors },
- { mcol_scan_32x8, av1_mcol_iscan_32x8, mcol_scan_32x8_neighbors },
+ { default_scan_32x8, av1_default_iscan_32x8 },
+ { default_scan_32x8, av1_default_iscan_32x8 },
+ { default_scan_32x8, av1_default_iscan_32x8 },
+ { default_scan_32x8, av1_default_iscan_32x8 },
+ { default_scan_32x8, av1_default_iscan_32x8 },
+ { default_scan_32x8, av1_default_iscan_32x8 },
+ { default_scan_32x8, av1_default_iscan_32x8 },
+ { default_scan_32x8, av1_default_iscan_32x8 },
+ { default_scan_32x8, av1_default_iscan_32x8 },
+ { default_scan_32x8, av1_default_iscan_32x8 },
+ { mrow_scan_32x8, av1_mrow_iscan_32x8 },
+ { mcol_scan_32x8, av1_mcol_iscan_32x8 },
+ { mrow_scan_32x8, av1_mrow_iscan_32x8 },
+ { mcol_scan_32x8, av1_mcol_iscan_32x8 },
+ { mrow_scan_32x8, av1_mrow_iscan_32x8 },
+ { mcol_scan_32x8, av1_mcol_iscan_32x8 },
},
{
// TX_16X64
// Half of the coefficients of tx64 at higher frequencies are set to
// zeros. So tx32's scan order is used.
- { default_scan_16x32, av1_default_iscan_16x32,
- default_scan_16x32_neighbors },
- { default_scan_16x32, av1_default_iscan_16x32,
- default_scan_16x32_neighbors },
- { default_scan_16x32, av1_default_iscan_16x32,
- default_scan_16x32_neighbors },
- { default_scan_16x32, av1_default_iscan_16x32,
- default_scan_16x32_neighbors },
- { default_scan_16x32, av1_default_iscan_16x32,
- default_scan_16x32_neighbors },
- { default_scan_16x32, av1_default_iscan_16x32,
- default_scan_16x32_neighbors },
- { default_scan_16x32, av1_default_iscan_16x32,
- default_scan_16x32_neighbors },
- { default_scan_16x32, av1_default_iscan_16x32,
- default_scan_16x32_neighbors },
- { default_scan_16x32, av1_default_iscan_16x32,
- default_scan_16x32_neighbors },
- { default_scan_16x32, av1_default_iscan_16x32,
- default_scan_16x32_neighbors },
- { mrow_scan_16x32, av1_mrow_iscan_16x32, mrow_scan_16x32_neighbors },
- { mcol_scan_16x32, av1_mcol_iscan_16x32, mcol_scan_16x32_neighbors },
- { mrow_scan_16x32, av1_mrow_iscan_16x32, mrow_scan_16x32_neighbors },
- { mcol_scan_16x32, av1_mcol_iscan_16x32, mcol_scan_16x32_neighbors },
- { mrow_scan_16x32, av1_mrow_iscan_16x32, mrow_scan_16x32_neighbors },
- { mcol_scan_16x32, av1_mcol_iscan_16x32, mcol_scan_16x32_neighbors },
+ { default_scan_16x32, av1_default_iscan_16x32 },
+ { default_scan_16x32, av1_default_iscan_16x32 },
+ { default_scan_16x32, av1_default_iscan_16x32 },
+ { default_scan_16x32, av1_default_iscan_16x32 },
+ { default_scan_16x32, av1_default_iscan_16x32 },
+ { default_scan_16x32, av1_default_iscan_16x32 },
+ { default_scan_16x32, av1_default_iscan_16x32 },
+ { default_scan_16x32, av1_default_iscan_16x32 },
+ { default_scan_16x32, av1_default_iscan_16x32 },
+ { default_scan_16x32, av1_default_iscan_16x32 },
+ { mrow_scan_16x32, av1_mrow_iscan_16x32 },
+ { mcol_scan_16x32, av1_mcol_iscan_16x32 },
+ { mrow_scan_16x32, av1_mrow_iscan_16x32 },
+ { mcol_scan_16x32, av1_mcol_iscan_16x32 },
+ { mrow_scan_16x32, av1_mrow_iscan_16x32 },
+ { mcol_scan_16x32, av1_mcol_iscan_16x32 },
},
{
// TX_64X16
// Half of the coefficients of tx64 at higher frequencies are set to
// zeros. So tx32's scan order is used.
- { default_scan_32x16, av1_default_iscan_32x16,
- default_scan_32x16_neighbors },
- { default_scan_32x16, av1_default_iscan_32x16,
- default_scan_32x16_neighbors },
- { default_scan_32x16, av1_default_iscan_32x16,
- default_scan_32x16_neighbors },
- { default_scan_32x16, av1_default_iscan_32x16,
- default_scan_32x16_neighbors },
- { default_scan_32x16, av1_default_iscan_32x16,
- default_scan_32x16_neighbors },
- { default_scan_32x16, av1_default_iscan_32x16,
- default_scan_32x16_neighbors },
- { default_scan_32x16, av1_default_iscan_32x16,
- default_scan_32x16_neighbors },
- { default_scan_32x16, av1_default_iscan_32x16,
- default_scan_32x16_neighbors },
- { default_scan_32x16, av1_default_iscan_32x16,
- default_scan_32x16_neighbors },
- { default_scan_32x16, av1_default_iscan_32x16,
- default_scan_32x16_neighbors },
- { mrow_scan_32x16, av1_mrow_iscan_32x16, mrow_scan_32x16_neighbors },
- { mcol_scan_32x16, av1_mcol_iscan_32x16, mcol_scan_32x16_neighbors },
- { mrow_scan_32x16, av1_mrow_iscan_32x16, mrow_scan_32x16_neighbors },
- { mcol_scan_32x16, av1_mcol_iscan_32x16, mcol_scan_32x16_neighbors },
- { mrow_scan_32x16, av1_mrow_iscan_32x16, mrow_scan_32x16_neighbors },
- { mcol_scan_32x16, av1_mcol_iscan_32x16, mcol_scan_32x16_neighbors },
+ { default_scan_32x16, av1_default_iscan_32x16 },
+ { default_scan_32x16, av1_default_iscan_32x16 },
+ { default_scan_32x16, av1_default_iscan_32x16 },
+ { default_scan_32x16, av1_default_iscan_32x16 },
+ { default_scan_32x16, av1_default_iscan_32x16 },
+ { default_scan_32x16, av1_default_iscan_32x16 },
+ { default_scan_32x16, av1_default_iscan_32x16 },
+ { default_scan_32x16, av1_default_iscan_32x16 },
+ { default_scan_32x16, av1_default_iscan_32x16 },
+ { default_scan_32x16, av1_default_iscan_32x16 },
+ { mrow_scan_32x16, av1_mrow_iscan_32x16 },
+ { mcol_scan_32x16, av1_mcol_iscan_32x16 },
+ { mrow_scan_32x16, av1_mrow_iscan_32x16 },
+ { mcol_scan_32x16, av1_mcol_iscan_32x16 },
+ { mrow_scan_32x16, av1_mrow_iscan_32x16 },
+ { mcol_scan_32x16, av1_mcol_iscan_32x16 },
},
};
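Each TX_SIZES_ALL row above keeps the same pattern: the first ten TX_TYPES share the default (diagonal) scan for that size, and the remaining 1-D transform types alternate between the mrow and mcol scans. A hedged usage sketch of a lookup through this table (the helper and member names below are illustrative, not code from this patch):

static void visit_in_scan_order(TX_SIZE tx_size, TX_TYPE tx_type,
                                int n_coeffs, const tran_low_t *coeffs) {
  const SCAN_ORDER *const so = &av1_scan_orders[tx_size][tx_type];
  for (int i = 0; i < n_coeffs; ++i) {
    const int raster_idx = so->scan[i]; /* i-th coefficient in coding order */
    (void)coeffs[raster_idx];           /* so->iscan[raster_idx] == i */
  }
}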
diff --git a/media/libaom/src/av1/common/scan.h b/media/libaom/src/av1/common/scan.h
index 233dc0efa..d9620e1c5 100644
--- a/media/libaom/src/av1/common/scan.h
+++ b/media/libaom/src/av1/common/scan.h
@@ -15,9 +15,9 @@
#include "aom/aom_integer.h"
#include "aom_ports/mem.h"
-#include "av1/common/enums.h"
-#include "av1/common/onyxc_int.h"
+#include "av1/common/av1_common_int.h"
#include "av1/common/blockd.h"
+#include "av1/common/enums.h"
#ifdef __cplusplus
extern "C" {
@@ -25,14 +25,14 @@ extern "C" {
#define MAX_NEIGHBORS 2
-typedef enum SCAN_MODE {
+enum {
SCAN_MODE_ZIG_ZAG,
SCAN_MODE_COL_DIAG,
SCAN_MODE_ROW_DIAG,
SCAN_MODE_COL_1D,
SCAN_MODE_ROW_1D,
SCAN_MODES
-} SCAN_MODE;
+} UENUM1BYTE(SCAN_MODE);
extern const SCAN_ORDER av1_default_scan_orders[TX_SIZES];
extern const SCAN_ORDER av1_scan_orders[TX_SIZES_ALL][TX_TYPES];
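The typedef-enum here (and in the headers below) is rewritten with the UENUM1BYTE helper. As a rough sketch of the intent (the expansion below is an assumption; the real macro lives in libaom's own headers), the enum stays anonymous and the type name becomes a one-byte typedef, so tables indexed by these values stay compact:

#include <stdint.h>

/* Assumed expansion, for illustration only. */
#define UENUM1BYTE(enumvar) \
  ;                         \
  typedef uint8_t enumvar

enum { EXAMPLE_A, EXAMPLE_B, EXAMPLE_COUNT } UENUM1BYTE(EXAMPLE_MODE);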
diff --git a/media/libaom/src/av1/common/seg_common.c b/media/libaom/src/av1/common/seg_common.c
index cd189ad76..60b185161 100644
--- a/media/libaom/src/av1/common/seg_common.c
+++ b/media/libaom/src/av1/common/seg_common.c
@@ -16,12 +16,19 @@
#include "av1/common/seg_common.h"
#include "av1/common/quant_common.h"
-static const int seg_feature_data_signed[SEG_LVL_MAX] = { 1, 1, 1, 1, 1, 0, 0 };
-
-static const int seg_feature_data_max[SEG_LVL_MAX] = {
- MAXQ, MAX_LOOP_FILTER, MAX_LOOP_FILTER, MAX_LOOP_FILTER, MAX_LOOP_FILTER, 7, 0
+static const int seg_feature_data_signed[SEG_LVL_MAX] = {
+ 1, 1, 1, 1, 1, 0, 0, 0
};
+static const int seg_feature_data_max[SEG_LVL_MAX] = { MAXQ,
+ MAX_LOOP_FILTER,
+ MAX_LOOP_FILTER,
+ MAX_LOOP_FILTER,
+ MAX_LOOP_FILTER,
+ 7,
+ 0,
+ 0 };
+
// These functions provide access to new segment level features.
// Eventually these function may be "optimized out" but for the moment,
// the coding mechanism is still subject to change so these provide a
@@ -32,7 +39,7 @@ void av1_clearall_segfeatures(struct segmentation *seg) {
av1_zero(seg->feature_mask);
}
-void calculate_segdata(struct segmentation *seg) {
+void av1_calculate_segdata(struct segmentation *seg) {
seg->segid_preskip = 0;
seg->last_active_segid = 0;
for (int i = 0; i < MAX_SEGMENTS; i++) {
diff --git a/media/libaom/src/av1/common/seg_common.h b/media/libaom/src/av1/common/seg_common.h
index 8c35bba86..aeb9c1768 100644
--- a/media/libaom/src/av1/common/seg_common.h
+++ b/media/libaom/src/av1/common/seg_common.h
@@ -24,7 +24,7 @@ extern "C" {
#define SEG_TEMPORAL_PRED_CTXS 3
#define SPATIAL_PREDICTION_PROBS 3
-typedef enum {
+enum {
SEG_LVL_ALT_Q, // Use alternate Quantizer ....
SEG_LVL_ALT_LF_Y_V, // Use alternate loop filter value on y plane vertical
SEG_LVL_ALT_LF_Y_H, // Use alternate loop filter value on y plane horizontal
@@ -34,7 +34,7 @@ typedef enum {
SEG_LVL_SKIP, // Optional Segment (0,0) + skip mode
SEG_LVL_GLOBALMV,
SEG_LVL_MAX
-} SEG_LVL_FEATURES;
+} UENUM1BYTE(SEG_LVL_FEATURES);
struct segmentation {
uint8_t enabled;
@@ -83,7 +83,7 @@ void av1_clearall_segfeatures(struct segmentation *seg);
void av1_enable_segfeature(struct segmentation *seg, int segment_id,
SEG_LVL_FEATURES feature_id);
-void calculate_segdata(struct segmentation *seg);
+void av1_calculate_segdata(struct segmentation *seg);
int av1_seg_feature_data_max(SEG_LVL_FEATURES feature_id);
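With SEG_LVL_GLOBALMV counted, SEG_LVL_MAX evaluates to eight, which is why both SEG_LVL_MAX-sized tables in seg_common.c above gained an eighth initializer. A hedged sketch of how such a table is typically consumed when a parsed segment value is stored (the clamp helper below is illustrative, not code from this patch; only av1_seg_feature_data_max() is declared in this header):

static int clamp_segdata(SEG_LVL_FEATURES feature_id, int value,
                         int is_signed) {
  const int data_max = av1_seg_feature_data_max(feature_id);
  const int data_min = is_signed ? -data_max : 0;
  if (value < data_min) return data_min;
  if (value > data_max) return data_max;
  return value;
}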
diff --git a/media/libaom/src/av1/common/thread_common.c b/media/libaom/src/av1/common/thread_common.c
index 8df4c9a09..f3c8795f8 100644
--- a/media/libaom/src/av1/common/thread_common.c
+++ b/media/libaom/src/av1/common/thread_common.c
@@ -205,7 +205,11 @@ static INLINE void sync_write(AV1LfSync *const lf_sync, int r, int c,
}
static void enqueue_lf_jobs(AV1LfSync *lf_sync, AV1_COMMON *cm, int start,
- int stop, int plane_start, int plane_end) {
+ int stop,
+#if CONFIG_LPF_MASK
+ int is_decoding,
+#endif
+ int plane_start, int plane_end) {
int mi_row, plane, dir;
AV1LfMTInfo *lf_job_queue = lf_sync->job_queue;
lf_sync->jobs_enqueued = 0;
@@ -219,7 +223,16 @@ static void enqueue_lf_jobs(AV1LfSync *lf_sync, AV1_COMMON *cm, int start,
continue;
else if (plane == 2 && !(cm->lf.filter_level_v))
continue;
- for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) {
+#if CONFIG_LPF_MASK
+ int step = MAX_MIB_SIZE;
+ if (is_decoding) {
+ step = MI_SIZE_64X64;
+ }
+ for (mi_row = start; mi_row < stop; mi_row += step)
+#else
+ for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE)
+#endif
+ {
lf_job_queue->mi_row = mi_row;
lf_job_queue->plane = plane;
lf_job_queue->dir = dir;
@@ -230,7 +243,7 @@ static void enqueue_lf_jobs(AV1LfSync *lf_sync, AV1_COMMON *cm, int start,
}
}
-AV1LfMTInfo *get_lf_job_info(AV1LfSync *lf_sync) {
+static AV1LfMTInfo *get_lf_job_info(AV1LfSync *lf_sync) {
AV1LfMTInfo *cur_job_info = NULL;
#if CONFIG_MULTITHREAD
@@ -255,7 +268,8 @@ static INLINE void thread_loop_filter_rows(
struct macroblockd_plane *planes, MACROBLOCKD *xd,
AV1LfSync *const lf_sync) {
const int sb_cols =
- ALIGN_POWER_OF_TWO(cm->mi_cols, MAX_MIB_SIZE_LOG2) >> MAX_MIB_SIZE_LOG2;
+ ALIGN_POWER_OF_TWO(cm->mi_params.mi_cols, MAX_MIB_SIZE_LOG2) >>
+ MAX_MIB_SIZE_LOG2;
int mi_row, mi_col, plane, dir;
int r, c;
@@ -269,7 +283,8 @@ static INLINE void thread_loop_filter_rows(
r = mi_row >> MAX_MIB_SIZE_LOG2;
if (dir == 0) {
- for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MAX_MIB_SIZE) {
+ for (mi_col = 0; mi_col < cm->mi_params.mi_cols;
+ mi_col += MAX_MIB_SIZE) {
c = mi_col >> MAX_MIB_SIZE_LOG2;
av1_setup_dst_planes(planes, cm->seq_params.sb_size, frame_buffer,
@@ -280,7 +295,8 @@ static INLINE void thread_loop_filter_rows(
sync_write(lf_sync, r, c, sb_cols, plane);
}
} else if (dir == 1) {
- for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MAX_MIB_SIZE) {
+ for (mi_col = 0; mi_col < cm->mi_params.mi_cols;
+ mi_col += MAX_MIB_SIZE) {
c = mi_col >> MAX_MIB_SIZE_LOG2;
// Wait for vertical edge filtering of the top-right block to be
@@ -312,15 +328,98 @@ static int loop_filter_row_worker(void *arg1, void *arg2) {
return 1;
}
+#if CONFIG_LPF_MASK
+static INLINE void thread_loop_filter_bitmask_rows(
+ const YV12_BUFFER_CONFIG *const frame_buffer, AV1_COMMON *const cm,
+ struct macroblockd_plane *planes, MACROBLOCKD *xd,
+ AV1LfSync *const lf_sync) {
+ const int sb_cols =
+ ALIGN_POWER_OF_TWO(cm->mi_params.mi_cols, MIN_MIB_SIZE_LOG2) >>
+ MIN_MIB_SIZE_LOG2;
+ int mi_row, mi_col, plane, dir;
+ int r, c;
+ (void)xd;
+
+ while (1) {
+ AV1LfMTInfo *cur_job_info = get_lf_job_info(lf_sync);
+
+ if (cur_job_info != NULL) {
+ mi_row = cur_job_info->mi_row;
+ plane = cur_job_info->plane;
+ dir = cur_job_info->dir;
+ r = mi_row >> MIN_MIB_SIZE_LOG2;
+
+ if (dir == 0) {
+ for (mi_col = 0; mi_col < cm->mi_params.mi_cols;
+ mi_col += MI_SIZE_64X64) {
+ c = mi_col >> MIN_MIB_SIZE_LOG2;
+
+ av1_setup_dst_planes(planes, BLOCK_64X64, frame_buffer, mi_row,
+ mi_col, plane, plane + 1);
+
+ av1_filter_block_plane_bitmask_vert(cm, &planes[plane], plane, mi_row,
+ mi_col);
+ sync_write(lf_sync, r, c, sb_cols, plane);
+ }
+ } else if (dir == 1) {
+ for (mi_col = 0; mi_col < cm->mi_params.mi_cols;
+ mi_col += MI_SIZE_64X64) {
+ c = mi_col >> MIN_MIB_SIZE_LOG2;
+
+ // Wait for vertical edge filtering of the top-right block to be
+ // completed
+ sync_read(lf_sync, r, c, plane);
+
+ // Wait for vertical edge filtering of the right block to be
+ // completed
+ sync_read(lf_sync, r + 1, c, plane);
+
+ av1_setup_dst_planes(planes, BLOCK_64X64, frame_buffer, mi_row,
+ mi_col, plane, plane + 1);
+ av1_filter_block_plane_bitmask_horz(cm, &planes[plane], plane, mi_row,
+ mi_col);
+ }
+ }
+ } else {
+ break;
+ }
+ }
+}
+
+// Row-based multi-threaded loopfilter hook
+static int loop_filter_bitmask_row_worker(void *arg1, void *arg2) {
+ AV1LfSync *const lf_sync = (AV1LfSync *)arg1;
+ LFWorkerData *const lf_data = (LFWorkerData *)arg2;
+ thread_loop_filter_bitmask_rows(lf_data->frame_buffer, lf_data->cm,
+ lf_data->planes, lf_data->xd, lf_sync);
+ return 1;
+}
+#endif // CONFIG_LPF_MASK
+
static void loop_filter_rows_mt(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
MACROBLOCKD *xd, int start, int stop,
int plane_start, int plane_end,
+#if CONFIG_LPF_MASK
+ int is_decoding,
+#endif
AVxWorker *workers, int nworkers,
AV1LfSync *lf_sync) {
const AVxWorkerInterface *const winterface = aom_get_worker_interface();
+#if CONFIG_LPF_MASK
+ int sb_rows;
+ if (is_decoding) {
+ sb_rows = ALIGN_POWER_OF_TWO(cm->mi_params.mi_rows, MIN_MIB_SIZE_LOG2) >>
+ MIN_MIB_SIZE_LOG2;
+ } else {
+ sb_rows = ALIGN_POWER_OF_TWO(cm->mi_params.mi_rows, MAX_MIB_SIZE_LOG2) >>
+ MAX_MIB_SIZE_LOG2;
+ }
+#else
// Number of superblock rows and cols
const int sb_rows =
- ALIGN_POWER_OF_TWO(cm->mi_rows, MAX_MIB_SIZE_LOG2) >> MAX_MIB_SIZE_LOG2;
+ ALIGN_POWER_OF_TWO(cm->mi_params.mi_rows, MAX_MIB_SIZE_LOG2) >>
+ MAX_MIB_SIZE_LOG2;
+#endif
const int num_workers = nworkers;
int i;
@@ -336,14 +435,26 @@ static void loop_filter_rows_mt(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
sizeof(*(lf_sync->cur_sb_col[i])) * sb_rows);
}
- enqueue_lf_jobs(lf_sync, cm, start, stop, plane_start, plane_end);
+ enqueue_lf_jobs(lf_sync, cm, start, stop,
+#if CONFIG_LPF_MASK
+ is_decoding,
+#endif
+ plane_start, plane_end);
// Set up loopfilter thread data.
for (i = 0; i < num_workers; ++i) {
AVxWorker *const worker = &workers[i];
LFWorkerData *const lf_data = &lf_sync->lfdata[i];
+#if CONFIG_LPF_MASK
+ if (is_decoding) {
+ worker->hook = loop_filter_bitmask_row_worker;
+ } else {
+ worker->hook = loop_filter_row_worker;
+ }
+#else
worker->hook = loop_filter_row_worker;
+#endif
worker->data1 = lf_sync;
worker->data2 = lf_data;
@@ -366,22 +477,55 @@ static void loop_filter_rows_mt(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
void av1_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
MACROBLOCKD *xd, int plane_start, int plane_end,
- int partial_frame, AVxWorker *workers,
- int num_workers, AV1LfSync *lf_sync) {
+ int partial_frame,
+#if CONFIG_LPF_MASK
+ int is_decoding,
+#endif
+ AVxWorker *workers, int num_workers,
+ AV1LfSync *lf_sync) {
int start_mi_row, end_mi_row, mi_rows_to_filter;
start_mi_row = 0;
- mi_rows_to_filter = cm->mi_rows;
- if (partial_frame && cm->mi_rows > 8) {
- start_mi_row = cm->mi_rows >> 1;
+ mi_rows_to_filter = cm->mi_params.mi_rows;
+ if (partial_frame && cm->mi_params.mi_rows > 8) {
+ start_mi_row = cm->mi_params.mi_rows >> 1;
start_mi_row &= 0xfffffff8;
- mi_rows_to_filter = AOMMAX(cm->mi_rows / 8, 8);
+ mi_rows_to_filter = AOMMAX(cm->mi_params.mi_rows / 8, 8);
}
end_mi_row = start_mi_row + mi_rows_to_filter;
av1_loop_filter_frame_init(cm, plane_start, plane_end);
+#if CONFIG_LPF_MASK
+ if (is_decoding) {
+ cm->is_decoding = is_decoding;
+      // TODO(chengchen): currently uses one thread to build bitmasks for the
+      // frame. Make it support multi-threading later.
+ for (int plane = plane_start; plane < plane_end; plane++) {
+ if (plane == 0 && !(cm->lf.filter_level[0]) && !(cm->lf.filter_level[1]))
+ break;
+ else if (plane == 1 && !(cm->lf.filter_level_u))
+ continue;
+ else if (plane == 2 && !(cm->lf.filter_level_v))
+ continue;
+
+ // TODO(chengchen): can we remove this?
+ struct macroblockd_plane *pd = xd->plane;
+ av1_setup_dst_planes(pd, cm->seq_params.sb_size, frame, 0, 0, plane,
+ plane + 1);
+
+ av1_build_bitmask_vert_info(cm, &pd[plane], plane);
+ av1_build_bitmask_horz_info(cm, &pd[plane], plane);
+ }
+ loop_filter_rows_mt(frame, cm, xd, start_mi_row, end_mi_row, plane_start,
+ plane_end, 1, workers, num_workers, lf_sync);
+ } else {
+ loop_filter_rows_mt(frame, cm, xd, start_mi_row, end_mi_row, plane_start,
+ plane_end, 0, workers, num_workers, lf_sync);
+ }
+#else
loop_filter_rows_mt(frame, cm, xd, start_mi_row, end_mi_row, plane_start,
plane_end, workers, num_workers, lf_sync);
+#endif
}
static INLINE void lr_sync_read(void *const lr_sync, int r, int c, int plane) {
@@ -630,7 +774,7 @@ static void enqueue_lr_jobs(AV1LrSync *lr_sync, AV1LrStruct *lr_ctxt,
}
}
-AV1LrMTInfo *get_lr_job_info(AV1LrSync *lr_sync) {
+static AV1LrMTInfo *get_lr_job_info(AV1LrSync *lr_sync) {
AV1LrMTInfo *cur_job_info = NULL;
#if CONFIG_MULTITHREAD
@@ -664,9 +808,9 @@ static int loop_restoration_row_worker(void *arg1, void *arg2) {
typedef void (*copy_fun)(const YV12_BUFFER_CONFIG *src_ybc,
YV12_BUFFER_CONFIG *dst_ybc, int hstart, int hend,
int vstart, int vend);
- static const copy_fun copy_funs[3] = {
- aom_yv12_partial_copy_y, aom_yv12_partial_copy_u, aom_yv12_partial_copy_v
- };
+ static const copy_fun copy_funs[3] = { aom_yv12_partial_coloc_copy_y,
+ aom_yv12_partial_coloc_copy_u,
+ aom_yv12_partial_coloc_copy_v };
while (1) {
AV1LrMTInfo *cur_job_info = get_lr_job_info(lr_sync);
@@ -772,7 +916,7 @@ void av1_loop_restoration_filter_frame_mt(YV12_BUFFER_CONFIG *frame,
AV1_COMMON *cm, int optimized_lr,
AVxWorker *workers, int num_workers,
AV1LrSync *lr_sync, void *lr_ctxt) {
- assert(!cm->all_lossless);
+ assert(!cm->features.all_lossless);
const int num_planes = av1_num_planes(cm);
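
For context, a minimal standalone sketch (not part of the imported patch) of how the mi-unit frame height is turned into the superblock row counts used by loop_filter_rows_mt() above. The ALIGN_POWER_OF_TWO definition is assumed to match aom_dsp/aom_dsp_common.h, and the two shift constants correspond to 128x128 (MAX) versus 64x64 (MIN) units; the CONFIG_LPF_MASK decode path uses the 64x64 granularity because the bitmask filtering walks the frame in 64x64 steps.

/*
 * Hypothetical sketch only; constants and macro are assumptions matching
 * the libaom headers, not code from this patch.
 */
#include <stdio.h>

#define ALIGN_POWER_OF_TWO(value, n) \
  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
#define MAX_MIB_SIZE_LOG2 5 /* 128x128 superblock = 32x32 mi units */
#define MIN_MIB_SIZE_LOG2 4 /* 64x64 block = 16x16 mi units */

int main(void) {
  const int mi_rows = 135; /* e.g. a 540-pixel-tall frame in 4x4 mi units */
  const int sb_rows =
      ALIGN_POWER_OF_TWO(mi_rows, MAX_MIB_SIZE_LOG2) >> MAX_MIB_SIZE_LOG2;
  const int sb64_rows =
      ALIGN_POWER_OF_TWO(mi_rows, MIN_MIB_SIZE_LOG2) >> MIN_MIB_SIZE_LOG2;
  printf("128x128 rows: %d, 64x64 rows: %d\n", sb_rows, sb64_rows);
  return 0;
}
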
diff --git a/media/libaom/src/av1/common/thread_common.h b/media/libaom/src/av1/common/thread_common.h
index 23d61d72a..7397f1c54 100644
--- a/media/libaom/src/av1/common/thread_common.h
+++ b/media/libaom/src/av1/common/thread_common.h
@@ -101,8 +101,11 @@ typedef struct AV1LrSyncData {
void av1_loop_filter_dealloc(AV1LfSync *lf_sync);
void av1_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame, struct AV1Common *cm,
- struct macroblockd *mbd, int plane_start,
+ struct macroblockd *xd, int plane_start,
int plane_end, int partial_frame,
+#if CONFIG_LPF_MASK
+ int is_decoding,
+#endif
AVxWorker *workers, int num_workers,
AV1LfSync *lf_sync);
void av1_loop_restoration_filter_frame_mt(YV12_BUFFER_CONFIG *frame,
diff --git a/media/libaom/src/av1/common/tile_common.c b/media/libaom/src/av1/common/tile_common.c
index 1b413487f..1b11bd760 100644
--- a/media/libaom/src/av1/common/tile_common.c
+++ b/media/libaom/src/av1/common/tile_common.c
@@ -9,9 +9,9 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#include "av1/common/tile_common.h"
-#include "av1/common/onyxc_int.h"
+#include "av1/common/av1_common_int.h"
#include "av1/common/resize.h"
+#include "av1/common/tile_common.h"
#include "aom_dsp/aom_dsp_common.h"
void av1_tile_init(TileInfo *tile, const AV1_COMMON *cm, int row, int col) {
@@ -28,102 +28,126 @@ static int tile_log2(int blk_size, int target) {
}
void av1_get_tile_limits(AV1_COMMON *const cm) {
- int mi_cols = ALIGN_POWER_OF_TWO(cm->mi_cols, cm->seq_params.mib_size_log2);
- int mi_rows = ALIGN_POWER_OF_TWO(cm->mi_rows, cm->seq_params.mib_size_log2);
- int sb_cols = mi_cols >> cm->seq_params.mib_size_log2;
- int sb_rows = mi_rows >> cm->seq_params.mib_size_log2;
-
- int sb_size_log2 = cm->seq_params.mib_size_log2 + MI_SIZE_LOG2;
- cm->max_tile_width_sb = MAX_TILE_WIDTH >> sb_size_log2;
- int max_tile_area_sb = MAX_TILE_AREA >> (2 * sb_size_log2);
-
- cm->min_log2_tile_cols = tile_log2(cm->max_tile_width_sb, sb_cols);
- cm->max_log2_tile_cols = tile_log2(1, AOMMIN(sb_cols, MAX_TILE_COLS));
- cm->max_log2_tile_rows = tile_log2(1, AOMMIN(sb_rows, MAX_TILE_ROWS));
- cm->min_log2_tiles = tile_log2(max_tile_area_sb, sb_cols * sb_rows);
- cm->min_log2_tiles = AOMMAX(cm->min_log2_tiles, cm->min_log2_tile_cols);
+ const SequenceHeader *const seq_params = &cm->seq_params;
+ CommonTileParams *const tiles = &cm->tiles;
+ const int mi_cols =
+ ALIGN_POWER_OF_TWO(cm->mi_params.mi_cols, seq_params->mib_size_log2);
+ const int mi_rows =
+ ALIGN_POWER_OF_TWO(cm->mi_params.mi_rows, seq_params->mib_size_log2);
+ const int sb_cols = mi_cols >> seq_params->mib_size_log2;
+ const int sb_rows = mi_rows >> seq_params->mib_size_log2;
+
+ const int sb_size_log2 = seq_params->mib_size_log2 + MI_SIZE_LOG2;
+ tiles->max_width_sb = MAX_TILE_WIDTH >> sb_size_log2;
+ const int max_tile_area_sb = MAX_TILE_AREA >> (2 * sb_size_log2);
+
+ tiles->min_log2_cols = tile_log2(tiles->max_width_sb, sb_cols);
+ tiles->max_log2_cols = tile_log2(1, AOMMIN(sb_cols, MAX_TILE_COLS));
+ tiles->max_log2_rows = tile_log2(1, AOMMIN(sb_rows, MAX_TILE_ROWS));
+ tiles->min_log2 = tile_log2(max_tile_area_sb, sb_cols * sb_rows);
+ tiles->min_log2 = AOMMAX(tiles->min_log2, tiles->min_log2_cols);
}
-void av1_calculate_tile_cols(AV1_COMMON *const cm) {
- int mi_cols = ALIGN_POWER_OF_TWO(cm->mi_cols, cm->seq_params.mib_size_log2);
- int mi_rows = ALIGN_POWER_OF_TWO(cm->mi_rows, cm->seq_params.mib_size_log2);
- int sb_cols = mi_cols >> cm->seq_params.mib_size_log2;
- int sb_rows = mi_rows >> cm->seq_params.mib_size_log2;
+void av1_calculate_tile_cols(const SequenceHeader *const seq_params,
+ int cm_mi_rows, int cm_mi_cols,
+ CommonTileParams *const tiles) {
+ int mi_cols = ALIGN_POWER_OF_TWO(cm_mi_cols, seq_params->mib_size_log2);
+ int mi_rows = ALIGN_POWER_OF_TWO(cm_mi_rows, seq_params->mib_size_log2);
+ int sb_cols = mi_cols >> seq_params->mib_size_log2;
+ int sb_rows = mi_rows >> seq_params->mib_size_log2;
int i;
- if (cm->uniform_tile_spacing_flag) {
+  // This will be overridden if there are at least two columns of tiles
+  // (otherwise there is no inner tile width)
+ tiles->min_inner_width = -1;
+
+ if (tiles->uniform_spacing) {
int start_sb;
- int size_sb = ALIGN_POWER_OF_TWO(sb_cols, cm->log2_tile_cols);
- size_sb >>= cm->log2_tile_cols;
+ int size_sb = ALIGN_POWER_OF_TWO(sb_cols, tiles->log2_cols);
+ size_sb >>= tiles->log2_cols;
assert(size_sb > 0);
for (i = 0, start_sb = 0; start_sb < sb_cols; i++) {
- cm->tile_col_start_sb[i] = start_sb;
+ tiles->col_start_sb[i] = start_sb;
start_sb += size_sb;
}
- cm->tile_cols = i;
- cm->tile_col_start_sb[i] = sb_cols;
- cm->min_log2_tile_rows = AOMMAX(cm->min_log2_tiles - cm->log2_tile_cols, 0);
- cm->max_tile_height_sb = sb_rows >> cm->min_log2_tile_rows;
-
- cm->tile_width = size_sb << cm->seq_params.mib_size_log2;
- cm->tile_width = AOMMIN(cm->tile_width, cm->mi_cols);
+ tiles->cols = i;
+ tiles->col_start_sb[i] = sb_cols;
+ tiles->min_log2_rows = AOMMAX(tiles->min_log2 - tiles->log2_cols, 0);
+ tiles->max_height_sb = sb_rows >> tiles->min_log2_rows;
+
+ tiles->width = size_sb << seq_params->mib_size_log2;
+ tiles->width = AOMMIN(tiles->width, cm_mi_cols);
+ if (tiles->cols > 1) {
+ tiles->min_inner_width = tiles->width;
+ }
} else {
int max_tile_area_sb = (sb_rows * sb_cols);
int widest_tile_sb = 1;
- cm->log2_tile_cols = tile_log2(1, cm->tile_cols);
- for (i = 0; i < cm->tile_cols; i++) {
- int size_sb = cm->tile_col_start_sb[i + 1] - cm->tile_col_start_sb[i];
+ int narrowest_inner_tile_sb = 65536;
+ tiles->log2_cols = tile_log2(1, tiles->cols);
+ for (i = 0; i < tiles->cols; i++) {
+ int size_sb = tiles->col_start_sb[i + 1] - tiles->col_start_sb[i];
widest_tile_sb = AOMMAX(widest_tile_sb, size_sb);
+      // Ignore the rightmost tile in the frame when determining the
+      // narrowest inner tile width
+ if (i < tiles->cols - 1)
+ narrowest_inner_tile_sb = AOMMIN(narrowest_inner_tile_sb, size_sb);
}
- if (cm->min_log2_tiles) {
- max_tile_area_sb >>= (cm->min_log2_tiles + 1);
+ if (tiles->min_log2) {
+ max_tile_area_sb >>= (tiles->min_log2 + 1);
+ }
+ tiles->max_height_sb = AOMMAX(max_tile_area_sb / widest_tile_sb, 1);
+ if (tiles->cols > 1) {
+ tiles->min_inner_width = narrowest_inner_tile_sb
+ << seq_params->mib_size_log2;
}
- cm->max_tile_height_sb = AOMMAX(max_tile_area_sb / widest_tile_sb, 1);
}
}
-void av1_calculate_tile_rows(AV1_COMMON *const cm) {
- int mi_rows = ALIGN_POWER_OF_TWO(cm->mi_rows, cm->seq_params.mib_size_log2);
- int sb_rows = mi_rows >> cm->seq_params.mib_size_log2;
+void av1_calculate_tile_rows(const SequenceHeader *const seq_params,
+ int cm_mi_rows, CommonTileParams *const tiles) {
+ int mi_rows = ALIGN_POWER_OF_TWO(cm_mi_rows, seq_params->mib_size_log2);
+ int sb_rows = mi_rows >> seq_params->mib_size_log2;
int start_sb, size_sb, i;
- if (cm->uniform_tile_spacing_flag) {
- size_sb = ALIGN_POWER_OF_TWO(sb_rows, cm->log2_tile_rows);
- size_sb >>= cm->log2_tile_rows;
+ if (tiles->uniform_spacing) {
+ size_sb = ALIGN_POWER_OF_TWO(sb_rows, tiles->log2_rows);
+ size_sb >>= tiles->log2_rows;
assert(size_sb > 0);
for (i = 0, start_sb = 0; start_sb < sb_rows; i++) {
- cm->tile_row_start_sb[i] = start_sb;
+ tiles->row_start_sb[i] = start_sb;
start_sb += size_sb;
}
- cm->tile_rows = i;
- cm->tile_row_start_sb[i] = sb_rows;
+ tiles->rows = i;
+ tiles->row_start_sb[i] = sb_rows;
- cm->tile_height = size_sb << cm->seq_params.mib_size_log2;
- cm->tile_height = AOMMIN(cm->tile_height, cm->mi_rows);
+ tiles->height = size_sb << seq_params->mib_size_log2;
+ tiles->height = AOMMIN(tiles->height, cm_mi_rows);
} else {
- cm->log2_tile_rows = tile_log2(1, cm->tile_rows);
+ tiles->log2_rows = tile_log2(1, tiles->rows);
}
}
void av1_tile_set_row(TileInfo *tile, const AV1_COMMON *cm, int row) {
- assert(row < cm->tile_rows);
- int mi_row_start = cm->tile_row_start_sb[row] << cm->seq_params.mib_size_log2;
- int mi_row_end = cm->tile_row_start_sb[row + 1]
+ assert(row < cm->tiles.rows);
+ int mi_row_start = cm->tiles.row_start_sb[row]
+ << cm->seq_params.mib_size_log2;
+ int mi_row_end = cm->tiles.row_start_sb[row + 1]
<< cm->seq_params.mib_size_log2;
tile->tile_row = row;
tile->mi_row_start = mi_row_start;
- tile->mi_row_end = AOMMIN(mi_row_end, cm->mi_rows);
+ tile->mi_row_end = AOMMIN(mi_row_end, cm->mi_params.mi_rows);
assert(tile->mi_row_end > tile->mi_row_start);
}
void av1_tile_set_col(TileInfo *tile, const AV1_COMMON *cm, int col) {
- assert(col < cm->tile_cols);
- int mi_col_start = cm->tile_col_start_sb[col] << cm->seq_params.mib_size_log2;
- int mi_col_end = cm->tile_col_start_sb[col + 1]
+ assert(col < cm->tiles.cols);
+ int mi_col_start = cm->tiles.col_start_sb[col]
+ << cm->seq_params.mib_size_log2;
+ int mi_col_end = cm->tiles.col_start_sb[col + 1]
<< cm->seq_params.mib_size_log2;
tile->tile_col = col;
tile->mi_col_start = mi_col_start;
- tile->mi_col_end = AOMMIN(mi_col_end, cm->mi_cols);
+ tile->mi_col_end = AOMMIN(mi_col_end, cm->mi_params.mi_cols);
assert(tile->mi_col_end > tile->mi_col_start);
}
@@ -143,30 +167,6 @@ int av1_get_sb_cols_in_tile(AV1_COMMON *cm, TileInfo tile) {
return sb_cols;
}
-int get_tile_size(int mi_frame_size, int log2_tile_num, int *ntiles) {
- // Round the frame up to a whole number of max superblocks
- mi_frame_size = ALIGN_POWER_OF_TWO(mi_frame_size, MAX_MIB_SIZE_LOG2);
-
- // Divide by the signalled number of tiles, rounding up to the multiple of
- // the max superblock size. To do this, shift right (and round up) to get the
- // tile size in max super-blocks and then shift left again to convert it to
- // mi units.
- const int shift = log2_tile_num + MAX_MIB_SIZE_LOG2;
- const int max_sb_tile_size =
- ALIGN_POWER_OF_TWO(mi_frame_size, shift) >> shift;
- const int mi_tile_size = max_sb_tile_size << MAX_MIB_SIZE_LOG2;
-
- // The actual number of tiles is the ceiling of the frame size in mi units
- // divided by mi_size. This is at most 1 << log2_tile_num but might be
- // strictly less if max_sb_tile_size got rounded up significantly.
- if (ntiles) {
- *ntiles = (mi_frame_size + mi_tile_size - 1) / mi_tile_size;
- assert(*ntiles <= (1 << log2_tile_num));
- }
-
- return mi_tile_size;
-}
-
AV1PixelRect av1_get_tile_rect(const TileInfo *tile_info, const AV1_COMMON *cm,
int is_uv) {
AV1PixelRect r;
@@ -205,3 +205,35 @@ AV1PixelRect av1_get_tile_rect(const TileInfo *tile_info, const AV1_COMMON *cm,
return r;
}
+
+void av1_get_uniform_tile_size(const AV1_COMMON *cm, int *w, int *h) {
+ const CommonTileParams *const tiles = &cm->tiles;
+ if (tiles->uniform_spacing) {
+ *w = tiles->width;
+ *h = tiles->height;
+ } else {
+ for (int i = 0; i < tiles->cols; ++i) {
+ const int tile_width_sb =
+ tiles->col_start_sb[i + 1] - tiles->col_start_sb[i];
+ const int tile_w = tile_width_sb * cm->seq_params.mib_size;
+ assert(i == 0 || tile_w == *w); // ensure all tiles have same dimension
+ *w = tile_w;
+ }
+
+ for (int i = 0; i < tiles->rows; ++i) {
+ const int tile_height_sb =
+ tiles->row_start_sb[i + 1] - tiles->row_start_sb[i];
+ const int tile_h = tile_height_sb * cm->seq_params.mib_size;
+ assert(i == 0 || tile_h == *h); // ensure all tiles have same dimension
+ *h = tile_h;
+ }
+ }
+}
+
+int av1_is_min_tile_width_satisfied(const AV1_COMMON *cm) {
+ // Disable check if there is a single tile col in the frame
+ if (cm->tiles.cols == 1) return 1;
+
+ return ((cm->tiles.min_inner_width << MI_SIZE_LOG2) >=
+ (64 << av1_superres_scaled(cm)));
+}
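
As a rough illustration (not library code), tile_log2(blk_size, target) in this file is assumed to return the smallest k with (blk_size << k) >= target, so tiles->min_log2_cols = tile_log2(tiles->max_width_sb, sb_cols) in av1_get_tile_limits() above is the minimum log2 number of tile columns needed to keep every column within MAX_TILE_WIDTH. The sketch below assumes that definition and 128x128 superblocks.

#include <stdio.h>

/* Assumed to match the static helper declared above in tile_common.c. */
static int tile_log2(int blk_size, int target) {
  int k;
  for (k = 0; (blk_size << k) < target; k++) {
  }
  return k;
}

int main(void) {
  const int sb_size_log2 = 7;                    /* 128x128 superblocks */
  const int max_width_sb = 4096 >> sb_size_log2; /* MAX_TILE_WIDTH in SBs */
  const int sb_cols = 60;                        /* e.g. a 7680-wide frame */
  /* At least this many log2 tile columns are needed to respect the limit. */
  printf("min_log2_tile_cols = %d\n", tile_log2(max_width_sb, sb_cols));
  return 0;
}
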
diff --git a/media/libaom/src/av1/common/tile_common.h b/media/libaom/src/av1/common/tile_common.h
index c03553dc6..ca7c5f496 100644
--- a/media/libaom/src/av1/common/tile_common.h
+++ b/media/libaom/src/av1/common/tile_common.h
@@ -19,13 +19,14 @@ extern "C" {
#include "config/aom_config.h"
struct AV1Common;
+struct SequenceHeader;
+struct CommonTileParams;
#define DEFAULT_MAX_NUM_TG 1
typedef struct TileInfo {
int mi_row_start, mi_row_end;
int mi_col_start, mi_col_end;
- int tg_horz_boundary;
int tile_row;
int tile_col;
} TileInfo;
@@ -37,12 +38,6 @@ void av1_tile_init(TileInfo *tile, const struct AV1Common *cm, int row,
void av1_tile_set_row(TileInfo *tile, const struct AV1Common *cm, int row);
void av1_tile_set_col(TileInfo *tile, const struct AV1Common *cm, int col);
-void av1_get_tile_n_bits(int mi_cols, int *min_log2_tile_cols,
- int *max_log2_tile_cols);
-
-// Calculate the correct tile size (width or height) for (1 << log2_tile_num)
-// tiles horizontally or vertically in the frame.
-int get_tile_size(int mi_frame_size, int log2_tile_num, int *ntiles);
int av1_get_sb_rows_in_tile(struct AV1Common *cm, TileInfo tile);
int av1_get_sb_cols_in_tile(struct AV1Common *cm, TileInfo tile);
@@ -61,9 +56,17 @@ AV1PixelRect av1_get_tile_rect(const TileInfo *tile_info,
#define MAX_TILE_WIDTH (4096) // Max Tile width in pixels
#define MAX_TILE_AREA (4096 * 2304) // Maximum tile area in pixels
+void av1_get_uniform_tile_size(const struct AV1Common *cm, int *w, int *h);
void av1_get_tile_limits(struct AV1Common *const cm);
-void av1_calculate_tile_cols(struct AV1Common *const cm);
-void av1_calculate_tile_rows(struct AV1Common *const cm);
+void av1_calculate_tile_cols(const struct SequenceHeader *const seq_params,
+ int cm_mi_rows, int cm_mi_cols,
+ struct CommonTileParams *const tiles);
+void av1_calculate_tile_rows(const struct SequenceHeader *const seq_params,
+ int cm_mi_rows,
+ struct CommonTileParams *const tiles);
+
+// Checks if the minimum tile_width requirement is satisfied
+int av1_is_min_tile_width_satisfied(const struct AV1Common *cm);
#ifdef __cplusplus
} // extern "C"
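
A rough standalone model of the new av1_is_min_tile_width_satisfied() check declared above (a sketch, not the library implementation): the narrowest inner tile, converted from mi units to pixels with MI_SIZE_LOG2 == 2, must be at least 64 pixels, doubling to 128 when superres scaling is in effect.

#include <stdio.h>

#define MI_SIZE_LOG2 2 /* one mi unit is 4x4 pixels */

/* min_inner_width is in mi units; superres_scaled is 0 or 1. */
static int min_tile_width_ok(int min_inner_width, int superres_scaled) {
  return (min_inner_width << MI_SIZE_LOG2) >= (64 << superres_scaled);
}

int main(void) {
  printf("%d\n", min_tile_width_ok(16, 0)); /* 64 px, no superres -> 1 */
  printf("%d\n", min_tile_width_ok(16, 1)); /* 64 px, superres    -> 0 */
  return 0;
}
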
diff --git a/media/libaom/src/av1/common/timing.c b/media/libaom/src/av1/common/timing.c
index 49dbde78f..a959cdf76 100644
--- a/media/libaom/src/av1/common/timing.c
+++ b/media/libaom/src/av1/common/timing.c
@@ -15,22 +15,35 @@
* The tables are in Kbps instead of Mbps in the specification.
* Note that depending on the profile, a multiplier is needed.
*/
+#define UNDEFINED_RATE \
+ (1 << 21) // Placeholder rate for levels with undefined rate
+#define INVALID_RATE \
+ (0) // For invalid profile-level configuration, set rate to 0
/* Max Bitrates for levels of Main Tier in kbps. Bitrate in main_kbps [31] */
/* is a dummy value. The decoder model is not applicable for level 31. */
static int32_t main_kbps[1 << LEVEL_BITS] = {
- 1500, 3000, 0, 0, 6000, 10000, 0, 0, 12000, 20000, 0,
- 0, 30000, 40000, 60000, 60000, 60000, 100000, 160000, 160000, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, (1 << 26)
+ 1500, 3000, UNDEFINED_RATE, UNDEFINED_RATE,
+ 6000, 10000, UNDEFINED_RATE, UNDEFINED_RATE,
+ 12000, 20000, UNDEFINED_RATE, UNDEFINED_RATE,
+ 30000, 40000, 60000, 60000,
+ 60000, 100000, 160000, 160000,
+ UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE,
+ UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE,
+ UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE
};
/* Max Bitrates for levels of High Tier in kbps. Bitrate in high_kbps [31] */
/* is a dummy value. The decoder model is not applicable for level 31. */
static int32_t high_kbps[1 << LEVEL_BITS] = {
- 0, 0, 0, 0, 0, 0, 0, 0,
- 30000, 50000, 0, 0, 100000, 160000, 240000, 240000,
- 240000, 480000, 800000, 800000, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, (1 << 26)
+ INVALID_RATE, INVALID_RATE, INVALID_RATE, INVALID_RATE,
+ INVALID_RATE, INVALID_RATE, INVALID_RATE, INVALID_RATE,
+ 30000, 50000, UNDEFINED_RATE, UNDEFINED_RATE,
+ 100000, 160000, 240000, 240000,
+ 240000, 480000, 800000, 800000,
+ UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE,
+ UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE,
+ UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE
};
/* BitrateProfileFactor */
@@ -38,8 +51,8 @@ static int bitrate_profile_factor[1 << PROFILE_BITS] = {
1, 2, 3, 0, 0, 0, 0, 0
};
-int64_t max_level_bitrate(BITSTREAM_PROFILE seq_profile, int seq_level_idx,
- int seq_tier) {
+int64_t av1_max_level_bitrate(BITSTREAM_PROFILE seq_profile, int seq_level_idx,
+ int seq_tier) {
int64_t bitrate;
if (seq_tier) {
@@ -51,13 +64,13 @@ int64_t max_level_bitrate(BITSTREAM_PROFILE seq_profile, int seq_level_idx,
return bitrate * 1000;
}
-void set_aom_dec_model_info(aom_dec_model_info_t *decoder_model) {
+void av1_set_aom_dec_model_info(aom_dec_model_info_t *decoder_model) {
decoder_model->encoder_decoder_buffer_delay_length = 16;
decoder_model->buffer_removal_time_length = 10;
decoder_model->frame_presentation_time_length = 10;
}
-void set_dec_model_op_parameters(aom_dec_model_op_parameters_t *op_params) {
+void av1_set_dec_model_op_parameters(aom_dec_model_op_parameters_t *op_params) {
op_params->decoder_model_param_present_flag = 1;
op_params->decoder_buffer_delay = 90000 >> 1; // 0.5 s
op_params->encoder_buffer_delay = 90000 >> 1; // 0.5 s
@@ -66,7 +79,7 @@ void set_dec_model_op_parameters(aom_dec_model_op_parameters_t *op_params) {
op_params->initial_display_delay = 8; // 8 frames delay
}
-void set_resource_availability_parameters(
+void av1_set_resource_availability_parameters(
aom_dec_model_op_parameters_t *op_params) {
op_params->decoder_model_param_present_flag = 0;
op_params->decoder_buffer_delay =
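
For reference, a small standalone sketch (not the libaom function) of the av1_max_level_bitrate() lookup renamed above: the per-level kbps table entry is scaled by the profile factor and then by 1000 to yield bits per second. The sample values below are copied from the main_kbps and bitrate_profile_factor tables in this hunk.

#include <stdint.h>
#include <stdio.h>

/* A few defined entries of main_kbps plus the first profile factors. */
static const int32_t main_kbps_sample[] = { 1500, 3000, 6000, 10000, 12000 };
static const int bitrate_profile_factor_sample[3] = { 1, 2, 3 };

static int64_t max_bitrate_bps(int level_entry, int profile) {
  return (int64_t)main_kbps_sample[level_entry] *
         bitrate_profile_factor_sample[profile] * 1000;
}

int main(void) {
  /* The 12000 kbps entry with profile factor 1 -> 12,000,000 bps. */
  printf("%lld\n", (long long)max_bitrate_bps(4, 0));
  return 0;
}
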
diff --git a/media/libaom/src/av1/common/timing.h b/media/libaom/src/av1/common/timing.h
index 06939ae43..9192124f7 100644
--- a/media/libaom/src/av1/common/timing.h
+++ b/media/libaom/src/av1/common/timing.h
@@ -42,18 +42,14 @@ typedef struct aom_dec_model_op_parameters {
int initial_display_delay;
} aom_dec_model_op_parameters_t;
-typedef struct aom_op_timing_info_t {
- uint32_t buffer_removal_time;
-} aom_op_timing_info_t;
+void av1_set_aom_dec_model_info(aom_dec_model_info_t *decoder_model);
-void set_aom_dec_model_info(aom_dec_model_info_t *decoder_model);
+void av1_set_dec_model_op_parameters(aom_dec_model_op_parameters_t *op_params);
-void set_dec_model_op_parameters(aom_dec_model_op_parameters_t *op_params);
-
-void set_resource_availability_parameters(
+void av1_set_resource_availability_parameters(
aom_dec_model_op_parameters_t *op_params);
-int64_t max_level_bitrate(BITSTREAM_PROFILE seq_profile, int seq_level_idx,
- int seq_tier);
+int64_t av1_max_level_bitrate(BITSTREAM_PROFILE seq_profile, int seq_level_idx,
+ int seq_tier);
#endif // AOM_AV1_COMMON_TIMING_H_
diff --git a/media/libaom/src/av1/common/token_cdfs.h b/media/libaom/src/av1/common/token_cdfs.h
index 53e956450..f1edda58d 100644
--- a/media/libaom/src/av1/common/token_cdfs.h
+++ b/media/libaom/src/av1/common/token_cdfs.h
@@ -1707,1687 +1707,1687 @@ static const aom_cdf_prob av1_default_coeff_lps_multi_cdfs
static const aom_cdf_prob av1_default_coeff_base_multi_cdfs
[TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES][SIG_COEF_CONTEXTS]
- [CDF_SIZE(NUM_BASE_LEVELS + 2)] =
- { { { { { AOM_CDF4(4034, 8930, 12727) },
- { AOM_CDF4(18082, 29741, 31877) },
- { AOM_CDF4(12596, 26124, 30493) },
- { AOM_CDF4(9446, 21118, 27005) },
- { AOM_CDF4(6308, 15141, 21279) },
- { AOM_CDF4(2463, 6357, 9783) },
- { AOM_CDF4(20667, 30546, 31929) },
- { AOM_CDF4(13043, 26123, 30134) },
- { AOM_CDF4(8151, 18757, 24778) },
- { AOM_CDF4(5255, 12839, 18632) },
- { AOM_CDF4(2820, 7206, 11161) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(15736, 27553, 30604) },
- { AOM_CDF4(11210, 23794, 28787) },
- { AOM_CDF4(5947, 13874, 19701) },
- { AOM_CDF4(4215, 9323, 13891) },
- { AOM_CDF4(2833, 6462, 10059) },
- { AOM_CDF4(19605, 30393, 31582) },
- { AOM_CDF4(13523, 26252, 30248) },
- { AOM_CDF4(8446, 18622, 24512) },
- { AOM_CDF4(3818, 10343, 15974) },
- { AOM_CDF4(1481, 4117, 6796) },
- { AOM_CDF4(22649, 31302, 32190) },
- { AOM_CDF4(14829, 27127, 30449) },
- { AOM_CDF4(8313, 17702, 23304) },
- { AOM_CDF4(3022, 8301, 12786) },
- { AOM_CDF4(1536, 4412, 7184) },
- { AOM_CDF4(22354, 29774, 31372) },
- { AOM_CDF4(14723, 25472, 29214) },
- { AOM_CDF4(6673, 13745, 18662) },
- { AOM_CDF4(2068, 5766, 9322) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) } },
- { { AOM_CDF4(6302, 16444, 21761) },
- { AOM_CDF4(23040, 31538, 32475) },
- { AOM_CDF4(15196, 28452, 31496) },
- { AOM_CDF4(10020, 22946, 28514) },
- { AOM_CDF4(6533, 16862, 23501) },
- { AOM_CDF4(3538, 9816, 15076) },
- { AOM_CDF4(24444, 31875, 32525) },
- { AOM_CDF4(15881, 28924, 31635) },
- { AOM_CDF4(9922, 22873, 28466) },
- { AOM_CDF4(6527, 16966, 23691) },
- { AOM_CDF4(4114, 11303, 17220) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(20201, 30770, 32209) },
- { AOM_CDF4(14754, 28071, 31258) },
- { AOM_CDF4(8378, 20186, 26517) },
- { AOM_CDF4(5916, 15299, 21978) },
- { AOM_CDF4(4268, 11583, 17901) },
- { AOM_CDF4(24361, 32025, 32581) },
- { AOM_CDF4(18673, 30105, 31943) },
- { AOM_CDF4(10196, 22244, 27576) },
- { AOM_CDF4(5495, 14349, 20417) },
- { AOM_CDF4(2676, 7415, 11498) },
- { AOM_CDF4(24678, 31958, 32585) },
- { AOM_CDF4(18629, 29906, 31831) },
- { AOM_CDF4(9364, 20724, 26315) },
- { AOM_CDF4(4641, 12318, 18094) },
- { AOM_CDF4(2758, 7387, 11579) },
- { AOM_CDF4(25433, 31842, 32469) },
- { AOM_CDF4(18795, 29289, 31411) },
- { AOM_CDF4(7644, 17584, 23592) },
- { AOM_CDF4(3408, 9014, 15047) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) } } },
- { { { AOM_CDF4(4536, 10072, 14001) },
- { AOM_CDF4(25459, 31416, 32206) },
- { AOM_CDF4(16605, 28048, 30818) },
- { AOM_CDF4(11008, 22857, 27719) },
- { AOM_CDF4(6915, 16268, 22315) },
- { AOM_CDF4(2625, 6812, 10537) },
- { AOM_CDF4(24257, 31788, 32499) },
- { AOM_CDF4(16880, 29454, 31879) },
- { AOM_CDF4(11958, 25054, 29778) },
- { AOM_CDF4(7916, 18718, 25084) },
- { AOM_CDF4(3383, 8777, 13446) },
- { AOM_CDF4(22720, 31603, 32393) },
- { AOM_CDF4(14960, 28125, 31335) },
- { AOM_CDF4(9731, 22210, 27928) },
- { AOM_CDF4(6304, 15832, 22277) },
- { AOM_CDF4(2910, 7818, 12166) },
- { AOM_CDF4(20375, 30627, 32131) },
- { AOM_CDF4(13904, 27284, 30887) },
- { AOM_CDF4(9368, 21558, 27144) },
- { AOM_CDF4(5937, 14966, 21119) },
- { AOM_CDF4(2667, 7225, 11319) },
- { AOM_CDF4(23970, 31470, 32378) },
- { AOM_CDF4(17173, 29734, 32018) },
- { AOM_CDF4(12795, 25441, 29965) },
- { AOM_CDF4(8981, 19680, 25893) },
- { AOM_CDF4(4728, 11372, 16902) },
- { AOM_CDF4(24287, 31797, 32439) },
- { AOM_CDF4(16703, 29145, 31696) },
- { AOM_CDF4(10833, 23554, 28725) },
- { AOM_CDF4(6468, 16566, 23057) },
- { AOM_CDF4(2415, 6562, 10278) },
- { AOM_CDF4(26610, 32395, 32659) },
- { AOM_CDF4(18590, 30498, 32117) },
- { AOM_CDF4(12420, 25756, 29950) },
- { AOM_CDF4(7639, 18746, 24710) },
- { AOM_CDF4(3001, 8086, 12347) },
- { AOM_CDF4(25076, 32064, 32580) },
- { AOM_CDF4(17946, 30128, 32028) },
- { AOM_CDF4(12024, 24985, 29378) },
- { AOM_CDF4(7517, 18390, 24304) },
- { AOM_CDF4(3243, 8781, 13331) },
- { AOM_CDF4(8192, 16384, 24576) } },
- { { AOM_CDF4(6037, 16771, 21957) },
- { AOM_CDF4(24774, 31704, 32426) },
- { AOM_CDF4(16830, 28589, 31056) },
- { AOM_CDF4(10602, 22828, 27760) },
- { AOM_CDF4(6733, 16829, 23071) },
- { AOM_CDF4(3250, 8914, 13556) },
- { AOM_CDF4(25582, 32220, 32668) },
- { AOM_CDF4(18659, 30342, 32223) },
- { AOM_CDF4(12546, 26149, 30515) },
- { AOM_CDF4(8420, 20451, 26801) },
- { AOM_CDF4(4636, 12420, 18344) },
- { AOM_CDF4(27581, 32362, 32639) },
- { AOM_CDF4(18987, 30083, 31978) },
- { AOM_CDF4(11327, 24248, 29084) },
- { AOM_CDF4(7264, 17719, 24120) },
- { AOM_CDF4(3995, 10768, 16169) },
- { AOM_CDF4(25893, 31831, 32487) },
- { AOM_CDF4(16577, 28587, 31379) },
- { AOM_CDF4(10189, 22748, 28182) },
- { AOM_CDF4(6832, 17094, 23556) },
- { AOM_CDF4(3708, 10110, 15334) },
- { AOM_CDF4(25904, 32282, 32656) },
- { AOM_CDF4(19721, 30792, 32276) },
- { AOM_CDF4(12819, 26243, 30411) },
- { AOM_CDF4(8572, 20614, 26891) },
- { AOM_CDF4(5364, 14059, 20467) },
- { AOM_CDF4(26580, 32438, 32677) },
- { AOM_CDF4(20852, 31225, 32340) },
- { AOM_CDF4(12435, 25700, 29967) },
- { AOM_CDF4(8691, 20825, 26976) },
- { AOM_CDF4(4446, 12209, 17269) },
- { AOM_CDF4(27350, 32429, 32696) },
- { AOM_CDF4(21372, 30977, 32272) },
- { AOM_CDF4(12673, 25270, 29853) },
- { AOM_CDF4(9208, 20925, 26640) },
- { AOM_CDF4(5018, 13351, 18732) },
- { AOM_CDF4(27351, 32479, 32713) },
- { AOM_CDF4(21398, 31209, 32387) },
- { AOM_CDF4(12162, 25047, 29842) },
- { AOM_CDF4(7896, 18691, 25319) },
- { AOM_CDF4(4670, 12882, 18881) },
- { AOM_CDF4(8192, 16384, 24576) } } },
- { { { AOM_CDF4(5487, 10460, 13708) },
- { AOM_CDF4(21597, 28303, 30674) },
- { AOM_CDF4(11037, 21953, 26476) },
- { AOM_CDF4(8147, 17962, 22952) },
- { AOM_CDF4(5242, 13061, 18532) },
- { AOM_CDF4(1889, 5208, 8182) },
- { AOM_CDF4(26774, 32133, 32590) },
- { AOM_CDF4(17844, 29564, 31767) },
- { AOM_CDF4(11690, 24438, 29171) },
- { AOM_CDF4(7542, 18215, 24459) },
- { AOM_CDF4(2993, 8050, 12319) },
- { AOM_CDF4(28023, 32328, 32591) },
- { AOM_CDF4(18651, 30126, 31954) },
- { AOM_CDF4(12164, 25146, 29589) },
- { AOM_CDF4(7762, 18530, 24771) },
- { AOM_CDF4(3492, 9183, 13920) },
- { AOM_CDF4(27591, 32008, 32491) },
- { AOM_CDF4(17149, 28853, 31510) },
- { AOM_CDF4(11485, 24003, 28860) },
- { AOM_CDF4(7697, 18086, 24210) },
- { AOM_CDF4(3075, 7999, 12218) },
- { AOM_CDF4(28268, 32482, 32654) },
- { AOM_CDF4(19631, 31051, 32404) },
- { AOM_CDF4(13860, 27260, 31020) },
- { AOM_CDF4(9605, 21613, 27594) },
- { AOM_CDF4(4876, 12162, 17908) },
- { AOM_CDF4(27248, 32316, 32576) },
- { AOM_CDF4(18955, 30457, 32075) },
- { AOM_CDF4(11824, 23997, 28795) },
- { AOM_CDF4(7346, 18196, 24647) },
- { AOM_CDF4(3403, 9247, 14111) },
- { AOM_CDF4(29711, 32655, 32735) },
- { AOM_CDF4(21169, 31394, 32417) },
- { AOM_CDF4(13487, 27198, 30957) },
- { AOM_CDF4(8828, 21683, 27614) },
- { AOM_CDF4(4270, 11451, 17038) },
- { AOM_CDF4(28708, 32578, 32731) },
- { AOM_CDF4(20120, 31241, 32482) },
- { AOM_CDF4(13692, 27550, 31321) },
- { AOM_CDF4(9418, 22514, 28439) },
- { AOM_CDF4(4999, 13283, 19462) },
- { AOM_CDF4(8192, 16384, 24576) } },
- { { AOM_CDF4(5673, 14302, 19711) },
- { AOM_CDF4(26251, 30701, 31834) },
- { AOM_CDF4(12782, 23783, 27803) },
- { AOM_CDF4(9127, 20657, 25808) },
- { AOM_CDF4(6368, 16208, 21462) },
- { AOM_CDF4(2465, 7177, 10822) },
- { AOM_CDF4(29961, 32563, 32719) },
- { AOM_CDF4(18318, 29891, 31949) },
- { AOM_CDF4(11361, 24514, 29357) },
- { AOM_CDF4(7900, 19603, 25607) },
- { AOM_CDF4(4002, 10590, 15546) },
- { AOM_CDF4(29637, 32310, 32595) },
- { AOM_CDF4(18296, 29913, 31809) },
- { AOM_CDF4(10144, 21515, 26871) },
- { AOM_CDF4(5358, 14322, 20394) },
- { AOM_CDF4(3067, 8362, 13346) },
- { AOM_CDF4(28652, 32470, 32676) },
- { AOM_CDF4(17538, 30771, 32209) },
- { AOM_CDF4(13924, 26882, 30494) },
- { AOM_CDF4(10496, 22837, 27869) },
- { AOM_CDF4(7236, 16396, 21621) },
- { AOM_CDF4(30743, 32687, 32746) },
- { AOM_CDF4(23006, 31676, 32489) },
- { AOM_CDF4(14494, 27828, 31120) },
- { AOM_CDF4(10174, 22801, 28352) },
- { AOM_CDF4(6242, 15281, 21043) },
- { AOM_CDF4(25817, 32243, 32720) },
- { AOM_CDF4(18618, 31367, 32325) },
- { AOM_CDF4(13997, 28318, 31878) },
- { AOM_CDF4(12255, 26534, 31383) },
- { AOM_CDF4(9561, 21588, 28450) },
- { AOM_CDF4(28188, 32635, 32724) },
- { AOM_CDF4(22060, 32365, 32728) },
- { AOM_CDF4(18102, 30690, 32528) },
- { AOM_CDF4(14196, 28864, 31999) },
- { AOM_CDF4(12262, 25792, 30865) },
- { AOM_CDF4(24176, 32109, 32628) },
- { AOM_CDF4(18280, 29681, 31963) },
- { AOM_CDF4(10205, 23703, 29664) },
- { AOM_CDF4(7889, 20025, 27676) },
- { AOM_CDF4(6060, 16743, 23970) },
- { AOM_CDF4(8192, 16384, 24576) } } },
- { { { AOM_CDF4(5141, 7096, 8260) },
- { AOM_CDF4(27186, 29022, 29789) },
- { AOM_CDF4(6668, 12568, 15682) },
- { AOM_CDF4(2172, 6181, 8638) },
- { AOM_CDF4(1126, 3379, 4531) },
- { AOM_CDF4(443, 1361, 2254) },
- { AOM_CDF4(26083, 31153, 32436) },
- { AOM_CDF4(13486, 24603, 28483) },
- { AOM_CDF4(6508, 14840, 19910) },
- { AOM_CDF4(3386, 8800, 13286) },
- { AOM_CDF4(1530, 4322, 7054) },
- { AOM_CDF4(29639, 32080, 32548) },
- { AOM_CDF4(15897, 27552, 30290) },
- { AOM_CDF4(8588, 20047, 25383) },
- { AOM_CDF4(4889, 13339, 19269) },
- { AOM_CDF4(2240, 6871, 10498) },
- { AOM_CDF4(28165, 32197, 32517) },
- { AOM_CDF4(20735, 30427, 31568) },
- { AOM_CDF4(14325, 24671, 27692) },
- { AOM_CDF4(5119, 12554, 17805) },
- { AOM_CDF4(1810, 5441, 8261) },
- { AOM_CDF4(31212, 32724, 32748) },
- { AOM_CDF4(23352, 31766, 32545) },
- { AOM_CDF4(14669, 27570, 31059) },
- { AOM_CDF4(8492, 20894, 27272) },
- { AOM_CDF4(3644, 10194, 15204) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) } },
- { { AOM_CDF4(2461, 7013, 9371) },
- { AOM_CDF4(24749, 29600, 30986) },
- { AOM_CDF4(9466, 19037, 22417) },
- { AOM_CDF4(3584, 9280, 14400) },
- { AOM_CDF4(1505, 3929, 5433) },
- { AOM_CDF4(677, 1500, 2736) },
- { AOM_CDF4(23987, 30702, 32117) },
- { AOM_CDF4(13554, 24571, 29263) },
- { AOM_CDF4(6211, 14556, 21155) },
- { AOM_CDF4(3135, 10972, 15625) },
- { AOM_CDF4(2435, 7127, 11427) },
- { AOM_CDF4(31300, 32532, 32550) },
- { AOM_CDF4(14757, 30365, 31954) },
- { AOM_CDF4(4405, 11612, 18553) },
- { AOM_CDF4(580, 4132, 7322) },
- { AOM_CDF4(1695, 10169, 14124) },
- { AOM_CDF4(30008, 32282, 32591) },
- { AOM_CDF4(19244, 30108, 31748) },
- { AOM_CDF4(11180, 24158, 29555) },
- { AOM_CDF4(5650, 14972, 19209) },
- { AOM_CDF4(2114, 5109, 8456) },
- { AOM_CDF4(31856, 32716, 32748) },
- { AOM_CDF4(23012, 31664, 32572) },
- { AOM_CDF4(13694, 26656, 30636) },
- { AOM_CDF4(8142, 19508, 26093) },
- { AOM_CDF4(4253, 10955, 16724) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) } } },
- { { { AOM_CDF4(601, 983, 1311) },
- { AOM_CDF4(18725, 23406, 28087) },
- { AOM_CDF4(5461, 8192, 10923) },
- { AOM_CDF4(3781, 15124, 21425) },
- { AOM_CDF4(2587, 7761, 12072) },
- { AOM_CDF4(106, 458, 810) },
- { AOM_CDF4(22282, 29710, 31894) },
- { AOM_CDF4(8508, 20926, 25984) },
- { AOM_CDF4(3726, 12713, 18083) },
- { AOM_CDF4(1620, 7112, 10893) },
- { AOM_CDF4(729, 2236, 3495) },
- { AOM_CDF4(30163, 32474, 32684) },
- { AOM_CDF4(18304, 30464, 32000) },
- { AOM_CDF4(11443, 26526, 29647) },
- { AOM_CDF4(6007, 15292, 21299) },
- { AOM_CDF4(2234, 6703, 8937) },
- { AOM_CDF4(30954, 32177, 32571) },
- { AOM_CDF4(17363, 29562, 31076) },
- { AOM_CDF4(9686, 22464, 27410) },
- { AOM_CDF4(8192, 16384, 21390) },
- { AOM_CDF4(1755, 8046, 11264) },
- { AOM_CDF4(31168, 32734, 32748) },
- { AOM_CDF4(22486, 31441, 32471) },
- { AOM_CDF4(12833, 25627, 29738) },
- { AOM_CDF4(6980, 17379, 23122) },
- { AOM_CDF4(3111, 8887, 13479) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) } },
- { { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) } } } },
- { { { { AOM_CDF4(6041, 11854, 15927) },
- { AOM_CDF4(20326, 30905, 32251) },
- { AOM_CDF4(14164, 26831, 30725) },
- { AOM_CDF4(9760, 20647, 26585) },
- { AOM_CDF4(6416, 14953, 21219) },
- { AOM_CDF4(2966, 7151, 10891) },
- { AOM_CDF4(23567, 31374, 32254) },
- { AOM_CDF4(14978, 27416, 30946) },
- { AOM_CDF4(9434, 20225, 26254) },
- { AOM_CDF4(6658, 14558, 20535) },
- { AOM_CDF4(3916, 8677, 12989) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(18088, 29545, 31587) },
- { AOM_CDF4(13062, 25843, 30073) },
- { AOM_CDF4(8940, 16827, 22251) },
- { AOM_CDF4(7654, 13220, 17973) },
- { AOM_CDF4(5733, 10316, 14456) },
- { AOM_CDF4(22879, 31388, 32114) },
- { AOM_CDF4(15215, 27993, 30955) },
- { AOM_CDF4(9397, 19445, 24978) },
- { AOM_CDF4(3442, 9813, 15344) },
- { AOM_CDF4(1368, 3936, 6532) },
- { AOM_CDF4(25494, 32033, 32406) },
- { AOM_CDF4(16772, 27963, 30718) },
- { AOM_CDF4(9419, 18165, 23260) },
- { AOM_CDF4(2677, 7501, 11797) },
- { AOM_CDF4(1516, 4344, 7170) },
- { AOM_CDF4(26556, 31454, 32101) },
- { AOM_CDF4(17128, 27035, 30108) },
- { AOM_CDF4(8324, 15344, 20249) },
- { AOM_CDF4(1903, 5696, 9469) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) } },
- { { AOM_CDF4(8455, 19003, 24368) },
- { AOM_CDF4(23563, 32021, 32604) },
- { AOM_CDF4(16237, 29446, 31935) },
- { AOM_CDF4(10724, 23999, 29358) },
- { AOM_CDF4(6725, 17528, 24416) },
- { AOM_CDF4(3927, 10927, 16825) },
- { AOM_CDF4(26313, 32288, 32634) },
- { AOM_CDF4(17430, 30095, 32095) },
- { AOM_CDF4(11116, 24606, 29679) },
- { AOM_CDF4(7195, 18384, 25269) },
- { AOM_CDF4(4726, 12852, 19315) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(22822, 31648, 32483) },
- { AOM_CDF4(16724, 29633, 31929) },
- { AOM_CDF4(10261, 23033, 28725) },
- { AOM_CDF4(7029, 17840, 24528) },
- { AOM_CDF4(4867, 13886, 21502) },
- { AOM_CDF4(25298, 31892, 32491) },
- { AOM_CDF4(17809, 29330, 31512) },
- { AOM_CDF4(9668, 21329, 26579) },
- { AOM_CDF4(4774, 12956, 18976) },
- { AOM_CDF4(2322, 7030, 11540) },
- { AOM_CDF4(25472, 31920, 32543) },
- { AOM_CDF4(17957, 29387, 31632) },
- { AOM_CDF4(9196, 20593, 26400) },
- { AOM_CDF4(4680, 12705, 19202) },
- { AOM_CDF4(2917, 8456, 13436) },
- { AOM_CDF4(26471, 32059, 32574) },
- { AOM_CDF4(18458, 29783, 31909) },
- { AOM_CDF4(8400, 19464, 25956) },
- { AOM_CDF4(3812, 10973, 17206) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) } } },
- { { { AOM_CDF4(6779, 13743, 17678) },
- { AOM_CDF4(24806, 31797, 32457) },
- { AOM_CDF4(17616, 29047, 31372) },
- { AOM_CDF4(11063, 23175, 28003) },
- { AOM_CDF4(6521, 16110, 22324) },
- { AOM_CDF4(2764, 7504, 11654) },
- { AOM_CDF4(25266, 32367, 32637) },
- { AOM_CDF4(19054, 30553, 32175) },
- { AOM_CDF4(12139, 25212, 29807) },
- { AOM_CDF4(7311, 18162, 24704) },
- { AOM_CDF4(3397, 9164, 14074) },
- { AOM_CDF4(25988, 32208, 32522) },
- { AOM_CDF4(16253, 28912, 31526) },
- { AOM_CDF4(9151, 21387, 27372) },
- { AOM_CDF4(5688, 14915, 21496) },
- { AOM_CDF4(2717, 7627, 12004) },
- { AOM_CDF4(23144, 31855, 32443) },
- { AOM_CDF4(16070, 28491, 31325) },
- { AOM_CDF4(8702, 20467, 26517) },
- { AOM_CDF4(5243, 13956, 20367) },
- { AOM_CDF4(2621, 7335, 11567) },
- { AOM_CDF4(26636, 32340, 32630) },
- { AOM_CDF4(19990, 31050, 32341) },
- { AOM_CDF4(13243, 26105, 30315) },
- { AOM_CDF4(8588, 19521, 25918) },
- { AOM_CDF4(4717, 11585, 17304) },
- { AOM_CDF4(25844, 32292, 32582) },
- { AOM_CDF4(19090, 30635, 32097) },
- { AOM_CDF4(11963, 24546, 28939) },
- { AOM_CDF4(6218, 16087, 22354) },
- { AOM_CDF4(2340, 6608, 10426) },
- { AOM_CDF4(28046, 32576, 32694) },
- { AOM_CDF4(21178, 31313, 32296) },
- { AOM_CDF4(13486, 26184, 29870) },
- { AOM_CDF4(7149, 17871, 23723) },
- { AOM_CDF4(2833, 7958, 12259) },
- { AOM_CDF4(27710, 32528, 32686) },
- { AOM_CDF4(20674, 31076, 32268) },
- { AOM_CDF4(12413, 24955, 29243) },
- { AOM_CDF4(6676, 16927, 23097) },
- { AOM_CDF4(2966, 8333, 12919) },
- { AOM_CDF4(8192, 16384, 24576) } },
- { { AOM_CDF4(8639, 19339, 24429) },
- { AOM_CDF4(24404, 31837, 32525) },
- { AOM_CDF4(16997, 29425, 31784) },
- { AOM_CDF4(11253, 24234, 29149) },
- { AOM_CDF4(6751, 17394, 24028) },
- { AOM_CDF4(3490, 9830, 15191) },
- { AOM_CDF4(26283, 32471, 32714) },
- { AOM_CDF4(19599, 31168, 32442) },
- { AOM_CDF4(13146, 26954, 30893) },
- { AOM_CDF4(8214, 20588, 26890) },
- { AOM_CDF4(4699, 13081, 19300) },
- { AOM_CDF4(28212, 32458, 32669) },
- { AOM_CDF4(18594, 30316, 32100) },
- { AOM_CDF4(11219, 24408, 29234) },
- { AOM_CDF4(6865, 17656, 24149) },
- { AOM_CDF4(3678, 10362, 16006) },
- { AOM_CDF4(25825, 32136, 32616) },
- { AOM_CDF4(17313, 29853, 32021) },
- { AOM_CDF4(11197, 24471, 29472) },
- { AOM_CDF4(6947, 17781, 24405) },
- { AOM_CDF4(3768, 10660, 16261) },
- { AOM_CDF4(27352, 32500, 32706) },
- { AOM_CDF4(20850, 31468, 32469) },
- { AOM_CDF4(14021, 27707, 31133) },
- { AOM_CDF4(8964, 21748, 27838) },
- { AOM_CDF4(5437, 14665, 21187) },
- { AOM_CDF4(26304, 32492, 32698) },
- { AOM_CDF4(20409, 31380, 32385) },
- { AOM_CDF4(13682, 27222, 30632) },
- { AOM_CDF4(8974, 21236, 26685) },
- { AOM_CDF4(4234, 11665, 16934) },
- { AOM_CDF4(26273, 32357, 32711) },
- { AOM_CDF4(20672, 31242, 32441) },
- { AOM_CDF4(14172, 27254, 30902) },
- { AOM_CDF4(9870, 21898, 27275) },
- { AOM_CDF4(5164, 13506, 19270) },
- { AOM_CDF4(26725, 32459, 32728) },
- { AOM_CDF4(20991, 31442, 32527) },
- { AOM_CDF4(13071, 26434, 30811) },
- { AOM_CDF4(8184, 20090, 26742) },
- { AOM_CDF4(4803, 13255, 19895) },
- { AOM_CDF4(8192, 16384, 24576) } } },
- { { { AOM_CDF4(7555, 14942, 18501) },
- { AOM_CDF4(24410, 31178, 32287) },
- { AOM_CDF4(14394, 26738, 30253) },
- { AOM_CDF4(8413, 19554, 25195) },
- { AOM_CDF4(4766, 12924, 18785) },
- { AOM_CDF4(2029, 5806, 9207) },
- { AOM_CDF4(26776, 32364, 32663) },
- { AOM_CDF4(18732, 29967, 31931) },
- { AOM_CDF4(11005, 23786, 28852) },
- { AOM_CDF4(6466, 16909, 23510) },
- { AOM_CDF4(3044, 8638, 13419) },
- { AOM_CDF4(29208, 32582, 32704) },
- { AOM_CDF4(20068, 30857, 32208) },
- { AOM_CDF4(12003, 25085, 29595) },
- { AOM_CDF4(6947, 17750, 24189) },
- { AOM_CDF4(3245, 9103, 14007) },
- { AOM_CDF4(27359, 32465, 32669) },
- { AOM_CDF4(19421, 30614, 32174) },
- { AOM_CDF4(11915, 25010, 29579) },
- { AOM_CDF4(6950, 17676, 24074) },
- { AOM_CDF4(3007, 8473, 13096) },
- { AOM_CDF4(29002, 32676, 32735) },
- { AOM_CDF4(22102, 31849, 32576) },
- { AOM_CDF4(14408, 28009, 31405) },
- { AOM_CDF4(9027, 21679, 27931) },
- { AOM_CDF4(4694, 12678, 18748) },
- { AOM_CDF4(28216, 32528, 32682) },
- { AOM_CDF4(20849, 31264, 32318) },
- { AOM_CDF4(12756, 25815, 29751) },
- { AOM_CDF4(7565, 18801, 24923) },
- { AOM_CDF4(3509, 9533, 14477) },
- { AOM_CDF4(30133, 32687, 32739) },
- { AOM_CDF4(23063, 31910, 32515) },
- { AOM_CDF4(14588, 28051, 31132) },
- { AOM_CDF4(9085, 21649, 27457) },
- { AOM_CDF4(4261, 11654, 17264) },
- { AOM_CDF4(29518, 32691, 32748) },
- { AOM_CDF4(22451, 31959, 32613) },
- { AOM_CDF4(14864, 28722, 31700) },
- { AOM_CDF4(9695, 22964, 28716) },
- { AOM_CDF4(4932, 13358, 19502) },
- { AOM_CDF4(8192, 16384, 24576) } },
- { { AOM_CDF4(6465, 16958, 21688) },
- { AOM_CDF4(25199, 31514, 32360) },
- { AOM_CDF4(14774, 27149, 30607) },
- { AOM_CDF4(9257, 21438, 26972) },
- { AOM_CDF4(5723, 15183, 21882) },
- { AOM_CDF4(3150, 8879, 13731) },
- { AOM_CDF4(26989, 32262, 32682) },
- { AOM_CDF4(17396, 29937, 32085) },
- { AOM_CDF4(11387, 24901, 29784) },
- { AOM_CDF4(7289, 18821, 25548) },
- { AOM_CDF4(3734, 10577, 16086) },
- { AOM_CDF4(29728, 32501, 32695) },
- { AOM_CDF4(17431, 29701, 31903) },
- { AOM_CDF4(9921, 22826, 28300) },
- { AOM_CDF4(5896, 15434, 22068) },
- { AOM_CDF4(3430, 9646, 14757) },
- { AOM_CDF4(28614, 32511, 32705) },
- { AOM_CDF4(19364, 30638, 32263) },
- { AOM_CDF4(13129, 26254, 30402) },
- { AOM_CDF4(8754, 20484, 26440) },
- { AOM_CDF4(4378, 11607, 17110) },
- { AOM_CDF4(30292, 32671, 32744) },
- { AOM_CDF4(21780, 31603, 32501) },
- { AOM_CDF4(14314, 27829, 31291) },
- { AOM_CDF4(9611, 22327, 28263) },
- { AOM_CDF4(4890, 13087, 19065) },
- { AOM_CDF4(25862, 32567, 32733) },
- { AOM_CDF4(20794, 32050, 32567) },
- { AOM_CDF4(17243, 30625, 32254) },
- { AOM_CDF4(13283, 27628, 31474) },
- { AOM_CDF4(9669, 22532, 28918) },
- { AOM_CDF4(27435, 32697, 32748) },
- { AOM_CDF4(24922, 32390, 32714) },
- { AOM_CDF4(21449, 31504, 32536) },
- { AOM_CDF4(16392, 29729, 31832) },
- { AOM_CDF4(11692, 24884, 29076) },
- { AOM_CDF4(24193, 32290, 32735) },
- { AOM_CDF4(18909, 31104, 32563) },
- { AOM_CDF4(12236, 26841, 31403) },
- { AOM_CDF4(8171, 21840, 29082) },
- { AOM_CDF4(7224, 17280, 25275) },
- { AOM_CDF4(8192, 16384, 24576) } } },
- { { { AOM_CDF4(3078, 6839, 9890) },
- { AOM_CDF4(13837, 20450, 24479) },
- { AOM_CDF4(5914, 14222, 19328) },
- { AOM_CDF4(3866, 10267, 14762) },
- { AOM_CDF4(2612, 7208, 11042) },
- { AOM_CDF4(1067, 2991, 4776) },
- { AOM_CDF4(25817, 31646, 32529) },
- { AOM_CDF4(13708, 26338, 30385) },
- { AOM_CDF4(7328, 18585, 24870) },
- { AOM_CDF4(4691, 13080, 19276) },
- { AOM_CDF4(1825, 5253, 8352) },
- { AOM_CDF4(29386, 32315, 32624) },
- { AOM_CDF4(17160, 29001, 31360) },
- { AOM_CDF4(9602, 21862, 27396) },
- { AOM_CDF4(5915, 15772, 22148) },
- { AOM_CDF4(2786, 7779, 12047) },
- { AOM_CDF4(29246, 32450, 32663) },
- { AOM_CDF4(18696, 29929, 31818) },
- { AOM_CDF4(10510, 23369, 28560) },
- { AOM_CDF4(6229, 16499, 23125) },
- { AOM_CDF4(2608, 7448, 11705) },
- { AOM_CDF4(30753, 32710, 32748) },
- { AOM_CDF4(21638, 31487, 32503) },
- { AOM_CDF4(12937, 26854, 30870) },
- { AOM_CDF4(8182, 20596, 26970) },
- { AOM_CDF4(3637, 10269, 15497) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) } },
- { { AOM_CDF4(5244, 12150, 16906) },
- { AOM_CDF4(20486, 26858, 29701) },
- { AOM_CDF4(7756, 18317, 23735) },
- { AOM_CDF4(3452, 9256, 13146) },
- { AOM_CDF4(2020, 5206, 8229) },
- { AOM_CDF4(1801, 4993, 7903) },
- { AOM_CDF4(27051, 31858, 32531) },
- { AOM_CDF4(15988, 27531, 30619) },
- { AOM_CDF4(9188, 21484, 26719) },
- { AOM_CDF4(6273, 17186, 23800) },
- { AOM_CDF4(3108, 9355, 14764) },
- { AOM_CDF4(31076, 32520, 32680) },
- { AOM_CDF4(18119, 30037, 31850) },
- { AOM_CDF4(10244, 22969, 27472) },
- { AOM_CDF4(4692, 14077, 19273) },
- { AOM_CDF4(3694, 11677, 17556) },
- { AOM_CDF4(30060, 32581, 32720) },
- { AOM_CDF4(21011, 30775, 32120) },
- { AOM_CDF4(11931, 24820, 29289) },
- { AOM_CDF4(7119, 17662, 24356) },
- { AOM_CDF4(3833, 10706, 16304) },
- { AOM_CDF4(31954, 32731, 32748) },
- { AOM_CDF4(23913, 31724, 32489) },
- { AOM_CDF4(15520, 28060, 31286) },
- { AOM_CDF4(11517, 23008, 28571) },
- { AOM_CDF4(6193, 14508, 20629) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) } } },
- { { { AOM_CDF4(1035, 2807, 4156) },
- { AOM_CDF4(13162, 18138, 20939) },
- { AOM_CDF4(2696, 6633, 8755) },
- { AOM_CDF4(1373, 4161, 6853) },
- { AOM_CDF4(1099, 2746, 4716) },
- { AOM_CDF4(340, 1021, 1599) },
- { AOM_CDF4(22826, 30419, 32135) },
- { AOM_CDF4(10395, 21762, 26942) },
- { AOM_CDF4(4726, 12407, 17361) },
- { AOM_CDF4(2447, 7080, 10593) },
- { AOM_CDF4(1227, 3717, 6011) },
- { AOM_CDF4(28156, 31424, 31934) },
- { AOM_CDF4(16915, 27754, 30373) },
- { AOM_CDF4(9148, 20990, 26431) },
- { AOM_CDF4(5950, 15515, 21148) },
- { AOM_CDF4(2492, 7327, 11526) },
- { AOM_CDF4(30602, 32477, 32670) },
- { AOM_CDF4(20026, 29955, 31568) },
- { AOM_CDF4(11220, 23628, 28105) },
- { AOM_CDF4(6652, 17019, 22973) },
- { AOM_CDF4(3064, 8536, 13043) },
- { AOM_CDF4(31769, 32724, 32748) },
- { AOM_CDF4(22230, 30887, 32373) },
- { AOM_CDF4(12234, 25079, 29731) },
- { AOM_CDF4(7326, 18816, 25353) },
- { AOM_CDF4(3933, 10907, 16616) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) } },
- { { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) } } } },
- { { { { AOM_CDF4(8896, 16227, 20630) },
- { AOM_CDF4(23629, 31782, 32527) },
- { AOM_CDF4(15173, 27755, 31321) },
- { AOM_CDF4(10158, 21233, 27382) },
- { AOM_CDF4(6420, 14857, 21558) },
- { AOM_CDF4(3269, 8155, 12646) },
- { AOM_CDF4(24835, 32009, 32496) },
- { AOM_CDF4(16509, 28421, 31579) },
- { AOM_CDF4(10957, 21514, 27418) },
- { AOM_CDF4(7881, 15930, 22096) },
- { AOM_CDF4(5388, 10960, 15918) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(20745, 30773, 32093) },
- { AOM_CDF4(15200, 27221, 30861) },
- { AOM_CDF4(13032, 20873, 25667) },
- { AOM_CDF4(12285, 18663, 23494) },
- { AOM_CDF4(11563, 17481, 21489) },
- { AOM_CDF4(26260, 31982, 32320) },
- { AOM_CDF4(15397, 28083, 31100) },
- { AOM_CDF4(9742, 19217, 24824) },
- { AOM_CDF4(3261, 9629, 15362) },
- { AOM_CDF4(1480, 4322, 7499) },
- { AOM_CDF4(27599, 32256, 32460) },
- { AOM_CDF4(16857, 27659, 30774) },
- { AOM_CDF4(9551, 18290, 23748) },
- { AOM_CDF4(3052, 8933, 14103) },
- { AOM_CDF4(2021, 5910, 9787) },
- { AOM_CDF4(29005, 32015, 32392) },
- { AOM_CDF4(17677, 27694, 30863) },
- { AOM_CDF4(9204, 17356, 23219) },
- { AOM_CDF4(2403, 7516, 12814) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) } },
- { { AOM_CDF4(10808, 22056, 26896) },
- { AOM_CDF4(25739, 32313, 32676) },
- { AOM_CDF4(17288, 30203, 32221) },
- { AOM_CDF4(11359, 24878, 29896) },
- { AOM_CDF4(6949, 17767, 24893) },
- { AOM_CDF4(4287, 11796, 18071) },
- { AOM_CDF4(27880, 32521, 32705) },
- { AOM_CDF4(19038, 31004, 32414) },
- { AOM_CDF4(12564, 26345, 30768) },
- { AOM_CDF4(8269, 19947, 26779) },
- { AOM_CDF4(5674, 14657, 21674) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(25742, 32319, 32671) },
- { AOM_CDF4(19557, 31164, 32454) },
- { AOM_CDF4(13381, 26381, 30755) },
- { AOM_CDF4(10101, 21466, 26722) },
- { AOM_CDF4(9209, 19650, 26825) },
- { AOM_CDF4(27107, 31917, 32432) },
- { AOM_CDF4(18056, 28893, 31203) },
- { AOM_CDF4(10200, 21434, 26764) },
- { AOM_CDF4(4660, 12913, 19502) },
- { AOM_CDF4(2368, 6930, 12504) },
- { AOM_CDF4(26960, 32158, 32613) },
- { AOM_CDF4(18628, 30005, 32031) },
- { AOM_CDF4(10233, 22442, 28232) },
- { AOM_CDF4(5471, 14630, 21516) },
- { AOM_CDF4(3235, 10767, 17109) },
- { AOM_CDF4(27696, 32440, 32692) },
- { AOM_CDF4(20032, 31167, 32438) },
- { AOM_CDF4(8700, 21341, 28442) },
- { AOM_CDF4(5662, 14831, 21795) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) } } },
- { { { AOM_CDF4(9704, 17294, 21132) },
- { AOM_CDF4(26762, 32278, 32633) },
- { AOM_CDF4(18382, 29620, 31819) },
- { AOM_CDF4(10891, 23475, 28723) },
- { AOM_CDF4(6358, 16583, 23309) },
- { AOM_CDF4(3248, 9118, 14141) },
- { AOM_CDF4(27204, 32573, 32699) },
- { AOM_CDF4(19818, 30824, 32329) },
- { AOM_CDF4(11772, 25120, 30041) },
- { AOM_CDF4(6995, 18033, 25039) },
- { AOM_CDF4(3752, 10442, 16098) },
- { AOM_CDF4(27222, 32256, 32559) },
- { AOM_CDF4(15356, 28399, 31475) },
- { AOM_CDF4(8821, 20635, 27057) },
- { AOM_CDF4(5511, 14404, 21239) },
- { AOM_CDF4(2935, 8222, 13051) },
- { AOM_CDF4(24875, 32120, 32529) },
- { AOM_CDF4(15233, 28265, 31445) },
- { AOM_CDF4(8605, 20570, 26932) },
- { AOM_CDF4(5431, 14413, 21196) },
- { AOM_CDF4(2994, 8341, 13223) },
- { AOM_CDF4(28201, 32604, 32700) },
- { AOM_CDF4(21041, 31446, 32456) },
- { AOM_CDF4(13221, 26213, 30475) },
- { AOM_CDF4(8255, 19385, 26037) },
- { AOM_CDF4(4930, 12585, 18830) },
- { AOM_CDF4(28768, 32448, 32627) },
- { AOM_CDF4(19705, 30561, 32021) },
- { AOM_CDF4(11572, 23589, 28220) },
- { AOM_CDF4(5532, 15034, 21446) },
- { AOM_CDF4(2460, 7150, 11456) },
- { AOM_CDF4(29874, 32619, 32699) },
- { AOM_CDF4(21621, 31071, 32201) },
- { AOM_CDF4(12511, 24747, 28992) },
- { AOM_CDF4(6281, 16395, 22748) },
- { AOM_CDF4(3246, 9278, 14497) },
- { AOM_CDF4(29715, 32625, 32712) },
- { AOM_CDF4(20958, 31011, 32283) },
- { AOM_CDF4(11233, 23671, 28806) },
- { AOM_CDF4(6012, 16128, 22868) },
- { AOM_CDF4(3427, 9851, 15414) },
- { AOM_CDF4(8192, 16384, 24576) } },
- { { AOM_CDF4(11016, 22111, 26794) },
- { AOM_CDF4(25946, 32357, 32677) },
- { AOM_CDF4(17890, 30452, 32252) },
- { AOM_CDF4(11678, 25142, 29816) },
- { AOM_CDF4(6720, 17534, 24584) },
- { AOM_CDF4(4230, 11665, 17820) },
- { AOM_CDF4(28400, 32623, 32747) },
- { AOM_CDF4(21164, 31668, 32575) },
- { AOM_CDF4(13572, 27388, 31182) },
- { AOM_CDF4(8234, 20750, 27358) },
- { AOM_CDF4(5065, 14055, 20897) },
- { AOM_CDF4(28981, 32547, 32705) },
- { AOM_CDF4(18681, 30543, 32239) },
- { AOM_CDF4(10919, 24075, 29286) },
- { AOM_CDF4(6431, 17199, 24077) },
- { AOM_CDF4(3819, 10464, 16618) },
- { AOM_CDF4(26870, 32467, 32693) },
- { AOM_CDF4(19041, 30831, 32347) },
- { AOM_CDF4(11794, 25211, 30016) },
- { AOM_CDF4(6888, 18019, 24970) },
- { AOM_CDF4(4370, 12363, 18992) },
- { AOM_CDF4(29578, 32670, 32744) },
- { AOM_CDF4(23159, 32007, 32613) },
- { AOM_CDF4(15315, 28669, 31676) },
- { AOM_CDF4(9298, 22607, 28782) },
- { AOM_CDF4(6144, 15913, 22968) },
- { AOM_CDF4(28110, 32499, 32669) },
- { AOM_CDF4(21574, 30937, 32015) },
- { AOM_CDF4(12759, 24818, 28727) },
- { AOM_CDF4(6545, 16761, 23042) },
- { AOM_CDF4(3649, 10597, 16833) },
- { AOM_CDF4(28163, 32552, 32728) },
- { AOM_CDF4(22101, 31469, 32464) },
- { AOM_CDF4(13160, 25472, 30143) },
- { AOM_CDF4(7303, 18684, 25468) },
- { AOM_CDF4(5241, 13975, 20955) },
- { AOM_CDF4(28400, 32631, 32744) },
- { AOM_CDF4(22104, 31793, 32603) },
- { AOM_CDF4(13557, 26571, 30846) },
- { AOM_CDF4(7749, 19861, 26675) },
- { AOM_CDF4(4873, 14030, 21234) },
- { AOM_CDF4(8192, 16384, 24576) } } },
- { { { AOM_CDF4(9800, 17635, 21073) },
- { AOM_CDF4(26153, 31885, 32527) },
- { AOM_CDF4(15038, 27852, 31006) },
- { AOM_CDF4(8718, 20564, 26486) },
- { AOM_CDF4(5128, 14076, 20514) },
- { AOM_CDF4(2636, 7566, 11925) },
- { AOM_CDF4(27551, 32504, 32701) },
- { AOM_CDF4(18310, 30054, 32100) },
- { AOM_CDF4(10211, 23420, 29082) },
- { AOM_CDF4(6222, 16876, 23916) },
- { AOM_CDF4(3462, 9954, 15498) },
- { AOM_CDF4(29991, 32633, 32721) },
- { AOM_CDF4(19883, 30751, 32201) },
- { AOM_CDF4(11141, 24184, 29285) },
- { AOM_CDF4(6420, 16940, 23774) },
- { AOM_CDF4(3392, 9753, 15118) },
- { AOM_CDF4(28465, 32616, 32712) },
- { AOM_CDF4(19850, 30702, 32244) },
- { AOM_CDF4(10983, 24024, 29223) },
- { AOM_CDF4(6294, 16770, 23582) },
- { AOM_CDF4(3244, 9283, 14509) },
- { AOM_CDF4(30023, 32717, 32748) },
- { AOM_CDF4(22940, 32032, 32626) },
- { AOM_CDF4(14282, 27928, 31473) },
- { AOM_CDF4(8562, 21327, 27914) },
- { AOM_CDF4(4846, 13393, 19919) },
- { AOM_CDF4(29981, 32590, 32695) },
- { AOM_CDF4(20465, 30963, 32166) },
- { AOM_CDF4(11479, 23579, 28195) },
- { AOM_CDF4(5916, 15648, 22073) },
- { AOM_CDF4(3031, 8605, 13398) },
- { AOM_CDF4(31146, 32691, 32739) },
- { AOM_CDF4(23106, 31724, 32444) },
- { AOM_CDF4(13783, 26738, 30439) },
- { AOM_CDF4(7852, 19468, 25807) },
- { AOM_CDF4(3860, 11124, 16853) },
- { AOM_CDF4(31014, 32724, 32748) },
- { AOM_CDF4(23629, 32109, 32628) },
- { AOM_CDF4(14747, 28115, 31403) },
- { AOM_CDF4(8545, 21242, 27478) },
- { AOM_CDF4(4574, 12781, 19067) },
- { AOM_CDF4(8192, 16384, 24576) } },
- { { AOM_CDF4(9185, 19694, 24688) },
- { AOM_CDF4(26081, 31985, 32621) },
- { AOM_CDF4(16015, 29000, 31787) },
- { AOM_CDF4(10542, 23690, 29206) },
- { AOM_CDF4(6732, 17945, 24677) },
- { AOM_CDF4(3916, 11039, 16722) },
- { AOM_CDF4(28224, 32566, 32744) },
- { AOM_CDF4(19100, 31138, 32485) },
- { AOM_CDF4(12528, 26620, 30879) },
- { AOM_CDF4(7741, 20277, 26885) },
- { AOM_CDF4(4566, 12845, 18990) },
- { AOM_CDF4(29933, 32593, 32718) },
- { AOM_CDF4(17670, 30333, 32155) },
- { AOM_CDF4(10385, 23600, 28909) },
- { AOM_CDF4(6243, 16236, 22407) },
- { AOM_CDF4(3976, 10389, 16017) },
- { AOM_CDF4(28377, 32561, 32738) },
- { AOM_CDF4(19366, 31175, 32482) },
- { AOM_CDF4(13327, 27175, 31094) },
- { AOM_CDF4(8258, 20769, 27143) },
- { AOM_CDF4(4703, 13198, 19527) },
- { AOM_CDF4(31086, 32706, 32748) },
- { AOM_CDF4(22853, 31902, 32583) },
- { AOM_CDF4(14759, 28186, 31419) },
- { AOM_CDF4(9284, 22382, 28348) },
- { AOM_CDF4(5585, 15192, 21868) },
- { AOM_CDF4(28291, 32652, 32746) },
- { AOM_CDF4(19849, 32107, 32571) },
- { AOM_CDF4(14834, 26818, 29214) },
- { AOM_CDF4(10306, 22594, 28672) },
- { AOM_CDF4(6615, 17384, 23384) },
- { AOM_CDF4(28947, 32604, 32745) },
- { AOM_CDF4(25625, 32289, 32646) },
- { AOM_CDF4(18758, 28672, 31403) },
- { AOM_CDF4(10017, 23430, 28523) },
- { AOM_CDF4(6862, 15269, 22131) },
- { AOM_CDF4(23933, 32509, 32739) },
- { AOM_CDF4(19927, 31495, 32631) },
- { AOM_CDF4(11903, 26023, 30621) },
- { AOM_CDF4(7026, 20094, 27252) },
- { AOM_CDF4(5998, 18106, 24437) },
- { AOM_CDF4(8192, 16384, 24576) } } },
- { { { AOM_CDF4(4456, 11274, 15533) },
- { AOM_CDF4(21219, 29079, 31616) },
- { AOM_CDF4(11173, 23774, 28567) },
- { AOM_CDF4(7282, 18293, 24263) },
- { AOM_CDF4(4890, 13286, 19115) },
- { AOM_CDF4(1890, 5508, 8659) },
- { AOM_CDF4(26651, 32136, 32647) },
- { AOM_CDF4(14630, 28254, 31455) },
- { AOM_CDF4(8716, 21287, 27395) },
- { AOM_CDF4(5615, 15331, 22008) },
- { AOM_CDF4(2675, 7700, 12150) },
- { AOM_CDF4(29954, 32526, 32690) },
- { AOM_CDF4(16126, 28982, 31633) },
- { AOM_CDF4(9030, 21361, 27352) },
- { AOM_CDF4(5411, 14793, 21271) },
- { AOM_CDF4(2943, 8422, 13163) },
- { AOM_CDF4(29539, 32601, 32730) },
- { AOM_CDF4(18125, 30385, 32201) },
- { AOM_CDF4(10422, 24090, 29468) },
- { AOM_CDF4(6468, 17487, 24438) },
- { AOM_CDF4(2970, 8653, 13531) },
- { AOM_CDF4(30912, 32715, 32748) },
- { AOM_CDF4(20666, 31373, 32497) },
- { AOM_CDF4(12509, 26640, 30917) },
- { AOM_CDF4(8058, 20629, 27290) },
- { AOM_CDF4(4231, 12006, 18052) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) } },
- { { AOM_CDF4(10202, 20633, 25484) },
- { AOM_CDF4(27336, 31445, 32352) },
- { AOM_CDF4(12420, 24384, 28552) },
- { AOM_CDF4(7648, 18115, 23856) },
- { AOM_CDF4(5662, 14341, 19902) },
- { AOM_CDF4(3611, 10328, 15390) },
- { AOM_CDF4(30945, 32616, 32736) },
- { AOM_CDF4(18682, 30505, 32253) },
- { AOM_CDF4(11513, 25336, 30203) },
- { AOM_CDF4(7449, 19452, 26148) },
- { AOM_CDF4(4482, 13051, 18886) },
- { AOM_CDF4(32022, 32690, 32747) },
- { AOM_CDF4(18578, 30501, 32146) },
- { AOM_CDF4(11249, 23368, 28631) },
- { AOM_CDF4(5645, 16958, 22158) },
- { AOM_CDF4(5009, 11444, 16637) },
- { AOM_CDF4(31357, 32710, 32748) },
- { AOM_CDF4(21552, 31494, 32504) },
- { AOM_CDF4(13891, 27677, 31340) },
- { AOM_CDF4(9051, 22098, 28172) },
- { AOM_CDF4(5190, 13377, 19486) },
- { AOM_CDF4(32364, 32740, 32748) },
- { AOM_CDF4(24839, 31907, 32551) },
- { AOM_CDF4(17160, 28779, 31696) },
- { AOM_CDF4(12452, 24137, 29602) },
- { AOM_CDF4(6165, 15389, 22477) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) } } },
- { { { AOM_CDF4(2575, 7281, 11077) },
- { AOM_CDF4(14002, 20866, 25402) },
- { AOM_CDF4(6343, 15056, 19658) },
- { AOM_CDF4(4474, 11858, 17041) },
- { AOM_CDF4(2865, 8299, 12534) },
- { AOM_CDF4(1344, 3949, 6391) },
- { AOM_CDF4(24720, 31239, 32459) },
- { AOM_CDF4(12585, 25356, 29968) },
- { AOM_CDF4(7181, 18246, 24444) },
- { AOM_CDF4(5025, 13667, 19885) },
- { AOM_CDF4(2521, 7304, 11605) },
- { AOM_CDF4(29908, 32252, 32584) },
- { AOM_CDF4(17421, 29156, 31575) },
- { AOM_CDF4(9889, 22188, 27782) },
- { AOM_CDF4(5878, 15647, 22123) },
- { AOM_CDF4(2814, 8665, 13323) },
- { AOM_CDF4(30183, 32568, 32713) },
- { AOM_CDF4(18528, 30195, 32049) },
- { AOM_CDF4(10982, 24606, 29657) },
- { AOM_CDF4(6957, 18165, 25231) },
- { AOM_CDF4(3508, 10118, 15468) },
- { AOM_CDF4(31761, 32736, 32748) },
- { AOM_CDF4(21041, 31328, 32546) },
- { AOM_CDF4(12568, 26732, 31166) },
- { AOM_CDF4(8052, 20720, 27733) },
- { AOM_CDF4(4336, 12192, 18396) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) } },
- { { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) } } } },
- { { { { AOM_CDF4(7062, 16472, 22319) },
- { AOM_CDF4(24538, 32261, 32674) },
- { AOM_CDF4(13675, 28041, 31779) },
- { AOM_CDF4(8590, 20674, 27631) },
- { AOM_CDF4(5685, 14675, 22013) },
- { AOM_CDF4(3655, 9898, 15731) },
- { AOM_CDF4(26493, 32418, 32658) },
- { AOM_CDF4(16376, 29342, 32090) },
- { AOM_CDF4(10594, 22649, 28970) },
- { AOM_CDF4(8176, 17170, 24303) },
- { AOM_CDF4(5605, 12694, 19139) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(23888, 31902, 32542) },
- { AOM_CDF4(18612, 29687, 31987) },
- { AOM_CDF4(16245, 24852, 29249) },
- { AOM_CDF4(15765, 22608, 27559) },
- { AOM_CDF4(19895, 24699, 27510) },
- { AOM_CDF4(28401, 32212, 32457) },
- { AOM_CDF4(15274, 27825, 30980) },
- { AOM_CDF4(9364, 18128, 24332) },
- { AOM_CDF4(2283, 8193, 15082) },
- { AOM_CDF4(1228, 3972, 7881) },
- { AOM_CDF4(29455, 32469, 32620) },
- { AOM_CDF4(17981, 28245, 31388) },
- { AOM_CDF4(10921, 20098, 26240) },
- { AOM_CDF4(3743, 11829, 18657) },
- { AOM_CDF4(2374, 9593, 15715) },
- { AOM_CDF4(31068, 32466, 32635) },
- { AOM_CDF4(20321, 29572, 31971) },
- { AOM_CDF4(10771, 20255, 27119) },
- { AOM_CDF4(2795, 10410, 17361) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) } },
- { { AOM_CDF4(9320, 22102, 27840) },
- { AOM_CDF4(27057, 32464, 32724) },
- { AOM_CDF4(16331, 30268, 32309) },
- { AOM_CDF4(10319, 23935, 29720) },
- { AOM_CDF4(6189, 16448, 24106) },
- { AOM_CDF4(3589, 10884, 18808) },
- { AOM_CDF4(29026, 32624, 32748) },
- { AOM_CDF4(19226, 31507, 32587) },
- { AOM_CDF4(12692, 26921, 31203) },
- { AOM_CDF4(7049, 19532, 27635) },
- { AOM_CDF4(7727, 15669, 23252) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(28056, 32625, 32748) },
- { AOM_CDF4(22383, 32075, 32669) },
- { AOM_CDF4(15417, 27098, 31749) },
- { AOM_CDF4(18127, 26493, 27190) },
- { AOM_CDF4(5461, 16384, 21845) },
- { AOM_CDF4(27982, 32091, 32584) },
- { AOM_CDF4(19045, 29868, 31972) },
- { AOM_CDF4(10397, 22266, 27932) },
- { AOM_CDF4(5990, 13697, 21500) },
- { AOM_CDF4(1792, 6912, 15104) },
- { AOM_CDF4(28198, 32501, 32718) },
- { AOM_CDF4(21534, 31521, 32569) },
- { AOM_CDF4(11109, 25217, 30017) },
- { AOM_CDF4(5671, 15124, 26151) },
- { AOM_CDF4(4681, 14043, 18725) },
- { AOM_CDF4(28688, 32580, 32741) },
- { AOM_CDF4(22576, 32079, 32661) },
- { AOM_CDF4(10627, 22141, 28340) },
- { AOM_CDF4(9362, 14043, 28087) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) } } },
- { { { AOM_CDF4(7754, 16948, 22142) },
- { AOM_CDF4(25670, 32330, 32691) },
- { AOM_CDF4(15663, 29225, 31994) },
- { AOM_CDF4(9878, 23288, 29158) },
- { AOM_CDF4(6419, 17088, 24336) },
- { AOM_CDF4(3859, 11003, 17039) },
- { AOM_CDF4(27562, 32595, 32725) },
- { AOM_CDF4(17575, 30588, 32399) },
- { AOM_CDF4(10819, 24838, 30309) },
- { AOM_CDF4(7124, 18686, 25916) },
- { AOM_CDF4(4479, 12688, 19340) },
- { AOM_CDF4(28385, 32476, 32673) },
- { AOM_CDF4(15306, 29005, 31938) },
- { AOM_CDF4(8937, 21615, 28322) },
- { AOM_CDF4(5982, 15603, 22786) },
- { AOM_CDF4(3620, 10267, 16136) },
- { AOM_CDF4(27280, 32464, 32667) },
- { AOM_CDF4(15607, 29160, 32004) },
- { AOM_CDF4(9091, 22135, 28740) },
- { AOM_CDF4(6232, 16632, 24020) },
- { AOM_CDF4(4047, 11377, 17672) },
- { AOM_CDF4(29220, 32630, 32718) },
- { AOM_CDF4(19650, 31220, 32462) },
- { AOM_CDF4(13050, 26312, 30827) },
- { AOM_CDF4(9228, 20870, 27468) },
- { AOM_CDF4(6146, 15149, 21971) },
- { AOM_CDF4(30169, 32481, 32623) },
- { AOM_CDF4(17212, 29311, 31554) },
- { AOM_CDF4(9911, 21311, 26882) },
- { AOM_CDF4(4487, 13314, 20372) },
- { AOM_CDF4(2570, 7772, 12889) },
- { AOM_CDF4(30924, 32613, 32708) },
- { AOM_CDF4(19490, 30206, 32107) },
- { AOM_CDF4(11232, 23998, 29276) },
- { AOM_CDF4(6769, 17955, 25035) },
- { AOM_CDF4(4398, 12623, 19214) },
- { AOM_CDF4(30609, 32627, 32722) },
- { AOM_CDF4(19370, 30582, 32287) },
- { AOM_CDF4(10457, 23619, 29409) },
- { AOM_CDF4(6443, 17637, 24834) },
- { AOM_CDF4(4645, 13236, 20106) },
- { AOM_CDF4(8192, 16384, 24576) } },
- { { AOM_CDF4(8626, 20271, 26216) },
- { AOM_CDF4(26707, 32406, 32711) },
- { AOM_CDF4(16999, 30329, 32286) },
- { AOM_CDF4(11445, 25123, 30286) },
- { AOM_CDF4(6411, 18828, 25601) },
- { AOM_CDF4(6801, 12458, 20248) },
- { AOM_CDF4(29918, 32682, 32748) },
- { AOM_CDF4(20649, 31739, 32618) },
- { AOM_CDF4(12879, 27773, 31581) },
- { AOM_CDF4(7896, 21751, 28244) },
- { AOM_CDF4(5260, 14870, 23698) },
- { AOM_CDF4(29252, 32593, 32731) },
- { AOM_CDF4(17072, 30460, 32294) },
- { AOM_CDF4(10653, 24143, 29365) },
- { AOM_CDF4(6536, 17490, 23983) },
- { AOM_CDF4(4929, 13170, 20085) },
- { AOM_CDF4(28137, 32518, 32715) },
- { AOM_CDF4(18171, 30784, 32407) },
- { AOM_CDF4(11437, 25436, 30459) },
- { AOM_CDF4(7252, 18534, 26176) },
- { AOM_CDF4(4126, 13353, 20978) },
- { AOM_CDF4(31162, 32726, 32748) },
- { AOM_CDF4(23017, 32222, 32701) },
- { AOM_CDF4(15629, 29233, 32046) },
- { AOM_CDF4(9387, 22621, 29480) },
- { AOM_CDF4(6922, 17616, 25010) },
- { AOM_CDF4(28838, 32265, 32614) },
- { AOM_CDF4(19701, 30206, 31920) },
- { AOM_CDF4(11214, 22410, 27933) },
- { AOM_CDF4(5320, 14177, 23034) },
- { AOM_CDF4(5049, 12881, 17827) },
- { AOM_CDF4(27484, 32471, 32734) },
- { AOM_CDF4(21076, 31526, 32561) },
- { AOM_CDF4(12707, 26303, 31211) },
- { AOM_CDF4(8169, 21722, 28219) },
- { AOM_CDF4(6045, 19406, 27042) },
- { AOM_CDF4(27753, 32572, 32745) },
- { AOM_CDF4(20832, 31878, 32653) },
- { AOM_CDF4(13250, 27356, 31674) },
- { AOM_CDF4(7718, 21508, 29858) },
- { AOM_CDF4(7209, 18350, 25559) },
- { AOM_CDF4(8192, 16384, 24576) } } },
- { { { AOM_CDF4(7876, 16901, 21741) },
- { AOM_CDF4(24001, 31898, 32625) },
- { AOM_CDF4(14529, 27959, 31451) },
- { AOM_CDF4(8273, 20818, 27258) },
- { AOM_CDF4(5278, 14673, 21510) },
- { AOM_CDF4(2983, 8843, 14039) },
- { AOM_CDF4(28016, 32574, 32732) },
- { AOM_CDF4(17471, 30306, 32301) },
- { AOM_CDF4(10224, 24063, 29728) },
- { AOM_CDF4(6602, 17954, 25052) },
- { AOM_CDF4(4002, 11585, 17759) },
- { AOM_CDF4(30190, 32634, 32739) },
- { AOM_CDF4(17497, 30282, 32270) },
- { AOM_CDF4(10229, 23729, 29538) },
- { AOM_CDF4(6344, 17211, 24440) },
- { AOM_CDF4(3849, 11189, 17108) },
- { AOM_CDF4(28570, 32583, 32726) },
- { AOM_CDF4(17521, 30161, 32238) },
- { AOM_CDF4(10153, 23565, 29378) },
- { AOM_CDF4(6455, 17341, 24443) },
- { AOM_CDF4(3907, 11042, 17024) },
- { AOM_CDF4(30689, 32715, 32748) },
- { AOM_CDF4(21546, 31840, 32610) },
- { AOM_CDF4(13547, 27581, 31459) },
- { AOM_CDF4(8912, 21757, 28309) },
- { AOM_CDF4(5548, 15080, 22046) },
- { AOM_CDF4(30783, 32540, 32685) },
- { AOM_CDF4(17540, 29528, 31668) },
- { AOM_CDF4(10160, 21468, 26783) },
- { AOM_CDF4(4724, 13393, 20054) },
- { AOM_CDF4(2702, 8174, 13102) },
- { AOM_CDF4(31648, 32686, 32742) },
- { AOM_CDF4(20954, 31094, 32337) },
- { AOM_CDF4(12420, 25698, 30179) },
- { AOM_CDF4(7304, 19320, 26248) },
- { AOM_CDF4(4366, 12261, 18864) },
- { AOM_CDF4(31581, 32723, 32748) },
- { AOM_CDF4(21373, 31586, 32525) },
- { AOM_CDF4(12744, 26625, 30885) },
- { AOM_CDF4(7431, 20322, 26950) },
- { AOM_CDF4(4692, 13323, 20111) },
- { AOM_CDF4(8192, 16384, 24576) } },
- { { AOM_CDF4(7833, 18369, 24095) },
- { AOM_CDF4(26650, 32273, 32702) },
- { AOM_CDF4(16371, 29961, 32191) },
- { AOM_CDF4(11055, 24082, 29629) },
- { AOM_CDF4(6892, 18644, 25400) },
- { AOM_CDF4(5006, 13057, 19240) },
- { AOM_CDF4(29834, 32666, 32748) },
- { AOM_CDF4(19577, 31335, 32570) },
- { AOM_CDF4(12253, 26509, 31122) },
- { AOM_CDF4(7991, 20772, 27711) },
- { AOM_CDF4(5677, 15910, 23059) },
- { AOM_CDF4(30109, 32532, 32720) },
- { AOM_CDF4(16747, 30166, 32252) },
- { AOM_CDF4(10134, 23542, 29184) },
- { AOM_CDF4(5791, 16176, 23556) },
- { AOM_CDF4(4362, 10414, 17284) },
- { AOM_CDF4(29492, 32626, 32748) },
- { AOM_CDF4(19894, 31402, 32525) },
- { AOM_CDF4(12942, 27071, 30869) },
- { AOM_CDF4(8346, 21216, 27405) },
- { AOM_CDF4(6572, 17087, 23859) },
- { AOM_CDF4(32035, 32735, 32748) },
- { AOM_CDF4(22957, 31838, 32618) },
- { AOM_CDF4(14724, 28572, 31772) },
- { AOM_CDF4(10364, 23999, 29553) },
- { AOM_CDF4(7004, 18433, 25655) },
- { AOM_CDF4(27528, 32277, 32681) },
- { AOM_CDF4(16959, 31171, 32096) },
- { AOM_CDF4(10486, 23593, 27962) },
- { AOM_CDF4(8192, 16384, 23211) },
- { AOM_CDF4(8937, 17873, 20852) },
- { AOM_CDF4(27715, 32002, 32615) },
- { AOM_CDF4(15073, 29491, 31676) },
- { AOM_CDF4(11264, 24576, 28672) },
- { AOM_CDF4(2341, 18725, 23406) },
- { AOM_CDF4(7282, 18204, 25486) },
- { AOM_CDF4(28547, 32213, 32657) },
- { AOM_CDF4(20788, 29773, 32239) },
- { AOM_CDF4(6780, 21469, 30508) },
- { AOM_CDF4(5958, 14895, 23831) },
- { AOM_CDF4(16384, 21845, 27307) },
- { AOM_CDF4(8192, 16384, 24576) } } },
- { { { AOM_CDF4(5992, 14304, 19765) },
- { AOM_CDF4(22612, 31238, 32456) },
- { AOM_CDF4(13456, 27162, 31087) },
- { AOM_CDF4(8001, 20062, 26504) },
- { AOM_CDF4(5168, 14105, 20764) },
- { AOM_CDF4(2632, 7771, 12385) },
- { AOM_CDF4(27034, 32344, 32709) },
- { AOM_CDF4(15850, 29415, 31997) },
- { AOM_CDF4(9494, 22776, 28841) },
- { AOM_CDF4(6151, 16830, 23969) },
- { AOM_CDF4(3461, 10039, 15722) },
- { AOM_CDF4(30134, 32569, 32731) },
- { AOM_CDF4(15638, 29422, 31945) },
- { AOM_CDF4(9150, 21865, 28218) },
- { AOM_CDF4(5647, 15719, 22676) },
- { AOM_CDF4(3402, 9772, 15477) },
- { AOM_CDF4(28530, 32586, 32735) },
- { AOM_CDF4(17139, 30298, 32292) },
- { AOM_CDF4(10200, 24039, 29685) },
- { AOM_CDF4(6419, 17674, 24786) },
- { AOM_CDF4(3544, 10225, 15824) },
- { AOM_CDF4(31333, 32726, 32748) },
- { AOM_CDF4(20618, 31487, 32544) },
- { AOM_CDF4(12901, 27217, 31232) },
- { AOM_CDF4(8624, 21734, 28171) },
- { AOM_CDF4(5104, 14191, 20748) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) } },
- { { AOM_CDF4(11206, 21090, 26561) },
- { AOM_CDF4(28759, 32279, 32671) },
- { AOM_CDF4(14171, 27952, 31569) },
- { AOM_CDF4(9743, 22907, 29141) },
- { AOM_CDF4(6871, 17886, 24868) },
- { AOM_CDF4(4960, 13152, 19315) },
- { AOM_CDF4(31077, 32661, 32748) },
- { AOM_CDF4(19400, 31195, 32515) },
- { AOM_CDF4(12752, 26858, 31040) },
- { AOM_CDF4(8370, 22098, 28591) },
- { AOM_CDF4(5457, 15373, 22298) },
- { AOM_CDF4(31697, 32706, 32748) },
- { AOM_CDF4(17860, 30657, 32333) },
- { AOM_CDF4(12510, 24812, 29261) },
- { AOM_CDF4(6180, 19124, 24722) },
- { AOM_CDF4(5041, 13548, 17959) },
- { AOM_CDF4(31552, 32716, 32748) },
- { AOM_CDF4(21908, 31769, 32623) },
- { AOM_CDF4(14470, 28201, 31565) },
- { AOM_CDF4(9493, 22982, 28608) },
- { AOM_CDF4(6858, 17240, 24137) },
- { AOM_CDF4(32543, 32752, 32756) },
- { AOM_CDF4(24286, 32097, 32666) },
- { AOM_CDF4(15958, 29217, 32024) },
- { AOM_CDF4(10207, 24234, 29958) },
- { AOM_CDF4(6929, 18305, 25652) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) } } },
- { { { AOM_CDF4(4137, 10847, 15682) },
- { AOM_CDF4(17824, 27001, 30058) },
- { AOM_CDF4(10204, 22796, 28291) },
- { AOM_CDF4(6076, 15935, 22125) },
- { AOM_CDF4(3852, 10937, 16816) },
- { AOM_CDF4(2252, 6324, 10131) },
- { AOM_CDF4(25840, 32016, 32662) },
- { AOM_CDF4(15109, 28268, 31531) },
- { AOM_CDF4(9385, 22231, 28340) },
- { AOM_CDF4(6082, 16672, 23479) },
- { AOM_CDF4(3318, 9427, 14681) },
- { AOM_CDF4(30594, 32574, 32718) },
- { AOM_CDF4(16836, 29552, 31859) },
- { AOM_CDF4(9556, 22542, 28356) },
- { AOM_CDF4(6305, 16725, 23540) },
- { AOM_CDF4(3376, 9895, 15184) },
- { AOM_CDF4(29383, 32617, 32745) },
- { AOM_CDF4(18891, 30809, 32401) },
- { AOM_CDF4(11688, 25942, 30687) },
- { AOM_CDF4(7468, 19469, 26651) },
- { AOM_CDF4(3909, 11358, 17012) },
- { AOM_CDF4(31564, 32736, 32748) },
- { AOM_CDF4(20906, 31611, 32600) },
- { AOM_CDF4(13191, 27621, 31537) },
- { AOM_CDF4(8768, 22029, 28676) },
- { AOM_CDF4(5079, 14109, 20906) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) } },
- { { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) },
- { AOM_CDF4(8192, 16384, 24576) } } } } };
+ [CDF_SIZE(NUM_BASE_LEVELS +
+ 2)] = { { { { { AOM_CDF4(4034, 8930, 12727) },
+ { AOM_CDF4(18082, 29741, 31877) },
+ { AOM_CDF4(12596, 26124, 30493) },
+ { AOM_CDF4(9446, 21118, 27005) },
+ { AOM_CDF4(6308, 15141, 21279) },
+ { AOM_CDF4(2463, 6357, 9783) },
+ { AOM_CDF4(20667, 30546, 31929) },
+ { AOM_CDF4(13043, 26123, 30134) },
+ { AOM_CDF4(8151, 18757, 24778) },
+ { AOM_CDF4(5255, 12839, 18632) },
+ { AOM_CDF4(2820, 7206, 11161) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(15736, 27553, 30604) },
+ { AOM_CDF4(11210, 23794, 28787) },
+ { AOM_CDF4(5947, 13874, 19701) },
+ { AOM_CDF4(4215, 9323, 13891) },
+ { AOM_CDF4(2833, 6462, 10059) },
+ { AOM_CDF4(19605, 30393, 31582) },
+ { AOM_CDF4(13523, 26252, 30248) },
+ { AOM_CDF4(8446, 18622, 24512) },
+ { AOM_CDF4(3818, 10343, 15974) },
+ { AOM_CDF4(1481, 4117, 6796) },
+ { AOM_CDF4(22649, 31302, 32190) },
+ { AOM_CDF4(14829, 27127, 30449) },
+ { AOM_CDF4(8313, 17702, 23304) },
+ { AOM_CDF4(3022, 8301, 12786) },
+ { AOM_CDF4(1536, 4412, 7184) },
+ { AOM_CDF4(22354, 29774, 31372) },
+ { AOM_CDF4(14723, 25472, 29214) },
+ { AOM_CDF4(6673, 13745, 18662) },
+ { AOM_CDF4(2068, 5766, 9322) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } },
+ { { AOM_CDF4(6302, 16444, 21761) },
+ { AOM_CDF4(23040, 31538, 32475) },
+ { AOM_CDF4(15196, 28452, 31496) },
+ { AOM_CDF4(10020, 22946, 28514) },
+ { AOM_CDF4(6533, 16862, 23501) },
+ { AOM_CDF4(3538, 9816, 15076) },
+ { AOM_CDF4(24444, 31875, 32525) },
+ { AOM_CDF4(15881, 28924, 31635) },
+ { AOM_CDF4(9922, 22873, 28466) },
+ { AOM_CDF4(6527, 16966, 23691) },
+ { AOM_CDF4(4114, 11303, 17220) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(20201, 30770, 32209) },
+ { AOM_CDF4(14754, 28071, 31258) },
+ { AOM_CDF4(8378, 20186, 26517) },
+ { AOM_CDF4(5916, 15299, 21978) },
+ { AOM_CDF4(4268, 11583, 17901) },
+ { AOM_CDF4(24361, 32025, 32581) },
+ { AOM_CDF4(18673, 30105, 31943) },
+ { AOM_CDF4(10196, 22244, 27576) },
+ { AOM_CDF4(5495, 14349, 20417) },
+ { AOM_CDF4(2676, 7415, 11498) },
+ { AOM_CDF4(24678, 31958, 32585) },
+ { AOM_CDF4(18629, 29906, 31831) },
+ { AOM_CDF4(9364, 20724, 26315) },
+ { AOM_CDF4(4641, 12318, 18094) },
+ { AOM_CDF4(2758, 7387, 11579) },
+ { AOM_CDF4(25433, 31842, 32469) },
+ { AOM_CDF4(18795, 29289, 31411) },
+ { AOM_CDF4(7644, 17584, 23592) },
+ { AOM_CDF4(3408, 9014, 15047) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } } },
+ { { { AOM_CDF4(4536, 10072, 14001) },
+ { AOM_CDF4(25459, 31416, 32206) },
+ { AOM_CDF4(16605, 28048, 30818) },
+ { AOM_CDF4(11008, 22857, 27719) },
+ { AOM_CDF4(6915, 16268, 22315) },
+ { AOM_CDF4(2625, 6812, 10537) },
+ { AOM_CDF4(24257, 31788, 32499) },
+ { AOM_CDF4(16880, 29454, 31879) },
+ { AOM_CDF4(11958, 25054, 29778) },
+ { AOM_CDF4(7916, 18718, 25084) },
+ { AOM_CDF4(3383, 8777, 13446) },
+ { AOM_CDF4(22720, 31603, 32393) },
+ { AOM_CDF4(14960, 28125, 31335) },
+ { AOM_CDF4(9731, 22210, 27928) },
+ { AOM_CDF4(6304, 15832, 22277) },
+ { AOM_CDF4(2910, 7818, 12166) },
+ { AOM_CDF4(20375, 30627, 32131) },
+ { AOM_CDF4(13904, 27284, 30887) },
+ { AOM_CDF4(9368, 21558, 27144) },
+ { AOM_CDF4(5937, 14966, 21119) },
+ { AOM_CDF4(2667, 7225, 11319) },
+ { AOM_CDF4(23970, 31470, 32378) },
+ { AOM_CDF4(17173, 29734, 32018) },
+ { AOM_CDF4(12795, 25441, 29965) },
+ { AOM_CDF4(8981, 19680, 25893) },
+ { AOM_CDF4(4728, 11372, 16902) },
+ { AOM_CDF4(24287, 31797, 32439) },
+ { AOM_CDF4(16703, 29145, 31696) },
+ { AOM_CDF4(10833, 23554, 28725) },
+ { AOM_CDF4(6468, 16566, 23057) },
+ { AOM_CDF4(2415, 6562, 10278) },
+ { AOM_CDF4(26610, 32395, 32659) },
+ { AOM_CDF4(18590, 30498, 32117) },
+ { AOM_CDF4(12420, 25756, 29950) },
+ { AOM_CDF4(7639, 18746, 24710) },
+ { AOM_CDF4(3001, 8086, 12347) },
+ { AOM_CDF4(25076, 32064, 32580) },
+ { AOM_CDF4(17946, 30128, 32028) },
+ { AOM_CDF4(12024, 24985, 29378) },
+ { AOM_CDF4(7517, 18390, 24304) },
+ { AOM_CDF4(3243, 8781, 13331) },
+ { AOM_CDF4(8192, 16384, 24576) } },
+ { { AOM_CDF4(6037, 16771, 21957) },
+ { AOM_CDF4(24774, 31704, 32426) },
+ { AOM_CDF4(16830, 28589, 31056) },
+ { AOM_CDF4(10602, 22828, 27760) },
+ { AOM_CDF4(6733, 16829, 23071) },
+ { AOM_CDF4(3250, 8914, 13556) },
+ { AOM_CDF4(25582, 32220, 32668) },
+ { AOM_CDF4(18659, 30342, 32223) },
+ { AOM_CDF4(12546, 26149, 30515) },
+ { AOM_CDF4(8420, 20451, 26801) },
+ { AOM_CDF4(4636, 12420, 18344) },
+ { AOM_CDF4(27581, 32362, 32639) },
+ { AOM_CDF4(18987, 30083, 31978) },
+ { AOM_CDF4(11327, 24248, 29084) },
+ { AOM_CDF4(7264, 17719, 24120) },
+ { AOM_CDF4(3995, 10768, 16169) },
+ { AOM_CDF4(25893, 31831, 32487) },
+ { AOM_CDF4(16577, 28587, 31379) },
+ { AOM_CDF4(10189, 22748, 28182) },
+ { AOM_CDF4(6832, 17094, 23556) },
+ { AOM_CDF4(3708, 10110, 15334) },
+ { AOM_CDF4(25904, 32282, 32656) },
+ { AOM_CDF4(19721, 30792, 32276) },
+ { AOM_CDF4(12819, 26243, 30411) },
+ { AOM_CDF4(8572, 20614, 26891) },
+ { AOM_CDF4(5364, 14059, 20467) },
+ { AOM_CDF4(26580, 32438, 32677) },
+ { AOM_CDF4(20852, 31225, 32340) },
+ { AOM_CDF4(12435, 25700, 29967) },
+ { AOM_CDF4(8691, 20825, 26976) },
+ { AOM_CDF4(4446, 12209, 17269) },
+ { AOM_CDF4(27350, 32429, 32696) },
+ { AOM_CDF4(21372, 30977, 32272) },
+ { AOM_CDF4(12673, 25270, 29853) },
+ { AOM_CDF4(9208, 20925, 26640) },
+ { AOM_CDF4(5018, 13351, 18732) },
+ { AOM_CDF4(27351, 32479, 32713) },
+ { AOM_CDF4(21398, 31209, 32387) },
+ { AOM_CDF4(12162, 25047, 29842) },
+ { AOM_CDF4(7896, 18691, 25319) },
+ { AOM_CDF4(4670, 12882, 18881) },
+ { AOM_CDF4(8192, 16384, 24576) } } },
+ { { { AOM_CDF4(5487, 10460, 13708) },
+ { AOM_CDF4(21597, 28303, 30674) },
+ { AOM_CDF4(11037, 21953, 26476) },
+ { AOM_CDF4(8147, 17962, 22952) },
+ { AOM_CDF4(5242, 13061, 18532) },
+ { AOM_CDF4(1889, 5208, 8182) },
+ { AOM_CDF4(26774, 32133, 32590) },
+ { AOM_CDF4(17844, 29564, 31767) },
+ { AOM_CDF4(11690, 24438, 29171) },
+ { AOM_CDF4(7542, 18215, 24459) },
+ { AOM_CDF4(2993, 8050, 12319) },
+ { AOM_CDF4(28023, 32328, 32591) },
+ { AOM_CDF4(18651, 30126, 31954) },
+ { AOM_CDF4(12164, 25146, 29589) },
+ { AOM_CDF4(7762, 18530, 24771) },
+ { AOM_CDF4(3492, 9183, 13920) },
+ { AOM_CDF4(27591, 32008, 32491) },
+ { AOM_CDF4(17149, 28853, 31510) },
+ { AOM_CDF4(11485, 24003, 28860) },
+ { AOM_CDF4(7697, 18086, 24210) },
+ { AOM_CDF4(3075, 7999, 12218) },
+ { AOM_CDF4(28268, 32482, 32654) },
+ { AOM_CDF4(19631, 31051, 32404) },
+ { AOM_CDF4(13860, 27260, 31020) },
+ { AOM_CDF4(9605, 21613, 27594) },
+ { AOM_CDF4(4876, 12162, 17908) },
+ { AOM_CDF4(27248, 32316, 32576) },
+ { AOM_CDF4(18955, 30457, 32075) },
+ { AOM_CDF4(11824, 23997, 28795) },
+ { AOM_CDF4(7346, 18196, 24647) },
+ { AOM_CDF4(3403, 9247, 14111) },
+ { AOM_CDF4(29711, 32655, 32735) },
+ { AOM_CDF4(21169, 31394, 32417) },
+ { AOM_CDF4(13487, 27198, 30957) },
+ { AOM_CDF4(8828, 21683, 27614) },
+ { AOM_CDF4(4270, 11451, 17038) },
+ { AOM_CDF4(28708, 32578, 32731) },
+ { AOM_CDF4(20120, 31241, 32482) },
+ { AOM_CDF4(13692, 27550, 31321) },
+ { AOM_CDF4(9418, 22514, 28439) },
+ { AOM_CDF4(4999, 13283, 19462) },
+ { AOM_CDF4(8192, 16384, 24576) } },
+ { { AOM_CDF4(5673, 14302, 19711) },
+ { AOM_CDF4(26251, 30701, 31834) },
+ { AOM_CDF4(12782, 23783, 27803) },
+ { AOM_CDF4(9127, 20657, 25808) },
+ { AOM_CDF4(6368, 16208, 21462) },
+ { AOM_CDF4(2465, 7177, 10822) },
+ { AOM_CDF4(29961, 32563, 32719) },
+ { AOM_CDF4(18318, 29891, 31949) },
+ { AOM_CDF4(11361, 24514, 29357) },
+ { AOM_CDF4(7900, 19603, 25607) },
+ { AOM_CDF4(4002, 10590, 15546) },
+ { AOM_CDF4(29637, 32310, 32595) },
+ { AOM_CDF4(18296, 29913, 31809) },
+ { AOM_CDF4(10144, 21515, 26871) },
+ { AOM_CDF4(5358, 14322, 20394) },
+ { AOM_CDF4(3067, 8362, 13346) },
+ { AOM_CDF4(28652, 32470, 32676) },
+ { AOM_CDF4(17538, 30771, 32209) },
+ { AOM_CDF4(13924, 26882, 30494) },
+ { AOM_CDF4(10496, 22837, 27869) },
+ { AOM_CDF4(7236, 16396, 21621) },
+ { AOM_CDF4(30743, 32687, 32746) },
+ { AOM_CDF4(23006, 31676, 32489) },
+ { AOM_CDF4(14494, 27828, 31120) },
+ { AOM_CDF4(10174, 22801, 28352) },
+ { AOM_CDF4(6242, 15281, 21043) },
+ { AOM_CDF4(25817, 32243, 32720) },
+ { AOM_CDF4(18618, 31367, 32325) },
+ { AOM_CDF4(13997, 28318, 31878) },
+ { AOM_CDF4(12255, 26534, 31383) },
+ { AOM_CDF4(9561, 21588, 28450) },
+ { AOM_CDF4(28188, 32635, 32724) },
+ { AOM_CDF4(22060, 32365, 32728) },
+ { AOM_CDF4(18102, 30690, 32528) },
+ { AOM_CDF4(14196, 28864, 31999) },
+ { AOM_CDF4(12262, 25792, 30865) },
+ { AOM_CDF4(24176, 32109, 32628) },
+ { AOM_CDF4(18280, 29681, 31963) },
+ { AOM_CDF4(10205, 23703, 29664) },
+ { AOM_CDF4(7889, 20025, 27676) },
+ { AOM_CDF4(6060, 16743, 23970) },
+ { AOM_CDF4(8192, 16384, 24576) } } },
+ { { { AOM_CDF4(5141, 7096, 8260) },
+ { AOM_CDF4(27186, 29022, 29789) },
+ { AOM_CDF4(6668, 12568, 15682) },
+ { AOM_CDF4(2172, 6181, 8638) },
+ { AOM_CDF4(1126, 3379, 4531) },
+ { AOM_CDF4(443, 1361, 2254) },
+ { AOM_CDF4(26083, 31153, 32436) },
+ { AOM_CDF4(13486, 24603, 28483) },
+ { AOM_CDF4(6508, 14840, 19910) },
+ { AOM_CDF4(3386, 8800, 13286) },
+ { AOM_CDF4(1530, 4322, 7054) },
+ { AOM_CDF4(29639, 32080, 32548) },
+ { AOM_CDF4(15897, 27552, 30290) },
+ { AOM_CDF4(8588, 20047, 25383) },
+ { AOM_CDF4(4889, 13339, 19269) },
+ { AOM_CDF4(2240, 6871, 10498) },
+ { AOM_CDF4(28165, 32197, 32517) },
+ { AOM_CDF4(20735, 30427, 31568) },
+ { AOM_CDF4(14325, 24671, 27692) },
+ { AOM_CDF4(5119, 12554, 17805) },
+ { AOM_CDF4(1810, 5441, 8261) },
+ { AOM_CDF4(31212, 32724, 32748) },
+ { AOM_CDF4(23352, 31766, 32545) },
+ { AOM_CDF4(14669, 27570, 31059) },
+ { AOM_CDF4(8492, 20894, 27272) },
+ { AOM_CDF4(3644, 10194, 15204) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } },
+ { { AOM_CDF4(2461, 7013, 9371) },
+ { AOM_CDF4(24749, 29600, 30986) },
+ { AOM_CDF4(9466, 19037, 22417) },
+ { AOM_CDF4(3584, 9280, 14400) },
+ { AOM_CDF4(1505, 3929, 5433) },
+ { AOM_CDF4(677, 1500, 2736) },
+ { AOM_CDF4(23987, 30702, 32117) },
+ { AOM_CDF4(13554, 24571, 29263) },
+ { AOM_CDF4(6211, 14556, 21155) },
+ { AOM_CDF4(3135, 10972, 15625) },
+ { AOM_CDF4(2435, 7127, 11427) },
+ { AOM_CDF4(31300, 32532, 32550) },
+ { AOM_CDF4(14757, 30365, 31954) },
+ { AOM_CDF4(4405, 11612, 18553) },
+ { AOM_CDF4(580, 4132, 7322) },
+ { AOM_CDF4(1695, 10169, 14124) },
+ { AOM_CDF4(30008, 32282, 32591) },
+ { AOM_CDF4(19244, 30108, 31748) },
+ { AOM_CDF4(11180, 24158, 29555) },
+ { AOM_CDF4(5650, 14972, 19209) },
+ { AOM_CDF4(2114, 5109, 8456) },
+ { AOM_CDF4(31856, 32716, 32748) },
+ { AOM_CDF4(23012, 31664, 32572) },
+ { AOM_CDF4(13694, 26656, 30636) },
+ { AOM_CDF4(8142, 19508, 26093) },
+ { AOM_CDF4(4253, 10955, 16724) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } } },
+ { { { AOM_CDF4(601, 983, 1311) },
+ { AOM_CDF4(18725, 23406, 28087) },
+ { AOM_CDF4(5461, 8192, 10923) },
+ { AOM_CDF4(3781, 15124, 21425) },
+ { AOM_CDF4(2587, 7761, 12072) },
+ { AOM_CDF4(106, 458, 810) },
+ { AOM_CDF4(22282, 29710, 31894) },
+ { AOM_CDF4(8508, 20926, 25984) },
+ { AOM_CDF4(3726, 12713, 18083) },
+ { AOM_CDF4(1620, 7112, 10893) },
+ { AOM_CDF4(729, 2236, 3495) },
+ { AOM_CDF4(30163, 32474, 32684) },
+ { AOM_CDF4(18304, 30464, 32000) },
+ { AOM_CDF4(11443, 26526, 29647) },
+ { AOM_CDF4(6007, 15292, 21299) },
+ { AOM_CDF4(2234, 6703, 8937) },
+ { AOM_CDF4(30954, 32177, 32571) },
+ { AOM_CDF4(17363, 29562, 31076) },
+ { AOM_CDF4(9686, 22464, 27410) },
+ { AOM_CDF4(8192, 16384, 21390) },
+ { AOM_CDF4(1755, 8046, 11264) },
+ { AOM_CDF4(31168, 32734, 32748) },
+ { AOM_CDF4(22486, 31441, 32471) },
+ { AOM_CDF4(12833, 25627, 29738) },
+ { AOM_CDF4(6980, 17379, 23122) },
+ { AOM_CDF4(3111, 8887, 13479) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } },
+ { { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } } } },
+ { { { { AOM_CDF4(6041, 11854, 15927) },
+ { AOM_CDF4(20326, 30905, 32251) },
+ { AOM_CDF4(14164, 26831, 30725) },
+ { AOM_CDF4(9760, 20647, 26585) },
+ { AOM_CDF4(6416, 14953, 21219) },
+ { AOM_CDF4(2966, 7151, 10891) },
+ { AOM_CDF4(23567, 31374, 32254) },
+ { AOM_CDF4(14978, 27416, 30946) },
+ { AOM_CDF4(9434, 20225, 26254) },
+ { AOM_CDF4(6658, 14558, 20535) },
+ { AOM_CDF4(3916, 8677, 12989) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(18088, 29545, 31587) },
+ { AOM_CDF4(13062, 25843, 30073) },
+ { AOM_CDF4(8940, 16827, 22251) },
+ { AOM_CDF4(7654, 13220, 17973) },
+ { AOM_CDF4(5733, 10316, 14456) },
+ { AOM_CDF4(22879, 31388, 32114) },
+ { AOM_CDF4(15215, 27993, 30955) },
+ { AOM_CDF4(9397, 19445, 24978) },
+ { AOM_CDF4(3442, 9813, 15344) },
+ { AOM_CDF4(1368, 3936, 6532) },
+ { AOM_CDF4(25494, 32033, 32406) },
+ { AOM_CDF4(16772, 27963, 30718) },
+ { AOM_CDF4(9419, 18165, 23260) },
+ { AOM_CDF4(2677, 7501, 11797) },
+ { AOM_CDF4(1516, 4344, 7170) },
+ { AOM_CDF4(26556, 31454, 32101) },
+ { AOM_CDF4(17128, 27035, 30108) },
+ { AOM_CDF4(8324, 15344, 20249) },
+ { AOM_CDF4(1903, 5696, 9469) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } },
+ { { AOM_CDF4(8455, 19003, 24368) },
+ { AOM_CDF4(23563, 32021, 32604) },
+ { AOM_CDF4(16237, 29446, 31935) },
+ { AOM_CDF4(10724, 23999, 29358) },
+ { AOM_CDF4(6725, 17528, 24416) },
+ { AOM_CDF4(3927, 10927, 16825) },
+ { AOM_CDF4(26313, 32288, 32634) },
+ { AOM_CDF4(17430, 30095, 32095) },
+ { AOM_CDF4(11116, 24606, 29679) },
+ { AOM_CDF4(7195, 18384, 25269) },
+ { AOM_CDF4(4726, 12852, 19315) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(22822, 31648, 32483) },
+ { AOM_CDF4(16724, 29633, 31929) },
+ { AOM_CDF4(10261, 23033, 28725) },
+ { AOM_CDF4(7029, 17840, 24528) },
+ { AOM_CDF4(4867, 13886, 21502) },
+ { AOM_CDF4(25298, 31892, 32491) },
+ { AOM_CDF4(17809, 29330, 31512) },
+ { AOM_CDF4(9668, 21329, 26579) },
+ { AOM_CDF4(4774, 12956, 18976) },
+ { AOM_CDF4(2322, 7030, 11540) },
+ { AOM_CDF4(25472, 31920, 32543) },
+ { AOM_CDF4(17957, 29387, 31632) },
+ { AOM_CDF4(9196, 20593, 26400) },
+ { AOM_CDF4(4680, 12705, 19202) },
+ { AOM_CDF4(2917, 8456, 13436) },
+ { AOM_CDF4(26471, 32059, 32574) },
+ { AOM_CDF4(18458, 29783, 31909) },
+ { AOM_CDF4(8400, 19464, 25956) },
+ { AOM_CDF4(3812, 10973, 17206) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } } },
+ { { { AOM_CDF4(6779, 13743, 17678) },
+ { AOM_CDF4(24806, 31797, 32457) },
+ { AOM_CDF4(17616, 29047, 31372) },
+ { AOM_CDF4(11063, 23175, 28003) },
+ { AOM_CDF4(6521, 16110, 22324) },
+ { AOM_CDF4(2764, 7504, 11654) },
+ { AOM_CDF4(25266, 32367, 32637) },
+ { AOM_CDF4(19054, 30553, 32175) },
+ { AOM_CDF4(12139, 25212, 29807) },
+ { AOM_CDF4(7311, 18162, 24704) },
+ { AOM_CDF4(3397, 9164, 14074) },
+ { AOM_CDF4(25988, 32208, 32522) },
+ { AOM_CDF4(16253, 28912, 31526) },
+ { AOM_CDF4(9151, 21387, 27372) },
+ { AOM_CDF4(5688, 14915, 21496) },
+ { AOM_CDF4(2717, 7627, 12004) },
+ { AOM_CDF4(23144, 31855, 32443) },
+ { AOM_CDF4(16070, 28491, 31325) },
+ { AOM_CDF4(8702, 20467, 26517) },
+ { AOM_CDF4(5243, 13956, 20367) },
+ { AOM_CDF4(2621, 7335, 11567) },
+ { AOM_CDF4(26636, 32340, 32630) },
+ { AOM_CDF4(19990, 31050, 32341) },
+ { AOM_CDF4(13243, 26105, 30315) },
+ { AOM_CDF4(8588, 19521, 25918) },
+ { AOM_CDF4(4717, 11585, 17304) },
+ { AOM_CDF4(25844, 32292, 32582) },
+ { AOM_CDF4(19090, 30635, 32097) },
+ { AOM_CDF4(11963, 24546, 28939) },
+ { AOM_CDF4(6218, 16087, 22354) },
+ { AOM_CDF4(2340, 6608, 10426) },
+ { AOM_CDF4(28046, 32576, 32694) },
+ { AOM_CDF4(21178, 31313, 32296) },
+ { AOM_CDF4(13486, 26184, 29870) },
+ { AOM_CDF4(7149, 17871, 23723) },
+ { AOM_CDF4(2833, 7958, 12259) },
+ { AOM_CDF4(27710, 32528, 32686) },
+ { AOM_CDF4(20674, 31076, 32268) },
+ { AOM_CDF4(12413, 24955, 29243) },
+ { AOM_CDF4(6676, 16927, 23097) },
+ { AOM_CDF4(2966, 8333, 12919) },
+ { AOM_CDF4(8192, 16384, 24576) } },
+ { { AOM_CDF4(8639, 19339, 24429) },
+ { AOM_CDF4(24404, 31837, 32525) },
+ { AOM_CDF4(16997, 29425, 31784) },
+ { AOM_CDF4(11253, 24234, 29149) },
+ { AOM_CDF4(6751, 17394, 24028) },
+ { AOM_CDF4(3490, 9830, 15191) },
+ { AOM_CDF4(26283, 32471, 32714) },
+ { AOM_CDF4(19599, 31168, 32442) },
+ { AOM_CDF4(13146, 26954, 30893) },
+ { AOM_CDF4(8214, 20588, 26890) },
+ { AOM_CDF4(4699, 13081, 19300) },
+ { AOM_CDF4(28212, 32458, 32669) },
+ { AOM_CDF4(18594, 30316, 32100) },
+ { AOM_CDF4(11219, 24408, 29234) },
+ { AOM_CDF4(6865, 17656, 24149) },
+ { AOM_CDF4(3678, 10362, 16006) },
+ { AOM_CDF4(25825, 32136, 32616) },
+ { AOM_CDF4(17313, 29853, 32021) },
+ { AOM_CDF4(11197, 24471, 29472) },
+ { AOM_CDF4(6947, 17781, 24405) },
+ { AOM_CDF4(3768, 10660, 16261) },
+ { AOM_CDF4(27352, 32500, 32706) },
+ { AOM_CDF4(20850, 31468, 32469) },
+ { AOM_CDF4(14021, 27707, 31133) },
+ { AOM_CDF4(8964, 21748, 27838) },
+ { AOM_CDF4(5437, 14665, 21187) },
+ { AOM_CDF4(26304, 32492, 32698) },
+ { AOM_CDF4(20409, 31380, 32385) },
+ { AOM_CDF4(13682, 27222, 30632) },
+ { AOM_CDF4(8974, 21236, 26685) },
+ { AOM_CDF4(4234, 11665, 16934) },
+ { AOM_CDF4(26273, 32357, 32711) },
+ { AOM_CDF4(20672, 31242, 32441) },
+ { AOM_CDF4(14172, 27254, 30902) },
+ { AOM_CDF4(9870, 21898, 27275) },
+ { AOM_CDF4(5164, 13506, 19270) },
+ { AOM_CDF4(26725, 32459, 32728) },
+ { AOM_CDF4(20991, 31442, 32527) },
+ { AOM_CDF4(13071, 26434, 30811) },
+ { AOM_CDF4(8184, 20090, 26742) },
+ { AOM_CDF4(4803, 13255, 19895) },
+ { AOM_CDF4(8192, 16384, 24576) } } },
+ { { { AOM_CDF4(7555, 14942, 18501) },
+ { AOM_CDF4(24410, 31178, 32287) },
+ { AOM_CDF4(14394, 26738, 30253) },
+ { AOM_CDF4(8413, 19554, 25195) },
+ { AOM_CDF4(4766, 12924, 18785) },
+ { AOM_CDF4(2029, 5806, 9207) },
+ { AOM_CDF4(26776, 32364, 32663) },
+ { AOM_CDF4(18732, 29967, 31931) },
+ { AOM_CDF4(11005, 23786, 28852) },
+ { AOM_CDF4(6466, 16909, 23510) },
+ { AOM_CDF4(3044, 8638, 13419) },
+ { AOM_CDF4(29208, 32582, 32704) },
+ { AOM_CDF4(20068, 30857, 32208) },
+ { AOM_CDF4(12003, 25085, 29595) },
+ { AOM_CDF4(6947, 17750, 24189) },
+ { AOM_CDF4(3245, 9103, 14007) },
+ { AOM_CDF4(27359, 32465, 32669) },
+ { AOM_CDF4(19421, 30614, 32174) },
+ { AOM_CDF4(11915, 25010, 29579) },
+ { AOM_CDF4(6950, 17676, 24074) },
+ { AOM_CDF4(3007, 8473, 13096) },
+ { AOM_CDF4(29002, 32676, 32735) },
+ { AOM_CDF4(22102, 31849, 32576) },
+ { AOM_CDF4(14408, 28009, 31405) },
+ { AOM_CDF4(9027, 21679, 27931) },
+ { AOM_CDF4(4694, 12678, 18748) },
+ { AOM_CDF4(28216, 32528, 32682) },
+ { AOM_CDF4(20849, 31264, 32318) },
+ { AOM_CDF4(12756, 25815, 29751) },
+ { AOM_CDF4(7565, 18801, 24923) },
+ { AOM_CDF4(3509, 9533, 14477) },
+ { AOM_CDF4(30133, 32687, 32739) },
+ { AOM_CDF4(23063, 31910, 32515) },
+ { AOM_CDF4(14588, 28051, 31132) },
+ { AOM_CDF4(9085, 21649, 27457) },
+ { AOM_CDF4(4261, 11654, 17264) },
+ { AOM_CDF4(29518, 32691, 32748) },
+ { AOM_CDF4(22451, 31959, 32613) },
+ { AOM_CDF4(14864, 28722, 31700) },
+ { AOM_CDF4(9695, 22964, 28716) },
+ { AOM_CDF4(4932, 13358, 19502) },
+ { AOM_CDF4(8192, 16384, 24576) } },
+ { { AOM_CDF4(6465, 16958, 21688) },
+ { AOM_CDF4(25199, 31514, 32360) },
+ { AOM_CDF4(14774, 27149, 30607) },
+ { AOM_CDF4(9257, 21438, 26972) },
+ { AOM_CDF4(5723, 15183, 21882) },
+ { AOM_CDF4(3150, 8879, 13731) },
+ { AOM_CDF4(26989, 32262, 32682) },
+ { AOM_CDF4(17396, 29937, 32085) },
+ { AOM_CDF4(11387, 24901, 29784) },
+ { AOM_CDF4(7289, 18821, 25548) },
+ { AOM_CDF4(3734, 10577, 16086) },
+ { AOM_CDF4(29728, 32501, 32695) },
+ { AOM_CDF4(17431, 29701, 31903) },
+ { AOM_CDF4(9921, 22826, 28300) },
+ { AOM_CDF4(5896, 15434, 22068) },
+ { AOM_CDF4(3430, 9646, 14757) },
+ { AOM_CDF4(28614, 32511, 32705) },
+ { AOM_CDF4(19364, 30638, 32263) },
+ { AOM_CDF4(13129, 26254, 30402) },
+ { AOM_CDF4(8754, 20484, 26440) },
+ { AOM_CDF4(4378, 11607, 17110) },
+ { AOM_CDF4(30292, 32671, 32744) },
+ { AOM_CDF4(21780, 31603, 32501) },
+ { AOM_CDF4(14314, 27829, 31291) },
+ { AOM_CDF4(9611, 22327, 28263) },
+ { AOM_CDF4(4890, 13087, 19065) },
+ { AOM_CDF4(25862, 32567, 32733) },
+ { AOM_CDF4(20794, 32050, 32567) },
+ { AOM_CDF4(17243, 30625, 32254) },
+ { AOM_CDF4(13283, 27628, 31474) },
+ { AOM_CDF4(9669, 22532, 28918) },
+ { AOM_CDF4(27435, 32697, 32748) },
+ { AOM_CDF4(24922, 32390, 32714) },
+ { AOM_CDF4(21449, 31504, 32536) },
+ { AOM_CDF4(16392, 29729, 31832) },
+ { AOM_CDF4(11692, 24884, 29076) },
+ { AOM_CDF4(24193, 32290, 32735) },
+ { AOM_CDF4(18909, 31104, 32563) },
+ { AOM_CDF4(12236, 26841, 31403) },
+ { AOM_CDF4(8171, 21840, 29082) },
+ { AOM_CDF4(7224, 17280, 25275) },
+ { AOM_CDF4(8192, 16384, 24576) } } },
+ { { { AOM_CDF4(3078, 6839, 9890) },
+ { AOM_CDF4(13837, 20450, 24479) },
+ { AOM_CDF4(5914, 14222, 19328) },
+ { AOM_CDF4(3866, 10267, 14762) },
+ { AOM_CDF4(2612, 7208, 11042) },
+ { AOM_CDF4(1067, 2991, 4776) },
+ { AOM_CDF4(25817, 31646, 32529) },
+ { AOM_CDF4(13708, 26338, 30385) },
+ { AOM_CDF4(7328, 18585, 24870) },
+ { AOM_CDF4(4691, 13080, 19276) },
+ { AOM_CDF4(1825, 5253, 8352) },
+ { AOM_CDF4(29386, 32315, 32624) },
+ { AOM_CDF4(17160, 29001, 31360) },
+ { AOM_CDF4(9602, 21862, 27396) },
+ { AOM_CDF4(5915, 15772, 22148) },
+ { AOM_CDF4(2786, 7779, 12047) },
+ { AOM_CDF4(29246, 32450, 32663) },
+ { AOM_CDF4(18696, 29929, 31818) },
+ { AOM_CDF4(10510, 23369, 28560) },
+ { AOM_CDF4(6229, 16499, 23125) },
+ { AOM_CDF4(2608, 7448, 11705) },
+ { AOM_CDF4(30753, 32710, 32748) },
+ { AOM_CDF4(21638, 31487, 32503) },
+ { AOM_CDF4(12937, 26854, 30870) },
+ { AOM_CDF4(8182, 20596, 26970) },
+ { AOM_CDF4(3637, 10269, 15497) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } },
+ { { AOM_CDF4(5244, 12150, 16906) },
+ { AOM_CDF4(20486, 26858, 29701) },
+ { AOM_CDF4(7756, 18317, 23735) },
+ { AOM_CDF4(3452, 9256, 13146) },
+ { AOM_CDF4(2020, 5206, 8229) },
+ { AOM_CDF4(1801, 4993, 7903) },
+ { AOM_CDF4(27051, 31858, 32531) },
+ { AOM_CDF4(15988, 27531, 30619) },
+ { AOM_CDF4(9188, 21484, 26719) },
+ { AOM_CDF4(6273, 17186, 23800) },
+ { AOM_CDF4(3108, 9355, 14764) },
+ { AOM_CDF4(31076, 32520, 32680) },
+ { AOM_CDF4(18119, 30037, 31850) },
+ { AOM_CDF4(10244, 22969, 27472) },
+ { AOM_CDF4(4692, 14077, 19273) },
+ { AOM_CDF4(3694, 11677, 17556) },
+ { AOM_CDF4(30060, 32581, 32720) },
+ { AOM_CDF4(21011, 30775, 32120) },
+ { AOM_CDF4(11931, 24820, 29289) },
+ { AOM_CDF4(7119, 17662, 24356) },
+ { AOM_CDF4(3833, 10706, 16304) },
+ { AOM_CDF4(31954, 32731, 32748) },
+ { AOM_CDF4(23913, 31724, 32489) },
+ { AOM_CDF4(15520, 28060, 31286) },
+ { AOM_CDF4(11517, 23008, 28571) },
+ { AOM_CDF4(6193, 14508, 20629) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } } },
+ { { { AOM_CDF4(1035, 2807, 4156) },
+ { AOM_CDF4(13162, 18138, 20939) },
+ { AOM_CDF4(2696, 6633, 8755) },
+ { AOM_CDF4(1373, 4161, 6853) },
+ { AOM_CDF4(1099, 2746, 4716) },
+ { AOM_CDF4(340, 1021, 1599) },
+ { AOM_CDF4(22826, 30419, 32135) },
+ { AOM_CDF4(10395, 21762, 26942) },
+ { AOM_CDF4(4726, 12407, 17361) },
+ { AOM_CDF4(2447, 7080, 10593) },
+ { AOM_CDF4(1227, 3717, 6011) },
+ { AOM_CDF4(28156, 31424, 31934) },
+ { AOM_CDF4(16915, 27754, 30373) },
+ { AOM_CDF4(9148, 20990, 26431) },
+ { AOM_CDF4(5950, 15515, 21148) },
+ { AOM_CDF4(2492, 7327, 11526) },
+ { AOM_CDF4(30602, 32477, 32670) },
+ { AOM_CDF4(20026, 29955, 31568) },
+ { AOM_CDF4(11220, 23628, 28105) },
+ { AOM_CDF4(6652, 17019, 22973) },
+ { AOM_CDF4(3064, 8536, 13043) },
+ { AOM_CDF4(31769, 32724, 32748) },
+ { AOM_CDF4(22230, 30887, 32373) },
+ { AOM_CDF4(12234, 25079, 29731) },
+ { AOM_CDF4(7326, 18816, 25353) },
+ { AOM_CDF4(3933, 10907, 16616) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } },
+ { { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } } } },
+ { { { { AOM_CDF4(8896, 16227, 20630) },
+ { AOM_CDF4(23629, 31782, 32527) },
+ { AOM_CDF4(15173, 27755, 31321) },
+ { AOM_CDF4(10158, 21233, 27382) },
+ { AOM_CDF4(6420, 14857, 21558) },
+ { AOM_CDF4(3269, 8155, 12646) },
+ { AOM_CDF4(24835, 32009, 32496) },
+ { AOM_CDF4(16509, 28421, 31579) },
+ { AOM_CDF4(10957, 21514, 27418) },
+ { AOM_CDF4(7881, 15930, 22096) },
+ { AOM_CDF4(5388, 10960, 15918) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(20745, 30773, 32093) },
+ { AOM_CDF4(15200, 27221, 30861) },
+ { AOM_CDF4(13032, 20873, 25667) },
+ { AOM_CDF4(12285, 18663, 23494) },
+ { AOM_CDF4(11563, 17481, 21489) },
+ { AOM_CDF4(26260, 31982, 32320) },
+ { AOM_CDF4(15397, 28083, 31100) },
+ { AOM_CDF4(9742, 19217, 24824) },
+ { AOM_CDF4(3261, 9629, 15362) },
+ { AOM_CDF4(1480, 4322, 7499) },
+ { AOM_CDF4(27599, 32256, 32460) },
+ { AOM_CDF4(16857, 27659, 30774) },
+ { AOM_CDF4(9551, 18290, 23748) },
+ { AOM_CDF4(3052, 8933, 14103) },
+ { AOM_CDF4(2021, 5910, 9787) },
+ { AOM_CDF4(29005, 32015, 32392) },
+ { AOM_CDF4(17677, 27694, 30863) },
+ { AOM_CDF4(9204, 17356, 23219) },
+ { AOM_CDF4(2403, 7516, 12814) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } },
+ { { AOM_CDF4(10808, 22056, 26896) },
+ { AOM_CDF4(25739, 32313, 32676) },
+ { AOM_CDF4(17288, 30203, 32221) },
+ { AOM_CDF4(11359, 24878, 29896) },
+ { AOM_CDF4(6949, 17767, 24893) },
+ { AOM_CDF4(4287, 11796, 18071) },
+ { AOM_CDF4(27880, 32521, 32705) },
+ { AOM_CDF4(19038, 31004, 32414) },
+ { AOM_CDF4(12564, 26345, 30768) },
+ { AOM_CDF4(8269, 19947, 26779) },
+ { AOM_CDF4(5674, 14657, 21674) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(25742, 32319, 32671) },
+ { AOM_CDF4(19557, 31164, 32454) },
+ { AOM_CDF4(13381, 26381, 30755) },
+ { AOM_CDF4(10101, 21466, 26722) },
+ { AOM_CDF4(9209, 19650, 26825) },
+ { AOM_CDF4(27107, 31917, 32432) },
+ { AOM_CDF4(18056, 28893, 31203) },
+ { AOM_CDF4(10200, 21434, 26764) },
+ { AOM_CDF4(4660, 12913, 19502) },
+ { AOM_CDF4(2368, 6930, 12504) },
+ { AOM_CDF4(26960, 32158, 32613) },
+ { AOM_CDF4(18628, 30005, 32031) },
+ { AOM_CDF4(10233, 22442, 28232) },
+ { AOM_CDF4(5471, 14630, 21516) },
+ { AOM_CDF4(3235, 10767, 17109) },
+ { AOM_CDF4(27696, 32440, 32692) },
+ { AOM_CDF4(20032, 31167, 32438) },
+ { AOM_CDF4(8700, 21341, 28442) },
+ { AOM_CDF4(5662, 14831, 21795) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } } },
+ { { { AOM_CDF4(9704, 17294, 21132) },
+ { AOM_CDF4(26762, 32278, 32633) },
+ { AOM_CDF4(18382, 29620, 31819) },
+ { AOM_CDF4(10891, 23475, 28723) },
+ { AOM_CDF4(6358, 16583, 23309) },
+ { AOM_CDF4(3248, 9118, 14141) },
+ { AOM_CDF4(27204, 32573, 32699) },
+ { AOM_CDF4(19818, 30824, 32329) },
+ { AOM_CDF4(11772, 25120, 30041) },
+ { AOM_CDF4(6995, 18033, 25039) },
+ { AOM_CDF4(3752, 10442, 16098) },
+ { AOM_CDF4(27222, 32256, 32559) },
+ { AOM_CDF4(15356, 28399, 31475) },
+ { AOM_CDF4(8821, 20635, 27057) },
+ { AOM_CDF4(5511, 14404, 21239) },
+ { AOM_CDF4(2935, 8222, 13051) },
+ { AOM_CDF4(24875, 32120, 32529) },
+ { AOM_CDF4(15233, 28265, 31445) },
+ { AOM_CDF4(8605, 20570, 26932) },
+ { AOM_CDF4(5431, 14413, 21196) },
+ { AOM_CDF4(2994, 8341, 13223) },
+ { AOM_CDF4(28201, 32604, 32700) },
+ { AOM_CDF4(21041, 31446, 32456) },
+ { AOM_CDF4(13221, 26213, 30475) },
+ { AOM_CDF4(8255, 19385, 26037) },
+ { AOM_CDF4(4930, 12585, 18830) },
+ { AOM_CDF4(28768, 32448, 32627) },
+ { AOM_CDF4(19705, 30561, 32021) },
+ { AOM_CDF4(11572, 23589, 28220) },
+ { AOM_CDF4(5532, 15034, 21446) },
+ { AOM_CDF4(2460, 7150, 11456) },
+ { AOM_CDF4(29874, 32619, 32699) },
+ { AOM_CDF4(21621, 31071, 32201) },
+ { AOM_CDF4(12511, 24747, 28992) },
+ { AOM_CDF4(6281, 16395, 22748) },
+ { AOM_CDF4(3246, 9278, 14497) },
+ { AOM_CDF4(29715, 32625, 32712) },
+ { AOM_CDF4(20958, 31011, 32283) },
+ { AOM_CDF4(11233, 23671, 28806) },
+ { AOM_CDF4(6012, 16128, 22868) },
+ { AOM_CDF4(3427, 9851, 15414) },
+ { AOM_CDF4(8192, 16384, 24576) } },
+ { { AOM_CDF4(11016, 22111, 26794) },
+ { AOM_CDF4(25946, 32357, 32677) },
+ { AOM_CDF4(17890, 30452, 32252) },
+ { AOM_CDF4(11678, 25142, 29816) },
+ { AOM_CDF4(6720, 17534, 24584) },
+ { AOM_CDF4(4230, 11665, 17820) },
+ { AOM_CDF4(28400, 32623, 32747) },
+ { AOM_CDF4(21164, 31668, 32575) },
+ { AOM_CDF4(13572, 27388, 31182) },
+ { AOM_CDF4(8234, 20750, 27358) },
+ { AOM_CDF4(5065, 14055, 20897) },
+ { AOM_CDF4(28981, 32547, 32705) },
+ { AOM_CDF4(18681, 30543, 32239) },
+ { AOM_CDF4(10919, 24075, 29286) },
+ { AOM_CDF4(6431, 17199, 24077) },
+ { AOM_CDF4(3819, 10464, 16618) },
+ { AOM_CDF4(26870, 32467, 32693) },
+ { AOM_CDF4(19041, 30831, 32347) },
+ { AOM_CDF4(11794, 25211, 30016) },
+ { AOM_CDF4(6888, 18019, 24970) },
+ { AOM_CDF4(4370, 12363, 18992) },
+ { AOM_CDF4(29578, 32670, 32744) },
+ { AOM_CDF4(23159, 32007, 32613) },
+ { AOM_CDF4(15315, 28669, 31676) },
+ { AOM_CDF4(9298, 22607, 28782) },
+ { AOM_CDF4(6144, 15913, 22968) },
+ { AOM_CDF4(28110, 32499, 32669) },
+ { AOM_CDF4(21574, 30937, 32015) },
+ { AOM_CDF4(12759, 24818, 28727) },
+ { AOM_CDF4(6545, 16761, 23042) },
+ { AOM_CDF4(3649, 10597, 16833) },
+ { AOM_CDF4(28163, 32552, 32728) },
+ { AOM_CDF4(22101, 31469, 32464) },
+ { AOM_CDF4(13160, 25472, 30143) },
+ { AOM_CDF4(7303, 18684, 25468) },
+ { AOM_CDF4(5241, 13975, 20955) },
+ { AOM_CDF4(28400, 32631, 32744) },
+ { AOM_CDF4(22104, 31793, 32603) },
+ { AOM_CDF4(13557, 26571, 30846) },
+ { AOM_CDF4(7749, 19861, 26675) },
+ { AOM_CDF4(4873, 14030, 21234) },
+ { AOM_CDF4(8192, 16384, 24576) } } },
+ { { { AOM_CDF4(9800, 17635, 21073) },
+ { AOM_CDF4(26153, 31885, 32527) },
+ { AOM_CDF4(15038, 27852, 31006) },
+ { AOM_CDF4(8718, 20564, 26486) },
+ { AOM_CDF4(5128, 14076, 20514) },
+ { AOM_CDF4(2636, 7566, 11925) },
+ { AOM_CDF4(27551, 32504, 32701) },
+ { AOM_CDF4(18310, 30054, 32100) },
+ { AOM_CDF4(10211, 23420, 29082) },
+ { AOM_CDF4(6222, 16876, 23916) },
+ { AOM_CDF4(3462, 9954, 15498) },
+ { AOM_CDF4(29991, 32633, 32721) },
+ { AOM_CDF4(19883, 30751, 32201) },
+ { AOM_CDF4(11141, 24184, 29285) },
+ { AOM_CDF4(6420, 16940, 23774) },
+ { AOM_CDF4(3392, 9753, 15118) },
+ { AOM_CDF4(28465, 32616, 32712) },
+ { AOM_CDF4(19850, 30702, 32244) },
+ { AOM_CDF4(10983, 24024, 29223) },
+ { AOM_CDF4(6294, 16770, 23582) },
+ { AOM_CDF4(3244, 9283, 14509) },
+ { AOM_CDF4(30023, 32717, 32748) },
+ { AOM_CDF4(22940, 32032, 32626) },
+ { AOM_CDF4(14282, 27928, 31473) },
+ { AOM_CDF4(8562, 21327, 27914) },
+ { AOM_CDF4(4846, 13393, 19919) },
+ { AOM_CDF4(29981, 32590, 32695) },
+ { AOM_CDF4(20465, 30963, 32166) },
+ { AOM_CDF4(11479, 23579, 28195) },
+ { AOM_CDF4(5916, 15648, 22073) },
+ { AOM_CDF4(3031, 8605, 13398) },
+ { AOM_CDF4(31146, 32691, 32739) },
+ { AOM_CDF4(23106, 31724, 32444) },
+ { AOM_CDF4(13783, 26738, 30439) },
+ { AOM_CDF4(7852, 19468, 25807) },
+ { AOM_CDF4(3860, 11124, 16853) },
+ { AOM_CDF4(31014, 32724, 32748) },
+ { AOM_CDF4(23629, 32109, 32628) },
+ { AOM_CDF4(14747, 28115, 31403) },
+ { AOM_CDF4(8545, 21242, 27478) },
+ { AOM_CDF4(4574, 12781, 19067) },
+ { AOM_CDF4(8192, 16384, 24576) } },
+ { { AOM_CDF4(9185, 19694, 24688) },
+ { AOM_CDF4(26081, 31985, 32621) },
+ { AOM_CDF4(16015, 29000, 31787) },
+ { AOM_CDF4(10542, 23690, 29206) },
+ { AOM_CDF4(6732, 17945, 24677) },
+ { AOM_CDF4(3916, 11039, 16722) },
+ { AOM_CDF4(28224, 32566, 32744) },
+ { AOM_CDF4(19100, 31138, 32485) },
+ { AOM_CDF4(12528, 26620, 30879) },
+ { AOM_CDF4(7741, 20277, 26885) },
+ { AOM_CDF4(4566, 12845, 18990) },
+ { AOM_CDF4(29933, 32593, 32718) },
+ { AOM_CDF4(17670, 30333, 32155) },
+ { AOM_CDF4(10385, 23600, 28909) },
+ { AOM_CDF4(6243, 16236, 22407) },
+ { AOM_CDF4(3976, 10389, 16017) },
+ { AOM_CDF4(28377, 32561, 32738) },
+ { AOM_CDF4(19366, 31175, 32482) },
+ { AOM_CDF4(13327, 27175, 31094) },
+ { AOM_CDF4(8258, 20769, 27143) },
+ { AOM_CDF4(4703, 13198, 19527) },
+ { AOM_CDF4(31086, 32706, 32748) },
+ { AOM_CDF4(22853, 31902, 32583) },
+ { AOM_CDF4(14759, 28186, 31419) },
+ { AOM_CDF4(9284, 22382, 28348) },
+ { AOM_CDF4(5585, 15192, 21868) },
+ { AOM_CDF4(28291, 32652, 32746) },
+ { AOM_CDF4(19849, 32107, 32571) },
+ { AOM_CDF4(14834, 26818, 29214) },
+ { AOM_CDF4(10306, 22594, 28672) },
+ { AOM_CDF4(6615, 17384, 23384) },
+ { AOM_CDF4(28947, 32604, 32745) },
+ { AOM_CDF4(25625, 32289, 32646) },
+ { AOM_CDF4(18758, 28672, 31403) },
+ { AOM_CDF4(10017, 23430, 28523) },
+ { AOM_CDF4(6862, 15269, 22131) },
+ { AOM_CDF4(23933, 32509, 32739) },
+ { AOM_CDF4(19927, 31495, 32631) },
+ { AOM_CDF4(11903, 26023, 30621) },
+ { AOM_CDF4(7026, 20094, 27252) },
+ { AOM_CDF4(5998, 18106, 24437) },
+ { AOM_CDF4(8192, 16384, 24576) } } },
+ { { { AOM_CDF4(4456, 11274, 15533) },
+ { AOM_CDF4(21219, 29079, 31616) },
+ { AOM_CDF4(11173, 23774, 28567) },
+ { AOM_CDF4(7282, 18293, 24263) },
+ { AOM_CDF4(4890, 13286, 19115) },
+ { AOM_CDF4(1890, 5508, 8659) },
+ { AOM_CDF4(26651, 32136, 32647) },
+ { AOM_CDF4(14630, 28254, 31455) },
+ { AOM_CDF4(8716, 21287, 27395) },
+ { AOM_CDF4(5615, 15331, 22008) },
+ { AOM_CDF4(2675, 7700, 12150) },
+ { AOM_CDF4(29954, 32526, 32690) },
+ { AOM_CDF4(16126, 28982, 31633) },
+ { AOM_CDF4(9030, 21361, 27352) },
+ { AOM_CDF4(5411, 14793, 21271) },
+ { AOM_CDF4(2943, 8422, 13163) },
+ { AOM_CDF4(29539, 32601, 32730) },
+ { AOM_CDF4(18125, 30385, 32201) },
+ { AOM_CDF4(10422, 24090, 29468) },
+ { AOM_CDF4(6468, 17487, 24438) },
+ { AOM_CDF4(2970, 8653, 13531) },
+ { AOM_CDF4(30912, 32715, 32748) },
+ { AOM_CDF4(20666, 31373, 32497) },
+ { AOM_CDF4(12509, 26640, 30917) },
+ { AOM_CDF4(8058, 20629, 27290) },
+ { AOM_CDF4(4231, 12006, 18052) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } },
+ { { AOM_CDF4(10202, 20633, 25484) },
+ { AOM_CDF4(27336, 31445, 32352) },
+ { AOM_CDF4(12420, 24384, 28552) },
+ { AOM_CDF4(7648, 18115, 23856) },
+ { AOM_CDF4(5662, 14341, 19902) },
+ { AOM_CDF4(3611, 10328, 15390) },
+ { AOM_CDF4(30945, 32616, 32736) },
+ { AOM_CDF4(18682, 30505, 32253) },
+ { AOM_CDF4(11513, 25336, 30203) },
+ { AOM_CDF4(7449, 19452, 26148) },
+ { AOM_CDF4(4482, 13051, 18886) },
+ { AOM_CDF4(32022, 32690, 32747) },
+ { AOM_CDF4(18578, 30501, 32146) },
+ { AOM_CDF4(11249, 23368, 28631) },
+ { AOM_CDF4(5645, 16958, 22158) },
+ { AOM_CDF4(5009, 11444, 16637) },
+ { AOM_CDF4(31357, 32710, 32748) },
+ { AOM_CDF4(21552, 31494, 32504) },
+ { AOM_CDF4(13891, 27677, 31340) },
+ { AOM_CDF4(9051, 22098, 28172) },
+ { AOM_CDF4(5190, 13377, 19486) },
+ { AOM_CDF4(32364, 32740, 32748) },
+ { AOM_CDF4(24839, 31907, 32551) },
+ { AOM_CDF4(17160, 28779, 31696) },
+ { AOM_CDF4(12452, 24137, 29602) },
+ { AOM_CDF4(6165, 15389, 22477) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } } },
+ { { { AOM_CDF4(2575, 7281, 11077) },
+ { AOM_CDF4(14002, 20866, 25402) },
+ { AOM_CDF4(6343, 15056, 19658) },
+ { AOM_CDF4(4474, 11858, 17041) },
+ { AOM_CDF4(2865, 8299, 12534) },
+ { AOM_CDF4(1344, 3949, 6391) },
+ { AOM_CDF4(24720, 31239, 32459) },
+ { AOM_CDF4(12585, 25356, 29968) },
+ { AOM_CDF4(7181, 18246, 24444) },
+ { AOM_CDF4(5025, 13667, 19885) },
+ { AOM_CDF4(2521, 7304, 11605) },
+ { AOM_CDF4(29908, 32252, 32584) },
+ { AOM_CDF4(17421, 29156, 31575) },
+ { AOM_CDF4(9889, 22188, 27782) },
+ { AOM_CDF4(5878, 15647, 22123) },
+ { AOM_CDF4(2814, 8665, 13323) },
+ { AOM_CDF4(30183, 32568, 32713) },
+ { AOM_CDF4(18528, 30195, 32049) },
+ { AOM_CDF4(10982, 24606, 29657) },
+ { AOM_CDF4(6957, 18165, 25231) },
+ { AOM_CDF4(3508, 10118, 15468) },
+ { AOM_CDF4(31761, 32736, 32748) },
+ { AOM_CDF4(21041, 31328, 32546) },
+ { AOM_CDF4(12568, 26732, 31166) },
+ { AOM_CDF4(8052, 20720, 27733) },
+ { AOM_CDF4(4336, 12192, 18396) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } },
+ { { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } } } },
+ { { { { AOM_CDF4(7062, 16472, 22319) },
+ { AOM_CDF4(24538, 32261, 32674) },
+ { AOM_CDF4(13675, 28041, 31779) },
+ { AOM_CDF4(8590, 20674, 27631) },
+ { AOM_CDF4(5685, 14675, 22013) },
+ { AOM_CDF4(3655, 9898, 15731) },
+ { AOM_CDF4(26493, 32418, 32658) },
+ { AOM_CDF4(16376, 29342, 32090) },
+ { AOM_CDF4(10594, 22649, 28970) },
+ { AOM_CDF4(8176, 17170, 24303) },
+ { AOM_CDF4(5605, 12694, 19139) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(23888, 31902, 32542) },
+ { AOM_CDF4(18612, 29687, 31987) },
+ { AOM_CDF4(16245, 24852, 29249) },
+ { AOM_CDF4(15765, 22608, 27559) },
+ { AOM_CDF4(19895, 24699, 27510) },
+ { AOM_CDF4(28401, 32212, 32457) },
+ { AOM_CDF4(15274, 27825, 30980) },
+ { AOM_CDF4(9364, 18128, 24332) },
+ { AOM_CDF4(2283, 8193, 15082) },
+ { AOM_CDF4(1228, 3972, 7881) },
+ { AOM_CDF4(29455, 32469, 32620) },
+ { AOM_CDF4(17981, 28245, 31388) },
+ { AOM_CDF4(10921, 20098, 26240) },
+ { AOM_CDF4(3743, 11829, 18657) },
+ { AOM_CDF4(2374, 9593, 15715) },
+ { AOM_CDF4(31068, 32466, 32635) },
+ { AOM_CDF4(20321, 29572, 31971) },
+ { AOM_CDF4(10771, 20255, 27119) },
+ { AOM_CDF4(2795, 10410, 17361) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } },
+ { { AOM_CDF4(9320, 22102, 27840) },
+ { AOM_CDF4(27057, 32464, 32724) },
+ { AOM_CDF4(16331, 30268, 32309) },
+ { AOM_CDF4(10319, 23935, 29720) },
+ { AOM_CDF4(6189, 16448, 24106) },
+ { AOM_CDF4(3589, 10884, 18808) },
+ { AOM_CDF4(29026, 32624, 32748) },
+ { AOM_CDF4(19226, 31507, 32587) },
+ { AOM_CDF4(12692, 26921, 31203) },
+ { AOM_CDF4(7049, 19532, 27635) },
+ { AOM_CDF4(7727, 15669, 23252) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(28056, 32625, 32748) },
+ { AOM_CDF4(22383, 32075, 32669) },
+ { AOM_CDF4(15417, 27098, 31749) },
+ { AOM_CDF4(18127, 26493, 27190) },
+ { AOM_CDF4(5461, 16384, 21845) },
+ { AOM_CDF4(27982, 32091, 32584) },
+ { AOM_CDF4(19045, 29868, 31972) },
+ { AOM_CDF4(10397, 22266, 27932) },
+ { AOM_CDF4(5990, 13697, 21500) },
+ { AOM_CDF4(1792, 6912, 15104) },
+ { AOM_CDF4(28198, 32501, 32718) },
+ { AOM_CDF4(21534, 31521, 32569) },
+ { AOM_CDF4(11109, 25217, 30017) },
+ { AOM_CDF4(5671, 15124, 26151) },
+ { AOM_CDF4(4681, 14043, 18725) },
+ { AOM_CDF4(28688, 32580, 32741) },
+ { AOM_CDF4(22576, 32079, 32661) },
+ { AOM_CDF4(10627, 22141, 28340) },
+ { AOM_CDF4(9362, 14043, 28087) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } } },
+ { { { AOM_CDF4(7754, 16948, 22142) },
+ { AOM_CDF4(25670, 32330, 32691) },
+ { AOM_CDF4(15663, 29225, 31994) },
+ { AOM_CDF4(9878, 23288, 29158) },
+ { AOM_CDF4(6419, 17088, 24336) },
+ { AOM_CDF4(3859, 11003, 17039) },
+ { AOM_CDF4(27562, 32595, 32725) },
+ { AOM_CDF4(17575, 30588, 32399) },
+ { AOM_CDF4(10819, 24838, 30309) },
+ { AOM_CDF4(7124, 18686, 25916) },
+ { AOM_CDF4(4479, 12688, 19340) },
+ { AOM_CDF4(28385, 32476, 32673) },
+ { AOM_CDF4(15306, 29005, 31938) },
+ { AOM_CDF4(8937, 21615, 28322) },
+ { AOM_CDF4(5982, 15603, 22786) },
+ { AOM_CDF4(3620, 10267, 16136) },
+ { AOM_CDF4(27280, 32464, 32667) },
+ { AOM_CDF4(15607, 29160, 32004) },
+ { AOM_CDF4(9091, 22135, 28740) },
+ { AOM_CDF4(6232, 16632, 24020) },
+ { AOM_CDF4(4047, 11377, 17672) },
+ { AOM_CDF4(29220, 32630, 32718) },
+ { AOM_CDF4(19650, 31220, 32462) },
+ { AOM_CDF4(13050, 26312, 30827) },
+ { AOM_CDF4(9228, 20870, 27468) },
+ { AOM_CDF4(6146, 15149, 21971) },
+ { AOM_CDF4(30169, 32481, 32623) },
+ { AOM_CDF4(17212, 29311, 31554) },
+ { AOM_CDF4(9911, 21311, 26882) },
+ { AOM_CDF4(4487, 13314, 20372) },
+ { AOM_CDF4(2570, 7772, 12889) },
+ { AOM_CDF4(30924, 32613, 32708) },
+ { AOM_CDF4(19490, 30206, 32107) },
+ { AOM_CDF4(11232, 23998, 29276) },
+ { AOM_CDF4(6769, 17955, 25035) },
+ { AOM_CDF4(4398, 12623, 19214) },
+ { AOM_CDF4(30609, 32627, 32722) },
+ { AOM_CDF4(19370, 30582, 32287) },
+ { AOM_CDF4(10457, 23619, 29409) },
+ { AOM_CDF4(6443, 17637, 24834) },
+ { AOM_CDF4(4645, 13236, 20106) },
+ { AOM_CDF4(8192, 16384, 24576) } },
+ { { AOM_CDF4(8626, 20271, 26216) },
+ { AOM_CDF4(26707, 32406, 32711) },
+ { AOM_CDF4(16999, 30329, 32286) },
+ { AOM_CDF4(11445, 25123, 30286) },
+ { AOM_CDF4(6411, 18828, 25601) },
+ { AOM_CDF4(6801, 12458, 20248) },
+ { AOM_CDF4(29918, 32682, 32748) },
+ { AOM_CDF4(20649, 31739, 32618) },
+ { AOM_CDF4(12879, 27773, 31581) },
+ { AOM_CDF4(7896, 21751, 28244) },
+ { AOM_CDF4(5260, 14870, 23698) },
+ { AOM_CDF4(29252, 32593, 32731) },
+ { AOM_CDF4(17072, 30460, 32294) },
+ { AOM_CDF4(10653, 24143, 29365) },
+ { AOM_CDF4(6536, 17490, 23983) },
+ { AOM_CDF4(4929, 13170, 20085) },
+ { AOM_CDF4(28137, 32518, 32715) },
+ { AOM_CDF4(18171, 30784, 32407) },
+ { AOM_CDF4(11437, 25436, 30459) },
+ { AOM_CDF4(7252, 18534, 26176) },
+ { AOM_CDF4(4126, 13353, 20978) },
+ { AOM_CDF4(31162, 32726, 32748) },
+ { AOM_CDF4(23017, 32222, 32701) },
+ { AOM_CDF4(15629, 29233, 32046) },
+ { AOM_CDF4(9387, 22621, 29480) },
+ { AOM_CDF4(6922, 17616, 25010) },
+ { AOM_CDF4(28838, 32265, 32614) },
+ { AOM_CDF4(19701, 30206, 31920) },
+ { AOM_CDF4(11214, 22410, 27933) },
+ { AOM_CDF4(5320, 14177, 23034) },
+ { AOM_CDF4(5049, 12881, 17827) },
+ { AOM_CDF4(27484, 32471, 32734) },
+ { AOM_CDF4(21076, 31526, 32561) },
+ { AOM_CDF4(12707, 26303, 31211) },
+ { AOM_CDF4(8169, 21722, 28219) },
+ { AOM_CDF4(6045, 19406, 27042) },
+ { AOM_CDF4(27753, 32572, 32745) },
+ { AOM_CDF4(20832, 31878, 32653) },
+ { AOM_CDF4(13250, 27356, 31674) },
+ { AOM_CDF4(7718, 21508, 29858) },
+ { AOM_CDF4(7209, 18350, 25559) },
+ { AOM_CDF4(8192, 16384, 24576) } } },
+ { { { AOM_CDF4(7876, 16901, 21741) },
+ { AOM_CDF4(24001, 31898, 32625) },
+ { AOM_CDF4(14529, 27959, 31451) },
+ { AOM_CDF4(8273, 20818, 27258) },
+ { AOM_CDF4(5278, 14673, 21510) },
+ { AOM_CDF4(2983, 8843, 14039) },
+ { AOM_CDF4(28016, 32574, 32732) },
+ { AOM_CDF4(17471, 30306, 32301) },
+ { AOM_CDF4(10224, 24063, 29728) },
+ { AOM_CDF4(6602, 17954, 25052) },
+ { AOM_CDF4(4002, 11585, 17759) },
+ { AOM_CDF4(30190, 32634, 32739) },
+ { AOM_CDF4(17497, 30282, 32270) },
+ { AOM_CDF4(10229, 23729, 29538) },
+ { AOM_CDF4(6344, 17211, 24440) },
+ { AOM_CDF4(3849, 11189, 17108) },
+ { AOM_CDF4(28570, 32583, 32726) },
+ { AOM_CDF4(17521, 30161, 32238) },
+ { AOM_CDF4(10153, 23565, 29378) },
+ { AOM_CDF4(6455, 17341, 24443) },
+ { AOM_CDF4(3907, 11042, 17024) },
+ { AOM_CDF4(30689, 32715, 32748) },
+ { AOM_CDF4(21546, 31840, 32610) },
+ { AOM_CDF4(13547, 27581, 31459) },
+ { AOM_CDF4(8912, 21757, 28309) },
+ { AOM_CDF4(5548, 15080, 22046) },
+ { AOM_CDF4(30783, 32540, 32685) },
+ { AOM_CDF4(17540, 29528, 31668) },
+ { AOM_CDF4(10160, 21468, 26783) },
+ { AOM_CDF4(4724, 13393, 20054) },
+ { AOM_CDF4(2702, 8174, 13102) },
+ { AOM_CDF4(31648, 32686, 32742) },
+ { AOM_CDF4(20954, 31094, 32337) },
+ { AOM_CDF4(12420, 25698, 30179) },
+ { AOM_CDF4(7304, 19320, 26248) },
+ { AOM_CDF4(4366, 12261, 18864) },
+ { AOM_CDF4(31581, 32723, 32748) },
+ { AOM_CDF4(21373, 31586, 32525) },
+ { AOM_CDF4(12744, 26625, 30885) },
+ { AOM_CDF4(7431, 20322, 26950) },
+ { AOM_CDF4(4692, 13323, 20111) },
+ { AOM_CDF4(8192, 16384, 24576) } },
+ { { AOM_CDF4(7833, 18369, 24095) },
+ { AOM_CDF4(26650, 32273, 32702) },
+ { AOM_CDF4(16371, 29961, 32191) },
+ { AOM_CDF4(11055, 24082, 29629) },
+ { AOM_CDF4(6892, 18644, 25400) },
+ { AOM_CDF4(5006, 13057, 19240) },
+ { AOM_CDF4(29834, 32666, 32748) },
+ { AOM_CDF4(19577, 31335, 32570) },
+ { AOM_CDF4(12253, 26509, 31122) },
+ { AOM_CDF4(7991, 20772, 27711) },
+ { AOM_CDF4(5677, 15910, 23059) },
+ { AOM_CDF4(30109, 32532, 32720) },
+ { AOM_CDF4(16747, 30166, 32252) },
+ { AOM_CDF4(10134, 23542, 29184) },
+ { AOM_CDF4(5791, 16176, 23556) },
+ { AOM_CDF4(4362, 10414, 17284) },
+ { AOM_CDF4(29492, 32626, 32748) },
+ { AOM_CDF4(19894, 31402, 32525) },
+ { AOM_CDF4(12942, 27071, 30869) },
+ { AOM_CDF4(8346, 21216, 27405) },
+ { AOM_CDF4(6572, 17087, 23859) },
+ { AOM_CDF4(32035, 32735, 32748) },
+ { AOM_CDF4(22957, 31838, 32618) },
+ { AOM_CDF4(14724, 28572, 31772) },
+ { AOM_CDF4(10364, 23999, 29553) },
+ { AOM_CDF4(7004, 18433, 25655) },
+ { AOM_CDF4(27528, 32277, 32681) },
+ { AOM_CDF4(16959, 31171, 32096) },
+ { AOM_CDF4(10486, 23593, 27962) },
+ { AOM_CDF4(8192, 16384, 23211) },
+ { AOM_CDF4(8937, 17873, 20852) },
+ { AOM_CDF4(27715, 32002, 32615) },
+ { AOM_CDF4(15073, 29491, 31676) },
+ { AOM_CDF4(11264, 24576, 28672) },
+ { AOM_CDF4(2341, 18725, 23406) },
+ { AOM_CDF4(7282, 18204, 25486) },
+ { AOM_CDF4(28547, 32213, 32657) },
+ { AOM_CDF4(20788, 29773, 32239) },
+ { AOM_CDF4(6780, 21469, 30508) },
+ { AOM_CDF4(5958, 14895, 23831) },
+ { AOM_CDF4(16384, 21845, 27307) },
+ { AOM_CDF4(8192, 16384, 24576) } } },
+ { { { AOM_CDF4(5992, 14304, 19765) },
+ { AOM_CDF4(22612, 31238, 32456) },
+ { AOM_CDF4(13456, 27162, 31087) },
+ { AOM_CDF4(8001, 20062, 26504) },
+ { AOM_CDF4(5168, 14105, 20764) },
+ { AOM_CDF4(2632, 7771, 12385) },
+ { AOM_CDF4(27034, 32344, 32709) },
+ { AOM_CDF4(15850, 29415, 31997) },
+ { AOM_CDF4(9494, 22776, 28841) },
+ { AOM_CDF4(6151, 16830, 23969) },
+ { AOM_CDF4(3461, 10039, 15722) },
+ { AOM_CDF4(30134, 32569, 32731) },
+ { AOM_CDF4(15638, 29422, 31945) },
+ { AOM_CDF4(9150, 21865, 28218) },
+ { AOM_CDF4(5647, 15719, 22676) },
+ { AOM_CDF4(3402, 9772, 15477) },
+ { AOM_CDF4(28530, 32586, 32735) },
+ { AOM_CDF4(17139, 30298, 32292) },
+ { AOM_CDF4(10200, 24039, 29685) },
+ { AOM_CDF4(6419, 17674, 24786) },
+ { AOM_CDF4(3544, 10225, 15824) },
+ { AOM_CDF4(31333, 32726, 32748) },
+ { AOM_CDF4(20618, 31487, 32544) },
+ { AOM_CDF4(12901, 27217, 31232) },
+ { AOM_CDF4(8624, 21734, 28171) },
+ { AOM_CDF4(5104, 14191, 20748) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } },
+ { { AOM_CDF4(11206, 21090, 26561) },
+ { AOM_CDF4(28759, 32279, 32671) },
+ { AOM_CDF4(14171, 27952, 31569) },
+ { AOM_CDF4(9743, 22907, 29141) },
+ { AOM_CDF4(6871, 17886, 24868) },
+ { AOM_CDF4(4960, 13152, 19315) },
+ { AOM_CDF4(31077, 32661, 32748) },
+ { AOM_CDF4(19400, 31195, 32515) },
+ { AOM_CDF4(12752, 26858, 31040) },
+ { AOM_CDF4(8370, 22098, 28591) },
+ { AOM_CDF4(5457, 15373, 22298) },
+ { AOM_CDF4(31697, 32706, 32748) },
+ { AOM_CDF4(17860, 30657, 32333) },
+ { AOM_CDF4(12510, 24812, 29261) },
+ { AOM_CDF4(6180, 19124, 24722) },
+ { AOM_CDF4(5041, 13548, 17959) },
+ { AOM_CDF4(31552, 32716, 32748) },
+ { AOM_CDF4(21908, 31769, 32623) },
+ { AOM_CDF4(14470, 28201, 31565) },
+ { AOM_CDF4(9493, 22982, 28608) },
+ { AOM_CDF4(6858, 17240, 24137) },
+ { AOM_CDF4(32543, 32752, 32756) },
+ { AOM_CDF4(24286, 32097, 32666) },
+ { AOM_CDF4(15958, 29217, 32024) },
+ { AOM_CDF4(10207, 24234, 29958) },
+ { AOM_CDF4(6929, 18305, 25652) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } } },
+ { { { AOM_CDF4(4137, 10847, 15682) },
+ { AOM_CDF4(17824, 27001, 30058) },
+ { AOM_CDF4(10204, 22796, 28291) },
+ { AOM_CDF4(6076, 15935, 22125) },
+ { AOM_CDF4(3852, 10937, 16816) },
+ { AOM_CDF4(2252, 6324, 10131) },
+ { AOM_CDF4(25840, 32016, 32662) },
+ { AOM_CDF4(15109, 28268, 31531) },
+ { AOM_CDF4(9385, 22231, 28340) },
+ { AOM_CDF4(6082, 16672, 23479) },
+ { AOM_CDF4(3318, 9427, 14681) },
+ { AOM_CDF4(30594, 32574, 32718) },
+ { AOM_CDF4(16836, 29552, 31859) },
+ { AOM_CDF4(9556, 22542, 28356) },
+ { AOM_CDF4(6305, 16725, 23540) },
+ { AOM_CDF4(3376, 9895, 15184) },
+ { AOM_CDF4(29383, 32617, 32745) },
+ { AOM_CDF4(18891, 30809, 32401) },
+ { AOM_CDF4(11688, 25942, 30687) },
+ { AOM_CDF4(7468, 19469, 26651) },
+ { AOM_CDF4(3909, 11358, 17012) },
+ { AOM_CDF4(31564, 32736, 32748) },
+ { AOM_CDF4(20906, 31611, 32600) },
+ { AOM_CDF4(13191, 27621, 31537) },
+ { AOM_CDF4(8768, 22029, 28676) },
+ { AOM_CDF4(5079, 14109, 20906) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } },
+ { { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } } } } };
static const aom_cdf_prob av1_default_coeff_base_eob_multi_cdfs
[TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES][SIG_COEF_CONTEXTS_EOB][CDF_SIZE(
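The coefficient CDF tables that end above store, for each context, a cumulative distribution over four base-level symbols in libaom's 15-bit probability space (CDF_PROB_TOP = 32768); the recurring { AOM_CDF4(8192, 16384, 24576) } rows are flat, uniform entries that appear to act as placeholders for contexts the coder never selects. Below is a minimal sketch of how one such triple maps to per-symbol probabilities, assuming only that the macro arguments are cumulative counts out of 32768 and ignoring the inverted form the AOM_CDF4 macro stores internally; cdf4_to_probs is a hypothetical helper, not a libaom function.

#include <stdio.h>

#define CDF_PROB_TOP 32768 /* 15-bit probability space used by the tables */

/* Hypothetical helper: expand one AOM_CDF4(c0, c1, c2) argument triple into
 * the four per-symbol probabilities it encodes.  p[k] = cdf[k] - cdf[k-1],
 * with an implicit cdf[3] == CDF_PROB_TOP. */
static void cdf4_to_probs(int c0, int c1, int c2, double p[4]) {
  const int cdf[4] = { c0, c1, c2, CDF_PROB_TOP };
  int prev = 0;
  for (int k = 0; k < 4; ++k) {
    p[k] = (double)(cdf[k] - prev) / CDF_PROB_TOP;
    prev = cdf[k];
  }
}

int main(void) {
  double p[4];
  cdf4_to_probs(8192, 16384, 24576, p); /* flat placeholder row */
  printf("%.3f %.3f %.3f %.3f\n", p[0], p[1], p[2], p[3]); /* 0.250 each */
  cdf4_to_probs(7756, 18317, 23735, p); /* a trained row from the table */
  printf("%.3f %.3f %.3f %.3f\n", p[0], p[1], p[2], p[3]);
  return 0;
}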
diff --git a/media/libaom/src/av1/common/txb_common.c b/media/libaom/src/av1/common/txb_common.c
index c96d37cca..4eef319cd 100644
--- a/media/libaom/src/av1/common/txb_common.c
+++ b/media/libaom/src/av1/common/txb_common.c
@@ -9,7 +9,7 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include "aom/aom_integer.h"
-#include "av1/common/onyxc_int.h"
+#include "av1/common/av1_common_int.h"
#include "av1/common/txb_common.h"
const int8_t av1_coeff_band_4x4[16] = { 0, 1, 2, 3, 4, 5, 6, 7,
@@ -453,23 +453,6 @@ const int8_t *av1_nz_map_ctx_offset[19] = {
av1_nz_map_ctx_offset_64x32, // TX_64x16
};
-void av1_init_lv_map(AV1_COMMON *cm) {
- LV_MAP_CTX_TABLE *coeff_ctx_table = &cm->coeff_ctx_table;
- for (int row = 0; row < 2; ++row) {
- for (int col = 0; col < 2; ++col) {
- for (int sig_mag = 0; sig_mag < 3; ++sig_mag) {
- for (int count = 0; count < BASE_CONTEXT_POSITION_NUM + 1; ++count) {
- if (row == 0 && col == 0 && count > 5) continue;
- if ((row == 0 || col == 0) && count > 8) continue;
-
- coeff_ctx_table->base_ctx_table[row][col][sig_mag][count] =
- get_base_ctx_from_count_mag(row, col, count, sig_mag);
- }
- }
- }
- }
-}
-
-const int16_t k_eob_group_start[12] = { 0, 1, 2, 3, 5, 9,
- 17, 33, 65, 129, 257, 513 };
-const int16_t k_eob_offset_bits[12] = { 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 };
+const int16_t av1_eob_group_start[12] = { 0, 1, 2, 3, 5, 9,
+ 17, 33, 65, 129, 257, 513 };
+const int16_t av1_eob_offset_bits[12] = { 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 };
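The renamed av1_eob_group_start / av1_eob_offset_bits pair describes how an end-of-block position is signalled: a group whose start positions grow roughly exponentially (0, 1, 2, 3, 5, 9, 17, ...), followed by av1_eob_offset_bits[group] literal bits for the offset inside that group. The sketch below illustrates that split using only the two tables; split_eob is a hypothetical helper, not a libaom function.

#include <stdint.h>
#include <stdio.h>

static const int16_t eob_group_start[12] = { 0,  1,  2,  3,   5,   9,
                                             17, 33, 65, 129, 257, 513 };
static const int16_t eob_offset_bits[12] = { 0, 0, 0, 1, 2, 3,
                                             4, 5, 6, 7, 8, 9 };

/* Hypothetical helper: split an end-of-block position into the coded group
 * index and the literal-bit offset inside that group. */
static void split_eob(int eob, int *group, int *offset, int *nbits) {
  int g = 0;
  while (g + 1 < 12 && eob >= eob_group_start[g + 1]) ++g;
  *group = g;
  *offset = eob - eob_group_start[g];
  *nbits = eob_offset_bits[g];
}

int main(void) {
  int g, off, bits;
  split_eob(100, &g, &off, &bits);
  /* 100 falls in the group starting at 65: group 8, offset 35, 6 extra bits
   * (enough for the 64 positions 65..128 covered by that group). */
  printf("group %d, offset %d, %d extra bits\n", g, off, bits);
  return 0;
}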
diff --git a/media/libaom/src/av1/common/txb_common.h b/media/libaom/src/av1/common/txb_common.h
index 1dda51f8b..5a62fa89b 100644
--- a/media/libaom/src/av1/common/txb_common.h
+++ b/media/libaom/src/av1/common/txb_common.h
@@ -12,8 +12,10 @@
#ifndef AOM_AV1_COMMON_TXB_COMMON_H_
#define AOM_AV1_COMMON_TXB_COMMON_H_
-extern const int16_t k_eob_group_start[12];
-extern const int16_t k_eob_offset_bits[12];
+#include "av1/common/av1_common_int.h"
+
+extern const int16_t av1_eob_group_start[12];
+extern const int16_t av1_eob_offset_bits[12];
extern const int8_t av1_coeff_band_4x4[16];
@@ -157,6 +159,19 @@ static INLINE int get_br_ctx_2d(const uint8_t *const levels,
return mag + 14;
}
+static AOM_FORCE_INLINE int get_br_ctx_eob(const int c, // raster order
+ const int bwl,
+ const TX_CLASS tx_class) {
+ const int row = c >> bwl;
+ const int col = c - (row << bwl);
+ if (c == 0) return 0;
+ if ((tx_class == TX_CLASS_2D && row < 2 && col < 2) ||
+ (tx_class == TX_CLASS_HORIZ && col == 0) ||
+ (tx_class == TX_CLASS_VERT && row == 0))
+ return 7;
+ return 14;
+}
+
static AOM_FORCE_INLINE int get_br_ctx(const uint8_t *const levels,
const int c, // raster order
const int bwl, const TX_CLASS tx_class) {
@@ -270,12 +285,10 @@ static AOM_FORCE_INLINE int get_nz_map_ctx_from_stats(
const int row = coeff_idx >> bwl;
const int col = coeff_idx - (row << bwl);
return ctx + nz_map_ctx_offset_1d[col];
- break;
}
case TX_CLASS_VERT: {
const int row = coeff_idx >> bwl;
return ctx + nz_map_ctx_offset_1d[row];
- break;
}
default: break;
}
@@ -373,7 +386,9 @@ static INLINE void get_txb_ctx(const BLOCK_SIZE plane_bsize,
if (plane_bsize == txsize_to_bsize[tx_size]) {
txb_ctx->txb_skip_ctx = 0;
} else {
- // This is the algorithm to generate table skip_contexts[min][max].
+ // This is the algorithm to generate table skip_contexts[top][left].
+ // const int max = AOMMIN(top | left, 4);
+ // const int min = AOMMIN(AOMMIN(top, left), 4);
// if (!max)
// txb_skip_ctx = 1;
// else if (!min)
@@ -385,10 +400,15 @@ static INLINE void get_txb_ctx(const BLOCK_SIZE plane_bsize,
// else
// txb_skip_ctx = 6;
static const uint8_t skip_contexts[5][5] = { { 1, 2, 2, 2, 3 },
- { 1, 4, 4, 4, 5 },
- { 1, 4, 4, 4, 5 },
- { 1, 4, 4, 4, 5 },
- { 1, 4, 4, 4, 6 } };
+ { 2, 4, 4, 4, 5 },
+ { 2, 4, 4, 4, 5 },
+ { 2, 4, 4, 4, 5 },
+ { 3, 5, 5, 5, 6 } };
+ // For top and left, we only care about which of the following three
+ // categories they belong to: { 0 }, { 1, 2, 3 }, or { 4, 5, ... }. The
+ // spec calculates top and left with the Max() function. We can calculate
+ // an approximate max with bitwise OR because the real max and the
+ // approximate max belong to the same category.
int top = 0;
int left = 0;
@@ -397,16 +417,16 @@ static INLINE void get_txb_ctx(const BLOCK_SIZE plane_bsize,
top |= a[k];
} while (++k < txb_w_unit);
top &= COEFF_CONTEXT_MASK;
+ top = AOMMIN(top, 4);
k = 0;
do {
left |= l[k];
} while (++k < txb_h_unit);
left &= COEFF_CONTEXT_MASK;
- const int max = AOMMIN(top | left, 4);
- const int min = AOMMIN(AOMMIN(top, left), 4);
+ left = AOMMIN(left, 4);
- txb_ctx->txb_skip_ctx = skip_contexts[min][max];
+ txb_ctx->txb_skip_ctx = skip_contexts[top][left];
}
} else {
const int ctx_base = get_entropy_context(tx_size, a, l);
@@ -419,6 +439,4 @@ static INLINE void get_txb_ctx(const BLOCK_SIZE plane_bsize,
#undef MAX_TX_SIZE_UNIT
}
-void av1_init_lv_map(AV1_COMMON *cm);
-
#endif // AOM_AV1_COMMON_TXB_COMMON_H_
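The get_txb_ctx() hunk above replaces the (min, max) indexing of skip_contexts with direct (top, left) indexing and justifies OR-ing the neighbouring entropy contexts as a stand-in for Max(): only the categories { 0 }, { 1, 2, 3 } and { 4, ... } matter, and a bitwise OR never changes the category. A small self-contained check of that claim for pairs of small context values; category() simply restates the comment's three buckets.

#include <assert.h>
#include <stdio.h>

/* The three buckets named in the comment: { 0 }, { 1, 2, 3 }, { 4, ... }. */
static int category(int v) { return v == 0 ? 0 : (v <= 3 ? 1 : 2); }

static int max2(int a, int b) { return a > b ? a : b; }

int main(void) {
  /* For every pair of small context values, the OR and the true maximum land
   * in the same bucket, so skip_contexts[top][left] sees the same category
   * either way. */
  for (int a = 0; a < 8; ++a)
    for (int b = 0; b < 8; ++b)
      assert(category(a | b) == category(max2(a, b)));
  printf("bitwise OR matches max in every category\n");
  return 0;
}

The check passes because the OR of non-negative integers is always at least their maximum, and when the maximum is at most 3 both operands fit in two bits, so the OR also stays at most 3.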
diff --git a/media/libaom/src/av1/common/warped_motion.c b/media/libaom/src/av1/common/warped_motion.c
index 4144c4389..4e9fab9bd 100644
--- a/media/libaom/src/av1/common/warped_motion.c
+++ b/media/libaom/src/av1/common/warped_motion.c
@@ -20,85 +20,13 @@
#include "av1/common/warped_motion.h"
#include "av1/common/scale.h"
-#define WARP_ERROR_BLOCK 32
-
-/* clang-format off */
-static const int error_measure_lut[512] = {
- // pow 0.7
- 16384, 16339, 16294, 16249, 16204, 16158, 16113, 16068,
- 16022, 15977, 15932, 15886, 15840, 15795, 15749, 15703,
- 15657, 15612, 15566, 15520, 15474, 15427, 15381, 15335,
- 15289, 15242, 15196, 15149, 15103, 15056, 15010, 14963,
- 14916, 14869, 14822, 14775, 14728, 14681, 14634, 14587,
- 14539, 14492, 14445, 14397, 14350, 14302, 14254, 14206,
- 14159, 14111, 14063, 14015, 13967, 13918, 13870, 13822,
- 13773, 13725, 13676, 13628, 13579, 13530, 13481, 13432,
- 13383, 13334, 13285, 13236, 13187, 13137, 13088, 13038,
- 12988, 12939, 12889, 12839, 12789, 12739, 12689, 12639,
- 12588, 12538, 12487, 12437, 12386, 12335, 12285, 12234,
- 12183, 12132, 12080, 12029, 11978, 11926, 11875, 11823,
- 11771, 11719, 11667, 11615, 11563, 11511, 11458, 11406,
- 11353, 11301, 11248, 11195, 11142, 11089, 11036, 10982,
- 10929, 10875, 10822, 10768, 10714, 10660, 10606, 10552,
- 10497, 10443, 10388, 10333, 10279, 10224, 10168, 10113,
- 10058, 10002, 9947, 9891, 9835, 9779, 9723, 9666,
- 9610, 9553, 9497, 9440, 9383, 9326, 9268, 9211,
- 9153, 9095, 9037, 8979, 8921, 8862, 8804, 8745,
- 8686, 8627, 8568, 8508, 8449, 8389, 8329, 8269,
- 8208, 8148, 8087, 8026, 7965, 7903, 7842, 7780,
- 7718, 7656, 7593, 7531, 7468, 7405, 7341, 7278,
- 7214, 7150, 7086, 7021, 6956, 6891, 6826, 6760,
- 6695, 6628, 6562, 6495, 6428, 6361, 6293, 6225,
- 6157, 6089, 6020, 5950, 5881, 5811, 5741, 5670,
- 5599, 5527, 5456, 5383, 5311, 5237, 5164, 5090,
- 5015, 4941, 4865, 4789, 4713, 4636, 4558, 4480,
- 4401, 4322, 4242, 4162, 4080, 3998, 3916, 3832,
- 3748, 3663, 3577, 3490, 3402, 3314, 3224, 3133,
- 3041, 2948, 2854, 2758, 2661, 2562, 2461, 2359,
- 2255, 2148, 2040, 1929, 1815, 1698, 1577, 1452,
- 1323, 1187, 1045, 894, 731, 550, 339, 0,
- 339, 550, 731, 894, 1045, 1187, 1323, 1452,
- 1577, 1698, 1815, 1929, 2040, 2148, 2255, 2359,
- 2461, 2562, 2661, 2758, 2854, 2948, 3041, 3133,
- 3224, 3314, 3402, 3490, 3577, 3663, 3748, 3832,
- 3916, 3998, 4080, 4162, 4242, 4322, 4401, 4480,
- 4558, 4636, 4713, 4789, 4865, 4941, 5015, 5090,
- 5164, 5237, 5311, 5383, 5456, 5527, 5599, 5670,
- 5741, 5811, 5881, 5950, 6020, 6089, 6157, 6225,
- 6293, 6361, 6428, 6495, 6562, 6628, 6695, 6760,
- 6826, 6891, 6956, 7021, 7086, 7150, 7214, 7278,
- 7341, 7405, 7468, 7531, 7593, 7656, 7718, 7780,
- 7842, 7903, 7965, 8026, 8087, 8148, 8208, 8269,
- 8329, 8389, 8449, 8508, 8568, 8627, 8686, 8745,
- 8804, 8862, 8921, 8979, 9037, 9095, 9153, 9211,
- 9268, 9326, 9383, 9440, 9497, 9553, 9610, 9666,
- 9723, 9779, 9835, 9891, 9947, 10002, 10058, 10113,
- 10168, 10224, 10279, 10333, 10388, 10443, 10497, 10552,
- 10606, 10660, 10714, 10768, 10822, 10875, 10929, 10982,
- 11036, 11089, 11142, 11195, 11248, 11301, 11353, 11406,
- 11458, 11511, 11563, 11615, 11667, 11719, 11771, 11823,
- 11875, 11926, 11978, 12029, 12080, 12132, 12183, 12234,
- 12285, 12335, 12386, 12437, 12487, 12538, 12588, 12639,
- 12689, 12739, 12789, 12839, 12889, 12939, 12988, 13038,
- 13088, 13137, 13187, 13236, 13285, 13334, 13383, 13432,
- 13481, 13530, 13579, 13628, 13676, 13725, 13773, 13822,
- 13870, 13918, 13967, 14015, 14063, 14111, 14159, 14206,
- 14254, 14302, 14350, 14397, 14445, 14492, 14539, 14587,
- 14634, 14681, 14728, 14775, 14822, 14869, 14916, 14963,
- 15010, 15056, 15103, 15149, 15196, 15242, 15289, 15335,
- 15381, 15427, 15474, 15520, 15566, 15612, 15657, 15703,
- 15749, 15795, 15840, 15886, 15932, 15977, 16022, 16068,
- 16113, 16158, 16204, 16249, 16294, 16339, 16384, 16384,
-};
-/* clang-format on */
-
// For warping, we really use a 6-tap filter, but we do blocks of 8 pixels
// at a time. The zoom/rotation/shear in the model are applied to the
// "fractional" position of each pixel, which therefore varies within
// [-1, 2) * WARPEDPIXEL_PREC_SHIFTS.
// We need an extra 2 taps to fit this in, for a total of 8 taps.
/* clang-format off */
-const int16_t warped_filter[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8] = {
+const int16_t av1_warped_filter[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8] = {
#if WARPEDPIXEL_PREC_BITS == 6
// [-1, 0)
{ 0, 0, 127, 1, 0, 0, 0, 0 }, { 0, - 1, 127, 2, 0, 0, 0, 0 },
@@ -345,7 +273,7 @@ static int is_affine_shear_allowed(int16_t alpha, int16_t beta, int16_t gamma,
}
// Returns 1 on success or 0 on an invalid affine set
-int get_shear_params(WarpedMotionParams *wm) {
+int av1_get_shear_params(WarpedMotionParams *wm) {
const int32_t *mat = wm->wmmat;
if (!is_affine_valid(wm)) return 0;
wm->alpha =
@@ -376,6 +304,7 @@ int get_shear_params(WarpedMotionParams *wm) {
return 1;
}
+#if CONFIG_AV1_HIGHBITDEPTH
static INLINE int highbd_error_measure(int err, int bd) {
const int b = bd - 8;
const int bmask = (1 << b) - 1;
@@ -447,7 +376,7 @@ void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref,
const int offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) +
WARPEDPIXEL_PREC_SHIFTS;
assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
- const int16_t *coeffs = warped_filter[offs];
+ const int16_t *coeffs = av1_warped_filter[offs];
int32_t sum = 1 << offset_bits_horiz;
for (int m = 0; m < 8; ++m) {
@@ -468,7 +397,7 @@ void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref,
const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) +
WARPEDPIXEL_PREC_SHIFTS;
assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
- const int16_t *coeffs = warped_filter[offs];
+ const int16_t *coeffs = av1_warped_filter[offs];
int32_t sum = 1 << offset_bits_vert;
for (int m = 0; m < 8; ++m) {
@@ -485,7 +414,7 @@ void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref,
uint16_t *dst16 =
&pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
int32_t tmp32 = *p;
- if (conv_params->use_jnt_comp_avg) {
+ if (conv_params->use_dist_wtd_comp_avg) {
tmp32 = tmp32 * conv_params->fwd_offset +
sum * conv_params->bck_offset;
tmp32 = tmp32 >> DIST_PRECISION_BITS;
@@ -514,12 +443,11 @@ void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref,
}
}
-static void highbd_warp_plane(WarpedMotionParams *wm, const uint8_t *const ref8,
- int width, int height, int stride,
- const uint8_t *const pred8, int p_col, int p_row,
- int p_width, int p_height, int p_stride,
- int subsampling_x, int subsampling_y, int bd,
- ConvolveParams *conv_params) {
+void highbd_warp_plane(WarpedMotionParams *wm, const uint16_t *const ref,
+ int width, int height, int stride, uint16_t *const pred,
+ int p_col, int p_row, int p_width, int p_height,
+ int p_stride, int subsampling_x, int subsampling_y,
+ int bd, ConvolveParams *conv_params) {
assert(wm->wmtype <= AFFINE);
if (wm->wmtype == ROTZOOM) {
wm->wmmat[5] = wm->wmmat[2];
@@ -531,17 +459,15 @@ static void highbd_warp_plane(WarpedMotionParams *wm, const uint8_t *const ref8,
const int16_t gamma = wm->gamma;
const int16_t delta = wm->delta;
- const uint16_t *const ref = CONVERT_TO_SHORTPTR(ref8);
- uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
av1_highbd_warp_affine(mat, ref, width, height, stride, pred, p_col, p_row,
p_width, p_height, p_stride, subsampling_x,
subsampling_y, bd, conv_params, alpha, beta, gamma,
delta);
}
-static int64_t highbd_frame_error(const uint16_t *const ref, int stride,
- const uint16_t *const dst, int p_width,
- int p_height, int p_stride, int bd) {
+int64_t av1_calc_highbd_frame_error(const uint16_t *const ref, int stride,
+ const uint16_t *const dst, int p_width,
+ int p_height, int p_stride, int bd) {
int64_t sum_error = 0;
for (int i = 0; i < p_height; ++i) {
for (int j = 0; j < p_width; ++j) {
@@ -552,41 +478,33 @@ static int64_t highbd_frame_error(const uint16_t *const ref, int stride,
return sum_error;
}
-static int64_t highbd_warp_error(
- WarpedMotionParams *wm, const uint8_t *const ref8, int width, int height,
- int stride, const uint8_t *const dst8, int p_col, int p_row, int p_width,
- int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd,
- int64_t best_error) {
- int64_t gm_sumerr = 0;
+static int64_t highbd_segmented_frame_error(
+ const uint16_t *const ref, int stride, const uint16_t *const dst,
+ int p_width, int p_height, int p_stride, int bd, uint8_t *segment_map,
+ int segment_map_stride) {
+ int patch_w, patch_h;
const int error_bsize_w = AOMMIN(p_width, WARP_ERROR_BLOCK);
const int error_bsize_h = AOMMIN(p_height, WARP_ERROR_BLOCK);
- uint16_t tmp[WARP_ERROR_BLOCK * WARP_ERROR_BLOCK];
-
- ConvolveParams conv_params = get_conv_params(0, 0, bd);
- conv_params.use_jnt_comp_avg = 0;
- for (int i = p_row; i < p_row + p_height; i += WARP_ERROR_BLOCK) {
- for (int j = p_col; j < p_col + p_width; j += WARP_ERROR_BLOCK) {
- // avoid warping extra 8x8 blocks in the padded region of the frame
- // when p_width and p_height are not multiples of WARP_ERROR_BLOCK
- const int warp_w = AOMMIN(error_bsize_w, p_col + p_width - j);
- const int warp_h = AOMMIN(error_bsize_h, p_row + p_height - i);
- highbd_warp_plane(wm, ref8, width, height, stride,
- CONVERT_TO_BYTEPTR(tmp), j, i, warp_w, warp_h,
- WARP_ERROR_BLOCK, subsampling_x, subsampling_y, bd,
- &conv_params);
-
- gm_sumerr += highbd_frame_error(
- tmp, WARP_ERROR_BLOCK, CONVERT_TO_SHORTPTR(dst8) + j + i * p_stride,
- warp_w, warp_h, p_stride, bd);
- if (gm_sumerr > best_error) return gm_sumerr;
+ int64_t sum_error = 0;
+ for (int i = 0; i < p_height; i += WARP_ERROR_BLOCK) {
+ for (int j = 0; j < p_width; j += WARP_ERROR_BLOCK) {
+ int seg_x = j >> WARP_ERROR_BLOCK_LOG;
+ int seg_y = i >> WARP_ERROR_BLOCK_LOG;
+ // Only compute the error if this block contains inliers from the motion
+ // model
+ if (!segment_map[seg_y * segment_map_stride + seg_x]) continue;
+
+ // avoid computing error into the frame padding
+ patch_w = AOMMIN(error_bsize_w, p_width - j);
+ patch_h = AOMMIN(error_bsize_h, p_height - i);
+ sum_error += av1_calc_highbd_frame_error(ref + j + i * stride, stride,
+ dst + j + i * p_stride, patch_w,
+ patch_h, p_stride, bd);
}
}
- return gm_sumerr;
-}
-
-static INLINE int error_measure(int err) {
- return error_measure_lut[255 + err];
+ return sum_error;
}
+#endif // CONFIG_AV1_HIGHBITDEPTH
/* The warp filter for ROTZOOM and AFFINE models works as follows:
* Split the input into 8x8 blocks
@@ -732,7 +650,7 @@ void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width,
const int offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) +
WARPEDPIXEL_PREC_SHIFTS;
assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
- const int16_t *coeffs = warped_filter[offs];
+ const int16_t *coeffs = av1_warped_filter[offs];
int32_t sum = 1 << offset_bits_horiz;
for (int m = 0; m < 8; ++m) {
@@ -756,7 +674,7 @@ void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width,
const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) +
WARPEDPIXEL_PREC_SHIFTS;
assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
- const int16_t *coeffs = warped_filter[offs];
+ const int16_t *coeffs = av1_warped_filter[offs];
int32_t sum = 1 << offset_bits_vert;
for (int m = 0; m < 8; ++m) {
@@ -773,7 +691,7 @@ void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width,
uint8_t *dst8 =
&pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
int32_t tmp32 = *p;
- if (conv_params->use_jnt_comp_avg) {
+ if (conv_params->use_dist_wtd_comp_avg) {
tmp32 = tmp32 * conv_params->fwd_offset +
sum * conv_params->bck_offset;
tmp32 = tmp32 >> DIST_PRECISION_BITS;
@@ -801,11 +719,10 @@ void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width,
}
}
-static void warp_plane(WarpedMotionParams *wm, const uint8_t *const ref,
- int width, int height, int stride, uint8_t *pred,
- int p_col, int p_row, int p_width, int p_height,
- int p_stride, int subsampling_x, int subsampling_y,
- ConvolveParams *conv_params) {
+void warp_plane(WarpedMotionParams *wm, const uint8_t *const ref, int width,
+ int height, int stride, uint8_t *pred, int p_col, int p_row,
+ int p_width, int p_height, int p_stride, int subsampling_x,
+ int subsampling_y, ConvolveParams *conv_params) {
assert(wm->wmtype <= AFFINE);
if (wm->wmtype == ROTZOOM) {
wm->wmmat[5] = wm->wmmat[2];
@@ -821,9 +738,9 @@ static void warp_plane(WarpedMotionParams *wm, const uint8_t *const ref,
alpha, beta, gamma, delta);
}
-static int64_t frame_error(const uint8_t *const ref, int stride,
- const uint8_t *const dst, int p_width, int p_height,
- int p_stride) {
+int64_t av1_calc_frame_error_c(const uint8_t *const ref, int stride,
+ const uint8_t *const dst, int p_width,
+ int p_height, int p_stride) {
int64_t sum_error = 0;
for (int i = 0; i < p_height; ++i) {
for (int j = 0; j < p_width; ++j) {
@@ -834,61 +751,64 @@ static int64_t frame_error(const uint8_t *const ref, int stride,
return sum_error;
}
-static int64_t warp_error(WarpedMotionParams *wm, const uint8_t *const ref,
- int width, int height, int stride,
- const uint8_t *const dst, int p_col, int p_row,
- int p_width, int p_height, int p_stride,
- int subsampling_x, int subsampling_y,
- int64_t best_error) {
- int64_t gm_sumerr = 0;
- int warp_w, warp_h;
- int error_bsize_w = AOMMIN(p_width, WARP_ERROR_BLOCK);
- int error_bsize_h = AOMMIN(p_height, WARP_ERROR_BLOCK);
- uint8_t tmp[WARP_ERROR_BLOCK * WARP_ERROR_BLOCK];
- ConvolveParams conv_params = get_conv_params(0, 0, 8);
- conv_params.use_jnt_comp_avg = 0;
-
- for (int i = p_row; i < p_row + p_height; i += WARP_ERROR_BLOCK) {
- for (int j = p_col; j < p_col + p_width; j += WARP_ERROR_BLOCK) {
- // avoid warping extra 8x8 blocks in the padded region of the frame
- // when p_width and p_height are not multiples of WARP_ERROR_BLOCK
- warp_w = AOMMIN(error_bsize_w, p_col + p_width - j);
- warp_h = AOMMIN(error_bsize_h, p_row + p_height - i);
- warp_plane(wm, ref, width, height, stride, tmp, j, i, warp_w, warp_h,
- WARP_ERROR_BLOCK, subsampling_x, subsampling_y, &conv_params);
-
- gm_sumerr += frame_error(tmp, WARP_ERROR_BLOCK, dst + j + i * p_stride,
- warp_w, warp_h, p_stride);
- if (gm_sumerr > best_error) return gm_sumerr;
+static int64_t segmented_frame_error(const uint8_t *const ref, int stride,
+ const uint8_t *const dst, int p_width,
+ int p_height, int p_stride,
+ uint8_t *segment_map,
+ int segment_map_stride) {
+ int patch_w, patch_h;
+ const int error_bsize_w = AOMMIN(p_width, WARP_ERROR_BLOCK);
+ const int error_bsize_h = AOMMIN(p_height, WARP_ERROR_BLOCK);
+ int64_t sum_error = 0;
+ for (int i = 0; i < p_height; i += WARP_ERROR_BLOCK) {
+ for (int j = 0; j < p_width; j += WARP_ERROR_BLOCK) {
+ int seg_x = j >> WARP_ERROR_BLOCK_LOG;
+ int seg_y = i >> WARP_ERROR_BLOCK_LOG;
+ // Only compute the error if this block contains inliers from the motion
+ // model
+ if (!segment_map[seg_y * segment_map_stride + seg_x]) continue;
+
+ // avoid computing error into the frame padding
+ patch_w = AOMMIN(error_bsize_w, p_width - j);
+ patch_h = AOMMIN(error_bsize_h, p_height - i);
+ sum_error += av1_calc_frame_error(ref + j + i * stride, stride,
+ dst + j + i * p_stride, patch_w,
+ patch_h, p_stride);
}
}
- return gm_sumerr;
+ return sum_error;
}
int64_t av1_frame_error(int use_hbd, int bd, const uint8_t *ref, int stride,
uint8_t *dst, int p_width, int p_height, int p_stride) {
+#if CONFIG_AV1_HIGHBITDEPTH
if (use_hbd) {
- return highbd_frame_error(CONVERT_TO_SHORTPTR(ref), stride,
- CONVERT_TO_SHORTPTR(dst), p_width, p_height,
- p_stride, bd);
+ return av1_calc_highbd_frame_error(CONVERT_TO_SHORTPTR(ref), stride,
+ CONVERT_TO_SHORTPTR(dst), p_width,
+ p_height, p_stride, bd);
}
- return frame_error(ref, stride, dst, p_width, p_height, p_stride);
+#endif
+ (void)use_hbd;
+ (void)bd;
+ return av1_calc_frame_error(ref, stride, dst, p_width, p_height, p_stride);
}
-int64_t av1_warp_error(WarpedMotionParams *wm, int use_hbd, int bd,
- const uint8_t *ref, int width, int height, int stride,
- uint8_t *dst, int p_col, int p_row, int p_width,
- int p_height, int p_stride, int subsampling_x,
- int subsampling_y, int64_t best_error) {
- if (wm->wmtype <= AFFINE)
- if (!get_shear_params(wm)) return 1;
- if (use_hbd)
- return highbd_warp_error(wm, ref, width, height, stride, dst, p_col, p_row,
- p_width, p_height, p_stride, subsampling_x,
- subsampling_y, bd, best_error);
- return warp_error(wm, ref, width, height, stride, dst, p_col, p_row, p_width,
- p_height, p_stride, subsampling_x, subsampling_y,
- best_error);
+int64_t av1_segmented_frame_error(int use_hbd, int bd, const uint8_t *ref,
+ int stride, uint8_t *dst, int p_width,
+ int p_height, int p_stride,
+ uint8_t *segment_map,
+ int segment_map_stride) {
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (use_hbd) {
+ return highbd_segmented_frame_error(
+ CONVERT_TO_SHORTPTR(ref), stride, CONVERT_TO_SHORTPTR(dst), p_width,
+ p_height, p_stride, bd, segment_map, segment_map_stride);
+ }
+#endif
+ (void)use_hbd;
+ (void)bd;
+ return segmented_frame_error(ref, stride, dst, p_width, p_height, p_stride,
+ segment_map, segment_map_stride);
}
void av1_warp_plane(WarpedMotionParams *wm, int use_hbd, int bd,
@@ -896,13 +816,21 @@ void av1_warp_plane(WarpedMotionParams *wm, int use_hbd, int bd,
uint8_t *pred, int p_col, int p_row, int p_width,
int p_height, int p_stride, int subsampling_x,
int subsampling_y, ConvolveParams *conv_params) {
+#if CONFIG_AV1_HIGHBITDEPTH
if (use_hbd)
- highbd_warp_plane(wm, ref, width, height, stride, pred, p_col, p_row,
- p_width, p_height, p_stride, subsampling_x, subsampling_y,
- bd, conv_params);
+ highbd_warp_plane(wm, CONVERT_TO_SHORTPTR(ref), width, height, stride,
+ CONVERT_TO_SHORTPTR(pred), p_col, p_row, p_width,
+ p_height, p_stride, subsampling_x, subsampling_y, bd,
+ conv_params);
else
warp_plane(wm, ref, width, height, stride, pred, p_col, p_row, p_width,
p_height, p_stride, subsampling_x, subsampling_y, conv_params);
+#else
+ (void)use_hbd;
+ (void)bd;
+ warp_plane(wm, ref, width, height, stride, pred, p_col, p_row, p_width,
+ p_height, p_stride, subsampling_x, subsampling_y, conv_params);
+#endif
}
#define LS_MV_MAX 256 // max mv in 1/8-pel
@@ -1023,18 +951,15 @@ static int find_affine_int(int np, const int *pts1, const int *pts2,
int32_t A[2][2] = { { 0, 0 }, { 0, 0 } };
int32_t Bx[2] = { 0, 0 };
int32_t By[2] = { 0, 0 };
- int i;
const int bw = block_size_wide[bsize];
const int bh = block_size_high[bsize];
- const int rsuy = (AOMMAX(bh, MI_SIZE) / 2 - 1);
- const int rsux = (AOMMAX(bw, MI_SIZE) / 2 - 1);
+ const int rsuy = bh / 2 - 1;
+ const int rsux = bw / 2 - 1;
const int suy = rsuy * 8;
const int sux = rsux * 8;
const int duy = suy + mvy;
const int dux = sux + mvx;
- const int isuy = (mi_row * MI_SIZE + rsuy);
- const int isux = (mi_col * MI_SIZE + rsux);
// Assume the center pixel of the block has exactly the same motion vector
// as transmitted for the block. First shift the origin of the source
@@ -1059,7 +984,7 @@ static int find_affine_int(int np, const int *pts1, const int *pts2,
// The loop below computes: A = P'P, Bx = P'q, By = P'r
// We need to just compute inv(A).Bx and inv(A).By for the solutions.
// Contribution from neighbor block
- for (i = 0; i < np; i++) {
+ for (int i = 0; i < np; i++) {
const int dx = pts2[i * 2] - dux;
const int dy = pts2[i * 2 + 1] - duy;
const int sx = pts1[i * 2] - sux;
@@ -1087,13 +1012,12 @@ static int find_affine_int(int np, const int *pts1, const int *pts2,
assert(By[0] >= LS_MAT_MIN && By[0] <= LS_MAT_MAX);
assert(By[1] >= LS_MAT_MIN && By[1] <= LS_MAT_MAX);
- int64_t Det;
- int16_t iDet, shift;
-
// Compute Determinant of A
- Det = (int64_t)A[0][0] * A[1][1] - (int64_t)A[0][1] * A[0][1];
+ const int64_t Det = (int64_t)A[0][0] * A[1][1] - (int64_t)A[0][1] * A[0][1];
if (Det == 0) return 1;
- iDet = resolve_divisor_64(llabs(Det), &shift) * (Det < 0 ? -1 : 1);
+
+ int16_t shift;
+ int16_t iDet = resolve_divisor_64(llabs(Det), &shift) * (Det < 0 ? -1 : 1);
shift -= WARPEDMODEL_PREC_BITS;
if (shift < 0) {
iDet <<= (-shift);
@@ -1101,7 +1025,6 @@ static int find_affine_int(int np, const int *pts1, const int *pts2,
}
int64_t Px[2], Py[2];
-
// These divided by the Det, are the least squares solutions
Px[0] = (int64_t)A[1][1] * Bx[0] - (int64_t)A[0][1] * Bx[1];
Px[1] = -(int64_t)A[0][1] * Bx[0] + (int64_t)A[0][0] * Bx[1];
@@ -1113,16 +1036,18 @@ static int find_affine_int(int np, const int *pts1, const int *pts2,
wm->wmmat[4] = get_mult_shift_ndiag(Py[0], iDet, shift);
wm->wmmat[5] = get_mult_shift_diag(Py[1], iDet, shift);
+ const int isuy = (mi_row * MI_SIZE + rsuy);
+ const int isux = (mi_col * MI_SIZE + rsux);
  // Note: In the vx, vy expressions below, the max value of each of the
  // 2nd and 3rd terms is (2^16 - 1) * (2^13 - 1). That leaves enough room
  // for the first term so that the overall sum in the worst case fits
  // within 32 bits.
- int32_t vx = mvx * (1 << (WARPEDMODEL_PREC_BITS - 3)) -
- (isux * (wm->wmmat[2] - (1 << WARPEDMODEL_PREC_BITS)) +
- isuy * wm->wmmat[3]);
- int32_t vy = mvy * (1 << (WARPEDMODEL_PREC_BITS - 3)) -
- (isux * wm->wmmat[4] +
- isuy * (wm->wmmat[5] - (1 << WARPEDMODEL_PREC_BITS)));
+ const int32_t vx = mvx * (1 << (WARPEDMODEL_PREC_BITS - 3)) -
+ (isux * (wm->wmmat[2] - (1 << WARPEDMODEL_PREC_BITS)) +
+ isuy * wm->wmmat[3]);
+ const int32_t vy = mvy * (1 << (WARPEDMODEL_PREC_BITS - 3)) -
+ (isux * wm->wmmat[4] +
+ isuy * (wm->wmmat[5] - (1 << WARPEDMODEL_PREC_BITS)));
wm->wmmat[0] =
clamp(vx, -WARPEDMODEL_TRANS_CLAMP, WARPEDMODEL_TRANS_CLAMP - 1);
wm->wmmat[1] =
@@ -1132,9 +1057,9 @@ static int find_affine_int(int np, const int *pts1, const int *pts2,
return 0;
}
-int find_projection(int np, int *pts1, int *pts2, BLOCK_SIZE bsize, int mvy,
- int mvx, WarpedMotionParams *wm_params, int mi_row,
- int mi_col) {
+int av1_find_projection(int np, const int *pts1, const int *pts2,
+ BLOCK_SIZE bsize, int mvy, int mvx,
+ WarpedMotionParams *wm_params, int mi_row, int mi_col) {
assert(wm_params->wmtype == AFFINE);
if (find_affine_int(np, pts1, pts2, bsize, mvy, mvx, wm_params, mi_row,
@@ -1142,7 +1067,7 @@ int find_projection(int np, int *pts1, int *pts2, BLOCK_SIZE bsize, int mvy,
return 1;
// check compatibility with the fast warp filter
- if (!get_shear_params(wm_params)) return 1;
+ if (!av1_get_shear_params(wm_params)) return 1;
return 0;
}
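
In floating point, the fixed-point arithmetic above is just Cramer's rule applied to two 2x2 normal-equation systems (A from the sample covariances, Bx and By from the sample cross terms). A sketch that ignores the WARPEDMODEL_PREC_BITS scaling and the clamping:

// Floating-point model of the 2x2 solves in find_affine_int():
//   [wmmat[2] wmmat[3]]' = inv(A) * Bx,  [wmmat[4] wmmat[5]]' = inv(A) * By
static int solve_affine_float(const double A[2][2], const double Bx[2],
                              const double By[2], double mat[4]) {
  const double det = A[0][0] * A[1][1] - A[0][1] * A[0][1];
  if (det == 0.0) return 1;  // degenerate samples, same as the Det == 0 bail-out
  mat[0] = (A[1][1] * Bx[0] - A[0][1] * Bx[1]) / det;  // wmmat[2]
  mat[1] = (A[0][0] * Bx[1] - A[0][1] * Bx[0]) / det;  // wmmat[3]
  mat[2] = (A[1][1] * By[0] - A[0][1] * By[1]) / det;  // wmmat[4]
  mat[3] = (A[0][0] * By[1] - A[0][1] * By[0]) / det;  // wmmat[5]
  return 0;
}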
diff --git a/media/libaom/src/av1/common/warped_motion.h b/media/libaom/src/av1/common/warped_motion.h
index a1a4f067d..14dc0fe47 100644
--- a/media/libaom/src/av1/common/warped_motion.h
+++ b/media/libaom/src/av1/common/warped_motion.h
@@ -31,8 +31,83 @@
#define SAMPLES_ARRAY_SIZE (LEAST_SQUARES_SAMPLES_MAX * 2)
#define WARPED_MOTION_DEBUG 0
#define DEFAULT_WMTYPE AFFINE
+#define WARP_ERROR_BLOCK_LOG 5
+#define WARP_ERROR_BLOCK (1 << WARP_ERROR_BLOCK_LOG)
-extern const int16_t warped_filter[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8];
+extern const int16_t av1_warped_filter[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8];
+
+DECLARE_ALIGNED(8, extern const int8_t,
+ av1_filter_8bit[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8]);
+
+/* clang-format off */
+static const int error_measure_lut[512] = {
+ // pow 0.7
+ 16384, 16339, 16294, 16249, 16204, 16158, 16113, 16068,
+ 16022, 15977, 15932, 15886, 15840, 15795, 15749, 15703,
+ 15657, 15612, 15566, 15520, 15474, 15427, 15381, 15335,
+ 15289, 15242, 15196, 15149, 15103, 15056, 15010, 14963,
+ 14916, 14869, 14822, 14775, 14728, 14681, 14634, 14587,
+ 14539, 14492, 14445, 14397, 14350, 14302, 14254, 14206,
+ 14159, 14111, 14063, 14015, 13967, 13918, 13870, 13822,
+ 13773, 13725, 13676, 13628, 13579, 13530, 13481, 13432,
+ 13383, 13334, 13285, 13236, 13187, 13137, 13088, 13038,
+ 12988, 12939, 12889, 12839, 12789, 12739, 12689, 12639,
+ 12588, 12538, 12487, 12437, 12386, 12335, 12285, 12234,
+ 12183, 12132, 12080, 12029, 11978, 11926, 11875, 11823,
+ 11771, 11719, 11667, 11615, 11563, 11511, 11458, 11406,
+ 11353, 11301, 11248, 11195, 11142, 11089, 11036, 10982,
+ 10929, 10875, 10822, 10768, 10714, 10660, 10606, 10552,
+ 10497, 10443, 10388, 10333, 10279, 10224, 10168, 10113,
+ 10058, 10002, 9947, 9891, 9835, 9779, 9723, 9666,
+ 9610, 9553, 9497, 9440, 9383, 9326, 9268, 9211,
+ 9153, 9095, 9037, 8979, 8921, 8862, 8804, 8745,
+ 8686, 8627, 8568, 8508, 8449, 8389, 8329, 8269,
+ 8208, 8148, 8087, 8026, 7965, 7903, 7842, 7780,
+ 7718, 7656, 7593, 7531, 7468, 7405, 7341, 7278,
+ 7214, 7150, 7086, 7021, 6956, 6891, 6826, 6760,
+ 6695, 6628, 6562, 6495, 6428, 6361, 6293, 6225,
+ 6157, 6089, 6020, 5950, 5881, 5811, 5741, 5670,
+ 5599, 5527, 5456, 5383, 5311, 5237, 5164, 5090,
+ 5015, 4941, 4865, 4789, 4713, 4636, 4558, 4480,
+ 4401, 4322, 4242, 4162, 4080, 3998, 3916, 3832,
+ 3748, 3663, 3577, 3490, 3402, 3314, 3224, 3133,
+ 3041, 2948, 2854, 2758, 2661, 2562, 2461, 2359,
+ 2255, 2148, 2040, 1929, 1815, 1698, 1577, 1452,
+ 1323, 1187, 1045, 894, 731, 550, 339, 0,
+ 339, 550, 731, 894, 1045, 1187, 1323, 1452,
+ 1577, 1698, 1815, 1929, 2040, 2148, 2255, 2359,
+ 2461, 2562, 2661, 2758, 2854, 2948, 3041, 3133,
+ 3224, 3314, 3402, 3490, 3577, 3663, 3748, 3832,
+ 3916, 3998, 4080, 4162, 4242, 4322, 4401, 4480,
+ 4558, 4636, 4713, 4789, 4865, 4941, 5015, 5090,
+ 5164, 5237, 5311, 5383, 5456, 5527, 5599, 5670,
+ 5741, 5811, 5881, 5950, 6020, 6089, 6157, 6225,
+ 6293, 6361, 6428, 6495, 6562, 6628, 6695, 6760,
+ 6826, 6891, 6956, 7021, 7086, 7150, 7214, 7278,
+ 7341, 7405, 7468, 7531, 7593, 7656, 7718, 7780,
+ 7842, 7903, 7965, 8026, 8087, 8148, 8208, 8269,
+ 8329, 8389, 8449, 8508, 8568, 8627, 8686, 8745,
+ 8804, 8862, 8921, 8979, 9037, 9095, 9153, 9211,
+ 9268, 9326, 9383, 9440, 9497, 9553, 9610, 9666,
+ 9723, 9779, 9835, 9891, 9947, 10002, 10058, 10113,
+ 10168, 10224, 10279, 10333, 10388, 10443, 10497, 10552,
+ 10606, 10660, 10714, 10768, 10822, 10875, 10929, 10982,
+ 11036, 11089, 11142, 11195, 11248, 11301, 11353, 11406,
+ 11458, 11511, 11563, 11615, 11667, 11719, 11771, 11823,
+ 11875, 11926, 11978, 12029, 12080, 12132, 12183, 12234,
+ 12285, 12335, 12386, 12437, 12487, 12538, 12588, 12639,
+ 12689, 12739, 12789, 12839, 12889, 12939, 12988, 13038,
+ 13088, 13137, 13187, 13236, 13285, 13334, 13383, 13432,
+ 13481, 13530, 13579, 13628, 13676, 13725, 13773, 13822,
+ 13870, 13918, 13967, 14015, 14063, 14111, 14159, 14206,
+ 14254, 14302, 14350, 14397, 14445, 14492, 14539, 14587,
+ 14634, 14681, 14728, 14775, 14822, 14869, 14916, 14963,
+ 15010, 15056, 15103, 15149, 15196, 15242, 15289, 15335,
+ 15381, 15427, 15474, 15520, 15566, 15612, 15657, 15703,
+ 15749, 15795, 15840, 15886, 15932, 15977, 16022, 16068,
+ 16113, 16158, 16204, 16249, 16294, 16339, 16384, 16384,
+};
+/* clang-format on */
static const uint8_t warp_pad_left[14][16] = {
{ 1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
@@ -68,28 +143,44 @@ static const uint8_t warp_pad_right[14][16] = {
{ 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }
};
-// Returns the error between the result of applying motion 'wm' to the frame
-// described by 'ref' and the frame described by 'dst'.
-int64_t av1_warp_error(WarpedMotionParams *wm, int use_hbd, int bd,
- const uint8_t *ref, int width, int height, int stride,
- uint8_t *dst, int p_col, int p_row, int p_width,
- int p_height, int p_stride, int subsampling_x,
- int subsampling_y, int64_t best_error);
+static INLINE int error_measure(int err) {
+ return error_measure_lut[255 + err];
+}
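
The table above appears to be the Q14 curve 16384 * (|err| / 255)^0.7 (hence the "pow 0.7" comment), indexed by err + 255 with one padding entry at the end so the array holds 512 values. A small generator that reproduces it under that assumption:

#include <math.h>
#include <stdio.h>

// Regenerates error_measure_lut assuming each entry is
// round(2^14 * (|err| / 255)^0.7) for err in [-255, 255], plus one pad value.
int main(void) {
  for (int err = -255; err <= 255; ++err) {
    printf("%ld, ", lround(16384.0 * pow(fabs(err) / 255.0, 0.7)));
  }
  printf("16384,\n");  // padding to reach 512 entries
  return 0;
}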
// Returns the error between the frame described by 'ref' and the frame
// described by 'dst'.
int64_t av1_frame_error(int use_hbd, int bd, const uint8_t *ref, int stride,
uint8_t *dst, int p_width, int p_height, int p_stride);
+int64_t av1_segmented_frame_error(int use_hbd, int bd, const uint8_t *ref,
+ int stride, uint8_t *dst, int p_width,
+ int p_height, int p_stride,
+ uint8_t *segment_map, int segment_map_stride);
+
+int64_t av1_calc_highbd_frame_error(const uint16_t *const ref, int stride,
+ const uint16_t *const dst, int p_width,
+ int p_height, int p_stride, int bd);
+
+void highbd_warp_plane(WarpedMotionParams *wm, const uint16_t *const ref,
+ int width, int height, int stride, uint16_t *const pred,
+ int p_col, int p_row, int p_width, int p_height,
+ int p_stride, int subsampling_x, int subsampling_y,
+ int bd, ConvolveParams *conv_params);
+
+void warp_plane(WarpedMotionParams *wm, const uint8_t *const ref, int width,
+ int height, int stride, uint8_t *pred, int p_col, int p_row,
+ int p_width, int p_height, int p_stride, int subsampling_x,
+ int subsampling_y, ConvolveParams *conv_params);
+
void av1_warp_plane(WarpedMotionParams *wm, int use_hbd, int bd,
const uint8_t *ref, int width, int height, int stride,
uint8_t *pred, int p_col, int p_row, int p_width,
int p_height, int p_stride, int subsampling_x,
int subsampling_y, ConvolveParams *conv_params);
-int find_projection(int np, int *pts1, int *pts2, BLOCK_SIZE bsize, int mvy,
- int mvx, WarpedMotionParams *wm_params, int mi_row,
- int mi_col);
+int av1_find_projection(int np, const int *pts1, const int *pts2,
+ BLOCK_SIZE bsize, int mvy, int mvx,
+ WarpedMotionParams *wm_params, int mi_row, int mi_col);
-int get_shear_params(WarpedMotionParams *wm);
+int av1_get_shear_params(WarpedMotionParams *wm);
#endif // AOM_AV1_COMMON_WARPED_MOTION_H_
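
Taken together, a sketched use of these declarations (illustrative only; real callers live in the encoder and decoder, and default_warp_params is assumed to come from mv.h): fit an affine model from up to LEAST_SQUARES_SAMPLES_MAX neighbor samples, then warp the prediction block.

// Illustrative call sequence for the warped-motion API declared above.
static int fit_and_warp(int np, const int *pts1, const int *pts2,
                        BLOCK_SIZE bsize, int mvy, int mvx, int mi_row,
                        int mi_col, const uint8_t *ref, int width, int height,
                        int stride, uint8_t *pred, int p_col, int p_row,
                        int p_width, int p_height, int p_stride,
                        ConvolveParams *conv_params) {
  WarpedMotionParams wm = default_warp_params;  // assumed initializer from mv.h
  wm.wmtype = AFFINE;  // av1_find_projection() asserts an AFFINE model
  // Non-zero return: degenerate samples or the shear compatibility check failed.
  if (av1_find_projection(np, pts1, pts2, bsize, mvy, mvx, &wm, mi_row, mi_col))
    return 1;
  av1_warp_plane(&wm, /*use_hbd=*/0, /*bd=*/8, ref, width, height, stride,
                 pred, p_col, p_row, p_width, p_height, p_stride,
                 /*subsampling_x=*/0, /*subsampling_y=*/0, conv_params);
  return 0;
}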
diff --git a/media/libaom/src/av1/common/x86/av1_convolve_scale_sse4.c b/media/libaom/src/av1/common/x86/av1_convolve_scale_sse4.c
index d9fb53785..196618176 100644
--- a/media/libaom/src/av1/common/x86/av1_convolve_scale_sse4.c
+++ b/media/libaom/src/av1/common/x86/av1_convolve_scale_sse4.c
@@ -129,8 +129,8 @@ static void vfilter8(const int16_t *src, int src_stride, uint8_t *dst,
const int w0 = conv_params->fwd_offset;
const int w1 = conv_params->bck_offset;
- const __m128i wt0 = _mm_set1_epi16(w0);
- const __m128i wt1 = _mm_set1_epi16(w1);
+ const __m128i wt0 = _mm_set1_epi16((short)w0);
+ const __m128i wt1 = _mm_set1_epi16((short)w1);
const __m128i wt = _mm_unpacklo_epi16(wt0, wt1);
int y_qn = subpel_y_qn;
@@ -175,7 +175,7 @@ static void vfilter8(const int16_t *src, int src_stride, uint8_t *dst,
if (conv_params->is_compound) {
if (conv_params->do_average) {
const __m128i p_16 = _mm_loadl_epi64((__m128i *)dst_16_x);
- if (conv_params->use_jnt_comp_avg) {
+ if (conv_params->use_dist_wtd_comp_avg) {
const __m128i p_16_lo = _mm_unpacklo_epi16(p_16, shifted_16);
const __m128i wt_res_lo = _mm_madd_epi16(p_16_lo, wt);
const __m128i shifted_32 =
@@ -207,7 +207,7 @@ static void vfilter8(const int16_t *src, int src_stride, uint8_t *dst,
if (conv_params->is_compound) {
if (conv_params->do_average) {
int32_t tmp = dst16[y * dst16_stride + x];
- if (conv_params->use_jnt_comp_avg) {
+ if (conv_params->use_dist_wtd_comp_avg) {
tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
tmp = tmp >> DIST_PRECISION_BITS;
} else {
@@ -236,8 +236,7 @@ void av1_convolve_2d_scale_sse4_1(const uint8_t *src, int src_stride,
const int subpel_x_qn, const int x_step_qn,
const int subpel_y_qn, const int y_step_qn,
ConvolveParams *conv_params) {
- // TODO(yaowu): remove unnecessary initializations
- int16_t tmp[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE] = { 0 };
+ int16_t tmp[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE];
int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) +
filter_params_y->taps;
@@ -408,7 +407,7 @@ static void highbd_vfilter8(const int16_t *src, int src_stride, uint16_t *dst,
__m128i p_32 =
_mm_cvtepu16_epi32(_mm_loadl_epi64((__m128i *)dst_16_x));
- if (conv_params->use_jnt_comp_avg) {
+ if (conv_params->use_dist_wtd_comp_avg) {
shifted = _mm_add_epi32(_mm_mullo_epi32(p_32, wt0),
_mm_mullo_epi32(shifted, wt1));
shifted = _mm_srai_epi32(shifted, DIST_PRECISION_BITS);
@@ -443,7 +442,7 @@ static void highbd_vfilter8(const int16_t *src, int src_stride, uint16_t *dst,
if (conv_params->is_compound) {
if (conv_params->do_average) {
int32_t tmp = dst16[y * dst16_stride + x];
- if (conv_params->use_jnt_comp_avg) {
+ if (conv_params->use_dist_wtd_comp_avg) {
tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
tmp = tmp >> DIST_PRECISION_BITS;
} else {
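
The renamed use_dist_wtd_comp_avg flag selects between a distance-weighted blend and a plain average when combining the two compound predictions. In scalar form (DIST_PRECISION_BITS is 4 in libaom; the plain-average branch is an assumption based on the surrounding pattern, since the hunks above cut off before it):

// Scalar model of the compound averaging step used in the SIMD code above.
static int32_t compound_average(int32_t prev, int32_t res, int fwd_offset,
                                int bck_offset, int use_dist_wtd_comp_avg) {
  if (use_dist_wtd_comp_avg) {
    // Weighted blend; fwd_offset + bck_offset == (1 << DIST_PRECISION_BITS).
    return (prev * fwd_offset + res * bck_offset) >> DIST_PRECISION_BITS;
  }
  return (prev + res) >> 1;  // unweighted average of the two predictions
}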
diff --git a/media/libaom/src/av1/common/x86/av1_highbd_convolve_sse4.c b/media/libaom/src/av1/common/x86/av1_highbd_convolve_sse4.c
deleted file mode 100644
index 212d3bd72..000000000
--- a/media/libaom/src/av1/common/x86/av1_highbd_convolve_sse4.c
+++ /dev/null
@@ -1,205 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include <smmintrin.h>
-
-#include "config/av1_rtcd.h"
-
-#include "av1/common/filter.h"
-
-typedef void (*TransposeSave)(int width, int pixelsNum, uint32_t *src,
- int src_stride, uint16_t *dst, int dst_stride,
- int bd);
-
-// pixelsNum 0: write all 4 pixels
-// 1/2/3: residual pixels 1/2/3
-static void writePixel(__m128i *u, int width, int pixelsNum, uint16_t *dst,
- int dst_stride) {
- if (2 == width) {
- if (0 == pixelsNum) {
- *(int *)dst = _mm_cvtsi128_si32(u[0]);
- *(int *)(dst + dst_stride) = _mm_cvtsi128_si32(u[1]);
- *(int *)(dst + 2 * dst_stride) = _mm_cvtsi128_si32(u[2]);
- *(int *)(dst + 3 * dst_stride) = _mm_cvtsi128_si32(u[3]);
- } else if (1 == pixelsNum) {
- *(int *)dst = _mm_cvtsi128_si32(u[0]);
- } else if (2 == pixelsNum) {
- *(int *)dst = _mm_cvtsi128_si32(u[0]);
- *(int *)(dst + dst_stride) = _mm_cvtsi128_si32(u[1]);
- } else if (3 == pixelsNum) {
- *(int *)dst = _mm_cvtsi128_si32(u[0]);
- *(int *)(dst + dst_stride) = _mm_cvtsi128_si32(u[1]);
- *(int *)(dst + 2 * dst_stride) = _mm_cvtsi128_si32(u[2]);
- }
- } else {
- if (0 == pixelsNum) {
- _mm_storel_epi64((__m128i *)dst, u[0]);
- _mm_storel_epi64((__m128i *)(dst + dst_stride), u[1]);
- _mm_storel_epi64((__m128i *)(dst + 2 * dst_stride), u[2]);
- _mm_storel_epi64((__m128i *)(dst + 3 * dst_stride), u[3]);
- } else if (1 == pixelsNum) {
- _mm_storel_epi64((__m128i *)dst, u[0]);
- } else if (2 == pixelsNum) {
- _mm_storel_epi64((__m128i *)dst, u[0]);
- _mm_storel_epi64((__m128i *)(dst + dst_stride), u[1]);
- } else if (3 == pixelsNum) {
- _mm_storel_epi64((__m128i *)dst, u[0]);
- _mm_storel_epi64((__m128i *)(dst + dst_stride), u[1]);
- _mm_storel_epi64((__m128i *)(dst + 2 * dst_stride), u[2]);
- }
- }
-}
-
-// 16-bit pixels clip with bd (10/12)
-static void highbd_clip(__m128i *p, int numVecs, int bd) {
- const __m128i zero = _mm_setzero_si128();
- const __m128i one = _mm_set1_epi16(1);
- const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);
- __m128i clamped, mask;
- int i;
-
- for (i = 0; i < numVecs; i++) {
- mask = _mm_cmpgt_epi16(p[i], max);
- clamped = _mm_andnot_si128(mask, p[i]);
- mask = _mm_and_si128(mask, max);
- clamped = _mm_or_si128(mask, clamped);
- mask = _mm_cmpgt_epi16(clamped, zero);
- p[i] = _mm_and_si128(clamped, mask);
- }
-}
-
-static void transClipPixel(uint32_t *src, int src_stride, __m128i *u, int bd) {
- __m128i v0, v1;
- __m128i rnd = _mm_set1_epi32(1 << (FILTER_BITS - 1));
-
- u[0] = _mm_loadu_si128((__m128i const *)src);
- u[1] = _mm_loadu_si128((__m128i const *)(src + src_stride));
- u[2] = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride));
- u[3] = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride));
-
- u[0] = _mm_add_epi32(u[0], rnd);
- u[1] = _mm_add_epi32(u[1], rnd);
- u[2] = _mm_add_epi32(u[2], rnd);
- u[3] = _mm_add_epi32(u[3], rnd);
-
- u[0] = _mm_srai_epi32(u[0], FILTER_BITS);
- u[1] = _mm_srai_epi32(u[1], FILTER_BITS);
- u[2] = _mm_srai_epi32(u[2], FILTER_BITS);
- u[3] = _mm_srai_epi32(u[3], FILTER_BITS);
-
- u[0] = _mm_packus_epi32(u[0], u[1]);
- u[1] = _mm_packus_epi32(u[2], u[3]);
-
- highbd_clip(u, 2, bd);
-
- v0 = _mm_unpacklo_epi16(u[0], u[1]);
- v1 = _mm_unpackhi_epi16(u[0], u[1]);
-
- u[0] = _mm_unpacklo_epi16(v0, v1);
- u[2] = _mm_unpackhi_epi16(v0, v1);
-
- u[1] = _mm_srli_si128(u[0], 8);
- u[3] = _mm_srli_si128(u[2], 8);
-}
-
-// pixelsNum = 0 : all 4 rows of pixels will be saved.
-// pixelsNum = 1/2/3 : residual 1/2/4 rows of pixels will be saved.
-void trans_save_4x4(int width, int pixelsNum, uint32_t *src, int src_stride,
- uint16_t *dst, int dst_stride, int bd) {
- __m128i u[4];
- transClipPixel(src, src_stride, u, bd);
- writePixel(u, width, pixelsNum, dst, dst_stride);
-}
-
-void trans_accum_save_4x4(int width, int pixelsNum, uint32_t *src,
- int src_stride, uint16_t *dst, int dst_stride,
- int bd) {
- __m128i u[4], v[4];
- const __m128i ones = _mm_set1_epi16(1);
-
- transClipPixel(src, src_stride, u, bd);
-
- v[0] = _mm_loadl_epi64((__m128i const *)dst);
- v[1] = _mm_loadl_epi64((__m128i const *)(dst + dst_stride));
- v[2] = _mm_loadl_epi64((__m128i const *)(dst + 2 * dst_stride));
- v[3] = _mm_loadl_epi64((__m128i const *)(dst + 3 * dst_stride));
-
- u[0] = _mm_add_epi16(u[0], v[0]);
- u[1] = _mm_add_epi16(u[1], v[1]);
- u[2] = _mm_add_epi16(u[2], v[2]);
- u[3] = _mm_add_epi16(u[3], v[3]);
-
- u[0] = _mm_add_epi16(u[0], ones);
- u[1] = _mm_add_epi16(u[1], ones);
- u[2] = _mm_add_epi16(u[2], ones);
- u[3] = _mm_add_epi16(u[3], ones);
-
- u[0] = _mm_srai_epi16(u[0], 1);
- u[1] = _mm_srai_epi16(u[1], 1);
- u[2] = _mm_srai_epi16(u[2], 1);
- u[3] = _mm_srai_epi16(u[3], 1);
-
- writePixel(u, width, pixelsNum, dst, dst_stride);
-}
-
-// Vertical convolutional filter
-
-typedef void (*WritePixels)(__m128i *u, int bd, uint16_t *dst);
-
-static void highbdRndingPacks(__m128i *u) {
- __m128i rnd = _mm_set1_epi32(1 << (FILTER_BITS - 1));
- u[0] = _mm_add_epi32(u[0], rnd);
- u[0] = _mm_srai_epi32(u[0], FILTER_BITS);
- u[0] = _mm_packus_epi32(u[0], u[0]);
-}
-
-static void write2pixelsOnly(__m128i *u, int bd, uint16_t *dst) {
- highbdRndingPacks(u);
- highbd_clip(u, 1, bd);
- *(uint32_t *)dst = _mm_cvtsi128_si32(u[0]);
-}
-
-static void write2pixelsAccum(__m128i *u, int bd, uint16_t *dst) {
- __m128i v = _mm_loadl_epi64((__m128i const *)dst);
- const __m128i ones = _mm_set1_epi16(1);
-
- highbdRndingPacks(u);
- highbd_clip(u, 1, bd);
-
- v = _mm_add_epi16(v, u[0]);
- v = _mm_add_epi16(v, ones);
- v = _mm_srai_epi16(v, 1);
- *(uint32_t *)dst = _mm_cvtsi128_si32(v);
-}
-
-WritePixels write2pixelsTab[2] = { write2pixelsOnly, write2pixelsAccum };
-
-static void write4pixelsOnly(__m128i *u, int bd, uint16_t *dst) {
- highbdRndingPacks(u);
- highbd_clip(u, 1, bd);
- _mm_storel_epi64((__m128i *)dst, u[0]);
-}
-
-static void write4pixelsAccum(__m128i *u, int bd, uint16_t *dst) {
- __m128i v = _mm_loadl_epi64((__m128i const *)dst);
- const __m128i ones = _mm_set1_epi16(1);
-
- highbdRndingPacks(u);
- highbd_clip(u, 1, bd);
-
- v = _mm_add_epi16(v, u[0]);
- v = _mm_add_epi16(v, ones);
- v = _mm_srai_epi16(v, 1);
- _mm_storel_epi64((__m128i *)dst, v);
-}
-
-WritePixels write4pixelsTab[2] = { write4pixelsOnly, write4pixelsAccum };
diff --git a/media/libaom/src/av1/common/x86/av1_inv_txfm_avx2.c b/media/libaom/src/av1/common/x86/av1_inv_txfm_avx2.c
index 5db2ccf6c..0fbd5eae4 100644
--- a/media/libaom/src/av1/common/x86/av1_inv_txfm_avx2.c
+++ b/media/libaom/src/av1/common/x86/av1_inv_txfm_avx2.c
@@ -61,8 +61,7 @@ static INLINE void idct16_stage7_avx2(__m256i *output, __m256i *x1) {
btf_16_adds_subs_out_avx2(&output[7], &output[8], x1[7], x1[8]);
}
-static void idct16_new_avx2(const __m256i *input, __m256i *output,
- int8_t cos_bit) {
+static void idct16_avx2(const __m256i *input, __m256i *output, int8_t cos_bit) {
(void)(cos_bit);
const int32_t *cospi = cospi_arr(INV_COS_BIT);
const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
@@ -133,8 +132,8 @@ static void idct16_new_avx2(const __m256i *input, __m256i *output,
idct16_stage7_avx2(output, x1);
}
-static void idct16_low8_new_avx2(const __m256i *input, __m256i *output,
- int8_t cos_bit) {
+static void idct16_low8_avx2(const __m256i *input, __m256i *output,
+ int8_t cos_bit) {
(void)(cos_bit);
const int32_t *cospi = cospi_arr(INV_COS_BIT);
const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
@@ -181,8 +180,8 @@ static void idct16_low8_new_avx2(const __m256i *input, __m256i *output,
idct16_stage7_avx2(output, x1);
}
-static void idct16_low1_new_avx2(const __m256i *input, __m256i *output,
- int8_t cos_bit) {
+static void idct16_low1_avx2(const __m256i *input, __m256i *output,
+ int8_t cos_bit) {
(void)(cos_bit);
const int32_t *cospi = cospi_arr(INV_COS_BIT);
@@ -303,8 +302,8 @@ static INLINE void iadst16_stage9_avx2(__m256i *output, __m256i *x1) {
output[15] = _mm256_subs_epi16(__zero, x1[1]);
}
-static void iadst16_new_avx2(const __m256i *input, __m256i *output,
- int8_t cos_bit) {
+static void iadst16_avx2(const __m256i *input, __m256i *output,
+ int8_t cos_bit) {
(void)(cos_bit);
const int32_t *cospi = cospi_arr(INV_COS_BIT);
@@ -365,8 +364,8 @@ static void iadst16_new_avx2(const __m256i *input, __m256i *output,
iadst16_stage9_avx2(output, x1);
}
-static void iadst16_low8_new_avx2(const __m256i *input, __m256i *output,
- int8_t cos_bit) {
+static void iadst16_low8_avx2(const __m256i *input, __m256i *output,
+ int8_t cos_bit) {
(void)(cos_bit);
const int32_t *cospi = cospi_arr(INV_COS_BIT);
const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
@@ -401,8 +400,8 @@ static void iadst16_low8_new_avx2(const __m256i *input, __m256i *output,
iadst16_stage9_avx2(output, x1);
}
-static void iadst16_low1_new_avx2(const __m256i *input, __m256i *output,
- int8_t cos_bit) {
+static void iadst16_low1_avx2(const __m256i *input, __m256i *output,
+ int8_t cos_bit) {
(void)(cos_bit);
const int32_t *cospi = cospi_arr(INV_COS_BIT);
const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
@@ -568,8 +567,8 @@ static INLINE void idct32_stage9_avx2(__m256i *output, __m256i *x) {
btf_16_adds_subs_out_avx2(&output[15], &output[16], x[15], x[16]);
}
-static void idct32_low1_new_avx2(const __m256i *input, __m256i *output,
- int8_t cos_bit) {
+static void idct32_low1_avx2(const __m256i *input, __m256i *output,
+ int8_t cos_bit) {
(void)cos_bit;
const int32_t *cospi = cospi_arr(INV_COS_BIT);
@@ -621,8 +620,8 @@ static void idct32_low1_new_avx2(const __m256i *input, __m256i *output,
output[16] = x[0];
}
-static void idct32_low8_new_avx2(const __m256i *input, __m256i *output,
- int8_t cos_bit) {
+static void idct32_low8_avx2(const __m256i *input, __m256i *output,
+ int8_t cos_bit) {
(void)cos_bit;
const int32_t *cospi = cospi_arr(INV_COS_BIT);
const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
@@ -679,8 +678,8 @@ static void idct32_low8_new_avx2(const __m256i *input, __m256i *output,
idct32_stage9_avx2(output, x);
}
-static void idct32_low16_new_avx2(const __m256i *input, __m256i *output,
- int8_t cos_bit) {
+static void idct32_low16_avx2(const __m256i *input, __m256i *output,
+ int8_t cos_bit) {
(void)cos_bit;
const int32_t *cospi = cospi_arr(INV_COS_BIT);
const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
@@ -746,8 +745,7 @@ static void idct32_low16_new_avx2(const __m256i *input, __m256i *output,
idct32_stage9_avx2(output, x);
}
-static void idct32_new_avx2(const __m256i *input, __m256i *output,
- int8_t cos_bit) {
+static void idct32_avx2(const __m256i *input, __m256i *output, int8_t cos_bit) {
(void)(cos_bit);
const int32_t *cospi = cospi_arr(INV_COS_BIT);
const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
@@ -1104,8 +1102,8 @@ static INLINE void idct64_stage11_avx2(__m256i *output, __m256i *x) {
btf_16_adds_subs_out_avx2(&output[31], &output[32], x[31], x[32]);
}
-static void idct64_low1_new_avx2(const __m256i *input, __m256i *output,
- int8_t cos_bit) {
+static void idct64_low1_avx2(const __m256i *input, __m256i *output,
+ int8_t cos_bit) {
(void)cos_bit;
const int32_t *cospi = cospi_arr(INV_COS_BIT);
@@ -1191,8 +1189,8 @@ static void idct64_low1_new_avx2(const __m256i *input, __m256i *output,
output[32] = x[0];
}
-static void idct64_low8_new_avx2(const __m256i *input, __m256i *output,
- int8_t cos_bit) {
+static void idct64_low8_avx2(const __m256i *input, __m256i *output,
+ int8_t cos_bit) {
(void)cos_bit;
const int32_t *cospi = cospi_arr(INV_COS_BIT);
const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
@@ -1302,7 +1300,6 @@ static void idct64_low8_new_avx2(const __m256i *input, __m256i *output,
x[6] = x[1];
x[5] = x[2];
x[4] = x[3];
- x[9] = x[9];
btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r, cos_bit);
btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r, cos_bit);
idct64_stage8_high48_avx2(x, cospi, _r, cos_bit);
@@ -1312,8 +1309,8 @@ static void idct64_low8_new_avx2(const __m256i *input, __m256i *output,
idct64_stage11_avx2(output, x);
}
-static void idct64_low16_new_avx2(const __m256i *input, __m256i *output,
- int8_t cos_bit) {
+static void idct64_low16_avx2(const __m256i *input, __m256i *output,
+ int8_t cos_bit) {
(void)cos_bit;
const int32_t *cospi = cospi_arr(INV_COS_BIT);
const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
@@ -1428,8 +1425,8 @@ static void idct64_low16_new_avx2(const __m256i *input, __m256i *output,
idct64_stage11_avx2(output, x);
}
-static void idct64_low32_new_avx2(const __m256i *input, __m256i *output,
- int8_t cos_bit) {
+static void idct64_low32_avx2(const __m256i *input, __m256i *output,
+ int8_t cos_bit) {
(void)cos_bit;
const int32_t *cospi = cospi_arr(INV_COS_BIT);
const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
@@ -1577,6 +1574,9 @@ static void idct64_low32_new_avx2(const __m256i *input, __m256i *output,
idct64_stage11_avx2(output, x);
}
+typedef void (*transform_1d_avx2)(const __m256i *input, __m256i *output,
+ int8_t cos_bit);
+
// 1D functions process 16 pixels at one time.
static const transform_1d_avx2
lowbd_txfm_all_1d_zeros_w16_arr[TX_SIZES][ITX_TYPES_1D][4] = {
@@ -1589,17 +1589,15 @@ static const transform_1d_avx2
{ NULL, NULL, NULL, NULL },
{ NULL, NULL, NULL, NULL } },
{
- { idct16_low1_new_avx2, idct16_low8_new_avx2, idct16_new_avx2, NULL },
- { iadst16_low1_new_avx2, iadst16_low8_new_avx2, iadst16_new_avx2,
- NULL },
+ { idct16_low1_avx2, idct16_low8_avx2, idct16_avx2, NULL },
+ { iadst16_low1_avx2, iadst16_low8_avx2, iadst16_avx2, NULL },
{ NULL, NULL, NULL, NULL },
},
- { { idct32_low1_new_avx2, idct32_low8_new_avx2, idct32_low16_new_avx2,
- idct32_new_avx2 },
+ { { idct32_low1_avx2, idct32_low8_avx2, idct32_low16_avx2, idct32_avx2 },
{ NULL, NULL, NULL, NULL },
{ NULL, NULL, NULL, NULL } },
- { { idct64_low1_new_avx2, idct64_low8_new_avx2, idct64_low16_new_avx2,
- idct64_low32_new_avx2 },
+ { { idct64_low1_avx2, idct64_low8_avx2, idct64_low16_avx2,
+ idct64_low32_avx2 },
{ NULL, NULL, NULL, NULL },
{ NULL, NULL, NULL, NULL } }
};
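
The four columns of each row correspond to progressively larger nonzero regions of the input (DC only, up to 8, up to 16, full size), so callers can pick the cheapest variant that still covers the last nonzero row or column. An illustrative selector (not the library's actual indexing helper):

// Picks the cheapest 1-D transform variant covering 'eob_line', the last
// row/column holding a nonzero coefficient (illustrative thresholds).
static transform_1d_avx2 pick_txfm_variant(const transform_1d_avx2 fns[4],
                                           int eob_line) {
  if (eob_line < 1) return fns[0];    // DC-only input
  if (eob_line < 8) return fns[1];
  if (eob_line < 16) return fns[2];
  return fns[3];                      // full-length transform
}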
@@ -1611,11 +1609,11 @@ static INLINE void lowbd_inv_txfm2d_add_no_identity_avx2(
__m256i buf1[64 * 16];
int eobx, eoby;
get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
- const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
const int txw_idx = get_txw_idx(tx_size);
const int txh_idx = get_txh_idx(tx_size);
- const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
- const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+ const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx];
const int txfm_size_col = tx_size_wide[tx_size];
const int txfm_size_row = tx_size_high[tx_size];
const int buf_size_w_div16 = txfm_size_col >> 4;
@@ -1635,6 +1633,7 @@ static INLINE void lowbd_inv_txfm2d_add_no_identity_avx2(
assert(row_txfm != NULL);
int ud_flip, lr_flip;
get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ const __m256i scale0 = _mm256_set1_epi16(1 << (15 + shift[0]));
for (int i = 0; i < buf_size_nonzero_h_div16; i++) {
__m256i buf0[64];
const int32_t *input_row = input + (i << 4) * input_stride;
@@ -1649,7 +1648,9 @@ static INLINE void lowbd_inv_txfm2d_add_no_identity_avx2(
round_shift_avx2(buf0, buf0, input_stride); // rect special code
}
row_txfm(buf0, buf0, cos_bit_row);
- round_shift_16bit_w16_avx2(buf0, txfm_size_col, shift[0]);
+ for (int j = 0; j < txfm_size_col; ++j) {
+ buf0[j] = _mm256_mulhrs_epi16(buf0[j], scale0);
+ }
__m256i *buf1_cur = buf1 + (i << 4);
if (lr_flip) {
@@ -1665,10 +1666,13 @@ static INLINE void lowbd_inv_txfm2d_add_no_identity_avx2(
}
}
}
+ const __m256i scale1 = _mm256_set1_epi16(1 << (15 + shift[1]));
for (int i = 0; i < buf_size_w_div16; i++) {
__m256i *buf1_cur = buf1 + i * txfm_size_row;
col_txfm(buf1_cur, buf1_cur, cos_bit_col);
- round_shift_16bit_w16_avx2(buf1_cur, txfm_size_row, shift[1]);
+ for (int j = 0; j < txfm_size_row; ++j) {
+ buf1_cur[j] = _mm256_mulhrs_epi16(buf1_cur[j], scale1);
+ }
}
for (int i = 0; i < buf_size_w_div16; i++) {
lowbd_write_buffer_16xn_avx2(buf1 + i * txfm_size_row, output + 16 * i,
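
The mulhrs-based replacement for round_shift_16bit_w16_avx2 works because _mm256_mulhrs_epi16 computes (a * b + 2^14) >> 15 per 16-bit lane, so multiplying by 1 << (15 + shift) (shift is negative in av1_inv_txfm_shift_ls) yields a rounding right shift by -shift. A scalar model of one lane:

// Scalar model of _mm256_mulhrs_epi16(a, 1 << (15 + shift)) for shift < 0.
static int16_t mulhrs_round_shift(int16_t a, int shift) {
  const int32_t b = 1 << (15 + shift);         // e.g. shift = -2 -> b = 1 << 13
  const int32_t prod = (int32_t)a * b;
  return (int16_t)((prod + (1 << 14)) >> 15);  // == (a + (1 << (-shift - 1))) >> -shift
}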
@@ -1745,7 +1749,7 @@ static INLINE void lowbd_inv_txfm2d_add_idtx_avx2(const int32_t *input,
TX_SIZE tx_size,
int32_t eob) {
(void)eob;
- const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
const int txw_idx = get_txw_idx(tx_size);
const int txh_idx = get_txh_idx(tx_size);
const int txfm_size_col = tx_size_wide[tx_size];
@@ -1767,10 +1771,10 @@ static INLINE void lowbd_inv_txfm2d_add_h_identity_avx2(
TX_SIZE tx_size, int eob) {
int eobx, eoby;
get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob);
- const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
const int txw_idx = get_txw_idx(tx_size);
const int txh_idx = get_txh_idx(tx_size);
- const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx];
const int txfm_size_col = tx_size_wide[tx_size];
const int txfm_size_row = tx_size_high[tx_size];
const int txfm_size_col_notzero = AOMMIN(32, txfm_size_col);
@@ -1807,10 +1811,10 @@ static INLINE void lowbd_inv_txfm2d_add_v_identity_avx2(
__m256i buf1[64];
int eobx, eoby;
get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob);
- const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
const int txw_idx = get_txw_idx(tx_size);
const int txh_idx = get_txh_idx(tx_size);
- const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+ const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx];
const int txfm_size_col = tx_size_wide[tx_size];
const int txfm_size_row = tx_size_high[tx_size];
const int buf_size_w_div16 = txfm_size_col >> 4;
diff --git a/media/libaom/src/av1/common/x86/av1_inv_txfm_ssse3.c b/media/libaom/src/av1/common/x86/av1_inv_txfm_ssse3.c
index 995bc3da4..46c051ff8 100644
--- a/media/libaom/src/av1/common/x86/av1_inv_txfm_ssse3.c
+++ b/media/libaom/src/av1/common/x86/av1_inv_txfm_ssse3.c
@@ -24,8 +24,7 @@ static int32_t NewSqrt2list[TX_SIZES] = { 5793, 2 * 4096, 2 * 5793, 4 * 4096,
// TODO(binpengsmail@gmail.com): replace some for loops with do {} while
-static void idct4_new_sse2(const __m128i *input, __m128i *output,
- int8_t cos_bit) {
+static void idct4_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
(void)cos_bit;
const int32_t *cospi = cospi_arr(INV_COS_BIT);
const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
@@ -51,7 +50,8 @@ static void idct4_new_sse2(const __m128i *input, __m128i *output,
btf_16_adds_subs_out_sse2(output[1], output[2], x[1], x[2]);
}
-void idct4_w4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
+static void idct4_w4_sse2(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
(void)cos_bit;
const int32_t *cospi = cospi_arr(INV_COS_BIT);
const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
@@ -77,8 +77,8 @@ void idct4_w4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
btf_16_adds_subs_out_sse2(output[1], output[2], x[1], x[2]);
}
-void idct8_low1_new_ssse3(const __m128i *input, __m128i *output,
- int8_t cos_bit) {
+static void idct8_low1_ssse3(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
(void)cos_bit;
const int32_t *cospi = cospi_arr(INV_COS_BIT);
@@ -102,7 +102,7 @@ void idct8_low1_new_ssse3(const __m128i *input, __m128i *output,
output[4] = x[0];
}
-void idct8_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
+static void idct8_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
(void)cos_bit;
const int32_t *cospi = cospi_arr(INV_COS_BIT);
const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
@@ -150,7 +150,8 @@ void idct8_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
btf_16_adds_subs_out_sse2(output[3], output[4], x[3], x[4]);
}
-void idct8_w4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
+static void idct8_w4_sse2(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
(void)cos_bit;
const int32_t *cospi = cospi_arr(INV_COS_BIT);
const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
@@ -236,8 +237,8 @@ static INLINE void idct16_stage7_sse2(__m128i *output, __m128i *x) {
btf_16_adds_subs_out_sse2(output[7], output[8], x[7], x[8]);
}
-static void idct16_low1_new_ssse3(const __m128i *input, __m128i *output,
- int8_t cos_bit) {
+static void idct16_low1_ssse3(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
(void)cos_bit;
const int32_t *cospi = cospi_arr(INV_COS_BIT);
@@ -271,8 +272,8 @@ static void idct16_low1_new_ssse3(const __m128i *input, __m128i *output,
output[8] = x[0];
}
-static void idct16_low8_new_ssse3(const __m128i *input, __m128i *output,
- int8_t cos_bit) {
+static void idct16_low8_ssse3(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
(void)cos_bit;
const int32_t *cospi = cospi_arr(INV_COS_BIT);
const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
@@ -318,7 +319,7 @@ static void idct16_low8_new_ssse3(const __m128i *input, __m128i *output,
idct16_stage7_sse2(output, x);
}
-void idct16_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
+static void idct16_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
(void)cos_bit;
const int32_t *cospi = cospi_arr(INV_COS_BIT);
const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
@@ -390,7 +391,8 @@ void idct16_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
idct16_stage7_sse2(output, x);
}
-void idct16_w4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
+static void idct16_w4_sse2(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
(void)cos_bit;
const int32_t *cospi = cospi_arr(INV_COS_BIT);
const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
@@ -600,8 +602,8 @@ static INLINE void idct32_stage9_sse2(__m128i *output, __m128i *x) {
btf_16_adds_subs_out_sse2(output[15], output[16], x[15], x[16]);
}
-static void idct32_low1_new_ssse3(const __m128i *input, __m128i *output,
- int8_t cos_bit) {
+static void idct32_low1_ssse3(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
(void)cos_bit;
const int32_t *cospi = cospi_arr(INV_COS_BIT);
@@ -653,8 +655,8 @@ static void idct32_low1_new_ssse3(const __m128i *input, __m128i *output,
output[16] = x[0];
}
-static void idct32_low8_new_ssse3(const __m128i *input, __m128i *output,
- int8_t cos_bit) {
+static void idct32_low8_ssse3(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
(void)cos_bit;
const int32_t *cospi = cospi_arr(INV_COS_BIT);
const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
@@ -711,8 +713,8 @@ static void idct32_low8_new_ssse3(const __m128i *input, __m128i *output,
idct32_stage9_sse2(output, x);
}
-static void idct32_low16_new_ssse3(const __m128i *input, __m128i *output,
- int8_t cos_bit) {
+static void idct32_low16_ssse3(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
(void)cos_bit;
const int32_t *cospi = cospi_arr(INV_COS_BIT);
const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
@@ -778,8 +780,7 @@ static void idct32_low16_new_ssse3(const __m128i *input, __m128i *output,
idct32_stage9_sse2(output, x);
}
-static void idct32_new_sse2(const __m128i *input, __m128i *output,
- int8_t cos_bit) {
+static void idct32_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
(void)cos_bit;
const int32_t *cospi = cospi_arr(INV_COS_BIT);
const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
@@ -1138,8 +1139,8 @@ static INLINE void idct64_stage11_sse2(__m128i *output, __m128i *x) {
btf_16_adds_subs_out_sse2(output[31], output[32], x[31], x[32]);
}
-static void idct64_low1_new_ssse3(const __m128i *input, __m128i *output,
- int8_t cos_bit) {
+static void idct64_low1_ssse3(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
(void)cos_bit;
const int32_t *cospi = cospi_arr(INV_COS_BIT);
@@ -1225,8 +1226,8 @@ static void idct64_low1_new_ssse3(const __m128i *input, __m128i *output,
output[32] = x[0];
}
-static void idct64_low8_new_ssse3(const __m128i *input, __m128i *output,
- int8_t cos_bit) {
+static void idct64_low8_ssse3(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
(void)cos_bit;
const int32_t *cospi = cospi_arr(INV_COS_BIT);
const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
@@ -1336,7 +1337,6 @@ static void idct64_low8_new_ssse3(const __m128i *input, __m128i *output,
x[6] = x[1];
x[5] = x[2];
x[4] = x[3];
- x[9] = x[9];
btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
idct64_stage8_high48_sse2(x, cospi, __rounding, cos_bit);
@@ -1346,8 +1346,8 @@ static void idct64_low8_new_ssse3(const __m128i *input, __m128i *output,
idct64_stage11_sse2(output, x);
}
-static void idct64_low16_new_ssse3(const __m128i *input, __m128i *output,
- int8_t cos_bit) {
+static void idct64_low16_ssse3(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
(void)cos_bit;
const int32_t *cospi = cospi_arr(INV_COS_BIT);
const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
@@ -1462,8 +1462,8 @@ static void idct64_low16_new_ssse3(const __m128i *input, __m128i *output,
idct64_stage11_sse2(output, x);
}
-static void idct64_low32_new_ssse3(const __m128i *input, __m128i *output,
- int8_t cos_bit) {
+static void idct64_low32_ssse3(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
(void)cos_bit;
const int32_t *cospi = cospi_arr(INV_COS_BIT);
const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
@@ -1611,7 +1611,7 @@ static void idct64_low32_new_ssse3(const __m128i *input, __m128i *output,
idct64_stage11_sse2(output, x);
}
-void iadst4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
+static void iadst4_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
(void)cos_bit;
const int32_t *sinpi = sinpi_arr(INV_COS_BIT);
const __m128i sinpi_p01_p04 = pair_set_epi16(sinpi[1], sinpi[4]);
@@ -1672,10 +1672,8 @@ void iadst4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
}
}
-// TODO(binpengsmail@gmail.com):
-// To explore the reuse of VP9 versions of corresponding SSE2 functions and
-// evaluate whether there is a possibility for further speedup.
-void iadst4_w4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
+static void iadst4_w4_sse2(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
(void)cos_bit;
const int32_t *sinpi = sinpi_arr(INV_COS_BIT);
const __m128i sinpi_p01_p04 = pair_set_epi16(sinpi[1], sinpi[4]);
@@ -1720,8 +1718,8 @@ void iadst4_w4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
}
}
-static void iadst8_low1_new_ssse3(const __m128i *input, __m128i *output,
- int8_t cos_bit) {
+static void iadst8_low1_ssse3(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
(void)cos_bit;
const int32_t *cospi = cospi_arr(INV_COS_BIT);
const __m128i __zero = _mm_setzero_si128();
@@ -1767,7 +1765,7 @@ static void iadst8_low1_new_ssse3(const __m128i *input, __m128i *output,
output[7] = _mm_subs_epi16(__zero, x[1]);
}
-void iadst8_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
+static void iadst8_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
(void)cos_bit;
const int32_t *cospi = cospi_arr(INV_COS_BIT);
const __m128i __zero = _mm_setzero_si128();
@@ -1835,7 +1833,8 @@ void iadst8_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
output[7] = _mm_subs_epi16(__zero, x[1]);
}
-void iadst8_w4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
+static void iadst8_w4_sse2(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
(void)cos_bit;
const int32_t *cospi = cospi_arr(INV_COS_BIT);
const __m128i __zero = _mm_setzero_si128();
@@ -1994,8 +1993,8 @@ static INLINE void iadst16_stage9_ssse3(__m128i *output, __m128i *x) {
output[15] = _mm_subs_epi16(__zero, x[1]);
}
-static void iadst16_low1_new_ssse3(const __m128i *input, __m128i *output,
- int8_t cos_bit) {
+static void iadst16_low1_ssse3(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
(void)cos_bit;
const int32_t *cospi = cospi_arr(INV_COS_BIT);
const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
@@ -2043,8 +2042,8 @@ static void iadst16_low1_new_ssse3(const __m128i *input, __m128i *output,
iadst16_stage9_ssse3(output, x);
}
-static void iadst16_low8_new_ssse3(const __m128i *input, __m128i *output,
- int8_t cos_bit) {
+static void iadst16_low8_ssse3(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
(void)cos_bit;
const int32_t *cospi = cospi_arr(INV_COS_BIT);
const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
@@ -2079,7 +2078,8 @@ static void iadst16_low8_new_ssse3(const __m128i *input, __m128i *output,
iadst16_stage8_ssse3(x, cospi, __rounding, cos_bit);
iadst16_stage9_ssse3(output, x);
}
-void iadst16_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
+static void iadst16_sse2(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
(void)cos_bit;
const int32_t *cospi = cospi_arr(INV_COS_BIT);
const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
@@ -2139,8 +2139,8 @@ void iadst16_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
iadst16_stage9_ssse3(output, x);
}
-void iadst16_w4_new_sse2(const __m128i *input, __m128i *output,
- int8_t cos_bit) {
+static void iadst16_w4_sse2(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
(void)cos_bit;
const int32_t *cospi = cospi_arr(INV_COS_BIT);
const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
@@ -2233,8 +2233,8 @@ void iadst16_w4_new_sse2(const __m128i *input, __m128i *output,
iadst16_stage9_ssse3(output, x);
}
-static void iidentity4_new_ssse3(const __m128i *input, __m128i *output,
- int8_t cos_bit) {
+static void iidentity4_ssse3(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
(void)cos_bit;
const int16_t scale_fractional = (NewSqrt2 - (1 << NewSqrt2Bits));
const __m128i scale = _mm_set1_epi16(scale_fractional << (15 - NewSqrt2Bits));
@@ -2244,16 +2244,16 @@ static void iidentity4_new_ssse3(const __m128i *input, __m128i *output,
}
}
-static void iidentity8_new_sse2(const __m128i *input, __m128i *output,
- int8_t cos_bit) {
+static void iidentity8_sse2(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
(void)cos_bit;
for (int i = 0; i < 8; ++i) {
output[i] = _mm_adds_epi16(input[i], input[i]);
}
}
-static void iidentity16_new_ssse3(const __m128i *input, __m128i *output,
- int8_t cos_bit) {
+static void iidentity16_ssse3(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
(void)cos_bit;
const int16_t scale_fractional = 2 * (NewSqrt2 - (1 << NewSqrt2Bits));
const __m128i scale = _mm_set1_epi16(scale_fractional << (15 - NewSqrt2Bits));
@@ -2300,11 +2300,11 @@ static INLINE void lowbd_write_buffer_8xn_sse2(__m128i *in, uint8_t *output,
// 1D functions process 8 pixels at one time.
static const transform_1d_ssse3
lowbd_txfm_all_1d_w8_arr[TX_SIZES][ITX_TYPES_1D] = {
- { idct4_new_sse2, iadst4_new_sse2, iidentity4_new_ssse3 },
- { idct8_new_sse2, iadst8_new_sse2, iidentity8_new_sse2 },
- { idct16_new_sse2, iadst16_new_sse2, iidentity16_new_ssse3 },
- { idct32_new_sse2, NULL, NULL },
- { idct64_low32_new_ssse3, NULL, NULL },
+ { idct4_sse2, iadst4_sse2, iidentity4_ssse3 },
+ { idct8_sse2, iadst8_sse2, iidentity8_sse2 },
+ { idct16_sse2, iadst16_sse2, iidentity16_ssse3 },
+ { idct32_sse2, NULL, NULL },
+ { idct64_low32_ssse3, NULL, NULL },
};
// functions for blocks with eob at DC and within
@@ -2312,26 +2312,24 @@ static const transform_1d_ssse3
static const transform_1d_ssse3
lowbd_txfm_all_1d_zeros_w8_arr[TX_SIZES][ITX_TYPES_1D][4] = {
{
- { idct4_new_sse2, idct4_new_sse2, NULL, NULL },
- { iadst4_new_sse2, iadst4_new_sse2, NULL, NULL },
- { iidentity4_new_ssse3, iidentity4_new_ssse3, NULL, NULL },
+ { idct4_sse2, idct4_sse2, NULL, NULL },
+ { iadst4_sse2, iadst4_sse2, NULL, NULL },
+ { iidentity4_ssse3, iidentity4_ssse3, NULL, NULL },
},
- { { idct8_low1_new_ssse3, idct8_new_sse2, NULL, NULL },
- { iadst8_low1_new_ssse3, iadst8_new_sse2, NULL, NULL },
- { iidentity8_new_sse2, iidentity8_new_sse2, NULL, NULL } },
+ { { idct8_low1_ssse3, idct8_sse2, NULL, NULL },
+ { iadst8_low1_ssse3, iadst8_sse2, NULL, NULL },
+ { iidentity8_sse2, iidentity8_sse2, NULL, NULL } },
{
- { idct16_low1_new_ssse3, idct16_low8_new_ssse3, idct16_new_sse2,
- NULL },
- { iadst16_low1_new_ssse3, iadst16_low8_new_ssse3, iadst16_new_sse2,
- NULL },
+ { idct16_low1_ssse3, idct16_low8_ssse3, idct16_sse2, NULL },
+ { iadst16_low1_ssse3, iadst16_low8_ssse3, iadst16_sse2, NULL },
{ NULL, NULL, NULL, NULL },
},
- { { idct32_low1_new_ssse3, idct32_low8_new_ssse3, idct32_low16_new_ssse3,
- idct32_new_sse2 },
+ { { idct32_low1_ssse3, idct32_low8_ssse3, idct32_low16_ssse3,
+ idct32_sse2 },
{ NULL, NULL, NULL, NULL },
{ NULL, NULL, NULL, NULL } },
- { { idct64_low1_new_ssse3, idct64_low8_new_ssse3, idct64_low16_new_ssse3,
- idct64_low32_new_ssse3 },
+ { { idct64_low1_ssse3, idct64_low8_ssse3, idct64_low16_ssse3,
+ idct64_low32_ssse3 },
{ NULL, NULL, NULL, NULL },
{ NULL, NULL, NULL, NULL } }
};
@@ -2340,9 +2338,9 @@ static const transform_1d_ssse3
// used in 4x4, 4x8, 4x16, 8x4, 16x4
static const transform_1d_ssse3
lowbd_txfm_all_1d_w4_arr[TX_SIZES][ITX_TYPES_1D] = {
- { idct4_w4_new_sse2, iadst4_w4_new_sse2, iidentity4_new_ssse3 },
- { idct8_w4_new_sse2, iadst8_w4_new_sse2, iidentity8_new_sse2 },
- { idct16_w4_new_sse2, iadst16_w4_new_sse2, iidentity16_new_ssse3 },
+ { idct4_w4_sse2, iadst4_w4_sse2, iidentity4_ssse3 },
+ { idct8_w4_sse2, iadst8_w4_sse2, iidentity8_sse2 },
+ { idct16_w4_sse2, iadst16_w4_sse2, iidentity16_ssse3 },
{ NULL, NULL, NULL },
{ NULL, NULL, NULL },
};
@@ -2419,7 +2417,7 @@ static INLINE void iidentity_col_8xn_ssse3(uint8_t *output, int stride,
static INLINE void lowbd_inv_txfm2d_add_idtx_ssse3(const int32_t *input,
uint8_t *output, int stride,
TX_SIZE tx_size) {
- const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
const int txw_idx = get_txw_idx(tx_size);
const int txh_idx = get_txh_idx(tx_size);
const int txfm_size_col = tx_size_wide[tx_size];
@@ -2437,18 +2435,19 @@ static INLINE void lowbd_inv_txfm2d_add_idtx_ssse3(const int32_t *input,
}
}
-void lowbd_inv_txfm2d_add_4x4_ssse3(const int32_t *input, uint8_t *output,
- int stride, TX_TYPE tx_type,
- TX_SIZE tx_size_, int eob) {
+static void lowbd_inv_txfm2d_add_4x4_ssse3(const int32_t *input,
+ uint8_t *output, int stride,
+ TX_TYPE tx_type, TX_SIZE tx_size_,
+ int eob) {
(void)tx_size_;
(void)eob;
__m128i buf[4];
const TX_SIZE tx_size = TX_4X4;
- const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
const int txw_idx = get_txw_idx(tx_size);
const int txh_idx = get_txh_idx(tx_size);
- const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
- const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx];
+ const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx];
const int txfm_size_col = tx_size_wide[tx_size];
const int txfm_size_row = tx_size_high[tx_size];
@@ -2510,11 +2509,11 @@ static INLINE void lowbd_inv_txfm2d_add_no_identity_ssse3(
__m128i buf1[64 * 8];
int eobx, eoby;
get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
- const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
const int txw_idx = get_txw_idx(tx_size);
const int txh_idx = get_txh_idx(tx_size);
- const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
- const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+ const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx];
const int txfm_size_col = tx_size_wide[tx_size];
const int txfm_size_row = tx_size_high[tx_size];
const int buf_size_w_div8 = txfm_size_col >> 3;
@@ -2580,12 +2579,12 @@ static INLINE void lowbd_inv_txfm2d_add_no_identity_ssse3(
static INLINE void lowbd_inv_txfm2d_add_h_identity_ssse3(
const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
TX_SIZE tx_size, int eob) {
- const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
int eobx, eoby;
get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob);
const int txw_idx = get_txw_idx(tx_size);
const int txh_idx = get_txh_idx(tx_size);
- const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx];
const int txfm_size_col = tx_size_wide[tx_size];
const int txfm_size_row = tx_size_high[tx_size];
const int buf_size_w_div8 = (eobx + 8) >> 3;
@@ -2626,10 +2625,10 @@ static INLINE void lowbd_inv_txfm2d_add_v_identity_ssse3(
__m128i buf1[64];
int eobx, eoby;
get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob);
- const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
const int txw_idx = get_txw_idx(tx_size);
const int txh_idx = get_txh_idx(tx_size);
- const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+ const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx];
const int txfm_size_col = tx_size_wide[tx_size];
const int txfm_size_row = tx_size_high[tx_size];
const int buf_size_w_div8 = txfm_size_col >> 3;
@@ -2708,18 +2707,19 @@ static INLINE void lowbd_inv_txfm2d_add_universe_ssse3(
}
}
-void lowbd_inv_txfm2d_add_4x8_ssse3(const int32_t *input, uint8_t *output,
- int stride, TX_TYPE tx_type,
- TX_SIZE tx_size_, int eob) {
+static void lowbd_inv_txfm2d_add_4x8_ssse3(const int32_t *input,
+ uint8_t *output, int stride,
+ TX_TYPE tx_type, TX_SIZE tx_size_,
+ int eob) {
(void)tx_size_;
(void)eob;
__m128i buf[8];
const TX_SIZE tx_size = TX_4X8;
- const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
const int txw_idx = get_txw_idx(tx_size);
const int txh_idx = get_txh_idx(tx_size);
- const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
- const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx];
+ const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx];
const int txfm_size_col = tx_size_wide[tx_size];
const int txfm_size_row = tx_size_high[tx_size];
@@ -2747,18 +2747,19 @@ void lowbd_inv_txfm2d_add_4x8_ssse3(const int32_t *input, uint8_t *output,
lowbd_write_buffer_4xn_sse2(buf, output, stride, ud_flip, txfm_size_row);
}
-void lowbd_inv_txfm2d_add_8x4_ssse3(const int32_t *input, uint8_t *output,
- int stride, TX_TYPE tx_type,
- TX_SIZE tx_size_, int eob) {
+static void lowbd_inv_txfm2d_add_8x4_ssse3(const int32_t *input,
+ uint8_t *output, int stride,
+ TX_TYPE tx_type, TX_SIZE tx_size_,
+ int eob) {
(void)tx_size_;
(void)eob;
__m128i buf[8];
const TX_SIZE tx_size = TX_8X4;
- const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
const int txw_idx = get_txw_idx(tx_size);
const int txh_idx = get_txh_idx(tx_size);
- const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
- const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx];
+ const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx];
const int txfm_size_col = tx_size_wide[tx_size];
const int txfm_size_row = tx_size_high[tx_size];
@@ -2786,18 +2787,19 @@ void lowbd_inv_txfm2d_add_8x4_ssse3(const int32_t *input, uint8_t *output,
lowbd_write_buffer_8xn_sse2(buf, output, stride, ud_flip, txfm_size_row);
}
-void lowbd_inv_txfm2d_add_4x16_ssse3(const int32_t *input, uint8_t *output,
- int stride, TX_TYPE tx_type,
- TX_SIZE tx_size_, int eob) {
+static void lowbd_inv_txfm2d_add_4x16_ssse3(const int32_t *input,
+ uint8_t *output, int stride,
+ TX_TYPE tx_type, TX_SIZE tx_size_,
+ int eob) {
(void)tx_size_;
(void)eob;
__m128i buf[16];
const TX_SIZE tx_size = TX_4X16;
- const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
const int txw_idx = get_txw_idx(tx_size);
const int txh_idx = get_txh_idx(tx_size);
- const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
- const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx];
+ const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx];
const int txfm_size_col = tx_size_wide[tx_size];
const int txfm_size_row = tx_size_high[tx_size];
@@ -2816,8 +2818,22 @@ void lowbd_inv_txfm2d_add_4x16_ssse3(const int32_t *input, uint8_t *output,
load_buffer_32bit_to_16bit_w4(input_cur, txfm_size_col, buf_cur,
row_one_loop);
transpose_16bit_4x8(buf_cur, buf_cur);
- row_txfm(buf_cur, buf_cur, cos_bit_row);
- round_shift_16bit_ssse3(buf_cur, row_one_loop, shift[0]);
+ if (row_txfm == iidentity4_ssse3) {
+ const __m128i scale = pair_set_epi16(NewSqrt2, 3 << (NewSqrt2Bits - 1));
+ const __m128i ones = _mm_set1_epi16(1);
+ for (int j = 0; j < 4; ++j) {
+ const __m128i buf_lo = _mm_unpacklo_epi16(buf_cur[j], ones);
+ const __m128i buf_hi = _mm_unpackhi_epi16(buf_cur[j], ones);
+ const __m128i buf_32_lo =
+ _mm_srai_epi32(_mm_madd_epi16(buf_lo, scale), (NewSqrt2Bits + 1));
+ const __m128i buf_32_hi =
+ _mm_srai_epi32(_mm_madd_epi16(buf_hi, scale), (NewSqrt2Bits + 1));
+ buf_cur[j] = _mm_packs_epi32(buf_32_lo, buf_32_hi);
+ }
+ } else {
+ row_txfm(buf_cur, buf_cur, cos_bit_row);
+ round_shift_16bit_ssse3(buf_cur, row_one_loop, shift[0]);
+ }
if (lr_flip) {
__m128i temp[8];
flip_buf_sse2(buf_cur, temp, txfm_size_col);
@@ -2831,18 +2847,19 @@ void lowbd_inv_txfm2d_add_4x16_ssse3(const int32_t *input, uint8_t *output,
lowbd_write_buffer_4xn_sse2(buf, output, stride, ud_flip, txfm_size_row);
}
-void lowbd_inv_txfm2d_add_16x4_ssse3(const int32_t *input, uint8_t *output,
- int stride, TX_TYPE tx_type,
- TX_SIZE tx_size_, int eob) {
+static void lowbd_inv_txfm2d_add_16x4_ssse3(const int32_t *input,
+ uint8_t *output, int stride,
+ TX_TYPE tx_type, TX_SIZE tx_size_,
+ int eob) {
(void)tx_size_;
(void)eob;
__m128i buf[16];
const TX_SIZE tx_size = TX_16X4;
- const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
const int txw_idx = get_txw_idx(tx_size);
const int txh_idx = get_txh_idx(tx_size);
- const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
- const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx];
+ const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx];
const int txfm_size_col = tx_size_wide[tx_size];
const int txfm_size_row = tx_size_high[tx_size];
const int buf_size_w_div8 = txfm_size_col >> 3;
@@ -2862,8 +2879,22 @@ void lowbd_inv_txfm2d_add_16x4_ssse3(const int32_t *input, uint8_t *output,
txfm_size_row);
transpose_16bit_8x4(buf_cur, buf_cur);
}
- row_txfm(buf, buf, cos_bit_row);
- round_shift_16bit_ssse3(buf, txfm_size_col, shift[0]);
+ if (row_txfm == iidentity16_ssse3) {
+ const __m128i scale = pair_set_epi16(2 * NewSqrt2, 3 << (NewSqrt2Bits - 1));
+ const __m128i ones = _mm_set1_epi16(1);
+ for (int j = 0; j < 16; ++j) {
+ const __m128i buf_lo = _mm_unpacklo_epi16(buf[j], ones);
+ const __m128i buf_hi = _mm_unpackhi_epi16(buf[j], ones);
+ const __m128i buf_32_lo =
+ _mm_srai_epi32(_mm_madd_epi16(buf_lo, scale), (NewSqrt2Bits + 1));
+ const __m128i buf_32_hi =
+ _mm_srai_epi32(_mm_madd_epi16(buf_hi, scale), (NewSqrt2Bits + 1));
+ buf[j] = _mm_packs_epi32(buf_32_lo, buf_32_hi);
+ }
+ } else {
+ row_txfm(buf, buf, cos_bit_row);
+ round_shift_16bit_ssse3(buf, txfm_size_col, shift[0]);
+ }
if (lr_flip) {
__m128i temp[16];
flip_buf_sse2(buf, temp, 16);
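
Both identity special cases above fold the NewSqrt2 scaling and the shift[0] rounding into a single _mm_madd_epi16: each coefficient is interleaved with 1 and multiply-added against (NewSqrt2, 3 << (NewSqrt2Bits - 1)) (2 * NewSqrt2 for the 16-point variant), then shifted right by NewSqrt2Bits + 1. A scalar model, taking NewSqrt2 = 5793 and NewSqrt2Bits = 12 as assumptions from av1_txfm.h:

// Scalar model of the fused iidentity4 row scale + rounding shift above.
// The SIMD code additionally saturates to int16 via _mm_packs_epi32.
static int16_t iidentity4_row_fused(int16_t x) {
  const int32_t kNewSqrt2 = 5793;  // assumed: round(2^12 * sqrt(2))
  const int kNewSqrt2Bits = 12;    // assumed
  const int32_t v = x * kNewSqrt2 + (3 << (kNewSqrt2Bits - 1));
  return (int16_t)(v >> (kNewSqrt2Bits + 1));
}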
@@ -2911,12 +2942,14 @@ void av1_lowbd_inv_txfm2d_add_ssse3(const int32_t *input, uint8_t *output,
break;
}
}
+
void av1_inv_txfm_add_ssse3(const tran_low_t *dqcoeff, uint8_t *dst, int stride,
const TxfmParam *txfm_param) {
- const TX_TYPE tx_type = txfm_param->tx_type;
if (!txfm_param->lossless) {
+ const TX_TYPE tx_type = txfm_param->tx_type;
av1_lowbd_inv_txfm2d_add_ssse3(dqcoeff, dst, stride, tx_type,
txfm_param->tx_size, txfm_param->eob);
+
} else {
av1_inv_txfm_add_c(dqcoeff, dst, stride, txfm_param);
}
diff --git a/media/libaom/src/av1/common/x86/av1_inv_txfm_ssse3.h b/media/libaom/src/av1/common/x86/av1_inv_txfm_ssse3.h
index 66bd339d1..7d5055deb 100644
--- a/media/libaom/src/av1/common/x86/av1_inv_txfm_ssse3.h
+++ b/media/libaom/src/av1/common/x86/av1_inv_txfm_ssse3.h
@@ -72,13 +72,13 @@ static INLINE void round_shift_16bit_ssse3(__m128i *in, int size, int bit) {
}
// 1D itx types
-typedef enum ATTRIBUTE_PACKED {
+enum {
IDCT_1D,
IADST_1D,
IFLIPADST_1D = IADST_1D,
IIDENTITY_1D,
ITX_TYPES_1D,
-} ITX_TYPE_1D;
+} UENUM1BYTE(ITX_TYPE_1D);
static const ITX_TYPE_1D vitx_1d_tab[TX_TYPES] = {
IDCT_1D, IADST_1D, IDCT_1D, IADST_1D,
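
For reference, the UENUM1BYTE macro that replaces ATTRIBUTE_PACKED above is assumed to expand as in aom's enums header: the anonymous enum supplies the constants and a one-byte typedef supplies the storage type. A minimal sketch of that pattern (the macro definition is an assumption, not part of this diff):

    #include <stdint.h>

    /* Assumed definition from aom's enums header. */
    #define UENUM1BYTE(enumvar) ; typedef uint8_t enumvar

    /* "enum { A, B } UENUM1BYTE(T);" expands to
     * "enum { A, B }; typedef uint8_t T;", so T is one byte wide while the
     * enumerators keep their usual int values. */
    enum { EXAMPLE_A, EXAMPLE_B } UENUM1BYTE(EXAMPLE_T);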
diff --git a/media/libaom/src/av1/common/x86/av1_txfm_sse4.c b/media/libaom/src/av1/common/x86/av1_txfm_sse4.c
index 90b9879cc..65ccd1952 100644
--- a/media/libaom/src/av1/common/x86/av1_txfm_sse4.c
+++ b/media/libaom/src/av1/common/x86/av1_txfm_sse4.c
@@ -9,7 +9,7 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
#include "av1/common/av1_txfm.h"
#include "av1/common/x86/av1_txfm_sse4.h"
diff --git a/media/libaom/src/av1/common/x86/cfl_avx2.c b/media/libaom/src/av1/common/x86/cfl_avx2.c
index a8bfdcce6..d9c6f99d5 100644
--- a/media/libaom/src/av1/common/x86/cfl_avx2.c
+++ b/media/libaom/src/av1/common/x86/cfl_avx2.c
@@ -16,34 +16,34 @@
#include "av1/common/x86/cfl_simd.h"
-#define CFL_GET_SUBSAMPLE_FUNCTION_AVX2(sub, bd) \
- CFL_SUBSAMPLE(avx2, sub, bd, 32, 32) \
- CFL_SUBSAMPLE(avx2, sub, bd, 32, 16) \
- CFL_SUBSAMPLE(avx2, sub, bd, 32, 8) \
- cfl_subsample_##bd##_fn cfl_get_luma_subsampling_##sub##_##bd##_avx2( \
- TX_SIZE tx_size) { \
- static const cfl_subsample_##bd##_fn subfn_##sub[TX_SIZES_ALL] = { \
- subsample_##bd##_##sub##_4x4_ssse3, /* 4x4 */ \
- subsample_##bd##_##sub##_8x8_ssse3, /* 8x8 */ \
- subsample_##bd##_##sub##_16x16_ssse3, /* 16x16 */ \
- subsample_##bd##_##sub##_32x32_avx2, /* 32x32 */ \
- cfl_subsample_##bd##_null, /* 64x64 (invalid CFL size) */ \
- subsample_##bd##_##sub##_4x8_ssse3, /* 4x8 */ \
- subsample_##bd##_##sub##_8x4_ssse3, /* 8x4 */ \
- subsample_##bd##_##sub##_8x16_ssse3, /* 8x16 */ \
- subsample_##bd##_##sub##_16x8_ssse3, /* 16x8 */ \
- subsample_##bd##_##sub##_16x32_ssse3, /* 16x32 */ \
- subsample_##bd##_##sub##_32x16_avx2, /* 32x16 */ \
- cfl_subsample_##bd##_null, /* 32x64 (invalid CFL size) */ \
- cfl_subsample_##bd##_null, /* 64x32 (invalid CFL size) */ \
- subsample_##bd##_##sub##_4x16_ssse3, /* 4x16 */ \
- subsample_##bd##_##sub##_16x4_ssse3, /* 16x4 */ \
- subsample_##bd##_##sub##_8x32_ssse3, /* 8x32 */ \
- subsample_##bd##_##sub##_32x8_avx2, /* 32x8 */ \
- cfl_subsample_##bd##_null, /* 16x64 (invalid CFL size) */ \
- cfl_subsample_##bd##_null, /* 64x16 (invalid CFL size) */ \
- }; \
- return subfn_##sub[tx_size]; \
+#define CFL_GET_SUBSAMPLE_FUNCTION_AVX2(sub, bd) \
+ CFL_SUBSAMPLE(avx2, sub, bd, 32, 32) \
+ CFL_SUBSAMPLE(avx2, sub, bd, 32, 16) \
+ CFL_SUBSAMPLE(avx2, sub, bd, 32, 8) \
+ cfl_subsample_##bd##_fn cfl_get_luma_subsampling_##sub##_##bd##_avx2( \
+ TX_SIZE tx_size) { \
+ static const cfl_subsample_##bd##_fn subfn_##sub[TX_SIZES_ALL] = { \
+ cfl_subsample_##bd##_##sub##_4x4_ssse3, /* 4x4 */ \
+ cfl_subsample_##bd##_##sub##_8x8_ssse3, /* 8x8 */ \
+ cfl_subsample_##bd##_##sub##_16x16_ssse3, /* 16x16 */ \
+ cfl_subsample_##bd##_##sub##_32x32_avx2, /* 32x32 */ \
+ NULL, /* 64x64 (invalid CFL size) */ \
+ cfl_subsample_##bd##_##sub##_4x8_ssse3, /* 4x8 */ \
+ cfl_subsample_##bd##_##sub##_8x4_ssse3, /* 8x4 */ \
+ cfl_subsample_##bd##_##sub##_8x16_ssse3, /* 8x16 */ \
+ cfl_subsample_##bd##_##sub##_16x8_ssse3, /* 16x8 */ \
+ cfl_subsample_##bd##_##sub##_16x32_ssse3, /* 16x32 */ \
+ cfl_subsample_##bd##_##sub##_32x16_avx2, /* 32x16 */ \
+ NULL, /* 32x64 (invalid CFL size) */ \
+ NULL, /* 64x32 (invalid CFL size) */ \
+ cfl_subsample_##bd##_##sub##_4x16_ssse3, /* 4x16 */ \
+ cfl_subsample_##bd##_##sub##_16x4_ssse3, /* 16x4 */ \
+ cfl_subsample_##bd##_##sub##_8x32_ssse3, /* 8x32 */ \
+ cfl_subsample_##bd##_##sub##_32x8_avx2, /* 32x8 */ \
+ NULL, /* 16x64 (invalid CFL size) */ \
+ NULL, /* 64x16 (invalid CFL size) */ \
+ }; \
+ return subfn_##sub[tx_size]; \
}
/**
@@ -147,6 +147,7 @@ static void cfl_luma_subsampling_444_lbd_avx2(const uint8_t *input,
CFL_GET_SUBSAMPLE_FUNCTION_AVX2(444, lbd)
+#if CONFIG_AV1_HIGHBITDEPTH
/**
* Adds 4 pixels (in a 2x2 grid) and multiplies them by 2, resulting in a more
* precise version of a box filter 4:2:0 pixel subsampling in Q3.
@@ -238,6 +239,7 @@ static void cfl_luma_subsampling_444_hbd_avx2(const uint16_t *input,
}
CFL_GET_SUBSAMPLE_FUNCTION_AVX2(444, hbd)
+#endif // CONFIG_AV1_HIGHBITDEPTH
static INLINE __m256i predict_unclipped(const __m256i *input, __m256i alpha_q12,
__m256i alpha_sign, __m256i dc_q0) {
@@ -273,33 +275,34 @@ CFL_PREDICT_X(avx2, 32, 8, lbd);
CFL_PREDICT_X(avx2, 32, 16, lbd);
CFL_PREDICT_X(avx2, 32, 32, lbd);
-cfl_predict_lbd_fn get_predict_lbd_fn_avx2(TX_SIZE tx_size) {
+cfl_predict_lbd_fn cfl_get_predict_lbd_fn_avx2(TX_SIZE tx_size) {
static const cfl_predict_lbd_fn pred[TX_SIZES_ALL] = {
- predict_lbd_4x4_ssse3, /* 4x4 */
- predict_lbd_8x8_ssse3, /* 8x8 */
- predict_lbd_16x16_ssse3, /* 16x16 */
- predict_lbd_32x32_avx2, /* 32x32 */
- cfl_predict_lbd_null, /* 64x64 (invalid CFL size) */
- predict_lbd_4x8_ssse3, /* 4x8 */
- predict_lbd_8x4_ssse3, /* 8x4 */
- predict_lbd_8x16_ssse3, /* 8x16 */
- predict_lbd_16x8_ssse3, /* 16x8 */
- predict_lbd_16x32_ssse3, /* 16x32 */
- predict_lbd_32x16_avx2, /* 32x16 */
- cfl_predict_lbd_null, /* 32x64 (invalid CFL size) */
- cfl_predict_lbd_null, /* 64x32 (invalid CFL size) */
- predict_lbd_4x16_ssse3, /* 4x16 */
- predict_lbd_16x4_ssse3, /* 16x4 */
- predict_lbd_8x32_ssse3, /* 8x32 */
- predict_lbd_32x8_avx2, /* 32x8 */
- cfl_predict_lbd_null, /* 16x64 (invalid CFL size) */
- cfl_predict_lbd_null, /* 64x16 (invalid CFL size) */
+ cfl_predict_lbd_4x4_ssse3, /* 4x4 */
+ cfl_predict_lbd_8x8_ssse3, /* 8x8 */
+ cfl_predict_lbd_16x16_ssse3, /* 16x16 */
+ cfl_predict_lbd_32x32_avx2, /* 32x32 */
+ NULL, /* 64x64 (invalid CFL size) */
+ cfl_predict_lbd_4x8_ssse3, /* 4x8 */
+ cfl_predict_lbd_8x4_ssse3, /* 8x4 */
+ cfl_predict_lbd_8x16_ssse3, /* 8x16 */
+ cfl_predict_lbd_16x8_ssse3, /* 16x8 */
+ cfl_predict_lbd_16x32_ssse3, /* 16x32 */
+ cfl_predict_lbd_32x16_avx2, /* 32x16 */
+ NULL, /* 32x64 (invalid CFL size) */
+ NULL, /* 64x32 (invalid CFL size) */
+ cfl_predict_lbd_4x16_ssse3, /* 4x16 */
+ cfl_predict_lbd_16x4_ssse3, /* 16x4 */
+ cfl_predict_lbd_8x32_ssse3, /* 8x32 */
+ cfl_predict_lbd_32x8_avx2, /* 32x8 */
+ NULL, /* 16x64 (invalid CFL size) */
+ NULL, /* 64x16 (invalid CFL size) */
};
// Modulo TX_SIZES_ALL to ensure that an attacker won't be able to index the
// function pointer array out of bounds.
return pred[tx_size % TX_SIZES_ALL];
}
+#if CONFIG_AV1_HIGHBITDEPTH
static __m256i highbd_max_epi16(int bd) {
const __m256i neg_one = _mm256_set1_epi16(-1);
// (1 << bd) - 1 => -(-1 << bd) -1 => -1 - (-1 << bd) => -1 ^ (-1 << bd)
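
The highbd_max_epi16 comment above derives (1 << bd) - 1 == -1 ^ (-1 << bd), which lets the clamp value be built from an all-ones register with one shift and one xor. A standalone check of the identity (written with unsigned arithmetic so the shifts stay well defined):

    #include <assert.h>

    int main(void) {
      for (int bd = 8; bd <= 12; ++bd) {
        const unsigned all_ones = ~0u; /* the all-ones (-1) register */
        assert(((1u << bd) - 1) == (all_ones ^ (all_ones << bd)));
      }
      return 0;
    }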
@@ -346,32 +349,33 @@ CFL_PREDICT_X(avx2, 32, 8, hbd)
CFL_PREDICT_X(avx2, 32, 16, hbd)
CFL_PREDICT_X(avx2, 32, 32, hbd)
-cfl_predict_hbd_fn get_predict_hbd_fn_avx2(TX_SIZE tx_size) {
+cfl_predict_hbd_fn cfl_get_predict_hbd_fn_avx2(TX_SIZE tx_size) {
static const cfl_predict_hbd_fn pred[TX_SIZES_ALL] = {
- predict_hbd_4x4_ssse3, /* 4x4 */
- predict_hbd_8x8_ssse3, /* 8x8 */
- predict_hbd_16x16_avx2, /* 16x16 */
- predict_hbd_32x32_avx2, /* 32x32 */
- cfl_predict_hbd_null, /* 64x64 (invalid CFL size) */
- predict_hbd_4x8_ssse3, /* 4x8 */
- predict_hbd_8x4_ssse3, /* 8x4 */
- predict_hbd_8x16_ssse3, /* 8x16 */
- predict_hbd_16x8_avx2, /* 16x8 */
- predict_hbd_16x32_avx2, /* 16x32 */
- predict_hbd_32x16_avx2, /* 32x16 */
- cfl_predict_hbd_null, /* 32x64 (invalid CFL size) */
- cfl_predict_hbd_null, /* 64x32 (invalid CFL size) */
- predict_hbd_4x16_ssse3, /* 4x16 */
- predict_hbd_16x4_avx2, /* 16x4 */
- predict_hbd_8x32_ssse3, /* 8x32 */
- predict_hbd_32x8_avx2, /* 32x8 */
- cfl_predict_hbd_null, /* 16x64 (invalid CFL size) */
- cfl_predict_hbd_null, /* 64x16 (invalid CFL size) */
+ cfl_predict_hbd_4x4_ssse3, /* 4x4 */
+ cfl_predict_hbd_8x8_ssse3, /* 8x8 */
+ cfl_predict_hbd_16x16_avx2, /* 16x16 */
+ cfl_predict_hbd_32x32_avx2, /* 32x32 */
+ NULL, /* 64x64 (invalid CFL size) */
+ cfl_predict_hbd_4x8_ssse3, /* 4x8 */
+ cfl_predict_hbd_8x4_ssse3, /* 8x4 */
+ cfl_predict_hbd_8x16_ssse3, /* 8x16 */
+ cfl_predict_hbd_16x8_avx2, /* 16x8 */
+ cfl_predict_hbd_16x32_avx2, /* 16x32 */
+ cfl_predict_hbd_32x16_avx2, /* 32x16 */
+ NULL, /* 32x64 (invalid CFL size) */
+ NULL, /* 64x32 (invalid CFL size) */
+ cfl_predict_hbd_4x16_ssse3, /* 4x16 */
+ cfl_predict_hbd_16x4_avx2, /* 16x4 */
+ cfl_predict_hbd_8x32_ssse3, /* 8x32 */
+ cfl_predict_hbd_32x8_avx2, /* 32x8 */
+ NULL, /* 16x64 (invalid CFL size) */
+ NULL, /* 64x16 (invalid CFL size) */
};
// Modulo TX_SIZES_ALL to ensure that an attacker won't be able to index the
// function pointer array out of bounds.
return pred[tx_size % TX_SIZES_ALL];
}
+#endif // CONFIG_AV1_HIGHBITDEPTH
// Returns a vector where all the (32-bit) elements are the sum of all the
// lanes in a.
@@ -463,27 +467,27 @@ CFL_SUB_AVG_X(avx2, 32, 32, 512, 10)
// Based on the observation that for small blocks AVX2 does not outperform
// SSE2, we call the SSE2 code for block widths 4 and 8.
-cfl_subtract_average_fn get_subtract_average_fn_avx2(TX_SIZE tx_size) {
+cfl_subtract_average_fn cfl_get_subtract_average_fn_avx2(TX_SIZE tx_size) {
static const cfl_subtract_average_fn sub_avg[TX_SIZES_ALL] = {
- subtract_average_4x4_sse2, /* 4x4 */
- subtract_average_8x8_sse2, /* 8x8 */
- subtract_average_16x16_avx2, /* 16x16 */
- subtract_average_32x32_avx2, /* 32x32 */
- cfl_subtract_average_null, /* 64x64 (invalid CFL size) */
- subtract_average_4x8_sse2, /* 4x8 */
- subtract_average_8x4_sse2, /* 8x4 */
- subtract_average_8x16_sse2, /* 8x16 */
- subtract_average_16x8_avx2, /* 16x8 */
- subtract_average_16x32_avx2, /* 16x32 */
- subtract_average_32x16_avx2, /* 32x16 */
- cfl_subtract_average_null, /* 32x64 (invalid CFL size) */
- cfl_subtract_average_null, /* 64x32 (invalid CFL size) */
- subtract_average_4x16_sse2, /* 4x16 */
- subtract_average_16x4_avx2, /* 16x4 */
- subtract_average_8x32_sse2, /* 8x32 */
- subtract_average_32x8_avx2, /* 32x8 */
- cfl_subtract_average_null, /* 16x64 (invalid CFL size) */
- cfl_subtract_average_null, /* 64x16 (invalid CFL size) */
+ cfl_subtract_average_4x4_sse2, /* 4x4 */
+ cfl_subtract_average_8x8_sse2, /* 8x8 */
+ cfl_subtract_average_16x16_avx2, /* 16x16 */
+ cfl_subtract_average_32x32_avx2, /* 32x32 */
+ NULL, /* 64x64 (invalid CFL size) */
+ cfl_subtract_average_4x8_sse2, /* 4x8 */
+ cfl_subtract_average_8x4_sse2, /* 8x4 */
+ cfl_subtract_average_8x16_sse2, /* 8x16 */
+ cfl_subtract_average_16x8_avx2, /* 16x8 */
+ cfl_subtract_average_16x32_avx2, /* 16x32 */
+ cfl_subtract_average_32x16_avx2, /* 32x16 */
+ NULL, /* 32x64 (invalid CFL size) */
+ NULL, /* 64x32 (invalid CFL size) */
+ cfl_subtract_average_4x16_sse2, /* 4x16 */
+ cfl_subtract_average_16x4_avx2, /* 16x4 */
+ cfl_subtract_average_8x32_sse2, /* 8x32 */
+ cfl_subtract_average_32x8_avx2, /* 32x8 */
+ NULL, /* 16x64 (invalid CFL size) */
+ NULL, /* 64x16 (invalid CFL size) */
};
// Modulo TX_SIZES_ALL to ensure that an attacker won't be able to
// index the function pointer array out of bounds.
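
The renamed cfl_get_* getters above all follow the same shape: a static per-TX_SIZE table of function pointers, NULL in the slots that are invalid for CfL (anything wider or taller than 32), and a "% TX_SIZES_ALL" clamp so a corrupted tx_size cannot index past the end of the table. A hypothetical caller sketch (wrapper name and parameters are illustrative, not the libaom API):

    #include <stddef.h>
    #include <stdint.h>

    typedef void (*cfl_subtract_average_fn)(const uint16_t *src, int16_t *dst);

    /* Illustrative wrapper: clamp the index the same way the getters do and
     * refuse to call the NULL entries reserved for invalid CfL sizes. */
    static void subtract_average_checked(const cfl_subtract_average_fn *table,
                                         int num_tx_sizes, int tx_size,
                                         const uint16_t *src, int16_t *dst) {
      const cfl_subtract_average_fn fn = table[tx_size % num_tx_sizes];
      if (fn != NULL) fn(src, dst);
    }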
diff --git a/media/libaom/src/av1/common/x86/cfl_simd.h b/media/libaom/src/av1/common/x86/cfl_simd.h
index 3b342cd4e..03ae02a92 100644
--- a/media/libaom/src/av1/common/x86/cfl_simd.h
+++ b/media/libaom/src/av1/common/x86/cfl_simd.h
@@ -15,229 +15,232 @@
#include "av1/common/blockd.h"
// SSSE3 version is optimal for width == 4, we reuse them in AVX2
-void subsample_lbd_420_4x4_ssse3(const uint8_t *input, int input_stride,
- uint16_t *output_q3);
-void subsample_lbd_420_4x8_ssse3(const uint8_t *input, int input_stride,
- uint16_t *output_q3);
-void subsample_lbd_420_4x16_ssse3(const uint8_t *input, int input_stride,
- uint16_t *output_q3);
+void cfl_subsample_lbd_420_4x4_ssse3(const uint8_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+void cfl_subsample_lbd_420_4x8_ssse3(const uint8_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+void cfl_subsample_lbd_420_4x16_ssse3(const uint8_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
// SSSE3 version is optimal for width == 8, we reuse it in AVX2
-void subsample_lbd_420_8x4_ssse3(const uint8_t *input, int input_stride,
- uint16_t *output_q3);
-void subsample_lbd_420_8x8_ssse3(const uint8_t *input, int input_stride,
- uint16_t *output_q3);
-void subsample_lbd_420_8x16_ssse3(const uint8_t *input, int input_stride,
- uint16_t *output_q3);
-void subsample_lbd_420_8x32_ssse3(const uint8_t *input, int input_stride,
- uint16_t *output_q3);
+void cfl_subsample_lbd_420_8x4_ssse3(const uint8_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+void cfl_subsample_lbd_420_8x8_ssse3(const uint8_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+void cfl_subsample_lbd_420_8x16_ssse3(const uint8_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+void cfl_subsample_lbd_420_8x32_ssse3(const uint8_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
// SSSE3 version is optimal for width == 16, we reuse it in AVX2
-void subsample_lbd_420_16x4_ssse3(const uint8_t *input, int input_stride,
- uint16_t *output_q3);
-void subsample_lbd_420_16x8_ssse3(const uint8_t *input, int input_stride,
- uint16_t *output_q3);
-void subsample_lbd_420_16x16_ssse3(const uint8_t *input, int input_stride,
- uint16_t *output_q3);
-void subsample_lbd_420_16x32_ssse3(const uint8_t *input, int input_stride,
- uint16_t *output_q3);
+void cfl_subsample_lbd_420_16x4_ssse3(const uint8_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+void cfl_subsample_lbd_420_16x8_ssse3(const uint8_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+void cfl_subsample_lbd_420_16x16_ssse3(const uint8_t *cfl_type,
+ int input_stride, uint16_t *output_q3);
+void cfl_subsample_lbd_420_16x32_ssse3(const uint8_t *cfl_type,
+ int input_stride, uint16_t *output_q3);
// SSSE3 version is optimal for width == 4, we reuse them in AVX2
-void subsample_lbd_422_4x4_ssse3(const uint8_t *input, int input_stride,
- uint16_t *output_q3);
-void subsample_lbd_422_4x8_ssse3(const uint8_t *input, int input_stride,
- uint16_t *output_q3);
-void subsample_lbd_422_4x16_ssse3(const uint8_t *input, int input_stride,
- uint16_t *output_q3);
+void cfl_subsample_lbd_422_4x4_ssse3(const uint8_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+void cfl_subsample_lbd_422_4x8_ssse3(const uint8_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+void cfl_subsample_lbd_422_4x16_ssse3(const uint8_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
// SSSE3 version is optimal for width == 8, we reuse it in AVX2
-void subsample_lbd_422_8x4_ssse3(const uint8_t *input, int input_stride,
- uint16_t *output_q3);
-void subsample_lbd_422_8x8_ssse3(const uint8_t *input, int input_stride,
- uint16_t *output_q3);
-void subsample_lbd_422_8x16_ssse3(const uint8_t *input, int input_stride,
- uint16_t *output_q3);
-void subsample_lbd_422_8x32_ssse3(const uint8_t *input, int input_stride,
- uint16_t *output_q3);
+void cfl_subsample_lbd_422_8x4_ssse3(const uint8_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+void cfl_subsample_lbd_422_8x8_ssse3(const uint8_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+void cfl_subsample_lbd_422_8x16_ssse3(const uint8_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+void cfl_subsample_lbd_422_8x32_ssse3(const uint8_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
// SSSE3 version is optimal for width == 16, we reuse it in AVX2
-void subsample_lbd_422_16x4_ssse3(const uint8_t *input, int input_stride,
- uint16_t *output_q3);
-void subsample_lbd_422_16x8_ssse3(const uint8_t *input, int input_stride,
- uint16_t *output_q3);
-void subsample_lbd_422_16x16_ssse3(const uint8_t *input, int input_stride,
- uint16_t *output_q3);
-void subsample_lbd_422_16x32_ssse3(const uint8_t *input, int input_stride,
- uint16_t *output_q3);
+void cfl_subsample_lbd_422_16x4_ssse3(const uint8_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+void cfl_subsample_lbd_422_16x8_ssse3(const uint8_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+void cfl_subsample_lbd_422_16x16_ssse3(const uint8_t *cfl_type,
+ int input_stride, uint16_t *output_q3);
+void cfl_subsample_lbd_422_16x32_ssse3(const uint8_t *cfl_type,
+ int input_stride, uint16_t *output_q3);
// SSSE3 version is optimal for width == 4, we reuse them in AVX2
-void subsample_lbd_444_4x4_ssse3(const uint8_t *input, int input_stride,
- uint16_t *output_q3);
-void subsample_lbd_444_4x8_ssse3(const uint8_t *input, int input_stride,
- uint16_t *output_q3);
-void subsample_lbd_444_4x16_ssse3(const uint8_t *input, int input_stride,
- uint16_t *output_q3);
+void cfl_subsample_lbd_444_4x4_ssse3(const uint8_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+void cfl_subsample_lbd_444_4x8_ssse3(const uint8_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+void cfl_subsample_lbd_444_4x16_ssse3(const uint8_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
// SSSE3 version is optimal for width == 8, we reuse it in AVX2
-void subsample_lbd_444_8x4_ssse3(const uint8_t *input, int input_stride,
- uint16_t *output_q3);
-void subsample_lbd_444_8x8_ssse3(const uint8_t *input, int input_stride,
- uint16_t *output_q3);
-void subsample_lbd_444_8x16_ssse3(const uint8_t *input, int input_stride,
- uint16_t *output_q3);
-void subsample_lbd_444_8x32_ssse3(const uint8_t *input, int input_stride,
- uint16_t *output_q3);
+void cfl_subsample_lbd_444_8x4_ssse3(const uint8_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+void cfl_subsample_lbd_444_8x8_ssse3(const uint8_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+void cfl_subsample_lbd_444_8x16_ssse3(const uint8_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+void cfl_subsample_lbd_444_8x32_ssse3(const uint8_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
// SSSE3 version is optimal for width == 16, we reuse it in AVX2
-void subsample_lbd_444_16x4_ssse3(const uint8_t *input, int input_stride,
- uint16_t *output_q3);
-void subsample_lbd_444_16x8_ssse3(const uint8_t *input, int input_stride,
- uint16_t *output_q3);
-void subsample_lbd_444_16x16_ssse3(const uint8_t *input, int input_stride,
- uint16_t *output_q3);
-void subsample_lbd_444_16x32_ssse3(const uint8_t *input, int input_stride,
- uint16_t *output_q3);
-
-void subsample_hbd_420_4x4_ssse3(const uint16_t *input, int input_stride,
- uint16_t *output_q3);
-void subsample_hbd_420_4x8_ssse3(const uint16_t *input, int input_stride,
- uint16_t *output_q3);
-void subsample_hbd_420_4x16_ssse3(const uint16_t *input, int input_stride,
- uint16_t *output_q3);
+void cfl_subsample_lbd_444_16x4_ssse3(const uint8_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+void cfl_subsample_lbd_444_16x8_ssse3(const uint8_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+void cfl_subsample_lbd_444_16x16_ssse3(const uint8_t *cfl_type,
+ int input_stride, uint16_t *output_q3);
+void cfl_subsample_lbd_444_16x32_ssse3(const uint8_t *cfl_type,
+ int input_stride, uint16_t *output_q3);
+
+#if CONFIG_AV1_HIGHBITDEPTH
+void cfl_subsample_hbd_420_4x4_ssse3(const uint16_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+void cfl_subsample_hbd_420_4x8_ssse3(const uint16_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+void cfl_subsample_hbd_420_4x16_ssse3(const uint16_t *cfl_type,
+ int input_stride, uint16_t *output_q3);
// SSSE3 version is optimal for width == 8, we reuse it in AVX2
-void subsample_hbd_420_8x4_ssse3(const uint16_t *input, int input_stride,
- uint16_t *output_q3);
-void subsample_hbd_420_8x8_ssse3(const uint16_t *input, int input_stride,
- uint16_t *output_q3);
-void subsample_hbd_420_8x16_ssse3(const uint16_t *input, int input_stride,
- uint16_t *output_q3);
-void subsample_hbd_420_8x32_ssse3(const uint16_t *input, int input_stride,
- uint16_t *output_q3);
+void cfl_subsample_hbd_420_8x4_ssse3(const uint16_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+void cfl_subsample_hbd_420_8x8_ssse3(const uint16_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+void cfl_subsample_hbd_420_8x16_ssse3(const uint16_t *cfl_type,
+ int input_stride, uint16_t *output_q3);
+void cfl_subsample_hbd_420_8x32_ssse3(const uint16_t *cfl_type,
+ int input_stride, uint16_t *output_q3);
// SSSE3 version is faster for width == 16, we reuse it in AVX2
-void subsample_hbd_420_16x4_ssse3(const uint16_t *input, int input_stride,
- uint16_t *output_q3);
-void subsample_hbd_420_16x8_ssse3(const uint16_t *input, int input_stride,
- uint16_t *output_q3);
-void subsample_hbd_420_16x16_ssse3(const uint16_t *input, int input_stride,
- uint16_t *output_q3);
-void subsample_hbd_420_16x32_ssse3(const uint16_t *input, int input_stride,
- uint16_t *output_q3);
-
-void subsample_hbd_422_4x4_ssse3(const uint16_t *input, int input_stride,
- uint16_t *output_q3);
-void subsample_hbd_422_4x8_ssse3(const uint16_t *input, int input_stride,
- uint16_t *output_q3);
-void subsample_hbd_422_4x16_ssse3(const uint16_t *input, int input_stride,
- uint16_t *output_q3);
+void cfl_subsample_hbd_420_16x4_ssse3(const uint16_t *cfl_type,
+ int input_stride, uint16_t *output_q3);
+void cfl_subsample_hbd_420_16x8_ssse3(const uint16_t *cfl_type,
+ int input_stride, uint16_t *output_q3);
+void cfl_subsample_hbd_420_16x16_ssse3(const uint16_t *cfl_type,
+ int input_stride, uint16_t *output_q3);
+void cfl_subsample_hbd_420_16x32_ssse3(const uint16_t *cfl_type,
+ int input_stride, uint16_t *output_q3);
+
+void cfl_subsample_hbd_422_4x4_ssse3(const uint16_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+void cfl_subsample_hbd_422_4x8_ssse3(const uint16_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+void cfl_subsample_hbd_422_4x16_ssse3(const uint16_t *cfl_type,
+ int input_stride, uint16_t *output_q3);
// SSSE3 version is optimal for width == 8, we reuse it in AVX2
-void subsample_hbd_422_8x4_ssse3(const uint16_t *input, int input_stride,
- uint16_t *output_q3);
-void subsample_hbd_422_8x8_ssse3(const uint16_t *input, int input_stride,
- uint16_t *output_q3);
-void subsample_hbd_422_8x16_ssse3(const uint16_t *input, int input_stride,
- uint16_t *output_q3);
-void subsample_hbd_422_8x32_ssse3(const uint16_t *input, int input_stride,
- uint16_t *output_q3);
+void cfl_subsample_hbd_422_8x4_ssse3(const uint16_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+void cfl_subsample_hbd_422_8x8_ssse3(const uint16_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+void cfl_subsample_hbd_422_8x16_ssse3(const uint16_t *cfl_type,
+ int input_stride, uint16_t *output_q3);
+void cfl_subsample_hbd_422_8x32_ssse3(const uint16_t *cfl_type,
+ int input_stride, uint16_t *output_q3);
// SSSE3 version is faster for width == 16, we reuse it in AVX2
-void subsample_hbd_422_16x4_ssse3(const uint16_t *input, int input_stride,
- uint16_t *output_q3);
-void subsample_hbd_422_16x8_ssse3(const uint16_t *input, int input_stride,
- uint16_t *output_q3);
-void subsample_hbd_422_16x16_ssse3(const uint16_t *input, int input_stride,
- uint16_t *output_q3);
-void subsample_hbd_422_16x32_ssse3(const uint16_t *input, int input_stride,
- uint16_t *output_q3);
-
-void subsample_hbd_444_4x4_ssse3(const uint16_t *input, int input_stride,
- uint16_t *output_q3);
-void subsample_hbd_444_4x8_ssse3(const uint16_t *input, int input_stride,
- uint16_t *output_q3);
-void subsample_hbd_444_4x16_ssse3(const uint16_t *input, int input_stride,
- uint16_t *output_q3);
+void cfl_subsample_hbd_422_16x4_ssse3(const uint16_t *cfl_type,
+ int input_stride, uint16_t *output_q3);
+void cfl_subsample_hbd_422_16x8_ssse3(const uint16_t *cfl_type,
+ int input_stride, uint16_t *output_q3);
+void cfl_subsample_hbd_422_16x16_ssse3(const uint16_t *cfl_type,
+ int input_stride, uint16_t *output_q3);
+void cfl_subsample_hbd_422_16x32_ssse3(const uint16_t *cfl_type,
+ int input_stride, uint16_t *output_q3);
+
+void cfl_subsample_hbd_444_4x4_ssse3(const uint16_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+void cfl_subsample_hbd_444_4x8_ssse3(const uint16_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+void cfl_subsample_hbd_444_4x16_ssse3(const uint16_t *cfl_type,
+ int input_stride, uint16_t *output_q3);
// SSSE3 version is optimal for width == 8, we reuse it in AVX2
-void subsample_hbd_444_8x4_ssse3(const uint16_t *input, int input_stride,
- uint16_t *output_q3);
-void subsample_hbd_444_8x8_ssse3(const uint16_t *input, int input_stride,
- uint16_t *output_q3);
-void subsample_hbd_444_8x16_ssse3(const uint16_t *input, int input_stride,
- uint16_t *output_q3);
-void subsample_hbd_444_8x32_ssse3(const uint16_t *input, int input_stride,
- uint16_t *output_q3);
+void cfl_subsample_hbd_444_8x4_ssse3(const uint16_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+void cfl_subsample_hbd_444_8x8_ssse3(const uint16_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+void cfl_subsample_hbd_444_8x16_ssse3(const uint16_t *cfl_type,
+ int input_stride, uint16_t *output_q3);
+void cfl_subsample_hbd_444_8x32_ssse3(const uint16_t *cfl_type,
+ int input_stride, uint16_t *output_q3);
// SSSE3 version is faster for width == 16, we reuse it in AVX2
-void subsample_hbd_444_16x4_ssse3(const uint16_t *input, int input_stride,
- uint16_t *output_q3);
-void subsample_hbd_444_16x8_ssse3(const uint16_t *input, int input_stride,
- uint16_t *output_q3);
-void subsample_hbd_444_16x16_ssse3(const uint16_t *input, int input_stride,
- uint16_t *output_q3);
-void subsample_hbd_444_16x32_ssse3(const uint16_t *input, int input_stride,
- uint16_t *output_q3);
+void cfl_subsample_hbd_444_16x4_ssse3(const uint16_t *cfl_type,
+ int input_stride, uint16_t *output_q3);
+void cfl_subsample_hbd_444_16x8_ssse3(const uint16_t *cfl_type,
+ int input_stride, uint16_t *output_q3);
+void cfl_subsample_hbd_444_16x16_ssse3(const uint16_t *cfl_type,
+ int input_stride, uint16_t *output_q3);
+void cfl_subsample_hbd_444_16x32_ssse3(const uint16_t *cfl_type,
+ int input_stride, uint16_t *output_q3);
+#endif // CONFIG_AV1_HIGHBITDEPTH
// SSE2 version is optimal for width == 4, we reuse them in AVX2
-void subtract_average_4x4_sse2(const uint16_t *src, int16_t *dst);
-void subtract_average_4x8_sse2(const uint16_t *src, int16_t *dst);
-void subtract_average_4x16_sse2(const uint16_t *src, int16_t *dst);
+void cfl_subtract_average_4x4_sse2(const uint16_t *src, int16_t *dst);
+void cfl_subtract_average_4x8_sse2(const uint16_t *src, int16_t *dst);
+void cfl_subtract_average_4x16_sse2(const uint16_t *src, int16_t *dst);
// SSE2 version is optimal for width == 8, we reuse them in AVX2
-void subtract_average_8x4_sse2(const uint16_t *src, int16_t *dst);
-void subtract_average_8x8_sse2(const uint16_t *src, int16_t *dst);
-void subtract_average_8x16_sse2(const uint16_t *src, int16_t *dst);
-void subtract_average_8x32_sse2(const uint16_t *src, int16_t *dst);
-
-void predict_lbd_4x4_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
- int dst_stride, int alpha_q3);
-void predict_lbd_4x8_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
- int dst_stride, int alpha_q3);
-void predict_lbd_4x16_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
- int dst_stride, int alpha_q3);
-
-void predict_lbd_8x4_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
- int dst_stride, int alpha_q3);
-void predict_lbd_8x8_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
- int dst_stride, int alpha_q3);
-void predict_lbd_8x16_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
- int dst_stride, int alpha_q3);
-void predict_lbd_8x32_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
- int dst_stride, int alpha_q3);
-
-void predict_lbd_16x4_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
- int dst_stride, int alpha_q3);
-void predict_lbd_16x8_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
- int dst_stride, int alpha_q3);
-void predict_lbd_16x16_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
- int dst_stride, int alpha_q3);
-void predict_lbd_16x32_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
- int dst_stride, int alpha_q3);
-
-void predict_hbd_4x4_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
- int dst_stride, int alpha_q3, int bd);
-void predict_hbd_4x8_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
- int dst_stride, int alpha_q3, int bd);
-void predict_hbd_4x16_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
- int dst_stride, int alpha_q3, int bd);
-
-void predict_hbd_8x4_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
- int dst_stride, int alpha_q3, int bd);
-void predict_hbd_8x8_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
- int dst_stride, int alpha_q3, int bd);
-void predict_hbd_8x16_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
- int dst_stride, int alpha_q3, int bd);
-void predict_hbd_8x32_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
- int dst_stride, int alpha_q3, int bd);
-
-void predict_hbd_16x4_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
- int dst_stride, int alpha_q3, int bd);
-void predict_hbd_16x8_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
- int dst_stride, int alpha_q3, int bd);
-void predict_hbd_16x16_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
- int dst_stride, int alpha_q3, int bd);
-void predict_hbd_16x32_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
- int dst_stride, int alpha_q3, int bd);
-
+void cfl_subtract_average_8x4_sse2(const uint16_t *src, int16_t *dst);
+void cfl_subtract_average_8x8_sse2(const uint16_t *src, int16_t *dst);
+void cfl_subtract_average_8x16_sse2(const uint16_t *src, int16_t *dst);
+void cfl_subtract_average_8x32_sse2(const uint16_t *src, int16_t *dst);
+
+void cfl_predict_lbd_4x4_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
+ int dst_stride, int alpha_q3);
+void cfl_predict_lbd_4x8_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
+ int dst_stride, int alpha_q3);
+void cfl_predict_lbd_4x16_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
+ int dst_stride, int alpha_q3);
+
+void cfl_predict_lbd_8x4_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
+ int dst_stride, int alpha_q3);
+void cfl_predict_lbd_8x8_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
+ int dst_stride, int alpha_q3);
+void cfl_predict_lbd_8x16_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
+ int dst_stride, int alpha_q3);
+void cfl_predict_lbd_8x32_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
+ int dst_stride, int alpha_q3);
+
+void cfl_predict_lbd_16x4_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
+ int dst_stride, int alpha_q3);
+void cfl_predict_lbd_16x8_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
+ int dst_stride, int alpha_q3);
+void cfl_predict_lbd_16x16_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
+ int dst_stride, int alpha_q3);
+void cfl_predict_lbd_16x32_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
+ int dst_stride, int alpha_q3);
+
+#if CONFIG_AV1_HIGHBITDEPTH
+void cfl_predict_hbd_4x4_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
+ int dst_stride, int alpha_q3, int bd);
+void cfl_predict_hbd_4x8_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
+ int dst_stride, int alpha_q3, int bd);
+void cfl_predict_hbd_4x16_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
+ int dst_stride, int alpha_q3, int bd);
+
+void cfl_predict_hbd_8x4_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
+ int dst_stride, int alpha_q3, int bd);
+void cfl_predict_hbd_8x8_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
+ int dst_stride, int alpha_q3, int bd);
+void cfl_predict_hbd_8x16_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
+ int dst_stride, int alpha_q3, int bd);
+void cfl_predict_hbd_8x32_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
+ int dst_stride, int alpha_q3, int bd);
+
+void cfl_predict_hbd_16x4_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
+ int dst_stride, int alpha_q3, int bd);
+void cfl_predict_hbd_16x8_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
+ int dst_stride, int alpha_q3, int bd);
+void cfl_predict_hbd_16x16_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
+ int dst_stride, int alpha_q3, int bd);
+void cfl_predict_hbd_16x32_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
+ int dst_stride, int alpha_q3, int bd);
+#endif // CONFIG_AV1_HIGHBITDEPTH
#endif // AOM_AV1_COMMON_X86_CFL_SIMD_H_
diff --git a/media/libaom/src/av1/common/x86/cfl_ssse3.c b/media/libaom/src/av1/common/x86/cfl_ssse3.c
index bbf007295..476b6609a 100644
--- a/media/libaom/src/av1/common/x86/cfl_ssse3.c
+++ b/media/libaom/src/av1/common/x86/cfl_ssse3.c
@@ -168,6 +168,7 @@ static INLINE void cfl_luma_subsampling_444_lbd_ssse3(const uint8_t *input,
} while (pred_buf_m128i < end);
}
+#if CONFIG_AV1_HIGHBITDEPTH
/**
* Adds 4 pixels (in a 2x2 grid) and multiplies them by 2, resulting in a more
* precise version of a box filter 4:2:0 pixel subsampling in Q3.
@@ -296,6 +297,7 @@ static INLINE void cfl_luma_subsampling_444_hbd_ssse3(const uint16_t *input,
pred_buf_q3 += CFL_BUF_LINE;
} while (pred_buf_q3 < end);
}
+#endif // CONFIG_AV1_HIGHBITDEPTH
CFL_GET_SUBSAMPLE_FUNCTION(ssse3)
@@ -341,6 +343,7 @@ static INLINE void cfl_predict_lbd_ssse3(const int16_t *pred_buf_q3,
CFL_PREDICT_FN(ssse3, lbd)
+#if CONFIG_AV1_HIGHBITDEPTH
static INLINE __m128i highbd_max_epi16(int bd) {
const __m128i neg_one = _mm_set1_epi16(-1);
// (1 << bd) - 1 => -(-1 << bd) -1 => -1 - (-1 << bd) => -1 ^ (-1 << bd)
@@ -391,3 +394,4 @@ static INLINE void cfl_predict_hbd_ssse3(const int16_t *pred_buf_q3,
}
CFL_PREDICT_FN(ssse3, hbd)
+#endif // CONFIG_AV1_HIGHBITDEPTH
diff --git a/media/libaom/src/av1/common/x86/convolve_2d_avx2.c b/media/libaom/src/av1/common/x86/convolve_2d_avx2.c
index 0acafd044..e19575d72 100644
--- a/media/libaom/src/av1/common/x86/convolve_2d_avx2.c
+++ b/media/libaom/src/av1/common/x86/convolve_2d_avx2.c
@@ -24,34 +24,18 @@ void av1_convolve_2d_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
int dst_stride, int w, int h,
const InterpFilterParams *filter_params_x,
const InterpFilterParams *filter_params_y,
- const int subpel_x_q4, const int subpel_y_q4,
+ const int subpel_x_qn, const int subpel_y_qn,
ConvolveParams *conv_params) {
const int bd = 8;
-
- DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);
- int im_h = h + filter_params_y->taps - 1;
int im_stride = 8;
- int i, j;
- const int fo_vert = filter_params_y->taps / 2 - 1;
- const int fo_horiz = filter_params_x->taps / 2 - 1;
- const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
-
+ int i, is_horiz_4tap = 0, is_vert_4tap = 0;
+ DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);
const int bits =
FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
- __m256i filt[4], coeffs_h[4], coeffs_v[4];
-
assert(conv_params->round_0 > 0);
- filt[0] = _mm256_load_si256((__m256i const *)filt_global_avx2);
- filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
- filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
- filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
-
- prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs_h);
- prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_v);
-
const __m256i round_const_h = _mm256_set1_epi16(
((1 << (conv_params->round_0 - 1)) >> 1) + (1 << (bd + FILTER_BITS - 2)));
const __m128i round_shift_h = _mm_cvtsi32_si128(conv_params->round_0 - 1);
@@ -65,58 +49,96 @@ void av1_convolve_2d_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
((1 << (offset_bits - conv_params->round_1)) >> 1));
const __m128i round_shift_v = _mm_cvtsi32_si128(bits);
- for (j = 0; j < w; j += 8) {
- for (i = 0; i < im_h; i += 2) {
- __m256i data = _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j]));
+ __m256i filt[4], coeffs_h[4], coeffs_v[4];
+
+ filt[0] = _mm256_load_si256((__m256i const *)(filt_global_avx2));
+ filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
+
+ prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs_h);
+ prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_v);
+
+ // Condition for checking valid horz_filt taps
+ if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs_h[0], coeffs_h[3]), 0)))
+ is_horiz_4tap = 1;
+
+ // Condition for checking valid vert_filt taps
+ if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs_v[0], coeffs_v[3]), 0)))
+ is_vert_4tap = 1;
- // Load the next line
- if (i + 1 < im_h)
+ // horz_filt as 4 tap and vert_filt as 8 tap
+ if (is_horiz_4tap) {
+ int im_h = h + filter_params_y->taps - 1;
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const int fo_horiz = 1;
+ const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
+
+ // horz-filter
+ for (int j = 0; j < w; j += 8) {
+ for (i = 0; i < (im_h - 2); i += 2) {
+ __m256i data = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j]));
+
+ // Load the next line
data = _mm256_inserti128_si256(
data,
_mm_loadu_si128(
(__m128i *)&src_ptr[(i * src_stride) + j + src_stride]),
1);
+ __m256i res = convolve_lowbd_x_4tap(data, coeffs_h + 1, filt);
+
+ res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h),
+ round_shift_h);
+ _mm256_store_si256((__m256i *)&im_block[i * im_stride], res);
+ }
- __m256i res = convolve_lowbd_x(data, coeffs_h, filt);
+ __m256i data_1 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j]));
+ __m256i res = convolve_lowbd_x_4tap(data_1, coeffs_h + 1, filt);
res =
_mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h);
-
_mm256_store_si256((__m256i *)&im_block[i * im_stride], res);
- }
- /* Vertical filter */
- {
+ // vert filter
+ CONVOLVE_SR_VERTICAL_FILTER_8TAP;
+ }
+ } else if (is_vert_4tap) {
+ int im_h = h + 3;
+ const int fo_vert = 1;
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
+
+ filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
+ filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
+
+ for (int j = 0; j < w; j += 8) {
+ // horz_filter
+ CONVOLVE_SR_HORIZONTAL_FILTER_8TAP;
+ // vert_filter
+ __m256i s[6];
__m256i src_0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride));
__m256i src_1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride));
__m256i src_2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride));
__m256i src_3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride));
- __m256i src_4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride));
- __m256i src_5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride));
- __m256i s[8];
s[0] = _mm256_unpacklo_epi16(src_0, src_1);
s[1] = _mm256_unpacklo_epi16(src_2, src_3);
- s[2] = _mm256_unpacklo_epi16(src_4, src_5);
-
- s[4] = _mm256_unpackhi_epi16(src_0, src_1);
- s[5] = _mm256_unpackhi_epi16(src_2, src_3);
- s[6] = _mm256_unpackhi_epi16(src_4, src_5);
+ s[3] = _mm256_unpackhi_epi16(src_0, src_1);
+ s[4] = _mm256_unpackhi_epi16(src_2, src_3);
for (i = 0; i < h; i += 2) {
const int16_t *data = &im_block[i * im_stride];
- const __m256i s6 =
- _mm256_loadu_si256((__m256i *)(data + 6 * im_stride));
- const __m256i s7 =
- _mm256_loadu_si256((__m256i *)(data + 7 * im_stride));
+ const __m256i s4 =
+ _mm256_loadu_si256((__m256i *)(data + 4 * im_stride));
+ const __m256i s5 =
+ _mm256_loadu_si256((__m256i *)(data + 5 * im_stride));
- s[3] = _mm256_unpacklo_epi16(s6, s7);
- s[7] = _mm256_unpackhi_epi16(s6, s7);
+ s[2] = _mm256_unpacklo_epi16(s4, s5);
+ s[5] = _mm256_unpackhi_epi16(s4, s5);
- __m256i res_a = convolve(s, coeffs_v);
- __m256i res_b = convolve(s + 4, coeffs_v);
+ __m256i res_a = convolve_4tap(s, coeffs_v + 1);
+ __m256i res_b = convolve_4tap(s + 3, coeffs_v + 1);
// Combine V round and 2F-H-V round into a single rounding
res_a =
@@ -154,13 +176,25 @@ void av1_convolve_2d_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
s[0] = s[1];
s[1] = s[2];
- s[2] = s[3];
-
+ s[3] = s[4];
s[4] = s[5];
- s[5] = s[6];
- s[6] = s[7];
}
}
+ } else {
+ int j;
+ int im_h = h + filter_params_y->taps - 1;
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
+
+ filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
+ filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
+
+ for (j = 0; j < w; j += 8) {
+ CONVOLVE_SR_HORIZONTAL_FILTER_8TAP;
+
+ CONVOLVE_SR_VERTICAL_FILTER_8TAP;
+ }
}
}
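
In the rewritten av1_convolve_2d_sr_avx2 above, the "valid taps" checks test whether coeffs[0] and coeffs[3] — the packed outermost tap pairs — are zero; if so the 8-tap kernel is effectively 4-tap, and the path can use a smaller intermediate height (im_h = h + 3) and a source offset of 1 instead of taps / 2 - 1. A scalar sketch of the same test on a flat 8-tap filter array (helper name is illustrative):

    #include <stdint.h>

    /* An 8-tap kernel whose two outermost taps on each side are zero is
     * effectively a 4-tap kernel centred on taps 2..5. */
    static int is_effective_4tap(const int16_t filter[8]) {
      return filter[0] == 0 && filter[1] == 0 && filter[6] == 0 &&
             filter[7] == 0;
    }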
@@ -180,12 +214,12 @@ void av1_convolve_2d_copy_sr_avx2(const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride, int w, int h,
const InterpFilterParams *filter_params_x,
const InterpFilterParams *filter_params_y,
- const int subpel_x_q4, const int subpel_y_q4,
+ const int subpel_x_qn, const int subpel_y_qn,
ConvolveParams *conv_params) {
(void)filter_params_x;
(void)filter_params_y;
- (void)subpel_x_q4;
- (void)subpel_y_q4;
+ (void)subpel_x_qn;
+ (void)subpel_y_qn;
(void)conv_params;
if (w >= 16) {
@@ -195,20 +229,20 @@ void av1_convolve_2d_copy_sr_avx2(const uint8_t *src, int src_stride,
if (w == 2) {
do {
- memcpy(dst, src, 2 * sizeof(*src));
+ memmove(dst, src, 2 * sizeof(*src));
src += src_stride;
dst += dst_stride;
- memcpy(dst, src, 2 * sizeof(*src));
+ memmove(dst, src, 2 * sizeof(*src));
src += src_stride;
dst += dst_stride;
h -= 2;
} while (h);
} else if (w == 4) {
do {
- memcpy(dst, src, 4 * sizeof(*src));
+ memmove(dst, src, 4 * sizeof(*src));
src += src_stride;
dst += dst_stride;
- memcpy(dst, src, 4 * sizeof(*src));
+ memmove(dst, src, 4 * sizeof(*src));
src += src_stride;
dst += dst_stride;
h -= 2;
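
The copy hunks above (and the matching SSE2 ones further down) swap memcpy for memmove on the 2- and 4-byte row copies; memmove is defined even when the source and destination ranges overlap, whereas memcpy is not. A standalone illustration of the difference (not libaom code):

    #include <stdio.h>
    #include <string.h>

    int main(void) {
      char buf[8] = "abcdefg";
      /* Overlapping ranges: memmove behaves as if it copied through a
       * temporary buffer; the same call with memcpy would be undefined. */
      memmove(buf + 1, buf, 4);
      printf("%s\n", buf); /* prints "aabcdfg" */
      return 0;
    }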
diff --git a/media/libaom/src/av1/common/x86/convolve_2d_sse2.c b/media/libaom/src/av1/common/x86/convolve_2d_sse2.c
index b1a62a4f6..5376ea79b 100644
--- a/media/libaom/src/av1/common/x86/convolve_2d_sse2.c
+++ b/media/libaom/src/av1/common/x86/convolve_2d_sse2.c
@@ -22,7 +22,7 @@ void av1_convolve_2d_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst,
int dst_stride, int w, int h,
const InterpFilterParams *filter_params_x,
const InterpFilterParams *filter_params_y,
- const int subpel_x_q4, const int subpel_y_q4,
+ const int subpel_x_qn, const int subpel_y_qn,
ConvolveParams *conv_params) {
const int bd = 8;
@@ -45,7 +45,7 @@ void av1_convolve_2d_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst,
/* Horizontal filter */
{
const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
- filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+ filter_params_x, subpel_x_qn & SUBPEL_MASK);
const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter);
// coeffs 0 1 0 1 2 3 2 3
@@ -111,7 +111,7 @@ void av1_convolve_2d_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst,
/* Vertical filter */
{
const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
- filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+ filter_params_y, subpel_y_qn & SUBPEL_MASK);
const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter);
// coeffs 0 1 0 1 2 3 2 3
@@ -205,7 +205,7 @@ void av1_convolve_2d_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst,
__m128i *const p = (__m128i *)&dst[i * dst_stride + j];
if (w == 2) {
- *(uint16_t *)p = _mm_cvtsi128_si32(res);
+ *(uint16_t *)p = (uint16_t)_mm_cvtsi128_si32(res);
} else if (w == 4) {
*(uint32_t *)p = _mm_cvtsi128_si32(res);
} else {
@@ -240,12 +240,12 @@ void av1_convolve_2d_copy_sr_sse2(const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride, int w, int h,
const InterpFilterParams *filter_params_x,
const InterpFilterParams *filter_params_y,
- const int subpel_x_q4, const int subpel_y_q4,
+ const int subpel_x_qn, const int subpel_y_qn,
ConvolveParams *conv_params) {
(void)filter_params_x;
(void)filter_params_y;
- (void)subpel_x_q4;
- (void)subpel_y_q4;
+ (void)subpel_x_qn;
+ (void)subpel_y_qn;
(void)conv_params;
if (w >= 16) {
@@ -255,20 +255,20 @@ void av1_convolve_2d_copy_sr_sse2(const uint8_t *src, int src_stride,
if (w == 2) {
do {
- memcpy(dst, src, 2 * sizeof(*src));
+ memmove(dst, src, 2 * sizeof(*src));
src += src_stride;
dst += dst_stride;
- memcpy(dst, src, 2 * sizeof(*src));
+ memmove(dst, src, 2 * sizeof(*src));
src += src_stride;
dst += dst_stride;
h -= 2;
} while (h);
} else if (w == 4) {
do {
- memcpy(dst, src, 4 * sizeof(*src));
+ memmove(dst, src, 4 * sizeof(*src));
src += src_stride;
dst += dst_stride;
- memcpy(dst, src, 4 * sizeof(*src));
+ memmove(dst, src, 4 * sizeof(*src));
src += src_stride;
dst += dst_stride;
h -= 2;
@@ -354,24 +354,23 @@ void av1_convolve_2d_copy_sr_sse2(const uint8_t *src, int src_stride,
}
}
-void av1_jnt_convolve_2d_copy_sse2(const uint8_t *src, int src_stride,
- uint8_t *dst0, int dst_stride0, int w, int h,
- const InterpFilterParams *filter_params_x,
- const InterpFilterParams *filter_params_y,
- const int subpel_x_q4, const int subpel_y_q4,
- ConvolveParams *conv_params) {
+void av1_dist_wtd_convolve_2d_copy_sse2(
+ const uint8_t *src, int src_stride, uint8_t *dst0, int dst_stride0, int w,
+ int h, const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y, const int subpel_x_qn,
+ const int subpel_y_qn, ConvolveParams *conv_params) {
const int bd = 8;
CONV_BUF_TYPE *dst = conv_params->dst;
int dst_stride = conv_params->dst_stride;
(void)filter_params_x;
(void)filter_params_y;
- (void)subpel_x_q4;
- (void)subpel_y_q4;
+ (void)subpel_x_qn;
+ (void)subpel_y_qn;
const int bits =
FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
const int do_average = conv_params->do_average;
- const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+ const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
const __m128i zero = _mm_setzero_si128();
const __m128i left_shift = _mm_cvtsi32_si128(bits);
int i, j;
@@ -411,14 +410,14 @@ void av1_jnt_convolve_2d_copy_sse2(const uint8_t *src, int src_stride,
const __m128i data_ref_0_hi =
_mm_loadu_si128((__m128i *)(&dst[j + 8]));
- const __m128i comp_avg_res_lo =
- comp_avg(&data_ref_0_lo, &res_unsigned_lo, &wt, use_jnt_comp_avg);
+ const __m128i comp_avg_res_lo = comp_avg(
+ &data_ref_0_lo, &res_unsigned_lo, &wt, use_dist_wtd_comp_avg);
const __m128i round_result_lo = convolve_rounding(
&comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift);
- const __m128i comp_avg_res_hi =
- comp_avg(&data_ref_0_hi, &res_unsigned_hi, &wt, use_jnt_comp_avg);
+ const __m128i comp_avg_res_hi = comp_avg(
+ &data_ref_0_hi, &res_unsigned_hi, &wt, use_dist_wtd_comp_avg);
const __m128i round_result_hi = convolve_rounding(
&comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift);
@@ -449,7 +448,7 @@ void av1_jnt_convolve_2d_copy_sse2(const uint8_t *src, int src_stride,
const __m128i data_ref_0 = _mm_loadu_si128((__m128i *)(&dst[j]));
const __m128i comp_avg_res =
- comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
+ comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg);
const __m128i round_result = convolve_rounding(
&comp_avg_res, &offset_const, &rounding_const, rounding_shift);
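
Above, av1_jnt_convolve_2d_copy_sse2 becomes av1_dist_wtd_convolve_2d_copy_sse2 and use_jnt_comp_avg becomes use_dist_wtd_comp_avg; the blend itself is unchanged. A scalar sketch of what comp_avg selects (helper name is illustrative, and the DIST_PRECISION_BITS value is an assumption following the AV1 convention of weights that sum to 1 << DIST_PRECISION_BITS):

    #include <stdint.h>

    #define DIST_PRECISION_BITS 4 /* assumed AV1 value */

    /* With the flag set, blend the two predictions with integer distance
     * weights; otherwise take the plain average. */
    static int32_t dist_wtd_comp_avg_scalar(int32_t pred0, int32_t pred1,
                                            int fwd_weight, int bck_weight,
                                            int use_dist_wtd_comp_avg) {
      if (use_dist_wtd_comp_avg)
        return (pred0 * fwd_weight + pred1 * bck_weight) >> DIST_PRECISION_BITS;
      return (pred0 + pred1) >> 1;
    }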
diff --git a/media/libaom/src/av1/common/x86/convolve_avx2.c b/media/libaom/src/av1/common/x86/convolve_avx2.c
index 0e91ea947..1d5bc6fbd 100644
--- a/media/libaom/src/av1/common/x86/convolve_avx2.c
+++ b/media/libaom/src/av1/common/x86/convolve_avx2.c
@@ -21,155 +21,241 @@ void av1_convolve_y_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
int dst_stride, int w, int h,
const InterpFilterParams *filter_params_x,
const InterpFilterParams *filter_params_y,
- const int subpel_x_q4, const int subpel_y_q4,
+ const int subpel_x_qn, const int subpel_y_qn,
ConvolveParams *conv_params) {
- int i, j;
- const int fo_vert = filter_params_y->taps / 2 - 1;
- const uint8_t *const src_ptr = src - fo_vert * src_stride;
-
+ int i, j, is_vert_4tap = 0;
// right shift is F-1 because we are already dividing
// filter coefficients by 2
const int right_shift_bits = (FILTER_BITS - 1);
const __m128i right_shift = _mm_cvtsi32_si128(right_shift_bits);
const __m256i right_shift_const =
_mm256_set1_epi16((1 << right_shift_bits) >> 1);
- __m256i coeffs[4], s[8];
assert(conv_params->round_0 <= FILTER_BITS);
assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) ||
((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS)));
- prepare_coeffs_lowbd(filter_params_y, subpel_y_q4, coeffs);
-
(void)filter_params_x;
- (void)subpel_x_q4;
+ (void)subpel_x_qn;
(void)conv_params;
+ __m256i coeffs[4], s[8];
+ __m128i d[6];
- for (j = 0; j < w; j += 16) {
- const uint8_t *data = &src_ptr[j];
- __m256i src6;
-
- // Load lines a and b. Line a to lower 128, line b to upper 128
- const __m256i src_01a = _mm256_permute2x128_si256(
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 0 * src_stride))),
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 1 * src_stride))),
- 0x20);
-
- const __m256i src_12a = _mm256_permute2x128_si256(
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 1 * src_stride))),
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 2 * src_stride))),
- 0x20);
-
- const __m256i src_23a = _mm256_permute2x128_si256(
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 2 * src_stride))),
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 3 * src_stride))),
- 0x20);
-
- const __m256i src_34a = _mm256_permute2x128_si256(
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 3 * src_stride))),
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 4 * src_stride))),
- 0x20);
-
- const __m256i src_45a = _mm256_permute2x128_si256(
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 4 * src_stride))),
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
- 0x20);
-
- src6 = _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 6 * src_stride)));
- const __m256i src_56a = _mm256_permute2x128_si256(
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
- src6, 0x20);
-
- s[0] = _mm256_unpacklo_epi8(src_01a, src_12a);
- s[1] = _mm256_unpacklo_epi8(src_23a, src_34a);
- s[2] = _mm256_unpacklo_epi8(src_45a, src_56a);
-
- s[4] = _mm256_unpackhi_epi8(src_01a, src_12a);
- s[5] = _mm256_unpackhi_epi8(src_23a, src_34a);
- s[6] = _mm256_unpackhi_epi8(src_45a, src_56a);
-
- for (i = 0; i < h; i += 2) {
- data = &src_ptr[i * src_stride + j];
- const __m256i src_67a = _mm256_permute2x128_si256(
- src6,
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
- 0x20);
+ prepare_coeffs_lowbd(filter_params_y, subpel_y_qn, coeffs);
- src6 = _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 8 * src_stride)));
- const __m256i src_78a = _mm256_permute2x128_si256(
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
- src6, 0x20);
+ // Condition for checking valid vert_filt taps
+ if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs[0], coeffs[3]), 0)))
+ is_vert_4tap = 1;
+
+ // vert_filt as 4 tap
+ if (is_vert_4tap) {
+ const int fo_vert = 1;
+ const uint8_t *const src_ptr = src - fo_vert * src_stride;
+ for (j = 0; j < w; j += 16) {
+ const uint8_t *data = &src_ptr[j];
+ d[0] = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
+ d[1] = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
+ d[2] = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
+ d[3] = _mm_loadu_si128((__m128i *)(data + 3 * src_stride));
+ d[4] = _mm_loadu_si128((__m128i *)(data + 4 * src_stride));
+
+ // Load lines a and b. Line a to lower 128, line b to upper 128
+ const __m256i src_01a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20);
+
+ const __m256i src_12a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20);
+
+ const __m256i src_23a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20);
+
+ const __m256i src_34a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(d[3]), _mm256_castsi128_si256(d[4]), 0x20);
- s[3] = _mm256_unpacklo_epi8(src_67a, src_78a);
- s[7] = _mm256_unpackhi_epi8(src_67a, src_78a);
+ s[0] = _mm256_unpacklo_epi8(src_01a, src_12a);
+ s[1] = _mm256_unpacklo_epi8(src_23a, src_34a);
- const __m256i res_lo = convolve_lowbd(s, coeffs);
+ s[3] = _mm256_unpackhi_epi8(src_01a, src_12a);
+ s[4] = _mm256_unpackhi_epi8(src_23a, src_34a);
- /* rounding code */
- // shift by F - 1
- const __m256i res_16b_lo = _mm256_sra_epi16(
- _mm256_add_epi16(res_lo, right_shift_const), right_shift);
- // 8 bit conversion and saturation to uint8
- __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);
+ for (i = 0; i < h; i += 2) {
+ data = &src_ptr[i * src_stride + j];
+ d[5] = _mm_loadu_si128((__m128i *)(data + 5 * src_stride));
+ const __m256i src_45a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(d[4]), _mm256_castsi128_si256(d[5]), 0x20);
- if (w - j > 8) {
- const __m256i res_hi = convolve_lowbd(s + 4, coeffs);
+ d[4] = _mm_loadu_si128((__m128i *)(data + 6 * src_stride));
+ const __m256i src_56a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(d[5]), _mm256_castsi128_si256(d[4]), 0x20);
+ s[2] = _mm256_unpacklo_epi8(src_45a, src_56a);
+ s[5] = _mm256_unpackhi_epi8(src_45a, src_56a);
+
+ const __m256i res_lo = convolve_lowbd_4tap(s, coeffs + 1);
/* rounding code */
// shift by F - 1
- const __m256i res_16b_hi = _mm256_sra_epi16(
- _mm256_add_epi16(res_hi, right_shift_const), right_shift);
+ const __m256i res_16b_lo = _mm256_sra_epi16(
+ _mm256_add_epi16(res_lo, right_shift_const), right_shift);
// 8 bit conversion and saturation to uint8
- __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi);
-
- __m256i res_a = _mm256_unpacklo_epi64(res_8b_lo, res_8b_hi);
-
- const __m128i res_0 = _mm256_castsi256_si128(res_a);
- const __m128i res_1 = _mm256_extracti128_si256(res_a, 1);
-
- _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_0);
- _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
- res_1);
- } else {
- const __m128i res_0 = _mm256_castsi256_si128(res_8b_lo);
- const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
- if (w - j > 4) {
- _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0);
- _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
+ __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);
+
+ if (w - j > 8) {
+ const __m256i res_hi = convolve_lowbd_4tap(s + 3, coeffs + 1);
+
+ /* rounding code */
+ // shift by F - 1
+ const __m256i res_16b_hi = _mm256_sra_epi16(
+ _mm256_add_epi16(res_hi, right_shift_const), right_shift);
+ // 8 bit conversion and saturation to uint8
+ __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi);
+
+ __m256i res_a = _mm256_unpacklo_epi64(res_8b_lo, res_8b_hi);
+
+ const __m128i res_0 = _mm256_castsi256_si128(res_a);
+ const __m128i res_1 = _mm256_extracti128_si256(res_a, 1);
+
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_0);
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
res_1);
- } else if (w - j > 2) {
- xx_storel_32(&dst[i * dst_stride + j], res_0);
- xx_storel_32(&dst[i * dst_stride + j + dst_stride], res_1);
} else {
- __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];
- __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + j + dst_stride];
- *(uint16_t *)p_0 = _mm_cvtsi128_si32(res_0);
- *(uint16_t *)p_1 = _mm_cvtsi128_si32(res_1);
+ const __m128i res_0 = _mm256_castsi256_si128(res_8b_lo);
+ const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
+ if (w - j > 4) {
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0);
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
+ res_1);
+ } else if (w - j > 2) {
+ xx_storel_32(&dst[i * dst_stride + j], res_0);
+ xx_storel_32(&dst[i * dst_stride + j + dst_stride], res_1);
+ } else {
+ __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];
+ __m128i *const p_1 =
+ (__m128i *)&dst[i * dst_stride + j + dst_stride];
+ *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0);
+ *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1);
+ }
}
+ s[0] = s[1];
+ s[1] = s[2];
+
+ s[3] = s[4];
+ s[4] = s[5];
}
+ }
+ } else {
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const uint8_t *const src_ptr = src - fo_vert * src_stride;
+
+ for (j = 0; j < w; j += 16) {
+ const uint8_t *data = &src_ptr[j];
+ __m256i src6;
+
+ d[0] = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
+ d[1] = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
+ d[2] = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
+ d[3] = _mm_loadu_si128((__m128i *)(data + 3 * src_stride));
+ d[4] = _mm_loadu_si128((__m128i *)(data + 4 * src_stride));
+ d[5] = _mm_loadu_si128((__m128i *)(data + 5 * src_stride));
+ // Load lines a and b. Line a to lower 128, line b to upper 128
+ const __m256i src_01a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20);
+
+ const __m256i src_12a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20);
+
+ const __m256i src_23a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20);
+
+ const __m256i src_34a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(d[3]), _mm256_castsi128_si256(d[4]), 0x20);
+
+ const __m256i src_45a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(d[4]), _mm256_castsi128_si256(d[5]), 0x20);
+
+ src6 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 6 * src_stride)));
+ const __m256i src_56a =
+ _mm256_permute2x128_si256(_mm256_castsi128_si256(d[5]), src6, 0x20);
+
+ s[0] = _mm256_unpacklo_epi8(src_01a, src_12a);
+ s[1] = _mm256_unpacklo_epi8(src_23a, src_34a);
+ s[2] = _mm256_unpacklo_epi8(src_45a, src_56a);
+
+ s[4] = _mm256_unpackhi_epi8(src_01a, src_12a);
+ s[5] = _mm256_unpackhi_epi8(src_23a, src_34a);
+ s[6] = _mm256_unpackhi_epi8(src_45a, src_56a);
+
+ for (i = 0; i < h; i += 2) {
+ data = &src_ptr[i * src_stride + j];
+ const __m256i src_67a = _mm256_permute2x128_si256(
+ src6,
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
+ 0x20);
+
+ src6 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 8 * src_stride)));
+ const __m256i src_78a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
+ src6, 0x20);
+
+ s[3] = _mm256_unpacklo_epi8(src_67a, src_78a);
+ s[7] = _mm256_unpackhi_epi8(src_67a, src_78a);
+
+ const __m256i res_lo = convolve_lowbd(s, coeffs);
+
+ /* rounding code */
+ // shift by F - 1
+ const __m256i res_16b_lo = _mm256_sra_epi16(
+ _mm256_add_epi16(res_lo, right_shift_const), right_shift);
+ // 8 bit conversion and saturation to uint8
+ __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);
- s[0] = s[1];
- s[1] = s[2];
- s[2] = s[3];
+ if (w - j > 8) {
+ const __m256i res_hi = convolve_lowbd(s + 4, coeffs);
- s[4] = s[5];
- s[5] = s[6];
- s[6] = s[7];
+ /* rounding code */
+ // shift by F - 1
+ const __m256i res_16b_hi = _mm256_sra_epi16(
+ _mm256_add_epi16(res_hi, right_shift_const), right_shift);
+ // 8 bit conversion and saturation to uint8
+ __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi);
+
+ __m256i res_a = _mm256_unpacklo_epi64(res_8b_lo, res_8b_hi);
+
+ const __m128i res_0 = _mm256_castsi256_si128(res_a);
+ const __m128i res_1 = _mm256_extracti128_si256(res_a, 1);
+
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_0);
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
+ res_1);
+ } else {
+ const __m128i res_0 = _mm256_castsi256_si128(res_8b_lo);
+ const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
+ if (w - j > 4) {
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0);
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
+ res_1);
+ } else if (w - j > 2) {
+ xx_storel_32(&dst[i * dst_stride + j], res_0);
+ xx_storel_32(&dst[i * dst_stride + j + dst_stride], res_1);
+ } else {
+ __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];
+ __m128i *const p_1 =
+ (__m128i *)&dst[i * dst_stride + j + dst_stride];
+ *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0);
+ *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1);
+ }
+ }
+ s[0] = s[1];
+ s[1] = s[2];
+ s[2] = s[3];
+
+ s[4] = s[5];
+ s[5] = s[6];
+ s[6] = s[7];
+ }
}
}
}
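
The vertical kernels above all finish with the same epilogue: add the rounding offset, arithmetic-shift the 16-bit sums, and pack with unsigned saturation down to 8-bit pixels. A minimal scalar sketch of that epilogue, assuming (as set up earlier in the function, outside this hunk) that right_shift holds FILTER_BITS - 1 and right_shift_const holds (1 << (FILTER_BITS - 1)) >> 1:

#include <stdint.h>
#define FILTER_BITS 7 /* libaom interpolation filter precision */

/* Scalar equivalent of the "shift by F - 1" rounding plus the
 * _mm256_packus_epi16 saturation applied after convolve_lowbd(). */
static inline uint8_t round_shift_to_u8(int32_t filtered) {
  const int shift = FILTER_BITS - 1;
  int32_t r = (filtered + ((1 << shift) >> 1)) >> shift;
  if (r < 0) r = 0;
  if (r > 255) r = 255;
  return (uint8_t)r;
}
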
@@ -178,81 +264,119 @@ void av1_convolve_x_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
int dst_stride, int w, int h,
const InterpFilterParams *filter_params_x,
const InterpFilterParams *filter_params_y,
- const int subpel_x_q4, const int subpel_y_q4,
+ const int subpel_x_qn, const int subpel_y_qn,
ConvolveParams *conv_params) {
- int i, j;
- const int fo_horiz = filter_params_x->taps / 2 - 1;
- const uint8_t *const src_ptr = src - fo_horiz;
const int bits = FILTER_BITS - conv_params->round_0;
- __m256i filt[4], coeffs[4];
-
- filt[0] = _mm256_load_si256((__m256i const *)filt_global_avx2);
- filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
- filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
- filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
-
- prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs);
-
const __m256i round_0_const =
_mm256_set1_epi16((1 << (conv_params->round_0 - 1)) >> 1);
const __m128i round_0_shift = _mm_cvtsi32_si128(conv_params->round_0 - 1);
const __m256i round_const = _mm256_set1_epi16((1 << bits) >> 1);
const __m128i round_shift = _mm_cvtsi32_si128(bits);
-
+ int i, is_horiz_4tap = 0;
(void)filter_params_y;
- (void)subpel_y_q4;
+ (void)subpel_y_qn;
assert(bits >= 0);
assert((FILTER_BITS - conv_params->round_1) >= 0 ||
((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
assert(conv_params->round_0 > 0);
- if (w <= 8) {
- for (i = 0; i < h; i += 2) {
- const __m256i data = _mm256_permute2x128_si256(
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))),
- _mm256_castsi128_si256(_mm_loadu_si128(
- (__m128i *)(&src_ptr[i * src_stride + src_stride]))),
- 0x20);
-
- __m256i res_16b = convolve_lowbd_x(data, coeffs, filt);
-
- res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const),
- round_0_shift);
-
- res_16b =
- _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const), round_shift);
-
- /* rounding code */
- // 8 bit conversion and saturation to uint8
- __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);
-
- const __m128i res_0 = _mm256_castsi256_si128(res_8b);
- const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);
- if (w > 4) {
- _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0);
- _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1);
- } else if (w > 2) {
- xx_storel_32(&dst[i * dst_stride], res_0);
- xx_storel_32(&dst[i * dst_stride + dst_stride], res_1);
- } else {
- __m128i *const p_0 = (__m128i *)&dst[i * dst_stride];
- __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + dst_stride];
- *(uint16_t *)p_0 = _mm_cvtsi128_si32(res_0);
- *(uint16_t *)p_1 = _mm_cvtsi128_si32(res_1);
+ __m256i coeffs[4], filt[4];
+ filt[0] = _mm256_load_si256((__m256i const *)(filt_global_avx2));
+ filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
+
+ prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs);
+
+ // Condition for checking valid horz_filt taps
+ if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs[0], coeffs[3]), 0)))
+ is_horiz_4tap = 1;
+
+ // horz_filt as 4 tap
+ if (is_horiz_4tap) {
+ const int fo_horiz = 1;
+ const uint8_t *const src_ptr = src - fo_horiz;
+ if (w <= 8) {
+ for (i = 0; i < h; i += 2) {
+ const __m256i data = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))),
+ _mm256_castsi128_si256(_mm_loadu_si128(
+ (__m128i *)(&src_ptr[i * src_stride + src_stride]))),
+ 0x20);
+
+ __m256i res_16b = convolve_lowbd_x_4tap(data, coeffs + 1, filt);
+
+ res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const),
+ round_0_shift);
+
+ res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const),
+ round_shift);
+
+ /* rounding code */
+ // 8 bit conversion and saturation to uint8
+ __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);
+
+ const __m128i res_0 = _mm256_castsi256_si128(res_8b);
+ const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);
+
+ if (w > 4) {
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0);
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1);
+ } else if (w > 2) {
+ xx_storel_32(&dst[i * dst_stride], res_0);
+ xx_storel_32(&dst[i * dst_stride + dst_stride], res_1);
+ } else {
+ __m128i *const p_0 = (__m128i *)&dst[i * dst_stride];
+ __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + dst_stride];
+ *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0);
+ *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1);
+ }
+ }
+ } else {
+ for (i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 16) {
+ // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 16 17
+ // 18 19 20 21 22 23
+ const __m256i data = _mm256_inserti128_si256(
+ _mm256_loadu_si256((__m256i *)&src_ptr[(i * src_stride) + j]),
+ _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + (j + 8)]),
+ 1);
+
+ __m256i res_16b = convolve_lowbd_x_4tap(data, coeffs + 1, filt);
+
+ res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const),
+ round_0_shift);
+
+ res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const),
+ round_shift);
+
+ /* rounding code */
+ // 8 bit conversion and saturation to uint8
+ __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);
+
+ // Store values into the destination buffer
+ // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+ res_8b = _mm256_permute4x64_epi64(res_8b, 216);
+ __m128i res = _mm256_castsi256_si128(res_8b);
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res);
+ }
}
}
} else {
- for (i = 0; i < h; ++i) {
- for (j = 0; j < w; j += 16) {
- // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 16 17 18
- // 19 20 21 22 23
- const __m256i data = _mm256_inserti128_si256(
- _mm256_loadu_si256((__m256i *)&src_ptr[(i * src_stride) + j]),
- _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + (j + 8)]),
- 1);
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const uint8_t *const src_ptr = src - fo_horiz;
+ filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
+ filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
+
+ if (w <= 8) {
+ for (i = 0; i < h; i += 2) {
+ const __m256i data = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))),
+ _mm256_castsi128_si256(_mm_loadu_si128(
+ (__m128i *)(&src_ptr[i * src_stride + src_stride]))),
+ 0x20);
__m256i res_16b = convolve_lowbd_x(data, coeffs, filt);
@@ -266,11 +390,49 @@ void av1_convolve_x_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
// 8 bit conversion and saturation to uint8
__m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);
- // Store values into the destination buffer
- // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
- res_8b = _mm256_permute4x64_epi64(res_8b, 216);
- __m128i res = _mm256_castsi256_si128(res_8b);
- _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res);
+ const __m128i res_0 = _mm256_castsi256_si128(res_8b);
+ const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);
+ if (w > 4) {
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0);
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1);
+ } else if (w > 2) {
+ xx_storel_32(&dst[i * dst_stride], res_0);
+ xx_storel_32(&dst[i * dst_stride + dst_stride], res_1);
+ } else {
+ __m128i *const p_0 = (__m128i *)&dst[i * dst_stride];
+ __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + dst_stride];
+ *(uint16_t *)p_0 = _mm_cvtsi128_si32(res_0);
+ *(uint16_t *)p_1 = _mm_cvtsi128_si32(res_1);
+ }
+ }
+ } else {
+ for (i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 16) {
+ // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 16 17
+ // 18 19 20 21 22 23
+ const __m256i data = _mm256_inserti128_si256(
+ _mm256_loadu_si256((__m256i *)&src_ptr[(i * src_stride) + j]),
+ _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + (j + 8)]),
+ 1);
+
+ __m256i res_16b = convolve_lowbd_x(data, coeffs, filt);
+
+ res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const),
+ round_0_shift);
+
+ res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const),
+ round_shift);
+
+ /* rounding code */
+ // 8 bit conversion and saturation to uint8
+ __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);
+
+ // Store values into the destination buffer
+ // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+ res_8b = _mm256_permute4x64_epi64(res_8b, 216);
+ __m128i res = _mm256_castsi256_si128(res_8b);
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res);
+ }
}
}
}
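
Two things change throughout av1_convolve_x_sr_avx2 (and the vertical kernel above it). First, the sub-pel parameters are renamed from subpel_x_q4/subpel_y_q4 to subpel_x_qn/subpel_y_qn; in these hunks only the names change, the values are still the sub-pel filter phase. Second, a 4-tap fast path is added: prepare_coeffs_lowbd packs the eight taps pairwise into coeffs[0]..coeffs[3], so if OR-ing coeffs[0] and coeffs[3] gives zero the outer taps are all zero and only the inner four taps (coeffs + 1) need to be applied. A scalar sketch of the same test, with the pairwise packing treated as an assumption here since prepare_coeffs_lowbd is defined elsewhere:

#include <stdint.h>

/* filter[] holds the 8 signed taps selected for the current sub-pel
 * phase; the kernel is effectively 4-tap when the outer taps vanish. */
static int is_effectively_4tap(const int16_t filter[8]) {
  return (filter[0] | filter[1] | filter[6] | filter[7]) == 0;
}
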
diff --git a/media/libaom/src/av1/common/x86/convolve_sse2.c b/media/libaom/src/av1/common/x86/convolve_sse2.c
index 5016642de..4323ac4d1 100644
--- a/media/libaom/src/av1/common/x86/convolve_sse2.c
+++ b/media/libaom/src/av1/common/x86/convolve_sse2.c
@@ -79,7 +79,7 @@ void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst,
int dst_stride, int w, int h,
const InterpFilterParams *filter_params_x,
const InterpFilterParams *filter_params_y,
- const int subpel_x_q4, const int subpel_y_q4,
+ const int subpel_x_qn, const int subpel_y_qn,
ConvolveParams *conv_params) {
const int fo_vert = filter_params_y->taps / 2 - 1;
const uint8_t *src_ptr = src - fo_vert * src_stride;
@@ -88,14 +88,14 @@ void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst,
__m128i coeffs[4];
(void)filter_params_x;
- (void)subpel_x_q4;
+ (void)subpel_x_qn;
(void)conv_params;
assert(conv_params->round_0 <= FILTER_BITS);
assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) ||
((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS)));
- prepare_coeffs(filter_params_y, subpel_y_q4, coeffs);
+ prepare_coeffs(filter_params_y, subpel_y_qn, coeffs);
if (w <= 4) {
__m128i s[8], src6, res, res_round, res16;
@@ -132,7 +132,7 @@ void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst,
res_int = _mm_cvtsi128_si32(_mm_packus_epi16(res16, res16));
if (w == 2)
- *(uint16_t *)dst = res_int;
+ *(uint16_t *)dst = (uint16_t)res_int;
else
*(uint32_t *)dst = res_int;
@@ -145,7 +145,7 @@ void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst,
res_int = _mm_cvtsi128_si32(_mm_packus_epi16(res16, res16));
if (w == 2)
- *(uint16_t *)dst = res_int;
+ *(uint16_t *)dst = (uint16_t)res_int;
else
*(uint32_t *)dst = res_int;
@@ -240,7 +240,7 @@ void av1_convolve_x_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst,
int dst_stride, int w, int h,
const InterpFilterParams *filter_params_x,
const InterpFilterParams *filter_params_y,
- const int subpel_x_q4, const int subpel_y_q4,
+ const int subpel_x_qn, const int subpel_y_qn,
ConvolveParams *conv_params) {
const int fo_horiz = filter_params_x->taps / 2 - 1;
const uint8_t *src_ptr = src - fo_horiz;
@@ -253,13 +253,13 @@ void av1_convolve_x_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst,
__m128i coeffs[4];
(void)filter_params_y;
- (void)subpel_y_q4;
+ (void)subpel_y_qn;
assert(bits >= 0);
assert((FILTER_BITS - conv_params->round_1) >= 0 ||
((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
- prepare_coeffs(filter_params_x, subpel_x_q4, coeffs);
+ prepare_coeffs(filter_params_x, subpel_x_qn, coeffs);
if (w <= 4) {
do {
@@ -284,7 +284,7 @@ void av1_convolve_x_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst,
uint32_t r = _mm_cvtsi128_si32(res);
if (w == 2)
- *(uint16_t *)dst = r;
+ *(uint16_t *)dst = (uint16_t)r;
else
*(uint32_t *)dst = r;
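
The (uint16_t) casts added to the w == 2 stores do not change behavior: _mm_cvtsi128_si32 returns four packed output pixels in a 32-bit value, and a two-pixel-wide block only writes the low two bytes, so the cast makes the intended truncation explicit and quiets implicit-conversion warnings. A scalar sketch of that narrow store (names here are illustrative; the kernel itself writes through a uint16_t pointer):

#include <stdint.h>
#include <string.h>

/* 'packed' is the low 32 bits of the packus result (4 uint8 pixels);
 * for w == 2 only the first two bytes reach the destination. */
static void store_2_pixels(uint8_t *dst, uint32_t packed) {
  const uint16_t two = (uint16_t)packed; /* explicit truncation, as in the patch */
  memcpy(dst, &two, sizeof(two));
}
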
diff --git a/media/libaom/src/av1/common/x86/filterintra_sse4.c b/media/libaom/src/av1/common/x86/filterintra_sse4.c
index c11edc1d4..99f4d9967 100644
--- a/media/libaom/src/av1/common/x86/filterintra_sse4.c
+++ b/media/libaom/src/av1/common/x86/filterintra_sse4.c
@@ -27,10 +27,6 @@ void av1_filter_intra_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride,
assert(bw <= 32 && bh <= 32);
- // The initialization is just for silencing Jenkins static analysis warnings
- for (r = 0; r < bh + 1; ++r)
- memset(buffer[r], 0, (bw + 1) * sizeof(buffer[0][0]));
-
for (r = 0; r < bh; ++r) buffer[r + 1][0] = left[r];
memcpy(buffer[0], &above[-1], (bw + 1) * sizeof(uint8_t));
diff --git a/media/libaom/src/av1/common/x86/highbd_convolve_2d_avx2.c b/media/libaom/src/av1/common/x86/highbd_convolve_2d_avx2.c
index ae68f0bbb..396aed01b 100644
--- a/media/libaom/src/av1/common/x86/highbd_convolve_2d_avx2.c
+++ b/media/libaom/src/av1/common/x86/highbd_convolve_2d_avx2.c
@@ -24,8 +24,8 @@ void av1_highbd_convolve_2d_sr_avx2(const uint16_t *src, int src_stride,
uint16_t *dst, int dst_stride, int w, int h,
const InterpFilterParams *filter_params_x,
const InterpFilterParams *filter_params_y,
- const int subpel_x_q4,
- const int subpel_y_q4,
+ const int subpel_x_qn,
+ const int subpel_y_qn,
ConvolveParams *conv_params, int bd) {
DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);
int im_h = h + filter_params_y->taps - 1;
@@ -58,8 +58,8 @@ void av1_highbd_convolve_2d_sr_avx2(const uint16_t *src, int src_stride,
_mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
const __m256i zero = _mm256_setzero_si256();
- prepare_coeffs(filter_params_x, subpel_x_q4, coeffs_x);
- prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y);
+ prepare_coeffs(filter_params_x, subpel_x_qn, coeffs_x);
+ prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y);
for (j = 0; j < w; j += 8) {
/* Horizontal filter */
@@ -222,12 +222,12 @@ static INLINE void copy_128(const uint16_t *src, uint16_t *dst) {
void av1_highbd_convolve_2d_copy_sr_avx2(
const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
int h, const InterpFilterParams *filter_params_x,
- const InterpFilterParams *filter_params_y, const int subpel_x_q4,
- const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
+ const InterpFilterParams *filter_params_y, const int subpel_x_qn,
+ const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
(void)filter_params_x;
(void)filter_params_y;
- (void)subpel_x_q4;
- (void)subpel_y_q4;
+ (void)subpel_x_qn;
+ (void)subpel_y_qn;
(void)conv_params;
(void)bd;
@@ -238,10 +238,10 @@ void av1_highbd_convolve_2d_copy_sr_avx2(
if (w == 2) {
do {
- memcpy(dst, src, 2 * sizeof(*src));
+ memmove(dst, src, 2 * sizeof(*src));
src += src_stride;
dst += dst_stride;
- memcpy(dst, src, 2 * sizeof(*src));
+ memmove(dst, src, 2 * sizeof(*src));
src += src_stride;
dst += dst_stride;
h -= 2;
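
In the high-bitdepth copy kernel the two-sample rows switch from memcpy to memmove. For a fixed 2 * sizeof(uint16_t) copy the generated code is typically identical; memmove is simply the safer contract, being defined even for overlapping source and destination buffers, which is presumably the motivation here.
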
diff --git a/media/libaom/src/av1/common/x86/highbd_convolve_2d_sse2.c b/media/libaom/src/av1/common/x86/highbd_convolve_2d_sse2.c
index 15f8872c1..f758775ee 100644
--- a/media/libaom/src/av1/common/x86/highbd_convolve_2d_sse2.c
+++ b/media/libaom/src/av1/common/x86/highbd_convolve_2d_sse2.c
@@ -74,12 +74,12 @@ static INLINE void copy_128(const uint16_t *src, uint16_t *dst) {
void av1_highbd_convolve_2d_copy_sr_sse2(
const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
int h, const InterpFilterParams *filter_params_x,
- const InterpFilterParams *filter_params_y, const int subpel_x_q4,
- const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
+ const InterpFilterParams *filter_params_y, const int subpel_x_qn,
+ const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
(void)filter_params_x;
(void)filter_params_y;
- (void)subpel_x_q4;
- (void)subpel_y_q4;
+ (void)subpel_x_qn;
+ (void)subpel_y_qn;
(void)conv_params;
(void)bd;
if (w >= 16) {
diff --git a/media/libaom/src/av1/common/x86/highbd_convolve_2d_sse4.c b/media/libaom/src/av1/common/x86/highbd_convolve_2d_sse4.c
index 3f8dafb4b..d2ff47c1f 100644
--- a/media/libaom/src/av1/common/x86/highbd_convolve_2d_sse4.c
+++ b/media/libaom/src/av1/common/x86/highbd_convolve_2d_sse4.c
@@ -21,23 +21,23 @@
#include "aom_dsp/x86/convolve_sse4_1.h"
#include "av1/common/convolve.h"
-void av1_highbd_jnt_convolve_2d_copy_sse4_1(
+void av1_highbd_dist_wtd_convolve_2d_copy_sse4_1(
const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
int h, const InterpFilterParams *filter_params_x,
- const InterpFilterParams *filter_params_y, const int subpel_x_q4,
- const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
+ const InterpFilterParams *filter_params_y, const int subpel_x_qn,
+ const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
CONV_BUF_TYPE *dst = conv_params->dst;
int dst_stride = conv_params->dst_stride;
(void)filter_params_x;
(void)filter_params_y;
- (void)subpel_x_q4;
- (void)subpel_y_q4;
+ (void)subpel_x_qn;
+ (void)subpel_y_qn;
const int bits =
FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
const __m128i left_shift = _mm_cvtsi32_si128(bits);
const int do_average = conv_params->do_average;
- const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+ const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
const int w0 = conv_params->fwd_offset;
const int w1 = conv_params->bck_offset;
const __m128i wt0 = _mm_set1_epi32(w0);
@@ -75,15 +75,17 @@ void av1_highbd_jnt_convolve_2d_copy_sse4_1(
const __m128i res_unsigned_lo =
_mm_add_epi32(res_32b_lo, offset_const);
- const __m128i comp_avg_res_lo = highbd_comp_avg_sse4_1(
- &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
+ const __m128i comp_avg_res_lo =
+ highbd_comp_avg_sse4_1(&data_ref_0_lo, &res_unsigned_lo, &wt0,
+ &wt1, use_dist_wtd_comp_avg);
const __m128i res_32b_hi = _mm_unpackhi_epi16(res, zero);
const __m128i res_unsigned_hi =
_mm_add_epi32(res_32b_hi, offset_const);
- const __m128i comp_avg_res_hi = highbd_comp_avg_sse4_1(
- &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg);
+ const __m128i comp_avg_res_hi =
+ highbd_comp_avg_sse4_1(&data_ref_0_hi, &res_unsigned_hi, &wt0,
+ &wt1, use_dist_wtd_comp_avg);
const __m128i round_result_lo = highbd_convolve_rounding_sse2(
&comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift);
@@ -132,9 +134,9 @@ void av1_highbd_jnt_convolve_2d_copy_sse4_1(
_mm_add_epi32(res_32b_hi, offset_const);
const __m128i comp_avg_res_lo = highbd_comp_avg_sse4_1(
- &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
+ &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_dist_wtd_comp_avg);
const __m128i comp_avg_res_hi = highbd_comp_avg_sse4_1(
- &data_ref_1, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg);
+ &data_ref_1, &res_unsigned_hi, &wt0, &wt1, use_dist_wtd_comp_avg);
const __m128i round_result_lo = highbd_convolve_rounding_sse2(
&comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift);
@@ -166,11 +168,11 @@ void av1_highbd_jnt_convolve_2d_copy_sse4_1(
}
}
-void av1_highbd_jnt_convolve_2d_sse4_1(
+void av1_highbd_dist_wtd_convolve_2d_sse4_1(
const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
int h, const InterpFilterParams *filter_params_x,
- const InterpFilterParams *filter_params_y, const int subpel_x_q4,
- const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
+ const InterpFilterParams *filter_params_y, const int subpel_x_qn,
+ const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
DECLARE_ALIGNED(16, int16_t,
im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]);
CONV_BUF_TYPE *dst = conv_params->dst;
@@ -179,7 +181,7 @@ void av1_highbd_jnt_convolve_2d_sse4_1(
int im_stride = MAX_SB_SIZE;
int i, j;
const int do_average = conv_params->do_average;
- const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+ const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
const int fo_vert = filter_params_y->taps / 2 - 1;
const int fo_horiz = filter_params_x->taps / 2 - 1;
const uint16_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
@@ -206,7 +208,7 @@ void av1_highbd_jnt_convolve_2d_sse4_1(
/* Horizontal filter */
{
const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
- filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+ filter_params_x, subpel_x_qn & SUBPEL_MASK);
const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter);
// coeffs 0 1 0 1 2 3 2 3
@@ -273,7 +275,7 @@ void av1_highbd_jnt_convolve_2d_sse4_1(
/* Vertical filter */
{
const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
- filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+ filter_params_y, subpel_y_qn & SUBPEL_MASK);
const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter);
// coeffs 0 1 0 1 2 3 2 3
@@ -359,8 +361,9 @@ void av1_highbd_jnt_convolve_2d_sse4_1(
const __m128i data_ref_0 = _mm_cvtepu16_epi32(data_0);
- const __m128i comp_avg_res = highbd_comp_avg_sse4_1(
- &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
+ const __m128i comp_avg_res =
+ highbd_comp_avg_sse4_1(&data_ref_0, &res_unsigned_lo, &wt0,
+ &wt1, use_dist_wtd_comp_avg);
const __m128i round_result = highbd_convolve_rounding_sse2(
&comp_avg_res, &offset_const, &rounding_const, rounding_shift);
@@ -391,10 +394,12 @@ void av1_highbd_jnt_convolve_2d_sse4_1(
const __m128i data_ref_0_lo = _mm_cvtepu16_epi32(data_lo);
const __m128i data_ref_0_hi = _mm_cvtepu16_epi32(data_hi);
- const __m128i comp_avg_res_lo = highbd_comp_avg_sse4_1(
- &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
- const __m128i comp_avg_res_hi = highbd_comp_avg_sse4_1(
- &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg);
+ const __m128i comp_avg_res_lo =
+ highbd_comp_avg_sse4_1(&data_ref_0_lo, &res_unsigned_lo, &wt0,
+ &wt1, use_dist_wtd_comp_avg);
+ const __m128i comp_avg_res_hi =
+ highbd_comp_avg_sse4_1(&data_ref_0_hi, &res_unsigned_hi, &wt0,
+ &wt1, use_dist_wtd_comp_avg);
const __m128i round_result_lo =
highbd_convolve_rounding_sse2(&comp_avg_res_lo, &offset_const,
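
The av1_highbd_jnt_convolve_2d* entry points are renamed to av1_highbd_dist_wtd_convolve_2d*, and use_jnt_comp_avg becomes use_dist_wtd_comp_avg: "joint" compound prediction is now consistently called distance-weighted compound. The blend done by highbd_comp_avg_sse4_1 is unchanged; with the flag set it mixes the two predictions using the forward/backward weights, otherwise it takes the plain average. A scalar sketch, with DIST_PRECISION_BITS = 4 (the usual weight precision, so w0 + w1 == 16) stated as an assumption:

#include <stdint.h>

#define DIST_PRECISION_BITS 4 /* assumed: fwd + bck weights sum to 16 */

/* Compound average selected by use_dist_wtd_comp_avg. */
static int32_t comp_avg(int32_t ref, int32_t res, int w0, int w1,
                        int use_dist_wtd_comp_avg) {
  if (use_dist_wtd_comp_avg)
    return (ref * w0 + res * w1) >> DIST_PRECISION_BITS; /* weighted blend */
  return (ref + res) >> 1;                               /* plain average */
}
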
diff --git a/media/libaom/src/av1/common/x86/highbd_convolve_2d_ssse3.c b/media/libaom/src/av1/common/x86/highbd_convolve_2d_ssse3.c
index 1d029db39..5318fcaa8 100644
--- a/media/libaom/src/av1/common/x86/highbd_convolve_2d_ssse3.c
+++ b/media/libaom/src/av1/common/x86/highbd_convolve_2d_ssse3.c
@@ -22,8 +22,8 @@
void av1_highbd_convolve_2d_sr_ssse3(
const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
int h, const InterpFilterParams *filter_params_x,
- const InterpFilterParams *filter_params_y, const int subpel_x_q4,
- const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
+ const InterpFilterParams *filter_params_y, const int subpel_x_qn,
+ const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);
int im_h = h + filter_params_y->taps - 1;
int im_stride = 8;
@@ -54,8 +54,8 @@ void av1_highbd_convolve_2d_sr_ssse3(
_mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
const __m128i zero = _mm_setzero_si128();
- prepare_coeffs(filter_params_x, subpel_x_q4, coeffs_x);
- prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y);
+ prepare_coeffs(filter_params_x, subpel_x_qn, coeffs_x);
+ prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y);
for (j = 0; j < w; j += 8) {
/* Horizontal filter */
diff --git a/media/libaom/src/av1/common/x86/highbd_inv_txfm_avx2.c b/media/libaom/src/av1/common/x86/highbd_inv_txfm_avx2.c
index ade2af03e..93e98e4b3 100644
--- a/media/libaom/src/av1/common/x86/highbd_inv_txfm_avx2.c
+++ b/media/libaom/src/av1/common/x86/highbd_inv_txfm_avx2.c
@@ -18,6 +18,7 @@
#include "av1/common/idct.h"
#include "av1/common/x86/av1_inv_txfm_ssse3.h"
#include "av1/common/x86/highbd_txfm_utility_sse4.h"
+#include "aom_dsp/x86/txfm_common_avx2.h"
// Note:
// Total 32x4 registers to represent 32x32 block coefficients.
@@ -46,6 +47,47 @@ static INLINE __m256i highbd_clamp_epi16_avx2(__m256i u, int bd) {
return clamped;
}
+static INLINE void round_shift_4x4_avx2(__m256i *in, int shift) {
+ if (shift != 0) {
+ __m256i rnding = _mm256_set1_epi32(1 << (shift - 1));
+ in[0] = _mm256_add_epi32(in[0], rnding);
+ in[1] = _mm256_add_epi32(in[1], rnding);
+ in[2] = _mm256_add_epi32(in[2], rnding);
+ in[3] = _mm256_add_epi32(in[3], rnding);
+
+ in[0] = _mm256_srai_epi32(in[0], shift);
+ in[1] = _mm256_srai_epi32(in[1], shift);
+ in[2] = _mm256_srai_epi32(in[2], shift);
+ in[3] = _mm256_srai_epi32(in[3], shift);
+ }
+}
+
+static INLINE void round_shift_8x8_avx2(__m256i *in, int shift) {
+ round_shift_4x4_avx2(in, shift);
+ round_shift_4x4_avx2(in + 4, shift);
+ round_shift_4x4_avx2(in + 8, shift);
+ round_shift_4x4_avx2(in + 12, shift);
+}
+
+static void highbd_clamp_epi32_avx2(__m256i *in, __m256i *out,
+ const __m256i *clamp_lo,
+ const __m256i *clamp_hi, int size) {
+ __m256i a0, a1;
+ for (int i = 0; i < size; i += 4) {
+ a0 = _mm256_max_epi32(in[i], *clamp_lo);
+ out[i] = _mm256_min_epi32(a0, *clamp_hi);
+
+ a1 = _mm256_max_epi32(in[i + 1], *clamp_lo);
+ out[i + 1] = _mm256_min_epi32(a1, *clamp_hi);
+
+ a0 = _mm256_max_epi32(in[i + 2], *clamp_lo);
+ out[i + 2] = _mm256_min_epi32(a0, *clamp_hi);
+
+ a1 = _mm256_max_epi32(in[i + 3], *clamp_lo);
+ out[i + 3] = _mm256_min_epi32(a1, *clamp_hi);
+ }
+}
+
static INLINE __m256i highbd_get_recon_16x8_avx2(const __m256i pred,
__m256i res0, __m256i res1,
const int bd) {
@@ -72,30 +114,48 @@ static INLINE void highbd_write_buffer_16xn_avx2(__m256i *in, uint16_t *output,
_mm256_storeu_si256((__m256i *)(output + i * stride), u);
}
}
-
-static INLINE __m256i av1_round_shift_32_avx2(__m256i vec, int bit) {
- __m256i tmp, round;
- round = _mm256_set1_epi32(1 << (bit - 1));
- tmp = _mm256_add_epi32(vec, round);
- return _mm256_srai_epi32(tmp, bit);
+static INLINE __m256i highbd_get_recon_8x8_avx2(const __m256i pred, __m256i res,
+ const int bd) {
+ __m256i x0 = pred;
+ x0 = _mm256_add_epi32(res, x0);
+ x0 = _mm256_packus_epi32(x0, x0);
+ x0 = _mm256_permute4x64_epi64(x0, 0xd8);
+ x0 = highbd_clamp_epi16_avx2(x0, bd);
+ return x0;
}
-static INLINE void av1_round_shift_array_32_avx2(__m256i *input,
- __m256i *output,
- const int size,
- const int bit) {
- if (bit > 0) {
- int i;
- for (i = 0; i < size; i++) {
- output[i] = av1_round_shift_32_avx2(input[i], bit);
- }
- } else {
- int i;
- for (i = 0; i < size; i++) {
- output[i] = _mm256_slli_epi32(input[i], -bit);
- }
+static INLINE void highbd_write_buffer_8xn_avx2(__m256i *in, uint16_t *output,
+ int stride, int flipud,
+ int height, const int bd) {
+ int j = flipud ? (height - 1) : 0;
+ __m128i temp;
+ const int step = flipud ? -1 : 1;
+ for (int i = 0; i < height; ++i, j += step) {
+ temp = _mm_loadu_si128((__m128i const *)(output + i * stride));
+ __m256i v = _mm256_cvtepi16_epi32(temp);
+ __m256i u = highbd_get_recon_8x8_avx2(v, in[j], bd);
+ __m128i u1 = _mm256_castsi256_si128(u);
+ _mm_storeu_si128((__m128i *)(output + i * stride), u1);
}
}
+static void neg_shift_avx2(const __m256i in0, const __m256i in1, __m256i *out0,
+ __m256i *out1, const __m256i *clamp_lo,
+ const __m256i *clamp_hi, int shift) {
+ __m256i offset = _mm256_set1_epi32((1 << shift) >> 1);
+ __m256i a0 = _mm256_add_epi32(offset, in0);
+ __m256i a1 = _mm256_sub_epi32(offset, in1);
+
+ a0 = _mm256_sra_epi32(a0, _mm_cvtsi32_si128(shift));
+ a1 = _mm256_sra_epi32(a1, _mm_cvtsi32_si128(shift));
+
+ a0 = _mm256_max_epi32(a0, *clamp_lo);
+ a0 = _mm256_min_epi32(a0, *clamp_hi);
+ a1 = _mm256_max_epi32(a1, *clamp_lo);
+ a1 = _mm256_min_epi32(a1, *clamp_hi);
+
+ *out0 = a0;
+ *out1 = a1;
+}
static void transpose_8x8_avx2(const __m256i *in, __m256i *out) {
__m256i u0, u1, u2, u3, u4, u5, u6, u7;
@@ -134,6 +194,43 @@ static void transpose_8x8_avx2(const __m256i *in, __m256i *out) {
out[7] = _mm256_permute2f128_si256(x0, x1, 0x31);
}
+static void transpose_8x8_flip_avx2(const __m256i *in, __m256i *out) {
+ __m256i u0, u1, u2, u3, u4, u5, u6, u7;
+ __m256i x0, x1;
+
+ u0 = _mm256_unpacklo_epi32(in[7], in[6]);
+ u1 = _mm256_unpackhi_epi32(in[7], in[6]);
+
+ u2 = _mm256_unpacklo_epi32(in[5], in[4]);
+ u3 = _mm256_unpackhi_epi32(in[5], in[4]);
+
+ u4 = _mm256_unpacklo_epi32(in[3], in[2]);
+ u5 = _mm256_unpackhi_epi32(in[3], in[2]);
+
+ u6 = _mm256_unpacklo_epi32(in[1], in[0]);
+ u7 = _mm256_unpackhi_epi32(in[1], in[0]);
+
+ x0 = _mm256_unpacklo_epi64(u0, u2);
+ x1 = _mm256_unpacklo_epi64(u4, u6);
+ out[0] = _mm256_permute2f128_si256(x0, x1, 0x20);
+ out[4] = _mm256_permute2f128_si256(x0, x1, 0x31);
+
+ x0 = _mm256_unpackhi_epi64(u0, u2);
+ x1 = _mm256_unpackhi_epi64(u4, u6);
+ out[1] = _mm256_permute2f128_si256(x0, x1, 0x20);
+ out[5] = _mm256_permute2f128_si256(x0, x1, 0x31);
+
+ x0 = _mm256_unpacklo_epi64(u1, u3);
+ x1 = _mm256_unpacklo_epi64(u5, u7);
+ out[2] = _mm256_permute2f128_si256(x0, x1, 0x20);
+ out[6] = _mm256_permute2f128_si256(x0, x1, 0x31);
+
+ x0 = _mm256_unpackhi_epi64(u1, u3);
+ x1 = _mm256_unpackhi_epi64(u5, u7);
+ out[3] = _mm256_permute2f128_si256(x0, x1, 0x20);
+ out[7] = _mm256_permute2f128_si256(x0, x1, 0x31);
+}
+
static void load_buffer_32x32(const int32_t *coeff, __m256i *in,
int input_stiride, int size) {
int i;
@@ -179,36 +276,6 @@ static void addsub_avx2(const __m256i in0, const __m256i in1, __m256i *out0,
*out1 = a1;
}
-static void addsub_no_clamp_avx2(const __m256i in0, const __m256i in1,
- __m256i *out0, __m256i *out1) {
- __m256i a0 = _mm256_add_epi32(in0, in1);
- __m256i a1 = _mm256_sub_epi32(in0, in1);
-
- *out0 = a0;
- *out1 = a1;
-}
-
-static void addsub_shift_avx2(const __m256i in0, const __m256i in1,
- __m256i *out0, __m256i *out1,
- const __m256i *clamp_lo, const __m256i *clamp_hi,
- int shift) {
- __m256i offset = _mm256_set1_epi32((1 << shift) >> 1);
- __m256i in0_w_offset = _mm256_add_epi32(in0, offset);
- __m256i a0 = _mm256_add_epi32(in0_w_offset, in1);
- __m256i a1 = _mm256_sub_epi32(in0_w_offset, in1);
-
- a0 = _mm256_sra_epi32(a0, _mm_cvtsi32_si128(shift));
- a1 = _mm256_sra_epi32(a1, _mm_cvtsi32_si128(shift));
-
- a0 = _mm256_max_epi32(a0, *clamp_lo);
- a0 = _mm256_min_epi32(a0, *clamp_hi);
- a1 = _mm256_max_epi32(a1, *clamp_lo);
- a1 = _mm256_min_epi32(a1, *clamp_hi);
-
- *out0 = a0;
- *out1 = a1;
-}
-
static INLINE void idct32_stage4_avx2(
__m256i *bf1, const __m256i *cospim8, const __m256i *cospi56,
const __m256i *cospi8, const __m256i *cospim56, const __m256i *cospim40,
@@ -344,63 +411,32 @@ static INLINE void idct32_stage8_avx2(__m256i *bf1, const __m256i *cospim32,
static INLINE void idct32_stage9_avx2(__m256i *bf1, __m256i *out,
const int do_cols, const int bd,
const int out_shift,
- const int log_range) {
- if (do_cols) {
- addsub_no_clamp_avx2(bf1[0], bf1[31], out + 0, out + 31);
- addsub_no_clamp_avx2(bf1[1], bf1[30], out + 1, out + 30);
- addsub_no_clamp_avx2(bf1[2], bf1[29], out + 2, out + 29);
- addsub_no_clamp_avx2(bf1[3], bf1[28], out + 3, out + 28);
- addsub_no_clamp_avx2(bf1[4], bf1[27], out + 4, out + 27);
- addsub_no_clamp_avx2(bf1[5], bf1[26], out + 5, out + 26);
- addsub_no_clamp_avx2(bf1[6], bf1[25], out + 6, out + 25);
- addsub_no_clamp_avx2(bf1[7], bf1[24], out + 7, out + 24);
- addsub_no_clamp_avx2(bf1[8], bf1[23], out + 8, out + 23);
- addsub_no_clamp_avx2(bf1[9], bf1[22], out + 9, out + 22);
- addsub_no_clamp_avx2(bf1[10], bf1[21], out + 10, out + 21);
- addsub_no_clamp_avx2(bf1[11], bf1[20], out + 11, out + 20);
- addsub_no_clamp_avx2(bf1[12], bf1[19], out + 12, out + 19);
- addsub_no_clamp_avx2(bf1[13], bf1[18], out + 13, out + 18);
- addsub_no_clamp_avx2(bf1[14], bf1[17], out + 14, out + 17);
- addsub_no_clamp_avx2(bf1[15], bf1[16], out + 15, out + 16);
- } else {
+ const __m256i *clamp_lo,
+ const __m256i *clamp_hi) {
+ addsub_avx2(bf1[0], bf1[31], out + 0, out + 31, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[1], bf1[30], out + 1, out + 30, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[2], bf1[29], out + 2, out + 29, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[3], bf1[28], out + 3, out + 28, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[4], bf1[27], out + 4, out + 27, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[5], bf1[26], out + 5, out + 26, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[6], bf1[25], out + 6, out + 25, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[7], bf1[24], out + 7, out + 24, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[8], bf1[23], out + 8, out + 23, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[9], bf1[22], out + 9, out + 22, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[10], bf1[21], out + 10, out + 21, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[11], bf1[20], out + 11, out + 20, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[12], bf1[19], out + 12, out + 19, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[13], bf1[18], out + 13, out + 18, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[14], bf1[17], out + 14, out + 17, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[15], bf1[16], out + 15, out + 16, clamp_lo, clamp_hi);
+ if (!do_cols) {
const int log_range_out = AOMMAX(16, bd + 6);
- const __m256i clamp_lo_out = _mm256_set1_epi32(AOMMAX(
- -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
- const __m256i clamp_hi_out = _mm256_set1_epi32(AOMMIN(
- (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
-
- addsub_shift_avx2(bf1[0], bf1[31], out + 0, out + 31, &clamp_lo_out,
- &clamp_hi_out, out_shift);
- addsub_shift_avx2(bf1[1], bf1[30], out + 1, out + 30, &clamp_lo_out,
- &clamp_hi_out, out_shift);
- addsub_shift_avx2(bf1[2], bf1[29], out + 2, out + 29, &clamp_lo_out,
- &clamp_hi_out, out_shift);
- addsub_shift_avx2(bf1[3], bf1[28], out + 3, out + 28, &clamp_lo_out,
- &clamp_hi_out, out_shift);
- addsub_shift_avx2(bf1[4], bf1[27], out + 4, out + 27, &clamp_lo_out,
- &clamp_hi_out, out_shift);
- addsub_shift_avx2(bf1[5], bf1[26], out + 5, out + 26, &clamp_lo_out,
- &clamp_hi_out, out_shift);
- addsub_shift_avx2(bf1[6], bf1[25], out + 6, out + 25, &clamp_lo_out,
- &clamp_hi_out, out_shift);
- addsub_shift_avx2(bf1[7], bf1[24], out + 7, out + 24, &clamp_lo_out,
- &clamp_hi_out, out_shift);
- addsub_shift_avx2(bf1[8], bf1[23], out + 8, out + 23, &clamp_lo_out,
- &clamp_hi_out, out_shift);
- addsub_shift_avx2(bf1[9], bf1[22], out + 9, out + 22, &clamp_lo_out,
- &clamp_hi_out, out_shift);
- addsub_shift_avx2(bf1[10], bf1[21], out + 10, out + 21, &clamp_lo_out,
- &clamp_hi_out, out_shift);
- addsub_shift_avx2(bf1[11], bf1[20], out + 11, out + 20, &clamp_lo_out,
- &clamp_hi_out, out_shift);
- addsub_shift_avx2(bf1[12], bf1[19], out + 12, out + 19, &clamp_lo_out,
- &clamp_hi_out, out_shift);
- addsub_shift_avx2(bf1[13], bf1[18], out + 13, out + 18, &clamp_lo_out,
- &clamp_hi_out, out_shift);
- addsub_shift_avx2(bf1[14], bf1[17], out + 14, out + 17, &clamp_lo_out,
- &clamp_hi_out, out_shift);
- addsub_shift_avx2(bf1[15], bf1[16], out + 15, out + 16, &clamp_lo_out,
- &clamp_hi_out, out_shift);
+ const __m256i clamp_lo_out = _mm256_set1_epi32(-(1 << (log_range_out - 1)));
+ const __m256i clamp_hi_out =
+ _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
+ round_shift_8x8_avx2(out, out_shift);
+ round_shift_8x8_avx2(out + 16, out_shift);
+ highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 32);
}
}
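
The stage-9 epilogue of the 32-point inverse transform is restructured: addsub_no_clamp_avx2 and addsub_shift_avx2 are dropped, the butterflies always go through the clamping addsub_avx2, and for the row pass (!do_cols) the output shift and final clamp are applied in bulk by round_shift_8x8_avx2 and highbd_clamp_epi32_avx2 over all 32 lanes. A scalar sketch of that new epilogue (the function name is illustrative):

#include <stdint.h>

/* Round-shift every output, then clamp to the signed range implied by
 * log_range_out (AOMMAX(16, bd + 6) in the kernels above). */
static void round_shift_and_clamp(int32_t *out, int n, int shift,
                                  int log_range_out) {
  const int32_t lo = -(1 << (log_range_out - 1));
  const int32_t hi = (1 << (log_range_out - 1)) - 1;
  for (int i = 0; i < n; ++i) {
    int32_t v = (out[i] + ((1 << shift) >> 1)) >> shift;
    out[i] = v < lo ? lo : (v > hi ? hi : v);
  }
}
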
@@ -410,8 +446,8 @@ static void idct32_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1));
const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
- const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
- const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
+ __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
+ __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
__m256i x;
// stage 0
// stage 1
@@ -427,22 +463,16 @@ static void idct32_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
// stage 7
// stage 8
// stage 9
- if (do_cols) {
- x = _mm256_max_epi32(x, clamp_lo);
- x = _mm256_min_epi32(x, clamp_hi);
- } else {
+ if (!do_cols) {
const int log_range_out = AOMMAX(16, bd + 6);
- const __m256i clamp_lo_out = _mm256_set1_epi32(AOMMAX(
- -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
- const __m256i clamp_hi_out = _mm256_set1_epi32(AOMMIN(
- (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
__m256i offset = _mm256_set1_epi32((1 << out_shift) >> 1);
+ clamp_lo = _mm256_set1_epi32(-(1 << (log_range_out - 1)));
+ clamp_hi = _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
x = _mm256_add_epi32(offset, x);
x = _mm256_sra_epi32(x, _mm_cvtsi32_si128(out_shift));
- x = _mm256_max_epi32(x, clamp_lo_out);
- x = _mm256_min_epi32(x, clamp_hi_out);
}
-
+ x = _mm256_max_epi32(x, clamp_lo);
+ x = _mm256_min_epi32(x, clamp_hi);
out[0] = x;
out[1] = x;
out[2] = x;
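
idct32_low1_avx2 (and the new idct16_low1_avx2 below) handle the DC-only case: with a single nonzero coefficient the whole inverse transform collapses to scaling in[0] by cospi[32], applying the extra output shift on the row pass, clamping once, and broadcasting that value to every output lane. A scalar sketch of the value being broadcast (the intermediate is widened to 64 bits here purely to keep the sketch free of overflow; the kernel works in 32-bit lanes):

#include <stdint.h>

/* DC-only path: one multiply, one rounding shift, optional out_shift,
 * one clamp; the caller replicates the result across all outputs. */
static int32_t idct_dc_only(int32_t dc, int32_t cospi32, int bit, int do_cols,
                            int out_shift, int32_t lo, int32_t hi) {
  int32_t x = (int32_t)(((int64_t)dc * cospi32 + (1 << (bit - 1))) >> bit);
  if (!do_cols) x = (x + ((1 << out_shift) >> 1)) >> out_shift;
  return x < lo ? lo : (x > hi ? hi : x);
}
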
@@ -586,7 +616,7 @@ static void idct32_low8_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
&rounding, bit);
// stage 9
- idct32_stage9_avx2(bf1, out, do_cols, bd, out_shift, log_range);
+ idct32_stage9_avx2(bf1, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi);
}
}
@@ -736,7 +766,7 @@ static void idct32_low16_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
&rounding, bit);
// stage 9
- idct32_stage9_avx2(bf1, out, do_cols, bd, out_shift, log_range);
+ idct32_stage9_avx2(bf1, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi);
}
}
@@ -1094,66 +1124,2958 @@ static void idct32_avx2(__m256i *in, __m256i *out, int bit, int do_cols, int bd,
bf0[31] = bf1[31];
// stage 9
+ addsub_avx2(bf0[0], bf0[31], out + 0, out + 31, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[1], bf0[30], out + 1, out + 30, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[2], bf0[29], out + 2, out + 29, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[3], bf0[28], out + 3, out + 28, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[4], bf0[27], out + 4, out + 27, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[5], bf0[26], out + 5, out + 26, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[6], bf0[25], out + 6, out + 25, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[7], bf0[24], out + 7, out + 24, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[8], bf0[23], out + 8, out + 23, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[9], bf0[22], out + 9, out + 22, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[10], bf0[21], out + 10, out + 21, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[11], bf0[20], out + 11, out + 20, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[12], bf0[19], out + 12, out + 19, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[13], bf0[18], out + 13, out + 18, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[14], bf0[17], out + 14, out + 17, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[15], bf0[16], out + 15, out + 16, &clamp_lo, &clamp_hi);
+ if (!do_cols) {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m256i clamp_lo_out =
+ _mm256_set1_epi32(-(1 << (log_range_out - 1)));
+ const __m256i clamp_hi_out =
+ _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
+ round_shift_8x8_avx2(out, out_shift);
+ round_shift_8x8_avx2(out + 16, out_shift);
+ highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 32);
+ }
+ }
+}
+static void idct16_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
+ int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
+ const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
+ __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
+
+ {
+ // stage 0
+ // stage 1
+ // stage 2
+ // stage 3
+ // stage 4
+ in[0] = _mm256_mullo_epi32(in[0], cospi32);
+ in[0] = _mm256_add_epi32(in[0], rnding);
+ in[0] = _mm256_srai_epi32(in[0], bit);
+
+ // stage 5
+ // stage 6
+ // stage 7
+ if (!do_cols) {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ clamp_lo = _mm256_set1_epi32(-(1 << (log_range_out - 1)));
+ clamp_hi = _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
+ __m256i offset = _mm256_set1_epi32((1 << out_shift) >> 1);
+ in[0] = _mm256_add_epi32(in[0], offset);
+ in[0] = _mm256_sra_epi32(in[0], _mm_cvtsi32_si128(out_shift));
+ }
+ in[0] = _mm256_max_epi32(in[0], clamp_lo);
+ in[0] = _mm256_min_epi32(in[0], clamp_hi);
+ out[0] = in[0];
+ out[1] = in[0];
+ out[2] = in[0];
+ out[3] = in[0];
+ out[4] = in[0];
+ out[5] = in[0];
+ out[6] = in[0];
+ out[7] = in[0];
+ out[8] = in[0];
+ out[9] = in[0];
+ out[10] = in[0];
+ out[11] = in[0];
+ out[12] = in[0];
+ out[13] = in[0];
+ out[14] = in[0];
+ out[15] = in[0];
+ }
+}
+
+static void idct16_low8_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
+ int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
+ const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
+ const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
+ const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
+ const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
+ const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
+ const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
+ const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
+ const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
+ const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
+ const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
+ const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
+ const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
+ const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
+ const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
+ const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
+ const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
+ const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
+ const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
+ __m256i u[16], x, y;
+
+ {
+ // stage 0
+ // stage 1
+ u[0] = in[0];
+ u[2] = in[4];
+ u[4] = in[2];
+ u[6] = in[6];
+ u[8] = in[1];
+ u[10] = in[5];
+ u[12] = in[3];
+ u[14] = in[7];
+
+ // stage 2
+ u[15] = half_btf_0_avx2(&cospi4, &u[8], &rnding, bit);
+ u[8] = half_btf_0_avx2(&cospi60, &u[8], &rnding, bit);
+
+ u[9] = half_btf_0_avx2(&cospim36, &u[14], &rnding, bit);
+ u[14] = half_btf_0_avx2(&cospi28, &u[14], &rnding, bit);
+
+ u[13] = half_btf_0_avx2(&cospi20, &u[10], &rnding, bit);
+ u[10] = half_btf_0_avx2(&cospi44, &u[10], &rnding, bit);
+
+ u[11] = half_btf_0_avx2(&cospim52, &u[12], &rnding, bit);
+ u[12] = half_btf_0_avx2(&cospi12, &u[12], &rnding, bit);
+
+ // stage 3
+ u[7] = half_btf_0_avx2(&cospi8, &u[4], &rnding, bit);
+ u[4] = half_btf_0_avx2(&cospi56, &u[4], &rnding, bit);
+ u[5] = half_btf_0_avx2(&cospim40, &u[6], &rnding, bit);
+ u[6] = half_btf_0_avx2(&cospi24, &u[6], &rnding, bit);
+
+ addsub_avx2(u[8], u[9], &u[8], &u[9], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[11], u[10], &u[11], &u[10], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[12], u[13], &u[12], &u[13], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[15], u[14], &u[15], &u[14], &clamp_lo, &clamp_hi);
+
+ // stage 4
+ x = _mm256_mullo_epi32(u[0], cospi32);
+ u[0] = _mm256_add_epi32(x, rnding);
+ u[0] = _mm256_srai_epi32(u[0], bit);
+ u[1] = u[0];
+
+ u[3] = half_btf_0_avx2(&cospi16, &u[2], &rnding, bit);
+ u[2] = half_btf_0_avx2(&cospi48, &u[2], &rnding, bit);
+
+ addsub_avx2(u[4], u[5], &u[4], &u[5], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[7], u[6], &u[7], &u[6], &clamp_lo, &clamp_hi);
+
+ x = half_btf_avx2(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
+ u[14] = half_btf_avx2(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
+ u[9] = x;
+ y = half_btf_avx2(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
+ u[13] = half_btf_avx2(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
+ u[10] = y;
+
+ // stage 5
+ addsub_avx2(u[0], u[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[1], u[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
+
+ x = _mm256_mullo_epi32(u[5], cospi32);
+ y = _mm256_mullo_epi32(u[6], cospi32);
+ u[5] = _mm256_sub_epi32(y, x);
+ u[5] = _mm256_add_epi32(u[5], rnding);
+ u[5] = _mm256_srai_epi32(u[5], bit);
+
+ u[6] = _mm256_add_epi32(y, x);
+ u[6] = _mm256_add_epi32(u[6], rnding);
+ u[6] = _mm256_srai_epi32(u[6], bit);
+
+ addsub_avx2(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
+
+ // stage 6
+ addsub_avx2(u[0], u[7], &u[0], &u[7], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[1], u[6], &u[1], &u[6], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[2], u[5], &u[2], &u[5], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[3], u[4], &u[3], &u[4], &clamp_lo, &clamp_hi);
+
+ x = _mm256_mullo_epi32(u[10], cospi32);
+ y = _mm256_mullo_epi32(u[13], cospi32);
+ u[10] = _mm256_sub_epi32(y, x);
+ u[10] = _mm256_add_epi32(u[10], rnding);
+ u[10] = _mm256_srai_epi32(u[10], bit);
+
+ u[13] = _mm256_add_epi32(x, y);
+ u[13] = _mm256_add_epi32(u[13], rnding);
+ u[13] = _mm256_srai_epi32(u[13], bit);
+
+ x = _mm256_mullo_epi32(u[11], cospi32);
+ y = _mm256_mullo_epi32(u[12], cospi32);
+ u[11] = _mm256_sub_epi32(y, x);
+ u[11] = _mm256_add_epi32(u[11], rnding);
+ u[11] = _mm256_srai_epi32(u[11], bit);
+
+ u[12] = _mm256_add_epi32(x, y);
+ u[12] = _mm256_add_epi32(u[12], rnding);
+ u[12] = _mm256_srai_epi32(u[12], bit);
+ // stage 7
+ addsub_avx2(u[0], u[15], out + 0, out + 15, &clamp_lo, &clamp_hi);
+ addsub_avx2(u[1], u[14], out + 1, out + 14, &clamp_lo, &clamp_hi);
+ addsub_avx2(u[2], u[13], out + 2, out + 13, &clamp_lo, &clamp_hi);
+ addsub_avx2(u[3], u[12], out + 3, out + 12, &clamp_lo, &clamp_hi);
+ addsub_avx2(u[4], u[11], out + 4, out + 11, &clamp_lo, &clamp_hi);
+ addsub_avx2(u[5], u[10], out + 5, out + 10, &clamp_lo, &clamp_hi);
+ addsub_avx2(u[6], u[9], out + 6, out + 9, &clamp_lo, &clamp_hi);
+ addsub_avx2(u[7], u[8], out + 7, out + 8, &clamp_lo, &clamp_hi);
+
+ if (!do_cols) {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m256i clamp_lo_out =
+ _mm256_set1_epi32(-(1 << (log_range_out - 1)));
+ const __m256i clamp_hi_out =
+ _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
+ round_shift_8x8_avx2(out, out_shift);
+ highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 16);
+ }
+ }
+}
+
+static void idct16_avx2(__m256i *in, __m256i *out, int bit, int do_cols, int bd,
+ int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
+ const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]);
+ const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
+ const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
+ const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
+ const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
+ const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]);
+ const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
+ const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
+ const __m256i cospi52 = _mm256_set1_epi32(cospi[52]);
+ const __m256i cospi36 = _mm256_set1_epi32(cospi[36]);
+ const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
+ const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
+ const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
+ const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
+ const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
+ const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
+ const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
+ const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
+ const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
+ const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
+ const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
+ const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
+ const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
+ const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
+ __m256i u[16], v[16], x, y;
+
+ {
+ // stage 0
+ // stage 1
+ u[0] = in[0];
+ u[1] = in[8];
+ u[2] = in[4];
+ u[3] = in[12];
+ u[4] = in[2];
+ u[5] = in[10];
+ u[6] = in[6];
+ u[7] = in[14];
+ u[8] = in[1];
+ u[9] = in[9];
+ u[10] = in[5];
+ u[11] = in[13];
+ u[12] = in[3];
+ u[13] = in[11];
+ u[14] = in[7];
+ u[15] = in[15];
+
+ // stage 2
+ v[0] = u[0];
+ v[1] = u[1];
+ v[2] = u[2];
+ v[3] = u[3];
+ v[4] = u[4];
+ v[5] = u[5];
+ v[6] = u[6];
+ v[7] = u[7];
+
+ v[8] = half_btf_avx2(&cospi60, &u[8], &cospim4, &u[15], &rnding, bit);
+ v[9] = half_btf_avx2(&cospi28, &u[9], &cospim36, &u[14], &rnding, bit);
+ v[10] = half_btf_avx2(&cospi44, &u[10], &cospim20, &u[13], &rnding, bit);
+ v[11] = half_btf_avx2(&cospi12, &u[11], &cospim52, &u[12], &rnding, bit);
+ v[12] = half_btf_avx2(&cospi52, &u[11], &cospi12, &u[12], &rnding, bit);
+ v[13] = half_btf_avx2(&cospi20, &u[10], &cospi44, &u[13], &rnding, bit);
+ v[14] = half_btf_avx2(&cospi36, &u[9], &cospi28, &u[14], &rnding, bit);
+ v[15] = half_btf_avx2(&cospi4, &u[8], &cospi60, &u[15], &rnding, bit);
+
+ // stage 3
+ u[0] = v[0];
+ u[1] = v[1];
+ u[2] = v[2];
+ u[3] = v[3];
+ u[4] = half_btf_avx2(&cospi56, &v[4], &cospim8, &v[7], &rnding, bit);
+ u[5] = half_btf_avx2(&cospi24, &v[5], &cospim40, &v[6], &rnding, bit);
+ u[6] = half_btf_avx2(&cospi40, &v[5], &cospi24, &v[6], &rnding, bit);
+ u[7] = half_btf_avx2(&cospi8, &v[4], &cospi56, &v[7], &rnding, bit);
+ addsub_avx2(v[8], v[9], &u[8], &u[9], &clamp_lo, &clamp_hi);
+ addsub_avx2(v[11], v[10], &u[11], &u[10], &clamp_lo, &clamp_hi);
+ addsub_avx2(v[12], v[13], &u[12], &u[13], &clamp_lo, &clamp_hi);
+ addsub_avx2(v[15], v[14], &u[15], &u[14], &clamp_lo, &clamp_hi);
+
+ // stage 4
+ x = _mm256_mullo_epi32(u[0], cospi32);
+ y = _mm256_mullo_epi32(u[1], cospi32);
+ v[0] = _mm256_add_epi32(x, y);
+ v[0] = _mm256_add_epi32(v[0], rnding);
+ v[0] = _mm256_srai_epi32(v[0], bit);
+
+ v[1] = _mm256_sub_epi32(x, y);
+ v[1] = _mm256_add_epi32(v[1], rnding);
+ v[1] = _mm256_srai_epi32(v[1], bit);
+
+ v[2] = half_btf_avx2(&cospi48, &u[2], &cospim16, &u[3], &rnding, bit);
+ v[3] = half_btf_avx2(&cospi16, &u[2], &cospi48, &u[3], &rnding, bit);
+ addsub_avx2(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi);
+ v[8] = u[8];
+ v[9] = half_btf_avx2(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
+ v[10] = half_btf_avx2(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
+ v[11] = u[11];
+ v[12] = u[12];
+ v[13] = half_btf_avx2(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
+ v[14] = half_btf_avx2(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
+ v[15] = u[15];
+
+ // stage 5
+ addsub_avx2(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
+ addsub_avx2(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
+ u[4] = v[4];
+
+ x = _mm256_mullo_epi32(v[5], cospi32);
+ y = _mm256_mullo_epi32(v[6], cospi32);
+ u[5] = _mm256_sub_epi32(y, x);
+ u[5] = _mm256_add_epi32(u[5], rnding);
+ u[5] = _mm256_srai_epi32(u[5], bit);
+
+ u[6] = _mm256_add_epi32(y, x);
+ u[6] = _mm256_add_epi32(u[6], rnding);
+ u[6] = _mm256_srai_epi32(u[6], bit);
+
+ u[7] = v[7];
+ addsub_avx2(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
+ addsub_avx2(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
+ addsub_avx2(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
+ addsub_avx2(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
+
+ // stage 6
+ addsub_avx2(u[0], u[7], &v[0], &v[7], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[1], u[6], &v[1], &v[6], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[2], u[5], &v[2], &v[5], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[3], u[4], &v[3], &v[4], &clamp_lo, &clamp_hi);
+ v[8] = u[8];
+ v[9] = u[9];
+
+ x = _mm256_mullo_epi32(u[10], cospi32);
+ y = _mm256_mullo_epi32(u[13], cospi32);
+ v[10] = _mm256_sub_epi32(y, x);
+ v[10] = _mm256_add_epi32(v[10], rnding);
+ v[10] = _mm256_srai_epi32(v[10], bit);
+
+ v[13] = _mm256_add_epi32(x, y);
+ v[13] = _mm256_add_epi32(v[13], rnding);
+ v[13] = _mm256_srai_epi32(v[13], bit);
+
+ x = _mm256_mullo_epi32(u[11], cospi32);
+ y = _mm256_mullo_epi32(u[12], cospi32);
+ v[11] = _mm256_sub_epi32(y, x);
+ v[11] = _mm256_add_epi32(v[11], rnding);
+ v[11] = _mm256_srai_epi32(v[11], bit);
+
+ v[12] = _mm256_add_epi32(x, y);
+ v[12] = _mm256_add_epi32(v[12], rnding);
+ v[12] = _mm256_srai_epi32(v[12], bit);
+
+ v[14] = u[14];
+ v[15] = u[15];
+
+ // stage 7
+ addsub_avx2(v[0], v[15], out + 0, out + 15, &clamp_lo, &clamp_hi);
+ addsub_avx2(v[1], v[14], out + 1, out + 14, &clamp_lo, &clamp_hi);
+ addsub_avx2(v[2], v[13], out + 2, out + 13, &clamp_lo, &clamp_hi);
+ addsub_avx2(v[3], v[12], out + 3, out + 12, &clamp_lo, &clamp_hi);
+ addsub_avx2(v[4], v[11], out + 4, out + 11, &clamp_lo, &clamp_hi);
+ addsub_avx2(v[5], v[10], out + 5, out + 10, &clamp_lo, &clamp_hi);
+ addsub_avx2(v[6], v[9], out + 6, out + 9, &clamp_lo, &clamp_hi);
+ addsub_avx2(v[7], v[8], out + 7, out + 8, &clamp_lo, &clamp_hi);
+
+ if (!do_cols) {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m256i clamp_lo_out =
+ _mm256_set1_epi32(-(1 << (log_range_out - 1)));
+ const __m256i clamp_hi_out =
+ _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
+ round_shift_8x8_avx2(out, out_shift);
+ highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 16);
+ }
+ }
+}
+
+static void iadst16_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
+ int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
+ const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
+ const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
+ const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
+ const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
+ const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
+ const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
+ const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
+ const __m256i zero = _mm256_setzero_si256();
+ __m256i v[16], x, y, temp1, temp2;
+
+ // Calculate the column 0, 1, 2, 3
+ {
+ // stage 0
+ // stage 1
+ // stage 2
+ x = _mm256_mullo_epi32(in[0], cospi62);
+ v[0] = _mm256_add_epi32(x, rnding);
+ v[0] = _mm256_srai_epi32(v[0], bit);
+
+ x = _mm256_mullo_epi32(in[0], cospi2);
+ v[1] = _mm256_sub_epi32(zero, x);
+ v[1] = _mm256_add_epi32(v[1], rnding);
+ v[1] = _mm256_srai_epi32(v[1], bit);
+
+ // stage 3
+ v[8] = v[0];
+ v[9] = v[1];
+
+ // stage 4
+ temp1 = _mm256_mullo_epi32(v[8], cospi8);
+ x = _mm256_mullo_epi32(v[9], cospi56);
+ temp1 = _mm256_add_epi32(temp1, x);
+ temp1 = _mm256_add_epi32(temp1, rnding);
+ temp1 = _mm256_srai_epi32(temp1, bit);
+
+ temp2 = _mm256_mullo_epi32(v[8], cospi56);
+ x = _mm256_mullo_epi32(v[9], cospi8);
+ temp2 = _mm256_sub_epi32(temp2, x);
+ temp2 = _mm256_add_epi32(temp2, rnding);
+ temp2 = _mm256_srai_epi32(temp2, bit);
+ v[8] = temp1;
+ v[9] = temp2;
+
+ // stage 5
+ v[4] = v[0];
+ v[5] = v[1];
+ v[12] = v[8];
+ v[13] = v[9];
+
+ // stage 6
+ temp1 = _mm256_mullo_epi32(v[4], cospi16);
+ x = _mm256_mullo_epi32(v[5], cospi48);
+ temp1 = _mm256_add_epi32(temp1, x);
+ temp1 = _mm256_add_epi32(temp1, rnding);
+ temp1 = _mm256_srai_epi32(temp1, bit);
+
+ temp2 = _mm256_mullo_epi32(v[4], cospi48);
+ x = _mm256_mullo_epi32(v[5], cospi16);
+ temp2 = _mm256_sub_epi32(temp2, x);
+ temp2 = _mm256_add_epi32(temp2, rnding);
+ temp2 = _mm256_srai_epi32(temp2, bit);
+ v[4] = temp1;
+ v[5] = temp2;
+
+ temp1 = _mm256_mullo_epi32(v[12], cospi16);
+ x = _mm256_mullo_epi32(v[13], cospi48);
+ temp1 = _mm256_add_epi32(temp1, x);
+ temp1 = _mm256_add_epi32(temp1, rnding);
+ temp1 = _mm256_srai_epi32(temp1, bit);
+
+ temp2 = _mm256_mullo_epi32(v[12], cospi48);
+ x = _mm256_mullo_epi32(v[13], cospi16);
+ temp2 = _mm256_sub_epi32(temp2, x);
+ temp2 = _mm256_add_epi32(temp2, rnding);
+ temp2 = _mm256_srai_epi32(temp2, bit);
+ v[12] = temp1;
+ v[13] = temp2;
+
+ // stage 7
+ v[2] = v[0];
+ v[3] = v[1];
+ v[6] = v[4];
+ v[7] = v[5];
+ v[10] = v[8];
+ v[11] = v[9];
+ v[14] = v[12];
+ v[15] = v[13];
+
+ // stage 8
+ y = _mm256_mullo_epi32(v[2], cospi32);
+ x = _mm256_mullo_epi32(v[3], cospi32);
+ v[2] = _mm256_add_epi32(y, x);
+ v[2] = _mm256_add_epi32(v[2], rnding);
+ v[2] = _mm256_srai_epi32(v[2], bit);
+
+ v[3] = _mm256_sub_epi32(y, x);
+ v[3] = _mm256_add_epi32(v[3], rnding);
+ v[3] = _mm256_srai_epi32(v[3], bit);
+
+ y = _mm256_mullo_epi32(v[6], cospi32);
+ x = _mm256_mullo_epi32(v[7], cospi32);
+ v[6] = _mm256_add_epi32(y, x);
+ v[6] = _mm256_add_epi32(v[6], rnding);
+ v[6] = _mm256_srai_epi32(v[6], bit);
+
+ v[7] = _mm256_sub_epi32(y, x);
+ v[7] = _mm256_add_epi32(v[7], rnding);
+ v[7] = _mm256_srai_epi32(v[7], bit);
+
+ y = _mm256_mullo_epi32(v[10], cospi32);
+ x = _mm256_mullo_epi32(v[11], cospi32);
+ v[10] = _mm256_add_epi32(y, x);
+ v[10] = _mm256_add_epi32(v[10], rnding);
+ v[10] = _mm256_srai_epi32(v[10], bit);
+
+ v[11] = _mm256_sub_epi32(y, x);
+ v[11] = _mm256_add_epi32(v[11], rnding);
+ v[11] = _mm256_srai_epi32(v[11], bit);
+
+ y = _mm256_mullo_epi32(v[14], cospi32);
+ x = _mm256_mullo_epi32(v[15], cospi32);
+ v[14] = _mm256_add_epi32(y, x);
+ v[14] = _mm256_add_epi32(v[14], rnding);
+ v[14] = _mm256_srai_epi32(v[14], bit);
+
+ v[15] = _mm256_sub_epi32(y, x);
+ v[15] = _mm256_add_epi32(v[15], rnding);
+ v[15] = _mm256_srai_epi32(v[15], bit);
+
+ // stage 9
+ if (do_cols) {
+ out[0] = v[0];
+ out[1] = _mm256_sub_epi32(_mm256_setzero_si256(), v[8]);
+ out[2] = v[12];
+ out[3] = _mm256_sub_epi32(_mm256_setzero_si256(), v[4]);
+ out[4] = v[6];
+ out[5] = _mm256_sub_epi32(_mm256_setzero_si256(), v[14]);
+ out[6] = v[10];
+ out[7] = _mm256_sub_epi32(_mm256_setzero_si256(), v[2]);
+ out[8] = v[3];
+ out[9] = _mm256_sub_epi32(_mm256_setzero_si256(), v[11]);
+ out[10] = v[15];
+ out[11] = _mm256_sub_epi32(_mm256_setzero_si256(), v[7]);
+ out[12] = v[5];
+ out[13] = _mm256_sub_epi32(_mm256_setzero_si256(), v[13]);
+ out[14] = v[9];
+ out[15] = _mm256_sub_epi32(_mm256_setzero_si256(), v[1]);
+ } else {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m256i clamp_lo_out =
+ _mm256_set1_epi32(-(1 << (log_range_out - 1)));
+ const __m256i clamp_hi_out =
+ _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
+
+ neg_shift_avx2(v[0], v[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
+ out_shift);
+ neg_shift_avx2(v[12], v[4], out + 2, out + 3, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_avx2(v[6], v[14], out + 4, out + 5, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_avx2(v[10], v[2], out + 6, out + 7, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_avx2(v[3], v[11], out + 8, out + 9, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_avx2(v[15], v[7], out + 10, out + 11, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_avx2(v[5], v[13], out + 12, out + 13, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_avx2(v[9], v[1], out + 14, out + 15, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ }
+ }
+}
+
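+ // Inverse ADST-16 for the case where only the first eight inputs
+ // (in[0]..in[7]) are non-zero.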
+static void iadst16_low8_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
+ int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
+ const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
+ const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
+ const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
+ const __m256i cospi18 = _mm256_set1_epi32(cospi[18]);
+ const __m256i cospi46 = _mm256_set1_epi32(cospi[46]);
+ const __m256i cospi26 = _mm256_set1_epi32(cospi[26]);
+ const __m256i cospi38 = _mm256_set1_epi32(cospi[38]);
+ const __m256i cospi34 = _mm256_set1_epi32(cospi[34]);
+ const __m256i cospi30 = _mm256_set1_epi32(cospi[30]);
+ const __m256i cospi42 = _mm256_set1_epi32(cospi[42]);
+ const __m256i cospi22 = _mm256_set1_epi32(cospi[22]);
+ const __m256i cospi50 = _mm256_set1_epi32(cospi[50]);
+ const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
+ const __m256i cospi58 = _mm256_set1_epi32(cospi[58]);
+ const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
+ const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
+ const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
+ const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
+ const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
+ const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
+ const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
+ const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
+ const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
+ const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
+ const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
+ const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
+ const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
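+ // Intermediate values are clamped to a signed log_range-bit range:
+ // bd + 6 bits when do_cols is set, bd + 8 bits otherwise (at least 16 bits).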
+ __m256i u[16], x, y;
+
+ {
+ // stage 0
+ // stage 1
+ // stage 2
+ __m256i zero = _mm256_setzero_si256();
+ x = _mm256_mullo_epi32(in[0], cospi62);
+ u[0] = _mm256_add_epi32(x, rnding);
+ u[0] = _mm256_srai_epi32(u[0], bit);
+
+ x = _mm256_mullo_epi32(in[0], cospi2);
+ u[1] = _mm256_sub_epi32(zero, x);
+ u[1] = _mm256_add_epi32(u[1], rnding);
+ u[1] = _mm256_srai_epi32(u[1], bit);
+
+ x = _mm256_mullo_epi32(in[2], cospi54);
+ u[2] = _mm256_add_epi32(x, rnding);
+ u[2] = _mm256_srai_epi32(u[2], bit);
+
+ x = _mm256_mullo_epi32(in[2], cospi10);
+ u[3] = _mm256_sub_epi32(zero, x);
+ u[3] = _mm256_add_epi32(u[3], rnding);
+ u[3] = _mm256_srai_epi32(u[3], bit);
+
+ x = _mm256_mullo_epi32(in[4], cospi46);
+ u[4] = _mm256_add_epi32(x, rnding);
+ u[4] = _mm256_srai_epi32(u[4], bit);
+
+ x = _mm256_mullo_epi32(in[4], cospi18);
+ u[5] = _mm256_sub_epi32(zero, x);
+ u[5] = _mm256_add_epi32(u[5], rnding);
+ u[5] = _mm256_srai_epi32(u[5], bit);
+
+ x = _mm256_mullo_epi32(in[6], cospi38);
+ u[6] = _mm256_add_epi32(x, rnding);
+ u[6] = _mm256_srai_epi32(u[6], bit);
+
+ x = _mm256_mullo_epi32(in[6], cospi26);
+ u[7] = _mm256_sub_epi32(zero, x);
+ u[7] = _mm256_add_epi32(u[7], rnding);
+ u[7] = _mm256_srai_epi32(u[7], bit);
+
+ u[8] = _mm256_mullo_epi32(in[7], cospi34);
+ u[8] = _mm256_add_epi32(u[8], rnding);
+ u[8] = _mm256_srai_epi32(u[8], bit);
+
+ u[9] = _mm256_mullo_epi32(in[7], cospi30);
+ u[9] = _mm256_add_epi32(u[9], rnding);
+ u[9] = _mm256_srai_epi32(u[9], bit);
+
+ u[10] = _mm256_mullo_epi32(in[5], cospi42);
+ u[10] = _mm256_add_epi32(u[10], rnding);
+ u[10] = _mm256_srai_epi32(u[10], bit);
+
+ u[11] = _mm256_mullo_epi32(in[5], cospi22);
+ u[11] = _mm256_add_epi32(u[11], rnding);
+ u[11] = _mm256_srai_epi32(u[11], bit);
+
+ u[12] = _mm256_mullo_epi32(in[3], cospi50);
+ u[12] = _mm256_add_epi32(u[12], rnding);
+ u[12] = _mm256_srai_epi32(u[12], bit);
+
+ u[13] = _mm256_mullo_epi32(in[3], cospi14);
+ u[13] = _mm256_add_epi32(u[13], rnding);
+ u[13] = _mm256_srai_epi32(u[13], bit);
+
+ u[14] = _mm256_mullo_epi32(in[1], cospi58);
+ u[14] = _mm256_add_epi32(u[14], rnding);
+ u[14] = _mm256_srai_epi32(u[14], bit);
+
+ u[15] = _mm256_mullo_epi32(in[1], cospi6);
+ u[15] = _mm256_add_epi32(u[15], rnding);
+ u[15] = _mm256_srai_epi32(u[15], bit);
+
+ // stage 3
+ addsub_avx2(u[0], u[8], &u[0], &u[8], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[1], u[9], &u[1], &u[9], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[2], u[10], &u[2], &u[10], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[3], u[11], &u[3], &u[11], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[4], u[12], &u[4], &u[12], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[5], u[13], &u[5], &u[13], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[6], u[14], &u[6], &u[14], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[7], u[15], &u[7], &u[15], &clamp_lo, &clamp_hi);
+
+ // stage 4
+ y = _mm256_mullo_epi32(u[8], cospi56);
+ x = _mm256_mullo_epi32(u[9], cospi56);
+ u[8] = _mm256_mullo_epi32(u[8], cospi8);
+ u[8] = _mm256_add_epi32(u[8], x);
+ u[8] = _mm256_add_epi32(u[8], rnding);
+ u[8] = _mm256_srai_epi32(u[8], bit);
+
+ x = _mm256_mullo_epi32(u[9], cospi8);
+ u[9] = _mm256_sub_epi32(y, x);
+ u[9] = _mm256_add_epi32(u[9], rnding);
+ u[9] = _mm256_srai_epi32(u[9], bit);
+
+ x = _mm256_mullo_epi32(u[11], cospi24);
+ y = _mm256_mullo_epi32(u[10], cospi24);
+ u[10] = _mm256_mullo_epi32(u[10], cospi40);
+ u[10] = _mm256_add_epi32(u[10], x);
+ u[10] = _mm256_add_epi32(u[10], rnding);
+ u[10] = _mm256_srai_epi32(u[10], bit);
+
+ x = _mm256_mullo_epi32(u[11], cospi40);
+ u[11] = _mm256_sub_epi32(y, x);
+ u[11] = _mm256_add_epi32(u[11], rnding);
+ u[11] = _mm256_srai_epi32(u[11], bit);
+
+ x = _mm256_mullo_epi32(u[13], cospi8);
+ y = _mm256_mullo_epi32(u[12], cospi8);
+ u[12] = _mm256_mullo_epi32(u[12], cospim56);
+ u[12] = _mm256_add_epi32(u[12], x);
+ u[12] = _mm256_add_epi32(u[12], rnding);
+ u[12] = _mm256_srai_epi32(u[12], bit);
+
+ x = _mm256_mullo_epi32(u[13], cospim56);
+ u[13] = _mm256_sub_epi32(y, x);
+ u[13] = _mm256_add_epi32(u[13], rnding);
+ u[13] = _mm256_srai_epi32(u[13], bit);
+
+ x = _mm256_mullo_epi32(u[15], cospi40);
+ y = _mm256_mullo_epi32(u[14], cospi40);
+ u[14] = _mm256_mullo_epi32(u[14], cospim24);
+ u[14] = _mm256_add_epi32(u[14], x);
+ u[14] = _mm256_add_epi32(u[14], rnding);
+ u[14] = _mm256_srai_epi32(u[14], bit);
+
+ x = _mm256_mullo_epi32(u[15], cospim24);
+ u[15] = _mm256_sub_epi32(y, x);
+ u[15] = _mm256_add_epi32(u[15], rnding);
+ u[15] = _mm256_srai_epi32(u[15], bit);
+
+ // stage 5
+ addsub_avx2(u[0], u[4], &u[0], &u[4], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[1], u[5], &u[1], &u[5], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[2], u[6], &u[2], &u[6], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[3], u[7], &u[3], &u[7], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[8], u[12], &u[8], &u[12], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[9], u[13], &u[9], &u[13], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[10], u[14], &u[10], &u[14], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[11], u[15], &u[11], &u[15], &clamp_lo, &clamp_hi);
+
+ // stage 6
+ x = _mm256_mullo_epi32(u[5], cospi48);
+ y = _mm256_mullo_epi32(u[4], cospi48);
+ u[4] = _mm256_mullo_epi32(u[4], cospi16);
+ u[4] = _mm256_add_epi32(u[4], x);
+ u[4] = _mm256_add_epi32(u[4], rnding);
+ u[4] = _mm256_srai_epi32(u[4], bit);
+
+ x = _mm256_mullo_epi32(u[5], cospi16);
+ u[5] = _mm256_sub_epi32(y, x);
+ u[5] = _mm256_add_epi32(u[5], rnding);
+ u[5] = _mm256_srai_epi32(u[5], bit);
+
+ x = _mm256_mullo_epi32(u[7], cospi16);
+ y = _mm256_mullo_epi32(u[6], cospi16);
+ u[6] = _mm256_mullo_epi32(u[6], cospim48);
+ u[6] = _mm256_add_epi32(u[6], x);
+ u[6] = _mm256_add_epi32(u[6], rnding);
+ u[6] = _mm256_srai_epi32(u[6], bit);
+
+ x = _mm256_mullo_epi32(u[7], cospim48);
+ u[7] = _mm256_sub_epi32(y, x);
+ u[7] = _mm256_add_epi32(u[7], rnding);
+ u[7] = _mm256_srai_epi32(u[7], bit);
+
+ x = _mm256_mullo_epi32(u[13], cospi48);
+ y = _mm256_mullo_epi32(u[12], cospi48);
+ u[12] = _mm256_mullo_epi32(u[12], cospi16);
+ u[12] = _mm256_add_epi32(u[12], x);
+ u[12] = _mm256_add_epi32(u[12], rnding);
+ u[12] = _mm256_srai_epi32(u[12], bit);
+
+ x = _mm256_mullo_epi32(u[13], cospi16);
+ u[13] = _mm256_sub_epi32(y, x);
+ u[13] = _mm256_add_epi32(u[13], rnding);
+ u[13] = _mm256_srai_epi32(u[13], bit);
+
+ x = _mm256_mullo_epi32(u[15], cospi16);
+ y = _mm256_mullo_epi32(u[14], cospi16);
+ u[14] = _mm256_mullo_epi32(u[14], cospim48);
+ u[14] = _mm256_add_epi32(u[14], x);
+ u[14] = _mm256_add_epi32(u[14], rnding);
+ u[14] = _mm256_srai_epi32(u[14], bit);
+
+ x = _mm256_mullo_epi32(u[15], cospim48);
+ u[15] = _mm256_sub_epi32(y, x);
+ u[15] = _mm256_add_epi32(u[15], rnding);
+ u[15] = _mm256_srai_epi32(u[15], bit);
+
+ // stage 7
+ addsub_avx2(u[0], u[2], &u[0], &u[2], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[1], u[3], &u[1], &u[3], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[4], u[6], &u[4], &u[6], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[5], u[7], &u[5], &u[7], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[8], u[10], &u[8], &u[10], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[9], u[11], &u[9], &u[11], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[12], u[14], &u[12], &u[14], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[13], u[15], &u[13], &u[15], &clamp_lo, &clamp_hi);
+
+ // stage 8
+ y = _mm256_mullo_epi32(u[2], cospi32);
+ x = _mm256_mullo_epi32(u[3], cospi32);
+ u[2] = _mm256_add_epi32(y, x);
+ u[2] = _mm256_add_epi32(u[2], rnding);
+ u[2] = _mm256_srai_epi32(u[2], bit);
+
+ u[3] = _mm256_sub_epi32(y, x);
+ u[3] = _mm256_add_epi32(u[3], rnding);
+ u[3] = _mm256_srai_epi32(u[3], bit);
+ y = _mm256_mullo_epi32(u[6], cospi32);
+ x = _mm256_mullo_epi32(u[7], cospi32);
+ u[6] = _mm256_add_epi32(y, x);
+ u[6] = _mm256_add_epi32(u[6], rnding);
+ u[6] = _mm256_srai_epi32(u[6], bit);
+
+ u[7] = _mm256_sub_epi32(y, x);
+ u[7] = _mm256_add_epi32(u[7], rnding);
+ u[7] = _mm256_srai_epi32(u[7], bit);
+
+ y = _mm256_mullo_epi32(u[10], cospi32);
+ x = _mm256_mullo_epi32(u[11], cospi32);
+ u[10] = _mm256_add_epi32(y, x);
+ u[10] = _mm256_add_epi32(u[10], rnding);
+ u[10] = _mm256_srai_epi32(u[10], bit);
+
+ u[11] = _mm256_sub_epi32(y, x);
+ u[11] = _mm256_add_epi32(u[11], rnding);
+ u[11] = _mm256_srai_epi32(u[11], bit);
+
+ y = _mm256_mullo_epi32(u[14], cospi32);
+ x = _mm256_mullo_epi32(u[15], cospi32);
+ u[14] = _mm256_add_epi32(y, x);
+ u[14] = _mm256_add_epi32(u[14], rnding);
+ u[14] = _mm256_srai_epi32(u[14], bit);
+
+ u[15] = _mm256_sub_epi32(y, x);
+ u[15] = _mm256_add_epi32(u[15], rnding);
+ u[15] = _mm256_srai_epi32(u[15], bit);
+
+ // stage 9
+ if (do_cols) {
+ out[0] = u[0];
+ out[1] = _mm256_sub_epi32(_mm256_setzero_si256(), u[8]);
+ out[2] = u[12];
+ out[3] = _mm256_sub_epi32(_mm256_setzero_si256(), u[4]);
+ out[4] = u[6];
+ out[5] = _mm256_sub_epi32(_mm256_setzero_si256(), u[14]);
+ out[6] = u[10];
+ out[7] = _mm256_sub_epi32(_mm256_setzero_si256(), u[2]);
+ out[8] = u[3];
+ out[9] = _mm256_sub_epi32(_mm256_setzero_si256(), u[11]);
+ out[10] = u[15];
+ out[11] = _mm256_sub_epi32(_mm256_setzero_si256(), u[7]);
+ out[12] = u[5];
+ out[13] = _mm256_sub_epi32(_mm256_setzero_si256(), u[13]);
+ out[14] = u[9];
+ out[15] = _mm256_sub_epi32(_mm256_setzero_si256(), u[1]);
+ } else {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m256i clamp_lo_out =
+ _mm256_set1_epi32(-(1 << (log_range_out - 1)));
+ const __m256i clamp_hi_out =
+ _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
+
+ neg_shift_avx2(u[0], u[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
+ out_shift);
+ neg_shift_avx2(u[12], u[4], out + 2, out + 3, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_avx2(u[6], u[14], out + 4, out + 5, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_avx2(u[10], u[2], out + 6, out + 7, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_avx2(u[3], u[11], out + 8, out + 9, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_avx2(u[15], u[7], out + 10, out + 11, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_avx2(u[5], u[13], out + 12, out + 13, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_avx2(u[9], u[1], out + 14, out + 15, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ }
+ }
+}
+
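+ // Full 16-point inverse ADST; all 16 input vectors are used.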
+static void iadst16_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
+ int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
+ const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
+ const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
+ const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
+ const __m256i cospi18 = _mm256_set1_epi32(cospi[18]);
+ const __m256i cospi46 = _mm256_set1_epi32(cospi[46]);
+ const __m256i cospi26 = _mm256_set1_epi32(cospi[26]);
+ const __m256i cospi38 = _mm256_set1_epi32(cospi[38]);
+ const __m256i cospi34 = _mm256_set1_epi32(cospi[34]);
+ const __m256i cospi30 = _mm256_set1_epi32(cospi[30]);
+ const __m256i cospi42 = _mm256_set1_epi32(cospi[42]);
+ const __m256i cospi22 = _mm256_set1_epi32(cospi[22]);
+ const __m256i cospi50 = _mm256_set1_epi32(cospi[50]);
+ const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
+ const __m256i cospi58 = _mm256_set1_epi32(cospi[58]);
+ const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
+ const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
+ const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
+ const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
+ const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
+ const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
+ const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
+ const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
+ const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
+ const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
+ const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
+ const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
+ const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
+ __m256i u[16], v[16], x, y;
+
+ {
+ // stage 0
+ // stage 1
+ // stage 2
+ v[0] = _mm256_mullo_epi32(in[15], cospi2);
+ x = _mm256_mullo_epi32(in[0], cospi62);
+ v[0] = _mm256_add_epi32(v[0], x);
+ v[0] = _mm256_add_epi32(v[0], rnding);
+ v[0] = _mm256_srai_epi32(v[0], bit);
+
+ v[1] = _mm256_mullo_epi32(in[15], cospi62);
+ x = _mm256_mullo_epi32(in[0], cospi2);
+ v[1] = _mm256_sub_epi32(v[1], x);
+ v[1] = _mm256_add_epi32(v[1], rnding);
+ v[1] = _mm256_srai_epi32(v[1], bit);
+
+ v[2] = _mm256_mullo_epi32(in[13], cospi10);
+ x = _mm256_mullo_epi32(in[2], cospi54);
+ v[2] = _mm256_add_epi32(v[2], x);
+ v[2] = _mm256_add_epi32(v[2], rnding);
+ v[2] = _mm256_srai_epi32(v[2], bit);
+
+ v[3] = _mm256_mullo_epi32(in[13], cospi54);
+ x = _mm256_mullo_epi32(in[2], cospi10);
+ v[3] = _mm256_sub_epi32(v[3], x);
+ v[3] = _mm256_add_epi32(v[3], rnding);
+ v[3] = _mm256_srai_epi32(v[3], bit);
+
+ v[4] = _mm256_mullo_epi32(in[11], cospi18);
+ x = _mm256_mullo_epi32(in[4], cospi46);
+ v[4] = _mm256_add_epi32(v[4], x);
+ v[4] = _mm256_add_epi32(v[4], rnding);
+ v[4] = _mm256_srai_epi32(v[4], bit);
+
+ v[5] = _mm256_mullo_epi32(in[11], cospi46);
+ x = _mm256_mullo_epi32(in[4], cospi18);
+ v[5] = _mm256_sub_epi32(v[5], x);
+ v[5] = _mm256_add_epi32(v[5], rnding);
+ v[5] = _mm256_srai_epi32(v[5], bit);
+
+ v[6] = _mm256_mullo_epi32(in[9], cospi26);
+ x = _mm256_mullo_epi32(in[6], cospi38);
+ v[6] = _mm256_add_epi32(v[6], x);
+ v[6] = _mm256_add_epi32(v[6], rnding);
+ v[6] = _mm256_srai_epi32(v[6], bit);
+
+ v[7] = _mm256_mullo_epi32(in[9], cospi38);
+ x = _mm256_mullo_epi32(in[6], cospi26);
+ v[7] = _mm256_sub_epi32(v[7], x);
+ v[7] = _mm256_add_epi32(v[7], rnding);
+ v[7] = _mm256_srai_epi32(v[7], bit);
+
+ v[8] = _mm256_mullo_epi32(in[7], cospi34);
+ x = _mm256_mullo_epi32(in[8], cospi30);
+ v[8] = _mm256_add_epi32(v[8], x);
+ v[8] = _mm256_add_epi32(v[8], rnding);
+ v[8] = _mm256_srai_epi32(v[8], bit);
+
+ v[9] = _mm256_mullo_epi32(in[7], cospi30);
+ x = _mm256_mullo_epi32(in[8], cospi34);
+ v[9] = _mm256_sub_epi32(v[9], x);
+ v[9] = _mm256_add_epi32(v[9], rnding);
+ v[9] = _mm256_srai_epi32(v[9], bit);
+
+ v[10] = _mm256_mullo_epi32(in[5], cospi42);
+ x = _mm256_mullo_epi32(in[10], cospi22);
+ v[10] = _mm256_add_epi32(v[10], x);
+ v[10] = _mm256_add_epi32(v[10], rnding);
+ v[10] = _mm256_srai_epi32(v[10], bit);
+
+ v[11] = _mm256_mullo_epi32(in[5], cospi22);
+ x = _mm256_mullo_epi32(in[10], cospi42);
+ v[11] = _mm256_sub_epi32(v[11], x);
+ v[11] = _mm256_add_epi32(v[11], rnding);
+ v[11] = _mm256_srai_epi32(v[11], bit);
+
+ v[12] = _mm256_mullo_epi32(in[3], cospi50);
+ x = _mm256_mullo_epi32(in[12], cospi14);
+ v[12] = _mm256_add_epi32(v[12], x);
+ v[12] = _mm256_add_epi32(v[12], rnding);
+ v[12] = _mm256_srai_epi32(v[12], bit);
+
+ v[13] = _mm256_mullo_epi32(in[3], cospi14);
+ x = _mm256_mullo_epi32(in[12], cospi50);
+ v[13] = _mm256_sub_epi32(v[13], x);
+ v[13] = _mm256_add_epi32(v[13], rnding);
+ v[13] = _mm256_srai_epi32(v[13], bit);
+
+ v[14] = _mm256_mullo_epi32(in[1], cospi58);
+ x = _mm256_mullo_epi32(in[14], cospi6);
+ v[14] = _mm256_add_epi32(v[14], x);
+ v[14] = _mm256_add_epi32(v[14], rnding);
+ v[14] = _mm256_srai_epi32(v[14], bit);
+
+ v[15] = _mm256_mullo_epi32(in[1], cospi6);
+ x = _mm256_mullo_epi32(in[14], cospi58);
+ v[15] = _mm256_sub_epi32(v[15], x);
+ v[15] = _mm256_add_epi32(v[15], rnding);
+ v[15] = _mm256_srai_epi32(v[15], bit);
+
+ // stage 3
+ addsub_avx2(v[0], v[8], &u[0], &u[8], &clamp_lo, &clamp_hi);
+ addsub_avx2(v[1], v[9], &u[1], &u[9], &clamp_lo, &clamp_hi);
+ addsub_avx2(v[2], v[10], &u[2], &u[10], &clamp_lo, &clamp_hi);
+ addsub_avx2(v[3], v[11], &u[3], &u[11], &clamp_lo, &clamp_hi);
+ addsub_avx2(v[4], v[12], &u[4], &u[12], &clamp_lo, &clamp_hi);
+ addsub_avx2(v[5], v[13], &u[5], &u[13], &clamp_lo, &clamp_hi);
+ addsub_avx2(v[6], v[14], &u[6], &u[14], &clamp_lo, &clamp_hi);
+ addsub_avx2(v[7], v[15], &u[7], &u[15], &clamp_lo, &clamp_hi);
+
+ // stage 4
+ v[0] = u[0];
+ v[1] = u[1];
+ v[2] = u[2];
+ v[3] = u[3];
+ v[4] = u[4];
+ v[5] = u[5];
+ v[6] = u[6];
+ v[7] = u[7];
+
+ v[8] = _mm256_mullo_epi32(u[8], cospi8);
+ x = _mm256_mullo_epi32(u[9], cospi56);
+ v[8] = _mm256_add_epi32(v[8], x);
+ v[8] = _mm256_add_epi32(v[8], rnding);
+ v[8] = _mm256_srai_epi32(v[8], bit);
+
+ v[9] = _mm256_mullo_epi32(u[8], cospi56);
+ x = _mm256_mullo_epi32(u[9], cospi8);
+ v[9] = _mm256_sub_epi32(v[9], x);
+ v[9] = _mm256_add_epi32(v[9], rnding);
+ v[9] = _mm256_srai_epi32(v[9], bit);
+
+ v[10] = _mm256_mullo_epi32(u[10], cospi40);
+ x = _mm256_mullo_epi32(u[11], cospi24);
+ v[10] = _mm256_add_epi32(v[10], x);
+ v[10] = _mm256_add_epi32(v[10], rnding);
+ v[10] = _mm256_srai_epi32(v[10], bit);
+
+ v[11] = _mm256_mullo_epi32(u[10], cospi24);
+ x = _mm256_mullo_epi32(u[11], cospi40);
+ v[11] = _mm256_sub_epi32(v[11], x);
+ v[11] = _mm256_add_epi32(v[11], rnding);
+ v[11] = _mm256_srai_epi32(v[11], bit);
+
+ v[12] = _mm256_mullo_epi32(u[12], cospim56);
+ x = _mm256_mullo_epi32(u[13], cospi8);
+ v[12] = _mm256_add_epi32(v[12], x);
+ v[12] = _mm256_add_epi32(v[12], rnding);
+ v[12] = _mm256_srai_epi32(v[12], bit);
+
+ v[13] = _mm256_mullo_epi32(u[12], cospi8);
+ x = _mm256_mullo_epi32(u[13], cospim56);
+ v[13] = _mm256_sub_epi32(v[13], x);
+ v[13] = _mm256_add_epi32(v[13], rnding);
+ v[13] = _mm256_srai_epi32(v[13], bit);
+
+ v[14] = _mm256_mullo_epi32(u[14], cospim24);
+ x = _mm256_mullo_epi32(u[15], cospi40);
+ v[14] = _mm256_add_epi32(v[14], x);
+ v[14] = _mm256_add_epi32(v[14], rnding);
+ v[14] = _mm256_srai_epi32(v[14], bit);
+
+ v[15] = _mm256_mullo_epi32(u[14], cospi40);
+ x = _mm256_mullo_epi32(u[15], cospim24);
+ v[15] = _mm256_sub_epi32(v[15], x);
+ v[15] = _mm256_add_epi32(v[15], rnding);
+ v[15] = _mm256_srai_epi32(v[15], bit);
+
+ // stage 5
+ addsub_avx2(v[0], v[4], &u[0], &u[4], &clamp_lo, &clamp_hi);
+ addsub_avx2(v[1], v[5], &u[1], &u[5], &clamp_lo, &clamp_hi);
+ addsub_avx2(v[2], v[6], &u[2], &u[6], &clamp_lo, &clamp_hi);
+ addsub_avx2(v[3], v[7], &u[3], &u[7], &clamp_lo, &clamp_hi);
+ addsub_avx2(v[8], v[12], &u[8], &u[12], &clamp_lo, &clamp_hi);
+ addsub_avx2(v[9], v[13], &u[9], &u[13], &clamp_lo, &clamp_hi);
+ addsub_avx2(v[10], v[14], &u[10], &u[14], &clamp_lo, &clamp_hi);
+ addsub_avx2(v[11], v[15], &u[11], &u[15], &clamp_lo, &clamp_hi);
+
+ // stage 6
+ v[0] = u[0];
+ v[1] = u[1];
+ v[2] = u[2];
+ v[3] = u[3];
+
+ v[4] = _mm256_mullo_epi32(u[4], cospi16);
+ x = _mm256_mullo_epi32(u[5], cospi48);
+ v[4] = _mm256_add_epi32(v[4], x);
+ v[4] = _mm256_add_epi32(v[4], rnding);
+ v[4] = _mm256_srai_epi32(v[4], bit);
+
+ v[5] = _mm256_mullo_epi32(u[4], cospi48);
+ x = _mm256_mullo_epi32(u[5], cospi16);
+ v[5] = _mm256_sub_epi32(v[5], x);
+ v[5] = _mm256_add_epi32(v[5], rnding);
+ v[5] = _mm256_srai_epi32(v[5], bit);
+
+ v[6] = _mm256_mullo_epi32(u[6], cospim48);
+ x = _mm256_mullo_epi32(u[7], cospi16);
+ v[6] = _mm256_add_epi32(v[6], x);
+ v[6] = _mm256_add_epi32(v[6], rnding);
+ v[6] = _mm256_srai_epi32(v[6], bit);
+
+ v[7] = _mm256_mullo_epi32(u[6], cospi16);
+ x = _mm256_mullo_epi32(u[7], cospim48);
+ v[7] = _mm256_sub_epi32(v[7], x);
+ v[7] = _mm256_add_epi32(v[7], rnding);
+ v[7] = _mm256_srai_epi32(v[7], bit);
+
+ v[8] = u[8];
+ v[9] = u[9];
+ v[10] = u[10];
+ v[11] = u[11];
+
+ v[12] = _mm256_mullo_epi32(u[12], cospi16);
+ x = _mm256_mullo_epi32(u[13], cospi48);
+ v[12] = _mm256_add_epi32(v[12], x);
+ v[12] = _mm256_add_epi32(v[12], rnding);
+ v[12] = _mm256_srai_epi32(v[12], bit);
+
+ v[13] = _mm256_mullo_epi32(u[12], cospi48);
+ x = _mm256_mullo_epi32(u[13], cospi16);
+ v[13] = _mm256_sub_epi32(v[13], x);
+ v[13] = _mm256_add_epi32(v[13], rnding);
+ v[13] = _mm256_srai_epi32(v[13], bit);
+
+ v[14] = _mm256_mullo_epi32(u[14], cospim48);
+ x = _mm256_mullo_epi32(u[15], cospi16);
+ v[14] = _mm256_add_epi32(v[14], x);
+ v[14] = _mm256_add_epi32(v[14], rnding);
+ v[14] = _mm256_srai_epi32(v[14], bit);
+
+ v[15] = _mm256_mullo_epi32(u[14], cospi16);
+ x = _mm256_mullo_epi32(u[15], cospim48);
+ v[15] = _mm256_sub_epi32(v[15], x);
+ v[15] = _mm256_add_epi32(v[15], rnding);
+ v[15] = _mm256_srai_epi32(v[15], bit);
+
+ // stage 7
+ addsub_avx2(v[0], v[2], &u[0], &u[2], &clamp_lo, &clamp_hi);
+ addsub_avx2(v[1], v[3], &u[1], &u[3], &clamp_lo, &clamp_hi);
+ addsub_avx2(v[4], v[6], &u[4], &u[6], &clamp_lo, &clamp_hi);
+ addsub_avx2(v[5], v[7], &u[5], &u[7], &clamp_lo, &clamp_hi);
+ addsub_avx2(v[8], v[10], &u[8], &u[10], &clamp_lo, &clamp_hi);
+ addsub_avx2(v[9], v[11], &u[9], &u[11], &clamp_lo, &clamp_hi);
+ addsub_avx2(v[12], v[14], &u[12], &u[14], &clamp_lo, &clamp_hi);
+ addsub_avx2(v[13], v[15], &u[13], &u[15], &clamp_lo, &clamp_hi);
+
+ // stage 8
+ v[0] = u[0];
+ v[1] = u[1];
+
+ y = _mm256_mullo_epi32(u[2], cospi32);
+ x = _mm256_mullo_epi32(u[3], cospi32);
+ v[2] = _mm256_add_epi32(y, x);
+ v[2] = _mm256_add_epi32(v[2], rnding);
+ v[2] = _mm256_srai_epi32(v[2], bit);
+
+ v[3] = _mm256_sub_epi32(y, x);
+ v[3] = _mm256_add_epi32(v[3], rnding);
+ v[3] = _mm256_srai_epi32(v[3], bit);
+
+ v[4] = u[4];
+ v[5] = u[5];
+
+ y = _mm256_mullo_epi32(u[6], cospi32);
+ x = _mm256_mullo_epi32(u[7], cospi32);
+ v[6] = _mm256_add_epi32(y, x);
+ v[6] = _mm256_add_epi32(v[6], rnding);
+ v[6] = _mm256_srai_epi32(v[6], bit);
+
+ v[7] = _mm256_sub_epi32(y, x);
+ v[7] = _mm256_add_epi32(v[7], rnding);
+ v[7] = _mm256_srai_epi32(v[7], bit);
+
+ v[8] = u[8];
+ v[9] = u[9];
+
+ y = _mm256_mullo_epi32(u[10], cospi32);
+ x = _mm256_mullo_epi32(u[11], cospi32);
+ v[10] = _mm256_add_epi32(y, x);
+ v[10] = _mm256_add_epi32(v[10], rnding);
+ v[10] = _mm256_srai_epi32(v[10], bit);
+
+ v[11] = _mm256_sub_epi32(y, x);
+ v[11] = _mm256_add_epi32(v[11], rnding);
+ v[11] = _mm256_srai_epi32(v[11], bit);
+
+ v[12] = u[12];
+ v[13] = u[13];
+
+ y = _mm256_mullo_epi32(u[14], cospi32);
+ x = _mm256_mullo_epi32(u[15], cospi32);
+ v[14] = _mm256_add_epi32(y, x);
+ v[14] = _mm256_add_epi32(v[14], rnding);
+ v[14] = _mm256_srai_epi32(v[14], bit);
+
+ v[15] = _mm256_sub_epi32(y, x);
+ v[15] = _mm256_add_epi32(v[15], rnding);
+ v[15] = _mm256_srai_epi32(v[15], bit);
+
+ // stage 9
if (do_cols) {
- addsub_no_clamp_avx2(bf0[0], bf0[31], out + 0, out + 31);
- addsub_no_clamp_avx2(bf0[1], bf0[30], out + 1, out + 30);
- addsub_no_clamp_avx2(bf0[2], bf0[29], out + 2, out + 29);
- addsub_no_clamp_avx2(bf0[3], bf0[28], out + 3, out + 28);
- addsub_no_clamp_avx2(bf0[4], bf0[27], out + 4, out + 27);
- addsub_no_clamp_avx2(bf0[5], bf0[26], out + 5, out + 26);
- addsub_no_clamp_avx2(bf0[6], bf0[25], out + 6, out + 25);
- addsub_no_clamp_avx2(bf0[7], bf0[24], out + 7, out + 24);
- addsub_no_clamp_avx2(bf0[8], bf0[23], out + 8, out + 23);
- addsub_no_clamp_avx2(bf0[9], bf0[22], out + 9, out + 22);
- addsub_no_clamp_avx2(bf0[10], bf0[21], out + 10, out + 21);
- addsub_no_clamp_avx2(bf0[11], bf0[20], out + 11, out + 20);
- addsub_no_clamp_avx2(bf0[12], bf0[19], out + 12, out + 19);
- addsub_no_clamp_avx2(bf0[13], bf0[18], out + 13, out + 18);
- addsub_no_clamp_avx2(bf0[14], bf0[17], out + 14, out + 17);
- addsub_no_clamp_avx2(bf0[15], bf0[16], out + 15, out + 16);
+ out[0] = v[0];
+ out[1] = _mm256_sub_epi32(_mm256_setzero_si256(), v[8]);
+ out[2] = v[12];
+ out[3] = _mm256_sub_epi32(_mm256_setzero_si256(), v[4]);
+ out[4] = v[6];
+ out[5] = _mm256_sub_epi32(_mm256_setzero_si256(), v[14]);
+ out[6] = v[10];
+ out[7] = _mm256_sub_epi32(_mm256_setzero_si256(), v[2]);
+ out[8] = v[3];
+ out[9] = _mm256_sub_epi32(_mm256_setzero_si256(), v[11]);
+ out[10] = v[15];
+ out[11] = _mm256_sub_epi32(_mm256_setzero_si256(), v[7]);
+ out[12] = v[5];
+ out[13] = _mm256_sub_epi32(_mm256_setzero_si256(), v[13]);
+ out[14] = v[9];
+ out[15] = _mm256_sub_epi32(_mm256_setzero_si256(), v[1]);
} else {
const int log_range_out = AOMMAX(16, bd + 6);
- const __m256i clamp_lo_out = _mm256_set1_epi32(AOMMAX(
- -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
- const __m256i clamp_hi_out = _mm256_set1_epi32(AOMMIN(
- (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
-
- addsub_shift_avx2(bf0[0], bf0[31], out + 0, out + 31, &clamp_lo_out,
- &clamp_hi_out, out_shift);
- addsub_shift_avx2(bf0[1], bf0[30], out + 1, out + 30, &clamp_lo_out,
- &clamp_hi_out, out_shift);
- addsub_shift_avx2(bf0[2], bf0[29], out + 2, out + 29, &clamp_lo_out,
- &clamp_hi_out, out_shift);
- addsub_shift_avx2(bf0[3], bf0[28], out + 3, out + 28, &clamp_lo_out,
- &clamp_hi_out, out_shift);
- addsub_shift_avx2(bf0[4], bf0[27], out + 4, out + 27, &clamp_lo_out,
- &clamp_hi_out, out_shift);
- addsub_shift_avx2(bf0[5], bf0[26], out + 5, out + 26, &clamp_lo_out,
- &clamp_hi_out, out_shift);
- addsub_shift_avx2(bf0[6], bf0[25], out + 6, out + 25, &clamp_lo_out,
- &clamp_hi_out, out_shift);
- addsub_shift_avx2(bf0[7], bf0[24], out + 7, out + 24, &clamp_lo_out,
- &clamp_hi_out, out_shift);
- addsub_shift_avx2(bf0[8], bf0[23], out + 8, out + 23, &clamp_lo_out,
- &clamp_hi_out, out_shift);
- addsub_shift_avx2(bf0[9], bf0[22], out + 9, out + 22, &clamp_lo_out,
- &clamp_hi_out, out_shift);
- addsub_shift_avx2(bf0[10], bf0[21], out + 10, out + 21, &clamp_lo_out,
- &clamp_hi_out, out_shift);
- addsub_shift_avx2(bf0[11], bf0[20], out + 11, out + 20, &clamp_lo_out,
- &clamp_hi_out, out_shift);
- addsub_shift_avx2(bf0[12], bf0[19], out + 12, out + 19, &clamp_lo_out,
- &clamp_hi_out, out_shift);
- addsub_shift_avx2(bf0[13], bf0[18], out + 13, out + 18, &clamp_lo_out,
- &clamp_hi_out, out_shift);
- addsub_shift_avx2(bf0[14], bf0[17], out + 14, out + 17, &clamp_lo_out,
- &clamp_hi_out, out_shift);
- addsub_shift_avx2(bf0[15], bf0[16], out + 15, out + 16, &clamp_lo_out,
- &clamp_hi_out, out_shift);
+ const __m256i clamp_lo_out =
+ _mm256_set1_epi32(-(1 << (log_range_out - 1)));
+ const __m256i clamp_hi_out =
+ _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
+
+ neg_shift_avx2(v[0], v[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
+ out_shift);
+ neg_shift_avx2(v[12], v[4], out + 2, out + 3, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_avx2(v[6], v[14], out + 4, out + 5, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_avx2(v[10], v[2], out + 6, out + 7, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_avx2(v[3], v[11], out + 8, out + 9, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_avx2(v[15], v[7], out + 10, out + 11, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_avx2(v[5], v[13], out + 12, out + 13, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_avx2(v[9], v[1], out + 14, out + 15, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ }
+ }
+}
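+
+ // Inverse DCT-8 for the case where only the DC coefficient (in[0]) is
+ // non-zero: the result is a single value replicated to all eight outputs.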
+static void idct8x8_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
+ int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
+ const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
+ __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
+ __m256i x;
+
+ // stage 0
+ // stage 1
+ // stage 2
+ // stage 3
+ x = _mm256_mullo_epi32(in[0], cospi32);
+ x = _mm256_add_epi32(x, rnding);
+ x = _mm256_srai_epi32(x, bit);
+
+ // stage 4
+ // stage 5
+ if (!do_cols) {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ __m256i offset = _mm256_set1_epi32((1 << out_shift) >> 1);
+ clamp_lo = _mm256_set1_epi32(-(1 << (log_range_out - 1)));
+ clamp_hi = _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
+ x = _mm256_add_epi32(x, offset);
+ x = _mm256_sra_epi32(x, _mm_cvtsi32_si128(out_shift));
+ }
+ x = _mm256_max_epi32(x, clamp_lo);
+ x = _mm256_min_epi32(x, clamp_hi);
+ out[0] = x;
+ out[1] = x;
+ out[2] = x;
+ out[3] = x;
+ out[4] = x;
+ out[5] = x;
+ out[6] = x;
+ out[7] = x;
+}
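+
+ // Full 8-point inverse DCT over one batch of eight 32-bit lanes.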
+static void idct8x8_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
+ int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
+ const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
+ const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
+ const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
+ const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
+ const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
+ const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
+ const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
+ const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
+ const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
+ const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
+ const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
+ __m256i u0, u1, u2, u3, u4, u5, u6, u7;
+ __m256i v0, v1, v2, v3, v4, v5, v6, v7;
+ __m256i x, y;
+
+ // stage 0
+ // stage 1
+ // stage 2
+ u0 = in[0];
+ u1 = in[4];
+ u2 = in[2];
+ u3 = in[6];
+
+ x = _mm256_mullo_epi32(in[1], cospi56);
+ y = _mm256_mullo_epi32(in[7], cospim8);
+ u4 = _mm256_add_epi32(x, y);
+ u4 = _mm256_add_epi32(u4, rnding);
+ u4 = _mm256_srai_epi32(u4, bit);
+
+ x = _mm256_mullo_epi32(in[1], cospi8);
+ y = _mm256_mullo_epi32(in[7], cospi56);
+ u7 = _mm256_add_epi32(x, y);
+ u7 = _mm256_add_epi32(u7, rnding);
+ u7 = _mm256_srai_epi32(u7, bit);
+
+ x = _mm256_mullo_epi32(in[5], cospi24);
+ y = _mm256_mullo_epi32(in[3], cospim40);
+ u5 = _mm256_add_epi32(x, y);
+ u5 = _mm256_add_epi32(u5, rnding);
+ u5 = _mm256_srai_epi32(u5, bit);
+
+ x = _mm256_mullo_epi32(in[5], cospi40);
+ y = _mm256_mullo_epi32(in[3], cospi24);
+ u6 = _mm256_add_epi32(x, y);
+ u6 = _mm256_add_epi32(u6, rnding);
+ u6 = _mm256_srai_epi32(u6, bit);
+
+ // stage 3
+ x = _mm256_mullo_epi32(u0, cospi32);
+ y = _mm256_mullo_epi32(u1, cospi32);
+ v0 = _mm256_add_epi32(x, y);
+ v0 = _mm256_add_epi32(v0, rnding);
+ v0 = _mm256_srai_epi32(v0, bit);
+
+ v1 = _mm256_sub_epi32(x, y);
+ v1 = _mm256_add_epi32(v1, rnding);
+ v1 = _mm256_srai_epi32(v1, bit);
+
+ x = _mm256_mullo_epi32(u2, cospi48);
+ y = _mm256_mullo_epi32(u3, cospim16);
+ v2 = _mm256_add_epi32(x, y);
+ v2 = _mm256_add_epi32(v2, rnding);
+ v2 = _mm256_srai_epi32(v2, bit);
+
+ x = _mm256_mullo_epi32(u2, cospi16);
+ y = _mm256_mullo_epi32(u3, cospi48);
+ v3 = _mm256_add_epi32(x, y);
+ v3 = _mm256_add_epi32(v3, rnding);
+ v3 = _mm256_srai_epi32(v3, bit);
+
+ addsub_avx2(u4, u5, &v4, &v5, &clamp_lo, &clamp_hi);
+ addsub_avx2(u7, u6, &v7, &v6, &clamp_lo, &clamp_hi);
+
+ // stage 4
+ addsub_avx2(v0, v3, &u0, &u3, &clamp_lo, &clamp_hi);
+ addsub_avx2(v1, v2, &u1, &u2, &clamp_lo, &clamp_hi);
+ u4 = v4;
+ u7 = v7;
+
+ x = _mm256_mullo_epi32(v5, cospi32);
+ y = _mm256_mullo_epi32(v6, cospi32);
+ u6 = _mm256_add_epi32(y, x);
+ u6 = _mm256_add_epi32(u6, rnding);
+ u6 = _mm256_srai_epi32(u6, bit);
+
+ u5 = _mm256_sub_epi32(y, x);
+ u5 = _mm256_add_epi32(u5, rnding);
+ u5 = _mm256_srai_epi32(u5, bit);
+
+ addsub_avx2(u0, u7, out + 0, out + 7, &clamp_lo, &clamp_hi);
+ addsub_avx2(u1, u6, out + 1, out + 6, &clamp_lo, &clamp_hi);
+ addsub_avx2(u2, u5, out + 2, out + 5, &clamp_lo, &clamp_hi);
+ addsub_avx2(u3, u4, out + 3, out + 4, &clamp_lo, &clamp_hi);
+ // stage 5
+ if (!do_cols) {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m256i clamp_lo_out = _mm256_set1_epi32(-(1 << (log_range_out - 1)));
+ const __m256i clamp_hi_out =
+ _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
+
+ round_shift_4x4_avx2(out, out_shift);
+ round_shift_4x4_avx2(out + 4, out_shift);
+ highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 8);
+ }
+}
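+
+ // Inverse ADST-8 for the case where only in[0] is non-zero.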
+static void iadst8x8_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
+ int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
+ const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
+ const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
+ const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
+ const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
+ const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
+ const __m256i kZero = _mm256_setzero_si256();
+ __m256i u[8], x;
+
+ // stage 0
+ // stage 1
+ // stage 2
+
+ x = _mm256_mullo_epi32(in[0], cospi60);
+ u[0] = _mm256_add_epi32(x, rnding);
+ u[0] = _mm256_srai_epi32(u[0], bit);
+
+ x = _mm256_mullo_epi32(in[0], cospi4);
+ u[1] = _mm256_sub_epi32(kZero, x);
+ u[1] = _mm256_add_epi32(u[1], rnding);
+ u[1] = _mm256_srai_epi32(u[1], bit);
+
+ // stage 3
+ // stage 4
+ __m256i temp1, temp2;
+ temp1 = _mm256_mullo_epi32(u[0], cospi16);
+ x = _mm256_mullo_epi32(u[1], cospi48);
+ temp1 = _mm256_add_epi32(temp1, x);
+ temp1 = _mm256_add_epi32(temp1, rnding);
+ temp1 = _mm256_srai_epi32(temp1, bit);
+ u[4] = temp1;
+
+ temp2 = _mm256_mullo_epi32(u[0], cospi48);
+ x = _mm256_mullo_epi32(u[1], cospi16);
+ u[5] = _mm256_sub_epi32(temp2, x);
+ u[5] = _mm256_add_epi32(u[5], rnding);
+ u[5] = _mm256_srai_epi32(u[5], bit);
+
+ // stage 5
+ // stage 6
+ temp1 = _mm256_mullo_epi32(u[0], cospi32);
+ x = _mm256_mullo_epi32(u[1], cospi32);
+ u[2] = _mm256_add_epi32(temp1, x);
+ u[2] = _mm256_add_epi32(u[2], rnding);
+ u[2] = _mm256_srai_epi32(u[2], bit);
+
+ u[3] = _mm256_sub_epi32(temp1, x);
+ u[3] = _mm256_add_epi32(u[3], rnding);
+ u[3] = _mm256_srai_epi32(u[3], bit);
+
+ temp1 = _mm256_mullo_epi32(u[4], cospi32);
+ x = _mm256_mullo_epi32(u[5], cospi32);
+ u[6] = _mm256_add_epi32(temp1, x);
+ u[6] = _mm256_add_epi32(u[6], rnding);
+ u[6] = _mm256_srai_epi32(u[6], bit);
+
+ u[7] = _mm256_sub_epi32(temp1, x);
+ u[7] = _mm256_add_epi32(u[7], rnding);
+ u[7] = _mm256_srai_epi32(u[7], bit);
+
+ // stage 7
+ if (do_cols) {
+ out[0] = u[0];
+ out[1] = _mm256_sub_epi32(kZero, u[4]);
+ out[2] = u[6];
+ out[3] = _mm256_sub_epi32(kZero, u[2]);
+ out[4] = u[3];
+ out[5] = _mm256_sub_epi32(kZero, u[7]);
+ out[6] = u[5];
+ out[7] = _mm256_sub_epi32(kZero, u[1]);
+ } else {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m256i clamp_lo_out = _mm256_set1_epi32(-(1 << (log_range_out - 1)));
+ const __m256i clamp_hi_out =
+ _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
+
+ neg_shift_avx2(u[0], u[4], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
+ out_shift);
+ neg_shift_avx2(u[6], u[2], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out,
+ out_shift);
+ neg_shift_avx2(u[3], u[7], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out,
+ out_shift);
+ neg_shift_avx2(u[5], u[1], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out,
+ out_shift);
+ }
+}
+
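+ // Full 8-point inverse ADST; all eight input vectors are used.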
+static void iadst8x8_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
+ int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
+ const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
+ const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
+ const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
+ const __m256i cospi36 = _mm256_set1_epi32(cospi[36]);
+ const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
+ const __m256i cospi52 = _mm256_set1_epi32(cospi[52]);
+ const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
+ const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
+ const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
+ const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
+ const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
+ const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
+ const __m256i kZero = _mm256_setzero_si256();
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
+ const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
+ __m256i u[8], v[8], x;
+
+ // stage 0
+ // stage 1
+ // stage 2
+
+ u[0] = _mm256_mullo_epi32(in[7], cospi4);
+ x = _mm256_mullo_epi32(in[0], cospi60);
+ u[0] = _mm256_add_epi32(u[0], x);
+ u[0] = _mm256_add_epi32(u[0], rnding);
+ u[0] = _mm256_srai_epi32(u[0], bit);
+
+ u[1] = _mm256_mullo_epi32(in[7], cospi60);
+ x = _mm256_mullo_epi32(in[0], cospi4);
+ u[1] = _mm256_sub_epi32(u[1], x);
+ u[1] = _mm256_add_epi32(u[1], rnding);
+ u[1] = _mm256_srai_epi32(u[1], bit);
+
+ u[2] = _mm256_mullo_epi32(in[5], cospi20);
+ x = _mm256_mullo_epi32(in[2], cospi44);
+ u[2] = _mm256_add_epi32(u[2], x);
+ u[2] = _mm256_add_epi32(u[2], rnding);
+ u[2] = _mm256_srai_epi32(u[2], bit);
+
+ u[3] = _mm256_mullo_epi32(in[5], cospi44);
+ x = _mm256_mullo_epi32(in[2], cospi20);
+ u[3] = _mm256_sub_epi32(u[3], x);
+ u[3] = _mm256_add_epi32(u[3], rnding);
+ u[3] = _mm256_srai_epi32(u[3], bit);
+
+ u[4] = _mm256_mullo_epi32(in[3], cospi36);
+ x = _mm256_mullo_epi32(in[4], cospi28);
+ u[4] = _mm256_add_epi32(u[4], x);
+ u[4] = _mm256_add_epi32(u[4], rnding);
+ u[4] = _mm256_srai_epi32(u[4], bit);
+
+ u[5] = _mm256_mullo_epi32(in[3], cospi28);
+ x = _mm256_mullo_epi32(in[4], cospi36);
+ u[5] = _mm256_sub_epi32(u[5], x);
+ u[5] = _mm256_add_epi32(u[5], rnding);
+ u[5] = _mm256_srai_epi32(u[5], bit);
+
+ u[6] = _mm256_mullo_epi32(in[1], cospi52);
+ x = _mm256_mullo_epi32(in[6], cospi12);
+ u[6] = _mm256_add_epi32(u[6], x);
+ u[6] = _mm256_add_epi32(u[6], rnding);
+ u[6] = _mm256_srai_epi32(u[6], bit);
+
+ u[7] = _mm256_mullo_epi32(in[1], cospi12);
+ x = _mm256_mullo_epi32(in[6], cospi52);
+ u[7] = _mm256_sub_epi32(u[7], x);
+ u[7] = _mm256_add_epi32(u[7], rnding);
+ u[7] = _mm256_srai_epi32(u[7], bit);
+
+ // stage 3
+ addsub_avx2(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[3], u[7], &v[3], &v[7], &clamp_lo, &clamp_hi);
+
+ // stage 4
+ u[0] = v[0];
+ u[1] = v[1];
+ u[2] = v[2];
+ u[3] = v[3];
+
+ u[4] = _mm256_mullo_epi32(v[4], cospi16);
+ x = _mm256_mullo_epi32(v[5], cospi48);
+ u[4] = _mm256_add_epi32(u[4], x);
+ u[4] = _mm256_add_epi32(u[4], rnding);
+ u[4] = _mm256_srai_epi32(u[4], bit);
+
+ u[5] = _mm256_mullo_epi32(v[4], cospi48);
+ x = _mm256_mullo_epi32(v[5], cospi16);
+ u[5] = _mm256_sub_epi32(u[5], x);
+ u[5] = _mm256_add_epi32(u[5], rnding);
+ u[5] = _mm256_srai_epi32(u[5], bit);
+
+ u[6] = _mm256_mullo_epi32(v[6], cospim48);
+ x = _mm256_mullo_epi32(v[7], cospi16);
+ u[6] = _mm256_add_epi32(u[6], x);
+ u[6] = _mm256_add_epi32(u[6], rnding);
+ u[6] = _mm256_srai_epi32(u[6], bit);
+
+ u[7] = _mm256_mullo_epi32(v[6], cospi16);
+ x = _mm256_mullo_epi32(v[7], cospim48);
+ u[7] = _mm256_sub_epi32(u[7], x);
+ u[7] = _mm256_add_epi32(u[7], rnding);
+ u[7] = _mm256_srai_epi32(u[7], bit);
+
+ // stage 5
+ addsub_avx2(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[1], u[3], &v[1], &v[3], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi);
+
+ // stage 6
+ u[0] = v[0];
+ u[1] = v[1];
+ u[4] = v[4];
+ u[5] = v[5];
+
+ v[0] = _mm256_mullo_epi32(v[2], cospi32);
+ x = _mm256_mullo_epi32(v[3], cospi32);
+ u[2] = _mm256_add_epi32(v[0], x);
+ u[2] = _mm256_add_epi32(u[2], rnding);
+ u[2] = _mm256_srai_epi32(u[2], bit);
+
+ u[3] = _mm256_sub_epi32(v[0], x);
+ u[3] = _mm256_add_epi32(u[3], rnding);
+ u[3] = _mm256_srai_epi32(u[3], bit);
+
+ v[0] = _mm256_mullo_epi32(v[6], cospi32);
+ x = _mm256_mullo_epi32(v[7], cospi32);
+ u[6] = _mm256_add_epi32(v[0], x);
+ u[6] = _mm256_add_epi32(u[6], rnding);
+ u[6] = _mm256_srai_epi32(u[6], bit);
+
+ u[7] = _mm256_sub_epi32(v[0], x);
+ u[7] = _mm256_add_epi32(u[7], rnding);
+ u[7] = _mm256_srai_epi32(u[7], bit);
+
+ // stage 7
+ if (do_cols) {
+ out[0] = u[0];
+ out[1] = _mm256_sub_epi32(kZero, u[4]);
+ out[2] = u[6];
+ out[3] = _mm256_sub_epi32(kZero, u[2]);
+ out[4] = u[3];
+ out[5] = _mm256_sub_epi32(kZero, u[7]);
+ out[6] = u[5];
+ out[7] = _mm256_sub_epi32(kZero, u[1]);
+ } else {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m256i clamp_lo_out = _mm256_set1_epi32(-(1 << (log_range_out - 1)));
+ const __m256i clamp_hi_out =
+ _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
+
+ neg_shift_avx2(u[0], u[4], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
+ out_shift);
+ neg_shift_avx2(u[6], u[2], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out,
+ out_shift);
+ neg_shift_avx2(u[3], u[7], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out,
+ out_shift);
+ neg_shift_avx2(u[5], u[1], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out,
+ out_shift);
+ }
+}
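+
+ // Stage 8 of the 64-point inverse DCT: pi/4 rotations of the (u[10], u[13])
+ // and (u[11], u[12]) pairs, add/sub butterflies across u[16..31], and
+ // half-butterfly rotations of the u[36..43] / u[52..59] pairs.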
+static INLINE void idct64_stage8_avx2(
+ __m256i *u, const __m256i *cospim32, const __m256i *cospi32,
+ const __m256i *cospim16, const __m256i *cospi48, const __m256i *cospi16,
+ const __m256i *cospim48, const __m256i *clamp_lo, const __m256i *clamp_hi,
+ const __m256i *rnding, int bit) {
+ int i;
+ __m256i temp1, temp2, temp3, temp4;
+ temp1 = half_btf_avx2(cospim32, &u[10], cospi32, &u[13], rnding, bit);
+ u[13] = half_btf_avx2(cospi32, &u[10], cospi32, &u[13], rnding, bit);
+ u[10] = temp1;
+ temp2 = half_btf_avx2(cospim32, &u[11], cospi32, &u[12], rnding, bit);
+ u[12] = half_btf_avx2(cospi32, &u[11], cospi32, &u[12], rnding, bit);
+ u[11] = temp2;
+
+ for (i = 16; i < 20; ++i) {
+ addsub_avx2(u[i], u[i ^ 7], &u[i], &u[i ^ 7], clamp_lo, clamp_hi);
+ addsub_avx2(u[i ^ 15], u[i ^ 8], &u[i ^ 15], &u[i ^ 8], clamp_lo, clamp_hi);
+ }
+
+ temp1 = half_btf_avx2(cospim16, &u[36], cospi48, &u[59], rnding, bit);
+ temp2 = half_btf_avx2(cospim16, &u[37], cospi48, &u[58], rnding, bit);
+ temp3 = half_btf_avx2(cospim16, &u[38], cospi48, &u[57], rnding, bit);
+ temp4 = half_btf_avx2(cospim16, &u[39], cospi48, &u[56], rnding, bit);
+ u[56] = half_btf_avx2(cospi48, &u[39], cospi16, &u[56], rnding, bit);
+ u[57] = half_btf_avx2(cospi48, &u[38], cospi16, &u[57], rnding, bit);
+ u[58] = half_btf_avx2(cospi48, &u[37], cospi16, &u[58], rnding, bit);
+ u[59] = half_btf_avx2(cospi48, &u[36], cospi16, &u[59], rnding, bit);
+ u[36] = temp1;
+ u[37] = temp2;
+ u[38] = temp3;
+ u[39] = temp4;
+
+ temp1 = half_btf_avx2(cospim48, &u[40], cospim16, &u[55], rnding, bit);
+ temp2 = half_btf_avx2(cospim48, &u[41], cospim16, &u[54], rnding, bit);
+ temp3 = half_btf_avx2(cospim48, &u[42], cospim16, &u[53], rnding, bit);
+ temp4 = half_btf_avx2(cospim48, &u[43], cospim16, &u[52], rnding, bit);
+ u[52] = half_btf_avx2(cospim16, &u[43], cospi48, &u[52], rnding, bit);
+ u[53] = half_btf_avx2(cospim16, &u[42], cospi48, &u[53], rnding, bit);
+ u[54] = half_btf_avx2(cospim16, &u[41], cospi48, &u[54], rnding, bit);
+ u[55] = half_btf_avx2(cospim16, &u[40], cospi48, &u[55], rnding, bit);
+ u[40] = temp1;
+ u[41] = temp2;
+ u[42] = temp3;
+ u[43] = temp4;
+}
+
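+ // Stage 9: butterflies across u[0..15], pi/4 rotations of the u[20..27]
+ // pairs, and add/sub butterflies across u[32..63].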
+static INLINE void idct64_stage9_avx2(__m256i *u, const __m256i *cospim32,
+ const __m256i *cospi32,
+ const __m256i *clamp_lo,
+ const __m256i *clamp_hi,
+ const __m256i *rnding, int bit) {
+ int i;
+ __m256i temp1, temp2, temp3, temp4;
+ for (i = 0; i < 8; ++i) {
+ addsub_avx2(u[i], u[15 - i], &u[i], &u[15 - i], clamp_lo, clamp_hi);
+ }
+
+ temp1 = half_btf_avx2(cospim32, &u[20], cospi32, &u[27], rnding, bit);
+ temp2 = half_btf_avx2(cospim32, &u[21], cospi32, &u[26], rnding, bit);
+ temp3 = half_btf_avx2(cospim32, &u[22], cospi32, &u[25], rnding, bit);
+ temp4 = half_btf_avx2(cospim32, &u[23], cospi32, &u[24], rnding, bit);
+ u[24] = half_btf_avx2(cospi32, &u[23], cospi32, &u[24], rnding, bit);
+ u[25] = half_btf_avx2(cospi32, &u[22], cospi32, &u[25], rnding, bit);
+ u[26] = half_btf_avx2(cospi32, &u[21], cospi32, &u[26], rnding, bit);
+ u[27] = half_btf_avx2(cospi32, &u[20], cospi32, &u[27], rnding, bit);
+ u[20] = temp1;
+ u[21] = temp2;
+ u[22] = temp3;
+ u[23] = temp4;
+ for (i = 32; i < 40; i++) {
+ addsub_avx2(u[i], u[i ^ 15], &u[i], &u[i ^ 15], clamp_lo, clamp_hi);
+ }
+
+ for (i = 48; i < 56; i++) {
+ addsub_avx2(u[i ^ 15], u[i], &u[i ^ 15], &u[i], clamp_lo, clamp_hi);
+ }
+}
+
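+ // Stage 10: butterflies across u[0..31] and pi/4 rotations of the
+ // u[40..47] / u[48..55] pairs.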
+static INLINE void idct64_stage10_avx2(__m256i *u, const __m256i *cospim32,
+ const __m256i *cospi32,
+ const __m256i *clamp_lo,
+ const __m256i *clamp_hi,
+ const __m256i *rnding, int bit) {
+ __m256i temp1, temp2, temp3, temp4;
+ for (int i = 0; i < 16; i++) {
+ addsub_avx2(u[i], u[31 - i], &u[i], &u[31 - i], clamp_lo, clamp_hi);
+ }
+
+ temp1 = half_btf_avx2(cospim32, &u[40], cospi32, &u[55], rnding, bit);
+ temp2 = half_btf_avx2(cospim32, &u[41], cospi32, &u[54], rnding, bit);
+ temp3 = half_btf_avx2(cospim32, &u[42], cospi32, &u[53], rnding, bit);
+ temp4 = half_btf_avx2(cospim32, &u[43], cospi32, &u[52], rnding, bit);
+ u[52] = half_btf_avx2(cospi32, &u[43], cospi32, &u[52], rnding, bit);
+ u[53] = half_btf_avx2(cospi32, &u[42], cospi32, &u[53], rnding, bit);
+ u[54] = half_btf_avx2(cospi32, &u[41], cospi32, &u[54], rnding, bit);
+ u[55] = half_btf_avx2(cospi32, &u[40], cospi32, &u[55], rnding, bit);
+ u[40] = temp1;
+ u[41] = temp2;
+ u[42] = temp3;
+ u[43] = temp4;
+
+ temp1 = half_btf_avx2(cospim32, &u[44], cospi32, &u[51], rnding, bit);
+ temp2 = half_btf_avx2(cospim32, &u[45], cospi32, &u[50], rnding, bit);
+ temp3 = half_btf_avx2(cospim32, &u[46], cospi32, &u[49], rnding, bit);
+ temp4 = half_btf_avx2(cospim32, &u[47], cospi32, &u[48], rnding, bit);
+ u[48] = half_btf_avx2(cospi32, &u[47], cospi32, &u[48], rnding, bit);
+ u[49] = half_btf_avx2(cospi32, &u[46], cospi32, &u[49], rnding, bit);
+ u[50] = half_btf_avx2(cospi32, &u[45], cospi32, &u[50], rnding, bit);
+ u[51] = half_btf_avx2(cospi32, &u[44], cospi32, &u[51], rnding, bit);
+ u[44] = temp1;
+ u[45] = temp2;
+ u[46] = temp3;
+ u[47] = temp4;
+}
+
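+ // Stage 11: final butterflies combining the two 32-element halves into the
+ // 64 outputs, followed by an output round-shift and clamp when do_cols is 0.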
+static INLINE void idct64_stage11_avx2(__m256i *u, __m256i *out, int do_cols,
+ int bd, int out_shift,
+ const __m256i *clamp_lo,
+ const __m256i *clamp_hi) {
+ for (int i = 0; i < 32; i++) {
+ addsub_avx2(u[i], u[63 - i], &out[(i)], &out[(63 - i)], clamp_lo, clamp_hi);
+ }
+
+ if (!do_cols) {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m256i clamp_lo_out = _mm256_set1_epi32(-(1 << (log_range_out - 1)));
+ const __m256i clamp_hi_out =
+ _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
+
+ round_shift_8x8_avx2(out, out_shift);
+ round_shift_8x8_avx2(out + 16, out_shift);
+ round_shift_8x8_avx2(out + 32, out_shift);
+ round_shift_8x8_avx2(out + 48, out_shift);
+ highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 64);
+ }
+}
+
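+ // Inverse DCT-64 for the case where only the DC coefficient (in[0]) is
+ // non-zero: the transform collapses to one scaled value written to all
+ // 64 outputs.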
+static void idct64_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
+ int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
+ __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
+
+ const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
+
+ {
+ __m256i x;
+
+ // stage 1
+ // stage 2
+ // stage 3
+ // stage 4
+ // stage 5
+ // stage 6
+ x = half_btf_0_avx2(&cospi32, &in[0], &rnding, bit);
+
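+ // stage 7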
+ // stage 8
+ // stage 9
+ // stage 10
+ // stage 11
+ if (!do_cols) {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ clamp_lo = _mm256_set1_epi32(-(1 << (log_range_out - 1)));
+ clamp_hi = _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
+ if (out_shift != 0) {
+ __m256i offset = _mm256_set1_epi32((1 << out_shift) >> 1);
+ x = _mm256_add_epi32(x, offset);
+ x = _mm256_sra_epi32(x, _mm_cvtsi32_si128(out_shift));
+ }
}
+ x = _mm256_max_epi32(x, clamp_lo);
+ x = _mm256_min_epi32(x, clamp_hi);
+ out[0] = x;
+ out[1] = x;
+ out[2] = x;
+ out[3] = x;
+ out[4] = x;
+ out[5] = x;
+ out[6] = x;
+ out[7] = x;
+ out[8] = x;
+ out[9] = x;
+ out[10] = x;
+ out[11] = x;
+ out[12] = x;
+ out[13] = x;
+ out[14] = x;
+ out[15] = x;
+ out[16] = x;
+ out[17] = x;
+ out[18] = x;
+ out[19] = x;
+ out[20] = x;
+ out[21] = x;
+ out[22] = x;
+ out[23] = x;
+ out[24] = x;
+ out[25] = x;
+ out[26] = x;
+ out[27] = x;
+ out[28] = x;
+ out[29] = x;
+ out[30] = x;
+ out[31] = x;
+ out[32] = x;
+ out[33] = x;
+ out[34] = x;
+ out[35] = x;
+ out[36] = x;
+ out[37] = x;
+ out[38] = x;
+ out[39] = x;
+ out[40] = x;
+ out[41] = x;
+ out[42] = x;
+ out[43] = x;
+ out[44] = x;
+ out[45] = x;
+ out[46] = x;
+ out[47] = x;
+ out[48] = x;
+ out[49] = x;
+ out[50] = x;
+ out[51] = x;
+ out[52] = x;
+ out[53] = x;
+ out[54] = x;
+ out[55] = x;
+ out[56] = x;
+ out[57] = x;
+ out[58] = x;
+ out[59] = x;
+ out[60] = x;
+ out[61] = x;
+ out[62] = x;
+ out[63] = x;
}
}
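+
+ // Inverse DCT-64 for the case where only the first eight inputs
+ // (in[0]..in[7]) of each batch are non-zero.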
+static void idct64_low8_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
+ int bd, int out_shift) {
+ int i, j;
+ const int32_t *cospi = cospi_arr(bit);
+ const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
+ const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
+
+ const __m256i cospi1 = _mm256_set1_epi32(cospi[1]);
+ const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
+ const __m256i cospi3 = _mm256_set1_epi32(cospi[3]);
+ const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
+ const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
+ const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
+ const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
+ const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
+ const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
+ const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
+ const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
+ const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
+ const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
+ const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
+ const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
+ const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
+ const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
+ const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]);
+ const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
+ const __m256i cospim12 = _mm256_set1_epi32(-cospi[12]);
+ const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
+ const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]);
+ const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
+ const __m256i cospim28 = _mm256_set1_epi32(-cospi[28]);
+ const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
+ const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
+ const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
+ const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
+ const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
+ const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
+ const __m256i cospi63 = _mm256_set1_epi32(cospi[63]);
+ const __m256i cospim57 = _mm256_set1_epi32(-cospi[57]);
+ const __m256i cospi7 = _mm256_set1_epi32(cospi[7]);
+ const __m256i cospi5 = _mm256_set1_epi32(cospi[5]);
+ const __m256i cospi59 = _mm256_set1_epi32(cospi[59]);
+ const __m256i cospim61 = _mm256_set1_epi32(-cospi[61]);
+ const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]);
+ const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
+
+ {
+ __m256i u[64];
+
+ // stage 1
+ u[0] = in[0];
+ u[8] = in[4];
+ u[16] = in[2];
+ u[24] = in[6];
+ u[32] = in[1];
+ u[40] = in[5];
+ u[48] = in[3];
+ u[56] = in[7];
+
+ // stage 2
+ u[63] = half_btf_0_avx2(&cospi1, &u[32], &rnding, bit);
+ u[32] = half_btf_0_avx2(&cospi63, &u[32], &rnding, bit);
+ u[39] = half_btf_0_avx2(&cospim57, &u[56], &rnding, bit);
+ u[56] = half_btf_0_avx2(&cospi7, &u[56], &rnding, bit);
+ u[55] = half_btf_0_avx2(&cospi5, &u[40], &rnding, bit);
+ u[40] = half_btf_0_avx2(&cospi59, &u[40], &rnding, bit);
+ u[47] = half_btf_0_avx2(&cospim61, &u[48], &rnding, bit);
+ u[48] = half_btf_0_avx2(&cospi3, &u[48], &rnding, bit);
+
+ // stage 3
+ u[31] = half_btf_0_avx2(&cospi2, &u[16], &rnding, bit);
+ u[16] = half_btf_0_avx2(&cospi62, &u[16], &rnding, bit);
+ u[23] = half_btf_0_avx2(&cospim58, &u[24], &rnding, bit);
+ u[24] = half_btf_0_avx2(&cospi6, &u[24], &rnding, bit);
+ u[33] = u[32];
+ u[38] = u[39];
+ u[41] = u[40];
+ u[46] = u[47];
+ u[49] = u[48];
+ u[54] = u[55];
+ u[57] = u[56];
+ u[62] = u[63];
+
+ // stage 4
+ __m256i temp1, temp2;
+ u[15] = half_btf_0_avx2(&cospi4, &u[8], &rnding, bit);
+ u[8] = half_btf_0_avx2(&cospi60, &u[8], &rnding, bit);
+ u[17] = u[16];
+ u[22] = u[23];
+ u[25] = u[24];
+ u[30] = u[31];
+
+ temp1 = half_btf_avx2(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit);
+ u[62] = half_btf_avx2(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit);
+ u[33] = temp1;
+
+ temp2 = half_btf_avx2(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit);
+ u[38] = half_btf_avx2(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit);
+ u[57] = temp2;
+
+ temp1 = half_btf_avx2(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit);
+ u[54] = half_btf_avx2(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit);
+ u[41] = temp1;
+
+ temp2 = half_btf_avx2(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit);
+ u[49] = half_btf_avx2(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit);
+ u[46] = temp2;
+
+ // stage 5
+ u[9] = u[8];
+ u[14] = u[15];
+
+ temp1 = half_btf_avx2(&cospim8, &u[17], &cospi56, &u[30], &rnding, bit);
+ u[30] = half_btf_avx2(&cospi56, &u[17], &cospi8, &u[30], &rnding, bit);
+ u[17] = temp1;
+
+ temp2 = half_btf_avx2(&cospim24, &u[22], &cospim40, &u[25], &rnding, bit);
+ u[25] = half_btf_avx2(&cospim40, &u[22], &cospi24, &u[25], &rnding, bit);
+ u[22] = temp2;
+
+ u[35] = u[32];
+ u[34] = u[33];
+ u[36] = u[39];
+ u[37] = u[38];
+ u[43] = u[40];
+ u[42] = u[41];
+ u[44] = u[47];
+ u[45] = u[46];
+ u[51] = u[48];
+ u[50] = u[49];
+ u[52] = u[55];
+ u[53] = u[54];
+ u[59] = u[56];
+ u[58] = u[57];
+ u[60] = u[63];
+ u[61] = u[62];
+ // stage 6
+ temp1 = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit);
+ u[1] = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit);
+ u[0] = temp1;
+
+ temp2 = half_btf_avx2(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
+ u[14] = half_btf_avx2(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
+ u[9] = temp2;
+ u[19] = u[16];
+ u[18] = u[17];
+ u[20] = u[23];
+ u[21] = u[22];
+ u[27] = u[24];
+ u[26] = u[25];
+ u[28] = u[31];
+ u[29] = u[30];
+
+ temp1 = half_btf_avx2(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit);
+ u[61] = half_btf_avx2(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit);
+ u[34] = temp1;
+ temp2 = half_btf_avx2(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit);
+ u[60] = half_btf_avx2(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit);
+ u[35] = temp2;
+ temp1 = half_btf_avx2(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit);
+ u[59] = half_btf_avx2(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit);
+ u[36] = temp1;
+ temp2 = half_btf_avx2(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit);
+ u[58] = half_btf_avx2(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit);
+ u[37] = temp2;
+ temp1 = half_btf_avx2(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit);
+ u[53] = half_btf_avx2(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit);
+ u[42] = temp1;
+ temp2 = half_btf_avx2(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit);
+ u[52] = half_btf_avx2(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit);
+ u[43] = temp2;
+ temp1 = half_btf_avx2(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit);
+ u[51] = half_btf_avx2(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit);
+ u[44] = temp1;
+ temp2 = half_btf_avx2(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit);
+ u[50] = half_btf_avx2(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit);
+ u[45] = temp2;
+
+ // stage 7
+ u[3] = u[0];
+ u[2] = u[1];
+ u[11] = u[8];
+ u[10] = u[9];
+ u[12] = u[15];
+ u[13] = u[14];
+
+ temp1 = half_btf_avx2(&cospim16, &u[18], &cospi48, &u[29], &rnding, bit);
+ u[29] = half_btf_avx2(&cospi48, &u[18], &cospi16, &u[29], &rnding, bit);
+ u[18] = temp1;
+ temp2 = half_btf_avx2(&cospim16, &u[19], &cospi48, &u[28], &rnding, bit);
+ u[28] = half_btf_avx2(&cospi48, &u[19], &cospi16, &u[28], &rnding, bit);
+ u[19] = temp2;
+ temp1 = half_btf_avx2(&cospim48, &u[20], &cospim16, &u[27], &rnding, bit);
+ u[27] = half_btf_avx2(&cospim16, &u[20], &cospi48, &u[27], &rnding, bit);
+ u[20] = temp1;
+ temp2 = half_btf_avx2(&cospim48, &u[21], &cospim16, &u[26], &rnding, bit);
+ u[26] = half_btf_avx2(&cospim16, &u[21], &cospi48, &u[26], &rnding, bit);
+ u[21] = temp2;
+ for (i = 32; i < 64; i += 16) {
+ for (j = i; j < i + 4; j++) {
+ addsub_avx2(u[j], u[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[j ^ 15], u[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
+ &clamp_hi);
+ }
+ }
+
+ // stage 8
+ u[7] = u[0];
+ u[6] = u[1];
+ u[5] = u[2];
+ u[4] = u[3];
+
+ idct64_stage8_avx2(u, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
+ &cospim48, &clamp_lo, &clamp_hi, &rnding, bit);
+
+ // stage 9
+ idct64_stage9_avx2(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
+ bit);
+
+ // stage 10
+ idct64_stage10_avx2(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
+ bit);
+
+ // stage 11
+ idct64_stage11_avx2(u, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi);
+ }
+}
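Every stage in the kernel above is built from the same half-butterfly primitive: half_btf_avx2() computes (w0*x0 + w1*x1 + round) >> bit per 32-bit lane, and half_btf_0_avx2() is the one-input special case used in stages 2 and 3. A rotation of a coefficient pair is two such calls sequenced through a temporary so that both outputs see the pre-update inputs. A scalar sketch, assuming (as the constant definitions above suggest) that each cospim* value is just the negated cospi* value:

#include <stdint.h>

/* Scalar model of half_btf_avx2(): weighted sum with round-to-nearest. */
static int32_t half_btf(int32_t w0, int32_t x0, int32_t w1, int32_t x1,
                        int bit) {
  int64_t r = (int64_t)w0 * x0 + (int64_t)w1 * x1 + (1LL << (bit - 1));
  return (int32_t)(r >> bit);
}

/* One rotation as used above for pairs like (u[33], u[62]):
 *   lo' = -c_lo * lo + c_hi * hi,   hi' = c_hi * lo + c_lo * hi
 * Mirrored pairs use a different sign pattern, but the temp1/temp2
 * sequencing (read both old values before writing either) is the invariant. */
static void rotate_pair(int32_t *lo, int32_t *hi, int32_t c_lo, int32_t c_hi,
                        int bit) {
  const int32_t new_lo = half_btf(-c_lo, *lo, c_hi, *hi, bit);
  const int32_t new_hi = half_btf(c_hi, *lo, c_lo, *hi, bit);
  *lo = new_lo;
  *hi = new_hi;
}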
+static void idct64_low16_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
+ int bd, int out_shift) {
+ int i, j;
+ const int32_t *cospi = cospi_arr(bit);
+ const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
+ const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
+
+ const __m256i cospi1 = _mm256_set1_epi32(cospi[1]);
+ const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
+ const __m256i cospi3 = _mm256_set1_epi32(cospi[3]);
+ const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
+ const __m256i cospi5 = _mm256_set1_epi32(cospi[5]);
+ const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
+ const __m256i cospi7 = _mm256_set1_epi32(cospi[7]);
+ const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
+ const __m256i cospi9 = _mm256_set1_epi32(cospi[9]);
+ const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
+ const __m256i cospi11 = _mm256_set1_epi32(cospi[11]);
+ const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
+ const __m256i cospi13 = _mm256_set1_epi32(cospi[13]);
+ const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
+ const __m256i cospi15 = _mm256_set1_epi32(cospi[15]);
+ const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
+ const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
+ const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
+ const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
+ const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
+ const __m256i cospi36 = _mm256_set1_epi32(cospi[36]);
+ const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
+ const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
+ const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
+ const __m256i cospi51 = _mm256_set1_epi32(cospi[51]);
+ const __m256i cospi52 = _mm256_set1_epi32(cospi[52]);
+ const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
+ const __m256i cospi55 = _mm256_set1_epi32(cospi[55]);
+ const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
+ const __m256i cospi59 = _mm256_set1_epi32(cospi[59]);
+ const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
+ const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
+ const __m256i cospi63 = _mm256_set1_epi32(cospi[63]);
+
+ const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]);
+ const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
+ const __m256i cospim12 = _mm256_set1_epi32(-cospi[12]);
+ const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
+ const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]);
+ const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
+ const __m256i cospim28 = _mm256_set1_epi32(-cospi[28]);
+ const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
+ const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
+ const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
+ const __m256i cospim44 = _mm256_set1_epi32(-cospi[44]);
+ const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
+ const __m256i cospim49 = _mm256_set1_epi32(-cospi[49]);
+ const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]);
+ const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
+ const __m256i cospim53 = _mm256_set1_epi32(-cospi[53]);
+ const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
+ const __m256i cospim57 = _mm256_set1_epi32(-cospi[57]);
+ const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]);
+ const __m256i cospim60 = _mm256_set1_epi32(-cospi[60]);
+ const __m256i cospim61 = _mm256_set1_epi32(-cospi[61]);
+
+ {
+ __m256i u[64];
+ __m256i tmp1, tmp2, tmp3, tmp4;
+ // stage 1
+ u[0] = in[0];
+ u[32] = in[1];
+ u[36] = in[9];
+ u[40] = in[5];
+ u[44] = in[13];
+ u[48] = in[3];
+ u[52] = in[11];
+ u[56] = in[7];
+ u[60] = in[15];
+ u[16] = in[2];
+ u[20] = in[10];
+ u[24] = in[6];
+ u[28] = in[14];
+ u[4] = in[8];
+ u[8] = in[4];
+ u[12] = in[12];
+
+ // stage 2
+ u[63] = half_btf_0_avx2(&cospi1, &u[32], &rnding, bit);
+ u[32] = half_btf_0_avx2(&cospi63, &u[32], &rnding, bit);
+ u[35] = half_btf_0_avx2(&cospim49, &u[60], &rnding, bit);
+ u[60] = half_btf_0_avx2(&cospi15, &u[60], &rnding, bit);
+ u[59] = half_btf_0_avx2(&cospi9, &u[36], &rnding, bit);
+ u[36] = half_btf_0_avx2(&cospi55, &u[36], &rnding, bit);
+ u[39] = half_btf_0_avx2(&cospim57, &u[56], &rnding, bit);
+ u[56] = half_btf_0_avx2(&cospi7, &u[56], &rnding, bit);
+ u[55] = half_btf_0_avx2(&cospi5, &u[40], &rnding, bit);
+ u[40] = half_btf_0_avx2(&cospi59, &u[40], &rnding, bit);
+ u[43] = half_btf_0_avx2(&cospim53, &u[52], &rnding, bit);
+ u[52] = half_btf_0_avx2(&cospi11, &u[52], &rnding, bit);
+ u[47] = half_btf_0_avx2(&cospim61, &u[48], &rnding, bit);
+ u[48] = half_btf_0_avx2(&cospi3, &u[48], &rnding, bit);
+ u[51] = half_btf_0_avx2(&cospi13, &u[44], &rnding, bit);
+ u[44] = half_btf_0_avx2(&cospi51, &u[44], &rnding, bit);
+
+ // stage 3
+ u[31] = half_btf_0_avx2(&cospi2, &u[16], &rnding, bit);
+ u[16] = half_btf_0_avx2(&cospi62, &u[16], &rnding, bit);
+ u[19] = half_btf_0_avx2(&cospim50, &u[28], &rnding, bit);
+ u[28] = half_btf_0_avx2(&cospi14, &u[28], &rnding, bit);
+ u[27] = half_btf_0_avx2(&cospi10, &u[20], &rnding, bit);
+ u[20] = half_btf_0_avx2(&cospi54, &u[20], &rnding, bit);
+ u[23] = half_btf_0_avx2(&cospim58, &u[24], &rnding, bit);
+ u[24] = half_btf_0_avx2(&cospi6, &u[24], &rnding, bit);
+ u[33] = u[32];
+ u[34] = u[35];
+ u[37] = u[36];
+ u[38] = u[39];
+ u[41] = u[40];
+ u[42] = u[43];
+ u[45] = u[44];
+ u[46] = u[47];
+ u[49] = u[48];
+ u[50] = u[51];
+ u[53] = u[52];
+ u[54] = u[55];
+ u[57] = u[56];
+ u[58] = u[59];
+ u[61] = u[60];
+ u[62] = u[63];
+
+ // stage 4
+ u[15] = half_btf_0_avx2(&cospi4, &u[8], &rnding, bit);
+ u[8] = half_btf_0_avx2(&cospi60, &u[8], &rnding, bit);
+ u[11] = half_btf_0_avx2(&cospim52, &u[12], &rnding, bit);
+ u[12] = half_btf_0_avx2(&cospi12, &u[12], &rnding, bit);
+
+ u[17] = u[16];
+ u[18] = u[19];
+ u[21] = u[20];
+ u[22] = u[23];
+ u[25] = u[24];
+ u[26] = u[27];
+ u[29] = u[28];
+ u[30] = u[31];
+
+ tmp1 = half_btf_avx2(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit);
+ tmp2 = half_btf_avx2(&cospim60, &u[34], &cospim4, &u[61], &rnding, bit);
+ tmp3 = half_btf_avx2(&cospim36, &u[37], &cospi28, &u[58], &rnding, bit);
+ tmp4 = half_btf_avx2(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit);
+ u[57] = half_btf_avx2(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit);
+ u[58] = half_btf_avx2(&cospi28, &u[37], &cospi36, &u[58], &rnding, bit);
+ u[61] = half_btf_avx2(&cospim4, &u[34], &cospi60, &u[61], &rnding, bit);
+ u[62] = half_btf_avx2(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit);
+ u[33] = tmp1;
+ u[34] = tmp2;
+ u[37] = tmp3;
+ u[38] = tmp4;
+
+ tmp1 = half_btf_avx2(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit);
+ tmp2 = half_btf_avx2(&cospim44, &u[42], &cospim20, &u[53], &rnding, bit);
+ tmp3 = half_btf_avx2(&cospim52, &u[45], &cospi12, &u[50], &rnding, bit);
+ tmp4 = half_btf_avx2(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit);
+ u[49] = half_btf_avx2(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit);
+ u[50] = half_btf_avx2(&cospi12, &u[45], &cospi52, &u[50], &rnding, bit);
+ u[53] = half_btf_avx2(&cospim20, &u[42], &cospi44, &u[53], &rnding, bit);
+ u[54] = half_btf_avx2(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit);
+ u[41] = tmp1;
+ u[42] = tmp2;
+ u[45] = tmp3;
+ u[46] = tmp4;
+
+ // stage 5
+ u[7] = half_btf_0_avx2(&cospi8, &u[4], &rnding, bit);
+ u[4] = half_btf_0_avx2(&cospi56, &u[4], &rnding, bit);
+
+ u[9] = u[8];
+ u[10] = u[11];
+ u[13] = u[12];
+ u[14] = u[15];
+
+ tmp1 = half_btf_avx2(&cospim8, &u[17], &cospi56, &u[30], &rnding, bit);
+ tmp2 = half_btf_avx2(&cospim56, &u[18], &cospim8, &u[29], &rnding, bit);
+ tmp3 = half_btf_avx2(&cospim40, &u[21], &cospi24, &u[26], &rnding, bit);
+ tmp4 = half_btf_avx2(&cospim24, &u[22], &cospim40, &u[25], &rnding, bit);
+ u[25] = half_btf_avx2(&cospim40, &u[22], &cospi24, &u[25], &rnding, bit);
+ u[26] = half_btf_avx2(&cospi24, &u[21], &cospi40, &u[26], &rnding, bit);
+ u[29] = half_btf_avx2(&cospim8, &u[18], &cospi56, &u[29], &rnding, bit);
+ u[30] = half_btf_avx2(&cospi56, &u[17], &cospi8, &u[30], &rnding, bit);
+ u[17] = tmp1;
+ u[18] = tmp2;
+ u[21] = tmp3;
+ u[22] = tmp4;
+
+ for (i = 32; i < 64; i += 8) {
+ addsub_avx2(u[i + 0], u[i + 3], &u[i + 0], &u[i + 3], &clamp_lo,
+ &clamp_hi);
+ addsub_avx2(u[i + 1], u[i + 2], &u[i + 1], &u[i + 2], &clamp_lo,
+ &clamp_hi);
+
+ addsub_avx2(u[i + 7], u[i + 4], &u[i + 7], &u[i + 4], &clamp_lo,
+ &clamp_hi);
+ addsub_avx2(u[i + 6], u[i + 5], &u[i + 6], &u[i + 5], &clamp_lo,
+ &clamp_hi);
+ }
+
+ // stage 6
+ tmp1 = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit);
+ u[1] = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit);
+ u[0] = tmp1;
+ u[5] = u[4];
+ u[6] = u[7];
+
+ tmp1 = half_btf_avx2(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
+ u[14] = half_btf_avx2(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
+ u[9] = tmp1;
+ tmp2 = half_btf_avx2(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
+ u[13] = half_btf_avx2(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
+ u[10] = tmp2;
+
+ for (i = 16; i < 32; i += 8) {
+ addsub_avx2(u[i + 0], u[i + 3], &u[i + 0], &u[i + 3], &clamp_lo,
+ &clamp_hi);
+ addsub_avx2(u[i + 1], u[i + 2], &u[i + 1], &u[i + 2], &clamp_lo,
+ &clamp_hi);
+
+ addsub_avx2(u[i + 7], u[i + 4], &u[i + 7], &u[i + 4], &clamp_lo,
+ &clamp_hi);
+ addsub_avx2(u[i + 6], u[i + 5], &u[i + 6], &u[i + 5], &clamp_lo,
+ &clamp_hi);
+ }
+
+ tmp1 = half_btf_avx2(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit);
+ tmp2 = half_btf_avx2(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit);
+ tmp3 = half_btf_avx2(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit);
+ tmp4 = half_btf_avx2(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit);
+ u[58] = half_btf_avx2(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit);
+ u[59] = half_btf_avx2(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit);
+ u[60] = half_btf_avx2(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit);
+ u[61] = half_btf_avx2(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit);
+ u[34] = tmp1;
+ u[35] = tmp2;
+ u[36] = tmp3;
+ u[37] = tmp4;
+
+ tmp1 = half_btf_avx2(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit);
+ tmp2 = half_btf_avx2(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit);
+ tmp3 = half_btf_avx2(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit);
+ tmp4 = half_btf_avx2(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit);
+ u[50] = half_btf_avx2(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit);
+ u[51] = half_btf_avx2(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit);
+ u[52] = half_btf_avx2(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit);
+ u[53] = half_btf_avx2(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit);
+ u[42] = tmp1;
+ u[43] = tmp2;
+ u[44] = tmp3;
+ u[45] = tmp4;
+
+ // stage 7
+ u[3] = u[0];
+ u[2] = u[1];
+ tmp1 = half_btf_avx2(&cospim32, &u[5], &cospi32, &u[6], &rnding, bit);
+ u[6] = half_btf_avx2(&cospi32, &u[5], &cospi32, &u[6], &rnding, bit);
+ u[5] = tmp1;
+ addsub_avx2(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
+
+ tmp1 = half_btf_avx2(&cospim16, &u[18], &cospi48, &u[29], &rnding, bit);
+ tmp2 = half_btf_avx2(&cospim16, &u[19], &cospi48, &u[28], &rnding, bit);
+ tmp3 = half_btf_avx2(&cospim48, &u[20], &cospim16, &u[27], &rnding, bit);
+ tmp4 = half_btf_avx2(&cospim48, &u[21], &cospim16, &u[26], &rnding, bit);
+ u[26] = half_btf_avx2(&cospim16, &u[21], &cospi48, &u[26], &rnding, bit);
+ u[27] = half_btf_avx2(&cospim16, &u[20], &cospi48, &u[27], &rnding, bit);
+ u[28] = half_btf_avx2(&cospi48, &u[19], &cospi16, &u[28], &rnding, bit);
+ u[29] = half_btf_avx2(&cospi48, &u[18], &cospi16, &u[29], &rnding, bit);
+ u[18] = tmp1;
+ u[19] = tmp2;
+ u[20] = tmp3;
+ u[21] = tmp4;
+
+ for (i = 32; i < 64; i += 16) {
+ for (j = i; j < i + 4; j++) {
+ addsub_avx2(u[j], u[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[j ^ 15], u[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
+ &clamp_hi);
+ }
+ }
+
+ // stage 8
+ for (i = 0; i < 4; ++i) {
+ addsub_avx2(u[i], u[7 - i], &u[i], &u[7 - i], &clamp_lo, &clamp_hi);
+ }
+
+ idct64_stage8_avx2(u, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
+ &cospim48, &clamp_lo, &clamp_hi, &rnding, bit);
+
+ // stage 9
+ idct64_stage9_avx2(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
+ bit);
+
+ // stage 10
+ idct64_stage10_avx2(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
+ bit);
+
+ // stage 11
+ idct64_stage11_avx2(u, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi);
+ }
+}
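The other recurring building block is addsub_avx2(): a butterfly add/sub whose two results are immediately saturated to the intermediate range derived from log_range. The stage-7 loops above also use XOR indexing so that, within each 16-wide group starting at i, the pairs (j, j ^ 7) and (j ^ 15, j ^ 8) fold the first and second halves of the group symmetrically. A scalar sketch of both ideas:

#include <stdint.h>

static int32_t clamp32(int32_t v, int32_t lo, int32_t hi) {
  return v < lo ? lo : (v > hi ? hi : v);
}

/* Scalar equivalent of addsub_avx2(): add/sub butterfly with saturation. */
static void addsub_clamp(int32_t in0, int32_t in1, int32_t *out0,
                         int32_t *out1, int32_t lo, int32_t hi) {
  *out0 = clamp32(in0 + in1, lo, hi);
  *out1 = clamp32(in0 - in1, lo, hi);
}

/* Stage-7 folding of one 16-element group starting at index i (i = 32, 48):
 * for i = 32 this pairs (32,39), (33,38), ... and (47,40), (46,41), ... */
static void stage7_group(int32_t *u, int i, int32_t lo, int32_t hi) {
  for (int j = i; j < i + 4; j++) {
    addsub_clamp(u[j], u[j ^ 7], &u[j], &u[j ^ 7], lo, hi);
    addsub_clamp(u[j ^ 15], u[j ^ 8], &u[j ^ 15], &u[j ^ 8], lo, hi);
  }
}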
+static void idct64_avx2(__m256i *in, __m256i *out, int bit, int do_cols, int bd,
+ int out_shift) {
+ int i, j;
+ const int32_t *cospi = cospi_arr(bit);
+ const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
+ const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
+
+ const __m256i cospi1 = _mm256_set1_epi32(cospi[1]);
+ const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
+ const __m256i cospi3 = _mm256_set1_epi32(cospi[3]);
+ const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
+ const __m256i cospi5 = _mm256_set1_epi32(cospi[5]);
+ const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
+ const __m256i cospi7 = _mm256_set1_epi32(cospi[7]);
+ const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
+ const __m256i cospi9 = _mm256_set1_epi32(cospi[9]);
+ const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
+ const __m256i cospi11 = _mm256_set1_epi32(cospi[11]);
+ const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
+ const __m256i cospi13 = _mm256_set1_epi32(cospi[13]);
+ const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
+ const __m256i cospi15 = _mm256_set1_epi32(cospi[15]);
+ const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
+ const __m256i cospi17 = _mm256_set1_epi32(cospi[17]);
+ const __m256i cospi18 = _mm256_set1_epi32(cospi[18]);
+ const __m256i cospi19 = _mm256_set1_epi32(cospi[19]);
+ const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
+ const __m256i cospi21 = _mm256_set1_epi32(cospi[21]);
+ const __m256i cospi22 = _mm256_set1_epi32(cospi[22]);
+ const __m256i cospi23 = _mm256_set1_epi32(cospi[23]);
+ const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
+ const __m256i cospi25 = _mm256_set1_epi32(cospi[25]);
+ const __m256i cospi26 = _mm256_set1_epi32(cospi[26]);
+ const __m256i cospi27 = _mm256_set1_epi32(cospi[27]);
+ const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
+ const __m256i cospi29 = _mm256_set1_epi32(cospi[29]);
+ const __m256i cospi30 = _mm256_set1_epi32(cospi[30]);
+ const __m256i cospi31 = _mm256_set1_epi32(cospi[31]);
+ const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
+ const __m256i cospi35 = _mm256_set1_epi32(cospi[35]);
+ const __m256i cospi36 = _mm256_set1_epi32(cospi[36]);
+ const __m256i cospi38 = _mm256_set1_epi32(cospi[38]);
+ const __m256i cospi39 = _mm256_set1_epi32(cospi[39]);
+ const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
+ const __m256i cospi43 = _mm256_set1_epi32(cospi[43]);
+ const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
+ const __m256i cospi46 = _mm256_set1_epi32(cospi[46]);
+ const __m256i cospi47 = _mm256_set1_epi32(cospi[47]);
+ const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
+ const __m256i cospi51 = _mm256_set1_epi32(cospi[51]);
+ const __m256i cospi52 = _mm256_set1_epi32(cospi[52]);
+ const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
+ const __m256i cospi55 = _mm256_set1_epi32(cospi[55]);
+ const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
+ const __m256i cospi59 = _mm256_set1_epi32(cospi[59]);
+ const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
+ const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
+ const __m256i cospi63 = _mm256_set1_epi32(cospi[63]);
+
+ const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]);
+ const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
+ const __m256i cospim12 = _mm256_set1_epi32(-cospi[12]);
+ const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
+ const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]);
+ const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
+ const __m256i cospim28 = _mm256_set1_epi32(-cospi[28]);
+ const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
+ const __m256i cospim33 = _mm256_set1_epi32(-cospi[33]);
+ const __m256i cospim34 = _mm256_set1_epi32(-cospi[34]);
+ const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
+ const __m256i cospim37 = _mm256_set1_epi32(-cospi[37]);
+ const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
+ const __m256i cospim41 = _mm256_set1_epi32(-cospi[41]);
+ const __m256i cospim42 = _mm256_set1_epi32(-cospi[42]);
+ const __m256i cospim44 = _mm256_set1_epi32(-cospi[44]);
+ const __m256i cospim45 = _mm256_set1_epi32(-cospi[45]);
+ const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
+ const __m256i cospim49 = _mm256_set1_epi32(-cospi[49]);
+ const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]);
+ const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
+ const __m256i cospim53 = _mm256_set1_epi32(-cospi[53]);
+ const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
+ const __m256i cospim57 = _mm256_set1_epi32(-cospi[57]);
+ const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]);
+ const __m256i cospim60 = _mm256_set1_epi32(-cospi[60]);
+ const __m256i cospim61 = _mm256_set1_epi32(-cospi[61]);
+
+ {
+ __m256i u[64], v[64];
+
+ // stage 1
+ u[32] = in[1];
+ u[34] = in[17];
+ u[36] = in[9];
+ u[38] = in[25];
+ u[40] = in[5];
+ u[42] = in[21];
+ u[44] = in[13];
+ u[46] = in[29];
+ u[48] = in[3];
+ u[50] = in[19];
+ u[52] = in[11];
+ u[54] = in[27];
+ u[56] = in[7];
+ u[58] = in[23];
+ u[60] = in[15];
+ u[62] = in[31];
+
+ v[16] = in[2];
+ v[18] = in[18];
+ v[20] = in[10];
+ v[22] = in[26];
+ v[24] = in[6];
+ v[26] = in[22];
+ v[28] = in[14];
+ v[30] = in[30];
+
+ u[8] = in[4];
+ u[10] = in[20];
+ u[12] = in[12];
+ u[14] = in[28];
+
+ v[4] = in[8];
+ v[6] = in[24];
+
+ u[0] = in[0];
+ u[2] = in[16];
+
+ // stage 2
+ v[32] = half_btf_0_avx2(&cospi63, &u[32], &rnding, bit);
+ v[33] = half_btf_0_avx2(&cospim33, &u[62], &rnding, bit);
+ v[34] = half_btf_0_avx2(&cospi47, &u[34], &rnding, bit);
+ v[35] = half_btf_0_avx2(&cospim49, &u[60], &rnding, bit);
+ v[36] = half_btf_0_avx2(&cospi55, &u[36], &rnding, bit);
+ v[37] = half_btf_0_avx2(&cospim41, &u[58], &rnding, bit);
+ v[38] = half_btf_0_avx2(&cospi39, &u[38], &rnding, bit);
+ v[39] = half_btf_0_avx2(&cospim57, &u[56], &rnding, bit);
+ v[40] = half_btf_0_avx2(&cospi59, &u[40], &rnding, bit);
+ v[41] = half_btf_0_avx2(&cospim37, &u[54], &rnding, bit);
+ v[42] = half_btf_0_avx2(&cospi43, &u[42], &rnding, bit);
+ v[43] = half_btf_0_avx2(&cospim53, &u[52], &rnding, bit);
+ v[44] = half_btf_0_avx2(&cospi51, &u[44], &rnding, bit);
+ v[45] = half_btf_0_avx2(&cospim45, &u[50], &rnding, bit);
+ v[46] = half_btf_0_avx2(&cospi35, &u[46], &rnding, bit);
+ v[47] = half_btf_0_avx2(&cospim61, &u[48], &rnding, bit);
+ v[48] = half_btf_0_avx2(&cospi3, &u[48], &rnding, bit);
+ v[49] = half_btf_0_avx2(&cospi29, &u[46], &rnding, bit);
+ v[50] = half_btf_0_avx2(&cospi19, &u[50], &rnding, bit);
+ v[51] = half_btf_0_avx2(&cospi13, &u[44], &rnding, bit);
+ v[52] = half_btf_0_avx2(&cospi11, &u[52], &rnding, bit);
+ v[53] = half_btf_0_avx2(&cospi21, &u[42], &rnding, bit);
+ v[54] = half_btf_0_avx2(&cospi27, &u[54], &rnding, bit);
+ v[55] = half_btf_0_avx2(&cospi5, &u[40], &rnding, bit);
+ v[56] = half_btf_0_avx2(&cospi7, &u[56], &rnding, bit);
+ v[57] = half_btf_0_avx2(&cospi25, &u[38], &rnding, bit);
+ v[58] = half_btf_0_avx2(&cospi23, &u[58], &rnding, bit);
+ v[59] = half_btf_0_avx2(&cospi9, &u[36], &rnding, bit);
+ v[60] = half_btf_0_avx2(&cospi15, &u[60], &rnding, bit);
+ v[61] = half_btf_0_avx2(&cospi17, &u[34], &rnding, bit);
+ v[62] = half_btf_0_avx2(&cospi31, &u[62], &rnding, bit);
+ v[63] = half_btf_0_avx2(&cospi1, &u[32], &rnding, bit);
+
+ // stage 3
+ u[16] = half_btf_0_avx2(&cospi62, &v[16], &rnding, bit);
+ u[17] = half_btf_0_avx2(&cospim34, &v[30], &rnding, bit);
+ u[18] = half_btf_0_avx2(&cospi46, &v[18], &rnding, bit);
+ u[19] = half_btf_0_avx2(&cospim50, &v[28], &rnding, bit);
+ u[20] = half_btf_0_avx2(&cospi54, &v[20], &rnding, bit);
+ u[21] = half_btf_0_avx2(&cospim42, &v[26], &rnding, bit);
+ u[22] = half_btf_0_avx2(&cospi38, &v[22], &rnding, bit);
+ u[23] = half_btf_0_avx2(&cospim58, &v[24], &rnding, bit);
+ u[24] = half_btf_0_avx2(&cospi6, &v[24], &rnding, bit);
+ u[25] = half_btf_0_avx2(&cospi26, &v[22], &rnding, bit);
+ u[26] = half_btf_0_avx2(&cospi22, &v[26], &rnding, bit);
+ u[27] = half_btf_0_avx2(&cospi10, &v[20], &rnding, bit);
+ u[28] = half_btf_0_avx2(&cospi14, &v[28], &rnding, bit);
+ u[29] = half_btf_0_avx2(&cospi18, &v[18], &rnding, bit);
+ u[30] = half_btf_0_avx2(&cospi30, &v[30], &rnding, bit);
+ u[31] = half_btf_0_avx2(&cospi2, &v[16], &rnding, bit);
+
+ for (i = 32; i < 64; i += 4) {
+ addsub_avx2(v[i + 0], v[i + 1], &u[i + 0], &u[i + 1], &clamp_lo,
+ &clamp_hi);
+ addsub_avx2(v[i + 3], v[i + 2], &u[i + 3], &u[i + 2], &clamp_lo,
+ &clamp_hi);
+ }
+
+ // stage 4
+ v[8] = half_btf_0_avx2(&cospi60, &u[8], &rnding, bit);
+ v[9] = half_btf_0_avx2(&cospim36, &u[14], &rnding, bit);
+ v[10] = half_btf_0_avx2(&cospi44, &u[10], &rnding, bit);
+ v[11] = half_btf_0_avx2(&cospim52, &u[12], &rnding, bit);
+ v[12] = half_btf_0_avx2(&cospi12, &u[12], &rnding, bit);
+ v[13] = half_btf_0_avx2(&cospi20, &u[10], &rnding, bit);
+ v[14] = half_btf_0_avx2(&cospi28, &u[14], &rnding, bit);
+ v[15] = half_btf_0_avx2(&cospi4, &u[8], &rnding, bit);
+
+ for (i = 16; i < 32; i += 4) {
+ addsub_avx2(u[i + 0], u[i + 1], &v[i + 0], &v[i + 1], &clamp_lo,
+ &clamp_hi);
+ addsub_avx2(u[i + 3], u[i + 2], &v[i + 3], &v[i + 2], &clamp_lo,
+ &clamp_hi);
+ }
+
+ for (i = 32; i < 64; i += 4) {
+ v[i + 0] = u[i + 0];
+ v[i + 3] = u[i + 3];
+ }
+
+ v[33] = half_btf_avx2(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit);
+ v[34] = half_btf_avx2(&cospim60, &u[34], &cospim4, &u[61], &rnding, bit);
+ v[37] = half_btf_avx2(&cospim36, &u[37], &cospi28, &u[58], &rnding, bit);
+ v[38] = half_btf_avx2(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit);
+ v[41] = half_btf_avx2(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit);
+ v[42] = half_btf_avx2(&cospim44, &u[42], &cospim20, &u[53], &rnding, bit);
+ v[45] = half_btf_avx2(&cospim52, &u[45], &cospi12, &u[50], &rnding, bit);
+ v[46] = half_btf_avx2(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit);
+ v[49] = half_btf_avx2(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit);
+ v[50] = half_btf_avx2(&cospi12, &u[45], &cospi52, &u[50], &rnding, bit);
+ v[53] = half_btf_avx2(&cospim20, &u[42], &cospi44, &u[53], &rnding, bit);
+ v[54] = half_btf_avx2(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit);
+ v[57] = half_btf_avx2(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit);
+ v[58] = half_btf_avx2(&cospi28, &u[37], &cospi36, &u[58], &rnding, bit);
+ v[61] = half_btf_avx2(&cospim4, &u[34], &cospi60, &u[61], &rnding, bit);
+ v[62] = half_btf_avx2(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit);
+
+ // stage 5
+ u[4] = half_btf_0_avx2(&cospi56, &v[4], &rnding, bit);
+ u[5] = half_btf_0_avx2(&cospim40, &v[6], &rnding, bit);
+ u[6] = half_btf_0_avx2(&cospi24, &v[6], &rnding, bit);
+ u[7] = half_btf_0_avx2(&cospi8, &v[4], &rnding, bit);
+
+ for (i = 8; i < 16; i += 4) {
+ addsub_avx2(v[i + 0], v[i + 1], &u[i + 0], &u[i + 1], &clamp_lo,
+ &clamp_hi);
+ addsub_avx2(v[i + 3], v[i + 2], &u[i + 3], &u[i + 2], &clamp_lo,
+ &clamp_hi);
+ }
+
+ for (i = 16; i < 32; i += 4) {
+ u[i + 0] = v[i + 0];
+ u[i + 3] = v[i + 3];
+ }
+
+ u[17] = half_btf_avx2(&cospim8, &v[17], &cospi56, &v[30], &rnding, bit);
+ u[18] = half_btf_avx2(&cospim56, &v[18], &cospim8, &v[29], &rnding, bit);
+ u[21] = half_btf_avx2(&cospim40, &v[21], &cospi24, &v[26], &rnding, bit);
+ u[22] = half_btf_avx2(&cospim24, &v[22], &cospim40, &v[25], &rnding, bit);
+ u[25] = half_btf_avx2(&cospim40, &v[22], &cospi24, &v[25], &rnding, bit);
+ u[26] = half_btf_avx2(&cospi24, &v[21], &cospi40, &v[26], &rnding, bit);
+ u[29] = half_btf_avx2(&cospim8, &v[18], &cospi56, &v[29], &rnding, bit);
+ u[30] = half_btf_avx2(&cospi56, &v[17], &cospi8, &v[30], &rnding, bit);
+
+ for (i = 32; i < 64; i += 8) {
+ addsub_avx2(v[i + 0], v[i + 3], &u[i + 0], &u[i + 3], &clamp_lo,
+ &clamp_hi);
+ addsub_avx2(v[i + 1], v[i + 2], &u[i + 1], &u[i + 2], &clamp_lo,
+ &clamp_hi);
+
+ addsub_avx2(v[i + 7], v[i + 4], &u[i + 7], &u[i + 4], &clamp_lo,
+ &clamp_hi);
+ addsub_avx2(v[i + 6], v[i + 5], &u[i + 6], &u[i + 5], &clamp_lo,
+ &clamp_hi);
+ }
+
+ // stage 6
+ v[0] = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit);
+ v[1] = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit);
+ v[2] = half_btf_0_avx2(&cospi48, &u[2], &rnding, bit);
+ v[3] = half_btf_0_avx2(&cospi16, &u[2], &rnding, bit);
+
+ addsub_avx2(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi);
+
+ for (i = 8; i < 16; i += 4) {
+ v[i + 0] = u[i + 0];
+ v[i + 3] = u[i + 3];
+ }
+
+ v[9] = half_btf_avx2(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
+ v[10] = half_btf_avx2(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
+ v[13] = half_btf_avx2(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
+ v[14] = half_btf_avx2(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
+
+ for (i = 16; i < 32; i += 8) {
+ addsub_avx2(u[i + 0], u[i + 3], &v[i + 0], &v[i + 3], &clamp_lo,
+ &clamp_hi);
+ addsub_avx2(u[i + 1], u[i + 2], &v[i + 1], &v[i + 2], &clamp_lo,
+ &clamp_hi);
+
+ addsub_avx2(u[i + 7], u[i + 4], &v[i + 7], &v[i + 4], &clamp_lo,
+ &clamp_hi);
+ addsub_avx2(u[i + 6], u[i + 5], &v[i + 6], &v[i + 5], &clamp_lo,
+ &clamp_hi);
+ }
+
+ for (i = 32; i < 64; i += 8) {
+ v[i + 0] = u[i + 0];
+ v[i + 1] = u[i + 1];
+ v[i + 6] = u[i + 6];
+ v[i + 7] = u[i + 7];
+ }
+
+ v[34] = half_btf_avx2(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit);
+ v[35] = half_btf_avx2(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit);
+ v[36] = half_btf_avx2(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit);
+ v[37] = half_btf_avx2(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit);
+ v[42] = half_btf_avx2(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit);
+ v[43] = half_btf_avx2(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit);
+ v[44] = half_btf_avx2(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit);
+ v[45] = half_btf_avx2(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit);
+ v[50] = half_btf_avx2(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit);
+ v[51] = half_btf_avx2(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit);
+ v[52] = half_btf_avx2(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit);
+ v[53] = half_btf_avx2(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit);
+ v[58] = half_btf_avx2(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit);
+ v[59] = half_btf_avx2(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit);
+ v[60] = half_btf_avx2(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit);
+ v[61] = half_btf_avx2(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit);
+
+ // stage 7
+ addsub_avx2(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
+ addsub_avx2(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
+
+ u[4] = v[4];
+ u[7] = v[7];
+ u[5] = half_btf_avx2(&cospim32, &v[5], &cospi32, &v[6], &rnding, bit);
+ u[6] = half_btf_avx2(&cospi32, &v[5], &cospi32, &v[6], &rnding, bit);
+
+ addsub_avx2(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
+ addsub_avx2(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
+ addsub_avx2(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
+ addsub_avx2(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
+
+ for (i = 16; i < 32; i += 8) {
+ u[i + 0] = v[i + 0];
+ u[i + 1] = v[i + 1];
+ u[i + 6] = v[i + 6];
+ u[i + 7] = v[i + 7];
+ }
+
+ u[18] = half_btf_avx2(&cospim16, &v[18], &cospi48, &v[29], &rnding, bit);
+ u[19] = half_btf_avx2(&cospim16, &v[19], &cospi48, &v[28], &rnding, bit);
+ u[20] = half_btf_avx2(&cospim48, &v[20], &cospim16, &v[27], &rnding, bit);
+ u[21] = half_btf_avx2(&cospim48, &v[21], &cospim16, &v[26], &rnding, bit);
+ u[26] = half_btf_avx2(&cospim16, &v[21], &cospi48, &v[26], &rnding, bit);
+ u[27] = half_btf_avx2(&cospim16, &v[20], &cospi48, &v[27], &rnding, bit);
+ u[28] = half_btf_avx2(&cospi48, &v[19], &cospi16, &v[28], &rnding, bit);
+ u[29] = half_btf_avx2(&cospi48, &v[18], &cospi16, &v[29], &rnding, bit);
+
+ for (i = 32; i < 64; i += 16) {
+ for (j = i; j < i + 4; j++) {
+ addsub_avx2(v[j], v[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
+ addsub_avx2(v[j ^ 15], v[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
+ &clamp_hi);
+ }
+ }
+
+ // stage 8
+ for (i = 0; i < 4; ++i) {
+ addsub_avx2(u[i], u[7 - i], &v[i], &v[7 - i], &clamp_lo, &clamp_hi);
+ }
+
+ v[8] = u[8];
+ v[9] = u[9];
+ v[14] = u[14];
+ v[15] = u[15];
+
+ v[10] = half_btf_avx2(&cospim32, &u[10], &cospi32, &u[13], &rnding, bit);
+ v[11] = half_btf_avx2(&cospim32, &u[11], &cospi32, &u[12], &rnding, bit);
+ v[12] = half_btf_avx2(&cospi32, &u[11], &cospi32, &u[12], &rnding, bit);
+ v[13] = half_btf_avx2(&cospi32, &u[10], &cospi32, &u[13], &rnding, bit);
+
+ for (i = 16; i < 20; ++i) {
+ addsub_avx2(u[i], u[i ^ 7], &v[i], &v[i ^ 7], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[i ^ 15], u[i ^ 8], &v[i ^ 15], &v[i ^ 8], &clamp_lo,
+ &clamp_hi);
+ }
+
+ for (i = 32; i < 36; ++i) {
+ v[i] = u[i];
+ v[i + 12] = u[i + 12];
+ v[i + 16] = u[i + 16];
+ v[i + 28] = u[i + 28];
+ }
+
+ v[36] = half_btf_avx2(&cospim16, &u[36], &cospi48, &u[59], &rnding, bit);
+ v[37] = half_btf_avx2(&cospim16, &u[37], &cospi48, &u[58], &rnding, bit);
+ v[38] = half_btf_avx2(&cospim16, &u[38], &cospi48, &u[57], &rnding, bit);
+ v[39] = half_btf_avx2(&cospim16, &u[39], &cospi48, &u[56], &rnding, bit);
+ v[40] = half_btf_avx2(&cospim48, &u[40], &cospim16, &u[55], &rnding, bit);
+ v[41] = half_btf_avx2(&cospim48, &u[41], &cospim16, &u[54], &rnding, bit);
+ v[42] = half_btf_avx2(&cospim48, &u[42], &cospim16, &u[53], &rnding, bit);
+ v[43] = half_btf_avx2(&cospim48, &u[43], &cospim16, &u[52], &rnding, bit);
+ v[52] = half_btf_avx2(&cospim16, &u[43], &cospi48, &u[52], &rnding, bit);
+ v[53] = half_btf_avx2(&cospim16, &u[42], &cospi48, &u[53], &rnding, bit);
+ v[54] = half_btf_avx2(&cospim16, &u[41], &cospi48, &u[54], &rnding, bit);
+ v[55] = half_btf_avx2(&cospim16, &u[40], &cospi48, &u[55], &rnding, bit);
+ v[56] = half_btf_avx2(&cospi48, &u[39], &cospi16, &u[56], &rnding, bit);
+ v[57] = half_btf_avx2(&cospi48, &u[38], &cospi16, &u[57], &rnding, bit);
+ v[58] = half_btf_avx2(&cospi48, &u[37], &cospi16, &u[58], &rnding, bit);
+ v[59] = half_btf_avx2(&cospi48, &u[36], &cospi16, &u[59], &rnding, bit);
+
+ // stage 9
+ for (i = 0; i < 8; ++i) {
+ addsub_avx2(v[i], v[15 - i], &u[i], &u[15 - i], &clamp_lo, &clamp_hi);
+ }
+
+ for (i = 16; i < 20; ++i) {
+ u[i] = v[i];
+ u[i + 12] = v[i + 12];
+ }
+
+ u[20] = half_btf_avx2(&cospim32, &v[20], &cospi32, &v[27], &rnding, bit);
+ u[21] = half_btf_avx2(&cospim32, &v[21], &cospi32, &v[26], &rnding, bit);
+ u[22] = half_btf_avx2(&cospim32, &v[22], &cospi32, &v[25], &rnding, bit);
+ u[23] = half_btf_avx2(&cospim32, &v[23], &cospi32, &v[24], &rnding, bit);
+ u[24] = half_btf_avx2(&cospi32, &v[23], &cospi32, &v[24], &rnding, bit);
+ u[25] = half_btf_avx2(&cospi32, &v[22], &cospi32, &v[25], &rnding, bit);
+ u[26] = half_btf_avx2(&cospi32, &v[21], &cospi32, &v[26], &rnding, bit);
+ u[27] = half_btf_avx2(&cospi32, &v[20], &cospi32, &v[27], &rnding, bit);
+
+ for (i = 32; i < 40; i++) {
+ addsub_avx2(v[i], v[i ^ 15], &u[i], &u[i ^ 15], &clamp_lo, &clamp_hi);
+ }
+
+ for (i = 48; i < 56; i++) {
+ addsub_avx2(v[i ^ 15], v[i], &u[i ^ 15], &u[i], &clamp_lo, &clamp_hi);
+ }
+
+ // stage 10
+ for (i = 0; i < 16; i++) {
+ addsub_avx2(u[i], u[31 - i], &v[i], &v[31 - i], &clamp_lo, &clamp_hi);
+ }
+
+ for (i = 32; i < 40; i++) v[i] = u[i];
+
+ v[40] = half_btf_avx2(&cospim32, &u[40], &cospi32, &u[55], &rnding, bit);
+ v[41] = half_btf_avx2(&cospim32, &u[41], &cospi32, &u[54], &rnding, bit);
+ v[42] = half_btf_avx2(&cospim32, &u[42], &cospi32, &u[53], &rnding, bit);
+ v[43] = half_btf_avx2(&cospim32, &u[43], &cospi32, &u[52], &rnding, bit);
+ v[44] = half_btf_avx2(&cospim32, &u[44], &cospi32, &u[51], &rnding, bit);
+ v[45] = half_btf_avx2(&cospim32, &u[45], &cospi32, &u[50], &rnding, bit);
+ v[46] = half_btf_avx2(&cospim32, &u[46], &cospi32, &u[49], &rnding, bit);
+ v[47] = half_btf_avx2(&cospim32, &u[47], &cospi32, &u[48], &rnding, bit);
+ v[48] = half_btf_avx2(&cospi32, &u[47], &cospi32, &u[48], &rnding, bit);
+ v[49] = half_btf_avx2(&cospi32, &u[46], &cospi32, &u[49], &rnding, bit);
+ v[50] = half_btf_avx2(&cospi32, &u[45], &cospi32, &u[50], &rnding, bit);
+ v[51] = half_btf_avx2(&cospi32, &u[44], &cospi32, &u[51], &rnding, bit);
+ v[52] = half_btf_avx2(&cospi32, &u[43], &cospi32, &u[52], &rnding, bit);
+ v[53] = half_btf_avx2(&cospi32, &u[42], &cospi32, &u[53], &rnding, bit);
+ v[54] = half_btf_avx2(&cospi32, &u[41], &cospi32, &u[54], &rnding, bit);
+ v[55] = half_btf_avx2(&cospi32, &u[40], &cospi32, &u[55], &rnding, bit);
+
+ for (i = 56; i < 64; i++) v[i] = u[i];
+
+ // stage 11
+ for (i = 0; i < 32; i++) {
+ addsub_avx2(v[i], v[63 - i], &out[(i)], &out[(63 - i)], &clamp_lo,
+ &clamp_hi);
+ }
+ if (!do_cols) {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m256i clamp_lo_out =
+ _mm256_set1_epi32(-(1 << (log_range_out - 1)));
+ const __m256i clamp_hi_out =
+ _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
+
+ round_shift_8x8_avx2(out, out_shift);
+ round_shift_8x8_avx2(out + 16, out_shift);
+ round_shift_8x8_avx2(out + 32, out_shift);
+ round_shift_8x8_avx2(out + 48, out_shift);
+ highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 64);
+ }
+ }
+}
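The full-length idct64_avx2 kernel ping-pongs between the u[] and v[] scratch arrays so that each stage reads one array and writes the other. After stage 11, the row pass (!do_cols) still has to drop out_shift bits with rounding and clamp the result to the bd + 6 bit intermediate range, which is what round_shift_8x8_avx2() plus highbd_clamp_epi32_avx2() do over the four 8x8 tiles. A scalar sketch of that epilogue, assuming the round shift adds 1 << (shift - 1) before the arithmetic shift (as the SSE4.1 round_shift_4x4() later in this patch does):

#include <stdint.h>

/* Scalar model of the row-pass epilogue: rounded shift, then clamp every
 * value to AOMMAX(16, bd + 6) signed bits. */
static void round_shift_and_clamp_ref(int32_t *buf, int n, int shift, int bd) {
  const int log_range_out = (bd + 6 > 16) ? bd + 6 : 16;
  const int32_t lo = -(1 << (log_range_out - 1));
  const int32_t hi = (1 << (log_range_out - 1)) - 1;
  const int32_t offset = (shift > 0) ? (1 << (shift - 1)) : 0;
  for (int i = 0; i < n; ++i) {
    int32_t x = (buf[i] + offset) >> shift;
    buf[i] = x < lo ? lo : (x > hi ? hi : x);
  }
}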
typedef void (*transform_1d_avx2)(__m256i *in, __m256i *out, int bit,
int do_cols, int bd, int out_shift);
@@ -1164,19 +4086,21 @@ static const transform_1d_avx2
{ NULL, NULL, NULL, NULL },
{ NULL, NULL, NULL, NULL },
},
- { { NULL, NULL, NULL, NULL },
- { NULL, NULL, NULL, NULL },
- { NULL, NULL, NULL, NULL } },
{
+ { idct8x8_low1_avx2, idct8x8_avx2, NULL, NULL },
+ { iadst8x8_low1_avx2, iadst8x8_avx2, NULL, NULL },
{ NULL, NULL, NULL, NULL },
- { NULL, NULL, NULL, NULL },
+ },
+ {
+ { idct16_low1_avx2, idct16_low8_avx2, idct16_avx2, NULL },
+ { iadst16_low1_avx2, iadst16_low8_avx2, iadst16_avx2, NULL },
{ NULL, NULL, NULL, NULL },
},
{ { idct32_low1_avx2, idct32_low8_avx2, idct32_low16_avx2, idct32_avx2 },
{ NULL, NULL, NULL, NULL },
{ NULL, NULL, NULL, NULL } },
- { { NULL, NULL, NULL, NULL },
+ { { idct64_low1_avx2, idct64_low8_avx2, idct64_low16_avx2, idct64_avx2 },
{ NULL, NULL, NULL, NULL },
{ NULL, NULL, NULL, NULL } }
};
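The table completed here is indexed as [tx-size class][1-D transform family][eob bucket]; the new rows register the 8-, 16- and 64-point AVX2 kernels, with NULL still meaning "handled by another code path". The eob bucket picks the cheapest *_low1 / *_low8 / *_low16 / full kernel that still covers every nonzero coefficient. A hedged sketch of that mapping; the real thresholds live in lowbd_txfm_all_1d_zeros_idx[] and are an assumption here:

/* Illustrative eob-extent -> kernel-variant mapping; the actual values come
 * from lowbd_txfm_all_1d_zeros_idx[] and may differ. */
static int eob_to_kernel_variant(int eob_extent) {
  if (eob_extent <= 1) return 0;  /* *_low1: DC only          */
  if (eob_extent <= 8) return 1;  /* *_low8: first 8 inputs   */
  if (eob_extent <= 16) return 2; /* *_low16: first 16 inputs */
  return 3;                       /* full-length kernel       */
}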
@@ -1186,10 +4110,10 @@ static void highbd_inv_txfm2d_add_no_identity_avx2(const int32_t *input,
TX_TYPE tx_type,
TX_SIZE tx_size, int eob,
const int bd) {
- __m256i buf1[64 * 2];
+ __m256i buf1[64 * 8];
int eobx, eoby;
get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
- const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
const int txw_idx = get_txw_idx(tx_size);
const int txh_idx = get_txh_idx(tx_size);
const int txfm_size_col = tx_size_wide[tx_size];
@@ -1198,7 +4122,7 @@ static void highbd_inv_txfm2d_add_no_identity_avx2(const int32_t *input,
const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3;
const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
const int input_stride = AOMMIN(32, txfm_size_col);
-
+ const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
const transform_1d_avx2 row_txfm =
@@ -1213,7 +4137,7 @@ static void highbd_inv_txfm2d_add_no_identity_avx2(const int32_t *input,
// 1st stage: column transform
for (int i = 0; i < buf_size_nonzero_h_div8; i++) {
- __m256i buf0[32];
+ __m256i buf0[64];
const int32_t *input_row = input + i * input_stride * 8;
for (int j = 0; j < buf_size_nonzero_w_div8; ++j) {
__m256i *buf0_cur = buf0 + j * 8;
@@ -1221,18 +4145,29 @@ static void highbd_inv_txfm2d_add_no_identity_avx2(const int32_t *input,
transpose_8x8_avx2(&buf0_cur[0], &buf0_cur[0]);
}
-
- row_txfm(buf0, buf0, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]);
+ if (rect_type == 1 || rect_type == -1) {
+ av1_round_shift_rect_array_32_avx2(
+ buf0, buf0, buf_size_nonzero_w_div8 << 3, 0, NewInvSqrt2);
+ }
+ row_txfm(buf0, buf0, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
+ -shift[0]);
__m256i *_buf1 = buf1 + i * 8;
- for (int j = 0; j < buf_size_w_div8; ++j) {
- transpose_8x8_avx2(&buf0[j * 8], &_buf1[j * txfm_size_row]);
+ if (lr_flip) {
+ for (int j = 0; j < buf_size_w_div8; ++j) {
+ transpose_8x8_flip_avx2(
+ &buf0[j * 8], &_buf1[(buf_size_w_div8 - 1 - j) * txfm_size_row]);
+ }
+ } else {
+ for (int j = 0; j < buf_size_w_div8; ++j) {
+ transpose_8x8_avx2(&buf0[j * 8], &_buf1[j * txfm_size_row]);
+ }
}
}
// 2nd stage: column transform
for (int i = 0; i < buf_size_w_div8; i++) {
col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row,
- inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+ av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
av1_round_shift_array_32_avx2(buf1 + i * txfm_size_row,
buf1 + i * txfm_size_row, txfm_size_row,
@@ -1240,12 +4175,15 @@ static void highbd_inv_txfm2d_add_no_identity_avx2(const int32_t *input,
}
// write to buffer
- {
+ if (txfm_size_col >= 16) {
for (int i = 0; i < (txfm_size_col >> 4); i++) {
highbd_write_buffer_16xn_avx2(buf1 + i * txfm_size_row * 2,
output + 16 * i, stride, ud_flip,
txfm_size_row, bd);
}
+ } else if (txfm_size_col == 8) {
+ highbd_write_buffer_8xn_avx2(buf1, output, stride, ud_flip, txfm_size_row,
+ bd);
}
}
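In the function above, the first loop performs the row transform: it loads and transposes each 8x8 tile of coefficients, rescales rectangular blocks by 1/sqrt(2) via av1_round_shift_rect_array_32_avx2() with NewInvSqrt2, runs row_txfm, and transposes back into buf1 (transpose_8x8_flip_avx2 handling left-right flipped transform types). The second loop then runs col_txfm down each 8-wide column strip before the clipped write-back to the high bit-depth frame. A minimal scalar sketch of that separable structure, with txfm_row/txfm_col standing in for the 1-D kernels (not the library's API):

#include <stdint.h>

/* Minimal scalar sketch of the separable 2-D inverse transform above:
 * rows first, then columns; the SIMD version does both via 8x8 transposes. */
static void inv_txfm2d_ref(const int32_t *coeff, int32_t *tmp, int32_t *out,
                           int w, int h,
                           void (*txfm_row)(const int32_t *in, int32_t *o, int n),
                           void (*txfm_col)(const int32_t *in, int32_t *o, int n)) {
  for (int r = 0; r < h; ++r)
    txfm_row(coeff + r * w, tmp + r * w, w); /* 1st pass: rows */
  for (int c = 0; c < w; ++c) {              /* 2nd pass: columns */
    int32_t col_in[64], col_out[64];         /* columns are at most 64 long here */
    for (int r = 0; r < h; ++r) col_in[r] = tmp[r * w + c];
    txfm_col(col_in, col_out, h);
    for (int r = 0; r < h; ++r) out[r * w + c] = col_out[r];
  }
}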
@@ -1255,95 +4193,54 @@ void av1_highbd_inv_txfm2d_add_universe_avx2(const int32_t *input,
int eob, const int bd) {
switch (tx_type) {
case DCT_DCT:
+ case ADST_DCT:
+ case DCT_ADST:
+ case ADST_ADST:
+ case FLIPADST_DCT:
+ case DCT_FLIPADST:
+ case FLIPADST_FLIPADST:
+ case ADST_FLIPADST:
+ case FLIPADST_ADST:
highbd_inv_txfm2d_add_no_identity_avx2(input, CONVERT_TO_SHORTPTR(output),
stride, tx_type, tx_size, eob, bd);
break;
- default: assert(0); break;
- }
-}
-
-void av1_highbd_inv_txfm_add_32x32_avx2(const tran_low_t *input, uint8_t *dest,
- int stride,
- const TxfmParam *txfm_param) {
- const int bd = txfm_param->bd;
- const TX_TYPE tx_type = txfm_param->tx_type;
- const int32_t *src = cast_to_int32(input);
- switch (tx_type) {
- case DCT_DCT:
- av1_highbd_inv_txfm2d_add_universe_avx2(input, dest, stride, tx_type,
- txfm_param->tx_size,
- txfm_param->eob, bd);
- break;
- // Assembly version doesn't support IDTX, so use C version for it.
case IDTX:
- av1_inv_txfm2d_add_32x32_c(src, CONVERT_TO_SHORTPTR(dest), stride,
- tx_type, bd);
+ case H_DCT:
+ case H_ADST:
+ case H_FLIPADST:
+ case V_DCT:
+ case V_ADST:
+ case V_FLIPADST:
+ av1_highbd_inv_txfm2d_add_universe_sse4_1(input, output, stride, tx_type,
+ tx_size, eob, bd);
break;
-
- default: assert(0);
+ default: assert(0); break;
}
}
-
void av1_highbd_inv_txfm_add_avx2(const tran_low_t *input, uint8_t *dest,
int stride, const TxfmParam *txfm_param) {
assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
const TX_SIZE tx_size = txfm_param->tx_size;
switch (tx_size) {
- case TX_32X32:
- av1_highbd_inv_txfm_add_32x32_avx2(input, dest, stride, txfm_param);
- break;
- case TX_16X16:
- av1_highbd_inv_txfm_add_16x16_sse4_1(input, dest, stride, txfm_param);
- break;
- case TX_8X8:
- av1_highbd_inv_txfm_add_8x8_sse4_1(input, dest, stride, txfm_param);
- break;
case TX_4X8:
- av1_highbd_inv_txfm_add_4x8(input, dest, stride, txfm_param);
+ av1_highbd_inv_txfm_add_4x8_sse4_1(input, dest, stride, txfm_param);
break;
case TX_8X4:
- av1_highbd_inv_txfm_add_8x4(input, dest, stride, txfm_param);
- break;
- case TX_8X16:
- av1_highbd_inv_txfm_add_8x16_sse4_1(input, dest, stride, txfm_param);
- break;
- case TX_16X8:
- av1_highbd_inv_txfm_add_16x8_sse4_1(input, dest, stride, txfm_param);
- break;
- case TX_16X32:
- av1_highbd_inv_txfm_add_16x32(input, dest, stride, txfm_param);
- break;
- case TX_32X16:
- av1_highbd_inv_txfm_add_32x16(input, dest, stride, txfm_param);
- break;
- case TX_32X64:
- av1_highbd_inv_txfm_add_32x64(input, dest, stride, txfm_param);
- break;
- case TX_64X32:
- av1_highbd_inv_txfm_add_64x32(input, dest, stride, txfm_param);
+ av1_highbd_inv_txfm_add_8x4_sse4_1(input, dest, stride, txfm_param);
break;
case TX_4X4:
av1_highbd_inv_txfm_add_4x4_sse4_1(input, dest, stride, txfm_param);
break;
case TX_16X4:
- av1_highbd_inv_txfm_add_16x4(input, dest, stride, txfm_param);
+ av1_highbd_inv_txfm_add_16x4_sse4_1(input, dest, stride, txfm_param);
break;
case TX_4X16:
- av1_highbd_inv_txfm_add_4x16(input, dest, stride, txfm_param);
- break;
- case TX_8X32:
- av1_highbd_inv_txfm_add_8x32(input, dest, stride, txfm_param);
- break;
- case TX_32X8:
- av1_highbd_inv_txfm_add_32x8(input, dest, stride, txfm_param);
+ av1_highbd_inv_txfm_add_4x16_sse4_1(input, dest, stride, txfm_param);
break;
- case TX_64X64:
- case TX_16X64:
- case TX_64X16:
- av1_highbd_inv_txfm2d_add_universe_sse4_1(
+ default:
+ av1_highbd_inv_txfm2d_add_universe_avx2(
input, dest, stride, txfm_param->tx_type, txfm_param->tx_size,
txfm_param->eob, txfm_param->bd);
break;
- default: assert(0 && "Invalid transform size"); break;
}
}
diff --git a/media/libaom/src/av1/common/x86/highbd_inv_txfm_sse4.c b/media/libaom/src/av1/common/x86/highbd_inv_txfm_sse4.c
index e29e0baf5..03eaef832 100644
--- a/media/libaom/src/av1/common/x86/highbd_inv_txfm_sse4.c
+++ b/media/libaom/src/av1/common/x86/highbd_inv_txfm_sse4.c
@@ -17,6 +17,7 @@
#include "av1/common/av1_inv_txfm1d_cfg.h"
#include "av1/common/idct.h"
#include "av1/common/x86/av1_inv_txfm_ssse3.h"
+#include "av1/common/x86/av1_txfm_sse2.h"
#include "av1/common/x86/av1_txfm_sse4.h"
#include "av1/common/x86/highbd_txfm_utility_sse4.h"
@@ -36,19 +37,87 @@ static INLINE __m128i highbd_clamp_epi16(__m128i u, int bd) {
return clamped;
}
+static INLINE void round_shift_4x4(__m128i *in, int shift) {
+ if (shift != 0) {
+ __m128i rnding = _mm_set1_epi32(1 << (shift - 1));
+ in[0] = _mm_add_epi32(in[0], rnding);
+ in[1] = _mm_add_epi32(in[1], rnding);
+ in[2] = _mm_add_epi32(in[2], rnding);
+ in[3] = _mm_add_epi32(in[3], rnding);
+
+ in[0] = _mm_srai_epi32(in[0], shift);
+ in[1] = _mm_srai_epi32(in[1], shift);
+ in[2] = _mm_srai_epi32(in[2], shift);
+ in[3] = _mm_srai_epi32(in[3], shift);
+ }
+}
+
+static void round_shift_8x8(__m128i *in, int shift) {
+ round_shift_4x4(&in[0], shift);
+ round_shift_4x4(&in[4], shift);
+ round_shift_4x4(&in[8], shift);
+ round_shift_4x4(&in[12], shift);
+}
+
+static void highbd_clamp_epi32_sse4_1(__m128i *in, __m128i *out,
+ const __m128i *clamp_lo,
+ const __m128i *clamp_hi, int size) {
+ __m128i a0, a1;
+ for (int i = 0; i < size; i += 4) {
+ a0 = _mm_max_epi32(in[i], *clamp_lo);
+ out[i] = _mm_min_epi32(a0, *clamp_hi);
+
+ a1 = _mm_max_epi32(in[i + 1], *clamp_lo);
+ out[i + 1] = _mm_min_epi32(a1, *clamp_hi);
+
+ a0 = _mm_max_epi32(in[i + 2], *clamp_lo);
+ out[i + 2] = _mm_min_epi32(a0, *clamp_hi);
+
+ a1 = _mm_max_epi32(in[i + 3], *clamp_lo);
+ out[i + 3] = _mm_min_epi32(a1, *clamp_hi);
+ }
+}
+
static INLINE __m128i highbd_get_recon_8x8_sse4_1(const __m128i pred,
__m128i res0, __m128i res1,
const int bd) {
__m128i x0 = _mm_cvtepi16_epi32(pred);
__m128i x1 = _mm_cvtepi16_epi32(_mm_srli_si128(pred, 8));
-
+ __m128i min_clip_val = _mm_setzero_si128();
+ __m128i max_clip_val = _mm_set1_epi32((1 << bd) - 1);
x0 = _mm_add_epi32(res0, x0);
x1 = _mm_add_epi32(res1, x1);
+ x0 = _mm_max_epi32(x0, min_clip_val);
+ x0 = _mm_min_epi32(x0, max_clip_val);
+ x1 = _mm_max_epi32(x1, min_clip_val);
+ x1 = _mm_min_epi32(x1, max_clip_val);
x0 = _mm_packus_epi32(x0, x1);
+ return x0;
+}
+
+static INLINE __m128i highbd_get_recon_4xn_sse4_1(const __m128i pred,
+ __m128i res0, const int bd) {
+ __m128i x0 = _mm_cvtepi16_epi32(pred);
+
+ x0 = _mm_add_epi32(res0, x0);
+ x0 = _mm_packus_epi32(x0, x0);
x0 = highbd_clamp_epi16(x0, bd);
return x0;
}
+static INLINE void highbd_write_buffer_4xn_sse4_1(__m128i *in, uint16_t *output,
+ int stride, int flipud,
+ int height, const int bd) {
+ int j = flipud ? (height - 1) : 0;
+ const int step = flipud ? -1 : 1;
+ for (int i = 0; i < height; ++i, j += step) {
+ __m128i v = _mm_loadl_epi64((__m128i const *)(output + i * stride));
+ __m128i u = highbd_get_recon_4xn_sse4_1(v, in[j], bd);
+
+ _mm_storel_epi64((__m128i *)(output + i * stride), u);
+ }
+}
+
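The write-back helpers added above combine the inverse-transform residual with the 16-bit prediction: widen the prediction to 32 bits, add the residual, clip to the valid bd-bit pixel range, and pack back to 16 bits (the 8-wide variant clips explicitly before _mm_packus_epi32, the new 4-wide variant packs first and then clamps via highbd_clamp_epi16). Per pixel this amounts to:

#include <stdint.h>

/* Scalar picture of highbd_get_recon_*: reconstruction = clip(pred + resid). */
static uint16_t highbd_recon_pixel(uint16_t pred, int32_t residual, int bd) {
  int32_t v = (int32_t)pred + residual;
  const int32_t max_val = (1 << bd) - 1;
  if (v < 0) v = 0;
  if (v > max_val) v = max_val;
  return (uint16_t)v;
}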
static INLINE void highbd_write_buffer_8xn_sse4_1(__m128i *in, uint16_t *output,
int stride, int flipud,
int height, const int bd) {
@@ -91,34 +160,23 @@ static void addsub_sse4_1(const __m128i in0, const __m128i in1, __m128i *out0,
*out1 = a1;
}
-static void addsub_no_clamp_sse4_1(const __m128i in0, const __m128i in1,
- __m128i *out0, __m128i *out1) {
- __m128i a0 = _mm_add_epi32(in0, in1);
- __m128i a1 = _mm_sub_epi32(in0, in1);
-
- *out0 = a0;
- *out1 = a1;
-}
-
-static void addsub_shift_sse4_1(const __m128i in0, const __m128i in1,
- __m128i *out0, __m128i *out1,
- const __m128i *clamp_lo,
- const __m128i *clamp_hi, int shift) {
+static void shift_and_clamp_sse4_1(__m128i *in0, __m128i *in1,
+ const __m128i *clamp_lo,
+ const __m128i *clamp_hi, int shift) {
__m128i offset = _mm_set1_epi32((1 << shift) >> 1);
- __m128i in0_w_offset = _mm_add_epi32(in0, offset);
- __m128i a0 = _mm_add_epi32(in0_w_offset, in1);
- __m128i a1 = _mm_sub_epi32(in0_w_offset, in1);
+ __m128i in0_w_offset = _mm_add_epi32(*in0, offset);
+ __m128i in1_w_offset = _mm_add_epi32(*in1, offset);
- a0 = _mm_sra_epi32(a0, _mm_cvtsi32_si128(shift));
- a1 = _mm_sra_epi32(a1, _mm_cvtsi32_si128(shift));
+ in0_w_offset = _mm_sra_epi32(in0_w_offset, _mm_cvtsi32_si128(shift));
+ in1_w_offset = _mm_sra_epi32(in1_w_offset, _mm_cvtsi32_si128(shift));
- a0 = _mm_max_epi32(a0, *clamp_lo);
- a0 = _mm_min_epi32(a0, *clamp_hi);
- a1 = _mm_max_epi32(a1, *clamp_lo);
- a1 = _mm_min_epi32(a1, *clamp_hi);
+ in0_w_offset = _mm_max_epi32(in0_w_offset, *clamp_lo);
+ in0_w_offset = _mm_min_epi32(in0_w_offset, *clamp_hi);
+ in1_w_offset = _mm_max_epi32(in1_w_offset, *clamp_lo);
+ in1_w_offset = _mm_min_epi32(in1_w_offset, *clamp_hi);
- *out0 = a0;
- *out1 = a1;
+ *in0 = in0_w_offset;
+ *in1 = in1_w_offset;
}
static INLINE void idct32_stage4_sse4_1(
@@ -274,63 +332,34 @@ static INLINE void idct32_stage8_sse4_1(__m128i *bf1, const __m128i *cospim32,
static INLINE void idct32_stage9_sse4_1(__m128i *bf1, __m128i *out,
const int do_cols, const int bd,
const int out_shift,
- const int log_range) {
- if (do_cols) {
- addsub_no_clamp_sse4_1(bf1[0], bf1[31], out + 0, out + 31);
- addsub_no_clamp_sse4_1(bf1[1], bf1[30], out + 1, out + 30);
- addsub_no_clamp_sse4_1(bf1[2], bf1[29], out + 2, out + 29);
- addsub_no_clamp_sse4_1(bf1[3], bf1[28], out + 3, out + 28);
- addsub_no_clamp_sse4_1(bf1[4], bf1[27], out + 4, out + 27);
- addsub_no_clamp_sse4_1(bf1[5], bf1[26], out + 5, out + 26);
- addsub_no_clamp_sse4_1(bf1[6], bf1[25], out + 6, out + 25);
- addsub_no_clamp_sse4_1(bf1[7], bf1[24], out + 7, out + 24);
- addsub_no_clamp_sse4_1(bf1[8], bf1[23], out + 8, out + 23);
- addsub_no_clamp_sse4_1(bf1[9], bf1[22], out + 9, out + 22);
- addsub_no_clamp_sse4_1(bf1[10], bf1[21], out + 10, out + 21);
- addsub_no_clamp_sse4_1(bf1[11], bf1[20], out + 11, out + 20);
- addsub_no_clamp_sse4_1(bf1[12], bf1[19], out + 12, out + 19);
- addsub_no_clamp_sse4_1(bf1[13], bf1[18], out + 13, out + 18);
- addsub_no_clamp_sse4_1(bf1[14], bf1[17], out + 14, out + 17);
- addsub_no_clamp_sse4_1(bf1[15], bf1[16], out + 15, out + 16);
- } else {
+ const __m128i *clamp_lo,
+ const __m128i *clamp_hi) {
+ addsub_sse4_1(bf1[0], bf1[31], out + 0, out + 31, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[1], bf1[30], out + 1, out + 30, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[2], bf1[29], out + 2, out + 29, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[3], bf1[28], out + 3, out + 28, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[4], bf1[27], out + 4, out + 27, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[5], bf1[26], out + 5, out + 26, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[6], bf1[25], out + 6, out + 25, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[7], bf1[24], out + 7, out + 24, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[8], bf1[23], out + 8, out + 23, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[9], bf1[22], out + 9, out + 22, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[10], bf1[21], out + 10, out + 21, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[11], bf1[20], out + 11, out + 20, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[12], bf1[19], out + 12, out + 19, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[13], bf1[18], out + 13, out + 18, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[14], bf1[17], out + 14, out + 17, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[15], bf1[16], out + 15, out + 16, clamp_lo, clamp_hi);
+
+ if (!do_cols) {
const int log_range_out = AOMMAX(16, bd + 6);
- const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
- -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
- const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
- (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
-
- addsub_shift_sse4_1(bf1[0], bf1[31], out + 0, out + 31, &clamp_lo_out,
- &clamp_hi_out, out_shift);
- addsub_shift_sse4_1(bf1[1], bf1[30], out + 1, out + 30, &clamp_lo_out,
- &clamp_hi_out, out_shift);
- addsub_shift_sse4_1(bf1[2], bf1[29], out + 2, out + 29, &clamp_lo_out,
- &clamp_hi_out, out_shift);
- addsub_shift_sse4_1(bf1[3], bf1[28], out + 3, out + 28, &clamp_lo_out,
- &clamp_hi_out, out_shift);
- addsub_shift_sse4_1(bf1[4], bf1[27], out + 4, out + 27, &clamp_lo_out,
- &clamp_hi_out, out_shift);
- addsub_shift_sse4_1(bf1[5], bf1[26], out + 5, out + 26, &clamp_lo_out,
- &clamp_hi_out, out_shift);
- addsub_shift_sse4_1(bf1[6], bf1[25], out + 6, out + 25, &clamp_lo_out,
- &clamp_hi_out, out_shift);
- addsub_shift_sse4_1(bf1[7], bf1[24], out + 7, out + 24, &clamp_lo_out,
- &clamp_hi_out, out_shift);
- addsub_shift_sse4_1(bf1[8], bf1[23], out + 8, out + 23, &clamp_lo_out,
- &clamp_hi_out, out_shift);
- addsub_shift_sse4_1(bf1[9], bf1[22], out + 9, out + 22, &clamp_lo_out,
- &clamp_hi_out, out_shift);
- addsub_shift_sse4_1(bf1[10], bf1[21], out + 10, out + 21, &clamp_lo_out,
- &clamp_hi_out, out_shift);
- addsub_shift_sse4_1(bf1[11], bf1[20], out + 11, out + 20, &clamp_lo_out,
- &clamp_hi_out, out_shift);
- addsub_shift_sse4_1(bf1[12], bf1[19], out + 12, out + 19, &clamp_lo_out,
- &clamp_hi_out, out_shift);
- addsub_shift_sse4_1(bf1[13], bf1[18], out + 13, out + 18, &clamp_lo_out,
- &clamp_hi_out, out_shift);
- addsub_shift_sse4_1(bf1[14], bf1[17], out + 14, out + 17, &clamp_lo_out,
- &clamp_hi_out, out_shift);
- addsub_shift_sse4_1(bf1[15], bf1[16], out + 15, out + 16, &clamp_lo_out,
- &clamp_hi_out, out_shift);
+ const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
+ const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
+ for (int i = 0; i < 32; i += 8) {
+ round_shift_4x4(out + i, out_shift);
+ round_shift_4x4(out + i + 4, out_shift);
+ }
+ highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 32);
}
}
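// Editorial sketch, not part of the imported patch: the epilogue above (and
// the equivalent blocks in the other transforms of this file) replaces the
// old addsub_no_clamp/addsub_shift pair with a clamped add/sub that is
// always applied, followed, for the row pass only (!do_cols), by a round
// shift and a clamp to the max(16, bd + 6)-bit intermediate range.  Per
// output value, assuming out_shift > 0 and int32_t from <stdint.h>:
static inline int32_t row_pass_epilogue_sketch(int32_t x, int out_shift,
                                               int bd) {
  const int log_range_out = (bd + 6 > 16) ? bd + 6 : 16;  // AOMMAX(16, bd + 6)
  const int32_t lo = -(1 << (log_range_out - 1));
  const int32_t hi = (1 << (log_range_out - 1)) - 1;
  x = (x + (1 << (out_shift - 1))) >> out_shift;  // round_shift_4x4/8x8
  if (x < lo) x = lo;                             // highbd_clamp_epi32_sse4_1
  if (x > hi) x = hi;
  return x;
}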
@@ -354,17 +383,23 @@ static void neg_shift_sse4_1(const __m128i in0, const __m128i in1,
*out1 = a1;
}
-static void idct4x4_sse4_1(__m128i *in, int bit, int do_cols, int bd) {
+static void idct4x4_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
+ int bd, int out_shift) {
const int32_t *cospi = cospi_arr(bit);
const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
-
+ int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
__m128i u0, u1, u2, u3;
__m128i v0, v1, v2, v3, x, y;
+ // Stage 0
+ // Stage 1
+ // Stage 2
v0 = _mm_unpacklo_epi32(in[0], in[1]);
v1 = _mm_unpackhi_epi32(in[0], in[1]);
v2 = _mm_unpacklo_epi32(in[2], in[3]);
@@ -397,21 +432,27 @@ static void idct4x4_sse4_1(__m128i *in, int bit, int do_cols, int bd) {
v3 = _mm_add_epi32(v3, rnding);
v3 = _mm_srai_epi32(v3, bit);
- if (do_cols) {
- addsub_no_clamp_sse4_1(v0, v3, in + 0, in + 3);
- addsub_no_clamp_sse4_1(v1, v2, in + 1, in + 2);
- } else {
- const int log_range = AOMMAX(16, bd + 6);
- const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
- const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
- addsub_sse4_1(v0, v3, in + 0, in + 3, &clamp_lo, &clamp_hi);
- addsub_sse4_1(v1, v2, in + 1, in + 2, &clamp_lo, &clamp_hi);
+ // Stage 3
+ addsub_sse4_1(v0, v3, out + 0, out + 3, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v1, v2, out + 1, out + 2, &clamp_lo, &clamp_hi);
+
+ if (!do_cols) {
+ log_range = AOMMAX(16, bd + 6);
+ clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+
+ shift_and_clamp_sse4_1(out + 0, out + 3, &clamp_lo, &clamp_hi, out_shift);
+ shift_and_clamp_sse4_1(out + 1, out + 2, &clamp_lo, &clamp_hi, out_shift);
}
}
-static void iadst4x4_sse4_1(__m128i *in, int bit, int do_cols, int bd) {
+static void iadst4x4_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
+ int bd, int out_shift) {
const int32_t *sinpi = sinpi_arr(bit);
- const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ const __m128i zero = _mm_set1_epi32(0);
+ __m128i rnding = _mm_set1_epi32(1 << (bit + 4 - 1));
+ rnding = _mm_unpacklo_epi32(rnding, zero);
+ const __m128i mul = _mm_set1_epi32(1 << 4);
const __m128i sinpi1 = _mm_set1_epi32((int)sinpi[1]);
const __m128i sinpi2 = _mm_set1_epi32((int)sinpi[2]);
const __m128i sinpi3 = _mm_set1_epi32((int)sinpi[3]);
@@ -421,6 +462,8 @@ static void iadst4x4_sse4_1(__m128i *in, int bit, int do_cols, int bd) {
__m128i x0, x1, x2, x3;
__m128i u0, u1, u2, u3;
__m128i v0, v1, v2, v3;
+ __m128i u0_low, u1_low, u2_low, u3_low;
+ __m128i u0_high, u1_high, u2_high, u3_high;
v0 = _mm_unpacklo_epi32(in[0], in[1]);
v1 = _mm_unpackhi_epi32(in[0], in[1]);
@@ -455,51 +498,78 @@ static void iadst4x4_sse4_1(__m128i *in, int bit, int do_cols, int bd) {
t = _mm_add_epi32(s0, s1);
u3 = _mm_sub_epi32(t, s3);
- u0 = _mm_add_epi32(u0, rnding);
- u0 = _mm_srai_epi32(u0, bit);
+ // u0
+ u0_low = _mm_mul_epi32(u0, mul);
+ u0_low = _mm_add_epi64(u0_low, rnding);
+
+ u0 = _mm_srli_si128(u0, 4);
+ u0_high = _mm_mul_epi32(u0, mul);
+ u0_high = _mm_add_epi64(u0_high, rnding);
+
+ u0_low = _mm_srli_si128(u0_low, 2);
+ u0_high = _mm_srli_si128(u0_high, 2);
+
+ u0 = _mm_unpacklo_epi32(u0_low, u0_high);
+ u0_high = _mm_unpackhi_epi32(u0_low, u0_high);
+ u0 = _mm_unpacklo_epi64(u0, u0_high);
+
+ // u1
+ u1_low = _mm_mul_epi32(u1, mul);
+ u1_low = _mm_add_epi64(u1_low, rnding);
+
+ u1 = _mm_srli_si128(u1, 4);
+ u1_high = _mm_mul_epi32(u1, mul);
+ u1_high = _mm_add_epi64(u1_high, rnding);
+
+ u1_low = _mm_srli_si128(u1_low, 2);
+ u1_high = _mm_srli_si128(u1_high, 2);
+
+ u1 = _mm_unpacklo_epi32(u1_low, u1_high);
+ u1_high = _mm_unpackhi_epi32(u1_low, u1_high);
+ u1 = _mm_unpacklo_epi64(u1, u1_high);
+
+ // u2
+ u2_low = _mm_mul_epi32(u2, mul);
+ u2_low = _mm_add_epi64(u2_low, rnding);
+
+ u2 = _mm_srli_si128(u2, 4);
+ u2_high = _mm_mul_epi32(u2, mul);
+ u2_high = _mm_add_epi64(u2_high, rnding);
+
+ u2_low = _mm_srli_si128(u2_low, 2);
+ u2_high = _mm_srli_si128(u2_high, 2);
+
+ u2 = _mm_unpacklo_epi32(u2_low, u2_high);
+ u2_high = _mm_unpackhi_epi32(u2_low, u2_high);
+ u2 = _mm_unpacklo_epi64(u2, u2_high);
+
+ // u3
+ u3_low = _mm_mul_epi32(u3, mul);
+ u3_low = _mm_add_epi64(u3_low, rnding);
+
+ u3 = _mm_srli_si128(u3, 4);
+ u3_high = _mm_mul_epi32(u3, mul);
+ u3_high = _mm_add_epi64(u3_high, rnding);
- u1 = _mm_add_epi32(u1, rnding);
- u1 = _mm_srai_epi32(u1, bit);
+ u3_low = _mm_srli_si128(u3_low, 2);
+ u3_high = _mm_srli_si128(u3_high, 2);
- u2 = _mm_add_epi32(u2, rnding);
- u2 = _mm_srai_epi32(u2, bit);
+ u3 = _mm_unpacklo_epi32(u3_low, u3_high);
+ u3_high = _mm_unpackhi_epi32(u3_low, u3_high);
+ u3 = _mm_unpacklo_epi64(u3, u3_high);
- u3 = _mm_add_epi32(u3, rnding);
- u3 = _mm_srai_epi32(u3, bit);
+ out[0] = u0;
+ out[1] = u1;
+ out[2] = u2;
+ out[3] = u3;
if (!do_cols) {
const int log_range = AOMMAX(16, bd + 6);
const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
-
- u0 = _mm_max_epi32(u0, clamp_lo);
- u0 = _mm_min_epi32(u0, clamp_hi);
- u1 = _mm_max_epi32(u1, clamp_lo);
- u1 = _mm_min_epi32(u1, clamp_hi);
- u2 = _mm_max_epi32(u2, clamp_lo);
- u2 = _mm_min_epi32(u2, clamp_hi);
- u3 = _mm_max_epi32(u3, clamp_lo);
- u3 = _mm_min_epi32(u3, clamp_hi);
+ round_shift_4x4(out, out_shift);
+ highbd_clamp_epi32_sse4_1(out, out, &clamp_lo, &clamp_hi, 4);
}
-
- in[0] = u0;
- in[1] = u1;
- in[2] = u2;
- in[3] = u3;
-}
-
-static INLINE void round_shift_4x4(__m128i *in, int shift) {
- __m128i rnding = _mm_set1_epi32(1 << (shift - 1));
-
- in[0] = _mm_add_epi32(in[0], rnding);
- in[1] = _mm_add_epi32(in[1], rnding);
- in[2] = _mm_add_epi32(in[2], rnding);
- in[3] = _mm_add_epi32(in[3], rnding);
-
- in[0] = _mm_srai_epi32(in[0], shift);
- in[1] = _mm_srai_epi32(in[1], shift);
- in[2] = _mm_srai_epi32(in[2], shift);
- in[3] = _mm_srai_epi32(in[3], shift);
}
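// Editorial note, not part of the imported patch: the u0..u3 blocks above
// fold the per-element round shift by `bit` into 64-bit arithmetic.  Each
// value is multiplied by 1 << 4 (_mm_mul_epi32), offset by
// 1 << (bit + 4 - 1) (_mm_add_epi64), and then 16 bits are dropped with
// _mm_srli_si128 before repacking to 32 bits.  Assuming bit == 12 (the cos
// bit used by the 4x4 inverse transforms), this equals the usual
// (x + (1 << (bit - 1))) >> bit rounding while avoiding 32-bit overflow.
// Scalar sketch under that assumption:
static inline int32_t iadst4_round_shift_sketch(int64_t x) {
  const int bit = 12;  // assumed 4x4 inverse-transform cos bit
  return (int32_t)((x * 16 + ((int64_t)1 << (bit + 4 - 1))) >> (bit + 4));
}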
static void write_buffer_4x4(__m128i *in, uint16_t *output, int stride,
@@ -556,68 +626,164 @@ static void write_buffer_4x4(__m128i *in, uint16_t *output, int stride,
_mm_storel_epi64((__m128i *)(output + 3 * stride), v3);
}
-void av1_inv_txfm2d_add_4x4_sse4_1(const int32_t *coeff, uint16_t *output,
+static void iidentity4_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
+ int bd, int out_shift) {
+ (void)bit;
+ __m128i v[4];
+ __m128i zero = _mm_set1_epi32(0);
+ __m128i fact = _mm_set1_epi32(NewSqrt2);
+ __m128i offset = _mm_set1_epi32(1 << (NewSqrt2Bits - 1));
+ __m128i a0_low, a1_low;
+ __m128i a0_high, a1_high;
+
+ offset = _mm_unpacklo_epi32(offset, zero);
+
+ for (int i = 0; i < 4; i++) {
+ a0_low = _mm_mul_epi32(in[i], fact);
+ a0_low = _mm_add_epi32(a0_low, offset);
+ a0_low = _mm_srli_epi64(a0_low, NewSqrt2Bits);
+
+ a0_high = _mm_srli_si128(in[i], 4);
+ a0_high = _mm_mul_epi32(a0_high, fact);
+ a0_high = _mm_add_epi32(a0_high, offset);
+ a0_high = _mm_srli_epi64(a0_high, NewSqrt2Bits);
+
+ a1_low = _mm_unpacklo_epi32(a0_low, a0_high);
+ a1_high = _mm_unpackhi_epi32(a0_low, a0_high);
+ out[i] = _mm_unpacklo_epi64(a1_low, a1_high);
+ }
+
+ if (!do_cols) {
+ const int log_range = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+ round_shift_4x4(out, out_shift);
+ highbd_clamp_epi32_sse4_1(out, out, &clamp_lo, &clamp_hi, 4);
+ }
+
+ // Transpose for 4x4
+ v[0] = _mm_unpacklo_epi32(out[0], out[1]);
+ v[1] = _mm_unpackhi_epi32(out[0], out[1]);
+ v[2] = _mm_unpacklo_epi32(out[2], out[3]);
+ v[3] = _mm_unpackhi_epi32(out[2], out[3]);
+
+ out[0] = _mm_unpacklo_epi64(v[0], v[2]);
+ out[1] = _mm_unpackhi_epi64(v[0], v[2]);
+ out[2] = _mm_unpacklo_epi64(v[1], v[3]);
+ out[3] = _mm_unpackhi_epi64(v[1], v[3]);
+}
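// Editorial sketch, not part of the imported patch: a 4-point identity
// transform scales by sqrt(2), which iidentity4_sse4_1 above implements as a
// fixed-point multiply by NewSqrt2 with NewSqrt2Bits of rounding.  The
// constants below restate the libaom values (NewSqrt2Bits = 12,
// NewSqrt2 = 5793) as an assumption; the 64-bit intermediate mirrors the
// _mm_mul_epi32/_mm_srli_epi64 pairing above.
static inline int32_t iidentity4_scale_sketch(int32_t x) {
  const int kNewSqrt2Bits = 12;    // assumed value of NewSqrt2Bits
  const int64_t kNewSqrt2 = 5793;  // assumed value of NewSqrt2
  return (int32_t)((x * kNewSqrt2 + (1 << (kNewSqrt2Bits - 1))) >>
                   kNewSqrt2Bits);
}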
+void av1_inv_txfm2d_add_4x4_sse4_1(const int32_t *input, uint16_t *output,
int stride, TX_TYPE tx_type, int bd) {
__m128i in[4];
- const int8_t *shift = inv_txfm_shift_ls[TX_4X4];
+ const int8_t *shift = av1_inv_txfm_shift_ls[TX_4X4];
const int txw_idx = get_txw_idx(TX_4X4);
const int txh_idx = get_txh_idx(TX_4X4);
switch (tx_type) {
case DCT_DCT:
- load_buffer_4x4(coeff, in);
- idct4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd);
- idct4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd);
+ load_buffer_4x4(input, in);
+ idct4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
+ idct4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
break;
case ADST_DCT:
- load_buffer_4x4(coeff, in);
- idct4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd);
- iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd);
+ load_buffer_4x4(input, in);
+ idct4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
+ iadst4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
break;
case DCT_ADST:
- load_buffer_4x4(coeff, in);
- iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd);
- idct4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd);
+ load_buffer_4x4(input, in);
+ iadst4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
+ idct4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
break;
case ADST_ADST:
- load_buffer_4x4(coeff, in);
- iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd);
- iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd);
+ load_buffer_4x4(input, in);
+ iadst4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
+ iadst4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
break;
case FLIPADST_DCT:
- load_buffer_4x4(coeff, in);
- idct4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd);
- iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd);
+ load_buffer_4x4(input, in);
+ idct4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
+ iadst4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd);
break;
case DCT_FLIPADST:
- load_buffer_4x4(coeff, in);
- iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd);
- idct4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd);
+ load_buffer_4x4(input, in);
+ iadst4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
+ idct4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd);
break;
case FLIPADST_FLIPADST:
- load_buffer_4x4(coeff, in);
- iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd);
- iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd);
+ load_buffer_4x4(input, in);
+ iadst4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
+ iadst4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
write_buffer_4x4(in, output, stride, 1, 1, -shift[1], bd);
break;
case ADST_FLIPADST:
- load_buffer_4x4(coeff, in);
- iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd);
- iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd);
+ load_buffer_4x4(input, in);
+ iadst4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
+ iadst4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd);
break;
case FLIPADST_ADST:
- load_buffer_4x4(coeff, in);
- iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd);
- iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd);
+ load_buffer_4x4(input, in);
+ iadst4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
+ iadst4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+ write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd);
+ break;
+ case IDTX:
+ load_buffer_4x4(input, in);
+ iidentity4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
+ 0);
+ iidentity4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd,
+ 0);
+ write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
+ break;
+ case V_DCT:
+ load_buffer_4x4(input, in);
+ iidentity4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
+ 0);
+ idct4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+ write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
+ break;
+ case H_DCT:
+ load_buffer_4x4(input, in);
+ idct4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
+ iidentity4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd,
+ 0);
+ write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
+ break;
+ case V_ADST:
+ load_buffer_4x4(input, in);
+ iidentity4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
+ 0);
+ iadst4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+ write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
+ break;
+ case H_ADST:
+ load_buffer_4x4(input, in);
+ iadst4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
+ iidentity4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd,
+ 0);
+ write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
+ break;
+ case V_FLIPADST:
+ load_buffer_4x4(input, in);
+ iidentity4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
+ 0);
+ iadst4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd);
break;
+ case H_FLIPADST:
+ load_buffer_4x4(input, in);
+ iadst4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
+ iidentity4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd,
+ 0);
+ write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd);
+ break;
default: assert(0);
}
}
@@ -745,26 +911,22 @@ static void idct8x8_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
u5 = _mm_srai_epi32(u5, bit);
// stage 5
- if (do_cols) {
- addsub_no_clamp_sse4_1(u0, u7, out + 0 * 2 + col, out + 7 * 2 + col);
- addsub_no_clamp_sse4_1(u1, u6, out + 1 * 2 + col, out + 6 * 2 + col);
- addsub_no_clamp_sse4_1(u2, u5, out + 2 * 2 + col, out + 5 * 2 + col);
- addsub_no_clamp_sse4_1(u3, u4, out + 3 * 2 + col, out + 4 * 2 + col);
- } else {
- const int log_range_out = AOMMAX(16, bd + 6);
- const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
- -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
- const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
- (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
- addsub_shift_sse4_1(u0, u7, out + 0 * 2 + col, out + 7 * 2 + col,
- &clamp_lo_out, &clamp_hi_out, out_shift);
- addsub_shift_sse4_1(u1, u6, out + 1 * 2 + col, out + 6 * 2 + col,
- &clamp_lo_out, &clamp_hi_out, out_shift);
- addsub_shift_sse4_1(u2, u5, out + 2 * 2 + col, out + 5 * 2 + col,
- &clamp_lo_out, &clamp_hi_out, out_shift);
- addsub_shift_sse4_1(u3, u4, out + 3 * 2 + col, out + 4 * 2 + col,
- &clamp_lo_out, &clamp_hi_out, out_shift);
- }
+ addsub_sse4_1(u0, u7, out + 0 * 2 + col, out + 7 * 2 + col, &clamp_lo,
+ &clamp_hi);
+ addsub_sse4_1(u1, u6, out + 1 * 2 + col, out + 6 * 2 + col, &clamp_lo,
+ &clamp_hi);
+ addsub_sse4_1(u2, u5, out + 2 * 2 + col, out + 5 * 2 + col, &clamp_lo,
+ &clamp_hi);
+ addsub_sse4_1(u3, u4, out + 3 * 2 + col, out + 4 * 2 + col, &clamp_lo,
+ &clamp_hi);
+ }
+
+ if (!do_cols) {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
+ const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
+ round_shift_8x8(out, out_shift);
+ highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 16);
}
}
@@ -1089,11 +1251,26 @@ static void iadst8x8_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
}
}
-static void round_shift_8x8(__m128i *in, int shift) {
- round_shift_4x4(&in[0], shift);
- round_shift_4x4(&in[4], shift);
- round_shift_4x4(&in[8], shift);
- round_shift_4x4(&in[12], shift);
+static void iidentity8_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
+ int bd, int out_shift) {
+ (void)bit;
+ out[0] = _mm_add_epi32(in[0], in[0]);
+ out[1] = _mm_add_epi32(in[1], in[1]);
+ out[2] = _mm_add_epi32(in[2], in[2]);
+ out[3] = _mm_add_epi32(in[3], in[3]);
+ out[4] = _mm_add_epi32(in[4], in[4]);
+ out[5] = _mm_add_epi32(in[5], in[5]);
+ out[6] = _mm_add_epi32(in[6], in[6]);
+ out[7] = _mm_add_epi32(in[7], in[7]);
+
+ if (!do_cols) {
+ const int log_range = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+ round_shift_4x4(out, out_shift);
+ round_shift_4x4(out + 4, out_shift);
+ highbd_clamp_epi32_sse4_1(out, out, &clamp_lo, &clamp_hi, 8);
+ }
}
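// Editorial note, not part of the imported patch: an 8-point identity
// transform scales by exactly 2, so iidentity8_sse4_1 above adds each lane
// to itself instead of multiplying, then applies the same row-pass round
// shift and clamp as the other transforms.  Scalar equivalent of the scale:
static inline int32_t iidentity8_scale_sketch(int32_t x) { return x * 2; }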
static __m128i get_recon_8x8(const __m128i pred, __m128i res_lo, __m128i res_hi,
@@ -1165,93 +1342,93 @@ static void write_buffer_8x8(__m128i *in, uint16_t *output, int stride,
_mm_store_si128((__m128i *)(output + 7 * stride), u7);
}
-void av1_inv_txfm2d_add_8x8_sse4_1(const int32_t *coeff, uint16_t *output,
+void av1_inv_txfm2d_add_8x8_sse4_1(const int32_t *input, uint16_t *output,
int stride, TX_TYPE tx_type, int bd) {
__m128i in[16], out[16];
- const int8_t *shift = inv_txfm_shift_ls[TX_8X8];
+ const int8_t *shift = av1_inv_txfm_shift_ls[TX_8X8];
const int txw_idx = get_txw_idx(TX_8X8);
const int txh_idx = get_txh_idx(TX_8X8);
switch (tx_type) {
case DCT_DCT:
- load_buffer_8x8(coeff, in);
+ load_buffer_8x8(input, in);
transpose_8x8(in, out);
- idct8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
+ idct8x8_sse4_1(out, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
-shift[0]);
transpose_8x8(in, out);
- idct8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+ idct8x8_sse4_1(out, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
write_buffer_8x8(in, output, stride, 0, 0, -shift[1], bd);
break;
case DCT_ADST:
- load_buffer_8x8(coeff, in);
+ load_buffer_8x8(input, in);
transpose_8x8(in, out);
- iadst8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
+ iadst8x8_sse4_1(out, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
-shift[0]);
transpose_8x8(in, out);
- idct8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+ idct8x8_sse4_1(out, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
write_buffer_8x8(in, output, stride, 0, 0, -shift[1], bd);
break;
case ADST_DCT:
- load_buffer_8x8(coeff, in);
+ load_buffer_8x8(input, in);
transpose_8x8(in, out);
- idct8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
+ idct8x8_sse4_1(out, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
-shift[0]);
transpose_8x8(in, out);
- iadst8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+ iadst8x8_sse4_1(out, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
write_buffer_8x8(in, output, stride, 0, 0, -shift[1], bd);
break;
case ADST_ADST:
- load_buffer_8x8(coeff, in);
+ load_buffer_8x8(input, in);
transpose_8x8(in, out);
- iadst8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
+ iadst8x8_sse4_1(out, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
-shift[0]);
transpose_8x8(in, out);
- iadst8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+ iadst8x8_sse4_1(out, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
write_buffer_8x8(in, output, stride, 0, 0, -shift[1], bd);
break;
case FLIPADST_DCT:
- load_buffer_8x8(coeff, in);
+ load_buffer_8x8(input, in);
transpose_8x8(in, out);
- idct8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
+ idct8x8_sse4_1(out, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
-shift[0]);
transpose_8x8(in, out);
- iadst8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+ iadst8x8_sse4_1(out, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
write_buffer_8x8(in, output, stride, 0, 1, -shift[1], bd);
break;
case DCT_FLIPADST:
- load_buffer_8x8(coeff, in);
+ load_buffer_8x8(input, in);
transpose_8x8(in, out);
- iadst8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
+ iadst8x8_sse4_1(out, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
-shift[0]);
transpose_8x8(in, out);
- idct8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+ idct8x8_sse4_1(out, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
write_buffer_8x8(in, output, stride, 1, 0, -shift[1], bd);
break;
case ADST_FLIPADST:
- load_buffer_8x8(coeff, in);
+ load_buffer_8x8(input, in);
transpose_8x8(in, out);
- iadst8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
+ iadst8x8_sse4_1(out, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
-shift[0]);
transpose_8x8(in, out);
- iadst8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+ iadst8x8_sse4_1(out, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
write_buffer_8x8(in, output, stride, 1, 0, -shift[1], bd);
break;
case FLIPADST_FLIPADST:
- load_buffer_8x8(coeff, in);
+ load_buffer_8x8(input, in);
transpose_8x8(in, out);
- iadst8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
+ iadst8x8_sse4_1(out, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
-shift[0]);
transpose_8x8(in, out);
- iadst8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+ iadst8x8_sse4_1(out, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
write_buffer_8x8(in, output, stride, 1, 1, -shift[1], bd);
break;
case FLIPADST_ADST:
- load_buffer_8x8(coeff, in);
+ load_buffer_8x8(input, in);
transpose_8x8(in, out);
- iadst8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
+ iadst8x8_sse4_1(out, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
-shift[0]);
transpose_8x8(in, out);
- iadst8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+ iadst8x8_sse4_1(out, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
write_buffer_8x8(in, output, stride, 0, 1, -shift[1], bd);
break;
default: assert(0);
@@ -1264,6 +1441,8 @@ static void idct8x8_low1_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
__m128i x;
// stage 0
@@ -1278,18 +1457,16 @@ static void idct8x8_low1_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
// stage 5
if (!do_cols) {
const int log_range_out = AOMMAX(16, bd + 6);
- const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
- -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
- const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
- (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
+ clamp_lo = _mm_set1_epi32(-(1 << (log_range_out - 1)));
+ clamp_hi = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
__m128i offset = _mm_set1_epi32((1 << out_shift) >> 1);
x = _mm_add_epi32(x, offset);
x = _mm_sra_epi32(x, _mm_cvtsi32_si128(out_shift));
- x = _mm_max_epi32(x, clamp_lo_out);
- x = _mm_min_epi32(x, clamp_hi_out);
}
+ x = _mm_max_epi32(x, clamp_lo);
+ x = _mm_min_epi32(x, clamp_hi);
out[0] = x;
out[1] = x;
out[2] = x;
@@ -1396,25 +1573,19 @@ static void idct8x8_new_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
u5 = _mm_srai_epi32(u5, bit);
// stage 5
- if (do_cols) {
- addsub_no_clamp_sse4_1(u0, u7, out + 0, out + 7);
- addsub_no_clamp_sse4_1(u1, u6, out + 1, out + 6);
- addsub_no_clamp_sse4_1(u2, u5, out + 2, out + 5);
- addsub_no_clamp_sse4_1(u3, u4, out + 3, out + 4);
- } else {
+ addsub_sse4_1(u0, u7, out + 0, out + 7, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u1, u6, out + 1, out + 6, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u2, u5, out + 2, out + 5, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u3, u4, out + 3, out + 4, &clamp_lo, &clamp_hi);
+
+ if (!do_cols) {
const int log_range_out = AOMMAX(16, bd + 6);
- const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
- -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
- const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
- (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
- addsub_shift_sse4_1(u0, u7, out + 0, out + 7, &clamp_lo_out, &clamp_hi_out,
- out_shift);
- addsub_shift_sse4_1(u1, u6, out + 1, out + 6, &clamp_lo_out, &clamp_hi_out,
- out_shift);
- addsub_shift_sse4_1(u2, u5, out + 2, out + 5, &clamp_lo_out, &clamp_hi_out,
- out_shift);
- addsub_shift_sse4_1(u3, u4, out + 3, out + 4, &clamp_lo_out, &clamp_hi_out,
- out_shift);
+ const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
+ const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
+
+ round_shift_4x4(out, out_shift);
+ round_shift_4x4(out + 4, out_shift);
+ highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 8);
}
}
@@ -1683,56 +1854,50 @@ static void idct16x16_low1_sse4_1(__m128i *in, __m128i *out, int bit,
const int32_t *cospi = cospi_arr(bit);
const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
- const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
- const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
- const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
-
- {
- // stage 0
- // stage 1
- // stage 2
- // stage 3
- // stage 4
- in[0] = _mm_mullo_epi32(in[0], cospi32);
- in[0] = _mm_add_epi32(in[0], rnding);
- in[0] = _mm_srai_epi32(in[0], bit);
+ int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+ // stage 0
+ // stage 1
+ // stage 2
+ // stage 3
+ // stage 4
+ in[0] = _mm_mullo_epi32(in[0], cospi32);
+ in[0] = _mm_add_epi32(in[0], rnding);
+ in[0] = _mm_srai_epi32(in[0], bit);
- // stage 5
- // stage 6
- // stage 7
- if (do_cols) {
- in[0] = _mm_max_epi32(in[0], clamp_lo);
- in[0] = _mm_min_epi32(in[0], clamp_hi);
- } else {
- const int log_range_out = AOMMAX(16, bd + 6);
- const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
- -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
- const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
- (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
+ // stage 5
+ // stage 6
+ // stage 7
+ if (!do_cols) {
+ log_range = AOMMAX(16, bd + 6);
+ clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+ if (out_shift != 0) {
__m128i offset = _mm_set1_epi32((1 << out_shift) >> 1);
in[0] = _mm_add_epi32(in[0], offset);
in[0] = _mm_sra_epi32(in[0], _mm_cvtsi32_si128(out_shift));
- in[0] = _mm_max_epi32(in[0], clamp_lo_out);
- in[0] = _mm_min_epi32(in[0], clamp_hi_out);
}
-
- out[0] = in[0];
- out[1] = in[0];
- out[2] = in[0];
- out[3] = in[0];
- out[4] = in[0];
- out[5] = in[0];
- out[6] = in[0];
- out[7] = in[0];
- out[8] = in[0];
- out[9] = in[0];
- out[10] = in[0];
- out[11] = in[0];
- out[12] = in[0];
- out[13] = in[0];
- out[14] = in[0];
- out[15] = in[0];
}
+
+ in[0] = _mm_max_epi32(in[0], clamp_lo);
+ in[0] = _mm_min_epi32(in[0], clamp_hi);
+ out[0] = in[0];
+ out[1] = in[0];
+ out[2] = in[0];
+ out[3] = in[0];
+ out[4] = in[0];
+ out[5] = in[0];
+ out[6] = in[0];
+ out[7] = in[0];
+ out[8] = in[0];
+ out[9] = in[0];
+ out[10] = in[0];
+ out[11] = in[0];
+ out[12] = in[0];
+ out[13] = in[0];
+ out[14] = in[0];
+ out[15] = in[0];
}
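// Editorial sketch, not part of the imported patch: in this DC-only path
// only in[0] is non-zero, so all 16 outputs share one value: in[0] times
// cospi[32] (cos(pi/4) scaled by 1 << bit), round-shifted by bit, with the
// optional out_shift and clamp applied for the row pass.  Per value,
// ignoring the clamp (the vector code keeps the product in 32 bits via
// _mm_mullo_epi32; the 64-bit product here is only for the sketch):
static inline int32_t idct16_dc_sketch(int32_t dc, int32_t cospi32, int bit) {
  return (int32_t)(((int64_t)dc * cospi32 + ((int64_t)1 << (bit - 1))) >> bit);
}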
static void idct16x16_low8_sse4_1(__m128i *in, __m128i *out, int bit,
@@ -1760,140 +1925,120 @@ static void idct16x16_low8_sse4_1(__m128i *in, __m128i *out, int bit,
const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
__m128i u[16], x, y;
+ // stage 0
+ // stage 1
+ u[0] = in[0];
+ u[2] = in[4];
+ u[4] = in[2];
+ u[6] = in[6];
+ u[8] = in[1];
+ u[10] = in[5];
+ u[12] = in[3];
+ u[14] = in[7];
- {
- // stage 0
- // stage 1
- u[0] = in[0];
- u[2] = in[4];
- u[4] = in[2];
- u[6] = in[6];
- u[8] = in[1];
- u[10] = in[5];
- u[12] = in[3];
- u[14] = in[7];
-
- // stage 2
- u[15] = half_btf_0_sse4_1(&cospi4, &u[8], &rnding, bit);
- u[8] = half_btf_0_sse4_1(&cospi60, &u[8], &rnding, bit);
-
- u[9] = half_btf_0_sse4_1(&cospim36, &u[14], &rnding, bit);
- u[14] = half_btf_0_sse4_1(&cospi28, &u[14], &rnding, bit);
-
- u[13] = half_btf_0_sse4_1(&cospi20, &u[10], &rnding, bit);
- u[10] = half_btf_0_sse4_1(&cospi44, &u[10], &rnding, bit);
-
- u[11] = half_btf_0_sse4_1(&cospim52, &u[12], &rnding, bit);
- u[12] = half_btf_0_sse4_1(&cospi12, &u[12], &rnding, bit);
+ // stage 2
+ u[15] = half_btf_0_sse4_1(&cospi4, &u[8], &rnding, bit);
+ u[8] = half_btf_0_sse4_1(&cospi60, &u[8], &rnding, bit);
- // stage 3
- u[7] = half_btf_0_sse4_1(&cospi8, &u[4], &rnding, bit);
- u[4] = half_btf_0_sse4_1(&cospi56, &u[4], &rnding, bit);
- u[5] = half_btf_0_sse4_1(&cospim40, &u[6], &rnding, bit);
- u[6] = half_btf_0_sse4_1(&cospi24, &u[6], &rnding, bit);
+ u[9] = half_btf_0_sse4_1(&cospim36, &u[14], &rnding, bit);
+ u[14] = half_btf_0_sse4_1(&cospi28, &u[14], &rnding, bit);
- addsub_sse4_1(u[8], u[9], &u[8], &u[9], &clamp_lo, &clamp_hi);
- addsub_sse4_1(u[11], u[10], &u[11], &u[10], &clamp_lo, &clamp_hi);
- addsub_sse4_1(u[12], u[13], &u[12], &u[13], &clamp_lo, &clamp_hi);
- addsub_sse4_1(u[15], u[14], &u[15], &u[14], &clamp_lo, &clamp_hi);
+ u[13] = half_btf_0_sse4_1(&cospi20, &u[10], &rnding, bit);
+ u[10] = half_btf_0_sse4_1(&cospi44, &u[10], &rnding, bit);
- // stage 4
- x = _mm_mullo_epi32(u[0], cospi32);
- u[0] = _mm_add_epi32(x, rnding);
- u[0] = _mm_srai_epi32(u[0], bit);
- u[1] = u[0];
+ u[11] = half_btf_0_sse4_1(&cospim52, &u[12], &rnding, bit);
+ u[12] = half_btf_0_sse4_1(&cospi12, &u[12], &rnding, bit);
- u[3] = half_btf_0_sse4_1(&cospi16, &u[2], &rnding, bit);
- u[2] = half_btf_0_sse4_1(&cospi48, &u[2], &rnding, bit);
+ // stage 3
+ u[7] = half_btf_0_sse4_1(&cospi8, &u[4], &rnding, bit);
+ u[4] = half_btf_0_sse4_1(&cospi56, &u[4], &rnding, bit);
+ u[5] = half_btf_0_sse4_1(&cospim40, &u[6], &rnding, bit);
+ u[6] = half_btf_0_sse4_1(&cospi24, &u[6], &rnding, bit);
- addsub_sse4_1(u[4], u[5], &u[4], &u[5], &clamp_lo, &clamp_hi);
- addsub_sse4_1(u[7], u[6], &u[7], &u[6], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[8], u[9], &u[8], &u[9], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[11], u[10], &u[11], &u[10], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[12], u[13], &u[12], &u[13], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[15], u[14], &u[15], &u[14], &clamp_lo, &clamp_hi);
- x = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
- u[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
- u[9] = x;
- y = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
- u[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
- u[10] = y;
+ // stage 4
+ x = _mm_mullo_epi32(u[0], cospi32);
+ u[0] = _mm_add_epi32(x, rnding);
+ u[0] = _mm_srai_epi32(u[0], bit);
+ u[1] = u[0];
- // stage 5
- addsub_sse4_1(u[0], u[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
- addsub_sse4_1(u[1], u[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
+ u[3] = half_btf_0_sse4_1(&cospi16, &u[2], &rnding, bit);
+ u[2] = half_btf_0_sse4_1(&cospi48, &u[2], &rnding, bit);
- x = _mm_mullo_epi32(u[5], cospi32);
- y = _mm_mullo_epi32(u[6], cospi32);
- u[5] = _mm_sub_epi32(y, x);
- u[5] = _mm_add_epi32(u[5], rnding);
- u[5] = _mm_srai_epi32(u[5], bit);
+ addsub_sse4_1(u[4], u[5], &u[4], &u[5], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[7], u[6], &u[7], &u[6], &clamp_lo, &clamp_hi);
- u[6] = _mm_add_epi32(y, x);
- u[6] = _mm_add_epi32(u[6], rnding);
- u[6] = _mm_srai_epi32(u[6], bit);
+ x = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
+ u[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
+ u[9] = x;
+ y = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
+ u[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
+ u[10] = y;
- addsub_sse4_1(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
- addsub_sse4_1(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
- addsub_sse4_1(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
- addsub_sse4_1(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
+ // stage 5
+ addsub_sse4_1(u[0], u[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[1], u[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
- // stage 6
- addsub_sse4_1(u[0], u[7], &u[0], &u[7], &clamp_lo, &clamp_hi);
- addsub_sse4_1(u[1], u[6], &u[1], &u[6], &clamp_lo, &clamp_hi);
- addsub_sse4_1(u[2], u[5], &u[2], &u[5], &clamp_lo, &clamp_hi);
- addsub_sse4_1(u[3], u[4], &u[3], &u[4], &clamp_lo, &clamp_hi);
+ x = _mm_mullo_epi32(u[5], cospi32);
+ y = _mm_mullo_epi32(u[6], cospi32);
+ u[5] = _mm_sub_epi32(y, x);
+ u[5] = _mm_add_epi32(u[5], rnding);
+ u[5] = _mm_srai_epi32(u[5], bit);
- x = _mm_mullo_epi32(u[10], cospi32);
- y = _mm_mullo_epi32(u[13], cospi32);
- u[10] = _mm_sub_epi32(y, x);
- u[10] = _mm_add_epi32(u[10], rnding);
- u[10] = _mm_srai_epi32(u[10], bit);
+ u[6] = _mm_add_epi32(y, x);
+ u[6] = _mm_add_epi32(u[6], rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
- u[13] = _mm_add_epi32(x, y);
- u[13] = _mm_add_epi32(u[13], rnding);
- u[13] = _mm_srai_epi32(u[13], bit);
+ addsub_sse4_1(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
- x = _mm_mullo_epi32(u[11], cospi32);
- y = _mm_mullo_epi32(u[12], cospi32);
- u[11] = _mm_sub_epi32(y, x);
- u[11] = _mm_add_epi32(u[11], rnding);
- u[11] = _mm_srai_epi32(u[11], bit);
+ // stage 6
+ addsub_sse4_1(u[0], u[7], &u[0], &u[7], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[1], u[6], &u[1], &u[6], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[2], u[5], &u[2], &u[5], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[3], u[4], &u[3], &u[4], &clamp_lo, &clamp_hi);
+
+ x = _mm_mullo_epi32(u[10], cospi32);
+ y = _mm_mullo_epi32(u[13], cospi32);
+ u[10] = _mm_sub_epi32(y, x);
+ u[10] = _mm_add_epi32(u[10], rnding);
+ u[10] = _mm_srai_epi32(u[10], bit);
+
+ u[13] = _mm_add_epi32(x, y);
+ u[13] = _mm_add_epi32(u[13], rnding);
+ u[13] = _mm_srai_epi32(u[13], bit);
+
+ x = _mm_mullo_epi32(u[11], cospi32);
+ y = _mm_mullo_epi32(u[12], cospi32);
+ u[11] = _mm_sub_epi32(y, x);
+ u[11] = _mm_add_epi32(u[11], rnding);
+ u[11] = _mm_srai_epi32(u[11], bit);
+
+ u[12] = _mm_add_epi32(x, y);
+ u[12] = _mm_add_epi32(u[12], rnding);
+ u[12] = _mm_srai_epi32(u[12], bit);
+ // stage 7
+ addsub_sse4_1(u[0], u[15], out + 0, out + 15, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[1], u[14], out + 1, out + 14, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[2], u[13], out + 2, out + 13, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[3], u[12], out + 3, out + 12, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[4], u[11], out + 4, out + 11, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[5], u[10], out + 5, out + 10, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[6], u[9], out + 6, out + 9, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[7], u[8], out + 7, out + 8, &clamp_lo, &clamp_hi);
- u[12] = _mm_add_epi32(x, y);
- u[12] = _mm_add_epi32(u[12], rnding);
- u[12] = _mm_srai_epi32(u[12], bit);
- // stage 7
- if (do_cols) {
- addsub_no_clamp_sse4_1(u[0], u[15], out + 0, out + 15);
- addsub_no_clamp_sse4_1(u[1], u[14], out + 1, out + 14);
- addsub_no_clamp_sse4_1(u[2], u[13], out + 2, out + 13);
- addsub_no_clamp_sse4_1(u[3], u[12], out + 3, out + 12);
- addsub_no_clamp_sse4_1(u[4], u[11], out + 4, out + 11);
- addsub_no_clamp_sse4_1(u[5], u[10], out + 5, out + 10);
- addsub_no_clamp_sse4_1(u[6], u[9], out + 6, out + 9);
- addsub_no_clamp_sse4_1(u[7], u[8], out + 7, out + 8);
- } else {
- const int log_range_out = AOMMAX(16, bd + 6);
- const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
- -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
- const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
- (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
-
- addsub_shift_sse4_1(u[0], u[15], out + 0, out + 15, &clamp_lo_out,
- &clamp_hi_out, out_shift);
- addsub_shift_sse4_1(u[1], u[14], out + 1, out + 14, &clamp_lo_out,
- &clamp_hi_out, out_shift);
- addsub_shift_sse4_1(u[2], u[13], out + 2, out + 13, &clamp_lo_out,
- &clamp_hi_out, out_shift);
- addsub_shift_sse4_1(u[3], u[12], out + 3, out + 12, &clamp_lo_out,
- &clamp_hi_out, out_shift);
- addsub_shift_sse4_1(u[4], u[11], out + 4, out + 11, &clamp_lo_out,
- &clamp_hi_out, out_shift);
- addsub_shift_sse4_1(u[5], u[10], out + 5, out + 10, &clamp_lo_out,
- &clamp_hi_out, out_shift);
- addsub_shift_sse4_1(u[6], u[9], out + 6, out + 9, &clamp_lo_out,
- &clamp_hi_out, out_shift);
- addsub_shift_sse4_1(u[7], u[8], out + 7, out + 8, &clamp_lo_out,
- &clamp_hi_out, out_shift);
- }
+ if (!do_cols) {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
+ const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
+ round_shift_8x8(out, out_shift);
+ highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 16);
}
}
@@ -1910,167 +2055,162 @@ static void iadst16x16_low1_sse4_1(__m128i *in, __m128i *out, int bit,
const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
const __m128i zero = _mm_setzero_si128();
__m128i v[16], x, y, temp1, temp2;
+ // stage 0
+ // stage 1
+ // stage 2
+ x = _mm_mullo_epi32(in[0], cospi62);
+ v[0] = _mm_add_epi32(x, rnding);
+ v[0] = _mm_srai_epi32(v[0], bit);
- // Calculate the column 0, 1, 2, 3
- {
- // stage 0
- // stage 1
- // stage 2
- x = _mm_mullo_epi32(in[0], cospi62);
- v[0] = _mm_add_epi32(x, rnding);
- v[0] = _mm_srai_epi32(v[0], bit);
-
- x = _mm_mullo_epi32(in[0], cospi2);
- v[1] = _mm_sub_epi32(zero, x);
- v[1] = _mm_add_epi32(v[1], rnding);
- v[1] = _mm_srai_epi32(v[1], bit);
+ x = _mm_mullo_epi32(in[0], cospi2);
+ v[1] = _mm_sub_epi32(zero, x);
+ v[1] = _mm_add_epi32(v[1], rnding);
+ v[1] = _mm_srai_epi32(v[1], bit);
- // stage 3
- v[8] = v[0];
- v[9] = v[1];
+ // stage 3
+ v[8] = v[0];
+ v[9] = v[1];
- // stage 4
- temp1 = _mm_mullo_epi32(v[8], cospi8);
- x = _mm_mullo_epi32(v[9], cospi56);
- temp1 = _mm_add_epi32(temp1, x);
- temp1 = _mm_add_epi32(temp1, rnding);
- temp1 = _mm_srai_epi32(temp1, bit);
-
- temp2 = _mm_mullo_epi32(v[8], cospi56);
- x = _mm_mullo_epi32(v[9], cospi8);
- temp2 = _mm_sub_epi32(temp2, x);
- temp2 = _mm_add_epi32(temp2, rnding);
- temp2 = _mm_srai_epi32(temp2, bit);
- v[8] = temp1;
- v[9] = temp2;
+ // stage 4
+ temp1 = _mm_mullo_epi32(v[8], cospi8);
+ x = _mm_mullo_epi32(v[9], cospi56);
+ temp1 = _mm_add_epi32(temp1, x);
+ temp1 = _mm_add_epi32(temp1, rnding);
+ temp1 = _mm_srai_epi32(temp1, bit);
- // stage 5
- v[4] = v[0];
- v[5] = v[1];
- v[12] = v[8];
- v[13] = v[9];
+ temp2 = _mm_mullo_epi32(v[8], cospi56);
+ x = _mm_mullo_epi32(v[9], cospi8);
+ temp2 = _mm_sub_epi32(temp2, x);
+ temp2 = _mm_add_epi32(temp2, rnding);
+ temp2 = _mm_srai_epi32(temp2, bit);
+ v[8] = temp1;
+ v[9] = temp2;
- // stage 6
- temp1 = _mm_mullo_epi32(v[4], cospi16);
- x = _mm_mullo_epi32(v[5], cospi48);
- temp1 = _mm_add_epi32(temp1, x);
- temp1 = _mm_add_epi32(temp1, rnding);
- temp1 = _mm_srai_epi32(temp1, bit);
-
- temp2 = _mm_mullo_epi32(v[4], cospi48);
- x = _mm_mullo_epi32(v[5], cospi16);
- temp2 = _mm_sub_epi32(temp2, x);
- temp2 = _mm_add_epi32(temp2, rnding);
- temp2 = _mm_srai_epi32(temp2, bit);
- v[4] = temp1;
- v[5] = temp2;
-
- temp1 = _mm_mullo_epi32(v[12], cospi16);
- x = _mm_mullo_epi32(v[13], cospi48);
- temp1 = _mm_add_epi32(temp1, x);
- temp1 = _mm_add_epi32(temp1, rnding);
- temp1 = _mm_srai_epi32(temp1, bit);
-
- temp2 = _mm_mullo_epi32(v[12], cospi48);
- x = _mm_mullo_epi32(v[13], cospi16);
- temp2 = _mm_sub_epi32(temp2, x);
- temp2 = _mm_add_epi32(temp2, rnding);
- temp2 = _mm_srai_epi32(temp2, bit);
- v[12] = temp1;
- v[13] = temp2;
+ // stage 5
+ v[4] = v[0];
+ v[5] = v[1];
+ v[12] = v[8];
+ v[13] = v[9];
- // stage 7
- v[2] = v[0];
- v[3] = v[1];
- v[6] = v[4];
- v[7] = v[5];
- v[10] = v[8];
- v[11] = v[9];
- v[14] = v[12];
- v[15] = v[13];
+ // stage 6
+ temp1 = _mm_mullo_epi32(v[4], cospi16);
+ x = _mm_mullo_epi32(v[5], cospi48);
+ temp1 = _mm_add_epi32(temp1, x);
+ temp1 = _mm_add_epi32(temp1, rnding);
+ temp1 = _mm_srai_epi32(temp1, bit);
- // stage 8
- y = _mm_mullo_epi32(v[2], cospi32);
- x = _mm_mullo_epi32(v[3], cospi32);
- v[2] = _mm_add_epi32(y, x);
- v[2] = _mm_add_epi32(v[2], rnding);
- v[2] = _mm_srai_epi32(v[2], bit);
+ temp2 = _mm_mullo_epi32(v[4], cospi48);
+ x = _mm_mullo_epi32(v[5], cospi16);
+ temp2 = _mm_sub_epi32(temp2, x);
+ temp2 = _mm_add_epi32(temp2, rnding);
+ temp2 = _mm_srai_epi32(temp2, bit);
+ v[4] = temp1;
+ v[5] = temp2;
+
+ temp1 = _mm_mullo_epi32(v[12], cospi16);
+ x = _mm_mullo_epi32(v[13], cospi48);
+ temp1 = _mm_add_epi32(temp1, x);
+ temp1 = _mm_add_epi32(temp1, rnding);
+ temp1 = _mm_srai_epi32(temp1, bit);
- v[3] = _mm_sub_epi32(y, x);
- v[3] = _mm_add_epi32(v[3], rnding);
- v[3] = _mm_srai_epi32(v[3], bit);
+ temp2 = _mm_mullo_epi32(v[12], cospi48);
+ x = _mm_mullo_epi32(v[13], cospi16);
+ temp2 = _mm_sub_epi32(temp2, x);
+ temp2 = _mm_add_epi32(temp2, rnding);
+ temp2 = _mm_srai_epi32(temp2, bit);
+ v[12] = temp1;
+ v[13] = temp2;
- y = _mm_mullo_epi32(v[6], cospi32);
- x = _mm_mullo_epi32(v[7], cospi32);
- v[6] = _mm_add_epi32(y, x);
- v[6] = _mm_add_epi32(v[6], rnding);
- v[6] = _mm_srai_epi32(v[6], bit);
-
- v[7] = _mm_sub_epi32(y, x);
- v[7] = _mm_add_epi32(v[7], rnding);
- v[7] = _mm_srai_epi32(v[7], bit);
-
- y = _mm_mullo_epi32(v[10], cospi32);
- x = _mm_mullo_epi32(v[11], cospi32);
- v[10] = _mm_add_epi32(y, x);
- v[10] = _mm_add_epi32(v[10], rnding);
- v[10] = _mm_srai_epi32(v[10], bit);
+ // stage 7
+ v[2] = v[0];
+ v[3] = v[1];
+ v[6] = v[4];
+ v[7] = v[5];
+ v[10] = v[8];
+ v[11] = v[9];
+ v[14] = v[12];
+ v[15] = v[13];
- v[11] = _mm_sub_epi32(y, x);
- v[11] = _mm_add_epi32(v[11], rnding);
- v[11] = _mm_srai_epi32(v[11], bit);
+ // stage 8
+ y = _mm_mullo_epi32(v[2], cospi32);
+ x = _mm_mullo_epi32(v[3], cospi32);
+ v[2] = _mm_add_epi32(y, x);
+ v[2] = _mm_add_epi32(v[2], rnding);
+ v[2] = _mm_srai_epi32(v[2], bit);
- y = _mm_mullo_epi32(v[14], cospi32);
- x = _mm_mullo_epi32(v[15], cospi32);
- v[14] = _mm_add_epi32(y, x);
- v[14] = _mm_add_epi32(v[14], rnding);
- v[14] = _mm_srai_epi32(v[14], bit);
+ v[3] = _mm_sub_epi32(y, x);
+ v[3] = _mm_add_epi32(v[3], rnding);
+ v[3] = _mm_srai_epi32(v[3], bit);
- v[15] = _mm_sub_epi32(y, x);
- v[15] = _mm_add_epi32(v[15], rnding);
- v[15] = _mm_srai_epi32(v[15], bit);
+ y = _mm_mullo_epi32(v[6], cospi32);
+ x = _mm_mullo_epi32(v[7], cospi32);
+ v[6] = _mm_add_epi32(y, x);
+ v[6] = _mm_add_epi32(v[6], rnding);
+ v[6] = _mm_srai_epi32(v[6], bit);
+
+ v[7] = _mm_sub_epi32(y, x);
+ v[7] = _mm_add_epi32(v[7], rnding);
+ v[7] = _mm_srai_epi32(v[7], bit);
+
+ y = _mm_mullo_epi32(v[10], cospi32);
+ x = _mm_mullo_epi32(v[11], cospi32);
+ v[10] = _mm_add_epi32(y, x);
+ v[10] = _mm_add_epi32(v[10], rnding);
+ v[10] = _mm_srai_epi32(v[10], bit);
+
+ v[11] = _mm_sub_epi32(y, x);
+ v[11] = _mm_add_epi32(v[11], rnding);
+ v[11] = _mm_srai_epi32(v[11], bit);
+
+ y = _mm_mullo_epi32(v[14], cospi32);
+ x = _mm_mullo_epi32(v[15], cospi32);
+ v[14] = _mm_add_epi32(y, x);
+ v[14] = _mm_add_epi32(v[14], rnding);
+ v[14] = _mm_srai_epi32(v[14], bit);
+
+ v[15] = _mm_sub_epi32(y, x);
+ v[15] = _mm_add_epi32(v[15], rnding);
+ v[15] = _mm_srai_epi32(v[15], bit);
- // stage 9
- if (do_cols) {
- out[0] = v[0];
- out[1] = _mm_sub_epi32(_mm_setzero_si128(), v[8]);
- out[2] = v[12];
- out[3] = _mm_sub_epi32(_mm_setzero_si128(), v[4]);
- out[4] = v[6];
- out[5] = _mm_sub_epi32(_mm_setzero_si128(), v[14]);
- out[6] = v[10];
- out[7] = _mm_sub_epi32(_mm_setzero_si128(), v[2]);
- out[8] = v[3];
- out[9] = _mm_sub_epi32(_mm_setzero_si128(), v[11]);
- out[10] = v[15];
- out[11] = _mm_sub_epi32(_mm_setzero_si128(), v[7]);
- out[12] = v[5];
- out[13] = _mm_sub_epi32(_mm_setzero_si128(), v[13]);
- out[14] = v[9];
- out[15] = _mm_sub_epi32(_mm_setzero_si128(), v[1]);
- } else {
- const int log_range_out = AOMMAX(16, bd + 6);
- const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
- const __m128i clamp_hi_out =
- _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
+ // stage 9
+ if (do_cols) {
+ out[0] = v[0];
+ out[1] = _mm_sub_epi32(zero, v[8]);
+ out[2] = v[12];
+ out[3] = _mm_sub_epi32(zero, v[4]);
+ out[4] = v[6];
+ out[5] = _mm_sub_epi32(zero, v[14]);
+ out[6] = v[10];
+ out[7] = _mm_sub_epi32(zero, v[2]);
+ out[8] = v[3];
+ out[9] = _mm_sub_epi32(zero, v[11]);
+ out[10] = v[15];
+ out[11] = _mm_sub_epi32(zero, v[7]);
+ out[12] = v[5];
+ out[13] = _mm_sub_epi32(zero, v[13]);
+ out[14] = v[9];
+ out[15] = _mm_sub_epi32(zero, v[1]);
+ } else {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
+ const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
- neg_shift_sse4_1(v[0], v[8], out + 0, out + 1, &clamp_lo_out,
- &clamp_hi_out, out_shift);
- neg_shift_sse4_1(v[12], v[4], out + 2, out + 3, &clamp_lo_out,
- &clamp_hi_out, out_shift);
- neg_shift_sse4_1(v[6], v[14], out + 4, out + 5, &clamp_lo_out,
- &clamp_hi_out, out_shift);
- neg_shift_sse4_1(v[10], v[2], out + 6, out + 7, &clamp_lo_out,
- &clamp_hi_out, out_shift);
- neg_shift_sse4_1(v[3], v[11], out + 8, out + 9, &clamp_lo_out,
- &clamp_hi_out, out_shift);
- neg_shift_sse4_1(v[15], v[7], out + 10, out + 11, &clamp_lo_out,
- &clamp_hi_out, out_shift);
- neg_shift_sse4_1(v[5], v[13], out + 12, out + 13, &clamp_lo_out,
- &clamp_hi_out, out_shift);
- neg_shift_sse4_1(v[9], v[1], out + 14, out + 15, &clamp_lo_out,
- &clamp_hi_out, out_shift);
- }
+ neg_shift_sse4_1(v[0], v[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
+ out_shift);
+ neg_shift_sse4_1(v[12], v[4], out + 2, out + 3, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(v[6], v[14], out + 4, out + 5, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(v[10], v[2], out + 6, out + 7, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(v[3], v[11], out + 8, out + 9, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(v[15], v[7], out + 10, out + 11, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(v[5], v[13], out + 12, out + 13, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(v[9], v[1], out + 14, out + 15, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
}
}
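// Editorial sketch, not part of the imported patch: in the row pass the ADST
// outputs alternate in sign, so neg_shift_sse4_1 combines the negation of
// its second argument with the round shift and clamp, matching the
// offset/_mm_sra_epi32 pattern used elsewhere in this file.  One output
// pair, in scalar form:
static inline void neg_shift_sketch(int32_t a, int32_t b, int32_t *o0,
                                    int32_t *o1, int32_t lo, int32_t hi,
                                    int shift) {
  const int32_t offset = (1 << shift) >> 1;
  int32_t r0 = (a + offset) >> shift;  // round_shift(a, shift)
  int32_t r1 = (offset - b) >> shift;  // round_shift(-b, shift)
  if (r0 < lo) r0 = lo;
  if (r0 > hi) r0 = hi;
  if (r1 < lo) r1 = lo;
  if (r1 > hi) r1 = hi;
  *o0 = r0;
  *o1 = r1;
}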
@@ -2107,291 +2247,287 @@ static void iadst16x16_low8_sse4_1(__m128i *in, __m128i *out, int bit,
const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+ __m128i zero = _mm_setzero_si128();
__m128i u[16], x, y;
- // Calculate the column 0, 1, 2, 3
- {
- // stage 0
- // stage 1
- // stage 2
- __m128i zero = _mm_setzero_si128();
- x = _mm_mullo_epi32(in[0], cospi62);
- u[0] = _mm_add_epi32(x, rnding);
- u[0] = _mm_srai_epi32(u[0], bit);
-
- x = _mm_mullo_epi32(in[0], cospi2);
- u[1] = _mm_sub_epi32(zero, x);
- u[1] = _mm_add_epi32(u[1], rnding);
- u[1] = _mm_srai_epi32(u[1], bit);
-
- x = _mm_mullo_epi32(in[2], cospi54);
- u[2] = _mm_add_epi32(x, rnding);
- u[2] = _mm_srai_epi32(u[2], bit);
-
- x = _mm_mullo_epi32(in[2], cospi10);
- u[3] = _mm_sub_epi32(zero, x);
- u[3] = _mm_add_epi32(u[3], rnding);
- u[3] = _mm_srai_epi32(u[3], bit);
-
- x = _mm_mullo_epi32(in[4], cospi46);
- u[4] = _mm_add_epi32(x, rnding);
- u[4] = _mm_srai_epi32(u[4], bit);
-
- x = _mm_mullo_epi32(in[4], cospi18);
- u[5] = _mm_sub_epi32(zero, x);
- u[5] = _mm_add_epi32(u[5], rnding);
- u[5] = _mm_srai_epi32(u[5], bit);
+ // stage 0
+ // stage 1
+ // stage 2
+ x = _mm_mullo_epi32(in[0], cospi62);
+ u[0] = _mm_add_epi32(x, rnding);
+ u[0] = _mm_srai_epi32(u[0], bit);
- x = _mm_mullo_epi32(in[6], cospi38);
- u[6] = _mm_add_epi32(x, rnding);
- u[6] = _mm_srai_epi32(u[6], bit);
+ x = _mm_mullo_epi32(in[0], cospi2);
+ u[1] = _mm_sub_epi32(zero, x);
+ u[1] = _mm_add_epi32(u[1], rnding);
+ u[1] = _mm_srai_epi32(u[1], bit);
- x = _mm_mullo_epi32(in[6], cospi26);
- u[7] = _mm_sub_epi32(zero, x);
- u[7] = _mm_add_epi32(u[7], rnding);
- u[7] = _mm_srai_epi32(u[7], bit);
+ x = _mm_mullo_epi32(in[2], cospi54);
+ u[2] = _mm_add_epi32(x, rnding);
+ u[2] = _mm_srai_epi32(u[2], bit);
- u[8] = _mm_mullo_epi32(in[7], cospi34);
- u[8] = _mm_add_epi32(u[8], rnding);
- u[8] = _mm_srai_epi32(u[8], bit);
+ x = _mm_mullo_epi32(in[2], cospi10);
+ u[3] = _mm_sub_epi32(zero, x);
+ u[3] = _mm_add_epi32(u[3], rnding);
+ u[3] = _mm_srai_epi32(u[3], bit);
- u[9] = _mm_mullo_epi32(in[7], cospi30);
- u[9] = _mm_add_epi32(u[9], rnding);
- u[9] = _mm_srai_epi32(u[9], bit);
+ x = _mm_mullo_epi32(in[4], cospi46);
+ u[4] = _mm_add_epi32(x, rnding);
+ u[4] = _mm_srai_epi32(u[4], bit);
- u[10] = _mm_mullo_epi32(in[5], cospi42);
- u[10] = _mm_add_epi32(u[10], rnding);
- u[10] = _mm_srai_epi32(u[10], bit);
+ x = _mm_mullo_epi32(in[4], cospi18);
+ u[5] = _mm_sub_epi32(zero, x);
+ u[5] = _mm_add_epi32(u[5], rnding);
+ u[5] = _mm_srai_epi32(u[5], bit);
- u[11] = _mm_mullo_epi32(in[5], cospi22);
- u[11] = _mm_add_epi32(u[11], rnding);
- u[11] = _mm_srai_epi32(u[11], bit);
+ x = _mm_mullo_epi32(in[6], cospi38);
+ u[6] = _mm_add_epi32(x, rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
- u[12] = _mm_mullo_epi32(in[3], cospi50);
- u[12] = _mm_add_epi32(u[12], rnding);
- u[12] = _mm_srai_epi32(u[12], bit);
+ x = _mm_mullo_epi32(in[6], cospi26);
+ u[7] = _mm_sub_epi32(zero, x);
+ u[7] = _mm_add_epi32(u[7], rnding);
+ u[7] = _mm_srai_epi32(u[7], bit);
- u[13] = _mm_mullo_epi32(in[3], cospi14);
- u[13] = _mm_add_epi32(u[13], rnding);
- u[13] = _mm_srai_epi32(u[13], bit);
+ u[8] = _mm_mullo_epi32(in[7], cospi34);
+ u[8] = _mm_add_epi32(u[8], rnding);
+ u[8] = _mm_srai_epi32(u[8], bit);
- u[14] = _mm_mullo_epi32(in[1], cospi58);
- u[14] = _mm_add_epi32(u[14], rnding);
- u[14] = _mm_srai_epi32(u[14], bit);
+ u[9] = _mm_mullo_epi32(in[7], cospi30);
+ u[9] = _mm_add_epi32(u[9], rnding);
+ u[9] = _mm_srai_epi32(u[9], bit);
- u[15] = _mm_mullo_epi32(in[1], cospi6);
- u[15] = _mm_add_epi32(u[15], rnding);
- u[15] = _mm_srai_epi32(u[15], bit);
+ u[10] = _mm_mullo_epi32(in[5], cospi42);
+ u[10] = _mm_add_epi32(u[10], rnding);
+ u[10] = _mm_srai_epi32(u[10], bit);
- // stage 3
- addsub_sse4_1(u[0], u[8], &u[0], &u[8], &clamp_lo, &clamp_hi);
- addsub_sse4_1(u[1], u[9], &u[1], &u[9], &clamp_lo, &clamp_hi);
- addsub_sse4_1(u[2], u[10], &u[2], &u[10], &clamp_lo, &clamp_hi);
- addsub_sse4_1(u[3], u[11], &u[3], &u[11], &clamp_lo, &clamp_hi);
- addsub_sse4_1(u[4], u[12], &u[4], &u[12], &clamp_lo, &clamp_hi);
- addsub_sse4_1(u[5], u[13], &u[5], &u[13], &clamp_lo, &clamp_hi);
- addsub_sse4_1(u[6], u[14], &u[6], &u[14], &clamp_lo, &clamp_hi);
- addsub_sse4_1(u[7], u[15], &u[7], &u[15], &clamp_lo, &clamp_hi);
+ u[11] = _mm_mullo_epi32(in[5], cospi22);
+ u[11] = _mm_add_epi32(u[11], rnding);
+ u[11] = _mm_srai_epi32(u[11], bit);
- // stage 4
- y = _mm_mullo_epi32(u[8], cospi56);
- x = _mm_mullo_epi32(u[9], cospi56);
- u[8] = _mm_mullo_epi32(u[8], cospi8);
- u[8] = _mm_add_epi32(u[8], x);
- u[8] = _mm_add_epi32(u[8], rnding);
- u[8] = _mm_srai_epi32(u[8], bit);
-
- x = _mm_mullo_epi32(u[9], cospi8);
- u[9] = _mm_sub_epi32(y, x);
- u[9] = _mm_add_epi32(u[9], rnding);
- u[9] = _mm_srai_epi32(u[9], bit);
-
- x = _mm_mullo_epi32(u[11], cospi24);
- y = _mm_mullo_epi32(u[10], cospi24);
- u[10] = _mm_mullo_epi32(u[10], cospi40);
- u[10] = _mm_add_epi32(u[10], x);
- u[10] = _mm_add_epi32(u[10], rnding);
- u[10] = _mm_srai_epi32(u[10], bit);
-
- x = _mm_mullo_epi32(u[11], cospi40);
- u[11] = _mm_sub_epi32(y, x);
- u[11] = _mm_add_epi32(u[11], rnding);
- u[11] = _mm_srai_epi32(u[11], bit);
-
- x = _mm_mullo_epi32(u[13], cospi8);
- y = _mm_mullo_epi32(u[12], cospi8);
- u[12] = _mm_mullo_epi32(u[12], cospim56);
- u[12] = _mm_add_epi32(u[12], x);
- u[12] = _mm_add_epi32(u[12], rnding);
- u[12] = _mm_srai_epi32(u[12], bit);
-
- x = _mm_mullo_epi32(u[13], cospim56);
- u[13] = _mm_sub_epi32(y, x);
- u[13] = _mm_add_epi32(u[13], rnding);
- u[13] = _mm_srai_epi32(u[13], bit);
-
- x = _mm_mullo_epi32(u[15], cospi40);
- y = _mm_mullo_epi32(u[14], cospi40);
- u[14] = _mm_mullo_epi32(u[14], cospim24);
- u[14] = _mm_add_epi32(u[14], x);
- u[14] = _mm_add_epi32(u[14], rnding);
- u[14] = _mm_srai_epi32(u[14], bit);
-
- x = _mm_mullo_epi32(u[15], cospim24);
- u[15] = _mm_sub_epi32(y, x);
- u[15] = _mm_add_epi32(u[15], rnding);
- u[15] = _mm_srai_epi32(u[15], bit);
+ u[12] = _mm_mullo_epi32(in[3], cospi50);
+ u[12] = _mm_add_epi32(u[12], rnding);
+ u[12] = _mm_srai_epi32(u[12], bit);
- // stage 5
- addsub_sse4_1(u[0], u[4], &u[0], &u[4], &clamp_lo, &clamp_hi);
- addsub_sse4_1(u[1], u[5], &u[1], &u[5], &clamp_lo, &clamp_hi);
- addsub_sse4_1(u[2], u[6], &u[2], &u[6], &clamp_lo, &clamp_hi);
- addsub_sse4_1(u[3], u[7], &u[3], &u[7], &clamp_lo, &clamp_hi);
- addsub_sse4_1(u[8], u[12], &u[8], &u[12], &clamp_lo, &clamp_hi);
- addsub_sse4_1(u[9], u[13], &u[9], &u[13], &clamp_lo, &clamp_hi);
- addsub_sse4_1(u[10], u[14], &u[10], &u[14], &clamp_lo, &clamp_hi);
- addsub_sse4_1(u[11], u[15], &u[11], &u[15], &clamp_lo, &clamp_hi);
+ u[13] = _mm_mullo_epi32(in[3], cospi14);
+ u[13] = _mm_add_epi32(u[13], rnding);
+ u[13] = _mm_srai_epi32(u[13], bit);
- // stage 6
- x = _mm_mullo_epi32(u[5], cospi48);
- y = _mm_mullo_epi32(u[4], cospi48);
- u[4] = _mm_mullo_epi32(u[4], cospi16);
- u[4] = _mm_add_epi32(u[4], x);
- u[4] = _mm_add_epi32(u[4], rnding);
- u[4] = _mm_srai_epi32(u[4], bit);
-
- x = _mm_mullo_epi32(u[5], cospi16);
- u[5] = _mm_sub_epi32(y, x);
- u[5] = _mm_add_epi32(u[5], rnding);
- u[5] = _mm_srai_epi32(u[5], bit);
+ u[14] = _mm_mullo_epi32(in[1], cospi58);
+ u[14] = _mm_add_epi32(u[14], rnding);
+ u[14] = _mm_srai_epi32(u[14], bit);
- x = _mm_mullo_epi32(u[7], cospi16);
- y = _mm_mullo_epi32(u[6], cospi16);
- u[6] = _mm_mullo_epi32(u[6], cospim48);
- u[6] = _mm_add_epi32(u[6], x);
- u[6] = _mm_add_epi32(u[6], rnding);
- u[6] = _mm_srai_epi32(u[6], bit);
+ u[15] = _mm_mullo_epi32(in[1], cospi6);
+ u[15] = _mm_add_epi32(u[15], rnding);
+ u[15] = _mm_srai_epi32(u[15], bit);
- x = _mm_mullo_epi32(u[7], cospim48);
- u[7] = _mm_sub_epi32(y, x);
- u[7] = _mm_add_epi32(u[7], rnding);
- u[7] = _mm_srai_epi32(u[7], bit);
-
- x = _mm_mullo_epi32(u[13], cospi48);
- y = _mm_mullo_epi32(u[12], cospi48);
- u[12] = _mm_mullo_epi32(u[12], cospi16);
- u[12] = _mm_add_epi32(u[12], x);
- u[12] = _mm_add_epi32(u[12], rnding);
- u[12] = _mm_srai_epi32(u[12], bit);
-
- x = _mm_mullo_epi32(u[13], cospi16);
- u[13] = _mm_sub_epi32(y, x);
- u[13] = _mm_add_epi32(u[13], rnding);
- u[13] = _mm_srai_epi32(u[13], bit);
-
- x = _mm_mullo_epi32(u[15], cospi16);
- y = _mm_mullo_epi32(u[14], cospi16);
- u[14] = _mm_mullo_epi32(u[14], cospim48);
- u[14] = _mm_add_epi32(u[14], x);
- u[14] = _mm_add_epi32(u[14], rnding);
- u[14] = _mm_srai_epi32(u[14], bit);
-
- x = _mm_mullo_epi32(u[15], cospim48);
- u[15] = _mm_sub_epi32(y, x);
- u[15] = _mm_add_epi32(u[15], rnding);
- u[15] = _mm_srai_epi32(u[15], bit);
+ // stage 3
+ addsub_sse4_1(u[0], u[8], &u[0], &u[8], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[1], u[9], &u[1], &u[9], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[2], u[10], &u[2], &u[10], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[3], u[11], &u[3], &u[11], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[4], u[12], &u[4], &u[12], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[5], u[13], &u[5], &u[13], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[6], u[14], &u[6], &u[14], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[7], u[15], &u[7], &u[15], &clamp_lo, &clamp_hi);
- // stage 7
- addsub_sse4_1(u[0], u[2], &u[0], &u[2], &clamp_lo, &clamp_hi);
- addsub_sse4_1(u[1], u[3], &u[1], &u[3], &clamp_lo, &clamp_hi);
- addsub_sse4_1(u[4], u[6], &u[4], &u[6], &clamp_lo, &clamp_hi);
- addsub_sse4_1(u[5], u[7], &u[5], &u[7], &clamp_lo, &clamp_hi);
- addsub_sse4_1(u[8], u[10], &u[8], &u[10], &clamp_lo, &clamp_hi);
- addsub_sse4_1(u[9], u[11], &u[9], &u[11], &clamp_lo, &clamp_hi);
- addsub_sse4_1(u[12], u[14], &u[12], &u[14], &clamp_lo, &clamp_hi);
- addsub_sse4_1(u[13], u[15], &u[13], &u[15], &clamp_lo, &clamp_hi);
+ // stage 4
+ y = _mm_mullo_epi32(u[8], cospi56);
+ x = _mm_mullo_epi32(u[9], cospi56);
+ u[8] = _mm_mullo_epi32(u[8], cospi8);
+ u[8] = _mm_add_epi32(u[8], x);
+ u[8] = _mm_add_epi32(u[8], rnding);
+ u[8] = _mm_srai_epi32(u[8], bit);
+
+ x = _mm_mullo_epi32(u[9], cospi8);
+ u[9] = _mm_sub_epi32(y, x);
+ u[9] = _mm_add_epi32(u[9], rnding);
+ u[9] = _mm_srai_epi32(u[9], bit);
+
+ x = _mm_mullo_epi32(u[11], cospi24);
+ y = _mm_mullo_epi32(u[10], cospi24);
+ u[10] = _mm_mullo_epi32(u[10], cospi40);
+ u[10] = _mm_add_epi32(u[10], x);
+ u[10] = _mm_add_epi32(u[10], rnding);
+ u[10] = _mm_srai_epi32(u[10], bit);
+
+ x = _mm_mullo_epi32(u[11], cospi40);
+ u[11] = _mm_sub_epi32(y, x);
+ u[11] = _mm_add_epi32(u[11], rnding);
+ u[11] = _mm_srai_epi32(u[11], bit);
+
+ x = _mm_mullo_epi32(u[13], cospi8);
+ y = _mm_mullo_epi32(u[12], cospi8);
+ u[12] = _mm_mullo_epi32(u[12], cospim56);
+ u[12] = _mm_add_epi32(u[12], x);
+ u[12] = _mm_add_epi32(u[12], rnding);
+ u[12] = _mm_srai_epi32(u[12], bit);
+
+ x = _mm_mullo_epi32(u[13], cospim56);
+ u[13] = _mm_sub_epi32(y, x);
+ u[13] = _mm_add_epi32(u[13], rnding);
+ u[13] = _mm_srai_epi32(u[13], bit);
+
+ x = _mm_mullo_epi32(u[15], cospi40);
+ y = _mm_mullo_epi32(u[14], cospi40);
+ u[14] = _mm_mullo_epi32(u[14], cospim24);
+ u[14] = _mm_add_epi32(u[14], x);
+ u[14] = _mm_add_epi32(u[14], rnding);
+ u[14] = _mm_srai_epi32(u[14], bit);
+
+ x = _mm_mullo_epi32(u[15], cospim24);
+ u[15] = _mm_sub_epi32(y, x);
+ u[15] = _mm_add_epi32(u[15], rnding);
+ u[15] = _mm_srai_epi32(u[15], bit);
- // stage 8
- y = _mm_mullo_epi32(u[2], cospi32);
- x = _mm_mullo_epi32(u[3], cospi32);
- u[2] = _mm_add_epi32(y, x);
- u[2] = _mm_add_epi32(u[2], rnding);
- u[2] = _mm_srai_epi32(u[2], bit);
-
- u[3] = _mm_sub_epi32(y, x);
- u[3] = _mm_add_epi32(u[3], rnding);
- u[3] = _mm_srai_epi32(u[3], bit);
- y = _mm_mullo_epi32(u[6], cospi32);
- x = _mm_mullo_epi32(u[7], cospi32);
- u[6] = _mm_add_epi32(y, x);
- u[6] = _mm_add_epi32(u[6], rnding);
- u[6] = _mm_srai_epi32(u[6], bit);
+ // stage 5
+ addsub_sse4_1(u[0], u[4], &u[0], &u[4], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[1], u[5], &u[1], &u[5], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[2], u[6], &u[2], &u[6], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[3], u[7], &u[3], &u[7], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[8], u[12], &u[8], &u[12], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[9], u[13], &u[9], &u[13], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[10], u[14], &u[10], &u[14], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[11], u[15], &u[11], &u[15], &clamp_lo, &clamp_hi);
- u[7] = _mm_sub_epi32(y, x);
- u[7] = _mm_add_epi32(u[7], rnding);
- u[7] = _mm_srai_epi32(u[7], bit);
+ // stage 6
+ x = _mm_mullo_epi32(u[5], cospi48);
+ y = _mm_mullo_epi32(u[4], cospi48);
+ u[4] = _mm_mullo_epi32(u[4], cospi16);
+ u[4] = _mm_add_epi32(u[4], x);
+ u[4] = _mm_add_epi32(u[4], rnding);
+ u[4] = _mm_srai_epi32(u[4], bit);
- y = _mm_mullo_epi32(u[10], cospi32);
- x = _mm_mullo_epi32(u[11], cospi32);
- u[10] = _mm_add_epi32(y, x);
- u[10] = _mm_add_epi32(u[10], rnding);
- u[10] = _mm_srai_epi32(u[10], bit);
+ x = _mm_mullo_epi32(u[5], cospi16);
+ u[5] = _mm_sub_epi32(y, x);
+ u[5] = _mm_add_epi32(u[5], rnding);
+ u[5] = _mm_srai_epi32(u[5], bit);
+
+ x = _mm_mullo_epi32(u[7], cospi16);
+ y = _mm_mullo_epi32(u[6], cospi16);
+ u[6] = _mm_mullo_epi32(u[6], cospim48);
+ u[6] = _mm_add_epi32(u[6], x);
+ u[6] = _mm_add_epi32(u[6], rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
- u[11] = _mm_sub_epi32(y, x);
- u[11] = _mm_add_epi32(u[11], rnding);
- u[11] = _mm_srai_epi32(u[11], bit);
+ x = _mm_mullo_epi32(u[7], cospim48);
+ u[7] = _mm_sub_epi32(y, x);
+ u[7] = _mm_add_epi32(u[7], rnding);
+ u[7] = _mm_srai_epi32(u[7], bit);
- y = _mm_mullo_epi32(u[14], cospi32);
- x = _mm_mullo_epi32(u[15], cospi32);
- u[14] = _mm_add_epi32(y, x);
- u[14] = _mm_add_epi32(u[14], rnding);
- u[14] = _mm_srai_epi32(u[14], bit);
+ x = _mm_mullo_epi32(u[13], cospi48);
+ y = _mm_mullo_epi32(u[12], cospi48);
+ u[12] = _mm_mullo_epi32(u[12], cospi16);
+ u[12] = _mm_add_epi32(u[12], x);
+ u[12] = _mm_add_epi32(u[12], rnding);
+ u[12] = _mm_srai_epi32(u[12], bit);
+
+ x = _mm_mullo_epi32(u[13], cospi16);
+ u[13] = _mm_sub_epi32(y, x);
+ u[13] = _mm_add_epi32(u[13], rnding);
+ u[13] = _mm_srai_epi32(u[13], bit);
+
+ x = _mm_mullo_epi32(u[15], cospi16);
+ y = _mm_mullo_epi32(u[14], cospi16);
+ u[14] = _mm_mullo_epi32(u[14], cospim48);
+ u[14] = _mm_add_epi32(u[14], x);
+ u[14] = _mm_add_epi32(u[14], rnding);
+ u[14] = _mm_srai_epi32(u[14], bit);
+
+ x = _mm_mullo_epi32(u[15], cospim48);
+ u[15] = _mm_sub_epi32(y, x);
+ u[15] = _mm_add_epi32(u[15], rnding);
+ u[15] = _mm_srai_epi32(u[15], bit);
- u[15] = _mm_sub_epi32(y, x);
- u[15] = _mm_add_epi32(u[15], rnding);
- u[15] = _mm_srai_epi32(u[15], bit);
+ // stage 7
+ addsub_sse4_1(u[0], u[2], &u[0], &u[2], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[1], u[3], &u[1], &u[3], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[4], u[6], &u[4], &u[6], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[5], u[7], &u[5], &u[7], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[8], u[10], &u[8], &u[10], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[9], u[11], &u[9], &u[11], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[12], u[14], &u[12], &u[14], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[13], u[15], &u[13], &u[15], &clamp_lo, &clamp_hi);
- // stage 9
- if (do_cols) {
- out[0] = u[0];
- out[1] = _mm_sub_epi32(_mm_setzero_si128(), u[8]);
- out[2] = u[12];
- out[3] = _mm_sub_epi32(_mm_setzero_si128(), u[4]);
- out[4] = u[6];
- out[5] = _mm_sub_epi32(_mm_setzero_si128(), u[14]);
- out[6] = u[10];
- out[7] = _mm_sub_epi32(_mm_setzero_si128(), u[2]);
- out[8] = u[3];
- out[9] = _mm_sub_epi32(_mm_setzero_si128(), u[11]);
- out[10] = u[15];
- out[11] = _mm_sub_epi32(_mm_setzero_si128(), u[7]);
- out[12] = u[5];
- out[13] = _mm_sub_epi32(_mm_setzero_si128(), u[13]);
- out[14] = u[9];
- out[15] = _mm_sub_epi32(_mm_setzero_si128(), u[1]);
- } else {
- const int log_range_out = AOMMAX(16, bd + 6);
- const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
- const __m128i clamp_hi_out =
- _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
+ // stage 8
+ y = _mm_mullo_epi32(u[2], cospi32);
+ x = _mm_mullo_epi32(u[3], cospi32);
+ u[2] = _mm_add_epi32(y, x);
+ u[2] = _mm_add_epi32(u[2], rnding);
+ u[2] = _mm_srai_epi32(u[2], bit);
- neg_shift_sse4_1(u[0], u[8], out + 0, out + 1, &clamp_lo_out,
- &clamp_hi_out, out_shift);
- neg_shift_sse4_1(u[12], u[4], out + 2, out + 3, &clamp_lo_out,
- &clamp_hi_out, out_shift);
- neg_shift_sse4_1(u[6], u[14], out + 4, out + 5, &clamp_lo_out,
- &clamp_hi_out, out_shift);
- neg_shift_sse4_1(u[10], u[2], out + 6, out + 7, &clamp_lo_out,
- &clamp_hi_out, out_shift);
- neg_shift_sse4_1(u[3], u[11], out + 8, out + 9, &clamp_lo_out,
- &clamp_hi_out, out_shift);
- neg_shift_sse4_1(u[15], u[7], out + 10, out + 11, &clamp_lo_out,
- &clamp_hi_out, out_shift);
- neg_shift_sse4_1(u[5], u[13], out + 12, out + 13, &clamp_lo_out,
- &clamp_hi_out, out_shift);
- neg_shift_sse4_1(u[9], u[1], out + 14, out + 15, &clamp_lo_out,
- &clamp_hi_out, out_shift);
- }
+ u[3] = _mm_sub_epi32(y, x);
+ u[3] = _mm_add_epi32(u[3], rnding);
+ u[3] = _mm_srai_epi32(u[3], bit);
+ y = _mm_mullo_epi32(u[6], cospi32);
+ x = _mm_mullo_epi32(u[7], cospi32);
+ u[6] = _mm_add_epi32(y, x);
+ u[6] = _mm_add_epi32(u[6], rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ u[7] = _mm_sub_epi32(y, x);
+ u[7] = _mm_add_epi32(u[7], rnding);
+ u[7] = _mm_srai_epi32(u[7], bit);
+
+ y = _mm_mullo_epi32(u[10], cospi32);
+ x = _mm_mullo_epi32(u[11], cospi32);
+ u[10] = _mm_add_epi32(y, x);
+ u[10] = _mm_add_epi32(u[10], rnding);
+ u[10] = _mm_srai_epi32(u[10], bit);
+
+ u[11] = _mm_sub_epi32(y, x);
+ u[11] = _mm_add_epi32(u[11], rnding);
+ u[11] = _mm_srai_epi32(u[11], bit);
+
+ y = _mm_mullo_epi32(u[14], cospi32);
+ x = _mm_mullo_epi32(u[15], cospi32);
+ u[14] = _mm_add_epi32(y, x);
+ u[14] = _mm_add_epi32(u[14], rnding);
+ u[14] = _mm_srai_epi32(u[14], bit);
+
+ u[15] = _mm_sub_epi32(y, x);
+ u[15] = _mm_add_epi32(u[15], rnding);
+ u[15] = _mm_srai_epi32(u[15], bit);
+
+ // stage 9
+ if (do_cols) {
+ out[0] = u[0];
+ out[1] = _mm_sub_epi32(zero, u[8]);
+ out[2] = u[12];
+ out[3] = _mm_sub_epi32(zero, u[4]);
+ out[4] = u[6];
+ out[5] = _mm_sub_epi32(zero, u[14]);
+ out[6] = u[10];
+ out[7] = _mm_sub_epi32(zero, u[2]);
+ out[8] = u[3];
+ out[9] = _mm_sub_epi32(zero, u[11]);
+ out[10] = u[15];
+ out[11] = _mm_sub_epi32(zero, u[7]);
+ out[12] = u[5];
+ out[13] = _mm_sub_epi32(zero, u[13]);
+ out[14] = u[9];
+ out[15] = _mm_sub_epi32(zero, u[1]);
+ } else {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
+ const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
+
+ neg_shift_sse4_1(u[0], u[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
+ out_shift);
+ neg_shift_sse4_1(u[12], u[4], out + 2, out + 3, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(u[6], u[14], out + 4, out + 5, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(u[10], u[2], out + 6, out + 7, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(u[3], u[11], out + 8, out + 9, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(u[15], u[7], out + 10, out + 11, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(u[5], u[13], out + 12, out + 13, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(u[9], u[1], out + 14, out + 15, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
}
}
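Note on stage 9 above: the ADST outputs are written back in a permuted order with alternating sign flips. In the row pass (do_cols == 0) the negation is folded into neg_shift_sse4_1 together with the rounding shift and the output clamp. The helper's body is not part of this hunk, so the one-lane scalar sketch below only illustrates the assumed behaviour (round by (1 << shift) >> 1, arithmetic shift, clamp, with the second operand negated); the name and shape are illustrative, not taken from the diff.

/* Illustrative scalar model of neg_shift_sse4_1 (assumed behaviour, one lane). */
static void neg_shift_scalar(int32_t in0, int32_t in1, int32_t *out0,
                             int32_t *out1, int32_t clamp_lo, int32_t clamp_hi,
                             int shift) {
  const int32_t offset = (1 << shift) >> 1;
  int32_t a0 = (in0 + offset) >> shift;   /* out0 =  in0, rounded and shifted */
  int32_t a1 = (offset - in1) >> shift;   /* out1 = -in1, rounded and shifted */
  if (a0 < clamp_lo) a0 = clamp_lo;
  if (a0 > clamp_hi) a0 = clamp_hi;
  if (a1 < clamp_lo) a1 = clamp_lo;
  if (a1 > clamp_hi) a1 = clamp_hi;
  *out0 = a0;
  *out1 = a1;
}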
@@ -2557,38 +2693,22 @@ static void idct16x16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
v[15] = u[15];
// stage 7
- if (do_cols) {
- addsub_no_clamp_sse4_1(v[0], v[15], out + 0, out + 15);
- addsub_no_clamp_sse4_1(v[1], v[14], out + 1, out + 14);
- addsub_no_clamp_sse4_1(v[2], v[13], out + 2, out + 13);
- addsub_no_clamp_sse4_1(v[3], v[12], out + 3, out + 12);
- addsub_no_clamp_sse4_1(v[4], v[11], out + 4, out + 11);
- addsub_no_clamp_sse4_1(v[5], v[10], out + 5, out + 10);
- addsub_no_clamp_sse4_1(v[6], v[9], out + 6, out + 9);
- addsub_no_clamp_sse4_1(v[7], v[8], out + 7, out + 8);
- } else {
+ addsub_sse4_1(v[0], v[15], out + 0, out + 15, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[1], v[14], out + 1, out + 14, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[2], v[13], out + 2, out + 13, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[3], v[12], out + 3, out + 12, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[4], v[11], out + 4, out + 11, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[5], v[10], out + 5, out + 10, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[6], v[9], out + 6, out + 9, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[7], v[8], out + 7, out + 8, &clamp_lo, &clamp_hi);
+
+ if (!do_cols) {
const int log_range_out = AOMMAX(16, bd + 6);
- const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
- -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
- const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
- (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
-
- addsub_shift_sse4_1(v[0], v[15], out + 0, out + 15, &clamp_lo_out,
- &clamp_hi_out, out_shift);
- addsub_shift_sse4_1(v[1], v[14], out + 1, out + 14, &clamp_lo_out,
- &clamp_hi_out, out_shift);
- addsub_shift_sse4_1(v[2], v[13], out + 2, out + 13, &clamp_lo_out,
- &clamp_hi_out, out_shift);
- addsub_shift_sse4_1(v[3], v[12], out + 3, out + 12, &clamp_lo_out,
- &clamp_hi_out, out_shift);
- addsub_shift_sse4_1(v[4], v[11], out + 4, out + 11, &clamp_lo_out,
- &clamp_hi_out, out_shift);
- addsub_shift_sse4_1(v[5], v[10], out + 5, out + 10, &clamp_lo_out,
- &clamp_hi_out, out_shift);
- addsub_shift_sse4_1(v[6], v[9], out + 6, out + 9, &clamp_lo_out,
- &clamp_hi_out, out_shift);
- addsub_shift_sse4_1(v[7], v[8], out + 7, out + 8, &clamp_lo_out,
- &clamp_hi_out, out_shift);
+ const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
+ const __m128i clamp_hi_out =
+ _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
+ round_shift_8x8(out, out_shift);
+ highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 16);
}
}
}
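Note: the per-stage butterflies in the kernels above and below all share one fixed-point rotation pattern: two products per output, a rounding constant of 1 << (bit - 1), and an arithmetic shift right by bit. The vector code keeps the products in 32-bit lanes via _mm_mullo_epi32; the scalar sketch below widens to 64 bits purely for readability, and its names are illustrative rather than taken from the diff.

/* Scalar model of one cospi rotation pair, e.g. stage 4 above:
 *   out0 = (w0*a + w1*b + rnd) >> bit
 *   out1 = (w1*a - w0*b + rnd) >> bit
 * with rnd = 1 << (bit - 1). */
static void rotate_pair(int32_t a, int32_t b, int32_t w0, int32_t w1, int bit,
                        int32_t *out0, int32_t *out1) {
  const int64_t rnd = 1LL << (bit - 1);
  *out0 = (int32_t)(((int64_t)w0 * a + (int64_t)w1 * b + rnd) >> bit);
  *out1 = (int32_t)(((int64_t)w1 * a - (int64_t)w0 * b + rnd) >> bit);
}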
@@ -2626,353 +2746,381 @@ static void iadst16x16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+ const __m128i zero = _mm_setzero_si128();
__m128i u[16], v[16], x, y;
-
// Calculate the column 0, 1, 2, 3
- {
- // stage 0
- // stage 1
- // stage 2
- v[0] = _mm_mullo_epi32(in[15], cospi2);
- x = _mm_mullo_epi32(in[0], cospi62);
- v[0] = _mm_add_epi32(v[0], x);
- v[0] = _mm_add_epi32(v[0], rnding);
- v[0] = _mm_srai_epi32(v[0], bit);
-
- v[1] = _mm_mullo_epi32(in[15], cospi62);
- x = _mm_mullo_epi32(in[0], cospi2);
- v[1] = _mm_sub_epi32(v[1], x);
- v[1] = _mm_add_epi32(v[1], rnding);
- v[1] = _mm_srai_epi32(v[1], bit);
-
- v[2] = _mm_mullo_epi32(in[13], cospi10);
- x = _mm_mullo_epi32(in[2], cospi54);
- v[2] = _mm_add_epi32(v[2], x);
- v[2] = _mm_add_epi32(v[2], rnding);
- v[2] = _mm_srai_epi32(v[2], bit);
-
- v[3] = _mm_mullo_epi32(in[13], cospi54);
- x = _mm_mullo_epi32(in[2], cospi10);
- v[3] = _mm_sub_epi32(v[3], x);
- v[3] = _mm_add_epi32(v[3], rnding);
- v[3] = _mm_srai_epi32(v[3], bit);
-
- v[4] = _mm_mullo_epi32(in[11], cospi18);
- x = _mm_mullo_epi32(in[4], cospi46);
- v[4] = _mm_add_epi32(v[4], x);
- v[4] = _mm_add_epi32(v[4], rnding);
- v[4] = _mm_srai_epi32(v[4], bit);
-
- v[5] = _mm_mullo_epi32(in[11], cospi46);
- x = _mm_mullo_epi32(in[4], cospi18);
- v[5] = _mm_sub_epi32(v[5], x);
- v[5] = _mm_add_epi32(v[5], rnding);
- v[5] = _mm_srai_epi32(v[5], bit);
-
- v[6] = _mm_mullo_epi32(in[9], cospi26);
- x = _mm_mullo_epi32(in[6], cospi38);
- v[6] = _mm_add_epi32(v[6], x);
- v[6] = _mm_add_epi32(v[6], rnding);
- v[6] = _mm_srai_epi32(v[6], bit);
-
- v[7] = _mm_mullo_epi32(in[9], cospi38);
- x = _mm_mullo_epi32(in[6], cospi26);
- v[7] = _mm_sub_epi32(v[7], x);
- v[7] = _mm_add_epi32(v[7], rnding);
- v[7] = _mm_srai_epi32(v[7], bit);
-
- v[8] = _mm_mullo_epi32(in[7], cospi34);
- x = _mm_mullo_epi32(in[8], cospi30);
- v[8] = _mm_add_epi32(v[8], x);
- v[8] = _mm_add_epi32(v[8], rnding);
- v[8] = _mm_srai_epi32(v[8], bit);
-
- v[9] = _mm_mullo_epi32(in[7], cospi30);
- x = _mm_mullo_epi32(in[8], cospi34);
- v[9] = _mm_sub_epi32(v[9], x);
- v[9] = _mm_add_epi32(v[9], rnding);
- v[9] = _mm_srai_epi32(v[9], bit);
-
- v[10] = _mm_mullo_epi32(in[5], cospi42);
- x = _mm_mullo_epi32(in[10], cospi22);
- v[10] = _mm_add_epi32(v[10], x);
- v[10] = _mm_add_epi32(v[10], rnding);
- v[10] = _mm_srai_epi32(v[10], bit);
-
- v[11] = _mm_mullo_epi32(in[5], cospi22);
- x = _mm_mullo_epi32(in[10], cospi42);
- v[11] = _mm_sub_epi32(v[11], x);
- v[11] = _mm_add_epi32(v[11], rnding);
- v[11] = _mm_srai_epi32(v[11], bit);
-
- v[12] = _mm_mullo_epi32(in[3], cospi50);
- x = _mm_mullo_epi32(in[12], cospi14);
- v[12] = _mm_add_epi32(v[12], x);
- v[12] = _mm_add_epi32(v[12], rnding);
- v[12] = _mm_srai_epi32(v[12], bit);
-
- v[13] = _mm_mullo_epi32(in[3], cospi14);
- x = _mm_mullo_epi32(in[12], cospi50);
- v[13] = _mm_sub_epi32(v[13], x);
- v[13] = _mm_add_epi32(v[13], rnding);
- v[13] = _mm_srai_epi32(v[13], bit);
-
- v[14] = _mm_mullo_epi32(in[1], cospi58);
- x = _mm_mullo_epi32(in[14], cospi6);
- v[14] = _mm_add_epi32(v[14], x);
- v[14] = _mm_add_epi32(v[14], rnding);
- v[14] = _mm_srai_epi32(v[14], bit);
-
- v[15] = _mm_mullo_epi32(in[1], cospi6);
- x = _mm_mullo_epi32(in[14], cospi58);
- v[15] = _mm_sub_epi32(v[15], x);
- v[15] = _mm_add_epi32(v[15], rnding);
- v[15] = _mm_srai_epi32(v[15], bit);
-
- // stage 3
- addsub_sse4_1(v[0], v[8], &u[0], &u[8], &clamp_lo, &clamp_hi);
- addsub_sse4_1(v[1], v[9], &u[1], &u[9], &clamp_lo, &clamp_hi);
- addsub_sse4_1(v[2], v[10], &u[2], &u[10], &clamp_lo, &clamp_hi);
- addsub_sse4_1(v[3], v[11], &u[3], &u[11], &clamp_lo, &clamp_hi);
- addsub_sse4_1(v[4], v[12], &u[4], &u[12], &clamp_lo, &clamp_hi);
- addsub_sse4_1(v[5], v[13], &u[5], &u[13], &clamp_lo, &clamp_hi);
- addsub_sse4_1(v[6], v[14], &u[6], &u[14], &clamp_lo, &clamp_hi);
- addsub_sse4_1(v[7], v[15], &u[7], &u[15], &clamp_lo, &clamp_hi);
-
- // stage 4
- v[0] = u[0];
- v[1] = u[1];
- v[2] = u[2];
- v[3] = u[3];
- v[4] = u[4];
- v[5] = u[5];
- v[6] = u[6];
- v[7] = u[7];
-
- v[8] = _mm_mullo_epi32(u[8], cospi8);
- x = _mm_mullo_epi32(u[9], cospi56);
- v[8] = _mm_add_epi32(v[8], x);
- v[8] = _mm_add_epi32(v[8], rnding);
- v[8] = _mm_srai_epi32(v[8], bit);
-
- v[9] = _mm_mullo_epi32(u[8], cospi56);
- x = _mm_mullo_epi32(u[9], cospi8);
- v[9] = _mm_sub_epi32(v[9], x);
- v[9] = _mm_add_epi32(v[9], rnding);
- v[9] = _mm_srai_epi32(v[9], bit);
-
- v[10] = _mm_mullo_epi32(u[10], cospi40);
- x = _mm_mullo_epi32(u[11], cospi24);
- v[10] = _mm_add_epi32(v[10], x);
- v[10] = _mm_add_epi32(v[10], rnding);
- v[10] = _mm_srai_epi32(v[10], bit);
-
- v[11] = _mm_mullo_epi32(u[10], cospi24);
- x = _mm_mullo_epi32(u[11], cospi40);
- v[11] = _mm_sub_epi32(v[11], x);
- v[11] = _mm_add_epi32(v[11], rnding);
- v[11] = _mm_srai_epi32(v[11], bit);
-
- v[12] = _mm_mullo_epi32(u[12], cospim56);
- x = _mm_mullo_epi32(u[13], cospi8);
- v[12] = _mm_add_epi32(v[12], x);
- v[12] = _mm_add_epi32(v[12], rnding);
- v[12] = _mm_srai_epi32(v[12], bit);
-
- v[13] = _mm_mullo_epi32(u[12], cospi8);
- x = _mm_mullo_epi32(u[13], cospim56);
- v[13] = _mm_sub_epi32(v[13], x);
- v[13] = _mm_add_epi32(v[13], rnding);
- v[13] = _mm_srai_epi32(v[13], bit);
-
- v[14] = _mm_mullo_epi32(u[14], cospim24);
- x = _mm_mullo_epi32(u[15], cospi40);
- v[14] = _mm_add_epi32(v[14], x);
- v[14] = _mm_add_epi32(v[14], rnding);
- v[14] = _mm_srai_epi32(v[14], bit);
-
- v[15] = _mm_mullo_epi32(u[14], cospi40);
- x = _mm_mullo_epi32(u[15], cospim24);
- v[15] = _mm_sub_epi32(v[15], x);
- v[15] = _mm_add_epi32(v[15], rnding);
- v[15] = _mm_srai_epi32(v[15], bit);
-
- // stage 5
- addsub_sse4_1(v[0], v[4], &u[0], &u[4], &clamp_lo, &clamp_hi);
- addsub_sse4_1(v[1], v[5], &u[1], &u[5], &clamp_lo, &clamp_hi);
- addsub_sse4_1(v[2], v[6], &u[2], &u[6], &clamp_lo, &clamp_hi);
- addsub_sse4_1(v[3], v[7], &u[3], &u[7], &clamp_lo, &clamp_hi);
- addsub_sse4_1(v[8], v[12], &u[8], &u[12], &clamp_lo, &clamp_hi);
- addsub_sse4_1(v[9], v[13], &u[9], &u[13], &clamp_lo, &clamp_hi);
- addsub_sse4_1(v[10], v[14], &u[10], &u[14], &clamp_lo, &clamp_hi);
- addsub_sse4_1(v[11], v[15], &u[11], &u[15], &clamp_lo, &clamp_hi);
-
- // stage 6
- v[0] = u[0];
- v[1] = u[1];
- v[2] = u[2];
- v[3] = u[3];
-
- v[4] = _mm_mullo_epi32(u[4], cospi16);
- x = _mm_mullo_epi32(u[5], cospi48);
- v[4] = _mm_add_epi32(v[4], x);
- v[4] = _mm_add_epi32(v[4], rnding);
- v[4] = _mm_srai_epi32(v[4], bit);
-
- v[5] = _mm_mullo_epi32(u[4], cospi48);
- x = _mm_mullo_epi32(u[5], cospi16);
- v[5] = _mm_sub_epi32(v[5], x);
- v[5] = _mm_add_epi32(v[5], rnding);
- v[5] = _mm_srai_epi32(v[5], bit);
-
- v[6] = _mm_mullo_epi32(u[6], cospim48);
- x = _mm_mullo_epi32(u[7], cospi16);
- v[6] = _mm_add_epi32(v[6], x);
- v[6] = _mm_add_epi32(v[6], rnding);
- v[6] = _mm_srai_epi32(v[6], bit);
-
- v[7] = _mm_mullo_epi32(u[6], cospi16);
- x = _mm_mullo_epi32(u[7], cospim48);
- v[7] = _mm_sub_epi32(v[7], x);
- v[7] = _mm_add_epi32(v[7], rnding);
- v[7] = _mm_srai_epi32(v[7], bit);
-
- v[8] = u[8];
- v[9] = u[9];
- v[10] = u[10];
- v[11] = u[11];
-
- v[12] = _mm_mullo_epi32(u[12], cospi16);
- x = _mm_mullo_epi32(u[13], cospi48);
- v[12] = _mm_add_epi32(v[12], x);
- v[12] = _mm_add_epi32(v[12], rnding);
- v[12] = _mm_srai_epi32(v[12], bit);
-
- v[13] = _mm_mullo_epi32(u[12], cospi48);
- x = _mm_mullo_epi32(u[13], cospi16);
- v[13] = _mm_sub_epi32(v[13], x);
- v[13] = _mm_add_epi32(v[13], rnding);
- v[13] = _mm_srai_epi32(v[13], bit);
-
- v[14] = _mm_mullo_epi32(u[14], cospim48);
- x = _mm_mullo_epi32(u[15], cospi16);
- v[14] = _mm_add_epi32(v[14], x);
- v[14] = _mm_add_epi32(v[14], rnding);
- v[14] = _mm_srai_epi32(v[14], bit);
-
- v[15] = _mm_mullo_epi32(u[14], cospi16);
- x = _mm_mullo_epi32(u[15], cospim48);
- v[15] = _mm_sub_epi32(v[15], x);
- v[15] = _mm_add_epi32(v[15], rnding);
- v[15] = _mm_srai_epi32(v[15], bit);
-
- // stage 7
- addsub_sse4_1(v[0], v[2], &u[0], &u[2], &clamp_lo, &clamp_hi);
- addsub_sse4_1(v[1], v[3], &u[1], &u[3], &clamp_lo, &clamp_hi);
- addsub_sse4_1(v[4], v[6], &u[4], &u[6], &clamp_lo, &clamp_hi);
- addsub_sse4_1(v[5], v[7], &u[5], &u[7], &clamp_lo, &clamp_hi);
- addsub_sse4_1(v[8], v[10], &u[8], &u[10], &clamp_lo, &clamp_hi);
- addsub_sse4_1(v[9], v[11], &u[9], &u[11], &clamp_lo, &clamp_hi);
- addsub_sse4_1(v[12], v[14], &u[12], &u[14], &clamp_lo, &clamp_hi);
- addsub_sse4_1(v[13], v[15], &u[13], &u[15], &clamp_lo, &clamp_hi);
-
- // stage 8
- v[0] = u[0];
- v[1] = u[1];
-
- y = _mm_mullo_epi32(u[2], cospi32);
- x = _mm_mullo_epi32(u[3], cospi32);
- v[2] = _mm_add_epi32(y, x);
- v[2] = _mm_add_epi32(v[2], rnding);
- v[2] = _mm_srai_epi32(v[2], bit);
-
- v[3] = _mm_sub_epi32(y, x);
- v[3] = _mm_add_epi32(v[3], rnding);
- v[3] = _mm_srai_epi32(v[3], bit);
-
- v[4] = u[4];
- v[5] = u[5];
+ // stage 0
+ // stage 1
+ // stage 2
+ v[0] = _mm_mullo_epi32(in[15], cospi2);
+ x = _mm_mullo_epi32(in[0], cospi62);
+ v[0] = _mm_add_epi32(v[0], x);
+ v[0] = _mm_add_epi32(v[0], rnding);
+ v[0] = _mm_srai_epi32(v[0], bit);
+
+ v[1] = _mm_mullo_epi32(in[15], cospi62);
+ x = _mm_mullo_epi32(in[0], cospi2);
+ v[1] = _mm_sub_epi32(v[1], x);
+ v[1] = _mm_add_epi32(v[1], rnding);
+ v[1] = _mm_srai_epi32(v[1], bit);
+
+ v[2] = _mm_mullo_epi32(in[13], cospi10);
+ x = _mm_mullo_epi32(in[2], cospi54);
+ v[2] = _mm_add_epi32(v[2], x);
+ v[2] = _mm_add_epi32(v[2], rnding);
+ v[2] = _mm_srai_epi32(v[2], bit);
+
+ v[3] = _mm_mullo_epi32(in[13], cospi54);
+ x = _mm_mullo_epi32(in[2], cospi10);
+ v[3] = _mm_sub_epi32(v[3], x);
+ v[3] = _mm_add_epi32(v[3], rnding);
+ v[3] = _mm_srai_epi32(v[3], bit);
+
+ v[4] = _mm_mullo_epi32(in[11], cospi18);
+ x = _mm_mullo_epi32(in[4], cospi46);
+ v[4] = _mm_add_epi32(v[4], x);
+ v[4] = _mm_add_epi32(v[4], rnding);
+ v[4] = _mm_srai_epi32(v[4], bit);
+
+ v[5] = _mm_mullo_epi32(in[11], cospi46);
+ x = _mm_mullo_epi32(in[4], cospi18);
+ v[5] = _mm_sub_epi32(v[5], x);
+ v[5] = _mm_add_epi32(v[5], rnding);
+ v[5] = _mm_srai_epi32(v[5], bit);
+
+ v[6] = _mm_mullo_epi32(in[9], cospi26);
+ x = _mm_mullo_epi32(in[6], cospi38);
+ v[6] = _mm_add_epi32(v[6], x);
+ v[6] = _mm_add_epi32(v[6], rnding);
+ v[6] = _mm_srai_epi32(v[6], bit);
+
+ v[7] = _mm_mullo_epi32(in[9], cospi38);
+ x = _mm_mullo_epi32(in[6], cospi26);
+ v[7] = _mm_sub_epi32(v[7], x);
+ v[7] = _mm_add_epi32(v[7], rnding);
+ v[7] = _mm_srai_epi32(v[7], bit);
+
+ v[8] = _mm_mullo_epi32(in[7], cospi34);
+ x = _mm_mullo_epi32(in[8], cospi30);
+ v[8] = _mm_add_epi32(v[8], x);
+ v[8] = _mm_add_epi32(v[8], rnding);
+ v[8] = _mm_srai_epi32(v[8], bit);
+
+ v[9] = _mm_mullo_epi32(in[7], cospi30);
+ x = _mm_mullo_epi32(in[8], cospi34);
+ v[9] = _mm_sub_epi32(v[9], x);
+ v[9] = _mm_add_epi32(v[9], rnding);
+ v[9] = _mm_srai_epi32(v[9], bit);
+
+ v[10] = _mm_mullo_epi32(in[5], cospi42);
+ x = _mm_mullo_epi32(in[10], cospi22);
+ v[10] = _mm_add_epi32(v[10], x);
+ v[10] = _mm_add_epi32(v[10], rnding);
+ v[10] = _mm_srai_epi32(v[10], bit);
+
+ v[11] = _mm_mullo_epi32(in[5], cospi22);
+ x = _mm_mullo_epi32(in[10], cospi42);
+ v[11] = _mm_sub_epi32(v[11], x);
+ v[11] = _mm_add_epi32(v[11], rnding);
+ v[11] = _mm_srai_epi32(v[11], bit);
+
+ v[12] = _mm_mullo_epi32(in[3], cospi50);
+ x = _mm_mullo_epi32(in[12], cospi14);
+ v[12] = _mm_add_epi32(v[12], x);
+ v[12] = _mm_add_epi32(v[12], rnding);
+ v[12] = _mm_srai_epi32(v[12], bit);
+
+ v[13] = _mm_mullo_epi32(in[3], cospi14);
+ x = _mm_mullo_epi32(in[12], cospi50);
+ v[13] = _mm_sub_epi32(v[13], x);
+ v[13] = _mm_add_epi32(v[13], rnding);
+ v[13] = _mm_srai_epi32(v[13], bit);
+
+ v[14] = _mm_mullo_epi32(in[1], cospi58);
+ x = _mm_mullo_epi32(in[14], cospi6);
+ v[14] = _mm_add_epi32(v[14], x);
+ v[14] = _mm_add_epi32(v[14], rnding);
+ v[14] = _mm_srai_epi32(v[14], bit);
+
+ v[15] = _mm_mullo_epi32(in[1], cospi6);
+ x = _mm_mullo_epi32(in[14], cospi58);
+ v[15] = _mm_sub_epi32(v[15], x);
+ v[15] = _mm_add_epi32(v[15], rnding);
+ v[15] = _mm_srai_epi32(v[15], bit);
- y = _mm_mullo_epi32(u[6], cospi32);
- x = _mm_mullo_epi32(u[7], cospi32);
- v[6] = _mm_add_epi32(y, x);
- v[6] = _mm_add_epi32(v[6], rnding);
- v[6] = _mm_srai_epi32(v[6], bit);
+ // stage 3
+ addsub_sse4_1(v[0], v[8], &u[0], &u[8], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[1], v[9], &u[1], &u[9], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[2], v[10], &u[2], &u[10], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[3], v[11], &u[3], &u[11], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[4], v[12], &u[4], &u[12], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[5], v[13], &u[5], &u[13], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[6], v[14], &u[6], &u[14], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[7], v[15], &u[7], &u[15], &clamp_lo, &clamp_hi);
- v[7] = _mm_sub_epi32(y, x);
- v[7] = _mm_add_epi32(v[7], rnding);
- v[7] = _mm_srai_epi32(v[7], bit);
+ // stage 4
+ v[0] = u[0];
+ v[1] = u[1];
+ v[2] = u[2];
+ v[3] = u[3];
+ v[4] = u[4];
+ v[5] = u[5];
+ v[6] = u[6];
+ v[7] = u[7];
+
+ v[8] = _mm_mullo_epi32(u[8], cospi8);
+ x = _mm_mullo_epi32(u[9], cospi56);
+ v[8] = _mm_add_epi32(v[8], x);
+ v[8] = _mm_add_epi32(v[8], rnding);
+ v[8] = _mm_srai_epi32(v[8], bit);
+
+ v[9] = _mm_mullo_epi32(u[8], cospi56);
+ x = _mm_mullo_epi32(u[9], cospi8);
+ v[9] = _mm_sub_epi32(v[9], x);
+ v[9] = _mm_add_epi32(v[9], rnding);
+ v[9] = _mm_srai_epi32(v[9], bit);
+
+ v[10] = _mm_mullo_epi32(u[10], cospi40);
+ x = _mm_mullo_epi32(u[11], cospi24);
+ v[10] = _mm_add_epi32(v[10], x);
+ v[10] = _mm_add_epi32(v[10], rnding);
+ v[10] = _mm_srai_epi32(v[10], bit);
+
+ v[11] = _mm_mullo_epi32(u[10], cospi24);
+ x = _mm_mullo_epi32(u[11], cospi40);
+ v[11] = _mm_sub_epi32(v[11], x);
+ v[11] = _mm_add_epi32(v[11], rnding);
+ v[11] = _mm_srai_epi32(v[11], bit);
+
+ v[12] = _mm_mullo_epi32(u[12], cospim56);
+ x = _mm_mullo_epi32(u[13], cospi8);
+ v[12] = _mm_add_epi32(v[12], x);
+ v[12] = _mm_add_epi32(v[12], rnding);
+ v[12] = _mm_srai_epi32(v[12], bit);
+
+ v[13] = _mm_mullo_epi32(u[12], cospi8);
+ x = _mm_mullo_epi32(u[13], cospim56);
+ v[13] = _mm_sub_epi32(v[13], x);
+ v[13] = _mm_add_epi32(v[13], rnding);
+ v[13] = _mm_srai_epi32(v[13], bit);
+
+ v[14] = _mm_mullo_epi32(u[14], cospim24);
+ x = _mm_mullo_epi32(u[15], cospi40);
+ v[14] = _mm_add_epi32(v[14], x);
+ v[14] = _mm_add_epi32(v[14], rnding);
+ v[14] = _mm_srai_epi32(v[14], bit);
+
+ v[15] = _mm_mullo_epi32(u[14], cospi40);
+ x = _mm_mullo_epi32(u[15], cospim24);
+ v[15] = _mm_sub_epi32(v[15], x);
+ v[15] = _mm_add_epi32(v[15], rnding);
+ v[15] = _mm_srai_epi32(v[15], bit);
- v[8] = u[8];
- v[9] = u[9];
+ // stage 5
+ addsub_sse4_1(v[0], v[4], &u[0], &u[4], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[1], v[5], &u[1], &u[5], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[2], v[6], &u[2], &u[6], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[3], v[7], &u[3], &u[7], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[8], v[12], &u[8], &u[12], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[9], v[13], &u[9], &u[13], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[10], v[14], &u[10], &u[14], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[11], v[15], &u[11], &u[15], &clamp_lo, &clamp_hi);
- y = _mm_mullo_epi32(u[10], cospi32);
- x = _mm_mullo_epi32(u[11], cospi32);
- v[10] = _mm_add_epi32(y, x);
- v[10] = _mm_add_epi32(v[10], rnding);
- v[10] = _mm_srai_epi32(v[10], bit);
+ // stage 6
+ v[0] = u[0];
+ v[1] = u[1];
+ v[2] = u[2];
+ v[3] = u[3];
+
+ v[4] = _mm_mullo_epi32(u[4], cospi16);
+ x = _mm_mullo_epi32(u[5], cospi48);
+ v[4] = _mm_add_epi32(v[4], x);
+ v[4] = _mm_add_epi32(v[4], rnding);
+ v[4] = _mm_srai_epi32(v[4], bit);
+
+ v[5] = _mm_mullo_epi32(u[4], cospi48);
+ x = _mm_mullo_epi32(u[5], cospi16);
+ v[5] = _mm_sub_epi32(v[5], x);
+ v[5] = _mm_add_epi32(v[5], rnding);
+ v[5] = _mm_srai_epi32(v[5], bit);
+
+ v[6] = _mm_mullo_epi32(u[6], cospim48);
+ x = _mm_mullo_epi32(u[7], cospi16);
+ v[6] = _mm_add_epi32(v[6], x);
+ v[6] = _mm_add_epi32(v[6], rnding);
+ v[6] = _mm_srai_epi32(v[6], bit);
+
+ v[7] = _mm_mullo_epi32(u[6], cospi16);
+ x = _mm_mullo_epi32(u[7], cospim48);
+ v[7] = _mm_sub_epi32(v[7], x);
+ v[7] = _mm_add_epi32(v[7], rnding);
+ v[7] = _mm_srai_epi32(v[7], bit);
+
+ v[8] = u[8];
+ v[9] = u[9];
+ v[10] = u[10];
+ v[11] = u[11];
+
+ v[12] = _mm_mullo_epi32(u[12], cospi16);
+ x = _mm_mullo_epi32(u[13], cospi48);
+ v[12] = _mm_add_epi32(v[12], x);
+ v[12] = _mm_add_epi32(v[12], rnding);
+ v[12] = _mm_srai_epi32(v[12], bit);
+
+ v[13] = _mm_mullo_epi32(u[12], cospi48);
+ x = _mm_mullo_epi32(u[13], cospi16);
+ v[13] = _mm_sub_epi32(v[13], x);
+ v[13] = _mm_add_epi32(v[13], rnding);
+ v[13] = _mm_srai_epi32(v[13], bit);
+
+ v[14] = _mm_mullo_epi32(u[14], cospim48);
+ x = _mm_mullo_epi32(u[15], cospi16);
+ v[14] = _mm_add_epi32(v[14], x);
+ v[14] = _mm_add_epi32(v[14], rnding);
+ v[14] = _mm_srai_epi32(v[14], bit);
+
+ v[15] = _mm_mullo_epi32(u[14], cospi16);
+ x = _mm_mullo_epi32(u[15], cospim48);
+ v[15] = _mm_sub_epi32(v[15], x);
+ v[15] = _mm_add_epi32(v[15], rnding);
+ v[15] = _mm_srai_epi32(v[15], bit);
- v[11] = _mm_sub_epi32(y, x);
- v[11] = _mm_add_epi32(v[11], rnding);
- v[11] = _mm_srai_epi32(v[11], bit);
+ // stage 7
+ addsub_sse4_1(v[0], v[2], &u[0], &u[2], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[1], v[3], &u[1], &u[3], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[4], v[6], &u[4], &u[6], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[5], v[7], &u[5], &u[7], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[8], v[10], &u[8], &u[10], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[9], v[11], &u[9], &u[11], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[12], v[14], &u[12], &u[14], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[13], v[15], &u[13], &u[15], &clamp_lo, &clamp_hi);
- v[12] = u[12];
- v[13] = u[13];
+ // stage 8
+ v[0] = u[0];
+ v[1] = u[1];
+
+ y = _mm_mullo_epi32(u[2], cospi32);
+ x = _mm_mullo_epi32(u[3], cospi32);
+ v[2] = _mm_add_epi32(y, x);
+ v[2] = _mm_add_epi32(v[2], rnding);
+ v[2] = _mm_srai_epi32(v[2], bit);
+
+ v[3] = _mm_sub_epi32(y, x);
+ v[3] = _mm_add_epi32(v[3], rnding);
+ v[3] = _mm_srai_epi32(v[3], bit);
+
+ v[4] = u[4];
+ v[5] = u[5];
+
+ y = _mm_mullo_epi32(u[6], cospi32);
+ x = _mm_mullo_epi32(u[7], cospi32);
+ v[6] = _mm_add_epi32(y, x);
+ v[6] = _mm_add_epi32(v[6], rnding);
+ v[6] = _mm_srai_epi32(v[6], bit);
+
+ v[7] = _mm_sub_epi32(y, x);
+ v[7] = _mm_add_epi32(v[7], rnding);
+ v[7] = _mm_srai_epi32(v[7], bit);
+
+ v[8] = u[8];
+ v[9] = u[9];
+
+ y = _mm_mullo_epi32(u[10], cospi32);
+ x = _mm_mullo_epi32(u[11], cospi32);
+ v[10] = _mm_add_epi32(y, x);
+ v[10] = _mm_add_epi32(v[10], rnding);
+ v[10] = _mm_srai_epi32(v[10], bit);
+
+ v[11] = _mm_sub_epi32(y, x);
+ v[11] = _mm_add_epi32(v[11], rnding);
+ v[11] = _mm_srai_epi32(v[11], bit);
+
+ v[12] = u[12];
+ v[13] = u[13];
+
+ y = _mm_mullo_epi32(u[14], cospi32);
+ x = _mm_mullo_epi32(u[15], cospi32);
+ v[14] = _mm_add_epi32(y, x);
+ v[14] = _mm_add_epi32(v[14], rnding);
+ v[14] = _mm_srai_epi32(v[14], bit);
+
+ v[15] = _mm_sub_epi32(y, x);
+ v[15] = _mm_add_epi32(v[15], rnding);
+ v[15] = _mm_srai_epi32(v[15], bit);
- y = _mm_mullo_epi32(u[14], cospi32);
- x = _mm_mullo_epi32(u[15], cospi32);
- v[14] = _mm_add_epi32(y, x);
- v[14] = _mm_add_epi32(v[14], rnding);
- v[14] = _mm_srai_epi32(v[14], bit);
+ // stage 9
+ if (do_cols) {
+ out[0] = v[0];
+ out[1] = _mm_sub_epi32(zero, v[8]);
+ out[2] = v[12];
+ out[3] = _mm_sub_epi32(zero, v[4]);
+ out[4] = v[6];
+ out[5] = _mm_sub_epi32(zero, v[14]);
+ out[6] = v[10];
+ out[7] = _mm_sub_epi32(zero, v[2]);
+ out[8] = v[3];
+ out[9] = _mm_sub_epi32(zero, v[11]);
+ out[10] = v[15];
+ out[11] = _mm_sub_epi32(zero, v[7]);
+ out[12] = v[5];
+ out[13] = _mm_sub_epi32(zero, v[13]);
+ out[14] = v[9];
+ out[15] = _mm_sub_epi32(zero, v[1]);
+ } else {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
+ const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
- v[15] = _mm_sub_epi32(y, x);
- v[15] = _mm_add_epi32(v[15], rnding);
- v[15] = _mm_srai_epi32(v[15], bit);
+ neg_shift_sse4_1(v[0], v[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
+ out_shift);
+ neg_shift_sse4_1(v[12], v[4], out + 2, out + 3, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(v[6], v[14], out + 4, out + 5, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(v[10], v[2], out + 6, out + 7, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(v[3], v[11], out + 8, out + 9, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(v[15], v[7], out + 10, out + 11, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(v[5], v[13], out + 12, out + 13, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(v[9], v[1], out + 14, out + 15, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ }
+}
+static void iidentity16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
+ int bd, int out_shift) {
+ (void)bit;
+ __m128i fact = _mm_set1_epi32(2 * NewSqrt2);
+ __m128i offset = _mm_set1_epi32(1 << (NewSqrt2Bits - 1));
+ __m128i a0_low, a0_high, a1_low, a1_high;
+ __m128i zero = _mm_set1_epi32(0);
+ offset = _mm_unpacklo_epi32(offset, zero);
- // stage 9
- if (do_cols) {
- out[0] = v[0];
- out[1] = _mm_sub_epi32(_mm_setzero_si128(), v[8]);
- out[2] = v[12];
- out[3] = _mm_sub_epi32(_mm_setzero_si128(), v[4]);
- out[4] = v[6];
- out[5] = _mm_sub_epi32(_mm_setzero_si128(), v[14]);
- out[6] = v[10];
- out[7] = _mm_sub_epi32(_mm_setzero_si128(), v[2]);
- out[8] = v[3];
- out[9] = _mm_sub_epi32(_mm_setzero_si128(), v[11]);
- out[10] = v[15];
- out[11] = _mm_sub_epi32(_mm_setzero_si128(), v[7]);
- out[12] = v[5];
- out[13] = _mm_sub_epi32(_mm_setzero_si128(), v[13]);
- out[14] = v[9];
- out[15] = _mm_sub_epi32(_mm_setzero_si128(), v[1]);
- } else {
- const int log_range_out = AOMMAX(16, bd + 6);
- const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
- const __m128i clamp_hi_out =
- _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
+ for (int i = 0; i < 16; i++) {
+ a0_low = _mm_mul_epi32(in[i], fact);
+ a0_low = _mm_add_epi32(a0_low, offset);
+ a0_low = _mm_srli_epi64(a0_low, NewSqrt2Bits);
+
+ a0_high = _mm_srli_si128(in[i], 4);
+ a0_high = _mm_mul_epi32(a0_high, fact);
+ a0_high = _mm_add_epi32(a0_high, offset);
+ a0_high = _mm_srli_epi64(a0_high, NewSqrt2Bits);
+
+ a1_low = _mm_unpacklo_epi32(a0_low, a0_high);
+ a1_high = _mm_unpackhi_epi32(a0_low, a0_high);
+ out[i] = _mm_unpacklo_epi64(a1_low, a1_high);
+ }
- neg_shift_sse4_1(v[0], v[8], out + 0, out + 1, &clamp_lo_out,
- &clamp_hi_out, out_shift);
- neg_shift_sse4_1(v[12], v[4], out + 2, out + 3, &clamp_lo_out,
- &clamp_hi_out, out_shift);
- neg_shift_sse4_1(v[6], v[14], out + 4, out + 5, &clamp_lo_out,
- &clamp_hi_out, out_shift);
- neg_shift_sse4_1(v[10], v[2], out + 6, out + 7, &clamp_lo_out,
- &clamp_hi_out, out_shift);
- neg_shift_sse4_1(v[3], v[11], out + 8, out + 9, &clamp_lo_out,
- &clamp_hi_out, out_shift);
- neg_shift_sse4_1(v[15], v[7], out + 10, out + 11, &clamp_lo_out,
- &clamp_hi_out, out_shift);
- neg_shift_sse4_1(v[5], v[13], out + 12, out + 13, &clamp_lo_out,
- &clamp_hi_out, out_shift);
- neg_shift_sse4_1(v[9], v[1], out + 14, out + 15, &clamp_lo_out,
- &clamp_hi_out, out_shift);
- }
+ if (!do_cols) {
+ const int log_range = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+ round_shift_8x8(out, out_shift);
+ highbd_clamp_epi32_sse4_1(out, out, &clamp_lo, &clamp_hi, 16);
}
}
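Note on iidentity16_sse4_1 above: the 16-point identity transform is a pure per-coefficient scale by 2*NewSqrt2 in fixed point, which is why the loop widens to 64-bit products with _mm_mul_epi32 and re-interleaves the even and odd lanes afterwards. A one-lane scalar sketch follows; the concrete constants (5793 in Q12, i.e. NewSqrt2Bits == 12) are quoted from the imported av1_txfm.h and should be read as an assumption here, not as part of this hunk.

/* Scalar model of one iidentity16 lane:
 * out = (in * 2*NewSqrt2 + (1 << (NewSqrt2Bits - 1))) >> NewSqrt2Bits. */
static int32_t iidentity16_one(int32_t in) {
  const int64_t fact = 2 * 5793;            /* 2 * NewSqrt2 (assumed value)   */
  const int64_t offset = 1LL << (12 - 1);   /* 1 << (NewSqrt2Bits - 1)        */
  return (int32_t)(((int64_t)in * fact + offset) >> 12);
}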
-
static INLINE void idct64_stage8_sse4_1(
__m128i *u, const __m128i *cospim32, const __m128i *cospi32,
const __m128i *cospim16, const __m128i *cospi48, const __m128i *cospi16,
@@ -3091,21 +3239,21 @@ static INLINE void idct64_stage10_sse4_1(__m128i *u, const __m128i *cospim32,
static INLINE void idct64_stage11_sse4_1(__m128i *u, __m128i *out, int do_cols,
int bd, int out_shift,
- const int log_range) {
- if (do_cols) {
- for (int i = 0; i < 32; i++) {
- addsub_no_clamp_sse4_1(u[i], u[63 - i], &out[(i)], &out[(63 - i)]);
- }
- } else {
+ const __m128i *clamp_lo,
+ const __m128i *clamp_hi) {
+ for (int i = 0; i < 32; i++) {
+ addsub_sse4_1(u[i], u[63 - i], out + i, out + 63 - i, clamp_lo, clamp_hi);
+ }
+
+ if (!do_cols) {
const int log_range_out = AOMMAX(16, bd + 6);
- const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
- -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
- const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
- (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
-
- for (int i = 0; i < 32; i++) {
- addsub_shift_sse4_1(u[i], u[63 - i], &out[(i)], &out[(63 - i)],
- &clamp_lo_out, &clamp_hi_out, out_shift);
+ const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
+ const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
+
+ for (int i = 0; i < 64; i += 4) {
+ round_shift_4x4(out + i, out_shift);
+ highbd_clamp_epi32_sse4_1(out + i, out + i, &clamp_lo_out, &clamp_hi_out,
+ 4);
}
}
}
@@ -3115,8 +3263,8 @@ static void idct64x64_low1_sse4_1(__m128i *in, __m128i *out, int bit,
const int32_t *cospi = cospi_arr(bit);
const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
- const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
- const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+ __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
@@ -3135,88 +3283,82 @@ static void idct64x64_low1_sse4_1(__m128i *in, __m128i *out, int bit,
// stage 9
// stage 10
// stage 11
- if (do_cols) {
- x = _mm_max_epi32(x, clamp_lo);
- x = _mm_min_epi32(x, clamp_hi);
- } else {
+ if (!do_cols) {
const int log_range_out = AOMMAX(16, bd + 6);
- const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
- -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
- const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
- (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
-
- __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1);
- x = _mm_add_epi32(x, offset);
- x = _mm_sra_epi32(x, _mm_cvtsi32_si128(out_shift));
-
- x = _mm_max_epi32(x, clamp_lo_out);
- x = _mm_min_epi32(x, clamp_hi_out);
+ clamp_lo = _mm_set1_epi32(-(1 << (log_range_out - 1)));
+ clamp_hi = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
+ if (out_shift != 0) {
+ __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1);
+ x = _mm_add_epi32(x, offset);
+ x = _mm_sra_epi32(x, _mm_cvtsi32_si128(out_shift));
+ }
}
-
+ x = _mm_max_epi32(x, clamp_lo);
+ x = _mm_min_epi32(x, clamp_hi);
out[0] = x;
- out[63] = x;
out[1] = x;
- out[62] = x;
out[2] = x;
- out[61] = x;
out[3] = x;
- out[60] = x;
out[4] = x;
- out[59] = x;
out[5] = x;
- out[58] = x;
out[6] = x;
- out[57] = x;
out[7] = x;
- out[56] = x;
out[8] = x;
- out[55] = x;
out[9] = x;
- out[54] = x;
out[10] = x;
- out[53] = x;
out[11] = x;
- out[52] = x;
out[12] = x;
- out[51] = x;
out[13] = x;
- out[50] = x;
out[14] = x;
- out[49] = x;
out[15] = x;
- out[48] = x;
out[16] = x;
- out[47] = x;
out[17] = x;
- out[46] = x;
out[18] = x;
- out[45] = x;
out[19] = x;
- out[44] = x;
out[20] = x;
- out[43] = x;
out[21] = x;
- out[42] = x;
out[22] = x;
- out[41] = x;
out[23] = x;
- out[40] = x;
out[24] = x;
- out[39] = x;
out[25] = x;
- out[38] = x;
out[26] = x;
- out[37] = x;
out[27] = x;
- out[36] = x;
out[28] = x;
- out[35] = x;
out[29] = x;
- out[34] = x;
out[30] = x;
- out[33] = x;
out[31] = x;
out[32] = x;
+ out[33] = x;
+ out[34] = x;
+ out[35] = x;
+ out[36] = x;
+ out[37] = x;
+ out[38] = x;
+ out[39] = x;
+ out[40] = x;
+ out[41] = x;
+ out[42] = x;
+ out[43] = x;
+ out[44] = x;
+ out[45] = x;
+ out[46] = x;
+ out[47] = x;
+ out[48] = x;
+ out[49] = x;
+ out[50] = x;
+ out[51] = x;
+ out[52] = x;
+ out[53] = x;
+ out[54] = x;
+ out[55] = x;
+ out[56] = x;
+ out[57] = x;
+ out[58] = x;
+ out[59] = x;
+ out[60] = x;
+ out[61] = x;
+ out[62] = x;
+ out[63] = x;
}
}
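Note on the DC-only 64x64 path above: after this change the clamp bounds are recomputed in place rather than derived from log_range - out_shift. The ranges themselves come straight from the expressions in the diff: AOMMAX(16, bd + 8) for the row pass intermediates, AOMMAX(16, bd + 6) for the column pass, and AOMMAX(16, bd + 6) for the rounded row-pass output. A small standalone check of the resulting widths:

/* Prints the intermediate/output clamp widths implied by the expressions above. */
#include <stdio.h>
#define AOMMAX(a, b) ((a) > (b) ? (a) : (b))
int main(void) {
  for (int bd = 8; bd <= 12; bd += 2) {
    printf("bd=%2d: row pass 2^%d, col pass 2^%d, output 2^%d\n", bd,
           AOMMAX(16, bd + 8), AOMMAX(16, bd + 6), AOMMAX(16, bd + 6));
  }
  return 0;
}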
@@ -3434,7 +3576,6 @@ static void idct64x64_low8_sse4_1(__m128i *in, __m128i *out, int bit,
u[6] = u[1];
u[5] = u[2];
u[4] = u[3];
- u[9] = u[9];
idct64_stage8_sse4_1(u, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
&cospim48, &clamp_lo, &clamp_hi, &rnding, bit);
@@ -3448,7 +3589,7 @@ static void idct64x64_low8_sse4_1(__m128i *in, __m128i *out, int bit,
bit);
// stage 11
- idct64_stage11_sse4_1(u, out, do_cols, bd, out_shift, log_range);
+ idct64_stage11_sse4_1(u, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi);
}
}
@@ -3758,7 +3899,7 @@ static void idct64x64_low16_sse4_1(__m128i *in, __m128i *out, int bit,
bit);
// stage 11
- idct64_stage11_sse4_1(u, out, do_cols, bd, out_shift, log_range);
+ idct64_stage11_sse4_1(u, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi);
}
}
@@ -4221,20 +4362,20 @@ static void idct64x64_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
for (i = 56; i < 64; i++) v[i] = u[i];
// stage 11
- if (do_cols) {
- for (i = 0; i < 32; i++) {
- addsub_no_clamp_sse4_1(v[i], v[63 - i], &out[(i)], &out[(63 - i)]);
- }
- } else {
+ for (i = 0; i < 32; i++) {
+ addsub_sse4_1(v[i], v[63 - i], &out[(i)], &out[(63 - i)], &clamp_lo,
+ &clamp_hi);
+ }
+
+ if (!do_cols) {
const int log_range_out = AOMMAX(16, bd + 6);
- const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
- -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
- const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
- (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
-
- for (i = 0; i < 32; i++) {
- addsub_shift_sse4_1(v[i], v[63 - i], &out[(i)], &out[(63 - i)],
- &clamp_lo_out, &clamp_hi_out, out_shift);
+ const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
+ const __m128i clamp_hi_out =
+ _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
+ for (i = 0; i < 64; i += 4) {
+ round_shift_4x4(out + i, out_shift);
+ highbd_clamp_epi32_sse4_1(out + i, out + i, &clamp_lo_out,
+ &clamp_hi_out, 4);
}
}
}
@@ -4246,8 +4387,8 @@ static void idct32x32_low1_sse4_1(__m128i *in, __m128i *out, int bit,
const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
const __m128i rounding = _mm_set1_epi32(1 << (bit - 1));
const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
- const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
- const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+ __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
__m128i bf1;
// stage 0
@@ -4269,17 +4410,17 @@ static void idct32x32_low1_sse4_1(__m128i *in, __m128i *out, int bit,
bf1 = _mm_min_epi32(bf1, clamp_hi);
} else {
const int log_range_out = AOMMAX(16, bd + 6);
- const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
- -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
- const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
- (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
-
- __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1);
- bf1 = _mm_add_epi32(bf1, offset);
- bf1 = _mm_sra_epi32(bf1, _mm_cvtsi32_si128(out_shift));
- bf1 = _mm_max_epi32(bf1, clamp_lo_out);
- bf1 = _mm_min_epi32(bf1, clamp_hi_out);
+ clamp_lo = _mm_set1_epi32(-(1 << (log_range_out - 1)));
+ clamp_hi = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
+ if (out_shift != 0) {
+ __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1);
+ bf1 = _mm_add_epi32(bf1, offset);
+ bf1 = _mm_sra_epi32(bf1, _mm_cvtsi32_si128(out_shift));
+ }
}
+
+ bf1 = _mm_max_epi32(bf1, clamp_lo);
+ bf1 = _mm_min_epi32(bf1, clamp_hi);
out[0] = bf1;
out[1] = bf1;
out[2] = bf1;
@@ -4422,7 +4563,7 @@ static void idct32x32_low8_sse4_1(__m128i *in, __m128i *out, int bit,
&rounding, bit);
// stage 9
- idct32_stage9_sse4_1(bf1, out, do_cols, bd, out_shift, log_range);
+ idct32_stage9_sse4_1(bf1, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi);
}
static void idct32x32_low16_sse4_1(__m128i *in, __m128i *out, int bit,
@@ -4568,9 +4709,8 @@ static void idct32x32_low16_sse4_1(__m128i *in, __m128i *out, int bit,
// stage 8
idct32_stage8_sse4_1(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
&rounding, bit);
-
// stage 9
- idct32_stage9_sse4_1(bf1, out, do_cols, bd, out_shift, log_range);
+ idct32_stage9_sse4_1(bf1, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi);
}
static void idct32x32_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
@@ -4926,62 +5066,30 @@ static void idct32x32_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
bf0[31] = bf1[31];
// stage 9
- if (do_cols) {
- addsub_no_clamp_sse4_1(bf0[0], bf0[31], out + 0, out + 31);
- addsub_no_clamp_sse4_1(bf0[1], bf0[30], out + 1, out + 30);
- addsub_no_clamp_sse4_1(bf0[2], bf0[29], out + 2, out + 29);
- addsub_no_clamp_sse4_1(bf0[3], bf0[28], out + 3, out + 28);
- addsub_no_clamp_sse4_1(bf0[4], bf0[27], out + 4, out + 27);
- addsub_no_clamp_sse4_1(bf0[5], bf0[26], out + 5, out + 26);
- addsub_no_clamp_sse4_1(bf0[6], bf0[25], out + 6, out + 25);
- addsub_no_clamp_sse4_1(bf0[7], bf0[24], out + 7, out + 24);
- addsub_no_clamp_sse4_1(bf0[8], bf0[23], out + 8, out + 23);
- addsub_no_clamp_sse4_1(bf0[9], bf0[22], out + 9, out + 22);
- addsub_no_clamp_sse4_1(bf0[10], bf0[21], out + 10, out + 21);
- addsub_no_clamp_sse4_1(bf0[11], bf0[20], out + 11, out + 20);
- addsub_no_clamp_sse4_1(bf0[12], bf0[19], out + 12, out + 19);
- addsub_no_clamp_sse4_1(bf0[13], bf0[18], out + 13, out + 18);
- addsub_no_clamp_sse4_1(bf0[14], bf0[17], out + 14, out + 17);
- addsub_no_clamp_sse4_1(bf0[15], bf0[16], out + 15, out + 16);
- } else {
+ addsub_sse4_1(bf0[0], bf0[31], out + 0, out + 31, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[1], bf0[30], out + 1, out + 30, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[2], bf0[29], out + 2, out + 29, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[3], bf0[28], out + 3, out + 28, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[4], bf0[27], out + 4, out + 27, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[5], bf0[26], out + 5, out + 26, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[6], bf0[25], out + 6, out + 25, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[7], bf0[24], out + 7, out + 24, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[8], bf0[23], out + 8, out + 23, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[9], bf0[22], out + 9, out + 22, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[10], bf0[21], out + 10, out + 21, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[11], bf0[20], out + 11, out + 20, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[12], bf0[19], out + 12, out + 19, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[13], bf0[18], out + 13, out + 18, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[14], bf0[17], out + 14, out + 17, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[15], bf0[16], out + 15, out + 16, &clamp_lo, &clamp_hi);
+
+ if (!do_cols) {
const int log_range_out = AOMMAX(16, bd + 6);
- const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
- -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
- const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
- (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
-
- addsub_shift_sse4_1(bf0[0], bf0[31], out + 0, out + 31, &clamp_lo_out,
- &clamp_hi_out, out_shift);
- addsub_shift_sse4_1(bf0[1], bf0[30], out + 1, out + 30, &clamp_lo_out,
- &clamp_hi_out, out_shift);
- addsub_shift_sse4_1(bf0[2], bf0[29], out + 2, out + 29, &clamp_lo_out,
- &clamp_hi_out, out_shift);
- addsub_shift_sse4_1(bf0[3], bf0[28], out + 3, out + 28, &clamp_lo_out,
- &clamp_hi_out, out_shift);
- addsub_shift_sse4_1(bf0[4], bf0[27], out + 4, out + 27, &clamp_lo_out,
- &clamp_hi_out, out_shift);
- addsub_shift_sse4_1(bf0[5], bf0[26], out + 5, out + 26, &clamp_lo_out,
- &clamp_hi_out, out_shift);
- addsub_shift_sse4_1(bf0[6], bf0[25], out + 6, out + 25, &clamp_lo_out,
- &clamp_hi_out, out_shift);
- addsub_shift_sse4_1(bf0[7], bf0[24], out + 7, out + 24, &clamp_lo_out,
- &clamp_hi_out, out_shift);
- addsub_shift_sse4_1(bf0[8], bf0[23], out + 8, out + 23, &clamp_lo_out,
- &clamp_hi_out, out_shift);
- addsub_shift_sse4_1(bf0[9], bf0[22], out + 9, out + 22, &clamp_lo_out,
- &clamp_hi_out, out_shift);
- addsub_shift_sse4_1(bf0[10], bf0[21], out + 10, out + 21, &clamp_lo_out,
- &clamp_hi_out, out_shift);
- addsub_shift_sse4_1(bf0[11], bf0[20], out + 11, out + 20, &clamp_lo_out,
- &clamp_hi_out, out_shift);
- addsub_shift_sse4_1(bf0[12], bf0[19], out + 12, out + 19, &clamp_lo_out,
- &clamp_hi_out, out_shift);
- addsub_shift_sse4_1(bf0[13], bf0[18], out + 13, out + 18, &clamp_lo_out,
- &clamp_hi_out, out_shift);
- addsub_shift_sse4_1(bf0[14], bf0[17], out + 14, out + 17, &clamp_lo_out,
- &clamp_hi_out, out_shift);
- addsub_shift_sse4_1(bf0[15], bf0[16], out + 15, out + 16, &clamp_lo_out,
- &clamp_hi_out, out_shift);
+ const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
+ const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
+ round_shift_8x8(out, out_shift);
+ round_shift_8x8(out + 16, out_shift);
+ highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 32);
}
}
@@ -4992,127 +5100,23 @@ void av1_highbd_inv_txfm_add_8x8_sse4_1(const tran_low_t *input, uint8_t *dest,
const TX_TYPE tx_type = txfm_param->tx_type;
const int32_t *src = cast_to_int32(input);
switch (tx_type) {
- // Assembly version doesn't support some transform types, so use C version
- // for those.
- case V_DCT:
- case H_DCT:
- case V_ADST:
- case H_ADST:
- case V_FLIPADST:
- case H_FLIPADST:
case IDTX:
- av1_inv_txfm2d_add_8x8_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
- bd);
- break;
- default:
- av1_inv_txfm2d_add_8x8_sse4_1(src, CONVERT_TO_SHORTPTR(dest), stride,
- tx_type, bd);
- break;
- }
-}
-
-void av1_highbd_inv_txfm_add_16x8_sse4_1(const tran_low_t *input, uint8_t *dest,
- int stride,
- const TxfmParam *txfm_param) {
- int bd = txfm_param->bd;
- const TX_TYPE tx_type = txfm_param->tx_type;
- const int32_t *src = cast_to_int32(input);
- switch (tx_type) {
- // Assembly version doesn't support some transform types, so use C version
- // for those.
- case V_DCT:
case H_DCT:
- case V_ADST:
case H_ADST:
- case V_FLIPADST:
case H_FLIPADST:
- case IDTX:
- av1_inv_txfm2d_add_16x8_c(src, CONVERT_TO_SHORTPTR(dest), stride,
- txfm_param->tx_type, txfm_param->bd);
- break;
- default:
- av1_highbd_inv_txfm2d_add_universe_sse4_1(input, dest, stride, tx_type,
- txfm_param->tx_size,
- txfm_param->eob, bd);
- break;
- }
-}
-
-void av1_highbd_inv_txfm_add_8x16_sse4_1(const tran_low_t *input, uint8_t *dest,
- int stride,
- const TxfmParam *txfm_param) {
- int bd = txfm_param->bd;
- const TX_TYPE tx_type = txfm_param->tx_type;
- const int32_t *src = cast_to_int32(input);
- switch (tx_type) {
- // Assembly version doesn't support some transform types, so use C version
- // for those.
case V_DCT:
- case H_DCT:
case V_ADST:
- case H_ADST:
case V_FLIPADST:
- case H_FLIPADST:
- case IDTX:
- av1_inv_txfm2d_add_8x16_c(src, CONVERT_TO_SHORTPTR(dest), stride,
- txfm_param->tx_type, txfm_param->bd);
- break;
- default:
av1_highbd_inv_txfm2d_add_universe_sse4_1(input, dest, stride, tx_type,
txfm_param->tx_size,
txfm_param->eob, bd);
break;
- }
-}
-
-void av1_highbd_inv_txfm_add_16x16_sse4_1(const tran_low_t *input,
- uint8_t *dest, int stride,
- const TxfmParam *txfm_param) {
- int bd = txfm_param->bd;
- const TX_TYPE tx_type = txfm_param->tx_type;
- const int32_t *src = cast_to_int32(input);
- switch (tx_type) {
- // Assembly version doesn't support some transform types, so use C version
- // for those.
- case V_DCT:
- case H_DCT:
- case V_ADST:
- case H_ADST:
- case V_FLIPADST:
- case H_FLIPADST:
- case IDTX:
- av1_inv_txfm2d_add_16x16_c(src, CONVERT_TO_SHORTPTR(dest), stride,
- tx_type, bd);
- break;
default:
- av1_highbd_inv_txfm2d_add_universe_sse4_1(input, dest, stride, tx_type,
- txfm_param->tx_size,
- txfm_param->eob, bd);
- break;
- }
-}
-
-void av1_highbd_inv_txfm_add_32x32_sse4_1(const tran_low_t *input,
- uint8_t *dest, int stride,
- const TxfmParam *txfm_param) {
- int bd = txfm_param->bd;
- const TX_TYPE tx_type = txfm_param->tx_type;
- const int32_t *src = cast_to_int32(input);
- switch (tx_type) {
- case DCT_DCT:
- av1_highbd_inv_txfm2d_add_universe_sse4_1(input, dest, stride, tx_type,
- txfm_param->tx_size,
- txfm_param->eob, bd);
- break;
- // Assembly version doesn't support IDTX, so use C version for it.
- case IDTX:
- av1_inv_txfm2d_add_32x32_c(src, CONVERT_TO_SHORTPTR(dest), stride,
- tx_type, bd);
+ av1_inv_txfm2d_add_8x8_sse4_1(src, CONVERT_TO_SHORTPTR(dest), stride,
+ tx_type, bd);
break;
- default: assert(0);
}
}
-
void av1_highbd_inv_txfm_add_4x4_sse4_1(const tran_low_t *input, uint8_t *dest,
int stride,
const TxfmParam *txfm_param) {
@@ -5127,53 +5131,268 @@ void av1_highbd_inv_txfm_add_4x4_sse4_1(const tran_low_t *input, uint8_t *dest,
av1_highbd_iwht4x4_add(input, dest, stride, eob, bd);
return;
}
- switch (tx_type) {
- // Assembly version doesn't support some transform types, so use C version
- // for those.
- case V_DCT:
- case H_DCT:
- case V_ADST:
- case H_ADST:
- case V_FLIPADST:
- case H_FLIPADST:
- case IDTX:
- av1_inv_txfm2d_add_4x4_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
- bd);
- break;
- default:
- av1_inv_txfm2d_add_4x4_sse4_1(src, CONVERT_TO_SHORTPTR(dest), stride,
- tx_type, bd);
- break;
- }
+ av1_inv_txfm2d_add_4x4_sse4_1(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
+ bd);
}
+static void iidentity32_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
+ int bd, int out_shift) {
+ (void)bit;
+ for (int i = 0; i < 32; i += 16) {
+ out[i] = _mm_slli_epi32(in[i], 2);
+ out[i + 1] = _mm_slli_epi32(in[i + 1], 2);
+ out[i + 2] = _mm_slli_epi32(in[i + 2], 2);
+ out[i + 3] = _mm_slli_epi32(in[i + 3], 2);
+ out[i + 4] = _mm_slli_epi32(in[i + 4], 2);
+ out[i + 5] = _mm_slli_epi32(in[i + 5], 2);
+ out[i + 6] = _mm_slli_epi32(in[i + 6], 2);
+ out[i + 7] = _mm_slli_epi32(in[i + 7], 2);
+ out[i + 8] = _mm_slli_epi32(in[i + 8], 2);
+ out[i + 9] = _mm_slli_epi32(in[i + 9], 2);
+ out[i + 10] = _mm_slli_epi32(in[i + 10], 2);
+ out[i + 11] = _mm_slli_epi32(in[i + 11], 2);
+ out[i + 12] = _mm_slli_epi32(in[i + 12], 2);
+ out[i + 13] = _mm_slli_epi32(in[i + 13], 2);
+ out[i + 14] = _mm_slli_epi32(in[i + 14], 2);
+ out[i + 15] = _mm_slli_epi32(in[i + 15], 2);
+ }
+ if (!do_cols) {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
+ const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
+ round_shift_8x8(out, out_shift);
+ round_shift_8x8(out + 16, out_shift);
+ highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 32);
+ }
+}
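Note on iidentity32_sse4_1 above: the 32-point identity transform scales each coefficient by exactly 4, so unlike iidentity16 no fixed-point multiply is needed and the body reduces to _mm_slli_epi32(in, 2) plus the usual row-pass round and clamp. One-lane equivalent, for reference only:

/* One-lane equivalent of the vector loop above (multiply instead of a shift
 * to avoid left-shifting negative values in scalar C). */
static int32_t iidentity32_one(int32_t in) { return in * 4; }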
static const transform_1d_sse4_1
highbd_txfm_all_1d_zeros_w8_arr[TX_SIZES][ITX_TYPES_1D][4] = {
{
- { NULL, NULL, NULL, NULL },
- { NULL, NULL, NULL, NULL },
- { NULL, NULL, NULL, NULL },
+ { idct4x4_sse4_1, NULL, NULL, NULL },
+ { iadst4x4_sse4_1, NULL, NULL, NULL },
+ { iidentity4_sse4_1, iidentity4_sse4_1, iidentity4_sse4_1, NULL },
},
{ { idct8x8_low1_sse4_1, idct8x8_new_sse4_1, NULL, NULL },
{ iadst8x8_low1_sse4_1, iadst8x8_new_sse4_1, NULL, NULL },
- { NULL, NULL, NULL, NULL } },
+ { iidentity8_sse4_1, iidentity8_sse4_1, NULL, NULL } },
{
{ idct16x16_low1_sse4_1, idct16x16_low8_sse4_1, idct16x16_sse4_1,
NULL },
{ iadst16x16_low1_sse4_1, iadst16x16_low8_sse4_1, iadst16x16_sse4_1,
NULL },
- { NULL, NULL, NULL, NULL },
+ { iidentity16_sse4_1, NULL, iidentity16_sse4_1, NULL },
},
{ { idct32x32_low1_sse4_1, idct32x32_low8_sse4_1, idct32x32_low16_sse4_1,
idct32x32_sse4_1 },
{ NULL, NULL, NULL, NULL },
- { NULL, NULL, NULL, NULL } },
+ { iidentity32_sse4_1, NULL, NULL, NULL } },
{ { idct64x64_low1_sse4_1, idct64x64_low8_sse4_1, idct64x64_low16_sse4_1,
idct64x64_sse4_1 },
{ NULL, NULL, NULL, NULL },
{ NULL, NULL, NULL, NULL } }
};
+static void highbd_inv_txfm2d_add_h_identity_ssse41(const int32_t *input,
+ uint16_t *output,
+ int stride, TX_TYPE tx_type,
+ TX_SIZE tx_size, int eob,
+ const int bd) {
+ __m128i buf1[64];
+ int eobx, eoby;
+ get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob);
+ const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ const int input_stride = AOMMIN(32, txfm_size_col);
+ const int buf_size_w_div4 = input_stride >> 2;
+ const int buf_size_h_div8 = (eoby + 8) >> 3;
+ const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+ const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eoby];
+ const transform_1d_sse4_1 row_txfm =
+ highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0];
+ const transform_1d_sse4_1 col_txfm =
+ highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx];
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ for (int i = 0; i < (buf_size_h_div8 << 1); ++i) {
+ __m128i buf0[16];
+ const int32_t *input_row = input + i * input_stride * 4;
+ for (int j = 0; j < buf_size_w_div4; ++j) {
+ __m128i *buf0_cur = buf0 + j * 4;
+ load_buffer_32bit_input(input_row + j * 4, input_stride, buf0_cur, 4);
+ }
+ if (rect_type == 1 || rect_type == -1) {
+ av1_round_shift_rect_array_32_sse4_1(buf0, buf0, input_stride, 0,
+ NewInvSqrt2);
+ }
+ row_txfm(buf0, buf0, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
+ -shift[0]);
+
+ __m128i *_buf1 = buf1 + i * 4;
+
+ for (int j = 0; j < buf_size_w_div4; ++j) {
+ _buf1[j * txfm_size_row + 0] = buf0[j * 4 + 0];
+ _buf1[j * txfm_size_row + 1] = buf0[j * 4 + 1];
+ _buf1[j * txfm_size_row + 2] = buf0[j * 4 + 2];
+ _buf1[j * txfm_size_row + 3] = buf0[j * 4 + 3];
+ }
+ }
+ for (int i = 0; i < buf_size_w_div4; i++) {
+ col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row,
+ av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+
+ av1_round_shift_array_32_sse4_1(buf1 + i * txfm_size_row,
+ buf1 + i * txfm_size_row, txfm_size_row,
+ -shift[1]);
+ }
+ // write to buffer
+ for (int i = 0; i < (txfm_size_col >> 3); i++) {
+ highbd_write_buffer_8xn_sse4_1(buf1 + i * txfm_size_row * 2, output + 8 * i,
+ stride, ud_flip, txfm_size_row, bd);
+ }
+}
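+// Inverse transform for H_DCT / H_ADST / H_FLIPADST, where the vertical
+// (column) 1-D transform is an identity.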
+static void highbd_inv_txfm2d_add_v_identity_ssse41(const int32_t *input,
+ uint16_t *output,
+ int stride, TX_TYPE tx_type,
+ TX_SIZE tx_size, int eob,
+ const int bd) {
+ __m128i buf1[64];
+ int eobx, eoby;
+ get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob);
+ const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ const int input_stride = AOMMIN(32, txfm_size_col);
+ const int buf_size_w_div8 = input_stride >> 2;
+ const int row_max = AOMMIN(32, txfm_size_row);
+ const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3;
+ const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+ const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eobx];
+ const transform_1d_sse4_1 row_txfm =
+ highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx];
+ const transform_1d_sse4_1 col_txfm =
+ highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0];
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ for (int i = 0; i < (row_max >> 2); ++i) {
+ __m128i buf0[16];
+ const int32_t *input_row = input + i * input_stride * 4;
+ for (int j = 0; j < (buf_size_nonzero_w_div8 << 1); ++j) {
+ __m128i *buf0_cur = buf0 + j * 4;
+ load_buffer_32bit_input(input_row + j * 4, input_stride, buf0_cur, 4);
+
+ TRANSPOSE_4X4(buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3],
+ buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3]);
+ }
+ if (rect_type == 1 || rect_type == -1) {
+ av1_round_shift_rect_array_32_sse4_1(
+ buf0, buf0, (buf_size_nonzero_w_div8 << 3), 0, NewInvSqrt2);
+ }
+ row_txfm(buf0, buf0, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
+ -shift[0]);
+
+ __m128i *_buf1 = buf1 + i * 4;
+ if (lr_flip) {
+ for (int j = 0; j < buf_size_w_div8; ++j) {
+ TRANSPOSE_4X4(buf0[4 * j + 3], buf0[4 * j + 2], buf0[4 * j + 1],
+ buf0[4 * j],
+ _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 0],
+ _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 1],
+ _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 2],
+ _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 3]);
+ }
+ } else {
+ for (int j = 0; j < buf_size_w_div8; ++j) {
+ TRANSPOSE_4X4(
+ buf0[j * 4 + 0], buf0[j * 4 + 1], buf0[j * 4 + 2], buf0[j * 4 + 3],
+ _buf1[j * txfm_size_row + 0], _buf1[j * txfm_size_row + 1],
+ _buf1[j * txfm_size_row + 2], _buf1[j * txfm_size_row + 3]);
+ }
+ }
+ }
+ for (int i = 0; i < buf_size_w_div8; i++) {
+ col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row,
+ av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+
+ av1_round_shift_array_32_sse4_1(buf1 + i * txfm_size_row,
+ buf1 + i * txfm_size_row, txfm_size_row,
+ -shift[1]);
+ }
+
+ // write to buffer
+ {
+ for (int i = 0; i < (txfm_size_col >> 3); i++) {
+ highbd_write_buffer_8xn_sse4_1(buf1 + i * txfm_size_row * 2,
+ output + 8 * i, stride, ud_flip,
+ txfm_size_row, bd);
+ }
+ }
+}
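+// Inverse transform for IDTX, where both 1-D transforms are identities.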
+static void highbd_inv_txfm2d_add_idtx_ssse41(const int32_t *input,
+ uint16_t *output, int stride,
+ TX_TYPE tx_type, TX_SIZE tx_size,
+ int eob, const int bd) {
+ (void)eob;
+ __m128i buf1[64 * 4];
+ const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ const int input_stride = AOMMIN(32, txfm_size_col);
+ const int row_max = AOMMIN(32, txfm_size_row);
+ const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+ const transform_1d_sse4_1 row_txfm =
+ highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0];
+ const transform_1d_sse4_1 col_txfm =
+ highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0];
+
+ for (int i = 0; i < (row_max >> 2); ++i) {
+ __m128i buf0[32];
+ const int32_t *input_row = input + i * input_stride * 4;
+ for (int j = 0; j < (input_stride >> 2); ++j) {
+ __m128i *buf0_cur = buf0 + j * 4;
+ load_buffer_32bit_input(input_row + j * 4, input_stride, buf0_cur, 4);
+ }
+ if (rect_type == 1 || rect_type == -1) {
+ av1_round_shift_rect_array_32_sse4_1(buf0, buf0, input_stride, 0,
+ NewInvSqrt2);
+ }
+ row_txfm(buf0, buf0, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
+ -shift[0]);
+
+ __m128i *_buf1 = buf1 + i * 4;
+ for (int j = 0; j < (input_stride >> 2); ++j) {
+ _buf1[j * txfm_size_row + 0] = buf0[j * 4 + 0];
+ _buf1[j * txfm_size_row + 1] = buf0[j * 4 + 1];
+ _buf1[j * txfm_size_row + 2] = buf0[j * 4 + 2];
+ _buf1[j * txfm_size_row + 3] = buf0[j * 4 + 3];
+ }
+ }
+ for (int i = 0; i < (input_stride >> 2); i++) {
+ col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row,
+ av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+
+ av1_round_shift_array_32_sse4_1(buf1 + i * txfm_size_row,
+ buf1 + i * txfm_size_row, txfm_size_row,
+ -shift[1]);
+ }
+
+ // write to buffer
+ {
+ for (int i = 0; i < (txfm_size_col >> 3); i++) {
+ highbd_write_buffer_8xn_sse4_1(buf1 + i * txfm_size_row * 2,
+ output + 8 * i, stride, 0, txfm_size_row,
+ bd);
+ }
+ }
+}
static void highbd_inv_txfm2d_add_no_identity_sse41(const int32_t *input,
uint16_t *output,
int stride, TX_TYPE tx_type,
@@ -5182,7 +5401,7 @@ static void highbd_inv_txfm2d_add_no_identity_sse41(const int32_t *input,
__m128i buf1[64 * 16];
int eobx, eoby;
get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
- const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
const int txw_idx = get_txw_idx(tx_size);
const int txh_idx = get_txh_idx(tx_size);
const int txfm_size_col = tx_size_wide[tx_size];
@@ -5220,7 +5439,8 @@ static void highbd_inv_txfm2d_add_no_identity_sse41(const int32_t *input,
av1_round_shift_rect_array_32_sse4_1(
buf0, buf0, buf_size_nonzero_w_div8 << 3, 0, NewInvSqrt2);
}
- row_txfm(buf0, buf0, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]);
+ row_txfm(buf0, buf0, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
+ -shift[0]);
__m128i *_buf1 = buf1 + i * 4;
if (lr_flip) {
@@ -5244,7 +5464,7 @@ static void highbd_inv_txfm2d_add_no_identity_sse41(const int32_t *input,
// 2nd stage: column transform
for (int i = 0; i < buf_size_w_div8; i++) {
col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row,
- inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+ av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
av1_round_shift_array_32_sse4_1(buf1 + i * txfm_size_row,
buf1 + i * txfm_size_row, txfm_size_row,
@@ -5261,6 +5481,230 @@ static void highbd_inv_txfm2d_add_no_identity_sse41(const int32_t *input,
}
}
+static void highbd_inv_txfm2d_add_4x8_sse41(const int32_t *input,
+ uint16_t *output, int stride,
+ TX_TYPE tx_type, TX_SIZE tx_size,
+ int eob, const int bd) {
+ (void)eob;
+ __m128i buf1[8];
+ const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ const transform_1d_sse4_1 row_txfm =
+ highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0];
+ const transform_1d_sse4_1 col_txfm =
+ highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][1];
+ const int input_stride = AOMMIN(32, txfm_size_col);
+
+ assert(col_txfm != NULL);
+ assert(row_txfm != NULL);
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ // 1st stage: column transform
+ __m128i buf0[8];
+ const int32_t *input_row = input;
+ __m128i *buf0_cur = buf0;
+ load_buffer_32bit_input(input_row, input_stride, buf0_cur, txfm_size_row);
+ av1_round_shift_rect_array_32_sse4_1(buf0, buf0, txfm_size_row, 0,
+ NewInvSqrt2);
+ row_txfm(buf0, buf0, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]);
+ row_txfm(buf0 + 4, buf0 + 4, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
+ -shift[0]);
+
+ if (lr_flip) {
+ TRANSPOSE_4X4(buf0[3], buf0[2], buf0[1], buf0[0], buf1[0], buf1[1], buf1[2],
+ buf1[3]);
+
+ TRANSPOSE_4X4(buf0[7], buf0[6], buf0[5], buf0[4], buf1[4], buf1[5], buf1[6],
+ buf1[7]);
+ } else {
+ TRANSPOSE_4X4(buf0[0], buf0[1], buf0[2], buf0[3], buf1[0], buf1[1], buf1[2],
+ buf1[3]);
+
+ TRANSPOSE_4X4(buf0[4], buf0[5], buf0[6], buf0[7], buf1[4], buf1[5], buf1[6],
+ buf1[7]);
+ }
+
+ // 2nd stage: column transform
+ col_txfm(buf1, buf1, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+
+ av1_round_shift_array_32_sse4_1(buf1, buf1, txfm_size_row, -shift[1]);
+
+ // write to buffer
+ highbd_write_buffer_4xn_sse4_1(buf1, output, stride, ud_flip, txfm_size_row,
+ bd);
+}
+
+static void highbd_inv_txfm2d_add_8x4_sse41(const int32_t *input,
+ uint16_t *output, int stride,
+ TX_TYPE tx_type, TX_SIZE tx_size,
+ int eob, const int bd) {
+ (void)eob;
+ __m128i buf1[8];
+ const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ const transform_1d_sse4_1 row_txfm =
+ highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][1];
+ const transform_1d_sse4_1 col_txfm =
+ highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0];
+
+ assert(col_txfm != NULL);
+ assert(row_txfm != NULL);
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ // 1st stage: column transform
+ __m128i buf0[8];
+ const int32_t *input_row = input;
+ load_buffer_32bit_input(input_row, 4, buf0, txfm_size_col);
+
+ TRANSPOSE_4X4(buf0[0], buf0[2], buf0[4], buf0[6], buf1[0], buf1[1], buf1[2],
+ buf1[3]);
+ TRANSPOSE_4X4(buf0[1], buf0[3], buf0[5], buf0[7], buf1[4], buf1[5], buf1[6],
+ buf1[7]);
+
+ av1_round_shift_rect_array_32_sse4_1(buf1, buf0, txfm_size_col, 0,
+ NewInvSqrt2);
+ row_txfm(buf0, buf0, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]);
+
+ __m128i *buf1_ptr;
+ if (lr_flip) {
+ flip_buf_sse2(buf0, buf1, txfm_size_col);
+ buf1_ptr = buf1;
+ } else {
+ buf1_ptr = buf0;
+ }
+
+ // 2nd stage: column transform
+ for (int i = 0; i < 2; i++) {
+ col_txfm(buf1_ptr + i * txfm_size_row, buf1_ptr + i * txfm_size_row,
+ av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+ }
+ av1_round_shift_array_32_sse4_1(buf1_ptr, buf1_ptr, txfm_size_col, -shift[1]);
+ // write to buffer
+ highbd_write_buffer_8xn_sse4_1(buf1_ptr, output, stride, ud_flip,
+ txfm_size_row, bd);
+}
+
+static void highbd_inv_txfm2d_add_4x16_sse4_1(const int32_t *input,
+ uint16_t *output, int stride,
+ TX_TYPE tx_type, TX_SIZE tx_size,
+ int eob, const int bd) {
+ (void)eob;
+ __m128i buf1[16];
+ const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ const int buf_size_h_div8 = txfm_size_row >> 2;
+ const transform_1d_sse4_1 row_txfm =
+ highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0];
+ const transform_1d_sse4_1 col_txfm =
+ highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][2];
+ const int input_stride = AOMMIN(32, txfm_size_col);
+
+ assert(col_txfm != NULL);
+ assert(row_txfm != NULL);
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ // 1st stage: column transform
+ __m128i buf0[16];
+ const int32_t *input_row = input;
+ __m128i *buf0_cur = buf0;
+ load_buffer_32bit_input(input_row, input_stride, buf0_cur, txfm_size_row);
+ for (int i = 0; i < (txfm_size_row >> 2); i++) {
+ row_txfm(buf0 + (i << 2), buf0 + (i << 2),
+ av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]);
+ }
+
+ if (lr_flip) {
+ for (int j = 0; j < buf_size_h_div8; ++j) {
+ TRANSPOSE_4X4(buf0[4 * j + 3], buf0[4 * j + 2], buf0[4 * j + 1],
+ buf0[4 * j], buf1[4 * j], buf1[4 * j + 1], buf1[4 * j + 2],
+ buf1[4 * j + 3]);
+ }
+ } else {
+ for (int j = 0; j < buf_size_h_div8; ++j) {
+ TRANSPOSE_4X4(buf0[4 * j], buf0[4 * j + 1], buf0[4 * j + 2],
+ buf0[4 * j + 3], buf1[4 * j], buf1[4 * j + 1],
+ buf1[4 * j + 2], buf1[4 * j + 3]);
+ }
+ }
+
+ // 2nd stage: column transform
+ col_txfm(buf1, buf1, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+
+ av1_round_shift_array_32_sse4_1(buf1, buf1, txfm_size_row, -shift[1]);
+
+ // write to buffer
+ highbd_write_buffer_4xn_sse4_1(buf1, output, stride, ud_flip, txfm_size_row,
+ bd);
+}
+
+static void highbd_inv_txfm2d_add_16x4_sse4_1(const int32_t *input,
+ uint16_t *output, int stride,
+ TX_TYPE tx_type, TX_SIZE tx_size,
+ int eob, const int bd) {
+ (void)eob;
+ __m128i buf1[16];
+ const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ const int buf_size_w_div8 = txfm_size_col >> 2;
+ const transform_1d_sse4_1 row_txfm =
+ highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][2];
+ const transform_1d_sse4_1 col_txfm =
+ highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0];
+
+ assert(col_txfm != NULL);
+ assert(row_txfm != NULL);
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ // 1st stage: column transform
+ __m128i buf0[16];
+ const int32_t *input_row = input;
+ load_buffer_32bit_input(input_row, 4, buf0, txfm_size_col);
+
+ for (int j = 0; j < buf_size_w_div8; j++) {
+ TRANSPOSE_4X4(buf0[j], buf0[j + 4], buf0[j + 8], buf0[j + 12], buf1[4 * j],
+ buf1[4 * j + 1], buf1[4 * j + 2], buf1[4 * j + 3]);
+ }
+ row_txfm(buf1, buf0, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]);
+
+ __m128i *buf1_ptr;
+ if (lr_flip) {
+ flip_buf_sse2(buf0, buf1, txfm_size_col);
+ buf1_ptr = buf1;
+ } else {
+ buf1_ptr = buf0;
+ }
+
+ // 2nd stage: column transform
+ for (int i = 0; i < buf_size_w_div8; i++) {
+ col_txfm(buf1_ptr + i * txfm_size_row, buf1_ptr + i * txfm_size_row,
+ av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+ }
+ av1_round_shift_array_32_sse4_1(buf1_ptr, buf1_ptr, txfm_size_col, -shift[1]);
+
+ // write to buffer
+ for (int i = 0; i < (txfm_size_col >> 3); i++) {
+ highbd_write_buffer_8xn_sse4_1(buf1_ptr + i * txfm_size_row * 2,
+ output + 8 * i, stride, ud_flip,
+ txfm_size_row, bd);
+ }
+}
+
void av1_highbd_inv_txfm2d_add_universe_sse4_1(const int32_t *input,
uint8_t *output, int stride,
TX_TYPE tx_type, TX_SIZE tx_size,
@@ -5279,70 +5723,99 @@ void av1_highbd_inv_txfm2d_add_universe_sse4_1(const int32_t *input,
input, CONVERT_TO_SHORTPTR(output), stride, tx_type, tx_size, eob,
bd);
break;
+ case V_DCT:
+ case V_ADST:
+ case V_FLIPADST:
+ highbd_inv_txfm2d_add_h_identity_ssse41(
+ input, CONVERT_TO_SHORTPTR(output), stride, tx_type, tx_size, eob,
+ bd);
+ break;
+ case H_DCT:
+ case H_ADST:
+ case H_FLIPADST:
+ highbd_inv_txfm2d_add_v_identity_ssse41(
+ input, CONVERT_TO_SHORTPTR(output), stride, tx_type, tx_size, eob,
+ bd);
+ break;
+ case IDTX:
+ highbd_inv_txfm2d_add_idtx_ssse41(input, CONVERT_TO_SHORTPTR(output),
+ stride, tx_type, tx_size, eob, bd);
+ break;
default: assert(0); break;
}
}
+void av1_highbd_inv_txfm_add_4x8_sse4_1(const tran_low_t *input, uint8_t *dest,
+ int stride,
+ const TxfmParam *txfm_param) {
+ int bd = txfm_param->bd;
+ const TX_TYPE tx_type = txfm_param->tx_type;
+ const TX_SIZE tx_size = txfm_param->tx_size;
+ int eob = txfm_param->eob;
+ highbd_inv_txfm2d_add_4x8_sse41(input, CONVERT_TO_SHORTPTR(dest), stride,
+ tx_type, tx_size, eob, bd);
+}
+
+void av1_highbd_inv_txfm_add_8x4_sse4_1(const tran_low_t *input, uint8_t *dest,
+ int stride,
+ const TxfmParam *txfm_param) {
+ int bd = txfm_param->bd;
+ const TX_TYPE tx_type = txfm_param->tx_type;
+ const TX_SIZE tx_size = txfm_param->tx_size;
+ int eob = txfm_param->eob;
+ highbd_inv_txfm2d_add_8x4_sse41(input, CONVERT_TO_SHORTPTR(dest), stride,
+ tx_type, tx_size, eob, bd);
+}
+
+void av1_highbd_inv_txfm_add_4x16_sse4_1(const tran_low_t *input, uint8_t *dest,
+ int stride,
+ const TxfmParam *txfm_param) {
+ int bd = txfm_param->bd;
+ const TX_TYPE tx_type = txfm_param->tx_type;
+ const TX_SIZE tx_size = txfm_param->tx_size;
+ int eob = txfm_param->eob;
+ highbd_inv_txfm2d_add_4x16_sse4_1(input, CONVERT_TO_SHORTPTR(dest), stride,
+ tx_type, tx_size, eob, bd);
+}
+
+void av1_highbd_inv_txfm_add_16x4_sse4_1(const tran_low_t *input, uint8_t *dest,
+ int stride,
+ const TxfmParam *txfm_param) {
+ int bd = txfm_param->bd;
+ const TX_TYPE tx_type = txfm_param->tx_type;
+ const TX_SIZE tx_size = txfm_param->tx_size;
+ int eob = txfm_param->eob;
+ highbd_inv_txfm2d_add_16x4_sse4_1(input, CONVERT_TO_SHORTPTR(dest), stride,
+ tx_type, tx_size, eob, bd);
+}
+
void av1_highbd_inv_txfm_add_sse4_1(const tran_low_t *input, uint8_t *dest,
int stride, const TxfmParam *txfm_param) {
assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
const TX_SIZE tx_size = txfm_param->tx_size;
switch (tx_size) {
- case TX_32X32:
- av1_highbd_inv_txfm_add_32x32_sse4_1(input, dest, stride, txfm_param);
- break;
- case TX_16X16:
- av1_highbd_inv_txfm_add_16x16_sse4_1(input, dest, stride, txfm_param);
- break;
case TX_8X8:
av1_highbd_inv_txfm_add_8x8_sse4_1(input, dest, stride, txfm_param);
break;
case TX_4X8:
- av1_highbd_inv_txfm_add_4x8(input, dest, stride, txfm_param);
+ av1_highbd_inv_txfm_add_4x8_sse4_1(input, dest, stride, txfm_param);
break;
case TX_8X4:
- av1_highbd_inv_txfm_add_8x4(input, dest, stride, txfm_param);
- break;
- case TX_8X16:
- av1_highbd_inv_txfm_add_8x16_sse4_1(input, dest, stride, txfm_param);
- break;
- case TX_16X8:
- av1_highbd_inv_txfm_add_16x8_sse4_1(input, dest, stride, txfm_param);
- break;
- case TX_16X32:
- av1_highbd_inv_txfm_add_16x32(input, dest, stride, txfm_param);
- break;
- case TX_32X16:
- av1_highbd_inv_txfm_add_32x16(input, dest, stride, txfm_param);
- break;
- case TX_32X64:
- av1_highbd_inv_txfm_add_32x64(input, dest, stride, txfm_param);
- break;
- case TX_64X32:
- av1_highbd_inv_txfm_add_64x32(input, dest, stride, txfm_param);
+ av1_highbd_inv_txfm_add_8x4_sse4_1(input, dest, stride, txfm_param);
break;
case TX_4X4:
av1_highbd_inv_txfm_add_4x4_sse4_1(input, dest, stride, txfm_param);
break;
case TX_16X4:
- av1_highbd_inv_txfm_add_16x4(input, dest, stride, txfm_param);
+ av1_highbd_inv_txfm_add_16x4_sse4_1(input, dest, stride, txfm_param);
break;
case TX_4X16:
- av1_highbd_inv_txfm_add_4x16(input, dest, stride, txfm_param);
- break;
- case TX_8X32:
- av1_highbd_inv_txfm_add_8x32(input, dest, stride, txfm_param);
+ av1_highbd_inv_txfm_add_4x16_sse4_1(input, dest, stride, txfm_param);
break;
- case TX_32X8:
- av1_highbd_inv_txfm_add_32x8(input, dest, stride, txfm_param);
- break;
- case TX_64X64:
- case TX_16X64:
- case TX_64X16:
+ default:
av1_highbd_inv_txfm2d_add_universe_sse4_1(
- input, dest, stride, txfm_param->tx_type, txfm_param->tx_size,
- txfm_param->eob, txfm_param->bd);
+ input, dest, stride, txfm_param->tx_type, tx_size, txfm_param->eob,
+ txfm_param->bd);
break;
- default: assert(0 && "Invalid transform size"); break;
}
}
diff --git a/media/libaom/src/av1/common/x86/highbd_jnt_convolve_avx2.c b/media/libaom/src/av1/common/x86/highbd_jnt_convolve_avx2.c
index e298cf653..70f1ec709 100644
--- a/media/libaom/src/av1/common/x86/highbd_jnt_convolve_avx2.c
+++ b/media/libaom/src/av1/common/x86/highbd_jnt_convolve_avx2.c
@@ -22,23 +22,23 @@
#include "aom_dsp/aom_filter.h"
#include "av1/common/convolve.h"
-void av1_highbd_jnt_convolve_2d_copy_avx2(
+void av1_highbd_dist_wtd_convolve_2d_copy_avx2(
const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
int h, const InterpFilterParams *filter_params_x,
- const InterpFilterParams *filter_params_y, const int subpel_x_q4,
- const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
+ const InterpFilterParams *filter_params_y, const int subpel_x_qn,
+ const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
CONV_BUF_TYPE *dst = conv_params->dst;
int dst_stride = conv_params->dst_stride;
(void)filter_params_x;
(void)filter_params_y;
- (void)subpel_x_q4;
- (void)subpel_y_q4;
+ (void)subpel_x_qn;
+ (void)subpel_y_qn;
const int bits =
FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
const __m128i left_shift = _mm_cvtsi32_si128(bits);
const int do_average = conv_params->do_average;
- const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+ const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
const int w0 = conv_params->fwd_offset;
const int w1 = conv_params->bck_offset;
const __m256i wt0 = _mm256_set1_epi32(w0);
@@ -78,15 +78,17 @@ void av1_highbd_jnt_convolve_2d_copy_avx2(
const __m256i res_unsigned_lo =
_mm256_add_epi32(res_32b_lo, offset_const);
- const __m256i comp_avg_res_lo = highbd_comp_avg(
- &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
+ const __m256i comp_avg_res_lo =
+ highbd_comp_avg(&data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1,
+ use_dist_wtd_comp_avg);
const __m256i res_32b_hi = _mm256_unpackhi_epi16(res, zero);
const __m256i res_unsigned_hi =
_mm256_add_epi32(res_32b_hi, offset_const);
- const __m256i comp_avg_res_hi = highbd_comp_avg(
- &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg);
+ const __m256i comp_avg_res_hi =
+ highbd_comp_avg(&data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1,
+ use_dist_wtd_comp_avg);
const __m256i round_result_lo = highbd_convolve_rounding(
&comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift);
@@ -135,8 +137,9 @@ void av1_highbd_jnt_convolve_2d_copy_avx2(
const __m256i res_unsigned_lo =
_mm256_add_epi32(res_32b, offset_const);
- const __m256i comp_avg_res = highbd_comp_avg(
- &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
+ const __m256i comp_avg_res =
+ highbd_comp_avg(&data_ref_0, &res_unsigned_lo, &wt0, &wt1,
+ use_dist_wtd_comp_avg);
const __m256i round_result = highbd_convolve_rounding(
&comp_avg_res, &offset_const, &rounding_const, rounding_shift);
@@ -179,15 +182,17 @@ void av1_highbd_jnt_convolve_2d_copy_avx2(
const __m256i res_unsigned_lo =
_mm256_add_epi32(res_32b_lo, offset_const);
- const __m256i comp_avg_res_lo = highbd_comp_avg(
- &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
+ const __m256i comp_avg_res_lo =
+ highbd_comp_avg(&data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1,
+ use_dist_wtd_comp_avg);
const __m256i res_32b_hi = _mm256_unpackhi_epi16(res, zero);
const __m256i res_unsigned_hi =
_mm256_add_epi32(res_32b_hi, offset_const);
- const __m256i comp_avg_res_hi = highbd_comp_avg(
- &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg);
+ const __m256i comp_avg_res_hi =
+ highbd_comp_avg(&data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1,
+ use_dist_wtd_comp_avg);
const __m256i round_result_lo =
highbd_convolve_rounding(&comp_avg_res_lo, &offset_const,
@@ -223,11 +228,11 @@ void av1_highbd_jnt_convolve_2d_copy_avx2(
}
}
-void av1_highbd_jnt_convolve_2d_avx2(
+void av1_highbd_dist_wtd_convolve_2d_avx2(
const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
int h, const InterpFilterParams *filter_params_x,
- const InterpFilterParams *filter_params_y, const int subpel_x_q4,
- const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
+ const InterpFilterParams *filter_params_y, const int subpel_x_qn,
+ const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);
CONV_BUF_TYPE *dst = conv_params->dst;
int dst_stride = conv_params->dst_stride;
@@ -244,7 +249,7 @@ void av1_highbd_jnt_convolve_2d_avx2(
__m256i s[8], coeffs_y[4], coeffs_x[4];
const int do_average = conv_params->do_average;
- const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+ const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
const int w0 = conv_params->fwd_offset;
const int w1 = conv_params->bck_offset;
@@ -272,8 +277,8 @@ void av1_highbd_jnt_convolve_2d_avx2(
const __m256i clip_pixel_to_bd =
_mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
- prepare_coeffs(filter_params_x, subpel_x_q4, coeffs_x);
- prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y);
+ prepare_coeffs(filter_params_x, subpel_x_qn, coeffs_x);
+ prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y);
for (j = 0; j < w; j += 8) {
/* Horizontal filter */
@@ -364,8 +369,9 @@ void av1_highbd_jnt_convolve_2d_avx2(
const __m256i data_ref_0 = _mm256_unpacklo_epi16(data_01, zero);
- const __m256i comp_avg_res = highbd_comp_avg(
- &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
+ const __m256i comp_avg_res =
+ highbd_comp_avg(&data_ref_0, &res_unsigned_lo, &wt0, &wt1,
+ use_dist_wtd_comp_avg);
const __m256i round_result = highbd_convolve_rounding(
&comp_avg_res, &offset_const, &rounding_const, rounding_shift);
@@ -409,10 +415,12 @@ void av1_highbd_jnt_convolve_2d_avx2(
const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_01, zero);
const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_01, zero);
- const __m256i comp_avg_res_lo = highbd_comp_avg(
- &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
- const __m256i comp_avg_res_hi = highbd_comp_avg(
- &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg);
+ const __m256i comp_avg_res_lo =
+ highbd_comp_avg(&data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1,
+ use_dist_wtd_comp_avg);
+ const __m256i comp_avg_res_hi =
+ highbd_comp_avg(&data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1,
+ use_dist_wtd_comp_avg);
const __m256i round_result_lo =
highbd_convolve_rounding(&comp_avg_res_lo, &offset_const,
@@ -456,24 +464,24 @@ void av1_highbd_jnt_convolve_2d_avx2(
}
}
-void av1_highbd_jnt_convolve_x_avx2(
+void av1_highbd_dist_wtd_convolve_x_avx2(
const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
int h, const InterpFilterParams *filter_params_x,
- const InterpFilterParams *filter_params_y, const int subpel_x_q4,
- const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
+ const InterpFilterParams *filter_params_y, const int subpel_x_qn,
+ const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
CONV_BUF_TYPE *dst = conv_params->dst;
int dst_stride = conv_params->dst_stride;
const int fo_horiz = filter_params_x->taps / 2 - 1;
const uint16_t *const src_ptr = src - fo_horiz;
const int bits = FILTER_BITS - conv_params->round_1;
(void)filter_params_y;
- (void)subpel_y_q4;
+ (void)subpel_y_qn;
int i, j;
__m256i s[4], coeffs_x[4];
const int do_average = conv_params->do_average;
- const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+ const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
const int w0 = conv_params->fwd_offset;
const int w1 = conv_params->bck_offset;
const __m256i wt0 = _mm256_set1_epi32(w0);
@@ -496,7 +504,7 @@ void av1_highbd_jnt_convolve_x_avx2(
_mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
assert(bits >= 0);
- prepare_coeffs(filter_params_x, subpel_x_q4, coeffs_x);
+ prepare_coeffs(filter_params_x, subpel_x_qn, coeffs_x);
for (j = 0; j < w; j += 8) {
/* Horizontal filter */
@@ -548,7 +556,7 @@ void av1_highbd_jnt_convolve_x_avx2(
const __m256i data_ref_0 = _mm256_unpacklo_epi16(data_01, zero);
const __m256i comp_avg_res = highbd_comp_avg(
- &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
+ &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_dist_wtd_comp_avg);
const __m256i round_result = highbd_convolve_rounding(
&comp_avg_res, &offset_const, &rounding_const, rounding_shift);
@@ -588,10 +596,12 @@ void av1_highbd_jnt_convolve_x_avx2(
const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_01, zero);
const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_01, zero);
- const __m256i comp_avg_res_lo = highbd_comp_avg(
- &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
- const __m256i comp_avg_res_hi = highbd_comp_avg(
- &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg);
+ const __m256i comp_avg_res_lo =
+ highbd_comp_avg(&data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1,
+ use_dist_wtd_comp_avg);
+ const __m256i comp_avg_res_hi =
+ highbd_comp_avg(&data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1,
+ use_dist_wtd_comp_avg);
const __m256i round_result_lo = highbd_convolve_rounding(
&comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift);
@@ -623,24 +633,24 @@ void av1_highbd_jnt_convolve_x_avx2(
}
}
-void av1_highbd_jnt_convolve_y_avx2(
+void av1_highbd_dist_wtd_convolve_y_avx2(
const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
int h, const InterpFilterParams *filter_params_x,
- const InterpFilterParams *filter_params_y, const int subpel_x_q4,
- const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
+ const InterpFilterParams *filter_params_y, const int subpel_x_qn,
+ const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
CONV_BUF_TYPE *dst = conv_params->dst;
int dst_stride = conv_params->dst_stride;
const int fo_vert = filter_params_y->taps / 2 - 1;
const uint16_t *const src_ptr = src - fo_vert * src_stride;
const int bits = FILTER_BITS - conv_params->round_0;
(void)filter_params_x;
- (void)subpel_x_q4;
+ (void)subpel_x_qn;
assert(bits >= 0);
int i, j;
__m256i s[8], coeffs_y[4];
const int do_average = conv_params->do_average;
- const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+ const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
const int w0 = conv_params->fwd_offset;
const int w1 = conv_params->bck_offset;
@@ -662,7 +672,7 @@ void av1_highbd_jnt_convolve_y_avx2(
_mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
const __m256i zero = _mm256_setzero_si256();
- prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y);
+ prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y);
for (j = 0; j < w; j += 8) {
const uint16_t *data = &src_ptr[j];
@@ -753,8 +763,9 @@ void av1_highbd_jnt_convolve_y_avx2(
const __m256i data_ref_0 = _mm256_unpacklo_epi16(data_01, zero);
- const __m256i comp_avg_res = highbd_comp_avg(
- &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
+ const __m256i comp_avg_res =
+ highbd_comp_avg(&data_ref_0, &res_unsigned_lo, &wt0, &wt1,
+ use_dist_wtd_comp_avg);
const __m256i round_result = highbd_convolve_rounding(
&comp_avg_res, &offset_const, &rounding_const, rounding_shift);
@@ -799,10 +810,12 @@ void av1_highbd_jnt_convolve_y_avx2(
const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_01, zero);
const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_01, zero);
- const __m256i comp_avg_res_lo = highbd_comp_avg(
- &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
- const __m256i comp_avg_res_hi = highbd_comp_avg(
- &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg);
+ const __m256i comp_avg_res_lo =
+ highbd_comp_avg(&data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1,
+ use_dist_wtd_comp_avg);
+ const __m256i comp_avg_res_hi =
+ highbd_comp_avg(&data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1,
+ use_dist_wtd_comp_avg);
const __m256i round_result_lo =
highbd_convolve_rounding(&comp_avg_res_lo, &offset_const,
diff --git a/media/libaom/src/av1/common/x86/highbd_jnt_convolve_sse4.c b/media/libaom/src/av1/common/x86/highbd_jnt_convolve_sse4.c
index 1a29985b5..f033a6f94 100644
--- a/media/libaom/src/av1/common/x86/highbd_jnt_convolve_sse4.c
+++ b/media/libaom/src/av1/common/x86/highbd_jnt_convolve_sse4.c
@@ -17,23 +17,23 @@
#include "aom_dsp/x86/convolve_sse2.h"
#include "aom_dsp/x86/convolve_sse4_1.h"
-void av1_highbd_jnt_convolve_y_sse4_1(
+void av1_highbd_dist_wtd_convolve_y_sse4_1(
const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
int h, const InterpFilterParams *filter_params_x,
- const InterpFilterParams *filter_params_y, const int subpel_x_q4,
- const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
+ const InterpFilterParams *filter_params_y, const int subpel_x_qn,
+ const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
CONV_BUF_TYPE *dst = conv_params->dst;
int dst_stride = conv_params->dst_stride;
const int fo_vert = filter_params_y->taps / 2 - 1;
const uint16_t *const src_ptr = src - fo_vert * src_stride;
const int bits = FILTER_BITS - conv_params->round_0;
(void)filter_params_x;
- (void)subpel_x_q4;
+ (void)subpel_x_qn;
assert(bits >= 0);
int i, j;
const int do_average = conv_params->do_average;
- const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+ const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
const int w0 = conv_params->fwd_offset;
const int w1 = conv_params->bck_offset;
@@ -56,7 +56,7 @@ void av1_highbd_jnt_convolve_y_sse4_1(
const __m128i zero = _mm_setzero_si128();
__m128i s[16], coeffs_y[4];
- prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y);
+ prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y);
for (j = 0; j < w; j += 8) {
const uint16_t *data = &src_ptr[j];
@@ -121,10 +121,12 @@ void av1_highbd_jnt_convolve_y_sse4_1(
const __m128i data_ref_0 = _mm_unpacklo_epi16(data_0, zero);
const __m128i data_ref_1 = _mm_unpacklo_epi16(data_1, zero);
- const __m128i comp_avg_res_0 = highbd_comp_avg_sse4_1(
- &data_ref_0, &res_unsigned_lo_0, &wt0, &wt1, use_jnt_comp_avg);
- const __m128i comp_avg_res_1 = highbd_comp_avg_sse4_1(
- &data_ref_1, &res_unsigned_lo_1, &wt0, &wt1, use_jnt_comp_avg);
+ const __m128i comp_avg_res_0 =
+ highbd_comp_avg_sse4_1(&data_ref_0, &res_unsigned_lo_0, &wt0,
+ &wt1, use_dist_wtd_comp_avg);
+ const __m128i comp_avg_res_1 =
+ highbd_comp_avg_sse4_1(&data_ref_1, &res_unsigned_lo_1, &wt0,
+ &wt1, use_dist_wtd_comp_avg);
const __m128i round_result_0 =
highbd_convolve_rounding_sse2(&comp_avg_res_0, &offset_const,
@@ -186,16 +188,16 @@ void av1_highbd_jnt_convolve_y_sse4_1(
const __m128i comp_avg_res_lo_0 =
highbd_comp_avg_sse4_1(&data_ref_0_lo_0, &res_unsigned_lo_0,
- &wt0, &wt1, use_jnt_comp_avg);
+ &wt0, &wt1, use_dist_wtd_comp_avg);
const __m128i comp_avg_res_lo_1 =
highbd_comp_avg_sse4_1(&data_ref_0_lo_1, &res_unsigned_lo_1,
- &wt0, &wt1, use_jnt_comp_avg);
+ &wt0, &wt1, use_dist_wtd_comp_avg);
const __m128i comp_avg_res_hi_0 =
highbd_comp_avg_sse4_1(&data_ref_0_hi_0, &res_unsigned_hi_0,
- &wt0, &wt1, use_jnt_comp_avg);
+ &wt0, &wt1, use_dist_wtd_comp_avg);
const __m128i comp_avg_res_hi_1 =
highbd_comp_avg_sse4_1(&data_ref_0_hi_1, &res_unsigned_hi_1,
- &wt0, &wt1, use_jnt_comp_avg);
+ &wt0, &wt1, use_dist_wtd_comp_avg);
const __m128i round_result_lo_0 =
highbd_convolve_rounding_sse2(&comp_avg_res_lo_0, &offset_const,
@@ -257,24 +259,24 @@ void av1_highbd_jnt_convolve_y_sse4_1(
}
}
-void av1_highbd_jnt_convolve_x_sse4_1(
+void av1_highbd_dist_wtd_convolve_x_sse4_1(
const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
int h, const InterpFilterParams *filter_params_x,
- const InterpFilterParams *filter_params_y, const int subpel_x_q4,
- const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
+ const InterpFilterParams *filter_params_y, const int subpel_x_qn,
+ const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
CONV_BUF_TYPE *dst = conv_params->dst;
int dst_stride = conv_params->dst_stride;
const int fo_horiz = filter_params_x->taps / 2 - 1;
const uint16_t *const src_ptr = src - fo_horiz;
const int bits = FILTER_BITS - conv_params->round_1;
(void)filter_params_y;
- (void)subpel_y_q4;
+ (void)subpel_y_qn;
int i, j;
__m128i s[4], coeffs_x[4];
const int do_average = conv_params->do_average;
- const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+ const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
const int w0 = conv_params->fwd_offset;
const int w1 = conv_params->bck_offset;
const __m128i wt0 = _mm_set1_epi32(w0);
@@ -297,7 +299,7 @@ void av1_highbd_jnt_convolve_x_sse4_1(
_mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
assert(bits >= 0);
- prepare_coeffs(filter_params_x, subpel_x_q4, coeffs_x);
+ prepare_coeffs(filter_params_x, subpel_x_qn, coeffs_x);
for (j = 0; j < w; j += 8) {
/* Horizontal filter */
@@ -339,7 +341,7 @@ void av1_highbd_jnt_convolve_x_sse4_1(
const __m128i data_ref_0 = _mm_unpacklo_epi16(data_0, zero);
const __m128i comp_avg_res = highbd_comp_avg_sse4_1(
- &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
+ &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_dist_wtd_comp_avg);
const __m128i round_result = highbd_convolve_rounding_sse2(
&comp_avg_res, &offset_const, &rounding_const, rounding_shift);
@@ -359,10 +361,12 @@ void av1_highbd_jnt_convolve_x_sse4_1(
const __m128i data_ref_0_lo = _mm_unpacklo_epi16(data_0, zero);
const __m128i data_ref_0_hi = _mm_unpackhi_epi16(data_0, zero);
- const __m128i comp_avg_res_lo = highbd_comp_avg_sse4_1(
- &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
- const __m128i comp_avg_res_hi = highbd_comp_avg_sse4_1(
- &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg);
+ const __m128i comp_avg_res_lo =
+ highbd_comp_avg_sse4_1(&data_ref_0_lo, &res_unsigned_lo, &wt0,
+ &wt1, use_dist_wtd_comp_avg);
+ const __m128i comp_avg_res_hi =
+ highbd_comp_avg_sse4_1(&data_ref_0_hi, &res_unsigned_hi, &wt0,
+ &wt1, use_dist_wtd_comp_avg);
const __m128i round_result_lo = highbd_convolve_rounding_sse2(
&comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift);
diff --git a/media/libaom/src/av1/common/x86/highbd_txfm_utility_sse4.h b/media/libaom/src/av1/common/x86/highbd_txfm_utility_sse4.h
index 6f24e5948..5734810f5 100644
--- a/media/libaom/src/av1/common/x86/highbd_txfm_utility_sse4.h
+++ b/media/libaom/src/av1/common/x86/highbd_txfm_utility_sse4.h
@@ -75,13 +75,20 @@ static INLINE void transpose_16x16(const __m128i *in, __m128i *out) {
out[63]);
}
-static INLINE void transpose_32x32(const __m128i *input, __m128i *output) {
- for (int j = 0; j < 8; j++) {
- for (int i = 0; i < 8; i++) {
- TRANSPOSE_4X4(input[i * 32 + j + 0], input[i * 32 + j + 8],
- input[i * 32 + j + 16], input[i * 32 + j + 24],
- output[j * 32 + i + 0], output[j * 32 + i + 8],
- output[j * 32 + i + 16], output[j * 32 + i + 24]);
+static INLINE void transpose_8nx8n(const __m128i *input, __m128i *output,
+ const int width, const int height) {
+ const int numcol = height >> 2;
+ const int numrow = width >> 2;
+ for (int j = 0; j < numrow; j++) {
+ for (int i = 0; i < numcol; i++) {
+ TRANSPOSE_4X4(input[i * width + j + (numrow * 0)],
+ input[i * width + j + (numrow * 1)],
+ input[i * width + j + (numrow * 2)],
+ input[i * width + j + (numrow * 3)],
+ output[j * height + i + (numcol * 0)],
+ output[j * height + i + (numcol * 1)],
+ output[j * height + i + (numcol * 2)],
+ output[j * height + i + (numcol * 3)]);
}
}
}
diff --git a/media/libaom/src/av1/common/x86/highbd_warp_plane_sse4.c b/media/libaom/src/av1/common/x86/highbd_warp_plane_sse4.c
index 4bcab0564..60a819308 100644
--- a/media/libaom/src/av1/common/x86/highbd_warp_plane_sse4.c
+++ b/media/libaom/src/av1/common/x86/highbd_warp_plane_sse4.c
@@ -15,9 +15,9 @@
#include "av1/common/warped_motion.h"
-static const uint8_t warp_highbd_arrange_bytes[16] = {
- 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
-};
+static const uint8_t warp_highbd_arrange_bytes[16] = { 0, 2, 4, 6, 8, 10,
+ 12, 14, 1, 3, 5, 7,
+ 9, 11, 13, 15 };
static const uint8_t highbd_shuffle_alpha0_mask0[16] = {
0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
@@ -25,24 +25,28 @@ static const uint8_t highbd_shuffle_alpha0_mask0[16] = {
static const uint8_t highbd_shuffle_alpha0_mask1[16] = {
4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7
};
-static const uint8_t highbd_shuffle_alpha0_mask2[16] = {
- 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11
-};
-static const uint8_t highbd_shuffle_alpha0_mask3[16] = {
- 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15
-};
+static const uint8_t highbd_shuffle_alpha0_mask2[16] = { 8, 9, 10, 11, 8, 9,
+ 10, 11, 8, 9, 10, 11,
+ 8, 9, 10, 11 };
+static const uint8_t highbd_shuffle_alpha0_mask3[16] = { 12, 13, 14, 15, 12, 13,
+ 14, 15, 12, 13, 14, 15,
+ 12, 13, 14, 15 };
static INLINE void highbd_prepare_horizontal_filter_coeff(int alpha, int sx,
__m128i *coeff) {
// Filter even-index pixels
- const __m128i tmp_0 = _mm_loadu_si128(
- (__m128i *)(warped_filter + ((sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS)));
- const __m128i tmp_2 = _mm_loadu_si128(
- (__m128i *)(warped_filter + ((sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS)));
- const __m128i tmp_4 = _mm_loadu_si128(
- (__m128i *)(warped_filter + ((sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS)));
- const __m128i tmp_6 = _mm_loadu_si128(
- (__m128i *)(warped_filter + ((sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_0 =
+ _mm_loadu_si128((__m128i *)(av1_warped_filter +
+ ((sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_2 =
+ _mm_loadu_si128((__m128i *)(av1_warped_filter +
+ ((sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_4 =
+ _mm_loadu_si128((__m128i *)(av1_warped_filter +
+ ((sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_6 =
+ _mm_loadu_si128((__m128i *)(av1_warped_filter +
+ ((sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS)));
// coeffs 0 1 0 1 2 3 2 3 for pixels 0, 2
const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2);
@@ -63,14 +67,18 @@ static INLINE void highbd_prepare_horizontal_filter_coeff(int alpha, int sx,
coeff[6] = _mm_unpackhi_epi64(tmp_12, tmp_14);
// Filter odd-index pixels
- const __m128i tmp_1 = _mm_loadu_si128(
- (__m128i *)(warped_filter + ((sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS)));
- const __m128i tmp_3 = _mm_loadu_si128(
- (__m128i *)(warped_filter + ((sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS)));
- const __m128i tmp_5 = _mm_loadu_si128(
- (__m128i *)(warped_filter + ((sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS)));
- const __m128i tmp_7 = _mm_loadu_si128(
- (__m128i *)(warped_filter + ((sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_1 =
+ _mm_loadu_si128((__m128i *)(av1_warped_filter +
+ ((sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_3 =
+ _mm_loadu_si128((__m128i *)(av1_warped_filter +
+ ((sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_5 =
+ _mm_loadu_si128((__m128i *)(av1_warped_filter +
+ ((sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_7 =
+ _mm_loadu_si128((__m128i *)(av1_warped_filter +
+ ((sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS)));
const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3);
const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7);
@@ -87,7 +95,7 @@ static INLINE void highbd_prepare_horizontal_filter_coeff_alpha0(
int sx, __m128i *coeff) {
// Filter coeff
const __m128i tmp_0 = _mm_loadu_si128(
- (__m128i *)(warped_filter + (sx >> WARPEDDIFF_PREC_BITS)));
+ (__m128i *)(av1_warped_filter + (sx >> WARPEDDIFF_PREC_BITS)));
coeff[0] = _mm_shuffle_epi8(
tmp_0, _mm_loadu_si128((__m128i *)highbd_shuffle_alpha0_mask0));
@@ -454,16 +462,16 @@ void av1_highbd_warp_affine_sse4_1(const int32_t *mat, const uint16_t *ref,
// Filter even-index pixels
const __m128i tmp_0 = _mm_loadu_si128(
- (__m128i *)(warped_filter +
+ (__m128i *)(av1_warped_filter +
((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
const __m128i tmp_2 = _mm_loadu_si128(
- (__m128i *)(warped_filter +
+ (__m128i *)(av1_warped_filter +
((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
const __m128i tmp_4 = _mm_loadu_si128(
- (__m128i *)(warped_filter +
+ (__m128i *)(av1_warped_filter +
((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
const __m128i tmp_6 = _mm_loadu_si128(
- (__m128i *)(warped_filter +
+ (__m128i *)(av1_warped_filter +
((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));
const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2);
@@ -491,16 +499,16 @@ void av1_highbd_warp_affine_sse4_1(const int32_t *mat, const uint16_t *ref,
const __m128i src_7 = _mm_unpackhi_epi16(src[6], src[7]);
const __m128i tmp_1 = _mm_loadu_si128(
- (__m128i *)(warped_filter +
+ (__m128i *)(av1_warped_filter +
((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
const __m128i tmp_3 = _mm_loadu_si128(
- (__m128i *)(warped_filter +
+ (__m128i *)(av1_warped_filter +
((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
const __m128i tmp_5 = _mm_loadu_si128(
- (__m128i *)(warped_filter +
+ (__m128i *)(av1_warped_filter +
((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
const __m128i tmp_7 = _mm_loadu_si128(
- (__m128i *)(warped_filter +
+ (__m128i *)(av1_warped_filter +
((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));
const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3);
@@ -537,7 +545,7 @@ void av1_highbd_warp_affine_sse4_1(const int32_t *mat, const uint16_t *ref,
__m128i *const dst16 = (__m128i *)&pred[(i + k + 4) * p_stride + j];
__m128i p_32 = _mm_cvtepu16_epi32(_mm_loadl_epi64(p));
- if (conv_params->use_jnt_comp_avg) {
+ if (conv_params->use_dist_wtd_comp_avg) {
res_lo = _mm_add_epi32(_mm_mullo_epi32(p_32, wt0),
_mm_mullo_epi32(res_lo, wt1));
res_lo = _mm_srai_epi32(res_lo, DIST_PRECISION_BITS);
@@ -570,7 +578,7 @@ void av1_highbd_warp_affine_sse4_1(const int32_t *mat, const uint16_t *ref,
(__m128i *)&pred[(i + k + 4) * p_stride + j + 4];
__m128i p4_32 = _mm_cvtepu16_epi32(_mm_loadl_epi64(p4));
- if (conv_params->use_jnt_comp_avg) {
+ if (conv_params->use_dist_wtd_comp_avg) {
res_hi = _mm_add_epi32(_mm_mullo_epi32(p4_32, wt0),
_mm_mullo_epi32(res_hi, wt1));
res_hi = _mm_srai_epi32(res_hi, DIST_PRECISION_BITS);
diff --git a/media/libaom/src/av1/common/x86/intra_edge_sse4.c b/media/libaom/src/av1/common/x86/intra_edge_sse4.c
index 0c857b583..fc69f41d7 100644
--- a/media/libaom/src/av1/common/x86/intra_edge_sse4.c
+++ b/media/libaom/src/av1/common/x86/intra_edge_sse4.c
@@ -212,10 +212,10 @@ void av1_upsample_intra_edge_sse4_1(uint8_t *p, int sz) {
{ -1, 9, 9, -1, -1, 9, 9, -1, -1, 9, 9, -1, -1, 9, 9, -1 }
};
- DECLARE_ALIGNED(16, static const int8_t, v_const[2][16]) = {
- { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 },
- { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
- };
+ DECLARE_ALIGNED(
+ 16, static const int8_t,
+ v_const[2][16]) = { { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 },
+ { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } };
// Extend first/last samples (upper-left p[-1], last p[sz-1])
// to support 4-tap filter
diff --git a/media/libaom/src/av1/common/x86/jnt_convolve_avx2.c b/media/libaom/src/av1/common/x86/jnt_convolve_avx2.c
index 9f2e2b457..6de61573e 100644
--- a/media/libaom/src/av1/common/x86/jnt_convolve_avx2.c
+++ b/media/libaom/src/av1/common/x86/jnt_convolve_avx2.c
@@ -23,8 +23,8 @@
static INLINE __m256i unpack_weights_avx2(ConvolveParams *conv_params) {
const int w0 = conv_params->fwd_offset;
const int w1 = conv_params->bck_offset;
- const __m256i wt0 = _mm256_set1_epi16(w0);
- const __m256i wt1 = _mm256_set1_epi16(w1);
+ const __m256i wt0 = _mm256_set1_epi16((int16_t)w0);
+ const __m256i wt1 = _mm256_set1_epi16((int16_t)w1);
const __m256i wt = _mm256_unpacklo_epi16(wt0, wt1);
return wt;
}
@@ -35,22 +35,20 @@ static INLINE __m256i load_line2_avx2(const void *a, const void *b) {
_mm256_castsi128_si256(_mm_loadu_si128((__m128i *)b)), 0x20);
}
-void av1_jnt_convolve_x_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
- int dst_stride0, int w, int h,
- const InterpFilterParams *filter_params_x,
- const InterpFilterParams *filter_params_y,
- const int subpel_x_q4, const int subpel_y_q4,
- ConvolveParams *conv_params) {
+void av1_dist_wtd_convolve_x_avx2(const uint8_t *src, int src_stride,
+ uint8_t *dst0, int dst_stride0, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_qn, const int subpel_y_qn,
+ ConvolveParams *conv_params) {
CONV_BUF_TYPE *dst = conv_params->dst;
int dst_stride = conv_params->dst_stride;
const int bd = 8;
- int i, j;
- const int fo_horiz = filter_params_x->taps / 2 - 1;
- const uint8_t *const src_ptr = src - fo_horiz;
+ int i, j, is_horiz_4tap = 0;
const int bits = FILTER_BITS - conv_params->round_1;
const __m256i wt = unpack_weights_avx2(conv_params);
const int do_average = conv_params->do_average;
- const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+ const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
const int offset_0 =
bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
@@ -58,87 +56,147 @@ void av1_jnt_convolve_x_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
const int rounding_shift =
2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
const __m256i rounding_const = _mm256_set1_epi16((1 << rounding_shift) >> 1);
- __m256i filt[4], coeffs[4];
assert(bits >= 0);
assert(conv_params->round_0 > 0);
- filt[0] = _mm256_load_si256((__m256i const *)filt_global_avx2);
- filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
- filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
- filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
-
- prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs);
-
const __m256i round_const =
_mm256_set1_epi16((1 << (conv_params->round_0 - 1)) >> 1);
const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0 - 1);
(void)filter_params_y;
- (void)subpel_y_q4;
+ (void)subpel_y_qn;
- for (i = 0; i < h; i += 2) {
- const uint8_t *src_data = src_ptr + i * src_stride;
- CONV_BUF_TYPE *dst_data = dst + i * dst_stride;
- for (j = 0; j < w; j += 8) {
- const __m256i data =
- load_line2_avx2(&src_data[j], &src_data[j + src_stride]);
+ __m256i filt[4], coeffs[4];
- __m256i res = convolve_lowbd_x(data, coeffs, filt);
+ filt[0] = _mm256_load_si256((__m256i const *)filt_global_avx2);
+ filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
- res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const), round_shift);
+ prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs);
- res = _mm256_slli_epi16(res, bits);
+ // Condition for checking valid horz_filt taps
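+  // (outer taps are zero, so the 8-tap filter reduces to 4 taps)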
+ if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs[0], coeffs[3]), 0)))
+ is_horiz_4tap = 1;
- const __m256i res_unsigned = _mm256_add_epi16(res, offset_const);
+ // horz_filt as 4 tap
+ if (is_horiz_4tap) {
+ const int fo_horiz = 1;
+ const uint8_t *const src_ptr = src - fo_horiz;
+ for (i = 0; i < h; i += 2) {
+ const uint8_t *src_data = src_ptr + i * src_stride;
+ CONV_BUF_TYPE *dst_data = dst + i * dst_stride;
+ for (j = 0; j < w; j += 8) {
+ const __m256i data =
+ load_line2_avx2(&src_data[j], &src_data[j + src_stride]);
- // Accumulate values into the destination buffer
- if (do_average) {
- const __m256i data_ref_0 =
- load_line2_avx2(&dst_data[j], &dst_data[j + dst_stride]);
- const __m256i comp_avg_res =
- comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
+ __m256i res = convolve_lowbd_x_4tap(data, coeffs + 1, filt);
+ res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const), round_shift);
+ res = _mm256_slli_epi16(res, bits);
- const __m256i round_result = convolve_rounding(
- &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+ const __m256i res_unsigned = _mm256_add_epi16(res, offset_const);
- const __m256i res_8 = _mm256_packus_epi16(round_result, round_result);
- const __m128i res_0 = _mm256_castsi256_si128(res_8);
- const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
+ // Accumulate values into the destination buffer
+ if (do_average) {
+ const __m256i data_ref_0 =
+ load_line2_avx2(&dst_data[j], &dst_data[j + dst_stride]);
+ const __m256i comp_avg_res =
+ comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg);
- if (w > 4) {
- _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
- _mm_storel_epi64(
- (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1);
+ const __m256i round_result = convolve_rounding(
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+ const __m256i res_8 = _mm256_packus_epi16(round_result, round_result);
+ const __m128i res_0 = _mm256_castsi256_si128(res_8);
+ const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
+
+ if (w > 4) {
+ _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
+ _mm_storel_epi64(
+ (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1);
+ } else {
+ *(uint32_t *)(&dst0[i * dst_stride0 + j]) =
+ _mm_cvtsi128_si32(res_0);
+ *(uint32_t *)(&dst0[i * dst_stride0 + j + dst_stride0]) =
+ _mm_cvtsi128_si32(res_1);
+ }
} else {
- *(uint32_t *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_0);
- *(uint32_t *)(&dst0[i * dst_stride0 + j + dst_stride0]) =
- _mm_cvtsi128_si32(res_1);
+ const __m128i res_0 = _mm256_castsi256_si128(res_unsigned);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
+
+ const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+ res_1);
}
- } else {
- const __m128i res_0 = _mm256_castsi256_si128(res_unsigned);
- _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
+ }
+ }
+ } else {
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const uint8_t *const src_ptr = src - fo_horiz;
+
+ filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
+ filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
+ for (i = 0; i < h; i += 2) {
+ const uint8_t *src_data = src_ptr + i * src_stride;
+ CONV_BUF_TYPE *dst_data = dst + i * dst_stride;
+ for (j = 0; j < w; j += 8) {
+ const __m256i data =
+ load_line2_avx2(&src_data[j], &src_data[j + src_stride]);
+
+ __m256i res = convolve_lowbd_x(data, coeffs, filt);
+
+ res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const), round_shift);
+
+ res = _mm256_slli_epi16(res, bits);
+
+ const __m256i res_unsigned = _mm256_add_epi16(res, offset_const);
+
+ // Accumulate values into the destination buffer
+ if (do_average) {
+ const __m256i data_ref_0 =
+ load_line2_avx2(&dst_data[j], &dst_data[j + dst_stride]);
+ const __m256i comp_avg_res =
+ comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg);
+
+ const __m256i round_result = convolve_rounding(
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+ const __m256i res_8 = _mm256_packus_epi16(round_result, round_result);
+ const __m128i res_0 = _mm256_castsi256_si128(res_8);
+ const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
- const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1);
- _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
- res_1);
+ if (w > 4) {
+ _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
+ _mm_storel_epi64(
+ (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1);
+ } else {
+ *(uint32_t *)(&dst0[i * dst_stride0 + j]) =
+ _mm_cvtsi128_si32(res_0);
+ *(uint32_t *)(&dst0[i * dst_stride0 + j + dst_stride0]) =
+ _mm_cvtsi128_si32(res_1);
+ }
+ } else {
+ const __m128i res_0 = _mm256_castsi256_si128(res_unsigned);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
+
+ const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+ res_1);
+ }
}
}
}
}
-void av1_jnt_convolve_y_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
- int dst_stride0, int w, int h,
- const InterpFilterParams *filter_params_x,
- const InterpFilterParams *filter_params_y,
- const int subpel_x_q4, const int subpel_y_q4,
- ConvolveParams *conv_params) {
+void av1_dist_wtd_convolve_y_avx2(const uint8_t *src, int src_stride,
+ uint8_t *dst0, int dst_stride0, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_qn, const int subpel_y_qn,
+ ConvolveParams *conv_params) {
CONV_BUF_TYPE *dst = conv_params->dst;
int dst_stride = conv_params->dst_stride;
const int bd = 8;
- int i, j;
- const int fo_vert = filter_params_y->taps / 2 - 1;
- const uint8_t *const src_ptr = src - fo_vert * src_stride;
+ int i, j, is_vert_4tap = 0;
// +1 to compensate for dividing the filter coeffs by 2
const int left_shift = FILTER_BITS - conv_params->round_0 + 1;
const __m256i round_const =
@@ -146,7 +204,7 @@ void av1_jnt_convolve_y_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1);
const __m256i wt = unpack_weights_avx2(conv_params);
const int do_average = conv_params->do_average;
- const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+ const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
const int offset_0 =
bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
@@ -162,201 +220,395 @@ void av1_jnt_convolve_y_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
assert((FILTER_BITS - conv_params->round_0) >= 0);
- prepare_coeffs_lowbd(filter_params_y, subpel_y_q4, coeffs);
+ prepare_coeffs_lowbd(filter_params_y, subpel_y_qn, coeffs);
(void)conv_params;
(void)filter_params_x;
- (void)subpel_x_q4;
-
- for (j = 0; j < w; j += 16) {
- const uint8_t *data = &src_ptr[j];
- __m256i src6;
- // Load lines a and b. Line a to lower 128, line b to upper 128
- {
- __m256i src_ab[7];
- __m256i src_a[7];
- src_a[0] = _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data));
- for (int kk = 0; kk < 6; ++kk) {
- data += src_stride;
- src_a[kk + 1] =
- _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data));
- src_ab[kk] = _mm256_permute2x128_si256(src_a[kk], src_a[kk + 1], 0x20);
+ (void)subpel_x_qn;
+
+ // Condition for checking valid vert_filt taps
+ if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs[0], coeffs[3]), 0)))
+ is_vert_4tap = 1;
+
+ if (is_vert_4tap) {
+ const int fo_vert = 1;
+ const uint8_t *const src_ptr = src - fo_vert * src_stride;
+ for (j = 0; j < w; j += 16) {
+ const uint8_t *data = &src_ptr[j];
+ __m256i src4;
+ // Load lines a and b. Line a to lower 128, line b to upper 128
+ {
+ __m256i src_ab[4];
+ __m256i src_a[5];
+ src_a[0] = _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data));
+ for (int kk = 0; kk < 4; ++kk) {
+ data += src_stride;
+ src_a[kk + 1] =
+ _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data));
+ src_ab[kk] =
+ _mm256_permute2x128_si256(src_a[kk], src_a[kk + 1], 0x20);
+ }
+ src4 = src_a[4];
+ s[0] = _mm256_unpacklo_epi8(src_ab[0], src_ab[1]);
+ s[1] = _mm256_unpacklo_epi8(src_ab[2], src_ab[3]);
+
+ s[3] = _mm256_unpackhi_epi8(src_ab[0], src_ab[1]);
+ s[4] = _mm256_unpackhi_epi8(src_ab[2], src_ab[3]);
}
- src6 = src_a[6];
- s[0] = _mm256_unpacklo_epi8(src_ab[0], src_ab[1]);
- s[1] = _mm256_unpacklo_epi8(src_ab[2], src_ab[3]);
- s[2] = _mm256_unpacklo_epi8(src_ab[4], src_ab[5]);
- s[4] = _mm256_unpackhi_epi8(src_ab[0], src_ab[1]);
- s[5] = _mm256_unpackhi_epi8(src_ab[2], src_ab[3]);
- s[6] = _mm256_unpackhi_epi8(src_ab[4], src_ab[5]);
- }
- for (i = 0; i < h; i += 2) {
- data = &src_ptr[(i + 7) * src_stride + j];
- const __m256i src7 =
- _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data));
- const __m256i src_67a = _mm256_permute2x128_si256(src6, src7, 0x20);
+ for (i = 0; i < h; i += 2) {
+ data = &src_ptr[(i + 5) * src_stride + j];
+ const __m256i src5 =
+ _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data));
+ const __m256i src_45a = _mm256_permute2x128_si256(src4, src5, 0x20);
- src6 = _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + src_stride)));
- const __m256i src_78a = _mm256_permute2x128_si256(src7, src6, 0x20);
+ src4 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + src_stride)));
+ const __m256i src_56a = _mm256_permute2x128_si256(src5, src4, 0x20);
- s[3] = _mm256_unpacklo_epi8(src_67a, src_78a);
- s[7] = _mm256_unpackhi_epi8(src_67a, src_78a);
+ s[2] = _mm256_unpacklo_epi8(src_45a, src_56a);
+ s[5] = _mm256_unpackhi_epi8(src_45a, src_56a);
- __m256i res_lo = convolve_lowbd(s, coeffs);
+ __m256i res_lo = convolve_lowbd_4tap(s, coeffs + 1);
- res_lo = _mm256_add_epi16(res_lo, offset_const_1);
+ res_lo = _mm256_add_epi16(res_lo, offset_const_1);
- const __m256i res_lo_0_32b = _mm256_unpacklo_epi16(res_lo, zero);
- const __m256i res_lo_0_shift =
- _mm256_slli_epi32(res_lo_0_32b, left_shift);
- const __m256i res_lo_0_round = _mm256_sra_epi32(
- _mm256_add_epi32(res_lo_0_shift, round_const), round_shift);
+ const __m256i res_lo_0_32b = _mm256_unpacklo_epi16(res_lo, zero);
+ const __m256i res_lo_0_shift =
+ _mm256_slli_epi32(res_lo_0_32b, left_shift);
+ const __m256i res_lo_0_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_lo_0_shift, round_const), round_shift);
- const __m256i res_lo_1_32b = _mm256_unpackhi_epi16(res_lo, zero);
- const __m256i res_lo_1_shift =
- _mm256_slli_epi32(res_lo_1_32b, left_shift);
- const __m256i res_lo_1_round = _mm256_sra_epi32(
- _mm256_add_epi32(res_lo_1_shift, round_const), round_shift);
+ const __m256i res_lo_1_32b = _mm256_unpackhi_epi16(res_lo, zero);
+ const __m256i res_lo_1_shift =
+ _mm256_slli_epi32(res_lo_1_32b, left_shift);
+ const __m256i res_lo_1_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_lo_1_shift, round_const), round_shift);
- const __m256i res_lo_round =
- _mm256_packs_epi32(res_lo_0_round, res_lo_1_round);
+ const __m256i res_lo_round =
+ _mm256_packs_epi32(res_lo_0_round, res_lo_1_round);
- const __m256i res_lo_unsigned =
- _mm256_add_epi16(res_lo_round, offset_const_2);
+ const __m256i res_lo_unsigned =
+ _mm256_add_epi16(res_lo_round, offset_const_2);
- if (w - j < 16) {
- if (do_average) {
- const __m256i data_ref_0 = load_line2_avx2(
- &dst[i * dst_stride + j], &dst[i * dst_stride + j + dst_stride]);
- const __m256i comp_avg_res =
- comp_avg(&data_ref_0, &res_lo_unsigned, &wt, use_jnt_comp_avg);
+ if (w - j < 16) {
+ if (do_average) {
+ const __m256i data_ref_0 =
+ load_line2_avx2(&dst[i * dst_stride + j],
+ &dst[i * dst_stride + j + dst_stride]);
+ const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_lo_unsigned,
+ &wt, use_dist_wtd_comp_avg);
- const __m256i round_result = convolve_rounding(
- &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+ const __m256i round_result = convolve_rounding(
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
- const __m256i res_8 = _mm256_packus_epi16(round_result, round_result);
- const __m128i res_0 = _mm256_castsi256_si128(res_8);
- const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
+ const __m256i res_8 =
+ _mm256_packus_epi16(round_result, round_result);
+ const __m128i res_0 = _mm256_castsi256_si128(res_8);
+ const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
- if (w - j > 4) {
- _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
- _mm_storel_epi64(
- (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1);
+ if (w - j > 4) {
+ _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
+ _mm_storel_epi64(
+ (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])),
+ res_1);
+ } else {
+ *(uint32_t *)(&dst0[i * dst_stride0 + j]) =
+ _mm_cvtsi128_si32(res_0);
+ *(uint32_t *)(&dst0[i * dst_stride0 + j + dst_stride0]) =
+ _mm_cvtsi128_si32(res_1);
+ }
} else {
- *(uint32_t *)(&dst0[i * dst_stride0 + j]) =
- _mm_cvtsi128_si32(res_0);
- *(uint32_t *)(&dst0[i * dst_stride0 + j + dst_stride0]) =
- _mm_cvtsi128_si32(res_1);
+ const __m128i res_0 = _mm256_castsi256_si128(res_lo_unsigned);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
+
+ const __m128i res_1 = _mm256_extracti128_si256(res_lo_unsigned, 1);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+ res_1);
}
} else {
- const __m128i res_0 = _mm256_castsi256_si128(res_lo_unsigned);
- _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
+ __m256i res_hi = convolve_lowbd_4tap(s + 3, coeffs + 1);
- const __m128i res_1 = _mm256_extracti128_si256(res_lo_unsigned, 1);
- _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
- res_1);
+ res_hi = _mm256_add_epi16(res_hi, offset_const_1);
+
+ const __m256i res_hi_0_32b = _mm256_unpacklo_epi16(res_hi, zero);
+ const __m256i res_hi_0_shift =
+ _mm256_slli_epi32(res_hi_0_32b, left_shift);
+ const __m256i res_hi_0_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_hi_0_shift, round_const), round_shift);
+
+ const __m256i res_hi_1_32b = _mm256_unpackhi_epi16(res_hi, zero);
+ const __m256i res_hi_1_shift =
+ _mm256_slli_epi32(res_hi_1_32b, left_shift);
+ const __m256i res_hi_1_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_hi_1_shift, round_const), round_shift);
+
+ const __m256i res_hi_round =
+ _mm256_packs_epi32(res_hi_0_round, res_hi_1_round);
+
+ const __m256i res_hi_unsigned =
+ _mm256_add_epi16(res_hi_round, offset_const_2);
+
+ if (do_average) {
+ const __m256i data_ref_0_lo =
+ load_line2_avx2(&dst[i * dst_stride + j],
+ &dst[i * dst_stride + j + dst_stride]);
+
+ const __m256i data_ref_0_hi =
+ load_line2_avx2(&dst[i * dst_stride + j + 8],
+ &dst[i * dst_stride + j + 8 + dst_stride]);
+
+ const __m256i comp_avg_res_lo = comp_avg(
+ &data_ref_0_lo, &res_lo_unsigned, &wt, use_dist_wtd_comp_avg);
+
+ const __m256i comp_avg_res_hi = comp_avg(
+ &data_ref_0_hi, &res_hi_unsigned, &wt, use_dist_wtd_comp_avg);
+
+ const __m256i round_result_lo =
+ convolve_rounding(&comp_avg_res_lo, &offset_const,
+ &rounding_const, rounding_shift);
+
+ const __m256i round_result_hi =
+ convolve_rounding(&comp_avg_res_hi, &offset_const,
+ &rounding_const, rounding_shift);
+
+ const __m256i res_8 =
+ _mm256_packus_epi16(round_result_lo, round_result_hi);
+ const __m128i res_0 = _mm256_castsi256_si128(res_8);
+ const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
+
+ _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
+ _mm_store_si128(
+ (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1);
+
+ } else {
+ const __m128i res_lo_0 = _mm256_castsi256_si128(res_lo_unsigned);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_lo_0);
+
+ const __m128i res_lo_1 =
+ _mm256_extracti128_si256(res_lo_unsigned, 1);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+ res_lo_1);
+
+ const __m128i res_hi_0 = _mm256_castsi256_si128(res_hi_unsigned);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + 8]),
+ res_hi_0);
+
+ const __m128i res_hi_1 =
+ _mm256_extracti128_si256(res_hi_unsigned, 1);
+ _mm_store_si128(
+ (__m128i *)(&dst[i * dst_stride + j + 8 + dst_stride]),
+ res_hi_1);
+ }
}
- } else {
- __m256i res_hi = convolve_lowbd(s + 4, coeffs);
+ s[0] = s[1];
+ s[1] = s[2];
- res_hi = _mm256_add_epi16(res_hi, offset_const_1);
+ s[3] = s[4];
+ s[4] = s[5];
+ }
+ }
+ } else {
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const uint8_t *const src_ptr = src - fo_vert * src_stride;
+ for (j = 0; j < w; j += 16) {
+ const uint8_t *data = &src_ptr[j];
+ __m256i src6;
+ // Load lines a and b. Line a to lower 128, line b to upper 128
+ {
+ __m256i src_ab[7];
+ __m256i src_a[7];
+ src_a[0] = _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data));
+ for (int kk = 0; kk < 6; ++kk) {
+ data += src_stride;
+ src_a[kk + 1] =
+ _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data));
+ src_ab[kk] =
+ _mm256_permute2x128_si256(src_a[kk], src_a[kk + 1], 0x20);
+ }
+ src6 = src_a[6];
+ s[0] = _mm256_unpacklo_epi8(src_ab[0], src_ab[1]);
+ s[1] = _mm256_unpacklo_epi8(src_ab[2], src_ab[3]);
+ s[2] = _mm256_unpacklo_epi8(src_ab[4], src_ab[5]);
+ s[4] = _mm256_unpackhi_epi8(src_ab[0], src_ab[1]);
+ s[5] = _mm256_unpackhi_epi8(src_ab[2], src_ab[3]);
+ s[6] = _mm256_unpackhi_epi8(src_ab[4], src_ab[5]);
+ }
- const __m256i res_hi_0_32b = _mm256_unpacklo_epi16(res_hi, zero);
- const __m256i res_hi_0_shift =
- _mm256_slli_epi32(res_hi_0_32b, left_shift);
- const __m256i res_hi_0_round = _mm256_sra_epi32(
- _mm256_add_epi32(res_hi_0_shift, round_const), round_shift);
+ for (i = 0; i < h; i += 2) {
+ data = &src_ptr[(i + 7) * src_stride + j];
+ const __m256i src7 =
+ _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data));
+ const __m256i src_67a = _mm256_permute2x128_si256(src6, src7, 0x20);
- const __m256i res_hi_1_32b = _mm256_unpackhi_epi16(res_hi, zero);
- const __m256i res_hi_1_shift =
- _mm256_slli_epi32(res_hi_1_32b, left_shift);
- const __m256i res_hi_1_round = _mm256_sra_epi32(
- _mm256_add_epi32(res_hi_1_shift, round_const), round_shift);
+ src6 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + src_stride)));
+ const __m256i src_78a = _mm256_permute2x128_si256(src7, src6, 0x20);
- const __m256i res_hi_round =
- _mm256_packs_epi32(res_hi_0_round, res_hi_1_round);
+ s[3] = _mm256_unpacklo_epi8(src_67a, src_78a);
+ s[7] = _mm256_unpackhi_epi8(src_67a, src_78a);
- const __m256i res_hi_unsigned =
- _mm256_add_epi16(res_hi_round, offset_const_2);
+ __m256i res_lo = convolve_lowbd(s, coeffs);
- if (do_average) {
- const __m256i data_ref_0_lo = load_line2_avx2(
- &dst[i * dst_stride + j], &dst[i * dst_stride + j + dst_stride]);
+ res_lo = _mm256_add_epi16(res_lo, offset_const_1);
- const __m256i data_ref_0_hi =
- load_line2_avx2(&dst[i * dst_stride + j + 8],
- &dst[i * dst_stride + j + 8 + dst_stride]);
+ const __m256i res_lo_0_32b = _mm256_unpacklo_epi16(res_lo, zero);
+ const __m256i res_lo_0_shift =
+ _mm256_slli_epi32(res_lo_0_32b, left_shift);
+ const __m256i res_lo_0_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_lo_0_shift, round_const), round_shift);
- const __m256i comp_avg_res_lo =
- comp_avg(&data_ref_0_lo, &res_lo_unsigned, &wt, use_jnt_comp_avg);
+ const __m256i res_lo_1_32b = _mm256_unpackhi_epi16(res_lo, zero);
+ const __m256i res_lo_1_shift =
+ _mm256_slli_epi32(res_lo_1_32b, left_shift);
+ const __m256i res_lo_1_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_lo_1_shift, round_const), round_shift);
- const __m256i comp_avg_res_hi =
- comp_avg(&data_ref_0_hi, &res_hi_unsigned, &wt, use_jnt_comp_avg);
+ const __m256i res_lo_round =
+ _mm256_packs_epi32(res_lo_0_round, res_lo_1_round);
- const __m256i round_result_lo = convolve_rounding(
- &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift);
+ const __m256i res_lo_unsigned =
+ _mm256_add_epi16(res_lo_round, offset_const_2);
- const __m256i round_result_hi = convolve_rounding(
- &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift);
+ if (w - j < 16) {
+ if (do_average) {
+ const __m256i data_ref_0 =
+ load_line2_avx2(&dst[i * dst_stride + j],
+ &dst[i * dst_stride + j + dst_stride]);
+ const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_lo_unsigned,
+ &wt, use_dist_wtd_comp_avg);
- const __m256i res_8 =
- _mm256_packus_epi16(round_result_lo, round_result_hi);
- const __m128i res_0 = _mm256_castsi256_si128(res_8);
- const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
+ const __m256i round_result = convolve_rounding(
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+ const __m256i res_8 =
+ _mm256_packus_epi16(round_result, round_result);
+ const __m128i res_0 = _mm256_castsi256_si128(res_8);
+ const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
- _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
- _mm_store_si128(
- (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1);
+ if (w - j > 4) {
+ _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
+ _mm_storel_epi64(
+ (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])),
+ res_1);
+ } else {
+ *(uint32_t *)(&dst0[i * dst_stride0 + j]) =
+ _mm_cvtsi128_si32(res_0);
+ *(uint32_t *)(&dst0[i * dst_stride0 + j + dst_stride0]) =
+ _mm_cvtsi128_si32(res_1);
+ }
+ } else {
+ const __m128i res_0 = _mm256_castsi256_si128(res_lo_unsigned);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
+ const __m128i res_1 = _mm256_extracti128_si256(res_lo_unsigned, 1);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+ res_1);
+ }
} else {
- const __m128i res_lo_0 = _mm256_castsi256_si128(res_lo_unsigned);
- _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_lo_0);
+ __m256i res_hi = convolve_lowbd(s + 4, coeffs);
- const __m128i res_lo_1 = _mm256_extracti128_si256(res_lo_unsigned, 1);
- _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
- res_lo_1);
+ res_hi = _mm256_add_epi16(res_hi, offset_const_1);
+
+ const __m256i res_hi_0_32b = _mm256_unpacklo_epi16(res_hi, zero);
+ const __m256i res_hi_0_shift =
+ _mm256_slli_epi32(res_hi_0_32b, left_shift);
+ const __m256i res_hi_0_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_hi_0_shift, round_const), round_shift);
+
+ const __m256i res_hi_1_32b = _mm256_unpackhi_epi16(res_hi, zero);
+ const __m256i res_hi_1_shift =
+ _mm256_slli_epi32(res_hi_1_32b, left_shift);
+ const __m256i res_hi_1_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_hi_1_shift, round_const), round_shift);
+
+ const __m256i res_hi_round =
+ _mm256_packs_epi32(res_hi_0_round, res_hi_1_round);
- const __m128i res_hi_0 = _mm256_castsi256_si128(res_hi_unsigned);
- _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + 8]), res_hi_0);
+ const __m256i res_hi_unsigned =
+ _mm256_add_epi16(res_hi_round, offset_const_2);
- const __m128i res_hi_1 = _mm256_extracti128_si256(res_hi_unsigned, 1);
- _mm_store_si128(
- (__m128i *)(&dst[i * dst_stride + j + 8 + dst_stride]), res_hi_1);
+ if (do_average) {
+ const __m256i data_ref_0_lo =
+ load_line2_avx2(&dst[i * dst_stride + j],
+ &dst[i * dst_stride + j + dst_stride]);
+
+ const __m256i data_ref_0_hi =
+ load_line2_avx2(&dst[i * dst_stride + j + 8],
+ &dst[i * dst_stride + j + 8 + dst_stride]);
+
+ const __m256i comp_avg_res_lo = comp_avg(
+ &data_ref_0_lo, &res_lo_unsigned, &wt, use_dist_wtd_comp_avg);
+
+ const __m256i comp_avg_res_hi = comp_avg(
+ &data_ref_0_hi, &res_hi_unsigned, &wt, use_dist_wtd_comp_avg);
+
+ const __m256i round_result_lo =
+ convolve_rounding(&comp_avg_res_lo, &offset_const,
+ &rounding_const, rounding_shift);
+
+ const __m256i round_result_hi =
+ convolve_rounding(&comp_avg_res_hi, &offset_const,
+ &rounding_const, rounding_shift);
+
+ const __m256i res_8 =
+ _mm256_packus_epi16(round_result_lo, round_result_hi);
+ const __m128i res_0 = _mm256_castsi256_si128(res_8);
+ const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
+
+ _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
+ _mm_store_si128(
+ (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1);
+
+ } else {
+ const __m128i res_lo_0 = _mm256_castsi256_si128(res_lo_unsigned);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_lo_0);
+
+ const __m128i res_lo_1 =
+ _mm256_extracti128_si256(res_lo_unsigned, 1);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+ res_lo_1);
+
+ const __m128i res_hi_0 = _mm256_castsi256_si128(res_hi_unsigned);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + 8]),
+ res_hi_0);
+
+ const __m128i res_hi_1 =
+ _mm256_extracti128_si256(res_hi_unsigned, 1);
+ _mm_store_si128(
+ (__m128i *)(&dst[i * dst_stride + j + 8 + dst_stride]),
+ res_hi_1);
+ }
}
- }
- s[0] = s[1];
- s[1] = s[2];
- s[2] = s[3];
+ s[0] = s[1];
+ s[1] = s[2];
+ s[2] = s[3];
- s[4] = s[5];
- s[5] = s[6];
- s[6] = s[7];
+ s[4] = s[5];
+ s[5] = s[6];
+ s[6] = s[7];
+ }
}
}
}
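The is_vert_4tap test above ORs the packed outer coefficient pairs (after prepare_coeffs_lowbd, coeffs[0] carries taps 0/1 and coeffs[3] carries taps 6/7) and inspects the low 32-bit lane; if it is zero, the 8-tap kernel degenerates to 4 taps and the shorter loop with fewer loads and unpacks is taken. A scalar equivalent of that check, written as a sketch:

#include <stdint.h>

/* Scalar sketch of the 4-tap detection: an 8-tap interpolation kernel whose
 * outermost taps (0, 1, 6, 7) are all zero behaves exactly like a 4-tap
 * kernel, so the cheaper convolution path can be used. */
static int kernel_is_effectively_4tap(const int16_t kernel[8]) {
  return (kernel[0] | kernel[1] | kernel[6] | kernel[7]) == 0;
}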
-void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
- int dst_stride0, int w, int h,
- const InterpFilterParams *filter_params_x,
- const InterpFilterParams *filter_params_y,
- const int subpel_x_q4, const int subpel_y_q4,
- ConvolveParams *conv_params) {
+void av1_dist_wtd_convolve_2d_avx2(const uint8_t *src, int src_stride,
+ uint8_t *dst0, int dst_stride0, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_qn, const int subpel_y_qn,
+ ConvolveParams *conv_params) {
CONV_BUF_TYPE *dst = conv_params->dst;
int dst_stride = conv_params->dst_stride;
const int bd = 8;
DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);
- int im_h = h + filter_params_y->taps - 1;
+
int im_stride = 8;
- int i, j;
- const int fo_vert = filter_params_y->taps / 2 - 1;
- const int fo_horiz = filter_params_x->taps / 2 - 1;
- const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
+ int i, is_horiz_4tap = 0, is_vert_4tap = 0;
const __m256i wt = unpack_weights_avx2(conv_params);
const int do_average = conv_params->do_average;
- const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+ const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
const int offset_0 =
bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
@@ -364,18 +616,9 @@ void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
const int rounding_shift =
2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
const __m256i rounding_const = _mm256_set1_epi16((1 << rounding_shift) >> 1);
- __m256i filt[4], s[8], coeffs_x[4], coeffs_y[4];
assert(conv_params->round_0 > 0);
- filt[0] = _mm256_load_si256((__m256i const *)filt_global_avx2);
- filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
- filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
- filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
-
- prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs_x);
- prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y);
-
const __m256i round_const_h = _mm256_set1_epi16(
((1 << (conv_params->round_0 - 1)) >> 1) + (1 << (bd + FILTER_BITS - 2)));
const __m128i round_shift_h = _mm_cvtsi32_si128(conv_params->round_0 - 1);
@@ -385,9 +628,29 @@ void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
(1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1)));
const __m128i round_shift_v = _mm_cvtsi32_si128(conv_params->round_1);
- for (j = 0; j < w; j += 8) {
- /* Horizontal filter */
- {
+ __m256i filt[4], coeffs_x[4], coeffs_y[4];
+
+ filt[0] = _mm256_load_si256((__m256i const *)filt_global_avx2);
+ filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
+
+ prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs_x);
+ prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y);
+
+ // Condition for checking valid horz_filt taps
+ if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs_x[0], coeffs_x[3]), 0)))
+ is_horiz_4tap = 1;
+
+ // Condition for checking valid vert_filt taps
+ if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs_y[0], coeffs_y[3]), 0)))
+ is_vert_4tap = 1;
+
+ if (is_horiz_4tap) {
+ int im_h = h + filter_params_y->taps - 1;
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const int fo_horiz = 1;
+ const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
+ for (int j = 0; j < w; j += 8) {
+ /* Horizontal filter */
const uint8_t *src_h = src_ptr + j;
for (i = 0; i < im_h; i += 2) {
__m256i data =
@@ -396,49 +659,59 @@ void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
data = _mm256_inserti128_si256(
data, _mm_loadu_si128((__m128i *)(src_h + src_stride)), 1);
src_h += (src_stride << 1);
- __m256i res = convolve_lowbd_x(data, coeffs_x, filt);
+ __m256i res = convolve_lowbd_x_4tap(data, coeffs_x + 1, filt);
res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h),
round_shift_h);
_mm256_store_si256((__m256i *)&im_block[i * im_stride], res);
}
+ DIST_WTD_CONVOLVE_VERTICAL_FILTER_8TAP;
}
+ } else if (is_vert_4tap) {
+ int im_h = h + 3;
+ const int fo_vert = 1;
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
+
+ filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
+ filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
+
+ for (int j = 0; j < w; j += 8) {
+ /* Horizontal filter */
+ const uint8_t *src_h = src_ptr + j;
+ DIST_WTD_CONVOLVE_HORIZONTAL_FILTER_8TAP;
- /* Vertical filter */
- {
+ /* Vertical filter */
+ __m256i s[6];
__m256i s0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride));
__m256i s1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride));
__m256i s2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride));
__m256i s3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride));
- __m256i s4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride));
- __m256i s5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride));
s[0] = _mm256_unpacklo_epi16(s0, s1);
s[1] = _mm256_unpacklo_epi16(s2, s3);
- s[2] = _mm256_unpacklo_epi16(s4, s5);
- s[4] = _mm256_unpackhi_epi16(s0, s1);
- s[5] = _mm256_unpackhi_epi16(s2, s3);
- s[6] = _mm256_unpackhi_epi16(s4, s5);
+ s[3] = _mm256_unpackhi_epi16(s0, s1);
+ s[4] = _mm256_unpackhi_epi16(s2, s3);
for (i = 0; i < h; i += 2) {
const int16_t *data = &im_block[i * im_stride];
- const __m256i s6 =
- _mm256_loadu_si256((__m256i *)(data + 6 * im_stride));
- const __m256i s7 =
- _mm256_loadu_si256((__m256i *)(data + 7 * im_stride));
+ const __m256i s4 =
+ _mm256_loadu_si256((__m256i *)(data + 4 * im_stride));
+ const __m256i s5 =
+ _mm256_loadu_si256((__m256i *)(data + 5 * im_stride));
- s[3] = _mm256_unpacklo_epi16(s6, s7);
- s[7] = _mm256_unpackhi_epi16(s6, s7);
+ s[2] = _mm256_unpacklo_epi16(s4, s5);
+ s[5] = _mm256_unpackhi_epi16(s4, s5);
- const __m256i res_a = convolve(s, coeffs_y);
+ const __m256i res_a = convolve_4tap(s, coeffs_y + 1);
const __m256i res_a_round = _mm256_sra_epi32(
_mm256_add_epi32(res_a, round_const_v), round_shift_v);
if (w - j > 4) {
- const __m256i res_b = convolve(s + 4, coeffs_y);
+ const __m256i res_b = convolve_4tap(s + 3, coeffs_y + 1);
const __m256i res_b_round = _mm256_sra_epi32(
_mm256_add_epi32(res_b, round_const_v), round_shift_v);
const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_b_round);
@@ -448,8 +721,8 @@ void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
const __m256i data_ref_0 =
load_line2_avx2(&dst[i * dst_stride + j],
&dst[i * dst_stride + j + dst_stride]);
- const __m256i comp_avg_res =
- comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
+ const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_unsigned,
+ &wt, use_dist_wtd_comp_avg);
const __m256i round_result = convolve_rounding(
&comp_avg_res, &offset_const, &rounding_const, rounding_shift);
@@ -479,8 +752,8 @@ void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
load_line2_avx2(&dst[i * dst_stride + j],
&dst[i * dst_stride + j + dst_stride]);
- const __m256i comp_avg_res =
- comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
+ const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_unsigned,
+ &wt, use_dist_wtd_comp_avg);
const __m256i round_result = convolve_rounding(
&comp_avg_res, &offset_const, &rounding_const, rounding_shift);
@@ -504,38 +777,49 @@ void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
res_1);
}
}
-
s[0] = s[1];
s[1] = s[2];
- s[2] = s[3];
-
+ s[3] = s[4];
s[4] = s[5];
- s[5] = s[6];
- s[6] = s[7];
}
}
+ } else {
+ int im_h = h + filter_params_y->taps - 1;
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
+
+ filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
+ filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
+
+ for (int j = 0; j < w; j += 8) {
+ /* Horizontal filter */
+ const uint8_t *src_h = src_ptr + j;
+ DIST_WTD_CONVOLVE_HORIZONTAL_FILTER_8TAP;
+
+ DIST_WTD_CONVOLVE_VERTICAL_FILTER_8TAP;
+ }
}
}
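The 2D path above is a separable two-pass filter: an (h + taps - 1)-row horizontal pass writes a 16-bit intermediate block (im_block), and the vertical pass then filters down its columns; the DIST_WTD_CONVOLVE_HORIZONTAL/VERTICAL_FILTER_8TAP macros are the vectorized forms of those passes. The sketch below shows the scalar structure only; it omits the bit-depth offsets (offset_const and friends) and the compound-average step, and assumes src already points fo_vert rows above and fo_horiz columns left of the block, like src_ptr in the SIMD code:

#include <stdint.h>

/* Scalar sketch of the separable 8-tap two-pass convolution.  `im` is an
 * (h + 7) x w caller-provided scratch buffer; round_0/round_1 mirror the
 * ConvolveParams rounding amounts. */
static void dist_wtd_convolve_2d_scalar(const uint8_t *src, int src_stride,
                                        int16_t *im, int32_t *dst,
                                        int dst_stride, int w, int h,
                                        const int16_t fx[8],
                                        const int16_t fy[8],
                                        int round_0, int round_1) {
  const int im_h = h + 7;  /* taps - 1 extra rows for the vertical kernel */
  /* Horizontal pass into the intermediate buffer. */
  for (int y = 0; y < im_h; ++y)
    for (int x = 0; x < w; ++x) {
      int32_t sum = 0;
      for (int k = 0; k < 8; ++k) sum += fx[k] * src[y * src_stride + x + k];
      im[y * w + x] = (int16_t)((sum + (1 << (round_0 - 1))) >> round_0);
    }
  /* Vertical pass over the intermediate columns. */
  for (int y = 0; y < h; ++y)
    for (int x = 0; x < w; ++x) {
      int32_t sum = 0;
      for (int k = 0; k < 8; ++k) sum += fy[k] * im[(y + k) * w + x];
      dst[y * dst_stride + x] = (sum + (1 << (round_1 - 1))) >> round_1;
    }
}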
-void av1_jnt_convolve_2d_copy_avx2(const uint8_t *src, int src_stride,
- uint8_t *dst0, int dst_stride0, int w, int h,
- const InterpFilterParams *filter_params_x,
- const InterpFilterParams *filter_params_y,
- const int subpel_x_q4, const int subpel_y_q4,
- ConvolveParams *conv_params) {
+void av1_dist_wtd_convolve_2d_copy_avx2(
+ const uint8_t *src, int src_stride, uint8_t *dst0, int dst_stride0, int w,
+ int h, const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y, const int subpel_x_qn,
+ const int subpel_y_qn, ConvolveParams *conv_params) {
const int bd = 8;
CONV_BUF_TYPE *dst = conv_params->dst;
int dst_stride = conv_params->dst_stride;
(void)filter_params_x;
(void)filter_params_y;
- (void)subpel_x_q4;
- (void)subpel_y_q4;
+ (void)subpel_x_qn;
+ (void)subpel_y_qn;
const int bits =
FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
const __m128i left_shift = _mm_cvtsi32_si128(bits);
const int do_average = conv_params->do_average;
- const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+ const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
const __m256i wt = unpack_weights_avx2(conv_params);
const __m256i zero = _mm256_setzero_si256();
@@ -562,7 +846,7 @@ void av1_jnt_convolve_2d_copy_avx2(const uint8_t *src, int src_stride,
_mm256_loadu_si256((__m256i *)(&dst[i * dst_stride + j]));
const __m256i comp_avg_res =
- comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
+ comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg);
const __m256i round_result = convolve_rounding(
&comp_avg_res, &offset_const, &rounding_const, rounding_shift);
@@ -600,7 +884,7 @@ void av1_jnt_convolve_2d_copy_avx2(const uint8_t *src, int src_stride,
const __m256i data_ref_0 = load_line2_avx2(
&dst[i * dst_stride + j], &dst[i * dst_stride + j + dst_stride]);
const __m256i comp_avg_res =
- comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
+ comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg);
const __m256i round_result = convolve_rounding(
&comp_avg_res, &offset_const, &rounding_const, rounding_shift);
diff --git a/media/libaom/src/av1/common/x86/jnt_convolve_sse2.c b/media/libaom/src/av1/common/x86/jnt_convolve_sse2.c
index 87dc3242e..f8f640a11 100644
--- a/media/libaom/src/av1/common/x86/jnt_convolve_sse2.c
+++ b/media/libaom/src/av1/common/x86/jnt_convolve_sse2.c
@@ -16,12 +16,12 @@
#include "aom_dsp/aom_filter.h"
#include "aom_dsp/x86/convolve_sse2.h"
-void av1_jnt_convolve_x_sse2(const uint8_t *src, int src_stride, uint8_t *dst0,
- int dst_stride0, int w, int h,
- const InterpFilterParams *filter_params_x,
- const InterpFilterParams *filter_params_y,
- const int subpel_x_q4, const int subpel_y_q4,
- ConvolveParams *conv_params) {
+void av1_dist_wtd_convolve_x_sse2(const uint8_t *src, int src_stride,
+ uint8_t *dst0, int dst_stride0, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_qn, const int subpel_y_qn,
+ ConvolveParams *conv_params) {
const int bd = 8;
CONV_BUF_TYPE *dst = conv_params->dst;
const int dst_stride = conv_params->dst_stride;
@@ -37,7 +37,7 @@ void av1_jnt_convolve_x_sse2(const uint8_t *src, int src_stride, uint8_t *dst0,
const __m128i wt1 = _mm_set1_epi16(w1);
const __m128i wt = _mm_unpacklo_epi16(wt0, wt1);
const int do_average = conv_params->do_average;
- const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+ const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
const int offset_0 =
bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
@@ -48,9 +48,9 @@ void av1_jnt_convolve_x_sse2(const uint8_t *src, int src_stride, uint8_t *dst0,
__m128i coeffs[4];
(void)filter_params_y;
- (void)subpel_y_q4;
+ (void)subpel_y_qn;
- prepare_coeffs(filter_params_x, subpel_x_q4, coeffs);
+ prepare_coeffs(filter_params_x, subpel_x_qn, coeffs);
if (w == 4) {
do {
@@ -77,7 +77,7 @@ void av1_jnt_convolve_x_sse2(const uint8_t *src, int src_stride, uint8_t *dst0,
const __m128i data_ref_0 = _mm_loadu_si128((__m128i *)dst);
const __m128i comp_avg_res =
- comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
+ comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg);
const __m128i round_result = convolve_rounding(
&comp_avg_res, &offset_const, &rounding_const, rounding_shift);
@@ -134,7 +134,7 @@ void av1_jnt_convolve_x_sse2(const uint8_t *src, int src_stride, uint8_t *dst0,
_mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]));
const __m128i comp_avg_res =
- comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
+ comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg);
const __m128i round_result = convolve_rounding(
&comp_avg_res, &offset_const, &rounding_const, rounding_shift);
@@ -150,12 +150,12 @@ void av1_jnt_convolve_x_sse2(const uint8_t *src, int src_stride, uint8_t *dst0,
}
}
-void av1_jnt_convolve_y_sse2(const uint8_t *src, int src_stride, uint8_t *dst0,
- int dst_stride0, int w, int h,
- const InterpFilterParams *filter_params_x,
- const InterpFilterParams *filter_params_y,
- const int subpel_x_q4, const int subpel_y_q4,
- ConvolveParams *conv_params) {
+void av1_dist_wtd_convolve_y_sse2(const uint8_t *src, int src_stride,
+ uint8_t *dst0, int dst_stride0, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_qn, const int subpel_y_qn,
+ ConvolveParams *conv_params) {
const int bd = 8;
CONV_BUF_TYPE *dst = conv_params->dst;
const int dst_stride = conv_params->dst_stride;
@@ -167,7 +167,7 @@ void av1_jnt_convolve_y_sse2(const uint8_t *src, int src_stride, uint8_t *dst0,
const __m128i wt1 = _mm_set1_epi16(conv_params->bck_offset);
const __m128i wt = _mm_unpacklo_epi16(wt0, wt1);
const int do_average = conv_params->do_average;
- const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+ const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
const int offset_0 =
bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
@@ -180,9 +180,9 @@ void av1_jnt_convolve_y_sse2(const uint8_t *src, int src_stride, uint8_t *dst0,
__m128i coeffs[4];
(void)filter_params_x;
- (void)subpel_x_q4;
+ (void)subpel_x_qn;
- prepare_coeffs(filter_params_y, subpel_y_q4, coeffs);
+ prepare_coeffs(filter_params_y, subpel_y_qn, coeffs);
if (w == 4) {
__m128i s[8], src6, res, res_shift;
@@ -225,7 +225,7 @@ void av1_jnt_convolve_y_sse2(const uint8_t *src, int src_stride, uint8_t *dst0,
const __m128i data_ref_0 = _mm_loadu_si128((__m128i *)dst);
const __m128i comp_avg_res =
- comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
+ comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg);
const __m128i round_result = convolve_rounding(
&comp_avg_res, &offset_const, &rounding_const, rounding_shift);
@@ -254,7 +254,7 @@ void av1_jnt_convolve_y_sse2(const uint8_t *src, int src_stride, uint8_t *dst0,
const __m128i data_ref_0 = _mm_loadu_si128((__m128i *)dst);
const __m128i comp_avg_res =
- comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
+ comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg);
const __m128i round_result = convolve_rounding(
&comp_avg_res, &offset_const, &rounding_const, rounding_shift);
@@ -331,7 +331,7 @@ void av1_jnt_convolve_y_sse2(const uint8_t *src, int src_stride, uint8_t *dst0,
_mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]));
const __m128i comp_avg_res =
- comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
+ comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg);
const __m128i round_result = convolve_rounding(
&comp_avg_res, &offset_const, &rounding_const, rounding_shift);
@@ -360,7 +360,7 @@ void av1_jnt_convolve_y_sse2(const uint8_t *src, int src_stride, uint8_t *dst0,
_mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]));
const __m128i comp_avg_res =
- comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
+ comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg);
const __m128i round_result = convolve_rounding(
&comp_avg_res, &offset_const, &rounding_const, rounding_shift);
@@ -383,3 +383,233 @@ void av1_jnt_convolve_y_sse2(const uint8_t *src, int src_stride, uint8_t *dst0,
} while (j < w);
}
}
+
+void av1_dist_wtd_convolve_2d_sse2(const uint8_t *src, int src_stride,
+ uint8_t *dst0, int dst_stride0, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_qn, const int subpel_y_qn,
+ ConvolveParams *conv_params) {
+ CONV_BUF_TYPE *dst = conv_params->dst;
+ int dst_stride = conv_params->dst_stride;
+ const int bd = 8;
+
+ DECLARE_ALIGNED(16, int16_t,
+ im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]);
+ int im_h = h + filter_params_y->taps - 1;
+ int im_stride = MAX_SB_SIZE;
+ int i, j;
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const int do_average = conv_params->do_average;
+ const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
+ const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
+
+ const __m128i zero = _mm_setzero_si128();
+
+ const int w0 = conv_params->fwd_offset;
+ const int w1 = conv_params->bck_offset;
+ const __m128i wt0 = _mm_set1_epi16(w0);
+ const __m128i wt1 = _mm_set1_epi16(w1);
+ const __m128i wt = _mm_unpacklo_epi16(wt0, wt1);
+
+ const int offset_0 =
+ bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
+ const __m128i offset_const = _mm_set1_epi16(offset);
+ const int rounding_shift =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const __m128i rounding_const = _mm_set1_epi16((1 << rounding_shift) >> 1);
+
+ /* Horizontal filter */
+ {
+ const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
+ filter_params_x, subpel_x_qn & SUBPEL_MASK);
+ const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter);
+
+ // coeffs 0 1 0 1 2 3 2 3
+ const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x);
+ // coeffs 4 5 4 5 6 7 6 7
+ const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x);
+
+ // coeffs 0 1 0 1 0 1 0 1
+ const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
+ // coeffs 2 3 2 3 2 3 2 3
+ const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
+ // coeffs 4 5 4 5 4 5 4 5
+ const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
+ // coeffs 6 7 6 7 6 7 6 7
+ const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
+
+ const __m128i round_const = _mm_set1_epi32(
+ ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1)));
+ const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0);
+
+ for (i = 0; i < im_h; ++i) {
+ for (j = 0; j < w; j += 8) {
+ __m128i temp_lo, temp_hi;
+ const __m128i data =
+ _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
+
+ const __m128i src_lo = _mm_unpacklo_epi8(data, zero);
+ const __m128i src_hi = _mm_unpackhi_epi8(data, zero);
+
+ // Filter even-index pixels
+ const __m128i res_0 = _mm_madd_epi16(src_lo, coeff_01);
+ temp_lo = _mm_srli_si128(src_lo, 4);
+ temp_hi = _mm_slli_si128(src_hi, 12);
+ const __m128i src_2 = _mm_or_si128(temp_hi, temp_lo);
+ const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
+ temp_lo = _mm_srli_si128(src_lo, 8);
+ temp_hi = _mm_slli_si128(src_hi, 8);
+ const __m128i src_4 = _mm_or_si128(temp_hi, temp_lo);
+ const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
+ temp_lo = _mm_srli_si128(src_lo, 12);
+ temp_hi = _mm_slli_si128(src_hi, 4);
+ const __m128i src_6 = _mm_or_si128(temp_hi, temp_lo);
+ const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
+
+ __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
+ _mm_add_epi32(res_2, res_6));
+ res_even =
+ _mm_sra_epi32(_mm_add_epi32(res_even, round_const), round_shift);
+
+ // Filter odd-index pixels
+ temp_lo = _mm_srli_si128(src_lo, 2);
+ temp_hi = _mm_slli_si128(src_hi, 14);
+ const __m128i src_1 = _mm_or_si128(temp_hi, temp_lo);
+ const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
+ temp_lo = _mm_srli_si128(src_lo, 6);
+ temp_hi = _mm_slli_si128(src_hi, 10);
+ const __m128i src_3 = _mm_or_si128(temp_hi, temp_lo);
+ const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
+ temp_lo = _mm_srli_si128(src_lo, 10);
+ temp_hi = _mm_slli_si128(src_hi, 6);
+ const __m128i src_5 = _mm_or_si128(temp_hi, temp_lo);
+ const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
+ temp_lo = _mm_srli_si128(src_lo, 14);
+ temp_hi = _mm_slli_si128(src_hi, 2);
+ const __m128i src_7 = _mm_or_si128(temp_hi, temp_lo);
+ const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
+
+ __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
+ _mm_add_epi32(res_3, res_7));
+ res_odd =
+ _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), round_shift);
+
+ // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7
+ __m128i res = _mm_packs_epi32(res_even, res_odd);
+ _mm_store_si128((__m128i *)&im_block[i * im_stride + j], res);
+ }
+ }
+ }
+
+ /* Vertical filter */
+ {
+ const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
+ filter_params_y, subpel_y_qn & SUBPEL_MASK);
+ const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter);
+
+ // coeffs 0 1 0 1 2 3 2 3
+ const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
+ // coeffs 4 5 4 5 6 7 6 7
+ const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);
+
+ // coeffs 0 1 0 1 0 1 0 1
+ const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
+ // coeffs 2 3 2 3 2 3 2 3
+ const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
+ // coeffs 4 5 4 5 4 5 4 5
+ const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
+ // coeffs 6 7 6 7 6 7 6 7
+ const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
+
+ const __m128i round_const = _mm_set1_epi32(
+ ((1 << conv_params->round_1) >> 1) -
+ (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1)));
+ const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1);
+
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; j += 8) {
+ // Filter even-index pixels
+ const int16_t *data = &im_block[i * im_stride + j];
+ const __m128i src_0 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 0 * im_stride),
+ *(__m128i *)(data + 1 * im_stride));
+ const __m128i src_2 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 2 * im_stride),
+ *(__m128i *)(data + 3 * im_stride));
+ const __m128i src_4 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 4 * im_stride),
+ *(__m128i *)(data + 5 * im_stride));
+ const __m128i src_6 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 6 * im_stride),
+ *(__m128i *)(data + 7 * im_stride));
+
+ const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
+ const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
+ const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
+ const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
+
+ const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
+ _mm_add_epi32(res_4, res_6));
+
+ // Filter odd-index pixels
+ const __m128i src_1 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 0 * im_stride),
+ *(__m128i *)(data + 1 * im_stride));
+ const __m128i src_3 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 2 * im_stride),
+ *(__m128i *)(data + 3 * im_stride));
+ const __m128i src_5 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 4 * im_stride),
+ *(__m128i *)(data + 5 * im_stride));
+ const __m128i src_7 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 6 * im_stride),
+ *(__m128i *)(data + 7 * im_stride));
+
+ const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
+ const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
+ const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
+ const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
+
+ const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
+ _mm_add_epi32(res_5, res_7));
+
+ // Rearrange pixels back into the order 0 ... 7
+ const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
+ const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
+
+ const __m128i res_lo_round =
+ _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
+ const __m128i res_hi_round =
+ _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);
+
+ const __m128i res_16b = _mm_packs_epi32(res_lo_round, res_hi_round);
+ const __m128i res_unsigned = _mm_add_epi16(res_16b, offset_const);
+
+ // Accumulate values into the destination buffer
+ if (do_average) {
+ const __m128i data_ref_0 =
+ _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]));
+
+ const __m128i comp_avg_res =
+ comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg);
+
+ const __m128i round_result = convolve_rounding(
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+ const __m128i res_8 = _mm_packus_epi16(round_result, round_result);
+
+ if (w > 4)
+ _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_8);
+ else
+ *(uint32_t *)(&dst0[i * dst_stride0 + j]) =
+ _mm_cvtsi128_si32(res_8);
+ } else {
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_unsigned);
+ }
+ }
+ }
+ }
+}
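The SSE2 horizontal pass above duplicates the kernel as adjacent pairs ("coeffs 0 1 0 1 ...") and splits the work into even- and odd-index pixels because _mm_madd_epi16 multiplies pairs of 16-bit values and sums them into 32-bit lanes. A scalar sketch of one madd lane and of the full 8-tap sum it builds up (helper names are illustrative only):

#include <stdint.h>

/* One 32-bit lane of _mm_madd_epi16: src[k]*f[k] + src[k+1]*f[k+1]. */
static int32_t madd_pair(int16_t s0, int16_t s1, int16_t f0, int16_t f1) {
  return (int32_t)s0 * f0 + (int32_t)s1 * f1;
}

/* The even/odd grouping in the SIMD code sums the same four pair products
 * per output pixel; only the lane ordering differs. */
static int32_t convolve_8tap_scalar(const int16_t *src, const int16_t *f) {
  return madd_pair(src[0], src[1], f[0], f[1]) +
         madd_pair(src[2], src[3], f[2], f[3]) +
         madd_pair(src[4], src[5], f[4], f[5]) +
         madd_pair(src[6], src[7], f[6], f[7]);
}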
diff --git a/media/libaom/src/av1/common/x86/jnt_convolve_ssse3.c b/media/libaom/src/av1/common/x86/jnt_convolve_ssse3.c
index 822772782..f45e3b267 100644
--- a/media/libaom/src/av1/common/x86/jnt_convolve_ssse3.c
+++ b/media/libaom/src/av1/common/x86/jnt_convolve_ssse3.c
@@ -16,12 +16,11 @@
#include "aom_dsp/aom_filter.h"
#include "aom_dsp/x86/convolve_sse2.h"
-void av1_jnt_convolve_2d_ssse3(const uint8_t *src, int src_stride,
- uint8_t *dst0, int dst_stride0, int w, int h,
- const InterpFilterParams *filter_params_x,
- const InterpFilterParams *filter_params_y,
- const int subpel_x_q4, const int subpel_y_q4,
- ConvolveParams *conv_params) {
+void av1_dist_wtd_convolve_2d_ssse3(
+ const uint8_t *src, int src_stride, uint8_t *dst0, int dst_stride0, int w,
+ int h, const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y, const int subpel_x_qn,
+ const int subpel_y_qn, ConvolveParams *conv_params) {
CONV_BUF_TYPE *dst = conv_params->dst;
int dst_stride = conv_params->dst_stride;
const int bd = 8;
@@ -34,7 +33,7 @@ void av1_jnt_convolve_2d_ssse3(const uint8_t *src, int src_stride,
const int fo_vert = filter_params_y->taps / 2 - 1;
const int fo_horiz = filter_params_x->taps / 2 - 1;
const int do_average = conv_params->do_average;
- const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+ const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
const __m128i zero = _mm_setzero_si128();
@@ -56,7 +55,7 @@ void av1_jnt_convolve_2d_ssse3(const uint8_t *src, int src_stride,
/* Horizontal filter */
{
const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
- filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+ filter_params_x, subpel_x_qn & SUBPEL_MASK);
const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter);
// coeffs 0 1 0 1 2 3 2 3
@@ -124,7 +123,7 @@ void av1_jnt_convolve_2d_ssse3(const uint8_t *src, int src_stride,
/* Vertical filter */
{
const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
- filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+ filter_params_y, subpel_y_qn & SUBPEL_MASK);
const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter);
// coeffs 0 1 0 1 2 3 2 3
@@ -211,7 +210,7 @@ void av1_jnt_convolve_2d_ssse3(const uint8_t *src, int src_stride,
_mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]));
const __m128i comp_avg_res =
- comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
+ comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg);
const __m128i round_result = convolve_rounding(
&comp_avg_res, &offset_const, &rounding_const, rounding_shift);
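The convolve_rounding() call above undoes the unsigned bias that was added earlier (offset_const keeps the intermediate non-negative so it packs cleanly) and rounds the averaged value back down to pixel range before the final packus. A scalar sketch of that step, using the rounding_const = (1 << rounding_shift) >> 1 definition visible in these files:

#include <stdint.h>

/* Scalar sketch of convolve_rounding(): remove the bias, round to nearest,
 * shift back to 8-bit range. */
static int32_t convolve_rounding_scalar(int32_t comp_avg_res, int32_t offset,
                                        int rounding_shift) {
  const int32_t rounding = (1 << rounding_shift) >> 1;  /* rounding_const */
  return (comp_avg_res - offset + rounding) >> rounding_shift;
}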
diff --git a/media/libaom/src/av1/common/x86/reconinter_avx2.c b/media/libaom/src/av1/common/x86/reconinter_avx2.c
index f645e0454..a38bd8317 100644
--- a/media/libaom/src/av1/common/x86/reconinter_avx2.c
+++ b/media/libaom/src/av1/common/x86/reconinter_avx2.c
@@ -28,8 +28,8 @@ static INLINE __m256i calc_mask_avx2(const __m256i mask_base, const __m256i s0,
}
void av1_build_compound_diffwtd_mask_avx2(uint8_t *mask,
DIFFWTD_MASK_TYPE mask_type,
- const uint8_t *src0, int stride0,
- const uint8_t *src1, int stride1,
+ const uint8_t *src0, int src0_stride,
+ const uint8_t *src1, int src1_stride,
int h, int w) {
const int mb = (mask_type == DIFFWTD_38_INV) ? AOM_BLEND_A64_MAX_ALPHA : 0;
const __m256i y_mask_base = _mm256_set1_epi16(38 - mb);
@@ -37,18 +37,18 @@ void av1_build_compound_diffwtd_mask_avx2(uint8_t *mask,
if (4 == w) {
do {
const __m128i s0A = xx_loadl_32(src0);
- const __m128i s0B = xx_loadl_32(src0 + stride0);
- const __m128i s0C = xx_loadl_32(src0 + stride0 * 2);
- const __m128i s0D = xx_loadl_32(src0 + stride0 * 3);
+ const __m128i s0B = xx_loadl_32(src0 + src0_stride);
+ const __m128i s0C = xx_loadl_32(src0 + src0_stride * 2);
+ const __m128i s0D = xx_loadl_32(src0 + src0_stride * 3);
const __m128i s0AB = _mm_unpacklo_epi32(s0A, s0B);
const __m128i s0CD = _mm_unpacklo_epi32(s0C, s0D);
const __m128i s0ABCD = _mm_unpacklo_epi64(s0AB, s0CD);
const __m256i s0ABCD_w = _mm256_cvtepu8_epi16(s0ABCD);
const __m128i s1A = xx_loadl_32(src1);
- const __m128i s1B = xx_loadl_32(src1 + stride1);
- const __m128i s1C = xx_loadl_32(src1 + stride1 * 2);
- const __m128i s1D = xx_loadl_32(src1 + stride1 * 3);
+ const __m128i s1B = xx_loadl_32(src1 + src1_stride);
+ const __m128i s1C = xx_loadl_32(src1 + src1_stride * 2);
+ const __m128i s1D = xx_loadl_32(src1 + src1_stride * 3);
const __m128i s1AB = _mm_unpacklo_epi32(s1A, s1B);
const __m128i s1CD = _mm_unpacklo_epi32(s1C, s1D);
const __m128i s1ABCD = _mm_unpacklo_epi64(s1AB, s1CD);
@@ -58,40 +58,40 @@ void av1_build_compound_diffwtd_mask_avx2(uint8_t *mask,
const __m128i x_m8 =
_mm256_castsi256_si128(_mm256_permute4x64_epi64(m8, 0xd8));
xx_storeu_128(mask, x_m8);
- src0 += (stride0 << 2);
- src1 += (stride1 << 2);
+ src0 += (src0_stride << 2);
+ src1 += (src1_stride << 2);
mask += 16;
i += 4;
} while (i < h);
} else if (8 == w) {
do {
const __m128i s0A = xx_loadl_64(src0);
- const __m128i s0B = xx_loadl_64(src0 + stride0);
- const __m128i s0C = xx_loadl_64(src0 + stride0 * 2);
- const __m128i s0D = xx_loadl_64(src0 + stride0 * 3);
+ const __m128i s0B = xx_loadl_64(src0 + src0_stride);
+ const __m128i s0C = xx_loadl_64(src0 + src0_stride * 2);
+ const __m128i s0D = xx_loadl_64(src0 + src0_stride * 3);
const __m256i s0AC_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(s0A, s0C));
const __m256i s0BD_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(s0B, s0D));
const __m128i s1A = xx_loadl_64(src1);
- const __m128i s1B = xx_loadl_64(src1 + stride1);
- const __m128i s1C = xx_loadl_64(src1 + stride1 * 2);
- const __m128i s1D = xx_loadl_64(src1 + stride1 * 3);
+ const __m128i s1B = xx_loadl_64(src1 + src1_stride);
+ const __m128i s1C = xx_loadl_64(src1 + src1_stride * 2);
+ const __m128i s1D = xx_loadl_64(src1 + src1_stride * 3);
const __m256i s1AB_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(s1A, s1C));
const __m256i s1CD_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(s1B, s1D));
const __m256i m16AC = calc_mask_avx2(y_mask_base, s0AC_w, s1AB_w);
const __m256i m16BD = calc_mask_avx2(y_mask_base, s0BD_w, s1CD_w);
const __m256i m8 = _mm256_packus_epi16(m16AC, m16BD);
yy_storeu_256(mask, m8);
- src0 += stride0 << 2;
- src1 += stride1 << 2;
+ src0 += src0_stride << 2;
+ src1 += src1_stride << 2;
mask += 32;
i += 4;
} while (i < h);
} else if (16 == w) {
do {
const __m128i s0A = xx_load_128(src0);
- const __m128i s0B = xx_load_128(src0 + stride0);
+ const __m128i s0B = xx_load_128(src0 + src0_stride);
const __m128i s1A = xx_load_128(src1);
- const __m128i s1B = xx_load_128(src1 + stride1);
+ const __m128i s1B = xx_load_128(src1 + src1_stride);
const __m256i s0AL = _mm256_cvtepu8_epi16(s0A);
const __m256i s0BL = _mm256_cvtepu8_epi16(s0B);
const __m256i s1AL = _mm256_cvtepu8_epi16(s1A);
@@ -103,8 +103,8 @@ void av1_build_compound_diffwtd_mask_avx2(uint8_t *mask,
const __m256i m8 =
_mm256_permute4x64_epi64(_mm256_packus_epi16(m16AL, m16BL), 0xd8);
yy_storeu_256(mask, m8);
- src0 += stride0 << 1;
- src1 += stride1 << 1;
+ src0 += src0_stride << 1;
+ src1 += src1_stride << 1;
mask += 32;
i += 2;
} while (i < h);
@@ -127,8 +127,8 @@ void av1_build_compound_diffwtd_mask_avx2(uint8_t *mask,
yy_storeu_256(mask + j, m8);
j += 32;
} while (j < w);
- src0 += stride0;
- src1 += stride1;
+ src0 += src0_stride;
+ src1 += src1_stride;
mask += w;
i += 1;
} while (i < h);
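The mask builder above derives a per-pixel blend weight from the absolute difference of the two predictions, offset by the base of 38 and clamped to the blend range, with the DIFFWTD_38_INV variant folding in the inversion via the mask_base adjustment. A scalar sketch of the per-pixel rule; the constants (divisor 16 for DIFF_FACTOR, base 38, max alpha 64 for AOM_BLEND_A64_MAX_ALPHA) and the helper name are assumptions for illustration:

#include <stdint.h>
#include <stdlib.h>

/* Scalar sketch of one diffwtd mask pixel: larger inter-prediction
 * differences push the blend weight toward one source. */
static uint8_t diffwtd_mask_pixel(uint8_t p0, uint8_t p1, int inverse) {
  int m = 38 + abs((int)p0 - (int)p1) / 16;  /* base + scaled difference */
  if (m > 64) m = 64;                        /* clamp to max blend alpha */
  return (uint8_t)(inverse ? 64 - m : m);
}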
diff --git a/media/libaom/src/av1/common/x86/selfguided_avx2.c b/media/libaom/src/av1/common/x86/selfguided_avx2.c
index 0aaf1f454..3c5558dda 100644
--- a/media/libaom/src/av1/common/x86/selfguided_avx2.c
+++ b/media/libaom/src/av1/common/x86/selfguided_avx2.c
@@ -219,12 +219,12 @@ static __m256i compute_p(__m256i sum1, __m256i sum2, int bit_depth, int n) {
static void calc_ab(int32_t *A, int32_t *B, const int32_t *C, const int32_t *D,
int width, int height, int buf_stride, int bit_depth,
int sgr_params_idx, int radius_idx) {
- const sgr_params_type *const params = &sgr_params[sgr_params_idx];
+ const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
const int r = params->r[radius_idx];
const int n = (2 * r + 1) * (2 * r + 1);
const __m256i s = _mm256_set1_epi32(params->s[radius_idx]);
// one_over_n[n-1] is 2^12/n, so easily fits in an int16
- const __m256i one_over_n = _mm256_set1_epi32(one_by_x[n - 1]);
+ const __m256i one_over_n = _mm256_set1_epi32(av1_one_by_x[n - 1]);
const __m256i rnd_z = round_for_shift(SGRPROJ_MTABLE_BITS);
const __m256i rnd_res = round_for_shift(SGRPROJ_RECIP_BITS);
@@ -263,7 +263,7 @@ static void calc_ab(int32_t *A, int32_t *B, const int32_t *C, const int32_t *D,
SGRPROJ_MTABLE_BITS),
_mm256_set1_epi32(255));
- const __m256i a_res = _mm256_i32gather_epi32(x_by_xplus1, z, 4);
+ const __m256i a_res = _mm256_i32gather_epi32(av1_x_by_xplus1, z, 4);
yy_storeu_256(A + i * buf_stride + j, a_res);
@@ -356,12 +356,12 @@ static void calc_ab_fast(int32_t *A, int32_t *B, const int32_t *C,
const int32_t *D, int width, int height,
int buf_stride, int bit_depth, int sgr_params_idx,
int radius_idx) {
- const sgr_params_type *const params = &sgr_params[sgr_params_idx];
+ const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
const int r = params->r[radius_idx];
const int n = (2 * r + 1) * (2 * r + 1);
const __m256i s = _mm256_set1_epi32(params->s[radius_idx]);
// one_over_n[n-1] is 2^12/n, so easily fits in an int16
- const __m256i one_over_n = _mm256_set1_epi32(one_by_x[n - 1]);
+ const __m256i one_over_n = _mm256_set1_epi32(av1_one_by_x[n - 1]);
const __m256i rnd_z = round_for_shift(SGRPROJ_MTABLE_BITS);
const __m256i rnd_res = round_for_shift(SGRPROJ_RECIP_BITS);
@@ -400,7 +400,7 @@ static void calc_ab_fast(int32_t *A, int32_t *B, const int32_t *C,
SGRPROJ_MTABLE_BITS),
_mm256_set1_epi32(255));
- const __m256i a_res = _mm256_i32gather_epi32(x_by_xplus1, z, 4);
+ const __m256i a_res = _mm256_i32gather_epi32(av1_x_by_xplus1, z, 4);
yy_storeu_256(A + i * buf_stride + j, a_res);
@@ -604,7 +604,7 @@ int av1_selfguided_restoration_avx2(const uint8_t *dgd8, int width, int height,
integral_images(dgd0, dgd_stride, width_ext, height_ext, Ctl, Dtl,
buf_stride);
- const sgr_params_type *const params = &sgr_params[sgr_params_idx];
+ const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
// Write to flt0 and flt1
// If params->r == 0 we skip the corresponding filter. We only allow one of
// the radii to be 0, as having both equal to 0 would be equivalent to
@@ -630,11 +630,11 @@ int av1_selfguided_restoration_avx2(const uint8_t *dgd8, int width, int height,
return 0;
}
-void apply_selfguided_restoration_avx2(const uint8_t *dat8, int width,
- int height, int stride, int eps,
- const int *xqd, uint8_t *dst8,
- int dst_stride, int32_t *tmpbuf,
- int bit_depth, int highbd) {
+void av1_apply_selfguided_restoration_avx2(const uint8_t *dat8, int width,
+ int height, int stride, int eps,
+ const int *xqd, uint8_t *dst8,
+ int dst_stride, int32_t *tmpbuf,
+ int bit_depth, int highbd) {
int32_t *flt0 = tmpbuf;
int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX;
assert(width * height <= RESTORATION_UNITPELS_MAX);
@@ -642,9 +642,9 @@ void apply_selfguided_restoration_avx2(const uint8_t *dat8, int width,
dat8, width, height, stride, flt0, flt1, width, eps, bit_depth, highbd);
(void)ret;
assert(!ret);
- const sgr_params_type *const params = &sgr_params[eps];
+ const sgr_params_type *const params = &av1_sgr_params[eps];
int xq[2];
- decode_xq(xqd, xq, params);
+ av1_decode_xq(xqd, xq, params);
__m256i xq0 = _mm256_set1_epi32(xq[0]);
__m256i xq1 = _mm256_set1_epi32(xq[1]);
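The calc_ab paths above lean on two small lookup tables: av1_one_by_x, whose entry n-1 is the fixed-point reciprocal 2^12 / n (hence the "fits in an int16" comment), and av1_x_by_xplus1, gathered per lane to evaluate x/(x+1). The reciprocal table replaces a per-pixel integer division by the window size n = (2r + 1)^2 with a multiply and shift. A sketch of how such a table could be generated; round-to-nearest is assumed here and the exact rounding libaom uses is not restated:

#include <stdint.h>

/* Sketch: build a 2^12 / n reciprocal table so that (sum * tbl[n-1]) >> 12
 * approximates sum / n without a division in the inner loop. */
static void build_one_by_x(int16_t *tbl, int max_n) {
  for (int n = 1; n <= max_n; ++n)
    tbl[n - 1] = (int16_t)(((1 << 12) + n / 2) / n);  /* <= 4096, fits int16 */
}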
diff --git a/media/libaom/src/av1/common/x86/selfguided_sse4.c b/media/libaom/src/av1/common/x86/selfguided_sse4.c
index ea3f6d942..72c7708f1 100644
--- a/media/libaom/src/av1/common/x86/selfguided_sse4.c
+++ b/media/libaom/src/av1/common/x86/selfguided_sse4.c
@@ -170,12 +170,12 @@ static __m128i compute_p(__m128i sum1, __m128i sum2, int bit_depth, int n) {
static void calc_ab(int32_t *A, int32_t *B, const int32_t *C, const int32_t *D,
int width, int height, int buf_stride, int bit_depth,
int sgr_params_idx, int radius_idx) {
- const sgr_params_type *const params = &sgr_params[sgr_params_idx];
+ const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
const int r = params->r[radius_idx];
const int n = (2 * r + 1) * (2 * r + 1);
const __m128i s = _mm_set1_epi32(params->s[radius_idx]);
// one_over_n[n-1] is 2^12/n, so easily fits in an int16
- const __m128i one_over_n = _mm_set1_epi32(one_by_x[n - 1]);
+ const __m128i one_over_n = _mm_set1_epi32(av1_one_by_x[n - 1]);
const __m128i rnd_z = round_for_shift(SGRPROJ_MTABLE_BITS);
const __m128i rnd_res = round_for_shift(SGRPROJ_RECIP_BITS);
@@ -216,10 +216,11 @@ static void calc_ab(int32_t *A, int32_t *B, const int32_t *C, const int32_t *D,
// 'Gather' type instructions are not available pre-AVX2, so synthesize a
// gather using scalar loads.
- const __m128i a_res = _mm_set_epi32(x_by_xplus1[_mm_extract_epi32(z, 3)],
- x_by_xplus1[_mm_extract_epi32(z, 2)],
- x_by_xplus1[_mm_extract_epi32(z, 1)],
- x_by_xplus1[_mm_extract_epi32(z, 0)]);
+ const __m128i a_res =
+ _mm_set_epi32(av1_x_by_xplus1[_mm_extract_epi32(z, 3)],
+ av1_x_by_xplus1[_mm_extract_epi32(z, 2)],
+ av1_x_by_xplus1[_mm_extract_epi32(z, 1)],
+ av1_x_by_xplus1[_mm_extract_epi32(z, 0)]);
xx_storeu_128(A + i * buf_stride + j, a_res);
@@ -310,12 +311,12 @@ static void calc_ab_fast(int32_t *A, int32_t *B, const int32_t *C,
const int32_t *D, int width, int height,
int buf_stride, int bit_depth, int sgr_params_idx,
int radius_idx) {
- const sgr_params_type *const params = &sgr_params[sgr_params_idx];
+ const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
const int r = params->r[radius_idx];
const int n = (2 * r + 1) * (2 * r + 1);
const __m128i s = _mm_set1_epi32(params->s[radius_idx]);
// one_over_n[n-1] is 2^12/n, so easily fits in an int16
- const __m128i one_over_n = _mm_set1_epi32(one_by_x[n - 1]);
+ const __m128i one_over_n = _mm_set1_epi32(av1_one_by_x[n - 1]);
const __m128i rnd_z = round_for_shift(SGRPROJ_MTABLE_BITS);
const __m128i rnd_res = round_for_shift(SGRPROJ_RECIP_BITS);
@@ -356,10 +357,11 @@ static void calc_ab_fast(int32_t *A, int32_t *B, const int32_t *C,
// 'Gather' type instructions are not available pre-AVX2, so synthesize a
// gather using scalar loads.
- const __m128i a_res = _mm_set_epi32(x_by_xplus1[_mm_extract_epi32(z, 3)],
- x_by_xplus1[_mm_extract_epi32(z, 2)],
- x_by_xplus1[_mm_extract_epi32(z, 1)],
- x_by_xplus1[_mm_extract_epi32(z, 0)]);
+ const __m128i a_res =
+ _mm_set_epi32(av1_x_by_xplus1[_mm_extract_epi32(z, 3)],
+ av1_x_by_xplus1[_mm_extract_epi32(z, 2)],
+ av1_x_by_xplus1[_mm_extract_epi32(z, 1)],
+ av1_x_by_xplus1[_mm_extract_epi32(z, 0)]);
xx_storeu_128(A + i * buf_stride + j, a_res);
@@ -554,7 +556,7 @@ int av1_selfguided_restoration_sse4_1(const uint8_t *dgd8, int width,
integral_images(dgd0, dgd_stride, width_ext, height_ext, Ctl, Dtl,
buf_stride);
- const sgr_params_type *const params = &sgr_params[sgr_params_idx];
+ const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
// Write to flt0 and flt1
// If params->r == 0 we skip the corresponding filter. We only allow one of
// the radii to be 0, as having both equal to 0 would be equivalent to
@@ -580,11 +582,11 @@ int av1_selfguided_restoration_sse4_1(const uint8_t *dgd8, int width,
return 0;
}
-void apply_selfguided_restoration_sse4_1(const uint8_t *dat8, int width,
- int height, int stride, int eps,
- const int *xqd, uint8_t *dst8,
- int dst_stride, int32_t *tmpbuf,
- int bit_depth, int highbd) {
+void av1_apply_selfguided_restoration_sse4_1(const uint8_t *dat8, int width,
+ int height, int stride, int eps,
+ const int *xqd, uint8_t *dst8,
+ int dst_stride, int32_t *tmpbuf,
+ int bit_depth, int highbd) {
int32_t *flt0 = tmpbuf;
int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX;
assert(width * height <= RESTORATION_UNITPELS_MAX);
@@ -592,9 +594,9 @@ void apply_selfguided_restoration_sse4_1(const uint8_t *dat8, int width,
dat8, width, height, stride, flt0, flt1, width, eps, bit_depth, highbd);
(void)ret;
assert(!ret);
- const sgr_params_type *const params = &sgr_params[eps];
+ const sgr_params_type *const params = &av1_sgr_params[eps];
int xq[2];
- decode_xq(xqd, xq, params);
+ av1_decode_xq(xqd, xq, params);
__m128i xq0 = _mm_set1_epi32(xq[0]);
__m128i xq1 = _mm_set1_epi32(xq[1]);
diff --git a/media/libaom/src/av1/common/x86/warp_plane_avx2.c b/media/libaom/src/av1/common/x86/warp_plane_avx2.c
new file mode 100644
index 000000000..53a928d76
--- /dev/null
+++ b/media/libaom/src/av1/common/x86/warp_plane_avx2.c
@@ -0,0 +1,1318 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+#include "config/av1_rtcd.h"
+#include "av1/common/warped_motion.h"
+#include "aom_dsp/x86/synonyms.h"
+
+DECLARE_ALIGNED(32, static const uint8_t, shuffle_alpha0_mask01_avx2[32]) = {
+ 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
+ 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+};
+
+DECLARE_ALIGNED(32, static const uint8_t, shuffle_alpha0_mask23_avx2[32]) = {
+ 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3,
+ 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3
+};
+
+DECLARE_ALIGNED(32, static const uint8_t, shuffle_alpha0_mask45_avx2[32]) = {
+ 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5,
+ 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5
+};
+
+DECLARE_ALIGNED(32, static const uint8_t, shuffle_alpha0_mask67_avx2[32]) = {
+ 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7,
+ 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7
+};
+
+DECLARE_ALIGNED(32, static const uint8_t, shuffle_gamma0_mask0_avx2[32]) = {
+ 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3,
+ 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
+};
+
+DECLARE_ALIGNED(32, static const uint8_t, shuffle_gamma0_mask1_avx2[32]) = {
+ 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7,
+ 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7
+};
+
+DECLARE_ALIGNED(32, static const uint8_t, shuffle_gamma0_mask2_avx2[32]) = {
+ 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11,
+ 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11
+};
+
+DECLARE_ALIGNED(32, static const uint8_t, shuffle_gamma0_mask3_avx2[32]) = {
+ 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15,
+ 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15
+};
+
+DECLARE_ALIGNED(32, static const uint8_t,
+ shuffle_src0[32]) = { 0, 2, 2, 4, 4, 6, 6, 8, 1, 3, 3,
+ 5, 5, 7, 7, 9, 0, 2, 2, 4, 4, 6,
+ 6, 8, 1, 3, 3, 5, 5, 7, 7, 9 };
+
+DECLARE_ALIGNED(32, static const uint8_t,
+ shuffle_src1[32]) = { 4, 6, 6, 8, 8, 10, 10, 12, 5, 7, 7,
+ 9, 9, 11, 11, 13, 4, 6, 6, 8, 8, 10,
+ 10, 12, 5, 7, 7, 9, 9, 11, 11, 13 };
+
+DECLARE_ALIGNED(32, static const uint8_t,
+ shuffle_src2[32]) = { 1, 3, 3, 5, 5, 7, 7, 9, 2, 4, 4,
+ 6, 6, 8, 8, 10, 1, 3, 3, 5, 5, 7,
+ 7, 9, 2, 4, 4, 6, 6, 8, 8, 10 };
+
+DECLARE_ALIGNED(32, static const uint8_t,
+ shuffle_src3[32]) = { 5, 7, 7, 9, 9, 11, 11, 13, 6, 8, 8,
+ 10, 10, 12, 12, 14, 5, 7, 7, 9, 9, 11,
+ 11, 13, 6, 8, 8, 10, 10, 12, 12, 14 };
+
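+// Apply the 8-tap horizontal filter to the two source rows packed in 'src'
+// (one row per 128-bit lane). Each _mm256_maddubs_epi16 evaluates one pair of
+// taps (0&2, 4&6, 1&3, 5&7) for all eight output pixels using the shuffle_src
+// masks above; the four partial sums are added, rounded and shifted into
+// horz_out[row].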
+static INLINE void filter_src_pixels_avx2(const __m256i src, __m256i *horz_out,
+ __m256i *coeff,
+ const __m256i *shuffle_src,
+ const __m256i *round_const,
+ const __m128i *shift, int row) {
+ const __m256i src_0 = _mm256_shuffle_epi8(src, shuffle_src[0]);
+ const __m256i src_1 = _mm256_shuffle_epi8(src, shuffle_src[1]);
+ const __m256i src_2 = _mm256_shuffle_epi8(src, shuffle_src[2]);
+ const __m256i src_3 = _mm256_shuffle_epi8(src, shuffle_src[3]);
+
+ const __m256i res_02 = _mm256_maddubs_epi16(src_0, coeff[0]);
+ const __m256i res_46 = _mm256_maddubs_epi16(src_1, coeff[1]);
+ const __m256i res_13 = _mm256_maddubs_epi16(src_2, coeff[2]);
+ const __m256i res_57 = _mm256_maddubs_epi16(src_3, coeff[3]);
+
+ const __m256i res_even = _mm256_add_epi16(res_02, res_46);
+ const __m256i res_odd = _mm256_add_epi16(res_13, res_57);
+ const __m256i res =
+ _mm256_add_epi16(_mm256_add_epi16(res_even, res_odd), *round_const);
+ horz_out[row] = _mm256_srl_epi16(res, *shift);
+}
+
+static INLINE void prepare_horizontal_filter_coeff_avx2(int alpha, int beta,
+ int sx,
+ __m256i *coeff) {
+ __m128i tmp_0 = _mm_loadl_epi64(
+ (__m128i *)&av1_filter_8bit[((unsigned)(sx + 0 * alpha)) >>
+ WARPEDDIFF_PREC_BITS]);
+ __m128i tmp_1 = _mm_loadl_epi64(
+ (__m128i *)&av1_filter_8bit[((unsigned)(sx + 1 * alpha)) >>
+ WARPEDDIFF_PREC_BITS]);
+ __m128i tmp_2 = _mm_loadl_epi64(
+ (__m128i *)&av1_filter_8bit[((unsigned)(sx + 2 * alpha)) >>
+ WARPEDDIFF_PREC_BITS]);
+ __m128i tmp_3 = _mm_loadl_epi64(
+ (__m128i *)&av1_filter_8bit[((unsigned)(sx + 3 * alpha)) >>
+ WARPEDDIFF_PREC_BITS]);
+
+ __m128i tmp_4 = _mm_loadl_epi64(
+ (__m128i *)&av1_filter_8bit[((unsigned)(sx + 4 * alpha)) >>
+ WARPEDDIFF_PREC_BITS]);
+ __m128i tmp_5 = _mm_loadl_epi64(
+ (__m128i *)&av1_filter_8bit[((unsigned)(sx + 5 * alpha)) >>
+ WARPEDDIFF_PREC_BITS]);
+ __m128i tmp_6 = _mm_loadl_epi64(
+ (__m128i *)&av1_filter_8bit[((unsigned)(sx + 6 * alpha)) >>
+ WARPEDDIFF_PREC_BITS]);
+ __m128i tmp_7 = _mm_loadl_epi64(
+ (__m128i *)&av1_filter_8bit[((unsigned)(sx + 7 * alpha)) >>
+ WARPEDDIFF_PREC_BITS]);
+
+ __m256i tmp0_256 = _mm256_castsi128_si256(tmp_0);
+ __m256i tmp2_256 = _mm256_castsi128_si256(tmp_2);
+ __m256i tmp1_256 = _mm256_castsi128_si256(tmp_1);
+ __m256i tmp3_256 = _mm256_castsi128_si256(tmp_3);
+
+ __m256i tmp4_256 = _mm256_castsi128_si256(tmp_4);
+ __m256i tmp6_256 = _mm256_castsi128_si256(tmp_6);
+ __m256i tmp5_256 = _mm256_castsi128_si256(tmp_5);
+ __m256i tmp7_256 = _mm256_castsi128_si256(tmp_7);
+
+ __m128i tmp_8 = _mm_loadl_epi64(
+ (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 0 * alpha) >>
+ WARPEDDIFF_PREC_BITS]);
+ tmp0_256 = _mm256_inserti128_si256(tmp0_256, tmp_8, 1);
+
+ __m128i tmp_9 = _mm_loadl_epi64(
+ (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 1 * alpha) >>
+ WARPEDDIFF_PREC_BITS]);
+ tmp1_256 = _mm256_inserti128_si256(tmp1_256, tmp_9, 1);
+
+ __m128i tmp_10 = _mm_loadl_epi64(
+ (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 2 * alpha) >>
+ WARPEDDIFF_PREC_BITS]);
+ tmp2_256 = _mm256_inserti128_si256(tmp2_256, tmp_10, 1);
+
+ __m128i tmp_11 = _mm_loadl_epi64(
+ (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 3 * alpha) >>
+ WARPEDDIFF_PREC_BITS]);
+ tmp3_256 = _mm256_inserti128_si256(tmp3_256, tmp_11, 1);
+
+ tmp_2 = _mm_loadl_epi64(
+ (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 4 * alpha) >>
+ WARPEDDIFF_PREC_BITS]);
+ tmp4_256 = _mm256_inserti128_si256(tmp4_256, tmp_2, 1);
+
+ tmp_3 = _mm_loadl_epi64(
+ (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 5 * alpha) >>
+ WARPEDDIFF_PREC_BITS]);
+ tmp5_256 = _mm256_inserti128_si256(tmp5_256, tmp_3, 1);
+
+ tmp_6 = _mm_loadl_epi64(
+ (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 6 * alpha) >>
+ WARPEDDIFF_PREC_BITS]);
+ tmp6_256 = _mm256_inserti128_si256(tmp6_256, tmp_6, 1);
+
+ tmp_7 = _mm_loadl_epi64(
+ (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 7 * alpha) >>
+ WARPEDDIFF_PREC_BITS]);
+ tmp7_256 = _mm256_inserti128_si256(tmp7_256, tmp_7, 1);
+
+ const __m256i tmp_12 = _mm256_unpacklo_epi16(tmp0_256, tmp2_256);
+ const __m256i tmp_13 = _mm256_unpacklo_epi16(tmp1_256, tmp3_256);
+ const __m256i tmp_14 = _mm256_unpacklo_epi16(tmp4_256, tmp6_256);
+ const __m256i tmp_15 = _mm256_unpacklo_epi16(tmp5_256, tmp7_256);
+
+ const __m256i res_0 = _mm256_unpacklo_epi32(tmp_12, tmp_14);
+ const __m256i res_1 = _mm256_unpackhi_epi32(tmp_12, tmp_14);
+ const __m256i res_2 = _mm256_unpacklo_epi32(tmp_13, tmp_15);
+ const __m256i res_3 = _mm256_unpackhi_epi32(tmp_13, tmp_15);
+
+ coeff[0] = _mm256_unpacklo_epi64(res_0, res_2);
+ coeff[1] = _mm256_unpackhi_epi64(res_0, res_2);
+ coeff[2] = _mm256_unpacklo_epi64(res_1, res_3);
+ coeff[3] = _mm256_unpackhi_epi64(res_1, res_3);
+}
+
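+// beta == 0: the filter phase does not change from row to row, so one set of
+// 128-bit coefficients is built and broadcast to both lanes, and the caller
+// can hoist this out of the row loop.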
+static INLINE void prepare_horizontal_filter_coeff_beta0_avx2(int alpha, int sx,
+ __m256i *coeff) {
+ __m128i tmp_0 = _mm_loadl_epi64(
+ (__m128i *)&av1_filter_8bit[(sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS]);
+ __m128i tmp_1 = _mm_loadl_epi64(
+ (__m128i *)&av1_filter_8bit[(sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS]);
+ __m128i tmp_2 = _mm_loadl_epi64(
+ (__m128i *)&av1_filter_8bit[(sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS]);
+ __m128i tmp_3 = _mm_loadl_epi64(
+ (__m128i *)&av1_filter_8bit[(sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS]);
+ __m128i tmp_4 = _mm_loadl_epi64(
+ (__m128i *)&av1_filter_8bit[(sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS]);
+ __m128i tmp_5 = _mm_loadl_epi64(
+ (__m128i *)&av1_filter_8bit[(sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS]);
+ __m128i tmp_6 = _mm_loadl_epi64(
+ (__m128i *)&av1_filter_8bit[(sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS]);
+ __m128i tmp_7 = _mm_loadl_epi64(
+ (__m128i *)&av1_filter_8bit[(sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS]);
+
+ tmp_0 = _mm_unpacklo_epi16(tmp_0, tmp_2);
+ tmp_1 = _mm_unpacklo_epi16(tmp_1, tmp_3);
+ tmp_4 = _mm_unpacklo_epi16(tmp_4, tmp_6);
+ tmp_5 = _mm_unpacklo_epi16(tmp_5, tmp_7);
+
+ const __m256i tmp_12 = _mm256_broadcastsi128_si256(tmp_0);
+ const __m256i tmp_13 = _mm256_broadcastsi128_si256(tmp_1);
+ const __m256i tmp_14 = _mm256_broadcastsi128_si256(tmp_4);
+ const __m256i tmp_15 = _mm256_broadcastsi128_si256(tmp_5);
+
+ const __m256i res_0 = _mm256_unpacklo_epi32(tmp_12, tmp_14);
+ const __m256i res_1 = _mm256_unpackhi_epi32(tmp_12, tmp_14);
+ const __m256i res_2 = _mm256_unpacklo_epi32(tmp_13, tmp_15);
+ const __m256i res_3 = _mm256_unpackhi_epi32(tmp_13, tmp_15);
+
+ coeff[0] = _mm256_unpacklo_epi64(res_0, res_2);
+ coeff[1] = _mm256_unpackhi_epi64(res_0, res_2);
+ coeff[2] = _mm256_unpacklo_epi64(res_1, res_3);
+ coeff[3] = _mm256_unpackhi_epi64(res_1, res_3);
+}
+
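+// alpha == 0: all eight output pixels of a row share one filter phase, so a
+// single 8-tap entry is loaded per row and its coefficient pairs are
+// replicated via the shuffle_alpha0 masks above.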
+static INLINE void prepare_horizontal_filter_coeff_alpha0_avx2(int beta, int sx,
+ __m256i *coeff) {
+ const __m128i tmp_0 =
+ _mm_loadl_epi64((__m128i *)&av1_filter_8bit[sx >> WARPEDDIFF_PREC_BITS]);
+ const __m128i tmp_1 = _mm_loadl_epi64(
+ (__m128i *)&av1_filter_8bit[(sx + beta) >> WARPEDDIFF_PREC_BITS]);
+
+ const __m256i res_0 =
+ _mm256_inserti128_si256(_mm256_castsi128_si256(tmp_0), tmp_1, 0x1);
+
+ coeff[0] = _mm256_shuffle_epi8(
+ res_0, _mm256_load_si256((__m256i *)shuffle_alpha0_mask01_avx2));
+ coeff[1] = _mm256_shuffle_epi8(
+ res_0, _mm256_load_si256((__m256i *)shuffle_alpha0_mask23_avx2));
+ coeff[2] = _mm256_shuffle_epi8(
+ res_0, _mm256_load_si256((__m256i *)shuffle_alpha0_mask45_avx2));
+ coeff[3] = _mm256_shuffle_epi8(
+ res_0, _mm256_load_si256((__m256i *)shuffle_alpha0_mask67_avx2));
+}
+
+static INLINE void horizontal_filter_avx2(const __m256i src, __m256i *horz_out,
+ int sx, int alpha, int beta, int row,
+ const __m256i *shuffle_src,
+ const __m256i *round_const,
+ const __m128i *shift) {
+ __m256i coeff[4];
+ prepare_horizontal_filter_coeff_avx2(alpha, beta, sx, coeff);
+ filter_src_pixels_avx2(src, horz_out, coeff, shuffle_src, round_const, shift,
+ row);
+}
+static INLINE void prepare_horizontal_filter_coeff(int alpha, int sx,
+ __m256i *coeff) {
+ const __m128i tmp_0 = _mm_loadl_epi64(
+ (__m128i *)&av1_filter_8bit[(sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS]);
+ const __m128i tmp_1 = _mm_loadl_epi64(
+ (__m128i *)&av1_filter_8bit[(sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS]);
+ const __m128i tmp_2 = _mm_loadl_epi64(
+ (__m128i *)&av1_filter_8bit[(sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS]);
+ const __m128i tmp_3 = _mm_loadl_epi64(
+ (__m128i *)&av1_filter_8bit[(sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS]);
+ const __m128i tmp_4 = _mm_loadl_epi64(
+ (__m128i *)&av1_filter_8bit[(sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS]);
+ const __m128i tmp_5 = _mm_loadl_epi64(
+ (__m128i *)&av1_filter_8bit[(sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS]);
+ const __m128i tmp_6 = _mm_loadl_epi64(
+ (__m128i *)&av1_filter_8bit[(sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS]);
+ const __m128i tmp_7 = _mm_loadl_epi64(
+ (__m128i *)&av1_filter_8bit[(sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS]);
+
+ const __m128i tmp_8 = _mm_unpacklo_epi16(tmp_0, tmp_2);
+ const __m128i tmp_9 = _mm_unpacklo_epi16(tmp_1, tmp_3);
+ const __m128i tmp_10 = _mm_unpacklo_epi16(tmp_4, tmp_6);
+ const __m128i tmp_11 = _mm_unpacklo_epi16(tmp_5, tmp_7);
+
+ const __m128i tmp_12 = _mm_unpacklo_epi32(tmp_8, tmp_10);
+ const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_8, tmp_10);
+ const __m128i tmp_14 = _mm_unpacklo_epi32(tmp_9, tmp_11);
+ const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_9, tmp_11);
+
+ coeff[0] = _mm256_castsi128_si256(_mm_unpacklo_epi64(tmp_12, tmp_14));
+ coeff[1] = _mm256_castsi128_si256(_mm_unpackhi_epi64(tmp_12, tmp_14));
+ coeff[2] = _mm256_castsi128_si256(_mm_unpacklo_epi64(tmp_13, tmp_15));
+ coeff[3] = _mm256_castsi128_si256(_mm_unpackhi_epi64(tmp_13, tmp_15));
+}
+
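+// Horizontal pass for one 8x8 output block: the up-to-15 source rows needed
+// by the 8-tap vertical filter are filtered two at a time (one row per
+// 128-bit lane); the final odd row is handled separately with the 128-bit
+// coefficient helper above.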
+static INLINE void warp_horizontal_filter_avx2(
+ const uint8_t *ref, __m256i *horz_out, int stride, int32_t ix4, int32_t iy4,
+ int32_t sx4, int alpha, int beta, int p_height, int height, int i,
+ const __m256i *round_const, const __m128i *shift,
+ const __m256i *shuffle_src) {
+ int k, iy, sx, row = 0;
+ __m256i coeff[4];
+ for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) {
+ iy = iy4 + k;
+ iy = clamp(iy, 0, height - 1);
+ const __m128i src_0 =
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+ iy = iy4 + k + 1;
+ iy = clamp(iy, 0, height - 1);
+ const __m128i src_1 =
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+ const __m256i src_01 =
+ _mm256_inserti128_si256(_mm256_castsi128_si256(src_0), src_1, 0x1);
+ sx = sx4 + beta * (k + 4);
+ horizontal_filter_avx2(src_01, horz_out, sx, alpha, beta, row, shuffle_src,
+ round_const, shift);
+ row += 1;
+ }
+ iy = iy4 + k;
+ iy = clamp(iy, 0, height - 1);
+ const __m256i src_01 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)));
+ sx = sx4 + beta * (k + 4);
+ prepare_horizontal_filter_coeff(alpha, sx, coeff);
+ filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const,
+ shift, row);
+}
+
+static INLINE void warp_horizontal_filter_alpha0_avx2(
+ const uint8_t *ref, __m256i *horz_out, int stride, int32_t ix4, int32_t iy4,
+ int32_t sx4, int alpha, int beta, int p_height, int height, int i,
+ const __m256i *round_const, const __m128i *shift,
+ const __m256i *shuffle_src) {
+ (void)alpha;
+ int k, iy, sx, row = 0;
+ __m256i coeff[4];
+ for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) {
+ iy = iy4 + k;
+ iy = clamp(iy, 0, height - 1);
+ const __m128i src_0 =
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+ iy = iy4 + k + 1;
+ iy = clamp(iy, 0, height - 1);
+ const __m128i src_1 =
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+ const __m256i src_01 =
+ _mm256_inserti128_si256(_mm256_castsi128_si256(src_0), src_1, 0x1);
+ sx = sx4 + beta * (k + 4);
+ prepare_horizontal_filter_coeff_alpha0_avx2(beta, sx, coeff);
+ filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const,
+ shift, row);
+ row += 1;
+ }
+ iy = iy4 + k;
+ iy = clamp(iy, 0, height - 1);
+ const __m256i src_01 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)));
+ sx = sx4 + beta * (k + 4);
+ prepare_horizontal_filter_coeff_alpha0_avx2(beta, sx, coeff);
+ filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const,
+ shift, row);
+}
+
+static INLINE void warp_horizontal_filter_beta0_avx2(
+ const uint8_t *ref, __m256i *horz_out, int stride, int32_t ix4, int32_t iy4,
+ int32_t sx4, int alpha, int beta, int p_height, int height, int i,
+ const __m256i *round_const, const __m128i *shift,
+ const __m256i *shuffle_src) {
+ (void)beta;
+ int k, iy, row = 0;
+ __m256i coeff[4];
+ prepare_horizontal_filter_coeff_beta0_avx2(alpha, sx4, coeff);
+ for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) {
+ iy = iy4 + k;
+ iy = clamp(iy, 0, height - 1);
+ const __m128i src_0 =
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+ iy = iy4 + k + 1;
+ iy = clamp(iy, 0, height - 1);
+ const __m128i src_1 =
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+ const __m256i src_01 =
+ _mm256_inserti128_si256(_mm256_castsi128_si256(src_0), src_1, 0x1);
+ filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const,
+ shift, row);
+ row += 1;
+ }
+ iy = iy4 + k;
+ iy = clamp(iy, 0, height - 1);
+ const __m256i src_01 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)));
+ filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const,
+ shift, row);
+}
+
+static INLINE void warp_horizontal_filter_alpha0_beta0_avx2(
+ const uint8_t *ref, __m256i *horz_out, int stride, int32_t ix4, int32_t iy4,
+ int32_t sx4, int alpha, int beta, int p_height, int height, int i,
+ const __m256i *round_const, const __m128i *shift,
+ const __m256i *shuffle_src) {
+ (void)alpha;
+ int k, iy, row = 0;
+ __m256i coeff[4];
+ prepare_horizontal_filter_coeff_alpha0_avx2(beta, sx4, coeff);
+ for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) {
+ iy = iy4 + k;
+ iy = clamp(iy, 0, height - 1);
+ const __m128i src0 =
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+ iy = iy4 + k + 1;
+ iy = clamp(iy, 0, height - 1);
+ const __m128i src1 =
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+ const __m256i src_01 =
+ _mm256_inserti128_si256(_mm256_castsi128_si256(src0), src1, 0x1);
+ filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const,
+ shift, row);
+ row += 1;
+ }
+ iy = iy4 + k;
+ iy = clamp(iy, 0, height - 1);
+ const __m256i src_01 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)));
+ filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const,
+ shift, row);
+}
+
+static INLINE void unpack_weights_and_set_round_const_avx2(
+ ConvolveParams *conv_params, const int round_bits, const int offset_bits,
+ __m256i *res_sub_const, __m256i *round_bits_const, __m256i *wt) {
+ *res_sub_const =
+ _mm256_set1_epi16(-(1 << (offset_bits - conv_params->round_1)) -
+ (1 << (offset_bits - conv_params->round_1 - 1)));
+ *round_bits_const = _mm256_set1_epi16(((1 << round_bits) >> 1));
+
+ const int w0 = conv_params->fwd_offset;
+ const int w1 = conv_params->bck_offset;
+ const __m256i wt0 = _mm256_set1_epi16((short)w0);
+ const __m256i wt1 = _mm256_set1_epi16((short)w1);
+ *wt = _mm256_unpacklo_epi16(wt0, wt1);
+}
+
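+// Build 16-bit vertical filter coefficients for two output rows (second row
+// in the upper 128-bit lane): even output columns (0,2,4,6) fill coeffs[0..3]
+// and odd columns (1,3,5,7) fill coeffs[4..7], matching the even/odd column
+// interleaving produced by the horizontal pass.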
+static INLINE void prepare_vertical_filter_coeffs_avx2(int gamma, int delta,
+ int sy,
+ __m256i *coeffs) {
+ __m128i filt_00 =
+ _mm_loadu_si128((__m128i *)(av1_warped_filter +
+ ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ __m128i filt_01 =
+ _mm_loadu_si128((__m128i *)(av1_warped_filter +
+ ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ __m128i filt_02 =
+ _mm_loadu_si128((__m128i *)(av1_warped_filter +
+ ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ __m128i filt_03 =
+ _mm_loadu_si128((__m128i *)(av1_warped_filter +
+ ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));
+
+ __m128i filt_10 = _mm_loadu_si128(
+ (__m128i *)(av1_warped_filter +
+ (((sy + delta) + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ __m128i filt_11 = _mm_loadu_si128(
+ (__m128i *)(av1_warped_filter +
+ (((sy + delta) + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ __m128i filt_12 = _mm_loadu_si128(
+ (__m128i *)(av1_warped_filter +
+ (((sy + delta) + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ __m128i filt_13 = _mm_loadu_si128(
+ (__m128i *)(av1_warped_filter +
+ (((sy + delta) + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));
+
+ __m256i filt_0 =
+ _mm256_inserti128_si256(_mm256_castsi128_si256(filt_00), filt_10, 0x1);
+ __m256i filt_1 =
+ _mm256_inserti128_si256(_mm256_castsi128_si256(filt_01), filt_11, 0x1);
+ __m256i filt_2 =
+ _mm256_inserti128_si256(_mm256_castsi128_si256(filt_02), filt_12, 0x1);
+ __m256i filt_3 =
+ _mm256_inserti128_si256(_mm256_castsi128_si256(filt_03), filt_13, 0x1);
+
+ __m256i res_0 = _mm256_unpacklo_epi32(filt_0, filt_1);
+ __m256i res_1 = _mm256_unpacklo_epi32(filt_2, filt_3);
+ __m256i res_2 = _mm256_unpackhi_epi32(filt_0, filt_1);
+ __m256i res_3 = _mm256_unpackhi_epi32(filt_2, filt_3);
+
+ coeffs[0] = _mm256_unpacklo_epi64(res_0, res_1);
+ coeffs[1] = _mm256_unpackhi_epi64(res_0, res_1);
+ coeffs[2] = _mm256_unpacklo_epi64(res_2, res_3);
+ coeffs[3] = _mm256_unpackhi_epi64(res_2, res_3);
+
+ filt_00 =
+ _mm_loadu_si128((__m128i *)(av1_warped_filter +
+ ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ filt_01 =
+ _mm_loadu_si128((__m128i *)(av1_warped_filter +
+ ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ filt_02 =
+ _mm_loadu_si128((__m128i *)(av1_warped_filter +
+ ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ filt_03 =
+ _mm_loadu_si128((__m128i *)(av1_warped_filter +
+ ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));
+
+ filt_10 = _mm_loadu_si128(
+ (__m128i *)(av1_warped_filter +
+ (((sy + delta) + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ filt_11 = _mm_loadu_si128(
+ (__m128i *)(av1_warped_filter +
+ (((sy + delta) + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ filt_12 = _mm_loadu_si128(
+ (__m128i *)(av1_warped_filter +
+ (((sy + delta) + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ filt_13 = _mm_loadu_si128(
+ (__m128i *)(av1_warped_filter +
+ (((sy + delta) + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));
+
+ filt_0 =
+ _mm256_inserti128_si256(_mm256_castsi128_si256(filt_00), filt_10, 0x1);
+ filt_1 =
+ _mm256_inserti128_si256(_mm256_castsi128_si256(filt_01), filt_11, 0x1);
+ filt_2 =
+ _mm256_inserti128_si256(_mm256_castsi128_si256(filt_02), filt_12, 0x1);
+ filt_3 =
+ _mm256_inserti128_si256(_mm256_castsi128_si256(filt_03), filt_13, 0x1);
+
+ res_0 = _mm256_unpacklo_epi32(filt_0, filt_1);
+ res_1 = _mm256_unpacklo_epi32(filt_2, filt_3);
+ res_2 = _mm256_unpackhi_epi32(filt_0, filt_1);
+ res_3 = _mm256_unpackhi_epi32(filt_2, filt_3);
+
+ coeffs[4] = _mm256_unpacklo_epi64(res_0, res_1);
+ coeffs[5] = _mm256_unpackhi_epi64(res_0, res_1);
+ coeffs[6] = _mm256_unpacklo_epi64(res_2, res_3);
+ coeffs[7] = _mm256_unpackhi_epi64(res_2, res_3);
+}
+
+static INLINE void prepare_vertical_filter_coeffs_delta0_avx2(int gamma, int sy,
+ __m256i *coeffs) {
+ __m128i filt_00 =
+ _mm_loadu_si128((__m128i *)(av1_warped_filter +
+ ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ __m128i filt_01 =
+ _mm_loadu_si128((__m128i *)(av1_warped_filter +
+ ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ __m128i filt_02 =
+ _mm_loadu_si128((__m128i *)(av1_warped_filter +
+ ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ __m128i filt_03 =
+ _mm_loadu_si128((__m128i *)(av1_warped_filter +
+ ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));
+
+ __m256i filt_0 = _mm256_broadcastsi128_si256(filt_00);
+ __m256i filt_1 = _mm256_broadcastsi128_si256(filt_01);
+ __m256i filt_2 = _mm256_broadcastsi128_si256(filt_02);
+ __m256i filt_3 = _mm256_broadcastsi128_si256(filt_03);
+
+ __m256i res_0 = _mm256_unpacklo_epi32(filt_0, filt_1);
+ __m256i res_1 = _mm256_unpacklo_epi32(filt_2, filt_3);
+ __m256i res_2 = _mm256_unpackhi_epi32(filt_0, filt_1);
+ __m256i res_3 = _mm256_unpackhi_epi32(filt_2, filt_3);
+
+ coeffs[0] = _mm256_unpacklo_epi64(res_0, res_1);
+ coeffs[1] = _mm256_unpackhi_epi64(res_0, res_1);
+ coeffs[2] = _mm256_unpacklo_epi64(res_2, res_3);
+ coeffs[3] = _mm256_unpackhi_epi64(res_2, res_3);
+
+ filt_00 =
+ _mm_loadu_si128((__m128i *)(av1_warped_filter +
+ ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ filt_01 =
+ _mm_loadu_si128((__m128i *)(av1_warped_filter +
+ ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ filt_02 =
+ _mm_loadu_si128((__m128i *)(av1_warped_filter +
+ ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ filt_03 =
+ _mm_loadu_si128((__m128i *)(av1_warped_filter +
+ ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));
+
+ filt_0 = _mm256_broadcastsi128_si256(filt_00);
+ filt_1 = _mm256_broadcastsi128_si256(filt_01);
+ filt_2 = _mm256_broadcastsi128_si256(filt_02);
+ filt_3 = _mm256_broadcastsi128_si256(filt_03);
+
+ res_0 = _mm256_unpacklo_epi32(filt_0, filt_1);
+ res_1 = _mm256_unpacklo_epi32(filt_2, filt_3);
+ res_2 = _mm256_unpackhi_epi32(filt_0, filt_1);
+ res_3 = _mm256_unpackhi_epi32(filt_2, filt_3);
+
+ coeffs[4] = _mm256_unpacklo_epi64(res_0, res_1);
+ coeffs[5] = _mm256_unpackhi_epi64(res_0, res_1);
+ coeffs[6] = _mm256_unpacklo_epi64(res_2, res_3);
+ coeffs[7] = _mm256_unpackhi_epi64(res_2, res_3);
+}
+
+static INLINE void prepare_vertical_filter_coeffs_gamma0_avx2(int delta, int sy,
+ __m256i *coeffs) {
+ const __m128i filt_0 = _mm_loadu_si128(
+ (__m128i *)(av1_warped_filter + (sy >> WARPEDDIFF_PREC_BITS)));
+ const __m128i filt_1 = _mm_loadu_si128(
+ (__m128i *)(av1_warped_filter + ((sy + delta) >> WARPEDDIFF_PREC_BITS)));
+
+ __m256i res_0 =
+ _mm256_inserti128_si256(_mm256_castsi128_si256(filt_0), filt_1, 0x1);
+
+ coeffs[0] = _mm256_shuffle_epi8(
+ res_0, _mm256_load_si256((__m256i *)shuffle_gamma0_mask0_avx2));
+ coeffs[1] = _mm256_shuffle_epi8(
+ res_0, _mm256_load_si256((__m256i *)shuffle_gamma0_mask1_avx2));
+ coeffs[2] = _mm256_shuffle_epi8(
+ res_0, _mm256_load_si256((__m256i *)shuffle_gamma0_mask2_avx2));
+ coeffs[3] = _mm256_shuffle_epi8(
+ res_0, _mm256_load_si256((__m256i *)shuffle_gamma0_mask3_avx2));
+
+ coeffs[4] = coeffs[0];
+ coeffs[5] = coeffs[1];
+ coeffs[6] = coeffs[2];
+ coeffs[7] = coeffs[3];
+}
+
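+// Vertical 8-tap filter step for two output rows. horz_out[] holds the
+// horizontally filtered rows two per register; the 0x21 permute shifts the
+// window down by one row so src_6/src_7 pair vertically adjacent rows, and
+// each _mm256_madd_epi16 consumes one pair of taps. The final unpacks put the
+// results back into column order 0..7.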
+static INLINE void filter_src_pixels_vertical_avx2(__m256i *horz_out,
+ __m256i *src,
+ __m256i *coeffs,
+ __m256i *res_lo,
+ __m256i *res_hi, int row) {
+ const __m256i src_6 = horz_out[row + 3];
+ const __m256i src_7 =
+ _mm256_permute2x128_si256(horz_out[row + 3], horz_out[row + 4], 0x21);
+
+ src[6] = _mm256_unpacklo_epi16(src_6, src_7);
+
+ const __m256i res_0 = _mm256_madd_epi16(src[0], coeffs[0]);
+ const __m256i res_2 = _mm256_madd_epi16(src[2], coeffs[1]);
+ const __m256i res_4 = _mm256_madd_epi16(src[4], coeffs[2]);
+ const __m256i res_6 = _mm256_madd_epi16(src[6], coeffs[3]);
+
+ const __m256i res_even = _mm256_add_epi32(_mm256_add_epi32(res_0, res_2),
+ _mm256_add_epi32(res_4, res_6));
+
+ src[7] = _mm256_unpackhi_epi16(src_6, src_7);
+
+ const __m256i res_1 = _mm256_madd_epi16(src[1], coeffs[4]);
+ const __m256i res_3 = _mm256_madd_epi16(src[3], coeffs[5]);
+ const __m256i res_5 = _mm256_madd_epi16(src[5], coeffs[6]);
+ const __m256i res_7 = _mm256_madd_epi16(src[7], coeffs[7]);
+
+ const __m256i res_odd = _mm256_add_epi32(_mm256_add_epi32(res_1, res_3),
+ _mm256_add_epi32(res_5, res_7));
+
+ // Rearrange pixels back into the order 0 ... 7
+ *res_lo = _mm256_unpacklo_epi32(res_even, res_odd);
+ *res_hi = _mm256_unpackhi_epi32(res_even, res_odd);
+}
+
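+// Round and store the filtered output for two rows. In the compound case the
+// 16-bit results go to conv_params->dst; with do_average they are combined
+// (distance-weighted when use_dist_wtd_comp_avg is set) with the values
+// already there and written to 'pred' as 8-bit. Otherwise the results are
+// packed straight to 8-bit and stored to 'pred'.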
+static INLINE void store_vertical_filter_output_avx2(
+ const __m256i *res_lo, const __m256i *res_hi, const __m256i *res_add_const,
+ const __m256i *wt, const __m256i *res_sub_const,
+ const __m256i *round_bits_const, uint8_t *pred, ConvolveParams *conv_params,
+ int i, int j, int k, const int reduce_bits_vert, int p_stride, int p_width,
+ const int round_bits) {
+ __m256i res_lo_1 = *res_lo;
+ __m256i res_hi_1 = *res_hi;
+
+ if (conv_params->is_compound) {
+ __m128i *const p_0 =
+ (__m128i *)&conv_params->dst[(i + k + 4) * conv_params->dst_stride + j];
+ __m128i *const p_1 =
+ (__m128i *)&conv_params
+ ->dst[(i + (k + 1) + 4) * conv_params->dst_stride + j];
+
+ res_lo_1 = _mm256_srai_epi32(_mm256_add_epi32(res_lo_1, *res_add_const),
+ reduce_bits_vert);
+
+ const __m256i temp_lo_16 = _mm256_packus_epi32(res_lo_1, res_lo_1);
+ __m256i res_lo_16;
+ if (conv_params->do_average) {
+ __m128i *const dst8_0 = (__m128i *)&pred[(i + k + 4) * p_stride + j];
+ __m128i *const dst8_1 =
+ (__m128i *)&pred[(i + (k + 1) + 4) * p_stride + j];
+ const __m128i p_16_0 = _mm_loadl_epi64(p_0);
+ const __m128i p_16_1 = _mm_loadl_epi64(p_1);
+ const __m256i p_16 =
+ _mm256_inserti128_si256(_mm256_castsi128_si256(p_16_0), p_16_1, 1);
+ if (conv_params->use_dist_wtd_comp_avg) {
+ const __m256i p_16_lo = _mm256_unpacklo_epi16(p_16, temp_lo_16);
+ const __m256i wt_res_lo = _mm256_madd_epi16(p_16_lo, *wt);
+ const __m256i shifted_32 =
+ _mm256_srai_epi32(wt_res_lo, DIST_PRECISION_BITS);
+ res_lo_16 = _mm256_packus_epi32(shifted_32, shifted_32);
+ } else {
+ res_lo_16 = _mm256_srai_epi16(_mm256_add_epi16(p_16, temp_lo_16), 1);
+ }
+ res_lo_16 = _mm256_add_epi16(res_lo_16, *res_sub_const);
+ res_lo_16 = _mm256_srai_epi16(
+ _mm256_add_epi16(res_lo_16, *round_bits_const), round_bits);
+ const __m256i res_8_lo = _mm256_packus_epi16(res_lo_16, res_lo_16);
+ const __m128i res_8_lo_0 = _mm256_castsi256_si128(res_8_lo);
+ const __m128i res_8_lo_1 = _mm256_extracti128_si256(res_8_lo, 1);
+ *(uint32_t *)dst8_0 = _mm_cvtsi128_si32(res_8_lo_0);
+ *(uint32_t *)dst8_1 = _mm_cvtsi128_si32(res_8_lo_1);
+ } else {
+ const __m128i temp_lo_16_0 = _mm256_castsi256_si128(temp_lo_16);
+ const __m128i temp_lo_16_1 = _mm256_extracti128_si256(temp_lo_16, 1);
+ _mm_storel_epi64(p_0, temp_lo_16_0);
+ _mm_storel_epi64(p_1, temp_lo_16_1);
+ }
+ if (p_width > 4) {
+ __m128i *const p4_0 =
+ (__m128i *)&conv_params
+ ->dst[(i + k + 4) * conv_params->dst_stride + j + 4];
+ __m128i *const p4_1 =
+ (__m128i *)&conv_params
+ ->dst[(i + (k + 1) + 4) * conv_params->dst_stride + j + 4];
+ res_hi_1 = _mm256_srai_epi32(_mm256_add_epi32(res_hi_1, *res_add_const),
+ reduce_bits_vert);
+ const __m256i temp_hi_16 = _mm256_packus_epi32(res_hi_1, res_hi_1);
+ __m256i res_hi_16;
+ if (conv_params->do_average) {
+ __m128i *const dst8_4_0 =
+ (__m128i *)&pred[(i + k + 4) * p_stride + j + 4];
+ __m128i *const dst8_4_1 =
+ (__m128i *)&pred[(i + (k + 1) + 4) * p_stride + j + 4];
+ const __m128i p4_16_0 = _mm_loadl_epi64(p4_0);
+ const __m128i p4_16_1 = _mm_loadl_epi64(p4_1);
+ const __m256i p4_16 = _mm256_inserti128_si256(
+ _mm256_castsi128_si256(p4_16_0), p4_16_1, 1);
+ if (conv_params->use_dist_wtd_comp_avg) {
+ const __m256i p_16_hi = _mm256_unpacklo_epi16(p4_16, temp_hi_16);
+ const __m256i wt_res_hi = _mm256_madd_epi16(p_16_hi, *wt);
+ const __m256i shifted_32 =
+ _mm256_srai_epi32(wt_res_hi, DIST_PRECISION_BITS);
+ res_hi_16 = _mm256_packus_epi32(shifted_32, shifted_32);
+ } else {
+ res_hi_16 = _mm256_srai_epi16(_mm256_add_epi16(p4_16, temp_hi_16), 1);
+ }
+ res_hi_16 = _mm256_add_epi16(res_hi_16, *res_sub_const);
+ res_hi_16 = _mm256_srai_epi16(
+ _mm256_add_epi16(res_hi_16, *round_bits_const), round_bits);
+ __m256i res_8_hi = _mm256_packus_epi16(res_hi_16, res_hi_16);
+ const __m128i res_8_hi_0 = _mm256_castsi256_si128(res_8_hi);
+ const __m128i res_8_hi_1 = _mm256_extracti128_si256(res_8_hi, 1);
+ *(uint32_t *)dst8_4_0 = _mm_cvtsi128_si32(res_8_hi_0);
+ *(uint32_t *)dst8_4_1 = _mm_cvtsi128_si32(res_8_hi_1);
+ } else {
+ const __m128i temp_hi_16_0 = _mm256_castsi256_si128(temp_hi_16);
+ const __m128i temp_hi_16_1 = _mm256_extracti128_si256(temp_hi_16, 1);
+ _mm_storel_epi64(p4_0, temp_hi_16_0);
+ _mm_storel_epi64(p4_1, temp_hi_16_1);
+ }
+ }
+ } else {
+ const __m256i res_lo_round = _mm256_srai_epi32(
+ _mm256_add_epi32(res_lo_1, *res_add_const), reduce_bits_vert);
+ const __m256i res_hi_round = _mm256_srai_epi32(
+ _mm256_add_epi32(res_hi_1, *res_add_const), reduce_bits_vert);
+
+ const __m256i res_16bit = _mm256_packs_epi32(res_lo_round, res_hi_round);
+ const __m256i res_8bit = _mm256_packus_epi16(res_16bit, res_16bit);
+ const __m128i res_8bit0 = _mm256_castsi256_si128(res_8bit);
+ const __m128i res_8bit1 = _mm256_extracti128_si256(res_8bit, 1);
+
+ // Store, blending with 'pred' if needed
+ __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j];
+ __m128i *const p1 = (__m128i *)&pred[(i + (k + 1) + 4) * p_stride + j];
+
+ if (p_width == 4) {
+ *(uint32_t *)p = _mm_cvtsi128_si32(res_8bit0);
+ *(uint32_t *)p1 = _mm_cvtsi128_si32(res_8bit1);
+ } else {
+ _mm_storel_epi64(p, res_8bit0);
+ _mm_storel_epi64(p1, res_8bit1);
+ }
+ }
+}
+
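+// Generic vertical pass: two output rows per iteration, rebuilding the
+// coefficients for every step (gamma != 0, delta != 0) and sliding the six
+// reusable source-row pairs down afterwards.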
+static INLINE void warp_vertical_filter_avx2(
+ uint8_t *pred, __m256i *horz_out, ConvolveParams *conv_params,
+ int16_t gamma, int16_t delta, int p_height, int p_stride, int p_width,
+ int i, int j, int sy4, const int reduce_bits_vert,
+ const __m256i *res_add_const, const int round_bits,
+ const __m256i *res_sub_const, const __m256i *round_bits_const,
+ const __m256i *wt) {
+ int k, row = 0;
+ __m256i src[8];
+ const __m256i src_0 = horz_out[0];
+ const __m256i src_1 =
+ _mm256_permute2x128_si256(horz_out[0], horz_out[1], 0x21);
+ const __m256i src_2 = horz_out[1];
+ const __m256i src_3 =
+ _mm256_permute2x128_si256(horz_out[1], horz_out[2], 0x21);
+ const __m256i src_4 = horz_out[2];
+ const __m256i src_5 =
+ _mm256_permute2x128_si256(horz_out[2], horz_out[3], 0x21);
+
+ src[0] = _mm256_unpacklo_epi16(src_0, src_1);
+ src[2] = _mm256_unpacklo_epi16(src_2, src_3);
+ src[4] = _mm256_unpacklo_epi16(src_4, src_5);
+
+ src[1] = _mm256_unpackhi_epi16(src_0, src_1);
+ src[3] = _mm256_unpackhi_epi16(src_2, src_3);
+ src[5] = _mm256_unpackhi_epi16(src_4, src_5);
+
+ for (k = -4; k < AOMMIN(4, p_height - i - 4); k += 2) {
+ int sy = sy4 + delta * (k + 4);
+ __m256i coeffs[8];
+ prepare_vertical_filter_coeffs_avx2(gamma, delta, sy, coeffs);
+ __m256i res_lo, res_hi;
+ filter_src_pixels_vertical_avx2(horz_out, src, coeffs, &res_lo, &res_hi,
+ row);
+ store_vertical_filter_output_avx2(&res_lo, &res_hi, res_add_const, wt,
+ res_sub_const, round_bits_const, pred,
+ conv_params, i, j, k, reduce_bits_vert,
+ p_stride, p_width, round_bits);
+ src[0] = src[2];
+ src[2] = src[4];
+ src[4] = src[6];
+ src[1] = src[3];
+ src[3] = src[5];
+ src[5] = src[7];
+
+ row += 1;
+ }
+}
+
+static INLINE void warp_vertical_filter_gamma0_avx2(
+ uint8_t *pred, __m256i *horz_out, ConvolveParams *conv_params,
+ int16_t gamma, int16_t delta, int p_height, int p_stride, int p_width,
+ int i, int j, int sy4, const int reduce_bits_vert,
+ const __m256i *res_add_const, const int round_bits,
+ const __m256i *res_sub_const, const __m256i *round_bits_const,
+ const __m256i *wt) {
+ (void)gamma;
+ int k, row = 0;
+ __m256i src[8];
+ const __m256i src_0 = horz_out[0];
+ const __m256i src_1 =
+ _mm256_permute2x128_si256(horz_out[0], horz_out[1], 0x21);
+ const __m256i src_2 = horz_out[1];
+ const __m256i src_3 =
+ _mm256_permute2x128_si256(horz_out[1], horz_out[2], 0x21);
+ const __m256i src_4 = horz_out[2];
+ const __m256i src_5 =
+ _mm256_permute2x128_si256(horz_out[2], horz_out[3], 0x21);
+
+ src[0] = _mm256_unpacklo_epi16(src_0, src_1);
+ src[2] = _mm256_unpacklo_epi16(src_2, src_3);
+ src[4] = _mm256_unpacklo_epi16(src_4, src_5);
+
+ src[1] = _mm256_unpackhi_epi16(src_0, src_1);
+ src[3] = _mm256_unpackhi_epi16(src_2, src_3);
+ src[5] = _mm256_unpackhi_epi16(src_4, src_5);
+
+ for (k = -4; k < AOMMIN(4, p_height - i - 4); k += 2) {
+ int sy = sy4 + delta * (k + 4);
+ __m256i coeffs[8];
+ prepare_vertical_filter_coeffs_gamma0_avx2(delta, sy, coeffs);
+ __m256i res_lo, res_hi;
+ filter_src_pixels_vertical_avx2(horz_out, src, coeffs, &res_lo, &res_hi,
+ row);
+ store_vertical_filter_output_avx2(&res_lo, &res_hi, res_add_const, wt,
+ res_sub_const, round_bits_const, pred,
+ conv_params, i, j, k, reduce_bits_vert,
+ p_stride, p_width, round_bits);
+ src[0] = src[2];
+ src[2] = src[4];
+ src[4] = src[6];
+ src[1] = src[3];
+ src[3] = src[5];
+ src[5] = src[7];
+ row += 1;
+ }
+}
+
+static INLINE void warp_vertical_filter_delta0_avx2(
+ uint8_t *pred, __m256i *horz_out, ConvolveParams *conv_params,
+ int16_t gamma, int16_t delta, int p_height, int p_stride, int p_width,
+ int i, int j, int sy4, const int reduce_bits_vert,
+ const __m256i *res_add_const, const int round_bits,
+ const __m256i *res_sub_const, const __m256i *round_bits_const,
+ const __m256i *wt) {
+ (void)delta;
+ int k, row = 0;
+ __m256i src[8], coeffs[8];
+ const __m256i src_0 = horz_out[0];
+ const __m256i src_1 =
+ _mm256_permute2x128_si256(horz_out[0], horz_out[1], 0x21);
+ const __m256i src_2 = horz_out[1];
+ const __m256i src_3 =
+ _mm256_permute2x128_si256(horz_out[1], horz_out[2], 0x21);
+ const __m256i src_4 = horz_out[2];
+ const __m256i src_5 =
+ _mm256_permute2x128_si256(horz_out[2], horz_out[3], 0x21);
+
+ src[0] = _mm256_unpacklo_epi16(src_0, src_1);
+ src[2] = _mm256_unpacklo_epi16(src_2, src_3);
+ src[4] = _mm256_unpacklo_epi16(src_4, src_5);
+
+ src[1] = _mm256_unpackhi_epi16(src_0, src_1);
+ src[3] = _mm256_unpackhi_epi16(src_2, src_3);
+ src[5] = _mm256_unpackhi_epi16(src_4, src_5);
+
+ prepare_vertical_filter_coeffs_delta0_avx2(gamma, sy4, coeffs);
+
+ for (k = -4; k < AOMMIN(4, p_height - i - 4); k += 2) {
+ __m256i res_lo, res_hi;
+ filter_src_pixels_vertical_avx2(horz_out, src, coeffs, &res_lo, &res_hi,
+ row);
+ store_vertical_filter_output_avx2(&res_lo, &res_hi, res_add_const, wt,
+ res_sub_const, round_bits_const, pred,
+ conv_params, i, j, k, reduce_bits_vert,
+ p_stride, p_width, round_bits);
+ src[0] = src[2];
+ src[2] = src[4];
+ src[4] = src[6];
+ src[1] = src[3];
+ src[3] = src[5];
+ src[5] = src[7];
+ row += 1;
+ }
+}
+
+static INLINE void warp_vertical_filter_gamma0_delta0_avx2(
+ uint8_t *pred, __m256i *horz_out, ConvolveParams *conv_params,
+ int16_t gamma, int16_t delta, int p_height, int p_stride, int p_width,
+ int i, int j, int sy4, const int reduce_bits_vert,
+ const __m256i *res_add_const, const int round_bits,
+ const __m256i *res_sub_const, const __m256i *round_bits_const,
+ const __m256i *wt) {
+ (void)gamma;
+ int k, row = 0;
+ __m256i src[8], coeffs[8];
+ const __m256i src_0 = horz_out[0];
+ const __m256i src_1 =
+ _mm256_permute2x128_si256(horz_out[0], horz_out[1], 0x21);
+ const __m256i src_2 = horz_out[1];
+ const __m256i src_3 =
+ _mm256_permute2x128_si256(horz_out[1], horz_out[2], 0x21);
+ const __m256i src_4 = horz_out[2];
+ const __m256i src_5 =
+ _mm256_permute2x128_si256(horz_out[2], horz_out[3], 0x21);
+
+ src[0] = _mm256_unpacklo_epi16(src_0, src_1);
+ src[2] = _mm256_unpacklo_epi16(src_2, src_3);
+ src[4] = _mm256_unpacklo_epi16(src_4, src_5);
+
+ src[1] = _mm256_unpackhi_epi16(src_0, src_1);
+ src[3] = _mm256_unpackhi_epi16(src_2, src_3);
+ src[5] = _mm256_unpackhi_epi16(src_4, src_5);
+
+ prepare_vertical_filter_coeffs_gamma0_avx2(delta, sy4, coeffs);
+
+ for (k = -4; k < AOMMIN(4, p_height - i - 4); k += 2) {
+ __m256i res_lo, res_hi;
+ filter_src_pixels_vertical_avx2(horz_out, src, coeffs, &res_lo, &res_hi,
+ row);
+ store_vertical_filter_output_avx2(&res_lo, &res_hi, res_add_const, wt,
+ res_sub_const, round_bits_const, pred,
+ conv_params, i, j, k, reduce_bits_vert,
+ p_stride, p_width, round_bits);
+ src[0] = src[2];
+ src[2] = src[4];
+ src[4] = src[6];
+ src[1] = src[3];
+ src[3] = src[5];
+ src[5] = src[7];
+ row += 1;
+ }
+}
+
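+// Dispatch to the specialised vertical filters: gamma == 0 means every column
+// shares one filter phase and delta == 0 means the phase does not change from
+// row to row, so coefficients can be simplified or hoisted out of the loop.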
+static INLINE void prepare_warp_vertical_filter_avx2(
+ uint8_t *pred, __m256i *horz_out, ConvolveParams *conv_params,
+ int16_t gamma, int16_t delta, int p_height, int p_stride, int p_width,
+ int i, int j, int sy4, const int reduce_bits_vert,
+ const __m256i *res_add_const, const int round_bits,
+ const __m256i *res_sub_const, const __m256i *round_bits_const,
+ const __m256i *wt) {
+ if (gamma == 0 && delta == 0)
+ warp_vertical_filter_gamma0_delta0_avx2(
+ pred, horz_out, conv_params, gamma, delta, p_height, p_stride, p_width,
+ i, j, sy4, reduce_bits_vert, res_add_const, round_bits, res_sub_const,
+ round_bits_const, wt);
+ else if (gamma == 0 && delta != 0)
+ warp_vertical_filter_gamma0_avx2(
+ pred, horz_out, conv_params, gamma, delta, p_height, p_stride, p_width,
+ i, j, sy4, reduce_bits_vert, res_add_const, round_bits, res_sub_const,
+ round_bits_const, wt);
+ else if (gamma != 0 && delta == 0)
+ warp_vertical_filter_delta0_avx2(
+ pred, horz_out, conv_params, gamma, delta, p_height, p_stride, p_width,
+ i, j, sy4, reduce_bits_vert, res_add_const, round_bits, res_sub_const,
+ round_bits_const, wt);
+ else
+ warp_vertical_filter_avx2(pred, horz_out, conv_params, gamma, delta,
+ p_height, p_stride, p_width, i, j, sy4,
+ reduce_bits_vert, res_add_const, round_bits,
+ res_sub_const, round_bits_const, wt);
+}
+
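+// Dispatch to the specialised horizontal filters: alpha == 0 means all eight
+// pixels of a row share one phase and beta == 0 means the phase is the same
+// for every row, mirroring the gamma/delta cases above.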
+static INLINE void prepare_warp_horizontal_filter_avx2(
+ const uint8_t *ref, __m256i *horz_out, int stride, int32_t ix4, int32_t iy4,
+ int32_t sx4, int alpha, int beta, int p_height, int height, int i,
+ const __m256i *round_const, const __m128i *shift,
+ const __m256i *shuffle_src) {
+ if (alpha == 0 && beta == 0)
+ warp_horizontal_filter_alpha0_beta0_avx2(
+ ref, horz_out, stride, ix4, iy4, sx4, alpha, beta, p_height, height, i,
+ round_const, shift, shuffle_src);
+ else if (alpha == 0 && beta != 0)
+ warp_horizontal_filter_alpha0_avx2(ref, horz_out, stride, ix4, iy4, sx4,
+ alpha, beta, p_height, height, i,
+ round_const, shift, shuffle_src);
+ else if (alpha != 0 && beta == 0)
+ warp_horizontal_filter_beta0_avx2(ref, horz_out, stride, ix4, iy4, sx4,
+ alpha, beta, p_height, height, i,
+ round_const, shift, shuffle_src);
+ else
+ warp_horizontal_filter_avx2(ref, horz_out, stride, ix4, iy4, sx4, alpha,
+ beta, p_height, height, i, round_const, shift,
+ shuffle_src);
+}
+
+int64_t av1_calc_frame_error_avx2(const uint8_t *const ref, int ref_stride,
+ const uint8_t *const dst, int p_width,
+ int p_height, int dst_stride) {
+ int64_t sum_error = 0;
+ int i, j;
+ __m256i row_error, col_error;
+ __m256i zero = _mm256_set1_epi16(0);
+ __m256i dup_255 = _mm256_set1_epi16(255);
+ col_error = zero;
+
+ for (i = 0; i < (p_height / 4); i++) {
+ row_error = _mm256_set1_epi16(0);
+ for (j = 0; j < (p_width / 16); j++) {
+ __m256i ref_1_16 = _mm256_cvtepu8_epi16(_mm_load_si128(
+ (__m128i *)(ref + (j * 16) + (((i * 4) + 0) * ref_stride))));
+ __m256i dst_1_16 = _mm256_cvtepu8_epi16(_mm_load_si128(
+ (__m128i *)(dst + (j * 16) + (((i * 4) + 0) * dst_stride))));
+ __m256i ref_2_16 = _mm256_cvtepu8_epi16(_mm_load_si128(
+ (__m128i *)(ref + (j * 16) + (((i * 4) + 1) * ref_stride))));
+ __m256i dst_2_16 = _mm256_cvtepu8_epi16(_mm_load_si128(
+ (__m128i *)(dst + (j * 16) + (((i * 4) + 1) * dst_stride))));
+ __m256i ref_3_16 = _mm256_cvtepu8_epi16(_mm_load_si128(
+ (__m128i *)(ref + (j * 16) + (((i * 4) + 2) * ref_stride))));
+ __m256i dst_3_16 = _mm256_cvtepu8_epi16(_mm_load_si128(
+ (__m128i *)(dst + (j * 16) + (((i * 4) + 2) * dst_stride))));
+ __m256i ref_4_16 = _mm256_cvtepu8_epi16(_mm_load_si128(
+ (__m128i *)(ref + (j * 16) + (((i * 4) + 3) * ref_stride))));
+ __m256i dst_4_16 = _mm256_cvtepu8_epi16(_mm_load_si128(
+ (__m128i *)(dst + (j * 16) + (((i * 4) + 3) * dst_stride))));
+
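+      // Bias each difference by 255 so the signed range [-255, 255] maps to
+      // [0, 510], the valid index range of error_measure_lut.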
+ __m256i diff_1 =
+ _mm256_add_epi16(_mm256_sub_epi16(dst_1_16, ref_1_16), dup_255);
+ __m256i diff_2 =
+ _mm256_add_epi16(_mm256_sub_epi16(dst_2_16, ref_2_16), dup_255);
+ __m256i diff_3 =
+ _mm256_add_epi16(_mm256_sub_epi16(dst_3_16, ref_3_16), dup_255);
+ __m256i diff_4 =
+ _mm256_add_epi16(_mm256_sub_epi16(dst_4_16, ref_4_16), dup_255);
+
+ __m256i diff_1_lo = _mm256_unpacklo_epi16(diff_1, zero);
+ __m256i diff_1_hi = _mm256_unpackhi_epi16(diff_1, zero);
+ __m256i diff_2_lo = _mm256_unpacklo_epi16(diff_2, zero);
+ __m256i diff_2_hi = _mm256_unpackhi_epi16(diff_2, zero);
+ __m256i diff_3_lo = _mm256_unpacklo_epi16(diff_3, zero);
+ __m256i diff_3_hi = _mm256_unpackhi_epi16(diff_3, zero);
+ __m256i diff_4_lo = _mm256_unpacklo_epi16(diff_4, zero);
+ __m256i diff_4_hi = _mm256_unpackhi_epi16(diff_4, zero);
+
+ __m256i error_1_lo =
+ _mm256_i32gather_epi32(error_measure_lut, diff_1_lo, 4);
+ __m256i error_1_hi =
+ _mm256_i32gather_epi32(error_measure_lut, diff_1_hi, 4);
+ __m256i error_2_lo =
+ _mm256_i32gather_epi32(error_measure_lut, diff_2_lo, 4);
+ __m256i error_2_hi =
+ _mm256_i32gather_epi32(error_measure_lut, diff_2_hi, 4);
+ __m256i error_3_lo =
+ _mm256_i32gather_epi32(error_measure_lut, diff_3_lo, 4);
+ __m256i error_3_hi =
+ _mm256_i32gather_epi32(error_measure_lut, diff_3_hi, 4);
+ __m256i error_4_lo =
+ _mm256_i32gather_epi32(error_measure_lut, diff_4_lo, 4);
+ __m256i error_4_hi =
+ _mm256_i32gather_epi32(error_measure_lut, diff_4_hi, 4);
+
+ __m256i error_1 = _mm256_add_epi32(error_1_lo, error_1_hi);
+ __m256i error_2 = _mm256_add_epi32(error_2_lo, error_2_hi);
+ __m256i error_3 = _mm256_add_epi32(error_3_lo, error_3_hi);
+ __m256i error_4 = _mm256_add_epi32(error_4_lo, error_4_hi);
+
+ __m256i error_1_2 = _mm256_add_epi32(error_1, error_2);
+ __m256i error_3_4 = _mm256_add_epi32(error_3, error_4);
+
+ __m256i error_1_2_3_4 = _mm256_add_epi32(error_1_2, error_3_4);
+ row_error = _mm256_add_epi32(row_error, error_1_2_3_4);
+ }
+ __m256i col_error_lo = _mm256_unpacklo_epi32(row_error, zero);
+ __m256i col_error_hi = _mm256_unpackhi_epi32(row_error, zero);
+ __m256i col_error_temp = _mm256_add_epi64(col_error_lo, col_error_hi);
+ col_error = _mm256_add_epi64(col_error, col_error_temp);
+    // Error summation for the remaining width, which is not a multiple of 16
+ if (p_width & 0xf) {
+ for (int k = 0; k < 4; ++k) {
+ for (int l = j * 16; l < p_width; ++l) {
+ sum_error +=
+ (int64_t)error_measure(dst[l + ((i * 4) + k) * dst_stride] -
+ ref[l + ((i * 4) + k) * ref_stride]);
+ }
+ }
+ }
+ }
+ __m128i sum_error_q_0 = _mm256_castsi256_si128(col_error);
+ __m128i sum_error_q_1 = _mm256_extracti128_si256(col_error, 1);
+ sum_error_q_0 = _mm_add_epi64(sum_error_q_0, sum_error_q_1);
+ int64_t sum_error_d_0, sum_error_d_1;
+ xx_storel_64(&sum_error_d_0, sum_error_q_0);
+ xx_storel_64(&sum_error_d_1, _mm_srli_si128(sum_error_q_0, 8));
+ sum_error = (sum_error + sum_error_d_0 + sum_error_d_1);
+  // Error summation for the remaining height, which is not a multiple of 4
+ if (p_height & 0x3) {
+ for (int k = i * 4; k < p_height; ++k) {
+ for (int l = 0; l < p_width; ++l) {
+ sum_error += (int64_t)error_measure(dst[l + k * dst_stride] -
+ ref[l + k * ref_stride]);
+ }
+ }
+ }
+ return sum_error;
+}
+
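+// AVX2 warp: for each 8x8 block of the prediction, compute the warped source
+// position, run the horizontal pass into horz_out[] and then the vertical
+// pass, taking specialised paths when alpha/beta/gamma/delta are zero or when
+// every sample would clamp to the leftmost/rightmost source column.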
+void av1_warp_affine_avx2(const int32_t *mat, const uint8_t *ref, int width,
+ int height, int stride, uint8_t *pred, int p_col,
+ int p_row, int p_width, int p_height, int p_stride,
+ int subsampling_x, int subsampling_y,
+ ConvolveParams *conv_params, int16_t alpha,
+ int16_t beta, int16_t gamma, int16_t delta) {
+ __m256i horz_out[8];
+ int i, j, k;
+ const int bd = 8;
+ const int reduce_bits_horiz = conv_params->round_0;
+ const int reduce_bits_vert = conv_params->is_compound
+ ? conv_params->round_1
+ : 2 * FILTER_BITS - reduce_bits_horiz;
+ const int offset_bits_horiz = bd + FILTER_BITS - 1;
+ assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));
+
+ const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz;
+ const __m256i reduce_bits_vert_const =
+ _mm256_set1_epi32(((1 << reduce_bits_vert) >> 1));
+ const __m256i res_add_const = _mm256_set1_epi32(1 << offset_bits_vert);
+ const int round_bits =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+ assert(IMPLIES(conv_params->do_average, conv_params->is_compound));
+
+ const __m256i round_const = _mm256_set1_epi16(
+ (1 << offset_bits_horiz) + ((1 << reduce_bits_horiz) >> 1));
+ const __m128i shift = _mm_cvtsi32_si128(reduce_bits_horiz);
+
+ __m256i res_sub_const, round_bits_const, wt;
+ unpack_weights_and_set_round_const_avx2(conv_params, round_bits, offset_bits,
+ &res_sub_const, &round_bits_const,
+ &wt);
+
+ __m256i res_add_const_1;
+ if (conv_params->is_compound == 1) {
+ res_add_const_1 = _mm256_add_epi32(reduce_bits_vert_const, res_add_const);
+ } else {
+ res_add_const_1 = _mm256_set1_epi32(-(1 << (bd + reduce_bits_vert - 1)) +
+ ((1 << reduce_bits_vert) >> 1));
+ }
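+  // const1/const2 fold the rounding and filter-centre offsets into sx4/sy4;
+  // const3 masks off the WARP_PARAM_REDUCE_BITS low bits; const4/const5
+  // reproduce the horizontal filter's output for a constant source pixel,
+  // used for the fully clamped rows below.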
+ const int32_t const1 = alpha * (-4) + beta * (-4) +
+ (1 << (WARPEDDIFF_PREC_BITS - 1)) +
+ (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
+ const int32_t const2 = gamma * (-4) + delta * (-4) +
+ (1 << (WARPEDDIFF_PREC_BITS - 1)) +
+ (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
+ const int32_t const3 = ((1 << WARP_PARAM_REDUCE_BITS) - 1);
+ const int16_t const4 = (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1));
+ const int16_t const5 = (1 << (FILTER_BITS - reduce_bits_horiz));
+
+ __m256i shuffle_src[4];
+ shuffle_src[0] = _mm256_load_si256((__m256i *)shuffle_src0);
+ shuffle_src[1] = _mm256_load_si256((__m256i *)shuffle_src1);
+ shuffle_src[2] = _mm256_load_si256((__m256i *)shuffle_src2);
+ shuffle_src[3] = _mm256_load_si256((__m256i *)shuffle_src3);
+
+ for (i = 0; i < p_height; i += 8) {
+ for (j = 0; j < p_width; j += 8) {
+ const int32_t src_x = (p_col + j + 4) << subsampling_x;
+ const int32_t src_y = (p_row + i + 4) << subsampling_y;
+ const int32_t dst_x = mat[2] * src_x + mat[3] * src_y + mat[0];
+ const int32_t dst_y = mat[4] * src_x + mat[5] * src_y + mat[1];
+ const int32_t x4 = dst_x >> subsampling_x;
+ const int32_t y4 = dst_y >> subsampling_y;
+
+ int32_t ix4 = x4 >> WARPEDMODEL_PREC_BITS;
+ int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
+ int32_t iy4 = y4 >> WARPEDMODEL_PREC_BITS;
+ int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
+
+ // Add in all the constant terms, including rounding and offset
+ sx4 += const1;
+ sy4 += const2;
+
+ sx4 &= ~const3;
+ sy4 &= ~const3;
+
+ // Horizontal filter
+ // If the block is aligned such that, after clamping, every sample
+ // would be taken from the leftmost/rightmost column, then we can
+ // skip the expensive horizontal filter.
+
+ if (ix4 <= -7) {
+ int iy, row = 0;
+ for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) {
+ iy = iy4 + k;
+ iy = clamp(iy, 0, height - 1);
+ const __m256i temp_0 =
+ _mm256_set1_epi16(const4 + ref[iy * stride] * const5);
+ iy = iy4 + k + 1;
+ iy = clamp(iy, 0, height - 1);
+ const __m256i temp_1 =
+ _mm256_set1_epi16(const4 + ref[iy * stride] * const5);
+ horz_out[row] = _mm256_blend_epi32(temp_0, temp_1, 0xf0);
+ row += 1;
+ }
+ iy = iy4 + k;
+ iy = clamp(iy, 0, height - 1);
+ horz_out[row] = _mm256_set1_epi16(const4 + ref[iy * stride] * const5);
+ } else if (ix4 >= width + 6) {
+ int iy, row = 0;
+ for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) {
+ iy = iy4 + k;
+ iy = clamp(iy, 0, height - 1);
+ const __m256i temp_0 = _mm256_set1_epi16(
+ const4 + ref[iy * stride + (width - 1)] * const5);
+ iy = iy4 + k + 1;
+ iy = clamp(iy, 0, height - 1);
+ const __m256i temp_1 = _mm256_set1_epi16(
+ const4 + ref[iy * stride + (width - 1)] * const5);
+ horz_out[row] = _mm256_blend_epi32(temp_0, temp_1, 0xf0);
+ row += 1;
+ }
+ iy = iy4 + k;
+ iy = clamp(iy, 0, height - 1);
+ horz_out[row] =
+ _mm256_set1_epi16(const4 + ref[iy * stride + (width - 1)] * const5);
+ } else if (((ix4 - 7) < 0) || ((ix4 + 9) > width)) {
+ const int out_of_boundary_left = -(ix4 - 6);
+ const int out_of_boundary_right = (ix4 + 8) - width;
+ int iy, sx, row = 0;
+ for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) {
+ iy = iy4 + k;
+ iy = clamp(iy, 0, height - 1);
+ __m128i src0 =
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+ iy = iy4 + k + 1;
+ iy = clamp(iy, 0, height - 1);
+ __m128i src1 =
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+
+ if (out_of_boundary_left >= 0) {
+ const __m128i shuffle_reg_left =
+ _mm_loadu_si128((__m128i *)warp_pad_left[out_of_boundary_left]);
+ src0 = _mm_shuffle_epi8(src0, shuffle_reg_left);
+ src1 = _mm_shuffle_epi8(src1, shuffle_reg_left);
+ }
+ if (out_of_boundary_right >= 0) {
+ const __m128i shuffle_reg_right = _mm_loadu_si128(
+ (__m128i *)warp_pad_right[out_of_boundary_right]);
+ src0 = _mm_shuffle_epi8(src0, shuffle_reg_right);
+ src1 = _mm_shuffle_epi8(src1, shuffle_reg_right);
+ }
+ sx = sx4 + beta * (k + 4);
+ const __m256i src_01 =
+ _mm256_inserti128_si256(_mm256_castsi128_si256(src0), src1, 0x1);
+ horizontal_filter_avx2(src_01, horz_out, sx, alpha, beta, row,
+ shuffle_src, &round_const, &shift);
+ row += 1;
+ }
+ iy = iy4 + k;
+ iy = clamp(iy, 0, height - 1);
+ __m128i src = _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+ if (out_of_boundary_left >= 0) {
+ const __m128i shuffle_reg_left =
+ _mm_loadu_si128((__m128i *)warp_pad_left[out_of_boundary_left]);
+ src = _mm_shuffle_epi8(src, shuffle_reg_left);
+ }
+ if (out_of_boundary_right >= 0) {
+ const __m128i shuffle_reg_right =
+ _mm_loadu_si128((__m128i *)warp_pad_right[out_of_boundary_right]);
+ src = _mm_shuffle_epi8(src, shuffle_reg_right);
+ }
+ sx = sx4 + beta * (k + 4);
+ const __m256i src_01 = _mm256_castsi128_si256(src);
+ __m256i coeff[4];
+ prepare_horizontal_filter_coeff(alpha, sx, coeff);
+ filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src,
+ &round_const, &shift, row);
+ } else {
+ prepare_warp_horizontal_filter_avx2(
+ ref, horz_out, stride, ix4, iy4, sx4, alpha, beta, p_height, height,
+ i, &round_const, &shift, shuffle_src);
+ }
+
+ // Vertical filter
+ prepare_warp_vertical_filter_avx2(
+ pred, horz_out, conv_params, gamma, delta, p_height, p_stride,
+ p_width, i, j, sy4, reduce_bits_vert, &res_add_const_1, round_bits,
+ &res_sub_const, &round_bits_const, &wt);
+ }
+ }
+}
diff --git a/media/libaom/src/av1/common/x86/warp_plane_sse2.c b/media/libaom/src/av1/common/x86/warp_plane_sse2.c
new file mode 100644
index 000000000..6ff666518
--- /dev/null
+++ b/media/libaom/src/av1/common/x86/warp_plane_sse2.c
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>
+
+#include "aom_dsp/x86/synonyms.h"
+#include "av1/common/warped_motion.h"
+#include "config/av1_rtcd.h"
+
+int64_t av1_calc_frame_error_sse2(const uint8_t *const ref, int ref_stride,
+ const uint8_t *const dst, int p_width,
+ int p_height, int dst_stride) {
+ int64_t sum_error = 0;
+ int i, j;
+ __m128i row_error, col_error;
+ __m128i zero = _mm_set1_epi16(0);
+ __m128i dup_255 = _mm_set1_epi16(255);
+ col_error = zero;
+ for (i = 0; i < (p_height); i++) {
+ row_error = zero;
+ for (j = 0; j < (p_width / 16); j++) {
+ __m128i ref_8 =
+ _mm_load_si128((__m128i *)(ref + (j * 16) + (i * ref_stride)));
+ __m128i dst_8 =
+ _mm_load_si128((__m128i *)(dst + (j * 16) + (i * dst_stride)));
+ __m128i ref_16_lo = _mm_unpacklo_epi8(ref_8, zero);
+ __m128i ref_16_hi = _mm_unpackhi_epi8(ref_8, zero);
+ __m128i dst_16_lo = _mm_unpacklo_epi8(dst_8, zero);
+ __m128i dst_16_hi = _mm_unpackhi_epi8(dst_8, zero);
+
+ __m128i diff_1 =
+ _mm_add_epi16(_mm_sub_epi16(dst_16_lo, ref_16_lo), dup_255);
+ __m128i diff_2 =
+ _mm_add_epi16(_mm_sub_epi16(dst_16_hi, ref_16_hi), dup_255);
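+      // Adding 255 maps the signed difference range [-255, 255] onto
+      // [0, 510], so the result can index error_measure_lut directly.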
+
+ __m128i error_1_lo =
+ _mm_set_epi32(error_measure_lut[_mm_extract_epi16(diff_1, 3)],
+ error_measure_lut[_mm_extract_epi16(diff_1, 2)],
+ error_measure_lut[_mm_extract_epi16(diff_1, 1)],
+ error_measure_lut[_mm_extract_epi16(diff_1, 0)]);
+ __m128i error_1_hi =
+ _mm_set_epi32(error_measure_lut[_mm_extract_epi16(diff_1, 7)],
+ error_measure_lut[_mm_extract_epi16(diff_1, 6)],
+ error_measure_lut[_mm_extract_epi16(diff_1, 5)],
+ error_measure_lut[_mm_extract_epi16(diff_1, 4)]);
+ __m128i error_2_lo =
+ _mm_set_epi32(error_measure_lut[_mm_extract_epi16(diff_2, 3)],
+ error_measure_lut[_mm_extract_epi16(diff_2, 2)],
+ error_measure_lut[_mm_extract_epi16(diff_2, 1)],
+ error_measure_lut[_mm_extract_epi16(diff_2, 0)]);
+ __m128i error_2_hi =
+ _mm_set_epi32(error_measure_lut[_mm_extract_epi16(diff_2, 7)],
+ error_measure_lut[_mm_extract_epi16(diff_2, 6)],
+ error_measure_lut[_mm_extract_epi16(diff_2, 5)],
+ error_measure_lut[_mm_extract_epi16(diff_2, 4)]);
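+      // SSE2 has no gather instruction, so the LUT values are fetched with
+      // scalar lookups and reassembled via _mm_set_epi32.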
+
+ __m128i error_1 = _mm_add_epi32(error_1_lo, error_1_hi);
+ __m128i error_2 = _mm_add_epi32(error_2_lo, error_2_hi);
+ __m128i error_1_2 = _mm_add_epi32(error_1, error_2);
+
+ row_error = _mm_add_epi32(row_error, error_1_2);
+ }
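+    // Widen the row's 32-bit partial sums to 64 bits and fold them into the
+    // running column accumulator so the total cannot overflow across rows.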
+ __m128i col_error_lo = _mm_unpacklo_epi32(row_error, zero);
+ __m128i col_error_hi = _mm_unpackhi_epi32(row_error, zero);
+ __m128i col_error_temp = _mm_add_epi64(col_error_lo, col_error_hi);
+ col_error = _mm_add_epi64(col_error, col_error_temp);
+    // Error summation for remaining width, which is not a multiple of 16
+ if (p_width & 0xf) {
+ for (int l = j * 16; l < p_width; ++l) {
+ sum_error += (int64_t)error_measure(dst[l + i * dst_stride] -
+ ref[l + i * ref_stride]);
+ }
+ }
+ }
+ int64_t sum_error_d_0, sum_error_d_1;
+ xx_storel_64(&sum_error_d_0, col_error);
+ xx_storel_64(&sum_error_d_1, _mm_srli_si128(col_error, 8));
+ sum_error = (sum_error + sum_error_d_0 + sum_error_d_1);
+ return sum_error;
+}
diff --git a/media/libaom/src/av1/common/x86/warp_plane_sse4.c b/media/libaom/src/av1/common/x86/warp_plane_sse4.c
index b810cea2e..10ddf92d0 100644
--- a/media/libaom/src/av1/common/x86/warp_plane_sse4.c
+++ b/media/libaom/src/av1/common/x86/warp_plane_sse4.c
@@ -16,7 +16,7 @@
#include "av1/common/warped_motion.h"
-/* This is a modified version of 'warped_filter' from warped_motion.c:
+/* This is a modified version of 'av1_warped_filter' from warped_motion.c:
* Each coefficient is stored in 8 bits instead of 16 bits
* The coefficients are rearranged in the column order 0, 2, 4, 6, 1, 3, 5, 7
@@ -31,8 +31,8 @@
coefficients into the correct order more quickly.
*/
/* clang-format off */
-DECLARE_ALIGNED(8, static const int8_t,
- filter_8bit[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8]) = {
+DECLARE_ALIGNED(8, const int8_t,
+ av1_filter_8bit[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8]) = {
#if WARPEDPIXEL_PREC_BITS == 6
// [-1, 0)
{ 0, 127, 0, 0, 0, 1, 0, 0}, { 0, 127, 0, 0, -1, 2, 0, 0},
@@ -198,40 +198,53 @@ DECLARE_ALIGNED(8, static const int8_t,
// in an SSE register into two sequences:
// 0, 2, 2, 4, ..., 12, 12, 14, <don't care>
// 1, 3, 3, 5, ..., 13, 13, 15, <don't care>
-static const uint8_t even_mask[16] = { 0, 2, 2, 4, 4, 6, 6, 8,
- 8, 10, 10, 12, 12, 14, 14, 0 };
-static const uint8_t odd_mask[16] = { 1, 3, 3, 5, 5, 7, 7, 9,
- 9, 11, 11, 13, 13, 15, 15, 0 };
-
-static const uint8_t shuffle_alpha0_mask01[16] = { 0, 1, 0, 1, 0, 1, 0, 1,
- 0, 1, 0, 1, 0, 1, 0, 1 };
-
-static const uint8_t shuffle_alpha0_mask23[16] = { 2, 3, 2, 3, 2, 3, 2, 3,
- 2, 3, 2, 3, 2, 3, 2, 3 };
-
-static const uint8_t shuffle_alpha0_mask45[16] = { 4, 5, 4, 5, 4, 5, 4, 5,
- 4, 5, 4, 5, 4, 5, 4, 5 };
-
-static const uint8_t shuffle_alpha0_mask67[16] = { 6, 7, 6, 7, 6, 7, 6, 7,
- 6, 7, 6, 7, 6, 7, 6, 7 };
-
-static const uint8_t shuffle_gamma0_mask0[16] = { 0, 1, 2, 3, 0, 1, 2, 3,
- 0, 1, 2, 3, 0, 1, 2, 3 };
-static const uint8_t shuffle_gamma0_mask1[16] = { 4, 5, 6, 7, 4, 5, 6, 7,
- 4, 5, 6, 7, 4, 5, 6, 7 };
-static const uint8_t shuffle_gamma0_mask2[16] = { 8, 9, 10, 11, 8, 9, 10, 11,
- 8, 9, 10, 11, 8, 9, 10, 11 };
-static const uint8_t shuffle_gamma0_mask3[16] = {
- 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15
-};
+DECLARE_ALIGNED(16, static const uint8_t,
+ even_mask[16]) = { 0, 2, 2, 4, 4, 6, 6, 8,
+ 8, 10, 10, 12, 12, 14, 14, 0 };
+
+DECLARE_ALIGNED(16, static const uint8_t,
+ odd_mask[16]) = { 1, 3, 3, 5, 5, 7, 7, 9,
+ 9, 11, 11, 13, 13, 15, 15, 0 };
+
+DECLARE_ALIGNED(16, static const uint8_t,
+ shuffle_alpha0_mask01[16]) = { 0, 1, 0, 1, 0, 1, 0, 1,
+ 0, 1, 0, 1, 0, 1, 0, 1 };
+
+DECLARE_ALIGNED(16, static const uint8_t,
+ shuffle_alpha0_mask23[16]) = { 2, 3, 2, 3, 2, 3, 2, 3,
+ 2, 3, 2, 3, 2, 3, 2, 3 };
+
+DECLARE_ALIGNED(16, static const uint8_t,
+ shuffle_alpha0_mask45[16]) = { 4, 5, 4, 5, 4, 5, 4, 5,
+ 4, 5, 4, 5, 4, 5, 4, 5 };
+
+DECLARE_ALIGNED(16, static const uint8_t,
+ shuffle_alpha0_mask67[16]) = { 6, 7, 6, 7, 6, 7, 6, 7,
+ 6, 7, 6, 7, 6, 7, 6, 7 };
+
+DECLARE_ALIGNED(16, static const uint8_t,
+ shuffle_gamma0_mask0[16]) = { 0, 1, 2, 3, 0, 1, 2, 3,
+ 0, 1, 2, 3, 0, 1, 2, 3 };
+
+DECLARE_ALIGNED(16, static const uint8_t,
+ shuffle_gamma0_mask1[16]) = { 4, 5, 6, 7, 4, 5, 6, 7,
+ 4, 5, 6, 7, 4, 5, 6, 7 };
+
+DECLARE_ALIGNED(16, static const uint8_t,
+ shuffle_gamma0_mask2[16]) = { 8, 9, 10, 11, 8, 9, 10, 11,
+ 8, 9, 10, 11, 8, 9, 10, 11 };
+
+DECLARE_ALIGNED(16, static const uint8_t,
+ shuffle_gamma0_mask3[16]) = { 12, 13, 14, 15, 12, 13, 14, 15,
+ 12, 13, 14, 15, 12, 13, 14, 15 };
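+
+// Keeping these shuffle masks 16-byte aligned lets the loads below use the
+// aligned _mm_load_si128 form instead of _mm_loadu_si128.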
static INLINE void filter_src_pixels(__m128i src, __m128i *tmp, __m128i *coeff,
const int offset_bits_horiz,
const int reduce_bits_horiz, int k) {
const __m128i src_even =
- _mm_shuffle_epi8(src, _mm_loadu_si128((__m128i *)even_mask));
+ _mm_shuffle_epi8(src, _mm_load_si128((__m128i *)even_mask));
const __m128i src_odd =
- _mm_shuffle_epi8(src, _mm_loadu_si128((__m128i *)odd_mask));
+ _mm_shuffle_epi8(src, _mm_load_si128((__m128i *)odd_mask));
// The pixel order we need for 'src' is:
// 0 2 2 4 4 6 6 8 1 3 3 5 5 7 7 9
const __m128i src_02 = _mm_unpacklo_epi64(src_even, src_odd);
@@ -271,21 +284,21 @@ static INLINE void prepare_horizontal_filter_coeff(int alpha, int sx,
__m128i *coeff) {
// Filter even-index pixels
const __m128i tmp_0 = _mm_loadl_epi64(
- (__m128i *)&filter_8bit[(sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS]);
+ (__m128i *)&av1_filter_8bit[(sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS]);
const __m128i tmp_1 = _mm_loadl_epi64(
- (__m128i *)&filter_8bit[(sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS]);
+ (__m128i *)&av1_filter_8bit[(sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS]);
const __m128i tmp_2 = _mm_loadl_epi64(
- (__m128i *)&filter_8bit[(sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS]);
+ (__m128i *)&av1_filter_8bit[(sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS]);
const __m128i tmp_3 = _mm_loadl_epi64(
- (__m128i *)&filter_8bit[(sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS]);
+ (__m128i *)&av1_filter_8bit[(sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS]);
const __m128i tmp_4 = _mm_loadl_epi64(
- (__m128i *)&filter_8bit[(sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS]);
+ (__m128i *)&av1_filter_8bit[(sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS]);
const __m128i tmp_5 = _mm_loadl_epi64(
- (__m128i *)&filter_8bit[(sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS]);
+ (__m128i *)&av1_filter_8bit[(sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS]);
const __m128i tmp_6 = _mm_loadl_epi64(
- (__m128i *)&filter_8bit[(sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS]);
+ (__m128i *)&av1_filter_8bit[(sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS]);
const __m128i tmp_7 = _mm_loadl_epi64(
- (__m128i *)&filter_8bit[(sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS]);
+ (__m128i *)&av1_filter_8bit[(sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS]);
// Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 0 2
const __m128i tmp_8 = _mm_unpacklo_epi16(tmp_0, tmp_2);
@@ -319,20 +332,20 @@ static INLINE void prepare_horizontal_filter_coeff_alpha0(int sx,
__m128i *coeff) {
// Filter even-index pixels
const __m128i tmp_0 =
- _mm_loadl_epi64((__m128i *)&filter_8bit[sx >> WARPEDDIFF_PREC_BITS]);
+ _mm_loadl_epi64((__m128i *)&av1_filter_8bit[sx >> WARPEDDIFF_PREC_BITS]);
// Coeffs 0 2 for pixels 0 2 4 6 1 3 5 7
- coeff[0] = _mm_shuffle_epi8(
- tmp_0, _mm_loadu_si128((__m128i *)shuffle_alpha0_mask01));
+ coeff[0] =
+ _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_alpha0_mask01));
// Coeffs 4 6 for pixels 0 2 4 6 1 3 5 7
- coeff[1] = _mm_shuffle_epi8(
- tmp_0, _mm_loadu_si128((__m128i *)shuffle_alpha0_mask23));
+ coeff[1] =
+ _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_alpha0_mask23));
// Coeffs 1 3 for pixels 0 2 4 6 1 3 5 7
- coeff[2] = _mm_shuffle_epi8(
- tmp_0, _mm_loadu_si128((__m128i *)shuffle_alpha0_mask45));
+ coeff[2] =
+ _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_alpha0_mask45));
// Coeffs 5 7 for pixels 0 2 4 6 1 3 5 7
- coeff[3] = _mm_shuffle_epi8(
- tmp_0, _mm_loadu_si128((__m128i *)shuffle_alpha0_mask67));
+ coeff[3] =
+ _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_alpha0_mask67));
}
static INLINE void horizontal_filter(__m128i src, __m128i *tmp, int sx,
@@ -449,21 +462,25 @@ static INLINE void unpack_weights_and_set_round_const(
const int w0 = conv_params->fwd_offset;
const int w1 = conv_params->bck_offset;
- const __m128i wt0 = _mm_set1_epi16(w0);
- const __m128i wt1 = _mm_set1_epi16(w1);
+ const __m128i wt0 = _mm_set1_epi16((int16_t)w0);
+ const __m128i wt1 = _mm_set1_epi16((int16_t)w1);
*wt = _mm_unpacklo_epi16(wt0, wt1);
}
static INLINE void prepare_vertical_filter_coeffs(int gamma, int sy,
__m128i *coeffs) {
- const __m128i tmp_0 = _mm_loadu_si128(
- (__m128i *)(warped_filter + ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
- const __m128i tmp_2 = _mm_loadu_si128(
- (__m128i *)(warped_filter + ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
- const __m128i tmp_4 = _mm_loadu_si128(
- (__m128i *)(warped_filter + ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
- const __m128i tmp_6 = _mm_loadu_si128(
- (__m128i *)(warped_filter + ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_0 =
+ _mm_loadu_si128((__m128i *)(av1_warped_filter +
+ ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_2 =
+ _mm_loadu_si128((__m128i *)(av1_warped_filter +
+ ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_4 =
+ _mm_loadu_si128((__m128i *)(av1_warped_filter +
+ ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_6 =
+ _mm_loadu_si128((__m128i *)(av1_warped_filter +
+ ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));
const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2);
const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6);
@@ -476,14 +493,18 @@ static INLINE void prepare_vertical_filter_coeffs(int gamma, int sy,
coeffs[2] = _mm_unpacklo_epi64(tmp_12, tmp_14);
coeffs[3] = _mm_unpackhi_epi64(tmp_12, tmp_14);
- const __m128i tmp_1 = _mm_loadu_si128(
- (__m128i *)(warped_filter + ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
- const __m128i tmp_3 = _mm_loadu_si128(
- (__m128i *)(warped_filter + ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
- const __m128i tmp_5 = _mm_loadu_si128(
- (__m128i *)(warped_filter + ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
- const __m128i tmp_7 = _mm_loadu_si128(
- (__m128i *)(warped_filter + ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_1 =
+ _mm_loadu_si128((__m128i *)(av1_warped_filter +
+ ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_3 =
+ _mm_loadu_si128((__m128i *)(av1_warped_filter +
+ ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_5 =
+ _mm_loadu_si128((__m128i *)(av1_warped_filter +
+ ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_7 =
+ _mm_loadu_si128((__m128i *)(av1_warped_filter +
+ ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));
const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3);
const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7);
@@ -500,17 +521,17 @@ static INLINE void prepare_vertical_filter_coeffs(int gamma, int sy,
static INLINE void prepare_vertical_filter_coeffs_gamma0(int sy,
__m128i *coeffs) {
const __m128i tmp_0 = _mm_loadu_si128(
- (__m128i *)(warped_filter + (sy >> WARPEDDIFF_PREC_BITS)));
+ (__m128i *)(av1_warped_filter + (sy >> WARPEDDIFF_PREC_BITS)));
// even coeffs
coeffs[0] =
- _mm_shuffle_epi8(tmp_0, _mm_loadu_si128((__m128i *)shuffle_gamma0_mask0));
+ _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_gamma0_mask0));
coeffs[1] =
- _mm_shuffle_epi8(tmp_0, _mm_loadu_si128((__m128i *)shuffle_gamma0_mask1));
+ _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_gamma0_mask1));
coeffs[2] =
- _mm_shuffle_epi8(tmp_0, _mm_loadu_si128((__m128i *)shuffle_gamma0_mask2));
+ _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_gamma0_mask2));
coeffs[3] =
- _mm_shuffle_epi8(tmp_0, _mm_loadu_si128((__m128i *)shuffle_gamma0_mask3));
+ _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_gamma0_mask3));
// odd coeffs
coeffs[4] = coeffs[0];
@@ -577,7 +598,7 @@ static INLINE void store_vertical_filter_output(
__m128i *const dst8 = (__m128i *)&pred[(i + k + 4) * p_stride + j];
const __m128i p_16 = _mm_loadl_epi64(p);
- if (conv_params->use_jnt_comp_avg) {
+ if (conv_params->use_dist_wtd_comp_avg) {
const __m128i p_16_lo = _mm_unpacklo_epi16(p_16, temp_lo_16);
const __m128i wt_res_lo = _mm_madd_epi16(p_16_lo, *wt);
const __m128i shifted_32 =
@@ -610,7 +631,7 @@ static INLINE void store_vertical_filter_output(
(__m128i *)&pred[(i + k + 4) * p_stride + j + 4];
const __m128i p4_16 = _mm_loadl_epi64(p4);
- if (conv_params->use_jnt_comp_avg) {
+ if (conv_params->use_dist_wtd_comp_avg) {
const __m128i p_16_hi = _mm_unpacklo_epi16(p4_16, temp_hi_16);
const __m128i wt_res_hi = _mm_madd_epi16(p_16_hi, *wt);
const __m128i shifted_32 =
diff --git a/media/libaom/src/av1/common/x86/wiener_convolve_avx2.c b/media/libaom/src/av1/common/x86/wiener_convolve_avx2.c
index 87a6e1239..b7ac68383 100644
--- a/media/libaom/src/av1/common/x86/wiener_convolve_avx2.c
+++ b/media/libaom/src/av1/common/x86/wiener_convolve_avx2.c
@@ -17,6 +17,7 @@
#include "av1/common/convolve.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/x86/convolve_avx2.h"
#include "aom_dsp/x86/synonyms.h"
#include "aom_dsp/x86/synonyms_avx2.h"
@@ -25,6 +26,20 @@
// on the left.
// A row of, say, 8-bit pixels with values p0, p1, p2, ..., p30, p31 will be
// loaded and stored as [ p31 ... p17 p16 ][ p15 ... p1 p0 ].
+
+// Exploiting the limited range of the wiener filter coefficients,
+// horizontal filtering can be done in 16-bit intermediate precision.
+// The details are as follows:
+// Consider horizontal wiener filter coefficients of the following form:
+// [C0, C1, C2, 2^(FILTER_BITS) - 2 * (C0 + C1 + C2), C2, C1, C0]
+// Subtracting 2^(FILTER_BITS) from the centre tap gives:
+// [C0, C1, C2, -2 * (C0 + C1 + C2), C2, C1, C0]
+// The sum "C0 * p0 + C1 * p1 + C2 * p2 - 2 * (C0 + C1 + C2) * p3
+// + C2 * p4 + C1 * p5 + C0 * p6" stays within signed 16-bit precision.
+// Finally, after rounding this result by round_0, we multiply the centre
+// pixel by 2^(FILTER_BITS - round_0) and add it to obtain the horizontal
+// filter output.
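+//
+// Illustrative bound (hypothetical taps): C0 = 3, C1 = -7, C2 = 15 gives
+// modified taps [3, -7, 15, -22, 15, -7, 3]; with 8-bit pixels the largest
+// possible magnitude is 255 * (3 + 7 + 15 + 22 + 15 + 7 + 3) = 18360, well
+// within the int16_t range.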
+
void av1_wiener_convolve_add_src_avx2(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
@@ -37,224 +52,190 @@ void av1_wiener_convolve_add_src_avx2(const uint8_t *src, ptrdiff_t src_stride,
(void)x_step_q4;
(void)y_step_q4;
- DECLARE_ALIGNED(32, uint16_t,
- temp[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]);
- int intermediate_height = h + SUBPEL_TAPS - 2;
- memset(temp + (intermediate_height * MAX_SB_SIZE), 0, MAX_SB_SIZE);
- const int center_tap = ((SUBPEL_TAPS - 1) / 2);
+ DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + SUBPEL_TAPS) * 8]);
+ int im_h = h + SUBPEL_TAPS - 2;
+ int im_stride = 8;
+ memset(im_block + (im_h * im_stride), 0, MAX_SB_SIZE);
+ int i, j;
+ const int center_tap = (SUBPEL_TAPS - 1) / 2;
const uint8_t *const src_ptr = src - center_tap * src_stride - center_tap;
- const __m128i zero_128 = _mm_setzero_si128();
- const __m256i zero_256 = _mm256_setzero_si256();
-
- // Add an offset to account for the "add_src" part of the convolve function.
- const __m128i offset = _mm_insert_epi16(zero_128, 1 << FILTER_BITS, 3);
-
- const __m256i clamp_low = zero_256;
+ __m256i filt[4], coeffs_h[4], coeffs_v[4], filt_center;
+
+ assert(conv_params->round_0 > 0);
+
+ filt[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2);
+ filt[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2);
+ filt[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2);
+ filt[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2);
+
+ filt_center = _mm256_load_si256((__m256i const *)filt_center_global_avx2);
+
+ const __m128i coeffs_x = _mm_loadu_si128((__m128i *)filter_x);
+ const __m256i filter_coeffs_x = _mm256_broadcastsi128_si256(coeffs_x);
+
+ // coeffs 0 1 0 1 0 1 0 1
+ coeffs_h[0] =
+ _mm256_shuffle_epi8(filter_coeffs_x, _mm256_set1_epi16(0x0200u));
+ // coeffs 2 3 2 3 2 3 2 3
+ coeffs_h[1] =
+ _mm256_shuffle_epi8(filter_coeffs_x, _mm256_set1_epi16(0x0604u));
+ // coeffs 4 5 4 5 4 5 4 5
+ coeffs_h[2] =
+ _mm256_shuffle_epi8(filter_coeffs_x, _mm256_set1_epi16(0x0a08u));
+ // coeffs 6 7 6 7 6 7 6 7
+ coeffs_h[3] =
+ _mm256_shuffle_epi8(filter_coeffs_x, _mm256_set1_epi16(0x0e0cu));
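+  // The horizontal wiener taps fit in the low byte of each 16-bit
+  // coefficient, so the byte shuffles above pack tap pairs into the 8-bit
+  // layout consumed by convolve_lowbd_x.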
+
+ const __m256i round_const_h =
+ _mm256_set1_epi16((1 << (conv_params->round_0 - 1)));
+ const __m256i round_const_horz =
+ _mm256_set1_epi16((1 << (bd + FILTER_BITS - conv_params->round_0 - 1)));
+ const __m256i clamp_low = _mm256_setzero_si256();
const __m256i clamp_high =
_mm256_set1_epi16(WIENER_CLAMP_LIMIT(conv_params->round_0, bd) - 1);
+ const __m128i round_shift_h = _mm_cvtsi32_si128(conv_params->round_0);
- /* Horizontal filter */
- {
- // coeffs [ f7 f6 f5 f4 f3 f2 f1 f0 ]
- const __m128i coeffs_x = _mm_add_epi16(xx_loadu_128(filter_x), offset);
-
- // coeffs [ f3 f2 f3 f2 f1 f0 f1 f0 ]
- const __m128i coeffs_0123 = _mm_unpacklo_epi32(coeffs_x, coeffs_x);
- // coeffs [ f7 f6 f7 f6 f5 f4 f5 f4 ]
- const __m128i coeffs_4567 = _mm_unpackhi_epi32(coeffs_x, coeffs_x);
-
- // coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ]
- const __m128i coeffs_01_128 = _mm_unpacklo_epi64(coeffs_0123, coeffs_0123);
- // coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ]
- const __m128i coeffs_23_128 = _mm_unpackhi_epi64(coeffs_0123, coeffs_0123);
- // coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ]
- const __m128i coeffs_45_128 = _mm_unpacklo_epi64(coeffs_4567, coeffs_4567);
- // coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ]
- const __m128i coeffs_67_128 = _mm_unpackhi_epi64(coeffs_4567, coeffs_4567);
-
- // coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ][ f1 f0 f1 f0 f1 f0 f1 f0 ]
- const __m256i coeffs_01 = yy_set_m128i(coeffs_01_128, coeffs_01_128);
- // coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ][ f3 f2 f3 f2 f3 f2 f3 f2 ]
- const __m256i coeffs_23 = yy_set_m128i(coeffs_23_128, coeffs_23_128);
- // coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ][ f5 f4 f5 f4 f5 f4 f5 f4 ]
- const __m256i coeffs_45 = yy_set_m128i(coeffs_45_128, coeffs_45_128);
- // coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ][ f7 f6 f7 f6 f7 f6 f7 f6 ]
- const __m256i coeffs_67 = yy_set_m128i(coeffs_67_128, coeffs_67_128);
-
- const __m256i round_const = _mm256_set1_epi32(
- (1 << (conv_params->round_0 - 1)) + (1 << (bd + FILTER_BITS - 1)));
-
- for (int i = 0; i < intermediate_height; ++i) {
- for (int j = 0; j < w; j += 16) {
- const uint8_t *data_ij = src_ptr + i * src_stride + j;
-
- // Load 8-bit src data
- const __m128i data_0 = xx_loadu_128(data_ij + 0);
- const __m128i data_1 = xx_loadu_128(data_ij + 1);
- const __m128i data_2 = xx_loadu_128(data_ij + 2);
- const __m128i data_3 = xx_loadu_128(data_ij + 3);
- const __m128i data_4 = xx_loadu_128(data_ij + 4);
- const __m128i data_5 = xx_loadu_128(data_ij + 5);
- const __m128i data_6 = xx_loadu_128(data_ij + 6);
- const __m128i data_7 = xx_loadu_128(data_ij + 7);
-
- // (Zero-)Extend 8-bit data to 16-bit data
- const __m256i src_0 = _mm256_cvtepu8_epi16(data_0);
- const __m256i src_1 = _mm256_cvtepu8_epi16(data_1);
- const __m256i src_2 = _mm256_cvtepu8_epi16(data_2);
- const __m256i src_3 = _mm256_cvtepu8_epi16(data_3);
- const __m256i src_4 = _mm256_cvtepu8_epi16(data_4);
- const __m256i src_5 = _mm256_cvtepu8_epi16(data_5);
- const __m256i src_6 = _mm256_cvtepu8_epi16(data_6);
- const __m256i src_7 = _mm256_cvtepu8_epi16(data_7);
-
- // Multiply src data by filter coeffs and sum pairs
- const __m256i res_0 = _mm256_madd_epi16(src_0, coeffs_01);
- const __m256i res_1 = _mm256_madd_epi16(src_1, coeffs_01);
- const __m256i res_2 = _mm256_madd_epi16(src_2, coeffs_23);
- const __m256i res_3 = _mm256_madd_epi16(src_3, coeffs_23);
- const __m256i res_4 = _mm256_madd_epi16(src_4, coeffs_45);
- const __m256i res_5 = _mm256_madd_epi16(src_5, coeffs_45);
- const __m256i res_6 = _mm256_madd_epi16(src_6, coeffs_67);
- const __m256i res_7 = _mm256_madd_epi16(src_7, coeffs_67);
-
- // Calculate scalar product for even- and odd-indices separately,
- // increasing to 32-bit precision
- const __m256i res_even_sum = _mm256_add_epi32(
- _mm256_add_epi32(res_0, res_4), _mm256_add_epi32(res_2, res_6));
- const __m256i res_odd_sum = _mm256_add_epi32(
- _mm256_add_epi32(res_1, res_5), _mm256_add_epi32(res_3, res_7));
-
- const __m256i res_even = _mm256_srai_epi32(
- _mm256_add_epi32(res_even_sum, round_const), conv_params->round_0);
- const __m256i res_odd = _mm256_srai_epi32(
- _mm256_add_epi32(res_odd_sum, round_const), conv_params->round_0);
-
- // Reduce to 16-bit precision and pack even- and odd-index results
- // back into one register. The _mm256_packs_epi32 intrinsic returns
- // a register with the pixels ordered as follows:
- // [ 15 13 11 9 14 12 10 8 ] [ 7 5 3 1 6 4 2 0 ]
- const __m256i res = _mm256_packs_epi32(res_even, res_odd);
- const __m256i res_clamped =
- _mm256_min_epi16(_mm256_max_epi16(res, clamp_low), clamp_high);
-
- // Store in a temporary array
- yy_storeu_256(temp + i * MAX_SB_SIZE + j, res_clamped);
- }
+ // Add an offset to account for the "add_src" part of the convolve function.
+ const __m128i zero_128 = _mm_setzero_si128();
+ const __m128i offset_0 = _mm_insert_epi16(zero_128, 1 << FILTER_BITS, 3);
+ const __m128i coeffs_y = _mm_add_epi16(xx_loadu_128(filter_y), offset_0);
+
+ const __m256i filter_coeffs_y = _mm256_broadcastsi128_si256(coeffs_y);
+
+ // coeffs 0 1 0 1 0 1 0 1
+ coeffs_v[0] = _mm256_shuffle_epi32(filter_coeffs_y, 0x00);
+ // coeffs 2 3 2 3 2 3 2 3
+ coeffs_v[1] = _mm256_shuffle_epi32(filter_coeffs_y, 0x55);
+ // coeffs 4 5 4 5 4 5 4 5
+ coeffs_v[2] = _mm256_shuffle_epi32(filter_coeffs_y, 0xaa);
+ // coeffs 6 7 6 7 6 7 6 7
+ coeffs_v[3] = _mm256_shuffle_epi32(filter_coeffs_y, 0xff);
+
+ const __m256i round_const_v =
+ _mm256_set1_epi32((1 << (conv_params->round_1 - 1)) -
+ (1 << (bd + conv_params->round_1 - 1)));
+ const __m128i round_shift_v = _mm_cvtsi32_si128(conv_params->round_1);
+
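+  // Process the block in 8-column strips: the horizontal pass below filters
+  // two rows per iteration into im_block, and the vertical pass then consumes
+  // the intermediate rows with a sliding 8-tap window.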
+ for (j = 0; j < w; j += 8) {
+ for (i = 0; i < im_h; i += 2) {
+ __m256i data = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j]));
+
+ // Load the next line
+ if (i + 1 < im_h)
+ data = _mm256_inserti128_si256(
+ data,
+ _mm_loadu_si128(
+ (__m128i *)&src_ptr[(i * src_stride) + j + src_stride]),
+ 1);
+
+ __m256i res = convolve_lowbd_x(data, coeffs_h, filt);
+
+ res =
+ _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h);
+
+ __m256i data_0 = _mm256_shuffle_epi8(data, filt_center);
+
+ // multiply the center pixel by 2^(FILTER_BITS - round_0) and add it to
+ // the result
+ data_0 = _mm256_slli_epi16(data_0, FILTER_BITS - conv_params->round_0);
+ res = _mm256_add_epi16(res, data_0);
+ res = _mm256_add_epi16(res, round_const_horz);
+ const __m256i res_clamped =
+ _mm256_min_epi16(_mm256_max_epi16(res, clamp_low), clamp_high);
+ _mm256_store_si256((__m256i *)&im_block[i * im_stride], res_clamped);
}
- }
- /* Vertical filter */
- {
- // coeffs [ g7 g6 g5 g4 g3 g2 g1 g0 ]
- const __m128i coeffs_y = _mm_add_epi16(xx_loadu_128(filter_y), offset);
-
- // coeffs [ g3 g2 g3 g2 g1 g0 g1 g0 ]
- const __m128i coeffs_0123 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
- // coeffs [ g7 g6 g7 g6 g5 g4 g5 g4 ]
- const __m128i coeffs_4567 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);
-
- // coeffs [ g1 g0 g1 g0 g1 g0 g1 g0 ]
- const __m128i coeffs_01_128 = _mm_unpacklo_epi64(coeffs_0123, coeffs_0123);
- // coeffs [ g3 g2 g3 g2 g3 g2 g3 g2 ]
- const __m128i coeffs_23_128 = _mm_unpackhi_epi64(coeffs_0123, coeffs_0123);
- // coeffs [ g5 g4 g5 g4 g5 g4 g5 g4 ]
- const __m128i coeffs_45_128 = _mm_unpacklo_epi64(coeffs_4567, coeffs_4567);
- // coeffs [ g7 g6 g7 g6 g7 g6 g7 g6 ]
- const __m128i coeffs_67_128 = _mm_unpackhi_epi64(coeffs_4567, coeffs_4567);
-
- // coeffs [ g1 g0 g1 g0 g1 g0 g1 g0 ][ g1 g0 g1 g0 g1 g0 g1 g0 ]
- const __m256i coeffs_01 = yy_set_m128i(coeffs_01_128, coeffs_01_128);
- // coeffs [ g3 g2 g3 g2 g3 g2 g3 g2 ][ g3 g2 g3 g2 g3 g2 g3 g2 ]
- const __m256i coeffs_23 = yy_set_m128i(coeffs_23_128, coeffs_23_128);
- // coeffs [ g5 g4 g5 g4 g5 g4 g5 g4 ][ g5 g4 g5 g4 g5 g4 g5 g4 ]
- const __m256i coeffs_45 = yy_set_m128i(coeffs_45_128, coeffs_45_128);
- // coeffs [ g7 g6 g7 g6 g7 g6 g7 g6 ][ g7 g6 g7 g6 g7 g6 g7 g6 ]
- const __m256i coeffs_67 = yy_set_m128i(coeffs_67_128, coeffs_67_128);
-
- const __m256i round_const =
- _mm256_set1_epi32((1 << (conv_params->round_1 - 1)) -
- (1 << (bd + conv_params->round_1 - 1)));
-
- for (int i = 0; i < h; ++i) {
- for (int j = 0; j < w; j += 16) {
- const uint16_t *data_ij = temp + i * MAX_SB_SIZE + j;
-
- // Load 16-bit data from the output of the horizontal filter in
- // which the pixels are ordered as follows:
- // [ 15 13 11 9 14 12 10 8 ] [ 7 5 3 1 6 4 2 0 ]
- const __m256i data_0 = yy_loadu_256(data_ij + 0 * MAX_SB_SIZE);
- const __m256i data_1 = yy_loadu_256(data_ij + 1 * MAX_SB_SIZE);
- const __m256i data_2 = yy_loadu_256(data_ij + 2 * MAX_SB_SIZE);
- const __m256i data_3 = yy_loadu_256(data_ij + 3 * MAX_SB_SIZE);
- const __m256i data_4 = yy_loadu_256(data_ij + 4 * MAX_SB_SIZE);
- const __m256i data_5 = yy_loadu_256(data_ij + 5 * MAX_SB_SIZE);
- const __m256i data_6 = yy_loadu_256(data_ij + 6 * MAX_SB_SIZE);
- const __m256i data_7 = yy_loadu_256(data_ij + 7 * MAX_SB_SIZE);
-
- // Filter the even-indices, increasing to 32-bit precision
- const __m256i src_0 = _mm256_unpacklo_epi16(data_0, data_1);
- const __m256i src_2 = _mm256_unpacklo_epi16(data_2, data_3);
- const __m256i src_4 = _mm256_unpacklo_epi16(data_4, data_5);
- const __m256i src_6 = _mm256_unpacklo_epi16(data_6, data_7);
-
- const __m256i res_0 = _mm256_madd_epi16(src_0, coeffs_01);
- const __m256i res_2 = _mm256_madd_epi16(src_2, coeffs_23);
- const __m256i res_4 = _mm256_madd_epi16(src_4, coeffs_45);
- const __m256i res_6 = _mm256_madd_epi16(src_6, coeffs_67);
-
- const __m256i res_even = _mm256_add_epi32(
- _mm256_add_epi32(res_0, res_2), _mm256_add_epi32(res_4, res_6));
-
- // Filter the odd-indices, increasing to 32-bit precision
- const __m256i src_1 = _mm256_unpackhi_epi16(data_0, data_1);
- const __m256i src_3 = _mm256_unpackhi_epi16(data_2, data_3);
- const __m256i src_5 = _mm256_unpackhi_epi16(data_4, data_5);
- const __m256i src_7 = _mm256_unpackhi_epi16(data_6, data_7);
-
- const __m256i res_1 = _mm256_madd_epi16(src_1, coeffs_01);
- const __m256i res_3 = _mm256_madd_epi16(src_3, coeffs_23);
- const __m256i res_5 = _mm256_madd_epi16(src_5, coeffs_45);
- const __m256i res_7 = _mm256_madd_epi16(src_7, coeffs_67);
-
- const __m256i res_odd = _mm256_add_epi32(
- _mm256_add_epi32(res_1, res_3), _mm256_add_epi32(res_5, res_7));
-
- // Pixels are currently in the following order:
- // res_even order: [ 14 12 10 8 ] [ 6 4 2 0 ]
- // res_odd order: [ 15 13 11 9 ] [ 7 5 3 1 ]
- //
- // Rearrange the pixels into the following order:
- // res_lo order: [ 11 10 9 8 ] [ 3 2 1 0 ]
- // res_hi order: [ 15 14 13 12 ] [ 7 6 5 4 ]
- const __m256i res_lo = _mm256_unpacklo_epi32(res_even, res_odd);
- const __m256i res_hi = _mm256_unpackhi_epi32(res_even, res_odd);
-
- const __m256i res_lo_round = _mm256_srai_epi32(
- _mm256_add_epi32(res_lo, round_const), conv_params->round_1);
- const __m256i res_hi_round = _mm256_srai_epi32(
- _mm256_add_epi32(res_hi, round_const), conv_params->round_1);
-
- // Reduce to 16-bit precision and pack into the correct order:
- // [ 15 14 13 12 11 10 9 8 ][ 7 6 5 4 3 2 1 0 ]
- const __m256i res_16bit =
- _mm256_packs_epi32(res_lo_round, res_hi_round);
-
- // Reduce to 8-bit precision. This messes up the order:
- // [ - - - - - - - - 15 14 13 12 11 10 9 8 ]
- // [ - - - - - - - - 7 6 5 4 3 2 1 0 ]
- const __m256i res_8bit =
- _mm256_packus_epi16(res_16bit, zero_256 /* don't care value */);
-
- // Swap the two central 32-bit values to get the order:
- // [ - - - - - - - - - - - - - - - - ]
- // [ 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 ]
- const __m256i res_8bit2 = _mm256_permute4x64_epi64(res_8bit, 0xd8);
-
- // Store the lower 128-bit lane in the dst array
- xx_storeu_128(dst + i * dst_stride + j,
- _mm256_castsi256_si128(res_8bit2));
+ /* Vertical filter */
+ {
+ __m256i src_0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride));
+ __m256i src_1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride));
+ __m256i src_2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride));
+ __m256i src_3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride));
+ __m256i src_4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride));
+ __m256i src_5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride));
+
+ __m256i s[8];
+ s[0] = _mm256_unpacklo_epi16(src_0, src_1);
+ s[1] = _mm256_unpacklo_epi16(src_2, src_3);
+ s[2] = _mm256_unpacklo_epi16(src_4, src_5);
+
+ s[4] = _mm256_unpackhi_epi16(src_0, src_1);
+ s[5] = _mm256_unpackhi_epi16(src_2, src_3);
+ s[6] = _mm256_unpackhi_epi16(src_4, src_5);
+
+ for (i = 0; i < h - 1; i += 2) {
+ const int16_t *data = &im_block[i * im_stride];
+
+ const __m256i s6 =
+ _mm256_loadu_si256((__m256i *)(data + 6 * im_stride));
+ const __m256i s7 =
+ _mm256_loadu_si256((__m256i *)(data + 7 * im_stride));
+
+ s[3] = _mm256_unpacklo_epi16(s6, s7);
+ s[7] = _mm256_unpackhi_epi16(s6, s7);
+
+ __m256i res_a = convolve(s, coeffs_v);
+ __m256i res_b = convolve(s + 4, coeffs_v);
+
+ const __m256i res_a_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_a, round_const_v), round_shift_v);
+ const __m256i res_b_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_b, round_const_v), round_shift_v);
+
+ /* rounding code */
+ // 16 bit conversion
+ const __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round);
+ // 8 bit conversion and saturation to uint8
+ const __m256i res_8b = _mm256_packus_epi16(res_16bit, res_16bit);
+
+ const __m128i res_0 = _mm256_castsi256_si128(res_8b);
+ const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);
+
+ // Store values into the destination buffer
+ __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];
+ __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + j + dst_stride];
+
+ _mm_storel_epi64(p_0, res_0);
+ _mm_storel_epi64(p_1, res_1);
+
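+        // Slide the 8-row window down by two rows for the next iteration.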
+ s[0] = s[1];
+ s[1] = s[2];
+ s[2] = s[3];
+
+ s[4] = s[5];
+ s[5] = s[6];
+ s[6] = s[7];
+ }
+ if (h - i) {
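+        // h is odd here, so one output row remains; repack the window halves
+        // so a single 256-bit convolve covers that last row.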
+ s[0] = _mm256_permute2x128_si256(s[0], s[4], 0x20);
+ s[1] = _mm256_permute2x128_si256(s[1], s[5], 0x20);
+ s[2] = _mm256_permute2x128_si256(s[2], s[6], 0x20);
+
+ const int16_t *data = &im_block[i * im_stride];
+ const __m128i s6_ = _mm_loadu_si128((__m128i *)(data + 6 * im_stride));
+ const __m128i s7_ = _mm_loadu_si128((__m128i *)(data + 7 * im_stride));
+
+ __m128i s3 = _mm_unpacklo_epi16(s6_, s7_);
+ __m128i s7 = _mm_unpackhi_epi16(s6_, s7_);
+
+ s[3] = _mm256_inserti128_si256(_mm256_castsi128_si256(s3), s7, 1);
+ __m256i convolveres = convolve(s, coeffs_v);
+
+ const __m256i res_round = _mm256_sra_epi32(
+ _mm256_add_epi32(convolveres, round_const_v), round_shift_v);
+
+ /* rounding code */
+ // 16 bit conversion
+ __m128i reslo = _mm256_castsi256_si128(res_round);
+ __m128i reshi = _mm256_extracti128_si256(res_round, 1);
+ const __m128i res_16bit = _mm_packus_epi32(reslo, reshi);
+
+ // 8 bit conversion and saturation to uint8
+ const __m128i res_8b = _mm_packus_epi16(res_16bit, res_16bit);
+ __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];
+ _mm_storel_epi64(p_0, res_8b);
}
}
}