diff options
author | Moonchild <moonchild@palemoon.org> | 2021-03-03 18:48:48 +0000 |
---|---|---|
committer | Moonchild <moonchild@palemoon.org> | 2021-03-04 00:03:46 +0000 |
commit | 44d2b4a86e3d862eb1b68db3d9a29b9dbf3da746 (patch) | |
tree | 9d9cc4d21c93ae3e1a88ab5c160c3be5f6af0ca9 /media/libaom/src/av1/common | |
parent | 353943d1a48086a39ff5f4365b22f8f058d5f66e (diff) | |
download | aura-central-44d2b4a86e3d862eb1b68db3d9a29b9dbf3da746.tar.gz |
Issue mcp-graveyard/UXP#1737 - Import libaom 2.0.2 source
Diffstat (limited to 'media/libaom/src/av1/common')
117 files changed, 20997 insertions, 15545 deletions
diff --git a/media/libaom/src/av1/common/alloccommon.c b/media/libaom/src/av1/common/alloccommon.c index 1bf81c91d..badee3df9 100644 --- a/media/libaom/src/av1/common/alloccommon.c +++ b/media/libaom/src/av1/common/alloccommon.c @@ -15,10 +15,10 @@ #include "aom_mem/aom_mem.h" #include "av1/common/alloccommon.h" +#include "av1/common/av1_common_int.h" #include "av1/common/blockd.h" #include "av1/common/entropymode.h" #include "av1/common/entropymv.h" -#include "av1/common/onyxc_int.h" int av1_get_MBs(int width, int height) { const int aligned_width = ALIGN_POWER_OF_TWO(width, 3); @@ -31,60 +31,6 @@ int av1_get_MBs(int width, int height) { return mb_rows * mb_cols; } -#if LOOP_FILTER_BITMASK -static int alloc_loop_filter_mask(AV1_COMMON *cm) { - aom_free(cm->lf.lfm); - cm->lf.lfm = NULL; - - // Each lfm holds bit masks for all the 4x4 blocks in a max - // 64x64 (128x128 for ext_partitions) region. The stride - // and rows are rounded up / truncated to a multiple of 16 - // (32 for ext_partition). - cm->lf.lfm_stride = (cm->mi_cols + (MI_SIZE_64X64 - 1)) >> MIN_MIB_SIZE_LOG2; - cm->lf.lfm_num = ((cm->mi_rows + (MI_SIZE_64X64 - 1)) >> MIN_MIB_SIZE_LOG2) * - cm->lf.lfm_stride; - cm->lf.lfm = - (LoopFilterMask *)aom_calloc(cm->lf.lfm_num, sizeof(*cm->lf.lfm)); - if (!cm->lf.lfm) return 1; - - unsigned int i; - for (i = 0; i < cm->lf.lfm_num; ++i) av1_zero(cm->lf.lfm[i]); - - return 0; -} - -static void free_loop_filter_mask(AV1_COMMON *cm) { - if (cm->lf.lfm == NULL) return; - - aom_free(cm->lf.lfm); - cm->lf.lfm = NULL; - cm->lf.lfm_num = 0; - cm->lf.lfm_stride = 0; -} -#endif - -void av1_set_mb_mi(AV1_COMMON *cm, int width, int height) { - // Ensure that the decoded width and height are both multiples of - // 8 luma pixels (note: this may only be a multiple of 4 chroma pixels if - // subsampling is used). - // This simplifies the implementation of various experiments, - // eg. cdef, which operates on units of 8x8 luma pixels. 
- const int aligned_width = ALIGN_POWER_OF_TWO(width, 3); - const int aligned_height = ALIGN_POWER_OF_TWO(height, 3); - - cm->mi_cols = aligned_width >> MI_SIZE_LOG2; - cm->mi_rows = aligned_height >> MI_SIZE_LOG2; - cm->mi_stride = calc_mi_size(cm->mi_cols); - - cm->mb_cols = (cm->mi_cols + 2) >> 2; - cm->mb_rows = (cm->mi_rows + 2) >> 2; - cm->MBs = cm->mb_rows * cm->mb_cols; - -#if LOOP_FILTER_BITMASK - alloc_loop_filter_mask(cm); -#endif -} - void av1_free_ref_frame_buffers(BufferPool *pool) { int i; @@ -92,6 +38,9 @@ void av1_free_ref_frame_buffers(BufferPool *pool) { if (pool->frame_bufs[i].ref_count > 0 && pool->frame_bufs[i].raw_frame_buffer.data != NULL) { pool->release_fb_cb(pool->cb_priv, &pool->frame_bufs[i].raw_frame_buffer); + pool->frame_bufs[i].raw_frame_buffer.data = NULL; + pool->frame_bufs[i].raw_frame_buffer.size = 0; + pool->frame_bufs[i].raw_frame_buffer.priv = NULL; pool->frame_bufs[i].ref_count = 0; } aom_free(pool->frame_bufs[i].mvs); @@ -124,20 +73,19 @@ void av1_alloc_restoration_buffers(AV1_COMMON *cm) { // able to quickly answer the question "Where is the <n>'th stripe for tile // row <m>?" To make that efficient, we generate the rst_last_stripe array. int num_stripes = 0; - for (int i = 0; i < cm->tile_rows; ++i) { + for (int i = 0; i < cm->tiles.rows; ++i) { TileInfo tile_info; av1_tile_set_row(&tile_info, cm, i); const int mi_h = tile_info.mi_row_end - tile_info.mi_row_start; const int ext_h = RESTORATION_UNIT_OFFSET + (mi_h << MI_SIZE_LOG2); const int tile_stripes = (ext_h + 63) / 64; num_stripes += tile_stripes; - cm->rst_end_stripe[i] = num_stripes; } // Now we need to allocate enough space to store the line buffers for the // stripes const int frame_w = cm->superres_upscaled_width; - const int use_highbd = cm->seq_params.use_highbitdepth ? 
1 : 0; + const int use_highbd = cm->seq_params.use_highbitdepth; for (int p = 0; p < num_planes; ++p) { const int is_uv = p > 0; @@ -184,106 +132,131 @@ void av1_free_restoration_buffers(AV1_COMMON *cm) { aom_free_frame_buffer(&cm->rst_frame); } -void av1_free_above_context_buffers(AV1_COMMON *cm, - int num_free_above_contexts) { +void av1_free_above_context_buffers(CommonContexts *above_contexts) { int i; - const int num_planes = cm->num_allocated_above_context_planes; + const int num_planes = above_contexts->num_planes; - for (int tile_row = 0; tile_row < num_free_above_contexts; tile_row++) { + for (int tile_row = 0; tile_row < above_contexts->num_tile_rows; tile_row++) { for (i = 0; i < num_planes; i++) { - aom_free(cm->above_context[i][tile_row]); - cm->above_context[i][tile_row] = NULL; + aom_free(above_contexts->entropy[i][tile_row]); + above_contexts->entropy[i][tile_row] = NULL; } - aom_free(cm->above_seg_context[tile_row]); - cm->above_seg_context[tile_row] = NULL; + aom_free(above_contexts->partition[tile_row]); + above_contexts->partition[tile_row] = NULL; - aom_free(cm->above_txfm_context[tile_row]); - cm->above_txfm_context[tile_row] = NULL; + aom_free(above_contexts->txfm[tile_row]); + above_contexts->txfm[tile_row] = NULL; } for (i = 0; i < num_planes; i++) { - aom_free(cm->above_context[i]); - cm->above_context[i] = NULL; + aom_free(above_contexts->entropy[i]); + above_contexts->entropy[i] = NULL; } - aom_free(cm->above_seg_context); - cm->above_seg_context = NULL; + aom_free(above_contexts->partition); + above_contexts->partition = NULL; - aom_free(cm->above_txfm_context); - cm->above_txfm_context = NULL; + aom_free(above_contexts->txfm); + above_contexts->txfm = NULL; - cm->num_allocated_above_contexts = 0; - cm->num_allocated_above_context_mi_col = 0; - cm->num_allocated_above_context_planes = 0; + above_contexts->num_tile_rows = 0; + above_contexts->num_mi_cols = 0; + above_contexts->num_planes = 0; } void av1_free_context_buffers(AV1_COMMON 
*cm) { - cm->free_mi(cm); + cm->mi_params.free_mi(&cm->mi_params); - av1_free_above_context_buffers(cm, cm->num_allocated_above_contexts); + av1_free_above_context_buffers(&cm->above_contexts); -#if LOOP_FILTER_BITMASK - free_loop_filter_mask(cm); +#if CONFIG_LPF_MASK + av1_free_loop_filter_mask(cm); #endif } -int av1_alloc_above_context_buffers(AV1_COMMON *cm, - int num_alloc_above_contexts) { - const int num_planes = av1_num_planes(cm); - int plane_idx; +int av1_alloc_above_context_buffers(CommonContexts *above_contexts, + int num_tile_rows, int num_mi_cols, + int num_planes) { const int aligned_mi_cols = - ALIGN_POWER_OF_TWO(cm->mi_cols, MAX_MIB_SIZE_LOG2); + ALIGN_POWER_OF_TWO(num_mi_cols, MAX_MIB_SIZE_LOG2); // Allocate above context buffers - cm->num_allocated_above_contexts = num_alloc_above_contexts; - cm->num_allocated_above_context_mi_col = aligned_mi_cols; - cm->num_allocated_above_context_planes = num_planes; - for (plane_idx = 0; plane_idx < num_planes; plane_idx++) { - cm->above_context[plane_idx] = (ENTROPY_CONTEXT **)aom_calloc( - num_alloc_above_contexts, sizeof(cm->above_context[0])); - if (!cm->above_context[plane_idx]) return 1; + above_contexts->num_tile_rows = num_tile_rows; + above_contexts->num_mi_cols = aligned_mi_cols; + above_contexts->num_planes = num_planes; + for (int plane_idx = 0; plane_idx < num_planes; plane_idx++) { + above_contexts->entropy[plane_idx] = (ENTROPY_CONTEXT **)aom_calloc( + num_tile_rows, sizeof(above_contexts->entropy[0])); + if (!above_contexts->entropy[plane_idx]) return 1; } - cm->above_seg_context = (PARTITION_CONTEXT **)aom_calloc( - num_alloc_above_contexts, sizeof(cm->above_seg_context)); - if (!cm->above_seg_context) return 1; + above_contexts->partition = (PARTITION_CONTEXT **)aom_calloc( + num_tile_rows, sizeof(above_contexts->partition)); + if (!above_contexts->partition) return 1; - cm->above_txfm_context = (TXFM_CONTEXT **)aom_calloc( - num_alloc_above_contexts, sizeof(cm->above_txfm_context)); - if 
(!cm->above_txfm_context) return 1; + above_contexts->txfm = + (TXFM_CONTEXT **)aom_calloc(num_tile_rows, sizeof(above_contexts->txfm)); + if (!above_contexts->txfm) return 1; - for (int tile_row = 0; tile_row < num_alloc_above_contexts; tile_row++) { - for (plane_idx = 0; plane_idx < num_planes; plane_idx++) { - cm->above_context[plane_idx][tile_row] = (ENTROPY_CONTEXT *)aom_calloc( - aligned_mi_cols, sizeof(*cm->above_context[0][tile_row])); - if (!cm->above_context[plane_idx][tile_row]) return 1; + for (int tile_row = 0; tile_row < num_tile_rows; tile_row++) { + for (int plane_idx = 0; plane_idx < num_planes; plane_idx++) { + above_contexts->entropy[plane_idx][tile_row] = + (ENTROPY_CONTEXT *)aom_calloc( + aligned_mi_cols, sizeof(*above_contexts->entropy[0][tile_row])); + if (!above_contexts->entropy[plane_idx][tile_row]) return 1; } - cm->above_seg_context[tile_row] = (PARTITION_CONTEXT *)aom_calloc( - aligned_mi_cols, sizeof(*cm->above_seg_context[tile_row])); - if (!cm->above_seg_context[tile_row]) return 1; + above_contexts->partition[tile_row] = (PARTITION_CONTEXT *)aom_calloc( + aligned_mi_cols, sizeof(*above_contexts->partition[tile_row])); + if (!above_contexts->partition[tile_row]) return 1; - cm->above_txfm_context[tile_row] = (TXFM_CONTEXT *)aom_calloc( - aligned_mi_cols, sizeof(*cm->above_txfm_context[tile_row])); - if (!cm->above_txfm_context[tile_row]) return 1; + above_contexts->txfm[tile_row] = (TXFM_CONTEXT *)aom_calloc( + aligned_mi_cols, sizeof(*above_contexts->txfm[tile_row])); + if (!above_contexts->txfm[tile_row]) return 1; } return 0; } -int av1_alloc_context_buffers(AV1_COMMON *cm, int width, int height) { - int new_mi_size; - - av1_set_mb_mi(cm, width, height); - new_mi_size = cm->mi_stride * calc_mi_size(cm->mi_rows); - if (cm->mi_alloc_size < new_mi_size) { - cm->free_mi(cm); - if (cm->alloc_mi(cm, new_mi_size)) goto fail; +// Allocate the dynamically allocated arrays in 'mi_params' assuming +// 'mi_params->set_mb_mi()' was already 
called earlier to initialize the rest of +// the struct members. +static int alloc_mi(CommonModeInfoParams *mi_params) { + const int aligned_mi_rows = calc_mi_size(mi_params->mi_rows); + const int mi_grid_size = mi_params->mi_stride * aligned_mi_rows; + const int alloc_size_1d = mi_size_wide[mi_params->mi_alloc_bsize]; + const int alloc_mi_size = + mi_params->mi_alloc_stride * (aligned_mi_rows / alloc_size_1d); + + if (mi_params->mi_alloc_size < alloc_mi_size || + mi_params->mi_grid_size < mi_grid_size) { + mi_params->free_mi(mi_params); + + mi_params->mi_alloc = + aom_calloc(alloc_mi_size, sizeof(*mi_params->mi_alloc)); + if (!mi_params->mi_alloc) return 1; + mi_params->mi_alloc_size = alloc_mi_size; + + mi_params->mi_grid_base = (MB_MODE_INFO **)aom_calloc( + mi_grid_size, sizeof(*mi_params->mi_grid_base)); + if (!mi_params->mi_grid_base) return 1; + mi_params->mi_grid_size = mi_grid_size; + + mi_params->tx_type_map = + aom_calloc(mi_grid_size, sizeof(*mi_params->tx_type_map)); + if (!mi_params->tx_type_map) return 1; } return 0; +} + +int av1_alloc_context_buffers(AV1_COMMON *cm, int width, int height) { + CommonModeInfoParams *const mi_params = &cm->mi_params; + mi_params->set_mb_mi(mi_params, width, height); + if (alloc_mi(mi_params)) goto fail; + return 0; fail: // clear the mi_* values to force a realloc on resync - av1_set_mb_mi(cm, 0, 0); + mi_params->set_mb_mi(mi_params, 0, 0); av1_free_context_buffers(cm); return 1; } @@ -293,8 +266,44 @@ void av1_remove_common(AV1_COMMON *cm) { aom_free(cm->fc); cm->fc = NULL; - aom_free(cm->frame_contexts); - cm->frame_contexts = NULL; + aom_free(cm->default_frame_context); + cm->default_frame_context = NULL; +} + +void av1_init_mi_buffers(CommonModeInfoParams *mi_params) { + mi_params->setup_mi(mi_params); +} + +#if CONFIG_LPF_MASK +int av1_alloc_loop_filter_mask(AV1_COMMON *cm) { + aom_free(cm->lf.lfm); + cm->lf.lfm = NULL; + + // Each lfm holds bit masks for all the 4x4 blocks in a max + // 64x64 (128x128 for 
ext_partitions) region. The stride + // and rows are rounded up / truncated to a multiple of 16 + // (32 for ext_partition). + cm->lf.lfm_stride = + (cm->mi_params.mi_cols + (MI_SIZE_64X64 - 1)) >> MIN_MIB_SIZE_LOG2; + cm->lf.lfm_num = + ((cm->mi_params.mi_rows + (MI_SIZE_64X64 - 1)) >> MIN_MIB_SIZE_LOG2) * + cm->lf.lfm_stride; + cm->lf.lfm = + (LoopFilterMask *)aom_calloc(cm->lf.lfm_num, sizeof(*cm->lf.lfm)); + if (!cm->lf.lfm) return 1; + + unsigned int i; + for (i = 0; i < cm->lf.lfm_num; ++i) av1_zero(cm->lf.lfm[i]); + + return 0; } -void av1_init_context_buffers(AV1_COMMON *cm) { cm->setup_mi(cm); } +void av1_free_loop_filter_mask(AV1_COMMON *cm) { + if (cm->lf.lfm == NULL) return; + + aom_free(cm->lf.lfm); + cm->lf.lfm = NULL; + cm->lf.lfm_num = 0; + cm->lf.lfm_stride = 0; +} +#endif diff --git a/media/libaom/src/av1/common/alloccommon.h b/media/libaom/src/av1/common/alloccommon.h index 8e5896981..fe8e0c530 100644 --- a/media/libaom/src/av1/common/alloccommon.h +++ b/media/libaom/src/av1/common/alloccommon.h @@ -14,21 +14,25 @@ #define INVALID_IDX -1 // Invalid buffer index. 
+#include "config/aom_config.h" + #ifdef __cplusplus extern "C" { #endif struct AV1Common; struct BufferPool; +struct CommonContexts; +struct CommonModeInfoParams; void av1_remove_common(struct AV1Common *cm); -int av1_alloc_above_context_buffers(struct AV1Common *cm, - int num_alloc_above_contexts); -void av1_free_above_context_buffers(struct AV1Common *cm, - int num_free_above_contexts); +int av1_alloc_above_context_buffers(struct CommonContexts *above_contexts, + int num_tile_rows, int num_mi_cols, + int num_planes); +void av1_free_above_context_buffers(struct CommonContexts *above_contexts); int av1_alloc_context_buffers(struct AV1Common *cm, int width, int height); -void av1_init_context_buffers(struct AV1Common *cm); +void av1_init_mi_buffers(struct CommonModeInfoParams *mi_params); void av1_free_context_buffers(struct AV1Common *cm); void av1_free_ref_frame_buffers(struct BufferPool *pool); @@ -38,9 +42,13 @@ void av1_free_restoration_buffers(struct AV1Common *cm); int av1_alloc_state_buffers(struct AV1Common *cm, int width, int height); void av1_free_state_buffers(struct AV1Common *cm); -void av1_set_mb_mi(struct AV1Common *cm, int width, int height); int av1_get_MBs(int width, int height); +#if CONFIG_LPF_MASK +int av1_alloc_loop_filter_mask(struct AV1Common *cm); +void av1_free_loop_filter_mask(struct AV1Common *cm); +#endif + #ifdef __cplusplus } // extern "C" #endif diff --git a/media/libaom/src/av1/common/arm/av1_inv_txfm_neon.c b/media/libaom/src/av1/common/arm/av1_inv_txfm_neon.c index bad411743..2f3567aea 100644 --- a/media/libaom/src/av1/common/arm/av1_inv_txfm_neon.c +++ b/media/libaom/src/av1/common/arm/av1_inv_txfm_neon.c @@ -48,11 +48,11 @@ static const ITX_TYPE_1D hitx_1d_tab[TX_TYPES] = { // 1D functions static const transform_1d_neon lowbd_txfm_all_1d_arr[TX_SIZES][ITX_TYPES_1D] = { - { av1_idct4_new, av1_iadst4_new, av1_iidentity4_c }, - { av1_idct8_new, av1_iadst8_new, av1_iidentity8_c }, - { av1_idct16_new, av1_iadst16_new, 
av1_iidentity16_c }, - { av1_idct32_new, NULL, NULL }, - { av1_idct64_new, NULL, NULL }, + { av1_idct4, av1_iadst4, av1_iidentity4_c }, + { av1_idct8, av1_iadst8, av1_iidentity8_c }, + { av1_idct16, av1_iadst16, av1_iidentity16_c }, + { av1_idct32, NULL, NULL }, + { av1_idct64, NULL, NULL }, }; static INLINE void lowbd_add_flip_buffer_8xn_neon(int16x8_t *in, @@ -248,31 +248,27 @@ static INLINE void btf_16_half_neon(int16x8_t *const x, const int16x4_t c) { x[1] = vcombine_s16(v1[0], v1[1]); } -static INLINE int16x4_t create_s16x4_neon(int16_t *const c0, int16_t *const c1, - int16_t *const c2, - int16_t *const c3) { +static INLINE int16x4_t set_s16x4_neon(const int16_t c0, const int16_t c1, + const int16_t c2, const int16_t c3) { int16x4_t val = vdup_n_s16((int16_t)0); - val = vld1_lane_s16(c0, val, 0); - val = vld1_lane_s16(c1, val, 1); - val = vld1_lane_s16(c2, val, 2); - val = vld1_lane_s16(c3, val, 3); + val = vset_lane_s16(c0, val, 0); + val = vset_lane_s16(c1, val, 1); + val = vset_lane_s16(c2, val, 2); + val = vset_lane_s16(c3, val, 3); return val; } -static INLINE void iadst8_new_neon(int16x8_t *const in, int16x8_t *out, - int8_t cos_bit, int bit) { +static INLINE void iadst8_neon(int16x8_t *const in, int16x8_t *out, + int8_t cos_bit, int bit) { (void)bit; const int32_t *cospi = cospi_arr(cos_bit); - const int16x4_t c0 = - create_s16x4_neon((int16_t *)(cospi + 4), (int16_t *)(cospi + 60), - (int16_t *)(cospi + 20), (int16_t *)(cospi + 44)); - const int16x4_t c1 = - create_s16x4_neon((int16_t *)(cospi + 36), (int16_t *)(cospi + 28), - (int16_t *)(cospi + 52), (int16_t *)(cospi + 12)); - const int16x4_t c2 = - create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32), - (int16_t *)(cospi + 16), (int16_t *)(cospi + 48)); + const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[4], (int16_t)cospi[60], + (int16_t)cospi[20], (int16_t)cospi[44]); + const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[36], (int16_t)cospi[28], + (int16_t)cospi[52], 
(int16_t)cospi[12]); + const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], + (int16_t)cospi[16], (int16_t)cospi[48]); int16x8_t x[8]; int16x8_t s0, s1, s2, s3, s4, s5, s6, s7; @@ -327,22 +323,21 @@ static INLINE void iadst8_new_neon(int16x8_t *const in, int16x8_t *out, // Stage 7 out[0] = x[0]; - out[1] = vnegq_s16(x[4]); + out[1] = vqnegq_s16(x[4]); out[2] = x[6]; - out[3] = vnegq_s16(x[2]); + out[3] = vqnegq_s16(x[2]); out[4] = x[3]; - out[5] = vnegq_s16(x[7]); + out[5] = vqnegq_s16(x[7]); out[6] = x[5]; - out[7] = vnegq_s16(x[1]); + out[7] = vqnegq_s16(x[1]); } -static INLINE void iadst8_low1_new_neon(int16x8_t *const in, int16x8_t *out, - int8_t cos_bit, int bit) { +static INLINE void iadst8_low1_neon(int16x8_t *const in, int16x8_t *out, + int8_t cos_bit, int bit) { (void)bit; const int32_t *cospi = cospi_arr(cos_bit); - const int16x4_t c2 = - create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32), - (int16_t *)(cospi + 16), (int16_t *)(cospi + 48)); + const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], + (int16_t)cospi[16], (int16_t)cospi[48]); int16x8_t x[8]; int16x8_t s0, s1, s4, s5; @@ -381,34 +376,32 @@ static INLINE void iadst8_low1_new_neon(int16x8_t *const in, int16x8_t *out, // Stage 7 out[0] = x[0]; - out[1] = vnegq_s16(x[4]); + out[1] = vqnegq_s16(x[4]); out[2] = x[6]; - out[3] = vnegq_s16(x[2]); + out[3] = vqnegq_s16(x[2]); out[4] = x[3]; - out[5] = vnegq_s16(x[7]); + out[5] = vqnegq_s16(x[7]); out[6] = x[5]; - out[7] = vnegq_s16(x[1]); + out[7] = vqnegq_s16(x[1]); } -static INLINE void idct8_new_neon(int16x8_t *in, int16x8_t *out, int8_t cos_bit, - int bit) { +static INLINE void idct8_neon(int16x8_t *in, int16x8_t *out, int8_t cos_bit, + int bit) { (void)bit; const int32_t *cospi = cospi_arr(cos_bit); int16x8_t step1[8], step2[8]; - const int16x4_t c0 = - create_s16x4_neon((int16_t *)(cospi + 8), (int16_t *)(cospi + 56), - (int16_t *)(cospi + 40), (int16_t *)(cospi + 24)); - const 
int16x4_t c2 = - create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32), - (int16_t *)(cospi + 16), (int16_t *)(cospi + 48)); + const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56], + (int16_t)cospi[40], (int16_t)cospi[24]); + const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], + (int16_t)cospi[16], (int16_t)cospi[48]); // stage 2 btf_16_lane_0_1_neon(in[1], in[7], c0, &step1[7], &step1[4]); btf_16_lane_2_3_neon(in[5], in[3], c0, &step1[6], &step1[5]); // stage 3 - btf_16_lane_0_1_neon(in[0], in[4], c2, &step2[0], &step2[1]); - btf_16_lane_2_3_neon(in[2], in[6], c2, &step2[3], &step2[2]); + btf_16_lane_0_1_neon(in[0], in[4], c1, &step2[0], &step2[1]); + btf_16_lane_2_3_neon(in[2], in[6], c1, &step2[3], &step2[2]); step2[4] = vqaddq_s16(step1[4], step1[5]); step2[5] = vqsubq_s16(step1[4], step1[5]); step2[6] = vqsubq_s16(step1[7], step1[6]); @@ -419,7 +412,7 @@ static INLINE void idct8_new_neon(int16x8_t *in, int16x8_t *out, int8_t cos_bit, step1[1] = vqaddq_s16(step2[1], step2[2]); step1[2] = vqsubq_s16(step2[1], step2[2]); step1[3] = vqsubq_s16(step2[0], step2[3]); - btf_16_lane_0_1_neon(step2[6], step2[5], c2, &step1[6], &step1[5]); + btf_16_lane_0_1_neon(step2[6], step2[5], c1, &step1[6], &step1[5]); // stage 5 out[0] = vqaddq_s16(step1[0], step2[7]); @@ -432,8 +425,8 @@ static INLINE void idct8_new_neon(int16x8_t *in, int16x8_t *out, int8_t cos_bit, out[7] = vqsubq_s16(step1[0], step2[7]); } -static INLINE void idct8_low1_new_neon(int16x8_t *in, int16x8_t *out, - int8_t cos_bit, int bit) { +static INLINE void idct8_low1_neon(int16x8_t *in, int16x8_t *out, + int8_t cos_bit, int bit) { (void)bit; const int32_t *cospi = cospi_arr(cos_bit); int16x8_t step1; @@ -489,19 +482,24 @@ static INLINE void load_buffer_32bit_to_16bit_neon(const int32_t *input, } } -static INLINE void identity8_new_neon(int16x8_t *input, int16x8_t *output, - int8_t cos_bit, int bit) { - (void)bit; - (void)cos_bit; - - output[0] = 
vmulq_n_s16(input[0], (int16_t)2); - output[1] = vmulq_n_s16(input[1], (int16_t)2); - output[2] = vmulq_n_s16(input[2], (int16_t)2); - output[3] = vmulq_n_s16(input[3], (int16_t)2); - output[4] = vmulq_n_s16(input[4], (int16_t)2); - output[5] = vmulq_n_s16(input[5], (int16_t)2); - output[6] = vmulq_n_s16(input[6], (int16_t)2); - output[7] = vmulq_n_s16(input[7], (int16_t)2); +static int16_t sqrt_2_list[TX_SIZES] = { 5793, 2 * 4096, 2 * 5793, 4 * 4096, + 4 * 5793 }; + +static INLINE void identity_txfm_round_neon(int16x8_t *input, int16x8_t *output, + int txw_idx, int8_t size, int bit) { + const int32x4_t dup_bits_n_32x4 = vdupq_n_s32((int32_t)(-bit)); + int16x4_t scale = vdup_n_s16(sqrt_2_list[txw_idx]); + int16x4_t low_i16, high_i16; + int32x4_t low_i32, high_i32; + for (int i = 0; i < size; i++) { + int32x4_t temp_out_low = vmull_s16(vget_low_s16(input[i]), scale); + int32x4_t temp_out_high = vmull_s16(vget_high_s16(input[i]), scale); + low_i32 = vrshlq_s32(vrshrq_n_s32(temp_out_low, 12), dup_bits_n_32x4); + high_i32 = vrshlq_s32(vrshrq_n_s32(temp_out_high, 12), dup_bits_n_32x4); + low_i16 = vqmovn_s32(low_i32); + high_i16 = vqmovn_s32(high_i32); + output[i] = vcombine_s16(low_i16, high_i16); + } } static INLINE void round_shift_for_rect(int16x8_t *input, int16x8_t *output, @@ -520,38 +518,8 @@ static INLINE void round_shift_for_rect(int16x8_t *input, int16x8_t *output, } } -static INLINE void identity16_new_neon(int16x8_t *input, int16x8_t *output, - int8_t cos_bit, int bit) { - (void)bit; - (void)cos_bit; - - int32x4_t out_low, out_high; - int16x4_t low, high; - int16_t scale = (int16_t)(2 * NewSqrt2); - - for (int z = 0; z < 16; ++z) { - out_low = vmull_n_s16(vget_low_s16(input[z]), scale); - out_high = vmull_n_s16(vget_high_s16(input[z]), scale); - - low = vqrshrn_n_s32(out_low, (int32_t)NewSqrt2Bits); - high = vqrshrn_n_s32(out_high, (int32_t)NewSqrt2Bits); - - output[z] = vcombine_s16(low, high); - } -} - -static INLINE void identity32_new_neon(int16x8_t 
*input, int16x8_t *output, - int8_t cos_bit, int bit) { - (void)bit; - (void)cos_bit; - - for (int z = 0; z < 32; ++z) { - output[z] = vmulq_n_s16(input[z], (int16_t)4); - } -} - -static INLINE void idct16_low1_new_neon(int16x8_t *in, int16x8_t *out, - int8_t cos_bit, int bit) { +static INLINE void idct16_low1_neon(int16x8_t *in, int16x8_t *out, + int8_t cos_bit, int bit) { (void)bit; const int32_t *cospi = cospi_arr(cos_bit); int16x8_t step1; @@ -584,25 +552,23 @@ static INLINE void idct16_low1_new_neon(int16x8_t *in, int16x8_t *out, out[15] = step1; } -static INLINE void idct16_new_neon(int16x8_t *in, int16x8_t *out, - int8_t cos_bit, int bit) { +static INLINE void idct16_neon(int16x8_t *in, int16x8_t *out, int8_t cos_bit, + int bit) { (void)bit; const int32_t *cospi = cospi_arr(cos_bit); int16x8_t step1[16], step2[16]; - const int16x4_t c0 = - create_s16x4_neon((int16_t *)(cospi + 4), (int16_t *)(cospi + 60), - (int16_t *)(cospi + 36), (int16_t *)(cospi + 28)); - const int16x4_t c1 = - create_s16x4_neon((int16_t *)(cospi + 20), (int16_t *)(cospi + 44), - (int16_t *)(cospi + 52), (int16_t *)(cospi + 12)); - const int16x4_t c2 = - create_s16x4_neon((int16_t *)(cospi + 8), (int16_t *)(cospi + 56), - (int16_t *)(cospi + 40), (int16_t *)(cospi + 24)); - const int16x4_t c3 = - create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32), - (int16_t *)(cospi + 16), (int16_t *)(cospi + 48)); - + const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[4], (int16_t)cospi[60], + (int16_t)cospi[36], (int16_t)cospi[28]); + const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[20], (int16_t)cospi[44], + (int16_t)cospi[52], (int16_t)cospi[12]); + const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56], + (int16_t)cospi[40], (int16_t)cospi[24]); + const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], + (int16_t)cospi[16], (int16_t)cospi[48]); + const int16x4_t c4 = + set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]), + 
(int16_t)(-cospi[16]), (int16_t)(-cospi[48])); // stage 2 btf_16_lane_0_1_neon(in[1], in[15], c0, &step2[15], &step2[8]); @@ -642,8 +608,7 @@ static INLINE void idct16_new_neon(int16x8_t *in, int16x8_t *out, btf_16_lane_0_1_neon(step1[0], step1[1], c3, &step2[0], &step2[1]); btf_16_lane_2_3_neon(step1[2], step1[3], c3, &step2[3], &step2[2]); btf_16_lane_2_3_neon(step1[14], step1[9], c3, &step2[14], &step2[9]); - btf_16_lane_3_2_neon(vnegq_s16(step1[10]), vnegq_s16(step1[13]), c3, - &step2[10], &step2[13]); + btf_16_lane_3_2_neon(step1[10], step1[13], c4, &step2[10], &step2[13]); step2[4] = vqaddq_s16(step1[4], step1[5]); step2[5] = vqsubq_s16(step1[4], step1[5]); @@ -710,14 +675,16 @@ static INLINE void idct16_new_neon(int16x8_t *in, int16x8_t *out, out[15] = vqsubq_s16(step2[0], step2[15]); } -static INLINE void idct16_low8_new_neon(int16x8_t *in, int16x8_t *out, - int8_t cos_bit, int bit) { +static INLINE void idct16_low8_neon(int16x8_t *in, int16x8_t *out, + int8_t cos_bit, int bit) { (void)bit; const int32_t *cospi = cospi_arr(cos_bit); int16x8_t step1[16], step2[16]; - const int16x4_t c0 = - create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32), - (int16_t *)(cospi + 16), (int16_t *)(cospi + 48)); + const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], + (int16_t)cospi[16], (int16_t)cospi[48]); + const int16x4_t c1 = + set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]), + (int16_t)(-cospi[16]), (int16_t)(-cospi[48])); // stage 1 // stage 2 @@ -753,8 +720,7 @@ static INLINE void idct16_low8_new_neon(int16x8_t *in, int16x8_t *out, btf_16_neon(step1[0], cospi[32], cospi[32], &step2[0], &step2[1]); btf_16_neon(step1[2], cospi[48], cospi[16], &step2[2], &step2[3]); btf_16_lane_2_3_neon(step1[14], step1[9], c0, &step2[14], &step2[9]); - btf_16_lane_3_2_neon(vnegq_s16(step1[10]), vnegq_s16(step1[13]), c0, - &step2[10], &step2[13]); + btf_16_lane_3_2_neon(step1[10], step1[13], c1, &step2[10], &step2[13]); step2[4] = 
vqaddq_s16(step1[4], step1[5]); step2[5] = vqsubq_s16(step1[4], step1[5]); @@ -820,30 +786,23 @@ static INLINE void idct16_low8_new_neon(int16x8_t *in, int16x8_t *out, out[15] = vqsubq_s16(step2[0], step2[15]); } -static INLINE void iadst16_new_neon(int16x8_t *const in, int16x8_t *out, - int8_t cos_bit, int bit) { +static INLINE void iadst16_neon(int16x8_t *const in, int16x8_t *out, + int8_t cos_bit, int bit) { (void)bit; const int32_t *cospi = cospi_arr(cos_bit); - const int16x4_t c0 = - create_s16x4_neon((int16_t *)(cospi + 2), (int16_t *)(cospi + 62), - (int16_t *)(cospi + 10), (int16_t *)(cospi + 54)); - const int16x4_t c1 = - create_s16x4_neon((int16_t *)(cospi + 18), (int16_t *)(cospi + 46), - (int16_t *)(cospi + 26), (int16_t *)(cospi + 38)); - const int16x4_t c2 = - create_s16x4_neon((int16_t *)(cospi + 34), (int16_t *)(cospi + 30), - (int16_t *)(cospi + 42), (int16_t *)(cospi + 22)); - const int16x4_t c3 = - create_s16x4_neon((int16_t *)(cospi + 50), (int16_t *)(cospi + 14), - (int16_t *)(cospi + 58), (int16_t *)(cospi + 6)); - const int16x4_t c4 = - create_s16x4_neon((int16_t *)(cospi + 8), (int16_t *)(cospi + 56), - (int16_t *)(cospi + 40), (int16_t *)(cospi + 24)); - - const int16x4_t c = - create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32), - (int16_t *)(cospi + 16), (int16_t *)(cospi + 48)); + const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[2], (int16_t)cospi[62], + (int16_t)cospi[10], (int16_t)cospi[54]); + const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[18], (int16_t)cospi[46], + (int16_t)cospi[26], (int16_t)cospi[38]); + const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[34], (int16_t)cospi[30], + (int16_t)cospi[42], (int16_t)cospi[22]); + const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[50], (int16_t)cospi[14], + (int16_t)cospi[58], (int16_t)cospi[6]); + const int16x4_t c4 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56], + (int16_t)cospi[40], (int16_t)cospi[24]); + const int16x4_t c5 = 
set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], + (int16_t)cospi[16], (int16_t)cospi[48]); int16x8_t x[16]; int16x8_t t[14]; @@ -933,14 +892,14 @@ static INLINE void iadst16_new_neon(int16x8_t *const in, int16x8_t *out, t[1] = x[1]; t[2] = x[2]; t[3] = x[3]; - btf_16_lane_2_3_neon(x[4], x[5], c, &s4, &s5); - btf_16_lane_3_2_neon(x[7], x[6], c, &s7, &s6); + btf_16_lane_2_3_neon(x[4], x[5], c5, &s4, &s5); + btf_16_lane_3_2_neon(x[7], x[6], c5, &s7, &s6); t[8] = x[8]; t[9] = x[9]; t[10] = x[10]; t[11] = x[11]; - btf_16_lane_2_3_neon(x[12], x[13], c, &s12, &s13); - btf_16_lane_3_2_neon(x[15], x[14], c, &s15, &s14); + btf_16_lane_2_3_neon(x[12], x[13], c5, &s12, &s13); + btf_16_lane_3_2_neon(x[15], x[14], c5, &s15, &s14); // Stage 7 x[0] = vqaddq_s16(t[0], t[2]); @@ -961,40 +920,38 @@ static INLINE void iadst16_new_neon(int16x8_t *const in, int16x8_t *out, x[15] = vqsubq_s16(s13, s15); // Stage 8 - btf_16_half_neon(x + 2, c); - btf_16_half_neon(x + 6, c); - btf_16_half_neon(x + 10, c); - btf_16_half_neon(x + 14, c); + btf_16_half_neon(x + 2, c5); + btf_16_half_neon(x + 6, c5); + btf_16_half_neon(x + 10, c5); + btf_16_half_neon(x + 14, c5); // Stage 9 out[0] = x[0]; - out[1] = vnegq_s16(x[8]); + out[1] = vqnegq_s16(x[8]); out[2] = x[12]; - out[3] = vnegq_s16(x[4]); + out[3] = vqnegq_s16(x[4]); out[4] = x[6]; - out[5] = vnegq_s16(x[14]); + out[5] = vqnegq_s16(x[14]); out[6] = x[10]; - out[7] = vnegq_s16(x[2]); + out[7] = vqnegq_s16(x[2]); out[8] = x[3]; - out[9] = vnegq_s16(x[11]); + out[9] = vqnegq_s16(x[11]); out[10] = x[15]; - out[11] = vnegq_s16(x[7]); + out[11] = vqnegq_s16(x[7]); out[12] = x[5]; - out[13] = vnegq_s16(x[13]); + out[13] = vqnegq_s16(x[13]); out[14] = x[9]; - out[15] = vnegq_s16(x[1]); + out[15] = vqnegq_s16(x[1]); } -static INLINE void iadst16_low1_new_neon(int16x8_t *const in, int16x8_t *out, - int8_t cos_bit, int bit) { +static INLINE void iadst16_low1_neon(int16x8_t *const in, int16x8_t *out, + int8_t cos_bit, int bit) { (void)bit; const 
int32_t *cospi = cospi_arr(cos_bit); - const int16x4_t c4 = - create_s16x4_neon((int16_t *)(cospi + 8), (int16_t *)(cospi + 56), - (int16_t *)(cospi + 40), (int16_t *)(cospi + 24)); - const int16x4_t c = - create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32), - (int16_t *)(cospi + 16), (int16_t *)(cospi + 48)); + const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56], + (int16_t)cospi[40], (int16_t)cospi[24]); + const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], + (int16_t)cospi[16], (int16_t)cospi[48]); int16x8_t x[16]; int16x8_t t[10]; @@ -1016,7 +973,7 @@ static INLINE void iadst16_low1_new_neon(int16x8_t *const in, int16x8_t *out, // Stage 4 t[0] = x[0]; t[1] = x[1]; - btf_16_lane_0_1_neon(x[8], x[9], c4, &s8, &s9); + btf_16_lane_0_1_neon(x[8], x[9], c0, &s8, &s9); // Stage 5 x[0] = t[0]; @@ -1031,10 +988,10 @@ static INLINE void iadst16_low1_new_neon(int16x8_t *const in, int16x8_t *out, // stage 6 t[0] = x[0]; t[1] = x[1]; - btf_16_lane_2_3_neon(x[4], x[5], c, &s4, &s5); + btf_16_lane_2_3_neon(x[4], x[5], c1, &s4, &s5); t[8] = x[8]; t[9] = x[9]; - btf_16_lane_2_3_neon(x[12], x[13], c, &s12, &s13); + btf_16_lane_2_3_neon(x[12], x[13], c1, &s12, &s13); // Stage 7 x[0] = t[0]; @@ -1055,41 +1012,39 @@ static INLINE void iadst16_low1_new_neon(int16x8_t *const in, int16x8_t *out, x[15] = s13; // Stage 8 - btf_16_half_neon(x + 2, c); - btf_16_half_neon(x + 6, c); - btf_16_half_neon(x + 10, c); - btf_16_half_neon(x + 14, c); + btf_16_half_neon(x + 2, c1); + btf_16_half_neon(x + 6, c1); + btf_16_half_neon(x + 10, c1); + btf_16_half_neon(x + 14, c1); // Stage 9 out[0] = x[0]; - out[1] = vnegq_s16(x[8]); + out[1] = vqnegq_s16(x[8]); out[2] = x[12]; - out[3] = vnegq_s16(x[4]); + out[3] = vqnegq_s16(x[4]); out[4] = x[6]; - out[5] = vnegq_s16(x[14]); + out[5] = vqnegq_s16(x[14]); out[6] = x[10]; - out[7] = vnegq_s16(x[2]); + out[7] = vqnegq_s16(x[2]); out[8] = x[3]; - out[9] = vnegq_s16(x[11]); + out[9] = 
vqnegq_s16(x[11]); out[10] = x[15]; - out[11] = vnegq_s16(x[7]); + out[11] = vqnegq_s16(x[7]); out[12] = x[5]; - out[13] = vnegq_s16(x[13]); + out[13] = vqnegq_s16(x[13]); out[14] = x[9]; - out[15] = vnegq_s16(x[1]); + out[15] = vqnegq_s16(x[1]); } -static INLINE void iadst16_low8_new_neon(int16x8_t *const in, int16x8_t *out, - int8_t cos_bit, int bit) { +static INLINE void iadst16_low8_neon(int16x8_t *const in, int16x8_t *out, + int8_t cos_bit, int bit) { (void)bit; const int32_t *cospi = cospi_arr(cos_bit); - const int16x4_t c4 = - create_s16x4_neon((int16_t *)(cospi + 8), (int16_t *)(cospi + 56), - (int16_t *)(cospi + 40), (int16_t *)(cospi + 24)); - const int16x4_t c = - create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32), - (int16_t *)(cospi + 16), (int16_t *)(cospi + 48)); + const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56], + (int16_t)cospi[40], (int16_t)cospi[24]); + const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], + (int16_t)cospi[16], (int16_t)cospi[48]); int16x8_t x[16]; int16x8_t t[14]; @@ -1144,10 +1099,10 @@ static INLINE void iadst16_low8_new_neon(int16x8_t *const in, int16x8_t *out, t[5] = x[5]; t[6] = x[6]; t[7] = x[7]; - btf_16_lane_0_1_neon(x[8], x[9], c4, &s8, &s9); - btf_16_lane_2_3_neon(x[10], x[11], c4, &s10, &s11); - btf_16_lane_1_0_neon(x[13], x[12], c4, &s13, &s12); - btf_16_lane_3_2_neon(x[15], x[14], c4, &s15, &s14); + btf_16_lane_0_1_neon(x[8], x[9], c0, &s8, &s9); + btf_16_lane_2_3_neon(x[10], x[11], c0, &s10, &s11); + btf_16_lane_1_0_neon(x[13], x[12], c0, &s13, &s12); + btf_16_lane_3_2_neon(x[15], x[14], c0, &s15, &s14); // Stage 5 x[0] = vqaddq_s16(t[0], t[4]); @@ -1172,14 +1127,14 @@ static INLINE void iadst16_low8_new_neon(int16x8_t *const in, int16x8_t *out, t[1] = x[1]; t[2] = x[2]; t[3] = x[3]; - btf_16_lane_2_3_neon(x[4], x[5], c, &s4, &s5); - btf_16_lane_3_2_neon(x[7], x[6], c, &s7, &s6); + btf_16_lane_2_3_neon(x[4], x[5], c1, &s4, &s5); + 
btf_16_lane_3_2_neon(x[7], x[6], c1, &s7, &s6); t[8] = x[8]; t[9] = x[9]; t[10] = x[10]; t[11] = x[11]; - btf_16_lane_2_3_neon(x[12], x[13], c, &s12, &s13); - btf_16_lane_3_2_neon(x[15], x[14], c, &s15, &s14); + btf_16_lane_2_3_neon(x[12], x[13], c1, &s12, &s13); + btf_16_lane_3_2_neon(x[15], x[14], c1, &s15, &s14); // Stage 7 x[0] = vqaddq_s16(t[0], t[2]); @@ -1200,60 +1155,58 @@ static INLINE void iadst16_low8_new_neon(int16x8_t *const in, int16x8_t *out, x[15] = vqsubq_s16(s13, s15); // Stage 8 - btf_16_half_neon(x + 2, c); - btf_16_half_neon(x + 6, c); - btf_16_half_neon(x + 10, c); - btf_16_half_neon(x + 14, c); + btf_16_half_neon(x + 2, c1); + btf_16_half_neon(x + 6, c1); + btf_16_half_neon(x + 10, c1); + btf_16_half_neon(x + 14, c1); // Stage 9 out[0] = x[0]; - out[1] = vnegq_s16(x[8]); + out[1] = vqnegq_s16(x[8]); out[2] = x[12]; - out[3] = vnegq_s16(x[4]); + out[3] = vqnegq_s16(x[4]); out[4] = x[6]; - out[5] = vnegq_s16(x[14]); + out[5] = vqnegq_s16(x[14]); out[6] = x[10]; - out[7] = vnegq_s16(x[2]); + out[7] = vqnegq_s16(x[2]); out[8] = x[3]; - out[9] = vnegq_s16(x[11]); + out[9] = vqnegq_s16(x[11]); out[10] = x[15]; - out[11] = vnegq_s16(x[7]); + out[11] = vqnegq_s16(x[7]); out[12] = x[5]; - out[13] = vnegq_s16(x[13]); + out[13] = vqnegq_s16(x[13]); out[14] = x[9]; - out[15] = vnegq_s16(x[1]); + out[15] = vqnegq_s16(x[1]); } -static INLINE void idct32_new_neon(int16x8_t *in, int16x8_t *out, - int8_t cos_bit, int bit) { +static INLINE void idct32_neon(int16x8_t *in, int16x8_t *out, int8_t cos_bit, + int bit) { (void)bit; const int32_t *cospi = cospi_arr(cos_bit); int16x8_t step1[32], step2[32]; - const int16x4_t c0 = - create_s16x4_neon((int16_t *)(cospi + 2), (int16_t *)(cospi + 62), - (int16_t *)(cospi + 34), (int16_t *)(cospi + 30)); - const int16x4_t c1 = - create_s16x4_neon((int16_t *)(cospi + 18), (int16_t *)(cospi + 46), - (int16_t *)(cospi + 50), (int16_t *)(cospi + 14)); - const int16x4_t c2 = - create_s16x4_neon((int16_t *)(cospi + 10), (int16_t 
*)(cospi + 54), - (int16_t *)(cospi + 42), (int16_t *)(cospi + 22)); - const int16x4_t c3 = - create_s16x4_neon((int16_t *)(cospi + 26), (int16_t *)(cospi + 38), - (int16_t *)(cospi + 58), (int16_t *)(cospi + 6)); - const int16x4_t c4 = - create_s16x4_neon((int16_t *)(cospi + 4), (int16_t *)(cospi + 60), - (int16_t *)(cospi + 36), (int16_t *)(cospi + 28)); - const int16x4_t c5 = - create_s16x4_neon((int16_t *)(cospi + 20), (int16_t *)(cospi + 44), - (int16_t *)(cospi + 52), (int16_t *)(cospi + 12)); - const int16x4_t c6 = - create_s16x4_neon((int16_t *)(cospi + 8), (int16_t *)(cospi + 56), - (int16_t *)(cospi + 40), (int16_t *)(cospi + 24)); - const int16x4_t c7 = - create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32), - (int16_t *)(cospi + 16), (int16_t *)(cospi + 48)); + const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[2], (int16_t)cospi[62], + (int16_t)cospi[34], (int16_t)cospi[30]); + const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[18], (int16_t)cospi[46], + (int16_t)cospi[50], (int16_t)cospi[14]); + const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[10], (int16_t)cospi[54], + (int16_t)cospi[42], (int16_t)cospi[22]); + const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[26], (int16_t)cospi[38], + (int16_t)cospi[58], (int16_t)cospi[6]); + const int16x4_t c4 = set_s16x4_neon((int16_t)cospi[4], (int16_t)cospi[60], + (int16_t)cospi[36], (int16_t)cospi[28]); + const int16x4_t c5 = set_s16x4_neon((int16_t)cospi[20], (int16_t)cospi[44], + (int16_t)cospi[52], (int16_t)cospi[12]); + const int16x4_t c6 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56], + (int16_t)cospi[40], (int16_t)cospi[24]); + const int16x4_t c7 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], + (int16_t)cospi[16], (int16_t)cospi[48]); + const int16x4_t c8 = + set_s16x4_neon((int16_t)(-cospi[8]), (int16_t)(-cospi[56]), + (int16_t)(-cospi[40]), (int16_t)(-cospi[24])); + const int16x4_t c9 = + set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]), + 
(int16_t)(-cospi[16]), (int16_t)(-cospi[48])); // stage 2 @@ -1321,11 +1274,9 @@ static INLINE void idct32_new_neon(int16x8_t *in, int16x8_t *out, btf_16_lane_0_1_neon(step1[4], step1[7], c6, &step2[7], &step2[4]); btf_16_lane_2_3_neon(step1[5], step1[6], c6, &step2[6], &step2[5]); btf_16_lane_0_1_neon(step1[30], step1[17], c6, &step2[30], &step2[17]); - btf_16_lane_1_0_neon(vnegq_s16(step1[18]), vnegq_s16(step1[29]), c6, - &step2[18], &step2[29]); + btf_16_lane_1_0_neon(step1[18], step1[29], c8, &step2[18], &step2[29]); btf_16_lane_2_3_neon(step1[26], step1[21], c6, &step2[26], &step2[21]); - btf_16_lane_3_2_neon(vnegq_s16(step1[22]), vnegq_s16(step1[25]), c6, - &step2[22], &step2[25]); + btf_16_lane_3_2_neon(step1[22], step1[25], c8, &step2[22], &step2[25]); step2[0] = step1[0]; step2[1] = step1[1]; @@ -1353,8 +1304,7 @@ static INLINE void idct32_new_neon(int16x8_t *in, int16x8_t *out, btf_16_lane_0_1_neon(step2[0], step2[1], c7, &step1[0], &step1[1]); btf_16_lane_2_3_neon(step2[2], step2[3], c7, &step1[3], &step1[2]); btf_16_lane_2_3_neon(step2[14], step2[9], c7, &step1[14], &step1[9]); - btf_16_lane_3_2_neon(vnegq_s16(step2[10]), vnegq_s16(step2[13]), c7, - &step1[10], &step1[13]); + btf_16_lane_3_2_neon(step2[10], step2[13], c9, &step1[10], &step1[13]); step1[4] = vqaddq_s16(step2[4], step2[5]); step1[5] = vqsubq_s16(step2[4], step2[5]); @@ -1386,10 +1336,8 @@ static INLINE void idct32_new_neon(int16x8_t *in, int16x8_t *out, btf_16_lane_0_1_neon(step1[6], step1[5], c7, &step2[6], &step2[5]); btf_16_lane_2_3_neon(step1[29], step1[18], c7, &step2[29], &step2[18]); btf_16_lane_2_3_neon(step1[28], step1[19], c7, &step2[28], &step2[19]); - btf_16_lane_3_2_neon(vnegq_s16(step1[20]), vnegq_s16(step1[27]), c7, - &step2[20], &step2[27]); - btf_16_lane_3_2_neon(vnegq_s16(step1[21]), vnegq_s16(step1[26]), c7, - &step2[21], &step2[26]); + btf_16_lane_3_2_neon(step1[20], step1[27], c9, &step2[20], &step2[27]); + btf_16_lane_3_2_neon(step1[21], step1[26], c9, &step2[21], 
&step2[26]); step2[0] = vqaddq_s16(step1[0], step1[3]); step2[1] = vqaddq_s16(step1[1], step1[2]); @@ -1516,8 +1464,8 @@ static INLINE void idct32_new_neon(int16x8_t *in, int16x8_t *out, out[31] = vqsubq_s16(step2[0], step2[31]); } -static INLINE void idct32_low1_new_neon(int16x8_t *in, int16x8_t *out, - int8_t cos_bit, int bit) { +static INLINE void idct32_low1_neon(int16x8_t *in, int16x8_t *out, + int8_t cos_bit, int bit) { (void)bit; const int32_t *cospi = cospi_arr(cos_bit); int16x8_t step1; @@ -1573,19 +1521,22 @@ static INLINE void idct32_low1_new_neon(int16x8_t *in, int16x8_t *out, out[31] = step1; } -static INLINE void idct32_low8_new_neon(int16x8_t *in, int16x8_t *out, - int8_t cos_bit, int bit) { +static INLINE void idct32_low8_neon(int16x8_t *in, int16x8_t *out, + int8_t cos_bit, int bit) { (void)bit; const int32_t *cospi = cospi_arr(cos_bit); int16x8_t step1[32], step2[32]; int32x4_t t32[16]; - const int16x4_t c0 = - create_s16x4_neon((int16_t *)(cospi + 8), (int16_t *)(cospi + 56), - (int16_t *)(cospi + 40), (int16_t *)(cospi + 24)); - const int16x4_t c1 = - create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32), - (int16_t *)(cospi + 16), (int16_t *)(cospi + 48)); - + const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56], + (int16_t)cospi[40], (int16_t)cospi[24]); + const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], + (int16_t)cospi[16], cospi[48]); + const int16x4_t c2 = + set_s16x4_neon((int16_t)(-cospi[8]), (int16_t)(-cospi[56]), + (int16_t)(-cospi[40]), (int16_t)(-cospi[24])); + const int16x4_t c3 = + set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]), + (int16_t)(-cospi[16]), (int16_t)(-cospi[48])); // stage 1 // stage 2 @@ -1627,11 +1578,9 @@ static INLINE void idct32_low8_new_neon(int16x8_t *in, int16x8_t *out, btf_16_neon(step1[4], cospi[56], cospi[8], &step2[4], &step2[7]); btf_16_lane_0_1_neon(step1[30], step1[17], c0, &step2[30], &step2[17]); - 
btf_16_lane_1_0_neon(vnegq_s16(step1[18]), vnegq_s16(step1[29]), c0, - &step2[18], &step2[29]); + btf_16_lane_1_0_neon(step1[18], step1[29], c2, &step2[18], &step2[29]); btf_16_lane_2_3_neon(step1[26], step1[21], c0, &step2[26], &step2[21]); - btf_16_lane_3_2_neon(vnegq_s16(step1[22]), vnegq_s16(step1[25]), c0, - &step2[22], &step2[25]); + btf_16_lane_3_2_neon(step1[22], step1[25], c2, &step2[22], &step2[25]); step2[0] = step1[0]; step2[8] = step1[8]; @@ -1659,8 +1608,7 @@ static INLINE void idct32_low8_new_neon(int16x8_t *in, int16x8_t *out, vrshrn_n_s32(t32[1], INV_COS_BIT)); btf_16_lane_2_3_neon(step2[14], step2[9], c1, &step1[14], &step1[9]); - btf_16_lane_3_2_neon(vnegq_s16(step2[10]), vnegq_s16(step2[13]), c1, - &step1[10], &step1[13]); + btf_16_lane_3_2_neon(step2[10], step2[13], c3, &step1[10], &step1[13]); step1[4] = step2[4]; step1[5] = step2[4]; @@ -1692,10 +1640,8 @@ static INLINE void idct32_low8_new_neon(int16x8_t *in, int16x8_t *out, btf_16_lane_0_1_neon(step1[6], step1[5], c1, &step2[6], &step2[5]); btf_16_lane_2_3_neon(step1[29], step1[18], c1, &step2[29], &step2[18]); btf_16_lane_2_3_neon(step1[28], step1[19], c1, &step2[28], &step2[19]); - btf_16_lane_3_2_neon(vnegq_s16(step1[20]), vnegq_s16(step1[27]), c1, - &step2[20], &step2[27]); - btf_16_lane_3_2_neon(vnegq_s16(step1[21]), vnegq_s16(step1[26]), c1, - &step2[21], &step2[26]); + btf_16_lane_3_2_neon(step1[20], step1[27], c3, &step2[20], &step2[27]); + btf_16_lane_3_2_neon(step1[21], step1[26], c3, &step2[21], &step2[26]); step2[0] = step1[0]; step2[1] = step1[0]; @@ -1822,18 +1768,22 @@ static INLINE void idct32_low8_new_neon(int16x8_t *in, int16x8_t *out, out[31] = vqsubq_s16(step2[0], step2[31]); } -static INLINE void idct32_low16_new_neon(int16x8_t *in, int16x8_t *out, - int8_t cos_bit, int bit) { +static INLINE void idct32_low16_neon(int16x8_t *in, int16x8_t *out, + int8_t cos_bit, int bit) { (void)bit; const int32_t *cospi = cospi_arr(cos_bit); int16x8_t step1[32], step2[32]; int32x4_t 
t32[16]; - const int16x4_t c0 = - create_s16x4_neon((int16_t *)(cospi + 8), (int16_t *)(cospi + 56), - (int16_t *)(cospi + 40), (int16_t *)(cospi + 24)); - const int16x4_t c1 = - create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32), - (int16_t *)(cospi + 16), (int16_t *)(cospi + 48)); + const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56], + (int16_t)cospi[40], (int16_t)cospi[24]); + const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], + (int16_t)cospi[16], (int16_t)cospi[48]); + const int16x4_t c2 = + set_s16x4_neon((int16_t)(-cospi[8]), (int16_t)(-cospi[56]), + (int16_t)(-cospi[40]), (int16_t)(-cospi[24])); + const int16x4_t c3 = + set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]), + (int16_t)(-cospi[16]), (int16_t)(-cospi[48])); // stage 1 // stage 2 @@ -1889,11 +1839,9 @@ static INLINE void idct32_low16_new_neon(int16x8_t *in, int16x8_t *out, btf_16_neon(step1[4], cospi[56], cospi[8], &step2[4], &step2[7]); btf_16_neon(step1[6], -cospi[40], cospi[24], &step2[5], &step2[6]); btf_16_lane_0_1_neon(step1[30], step1[17], c0, &step2[30], &step2[17]); - btf_16_lane_1_0_neon(vnegq_s16(step1[18]), vnegq_s16(step1[29]), c0, - &step2[18], &step2[29]); + btf_16_lane_1_0_neon(step1[18], step1[29], c2, &step2[18], &step2[29]); btf_16_lane_2_3_neon(step1[26], step1[21], c0, &step2[26], &step2[21]); - btf_16_lane_3_2_neon(vnegq_s16(step1[22]), vnegq_s16(step1[25]), c0, - &step2[22], &step2[25]); + btf_16_lane_3_2_neon(step1[22], step1[25], c2, &step2[22], &step2[25]); step2[0] = step1[0]; step2[2] = step1[2]; @@ -1924,8 +1872,7 @@ static INLINE void idct32_low16_new_neon(int16x8_t *in, int16x8_t *out, btf_16_neon(step2[2], cospi[48], cospi[16], &step1[2], &step1[3]); btf_16_lane_2_3_neon(step2[14], step2[9], c1, &step1[14], &step1[9]); - btf_16_lane_3_2_neon(vnegq_s16(step2[10]), vnegq_s16(step2[13]), c1, - &step1[10], &step1[13]); + btf_16_lane_3_2_neon(step2[10], step2[13], c3, &step1[10], &step1[13]); 
step1[4] = vqaddq_s16(step2[4], step2[5]); step1[5] = vqsubq_s16(step2[4], step2[5]); @@ -1957,10 +1904,8 @@ static INLINE void idct32_low16_new_neon(int16x8_t *in, int16x8_t *out, btf_16_lane_0_1_neon(step1[6], step1[5], c1, &step2[6], &step2[5]); btf_16_lane_2_3_neon(step1[29], step1[18], c1, &step2[29], &step2[18]); btf_16_lane_2_3_neon(step1[28], step1[19], c1, &step2[28], &step2[19]); - btf_16_lane_3_2_neon(vnegq_s16(step1[20]), vnegq_s16(step1[27]), c1, - &step2[20], &step2[27]); - btf_16_lane_3_2_neon(vnegq_s16(step1[21]), vnegq_s16(step1[26]), c1, - &step2[21], &step2[26]); + btf_16_lane_3_2_neon(step1[20], step1[27], c3, &step2[20], &step2[27]); + btf_16_lane_3_2_neon(step1[21], step1[26], c3, &step2[21], &step2[26]); step2[0] = vqaddq_s16(step1[0], step1[3]); step2[1] = vqaddq_s16(step1[0], step1[2]); @@ -2086,33 +2031,1542 @@ static INLINE void idct32_low16_new_neon(int16x8_t *in, int16x8_t *out, out[30] = vqsubq_s16(step2[1], step2[30]); out[31] = vqsubq_s16(step2[0], step2[31]); } +static INLINE void idct64_stage9_neon(int16x8_t *step2, int16x8_t *step1, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], + (int16_t)cospi[16], (int16_t)cospi[48]); + + btf_16_lane_0_1_neon(step2[27], step2[20], c3, &step1[27], &step1[20]); + btf_16_lane_0_1_neon(step2[26], step2[21], c3, &step1[26], &step1[21]); + btf_16_lane_0_1_neon(step2[25], step2[22], c3, &step1[25], &step1[22]); + btf_16_lane_0_1_neon(step2[24], step2[23], c3, &step1[24], &step1[23]); + + step1[0] = vqaddq_s16(step2[0], step2[15]); + step1[1] = vqaddq_s16(step2[1], step2[14]); + step1[2] = vqaddq_s16(step2[2], step2[13]); + step1[3] = vqaddq_s16(step2[3], step2[12]); + step1[4] = vqaddq_s16(step2[4], step2[11]); + step1[5] = vqaddq_s16(step2[5], step2[10]); + step1[6] = vqaddq_s16(step2[6], step2[9]); + step1[7] = vqaddq_s16(step2[7], step2[8]); + step1[8] = vqsubq_s16(step2[7], step2[8]); + step1[9] = 
vqsubq_s16(step2[6], step2[9]); + step1[10] = vqsubq_s16(step2[5], step2[10]); + step1[11] = vqsubq_s16(step2[4], step2[11]); + step1[12] = vqsubq_s16(step2[3], step2[12]); + step1[13] = vqsubq_s16(step2[2], step2[13]); + step1[14] = vqsubq_s16(step2[1], step2[14]); + step1[15] = vqsubq_s16(step2[0], step2[15]); + step1[16] = step2[16]; + step1[17] = step2[17]; + step1[18] = step2[18]; + step1[19] = step2[19]; + step1[28] = step2[28]; + step1[29] = step2[29]; + step1[30] = step2[30]; + step1[31] = step2[31]; + step1[32] = vqaddq_s16(step2[32], step2[47]); + step1[33] = vqaddq_s16(step2[33], step2[46]); + step1[34] = vqaddq_s16(step2[34], step2[45]); + step1[35] = vqaddq_s16(step2[35], step2[44]); + step1[36] = vqaddq_s16(step2[36], step2[43]); + step1[37] = vqaddq_s16(step2[37], step2[42]); + step1[38] = vqaddq_s16(step2[38], step2[41]); + step1[39] = vqaddq_s16(step2[39], step2[40]); + step1[40] = vqsubq_s16(step2[39], step2[40]); + step1[41] = vqsubq_s16(step2[38], step2[41]); + step1[42] = vqsubq_s16(step2[37], step2[42]); + step1[43] = vqsubq_s16(step2[36], step2[43]); + step1[44] = vqsubq_s16(step2[35], step2[44]); + step1[45] = vqsubq_s16(step2[34], step2[45]); + step1[46] = vqsubq_s16(step2[33], step2[46]); + step1[47] = vqsubq_s16(step2[32], step2[47]); + step1[48] = vqsubq_s16(step2[63], step2[48]); + step1[49] = vqsubq_s16(step2[62], step2[49]); + step1[50] = vqsubq_s16(step2[61], step2[50]); + step1[51] = vqsubq_s16(step2[60], step2[51]); + step1[52] = vqsubq_s16(step2[59], step2[52]); + step1[53] = vqsubq_s16(step2[58], step2[53]); + step1[54] = vqsubq_s16(step2[57], step2[54]); + step1[55] = vqsubq_s16(step2[56], step2[55]); + step1[56] = vqaddq_s16(step2[56], step2[55]); + step1[57] = vqaddq_s16(step2[57], step2[54]); + step1[58] = vqaddq_s16(step2[58], step2[53]); + step1[59] = vqaddq_s16(step2[59], step2[52]); + step1[60] = vqaddq_s16(step2[60], step2[51]); + step1[61] = vqaddq_s16(step2[61], step2[50]); + step1[62] = vqaddq_s16(step2[62], 
step2[49]); + step1[63] = vqaddq_s16(step2[63], step2[48]); +} + +static INLINE void idct64_stage10_neon(int16x8_t *step1, int16x8_t *step2, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], + (int16_t)cospi[16], (int16_t)cospi[48]); + + btf_16_lane_0_1_neon(step1[55], step1[40], c3, &step2[55], &step2[40]); + btf_16_lane_0_1_neon(step1[54], step1[41], c3, &step2[54], &step2[41]); + btf_16_lane_0_1_neon(step1[53], step1[42], c3, &step2[53], &step2[42]); + btf_16_lane_0_1_neon(step1[52], step1[43], c3, &step2[52], &step2[43]); + btf_16_lane_0_1_neon(step1[51], step1[44], c3, &step2[51], &step2[44]); + btf_16_lane_0_1_neon(step1[50], step1[45], c3, &step2[50], &step2[45]); + btf_16_lane_0_1_neon(step1[49], step1[46], c3, &step2[49], &step2[46]); + btf_16_lane_0_1_neon(step1[48], step1[47], c3, &step2[48], &step2[47]); + + step2[0] = vqaddq_s16(step1[0], step1[31]); + step2[1] = vqaddq_s16(step1[1], step1[30]); + step2[2] = vqaddq_s16(step1[2], step1[29]); + step2[3] = vqaddq_s16(step1[3], step1[28]); + step2[4] = vqaddq_s16(step1[4], step1[27]); + step2[5] = vqaddq_s16(step1[5], step1[26]); + step2[6] = vqaddq_s16(step1[6], step1[25]); + step2[7] = vqaddq_s16(step1[7], step1[24]); + step2[8] = vqaddq_s16(step1[8], step1[23]); + step2[9] = vqaddq_s16(step1[9], step1[22]); + step2[10] = vqaddq_s16(step1[10], step1[21]); + step2[11] = vqaddq_s16(step1[11], step1[20]); + step2[12] = vqaddq_s16(step1[12], step1[19]); + step2[13] = vqaddq_s16(step1[13], step1[18]); + step2[14] = vqaddq_s16(step1[14], step1[17]); + step2[15] = vqaddq_s16(step1[15], step1[16]); + step2[16] = vqsubq_s16(step1[15], step1[16]); + step2[17] = vqsubq_s16(step1[14], step1[17]); + step2[18] = vqsubq_s16(step1[13], step1[18]); + step2[19] = vqsubq_s16(step1[12], step1[19]); + step2[20] = vqsubq_s16(step1[11], step1[20]); + step2[21] = vqsubq_s16(step1[10], step1[21]); + step2[22] = vqsubq_s16(step1[9], 
step1[22]); + step2[23] = vqsubq_s16(step1[8], step1[23]); + step2[24] = vqsubq_s16(step1[7], step1[24]); + step2[25] = vqsubq_s16(step1[6], step1[25]); + step2[26] = vqsubq_s16(step1[5], step1[26]); + step2[27] = vqsubq_s16(step1[4], step1[27]); + step2[28] = vqsubq_s16(step1[3], step1[28]); + step2[29] = vqsubq_s16(step1[2], step1[29]); + step2[30] = vqsubq_s16(step1[1], step1[30]); + step2[31] = vqsubq_s16(step1[0], step1[31]); + step2[32] = step1[32]; + step2[33] = step1[33]; + step2[34] = step1[34]; + step2[35] = step1[35]; + step2[36] = step1[36]; + step2[37] = step1[37]; + step2[38] = step1[38]; + step2[39] = step1[39]; + step2[56] = step1[56]; + step2[57] = step1[57]; + step2[58] = step1[58]; + step2[59] = step1[59]; + step2[60] = step1[60]; + step2[61] = step1[61]; + step2[62] = step1[62]; + step2[63] = step1[63]; +} + +static INLINE void idct64_low32_neon(int16x8_t *in, int16x8_t *out, + int8_t cos_bit, int bit) { + (void)bit; + const int32_t *cospi = cospi_arr(cos_bit); + int16x8_t step2[64], step1[64]; + const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[4], (int16_t)cospi[60], + (int16_t)cospi[36], (int16_t)cospi[28]); + const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[20], (int16_t)cospi[44], + (int16_t)cospi[52], (int16_t)cospi[12]); + const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56], + (int16_t)cospi[40], (int16_t)cospi[24]); + const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], + (int16_t)cospi[16], (int16_t)cospi[48]); + const int16x4_t c4 = + set_s16x4_neon((int16_t)(-cospi[4]), (int16_t)(-cospi[60]), + (int16_t)(-cospi[36]), (int16_t)(-cospi[28])); + const int16x4_t c5 = + set_s16x4_neon((int16_t)(-cospi[20]), (int16_t)(-cospi[44]), + (int16_t)(-cospi[52]), (int16_t)(-cospi[12])); + const int16x4_t c6 = + set_s16x4_neon((int16_t)(-cospi[8]), (int16_t)(-cospi[56]), + (int16_t)(-cospi[40]), (int16_t)(-cospi[24])); + const int16x4_t c7 = + set_s16x4_neon((int16_t)(-cospi[32]), 
(int16_t)(-cospi[32]), + (int16_t)(-cospi[16]), (int16_t)(-cospi[48])); + + // stage 1 + // stage 2 + + step2[0] = in[0]; + step2[2] = in[16]; + step2[4] = in[8]; + step2[6] = in[24]; + step2[8] = in[4]; + step2[10] = in[20]; + step2[12] = in[12]; + step2[14] = in[28]; + step2[16] = in[2]; + step2[18] = in[18]; + step2[20] = in[10]; + step2[22] = in[26]; + step2[24] = in[6]; + step2[26] = in[22]; + step2[28] = in[14]; + step2[30] = in[30]; + + btf_16_neon(in[1], cospi[63], cospi[1], &step2[32], &step2[63]); + btf_16_neon(in[31], -cospi[33], cospi[31], &step2[33], &step2[62]); + btf_16_neon(in[17], cospi[47], cospi[17], &step2[34], &step2[61]); + btf_16_neon(in[15], -cospi[49], cospi[15], &step2[35], &step2[60]); + btf_16_neon(in[9], cospi[55], cospi[9], &step2[36], &step2[59]); + btf_16_neon(in[23], -cospi[41], cospi[23], &step2[37], &step2[58]); + btf_16_neon(in[25], cospi[39], cospi[25], &step2[38], &step2[57]); + btf_16_neon(in[7], -cospi[57], cospi[7], &step2[39], &step2[56]); + btf_16_neon(in[5], cospi[59], cospi[5], &step2[40], &step2[55]); + btf_16_neon(in[27], -cospi[37], cospi[27], &step2[41], &step2[54]); + btf_16_neon(in[21], cospi[43], cospi[21], &step2[42], &step2[53]); + btf_16_neon(in[11], -cospi[53], cospi[11], &step2[43], &step2[52]); + btf_16_neon(in[13], cospi[51], cospi[13], &step2[44], &step2[51]); + btf_16_neon(in[19], -cospi[45], cospi[19], &step2[45], &step2[50]); + btf_16_neon(in[29], cospi[35], cospi[29], &step2[46], &step2[49]); + btf_16_neon(in[3], -cospi[61], cospi[3], &step2[47], &step2[48]); + + // stage 3 + + step1[0] = step2[0]; + step1[2] = step2[2]; + step1[4] = step2[4]; + step1[6] = step2[6]; + step1[8] = step2[8]; + step1[10] = step2[10]; + step1[12] = step2[12]; + step1[14] = step2[14]; + + btf_16_neon(step2[16], cospi[62], cospi[2], &step1[16], &step1[31]); + btf_16_neon(step2[30], -cospi[34], cospi[30], &step1[17], &step1[30]); + btf_16_neon(step2[18], cospi[46], cospi[18], &step1[18], &step1[29]); + btf_16_neon(step2[28], 
-cospi[50], cospi[14], &step1[19], &step1[28]); + btf_16_neon(step2[20], cospi[54], cospi[10], &step1[20], &step1[27]); + btf_16_neon(step2[26], -cospi[42], cospi[22], &step1[21], &step1[26]); + btf_16_neon(step2[22], cospi[38], cospi[26], &step1[22], &step1[25]); + btf_16_neon(step2[24], -cospi[58], cospi[6], &step1[23], &step1[24]); + + step1[32] = vqaddq_s16(step2[32], step2[33]); + step1[33] = vqsubq_s16(step2[32], step2[33]); + step1[34] = vqsubq_s16(step2[35], step2[34]); + step1[35] = vqaddq_s16(step2[35], step2[34]); + step1[36] = vqaddq_s16(step2[36], step2[37]); + step1[37] = vqsubq_s16(step2[36], step2[37]); + step1[38] = vqsubq_s16(step2[39], step2[38]); + step1[39] = vqaddq_s16(step2[39], step2[38]); + step1[40] = vqaddq_s16(step2[40], step2[41]); + step1[41] = vqsubq_s16(step2[40], step2[41]); + step1[42] = vqsubq_s16(step2[43], step2[42]); + step1[43] = vqaddq_s16(step2[43], step2[42]); + step1[44] = vqaddq_s16(step2[44], step2[45]); + step1[45] = vqsubq_s16(step2[44], step2[45]); + step1[46] = vqsubq_s16(step2[47], step2[46]); + step1[47] = vqaddq_s16(step2[47], step2[46]); + step1[48] = vqaddq_s16(step2[48], step2[49]); + step1[49] = vqsubq_s16(step2[48], step2[49]); + step1[50] = vqsubq_s16(step2[51], step2[50]); + step1[51] = vqaddq_s16(step2[51], step2[50]); + step1[52] = vqaddq_s16(step2[52], step2[53]); + step1[53] = vqsubq_s16(step2[52], step2[53]); + step1[54] = vqsubq_s16(step2[55], step2[54]); + step1[55] = vqaddq_s16(step2[55], step2[54]); + step1[56] = vqaddq_s16(step2[56], step2[57]); + step1[57] = vqsubq_s16(step2[56], step2[57]); + step1[58] = vqsubq_s16(step2[59], step2[58]); + step1[59] = vqaddq_s16(step2[59], step2[58]); + step1[60] = vqaddq_s16(step2[60], step2[61]); + step1[61] = vqsubq_s16(step2[60], step2[61]); + step1[62] = vqsubq_s16(step2[63], step2[62]); + step1[63] = vqaddq_s16(step2[63], step2[62]); + + // stage 4 + + step2[0] = step1[0]; + step2[2] = step1[2]; + step2[4] = step1[4]; + step2[6] = step1[6]; + + 
btf_16_neon(step1[8], cospi[60], cospi[4], &step2[8], &step2[15]); + btf_16_neon(step1[14], -cospi[36], cospi[28], &step2[9], &step2[14]); + btf_16_neon(step1[10], cospi[44], cospi[20], &step2[10], &step2[13]); + btf_16_neon(step1[12], -cospi[52], cospi[12], &step2[11], &step2[12]); + btf_16_lane_0_1_neon(step1[62], step1[33], c0, &step2[62], &step2[33]); + btf_16_lane_1_0_neon(step1[34], step1[61], c4, &step2[34], &step2[61]); + btf_16_lane_2_3_neon(step1[58], step1[37], c0, &step2[58], &step2[37]); + btf_16_lane_3_2_neon(step1[38], step1[57], c4, &step2[38], &step2[57]); + btf_16_lane_0_1_neon(step1[54], step1[41], c1, &step2[54], &step2[41]); + btf_16_lane_1_0_neon(step1[42], step1[53], c5, &step2[42], &step2[53]); + btf_16_lane_2_3_neon(step1[50], step1[45], c1, &step2[50], &step2[45]); + btf_16_lane_3_2_neon(step1[46], step1[49], c5, &step2[46], &step2[49]); + + step2[16] = vqaddq_s16(step1[16], step1[17]); + step2[17] = vqsubq_s16(step1[16], step1[17]); + step2[18] = vqsubq_s16(step1[19], step1[18]); + step2[19] = vqaddq_s16(step1[19], step1[18]); + step2[20] = vqaddq_s16(step1[20], step1[21]); + step2[21] = vqsubq_s16(step1[20], step1[21]); + step2[22] = vqsubq_s16(step1[23], step1[22]); + step2[23] = vqaddq_s16(step1[23], step1[22]); + step2[24] = vqaddq_s16(step1[24], step1[25]); + step2[25] = vqsubq_s16(step1[24], step1[25]); + step2[26] = vqsubq_s16(step1[27], step1[26]); + step2[27] = vqaddq_s16(step1[27], step1[26]); + step2[28] = vqaddq_s16(step1[28], step1[29]); + step2[29] = vqsubq_s16(step1[28], step1[29]); + step2[30] = vqsubq_s16(step1[31], step1[30]); + step2[31] = vqaddq_s16(step1[31], step1[30]); + step2[32] = step1[32]; + step2[35] = step1[35]; + step2[36] = step1[36]; + step2[39] = step1[39]; + step2[40] = step1[40]; + step2[43] = step1[43]; + step2[44] = step1[44]; + step2[47] = step1[47]; + step2[48] = step1[48]; + step2[51] = step1[51]; + step2[52] = step1[52]; + step2[55] = step1[55]; + step2[56] = step1[56]; + step2[59] = step1[59]; + 
step2[60] = step1[60]; + step2[63] = step1[63]; + + // stage 5 + + step1[0] = step2[0]; + step1[2] = step2[2]; + + btf_16_neon(step2[4], cospi[56], cospi[8], &step1[4], &step1[7]); + btf_16_neon(step2[6], -cospi[40], cospi[24], &step1[5], &step1[6]); + btf_16_lane_0_1_neon(step2[30], step2[17], c2, &step1[30], &step1[17]); + btf_16_lane_1_0_neon(step2[18], step2[29], c6, &step1[18], &step1[29]); + btf_16_lane_2_3_neon(step2[26], step2[21], c2, &step1[26], &step1[21]); + btf_16_lane_3_2_neon(step2[22], step2[25], c6, &step1[22], &step1[25]); + + step1[8] = vqaddq_s16(step2[8], step2[9]); + step1[9] = vqsubq_s16(step2[8], step2[9]); + step1[10] = vqsubq_s16(step2[11], step2[10]); + step1[11] = vqaddq_s16(step2[11], step2[10]); + step1[12] = vqaddq_s16(step2[12], step2[13]); + step1[13] = vqsubq_s16(step2[12], step2[13]); + step1[14] = vqsubq_s16(step2[15], step2[14]); + step1[15] = vqaddq_s16(step2[15], step2[14]); + step1[16] = step2[16]; + step1[19] = step2[19]; + step1[20] = step2[20]; + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[27] = step2[27]; + step1[28] = step2[28]; + step1[31] = step2[31]; + step1[32] = vqaddq_s16(step2[32], step2[35]); + step1[33] = vqaddq_s16(step2[33], step2[34]); + step1[34] = vqsubq_s16(step2[33], step2[34]); + step1[35] = vqsubq_s16(step2[32], step2[35]); + step1[36] = vqsubq_s16(step2[39], step2[36]); + step1[37] = vqsubq_s16(step2[38], step2[37]); + step1[38] = vqaddq_s16(step2[38], step2[37]); + step1[39] = vqaddq_s16(step2[39], step2[36]); + step1[40] = vqaddq_s16(step2[40], step2[43]); + step1[41] = vqaddq_s16(step2[41], step2[42]); + step1[42] = vqsubq_s16(step2[41], step2[42]); + step1[43] = vqsubq_s16(step2[40], step2[43]); + step1[44] = vqsubq_s16(step2[47], step2[44]); + step1[45] = vqsubq_s16(step2[46], step2[45]); + step1[46] = vqaddq_s16(step2[46], step2[45]); + step1[47] = vqaddq_s16(step2[47], step2[44]); + step1[48] = vqaddq_s16(step2[48], step2[51]); + step1[49] = vqaddq_s16(step2[49], step2[50]); + 
step1[50] = vqsubq_s16(step2[49], step2[50]); + step1[51] = vqsubq_s16(step2[48], step2[51]); + step1[52] = vqsubq_s16(step2[55], step2[52]); + step1[53] = vqsubq_s16(step2[54], step2[53]); + step1[54] = vqaddq_s16(step2[54], step2[53]); + step1[55] = vqaddq_s16(step2[55], step2[52]); + step1[56] = vqaddq_s16(step2[56], step2[59]); + step1[57] = vqaddq_s16(step2[57], step2[58]); + step1[58] = vqsubq_s16(step2[57], step2[58]); + step1[59] = vqsubq_s16(step2[56], step2[59]); + step1[60] = vqsubq_s16(step2[63], step2[60]); + step1[61] = vqsubq_s16(step2[62], step2[61]); + step1[62] = vqaddq_s16(step2[62], step2[61]); + step1[63] = vqaddq_s16(step2[63], step2[60]); + + // stage 6 + + btf_16_neon(step1[0], cospi[32], cospi[32], &step2[0], &step2[1]); + btf_16_neon(step1[2], cospi[48], cospi[16], &step2[2], &step2[3]); + btf_16_lane_2_3_neon(step1[14], step1[9], c3, &step2[14], &step2[9]); + btf_16_lane_3_2_neon(step1[10], step1[13], c7, &step2[10], &step2[13]); + btf_16_lane_0_1_neon(step1[61], step1[34], c2, &step2[61], &step2[34]); + btf_16_lane_0_1_neon(step1[60], step1[35], c2, &step2[60], &step2[35]); + btf_16_lane_1_0_neon(step1[36], step1[59], c6, &step2[36], &step2[59]); + btf_16_lane_1_0_neon(step1[37], step1[58], c6, &step2[37], &step2[58]); + btf_16_lane_2_3_neon(step1[53], step1[42], c2, &step2[53], &step2[42]); + btf_16_lane_2_3_neon(step1[52], step1[43], c2, &step2[52], &step2[43]); + btf_16_lane_3_2_neon(step1[44], step1[51], c6, &step2[44], &step2[51]); + btf_16_lane_3_2_neon(step1[45], step1[50], c6, &step2[45], &step2[50]); + + step2[4] = vqaddq_s16(step1[4], step1[5]); + step2[5] = vqsubq_s16(step1[4], step1[5]); + step2[6] = vqsubq_s16(step1[7], step1[6]); + step2[7] = vqaddq_s16(step1[7], step1[6]); + step2[8] = step1[8]; + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; + step2[16] = vqaddq_s16(step1[16], step1[19]); + step2[17] = vqaddq_s16(step1[17], step1[18]); + step2[18] = vqsubq_s16(step1[17], step1[18]); + step2[19] 
= vqsubq_s16(step1[16], step1[19]); + step2[20] = vqsubq_s16(step1[23], step1[20]); + step2[21] = vqsubq_s16(step1[22], step1[21]); + step2[22] = vqaddq_s16(step1[22], step1[21]); + step2[23] = vqaddq_s16(step1[23], step1[20]); + step2[24] = vqaddq_s16(step1[24], step1[27]); + step2[25] = vqaddq_s16(step1[25], step1[26]); + step2[26] = vqsubq_s16(step1[25], step1[26]); + step2[27] = vqsubq_s16(step1[24], step1[27]); + step2[28] = vqsubq_s16(step1[31], step1[28]); + step2[29] = vqsubq_s16(step1[30], step1[29]); + step2[30] = vqaddq_s16(step1[30], step1[29]); + step2[31] = vqaddq_s16(step1[31], step1[28]); + step2[32] = step1[32]; + step2[33] = step1[33]; + step2[38] = step1[38]; + step2[39] = step1[39]; + step2[40] = step1[40]; + step2[41] = step1[41]; + step2[46] = step1[46]; + step2[47] = step1[47]; + step2[48] = step1[48]; + step2[49] = step1[49]; + step2[54] = step1[54]; + step2[55] = step1[55]; + step2[56] = step1[56]; + step2[57] = step1[57]; + step2[62] = step1[62]; + step2[63] = step1[63]; + + // stage 7 + + btf_16_lane_0_1_neon(step2[6], step2[5], c3, &step1[6], &step1[5]); + btf_16_lane_2_3_neon(step2[29], step2[18], c3, &step1[29], &step1[18]); + btf_16_lane_2_3_neon(step2[28], step2[19], c3, &step1[28], &step1[19]); + btf_16_lane_3_2_neon(step2[20], step2[27], c7, &step1[20], &step1[27]); + btf_16_lane_3_2_neon(step2[21], step2[26], c7, &step1[21], &step1[26]); + + step1[0] = vqaddq_s16(step2[0], step2[3]); + step1[1] = vqaddq_s16(step2[1], step2[2]); + step1[2] = vqsubq_s16(step2[1], step2[2]); + step1[3] = vqsubq_s16(step2[0], step2[3]); + step1[4] = step2[4]; + step1[7] = step2[7]; + step1[8] = vqaddq_s16(step2[8], step2[11]); + step1[9] = vqaddq_s16(step2[9], step2[10]); + step1[10] = vqsubq_s16(step2[9], step2[10]); + step1[11] = vqsubq_s16(step2[8], step2[11]); + step1[12] = vqsubq_s16(step2[15], step2[12]); + step1[13] = vqsubq_s16(step2[14], step2[13]); + step1[14] = vqaddq_s16(step2[14], step2[13]); + step1[15] = vqaddq_s16(step2[15], 
step2[12]); + step1[16] = step2[16]; + step1[17] = step2[17]; + step1[22] = step2[22]; + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[25] = step2[25]; + step1[30] = step2[30]; + step1[31] = step2[31]; + step1[32] = vqaddq_s16(step2[32], step2[39]); + step1[33] = vqaddq_s16(step2[33], step2[38]); + step1[34] = vqaddq_s16(step2[34], step2[37]); + step1[35] = vqaddq_s16(step2[35], step2[36]); + step1[36] = vqsubq_s16(step2[35], step2[36]); + step1[37] = vqsubq_s16(step2[34], step2[37]); + step1[38] = vqsubq_s16(step2[33], step2[38]); + step1[39] = vqsubq_s16(step2[32], step2[39]); + step1[40] = vqsubq_s16(step2[47], step2[40]); + step1[41] = vqsubq_s16(step2[46], step2[41]); + step1[42] = vqsubq_s16(step2[45], step2[42]); + step1[43] = vqsubq_s16(step2[44], step2[43]); + step1[44] = vqaddq_s16(step2[43], step2[44]); + step1[45] = vqaddq_s16(step2[42], step2[45]); + step1[46] = vqaddq_s16(step2[41], step2[46]); + step1[47] = vqaddq_s16(step2[40], step2[47]); + step1[48] = vqaddq_s16(step2[48], step2[55]); + step1[49] = vqaddq_s16(step2[49], step2[54]); + step1[50] = vqaddq_s16(step2[50], step2[53]); + step1[51] = vqaddq_s16(step2[51], step2[52]); + step1[52] = vqsubq_s16(step2[51], step2[52]); + step1[53] = vqsubq_s16(step2[50], step2[53]); + step1[54] = vqsubq_s16(step2[49], step2[54]); + step1[55] = vqsubq_s16(step2[48], step2[55]); + step1[56] = vqsubq_s16(step2[63], step2[56]); + step1[57] = vqsubq_s16(step2[62], step2[57]); + step1[58] = vqsubq_s16(step2[61], step2[58]); + step1[59] = vqsubq_s16(step2[60], step2[59]); + step1[60] = vqaddq_s16(step2[59], step2[60]); + step1[61] = vqaddq_s16(step2[58], step2[61]); + step1[62] = vqaddq_s16(step2[57], step2[62]); + step1[63] = vqaddq_s16(step2[56], step2[63]); + + // stage 8 + + btf_16_lane_0_1_neon(step1[13], step1[10], c3, &step2[13], &step2[10]); + btf_16_lane_0_1_neon(step1[12], step1[11], c3, &step2[12], &step2[11]); + btf_16_lane_2_3_neon(step1[59], step1[36], c3, &step2[59], &step2[36]); + 
btf_16_lane_2_3_neon(step1[58], step1[37], c3, &step2[58], &step2[37]); + btf_16_lane_2_3_neon(step1[57], step1[38], c3, &step2[57], &step2[38]); + btf_16_lane_2_3_neon(step1[56], step1[39], c3, &step2[56], &step2[39]); + btf_16_lane_3_2_neon(step1[40], step1[55], c7, &step2[40], &step2[55]); + btf_16_lane_3_2_neon(step1[41], step1[54], c7, &step2[41], &step2[54]); + btf_16_lane_3_2_neon(step1[42], step1[53], c7, &step2[42], &step2[53]); + btf_16_lane_3_2_neon(step1[43], step1[52], c7, &step2[43], &step2[52]); + + step2[0] = vqaddq_s16(step1[0], step1[7]); + step2[1] = vqaddq_s16(step1[1], step1[6]); + step2[2] = vqaddq_s16(step1[2], step1[5]); + step2[3] = vqaddq_s16(step1[3], step1[4]); + step2[4] = vqsubq_s16(step1[3], step1[4]); + step2[5] = vqsubq_s16(step1[2], step1[5]); + step2[6] = vqsubq_s16(step1[1], step1[6]); + step2[7] = vqsubq_s16(step1[0], step1[7]); + step2[8] = step1[8]; + step2[9] = step1[9]; + step2[14] = step1[14]; + step2[15] = step1[15]; + step2[16] = vqaddq_s16(step1[16], step1[23]); + step2[17] = vqaddq_s16(step1[17], step1[22]); + step2[18] = vqaddq_s16(step1[18], step1[21]); + step2[19] = vqaddq_s16(step1[19], step1[20]); + step2[20] = vqsubq_s16(step1[19], step1[20]); + step2[21] = vqsubq_s16(step1[18], step1[21]); + step2[22] = vqsubq_s16(step1[17], step1[22]); + step2[23] = vqsubq_s16(step1[16], step1[23]); + step2[24] = vqsubq_s16(step1[31], step1[24]); + step2[25] = vqsubq_s16(step1[30], step1[25]); + step2[26] = vqsubq_s16(step1[29], step1[26]); + step2[27] = vqsubq_s16(step1[28], step1[27]); + step2[28] = vqaddq_s16(step1[28], step1[27]); + step2[29] = vqaddq_s16(step1[29], step1[26]); + step2[30] = vqaddq_s16(step1[30], step1[25]); + step2[31] = vqaddq_s16(step1[31], step1[24]); + step2[32] = step1[32]; + step2[33] = step1[33]; + step2[34] = step1[34]; + step2[35] = step1[35]; + step2[44] = step1[44]; + step2[45] = step1[45]; + step2[46] = step1[46]; + step2[47] = step1[47]; + step2[48] = step1[48]; + step2[49] = step1[49]; + 
step2[50] = step1[50]; + step2[51] = step1[51]; + step2[60] = step1[60]; + step2[61] = step1[61]; + step2[62] = step1[62]; + step2[63] = step1[63]; + + // stage 9 + idct64_stage9_neon(step2, step1, cos_bit); + + // stage 10 + idct64_stage10_neon(step1, step2, cos_bit); + + // stage 11 + + out[0] = vqaddq_s16(step2[0], step2[63]); + out[1] = vqaddq_s16(step2[1], step2[62]); + out[2] = vqaddq_s16(step2[2], step2[61]); + out[3] = vqaddq_s16(step2[3], step2[60]); + out[4] = vqaddq_s16(step2[4], step2[59]); + out[5] = vqaddq_s16(step2[5], step2[58]); + out[6] = vqaddq_s16(step2[6], step2[57]); + out[7] = vqaddq_s16(step2[7], step2[56]); + out[8] = vqaddq_s16(step2[8], step2[55]); + out[9] = vqaddq_s16(step2[9], step2[54]); + out[10] = vqaddq_s16(step2[10], step2[53]); + out[11] = vqaddq_s16(step2[11], step2[52]); + out[12] = vqaddq_s16(step2[12], step2[51]); + out[13] = vqaddq_s16(step2[13], step2[50]); + out[14] = vqaddq_s16(step2[14], step2[49]); + out[15] = vqaddq_s16(step2[15], step2[48]); + out[16] = vqaddq_s16(step2[16], step2[47]); + out[17] = vqaddq_s16(step2[17], step2[46]); + out[18] = vqaddq_s16(step2[18], step2[45]); + out[19] = vqaddq_s16(step2[19], step2[44]); + out[20] = vqaddq_s16(step2[20], step2[43]); + out[21] = vqaddq_s16(step2[21], step2[42]); + out[22] = vqaddq_s16(step2[22], step2[41]); + out[23] = vqaddq_s16(step2[23], step2[40]); + out[24] = vqaddq_s16(step2[24], step2[39]); + out[25] = vqaddq_s16(step2[25], step2[38]); + out[26] = vqaddq_s16(step2[26], step2[37]); + out[27] = vqaddq_s16(step2[27], step2[36]); + out[28] = vqaddq_s16(step2[28], step2[35]); + out[29] = vqaddq_s16(step2[29], step2[34]); + out[30] = vqaddq_s16(step2[30], step2[33]); + out[31] = vqaddq_s16(step2[31], step2[32]); + out[32] = vqsubq_s16(step2[31], step2[32]); + out[33] = vqsubq_s16(step2[30], step2[33]); + out[34] = vqsubq_s16(step2[29], step2[34]); + out[35] = vqsubq_s16(step2[28], step2[35]); + out[36] = vqsubq_s16(step2[27], step2[36]); + out[37] = 
vqsubq_s16(step2[26], step2[37]); + out[38] = vqsubq_s16(step2[25], step2[38]); + out[39] = vqsubq_s16(step2[24], step2[39]); + out[40] = vqsubq_s16(step2[23], step2[40]); + out[41] = vqsubq_s16(step2[22], step2[41]); + out[42] = vqsubq_s16(step2[21], step2[42]); + out[43] = vqsubq_s16(step2[20], step2[43]); + out[44] = vqsubq_s16(step2[19], step2[44]); + out[45] = vqsubq_s16(step2[18], step2[45]); + out[46] = vqsubq_s16(step2[17], step2[46]); + out[47] = vqsubq_s16(step2[16], step2[47]); + out[48] = vqsubq_s16(step2[15], step2[48]); + out[49] = vqsubq_s16(step2[14], step2[49]); + out[50] = vqsubq_s16(step2[13], step2[50]); + out[51] = vqsubq_s16(step2[12], step2[51]); + out[52] = vqsubq_s16(step2[11], step2[52]); + out[53] = vqsubq_s16(step2[10], step2[53]); + out[54] = vqsubq_s16(step2[9], step2[54]); + out[55] = vqsubq_s16(step2[8], step2[55]); + out[56] = vqsubq_s16(step2[7], step2[56]); + out[57] = vqsubq_s16(step2[6], step2[57]); + out[58] = vqsubq_s16(step2[5], step2[58]); + out[59] = vqsubq_s16(step2[4], step2[59]); + out[60] = vqsubq_s16(step2[3], step2[60]); + out[61] = vqsubq_s16(step2[2], step2[61]); + out[62] = vqsubq_s16(step2[1], step2[62]); + out[63] = vqsubq_s16(step2[0], step2[63]); +} + +static INLINE void idct64_low1_neon(int16x8_t *input, int16x8_t *out, + int8_t cos_bit, int bit) { + (void)bit; + const int32_t *cospi = cospi_arr(cos_bit); + int16x8_t step1; + int32x4_t t32[2]; + + // stage 1 + // stage 2 + // stage 3 + // stage 4 + // stage 5 + // stage 6 + + t32[0] = vmull_n_s16(vget_low_s16(input[0]), cospi[32]); + t32[1] = vmull_n_s16(vget_high_s16(input[0]), cospi[32]); + + step1 = vcombine_s16(vrshrn_n_s32(t32[0], INV_COS_BIT), + vrshrn_n_s32(t32[1], INV_COS_BIT)); + // stage 7 + // stage 8 + // stage 9 + // stage 10 + // stage 11 + out[0] = step1; + out[1] = step1; + out[2] = step1; + out[3] = step1; + out[4] = step1; + out[5] = step1; + out[6] = step1; + out[7] = step1; + out[8] = step1; + out[9] = step1; + out[10] = step1; + out[11] = 
step1; + out[12] = step1; + out[13] = step1; + out[14] = step1; + out[15] = step1; + out[16] = step1; + out[17] = step1; + out[18] = step1; + out[19] = step1; + out[20] = step1; + out[21] = step1; + out[22] = step1; + out[23] = step1; + out[24] = step1; + out[25] = step1; + out[26] = step1; + out[27] = step1; + out[28] = step1; + out[29] = step1; + out[30] = step1; + out[31] = step1; + out[32] = step1; + out[33] = step1; + out[34] = step1; + out[35] = step1; + out[36] = step1; + out[37] = step1; + out[38] = step1; + out[39] = step1; + out[40] = step1; + out[41] = step1; + out[42] = step1; + out[43] = step1; + out[44] = step1; + out[45] = step1; + out[46] = step1; + out[47] = step1; + out[48] = step1; + out[49] = step1; + out[50] = step1; + out[51] = step1; + out[52] = step1; + out[53] = step1; + out[54] = step1; + out[55] = step1; + out[56] = step1; + out[57] = step1; + out[58] = step1; + out[59] = step1; + out[60] = step1; + out[61] = step1; + out[62] = step1; + out[63] = step1; +} + +static INLINE void idct64_low8_neon(int16x8_t *in, int16x8_t *out, + int8_t cos_bit, int bit) { + (void)bit; + const int32_t *cospi = cospi_arr(cos_bit); + int16x8_t step2[64], step1[64]; + + const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[4], (int16_t)cospi[60], + (int16_t)cospi[36], (int16_t)cospi[28]); + const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[20], (int16_t)cospi[44], + (int16_t)cospi[52], (int16_t)cospi[12]); + const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56], + (int16_t)cospi[40], (int16_t)cospi[24]); + const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], + (int16_t)cospi[16], (int16_t)cospi[48]); + const int16x4_t c4 = + set_s16x4_neon((int16_t)(-cospi[36]), (int16_t)(-cospi[28]), + (int16_t)(-cospi[52]), (int16_t)(-cospi[12])); + const int16x4_t c5 = + set_s16x4_neon((int16_t)(-cospi[8]), (int16_t)(-cospi[56]), + (int16_t)(-cospi[40]), (int16_t)(-cospi[24])); + const int16x4_t c6 = + 
set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]), + (int16_t)(-cospi[16]), (int16_t)(-cospi[48])); + + // stage 1 + // stage 2 + + step2[0] = in[0]; + step2[8] = in[4]; + step2[16] = in[2]; + step2[24] = in[6]; + + btf_16_neon(in[1], cospi[63], cospi[1], &step2[32], &step2[63]); + btf_16_neon(in[7], -cospi[57], cospi[7], &step2[39], &step2[56]); + btf_16_neon(in[5], cospi[59], cospi[5], &step2[40], &step2[55]); + btf_16_neon(in[3], -cospi[61], cospi[3], &step2[47], &step2[48]); + + // stage 3 + + step1[0] = step2[0]; + step1[8] = step2[8]; + + btf_16_neon(step2[16], cospi[62], cospi[2], &step1[16], &step1[31]); + btf_16_neon(step2[24], -cospi[58], cospi[6], &step1[23], &step1[24]); + + step1[32] = step2[32]; + step1[33] = step2[32]; + step1[38] = step2[39]; + step1[39] = step2[39]; + step1[40] = step2[40]; + step1[41] = step2[40]; + step1[46] = step2[47]; + step1[47] = step2[47]; + step1[48] = step2[48]; + step1[49] = step2[48]; + step1[54] = step2[55]; + step1[55] = step2[55]; + step1[56] = step2[56]; + step1[57] = step2[56]; + step1[62] = step2[63]; + step1[63] = step2[63]; + + // stage 4 + + step2[0] = step1[0]; + + btf_16_neon(step1[8], cospi[60], cospi[4], &step2[8], &step2[15]); + btf_16_lane_0_1_neon(step1[62], step1[33], c0, &step2[62], &step2[33]); + btf_16_lane_1_0_neon(step1[38], step1[57], c4, &step2[38], &step2[57]); + btf_16_lane_0_1_neon(step1[54], step1[41], c1, &step2[54], &step2[41]); + btf_16_lane_3_2_neon(step1[46], step1[49], c4, &step2[46], &step2[49]); + + step2[16] = step1[16]; + step2[17] = step1[16]; + step2[22] = step1[23]; + step2[23] = step1[23]; + step2[24] = step1[24]; + step2[25] = step1[24]; + step2[30] = step1[31]; + step2[31] = step1[31]; + step2[32] = step1[32]; + step2[39] = step1[39]; + step2[40] = step1[40]; + step2[47] = step1[47]; + step2[48] = step1[48]; + step2[55] = step1[55]; + step2[56] = step1[56]; + step2[63] = step1[63]; + + // stage 5 + + step1[0] = step2[0]; + + btf_16_lane_0_1_neon(step2[30], step2[17], 
c2, &step1[30], &step1[17]); + btf_16_lane_3_2_neon(step2[22], step2[25], c5, &step1[22], &step1[25]); + + step1[8] = step2[8]; + step1[9] = step2[8]; + step1[14] = step2[15]; + step1[15] = step2[15]; + + step1[16] = step2[16]; + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[31] = step2[31]; + step1[32] = step2[32]; + step1[33] = step2[33]; + step1[34] = step2[33]; + step1[35] = step2[32]; + step1[36] = step2[39]; + step1[37] = step2[38]; + step1[38] = step2[38]; + step1[39] = step2[39]; + step1[40] = step2[40]; + step1[41] = step2[41]; + step1[42] = step2[41]; + step1[43] = step2[40]; + step1[44] = step2[47]; + step1[45] = step2[46]; + step1[46] = step2[46]; + step1[47] = step2[47]; + step1[48] = step2[48]; + step1[49] = step2[49]; + step1[50] = step2[49]; + step1[51] = step2[48]; + step1[52] = step2[55]; + step1[53] = step2[54]; + step1[54] = step2[54]; + step1[55] = step2[55]; + step1[56] = step2[56]; + step1[57] = step2[57]; + step1[58] = step2[57]; + step1[59] = step2[56]; + step1[60] = step2[63]; + step1[61] = step2[62]; + step1[62] = step2[62]; + step1[63] = step2[63]; + + // stage 6 + + btf_16_neon(step1[0], cospi[32], cospi[32], &step2[0], &step2[1]); + btf_16_lane_2_3_neon(step1[14], step1[9], c3, &step2[14], &step2[9]); + btf_16_lane_0_1_neon(step1[61], step1[34], c2, &step2[61], &step2[34]); + btf_16_lane_0_1_neon(step1[60], step1[35], c2, &step2[60], &step2[35]); + btf_16_lane_1_0_neon(step1[36], step1[59], c5, &step2[36], &step2[59]); + btf_16_lane_1_0_neon(step1[37], step1[58], c5, &step2[37], &step2[58]); + btf_16_lane_2_3_neon(step1[53], step1[42], c2, &step2[53], &step2[42]); + btf_16_lane_2_3_neon(step1[52], step1[43], c2, &step2[52], &step2[43]); + btf_16_lane_3_2_neon(step1[44], step1[51], c5, &step2[44], &step2[51]); + btf_16_lane_3_2_neon(step1[45], step1[50], c5, &step2[45], &step2[50]); + + step2[8] = step1[8]; + step2[15] = step1[15]; + step2[16] = step1[16]; + step2[17] = step1[17]; + step2[18] = step1[17]; + step2[19] = 
step1[16]; + step2[20] = step1[23]; + step2[21] = step1[22]; + step2[22] = step1[22]; + step2[23] = step1[23]; + step2[24] = step1[24]; + step2[25] = step1[25]; + step2[26] = step1[25]; + step2[27] = step1[24]; + step2[28] = step1[31]; + step2[29] = step1[30]; + step2[30] = step1[30]; + step2[31] = step1[31]; + step2[32] = step1[32]; + step2[33] = step1[33]; + step2[38] = step1[38]; + step2[39] = step1[39]; + step2[40] = step1[40]; + step2[41] = step1[41]; + step2[46] = step1[46]; + step2[47] = step1[47]; + step2[48] = step1[48]; + step2[49] = step1[49]; + step2[54] = step1[54]; + step2[55] = step1[55]; + step2[56] = step1[56]; + step2[57] = step1[57]; + step2[62] = step1[62]; + step2[63] = step1[63]; + + // stage 7 + + btf_16_lane_2_3_neon(step2[29], step2[18], c3, &step1[29], &step1[18]); + btf_16_lane_2_3_neon(step2[28], step2[19], c3, &step1[28], &step1[19]); + btf_16_lane_3_2_neon(step2[20], step2[27], c6, &step1[20], &step1[27]); + btf_16_lane_3_2_neon(step2[21], step2[26], c6, &step1[21], &step1[26]); + + step1[0] = step2[0]; + step1[1] = step2[1]; + step1[2] = step2[1]; + step1[3] = step2[0]; + step1[8] = step2[8]; + step1[9] = step2[9]; + step1[10] = step2[9]; + step1[11] = step2[8]; + step1[12] = step2[15]; + step1[13] = step2[14]; + step1[14] = step2[14]; + step1[15] = step2[15]; + step1[16] = step2[16]; + step1[17] = step2[17]; + step1[22] = step2[22]; + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[25] = step2[25]; + step1[30] = step2[30]; + step1[31] = step2[31]; + step1[32] = vqaddq_s16(step2[32], step2[39]); + step1[33] = vqaddq_s16(step2[33], step2[38]); + step1[34] = vqaddq_s16(step2[34], step2[37]); + step1[35] = vqaddq_s16(step2[35], step2[36]); + step1[36] = vqsubq_s16(step2[35], step2[36]); + step1[37] = vqsubq_s16(step2[34], step2[37]); + step1[38] = vqsubq_s16(step2[33], step2[38]); + step1[39] = vqsubq_s16(step2[32], step2[39]); + step1[40] = vqsubq_s16(step2[47], step2[40]); + step1[41] = vqsubq_s16(step2[46], step2[41]); + 
step1[42] = vqsubq_s16(step2[45], step2[42]); + step1[43] = vqsubq_s16(step2[44], step2[43]); + step1[44] = vqaddq_s16(step2[43], step2[44]); + step1[45] = vqaddq_s16(step2[42], step2[45]); + step1[46] = vqaddq_s16(step2[41], step2[46]); + step1[47] = vqaddq_s16(step2[40], step2[47]); + step1[48] = vqaddq_s16(step2[48], step2[55]); + step1[49] = vqaddq_s16(step2[49], step2[54]); + step1[50] = vqaddq_s16(step2[50], step2[53]); + step1[51] = vqaddq_s16(step2[51], step2[52]); + step1[52] = vqsubq_s16(step2[51], step2[52]); + step1[53] = vqsubq_s16(step2[50], step2[53]); + step1[54] = vqsubq_s16(step2[49], step2[54]); + step1[55] = vqsubq_s16(step2[48], step2[55]); + step1[56] = vqsubq_s16(step2[63], step2[56]); + step1[57] = vqsubq_s16(step2[62], step2[57]); + step1[58] = vqsubq_s16(step2[61], step2[58]); + step1[59] = vqsubq_s16(step2[60], step2[59]); + step1[60] = vqaddq_s16(step2[59], step2[60]); + step1[61] = vqaddq_s16(step2[58], step2[61]); + step1[62] = vqaddq_s16(step2[57], step2[62]); + step1[63] = vqaddq_s16(step2[56], step2[63]); + + // stage 8 + + btf_16_lane_0_1_neon(step1[13], step1[10], c3, &step2[13], &step2[10]); + btf_16_lane_0_1_neon(step1[12], step1[11], c3, &step2[12], &step2[11]); + btf_16_lane_2_3_neon(step1[59], step1[36], c3, &step2[59], &step2[36]); + btf_16_lane_2_3_neon(step1[58], step1[37], c3, &step2[58], &step2[37]); + btf_16_lane_2_3_neon(step1[57], step1[38], c3, &step2[57], &step2[38]); + btf_16_lane_2_3_neon(step1[56], step1[39], c3, &step2[56], &step2[39]); + btf_16_lane_3_2_neon(step1[40], step1[55], c6, &step2[40], &step2[55]); + btf_16_lane_3_2_neon(step1[41], step1[54], c6, &step2[41], &step2[54]); + btf_16_lane_3_2_neon(step1[42], step1[53], c6, &step2[42], &step2[53]); + btf_16_lane_3_2_neon(step1[43], step1[52], c6, &step2[43], &step2[52]); + + step2[0] = step1[0]; + step2[1] = step1[1]; + step2[2] = step1[2]; + step2[3] = step1[3]; + step2[4] = step1[3]; + step2[5] = step1[2]; + step2[6] = step1[1]; + step2[7] = step1[0]; + 
step2[8] = step1[8]; + step2[9] = step1[9]; + step2[14] = step1[14]; + step2[15] = step1[15]; + step2[16] = vqaddq_s16(step1[16], step1[23]); + step2[17] = vqaddq_s16(step1[17], step1[22]); + step2[18] = vqaddq_s16(step1[18], step1[21]); + step2[19] = vqaddq_s16(step1[19], step1[20]); + step2[20] = vqsubq_s16(step1[19], step1[20]); + step2[21] = vqsubq_s16(step1[18], step1[21]); + step2[22] = vqsubq_s16(step1[17], step1[22]); + step2[23] = vqsubq_s16(step1[16], step1[23]); + step2[24] = vqsubq_s16(step1[31], step1[24]); + step2[25] = vqsubq_s16(step1[30], step1[25]); + step2[26] = vqsubq_s16(step1[29], step1[26]); + step2[27] = vqsubq_s16(step1[28], step1[27]); + step2[28] = vqaddq_s16(step1[28], step1[27]); + step2[29] = vqaddq_s16(step1[29], step1[26]); + step2[30] = vqaddq_s16(step1[30], step1[25]); + step2[31] = vqaddq_s16(step1[31], step1[24]); + step2[32] = step1[32]; + step2[33] = step1[33]; + step2[34] = step1[34]; + step2[35] = step1[35]; + step2[44] = step1[44]; + step2[45] = step1[45]; + step2[46] = step1[46]; + step2[47] = step1[47]; + step2[48] = step1[48]; + step2[49] = step1[49]; + step2[50] = step1[50]; + step2[51] = step1[51]; + step2[60] = step1[60]; + step2[61] = step1[61]; + step2[62] = step1[62]; + step2[63] = step1[63]; + + // stage 9 + idct64_stage9_neon(step2, step1, cos_bit); + + // stage 10 + idct64_stage10_neon(step1, step2, cos_bit); + + // stage 11 + + out[0] = vqaddq_s16(step2[0], step2[63]); + out[1] = vqaddq_s16(step2[1], step2[62]); + out[2] = vqaddq_s16(step2[2], step2[61]); + out[3] = vqaddq_s16(step2[3], step2[60]); + out[4] = vqaddq_s16(step2[4], step2[59]); + out[5] = vqaddq_s16(step2[5], step2[58]); + out[6] = vqaddq_s16(step2[6], step2[57]); + out[7] = vqaddq_s16(step2[7], step2[56]); + out[8] = vqaddq_s16(step2[8], step2[55]); + out[9] = vqaddq_s16(step2[9], step2[54]); + out[10] = vqaddq_s16(step2[10], step2[53]); + out[11] = vqaddq_s16(step2[11], step2[52]); + out[12] = vqaddq_s16(step2[12], step2[51]); + out[13] = 
vqaddq_s16(step2[13], step2[50]); + out[14] = vqaddq_s16(step2[14], step2[49]); + out[15] = vqaddq_s16(step2[15], step2[48]); + out[16] = vqaddq_s16(step2[16], step2[47]); + out[17] = vqaddq_s16(step2[17], step2[46]); + out[18] = vqaddq_s16(step2[18], step2[45]); + out[19] = vqaddq_s16(step2[19], step2[44]); + out[20] = vqaddq_s16(step2[20], step2[43]); + out[21] = vqaddq_s16(step2[21], step2[42]); + out[22] = vqaddq_s16(step2[22], step2[41]); + out[23] = vqaddq_s16(step2[23], step2[40]); + out[24] = vqaddq_s16(step2[24], step2[39]); + out[25] = vqaddq_s16(step2[25], step2[38]); + out[26] = vqaddq_s16(step2[26], step2[37]); + out[27] = vqaddq_s16(step2[27], step2[36]); + out[28] = vqaddq_s16(step2[28], step2[35]); + out[29] = vqaddq_s16(step2[29], step2[34]); + out[30] = vqaddq_s16(step2[30], step2[33]); + out[31] = vqaddq_s16(step2[31], step2[32]); + out[32] = vqsubq_s16(step2[31], step2[32]); + out[33] = vqsubq_s16(step2[30], step2[33]); + out[34] = vqsubq_s16(step2[29], step2[34]); + out[35] = vqsubq_s16(step2[28], step2[35]); + out[36] = vqsubq_s16(step2[27], step2[36]); + out[37] = vqsubq_s16(step2[26], step2[37]); + out[38] = vqsubq_s16(step2[25], step2[38]); + out[39] = vqsubq_s16(step2[24], step2[39]); + out[40] = vqsubq_s16(step2[23], step2[40]); + out[41] = vqsubq_s16(step2[22], step2[41]); + out[42] = vqsubq_s16(step2[21], step2[42]); + out[43] = vqsubq_s16(step2[20], step2[43]); + out[44] = vqsubq_s16(step2[19], step2[44]); + out[45] = vqsubq_s16(step2[18], step2[45]); + out[46] = vqsubq_s16(step2[17], step2[46]); + out[47] = vqsubq_s16(step2[16], step2[47]); + out[48] = vqsubq_s16(step2[15], step2[48]); + out[49] = vqsubq_s16(step2[14], step2[49]); + out[50] = vqsubq_s16(step2[13], step2[50]); + out[51] = vqsubq_s16(step2[12], step2[51]); + out[52] = vqsubq_s16(step2[11], step2[52]); + out[53] = vqsubq_s16(step2[10], step2[53]); + out[54] = vqsubq_s16(step2[9], step2[54]); + out[55] = vqsubq_s16(step2[8], step2[55]); + out[56] = vqsubq_s16(step2[7], 
step2[56]); + out[57] = vqsubq_s16(step2[6], step2[57]); + out[58] = vqsubq_s16(step2[5], step2[58]); + out[59] = vqsubq_s16(step2[4], step2[59]); + out[60] = vqsubq_s16(step2[3], step2[60]); + out[61] = vqsubq_s16(step2[2], step2[61]); + out[62] = vqsubq_s16(step2[1], step2[62]); + out[63] = vqsubq_s16(step2[0], step2[63]); +} + +static INLINE void idct64_low16_neon(int16x8_t *in, int16x8_t *out, + int8_t cos_bit, int bit) { + (void)bit; + const int32_t *cospi = cospi_arr(cos_bit); + int16x8_t step2[64], step1[64]; + + const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[4], (int16_t)cospi[60], + (int16_t)cospi[36], (int16_t)cospi[28]); + const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[20], (int16_t)cospi[44], + (int16_t)cospi[52], (int16_t)cospi[12]); + const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56], + (int16_t)cospi[40], (int16_t)cospi[24]); + const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], + (int16_t)cospi[16], (int16_t)cospi[48]); + const int16x4_t c4 = + set_s16x4_neon((int16_t)(-cospi[4]), (int16_t)(-cospi[60]), + (int16_t)(-cospi[36]), (int16_t)(-cospi[28])); + const int16x4_t c5 = + set_s16x4_neon((int16_t)(-cospi[20]), (int16_t)(-cospi[44]), + (int16_t)(-cospi[52]), (int16_t)(-cospi[12])); + const int16x4_t c6 = + set_s16x4_neon((int16_t)(-cospi[8]), (int16_t)(-cospi[56]), + (int16_t)(-cospi[40]), (int16_t)(-cospi[24])); + const int16x4_t c7 = + set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]), + (int16_t)(-cospi[16]), (int16_t)(-cospi[48])); + + // stage 1 + // stage 2 + + step2[0] = in[0]; + step2[4] = in[8]; + step2[8] = in[4]; + step2[12] = in[12]; + step2[16] = in[2]; + step2[20] = in[10]; + step2[24] = in[6]; + step2[28] = in[14]; + + btf_16_neon(in[1], cospi[63], cospi[1], &step2[32], &step2[63]); + btf_16_neon(in[15], -cospi[49], cospi[15], &step2[35], &step2[60]); + btf_16_neon(in[9], cospi[55], cospi[9], &step2[36], &step2[59]); + btf_16_neon(in[7], -cospi[57], cospi[7], 
&step2[39], &step2[56]); + btf_16_neon(in[5], cospi[59], cospi[5], &step2[40], &step2[55]); + btf_16_neon(in[11], -cospi[53], cospi[11], &step2[43], &step2[52]); + btf_16_neon(in[13], cospi[51], cospi[13], &step2[44], &step2[51]); + btf_16_neon(in[3], -cospi[61], cospi[3], &step2[47], &step2[48]); + + // stage 3 + + step1[0] = step2[0]; + step1[4] = step2[4]; + step1[8] = step2[8]; + step1[12] = step2[12]; + + btf_16_neon(step2[16], cospi[62], cospi[2], &step1[16], &step1[31]); + btf_16_neon(step2[20], cospi[54], cospi[10], &step1[20], &step1[27]); + btf_16_neon(step2[24], -cospi[58], cospi[6], &step1[23], &step1[24]); + btf_16_neon(step2[28], -cospi[50], cospi[14], &step1[19], &step1[28]); + + step1[32] = step2[32]; + step1[33] = step2[32]; + step1[34] = step2[35]; + step1[35] = step2[35]; + step1[36] = step2[36]; + step1[37] = step2[36]; + step1[38] = step2[39]; + step1[39] = step2[39]; + step1[40] = step2[40]; + step1[41] = step2[40]; + step1[42] = step2[43]; + step1[43] = step2[43]; + step1[44] = step2[44]; + step1[45] = step2[44]; + step1[46] = step2[47]; + step1[47] = step2[47]; + step1[48] = step2[48]; + step1[49] = step2[48]; + step1[50] = step2[51]; + step1[51] = step2[51]; + step1[52] = step2[52]; + step1[53] = step2[52]; + step1[54] = step2[55]; + step1[55] = step2[55]; + step1[56] = step2[56]; + step1[57] = step2[56]; + step1[58] = step2[59]; + step1[59] = step2[59]; + step1[60] = step2[60]; + step1[61] = step2[60]; + step1[62] = step2[63]; + step1[63] = step2[63]; + + // stage 4 + + step2[0] = step1[0]; + step2[4] = step1[4]; + + btf_16_neon(step1[8], cospi[60], cospi[4], &step2[8], &step2[15]); + btf_16_neon(step1[12], -cospi[52], cospi[12], &step2[11], &step2[12]); + btf_16_lane_0_1_neon(step1[62], step1[33], c0, &step2[62], &step2[33]); + btf_16_lane_1_0_neon(step1[34], step1[61], c4, &step2[34], &step2[61]); + btf_16_lane_2_3_neon(step1[58], step1[37], c0, &step2[58], &step2[37]); + btf_16_lane_3_2_neon(step1[38], step1[57], c4, &step2[38], 
&step2[57]); + btf_16_lane_0_1_neon(step1[54], step1[41], c1, &step2[54], &step2[41]); + btf_16_lane_1_0_neon(step1[42], step1[53], c5, &step2[42], &step2[53]); + btf_16_lane_2_3_neon(step1[50], step1[45], c1, &step2[50], &step2[45]); + btf_16_lane_3_2_neon(step1[46], step1[49], c5, &step2[46], &step2[49]); + + step2[16] = step1[16]; + step2[17] = step1[16]; + step2[18] = step1[19]; + step2[19] = step1[19]; + step2[20] = step1[20]; + step2[21] = step1[20]; + step2[22] = step1[23]; + step2[23] = step1[23]; + step2[24] = step1[24]; + step2[25] = step1[24]; + step2[26] = step1[27]; + step2[27] = step1[27]; + step2[28] = step1[28]; + step2[29] = step1[28]; + step2[30] = step1[31]; + step2[31] = step1[31]; + step2[32] = step1[32]; + step2[35] = step1[35]; + step2[36] = step1[36]; + step2[39] = step1[39]; + step2[40] = step1[40]; + step2[43] = step1[43]; + step2[44] = step1[44]; + step2[47] = step1[47]; + step2[48] = step1[48]; + step2[51] = step1[51]; + step2[52] = step1[52]; + step2[55] = step1[55]; + step2[56] = step1[56]; + step2[59] = step1[59]; + step2[60] = step1[60]; + step2[63] = step1[63]; + + // stage 5 + + step1[0] = step2[0]; + + btf_16_neon(step2[4], cospi[56], cospi[8], &step1[4], &step1[7]); + btf_16_lane_0_1_neon(step2[30], step2[17], c2, &step1[30], &step1[17]); + btf_16_lane_1_0_neon(step2[18], step2[29], c6, &step1[18], &step1[29]); + btf_16_lane_2_3_neon(step2[26], step2[21], c2, &step1[26], &step1[21]); + btf_16_lane_3_2_neon(step2[22], step2[25], c6, &step1[22], &step1[25]); + + step1[8] = step2[8]; + step1[9] = step2[8]; + step1[10] = step2[11]; + step1[11] = step2[11]; + step1[12] = step2[12]; + step1[13] = step2[12]; + step1[14] = step2[15]; + step1[15] = step2[15]; + step1[16] = step2[16]; + step1[19] = step2[19]; + step1[20] = step2[20]; + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[27] = step2[27]; + step1[28] = step2[28]; + step1[31] = step2[31]; + step1[32] = vqaddq_s16(step2[32], step2[35]); + step1[33] = vqaddq_s16(step2[33], 
step2[34]); + step1[34] = vqsubq_s16(step2[33], step2[34]); + step1[35] = vqsubq_s16(step2[32], step2[35]); + step1[36] = vqsubq_s16(step2[39], step2[36]); + step1[37] = vqsubq_s16(step2[38], step2[37]); + step1[38] = vqaddq_s16(step2[38], step2[37]); + step1[39] = vqaddq_s16(step2[39], step2[36]); + step1[40] = vqaddq_s16(step2[40], step2[43]); + step1[41] = vqaddq_s16(step2[41], step2[42]); + step1[42] = vqsubq_s16(step2[41], step2[42]); + step1[43] = vqsubq_s16(step2[40], step2[43]); + step1[44] = vqsubq_s16(step2[47], step2[44]); + step1[45] = vqsubq_s16(step2[46], step2[45]); + step1[46] = vqaddq_s16(step2[46], step2[45]); + step1[47] = vqaddq_s16(step2[47], step2[44]); + step1[48] = vqaddq_s16(step2[48], step2[51]); + step1[49] = vqaddq_s16(step2[49], step2[50]); + step1[50] = vqsubq_s16(step2[49], step2[50]); + step1[51] = vqsubq_s16(step2[48], step2[51]); + step1[52] = vqsubq_s16(step2[55], step2[52]); + step1[53] = vqsubq_s16(step2[54], step2[53]); + step1[54] = vqaddq_s16(step2[54], step2[53]); + step1[55] = vqaddq_s16(step2[55], step2[52]); + step1[56] = vqaddq_s16(step2[56], step2[59]); + step1[57] = vqaddq_s16(step2[57], step2[58]); + step1[58] = vqsubq_s16(step2[57], step2[58]); + step1[59] = vqsubq_s16(step2[56], step2[59]); + step1[60] = vqsubq_s16(step2[63], step2[60]); + step1[61] = vqsubq_s16(step2[62], step2[61]); + step1[62] = vqaddq_s16(step2[62], step2[61]); + step1[63] = vqaddq_s16(step2[63], step2[60]); + + // stage 6 + + btf_16_neon(step1[0], cospi[32], cospi[32], &step2[0], &step2[1]); + btf_16_lane_2_3_neon(step1[14], step1[9], c3, &step2[14], &step2[9]); + btf_16_lane_3_2_neon(step1[10], step1[13], c7, &step2[10], &step2[13]); + btf_16_lane_0_1_neon(step1[61], step1[34], c2, &step2[61], &step2[34]); + btf_16_lane_0_1_neon(step1[60], step1[35], c2, &step2[60], &step2[35]); + btf_16_lane_1_0_neon(step1[36], step1[59], c6, &step2[36], &step2[59]); + btf_16_lane_1_0_neon(step1[37], step1[58], c6, &step2[37], &step2[58]); + 
btf_16_lane_2_3_neon(step1[53], step1[42], c2, &step2[53], &step2[42]); + btf_16_lane_2_3_neon(step1[52], step1[43], c2, &step2[52], &step2[43]); + btf_16_lane_3_2_neon(step1[44], step1[51], c6, &step2[44], &step2[51]); + btf_16_lane_3_2_neon(step1[45], step1[50], c6, &step2[45], &step2[50]); + + step2[4] = step1[4]; + step2[5] = step1[4]; + step2[6] = step1[7]; + step2[7] = step1[7]; + step2[8] = step1[8]; + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; + step2[16] = vqaddq_s16(step1[16], step1[19]); + step2[17] = vqaddq_s16(step1[17], step1[18]); + step2[18] = vqsubq_s16(step1[17], step1[18]); + step2[19] = vqsubq_s16(step1[16], step1[19]); + step2[20] = vqsubq_s16(step1[23], step1[20]); + step2[21] = vqsubq_s16(step1[22], step1[21]); + step2[22] = vqaddq_s16(step1[22], step1[21]); + step2[23] = vqaddq_s16(step1[23], step1[20]); + step2[24] = vqaddq_s16(step1[24], step1[27]); + step2[25] = vqaddq_s16(step1[25], step1[26]); + step2[26] = vqsubq_s16(step1[25], step1[26]); + step2[27] = vqsubq_s16(step1[24], step1[27]); + step2[28] = vqsubq_s16(step1[31], step1[28]); + step2[29] = vqsubq_s16(step1[30], step1[29]); + step2[30] = vqaddq_s16(step1[30], step1[29]); + step2[31] = vqaddq_s16(step1[31], step1[28]); + step2[32] = step1[32]; + step2[33] = step1[33]; + step2[38] = step1[38]; + step2[39] = step1[39]; + step2[40] = step1[40]; + step2[41] = step1[41]; + step2[46] = step1[46]; + step2[47] = step1[47]; + step2[48] = step1[48]; + step2[49] = step1[49]; + step2[54] = step1[54]; + step2[55] = step1[55]; + step2[56] = step1[56]; + step2[57] = step1[57]; + step2[62] = step1[62]; + step2[63] = step1[63]; + + // stage 7 + + btf_16_lane_0_1_neon(step2[6], step2[5], c3, &step1[6], &step1[5]); + btf_16_lane_2_3_neon(step2[29], step2[18], c3, &step1[29], &step1[18]); + btf_16_lane_2_3_neon(step2[28], step2[19], c3, &step1[28], &step1[19]); + btf_16_lane_3_2_neon(step2[20], step2[27], c7, &step1[20], &step1[27]); + btf_16_lane_3_2_neon(step2[21], 
step2[26], c7, &step1[21], &step1[26]); + + step1[0] = step2[0]; + step1[1] = step2[1]; + step1[2] = step2[1]; + step1[3] = step2[0]; + step1[4] = step2[4]; + step1[7] = step2[7]; + step1[8] = vqaddq_s16(step2[8], step2[11]); + step1[9] = vqaddq_s16(step2[9], step2[10]); + step1[10] = vqsubq_s16(step2[9], step2[10]); + step1[11] = vqsubq_s16(step2[8], step2[11]); + step1[12] = vqsubq_s16(step2[15], step2[12]); + step1[13] = vqsubq_s16(step2[14], step2[13]); + step1[14] = vqaddq_s16(step2[14], step2[13]); + step1[15] = vqaddq_s16(step2[15], step2[12]); + step1[16] = step2[16]; + step1[17] = step2[17]; + step1[22] = step2[22]; + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[25] = step2[25]; + step1[30] = step2[30]; + step1[31] = step2[31]; + step1[32] = vqaddq_s16(step2[32], step2[39]); + step1[33] = vqaddq_s16(step2[33], step2[38]); + step1[34] = vqaddq_s16(step2[34], step2[37]); + step1[35] = vqaddq_s16(step2[35], step2[36]); + step1[36] = vqsubq_s16(step2[35], step2[36]); + step1[37] = vqsubq_s16(step2[34], step2[37]); + step1[38] = vqsubq_s16(step2[33], step2[38]); + step1[39] = vqsubq_s16(step2[32], step2[39]); + step1[40] = vqsubq_s16(step2[47], step2[40]); + step1[41] = vqsubq_s16(step2[46], step2[41]); + step1[42] = vqsubq_s16(step2[45], step2[42]); + step1[43] = vqsubq_s16(step2[44], step2[43]); + step1[44] = vqaddq_s16(step2[43], step2[44]); + step1[45] = vqaddq_s16(step2[42], step2[45]); + step1[46] = vqaddq_s16(step2[41], step2[46]); + step1[47] = vqaddq_s16(step2[40], step2[47]); + step1[48] = vqaddq_s16(step2[48], step2[55]); + step1[49] = vqaddq_s16(step2[49], step2[54]); + step1[50] = vqaddq_s16(step2[50], step2[53]); + step1[51] = vqaddq_s16(step2[51], step2[52]); + step1[52] = vqsubq_s16(step2[51], step2[52]); + step1[53] = vqsubq_s16(step2[50], step2[53]); + step1[54] = vqsubq_s16(step2[49], step2[54]); + step1[55] = vqsubq_s16(step2[48], step2[55]); + step1[56] = vqsubq_s16(step2[63], step2[56]); + step1[57] = vqsubq_s16(step2[62], 
step2[57]); + step1[58] = vqsubq_s16(step2[61], step2[58]); + step1[59] = vqsubq_s16(step2[60], step2[59]); + step1[60] = vqaddq_s16(step2[59], step2[60]); + step1[61] = vqaddq_s16(step2[58], step2[61]); + step1[62] = vqaddq_s16(step2[57], step2[62]); + step1[63] = vqaddq_s16(step2[56], step2[63]); + + // stage 8 + + btf_16_lane_0_1_neon(step1[13], step1[10], c3, &step2[13], &step2[10]); + btf_16_lane_0_1_neon(step1[12], step1[11], c3, &step2[12], &step2[11]); + btf_16_lane_2_3_neon(step1[59], step1[36], c3, &step2[59], &step2[36]); + btf_16_lane_2_3_neon(step1[58], step1[37], c3, &step2[58], &step2[37]); + btf_16_lane_2_3_neon(step1[57], step1[38], c3, &step2[57], &step2[38]); + btf_16_lane_2_3_neon(step1[56], step1[39], c3, &step2[56], &step2[39]); + btf_16_lane_3_2_neon(step1[40], step1[55], c7, &step2[40], &step2[55]); + btf_16_lane_3_2_neon(step1[41], step1[54], c7, &step2[41], &step2[54]); + btf_16_lane_3_2_neon(step1[42], step1[53], c7, &step2[42], &step2[53]); + btf_16_lane_3_2_neon(step1[43], step1[52], c7, &step2[43], &step2[52]); + + step2[0] = vqaddq_s16(step1[0], step1[7]); + step2[1] = vqaddq_s16(step1[1], step1[6]); + step2[2] = vqaddq_s16(step1[2], step1[5]); + step2[3] = vqaddq_s16(step1[3], step1[4]); + step2[4] = vqsubq_s16(step1[3], step1[4]); + step2[5] = vqsubq_s16(step1[2], step1[5]); + step2[6] = vqsubq_s16(step1[1], step1[6]); + step2[7] = vqsubq_s16(step1[0], step1[7]); + step2[8] = step1[8]; + step2[9] = step1[9]; + step2[14] = step1[14]; + step2[15] = step1[15]; + step2[16] = vqaddq_s16(step1[16], step1[23]); + step2[17] = vqaddq_s16(step1[17], step1[22]); + step2[18] = vqaddq_s16(step1[18], step1[21]); + step2[19] = vqaddq_s16(step1[19], step1[20]); + step2[20] = vqsubq_s16(step1[19], step1[20]); + step2[21] = vqsubq_s16(step1[18], step1[21]); + step2[22] = vqsubq_s16(step1[17], step1[22]); + step2[23] = vqsubq_s16(step1[16], step1[23]); + step2[24] = vqsubq_s16(step1[31], step1[24]); + step2[25] = vqsubq_s16(step1[30], step1[25]); + 
step2[26] = vqsubq_s16(step1[29], step1[26]); + step2[27] = vqsubq_s16(step1[28], step1[27]); + step2[28] = vqaddq_s16(step1[28], step1[27]); + step2[29] = vqaddq_s16(step1[29], step1[26]); + step2[30] = vqaddq_s16(step1[30], step1[25]); + step2[31] = vqaddq_s16(step1[31], step1[24]); + step2[32] = step1[32]; + step2[33] = step1[33]; + step2[34] = step1[34]; + step2[35] = step1[35]; + step2[44] = step1[44]; + step2[45] = step1[45]; + step2[46] = step1[46]; + step2[47] = step1[47]; + step2[48] = step1[48]; + step2[49] = step1[49]; + step2[50] = step1[50]; + step2[51] = step1[51]; + step2[60] = step1[60]; + step2[61] = step1[61]; + step2[62] = step1[62]; + step2[63] = step1[63]; + + // stage 9 + idct64_stage9_neon(step2, step1, cos_bit); + + // stage 10 + idct64_stage10_neon(step1, step2, cos_bit); + + // stage 11 + + out[0] = vqaddq_s16(step2[0], step2[63]); + out[1] = vqaddq_s16(step2[1], step2[62]); + out[2] = vqaddq_s16(step2[2], step2[61]); + out[3] = vqaddq_s16(step2[3], step2[60]); + out[4] = vqaddq_s16(step2[4], step2[59]); + out[5] = vqaddq_s16(step2[5], step2[58]); + out[6] = vqaddq_s16(step2[6], step2[57]); + out[7] = vqaddq_s16(step2[7], step2[56]); + out[8] = vqaddq_s16(step2[8], step2[55]); + out[9] = vqaddq_s16(step2[9], step2[54]); + out[10] = vqaddq_s16(step2[10], step2[53]); + out[11] = vqaddq_s16(step2[11], step2[52]); + out[12] = vqaddq_s16(step2[12], step2[51]); + out[13] = vqaddq_s16(step2[13], step2[50]); + out[14] = vqaddq_s16(step2[14], step2[49]); + out[15] = vqaddq_s16(step2[15], step2[48]); + out[16] = vqaddq_s16(step2[16], step2[47]); + out[17] = vqaddq_s16(step2[17], step2[46]); + out[18] = vqaddq_s16(step2[18], step2[45]); + out[19] = vqaddq_s16(step2[19], step2[44]); + out[20] = vqaddq_s16(step2[20], step2[43]); + out[21] = vqaddq_s16(step2[21], step2[42]); + out[22] = vqaddq_s16(step2[22], step2[41]); + out[23] = vqaddq_s16(step2[23], step2[40]); + out[24] = vqaddq_s16(step2[24], step2[39]); + out[25] = vqaddq_s16(step2[25], 
step2[38]); + out[26] = vqaddq_s16(step2[26], step2[37]); + out[27] = vqaddq_s16(step2[27], step2[36]); + out[28] = vqaddq_s16(step2[28], step2[35]); + out[29] = vqaddq_s16(step2[29], step2[34]); + out[30] = vqaddq_s16(step2[30], step2[33]); + out[31] = vqaddq_s16(step2[31], step2[32]); + out[32] = vqsubq_s16(step2[31], step2[32]); + out[33] = vqsubq_s16(step2[30], step2[33]); + out[34] = vqsubq_s16(step2[29], step2[34]); + out[35] = vqsubq_s16(step2[28], step2[35]); + out[36] = vqsubq_s16(step2[27], step2[36]); + out[37] = vqsubq_s16(step2[26], step2[37]); + out[38] = vqsubq_s16(step2[25], step2[38]); + out[39] = vqsubq_s16(step2[24], step2[39]); + out[40] = vqsubq_s16(step2[23], step2[40]); + out[41] = vqsubq_s16(step2[22], step2[41]); + out[42] = vqsubq_s16(step2[21], step2[42]); + out[43] = vqsubq_s16(step2[20], step2[43]); + out[44] = vqsubq_s16(step2[19], step2[44]); + out[45] = vqsubq_s16(step2[18], step2[45]); + out[46] = vqsubq_s16(step2[17], step2[46]); + out[47] = vqsubq_s16(step2[16], step2[47]); + out[48] = vqsubq_s16(step2[15], step2[48]); + out[49] = vqsubq_s16(step2[14], step2[49]); + out[50] = vqsubq_s16(step2[13], step2[50]); + out[51] = vqsubq_s16(step2[12], step2[51]); + out[52] = vqsubq_s16(step2[11], step2[52]); + out[53] = vqsubq_s16(step2[10], step2[53]); + out[54] = vqsubq_s16(step2[9], step2[54]); + out[55] = vqsubq_s16(step2[8], step2[55]); + out[56] = vqsubq_s16(step2[7], step2[56]); + out[57] = vqsubq_s16(step2[6], step2[57]); + out[58] = vqsubq_s16(step2[5], step2[58]); + out[59] = vqsubq_s16(step2[4], step2[59]); + out[60] = vqsubq_s16(step2[3], step2[60]); + out[61] = vqsubq_s16(step2[2], step2[61]); + out[62] = vqsubq_s16(step2[1], step2[62]); + out[63] = vqsubq_s16(step2[0], step2[63]); +} // Functions for blocks with eob at DC and within // topleft 8x8, 16x16, 32x32 corner -static const transform_1d_neon - lowbd_txfm_all_1d_zeros_w8_arr[TX_SIZES][ITX_TYPES_1D][4] = { - { - { av1_idct4_new, av1_idct4_new, NULL, NULL }, - { 
av1_iadst4_new, av1_iadst4_new, NULL, NULL }, - { av1_iidentity4_c, av1_iidentity4_c, NULL, NULL }, - }, - { { av1_idct8_new, av1_idct8_new, NULL, NULL }, - { av1_iadst8_new, av1_iadst8_new, NULL, NULL }, - { av1_iidentity8_c, av1_iidentity8_c, NULL, NULL } }, - { - { av1_idct16_new, av1_idct16_new, av1_idct16_new, NULL }, - { av1_iadst16_new, av1_iadst16_new, av1_iadst16_new, NULL }, - { av1_iidentity16_c, av1_iidentity16_c, av1_iidentity16_c, NULL }, - }, - { { av1_idct32_new, av1_idct32_new, av1_idct32_new, av1_idct32_new }, - { NULL, NULL, NULL, NULL }, - { av1_iidentity32_c, av1_iidentity32_c, av1_iidentity32_c, - av1_iidentity32_c } }, - { { av1_idct64_new, av1_idct64_new, av1_idct64_new, av1_idct64_new }, - { NULL, NULL, NULL, NULL }, - { NULL, NULL, NULL, NULL } } - }; - static const transform_neon lowbd_txfm_all_1d_zeros_w_arr[TX_SIZES][ITX_TYPES_1D][4] = { { @@ -2120,108 +3574,35 @@ static const transform_neon { NULL, NULL, NULL, NULL }, { NULL, NULL, NULL, NULL }, }, - { { idct8_low1_new_neon, idct8_new_neon, NULL, NULL }, - { iadst8_low1_new_neon, iadst8_new_neon, NULL, NULL }, - { identity8_new_neon, identity8_new_neon, NULL, NULL } }, + { { idct8_low1_neon, idct8_neon, NULL, NULL }, + { iadst8_low1_neon, iadst8_neon, NULL, NULL }, + { NULL, NULL, NULL, NULL } }, { - { idct16_low1_new_neon, idct16_low8_new_neon, idct16_new_neon, NULL }, - { iadst16_low1_new_neon, iadst16_low8_new_neon, iadst16_new_neon, - NULL }, - { identity16_new_neon, identity16_new_neon, identity16_new_neon, - NULL }, + { idct16_low1_neon, idct16_low8_neon, idct16_neon, NULL }, + { iadst16_low1_neon, iadst16_low8_neon, iadst16_neon, NULL }, + { NULL, NULL, NULL, NULL }, }, - { { idct32_low1_new_neon, idct32_low8_new_neon, idct32_low16_new_neon, - idct32_new_neon }, + { { idct32_low1_neon, idct32_low8_neon, idct32_low16_neon, idct32_neon }, { NULL, NULL, NULL, NULL }, - { identity32_new_neon, identity32_new_neon, identity32_new_neon, - identity32_new_neon } }, - { { NULL, NULL, 
NULL, NULL }, + { NULL, NULL, NULL, NULL } }, + { { idct64_low1_neon, idct64_low8_neon, idct64_low16_neon, + idct64_low32_neon }, { NULL, NULL, NULL, NULL }, { NULL, NULL, NULL, NULL } } }; -static INLINE void lowbd_inv_txfm2d_add_wxh_idtx_neon( - const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, - TX_SIZE tx_size, int eob) { - DECLARE_ALIGNED(32, int, txfm_buf[32 * 32 + 32 + 32]); - int32_t *temp_in = txfm_buf; - - int eobx, eoby; - get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob); - const int8_t *shift = inv_txfm_shift_ls[tx_size]; - const int txw_idx = get_txw_idx(tx_size); - const int txh_idx = get_txh_idx(tx_size); - const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx]; - const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx]; - const int txfm_size_col = tx_size_wide[tx_size]; - const int txfm_size_row = tx_size_high[tx_size]; - const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3; - - const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); - const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col); - - int32_t *temp_out = temp_in + buf_offset; - int32_t *buf = temp_out + buf_offset; - int32_t *buf_ptr = buf; - const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16 }; - int r, bd = 8; - - const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx]; - const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby]; - const transform_1d_neon row_txfm = - lowbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x]; - const transform_1d_neon col_txfm = - lowbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y]; - - assert(col_txfm != NULL); - assert(row_txfm != NULL); - - // row tx - int row_start = (buf_size_nonzero_h_div8 * 8); - for (int i = 0; i < row_start; i++) { - if (abs(rect_type) == 1) { - for (int j = 0; j < txfm_size_col; j++) - temp_in[j] = round_shift((int64_t)input[j] * NewInvSqrt2, NewSqrt2Bits); - row_txfm(temp_in, buf_ptr, cos_bit_row, stage_range); - } else { - row_txfm(input, 
buf_ptr, cos_bit_row, stage_range); - } - av1_round_shift_array(buf_ptr, txfm_size_col, -shift[0]); - input += txfm_size_col; - buf_ptr += txfm_size_col; - } - - // Doing memset for the rows which are not processed in row transform. - memset(buf_ptr, 0, - sizeof(int32_t) * txfm_size_col * (txfm_size_row - row_start)); - - // col tx - for (int c = 0; c < txfm_size_col; c++) { - for (r = 0; r < txfm_size_row; ++r) temp_in[r] = buf[r * txfm_size_col + c]; - - col_txfm(temp_in, temp_out, cos_bit_col, stage_range); - av1_round_shift_array(temp_out, txfm_size_row, -shift[1]); - - for (r = 0; r < txfm_size_row; ++r) { - output[r * stride + c] = - highbd_clip_pixel_add(output[r * stride + c], temp_out[r], bd); - } - } -} - static INLINE void lowbd_inv_txfm2d_add_idtx_neon(const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, TX_SIZE tx_size, int eob) { + (void)tx_type; int16x8_t a[32 * 4]; int16x8_t b[32 * 4]; int eobx, eoby; get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob); - const int8_t *shift = inv_txfm_shift_ls[tx_size]; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; const int txw_idx = get_txw_idx(tx_size); const int txh_idx = get_txh_idx(tx_size); - const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx]; - const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx]; const int txfm_size_col = tx_size_wide[tx_size]; const int txfm_size_row = tx_size_high[tx_size]; lowbd_inv_txfm2d_memset_neon(&a[0], (txfm_size_col * (txfm_size_row) >> 3), @@ -2232,17 +3613,8 @@ static INLINE void lowbd_inv_txfm2d_add_idtx_neon(const int32_t *input, const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3; const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3; - const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx]; - const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby]; const int32_t *input_1; int temp_b = 0; - const transform_neon row_txfm = - 
lowbd_txfm_all_1d_zeros_w_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x]; - const transform_neon col_txfm = - lowbd_txfm_all_1d_zeros_w_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y]; - - assert(col_txfm != NULL); - assert(row_txfm != NULL); for (int i = 0; i < buf_size_nonzero_h_div8; i++) { input_1 = input; @@ -2257,9 +3629,8 @@ static INLINE void lowbd_inv_txfm2d_add_idtx_neon(const int32_t *input, int y = i * txfm_size_col; round_shift_for_rect(&a[y], &a[y], txfm_size_col); } - row_txfm(&a[i * txfm_size_col], &a[i * txfm_size_col], cos_bit_row, 0); - av1_round_shift_array_16_neon(&a[i * txfm_size_col], txfm_size_col, - -shift[0]); + identity_txfm_round_neon(&a[i * txfm_size_col], &a[i * txfm_size_col], + txw_idx, txfm_size_col, -shift[0]); for (int j = 0; j < buf_size_w_div8; ++j) { int k = j * 8 + i * txfm_size_col; transpose_s16_8x8q(&a[k], &b[temp_b + txfm_size_row * j]); @@ -2267,9 +3638,8 @@ static INLINE void lowbd_inv_txfm2d_add_idtx_neon(const int32_t *input, temp_b += 8; } for (int j = 0; j < buf_size_w_div8; ++j) { - col_txfm(&b[j * txfm_size_row], &b[j * txfm_size_row], cos_bit_col, 0); - av1_round_shift_array_16_neon(&b[j * txfm_size_row], txfm_size_row, - -shift[1]); + identity_txfm_round_neon(&b[j * txfm_size_row], &b[j * txfm_size_row], + txh_idx, txfm_size_row, -shift[1]); } if (txfm_size_col >= 16) { for (int i = 0; i < (txfm_size_col >> 4); i++) { @@ -2281,90 +3651,6 @@ static INLINE void lowbd_inv_txfm2d_add_idtx_neon(const int32_t *input, } } -static INLINE void lowbd_inv_txfm2d_add_v_wxh_identity_neon( - const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, - TX_SIZE tx_size, int eob) { - DECLARE_ALIGNED(32, int, txfm_buf[32 * 32 + 32 + 32]); - int32_t *temp_in = txfm_buf; - - int eobx, eoby; - get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob); - const int8_t *shift = inv_txfm_shift_ls[tx_size]; - const int txw_idx = get_txw_idx(tx_size); - const int txh_idx = get_txh_idx(tx_size); - const int cos_bit_col = 
inv_cos_bit_col[txw_idx][txh_idx]; - const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx]; - const int txfm_size_col = tx_size_wide[tx_size]; - const int txfm_size_row = tx_size_high[tx_size]; - const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3; - - const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); - const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col); - - int32_t *temp_out = temp_in + buf_offset; - int32_t *buf = temp_out + buf_offset; - int32_t *buf_ptr = buf; - const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16 }; - int r, bd = 8; - - const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx]; - const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby]; - const transform_1d_neon row_txfm = - lowbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x]; - const transform_1d_neon col_txfm = - lowbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y]; - - assert(col_txfm != NULL); - assert(row_txfm != NULL); - int ud_flip, lr_flip; - get_flip_cfg(tx_type, &ud_flip, &lr_flip); - - // row tx - int row_start = (buf_size_nonzero_h_div8 * 8); - for (int i = 0; i < row_start; i++) { - if (abs(rect_type) == 1) { - for (int j = 0; j < txfm_size_col; j++) - temp_in[j] = round_shift((int64_t)input[j] * NewInvSqrt2, NewSqrt2Bits); - row_txfm(temp_in, buf_ptr, cos_bit_row, stage_range); - } else { - row_txfm(input, buf_ptr, cos_bit_row, stage_range); - } - av1_round_shift_array(buf_ptr, txfm_size_col, -shift[0]); - input += txfm_size_col; - buf_ptr += txfm_size_col; - } - // Doing memset for the rows which are not processed in row transform. 
- memset(buf_ptr, 0, - sizeof(int32_t) * txfm_size_col * (txfm_size_row - row_start)); - - // col tx - for (int c = 0; c < txfm_size_col; c++) { - if (lr_flip == 0) { - for (r = 0; r < txfm_size_row; ++r) - temp_in[r] = buf[r * txfm_size_col + c]; - } else { - // flip left right - for (r = 0; r < txfm_size_row; ++r) - temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)]; - } - col_txfm(temp_in, temp_out, cos_bit_col, stage_range); - av1_round_shift_array(temp_out, txfm_size_row, -shift[1]); - - if (ud_flip == 0) { - for (r = 0; r < txfm_size_row; ++r) { - output[r * stride + c] = - highbd_clip_pixel_add(output[r * stride + c], temp_out[r], bd); - } - } else { - // flip upside down - for (r = 0; r < txfm_size_row; ++r) { - output[r * stride + c] = highbd_clip_pixel_add( - output[r * stride + c], temp_out[txfm_size_row - r - 1], bd); - } - } - } -} - static INLINE void lowbd_inv_txfm2d_add_v_identity_neon( const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, TX_SIZE tx_size, int eob) { @@ -2372,11 +3658,10 @@ static INLINE void lowbd_inv_txfm2d_add_v_identity_neon( int16x8_t b[16 * 2]; int eobx, eoby, ud_flip, lr_flip; get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob); - const int8_t *shift = inv_txfm_shift_ls[tx_size]; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; const int txw_idx = get_txw_idx(tx_size); const int txh_idx = get_txh_idx(tx_size); - const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx]; - const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx]; + const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx]; const int txfm_size_col = tx_size_wide[tx_size]; const int txfm_size_row = tx_size_high[tx_size]; lowbd_inv_txfm2d_memset_neon(&b[0], (txfm_size_col * (txfm_size_row) >> 3), @@ -2386,15 +3671,11 @@ static INLINE void lowbd_inv_txfm2d_add_v_identity_neon( const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3; const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3; const int fun_idx_x = 
lowbd_txfm_all_1d_zeros_idx[eobx]; - const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby]; const int32_t *input_1; int temp_b = 0; const transform_neon row_txfm = lowbd_txfm_all_1d_zeros_w_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x]; - const transform_neon col_txfm = - lowbd_txfm_all_1d_zeros_w_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y]; - assert(col_txfm != NULL); assert(row_txfm != NULL); get_flip_cfg(tx_type, &ud_flip, &lr_flip); @@ -2432,9 +3713,8 @@ static INLINE void lowbd_inv_txfm2d_add_v_identity_neon( } } for (int j = 0; j < buf_size_w_div8; ++j) { - col_txfm(&b[j * txfm_size_row], &b[j * txfm_size_row], cos_bit_col, 0); - av1_round_shift_array_16_neon(&b[j * txfm_size_row], txfm_size_row, - -shift[1]); + identity_txfm_round_neon(&b[j * txfm_size_row], &b[j * txfm_size_row], + txh_idx, txfm_size_row, -shift[1]); } if (txfm_size_col >= 16) { for (int i = 0; i < (txfm_size_col >> 4); i++) { @@ -2446,90 +3726,6 @@ static INLINE void lowbd_inv_txfm2d_add_v_identity_neon( } } -static INLINE void lowbd_inv_txfm2d_add_h_wxh_identity_neon( - const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, - TX_SIZE tx_size, int eob) { - DECLARE_ALIGNED(32, int, txfm_buf[32 * 32 + 32 + 32]); - int32_t *temp_in = txfm_buf; - - int eobx, eoby; - get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob); - const int8_t *shift = inv_txfm_shift_ls[tx_size]; - const int txw_idx = get_txw_idx(tx_size); - const int txh_idx = get_txh_idx(tx_size); - const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx]; - const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx]; - const int txfm_size_col = tx_size_wide[tx_size]; - const int txfm_size_row = tx_size_high[tx_size]; - const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3; - - const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); - const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col); - - int32_t *temp_out = temp_in + buf_offset; - int32_t *buf = temp_out + buf_offset; - int32_t *buf_ptr 
= buf; - const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16 }; - int r, bd = 8; - - const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx]; - const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby]; - const transform_1d_neon row_txfm = - lowbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x]; - const transform_1d_neon col_txfm = - lowbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y]; - - assert(col_txfm != NULL); - assert(row_txfm != NULL); - int ud_flip, lr_flip; - get_flip_cfg(tx_type, &ud_flip, &lr_flip); - - // row tx - int row_start = (buf_size_nonzero_h_div8 * 8); - for (int i = 0; i < row_start; i++) { - if (abs(rect_type) == 1) { - for (int j = 0; j < txfm_size_col; j++) - temp_in[j] = round_shift((int64_t)input[j] * NewInvSqrt2, NewSqrt2Bits); - row_txfm(temp_in, buf_ptr, cos_bit_row, stage_range); - } else { - row_txfm(input, buf_ptr, cos_bit_row, stage_range); - } - av1_round_shift_array(buf_ptr, txfm_size_col, -shift[0]); - input += txfm_size_col; - buf_ptr += txfm_size_col; - } - // Doing memset for the rows which are not processed in row transform. 
- memset(buf_ptr, 0, - sizeof(int32_t) * txfm_size_col * (txfm_size_row - row_start)); - - // col tx - for (int c = 0; c < txfm_size_col; c++) { - if (lr_flip == 0) { - for (r = 0; r < txfm_size_row; ++r) - temp_in[r] = buf[r * txfm_size_col + c]; - } else { - // flip left right - for (r = 0; r < txfm_size_row; ++r) - temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)]; - } - col_txfm(temp_in, temp_out, cos_bit_col, stage_range); - av1_round_shift_array(temp_out, txfm_size_row, -shift[1]); - - if (ud_flip == 0) { - for (r = 0; r < txfm_size_row; ++r) { - output[r * stride + c] = - highbd_clip_pixel_add(output[r * stride + c], temp_out[r], bd); - } - } else { - // flip upside down - for (r = 0; r < txfm_size_row; ++r) { - output[r * stride + c] = highbd_clip_pixel_add( - output[r * stride + c], temp_out[txfm_size_row - r - 1], bd); - } - } - } -} - static INLINE void lowbd_inv_txfm2d_add_h_identity_neon( const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, TX_SIZE tx_size, int eob) { @@ -2537,11 +3733,10 @@ static INLINE void lowbd_inv_txfm2d_add_h_identity_neon( int16x8_t b[16 * 2]; int eobx, eoby, ud_flip, lr_flip; get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob); - const int8_t *shift = inv_txfm_shift_ls[tx_size]; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; const int txw_idx = get_txw_idx(tx_size); const int txh_idx = get_txh_idx(tx_size); - const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx]; - const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx]; + const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx]; const int txfm_size_col = tx_size_wide[tx_size]; const int txfm_size_row = tx_size_high[tx_size]; lowbd_inv_txfm2d_memset_neon(&a[0], (txfm_size_col * (txfm_size_row) >> 3), @@ -2550,17 +3745,13 @@ static INLINE void lowbd_inv_txfm2d_add_h_identity_neon( const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3; const int 
buf_size_nonzero_w_div8 = (eobx + 8) >> 3; - const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx]; const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby]; const int32_t *input_1; int temp_b = 0; - const transform_neon row_txfm = - lowbd_txfm_all_1d_zeros_w_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x]; const transform_neon col_txfm = lowbd_txfm_all_1d_zeros_w_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y]; assert(col_txfm != NULL); - assert(row_txfm != NULL); get_flip_cfg(tx_type, &ud_flip, &lr_flip); @@ -2577,9 +3768,8 @@ static INLINE void lowbd_inv_txfm2d_add_h_identity_neon( int y = i * txfm_size_col; round_shift_for_rect(&a[y], &a[y], txfm_size_col); } - row_txfm(&a[i * txfm_size_col], &a[i * txfm_size_col], cos_bit_row, 0); - av1_round_shift_array_16_neon(&a[i * txfm_size_col], txfm_size_col, - -shift[0]); + identity_txfm_round_neon(&a[i * txfm_size_col], &a[i * txfm_size_col], + txw_idx, txfm_size_col, -shift[0]); for (int j = 0; j < buf_size_w_div8; ++j) { int k = j * 8 + i * txfm_size_col; transpose_s16_8x8q(&a[k], &b[temp_b + txfm_size_row * j]); @@ -2604,24 +3794,24 @@ static INLINE void lowbd_inv_txfm2d_add_h_identity_neon( static INLINE void lowbd_inv_txfm2d_add_4x4_neon(const int32_t *input, uint8_t *output, int stride, - TX_TYPE tx_type, - TX_SIZE tx_size, int eob) { + TX_TYPE tx_type, int eob) { (void)eob; + TX_SIZE tx_size = TX_4X4; DECLARE_ALIGNED(32, int, txfm_buf[4 * 4 + 8 + 8]); int32_t *temp_in = txfm_buf; - const int8_t *shift = inv_txfm_shift_ls[tx_size]; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; const int txw_idx = get_txw_idx(tx_size); const int txh_idx = get_txh_idx(tx_size); - const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx]; - const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx]; + const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx]; const int txfm_size_col = tx_size_wide[tx_size]; const int txfm_size_row = tx_size_high[tx_size]; 
const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col); int32_t *temp_out = temp_in + buf_offset; int32_t *buf = temp_out + buf_offset; int32_t *buf_ptr = buf; - const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16 }; + const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16, 16, 16, 16, 16, 16, 16 }; int r, bd = 8; const transform_1d_neon row_txfm = lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]]; @@ -2647,6 +3837,7 @@ static INLINE void lowbd_inv_txfm2d_add_4x4_neon(const int32_t *input, for (r = 0; r < txfm_size_row; ++r) temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)]; } + clamp_buf(temp_in, txfm_size_row, bd + 8); col_txfm(temp_in, temp_out, cos_bit_col, stage_range); av1_round_shift_array(temp_out, txfm_size_row, -shift[1]); @@ -2666,24 +3857,25 @@ static INLINE void lowbd_inv_txfm2d_add_4x4_neon(const int32_t *input, } void lowbd_inv_txfm2d_add_4x8_neon(const int32_t *input, uint8_t *output, - int stride, TX_TYPE tx_type, TX_SIZE tx_size, - int eob) { + int stride, TX_TYPE tx_type, int eob) { (void)eob; + TX_SIZE tx_size = TX_4X8; DECLARE_ALIGNED(32, int, txfm_buf[4 * 8 + 8 + 8]); int32_t *temp_in = txfm_buf; - const int8_t *shift = inv_txfm_shift_ls[tx_size]; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; const int txw_idx = get_txw_idx(tx_size); const int txh_idx = get_txh_idx(tx_size); - const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx]; - const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx]; + const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx]; const int txfm_size_col = tx_size_wide[tx_size]; const int txfm_size_row = tx_size_high[tx_size]; const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col); int32_t *temp_out = temp_in + buf_offset; int32_t *buf = temp_out + buf_offset; int32_t *buf_ptr = buf; - const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16 }; + const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16, 16, 16, 16, + 16, 16, 16, 
16 }; int r, bd = 8; const transform_1d_neon row_txfm = lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]]; @@ -2711,6 +3903,7 @@ void lowbd_inv_txfm2d_add_4x8_neon(const int32_t *input, uint8_t *output, for (r = 0; r < txfm_size_row; ++r) temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)]; } + clamp_buf(temp_in, txfm_size_row, bd + 8); col_txfm(temp_in, temp_out, cos_bit_col, stage_range); av1_round_shift_array(temp_out, txfm_size_row, -shift[1]); @@ -2730,24 +3923,25 @@ void lowbd_inv_txfm2d_add_4x8_neon(const int32_t *input, uint8_t *output, } void lowbd_inv_txfm2d_add_8x4_neon(const int32_t *input, uint8_t *output, - int stride, TX_TYPE tx_type, TX_SIZE tx_size, - int eob) { + int stride, TX_TYPE tx_type, int eob) { (void)eob; + TX_SIZE tx_size = TX_8X4; DECLARE_ALIGNED(32, int, txfm_buf[8 * 4 + 8 + 8]); int32_t *temp_in = txfm_buf; - const int8_t *shift = inv_txfm_shift_ls[tx_size]; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; const int txw_idx = get_txw_idx(tx_size); const int txh_idx = get_txh_idx(tx_size); - const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx]; - const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx]; + const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx]; const int txfm_size_col = tx_size_wide[tx_size]; const int txfm_size_row = tx_size_high[tx_size]; const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col); int32_t *temp_out = temp_in + buf_offset; int32_t *buf = temp_out + buf_offset; int32_t *buf_ptr = buf; - const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16 }; + const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16, 16, 16, 16, + 16, 16, 16, 16 }; int r, bd = 8; const transform_1d_neon row_txfm = lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]]; @@ -2775,6 +3969,7 @@ void lowbd_inv_txfm2d_add_8x4_neon(const int32_t *input, uint8_t *output, for (r = 0; r < txfm_size_row; ++r) temp_in[r] = buf[r * txfm_size_col + 
(txfm_size_col - c - 1)]; } + clamp_buf(temp_in, txfm_size_row, bd + 8); col_txfm(temp_in, temp_out, cos_bit_col, stage_range); av1_round_shift_array(temp_out, txfm_size_row, -shift[1]); @@ -2794,24 +3989,25 @@ void lowbd_inv_txfm2d_add_8x4_neon(const int32_t *input, uint8_t *output, } void lowbd_inv_txfm2d_add_4x16_neon(const int32_t *input, uint8_t *output, - int stride, TX_TYPE tx_type, - TX_SIZE tx_size, int eob) { + int stride, TX_TYPE tx_type, int eob) { (void)eob; + TX_SIZE tx_size = TX_4X16; DECLARE_ALIGNED(32, int, txfm_buf[4 * 16 + 16 + 16]); int32_t *temp_in = txfm_buf; - const int8_t *shift = inv_txfm_shift_ls[tx_size]; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; const int txw_idx = get_txw_idx(tx_size); const int txh_idx = get_txh_idx(tx_size); - const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx]; - const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx]; + const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx]; const int txfm_size_col = tx_size_wide[tx_size]; const int txfm_size_row = tx_size_high[tx_size]; const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col); int32_t *temp_out = temp_in + buf_offset; int32_t *buf = temp_out + buf_offset; int32_t *buf_ptr = buf; - const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16 }; + const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16 }; int r, bd = 8; const transform_1d_neon row_txfm = lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]]; @@ -2837,6 +4033,7 @@ void lowbd_inv_txfm2d_add_4x16_neon(const int32_t *input, uint8_t *output, for (r = 0; r < txfm_size_row; ++r) temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)]; } + clamp_buf(temp_in, txfm_size_row, bd + 8); col_txfm(temp_in, temp_out, cos_bit_col, stage_range); av1_round_shift_array(temp_out, txfm_size_row, -shift[1]); @@ -2856,25 +4053,25 @@ void lowbd_inv_txfm2d_add_4x16_neon(const int32_t *input, uint8_t 
*output, } void lowbd_inv_txfm2d_add_16x4_neon(const int32_t *input, uint8_t *output, - int stride, TX_TYPE tx_type, - TX_SIZE tx_size, int eob) { + int stride, TX_TYPE tx_type, int eob) { (void)eob; - + TX_SIZE tx_size = TX_16X4; DECLARE_ALIGNED(32, int, txfm_buf[16 * 4 + 16 + 16]); int32_t *temp_in = txfm_buf; - const int8_t *shift = inv_txfm_shift_ls[tx_size]; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; const int txw_idx = get_txw_idx(tx_size); const int txh_idx = get_txh_idx(tx_size); - const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx]; - const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx]; + const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx]; const int txfm_size_col = tx_size_wide[tx_size]; const int txfm_size_row = tx_size_high[tx_size]; const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col); int32_t *temp_out = temp_in + buf_offset; int32_t *buf = temp_out + buf_offset; int32_t *buf_ptr = buf; - const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16 }; + const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16 }; int r, bd = 8; const transform_1d_neon row_txfm = lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]]; @@ -2900,89 +4097,7 @@ void lowbd_inv_txfm2d_add_16x4_neon(const int32_t *input, uint8_t *output, for (r = 0; r < txfm_size_row; ++r) temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)]; } - col_txfm(temp_in, temp_out, cos_bit_col, stage_range); - av1_round_shift_array(temp_out, txfm_size_row, -shift[1]); - - if (ud_flip == 0) { - for (r = 0; r < txfm_size_row; ++r) { - output[r * stride + c] = - highbd_clip_pixel_add(output[r * stride + c], temp_out[r], bd); - } - } else { - // flip upside down - for (r = 0; r < txfm_size_row; ++r) { - output[r * stride + c] = highbd_clip_pixel_add( - output[r * stride + c], temp_out[txfm_size_row - r - 1], bd); - } - } - } -} - -static INLINE void 
lowbd_inv_txfm2d_add_wxh_no_identity_neon( - const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, - TX_SIZE tx_size, int eob) { - DECLARE_ALIGNED(32, int, txfm_buf[64 * 64 + 64 + 64]); - int32_t *temp_in = txfm_buf; - - int eobx, eoby, ud_flip, lr_flip, row_start; - get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob); - const int8_t *shift = inv_txfm_shift_ls[tx_size]; - const int txw_idx = get_txw_idx(tx_size); - const int txh_idx = get_txh_idx(tx_size); - const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx]; - const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx]; - const int txfm_size_col = tx_size_wide[tx_size]; - const int txfm_size_row = tx_size_high[tx_size]; - const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3; - const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); - const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col); - - int32_t *temp_out = temp_in + buf_offset; - int32_t *buf = temp_out + buf_offset; - int32_t *buf_ptr = buf; - const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16 }; - const int bd = 8; - int r; - - const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx]; - const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby]; - const transform_1d_neon row_txfm = - lowbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x]; - const transform_1d_neon col_txfm = - lowbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y]; - - assert(col_txfm != NULL); - assert(row_txfm != NULL); - - get_flip_cfg(tx_type, &ud_flip, &lr_flip); - row_start = (buf_size_nonzero_h_div8 << 3); - - for (int i = 0; i < row_start; i++) { - if (abs(rect_type) == 1) { - for (int j = 0; j < txfm_size_col; j++) - temp_in[j] = round_shift((int64_t)input[j] * NewInvSqrt2, NewSqrt2Bits); - row_txfm(temp_in, buf_ptr, cos_bit_row, stage_range); - } else { - row_txfm(input, buf_ptr, cos_bit_row, stage_range); - } - av1_round_shift_array(buf_ptr, txfm_size_col, -shift[0]); - input += txfm_size_col; - 
buf_ptr += txfm_size_col; - } - - // Doing memset for the rows which are not processed in row transform. - memset(buf_ptr, 0, - sizeof(int32_t) * txfm_size_col * (txfm_size_row - row_start)); - - for (int c = 0; c < txfm_size_col; c++) { - if (lr_flip == 0) { - for (r = 0; r < txfm_size_row; ++r) - temp_in[r] = buf[r * txfm_size_col + c]; - } else { - // flip left right - for (r = 0; r < txfm_size_row; ++r) - temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)]; - } + clamp_buf(temp_in, txfm_size_row, bd + 8); col_txfm(temp_in, temp_out, cos_bit_col, stage_range); av1_round_shift_array(temp_out, txfm_size_row, -shift[1]); @@ -3008,17 +4123,18 @@ static INLINE void lowbd_inv_txfm2d_add_no_identity_neon( int16x8_t b[64 * 8]; int eobx, eoby, ud_flip, lr_flip; get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob); - const int8_t *shift = inv_txfm_shift_ls[tx_size]; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; const int txw_idx = get_txw_idx(tx_size); const int txh_idx = get_txh_idx(tx_size); - const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx]; - const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx]; + const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx]; const int txfm_size_col = tx_size_wide[tx_size]; const int txfm_size_row = tx_size_high[tx_size]; const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); const int buf_size_w_div8 = txfm_size_col >> 3; const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3; const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3; + const int input_stride = AOMMIN(32, txfm_size_col); const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx]; const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby]; const int32_t *input_1; @@ -3038,14 +4154,14 @@ static INLINE void lowbd_inv_txfm2d_add_no_identity_neon( input_1 = input; for (int j = 0; j < buf_size_nonzero_w_div8; ++j) { int k = j * 8 + i * txfm_size_col; - 
load_buffer_32bit_to_16bit_neon(input_1, &a[k], txfm_size_col); + load_buffer_32bit_to_16bit_neon(input_1, &a[k], input_stride); transpose_s16_8x8q(&a[k], &a[k]); input_1 += 8; } - input += (txfm_size_col * 8); + input += (input_stride * 8); if (abs(rect_type) == 1) { int y = i * txfm_size_col; - round_shift_for_rect(&a[y], &a[y], txfm_size_col); + round_shift_for_rect(&a[y], &a[y], input_stride); } row_txfm(&a[i * txfm_size_col], &a[i * txfm_size_col], cos_bit_row, 0); av1_round_shift_array_16_neon(&a[i * txfm_size_col], txfm_size_col, @@ -3083,36 +4199,6 @@ static INLINE void lowbd_inv_txfm2d_add_no_identity_neon( } } -static INLINE void lowbd_inv_txfm2d_add_wxh_universe_neon( - const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, - TX_SIZE tx_size, int eob) { - switch (tx_type) { - case IDTX: - lowbd_inv_txfm2d_add_wxh_idtx_neon(input, output, stride, tx_type, - tx_size, eob); - break; - - case H_DCT: - case H_ADST: - case H_FLIPADST: - lowbd_inv_txfm2d_add_v_wxh_identity_neon(input, output, stride, tx_type, - tx_size, eob); - break; - - case V_DCT: - case V_ADST: - case V_FLIPADST: - lowbd_inv_txfm2d_add_h_wxh_identity_neon(input, output, stride, tx_type, - tx_size, eob); - break; - - default: - lowbd_inv_txfm2d_add_wxh_no_identity_neon(input, output, stride, tx_type, - tx_size, eob); - break; - } -} - static INLINE void lowbd_inv_txfm2d_add_universe_neon( const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, TX_SIZE tx_size, int eob) { @@ -3146,73 +4232,27 @@ static INLINE void lowbd_inv_txfm2d_add_universe_neon( void av1_lowbd_inv_txfm2d_add_neon(const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, TX_SIZE tx_size, int eob) { - int row; switch (tx_size) { case TX_4X4: - lowbd_inv_txfm2d_add_4x4_neon(input, output, stride, tx_type, tx_size, - eob); + lowbd_inv_txfm2d_add_4x4_neon(input, output, stride, tx_type, eob); break; case TX_4X8: - lowbd_inv_txfm2d_add_4x8_neon(input, output, stride, tx_type, tx_size, - eob); 
+ lowbd_inv_txfm2d_add_4x8_neon(input, output, stride, tx_type, eob); break; case TX_8X4: - lowbd_inv_txfm2d_add_8x4_neon(input, output, stride, tx_type, tx_size, - eob); + lowbd_inv_txfm2d_add_8x4_neon(input, output, stride, tx_type, eob); break; case TX_4X16: - lowbd_inv_txfm2d_add_4x16_neon(input, output, stride, tx_type, tx_size, - eob); + lowbd_inv_txfm2d_add_4x16_neon(input, output, stride, tx_type, eob); break; case TX_16X4: - lowbd_inv_txfm2d_add_16x4_neon(input, output, stride, tx_type, tx_size, - eob); + lowbd_inv_txfm2d_add_16x4_neon(input, output, stride, tx_type, eob); break; - case TX_16X64: { - lowbd_inv_txfm2d_add_wxh_universe_neon(input, output, stride, tx_type, - tx_size, eob); - } break; - - case TX_64X16: { - int32_t mod_input[64 * 16]; - for (row = 0; row < 16; ++row) { - memcpy(mod_input + row * 64, input + row * 32, 32 * sizeof(*mod_input)); - memset(mod_input + row * 64 + 32, 0, 32 * sizeof(*mod_input)); - } - lowbd_inv_txfm2d_add_wxh_universe_neon(mod_input, output, stride, tx_type, - tx_size, eob); - } break; - - case TX_32X64: { - lowbd_inv_txfm2d_add_wxh_universe_neon(input, output, stride, tx_type, - tx_size, eob); - } break; - - case TX_64X32: { - int32_t mod_input[64 * 32]; - for (row = 0; row < 32; ++row) { - memcpy(mod_input + row * 64, input + row * 32, 32 * sizeof(*mod_input)); - memset(mod_input + row * 64 + 32, 0, 32 * sizeof(*mod_input)); - } - lowbd_inv_txfm2d_add_wxh_universe_neon(mod_input, output, stride, tx_type, - tx_size, eob); - } break; - - case TX_64X64: { - int32_t mod_input[64 * 64]; - for (row = 0; row < 32; ++row) { - memcpy(mod_input + row * 64, input + row * 32, 32 * sizeof(*mod_input)); - memset(mod_input + row * 64 + 32, 0, 32 * sizeof(*mod_input)); - } - lowbd_inv_txfm2d_add_wxh_universe_neon(mod_input, output, stride, tx_type, - tx_size, eob); - } break; - default: lowbd_inv_txfm2d_add_universe_neon(input, output, stride, tx_type, tx_size, eob); diff --git a/media/libaom/src/av1/common/arm/av1_txfm_neon.c 
b/media/libaom/src/av1/common/arm/av1_txfm_neon.c index de3c54724..7e3a05ab7 100644 --- a/media/libaom/src/av1/common/arm/av1_txfm_neon.c +++ b/media/libaom/src/av1/common/arm/av1_txfm_neon.c @@ -12,6 +12,8 @@ #include <arm_neon.h> #include <assert.h> +#include "config/av1_rtcd.h" + #include "aom_ports/mem.h" #include "av1/common/arm/mem_neon.h" diff --git a/media/libaom/src/av1/common/arm/cfl_neon.c b/media/libaom/src/av1/common/arm/cfl_neon.c index 39025b5e5..371be5f0e 100644 --- a/media/libaom/src/av1/common/arm/cfl_neon.c +++ b/media/libaom/src/av1/common/arm/cfl_neon.c @@ -131,6 +131,7 @@ static void cfl_luma_subsampling_444_lbd_neon(const uint8_t *input, } while ((pred_buf_q3 += CFL_BUF_LINE) < end); } +#if CONFIG_AV1_HIGHBITDEPTH #ifndef __aarch64__ uint16x8_t vpaddq_u16(uint16x8_t a, uint16x8_t b) { return vcombine_u16(vpadd_u16(vget_low_u16(a), vget_high_u16(a)), @@ -247,6 +248,7 @@ static void cfl_luma_subsampling_444_hbd_neon(const uint16_t *input, input += input_stride; } while ((pred_buf_q3 += CFL_BUF_LINE) < end); } +#endif // CONFIG_AV1_HIGHBITDEPTH CFL_GET_SUBSAMPLE_FUNCTION(neon) @@ -511,6 +513,7 @@ static INLINE void cfl_predict_lbd_neon(const int16_t *pred_buf_q3, CFL_PREDICT_FN(neon, lbd) +#if CONFIG_AV1_HIGHBITDEPTH static INLINE uint16x4_t clamp_s16(int16x4_t a, int16x4_t max) { return vreinterpret_u16_s16(vmax_s16(vmin_s16(a, max), vdup_n_s16(0))); } @@ -582,3 +585,4 @@ static INLINE void cfl_predict_hbd_neon(const int16_t *pred_buf_q3, } CFL_PREDICT_FN(neon, hbd) +#endif // CONFIG_AV1_HIGHBITDEPTH diff --git a/media/libaom/src/av1/common/arm/convolve_neon.c b/media/libaom/src/av1/common/arm/convolve_neon.c index d0c4f8ff6..51c96961c 100644 --- a/media/libaom/src/av1/common/arm/convolve_neon.c +++ b/media/libaom/src/av1/common/arm/convolve_neon.c @@ -195,12 +195,12 @@ void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams 
*filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, + const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params) { const uint8_t horiz_offset = filter_params_x->taps / 2 - 1; const int8_t bits = FILTER_BITS - conv_params->round_0; - (void)subpel_y_q4; + (void)subpel_y_qn; (void)conv_params; (void)filter_params_y; @@ -214,7 +214,7 @@ void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS)); const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( - filter_params_x, subpel_x_q4 & SUBPEL_MASK); + filter_params_x, subpel_x_qn & SUBPEL_MASK); const int16x8_t shift_round_0 = vdupq_n_s16(-conv_params->round_0); const int16x8_t shift_by_bits = vdupq_n_s16(-bits); @@ -603,14 +603,14 @@ void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, + const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params) { const int vert_offset = filter_params_y->taps / 2 - 1; src -= vert_offset * src_stride; (void)filter_params_x; - (void)subpel_x_q4; + (void)subpel_x_qn; (void)conv_params; assert(conv_params->round_0 <= FILTER_BITS); @@ -618,7 +618,7 @@ void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS))); const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( - filter_params_y, subpel_y_q4 & SUBPEL_MASK); + filter_params_y, subpel_y_qn & SUBPEL_MASK); if (w <= 4) { uint8x8_t d01; @@ -844,17 +844,110 @@ void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, } } +// Horizontal filtering for convolve_2d_sr for width multiple of 8 +// Processes one row at a time +static INLINE void horiz_filter_w8_single_row( + const uint8_t *src_ptr, int src_stride, 
int16_t *dst_ptr, + const int dst_stride, int width, int height, const int16_t *x_filter, + const int16x8_t horiz_const, const int16x8_t shift_round_0) { + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7; + do { + uint8x8_t t0 = vld1_u8(src_ptr); + s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); // a0 a1 a2 a3 a4 a5 a6 a7 + + int width_tmp = width; + const uint8_t *s = src_ptr + 8; + int16_t *dst_tmp = dst_ptr; + + __builtin_prefetch(dst_ptr); + + do { + t0 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15 + s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t sum = s0; + s0 = s7; + + s1 = vextq_s16(sum, s7, 1); // a1 a2 a3 a4 a5 a6 a7 a8 + s2 = vextq_s16(sum, s7, 2); // a2 a3 a4 a5 a6 a7 a8 a9 + s3 = vextq_s16(sum, s7, 3); // a3 a4 a5 a6 a7 a8 a9 a10 + s4 = vextq_s16(sum, s7, 4); // a4 a5 a6 a7 a8 a9 a10 a11 + s5 = vextq_s16(sum, s7, 5); // a5 a6 a7 a8 a9 a10 a11 a12 + s6 = vextq_s16(sum, s7, 6); // a6 a7 a8 a9 a10 a11 a12 a13 + s7 = vextq_s16(sum, s7, 7); // a7 a8 a9 a10 a11 a12 a13 a14 + + int16x8_t res0 = convolve8_8x8_s16(sum, s1, s2, s3, s4, s5, s6, s7, + x_filter, horiz_const, shift_round_0); + + vst1q_s16(dst_tmp, res0); + + s += 8; + dst_tmp += 8; + width_tmp -= 8; + } while (width_tmp > 0); + src_ptr += src_stride; + dst_ptr += dst_stride; + height--; + } while (height > 0); +} + +// Horizontal filtering for convolve_2d_sr for width <= 4 +// Processes one row at a time +static INLINE void horiz_filter_w4_single_row( + const uint8_t *src_ptr, int src_stride, int16_t *dst_ptr, + const int dst_stride, int width, int height, const int16_t *x_filter, + const int16x4_t horiz_const, const int16x4_t shift_round_0) { + int16x4_t s0, s1, s2, s3, s4, s5, s6, s7; + do { + const uint8_t *s = src_ptr; + + __builtin_prefetch(s); + + uint8x8_t t0 = vld1_u8(s); // a0 a1 a2 a3 a4 a5 a6 a7 + int16x8_t tt0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + s0 = vget_low_s16(tt0); + s4 = vget_high_s16(tt0); + + __builtin_prefetch(dst_ptr); + s += 8; + + t0 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 
a15 + s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + + s1 = vext_s16(s0, s4, 1); // a1 a2 a3 a4 + s2 = vext_s16(s0, s4, 2); // a2 a3 a4 a5 + s3 = vext_s16(s0, s4, 3); // a3 a4 a5 a6 + s5 = vext_s16(s4, s7, 1); // a5 a6 a7 a8 + s6 = vext_s16(s4, s7, 2); // a6 a7 a8 a9 + s7 = vext_s16(s4, s7, 3); // a7 a8 a9 a10 + + int16x4_t d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter, + horiz_const, shift_round_0); + + if (width == 4) { + vst1_s16(dst_ptr, d0); + dst_ptr += dst_stride; + } else if (width == 2) { + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_s16(d0), 0); + dst_ptr += dst_stride; + } + + src_ptr += src_stride; + height--; + } while (height > 0); +} + void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, + const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params) { int im_dst_stride; int width, height; - uint8x8_t t0; #if defined(__aarch64__) + uint8x8_t t0; uint8x8_t t1, t2, t3, t4, t5, t6, t7; + const uint8_t *s; #endif DECLARE_ALIGNED(16, int16_t, @@ -867,7 +960,7 @@ void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, const int horiz_offset = filter_params_x->taps / 2 - 1; const uint8_t *src_ptr = src - vert_offset * src_stride - horiz_offset; - const uint8_t *s; + int16_t *dst_ptr; dst_ptr = im_block; @@ -880,7 +973,7 @@ void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, const int16x8_t vec_round_bits = vdupq_n_s16(-round_bits); const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( - filter_params_x, subpel_x_q4 & SUBPEL_MASK); + filter_params_x, subpel_x_qn & SUBPEL_MASK); int16_t x_filter_tmp[8]; int16x8_t filter_x_coef = vld1q_s16(x_filter); @@ -893,18 +986,14 @@ void 
av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, assert(conv_params->round_0 > 0); if (w <= 4) { - int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, d0; -#if defined(__aarch64__) - int16x4_t s8, s9, s10, d1, d2, d3; -#endif - const int16x4_t horiz_const = vdup_n_s16((1 << (bd + FILTER_BITS - 2))); const int16x4_t shift_round_0 = vdup_n_s16(-(conv_params->round_0 - 1)); +#if defined(__aarch64__) + int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3; do { + assert(height >= 4); s = src_ptr; - -#if defined(__aarch64__) __builtin_prefetch(s + 0 * src_stride); __builtin_prefetch(s + 1 * src_stride); __builtin_prefetch(s + 2 * src_stride); @@ -963,57 +1052,30 @@ void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, src_ptr += 4 * src_stride; dst_ptr += 4 * im_dst_stride; height -= 4; -#else - int16x8_t tt0; - - __builtin_prefetch(s); - - t0 = vld1_u8(s); // a0 a1 a2 a3 a4 a5 a6 a7 - tt0 = vreinterpretq_s16_u16(vmovl_u8(t0)); - s0 = vget_low_s16(tt0); - s4 = vget_high_s16(tt0); - - __builtin_prefetch(dst_ptr); - s += 8; + } while (height >= 4); - t0 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15 - s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); - - s1 = vext_s16(s0, s4, 1); // a1 a2 a3 a4 - s2 = vext_s16(s0, s4, 2); // a2 a3 a4 a5 - s3 = vext_s16(s0, s4, 3); // a3 a4 a5 a6 - s5 = vext_s16(s4, s7, 1); // a5 a6 a7 a8 - s6 = vext_s16(s4, s7, 2); // a6 a7 a8 a9 - s7 = vext_s16(s4, s7, 3); // a7 a8 a9 a10 - - d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp, - horiz_const, shift_round_0); - - if (w == 4) { - vst1_s16(dst_ptr, d0); - dst_ptr += im_dst_stride; - } else if (w == 2) { - vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_s16(d0), 0); - dst_ptr += im_dst_stride; - } - - src_ptr += src_stride; - height -= 1; -#endif - } while (height > 0); - } else { - int16_t *d_tmp; - int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, res0; -#if defined(__aarch64__) - int16x8_t s8, s9, s10, res1, res2, 
res3, res4, res5, res6, res7; - int16x8_t s11, s12, s13, s14; + if (height) { + assert(height < 4); + horiz_filter_w4_single_row(src_ptr, src_stride, dst_ptr, im_dst_stride, w, + height, x_filter_tmp, horiz_const, + shift_round_0); + } +#else + horiz_filter_w4_single_row(src_ptr, src_stride, dst_ptr, im_dst_stride, w, + height, x_filter_tmp, horiz_const, + shift_round_0); #endif + } else { const int16x8_t horiz_const = vdupq_n_s16((1 << (bd + FILTER_BITS - 2))); const int16x8_t shift_round_0 = vdupq_n_s16(-(conv_params->round_0 - 1)); #if defined(__aarch64__) + int16_t *d_tmp; + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14; + int16x8_t res0, res1, res2, res3, res4, res5, res6, res7; do { + assert(height >= 8); __builtin_prefetch(src_ptr + 0 * src_stride); __builtin_prefetch(src_ptr + 1 * src_stride); __builtin_prefetch(src_ptr + 2 * src_stride); @@ -1099,45 +1161,121 @@ void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, src_ptr += 8 * src_stride; dst_ptr += 8 * im_dst_stride; height -= 8; - } while (height > 0); -#else - do { - t0 = vld1_u8(src_ptr); - s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); // a0 a1 a2 a3 a4 a5 a6 a7 + } while (height >= 8); - width = w; - s = src_ptr + 8; - d_tmp = dst_ptr; + if (height >= 4) { + assert(height < 8); + int16x4_t reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9, + reg10, reg11, reg12, reg13, reg14; + int16x4_t d0, d1, d2, d3, d4, d5, d6, d7; + int16x8_t out0, out1, out2, out3; + + __builtin_prefetch(src_ptr + 0 * src_stride); + __builtin_prefetch(src_ptr + 1 * src_stride); + __builtin_prefetch(src_ptr + 2 * src_stride); + __builtin_prefetch(src_ptr + 3 * src_stride); + + load_u8_8x4(src_ptr, src_stride, &t0, &t1, &t2, &t3); + transpose_u8_8x4(&t0, &t1, &t2, &t3); - __builtin_prefetch(dst_ptr); + reg0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + reg1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + reg2 = 
vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + reg3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); + reg4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + reg5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + reg6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + + __builtin_prefetch(dst_ptr + 0 * dst_stride); + __builtin_prefetch(dst_ptr + 1 * dst_stride); + __builtin_prefetch(dst_ptr + 2 * dst_stride); + __builtin_prefetch(dst_ptr + 3 * dst_stride); + + s = src_ptr + 7; + d_tmp = dst_ptr; + width = w; do { - t0 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15 - s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); - int16x8_t sum = s0; - s0 = s7; + load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3); + transpose_u8_8x4(&t0, &t1, &t2, &t3); - s1 = vextq_s16(sum, s7, 1); // a1 a2 a3 a4 a5 a6 a7 a8 - s2 = vextq_s16(sum, s7, 2); // a2 a3 a4 a5 a6 a7 a8 a9 - s3 = vextq_s16(sum, s7, 3); // a3 a4 a5 a6 a7 a8 a9 a10 - s4 = vextq_s16(sum, s7, 4); // a4 a5 a6 a7 a8 a9 a10 a11 - s5 = vextq_s16(sum, s7, 5); // a5 a6 a7 a8 a9 a10 a11 a12 - s6 = vextq_s16(sum, s7, 6); // a6 a7 a8 a9 a10 a11 a12 a13 - s7 = vextq_s16(sum, s7, 7); // a7 a8 a9 a10 a11 a12 a13 a14 + reg7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + reg8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + reg9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + reg10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); + reg11 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + reg12 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + reg13 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + reg14 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); - res0 = convolve8_8x8_s16(sum, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp, - horiz_const, shift_round_0); + d0 = convolve8_4x4(reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, + x_filter_tmp); + + d1 = convolve8_4x4(reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, + x_filter_tmp); + + d2 = convolve8_4x4(reg2, reg3, reg4, reg5, reg6, 
reg7, reg8, reg9, + x_filter_tmp); + + d3 = convolve8_4x4(reg3, reg4, reg5, reg6, reg7, reg8, reg9, reg10, + x_filter_tmp); + + d4 = convolve8_4x4(reg4, reg5, reg6, reg7, reg8, reg9, reg10, reg11, + x_filter_tmp); + + d5 = convolve8_4x4(reg5, reg6, reg7, reg8, reg9, reg10, reg11, reg12, + x_filter_tmp); + + d6 = convolve8_4x4(reg6, reg7, reg8, reg9, reg10, reg11, reg12, reg13, + x_filter_tmp); + + d7 = convolve8_4x4(reg7, reg8, reg9, reg10, reg11, reg12, reg13, reg14, + x_filter_tmp); + + transpose_s16_4x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7, &out0, &out1, + &out2, &out3); - vst1q_s16(d_tmp, res0); + out0 = vaddq_s16(out0, horiz_const); + out0 = vqrshlq_s16(out0, shift_round_0); + out1 = vaddq_s16(out1, horiz_const); + out1 = vqrshlq_s16(out1, shift_round_0); + + out2 = vaddq_s16(out2, horiz_const); + out2 = vqrshlq_s16(out2, shift_round_0); + + out3 = vaddq_s16(out3, horiz_const); + out3 = vqrshlq_s16(out3, shift_round_0); + + store_s16_8x4(d_tmp, im_dst_stride, out0, out1, out2, out3); + + reg0 = reg8; + reg1 = reg9; + reg2 = reg10; + reg3 = reg11; + reg4 = reg12; + reg5 = reg13; + reg6 = reg14; s += 8; d_tmp += 8; width -= 8; } while (width > 0); - src_ptr += src_stride; - dst_ptr += im_dst_stride; - height -= 1; - } while (height > 0); + src_ptr += 4 * src_stride; + dst_ptr += 4 * im_dst_stride; + height -= 4; + } + + if (height) { + assert(height < 4); + horiz_filter_w8_single_row(src_ptr, src_stride, dst_ptr, im_stride, w, + height, x_filter_tmp, horiz_const, + shift_round_0); + } +#else + + horiz_filter_w8_single_row(src_ptr, src_stride, dst_ptr, im_stride, w, + height, x_filter_tmp, horiz_const, + shift_round_0); #endif } @@ -1149,7 +1287,7 @@ void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, const int32_t sub_const = (1 << (offset_bits - conv_params->round_1)) + (1 << (offset_bits - conv_params->round_1 - 1)); const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( - filter_params_y, subpel_y_q4 & SUBPEL_MASK); + 
filter_params_y, subpel_y_qn & SUBPEL_MASK); const int32x4_t round_shift_vec = vdupq_n_s32(-(conv_params->round_1)); const int32x4_t offset_const = vdupq_n_s32(1 << offset_bits); @@ -1409,12 +1547,12 @@ void av1_convolve_2d_copy_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, + const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params) { (void)filter_params_x; (void)filter_params_y; - (void)subpel_x_q4; - (void)subpel_y_q4; + (void)subpel_x_qn; + (void)subpel_y_qn; (void)conv_params; const uint8_t *src1; diff --git a/media/libaom/src/av1/common/arm/convolve_neon.h b/media/libaom/src/av1/common/arm/convolve_neon.h index f382984f2..dbcfab631 100644 --- a/media/libaom/src/av1/common/arm/convolve_neon.h +++ b/media/libaom/src/av1/common/arm/convolve_neon.h @@ -73,7 +73,7 @@ static INLINE uint16x8_t wiener_convolve8_horiz_8x8( int32x4_t sum_0, sum_1; int32x4_t s3_0, s3_1; const int32_t round_const_0 = (1 << (bd + FILTER_BITS - 1)); - const int32_t round_const_1 = (1 << ((bd) + 1 + FILTER_BITS - round0_bits)); + const int32_t round_const_1 = (1 << (bd + 1 + FILTER_BITS - round0_bits)) - 1; /* for the purpose of right shift by { conv_params->round_0 } */ const int32x4_t round_bits = vdupq_n_s32(-round0_bits); @@ -124,7 +124,7 @@ static INLINE uint16x4_t wiener_convolve8_horiz_4x8( int16x4_t sum, temp0, temp1, temp2; const int32_t round_const_0 = (1 << (bd + FILTER_BITS - 1)); - const int32_t round_const_1 = (1 << ((bd) + 1 + FILTER_BITS - round0_bits)); + const int32_t round_const_1 = (1 << (bd + 1 + FILTER_BITS - round0_bits)) - 1; const int32x4_t round_bits = vdupq_n_s32(-round0_bits); const int32x4_t zero = vdupq_n_s32(0); const int32x4_t round_vec_0 = vdupq_n_s32(round_const_0); diff --git a/media/libaom/src/av1/common/arm/jnt_convolve_neon.c 
b/media/libaom/src/av1/common/arm/jnt_convolve_neon.c index e5674ef7c..92112fb85 100644 --- a/media/libaom/src/av1/common/arm/jnt_convolve_neon.c +++ b/media/libaom/src/av1/common/arm/jnt_convolve_neon.c @@ -23,19 +23,17 @@ #include "av1/common/arm/transpose_neon.h" #if !defined(__aarch64__) -static INLINE void compute_avg_4x1(uint16x4_t res0, uint16x4_t d0, - const uint16_t fwd_offset, - const uint16_t bck_offset, - const int16x4_t sub_const_vec, - const int16_t round_bits, - const int use_jnt_comp_avg, uint8x8_t *t0) { +static INLINE void compute_avg_4x1( + uint16x4_t res0, uint16x4_t d0, const uint16_t fwd_offset, + const uint16_t bck_offset, const int16x4_t sub_const_vec, + const int16_t round_bits, const int use_dist_wtd_comp_avg, uint8x8_t *t0) { int16x4_t tmp0; uint16x4_t tmp_u0; uint32x4_t sum0; int32x4_t dst0; int16x8_t tmp4; - if (use_jnt_comp_avg) { + if (use_dist_wtd_comp_avg) { const int32x4_t round_bits_vec = vdupq_n_s32((int32_t)(-round_bits)); sum0 = vmull_n_u16(res0, fwd_offset); @@ -65,12 +63,10 @@ static INLINE void compute_avg_4x1(uint16x4_t res0, uint16x4_t d0, } } -static INLINE void compute_avg_8x1(uint16x8_t res0, uint16x8_t d0, - const uint16_t fwd_offset, - const uint16_t bck_offset, - const int16x4_t sub_const, - const int16_t round_bits, - const int use_jnt_comp_avg, uint8x8_t *t0) { +static INLINE void compute_avg_8x1( + uint16x8_t res0, uint16x8_t d0, const uint16_t fwd_offset, + const uint16_t bck_offset, const int16x4_t sub_const, + const int16_t round_bits, const int use_dist_wtd_comp_avg, uint8x8_t *t0) { int16x4_t tmp0, tmp2; int16x8_t f0; uint32x4_t sum0, sum2; @@ -78,7 +74,7 @@ static INLINE void compute_avg_8x1(uint16x8_t res0, uint16x8_t d0, uint16x8_t tmp_u0; - if (use_jnt_comp_avg) { + if (use_dist_wtd_comp_avg) { const int32x4_t sub_const_vec = vmovl_s16(sub_const); const int32x4_t round_bits_vec = vdupq_n_s32(-(int32_t)round_bits); @@ -123,7 +119,7 @@ static INLINE void compute_avg_4x4( uint16x4_t d0, uint16x4_t d1, 
uint16x4_t d2, uint16x4_t d3, const uint16_t fwd_offset, const uint16_t bck_offset, const int16x4_t sub_const_vec, const int16_t round_bits, - const int use_jnt_comp_avg, uint8x8_t *t0, uint8x8_t *t1) { + const int use_dist_wtd_comp_avg, uint8x8_t *t0, uint8x8_t *t1) { int16x4_t tmp0, tmp1, tmp2, tmp3; uint16x4_t tmp_u0, tmp_u1, tmp_u2, tmp_u3; uint32x4_t sum0, sum1, sum2, sum3; @@ -132,7 +128,7 @@ static INLINE void compute_avg_4x4( int16x8_t tmp4, tmp5; const int16x8_t zero = vdupq_n_s16(0); - if (use_jnt_comp_avg) { + if (use_dist_wtd_comp_avg) { const int32x4_t round_bits_vec = vdupq_n_s32((int32_t)(-round_bits)); const int32x4_t const_vec = vmovl_s16(sub_const_vec); @@ -203,8 +199,8 @@ static INLINE void compute_avg_8x4( uint16x8_t d0, uint16x8_t d1, uint16x8_t d2, uint16x8_t d3, const uint16_t fwd_offset, const uint16_t bck_offset, const int16x4_t sub_const, const int16_t round_bits, - const int use_jnt_comp_avg, uint8x8_t *t0, uint8x8_t *t1, uint8x8_t *t2, - uint8x8_t *t3) { + const int use_dist_wtd_comp_avg, uint8x8_t *t0, uint8x8_t *t1, + uint8x8_t *t2, uint8x8_t *t3) { int16x4_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; int16x8_t f0, f1, f2, f3; uint32x4_t sum0, sum1, sum2, sum3; @@ -214,7 +210,7 @@ static INLINE void compute_avg_8x4( uint16x8_t tmp_u0, tmp_u1, tmp_u2, tmp_u3; const int16x8_t zero = vdupq_n_s16(0); - if (use_jnt_comp_avg) { + if (use_dist_wtd_comp_avg) { const int32x4_t sub_const_vec = vmovl_s16(sub_const); const int32x4_t round_bits_vec = vdupq_n_s32(-(int32_t)round_bits); @@ -319,7 +315,7 @@ static INLINE void compute_avg_8x4( } } -static INLINE void jnt_convolve_2d_horiz_neon( +static INLINE void dist_wtd_convolve_2d_horiz_neon( const uint8_t *src, int src_stride, int16_t *im_block, const int im_stride, int16_t *x_filter_tmp, const int im_h, int w, const int round_0) { const int bd = 8; @@ -563,7 +559,7 @@ static INLINE void jnt_convolve_2d_horiz_neon( } } -static INLINE void jnt_convolve_2d_vert_neon( +static INLINE void 
dist_wtd_convolve_2d_vert_neon( int16_t *im_block, const int im_stride, uint8_t *dst8, int dst8_stride, ConvolveParams *conv_params, const int16_t *y_filter, int h, int w) { uint8_t *dst_u8_ptr, *d_u8; @@ -587,7 +583,7 @@ static INLINE void jnt_convolve_2d_vert_neon( const uint16_t fwd_offset = conv_params->fwd_offset; const uint16_t bck_offset = conv_params->bck_offset; const int do_average = conv_params->do_average; - const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; int16x4_t s0, s1, s2, s3, s4, s5, s6, s7; uint16x4_t res4, d0; @@ -652,8 +648,8 @@ static INLINE void jnt_convolve_2d_vert_neon( d += (dst_stride << 2); compute_avg_4x4(res4, res5, res6, res7, d0, d1, d2, d3, fwd_offset, - bck_offset, sub_const_vec, round_bits, use_jnt_comp_avg, - &t0, &t1); + bck_offset, sub_const_vec, round_bits, + use_dist_wtd_comp_avg, &t0, &t1); vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), 0); d_u8 += dst8_stride; @@ -691,7 +687,7 @@ static INLINE void jnt_convolve_2d_vert_neon( d += (dst_stride); compute_avg_4x1(res4, d0, fwd_offset, bck_offset, sub_const_vec, - round_bits, use_jnt_comp_avg, &t0); + round_bits, use_dist_wtd_comp_avg, &t0); vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), 0); d_u8 += dst8_stride; @@ -717,12 +713,12 @@ static INLINE void jnt_convolve_2d_vert_neon( } while (w > 0); } -void av1_jnt_convolve_2d_neon(const uint8_t *src, int src_stride, uint8_t *dst8, - int dst8_stride, int w, int h, - const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, - ConvolveParams *conv_params) { +void av1_dist_wtd_convolve_2d_neon(const uint8_t *src, int src_stride, + uint8_t *dst8, int dst8_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params) { assert(!(w % 
4)); assert(!(h % 4)); @@ -736,9 +732,9 @@ void av1_jnt_convolve_2d_neon(const uint8_t *src, int src_stride, uint8_t *dst8, const int round_0 = conv_params->round_0 - 1; const uint8_t *src_ptr = src - vert_offset * src_stride - horiz_offset; const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( - filter_params_x, subpel_x_q4 & SUBPEL_MASK); + filter_params_x, subpel_x_qn & SUBPEL_MASK); const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( - filter_params_y, subpel_y_q4 & SUBPEL_MASK); + filter_params_y, subpel_y_qn & SUBPEL_MASK); int16_t x_filter_tmp[8]; int16x8_t filter_x_coef = vld1q_s16(x_filter); @@ -748,19 +744,18 @@ void av1_jnt_convolve_2d_neon(const uint8_t *src, int src_stride, uint8_t *dst8, filter_x_coef = vshrq_n_s16(filter_x_coef, 1); vst1q_s16(&x_filter_tmp[0], filter_x_coef); - jnt_convolve_2d_horiz_neon(src_ptr, src_stride, im_block, im_stride, - x_filter_tmp, im_h, w, round_0); + dist_wtd_convolve_2d_horiz_neon(src_ptr, src_stride, im_block, im_stride, + x_filter_tmp, im_h, w, round_0); - jnt_convolve_2d_vert_neon(im_block, im_stride, dst8, dst8_stride, conv_params, - y_filter, h, w); + dist_wtd_convolve_2d_vert_neon(im_block, im_stride, dst8, dst8_stride, + conv_params, y_filter, h, w); } -void av1_jnt_convolve_2d_copy_neon(const uint8_t *src, int src_stride, - uint8_t *dst8, int dst8_stride, int w, int h, - const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, - ConvolveParams *conv_params) { +void av1_dist_wtd_convolve_2d_copy_neon( + const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params) { uint8x8_t res0_8, res1_8, res2_8, res3_8, tmp_shift0, tmp_shift1, tmp_shift2, tmp_shift3; uint16x8_t res_q0, res_q1, res_q2, res_q3, tmp_q0, tmp_q1, tmp_q2, tmp_q3; @@ 
-783,8 +778,8 @@ void av1_jnt_convolve_2d_copy_neon(const uint8_t *src, int src_stride, (void)filter_params_x; (void)filter_params_y; - (void)subpel_x_q4; - (void)subpel_y_q4; + (void)subpel_x_qn; + (void)subpel_y_qn; if (!(w & 0x07)) { for (y = 0; y < (h >> 2); ++y) { @@ -811,7 +806,7 @@ void av1_jnt_convolve_2d_copy_neon(const uint8_t *src, int src_stride, compute_avg_8x4(tmp_q0, tmp_q1, tmp_q2, tmp_q3, res_q0, res_q1, res_q2, res_q3, conv_params->fwd_offset, conv_params->bck_offset, sub_const_vec, bits, - conv_params->use_jnt_comp_avg, &tmp_shift0, + conv_params->use_dist_wtd_comp_avg, &tmp_shift0, &tmp_shift1, &tmp_shift2, &tmp_shift3); vst1_u8(dst8_1 + (0 * dst8_stride), tmp_shift0); @@ -854,7 +849,7 @@ void av1_jnt_convolve_2d_copy_neon(const uint8_t *src, int src_stride, compute_avg_4x4(tmp4, tmp5, tmp6, tmp7, res4, res5, res6, res7, conv_params->fwd_offset, conv_params->bck_offset, - sub_const_vec, bits, conv_params->use_jnt_comp_avg, + sub_const_vec, bits, conv_params->use_dist_wtd_comp_avg, &tmp_shift0, &tmp_shift1); vst1_lane_u32((uint32_t *)(dst8_1), vreinterpret_u32_u8(tmp_shift0), 0); @@ -881,12 +876,12 @@ void av1_jnt_convolve_2d_copy_neon(const uint8_t *src, int src_stride, } } -void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst8, - int dst8_stride, int w, int h, - const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, - ConvolveParams *conv_params) { +void av1_dist_wtd_convolve_x_neon(const uint8_t *src, int src_stride, + uint8_t *dst8, int dst8_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params) { assert(!(w % 4)); assert(!(h % 4)); @@ -902,14 +897,14 @@ void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst8, 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; const 
uint16_t fwd_offset = conv_params->fwd_offset; const uint16_t bck_offset = conv_params->bck_offset; - const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; (void)filter_params_y; - (void)subpel_y_q4; + (void)subpel_y_qn; // horizontal filter const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( - filter_params_x, subpel_x_q4 & SUBPEL_MASK); + filter_params_x, subpel_x_qn & SUBPEL_MASK); const uint8_t *src_ptr = src - horiz_offset; @@ -1031,8 +1026,8 @@ void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst8, compute_avg_4x4(res4, res5, res6, res7, vreinterpret_u16_s16(d0), vreinterpret_u16_s16(d1), vreinterpret_u16_s16(d2), vreinterpret_u16_s16(d3), fwd_offset, bck_offset, - round_offset_vec, round_bits, use_jnt_comp_avg, &t0, - &t1); + round_offset_vec, round_bits, use_dist_wtd_comp_avg, + &t0, &t1); vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), 0); // 00 01 02 03 @@ -1103,7 +1098,7 @@ void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst8, compute_avg_4x1(res4, vreinterpret_u16_s16(d0), fwd_offset, bck_offset, round_offset_vec, round_bits, - use_jnt_comp_avg, &t0); + use_dist_wtd_comp_avg, &t0); vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), 0); // 00 01 02 03 @@ -1231,11 +1226,12 @@ void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst8, load_u16_8x4(d_tmp, dst_stride, &res8, &res9, &res10, &res11); d_tmp += (dst_stride << 2); - compute_avg_8x4( - res8, res9, res10, res11, vreinterpretq_u16_s16(res0), - vreinterpretq_u16_s16(res1), vreinterpretq_u16_s16(res2), - vreinterpretq_u16_s16(res3), fwd_offset, bck_offset, - round_offset64, round_bits, use_jnt_comp_avg, &t0, &t1, &t2, &t3); + compute_avg_8x4(res8, res9, res10, res11, vreinterpretq_u16_s16(res0), + vreinterpretq_u16_s16(res1), + vreinterpretq_u16_s16(res2), + vreinterpretq_u16_s16(res3), fwd_offset, bck_offset, + round_offset64, 
round_bits, use_dist_wtd_comp_avg, + &t0, &t1, &t2, &t3); store_u8_8x4(d_u8, dst8_stride, t0, t1, t2, t3); d_u8 += (dst8_stride << 2); @@ -1243,11 +1239,12 @@ void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst8, load_u16_8x4(d_tmp, dst_stride, &res8, &res9, &res10, &res11); d_tmp += (dst_stride << 2); - compute_avg_8x4( - res8, res9, res10, res11, vreinterpretq_u16_s16(res4), - vreinterpretq_u16_s16(res5), vreinterpretq_u16_s16(res6), - vreinterpretq_u16_s16(res7), fwd_offset, bck_offset, - round_offset64, round_bits, use_jnt_comp_avg, &t0, &t1, &t2, &t3); + compute_avg_8x4(res8, res9, res10, res11, vreinterpretq_u16_s16(res4), + vreinterpretq_u16_s16(res5), + vreinterpretq_u16_s16(res6), + vreinterpretq_u16_s16(res7), fwd_offset, bck_offset, + round_offset64, round_bits, use_dist_wtd_comp_avg, + &t0, &t1, &t2, &t3); store_u8_8x4(d_u8, dst8_stride, t0, t1, t2, t3); d_u8 += (dst8_stride << 2); @@ -1319,7 +1316,7 @@ void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst8, compute_avg_8x1(res8, vreinterpretq_u16_s16(res0), fwd_offset, bck_offset, round_offset64, round_bits, - use_jnt_comp_avg, &t0); + use_dist_wtd_comp_avg, &t0); vst1_u8(d_u8, t0); d_u8 += (dst8_stride); @@ -1342,12 +1339,12 @@ void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst8, } } -void av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst8, - int dst8_stride, int w, int h, - const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, - ConvolveParams *conv_params) { +void av1_dist_wtd_convolve_y_neon(const uint8_t *src, int src_stride, + uint8_t *dst8, int dst8_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params) { assert(!(w % 4)); assert(!(h % 4)); @@ -1363,15 +1360,15 @@ void 
av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst8, 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; const uint16_t fwd_offset = conv_params->fwd_offset; const uint16_t bck_offset = conv_params->bck_offset; - const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; const int shift_value = (conv_params->round_1 - 1 - bits); (void)filter_params_x; - (void)subpel_x_q4; + (void)subpel_x_qn; // vertical filter const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( - filter_params_y, subpel_y_q4 & SUBPEL_MASK); + filter_params_y, subpel_y_qn & SUBPEL_MASK); const uint8_t *src_ptr = src - (vert_offset * src_stride); @@ -1489,8 +1486,8 @@ void av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst8, compute_avg_4x4(res4, res5, res6, res7, vreinterpret_u16_s16(d0), vreinterpret_u16_s16(d1), vreinterpret_u16_s16(d2), vreinterpret_u16_s16(d3), fwd_offset, bck_offset, - round_offset64, round_bits, use_jnt_comp_avg, &t0, - &t1); + round_offset64, round_bits, use_dist_wtd_comp_avg, + &t0, &t1); vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), 0); d_u8 += dst8_stride; @@ -1535,7 +1532,7 @@ void av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst8, compute_avg_4x1(res4, vreinterpret_u16_s16(d0), fwd_offset, bck_offset, round_offset64, round_bits, - use_jnt_comp_avg, &t0); + use_dist_wtd_comp_avg, &t0); vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), 0); d_u8 += dst8_stride; @@ -1654,11 +1651,12 @@ void av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst8, load_u16_8x4(d_tmp, dst_stride, &res8, &res9, &res10, &res11); d_tmp += (dst_stride << 2); - compute_avg_8x4( - res8, res9, res10, res11, vreinterpretq_u16_s16(res0), - vreinterpretq_u16_s16(res1), vreinterpretq_u16_s16(res2), - vreinterpretq_u16_s16(res3), fwd_offset, bck_offset, - round_offset64, round_bits, use_jnt_comp_avg, &t0, 
&t1, &t2, &t3); + compute_avg_8x4(res8, res9, res10, res11, vreinterpretq_u16_s16(res0), + vreinterpretq_u16_s16(res1), + vreinterpretq_u16_s16(res2), + vreinterpretq_u16_s16(res3), fwd_offset, bck_offset, + round_offset64, round_bits, use_dist_wtd_comp_avg, + &t0, &t1, &t2, &t3); store_u8_8x4(d_u8, dst8_stride, t0, t1, t2, t3); d_u8 += (dst8_stride << 2); @@ -1666,11 +1664,12 @@ void av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst8, load_u16_8x4(d_tmp, dst_stride, &res8, &res9, &res10, &res11); d_tmp += (dst_stride << 2); - compute_avg_8x4( - res8, res9, res10, res11, vreinterpretq_u16_s16(res4), - vreinterpretq_u16_s16(res5), vreinterpretq_u16_s16(res6), - vreinterpretq_u16_s16(res7), fwd_offset, bck_offset, - round_offset64, round_bits, use_jnt_comp_avg, &t0, &t1, &t2, &t3); + compute_avg_8x4(res8, res9, res10, res11, vreinterpretq_u16_s16(res4), + vreinterpretq_u16_s16(res5), + vreinterpretq_u16_s16(res6), + vreinterpretq_u16_s16(res7), fwd_offset, bck_offset, + round_offset64, round_bits, use_dist_wtd_comp_avg, + &t0, &t1, &t2, &t3); store_u8_8x4(d_u8, dst8_stride, t0, t1, t2, t3); d_u8 += (dst8_stride << 2); @@ -1718,7 +1717,7 @@ void av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst8, compute_avg_8x1(res8, vreinterpretq_u16_s16(res0), fwd_offset, bck_offset, round_offset64, round_bits, - use_jnt_comp_avg, &t0); + use_dist_wtd_comp_avg, &t0); vst1_u8(d_u8, t0); d_u8 += (dst8_stride); diff --git a/media/libaom/src/av1/common/arm/mem_neon.h b/media/libaom/src/av1/common/arm/mem_neon.h index c4ae2e784..171055fe1 100644 --- a/media/libaom/src/av1/common/arm/mem_neon.h +++ b/media/libaom/src/av1/common/arm/mem_neon.h @@ -13,6 +13,7 @@ #include <arm_neon.h> #include <string.h> +#include "aom_dsp/aom_dsp_common.h" static INLINE void store_row2_u8_8x8(uint8_t *s, int p, const uint8x8_t s0, const uint8x8_t s1) { @@ -315,6 +316,26 @@ static INLINE void load_s16_8x4(const int16_t *s, ptrdiff_t p, *s3 = vld1q_s16(s); } +// 
Load 4 sets of 4 bytes when alignment is not guaranteed. +static INLINE uint8x16_t load_unaligned_u8q(const uint8_t *buf, int stride) { + uint32_t a; + uint32x4_t a_u32 = vdupq_n_u32(0); + if (stride == 4) return vld1q_u8(buf); + memcpy(&a, buf, 4); + buf += stride; + a_u32 = vsetq_lane_u32(a, a_u32, 0); + memcpy(&a, buf, 4); + buf += stride; + a_u32 = vsetq_lane_u32(a, a_u32, 1); + memcpy(&a, buf, 4); + buf += stride; + a_u32 = vsetq_lane_u32(a, a_u32, 2); + memcpy(&a, buf, 4); + buf += stride; + a_u32 = vsetq_lane_u32(a, a_u32, 3); + return vreinterpretq_u8_u32(a_u32); +} + static INLINE void load_unaligned_u8_4x8(const uint8_t *buf, int stride, uint32x2_t *tu0, uint32x2_t *tu1, uint32x2_t *tu2, uint32x2_t *tu3) { @@ -383,6 +404,15 @@ static INLINE void load_unaligned_u8_4x2(const uint8_t *buf, int stride, *tu0 = vset_lane_u32(a, *tu0, 1); } +/* These intrinsics require immediate values, so we must use #defines + to enforce that. */ +#define store_unaligned_u8_4x1(dst, src, lane) \ + do { \ + uint32_t a; \ + a = vget_lane_u32(vreinterpret_u32_u8(src), lane); \ + memcpy(dst, &a, 4); \ + } while (0) + static INLINE void load_unaligned_u8_2x2(const uint8_t *buf, int stride, uint16x4_t *tu0) { uint16_t a; @@ -491,4 +521,19 @@ static INLINE void store_u32_4x4(uint32_t *s, int32_t p, uint32x4_t s1, vst1q_u32(s, s4); } +static INLINE int16x8_t load_tran_low_to_s16q(const tran_low_t *buf) { + const int32x4_t v0 = vld1q_s32(buf); + const int32x4_t v1 = vld1q_s32(buf + 4); + const int16x4_t s0 = vmovn_s32(v0); + const int16x4_t s1 = vmovn_s32(v1); + return vcombine_s16(s0, s1); +} + +static INLINE void store_s16q_to_tran_low(tran_low_t *buf, const int16x8_t a) { + const int32x4_t v0 = vmovl_s16(vget_low_s16(a)); + const int32x4_t v1 = vmovl_s16(vget_high_s16(a)); + vst1q_s32(buf, v0); + vst1q_s32(buf + 4, v1); +} + #endif // AOM_AV1_COMMON_ARM_MEM_NEON_H_ diff --git a/media/libaom/src/av1/common/arm/selfguided_neon.c b/media/libaom/src/av1/common/arm/selfguided_neon.c 
index b3a37c4cb..fc404a64a 100644 --- a/media/libaom/src/av1/common/arm/selfguided_neon.c +++ b/media/libaom/src/av1/common/arm/selfguided_neon.c @@ -19,8 +19,8 @@ #include "aom_dsp/txfm_common.h" #include "aom_mem/aom_mem.h" #include "aom_ports/mem.h" +#include "av1/common/av1_common_int.h" #include "av1/common/common.h" -#include "av1/common/onyxc_int.h" #include "av1/common/resize.h" #include "av1/common/restoration.h" #include "av1/common/arm/mem_neon.h" @@ -86,7 +86,7 @@ static INLINE void calc_ab_fast_internal_common( for (int x = 0; x < 4; x++) { for (int y = 0; y < 4; y++) { - dst_A16[x * buf_stride + y] = x_by_xplus1[src1[x * buf_stride + y]]; + dst_A16[x * buf_stride + y] = av1_x_by_xplus1[src1[x * buf_stride + y]]; } } load_u16_4x4(dst_A16, buf_stride, &d0, &d1, &d2, &d3); @@ -214,7 +214,7 @@ static INLINE void calc_ab_internal_common( for (int x = 0; x < 4; x++) { for (int y = 0; y < 8; y++) { - dst_A16[x * buf_stride + y] = x_by_xplus1[src1[x * buf_stride + y]]; + dst_A16[x * buf_stride + y] = av1_x_by_xplus1[src1[x * buf_stride + y]]; } } load_u16_8x4(dst_A16, buf_stride, &s16_4, &s16_5, &s16_6, &s16_7); @@ -376,6 +376,21 @@ static INLINE void boxsum2(int16_t *src, const int src_stride, int16_t *dst16, w -= 8; count++; } while (w > 0); + + // memset needed for row pixels as 2nd stage of boxsum filter uses + // first 2 rows of dst16, dst2 buffer which is not filled in first stage. + for (int x = 0; x < 2; x++) { + memset(dst16 + x * dst_stride, 0, (width + 4) * sizeof(*dst16)); + memset(dst2 + x * dst_stride, 0, (width + 4) * sizeof(*dst2)); + } + + // memset needed for extra columns as 2nd stage of boxsum filter uses + // last 2 columns of dst16, dst2 buffer which is not filled in first stage. 
+ for (int x = 2; x < height + 2; x++) { + int dst_offset = x * dst_stride + width + 2; + memset(dst16 + dst_offset, 0, 3 * sizeof(*dst16)); + memset(dst2 + dst_offset, 0, 3 * sizeof(*dst2)); + } } { @@ -467,7 +482,7 @@ static INLINE void calc_ab_internal_lbd(int32_t *A, uint16_t *A16, const uint32_t n = (2 * r + 1) * (2 * r + 1); const uint32x4_t const_n_val = vdupq_n_u32(n); const uint16x8_t sgrproj_sgr = vdupq_n_u16(SGRPROJ_SGR); - const uint16x4_t one_by_n_minus_1_vec = vdup_n_u16(one_by_x[n - 1]); + const uint16x4_t one_by_n_minus_1_vec = vdup_n_u16(av1_one_by_x[n - 1]); const uint32x4_t const_val = vdupq_n_u32(255); uint16x8_t s16_0, s16_1, s16_2, s16_3, s16_4, s16_5, s16_6, s16_7; @@ -509,6 +524,7 @@ static INLINE void calc_ab_internal_lbd(int32_t *A, uint16_t *A16, } while (h > 0); } +#if CONFIG_AV1_HIGHBITDEPTH static INLINE void calc_ab_internal_hbd(int32_t *A, uint16_t *A16, uint16_t *B16, int32_t *B, const int buf_stride, const int width, @@ -522,7 +538,7 @@ static INLINE void calc_ab_internal_hbd(int32_t *A, uint16_t *A16, const int32x4_t bd_min_1_vec = vdupq_n_s32(-((bit_depth - 8) << 1)); const uint32x4_t const_n_val = vdupq_n_u32(n); const uint16x8_t sgrproj_sgr = vdupq_n_u16(SGRPROJ_SGR); - const uint16x4_t one_by_n_minus_1_vec = vdup_n_u16(one_by_x[n - 1]); + const uint16x4_t one_by_n_minus_1_vec = vdup_n_u16(av1_one_by_x[n - 1]); const uint32x4_t const_val = vdupq_n_u32(255); int32x4_t sr0, sr1, sr2, sr3, sr4, sr5, sr6, sr7; @@ -573,6 +589,7 @@ static INLINE void calc_ab_internal_hbd(int32_t *A, uint16_t *A16, h -= (ht_inc * 4); } while (h > 0); } +#endif // CONFIG_AV1_HIGHBITDEPTH static INLINE void calc_ab_fast_internal_lbd(int32_t *A, uint16_t *A16, int32_t *B, const int buf_stride, @@ -584,7 +601,7 @@ static INLINE void calc_ab_fast_internal_lbd(int32_t *A, uint16_t *A16, const uint32_t n = (2 * r + 1) * (2 * r + 1); const uint32x4_t const_n_val = vdupq_n_u32(n); const uint16x4_t sgrproj_sgr = vdup_n_u16(SGRPROJ_SGR); - const uint32x4_t 
one_by_n_minus_1_vec = vdupq_n_u32(one_by_x[n - 1]); + const uint32x4_t one_by_n_minus_1_vec = vdupq_n_u32(av1_one_by_x[n - 1]); const uint32x4_t const_val = vdupq_n_u32(255); int32x4_t sr0, sr1, sr2, sr3, sr4, sr5, sr6, sr7; @@ -626,6 +643,7 @@ static INLINE void calc_ab_fast_internal_lbd(int32_t *A, uint16_t *A16, } while (h > 0); } +#if CONFIG_AV1_HIGHBITDEPTH static INLINE void calc_ab_fast_internal_hbd(int32_t *A, uint16_t *A16, int32_t *B, const int buf_stride, const int width, const int height, @@ -638,7 +656,7 @@ static INLINE void calc_ab_fast_internal_hbd(int32_t *A, uint16_t *A16, const int32x4_t bd_min_1_vec = vdupq_n_s32(-((bit_depth - 8) << 1)); const uint32x4_t const_n_val = vdupq_n_u32(n); const uint16x4_t sgrproj_sgr = vdup_n_u16(SGRPROJ_SGR); - const uint32x4_t one_by_n_minus_1_vec = vdupq_n_u32(one_by_x[n - 1]); + const uint32x4_t one_by_n_minus_1_vec = vdupq_n_u32(av1_one_by_x[n - 1]); const uint32x4_t const_val = vdupq_n_u32(255); int32x4_t sr0, sr1, sr2, sr3, sr4, sr5, sr6, sr7; @@ -679,6 +697,7 @@ static INLINE void calc_ab_fast_internal_hbd(int32_t *A, uint16_t *A16, h -= (ht_inc * 4); } while (h > 0); } +#endif // CONFIG_AV1_HIGHBITDEPTH static INLINE void boxsum1(int16_t *src, const int src_stride, uint16_t *dst1, int32_t *dst2, const int dst_stride, const int width, @@ -788,6 +807,21 @@ static INLINE void boxsum1(int16_t *src, const int src_stride, uint16_t *dst1, w -= 8; count++; } while (w > 0); + + // memset needed for row pixels as 2nd stage of boxsum filter uses + // first 2 rows of dst1, dst2 buffer which is not filled in first stage. + for (int x = 0; x < 2; x++) { + memset(dst1 + x * dst_stride, 0, (width + 4) * sizeof(*dst1)); + memset(dst2 + x * dst_stride, 0, (width + 4) * sizeof(*dst2)); + } + + // memset needed for extra columns as 2nd stage of boxsum filter uses + // last 2 columns of dst1, dst2 buffer which is not filled in first stage. 
+ for (int x = 2; x < height + 2; x++) { + int dst_offset = x * dst_stride + width + 2; + memset(dst1 + dst_offset, 0, 3 * sizeof(*dst1)); + memset(dst2 + dst_offset, 0, 3 * sizeof(*dst2)); + } } { @@ -1145,7 +1179,7 @@ static INLINE void restoration_fast_internal(uint16_t *dgd16, int width, int32_t *dst, int dst_stride, int bit_depth, int sgr_params_idx, int radius_idx) { - const sgr_params_type *const params = &sgr_params[sgr_params_idx]; + const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx]; const int r = params->r[radius_idx]; const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ; const int height_ext = height + 2 * SGRPROJ_BORDER_VERT; @@ -1181,17 +1215,25 @@ static INLINE void restoration_fast_internal(uint16_t *dgd16, int width, // Calculation of a, b. a output is in 16bit tmp_buf which is in range of // [1, 256] for all bit depths. b output is kept in 32bit buffer. - if (8 == bit_depth) { - calc_ab_fast_internal_lbd( - (square_sum_buf - buf_stride - 1), (tmp16_buf - buf_stride - 1), - (sum_buf - buf_stride - 1), buf_stride * 2, width + 2, height + 2, r, - params->s[radius_idx], 2); - } else { +#if CONFIG_AV1_HIGHBITDEPTH + if (bit_depth > 8) { calc_ab_fast_internal_hbd( (square_sum_buf - buf_stride - 1), (tmp16_buf - buf_stride - 1), (sum_buf - buf_stride - 1), buf_stride * 2, width + 2, height + 2, bit_depth, r, params->s[radius_idx], 2); + } else { + calc_ab_fast_internal_lbd( + (square_sum_buf - buf_stride - 1), (tmp16_buf - buf_stride - 1), + (sum_buf - buf_stride - 1), buf_stride * 2, width + 2, height + 2, r, + params->s[radius_idx], 2); } +#else + (void)bit_depth; + calc_ab_fast_internal_lbd((square_sum_buf - buf_stride - 1), + (tmp16_buf - buf_stride - 1), + (sum_buf - buf_stride - 1), buf_stride * 2, + width + 2, height + 2, r, params->s[radius_idx], 2); +#endif final_filter_fast_internal(tmp16_buf, sum_buf, buf_stride, (int16_t *)dgd16, dgd_stride, dst, dst_stride, width, height); } @@ -1200,7 +1242,7 @@ static INLINE void 
restoration_internal(uint16_t *dgd16, int width, int height, int dgd_stride, int32_t *dst, int dst_stride, int bit_depth, int sgr_params_idx, int radius_idx) { - const sgr_params_type *const params = &sgr_params[sgr_params_idx]; + const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx]; const int r = params->r[radius_idx]; const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ; const int height_ext = height + 2 * SGRPROJ_BORDER_VERT; @@ -1235,19 +1277,27 @@ static INLINE void restoration_internal(uint16_t *dgd16, int width, int height, A16 += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ; sum_buf += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ; +#if CONFIG_AV1_HIGHBITDEPTH // Calculation of a, b. a output is in 16bit tmp_buf which is in range of // [1, 256] for all bit depths. b output is kept in 32bit buffer. - if (8 == bit_depth) { - calc_ab_internal_lbd((square_sum_buf - buf_stride - 1), + if (bit_depth > 8) { + calc_ab_internal_hbd((square_sum_buf - buf_stride - 1), (A16 - buf_stride - 1), (sum_buf - buf_stride - 1), (B - buf_stride - 1), buf_stride, width + 2, - height + 2, r, params->s[radius_idx], 1); + height + 2, bit_depth, r, params->s[radius_idx], 1); } else { - calc_ab_internal_hbd((square_sum_buf - buf_stride - 1), + calc_ab_internal_lbd((square_sum_buf - buf_stride - 1), (A16 - buf_stride - 1), (sum_buf - buf_stride - 1), (B - buf_stride - 1), buf_stride, width + 2, - height + 2, bit_depth, r, params->s[radius_idx], 1); + height + 2, r, params->s[radius_idx], 1); } +#else + (void)bit_depth; + calc_ab_internal_lbd((square_sum_buf - buf_stride - 1), + (A16 - buf_stride - 1), (sum_buf - buf_stride - 1), + (B - buf_stride - 1), buf_stride, width + 2, height + 2, + r, params->s[radius_idx], 1); +#endif final_filter_internal(A16, B, buf_stride, (int16_t *)dgd16, dgd_stride, dst, dst_stride, width, height); } @@ -1299,8 +1349,14 @@ static INLINE void src_convert_u8_to_u16(const uint8_t *src, dst_ptr[y + x * dst_stride] = 
src_ptr[y + x * src_stride]; } } + + // memset uninitialized rows of src buffer as they are needed for the + // boxsum filter calculation. + for (int x = height; x < height + 5; x++) + memset(dst + x * dst_stride, 0, (width + 2) * sizeof(*dst)); } +#if CONFIG_AV1_HIGHBITDEPTH static INLINE void src_convert_hbd_copy(const uint16_t *src, int src_stride, uint16_t *dst, const int dst_stride, int width, int height) { @@ -1339,13 +1395,18 @@ static INLINE void src_convert_hbd_copy(const uint16_t *src, int src_stride, memcpy((dst_ptr + x * dst_stride), (src_ptr + x * src_stride), sizeof(uint16_t) * width); } + // memset uninitialized rows of src buffer as they are needed for the + // boxsum filter calculation. + for (int x = height; x < height + 5; x++) + memset(dst + x * dst_stride, 0, (width + 2) * sizeof(*dst)); } +#endif // CONFIG_AV1_HIGHBITDEPTH int av1_selfguided_restoration_neon(const uint8_t *dat8, int width, int height, int stride, int32_t *flt0, int32_t *flt1, int flt_stride, int sgr_params_idx, int bit_depth, int highbd) { - const sgr_params_type *const params = &sgr_params[sgr_params_idx]; + const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx]; assert(!(params->r[0] == 0 && params->r[1] == 0)); uint16_t dgd16_[RESTORATION_PROC_UNIT_PELS]; @@ -1356,6 +1417,7 @@ int av1_selfguided_restoration_neon(const uint8_t *dat8, int width, int height, const int height_ext = height + 2 * SGRPROJ_BORDER_VERT; const int dgd_stride = stride; +#if CONFIG_AV1_HIGHBITDEPTH if (highbd) { const uint16_t *dgd16_tmp = CONVERT_TO_SHORTPTR(dat8); src_convert_hbd_copy( @@ -1370,6 +1432,13 @@ int av1_selfguided_restoration_neon(const uint8_t *dat8, int width, int height, dgd16 - SGRPROJ_BORDER_VERT * dgd16_stride - SGRPROJ_BORDER_HORZ, dgd16_stride, width_ext, height_ext); } +#else + (void)highbd; + src_convert_u8_to_u16( + dat8 - SGRPROJ_BORDER_VERT * dgd_stride - SGRPROJ_BORDER_HORZ, dgd_stride, + dgd16 - SGRPROJ_BORDER_VERT * dgd16_stride - SGRPROJ_BORDER_HORZ, + 
dgd16_stride, width_ext, height_ext); +#endif if (params->r[0] > 0) restoration_fast_internal(dgd16, width, height, dgd16_stride, flt0, @@ -1380,11 +1449,11 @@ int av1_selfguided_restoration_neon(const uint8_t *dat8, int width, int height, return 0; } -void apply_selfguided_restoration_neon(const uint8_t *dat8, int width, - int height, int stride, int eps, - const int *xqd, uint8_t *dst8, - int dst_stride, int32_t *tmpbuf, - int bit_depth, int highbd) { +void av1_apply_selfguided_restoration_neon(const uint8_t *dat8, int width, + int height, int stride, int eps, + const int *xqd, uint8_t *dst8, + int dst_stride, int32_t *tmpbuf, + int bit_depth, int highbd) { int32_t *flt0 = tmpbuf; int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX; assert(width * height <= RESTORATION_UNITPELS_MAX); @@ -1395,11 +1464,12 @@ void apply_selfguided_restoration_neon(const uint8_t *dat8, int width, const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ; const int height_ext = height + 2 * SGRPROJ_BORDER_VERT; const int dgd_stride = stride; - const sgr_params_type *const params = &sgr_params[eps]; + const sgr_params_type *const params = &av1_sgr_params[eps]; int xq[2]; assert(!(params->r[0] == 0 && params->r[1] == 0)); +#if CONFIG_AV1_HIGHBITDEPTH if (highbd) { const uint16_t *dgd16_tmp = CONVERT_TO_SHORTPTR(dat8); src_convert_hbd_copy( @@ -1414,7 +1484,13 @@ void apply_selfguided_restoration_neon(const uint8_t *dat8, int width, dgd16 - SGRPROJ_BORDER_VERT * dgd16_stride - SGRPROJ_BORDER_HORZ, dgd16_stride, width_ext, height_ext); } - +#else + (void)highbd; + src_convert_u8_to_u16( + dat8 - SGRPROJ_BORDER_VERT * dgd_stride - SGRPROJ_BORDER_HORZ, dgd_stride, + dgd16 - SGRPROJ_BORDER_VERT * dgd16_stride - SGRPROJ_BORDER_HORZ, + dgd16_stride, width_ext, height_ext); +#endif if (params->r[0] > 0) restoration_fast_internal(dgd16, width, height, dgd16_stride, flt0, width, bit_depth, eps, 0); @@ -1422,7 +1498,7 @@ void apply_selfguided_restoration_neon(const uint8_t *dat8, int width, 
restoration_internal(dgd16, width, height, dgd16_stride, flt1, width, bit_depth, eps, 1); - decode_xq(xqd, xq, params); + av1_decode_xq(xqd, xq, params); { int16_t *src_ptr; @@ -1485,6 +1561,7 @@ void apply_selfguided_restoration_neon(const uint8_t *dat8, int width, r4 = vreinterpretq_u16_s16(vmaxq_s16(r0, zero)); +#if CONFIG_AV1_HIGHBITDEPTH if (highbd) { r4 = vminq_u16(r4, max); vst1q_u16(dst16_ptr, r4); @@ -1492,6 +1569,11 @@ void apply_selfguided_restoration_neon(const uint8_t *dat8, int width, t0 = vqmovn_u16(r4); vst1_u8(dst_ptr, t0); } +#else + (void)max; + t0 = vqmovn_u16(r4); + vst1_u8(dst_ptr, t0); +#endif w -= 8; count += 8; dst_ptr += 8; diff --git a/media/libaom/src/av1/common/arm/transpose_neon.h b/media/libaom/src/av1/common/arm/transpose_neon.h index 8a3d9f07f..91d89b43f 100644 --- a/media/libaom/src/av1/common/arm/transpose_neon.h +++ b/media/libaom/src/av1/common/arm/transpose_neon.h @@ -250,6 +250,71 @@ static INLINE void transpose_u16_4x8(uint16x4_t *a0, uint16x4_t *a1, vreinterpret_u16_u32(c3.val[1])); } +static INLINE void transpose_s16_4x8(int16x4_t *a0, int16x4_t *a1, + int16x4_t *a2, int16x4_t *a3, + int16x4_t *a4, int16x4_t *a5, + int16x4_t *a6, int16x4_t *a7, + int16x8_t *o0, int16x8_t *o1, + int16x8_t *o2, int16x8_t *o3) { + // Swap 16 bit elements. 
Goes from: + // a0: 00 01 02 03 + // a1: 10 11 12 13 + // a2: 20 21 22 23 + // a3: 30 31 32 33 + // a4: 40 41 42 43 + // a5: 50 51 52 53 + // a6: 60 61 62 63 + // a7: 70 71 72 73 + // to: + // b0.val[0]: 00 10 02 12 + // b0.val[1]: 01 11 03 13 + // b1.val[0]: 20 30 22 32 + // b1.val[1]: 21 31 23 33 + // b2.val[0]: 40 50 42 52 + // b2.val[1]: 41 51 43 53 + // b3.val[0]: 60 70 62 72 + // b3.val[1]: 61 71 63 73 + + int16x4x2_t b0 = vtrn_s16(*a0, *a1); + int16x4x2_t b1 = vtrn_s16(*a2, *a3); + int16x4x2_t b2 = vtrn_s16(*a4, *a5); + int16x4x2_t b3 = vtrn_s16(*a6, *a7); + + // Swap 32 bit elements resulting in: + // c0.val[0]: 00 10 20 30 + // c0.val[1]: 02 12 22 32 + // c1.val[0]: 01 11 21 31 + // c1.val[1]: 03 13 23 33 + // c2.val[0]: 40 50 60 70 + // c2.val[1]: 42 52 62 72 + // c3.val[0]: 41 51 61 71 + // c3.val[1]: 43 53 63 73 + + int32x2x2_t c0 = vtrn_s32(vreinterpret_s32_s16(b0.val[0]), + vreinterpret_s32_s16(b1.val[0])); + int32x2x2_t c1 = vtrn_s32(vreinterpret_s32_s16(b0.val[1]), + vreinterpret_s32_s16(b1.val[1])); + int32x2x2_t c2 = vtrn_s32(vreinterpret_s32_s16(b2.val[0]), + vreinterpret_s32_s16(b3.val[0])); + int32x2x2_t c3 = vtrn_s32(vreinterpret_s32_s16(b2.val[1]), + vreinterpret_s32_s16(b3.val[1])); + + // Swap 64 bit elements resulting in: + // o0: 00 10 20 30 40 50 60 70 + // o1: 01 11 21 31 41 51 61 71 + // o2: 02 12 22 32 42 52 62 72 + // o3: 03 13 23 33 43 53 63 73 + + *o0 = vcombine_s16(vreinterpret_s16_s32(c0.val[0]), + vreinterpret_s16_s32(c2.val[0])); + *o1 = vcombine_s16(vreinterpret_s16_s32(c1.val[0]), + vreinterpret_s16_s32(c3.val[0])); + *o2 = vcombine_s16(vreinterpret_s16_s32(c0.val[1]), + vreinterpret_s16_s32(c2.val[1])); + *o3 = vcombine_s16(vreinterpret_s16_s32(c1.val[1]), + vreinterpret_s16_s32(c3.val[1])); +} + static INLINE void transpose_u16_8x8(uint16x8_t *a0, uint16x8_t *a1, uint16x8_t *a2, uint16x8_t *a3, uint16x8_t *a4, uint16x8_t *a5, @@ -386,7 +451,7 @@ static INLINE void transpose_s16_8x8(int16x8_t *a0, int16x8_t *a1, 
vget_high_s16(vreinterpretq_s16_s32(c3.val[1]))); } -static INLINE int16x8x2_t vpx_vtrnq_s64_to_s16(int32x4_t a0, int32x4_t a1) { +static INLINE int16x8x2_t aom_vtrnq_s64_to_s16(int32x4_t a0, int32x4_t a1) { int16x8x2_t b0; b0.val[0] = vcombine_s16(vreinterpret_s16_s32(vget_low_s32(a0)), vreinterpret_s16_s32(vget_low_s32(a1))); @@ -448,10 +513,10 @@ static INLINE void transpose_s16_8x8q(int16x8_t *a0, int16x8_t *out) { // d2.val[1]: 06 16 26 36 46 56 66 76 // d3.val[0]: 03 13 23 33 43 53 63 73 // d3.val[1]: 07 17 27 37 47 57 67 77 - const int16x8x2_t d0 = vpx_vtrnq_s64_to_s16(c0.val[0], c2.val[0]); - const int16x8x2_t d1 = vpx_vtrnq_s64_to_s16(c1.val[0], c3.val[0]); - const int16x8x2_t d2 = vpx_vtrnq_s64_to_s16(c0.val[1], c2.val[1]); - const int16x8x2_t d3 = vpx_vtrnq_s64_to_s16(c1.val[1], c3.val[1]); + const int16x8x2_t d0 = aom_vtrnq_s64_to_s16(c0.val[0], c2.val[0]); + const int16x8x2_t d1 = aom_vtrnq_s64_to_s16(c1.val[0], c3.val[0]); + const int16x8x2_t d2 = aom_vtrnq_s64_to_s16(c0.val[1], c2.val[1]); + const int16x8x2_t d3 = aom_vtrnq_s64_to_s16(c1.val[1], c3.val[1]); *out = d0.val[0]; *(out + 1) = d1.val[0]; diff --git a/media/libaom/src/av1/common/arm/warp_plane_neon.c b/media/libaom/src/av1/common/arm/warp_plane_neon.c index 7f02d42a7..c10a34fcd 100644 --- a/media/libaom/src/av1/common/arm/warp_plane_neon.c +++ b/media/libaom/src/av1/common/arm/warp_plane_neon.c @@ -20,7 +20,7 @@ #include "av1/common/warped_motion.h" #include "av1/common/scale.h" -/* This is a modified version of 'warped_filter' from warped_motion.c: +/* This is a modified version of 'av1_warped_filter' from warped_motion.c: * Each coefficient is stored in 8 bits instead of 16 bits * The coefficients are rearranged in the column order 0, 2, 4, 6, 1, 3, 5, 7 @@ -333,22 +333,22 @@ static INLINE void vertical_filter_neon(const int16x8_t *src, c3 = vtrnq_s32(vreinterpretq_s32_s16(b3.val[0]), vreinterpretq_s32_s16(b3.val[1])); - f0 = vld1q_s16( - (int16_t *)(warped_filter + ((sy + 0 * gamma) >> 
WARPEDDIFF_PREC_BITS))); - f1 = vld1q_s16( - (int16_t *)(warped_filter + ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS))); - f2 = vld1q_s16( - (int16_t *)(warped_filter + ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS))); - f3 = vld1q_s16( - (int16_t *)(warped_filter + ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS))); - f4 = vld1q_s16( - (int16_t *)(warped_filter + ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS))); - f5 = vld1q_s16( - (int16_t *)(warped_filter + ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS))); - f6 = vld1q_s16( - (int16_t *)(warped_filter + ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS))); - f7 = vld1q_s16( - (int16_t *)(warped_filter + ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS))); + f0 = vld1q_s16((int16_t *)(av1_warped_filter + + ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS))); + f1 = vld1q_s16((int16_t *)(av1_warped_filter + + ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS))); + f2 = vld1q_s16((int16_t *)(av1_warped_filter + + ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS))); + f3 = vld1q_s16((int16_t *)(av1_warped_filter + + ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS))); + f4 = vld1q_s16((int16_t *)(av1_warped_filter + + ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS))); + f5 = vld1q_s16((int16_t *)(av1_warped_filter + + ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS))); + f6 = vld1q_s16((int16_t *)(av1_warped_filter + + ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS))); + f7 = vld1q_s16((int16_t *)(av1_warped_filter + + ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS))); d0 = vtrnq_s32(vreinterpretq_s32_s16(f0), vreinterpretq_s32_s16(f2)); d1 = vtrnq_s32(vreinterpretq_s32_s16(f4), vreinterpretq_s32_s16(f6)); @@ -640,7 +640,7 @@ void av1_warp_affine_neon(const int32_t *mat, const uint8_t *ref, int width, uint16x4_t tmp16_lo = vld1_u16(p); int32x4_t tmp32_lo = vreinterpretq_s32_u32(vmovl_u16(tmp16_lo)); int16x4_t tmp16_low; - if (conv_params->use_jnt_comp_avg) { + if (conv_params->use_dist_wtd_comp_avg) { res_lo = vmulq_s32(res_lo, bwd); tmp32_lo = vmulq_s32(tmp32_lo, fwd); tmp32_lo = 
vaddq_s32(tmp32_lo, res_lo); @@ -671,7 +671,7 @@ void av1_warp_affine_neon(const int32_t *mat, const uint8_t *ref, int width, uint16x4_t tmp16_hi = vld1_u16(p4); int32x4_t tmp32_hi = vreinterpretq_s32_u32(vmovl_u16(tmp16_hi)); int16x4_t tmp16_high; - if (conv_params->use_jnt_comp_avg) { + if (conv_params->use_dist_wtd_comp_avg) { res_hi = vmulq_s32(res_hi, bwd); tmp32_hi = vmulq_s32(tmp32_hi, fwd); tmp32_hi = vaddq_s32(tmp32_hi, res_hi); diff --git a/media/libaom/src/av1/common/av1_common_int.h b/media/libaom/src/av1/common/av1_common_int.h new file mode 100644 index 000000000..0403405e9 --- /dev/null +++ b/media/libaom/src/av1/common/av1_common_int.h @@ -0,0 +1,1557 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AV1_COMMON_AV1_COMMON_INT_H_ +#define AOM_AV1_COMMON_AV1_COMMON_INT_H_ + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "aom/internal/aom_codec_internal.h" +#include "aom_util/aom_thread.h" +#include "av1/common/alloccommon.h" +#include "av1/common/av1_loopfilter.h" +#include "av1/common/entropy.h" +#include "av1/common/entropymode.h" +#include "av1/common/entropymv.h" +#include "av1/common/enums.h" +#include "av1/common/frame_buffers.h" +#include "av1/common/mv.h" +#include "av1/common/quant_common.h" +#include "av1/common/restoration.h" +#include "av1/common/tile_common.h" +#include "av1/common/timing.h" +#include "av1/common/odintrin.h" +#include "av1/encoder/hash_motion.h" +#include "aom_dsp/grain_synthesis.h" +#include "aom_dsp/grain_table.h" +#ifdef __cplusplus +extern "C" { +#endif + +#if defined(__clang__) && defined(__has_warning) +#if __has_feature(cxx_attributes) && __has_warning("-Wimplicit-fallthrough") +#define AOM_FALLTHROUGH_INTENDED [[clang::fallthrough]] // NOLINT +#endif +#elif defined(__GNUC__) && __GNUC__ >= 7 +#define AOM_FALLTHROUGH_INTENDED __attribute__((fallthrough)) // NOLINT +#endif + +#ifndef AOM_FALLTHROUGH_INTENDED +#define AOM_FALLTHROUGH_INTENDED \ + do { \ + } while (0) +#endif + +#define CDEF_MAX_STRENGTHS 16 + +/* Constant values while waiting for the sequence header */ +#define FRAME_ID_LENGTH 15 +#define DELTA_FRAME_ID_LENGTH 14 + +#define FRAME_CONTEXTS (FRAME_BUFFERS + 1) +// Extra frame context which is always kept at default values +#define FRAME_CONTEXT_DEFAULTS (FRAME_CONTEXTS - 1) +#define PRIMARY_REF_BITS 3 +#define PRIMARY_REF_NONE 7 + +#define NUM_PING_PONG_BUFFERS 2 + +#define MAX_NUM_TEMPORAL_LAYERS 8 +#define MAX_NUM_SPATIAL_LAYERS 4 +/* clang-format off */ +// clang-format seems to think this is a pointer dereference and not a +// multiplication. 
+#define MAX_NUM_OPERATING_POINTS \ + (MAX_NUM_TEMPORAL_LAYERS * MAX_NUM_SPATIAL_LAYERS) +/* clang-format on */ + +// TODO(jingning): Turning this on to set up transform coefficient +// processing timer. +#define TXCOEFF_TIMER 0 +#define TXCOEFF_COST_TIMER 0 + +enum { + SINGLE_REFERENCE = 0, + COMPOUND_REFERENCE = 1, + REFERENCE_MODE_SELECT = 2, + REFERENCE_MODES = 3, +} UENUM1BYTE(REFERENCE_MODE); + +enum { + /** + * Frame context updates are disabled + */ + REFRESH_FRAME_CONTEXT_DISABLED, + /** + * Update frame context to values resulting from backward probability + * updates based on entropy/counts in the decoded frame + */ + REFRESH_FRAME_CONTEXT_BACKWARD, +} UENUM1BYTE(REFRESH_FRAME_CONTEXT_MODE); + +#define MFMV_STACK_SIZE 3 +typedef struct { + int_mv mfmv0; + uint8_t ref_frame_offset; +} TPL_MV_REF; + +typedef struct { + int_mv mv; + MV_REFERENCE_FRAME ref_frame; +} MV_REF; + +typedef struct RefCntBuffer { + // For a RefCntBuffer, the following are reference-holding variables: + // - cm->ref_frame_map[] + // - cm->cur_frame + // - cm->scaled_ref_buf[] (encoder only) + // - pbi->output_frame_index[] (decoder only) + // With that definition, 'ref_count' is the number of reference-holding + // variables that are currently referencing this buffer. + // For example: + // - suppose this buffer is at index 'k' in the buffer pool, and + // - Total 'n' of the variables / array elements above have value 'k' (that + // is, they are pointing to buffer at index 'k'). + // Then, pool->frame_bufs[k].ref_count = n. + int ref_count; + + unsigned int order_hint; + unsigned int ref_order_hints[INTER_REFS_PER_FRAME]; + + // These variables are used only in encoder and compare the absolute + // display order hint to compute the relative distance and overcome + // the limitation of get_relative_dist() which returns incorrect + // distance when a very old frame is used as a reference. 
+ unsigned int display_order_hint; + unsigned int ref_display_order_hint[INTER_REFS_PER_FRAME]; + + MV_REF *mvs; + uint8_t *seg_map; + struct segmentation seg; + int mi_rows; + int mi_cols; + // Width and height give the size of the buffer (before any upscaling, unlike + // the sizes that can be derived from the buf structure) + int width; + int height; + WarpedMotionParams global_motion[REF_FRAMES]; + int showable_frame; // frame can be used as show existing frame in future + uint8_t film_grain_params_present; + aom_film_grain_t film_grain_params; + aom_codec_frame_buffer_t raw_frame_buffer; + YV12_BUFFER_CONFIG buf; + FRAME_TYPE frame_type; + + // This is only used in the encoder but needs to be indexed per ref frame + // so it's extremely convenient to keep it here. + int interp_filter_selected[SWITCHABLE]; + + // Inter frame reference frame delta for loop filter + int8_t ref_deltas[REF_FRAMES]; + + // 0 = ZERO_MV, MV + int8_t mode_deltas[MAX_MODE_LF_DELTAS]; + + FRAME_CONTEXT frame_context; +} RefCntBuffer; + +typedef struct BufferPool { +// Protect BufferPool from being accessed by several FrameWorkers at +// the same time during frame parallel decode. +// TODO(hkuang): Try to use atomic variable instead of locking the whole pool. +// TODO(wtc): Remove this. See +// https://chromium-review.googlesource.com/c/webm/libvpx/+/560630. +#if CONFIG_MULTITHREAD + pthread_mutex_t pool_mutex; +#endif + + // Private data associated with the frame buffer callbacks. + void *cb_priv; + + aom_get_frame_buffer_cb_fn_t get_fb_cb; + aom_release_frame_buffer_cb_fn_t release_fb_cb; + + RefCntBuffer frame_bufs[FRAME_BUFFERS]; + + // Frame buffers allocated internally by the codec. 
+ InternalFrameBufferList int_frame_buffers; +} BufferPool; + +typedef struct { + int cdef_damping; + int nb_cdef_strengths; + int cdef_strengths[CDEF_MAX_STRENGTHS]; + int cdef_uv_strengths[CDEF_MAX_STRENGTHS]; + int cdef_bits; +} CdefInfo; + +typedef struct { + int delta_q_present_flag; + // Resolution of delta quant + int delta_q_res; + int delta_lf_present_flag; + // Resolution of delta lf level + int delta_lf_res; + // This is a flag for number of deltas of loop filter level + // 0: use 1 delta, for y_vertical, y_horizontal, u, and v + // 1: use separate deltas for each filter level + int delta_lf_multi; +} DeltaQInfo; + +typedef struct { + int enable_order_hint; // 0 - disable order hint, and related tools + int order_hint_bits_minus_1; // dist_wtd_comp, ref_frame_mvs, + // frame_sign_bias + // if 0, enable_dist_wtd_comp and + // enable_ref_frame_mvs must be set as 0. + int enable_dist_wtd_comp; // 0 - disable dist-wtd compound modes + // 1 - enable it + int enable_ref_frame_mvs; // 0 - disable ref frame mvs + // 1 - enable it +} OrderHintInfo; + +// Sequence header structure. +// Note: All syntax elements of sequence_header_obu that need to be +// bit-identical across multiple sequence headers must be part of this struct, +// so that consistency is checked by are_seq_headers_consistent() function. +// One exception is the last member 'op_params' that is ignored by +// are_seq_headers_consistent() function. +typedef struct SequenceHeader { + int num_bits_width; + int num_bits_height; + int max_frame_width; + int max_frame_height; + uint8_t frame_id_numbers_present_flag; + int frame_id_length; + int delta_frame_id_length; + BLOCK_SIZE sb_size; // Size of the superblock used for this frame + int mib_size; // Size of the superblock in units of MI blocks + int mib_size_log2; // Log 2 of above. 
+ + OrderHintInfo order_hint_info; + + uint8_t force_screen_content_tools; // 0 - force off + // 1 - force on + // 2 - adaptive + uint8_t still_picture; // Video is a single frame still picture + uint8_t reduced_still_picture_hdr; // Use reduced header for still picture + uint8_t force_integer_mv; // 0 - Don't force. MV can use subpel + // 1 - force to integer + // 2 - adaptive + uint8_t enable_filter_intra; // enables/disables filterintra + uint8_t enable_intra_edge_filter; // enables/disables edge upsampling + uint8_t enable_interintra_compound; // enables/disables interintra_compound + uint8_t enable_masked_compound; // enables/disables masked compound + uint8_t enable_dual_filter; // 0 - disable dual interpolation filter + // 1 - enable vert/horz filter selection + uint8_t enable_warped_motion; // 0 - disable warp for the sequence + // 1 - enable warp for the sequence + uint8_t enable_superres; // 0 - Disable superres for the sequence + // and no frame level superres flag + // 1 - Enable superres for the sequence + // enable per-frame superres flag + uint8_t enable_cdef; // To turn on/off CDEF + uint8_t enable_restoration; // To turn on/off loop restoration + BITSTREAM_PROFILE profile; + + // Color config. + aom_bit_depth_t bit_depth; // AOM_BITS_8 in profile 0 or 1, + // AOM_BITS_10 or AOM_BITS_12 in profile 2 or 3. + uint8_t use_highbitdepth; // If true, we need to use 16bit frame buffers. + uint8_t monochrome; // Monochorme video + aom_color_primaries_t color_primaries; + aom_transfer_characteristics_t transfer_characteristics; + aom_matrix_coefficients_t matrix_coefficients; + int color_range; + int subsampling_x; // Chroma subsampling for x + int subsampling_y; // Chroma subsampling for y + aom_chroma_sample_position_t chroma_sample_position; + uint8_t separate_uv_delta_q; + uint8_t film_grain_params_present; + + // Operating point info. 
+ int operating_points_cnt_minus_1; + int operating_point_idc[MAX_NUM_OPERATING_POINTS]; + int timing_info_present; + aom_timing_info_t timing_info; + uint8_t decoder_model_info_present_flag; + aom_dec_model_info_t decoder_model_info; + uint8_t display_model_info_present_flag; + AV1_LEVEL seq_level_idx[MAX_NUM_OPERATING_POINTS]; + uint8_t tier[MAX_NUM_OPERATING_POINTS]; // seq_tier in spec. One bit: 0 or 1. + + // IMPORTANT: the op_params member must be at the end of the struct so that + // are_seq_headers_consistent() can be implemented with a memcmp() call. + // TODO(urvang): We probably don't need the +1 here. + aom_dec_model_op_parameters_t op_params[MAX_NUM_OPERATING_POINTS + 1]; +} SequenceHeader; + +typedef struct { + int skip_mode_allowed; + int skip_mode_flag; + int ref_frame_idx_0; + int ref_frame_idx_1; +} SkipModeInfo; + +typedef struct { + FRAME_TYPE frame_type; + REFERENCE_MODE reference_mode; + + unsigned int order_hint; + unsigned int display_order_hint; + unsigned int frame_number; + SkipModeInfo skip_mode_info; + int refresh_frame_flags; // Which ref frames are overwritten by this frame + int frame_refs_short_signaling; +} CurrentFrame; + +// Struct containing some frame level features. +typedef struct { + bool disable_cdf_update; + bool allow_high_precision_mv; + bool cur_frame_force_integer_mv; // 0 the default in AOM, 1 only integer + bool allow_screen_content_tools; + bool allow_intrabc; + bool allow_warped_motion; + // Whether to use previous frames' motion vectors for prediction. + bool allow_ref_frame_mvs; + bool coded_lossless; // frame is fully lossless at the coded resolution. + bool all_lossless; // frame is fully lossless at the upscaled resolution. 
+ bool reduced_tx_set_used; + bool error_resilient_mode; + bool switchable_motion_mode; + TX_MODE tx_mode; + InterpFilter interp_filter; + int primary_ref_frame; + int byte_alignment; + // Flag signaling how frame contexts should be updated at the end of + // a frame decode + REFRESH_FRAME_CONTEXT_MODE refresh_frame_context; +} FeatureFlags; + +// Struct containing params related to tiles. +typedef struct CommonTileParams { + int cols; // number of tile columns that frame is divided into + int rows; // number of tile rows that frame is divided into + int max_width_sb; // maximum tile width in superblock units. + int max_height_sb; // maximum tile height in superblock units. + // Min width of non-rightmost tile in MI units. Only valid if cols > 1. + int min_inner_width; + + // If true, tiles are uniformly spaced with power-of-two number of rows and + // columns. + // If false, tiles have explicitly configured widths and heights. + int uniform_spacing; + + // Following members are only valid when uniform_spacing == 1 + int log2_cols; // log2 of 'cols'. + int log2_rows; // log2 of 'rows'. + int width; // tile width in MI units + int height; // tile height in MI units + // End of members that are only valid when uniform_spacing == 1 + + // Min num of tile columns possible based on 'max_width_sb' and frame width. + int min_log2_cols; + // Min num of tile rows possible based on 'max_height_sb' and frame height. + int min_log2_rows; + // Min num of tile columns possible based on frame width. + int max_log2_cols; + // Max num of tile columns possible based on frame width. + int max_log2_rows; + // log2 of min number of tiles (same as min_log2_cols + min_log2_rows). + int min_log2; + // col_start_sb[i] is the start position of tile column i in superblock units. + // valid for 0 <= i <= cols + int col_start_sb[MAX_TILE_COLS + 1]; + // row_start_sb[i] is the start position of tile row i in superblock units. 
+ // valid for 0 <= i <= rows + int row_start_sb[MAX_TILE_ROWS + 1]; + // If true, we are using large scale tile mode. + unsigned int large_scale; + // Only relevant when large_scale == 1. + // If true, the independent decoding of a single tile or a section of a frame + // is allowed. + unsigned int single_tile_decoding; +} CommonTileParams; + +// Struct containing params related to MB_MODE_INFO arrays and related info. +typedef struct CommonModeInfoParams CommonModeInfoParams; +struct CommonModeInfoParams { + // Number of rows/cols in the frame in 16 pixel units. + // This is computed from frame width and height aligned to a multiple of 8. + int mb_rows; + int mb_cols; + // Total MBs = mb_rows * mb_cols. + int MBs; + + // Number of rows/cols in the frame in 4 pixel (MB_MODE_INFO) units. + // This is computed from frame width and height aligned to a multiple of 8. + int mi_rows; + int mi_cols; + + // An array of MB_MODE_INFO structs for every 'mi_alloc_bsize' sized block + // in the frame. + // Note: This array should be treated like a scratch memory, and should NOT be + // accessed directly, in most cases. Please use 'mi_grid_base' array instead. + MB_MODE_INFO *mi_alloc; + // Number of allocated elements in 'mi_alloc'. + int mi_alloc_size; + // Stride for 'mi_alloc' array. + int mi_alloc_stride; + // The minimum block size that each element in 'mi_alloc' can correspond to. + // For decoder, this is always BLOCK_4X4. + // For encoder, this is currently set to BLOCK_4X4 for resolution < 4k, + // and BLOCK_8X8 for resolution >= 4k. + BLOCK_SIZE mi_alloc_bsize; + + // Grid of pointers to 4x4 MB_MODE_INFO structs allocated in 'mi_alloc'. + // It's possible that: + // - Multiple pointers in the grid point to the same element in 'mi_alloc' + // (for example, for all 4x4 blocks that belong to the same partition block). + // - Some pointers can be NULL (for example, for blocks outside visible area). 
+ MB_MODE_INFO **mi_grid_base; + // Number of allocated elements in 'mi_grid_base' (and 'tx_type_map' also). + int mi_grid_size; + // Stride for 'mi_grid_base' (and 'tx_type_map' also). + int mi_stride; + + // An array of tx types for each 4x4 block in the frame. + // Number of allocated elements is same as 'mi_grid_size', and stride is + // same as 'mi_grid_size'. So, indexing into 'tx_type_map' is same as that of + // 'mi_grid_base'. + TX_TYPE *tx_type_map; + + // Function pointers to allow separate logic for encoder and decoder. + void (*free_mi)(struct CommonModeInfoParams *mi_params); + void (*setup_mi)(struct CommonModeInfoParams *mi_params); + void (*set_mb_mi)(struct CommonModeInfoParams *mi_params, int width, + int height); +}; + +// Parameters related to quantization at the frame level. +typedef struct CommonQuantParams CommonQuantParams; +struct CommonQuantParams { + // Base qindex of the frame in the range 0 to 255. + int base_qindex; + + // Delta of qindex (from base_qindex) for Y plane DC coefficient. + // Note: y_ac_delta_q is implicitly 0. + int y_dc_delta_q; + + // Delta of qindex (from base_qindex) for U plane DC and AC coefficients. + int u_dc_delta_q; + int v_dc_delta_q; + + // Delta of qindex (from base_qindex) for V plane DC and AC coefficients. + // Same as those for U plane if cm->seq_params.separate_uv_delta_q == 0. + int u_ac_delta_q; + int v_ac_delta_q; + + // Note: The qindex per superblock may have a delta from the qindex obtained + // at frame level from parameters above, based on 'cm->delta_q_info'. + + // The dequantizers below are true dequantizers used only in the + // dequantization process. They have the same coefficient + // shift/scale as TX. 
+ int16_t y_dequant_QTX[MAX_SEGMENTS][2]; + int16_t u_dequant_QTX[MAX_SEGMENTS][2]; + int16_t v_dequant_QTX[MAX_SEGMENTS][2]; + + // Global quant matrix tables + const qm_val_t *giqmatrix[NUM_QM_LEVELS][3][TX_SIZES_ALL]; + const qm_val_t *gqmatrix[NUM_QM_LEVELS][3][TX_SIZES_ALL]; + + // Local quant matrix tables for each frame + const qm_val_t *y_iqmatrix[MAX_SEGMENTS][TX_SIZES_ALL]; + const qm_val_t *u_iqmatrix[MAX_SEGMENTS][TX_SIZES_ALL]; + const qm_val_t *v_iqmatrix[MAX_SEGMENTS][TX_SIZES_ALL]; + + // Flag indicating whether quantization matrices are being used: + // - If true, qm_level_y, qm_level_u and qm_level_v indicate the level + // indices to be used to access appropriate global quant matrix tables. + // - If false, we implicitly use level index 'NUM_QM_LEVELS - 1'. + bool using_qmatrix; + int qmatrix_level_y; + int qmatrix_level_u; + int qmatrix_level_v; +}; + +// Context used for transmitting various symbols in the bistream. +typedef struct CommonContexts CommonContexts; +struct CommonContexts { + // Context used by 'FRAME_CONTEXT.partition_cdf' to transmit partition type. + // partition[i][j] is the context for ith tile row, jth mi_col. + PARTITION_CONTEXT **partition; + + // Context used to derive context for multiple symbols: + // - 'TXB_CTX.txb_skip_ctx' used by 'FRAME_CONTEXT.txb_skip_cdf' to transmit + // to transmit skip_txfm flag. + // - 'TXB_CTX.dc_sign_ctx' used by 'FRAME_CONTEXT.dc_sign_cdf' to transmit + // sign. + // entropy[i][j][k] is the context for ith plane, jth tile row, kth mi_col. + ENTROPY_CONTEXT **entropy[MAX_MB_PLANE]; + + // Context used to derive context for 'FRAME_CONTEXT.txfm_partition_cdf' to + // transmit 'is_split' flag to indicate if this transform block should be + // split into smaller sub-blocks. + // txfm[i][j] is the context for ith tile row, jth mi_col. + TXFM_CONTEXT **txfm; + + // Dimensions that were used to allocate the arrays above. + // If these dimensions change, the arrays may have to be re-allocated. 
+ int num_planes; // Corresponds to av1_num_planes(cm) + int num_tile_rows; // Corresponds to cm->tiles.row + int num_mi_cols; // Corresponds to cm->mi_params.mi_cols +}; + +typedef struct AV1Common { + // Information about the current frame that is being coded. + CurrentFrame current_frame; + // Code and details about current error status. + struct aom_internal_error_info error; + + // AV1 allows two types of frame scaling operations: + // (1) Frame super-resolution: that allows coding a frame at lower resolution + // and after decoding the frame, normatively uscales and restores the frame -- + // inside the coding loop. + // (2) Frame resize: that allows coding frame at lower/higher resolution, and + // then non-normatively upscale the frame at the time of rendering -- outside + // the coding loop. + // Hence, the need for 3 types of dimensions. + + // Coded frame dimensions. + int width; + int height; + + // Rendered frame dimensions, after applying both super-resolution and resize + // to the coded frame. + // Different from coded dimensions if super-resolution and/or resize are + // being used for this frame. + int render_width; + int render_height; + + // Frame dimensions after applying super-resolution to the coded frame (if + // present), but before applying resize. + // Larger than the coded dimensions if super-resolution is being used for + // this frame. + // Different from rendered dimensions if resize is being used for this frame. + int superres_upscaled_width; + int superres_upscaled_height; + + // The denominator of the superres scale used by this frame. + // Note: The numerator is fixed to be SCALE_NUMERATOR. + uint8_t superres_scale_denominator; + + // If true, buffer removal times are present. + bool buffer_removal_time_present; + // buffer_removal_times[op_num] specifies the frame removal time in units of + // DecCT clock ticks counted from the removal time of the last random access + // point for operating point op_num. 
+ // TODO(urvang): We probably don't need the +1 here. + uint32_t buffer_removal_times[MAX_NUM_OPERATING_POINTS + 1]; + // Presentation time of the frame in clock ticks DispCT counted from the + // removal time of the last random access point for the operating point that + // is being decoded. + uint32_t frame_presentation_time; + + // Buffer where previous frame is stored. + RefCntBuffer *prev_frame; + + // Buffer into which the current frame will be stored and other related info. + // TODO(hkuang): Combine this with cur_buf in macroblockd. + RefCntBuffer *cur_frame; + + // For encoder, we have a two-level mapping from reference frame type to the + // corresponding buffer in the buffer pool: + // * 'remapped_ref_idx[i - 1]' maps reference type 'i' (range: LAST_FRAME ... + // EXTREF_FRAME) to a remapped index 'j' (in range: 0 ... REF_FRAMES - 1) + // * Later, 'cm->ref_frame_map[j]' maps the remapped index 'j' to a pointer to + // the reference counted buffer structure RefCntBuffer, taken from the buffer + // pool cm->buffer_pool->frame_bufs. + // + // LAST_FRAME, ..., EXTREF_FRAME + // | | + // v v + // remapped_ref_idx[LAST_FRAME - 1], ..., remapped_ref_idx[EXTREF_FRAME - 1] + // | | + // v v + // ref_frame_map[], ..., ref_frame_map[] + // + // Note: INTRA_FRAME always refers to the current frame, so there's no need to + // have a remapped index for the same. + int remapped_ref_idx[REF_FRAMES]; + + // Scale of the current frame with respect to itself. + // This is currently used for intra block copy, which behaves like an inter + // prediction mode, where the reference frame is the current frame itself. + struct scale_factors sf_identity; + + // Scale factors of the reference frame with respect to the current frame. + // This is required for generating inter prediction and will be non-identity + // for a reference frame, if it has different dimensions than the coded + // dimensions of the current frame. 
+ struct scale_factors ref_scale_factors[REF_FRAMES]; + + // For decoder, ref_frame_map[i] maps reference type 'i' to a pointer to + // the buffer in the buffer pool 'cm->buffer_pool.frame_bufs'. + // For encoder, ref_frame_map[j] (where j = remapped_ref_idx[i]) maps + // remapped reference index 'j' (that is, original reference type 'i') to + // a pointer to the buffer in the buffer pool 'cm->buffer_pool.frame_bufs'. + RefCntBuffer *ref_frame_map[REF_FRAMES]; + + // If true, this frame is actually shown after decoding. + // If false, this frame is coded in the bitstream, but not shown. It is only + // used as a reference for other frames coded later. + int show_frame; + + // If true, this frame can be used as a show-existing frame for other frames + // coded later. + // When 'show_frame' is true, this is always true for all non-keyframes. + // When 'show_frame' is false, this value is transmitted in the bitstream. + int showable_frame; + + // If true, show an existing frame coded before, instead of actually coding a + // frame. The existing frame comes from one of the existing reference buffers, + // as signaled in the bitstream. + int show_existing_frame; + + // Whether some features are allowed or not. + FeatureFlags features; + + // Params related to MB_MODE_INFO arrays and related info. + CommonModeInfoParams mi_params; + +#if CONFIG_ENTROPY_STATS + int coef_cdf_category; +#endif + // Quantization params. + CommonQuantParams quant_params; + + // Segmentation info for current frame. + struct segmentation seg; + + // Segmentation map for previous frame. + uint8_t *last_frame_seg_map; + + // Deblocking filter parameters. + loop_filter_info_n lf_info; + struct loopfilter lf; + + // Loop Restoration filter parameters. + RestorationInfo rst_info[MAX_MB_PLANE]; // Loop Restoration filter info. + int32_t *rst_tmpbuf; // Scratch buffer for self-guided restoration filter. + RestorationLineBuffers *rlbs; // Line buffers required by loop restoration. 
+ YV12_BUFFER_CONFIG rst_frame; // Stores the output of loop restoration. + + // CDEF (Constrained Directional Enhancement Filter) parameters. + CdefInfo cdef_info; + + // Parameters for film grain synthesis. + aom_film_grain_t film_grain_params; + + // Parameters for delta quantization and delta loop filter level. + DeltaQInfo delta_q_info; + + // Global motion parameters for each reference frame. + WarpedMotionParams global_motion[REF_FRAMES]; + + // Elements part of the sequence header, that are applicable for all the + // frames in the video. + SequenceHeader seq_params; + + // Current CDFs of all the symbols for the current frame. + FRAME_CONTEXT *fc; + // Default CDFs used when features.primary_ref_frame = PRIMARY_REF_NONE + // (e.g. for a keyframe). These default CDFs are defined by the bitstream and + // copied from default CDF tables for each symbol. + FRAME_CONTEXT *default_frame_context; + + // Parameters related to tiling. + CommonTileParams tiles; + + // External BufferPool passed from outside. + BufferPool *buffer_pool; + + // Above context buffers and their sizes. + // Note: above contexts are allocated in this struct, as their size is + // dependent on frame width, while left contexts are declared and allocated in + // MACROBLOCKD struct, as they have a fixed size. + CommonContexts above_contexts; + + // When cm->seq_params.frame_id_numbers_present_flag == 1, current and + // reference frame IDs are signaled in the bitstream. + int current_frame_id; + int ref_frame_id[REF_FRAMES]; + + // Motion vectors provided by motion field estimation. + // tpl_mvs[row * stride + col] stores MV for block at [mi_row, mi_col] where: + // mi_row = 2 * row, + // mi_col = 2 * col, and + // stride = cm->mi_params.mi_stride / 2 + TPL_MV_REF *tpl_mvs; + // Allocated size of 'tpl_mvs' array. Refer to 'ensure_mv_buffer()' function. 
+ int tpl_mvs_mem_size; + // ref_frame_sign_bias[k] is 1 if relative distance between reference 'k' and + // current frame is positive; and 0 otherwise. + int ref_frame_sign_bias[REF_FRAMES]; + // ref_frame_side[k] is 1 if relative distance between reference 'k' and + // current frame is positive, -1 if relative distance is 0; and 0 otherwise. + // TODO(jingning): This can be combined with sign_bias later. + int8_t ref_frame_side[REF_FRAMES]; + + // Number of temporal layers: may be > 1 for SVC (scalable vector coding). + unsigned int number_temporal_layers; + // Temporal layer ID of this frame + // (in the range 0 ... (number_temporal_layers - 1)). + int temporal_layer_id; + + // Number of spatial layers: may be > 1 for SVC (scalable vector coding). + unsigned int number_spatial_layers; + // Spatial layer ID of this frame + // (in the range 0 ... (number_spatial_layers - 1)). + int spatial_layer_id; + +#if TXCOEFF_TIMER + int64_t cum_txcoeff_timer; + int64_t txcoeff_timer; + int txb_count; +#endif // TXCOEFF_TIMER + +#if TXCOEFF_COST_TIMER + int64_t cum_txcoeff_cost_timer; + int64_t txcoeff_cost_timer; + int64_t txcoeff_cost_count; +#endif // TXCOEFF_COST_TIMER + +#if CONFIG_LPF_MASK + int is_decoding; +#endif // CONFIG_LPF_MASK +} AV1_COMMON; + +// TODO(hkuang): Don't need to lock the whole pool after implementing atomic +// frame reference count. 
+static void lock_buffer_pool(BufferPool *const pool) { +#if CONFIG_MULTITHREAD + pthread_mutex_lock(&pool->pool_mutex); +#else + (void)pool; +#endif +} + +static void unlock_buffer_pool(BufferPool *const pool) { +#if CONFIG_MULTITHREAD + pthread_mutex_unlock(&pool->pool_mutex); +#else + (void)pool; +#endif +} + +static INLINE YV12_BUFFER_CONFIG *get_ref_frame(AV1_COMMON *cm, int index) { + if (index < 0 || index >= REF_FRAMES) return NULL; + if (cm->ref_frame_map[index] == NULL) return NULL; + return &cm->ref_frame_map[index]->buf; +} + +static INLINE int get_free_fb(AV1_COMMON *cm) { + RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs; + int i; + + lock_buffer_pool(cm->buffer_pool); + for (i = 0; i < FRAME_BUFFERS; ++i) + if (frame_bufs[i].ref_count == 0) break; + + if (i != FRAME_BUFFERS) { + if (frame_bufs[i].buf.use_external_reference_buffers) { + // If this frame buffer's y_buffer, u_buffer, and v_buffer point to the + // external reference buffers. Restore the buffer pointers to point to the + // internally allocated memory. + YV12_BUFFER_CONFIG *ybf = &frame_bufs[i].buf; + ybf->y_buffer = ybf->store_buf_adr[0]; + ybf->u_buffer = ybf->store_buf_adr[1]; + ybf->v_buffer = ybf->store_buf_adr[2]; + ybf->use_external_reference_buffers = 0; + } + + frame_bufs[i].ref_count = 1; + } else { + // We should never run out of free buffers. If this assertion fails, there + // is a reference leak. + assert(0 && "Ran out of free frame buffers. Likely a reference leak."); + // Reset i to be INVALID_IDX to indicate no free buffer found. 
+ i = INVALID_IDX; + } + + unlock_buffer_pool(cm->buffer_pool); + return i; +} + +static INLINE RefCntBuffer *assign_cur_frame_new_fb(AV1_COMMON *const cm) { + // Release the previously-used frame-buffer + if (cm->cur_frame != NULL) { + --cm->cur_frame->ref_count; + cm->cur_frame = NULL; + } + + // Assign a new framebuffer + const int new_fb_idx = get_free_fb(cm); + if (new_fb_idx == INVALID_IDX) return NULL; + + cm->cur_frame = &cm->buffer_pool->frame_bufs[new_fb_idx]; + cm->cur_frame->buf.buf_8bit_valid = 0; + av1_zero(cm->cur_frame->interp_filter_selected); + return cm->cur_frame; +} + +// Modify 'lhs_ptr' to reference the buffer at 'rhs_ptr', and update the ref +// counts accordingly. +static INLINE void assign_frame_buffer_p(RefCntBuffer **lhs_ptr, + RefCntBuffer *rhs_ptr) { + RefCntBuffer *const old_ptr = *lhs_ptr; + if (old_ptr != NULL) { + assert(old_ptr->ref_count > 0); + // One less reference to the buffer at 'old_ptr', so decrease ref count. + --old_ptr->ref_count; + } + + *lhs_ptr = rhs_ptr; + // One more reference to the buffer at 'rhs_ptr', so increase ref count. + ++rhs_ptr->ref_count; +} + +static INLINE int frame_is_intra_only(const AV1_COMMON *const cm) { + return cm->current_frame.frame_type == KEY_FRAME || + cm->current_frame.frame_type == INTRA_ONLY_FRAME; +} + +static INLINE int frame_is_sframe(const AV1_COMMON *cm) { + return cm->current_frame.frame_type == S_FRAME; +} + +// These functions take a reference frame label between LAST_FRAME and +// EXTREF_FRAME inclusive. Note that this is different to the indexing +// previously used by the frame_refs[] array. +static INLINE int get_ref_frame_map_idx(const AV1_COMMON *const cm, + const MV_REFERENCE_FRAME ref_frame) { + return (ref_frame >= LAST_FRAME && ref_frame <= EXTREF_FRAME) + ? 
cm->remapped_ref_idx[ref_frame - LAST_FRAME] + : INVALID_IDX; +} + +static INLINE RefCntBuffer *get_ref_frame_buf( + const AV1_COMMON *const cm, const MV_REFERENCE_FRAME ref_frame) { + const int map_idx = get_ref_frame_map_idx(cm, ref_frame); + return (map_idx != INVALID_IDX) ? cm->ref_frame_map[map_idx] : NULL; +} + +// Both const and non-const versions of this function are provided so that it +// can be used with a const AV1_COMMON if needed. +static INLINE const struct scale_factors *get_ref_scale_factors_const( + const AV1_COMMON *const cm, const MV_REFERENCE_FRAME ref_frame) { + const int map_idx = get_ref_frame_map_idx(cm, ref_frame); + return (map_idx != INVALID_IDX) ? &cm->ref_scale_factors[map_idx] : NULL; +} + +static INLINE struct scale_factors *get_ref_scale_factors( + AV1_COMMON *const cm, const MV_REFERENCE_FRAME ref_frame) { + const int map_idx = get_ref_frame_map_idx(cm, ref_frame); + return (map_idx != INVALID_IDX) ? &cm->ref_scale_factors[map_idx] : NULL; +} + +static INLINE RefCntBuffer *get_primary_ref_frame_buf( + const AV1_COMMON *const cm) { + const int primary_ref_frame = cm->features.primary_ref_frame; + if (primary_ref_frame == PRIMARY_REF_NONE) return NULL; + const int map_idx = get_ref_frame_map_idx(cm, primary_ref_frame + 1); + return (map_idx != INVALID_IDX) ? cm->ref_frame_map[map_idx] : NULL; +} + +// Returns 1 if this frame might allow mvs from some reference frame. 
+static INLINE int frame_might_allow_ref_frame_mvs(const AV1_COMMON *cm) { + return !cm->features.error_resilient_mode && + cm->seq_params.order_hint_info.enable_ref_frame_mvs && + cm->seq_params.order_hint_info.enable_order_hint && + !frame_is_intra_only(cm); +} + +// Returns 1 if this frame might use warped_motion +static INLINE int frame_might_allow_warped_motion(const AV1_COMMON *cm) { + return !cm->features.error_resilient_mode && !frame_is_intra_only(cm) && + cm->seq_params.enable_warped_motion; +} + +static INLINE void ensure_mv_buffer(RefCntBuffer *buf, AV1_COMMON *cm) { + const int buf_rows = buf->mi_rows; + const int buf_cols = buf->mi_cols; + const CommonModeInfoParams *const mi_params = &cm->mi_params; + + if (buf->mvs == NULL || buf_rows != mi_params->mi_rows || + buf_cols != mi_params->mi_cols) { + aom_free(buf->mvs); + buf->mi_rows = mi_params->mi_rows; + buf->mi_cols = mi_params->mi_cols; + CHECK_MEM_ERROR(cm, buf->mvs, + (MV_REF *)aom_calloc(((mi_params->mi_rows + 1) >> 1) * + ((mi_params->mi_cols + 1) >> 1), + sizeof(*buf->mvs))); + aom_free(buf->seg_map); + CHECK_MEM_ERROR( + cm, buf->seg_map, + (uint8_t *)aom_calloc(mi_params->mi_rows * mi_params->mi_cols, + sizeof(*buf->seg_map))); + } + + const int mem_size = + ((mi_params->mi_rows + MAX_MIB_SIZE) >> 1) * (mi_params->mi_stride >> 1); + int realloc = cm->tpl_mvs == NULL; + if (cm->tpl_mvs) realloc |= cm->tpl_mvs_mem_size < mem_size; + + if (realloc) { + aom_free(cm->tpl_mvs); + CHECK_MEM_ERROR(cm, cm->tpl_mvs, + (TPL_MV_REF *)aom_calloc(mem_size, sizeof(*cm->tpl_mvs))); + cm->tpl_mvs_mem_size = mem_size; + } +} + +void cfl_init(CFL_CTX *cfl, const SequenceHeader *seq_params); + +static INLINE int av1_num_planes(const AV1_COMMON *cm) { + return cm->seq_params.monochrome ? 
1 : MAX_MB_PLANE; +} + +static INLINE void av1_init_above_context(CommonContexts *above_contexts, + int num_planes, int tile_row, + MACROBLOCKD *xd) { + for (int i = 0; i < num_planes; ++i) { + xd->above_entropy_context[i] = above_contexts->entropy[i][tile_row]; + } + xd->above_partition_context = above_contexts->partition[tile_row]; + xd->above_txfm_context = above_contexts->txfm[tile_row]; +} + +static INLINE void av1_init_macroblockd(AV1_COMMON *cm, MACROBLOCKD *xd, + tran_low_t *dqcoeff) { + const int num_planes = av1_num_planes(cm); + const CommonQuantParams *const quant_params = &cm->quant_params; + + for (int i = 0; i < num_planes; ++i) { + xd->plane[i].dqcoeff = dqcoeff; + + if (xd->plane[i].plane_type == PLANE_TYPE_Y) { + memcpy(xd->plane[i].seg_dequant_QTX, quant_params->y_dequant_QTX, + sizeof(quant_params->y_dequant_QTX)); + memcpy(xd->plane[i].seg_iqmatrix, quant_params->y_iqmatrix, + sizeof(quant_params->y_iqmatrix)); + + } else { + if (i == AOM_PLANE_U) { + memcpy(xd->plane[i].seg_dequant_QTX, quant_params->u_dequant_QTX, + sizeof(quant_params->u_dequant_QTX)); + memcpy(xd->plane[i].seg_iqmatrix, quant_params->u_iqmatrix, + sizeof(quant_params->u_iqmatrix)); + } else { + memcpy(xd->plane[i].seg_dequant_QTX, quant_params->v_dequant_QTX, + sizeof(quant_params->v_dequant_QTX)); + memcpy(xd->plane[i].seg_iqmatrix, quant_params->v_iqmatrix, + sizeof(quant_params->v_iqmatrix)); + } + } + } + xd->mi_stride = cm->mi_params.mi_stride; + xd->error_info = &cm->error; + cfl_init(&xd->cfl, &cm->seq_params); +} + +static INLINE void set_entropy_context(MACROBLOCKD *xd, int mi_row, int mi_col, + const int num_planes) { + int i; + int row_offset = mi_row; + int col_offset = mi_col; + for (i = 0; i < num_planes; ++i) { + struct macroblockd_plane *const pd = &xd->plane[i]; + // Offset the buffer pointer + const BLOCK_SIZE bsize = xd->mi[0]->sb_type; + if (pd->subsampling_y && (mi_row & 0x01) && (mi_size_high[bsize] == 1)) + row_offset = mi_row - 1; + if 
(pd->subsampling_x && (mi_col & 0x01) && (mi_size_wide[bsize] == 1)) + col_offset = mi_col - 1; + int above_idx = col_offset; + int left_idx = row_offset & MAX_MIB_MASK; + pd->above_entropy_context = + &xd->above_entropy_context[i][above_idx >> pd->subsampling_x]; + pd->left_entropy_context = + &xd->left_entropy_context[i][left_idx >> pd->subsampling_y]; + } +} + +static INLINE int calc_mi_size(int len) { + // len is in mi units. Align to a multiple of SBs. + return ALIGN_POWER_OF_TWO(len, MAX_MIB_SIZE_LOG2); +} + +static INLINE void set_plane_n4(MACROBLOCKD *const xd, int bw, int bh, + const int num_planes) { + int i; + for (i = 0; i < num_planes; i++) { + xd->plane[i].width = (bw * MI_SIZE) >> xd->plane[i].subsampling_x; + xd->plane[i].height = (bh * MI_SIZE) >> xd->plane[i].subsampling_y; + + xd->plane[i].width = AOMMAX(xd->plane[i].width, 4); + xd->plane[i].height = AOMMAX(xd->plane[i].height, 4); + } +} + +static INLINE void set_mi_row_col(MACROBLOCKD *xd, const TileInfo *const tile, + int mi_row, int bh, int mi_col, int bw, + int mi_rows, int mi_cols) { + xd->mb_to_top_edge = -GET_MV_SUBPEL(mi_row * MI_SIZE); + xd->mb_to_bottom_edge = GET_MV_SUBPEL((mi_rows - bh - mi_row) * MI_SIZE); + xd->mb_to_left_edge = -GET_MV_SUBPEL((mi_col * MI_SIZE)); + xd->mb_to_right_edge = GET_MV_SUBPEL((mi_cols - bw - mi_col) * MI_SIZE); + + xd->mi_row = mi_row; + xd->mi_col = mi_col; + + // Are edges available for intra prediction? 
+ xd->up_available = (mi_row > tile->mi_row_start); + + const int ss_x = xd->plane[1].subsampling_x; + const int ss_y = xd->plane[1].subsampling_y; + + xd->left_available = (mi_col > tile->mi_col_start); + xd->chroma_up_available = xd->up_available; + xd->chroma_left_available = xd->left_available; + if (ss_x && bw < mi_size_wide[BLOCK_8X8]) + xd->chroma_left_available = (mi_col - 1) > tile->mi_col_start; + if (ss_y && bh < mi_size_high[BLOCK_8X8]) + xd->chroma_up_available = (mi_row - 1) > tile->mi_row_start; + if (xd->up_available) { + xd->above_mbmi = xd->mi[-xd->mi_stride]; + } else { + xd->above_mbmi = NULL; + } + + if (xd->left_available) { + xd->left_mbmi = xd->mi[-1]; + } else { + xd->left_mbmi = NULL; + } + + const int chroma_ref = ((mi_row & 0x01) || !(bh & 0x01) || !ss_y) && + ((mi_col & 0x01) || !(bw & 0x01) || !ss_x); + xd->is_chroma_ref = chroma_ref; + if (chroma_ref) { + // To help calculate the "above" and "left" chroma blocks, note that the + // current block may cover multiple luma blocks (eg, if partitioned into + // 4x4 luma blocks). + // First, find the top-left-most luma block covered by this chroma block + MB_MODE_INFO **base_mi = + &xd->mi[-(mi_row & ss_y) * xd->mi_stride - (mi_col & ss_x)]; + + // Then, we consider the luma region covered by the left or above 4x4 chroma + // prediction. We want to point to the chroma reference block in that + // region, which is the bottom-right-most mi unit. + // This leads to the following offsets: + MB_MODE_INFO *chroma_above_mi = + xd->chroma_up_available ? base_mi[-xd->mi_stride + ss_x] : NULL; + xd->chroma_above_mbmi = chroma_above_mi; + + MB_MODE_INFO *chroma_left_mi = + xd->chroma_left_available ? base_mi[ss_y * xd->mi_stride - 1] : NULL; + xd->chroma_left_mbmi = chroma_left_mi; + } + + xd->height = bh; + xd->width = bw; + xd->is_sec_rect = 0; + if (xd->width < xd->height) { + // Only mark is_sec_rect as 1 for the last block. 
+ // For PARTITION_VERT_4, it would be (0, 0, 0, 1); + // For other partitions, it would be (0, 1). + if (!((mi_col + xd->width) & (xd->height - 1))) xd->is_sec_rect = 1; + } + + if (xd->width > xd->height) + if (mi_row & (xd->width - 1)) xd->is_sec_rect = 1; +} + +static INLINE aom_cdf_prob *get_y_mode_cdf(FRAME_CONTEXT *tile_ctx, + const MB_MODE_INFO *above_mi, + const MB_MODE_INFO *left_mi) { + const PREDICTION_MODE above = av1_above_block_mode(above_mi); + const PREDICTION_MODE left = av1_left_block_mode(left_mi); + const int above_ctx = intra_mode_context[above]; + const int left_ctx = intra_mode_context[left]; + return tile_ctx->kf_y_cdf[above_ctx][left_ctx]; +} + +static INLINE void update_partition_context(MACROBLOCKD *xd, int mi_row, + int mi_col, BLOCK_SIZE subsize, + BLOCK_SIZE bsize) { + PARTITION_CONTEXT *const above_ctx = xd->above_partition_context + mi_col; + PARTITION_CONTEXT *const left_ctx = + xd->left_partition_context + (mi_row & MAX_MIB_MASK); + + const int bw = mi_size_wide[bsize]; + const int bh = mi_size_high[bsize]; + memset(above_ctx, partition_context_lookup[subsize].above, bw); + memset(left_ctx, partition_context_lookup[subsize].left, bh); +} + +static INLINE int is_chroma_reference(int mi_row, int mi_col, BLOCK_SIZE bsize, + int subsampling_x, int subsampling_y) { + assert(bsize < BLOCK_SIZES_ALL); + const int bw = mi_size_wide[bsize]; + const int bh = mi_size_high[bsize]; + int ref_pos = ((mi_row & 0x01) || !(bh & 0x01) || !subsampling_y) && + ((mi_col & 0x01) || !(bw & 0x01) || !subsampling_x); + return ref_pos; +} + +static INLINE aom_cdf_prob cdf_element_prob(const aom_cdf_prob *cdf, + size_t element) { + assert(cdf != NULL); + return (element > 0 ? 
cdf[element - 1] : CDF_PROB_TOP) - cdf[element]; +} + +static INLINE void partition_gather_horz_alike(aom_cdf_prob *out, + const aom_cdf_prob *const in, + BLOCK_SIZE bsize) { + (void)bsize; + out[0] = CDF_PROB_TOP; + out[0] -= cdf_element_prob(in, PARTITION_HORZ); + out[0] -= cdf_element_prob(in, PARTITION_SPLIT); + out[0] -= cdf_element_prob(in, PARTITION_HORZ_A); + out[0] -= cdf_element_prob(in, PARTITION_HORZ_B); + out[0] -= cdf_element_prob(in, PARTITION_VERT_A); + if (bsize != BLOCK_128X128) out[0] -= cdf_element_prob(in, PARTITION_HORZ_4); + out[0] = AOM_ICDF(out[0]); + out[1] = AOM_ICDF(CDF_PROB_TOP); +} + +static INLINE void partition_gather_vert_alike(aom_cdf_prob *out, + const aom_cdf_prob *const in, + BLOCK_SIZE bsize) { + (void)bsize; + out[0] = CDF_PROB_TOP; + out[0] -= cdf_element_prob(in, PARTITION_VERT); + out[0] -= cdf_element_prob(in, PARTITION_SPLIT); + out[0] -= cdf_element_prob(in, PARTITION_HORZ_A); + out[0] -= cdf_element_prob(in, PARTITION_VERT_A); + out[0] -= cdf_element_prob(in, PARTITION_VERT_B); + if (bsize != BLOCK_128X128) out[0] -= cdf_element_prob(in, PARTITION_VERT_4); + out[0] = AOM_ICDF(out[0]); + out[1] = AOM_ICDF(CDF_PROB_TOP); +} + +static INLINE void update_ext_partition_context(MACROBLOCKD *xd, int mi_row, + int mi_col, BLOCK_SIZE subsize, + BLOCK_SIZE bsize, + PARTITION_TYPE partition) { + if (bsize >= BLOCK_8X8) { + const int hbs = mi_size_wide[bsize] / 2; + BLOCK_SIZE bsize2 = get_partition_subsize(bsize, PARTITION_SPLIT); + switch (partition) { + case PARTITION_SPLIT: + if (bsize != BLOCK_8X8) break; + AOM_FALLTHROUGH_INTENDED; + case PARTITION_NONE: + case PARTITION_HORZ: + case PARTITION_VERT: + case PARTITION_HORZ_4: + case PARTITION_VERT_4: + update_partition_context(xd, mi_row, mi_col, subsize, bsize); + break; + case PARTITION_HORZ_A: + update_partition_context(xd, mi_row, mi_col, bsize2, subsize); + update_partition_context(xd, mi_row + hbs, mi_col, subsize, subsize); + break; + case PARTITION_HORZ_B: + 
update_partition_context(xd, mi_row, mi_col, subsize, subsize); + update_partition_context(xd, mi_row + hbs, mi_col, bsize2, subsize); + break; + case PARTITION_VERT_A: + update_partition_context(xd, mi_row, mi_col, bsize2, subsize); + update_partition_context(xd, mi_row, mi_col + hbs, subsize, subsize); + break; + case PARTITION_VERT_B: + update_partition_context(xd, mi_row, mi_col, subsize, subsize); + update_partition_context(xd, mi_row, mi_col + hbs, bsize2, subsize); + break; + default: assert(0 && "Invalid partition type"); + } + } +} + +static INLINE int partition_plane_context(const MACROBLOCKD *xd, int mi_row, + int mi_col, BLOCK_SIZE bsize) { + const PARTITION_CONTEXT *above_ctx = xd->above_partition_context + mi_col; + const PARTITION_CONTEXT *left_ctx = + xd->left_partition_context + (mi_row & MAX_MIB_MASK); + // Minimum partition point is 8x8. Offset the bsl accordingly. + const int bsl = mi_size_wide_log2[bsize] - mi_size_wide_log2[BLOCK_8X8]; + int above = (*above_ctx >> bsl) & 1, left = (*left_ctx >> bsl) & 1; + + assert(mi_size_wide_log2[bsize] == mi_size_high_log2[bsize]); + assert(bsl >= 0); + + return (left * 2 + above) + bsl * PARTITION_PLOFFSET; +} + +// Return the number of elements in the partition CDF when +// partitioning the (square) block with luma block size of bsize. +static INLINE int partition_cdf_length(BLOCK_SIZE bsize) { + if (bsize <= BLOCK_8X8) + return PARTITION_TYPES; + else if (bsize == BLOCK_128X128) + return EXT_PARTITION_TYPES - 2; + else + return EXT_PARTITION_TYPES; +} + +static INLINE int max_block_wide(const MACROBLOCKD *xd, BLOCK_SIZE bsize, + int plane) { + assert(bsize < BLOCK_SIZES_ALL); + int max_blocks_wide = block_size_wide[bsize]; + + if (xd->mb_to_right_edge < 0) { + const struct macroblockd_plane *const pd = &xd->plane[plane]; + max_blocks_wide += xd->mb_to_right_edge >> (3 + pd->subsampling_x); + } + + // Scale the width in the transform block unit. 
+ return max_blocks_wide >> MI_SIZE_LOG2; +} + +static INLINE int max_block_high(const MACROBLOCKD *xd, BLOCK_SIZE bsize, + int plane) { + int max_blocks_high = block_size_high[bsize]; + + if (xd->mb_to_bottom_edge < 0) { + const struct macroblockd_plane *const pd = &xd->plane[plane]; + max_blocks_high += xd->mb_to_bottom_edge >> (3 + pd->subsampling_y); + } + + // Scale the height in the transform block unit. + return max_blocks_high >> MI_SIZE_LOG2; +} + +static INLINE void av1_zero_above_context(AV1_COMMON *const cm, + const MACROBLOCKD *xd, + int mi_col_start, int mi_col_end, + const int tile_row) { + const SequenceHeader *const seq_params = &cm->seq_params; + const int num_planes = av1_num_planes(cm); + const int width = mi_col_end - mi_col_start; + const int aligned_width = + ALIGN_POWER_OF_TWO(width, seq_params->mib_size_log2); + const int offset_y = mi_col_start; + const int width_y = aligned_width; + const int offset_uv = offset_y >> seq_params->subsampling_x; + const int width_uv = width_y >> seq_params->subsampling_x; + CommonContexts *const above_contexts = &cm->above_contexts; + + av1_zero_array(above_contexts->entropy[0][tile_row] + offset_y, width_y); + if (num_planes > 1) { + if (above_contexts->entropy[1][tile_row] && + above_contexts->entropy[2][tile_row]) { + av1_zero_array(above_contexts->entropy[1][tile_row] + offset_uv, + width_uv); + av1_zero_array(above_contexts->entropy[2][tile_row] + offset_uv, + width_uv); + } else { + aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME, + "Invalid value of planes"); + } + } + + av1_zero_array(above_contexts->partition[tile_row] + mi_col_start, + aligned_width); + + memset(above_contexts->txfm[tile_row] + mi_col_start, + tx_size_wide[TX_SIZES_LARGEST], aligned_width * sizeof(TXFM_CONTEXT)); +} + +static INLINE void av1_zero_left_context(MACROBLOCKD *const xd) { + av1_zero(xd->left_entropy_context); + av1_zero(xd->left_partition_context); + + memset(xd->left_txfm_context_buffer, 
tx_size_high[TX_SIZES_LARGEST], + sizeof(xd->left_txfm_context_buffer)); +} + +// Disable array-bounds checks as the TX_SIZE enum contains values larger than +// TX_SIZES_ALL (TX_INVALID) which make extending the array as a workaround +// infeasible. The assert is enough for static analysis and this or other tools +// asan, valgrind would catch oob access at runtime. +#if defined(__GNUC__) && __GNUC__ >= 4 +#pragma GCC diagnostic ignored "-Warray-bounds" +#endif + +#if defined(__GNUC__) && __GNUC__ >= 4 +#pragma GCC diagnostic warning "-Warray-bounds" +#endif + +static INLINE void set_txfm_ctx(TXFM_CONTEXT *txfm_ctx, uint8_t txs, int len) { + int i; + for (i = 0; i < len; ++i) txfm_ctx[i] = txs; +} + +static INLINE void set_txfm_ctxs(TX_SIZE tx_size, int n4_w, int n4_h, int skip, + const MACROBLOCKD *xd) { + uint8_t bw = tx_size_wide[tx_size]; + uint8_t bh = tx_size_high[tx_size]; + + if (skip) { + bw = n4_w * MI_SIZE; + bh = n4_h * MI_SIZE; + } + + set_txfm_ctx(xd->above_txfm_context, bw, n4_w); + set_txfm_ctx(xd->left_txfm_context, bh, n4_h); +} + +static INLINE int get_mi_grid_idx(const CommonModeInfoParams *const mi_params, + int mi_row, int mi_col) { + return mi_row * mi_params->mi_stride + mi_col; +} + +static INLINE int get_alloc_mi_idx(const CommonModeInfoParams *const mi_params, + int mi_row, int mi_col) { + const int mi_alloc_size_1d = mi_size_wide[mi_params->mi_alloc_bsize]; + const int mi_alloc_row = mi_row / mi_alloc_size_1d; + const int mi_alloc_col = mi_col / mi_alloc_size_1d; + + return mi_alloc_row * mi_params->mi_alloc_stride + mi_alloc_col; +} + +// For this partition block, set pointers in mi_params->mi_grid_base and xd->mi. +static INLINE void set_mi_offsets(const CommonModeInfoParams *const mi_params, + MACROBLOCKD *const xd, int mi_row, + int mi_col) { + // 'mi_grid_base' should point to appropriate memory in 'mi'. 
+ const int mi_grid_idx = get_mi_grid_idx(mi_params, mi_row, mi_col); + const int mi_alloc_idx = get_alloc_mi_idx(mi_params, mi_row, mi_col); + mi_params->mi_grid_base[mi_grid_idx] = &mi_params->mi_alloc[mi_alloc_idx]; + // 'xd->mi' should point to an offset in 'mi_grid_base'; + xd->mi = mi_params->mi_grid_base + mi_grid_idx; + // 'xd->tx_type_map' should point to an offset in 'mi_params->tx_type_map'. + xd->tx_type_map = mi_params->tx_type_map + mi_grid_idx; + xd->tx_type_map_stride = mi_params->mi_stride; +} + +static INLINE void txfm_partition_update(TXFM_CONTEXT *above_ctx, + TXFM_CONTEXT *left_ctx, + TX_SIZE tx_size, TX_SIZE txb_size) { + BLOCK_SIZE bsize = txsize_to_bsize[txb_size]; + int bh = mi_size_high[bsize]; + int bw = mi_size_wide[bsize]; + uint8_t txw = tx_size_wide[tx_size]; + uint8_t txh = tx_size_high[tx_size]; + int i; + for (i = 0; i < bh; ++i) left_ctx[i] = txh; + for (i = 0; i < bw; ++i) above_ctx[i] = txw; +} + +static INLINE TX_SIZE get_sqr_tx_size(int tx_dim) { + switch (tx_dim) { + case 128: + case 64: return TX_64X64; break; + case 32: return TX_32X32; break; + case 16: return TX_16X16; break; + case 8: return TX_8X8; break; + default: return TX_4X4; + } +} + +static INLINE TX_SIZE get_tx_size(int width, int height) { + if (width == height) { + return get_sqr_tx_size(width); + } + if (width < height) { + if (width + width == height) { + switch (width) { + case 4: return TX_4X8; break; + case 8: return TX_8X16; break; + case 16: return TX_16X32; break; + case 32: return TX_32X64; break; + } + } else { + switch (width) { + case 4: return TX_4X16; break; + case 8: return TX_8X32; break; + case 16: return TX_16X64; break; + } + } + } else { + if (height + height == width) { + switch (height) { + case 4: return TX_8X4; break; + case 8: return TX_16X8; break; + case 16: return TX_32X16; break; + case 32: return TX_64X32; break; + } + } else { + switch (height) { + case 4: return TX_16X4; break; + case 8: return TX_32X8; break; + case 16: return 
TX_64X16; break; + } + } + } + assert(0); + return TX_4X4; +} + +static INLINE int txfm_partition_context(const TXFM_CONTEXT *const above_ctx, + const TXFM_CONTEXT *const left_ctx, + BLOCK_SIZE bsize, TX_SIZE tx_size) { + const uint8_t txw = tx_size_wide[tx_size]; + const uint8_t txh = tx_size_high[tx_size]; + const int above = *above_ctx < txw; + const int left = *left_ctx < txh; + int category = TXFM_PARTITION_CONTEXTS; + + // dummy return, not used by others. + if (tx_size <= TX_4X4) return 0; + + TX_SIZE max_tx_size = + get_sqr_tx_size(AOMMAX(block_size_wide[bsize], block_size_high[bsize])); + + if (max_tx_size >= TX_8X8) { + category = + (txsize_sqr_up_map[tx_size] != max_tx_size && max_tx_size > TX_8X8) + + (TX_SIZES - 1 - max_tx_size) * 2; + } + assert(category != TXFM_PARTITION_CONTEXTS); + return category * 3 + above + left; +} + +// Compute the next partition in the direction of the sb_type stored in the mi +// array, starting with bsize. +static INLINE PARTITION_TYPE get_partition(const AV1_COMMON *const cm, + int mi_row, int mi_col, + BLOCK_SIZE bsize) { + const CommonModeInfoParams *const mi_params = &cm->mi_params; + if (mi_row >= mi_params->mi_rows || mi_col >= mi_params->mi_cols) + return PARTITION_INVALID; + + const int offset = mi_row * mi_params->mi_stride + mi_col; + MB_MODE_INFO **mi = mi_params->mi_grid_base + offset; + const BLOCK_SIZE subsize = mi[0]->sb_type; + + if (subsize == bsize) return PARTITION_NONE; + + const int bhigh = mi_size_high[bsize]; + const int bwide = mi_size_wide[bsize]; + const int sshigh = mi_size_high[subsize]; + const int sswide = mi_size_wide[subsize]; + + if (bsize > BLOCK_8X8 && mi_row + bwide / 2 < mi_params->mi_rows && + mi_col + bhigh / 2 < mi_params->mi_cols) { + // In this case, the block might be using an extended partition + // type. 
+ const MB_MODE_INFO *const mbmi_right = mi[bwide / 2]; + const MB_MODE_INFO *const mbmi_below = mi[bhigh / 2 * mi_params->mi_stride]; + + if (sswide == bwide) { + // Smaller height but same width. Is PARTITION_HORZ_4, PARTITION_HORZ or + // PARTITION_HORZ_B. To distinguish the latter two, check if the lower + // half was split. + if (sshigh * 4 == bhigh) return PARTITION_HORZ_4; + assert(sshigh * 2 == bhigh); + + if (mbmi_below->sb_type == subsize) + return PARTITION_HORZ; + else + return PARTITION_HORZ_B; + } else if (sshigh == bhigh) { + // Smaller width but same height. Is PARTITION_VERT_4, PARTITION_VERT or + // PARTITION_VERT_B. To distinguish the latter two, check if the right + // half was split. + if (sswide * 4 == bwide) return PARTITION_VERT_4; + assert(sswide * 2 == bhigh); + + if (mbmi_right->sb_type == subsize) + return PARTITION_VERT; + else + return PARTITION_VERT_B; + } else { + // Smaller width and smaller height. Might be PARTITION_SPLIT or could be + // PARTITION_HORZ_A or PARTITION_VERT_A. If subsize isn't halved in both + // dimensions, we immediately know this is a split (which will recurse to + // get to subsize). Otherwise look down and to the right. With + // PARTITION_VERT_A, the right block will have height bhigh; with + // PARTITION_HORZ_A, the lower block with have width bwide. Otherwise + // it's PARTITION_SPLIT. 
+ if (sswide * 2 != bwide || sshigh * 2 != bhigh) return PARTITION_SPLIT; + + if (mi_size_wide[mbmi_below->sb_type] == bwide) return PARTITION_HORZ_A; + if (mi_size_high[mbmi_right->sb_type] == bhigh) return PARTITION_VERT_A; + + return PARTITION_SPLIT; + } + } + const int vert_split = sswide < bwide; + const int horz_split = sshigh < bhigh; + const int split_idx = (vert_split << 1) | horz_split; + assert(split_idx != 0); + + static const PARTITION_TYPE base_partitions[4] = { + PARTITION_INVALID, PARTITION_HORZ, PARTITION_VERT, PARTITION_SPLIT + }; + + return base_partitions[split_idx]; +} + +static INLINE void set_sb_size(SequenceHeader *const seq_params, + BLOCK_SIZE sb_size) { + seq_params->sb_size = sb_size; + seq_params->mib_size = mi_size_wide[seq_params->sb_size]; + seq_params->mib_size_log2 = mi_size_wide_log2[seq_params->sb_size]; +} + +// Returns true if the frame is fully lossless at the coded resolution. +// Note: If super-resolution is used, such a frame will still NOT be lossless at +// the upscaled resolution. +static INLINE int is_coded_lossless(const AV1_COMMON *cm, + const MACROBLOCKD *xd) { + int coded_lossless = 1; + if (cm->seg.enabled) { + for (int i = 0; i < MAX_SEGMENTS; ++i) { + if (!xd->lossless[i]) { + coded_lossless = 0; + break; + } + } + } else { + coded_lossless = xd->lossless[0]; + } + return coded_lossless; +} + +static INLINE int is_valid_seq_level_idx(AV1_LEVEL seq_level_idx) { + return seq_level_idx == SEQ_LEVEL_MAX || + (seq_level_idx < SEQ_LEVELS && + // The following levels are currently undefined. 
+ seq_level_idx != SEQ_LEVEL_2_2 && seq_level_idx != SEQ_LEVEL_2_3 && + seq_level_idx != SEQ_LEVEL_3_2 && seq_level_idx != SEQ_LEVEL_3_3 && + seq_level_idx != SEQ_LEVEL_4_2 && seq_level_idx != SEQ_LEVEL_4_3 && + seq_level_idx != SEQ_LEVEL_7_0 && seq_level_idx != SEQ_LEVEL_7_1 && + seq_level_idx != SEQ_LEVEL_7_2 && seq_level_idx != SEQ_LEVEL_7_3); +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_COMMON_AV1_COMMON_INT_H_ diff --git a/media/libaom/src/av1/common/av1_inv_txfm1d.c b/media/libaom/src/av1/common/av1_inv_txfm1d.c index 7ef2d6d7f..8d69efcd2 100644 --- a/media/libaom/src/av1/common/av1_inv_txfm1d.c +++ b/media/libaom/src/av1/common/av1_inv_txfm1d.c @@ -13,11 +13,8 @@ #include "av1/common/av1_inv_txfm1d.h" #include "av1/common/av1_txfm.h" -// TODO(angiebird): Make 1-d txfm functions static -// - -void av1_idct4_new(const int32_t *input, int32_t *output, int8_t cos_bit, - const int8_t *stage_range) { +void av1_idct4(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { assert(output != input); const int32_t size = 4; const int32_t *cospi = cospi_arr(cos_bit); @@ -57,8 +54,8 @@ void av1_idct4_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[3] = clamp_value(bf0[0] - bf0[3], stage_range[stage]); } -void av1_idct8_new(const int32_t *input, int32_t *output, int8_t cos_bit, - const int8_t *stage_range) { +void av1_idct8(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { assert(output != input); const int32_t size = 8; const int32_t *cospi = cospi_arr(cos_bit); @@ -138,8 +135,8 @@ void av1_idct8_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[7] = clamp_value(bf0[0] - bf0[7], stage_range[stage]); } -void av1_idct16_new(const int32_t *input, int32_t *output, int8_t cos_bit, - const int8_t *stage_range) { +void av1_idct16(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { assert(output != input); const int32_t size = 
16; const int32_t *cospi = cospi_arr(cos_bit); @@ -303,8 +300,8 @@ void av1_idct16_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[15] = clamp_value(bf0[0] - bf0[15], stage_range[stage]); } -void av1_idct32_new(const int32_t *input, int32_t *output, int8_t cos_bit, - const int8_t *stage_range) { +void av1_idct32(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { assert(output != input); const int32_t size = 32; const int32_t *cospi = cospi_arr(cos_bit); @@ -656,8 +653,8 @@ void av1_idct32_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[31] = clamp_value(bf0[0] - bf0[31], stage_range[stage]); } -void av1_iadst4_new(const int32_t *input, int32_t *output, int8_t cos_bit, - const int8_t *stage_range) { +void av1_iadst4(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { int bit = cos_bit; const int32_t *sinpi = sinpi_arr(bit); int32_t s0, s1, s2, s3, s4, s5, s6, s7; @@ -713,8 +710,8 @@ void av1_iadst4_new(const int32_t *input, int32_t *output, int8_t cos_bit, output[3] = round_shift(x3, bit); } -void av1_iadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit, - const int8_t *stage_range) { +void av1_iadst8(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { assert(output != input); const int32_t size = 8; const int32_t *cospi = cospi_arr(cos_bit); @@ -809,7 +806,6 @@ void av1_iadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit, av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 7 - stage++; bf0 = step; bf1 = output; bf1[0] = bf0[0]; @@ -822,8 +818,8 @@ void av1_iadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[7] = -bf0[1]; } -void av1_iadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit, - const int8_t *stage_range) { +void av1_iadst16(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { assert(output != input); const int32_t 
size = 16; const int32_t *cospi = cospi_arr(cos_bit); @@ -1010,7 +1006,6 @@ void av1_iadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit, av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 9 - stage++; bf0 = step; bf1 = output; bf1[0] = bf0[0]; @@ -1064,8 +1059,8 @@ void av1_iidentity32_c(const int32_t *input, int32_t *output, int8_t cos_bit, for (int i = 0; i < 32; ++i) output[i] = (int32_t)((int64_t)input[i] * 4); } -void av1_idct64_new(const int32_t *input, int32_t *output, int8_t cos_bit, - const int8_t *stage_range) { +void av1_idct64(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { assert(output != input); const int32_t size = 64; const int32_t *cospi = cospi_arr(cos_bit); diff --git a/media/libaom/src/av1/common/av1_inv_txfm1d.h b/media/libaom/src/av1/common/av1_inv_txfm1d.h index c31c019aa..e1d5d98d1 100644 --- a/media/libaom/src/av1/common/av1_inv_txfm1d.h +++ b/media/libaom/src/av1/common/av1_inv_txfm1d.h @@ -29,22 +29,22 @@ static INLINE void clamp_buf(int32_t *buf, int32_t size, int8_t bit) { for (int i = 0; i < size; ++i) buf[i] = clamp_value(buf[i], bit); } -void av1_idct4_new(const int32_t *input, int32_t *output, int8_t cos_bit, - const int8_t *stage_range); -void av1_idct8_new(const int32_t *input, int32_t *output, int8_t cos_bit, - const int8_t *stage_range); -void av1_idct16_new(const int32_t *input, int32_t *output, int8_t cos_bit, - const int8_t *stage_range); -void av1_idct32_new(const int32_t *input, int32_t *output, int8_t cos_bit, - const int8_t *stage_range); -void av1_idct64_new(const int32_t *input, int32_t *output, int8_t cos_bit, - const int8_t *stage_range); -void av1_iadst4_new(const int32_t *input, int32_t *output, int8_t cos_bit, - const int8_t *stage_range); -void av1_iadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit, - const int8_t *stage_range); -void av1_iadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit, - const int8_t 
*stage_range); +void av1_idct4(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_idct8(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_idct16(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_idct32(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_idct64(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_iadst4(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_iadst8(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_iadst16(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); void av1_iidentity4_c(const int32_t *input, int32_t *output, int8_t cos_bit, const int8_t *stage_range); void av1_iidentity8_c(const int32_t *input, int32_t *output, int8_t cos_bit, diff --git a/media/libaom/src/av1/common/av1_inv_txfm1d_cfg.h b/media/libaom/src/av1/common/av1_inv_txfm1d_cfg.h index 7d80a0099..47fedbd2a 100644 --- a/media/libaom/src/av1/common/av1_inv_txfm1d_cfg.h +++ b/media/libaom/src/av1/common/av1_inv_txfm1d_cfg.h @@ -36,12 +36,12 @@ static const int8_t inv_start_range[TX_SIZES_ALL] = { 7, // 64x16 transform }; -extern const int8_t *inv_txfm_shift_ls[TX_SIZES_ALL]; +extern const int8_t *av1_inv_txfm_shift_ls[TX_SIZES_ALL]; -// Values in both inv_cos_bit_col and inv_cos_bit_row are always 12 +// Values in both av1_inv_cos_bit_col and av1_inv_cos_bit_row are always 12 // for each valid row and col combination #define INV_COS_BIT 12 -extern const int8_t inv_cos_bit_col[5 /*row*/][5 /*col*/]; -extern const int8_t inv_cos_bit_row[5 /*row*/][5 /*col*/]; +extern const int8_t av1_inv_cos_bit_col[5 /*row*/][5 /*col*/]; +extern const int8_t av1_inv_cos_bit_row[5 /*row*/][5 /*col*/]; #endif // AOM_AV1_COMMON_AV1_INV_TXFM1D_CFG_H_ diff --git 
a/media/libaom/src/av1/common/av1_inv_txfm2d.c b/media/libaom/src/av1/common/av1_inv_txfm2d.c index 4e6944314..559d12129 100644 --- a/media/libaom/src/av1/common/av1_inv_txfm2d.c +++ b/media/libaom/src/av1/common/av1_inv_txfm2d.c @@ -113,14 +113,14 @@ void av1_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8, static INLINE TxfmFunc inv_txfm_type_to_func(TXFM_TYPE txfm_type) { switch (txfm_type) { - case TXFM_TYPE_DCT4: return av1_idct4_new; - case TXFM_TYPE_DCT8: return av1_idct8_new; - case TXFM_TYPE_DCT16: return av1_idct16_new; - case TXFM_TYPE_DCT32: return av1_idct32_new; - case TXFM_TYPE_DCT64: return av1_idct64_new; - case TXFM_TYPE_ADST4: return av1_iadst4_new; - case TXFM_TYPE_ADST8: return av1_iadst8_new; - case TXFM_TYPE_ADST16: return av1_iadst16_new; + case TXFM_TYPE_DCT4: return av1_idct4; + case TXFM_TYPE_DCT8: return av1_idct8; + case TXFM_TYPE_DCT16: return av1_idct16; + case TXFM_TYPE_DCT32: return av1_idct32; + case TXFM_TYPE_DCT64: return av1_idct64; + case TXFM_TYPE_ADST4: return av1_iadst4; + case TXFM_TYPE_ADST8: return av1_iadst8; + case TXFM_TYPE_ADST16: return av1_iadst16; case TXFM_TYPE_IDENTITY4: return av1_iidentity4_c; case TXFM_TYPE_IDENTITY8: return av1_iidentity8_c; case TXFM_TYPE_IDENTITY16: return av1_iidentity16_c; @@ -149,7 +149,7 @@ static const int8_t inv_shift_32x8[2] = { -2, -4 }; static const int8_t inv_shift_16x64[2] = { -2, -4 }; static const int8_t inv_shift_64x16[2] = { -2, -4 }; -const int8_t *inv_txfm_shift_ls[TX_SIZES_ALL] = { +const int8_t *av1_inv_txfm_shift_ls[TX_SIZES_ALL] = { inv_shift_4x4, inv_shift_8x8, inv_shift_16x16, inv_shift_32x32, inv_shift_64x64, inv_shift_4x8, inv_shift_8x4, inv_shift_8x16, inv_shift_16x8, inv_shift_16x32, inv_shift_32x16, inv_shift_32x64, @@ -158,7 +158,7 @@ const int8_t *inv_txfm_shift_ls[TX_SIZES_ALL] = { }; /* clang-format off */ -const int8_t inv_cos_bit_col[MAX_TXWH_IDX] // txw_idx +const int8_t av1_inv_cos_bit_col[MAX_TXWH_IDX] // txw_idx [MAX_TXWH_IDX] = { // txh_idx 
{ INV_COS_BIT, INV_COS_BIT, INV_COS_BIT, 0, 0 }, { INV_COS_BIT, INV_COS_BIT, INV_COS_BIT, INV_COS_BIT, 0 }, @@ -167,7 +167,7 @@ const int8_t inv_cos_bit_col[MAX_TXWH_IDX] // txw_idx { 0, 0, INV_COS_BIT, INV_COS_BIT, INV_COS_BIT } }; -const int8_t inv_cos_bit_row[MAX_TXWH_IDX] // txw_idx +const int8_t av1_inv_cos_bit_row[MAX_TXWH_IDX] // txw_idx [MAX_TXWH_IDX] = { // txh_idx { INV_COS_BIT, INV_COS_BIT, INV_COS_BIT, 0, 0 }, { INV_COS_BIT, INV_COS_BIT, INV_COS_BIT, INV_COS_BIT, 0 }, @@ -177,23 +177,22 @@ const int8_t inv_cos_bit_row[MAX_TXWH_IDX] // txw_idx }; /* clang-format on */ -const int8_t iadst4_range[7] = { 0, 1, 0, 0, 0, 0, 0 }; +static const int8_t iadst4_range[7] = { 0, 1, 0, 0, 0, 0, 0 }; void av1_get_inv_txfm_cfg(TX_TYPE tx_type, TX_SIZE tx_size, TXFM_2D_FLIP_CFG *cfg) { assert(cfg != NULL); cfg->tx_size = tx_size; - set_flip_cfg(tx_type, cfg); av1_zero(cfg->stage_range_col); av1_zero(cfg->stage_range_row); set_flip_cfg(tx_type, cfg); const TX_TYPE_1D tx_type_1d_col = vtx_tab[tx_type]; const TX_TYPE_1D tx_type_1d_row = htx_tab[tx_type]; - cfg->shift = inv_txfm_shift_ls[tx_size]; + cfg->shift = av1_inv_txfm_shift_ls[tx_size]; const int txw_idx = get_txw_idx(tx_size); const int txh_idx = get_txh_idx(tx_size); - cfg->cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx]; - cfg->cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx]; + cfg->cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx]; + cfg->cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx]; cfg->txfm_type_col = av1_txfm_type_ls[txh_idx][tx_type_1d_col]; if (cfg->txfm_type_col == TXFM_TYPE_ADST4) { memcpy(cfg->stage_range_col, iadst4_range, sizeof(iadst4_range)); @@ -229,7 +228,7 @@ void av1_gen_inv_stage_range(int8_t *stage_range_col, int8_t *stage_range_row, (void)real_range_row; if (cfg->txfm_type_row == TXFM_TYPE_ADST4 && i == 1) { // the adst4 may use 1 extra bit on top of opt_range_row at stage 1 - // so opt_range_col >= real_range_col will not hold + // so opt_range_row >= real_range_row will not hold 
stage_range_row[i] = opt_range_row; } else { assert(opt_range_row >= real_range_row); @@ -242,7 +241,7 @@ void av1_gen_inv_stage_range(int8_t *stage_range_col, int8_t *stage_range_row, cfg->stage_range_col[i] + fwd_shift + shift[0] + bd + 1; (void)real_range_col; if (cfg->txfm_type_col == TXFM_TYPE_ADST4 && i == 1) { - // the adst4 may use 1 extra bit on top of opt_range_row at stage 1 + // the adst4 may use 1 extra bit on top of opt_range_col at stage 1 // so opt_range_col >= real_range_col will not hold stage_range_col[i] = opt_range_col; } else { diff --git a/media/libaom/src/av1/common/av1_loopfilter.c b/media/libaom/src/av1/common/av1_loopfilter.c index 537d8dfe9..c756760de 100644 --- a/media/libaom/src/av1/common/av1_loopfilter.c +++ b/media/libaom/src/av1/common/av1_loopfilter.c @@ -17,8 +17,8 @@ #include "aom_dsp/aom_dsp_common.h" #include "aom_mem/aom_mem.h" #include "aom_ports/mem.h" +#include "av1/common/av1_common_int.h" #include "av1/common/av1_loopfilter.h" -#include "av1/common/onyxc_int.h" #include "av1/common/reconinter.h" #include "av1/common/seg_common.h" @@ -28,11 +28,9 @@ static const SEG_LVL_FEATURES seg_lvl_lf_lut[MAX_MB_PLANE][2] = { { SEG_LVL_ALT_LF_V, SEG_LVL_ALT_LF_V } }; -static const int delta_lf_id_lut[MAX_MB_PLANE][2] = { - { 0, 1 }, { 2, 2 }, { 3, 3 } -}; - -typedef enum EDGE_DIR { VERT_EDGE = 0, HORZ_EDGE = 1, NUM_EDGE_DIRS } EDGE_DIR; +static const int delta_lf_id_lut[MAX_MB_PLANE][2] = { { 0, 1 }, + { 2, 2 }, + { 3, 3 } }; static const int mode_lf_lut[] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // INTRA_MODES @@ -40,393 +38,6 @@ static const int mode_lf_lut[] = { 1, 1, 1, 1, 1, 1, 0, 1 // INTER_COMPOUND_MODES (GLOBAL_GLOBALMV == 0) }; -#if LOOP_FILTER_BITMASK -// 256 bit masks (64x64 / 4x4) for left transform size for Y plane. -// We use 4 uint64_t to represent the 256 bit. -// Each 1 represents a position where we should apply a loop filter -// across the left border of an 4x4 block boundary. 
-// -// In the case of TX_8x8-> ( in low order byte first we end up with -// a mask that looks like this (-- and | are used for better view) -// -// 10101010|10101010 -// 10101010|10101010 -// 10101010|10101010 -// 10101010|10101010 -// 10101010|10101010 -// 10101010|10101010 -// 10101010|10101010 -// 10101010|10101010 -// ----------------- -// 10101010|10101010 -// 10101010|10101010 -// 10101010|10101010 -// 10101010|10101010 -// 10101010|10101010 -// 10101010|10101010 -// 10101010|10101010 -// 10101010|10101010 -// -// A loopfilter should be applied to every other 4x4 horizontally. - -// 256 bit masks (64x64 / 4x4) for above transform size for Y plane. -// We use 4 uint64_t to represent the 256 bit. -// Each 1 represents a position where we should apply a loop filter -// across the top border of an 4x4 block boundary. -// -// In the case of TX_8x8-> ( in low order byte first we end up with -// a mask that looks like this -// -// 11111111|11111111 -// 00000000|00000000 -// 11111111|11111111 -// 00000000|00000000 -// 11111111|11111111 -// 00000000|00000000 -// 11111111|11111111 -// 00000000|00000000 -// ----------------- -// 11111111|11111111 -// 00000000|00000000 -// 11111111|11111111 -// 00000000|00000000 -// 11111111|11111111 -// 00000000|00000000 -// 11111111|11111111 -// 00000000|00000000 -// -// A loopfilter should be applied to every other 4x4 horizontally. 
- -const int mask_id_table_tx_4x4[BLOCK_SIZES_ALL] = { - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, -1, -1, -1, 13, 14, 15, 16, 17, 18 -}; - -const int mask_id_table_tx_8x8[BLOCK_SIZES_ALL] = { - -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, 10, 11, 12, 13 -}; - -const int mask_id_table_tx_16x16[BLOCK_SIZES_ALL] = { - -1, -1, -1, -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, -1, -1, -1, -1, -1, -1, -1, 7, 8 -}; - -const int mask_id_table_tx_32x32[BLOCK_SIZES_ALL] = { -1, -1, -1, -1, -1, -1, - -1, -1, -1, 0, 1, 2, - 3, -1, -1, -1, -1, -1, - -1, -1, -1, -1 }; - -const FilterMask left_mask_univariant_reordered[67] = { - // TX_4X4 - { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 4X4, TX_4X4 - { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 4X8, TX_4X4 - { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 8X4, TX_4X4 - { { 0x0000000000030003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 8X8, TX_4X4 - { { 0x0003000300030003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 8X16, TX_4X4 - { { 0x00000000000f000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 16X8, TX_4X4 - { { 0x000f000f000f000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 16X16, TX_4X4 - { { 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 16X32, TX_4X4 - { { 0x00ff00ff00ff00ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 32X16, TX_4X4 - { { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 32X32, TX_4X4 - { { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, 
0x00ff00ff00ff00ffULL, - 0x00ff00ff00ff00ffULL } }, // block size 32X64, TX_4X4 - { { 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 64X32, TX_4X4 - { { 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL, - 0xffffffffffffffffULL } }, // block size 64X64, TX_4X4 - { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 4X16, TX_4X4 - { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 16X4, TX_4X4 - { { 0x0003000300030003ULL, 0x0003000300030003ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 8X32, TX_4X4 - { { 0x0000000000ff00ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 32X8, TX_4X4 - { { 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x000f000f000f000fULL, - 0x000f000f000f000fULL } }, // block size 16X64, TX_4X4 - { { 0xffffffffffffffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 64X16, TX_4X4 - // TX_8X8 - { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 8X8, TX_8X8 - { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 8X16, TX_8X8 - { { 0x0000000000050005ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 16X8, TX_8X8 - { { 0x0005000500050005ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 16X16, TX_8X8 - { { 0x0005000500050005ULL, 0x0005000500050005ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 16X32, TX_8X8 - { { 0x0055005500550055ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 32X16, TX_8X8 - { { 0x0055005500550055ULL, 0x0055005500550055ULL, 
0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 32X32, TX_8X8 - { { 0x0055005500550055ULL, 0x0055005500550055ULL, 0x0055005500550055ULL, - 0x0055005500550055ULL } }, // block size 32X64, TX_8X8 - { { 0x5555555555555555ULL, 0x5555555555555555ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 64X32, TX_8X8 - { { 0x5555555555555555ULL, 0x5555555555555555ULL, 0x5555555555555555ULL, - 0x5555555555555555ULL } }, // block size 64X64, TX_8X8 - { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 8X32, TX_8X8 - { { 0x0000000000550055ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 32X8, TX_8X8 - { { 0x0005000500050005ULL, 0x0005000500050005ULL, 0x0005000500050005ULL, - 0x0005000500050005ULL } }, // block size 16X64, TX_8X8 - { { 0x5555555555555555ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 64X16, TX_8X8 - // TX_16X16 - { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 16X16, TX_16X16 - { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 16X32, TX_16X16 - { { 0x0011001100110011ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 32X16, TX_16X16 - { { 0x0011001100110011ULL, 0x0011001100110011ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 32X32, TX_16X16 - { { 0x0011001100110011ULL, 0x0011001100110011ULL, 0x0011001100110011ULL, - 0x0011001100110011ULL } }, // block size 32X64, TX_16X16 - { { 0x1111111111111111ULL, 0x1111111111111111ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 64X32, TX_16X16 - { { 0x1111111111111111ULL, 0x1111111111111111ULL, 0x1111111111111111ULL, - 0x1111111111111111ULL } }, // block size 64X64, TX_16X16 - { { 0x0001000100010001ULL, 
0x0001000100010001ULL, 0x0001000100010001ULL, - 0x0001000100010001ULL } }, // block size 16X64, TX_16X16 - { { 0x1111111111111111ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 64X16, TX_16X16 - // TX_32X32 - { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 32X32, TX_32X32 - { { 0x0101010101010101ULL, 0x0101010101010101ULL, 0x0101010101010101ULL, - 0x0101010101010101ULL } }, // block size 32X64, TX_32X32 - { { 0x0101010101010101ULL, 0x0101010101010101ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 64X32, TX_32X32 - { { 0x0101010101010101ULL, 0x0101010101010101ULL, 0x0101010101010101ULL, - 0x0101010101010101ULL } }, // block size 64X64, TX_32X32 - // TX_64X64 - { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL, - 0x0001000100010001ULL } }, // block size 64X64, TX_64X64 - // 2:1, 1:2 transform sizes. - { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 4X8, TX_4X8 - { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 4X16, TX_4X8 - { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 8X4, TX_8X4 - { { 0x0000000000000005ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 16X4, TX_8X4 - { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 8X16, TX_8X16 - { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 8X32, TX_8X16 - { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 16X8, TX_16X8 - { { 0x0000000000110011ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block 
size 32X8, TX_16X8 - { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 16X32, TX_16X32 - { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL, - 0x0001000100010001ULL } }, // block size 16X64, TX_16X32 - { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 32X16, TX_32X16 - { { 0x0101010101010101ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 64X16, TX_32X16 - { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL, - 0x0001000100010001ULL } }, // block size 32X64, TX_32X64 - { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 64X32, TX_64X32 - // 4:1, 1:4 transform sizes. - { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 4X16, TX_4X16 - { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 16X4, TX_16X4 - { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 8X32, TX_8X32 - { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 32X8, TX_32X8 - { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL, - 0x0001000100010001ULL } }, // block size 16X64, TX_16X64 - { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 64X16, TX_64X16 -}; - -const FilterMask above_mask_univariant_reordered[67] = { - // TX_4X4 - { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 4X4, TX_4X4 - { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 4X8, TX_4X4 - { { 
0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 8X4, TX_4X4 - { { 0x0000000000030003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 8X8, TX_4X4 - { { 0x0003000300030003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 8X16, TX_4X4 - { { 0x00000000000f000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 16X8, TX_4X4 - { { 0x000f000f000f000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 16X16, TX_4X4 - { { 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 16X32, TX_4X4 - { { 0x00ff00ff00ff00ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 32X16, TX_4X4 - { { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 32X32, TX_4X4 - { { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, - 0x00ff00ff00ff00ffULL } }, // block size 32X64, TX_4X4 - { { 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 64X32, TX_4X4 - { { 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL, - 0xffffffffffffffffULL } }, // block size 64X64, TX_4x4 - { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 4X16, TX_4X4 - { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 16X4, TX_4X4 - { { 0x0003000300030003ULL, 0x0003000300030003ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 8X32, TX_4X4 - { { 0x0000000000ff00ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 32X8, TX_4X4 - { { 0x000f000f000f000fULL, 
0x000f000f000f000fULL, 0x000f000f000f000fULL, - 0x000f000f000f000fULL } }, // block size 16X64, TX_4X4 - { { 0xffffffffffffffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 64X16, TX_4X4 - // TX_8X8 - { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 8X8, TX_8X8 - { { 0x0000000300000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 8X16, TX_8X8 - { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 16X8, TX_8X8 - { { 0x0000000f0000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 16X16, TX_8X8 - { { 0x0000000f0000000fULL, 0x0000000f0000000fULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 16X32, TX_8X8 - { { 0x000000ff000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 32X16, TX_8X8 - { { 0x000000ff000000ffULL, 0x000000ff000000ffULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 32X32, TX_8X8 - { { 0x000000ff000000ffULL, 0x000000ff000000ffULL, 0x000000ff000000ffULL, - 0x000000ff000000ffULL } }, // block size 32X64, TX_8X8 - { { 0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 64X32, TX_8X8 - { { 0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL, - 0x0000ffff0000ffffULL } }, // block size 64X64, TX_8X8 - { { 0x0000000300000003ULL, 0x0000000300000003ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 8X32, TX_8X8 - { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 32X8, TX_8X8 - { { 0x0000000f0000000fULL, 0x0000000f0000000fULL, 0x0000000f0000000fULL, - 0x0000000f0000000fULL } }, // block size 16X64, TX_8X8 - { { 0x0000ffff0000ffffULL, 
0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 64X16, TX_8X8 - // TX_16X16 - { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 16X16, TX_16X16 - { { 0x000000000000000fULL, 0x000000000000000fULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 16X32, TX_16X16 - { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 32X16, TX_16X16 - { { 0x00000000000000ffULL, 0x00000000000000ffULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 32X32, TX_16X16 - { { 0x00000000000000ffULL, 0x00000000000000ffULL, 0x00000000000000ffULL, - 0x00000000000000ffULL } }, // block size 32X64, TX_16X16 - { { 0x000000000000ffffULL, 0x000000000000ffffULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 64X32, TX_16X16 - { { 0x000000000000ffffULL, 0x000000000000ffffULL, 0x000000000000ffffULL, - 0x000000000000ffffULL } }, // block size 64X64, TX_16X16 - { { 0x000000000000000fULL, 0x000000000000000fULL, 0x000000000000000fULL, - 0x000000000000000fULL } }, // block size 16X64, TX_16X16 - { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 64X16, TX_16X16 - // TX_32X32 - { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 32X32, TX_32X32 - { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x00000000000000ffULL, - 0x0000000000000000ULL } }, // block size 32X64, TX_32X32 - { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 64X32, TX_32X32 - { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x000000000000ffffULL, - 0x0000000000000000ULL } }, // block size 64X64, TX_32X32 - // TX_64X64 - { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block 
size 64X64, TX_64X64 - // 2:1, 1:2 transform sizes. - { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 4X8, TX_4X8 - { { 0x0000000100000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 4X16, TX_4X8 - { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 8X4, TX_8X4 - { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 16X4, TX_8X4 - { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 8X16, TX_8X16 - { { 0x0000000000000003ULL, 0x0000000000000003ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 8X32, TX_8X16 - { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 16X8, TX_16X8 - { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 32X8, TX_16X8 - { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 16X32, TX_16X32 - { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x000000000000000fULL, - 0x0000000000000000ULL } }, // block size 16X64, TX_16X32 - { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 32X16, TX_32X16 - { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 64X16, TX_32X16 - { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 32X64, TX_32X64 - { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 64X32, TX_64X32 - // 4:1, 1:4 transform sizes. 
- { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 4X16, TX_4X16 - { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 16X4, TX_16X4 - { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 8X32, TX_8X32 - { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 32X8, TX_32X8 - { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 16X64, TX_16X64 - { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, - 0x0000000000000000ULL } }, // block size 64X16, TX_64X16 -}; - -LoopFilterMask *get_loop_filter_mask(const AV1_COMMON *const cm, int mi_row, - int mi_col) { - assert(cm->lf.lfm != NULL); - const int row = mi_row >> MIN_MIB_SIZE_LOG2; // 64x64 - const int col = mi_col >> MIN_MIB_SIZE_LOG2; - return &cm->lf.lfm[row * cm->lf.lfm_stride + col]; -} - -typedef void (*LpfFunc)(uint8_t *s, int p, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh); - -typedef void (*LpfDualFunc)(uint8_t *s, int p, const uint8_t *blimit0, - const uint8_t *limit0, const uint8_t *thresh0, - const uint8_t *blimit1, const uint8_t *limit1, - const uint8_t *thresh1); - -typedef void (*HbdLpfFunc)(uint16_t *s, int p, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh, int bd); - -typedef void (*HbdLpfDualFunc)(uint16_t *s, int p, const uint8_t *blimit0, - const uint8_t *limit0, const uint8_t *thresh0, - const uint8_t *blimit1, const uint8_t *limit1, - const uint8_t *thresh1, int bd); -#endif // LOOP_FILTER_BITMASK - static void update_sharpness(loop_filter_info_n *lfi, int sharpness_lvl) { int lvl; @@ -448,13 +59,13 @@ static void update_sharpness(loop_filter_info_n *lfi, int sharpness_lvl) { } } -uint8_t get_filter_level(const 
AV1_COMMON *cm, const loop_filter_info_n *lfi_n, - const int dir_idx, int plane, - const MB_MODE_INFO *mbmi) { +uint8_t av1_get_filter_level(const AV1_COMMON *cm, + const loop_filter_info_n *lfi_n, const int dir_idx, + int plane, const MB_MODE_INFO *mbmi) { const int segment_id = mbmi->segment_id; - if (cm->delta_lf_present_flag) { - int delta_lf; - if (cm->delta_lf_multi) { + if (cm->delta_q_info.delta_lf_present_flag) { + int8_t delta_lf; + if (cm->delta_q_info.delta_lf_multi) { const int delta_lf_idx = delta_lf_id_lut[plane][dir_idx]; delta_lf = mbmi->delta_lf[delta_lf_idx]; } else { @@ -531,6 +142,9 @@ void av1_loop_filter_frame_init(AV1_COMMON *cm, int plane_start, filt_lvl_r[1] = cm->lf.filter_level_u; filt_lvl_r[2] = cm->lf.filter_level_v; + assert(plane_start >= AOM_PLANE_Y); + assert(plane_end <= MAX_MB_PLANE); + for (plane = plane_start; plane < plane_end; plane++) { if (plane == 0 && !filt_lvl[0] && !filt_lvl_r[0]) break; @@ -542,7 +156,6 @@ void av1_loop_filter_frame_init(AV1_COMMON *cm, int plane_start, for (seg_id = 0; seg_id < MAX_SEGMENTS; seg_id++) { for (int dir = 0; dir < 2; ++dir) { int lvl_seg = (dir == 0) ? filt_lvl[plane] : filt_lvl_r[plane]; - assert(plane >= 0 && plane <= 2); const int seg_lf_feature_id = seg_lvl_lf_lut[plane][dir]; if (segfeature_active(seg, seg_id, seg_lf_feature_id)) { const int data = get_segdata(&cm->seg, seg_id, seg_lf_feature_id); @@ -575,1321 +188,6 @@ void av1_loop_filter_frame_init(AV1_COMMON *cm, int plane_start, } } -#if LOOP_FILTER_BITMASK -// A 64x64 tx block requires 256 bits to represent each 4x4 tx block. -// Every 4 rows is represented by one uint64_t mask. Hence, -// there are 4 uint64_t bitmask[4] to represent the 64x64 block. -// -// Given a location by (mi_col, mi_row), This function returns the index -// 0, 1, 2, 3 to select which bitmask[] to use, and the shift value. -// -// For example, mi_row is the offset of pixels in mi size (4), -// (mi_row / 4) returns which uint64_t. 
-// After locating which uint64_t, mi_row % 4 is the -// row offset, and each row has 16 = 1 << stride_log2 4x4 units. -// Therefore, shift = (row << stride_log2) + mi_col; -int get_index_shift(int mi_col, int mi_row, int *index) { - // *index = mi_row >> 2; - // rows = mi_row % 4; - // stride_log2 = 4; - // shift = (rows << stride_log2) + mi_col; - *index = mi_row >> 2; - return ((mi_row & 3) << 4) | mi_col; -} - -static void check_mask(const FilterMask *lfm) { -#ifndef NDEBUG - for (int i = 0; i < 4; ++i) { - assert(!(lfm[TX_4X4].bits[i] & lfm[TX_8X8].bits[i])); - assert(!(lfm[TX_4X4].bits[i] & lfm[TX_16X16].bits[i])); - assert(!(lfm[TX_4X4].bits[i] & lfm[TX_32X32].bits[i])); - assert(!(lfm[TX_4X4].bits[i] & lfm[TX_64X64].bits[i])); - assert(!(lfm[TX_8X8].bits[i] & lfm[TX_16X16].bits[i])); - assert(!(lfm[TX_8X8].bits[i] & lfm[TX_32X32].bits[i])); - assert(!(lfm[TX_8X8].bits[i] & lfm[TX_64X64].bits[i])); - assert(!(lfm[TX_16X16].bits[i] & lfm[TX_32X32].bits[i])); - assert(!(lfm[TX_16X16].bits[i] & lfm[TX_64X64].bits[i])); - assert(!(lfm[TX_32X32].bits[i] & lfm[TX_64X64].bits[i])); - } -#else - (void)lfm; -#endif -} - -static void check_loop_filter_masks(const LoopFilterMask *lfm, int plane) { - if (plane == 0) { - // Assert if we try to apply 2 different loop filters at the same - // position. 
- check_mask(lfm->left_y); - check_mask(lfm->above_y); - } else if (plane == 1) { - check_mask(lfm->left_u); - check_mask(lfm->above_u); - } else { - check_mask(lfm->left_v); - check_mask(lfm->above_v); - } -} - -static void update_masks(EDGE_DIR dir, int plane, uint64_t *mask, - TX_SIZE sqr_tx_size, LoopFilterMask *lfm) { - if (dir == VERT_EDGE) { - switch (plane) { - case 0: - for (int i = 0; i < 4; ++i) lfm->left_y[sqr_tx_size].bits[i] |= mask[i]; - break; - case 1: - for (int i = 0; i < 4; ++i) lfm->left_u[sqr_tx_size].bits[i] |= mask[i]; - break; - case 2: - for (int i = 0; i < 4; ++i) lfm->left_v[sqr_tx_size].bits[i] |= mask[i]; - break; - default: assert(plane <= 2); - } - } else { - switch (plane) { - case 0: - for (int i = 0; i < 4; ++i) - lfm->above_y[sqr_tx_size].bits[i] |= mask[i]; - break; - case 1: - for (int i = 0; i < 4; ++i) - lfm->above_u[sqr_tx_size].bits[i] |= mask[i]; - break; - case 2: - for (int i = 0; i < 4; ++i) - lfm->above_v[sqr_tx_size].bits[i] |= mask[i]; - break; - default: assert(plane <= 2); - } - } -} - -static int is_frame_boundary(AV1_COMMON *const cm, int plane, int mi_row, - int mi_col, int ssx, int ssy, EDGE_DIR dir) { - if (plane && (ssx || ssy)) { - if (ssx && ssy) { // format 420 - if ((mi_row << MI_SIZE_LOG2) > cm->height || - (mi_col << MI_SIZE_LOG2) > cm->width) - return 1; - } else if (ssx) { // format 422 - if ((mi_row << MI_SIZE_LOG2) >= cm->height || - (mi_col << MI_SIZE_LOG2) > cm->width) - return 1; - } - } else { - if ((mi_row << MI_SIZE_LOG2) >= cm->height || - (mi_col << MI_SIZE_LOG2) >= cm->width) - return 1; - } - - int row_or_col; - if (plane == 0) { - row_or_col = dir == VERT_EDGE ? mi_col : mi_row; - } else { - // chroma sub8x8 block uses bottom/right mi of co-located 8x8 luma block. - // So if mi_col == 1, it is actually the frame boundary. - if (dir == VERT_EDGE) { - row_or_col = ssx ? (mi_col & 0x0FFFFFFE) : mi_col; - } else { - row_or_col = ssy ? 
(mi_row & 0x0FFFFFFE) : mi_row; - } - } - return row_or_col == 0; -} - -static void setup_masks(AV1_COMMON *const cm, int mi_row, int mi_col, int plane, - int ssx, int ssy, TX_SIZE tx_size) { - LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col); - const int x = (mi_col << (MI_SIZE_LOG2 - ssx)); - const int y = (mi_row << (MI_SIZE_LOG2 - ssy)); - // decide whether current vertical/horizontal edge needs loop filtering - for (EDGE_DIR dir = VERT_EDGE; dir <= HORZ_EDGE; ++dir) { - // chroma sub8x8 block uses bottom/right mi of co-located 8x8 luma block. - mi_row |= ssy; - mi_col |= ssx; - - MB_MODE_INFO **mi = cm->mi_grid_visible + mi_row * cm->mi_stride + mi_col; - const MB_MODE_INFO *const mbmi = mi[0]; - const int curr_skip = mbmi->skip && is_inter_block(mbmi); - const BLOCK_SIZE bsize = mbmi->sb_type; - const BLOCK_SIZE bsizec = scale_chroma_bsize(bsize, ssx, ssy); - const BLOCK_SIZE plane_bsize = ss_size_lookup[bsizec][ssx][ssy]; - const uint8_t level = get_filter_level(cm, &cm->lf_info, dir, plane, mbmi); - const int prediction_masks = dir == VERT_EDGE - ? block_size_wide[plane_bsize] - 1 - : block_size_high[plane_bsize] - 1; - const int is_coding_block_border = - dir == VERT_EDGE ? !(x & prediction_masks) : !(y & prediction_masks); - - // TODO(chengchen): step can be optimized. - const int row_step = mi_size_high[TX_4X4] << ssy; - const int col_step = mi_size_wide[TX_4X4] << ssx; - const int mi_height = - dir == VERT_EDGE ? tx_size_high_unit[tx_size] << ssy : row_step; - const int mi_width = - dir == VERT_EDGE ? col_step : tx_size_wide_unit[tx_size] << ssx; - - // assign filter levels - for (int r = mi_row; r < mi_row + mi_height; r += row_step) { - for (int c = mi_col; c < mi_col + mi_width; c += col_step) { - // do not filter frame boundary - // Note: when chroma planes' size are half of luma plane, - // chroma plane mi corresponds to even position. - // If frame size is not even, we still need to filter this chroma - // position. 
Therefore the boundary condition check needs to be - // separated to two cases. - if (plane && (ssx || ssy)) { - if (ssx && ssy) { // format 420 - if ((r << MI_SIZE_LOG2) > cm->height || - (c << MI_SIZE_LOG2) > cm->width) - continue; - } else if (ssx) { // format 422 - if ((r << MI_SIZE_LOG2) >= cm->height || - (c << MI_SIZE_LOG2) > cm->width) - continue; - } - } else { - if ((r << MI_SIZE_LOG2) >= cm->height || - (c << MI_SIZE_LOG2) >= cm->width) - continue; - } - - const int row = r % MI_SIZE_64X64; - const int col = c % MI_SIZE_64X64; - if (plane == 0) { - if (dir == VERT_EDGE) - lfm->lfl_y_ver[row][col] = level; - else - lfm->lfl_y_hor[row][col] = level; - } else if (plane == 1) { - lfm->lfl_u[row][col] = level; - } else { - lfm->lfl_v[row][col] = level; - } - } - } - - for (int r = mi_row; r < mi_row + mi_height; r += row_step) { - for (int c = mi_col; c < mi_col + mi_width; c += col_step) { - // do not filter frame boundary - if (is_frame_boundary(cm, plane, r, c, ssx, ssy, dir)) continue; - - uint64_t mask[4] = { 0 }; - const int prev_row = dir == VERT_EDGE ? r : r - (1 << ssy); - const int prev_col = dir == VERT_EDGE ? c - (1 << ssx) : c; - MB_MODE_INFO **mi_prev = - cm->mi_grid_visible + prev_row * cm->mi_stride + prev_col; - const MB_MODE_INFO *const mbmi_prev = mi_prev[0]; - const int prev_skip = mbmi_prev->skip && is_inter_block(mbmi_prev); - const uint8_t level_prev = - get_filter_level(cm, &cm->lf_info, dir, plane, mbmi_prev); - const int is_edge = - (level || level_prev) && - (!curr_skip || !prev_skip || is_coding_block_border); - - if (is_edge) { - const TX_SIZE prev_tx_size = - plane ? av1_get_max_uv_txsize(mbmi_prev->sb_type, ssx, ssy) - : mbmi_prev->tx_size; - TX_SIZE min_tx_size = (dir == VERT_EDGE) - ? 
AOMMIN(txsize_horz_map[tx_size], - txsize_horz_map[prev_tx_size]) - : AOMMIN(txsize_vert_map[tx_size], - txsize_vert_map[prev_tx_size]); - min_tx_size = AOMMIN(min_tx_size, TX_16X16); - assert(min_tx_size < TX_SIZES); - const int row = r % MI_SIZE_64X64; - const int col = c % MI_SIZE_64X64; - int index = 0; - const int shift = get_index_shift(col, row, &index); - assert(index < 4 && index >= 0); - mask[index] |= ((uint64_t)1 << shift); - // set mask on corresponding bit - update_masks(dir, plane, mask, min_tx_size, lfm); - } - } - } - } -} - -static void setup_tx_block_mask(AV1_COMMON *const cm, int mi_row, int mi_col, - int blk_row, int blk_col, - BLOCK_SIZE plane_bsize, TX_SIZE tx_size, - int plane, int ssx, int ssy) { - blk_row <<= ssy; - blk_col <<= ssx; - if (((mi_row + blk_row) << MI_SIZE_LOG2) >= cm->height || - ((mi_col + blk_col) << MI_SIZE_LOG2) >= cm->width) - return; - - // U/V plane, tx_size is always the largest size - if (plane) { - assert(tx_size_wide[tx_size] <= 32 && tx_size_high[tx_size] <= 32); - setup_masks(cm, mi_row + blk_row, mi_col + blk_col, plane, ssx, ssy, - tx_size); - return; - } - - MB_MODE_INFO **mi = cm->mi_grid_visible + mi_row * cm->mi_stride + mi_col; - const MB_MODE_INFO *const mbmi = mi[0]; - // For Y plane: - // If intra block, tx size is univariant. - // If inter block, tx size follows inter_tx_size. - TX_SIZE plane_tx_size = tx_size; - const int is_inter = is_inter_block(mbmi); - - if (plane == 0) { - if (is_inter) { - if (mbmi->skip) { - // TODO(chengchen): change av1_get_transform_size() to be consistant. 
- // plane_tx_size = get_max_rect_tx_size(plane_bsize); - plane_tx_size = mbmi->tx_size; - } else { - plane_tx_size = mbmi->inter_tx_size[av1_get_txb_size_index( - plane_bsize, blk_row, blk_col)]; - } - } else { - MB_MODE_INFO **mi_this = cm->mi_grid_visible + - (mi_row + blk_row) * cm->mi_stride + mi_col + - blk_col; - const MB_MODE_INFO *const mbmi_this = mi_this[0]; - plane_tx_size = mbmi_this->tx_size; - } - } - - assert(txsize_to_bsize[plane_tx_size] <= plane_bsize); - - if (plane || plane_tx_size == tx_size) { - setup_masks(cm, mi_row + blk_row, mi_col + blk_col, plane, ssx, ssy, - tx_size); - } else { - const TX_SIZE sub_txs = sub_tx_size_map[tx_size]; - const int bsw = tx_size_wide_unit[sub_txs]; - const int bsh = tx_size_high_unit[sub_txs]; - for (int row = 0; row < tx_size_high_unit[tx_size]; row += bsh) { - for (int col = 0; col < tx_size_wide_unit[tx_size]; col += bsw) { - const int offsetr = blk_row + row; - const int offsetc = blk_col + col; - setup_tx_block_mask(cm, mi_row, mi_col, offsetr, offsetc, plane_bsize, - sub_txs, plane, ssx, ssy); - } - } - } -} - -static void setup_fix_block_mask(AV1_COMMON *const cm, int mi_row, int mi_col, - int plane, int ssx, int ssy) { - MB_MODE_INFO **mi = - cm->mi_grid_visible + (mi_row | ssy) * cm->mi_stride + (mi_col | ssx); - const MB_MODE_INFO *const mbmi = mi[0]; - - const BLOCK_SIZE bsize = mbmi->sb_type; - const BLOCK_SIZE bsizec = scale_chroma_bsize(bsize, ssx, ssy); - const BLOCK_SIZE plane_bsize = ss_size_lookup[bsizec][ssx][ssy]; - - const int block_width = mi_size_wide[plane_bsize]; - const int block_height = mi_size_high[plane_bsize]; - - TX_SIZE max_txsize = max_txsize_rect_lookup[plane_bsize]; - // The decoder is designed so that it can process 64x64 luma pixels at a - // time. If this is a chroma plane with subsampling and bsize corresponds to - // a subsampled BLOCK_128X128 then the lookup above will give TX_64X64. 
That - // mustn't be used for the subsampled plane (because it would be bigger than - // a 64x64 luma block) so we round down to TX_32X32. - if (plane && txsize_sqr_up_map[max_txsize] == TX_64X64) { - if (max_txsize == TX_16X64) - max_txsize = TX_16X32; - else if (max_txsize == TX_64X16) - max_txsize = TX_32X16; - else - max_txsize = TX_32X32; - } - - const BLOCK_SIZE txb_size = txsize_to_bsize[max_txsize]; - const int bw = block_size_wide[txb_size] >> tx_size_wide_log2[0]; - const int bh = block_size_high[txb_size] >> tx_size_wide_log2[0]; - const BLOCK_SIZE max_unit_bsize = ss_size_lookup[BLOCK_64X64][ssx][ssy]; - int mu_blocks_wide = block_size_wide[max_unit_bsize] >> tx_size_wide_log2[0]; - int mu_blocks_high = block_size_high[max_unit_bsize] >> tx_size_high_log2[0]; - - mu_blocks_wide = AOMMIN(block_width, mu_blocks_wide); - mu_blocks_high = AOMMIN(block_height, mu_blocks_high); - - // Y: Largest tx_size is 64x64, while superblock size can be 128x128. - // Here we ensure that setup_tx_block_mask process at most a 64x64 block. - // U/V: largest tx size is 32x32. 
- for (int idy = 0; idy < block_height; idy += mu_blocks_high) { - for (int idx = 0; idx < block_width; idx += mu_blocks_wide) { - const int unit_height = AOMMIN(mu_blocks_high + idy, block_height); - const int unit_width = AOMMIN(mu_blocks_wide + idx, block_width); - for (int blk_row = idy; blk_row < unit_height; blk_row += bh) { - for (int blk_col = idx; blk_col < unit_width; blk_col += bw) { - setup_tx_block_mask(cm, mi_row, mi_col, blk_row, blk_col, plane_bsize, - max_txsize, plane, ssx, ssy); - } - } - } - } -} - -static void setup_block_mask(AV1_COMMON *const cm, int mi_row, int mi_col, - BLOCK_SIZE bsize, int plane, int ssx, int ssy) { - if ((mi_row << MI_SIZE_LOG2) >= cm->height || - (mi_col << MI_SIZE_LOG2) >= cm->width) - return; - - const PARTITION_TYPE partition = get_partition(cm, mi_row, mi_col, bsize); - const BLOCK_SIZE subsize = get_partition_subsize(bsize, partition); - const int hbs = mi_size_wide[bsize] / 2; - const int quarter_step = mi_size_wide[bsize] / 4; - const int allow_sub8x8 = (ssx || ssy) ? 
bsize > BLOCK_8X8 : 1; - const int has_next_row = - (((mi_row + hbs) << MI_SIZE_LOG2) < cm->height) & allow_sub8x8; - const int has_next_col = - (((mi_col + hbs) << MI_SIZE_LOG2) < cm->width) & allow_sub8x8; - int i; - - switch (partition) { - case PARTITION_NONE: - setup_fix_block_mask(cm, mi_row, mi_col, plane, ssx, ssy); - break; - case PARTITION_HORZ: - setup_fix_block_mask(cm, mi_row, mi_col, plane, ssx, ssy); - if (has_next_row) - setup_fix_block_mask(cm, mi_row + hbs, mi_col, plane, ssx, ssy); - break; - case PARTITION_VERT: - setup_fix_block_mask(cm, mi_row, mi_col, plane, ssx, ssy); - if (has_next_col) - setup_fix_block_mask(cm, mi_row, mi_col + hbs, plane, ssx, ssy); - break; - case PARTITION_SPLIT: - setup_block_mask(cm, mi_row, mi_col, subsize, plane, ssx, ssy); - if (has_next_col) - setup_block_mask(cm, mi_row, mi_col + hbs, subsize, plane, ssx, ssy); - if (has_next_row) - setup_block_mask(cm, mi_row + hbs, mi_col, subsize, plane, ssx, ssy); - if (has_next_col & has_next_row) - setup_block_mask(cm, mi_row + hbs, mi_col + hbs, subsize, plane, ssx, - ssy); - break; - case PARTITION_HORZ_A: - setup_fix_block_mask(cm, mi_row, mi_col, plane, ssx, ssy); - if (has_next_col) - setup_fix_block_mask(cm, mi_row, mi_col + hbs, plane, ssx, ssy); - if (has_next_row) - setup_fix_block_mask(cm, mi_row + hbs, mi_col, plane, ssx, ssy); - break; - case PARTITION_HORZ_B: - setup_fix_block_mask(cm, mi_row, mi_col, plane, ssx, ssy); - if (has_next_row) - setup_fix_block_mask(cm, mi_row + hbs, mi_col, plane, ssx, ssy); - if (has_next_col & has_next_row) - setup_fix_block_mask(cm, mi_row + hbs, mi_col + hbs, plane, ssx, ssy); - break; - case PARTITION_VERT_A: - setup_fix_block_mask(cm, mi_row, mi_col, plane, ssx, ssy); - if (has_next_row) - setup_fix_block_mask(cm, mi_row + hbs, mi_col, plane, ssx, ssy); - if (has_next_col) - setup_fix_block_mask(cm, mi_row, mi_col + hbs, plane, ssx, ssy); - break; - case PARTITION_VERT_B: - setup_fix_block_mask(cm, mi_row, mi_col, plane, 
ssx, ssy); - if (has_next_col) - setup_fix_block_mask(cm, mi_row, mi_col + hbs, plane, ssx, ssy); - if (has_next_row) - setup_fix_block_mask(cm, mi_row + hbs, mi_col + hbs, plane, ssx, ssy); - break; - case PARTITION_HORZ_4: - for (i = 0; i < 4; ++i) { - int this_mi_row = mi_row + i * quarter_step; - if (i > 0 && (this_mi_row << MI_SIZE_LOG2) >= cm->height) break; - // chroma plane filter the odd location - if (plane && bsize == BLOCK_16X16 && (i & 0x01)) continue; - - setup_fix_block_mask(cm, this_mi_row, mi_col, plane, ssx, ssy); - } - break; - case PARTITION_VERT_4: - for (i = 0; i < 4; ++i) { - int this_mi_col = mi_col + i * quarter_step; - if (i > 0 && this_mi_col >= cm->mi_cols) break; - // chroma plane filter the odd location - if (plane && bsize == BLOCK_16X16 && (i & 0x01)) continue; - - setup_fix_block_mask(cm, mi_row, this_mi_col, plane, ssx, ssy); - } - break; - default: assert(0); - } -} - -// TODO(chengchen): if lossless, do not need to setup mask. But when -// segments enabled, each segment has different lossless settings. 
-void av1_setup_bitmask(AV1_COMMON *const cm, int mi_row, int mi_col, int plane, - int subsampling_x, int subsampling_y, int row_end, - int col_end) { - const int num_64x64 = cm->seq_params.mib_size >> MIN_MIB_SIZE_LOG2; - for (int y = 0; y < num_64x64; ++y) { - for (int x = 0; x < num_64x64; ++x) { - const int row = mi_row + y * MI_SIZE_64X64; - const int col = mi_col + x * MI_SIZE_64X64; - if (row >= row_end || col >= col_end) continue; - if ((row << MI_SIZE_LOG2) >= cm->height || - (col << MI_SIZE_LOG2) >= cm->width) - continue; - - LoopFilterMask *lfm = get_loop_filter_mask(cm, row, col); - if (lfm == NULL) return; - - // init mask to zero - if (plane == 0) { - av1_zero(lfm->left_y); - av1_zero(lfm->above_y); - av1_zero(lfm->lfl_y_ver); - av1_zero(lfm->lfl_y_hor); - } else if (plane == 1) { - av1_zero(lfm->left_u); - av1_zero(lfm->above_u); - av1_zero(lfm->lfl_u); - } else { - av1_zero(lfm->left_v); - av1_zero(lfm->above_v); - av1_zero(lfm->lfl_v); - } - } - } - - // set up bitmask for each superblock - setup_block_mask(cm, mi_row, mi_col, cm->seq_params.sb_size, plane, - subsampling_x, subsampling_y); - - for (int y = 0; y < num_64x64; ++y) { - for (int x = 0; x < num_64x64; ++x) { - const int row = mi_row + y * MI_SIZE_64X64; - const int col = mi_col + x * MI_SIZE_64X64; - if (row >= row_end || col >= col_end) continue; - if ((row << MI_SIZE_LOG2) >= cm->height || - (col << MI_SIZE_LOG2) >= cm->width) - continue; - - LoopFilterMask *lfm = get_loop_filter_mask(cm, row, col); - if (lfm == NULL) return; - - // check if the mask is valid - check_loop_filter_masks(lfm, plane); - - { - // Let 16x16 hold 32x32 (Y/U/V) and 64x64(Y only). - // Even tx size is greater, we only apply max length filter, which - // is 16. 
- if (plane == 0) { - for (int j = 0; j < 4; ++j) { - lfm->left_y[TX_16X16].bits[j] |= lfm->left_y[TX_32X32].bits[j]; - lfm->left_y[TX_16X16].bits[j] |= lfm->left_y[TX_64X64].bits[j]; - lfm->above_y[TX_16X16].bits[j] |= lfm->above_y[TX_32X32].bits[j]; - lfm->above_y[TX_16X16].bits[j] |= lfm->above_y[TX_64X64].bits[j]; - - // set 32x32 and 64x64 to 0 - lfm->left_y[TX_32X32].bits[j] = 0; - lfm->left_y[TX_64X64].bits[j] = 0; - lfm->above_y[TX_32X32].bits[j] = 0; - lfm->above_y[TX_64X64].bits[j] = 0; - } - } else if (plane == 1) { - for (int j = 0; j < 4; ++j) { - lfm->left_u[TX_16X16].bits[j] |= lfm->left_u[TX_32X32].bits[j]; - lfm->above_u[TX_16X16].bits[j] |= lfm->above_u[TX_32X32].bits[j]; - - // set 32x32 to 0 - lfm->left_u[TX_32X32].bits[j] = 0; - lfm->above_u[TX_32X32].bits[j] = 0; - } - } else { - for (int j = 0; j < 4; ++j) { - lfm->left_v[TX_16X16].bits[j] |= lfm->left_v[TX_32X32].bits[j]; - lfm->above_v[TX_16X16].bits[j] |= lfm->above_v[TX_32X32].bits[j]; - - // set 32x32 to 0 - lfm->left_v[TX_32X32].bits[j] = 0; - lfm->above_v[TX_32X32].bits[j] = 0; - } - } - } - - // check if the mask is valid - check_loop_filter_masks(lfm, plane); - } - } -} - -static void filter_selectively_vert_row2( - int subsampling_factor, uint8_t *s, int pitch, int plane, - uint64_t mask_16x16_0, uint64_t mask_8x8_0, uint64_t mask_4x4_0, - uint64_t mask_16x16_1, uint64_t mask_8x8_1, uint64_t mask_4x4_1, - const loop_filter_info_n *lfi_n, uint8_t *lfl, uint8_t *lfl2) { - uint64_t mask; - const int step = 1 << subsampling_factor; - - for (mask = mask_16x16_0 | mask_8x8_0 | mask_4x4_0 | mask_16x16_1 | - mask_8x8_1 | mask_4x4_1; - mask; mask >>= step) { - const loop_filter_thresh *lfi0 = lfi_n->lfthr + *lfl; - const loop_filter_thresh *lfi1 = lfi_n->lfthr + *lfl2; - - if (mask & 1) { - if ((mask_16x16_0 | mask_16x16_1) & 1) { - // chroma plane filters less pixels introduced in deblock_13tap - // experiment - LpfFunc lpf_vertical = plane ? 
aom_lpf_vertical_6 : aom_lpf_vertical_14; - - if ((mask_16x16_0 & mask_16x16_1) & 1) { - if (plane) { - aom_lpf_vertical_6_dual(s, pitch, lfi0->mblim, lfi0->lim, - lfi0->hev_thr, lfi1->mblim, lfi1->lim, - lfi1->hev_thr); - } else { - aom_lpf_vertical_14_dual(s, pitch, lfi0->mblim, lfi0->lim, - lfi0->hev_thr, lfi1->mblim, lfi1->lim, - lfi1->hev_thr); - } - } else if (mask_16x16_0 & 1) { - lpf_vertical(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr); - } else { - lpf_vertical(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim, - lfi1->hev_thr); - } - } - - if ((mask_8x8_0 | mask_8x8_1) & 1) { - // chroma plane filters less pixels introduced in deblock_13tap - // experiment - LpfFunc lpf_vertical = plane ? aom_lpf_vertical_6 : aom_lpf_vertical_8; - - if ((mask_8x8_0 & mask_8x8_1) & 1) { - if (plane) { - aom_lpf_vertical_6_dual(s, pitch, lfi0->mblim, lfi0->lim, - lfi0->hev_thr, lfi1->mblim, lfi1->lim, - lfi1->hev_thr); - } else { - aom_lpf_vertical_8_dual(s, pitch, lfi0->mblim, lfi0->lim, - lfi0->hev_thr, lfi1->mblim, lfi1->lim, - lfi1->hev_thr); - } - } else if (mask_8x8_0 & 1) { - lpf_vertical(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr); - } else { - lpf_vertical(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim, - lfi1->hev_thr); - } - } - - if ((mask_4x4_0 | mask_4x4_1) & 1) { - if ((mask_4x4_0 & mask_4x4_1) & 1) { - aom_lpf_vertical_4_dual(s, pitch, lfi0->mblim, lfi0->lim, - lfi0->hev_thr, lfi1->mblim, lfi1->lim, - lfi1->hev_thr); - } else if (mask_4x4_0 & 1) { - aom_lpf_vertical_4(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr); - } else { - aom_lpf_vertical_4(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim, - lfi1->hev_thr); - } - } - } - - s += 4; - lfl += step; - lfl2 += step; - mask_16x16_0 >>= step; - mask_8x8_0 >>= step; - mask_4x4_0 >>= step; - mask_16x16_1 >>= step; - mask_8x8_1 >>= step; - mask_4x4_1 >>= step; - } -} - -static void highbd_filter_selectively_vert_row2( - int subsampling_factor, uint16_t *s, int pitch, int plane, - uint64_t mask_16x16_0, uint64_t 
mask_8x8_0, uint64_t mask_4x4_0, - uint64_t mask_16x16_1, uint64_t mask_8x8_1, uint64_t mask_4x4_1, - const loop_filter_info_n *lfi_n, uint8_t *lfl, uint8_t *lfl2, int bd) { - uint64_t mask; - const int step = 1 << subsampling_factor; - - for (mask = mask_16x16_0 | mask_8x8_0 | mask_4x4_0 | mask_16x16_1 | - mask_8x8_1 | mask_4x4_1; - mask; mask >>= step) { - const loop_filter_thresh *lfi0 = lfi_n->lfthr + *lfl; - const loop_filter_thresh *lfi1 = lfi_n->lfthr + *lfl2; - - if (mask & 1) { - if ((mask_16x16_0 | mask_16x16_1) & 1) { - // chroma plane filters less pixels introduced in deblock_13tap - // experiment - HbdLpfFunc highbd_lpf_vertical = - plane ? aom_highbd_lpf_vertical_6 : aom_highbd_lpf_vertical_14; - - if ((mask_16x16_0 & mask_16x16_1) & 1) { - if (plane) { - aom_highbd_lpf_vertical_6_dual(s, pitch, lfi0->mblim, lfi0->lim, - lfi0->hev_thr, lfi1->mblim, - lfi1->lim, lfi1->hev_thr, bd); - } else { - aom_highbd_lpf_vertical_14_dual(s, pitch, lfi0->mblim, lfi0->lim, - lfi0->hev_thr, lfi1->mblim, - lfi1->lim, lfi1->hev_thr, bd); - } - } else if (mask_16x16_0 & 1) { - highbd_lpf_vertical(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr, - bd); - } else { - highbd_lpf_vertical(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim, - lfi1->hev_thr, bd); - } - } - - if ((mask_8x8_0 | mask_8x8_1) & 1) { - HbdLpfFunc highbd_lpf_vertical = - plane ? 
aom_highbd_lpf_vertical_6 : aom_highbd_lpf_vertical_8; - - if ((mask_8x8_0 & mask_8x8_1) & 1) { - if (plane) { - aom_highbd_lpf_vertical_6_dual(s, pitch, lfi0->mblim, lfi0->lim, - lfi0->hev_thr, lfi1->mblim, - lfi1->lim, lfi1->hev_thr, bd); - } else { - aom_highbd_lpf_vertical_8_dual(s, pitch, lfi0->mblim, lfi0->lim, - lfi0->hev_thr, lfi1->mblim, - lfi1->lim, lfi1->hev_thr, bd); - } - } else if (mask_8x8_0 & 1) { - highbd_lpf_vertical(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr, - bd); - } else { - highbd_lpf_vertical(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim, - lfi1->hev_thr, bd); - } - } - - if ((mask_4x4_0 | mask_4x4_1) & 1) { - if ((mask_4x4_0 & mask_4x4_1) & 1) { - aom_highbd_lpf_vertical_4_dual(s, pitch, lfi0->mblim, lfi0->lim, - lfi0->hev_thr, lfi1->mblim, lfi1->lim, - lfi1->hev_thr, bd); - } else if (mask_4x4_0 & 1) { - aom_highbd_lpf_vertical_4(s, pitch, lfi0->mblim, lfi0->lim, - lfi0->hev_thr, bd); - } else { - aom_highbd_lpf_vertical_4(s + 4 * pitch, pitch, lfi1->mblim, - lfi1->lim, lfi1->hev_thr, bd); - } - } - } - - s += 4; - lfl += step; - lfl2 += step; - mask_16x16_0 >>= step; - mask_8x8_0 >>= step; - mask_4x4_0 >>= step; - mask_16x16_1 >>= step; - mask_8x8_1 >>= step; - mask_4x4_1 >>= step; - } -} - -static void filter_selectively_horiz(uint8_t *s, int pitch, int plane, - int subsampling, uint64_t mask_16x16, - uint64_t mask_8x8, uint64_t mask_4x4, - const loop_filter_info_n *lfi_n, - const uint8_t *lfl) { - uint64_t mask; - int count; - const int step = 1 << subsampling; - const unsigned int two_block_mask = subsampling ? 5 : 3; - - for (mask = mask_16x16 | mask_8x8 | mask_4x4; mask; mask >>= step * count) { - const loop_filter_thresh *lfi = lfi_n->lfthr + *lfl; - // Next block's thresholds. - const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + step); - - count = 1; - if (mask & 1) { - if (mask_16x16 & 1) { - // chroma plane filters less pixels introduced in deblock_13tap - // experiment - LpfFunc lpf_horizontal = - plane ? 
aom_lpf_horizontal_6 : aom_lpf_horizontal_14; - - if ((mask_16x16 & two_block_mask) == two_block_mask) { - if (plane) { - aom_lpf_horizontal_6_dual(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, lfin->mblim, lfin->lim, - lfin->hev_thr); - } else { - aom_lpf_horizontal_14_dual(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, lfin->mblim, lfin->lim, - lfin->hev_thr); - } - count = 2; - } else { - lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr); - } - } else if (mask_8x8 & 1) { - // chroma plane filters less pixels introduced in deblock_13tap - // experiment - LpfFunc lpf_horizontal = - plane ? aom_lpf_horizontal_6 : aom_lpf_horizontal_8; - - if ((mask_8x8 & two_block_mask) == two_block_mask) { - if (plane) { - aom_lpf_horizontal_6_dual(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, lfin->mblim, lfin->lim, - lfin->hev_thr); - } else { - aom_lpf_horizontal_8_dual(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, lfin->mblim, lfin->lim, - lfin->hev_thr); - } - count = 2; - } else { - lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr); - } - } else if (mask_4x4 & 1) { - if ((mask_4x4 & two_block_mask) == two_block_mask) { - aom_lpf_horizontal_4_dual(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, lfin->mblim, lfin->lim, - lfin->hev_thr); - count = 2; - } else { - aom_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr); - } - } - } - - s += 4 * count; - lfl += step * count; - mask_16x16 >>= step * count; - mask_8x8 >>= step * count; - mask_4x4 >>= step * count; - } -} - -static void highbd_filter_selectively_horiz( - uint16_t *s, int pitch, int plane, int subsampling, uint64_t mask_16x16, - uint64_t mask_8x8, uint64_t mask_4x4, const loop_filter_info_n *lfi_n, - uint8_t *lfl, int bd) { - uint64_t mask; - int count; - const int step = 1 << subsampling; - const unsigned int two_block_mask = subsampling ? 
5 : 3; - - for (mask = mask_16x16 | mask_8x8 | mask_4x4; mask; mask >>= step * count) { - const loop_filter_thresh *lfi = lfi_n->lfthr + *lfl; - // Next block's thresholds. - const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + step); - - count = 1; - if (mask & 1) { - if (mask_16x16 & 1) { - HbdLpfFunc highbd_lpf_horizontal = - plane ? aom_highbd_lpf_horizontal_6 : aom_highbd_lpf_horizontal_14; - - if ((mask_16x16 & two_block_mask) == two_block_mask) { - if (plane) { - aom_highbd_lpf_horizontal_6_dual(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, lfin->mblim, - lfin->lim, lfin->hev_thr, bd); - } else { - aom_highbd_lpf_horizontal_14_dual(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, lfin->mblim, - lfin->lim, lfin->hev_thr, bd); - } - count = 2; - } else { - highbd_lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, - bd); - } - } else if (mask_8x8 & 1) { - HbdLpfFunc highbd_lpf_horizontal = - plane ? aom_highbd_lpf_horizontal_6 : aom_highbd_lpf_horizontal_8; - - if ((mask_8x8 & two_block_mask) == two_block_mask) { - if (plane) { - aom_highbd_lpf_horizontal_6_dual(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, lfin->mblim, - lfin->lim, lfin->hev_thr, bd); - } else { - aom_highbd_lpf_horizontal_8_dual(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, lfin->mblim, - lfin->lim, lfin->hev_thr, bd); - } - count = 2; - } else { - highbd_lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, - bd); - } - } else if (mask_4x4 & 1) { - if ((mask_4x4 & two_block_mask) == two_block_mask) { - aom_highbd_lpf_horizontal_4_dual(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, lfin->mblim, lfin->lim, - lfin->hev_thr, bd); - count = 2; - } else { - aom_highbd_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, bd); - } - } - } - - s += 4 * count; - lfl += step * count; - mask_16x16 >>= step * count; - mask_8x8 >>= step * count; - mask_4x4 >>= step * count; - } -} - -void av1_build_bitmask_vert_info( - AV1_COMMON *const cm, const struct 
macroblockd_plane *const plane_ptr, - int plane) { - const int subsampling_x = plane_ptr->subsampling_x; - const int subsampling_y = plane_ptr->subsampling_y; - const int row_step = (MI_SIZE >> MI_SIZE_LOG2); - const int is_uv = plane > 0; - TX_SIZE tx_size = TX_16X16, prev_tx_size = TX_16X16; - uint8_t level, prev_level = 1; - int skip, prev_skip = 0; - int is_coding_block_border; - - for (int r = 0; (r << MI_SIZE_LOG2) < plane_ptr->dst.height; r += row_step) { - const int mi_row = r << subsampling_y; - const int row = mi_row % MI_SIZE_64X64; - int index = 0; - const int shift = get_index_shift(0, row, &index); - - for (int c = 0; (c << MI_SIZE_LOG2) < plane_ptr->dst.width; - c += (tx_size_wide_unit[TX_64X64] >> subsampling_x)) { - const int mi_col = c << subsampling_x; - LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col); - - for (int col_in_unit = 0; - col_in_unit < (tx_size_wide_unit[TX_64X64] >> subsampling_x);) { - const int x = (c + col_in_unit) << MI_SIZE_LOG2; - if (x >= plane_ptr->dst.width) break; - const int col = col_in_unit << subsampling_x; - const uint64_t mask = ((uint64_t)1 << (shift | col)); - skip = lfm->skip.bits[index] & mask; - is_coding_block_border = lfm->is_vert_border.bits[index] & mask; - switch (plane) { - case 0: level = lfm->lfl_y_ver[row][col]; break; - case 1: level = lfm->lfl_u[row][col]; break; - case 2: level = lfm->lfl_v[row][col]; break; - default: assert(plane >= 0 && plane <= 2); return; - } - for (TX_SIZE ts = TX_4X4; ts <= TX_64X64; ++ts) { - if (is_uv && ts == TX_64X64) continue; - if (lfm->tx_size_ver[is_uv][ts].bits[index] & mask) { - tx_size = ts; - break; - } - } - if ((c + col_in_unit > 0) && (level || prev_level) && - (!prev_skip || !skip || is_coding_block_border)) { - const TX_SIZE min_tx_size = - AOMMIN(TX_16X16, AOMMIN(tx_size, prev_tx_size)); - const int tmp_row = (mi_row | subsampling_y) % MI_SIZE_64X64; - const int tmp_col = (col | subsampling_x) % MI_SIZE_64X64; - const int shift_1 = 
get_index_shift(tmp_col, tmp_row, &index); - const uint64_t mask_1 = ((uint64_t)1 << shift_1); - switch (plane) { - case 0: lfm->left_y[min_tx_size].bits[index] |= mask_1; break; - case 1: lfm->left_u[min_tx_size].bits[index] |= mask_1; break; - case 2: lfm->left_v[min_tx_size].bits[index] |= mask_1; break; - default: assert(plane >= 0 && plane <= 2); return; - } - } - - // update prev info - prev_level = level; - prev_skip = skip; - prev_tx_size = tx_size; - // advance - col_in_unit += tx_size_wide_unit[tx_size]; - } - } - } -} - -void av1_build_bitmask_horz_info( - AV1_COMMON *const cm, const struct macroblockd_plane *const plane_ptr, - int plane) { - const int subsampling_x = plane_ptr->subsampling_x; - const int subsampling_y = plane_ptr->subsampling_y; - const int col_step = (MI_SIZE >> MI_SIZE_LOG2); - const int is_uv = plane > 0; - TX_SIZE tx_size = TX_16X16, prev_tx_size = TX_16X16; - uint8_t level, prev_level = 1; - int skip, prev_skip = 0; - int is_coding_block_border; - - for (int c = 0; (c << MI_SIZE_LOG2) < plane_ptr->dst.width; c += col_step) { - const int mi_col = c << subsampling_x; - const int col = mi_col % MI_SIZE_64X64; - - for (int r = 0; (r << MI_SIZE_LOG2) < plane_ptr->dst.height; - r += (tx_size_high_unit[TX_64X64] >> subsampling_y)) { - const int mi_row = r << subsampling_y; - LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col); - - for (int r_in_unit = 0; - r_in_unit < (tx_size_high_unit[TX_64X64] >> subsampling_y);) { - const int y = (r + r_in_unit) << MI_SIZE_LOG2; - if (y >= plane_ptr->dst.height) break; - const int row = r_in_unit << subsampling_y; - int index = 0; - const int shift = get_index_shift(col, row, &index); - const uint64_t mask = ((uint64_t)1 << shift); - skip = lfm->skip.bits[index] & mask; - is_coding_block_border = lfm->is_horz_border.bits[index] & mask; - switch (plane) { - case 0: level = lfm->lfl_y_hor[row][col]; break; - case 1: level = lfm->lfl_u[row][col]; break; - case 2: level = lfm->lfl_v[row][col]; 
break; - default: assert(plane >= 0 && plane <= 2); return; - } - for (TX_SIZE ts = TX_4X4; ts <= TX_64X64; ++ts) { - if (is_uv && ts == TX_64X64) continue; - if (lfm->tx_size_hor[is_uv][ts].bits[index] & mask) { - tx_size = ts; - break; - } - } - if ((r + r_in_unit > 0) && (level || prev_level) && - (!prev_skip || !skip || is_coding_block_border)) { - const TX_SIZE min_tx_size = - AOMMIN(TX_16X16, AOMMIN(tx_size, prev_tx_size)); - const int tmp_row = (row | subsampling_y) % MI_SIZE_64X64; - const int tmp_col = (mi_col | subsampling_x) % MI_SIZE_64X64; - const int shift_1 = get_index_shift(tmp_col, tmp_row, &index); - const uint64_t mask_1 = ((uint64_t)1 << shift_1); - - switch (plane) { - case 0: lfm->above_y[min_tx_size].bits[index] |= mask_1; break; - case 1: lfm->above_u[min_tx_size].bits[index] |= mask_1; break; - case 2: lfm->above_v[min_tx_size].bits[index] |= mask_1; break; - default: assert(plane >= 0 && plane <= 2); return; - } - } - - // update prev info - prev_level = level; - prev_skip = skip; - prev_tx_size = tx_size; - // advance - r_in_unit += tx_size_high_unit[tx_size]; - } - } - } -} - -void av1_filter_block_plane_bitmask_vert( - AV1_COMMON *const cm, struct macroblockd_plane *const plane_ptr, int pl, - int mi_row, int mi_col) { - struct buf_2d *const dst = &plane_ptr->dst; - uint8_t *const buf0 = dst->buf; - const int ssx = plane_ptr->subsampling_x; - const int ssy = plane_ptr->subsampling_y; - const int mask_cutoff = 0xffff; - const int row_step = 1 << ssy; - const int two_row_step = 2 << ssy; - const int row_stride = dst->stride << MI_SIZE_LOG2; - const int two_row_stride = row_stride << 1; - uint64_t mask_16x16 = 0; - uint64_t mask_8x8 = 0; - uint64_t mask_4x4 = 0; - uint8_t *lfl; - uint8_t *lfl2; - LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col); - assert(lfm); - - // 1. vertical filtering. 
filter two rows at a time - for (int r = 0; - ((mi_row + r) << MI_SIZE_LOG2) < cm->height && r < MI_SIZE_64X64; - r += two_row_step) { - const int row = r | ssy; - const int row_next = row + row_step; - const int col = ssx; - int index = 0; - const int shift = get_index_shift(col, row, &index); - int index_next = 0; - const int shift_next = get_index_shift(col, row_next, &index_next); - switch (pl) { - case 0: - mask_16x16 = lfm->left_y[TX_16X16].bits[index]; - mask_8x8 = lfm->left_y[TX_8X8].bits[index]; - mask_4x4 = lfm->left_y[TX_4X4].bits[index]; - lfl = &lfm->lfl_y_ver[row][col]; - lfl2 = &lfm->lfl_y_ver[row_next][col]; - break; - case 1: - mask_16x16 = lfm->left_u[TX_16X16].bits[index]; - mask_8x8 = lfm->left_u[TX_8X8].bits[index]; - mask_4x4 = lfm->left_u[TX_4X4].bits[index]; - lfl = &lfm->lfl_u[row][col]; - lfl2 = &lfm->lfl_u[row_next][col]; - break; - case 2: - mask_16x16 = lfm->left_v[TX_16X16].bits[index]; - mask_8x8 = lfm->left_v[TX_8X8].bits[index]; - mask_4x4 = lfm->left_v[TX_4X4].bits[index]; - lfl = &lfm->lfl_v[row][col]; - lfl2 = &lfm->lfl_v[row_next][col]; - break; - default: assert(pl >= 0 && pl <= 2); return; - } - uint64_t mask_16x16_0 = (mask_16x16 >> shift) & mask_cutoff; - uint64_t mask_8x8_0 = (mask_8x8 >> shift) & mask_cutoff; - uint64_t mask_4x4_0 = (mask_4x4 >> shift) & mask_cutoff; - uint64_t mask_16x16_1 = (mask_16x16 >> shift_next) & mask_cutoff; - uint64_t mask_8x8_1 = (mask_8x8 >> shift_next) & mask_cutoff; - uint64_t mask_4x4_1 = (mask_4x4 >> shift_next) & mask_cutoff; - - if (cm->seq_params.use_highbitdepth) - highbd_filter_selectively_vert_row2( - ssx, CONVERT_TO_SHORTPTR(dst->buf), dst->stride, pl, mask_16x16_0, - mask_8x8_0, mask_4x4_0, mask_16x16_1, mask_8x8_1, mask_4x4_1, - &cm->lf_info, lfl, lfl2, (int)cm->seq_params.bit_depth); - else - filter_selectively_vert_row2( - ssx, dst->buf, dst->stride, pl, mask_16x16_0, mask_8x8_0, mask_4x4_0, - mask_16x16_1, mask_8x8_1, mask_4x4_1, &cm->lf_info, lfl, lfl2); - dst->buf += 
two_row_stride; - } - // reset buf pointer for horizontal filtering - dst->buf = buf0; -} - -void av1_filter_block_plane_bitmask_horz( - AV1_COMMON *const cm, struct macroblockd_plane *const plane_ptr, int pl, - int mi_row, int mi_col) { - struct buf_2d *const dst = &plane_ptr->dst; - uint8_t *const buf0 = dst->buf; - const int ssx = plane_ptr->subsampling_x; - const int ssy = plane_ptr->subsampling_y; - const int mask_cutoff = 0xffff; - const int row_step = 1 << ssy; - const int row_stride = dst->stride << MI_SIZE_LOG2; - uint64_t mask_16x16 = 0; - uint64_t mask_8x8 = 0; - uint64_t mask_4x4 = 0; - uint8_t *lfl; - LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col); - assert(lfm); - for (int r = 0; - ((mi_row + r) << MI_SIZE_LOG2) < cm->height && r < MI_SIZE_64X64; - r += row_step) { - if (mi_row + r == 0) { - dst->buf += row_stride; - continue; - } - const int row = r | ssy; - const int col = ssx; - int index = 0; - const int shift = get_index_shift(col, row, &index); - switch (pl) { - case 0: - mask_16x16 = lfm->above_y[TX_16X16].bits[index]; - mask_8x8 = lfm->above_y[TX_8X8].bits[index]; - mask_4x4 = lfm->above_y[TX_4X4].bits[index]; - lfl = &lfm->lfl_y_hor[row][col]; - break; - case 1: - mask_16x16 = lfm->above_u[TX_16X16].bits[index]; - mask_8x8 = lfm->above_u[TX_8X8].bits[index]; - mask_4x4 = lfm->above_u[TX_4X4].bits[index]; - lfl = &lfm->lfl_u[row][col]; - break; - case 2: - mask_16x16 = lfm->above_v[TX_16X16].bits[index]; - mask_8x8 = lfm->above_v[TX_8X8].bits[index]; - mask_4x4 = lfm->above_v[TX_4X4].bits[index]; - lfl = &lfm->lfl_v[row][col]; - break; - default: assert(pl >= 0 && pl <= 2); return; - } - mask_16x16 = (mask_16x16 >> shift) & mask_cutoff; - mask_8x8 = (mask_8x8 >> shift) & mask_cutoff; - mask_4x4 = (mask_4x4 >> shift) & mask_cutoff; - - if (cm->seq_params.use_highbitdepth) - highbd_filter_selectively_horiz( - CONVERT_TO_SHORTPTR(dst->buf), dst->stride, pl, ssx, mask_16x16, - mask_8x8, mask_4x4, &cm->lf_info, lfl, 
(int)cm->seq_params.bit_depth); - else - filter_selectively_horiz(dst->buf, dst->stride, pl, ssx, mask_16x16, - mask_8x8, mask_4x4, &cm->lf_info, lfl); - dst->buf += row_stride; - } - // reset buf pointer for next block - dst->buf = buf0; -} - -void av1_filter_block_plane_ver(AV1_COMMON *const cm, - struct macroblockd_plane *const plane_ptr, - int pl, int mi_row, int mi_col) { - struct buf_2d *const dst = &plane_ptr->dst; - int r, c; - const int ssx = plane_ptr->subsampling_x; - const int ssy = plane_ptr->subsampling_y; - const int mask_cutoff = 0xffff; - const int single_step = 1 << ssy; - const int r_step = 2 << ssy; - uint64_t mask_16x16 = 0; - uint64_t mask_8x8 = 0; - uint64_t mask_4x4 = 0; - uint8_t *lfl; - uint8_t *lfl2; - - // filter two rows at a time - for (r = 0; r < cm->seq_params.mib_size && - ((mi_row + r) << MI_SIZE_LOG2 < cm->height); - r += r_step) { - for (c = 0; c < cm->seq_params.mib_size && - ((mi_col + c) << MI_SIZE_LOG2 < cm->width); - c += MI_SIZE_64X64) { - dst->buf += ((c << MI_SIZE_LOG2) >> ssx); - LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row + r, mi_col + c); - assert(lfm); - const int row = ((mi_row + r) | ssy) % MI_SIZE_64X64; - const int col = ((mi_col + c) | ssx) % MI_SIZE_64X64; - int index = 0; - const int shift = get_index_shift(col, row, &index); - // current and next row should belong to the same mask_idx and index - // next row's shift - const int row_next = row + single_step; - int index_next = 0; - const int shift_next = get_index_shift(col, row_next, &index_next); - switch (pl) { - case 0: - mask_16x16 = lfm->left_y[TX_16X16].bits[index]; - mask_8x8 = lfm->left_y[TX_8X8].bits[index]; - mask_4x4 = lfm->left_y[TX_4X4].bits[index]; - lfl = &lfm->lfl_y_ver[row][col]; - lfl2 = &lfm->lfl_y_ver[row_next][col]; - break; - case 1: - mask_16x16 = lfm->left_u[TX_16X16].bits[index]; - mask_8x8 = lfm->left_u[TX_8X8].bits[index]; - mask_4x4 = lfm->left_u[TX_4X4].bits[index]; - lfl = &lfm->lfl_u[row][col]; - lfl2 = 
&lfm->lfl_u[row_next][col]; - break; - case 2: - mask_16x16 = lfm->left_v[TX_16X16].bits[index]; - mask_8x8 = lfm->left_v[TX_8X8].bits[index]; - mask_4x4 = lfm->left_v[TX_4X4].bits[index]; - lfl = &lfm->lfl_v[row][col]; - lfl2 = &lfm->lfl_v[row_next][col]; - break; - default: assert(pl >= 0 && pl <= 2); return; - } - uint64_t mask_16x16_0 = (mask_16x16 >> shift) & mask_cutoff; - uint64_t mask_8x8_0 = (mask_8x8 >> shift) & mask_cutoff; - uint64_t mask_4x4_0 = (mask_4x4 >> shift) & mask_cutoff; - uint64_t mask_16x16_1 = (mask_16x16 >> shift_next) & mask_cutoff; - uint64_t mask_8x8_1 = (mask_8x8 >> shift_next) & mask_cutoff; - uint64_t mask_4x4_1 = (mask_4x4 >> shift_next) & mask_cutoff; - - if (cm->seq_params.use_highbitdepth) - highbd_filter_selectively_vert_row2( - ssx, CONVERT_TO_SHORTPTR(dst->buf), dst->stride, pl, mask_16x16_0, - mask_8x8_0, mask_4x4_0, mask_16x16_1, mask_8x8_1, mask_4x4_1, - &cm->lf_info, lfl, lfl2, (int)cm->seq_params.bit_depth); - else - filter_selectively_vert_row2(ssx, dst->buf, dst->stride, pl, - mask_16x16_0, mask_8x8_0, mask_4x4_0, - mask_16x16_1, mask_8x8_1, mask_4x4_1, - &cm->lf_info, lfl, lfl2); - dst->buf -= ((c << MI_SIZE_LOG2) >> ssx); - } - dst->buf += 2 * MI_SIZE * dst->stride; - } -} - -void av1_filter_block_plane_hor(AV1_COMMON *const cm, - struct macroblockd_plane *const plane_ptr, - int pl, int mi_row, int mi_col) { - struct buf_2d *const dst = &plane_ptr->dst; - int r, c; - const int ssx = plane_ptr->subsampling_x; - const int ssy = plane_ptr->subsampling_y; - const int mask_cutoff = 0xffff; - const int r_step = 1 << ssy; - uint64_t mask_16x16 = 0; - uint64_t mask_8x8 = 0; - uint64_t mask_4x4 = 0; - uint8_t *lfl; - - for (r = 0; r < cm->seq_params.mib_size && - ((mi_row + r) << MI_SIZE_LOG2 < cm->height); - r += r_step) { - for (c = 0; c < cm->seq_params.mib_size && - ((mi_col + c) << MI_SIZE_LOG2 < cm->width); - c += MI_SIZE_64X64) { - if (mi_row + r == 0) continue; - - dst->buf += ((c << MI_SIZE_LOG2) >> ssx); - 
LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row + r, mi_col + c); - assert(lfm); - const int row = ((mi_row + r) | ssy) % MI_SIZE_64X64; - const int col = ((mi_col + c) | ssx) % MI_SIZE_64X64; - int index = 0; - const int shift = get_index_shift(col, row, &index); - switch (pl) { - case 0: - mask_16x16 = lfm->above_y[TX_16X16].bits[index]; - mask_8x8 = lfm->above_y[TX_8X8].bits[index]; - mask_4x4 = lfm->above_y[TX_4X4].bits[index]; - lfl = &lfm->lfl_y_hor[row][col]; - break; - case 1: - mask_16x16 = lfm->above_u[TX_16X16].bits[index]; - mask_8x8 = lfm->above_u[TX_8X8].bits[index]; - mask_4x4 = lfm->above_u[TX_4X4].bits[index]; - lfl = &lfm->lfl_u[row][col]; - break; - case 2: - mask_16x16 = lfm->above_v[TX_16X16].bits[index]; - mask_8x8 = lfm->above_v[TX_8X8].bits[index]; - mask_4x4 = lfm->above_v[TX_4X4].bits[index]; - lfl = &lfm->lfl_v[row][col]; - break; - default: assert(pl >= 0 && pl <= 2); return; - } - mask_16x16 = (mask_16x16 >> shift) & mask_cutoff; - mask_8x8 = (mask_8x8 >> shift) & mask_cutoff; - mask_4x4 = (mask_4x4 >> shift) & mask_cutoff; - - if (cm->seq_params.use_highbitdepth) - highbd_filter_selectively_horiz(CONVERT_TO_SHORTPTR(dst->buf), - dst->stride, pl, ssx, mask_16x16, - mask_8x8, mask_4x4, &cm->lf_info, lfl, - (int)cm->seq_params.bit_depth); - else - filter_selectively_horiz(dst->buf, dst->stride, pl, ssx, mask_16x16, - mask_8x8, mask_4x4, &cm->lf_info, lfl); - dst->buf -= ((c << MI_SIZE_LOG2) >> ssx); - } - dst->buf += MI_SIZE * dst->stride; - } -} -#endif // LOOP_FILTER_BITMASK - static TX_SIZE get_transform_size(const MACROBLOCKD *const xd, const MB_MODE_INFO *const mbmi, const EDGE_DIR edge_dir, const int mi_row, @@ -1914,7 +212,7 @@ static TX_SIZE get_transform_size(const MACROBLOCKD *const xd, tx_size = mb_tx_size; } - // since in case of chrominance or non-square transorm need to convert + // since in case of chrominance or non-square transform need to convert // transform size into transform size in particular direction. 
// for vertical edge, filter direction is horizontal, for horizontal // edge, filter direction is vertical. @@ -1933,7 +231,7 @@ typedef struct AV1_DEBLOCKING_PARAMETERS { } AV1_DEBLOCKING_PARAMETERS; // Return TX_SIZE from get_transform_size(), so it is plane and direction -// awared +// aware static TX_SIZE set_lpf_parameters( AV1_DEBLOCKING_PARAMETERS *const params, const ptrdiff_t mode_step, const AV1_COMMON *const cm, const MACROBLOCKD *const xd, @@ -1958,7 +256,8 @@ static TX_SIZE set_lpf_parameters( // and mi_col should be odd number for chroma plane. const int mi_row = scale_vert | ((y << scale_vert) >> MI_SIZE_LOG2); const int mi_col = scale_horz | ((x << scale_horz) >> MI_SIZE_LOG2); - MB_MODE_INFO **mi = cm->mi_grid_visible + mi_row * cm->mi_stride + mi_col; + MB_MODE_INFO **mi = + cm->mi_params.mi_grid_base + mi_row * cm->mi_params.mi_stride + mi_col; const MB_MODE_INFO *mbmi = mi[0]; // If current mbmi is not correctly setup, return an invalid value to stop // filtering. One example is that if this tile is not coded, then its mbmi @@ -1979,7 +278,7 @@ static TX_SIZE set_lpf_parameters( // prepare outer edge parameters. deblock the edge if it's an edge of a TU { const uint32_t curr_level = - get_filter_level(cm, &cm->lf_info, edge_dir, plane, mbmi); + av1_get_filter_level(cm, &cm->lf_info, edge_dir, plane, mbmi); const int curr_skipped = mbmi->skip && is_inter_block(mbmi); uint32_t level = curr_level; if (coord) { @@ -1994,12 +293,13 @@ static TX_SIZE set_lpf_parameters( xd, mi_prev, edge_dir, pv_row, pv_col, plane, plane_ptr); const uint32_t pv_lvl = - get_filter_level(cm, &cm->lf_info, edge_dir, plane, mi_prev); + av1_get_filter_level(cm, &cm->lf_info, edge_dir, plane, mi_prev); const int pv_skip = mi_prev->skip && is_inter_block(mi_prev); const BLOCK_SIZE bsize = get_plane_block_size(mbmi->sb_type, plane_ptr->subsampling_x, plane_ptr->subsampling_y); + assert(bsize < BLOCK_SIZES_ALL); const int prediction_masks = edge_dir == VERT_EDGE ? 
block_size_wide[bsize] - 1 : block_size_high[bsize] - 1; @@ -2047,21 +347,18 @@ void av1_filter_block_plane_vert(const AV1_COMMON *const cm, const MACROBLOCKD *const xd, const int plane, const MACROBLOCKD_PLANE *const plane_ptr, const uint32_t mi_row, const uint32_t mi_col) { - const int row_step = MI_SIZE >> MI_SIZE_LOG2; const uint32_t scale_horz = plane_ptr->subsampling_x; const uint32_t scale_vert = plane_ptr->subsampling_y; uint8_t *const dst_ptr = plane_ptr->dst.buf; const int dst_stride = plane_ptr->dst.stride; const int y_range = (MAX_MIB_SIZE >> scale_vert); const int x_range = (MAX_MIB_SIZE >> scale_horz); - const int use_highbitdepth = cm->seq_params.use_highbitdepth; - const aom_bit_depth_t bit_depth = cm->seq_params.bit_depth; - for (int y = 0; y < y_range; y += row_step) { + for (int y = 0; y < y_range; y++) { uint8_t *p = dst_ptr + y * MI_SIZE * dst_stride; for (int x = 0; x < x_range;) { // inner loop always filter vertical edges in a MI block. If MI size // is 8x8, it will filter the vertical edge aligned with a 8x8 block. 
- // If 4x4 trasnform is used, it will then filter the internal edge + // If 4x4 transform is used, it will then filter the internal edge // aligned with a 4x4 block const uint32_t curr_x = ((mi_col * MI_SIZE) >> scale_horz) + x * MI_SIZE; const uint32_t curr_y = ((mi_row * MI_SIZE) >> scale_vert) + y * MI_SIZE; @@ -2078,6 +375,9 @@ void av1_filter_block_plane_vert(const AV1_COMMON *const cm, tx_size = TX_4X4; } +#if CONFIG_AV1_HIGHBITDEPTH + const int use_highbitdepth = cm->seq_params.use_highbitdepth; + const aom_bit_depth_t bit_depth = cm->seq_params.bit_depth; switch (params.filter_length) { // apply 4-tap filtering case 4: @@ -2122,6 +422,32 @@ void av1_filter_block_plane_vert(const AV1_COMMON *const cm, // no filtering default: break; } +#else + switch (params.filter_length) { + // apply 4-tap filtering + case 4: + aom_lpf_vertical_4(p, dst_stride, params.mblim, params.lim, + params.hev_thr); + break; + case 6: // apply 6-tap filter for chroma plane only + assert(plane != 0); + aom_lpf_vertical_6(p, dst_stride, params.mblim, params.lim, + params.hev_thr); + break; + // apply 8-tap filtering + case 8: + aom_lpf_vertical_8(p, dst_stride, params.mblim, params.lim, + params.hev_thr); + break; + // apply 14-tap filtering + case 14: + aom_lpf_vertical_14(p, dst_stride, params.mblim, params.lim, + params.hev_thr); + break; + // no filtering + default: break; + } +#endif // CONFIG_AV1_HIGHBITDEPTH // advance the destination pointer advance_units = tx_size_wide_unit[tx_size]; x += advance_units; @@ -2134,21 +460,18 @@ void av1_filter_block_plane_horz(const AV1_COMMON *const cm, const MACROBLOCKD *const xd, const int plane, const MACROBLOCKD_PLANE *const plane_ptr, const uint32_t mi_row, const uint32_t mi_col) { - const int col_step = MI_SIZE >> MI_SIZE_LOG2; const uint32_t scale_horz = plane_ptr->subsampling_x; const uint32_t scale_vert = plane_ptr->subsampling_y; uint8_t *const dst_ptr = plane_ptr->dst.buf; const int dst_stride = plane_ptr->dst.stride; const int 
y_range = (MAX_MIB_SIZE >> scale_vert); const int x_range = (MAX_MIB_SIZE >> scale_horz); - const int use_highbitdepth = cm->seq_params.use_highbitdepth; - const aom_bit_depth_t bit_depth = cm->seq_params.bit_depth; - for (int x = 0; x < x_range; x += col_step) { + for (int x = 0; x < x_range; x++) { uint8_t *p = dst_ptr + x * MI_SIZE; for (int y = 0; y < y_range;) { // inner loop always filter vertical edges in a MI block. If MI size // is 8x8, it will first filter the vertical edge aligned with a 8x8 - // block. If 4x4 trasnform is used, it will then filter the internal + // block. If 4x4 transform is used, it will then filter the internal // edge aligned with a 4x4 block const uint32_t curr_x = ((mi_col * MI_SIZE) >> scale_horz) + x * MI_SIZE; const uint32_t curr_y = ((mi_row * MI_SIZE) >> scale_vert) + y * MI_SIZE; @@ -2157,14 +480,17 @@ void av1_filter_block_plane_horz(const AV1_COMMON *const cm, AV1_DEBLOCKING_PARAMETERS params; memset(¶ms, 0, sizeof(params)); - tx_size = - set_lpf_parameters(¶ms, (cm->mi_stride << scale_vert), cm, xd, - HORZ_EDGE, curr_x, curr_y, plane, plane_ptr); + tx_size = set_lpf_parameters( + ¶ms, (cm->mi_params.mi_stride << scale_vert), cm, xd, HORZ_EDGE, + curr_x, curr_y, plane, plane_ptr); if (tx_size == TX_INVALID) { params.filter_length = 0; tx_size = TX_4X4; } +#if CONFIG_AV1_HIGHBITDEPTH + const int use_highbitdepth = cm->seq_params.use_highbitdepth; + const aom_bit_depth_t bit_depth = cm->seq_params.bit_depth; switch (params.filter_length) { // apply 4-tap filtering case 4: @@ -2210,6 +536,117 @@ void av1_filter_block_plane_horz(const AV1_COMMON *const cm, // no filtering default: break; } +#else + switch (params.filter_length) { + // apply 4-tap filtering + case 4: + aom_lpf_horizontal_4(p, dst_stride, params.mblim, params.lim, + params.hev_thr); + break; + // apply 6-tap filtering + case 6: + assert(plane != 0); + aom_lpf_horizontal_6(p, dst_stride, params.mblim, params.lim, + params.hev_thr); + break; + // apply 8-tap 
filtering + case 8: + aom_lpf_horizontal_8(p, dst_stride, params.mblim, params.lim, + params.hev_thr); + break; + // apply 14-tap filtering + case 14: + aom_lpf_horizontal_14(p, dst_stride, params.mblim, params.lim, + params.hev_thr); + break; + // no filtering + default: break; + } +#endif // CONFIG_AV1_HIGHBITDEPTH + + // advance the destination pointer + advance_units = tx_size_high_unit[tx_size]; + y += advance_units; + p += advance_units * dst_stride * MI_SIZE; + } + } +} + +void av1_filter_block_plane_vert_test(const AV1_COMMON *const cm, + const MACROBLOCKD *const xd, + const int plane, + const MACROBLOCKD_PLANE *const plane_ptr, + const uint32_t mi_row, + const uint32_t mi_col) { + const uint32_t scale_horz = plane_ptr->subsampling_x; + const uint32_t scale_vert = plane_ptr->subsampling_y; + uint8_t *const dst_ptr = plane_ptr->dst.buf; + const int dst_stride = plane_ptr->dst.stride; + const int y_range = cm->mi_params.mi_rows >> scale_vert; + const int x_range = cm->mi_params.mi_cols >> scale_horz; + for (int y = 0; y < y_range; y++) { + uint8_t *p = dst_ptr + y * MI_SIZE * dst_stride; + for (int x = 0; x < x_range;) { + // inner loop always filter vertical edges in a MI block. If MI size + // is 8x8, it will filter the vertical edge aligned with a 8x8 block. 
+ // If 4x4 transform is used, it will then filter the internal edge + // aligned with a 4x4 block + const uint32_t curr_x = ((mi_col * MI_SIZE) >> scale_horz) + x * MI_SIZE; + const uint32_t curr_y = ((mi_row * MI_SIZE) >> scale_vert) + y * MI_SIZE; + uint32_t advance_units; + TX_SIZE tx_size; + AV1_DEBLOCKING_PARAMETERS params; + memset(¶ms, 0, sizeof(params)); + + tx_size = + set_lpf_parameters(¶ms, ((ptrdiff_t)1 << scale_horz), cm, xd, + VERT_EDGE, curr_x, curr_y, plane, plane_ptr); + if (tx_size == TX_INVALID) { + params.filter_length = 0; + tx_size = TX_4X4; + } + + // advance the destination pointer + advance_units = tx_size_wide_unit[tx_size]; + x += advance_units; + p += advance_units * MI_SIZE; + } + } +} + +void av1_filter_block_plane_horz_test(const AV1_COMMON *const cm, + const MACROBLOCKD *const xd, + const int plane, + const MACROBLOCKD_PLANE *const plane_ptr, + const uint32_t mi_row, + const uint32_t mi_col) { + const uint32_t scale_horz = plane_ptr->subsampling_x; + const uint32_t scale_vert = plane_ptr->subsampling_y; + uint8_t *const dst_ptr = plane_ptr->dst.buf; + const int dst_stride = plane_ptr->dst.stride; + const int y_range = cm->mi_params.mi_rows >> scale_vert; + const int x_range = cm->mi_params.mi_cols >> scale_horz; + for (int x = 0; x < x_range; x++) { + uint8_t *p = dst_ptr + x * MI_SIZE; + for (int y = 0; y < y_range;) { + // inner loop always filter vertical edges in a MI block. If MI size + // is 8x8, it will first filter the vertical edge aligned with a 8x8 + // block. 
If 4x4 transform is used, it will then filter the internal + // edge aligned with a 4x4 block + const uint32_t curr_x = ((mi_col * MI_SIZE) >> scale_horz) + x * MI_SIZE; + const uint32_t curr_y = ((mi_row * MI_SIZE) >> scale_vert) + y * MI_SIZE; + uint32_t advance_units; + TX_SIZE tx_size; + AV1_DEBLOCKING_PARAMETERS params; + memset(¶ms, 0, sizeof(params)); + + tx_size = set_lpf_parameters( + ¶ms, (cm->mi_params.mi_stride << scale_vert), cm, xd, HORZ_EDGE, + curr_x, curr_y, plane, plane_ptr); + if (tx_size == TX_INVALID) { + params.filter_length = 0; + tx_size = TX_4X4; + } // advance the destination pointer advance_units = tx_size_high_unit[tx_size]; @@ -2221,18 +658,19 @@ void av1_filter_block_plane_horz(const AV1_COMMON *const cm, static void loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer, AV1_COMMON *cm, MACROBLOCKD *xd, int start, int stop, -#if LOOP_FILTER_BITMASK +#if CONFIG_LPF_MASK int is_decoding, #endif int plane_start, int plane_end) { struct macroblockd_plane *pd = xd->plane; const int col_start = 0; - const int col_end = cm->mi_cols; + const int col_end = cm->mi_params.mi_cols; int mi_row, mi_col; int plane; -#if LOOP_FILTER_BITMASK +#if CONFIG_LPF_MASK if (is_decoding) { + cm->is_decoding = is_decoding; for (plane = plane_start; plane < plane_end; plane++) { if (plane == 0 && !(cm->lf.filter_level[0]) && !(cm->lf.filter_level[1])) break; @@ -2243,24 +681,25 @@ static void loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer, AV1_COMMON *cm, av1_setup_dst_planes(pd, cm->seq_params.sb_size, frame_buffer, 0, 0, plane, plane + 1); + av1_build_bitmask_vert_info(cm, &pd[plane], plane); av1_build_bitmask_horz_info(cm, &pd[plane], plane); // apply loop filtering which only goes through buffer once for (mi_row = start; mi_row < stop; mi_row += MI_SIZE_64X64) { for (mi_col = col_start; mi_col < col_end; mi_col += MI_SIZE_64X64) { - av1_setup_dst_planes(pd, MI_SIZE_64X64, frame_buffer, mi_row, mi_col, + av1_setup_dst_planes(pd, BLOCK_64X64, frame_buffer, 
mi_row, mi_col, plane, plane + 1); av1_filter_block_plane_bitmask_vert(cm, &pd[plane], plane, mi_row, mi_col); if (mi_col - MI_SIZE_64X64 >= 0) { - av1_setup_dst_planes(pd, MI_SIZE_64X64, frame_buffer, mi_row, + av1_setup_dst_planes(pd, BLOCK_64X64, frame_buffer, mi_row, mi_col - MI_SIZE_64X64, plane, plane + 1); av1_filter_block_plane_bitmask_horz(cm, &pd[plane], plane, mi_row, mi_col - MI_SIZE_64X64); } } - av1_setup_dst_planes(pd, MI_SIZE_64X64, frame_buffer, mi_row, + av1_setup_dst_planes(pd, BLOCK_64X64, frame_buffer, mi_row, mi_col - MI_SIZE_64X64, plane, plane + 1); av1_filter_block_plane_bitmask_horz(cm, &pd[plane], plane, mi_row, mi_col - MI_SIZE_64X64); @@ -2278,31 +717,6 @@ static void loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer, AV1_COMMON *cm, else if (plane == 2 && !(cm->lf.filter_level_v)) continue; -#if LOOP_FILTER_BITMASK - // filter all vertical edges every superblock (could be 128x128 or 64x64) - for (mi_row = start; mi_row < stop; mi_row += cm->seq_params.mib_size) { - for (mi_col = col_start; mi_col < col_end; - mi_col += cm->seq_params.mib_size) { - av1_setup_dst_planes(pd, cm->seq_params.sb_size, frame_buffer, mi_row, - mi_col, plane, plane + 1); - - av1_setup_bitmask(cm, mi_row, mi_col, plane, pd[plane].subsampling_x, - pd[plane].subsampling_y, stop, col_end); - av1_filter_block_plane_ver(cm, &pd[plane], plane, mi_row, mi_col); - } - } - - // filter all horizontal edges every superblock - for (mi_row = start; mi_row < stop; mi_row += cm->seq_params.mib_size) { - for (mi_col = col_start; mi_col < col_end; - mi_col += cm->seq_params.mib_size) { - av1_setup_dst_planes(pd, cm->seq_params.sb_size, frame_buffer, mi_row, - mi_col, plane, plane + 1); - - av1_filter_block_plane_hor(cm, &pd[plane], plane, mi_row, mi_col); - } - } -#else if (cm->lf.combine_vert_horz_lf) { // filter all vertical and horizontal edges in every 128x128 super block for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) { @@ -2348,29 +762,28 @@ static void 
loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer, AV1_COMMON *cm, } } } -#endif // LOOP_FILTER_BITMASK } } void av1_loop_filter_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, MACROBLOCKD *xd, -#if LOOP_FILTER_BITMASK +#if CONFIG_LPF_MASK int is_decoding, #endif int plane_start, int plane_end, int partial_frame) { int start_mi_row, end_mi_row, mi_rows_to_filter; start_mi_row = 0; - mi_rows_to_filter = cm->mi_rows; - if (partial_frame && cm->mi_rows > 8) { - start_mi_row = cm->mi_rows >> 1; + mi_rows_to_filter = cm->mi_params.mi_rows; + if (partial_frame && cm->mi_params.mi_rows > 8) { + start_mi_row = cm->mi_params.mi_rows >> 1; start_mi_row &= 0xfffffff8; - mi_rows_to_filter = AOMMAX(cm->mi_rows / 8, 8); + mi_rows_to_filter = AOMMAX(cm->mi_params.mi_rows / 8, 8); } end_mi_row = start_mi_row + mi_rows_to_filter; av1_loop_filter_frame_init(cm, plane_start, plane_end); loop_filter_rows(frame, cm, xd, start_mi_row, end_mi_row, -#if LOOP_FILTER_BITMASK +#if CONFIG_LPF_MASK is_decoding, #endif plane_start, plane_end); diff --git a/media/libaom/src/av1/common/av1_loopfilter.h b/media/libaom/src/av1/common/av1_loopfilter.h index 80ac61178..ce26d1647 100644 --- a/media/libaom/src/av1/common/av1_loopfilter.h +++ b/media/libaom/src/av1/common/av1_loopfilter.h @@ -33,11 +33,12 @@ enum lf_path { LF_PATH_SLOW, }; -#if LOOP_FILTER_BITMASK +enum { VERT_EDGE = 0, HORZ_EDGE = 1, NUM_EDGE_DIRS } UENUM1BYTE(EDGE_DIR); typedef struct { uint64_t bits[4]; } FilterMask; +#if CONFIG_LPF_MASK // This structure holds bit masks for all 4x4 blocks in a 64x64 region. // Each 1 bit represents a position in which we want to apply the loop filter. 
// For Y plane, 4x4 in 64x64 requires 16x16 = 256 bit, therefore we use 4 @@ -61,10 +62,12 @@ typedef struct { uint8_t lfl_y_ver[MI_SIZE_64X64][MI_SIZE_64X64]; // U plane filter level - uint8_t lfl_u[MI_SIZE_64X64][MI_SIZE_64X64]; + uint8_t lfl_u_ver[MI_SIZE_64X64][MI_SIZE_64X64]; + uint8_t lfl_u_hor[MI_SIZE_64X64][MI_SIZE_64X64]; // V plane filter level - uint8_t lfl_v[MI_SIZE_64X64][MI_SIZE_64X64]; + uint8_t lfl_v_ver[MI_SIZE_64X64][MI_SIZE_64X64]; + uint8_t lfl_v_hor[MI_SIZE_64X64][MI_SIZE_64X64]; // other info FilterMask skip; @@ -74,7 +77,7 @@ typedef struct { FilterMask tx_size_ver[2][5]; FilterMask tx_size_hor[2][5]; } LoopFilterMask; -#endif // LOOP_FILTER_BITMASK +#endif // CONFIG_LPF_MASK struct loopfilter { int filter_level[2]; @@ -95,11 +98,11 @@ struct loopfilter { int combine_vert_horz_lf; -#if LOOP_FILTER_BITMASK +#if CONFIG_LPF_MASK LoopFilterMask *lfm; size_t lfm_num; int lfm_stride; -#endif // LOOP_FILTER_BITMASK +#endif // CONFIG_LPF_MASK }; // Need to align this structure so when it is declared and @@ -125,13 +128,13 @@ void av1_loop_filter_init(struct AV1Common *cm); void av1_loop_filter_frame_init(struct AV1Common *cm, int plane_start, int plane_end); -#if LOOP_FILTER_BITMASK +#if CONFIG_LPF_MASK void av1_loop_filter_frame(YV12_BUFFER_CONFIG *frame, struct AV1Common *cm, - struct macroblockd *mbd, int is_decoding, + struct macroblockd *xd, int is_decoding, int plane_start, int plane_end, int partial_frame); #else void av1_loop_filter_frame(YV12_BUFFER_CONFIG *frame, struct AV1Common *cm, - struct macroblockd *mbd, int plane_start, + struct macroblockd *xd, int plane_start, int plane_end, int partial_frame); #endif @@ -154,14 +157,10 @@ typedef struct LoopFilterWorkerData { MACROBLOCKD *xd; } LFWorkerData; -uint8_t get_filter_level(const struct AV1Common *cm, - const loop_filter_info_n *lfi_n, const int dir_idx, - int plane, const MB_MODE_INFO *mbmi); -#if LOOP_FILTER_BITMASK -void av1_setup_bitmask(struct AV1Common *const cm, int mi_row, int 
mi_col, - int plane, int subsampling_x, int subsampling_y, - int row_end, int col_end); - +uint8_t av1_get_filter_level(const struct AV1Common *cm, + const loop_filter_info_n *lfi_n, const int dir_idx, + int plane, const MB_MODE_INFO *mbmi); +#if CONFIG_LPF_MASK void av1_filter_block_plane_ver(struct AV1Common *const cm, struct macroblockd_plane *const plane_ptr, int pl, int mi_row, int mi_col); @@ -169,56 +168,38 @@ void av1_filter_block_plane_ver(struct AV1Common *const cm, void av1_filter_block_plane_hor(struct AV1Common *const cm, struct macroblockd_plane *const plane, int pl, int mi_row, int mi_col); -LoopFilterMask *get_loop_filter_mask(const struct AV1Common *const cm, - int mi_row, int mi_col); -int get_index_shift(int mi_col, int mi_row, int *index); - -static const FilterMask left_txform_mask[TX_SIZES] = { - { { 0x0000000000000001ULL, // TX_4X4, - 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } }, - - { { 0x0000000000010001ULL, // TX_8X8, - 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } }, - - { { 0x0001000100010001ULL, // TX_16X16, - 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } }, - { { 0x0001000100010001ULL, // TX_32X32, - 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } }, +int get_index_shift(int mi_col, int mi_row, int *index); - { { 0x0001000100010001ULL, // TX_64X64, - 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL } }, -}; +void av1_build_bitmask_vert_info( + struct AV1Common *const cm, const struct macroblockd_plane *const plane_ptr, + int plane); -static const uint64_t above_txform_mask[2][TX_SIZES] = { - { - 0x0000000000000001ULL, // TX_4X4 - 0x0000000000000003ULL, // TX_8X8 - 0x000000000000000fULL, // TX_16X16 - 0x00000000000000ffULL, // TX_32X32 - 0x000000000000ffffULL, // TX_64X64 - }, - { - 0x0000000000000001ULL, // TX_4X4 - 0x0000000000000005ULL, // TX_8X8 - 0x0000000000000055ULL, // TX_16X16 - 0x0000000000005555ULL, // TX_32X32 
- 0x0000000055555555ULL, // TX_64X64 - }, -}; +void av1_build_bitmask_horz_info( + struct AV1Common *const cm, const struct macroblockd_plane *const plane_ptr, + int plane); -extern const int mask_id_table_tx_4x4[BLOCK_SIZES_ALL]; +void av1_filter_block_plane_bitmask_vert( + struct AV1Common *const cm, struct macroblockd_plane *const plane_ptr, + int pl, int mi_row, int mi_col); -extern const int mask_id_table_tx_8x8[BLOCK_SIZES_ALL]; +void av1_filter_block_plane_bitmask_horz( + struct AV1Common *const cm, struct macroblockd_plane *const plane_ptr, + int pl, int mi_row, int mi_col); -extern const int mask_id_table_tx_16x16[BLOCK_SIZES_ALL]; +void av1_store_bitmask_univariant_tx(struct AV1Common *cm, int mi_row, + int mi_col, BLOCK_SIZE bsize, + MB_MODE_INFO *mbmi); -extern const int mask_id_table_tx_32x32[BLOCK_SIZES_ALL]; +void av1_store_bitmask_other_info(struct AV1Common *cm, int mi_row, int mi_col, + BLOCK_SIZE bsize, MB_MODE_INFO *mbmi, + int is_horz_coding_block_border, + int is_vert_coding_block_border); -extern const FilterMask left_mask_univariant_reordered[67]; - -extern const FilterMask above_mask_univariant_reordered[67]; -#endif +void av1_store_bitmask_vartx(struct AV1Common *cm, int mi_row, int mi_col, + BLOCK_SIZE bsize, TX_SIZE tx_size, + MB_MODE_INFO *mbmi); +#endif // CONFIG_LPF_MASK #ifdef __cplusplus } // extern "C" diff --git a/media/libaom/src/av1/common/av1_rtcd_defs.pl b/media/libaom/src/av1/common/av1_rtcd_defs.pl index dee1f1c79..296c6c572 100644 --- a/media/libaom/src/av1/common/av1_rtcd_defs.pl +++ b/media/libaom/src/av1/common/av1_rtcd_defs.pl @@ -33,21 +33,46 @@ struct txfm_param; struct aom_variance_vtable; struct search_site_config; struct yv12_buffer_config; +struct NN_CONFIG; +typedef struct NN_CONFIG NN_CONFIG; + +enum { NONE, RELU, SOFTSIGN, SIGMOID } UENUM1BYTE(ACTIVATION); +#if CONFIG_NN_V2 +enum { SOFTMAX_CROSS_ENTROPY } UENUM1BYTE(LOSS); +struct NN_CONFIG_V2; +typedef struct NN_CONFIG_V2 NN_CONFIG_V2; +struct FC_LAYER; 
+typedef struct FC_LAYER FC_LAYER; +#endif // CONFIG_NN_V2 + +struct CNN_CONFIG; +typedef struct CNN_CONFIG CNN_CONFIG; +struct CNN_LAYER_CONFIG; +typedef struct CNN_LAYER_CONFIG CNN_LAYER_CONFIG; +struct CNN_THREAD_DATA; +typedef struct CNN_THREAD_DATA CNN_THREAD_DATA; +struct CNN_BRANCH_CONFIG; +typedef struct CNN_BRANCH_CONFIG CNN_BRANCH_CONFIG; +struct CNN_MULTI_OUT; +typedef struct CNN_MULTI_OUT CNN_MULTI_OUT; /* Function pointers return by CfL functions */ typedef void (*cfl_subsample_lbd_fn)(const uint8_t *input, int input_stride, uint16_t *output_q3); +#if CONFIG_AV1_HIGHBITDEPTH typedef void (*cfl_subsample_hbd_fn)(const uint16_t *input, int input_stride, uint16_t *output_q3); +typedef void (*cfl_predict_hbd_fn)(const int16_t *src, uint16_t *dst, + int dst_stride, int alpha_q3, int bd); +#endif + typedef void (*cfl_subtract_average_fn)(const uint16_t *src, int16_t *dst); typedef void (*cfl_predict_lbd_fn)(const int16_t *src, uint8_t *dst, int dst_stride, int alpha_q3); -typedef void (*cfl_predict_hbd_fn)(const int16_t *src, uint16_t *dst, - int dst_stride, int alpha_q3, int bd); EOF } forward_decls qw/av1_common_forward_decls/; @@ -65,22 +90,24 @@ if ($opts{arch} eq "x86_64") { add_proto qw/void av1_convolve_horiz_rs/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn"; specialize qw/av1_convolve_horiz_rs sse4_1/; -add_proto qw/void av1_highbd_convolve_horiz_rs/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn, int bd"; -specialize qw/av1_highbd_convolve_horiz_rs sse4_1/; - -add_proto qw/void av1_wiener_convolve_add_src/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params"; +if(aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { + add_proto 
qw/void av1_highbd_convolve_horiz_rs/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn, int bd"; + specialize qw/av1_highbd_convolve_horiz_rs sse4_1/; -add_proto qw/void av1_highbd_wiener_convolve_add_src/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params, int bps"; + add_proto qw/void av1_highbd_wiener_convolve_add_src/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params, int bd"; + specialize qw/av1_highbd_wiener_convolve_add_src ssse3 avx2/; +} +add_proto qw/void av1_wiener_convolve_add_src/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params"; specialize qw/av1_wiener_convolve_add_src sse2 avx2 neon/; -specialize qw/av1_highbd_wiener_convolve_add_src ssse3/; -specialize qw/av1_highbd_wiener_convolve_add_src avx2/; - # directional intra predictor functions add_proto qw/void av1_dr_prediction_z1/, "uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_above, int dx, int dy"; +specialize qw/av1_dr_prediction_z1 avx2/; add_proto qw/void av1_dr_prediction_z2/, "uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_above, int upsample_left, int dx, int dy"; +specialize qw/av1_dr_prediction_z2 avx2/; add_proto qw/void av1_dr_prediction_z3/, "uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_left, int dx, int dy"; +specialize qw/av1_dr_prediction_z3 avx2/; # FILTER_INTRA 
predictor functions add_proto qw/void av1_filter_intra_predictor/, "uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint8_t *above, const uint8_t *left, int mode"; @@ -108,21 +135,21 @@ specialize qw/av1_highbd_convolve8_vert/, "$sse2_x86_64"; add_proto qw/void av1_inv_txfm_add/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param"; specialize qw/av1_inv_txfm_add ssse3 avx2 neon/; -add_proto qw/void av1_highbd_inv_txfm_add/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param"; +add_proto qw/void av1_highbd_inv_txfm_add/, "const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param"; specialize qw/av1_highbd_inv_txfm_add sse4_1 avx2/; -add_proto qw/void av1_highbd_inv_txfm_add_4x4/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param"; +add_proto qw/void av1_highbd_inv_txfm_add_4x4/, "const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param"; specialize qw/av1_highbd_inv_txfm_add_4x4 sse4_1/; -add_proto qw/void av1_highbd_inv_txfm_add_8x8/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param"; +add_proto qw/void av1_highbd_inv_txfm_add_8x8/, "const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param"; specialize qw/av1_highbd_inv_txfm_add_8x8 sse4_1/; -add_proto qw/void av1_highbd_inv_txfm_add_16x8/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param"; -specialize qw/av1_highbd_inv_txfm_add_16x8 sse4_1/; -add_proto qw/void av1_highbd_inv_txfm_add_8x16/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param"; -specialize qw/av1_highbd_inv_txfm_add_8x16 sse4_1/; -add_proto qw/void av1_highbd_inv_txfm_add_16x16/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param"; -specialize qw/av1_highbd_inv_txfm_add_16x16 sse4_1/; -add_proto qw/void av1_highbd_inv_txfm_add_32x32/, "const tran_low_t 
*dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param"; -specialize qw/av1_highbd_inv_txfm_add_32x32 sse4_1 avx2/; +add_proto qw/void av1_highbd_inv_txfm_add_4x8/, "const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param"; +specialize qw/av1_highbd_inv_txfm_add_4x8 sse4_1/; +add_proto qw/void av1_highbd_inv_txfm_add_8x4/, "const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param"; +specialize qw/av1_highbd_inv_txfm_add_8x4 sse4_1/; +add_proto qw/void av1_highbd_inv_txfm_add_4x16/, "const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param"; +specialize qw/av1_highbd_inv_txfm_add_4x16 sse4_1/; +add_proto qw/void av1_highbd_inv_txfm_add_16x4/, "const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param"; +specialize qw/av1_highbd_inv_txfm_add_16x4 sse4_1/; add_proto qw/void av1_highbd_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; add_proto qw/void av1_highbd_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; @@ -151,10 +178,15 @@ add_proto qw/void av1_inv_txfm2d_add_16x4/, "const int32_t *input, uint16_t *out add_proto qw/void av1_inv_txfm2d_add_8x32/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd"; add_proto qw/void av1_inv_txfm2d_add_32x8/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd"; -# directional intra predictor functions -add_proto qw/void av1_highbd_dr_prediction_z1/, "uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_above, int dx, int dy, int bd"; -add_proto qw/void av1_highbd_dr_prediction_z2/, "uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_above, int upsample_left, int dx, int dy, int bd"; -add_proto qw/void av1_highbd_dr_prediction_z3/, "uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, 
const uint16_t *left, int upsample_left, int dx, int dy, int bd"; +if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { + # directional intra predictor functions + add_proto qw/void av1_highbd_dr_prediction_z1/, "uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_above, int dx, int dy, int bd"; + specialize qw/av1_highbd_dr_prediction_z1 avx2/; + add_proto qw/void av1_highbd_dr_prediction_z2/, "uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_above, int upsample_left, int dx, int dy, int bd"; + specialize qw/av1_highbd_dr_prediction_z2 avx2/; + add_proto qw/void av1_highbd_dr_prediction_z3/, "uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_left, int dx, int dy, int bd"; + specialize qw/av1_highbd_dr_prediction_z3 avx2/; +} # build compound seg mask functions add_proto qw/void av1_build_compound_diffwtd_mask/, "uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w"; @@ -166,6 +198,10 @@ specialize qw/av1_build_compound_diffwtd_mask_highbd ssse3 avx2/; add_proto qw/void av1_build_compound_diffwtd_mask_d16/, "uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0, int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, ConvolveParams *conv_params, int bd"; specialize qw/av1_build_compound_diffwtd_mask_d16 sse4_1 avx2 neon/; +# Helper functions. +add_proto qw/void av1_round_shift_array/, "int32_t *arr, int size, int bit"; +specialize "av1_round_shift_array", qw/sse4_1 neon/; + # # Encoder functions below this point. # @@ -176,10 +212,17 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { # the transform coefficients are held in 32-bit # values, so the assembler code for av1_block_error can no longer be used. 
add_proto qw/int64_t av1_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz"; - specialize qw/av1_block_error avx2/; + specialize qw/av1_block_error sse2 avx2 neon/; + + add_proto qw/int64_t av1_block_error_lp/, "const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size"; + specialize qw/av1_block_error_lp avx2 neon/; add_proto qw/void av1_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; - specialize qw/av1_quantize_fp sse2 avx2/; + specialize qw/av1_quantize_fp sse2 avx2 neon/; + + add_proto qw/void av1_quantize_lp/, "const int16_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan"; + specialize qw/av1_quantize_lp avx2 neon/; + add_proto qw/void av1_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/av1_quantize_fp_32x32 avx2/; @@ -196,54 +239,71 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { specialize qw/av1_lowbd_fwd_txfm sse2 sse4_1 avx2/; add_proto qw/void av1_fwd_txfm2d_4x8/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd"; + specialize qw/av1_fwd_txfm2d_4x8 sse4_1/; add_proto qw/void av1_fwd_txfm2d_8x4/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd"; + specialize qw/av1_fwd_txfm2d_8x4 sse4_1/; add_proto qw/void av1_fwd_txfm2d_8x16/, "const int16_t *input, int32_t 
*output, int stride, TX_TYPE tx_type, int bd"; - specialize qw/av1_fwd_txfm2d_8x16 sse4_1/; + specialize qw/av1_fwd_txfm2d_8x16 sse4_1 avx2/; add_proto qw/void av1_fwd_txfm2d_16x8/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd"; - specialize qw/av1_fwd_txfm2d_16x8 sse4_1/; + specialize qw/av1_fwd_txfm2d_16x8 sse4_1 avx2/; add_proto qw/void av1_fwd_txfm2d_16x32/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd"; + specialize qw/av1_fwd_txfm2d_16x32 sse4_1/; add_proto qw/void av1_fwd_txfm2d_32x16/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd"; + specialize qw/av1_fwd_txfm2d_32x16 sse4_1/; add_proto qw/void av1_fwd_txfm2d_4x16/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd"; + specialize qw/av1_fwd_txfm2d_4x16 sse4_1/; add_proto qw/void av1_fwd_txfm2d_16x4/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd"; + specialize qw/av1_fwd_txfm2d_16x4 sse4_1/; add_proto qw/void av1_fwd_txfm2d_8x32/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd"; + specialize qw/av1_fwd_txfm2d_8x32 sse4_1/; add_proto qw/void av1_fwd_txfm2d_32x8/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd"; + specialize qw/av1_fwd_txfm2d_32x8 sse4_1/; add_proto qw/void av1_fwd_txfm2d_4x4/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd"; specialize qw/av1_fwd_txfm2d_4x4 sse4_1/; add_proto qw/void av1_fwd_txfm2d_8x8/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd"; - specialize qw/av1_fwd_txfm2d_8x8 sse4_1/; + specialize qw/av1_fwd_txfm2d_8x8 sse4_1 avx2/; add_proto qw/void av1_fwd_txfm2d_16x16/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd"; - specialize qw/av1_fwd_txfm2d_16x16 sse4_1/; + specialize qw/av1_fwd_txfm2d_16x16 sse4_1 avx2/; add_proto qw/void av1_fwd_txfm2d_32x32/, "const int16_t *input, int32_t 
*output, int stride, TX_TYPE tx_type, int bd"; - specialize qw/av1_fwd_txfm2d_32x32 sse4_1/; + specialize qw/av1_fwd_txfm2d_32x32 sse4_1 avx2/; add_proto qw/void av1_fwd_txfm2d_64x64/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd"; - specialize qw/av1_fwd_txfm2d_64x64 sse4_1/; + specialize qw/av1_fwd_txfm2d_64x64 sse4_1 avx2/; add_proto qw/void av1_fwd_txfm2d_32x64/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd"; + specialize qw/av1_fwd_txfm2d_32x64 sse4_1/; add_proto qw/void av1_fwd_txfm2d_64x32/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd"; + specialize qw/av1_fwd_txfm2d_64x32 sse4_1/; add_proto qw/void av1_fwd_txfm2d_16x64/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd"; + specialize qw/av1_fwd_txfm2d_16x64 sse4_1/; add_proto qw/void av1_fwd_txfm2d_64x16/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd"; + specialize qw/av1_fwd_txfm2d_64x16 sse4_1/; # # Motion search # - add_proto qw/int av1_diamond_search_sad/, "struct macroblock *x, const struct search_site_config *cfg, MV *ref_mv, MV *best_mv, int search_param, int sad_per_bit, int *num00, const struct aom_variance_vtable *fn_ptr, const MV *center_mv"; - add_proto qw/int av1_full_range_search/, "const struct macroblock *x, const struct search_site_config *cfg, MV *ref_mv, MV *best_mv, int search_param, int sad_per_bit, int *num00, const struct aom_variance_vtable *fn_ptr, const MV *center_mv"; - add_proto qw/void av1_temporal_filter_apply/, "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count"; - specialize qw/av1_temporal_filter_apply sse2 msa/; + if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") { + add_proto qw/void av1_apply_temporal_filter_yuv/, "const struct yv12_buffer_config *ref_frame, const struct macroblockd *mbd, 
const BLOCK_SIZE block_size, const int mb_row, const int mb_col, const int num_planes, const int strength, const int use_subblock, const int *subblock_filter_weights, const uint8_t *pred, uint32_t *accum, uint16_t *count"; + specialize qw/av1_apply_temporal_filter_yuv sse4_1/; + } + if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") { + add_proto qw/void av1_apply_temporal_filter_planewise/, "const struct yv12_buffer_config *ref_frame, const struct macroblockd *mbd, const BLOCK_SIZE block_size, const int mb_row, const int mb_col, const int num_planes, const double *noise_levels, const int use_subblock, const int block_mse, const int *subblock_mses, const int q_factor, const uint8_t *pred, uint32_t *accum, uint16_t *count"; + specialize qw/av1_apply_temporal_filter_planewise sse2 avx2/; + } add_proto qw/void av1_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr, int log_scale"; # ENCODEMB INVOKE + if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { + add_proto qw/int64_t av1_highbd_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd"; + specialize qw/av1_highbd_block_error sse2 avx2/; + } - add_proto qw/int64_t av1_highbd_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd"; - specialize qw/av1_highbd_block_error sse2/; - - add_proto qw/void av1_highbd_temporal_filter_apply/, "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count"; - - add_proto qw/void av1_highbd_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t 
n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, int log_scale"; - specialize qw/av1_highbd_quantize_fp sse4_1 avx2/; + if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { + add_proto qw/void av1_highbd_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, int log_scale"; + specialize qw/av1_highbd_quantize_fp sse4_1 avx2/; + } add_proto qw/void av1_highbd_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride"; @@ -257,30 +317,57 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { add_proto qw/uint64_t av1_wedge_sse_from_residuals/, "const int16_t *r1, const int16_t *d, const uint8_t *m, int N"; specialize qw/av1_wedge_sse_from_residuals sse2 avx2/; - add_proto qw/int av1_wedge_sign_from_residuals/, "const int16_t *ds, const uint8_t *m, int N, int64_t limit"; + add_proto qw/int8_t av1_wedge_sign_from_residuals/, "const int16_t *ds, const uint8_t *m, int N, int64_t limit"; specialize qw/av1_wedge_sign_from_residuals sse2 avx2/; add_proto qw/void av1_wedge_compute_delta_squares/, "int16_t *d, const int16_t *a, const int16_t *b, int N"; specialize qw/av1_wedge_compute_delta_squares sse2 avx2/; # hash - add_proto qw/uint32_t av1_get_crc32c_value/, "void *crc_calculator, uint8_t *p, int length"; + add_proto qw/uint32_t av1_get_crc32c_value/, "void *crc_calculator, uint8_t *p, size_t length"; specialize qw/av1_get_crc32c_value sse4_2/; - add_proto qw/void av1_compute_stats/, "int wiener_win, const uint8_t *dgd8, const uint8_t *src8, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int 
src_stride, double *M, double *H"; + add_proto qw/void av1_compute_stats/, "int wiener_win, const uint8_t *dgd8, const uint8_t *src8, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H"; specialize qw/av1_compute_stats sse4_1 avx2/; + if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { + add_proto qw/void av1_compute_stats_highbd/, "int wiener_win, const uint8_t *dgd8, const uint8_t *src8, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H, aom_bit_depth_t bit_depth"; + specialize qw/av1_compute_stats_highbd sse4_1 avx2/; + } + + add_proto qw/void av1_calc_proj_params/, " const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2], const sgr_params_type *params"; + specialize qw/av1_calc_proj_params avx2/; + add_proto qw/int64_t av1_lowbd_pixel_proj_error/, " const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params"; specialize qw/av1_lowbd_pixel_proj_error sse4_1 avx2/; + + if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { + add_proto qw/int64_t av1_highbd_pixel_proj_error/, " const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params"; + specialize qw/av1_highbd_pixel_proj_error sse4_1 avx2/; + } + add_proto qw/void av1_get_horver_correlation_full/, " const int16_t *diff, int stride, int w, int h, float *hcorr, float *vcorr"; + specialize qw/av1_get_horver_correlation_full sse4_1 avx2/; + + add_proto qw/void av1_nn_predict/, " const float *input_nodes, const NN_CONFIG *const nn_config, int reduce_prec, float *const output"; + specialize 
qw/av1_nn_predict sse3/; } # end encoder functions +# CNN functions + +add_proto qw/void av1_cnn_activate/, " float **input, int channels, int width, int height, int stride, ACTIVATION layer_activation"; +add_proto qw/void av1_cnn_add/, " float **input, int channels, int width, int height, int stride, const float **add"; +add_proto qw/void av1_cnn_predict/, " const float **input, int in_width, int in_height, int in_stride, const CNN_CONFIG *cnn_config, const CNN_THREAD_DATA *thread_data, CNN_MULTI_OUT *output_struct"; +add_proto qw/void av1_cnn_convolve/, " const float **input, int in_width, int in_height, int in_stride, const CNN_LAYER_CONFIG *layer_config, float **output, int out_stride, int start_idx, int step"; +add_proto qw/void av1_cnn_deconvolve/, " const float **input, int in_width, int in_height, int in_stride, const CNN_LAYER_CONFIG *layer_config, float **output, int out_stride"; +add_proto qw/void av1_cnn_batchnorm/, "float **image, int channels, int width, int height, int stride, const float *gamma, const float *beta, const float *mean, const float *std"; + # Deringing Functions add_proto qw/int cdef_find_dir/, "const uint16_t *img, int stride, int32_t *var, int coeff_shift"; -add_proto qw/void cdef_filter_block/, "uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int max, int coeff_shift"; +add_proto qw/void cdef_filter_block/, "uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int coeff_shift"; -add_proto qw/void copy_rect8_8bit_to_16bit/, "uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h"; -add_proto qw/void copy_rect8_16bit_to_16bit/, "uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h"; +add_proto qw/void cdef_copy_rect8_8bit_to_16bit/, "uint16_t *dst, int dstride, const uint8_t *src, int 
sstride, int v, int h"; +add_proto qw/void cdef_copy_rect8_16bit_to_16bit/, "uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h"; # VS compiling for 32 bit targets does not support vector types in # structs as arguments, which makes the v256 type of the intrinsics @@ -288,27 +375,32 @@ add_proto qw/void copy_rect8_16bit_to_16bit/, "uint16_t *dst, int dstride, const if ($opts{config} !~ /libs-x86-win32-vs.*/) { specialize qw/cdef_find_dir sse2 ssse3 sse4_1 avx2 neon/; specialize qw/cdef_filter_block sse2 ssse3 sse4_1 avx2 neon/; - specialize qw/copy_rect8_8bit_to_16bit sse2 ssse3 sse4_1 avx2 neon/; - specialize qw/copy_rect8_16bit_to_16bit sse2 ssse3 sse4_1 avx2 neon/; + specialize qw/cdef_copy_rect8_8bit_to_16bit sse2 ssse3 sse4_1 avx2 neon/; + specialize qw/cdef_copy_rect8_16bit_to_16bit sse2 ssse3 sse4_1 avx2 neon/; } # WARPED_MOTION / GLOBAL_MOTION functions add_proto qw/void av1_warp_affine/, "const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta"; -specialize qw/av1_warp_affine sse4_1 neon/; +specialize qw/av1_warp_affine sse4_1 avx2 neon/; + +if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { + add_proto qw/void av1_highbd_warp_affine/, "const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta"; + specialize qw/av1_highbd_warp_affine sse4_1/; +} -add_proto qw/void av1_highbd_warp_affine/, "const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, 
ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta"; -specialize qw/av1_highbd_warp_affine sse4_1/; +add_proto qw/int64_t av1_calc_frame_error/, "const uint8_t *const ref, int stride, const uint8_t *const dst, int p_width, int p_height, int p_stride"; +specialize qw/av1_calc_frame_error sse2 avx2/; if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { - add_proto qw/double compute_cross_correlation/, "unsigned char *im1, int stride1, int x1, int y1, unsigned char *im2, int stride2, int x2, int y2"; - specialize qw/compute_cross_correlation sse4_1/; + add_proto qw/double av1_compute_cross_correlation/, "unsigned char *im1, int stride1, int x1, int y1, unsigned char *im2, int stride2, int x2, int y2"; + specialize qw/av1_compute_cross_correlation sse4_1 avx2/; } # LOOP_RESTORATION functions -add_proto qw/void apply_selfguided_restoration/, "const uint8_t *dat, int width, int height, int stride, int eps, const int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf, int bit_depth, int highbd"; -specialize qw/apply_selfguided_restoration sse4_1 avx2 neon/; +add_proto qw/void av1_apply_selfguided_restoration/, "const uint8_t *dat, int width, int height, int stride, int eps, const int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf, int bit_depth, int highbd"; +specialize qw/av1_apply_selfguided_restoration sse4_1 avx2 neon/; add_proto qw/int av1_selfguided_restoration/, "const uint8_t *dgd8, int width, int height, int dgd_stride, int32_t *flt0, int32_t *flt1, int flt_stride, @@ -317,44 +409,48 @@ specialize qw/av1_selfguided_restoration sse4_1 avx2 neon/; # CONVOLVE_ROUND/COMPOUND_ROUND functions -add_proto qw/void av1_convolve_2d_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params"; -add_proto qw/void av1_convolve_2d_copy_sr/, "const uint8_t 
*src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params"; -add_proto qw/void av1_convolve_x_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params"; -add_proto qw/void av1_convolve_y_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params"; -add_proto qw/void av1_jnt_convolve_2d/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params"; -add_proto qw/void av1_jnt_convolve_2d_copy/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params"; -add_proto qw/void av1_jnt_convolve_x/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params"; -add_proto qw/void av1_jnt_convolve_y/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params"; -add_proto qw/void av1_highbd_convolve_2d_copy_sr/, "const uint16_t 
*src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd"; -add_proto qw/void av1_highbd_convolve_2d_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd"; -add_proto qw/void av1_highbd_convolve_x_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd"; -add_proto qw/void av1_highbd_convolve_y_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd"; -add_proto qw/void av1_highbd_jnt_convolve_2d/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd"; -add_proto qw/void av1_highbd_jnt_convolve_x/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd"; -add_proto qw/void av1_highbd_jnt_convolve_y/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, 
ConvolveParams *conv_params, int bd"; -add_proto qw/void av1_highbd_jnt_convolve_2d_copy/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd"; - - add_proto qw/void av1_convolve_2d_scale/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params"; - add_proto qw/void av1_highbd_convolve_2d_scale/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd"; +add_proto qw/void av1_convolve_2d_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params"; +add_proto qw/void av1_convolve_2d_copy_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params"; +add_proto qw/void av1_convolve_x_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params"; +add_proto qw/void av1_convolve_y_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const 
InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params"; +add_proto qw/void av1_dist_wtd_convolve_2d/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params"; +add_proto qw/void av1_dist_wtd_convolve_2d_copy/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params"; +add_proto qw/void av1_dist_wtd_convolve_x/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params"; +add_proto qw/void av1_dist_wtd_convolve_y/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params"; +if(aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { + add_proto qw/void av1_highbd_convolve_2d_copy_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd"; + add_proto qw/void av1_highbd_convolve_2d_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd"; + add_proto qw/void 
av1_highbd_convolve_x_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd"; + add_proto qw/void av1_highbd_convolve_y_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd"; + add_proto qw/void av1_highbd_dist_wtd_convolve_2d/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd"; + add_proto qw/void av1_highbd_dist_wtd_convolve_x/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd"; + add_proto qw/void av1_highbd_dist_wtd_convolve_y/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd"; + add_proto qw/void av1_highbd_dist_wtd_convolve_2d_copy/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd"; + add_proto qw/void av1_highbd_convolve_2d_scale/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const 
InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_qn, const int y_step_qn, ConvolveParams *conv_params, int bd"; +} + + add_proto qw/void av1_convolve_2d_scale/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_qn, const int y_step_qn, ConvolveParams *conv_params"; specialize qw/av1_convolve_2d_sr sse2 avx2 neon/; specialize qw/av1_convolve_2d_copy_sr sse2 avx2 neon/; specialize qw/av1_convolve_x_sr sse2 avx2 neon/; specialize qw/av1_convolve_y_sr sse2 avx2 neon/; specialize qw/av1_convolve_2d_scale sse4_1/; - specialize qw/av1_jnt_convolve_2d ssse3 avx2 neon/; - specialize qw/av1_jnt_convolve_2d_copy sse2 avx2 neon/; - specialize qw/av1_jnt_convolve_x sse2 avx2 neon/; - specialize qw/av1_jnt_convolve_y sse2 avx2 neon/; - specialize qw/av1_highbd_convolve_2d_copy_sr sse2 avx2/; - specialize qw/av1_highbd_convolve_2d_sr ssse3 avx2/; - specialize qw/av1_highbd_convolve_x_sr ssse3 avx2/; - specialize qw/av1_highbd_convolve_y_sr ssse3 avx2/; - specialize qw/av1_highbd_convolve_2d_scale sse4_1/; - specialize qw/av1_highbd_jnt_convolve_2d sse4_1 avx2/; - specialize qw/av1_highbd_jnt_convolve_x sse4_1 avx2/; - specialize qw/av1_highbd_jnt_convolve_y sse4_1 avx2/; - specialize qw/av1_highbd_jnt_convolve_2d_copy sse4_1 avx2/; + specialize qw/av1_dist_wtd_convolve_2d sse2 ssse3 avx2 neon/; + specialize qw/av1_dist_wtd_convolve_2d_copy sse2 avx2 neon/; + specialize qw/av1_dist_wtd_convolve_x sse2 avx2 neon/; + specialize qw/av1_dist_wtd_convolve_y sse2 avx2 neon/; + if(aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { + specialize qw/av1_highbd_dist_wtd_convolve_2d sse4_1 avx2/; + specialize qw/av1_highbd_dist_wtd_convolve_x sse4_1 avx2/; + specialize qw/av1_highbd_dist_wtd_convolve_y sse4_1 avx2/; + specialize 
qw/av1_highbd_dist_wtd_convolve_2d_copy sse4_1 avx2/; + specialize qw/av1_highbd_convolve_2d_copy_sr sse2 avx2/; + specialize qw/av1_highbd_convolve_2d_sr ssse3 avx2/; + specialize qw/av1_highbd_convolve_x_sr ssse3 avx2/; + specialize qw/av1_highbd_convolve_y_sr ssse3 avx2/; + specialize qw/av1_highbd_convolve_2d_scale sse4_1/; + } # INTRA_EDGE functions add_proto qw/void av1_filter_intra_edge/, "uint8_t *p, int sz, int strength"; @@ -368,8 +464,8 @@ add_proto qw/void av1_upsample_intra_edge_high/, "uint16_t *p, int sz, int bd"; specialize qw/av1_upsample_intra_edge_high sse4_1/; # CFL -add_proto qw/cfl_subtract_average_fn get_subtract_average_fn/, "TX_SIZE tx_size"; -specialize qw/get_subtract_average_fn sse2 avx2 neon vsx/; +add_proto qw/cfl_subtract_average_fn cfl_get_subtract_average_fn/, "TX_SIZE tx_size"; +specialize qw/cfl_get_subtract_average_fn sse2 avx2 neon vsx/; add_proto qw/cfl_subsample_lbd_fn cfl_get_luma_subsampling_420_lbd/, "TX_SIZE tx_size"; specialize qw/cfl_get_luma_subsampling_420_lbd ssse3 avx2 neon/; @@ -380,19 +476,21 @@ specialize qw/cfl_get_luma_subsampling_422_lbd ssse3 avx2 neon/; add_proto qw/cfl_subsample_lbd_fn cfl_get_luma_subsampling_444_lbd/, "TX_SIZE tx_size"; specialize qw/cfl_get_luma_subsampling_444_lbd ssse3 avx2 neon/; -add_proto qw/cfl_subsample_hbd_fn cfl_get_luma_subsampling_420_hbd/, "TX_SIZE tx_size"; -specialize qw/cfl_get_luma_subsampling_420_hbd ssse3 avx2 neon/; +if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { + add_proto qw/cfl_subsample_hbd_fn cfl_get_luma_subsampling_420_hbd/, "TX_SIZE tx_size"; + specialize qw/cfl_get_luma_subsampling_420_hbd ssse3 avx2 neon/; -add_proto qw/cfl_subsample_hbd_fn cfl_get_luma_subsampling_422_hbd/, "TX_SIZE tx_size"; -specialize qw/cfl_get_luma_subsampling_422_hbd ssse3 avx2 neon/; + add_proto qw/cfl_subsample_hbd_fn cfl_get_luma_subsampling_422_hbd/, "TX_SIZE tx_size"; + specialize qw/cfl_get_luma_subsampling_422_hbd ssse3 avx2 neon/; -add_proto qw/cfl_subsample_hbd_fn 
cfl_get_luma_subsampling_444_hbd/, "TX_SIZE tx_size"; -specialize qw/cfl_get_luma_subsampling_444_hbd ssse3 avx2 neon/; + add_proto qw/cfl_subsample_hbd_fn cfl_get_luma_subsampling_444_hbd/, "TX_SIZE tx_size"; + specialize qw/cfl_get_luma_subsampling_444_hbd ssse3 avx2 neon/; -add_proto qw/cfl_predict_lbd_fn get_predict_lbd_fn/, "TX_SIZE tx_size"; -specialize qw/get_predict_lbd_fn ssse3 avx2 neon/; + add_proto qw/cfl_predict_hbd_fn cfl_get_predict_hbd_fn/, "TX_SIZE tx_size"; + specialize qw/cfl_get_predict_hbd_fn ssse3 avx2 neon/; +} -add_proto qw/cfl_predict_hbd_fn get_predict_hbd_fn/, "TX_SIZE tx_size"; -specialize qw/get_predict_hbd_fn ssse3 avx2 neon/; +add_proto qw/cfl_predict_lbd_fn cfl_get_predict_lbd_fn/, "TX_SIZE tx_size"; +specialize qw/cfl_get_predict_lbd_fn ssse3 avx2 neon/; 1; diff --git a/media/libaom/src/av1/common/av1_txfm.c b/media/libaom/src/av1/common/av1_txfm.c index bb70eab70..ac43402f4 100644 --- a/media/libaom/src/av1/common/av1_txfm.c +++ b/media/libaom/src/av1/common/av1_txfm.c @@ -10,10 +10,11 @@ */ #include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" #include "av1/common/av1_txfm.h" -// av1_cospi_arr[i][j] = (int)round(cos(M_PI*j/128) * (1<<(cos_bit_min+i))); +// av1_cospi_arr[i][j] = (int)round(cos(PI*j/128) * (1<<(cos_bit_min+i))); const int32_t av1_cospi_arr_data[7][64] = { { 1024, 1024, 1023, 1021, 1019, 1016, 1013, 1009, 1004, 999, 993, 987, 980, 972, 964, 955, 946, 936, 926, 915, 903, 891, 878, 865, 851, 837, diff --git a/media/libaom/src/av1/common/av1_txfm.h b/media/libaom/src/av1/common/av1_txfm.h index 59d64ca4a..20049b680 100644 --- a/media/libaom/src/av1/common/av1_txfm.h +++ b/media/libaom/src/av1/common/av1_txfm.h @@ -59,7 +59,9 @@ static INLINE int32_t range_check_value(int32_t value, int8_t bit) { const int64_t min_value = -(1LL << (bit - 1)); if (value < min_value || value > max_value) { fprintf(stderr, "coeff out of bit range, value: %d bit %d\n", value, bit); +#if !CONFIG_AV1_ENCODER assert(0); +#endif } 
#endif // CONFIG_COEFFICIENT_RANGE_CHECKING #if DO_RANGE_CHECK_CLAMP @@ -110,7 +112,7 @@ typedef void (*TxfmFunc)(const int32_t *input, int32_t *output, int8_t cos_bit, typedef void (*FwdTxfm2dFunc)(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd); -typedef enum TXFM_TYPE { +enum { TXFM_TYPE_DCT4, TXFM_TYPE_DCT8, TXFM_TYPE_DCT16, @@ -125,7 +127,7 @@ typedef enum TXFM_TYPE { TXFM_TYPE_IDENTITY32, TXFM_TYPES, TXFM_TYPE_INVALID, -} TXFM_TYPE; +} UENUM1BYTE(TXFM_TYPE); typedef struct TXFM_2D_FLIP_CFG { TX_SIZE tx_size; diff --git a/media/libaom/src/av1/common/blockd.c b/media/libaom/src/av1/common/blockd.c index 2e796b656..00725ea2d 100644 --- a/media/libaom/src/av1/common/blockd.c +++ b/media/libaom/src/av1/common/blockd.c @@ -13,8 +13,8 @@ #include "aom_ports/system_state.h" +#include "av1/common/av1_common_int.h" #include "av1/common/blockd.h" -#include "av1/common/onyxc_int.h" PREDICTION_MODE av1_left_block_mode(const MB_MODE_INFO *left_mi) { if (!left_mi) return DC_PRED; @@ -28,11 +28,12 @@ PREDICTION_MODE av1_above_block_mode(const MB_MODE_INFO *above_mi) { return above_mi->mode; } -void av1_set_contexts(const MACROBLOCKD *xd, struct macroblockd_plane *pd, - int plane, BLOCK_SIZE plane_bsize, TX_SIZE tx_size, - int has_eob, int aoff, int loff) { - ENTROPY_CONTEXT *const a = pd->above_context + aoff; - ENTROPY_CONTEXT *const l = pd->left_context + loff; +void av1_set_entropy_contexts(const MACROBLOCKD *xd, + struct macroblockd_plane *pd, int plane, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, + int has_eob, int aoff, int loff) { + ENTROPY_CONTEXT *const a = pd->above_entropy_context + aoff; + ENTROPY_CONTEXT *const l = pd->left_entropy_context + loff; const int txs_wide = tx_size_wide_unit[tx_size]; const int txs_high = tx_size_high_unit[tx_size]; @@ -56,23 +57,18 @@ void av1_set_contexts(const MACROBLOCKD *xd, struct macroblockd_plane *pd, memset(l, has_eob, sizeof(*l) * txs_high); } } -void av1_reset_skip_context(MACROBLOCKD *xd, int 
mi_row, int mi_col, - BLOCK_SIZE bsize, const int num_planes) { - int i; - int nplanes; - int chroma_ref; - chroma_ref = - is_chroma_reference(mi_row, mi_col, bsize, xd->plane[1].subsampling_x, - xd->plane[1].subsampling_y); - nplanes = 1 + (num_planes - 1) * chroma_ref; - for (i = 0; i < nplanes; i++) { +void av1_reset_entropy_context(MACROBLOCKD *xd, BLOCK_SIZE bsize, + const int num_planes) { + assert(bsize < BLOCK_SIZES_ALL); + const int nplanes = 1 + (num_planes - 1) * xd->is_chroma_ref; + for (int i = 0; i < nplanes; i++) { struct macroblockd_plane *const pd = &xd->plane[i]; const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y); - const int txs_wide = block_size_wide[plane_bsize] >> tx_size_wide_log2[0]; - const int txs_high = block_size_high[plane_bsize] >> tx_size_high_log2[0]; - memset(pd->above_context, 0, sizeof(ENTROPY_CONTEXT) * txs_wide); - memset(pd->left_context, 0, sizeof(ENTROPY_CONTEXT) * txs_high); + const int txs_wide = mi_size_wide[plane_bsize]; + const int txs_high = mi_size_high[plane_bsize]; + memset(pd->above_entropy_context, 0, sizeof(ENTROPY_CONTEXT) * txs_wide); + memset(pd->left_entropy_context, 0, sizeof(ENTROPY_CONTEXT) * txs_high); } } @@ -104,37 +100,3 @@ void av1_setup_block_planes(MACROBLOCKD *xd, int ss_x, int ss_y, xd->plane[i].subsampling_y = 1; } } - -const int16_t dr_intra_derivative[90] = { - // More evenly spread out angles and limited to 10-bit - // Values that are 0 will never be used - // Approx angle - 0, 0, 0, // - 1023, 0, 0, // 3, ... - 547, 0, 0, // 6, ... - 372, 0, 0, 0, 0, // 9, ... - 273, 0, 0, // 14, ... - 215, 0, 0, // 17, ... - 178, 0, 0, // 20, ... - 151, 0, 0, // 23, ... (113 & 203 are base angles) - 132, 0, 0, // 26, ... - 116, 0, 0, // 29, ... - 102, 0, 0, 0, // 32, ... - 90, 0, 0, // 36, ... - 80, 0, 0, // 39, ... - 71, 0, 0, // 42, ... - 64, 0, 0, // 45, ... (45 & 135 are base angles) - 57, 0, 0, // 48, ... - 51, 0, 0, // 51, ... - 45, 0, 0, 0, // 54, ... 
- 40, 0, 0, // 58, ... - 35, 0, 0, // 61, ... - 31, 0, 0, // 64, ... - 27, 0, 0, // 67, ... (67 & 157 are base angles) - 23, 0, 0, // 70, ... - 19, 0, 0, // 73, ... - 15, 0, 0, 0, 0, // 76, ... - 11, 0, 0, // 81, ... - 7, 0, 0, // 84, ... - 3, 0, 0, // 87, ... -}; diff --git a/media/libaom/src/av1/common/blockd.h b/media/libaom/src/av1/common/blockd.h index a2311c1b0..47597bc83 100644 --- a/media/libaom/src/av1/common/blockd.h +++ b/media/libaom/src/av1/common/blockd.h @@ -37,20 +37,22 @@ extern "C" { #define MAX_DIFFWTD_MASK_BITS 1 +#define INTERINTRA_WEDGE_SIGN 0 + // DIFFWTD_MASK_TYPES should not surpass 1 << MAX_DIFFWTD_MASK_BITS -typedef enum ATTRIBUTE_PACKED { +enum { DIFFWTD_38 = 0, DIFFWTD_38_INV, DIFFWTD_MASK_TYPES, -} DIFFWTD_MASK_TYPE; +} UENUM1BYTE(DIFFWTD_MASK_TYPE); -typedef enum ATTRIBUTE_PACKED { +enum { KEY_FRAME = 0, INTER_FRAME = 1, INTRA_ONLY_FRAME = 2, // replaces intra-only S_FRAME = 3, FRAME_TYPES, -} FRAME_TYPE; +} UENUM1BYTE(FRAME_TYPE); static INLINE int is_comp_ref_allowed(BLOCK_SIZE bsize) { return AOMMIN(block_size_wide[bsize], block_size_high[bsize]) >= 8; @@ -73,24 +75,24 @@ static INLINE int is_inter_compound_mode(PREDICTION_MODE mode) { } static INLINE PREDICTION_MODE compound_ref0_mode(PREDICTION_MODE mode) { - static PREDICTION_MODE lut[] = { - MB_MODE_COUNT, // DC_PRED - MB_MODE_COUNT, // V_PRED - MB_MODE_COUNT, // H_PRED - MB_MODE_COUNT, // D45_PRED - MB_MODE_COUNT, // D135_PRED - MB_MODE_COUNT, // D113_PRED - MB_MODE_COUNT, // D157_PRED - MB_MODE_COUNT, // D203_PRED - MB_MODE_COUNT, // D67_PRED - MB_MODE_COUNT, // SMOOTH_PRED - MB_MODE_COUNT, // SMOOTH_V_PRED - MB_MODE_COUNT, // SMOOTH_H_PRED - MB_MODE_COUNT, // PAETH_PRED - MB_MODE_COUNT, // NEARESTMV - MB_MODE_COUNT, // NEARMV - MB_MODE_COUNT, // GLOBALMV - MB_MODE_COUNT, // NEWMV + static const PREDICTION_MODE lut[] = { + DC_PRED, // DC_PRED + V_PRED, // V_PRED + H_PRED, // H_PRED + D45_PRED, // D45_PRED + D135_PRED, // D135_PRED + D113_PRED, // D113_PRED + D157_PRED, // 
D157_PRED + D203_PRED, // D203_PRED + D67_PRED, // D67_PRED + SMOOTH_PRED, // SMOOTH_PRED + SMOOTH_V_PRED, // SMOOTH_V_PRED + SMOOTH_H_PRED, // SMOOTH_H_PRED + PAETH_PRED, // PAETH_PRED + NEARESTMV, // NEARESTMV + NEARMV, // NEARMV + GLOBALMV, // GLOBALMV + NEWMV, // NEWMV NEARESTMV, // NEAREST_NEARESTMV NEARMV, // NEAR_NEARMV NEARESTMV, // NEAREST_NEWMV @@ -101,12 +103,12 @@ static INLINE PREDICTION_MODE compound_ref0_mode(PREDICTION_MODE mode) { NEWMV, // NEW_NEWMV }; assert(NELEMENTS(lut) == MB_MODE_COUNT); - assert(is_inter_compound_mode(mode)); + assert(is_inter_compound_mode(mode) || is_inter_singleref_mode(mode)); return lut[mode]; } static INLINE PREDICTION_MODE compound_ref1_mode(PREDICTION_MODE mode) { - static PREDICTION_MODE lut[] = { + static const PREDICTION_MODE lut[] = { MB_MODE_COUNT, // DC_PRED MB_MODE_COUNT, // V_PRED MB_MODE_COUNT, // H_PRED @@ -156,18 +158,16 @@ static INLINE int is_masked_compound_type(COMPOUND_TYPE type) { modes for the Y blocks to the left and above us; for interframes, there is a single probability table. 
*/ -typedef int8_t MV_REFERENCE_FRAME; - typedef struct { - // Number of base colors for Y (0) and UV (1) - uint8_t palette_size[2]; // Value of base colors for Y, U, and V uint16_t palette_colors[3 * PALETTE_MAX_SIZE]; + // Number of base colors for Y (0) and UV (1) + uint8_t palette_size[2]; } PALETTE_MODE_INFO; typedef struct { - uint8_t use_filter_intra; FILTER_INTRA_MODE filter_intra_mode; + uint8_t use_filter_intra; } FILTER_INTRA_MODE_INFO; static const PREDICTION_MODE fimode_to_intradir[FILTER_INTRA_MODES] = { @@ -189,23 +189,24 @@ typedef struct RD_STATS { int64_t rdcost; int64_t sse; int skip; // sse should equal to dist when skip == 1 - int64_t ref_rdcost; int zero_rate; - uint8_t invalid_rate; #if CONFIG_RD_DEBUG int txb_coeff_cost[MAX_MB_PLANE]; - int txb_coeff_cost_map[MAX_MB_PLANE][TXB_COEFF_COST_MAP_SIZE] - [TXB_COEFF_COST_MAP_SIZE]; + // TODO(jingning): Temporary solution to silence stack over-size warning + // in handle_inter_mode. This should be fixed after rate-distortion + // optimization refactoring. + int16_t txb_coeff_cost_map[MAX_MB_PLANE][TXB_COEFF_COST_MAP_SIZE] + [TXB_COEFF_COST_MAP_SIZE]; #endif // CONFIG_RD_DEBUG } RD_STATS; // This struct is used to group function args that are commonly // sent together in functions related to interinter compound modes typedef struct { - int wedge_index; - int wedge_sign; - DIFFWTD_MASK_TYPE mask_type; uint8_t *seg_mask; + int8_t wedge_index; + int8_t wedge_sign; + DIFFWTD_MASK_TYPE mask_type; COMPOUND_TYPE type; } INTERINTER_COMPOUND_DATA; @@ -213,66 +214,60 @@ typedef struct { #define TXK_TYPE_BUF_LEN 64 // This structure now relates to 4x4 block regions. 
typedef struct MB_MODE_INFO { + // interinter members + INTERINTER_COMPOUND_DATA interinter_comp; + WarpedMotionParams wm_params; + int_mv mv[2]; + int current_qindex; + // Only for INTER blocks + int_interpfilters interp_filters; + // TODO(debargha): Consolidate these flags +#if CONFIG_RD_DEBUG + RD_STATS rd_stats; + int mi_row; + int mi_col; +#endif +#if CONFIG_INSPECTION + int16_t tx_skip[TXK_TYPE_BUF_LEN]; +#endif + PALETTE_MODE_INFO palette_mode_info; // Common for both INTER and INTRA blocks BLOCK_SIZE sb_type; PREDICTION_MODE mode; - TX_SIZE tx_size; - uint8_t inter_tx_size[INTER_TX_SIZE_BUF_LEN]; - int8_t skip; - int8_t skip_mode; - int8_t segment_id; - int8_t seg_id_predicted; // valid only when temporal_update is enabled - // Only for INTRA blocks UV_PREDICTION_MODE uv_mode; - - PALETTE_MODE_INFO palette_mode_info; - uint8_t use_intrabc; - - // Only for INTER blocks - InterpFilters interp_filters; - MV_REFERENCE_FRAME ref_frame[2]; - - TX_TYPE txk_type[TXK_TYPE_BUF_LEN]; - - FILTER_INTRA_MODE_INFO filter_intra_mode_info; - - // The actual prediction angle is the base angle + (angle_delta * step). - int8_t angle_delta[PLANE_TYPES]; - // interintra members INTERINTRA_MODE interintra_mode; - // TODO(debargha): Consolidate these flags - int use_wedge_interintra; - int interintra_wedge_index; - int interintra_wedge_sign; - // interinter members - INTERINTER_COMPOUND_DATA interinter_comp; MOTION_MODE motion_mode; - int overlappable_neighbors[2]; - int_mv mv[2]; - uint8_t ref_mv_idx; PARTITION_TYPE partition; + MV_REFERENCE_FRAME ref_frame[2]; + FILTER_INTRA_MODE_INFO filter_intra_mode_info; + int8_t skip; + uint8_t inter_tx_size[INTER_TX_SIZE_BUF_LEN]; + TX_SIZE tx_size; + int8_t delta_lf_from_base; + int8_t delta_lf[FRAME_LF_COUNT]; + int8_t interintra_wedge_index; + // The actual prediction angle is the base angle + (angle_delta * step). 
+ int8_t angle_delta[PLANE_TYPES]; /* deringing gain *per-superblock* */ - int8_t cdef_strength; - int current_qindex; - int delta_lf_from_base; - int delta_lf[FRAME_LF_COUNT]; -#if CONFIG_RD_DEBUG - RD_STATS rd_stats; - int mi_row; - int mi_col; -#endif - int num_proj_ref; - WarpedMotionParams wm_params; - - // Index of the alpha Cb and alpha Cr combination - int cfl_alpha_idx; // Joint sign of alpha Cb and alpha Cr - int cfl_alpha_signs; - - int compound_idx; - int comp_group_idx; + int8_t cfl_alpha_signs; + // Index of the alpha Cb and alpha Cr combination + uint8_t cfl_alpha_idx; + uint8_t num_proj_ref; + uint8_t overlappable_neighbors[2]; + // If comp_group_idx=0, indicate if dist_wtd_comp(0) or avg_comp(1) is used. + uint8_t compound_idx; + uint8_t use_wedge_interintra : 1; + uint8_t segment_id : 3; + uint8_t seg_id_predicted : 1; // valid only when temporal_update is enabled + uint8_t skip_mode : 1; + uint8_t use_intrabc : 1; + uint8_t ref_mv_idx : 2; + // Indicate if masked compound is used(1) or not(0). 
+ uint8_t comp_group_idx : 1; + int8_t cdef_strength : 4; } MB_MODE_INFO; static INLINE int is_intrabc_block(const MB_MODE_INFO *mbmi) { @@ -366,13 +361,13 @@ static INLINE void mi_to_pixel_loc(int *pixel_c, int *pixel_r, int mi_col, int mi_row, int tx_blk_col, int tx_blk_row, int subsampling_x, int subsampling_y) { *pixel_c = ((mi_col >> subsampling_x) << MI_SIZE_LOG2) + - (tx_blk_col << tx_size_wide_log2[0]); + (tx_blk_col << MI_SIZE_LOG2); *pixel_r = ((mi_row >> subsampling_y) << MI_SIZE_LOG2) + - (tx_blk_row << tx_size_high_log2[0]); + (tx_blk_row << MI_SIZE_LOG2); } #endif -enum ATTRIBUTE_PACKED mv_precision { MV_PRECISION_Q3, MV_PRECISION_Q4 }; +enum { MV_PRECISION_Q3, MV_PRECISION_Q4 } UENUM1BYTE(mv_precision); struct buf_2d { uint8_t *buf; @@ -403,10 +398,10 @@ typedef struct macroblockd_plane { int subsampling_y; struct buf_2d dst; struct buf_2d pre[2]; - ENTROPY_CONTEXT *above_context; - ENTROPY_CONTEXT *left_context; + ENTROPY_CONTEXT *above_entropy_context; + ENTROPY_CONTEXT *left_entropy_context; - // The dequantizers below are true dequntizers used only in the + // The dequantizers below are true dequantizers used only in the // dequantization process. They have the same coefficient // shift/scale as TX. int16_t seg_dequant_QTX[MAX_SEGMENTS][2]; @@ -417,23 +412,9 @@ typedef struct macroblockd_plane { qm_val_t *seg_iqmatrix[MAX_SEGMENTS][TX_SIZES_ALL]; qm_val_t *seg_qmatrix[MAX_SEGMENTS][TX_SIZES_ALL]; - - // the 'dequantizers' below are not literal dequantizer values. - // They're used by encoder RDO to generate ad-hoc lambda values. - // They use a hardwired Q3 coeff shift and do not necessarily match - // the TX scale in use. 
- const int16_t *dequant_Q3; } MACROBLOCKD_PLANE; -#define BLOCK_OFFSET(x, i) \ - ((x) + (i) * (1 << (tx_size_wide_log2[0] + tx_size_high_log2[0]))) - -typedef struct RefBuffer { - int idx; // frame buf idx - int map_idx; // frame map idx - YV12_BUFFER_CONFIG *buf; - struct scale_factors sf; -} RefBuffer; +#define BLOCK_OFFSET(i) ((i) << 4) typedef struct { DECLARE_ALIGNED(16, InterpKernel, vfilter); @@ -478,74 +459,148 @@ typedef struct cfl_ctx { // Chroma subsampling int subsampling_x, subsampling_y; - int mi_row, mi_col; - // Whether the reconstructed luma pixels need to be stored int store_y; #if CONFIG_DEBUG int rate; #endif // CONFIG_DEBUG - - int is_chroma_reference; } CFL_CTX; -typedef struct jnt_comp_params { - int use_jnt_comp_avg; +typedef struct dist_wtd_comp_params { + int use_dist_wtd_comp_avg; int fwd_offset; int bck_offset; -} JNT_COMP_PARAMS; +} DIST_WTD_COMP_PARAMS; + +struct scale_factors; // Most/all of the pointers are mere pointers to actual arrays are allocated // elsewhere. This is mostly for coding convenience. typedef struct macroblockd { + // Row and column position of current macroblock in mi units. + int mi_row; + int mi_col; + // Same as cm->mi_params.mi_stride, copied here for convenience. + int mi_stride; + + // True if current block transmits chroma information. + // More detail: + // Smallest supported block size for both luma and chroma plane is 4x4. Hence, + // in case of subsampled chroma plane (YUV 4:2:0 or YUV 4:2:2), multiple luma + // blocks smaller than 8x8 maybe combined into one chroma block. + // For example, for YUV 4:2:0, let's say an 8x8 area is split into four 4x4 + // luma blocks. Then, a single chroma block of size 4x4 will cover the area of + // these four luma blocks. This is implemented in bitstream as follows: + // - There are four MB_MODE_INFO structs for the four luma blocks. + // - First 3 MB_MODE_INFO have is_chroma_ref = false, and so do not transmit + // any information for chroma planes. 
+ // - Last block will have is_chroma_ref = true and transmits chroma + // information for the 4x4 chroma block that covers whole 8x8 area covered by + // four luma blocks. + // Similar logic applies for chroma blocks that cover 2 or 3 luma blocks. + bool is_chroma_ref; + struct macroblockd_plane plane[MAX_MB_PLANE]; TileInfo tile; - int mi_stride; - + // Appropriate offset inside cm->mi_params.mi_grid_base based on current + // mi_row and mi_col. MB_MODE_INFO **mi; + + // True if 4x4 block above the current block is available. + bool up_available; + // True if 4x4 block to the left of the current block is available. + bool left_available; + // True if the above chrome reference block is available. + bool chroma_up_available; + // True if the left chrome reference block is available. + bool chroma_left_available; + + // MB_MODE_INFO for 4x4 block to the left of the current block, if + // left_available == true; otherwise NULL. MB_MODE_INFO *left_mbmi; + // MB_MODE_INFO for 4x4 block above the current block, if + // up_available == true; otherwise NULL. MB_MODE_INFO *above_mbmi; + // Above chroma reference block if is_chroma_ref == true for the current block + // and chroma_up_available == true; otherwise NULL. + // See also: the special case logic when current chroma block covers more than + // one luma blocks in set_mi_row_col(). MB_MODE_INFO *chroma_left_mbmi; + // Left chroma reference block if is_chroma_ref == true for the current block + // and chroma_left_available == true; otherwise NULL. + // See also: the special case logic when current chroma block covers more than + // one luma blocks in set_mi_row_col(). MB_MODE_INFO *chroma_above_mbmi; - int up_available; - int left_available; - int chroma_up_available; - int chroma_left_available; + // Appropriate offset based on current 'mi_row' and 'mi_col', inside + // 'tx_type_map' in one of 'CommonModeInfoParams', 'PICK_MODE_CONTEXT' or + // 'MACROBLOCK' structs. 
+ uint8_t *tx_type_map; + // Stride for 'tx_type_map'. Note that this may / may not be same as + // 'mi_stride', depending on which actual array 'tx_type_map' points to. + int tx_type_map_stride; - /* Distance of MB away from frame edges in subpixels (1/8th pixel) */ + // Distance of this macroblock from frame edges in 1/8th pixel units. int mb_to_left_edge; int mb_to_right_edge; int mb_to_top_edge; int mb_to_bottom_edge; - /* pointers to reference frames */ - const RefBuffer *block_refs[2]; + // Scale factors for reference frames of the current block. + // These are pointers into 'cm->ref_scale_factors'. + const struct scale_factors *block_ref_scale_factors[2]; - /* pointer to current frame */ const YV12_BUFFER_CONFIG *cur_buf; - ENTROPY_CONTEXT *above_context[MAX_MB_PLANE]; - ENTROPY_CONTEXT left_context[MAX_MB_PLANE][MAX_MIB_SIZE]; - - PARTITION_CONTEXT *above_seg_context; - PARTITION_CONTEXT left_seg_context[MAX_MIB_SIZE]; - + // Entropy contexts for the above blocks. + // above_entropy_context[i][j] corresponds to above entropy context for ith + // plane and jth mi column of this *frame*, wrt current 'mi_row'. + // These are pointers into 'cm->above_contexts.entropy'. + ENTROPY_CONTEXT *above_entropy_context[MAX_MB_PLANE]; + // Entropy contexts for the left blocks. + // left_entropy_context[i][j] corresponds to left entropy context for ith + // plane and jth mi row of this *superblock*, wrt current 'mi_col'. + // Note: These contain actual data, NOT pointers. + ENTROPY_CONTEXT left_entropy_context[MAX_MB_PLANE][MAX_MIB_SIZE]; + + // Partition contexts for the above blocks. + // above_partition_context[i] corresponds to above partition context for ith + // mi column of this *frame*, wrt current 'mi_row'. + // These are pointers into 'cm->above_contexts.partition'. + PARTITION_CONTEXT *above_partition_context; + // Partition contexts for the left blocks. 
+ // left_partition_context[i] corresponds to left partition context for ith + // mi row of this *superblock*, wrt current 'mi_col'. + // Note: These contain actual data, NOT pointers. + PARTITION_CONTEXT left_partition_context[MAX_MIB_SIZE]; + + // Transform contexts for the above blocks. + // TODO(urvang): Indexed two different ways from cm->above_contexts.txfm in + // code currently. Need to make it consistent / document why. TXFM_CONTEXT *above_txfm_context; + // Transform contexts for the left blocks. TXFM_CONTEXT *left_txfm_context; + // TODO(urvang): 'left_txfm_context' points to 'left_txfm_context_buffer'. + // Can we remove this indirection? TXFM_CONTEXT left_txfm_context_buffer[MAX_MIB_SIZE]; + // Default values for the two restoration filters for each plane. + // These values are used as reference values when writing the bitstream. That + // is, we transmit the delta between the actual values in + // cm->rst_info[plane].unit_info[unit_idx] and these reference values. WienerInfo wiener_info[MAX_MB_PLANE]; SgrprojInfo sgrproj_info[MAX_MB_PLANE]; - // block dimension in the unit of mode_info. - uint8_t n4_w, n4_h; + // Block dimensions in MB_MODE_INFO units. + uint8_t width; + uint8_t height; uint8_t ref_mv_count[MODE_CTX_REF_FRAMES]; CANDIDATE_MV ref_mv_stack[MODE_CTX_REF_FRAMES][MAX_REF_MV_STACK_SIZE]; + uint16_t weight[MODE_CTX_REF_FRAMES][MAX_REF_MV_STACK_SIZE]; uint8_t is_sec_rect; // Counts of each reference frame in the above and left neighboring blocks. @@ -553,15 +608,18 @@ typedef struct macroblockd { uint8_t neighbors_ref_counts[REF_FRAMES]; FRAME_CONTEXT *tile_ctx; - /* Bit depth: 8, 10, 12 */ + // Bit depth: copied from cm->seq_params.bit_depth for convenience. int bd; int qindex[MAX_SEGMENTS]; int lossless[MAX_SEGMENTS]; + // TODO(urvang): Move to decoder. int corrupted; + // Same as cm->features.cur_frame_force_integer_mv. int cur_frame_force_integer_mv; - // same with that in AV1_COMMON + // Pointer to cm->error. 
struct aom_internal_error_info *error_info; + // Same as cm->global_motion. const WarpedMotionParams *global_motion; int delta_qindex; int current_qindex; @@ -571,7 +629,7 @@ typedef struct macroblockd { // filtering level) and code the delta between previous superblock's delta // lf and current delta lf. It is equivalent to the delta between previous // superblock's actual lf and current lf. - int delta_lf_from_base; + int8_t delta_lf_from_base; // For this experiment, we have four frame filter levels for different plane // and direction. So, to support the per superblock update, we need to add // a few more params as below. @@ -585,14 +643,27 @@ typedef struct macroblockd { // SEG_LVL_ALT_LF_Y_H = 2; // SEG_LVL_ALT_LF_U = 3; // SEG_LVL_ALT_LF_V = 4; - int delta_lf[FRAME_LF_COUNT]; - int cdef_preset[4]; + int8_t delta_lf[FRAME_LF_COUNT]; + // cdef_transmitted[i] is true if CDEF strength for ith CDEF unit in the + // current superblock has already been read from (decoder) / written to + // (encoder) the bitstream; and false otherwise. + // More detail: + // (1) CDEF strength is transmitted only once per CDEF unit, in the 1st + // non-skip coding block. So, we need this array to keep track of whether CDEF + // strengths for the given CDEF units have been transmitted yet or not. + // (2) Superblock size can be either 128x128 or 64x64, but CDEF unit size is + // fixed to be 64x64. So, there may be 4 CDEF units within a superblock (if + // superblock size is 128x128). Hence the array size is 4. + // (3) In the current implementation, CDEF strength for this CDEF unit is + // stored in the MB_MODE_INFO of the 1st block in this CDEF unit (inside + // cm->mi_params.mi_grid_base). 
+ bool cdef_transmitted[4]; DECLARE_ALIGNED(16, uint8_t, seg_mask[2 * MAX_SB_SQUARE]); uint8_t *mc_buf[2]; CFL_CTX cfl; - JNT_COMP_PARAMS jcp_param; + DIST_WTD_COMP_PARAMS jcp_param; uint16_t cb_offset[MAX_MB_PLANE]; uint16_t txb_offset[MAX_MB_PLANE]; @@ -602,7 +673,7 @@ typedef struct macroblockd { uint8_t *tmp_obmc_bufs[2]; } MACROBLOCKD; -static INLINE int get_bitdepth_data_path_index(const MACROBLOCKD *xd) { +static INLINE int is_cur_buf_hbd(const MACROBLOCKD *xd) { return xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH ? 1 : 0; } @@ -646,19 +717,19 @@ static INLINE BLOCK_SIZE get_partition_subsize(BLOCK_SIZE bsize, static TX_TYPE intra_mode_to_tx_type(const MB_MODE_INFO *mbmi, PLANE_TYPE plane_type) { static const TX_TYPE _intra_mode_to_tx_type[INTRA_MODES] = { - DCT_DCT, // DC - ADST_DCT, // V - DCT_ADST, // H - DCT_DCT, // D45 - ADST_ADST, // D135 - ADST_DCT, // D117 - DCT_ADST, // D153 - DCT_ADST, // D207 - ADST_DCT, // D63 - ADST_ADST, // SMOOTH - ADST_DCT, // SMOOTH_V - DCT_ADST, // SMOOTH_H - ADST_ADST, // PAETH + DCT_DCT, // DC_PRED + ADST_DCT, // V_PRED + DCT_ADST, // H_PRED + DCT_DCT, // D45_PRED + ADST_ADST, // D135_PRED + ADST_DCT, // D113_PRED + DCT_ADST, // D157_PRED + DCT_ADST, // D203_PRED + ADST_DCT, // D67_PRED + ADST_ADST, // SMOOTH_PRED + ADST_DCT, // SMOOTH_V_PRED + DCT_ADST, // SMOOTH_H_PRED + ADST_ADST, // PAETH_PRED }; const PREDICTION_MODE mode = (plane_type == PLANE_TYPE_Y) ? 
mbmi->mode : get_uv_mode(mbmi->uv_mode); @@ -686,6 +757,22 @@ static const int av1_ext_tx_used[EXT_TX_SET_TYPES][TX_TYPES] = { { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, }; +static const uint16_t av1_reduced_intra_tx_used_flag[INTRA_MODES] = { + 0x080F, // DC_PRED: 0000 1000 0000 1111 + 0x040F, // V_PRED: 0000 0100 0000 1111 + 0x080F, // H_PRED: 0000 1000 0000 1111 + 0x020F, // D45_PRED: 0000 0010 0000 1111 + 0x080F, // D135_PRED: 0000 1000 0000 1111 + 0x040F, // D113_PRED: 0000 0100 0000 1111 + 0x080F, // D157_PRED: 0000 1000 0000 1111 + 0x080F, // D203_PRED: 0000 1000 0000 1111 + 0x040F, // D67_PRED: 0000 0100 0000 1111 + 0x080F, // SMOOTH_PRED: 0000 1000 0000 1111 + 0x040F, // SMOOTH_V_PRED: 0000 0100 0000 1111 + 0x080F, // SMOOTH_H_PRED: 0000 1000 0000 1111 + 0x0C0E, // PAETH_PRED: 0000 1100 0000 1110 +}; + static const uint16_t av1_ext_tx_used_flag[EXT_TX_SET_TYPES] = { 0x0001, // 0000 0000 0000 0001 0x0201, // 0000 0010 0000 0001 @@ -695,6 +782,11 @@ static const uint16_t av1_ext_tx_used_flag[EXT_TX_SET_TYPES] = { 0xFFFF, // 1111 1111 1111 1111 }; +static const TxSetType av1_ext_tx_set_lookup[2][2] = { + { EXT_TX_SET_DTT4_IDTX_1DDCT, EXT_TX_SET_DTT4_IDTX }, + { EXT_TX_SET_ALL16, EXT_TX_SET_DTT9_IDTX_1DDCT }, +}; + static INLINE TxSetType av1_get_ext_tx_set_type(TX_SIZE tx_size, int is_inter, int use_reduced_set) { const TX_SIZE tx_size_sqr_up = txsize_sqr_up_map[tx_size]; @@ -704,13 +796,7 @@ static INLINE TxSetType av1_get_ext_tx_set_type(TX_SIZE tx_size, int is_inter, if (use_reduced_set) return is_inter ? EXT_TX_SET_DCT_IDTX : EXT_TX_SET_DTT4_IDTX; const TX_SIZE tx_size_sqr = txsize_sqr_map[tx_size]; - if (is_inter) { - return (tx_size_sqr == TX_16X16 ? EXT_TX_SET_DTT9_IDTX_1DDCT - : EXT_TX_SET_ALL16); - } else { - return (tx_size_sqr == TX_16X16 ? EXT_TX_SET_DTT4_IDTX - : EXT_TX_SET_DTT4_IDTX_1DDCT); - } + return av1_ext_tx_set_lookup[is_inter][tx_size_sqr == TX_16X16]; } // Maps tx set types to the indices. 
@@ -749,7 +835,6 @@ static INLINE TX_SIZE tx_size_from_tx_mode(BLOCK_SIZE bsize, TX_MODE tx_mode) { return largest_tx_size; } -extern const int16_t dr_intra_derivative[90]; static const uint8_t mode_to_angle_map[] = { 0, 90, 180, 45, 135, 113, 157, 203, 67, 0, 0, 0, 0, }; @@ -777,11 +862,13 @@ static INLINE int av1_raster_order_to_block_index(TX_SIZE tx_size, static INLINE TX_TYPE get_default_tx_type(PLANE_TYPE plane_type, const MACROBLOCKD *xd, - TX_SIZE tx_size) { + TX_SIZE tx_size, + int is_screen_content_type) { const MB_MODE_INFO *const mbmi = xd->mi[0]; if (is_inter_block(mbmi) || plane_type != PLANE_TYPE_Y || - xd->lossless[mbmi->segment_id] || tx_size >= TX_32X32) + xd->lossless[mbmi->segment_id] || tx_size >= TX_32X32 || + is_screen_content_type) return DCT_DCT; return intra_mode_to_tx_type(mbmi, plane_type); @@ -792,45 +879,77 @@ static INLINE TX_TYPE get_default_tx_type(PLANE_TYPE plane_type, static INLINE BLOCK_SIZE get_plane_block_size(BLOCK_SIZE bsize, int subsampling_x, int subsampling_y) { - if (bsize == BLOCK_INVALID) return BLOCK_INVALID; + assert(bsize < BLOCK_SIZES_ALL); + assert(subsampling_x >= 0 && subsampling_x < 2); + assert(subsampling_y >= 0 && subsampling_y < 2); return ss_size_lookup[bsize][subsampling_x][subsampling_y]; } +/* + * Logic to generate the lookup tables: + * + * TX_SIZE txs = max_txsize_rect_lookup[bsize]; + * for (int level = 0; level < MAX_VARTX_DEPTH - 1; ++level) + * txs = sub_tx_size_map[txs]; + * const int tx_w_log2 = tx_size_wide_log2[txs] - MI_SIZE_LOG2; + * const int tx_h_log2 = tx_size_high_log2[txs] - MI_SIZE_LOG2; + * const int bw_uint_log2 = mi_size_wide_log2[bsize]; + * const int stride_log2 = bw_uint_log2 - tx_w_log2; + */ static INLINE int av1_get_txb_size_index(BLOCK_SIZE bsize, int blk_row, int blk_col) { - TX_SIZE txs = max_txsize_rect_lookup[bsize]; - for (int level = 0; level < MAX_VARTX_DEPTH - 1; ++level) - txs = sub_tx_size_map[txs]; - const int tx_w_log2 = tx_size_wide_log2[txs] - MI_SIZE_LOG2; - 
const int tx_h_log2 = tx_size_high_log2[txs] - MI_SIZE_LOG2; - const int bw_log2 = mi_size_wide_log2[bsize]; - const int stride_log2 = bw_log2 - tx_w_log2; + static const uint8_t tw_w_log2_table[BLOCK_SIZES_ALL] = { + 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 3, 0, 1, 1, 2, 2, 3, + }; + static const uint8_t tw_h_log2_table[BLOCK_SIZES_ALL] = { + 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 3, 1, 0, 2, 1, 3, 2, + }; + static const uint8_t stride_log2_table[BLOCK_SIZES_ALL] = { + 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 2, 2, 0, 1, 0, 1, 0, 1, + }; const int index = - ((blk_row >> tx_h_log2) << stride_log2) + (blk_col >> tx_w_log2); + ((blk_row >> tw_h_log2_table[bsize]) << stride_log2_table[bsize]) + + (blk_col >> tw_w_log2_table[bsize]); assert(index < INTER_TX_SIZE_BUF_LEN); return index; } +#if CONFIG_INSPECTION +/* + * Here is the logic to generate the lookup tables: + * + * TX_SIZE txs = max_txsize_rect_lookup[bsize]; + * for (int level = 0; level < MAX_VARTX_DEPTH; ++level) + * txs = sub_tx_size_map[txs]; + * const int tx_w_log2 = tx_size_wide_log2[txs] - MI_SIZE_LOG2; + * const int tx_h_log2 = tx_size_high_log2[txs] - MI_SIZE_LOG2; + * const int bw_uint_log2 = mi_size_wide_log2[bsize]; + * const int stride_log2 = bw_uint_log2 - tx_w_log2; + */ static INLINE int av1_get_txk_type_index(BLOCK_SIZE bsize, int blk_row, int blk_col) { - TX_SIZE txs = max_txsize_rect_lookup[bsize]; - for (int level = 0; level < MAX_VARTX_DEPTH; ++level) - txs = sub_tx_size_map[txs]; - const int tx_w_log2 = tx_size_wide_log2[txs] - MI_SIZE_LOG2; - const int tx_h_log2 = tx_size_high_log2[txs] - MI_SIZE_LOG2; - const int bw_uint_log2 = mi_size_wide_log2[bsize]; - const int stride_log2 = bw_uint_log2 - tx_w_log2; + static const uint8_t tw_w_log2_table[BLOCK_SIZES_ALL] = { + 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 2, 2, 2, 0, 0, 1, 1, 2, 2, + }; + static const uint8_t tw_h_log2_table[BLOCK_SIZES_ALL] = { + 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 2, 2, 2, 0, 0, 1, 1, 2, 2, + }; + static 
const uint8_t stride_log2_table[BLOCK_SIZES_ALL] = { + 0, 0, 1, 1, 1, 2, 2, 1, 2, 2, 1, 2, 2, 2, 3, 3, 0, 2, 0, 2, 0, 2, + }; const int index = - ((blk_row >> tx_h_log2) << stride_log2) + (blk_col >> tx_w_log2); + ((blk_row >> tw_h_log2_table[bsize]) << stride_log2_table[bsize]) + + (blk_col >> tw_w_log2_table[bsize]); assert(index < TXK_TYPE_BUF_LEN); return index; } +#endif // CONFIG_INSPECTION -static INLINE void update_txk_array(TX_TYPE *txk_type, BLOCK_SIZE bsize, - int blk_row, int blk_col, TX_SIZE tx_size, +static INLINE void update_txk_array(MACROBLOCKD *const xd, int blk_row, + int blk_col, TX_SIZE tx_size, TX_TYPE tx_type) { - const int txk_type_idx = av1_get_txk_type_index(bsize, blk_row, blk_col); - txk_type[txk_type_idx] = tx_type; + const int stride = xd->tx_type_map_stride; + xd->tx_type_map[blk_row * stride + blk_col] = tx_type; const int txw = tx_size_wide_unit[tx_size]; const int txh = tx_size_high_unit[tx_size]; @@ -843,71 +962,84 @@ static INLINE void update_txk_array(TX_TYPE *txk_type, BLOCK_SIZE bsize, const int tx_unit = tx_size_wide_unit[TX_16X16]; for (int idy = 0; idy < txh; idy += tx_unit) { for (int idx = 0; idx < txw; idx += tx_unit) { - const int this_index = - av1_get_txk_type_index(bsize, blk_row + idy, blk_col + idx); - txk_type[this_index] = tx_type; + xd->tx_type_map[(blk_row + idy) * stride + blk_col + idx] = tx_type; } } } } -static INLINE TX_TYPE av1_get_tx_type(PLANE_TYPE plane_type, - const MACROBLOCKD *xd, int blk_row, +static INLINE TX_TYPE av1_get_tx_type(const MACROBLOCKD *xd, + PLANE_TYPE plane_type, int blk_row, int blk_col, TX_SIZE tx_size, int reduced_tx_set) { const MB_MODE_INFO *const mbmi = xd->mi[0]; - const struct macroblockd_plane *const pd = &xd->plane[plane_type]; - const TxSetType tx_set_type = - av1_get_ext_tx_set_type(tx_size, is_inter_block(mbmi), reduced_tx_set); + if (xd->lossless[mbmi->segment_id] || txsize_sqr_up_map[tx_size] > TX_32X32) { + return DCT_DCT; + } TX_TYPE tx_type; - if 
(xd->lossless[mbmi->segment_id] || txsize_sqr_up_map[tx_size] > TX_32X32) { - tx_type = DCT_DCT; + if (plane_type == PLANE_TYPE_Y) { + tx_type = xd->tx_type_map[blk_row * xd->tx_type_map_stride + blk_col]; } else { - if (plane_type == PLANE_TYPE_Y) { - const int txk_type_idx = - av1_get_txk_type_index(mbmi->sb_type, blk_row, blk_col); - tx_type = mbmi->txk_type[txk_type_idx]; - } else if (is_inter_block(mbmi)) { + if (is_inter_block(mbmi)) { // scale back to y plane's coordinate + const struct macroblockd_plane *const pd = &xd->plane[plane_type]; blk_row <<= pd->subsampling_y; blk_col <<= pd->subsampling_x; - const int txk_type_idx = - av1_get_txk_type_index(mbmi->sb_type, blk_row, blk_col); - tx_type = mbmi->txk_type[txk_type_idx]; + tx_type = xd->tx_type_map[blk_row * xd->tx_type_map_stride + blk_col]; } else { // In intra mode, uv planes don't share the same prediction mode as y // plane, so the tx_type should not be shared tx_type = intra_mode_to_tx_type(mbmi, PLANE_TYPE_UV); } + const TxSetType tx_set_type = + av1_get_ext_tx_set_type(tx_size, is_inter_block(mbmi), reduced_tx_set); + if (!av1_ext_tx_used[tx_set_type][tx_type]) tx_type = DCT_DCT; } assert(tx_type < TX_TYPES); - if (!av1_ext_tx_used[tx_set_type][tx_type]) return DCT_DCT; + assert(av1_ext_tx_used[av1_get_ext_tx_set_type(tx_size, is_inter_block(mbmi), + reduced_tx_set)][tx_type]); return tx_type; } void av1_setup_block_planes(MACROBLOCKD *xd, int ss_x, int ss_y, const int num_planes); +/* + * Logic to generate the lookup table: + * + * TX_SIZE tx_size = max_txsize_rect_lookup[bsize]; + * int depth = 0; + * while (depth < MAX_TX_DEPTH && tx_size != TX_4X4) { + * depth++; + * tx_size = sub_tx_size_map[tx_size]; + * } + */ static INLINE int bsize_to_max_depth(BLOCK_SIZE bsize) { - TX_SIZE tx_size = max_txsize_rect_lookup[bsize]; - int depth = 0; - while (depth < MAX_TX_DEPTH && tx_size != TX_4X4) { - depth++; - tx_size = sub_tx_size_map[tx_size]; - } - return depth; + static const uint8_t 
bsize_to_max_depth_table[BLOCK_SIZES_ALL] = { + 0, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + }; + return bsize_to_max_depth_table[bsize]; } +/* + * Logic to generate the lookup table: + * + * TX_SIZE tx_size = max_txsize_rect_lookup[bsize]; + * assert(tx_size != TX_4X4); + * int depth = 0; + * while (tx_size != TX_4X4) { + * depth++; + * tx_size = sub_tx_size_map[tx_size]; + * } + * assert(depth < 10); + */ static INLINE int bsize_to_tx_size_cat(BLOCK_SIZE bsize) { - TX_SIZE tx_size = max_txsize_rect_lookup[bsize]; - assert(tx_size != TX_4X4); - int depth = 0; - while (tx_size != TX_4X4) { - depth++; - tx_size = sub_tx_size_map[tx_size]; - assert(depth < 10); - } + assert(bsize < BLOCK_SIZES_ALL); + static const uint8_t bsize_to_tx_size_depth_table[BLOCK_SIZES_ALL] = { + 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 2, 2, 3, 3, 4, 4, + }; + const int depth = bsize_to_tx_size_depth_table[bsize]; assert(depth <= MAX_TX_CATS); return depth - 1; } @@ -948,8 +1080,8 @@ static INLINE TX_SIZE av1_get_tx_size(int plane, const MACROBLOCKD *xd) { pd->subsampling_y); } -void av1_reset_skip_context(MACROBLOCKD *xd, int mi_row, int mi_col, - BLOCK_SIZE bsize, const int num_planes); +void av1_reset_entropy_context(MACROBLOCKD *xd, BLOCK_SIZE bsize, + const int num_planes); void av1_reset_loop_filter_delta(MACROBLOCKD *xd, int num_planes); @@ -960,9 +1092,10 @@ typedef void (*foreach_transformed_block_visitor)(int plane, int block, BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg); -void av1_set_contexts(const MACROBLOCKD *xd, struct macroblockd_plane *pd, - int plane, BLOCK_SIZE plane_bsize, TX_SIZE tx_size, - int has_eob, int aoff, int loff); +void av1_set_entropy_contexts(const MACROBLOCKD *xd, + struct macroblockd_plane *pd, int plane, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, + int has_eob, int aoff, int loff); #define MAX_INTERINTRA_SB_SQUARE 32 * 32 static INLINE int is_interintra_mode(const MB_MODE_INFO *mbmi) { @@ -1013,15 +1146,13 @@ static 
INLINE int get_vartx_max_txsize(const MACROBLOCKD *xd, BLOCK_SIZE bsize, } static INLINE int is_motion_variation_allowed_bsize(BLOCK_SIZE bsize) { + assert(bsize < BLOCK_SIZES_ALL); return AOMMIN(block_size_wide[bsize], block_size_high[bsize]) >= 8; } static INLINE int is_motion_variation_allowed_compound( const MB_MODE_INFO *mbmi) { - if (!has_second_ref(mbmi)) - return 1; - else - return 0; + return !has_second_ref(mbmi); } // input: log2 of length, 0(4), 1(8), ... @@ -1045,7 +1176,8 @@ motion_mode_allowed(const WarpedMotionParams *gm_params, const MACROBLOCKD *xd, if (!check_num_overlappable_neighbors(mbmi)) return SIMPLE_TRANSLATION; assert(!has_second_ref(mbmi)); if (mbmi->num_proj_ref >= 1 && - (allow_warped_motion && !av1_is_scaled(&(xd->block_refs[0]->sf)))) { + (allow_warped_motion && + !av1_is_scaled(xd->block_ref_scale_factors[0]))) { if (xd->cur_frame_force_integer_mv) { return OBMC_CAUSAL; } @@ -1057,25 +1189,13 @@ motion_mode_allowed(const WarpedMotionParams *gm_params, const MACROBLOCKD *xd, } } -static INLINE void assert_motion_mode_valid(MOTION_MODE mode, - const WarpedMotionParams *gm_params, - const MACROBLOCKD *xd, - const MB_MODE_INFO *mbmi, - int allow_warped_motion) { - const MOTION_MODE last_motion_mode_allowed = - motion_mode_allowed(gm_params, xd, mbmi, allow_warped_motion); - - // Check that the input mode is not illegal - if (last_motion_mode_allowed < mode) - assert(0 && "Illegal motion mode selected"); -} - static INLINE int is_neighbor_overlappable(const MB_MODE_INFO *mbmi) { return (is_inter_block(mbmi)); } static INLINE int av1_allow_palette(int allow_screen_content_tools, BLOCK_SIZE sb_type) { + assert(sb_type < BLOCK_SIZES_ALL); return allow_screen_content_tools && block_size_wide[sb_type] <= 64 && block_size_high[sb_type] <= 64 && sb_type >= BLOCK_8X8; } diff --git a/media/libaom/src/av1/common/cdef.c b/media/libaom/src/av1/common/cdef.c index e9e2b0e42..ef7b866b5 100644 --- a/media/libaom/src/av1/common/cdef.c +++ 
b/media/libaom/src/av1/common/cdef.c @@ -16,45 +16,29 @@ #include "config/aom_scale_rtcd.h" #include "aom/aom_integer.h" +#include "av1/common/av1_common_int.h" #include "av1/common/cdef.h" #include "av1/common/cdef_block.h" -#include "av1/common/onyxc_int.h" #include "av1/common/reconinter.h" -int sb_all_skip(const AV1_COMMON *const cm, int mi_row, int mi_col) { - int maxc, maxr; - int skip = 1; - maxc = cm->mi_cols - mi_col; - maxr = cm->mi_rows - mi_row; - - maxr = AOMMIN(maxr, MI_SIZE_64X64); - maxc = AOMMIN(maxc, MI_SIZE_64X64); - - for (int r = 0; r < maxr; r++) { - for (int c = 0; c < maxc; c++) { - skip = - skip && - cm->mi_grid_visible[(mi_row + r) * cm->mi_stride + mi_col + c]->skip; - } - } - return skip; -} - static int is_8x8_block_skip(MB_MODE_INFO **grid, int mi_row, int mi_col, int mi_stride) { - int is_skip = 1; - for (int r = 0; r < mi_size_high[BLOCK_8X8]; ++r) - for (int c = 0; c < mi_size_wide[BLOCK_8X8]; ++c) - is_skip &= grid[(mi_row + r) * mi_stride + (mi_col + c)]->skip; + MB_MODE_INFO **mbmi = grid + mi_row * mi_stride + mi_col; + for (int r = 0; r < mi_size_high[BLOCK_8X8]; ++r, mbmi += mi_stride) { + for (int c = 0; c < mi_size_wide[BLOCK_8X8]; ++c) { + if (!mbmi[c]->skip) return 0; + } + } - return is_skip; + return 1; } -int sb_compute_cdef_list(const AV1_COMMON *const cm, int mi_row, int mi_col, - cdef_list *dlist, BLOCK_SIZE bs) { - MB_MODE_INFO **grid = cm->mi_grid_visible; - int maxc = cm->mi_cols - mi_col; - int maxr = cm->mi_rows - mi_row; +int av1_cdef_compute_sb_list(const CommonModeInfoParams *const mi_params, + int mi_row, int mi_col, cdef_list *dlist, + BLOCK_SIZE bs) { + MB_MODE_INFO **grid = mi_params->mi_grid_base; + int maxc = mi_params->mi_cols - mi_col; + int maxr = mi_params->mi_rows - mi_row; if (bs == BLOCK_128X128 || bs == BLOCK_128X64) maxc = AOMMIN(maxc, MI_SIZE_128X128); @@ -65,22 +49,17 @@ int sb_compute_cdef_list(const AV1_COMMON *const cm, int mi_row, int mi_col, else maxr = AOMMIN(maxr, MI_SIZE_64X64); - 
const int r_step = mi_size_high[BLOCK_8X8]; - const int c_step = mi_size_wide[BLOCK_8X8]; - const int r_shift = (r_step == 2); - const int c_shift = (c_step == 2); - - assert(r_step == 1 || r_step == 2); - assert(c_step == 1 || c_step == 2); - + const int r_step = 2; // mi_size_high[BLOCK_8X8] + const int c_step = 2; // mi_size_wide[BLOCK_8X8] + const int r_shift = 1; + const int c_shift = 1; int count = 0; - for (int r = 0; r < maxr; r += r_step) { for (int c = 0; c < maxc; c += c_step) { - if (!is_8x8_block_skip(grid, mi_row + r, mi_col + c, cm->mi_stride)) { + if (!is_8x8_block_skip(grid, mi_row + r, mi_col + c, + mi_params->mi_stride)) { dlist[count].by = r >> r_shift; dlist[count].bx = c >> c_shift; - dlist[count].skip = 0; count++; } } @@ -88,8 +67,9 @@ int sb_compute_cdef_list(const AV1_COMMON *const cm, int mi_row, int mi_col, return count; } -void copy_rect8_8bit_to_16bit_c(uint16_t *dst, int dstride, const uint8_t *src, - int sstride, int v, int h) { +void cdef_copy_rect8_8bit_to_16bit_c(uint16_t *dst, int dstride, + const uint8_t *src, int sstride, int v, + int h) { for (int i = 0; i < v; i++) { for (int j = 0; j < h; j++) { dst[i * dstride + j] = src[i * sstride + j]; @@ -97,9 +77,9 @@ void copy_rect8_8bit_to_16bit_c(uint16_t *dst, int dstride, const uint8_t *src, } } -void copy_rect8_16bit_to_16bit_c(uint16_t *dst, int dstride, - const uint16_t *src, int sstride, int v, - int h) { +void cdef_copy_rect8_16bit_to_16bit_c(uint16_t *dst, int dstride, + const uint16_t *src, int sstride, int v, + int h) { for (int i = 0; i < v; i++) { for (int j = 0; j < h; j++) { dst[i * dstride + j] = src[i * sstride + j]; @@ -107,16 +87,16 @@ void copy_rect8_16bit_to_16bit_c(uint16_t *dst, int dstride, } } -static void copy_sb8_16(AOM_UNUSED AV1_COMMON *cm, uint16_t *dst, int dstride, +static void copy_sb8_16(AV1_COMMON *cm, uint16_t *dst, int dstride, const uint8_t *src, int src_voffset, int src_hoffset, int sstride, int vsize, int hsize) { if 
(cm->seq_params.use_highbitdepth) { const uint16_t *base = &CONVERT_TO_SHORTPTR(src)[src_voffset * sstride + src_hoffset]; - copy_rect8_16bit_to_16bit(dst, dstride, base, sstride, vsize, hsize); + cdef_copy_rect8_16bit_to_16bit(dst, dstride, base, sstride, vsize, hsize); } else { const uint8_t *base = &src[src_voffset * sstride + src_hoffset]; - copy_rect8_8bit_to_16bit(dst, dstride, base, sstride, vsize, hsize); + cdef_copy_rect8_8bit_to_16bit(dst, dstride, base, sstride, vsize, hsize); } } @@ -140,6 +120,8 @@ static INLINE void copy_rect(uint16_t *dst, int dstride, const uint16_t *src, void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, MACROBLOCKD *xd) { + const CdefInfo *const cdef_info = &cm->cdef_info; + const CommonModeInfoParams *const mi_params = &cm->mi_params; const int num_planes = av1_num_planes(cm); DECLARE_ALIGNED(16, uint16_t, src[CDEF_INBUF_SIZE]); uint16_t *linebuf[3]; @@ -154,8 +136,8 @@ void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, int xdec[3]; int ydec[3]; int coeff_shift = AOMMAX(cm->seq_params.bit_depth - 8, 0); - const int nvfb = (cm->mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64; - const int nhfb = (cm->mi_cols + MI_SIZE_64X64 - 1) / MI_SIZE_64X64; + const int nvfb = (mi_params->mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64; + const int nhfb = (mi_params->mi_cols + MI_SIZE_64X64 - 1) / MI_SIZE_64X64; av1_setup_dst_planes(xd->plane, cm->seq_params.sb_size, frame, 0, 0, 0, num_planes); row_cdef = aom_malloc(sizeof(*row_cdef) * (nhfb + 2) * 2); @@ -168,7 +150,7 @@ void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, mi_wide_l2[pli] = MI_SIZE_LOG2 - xd->plane[pli].subsampling_x; mi_high_l2[pli] = MI_SIZE_LOG2 - xd->plane[pli].subsampling_y; } - const int stride = (cm->mi_cols << MI_SIZE_LOG2) + 2 * CDEF_HBORDER; + const int stride = (mi_params->mi_cols << MI_SIZE_LOG2) + 2 * CDEF_HBORDER; for (int pli = 0; pli < num_planes; pli++) { linebuf[pli] = aom_malloc(sizeof(*linebuf) * CDEF_VBORDER * stride); 
colbuf[pli] = @@ -190,17 +172,18 @@ void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, int nhb, nvb; int cstart = 0; curr_row_cdef[fbc] = 0; - if (cm->mi_grid_visible[MI_SIZE_64X64 * fbr * cm->mi_stride + - MI_SIZE_64X64 * fbc] == NULL || - cm->mi_grid_visible[MI_SIZE_64X64 * fbr * cm->mi_stride + - MI_SIZE_64X64 * fbc] + if (mi_params->mi_grid_base[MI_SIZE_64X64 * fbr * mi_params->mi_stride + + MI_SIZE_64X64 * fbc] == NULL || + mi_params + ->mi_grid_base[MI_SIZE_64X64 * fbr * mi_params->mi_stride + + MI_SIZE_64X64 * fbc] ->cdef_strength == -1) { cdef_left = 0; continue; } if (!cdef_left) cstart = -CDEF_HBORDER; - nhb = AOMMIN(MI_SIZE_64X64, cm->mi_cols - MI_SIZE_64X64 * fbc); - nvb = AOMMIN(MI_SIZE_64X64, cm->mi_rows - MI_SIZE_64X64 * fbr); + nhb = AOMMIN(MI_SIZE_64X64, mi_params->mi_cols - MI_SIZE_64X64 * fbc); + nvb = AOMMIN(MI_SIZE_64X64, mi_params->mi_rows - MI_SIZE_64X64 * fbr); int frame_top, frame_left, frame_bottom, frame_right; int mi_row = MI_SIZE_64X64 * fbr; @@ -218,32 +201,35 @@ void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, frame_left = (mi_col == 0) ? 1 : 0; if (fbr != nvfb - 1) - frame_bottom = (mi_row + MI_SIZE_64X64 == cm->mi_rows) ? 1 : 0; + frame_bottom = (mi_row + MI_SIZE_64X64 == mi_params->mi_rows) ? 1 : 0; else frame_bottom = 1; if (fbc != nhfb - 1) - frame_right = (mi_col + MI_SIZE_64X64 == cm->mi_cols) ? 1 : 0; + frame_right = (mi_col + MI_SIZE_64X64 == mi_params->mi_cols) ? 
1 : 0; else frame_right = 1; const int mbmi_cdef_strength = - cm->mi_grid_visible[MI_SIZE_64X64 * fbr * cm->mi_stride + - MI_SIZE_64X64 * fbc] + mi_params + ->mi_grid_base[MI_SIZE_64X64 * fbr * mi_params->mi_stride + + MI_SIZE_64X64 * fbc] ->cdef_strength; - level = cm->cdef_strengths[mbmi_cdef_strength] / CDEF_SEC_STRENGTHS; + level = + cdef_info->cdef_strengths[mbmi_cdef_strength] / CDEF_SEC_STRENGTHS; sec_strength = - cm->cdef_strengths[mbmi_cdef_strength] % CDEF_SEC_STRENGTHS; + cdef_info->cdef_strengths[mbmi_cdef_strength] % CDEF_SEC_STRENGTHS; sec_strength += sec_strength == 3; - uv_level = cm->cdef_uv_strengths[mbmi_cdef_strength] / CDEF_SEC_STRENGTHS; + uv_level = + cdef_info->cdef_uv_strengths[mbmi_cdef_strength] / CDEF_SEC_STRENGTHS; uv_sec_strength = - cm->cdef_uv_strengths[mbmi_cdef_strength] % CDEF_SEC_STRENGTHS; + cdef_info->cdef_uv_strengths[mbmi_cdef_strength] % CDEF_SEC_STRENGTHS; uv_sec_strength += uv_sec_strength == 3; if ((level == 0 && sec_strength == 0 && uv_level == 0 && uv_sec_strength == 0) || - (cdef_count = sb_compute_cdef_list(cm, fbr * MI_SIZE_64X64, - fbc * MI_SIZE_64X64, dlist, - BLOCK_64X64)) == 0) { + (cdef_count = av1_cdef_compute_sb_list(mi_params, fbr * MI_SIZE_64X64, + fbc * MI_SIZE_64X64, dlist, + BLOCK_64X64)) == 0) { cdef_left = 0; continue; } @@ -252,8 +238,7 @@ void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, for (int pli = 0; pli < num_planes; pli++) { int coffset; int rend, cend; - int pri_damping = cm->cdef_pri_damping; - int sec_damping = cm->cdef_sec_damping; + int damping = cdef_info->cdef_damping; int hsize = nhb << mi_wide_l2[pli]; int vsize = nvb << mi_high_l2[pli]; @@ -364,7 +349,7 @@ void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, } if (cm->seq_params.use_highbitdepth) { - cdef_filter_fb( + av1_cdef_filter_fb( NULL, &CONVERT_TO_SHORTPTR( xd->plane[pli] @@ -374,9 +359,9 @@ void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, xd->plane[pli].dst.stride, &src[CDEF_VBORDER * 
CDEF_BSTRIDE + CDEF_HBORDER], xdec[pli], ydec[pli], dir, NULL, var, pli, dlist, cdef_count, level, - sec_strength, pri_damping, sec_damping, coeff_shift); + sec_strength, damping, coeff_shift); } else { - cdef_filter_fb( + av1_cdef_filter_fb( &xd->plane[pli] .dst.buf[xd->plane[pli].dst.stride * (MI_SIZE_64X64 * fbr << mi_high_l2[pli]) + @@ -384,7 +369,7 @@ void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, NULL, xd->plane[pli].dst.stride, &src[CDEF_VBORDER * CDEF_BSTRIDE + CDEF_HBORDER], xdec[pli], ydec[pli], dir, NULL, var, pli, dlist, cdef_count, level, - sec_strength, pri_damping, sec_damping, coeff_shift); + sec_strength, damping, coeff_shift); } } cdef_left = 1; diff --git a/media/libaom/src/av1/common/cdef.h b/media/libaom/src/av1/common/cdef.h index 3b2eac8a5..c36fd135a 100644 --- a/media/libaom/src/av1/common/cdef.h +++ b/media/libaom/src/av1/common/cdef.h @@ -20,8 +20,8 @@ #include "aom/aom_integer.h" #include "aom_ports/mem.h" +#include "av1/common/av1_common_int.h" #include "av1/common/cdef_block.h" -#include "av1/common/onyxc_int.h" static INLINE int sign(int i) { return i < 0 ? 
-1 : 1; } @@ -37,13 +37,14 @@ static INLINE int constrain(int diff, int threshold, int damping) { extern "C" { #endif -int sb_all_skip(const AV1_COMMON *const cm, int mi_row, int mi_col); -int sb_compute_cdef_list(const AV1_COMMON *const cm, int mi_row, int mi_col, - cdef_list *dlist, BLOCK_SIZE bsize); +int av1_cdef_compute_sb_list(const CommonModeInfoParams *const mi_params, + int mi_row, int mi_col, cdef_list *dlist, + BLOCK_SIZE bsize); void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, MACROBLOCKD *xd); void av1_cdef_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref, - AV1_COMMON *cm, MACROBLOCKD *xd, int fast); + AV1_COMMON *cm, MACROBLOCKD *xd, int pick_method, + int rdmult); #ifdef __cplusplus } // extern "C" diff --git a/media/libaom/src/av1/common/cdef_block.c b/media/libaom/src/av1/common/cdef_block.c index df1de89be..7120705d3 100644 --- a/media/libaom/src/av1/common/cdef_block.c +++ b/media/libaom/src/av1/common/cdef_block.c @@ -108,17 +108,17 @@ int cdef_find_dir_c(const uint16_t *img, int stride, int32_t *var, } const int cdef_pri_taps[2][2] = { { 4, 2 }, { 3, 3 } }; -const int cdef_sec_taps[2][2] = { { 2, 1 }, { 2, 1 } }; +const int cdef_sec_taps[2] = { 2, 1 }; /* Smooth in the direction detected. */ void cdef_filter_block_c(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, - AOM_UNUSED int max_unused, int coeff_shift) { + int coeff_shift) { int i, j, k; const int s = CDEF_BSTRIDE; const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1]; - const int *sec_taps = cdef_sec_taps[(pri_strength >> coeff_shift) & 1]; + const int *sec_taps = cdef_sec_taps; for (i = 0; i < 4 << (bsize == BLOCK_8X8 || bsize == BLOCK_4X8); i++) { for (j = 0; j < 4 << (bsize == BLOCK_8X8 || bsize == BLOCK_8X4); j++) { int16_t sum = 0; @@ -173,25 +173,20 @@ static INLINE int adjust_strength(int strength, int32_t var) { return var ? 
(strength * (4 + i) + 8) >> 4 : 0; } -void cdef_filter_fb(uint8_t *dst8, uint16_t *dst16, int dstride, uint16_t *in, - int xdec, int ydec, int dir[CDEF_NBLOCKS][CDEF_NBLOCKS], - int *dirinit, int var[CDEF_NBLOCKS][CDEF_NBLOCKS], int pli, - cdef_list *dlist, int cdef_count, int level, - int sec_strength, int pri_damping, int sec_damping, - int coeff_shift) { +void av1_cdef_filter_fb(uint8_t *dst8, uint16_t *dst16, int dstride, + uint16_t *in, int xdec, int ydec, + int dir[CDEF_NBLOCKS][CDEF_NBLOCKS], int *dirinit, + int var[CDEF_NBLOCKS][CDEF_NBLOCKS], int pli, + cdef_list *dlist, int cdef_count, int level, + int sec_strength, int damping, int coeff_shift) { int bi; int bx; int by; - int bsize, bsizex, bsizey; - - int pri_strength = level << coeff_shift; + const int pri_strength = level << coeff_shift; sec_strength <<= coeff_shift; - sec_damping += coeff_shift - (pli != AOM_PLANE_Y); - pri_damping += coeff_shift - (pli != AOM_PLANE_Y); - bsize = - ydec ? (xdec ? BLOCK_4X4 : BLOCK_8X4) : (xdec ? 
BLOCK_4X8 : BLOCK_8X8); - bsizex = 3 - xdec; - bsizey = 3 - ydec; + damping += coeff_shift - (pli != AOM_PLANE_Y); + const int bw_log2 = 3 - xdec; + const int bh_log2 = 3 - ydec; if (dirinit && pri_strength == 0 && sec_strength == 0) { // If we're here, both primary and secondary strengths are 0, and // we still haven't written anything to y[] yet, so we just copy @@ -200,12 +195,12 @@ void cdef_filter_fb(uint8_t *dst8, uint16_t *dst16, int dstride, uint16_t *in, for (bi = 0; bi < cdef_count; bi++) { by = dlist[bi].by; bx = dlist[bi].bx; - int iy, ix; // TODO(stemidts/jmvalin): SIMD optimisations - for (iy = 0; iy < 1 << bsizey; iy++) - for (ix = 0; ix < 1 << bsizex; ix++) - dst16[(bi << (bsizex + bsizey)) + (iy << bsizex) + ix] = - in[((by << bsizey) + iy) * CDEF_BSTRIDE + (bx << bsizex) + ix]; + for (int iy = 0; iy < 1 << bh_log2; iy++) { + memcpy(&dst16[(bi << (bw_log2 + bh_log2)) + (iy << bw_log2)], + &in[((by << bh_log2) + iy) * CDEF_BSTRIDE + (bx << bw_log2)], + ((size_t)1 << bw_log2) * sizeof(*dst16)); + } } return; } @@ -231,27 +226,28 @@ void cdef_filter_fb(uint8_t *dst8, uint16_t *dst16, int dstride, uint16_t *in, } } + const int bsize = + ydec ? (xdec ? BLOCK_4X4 : BLOCK_8X4) : (xdec ? BLOCK_4X8 : BLOCK_8X8); + const int t = pri_strength; + const int s = sec_strength; for (bi = 0; bi < cdef_count; bi++) { - int t = dlist[bi].skip ? 0 : pri_strength; - int s = dlist[bi].skip ? 0 : sec_strength; by = dlist[bi].by; bx = dlist[bi].bx; - if (dst8) - cdef_filter_block(&dst8[(by << bsizey) * dstride + (bx << bsizex)], NULL, - dstride, - &in[(by * CDEF_BSTRIDE << bsizey) + (bx << bsizex)], - (pli ? t : adjust_strength(t, var[by][bx])), s, - t ? dir[by][bx] : 0, pri_damping, sec_damping, bsize, - (256 << coeff_shift) - 1, coeff_shift); - else + if (dst8) { + cdef_filter_block( + &dst8[(by << bh_log2) * dstride + (bx << bw_log2)], NULL, dstride, + &in[(by * CDEF_BSTRIDE << bh_log2) + (bx << bw_log2)], + (pli ? t : adjust_strength(t, var[by][bx])), s, t ? 
dir[by][bx] : 0, + damping, damping, bsize, coeff_shift); + } else { cdef_filter_block( NULL, - &dst16[dirinit ? bi << (bsizex + bsizey) - : (by << bsizey) * dstride + (bx << bsizex)], - dirinit ? 1 << bsizex : dstride, - &in[(by * CDEF_BSTRIDE << bsizey) + (bx << bsizex)], + &dst16[dirinit ? bi << (bw_log2 + bh_log2) + : (by << bh_log2) * dstride + (bx << bw_log2)], + dirinit ? 1 << bw_log2 : dstride, + &in[(by * CDEF_BSTRIDE << bh_log2) + (bx << bw_log2)], (pli ? t : adjust_strength(t, var[by][bx])), s, t ? dir[by][bx] : 0, - pri_damping, sec_damping, bsize, (256 << coeff_shift) - 1, - coeff_shift); + damping, damping, bsize, coeff_shift); + } } } diff --git a/media/libaom/src/av1/common/cdef_block.h b/media/libaom/src/av1/common/cdef_block.h index 6b4452cd6..6b0ae0a9d 100644 --- a/media/libaom/src/av1/common/cdef_block.h +++ b/media/libaom/src/av1/common/cdef_block.h @@ -32,28 +32,27 @@ (CDEF_BSTRIDE * ((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_VBORDER)) extern const int cdef_pri_taps[2][2]; -extern const int cdef_sec_taps[2][2]; +extern const int cdef_sec_taps[2]; DECLARE_ALIGNED(16, extern const int, cdef_directions[8][2]); typedef struct { uint8_t by; uint8_t bx; - uint8_t skip; } cdef_list; typedef void (*cdef_filter_block_func)(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, - int sec_damping, int bsize, int max, + int sec_damping, int bsize, int coeff_shift); void copy_cdef_16bit_to_16bit(uint16_t *dst, int dstride, uint16_t *src, cdef_list *dlist, int cdef_count, int bsize); -void cdef_filter_fb(uint8_t *dst8, uint16_t *dst16, int dstride, uint16_t *in, - int xdec, int ydec, int dir[CDEF_NBLOCKS][CDEF_NBLOCKS], - int *dirinit, int var[CDEF_NBLOCKS][CDEF_NBLOCKS], int pli, - cdef_list *dlist, int cdef_count, int level, - int sec_strength, int pri_damping, int sec_damping, - int coeff_shift); +void av1_cdef_filter_fb(uint8_t *dst8, uint16_t *dst16, int dstride, + uint16_t *in, int xdec, 
int ydec, + int dir[CDEF_NBLOCKS][CDEF_NBLOCKS], int *dirinit, + int var[CDEF_NBLOCKS][CDEF_NBLOCKS], int pli, + cdef_list *dlist, int cdef_count, int level, + int sec_strength, int damping, int coeff_shift); #endif // AOM_AV1_COMMON_CDEF_BLOCK_H_ diff --git a/media/libaom/src/av1/common/cdef_block_simd.h b/media/libaom/src/av1/common/cdef_block_simd.h index 14587a023..5a52bc1e4 100644 --- a/media/libaom/src/av1/common/cdef_block_simd.h +++ b/media/libaom/src/av1/common/cdef_block_simd.h @@ -226,7 +226,6 @@ void SIMD_FUNC(cdef_filter_block_4x4_8)(uint8_t *dst, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, - AOM_UNUSED int max_unused, int coeff_shift) { v128 p0, p1, p2, p3; v256 sum, row, tap, res; @@ -239,7 +238,7 @@ void SIMD_FUNC(cdef_filter_block_4x4_8)(uint8_t *dst, int dstride, int s2o2 = cdef_directions[(dir + 6) & 7][1]; const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1]; - const int *sec_taps = cdef_sec_taps[(pri_strength >> coeff_shift) & 1]; + const int *sec_taps = cdef_sec_taps; if (pri_strength) pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength)); @@ -393,7 +392,6 @@ void SIMD_FUNC(cdef_filter_block_8x8_8)(uint8_t *dst, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, - AOM_UNUSED int max_unused, int coeff_shift) { int i; v128 p0, p1, p2, p3; @@ -407,7 +405,7 @@ void SIMD_FUNC(cdef_filter_block_8x8_8)(uint8_t *dst, int dstride, int s2o2 = cdef_directions[(dir + 6) & 7][1]; const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1]; - const int *sec_taps = cdef_sec_taps[(pri_strength >> coeff_shift) & 1]; + const int *sec_taps = cdef_sec_taps; if (pri_strength) pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength)); @@ -541,7 +539,6 @@ void SIMD_FUNC(cdef_filter_block_4x4_16)(uint16_t *dst, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int 
pri_damping, int sec_damping, - AOM_UNUSED int max_unused, int coeff_shift) { int i; v256 p0, p1, p2, p3, sum, row, res; @@ -554,7 +551,7 @@ void SIMD_FUNC(cdef_filter_block_4x4_16)(uint16_t *dst, int dstride, int s2o2 = cdef_directions[(dir + 6) & 7][1]; const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1]; - const int *sec_taps = cdef_sec_taps[(pri_strength >> coeff_shift) & 1]; + const int *sec_taps = cdef_sec_taps; if (pri_strength) pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength)); @@ -699,7 +696,6 @@ void SIMD_FUNC(cdef_filter_block_8x8_16)(uint16_t *dst, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, - AOM_UNUSED int max_unused, int coeff_shift) { int i; v256 sum, p0, p1, p2, p3, row, res; @@ -712,7 +708,7 @@ void SIMD_FUNC(cdef_filter_block_8x8_16)(uint16_t *dst, int dstride, int s2o2 = cdef_directions[(dir + 6) & 7][1]; const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1]; - const int *sec_taps = cdef_sec_taps[(pri_strength >> coeff_shift) & 1]; + const int *sec_taps = cdef_sec_taps; if (pri_strength) pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength)); @@ -833,63 +829,62 @@ void SIMD_FUNC(cdef_filter_block_8x8_16)(uint16_t *dst, int dstride, void SIMD_FUNC(cdef_filter_block)(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, - int sec_damping, int bsize, int max, - int coeff_shift) { + int sec_damping, int bsize, int coeff_shift) { if (dst8) { if (bsize == BLOCK_8X8) { SIMD_FUNC(cdef_filter_block_8x8_8) (dst8, dstride, in, pri_strength, sec_strength, dir, pri_damping, - sec_damping, max, coeff_shift); + sec_damping, coeff_shift); } else if (bsize == BLOCK_4X8) { SIMD_FUNC(cdef_filter_block_4x4_8) (dst8, dstride, in, pri_strength, sec_strength, dir, pri_damping, - sec_damping, max, coeff_shift); + sec_damping, coeff_shift); SIMD_FUNC(cdef_filter_block_4x4_8) 
(dst8 + 4 * dstride, dstride, in + 4 * CDEF_BSTRIDE, pri_strength, - sec_strength, dir, pri_damping, sec_damping, max, coeff_shift); + sec_strength, dir, pri_damping, sec_damping, coeff_shift); } else if (bsize == BLOCK_8X4) { SIMD_FUNC(cdef_filter_block_4x4_8) (dst8, dstride, in, pri_strength, sec_strength, dir, pri_damping, - sec_damping, max, coeff_shift); + sec_damping, coeff_shift); SIMD_FUNC(cdef_filter_block_4x4_8) (dst8 + 4, dstride, in + 4, pri_strength, sec_strength, dir, pri_damping, - sec_damping, max, coeff_shift); + sec_damping, coeff_shift); } else { SIMD_FUNC(cdef_filter_block_4x4_8) (dst8, dstride, in, pri_strength, sec_strength, dir, pri_damping, - sec_damping, max, coeff_shift); + sec_damping, coeff_shift); } } else { if (bsize == BLOCK_8X8) { SIMD_FUNC(cdef_filter_block_8x8_16) (dst16, dstride, in, pri_strength, sec_strength, dir, pri_damping, - sec_damping, max, coeff_shift); + sec_damping, coeff_shift); } else if (bsize == BLOCK_4X8) { SIMD_FUNC(cdef_filter_block_4x4_16) (dst16, dstride, in, pri_strength, sec_strength, dir, pri_damping, - sec_damping, max, coeff_shift); + sec_damping, coeff_shift); SIMD_FUNC(cdef_filter_block_4x4_16) (dst16 + 4 * dstride, dstride, in + 4 * CDEF_BSTRIDE, pri_strength, - sec_strength, dir, pri_damping, sec_damping, max, coeff_shift); + sec_strength, dir, pri_damping, sec_damping, coeff_shift); } else if (bsize == BLOCK_8X4) { SIMD_FUNC(cdef_filter_block_4x4_16) (dst16, dstride, in, pri_strength, sec_strength, dir, pri_damping, - sec_damping, max, coeff_shift); + sec_damping, coeff_shift); SIMD_FUNC(cdef_filter_block_4x4_16) (dst16 + 4, dstride, in + 4, pri_strength, sec_strength, dir, pri_damping, - sec_damping, max, coeff_shift); + sec_damping, coeff_shift); } else { assert(bsize == BLOCK_4X4); SIMD_FUNC(cdef_filter_block_4x4_16) (dst16, dstride, in, pri_strength, sec_strength, dir, pri_damping, - sec_damping, max, coeff_shift); + sec_damping, coeff_shift); } } } -void 
SIMD_FUNC(copy_rect8_8bit_to_16bit)(uint16_t *dst, int dstride, - const uint8_t *src, int sstride, int v, - int h) { +void SIMD_FUNC(cdef_copy_rect8_8bit_to_16bit)(uint16_t *dst, int dstride, + const uint8_t *src, int sstride, + int v, int h) { int i, j; for (i = 0; i < v; i++) { for (j = 0; j < (h & ~0x7); j += 8) { @@ -902,9 +897,9 @@ void SIMD_FUNC(copy_rect8_8bit_to_16bit)(uint16_t *dst, int dstride, } } -void SIMD_FUNC(copy_rect8_16bit_to_16bit)(uint16_t *dst, int dstride, - const uint16_t *src, int sstride, - int v, int h) { +void SIMD_FUNC(cdef_copy_rect8_16bit_to_16bit)(uint16_t *dst, int dstride, + const uint16_t *src, int sstride, + int v, int h) { int i, j; for (i = 0; i < v; i++) { for (j = 0; j < (h & ~0x7); j += 8) { diff --git a/media/libaom/src/av1/common/cfl.c b/media/libaom/src/av1/common/cfl.c index ccc59b4eb..98199cb95 100644 --- a/media/libaom/src/av1/common/cfl.c +++ b/media/libaom/src/av1/common/cfl.c @@ -9,9 +9,9 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ +#include "av1/common/av1_common_int.h" #include "av1/common/cfl.h" #include "av1/common/common_data.h" -#include "av1/common/onyxc_int.h" #include "config/av1_rtcd.h" @@ -37,7 +37,7 @@ void cfl_store_dc_pred(MACROBLOCKD *const xd, const uint8_t *input, assert(pred_plane < CFL_PRED_PLANES); assert(width <= CFL_BUF_LINE); - if (get_bitdepth_data_path_index(xd)) { + if (is_cur_buf_hbd(xd)) { uint16_t *const input_16 = CONVERT_TO_SHORTPTR(input); memcpy(xd->cfl.dc_pred_cache[pred_plane], input_16, width << 1); return; @@ -69,7 +69,7 @@ void cfl_load_dc_pred(MACROBLOCKD *const xd, uint8_t *dst, int dst_stride, assert(pred_plane < CFL_PRED_PLANES); assert(width <= CFL_BUF_LINE); assert(height <= CFL_BUF_LINE); - if (get_bitdepth_data_path_index(xd)) { + if (is_cur_buf_hbd(xd)) { uint16_t *dst_16 = CONVERT_TO_SHORTPTR(dst); cfl_load_dc_pred_hbd(xd->cfl.dc_pred_cache[pred_plane], dst_16, dst_stride, width, height); @@ -136,7 +136,7 @@ static void subtract_average_c(const uint16_t *src, int16_t *dst, int width, CFL_SUB_AVG_FN(c) -static INLINE int cfl_idx_to_alpha(int alpha_idx, int joint_sign, +static INLINE int cfl_idx_to_alpha(uint8_t alpha_idx, int8_t joint_sign, CFL_PRED_TYPE pred_type) { const int alpha_sign = (pred_type == CFL_PRED_U) ? 
CFL_SIGN_U(joint_sign) : CFL_SIGN_V(joint_sign); @@ -158,18 +158,9 @@ static INLINE void cfl_predict_lbd_c(const int16_t *ac_buf_q3, uint8_t *dst, } } -// Null function used for invalid tx_sizes -void cfl_predict_lbd_null(const int16_t *ac_buf_q3, uint8_t *dst, - int dst_stride, int alpha_q3) { - (void)ac_buf_q3; - (void)dst; - (void)dst_stride; - (void)alpha_q3; - assert(0); -} - CFL_PREDICT_FN(c, lbd) +#if CONFIG_AV1_HIGHBITDEPTH void cfl_predict_hbd_c(const int16_t *ac_buf_q3, uint16_t *dst, int dst_stride, int alpha_q3, int bit_depth, int width, int height) { for (int j = 0; j < height; j++) { @@ -182,18 +173,8 @@ void cfl_predict_hbd_c(const int16_t *ac_buf_q3, uint16_t *dst, int dst_stride, } } -// Null function used for invalid tx_sizes -void cfl_predict_hbd_null(const int16_t *ac_buf_q3, uint16_t *dst, - int dst_stride, int alpha_q3, int bd) { - (void)ac_buf_q3; - (void)dst; - (void)dst_stride; - (void)alpha_q3; - (void)bd; - assert(0); -} - CFL_PREDICT_FN(c, hbd) +#endif static void cfl_compute_parameters(MACROBLOCKD *const xd, TX_SIZE tx_size) { CFL_CTX *const cfl = &xd->cfl; @@ -201,7 +182,7 @@ static void cfl_compute_parameters(MACROBLOCKD *const xd, TX_SIZE tx_size) { assert(cfl->are_parameters_computed == 0); cfl_pad(cfl, tx_size_wide[tx_size], tx_size_high[tx_size]); - get_subtract_average_fn(tx_size)(cfl->recon_buf_q3, cfl->ac_buf_q3); + cfl_get_subtract_average_fn(tx_size)(cfl->recon_buf_q3, cfl->ac_buf_q3); cfl->are_parameters_computed = 1; } @@ -217,31 +198,15 @@ void cfl_predict_block(MACROBLOCKD *const xd, uint8_t *dst, int dst_stride, cfl_idx_to_alpha(mbmi->cfl_alpha_idx, mbmi->cfl_alpha_signs, plane - 1); assert((tx_size_high[tx_size] - 1) * CFL_BUF_LINE + tx_size_wide[tx_size] <= CFL_BUF_SQUARE); - if (get_bitdepth_data_path_index(xd)) { +#if CONFIG_AV1_HIGHBITDEPTH + if (is_cur_buf_hbd(xd)) { uint16_t *dst_16 = CONVERT_TO_SHORTPTR(dst); - get_predict_hbd_fn(tx_size)(cfl->ac_buf_q3, dst_16, dst_stride, alpha_q3, - xd->bd); + 
cfl_get_predict_hbd_fn(tx_size)(cfl->ac_buf_q3, dst_16, dst_stride, + alpha_q3, xd->bd); return; } - get_predict_lbd_fn(tx_size)(cfl->ac_buf_q3, dst, dst_stride, alpha_q3); -} - -// Null function used for invalid tx_sizes -void cfl_subsample_lbd_null(const uint8_t *input, int input_stride, - uint16_t *output_q3) { - (void)input; - (void)input_stride; - (void)output_q3; - assert(0); -} - -// Null function used for invalid tx_sizes -void cfl_subsample_hbd_null(const uint16_t *input, int input_stride, - uint16_t *output_q3) { - (void)input; - (void)input_stride; - (void)output_q3; - assert(0); +#endif + cfl_get_predict_lbd_fn(tx_size)(cfl->ac_buf_q3, dst, dst_stride, alpha_q3); } static void cfl_luma_subsampling_420_lbd_c(const uint8_t *input, @@ -287,6 +252,7 @@ static void cfl_luma_subsampling_444_lbd_c(const uint8_t *input, } } +#if CONFIG_AV1_HIGHBITDEPTH static void cfl_luma_subsampling_420_hbd_c(const uint16_t *input, int input_stride, uint16_t *output_q3, int width, @@ -329,9 +295,11 @@ static void cfl_luma_subsampling_444_hbd_c(const uint16_t *input, output_q3 += CFL_BUF_LINE; } } +#endif CFL_GET_SUBSAMPLE_FUNCTION(c) +#if CONFIG_AV1_HIGHBITDEPTH static INLINE cfl_subsample_hbd_fn cfl_subsampling_hbd(TX_SIZE tx_size, int sub_x, int sub_y) { if (sub_x == 1) { @@ -342,6 +310,7 @@ static INLINE cfl_subsample_hbd_fn cfl_subsampling_hbd(TX_SIZE tx_size, } return cfl_get_luma_subsampling_444_hbd(tx_size); } +#endif static INLINE cfl_subsample_lbd_fn cfl_subsampling_lbd(TX_SIZE tx_size, int sub_x, int sub_y) { @@ -358,7 +327,7 @@ static void cfl_store(CFL_CTX *cfl, const uint8_t *input, int input_stride, int row, int col, TX_SIZE tx_size, int use_hbd) { const int width = tx_size_wide[tx_size]; const int height = tx_size_high[tx_size]; - const int tx_off_log2 = tx_size_wide_log2[0]; + const int tx_off_log2 = MI_SIZE_LOG2; const int sub_x = cfl->subsampling_x; const int sub_y = cfl->subsampling_y; const int store_row = row << (tx_off_log2 - sub_y); @@ -387,7 +356,7 @@ 
static void cfl_store(CFL_CTX *cfl, const uint8_t *input, int input_stride, // Store the input into the CfL pixel buffer uint16_t *recon_buf_q3 = cfl->recon_buf_q3 + (store_row * CFL_BUF_LINE + store_col); - +#if CONFIG_AV1_HIGHBITDEPTH if (use_hbd) { cfl_subsampling_hbd(tx_size, sub_x, sub_y)(CONVERT_TO_SHORTPTR(input), input_stride, recon_buf_q3); @@ -395,20 +364,25 @@ static void cfl_store(CFL_CTX *cfl, const uint8_t *input, int input_stride, cfl_subsampling_lbd(tx_size, sub_x, sub_y)(input, input_stride, recon_buf_q3); } +#else + (void)use_hbd; + cfl_subsampling_lbd(tx_size, sub_x, sub_y)(input, input_stride, recon_buf_q3); +#endif } // Adjust the row and column of blocks smaller than 8X8, as chroma-referenced // and non-chroma-referenced blocks are stored together in the CfL buffer. -static INLINE void sub8x8_adjust_offset(const CFL_CTX *cfl, int *row_out, +static INLINE void sub8x8_adjust_offset(const CFL_CTX *cfl, int mi_row, + int mi_col, int *row_out, int *col_out) { // Increment row index for bottom: 8x4, 16x4 or both bottom 4x4s. - if ((cfl->mi_row & 0x01) && cfl->subsampling_y) { + if ((mi_row & 0x01) && cfl->subsampling_y) { assert(*row_out == 0); (*row_out)++; } // Increment col index for right: 4x8, 4x16 or both right 4x4s. - if ((cfl->mi_col & 0x01) && cfl->subsampling_x) { + if ((mi_col & 0x01) && cfl->subsampling_x) { assert(*col_out == 0); (*col_out)++; } @@ -418,17 +392,31 @@ void cfl_store_tx(MACROBLOCKD *const xd, int row, int col, TX_SIZE tx_size, BLOCK_SIZE bsize) { CFL_CTX *const cfl = &xd->cfl; struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_Y]; - uint8_t *dst = - &pd->dst.buf[(row * pd->dst.stride + col) << tx_size_wide_log2[0]]; + uint8_t *dst = &pd->dst.buf[(row * pd->dst.stride + col) << MI_SIZE_LOG2]; if (block_size_high[bsize] == 4 || block_size_wide[bsize] == 4) { // Only dimensions of size 4 can have an odd offset. 
assert(!((col & 1) && tx_size_wide[tx_size] != 4)); assert(!((row & 1) && tx_size_high[tx_size] != 4)); - sub8x8_adjust_offset(cfl, &row, &col); + sub8x8_adjust_offset(cfl, xd->mi_row, xd->mi_col, &row, &col); } - cfl_store(cfl, dst, pd->dst.stride, row, col, tx_size, - get_bitdepth_data_path_index(xd)); + cfl_store(cfl, dst, pd->dst.stride, row, col, tx_size, is_cur_buf_hbd(xd)); +} + +static INLINE int max_intra_block_width(const MACROBLOCKD *xd, + BLOCK_SIZE plane_bsize, int plane, + TX_SIZE tx_size) { + const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane) + << MI_SIZE_LOG2; + return ALIGN_POWER_OF_TWO(max_blocks_wide, tx_size_wide_log2[tx_size]); +} + +static INLINE int max_intra_block_height(const MACROBLOCKD *xd, + BLOCK_SIZE plane_bsize, int plane, + TX_SIZE tx_size) { + const int max_blocks_high = max_block_high(xd, plane_bsize, plane) + << MI_SIZE_LOG2; + return ALIGN_POWER_OF_TWO(max_blocks_high, tx_size_high_log2[tx_size]); } void cfl_store_block(MACROBLOCKD *const xd, BLOCK_SIZE bsize, TX_SIZE tx_size) { @@ -438,11 +426,11 @@ void cfl_store_block(MACROBLOCKD *const xd, BLOCK_SIZE bsize, TX_SIZE tx_size) { int col = 0; if (block_size_high[bsize] == 4 || block_size_wide[bsize] == 4) { - sub8x8_adjust_offset(cfl, &row, &col); + sub8x8_adjust_offset(cfl, xd->mi_row, xd->mi_col, &row, &col); } const int width = max_intra_block_width(xd, bsize, AOM_PLANE_Y, tx_size); const int height = max_intra_block_height(xd, bsize, AOM_PLANE_Y, tx_size); tx_size = get_tx_size(width, height); cfl_store(cfl, pd->dst.buf, pd->dst.stride, row, col, tx_size, - get_bitdepth_data_path_index(xd)); + is_cur_buf_hbd(xd)); } diff --git a/media/libaom/src/av1/common/cfl.h b/media/libaom/src/av1/common/cfl.h index d627891bf..a1d6dc2ea 100644 --- a/media/libaom/src/av1/common/cfl.h +++ b/media/libaom/src/av1/common/cfl.h @@ -12,8 +12,8 @@ #ifndef AOM_AV1_COMMON_CFL_H_ #define AOM_AV1_COMMON_CFL_H_ +#include "av1/common/av1_common_int.h" #include "av1/common/blockd.h" 
-#include "av1/common/onyxc_int.h" // Can we use CfL for the current block? static INLINE CFL_ALLOWED_TYPE is_cfl_allowed(const MACROBLOCKD *xd) { @@ -41,7 +41,7 @@ static INLINE CFL_ALLOWED_TYPE store_cfl_required(const AV1_COMMON *cm, if (cm->seq_params.monochrome) return CFL_DISALLOWED; - if (!xd->cfl.is_chroma_reference) { + if (!xd->is_chroma_ref) { // For non-chroma-reference blocks, we should always store the luma pixels, // in case the corresponding chroma-reference block uses CfL. // Note that this can only happen for block sizes which are <8 on @@ -80,14 +80,6 @@ void cfl_store_dc_pred(MACROBLOCKD *const xd, const uint8_t *input, void cfl_load_dc_pred(MACROBLOCKD *const xd, uint8_t *dst, int dst_stride, TX_SIZE tx_size, CFL_PRED_TYPE pred_plane); -// Null function used for invalid tx_sizes -void cfl_subsample_lbd_null(const uint8_t *input, int input_stride, - uint16_t *output_q3); - -// Null function used for invalid tx_sizes -void cfl_subsample_hbd_null(const uint16_t *input, int input_stride, - uint16_t *output_q3); - // Allows the CFL_SUBSAMPLE function to switch types depending on the bitdepth. #define CFL_lbd_TYPE uint8_t *cfl_type #define CFL_hbd_TYPE uint16_t *cfl_type @@ -97,7 +89,7 @@ void cfl_subsample_hbd_null(const uint16_t *input, int input_stride, // will be constant allowing for loop unrolling and other constant propagated // goodness. #define CFL_SUBSAMPLE(arch, sub, bd, width, height) \ - void subsample_##bd##_##sub##_##width##x##height##_##arch( \ + void cfl_subsample_##bd##_##sub##_##width##x##height##_##arch( \ const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) { \ cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride, \ output_q3, width, height); \ @@ -127,31 +119,32 @@ void cfl_subsample_hbd_null(const uint16_t *input, int input_stride, // Declare an architecture-specific array of function pointers for size-specific // wrappers. 
-#define CFL_SUBSAMPLE_FUNCTION_ARRAY(arch, sub, bd) \ - static const cfl_subsample_##bd##_fn subfn_##sub[TX_SIZES_ALL] = { \ - subsample_##bd##_##sub##_4x4_##arch, /* 4x4 */ \ - subsample_##bd##_##sub##_8x8_##arch, /* 8x8 */ \ - subsample_##bd##_##sub##_16x16_##arch, /* 16x16 */ \ - subsample_##bd##_##sub##_32x32_##arch, /* 32x32 */ \ - cfl_subsample_##bd##_null, /* 64x64 (invalid CFL size) */ \ - subsample_##bd##_##sub##_4x8_##arch, /* 4x8 */ \ - subsample_##bd##_##sub##_8x4_##arch, /* 8x4 */ \ - subsample_##bd##_##sub##_8x16_##arch, /* 8x16 */ \ - subsample_##bd##_##sub##_16x8_##arch, /* 16x8 */ \ - subsample_##bd##_##sub##_16x32_##arch, /* 16x32 */ \ - subsample_##bd##_##sub##_32x16_##arch, /* 32x16 */ \ - cfl_subsample_##bd##_null, /* 32x64 (invalid CFL size) */ \ - cfl_subsample_##bd##_null, /* 64x32 (invalid CFL size) */ \ - subsample_##bd##_##sub##_4x16_##arch, /* 4x16 */ \ - subsample_##bd##_##sub##_16x4_##arch, /* 16x4 */ \ - subsample_##bd##_##sub##_8x32_##arch, /* 8x32 */ \ - subsample_##bd##_##sub##_32x8_##arch, /* 32x8 */ \ - cfl_subsample_##bd##_null, /* 16x64 (invalid CFL size) */ \ - cfl_subsample_##bd##_null, /* 64x16 (invalid CFL size) */ \ +#define CFL_SUBSAMPLE_FUNCTION_ARRAY(arch, sub, bd) \ + static const cfl_subsample_##bd##_fn subfn_##sub[TX_SIZES_ALL] = { \ + cfl_subsample_##bd##_##sub##_4x4_##arch, /* 4x4 */ \ + cfl_subsample_##bd##_##sub##_8x8_##arch, /* 8x8 */ \ + cfl_subsample_##bd##_##sub##_16x16_##arch, /* 16x16 */ \ + cfl_subsample_##bd##_##sub##_32x32_##arch, /* 32x32 */ \ + NULL, /* 64x64 (invalid CFL size) */ \ + cfl_subsample_##bd##_##sub##_4x8_##arch, /* 4x8 */ \ + cfl_subsample_##bd##_##sub##_8x4_##arch, /* 8x4 */ \ + cfl_subsample_##bd##_##sub##_8x16_##arch, /* 8x16 */ \ + cfl_subsample_##bd##_##sub##_16x8_##arch, /* 16x8 */ \ + cfl_subsample_##bd##_##sub##_16x32_##arch, /* 16x32 */ \ + cfl_subsample_##bd##_##sub##_32x16_##arch, /* 32x16 */ \ + NULL, /* 32x64 (invalid CFL size) */ \ + NULL, /* 64x32 (invalid CFL size) */ \ + 
cfl_subsample_##bd##_##sub##_4x16_##arch, /* 4x16 */ \ + cfl_subsample_##bd##_##sub##_16x4_##arch, /* 16x4 */ \ + cfl_subsample_##bd##_##sub##_8x32_##arch, /* 8x32 */ \ + cfl_subsample_##bd##_##sub##_32x8_##arch, /* 32x8 */ \ + NULL, /* 16x64 (invalid CFL size) */ \ + NULL, /* 64x16 (invalid CFL size) */ \ }; // The RTCD script does not support passing in an array, so we wrap it in this // function. +#if CONFIG_AV1_HIGHBITDEPTH #define CFL_GET_SUBSAMPLE_FUNCTION(arch) \ CFL_SUBSAMPLE_FUNCTIONS(arch, 420, lbd) \ CFL_SUBSAMPLE_FUNCTIONS(arch, 422, lbd) \ @@ -159,144 +152,137 @@ void cfl_subsample_hbd_null(const uint16_t *input, int input_stride, CFL_SUBSAMPLE_FUNCTIONS(arch, 420, hbd) \ CFL_SUBSAMPLE_FUNCTIONS(arch, 422, hbd) \ CFL_SUBSAMPLE_FUNCTIONS(arch, 444, hbd) - -// Null function used for invalid tx_sizes -static INLINE void cfl_subtract_average_null(const uint16_t *src, - int16_t *dst) { - (void)dst; - (void)src; - assert(0); -} +#else +#define CFL_GET_SUBSAMPLE_FUNCTION(arch) \ + CFL_SUBSAMPLE_FUNCTIONS(arch, 420, lbd) \ + CFL_SUBSAMPLE_FUNCTIONS(arch, 422, lbd) \ + CFL_SUBSAMPLE_FUNCTIONS(arch, 444, lbd) +#endif // Declare a size-specific wrapper for the size-generic function. The compiler // will inline the size generic function in here, the advantage is that the size // will be constant allowing for loop unrolling and other constant propagated // goodness. -#define CFL_SUB_AVG_X(arch, width, height, round_offset, num_pel_log2) \ - void subtract_average_##width##x##height##_##arch(const uint16_t *src, \ - int16_t *dst) { \ - subtract_average_##arch(src, dst, width, height, round_offset, \ - num_pel_log2); \ +#define CFL_SUB_AVG_X(arch, width, height, round_offset, num_pel_log2) \ + void cfl_subtract_average_##width##x##height##_##arch(const uint16_t *src, \ + int16_t *dst) { \ + subtract_average_##arch(src, dst, width, height, round_offset, \ + num_pel_log2); \ } // Declare size-specific wrappers for all valid CfL sizes. 
-#define CFL_SUB_AVG_FN(arch) \ - CFL_SUB_AVG_X(arch, 4, 4, 8, 4) \ - CFL_SUB_AVG_X(arch, 4, 8, 16, 5) \ - CFL_SUB_AVG_X(arch, 4, 16, 32, 6) \ - CFL_SUB_AVG_X(arch, 8, 4, 16, 5) \ - CFL_SUB_AVG_X(arch, 8, 8, 32, 6) \ - CFL_SUB_AVG_X(arch, 8, 16, 64, 7) \ - CFL_SUB_AVG_X(arch, 8, 32, 128, 8) \ - CFL_SUB_AVG_X(arch, 16, 4, 32, 6) \ - CFL_SUB_AVG_X(arch, 16, 8, 64, 7) \ - CFL_SUB_AVG_X(arch, 16, 16, 128, 8) \ - CFL_SUB_AVG_X(arch, 16, 32, 256, 9) \ - CFL_SUB_AVG_X(arch, 32, 8, 128, 8) \ - CFL_SUB_AVG_X(arch, 32, 16, 256, 9) \ - CFL_SUB_AVG_X(arch, 32, 32, 512, 10) \ - cfl_subtract_average_fn get_subtract_average_fn_##arch(TX_SIZE tx_size) { \ - static const cfl_subtract_average_fn sub_avg[TX_SIZES_ALL] = { \ - subtract_average_4x4_##arch, /* 4x4 */ \ - subtract_average_8x8_##arch, /* 8x8 */ \ - subtract_average_16x16_##arch, /* 16x16 */ \ - subtract_average_32x32_##arch, /* 32x32 */ \ - cfl_subtract_average_null, /* 64x64 (invalid CFL size) */ \ - subtract_average_4x8_##arch, /* 4x8 */ \ - subtract_average_8x4_##arch, /* 8x4 */ \ - subtract_average_8x16_##arch, /* 8x16 */ \ - subtract_average_16x8_##arch, /* 16x8 */ \ - subtract_average_16x32_##arch, /* 16x32 */ \ - subtract_average_32x16_##arch, /* 32x16 */ \ - cfl_subtract_average_null, /* 32x64 (invalid CFL size) */ \ - cfl_subtract_average_null, /* 64x32 (invalid CFL size) */ \ - subtract_average_4x16_##arch, /* 4x16 (invalid CFL size) */ \ - subtract_average_16x4_##arch, /* 16x4 (invalid CFL size) */ \ - subtract_average_8x32_##arch, /* 8x32 (invalid CFL size) */ \ - subtract_average_32x8_##arch, /* 32x8 (invalid CFL size) */ \ - cfl_subtract_average_null, /* 16x64 (invalid CFL size) */ \ - cfl_subtract_average_null, /* 64x16 (invalid CFL size) */ \ - }; \ - /* Modulo TX_SIZES_ALL to ensure that an attacker won't be able to */ \ - /* index the function pointer array out of bounds. 
*/ \ - return sub_avg[tx_size % TX_SIZES_ALL]; \ +#define CFL_SUB_AVG_FN(arch) \ + CFL_SUB_AVG_X(arch, 4, 4, 8, 4) \ + CFL_SUB_AVG_X(arch, 4, 8, 16, 5) \ + CFL_SUB_AVG_X(arch, 4, 16, 32, 6) \ + CFL_SUB_AVG_X(arch, 8, 4, 16, 5) \ + CFL_SUB_AVG_X(arch, 8, 8, 32, 6) \ + CFL_SUB_AVG_X(arch, 8, 16, 64, 7) \ + CFL_SUB_AVG_X(arch, 8, 32, 128, 8) \ + CFL_SUB_AVG_X(arch, 16, 4, 32, 6) \ + CFL_SUB_AVG_X(arch, 16, 8, 64, 7) \ + CFL_SUB_AVG_X(arch, 16, 16, 128, 8) \ + CFL_SUB_AVG_X(arch, 16, 32, 256, 9) \ + CFL_SUB_AVG_X(arch, 32, 8, 128, 8) \ + CFL_SUB_AVG_X(arch, 32, 16, 256, 9) \ + CFL_SUB_AVG_X(arch, 32, 32, 512, 10) \ + cfl_subtract_average_fn cfl_get_subtract_average_fn_##arch( \ + TX_SIZE tx_size) { \ + static const cfl_subtract_average_fn sub_avg[TX_SIZES_ALL] = { \ + cfl_subtract_average_4x4_##arch, /* 4x4 */ \ + cfl_subtract_average_8x8_##arch, /* 8x8 */ \ + cfl_subtract_average_16x16_##arch, /* 16x16 */ \ + cfl_subtract_average_32x32_##arch, /* 32x32 */ \ + NULL, /* 64x64 (invalid CFL size) */ \ + cfl_subtract_average_4x8_##arch, /* 4x8 */ \ + cfl_subtract_average_8x4_##arch, /* 8x4 */ \ + cfl_subtract_average_8x16_##arch, /* 8x16 */ \ + cfl_subtract_average_16x8_##arch, /* 16x8 */ \ + cfl_subtract_average_16x32_##arch, /* 16x32 */ \ + cfl_subtract_average_32x16_##arch, /* 32x16 */ \ + NULL, /* 32x64 (invalid CFL size) */ \ + NULL, /* 64x32 (invalid CFL size) */ \ + cfl_subtract_average_4x16_##arch, /* 4x16 (invalid CFL size) */ \ + cfl_subtract_average_16x4_##arch, /* 16x4 (invalid CFL size) */ \ + cfl_subtract_average_8x32_##arch, /* 8x32 (invalid CFL size) */ \ + cfl_subtract_average_32x8_##arch, /* 32x8 (invalid CFL size) */ \ + NULL, /* 16x64 (invalid CFL size) */ \ + NULL, /* 64x16 (invalid CFL size) */ \ + }; \ + /* Modulo TX_SIZES_ALL to ensure that an attacker won't be able to */ \ + /* index the function pointer array out of bounds. 
*/ \ + return sub_avg[tx_size % TX_SIZES_ALL]; \ } // For VSX SIMD optimization, the C versions of width == 4 subtract are // faster than the VSX. As such, the VSX code calls the C versions. -void subtract_average_4x4_c(const uint16_t *src, int16_t *dst); -void subtract_average_4x8_c(const uint16_t *src, int16_t *dst); -void subtract_average_4x16_c(const uint16_t *src, int16_t *dst); - -#define CFL_PREDICT_lbd(arch, width, height) \ - void predict_lbd_##width##x##height##_##arch(const int16_t *pred_buf_q3, \ - uint8_t *dst, int dst_stride, \ - int alpha_q3) { \ - cfl_predict_lbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, width, \ - height); \ +void cfl_subtract_average_4x4_c(const uint16_t *src, int16_t *dst); +void cfl_subtract_average_4x8_c(const uint16_t *src, int16_t *dst); +void cfl_subtract_average_4x16_c(const uint16_t *src, int16_t *dst); + +#define CFL_PREDICT_lbd(arch, width, height) \ + void cfl_predict_lbd_##width##x##height##_##arch( \ + const int16_t *pred_buf_q3, uint8_t *dst, int dst_stride, \ + int alpha_q3) { \ + cfl_predict_lbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, width, \ + height); \ } -#define CFL_PREDICT_hbd(arch, width, height) \ - void predict_hbd_##width##x##height##_##arch(const int16_t *pred_buf_q3, \ - uint16_t *dst, int dst_stride, \ - int alpha_q3, int bd) { \ - cfl_predict_hbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, bd, width, \ - height); \ +#if CONFIG_AV1_HIGHBITDEPTH +#define CFL_PREDICT_hbd(arch, width, height) \ + void cfl_predict_hbd_##width##x##height##_##arch( \ + const int16_t *pred_buf_q3, uint16_t *dst, int dst_stride, int alpha_q3, \ + int bd) { \ + cfl_predict_hbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, bd, width, \ + height); \ } +#endif // This wrapper exists because clang format does not like calling macros with // lowercase letters. 
#define CFL_PREDICT_X(arch, width, height, bd) \ CFL_PREDICT_##bd(arch, width, height) -// Null function used for invalid tx_sizes -void cfl_predict_lbd_null(const int16_t *pred_buf_q3, uint8_t *dst, - int dst_stride, int alpha_q3); - -// Null function used for invalid tx_sizes -void cfl_predict_hbd_null(const int16_t *pred_buf_q3, uint16_t *dst, - int dst_stride, int alpha_q3, int bd); - -#define CFL_PREDICT_FN(arch, bd) \ - CFL_PREDICT_X(arch, 4, 4, bd) \ - CFL_PREDICT_X(arch, 4, 8, bd) \ - CFL_PREDICT_X(arch, 4, 16, bd) \ - CFL_PREDICT_X(arch, 8, 4, bd) \ - CFL_PREDICT_X(arch, 8, 8, bd) \ - CFL_PREDICT_X(arch, 8, 16, bd) \ - CFL_PREDICT_X(arch, 8, 32, bd) \ - CFL_PREDICT_X(arch, 16, 4, bd) \ - CFL_PREDICT_X(arch, 16, 8, bd) \ - CFL_PREDICT_X(arch, 16, 16, bd) \ - CFL_PREDICT_X(arch, 16, 32, bd) \ - CFL_PREDICT_X(arch, 32, 8, bd) \ - CFL_PREDICT_X(arch, 32, 16, bd) \ - CFL_PREDICT_X(arch, 32, 32, bd) \ - cfl_predict_##bd##_fn get_predict_##bd##_fn_##arch(TX_SIZE tx_size) { \ - static const cfl_predict_##bd##_fn pred[TX_SIZES_ALL] = { \ - predict_##bd##_4x4_##arch, /* 4x4 */ \ - predict_##bd##_8x8_##arch, /* 8x8 */ \ - predict_##bd##_16x16_##arch, /* 16x16 */ \ - predict_##bd##_32x32_##arch, /* 32x32 */ \ - cfl_predict_##bd##_null, /* 64x64 (invalid CFL size) */ \ - predict_##bd##_4x8_##arch, /* 4x8 */ \ - predict_##bd##_8x4_##arch, /* 8x4 */ \ - predict_##bd##_8x16_##arch, /* 8x16 */ \ - predict_##bd##_16x8_##arch, /* 16x8 */ \ - predict_##bd##_16x32_##arch, /* 16x32 */ \ - predict_##bd##_32x16_##arch, /* 32x16 */ \ - cfl_predict_##bd##_null, /* 32x64 (invalid CFL size) */ \ - cfl_predict_##bd##_null, /* 64x32 (invalid CFL size) */ \ - predict_##bd##_4x16_##arch, /* 4x16 */ \ - predict_##bd##_16x4_##arch, /* 16x4 */ \ - predict_##bd##_8x32_##arch, /* 8x32 */ \ - predict_##bd##_32x8_##arch, /* 32x8 */ \ - cfl_predict_##bd##_null, /* 16x64 (invalid CFL size) */ \ - cfl_predict_##bd##_null, /* 64x16 (invalid CFL size) */ \ - }; \ - /* Modulo TX_SIZES_ALL to ensure 
that an attacker won't be able to */ \ - /* index the function pointer array out of bounds. */ \ - return pred[tx_size % TX_SIZES_ALL]; \ +#define CFL_PREDICT_FN(arch, bd) \ + CFL_PREDICT_X(arch, 4, 4, bd) \ + CFL_PREDICT_X(arch, 4, 8, bd) \ + CFL_PREDICT_X(arch, 4, 16, bd) \ + CFL_PREDICT_X(arch, 8, 4, bd) \ + CFL_PREDICT_X(arch, 8, 8, bd) \ + CFL_PREDICT_X(arch, 8, 16, bd) \ + CFL_PREDICT_X(arch, 8, 32, bd) \ + CFL_PREDICT_X(arch, 16, 4, bd) \ + CFL_PREDICT_X(arch, 16, 8, bd) \ + CFL_PREDICT_X(arch, 16, 16, bd) \ + CFL_PREDICT_X(arch, 16, 32, bd) \ + CFL_PREDICT_X(arch, 32, 8, bd) \ + CFL_PREDICT_X(arch, 32, 16, bd) \ + CFL_PREDICT_X(arch, 32, 32, bd) \ + cfl_predict_##bd##_fn cfl_get_predict_##bd##_fn_##arch(TX_SIZE tx_size) { \ + static const cfl_predict_##bd##_fn pred[TX_SIZES_ALL] = { \ + cfl_predict_##bd##_4x4_##arch, /* 4x4 */ \ + cfl_predict_##bd##_8x8_##arch, /* 8x8 */ \ + cfl_predict_##bd##_16x16_##arch, /* 16x16 */ \ + cfl_predict_##bd##_32x32_##arch, /* 32x32 */ \ + NULL, /* 64x64 (invalid CFL size) */ \ + cfl_predict_##bd##_4x8_##arch, /* 4x8 */ \ + cfl_predict_##bd##_8x4_##arch, /* 8x4 */ \ + cfl_predict_##bd##_8x16_##arch, /* 8x16 */ \ + cfl_predict_##bd##_16x8_##arch, /* 16x8 */ \ + cfl_predict_##bd##_16x32_##arch, /* 16x32 */ \ + cfl_predict_##bd##_32x16_##arch, /* 32x16 */ \ + NULL, /* 32x64 (invalid CFL size) */ \ + NULL, /* 64x32 (invalid CFL size) */ \ + cfl_predict_##bd##_4x16_##arch, /* 4x16 */ \ + cfl_predict_##bd##_16x4_##arch, /* 16x4 */ \ + cfl_predict_##bd##_8x32_##arch, /* 8x32 */ \ + cfl_predict_##bd##_32x8_##arch, /* 32x8 */ \ + NULL, /* 16x64 (invalid CFL size) */ \ + NULL, /* 64x16 (invalid CFL size) */ \ + }; \ + /* Modulo TX_SIZES_ALL to ensure that an attacker won't be able to */ \ + /* index the function pointer array out of bounds. 
*/ \ + return pred[tx_size % TX_SIZES_ALL]; \ } #endif // AOM_AV1_COMMON_CFL_H_ diff --git a/media/libaom/src/av1/common/common_data.h b/media/libaom/src/av1/common/common_data.h index 46e455fdb..402845caf 100644 --- a/media/libaom/src/av1/common/common_data.h +++ b/media/libaom/src/av1/common/common_data.h @@ -82,16 +82,16 @@ static const BLOCK_SIZE subsize_lookup[EXT_PARTITION_TYPES][SQR_BLOCK_SIZES] = { BLOCK_INVALID, BLOCK_4X4, BLOCK_8X8, BLOCK_16X16, BLOCK_32X32, BLOCK_64X64 }, { // PARTITION_HORZ_A - BLOCK_INVALID, BLOCK_8X4, BLOCK_16X8, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X8, BLOCK_32X16, BLOCK_64X32, BLOCK_128X64 }, { // PARTITION_HORZ_B - BLOCK_INVALID, BLOCK_8X4, BLOCK_16X8, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X8, BLOCK_32X16, BLOCK_64X32, BLOCK_128X64 }, { // PARTITION_VERT_A - BLOCK_INVALID, BLOCK_4X8, BLOCK_8X16, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_8X16, BLOCK_16X32, BLOCK_32X64, BLOCK_64X128 }, { // PARTITION_VERT_B - BLOCK_INVALID, BLOCK_4X8, BLOCK_8X16, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_8X16, BLOCK_16X32, BLOCK_32X64, BLOCK_64X128 }, { // PARTITION_HORZ_4 BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X4, diff --git a/media/libaom/src/av1/common/convolve.c b/media/libaom/src/av1/common/convolve.c index 1f11126fc..e177e3cad 100644 --- a/media/libaom/src/av1/common/convolve.c +++ b/media/libaom/src/av1/common/convolve.c @@ -15,10 +15,10 @@ #include "config/aom_dsp_rtcd.h" #include "config/av1_rtcd.h" +#include "av1/common/av1_common_int.h" #include "av1/common/blockd.h" #include "av1/common/convolve.h" #include "av1/common/filter.h" -#include "av1/common/onyxc_int.h" #include "av1/common/resize.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_ports/mem.h" @@ -73,15 +73,55 @@ void av1_highbd_convolve_horiz_rs_c(const uint16_t *src, int src_stride, } } +void av1_convolve_2d_sobel_y_c(const uint8_t *src, int src_stride, double *dst, + int dst_stride, int w, int h, int dir, + double norm) { + int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * 
MAX_SB_SIZE]; + DECLARE_ALIGNED(256, static const int16_t, sobel_a[3]) = { 1, 0, -1 }; + DECLARE_ALIGNED(256, static const int16_t, sobel_b[3]) = { 1, 2, 1 }; + const int taps = 3; + int im_h = h + taps - 1; + int im_stride = w; + const int fo_vert = 1; + const int fo_horiz = 1; + + // horizontal filter + const uint8_t *src_horiz = src - fo_vert * src_stride; + const int16_t *x_filter = dir ? sobel_a : sobel_b; + for (int y = 0; y < im_h; ++y) { + for (int x = 0; x < w; ++x) { + int16_t sum = 0; + for (int k = 0; k < taps; ++k) { + sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k]; + } + im_block[y * im_stride + x] = sum; + } + } + + // vertical filter + int16_t *src_vert = im_block + fo_vert * im_stride; + const int16_t *y_filter = dir ? sobel_b : sobel_a; + for (int y = 0; y < h; ++y) { + for (int x = 0; x < w; ++x) { + int16_t sum = 0; + for (int k = 0; k < taps; ++k) { + sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x]; + } + dst[y * dst_stride + x] = sum * norm; + } + } +} + void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, + const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params) { int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]; int im_h = h + filter_params_y->taps - 1; int im_stride = w; + assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE); const int fo_vert = filter_params_y->taps / 2 - 1; const int fo_horiz = filter_params_x->taps / 2 - 1; const int bd = 8; @@ -91,7 +131,7 @@ void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, // horizontal filter const uint8_t *src_horiz = src - fo_vert * src_stride; const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( - filter_params_x, subpel_x_q4 & SUBPEL_MASK); + filter_params_x, subpel_x_qn & SUBPEL_MASK); for (int y = 0; y < im_h; 
++y) { for (int x = 0; x < w; ++x) { int32_t sum = (1 << (bd + FILTER_BITS - 1)); @@ -107,7 +147,7 @@ void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, // vertical filter int16_t *src_vert = im_block + fo_vert * im_stride; const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( - filter_params_y, subpel_y_q4 & SUBPEL_MASK); + filter_params_y, subpel_y_qn & SUBPEL_MASK); const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; for (int y = 0; y < h; ++y) { for (int x = 0; x < w; ++x) { @@ -128,11 +168,11 @@ void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, + const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params) { const int fo_vert = filter_params_y->taps / 2 - 1; (void)filter_params_x; - (void)subpel_x_q4; + (void)subpel_x_qn; (void)conv_params; assert(conv_params->round_0 <= FILTER_BITS); @@ -141,7 +181,7 @@ void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, // vertical filter const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( - filter_params_y, subpel_y_q4 & SUBPEL_MASK); + filter_params_y, subpel_y_qn & SUBPEL_MASK); for (int y = 0; y < h; ++y) { for (int x = 0; x < w; ++x) { int32_t res = 0; @@ -158,12 +198,12 @@ void av1_convolve_x_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, + const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params) { const int fo_horiz = filter_params_x->taps / 2 - 1; const int bits = FILTER_BITS - conv_params->round_0; (void)filter_params_y; - (void)subpel_y_q4; + (void)subpel_y_qn; (void)conv_params; assert(bits >= 0); @@ -172,7 +212,7 @@ void 
av1_convolve_x_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, // horizontal filter const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( - filter_params_x, subpel_x_q4 & SUBPEL_MASK); + filter_params_x, subpel_x_qn & SUBPEL_MASK); for (int y = 0; y < h; ++y) { for (int x = 0; x < w; ++x) { @@ -190,27 +230,27 @@ void av1_convolve_2d_copy_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, + const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params) { (void)filter_params_x; (void)filter_params_y; - (void)subpel_x_q4; - (void)subpel_y_q4; + (void)subpel_x_qn; + (void)subpel_y_qn; (void)conv_params; for (int y = 0; y < h; ++y) { - memcpy(dst + y * dst_stride, src + y * src_stride, w * sizeof(src[0])); + memmove(dst + y * dst_stride, src + y * src_stride, w * sizeof(src[0])); } } -void av1_jnt_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst8, - int dst8_stride, int w, int h, - const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, - ConvolveParams *conv_params) { - CONV_BUF_TYPE *dst = conv_params->dst; - int dst_stride = conv_params->dst_stride; +void av1_dist_wtd_convolve_2d_c(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params) { + CONV_BUF_TYPE *dst16 = conv_params->dst; + int dst16_stride = conv_params->dst_stride; int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]; int im_h = h + filter_params_y->taps - 1; int im_stride = w; @@ -223,7 +263,7 @@ void av1_jnt_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst8, // horizontal filter const uint8_t 
*src_horiz = src - fo_vert * src_stride; const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( - filter_params_x, subpel_x_q4 & SUBPEL_MASK); + filter_params_x, subpel_x_qn & SUBPEL_MASK); for (int y = 0; y < im_h; ++y) { for (int x = 0; x < w; ++x) { int32_t sum = (1 << (bd + FILTER_BITS - 1)); @@ -239,7 +279,7 @@ void av1_jnt_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst8, // vertical filter int16_t *src_vert = im_block + fo_vert * im_stride; const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( - filter_params_y, subpel_y_q4 & SUBPEL_MASK); + filter_params_y, subpel_y_qn & SUBPEL_MASK); const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; for (int y = 0; y < h; ++y) { for (int x = 0; x < w; ++x) { @@ -250,8 +290,8 @@ void av1_jnt_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst8, assert(0 <= sum && sum < (1 << (offset_bits + 2))); CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1); if (conv_params->do_average) { - int32_t tmp = dst[y * dst_stride + x]; - if (conv_params->use_jnt_comp_avg) { + int32_t tmp = dst16[y * dst16_stride + x]; + if (conv_params->use_dist_wtd_comp_avg) { tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; tmp = tmp >> DIST_PRECISION_BITS; } else { @@ -260,23 +300,23 @@ void av1_jnt_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst8, } tmp -= (1 << (offset_bits - conv_params->round_1)) + (1 << (offset_bits - conv_params->round_1 - 1)); - dst8[y * dst8_stride + x] = + dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, round_bits)); } else { - dst[y * dst_stride + x] = res; + dst16[y * dst16_stride + x] = res; } } } } -void av1_jnt_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst8, - int dst8_stride, int w, int h, - const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, - ConvolveParams *conv_params) { - CONV_BUF_TYPE *dst = 
conv_params->dst; - int dst_stride = conv_params->dst_stride; +void av1_dist_wtd_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params) { + CONV_BUF_TYPE *dst16 = conv_params->dst; + int dst16_stride = conv_params->dst_stride; const int fo_vert = filter_params_y->taps / 2 - 1; const int bits = FILTER_BITS - conv_params->round_0; const int bd = 8; @@ -286,11 +326,11 @@ void av1_jnt_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst8, const int round_bits = 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; (void)filter_params_x; - (void)subpel_x_q4; + (void)subpel_x_qn; // vertical filter const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( - filter_params_y, subpel_y_q4 & SUBPEL_MASK); + filter_params_y, subpel_y_qn & SUBPEL_MASK); for (int y = 0; y < h; ++y) { for (int x = 0; x < w; ++x) { int32_t res = 0; @@ -301,8 +341,8 @@ void av1_jnt_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst8, res = ROUND_POWER_OF_TWO(res, conv_params->round_1) + round_offset; if (conv_params->do_average) { - int32_t tmp = dst[y * dst_stride + x]; - if (conv_params->use_jnt_comp_avg) { + int32_t tmp = dst16[y * dst16_stride + x]; + if (conv_params->use_dist_wtd_comp_avg) { tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; tmp = tmp >> DIST_PRECISION_BITS; } else { @@ -310,23 +350,23 @@ void av1_jnt_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst8, tmp = tmp >> 1; } tmp -= round_offset; - dst8[y * dst8_stride + x] = + dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, round_bits)); } else { - dst[y * dst_stride + x] = res; + dst16[y * dst16_stride + x] = res; } } } } -void av1_jnt_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst8, - int dst8_stride, int w, int h, - const 
InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, - ConvolveParams *conv_params) { - CONV_BUF_TYPE *dst = conv_params->dst; - int dst_stride = conv_params->dst_stride; +void av1_dist_wtd_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params) { + CONV_BUF_TYPE *dst16 = conv_params->dst; + int dst16_stride = conv_params->dst_stride; const int fo_horiz = filter_params_x->taps / 2 - 1; const int bits = FILTER_BITS - conv_params->round_1; const int bd = 8; @@ -336,11 +376,11 @@ void av1_jnt_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst8, const int round_bits = 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; (void)filter_params_y; - (void)subpel_y_q4; + (void)subpel_y_qn; // horizontal filter const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( - filter_params_x, subpel_x_q4 & SUBPEL_MASK); + filter_params_x, subpel_x_qn & SUBPEL_MASK); for (int y = 0; y < h; ++y) { for (int x = 0; x < w; ++x) { int32_t res = 0; @@ -351,8 +391,8 @@ void av1_jnt_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst8, res += round_offset; if (conv_params->do_average) { - int32_t tmp = dst[y * dst_stride + x]; - if (conv_params->use_jnt_comp_avg) { + int32_t tmp = dst16[y * dst16_stride + x]; + if (conv_params->use_dist_wtd_comp_avg) { tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; tmp = tmp >> DIST_PRECISION_BITS; } else { @@ -360,23 +400,24 @@ void av1_jnt_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst8, tmp = tmp >> 1; } tmp -= round_offset; - dst8[y * dst8_stride + x] = + dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, round_bits)); } else { - dst[y * dst_stride + x] = res; + dst16[y * dst16_stride + x] = 
res; } } } } -void av1_jnt_convolve_2d_copy_c(const uint8_t *src, int src_stride, - uint8_t *dst8, int dst8_stride, int w, int h, - const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, - ConvolveParams *conv_params) { - CONV_BUF_TYPE *dst = conv_params->dst; - int dst_stride = conv_params->dst_stride; +void av1_dist_wtd_convolve_2d_copy_c(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, + const int subpel_y_qn, + ConvolveParams *conv_params) { + CONV_BUF_TYPE *dst16 = conv_params->dst; + int dst16_stride = conv_params->dst_stride; const int bits = FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0; const int bd = 8; @@ -385,8 +426,8 @@ void av1_jnt_convolve_2d_copy_c(const uint8_t *src, int src_stride, (1 << (offset_bits - conv_params->round_1 - 1)); (void)filter_params_x; (void)filter_params_y; - (void)subpel_x_q4; - (void)subpel_y_q4; + (void)subpel_x_qn; + (void)subpel_y_qn; for (int y = 0; y < h; ++y) { for (int x = 0; x < w; ++x) { @@ -394,8 +435,8 @@ void av1_jnt_convolve_2d_copy_c(const uint8_t *src, int src_stride, res += round_offset; if (conv_params->do_average) { - int32_t tmp = dst[y * dst_stride + x]; - if (conv_params->use_jnt_comp_avg) { + int32_t tmp = dst16[y * dst16_stride + x]; + if (conv_params->use_dist_wtd_comp_avg) { tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; tmp = tmp >> DIST_PRECISION_BITS; } else { @@ -403,16 +444,16 @@ void av1_jnt_convolve_2d_copy_c(const uint8_t *src, int src_stride, tmp = tmp >> 1; } tmp -= round_offset; - dst8[y * dst8_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits)); + dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits)); } else { - dst[y * dst_stride + x] = res; + dst16[y * dst16_stride + x] = res; } } } } -void 
av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst8, - int dst8_stride, int w, int h, +void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, @@ -472,7 +513,7 @@ void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst8, if (conv_params->is_compound) { if (conv_params->do_average) { int32_t tmp = dst16[y * dst16_stride + x]; - if (conv_params->use_jnt_comp_avg) { + if (conv_params->use_dist_wtd_comp_avg) { tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; tmp = tmp >> DIST_PRECISION_BITS; } else { @@ -482,7 +523,7 @@ void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst8, /* Subtract round offset and convolve round */ tmp = tmp - ((1 << (offset_bits - conv_params->round_1)) + (1 << (offset_bits - conv_params->round_1 - 1))); - dst8[y * dst8_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits)); + dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits)); } else { dst16[y * dst16_stride + x] = res; } @@ -490,7 +531,7 @@ void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst8, /* Subtract round offset and convolve round */ int32_t tmp = res - ((1 << (offset_bits - conv_params->round_1)) + (1 << (offset_bits - conv_params->round_1 - 1))); - dst8[y * dst8_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits)); + dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits)); } } src_vert++; @@ -511,89 +552,71 @@ static void convolve_2d_scale_wrapper( y_step_qn, conv_params); } -// TODO(huisu@google.com): bilinear filtering only needs 2 taps in general. So -// we may create optimized code to do 2-tap filtering for all bilinear filtering -// usages, not just IntraBC. 
-static void convolve_2d_for_intrabc(const uint8_t *src, int src_stride, - uint8_t *dst, int dst_stride, int w, int h, - int subpel_x_q4, int subpel_y_q4, - ConvolveParams *conv_params) { - const InterpFilterParams *filter_params_x = - subpel_x_q4 ? &av1_intrabc_filter_params : NULL; - const InterpFilterParams *filter_params_y = - subpel_y_q4 ? &av1_intrabc_filter_params : NULL; - if (subpel_x_q4 != 0 && subpel_y_q4 != 0) { - av1_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h, - filter_params_x, filter_params_y, 0, 0, conv_params); - } else if (subpel_x_q4 != 0) { - av1_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_x, - filter_params_y, 0, 0, conv_params); - } else { - av1_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_x, - filter_params_y, 0, 0, conv_params); - } -} - void av1_convolve_2d_facade(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, - InterpFilters interp_filters, const int subpel_x_q4, - int x_step_q4, const int subpel_y_q4, int y_step_q4, - int scaled, ConvolveParams *conv_params, - const struct scale_factors *sf, int is_intrabc) { - assert(IMPLIES(is_intrabc, !scaled)); + const InterpFilterParams *interp_filters[2], + const int subpel_x_qn, int x_step_q4, + const int subpel_y_qn, int y_step_q4, int scaled, + ConvolveParams *conv_params, + const struct scale_factors *sf) { (void)x_step_q4; (void)y_step_q4; (void)dst; (void)dst_stride; - if (is_intrabc && (subpel_x_q4 != 0 || subpel_y_q4 != 0)) { - convolve_2d_for_intrabc(src, src_stride, dst, dst_stride, w, h, subpel_x_q4, - subpel_y_q4, conv_params); - return; + const InterpFilterParams *filter_params_x = interp_filters[0]; + const InterpFilterParams *filter_params_y = interp_filters[1]; + + // TODO(jingning, yunqing): Add SIMD support to 2-tap filter case. + // Do we have SIMD support to 4-tap case? + // 2-tap filter indicates that it is for IntraBC. 
+ if (filter_params_x->taps == 2 || filter_params_y->taps == 2) { + assert(filter_params_x->taps == 2 && filter_params_y->taps == 2); + assert(!scaled); + if (subpel_x_qn && subpel_y_qn) { + av1_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h, + filter_params_x, filter_params_y, subpel_x_qn, + subpel_y_qn, conv_params); + return; + } else if (subpel_x_qn) { + av1_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h, + filter_params_x, filter_params_y, subpel_x_qn, + subpel_y_qn, conv_params); + return; + } else if (subpel_y_qn) { + av1_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h, + filter_params_x, filter_params_y, subpel_x_qn, + subpel_y_qn, conv_params); + return; + } } - InterpFilter filter_x = 0; - InterpFilter filter_y = 0; - const int need_filter_params_x = (subpel_x_q4 != 0) | scaled; - const int need_filter_params_y = (subpel_y_q4 != 0) | scaled; - if (need_filter_params_x) - filter_x = av1_extract_interp_filter(interp_filters, 1); - if (need_filter_params_y) - filter_y = av1_extract_interp_filter(interp_filters, 0); - const InterpFilterParams *filter_params_x = - need_filter_params_x - ? av1_get_interp_filter_params_with_block_size(filter_x, w) - : NULL; - const InterpFilterParams *filter_params_y = - need_filter_params_y - ? 
av1_get_interp_filter_params_with_block_size(filter_y, h) - : NULL; - if (scaled) { convolve_2d_scale_wrapper(src, src_stride, dst, dst_stride, w, h, - filter_params_x, filter_params_y, subpel_x_q4, - x_step_q4, subpel_y_q4, y_step_q4, conv_params); + filter_params_x, filter_params_y, subpel_x_qn, + x_step_q4, subpel_y_qn, y_step_q4, conv_params); } else { - sf->convolve[subpel_x_q4 != 0][subpel_y_q4 != 0][conv_params->is_compound]( + sf->convolve[subpel_x_qn != 0][subpel_y_qn != 0][conv_params->is_compound]( src, src_stride, dst, dst_stride, w, h, filter_params_x, - filter_params_y, subpel_x_q4, subpel_y_q4, conv_params); + filter_params_y, subpel_x_qn, subpel_y_qn, conv_params); } } +#if CONFIG_AV1_HIGHBITDEPTH void av1_highbd_convolve_2d_copy_sr_c( const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, const int subpel_x_q4, - const int subpel_y_q4, ConvolveParams *conv_params, int bd) { + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params, int bd) { (void)filter_params_x; (void)filter_params_y; - (void)subpel_x_q4; - (void)subpel_y_q4; + (void)subpel_x_qn; + (void)subpel_y_qn; (void)conv_params; (void)bd; for (int y = 0; y < h; ++y) { - memcpy(dst + y * dst_stride, src + y * src_stride, w * sizeof(src[0])); + memmove(dst + y * dst_stride, src + y * src_stride, w * sizeof(src[0])); } } @@ -601,12 +624,12 @@ void av1_highbd_convolve_x_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, + const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd) { const int fo_horiz = filter_params_x->taps / 2 - 1; const int bits = FILTER_BITS - conv_params->round_0; (void)filter_params_y; - (void)subpel_y_q4; 
+ (void)subpel_y_qn; assert(bits >= 0); assert((FILTER_BITS - conv_params->round_1) >= 0 || @@ -614,7 +637,7 @@ void av1_highbd_convolve_x_sr_c(const uint16_t *src, int src_stride, // horizontal filter const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( - filter_params_x, subpel_x_q4 & SUBPEL_MASK); + filter_params_x, subpel_x_qn & SUBPEL_MASK); for (int y = 0; y < h; ++y) { for (int x = 0; x < w; ++x) { int32_t res = 0; @@ -632,11 +655,11 @@ void av1_highbd_convolve_y_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, + const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd) { const int fo_vert = filter_params_y->taps / 2 - 1; (void)filter_params_x; - (void)subpel_x_q4; + (void)subpel_x_qn; (void)conv_params; assert(conv_params->round_0 <= FILTER_BITS); @@ -644,7 +667,7 @@ void av1_highbd_convolve_y_sr_c(const uint16_t *src, int src_stride, ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS))); // vertical filter const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( - filter_params_y, subpel_y_q4 & SUBPEL_MASK); + filter_params_y, subpel_y_qn & SUBPEL_MASK); for (int y = 0; y < h; ++y) { for (int x = 0; x < w; ++x) { int32_t res = 0; @@ -661,11 +684,12 @@ void av1_highbd_convolve_2d_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, + const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd) { int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]; int im_h = h + filter_params_y->taps - 1; int im_stride = w; + assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE); const int fo_vert = filter_params_y->taps / 2 - 1; const int fo_horiz = 
filter_params_x->taps / 2 - 1; const int bits = @@ -675,7 +699,7 @@ void av1_highbd_convolve_2d_sr_c(const uint16_t *src, int src_stride, // horizontal filter const uint16_t *src_horiz = src - fo_vert * src_stride; const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( - filter_params_x, subpel_x_q4 & SUBPEL_MASK); + filter_params_x, subpel_x_qn & SUBPEL_MASK); for (int y = 0; y < im_h; ++y) { for (int x = 0; x < w; ++x) { int32_t sum = (1 << (bd + FILTER_BITS - 1)); @@ -691,7 +715,7 @@ void av1_highbd_convolve_2d_sr_c(const uint16_t *src, int src_stride, // vertical filter int16_t *src_vert = im_block + fo_vert * im_stride; const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( - filter_params_y, subpel_y_q4 & SUBPEL_MASK); + filter_params_y, subpel_y_qn & SUBPEL_MASK); const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; for (int y = 0; y < h; ++y) { for (int x = 0; x < w; ++x) { @@ -709,17 +733,15 @@ void av1_highbd_convolve_2d_sr_c(const uint16_t *src, int src_stride, } } -void av1_highbd_jnt_convolve_2d_c(const uint16_t *src, int src_stride, - uint16_t *dst16, int dst16_stride, int w, - int h, - const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, - ConvolveParams *conv_params, int bd) { +void av1_highbd_dist_wtd_convolve_2d_c( + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params, int bd) { int x, y, k; int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]; - CONV_BUF_TYPE *dst = conv_params->dst; - int dst_stride = conv_params->dst_stride; + CONV_BUF_TYPE *dst16 = conv_params->dst; + int dst16_stride = conv_params->dst_stride; int im_h = h + filter_params_y->taps - 1; int im_stride = w; const int fo_vert = filter_params_y->taps / 2 - 1; @@ 
-731,7 +753,7 @@ void av1_highbd_jnt_convolve_2d_c(const uint16_t *src, int src_stride, // horizontal filter const uint16_t *src_horiz = src - fo_vert * src_stride; const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( - filter_params_x, subpel_x_q4 & SUBPEL_MASK); + filter_params_x, subpel_x_qn & SUBPEL_MASK); for (y = 0; y < im_h; ++y) { for (x = 0; x < w; ++x) { int32_t sum = (1 << (bd + FILTER_BITS - 1)); @@ -749,7 +771,7 @@ void av1_highbd_jnt_convolve_2d_c(const uint16_t *src, int src_stride, int16_t *src_vert = im_block + fo_vert * im_stride; const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( - filter_params_y, subpel_y_q4 & SUBPEL_MASK); + filter_params_y, subpel_y_qn & SUBPEL_MASK); for (y = 0; y < h; ++y) { for (x = 0; x < w; ++x) { int32_t sum = 1 << offset_bits; @@ -759,8 +781,8 @@ void av1_highbd_jnt_convolve_2d_c(const uint16_t *src, int src_stride, assert(0 <= sum && sum < (1 << (offset_bits + 2))); CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1); if (conv_params->do_average) { - int32_t tmp = dst[y * dst_stride + x]; - if (conv_params->use_jnt_comp_avg) { + int32_t tmp = dst16[y * dst16_stride + x]; + if (conv_params->use_dist_wtd_comp_avg) { tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; tmp = tmp >> DIST_PRECISION_BITS; } else { @@ -769,24 +791,22 @@ void av1_highbd_jnt_convolve_2d_c(const uint16_t *src, int src_stride, } tmp -= (1 << (offset_bits - conv_params->round_1)) + (1 << (offset_bits - conv_params->round_1 - 1)); - dst16[y * dst16_stride + x] = + dst[y * dst_stride + x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, round_bits), bd); } else { - dst[y * dst_stride + x] = res; + dst16[y * dst16_stride + x] = res; } } } } -void av1_highbd_jnt_convolve_x_c(const uint16_t *src, int src_stride, - uint16_t *dst16, int dst16_stride, int w, - int h, - const InterpFilterParams *filter_params_x, - const InterpFilterParams 
*filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, - ConvolveParams *conv_params, int bd) { - CONV_BUF_TYPE *dst = conv_params->dst; - int dst_stride = conv_params->dst_stride; +void av1_highbd_dist_wtd_convolve_x_c( + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params, int bd) { + CONV_BUF_TYPE *dst16 = conv_params->dst; + int dst16_stride = conv_params->dst_stride; const int fo_horiz = filter_params_x->taps / 2 - 1; const int bits = FILTER_BITS - conv_params->round_1; const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; @@ -796,11 +816,11 @@ void av1_highbd_jnt_convolve_x_c(const uint16_t *src, int src_stride, 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; assert(round_bits >= 0); (void)filter_params_y; - (void)subpel_y_q4; + (void)subpel_y_qn; assert(bits >= 0); // horizontal filter const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( - filter_params_x, subpel_x_q4 & SUBPEL_MASK); + filter_params_x, subpel_x_qn & SUBPEL_MASK); for (int y = 0; y < h; ++y) { for (int x = 0; x < w; ++x) { int32_t res = 0; @@ -811,8 +831,8 @@ void av1_highbd_jnt_convolve_x_c(const uint16_t *src, int src_stride, res += round_offset; if (conv_params->do_average) { - int32_t tmp = dst[y * dst_stride + x]; - if (conv_params->use_jnt_comp_avg) { + int32_t tmp = dst16[y * dst16_stride + x]; + if (conv_params->use_dist_wtd_comp_avg) { tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; tmp = tmp >> DIST_PRECISION_BITS; } else { @@ -820,24 +840,22 @@ void av1_highbd_jnt_convolve_x_c(const uint16_t *src, int src_stride, tmp = tmp >> 1; } tmp -= round_offset; - dst16[y * dst16_stride + x] = + dst[y * dst_stride + x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, round_bits), bd); } else { - dst[y * dst_stride + x] = res; + 
dst16[y * dst16_stride + x] = res; } } } } -void av1_highbd_jnt_convolve_y_c(const uint16_t *src, int src_stride, - uint16_t *dst16, int dst16_stride, int w, - int h, - const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, - ConvolveParams *conv_params, int bd) { - CONV_BUF_TYPE *dst = conv_params->dst; - int dst_stride = conv_params->dst_stride; +void av1_highbd_dist_wtd_convolve_y_c( + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params, int bd) { + CONV_BUF_TYPE *dst16 = conv_params->dst; + int dst16_stride = conv_params->dst_stride; const int fo_vert = filter_params_y->taps / 2 - 1; const int bits = FILTER_BITS - conv_params->round_0; const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; @@ -847,11 +865,11 @@ void av1_highbd_jnt_convolve_y_c(const uint16_t *src, int src_stride, 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; assert(round_bits >= 0); (void)filter_params_x; - (void)subpel_x_q4; + (void)subpel_x_qn; assert(bits >= 0); // vertical filter const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( - filter_params_y, subpel_y_q4 & SUBPEL_MASK); + filter_params_y, subpel_y_qn & SUBPEL_MASK); for (int y = 0; y < h; ++y) { for (int x = 0; x < w; ++x) { int32_t res = 0; @@ -862,8 +880,8 @@ void av1_highbd_jnt_convolve_y_c(const uint16_t *src, int src_stride, res = ROUND_POWER_OF_TWO(res, conv_params->round_1) + round_offset; if (conv_params->do_average) { - int32_t tmp = dst[y * dst_stride + x]; - if (conv_params->use_jnt_comp_avg) { + int32_t tmp = dst16[y * dst16_stride + x]; + if (conv_params->use_dist_wtd_comp_avg) { tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; tmp = tmp >> DIST_PRECISION_BITS; } else { @@ -871,22 +889,22 
@@ void av1_highbd_jnt_convolve_y_c(const uint16_t *src, int src_stride, tmp = tmp >> 1; } tmp -= round_offset; - dst16[y * dst16_stride + x] = + dst[y * dst_stride + x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, round_bits), bd); } else { - dst[y * dst_stride + x] = res; + dst16[y * dst16_stride + x] = res; } } } } -void av1_highbd_jnt_convolve_2d_copy_c( - const uint16_t *src, int src_stride, uint16_t *dst16, int dst16_stride, - int w, int h, const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, const int subpel_x_q4, - const int subpel_y_q4, ConvolveParams *conv_params, int bd) { - CONV_BUF_TYPE *dst = conv_params->dst; - int dst_stride = conv_params->dst_stride; +void av1_highbd_dist_wtd_convolve_2d_copy_c( + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params, int bd) { + CONV_BUF_TYPE *dst16 = conv_params->dst; + int dst16_stride = conv_params->dst_stride; const int bits = FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0; const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; @@ -895,16 +913,16 @@ void av1_highbd_jnt_convolve_2d_copy_c( assert(bits >= 0); (void)filter_params_x; (void)filter_params_y; - (void)subpel_x_q4; - (void)subpel_y_q4; + (void)subpel_x_qn; + (void)subpel_y_qn; for (int y = 0; y < h; ++y) { for (int x = 0; x < w; ++x) { CONV_BUF_TYPE res = src[y * src_stride + x] << bits; res += round_offset; if (conv_params->do_average) { - int32_t tmp = dst[y * dst_stride + x]; - if (conv_params->use_jnt_comp_avg) { + int32_t tmp = dst16[y * dst16_stride + x]; + if (conv_params->use_dist_wtd_comp_avg) { tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; tmp = tmp >> DIST_PRECISION_BITS; } else { @@ -912,10 +930,10 @@ void av1_highbd_jnt_convolve_2d_copy_c( tmp = tmp >> 1; } tmp -= 
round_offset; - dst16[y * dst16_stride + x] = + dst[y * dst_stride + x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, bits), bd); } else { - dst[y * dst_stride + x] = res; + dst16[y * dst16_stride + x] = res; } } } @@ -980,7 +998,7 @@ void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride, if (conv_params->is_compound) { if (conv_params->do_average) { int32_t tmp = dst16[y * dst16_stride + x]; - if (conv_params->use_jnt_comp_avg) { + if (conv_params->use_dist_wtd_comp_avg) { tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; tmp = tmp >> DIST_PRECISION_BITS; } else { @@ -1007,68 +1025,24 @@ void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride, } } -static void highbd_convolve_2d_for_intrabc(const uint16_t *src, int src_stride, - uint16_t *dst, int dst_stride, int w, - int h, int subpel_x_q4, - int subpel_y_q4, - ConvolveParams *conv_params, - int bd) { - const InterpFilterParams *filter_params_x = - subpel_x_q4 ? &av1_intrabc_filter_params : NULL; - const InterpFilterParams *filter_params_y = - subpel_y_q4 ? 
&av1_intrabc_filter_params : NULL; - if (subpel_x_q4 != 0 && subpel_y_q4 != 0) { - av1_highbd_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h, - filter_params_x, filter_params_y, 0, 0, - conv_params, bd); - } else if (subpel_x_q4 != 0) { - av1_highbd_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h, - filter_params_x, filter_params_y, 0, 0, - conv_params, bd); - } else { - av1_highbd_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h, - filter_params_x, filter_params_y, 0, 0, - conv_params, bd); - } -} - void av1_highbd_convolve_2d_facade(const uint8_t *src8, int src_stride, uint8_t *dst8, int dst_stride, int w, int h, - InterpFilters interp_filters, - const int subpel_x_q4, int x_step_q4, - const int subpel_y_q4, int y_step_q4, + const InterpFilterParams *interp_filters[2], + const int subpel_x_qn, int x_step_q4, + const int subpel_y_qn, int y_step_q4, int scaled, ConvolveParams *conv_params, - const struct scale_factors *sf, - int is_intrabc, int bd) { - assert(IMPLIES(is_intrabc, !scaled)); + const struct scale_factors *sf, int bd) { (void)x_step_q4; (void)y_step_q4; (void)dst_stride; const uint16_t *src = CONVERT_TO_SHORTPTR(src8); - if (is_intrabc && (subpel_x_q4 != 0 || subpel_y_q4 != 0)) { - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); - highbd_convolve_2d_for_intrabc(src, src_stride, dst, dst_stride, w, h, - subpel_x_q4, subpel_y_q4, conv_params, bd); - return; - } - - InterpFilter filter_x = 0; - InterpFilter filter_y = 0; - const int need_filter_params_x = (subpel_x_q4 != 0) | scaled; - const int need_filter_params_y = (subpel_y_q4 != 0) | scaled; - if (need_filter_params_x) - filter_x = av1_extract_interp_filter(interp_filters, 1); - if (need_filter_params_y) - filter_y = av1_extract_interp_filter(interp_filters, 0); + const int need_filter_params_x = (subpel_x_qn != 0) | scaled; + const int need_filter_params_y = (subpel_y_qn != 0) | scaled; const InterpFilterParams *filter_params_x = - need_filter_params_x - ? 
av1_get_interp_filter_params_with_block_size(filter_x, w) - : NULL; + need_filter_params_x ? interp_filters[0] : NULL; const InterpFilterParams *filter_params_y = - need_filter_params_y - ? av1_get_interp_filter_params_with_block_size(filter_y, h) - : NULL; + need_filter_params_y ? interp_filters[1] : NULL; if (scaled) { uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); @@ -1076,18 +1050,19 @@ void av1_highbd_convolve_2d_facade(const uint8_t *src8, int src_stride, assert(conv_params->dst != NULL); } av1_highbd_convolve_2d_scale(src, src_stride, dst, dst_stride, w, h, - filter_params_x, filter_params_y, subpel_x_q4, - x_step_q4, subpel_y_q4, y_step_q4, conv_params, + filter_params_x, filter_params_y, subpel_x_qn, + x_step_q4, subpel_y_qn, y_step_q4, conv_params, bd); } else { uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); - sf->highbd_convolve[subpel_x_q4 != 0][subpel_y_q4 != + sf->highbd_convolve[subpel_x_qn != 0][subpel_y_qn != 0][conv_params->is_compound]( src, src_stride, dst, dst_stride, w, h, filter_params_x, - filter_params_y, subpel_x_q4, subpel_y_q4, conv_params, bd); + filter_params_y, subpel_x_qn, subpel_y_qn, conv_params, bd); } } +#endif // CONFIG_AV1_HIGHBITDEPTH // Note: Fixed size intermediate buffers, place limits on parameters // of some functions. 
2d filtering proceeds in 2 steps: @@ -1109,12 +1084,14 @@ static INLINE int horz_scalar_product(const uint8_t *a, const int16_t *b) { return sum; } +#if CONFIG_AV1_HIGHBITDEPTH static INLINE int highbd_horz_scalar_product(const uint16_t *a, const int16_t *b) { int sum = 0; for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k] * b[k]; return sum; } +#endif static INLINE int highbd_vert_scalar_product(const uint16_t *a, ptrdiff_t a_stride, @@ -1215,6 +1192,7 @@ void av1_wiener_convolve_add_src_c(const uint8_t *src, ptrdiff_t src_stride, y_step_q4, w, h, conv_params->round_1); } +#if CONFIG_AV1_HIGHBITDEPTH static void highbd_convolve_add_src_horiz_hip( const uint8_t *src8, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *x_filters, int x0_q4, @@ -1293,3 +1271,4 @@ void av1_highbd_wiener_convolve_add_src_c( temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_SB_SIZE, dst, dst_stride, filters_y, y0_q4, y_step_q4, w, h, conv_params->round_1, bd); } +#endif // CONFIG_AV1_HIGHBITDEPTH diff --git a/media/libaom/src/av1/common/convolve.h b/media/libaom/src/av1/common/convolve.h index 4109dd843..04df86c42 100644 --- a/media/libaom/src/av1/common/convolve.h +++ b/media/libaom/src/av1/common/convolve.h @@ -26,7 +26,8 @@ typedef struct ConvolveParams { int round_1; int plane; int is_compound; - int use_jnt_comp_avg; + int compound_index; // 0: the first single in compound mode, 1: the second. 
+ int use_dist_wtd_comp_avg; int fwd_offset; int bck_offset; } ConvolveParams; @@ -41,32 +42,34 @@ typedef void (*aom_convolve_fn_t)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, + const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params); typedef void (*aom_highbd_convolve_fn_t)( const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, const int subpel_x_q4, - const int subpel_y_q4, ConvolveParams *conv_params, int bd); + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params, int bd); struct AV1Common; struct scale_factors; void av1_convolve_2d_facade(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, - InterpFilters interp_filters, const int subpel_x_q4, - int x_step_q4, const int subpel_y_q4, int y_step_q4, - int scaled, ConvolveParams *conv_params, - const struct scale_factors *sf, int is_intrabc); + const InterpFilterParams *interp_filters[2], + const int subpel_x_qn, int x_step_q4, + const int subpel_y_qn, int y_step_q4, int scaled, + ConvolveParams *conv_params, + const struct scale_factors *sf); -static INLINE ConvolveParams get_conv_params_no_round(int do_average, int plane, +static INLINE ConvolveParams get_conv_params_no_round(int cmp_index, int plane, CONV_BUF_TYPE *dst, int dst_stride, int is_compound, int bd) { ConvolveParams conv_params; - conv_params.do_average = do_average; - assert(IMPLIES(do_average, is_compound)); + conv_params.compound_index = cmp_index; + assert(IMPLIES(cmp_index, is_compound)); + conv_params.is_compound = is_compound; conv_params.round_0 = ROUND0_BITS; conv_params.round_1 = is_compound ? 
COMPOUND_ROUND1_BITS @@ -82,6 +85,10 @@ static INLINE ConvolveParams get_conv_params_no_round(int do_average, int plane, conv_params.dst = dst; conv_params.dst_stride = dst_stride; conv_params.plane = plane; + + // By default, set do average to 1 if this is the second single prediction + // in a compound mode. + conv_params.do_average = cmp_index; return conv_params; } @@ -111,12 +118,16 @@ static INLINE ConvolveParams get_conv_params_wiener(int bd) { void av1_highbd_convolve_2d_facade(const uint8_t *src8, int src_stride, uint8_t *dst, int dst_stride, int w, int h, - InterpFilters interp_filters, - const int subpel_x_q4, int x_step_q4, - const int subpel_y_q4, int y_step_q4, + const InterpFilterParams *interp_filters[2], + const int subpel_x_qn, int x_step_q4, + const int subpel_y_qn, int y_step_q4, int scaled, ConvolveParams *conv_params, - const struct scale_factors *sf, - int is_intrabc, int bd); + const struct scale_factors *sf, int bd); + +// TODO(sarahparker) This will need to be integerized and optimized +void av1_convolve_2d_sobel_y_c(const uint8_t *src, int src_stride, double *dst, + int dst_stride, int w, int h, int dir, + double norm); #ifdef __cplusplus } // extern "C" diff --git a/media/libaom/src/av1/common/debugmodes.c b/media/libaom/src/av1/common/debugmodes.c index 868f341b5..ff02ddde0 100644 --- a/media/libaom/src/av1/common/debugmodes.c +++ b/media/libaom/src/av1/common/debugmodes.c @@ -11,14 +11,14 @@ #include <stdio.h> +#include "av1/common/av1_common_int.h" #include "av1/common/blockd.h" #include "av1/common/enums.h" -#include "av1/common/onyxc_int.h" static void log_frame_info(AV1_COMMON *cm, const char *str, FILE *f) { fprintf(f, "%s", str); - fprintf(f, "(Frame %d, Show:%d, Q:%d): \n", cm->current_video_frame, - cm->show_frame, cm->base_qindex); + fprintf(f, "(Frame %d, Show:%d, Q:%d): \n", cm->current_frame.frame_number, + cm->show_frame, cm->quant_params.base_qindex); } /* This function dereferences a pointer to the mbmi structure * and 
uses the passed in member offset to print out the value of an integer @@ -26,32 +26,31 @@ static void log_frame_info(AV1_COMMON *cm, const char *str, FILE *f) { */ static void print_mi_data(AV1_COMMON *cm, FILE *file, const char *descriptor, size_t member_offset) { - int mi_row, mi_col; - MB_MODE_INFO **mi = cm->mi_grid_visible; - int rows = cm->mi_rows; - int cols = cm->mi_cols; + const CommonModeInfoParams *const mi_params = &cm->mi_params; + MB_MODE_INFO **mi = mi_params->mi_grid_base; + int rows = mi_params->mi_rows; + int cols = mi_params->mi_cols; char prefix = descriptor[0]; log_frame_info(cm, descriptor, file); - for (mi_row = 0; mi_row < rows; mi_row++) { + for (int mi_row = 0; mi_row < rows; mi_row++) { fprintf(file, "%c ", prefix); - for (mi_col = 0; mi_col < cols; mi_col++) { + for (int mi_col = 0; mi_col < cols; mi_col++) { fprintf(file, "%2d ", *((char *)((char *)(mi[0]) + member_offset))); mi++; } fprintf(file, "\n"); - mi += MAX_MIB_SIZE; + mi += mi_params->mi_stride - cols; } fprintf(file, "\n"); } void av1_print_modes_and_motion_vectors(AV1_COMMON *cm, const char *file) { - int mi_row; - int mi_col; + CommonModeInfoParams *mi_params = &cm->mi_params; FILE *mvs = fopen(file, "a"); - MB_MODE_INFO **mi = cm->mi_grid_visible; - int rows = cm->mi_rows; - int cols = cm->mi_cols; + MB_MODE_INFO **mi = mi_params->mi_grid_base; + const int rows = mi_params->mi_rows; + const int cols = mi_params->mi_cols; print_mi_data(cm, mvs, "Partitions:", offsetof(MB_MODE_INFO, sb_type)); print_mi_data(cm, mvs, "Modes:", offsetof(MB_MODE_INFO, mode)); @@ -61,28 +60,28 @@ void av1_print_modes_and_motion_vectors(AV1_COMMON *cm, const char *file) { // output skip infomation. 
log_frame_info(cm, "Skips:", mvs); - for (mi_row = 0; mi_row < rows; mi_row++) { + for (int mi_row = 0; mi_row < rows; mi_row++) { fprintf(mvs, "S "); - for (mi_col = 0; mi_col < cols; mi_col++) { + for (int mi_col = 0; mi_col < cols; mi_col++) { fprintf(mvs, "%2d ", mi[0]->skip); mi++; } fprintf(mvs, "\n"); - mi += MAX_MIB_SIZE; + mi += mi_params->mi_stride - cols; } fprintf(mvs, "\n"); // output motion vectors. log_frame_info(cm, "Vectors ", mvs); - mi = cm->mi_grid_visible; - for (mi_row = 0; mi_row < rows; mi_row++) { + mi = mi_params->mi_grid_base; + for (int mi_row = 0; mi_row < rows; mi_row++) { fprintf(mvs, "V "); - for (mi_col = 0; mi_col < cols; mi_col++) { + for (int mi_col = 0; mi_col < cols; mi_col++) { fprintf(mvs, "%4d:%4d ", mi[0]->mv[0].as_mv.row, mi[0]->mv[0].as_mv.col); mi++; } fprintf(mvs, "\n"); - mi += MAX_MIB_SIZE; + mi += mi_params->mi_stride - cols; } fprintf(mvs, "\n"); @@ -93,6 +92,13 @@ void av1_print_uncompressed_frame_header(const uint8_t *data, int size, const char *filename) { FILE *hdrFile = fopen(filename, "w"); fwrite(data, size, sizeof(uint8_t), hdrFile); + + // Reset order hints(7bit + a previous bit) to 0, so that all camera frame + // headers are identical in large scale coding. + uint8_t zero = 0; + fseek(hdrFile, 1, SEEK_SET); + // Reset second byte. 
+ fwrite(&zero, 1, sizeof(uint8_t), hdrFile); fclose(hdrFile); } diff --git a/media/libaom/src/av1/common/entropy.c b/media/libaom/src/av1/common/entropy.c index 4f95ef69b..1f7a0efe0 100644 --- a/media/libaom/src/av1/common/entropy.c +++ b/media/libaom/src/av1/common/entropy.c @@ -13,10 +13,10 @@ #include "aom/aom_integer.h" #include "aom_mem/aom_mem.h" +#include "av1/common/av1_common_int.h" #include "av1/common/blockd.h" #include "av1/common/entropy.h" #include "av1/common/entropymode.h" -#include "av1/common/onyxc_int.h" #include "av1/common/scan.h" #include "av1/common/token_cdfs.h" #include "av1/common/txb_common.h" @@ -29,7 +29,7 @@ static int get_q_ctx(int q) { } void av1_default_coef_probs(AV1_COMMON *cm) { - const int index = get_q_ctx(cm->base_qindex); + const int index = get_q_ctx(cm->quant_params.base_qindex); #if CONFIG_ENTROPY_STATS cm->coef_cdf_category = index; #endif @@ -50,8 +50,9 @@ void av1_default_coef_probs(AV1_COMMON *cm) { av1_copy(cm->fc->eob_flag_cdf1024, av1_default_eob_multi1024_cdfs[index]); } -static void reset_cdf_symbol_counter(aom_cdf_prob *cdf_ptr, int num_cdfs, - int cdf_stride, int nsymbs) { +static AOM_INLINE void reset_cdf_symbol_counter(aom_cdf_prob *cdf_ptr, + int num_cdfs, int cdf_stride, + int nsymbs) { for (int i = 0; i < num_cdfs; i++) { cdf_ptr[i * cdf_stride + nsymbs] = 0; } @@ -68,7 +69,7 @@ static void reset_cdf_symbol_counter(aom_cdf_prob *cdf_ptr, int num_cdfs, reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \ } while (0) -static void reset_nmv_counter(nmv_context *nmv) { +static AOM_INLINE void reset_nmv_counter(nmv_context *nmv) { RESET_CDF_COUNTER(nmv->joints_cdf, 4); for (int i = 0; i < 2; i++) { RESET_CDF_COUNTER(nmv->comps[i].classes_cdf, MV_CLASSES); @@ -101,7 +102,7 @@ void av1_reset_cdf_symbol_counters(FRAME_CONTEXT *fc) { RESET_CDF_COUNTER(fc->refmv_cdf, 2); RESET_CDF_COUNTER(fc->drl_cdf, 2); RESET_CDF_COUNTER(fc->inter_compound_mode_cdf, INTER_COMPOUND_MODES); - 
RESET_CDF_COUNTER(fc->compound_type_cdf, COMPOUND_TYPES - 1); + RESET_CDF_COUNTER(fc->compound_type_cdf, MASKED_COMPOUND_TYPES); RESET_CDF_COUNTER(fc->wedge_idx_cdf, 16); RESET_CDF_COUNTER(fc->interintra_cdf, 2); RESET_CDF_COUNTER(fc->wedge_interintra_cdf, 2); diff --git a/media/libaom/src/av1/common/entropy.h b/media/libaom/src/av1/common/entropy.h index 991692c2f..ee78f56a3 100644 --- a/media/libaom/src/av1/common/entropy.h +++ b/media/libaom/src/av1/common/entropy.h @@ -48,18 +48,18 @@ extern "C" { #define BR_CDF_SIZE (4) #define COEFF_BASE_RANGE (4 * (BR_CDF_SIZE - 1)) -#define COEFF_CONTEXT_BITS 6 +#define COEFF_CONTEXT_BITS 3 #define COEFF_CONTEXT_MASK ((1 << COEFF_CONTEXT_BITS) - 1) #define MAX_BASE_BR_RANGE (COEFF_BASE_RANGE + NUM_BASE_LEVELS + 1) #define BASE_CONTEXT_POSITION_NUM 12 -typedef enum TX_CLASS { +enum { TX_CLASS_2D = 0, TX_CLASS_HORIZ = 1, TX_CLASS_VERT = 2, TX_CLASSES = 3, -} TX_CLASS; +} UENUM1BYTE(TX_CLASS); #define DCT_MAX_VALUE 16384 #define DCT_MAX_VALUE_HIGH10 65536 diff --git a/media/libaom/src/av1/common/entropymode.c b/media/libaom/src/av1/common/entropymode.c index 41dc30ddb..5f061be35 100644 --- a/media/libaom/src/av1/common/entropymode.c +++ b/media/libaom/src/av1/common/entropymode.c @@ -11,9 +11,9 @@ #include "aom_mem/aom_mem.h" +#include "av1/common/av1_common_int.h" #include "av1/common/reconinter.h" #include "av1/common/scan.h" -#include "av1/common/onyxc_int.h" #include "av1/common/seg_common.h" #include "av1/common/txb_common.h" @@ -435,16 +435,16 @@ static const aom_cdf_prob { AOM_CDF3(601, 943) }, { AOM_CDF3(14969, 21398) } }; -static const aom_cdf_prob default_newmv_cdf[NEWMV_MODE_CONTEXTS][CDF_SIZE(2)] = - { { AOM_CDF2(24035) }, { AOM_CDF2(16630) }, { AOM_CDF2(15339) }, - { AOM_CDF2(8386) }, { AOM_CDF2(12222) }, { AOM_CDF2(4676) } }; +static const aom_cdf_prob default_newmv_cdf[NEWMV_MODE_CONTEXTS][CDF_SIZE( + 2)] = { { AOM_CDF2(24035) }, { AOM_CDF2(16630) }, { AOM_CDF2(15339) }, + { AOM_CDF2(8386) }, { AOM_CDF2(12222) 
}, { AOM_CDF2(4676) } }; static const aom_cdf_prob default_zeromv_cdf[GLOBALMV_MODE_CONTEXTS][CDF_SIZE( 2)] = { { AOM_CDF2(2175) }, { AOM_CDF2(1054) } }; -static const aom_cdf_prob default_refmv_cdf[REFMV_MODE_CONTEXTS][CDF_SIZE(2)] = - { { AOM_CDF2(23974) }, { AOM_CDF2(24188) }, { AOM_CDF2(17848) }, - { AOM_CDF2(28622) }, { AOM_CDF2(24312) }, { AOM_CDF2(19923) } }; +static const aom_cdf_prob default_refmv_cdf[REFMV_MODE_CONTEXTS][CDF_SIZE( + 2)] = { { AOM_CDF2(23974) }, { AOM_CDF2(24188) }, { AOM_CDF2(17848) }, + { AOM_CDF2(28622) }, { AOM_CDF2(24312) }, { AOM_CDF2(19923) } }; static const aom_cdf_prob default_drl_cdf[DRL_MODE_CONTEXTS][CDF_SIZE(2)] = { { AOM_CDF2(13104) }, { AOM_CDF2(24560) }, { AOM_CDF2(18945) } @@ -470,11 +470,11 @@ static const aom_cdf_prob default_interintra_cdf[BLOCK_SIZE_GROUPS][CDF_SIZE( { AOM_CDF2(30237) } }; static const aom_cdf_prob - default_interintra_mode_cdf[BLOCK_SIZE_GROUPS][CDF_SIZE(INTERINTRA_MODES)] = - { { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(1875, 11082, 27332) }, - { AOM_CDF4(2473, 9996, 26388) }, - { AOM_CDF4(4238, 11537, 25926) } }; + default_interintra_mode_cdf[BLOCK_SIZE_GROUPS][CDF_SIZE( + INTERINTRA_MODES)] = { { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(1875, 11082, 27332) }, + { AOM_CDF4(2473, 9996, 26388) }, + { AOM_CDF4(4238, 11537, 25926) } }; static const aom_cdf_prob default_wedge_interintra_cdf[BLOCK_SIZES_ALL][CDF_SIZE(2)] = { @@ -488,63 +488,63 @@ static const aom_cdf_prob { AOM_CDF2(16384) } }; -static const aom_cdf_prob - default_compound_type_cdf[BLOCK_SIZES_ALL][CDF_SIZE(COMPOUND_TYPES - 1)] = { - { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, - { AOM_CDF2(23431) }, { AOM_CDF2(13171) }, { AOM_CDF2(11470) }, - { AOM_CDF2(9770) }, { AOM_CDF2(9100) }, { AOM_CDF2(8233) }, - { AOM_CDF2(6172) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, - { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, - { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, - { AOM_CDF2(11820) 
}, { AOM_CDF2(7701) }, { AOM_CDF2(16384) }, - { AOM_CDF2(16384) } - }; +static const aom_cdf_prob default_compound_type_cdf[BLOCK_SIZES_ALL][CDF_SIZE( + MASKED_COMPOUND_TYPES)] = { + { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, + { AOM_CDF2(23431) }, { AOM_CDF2(13171) }, { AOM_CDF2(11470) }, + { AOM_CDF2(9770) }, { AOM_CDF2(9100) }, { AOM_CDF2(8233) }, + { AOM_CDF2(6172) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, + { AOM_CDF2(11820) }, { AOM_CDF2(7701) }, { AOM_CDF2(16384) }, + { AOM_CDF2(16384) } +}; -static const aom_cdf_prob default_wedge_idx_cdf[BLOCK_SIZES_ALL][CDF_SIZE(16)] = - { { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432, - 20480, 22528, 24576, 26624, 28672, 30720) }, - { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432, - 20480, 22528, 24576, 26624, 28672, 30720) }, - { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432, - 20480, 22528, 24576, 26624, 28672, 30720) }, - { AOM_CDF16(2438, 4440, 6599, 8663, 11005, 12874, 15751, 18094, 20359, - 22362, 24127, 25702, 27752, 29450, 31171) }, - { AOM_CDF16(806, 3266, 6005, 6738, 7218, 7367, 7771, 14588, 16323, 17367, - 18452, 19422, 22839, 26127, 29629) }, - { AOM_CDF16(2779, 3738, 4683, 7213, 7775, 8017, 8655, 14357, 17939, 21332, - 24520, 27470, 29456, 30529, 31656) }, - { AOM_CDF16(1684, 3625, 5675, 7108, 9302, 11274, 14429, 17144, 19163, - 20961, 22884, 24471, 26719, 28714, 30877) }, - { AOM_CDF16(1142, 3491, 6277, 7314, 8089, 8355, 9023, 13624, 15369, 16730, - 18114, 19313, 22521, 26012, 29550) }, - { AOM_CDF16(2742, 4195, 5727, 8035, 8980, 9336, 10146, 14124, 17270, - 20533, 23434, 25972, 27944, 29570, 31416) }, - { AOM_CDF16(1727, 3948, 6101, 7796, 9841, 12344, 15766, 18944, 20638, - 22038, 23963, 25311, 26988, 28766, 31012) }, - { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432, 
- 20480, 22528, 24576, 26624, 28672, 30720) }, - { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432, - 20480, 22528, 24576, 26624, 28672, 30720) }, - { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432, - 20480, 22528, 24576, 26624, 28672, 30720) }, - { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432, - 20480, 22528, 24576, 26624, 28672, 30720) }, - { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432, - 20480, 22528, 24576, 26624, 28672, 30720) }, - { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432, - 20480, 22528, 24576, 26624, 28672, 30720) }, - { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432, - 20480, 22528, 24576, 26624, 28672, 30720) }, - { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432, - 20480, 22528, 24576, 26624, 28672, 30720) }, - { AOM_CDF16(154, 987, 1925, 2051, 2088, 2111, 2151, 23033, 23703, 24284, - 24985, 25684, 27259, 28883, 30911) }, - { AOM_CDF16(1135, 1322, 1493, 2635, 2696, 2737, 2770, 21016, 22935, 25057, - 27251, 29173, 30089, 30960, 31933) }, - { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432, - 20480, 22528, 24576, 26624, 28672, 30720) }, - { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432, - 20480, 22528, 24576, 26624, 28672, 30720) } }; +static const aom_cdf_prob default_wedge_idx_cdf[BLOCK_SIZES_ALL][CDF_SIZE( + 16)] = { { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, + 18432, 20480, 22528, 24576, 26624, 28672, 30720) }, + { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, + 18432, 20480, 22528, 24576, 26624, 28672, 30720) }, + { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, + 18432, 20480, 22528, 24576, 26624, 28672, 30720) }, + { AOM_CDF16(2438, 4440, 6599, 8663, 11005, 12874, 15751, 18094, + 20359, 22362, 24127, 25702, 27752, 29450, 31171) }, + { AOM_CDF16(806, 3266, 6005, 6738, 7218, 7367, 7771, 14588, 16323, + 
17367, 18452, 19422, 22839, 26127, 29629) }, + { AOM_CDF16(2779, 3738, 4683, 7213, 7775, 8017, 8655, 14357, 17939, + 21332, 24520, 27470, 29456, 30529, 31656) }, + { AOM_CDF16(1684, 3625, 5675, 7108, 9302, 11274, 14429, 17144, + 19163, 20961, 22884, 24471, 26719, 28714, 30877) }, + { AOM_CDF16(1142, 3491, 6277, 7314, 8089, 8355, 9023, 13624, 15369, + 16730, 18114, 19313, 22521, 26012, 29550) }, + { AOM_CDF16(2742, 4195, 5727, 8035, 8980, 9336, 10146, 14124, + 17270, 20533, 23434, 25972, 27944, 29570, 31416) }, + { AOM_CDF16(1727, 3948, 6101, 7796, 9841, 12344, 15766, 18944, + 20638, 22038, 23963, 25311, 26988, 28766, 31012) }, + { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, + 18432, 20480, 22528, 24576, 26624, 28672, 30720) }, + { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, + 18432, 20480, 22528, 24576, 26624, 28672, 30720) }, + { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, + 18432, 20480, 22528, 24576, 26624, 28672, 30720) }, + { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, + 18432, 20480, 22528, 24576, 26624, 28672, 30720) }, + { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, + 18432, 20480, 22528, 24576, 26624, 28672, 30720) }, + { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, + 18432, 20480, 22528, 24576, 26624, 28672, 30720) }, + { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, + 18432, 20480, 22528, 24576, 26624, 28672, 30720) }, + { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, + 18432, 20480, 22528, 24576, 26624, 28672, 30720) }, + { AOM_CDF16(154, 987, 1925, 2051, 2088, 2111, 2151, 23033, 23703, + 24284, 24985, 25684, 27259, 28883, 30911) }, + { AOM_CDF16(1135, 1322, 1493, 2635, 2696, 2737, 2770, 21016, 22935, + 25057, 27251, 29173, 30089, 30960, 31933) }, + { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, + 18432, 20480, 22528, 24576, 26624, 28672, 30720) }, + { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 
14336, 16384, + 18432, 20480, 22528, 24576, 26624, 28672, 30720) } }; static const aom_cdf_prob default_motion_mode_cdf[BLOCK_SIZES_ALL][CDF_SIZE( MOTION_MODES)] = { { AOM_CDF3(10923, 21845) }, { AOM_CDF3(10923, 21845) }, @@ -1068,9 +1068,16 @@ void av1_setup_frame_contexts(AV1_COMMON *cm) { // This function must ONLY be called when cm->fc has been initialized with // default probs, either by av1_setup_past_independence or after manually // initializing them - cm->frame_contexts[FRAME_CONTEXT_DEFAULTS] = *cm->fc; - if (cm->large_scale_tile) { - for (int i = 0; i < FRAME_CONTEXTS; ++i) cm->frame_contexts[i] = *cm->fc; + *cm->default_frame_context = *cm->fc; + // TODO(jack.haughton@argondesign.com): don't think this should be necessary, + // but could do with fuller testing + if (cm->tiles.large_scale) { + for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i) { + RefCntBuffer *const buf = get_ref_frame_buf(cm, i); + if (buf != NULL) buf->frame_context = *cm->fc; + } + for (int i = 0; i < FRAME_BUFFERS; ++i) + cm->buffer_pool->frame_bufs[i].frame_context = *cm->fc; } } @@ -1079,10 +1086,9 @@ void av1_setup_past_independence(AV1_COMMON *cm) { // Features disabled, 0, with delta coding (Default state). av1_clearall_segfeatures(&cm->seg); - cm->current_frame_seg_map = cm->cur_frame->seg_map; - - if (cm->current_frame_seg_map) - memset(cm->current_frame_seg_map, 0, (cm->mi_rows * cm->mi_cols)); + if (cm->cur_frame->seg_map) + memset(cm->cur_frame->seg_map, 0, + (cm->mi_params.mi_rows * cm->mi_params.mi_cols)); // reset mode ref deltas av1_set_default_ref_deltas(cm->cur_frame->ref_deltas); @@ -1092,12 +1098,6 @@ void av1_setup_past_independence(AV1_COMMON *cm) { av1_default_coef_probs(cm); init_mode_probs(cm->fc); av1_init_mv_probs(cm); - av1_init_lv_map(cm); cm->fc->initialized = 1; av1_setup_frame_contexts(cm); - - // prev_mip will only be allocated in encoder. 
- if (frame_is_intra_only(cm) && cm->prev_mip) - memset(cm->prev_mip, 0, - cm->mi_stride * cm->mi_rows * sizeof(*cm->prev_mip)); } diff --git a/media/libaom/src/av1/common/entropymode.h b/media/libaom/src/av1/common/entropymode.h index 7047f34d2..bbbf55dc8 100644 --- a/media/libaom/src/av1/common/entropymode.h +++ b/media/libaom/src/av1/common/entropymode.h @@ -63,7 +63,6 @@ struct AV1Common; typedef struct { const int16_t *scan; const int16_t *iscan; - const int16_t *neighbors; } SCAN_ORDER; typedef struct frame_contexts { @@ -92,7 +91,8 @@ typedef struct frame_contexts { aom_cdf_prob inter_compound_mode_cdf[INTER_MODE_CONTEXTS] [CDF_SIZE(INTER_COMPOUND_MODES)]; - aom_cdf_prob compound_type_cdf[BLOCK_SIZES_ALL][CDF_SIZE(COMPOUND_TYPES - 1)]; + aom_cdf_prob compound_type_cdf[BLOCK_SIZES_ALL] + [CDF_SIZE(MASKED_COMPOUND_TYPES)]; aom_cdf_prob wedge_idx_cdf[BLOCK_SIZES_ALL][CDF_SIZE(16)]; aom_cdf_prob interintra_cdf[BLOCK_SIZE_GROUPS][CDF_SIZE(2)]; aom_cdf_prob wedge_interintra_cdf[BLOCK_SIZES_ALL][CDF_SIZE(2)]; diff --git a/media/libaom/src/av1/common/entropymv.c b/media/libaom/src/av1/common/entropymv.c index 491337387..e1e42f2f1 100644 --- a/media/libaom/src/av1/common/entropymv.c +++ b/media/libaom/src/av1/common/entropymv.c @@ -9,7 +9,7 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ -#include "av1/common/onyxc_int.h" +#include "av1/common/av1_common_int.h" #include "av1/common/entropymv.h" static const nmv_context default_nmv_context = { diff --git a/media/libaom/src/av1/common/entropymv.h b/media/libaom/src/av1/common/entropymv.h index fa818a2c1..cddc80768 100644 --- a/media/libaom/src/av1/common/entropymv.h +++ b/media/libaom/src/av1/common/entropymv.h @@ -30,12 +30,12 @@ void av1_init_mv_probs(struct AV1Common *cm); /* Symbols for coding which components are zero jointly */ #define MV_JOINTS 4 -typedef enum { +enum { MV_JOINT_ZERO = 0, /* Zero vector */ MV_JOINT_HNZVZ = 1, /* Vert zero, hor nonzero */ MV_JOINT_HZVNZ = 2, /* Hor zero, vert nonzero */ MV_JOINT_HNZVNZ = 3, /* Both components nonzero */ -} MV_JOINT_TYPE; +} UENUM1BYTE(MV_JOINT_TYPE); static INLINE int mv_joint_vertical(MV_JOINT_TYPE type) { return type == MV_JOINT_HZVNZ || type == MV_JOINT_HNZVNZ; @@ -47,7 +47,7 @@ static INLINE int mv_joint_horizontal(MV_JOINT_TYPE type) { /* Symbols for coding magnitude class of nonzero components */ #define MV_CLASSES 11 -typedef enum { +enum { MV_CLASS_0 = 0, /* (0, 2] integer pel */ MV_CLASS_1 = 1, /* (2, 4] integer pel */ MV_CLASS_2 = 2, /* (4, 8] integer pel */ @@ -59,7 +59,7 @@ typedef enum { MV_CLASS_8 = 8, /* (256, 512] integer pel */ MV_CLASS_9 = 9, /* (512, 1024] integer pel */ MV_CLASS_10 = 10, /* (1024,2048] integer pel */ -} MV_CLASS_TYPE; +} UENUM1BYTE(MV_CLASS_TYPE); #define CLASS0_BITS 1 /* bits at integer precision for class 0 */ #define CLASS0_SIZE (1 << CLASS0_BITS) @@ -91,11 +91,11 @@ typedef struct { nmv_component comps[2]; } nmv_context; -typedef enum { +enum { MV_SUBPEL_NONE = -1, MV_SUBPEL_LOW_PRECISION = 0, MV_SUBPEL_HIGH_PRECISION, -} MvSubpelPrecision; +} SENUM1BYTE(MvSubpelPrecision); #ifdef __cplusplus } // extern "C" diff --git a/media/libaom/src/av1/common/enums.h b/media/libaom/src/av1/common/enums.h index 869c06ef2..0c09a1bc7 100644 --- a/media/libaom/src/av1/common/enums.h +++ 
b/media/libaom/src/av1/common/enums.h @@ -16,6 +16,7 @@ #include "aom/aom_codec.h" #include "aom/aom_integer.h" +#include "aom_ports/mem.h" #ifdef __cplusplus extern "C" { @@ -63,17 +64,6 @@ extern "C" { #define FRAME_OFFSET_BITS 5 #define MAX_FRAME_DISTANCE ((1 << FRAME_OFFSET_BITS) - 1) -#define REF_FRAMES_LOG2 3 -#define REF_FRAMES (1 << REF_FRAMES_LOG2) - -// 4 scratch frames for the new frames to support a maximum of 4 cores decoding -// in parallel, 3 for scaled references on the encoder. -// TODO(hkuang): Add ondemand frame buffers instead of hardcoding the number -// of framebuffers. -// TODO(jkoleszar): These 3 extra references could probably come from the -// normal reference pool. -#define FRAME_BUFFERS (REF_FRAMES + 7) - // 4 frame filter levels: y plane vertical, y plane horizontal, // u plane, and v plane #define FRAME_LF_COUNT 4 @@ -83,11 +73,6 @@ extern "C" { #define DIST_PRECISION_BITS 4 #define DIST_PRECISION (1 << DIST_PRECISION_BITS) // 16 -// TODO(chengchen): Temporal flag serve as experimental flag for WIP -// bitmask construction. -// Shall be removed when bitmask code is completely checkedin -#define LOOP_FILTER_BITMASK 0 - #define PROFILE_BITS 3 // The following three profiles are currently defined. // Profile 0. 8-bit and 10-bit 4:2:0 and 4:0:0 only. @@ -95,21 +80,12 @@ extern "C" { // Profile 2. 8-bit and 10-bit 4:2:2 // 12-bit 4:0:0, 4:2:2 and 4:4:4 // Since we have three bits for the profiles, it can be extended later. 
-typedef enum BITSTREAM_PROFILE { +enum { PROFILE_0, PROFILE_1, PROFILE_2, MAX_PROFILES, -} BITSTREAM_PROFILE; - -#define LEVEL_MAJOR_BITS 3 -#define LEVEL_MINOR_BITS 2 -#define LEVEL_BITS (LEVEL_MAJOR_BITS + LEVEL_MINOR_BITS) - -#define LEVEL_MAJOR_MIN 2 -#define LEVEL_MAJOR_MAX ((1 << LEVEL_MAJOR_BITS) - 1 + LEVEL_MAJOR_MIN) -#define LEVEL_MINOR_MIN 0 -#define LEVEL_MINOR_MAX ((1 << LEVEL_MINOR_BITS) - 1) +} SENUM1BYTE(BITSTREAM_PROFILE); #define OP_POINTS_CNT_MINUS_1_BITS 5 #define OP_POINTS_IDC_BITS 12 @@ -149,7 +125,28 @@ typedef enum ATTRIBUTE_PACKED { // 4X4, 8X8, 16X16, 32X32, 64X64, 128X128 #define SQR_BLOCK_SIZES 6 -typedef enum ATTRIBUTE_PACKED { +// Partition types. R: Recursive +// +// NONE HORZ VERT SPLIT +// +-------+ +-------+ +---+---+ +---+---+ +// | | | | | | | | R | R | +// | | +-------+ | | | +---+---+ +// | | | | | | | | R | R | +// +-------+ +-------+ +---+---+ +---+---+ +// +// HORZ_A HORZ_B VERT_A VERT_B +// +---+---+ +-------+ +---+---+ +---+---+ +// | | | | | | | | | | | +// +---+---+ +---+---+ +---+ | | +---+ +// | | | | | | | | | | | +// +-------+ +---+---+ +---+---+ +---+---+ +// +// HORZ_4 VERT_4 +// +-----+ +-+-+-+ +// +-----+ | | | | +// +-----+ | | | | +// +-----+ +-+-+-+ +enum { PARTITION_NONE, PARTITION_HORZ, PARTITION_VERT, @@ -163,7 +160,7 @@ typedef enum ATTRIBUTE_PACKED { EXT_PARTITION_TYPES, PARTITION_TYPES = PARTITION_SPLIT + 1, PARTITION_INVALID = 255 -} PARTITION_TYPE; +} UENUM1BYTE(PARTITION_TYPE); typedef char PARTITION_CONTEXT; #define PARTITION_PLOFFSET 4 // number of probability models per block size @@ -171,12 +168,7 @@ typedef char PARTITION_CONTEXT; #define PARTITION_CONTEXTS (PARTITION_BLOCK_SIZES * PARTITION_PLOFFSET) // block transform size -#if defined(_MSC_VER) -typedef uint8_t TX_SIZE; -enum ATTRIBUTE_PACKED { -#else -typedef enum ATTRIBUTE_PACKED { -#endif +enum { TX_4X4, // 4x4 transform TX_8X8, // 8x8 transform TX_16X16, // 16x16 transform @@ -200,11 +192,7 @@ typedef enum ATTRIBUTE_PACKED { TX_SIZES = 
TX_4X8, // Does NOT include rectangular transforms TX_SIZES_LARGEST = TX_64X64, TX_INVALID = 255 // Invalid transform size -#if defined(_MSC_VER) -}; -#else -} TX_SIZE; -#endif +} UENUM1BYTE(TX_SIZE); #define TX_SIZE_LUMA_MIN (TX_4X4) /* We don't need to code a transform size unless the allowed size is at least @@ -226,7 +214,7 @@ typedef enum ATTRIBUTE_PACKED { #define TX_PAD_HOR 4 // Pad 6 extra rows (2 on top and 4 on bottom) to remove vertical availability // check. -#define TX_PAD_TOP 2 +#define TX_PAD_TOP 0 #define TX_PAD_BOTTOM 4 #define TX_PAD_VER (TX_PAD_TOP + TX_PAD_BOTTOM) // Pad 16 extra bytes to avoid reading overflow in SIMD optimization. @@ -238,43 +226,44 @@ typedef enum ATTRIBUTE_PACKED { #define MAX_TX_BLOCKS_IN_MAX_SB (1 << MAX_TX_BLOCKS_IN_MAX_SB_LOG2) // frame transform mode -typedef enum ATTRIBUTE_PACKED { +enum { ONLY_4X4, // use only 4x4 transform TX_MODE_LARGEST, // transform size is the largest possible for pu size TX_MODE_SELECT, // transform specified for each block TX_MODES, -} TX_MODE; +} UENUM1BYTE(TX_MODE); // 1D tx types -typedef enum ATTRIBUTE_PACKED { +enum { DCT_1D, ADST_1D, FLIPADST_1D, IDTX_1D, TX_TYPES_1D, -} TX_TYPE_1D; - -typedef enum ATTRIBUTE_PACKED { - DCT_DCT, // DCT in both horizontal and vertical - ADST_DCT, // ADST in vertical, DCT in horizontal - DCT_ADST, // DCT in vertical, ADST in horizontal - ADST_ADST, // ADST in both directions - FLIPADST_DCT, - DCT_FLIPADST, - FLIPADST_FLIPADST, - ADST_FLIPADST, - FLIPADST_ADST, - IDTX, - V_DCT, - H_DCT, - V_ADST, - H_ADST, - V_FLIPADST, - H_FLIPADST, +} UENUM1BYTE(TX_TYPE_1D); + +enum { + DCT_DCT, // DCT in both horizontal and vertical + ADST_DCT, // ADST in vertical, DCT in horizontal + DCT_ADST, // DCT in vertical, ADST in horizontal + ADST_ADST, // ADST in both directions + FLIPADST_DCT, // FLIPADST in vertical, DCT in horizontal + DCT_FLIPADST, // DCT in vertical, FLIPADST in horizontal + FLIPADST_FLIPADST, // FLIPADST in both directions + ADST_FLIPADST, // ADST in 
vertical, FLIPADST in horizontal + FLIPADST_ADST, // FLIPADST in vertical, ADST in horizontal + IDTX, // Identity in both directions + V_DCT, // DCT in vertical, identity in horizontal + H_DCT, // Identity in vertical, DCT in horizontal + V_ADST, // ADST in vertical, identity in horizontal + H_ADST, // Identity in vertical, ADST in horizontal + V_FLIPADST, // FLIPADST in vertical, identity in horizontal + H_FLIPADST, // Identity in vertical, FLIPADST in horizontal TX_TYPES, -} TX_TYPE; + DCT_ADST_TX_MASK = 0x000F, // Either DCT or ADST in each direction +} UENUM1BYTE(TX_TYPE); -typedef enum ATTRIBUTE_PACKED { +enum { REG_REG, REG_SMOOTH, REG_SHARP, @@ -284,9 +273,9 @@ typedef enum ATTRIBUTE_PACKED { SHARP_REG, SHARP_SMOOTH, SHARP_SHARP, -} DUAL_FILTER_TYPE; +} UENUM1BYTE(DUAL_FILTER_TYPE); -typedef enum ATTRIBUTE_PACKED { +enum { // DCT only EXT_TX_SET_DCTONLY, // DCT + Identity only @@ -300,15 +289,13 @@ typedef enum ATTRIBUTE_PACKED { // Discrete Trig transforms w/ flip (9) + Identity (1) + 1D Hor/Ver (6) EXT_TX_SET_ALL16, EXT_TX_SET_TYPES -} TxSetType; - -#define IS_2D_TRANSFORM(tx_type) (tx_type < IDTX) +} UENUM1BYTE(TxSetType); #define EXT_TX_SIZES 4 // number of sizes that use extended transforms #define EXT_TX_SETS_INTER 4 // Sets of transform selections for INTER #define EXT_TX_SETS_INTRA 3 // Sets of transform selections for INTRA -typedef enum ATTRIBUTE_PACKED { +enum { AOM_LAST_FLAG = 1 << 0, AOM_LAST2_FLAG = 1 << 1, AOM_LAST3_FLAG = 1 << 2, @@ -317,19 +304,15 @@ typedef enum ATTRIBUTE_PACKED { AOM_ALT2_FLAG = 1 << 5, AOM_ALT_FLAG = 1 << 6, AOM_REFFRAME_ALL = (1 << 7) - 1 -} AOM_REFFRAME; +} UENUM1BYTE(AOM_REFFRAME); -typedef enum ATTRIBUTE_PACKED { +enum { UNIDIR_COMP_REFERENCE, BIDIR_COMP_REFERENCE, COMP_REFERENCE_TYPES, -} COMP_REFERENCE_TYPE; +} UENUM1BYTE(COMP_REFERENCE_TYPE); -typedef enum ATTRIBUTE_PACKED { - PLANE_TYPE_Y, - PLANE_TYPE_UV, - PLANE_TYPES -} PLANE_TYPE; +enum { PLANE_TYPE_Y, PLANE_TYPE_UV, PLANE_TYPES } UENUM1BYTE(PLANE_TYPE); 
#define CFL_ALPHABET_SIZE_LOG2 4 #define CFL_ALPHABET_SIZE (1 << CFL_ALPHABET_SIZE_LOG2) @@ -337,24 +320,20 @@ typedef enum ATTRIBUTE_PACKED { #define CFL_IDX_U(idx) (idx >> CFL_ALPHABET_SIZE_LOG2) #define CFL_IDX_V(idx) (idx & (CFL_ALPHABET_SIZE - 1)) -typedef enum ATTRIBUTE_PACKED { - CFL_PRED_U, - CFL_PRED_V, - CFL_PRED_PLANES -} CFL_PRED_TYPE; +enum { CFL_PRED_U, CFL_PRED_V, CFL_PRED_PLANES } UENUM1BYTE(CFL_PRED_TYPE); -typedef enum ATTRIBUTE_PACKED { +enum { CFL_SIGN_ZERO, CFL_SIGN_NEG, CFL_SIGN_POS, CFL_SIGNS -} CFL_SIGN_TYPE; +} UENUM1BYTE(CFL_SIGN_TYPE); -typedef enum ATTRIBUTE_PACKED { +enum { CFL_DISALLOWED, CFL_ALLOWED, CFL_ALLOWED_TYPES -} CFL_ALLOWED_TYPE; +} UENUM1BYTE(CFL_ALLOWED_TYPE); // CFL_SIGN_ZERO,CFL_SIGN_ZERO is invalid #define CFL_JOINT_SIGNS (CFL_SIGNS * CFL_SIGNS - 1) @@ -371,12 +350,12 @@ typedef enum ATTRIBUTE_PACKED { #define CFL_CONTEXT_V(js) \ (CFL_SIGN_V(js) * CFL_SIGNS + CFL_SIGN_U(js) - CFL_SIGNS) -typedef enum ATTRIBUTE_PACKED { +enum { PALETTE_MAP, COLOR_MAP_TYPES, -} COLOR_MAP_TYPE; +} UENUM1BYTE(COLOR_MAP_TYPE); -typedef enum ATTRIBUTE_PACKED { +enum { TWO_COLORS, THREE_COLORS, FOUR_COLORS, @@ -385,9 +364,9 @@ typedef enum ATTRIBUTE_PACKED { SEVEN_COLORS, EIGHT_COLORS, PALETTE_SIZES -} PALETTE_SIZE; +} UENUM1BYTE(PALETTE_SIZE); -typedef enum ATTRIBUTE_PACKED { +enum { PALETTE_COLOR_ONE, PALETTE_COLOR_TWO, PALETTE_COLOR_THREE, @@ -397,11 +376,11 @@ typedef enum ATTRIBUTE_PACKED { PALETTE_COLOR_SEVEN, PALETTE_COLOR_EIGHT, PALETTE_COLORS -} PALETTE_COLOR; +} UENUM1BYTE(PALETTE_COLOR); // Note: All directional predictors must be between V_PRED and D67_PRED (both // inclusive). 
-typedef enum ATTRIBUTE_PACKED { +enum { DC_PRED, // Average of above and left pixels V_PRED, // Vertical H_PRED, // Horizontal @@ -431,6 +410,8 @@ typedef enum ATTRIBUTE_PACKED { MB_MODE_COUNT, INTRA_MODE_START = DC_PRED, INTRA_MODE_END = NEARESTMV, + DIR_MODE_START = V_PRED, + DIR_MODE_END = D67_PRED + 1, INTRA_MODE_NUM = INTRA_MODE_END - INTRA_MODE_START, SINGLE_INTER_MODE_START = NEARESTMV, SINGLE_INTER_MODE_END = NEAREST_NEARESTMV, @@ -442,11 +423,11 @@ typedef enum ATTRIBUTE_PACKED { INTER_MODE_END = MB_MODE_COUNT, INTRA_MODES = PAETH_PRED + 1, // PAETH_PRED has to be the last intra mode. INTRA_INVALID = MB_MODE_COUNT // For uv_mode in inter blocks -} PREDICTION_MODE; +} UENUM1BYTE(PREDICTION_MODE); // TODO(ltrudeau) Do we really want to pack this? // TODO(ltrudeau) Do we match with PREDICTION_MODE? -typedef enum ATTRIBUTE_PACKED { +enum { UV_DC_PRED, // Average of above and left pixels UV_V_PRED, // Vertical UV_H_PRED, // Horizontal @@ -463,38 +444,71 @@ typedef enum ATTRIBUTE_PACKED { UV_CFL_PRED, // Chroma-from-Luma UV_INTRA_MODES, UV_MODE_INVALID, // For uv_mode in inter blocks -} UV_PREDICTION_MODE; +} UENUM1BYTE(UV_PREDICTION_MODE); -typedef enum ATTRIBUTE_PACKED { +enum { SIMPLE_TRANSLATION, OBMC_CAUSAL, // 2-sided OBMC WARPED_CAUSAL, // 2-sided WARPED MOTION_MODES -} MOTION_MODE; +} UENUM1BYTE(MOTION_MODE); -typedef enum ATTRIBUTE_PACKED { +enum { II_DC_PRED, II_V_PRED, II_H_PRED, II_SMOOTH_PRED, INTERINTRA_MODES -} INTERINTRA_MODE; +} UENUM1BYTE(INTERINTRA_MODE); -typedef enum ATTRIBUTE_PACKED { +enum { COMPOUND_AVERAGE, + COMPOUND_DISTWTD, COMPOUND_WEDGE, COMPOUND_DIFFWTD, COMPOUND_TYPES, -} COMPOUND_TYPE; + MASKED_COMPOUND_TYPES = 2, +} UENUM1BYTE(COMPOUND_TYPE); -typedef enum ATTRIBUTE_PACKED { +enum { FILTER_DC_PRED, FILTER_V_PRED, FILTER_H_PRED, FILTER_D157_PRED, FILTER_PAETH_PRED, FILTER_INTRA_MODES, -} FILTER_INTRA_MODE; +} UENUM1BYTE(FILTER_INTRA_MODE); + +enum { + SEQ_LEVEL_2_0, + SEQ_LEVEL_2_1, + SEQ_LEVEL_2_2, + SEQ_LEVEL_2_3, + 
SEQ_LEVEL_3_0, + SEQ_LEVEL_3_1, + SEQ_LEVEL_3_2, + SEQ_LEVEL_3_3, + SEQ_LEVEL_4_0, + SEQ_LEVEL_4_1, + SEQ_LEVEL_4_2, + SEQ_LEVEL_4_3, + SEQ_LEVEL_5_0, + SEQ_LEVEL_5_1, + SEQ_LEVEL_5_2, + SEQ_LEVEL_5_3, + SEQ_LEVEL_6_0, + SEQ_LEVEL_6_1, + SEQ_LEVEL_6_2, + SEQ_LEVEL_6_3, + SEQ_LEVEL_7_0, + SEQ_LEVEL_7_1, + SEQ_LEVEL_7_2, + SEQ_LEVEL_7_3, + SEQ_LEVELS, + SEQ_LEVEL_MAX = 31 +} UENUM1BYTE(AV1_LEVEL); + +#define LEVEL_BITS 5 #define DIRECTIONAL_MODES 8 #define MAX_ANGLE_DELTA 3 @@ -529,7 +543,9 @@ typedef enum ATTRIBUTE_PACKED { #define DELTA_Q_SMALL 3 #define DELTA_Q_PROBS (DELTA_Q_SMALL) -#define DEFAULT_DELTA_Q_RES 4 +#define DEFAULT_DELTA_Q_RES_PERCEPTUAL 4 +#define DEFAULT_DELTA_Q_RES_OBJECTIVE 4 + #define DELTA_LF_SMALL 3 #define DELTA_LF_PROBS (DELTA_LF_SMALL) #define DEFAULT_DELTA_LF_RES 2 @@ -538,6 +554,7 @@ typedef enum ATTRIBUTE_PACKED { #define MAX_MV_REF_CANDIDATES 2 #define MAX_REF_MV_STACK_SIZE 8 +#define USABLE_REF_MV_STACK_SIZE 4 #define REF_CAT_LEVEL 640 #define INTRA_INTER_CONTEXTS 4 @@ -550,28 +567,47 @@ typedef enum ATTRIBUTE_PACKED { #define TXFM_PARTITION_CONTEXTS ((TX_SIZES - TX_8X8) * 6 - 3) typedef uint8_t TXFM_CONTEXT; -#define NONE_FRAME -1 -#define INTRA_FRAME 0 -#define LAST_FRAME 1 -#define LAST2_FRAME 2 -#define LAST3_FRAME 3 -#define GOLDEN_FRAME 4 -#define BWDREF_FRAME 5 -#define ALTREF2_FRAME 6 -#define ALTREF_FRAME 7 -#define EXTREF_FRAME REF_FRAMES -#define LAST_REF_FRAMES (LAST3_FRAME - LAST_FRAME + 1) - -#define INTER_REFS_PER_FRAME (ALTREF_FRAME - LAST_FRAME + 1) - -#define FWD_REFS (GOLDEN_FRAME - LAST_FRAME + 1) +// An enum for single reference types (and some derived values). +enum { + NONE_FRAME = -1, + INTRA_FRAME, + LAST_FRAME, + LAST2_FRAME, + LAST3_FRAME, + GOLDEN_FRAME, + BWDREF_FRAME, + ALTREF2_FRAME, + ALTREF_FRAME, + REF_FRAMES, + + // Extra/scratch reference frame. 
It may be: + // - used to update the ALTREF2_FRAME ref (see lshift_bwd_ref_frames()), or + // - updated from ALTREF2_FRAME ref (see rshift_bwd_ref_frames()). + EXTREF_FRAME = REF_FRAMES, + + // Number of inter (non-intra) reference types. + INTER_REFS_PER_FRAME = ALTREF_FRAME - LAST_FRAME + 1, + + // Number of forward (aka past) reference types. + FWD_REFS = GOLDEN_FRAME - LAST_FRAME + 1, + + // Number of backward (aka future) reference types. + BWD_REFS = ALTREF_FRAME - BWDREF_FRAME + 1, + + SINGLE_REFS = FWD_REFS + BWD_REFS, +}; + +#define REF_FRAMES_LOG2 3 + +// REF_FRAMES for the cm->ref_frame_map array, 1 scratch frame for the new +// frame in cm->cur_frame, INTER_REFS_PER_FRAME for scaled references on the +// encoder in the cpi->scaled_ref_buf array. +#define FRAME_BUFFERS (REF_FRAMES + 1 + INTER_REFS_PER_FRAME) + #define FWD_RF_OFFSET(ref) (ref - LAST_FRAME) -#define BWD_REFS (ALTREF_FRAME - BWDREF_FRAME + 1) #define BWD_RF_OFFSET(ref) (ref - BWDREF_FRAME) -#define SINGLE_REFS (FWD_REFS + BWD_REFS) - -typedef enum ATTRIBUTE_PACKED { +enum { LAST_LAST2_FRAMES, // { LAST_FRAME, LAST2_FRAME } LAST_LAST3_FRAMES, // { LAST_FRAME, LAST3_FRAME } LAST_GOLDEN_FRAMES, // { LAST_FRAME, GOLDEN_FRAME } @@ -585,7 +621,7 @@ typedef enum ATTRIBUTE_PACKED { // NOTE: UNIDIR_COMP_REFS is the number of uni-directional reference pairs // that are explicitly signaled. UNIDIR_COMP_REFS = BWDREF_ALTREF_FRAMES + 1, -} UNIDIR_COMP_REF; +} UENUM1BYTE(UNIDIR_COMP_REF); #define TOTAL_COMP_REFS (FWD_REFS * BWD_REFS + TOTAL_UNIDIR_COMP_REFS) @@ -596,14 +632,37 @@ typedef enum ATTRIBUTE_PACKED { // possible to have a reference pair not listed for explicit signaling. #define MODE_CTX_REF_FRAMES (REF_FRAMES + TOTAL_COMP_REFS) -typedef enum ATTRIBUTE_PACKED { +// Note: It includes single and compound references. So, it can take values from +// NONE_FRAME to (MODE_CTX_REF_FRAMES - 1). Hence, it is not defined as an enum. 
+typedef int8_t MV_REFERENCE_FRAME; + +enum { RESTORE_NONE, RESTORE_WIENER, RESTORE_SGRPROJ, RESTORE_SWITCHABLE, RESTORE_SWITCHABLE_TYPES = RESTORE_SWITCHABLE, RESTORE_TYPES = 4, -} RestorationType; +} UENUM1BYTE(RestorationType); + +// Picture prediction structures (0-12 are predefined) in scalability metadata. +enum { + SCALABILITY_L1T2 = 0, + SCALABILITY_L1T3 = 1, + SCALABILITY_L2T1 = 2, + SCALABILITY_L2T2 = 3, + SCALABILITY_L2T3 = 4, + SCALABILITY_S2T1 = 5, + SCALABILITY_S2T2 = 6, + SCALABILITY_S2T3 = 7, + SCALABILITY_L2T1h = 8, + SCALABILITY_L2T2h = 9, + SCALABILITY_L2T3h = 10, + SCALABILITY_S2T1h = 11, + SCALABILITY_S2T2h = 12, + SCALABILITY_S2T3h = 13, + SCALABILITY_SS = 14 +} UENUM1BYTE(SCALABILITY_STRUCTURES); #define SUPERRES_SCALE_BITS 3 #define SUPERRES_SCALE_DENOMINATOR_MIN (SCALE_NUMERATOR + 1) diff --git a/media/libaom/src/av1/common/filter.h b/media/libaom/src/av1/common/filter.h index 571422d11..91791d3dc 100644 --- a/media/libaom/src/av1/common/filter.h +++ b/media/libaom/src/av1/common/filter.h @@ -19,6 +19,7 @@ #include "aom/aom_integer.h" #include "aom_dsp/aom_filter.h" #include "aom_ports/mem.h" +#include "av1/common/enums.h" #ifdef __cplusplus extern "C" { @@ -35,29 +36,55 @@ typedef enum ATTRIBUTE_PACKED { SWITCHABLE_FILTERS = BILINEAR, SWITCHABLE = SWITCHABLE_FILTERS + 1, /* the last switchable one */ EXTRA_FILTERS = INTERP_FILTERS_ALL - SWITCHABLE_FILTERS, + INTERP_INVALID = 0xff, } InterpFilter; -// With CONFIG_DUAL_FILTER, pack two InterpFilter's into a uint32_t: since -// there are at most 10 filters, we can use 16 bits for each and have more than -// enough space. This reduces argument passing and unifies the operation of -// setting a (pair of) filters. -// -// Without CONFIG_DUAL_FILTER, -typedef uint32_t InterpFilters; -static INLINE InterpFilter av1_extract_interp_filter(InterpFilters filters, - int x_filter) { - return (InterpFilter)((filters >> (x_filter ? 
16 : 0)) & 0xf); -} +enum { + USE_2_TAPS_ORIG = 0, // This is used in temporal filtering. + USE_2_TAPS, + USE_4_TAPS, + USE_8_TAPS, +} UENUM1BYTE(SUBPEL_SEARCH_TYPE); + +enum { + INTERP_EVAL_LUMA_EVAL_CHROMA = 0, + INTERP_SKIP_LUMA_EVAL_CHROMA, + INTERP_EVAL_INVALID, + INTERP_SKIP_LUMA_SKIP_CHROMA, +} UENUM1BYTE(INTERP_EVAL_PLANE); + +enum { + INTERP_HORZ_NEQ_VERT_NEQ = 0, + INTERP_HORZ_EQ_VERT_NEQ, + INTERP_HORZ_NEQ_VERT_EQ, + INTERP_HORZ_EQ_VERT_EQ, + INTERP_PRED_TYPE_ALL, +} UENUM1BYTE(INTERP_PRED_TYPE); +// Pack two InterpFilter's into a uint32_t: since there are at most 10 filters, +// we can use 16 bits for each and have more than enough space. This reduces +// argument passing and unifies the operation of setting a (pair of) filters. +typedef struct InterpFilters { + uint16_t y_filter; + uint16_t x_filter; +} InterpFilters; -static INLINE InterpFilters av1_make_interp_filters(InterpFilter y_filter, - InterpFilter x_filter) { - uint16_t y16 = y_filter & 0xf; - uint16_t x16 = x_filter & 0xf; - return y16 | ((uint32_t)x16 << 16); +typedef union int_interpfilters { + uint32_t as_int; + InterpFilters as_filters; +} int_interpfilters; + +static INLINE InterpFilter av1_extract_interp_filter(int_interpfilters filters, + int dir) { + return (InterpFilter)((dir) ? 
filters.as_filters.x_filter + : filters.as_filters.y_filter); } -static INLINE InterpFilters av1_broadcast_interp_filter(InterpFilter filter) { - return av1_make_interp_filters(filter, filter); +static INLINE int_interpfilters +av1_broadcast_interp_filter(InterpFilter filter) { + int_interpfilters filters; + filters.as_filters.x_filter = filter; + filters.as_filters.y_filter = filter; + return filters; } static INLINE InterpFilter av1_unswitchable_filter(InterpFilter filter) { @@ -67,10 +94,10 @@ static INLINE InterpFilter av1_unswitchable_filter(InterpFilter filter) { /* (1 << LOG_SWITCHABLE_FILTERS) > SWITCHABLE_FILTERS */ #define LOG_SWITCHABLE_FILTERS 2 -#define MAX_SUBPEL_TAPS 12 #define SWITCHABLE_FILTER_CONTEXTS ((SWITCHABLE_FILTERS + 1) * 4) #define INTER_FILTER_COMP_OFFSET (SWITCHABLE_FILTERS + 1) #define INTER_FILTER_DIR_OFFSET ((SWITCHABLE_FILTERS + 1) * 2) +#define ALLOW_ALL_INTERP_FILT_MASK (0x01ff) typedef struct InterpFilterParams { const int16_t *filter_ptr; @@ -141,9 +168,10 @@ static const InterpFilterParams // A special 2-tap bilinear filter for IntraBC chroma. IntraBC uses full pixel // MV for luma. If sub-sampling exists, chroma may possibly use half-pel MV. 
-DECLARE_ALIGNED(256, static const int16_t, av1_intrabc_bilinear_filter[2]) = { - 64, - 64, +DECLARE_ALIGNED(256, static const int16_t, + av1_intrabc_bilinear_filter[2 * SUBPEL_SHIFTS]) = { + 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 64, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }; static const InterpFilterParams av1_intrabc_filter_params = { @@ -173,6 +201,16 @@ DECLARE_ALIGNED(256, static const InterpKernel, { 0, 0, 4, 36, 62, 26, 0, 0 }, { 0, 0, 2, 34, 62, 30, 0, 0 } }; +static const uint16_t + av1_interp_dual_filt_mask[INTERP_PRED_TYPE_ALL - 2][SWITCHABLE_FILTERS] = { + { (1 << REG_REG) | (1 << SMOOTH_REG) | (1 << SHARP_REG), + (1 << REG_SMOOTH) | (1 << SMOOTH_SMOOTH) | (1 << SHARP_SMOOTH), + (1 << REG_SHARP) | (1 << SMOOTH_SHARP) | (1 << SHARP_SHARP) }, + { (1 << REG_REG) | (1 << REG_SMOOTH) | (1 << REG_SHARP), + (1 << SMOOTH_REG) | (1 << SMOOTH_SMOOTH) | (1 << SMOOTH_SHARP), + (1 << SHARP_REG) | (1 << SHARP_SMOOTH) | (1 << SHARP_SHARP) } + }; + // For w<=4, MULTITAP_SHARP is the same as EIGHTTAP_REGULAR static const InterpFilterParams av1_interp_4tap[SWITCHABLE_FILTERS + 1] = { { (const int16_t *)av1_sub_pel_filters_4, SUBPEL_TAPS, SUBPEL_SHIFTS, @@ -192,14 +230,14 @@ av1_get_interp_filter_params_with_block_size(const InterpFilter interp_filter, return &av1_interp_filter_params_list[interp_filter]; } -static INLINE const InterpFilterParams *av1_get_4tap_interp_filter_params( - const InterpFilter interp_filter) { - return &av1_interp_4tap[interp_filter]; -} - static INLINE const int16_t *av1_get_interp_filter_kernel( - const InterpFilter interp_filter) { - return av1_interp_filter_params_list[interp_filter].filter_ptr; + const InterpFilter interp_filter, int subpel_search) { + assert(subpel_search >= USE_2_TAPS); + return (subpel_search == USE_2_TAPS) + ? av1_interp_4tap[BILINEAR].filter_ptr + : ((subpel_search == USE_4_TAPS) + ? 
av1_interp_4tap[interp_filter].filter_ptr + : av1_interp_filter_params_list[interp_filter].filter_ptr); } static INLINE const int16_t *av1_get_interp_filter_subpel_kernel( @@ -207,6 +245,33 @@ static INLINE const int16_t *av1_get_interp_filter_subpel_kernel( return filter_params->filter_ptr + filter_params->taps * subpel; } +static INLINE const InterpFilterParams *av1_get_filter(int subpel_search) { + assert(subpel_search >= USE_2_TAPS); + + switch (subpel_search) { + case USE_2_TAPS: return &av1_interp_4tap[BILINEAR]; + case USE_4_TAPS: return &av1_interp_4tap[EIGHTTAP_REGULAR]; + case USE_8_TAPS: return &av1_interp_filter_params_list[EIGHTTAP_REGULAR]; + default: assert(0); return NULL; + } +} + +static INLINE void reset_interp_filter_allowed_mask( + uint16_t *allow_interp_mask, DUAL_FILTER_TYPE filt_type) { + uint16_t tmp = (~(1 << filt_type)) & 0xffff; + *allow_interp_mask &= (tmp & ALLOW_ALL_INTERP_FILT_MASK); +} + +static INLINE void set_interp_filter_allowed_mask(uint16_t *allow_interp_mask, + DUAL_FILTER_TYPE filt_type) { + *allow_interp_mask |= (1 << filt_type); +} + +static INLINE uint8_t get_interp_filter_allowed_mask( + uint16_t allow_interp_mask, DUAL_FILTER_TYPE filt_type) { + return (allow_interp_mask >> filt_type) & 1; +} + #ifdef __cplusplus } // extern "C" #endif diff --git a/media/libaom/src/av1/common/frame_buffers.c b/media/libaom/src/av1/common/frame_buffers.c index fd6c4bc79..f10ccd594 100644 --- a/media/libaom/src/av1/common/frame_buffers.c +++ b/media/libaom/src/av1/common/frame_buffers.c @@ -22,7 +22,11 @@ int av1_alloc_internal_frame_buffers(InternalFrameBufferList *list) { AOM_MAXIMUM_REF_BUFFERS + AOM_MAXIMUM_WORK_BUFFERS; list->int_fb = (InternalFrameBuffer *)aom_calloc( list->num_internal_frame_buffers, sizeof(*list->int_fb)); - return (list->int_fb == NULL); + if (list->int_fb == NULL) { + list->num_internal_frame_buffers = 0; + return 1; + } + return 0; } void av1_free_internal_frame_buffers(InternalFrameBufferList *list) { @@ -36,6 
+40,7 @@ void av1_free_internal_frame_buffers(InternalFrameBufferList *list) { } aom_free(list->int_fb); list->int_fb = NULL; + list->num_internal_frame_buffers = 0; } void av1_zero_unused_internal_frame_buffers(InternalFrameBufferList *list) { @@ -69,7 +74,10 @@ int av1_get_frame_buffer(void *cb_priv, size_t min_size, // due to access uninitialized memory in frame border. It could be // skipped if border were totally removed. int_fb_list->int_fb[i].data = (uint8_t *)aom_calloc(1, min_size); - if (!int_fb_list->int_fb[i].data) return -1; + if (!int_fb_list->int_fb[i].data) { + int_fb_list->int_fb[i].size = 0; + return -1; + } int_fb_list->int_fb[i].size = min_size; } @@ -86,6 +94,5 @@ int av1_release_frame_buffer(void *cb_priv, aom_codec_frame_buffer_t *fb) { InternalFrameBuffer *const int_fb = (InternalFrameBuffer *)fb->priv; (void)cb_priv; if (int_fb) int_fb->in_use = 0; - fb->priv = NULL; return 0; } diff --git a/media/libaom/src/av1/common/idct.c b/media/libaom/src/av1/common/idct.c index 2c1cb9827..bff438f3c 100644 --- a/media/libaom/src/av1/common/idct.c +++ b/media/libaom/src/av1/common/idct.c @@ -56,87 +56,87 @@ void av1_highbd_inv_txfm_add_4x4_c(const tran_low_t *input, uint8_t *dest, av1_inv_txfm2d_add_4x4_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, bd); } -void av1_highbd_inv_txfm_add_4x8(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { +void av1_highbd_inv_txfm_add_4x8_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]); const int32_t *src = cast_to_int32(input); av1_inv_txfm2d_add_4x8_c(src, CONVERT_TO_SHORTPTR(dest), stride, txfm_param->tx_type, txfm_param->bd); } -void av1_highbd_inv_txfm_add_8x4(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { +void av1_highbd_inv_txfm_add_8x4_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { 
assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]); const int32_t *src = cast_to_int32(input); av1_inv_txfm2d_add_8x4_c(src, CONVERT_TO_SHORTPTR(dest), stride, txfm_param->tx_type, txfm_param->bd); } -void av1_highbd_inv_txfm_add_16x32(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { +void av1_highbd_inv_txfm_add_16x32_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { const int32_t *src = cast_to_int32(input); av1_inv_txfm2d_add_16x32_c(src, CONVERT_TO_SHORTPTR(dest), stride, txfm_param->tx_type, txfm_param->bd); } -void av1_highbd_inv_txfm_add_32x16(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { +void av1_highbd_inv_txfm_add_32x16_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { const int32_t *src = cast_to_int32(input); av1_inv_txfm2d_add_32x16_c(src, CONVERT_TO_SHORTPTR(dest), stride, txfm_param->tx_type, txfm_param->bd); } -void av1_highbd_inv_txfm_add_16x4(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { +void av1_highbd_inv_txfm_add_16x4_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { const int32_t *src = cast_to_int32(input); av1_inv_txfm2d_add_16x4_c(src, CONVERT_TO_SHORTPTR(dest), stride, txfm_param->tx_type, txfm_param->bd); } -void av1_highbd_inv_txfm_add_4x16(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { +void av1_highbd_inv_txfm_add_4x16_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { const int32_t *src = cast_to_int32(input); av1_inv_txfm2d_add_4x16_c(src, CONVERT_TO_SHORTPTR(dest), stride, txfm_param->tx_type, txfm_param->bd); } -void av1_highbd_inv_txfm_add_32x8(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { +void av1_highbd_inv_txfm_add_32x8_c(const tran_low_t *input, uint8_t *dest, + int stride, 
const TxfmParam *txfm_param) { const int32_t *src = cast_to_int32(input); av1_inv_txfm2d_add_32x8_c(src, CONVERT_TO_SHORTPTR(dest), stride, txfm_param->tx_type, txfm_param->bd); } -void av1_highbd_inv_txfm_add_8x32(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { +void av1_highbd_inv_txfm_add_8x32_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { const int32_t *src = cast_to_int32(input); av1_inv_txfm2d_add_8x32_c(src, CONVERT_TO_SHORTPTR(dest), stride, txfm_param->tx_type, txfm_param->bd); } -void av1_highbd_inv_txfm_add_32x64(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { +void av1_highbd_inv_txfm_add_32x64_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { const int32_t *src = cast_to_int32(input); av1_inv_txfm2d_add_32x64_c(src, CONVERT_TO_SHORTPTR(dest), stride, txfm_param->tx_type, txfm_param->bd); } -void av1_highbd_inv_txfm_add_64x32(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { +void av1_highbd_inv_txfm_add_64x32_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { const int32_t *src = cast_to_int32(input); av1_inv_txfm2d_add_64x32_c(src, CONVERT_TO_SHORTPTR(dest), stride, txfm_param->tx_type, txfm_param->bd); } -void av1_highbd_inv_txfm_add_16x64(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { +void av1_highbd_inv_txfm_add_16x64_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { const int32_t *src = cast_to_int32(input); av1_inv_txfm2d_add_16x64_c(src, CONVERT_TO_SHORTPTR(dest), stride, txfm_param->tx_type, txfm_param->bd); } -void av1_highbd_inv_txfm_add_64x16(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { +void av1_highbd_inv_txfm_add_64x16_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { 
const int32_t *src = cast_to_int32(input); av1_inv_txfm2d_add_64x16_c(src, CONVERT_TO_SHORTPTR(dest), stride, txfm_param->tx_type, txfm_param->bd); @@ -204,7 +204,7 @@ static void init_txfm_param(const MACROBLOCKD *xd, int plane, TX_SIZE tx_size, txfm_param->eob = eob; txfm_param->lossless = xd->lossless[xd->mi[0]->segment_id]; txfm_param->bd = xd->bd; - txfm_param->is_hbd = get_bitdepth_data_path_index(xd); + txfm_param->is_hbd = is_cur_buf_hbd(xd); txfm_param->tx_set_type = av1_get_ext_tx_set_type( txfm_param->tx_size, is_inter_block(xd->mi[0]), reduced_tx_set); } @@ -224,10 +224,10 @@ void av1_highbd_inv_txfm_add_c(const tran_low_t *input, uint8_t *dest, av1_highbd_inv_txfm_add_8x8_c(input, dest, stride, txfm_param); break; case TX_4X8: - av1_highbd_inv_txfm_add_4x8(input, dest, stride, txfm_param); + av1_highbd_inv_txfm_add_4x8_c(input, dest, stride, txfm_param); break; case TX_8X4: - av1_highbd_inv_txfm_add_8x4(input, dest, stride, txfm_param); + av1_highbd_inv_txfm_add_8x4_c(input, dest, stride, txfm_param); break; case TX_8X16: av1_highbd_inv_txfm_add_8x16_c(input, dest, stride, txfm_param); @@ -236,25 +236,25 @@ void av1_highbd_inv_txfm_add_c(const tran_low_t *input, uint8_t *dest, av1_highbd_inv_txfm_add_16x8_c(input, dest, stride, txfm_param); break; case TX_16X32: - av1_highbd_inv_txfm_add_16x32(input, dest, stride, txfm_param); + av1_highbd_inv_txfm_add_16x32_c(input, dest, stride, txfm_param); break; case TX_32X16: - av1_highbd_inv_txfm_add_32x16(input, dest, stride, txfm_param); + av1_highbd_inv_txfm_add_32x16_c(input, dest, stride, txfm_param); break; case TX_64X64: av1_highbd_inv_txfm_add_64x64_c(input, dest, stride, txfm_param); break; case TX_32X64: - av1_highbd_inv_txfm_add_32x64(input, dest, stride, txfm_param); + av1_highbd_inv_txfm_add_32x64_c(input, dest, stride, txfm_param); break; case TX_64X32: - av1_highbd_inv_txfm_add_64x32(input, dest, stride, txfm_param); + av1_highbd_inv_txfm_add_64x32_c(input, dest, stride, txfm_param); break; case 
TX_16X64: - av1_highbd_inv_txfm_add_16x64(input, dest, stride, txfm_param); + av1_highbd_inv_txfm_add_16x64_c(input, dest, stride, txfm_param); break; case TX_64X16: - av1_highbd_inv_txfm_add_64x16(input, dest, stride, txfm_param); + av1_highbd_inv_txfm_add_64x16_c(input, dest, stride, txfm_param); break; case TX_4X4: // this is like av1_short_idct4x4 but has a special case around eob<=1 @@ -263,16 +263,16 @@ void av1_highbd_inv_txfm_add_c(const tran_low_t *input, uint8_t *dest, av1_highbd_inv_txfm_add_4x4_c(input, dest, stride, txfm_param); break; case TX_16X4: - av1_highbd_inv_txfm_add_16x4(input, dest, stride, txfm_param); + av1_highbd_inv_txfm_add_16x4_c(input, dest, stride, txfm_param); break; case TX_4X16: - av1_highbd_inv_txfm_add_4x16(input, dest, stride, txfm_param); + av1_highbd_inv_txfm_add_4x16_c(input, dest, stride, txfm_param); break; case TX_8X32: - av1_highbd_inv_txfm_add_8x32(input, dest, stride, txfm_param); + av1_highbd_inv_txfm_add_8x32_c(input, dest, stride, txfm_param); break; case TX_32X8: - av1_highbd_inv_txfm_add_32x8(input, dest, stride, txfm_param); + av1_highbd_inv_txfm_add_32x8_c(input, dest, stride, txfm_param); break; default: assert(0 && "Invalid transform size"); break; } diff --git a/media/libaom/src/av1/common/idct.h b/media/libaom/src/av1/common/idct.h index d9454e73f..004d25d49 100644 --- a/media/libaom/src/av1/common/idct.h +++ b/media/libaom/src/av1/common/idct.h @@ -44,22 +44,6 @@ static INLINE const int32_t *cast_to_int32(const tran_low_t *input) { return (const int32_t *)input; } -typedef void(highbd_inv_txfm_add)(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *param); - -highbd_inv_txfm_add av1_highbd_inv_txfm_add_4x8; -highbd_inv_txfm_add av1_highbd_inv_txfm_add_8x4; -highbd_inv_txfm_add av1_highbd_inv_txfm_add_16x32; -highbd_inv_txfm_add av1_highbd_inv_txfm_add_32x16; -highbd_inv_txfm_add av1_highbd_inv_txfm_add_32x64; -highbd_inv_txfm_add av1_highbd_inv_txfm_add_64x32; -highbd_inv_txfm_add 
av1_highbd_inv_txfm_add_16x64; -highbd_inv_txfm_add av1_highbd_inv_txfm_add_64x16; -highbd_inv_txfm_add av1_highbd_inv_txfm_add_16x4; -highbd_inv_txfm_add av1_highbd_inv_txfm_add_4x16; -highbd_inv_txfm_add av1_highbd_inv_txfm_add_8x32; -highbd_inv_txfm_add av1_highbd_inv_txfm_add_32x8; - #ifdef __cplusplus } // extern "C" #endif diff --git a/media/libaom/src/av1/common/loopfiltermask.c b/media/libaom/src/av1/common/loopfiltermask.c new file mode 100644 index 000000000..157310f2d --- /dev/null +++ b/media/libaom/src/av1/common/loopfiltermask.c @@ -0,0 +1,1458 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <math.h> + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_mem/aom_mem.h" +#include "aom_ports/mem.h" +#include "av1/common/av1_common_int.h" +#include "av1/common/av1_loopfilter.h" +#include "av1/common/reconinter.h" +#include "av1/common/seg_common.h" + +// 256 bit masks (64x64 / 4x4) for left transform size for Y plane. +// We use 4 uint64_t to represent the 256 bit. +// Each 1 represents a position where we should apply a loop filter +// across the left border of an 4x4 block boundary. 
+// +// In the case of TX_8x8-> ( in low order byte first we end up with +// a mask that looks like this (-- and | are used for better view) +// +// 10101010|10101010 +// 10101010|10101010 +// 10101010|10101010 +// 10101010|10101010 +// 10101010|10101010 +// 10101010|10101010 +// 10101010|10101010 +// 10101010|10101010 +// ----------------- +// 10101010|10101010 +// 10101010|10101010 +// 10101010|10101010 +// 10101010|10101010 +// 10101010|10101010 +// 10101010|10101010 +// 10101010|10101010 +// 10101010|10101010 +// +// A loopfilter should be applied to every other 4x4 horizontally. + +// 256 bit masks (64x64 / 4x4) for above transform size for Y plane. +// We use 4 uint64_t to represent the 256 bit. +// Each 1 represents a position where we should apply a loop filter +// across the top border of an 4x4 block boundary. +// +// In the case of TX_8x8-> ( in low order byte first we end up with +// a mask that looks like this +// +// 11111111|11111111 +// 00000000|00000000 +// 11111111|11111111 +// 00000000|00000000 +// 11111111|11111111 +// 00000000|00000000 +// 11111111|11111111 +// 00000000|00000000 +// ----------------- +// 11111111|11111111 +// 00000000|00000000 +// 11111111|11111111 +// 00000000|00000000 +// 11111111|11111111 +// 00000000|00000000 +// 11111111|11111111 +// 00000000|00000000 +// +// A loopfilter should be applied to every other 4x4 horizontally. 
+#if CONFIG_LPF_MASK +static const int mask_id_table_tx_4x4[BLOCK_SIZES_ALL] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, -1, -1, -1, 13, 14, 15, 16, 17, 18 +}; + +static const int mask_id_table_tx_8x8[BLOCK_SIZES_ALL] = { + -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, 10, 11, 12, 13 +}; + +static const int mask_id_table_tx_16x16[BLOCK_SIZES_ALL] = { + -1, -1, -1, -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, -1, -1, -1, -1, -1, -1, -1, 7, 8 +}; + +static const int mask_id_table_tx_32x32[BLOCK_SIZES_ALL] = { + -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 1, + 2, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1 +}; +static const int mask_id_table_vert_border[BLOCK_SIZES_ALL] = { + 0, 47, 49, 19, 51, 53, 33, 55, 57, 42, 59, + 60, 46, -1, -1, -1, 61, 62, 63, 64, 65, 66 +}; + +static const FilterMask left_mask_univariant_reordered[67] = { + // TX_4X4 + { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 4X4, TX_4X4 + { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 4X8, TX_4X4 + { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X4, TX_4X4 + { { 0x0000000000030003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X8, TX_4X4 + { { 0x0003000300030003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X16, TX_4X4 + { { 0x00000000000f000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X8, TX_4X4 + { { 0x000f000f000f000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X16, TX_4X4 + { { 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X32, TX_4X4 + { { 0x00ff00ff00ff00ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, 
// block size 32X16, TX_4X4 + { { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X32, TX_4X4 + { { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, + 0x00ff00ff00ff00ffULL } }, // block size 32X64, TX_4X4 + { { 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X32, TX_4X4 + { { 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL, + 0xffffffffffffffffULL } }, // block size 64X64, TX_4X4 + { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 4X16, TX_4X4 + { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X4, TX_4X4 + { { 0x0003000300030003ULL, 0x0003000300030003ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X32, TX_4X4 + { { 0x0000000000ff00ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X8, TX_4X4 + { { 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x000f000f000f000fULL, + 0x000f000f000f000fULL } }, // block size 16X64, TX_4X4 + { { 0xffffffffffffffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X16, TX_4X4 + // TX_8X8 + { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X8, TX_8X8 + { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X16, TX_8X8 + { { 0x0000000000050005ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X8, TX_8X8 + { { 0x0005000500050005ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X16, TX_8X8 + { { 0x0005000500050005ULL, 0x0005000500050005ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 
16X32, TX_8X8 + { { 0x0055005500550055ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X16, TX_8X8 + { { 0x0055005500550055ULL, 0x0055005500550055ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X32, TX_8X8 + { { 0x0055005500550055ULL, 0x0055005500550055ULL, 0x0055005500550055ULL, + 0x0055005500550055ULL } }, // block size 32X64, TX_8X8 + { { 0x5555555555555555ULL, 0x5555555555555555ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X32, TX_8X8 + { { 0x5555555555555555ULL, 0x5555555555555555ULL, 0x5555555555555555ULL, + 0x5555555555555555ULL } }, // block size 64X64, TX_8X8 + { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X32, TX_8X8 + { { 0x0000000000550055ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X8, TX_8X8 + { { 0x0005000500050005ULL, 0x0005000500050005ULL, 0x0005000500050005ULL, + 0x0005000500050005ULL } }, // block size 16X64, TX_8X8 + { { 0x5555555555555555ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X16, TX_8X8 + // TX_16X16 + { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X16, TX_16X16 + { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X32, TX_16X16 + { { 0x0011001100110011ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X16, TX_16X16 + { { 0x0011001100110011ULL, 0x0011001100110011ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X32, TX_16X16 + { { 0x0011001100110011ULL, 0x0011001100110011ULL, 0x0011001100110011ULL, + 0x0011001100110011ULL } }, // block size 32X64, TX_16X16 + { { 0x1111111111111111ULL, 0x1111111111111111ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block 
size 64X32, TX_16X16 + { { 0x1111111111111111ULL, 0x1111111111111111ULL, 0x1111111111111111ULL, + 0x1111111111111111ULL } }, // block size 64X64, TX_16X16 + { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL, + 0x0001000100010001ULL } }, // block size 16X64, TX_16X16 + { { 0x1111111111111111ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X16, TX_16X16 + // TX_32X32 + { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X32, TX_32X32 + { { 0x0101010101010101ULL, 0x0101010101010101ULL, 0x0101010101010101ULL, + 0x0101010101010101ULL } }, // block size 32X64, TX_32X32 + { { 0x0101010101010101ULL, 0x0101010101010101ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X32, TX_32X32 + { { 0x0101010101010101ULL, 0x0101010101010101ULL, 0x0101010101010101ULL, + 0x0101010101010101ULL } }, // block size 64X64, TX_32X32 + // TX_64X64 + { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL, + 0x0001000100010001ULL } }, // block size 64X64, TX_64X64 + // 2:1, 1:2 transform sizes. 
+ { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 4X8, TX_4X8 + { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 4X16, TX_4X8 + { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X4, TX_8X4 + { { 0x0000000000000005ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X4, TX_8X4 + { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X16, TX_8X16 + { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X32, TX_8X16 + { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X8, TX_16X8 + { { 0x0000000000110011ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X8, TX_16X8 + { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X32, TX_16X32 + { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL, + 0x0001000100010001ULL } }, // block size 16X64, TX_16X32 + { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X16, TX_32X16 + { { 0x0101010101010101ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X16, TX_32X16 + { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL, + 0x0001000100010001ULL } }, // block size 32X64, TX_32X64 + { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X32, TX_64X32 + // 4:1, 1:4 transform sizes. 
+ { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 4X16, TX_4X16 + { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X4, TX_16X4 + { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X32, TX_8X32 + { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X8, TX_32X8 + { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL, + 0x0001000100010001ULL } }, // block size 16X64, TX_16X64 + { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X16, TX_64X16 +}; + +static const FilterMask above_mask_univariant_reordered[67] = { + // TX_4X4 + { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 4X4, TX_4X4 + { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 4X8, TX_4X4 + { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X4, TX_4X4 + { { 0x0000000000030003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X8, TX_4X4 + { { 0x0003000300030003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X16, TX_4X4 + { { 0x00000000000f000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X8, TX_4X4 + { { 0x000f000f000f000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X16, TX_4X4 + { { 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X32, TX_4X4 + { { 0x00ff00ff00ff00ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, 
+ 0x0000000000000000ULL } }, // block size 32X16, TX_4X4 + { { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X32, TX_4X4 + { { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, + 0x00ff00ff00ff00ffULL } }, // block size 32X64, TX_4X4 + { { 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X32, TX_4X4 + { { 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL, + 0xffffffffffffffffULL } }, // block size 64X64, TX_4x4 + { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 4X16, TX_4X4 + { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X4, TX_4X4 + { { 0x0003000300030003ULL, 0x0003000300030003ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X32, TX_4X4 + { { 0x0000000000ff00ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X8, TX_4X4 + { { 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x000f000f000f000fULL, + 0x000f000f000f000fULL } }, // block size 16X64, TX_4X4 + { { 0xffffffffffffffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X16, TX_4X4 + // TX_8X8 + { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X8, TX_8X8 + { { 0x0000000300000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X16, TX_8X8 + { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X8, TX_8X8 + { { 0x0000000f0000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X16, TX_8X8 + { { 0x0000000f0000000fULL, 0x0000000f0000000fULL, 0x0000000000000000ULL, + 
0x0000000000000000ULL } }, // block size 16X32, TX_8X8 + { { 0x000000ff000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X16, TX_8X8 + { { 0x000000ff000000ffULL, 0x000000ff000000ffULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X32, TX_8X8 + { { 0x000000ff000000ffULL, 0x000000ff000000ffULL, 0x000000ff000000ffULL, + 0x000000ff000000ffULL } }, // block size 32X64, TX_8X8 + { { 0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X32, TX_8X8 + { { 0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL, + 0x0000ffff0000ffffULL } }, // block size 64X64, TX_8X8 + { { 0x0000000300000003ULL, 0x0000000300000003ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X32, TX_8X8 + { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X8, TX_8X8 + { { 0x0000000f0000000fULL, 0x0000000f0000000fULL, 0x0000000f0000000fULL, + 0x0000000f0000000fULL } }, // block size 16X64, TX_8X8 + { { 0x0000ffff0000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X16, TX_8X8 + // TX_16X16 + { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X16, TX_16X16 + { { 0x000000000000000fULL, 0x000000000000000fULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X32, TX_16X16 + { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X16, TX_16X16 + { { 0x00000000000000ffULL, 0x00000000000000ffULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X32, TX_16X16 + { { 0x00000000000000ffULL, 0x00000000000000ffULL, 0x00000000000000ffULL, + 0x00000000000000ffULL } }, // block size 32X64, TX_16X16 + { { 0x000000000000ffffULL, 0x000000000000ffffULL, 0x0000000000000000ULL, 
+ 0x0000000000000000ULL } }, // block size 64X32, TX_16X16 + { { 0x000000000000ffffULL, 0x000000000000ffffULL, 0x000000000000ffffULL, + 0x000000000000ffffULL } }, // block size 64X64, TX_16X16 + { { 0x000000000000000fULL, 0x000000000000000fULL, 0x000000000000000fULL, + 0x000000000000000fULL } }, // block size 16X64, TX_16X16 + { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X16, TX_16X16 + // TX_32X32 + { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X32, TX_32X32 + { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x00000000000000ffULL, + 0x0000000000000000ULL } }, // block size 32X64, TX_32X32 + { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X32, TX_32X32 + { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x000000000000ffffULL, + 0x0000000000000000ULL } }, // block size 64X64, TX_32X32 + // TX_64X64 + { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X64, TX_64X64 + // 2:1, 1:2 transform sizes. 
+ { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 4X8, TX_4X8 + { { 0x0000000100000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 4X16, TX_4X8 + { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X4, TX_8X4 + { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X4, TX_8X4 + { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X16, TX_8X16 + { { 0x0000000000000003ULL, 0x0000000000000003ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X32, TX_8X16 + { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X8, TX_16X8 + { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X8, TX_16X8 + { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X32, TX_16X32 + { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x000000000000000fULL, + 0x0000000000000000ULL } }, // block size 16X64, TX_16X32 + { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X16, TX_32X16 + { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X16, TX_32X16 + { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X64, TX_32X64 + { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X32, TX_64X32 + // 4:1, 1:4 transform sizes. 
+ { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 4X16, TX_4X16 + { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X4, TX_16X4 + { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X32, TX_8X32 + { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X8, TX_32X8 + { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X64, TX_16X64 + { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X16, TX_64X16 +}; + +static LoopFilterMask *get_loop_filter_mask(const AV1_COMMON *const cm, + int mi_row, int mi_col) { + assert(cm->lf.lfm != NULL); + const int row = mi_row >> MIN_MIB_SIZE_LOG2; // 64x64 + const int col = mi_col >> MIN_MIB_SIZE_LOG2; + return &cm->lf.lfm[row * cm->lf.lfm_stride + col]; +} + +typedef void (*LpfFunc)(uint8_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh); + +typedef void (*LpfDualFunc)(uint8_t *s, int p, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, + const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1); + +typedef void (*HbdLpfFunc)(uint16_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh, int bd); + +typedef void (*HbdLpfDualFunc)(uint16_t *s, int p, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, + const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd); +// A 64x64 tx block requires 256 bits to represent each 4x4 tx block. +// Every 4 rows is represented by one uint64_t mask. Hence, +// there are 4 uint64_t bitmask[4] to represent the 64x64 block. 
+// +// Given a location by (mi_col, mi_row), This function returns the index +// 0, 1, 2, 3 to select which bitmask[] to use, and the shift value. +// +// For example, mi_row is the offset of pixels in mi size (4), +// (mi_row / 4) returns which uint64_t. +// After locating which uint64_t, mi_row % 4 is the +// row offset, and each row has 16 = 1 << stride_log2 4x4 units. +// Therefore, shift = (row << stride_log2) + mi_col; +int get_index_shift(int mi_col, int mi_row, int *index) { + // *index = mi_row >> 2; + // rows = mi_row % 4; + // stride_log2 = 4; + // shift = (rows << stride_log2) + mi_col; + *index = mi_row >> 2; + return ((mi_row & 3) << 4) | mi_col; +} + +static void filter_selectively_vert_row2( + int subsampling_factor, uint8_t *s, int pitch, int plane, + uint64_t mask_16x16_0, uint64_t mask_8x8_0, uint64_t mask_4x4_0, + uint64_t mask_16x16_1, uint64_t mask_8x8_1, uint64_t mask_4x4_1, + const loop_filter_info_n *lfi_n, uint8_t *lfl, uint8_t *lfl2) { + uint64_t mask; + const int step = 1 << subsampling_factor; + + for (mask = mask_16x16_0 | mask_8x8_0 | mask_4x4_0 | mask_16x16_1 | + mask_8x8_1 | mask_4x4_1; + mask; mask >>= step) { + const loop_filter_thresh *lfi0 = lfi_n->lfthr + *lfl; + const loop_filter_thresh *lfi1 = lfi_n->lfthr + *lfl2; + + if (mask & 1) { + if ((mask_16x16_0 | mask_16x16_1) & 1) { + // chroma plane filters less pixels introduced in deblock_13tap + // experiment + LpfFunc lpf_vertical = plane ? 
aom_lpf_vertical_6 : aom_lpf_vertical_14; + + if ((mask_16x16_0 & mask_16x16_1) & 1) { + if (plane) { + aom_lpf_vertical_6_dual(s, pitch, lfi0->mblim, lfi0->lim, + lfi0->hev_thr, lfi1->mblim, lfi1->lim, + lfi1->hev_thr); + } else { + aom_lpf_vertical_14_dual(s, pitch, lfi0->mblim, lfi0->lim, + lfi0->hev_thr, lfi1->mblim, lfi1->lim, + lfi1->hev_thr); + } + } else if (mask_16x16_0 & 1) { + lpf_vertical(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr); + } else { + lpf_vertical(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim, + lfi1->hev_thr); + } + } + + if ((mask_8x8_0 | mask_8x8_1) & 1) { + // chroma plane filters less pixels introduced in deblock_13tap + // experiment + LpfFunc lpf_vertical = plane ? aom_lpf_vertical_6 : aom_lpf_vertical_8; + + if ((mask_8x8_0 & mask_8x8_1) & 1) { + if (plane) { + aom_lpf_vertical_6_dual(s, pitch, lfi0->mblim, lfi0->lim, + lfi0->hev_thr, lfi1->mblim, lfi1->lim, + lfi1->hev_thr); + } else { + aom_lpf_vertical_8_dual(s, pitch, lfi0->mblim, lfi0->lim, + lfi0->hev_thr, lfi1->mblim, lfi1->lim, + lfi1->hev_thr); + } + } else if (mask_8x8_0 & 1) { + lpf_vertical(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr); + } else { + lpf_vertical(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim, + lfi1->hev_thr); + } + } + + if ((mask_4x4_0 | mask_4x4_1) & 1) { + if ((mask_4x4_0 & mask_4x4_1) & 1) { + aom_lpf_vertical_4_dual(s, pitch, lfi0->mblim, lfi0->lim, + lfi0->hev_thr, lfi1->mblim, lfi1->lim, + lfi1->hev_thr); + } else if (mask_4x4_0 & 1) { + aom_lpf_vertical_4(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr); + } else { + aom_lpf_vertical_4(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim, + lfi1->hev_thr); + } + } + } + + s += 4; + lfl += step; + lfl2 += step; + mask_16x16_0 >>= step; + mask_8x8_0 >>= step; + mask_4x4_0 >>= step; + mask_16x16_1 >>= step; + mask_8x8_1 >>= step; + mask_4x4_1 >>= step; + } +} + +#if CONFIG_AV1_HIGHBITDEPTH +static void highbd_filter_selectively_vert_row2( + int subsampling_factor, uint16_t *s, int pitch, int plane, + 
uint64_t mask_16x16_0, uint64_t mask_8x8_0, uint64_t mask_4x4_0, + uint64_t mask_16x16_1, uint64_t mask_8x8_1, uint64_t mask_4x4_1, + const loop_filter_info_n *lfi_n, uint8_t *lfl, uint8_t *lfl2, int bd) { + uint64_t mask; + const int step = 1 << subsampling_factor; + + for (mask = mask_16x16_0 | mask_8x8_0 | mask_4x4_0 | mask_16x16_1 | + mask_8x8_1 | mask_4x4_1; + mask; mask >>= step) { + const loop_filter_thresh *lfi0 = lfi_n->lfthr + *lfl; + const loop_filter_thresh *lfi1 = lfi_n->lfthr + *lfl2; + + if (mask & 1) { + if ((mask_16x16_0 | mask_16x16_1) & 1) { + // chroma plane filters less pixels introduced in deblock_13tap + // experiment + HbdLpfFunc highbd_lpf_vertical = + plane ? aom_highbd_lpf_vertical_6 : aom_highbd_lpf_vertical_14; + + if ((mask_16x16_0 & mask_16x16_1) & 1) { + if (plane) { + aom_highbd_lpf_vertical_6_dual(s, pitch, lfi0->mblim, lfi0->lim, + lfi0->hev_thr, lfi1->mblim, + lfi1->lim, lfi1->hev_thr, bd); + } else { + aom_highbd_lpf_vertical_14_dual(s, pitch, lfi0->mblim, lfi0->lim, + lfi0->hev_thr, lfi1->mblim, + lfi1->lim, lfi1->hev_thr, bd); + } + } else if (mask_16x16_0 & 1) { + highbd_lpf_vertical(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr, + bd); + } else { + highbd_lpf_vertical(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim, + lfi1->hev_thr, bd); + } + } + + if ((mask_8x8_0 | mask_8x8_1) & 1) { + HbdLpfFunc highbd_lpf_vertical = + plane ? 
aom_highbd_lpf_vertical_6 : aom_highbd_lpf_vertical_8; + + if ((mask_8x8_0 & mask_8x8_1) & 1) { + if (plane) { + aom_highbd_lpf_vertical_6_dual(s, pitch, lfi0->mblim, lfi0->lim, + lfi0->hev_thr, lfi1->mblim, + lfi1->lim, lfi1->hev_thr, bd); + } else { + aom_highbd_lpf_vertical_8_dual(s, pitch, lfi0->mblim, lfi0->lim, + lfi0->hev_thr, lfi1->mblim, + lfi1->lim, lfi1->hev_thr, bd); + } + } else if (mask_8x8_0 & 1) { + highbd_lpf_vertical(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr, + bd); + } else { + highbd_lpf_vertical(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim, + lfi1->hev_thr, bd); + } + } + + if ((mask_4x4_0 | mask_4x4_1) & 1) { + if ((mask_4x4_0 & mask_4x4_1) & 1) { + aom_highbd_lpf_vertical_4_dual(s, pitch, lfi0->mblim, lfi0->lim, + lfi0->hev_thr, lfi1->mblim, lfi1->lim, + lfi1->hev_thr, bd); + } else if (mask_4x4_0 & 1) { + aom_highbd_lpf_vertical_4(s, pitch, lfi0->mblim, lfi0->lim, + lfi0->hev_thr, bd); + } else { + aom_highbd_lpf_vertical_4(s + 4 * pitch, pitch, lfi1->mblim, + lfi1->lim, lfi1->hev_thr, bd); + } + } + } + + s += 4; + lfl += step; + lfl2 += step; + mask_16x16_0 >>= step; + mask_8x8_0 >>= step; + mask_4x4_0 >>= step; + mask_16x16_1 >>= step; + mask_8x8_1 >>= step; + mask_4x4_1 >>= step; + } +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +static void filter_selectively_horiz(uint8_t *s, int pitch, int plane, + int subsampling, uint64_t mask_16x16, + uint64_t mask_8x8, uint64_t mask_4x4, + const loop_filter_info_n *lfi_n, + const uint8_t *lfl) { + uint64_t mask; + int count; + const int step = 1 << subsampling; + const unsigned int two_block_mask = subsampling ? 5 : 3; + int offset = 0; + + for (mask = mask_16x16 | mask_8x8 | mask_4x4; mask; mask >>= step * count) { + const loop_filter_thresh *lfi = lfi_n->lfthr + *lfl; + // Next block's thresholds, when it is within current 64x64 block. + // If it is out of bound, its mask is zero, and it points to current edge's + // filter parameters, instead of next edge's. 
+ int next_edge = step; + if (offset + next_edge >= MI_SIZE_64X64) next_edge = 0; + const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + next_edge); + + count = 1; + if (mask & 1) { + if (mask_16x16 & 1) { + // chroma plane filters less pixels introduced in deblock_13tap + // experiment + LpfFunc lpf_horizontal = + plane ? aom_lpf_horizontal_6 : aom_lpf_horizontal_14; + + if ((mask_16x16 & two_block_mask) == two_block_mask) { + if (plane) { + aom_lpf_horizontal_6_dual(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, lfin->mblim, lfin->lim, + lfin->hev_thr); + } else { + aom_lpf_horizontal_14_dual(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, lfin->mblim, lfin->lim, + lfin->hev_thr); + } + count = 2; + } else { + lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr); + } + } else if (mask_8x8 & 1) { + // chroma plane filters less pixels introduced in deblock_13tap + // experiment + LpfFunc lpf_horizontal = + plane ? aom_lpf_horizontal_6 : aom_lpf_horizontal_8; + + if ((mask_8x8 & two_block_mask) == two_block_mask) { + if (plane) { + aom_lpf_horizontal_6_dual(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, lfin->mblim, lfin->lim, + lfin->hev_thr); + } else { + aom_lpf_horizontal_8_dual(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, lfin->mblim, lfin->lim, + lfin->hev_thr); + } + count = 2; + } else { + lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr); + } + } else if (mask_4x4 & 1) { + if ((mask_4x4 & two_block_mask) == two_block_mask) { + aom_lpf_horizontal_4_dual(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, lfin->mblim, lfin->lim, + lfin->hev_thr); + count = 2; + } else { + aom_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr); + } + } + } + + s += 4 * count; + lfl += step * count; + mask_16x16 >>= step * count; + mask_8x8 >>= step * count; + mask_4x4 >>= step * count; + offset += step * count; + } +} + +#if CONFIG_AV1_HIGHBITDEPTH +static void highbd_filter_selectively_horiz( + uint16_t *s, int pitch, int plane, int 
subsampling, uint64_t mask_16x16, + uint64_t mask_8x8, uint64_t mask_4x4, const loop_filter_info_n *lfi_n, + uint8_t *lfl, int bd) { + uint64_t mask; + int count; + const int step = 1 << subsampling; + const unsigned int two_block_mask = subsampling ? 5 : 3; + int offset = 0; + + for (mask = mask_16x16 | mask_8x8 | mask_4x4; mask; mask >>= step * count) { + const loop_filter_thresh *lfi = lfi_n->lfthr + *lfl; + // Next block's thresholds, when it is within current 64x64 block. + // If it is out of bound, its mask is zero, and it points to current edge's + // filter parameters, instead of next edge's. + int next_edge = step; + if (offset + next_edge >= MI_SIZE_64X64) next_edge = 0; + const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + next_edge); + + count = 1; + if (mask & 1) { + if (mask_16x16 & 1) { + HbdLpfFunc highbd_lpf_horizontal = + plane ? aom_highbd_lpf_horizontal_6 : aom_highbd_lpf_horizontal_14; + + if ((mask_16x16 & two_block_mask) == two_block_mask) { + if (plane) { + aom_highbd_lpf_horizontal_6_dual_c(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, lfin->mblim, + lfin->lim, lfin->hev_thr, bd); + } else { + aom_highbd_lpf_horizontal_14_dual(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, lfin->mblim, + lfin->lim, lfin->hev_thr, bd); + } + count = 2; + } else { + highbd_lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, + bd); + } + } else if (mask_8x8 & 1) { + HbdLpfFunc highbd_lpf_horizontal = + plane ? 
aom_highbd_lpf_horizontal_6 : aom_highbd_lpf_horizontal_8; + + if ((mask_8x8 & two_block_mask) == two_block_mask) { + if (plane) { + aom_highbd_lpf_horizontal_6_dual_c(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, lfin->mblim, + lfin->lim, lfin->hev_thr, bd); + } else { + aom_highbd_lpf_horizontal_8_dual_c(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, lfin->mblim, + lfin->lim, lfin->hev_thr, bd); + } + count = 2; + } else { + highbd_lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, + bd); + } + } else if (mask_4x4 & 1) { + if ((mask_4x4 & two_block_mask) == two_block_mask) { + aom_highbd_lpf_horizontal_4_dual_c(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, lfin->mblim, + lfin->lim, lfin->hev_thr, bd); + count = 2; + } else { + aom_highbd_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, bd); + } + } + } + + s += 4 * count; + lfl += step * count; + mask_16x16 >>= step * count; + mask_8x8 >>= step * count; + mask_4x4 >>= step * count; + offset += step * count; + } +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +void av1_build_bitmask_vert_info( + AV1_COMMON *const cm, const struct macroblockd_plane *const plane_ptr, + int plane) { + const int subsampling_x = plane_ptr->subsampling_x; + const int subsampling_y = plane_ptr->subsampling_y; + const int is_uv = plane > 0; + TX_SIZE tx_size = TX_16X16, prev_tx_size = TX_16X16; + uint8_t level, prev_level = 1; + uint64_t skip, prev_skip = 0; + uint64_t is_coding_block_border; + + for (int r = 0; (r << MI_SIZE_LOG2) < plane_ptr->dst.height; r++) { + const int mi_row = r << subsampling_y; + const int row = mi_row % MI_SIZE_64X64; + const int row_uv = row | subsampling_y; + int index = 0; + const int shift = get_index_shift(0, row, &index); + + for (int c = 0; (c << MI_SIZE_LOG2) < plane_ptr->dst.width; + c += (tx_size_wide_unit[TX_64X64] >> subsampling_x)) { + const int mi_col = c << subsampling_x; + LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col); + + for (int col_in_unit = 0; + 
col_in_unit < (tx_size_wide_unit[TX_64X64] >> subsampling_x);) { + const int x = (c + col_in_unit) << MI_SIZE_LOG2; + if (x >= plane_ptr->dst.width) break; + const int col = col_in_unit << subsampling_x; + const int col_uv = col | subsampling_x; + const uint64_t mask = ((uint64_t)1 << (shift | col)); + skip = lfm->skip.bits[index] & mask; + is_coding_block_border = lfm->is_vert_border.bits[index] & mask; + switch (plane) { + case 0: level = lfm->lfl_y_ver[row_uv][col_uv]; break; + case 1: level = lfm->lfl_u_ver[row_uv][col_uv]; break; + case 2: level = lfm->lfl_v_ver[row_uv][col_uv]; break; + default: assert(plane >= 0 && plane <= 2); return; + } + for (TX_SIZE ts = TX_4X4; ts <= TX_64X64; ++ts) { + if (is_uv && ts == TX_64X64) continue; + if (lfm->tx_size_ver[is_uv][ts].bits[index] & mask) { + tx_size = ts; + break; + } + } + if ((c + col_in_unit > 0) && (level || prev_level) && + (!prev_skip || !skip || is_coding_block_border)) { + const TX_SIZE min_tx_size = + AOMMIN(TX_16X16, AOMMIN(tx_size, prev_tx_size)); + const int shift_1 = get_index_shift(col_uv, row_uv, &index); + const uint64_t mask_1 = ((uint64_t)1 << shift_1); + switch (plane) { + case 0: lfm->left_y[min_tx_size].bits[index] |= mask_1; break; + case 1: lfm->left_u[min_tx_size].bits[index] |= mask_1; break; + case 2: lfm->left_v[min_tx_size].bits[index] |= mask_1; break; + default: assert(plane >= 0 && plane <= 2); return; + } + if (level == 0 && prev_level != 0) { + switch (plane) { + case 0: lfm->lfl_y_ver[row_uv][col_uv] = prev_level; break; + case 1: lfm->lfl_u_ver[row_uv][col_uv] = prev_level; break; + case 2: lfm->lfl_v_ver[row_uv][col_uv] = prev_level; break; + default: assert(plane >= 0 && plane <= 2); return; + } + } + } + + // update prev info + prev_level = level; + prev_skip = skip; + prev_tx_size = tx_size; + // advance + col_in_unit += tx_size_wide_unit[tx_size]; + } + } + } +} + +void av1_build_bitmask_horz_info( + AV1_COMMON *const cm, const struct macroblockd_plane *const plane_ptr, + 
int plane) { + const int subsampling_x = plane_ptr->subsampling_x; + const int subsampling_y = plane_ptr->subsampling_y; + const int is_uv = plane > 0; + TX_SIZE tx_size = TX_16X16, prev_tx_size = TX_16X16; + uint8_t level, prev_level = 1; + uint64_t skip, prev_skip = 0; + uint64_t is_coding_block_border; + + for (int c = 0; (c << MI_SIZE_LOG2) < plane_ptr->dst.width; c++) { + const int mi_col = c << subsampling_x; + const int col = mi_col % MI_SIZE_64X64; + const int col_uv = col | subsampling_x; + + for (int r = 0; (r << MI_SIZE_LOG2) < plane_ptr->dst.height; + r += (tx_size_high_unit[TX_64X64] >> subsampling_y)) { + const int mi_row = r << subsampling_y; + LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col); + + for (int r_in_unit = 0; + r_in_unit < (tx_size_high_unit[TX_64X64] >> subsampling_y);) { + const int y = (r + r_in_unit) << MI_SIZE_LOG2; + if (y >= plane_ptr->dst.height) break; + const int row = r_in_unit << subsampling_y; + const int row_uv = row | subsampling_y; + int index = 0; + const int shift = get_index_shift(col, row, &index); + const uint64_t mask = ((uint64_t)1 << shift); + skip = lfm->skip.bits[index] & mask; + is_coding_block_border = lfm->is_horz_border.bits[index] & mask; + switch (plane) { + case 0: level = lfm->lfl_y_hor[row_uv][col_uv]; break; + case 1: level = lfm->lfl_u_hor[row_uv][col_uv]; break; + case 2: level = lfm->lfl_v_hor[row_uv][col_uv]; break; + default: assert(plane >= 0 && plane <= 2); return; + } + for (TX_SIZE ts = TX_4X4; ts <= TX_64X64; ++ts) { + if (is_uv && ts == TX_64X64) continue; + if (lfm->tx_size_hor[is_uv][ts].bits[index] & mask) { + tx_size = ts; + break; + } + } + if ((r + r_in_unit > 0) && (level || prev_level) && + (!prev_skip || !skip || is_coding_block_border)) { + const TX_SIZE min_tx_size = + AOMMIN(TX_16X16, AOMMIN(tx_size, prev_tx_size)); + const int shift_1 = get_index_shift(col_uv, row_uv, &index); + const uint64_t mask_1 = ((uint64_t)1 << shift_1); + + switch (plane) { + case 0: 
lfm->above_y[min_tx_size].bits[index] |= mask_1; break; + case 1: lfm->above_u[min_tx_size].bits[index] |= mask_1; break; + case 2: lfm->above_v[min_tx_size].bits[index] |= mask_1; break; + default: assert(plane >= 0 && plane <= 2); return; + } + if (level == 0 && prev_level != 0) { + switch (plane) { + case 0: lfm->lfl_y_hor[row_uv][col_uv] = prev_level; break; + case 1: lfm->lfl_u_hor[row_uv][col_uv] = prev_level; break; + case 2: lfm->lfl_v_hor[row_uv][col_uv] = prev_level; break; + default: assert(plane >= 0 && plane <= 2); return; + } + } + } + + // update prev info + prev_level = level; + prev_skip = skip; + prev_tx_size = tx_size; + // advance + r_in_unit += tx_size_high_unit[tx_size]; + } + } + } +} + +void av1_filter_block_plane_bitmask_vert( + AV1_COMMON *const cm, struct macroblockd_plane *const plane_ptr, int pl, + int mi_row, int mi_col) { + struct buf_2d *const dst = &plane_ptr->dst; + uint8_t *const buf0 = dst->buf; + const int ssx = plane_ptr->subsampling_x; + const int ssy = plane_ptr->subsampling_y; + const int mask_cutoff = 0xffff; + const int row_step = 1 << ssy; + const int two_row_step = 2 << ssy; + const int row_stride = dst->stride << MI_SIZE_LOG2; + const int two_row_stride = row_stride << 1; + uint64_t mask_16x16 = 0; + uint64_t mask_8x8 = 0; + uint64_t mask_4x4 = 0; + uint8_t *lfl; + uint8_t *lfl2; + LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col); + assert(lfm); + + // 1. vertical filtering. 
filter two rows at a time + for (int r = 0; + ((mi_row + r) << MI_SIZE_LOG2) < cm->height && r < MI_SIZE_64X64; + r += two_row_step) { + const int row = r | ssy; + const int row_next = row + row_step; + const int col = ssx; + int index = 0; + const int shift = get_index_shift(col, row, &index); + int index_next = 0; + const int shift_next = get_index_shift(col, row_next, &index_next); + const int has_next_row = row_next < cm->mi_params.mi_rows; + switch (pl) { + case 0: + mask_16x16 = lfm->left_y[TX_16X16].bits[index]; + mask_8x8 = lfm->left_y[TX_8X8].bits[index]; + mask_4x4 = lfm->left_y[TX_4X4].bits[index]; + lfl = &lfm->lfl_y_ver[row][col]; + lfl2 = &lfm->lfl_y_ver[row_next][col]; + break; + case 1: + mask_16x16 = lfm->left_u[TX_16X16].bits[index]; + mask_8x8 = lfm->left_u[TX_8X8].bits[index]; + mask_4x4 = lfm->left_u[TX_4X4].bits[index]; + lfl = &lfm->lfl_u_ver[row][col]; + lfl2 = &lfm->lfl_u_ver[row_next][col]; + break; + case 2: + mask_16x16 = lfm->left_v[TX_16X16].bits[index]; + mask_8x8 = lfm->left_v[TX_8X8].bits[index]; + mask_4x4 = lfm->left_v[TX_4X4].bits[index]; + lfl = &lfm->lfl_v_ver[row][col]; + lfl2 = &lfm->lfl_v_ver[row_next][col]; + break; + default: assert(pl >= 0 && pl <= 2); return; + } + uint64_t mask_16x16_0 = (mask_16x16 >> shift) & mask_cutoff; + uint64_t mask_8x8_0 = (mask_8x8 >> shift) & mask_cutoff; + uint64_t mask_4x4_0 = (mask_4x4 >> shift) & mask_cutoff; + uint64_t mask_16x16_1 = (mask_16x16 >> shift_next) & mask_cutoff; + uint64_t mask_8x8_1 = (mask_8x8 >> shift_next) & mask_cutoff; + uint64_t mask_4x4_1 = (mask_4x4 >> shift_next) & mask_cutoff; + if (!has_next_row) { + mask_16x16_1 = 0; + mask_8x8_1 = 0; + mask_4x4_1 = 0; + } + +#if CONFIG_AV1_HIGHBITDEPTH + if (cm->seq_params.use_highbitdepth) + highbd_filter_selectively_vert_row2( + ssx, CONVERT_TO_SHORTPTR(dst->buf), dst->stride, pl, mask_16x16_0, + mask_8x8_0, mask_4x4_0, mask_16x16_1, mask_8x8_1, mask_4x4_1, + &cm->lf_info, lfl, lfl2, (int)cm->seq_params.bit_depth); + else + 
filter_selectively_vert_row2( + ssx, dst->buf, dst->stride, pl, mask_16x16_0, mask_8x8_0, mask_4x4_0, + mask_16x16_1, mask_8x8_1, mask_4x4_1, &cm->lf_info, lfl, lfl2); +#else + filter_selectively_vert_row2( + ssx, dst->buf, dst->stride, pl, mask_16x16_0, mask_8x8_0, mask_4x4_0, + mask_16x16_1, mask_8x8_1, mask_4x4_1, &cm->lf_info, lfl, lfl2); +#endif + dst->buf += two_row_stride; + } + // reset buf pointer for horizontal filtering + dst->buf = buf0; +} + +void av1_filter_block_plane_bitmask_horz( + AV1_COMMON *const cm, struct macroblockd_plane *const plane_ptr, int pl, + int mi_row, int mi_col) { + struct buf_2d *const dst = &plane_ptr->dst; + uint8_t *const buf0 = dst->buf; + const int ssx = plane_ptr->subsampling_x; + const int ssy = plane_ptr->subsampling_y; + const int mask_cutoff = 0xffff; + const int row_step = 1 << ssy; + const int row_stride = dst->stride << MI_SIZE_LOG2; + uint64_t mask_16x16 = 0; + uint64_t mask_8x8 = 0; + uint64_t mask_4x4 = 0; + uint8_t *lfl; + LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col); + assert(lfm); + for (int r = 0; + ((mi_row + r) << MI_SIZE_LOG2) < cm->height && r < MI_SIZE_64X64; + r += row_step) { + if (mi_row + r == 0) { + dst->buf += row_stride; + continue; + } + const int row = r | ssy; + const int col = ssx; + int index = 0; + const int shift = get_index_shift(col, row, &index); + switch (pl) { + case 0: + mask_16x16 = lfm->above_y[TX_16X16].bits[index]; + mask_8x8 = lfm->above_y[TX_8X8].bits[index]; + mask_4x4 = lfm->above_y[TX_4X4].bits[index]; + lfl = &lfm->lfl_y_hor[row][col]; + break; + case 1: + mask_16x16 = lfm->above_u[TX_16X16].bits[index]; + mask_8x8 = lfm->above_u[TX_8X8].bits[index]; + mask_4x4 = lfm->above_u[TX_4X4].bits[index]; + lfl = &lfm->lfl_u_hor[row][col]; + break; + case 2: + mask_16x16 = lfm->above_v[TX_16X16].bits[index]; + mask_8x8 = lfm->above_v[TX_8X8].bits[index]; + mask_4x4 = lfm->above_v[TX_4X4].bits[index]; + lfl = &lfm->lfl_v_hor[row][col]; + break; + default: assert(pl >= 
0 && pl <= 2); return; + } + mask_16x16 = (mask_16x16 >> shift) & mask_cutoff; + mask_8x8 = (mask_8x8 >> shift) & mask_cutoff; + mask_4x4 = (mask_4x4 >> shift) & mask_cutoff; + +#if CONFIG_AV1_HIGHBITDEPTH + if (cm->seq_params.use_highbitdepth) + highbd_filter_selectively_horiz( + CONVERT_TO_SHORTPTR(dst->buf), dst->stride, pl, ssx, mask_16x16, + mask_8x8, mask_4x4, &cm->lf_info, lfl, (int)cm->seq_params.bit_depth); + else + filter_selectively_horiz(dst->buf, dst->stride, pl, ssx, mask_16x16, + mask_8x8, mask_4x4, &cm->lf_info, lfl); +#else + filter_selectively_horiz(dst->buf, dst->stride, pl, ssx, mask_16x16, + mask_8x8, mask_4x4, &cm->lf_info, lfl); +#endif + dst->buf += row_stride; + } + // reset buf pointer for next block + dst->buf = buf0; +} + +void av1_filter_block_plane_ver(AV1_COMMON *const cm, + struct macroblockd_plane *const plane_ptr, + int pl, int mi_row, int mi_col) { + struct buf_2d *const dst = &plane_ptr->dst; + int r, c; + const int ssx = plane_ptr->subsampling_x; + const int ssy = plane_ptr->subsampling_y; + const int mask_cutoff = 0xffff; + const int single_step = 1 << ssy; + const int r_step = 2 << ssy; + uint64_t mask_16x16 = 0; + uint64_t mask_8x8 = 0; + uint64_t mask_4x4 = 0; + uint8_t *lfl; + uint8_t *lfl2; + + // filter two rows at a time + for (r = 0; r < cm->seq_params.mib_size && + ((mi_row + r) << MI_SIZE_LOG2 < cm->height); + r += r_step) { + for (c = 0; c < cm->seq_params.mib_size && + ((mi_col + c) << MI_SIZE_LOG2 < cm->width); + c += MI_SIZE_64X64) { + dst->buf += ((c << MI_SIZE_LOG2) >> ssx); + LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row + r, mi_col + c); + assert(lfm); + const int row = ((mi_row + r) | ssy) % MI_SIZE_64X64; + const int col = ((mi_col + c) | ssx) % MI_SIZE_64X64; + int index = 0; + const int shift = get_index_shift(col, row, &index); + // current and next row should belong to the same mask_idx and index + // next row's shift + const int row_next = row + single_step; + int index_next = 0; + const int 
shift_next = get_index_shift(col, row_next, &index_next); + switch (pl) { + case 0: + mask_16x16 = lfm->left_y[TX_16X16].bits[index]; + mask_8x8 = lfm->left_y[TX_8X8].bits[index]; + mask_4x4 = lfm->left_y[TX_4X4].bits[index]; + lfl = &lfm->lfl_y_ver[row][col]; + lfl2 = &lfm->lfl_y_ver[row_next][col]; + break; + case 1: + mask_16x16 = lfm->left_u[TX_16X16].bits[index]; + mask_8x8 = lfm->left_u[TX_8X8].bits[index]; + mask_4x4 = lfm->left_u[TX_4X4].bits[index]; + lfl = &lfm->lfl_u_ver[row][col]; + lfl2 = &lfm->lfl_u_ver[row_next][col]; + break; + case 2: + mask_16x16 = lfm->left_v[TX_16X16].bits[index]; + mask_8x8 = lfm->left_v[TX_8X8].bits[index]; + mask_4x4 = lfm->left_v[TX_4X4].bits[index]; + lfl = &lfm->lfl_v_ver[row][col]; + lfl2 = &lfm->lfl_v_ver[row_next][col]; + break; + default: assert(pl >= 0 && pl <= 2); return; + } + uint64_t mask_16x16_0 = (mask_16x16 >> shift) & mask_cutoff; + uint64_t mask_8x8_0 = (mask_8x8 >> shift) & mask_cutoff; + uint64_t mask_4x4_0 = (mask_4x4 >> shift) & mask_cutoff; + uint64_t mask_16x16_1 = (mask_16x16 >> shift_next) & mask_cutoff; + uint64_t mask_8x8_1 = (mask_8x8 >> shift_next) & mask_cutoff; + uint64_t mask_4x4_1 = (mask_4x4 >> shift_next) & mask_cutoff; + +#if CONFIG_AV1_HIGHBITDEPTH + if (cm->seq_params.use_highbitdepth) + highbd_filter_selectively_vert_row2( + ssx, CONVERT_TO_SHORTPTR(dst->buf), dst->stride, pl, mask_16x16_0, + mask_8x8_0, mask_4x4_0, mask_16x16_1, mask_8x8_1, mask_4x4_1, + &cm->lf_info, lfl, lfl2, (int)cm->seq_params.bit_depth); + else + filter_selectively_vert_row2(ssx, dst->buf, dst->stride, pl, + mask_16x16_0, mask_8x8_0, mask_4x4_0, + mask_16x16_1, mask_8x8_1, mask_4x4_1, + &cm->lf_info, lfl, lfl2); +#else + filter_selectively_vert_row2( + ssx, dst->buf, dst->stride, pl, mask_16x16_0, mask_8x8_0, mask_4x4_0, + mask_16x16_1, mask_8x8_1, mask_4x4_1, &cm->lf_info, lfl, lfl2); +#endif + dst->buf -= ((c << MI_SIZE_LOG2) >> ssx); + } + dst->buf += 2 * MI_SIZE * dst->stride; + } +} + +void 
av1_filter_block_plane_hor(AV1_COMMON *const cm, + struct macroblockd_plane *const plane_ptr, + int pl, int mi_row, int mi_col) { + struct buf_2d *const dst = &plane_ptr->dst; + int r, c; + const int ssx = plane_ptr->subsampling_x; + const int ssy = plane_ptr->subsampling_y; + const int mask_cutoff = 0xffff; + const int r_step = 1 << ssy; + uint64_t mask_16x16 = 0; + uint64_t mask_8x8 = 0; + uint64_t mask_4x4 = 0; + uint8_t *lfl; + + for (r = 0; r < cm->seq_params.mib_size && + ((mi_row + r) << MI_SIZE_LOG2 < cm->height); + r += r_step) { + for (c = 0; c < cm->seq_params.mib_size && + ((mi_col + c) << MI_SIZE_LOG2 < cm->width); + c += MI_SIZE_64X64) { + if (mi_row + r == 0) continue; + + dst->buf += ((c << MI_SIZE_LOG2) >> ssx); + LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row + r, mi_col + c); + assert(lfm); + const int row = ((mi_row + r) | ssy) % MI_SIZE_64X64; + const int col = ((mi_col + c) | ssx) % MI_SIZE_64X64; + int index = 0; + const int shift = get_index_shift(col, row, &index); + switch (pl) { + case 0: + mask_16x16 = lfm->above_y[TX_16X16].bits[index]; + mask_8x8 = lfm->above_y[TX_8X8].bits[index]; + mask_4x4 = lfm->above_y[TX_4X4].bits[index]; + lfl = &lfm->lfl_y_hor[row][col]; + break; + case 1: + mask_16x16 = lfm->above_u[TX_16X16].bits[index]; + mask_8x8 = lfm->above_u[TX_8X8].bits[index]; + mask_4x4 = lfm->above_u[TX_4X4].bits[index]; + lfl = &lfm->lfl_u_hor[row][col]; + break; + case 2: + mask_16x16 = lfm->above_v[TX_16X16].bits[index]; + mask_8x8 = lfm->above_v[TX_8X8].bits[index]; + mask_4x4 = lfm->above_v[TX_4X4].bits[index]; + lfl = &lfm->lfl_v_hor[row][col]; + break; + default: assert(pl >= 0 && pl <= 2); return; + } + mask_16x16 = (mask_16x16 >> shift) & mask_cutoff; + mask_8x8 = (mask_8x8 >> shift) & mask_cutoff; + mask_4x4 = (mask_4x4 >> shift) & mask_cutoff; + +#if CONFIG_AV1_HIGHBITDEPTH + if (cm->seq_params.use_highbitdepth) + highbd_filter_selectively_horiz(CONVERT_TO_SHORTPTR(dst->buf), + dst->stride, pl, ssx, mask_16x16, + 
mask_8x8, mask_4x4, &cm->lf_info, lfl, + (int)cm->seq_params.bit_depth); + else + filter_selectively_horiz(dst->buf, dst->stride, pl, ssx, mask_16x16, + mask_8x8, mask_4x4, &cm->lf_info, lfl); +#else + filter_selectively_horiz(dst->buf, dst->stride, pl, ssx, mask_16x16, + mask_8x8, mask_4x4, &cm->lf_info, lfl); +#endif + dst->buf -= ((c << MI_SIZE_LOG2) >> ssx); + } + dst->buf += MI_SIZE * dst->stride; + } +} + +void av1_store_bitmask_vartx(AV1_COMMON *cm, int mi_row, int mi_col, + BLOCK_SIZE bsize, TX_SIZE tx_size, + MB_MODE_INFO *mbmi) { + LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col); + const TX_SIZE tx_size_y_vert = txsize_vert_map[tx_size]; + const TX_SIZE tx_size_y_horz = txsize_horz_map[tx_size]; + const TX_SIZE tx_size_uv_vert = txsize_vert_map[av1_get_max_uv_txsize( + mbmi->sb_type, cm->seq_params.subsampling_x, + cm->seq_params.subsampling_y)]; + const TX_SIZE tx_size_uv_horz = txsize_horz_map[av1_get_max_uv_txsize( + mbmi->sb_type, cm->seq_params.subsampling_x, + cm->seq_params.subsampling_y)]; + const int is_square_transform_size = tx_size <= TX_64X64; + int mask_id = 0; + int offset = 0; + const int half_ratio_tx_size_max32 = + (tx_size > TX_64X64) & (tx_size <= TX_32X16); + if (is_square_transform_size) { + switch (tx_size) { + case TX_4X4: mask_id = mask_id_table_tx_4x4[bsize]; break; + case TX_8X8: + mask_id = mask_id_table_tx_8x8[bsize]; + offset = 19; + break; + case TX_16X16: + mask_id = mask_id_table_tx_16x16[bsize]; + offset = 33; + break; + case TX_32X32: + mask_id = mask_id_table_tx_32x32[bsize]; + offset = 42; + break; + case TX_64X64: mask_id = 46; break; + default: assert(!is_square_transform_size); return; + } + mask_id += offset; + } else if (half_ratio_tx_size_max32) { + int tx_size_equal_block_size = bsize == txsize_to_bsize[tx_size]; + mask_id = 47 + 2 * (tx_size - TX_4X8) + (tx_size_equal_block_size ? 
0 : 1); + } else if (tx_size == TX_32X64) { + mask_id = 59; + } else if (tx_size == TX_64X32) { + mask_id = 60; + } else { // quarter ratio tx size + mask_id = 61 + (tx_size - TX_4X16); + } + int index = 0; + const int row = mi_row % MI_SIZE_64X64; + const int col = mi_col % MI_SIZE_64X64; + const int shift = get_index_shift(col, row, &index); + const int vert_shift = tx_size_y_vert <= TX_8X8 ? shift : col; + for (int i = 0; i + index < 4; ++i) { + // y vertical. + lfm->tx_size_ver[0][tx_size_y_horz].bits[i + index] |= + (left_mask_univariant_reordered[mask_id].bits[i] << vert_shift); + // y horizontal. + lfm->tx_size_hor[0][tx_size_y_vert].bits[i + index] |= + (above_mask_univariant_reordered[mask_id].bits[i] << shift); + // u/v vertical. + lfm->tx_size_ver[1][tx_size_uv_horz].bits[i + index] |= + (left_mask_univariant_reordered[mask_id].bits[i] << vert_shift); + // u/v horizontal. + lfm->tx_size_hor[1][tx_size_uv_vert].bits[i + index] |= + (above_mask_univariant_reordered[mask_id].bits[i] << shift); + } +} + +void av1_store_bitmask_univariant_tx(AV1_COMMON *cm, int mi_row, int mi_col, + BLOCK_SIZE bsize, MB_MODE_INFO *mbmi) { + // Use a lookup table that provides one bitmask for a given block size and + // a univariant transform size. 
+ int index; + int shift; + int row; + int col; + LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col); + const TX_SIZE tx_size_y_vert = txsize_vert_map[mbmi->tx_size]; + const TX_SIZE tx_size_y_horz = txsize_horz_map[mbmi->tx_size]; + const TX_SIZE tx_size_uv_vert = txsize_vert_map[av1_get_max_uv_txsize( + mbmi->sb_type, cm->seq_params.subsampling_x, + cm->seq_params.subsampling_y)]; + const TX_SIZE tx_size_uv_horz = txsize_horz_map[av1_get_max_uv_txsize( + mbmi->sb_type, cm->seq_params.subsampling_x, + cm->seq_params.subsampling_y)]; + const int is_square_transform_size = mbmi->tx_size <= TX_64X64; + int mask_id = 0; + int offset = 0; + const int half_ratio_tx_size_max32 = + (mbmi->tx_size > TX_64X64) & (mbmi->tx_size <= TX_32X16); + if (is_square_transform_size) { + switch (mbmi->tx_size) { + case TX_4X4: mask_id = mask_id_table_tx_4x4[bsize]; break; + case TX_8X8: + mask_id = mask_id_table_tx_8x8[bsize]; + offset = 19; + break; + case TX_16X16: + mask_id = mask_id_table_tx_16x16[bsize]; + offset = 33; + break; + case TX_32X32: + mask_id = mask_id_table_tx_32x32[bsize]; + offset = 42; + break; + case TX_64X64: mask_id = 46; break; + default: assert(!is_square_transform_size); return; + } + mask_id += offset; + } else if (half_ratio_tx_size_max32) { + int tx_size_equal_block_size = bsize == txsize_to_bsize[mbmi->tx_size]; + mask_id = + 47 + 2 * (mbmi->tx_size - TX_4X8) + (tx_size_equal_block_size ? 0 : 1); + } else if (mbmi->tx_size == TX_32X64) { + mask_id = 59; + } else if (mbmi->tx_size == TX_64X32) { + mask_id = 60; + } else { // quarter ratio tx size + mask_id = 61 + (mbmi->tx_size - TX_4X16); + } + row = mi_row % MI_SIZE_64X64; + col = mi_col % MI_SIZE_64X64; + shift = get_index_shift(col, row, &index); + const int vert_shift = tx_size_y_vert <= TX_8X8 ? shift : col; + for (int i = 0; i + index < 4; ++i) { + // y vertical. 
+ lfm->tx_size_ver[0][tx_size_y_horz].bits[i + index] |= + (left_mask_univariant_reordered[mask_id].bits[i] << vert_shift); + // y horizontal. + lfm->tx_size_hor[0][tx_size_y_vert].bits[i + index] |= + (above_mask_univariant_reordered[mask_id].bits[i] << shift); + // u/v vertical. + lfm->tx_size_ver[1][tx_size_uv_horz].bits[i + index] |= + (left_mask_univariant_reordered[mask_id].bits[i] << vert_shift); + // u/v horizontal. + lfm->tx_size_hor[1][tx_size_uv_vert].bits[i + index] |= + (above_mask_univariant_reordered[mask_id].bits[i] << shift); + } +} + +void av1_store_bitmask_other_info(AV1_COMMON *cm, int mi_row, int mi_col, + BLOCK_SIZE bsize, MB_MODE_INFO *mbmi, + int is_horz_coding_block_border, + int is_vert_coding_block_border) { + int index; + int shift; + int row; + LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col); + const int row_start = mi_row % MI_SIZE_64X64; + const int col_start = mi_col % MI_SIZE_64X64; + shift = get_index_shift(col_start, row_start, &index); + if (is_horz_coding_block_border) { + const int block_shift = shift + mi_size_wide[bsize]; + assert(block_shift <= 64); + const uint64_t right_edge_shift = + (block_shift == 64) ? 0xffffffffffffffff : ((uint64_t)1 << block_shift); + const uint64_t left_edge_shift = (block_shift == 64) + ? (((uint64_t)1 << shift) - 1) + : ((uint64_t)1 << shift); + assert(right_edge_shift > left_edge_shift); + const uint64_t top_edge_mask = right_edge_shift - left_edge_shift; + lfm->is_horz_border.bits[index] |= top_edge_mask; + } + if (is_vert_coding_block_border) { + const int is_vert_border = mask_id_table_vert_border[bsize]; + const int vert_shift = block_size_high[bsize] <= 8 ? 
shift : col_start; + for (int i = 0; i + index < 4; ++i) { + lfm->is_vert_border.bits[i + index] |= + (left_mask_univariant_reordered[is_vert_border].bits[i] + << vert_shift); + } + } + const int is_skip = mbmi->skip && is_inter_block(mbmi); + if (is_skip) { + const int is_skip_mask = mask_id_table_tx_4x4[bsize]; + for (int i = 0; i + index < 4; ++i) { + lfm->skip.bits[i + index] |= + (above_mask_univariant_reordered[is_skip_mask].bits[i] << shift); + } + } + const uint8_t level_vert_y = + av1_get_filter_level(cm, &cm->lf_info, 0, 0, mbmi); + const uint8_t level_horz_y = + av1_get_filter_level(cm, &cm->lf_info, 1, 0, mbmi); + const uint8_t level_u = av1_get_filter_level(cm, &cm->lf_info, 0, 1, mbmi); + const uint8_t level_v = av1_get_filter_level(cm, &cm->lf_info, 0, 2, mbmi); + for (int r = mi_row; r < mi_row + mi_size_high[bsize]; r++) { + index = 0; + row = r % MI_SIZE_64X64; + memset(&lfm->lfl_y_ver[row][col_start], level_vert_y, + sizeof(uint8_t) * mi_size_wide[bsize]); + memset(&lfm->lfl_y_hor[row][col_start], level_horz_y, + sizeof(uint8_t) * mi_size_wide[bsize]); + memset(&lfm->lfl_u_ver[row][col_start], level_u, + sizeof(uint8_t) * mi_size_wide[bsize]); + memset(&lfm->lfl_u_hor[row][col_start], level_u, + sizeof(uint8_t) * mi_size_wide[bsize]); + memset(&lfm->lfl_v_ver[row][col_start], level_v, + sizeof(uint8_t) * mi_size_wide[bsize]); + memset(&lfm->lfl_v_hor[row][col_start], level_v, + sizeof(uint8_t) * mi_size_wide[bsize]); + } +} +#endif // CONFIG_LPF_MASK diff --git a/media/libaom/src/av1/common/mv.h b/media/libaom/src/av1/common/mv.h index 5b0225192..be539e820 100644 --- a/media/libaom/src/av1/common/mv.h +++ b/media/libaom/src/av1/common/mv.h @@ -21,17 +21,34 @@ extern "C" { #endif #define INVALID_MV 0x80008000 +#define GET_MV_RAWPEL(x) (((x) + 3 + ((x) >= 0)) >> 3) +#define GET_MV_SUBPEL(x) ((x)*8) +#define MARK_MV_INVALID(mv) \ + do { \ + ((int_mv *)(mv))->as_int = INVALID_MV; \ + } while (0); +#define CHECK_MV_EQUAL(x, y) (((x).row == (y).row) && 
((x).col == (y).col)) + +// The motion vector in units of full pixel +typedef struct fullpel_mv { + int16_t row; + int16_t col; +} FULLPEL_MV; + +// The motion vector in units of 1/8-pel typedef struct mv { int16_t row; int16_t col; } MV; static const MV kZeroMv = { 0, 0 }; +static const FULLPEL_MV kZeroFullMv = { 0, 0 }; typedef union int_mv { uint32_t as_int; MV as_mv; + FULLPEL_MV as_fullmv; } int_mv; /* facilitates faster equality tests and copies */ typedef struct mv32 { @@ -39,6 +56,38 @@ typedef struct mv32 { int32_t col; } MV32; +// The mv limit for fullpel mvs +typedef struct { + int col_min; + int col_max; + int row_min; + int row_max; +} FullMvLimits; + +// The mv limit for subpel mvs +typedef struct { + int col_min; + int col_max; + int row_min; + int row_max; +} SubpelMvLimits; + +static AOM_INLINE FULLPEL_MV get_fullmv_from_mv(const MV *subpel_mv) { + const FULLPEL_MV full_mv = { (int16_t)GET_MV_RAWPEL(subpel_mv->row), + (int16_t)GET_MV_RAWPEL(subpel_mv->col) }; + return full_mv; +} + +static AOM_INLINE MV get_mv_from_fullmv(const FULLPEL_MV *full_mv) { + const MV subpel_mv = { (int16_t)GET_MV_SUBPEL(full_mv->row), + (int16_t)GET_MV_SUBPEL(full_mv->col) }; + return subpel_mv; +} + +static AOM_INLINE void convert_fullmv_to_mv(int_mv *mv) { + mv->as_mv = get_mv_from_fullmv(&mv->as_fullmv); +} + // Bits of precision used for the model #define WARPEDMODEL_PREC_BITS 16 #define WARPEDMODEL_ROW3HOMO_PREC_BITS 16 @@ -56,13 +105,13 @@ typedef struct mv32 { #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS) /* clang-format off */ -typedef enum ATTRIBUTE_PACKED { +enum { IDENTITY = 0, // identity transformation, 0-parameter TRANSLATION = 1, // translational motion 2-parameter ROTZOOM = 2, // simplified affine with rotation + zoom only, 4-parameter AFFINE = 3, // affine, 6-parameter TRANS_TYPES, -} TransformationType; +} UENUM1BYTE(TransformationType); /* clang-format on */ // Number of types used for global motion (must be >= 3 and <= 
TRANS_TYPES) @@ -87,18 +136,18 @@ static const int trans_model_params[TRANS_TYPES] = { 0, 2, 4, 6 }; // z . y' = m4 m5 m1 * y // 1] m6 m7 1) 1] typedef struct { - TransformationType wmtype; int32_t wmmat[8]; int16_t alpha, beta, gamma, delta; + TransformationType wmtype; int8_t invalid; } WarpedMotionParams; /* clang-format off */ static const WarpedMotionParams default_warp_params = { - IDENTITY, { 0, 0, (1 << WARPEDMODEL_PREC_BITS), 0, 0, (1 << WARPEDMODEL_PREC_BITS), 0, 0 }, 0, 0, 0, 0, + IDENTITY, 0, }; /* clang-format on */ @@ -225,7 +274,8 @@ static INLINE int_mv gm_get_motion_vector(const WarpedMotionParams *gm, // All global motion vectors are stored with WARPEDMODEL_PREC_BITS (16) // bits of fractional precision. The offset for a translation is stored in // entries 0 and 1. For translations, all but the top three (two if - // cm->allow_high_precision_mv is false) fractional bits are always zero. + // cm->features.allow_high_precision_mv is false) fractional bits are always + // zero. // // After the right shifts, there are 3 fractional bits of precision. If // allow_hp is false, the bottom bit is always zero (so we don't need a @@ -263,7 +313,7 @@ static INLINE int_mv gm_get_motion_vector(const WarpedMotionParams *gm, return res; } -static INLINE TransformationType get_gmtype(const WarpedMotionParams *gm) { +static INLINE TransformationType get_wmtype(const WarpedMotionParams *gm) { if (gm->wmmat[5] == (1 << WARPEDMODEL_PREC_BITS) && !gm->wmmat[4] && gm->wmmat[2] == (1 << WARPEDMODEL_PREC_BITS) && !gm->wmmat[3]) { return ((!gm->wmmat[1] && !gm->wmmat[0]) ? 
IDENTITY : TRANSLATION); @@ -277,7 +327,6 @@ static INLINE TransformationType get_gmtype(const WarpedMotionParams *gm) { typedef struct candidate_mv { int_mv this_mv; int_mv comp_mv; - int weight; } CANDIDATE_MV; static INLINE int is_zero_mv(const MV *mv) { @@ -288,10 +337,14 @@ static INLINE int is_equal_mv(const MV *a, const MV *b) { return *((const uint32_t *)a) == *((const uint32_t *)b); } -static INLINE void clamp_mv(MV *mv, int min_col, int max_col, int min_row, - int max_row) { - mv->col = clamp(mv->col, min_col, max_col); - mv->row = clamp(mv->row, min_row, max_row); +static INLINE void clamp_mv(MV *mv, const SubpelMvLimits *mv_limits) { + mv->col = clamp(mv->col, mv_limits->col_min, mv_limits->col_max); + mv->row = clamp(mv->row, mv_limits->row_min, mv_limits->row_max); +} + +static INLINE void clamp_fullmv(FULLPEL_MV *mv, const FullMvLimits *mv_limits) { + mv->col = clamp(mv->col, mv_limits->col_min, mv_limits->col_max); + mv->row = clamp(mv->row, mv_limits->row_min, mv_limits->row_max); } #ifdef __cplusplus diff --git a/media/libaom/src/av1/common/mvref_common.c b/media/libaom/src/av1/common/mvref_common.c index 7f24ab4e6..db3098cc0 100644 --- a/media/libaom/src/av1/common/mvref_common.c +++ b/media/libaom/src/av1/common/mvref_common.c @@ -23,7 +23,7 @@ static int div_mult[32] = { 0, 16384, 8192, 5461, 4096, 3276, 2730, 2340, // TODO(jingning): Consider the use of lookup table for (num / den) // altogether. -static void get_mv_projection(MV *output, MV ref, int num, int den) { +static AOM_INLINE void get_mv_projection(MV *output, MV ref, int num, int den) { den = AOMMIN(den, MAX_FRAME_DISTANCE); num = num > 0 ? 
AOMMIN(num, MAX_FRAME_DISTANCE) : AOMMAX(num, -MAX_FRAME_DISTANCE); @@ -40,7 +40,7 @@ static void get_mv_projection(MV *output, MV ref, int num, int den) { void av1_copy_frame_mvs(const AV1_COMMON *const cm, const MB_MODE_INFO *const mi, int mi_row, int mi_col, int x_mis, int y_mis) { - const int frame_mvs_stride = ROUND_POWER_OF_TWO(cm->mi_cols, 1); + const int frame_mvs_stride = ROUND_POWER_OF_TWO(cm->mi_params.mi_cols, 1); MV_REF *frame_mvs = cm->cur_frame->mvs + (mi_row >> 1) * frame_mvs_stride + (mi_col >> 1); x_mis = ROUND_POWER_OF_TWO(x_mis, 1); @@ -71,34 +71,35 @@ void av1_copy_frame_mvs(const AV1_COMMON *const cm, } } -static void add_ref_mv_candidate( +static AOM_INLINE void add_ref_mv_candidate( const MB_MODE_INFO *const candidate, const MV_REFERENCE_FRAME rf[2], uint8_t *refmv_count, uint8_t *ref_match_count, uint8_t *newmv_count, - CANDIDATE_MV *ref_mv_stack, int_mv *gm_mv_candidates, - const WarpedMotionParams *gm_params, int col, int weight) { - if (!is_inter_block(candidate)) return; // for intrabc - int index = 0, ref; + CANDIDATE_MV *ref_mv_stack, uint16_t *ref_mv_weight, + int_mv *gm_mv_candidates, const WarpedMotionParams *gm_params, + uint16_t weight) { + if (!is_inter_block(candidate)) return; assert(weight % 2 == 0); + int index, ref; if (rf[1] == NONE_FRAME) { // single reference frame for (ref = 0; ref < 2; ++ref) { if (candidate->ref_frame[ref] == rf[0]) { - int_mv this_refmv; - if (is_global_mv_block(candidate, gm_params[rf[0]].wmtype)) - this_refmv = gm_mv_candidates[0]; - else - this_refmv = get_sub_block_mv(candidate, ref, col); - - for (index = 0; index < *refmv_count; ++index) - if (ref_mv_stack[index].this_mv.as_int == this_refmv.as_int) break; - - if (index < *refmv_count) ref_mv_stack[index].weight += weight; + const int is_gm_block = + is_global_mv_block(candidate, gm_params[rf[0]].wmtype); + const int_mv this_refmv = + is_gm_block ? 
gm_mv_candidates[0] : get_block_mv(candidate, ref); + for (index = 0; index < *refmv_count; ++index) { + if (ref_mv_stack[index].this_mv.as_int == this_refmv.as_int) { + ref_mv_weight[index] += weight; + break; + } + } // Add a new item to the list. if (index == *refmv_count && *refmv_count < MAX_REF_MV_STACK_SIZE) { ref_mv_stack[index].this_mv = this_refmv; - ref_mv_stack[index].weight = weight; + ref_mv_weight[index] = weight; ++(*refmv_count); } if (have_newmv_in_inter_mode(candidate->mode)) ++*newmv_count; @@ -114,21 +115,22 @@ static void add_ref_mv_candidate( if (is_global_mv_block(candidate, gm_params[rf[ref]].wmtype)) this_refmv[ref] = gm_mv_candidates[ref]; else - this_refmv[ref] = get_sub_block_mv(candidate, ref, col); + this_refmv[ref] = get_block_mv(candidate, ref); } - for (index = 0; index < *refmv_count; ++index) + for (index = 0; index < *refmv_count; ++index) { if ((ref_mv_stack[index].this_mv.as_int == this_refmv[0].as_int) && - (ref_mv_stack[index].comp_mv.as_int == this_refmv[1].as_int)) + (ref_mv_stack[index].comp_mv.as_int == this_refmv[1].as_int)) { + ref_mv_weight[index] += weight; break; - - if (index < *refmv_count) ref_mv_stack[index].weight += weight; + } + } // Add a new item to the list. 
if (index == *refmv_count && *refmv_count < MAX_REF_MV_STACK_SIZE) { ref_mv_stack[index].this_mv = this_refmv[0]; ref_mv_stack[index].comp_mv = this_refmv[1]; - ref_mv_stack[index].weight = weight; + ref_mv_weight[index] = weight; ++(*refmv_count); } if (have_newmv_in_inter_mode(candidate->mode)) ++*newmv_count; @@ -137,42 +139,39 @@ static void add_ref_mv_candidate( } } -static void scan_row_mbmi(const AV1_COMMON *cm, const MACROBLOCKD *xd, - int mi_row, int mi_col, - const MV_REFERENCE_FRAME rf[2], int row_offset, - CANDIDATE_MV *ref_mv_stack, uint8_t *refmv_count, - uint8_t *ref_match_count, uint8_t *newmv_count, - int_mv *gm_mv_candidates, int max_row_offset, - int *processed_rows) { - int end_mi = AOMMIN(xd->n4_w, cm->mi_cols - mi_col); +static AOM_INLINE void scan_row_mbmi( + const AV1_COMMON *cm, const MACROBLOCKD *xd, int mi_col, + const MV_REFERENCE_FRAME rf[2], int row_offset, CANDIDATE_MV *ref_mv_stack, + uint16_t *ref_mv_weight, uint8_t *refmv_count, uint8_t *ref_match_count, + uint8_t *newmv_count, int_mv *gm_mv_candidates, int max_row_offset, + int *processed_rows) { + int end_mi = AOMMIN(xd->width, cm->mi_params.mi_cols - mi_col); end_mi = AOMMIN(end_mi, mi_size_wide[BLOCK_64X64]); - const int n8_w_8 = mi_size_wide[BLOCK_8X8]; - const int n8_w_16 = mi_size_wide[BLOCK_16X16]; - int i; + const int width_8x8 = mi_size_wide[BLOCK_8X8]; + const int width_16x16 = mi_size_wide[BLOCK_16X16]; int col_offset = 0; // TODO(jingning): Revisit this part after cb4x4 is stable. 
if (abs(row_offset) > 1) { col_offset = 1; - if ((mi_col & 0x01) && xd->n4_w < n8_w_8) --col_offset; + if ((mi_col & 0x01) && xd->width < width_8x8) --col_offset; } - const int use_step_16 = (xd->n4_w >= 16); + const int use_step_16 = (xd->width >= 16); MB_MODE_INFO **const candidate_mi0 = xd->mi + row_offset * xd->mi_stride; - (void)mi_row; - for (i = 0; i < end_mi;) { + for (int i = 0; i < end_mi;) { const MB_MODE_INFO *const candidate = candidate_mi0[col_offset + i]; const int candidate_bsize = candidate->sb_type; const int n4_w = mi_size_wide[candidate_bsize]; - int len = AOMMIN(xd->n4_w, n4_w); + int len = AOMMIN(xd->width, n4_w); if (use_step_16) - len = AOMMAX(n8_w_16, len); + len = AOMMAX(width_16x16, len); else if (abs(row_offset) > 1) - len = AOMMAX(len, n8_w_8); + len = AOMMAX(len, width_8x8); - int weight = 2; - if (xd->n4_w >= n8_w_8 && xd->n4_w <= n4_w) { - int inc = AOMMIN(-max_row_offset + row_offset + 1, - mi_size_high[candidate_bsize]); + uint16_t weight = 2; + if (xd->width >= width_8x8 && xd->width <= n4_w) { + uint16_t inc = AOMMIN(-max_row_offset + row_offset + 1, + mi_size_high[candidate_bsize]); // Obtain range used in weight calculation. weight = AOMMAX(weight, inc); // Update processed rows. 
@@ -180,21 +179,20 @@ static void scan_row_mbmi(const AV1_COMMON *cm, const MACROBLOCKD *xd, } add_ref_mv_candidate(candidate, rf, refmv_count, ref_match_count, - newmv_count, ref_mv_stack, gm_mv_candidates, - cm->global_motion, col_offset + i, len * weight); + newmv_count, ref_mv_stack, ref_mv_weight, + gm_mv_candidates, cm->global_motion, len * weight); i += len; } } -static void scan_col_mbmi(const AV1_COMMON *cm, const MACROBLOCKD *xd, - int mi_row, int mi_col, - const MV_REFERENCE_FRAME rf[2], int col_offset, - CANDIDATE_MV *ref_mv_stack, uint8_t *refmv_count, - uint8_t *ref_match_count, uint8_t *newmv_count, - int_mv *gm_mv_candidates, int max_col_offset, - int *processed_cols) { - int end_mi = AOMMIN(xd->n4_h, cm->mi_rows - mi_row); +static AOM_INLINE void scan_col_mbmi( + const AV1_COMMON *cm, const MACROBLOCKD *xd, int mi_row, + const MV_REFERENCE_FRAME rf[2], int col_offset, CANDIDATE_MV *ref_mv_stack, + uint16_t *ref_mv_weight, uint8_t *refmv_count, uint8_t *ref_match_count, + uint8_t *newmv_count, int_mv *gm_mv_candidates, int max_col_offset, + int *processed_cols) { + int end_mi = AOMMIN(xd->height, cm->mi_params.mi_rows - mi_row); end_mi = AOMMIN(end_mi, mi_size_high[BLOCK_64X64]); const int n8_h_8 = mi_size_high[BLOCK_8X8]; const int n8_h_16 = mi_size_high[BLOCK_16X16]; @@ -202,24 +200,23 @@ static void scan_col_mbmi(const AV1_COMMON *cm, const MACROBLOCKD *xd, int row_offset = 0; if (abs(col_offset) > 1) { row_offset = 1; - if ((mi_row & 0x01) && xd->n4_h < n8_h_8) --row_offset; + if ((mi_row & 0x01) && xd->height < n8_h_8) --row_offset; } - const int use_step_16 = (xd->n4_h >= 16); - (void)mi_col; + const int use_step_16 = (xd->height >= 16); for (i = 0; i < end_mi;) { const MB_MODE_INFO *const candidate = xd->mi[(row_offset + i) * xd->mi_stride + col_offset]; const int candidate_bsize = candidate->sb_type; const int n4_h = mi_size_high[candidate_bsize]; - int len = AOMMIN(xd->n4_h, n4_h); + int len = AOMMIN(xd->height, n4_h); if (use_step_16) len 
= AOMMAX(n8_h_16, len); else if (abs(col_offset) > 1) len = AOMMAX(len, n8_h_8); int weight = 2; - if (xd->n4_h >= n8_h_8 && xd->n4_h <= n4_h) { + if (xd->height >= n8_h_8 && xd->height <= n4_h) { int inc = AOMMIN(-max_col_offset + col_offset + 1, mi_size_wide[candidate_bsize]); // Obtain range used in weight calculation. @@ -229,20 +226,19 @@ static void scan_col_mbmi(const AV1_COMMON *cm, const MACROBLOCKD *xd, } add_ref_mv_candidate(candidate, rf, refmv_count, ref_match_count, - newmv_count, ref_mv_stack, gm_mv_candidates, - cm->global_motion, col_offset, len * weight); + newmv_count, ref_mv_stack, ref_mv_weight, + gm_mv_candidates, cm->global_motion, len * weight); i += len; } } -static void scan_blk_mbmi(const AV1_COMMON *cm, const MACROBLOCKD *xd, - const int mi_row, const int mi_col, - const MV_REFERENCE_FRAME rf[2], int row_offset, - int col_offset, CANDIDATE_MV *ref_mv_stack, - uint8_t *ref_match_count, uint8_t *newmv_count, - int_mv *gm_mv_candidates, - uint8_t refmv_count[MODE_CTX_REF_FRAMES]) { +static AOM_INLINE void scan_blk_mbmi( + const AV1_COMMON *cm, const MACROBLOCKD *xd, const int mi_row, + const int mi_col, const MV_REFERENCE_FRAME rf[2], int row_offset, + int col_offset, CANDIDATE_MV *ref_mv_stack, uint16_t *ref_mv_weight, + uint8_t *ref_match_count, uint8_t *newmv_count, int_mv *gm_mv_candidates, + uint8_t *refmv_count) { const TileInfo *const tile = &xd->tile; POSITION mi_pos; @@ -255,8 +251,8 @@ static void scan_blk_mbmi(const AV1_COMMON *cm, const MACROBLOCKD *xd, const int len = mi_size_wide[BLOCK_8X8]; add_ref_mv_candidate(candidate, rf, refmv_count, ref_match_count, - newmv_count, ref_mv_stack, gm_mv_candidates, - cm->global_motion, mi_pos.col, 2 * len); + newmv_count, ref_mv_stack, ref_mv_weight, + gm_mv_candidates, cm->global_motion, 2 * len); } // Analyze a single 8x8 block motion information. 
} @@ -291,19 +287,19 @@ static int has_top_right(const AV1_COMMON *cm, const MACROBLOCKD *xd, // The left hand of two vertical rectangles always has a top right (as the // block above will have been decoded) - if (xd->n4_w < xd->n4_h) + if (xd->width < xd->height) if (!xd->is_sec_rect) has_tr = 1; // The bottom of two horizontal rectangles never has a top right (as the block // to the right won't have been decoded) - if (xd->n4_w > xd->n4_h) + if (xd->width > xd->height) if (xd->is_sec_rect) has_tr = 0; // The bottom left square of a Vertical A (in the old format) does // not have a top right as it is decoded before the right hand // rectangle of the partition if (xd->mi[0]->partition == PARTITION_VERT_A) { - if (xd->n4_w == xd->n4_h) + if (xd->width == xd->height) if (mask_row & bs) has_tr = 0; } @@ -326,112 +322,98 @@ static int check_sb_border(const int mi_row, const int mi_col, static int add_tpl_ref_mv(const AV1_COMMON *cm, const MACROBLOCKD *xd, int mi_row, int mi_col, MV_REFERENCE_FRAME ref_frame, int blk_row, int blk_col, int_mv *gm_mv_candidates, - uint8_t refmv_count[MODE_CTX_REF_FRAMES], - CANDIDATE_MV ref_mv_stacks[][MAX_REF_MV_STACK_SIZE], + uint8_t *const refmv_count, + CANDIDATE_MV ref_mv_stack[MAX_REF_MV_STACK_SIZE], + uint16_t ref_mv_weight[MAX_REF_MV_STACK_SIZE], int16_t *mode_context) { POSITION mi_pos; - int idx; - const int weight_unit = 1; // mi_size_wide[BLOCK_8X8]; - mi_pos.row = (mi_row & 0x01) ? blk_row : blk_row + 1; mi_pos.col = (mi_col & 0x01) ? 
blk_col : blk_col + 1; if (!is_inside(&xd->tile, mi_col, mi_row, &mi_pos)) return 0; const TPL_MV_REF *prev_frame_mvs = - cm->tpl_mvs + ((mi_row + mi_pos.row) >> 1) * (cm->mi_stride >> 1) + + cm->tpl_mvs + + ((mi_row + mi_pos.row) >> 1) * (cm->mi_params.mi_stride >> 1) + ((mi_col + mi_pos.col) >> 1); + if (prev_frame_mvs->mfmv0.as_int == INVALID_MV) return 0; MV_REFERENCE_FRAME rf[2]; av1_set_ref_frame(rf, ref_frame); + const uint16_t weight_unit = 1; // mi_size_wide[BLOCK_8X8]; + const int cur_frame_index = cm->cur_frame->order_hint; + const RefCntBuffer *const buf_0 = get_ref_frame_buf(cm, rf[0]); + const int frame0_index = buf_0->order_hint; + const int cur_offset_0 = get_relative_dist(&cm->seq_params.order_hint_info, + cur_frame_index, frame0_index); + int idx; + const int allow_high_precision_mv = cm->features.allow_high_precision_mv; + const int force_integer_mv = cm->features.cur_frame_force_integer_mv; + + int_mv this_refmv; + get_mv_projection(&this_refmv.as_mv, prev_frame_mvs->mfmv0.as_mv, + cur_offset_0, prev_frame_mvs->ref_frame_offset); + lower_mv_precision(&this_refmv.as_mv, allow_high_precision_mv, + force_integer_mv); + if (rf[1] == NONE_FRAME) { - int cur_frame_index = cm->cur_frame->cur_frame_offset; - int buf_idx_0 = cm->frame_refs[FWD_RF_OFFSET(rf[0])].idx; - int frame0_index = cm->buffer_pool->frame_bufs[buf_idx_0].cur_frame_offset; - int cur_offset_0 = get_relative_dist(cm, cur_frame_index, frame0_index); - CANDIDATE_MV *ref_mv_stack = ref_mv_stacks[rf[0]]; - - if (prev_frame_mvs->mfmv0.as_int != INVALID_MV) { - int_mv this_refmv; - - get_mv_projection(&this_refmv.as_mv, prev_frame_mvs->mfmv0.as_mv, - cur_offset_0, prev_frame_mvs->ref_frame_offset); - lower_mv_precision(&this_refmv.as_mv, cm->allow_high_precision_mv, - cm->cur_frame_force_integer_mv); - - if (blk_row == 0 && blk_col == 0) - if (abs(this_refmv.as_mv.row - gm_mv_candidates[0].as_mv.row) >= 16 || - abs(this_refmv.as_mv.col - gm_mv_candidates[0].as_mv.col) >= 16) - 
mode_context[ref_frame] |= (1 << GLOBALMV_OFFSET); - - for (idx = 0; idx < refmv_count[rf[0]]; ++idx) - if (this_refmv.as_int == ref_mv_stack[idx].this_mv.as_int) break; - - if (idx < refmv_count[rf[0]]) ref_mv_stack[idx].weight += 2 * weight_unit; - - if (idx == refmv_count[rf[0]] && - refmv_count[rf[0]] < MAX_REF_MV_STACK_SIZE) { - ref_mv_stack[idx].this_mv.as_int = this_refmv.as_int; - ref_mv_stack[idx].weight = 2 * weight_unit; - ++(refmv_count[rf[0]]); - } - return 1; + if (blk_row == 0 && blk_col == 0) { + if (abs(this_refmv.as_mv.row - gm_mv_candidates[0].as_mv.row) >= 16 || + abs(this_refmv.as_mv.col - gm_mv_candidates[0].as_mv.col) >= 16) + mode_context[ref_frame] |= (1 << GLOBALMV_OFFSET); + } + + for (idx = 0; idx < *refmv_count; ++idx) + if (this_refmv.as_int == ref_mv_stack[idx].this_mv.as_int) break; + + if (idx < *refmv_count) ref_mv_weight[idx] += 2 * weight_unit; + + if (idx == *refmv_count && *refmv_count < MAX_REF_MV_STACK_SIZE) { + ref_mv_stack[idx].this_mv.as_int = this_refmv.as_int; + ref_mv_weight[idx] = 2 * weight_unit; + ++(*refmv_count); } } else { // Process compound inter mode - int cur_frame_index = cm->cur_frame->cur_frame_offset; - int buf_idx_0 = cm->frame_refs[FWD_RF_OFFSET(rf[0])].idx; - int frame0_index = cm->buffer_pool->frame_bufs[buf_idx_0].cur_frame_offset; - - int cur_offset_0 = get_relative_dist(cm, cur_frame_index, frame0_index); - int buf_idx_1 = cm->frame_refs[FWD_RF_OFFSET(rf[1])].idx; - int frame1_index = cm->buffer_pool->frame_bufs[buf_idx_1].cur_frame_offset; - int cur_offset_1 = get_relative_dist(cm, cur_frame_index, frame1_index); - CANDIDATE_MV *ref_mv_stack = ref_mv_stacks[ref_frame]; - - if (prev_frame_mvs->mfmv0.as_int != INVALID_MV) { - int_mv this_refmv; - int_mv comp_refmv; - get_mv_projection(&this_refmv.as_mv, prev_frame_mvs->mfmv0.as_mv, - cur_offset_0, prev_frame_mvs->ref_frame_offset); - get_mv_projection(&comp_refmv.as_mv, prev_frame_mvs->mfmv0.as_mv, - cur_offset_1, prev_frame_mvs->ref_frame_offset); - 
- lower_mv_precision(&this_refmv.as_mv, cm->allow_high_precision_mv, - cm->cur_frame_force_integer_mv); - lower_mv_precision(&comp_refmv.as_mv, cm->allow_high_precision_mv, - cm->cur_frame_force_integer_mv); - - if (blk_row == 0 && blk_col == 0) - if (abs(this_refmv.as_mv.row - gm_mv_candidates[0].as_mv.row) >= 16 || - abs(this_refmv.as_mv.col - gm_mv_candidates[0].as_mv.col) >= 16 || - abs(comp_refmv.as_mv.row - gm_mv_candidates[1].as_mv.row) >= 16 || - abs(comp_refmv.as_mv.col - gm_mv_candidates[1].as_mv.col) >= 16) - mode_context[ref_frame] |= (1 << GLOBALMV_OFFSET); - - for (idx = 0; idx < refmv_count[ref_frame]; ++idx) - if (this_refmv.as_int == ref_mv_stack[idx].this_mv.as_int && - comp_refmv.as_int == ref_mv_stack[idx].comp_mv.as_int) - break; + const RefCntBuffer *const buf_1 = get_ref_frame_buf(cm, rf[1]); + const int frame1_index = buf_1->order_hint; + const int cur_offset_1 = get_relative_dist(&cm->seq_params.order_hint_info, + cur_frame_index, frame1_index); + int_mv comp_refmv; + get_mv_projection(&comp_refmv.as_mv, prev_frame_mvs->mfmv0.as_mv, + cur_offset_1, prev_frame_mvs->ref_frame_offset); + lower_mv_precision(&comp_refmv.as_mv, allow_high_precision_mv, + force_integer_mv); + + if (blk_row == 0 && blk_col == 0) { + if (abs(this_refmv.as_mv.row - gm_mv_candidates[0].as_mv.row) >= 16 || + abs(this_refmv.as_mv.col - gm_mv_candidates[0].as_mv.col) >= 16 || + abs(comp_refmv.as_mv.row - gm_mv_candidates[1].as_mv.row) >= 16 || + abs(comp_refmv.as_mv.col - gm_mv_candidates[1].as_mv.col) >= 16) + mode_context[ref_frame] |= (1 << GLOBALMV_OFFSET); + } + + for (idx = 0; idx < *refmv_count; ++idx) { + if (this_refmv.as_int == ref_mv_stack[idx].this_mv.as_int && + comp_refmv.as_int == ref_mv_stack[idx].comp_mv.as_int) + break; + } - if (idx < refmv_count[ref_frame]) - ref_mv_stack[idx].weight += 2 * weight_unit; + if (idx < *refmv_count) ref_mv_weight[idx] += 2 * weight_unit; - if (idx == refmv_count[ref_frame] && - refmv_count[ref_frame] < 
MAX_REF_MV_STACK_SIZE) { - ref_mv_stack[idx].this_mv.as_int = this_refmv.as_int; - ref_mv_stack[idx].comp_mv.as_int = comp_refmv.as_int; - ref_mv_stack[idx].weight = 2 * weight_unit; - ++(refmv_count[ref_frame]); - } - return 1; + if (idx == *refmv_count && *refmv_count < MAX_REF_MV_STACK_SIZE) { + ref_mv_stack[idx].this_mv.as_int = this_refmv.as_int; + ref_mv_stack[idx].comp_mv.as_int = comp_refmv.as_int; + ref_mv_weight[idx] = 2 * weight_unit; + ++(*refmv_count); } } - return 0; + + return 1; } -static void process_compound_ref_mv_candidate( +static AOM_INLINE void process_compound_ref_mv_candidate( const MB_MODE_INFO *const candidate, const AV1_COMMON *const cm, const MV_REFERENCE_FRAME *const rf, int_mv ref_id[2][2], int ref_id_count[2], int_mv ref_diff[2][2], int ref_diff_count[2]) { @@ -456,10 +438,11 @@ static void process_compound_ref_mv_candidate( } } -static void process_single_ref_mv_candidate( +static AOM_INLINE void process_single_ref_mv_candidate( const MB_MODE_INFO *const candidate, const AV1_COMMON *const cm, - MV_REFERENCE_FRAME ref_frame, uint8_t refmv_count[MODE_CTX_REF_FRAMES], - CANDIDATE_MV ref_mv_stack[][MAX_REF_MV_STACK_SIZE]) { + MV_REFERENCE_FRAME ref_frame, uint8_t *const refmv_count, + CANDIDATE_MV ref_mv_stack[MAX_REF_MV_STACK_SIZE], + uint16_t ref_mv_weight[MAX_REF_MV_STACK_SIZE]) { for (int rf_idx = 0; rf_idx < 2; ++rf_idx) { if (candidate->ref_frame[rf_idx] > INTRA_FRAME) { int_mv this_mv = candidate->mv[rf_idx]; @@ -469,49 +452,50 @@ static void process_single_ref_mv_candidate( this_mv.as_mv.col = -this_mv.as_mv.col; } int stack_idx; - for (stack_idx = 0; stack_idx < refmv_count[ref_frame]; ++stack_idx) { - const int_mv stack_mv = ref_mv_stack[ref_frame][stack_idx].this_mv; + for (stack_idx = 0; stack_idx < *refmv_count; ++stack_idx) { + const int_mv stack_mv = ref_mv_stack[stack_idx].this_mv; if (this_mv.as_int == stack_mv.as_int) break; } - if (stack_idx == refmv_count[ref_frame]) { - ref_mv_stack[ref_frame][stack_idx].this_mv = 
this_mv; + if (stack_idx == *refmv_count) { + ref_mv_stack[stack_idx].this_mv = this_mv; // TODO(jingning): Set an arbitrary small number here. The weight // doesn't matter as long as it is properly initialized. - ref_mv_stack[ref_frame][stack_idx].weight = 2; - ++refmv_count[ref_frame]; + ref_mv_weight[stack_idx] = 2; + ++(*refmv_count); } } } } -static void setup_ref_mv_list( +static AOM_INLINE void setup_ref_mv_list( const AV1_COMMON *cm, const MACROBLOCKD *xd, MV_REFERENCE_FRAME ref_frame, - uint8_t refmv_count[MODE_CTX_REF_FRAMES], - CANDIDATE_MV ref_mv_stack[][MAX_REF_MV_STACK_SIZE], - int_mv mv_ref_list[][MAX_MV_REF_CANDIDATES], int_mv *gm_mv_candidates, + uint8_t *const refmv_count, + CANDIDATE_MV ref_mv_stack[MAX_REF_MV_STACK_SIZE], + uint16_t ref_mv_weight[MAX_REF_MV_STACK_SIZE], + int_mv mv_ref_list[MAX_MV_REF_CANDIDATES], int_mv *gm_mv_candidates, int mi_row, int mi_col, int16_t *mode_context) { - const int bs = AOMMAX(xd->n4_w, xd->n4_h); + const int bs = AOMMAX(xd->width, xd->height); const int has_tr = has_top_right(cm, xd, mi_row, mi_col, bs); MV_REFERENCE_FRAME rf[2]; const TileInfo *const tile = &xd->tile; int max_row_offset = 0, max_col_offset = 0; - const int row_adj = (xd->n4_h < mi_size_high[BLOCK_8X8]) && (mi_row & 0x01); - const int col_adj = (xd->n4_w < mi_size_wide[BLOCK_8X8]) && (mi_col & 0x01); + const int row_adj = (xd->height < mi_size_high[BLOCK_8X8]) && (mi_row & 0x01); + const int col_adj = (xd->width < mi_size_wide[BLOCK_8X8]) && (mi_col & 0x01); int processed_rows = 0; int processed_cols = 0; av1_set_ref_frame(rf, ref_frame); mode_context[ref_frame] = 0; - refmv_count[ref_frame] = 0; + *refmv_count = 0; // Find valid maximum row/col offset. 
if (xd->up_available) { max_row_offset = -(MVREF_ROW_COLS << 1) + row_adj; - if (xd->n4_h < mi_size_high[BLOCK_8X8]) + if (xd->height < mi_size_high[BLOCK_8X8]) max_row_offset = -(2 << 1) + row_adj; max_row_offset = find_valid_row_offset(tile, mi_row, max_row_offset); @@ -520,7 +504,7 @@ static void setup_ref_mv_list( if (xd->left_available) { max_col_offset = -(MVREF_ROW_COLS << 1) + col_adj; - if (xd->n4_w < mi_size_wide[BLOCK_8X8]) + if (xd->width < mi_size_wide[BLOCK_8X8]) max_col_offset = -(2 << 1) + col_adj; max_col_offset = find_valid_col_offset(tile, mi_col, max_col_offset); @@ -532,48 +516,48 @@ static void setup_ref_mv_list( // Scan the first above row mode info. row_offset = -1; if (abs(max_row_offset) >= 1) - scan_row_mbmi(cm, xd, mi_row, mi_col, rf, -1, ref_mv_stack[ref_frame], - &refmv_count[ref_frame], &row_match_count, &newmv_count, - gm_mv_candidates, max_row_offset, &processed_rows); + scan_row_mbmi(cm, xd, mi_col, rf, -1, ref_mv_stack, ref_mv_weight, + refmv_count, &row_match_count, &newmv_count, gm_mv_candidates, + max_row_offset, &processed_rows); // Scan the first left column mode info. 
col_offset = -1; if (abs(max_col_offset) >= 1) - scan_col_mbmi(cm, xd, mi_row, mi_col, rf, -1, ref_mv_stack[ref_frame], - &refmv_count[ref_frame], &col_match_count, &newmv_count, - gm_mv_candidates, max_col_offset, &processed_cols); + scan_col_mbmi(cm, xd, mi_row, rf, -1, ref_mv_stack, ref_mv_weight, + refmv_count, &col_match_count, &newmv_count, gm_mv_candidates, + max_col_offset, &processed_cols); // Check top-right boundary if (has_tr) - scan_blk_mbmi(cm, xd, mi_row, mi_col, rf, -1, xd->n4_w, - ref_mv_stack[ref_frame], &row_match_count, &newmv_count, - gm_mv_candidates, &refmv_count[ref_frame]); + scan_blk_mbmi(cm, xd, mi_row, mi_col, rf, -1, xd->width, ref_mv_stack, + ref_mv_weight, &row_match_count, &newmv_count, + gm_mv_candidates, refmv_count); const uint8_t nearest_match = (row_match_count > 0) + (col_match_count > 0); - const uint8_t nearest_refmv_count = refmv_count[ref_frame]; + const uint8_t nearest_refmv_count = *refmv_count; // TODO(yunqing): for comp_search, do it for all 3 cases. 
for (int idx = 0; idx < nearest_refmv_count; ++idx) - ref_mv_stack[ref_frame][idx].weight += REF_CAT_LEVEL; + ref_mv_weight[idx] += REF_CAT_LEVEL; - if (cm->allow_ref_frame_mvs) { + if (cm->features.allow_ref_frame_mvs) { int is_available = 0; - const int voffset = AOMMAX(mi_size_high[BLOCK_8X8], xd->n4_h); - const int hoffset = AOMMAX(mi_size_wide[BLOCK_8X8], xd->n4_w); - const int blk_row_end = AOMMIN(xd->n4_h, mi_size_high[BLOCK_64X64]); - const int blk_col_end = AOMMIN(xd->n4_w, mi_size_wide[BLOCK_64X64]); + const int voffset = AOMMAX(mi_size_high[BLOCK_8X8], xd->height); + const int hoffset = AOMMAX(mi_size_wide[BLOCK_8X8], xd->width); + const int blk_row_end = AOMMIN(xd->height, mi_size_high[BLOCK_64X64]); + const int blk_col_end = AOMMIN(xd->width, mi_size_wide[BLOCK_64X64]); const int tpl_sample_pos[3][2] = { { voffset, -2 }, { voffset, hoffset }, { voffset - 2, hoffset }, }; - const int allow_extension = (xd->n4_h >= mi_size_high[BLOCK_8X8]) && - (xd->n4_h < mi_size_high[BLOCK_64X64]) && - (xd->n4_w >= mi_size_wide[BLOCK_8X8]) && - (xd->n4_w < mi_size_wide[BLOCK_64X64]); + const int allow_extension = (xd->height >= mi_size_high[BLOCK_8X8]) && + (xd->height < mi_size_high[BLOCK_64X64]) && + (xd->width >= mi_size_wide[BLOCK_8X8]) && + (xd->width < mi_size_wide[BLOCK_64X64]); - const int step_h = (xd->n4_h >= mi_size_high[BLOCK_64X64]) + const int step_h = (xd->height >= mi_size_high[BLOCK_64X64]) ? mi_size_high[BLOCK_16X16] : mi_size_high[BLOCK_8X8]; - const int step_w = (xd->n4_w >= mi_size_wide[BLOCK_64X64]) + const int step_w = (xd->width >= mi_size_wide[BLOCK_64X64]) ? 
mi_size_wide[BLOCK_16X16] : mi_size_wide[BLOCK_8X8]; @@ -581,7 +565,7 @@ static void setup_ref_mv_list( for (int blk_col = 0; blk_col < blk_col_end; blk_col += step_w) { int ret = add_tpl_ref_mv(cm, xd, mi_row, mi_col, ref_frame, blk_row, blk_col, gm_mv_candidates, refmv_count, - ref_mv_stack, mode_context); + ref_mv_stack, ref_mv_weight, mode_context); if (blk_row == 0 && blk_col == 0) is_available = ret; } } @@ -594,16 +578,17 @@ static void setup_ref_mv_list( if (!check_sb_border(mi_row, mi_col, blk_row, blk_col)) continue; add_tpl_ref_mv(cm, xd, mi_row, mi_col, ref_frame, blk_row, blk_col, - gm_mv_candidates, refmv_count, ref_mv_stack, mode_context); + gm_mv_candidates, refmv_count, ref_mv_stack, ref_mv_weight, + mode_context); } } uint8_t dummy_newmv_count = 0; // Scan the second outer area. - scan_blk_mbmi(cm, xd, mi_row, mi_col, rf, -1, -1, ref_mv_stack[ref_frame], + scan_blk_mbmi(cm, xd, mi_row, mi_col, rf, -1, -1, ref_mv_stack, ref_mv_weight, &row_match_count, &dummy_newmv_count, gm_mv_candidates, - &refmv_count[ref_frame]); + refmv_count); for (int idx = 2; idx <= MVREF_ROW_COLS; ++idx) { const int row_offset = -(idx << 1) + 1 + row_adj; @@ -611,24 +596,21 @@ static void setup_ref_mv_list( if (abs(row_offset) <= abs(max_row_offset) && abs(row_offset) > processed_rows) - scan_row_mbmi(cm, xd, mi_row, mi_col, rf, row_offset, - ref_mv_stack[ref_frame], &refmv_count[ref_frame], - &row_match_count, &dummy_newmv_count, gm_mv_candidates, - max_row_offset, &processed_rows); + scan_row_mbmi(cm, xd, mi_col, rf, row_offset, ref_mv_stack, ref_mv_weight, + refmv_count, &row_match_count, &dummy_newmv_count, + gm_mv_candidates, max_row_offset, &processed_rows); if (abs(col_offset) <= abs(max_col_offset) && abs(col_offset) > processed_cols) - scan_col_mbmi(cm, xd, mi_row, mi_col, rf, col_offset, - ref_mv_stack[ref_frame], &refmv_count[ref_frame], - &col_match_count, &dummy_newmv_count, gm_mv_candidates, - max_col_offset, &processed_cols); + scan_col_mbmi(cm, xd, mi_row, 
rf, col_offset, ref_mv_stack, ref_mv_weight, + refmv_count, &col_match_count, &dummy_newmv_count, + gm_mv_candidates, max_col_offset, &processed_cols); } const uint8_t ref_match_count = (row_match_count > 0) + (col_match_count > 0); switch (nearest_match) { case 0: - mode_context[ref_frame] |= 0; if (ref_match_count >= 1) mode_context[ref_frame] |= 1; if (ref_match_count == 1) mode_context[ref_frame] |= (1 << REFMV_OFFSET); @@ -658,45 +640,48 @@ static void setup_ref_mv_list( while (len > 0) { int nr_len = 0; for (int idx = 1; idx < len; ++idx) { - if (ref_mv_stack[ref_frame][idx - 1].weight < - ref_mv_stack[ref_frame][idx].weight) { - CANDIDATE_MV tmp_mv = ref_mv_stack[ref_frame][idx - 1]; - ref_mv_stack[ref_frame][idx - 1] = ref_mv_stack[ref_frame][idx]; - ref_mv_stack[ref_frame][idx] = tmp_mv; + if (ref_mv_weight[idx - 1] < ref_mv_weight[idx]) { + const CANDIDATE_MV tmp_mv = ref_mv_stack[idx - 1]; + const uint16_t tmp_ref_mv_weight = ref_mv_weight[idx - 1]; + ref_mv_stack[idx - 1] = ref_mv_stack[idx]; + ref_mv_stack[idx] = tmp_mv; + ref_mv_weight[idx - 1] = ref_mv_weight[idx]; + ref_mv_weight[idx] = tmp_ref_mv_weight; nr_len = idx; } } len = nr_len; } - len = refmv_count[ref_frame]; + len = *refmv_count; while (len > nearest_refmv_count) { int nr_len = nearest_refmv_count; for (int idx = nearest_refmv_count + 1; idx < len; ++idx) { - if (ref_mv_stack[ref_frame][idx - 1].weight < - ref_mv_stack[ref_frame][idx].weight) { - CANDIDATE_MV tmp_mv = ref_mv_stack[ref_frame][idx - 1]; - ref_mv_stack[ref_frame][idx - 1] = ref_mv_stack[ref_frame][idx]; - ref_mv_stack[ref_frame][idx] = tmp_mv; + if (ref_mv_weight[idx - 1] < ref_mv_weight[idx]) { + const CANDIDATE_MV tmp_mv = ref_mv_stack[idx - 1]; + const uint16_t tmp_ref_mv_weight = ref_mv_weight[idx - 1]; + ref_mv_stack[idx - 1] = ref_mv_stack[idx]; + ref_mv_stack[idx] = tmp_mv; + ref_mv_weight[idx - 1] = ref_mv_weight[idx]; + ref_mv_weight[idx] = tmp_ref_mv_weight; nr_len = idx; } } len = nr_len; } + int mi_width = 
AOMMIN(mi_size_wide[BLOCK_64X64], xd->width); + mi_width = AOMMIN(mi_width, cm->mi_params.mi_cols - mi_col); + int mi_height = AOMMIN(mi_size_high[BLOCK_64X64], xd->height); + mi_height = AOMMIN(mi_height, cm->mi_params.mi_rows - mi_row); + const int mi_size = AOMMIN(mi_width, mi_height); if (rf[1] > NONE_FRAME) { // TODO(jingning, yunqing): Refactor and consolidate the compound and // single reference frame modes. Reduce unnecessary redundancy. - if (refmv_count[ref_frame] < MAX_MV_REF_CANDIDATES) { + if (*refmv_count < MAX_MV_REF_CANDIDATES) { int_mv ref_id[2][2], ref_diff[2][2]; int ref_id_count[2] = { 0 }, ref_diff_count[2] = { 0 }; - int mi_width = AOMMIN(mi_size_wide[BLOCK_64X64], xd->n4_w); - mi_width = AOMMIN(mi_width, cm->mi_cols - mi_col); - int mi_height = AOMMIN(mi_size_high[BLOCK_64X64], xd->n4_h); - mi_height = AOMMIN(mi_height, cm->mi_rows - mi_row); - int mi_size = AOMMIN(mi_width, mi_height); - for (int idx = 0; abs(max_row_offset) >= 1 && idx < mi_size;) { const MB_MODE_INFO *const candidate = xd->mi[-xd->mi_stride + idx]; process_compound_ref_mv_candidate( @@ -712,95 +697,82 @@ static void setup_ref_mv_list( } // Build up the compound mv predictor - int_mv comp_list[3][2]; + int_mv comp_list[MAX_MV_REF_CANDIDATES][2]; for (int idx = 0; idx < 2; ++idx) { int comp_idx = 0; - for (int list_idx = 0; list_idx < ref_id_count[idx] && comp_idx < 2; + for (int list_idx = 0; + list_idx < ref_id_count[idx] && comp_idx < MAX_MV_REF_CANDIDATES; ++list_idx, ++comp_idx) comp_list[comp_idx][idx] = ref_id[idx][list_idx]; - for (int list_idx = 0; list_idx < ref_diff_count[idx] && comp_idx < 2; + for (int list_idx = 0; + list_idx < ref_diff_count[idx] && comp_idx < MAX_MV_REF_CANDIDATES; ++list_idx, ++comp_idx) comp_list[comp_idx][idx] = ref_diff[idx][list_idx]; - for (; comp_idx < 3; ++comp_idx) + for (; comp_idx < MAX_MV_REF_CANDIDATES; ++comp_idx) comp_list[comp_idx][idx] = gm_mv_candidates[idx]; } - if (refmv_count[ref_frame]) { - assert(refmv_count[ref_frame] 
== 1); - if (comp_list[0][0].as_int == - ref_mv_stack[ref_frame][0].this_mv.as_int && - comp_list[0][1].as_int == - ref_mv_stack[ref_frame][0].comp_mv.as_int) { - ref_mv_stack[ref_frame][refmv_count[ref_frame]].this_mv = - comp_list[1][0]; - ref_mv_stack[ref_frame][refmv_count[ref_frame]].comp_mv = - comp_list[1][1]; + if (*refmv_count) { + assert(*refmv_count == 1); + if (comp_list[0][0].as_int == ref_mv_stack[0].this_mv.as_int && + comp_list[0][1].as_int == ref_mv_stack[0].comp_mv.as_int) { + ref_mv_stack[*refmv_count].this_mv = comp_list[1][0]; + ref_mv_stack[*refmv_count].comp_mv = comp_list[1][1]; } else { - ref_mv_stack[ref_frame][refmv_count[ref_frame]].this_mv = - comp_list[0][0]; - ref_mv_stack[ref_frame][refmv_count[ref_frame]].comp_mv = - comp_list[0][1]; + ref_mv_stack[*refmv_count].this_mv = comp_list[0][0]; + ref_mv_stack[*refmv_count].comp_mv = comp_list[0][1]; } - ref_mv_stack[ref_frame][refmv_count[ref_frame]].weight = 2; - ++refmv_count[ref_frame]; + ref_mv_weight[*refmv_count] = 2; + ++*refmv_count; } else { for (int idx = 0; idx < MAX_MV_REF_CANDIDATES; ++idx) { - ref_mv_stack[ref_frame][refmv_count[ref_frame]].this_mv = - comp_list[idx][0]; - ref_mv_stack[ref_frame][refmv_count[ref_frame]].comp_mv = - comp_list[idx][1]; - ref_mv_stack[ref_frame][refmv_count[ref_frame]].weight = 2; - ++refmv_count[ref_frame]; + ref_mv_stack[*refmv_count].this_mv = comp_list[idx][0]; + ref_mv_stack[*refmv_count].comp_mv = comp_list[idx][1]; + ref_mv_weight[*refmv_count] = 2; + ++*refmv_count; } } } - assert(refmv_count[ref_frame] >= 2); + assert(*refmv_count >= 2); - for (int idx = 0; idx < refmv_count[ref_frame]; ++idx) { - clamp_mv_ref(&ref_mv_stack[ref_frame][idx].this_mv.as_mv, - xd->n4_w << MI_SIZE_LOG2, xd->n4_h << MI_SIZE_LOG2, xd); - clamp_mv_ref(&ref_mv_stack[ref_frame][idx].comp_mv.as_mv, - xd->n4_w << MI_SIZE_LOG2, xd->n4_h << MI_SIZE_LOG2, xd); + for (int idx = 0; idx < *refmv_count; ++idx) { + clamp_mv_ref(&ref_mv_stack[idx].this_mv.as_mv, xd->width 
<< MI_SIZE_LOG2, + xd->height << MI_SIZE_LOG2, xd); + clamp_mv_ref(&ref_mv_stack[idx].comp_mv.as_mv, xd->width << MI_SIZE_LOG2, + xd->height << MI_SIZE_LOG2, xd); } } else { // Handle single reference frame extension - int mi_width = AOMMIN(mi_size_wide[BLOCK_64X64], xd->n4_w); - mi_width = AOMMIN(mi_width, cm->mi_cols - mi_col); - int mi_height = AOMMIN(mi_size_high[BLOCK_64X64], xd->n4_h); - mi_height = AOMMIN(mi_height, cm->mi_rows - mi_row); - int mi_size = AOMMIN(mi_width, mi_height); - for (int idx = 0; abs(max_row_offset) >= 1 && idx < mi_size && - refmv_count[ref_frame] < MAX_MV_REF_CANDIDATES;) { + *refmv_count < MAX_MV_REF_CANDIDATES;) { const MB_MODE_INFO *const candidate = xd->mi[-xd->mi_stride + idx]; process_single_ref_mv_candidate(candidate, cm, ref_frame, refmv_count, - ref_mv_stack); + ref_mv_stack, ref_mv_weight); idx += mi_size_wide[candidate->sb_type]; } for (int idx = 0; abs(max_col_offset) >= 1 && idx < mi_size && - refmv_count[ref_frame] < MAX_MV_REF_CANDIDATES;) { + *refmv_count < MAX_MV_REF_CANDIDATES;) { const MB_MODE_INFO *const candidate = xd->mi[idx * xd->mi_stride - 1]; process_single_ref_mv_candidate(candidate, cm, ref_frame, refmv_count, - ref_mv_stack); + ref_mv_stack, ref_mv_weight); idx += mi_size_high[candidate->sb_type]; } - for (int idx = 0; idx < refmv_count[ref_frame]; ++idx) { - clamp_mv_ref(&ref_mv_stack[ref_frame][idx].this_mv.as_mv, - xd->n4_w << MI_SIZE_LOG2, xd->n4_h << MI_SIZE_LOG2, xd); + for (int idx = 0; idx < *refmv_count; ++idx) { + clamp_mv_ref(&ref_mv_stack[idx].this_mv.as_mv, xd->width << MI_SIZE_LOG2, + xd->height << MI_SIZE_LOG2, xd); } if (mv_ref_list != NULL) { - for (int idx = refmv_count[ref_frame]; idx < MAX_MV_REF_CANDIDATES; ++idx) - mv_ref_list[rf[0]][idx].as_int = gm_mv_candidates[0].as_int; + for (int idx = *refmv_count; idx < MAX_MV_REF_CANDIDATES; ++idx) + mv_ref_list[idx].as_int = gm_mv_candidates[0].as_int; - for (int idx = 0; - idx < AOMMIN(MAX_MV_REF_CANDIDATES, refmv_count[ref_frame]); ++idx) 
{ - mv_ref_list[rf[0]][idx].as_int = - ref_mv_stack[ref_frame][idx].this_mv.as_int; + for (int idx = 0; idx < AOMMIN(MAX_MV_REF_CANDIDATES, *refmv_count); + ++idx) { + mv_ref_list[idx].as_int = ref_mv_stack[idx].this_mv.as_int; } } } @@ -810,43 +782,44 @@ void av1_find_mv_refs(const AV1_COMMON *cm, const MACROBLOCKD *xd, MB_MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame, uint8_t ref_mv_count[MODE_CTX_REF_FRAMES], CANDIDATE_MV ref_mv_stack[][MAX_REF_MV_STACK_SIZE], + uint16_t ref_mv_weight[][MAX_REF_MV_STACK_SIZE], int_mv mv_ref_list[][MAX_MV_REF_CANDIDATES], - int_mv *global_mvs, int mi_row, int mi_col, - int16_t *mode_context) { - int_mv zeromv[2]; - BLOCK_SIZE bsize = mi->sb_type; - MV_REFERENCE_FRAME rf[2]; - av1_set_ref_frame(rf, ref_frame); - - if (ref_frame < REF_FRAMES) { - if (ref_frame != INTRA_FRAME) { - global_mvs[ref_frame] = gm_get_motion_vector( - &cm->global_motion[ref_frame], cm->allow_high_precision_mv, bsize, - mi_col, mi_row, cm->cur_frame_force_integer_mv); - } else { + int_mv *global_mvs, int16_t *mode_context) { + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + int_mv gm_mv[2]; + + if (ref_frame == INTRA_FRAME) { + gm_mv[0].as_int = gm_mv[1].as_int = 0; + if (global_mvs != NULL) { global_mvs[ref_frame].as_int = INVALID_MV; } - } - - if (ref_frame != INTRA_FRAME) { - zeromv[0].as_int = - gm_get_motion_vector(&cm->global_motion[rf[0]], - cm->allow_high_precision_mv, bsize, mi_col, mi_row, - cm->cur_frame_force_integer_mv) - .as_int; - zeromv[1].as_int = - (rf[1] != NONE_FRAME) - ? 
gm_get_motion_vector(&cm->global_motion[rf[1]], - cm->allow_high_precision_mv, bsize, mi_col, - mi_row, cm->cur_frame_force_integer_mv) - .as_int - : 0; } else { - zeromv[0].as_int = zeromv[1].as_int = 0; + const BLOCK_SIZE bsize = mi->sb_type; + const int allow_high_precision_mv = cm->features.allow_high_precision_mv; + const int force_integer_mv = cm->features.cur_frame_force_integer_mv; + if (ref_frame < REF_FRAMES) { + gm_mv[0] = gm_get_motion_vector(&cm->global_motion[ref_frame], + allow_high_precision_mv, bsize, mi_col, + mi_row, force_integer_mv); + gm_mv[1].as_int = 0; + if (global_mvs != NULL) global_mvs[ref_frame] = gm_mv[0]; + } else { + MV_REFERENCE_FRAME rf[2]; + av1_set_ref_frame(rf, ref_frame); + gm_mv[0] = gm_get_motion_vector(&cm->global_motion[rf[0]], + allow_high_precision_mv, bsize, mi_col, + mi_row, force_integer_mv); + gm_mv[1] = gm_get_motion_vector(&cm->global_motion[rf[1]], + allow_high_precision_mv, bsize, mi_col, + mi_row, force_integer_mv); + } } - setup_ref_mv_list(cm, xd, ref_frame, ref_mv_count, ref_mv_stack, mv_ref_list, - zeromv, mi_row, mi_col, mode_context); + setup_ref_mv_list(cm, xd, ref_frame, &ref_mv_count[ref_frame], + ref_mv_stack[ref_frame], ref_mv_weight[ref_frame], + mv_ref_list ? 
mv_ref_list[ref_frame] : NULL, gm_mv, mi_row, + mi_col, mode_context); } void av1_find_best_ref_mvs(int allow_hp, int_mv *mvlist, int_mv *nearest_mv, @@ -861,26 +834,29 @@ void av1_find_best_ref_mvs(int allow_hp, int_mv *mvlist, int_mv *nearest_mv, } void av1_setup_frame_buf_refs(AV1_COMMON *cm) { - cm->cur_frame->cur_frame_offset = cm->frame_offset; + cm->cur_frame->order_hint = cm->current_frame.order_hint; + cm->cur_frame->display_order_hint = cm->current_frame.display_order_hint; MV_REFERENCE_FRAME ref_frame; for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { - const int buf_idx = cm->frame_refs[ref_frame - LAST_FRAME].idx; - if (buf_idx >= 0) - cm->cur_frame->ref_frame_offset[ref_frame - LAST_FRAME] = - cm->buffer_pool->frame_bufs[buf_idx].cur_frame_offset; + const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame); + if (buf != NULL) { + cm->cur_frame->ref_order_hints[ref_frame - LAST_FRAME] = buf->order_hint; + cm->cur_frame->ref_display_order_hint[ref_frame - LAST_FRAME] = + buf->display_order_hint; + } } } void av1_setup_frame_sign_bias(AV1_COMMON *cm) { MV_REFERENCE_FRAME ref_frame; for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { - const int buf_idx = cm->frame_refs[ref_frame - LAST_FRAME].idx; - if (cm->seq_params.enable_order_hint && buf_idx != INVALID_IDX) { - const int ref_frame_offset = - cm->buffer_pool->frame_bufs[buf_idx].cur_frame_offset; + const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame); + if (cm->seq_params.order_hint_info.enable_order_hint && buf != NULL) { + const int ref_order_hint = buf->order_hint; cm->ref_frame_sign_bias[ref_frame] = - (get_relative_dist(cm, ref_frame_offset, (int)cm->frame_offset) <= 0) + (get_relative_dist(&cm->seq_params.order_hint_info, ref_order_hint, + (int)cm->current_frame.order_hint) <= 0) ? 0 : 1; } else { @@ -908,8 +884,8 @@ static int get_block_position(AV1_COMMON *cm, int *mi_r, int *mi_c, int blk_row, const int col = (sign_bias == 1) ? 
blk_col - col_offset : blk_col + col_offset; - if (row < 0 || row >= (cm->mi_rows >> 1) || col < 0 || - col >= (cm->mi_cols >> 1)) + if (row < 0 || row >= (cm->mi_params.mi_rows >> 1) || col < 0 || + col >= (cm->mi_params.mi_cols >> 1)) return 0; if (row < base_blk_row - (MAX_OFFSET_HEIGHT >> 3) || @@ -935,35 +911,36 @@ static int motion_field_projection(AV1_COMMON *cm, TPL_MV_REF *tpl_mvs_base = cm->tpl_mvs; int ref_offset[REF_FRAMES] = { 0 }; - (void)dir; - - const int start_frame_idx = cm->frame_refs[FWD_RF_OFFSET(start_frame)].idx; - if (start_frame_idx < 0) return 0; + const RefCntBuffer *const start_frame_buf = + get_ref_frame_buf(cm, start_frame); + if (start_frame_buf == NULL) return 0; - if (cm->buffer_pool->frame_bufs[start_frame_idx].intra_only) return 0; + if (start_frame_buf->frame_type == KEY_FRAME || + start_frame_buf->frame_type == INTRA_ONLY_FRAME) + return 0; - if (cm->buffer_pool->frame_bufs[start_frame_idx].mi_rows != cm->mi_rows || - cm->buffer_pool->frame_bufs[start_frame_idx].mi_cols != cm->mi_cols) + if (start_frame_buf->mi_rows != cm->mi_params.mi_rows || + start_frame_buf->mi_cols != cm->mi_params.mi_cols) return 0; - const int start_frame_offset = - cm->buffer_pool->frame_bufs[start_frame_idx].cur_frame_offset; - const unsigned int *const ref_frame_offsets = - &cm->buffer_pool->frame_bufs[start_frame_idx].ref_frame_offset[0]; - const int cur_frame_offset = cm->cur_frame->cur_frame_offset; - int start_to_current_frame_offset = - get_relative_dist(cm, start_frame_offset, cur_frame_offset); + const int start_frame_order_hint = start_frame_buf->order_hint; + const unsigned int *const ref_order_hints = + &start_frame_buf->ref_order_hints[0]; + const int cur_order_hint = cm->cur_frame->order_hint; + int start_to_current_frame_offset = get_relative_dist( + &cm->seq_params.order_hint_info, start_frame_order_hint, cur_order_hint); for (MV_REFERENCE_FRAME rf = LAST_FRAME; rf <= INTER_REFS_PER_FRAME; ++rf) { - ref_offset[rf] = get_relative_dist(cm, 
start_frame_offset, - ref_frame_offsets[rf - LAST_FRAME]); + ref_offset[rf] = get_relative_dist(&cm->seq_params.order_hint_info, + start_frame_order_hint, + ref_order_hints[rf - LAST_FRAME]); } if (dir == 2) start_to_current_frame_offset = -start_to_current_frame_offset; - MV_REF *mv_ref_base = cm->buffer_pool->frame_bufs[start_frame_idx].mvs; - const int mvs_rows = (cm->mi_rows + 1) >> 1; - const int mvs_cols = (cm->mi_cols + 1) >> 1; + MV_REF *mv_ref_base = start_frame_buf->mvs; + const int mvs_rows = (cm->mi_params.mi_rows + 1) >> 1; + const int mvs_cols = (cm->mi_params.mi_cols + 1) >> 1; for (int blk_row = 0; blk_row < mvs_rows; ++blk_row) { for (int blk_col = 0; blk_col < mvs_cols; ++blk_col) { @@ -988,7 +965,7 @@ static int motion_field_projection(AV1_COMMON *cm, } if (pos_valid) { - const int mi_offset = mi_r * (cm->mi_stride >> 1) + mi_c; + const int mi_offset = mi_r * (cm->mi_params.mi_stride >> 1) + mi_c; tpl_mvs_base[mi_offset].mfmv0.as_mv.row = fwd_mv.row; tpl_mvs_base[mi_offset].mfmv0.as_mv.col = fwd_mv.col; @@ -1002,33 +979,35 @@ static int motion_field_projection(AV1_COMMON *cm, } void av1_setup_motion_field(AV1_COMMON *cm) { + const OrderHintInfo *const order_hint_info = &cm->seq_params.order_hint_info; + memset(cm->ref_frame_side, 0, sizeof(cm->ref_frame_side)); - if (!cm->seq_params.enable_order_hint) return; + if (!order_hint_info->enable_order_hint) return; TPL_MV_REF *tpl_mvs_base = cm->tpl_mvs; - int size = ((cm->mi_rows + MAX_MIB_SIZE) >> 1) * (cm->mi_stride >> 1); + int size = ((cm->mi_params.mi_rows + MAX_MIB_SIZE) >> 1) * + (cm->mi_params.mi_stride >> 1); for (int idx = 0; idx < size; ++idx) { tpl_mvs_base[idx].mfmv0.as_int = INVALID_MV; tpl_mvs_base[idx].ref_frame_offset = 0; } - const int cur_order_hint = cm->cur_frame->cur_frame_offset; - RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs; + const int cur_order_hint = cm->cur_frame->order_hint; - int ref_buf_idx[INTER_REFS_PER_FRAME]; + const RefCntBuffer 
*ref_buf[INTER_REFS_PER_FRAME]; int ref_order_hint[INTER_REFS_PER_FRAME]; for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) { const int ref_idx = ref_frame - LAST_FRAME; - const int buf_idx = cm->frame_refs[ref_idx].idx; + const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame); int order_hint = 0; - if (buf_idx >= 0) order_hint = frame_bufs[buf_idx].cur_frame_offset; + if (buf != NULL) order_hint = buf->order_hint; - ref_buf_idx[ref_idx] = buf_idx; + ref_buf[ref_idx] = buf; ref_order_hint[ref_idx] = order_hint; - if (get_relative_dist(cm, order_hint, cur_order_hint) > 0) + if (get_relative_dist(order_hint_info, order_hint, cur_order_hint) > 0) cm->ref_frame_side[ref_frame] = 1; else if (order_hint == cur_order_hint) cm->ref_frame_side[ref_frame] = -1; @@ -1036,10 +1015,10 @@ void av1_setup_motion_field(AV1_COMMON *cm) { int ref_stamp = MFMV_STACK_SIZE - 1; - if (ref_buf_idx[LAST_FRAME - LAST_FRAME] >= 0) { + if (ref_buf[LAST_FRAME - LAST_FRAME] != NULL) { const int alt_of_lst_order_hint = - frame_bufs[ref_buf_idx[LAST_FRAME - LAST_FRAME]] - .ref_frame_offset[ALTREF_FRAME - LAST_FRAME]; + ref_buf[LAST_FRAME - LAST_FRAME] + ->ref_order_hints[ALTREF_FRAME - LAST_FRAME]; const int is_lst_overlay = (alt_of_lst_order_hint == ref_order_hint[GOLDEN_FRAME - LAST_FRAME]); @@ -1047,47 +1026,50 @@ void av1_setup_motion_field(AV1_COMMON *cm) { --ref_stamp; } - if (get_relative_dist(cm, ref_order_hint[BWDREF_FRAME - LAST_FRAME], + if (get_relative_dist(order_hint_info, + ref_order_hint[BWDREF_FRAME - LAST_FRAME], cur_order_hint) > 0) { if (motion_field_projection(cm, BWDREF_FRAME, 0)) --ref_stamp; } - if (get_relative_dist(cm, ref_order_hint[ALTREF2_FRAME - LAST_FRAME], + if (get_relative_dist(order_hint_info, + ref_order_hint[ALTREF2_FRAME - LAST_FRAME], cur_order_hint) > 0) { if (motion_field_projection(cm, ALTREF2_FRAME, 0)) --ref_stamp; } - if (get_relative_dist(cm, ref_order_hint[ALTREF_FRAME - LAST_FRAME], + if 
(get_relative_dist(order_hint_info, + ref_order_hint[ALTREF_FRAME - LAST_FRAME], cur_order_hint) > 0 && ref_stamp >= 0) if (motion_field_projection(cm, ALTREF_FRAME, 0)) --ref_stamp; - if (ref_stamp >= 0 && ref_buf_idx[LAST2_FRAME - LAST_FRAME] >= 0) - if (motion_field_projection(cm, LAST2_FRAME, 2)) --ref_stamp; + if (ref_stamp >= 0) motion_field_projection(cm, LAST2_FRAME, 2); } -static INLINE void record_samples(MB_MODE_INFO *mbmi, int *pts, int *pts_inref, - int row_offset, int sign_r, int col_offset, - int sign_c) { +static INLINE void record_samples(const MB_MODE_INFO *mbmi, int *pts, + int *pts_inref, int row_offset, int sign_r, + int col_offset, int sign_c) { int bw = block_size_wide[mbmi->sb_type]; int bh = block_size_high[mbmi->sb_type]; int x = col_offset * MI_SIZE + sign_c * AOMMAX(bw, MI_SIZE) / 2 - 1; int y = row_offset * MI_SIZE + sign_r * AOMMAX(bh, MI_SIZE) / 2 - 1; - pts[0] = (x * 8); - pts[1] = (y * 8); - pts_inref[0] = (x * 8) + mbmi->mv[0].as_mv.col; - pts_inref[1] = (y * 8) + mbmi->mv[0].as_mv.row; + pts[0] = GET_MV_SUBPEL(x); + pts[1] = GET_MV_SUBPEL(y); + pts_inref[0] = GET_MV_SUBPEL(x) + mbmi->mv[0].as_mv.col; + pts_inref[1] = GET_MV_SUBPEL(y) + mbmi->mv[0].as_mv.row; } // Select samples according to the motion vector difference. -int selectSamples(MV *mv, int *pts, int *pts_inref, int len, BLOCK_SIZE bsize) { +uint8_t av1_selectSamples(MV *mv, int *pts, int *pts_inref, int len, + BLOCK_SIZE bsize) { const int bw = block_size_wide[bsize]; const int bh = block_size_high[bsize]; const int thresh = clamp(AOMMAX(bw, bh), 16, 112); int pts_mvd[SAMPLES_ARRAY_SIZE] = { 0 }; int i, j, k, l = len; - int ret = 0; + uint8_t ret = 0; assert(len <= LEAST_SQUARES_SAMPLES_MAX); // Obtain the motion vector difference. 
@@ -1128,30 +1110,32 @@ int selectSamples(MV *mv, int *pts, int *pts_inref, int len, BLOCK_SIZE bsize) { // Note: Samples returned are at 1/8-pel precision // Sample are the neighbor block center point's coordinates relative to the // left-top pixel of current block. -int findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col, - int *pts, int *pts_inref) { - MB_MODE_INFO *const mbmi0 = xd->mi[0]; - int ref_frame = mbmi0->ref_frame[0]; - int up_available = xd->up_available; - int left_available = xd->left_available; - int i, mi_step = 1, np = 0; - - const TileInfo *const tile = &xd->tile; +uint8_t av1_findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int *pts, + int *pts_inref) { + const MB_MODE_INFO *const mbmi0 = xd->mi[0]; + const int ref_frame = mbmi0->ref_frame[0]; + const int up_available = xd->up_available; + const int left_available = xd->left_available; + int i, mi_step; + uint8_t np = 0; int do_tl = 1; int do_tr = 1; + const int mi_stride = xd->mi_stride; + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; // scan the nearest above rows if (up_available) { - int mi_row_offset = -1; - MB_MODE_INFO *mbmi = xd->mi[mi_row_offset * xd->mi_stride]; - uint8_t n4_w = mi_size_wide[mbmi->sb_type]; + const int mi_row_offset = -1; + const MB_MODE_INFO *mbmi = xd->mi[mi_row_offset * mi_stride]; + uint8_t superblock_width = mi_size_wide[mbmi->sb_type]; - if (xd->n4_w <= n4_w) { + if (xd->width <= superblock_width) { // Handle "current block width <= above block width" case. 
- int col_offset = -mi_col % n4_w; + const int col_offset = -mi_col % superblock_width; if (col_offset < 0) do_tl = 0; - if (col_offset + n4_w > xd->n4_w) do_tr = 0; + if (col_offset + superblock_width > xd->width) do_tr = 0; if (mbmi->ref_frame[0] == ref_frame && mbmi->ref_frame[1] == NONE_FRAME) { record_samples(mbmi, pts, pts_inref, 0, -1, col_offset, 1); @@ -1162,11 +1146,11 @@ int findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col, } } else { // Handle "current block width > above block width" case. - for (i = 0; i < AOMMIN(xd->n4_w, cm->mi_cols - mi_col); i += mi_step) { - int mi_col_offset = i; - mbmi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride]; - n4_w = mi_size_wide[mbmi->sb_type]; - mi_step = AOMMIN(xd->n4_w, n4_w); + for (i = 0; i < AOMMIN(xd->width, cm->mi_params.mi_cols - mi_col); + i += mi_step) { + mbmi = xd->mi[i + mi_row_offset * mi_stride]; + superblock_width = mi_size_wide[mbmi->sb_type]; + mi_step = AOMMIN(xd->width, superblock_width); if (mbmi->ref_frame[0] == ref_frame && mbmi->ref_frame[1] == NONE_FRAME) { @@ -1183,14 +1167,13 @@ int findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col, // scan the nearest left columns if (left_available) { - int mi_col_offset = -1; - - MB_MODE_INFO *mbmi = xd->mi[mi_col_offset]; - uint8_t n4_h = mi_size_high[mbmi->sb_type]; + const int mi_col_offset = -1; + const MB_MODE_INFO *mbmi = xd->mi[mi_col_offset]; + uint8_t superblock_height = mi_size_high[mbmi->sb_type]; - if (xd->n4_h <= n4_h) { + if (xd->height <= superblock_height) { // Handle "current block height <= above block height" case. - int row_offset = -mi_row % n4_h; + const int row_offset = -mi_row % superblock_height; if (row_offset < 0) do_tl = 0; @@ -1203,11 +1186,11 @@ int findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col, } } else { // Handle "current block height > above block height" case. 
- for (i = 0; i < AOMMIN(xd->n4_h, cm->mi_rows - mi_row); i += mi_step) { - int mi_row_offset = i; - mbmi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride]; - n4_h = mi_size_high[mbmi->sb_type]; - mi_step = AOMMIN(xd->n4_h, n4_h); + for (i = 0; i < AOMMIN(xd->height, cm->mi_params.mi_rows - mi_row); + i += mi_step) { + mbmi = xd->mi[mi_col_offset + i * mi_stride]; + superblock_height = mi_size_high[mbmi->sb_type]; + mi_step = AOMMIN(xd->height, superblock_height); if (mbmi->ref_frame[0] == ref_frame && mbmi->ref_frame[1] == NONE_FRAME) { @@ -1224,10 +1207,9 @@ int findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col, // Top-left block if (do_tl && left_available && up_available) { - int mi_row_offset = -1; - int mi_col_offset = -1; - - MB_MODE_INFO *mbmi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride]; + const int mi_row_offset = -1; + const int mi_col_offset = -1; + MB_MODE_INFO *mbmi = xd->mi[mi_col_offset + mi_row_offset * mi_stride]; if (mbmi->ref_frame[0] == ref_frame && mbmi->ref_frame[1] == NONE_FRAME) { record_samples(mbmi, pts, pts_inref, 0, -1, 0, -1); @@ -1241,18 +1223,17 @@ int findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col, // Top-right block if (do_tr && - has_top_right(cm, xd, mi_row, mi_col, AOMMAX(xd->n4_w, xd->n4_h))) { - POSITION trb_pos = { -1, xd->n4_w }; - + has_top_right(cm, xd, mi_row, mi_col, AOMMAX(xd->width, xd->height))) { + const POSITION trb_pos = { -1, xd->width }; + const TileInfo *const tile = &xd->tile; if (is_inside(tile, mi_col, mi_row, &trb_pos)) { - int mi_row_offset = -1; - int mi_col_offset = xd->n4_w; - - MB_MODE_INFO *mbmi = - xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride]; + const int mi_row_offset = -1; + const int mi_col_offset = xd->width; + const MB_MODE_INFO *mbmi = + xd->mi[mi_col_offset + mi_row_offset * mi_stride]; if (mbmi->ref_frame[0] == ref_frame && mbmi->ref_frame[1] == NONE_FRAME) { - record_samples(mbmi, pts, pts_inref, 0, -1, xd->n4_w, 1); 
+ record_samples(mbmi, pts, pts_inref, 0, -1, xd->width, 1); np++; if (np >= LEAST_SQUARES_SAMPLES_MAX) return LEAST_SQUARES_SAMPLES_MAX; } @@ -1264,36 +1245,43 @@ int findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col, } void av1_setup_skip_mode_allowed(AV1_COMMON *cm) { - cm->is_skip_mode_allowed = 0; - cm->ref_frame_idx_0 = cm->ref_frame_idx_1 = INVALID_IDX; + const OrderHintInfo *const order_hint_info = &cm->seq_params.order_hint_info; + SkipModeInfo *const skip_mode_info = &cm->current_frame.skip_mode_info; - if (!cm->seq_params.enable_order_hint || frame_is_intra_only(cm) || - cm->reference_mode == SINGLE_REFERENCE) + skip_mode_info->skip_mode_allowed = 0; + skip_mode_info->ref_frame_idx_0 = INVALID_IDX; + skip_mode_info->ref_frame_idx_1 = INVALID_IDX; + + if (!order_hint_info->enable_order_hint || frame_is_intra_only(cm) || + cm->current_frame.reference_mode == SINGLE_REFERENCE) return; - RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs; - const int cur_frame_offset = cm->frame_offset; - int ref_frame_offset[2] = { -1, INT_MAX }; + const int cur_order_hint = cm->current_frame.order_hint; + int ref_order_hints[2] = { -1, INT_MAX }; int ref_idx[2] = { INVALID_IDX, INVALID_IDX }; // Identify the nearest forward and backward references. 
for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) { - const int buf_idx = cm->frame_refs[i].idx; - if (buf_idx == INVALID_IDX) continue; + const RefCntBuffer *const buf = get_ref_frame_buf(cm, LAST_FRAME + i); + if (buf == NULL) continue; - const int ref_offset = frame_bufs[buf_idx].cur_frame_offset; - if (get_relative_dist(cm, ref_offset, cur_frame_offset) < 0) { + const int ref_order_hint = buf->order_hint; + if (get_relative_dist(order_hint_info, ref_order_hint, cur_order_hint) < + 0) { // Forward reference - if (ref_frame_offset[0] == -1 || - get_relative_dist(cm, ref_offset, ref_frame_offset[0]) > 0) { - ref_frame_offset[0] = ref_offset; + if (ref_order_hints[0] == -1 || + get_relative_dist(order_hint_info, ref_order_hint, + ref_order_hints[0]) > 0) { + ref_order_hints[0] = ref_order_hint; ref_idx[0] = i; } - } else if (get_relative_dist(cm, ref_offset, cur_frame_offset) > 0) { + } else if (get_relative_dist(order_hint_info, ref_order_hint, + cur_order_hint) > 0) { // Backward reference - if (ref_frame_offset[1] == INT_MAX || - get_relative_dist(cm, ref_offset, ref_frame_offset[1]) < 0) { - ref_frame_offset[1] = ref_offset; + if (ref_order_hints[1] == INT_MAX || + get_relative_dist(order_hint_info, ref_order_hint, + ref_order_hints[1]) < 0) { + ref_order_hints[1] = ref_order_hint; ref_idx[1] = i; } } @@ -1301,75 +1289,71 @@ void av1_setup_skip_mode_allowed(AV1_COMMON *cm) { if (ref_idx[0] != INVALID_IDX && ref_idx[1] != INVALID_IDX) { // == Bi-directional prediction == - cm->is_skip_mode_allowed = 1; - cm->ref_frame_idx_0 = AOMMIN(ref_idx[0], ref_idx[1]); - cm->ref_frame_idx_1 = AOMMAX(ref_idx[0], ref_idx[1]); + skip_mode_info->skip_mode_allowed = 1; + skip_mode_info->ref_frame_idx_0 = AOMMIN(ref_idx[0], ref_idx[1]); + skip_mode_info->ref_frame_idx_1 = AOMMAX(ref_idx[0], ref_idx[1]); } else if (ref_idx[0] != INVALID_IDX && ref_idx[1] == INVALID_IDX) { // == Forward prediction only == // Identify the second nearest forward reference. 
- ref_frame_offset[1] = -1; + ref_order_hints[1] = -1; for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) { - const int buf_idx = cm->frame_refs[i].idx; - if (buf_idx == INVALID_IDX) continue; - - const int ref_offset = frame_bufs[buf_idx].cur_frame_offset; - if ((ref_frame_offset[0] != -1 && - get_relative_dist(cm, ref_offset, ref_frame_offset[0]) < 0) && - (ref_frame_offset[1] == -1 || - get_relative_dist(cm, ref_offset, ref_frame_offset[1]) > 0)) { + const RefCntBuffer *const buf = get_ref_frame_buf(cm, LAST_FRAME + i); + if (buf == NULL) continue; + + const int ref_order_hint = buf->order_hint; + if ((ref_order_hints[0] != -1 && + get_relative_dist(order_hint_info, ref_order_hint, + ref_order_hints[0]) < 0) && + (ref_order_hints[1] == -1 || + get_relative_dist(order_hint_info, ref_order_hint, + ref_order_hints[1]) > 0)) { // Second closest forward reference - ref_frame_offset[1] = ref_offset; + ref_order_hints[1] = ref_order_hint; ref_idx[1] = i; } } - if (ref_frame_offset[1] != -1) { - cm->is_skip_mode_allowed = 1; - cm->ref_frame_idx_0 = AOMMIN(ref_idx[0], ref_idx[1]); - cm->ref_frame_idx_1 = AOMMAX(ref_idx[0], ref_idx[1]); + if (ref_order_hints[1] != -1) { + skip_mode_info->skip_mode_allowed = 1; + skip_mode_info->ref_frame_idx_0 = AOMMIN(ref_idx[0], ref_idx[1]); + skip_mode_info->ref_frame_idx_1 = AOMMAX(ref_idx[0], ref_idx[1]); } } } typedef struct { - int map_idx; // frame map index - int buf_idx; // frame buffer index - int sort_idx; // index based on the offset to be used for sorting + int map_idx; // frame map index + RefCntBuffer *buf; // frame buffer + int sort_idx; // index based on the offset to be used for sorting } REF_FRAME_INFO; +// Compares the sort_idx fields. If they are equal, then compares the map_idx +// fields to break the tie. This ensures a stable sort. 
static int compare_ref_frame_info(const void *arg_a, const void *arg_b) { const REF_FRAME_INFO *info_a = (REF_FRAME_INFO *)arg_a; const REF_FRAME_INFO *info_b = (REF_FRAME_INFO *)arg_b; - if (info_a->sort_idx < info_b->sort_idx) return -1; - if (info_a->sort_idx > info_b->sort_idx) return 1; - return (info_a->map_idx < info_b->map_idx) - ? -1 - : ((info_a->map_idx > info_b->map_idx) ? 1 : 0); + const int sort_idx_diff = info_a->sort_idx - info_b->sort_idx; + if (sort_idx_diff != 0) return sort_idx_diff; + return info_a->map_idx - info_b->map_idx; } -static void set_ref_frame_info(AV1_COMMON *const cm, int frame_idx, - REF_FRAME_INFO *ref_info) { +static AOM_INLINE void set_ref_frame_info(int *remapped_ref_idx, int frame_idx, + REF_FRAME_INFO *ref_info) { assert(frame_idx >= 0 && frame_idx < INTER_REFS_PER_FRAME); - const int buf_idx = ref_info->buf_idx; - - cm->frame_refs[frame_idx].idx = buf_idx; - cm->frame_refs[frame_idx].buf = &cm->buffer_pool->frame_bufs[buf_idx].buf; - cm->frame_refs[frame_idx].map_idx = ref_info->map_idx; + remapped_ref_idx[frame_idx] = ref_info->map_idx; } -void av1_set_frame_refs(AV1_COMMON *const cm, int lst_map_idx, - int gld_map_idx) { - BufferPool *const pool = cm->buffer_pool; - RefCntBuffer *const frame_bufs = pool->frame_bufs; - +void av1_set_frame_refs(AV1_COMMON *const cm, int *remapped_ref_idx, + int lst_map_idx, int gld_map_idx) { int lst_frame_sort_idx = -1; int gld_frame_sort_idx = -1; - assert(cm->seq_params.enable_order_hint); - assert(cm->seq_params.order_hint_bits_minus_1 >= 0); - const int cur_frame_offset = (int)cm->frame_offset; - const int cur_frame_sort_idx = 1 << cm->seq_params.order_hint_bits_minus_1; + assert(cm->seq_params.order_hint_info.enable_order_hint); + assert(cm->seq_params.order_hint_info.order_hint_bits_minus_1 >= 0); + const int cur_order_hint = (int)cm->current_frame.order_hint; + const int cur_frame_sort_idx = + 1 << cm->seq_params.order_hint_info.order_hint_bits_minus_1; REF_FRAME_INFO 
ref_frame_info[REF_FRAMES]; int ref_flag_list[INTER_REFS_PER_FRAME] = { 0, 0, 0, 0, 0, 0, 0 }; @@ -1380,18 +1364,19 @@ void av1_set_frame_refs(AV1_COMMON *const cm, int lst_map_idx, ref_frame_info[i].map_idx = map_idx; ref_frame_info[i].sort_idx = -1; - const int buf_idx = cm->ref_frame_map[map_idx]; - ref_frame_info[i].buf_idx = buf_idx; + RefCntBuffer *const buf = cm->ref_frame_map[map_idx]; + ref_frame_info[i].buf = buf; - if (buf_idx < 0 || buf_idx >= FRAME_BUFFERS) continue; - // TODO(zoeliu@google.com): To verify the checking on ref_count. - if (frame_bufs[buf_idx].ref_count <= 0) continue; + if (buf == NULL) continue; + // If this assertion fails, there is a reference leak. + assert(buf->ref_count > 0); - const int offset = (int)frame_bufs[buf_idx].cur_frame_offset; + const int offset = (int)buf->order_hint; ref_frame_info[i].sort_idx = (offset == -1) ? -1 : cur_frame_sort_idx + - get_relative_dist(cm, offset, cur_frame_offset); + get_relative_dist(&cm->seq_params.order_hint_info, + offset, cur_order_hint); assert(ref_frame_info[i].sort_idx >= -1); if (map_idx == lst_map_idx) lst_frame_sort_idx = ref_frame_info[i].sort_idx; @@ -1414,8 +1399,8 @@ void av1_set_frame_refs(AV1_COMMON *const cm, int lst_map_idx, compare_ref_frame_info); // Identify forward and backward reference frames. 
- // Forward reference: offset < cur_frame_offset - // Backward reference: offset >= cur_frame_offset + // Forward reference: offset < order_hint + // Backward reference: offset >= order_hint int fwd_start_idx = 0, fwd_end_idx = REF_FRAMES - 1; for (int i = 0; i < REF_FRAMES; i++) { @@ -1437,7 +1422,7 @@ void av1_set_frame_refs(AV1_COMMON *const cm, int lst_map_idx, // == ALTREF_FRAME == if (bwd_start_idx <= bwd_end_idx) { - set_ref_frame_info(cm, ALTREF_FRAME - LAST_FRAME, + set_ref_frame_info(remapped_ref_idx, ALTREF_FRAME - LAST_FRAME, &ref_frame_info[bwd_end_idx]); ref_flag_list[ALTREF_FRAME - LAST_FRAME] = 1; bwd_end_idx--; @@ -1445,7 +1430,7 @@ void av1_set_frame_refs(AV1_COMMON *const cm, int lst_map_idx, // == BWDREF_FRAME == if (bwd_start_idx <= bwd_end_idx) { - set_ref_frame_info(cm, BWDREF_FRAME - LAST_FRAME, + set_ref_frame_info(remapped_ref_idx, BWDREF_FRAME - LAST_FRAME, &ref_frame_info[bwd_start_idx]); ref_flag_list[BWDREF_FRAME - LAST_FRAME] = 1; bwd_start_idx++; @@ -1453,7 +1438,7 @@ void av1_set_frame_refs(AV1_COMMON *const cm, int lst_map_idx, // == ALTREF2_FRAME == if (bwd_start_idx <= bwd_end_idx) { - set_ref_frame_info(cm, ALTREF2_FRAME - LAST_FRAME, + set_ref_frame_info(remapped_ref_idx, ALTREF2_FRAME - LAST_FRAME, &ref_frame_info[bwd_start_idx]); ref_flag_list[ALTREF2_FRAME - LAST_FRAME] = 1; } @@ -1463,13 +1448,15 @@ void av1_set_frame_refs(AV1_COMMON *const cm, int lst_map_idx, for (int i = fwd_start_idx; i <= fwd_end_idx; ++i) { // == LAST_FRAME == if (ref_frame_info[i].map_idx == lst_map_idx) { - set_ref_frame_info(cm, LAST_FRAME - LAST_FRAME, &ref_frame_info[i]); + set_ref_frame_info(remapped_ref_idx, LAST_FRAME - LAST_FRAME, + &ref_frame_info[i]); ref_flag_list[LAST_FRAME - LAST_FRAME] = 1; } // == GOLDEN_FRAME == if (ref_frame_info[i].map_idx == gld_map_idx) { - set_ref_frame_info(cm, GOLDEN_FRAME - LAST_FRAME, &ref_frame_info[i]); + set_ref_frame_info(remapped_ref_idx, GOLDEN_FRAME - LAST_FRAME, + &ref_frame_info[i]); 
ref_flag_list[GOLDEN_FRAME - LAST_FRAME] = 1; } } @@ -1501,18 +1488,19 @@ void av1_set_frame_refs(AV1_COMMON *const cm, int lst_map_idx, } if (fwd_start_idx > fwd_end_idx) break; - set_ref_frame_info(cm, ref_frame - LAST_FRAME, + set_ref_frame_info(remapped_ref_idx, ref_frame - LAST_FRAME, &ref_frame_info[fwd_end_idx]); ref_flag_list[ref_frame - LAST_FRAME] = 1; fwd_end_idx--; } - // Assign all the remaining frame(s), if any, to the earliest reference frame. + // Assign all the remaining frame(s), if any, to the earliest reference + // frame. for (; ref_idx < (INTER_REFS_PER_FRAME - 2); ref_idx++) { const MV_REFERENCE_FRAME ref_frame = ref_frame_list[ref_idx]; if (ref_flag_list[ref_frame - LAST_FRAME] == 1) continue; - set_ref_frame_info(cm, ref_frame - LAST_FRAME, + set_ref_frame_info(remapped_ref_idx, ref_frame - LAST_FRAME, &ref_frame_info[fwd_start_idx]); ref_flag_list[ref_frame - LAST_FRAME] = 1; } diff --git a/media/libaom/src/av1/common/mvref_common.h b/media/libaom/src/av1/common/mvref_common.h index 83f7a1ac0..05a0dbc04 100644 --- a/media/libaom/src/av1/common/mvref_common.h +++ b/media/libaom/src/av1/common/mvref_common.h @@ -11,7 +11,7 @@ #ifndef AOM_AV1_COMMON_MVREF_COMMON_H_ #define AOM_AV1_COMMON_MVREF_COMMON_H_ -#include "av1/common/onyxc_int.h" +#include "av1/common/av1_common_int.h" #include "av1/common/blockd.h" #ifdef __cplusplus @@ -34,10 +34,10 @@ typedef struct position { // clamp_mv_ref #define MV_BORDER (16 << 3) // Allow 16 pels in 1/8th pel units -static INLINE int get_relative_dist(const AV1_COMMON *cm, int a, int b) { - if (!cm->seq_params.enable_order_hint) return 0; +static INLINE int get_relative_dist(const OrderHintInfo *oh, int a, int b) { + if (!oh->enable_order_hint) return 0; - const int bits = cm->seq_params.order_hint_bits_minus_1 + 1; + const int bits = oh->order_hint_bits_minus_1 + 1; assert(bits >= 1); assert(a >= 0 && a < (1 << bits)); @@ -50,38 +50,19 @@ static INLINE int get_relative_dist(const AV1_COMMON *cm, int a, int 
b) { } static INLINE void clamp_mv_ref(MV *mv, int bw, int bh, const MACROBLOCKD *xd) { - clamp_mv(mv, xd->mb_to_left_edge - bw * 8 - MV_BORDER, - xd->mb_to_right_edge + bw * 8 + MV_BORDER, - xd->mb_to_top_edge - bh * 8 - MV_BORDER, - xd->mb_to_bottom_edge + bh * 8 + MV_BORDER); + const SubpelMvLimits mv_limits = { + xd->mb_to_left_edge - GET_MV_SUBPEL(bw) - MV_BORDER, + xd->mb_to_right_edge + GET_MV_SUBPEL(bw) + MV_BORDER, + xd->mb_to_top_edge - GET_MV_SUBPEL(bh) - MV_BORDER, + xd->mb_to_bottom_edge + GET_MV_SUBPEL(bh) + MV_BORDER + }; + clamp_mv(mv, &mv_limits); } -// This function returns either the appropriate sub block or block's mv -// on whether the block_size < 8x8 and we have check_sub_blocks set. -static INLINE int_mv get_sub_block_mv(const MB_MODE_INFO *candidate, - int which_mv, int search_col) { - (void)search_col; +static INLINE int_mv get_block_mv(const MB_MODE_INFO *candidate, int which_mv) { return candidate->mv[which_mv]; } -static INLINE int_mv get_sub_block_pred_mv(const MB_MODE_INFO *candidate, - int which_mv, int search_col) { - (void)search_col; - return candidate->mv[which_mv]; -} - -// Performs mv sign inversion if indicated by the reference frame combination. -static INLINE int_mv scale_mv(const MB_MODE_INFO *mbmi, int ref, - const MV_REFERENCE_FRAME this_ref_frame, - const int *ref_sign_bias) { - int_mv mv = mbmi->mv[ref]; - if (ref_sign_bias[mbmi->ref_frame[ref]] != ref_sign_bias[this_ref_frame]) { - mv.as_mv.row *= -1; - mv.as_mv.col *= -1; - } - return mv; -} - // Checks that the given mi_row, mi_col and search point // are inside the borders of the tile. 
static INLINE int is_inside(const TileInfo *const tile, int mi_col, int mi_row, @@ -169,14 +150,14 @@ static MV_REFERENCE_FRAME ref_frame_map[TOTAL_COMP_REFS][2] = { // clang-format on static INLINE void av1_set_ref_frame(MV_REFERENCE_FRAME *rf, - int8_t ref_frame_type) { + MV_REFERENCE_FRAME ref_frame_type) { if (ref_frame_type >= REF_FRAMES) { rf[0] = ref_frame_map[ref_frame_type - REF_FRAMES][0]; rf[1] = ref_frame_map[ref_frame_type - REF_FRAMES][1]; } else { + assert(ref_frame_type > NONE_FRAME); rf[0] = ref_frame_type; rf[1] = NONE_FRAME; - assert(ref_frame_type > NONE_FRAME); } } @@ -201,18 +182,17 @@ static INLINE int16_t av1_mode_context_analyzer( return comp_ctx; } -static INLINE uint8_t av1_drl_ctx(const CANDIDATE_MV *ref_mv_stack, - int ref_idx) { - if (ref_mv_stack[ref_idx].weight >= REF_CAT_LEVEL && - ref_mv_stack[ref_idx + 1].weight >= REF_CAT_LEVEL) +static INLINE uint8_t av1_drl_ctx(const uint16_t *ref_mv_weight, int ref_idx) { + if (ref_mv_weight[ref_idx] >= REF_CAT_LEVEL && + ref_mv_weight[ref_idx + 1] >= REF_CAT_LEVEL) return 0; - if (ref_mv_stack[ref_idx].weight >= REF_CAT_LEVEL && - ref_mv_stack[ref_idx + 1].weight < REF_CAT_LEVEL) + if (ref_mv_weight[ref_idx] >= REF_CAT_LEVEL && + ref_mv_weight[ref_idx + 1] < REF_CAT_LEVEL) return 1; - if (ref_mv_stack[ref_idx].weight < REF_CAT_LEVEL && - ref_mv_stack[ref_idx + 1].weight < REF_CAT_LEVEL) + if (ref_mv_weight[ref_idx] < REF_CAT_LEVEL && + ref_mv_weight[ref_idx + 1] < REF_CAT_LEVEL) return 2; return 0; @@ -222,7 +202,8 @@ void av1_setup_frame_buf_refs(AV1_COMMON *cm); void av1_setup_frame_sign_bias(AV1_COMMON *cm); void av1_setup_skip_mode_allowed(AV1_COMMON *cm); void av1_setup_motion_field(AV1_COMMON *cm); -void av1_set_frame_refs(AV1_COMMON *const cm, int lst_map_idx, int gld_map_idx); +void av1_set_frame_refs(AV1_COMMON *const cm, int *remapped_ref_idx, + int lst_map_idx, int gld_map_idx); static INLINE void av1_collect_neighbors_ref_counts(MACROBLOCKD *const xd) { 
av1_zero(xd->neighbors_ref_counts); @@ -255,13 +236,16 @@ void av1_copy_frame_mvs(const AV1_COMMON *const cm, const MB_MODE_INFO *const mi, int mi_row, int mi_col, int x_mis, int y_mis); +// The global_mvs output parameter points to an array of REF_FRAMES elements. +// The caller may pass a null global_mvs if it does not need the global_mvs +// output. void av1_find_mv_refs(const AV1_COMMON *cm, const MACROBLOCKD *xd, MB_MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame, uint8_t ref_mv_count[MODE_CTX_REF_FRAMES], CANDIDATE_MV ref_mv_stack[][MAX_REF_MV_STACK_SIZE], + uint16_t ref_mv_weight[][MAX_REF_MV_STACK_SIZE], int_mv mv_ref_list[][MAX_MV_REF_CANDIDATES], - int_mv *global_mvs, int mi_row, int mi_col, - int16_t *mode_context); + int_mv *global_mvs, int16_t *mode_context); // check a list of motion vectors by sad score using a number rows of pixels // above and a number cols of pixels in the left to select the one with best @@ -269,25 +253,24 @@ void av1_find_mv_refs(const AV1_COMMON *cm, const MACROBLOCKD *xd, void av1_find_best_ref_mvs(int allow_hp, int_mv *mvlist, int_mv *nearest_mv, int_mv *near_mv, int is_integer); -int selectSamples(MV *mv, int *pts, int *pts_inref, int len, BLOCK_SIZE bsize); -int findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col, - int *pts, int *pts_inref); +uint8_t av1_selectSamples(MV *mv, int *pts, int *pts_inref, int len, + BLOCK_SIZE bsize); +uint8_t av1_findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int *pts, + int *pts_inref); #define INTRABC_DELAY_PIXELS 256 // Delay of 256 pixels #define INTRABC_DELAY_SB64 (INTRABC_DELAY_PIXELS / 64) static INLINE void av1_find_ref_dv(int_mv *ref_dv, const TileInfo *const tile, - int mib_size, int mi_row, int mi_col) { - (void)mi_col; + int mib_size, int mi_row) { if (mi_row - mib_size < tile->mi_row_start) { - ref_dv->as_mv.row = 0; - ref_dv->as_mv.col = -MI_SIZE * mib_size - INTRABC_DELAY_PIXELS; + ref_dv->as_fullmv.row = 0; + ref_dv->as_fullmv.col = -MI_SIZE * mib_size - 
INTRABC_DELAY_PIXELS; } else { - ref_dv->as_mv.row = -MI_SIZE * mib_size; - ref_dv->as_mv.col = 0; + ref_dv->as_fullmv.row = -MI_SIZE * mib_size; + ref_dv->as_fullmv.col = 0; } - ref_dv->as_mv.row *= 8; - ref_dv->as_mv.col *= 8; + convert_fullmv_to_mv(ref_dv); } static INLINE int av1_is_dv_valid(const MV dv, const AV1_COMMON *cm, @@ -319,15 +302,12 @@ static INLINE int av1_is_dv_valid(const MV dv, const AV1_COMMON *cm, // Special case for sub 8x8 chroma cases, to prevent referring to chroma // pixels outside current tile. - for (int plane = 1; plane < av1_num_planes(cm); ++plane) { - const struct macroblockd_plane *const pd = &xd->plane[plane]; - if (is_chroma_reference(mi_row, mi_col, bsize, pd->subsampling_x, - pd->subsampling_y)) { - if (bw < 8 && pd->subsampling_x) - if (src_left_edge < tile_left_edge + 4 * SCALE_PX_TO_MV) return 0; - if (bh < 8 && pd->subsampling_y) - if (src_top_edge < tile_top_edge + 4 * SCALE_PX_TO_MV) return 0; - } + if (xd->is_chroma_ref && av1_num_planes(cm) > 1) { + const struct macroblockd_plane *const pd = &xd->plane[1]; + if (bw < 8 && pd->subsampling_x) + if (src_left_edge < tile_left_edge + 4 * SCALE_PX_TO_MV) return 0; + if (bh < 8 && pd->subsampling_y) + if (src_top_edge < tile_top_edge + 4 * SCALE_PX_TO_MV) return 0; } // Is the bottom right within an already coded SB? 
Also consider additional diff --git a/media/libaom/src/av1/common/obmc.h b/media/libaom/src/av1/common/obmc.h index 1c90cd93f..cc97b6bb1 100644 --- a/media/libaom/src/av1/common/obmc.h +++ b/media/libaom/src/av1/common/obmc.h @@ -12,25 +12,24 @@ #ifndef AOM_AV1_COMMON_OBMC_H_ #define AOM_AV1_COMMON_OBMC_H_ -typedef void (*overlappable_nb_visitor_t)(MACROBLOCKD *xd, int rel_mi_pos, - uint8_t nb_mi_size, - MB_MODE_INFO *nb_mi, void *fun_ctxt, - const int num_planes); +typedef void (*overlappable_nb_visitor_t)(MACROBLOCKD *xd, int rel_mi_row, + int rel_mi_col, uint8_t op_mi_size, + int dir, MB_MODE_INFO *nb_mi, + void *fun_ctxt, const int num_planes); static INLINE void foreach_overlappable_nb_above(const AV1_COMMON *cm, - MACROBLOCKD *xd, int mi_col, - int nb_max, + MACROBLOCKD *xd, int nb_max, overlappable_nb_visitor_t fun, void *fun_ctxt) { - const int num_planes = av1_num_planes(cm); if (!xd->up_available) return; + const int num_planes = av1_num_planes(cm); int nb_count = 0; - + const int mi_col = xd->mi_col; // prev_row_mi points into the mi array, starting at the beginning of the // previous row. 
MB_MODE_INFO **prev_row_mi = xd->mi - mi_col - 1 * xd->mi_stride; - const int end_col = AOMMIN(mi_col + xd->n4_w, cm->mi_cols); + const int end_col = AOMMIN(mi_col + xd->width, cm->mi_params.mi_cols); uint8_t mi_step; for (int above_mi_col = mi_col; above_mi_col < end_col && nb_count < nb_max; above_mi_col += mi_step) { @@ -49,26 +48,25 @@ static INLINE void foreach_overlappable_nb_above(const AV1_COMMON *cm, } if (is_neighbor_overlappable(*above_mi)) { ++nb_count; - fun(xd, above_mi_col - mi_col, AOMMIN(xd->n4_w, mi_step), *above_mi, - fun_ctxt, num_planes); + fun(xd, 0, above_mi_col - mi_col, AOMMIN(xd->width, mi_step), 0, + *above_mi, fun_ctxt, num_planes); } } } static INLINE void foreach_overlappable_nb_left(const AV1_COMMON *cm, - MACROBLOCKD *xd, int mi_row, - int nb_max, + MACROBLOCKD *xd, int nb_max, overlappable_nb_visitor_t fun, void *fun_ctxt) { - const int num_planes = av1_num_planes(cm); if (!xd->left_available) return; + const int num_planes = av1_num_planes(cm); int nb_count = 0; - // prev_col_mi points into the mi array, starting at the top of the // previous column + const int mi_row = xd->mi_row; MB_MODE_INFO **prev_col_mi = xd->mi - 1 - mi_row * xd->mi_stride; - const int end_row = AOMMIN(mi_row + xd->n4_h, cm->mi_rows); + const int end_row = AOMMIN(mi_row + xd->height, cm->mi_params.mi_rows); uint8_t mi_step; for (int left_mi_row = mi_row; left_mi_row < end_row && nb_count < nb_max; left_mi_row += mi_step) { @@ -82,7 +80,7 @@ static INLINE void foreach_overlappable_nb_left(const AV1_COMMON *cm, } if (is_neighbor_overlappable(*left_mi)) { ++nb_count; - fun(xd, left_mi_row - mi_row, AOMMIN(xd->n4_h, mi_step), *left_mi, + fun(xd, left_mi_row - mi_row, 0, AOMMIN(xd->height, mi_step), 1, *left_mi, fun_ctxt, num_planes); } } diff --git a/media/libaom/src/av1/common/obu_util.c b/media/libaom/src/av1/common/obu_util.c index 823b700b1..7d2694b89 100644 --- a/media/libaom/src/av1/common/obu_util.c +++ b/media/libaom/src/av1/common/obu_util.c @@ -8,6 +8,8 
@@ * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ +#include <assert.h> + #include "av1/common/obu_util.h" #include "aom_dsp/bitreader_buffer.h" @@ -112,36 +114,41 @@ aom_codec_err_t aom_read_obu_header_and_size(const uint8_t *data, ObuHeader *obu_header, size_t *const payload_size, size_t *const bytes_read) { - size_t length_field_size = 0, obu_size = 0; + size_t length_field_size_obu = 0; + size_t length_field_size_payload = 0; + size_t obu_size = 0; aom_codec_err_t status; if (is_annexb) { // Size field comes before the OBU header, and includes the OBU header status = - read_obu_size(data, bytes_available, &obu_size, &length_field_size); + read_obu_size(data, bytes_available, &obu_size, &length_field_size_obu); if (status != AOM_CODEC_OK) return status; } - struct aom_read_bit_buffer rb = { data + length_field_size, + struct aom_read_bit_buffer rb = { data + length_field_size_obu, data + bytes_available, 0, NULL, NULL }; status = read_obu_header(&rb, is_annexb, obu_header); if (status != AOM_CODEC_OK) return status; - if (is_annexb) { + if (!obu_header->has_size_field) { + assert(is_annexb); // Derive the payload size from the data we've already read if (obu_size < obu_header->size) return AOM_CODEC_CORRUPT_FRAME; *payload_size = obu_size - obu_header->size; } else { // Size field comes after the OBU header, and is just the payload size - status = read_obu_size(data + obu_header->size, - bytes_available - obu_header->size, payload_size, - &length_field_size); + status = read_obu_size( + data + length_field_size_obu + obu_header->size, + bytes_available - length_field_size_obu - obu_header->size, + payload_size, &length_field_size_payload); if (status != AOM_CODEC_OK) return status; } - *bytes_read = length_field_size + obu_header->size; + *bytes_read = + length_field_size_obu + obu_header->size + length_field_size_payload; return AOM_CODEC_OK; } diff --git 
a/media/libaom/src/av1/common/onyxc_int.h b/media/libaom/src/av1/common/onyxc_int.h deleted file mode 100644 index ff011c89e..000000000 --- a/media/libaom/src/av1/common/onyxc_int.h +++ /dev/null @@ -1,1342 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#ifndef AOM_AV1_COMMON_ONYXC_INT_H_ -#define AOM_AV1_COMMON_ONYXC_INT_H_ - -#include "config/aom_config.h" -#include "config/av1_rtcd.h" - -#include "aom/internal/aom_codec_internal.h" -#include "aom_util/aom_thread.h" -#include "av1/common/alloccommon.h" -#include "av1/common/av1_loopfilter.h" -#include "av1/common/entropy.h" -#include "av1/common/entropymode.h" -#include "av1/common/entropymv.h" -#include "av1/common/enums.h" -#include "av1/common/frame_buffers.h" -#include "av1/common/mv.h" -#include "av1/common/quant_common.h" -#include "av1/common/restoration.h" -#include "av1/common/tile_common.h" -#include "av1/common/timing.h" -#include "av1/common/odintrin.h" -#include "av1/encoder/hash_motion.h" -#include "aom_dsp/grain_synthesis.h" -#include "aom_dsp/grain_table.h" -#ifdef __cplusplus -extern "C" { -#endif - -#if defined(__clang__) && defined(__has_warning) -#if __has_feature(cxx_attributes) && __has_warning("-Wimplicit-fallthrough") -#define AOM_FALLTHROUGH_INTENDED [[clang::fallthrough]] // NOLINT -#endif -#elif defined(__GNUC__) && __GNUC__ >= 7 -#define AOM_FALLTHROUGH_INTENDED __attribute__((fallthrough)) // NOLINT -#endif - -#ifndef AOM_FALLTHROUGH_INTENDED -#define AOM_FALLTHROUGH_INTENDED \ - do { \ - } 
while (0) -#endif - -#define CDEF_MAX_STRENGTHS 16 - -/* Constant values while waiting for the sequence header */ -#define FRAME_ID_LENGTH 15 -#define DELTA_FRAME_ID_LENGTH 14 - -#define FRAME_CONTEXTS (FRAME_BUFFERS + 1) -// Extra frame context which is always kept at default values -#define FRAME_CONTEXT_DEFAULTS (FRAME_CONTEXTS - 1) -#define PRIMARY_REF_BITS 3 -#define PRIMARY_REF_NONE 7 - -#define NUM_PING_PONG_BUFFERS 2 - -#define MAX_NUM_TEMPORAL_LAYERS 8 -#define MAX_NUM_SPATIAL_LAYERS 4 -/* clang-format off */ -// clang-format seems to think this is a pointer dereference and not a -// multiplication. -#define MAX_NUM_OPERATING_POINTS \ - MAX_NUM_TEMPORAL_LAYERS * MAX_NUM_SPATIAL_LAYERS -/* clang-format on*/ - -// TODO(jingning): Turning this on to set up transform coefficient -// processing timer. -#define TXCOEFF_TIMER 0 -#define TXCOEFF_COST_TIMER 0 - -typedef enum { - SINGLE_REFERENCE = 0, - COMPOUND_REFERENCE = 1, - REFERENCE_MODE_SELECT = 2, - REFERENCE_MODES = 3, -} REFERENCE_MODE; - -typedef enum { - /** - * Frame context updates are disabled - */ - REFRESH_FRAME_CONTEXT_DISABLED, - /** - * Update frame context to values resulting from backward probability - * updates based on entropy/counts in the decoded frame - */ - REFRESH_FRAME_CONTEXT_BACKWARD, -} REFRESH_FRAME_CONTEXT_MODE; - -#define MFMV_STACK_SIZE 3 -typedef struct { - int_mv mfmv0; - uint8_t ref_frame_offset; -} TPL_MV_REF; - -typedef struct { - int_mv mv; - MV_REFERENCE_FRAME ref_frame; -} MV_REF; - -typedef struct { - int ref_count; - - unsigned int cur_frame_offset; - unsigned int ref_frame_offset[INTER_REFS_PER_FRAME]; - - MV_REF *mvs; - uint8_t *seg_map; - struct segmentation seg; - int mi_rows; - int mi_cols; - // Width and height give the size of the buffer (before any upscaling, unlike - // the sizes that can be derived from the buf structure) - int width; - int height; - WarpedMotionParams global_motion[REF_FRAMES]; - int showable_frame; // frame can be used as show existing frame 
in future - int film_grain_params_present; - aom_film_grain_t film_grain_params; - aom_codec_frame_buffer_t raw_frame_buffer; - YV12_BUFFER_CONFIG buf; - hash_table hash_table; - uint8_t intra_only; - FRAME_TYPE frame_type; - // The Following variables will only be used in frame parallel decode. - - // frame_worker_owner indicates which FrameWorker owns this buffer. NULL means - // that no FrameWorker owns, or is decoding, this buffer. - AVxWorker *frame_worker_owner; - - // row and col indicate which position frame has been decoded to in real - // pixel unit. They are reset to -1 when decoding begins and set to INT_MAX - // when the frame is fully decoded. - int row; - int col; - - // Inter frame reference frame delta for loop filter - int8_t ref_deltas[REF_FRAMES]; - - // 0 = ZERO_MV, MV - int8_t mode_deltas[MAX_MODE_LF_DELTAS]; -} RefCntBuffer; - -typedef struct BufferPool { -// Protect BufferPool from being accessed by several FrameWorkers at -// the same time during frame parallel decode. -// TODO(hkuang): Try to use atomic variable instead of locking the whole pool. -#if CONFIG_MULTITHREAD - pthread_mutex_t pool_mutex; -#endif - - // Private data associated with the frame buffer callbacks. - void *cb_priv; - - aom_get_frame_buffer_cb_fn_t get_fb_cb; - aom_release_frame_buffer_cb_fn_t release_fb_cb; - - RefCntBuffer frame_bufs[FRAME_BUFFERS]; - - // Frame buffers allocated internally by the codec. - InternalFrameBufferList int_frame_buffers; -} BufferPool; - -typedef struct { - int base_ctx_table[2 /*row*/][2 /*col*/][3 /*sig_map*/] - [BASE_CONTEXT_POSITION_NUM + 1]; -} LV_MAP_CTX_TABLE; -typedef int BASE_CTX_TABLE[2 /*col*/][3 /*sig_map*/] - [BASE_CONTEXT_POSITION_NUM + 1]; - -typedef struct BitstreamLevel { - uint8_t major; - uint8_t minor; -} BitstreamLevel; - -// Sequence header structure. 
-// Note: All syntax elements of sequence_header_obu that need to be -// bit-identical across multiple sequence headers must be part of this struct, -// so that consistency is checked by are_seq_headers_consistent() function. -typedef struct SequenceHeader { - int num_bits_width; - int num_bits_height; - int max_frame_width; - int max_frame_height; - int frame_id_numbers_present_flag; - int frame_id_length; - int delta_frame_id_length; - BLOCK_SIZE sb_size; // Size of the superblock used for this frame - int mib_size; // Size of the superblock in units of MI blocks - int mib_size_log2; // Log 2 of above. - int order_hint_bits_minus_1; - int force_screen_content_tools; // 0 - force off - // 1 - force on - // 2 - adaptive - int force_integer_mv; // 0 - Not to force. MV can be in 1/4 or 1/8 - // 1 - force to integer - // 2 - adaptive - int still_picture; // Video is a single frame still picture - int reduced_still_picture_hdr; // Use reduced header for still picture - int enable_filter_intra; // enables/disables filterintra - int enable_intra_edge_filter; // enables/disables corner/edge/upsampling - int enable_interintra_compound; // enables/disables interintra_compound - int enable_masked_compound; // enables/disables masked compound - int enable_dual_filter; // 0 - disable dual interpolation filter - // 1 - enable vert/horiz filter selection - int enable_order_hint; // 0 - disable order hint, and related tools - // jnt_comp, ref_frame_mvs, frame_sign_bias - // if 0, enable_jnt_comp and - // enable_ref_frame_mvs must be set zs 0. - int enable_jnt_comp; // 0 - disable joint compound modes - // 1 - enable it - int enable_ref_frame_mvs; // 0 - disable ref frame mvs - // 1 - enable it - int enable_warped_motion; // 0 - disable warped motion for sequence - // 1 - enable it for the sequence - int enable_superres; // 0 - Disable superres for the sequence, and disable - // transmitting per-frame superres enabled flag. 
- // 1 - Enable superres for the sequence, and also - // enable per-frame flag to denote if superres is - // enabled for that frame. - int enable_cdef; // To turn on/off CDEF - int enable_restoration; // To turn on/off loop restoration - BITSTREAM_PROFILE profile; - - // Operating point info. - int operating_points_cnt_minus_1; - int operating_point_idc[MAX_NUM_OPERATING_POINTS]; - int display_model_info_present_flag; - int decoder_model_info_present_flag; - BitstreamLevel level[MAX_NUM_OPERATING_POINTS]; - uint8_t tier[MAX_NUM_OPERATING_POINTS]; // seq_tier in the spec. One bit: 0 - // or 1. - - // Color config. - aom_bit_depth_t bit_depth; // AOM_BITS_8 in profile 0 or 1, - // AOM_BITS_10 or AOM_BITS_12 in profile 2 or 3. - int use_highbitdepth; // If true, we need to use 16bit frame buffers. - int monochrome; // Monochorme video - aom_color_primaries_t color_primaries; - aom_transfer_characteristics_t transfer_characteristics; - aom_matrix_coefficients_t matrix_coefficients; - int color_range; - int subsampling_x; // Chroma subsampling for x - int subsampling_y; // Chroma subsampling for y - aom_chroma_sample_position_t chroma_sample_position; - int separate_uv_delta_q; - - int film_grain_params_present; -} SequenceHeader; - -typedef struct AV1Common { - struct aom_internal_error_info error; - int width; - int height; - int render_width; - int render_height; - int last_width; - int last_height; - int timing_info_present; - aom_timing_info_t timing_info; - int buffer_removal_time_present; - aom_dec_model_info_t buffer_model; - aom_dec_model_op_parameters_t op_params[MAX_NUM_OPERATING_POINTS + 1]; - aom_op_timing_info_t op_frame_timing[MAX_NUM_OPERATING_POINTS + 1]; - uint32_t frame_presentation_time; - - int largest_tile_id; - size_t largest_tile_size; - int context_update_tile_id; - - // Scale of the current frame with respect to itself. 
- struct scale_factors sf_identity; - - YV12_BUFFER_CONFIG *frame_to_show; - RefCntBuffer *prev_frame; - - // TODO(hkuang): Combine this with cur_buf in macroblockd. - RefCntBuffer *cur_frame; - - int ref_frame_map[REF_FRAMES]; /* maps fb_idx to reference slot */ - - // Prepare ref_frame_map for the next frame. - // Only used in frame parallel decode. - int next_ref_frame_map[REF_FRAMES]; - - // TODO(jkoleszar): could expand active_ref_idx to 4, with 0 as intra, and - // roll new_fb_idx into it. - - // Each Inter frame can reference INTER_REFS_PER_FRAME buffers - RefBuffer frame_refs[INTER_REFS_PER_FRAME]; - int is_skip_mode_allowed; - int skip_mode_flag; - int ref_frame_idx_0; - int ref_frame_idx_1; - - int new_fb_idx; - - FRAME_TYPE last_frame_type; /* last frame's frame type for motion search.*/ - FRAME_TYPE frame_type; - - int show_frame; - int showable_frame; // frame can be used as show existing frame in future - int last_show_frame; - int show_existing_frame; - // Flag for a frame used as a reference - not written to the bitstream - int is_reference_frame; - int reset_decoder_state; - - // Flag signaling that the frame is encoded using only INTRA modes. - uint8_t intra_only; - uint8_t last_intra_only; - uint8_t disable_cdf_update; - int allow_high_precision_mv; - int cur_frame_force_integer_mv; // 0 the default in AOM, 1 only integer - - int allow_screen_content_tools; - int allow_intrabc; - int allow_warped_motion; - - // MBs, mb_rows/cols is in 16-pixel units; mi_rows/cols is in - // MB_MODE_INFO (8-pixel) units. - int MBs; - int mb_rows, mi_rows; - int mb_cols, mi_cols; - int mi_stride; - - /* profile settings */ - TX_MODE tx_mode; - -#if CONFIG_ENTROPY_STATS - int coef_cdf_category; -#endif - - int base_qindex; - int y_dc_delta_q; - int u_dc_delta_q; - int v_dc_delta_q; - int u_ac_delta_q; - int v_ac_delta_q; - - // The dequantizers below are true dequntizers used only in the - // dequantization process. 
They have the same coefficient - // shift/scale as TX. - int16_t y_dequant_QTX[MAX_SEGMENTS][2]; - int16_t u_dequant_QTX[MAX_SEGMENTS][2]; - int16_t v_dequant_QTX[MAX_SEGMENTS][2]; - - // Global quant matrix tables - const qm_val_t *giqmatrix[NUM_QM_LEVELS][3][TX_SIZES_ALL]; - const qm_val_t *gqmatrix[NUM_QM_LEVELS][3][TX_SIZES_ALL]; - - // Local quant matrix tables for each frame - const qm_val_t *y_iqmatrix[MAX_SEGMENTS][TX_SIZES_ALL]; - const qm_val_t *u_iqmatrix[MAX_SEGMENTS][TX_SIZES_ALL]; - const qm_val_t *v_iqmatrix[MAX_SEGMENTS][TX_SIZES_ALL]; - - // Encoder - int using_qmatrix; - int qm_y; - int qm_u; - int qm_v; - int min_qmlevel; - int max_qmlevel; - - /* We allocate a MB_MODE_INFO struct for each macroblock, together with - an extra row on top and column on the left to simplify prediction. */ - int mi_alloc_size; - MB_MODE_INFO *mip; /* Base of allocated array */ - MB_MODE_INFO *mi; /* Corresponds to upper left visible macroblock */ - - // TODO(agrange): Move prev_mi into encoder structure. - // prev_mip and prev_mi will only be allocated in encoder. - MB_MODE_INFO *prev_mip; /* MB_MODE_INFO array 'mip' from last decoded frame */ - MB_MODE_INFO *prev_mi; /* 'mi' from last frame (points into prev_mip) */ - - // Separate mi functions between encoder and decoder. - int (*alloc_mi)(struct AV1Common *cm, int mi_size); - void (*free_mi)(struct AV1Common *cm); - void (*setup_mi)(struct AV1Common *cm); - - // Grid of pointers to 8x8 MB_MODE_INFO structs. Any 8x8 not in the visible - // area will be NULL. - MB_MODE_INFO **mi_grid_base; - MB_MODE_INFO **mi_grid_visible; - MB_MODE_INFO **prev_mi_grid_base; - MB_MODE_INFO **prev_mi_grid_visible; - - // Whether to use previous frames' motion vectors for prediction. 
- int allow_ref_frame_mvs; - - uint8_t *last_frame_seg_map; - uint8_t *current_frame_seg_map; - int seg_map_alloc_size; - - InterpFilter interp_filter; - - int switchable_motion_mode; - - loop_filter_info_n lf_info; - // The denominator of the superres scale; the numerator is fixed. - uint8_t superres_scale_denominator; - int superres_upscaled_width; - int superres_upscaled_height; - RestorationInfo rst_info[MAX_MB_PLANE]; - - // rst_end_stripe[i] is one more than the index of the bottom stripe - // for tile row i. - int rst_end_stripe[MAX_TILE_ROWS]; - - // Pointer to a scratch buffer used by self-guided restoration - int32_t *rst_tmpbuf; - RestorationLineBuffers *rlbs; - - // Output of loop restoration - YV12_BUFFER_CONFIG rst_frame; - - // Flag signaling how frame contexts should be updated at the end of - // a frame decode - REFRESH_FRAME_CONTEXT_MODE refresh_frame_context; - - int ref_frame_sign_bias[REF_FRAMES]; /* Two state 0, 1 */ - - struct loopfilter lf; - struct segmentation seg; - int coded_lossless; // frame is fully lossless at the coded resolution. - int all_lossless; // frame is fully lossless at the upscaled resolution. 
- - int reduced_tx_set_used; - - // Context probabilities for reference frame prediction - MV_REFERENCE_FRAME comp_fwd_ref[FWD_REFS]; - MV_REFERENCE_FRAME comp_bwd_ref[BWD_REFS]; - REFERENCE_MODE reference_mode; - - FRAME_CONTEXT *fc; /* this frame entropy */ - FRAME_CONTEXT *frame_contexts; // FRAME_CONTEXTS - unsigned int frame_context_idx; /* Context to use/update */ - int fb_of_context_type[REF_FRAMES]; - int primary_ref_frame; - - unsigned int frame_offset; - - unsigned int current_video_frame; - - aom_bit_depth_t dequant_bit_depth; // bit_depth of current dequantizer - - int error_resilient_mode; - int force_primary_ref_none; - - int tile_cols, tile_rows; - int last_tile_cols, last_tile_rows; - - int max_tile_width_sb; - int min_log2_tile_cols; - int max_log2_tile_cols; - int max_log2_tile_rows; - int min_log2_tile_rows; - int min_log2_tiles; - int max_tile_height_sb; - int uniform_tile_spacing_flag; - int log2_tile_cols; // only valid for uniform tiles - int log2_tile_rows; // only valid for uniform tiles - int tile_col_start_sb[MAX_TILE_COLS + 1]; // valid for 0 <= i <= tile_cols - int tile_row_start_sb[MAX_TILE_ROWS + 1]; // valid for 0 <= i <= tile_rows - int tile_width, tile_height; // In MI units - - unsigned int large_scale_tile; - unsigned int single_tile_decoding; - - int byte_alignment; - int skip_loop_filter; - int skip_film_grain; - - // Private data associated with the frame buffer callbacks. - void *cb_priv; - aom_get_frame_buffer_cb_fn_t get_fb_cb; - aom_release_frame_buffer_cb_fn_t release_fb_cb; - - // Handles memory for the codec. - InternalFrameBufferList int_frame_buffers; - - // External BufferPool passed from outside. 
- BufferPool *buffer_pool; - - PARTITION_CONTEXT **above_seg_context; - ENTROPY_CONTEXT **above_context[MAX_MB_PLANE]; - TXFM_CONTEXT **above_txfm_context; - WarpedMotionParams global_motion[REF_FRAMES]; - aom_film_grain_t film_grain_params; - - int cdef_pri_damping; - int cdef_sec_damping; - int nb_cdef_strengths; - int cdef_strengths[CDEF_MAX_STRENGTHS]; - int cdef_uv_strengths[CDEF_MAX_STRENGTHS]; - int cdef_bits; - - int delta_q_present_flag; - // Resolution of delta quant - int delta_q_res; - int delta_lf_present_flag; - // Resolution of delta lf level - int delta_lf_res; - // This is a flag for number of deltas of loop filter level - // 0: use 1 delta, for y_vertical, y_horizontal, u, and v - // 1: use separate deltas for each filter level - int delta_lf_multi; - int num_tg; - SequenceHeader seq_params; - int current_frame_id; - int ref_frame_id[REF_FRAMES]; - int valid_for_referencing[REF_FRAMES]; - int invalid_delta_frame_id_minus_1; - LV_MAP_CTX_TABLE coeff_ctx_table; - TPL_MV_REF *tpl_mvs; - int tpl_mvs_mem_size; - // TODO(jingning): This can be combined with sign_bias later. - int8_t ref_frame_side[REF_FRAMES]; - - int is_annexb; - - int frame_refs_short_signaling; - int temporal_layer_id; - int spatial_layer_id; - unsigned int number_temporal_layers; - unsigned int number_spatial_layers; - int num_allocated_above_context_mi_col; - int num_allocated_above_contexts; - int num_allocated_above_context_planes; - -#if TXCOEFF_TIMER - int64_t cum_txcoeff_timer; - int64_t txcoeff_timer; - int txb_count; -#endif - -#if TXCOEFF_COST_TIMER - int64_t cum_txcoeff_cost_timer; - int64_t txcoeff_cost_timer; - int64_t txcoeff_cost_count; -#endif - const cfg_options_t *options; -} AV1_COMMON; - -// TODO(hkuang): Don't need to lock the whole pool after implementing atomic -// frame reference count. 
-static void lock_buffer_pool(BufferPool *const pool) { -#if CONFIG_MULTITHREAD - pthread_mutex_lock(&pool->pool_mutex); -#else - (void)pool; -#endif -} - -static void unlock_buffer_pool(BufferPool *const pool) { -#if CONFIG_MULTITHREAD - pthread_mutex_unlock(&pool->pool_mutex); -#else - (void)pool; -#endif -} - -static INLINE YV12_BUFFER_CONFIG *get_ref_frame(AV1_COMMON *cm, int index) { - if (index < 0 || index >= REF_FRAMES) return NULL; - if (cm->ref_frame_map[index] < 0) return NULL; - assert(cm->ref_frame_map[index] < FRAME_BUFFERS); - return &cm->buffer_pool->frame_bufs[cm->ref_frame_map[index]].buf; -} - -static INLINE YV12_BUFFER_CONFIG *get_frame_new_buffer( - const AV1_COMMON *const cm) { - return &cm->buffer_pool->frame_bufs[cm->new_fb_idx].buf; -} - -static INLINE int get_free_fb(AV1_COMMON *cm) { - RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs; - int i; - - lock_buffer_pool(cm->buffer_pool); - for (i = 0; i < FRAME_BUFFERS; ++i) - if (frame_bufs[i].ref_count == 0) break; - - if (i != FRAME_BUFFERS) { - if (frame_bufs[i].buf.use_external_reference_buffers) { - // If this frame buffer's y_buffer, u_buffer, and v_buffer point to the - // external reference buffers. Restore the buffer pointers to point to the - // internally allocated memory. - YV12_BUFFER_CONFIG *ybf = &frame_bufs[i].buf; - ybf->y_buffer = ybf->store_buf_adr[0]; - ybf->u_buffer = ybf->store_buf_adr[1]; - ybf->v_buffer = ybf->store_buf_adr[2]; - ybf->use_external_reference_buffers = 0; - } - - frame_bufs[i].ref_count = 1; - } else { - // Reset i to be INVALID_IDX to indicate no free buffer found. 
- i = INVALID_IDX; - } - - unlock_buffer_pool(cm->buffer_pool); - return i; -} - -static INLINE void ref_cnt_fb(RefCntBuffer *bufs, int *idx, int new_idx) { - const int ref_index = *idx; - - if (ref_index >= 0 && bufs[ref_index].ref_count > 0) - bufs[ref_index].ref_count--; - - *idx = new_idx; - - bufs[new_idx].ref_count++; -} - -static INLINE int frame_is_intra_only(const AV1_COMMON *const cm) { - return cm->frame_type == KEY_FRAME || cm->intra_only; -} - -static INLINE int frame_is_sframe(const AV1_COMMON *cm) { - return cm->frame_type == S_FRAME; -} - -static INLINE RefCntBuffer *get_prev_frame(const AV1_COMMON *const cm) { - if (cm->primary_ref_frame == PRIMARY_REF_NONE || - cm->frame_refs[cm->primary_ref_frame].idx == INVALID_IDX) { - return NULL; - } else { - return &cm->buffer_pool - ->frame_bufs[cm->frame_refs[cm->primary_ref_frame].idx]; - } -} - -// Returns 1 if this frame might allow mvs from some reference frame. -static INLINE int frame_might_allow_ref_frame_mvs(const AV1_COMMON *cm) { - return !cm->error_resilient_mode && cm->seq_params.enable_ref_frame_mvs && - cm->seq_params.enable_order_hint && !frame_is_intra_only(cm); -} - -// Returns 1 if this frame might use warped_motion -static INLINE int frame_might_allow_warped_motion(const AV1_COMMON *cm) { - return !cm->error_resilient_mode && !frame_is_intra_only(cm) && - cm->seq_params.enable_warped_motion; -} - -static INLINE void ensure_mv_buffer(RefCntBuffer *buf, AV1_COMMON *cm) { - const int buf_rows = buf->mi_rows; - const int buf_cols = buf->mi_cols; - - if (buf->mvs == NULL || buf_rows != cm->mi_rows || buf_cols != cm->mi_cols) { - aom_free(buf->mvs); - buf->mi_rows = cm->mi_rows; - buf->mi_cols = cm->mi_cols; - CHECK_MEM_ERROR(cm, buf->mvs, - (MV_REF *)aom_calloc( - ((cm->mi_rows + 1) >> 1) * ((cm->mi_cols + 1) >> 1), - sizeof(*buf->mvs))); - aom_free(buf->seg_map); - CHECK_MEM_ERROR(cm, buf->seg_map, - (uint8_t *)aom_calloc(cm->mi_rows * cm->mi_cols, - sizeof(*buf->seg_map))); - } - - const 
int mem_size = - ((cm->mi_rows + MAX_MIB_SIZE) >> 1) * (cm->mi_stride >> 1); - int realloc = cm->tpl_mvs == NULL; - if (cm->tpl_mvs) realloc |= cm->tpl_mvs_mem_size < mem_size; - - if (realloc) { - aom_free(cm->tpl_mvs); - CHECK_MEM_ERROR(cm, cm->tpl_mvs, - (TPL_MV_REF *)aom_calloc(mem_size, sizeof(*cm->tpl_mvs))); - cm->tpl_mvs_mem_size = mem_size; - } -} - -void cfl_init(CFL_CTX *cfl, const SequenceHeader *seq_params); - -static INLINE int av1_num_planes(const AV1_COMMON *cm) { - return cm->seq_params.monochrome ? 1 : MAX_MB_PLANE; -} - -static INLINE void av1_init_above_context(AV1_COMMON *cm, MACROBLOCKD *xd, - const int tile_row) { - const int num_planes = av1_num_planes(cm); - for (int i = 0; i < num_planes; ++i) { - xd->above_context[i] = cm->above_context[i][tile_row]; - } - xd->above_seg_context = cm->above_seg_context[tile_row]; - xd->above_txfm_context = cm->above_txfm_context[tile_row]; -} - -static INLINE void av1_init_macroblockd(AV1_COMMON *cm, MACROBLOCKD *xd, - tran_low_t *dqcoeff) { - const int num_planes = av1_num_planes(cm); - for (int i = 0; i < num_planes; ++i) { - xd->plane[i].dqcoeff = dqcoeff; - - if (xd->plane[i].plane_type == PLANE_TYPE_Y) { - memcpy(xd->plane[i].seg_dequant_QTX, cm->y_dequant_QTX, - sizeof(cm->y_dequant_QTX)); - memcpy(xd->plane[i].seg_iqmatrix, cm->y_iqmatrix, sizeof(cm->y_iqmatrix)); - - } else { - if (i == AOM_PLANE_U) { - memcpy(xd->plane[i].seg_dequant_QTX, cm->u_dequant_QTX, - sizeof(cm->u_dequant_QTX)); - memcpy(xd->plane[i].seg_iqmatrix, cm->u_iqmatrix, - sizeof(cm->u_iqmatrix)); - } else { - memcpy(xd->plane[i].seg_dequant_QTX, cm->v_dequant_QTX, - sizeof(cm->v_dequant_QTX)); - memcpy(xd->plane[i].seg_iqmatrix, cm->v_iqmatrix, - sizeof(cm->v_iqmatrix)); - } - } - } - xd->mi_stride = cm->mi_stride; - xd->error_info = &cm->error; - cfl_init(&xd->cfl, &cm->seq_params); -} - -static INLINE void set_skip_context(MACROBLOCKD *xd, int mi_row, int mi_col, - const int num_planes) { - int i; - int row_offset = mi_row; - 
int col_offset = mi_col; - for (i = 0; i < num_planes; ++i) { - struct macroblockd_plane *const pd = &xd->plane[i]; - // Offset the buffer pointer - const BLOCK_SIZE bsize = xd->mi[0]->sb_type; - if (pd->subsampling_y && (mi_row & 0x01) && (mi_size_high[bsize] == 1)) - row_offset = mi_row - 1; - if (pd->subsampling_x && (mi_col & 0x01) && (mi_size_wide[bsize] == 1)) - col_offset = mi_col - 1; - int above_idx = col_offset; - int left_idx = row_offset & MAX_MIB_MASK; - pd->above_context = &xd->above_context[i][above_idx >> pd->subsampling_x]; - pd->left_context = &xd->left_context[i][left_idx >> pd->subsampling_y]; - } -} - -static INLINE int calc_mi_size(int len) { - // len is in mi units. Align to a multiple of SBs. - return ALIGN_POWER_OF_TWO(len, MAX_MIB_SIZE_LOG2); -} - -static INLINE void set_plane_n4(MACROBLOCKD *const xd, int bw, int bh, - const int num_planes) { - int i; - for (i = 0; i < num_planes; i++) { - xd->plane[i].width = (bw * MI_SIZE) >> xd->plane[i].subsampling_x; - xd->plane[i].height = (bh * MI_SIZE) >> xd->plane[i].subsampling_y; - - xd->plane[i].width = AOMMAX(xd->plane[i].width, 4); - xd->plane[i].height = AOMMAX(xd->plane[i].height, 4); - } -} - -static INLINE void set_mi_row_col(MACROBLOCKD *xd, const TileInfo *const tile, - int mi_row, int bh, int mi_col, int bw, - int mi_rows, int mi_cols) { - xd->mb_to_top_edge = -((mi_row * MI_SIZE) * 8); - xd->mb_to_bottom_edge = ((mi_rows - bh - mi_row) * MI_SIZE) * 8; - xd->mb_to_left_edge = -((mi_col * MI_SIZE) * 8); - xd->mb_to_right_edge = ((mi_cols - bw - mi_col) * MI_SIZE) * 8; - - // Are edges available for intra prediction? 
- xd->up_available = (mi_row > tile->mi_row_start); - - const int ss_x = xd->plane[1].subsampling_x; - const int ss_y = xd->plane[1].subsampling_y; - - xd->left_available = (mi_col > tile->mi_col_start); - xd->chroma_up_available = xd->up_available; - xd->chroma_left_available = xd->left_available; - if (ss_x && bw < mi_size_wide[BLOCK_8X8]) - xd->chroma_left_available = (mi_col - 1) > tile->mi_col_start; - if (ss_y && bh < mi_size_high[BLOCK_8X8]) - xd->chroma_up_available = (mi_row - 1) > tile->mi_row_start; - if (xd->up_available) { - xd->above_mbmi = xd->mi[-xd->mi_stride]; - } else { - xd->above_mbmi = NULL; - } - - if (xd->left_available) { - xd->left_mbmi = xd->mi[-1]; - } else { - xd->left_mbmi = NULL; - } - - const int chroma_ref = ((mi_row & 0x01) || !(bh & 0x01) || !ss_y) && - ((mi_col & 0x01) || !(bw & 0x01) || !ss_x); - if (chroma_ref) { - // To help calculate the "above" and "left" chroma blocks, note that the - // current block may cover multiple luma blocks (eg, if partitioned into - // 4x4 luma blocks). - // First, find the top-left-most luma block covered by this chroma block - MB_MODE_INFO **base_mi = - &xd->mi[-(mi_row & ss_y) * xd->mi_stride - (mi_col & ss_x)]; - - // Then, we consider the luma region covered by the left or above 4x4 chroma - // prediction. We want to point to the chroma reference block in that - // region, which is the bottom-right-most mi unit. - // This leads to the following offsets: - MB_MODE_INFO *chroma_above_mi = - xd->chroma_up_available ? base_mi[-xd->mi_stride + ss_x] : NULL; - xd->chroma_above_mbmi = chroma_above_mi; - - MB_MODE_INFO *chroma_left_mi = - xd->chroma_left_available ? base_mi[ss_y * xd->mi_stride - 1] : NULL; - xd->chroma_left_mbmi = chroma_left_mi; - } - - xd->n4_h = bh; - xd->n4_w = bw; - xd->is_sec_rect = 0; - if (xd->n4_w < xd->n4_h) { - // Only mark is_sec_rect as 1 for the last block. - // For PARTITION_VERT_4, it would be (0, 0, 0, 1); - // For other partitions, it would be (0, 1). 
- if (!((mi_col + xd->n4_w) & (xd->n4_h - 1))) xd->is_sec_rect = 1; - } - - if (xd->n4_w > xd->n4_h) - if (mi_row & (xd->n4_w - 1)) xd->is_sec_rect = 1; -} - -static INLINE aom_cdf_prob *get_y_mode_cdf(FRAME_CONTEXT *tile_ctx, - const MB_MODE_INFO *above_mi, - const MB_MODE_INFO *left_mi) { - const PREDICTION_MODE above = av1_above_block_mode(above_mi); - const PREDICTION_MODE left = av1_left_block_mode(left_mi); - const int above_ctx = intra_mode_context[above]; - const int left_ctx = intra_mode_context[left]; - return tile_ctx->kf_y_cdf[above_ctx][left_ctx]; -} - -static INLINE void update_partition_context(MACROBLOCKD *xd, int mi_row, - int mi_col, BLOCK_SIZE subsize, - BLOCK_SIZE bsize) { - PARTITION_CONTEXT *const above_ctx = xd->above_seg_context + mi_col; - PARTITION_CONTEXT *const left_ctx = - xd->left_seg_context + (mi_row & MAX_MIB_MASK); - - const int bw = mi_size_wide[bsize]; - const int bh = mi_size_high[bsize]; - memset(above_ctx, partition_context_lookup[subsize].above, bw); - memset(left_ctx, partition_context_lookup[subsize].left, bh); -} - -static INLINE int is_chroma_reference(int mi_row, int mi_col, BLOCK_SIZE bsize, - int subsampling_x, int subsampling_y) { - const int bw = mi_size_wide[bsize]; - const int bh = mi_size_high[bsize]; - int ref_pos = ((mi_row & 0x01) || !(bh & 0x01) || !subsampling_y) && - ((mi_col & 0x01) || !(bw & 0x01) || !subsampling_x); - return ref_pos; -} - -static INLINE BLOCK_SIZE scale_chroma_bsize(BLOCK_SIZE bsize, int subsampling_x, - int subsampling_y) { - BLOCK_SIZE bs = bsize; - switch (bsize) { - case BLOCK_4X4: - if (subsampling_x == 1 && subsampling_y == 1) - bs = BLOCK_8X8; - else if (subsampling_x == 1) - bs = BLOCK_8X4; - else if (subsampling_y == 1) - bs = BLOCK_4X8; - break; - case BLOCK_4X8: - if (subsampling_x == 1 && subsampling_y == 1) - bs = BLOCK_8X8; - else if (subsampling_x == 1) - bs = BLOCK_8X8; - else if (subsampling_y == 1) - bs = BLOCK_4X8; - break; - case BLOCK_8X4: - if (subsampling_x == 1 && 
subsampling_y == 1) - bs = BLOCK_8X8; - else if (subsampling_x == 1) - bs = BLOCK_8X4; - else if (subsampling_y == 1) - bs = BLOCK_8X8; - break; - case BLOCK_4X16: - if (subsampling_x == 1 && subsampling_y == 1) - bs = BLOCK_8X16; - else if (subsampling_x == 1) - bs = BLOCK_8X16; - else if (subsampling_y == 1) - bs = BLOCK_4X16; - break; - case BLOCK_16X4: - if (subsampling_x == 1 && subsampling_y == 1) - bs = BLOCK_16X8; - else if (subsampling_x == 1) - bs = BLOCK_16X4; - else if (subsampling_y == 1) - bs = BLOCK_16X8; - break; - default: break; - } - return bs; -} - -static INLINE aom_cdf_prob cdf_element_prob(const aom_cdf_prob *cdf, - size_t element) { - assert(cdf != NULL); - return (element > 0 ? cdf[element - 1] : CDF_PROB_TOP) - cdf[element]; -} - -static INLINE void partition_gather_horz_alike(aom_cdf_prob *out, - const aom_cdf_prob *const in, - BLOCK_SIZE bsize) { - (void)bsize; - out[0] = CDF_PROB_TOP; - out[0] -= cdf_element_prob(in, PARTITION_HORZ); - out[0] -= cdf_element_prob(in, PARTITION_SPLIT); - out[0] -= cdf_element_prob(in, PARTITION_HORZ_A); - out[0] -= cdf_element_prob(in, PARTITION_HORZ_B); - out[0] -= cdf_element_prob(in, PARTITION_VERT_A); - if (bsize != BLOCK_128X128) out[0] -= cdf_element_prob(in, PARTITION_HORZ_4); - out[0] = AOM_ICDF(out[0]); - out[1] = AOM_ICDF(CDF_PROB_TOP); -} - -static INLINE void partition_gather_vert_alike(aom_cdf_prob *out, - const aom_cdf_prob *const in, - BLOCK_SIZE bsize) { - (void)bsize; - out[0] = CDF_PROB_TOP; - out[0] -= cdf_element_prob(in, PARTITION_VERT); - out[0] -= cdf_element_prob(in, PARTITION_SPLIT); - out[0] -= cdf_element_prob(in, PARTITION_HORZ_A); - out[0] -= cdf_element_prob(in, PARTITION_VERT_A); - out[0] -= cdf_element_prob(in, PARTITION_VERT_B); - if (bsize != BLOCK_128X128) out[0] -= cdf_element_prob(in, PARTITION_VERT_4); - out[0] = AOM_ICDF(out[0]); - out[1] = AOM_ICDF(CDF_PROB_TOP); -} - -static INLINE void update_ext_partition_context(MACROBLOCKD *xd, int mi_row, - int mi_col, 
BLOCK_SIZE subsize, - BLOCK_SIZE bsize, - PARTITION_TYPE partition) { - if (bsize >= BLOCK_8X8) { - const int hbs = mi_size_wide[bsize] / 2; - BLOCK_SIZE bsize2 = get_partition_subsize(bsize, PARTITION_SPLIT); - switch (partition) { - case PARTITION_SPLIT: - if (bsize != BLOCK_8X8) break; - AOM_FALLTHROUGH_INTENDED; - case PARTITION_NONE: - case PARTITION_HORZ: - case PARTITION_VERT: - case PARTITION_HORZ_4: - case PARTITION_VERT_4: - update_partition_context(xd, mi_row, mi_col, subsize, bsize); - break; - case PARTITION_HORZ_A: - update_partition_context(xd, mi_row, mi_col, bsize2, subsize); - update_partition_context(xd, mi_row + hbs, mi_col, subsize, subsize); - break; - case PARTITION_HORZ_B: - update_partition_context(xd, mi_row, mi_col, subsize, subsize); - update_partition_context(xd, mi_row + hbs, mi_col, bsize2, subsize); - break; - case PARTITION_VERT_A: - update_partition_context(xd, mi_row, mi_col, bsize2, subsize); - update_partition_context(xd, mi_row, mi_col + hbs, subsize, subsize); - break; - case PARTITION_VERT_B: - update_partition_context(xd, mi_row, mi_col, subsize, subsize); - update_partition_context(xd, mi_row, mi_col + hbs, bsize2, subsize); - break; - default: assert(0 && "Invalid partition type"); - } - } -} - -static INLINE int partition_plane_context(const MACROBLOCKD *xd, int mi_row, - int mi_col, BLOCK_SIZE bsize) { - const PARTITION_CONTEXT *above_ctx = xd->above_seg_context + mi_col; - const PARTITION_CONTEXT *left_ctx = - xd->left_seg_context + (mi_row & MAX_MIB_MASK); - // Minimum partition point is 8x8. Offset the bsl accordingly. 
- const int bsl = mi_size_wide_log2[bsize] - mi_size_wide_log2[BLOCK_8X8]; - int above = (*above_ctx >> bsl) & 1, left = (*left_ctx >> bsl) & 1; - - assert(mi_size_wide_log2[bsize] == mi_size_high_log2[bsize]); - assert(bsl >= 0); - - return (left * 2 + above) + bsl * PARTITION_PLOFFSET; -} - -// Return the number of elements in the partition CDF when -// partitioning the (square) block with luma block size of bsize. -static INLINE int partition_cdf_length(BLOCK_SIZE bsize) { - if (bsize <= BLOCK_8X8) - return PARTITION_TYPES; - else if (bsize == BLOCK_128X128) - return EXT_PARTITION_TYPES - 2; - else - return EXT_PARTITION_TYPES; -} - -static INLINE int max_block_wide(const MACROBLOCKD *xd, BLOCK_SIZE bsize, - int plane) { - int max_blocks_wide = block_size_wide[bsize]; - const struct macroblockd_plane *const pd = &xd->plane[plane]; - - if (xd->mb_to_right_edge < 0) - max_blocks_wide += xd->mb_to_right_edge >> (3 + pd->subsampling_x); - - // Scale the width in the transform block unit. - return max_blocks_wide >> tx_size_wide_log2[0]; -} - -static INLINE int max_block_high(const MACROBLOCKD *xd, BLOCK_SIZE bsize, - int plane) { - int max_blocks_high = block_size_high[bsize]; - const struct macroblockd_plane *const pd = &xd->plane[plane]; - - if (xd->mb_to_bottom_edge < 0) - max_blocks_high += xd->mb_to_bottom_edge >> (3 + pd->subsampling_y); - - // Scale the height in the transform block unit. 
- return max_blocks_high >> tx_size_high_log2[0]; -} - -static INLINE int max_intra_block_width(const MACROBLOCKD *xd, - BLOCK_SIZE plane_bsize, int plane, - TX_SIZE tx_size) { - const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane) - << tx_size_wide_log2[0]; - return ALIGN_POWER_OF_TWO(max_blocks_wide, tx_size_wide_log2[tx_size]); -} - -static INLINE int max_intra_block_height(const MACROBLOCKD *xd, - BLOCK_SIZE plane_bsize, int plane, - TX_SIZE tx_size) { - const int max_blocks_high = max_block_high(xd, plane_bsize, plane) - << tx_size_high_log2[0]; - return ALIGN_POWER_OF_TWO(max_blocks_high, tx_size_high_log2[tx_size]); -} - -static INLINE void av1_zero_above_context(AV1_COMMON *const cm, const MACROBLOCKD *xd, - int mi_col_start, int mi_col_end, const int tile_row) { - const SequenceHeader *const seq_params = &cm->seq_params; - const int num_planes = av1_num_planes(cm); - const int width = mi_col_end - mi_col_start; - const int aligned_width = - ALIGN_POWER_OF_TWO(width, seq_params->mib_size_log2); - - const int offset_y = mi_col_start; - const int width_y = aligned_width; - const int offset_uv = offset_y >> seq_params->subsampling_x; - const int width_uv = width_y >> seq_params->subsampling_x; - - av1_zero_array(cm->above_context[0][tile_row] + offset_y, width_y); - if (num_planes > 1) { - if (cm->above_context[1][tile_row] && cm->above_context[2][tile_row]) { - av1_zero_array(cm->above_context[1][tile_row] + offset_uv, width_uv); - av1_zero_array(cm->above_context[2][tile_row] + offset_uv, width_uv); - } else { - aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME, - "Invalid value of planes"); - } - } - - av1_zero_array(cm->above_seg_context[tile_row] + mi_col_start, aligned_width); - - memset(cm->above_txfm_context[tile_row] + mi_col_start, - tx_size_wide[TX_SIZES_LARGEST], - aligned_width * sizeof(TXFM_CONTEXT)); -} - -static INLINE void av1_zero_left_context(MACROBLOCKD *const xd) { - av1_zero(xd->left_context); - 
av1_zero(xd->left_seg_context); - - memset(xd->left_txfm_context_buffer, tx_size_high[TX_SIZES_LARGEST], - sizeof(xd->left_txfm_context_buffer)); -} - -// Disable array-bounds checks as the TX_SIZE enum contains values larger than -// TX_SIZES_ALL (TX_INVALID) which make extending the array as a workaround -// infeasible. The assert is enough for static analysis and this or other tools -// asan, valgrind would catch oob access at runtime. -#if defined(__GNUC__) && __GNUC__ >= 4 -#pragma GCC diagnostic ignored "-Warray-bounds" -#endif - -#if defined(__GNUC__) && __GNUC__ >= 4 -#pragma GCC diagnostic warning "-Warray-bounds" -#endif - -static INLINE void set_txfm_ctx(TXFM_CONTEXT *txfm_ctx, uint8_t txs, int len) { - int i; - for (i = 0; i < len; ++i) txfm_ctx[i] = txs; -} - -static INLINE void set_txfm_ctxs(TX_SIZE tx_size, int n4_w, int n4_h, int skip, - const MACROBLOCKD *xd) { - uint8_t bw = tx_size_wide[tx_size]; - uint8_t bh = tx_size_high[tx_size]; - - if (skip) { - bw = n4_w * MI_SIZE; - bh = n4_h * MI_SIZE; - } - - set_txfm_ctx(xd->above_txfm_context, bw, n4_w); - set_txfm_ctx(xd->left_txfm_context, bh, n4_h); -} - -static INLINE void txfm_partition_update(TXFM_CONTEXT *above_ctx, - TXFM_CONTEXT *left_ctx, - TX_SIZE tx_size, TX_SIZE txb_size) { - BLOCK_SIZE bsize = txsize_to_bsize[txb_size]; - int bh = mi_size_high[bsize]; - int bw = mi_size_wide[bsize]; - uint8_t txw = tx_size_wide[tx_size]; - uint8_t txh = tx_size_high[tx_size]; - int i; - for (i = 0; i < bh; ++i) left_ctx[i] = txh; - for (i = 0; i < bw; ++i) above_ctx[i] = txw; -} - -static INLINE TX_SIZE get_sqr_tx_size(int tx_dim) { - switch (tx_dim) { - case 128: - case 64: return TX_64X64; break; - case 32: return TX_32X32; break; - case 16: return TX_16X16; break; - case 8: return TX_8X8; break; - default: return TX_4X4; - } -} - -static INLINE TX_SIZE get_tx_size(int width, int height) { - if (width == height) { - return get_sqr_tx_size(width); - } - if (width < height) { - if (width + width == 
height) { - switch (width) { - case 4: return TX_4X8; break; - case 8: return TX_8X16; break; - case 16: return TX_16X32; break; - case 32: return TX_32X64; break; - } - } else { - switch (width) { - case 4: return TX_4X16; break; - case 8: return TX_8X32; break; - case 16: return TX_16X64; break; - } - } - } else { - if (height + height == width) { - switch (height) { - case 4: return TX_8X4; break; - case 8: return TX_16X8; break; - case 16: return TX_32X16; break; - case 32: return TX_64X32; break; - } - } else { - switch (height) { - case 4: return TX_16X4; break; - case 8: return TX_32X8; break; - case 16: return TX_64X16; break; - } - } - } - assert(0); - return TX_4X4; -} - -static INLINE int txfm_partition_context(TXFM_CONTEXT *above_ctx, - TXFM_CONTEXT *left_ctx, - BLOCK_SIZE bsize, TX_SIZE tx_size) { - const uint8_t txw = tx_size_wide[tx_size]; - const uint8_t txh = tx_size_high[tx_size]; - const int above = *above_ctx < txw; - const int left = *left_ctx < txh; - int category = TXFM_PARTITION_CONTEXTS; - - // dummy return, not used by others. - if (tx_size <= TX_4X4) return 0; - - TX_SIZE max_tx_size = - get_sqr_tx_size(AOMMAX(block_size_wide[bsize], block_size_high[bsize])); - - if (max_tx_size >= TX_8X8) { - category = - (txsize_sqr_up_map[tx_size] != max_tx_size && max_tx_size > TX_8X8) + - (TX_SIZES - 1 - max_tx_size) * 2; - } - assert(category != TXFM_PARTITION_CONTEXTS); - return category * 3 + above + left; -} - -// Compute the next partition in the direction of the sb_type stored in the mi -// array, starting with bsize. 
-static INLINE PARTITION_TYPE get_partition(const AV1_COMMON *const cm, - int mi_row, int mi_col, - BLOCK_SIZE bsize) { - if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return PARTITION_INVALID; - - const int offset = mi_row * cm->mi_stride + mi_col; - MB_MODE_INFO **mi = cm->mi_grid_visible + offset; - const BLOCK_SIZE subsize = mi[0]->sb_type; - - if (subsize == bsize) return PARTITION_NONE; - - const int bhigh = mi_size_high[bsize]; - const int bwide = mi_size_wide[bsize]; - const int sshigh = mi_size_high[subsize]; - const int sswide = mi_size_wide[subsize]; - - if (bsize > BLOCK_8X8 && mi_row + bwide / 2 < cm->mi_rows && - mi_col + bhigh / 2 < cm->mi_cols) { - // In this case, the block might be using an extended partition - // type. - const MB_MODE_INFO *const mbmi_right = mi[bwide / 2]; - const MB_MODE_INFO *const mbmi_below = mi[bhigh / 2 * cm->mi_stride]; - - if (sswide == bwide) { - // Smaller height but same width. Is PARTITION_HORZ_4, PARTITION_HORZ or - // PARTITION_HORZ_B. To distinguish the latter two, check if the lower - // half was split. - if (sshigh * 4 == bhigh) return PARTITION_HORZ_4; - assert(sshigh * 2 == bhigh); - - if (mbmi_below->sb_type == subsize) - return PARTITION_HORZ; - else - return PARTITION_HORZ_B; - } else if (sshigh == bhigh) { - // Smaller width but same height. Is PARTITION_VERT_4, PARTITION_VERT or - // PARTITION_VERT_B. To distinguish the latter two, check if the right - // half was split. - if (sswide * 4 == bwide) return PARTITION_VERT_4; - assert(sswide * 2 == bhigh); - - if (mbmi_right->sb_type == subsize) - return PARTITION_VERT; - else - return PARTITION_VERT_B; - } else { - // Smaller width and smaller height. Might be PARTITION_SPLIT or could be - // PARTITION_HORZ_A or PARTITION_VERT_A. If subsize isn't halved in both - // dimensions, we immediately know this is a split (which will recurse to - // get to subsize). Otherwise look down and to the right. 
With - // PARTITION_VERT_A, the right block will have height bhigh; with - // PARTITION_HORZ_A, the lower block with have width bwide. Otherwise - // it's PARTITION_SPLIT. - if (sswide * 2 != bwide || sshigh * 2 != bhigh) return PARTITION_SPLIT; - - if (mi_size_wide[mbmi_below->sb_type] == bwide) return PARTITION_HORZ_A; - if (mi_size_high[mbmi_right->sb_type] == bhigh) return PARTITION_VERT_A; - - return PARTITION_SPLIT; - } - } - const int vert_split = sswide < bwide; - const int horz_split = sshigh < bhigh; - const int split_idx = (vert_split << 1) | horz_split; - assert(split_idx != 0); - - static const PARTITION_TYPE base_partitions[4] = { - PARTITION_INVALID, PARTITION_HORZ, PARTITION_VERT, PARTITION_SPLIT - }; - - return base_partitions[split_idx]; -} - -static INLINE void set_use_reference_buffer(AV1_COMMON *const cm, int use) { - cm->seq_params.frame_id_numbers_present_flag = use; -} - -static INLINE void set_sb_size(SequenceHeader *const seq_params, - BLOCK_SIZE sb_size) { - seq_params->sb_size = sb_size; - seq_params->mib_size = mi_size_wide[seq_params->sb_size]; - seq_params->mib_size_log2 = mi_size_wide_log2[seq_params->sb_size]; -} - -// Returns true if the frame is fully lossless at the coded resolution. -// Note: If super-resolution is used, such a frame will still NOT be lossless at -// the upscaled resolution. 
-static INLINE int is_coded_lossless(const AV1_COMMON *cm, - const MACROBLOCKD *xd) { - int coded_lossless = 1; - if (cm->seg.enabled) { - for (int i = 0; i < MAX_SEGMENTS; ++i) { - if (!xd->lossless[i]) { - coded_lossless = 0; - break; - } - } - } else { - coded_lossless = xd->lossless[0]; - } - return coded_lossless; -} - -static INLINE int is_valid_seq_level_idx(uint8_t seq_level_idx) { - return seq_level_idx < 24 || seq_level_idx == 31; -} - -static INLINE uint8_t major_minor_to_seq_level_idx(BitstreamLevel bl) { - assert(bl.major >= LEVEL_MAJOR_MIN && bl.major <= LEVEL_MAJOR_MAX); - // Since bl.minor is unsigned a comparison will return a warning: - // comparison is always true due to limited range of data type - assert(LEVEL_MINOR_MIN == 0); - assert(bl.minor <= LEVEL_MINOR_MAX); - return ((bl.major - LEVEL_MAJOR_MIN) << LEVEL_MINOR_BITS) + bl.minor; -} - -#ifdef __cplusplus -} // extern "C" -#endif - -#endif // AOM_AV1_COMMON_ONYXC_INT_H_ diff --git a/media/libaom/src/av1/common/ppc/cfl_ppc.c b/media/libaom/src/av1/common/ppc/cfl_ppc.c index 026a07809..6f88768f2 100644 --- a/media/libaom/src/av1/common/ppc/cfl_ppc.c +++ b/media/libaom/src/av1/common/ppc/cfl_ppc.c @@ -124,27 +124,27 @@ CFL_SUB_AVG_X(vsx, 32, 32, 512, 10) // Based on observation, for small blocks VSX does not outperform C (no 64bit // load and store intrinsics). So we call the C code for block widths 4. 
-cfl_subtract_average_fn get_subtract_average_fn_vsx(TX_SIZE tx_size) { +cfl_subtract_average_fn cfl_get_subtract_average_fn_vsx(TX_SIZE tx_size) { static const cfl_subtract_average_fn sub_avg[TX_SIZES_ALL] = { - subtract_average_4x4_c, /* 4x4 */ - subtract_average_8x8_vsx, /* 8x8 */ - subtract_average_16x16_vsx, /* 16x16 */ - subtract_average_32x32_vsx, /* 32x32 */ - cfl_subtract_average_null, /* 64x64 (invalid CFL size) */ - subtract_average_4x8_c, /* 4x8 */ - subtract_average_8x4_vsx, /* 8x4 */ - subtract_average_8x16_vsx, /* 8x16 */ - subtract_average_16x8_vsx, /* 16x8 */ - subtract_average_16x32_vsx, /* 16x32 */ - subtract_average_32x16_vsx, /* 32x16 */ - cfl_subtract_average_null, /* 32x64 (invalid CFL size) */ - cfl_subtract_average_null, /* 64x32 (invalid CFL size) */ - subtract_average_4x16_c, /* 4x16 */ - subtract_average_16x4_vsx, /* 16x4 */ - subtract_average_8x32_vsx, /* 8x32 */ - subtract_average_32x8_vsx, /* 32x8 */ - cfl_subtract_average_null, /* 16x64 (invalid CFL size) */ - cfl_subtract_average_null, /* 64x16 (invalid CFL size) */ + cfl_subtract_average_4x4_c, /* 4x4 */ + cfl_subtract_average_8x8_vsx, /* 8x8 */ + cfl_subtract_average_16x16_vsx, /* 16x16 */ + cfl_subtract_average_32x32_vsx, /* 32x32 */ + NULL, /* 64x64 (invalid CFL size) */ + cfl_subtract_average_4x8_c, /* 4x8 */ + cfl_subtract_average_8x4_vsx, /* 8x4 */ + cfl_subtract_average_8x16_vsx, /* 8x16 */ + cfl_subtract_average_16x8_vsx, /* 16x8 */ + cfl_subtract_average_16x32_vsx, /* 16x32 */ + cfl_subtract_average_32x16_vsx, /* 32x16 */ + NULL, /* 32x64 (invalid CFL size) */ + NULL, /* 64x32 (invalid CFL size) */ + cfl_subtract_average_4x16_c, /* 4x16 */ + cfl_subtract_average_16x4_vsx, /* 16x4 */ + cfl_subtract_average_8x32_vsx, /* 8x32 */ + cfl_subtract_average_32x8_vsx, /* 32x8 */ + NULL, /* 16x64 (invalid CFL size) */ + NULL, /* 64x16 (invalid CFL size) */ }; // Modulo TX_SIZES_ALL to ensure that an attacker won't be able to // index the function pointer array out of bounds. 
diff --git a/media/libaom/src/av1/common/pred_common.h b/media/libaom/src/av1/common/pred_common.h index 6dba2322d..d1dab97e7 100644 --- a/media/libaom/src/av1/common/pred_common.h +++ b/media/libaom/src/av1/common/pred_common.h @@ -12,29 +12,31 @@ #ifndef AOM_AV1_COMMON_PRED_COMMON_H_ #define AOM_AV1_COMMON_PRED_COMMON_H_ +#include "av1/common/av1_common_int.h" #include "av1/common/blockd.h" #include "av1/common/mvref_common.h" -#include "av1/common/onyxc_int.h" #include "aom_dsp/aom_dsp_common.h" #ifdef __cplusplus extern "C" { #endif -static INLINE int get_segment_id(const AV1_COMMON *const cm, +static INLINE int get_segment_id(const CommonModeInfoParams *const mi_params, const uint8_t *segment_ids, BLOCK_SIZE bsize, int mi_row, int mi_col) { - const int mi_offset = mi_row * cm->mi_cols + mi_col; + const int mi_offset = mi_row * mi_params->mi_cols + mi_col; const int bw = mi_size_wide[bsize]; const int bh = mi_size_high[bsize]; - const int xmis = AOMMIN(cm->mi_cols - mi_col, bw); - const int ymis = AOMMIN(cm->mi_rows - mi_row, bh); - int x, y, segment_id = MAX_SEGMENTS; - - for (y = 0; y < ymis; ++y) - for (x = 0; x < xmis; ++x) - segment_id = - AOMMIN(segment_id, segment_ids[mi_offset + y * cm->mi_cols + x]); + const int xmis = AOMMIN(mi_params->mi_cols - mi_col, bw); + const int ymis = AOMMIN(mi_params->mi_rows - mi_row, bh); + int segment_id = MAX_SEGMENTS; + + for (int y = 0; y < ymis; ++y) { + for (int x = 0; x < xmis; ++x) { + segment_id = AOMMIN(segment_id, + segment_ids[mi_offset + y * mi_params->mi_cols + x]); + } + } assert(segment_id >= 0 && segment_id < MAX_SEGMENTS); return segment_id; @@ -42,26 +44,33 @@ static INLINE int get_segment_id(const AV1_COMMON *const cm, static INLINE int av1_get_spatial_seg_pred(const AV1_COMMON *const cm, const MACROBLOCKD *const xd, - int mi_row, int mi_col, int *cdf_index) { int prev_ul = -1; // top left segment_id int prev_l = -1; // left segment_id int prev_u = -1; // top segment_id + const int mi_row = xd->mi_row; 
+ const int mi_col = xd->mi_col; + const CommonModeInfoParams *const mi_params = &cm->mi_params; + const uint8_t *seg_map = cm->cur_frame->seg_map; if ((xd->up_available) && (xd->left_available)) { - prev_ul = get_segment_id(cm, cm->current_frame_seg_map, BLOCK_4X4, - mi_row - 1, mi_col - 1); + prev_ul = + get_segment_id(mi_params, seg_map, BLOCK_4X4, mi_row - 1, mi_col - 1); } if (xd->up_available) { - prev_u = get_segment_id(cm, cm->current_frame_seg_map, BLOCK_4X4, - mi_row - 1, mi_col - 0); + prev_u = + get_segment_id(mi_params, seg_map, BLOCK_4X4, mi_row - 1, mi_col - 0); } if (xd->left_available) { - prev_l = get_segment_id(cm, cm->current_frame_seg_map, BLOCK_4X4, - mi_row - 0, mi_col - 1); + prev_l = + get_segment_id(mi_params, seg_map, BLOCK_4X4, mi_row - 0, mi_col - 1); } + // This property follows from the fact that get_segment_id() returns a + // nonnegative value. This allows us to test for all edge cases with a simple + // prev_ul < 0 check. + assert(IMPLIES(prev_ul >= 0, prev_u >= 0 && prev_l >= 0)); // Pick CDF index based on number of matching/out-of-bounds segment IDs. 
- if (prev_ul < 0 || prev_u < 0 || prev_l < 0) /* Edge case */ + if (prev_ul < 0) /* Edge cases */ *cdf_index = 0; else if ((prev_ul == prev_u) && (prev_ul == prev_l)) *cdf_index = 2; @@ -90,18 +99,18 @@ static INLINE int av1_get_pred_context_seg_id(const MACROBLOCKD *xd) { static INLINE int get_comp_index_context(const AV1_COMMON *cm, const MACROBLOCKD *xd) { MB_MODE_INFO *mbmi = xd->mi[0]; - int bck_idx = cm->frame_refs[mbmi->ref_frame[0] - LAST_FRAME].idx; - int fwd_idx = cm->frame_refs[mbmi->ref_frame[1] - LAST_FRAME].idx; + const RefCntBuffer *const bck_buf = get_ref_frame_buf(cm, mbmi->ref_frame[0]); + const RefCntBuffer *const fwd_buf = get_ref_frame_buf(cm, mbmi->ref_frame[1]); int bck_frame_index = 0, fwd_frame_index = 0; - int cur_frame_index = cm->cur_frame->cur_frame_offset; + int cur_frame_index = cm->cur_frame->order_hint; - if (bck_idx >= 0) - bck_frame_index = cm->buffer_pool->frame_bufs[bck_idx].cur_frame_offset; + if (bck_buf != NULL) bck_frame_index = bck_buf->order_hint; + if (fwd_buf != NULL) fwd_frame_index = fwd_buf->order_hint; - if (fwd_idx >= 0) - fwd_frame_index = cm->buffer_pool->frame_bufs[fwd_idx].cur_frame_offset; - int fwd = abs(get_relative_dist(cm, fwd_frame_index, cur_frame_index)); - int bck = abs(get_relative_dist(cm, cur_frame_index, bck_frame_index)); + int fwd = abs(get_relative_dist(&cm->seq_params.order_hint_info, + fwd_frame_index, cur_frame_index)); + int bck = abs(get_relative_dist(&cm->seq_params.order_hint_info, + cur_frame_index, bck_frame_index)); const MB_MODE_INFO *const above_mi = xd->above_mbmi; const MB_MODE_INFO *const left_mi = xd->left_mbmi; @@ -109,14 +118,14 @@ static INLINE int get_comp_index_context(const AV1_COMMON *cm, int above_ctx = 0, left_ctx = 0; const int offset = (fwd == bck); - if (above_mi) { + if (above_mi != NULL) { if (has_second_ref(above_mi)) above_ctx = above_mi->compound_idx; else if (above_mi->ref_frame[0] == ALTREF_FRAME) above_ctx = 1; } - if (left_mi) { + if (left_mi != NULL) { if 
(has_second_ref(left_mi)) left_ctx = left_mi->compound_idx; else if (left_mi->ref_frame[0] == ALTREF_FRAME) @@ -178,6 +187,7 @@ int av1_get_palette_cache(const MACROBLOCKD *const xd, int plane, uint16_t *cache); static INLINE int av1_get_palette_bsize_ctx(BLOCK_SIZE bsize) { + assert(bsize < BLOCK_SIZES_ALL); return num_pels_log2_lookup[bsize] - num_pels_log2_lookup[BLOCK_8X8]; } @@ -198,6 +208,10 @@ static INLINE aom_cdf_prob *av1_get_reference_mode_cdf(const MACROBLOCKD *xd) { return xd->tile_ctx->comp_inter_cdf[av1_get_reference_mode_context(xd)]; } +static INLINE aom_cdf_prob *av1_get_skip_cdf(const MACROBLOCKD *xd) { + return xd->tile_ctx->skip_cdfs[av1_get_skip_context(xd)]; +} + int av1_get_comp_reference_type_context(const MACROBLOCKD *xd); // == Uni-directional contexts == diff --git a/media/libaom/src/av1/common/quant_common.c b/media/libaom/src/av1/common/quant_common.c index 0e14da7a3..e96d71a3b 100644 --- a/media/libaom/src/av1/common/quant_common.c +++ b/media/libaom/src/av1/common/quant_common.c @@ -9,14 +9,14 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ +#include "av1/common/av1_common_int.h" +#include "av1/common/blockd.h" #include "av1/common/common.h" -#include "av1/common/onyxc_int.h" #include "av1/common/entropy.h" #include "av1/common/quant_common.h" #include "av1/common/seg_common.h" -#include "av1/common/blockd.h" -static const int16_t dc_qlookup_Q3[QINDEX_RANGE] = { +static const int16_t dc_qlookup_QTX[QINDEX_RANGE] = { 4, 8, 8, 9, 10, 11, 12, 12, 13, 14, 15, 16, 17, 18, 19, 19, 20, 21, 22, 23, 24, 25, 26, 26, 27, 28, 29, 30, 31, 32, 32, 33, 34, 35, 36, 37, 38, 38, 39, 40, 41, 42, @@ -38,7 +38,7 @@ static const int16_t dc_qlookup_Q3[QINDEX_RANGE] = { 1184, 1232, 1282, 1336, }; -static const int16_t dc_qlookup_10_Q3[QINDEX_RANGE] = { +static const int16_t dc_qlookup_10_QTX[QINDEX_RANGE] = { 4, 9, 10, 13, 15, 17, 20, 22, 25, 28, 31, 34, 37, 40, 43, 47, 50, 53, 57, 60, 64, 68, 71, 75, 78, 82, 86, 90, 93, 97, 101, 105, 109, 113, 116, 120, 124, 128, 132, @@ -61,7 +61,7 @@ static const int16_t dc_qlookup_10_Q3[QINDEX_RANGE] = { 3953, 4089, 4236, 4394, 4559, 4737, 4929, 5130, 5347, }; -static const int16_t dc_qlookup_12_Q3[QINDEX_RANGE] = { +static const int16_t dc_qlookup_12_QTX[QINDEX_RANGE] = { 4, 12, 18, 25, 33, 41, 50, 60, 70, 80, 91, 103, 115, 127, 140, 153, 166, 180, 194, 208, 222, 237, 251, 266, 281, 296, 312, 327, 343, 358, 374, 390, 405, @@ -88,7 +88,7 @@ static const int16_t dc_qlookup_12_Q3[QINDEX_RANGE] = { 19718, 20521, 21387, }; -static const int16_t ac_qlookup_Q3[QINDEX_RANGE] = { +static const int16_t ac_qlookup_QTX[QINDEX_RANGE] = { 4, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, @@ -111,7 +111,7 @@ static const int16_t ac_qlookup_Q3[QINDEX_RANGE] = { 1567, 1597, 1628, 1660, 1692, 1725, 1759, 1793, 1828, }; -static const int16_t ac_qlookup_10_Q3[QINDEX_RANGE] = { +static const int16_t ac_qlookup_10_QTX[QINDEX_RANGE] = { 4, 9, 11, 13, 16, 18, 21, 24, 27, 30, 33, 37, 40, 44, 48, 51, 55, 59, 
63, 67, 71, 75, 79, 83, 88, 92, 96, 100, 105, 109, 114, 118, 122, 127, 131, 136, 140, 145, 149, @@ -134,7 +134,7 @@ static const int16_t ac_qlookup_10_Q3[QINDEX_RANGE] = { 6268, 6388, 6512, 6640, 6768, 6900, 7036, 7172, 7312, }; -static const int16_t ac_qlookup_12_Q3[QINDEX_RANGE] = { +static const int16_t ac_qlookup_12_QTX[QINDEX_RANGE] = { 4, 13, 19, 27, 35, 44, 54, 64, 75, 87, 99, 112, 126, 139, 154, 168, 183, 199, 214, 230, 247, 263, 280, 297, 314, 331, 349, 366, 384, 402, 420, 438, 456, @@ -190,39 +190,30 @@ static const int16_t ac_qlookup_12_Q3[QINDEX_RANGE] = { // addition, the minimum allowable quantizer is 4; smaller values will // underflow to 0 in the actual quantization routines. -int16_t av1_dc_quant_Q3(int qindex, int delta, aom_bit_depth_t bit_depth) { +int16_t av1_dc_quant_QTX(int qindex, int delta, aom_bit_depth_t bit_depth) { + const int q_clamped = clamp(qindex + delta, 0, MAXQ); switch (bit_depth) { - case AOM_BITS_8: return dc_qlookup_Q3[clamp(qindex + delta, 0, MAXQ)]; - case AOM_BITS_10: return dc_qlookup_10_Q3[clamp(qindex + delta, 0, MAXQ)]; - case AOM_BITS_12: return dc_qlookup_12_Q3[clamp(qindex + delta, 0, MAXQ)]; + case AOM_BITS_8: return dc_qlookup_QTX[q_clamped]; + case AOM_BITS_10: return dc_qlookup_10_QTX[q_clamped]; + case AOM_BITS_12: return dc_qlookup_12_QTX[q_clamped]; default: assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12"); return -1; } } -int16_t av1_ac_quant_Q3(int qindex, int delta, aom_bit_depth_t bit_depth) { +int16_t av1_ac_quant_QTX(int qindex, int delta, aom_bit_depth_t bit_depth) { + const int q_clamped = clamp(qindex + delta, 0, MAXQ); switch (bit_depth) { - case AOM_BITS_8: return ac_qlookup_Q3[clamp(qindex + delta, 0, MAXQ)]; - case AOM_BITS_10: return ac_qlookup_10_Q3[clamp(qindex + delta, 0, MAXQ)]; - case AOM_BITS_12: return ac_qlookup_12_Q3[clamp(qindex + delta, 0, MAXQ)]; + case AOM_BITS_8: return ac_qlookup_QTX[q_clamped]; + case AOM_BITS_10: return ac_qlookup_10_QTX[q_clamped]; + 
case AOM_BITS_12: return ac_qlookup_12_QTX[q_clamped]; default: assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12"); return -1; } } -// In AV1 TX, the coefficients are always scaled up a factor of 8 (3 -// bits), so QTX == Q3. - -int16_t av1_dc_quant_QTX(int qindex, int delta, aom_bit_depth_t bit_depth) { - return av1_dc_quant_Q3(qindex, delta, bit_depth); -} - -int16_t av1_ac_quant_QTX(int qindex, int delta, aom_bit_depth_t bit_depth) { - return av1_ac_quant_Q3(qindex, delta, bit_depth); -} - int av1_get_qindex(const struct segmentation *seg, int segment_id, int base_qindex) { if (segfeature_active(seg, segment_id, SEG_LVL_ALT_Q)) { @@ -234,39 +225,82 @@ int av1_get_qindex(const struct segmentation *seg, int segment_id, } } -const qm_val_t *av1_iqmatrix(AV1_COMMON *cm, int qmlevel, int plane, - TX_SIZE tx_size) { - return &cm->giqmatrix[qmlevel][plane][tx_size][0]; +bool av1_use_qmatrix(const CommonQuantParams *quant_params, + const struct macroblockd *xd, int segment_id) { + // True if explicit Q matrix levels and this is not a lossless segment. + return quant_params->using_qmatrix && !xd->lossless[segment_id]; } -const qm_val_t *av1_qmatrix(AV1_COMMON *cm, int qmlevel, int plane, - TX_SIZE tx_size) { - return &cm->gqmatrix[qmlevel][plane][tx_size][0]; + +const qm_val_t *av1_iqmatrix(const CommonQuantParams *quant_params, int qmlevel, + int plane, TX_SIZE tx_size) { + assert(quant_params->giqmatrix[qmlevel][plane][tx_size] != NULL || + qmlevel == NUM_QM_LEVELS - 1); + return quant_params->giqmatrix[qmlevel][plane][tx_size]; +} +const qm_val_t *av1_qmatrix(const CommonQuantParams *quant_params, int qmlevel, + int plane, TX_SIZE tx_size) { + assert(quant_params->gqmatrix[qmlevel][plane][tx_size] != NULL || + qmlevel == NUM_QM_LEVELS - 1); + return quant_params->gqmatrix[qmlevel][plane][tx_size]; +} + +// Returns true if the tx_type corresponds to non-identity transform in both +// horizontal and vertical directions. 
+static INLINE bool is_2d_transform(TX_TYPE tx_type) { return (tx_type < IDTX); } + +const qm_val_t *av1_get_iqmatrix(const CommonQuantParams *quant_params, + const MACROBLOCKD *xd, int plane, + TX_SIZE tx_size, TX_TYPE tx_type) { + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const MB_MODE_INFO *const mbmi = xd->mi[0]; + const int seg_id = mbmi->segment_id; + const TX_SIZE qm_tx_size = av1_get_adjusted_tx_size(tx_size); + // Use a flat matrix (i.e. no weighting) for 1D and Identity transforms + return is_2d_transform(tx_type) + ? pd->seg_iqmatrix[seg_id][qm_tx_size] + : quant_params->giqmatrix[NUM_QM_LEVELS - 1][0][qm_tx_size]; +} + +const qm_val_t *av1_get_qmatrix(const CommonQuantParams *quant_params, + const MACROBLOCKD *xd, int plane, + TX_SIZE tx_size, TX_TYPE tx_type) { + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const MB_MODE_INFO *const mbmi = xd->mi[0]; + const int seg_id = mbmi->segment_id; + const TX_SIZE qm_tx_size = av1_get_adjusted_tx_size(tx_size); + // Use a flat matrix (i.e. no weighting) for 1D and Identity transforms + return is_2d_transform(tx_type) + ? pd->seg_qmatrix[seg_id][qm_tx_size] + : quant_params->gqmatrix[NUM_QM_LEVELS - 1][0][qm_tx_size]; } #define QM_TOTAL_SIZE 3344 -static const qm_val_t wt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE]; -static const qm_val_t iwt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE]; +// We only use wt_matrix_ref[q] and iwt_matrix_ref[q] +// for q = 0, ..., NUM_QM_LEVELS - 2. 
+static const qm_val_t wt_matrix_ref[NUM_QM_LEVELS - 1][2][QM_TOTAL_SIZE]; +static const qm_val_t iwt_matrix_ref[NUM_QM_LEVELS - 1][2][QM_TOTAL_SIZE]; -void av1_qm_init(AV1_COMMON *cm) { - const int num_planes = av1_num_planes(cm); - int q, c, t; - int current; - for (q = 0; q < NUM_QM_LEVELS; ++q) { - for (c = 0; c < num_planes; ++c) { - current = 0; - for (t = 0; t < TX_SIZES_ALL; ++t) { +void av1_qm_init(CommonQuantParams *quant_params, int num_planes) { + for (int q = 0; q < NUM_QM_LEVELS; ++q) { + for (int c = 0; c < num_planes; ++c) { + int current = 0; + for (int t = 0; t < TX_SIZES_ALL; ++t) { const int size = tx_size_2d[t]; const int qm_tx_size = av1_get_adjusted_tx_size(t); if (q == NUM_QM_LEVELS - 1) { - cm->gqmatrix[q][c][t] = NULL; - cm->giqmatrix[q][c][t] = NULL; + quant_params->gqmatrix[q][c][t] = NULL; + quant_params->giqmatrix[q][c][t] = NULL; } else if (t != qm_tx_size) { // Reuse matrices for 'qm_tx_size' - cm->gqmatrix[q][c][t] = cm->gqmatrix[q][c][qm_tx_size]; - cm->giqmatrix[q][c][t] = cm->giqmatrix[q][c][qm_tx_size]; + assert(t > qm_tx_size); + quant_params->gqmatrix[q][c][t] = + quant_params->gqmatrix[q][c][qm_tx_size]; + quant_params->giqmatrix[q][c][t] = + quant_params->giqmatrix[q][c][qm_tx_size]; } else { assert(current + size <= QM_TOTAL_SIZE); - cm->gqmatrix[q][c][t] = &wt_matrix_ref[q][c >= 1][current]; - cm->giqmatrix[q][c][t] = &iwt_matrix_ref[q][c >= 1][current]; + quant_params->gqmatrix[q][c][t] = &wt_matrix_ref[q][c >= 1][current]; + quant_params->giqmatrix[q][c][t] = + &iwt_matrix_ref[q][c >= 1][current]; current += size; } } @@ -274,7 +308,7 @@ void av1_qm_init(AV1_COMMON *cm) { } } -/* Provide 16 sets of quantization matrices for chroma and luma +/* Provide 15 sets of quantization matrices for chroma and luma and each TX size. Matrices for different TX sizes are in fact sub-sampled from the 32x32 and 16x16 sizes, but explicitly defined here for convenience. 
Intra and inter matrix sets are the @@ -283,9 +317,10 @@ void av1_qm_init(AV1_COMMON *cm) { frame. Matrices for different QM levels have been rescaled in the frequency domain according to different nominal viewing - distances. + distances. Matrices for QM level 15 are omitted because they are + not used. */ -static const qm_val_t iwt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = { +static const qm_val_t iwt_matrix_ref[NUM_QM_LEVELS - 1][2][QM_TOTAL_SIZE] = { { { /* Luma */ /* Size 4x4 */ @@ -6633,427 +6668,9 @@ static const qm_val_t iwt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = { 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32 }, }, - { - { /* Luma */ - /* Size 4x4 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - /* Size 8x8 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - /* Size 16x16 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 
32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, - /* Size 32x32 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 
32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 
32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - /* Size 4x8 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - /* Size 8x4 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - /* Size 8x16 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, - /* Size 16x8 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, - /* Size 16x32 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 
32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, - /* Size 32x16 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 
32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 
32, 32, 32, 32, 32, - /* Size 4x16 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - /* Size 16x4 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - /* Size 8x32 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, - /* Size 32x8 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 
32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32 }, - { /* Chroma */ - /* Size 4x4 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - /* Size 8x8 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - /* Size 16x16 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 
32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, - /* Size 32x32 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 
32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 
32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - /* Size 4x8 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - /* Size 8x4 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - /* Size 8x16 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, - /* Size 16x8 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, - /* Size 16x32 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 
32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, - /* Size 32x16 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 
32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 
32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, - /* Size 4x16 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - /* Size 16x4 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - /* Size 8x32 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, - /* Size 32x8 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 
32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32 }, - }, }; -static const qm_val_t wt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = { +static const qm_val_t wt_matrix_ref[NUM_QM_LEVELS - 1][2][QM_TOTAL_SIZE] = { { { /* Luma */ /* Size 4x4 */ @@ -13255,422 +12872,4 @@ static const qm_val_t wt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = { 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32 }, }, - { - { /* Luma */ - /* Size 4x4 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - /* Size 8x8 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - /* Size 16x16 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 
32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, - /* Size 32x32 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 
32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 
32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - /* Size 4x8 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - /* Size 8x4 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - /* Size 8x16 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, - /* Size 16x8 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 
32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, - /* Size 16x32 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 
32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, - /* Size 32x16 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 
32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, - /* Size 4x16 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - /* Size 16x4 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - /* Size 8x32 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 
32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, - /* Size 32x8 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32 }, - { /* Chroma */ - /* Size 4x4 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - /* Size 8x8 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - /* Size 16x16 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 
32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, - /* Size 32x32 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 
32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 
32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - /* Size 4x8 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - /* Size 8x4 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - /* Size 8x16 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, - /* Size 16x8 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 
32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, - /* Size 16x32 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 
32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, - /* Size 32x16 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 
32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, - /* Size 4x16 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - /* Size 16x4 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - /* Size 8x32 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 
32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, - /* Size 32x8 */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32 }, - }, }; diff --git a/media/libaom/src/av1/common/quant_common.h b/media/libaom/src/av1/common/quant_common.h index d1f52a660..9c30204ff 100644 --- a/media/libaom/src/av1/common/quant_common.h +++ b/media/libaom/src/av1/common/quant_common.h @@ -12,6 +12,7 @@ #ifndef AOM_AV1_COMMON_QUANT_COMMON_H_ #define AOM_AV1_COMMON_QUANT_COMMON_H_ +#include <stdbool.h> #include "aom/aom_codec.h" #include "av1/common/seg_common.h" #include "av1/common/enums.h" @@ -37,24 +38,43 @@ extern "C" { #define DEFAULT_QM_LAST 9 struct AV1Common; +struct CommonQuantParams; +struct macroblockd; -int16_t av1_dc_quant_Q3(int qindex, int delta, aom_bit_depth_t bit_depth); -int16_t av1_ac_quant_Q3(int qindex, int delta, aom_bit_depth_t bit_depth); int16_t 
av1_dc_quant_QTX(int qindex, int delta, aom_bit_depth_t bit_depth); int16_t av1_ac_quant_QTX(int qindex, int delta, aom_bit_depth_t bit_depth); int av1_get_qindex(const struct segmentation *seg, int segment_id, int base_qindex); + +// Returns true if we are using quantization matrix. +bool av1_use_qmatrix(const struct CommonQuantParams *quant_params, + const struct macroblockd *xd, int segment_id); + // Reduce the large number of quantizers to a smaller number of levels for which // different matrices may be defined static INLINE int aom_get_qmlevel(int qindex, int first, int last) { return first + (qindex * (last + 1 - first)) / QINDEX_RANGE; } -void av1_qm_init(struct AV1Common *cm); -const qm_val_t *av1_iqmatrix(struct AV1Common *cm, int qindex, int comp, - TX_SIZE tx_size); -const qm_val_t *av1_qmatrix(struct AV1Common *cm, int qindex, int comp, - TX_SIZE tx_size); + +// Initialize all global quant/dequant matrices. +void av1_qm_init(struct CommonQuantParams *quant_params, int num_planes); + +// Get global dequant matrix. +const qm_val_t *av1_iqmatrix(const struct CommonQuantParams *quant_params, + int qmlevel, int plane, TX_SIZE tx_size); +// Get global quant matrix. +const qm_val_t *av1_qmatrix(const struct CommonQuantParams *quant_params, + int qmlevel, int plane, TX_SIZE tx_size); + +// Get either local / global dequant matrix as appropriate. +const qm_val_t *av1_get_iqmatrix(const struct CommonQuantParams *quant_params, + const struct macroblockd *xd, int plane, + TX_SIZE tx_size, TX_TYPE tx_type); +// Get either local / global quant matrix as appropriate. 
+const qm_val_t *av1_get_qmatrix(const struct CommonQuantParams *quant_params, + const struct macroblockd *xd, int plane, + TX_SIZE tx_size, TX_TYPE tx_type); #ifdef __cplusplus } // extern "C" diff --git a/media/libaom/src/av1/common/reconinter.c b/media/libaom/src/av1/common/reconinter.c index 3203efce4..287adddcc 100644 --- a/media/libaom/src/av1/common/reconinter.c +++ b/media/libaom/src/av1/common/reconinter.c @@ -20,25 +20,24 @@ #include "aom/aom_integer.h" #include "aom_dsp/blend.h" +#include "av1/common/av1_common_int.h" #include "av1/common/blockd.h" #include "av1/common/mvref_common.h" +#include "av1/common/obmc.h" #include "av1/common/reconinter.h" #include "av1/common/reconintra.h" -#include "av1/common/onyxc_int.h" -#include "av1/common/obmc.h" - -#define USE_PRECOMPUTED_WEDGE_MASK 1 -#define USE_PRECOMPUTED_WEDGE_SIGN 1 // This function will determine whether or not to create a warped // prediction. int av1_allow_warp(const MB_MODE_INFO *const mbmi, const WarpTypesAllowed *const warp_types, const WarpedMotionParams *const gm_params, - int build_for_obmc, int x_scale, int y_scale, + int build_for_obmc, const struct scale_factors *const sf, WarpedMotionParams *final_warp_params) { - if (x_scale != SCALE_SUBPEL_SHIFTS || y_scale != SCALE_SUBPEL_SHIFTS) - return 0; + // Note: As per the spec, we must test the fixed point scales here, which are + // at a higher precision (1 << 14) than the xs and ys in subpel_params (that + // have 1 << 10 precision). 
+ if (av1_is_scaled(sf)) return 0; if (final_warp_params != NULL) *final_warp_params = default_warp_params; @@ -57,48 +56,114 @@ int av1_allow_warp(const MB_MODE_INFO *const mbmi, return 0; } -void av1_make_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst, - int dst_stride, const SubpelParams *subpel_params, - const struct scale_factors *sf, int w, int h, - ConvolveParams *conv_params, - InterpFilters interp_filters, - const WarpTypesAllowed *warp_types, int p_col, - int p_row, int plane, int ref, - const MB_MODE_INFO *mi, int build_for_obmc, - const MACROBLOCKD *xd, int can_use_previous) { - // Make sure the selected motion mode is valid for this configuration - assert_motion_mode_valid(mi->motion_mode, xd->global_motion, xd, mi, - can_use_previous); - assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL)); - - WarpedMotionParams final_warp_params; - const int do_warp = - (w >= 8 && h >= 8 && - av1_allow_warp(mi, warp_types, &xd->global_motion[mi->ref_frame[ref]], - build_for_obmc, subpel_params->xs, subpel_params->ys, - &final_warp_params)); - const int is_intrabc = mi->use_intrabc; - assert(IMPLIES(is_intrabc, !do_warp)); - - if (do_warp && xd->cur_frame_force_integer_mv == 0) { - const struct macroblockd_plane *const pd = &xd->plane[plane]; - const struct buf_2d *const pre_buf = &pd->pre[ref]; - av1_warp_plane(&final_warp_params, - xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH, xd->bd, - pre_buf->buf0, pre_buf->width, pre_buf->height, - pre_buf->stride, dst, p_col, p_row, w, h, dst_stride, - pd->subsampling_x, pd->subsampling_y, conv_params); - } else if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - highbd_inter_predictor(src, src_stride, dst, dst_stride, subpel_params, sf, - w, h, conv_params, interp_filters, is_intrabc, - xd->bd); +void av1_init_inter_params(InterPredParams *inter_pred_params, int block_width, + int block_height, int pix_row, int pix_col, + int subsampling_x, int subsampling_y, int bit_depth, + int use_hbd_buf, 
int is_intrabc, + const struct scale_factors *sf, + const struct buf_2d *ref_buf, + int_interpfilters interp_filters) { + inter_pred_params->block_width = block_width; + inter_pred_params->block_height = block_height; + inter_pred_params->pix_row = pix_row; + inter_pred_params->pix_col = pix_col; + inter_pred_params->subsampling_x = subsampling_x; + inter_pred_params->subsampling_y = subsampling_y; + inter_pred_params->bit_depth = bit_depth; + inter_pred_params->use_hbd_buf = use_hbd_buf; + inter_pred_params->is_intrabc = is_intrabc; + inter_pred_params->scale_factors = sf; + inter_pred_params->ref_frame_buf = *ref_buf; + inter_pred_params->mode = TRANSLATION_PRED; + inter_pred_params->comp_mode = UNIFORM_SINGLE; + + if (is_intrabc) { + inter_pred_params->interp_filter_params[0] = &av1_intrabc_filter_params; + inter_pred_params->interp_filter_params[1] = &av1_intrabc_filter_params; } else { - inter_predictor(src, src_stride, dst, dst_stride, subpel_params, sf, w, h, - conv_params, interp_filters, is_intrabc); + inter_pred_params->interp_filter_params[0] = + av1_get_interp_filter_params_with_block_size( + interp_filters.as_filters.x_filter, block_width); + inter_pred_params->interp_filter_params[1] = + av1_get_interp_filter_params_with_block_size( + interp_filters.as_filters.y_filter, block_height); + } +} + +void av1_init_comp_mode(InterPredParams *inter_pred_params) { + inter_pred_params->comp_mode = UNIFORM_COMP; +} + +void av1_init_warp_params(InterPredParams *inter_pred_params, + const WarpTypesAllowed *warp_types, int ref, + const MACROBLOCKD *xd, const MB_MODE_INFO *mi) { + if (inter_pred_params->block_height < 8 || inter_pred_params->block_width < 8) + return; + + if (xd->cur_frame_force_integer_mv) return; + + if (av1_allow_warp(mi, warp_types, &xd->global_motion[mi->ref_frame[ref]], 0, + inter_pred_params->scale_factors, + &inter_pred_params->warp_params)) + inter_pred_params->mode = WARP_PRED; +} + +void av1_init_mask_comp(InterPredParams 
*inter_pred_params, BLOCK_SIZE bsize, + const INTERINTER_COMPOUND_DATA *mask_comp) { + inter_pred_params->sb_type = bsize; + inter_pred_params->mask_comp = *mask_comp; + + if (inter_pred_params->conv_params.compound_index == 1) { + inter_pred_params->conv_params.do_average = 0; + inter_pred_params->comp_mode = MASK_COMP; + } +} + +void av1_make_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, + InterPredParams *inter_pred_params, + const SubpelParams *subpel_params) { + assert(IMPLIES(inter_pred_params->conv_params.is_compound, + inter_pred_params->conv_params.dst != NULL)); + + // TODO(jingning): av1_warp_plane() can be further cleaned up. + if (inter_pred_params->mode == WARP_PRED) { + av1_warp_plane( + &inter_pred_params->warp_params, inter_pred_params->use_hbd_buf, + inter_pred_params->bit_depth, inter_pred_params->ref_frame_buf.buf0, + inter_pred_params->ref_frame_buf.width, + inter_pred_params->ref_frame_buf.height, + inter_pred_params->ref_frame_buf.stride, dst, + inter_pred_params->pix_col, inter_pred_params->pix_row, + inter_pred_params->block_width, inter_pred_params->block_height, + dst_stride, inter_pred_params->subsampling_x, + inter_pred_params->subsampling_y, &inter_pred_params->conv_params); + } else if (inter_pred_params->mode == TRANSLATION_PRED) { +#if CONFIG_AV1_HIGHBITDEPTH + if (inter_pred_params->use_hbd_buf) { + highbd_inter_predictor( + src, src_stride, dst, dst_stride, subpel_params, + inter_pred_params->scale_factors, inter_pred_params->block_width, + inter_pred_params->block_height, &inter_pred_params->conv_params, + inter_pred_params->interp_filter_params, + inter_pred_params->bit_depth); + } else { + inter_predictor( + src, src_stride, dst, dst_stride, subpel_params, + inter_pred_params->scale_factors, inter_pred_params->block_width, + inter_pred_params->block_height, &inter_pred_params->conv_params, + inter_pred_params->interp_filter_params); + } +#else + inter_predictor( + src, src_stride, dst, 
dst_stride, subpel_params, + inter_pred_params->scale_factors, inter_pred_params->block_width, + inter_pred_params->block_height, &inter_pred_params->conv_params, + inter_pred_params->interp_filter_params); +#endif } } -#if USE_PRECOMPUTED_WEDGE_MASK static const uint8_t wedge_master_oblique_odd[MASK_MASTER_SIZE] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 6, 18, @@ -118,7 +183,8 @@ static const uint8_t wedge_master_vertical[MASK_MASTER_SIZE] = { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, }; -static void shift_copy(const uint8_t *src, uint8_t *dst, int shift, int width) { +static AOM_INLINE void shift_copy(const uint8_t *src, uint8_t *dst, int shift, + int width) { if (shift >= 0) { memcpy(dst + shift, src, width - shift); memset(dst, src[0], shift); @@ -128,9 +194,7 @@ static void shift_copy(const uint8_t *src, uint8_t *dst, int shift, int width) { memset(dst + width - shift, src[width - 1], shift); } } -#endif // USE_PRECOMPUTED_WEDGE_MASK -#if USE_PRECOMPUTED_WEDGE_SIGN /* clang-format off */ DECLARE_ALIGNED(16, static uint8_t, wedge_signflip_lookup[BLOCK_SIZES_ALL][MAX_WEDGE_TYPES]) = { @@ -158,10 +222,6 @@ DECLARE_ALIGNED(16, static uint8_t, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, // not used }; /* clang-format on */ -#else -DECLARE_ALIGNED(16, static uint8_t, - wedge_signflip_lookup[BLOCK_SIZES_ALL][MAX_WEDGE_TYPES]); -#endif // USE_PRECOMPUTED_WEDGE_SIGN // [negative][direction] DECLARE_ALIGNED( @@ -173,6 +233,10 @@ DECLARE_ALIGNED( DECLARE_ALIGNED(16, static uint8_t, wedge_mask_buf[2 * MAX_WEDGE_TYPES * 4 * MAX_WEDGE_SQUARE]); +DECLARE_ALIGNED(16, static uint8_t, + smooth_interintra_mask_buf[INTERINTRA_MODES][BLOCK_SIZES_ALL] + [MAX_WEDGE_SQUARE]); + static wedge_masks_type wedge_masks[BLOCK_SIZES_ALL][2]; static const wedge_code_type wedge_codebook_16_hgtw[16] = { @@ -208,23 +272,23 @@ static const wedge_code_type wedge_codebook_16_heqw[16] = { { WEDGE_OBLIQUE117, 2, 4 }, { 
WEDGE_OBLIQUE117, 6, 4 }, }; -const wedge_params_type wedge_params_lookup[BLOCK_SIZES_ALL] = { +const wedge_params_type av1_wedge_params_lookup[BLOCK_SIZES_ALL] = { { 0, NULL, NULL, NULL }, { 0, NULL, NULL, NULL }, { 0, NULL, NULL, NULL }, - { 4, wedge_codebook_16_heqw, wedge_signflip_lookup[BLOCK_8X8], + { MAX_WEDGE_TYPES, wedge_codebook_16_heqw, wedge_signflip_lookup[BLOCK_8X8], wedge_masks[BLOCK_8X8] }, - { 4, wedge_codebook_16_hgtw, wedge_signflip_lookup[BLOCK_8X16], + { MAX_WEDGE_TYPES, wedge_codebook_16_hgtw, wedge_signflip_lookup[BLOCK_8X16], wedge_masks[BLOCK_8X16] }, - { 4, wedge_codebook_16_hltw, wedge_signflip_lookup[BLOCK_16X8], + { MAX_WEDGE_TYPES, wedge_codebook_16_hltw, wedge_signflip_lookup[BLOCK_16X8], wedge_masks[BLOCK_16X8] }, - { 4, wedge_codebook_16_heqw, wedge_signflip_lookup[BLOCK_16X16], + { MAX_WEDGE_TYPES, wedge_codebook_16_heqw, wedge_signflip_lookup[BLOCK_16X16], wedge_masks[BLOCK_16X16] }, - { 4, wedge_codebook_16_hgtw, wedge_signflip_lookup[BLOCK_16X32], + { MAX_WEDGE_TYPES, wedge_codebook_16_hgtw, wedge_signflip_lookup[BLOCK_16X32], wedge_masks[BLOCK_16X32] }, - { 4, wedge_codebook_16_hltw, wedge_signflip_lookup[BLOCK_32X16], + { MAX_WEDGE_TYPES, wedge_codebook_16_hltw, wedge_signflip_lookup[BLOCK_32X16], wedge_masks[BLOCK_32X16] }, - { 4, wedge_codebook_16_heqw, wedge_signflip_lookup[BLOCK_32X32], + { MAX_WEDGE_TYPES, wedge_codebook_16_heqw, wedge_signflip_lookup[BLOCK_32X32], wedge_masks[BLOCK_32X32] }, { 0, NULL, NULL, NULL }, { 0, NULL, NULL, NULL }, @@ -234,9 +298,9 @@ const wedge_params_type wedge_params_lookup[BLOCK_SIZES_ALL] = { { 0, NULL, NULL, NULL }, { 0, NULL, NULL, NULL }, { 0, NULL, NULL, NULL }, - { 4, wedge_codebook_16_hgtw, wedge_signflip_lookup[BLOCK_8X32], + { MAX_WEDGE_TYPES, wedge_codebook_16_hgtw, wedge_signflip_lookup[BLOCK_8X32], wedge_masks[BLOCK_8X32] }, - { 4, wedge_codebook_16_hltw, wedge_signflip_lookup[BLOCK_32X8], + { MAX_WEDGE_TYPES, wedge_codebook_16_hltw, wedge_signflip_lookup[BLOCK_32X8], 
wedge_masks[BLOCK_32X8] }, { 0, NULL, NULL, NULL }, { 0, NULL, NULL, NULL }, @@ -248,12 +312,12 @@ static const uint8_t *get_wedge_mask_inplace(int wedge_index, int neg, const int bh = block_size_high[sb_type]; const int bw = block_size_wide[sb_type]; const wedge_code_type *a = - wedge_params_lookup[sb_type].codebook + wedge_index; + av1_wedge_params_lookup[sb_type].codebook + wedge_index; int woff, hoff; - const uint8_t wsignflip = wedge_params_lookup[sb_type].signflip[wedge_index]; + const uint8_t wsignflip = + av1_wedge_params_lookup[sb_type].signflip[wedge_index]; - assert(wedge_index >= 0 && - wedge_index < (1 << get_wedge_bits_lookup(sb_type))); + assert(wedge_index >= 0 && wedge_index < get_wedge_types_lookup(sb_type)); woff = (a->x_offset * bw) >> 3; hoff = (a->y_offset * bh) >> 3; master = wedge_mask_obl[neg ^ wsignflip][a->direction] + @@ -275,10 +339,10 @@ const uint8_t *av1_get_compound_type_mask( } } -static void diffwtd_mask_d16(uint8_t *mask, int which_inverse, int mask_base, - const CONV_BUF_TYPE *src0, int src0_stride, - const CONV_BUF_TYPE *src1, int src1_stride, int h, - int w, ConvolveParams *conv_params, int bd) { +static AOM_INLINE void diffwtd_mask_d16( + uint8_t *mask, int which_inverse, int mask_base, const CONV_BUF_TYPE *src0, + int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, + ConvolveParams *conv_params, int bd) { int round = 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1 + (bd - 8); int i, j, m, diff; @@ -309,9 +373,10 @@ void av1_build_compound_diffwtd_mask_d16_c( } } -static void diffwtd_mask(uint8_t *mask, int which_inverse, int mask_base, - const uint8_t *src0, int src0_stride, - const uint8_t *src1, int src1_stride, int h, int w) { +static AOM_INLINE void diffwtd_mask(uint8_t *mask, int which_inverse, + int mask_base, const uint8_t *src0, + int src0_stride, const uint8_t *src1, + int src1_stride, int h, int w) { int i, j, m, diff; for (i = 0; i < h; ++i) { for (j = 0; j < w; ++j) { @@ 
-419,13 +484,12 @@ void av1_build_compound_diffwtd_mask_highbd_c( } } -static void init_wedge_master_masks() { +static AOM_INLINE void init_wedge_master_masks() { int i, j; const int w = MASK_MASTER_SIZE; const int h = MASK_MASTER_SIZE; const int stride = MASK_MASTER_STRIDE; -// Note: index [0] stores the masters, and [1] its complement. -#if USE_PRECOMPUTED_WEDGE_MASK + // Note: index [0] stores the masters, and [1] its complement. // Generate prototype by shifting the masters int shift = h / 4; for (i = 0; i < h; i += 2) { @@ -443,22 +507,7 @@ static void init_wedge_master_masks() { wedge_master_vertical, MASK_MASTER_SIZE * sizeof(wedge_master_vertical[0])); } -#else - static const double smoother_param = 2.85; - const int a[2] = { 2, 1 }; - const double asqrt = sqrt(a[0] * a[0] + a[1] * a[1]); - for (i = 0; i < h; i++) { - for (j = 0; j < w; ++j) { - int x = (2 * j + 1 - w); - int y = (2 * i + 1 - h); - double d = (a[0] * x + a[1] * y) / asqrt; - const int msk = (int)rint((1.0 + tanh(d / smoother_param)) * 32); - wedge_mask_obl[0][WEDGE_OBLIQUE63][i * stride + j] = msk; - const int mskx = (int)rint((1.0 + tanh(x / smoother_param)) * 32); - wedge_mask_obl[0][WEDGE_VERTICAL][i * stride + j] = mskx; - } - } -#endif // USE_PRECOMPUTED_WEDGE_MASK + for (i = 0; i < h; ++i) { for (j = 0; j < w; ++j) { const int msk = wedge_mask_obl[0][WEDGE_OBLIQUE63][i * stride + j]; @@ -480,57 +529,18 @@ static void init_wedge_master_masks() { } } -#if !USE_PRECOMPUTED_WEDGE_SIGN -// If the signs for the wedges for various blocksizes are -// inconsistent flip the sign flag. Do it only once for every -// wedge codebook. 
-static void init_wedge_signs() { - BLOCK_SIZE sb_type; - memset(wedge_signflip_lookup, 0, sizeof(wedge_signflip_lookup)); - for (sb_type = BLOCK_4X4; sb_type < BLOCK_SIZES_ALL; ++sb_type) { - const int bw = block_size_wide[sb_type]; - const int bh = block_size_high[sb_type]; - const wedge_params_type wedge_params = wedge_params_lookup[sb_type]; - const int wbits = wedge_params.bits; - const int wtypes = 1 << wbits; - int i, w; - if (wbits) { - for (w = 0; w < wtypes; ++w) { - // Get the mask master, i.e. index [0] - const uint8_t *mask = get_wedge_mask_inplace(w, 0, sb_type); - int avg = 0; - for (i = 0; i < bw; ++i) avg += mask[i]; - for (i = 1; i < bh; ++i) avg += mask[i * MASK_MASTER_STRIDE]; - avg = (avg + (bw + bh - 1) / 2) / (bw + bh - 1); - // Default sign of this wedge is 1 if the average < 32, 0 otherwise. - // If default sign is 1: - // If sign requested is 0, we need to flip the sign and return - // the complement i.e. index [1] instead. If sign requested is 1 - // we need to flip the sign and return index [0] instead. - // If default sign is 0: - // If sign requested is 0, we need to return index [0] the master - // if sign requested is 1, we need to return the complement index [1] - // instead. 
- wedge_params.signflip[w] = (avg < 32); - } - } - } -} -#endif // !USE_PRECOMPUTED_WEDGE_SIGN - -static void init_wedge_masks() { +static AOM_INLINE void init_wedge_masks() { uint8_t *dst = wedge_mask_buf; BLOCK_SIZE bsize; memset(wedge_masks, 0, sizeof(wedge_masks)); for (bsize = BLOCK_4X4; bsize < BLOCK_SIZES_ALL; ++bsize) { + const wedge_params_type *wedge_params = &av1_wedge_params_lookup[bsize]; + const int wtypes = wedge_params->wedge_types; + if (wtypes == 0) continue; const uint8_t *mask; const int bw = block_size_wide[bsize]; const int bh = block_size_high[bsize]; - const wedge_params_type *wedge_params = &wedge_params_lookup[bsize]; - const int wbits = wedge_params->bits; - const int wtypes = 1 << wbits; int w; - if (wbits == 0) continue; for (w = 0; w < wtypes; ++w) { mask = get_wedge_mask_inplace(w, 0, bsize); aom_convolve_copy(mask, MASK_MASTER_STRIDE, dst, bw, NULL, 0, NULL, 0, bw, @@ -548,109 +558,383 @@ static void init_wedge_masks() { } } +/* clang-format off */ +static const uint8_t ii_weights1d[MAX_SB_SIZE] = { + 60, 58, 56, 54, 52, 50, 48, 47, 45, 44, 42, 41, 39, 38, 37, 35, 34, 33, 32, + 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 22, 21, 20, 19, 19, 18, 18, 17, 16, + 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 10, 10, 10, 9, 9, 9, 8, + 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 4, 4, + 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 +}; +static uint8_t ii_size_scales[BLOCK_SIZES_ALL] = { + 32, 16, 16, 16, 8, 8, 8, 4, + 4, 4, 2, 2, 2, 1, 1, 1, + 8, 8, 4, 4, 2, 2 +}; +/* clang-format on */ + +static AOM_INLINE void build_smooth_interintra_mask(uint8_t *mask, int stride, + BLOCK_SIZE plane_bsize, + INTERINTRA_MODE mode) { + int i, j; + const int bw = block_size_wide[plane_bsize]; + const int bh = block_size_high[plane_bsize]; + const int size_scale = ii_size_scales[plane_bsize]; + + switch (mode) { + case II_V_PRED: + for (i = 
0; i < bh; ++i) { + memset(mask, ii_weights1d[i * size_scale], bw * sizeof(mask[0])); + mask += stride; + } + break; + + case II_H_PRED: + for (i = 0; i < bh; ++i) { + for (j = 0; j < bw; ++j) mask[j] = ii_weights1d[j * size_scale]; + mask += stride; + } + break; + + case II_SMOOTH_PRED: + for (i = 0; i < bh; ++i) { + for (j = 0; j < bw; ++j) + mask[j] = ii_weights1d[(i < j ? i : j) * size_scale]; + mask += stride; + } + break; + + case II_DC_PRED: + default: + for (i = 0; i < bh; ++i) { + memset(mask, 32, bw * sizeof(mask[0])); + mask += stride; + } + break; + } +} + +static AOM_INLINE void init_smooth_interintra_masks() { + for (int m = 0; m < INTERINTRA_MODES; ++m) { + for (int bs = 0; bs < BLOCK_SIZES_ALL; ++bs) { + const int bw = block_size_wide[bs]; + const int bh = block_size_high[bs]; + if (bw > MAX_WEDGE_SIZE || bh > MAX_WEDGE_SIZE) continue; + build_smooth_interintra_mask(smooth_interintra_mask_buf[m][bs], bw, bs, + m); + } + } +} + // Equation of line: f(x, y) = a[0]*(x - a[2]*w/8) + a[1]*(y - a[3]*h/8) = 0 void av1_init_wedge_masks() { init_wedge_master_masks(); -#if !USE_PRECOMPUTED_WEDGE_SIGN - init_wedge_signs(); -#endif // !USE_PRECOMPUTED_WEDGE_SIGN init_wedge_masks(); + init_smooth_interintra_masks(); } -static void build_masked_compound_no_round( +static AOM_INLINE void build_masked_compound_no_round( uint8_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE sb_type, int h, - int w, ConvolveParams *conv_params, MACROBLOCKD *xd) { - // Derive subsampling from h and w passed in. May be refactored to - // pass in subsampling factors directly. 
- const int subh = (2 << mi_size_high_log2[sb_type]) == h; - const int subw = (2 << mi_size_wide_log2[sb_type]) == w; + int w, InterPredParams *inter_pred_params) { + const int ssy = inter_pred_params->subsampling_y; + const int ssx = inter_pred_params->subsampling_x; const uint8_t *mask = av1_get_compound_type_mask(comp_data, sb_type); - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) + const int mask_stride = block_size_wide[sb_type]; +#if CONFIG_AV1_HIGHBITDEPTH + if (inter_pred_params->use_hbd_buf) { aom_highbd_blend_a64_d16_mask(dst, dst_stride, src0, src0_stride, src1, - src1_stride, mask, block_size_wide[sb_type], - w, h, subw, subh, conv_params, xd->bd); - else + src1_stride, mask, mask_stride, w, h, ssx, + ssy, &inter_pred_params->conv_params, + inter_pred_params->bit_depth); + } else { aom_lowbd_blend_a64_d16_mask(dst, dst_stride, src0, src0_stride, src1, - src1_stride, mask, block_size_wide[sb_type], w, - h, subw, subh, conv_params); + src1_stride, mask, mask_stride, w, h, ssx, ssy, + &inter_pred_params->conv_params); + } +#else + aom_lowbd_blend_a64_d16_mask(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h, ssx, ssy, + &inter_pred_params->conv_params); +#endif } -void av1_make_masked_inter_predictor( - const uint8_t *pre, int pre_stride, uint8_t *dst, int dst_stride, - const SubpelParams *subpel_params, const struct scale_factors *sf, int w, - int h, ConvolveParams *conv_params, InterpFilters interp_filters, int plane, - const WarpTypesAllowed *warp_types, int p_col, int p_row, int ref, - MACROBLOCKD *xd, int can_use_previous) { - MB_MODE_INFO *mi = xd->mi[0]; - (void)dst; - (void)dst_stride; - mi->interinter_comp.seg_mask = xd->seg_mask; - const INTERINTER_COMPOUND_DATA *comp_data = &mi->interinter_comp; - -// We're going to call av1_make_inter_predictor to generate a prediction into -// a temporary buffer, then will blend that temporary buffer with that from -// the other reference. 
-// -#define INTER_PRED_BYTES_PER_PIXEL 2 - - DECLARE_ALIGNED(32, uint8_t, - tmp_buf[INTER_PRED_BYTES_PER_PIXEL * MAX_SB_SQUARE]); -#undef INTER_PRED_BYTES_PER_PIXEL - - uint8_t *tmp_dst = get_buf_by_bd(xd, tmp_buf); +void av1_make_masked_inter_predictor(const uint8_t *pre, int pre_stride, + uint8_t *dst, int dst_stride, + InterPredParams *inter_pred_params, + const SubpelParams *subpel_params) { + const INTERINTER_COMPOUND_DATA *comp_data = &inter_pred_params->mask_comp; + BLOCK_SIZE sb_type = inter_pred_params->sb_type; + + // We're going to call av1_make_inter_predictor to generate a prediction into + // a temporary buffer, then will blend that temporary buffer with that from + // the other reference. + DECLARE_ALIGNED(32, uint8_t, tmp_buf[2 * MAX_SB_SQUARE]); + uint8_t *tmp_dst = + inter_pred_params->use_hbd_buf ? CONVERT_TO_BYTEPTR(tmp_buf) : tmp_buf; const int tmp_buf_stride = MAX_SB_SIZE; - CONV_BUF_TYPE *org_dst = conv_params->dst; - int org_dst_stride = conv_params->dst_stride; + CONV_BUF_TYPE *org_dst = inter_pred_params->conv_params.dst; + int org_dst_stride = inter_pred_params->conv_params.dst_stride; CONV_BUF_TYPE *tmp_buf16 = (CONV_BUF_TYPE *)tmp_buf; - conv_params->dst = tmp_buf16; - conv_params->dst_stride = tmp_buf_stride; - assert(conv_params->do_average == 0); + inter_pred_params->conv_params.dst = tmp_buf16; + inter_pred_params->conv_params.dst_stride = tmp_buf_stride; + assert(inter_pred_params->conv_params.do_average == 0); // This will generate a prediction in tmp_buf for the second reference - av1_make_inter_predictor(pre, pre_stride, tmp_dst, MAX_SB_SIZE, subpel_params, - sf, w, h, conv_params, interp_filters, warp_types, - p_col, p_row, plane, ref, mi, 0, xd, - can_use_previous); + av1_make_inter_predictor(pre, pre_stride, tmp_dst, MAX_SB_SIZE, + inter_pred_params, subpel_params); - if (!plane && comp_data->type == COMPOUND_DIFFWTD) { + if (!inter_pred_params->conv_params.plane && + comp_data->type == COMPOUND_DIFFWTD) { 
av1_build_compound_diffwtd_mask_d16( comp_data->seg_mask, comp_data->mask_type, org_dst, org_dst_stride, - tmp_buf16, tmp_buf_stride, h, w, conv_params, xd->bd); + tmp_buf16, tmp_buf_stride, inter_pred_params->block_height, + inter_pred_params->block_width, &inter_pred_params->conv_params, + inter_pred_params->bit_depth); + } + build_masked_compound_no_round( + dst, dst_stride, org_dst, org_dst_stride, tmp_buf16, tmp_buf_stride, + comp_data, sb_type, inter_pred_params->block_height, + inter_pred_params->block_width, inter_pred_params); +} + +void av1_build_one_inter_predictor( + uint8_t *dst, int dst_stride, const MV *const src_mv, + InterPredParams *inter_pred_params, MACROBLOCKD *xd, int mi_x, int mi_y, + int ref, CalcSubpelParamsFunc calc_subpel_params_func) { + SubpelParams subpel_params; + uint8_t *src; + int src_stride; + calc_subpel_params_func(src_mv, inter_pred_params, xd, mi_x, mi_y, ref, &src, + &subpel_params, &src_stride); + + if (inter_pred_params->comp_mode == UNIFORM_SINGLE || + inter_pred_params->comp_mode == UNIFORM_COMP) { + av1_make_inter_predictor(src, src_stride, dst, dst_stride, + inter_pred_params, &subpel_params); + } else { + av1_make_masked_inter_predictor(src, src_stride, dst, dst_stride, + inter_pred_params, &subpel_params); } - build_masked_compound_no_round(dst, dst_stride, org_dst, org_dst_stride, - tmp_buf16, tmp_buf_stride, comp_data, - mi->sb_type, h, w, conv_params, xd); } -void av1_jnt_comp_weight_assign(const AV1_COMMON *cm, const MB_MODE_INFO *mbmi, - int order_idx, int *fwd_offset, int *bck_offset, - int *use_jnt_comp_avg, int is_compound) { +// True if the following hold: +// 1. Not intrabc and not build_for_obmc +// 2. A U or V plane +// 3. If the block size differs from the base block size +// 4. 
If sub-sampled, none of the previous blocks around the sub-sample +// are intrabc or inter-blocks +static bool is_sub8x8_inter(const MACROBLOCKD *xd, int plane, BLOCK_SIZE bsize, + int is_intrabc, int build_for_obmc) { + if (is_intrabc || build_for_obmc) { + return false; + } + + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const int ss_x = pd->subsampling_x; + const int ss_y = pd->subsampling_y; + if ((block_size_wide[bsize] >= 8 || !ss_x) && + (block_size_high[bsize] >= 8 || !ss_y)) { + return false; + } + + // For sub8x8 chroma blocks, we may be covering more than one luma block's + // worth of pixels. Thus (mi_x, mi_y) may not be the correct coordinates for + // the top-left corner of the prediction source - the correct top-left corner + // is at (pre_x, pre_y). + const int row_start = (block_size_high[bsize] == 4) && ss_y ? -1 : 0; + const int col_start = (block_size_wide[bsize] == 4) && ss_x ? -1 : 0; + + for (int row = row_start; row <= 0; ++row) { + for (int col = col_start; col <= 0; ++col) { + const MB_MODE_INFO *this_mbmi = xd->mi[row * xd->mi_stride + col]; + if (!is_inter_block(this_mbmi)) return false; + if (is_intrabc_block(this_mbmi)) return false; + } + } + return true; +} + +static void build_inter_predictors_sub8x8( + const AV1_COMMON *cm, MACROBLOCKD *xd, int plane, const MB_MODE_INFO *mi, + int bw, int bh, int mi_x, int mi_y, + CalcSubpelParamsFunc calc_subpel_params_func) { + const BLOCK_SIZE bsize = mi->sb_type; + struct macroblockd_plane *const pd = &xd->plane[plane]; + const bool ss_x = pd->subsampling_x; + const bool ss_y = pd->subsampling_y; + const int b4_w = block_size_wide[bsize] >> ss_x; + const int b4_h = block_size_high[bsize] >> ss_y; + const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, ss_x, ss_y); + const int b8_w = block_size_wide[plane_bsize]; + const int b8_h = block_size_high[plane_bsize]; + const int is_compound = has_second_ref(mi); + assert(!is_compound); + assert(!is_intrabc_block(mi)); + + // 
For sub8x8 chroma blocks, we may be covering more than one luma block's + // worth of pixels. Thus (mi_x, mi_y) may not be the correct coordinates for + // the top-left corner of the prediction source - the correct top-left corner + // is at (pre_x, pre_y). + const int row_start = (block_size_high[bsize] == 4) && ss_y ? -1 : 0; + const int col_start = (block_size_wide[bsize] == 4) && ss_x ? -1 : 0; + const int pre_x = (mi_x + MI_SIZE * col_start) >> ss_x; + const int pre_y = (mi_y + MI_SIZE * row_start) >> ss_y; + + int row = row_start; + for (int y = 0; y < b8_h; y += b4_h) { + int col = col_start; + for (int x = 0; x < b8_w; x += b4_w) { + MB_MODE_INFO *this_mbmi = xd->mi[row * xd->mi_stride + col]; + int tmp_dst_stride = 8; + assert(bw < 8 || bh < 8); + (void)bw; + (void)bh; + struct buf_2d *const dst_buf = &pd->dst; + uint8_t *dst = dst_buf->buf + dst_buf->stride * y + x; + int ref = 0; + const RefCntBuffer *ref_buf = + get_ref_frame_buf(cm, this_mbmi->ref_frame[ref]); + const struct scale_factors *ref_scale_factors = + get_ref_scale_factors_const(cm, this_mbmi->ref_frame[ref]); + const struct scale_factors *const sf = ref_scale_factors; + const struct buf_2d pre_buf = { + NULL, + (plane == 1) ? 
ref_buf->buf.u_buffer : ref_buf->buf.v_buffer, + ref_buf->buf.uv_crop_width, + ref_buf->buf.uv_crop_height, + ref_buf->buf.uv_stride, + }; + + const MV mv = this_mbmi->mv[ref].as_mv; + + InterPredParams inter_pred_params; + av1_init_inter_params(&inter_pred_params, b4_w, b4_h, pre_y + y, + pre_x + x, pd->subsampling_x, pd->subsampling_y, + xd->bd, is_cur_buf_hbd(xd), mi->use_intrabc, sf, + &pre_buf, this_mbmi->interp_filters); + inter_pred_params.conv_params = get_conv_params_no_round( + ref, plane, xd->tmp_conv_dst, tmp_dst_stride, is_compound, xd->bd); + inter_pred_params.conv_params.use_dist_wtd_comp_avg = 0; + + av1_build_one_inter_predictor(dst, dst_buf->stride, &mv, + &inter_pred_params, xd, mi_x + x, mi_y + y, + ref, calc_subpel_params_func); + + ++col; + } + ++row; + } +} + +static void build_inter_predictors_8x8_and_bigger( + const AV1_COMMON *cm, MACROBLOCKD *xd, int plane, const MB_MODE_INFO *mi, + int build_for_obmc, int bw, int bh, int mi_x, int mi_y, + CalcSubpelParamsFunc calc_subpel_params_func) { + const int is_compound = has_second_ref(mi); + const int is_intrabc = is_intrabc_block(mi); + assert(IMPLIES(is_intrabc, !is_compound)); + struct macroblockd_plane *const pd = &xd->plane[plane]; + struct buf_2d *const dst_buf = &pd->dst; + uint8_t *const dst = dst_buf->buf; + + int is_global[2] = { 0, 0 }; + for (int ref = 0; ref < 1 + is_compound; ++ref) { + const WarpedMotionParams *const wm = &xd->global_motion[mi->ref_frame[ref]]; + is_global[ref] = is_global_mv_block(mi, wm->wmtype); + } + + const BLOCK_SIZE bsize = mi->sb_type; + const int ss_x = pd->subsampling_x; + const int ss_y = pd->subsampling_y; + const int row_start = + (block_size_high[bsize] == 4) && ss_y && !build_for_obmc ? -1 : 0; + const int col_start = + (block_size_wide[bsize] == 4) && ss_x && !build_for_obmc ? 
-1 : 0; + const int pre_x = (mi_x + MI_SIZE * col_start) >> ss_x; + const int pre_y = (mi_y + MI_SIZE * row_start) >> ss_y; + + for (int ref = 0; ref < 1 + is_compound; ++ref) { + const struct scale_factors *const sf = + is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref]; + struct buf_2d *const pre_buf = is_intrabc ? dst_buf : &pd->pre[ref]; + const MV mv = mi->mv[ref].as_mv; + const WarpTypesAllowed warp_types = { is_global[ref], + mi->motion_mode == WARPED_CAUSAL }; + + InterPredParams inter_pred_params; + av1_init_inter_params(&inter_pred_params, bw, bh, pre_y, pre_x, + pd->subsampling_x, pd->subsampling_y, xd->bd, + is_cur_buf_hbd(xd), mi->use_intrabc, sf, pre_buf, + mi->interp_filters); + if (is_compound) av1_init_comp_mode(&inter_pred_params); + inter_pred_params.conv_params = get_conv_params_no_round( + ref, plane, xd->tmp_conv_dst, MAX_SB_SIZE, is_compound, xd->bd); + + av1_dist_wtd_comp_weight_assign( + cm, mi, 0, &inter_pred_params.conv_params.fwd_offset, + &inter_pred_params.conv_params.bck_offset, + &inter_pred_params.conv_params.use_dist_wtd_comp_avg, is_compound); + + if (!build_for_obmc) + av1_init_warp_params(&inter_pred_params, &warp_types, ref, xd, mi); + + if (is_masked_compound_type(mi->interinter_comp.type)) { + av1_init_mask_comp(&inter_pred_params, mi->sb_type, &mi->interinter_comp); + // Assign physical buffer. 
+ inter_pred_params.mask_comp.seg_mask = xd->seg_mask; + } + + av1_build_one_inter_predictor(dst, dst_buf->stride, &mv, &inter_pred_params, + xd, mi_x, mi_y, ref, calc_subpel_params_func); + } +} + +void av1_build_inter_predictors(const AV1_COMMON *cm, MACROBLOCKD *xd, + int plane, const MB_MODE_INFO *mi, + int build_for_obmc, int bw, int bh, int mi_x, + int mi_y, + CalcSubpelParamsFunc calc_subpel_params_func) { + if (is_sub8x8_inter(xd, plane, mi->sb_type, is_intrabc_block(mi), + build_for_obmc)) { + build_inter_predictors_sub8x8(cm, xd, plane, mi, bw, bh, mi_x, mi_y, + calc_subpel_params_func); + } else { + build_inter_predictors_8x8_and_bigger(cm, xd, plane, mi, build_for_obmc, bw, + bh, mi_x, mi_y, + calc_subpel_params_func); + } +} + +void av1_dist_wtd_comp_weight_assign(const AV1_COMMON *cm, + const MB_MODE_INFO *mbmi, int order_idx, + int *fwd_offset, int *bck_offset, + int *use_dist_wtd_comp_avg, + int is_compound) { assert(fwd_offset != NULL && bck_offset != NULL); if (!is_compound || mbmi->compound_idx) { - *use_jnt_comp_avg = 0; + *use_dist_wtd_comp_avg = 0; return; } - *use_jnt_comp_avg = 1; - const int bck_idx = cm->frame_refs[mbmi->ref_frame[0] - LAST_FRAME].idx; - const int fwd_idx = cm->frame_refs[mbmi->ref_frame[1] - LAST_FRAME].idx; - const int cur_frame_index = cm->cur_frame->cur_frame_offset; + *use_dist_wtd_comp_avg = 1; + const RefCntBuffer *const bck_buf = get_ref_frame_buf(cm, mbmi->ref_frame[0]); + const RefCntBuffer *const fwd_buf = get_ref_frame_buf(cm, mbmi->ref_frame[1]); + const int cur_frame_index = cm->cur_frame->order_hint; int bck_frame_index = 0, fwd_frame_index = 0; - if (bck_idx >= 0) { - bck_frame_index = cm->buffer_pool->frame_bufs[bck_idx].cur_frame_offset; - } - - if (fwd_idx >= 0) { - fwd_frame_index = cm->buffer_pool->frame_bufs[fwd_idx].cur_frame_offset; - } + if (bck_buf != NULL) bck_frame_index = bck_buf->order_hint; + if (fwd_buf != NULL) fwd_frame_index = fwd_buf->order_hint; - int d0 = 
clamp(abs(get_relative_dist(cm, fwd_frame_index, cur_frame_index)), + int d0 = clamp(abs(get_relative_dist(&cm->seq_params.order_hint_info, + fwd_frame_index, cur_frame_index)), 0, MAX_FRAME_DISTANCE); - int d1 = clamp(abs(get_relative_dist(cm, cur_frame_index, bck_frame_index)), + int d1 = clamp(abs(get_relative_dist(&cm->seq_params.order_hint_info, + cur_frame_index, bck_frame_index)), 0, MAX_FRAME_DISTANCE); const int order = d0 <= d1; @@ -708,10 +992,9 @@ void av1_setup_pre_planes(MACROBLOCKD *xd, int idx, // obmc_mask_N[overlap_position] static const uint8_t obmc_mask_1[1] = { 64 }; +DECLARE_ALIGNED(2, static const uint8_t, obmc_mask_2[2]) = { 45, 64 }; -static const uint8_t obmc_mask_2[2] = { 45, 64 }; - -static const uint8_t obmc_mask_4[4] = { 39, 50, 59, 64 }; +DECLARE_ALIGNED(4, static const uint8_t, obmc_mask_4[4]) = { 39, 50, 59, 64 }; static const uint8_t obmc_mask_8[8] = { 36, 42, 48, 53, 57, 61, 64, 64 }; @@ -743,19 +1026,21 @@ const uint8_t *av1_get_obmc_mask(int length) { } } -static INLINE void increment_int_ptr(MACROBLOCKD *xd, int rel_mi_rc, - uint8_t mi_hw, MB_MODE_INFO *mi, - void *fun_ctxt, const int num_planes) { +static INLINE void increment_int_ptr(MACROBLOCKD *xd, int rel_mi_row, + int rel_mi_col, uint8_t op_mi_size, + int dir, MB_MODE_INFO *mi, void *fun_ctxt, + const int num_planes) { (void)xd; - (void)rel_mi_rc; - (void)mi_hw; + (void)rel_mi_row; + (void)rel_mi_col; + (void)op_mi_size; + (void)dir; (void)mi; ++*(int *)fun_ctxt; (void)num_planes; } -void av1_count_overlappable_neighbors(const AV1_COMMON *cm, MACROBLOCKD *xd, - int mi_row, int mi_col) { +void av1_count_overlappable_neighbors(const AV1_COMMON *cm, MACROBLOCKD *xd) { MB_MODE_INFO *mbmi = xd->mi[0]; mbmi->overlappable_neighbors[0] = 0; @@ -763,9 +1048,9 @@ void av1_count_overlappable_neighbors(const AV1_COMMON *cm, MACROBLOCKD *xd, if (!is_motion_variation_allowed_bsize(mbmi->sb_type)) return; - foreach_overlappable_nb_above(cm, xd, mi_col, INT_MAX, increment_int_ptr, + 
foreach_overlappable_nb_above(cm, xd, INT_MAX, increment_int_ptr, &mbmi->overlappable_neighbors[0]); - foreach_overlappable_nb_left(cm, xd, mi_row, INT_MAX, increment_int_ptr, + foreach_overlappable_nb_left(cm, xd, INT_MAX, increment_int_ptr, &mbmi->overlappable_neighbors[1]); } @@ -806,21 +1091,20 @@ struct obmc_inter_pred_ctxt { int *adjacent_stride; }; -static INLINE void build_obmc_inter_pred_above(MACROBLOCKD *xd, int rel_mi_col, - uint8_t above_mi_width, - MB_MODE_INFO *above_mi, - void *fun_ctxt, - const int num_planes) { +static INLINE void build_obmc_inter_pred_above( + MACROBLOCKD *xd, int rel_mi_row, int rel_mi_col, uint8_t op_mi_size, + int dir, MB_MODE_INFO *above_mi, void *fun_ctxt, const int num_planes) { (void)above_mi; + (void)rel_mi_row; + (void)dir; struct obmc_inter_pred_ctxt *ctxt = (struct obmc_inter_pred_ctxt *)fun_ctxt; const BLOCK_SIZE bsize = xd->mi[0]->sb_type; - const int is_hbd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0; const int overlap = AOMMIN(block_size_high[bsize], block_size_high[BLOCK_64X64]) >> 1; for (int plane = 0; plane < num_planes; ++plane) { const struct macroblockd_plane *pd = &xd->plane[plane]; - const int bw = (above_mi_width * MI_SIZE) >> pd->subsampling_x; + const int bw = (op_mi_size * MI_SIZE) >> pd->subsampling_x; const int bh = overlap >> pd->subsampling_y; const int plane_col = (rel_mi_col * MI_SIZE) >> pd->subsampling_x; @@ -831,32 +1115,36 @@ static INLINE void build_obmc_inter_pred_above(MACROBLOCKD *xd, int rel_mi_col, const int tmp_stride = ctxt->adjacent_stride[plane]; const uint8_t *const tmp = &ctxt->adjacent[plane][plane_col]; const uint8_t *const mask = av1_get_obmc_mask(bh); - +#if CONFIG_AV1_HIGHBITDEPTH + const int is_hbd = is_cur_buf_hbd(xd); if (is_hbd) aom_highbd_blend_a64_vmask(dst, dst_stride, dst, dst_stride, tmp, tmp_stride, mask, bw, bh, xd->bd); else aom_blend_a64_vmask(dst, dst_stride, dst, dst_stride, tmp, tmp_stride, mask, bw, bh); +#else + aom_blend_a64_vmask(dst, 
dst_stride, dst, dst_stride, tmp, tmp_stride, mask, + bw, bh); +#endif } } -static INLINE void build_obmc_inter_pred_left(MACROBLOCKD *xd, int rel_mi_row, - uint8_t left_mi_height, - MB_MODE_INFO *left_mi, - void *fun_ctxt, - const int num_planes) { +static INLINE void build_obmc_inter_pred_left( + MACROBLOCKD *xd, int rel_mi_row, int rel_mi_col, uint8_t op_mi_size, + int dir, MB_MODE_INFO *left_mi, void *fun_ctxt, const int num_planes) { (void)left_mi; + (void)rel_mi_col; + (void)dir; struct obmc_inter_pred_ctxt *ctxt = (struct obmc_inter_pred_ctxt *)fun_ctxt; const BLOCK_SIZE bsize = xd->mi[0]->sb_type; const int overlap = AOMMIN(block_size_wide[bsize], block_size_wide[BLOCK_64X64]) >> 1; - const int is_hbd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0; for (int plane = 0; plane < num_planes; ++plane) { const struct macroblockd_plane *pd = &xd->plane[plane]; const int bw = overlap >> pd->subsampling_x; - const int bh = (left_mi_height * MI_SIZE) >> pd->subsampling_y; + const int bh = (op_mi_size * MI_SIZE) >> pd->subsampling_y; const int plane_row = (rel_mi_row * MI_SIZE) >> pd->subsampling_y; if (av1_skip_u4x4_pred_in_obmc(bsize, pd, 1)) continue; @@ -867,12 +1155,18 @@ static INLINE void build_obmc_inter_pred_left(MACROBLOCKD *xd, int rel_mi_row, const uint8_t *const tmp = &ctxt->adjacent[plane][plane_row * tmp_stride]; const uint8_t *const mask = av1_get_obmc_mask(bw); +#if CONFIG_AV1_HIGHBITDEPTH + const int is_hbd = is_cur_buf_hbd(xd); if (is_hbd) aom_highbd_blend_a64_hmask(dst, dst_stride, dst, dst_stride, tmp, tmp_stride, mask, bw, bh, xd->bd); else aom_blend_a64_hmask(dst, dst_stride, dst, dst_stride, tmp, tmp_stride, mask, bw, bh); +#else + aom_blend_a64_hmask(dst, dst_stride, dst, dst_stride, tmp, tmp_stride, mask, + bw, bh); +#endif } } @@ -881,7 +1175,6 @@ static INLINE void build_obmc_inter_pred_left(MACROBLOCKD *xd, int rel_mi_row, // prediction. 
We assume the original prediction (bmc) is stored in // xd->plane[].dst.buf void av1_build_obmc_inter_prediction(const AV1_COMMON *cm, MACROBLOCKD *xd, - int mi_row, int mi_col, uint8_t *above[MAX_MB_PLANE], int above_stride[MAX_MB_PLANE], uint8_t *left[MAX_MB_PLANE], @@ -890,23 +1183,54 @@ void av1_build_obmc_inter_prediction(const AV1_COMMON *cm, MACROBLOCKD *xd, // handle above row struct obmc_inter_pred_ctxt ctxt_above = { above, above_stride }; - foreach_overlappable_nb_above(cm, xd, mi_col, + foreach_overlappable_nb_above(cm, xd, max_neighbor_obmc[mi_size_wide_log2[bsize]], build_obmc_inter_pred_above, &ctxt_above); // handle left column struct obmc_inter_pred_ctxt ctxt_left = { left, left_stride }; - foreach_overlappable_nb_left(cm, xd, mi_row, + foreach_overlappable_nb_left(cm, xd, max_neighbor_obmc[mi_size_high_log2[bsize]], build_obmc_inter_pred_left, &ctxt_left); } +void av1_setup_address_for_obmc(MACROBLOCKD *xd, int mi_row_offset, + int mi_col_offset, MB_MODE_INFO *ref_mbmi, + struct build_prediction_ctxt *ctxt, + const int num_planes) { + const BLOCK_SIZE ref_bsize = AOMMAX(BLOCK_8X8, ref_mbmi->sb_type); + const int ref_mi_row = xd->mi_row + mi_row_offset; + const int ref_mi_col = xd->mi_col + mi_col_offset; + + for (int plane = 0; plane < num_planes; ++plane) { + struct macroblockd_plane *const pd = &xd->plane[plane]; + setup_pred_plane(&pd->dst, ref_bsize, ctxt->tmp_buf[plane], + ctxt->tmp_width[plane], ctxt->tmp_height[plane], + ctxt->tmp_stride[plane], mi_row_offset, mi_col_offset, + NULL, pd->subsampling_x, pd->subsampling_y); + } + + const MV_REFERENCE_FRAME frame = ref_mbmi->ref_frame[0]; + + const RefCntBuffer *const ref_buf = get_ref_frame_buf(ctxt->cm, frame); + const struct scale_factors *const sf = + get_ref_scale_factors_const(ctxt->cm, frame); + + xd->block_ref_scale_factors[0] = sf; + if ((!av1_is_valid_scale(sf))) + aom_internal_error(xd->error_info, AOM_CODEC_UNSUP_BITSTREAM, + "Reference frame has invalid dimensions"); + + 
av1_setup_pre_planes(xd, 0, &ref_buf->buf, ref_mi_row, ref_mi_col, sf, + num_planes); +} + void av1_setup_build_prediction_by_above_pred( MACROBLOCKD *xd, int rel_mi_col, uint8_t above_mi_width, MB_MODE_INFO *above_mbmi, struct build_prediction_ctxt *ctxt, const int num_planes) { const BLOCK_SIZE a_bsize = AOMMAX(BLOCK_8X8, above_mbmi->sb_type); - const int above_mi_col = ctxt->mi_col + rel_mi_col; + const int above_mi_col = xd->mi_col + rel_mi_col; av1_modify_neighbor_predictor_for_obmc(above_mbmi); @@ -922,19 +1246,21 @@ void av1_setup_build_prediction_by_above_pred( for (int ref = 0; ref < num_refs; ++ref) { const MV_REFERENCE_FRAME frame = above_mbmi->ref_frame[ref]; - const RefBuffer *const ref_buf = &ctxt->cm->frame_refs[frame - LAST_FRAME]; - - xd->block_refs[ref] = ref_buf; - if ((!av1_is_valid_scale(&ref_buf->sf))) + const RefCntBuffer *const ref_buf = get_ref_frame_buf(ctxt->cm, frame); + const struct scale_factors *const sf = + get_ref_scale_factors_const(ctxt->cm, frame); + xd->block_ref_scale_factors[ref] = sf; + if ((!av1_is_valid_scale(sf))) aom_internal_error(xd->error_info, AOM_CODEC_UNSUP_BITSTREAM, "Reference frame has invalid dimensions"); - av1_setup_pre_planes(xd, ref, ref_buf->buf, ctxt->mi_row, above_mi_col, - &ref_buf->sf, num_planes); + av1_setup_pre_planes(xd, ref, &ref_buf->buf, xd->mi_row, above_mi_col, sf, + num_planes); } xd->mb_to_left_edge = 8 * MI_SIZE * (-above_mi_col); - xd->mb_to_right_edge = ctxt->mb_to_far_edge + - (xd->n4_w - rel_mi_col - above_mi_width) * MI_SIZE * 8; + xd->mb_to_right_edge = + ctxt->mb_to_far_edge + + (xd->width - rel_mi_col - above_mi_width) * MI_SIZE * 8; } void av1_setup_build_prediction_by_left_pred(MACROBLOCKD *xd, int rel_mi_row, @@ -943,7 +1269,7 @@ void av1_setup_build_prediction_by_left_pred(MACROBLOCKD *xd, int rel_mi_row, struct build_prediction_ctxt *ctxt, const int num_planes) { const BLOCK_SIZE l_bsize = AOMMAX(BLOCK_8X8, left_mbmi->sb_type); - const int left_mi_row = ctxt->mi_row + 
rel_mi_row; + const int left_mi_row = xd->mi_row + rel_mi_row; av1_modify_neighbor_predictor_for_obmc(left_mbmi); @@ -959,91 +1285,34 @@ void av1_setup_build_prediction_by_left_pred(MACROBLOCKD *xd, int rel_mi_row, for (int ref = 0; ref < num_refs; ++ref) { const MV_REFERENCE_FRAME frame = left_mbmi->ref_frame[ref]; - const RefBuffer *const ref_buf = &ctxt->cm->frame_refs[frame - LAST_FRAME]; + const RefCntBuffer *const ref_buf = get_ref_frame_buf(ctxt->cm, frame); + const struct scale_factors *const ref_scale_factors = + get_ref_scale_factors_const(ctxt->cm, frame); - xd->block_refs[ref] = ref_buf; - if ((!av1_is_valid_scale(&ref_buf->sf))) + xd->block_ref_scale_factors[ref] = ref_scale_factors; + if ((!av1_is_valid_scale(ref_scale_factors))) aom_internal_error(xd->error_info, AOM_CODEC_UNSUP_BITSTREAM, "Reference frame has invalid dimensions"); - av1_setup_pre_planes(xd, ref, ref_buf->buf, left_mi_row, ctxt->mi_col, - &ref_buf->sf, num_planes); + av1_setup_pre_planes(xd, ref, &ref_buf->buf, left_mi_row, xd->mi_col, + ref_scale_factors, num_planes); } - xd->mb_to_top_edge = 8 * MI_SIZE * (-left_mi_row); + xd->mb_to_top_edge = GET_MV_SUBPEL(MI_SIZE * (-left_mi_row)); xd->mb_to_bottom_edge = ctxt->mb_to_far_edge + - (xd->n4_h - rel_mi_row - left_mi_height) * MI_SIZE * 8; + GET_MV_SUBPEL((xd->height - rel_mi_row - left_mi_height) * MI_SIZE); } -/* clang-format off */ -static const uint8_t ii_weights1d[MAX_SB_SIZE] = { - 60, 58, 56, 54, 52, 50, 48, 47, 45, 44, 42, 41, 39, 38, 37, 35, 34, 33, 32, - 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 22, 21, 20, 19, 19, 18, 18, 17, 16, - 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 10, 10, 10, 9, 9, 9, 8, - 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 4, 4, - 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 -}; -static uint8_t ii_size_scales[BLOCK_SIZES_ALL] = { - 32, 16, 16, 16, 8, 8, 8, 4, - 4, 4, 2, 2, 2, 1, 1, 
1, - 8, 8, 4, 4, 2, 2 -}; -/* clang-format on */ - -static void build_smooth_interintra_mask(uint8_t *mask, int stride, - BLOCK_SIZE plane_bsize, - INTERINTRA_MODE mode) { - int i, j; - const int bw = block_size_wide[plane_bsize]; - const int bh = block_size_high[plane_bsize]; - const int size_scale = ii_size_scales[plane_bsize]; - - switch (mode) { - case II_V_PRED: - for (i = 0; i < bh; ++i) { - memset(mask, ii_weights1d[i * size_scale], bw * sizeof(mask[0])); - mask += stride; - } - break; - - case II_H_PRED: - for (i = 0; i < bh; ++i) { - for (j = 0; j < bw; ++j) mask[j] = ii_weights1d[j * size_scale]; - mask += stride; - } - break; - - case II_SMOOTH_PRED: - for (i = 0; i < bh; ++i) { - for (j = 0; j < bw; ++j) - mask[j] = ii_weights1d[(i < j ? i : j) * size_scale]; - mask += stride; - } - break; - - case II_DC_PRED: - default: - for (i = 0; i < bh; ++i) { - memset(mask, 32, bw * sizeof(mask[0])); - mask += stride; - } - break; - } -} - -static void combine_interintra(INTERINTRA_MODE mode, int use_wedge_interintra, - int wedge_index, int wedge_sign, - BLOCK_SIZE bsize, BLOCK_SIZE plane_bsize, - uint8_t *comppred, int compstride, - const uint8_t *interpred, int interstride, - const uint8_t *intrapred, int intrastride) { +static AOM_INLINE void combine_interintra( + INTERINTRA_MODE mode, int8_t use_wedge_interintra, int8_t wedge_index, + int8_t wedge_sign, BLOCK_SIZE bsize, BLOCK_SIZE plane_bsize, + uint8_t *comppred, int compstride, const uint8_t *interpred, + int interstride, const uint8_t *intrapred, int intrastride) { const int bw = block_size_wide[plane_bsize]; const int bh = block_size_high[plane_bsize]; if (use_wedge_interintra) { - if (is_interintra_wedge_used(bsize)) { + if (av1_is_wedge_used(bsize)) { const uint8_t *mask = av1_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize); const int subw = 2 * mi_size_wide[bsize] == bw; @@ -1055,22 +1324,22 @@ static void combine_interintra(INTERINTRA_MODE mode, int use_wedge_interintra, return; } - uint8_t 
mask[MAX_SB_SQUARE]; - build_smooth_interintra_mask(mask, bw, plane_bsize, mode); + const uint8_t *mask = smooth_interintra_mask_buf[mode][plane_bsize]; aom_blend_a64_mask(comppred, compstride, intrapred, intrastride, interpred, interstride, mask, bw, bw, bh, 0, 0); } -static void combine_interintra_highbd( - INTERINTRA_MODE mode, int use_wedge_interintra, int wedge_index, - int wedge_sign, BLOCK_SIZE bsize, BLOCK_SIZE plane_bsize, +#if CONFIG_AV1_HIGHBITDEPTH +static AOM_INLINE void combine_interintra_highbd( + INTERINTRA_MODE mode, int8_t use_wedge_interintra, int8_t wedge_index, + int8_t wedge_sign, BLOCK_SIZE bsize, BLOCK_SIZE plane_bsize, uint8_t *comppred8, int compstride, const uint8_t *interpred8, int interstride, const uint8_t *intrapred8, int intrastride, int bd) { const int bw = block_size_wide[plane_bsize]; const int bh = block_size_high[plane_bsize]; if (use_wedge_interintra) { - if (is_interintra_wedge_used(bsize)) { + if (av1_is_wedge_used(bsize)) { const uint8_t *mask = av1_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize); const int subh = 2 * mi_size_high[bsize] == bh; @@ -1088,12 +1357,13 @@ static void combine_interintra_highbd( interpred8, interstride, mask, bw, bw, bh, 0, 0, bd); } +#endif void av1_build_intra_predictors_for_interintra(const AV1_COMMON *cm, MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane, - BUFFER_SET *ctx, uint8_t *dst, - int dst_stride) { + const BUFFER_SET *ctx, + uint8_t *dst, int dst_stride) { struct macroblockd_plane *const pd = &xd->plane[plane]; const int ssx = xd->plane[plane].subsampling_x; const int ssy = xd->plane[plane].subsampling_y; @@ -1116,28 +1386,30 @@ void av1_combine_interintra(MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane, const int ssx = xd->plane[plane].subsampling_x; const int ssy = xd->plane[plane].subsampling_y; const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, ssx, ssy); - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { +#if CONFIG_AV1_HIGHBITDEPTH + if (is_cur_buf_hbd(xd)) { 
combine_interintra_highbd( xd->mi[0]->interintra_mode, xd->mi[0]->use_wedge_interintra, - xd->mi[0]->interintra_wedge_index, xd->mi[0]->interintra_wedge_sign, - bsize, plane_bsize, xd->plane[plane].dst.buf, - xd->plane[plane].dst.stride, inter_pred, inter_stride, intra_pred, - intra_stride, xd->bd); + xd->mi[0]->interintra_wedge_index, INTERINTRA_WEDGE_SIGN, bsize, + plane_bsize, xd->plane[plane].dst.buf, xd->plane[plane].dst.stride, + inter_pred, inter_stride, intra_pred, intra_stride, xd->bd); return; } +#endif combine_interintra( xd->mi[0]->interintra_mode, xd->mi[0]->use_wedge_interintra, - xd->mi[0]->interintra_wedge_index, xd->mi[0]->interintra_wedge_sign, - bsize, plane_bsize, xd->plane[plane].dst.buf, xd->plane[plane].dst.stride, + xd->mi[0]->interintra_wedge_index, INTERINTRA_WEDGE_SIGN, bsize, + plane_bsize, xd->plane[plane].dst.buf, xd->plane[plane].dst.stride, inter_pred, inter_stride, intra_pred, intra_stride); } // build interintra_predictors for one plane -void av1_build_interintra_predictors_sbp(const AV1_COMMON *cm, MACROBLOCKD *xd, - uint8_t *pred, int stride, - BUFFER_SET *ctx, int plane, - BLOCK_SIZE bsize) { - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { +void av1_build_interintra_predictor(const AV1_COMMON *cm, MACROBLOCKD *xd, + uint8_t *pred, int stride, + const BUFFER_SET *ctx, int plane, + BLOCK_SIZE bsize) { + assert(bsize < BLOCK_SIZES_ALL); + if (is_cur_buf_hbd(xd)) { DECLARE_ALIGNED(16, uint16_t, intrapredictor[MAX_SB_SQUARE]); av1_build_intra_predictors_for_interintra( cm, xd, bsize, plane, ctx, CONVERT_TO_BYTEPTR(intrapredictor), @@ -1152,11 +1424,3 @@ void av1_build_interintra_predictors_sbp(const AV1_COMMON *cm, MACROBLOCKD *xd, MAX_SB_SIZE); } } - -void av1_build_interintra_predictors_sbuv(const AV1_COMMON *cm, MACROBLOCKD *xd, - uint8_t *upred, uint8_t *vpred, - int ustride, int vstride, - BUFFER_SET *ctx, BLOCK_SIZE bsize) { - av1_build_interintra_predictors_sbp(cm, xd, upred, ustride, ctx, 1, bsize); - 
av1_build_interintra_predictors_sbp(cm, xd, vpred, vstride, ctx, 2, bsize); -} diff --git a/media/libaom/src/av1/common/reconinter.h b/media/libaom/src/av1/common/reconinter.h index db86c777e..fe3c6a621 100644 --- a/media/libaom/src/av1/common/reconinter.h +++ b/media/libaom/src/av1/common/reconinter.h @@ -12,9 +12,9 @@ #ifndef AOM_AV1_COMMON_RECONINTER_H_ #define AOM_AV1_COMMON_RECONINTER_H_ -#include "av1/common/filter.h" -#include "av1/common/onyxc_int.h" +#include "av1/common/av1_common_int.h" #include "av1/common/convolve.h" +#include "av1/common/filter.h" #include "av1/common/warped_motion.h" #include "aom/aom_integer.h" @@ -35,8 +35,7 @@ extern "C" { #endif -// Set to (1 << 5) if the 32-ary codebooks are used for any bock size -#define MAX_WEDGE_TYPES (1 << 4) +#define MAX_WEDGE_TYPES 16 #define MAX_WEDGE_SIZE_LOG2 5 // 32x32 #define MAX_WEDGE_SIZE (1 << MAX_WEDGE_SIZE_LOG2) @@ -47,7 +46,7 @@ extern "C" { #define WEDGE_NONE -1 // Angles are with respect to horizontal anti-clockwise -typedef enum { +enum { WEDGE_HORIZONTAL = 0, WEDGE_VERTICAL = 1, WEDGE_OBLIQUE27 = 2, @@ -55,7 +54,7 @@ typedef enum { WEDGE_OBLIQUE117 = 4, WEDGE_OBLIQUE153 = 5, WEDGE_DIRECTIONS -} WedgeDirectionType; +} UENUM1BYTE(WedgeDirectionType); // 3-tuple: {direction, x_offset, y_offset} typedef struct { @@ -67,13 +66,13 @@ typedef struct { typedef uint8_t *wedge_masks_type[MAX_WEDGE_TYPES]; typedef struct { - int bits; + int wedge_types; const wedge_code_type *codebook; uint8_t *signflip; wedge_masks_type *masks; } wedge_params_type; -extern const wedge_params_type wedge_params_lookup[BLOCK_SIZES_ALL]; +extern const wedge_params_type av1_wedge_params_lookup[BLOCK_SIZES_ALL]; typedef struct SubpelParams { int xs; @@ -84,8 +83,6 @@ typedef struct SubpelParams { struct build_prediction_ctxt { const AV1_COMMON *cm; - int mi_row; - int mi_col; uint8_t **tmp_buf; int *tmp_width; int *tmp_height; @@ -93,6 +90,55 @@ struct build_prediction_ctxt { int mb_to_far_edge; }; +typedef enum 
InterPredMode { + TRANSLATION_PRED, + WARP_PRED, +} InterPredMode; + +typedef enum InterCompMode { + UNIFORM_SINGLE, + UNIFORM_COMP, + MASK_COMP, +} InterCompMode; + +typedef struct InterPredParams { + InterPredMode mode; + InterCompMode comp_mode; + WarpedMotionParams warp_params; + ConvolveParams conv_params; + const InterpFilterParams *interp_filter_params[2]; + int block_width; + int block_height; + int pix_row; + int pix_col; + struct buf_2d ref_frame_buf; + int subsampling_x; + int subsampling_y; + const struct scale_factors *scale_factors; + int bit_depth; + int use_hbd_buf; + INTERINTER_COMPOUND_DATA mask_comp; + BLOCK_SIZE sb_type; + int is_intrabc; +} InterPredParams; + +void av1_init_inter_params(InterPredParams *inter_pred_params, int block_width, + int block_height, int pix_row, int pix_col, + int subsampling_x, int subsampling_y, int bit_depth, + int use_hbd_buf, int is_intrabc, + const struct scale_factors *sf, + const struct buf_2d *ref_buf, + int_interpfilters interp_filters); + +void av1_init_comp_mode(InterPredParams *inter_pred_params); + +void av1_init_warp_params(InterPredParams *inter_pred_params, + const WarpTypesAllowed *warp_types, int ref, + const MACROBLOCKD *xd, const MB_MODE_INFO *mi); + +void av1_init_mask_comp(InterPredParams *inter_pred_params, BLOCK_SIZE bsize, + const INTERINTER_COMPOUND_DATA *mask_comp); + static INLINE int has_scale(int xs, int ys) { return xs != SCALE_SUBPEL_SHIFTS || ys != SCALE_SUBPEL_SHIFTS; } @@ -108,53 +154,47 @@ static INLINE void revert_scale_extra_bits(SubpelParams *sp) { assert(sp->ys <= SUBPEL_SHIFTS); } -static INLINE void inter_predictor(const uint8_t *src, int src_stride, - uint8_t *dst, int dst_stride, - const SubpelParams *subpel_params, - const struct scale_factors *sf, int w, int h, - ConvolveParams *conv_params, - InterpFilters interp_filters, - int is_intrabc) { +static INLINE void inter_predictor( + const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, + const SubpelParams 
*subpel_params, const struct scale_factors *sf, int w, + int h, ConvolveParams *conv_params, + const InterpFilterParams *interp_filters[2]) { assert(conv_params->do_average == 0 || conv_params->do_average == 1); assert(sf); const int is_scaled = has_scale(subpel_params->xs, subpel_params->ys); - assert(IMPLIES(is_intrabc, !is_scaled)); if (is_scaled) { av1_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h, interp_filters, subpel_params->subpel_x, subpel_params->xs, subpel_params->subpel_y, - subpel_params->ys, 1, conv_params, sf, is_intrabc); + subpel_params->ys, 1, conv_params, sf); } else { SubpelParams sp = *subpel_params; revert_scale_extra_bits(&sp); av1_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h, interp_filters, sp.subpel_x, sp.xs, sp.subpel_y, - sp.ys, 0, conv_params, sf, is_intrabc); + sp.ys, 0, conv_params, sf); } } -static INLINE void highbd_inter_predictor(const uint8_t *src, int src_stride, - uint8_t *dst, int dst_stride, - const SubpelParams *subpel_params, - const struct scale_factors *sf, int w, - int h, ConvolveParams *conv_params, - InterpFilters interp_filters, - int is_intrabc, int bd) { +static INLINE void highbd_inter_predictor( + const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, + const SubpelParams *subpel_params, const struct scale_factors *sf, int w, + int h, ConvolveParams *conv_params, + const InterpFilterParams *interp_filters[2], int bd) { assert(conv_params->do_average == 0 || conv_params->do_average == 1); assert(sf); const int is_scaled = has_scale(subpel_params->xs, subpel_params->ys); - assert(IMPLIES(is_intrabc, !is_scaled)); if (is_scaled) { - av1_highbd_convolve_2d_facade( - src, src_stride, dst, dst_stride, w, h, interp_filters, - subpel_params->subpel_x, subpel_params->xs, subpel_params->subpel_y, - subpel_params->ys, 1, conv_params, sf, is_intrabc, bd); + av1_highbd_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h, + interp_filters, subpel_params->subpel_x, + subpel_params->xs, 
subpel_params->subpel_y, + subpel_params->ys, 1, conv_params, sf, bd); } else { SubpelParams sp = *subpel_params; revert_scale_extra_bits(&sp); - av1_highbd_convolve_2d_facade( - src, src_stride, dst, dst_stride, w, h, interp_filters, sp.subpel_x, - sp.xs, sp.subpel_y, sp.ys, 0, conv_params, sf, is_intrabc, bd); + av1_highbd_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h, + interp_filters, sp.subpel_x, sp.xs, + sp.subpel_y, sp.ys, 0, conv_params, sf, bd); } } @@ -167,9 +207,10 @@ static INLINE int is_interinter_compound_used(COMPOUND_TYPE type, const int comp_allowed = is_comp_ref_allowed(sb_type); switch (type) { case COMPOUND_AVERAGE: + case COMPOUND_DISTWTD: case COMPOUND_DIFFWTD: return comp_allowed; case COMPOUND_WEDGE: - return comp_allowed && wedge_params_lookup[sb_type].bits > 0; + return comp_allowed && av1_wedge_params_lookup[sb_type].wedge_types > 0; default: assert(0); return 0; } } @@ -187,39 +228,41 @@ static INLINE int is_any_masked_compound_used(BLOCK_SIZE sb_type) { return 0; } -static INLINE int get_wedge_bits_lookup(BLOCK_SIZE sb_type) { - return wedge_params_lookup[sb_type].bits; -} - -static INLINE int get_interinter_wedge_bits(BLOCK_SIZE sb_type) { - const int wbits = wedge_params_lookup[sb_type].bits; - return (wbits > 0) ? 
wbits + 1 : 0; -} - -static INLINE int is_interintra_wedge_used(BLOCK_SIZE sb_type) { - return wedge_params_lookup[sb_type].bits > 0; +static INLINE int get_wedge_types_lookup(BLOCK_SIZE sb_type) { + return av1_wedge_params_lookup[sb_type].wedge_types; } -static INLINE int get_interintra_wedge_bits(BLOCK_SIZE sb_type) { - return wedge_params_lookup[sb_type].bits; +static INLINE int av1_is_wedge_used(BLOCK_SIZE sb_type) { + return av1_wedge_params_lookup[sb_type].wedge_types > 0; } void av1_make_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst, - int dst_stride, const SubpelParams *subpel_params, - const struct scale_factors *sf, int w, int h, - ConvolveParams *conv_params, - InterpFilters interp_filters, - const WarpTypesAllowed *warp_types, int p_col, - int p_row, int plane, int ref, - const MB_MODE_INFO *mi, int build_for_obmc, - const MACROBLOCKD *xd, int can_use_previous); - -void av1_make_masked_inter_predictor( - const uint8_t *pre, int pre_stride, uint8_t *dst, int dst_stride, - const SubpelParams *subpel_params, const struct scale_factors *sf, int w, - int h, ConvolveParams *conv_params, InterpFilters interp_filters, int plane, - const WarpTypesAllowed *warp_types, int p_col, int p_row, int ref, - MACROBLOCKD *xd, int can_use_previous); + int dst_stride, + InterPredParams *inter_pred_params, + const SubpelParams *subpel_params); + +void av1_make_masked_inter_predictor(const uint8_t *pre, int pre_stride, + uint8_t *dst, int dst_stride, + InterPredParams *inter_pred_params, + const SubpelParams *subpel_params); + +typedef void (*CalcSubpelParamsFunc)(const MV *const src_mv, + InterPredParams *const inter_pred_params, + MACROBLOCKD *xd, int mi_x, int mi_y, + int ref, uint8_t **pre, + SubpelParams *subpel_params, + int *src_stride); + +void av1_build_one_inter_predictor( + uint8_t *dst, int dst_stride, const MV *const src_mv, + InterPredParams *inter_pred_params, MACROBLOCKD *xd, int mi_x, int mi_y, + int ref, CalcSubpelParamsFunc 
calc_subpel_params_func); + +void av1_build_inter_predictors(const AV1_COMMON *cm, MACROBLOCKD *xd, + int plane, const MB_MODE_INFO *mi, + int build_for_obmc, int bw, int bh, int mi_x, + int mi_y, + CalcSubpelParamsFunc calc_subpel_params_func); // TODO(jkoleszar): yet another mv clamping function :-( static INLINE MV clamp_mv_to_umv_border_sb(const MACROBLOCKD *xd, @@ -236,22 +279,26 @@ static INLINE MV clamp_mv_to_umv_border_sb(const MACROBLOCKD *xd, (int16_t)(src_mv->col * (1 << (1 - ss_x))) }; assert(ss_x <= 1); assert(ss_y <= 1); + const SubpelMvLimits mv_limits = { + xd->mb_to_left_edge * (1 << (1 - ss_x)) - spel_left, + xd->mb_to_right_edge * (1 << (1 - ss_x)) + spel_right, + xd->mb_to_top_edge * (1 << (1 - ss_y)) - spel_top, + xd->mb_to_bottom_edge * (1 << (1 - ss_y)) + spel_bottom + }; - clamp_mv(&clamped_mv, xd->mb_to_left_edge * (1 << (1 - ss_x)) - spel_left, - xd->mb_to_right_edge * (1 << (1 - ss_x)) + spel_right, - xd->mb_to_top_edge * (1 << (1 - ss_y)) - spel_top, - xd->mb_to_bottom_edge * (1 << (1 - ss_y)) + spel_bottom); + clamp_mv(&clamped_mv, &mv_limits); return clamped_mv; } -static INLINE int scaled_buffer_offset(int x_offset, int y_offset, int stride, - const struct scale_factors *sf) { +static INLINE int64_t scaled_buffer_offset(int x_offset, int y_offset, + int stride, + const struct scale_factors *sf) { const int x = sf ? sf->scale_value_x(x_offset, sf) >> SCALE_EXTRA_BITS : x_offset; const int y = sf ? 
sf->scale_value_y(y_offset, sf) >> SCALE_EXTRA_BITS : y_offset; - return y * stride + x; + return (int64_t)y * stride + x; } static INLINE void setup_pred_plane(struct buf_2d *dst, BLOCK_SIZE bsize, @@ -296,6 +343,11 @@ static INLINE int av1_is_interp_needed(const MACROBLOCKD *const xd) { return 1; } +void av1_setup_address_for_obmc(MACROBLOCKD *xd, int mi_row_offset, + int mi_col_offset, MB_MODE_INFO *ref_mbmi, + struct build_prediction_ctxt *ctxt, + const int num_planes); + void av1_setup_build_prediction_by_above_pred( MACROBLOCKD *xd, int rel_mi_col, uint8_t above_mi_width, MB_MODE_INFO *above_mbmi, struct build_prediction_ctxt *ctxt, @@ -306,56 +358,53 @@ void av1_setup_build_prediction_by_left_pred(MACROBLOCKD *xd, int rel_mi_row, struct build_prediction_ctxt *ctxt, const int num_planes); void av1_build_obmc_inter_prediction(const AV1_COMMON *cm, MACROBLOCKD *xd, - int mi_row, int mi_col, uint8_t *above[MAX_MB_PLANE], int above_stride[MAX_MB_PLANE], uint8_t *left[MAX_MB_PLANE], int left_stride[MAX_MB_PLANE]); const uint8_t *av1_get_obmc_mask(int length); -void av1_count_overlappable_neighbors(const AV1_COMMON *cm, MACROBLOCKD *xd, - int mi_row, int mi_col); +void av1_count_overlappable_neighbors(const AV1_COMMON *cm, MACROBLOCKD *xd); #define MASK_MASTER_SIZE ((MAX_WEDGE_SIZE) << 1) #define MASK_MASTER_STRIDE (MASK_MASTER_SIZE) void av1_init_wedge_masks(); -static INLINE const uint8_t *av1_get_contiguous_soft_mask(int wedge_index, - int wedge_sign, +static INLINE const uint8_t *av1_get_contiguous_soft_mask(int8_t wedge_index, + int8_t wedge_sign, BLOCK_SIZE sb_type) { - return wedge_params_lookup[sb_type].masks[wedge_sign][wedge_index]; + return av1_wedge_params_lookup[sb_type].masks[wedge_sign][wedge_index]; } const uint8_t *av1_get_compound_type_mask( const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE sb_type); // build interintra_predictors for one plane -void av1_build_interintra_predictors_sbp(const AV1_COMMON *cm, MACROBLOCKD *xd, - uint8_t 
*pred, int stride, - BUFFER_SET *ctx, int plane, - BLOCK_SIZE bsize); - -void av1_build_interintra_predictors_sbuv(const AV1_COMMON *cm, MACROBLOCKD *xd, - uint8_t *upred, uint8_t *vpred, - int ustride, int vstride, - BUFFER_SET *ctx, BLOCK_SIZE bsize); +void av1_build_interintra_predictor(const AV1_COMMON *cm, MACROBLOCKD *xd, + uint8_t *pred, int stride, + const BUFFER_SET *ctx, int plane, + BLOCK_SIZE bsize); -void av1_build_intra_predictors_for_interintra( - const AV1_COMMON *cm, MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane, - BUFFER_SET *ctx, uint8_t *intra_pred, int intra_stride); +void av1_build_intra_predictors_for_interintra(const AV1_COMMON *cm, + MACROBLOCKD *xd, + BLOCK_SIZE bsize, int plane, + const BUFFER_SET *ctx, + uint8_t *dst, int dst_stride); void av1_combine_interintra(MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane, const uint8_t *inter_pred, int inter_stride, const uint8_t *intra_pred, int intra_stride); -void av1_jnt_comp_weight_assign(const AV1_COMMON *cm, const MB_MODE_INFO *mbmi, - int order_idx, int *fwd_offset, int *bck_offset, - int *use_jnt_comp_avg, int is_compound); +void av1_dist_wtd_comp_weight_assign(const AV1_COMMON *cm, + const MB_MODE_INFO *mbmi, int order_idx, + int *fwd_offset, int *bck_offset, + int *use_dist_wtd_comp_avg, + int is_compound); int av1_allow_warp(const MB_MODE_INFO *const mbmi, const WarpTypesAllowed *const warp_types, const WarpedMotionParams *const gm_params, - int build_for_obmc, int x_scale, int y_scale, + int build_for_obmc, const struct scale_factors *const sf, WarpedMotionParams *final_warp_params); #ifdef __cplusplus diff --git a/media/libaom/src/av1/common/reconintra.c b/media/libaom/src/av1/common/reconintra.c index 71a52e73e..1307a0313 100644 --- a/media/libaom/src/av1/common/reconintra.c +++ b/media/libaom/src/av1/common/reconintra.c @@ -20,9 +20,9 @@ #include "aom_ports/aom_once.h" #include "aom_ports/mem.h" #include "aom_ports/system_state.h" -#include "av1/common/reconintra.h" -#include 
"av1/common/onyxc_int.h" +#include "av1/common/av1_common_int.h" #include "av1/common/cfl.h" +#include "av1/common/reconintra.h" enum { NEED_LEFT = 1 << 1, @@ -198,7 +198,7 @@ static int has_top_right(const AV1_COMMON *cm, BLOCK_SIZE bsize, int mi_row, int col_off, int ss_x, int ss_y) { if (!top_available || !right_available) return 0; - const int bw_unit = block_size_wide[bsize] >> tx_size_wide_log2[0]; + const int bw_unit = mi_size_wide[bsize]; const int plane_bw_unit = AOMMAX(bw_unit >> ss_x, 1); const int top_right_count_unit = tx_size_wide_unit[txsz]; @@ -405,7 +405,7 @@ static int has_bottom_left(const AV1_COMMON *cm, BLOCK_SIZE bsize, int mi_row, // Bottom-left pixels are in the bottom-left block, which is not available. return 0; } else { - const int bh_unit = block_size_high[bsize] >> tx_size_high_log2[0]; + const int bh_unit = mi_size_high[bsize]; const int plane_bh_unit = AOMMAX(bh_unit >> ss_y, 1); const int bottom_left_count_unit = tx_size_high_unit[txsz]; @@ -422,10 +422,9 @@ static int has_bottom_left(const AV1_COMMON *cm, BLOCK_SIZE bsize, int mi_row, // and/or bottom-left superblocks. But only the left superblock is // available, so check if all required pixels fall in that superblock. 
if (blk_col_in_sb == 0) { - const int blk_start_row_off = blk_row_in_sb - << (bh_in_mi_log2 + MI_SIZE_LOG2 - - tx_size_wide_log2[0]) >> - ss_y; + const int blk_start_row_off = + blk_row_in_sb << (bh_in_mi_log2 + MI_SIZE_LOG2 - MI_SIZE_LOG2) >> + ss_y; const int row_off_in_sb = blk_start_row_off + row_off; const int sb_height_unit = sb_mi_size >> ss_y; return row_off_in_sb + bottom_left_count_unit < sb_height_unit; @@ -453,11 +452,13 @@ typedef void (*intra_pred_fn)(uint8_t *dst, ptrdiff_t stride, static intra_pred_fn pred[INTRA_MODES][TX_SIZES_ALL]; static intra_pred_fn dc_pred[2][2][TX_SIZES_ALL]; +#if CONFIG_AV1_HIGHBITDEPTH typedef void (*intra_high_pred_fn)(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); static intra_high_pred_fn pred_high[INTRA_MODES][TX_SIZES_ALL]; static intra_high_pred_fn dc_pred_high[2][2][TX_SIZES_ALL]; +#endif static void init_intra_predictors_internal(void) { assert(NELEMENTS(mode_to_angle_map) == INTRA_MODES); @@ -499,7 +500,7 @@ static void init_intra_predictors_internal(void) { INIT_ALL_SIZES(dc_pred[0][1], dc_top); INIT_ALL_SIZES(dc_pred[1][0], dc_left); INIT_ALL_SIZES(dc_pred[1][1], dc); - +#if CONFIG_AV1_HIGHBITDEPTH INIT_ALL_SIZES(pred_high[V_PRED], highbd_v); INIT_ALL_SIZES(pred_high[H_PRED], highbd_h); INIT_ALL_SIZES(pred_high[PAETH_PRED], highbd_paeth); @@ -510,6 +511,7 @@ static void init_intra_predictors_internal(void) { INIT_ALL_SIZES(dc_pred_high[0][1], highbd_dc_top); INIT_ALL_SIZES(dc_pred_high[1][0], highbd_dc_left); INIT_ALL_SIZES(dc_pred_high[1][1], highbd_dc); +#endif #undef intra_pred_allsizes } @@ -556,33 +558,37 @@ void av1_dr_prediction_z2_c(uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_above, int upsample_left, int dx, int dy) { - int r, c, x, y, shift1, shift2, val, base1, base2; - assert(dx > 0); assert(dy > 0); const int min_base_x = -(1 << upsample_above); + const int min_base_y = -(1 << upsample_left); + 
(void)min_base_y; const int frac_bits_x = 6 - upsample_above; const int frac_bits_y = 6 - upsample_left; - const int base_inc_x = 1 << upsample_above; - x = -dx; - for (r = 0; r < bh; ++r, x -= dx, dst += stride) { - base1 = x >> frac_bits_x; - y = (r << 6) - dy; - for (c = 0; c < bw; ++c, base1 += base_inc_x, y -= dy) { - if (base1 >= min_base_x) { - shift1 = ((x * (1 << upsample_above)) & 0x3F) >> 1; - val = above[base1] * (32 - shift1) + above[base1 + 1] * shift1; + + for (int r = 0; r < bh; ++r) { + for (int c = 0; c < bw; ++c) { + int val; + int y = r + 1; + int x = (c << 6) - y * dx; + const int base_x = x >> frac_bits_x; + if (base_x >= min_base_x) { + const int shift = ((x * (1 << upsample_above)) & 0x3F) >> 1; + val = above[base_x] * (32 - shift) + above[base_x + 1] * shift; val = ROUND_POWER_OF_TWO(val, 5); } else { - base2 = y >> frac_bits_y; - assert(base2 >= -(1 << upsample_left)); - shift2 = ((y * (1 << upsample_left)) & 0x3F) >> 1; - val = left[base2] * (32 - shift2) + left[base2 + 1] * shift2; + x = c + 1; + y = (r << 6) - x * dy; + const int base_y = y >> frac_bits_y; + assert(base_y >= min_base_y); + const int shift = ((y * (1 << upsample_left)) & 0x3F) >> 1; + val = left[base_y] * (32 - shift) + left[base_y + 1] * shift; val = ROUND_POWER_OF_TWO(val, 5); } dst[c] = val; } + dst += stride; } } @@ -643,6 +649,7 @@ static void dr_predictor(uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size, } } +#if CONFIG_AV1_HIGHBITDEPTH // Directional prediction, zone 1: 0 < angle < 90 void av1_highbd_dr_prediction_z1_c(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, @@ -688,30 +695,33 @@ void av1_highbd_dr_prediction_z2_c(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_above, int upsample_left, int dx, int dy, int bd) { - int r, c, x, y, shift, val, base; - (void)bd; assert(dx > 0); assert(dy > 0); const int min_base_x = -(1 << upsample_above); + const int min_base_y = -(1 << 
upsample_left); + (void)min_base_y; const int frac_bits_x = 6 - upsample_above; const int frac_bits_y = 6 - upsample_left; - for (r = 0; r < bh; ++r) { - for (c = 0; c < bw; ++c) { - y = r + 1; - x = (c << 6) - y * dx; - base = x >> frac_bits_x; - if (base >= min_base_x) { - shift = ((x * (1 << upsample_above)) & 0x3F) >> 1; - val = above[base] * (32 - shift) + above[base + 1] * shift; + + for (int r = 0; r < bh; ++r) { + for (int c = 0; c < bw; ++c) { + int val; + int y = r + 1; + int x = (c << 6) - y * dx; + const int base_x = x >> frac_bits_x; + if (base_x >= min_base_x) { + const int shift = ((x * (1 << upsample_above)) & 0x3F) >> 1; + val = above[base_x] * (32 - shift) + above[base_x + 1] * shift; val = ROUND_POWER_OF_TWO(val, 5); } else { x = c + 1; y = (r << 6) - x * dy; - base = y >> frac_bits_y; - shift = ((y * (1 << upsample_left)) & 0x3F) >> 1; - val = left[base] * (32 - shift) + left[base + 1] * shift; + const int base_y = y >> frac_bits_y; + assert(base_y >= min_base_y); + const int shift = ((y * (1 << upsample_left)) & 0x3F) >> 1; + val = left[base_y] * (32 - shift) + left[base_y + 1] * shift; val = ROUND_POWER_OF_TWO(val, 5); } dst[c] = val; @@ -778,6 +788,7 @@ static void highbd_dr_predictor(uint16_t *dst, ptrdiff_t stride, pred_high[H_PRED][tx_size](dst, stride, above, left, bd); } } +#endif // CONFIG_AV1_HIGHBITDEPTH DECLARE_ALIGNED(16, const int8_t, av1_filter_intra_taps[FILTER_INTRA_MODES][8][8]) = { @@ -843,10 +854,6 @@ void av1_filter_intra_predictor_c(uint8_t *dst, ptrdiff_t stride, assert(bw <= 32 && bh <= 32); - // The initialization is just for silencing Jenkins static analysis warnings - for (r = 0; r < bh + 1; ++r) - memset(buffer[r], 0, (bw + 1) * sizeof(buffer[0][0])); - for (r = 0; r < bh; ++r) buffer[r + 1][0] = left[r]; memcpy(buffer[0], &above[-1], (bw + 1) * sizeof(uint8_t)); @@ -881,6 +888,7 @@ void av1_filter_intra_predictor_c(uint8_t *dst, ptrdiff_t stride, } } +#if CONFIG_AV1_HIGHBITDEPTH static void 
highbd_filter_intra_predictor(uint16_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint16_t *above, @@ -893,10 +901,6 @@ static void highbd_filter_intra_predictor(uint16_t *dst, ptrdiff_t stride, assert(bw <= 32 && bh <= 32); - // The initialization is just for silencing Jenkins static analysis warnings - for (r = 0; r < bh + 1; ++r) - memset(buffer[r], 0, (bw + 1) * sizeof(buffer[0][0])); - for (r = 0; r < bh; ++r) buffer[r + 1][0] = left[r]; memcpy(buffer[0], &above[-1], (bw + 1) * sizeof(buffer[0][0])); @@ -931,6 +935,7 @@ static void highbd_filter_intra_predictor(uint16_t *dst, ptrdiff_t stride, dst += stride; } } +#endif // CONFIG_AV1_HIGHBITDEPTH static int is_smooth(const MB_MODE_INFO *mbmi, int plane) { if (plane == 0) { @@ -1008,9 +1013,9 @@ static int intra_edge_filter_strength(int bs0, int bs1, int delta, int type) { void av1_filter_intra_edge_c(uint8_t *p, int sz, int strength) { if (!strength) return; - const int kernel[INTRA_EDGE_FILT][INTRA_EDGE_TAPS] = { - { 0, 4, 8, 4, 0 }, { 0, 5, 6, 5, 0 }, { 2, 4, 4, 4, 2 } - }; + const int kernel[INTRA_EDGE_FILT][INTRA_EDGE_TAPS] = { { 0, 4, 8, 4, 0 }, + { 0, 5, 6, 5, 0 }, + { 2, 4, 4, 4, 2 } }; const int filt = strength - 1; uint8_t edge[129]; @@ -1041,9 +1046,9 @@ static void filter_intra_edge_corner(uint8_t *p_above, uint8_t *p_left) { void av1_filter_intra_edge_high_c(uint16_t *p, int sz, int strength) { if (!strength) return; - const int kernel[INTRA_EDGE_FILT][INTRA_EDGE_TAPS] = { - { 0, 4, 8, 4, 0 }, { 0, 5, 6, 5, 0 }, { 2, 4, 4, 4, 2 } - }; + const int kernel[INTRA_EDGE_FILT][INTRA_EDGE_TAPS] = { { 0, 4, 8, 4, 0 }, + { 0, 5, 6, 5, 0 }, + { 2, 4, 4, 4, 2 } }; const int filt = strength - 1; uint16_t edge[129]; @@ -1061,6 +1066,7 @@ void av1_filter_intra_edge_high_c(uint16_t *p, int sz, int strength) { } } +#if CONFIG_AV1_HIGHBITDEPTH static void filter_intra_edge_corner_high(uint16_t *p_above, uint16_t *p_left) { const int kernel[3] = { 5, 6, 5 }; @@ -1070,6 +1076,7 @@ static void 
filter_intra_edge_corner_high(uint16_t *p_above, uint16_t *p_left) { p_above[-1] = s; p_left[-1] = s; } +#endif void av1_upsample_intra_edge_c(uint8_t *p, int sz) { // interpolate half-sample positions @@ -1117,7 +1124,7 @@ void av1_upsample_intra_edge_high_c(uint16_t *p, int sz, int bd) { p[2 * i] = in[i + 2]; } } - +#if CONFIG_AV1_HIGHBITDEPTH static void build_intra_predictors_high( const MACROBLOCKD *xd, const uint8_t *ref8, int ref_stride, uint8_t *dst8, int dst_stride, PREDICTION_MODE mode, int angle_delta, @@ -1144,7 +1151,7 @@ static void build_intra_predictors_high( int base = 128 << (xd->bd - 8); // The default values if ref pixels are not available: - // base-1 base-1 base-1 .. base-1 base-1 base-1 base-1 base-1 base-1 + // base base-1 base-1 .. base-1 base-1 base-1 base-1 base-1 base-1 // base+1 A B .. Y Z // base+1 C D .. W X // base+1 E F .. U V @@ -1182,7 +1189,7 @@ static void build_intra_predictors_high( // NEED_LEFT if (need_left) { - int need_bottom = !!(extend_modes[mode] & NEED_BOTTOMLEFT); + int need_bottom = extend_modes[mode] & NEED_BOTTOMLEFT; if (use_filter_intra) need_bottom = 0; if (is_dr_mode) need_bottom = p_angle > 180; const int num_left_pixels_needed = txhpx + (need_bottom ? txwpx : 0); @@ -1207,7 +1214,7 @@ static void build_intra_predictors_high( // NEED_ABOVE if (need_above) { - int need_right = !!(extend_modes[mode] & NEED_ABOVERIGHT); + int need_right = extend_modes[mode] & NEED_ABOVERIGHT; if (use_filter_intra) need_right = 0; if (is_dr_mode) need_right = p_angle < 90; const int num_top_pixels_needed = txwpx + (need_right ? 
txhpx : 0); @@ -1302,6 +1309,7 @@ static void build_intra_predictors_high( pred_high[mode][tx_size](dst, dst_stride, above_row, left_col, xd->bd); } } +#endif // CONFIG_AV1_HIGHBITDEPTH static void build_intra_predictors(const MACROBLOCKD *xd, const uint8_t *ref, int ref_stride, uint8_t *dst, int dst_stride, @@ -1328,7 +1336,7 @@ static void build_intra_predictors(const MACROBLOCKD *xd, const uint8_t *ref, const int use_filter_intra = filter_intra_mode != FILTER_INTRA_MODES; // The default values if ref pixels are not available: - // 127 127 127 .. 127 127 127 127 127 127 + // 128 127 127 .. 127 127 127 127 127 127 // 129 A B .. Y Z // 129 C D .. W X // 129 E F .. U V @@ -1367,10 +1375,13 @@ static void build_intra_predictors(const MACROBLOCKD *xd, const uint8_t *ref, // NEED_LEFT if (need_left) { - int need_bottom = !!(extend_modes[mode] & NEED_BOTTOMLEFT); + int need_bottom = extend_modes[mode] & NEED_BOTTOMLEFT; if (use_filter_intra) need_bottom = 0; if (is_dr_mode) need_bottom = p_angle > 180; - const int num_left_pixels_needed = txhpx + (need_bottom ? txwpx : 0); + // the avx2 dr_prediction_z2 may read at most 3 extra bytes, + // due to the avx2 mask load is with dword granularity. + // so we initialize 3 extra bytes to silence valgrind complain. + const int num_left_pixels_needed = txhpx + (need_bottom ? txwpx : 3); i = 0; if (n_left_px > 0) { for (; i < n_left_px; i++) left_col[i] = left_ref[i * ref_stride]; @@ -1392,7 +1403,7 @@ static void build_intra_predictors(const MACROBLOCKD *xd, const uint8_t *ref, // NEED_ABOVE if (need_above) { - int need_right = !!(extend_modes[mode] & NEED_ABOVERIGHT); + int need_right = extend_modes[mode] & NEED_ABOVERIGHT; if (use_filter_intra) need_right = 0; if (is_dr_mode) need_right = p_angle < 90; const int num_top_pixels_needed = txwpx + (need_right ? 
txhpx : 0); @@ -1486,6 +1497,57 @@ static void build_intra_predictors(const MACROBLOCKD *xd, const uint8_t *ref, } } +static INLINE BLOCK_SIZE scale_chroma_bsize(BLOCK_SIZE bsize, int subsampling_x, + int subsampling_y) { + assert(subsampling_x >= 0 && subsampling_x < 2); + assert(subsampling_y >= 0 && subsampling_y < 2); + BLOCK_SIZE bs = bsize; + switch (bsize) { + case BLOCK_4X4: + if (subsampling_x == 1 && subsampling_y == 1) + bs = BLOCK_8X8; + else if (subsampling_x == 1) + bs = BLOCK_8X4; + else if (subsampling_y == 1) + bs = BLOCK_4X8; + break; + case BLOCK_4X8: + if (subsampling_x == 1 && subsampling_y == 1) + bs = BLOCK_8X8; + else if (subsampling_x == 1) + bs = BLOCK_8X8; + else if (subsampling_y == 1) + bs = BLOCK_4X8; + break; + case BLOCK_8X4: + if (subsampling_x == 1 && subsampling_y == 1) + bs = BLOCK_8X8; + else if (subsampling_x == 1) + bs = BLOCK_8X4; + else if (subsampling_y == 1) + bs = BLOCK_8X8; + break; + case BLOCK_4X16: + if (subsampling_x == 1 && subsampling_y == 1) + bs = BLOCK_8X16; + else if (subsampling_x == 1) + bs = BLOCK_8X16; + else if (subsampling_y == 1) + bs = BLOCK_4X16; + break; + case BLOCK_16X4: + if (subsampling_x == 1 && subsampling_y == 1) + bs = BLOCK_16X8; + else if (subsampling_x == 1) + bs = BLOCK_16X4; + else if (subsampling_y == 1) + bs = BLOCK_16X8; + break; + default: break; + } + return bs; +} + void av1_predict_intra_block( const AV1_COMMON *cm, const MACROBLOCKD *xd, int wpx, int hpx, TX_SIZE tx_size, PREDICTION_MODE mode, int angle_delta, int use_palette, @@ -1494,8 +1556,8 @@ void av1_predict_intra_block( const MB_MODE_INFO *const mbmi = xd->mi[0]; const int txwpx = tx_size_wide[tx_size]; const int txhpx = tx_size_high[tx_size]; - const int x = col_off << tx_size_wide_log2[0]; - const int y = row_off << tx_size_high_log2[0]; + const int x = col_off << MI_SIZE_LOG2; + const int y = row_off << MI_SIZE_LOG2; if (use_palette) { int r, c; @@ -1503,7 +1565,7 @@ void av1_predict_intra_block( 
xd->color_index_map_offset[plane != 0]; const uint16_t *const palette = mbmi->palette_mode_info.palette_colors + plane * PALETTE_MAX_SIZE; - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + if (is_cur_buf_hbd(xd)) { uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst); for (r = 0; r < txhpx; ++r) { for (c = 0; c < txwpx; ++c) { @@ -1521,15 +1583,15 @@ void av1_predict_intra_block( return; } - BLOCK_SIZE bsize = mbmi->sb_type; const struct macroblockd_plane *const pd = &xd->plane[plane]; const int txw = tx_size_wide_unit[tx_size]; const int txh = tx_size_high_unit[tx_size]; - const int have_top = row_off || (pd->subsampling_y ? xd->chroma_up_available - : xd->up_available); + const int ss_x = pd->subsampling_x; + const int ss_y = pd->subsampling_y; + const int have_top = + row_off || (ss_y ? xd->chroma_up_available : xd->up_available); const int have_left = - col_off || - (pd->subsampling_x ? xd->chroma_left_available : xd->left_available); + col_off || (ss_x ? xd->chroma_left_available : xd->left_available); const int mi_row = -xd->mb_to_top_edge >> (3 + MI_SIZE_LOG2); const int mi_col = -xd->mb_to_left_edge >> (3 + MI_SIZE_LOG2); const int xr_chr_offset = 0; @@ -1537,32 +1599,35 @@ void av1_predict_intra_block( // Distance between the right edge of this prediction block to // the frame right edge - const int xr = (xd->mb_to_right_edge >> (3 + pd->subsampling_x)) + - (wpx - x - txwpx) - xr_chr_offset; + const int xr = + (xd->mb_to_right_edge >> (3 + ss_x)) + (wpx - x - txwpx) - xr_chr_offset; // Distance between the bottom edge of this prediction block to // the frame bottom edge - const int yd = (xd->mb_to_bottom_edge >> (3 + pd->subsampling_y)) + - (hpx - y - txhpx) - yd_chr_offset; + const int yd = + (xd->mb_to_bottom_edge >> (3 + ss_y)) + (hpx - y - txhpx) - yd_chr_offset; const int right_available = - mi_col + ((col_off + txw) << pd->subsampling_x) < xd->tile.mi_col_end; + mi_col + ((col_off + txw) << ss_x) < xd->tile.mi_col_end; const int bottom_available = - (yd > 
0) && - (mi_row + ((row_off + txh) << pd->subsampling_y) < xd->tile.mi_row_end); + (yd > 0) && (mi_row + ((row_off + txh) << ss_y) < xd->tile.mi_row_end); const PARTITION_TYPE partition = mbmi->partition; + BLOCK_SIZE bsize = mbmi->sb_type; // force 4x4 chroma component block size. - bsize = scale_chroma_bsize(bsize, pd->subsampling_x, pd->subsampling_y); + if (ss_x || ss_y) { + bsize = scale_chroma_bsize(bsize, ss_x, ss_y); + } - const int have_top_right = has_top_right( - cm, bsize, mi_row, mi_col, have_top, right_available, partition, tx_size, - row_off, col_off, pd->subsampling_x, pd->subsampling_y); - const int have_bottom_left = has_bottom_left( - cm, bsize, mi_row, mi_col, bottom_available, have_left, partition, - tx_size, row_off, col_off, pd->subsampling_x, pd->subsampling_y); + const int have_top_right = + has_top_right(cm, bsize, mi_row, mi_col, have_top, right_available, + partition, tx_size, row_off, col_off, ss_x, ss_y); + const int have_bottom_left = + has_bottom_left(cm, bsize, mi_row, mi_col, bottom_available, have_left, + partition, tx_size, row_off, col_off, ss_x, ss_y); const int disable_edge_filter = !cm->seq_params.enable_intra_edge_filter; - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { +#if CONFIG_AV1_HIGHBITDEPTH + if (is_cur_buf_hbd(xd)) { build_intra_predictors_high( xd, ref, ref_stride, dst, dst_stride, mode, angle_delta, filter_intra_mode, tx_size, disable_edge_filter, @@ -1572,7 +1637,7 @@ void av1_predict_intra_block( have_bottom_left ? 
AOMMIN(txhpx, yd) : 0, plane); return; } - +#endif build_intra_predictors(xd, ref, ref_stride, dst, dst_stride, mode, angle_delta, filter_intra_mode, tx_size, disable_edge_filter, @@ -1588,8 +1653,7 @@ void av1_predict_intra_block_facade(const AV1_COMMON *cm, MACROBLOCKD *xd, const MB_MODE_INFO *const mbmi = xd->mi[0]; struct macroblockd_plane *const pd = &xd->plane[plane]; const int dst_stride = pd->dst.stride; - uint8_t *dst = - &pd->dst.buf[(blk_row * dst_stride + blk_col) << tx_size_wide_log2[0]]; + uint8_t *dst = &pd->dst.buf[(blk_row * dst_stride + blk_col) << MI_SIZE_LOG2]; const PREDICTION_MODE mode = (plane == AOM_PLANE_Y) ? mbmi->mode : get_uv_mode(mbmi->uv_mode); const int use_palette = mbmi->palette_mode_info.palette_size[plane != 0] > 0; diff --git a/media/libaom/src/av1/common/reconintra.h b/media/libaom/src/av1/common/reconintra.h index 07853aba0..9d203569c 100644 --- a/media/libaom/src/av1/common/reconintra.h +++ b/media/libaom/src/av1/common/reconintra.h @@ -15,8 +15,8 @@ #include <stdlib.h> #include "aom/aom_integer.h" +#include "av1/common/av1_common_int.h" #include "av1/common/blockd.h" -#include "av1/common/onyxc_int.h" #ifdef __cplusplus extern "C" { @@ -26,13 +26,11 @@ void av1_init_intra_predictors(void); void av1_predict_intra_block_facade(const AV1_COMMON *cm, MACROBLOCKD *xd, int plane, int blk_col, int blk_row, TX_SIZE tx_size); -void av1_predict_intra_block(const AV1_COMMON *cm, const MACROBLOCKD *xd, - int bw, int bh, TX_SIZE tx_size, - PREDICTION_MODE mode, int angle_delta, - int use_palette, - FILTER_INTRA_MODE filter_intra_mode, - const uint8_t *ref, int ref_stride, uint8_t *dst, - int dst_stride, int aoff, int loff, int plane); +void av1_predict_intra_block( + const AV1_COMMON *cm, const MACROBLOCKD *xd, int wpx, int hpx, + TX_SIZE tx_size, PREDICTION_MODE mode, int angle_delta, int use_palette, + FILTER_INTRA_MODE filter_intra_mode, const uint8_t *ref, int ref_stride, + uint8_t *dst, int dst_stride, int col_off, int row_off, int 
plane); // Mapping of interintra to intra mode for use in the intra component static const PREDICTION_MODE interintra_to_intra_mode[INTERINTRA_MODES] = { @@ -56,8 +54,8 @@ static INLINE int av1_use_angle_delta(BLOCK_SIZE bsize) { } static INLINE int av1_allow_intrabc(const AV1_COMMON *const cm) { - return frame_is_intra_only(cm) && cm->allow_screen_content_tools && - cm->allow_intrabc; + return frame_is_intra_only(cm) && cm->features.allow_screen_content_tools && + cm->features.allow_intrabc; } static INLINE int av1_filter_intra_allowed_bsize(const AV1_COMMON *const cm, @@ -76,6 +74,40 @@ static INLINE int av1_filter_intra_allowed(const AV1_COMMON *const cm, extern const int8_t av1_filter_intra_taps[FILTER_INTRA_MODES][8][8]; +static const int16_t dr_intra_derivative[90] = { + // More evenly spread out angles and limited to 10-bit + // Values that are 0 will never be used + // Approx angle + 0, 0, 0, // + 1023, 0, 0, // 3, ... + 547, 0, 0, // 6, ... + 372, 0, 0, 0, 0, // 9, ... + 273, 0, 0, // 14, ... + 215, 0, 0, // 17, ... + 178, 0, 0, // 20, ... + 151, 0, 0, // 23, ... (113 & 203 are base angles) + 132, 0, 0, // 26, ... + 116, 0, 0, // 29, ... + 102, 0, 0, 0, // 32, ... + 90, 0, 0, // 36, ... + 80, 0, 0, // 39, ... + 71, 0, 0, // 42, ... + 64, 0, 0, // 45, ... (45 & 135 are base angles) + 57, 0, 0, // 48, ... + 51, 0, 0, // 51, ... + 45, 0, 0, 0, // 54, ... + 40, 0, 0, // 58, ... + 35, 0, 0, // 61, ... + 31, 0, 0, // 64, ... + 27, 0, 0, // 67, ... (67 & 157 are base angles) + 23, 0, 0, // 70, ... + 19, 0, 0, // 73, ... + 15, 0, 0, 0, 0, // 76, ... + 11, 0, 0, // 81, ... + 7, 0, 0, // 84, ... + 3, 0, 0, // 87, ... +}; + // Get the shift (up-scaled by 256) in X w.r.t a unit change in Y. 
// If angle > 0 && angle < 90, dx = -((int)(256 / t)); // If angle > 90 && angle < 180, dx = (int)(256 / t); @@ -110,7 +142,7 @@ static INLINE int av1_use_intra_edge_upsample(int bs0, int bs1, int delta, int type) { const int d = abs(delta); const int blk_wh = bs0 + bs1; - if (d <= 0 || d >= 40) return 0; + if (d == 0 || d >= 40) return 0; return type ? (blk_wh <= 8) : (blk_wh <= 16); } #ifdef __cplusplus diff --git a/media/libaom/src/av1/common/resize.c b/media/libaom/src/av1/common/resize.c index d61a20aa2..98f28f7b5 100644 --- a/media/libaom/src/av1/common/resize.c +++ b/media/libaom/src/av1/common/resize.c @@ -313,6 +313,91 @@ static void interpolate_core(const uint8_t *const input, int in_length, } } +static void interpolate_core_double_prec(const double *const input, + int in_length, double *output, + int out_length, + const int16_t *interp_filters, + int interp_taps) { + const int32_t delta = + (((uint32_t)in_length << RS_SCALE_SUBPEL_BITS) + out_length / 2) / + out_length; + const int32_t offset = + in_length > out_length + ? 
(((int32_t)(in_length - out_length) << (RS_SCALE_SUBPEL_BITS - 1)) + + out_length / 2) / + out_length + : -(((int32_t)(out_length - in_length) + << (RS_SCALE_SUBPEL_BITS - 1)) + + out_length / 2) / + out_length; + double *optr = output; + int x, x1, x2, k, int_pel, sub_pel; + double sum; + int32_t y; + + x = 0; + y = offset + RS_SCALE_EXTRA_OFF; + while ((y >> RS_SCALE_SUBPEL_BITS) < (interp_taps / 2 - 1)) { + x++; + y += delta; + } + x1 = x; + x = out_length - 1; + y = delta * x + offset + RS_SCALE_EXTRA_OFF; + while ((y >> RS_SCALE_SUBPEL_BITS) + (int32_t)(interp_taps / 2) >= + in_length) { + x--; + y -= delta; + } + x2 = x; + if (x1 > x2) { + for (x = 0, y = offset + RS_SCALE_EXTRA_OFF; x < out_length; + ++x, y += delta) { + int_pel = y >> RS_SCALE_SUBPEL_BITS; + sub_pel = (y >> RS_SCALE_EXTRA_BITS) & RS_SUBPEL_MASK; + const int16_t *filter = &interp_filters[sub_pel * interp_taps]; + sum = 0; + for (k = 0; k < interp_taps; ++k) { + const int pk = int_pel - interp_taps / 2 + 1 + k; + sum += filter[k] * input[AOMMAX(AOMMIN(pk, in_length - 1), 0)]; + } + *optr++ = sum / (1 << FILTER_BITS); + } + } else { + // Initial part. + for (x = 0, y = offset + RS_SCALE_EXTRA_OFF; x < x1; ++x, y += delta) { + int_pel = y >> RS_SCALE_SUBPEL_BITS; + sub_pel = (y >> RS_SCALE_EXTRA_BITS) & RS_SUBPEL_MASK; + const int16_t *filter = &interp_filters[sub_pel * interp_taps]; + sum = 0; + for (k = 0; k < interp_taps; ++k) + sum += filter[k] * input[AOMMAX(int_pel - interp_taps / 2 + 1 + k, 0)]; + *optr++ = sum / (1 << FILTER_BITS); + } + // Middle part. + for (; x <= x2; ++x, y += delta) { + int_pel = y >> RS_SCALE_SUBPEL_BITS; + sub_pel = (y >> RS_SCALE_EXTRA_BITS) & RS_SUBPEL_MASK; + const int16_t *filter = &interp_filters[sub_pel * interp_taps]; + sum = 0; + for (k = 0; k < interp_taps; ++k) + sum += filter[k] * input[int_pel - interp_taps / 2 + 1 + k]; + *optr++ = sum / (1 << FILTER_BITS); + } + // End part. 
+ for (; x < out_length; ++x, y += delta) { + int_pel = y >> RS_SCALE_SUBPEL_BITS; + sub_pel = (y >> RS_SCALE_EXTRA_BITS) & RS_SUBPEL_MASK; + const int16_t *filter = &interp_filters[sub_pel * interp_taps]; + sum = 0; + for (k = 0; k < interp_taps; ++k) + sum += filter[k] * + input[AOMMIN(int_pel - interp_taps / 2 + 1 + k, in_length - 1)]; + *optr++ = sum / (1 << FILTER_BITS); + } + } +} + static void interpolate(const uint8_t *const input, int in_length, uint8_t *output, int out_length) { const InterpKernel *interp_filters = @@ -322,6 +407,15 @@ static void interpolate(const uint8_t *const input, int in_length, SUBPEL_TAPS); } +static void interpolate_double_prec(const double *const input, int in_length, + double *output, int out_length) { + const InterpKernel *interp_filters = + choose_interp_filter(in_length, out_length); + + interpolate_core_double_prec(input, in_length, output, out_length, + &interp_filters[0][0], SUBPEL_TAPS); +} + int32_t av1_get_upscale_convolve_step(int in_length, int out_length) { return ((in_length << RS_SCALE_SUBPEL_BITS) + out_length / 2) / out_length; } @@ -337,7 +431,6 @@ static int32_t get_upscale_convolve_x0(int in_length, int out_length, return (int32_t)((uint32_t)x0 & RS_SCALE_SUBPEL_MASK); } -#ifndef __clang_analyzer__ static void down2_symeven(const uint8_t *const input, int length, uint8_t *output) { // Actual filter len = 2 * filter_len_half. 
@@ -392,7 +485,6 @@ static void down2_symeven(const uint8_t *const input, int length, } } } -#endif static void down2_symodd(const uint8_t *const input, int length, uint8_t *output) { @@ -505,6 +597,12 @@ static void resize_multistep(const uint8_t *const input, int length, } } +static void upscale_multistep_double_prec(const double *const input, int length, + double *output, int olength) { + assert(length < olength); + interpolate_double_prec(input, length, output, olength); +} + static void fill_col_to_arr(uint8_t *img, int stride, int len, uint8_t *arr) { int i; uint8_t *iptr = img; @@ -523,9 +621,29 @@ static void fill_arr_to_col(uint8_t *img, int stride, int len, uint8_t *arr) { } } -static void resize_plane(const uint8_t *const input, int height, int width, - int in_stride, uint8_t *output, int height2, - int width2, int out_stride) { +static void fill_col_to_arr_double_prec(double *img, int stride, int len, + double *arr) { + int i; + double *iptr = img; + double *aptr = arr; + for (i = 0; i < len; ++i, iptr += stride) { + *aptr++ = *iptr; + } +} + +static void fill_arr_to_col_double_prec(double *img, int stride, int len, + double *arr) { + int i; + double *iptr = img; + double *aptr = arr; + for (i = 0; i < len; ++i, iptr += stride) { + *iptr = *aptr++; + } +} + +void av1_resize_plane(const uint8_t *const input, int height, int width, + int in_stride, uint8_t *output, int height2, int width2, + int out_stride) { int i; uint8_t *intbuf = (uint8_t *)aom_malloc(sizeof(uint8_t) * width2 * height); uint8_t *tmpbuf = @@ -554,6 +672,33 @@ Error: aom_free(arrbuf2); } +void av1_upscale_plane_double_prec(const double *const input, int height, + int width, int in_stride, double *output, + int height2, int width2, int out_stride) { + int i; + double *intbuf = (double *)aom_malloc(sizeof(double) * width2 * height); + double *arrbuf = (double *)aom_malloc(sizeof(double) * height); + double *arrbuf2 = (double *)aom_malloc(sizeof(double) * height2); + if (intbuf == NULL || 
arrbuf == NULL || arrbuf2 == NULL) goto Error; + assert(width > 0); + assert(height > 0); + assert(width2 > 0); + assert(height2 > 0); + for (i = 0; i < height; ++i) + upscale_multistep_double_prec(input + in_stride * i, width, + intbuf + width2 * i, width2); + for (i = 0; i < width2; ++i) { + fill_col_to_arr_double_prec(intbuf + i, width2, height, arrbuf); + upscale_multistep_double_prec(arrbuf, height, arrbuf2, height2); + fill_arr_to_col_double_prec(output + i, out_stride, height2, arrbuf2); + } + +Error: + aom_free(intbuf); + aom_free(arrbuf); + aom_free(arrbuf2); +} + static void upscale_normative_rect(const uint8_t *const input, int height, int width, int in_stride, uint8_t *output, int height2, int width2, int out_stride, @@ -613,6 +758,7 @@ static void upscale_normative_rect(const uint8_t *const input, int height, } } +#if CONFIG_AV1_HIGHBITDEPTH static void highbd_interpolate_core(const uint16_t *const input, int in_length, uint16_t *output, int out_length, int bd, const int16_t *interp_filters, @@ -705,7 +851,6 @@ static void highbd_interpolate(const uint16_t *const input, int in_length, &interp_filters[0][0], SUBPEL_TAPS); } -#ifndef __clang_analyzer__ static void highbd_down2_symeven(const uint16_t *const input, int length, uint16_t *output, int bd) { // Actual filter len = 2 * filter_len_half. 
@@ -813,7 +958,6 @@ static void highbd_down2_symodd(const uint16_t *const input, int length, } } } -#endif static void highbd_resize_multistep(const uint16_t *const input, int length, uint16_t *output, int olength, @@ -871,10 +1015,9 @@ static void highbd_fill_arr_to_col(uint16_t *img, int stride, int len, } } -static void highbd_resize_plane(const uint8_t *const input, int height, - int width, int in_stride, uint8_t *output, - int height2, int width2, int out_stride, - int bd) { +void av1_highbd_resize_plane(const uint8_t *const input, int height, int width, + int in_stride, uint8_t *output, int height2, + int width2, int out_stride, int bd) { int i; uint16_t *intbuf = (uint16_t *)aom_malloc(sizeof(uint16_t) * width2 * height); uint16_t *tmpbuf = @@ -963,17 +1106,18 @@ static void highbd_upscale_normative_rect(const uint8_t *const input, aom_free(tmp_right); } } +#endif // CONFIG_AV1_HIGHBITDEPTH void av1_resize_frame420(const uint8_t *const y, int y_stride, const uint8_t *const u, const uint8_t *const v, int uv_stride, int height, int width, uint8_t *oy, int oy_stride, uint8_t *ou, uint8_t *ov, int ouv_stride, int oheight, int owidth) { - resize_plane(y, height, width, y_stride, oy, oheight, owidth, oy_stride); - resize_plane(u, height / 2, width / 2, uv_stride, ou, oheight / 2, owidth / 2, - ouv_stride); - resize_plane(v, height / 2, width / 2, uv_stride, ov, oheight / 2, owidth / 2, - ouv_stride); + av1_resize_plane(y, height, width, y_stride, oy, oheight, owidth, oy_stride); + av1_resize_plane(u, height / 2, width / 2, uv_stride, ou, oheight / 2, + owidth / 2, ouv_stride); + av1_resize_plane(v, height / 2, width / 2, uv_stride, ov, oheight / 2, + owidth / 2, ouv_stride); } void av1_resize_frame422(const uint8_t *const y, int y_stride, @@ -981,11 +1125,11 @@ void av1_resize_frame422(const uint8_t *const y, int y_stride, int uv_stride, int height, int width, uint8_t *oy, int oy_stride, uint8_t *ou, uint8_t *ov, int ouv_stride, int oheight, int owidth) { - 
resize_plane(y, height, width, y_stride, oy, oheight, owidth, oy_stride); - resize_plane(u, height, width / 2, uv_stride, ou, oheight, owidth / 2, - ouv_stride); - resize_plane(v, height, width / 2, uv_stride, ov, oheight, owidth / 2, - ouv_stride); + av1_resize_plane(y, height, width, y_stride, oy, oheight, owidth, oy_stride); + av1_resize_plane(u, height, width / 2, uv_stride, ou, oheight, owidth / 2, + ouv_stride); + av1_resize_plane(v, height, width / 2, uv_stride, ov, oheight, owidth / 2, + ouv_stride); } void av1_resize_frame444(const uint8_t *const y, int y_stride, @@ -993,23 +1137,26 @@ void av1_resize_frame444(const uint8_t *const y, int y_stride, int uv_stride, int height, int width, uint8_t *oy, int oy_stride, uint8_t *ou, uint8_t *ov, int ouv_stride, int oheight, int owidth) { - resize_plane(y, height, width, y_stride, oy, oheight, owidth, oy_stride); - resize_plane(u, height, width, uv_stride, ou, oheight, owidth, ouv_stride); - resize_plane(v, height, width, uv_stride, ov, oheight, owidth, ouv_stride); + av1_resize_plane(y, height, width, y_stride, oy, oheight, owidth, oy_stride); + av1_resize_plane(u, height, width, uv_stride, ou, oheight, owidth, + ouv_stride); + av1_resize_plane(v, height, width, uv_stride, ov, oheight, owidth, + ouv_stride); } +#if CONFIG_AV1_HIGHBITDEPTH void av1_highbd_resize_frame420(const uint8_t *const y, int y_stride, const uint8_t *const u, const uint8_t *const v, int uv_stride, int height, int width, uint8_t *oy, int oy_stride, uint8_t *ou, uint8_t *ov, int ouv_stride, int oheight, int owidth, int bd) { - highbd_resize_plane(y, height, width, y_stride, oy, oheight, owidth, - oy_stride, bd); - highbd_resize_plane(u, height / 2, width / 2, uv_stride, ou, oheight / 2, - owidth / 2, ouv_stride, bd); - highbd_resize_plane(v, height / 2, width / 2, uv_stride, ov, oheight / 2, - owidth / 2, ouv_stride, bd); + av1_highbd_resize_plane(y, height, width, y_stride, oy, oheight, owidth, + oy_stride, bd); + av1_highbd_resize_plane(u, 
height / 2, width / 2, uv_stride, ou, oheight / 2, + owidth / 2, ouv_stride, bd); + av1_highbd_resize_plane(v, height / 2, width / 2, uv_stride, ov, oheight / 2, + owidth / 2, ouv_stride, bd); } void av1_highbd_resize_frame422(const uint8_t *const y, int y_stride, @@ -1018,12 +1165,12 @@ void av1_highbd_resize_frame422(const uint8_t *const y, int y_stride, uint8_t *oy, int oy_stride, uint8_t *ou, uint8_t *ov, int ouv_stride, int oheight, int owidth, int bd) { - highbd_resize_plane(y, height, width, y_stride, oy, oheight, owidth, - oy_stride, bd); - highbd_resize_plane(u, height, width / 2, uv_stride, ou, oheight, owidth / 2, - ouv_stride, bd); - highbd_resize_plane(v, height, width / 2, uv_stride, ov, oheight, owidth / 2, - ouv_stride, bd); + av1_highbd_resize_plane(y, height, width, y_stride, oy, oheight, owidth, + oy_stride, bd); + av1_highbd_resize_plane(u, height, width / 2, uv_stride, ou, oheight, + owidth / 2, ouv_stride, bd); + av1_highbd_resize_plane(v, height, width / 2, uv_stride, ov, oheight, + owidth / 2, ouv_stride, bd); } void av1_highbd_resize_frame444(const uint8_t *const y, int y_stride, @@ -1032,13 +1179,14 @@ void av1_highbd_resize_frame444(const uint8_t *const y, int y_stride, uint8_t *oy, int oy_stride, uint8_t *ou, uint8_t *ov, int ouv_stride, int oheight, int owidth, int bd) { - highbd_resize_plane(y, height, width, y_stride, oy, oheight, owidth, - oy_stride, bd); - highbd_resize_plane(u, height, width, uv_stride, ou, oheight, owidth, - ouv_stride, bd); - highbd_resize_plane(v, height, width, uv_stride, ov, oheight, owidth, - ouv_stride, bd); + av1_highbd_resize_plane(y, height, width, y_stride, oy, oheight, owidth, + oy_stride, bd); + av1_highbd_resize_plane(u, height, width, uv_stride, ou, oheight, owidth, + ouv_stride, bd); + av1_highbd_resize_plane(v, height, width, uv_stride, ov, oheight, owidth, + ouv_stride, bd); } +#endif // CONFIG_AV1_HIGHBITDEPTH void av1_resize_and_extend_frame(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG 
*dst, int bd, @@ -1049,16 +1197,24 @@ void av1_resize_and_extend_frame(const YV12_BUFFER_CONFIG *src, // the static analysis warnings. for (int i = 0; i < AOMMIN(num_planes, MAX_MB_PLANE); ++i) { const int is_uv = i > 0; +#if CONFIG_AV1_HIGHBITDEPTH if (src->flags & YV12_FLAG_HIGHBITDEPTH) - highbd_resize_plane(src->buffers[i], src->crop_heights[is_uv], - src->crop_widths[is_uv], src->strides[is_uv], - dst->buffers[i], dst->crop_heights[is_uv], - dst->crop_widths[is_uv], dst->strides[is_uv], bd); + av1_highbd_resize_plane(src->buffers[i], src->crop_heights[is_uv], + src->crop_widths[is_uv], src->strides[is_uv], + dst->buffers[i], dst->crop_heights[is_uv], + dst->crop_widths[is_uv], dst->strides[is_uv], bd); else - resize_plane(src->buffers[i], src->crop_heights[is_uv], - src->crop_widths[is_uv], src->strides[is_uv], - dst->buffers[i], dst->crop_heights[is_uv], - dst->crop_widths[is_uv], dst->strides[is_uv]); + av1_resize_plane(src->buffers[i], src->crop_heights[is_uv], + src->crop_widths[is_uv], src->strides[is_uv], + dst->buffers[i], dst->crop_heights[is_uv], + dst->crop_widths[is_uv], dst->strides[is_uv]); +#else + (void)bd; + av1_resize_plane(src->buffers[i], src->crop_heights[is_uv], + src->crop_widths[is_uv], src->strides[is_uv], + dst->buffers[i], dst->crop_heights[is_uv], + dst->crop_widths[is_uv], dst->strides[is_uv]); +#endif } aom_extend_frame_borders(dst, num_planes); } @@ -1079,7 +1235,7 @@ void av1_upscale_normative_rows(const AV1_COMMON *cm, const uint8_t *src, int32_t x0_qn = get_upscale_convolve_x0(downscaled_plane_width, upscaled_plane_width, x_step_qn); - for (int j = 0; j < cm->tile_cols; j++) { + for (int j = 0; j < cm->tiles.cols; j++) { av1_tile_set_col(&tile_col, cm, j); // Determine the limits of this tile column in both the source // and destination images. 
@@ -1092,7 +1248,7 @@ void av1_upscale_normative_rows(const AV1_COMMON *cm, const uint8_t *src, const int upscaled_x0 = (downscaled_x0 * superres_denom) / SCALE_NUMERATOR; int upscaled_x1; - if (j == cm->tile_cols - 1) { + if (j == cm->tiles.cols - 1) { // Note that we can't just use AOMMIN here - due to rounding, // (downscaled_x1 * superres_denom) / SCALE_NUMERATOR may be less than // upscaled_plane_width. @@ -1106,8 +1262,9 @@ void av1_upscale_normative_rows(const AV1_COMMON *cm, const uint8_t *src, const int dst_width = upscaled_x1 - upscaled_x0; const int pad_left = (j == 0); - const int pad_right = (j == cm->tile_cols - 1); + const int pad_right = (j == cm->tiles.cols - 1); +#if CONFIG_AV1_HIGHBITDEPTH if (cm->seq_params.use_highbitdepth) highbd_upscale_normative_rect(src_ptr, rows, src_width, src_stride, dst_ptr, rows, dst_width, dst_stride, @@ -1117,7 +1274,11 @@ void av1_upscale_normative_rows(const AV1_COMMON *cm, const uint8_t *src, upscale_normative_rect(src_ptr, rows, src_width, src_stride, dst_ptr, rows, dst_width, dst_stride, x_step_qn, x0_qn, pad_left, pad_right); - +#else + upscale_normative_rect(src_ptr, rows, src_width, src_stride, dst_ptr, rows, + dst_width, dst_stride, x_step_qn, x0_qn, pad_left, + pad_right); +#endif // Update the fractional pixel offset to prepare for the next tile column. x0_qn += (dst_width * x_step_qn) - (src_width << RS_SCALE_SUBPEL_BITS); } @@ -1155,10 +1316,19 @@ YV12_BUFFER_CONFIG *av1_scale_if_required(AV1_COMMON *cm, // denominator. static void calculate_scaled_size_helper(int *dim, int denom) { if (denom != SCALE_NUMERATOR) { + // We need to ensure the constraint in "Appendix A" of the spec: + // * FrameWidth is greater than or equal to 16 + // * FrameHeight is greater than or equal to 16 + // For this, we clamp the downscaled dimension to at least 16. 
One + // exception: if original dimension itself was < 16, then we keep the + // downscaled dimension to be same as the original, to ensure that resizing + // is valid. + const int min_dim = AOMMIN(16, *dim); // Use this version if we need *dim to be even // *width = (*width * SCALE_NUMERATOR + denom) / (2 * denom); // *width <<= 1; *dim = (*dim * SCALE_NUMERATOR + denom / 2) / (denom); + *dim = AOMMAX(*dim, min_dim); } } @@ -1201,17 +1371,18 @@ void av1_superres_upscale(AV1_COMMON *cm, BufferPool *const pool) { const int num_planes = av1_num_planes(cm); if (!av1_superres_scaled(cm)) return; const SequenceHeader *const seq_params = &cm->seq_params; + const int byte_alignment = cm->features.byte_alignment; YV12_BUFFER_CONFIG copy_buffer; memset(©_buffer, 0, sizeof(copy_buffer)); - YV12_BUFFER_CONFIG *const frame_to_show = get_frame_new_buffer(cm); + YV12_BUFFER_CONFIG *const frame_to_show = &cm->cur_frame->buf; const int aligned_width = ALIGN_POWER_OF_TWO(cm->width, 3); if (aom_alloc_frame_buffer( ©_buffer, aligned_width, cm->height, seq_params->subsampling_x, seq_params->subsampling_y, seq_params->use_highbitdepth, - AOM_BORDER_IN_PIXELS, cm->byte_alignment)) + AOM_BORDER_IN_PIXELS, byte_alignment)) aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate copy buffer for superres upscaling"); @@ -1225,27 +1396,31 @@ void av1_superres_upscale(AV1_COMMON *cm, BufferPool *const pool) { // Realloc the current frame buffer at a higher resolution in place. if (pool != NULL) { // Use callbacks if on the decoder. - aom_codec_frame_buffer_t *fb = - &pool->frame_bufs[cm->new_fb_idx].raw_frame_buffer; + aom_codec_frame_buffer_t *fb = &cm->cur_frame->raw_frame_buffer; aom_release_frame_buffer_cb_fn_t release_fb_cb = pool->release_fb_cb; aom_get_frame_buffer_cb_fn_t cb = pool->get_fb_cb; void *cb_priv = pool->cb_priv; + lock_buffer_pool(pool); // Realloc with callback does not release the frame buffer - release first. 
- if (release_fb_cb(cb_priv, fb)) + if (release_fb_cb(cb_priv, fb)) { + unlock_buffer_pool(pool); aom_internal_error( &cm->error, AOM_CODEC_MEM_ERROR, "Failed to free current frame buffer before superres upscaling"); - + } // aom_realloc_frame_buffer() leaves config data for frame_to_show intact if (aom_realloc_frame_buffer( frame_to_show, cm->superres_upscaled_width, cm->superres_upscaled_height, seq_params->subsampling_x, seq_params->subsampling_y, seq_params->use_highbitdepth, - AOM_BORDER_IN_PIXELS, cm->byte_alignment, fb, cb, cb_priv)) + AOM_BORDER_IN_PIXELS, byte_alignment, fb, cb, cb_priv)) { + unlock_buffer_pool(pool); aom_internal_error( &cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate current frame buffer for superres upscaling"); + } + unlock_buffer_pool(pool); } else { // Make a copy of the config data for frame_to_show in copy_buffer copy_buffer_config(frame_to_show, ©_buffer); @@ -1256,7 +1431,7 @@ void av1_superres_upscale(AV1_COMMON *cm, BufferPool *const pool) { frame_to_show, cm->superres_upscaled_width, cm->superres_upscaled_height, seq_params->subsampling_x, seq_params->subsampling_y, seq_params->use_highbitdepth, - AOM_BORDER_IN_PIXELS, cm->byte_alignment)) + AOM_BORDER_IN_PIXELS, byte_alignment)) aom_internal_error( &cm->error, AOM_CODEC_MEM_ERROR, "Failed to reallocate current frame buffer for superres upscaling"); diff --git a/media/libaom/src/av1/common/resize.h b/media/libaom/src/av1/common/resize.h index 9a59a8d63..8ee859e5c 100644 --- a/media/libaom/src/av1/common/resize.h +++ b/media/libaom/src/av1/common/resize.h @@ -14,7 +14,7 @@ #include <stdio.h> #include "aom/aom_integer.h" -#include "av1/common/onyxc_int.h" +#include "av1/common/av1_common_int.h" #ifdef __cplusplus extern "C" { @@ -23,6 +23,9 @@ extern "C" { void av1_resize_plane(const uint8_t *const input, int height, int width, int in_stride, uint8_t *output, int height2, int width2, int out_stride); +void av1_upscale_plane_double_prec(const double *const input, int height, 
+ int width, int in_stride, double *output, + int height2, int width2, int out_stride); void av1_resize_frame420(const uint8_t *const y, int y_stride, const uint8_t *const u, const uint8_t *const v, int uv_stride, int height, int width, uint8_t *oy, diff --git a/media/libaom/src/av1/common/restoration.c b/media/libaom/src/av1/common/restoration.c index d276a915b..a0f37ad63 100644 --- a/media/libaom/src/av1/common/restoration.c +++ b/media/libaom/src/av1/common/restoration.c @@ -17,7 +17,7 @@ #include "config/aom_scale_rtcd.h" #include "aom_mem/aom_mem.h" -#include "av1/common/onyxc_int.h" +#include "av1/common/av1_common_int.h" #include "av1/common/resize.h" #include "av1/common/restoration.h" #include "aom_dsp/aom_dsp_common.h" @@ -28,7 +28,7 @@ // The 's' values are calculated based on original 'r' and 'e' values in the // spec using GenSgrprojVtable(). // Note: Setting r = 0 skips the filter; with corresponding s = -1 (invalid). -const sgr_params_type sgr_params[SGRPROJ_PARAMS] = { +const sgr_params_type av1_sgr_params[SGRPROJ_PARAMS] = { { { 2, 1 }, { 140, 3236 } }, { { 2, 1 }, { 112, 2158 } }, { { 2, 1 }, { 93, 1618 } }, { { 2, 1 }, { 80, 1438 } }, { { 2, 1 }, { 70, 1295 } }, { { 2, 1 }, { 58, 1177 } }, @@ -111,7 +111,7 @@ int sgrproj_mtable[SGRPROJ_PARAMS][2]; static void GenSgrprojVtable() { for (int i = 0; i < SGRPROJ_PARAMS; ++i) { - const sgr_params_type *const params = &sgr_params[i]; + const sgr_params_type *const params = &av1_sgr_params[i]; for (int j = 0; j < 2; ++j) { const int e = params->e[j]; const int r = params->r[j]; @@ -153,6 +153,7 @@ static void extend_frame_lowbd(uint8_t *data, int width, int height, int stride, } } +#if CONFIG_AV1_HIGHBITDEPTH static void extend_frame_highbd(uint16_t *data, int width, int height, int stride, int border_horz, int border_vert) { uint16_t *data_p; @@ -173,13 +174,24 @@ static void extend_frame_highbd(uint16_t *data, int width, int height, } } -void extend_frame(uint8_t *data, int width, int height, int 
stride, - int border_horz, int border_vert, int highbd) { - if (highbd) +static void copy_tile_highbd(int width, int height, const uint16_t *src, + int src_stride, uint16_t *dst, int dst_stride) { + for (int i = 0; i < height; ++i) + memcpy(dst + i * dst_stride, src + i * src_stride, width * sizeof(*dst)); +} +#endif + +void av1_extend_frame(uint8_t *data, int width, int height, int stride, + int border_horz, int border_vert, int highbd) { +#if CONFIG_AV1_HIGHBITDEPTH + if (highbd) { extend_frame_highbd(CONVERT_TO_SHORTPTR(data), width, height, stride, border_horz, border_vert); - else - extend_frame_lowbd(data, width, height, stride, border_horz, border_vert); + return; + } +#endif + (void)highbd; + extend_frame_lowbd(data, width, height, stride, border_horz, border_vert); } static void copy_tile_lowbd(int width, int height, const uint8_t *src, @@ -188,19 +200,17 @@ static void copy_tile_lowbd(int width, int height, const uint8_t *src, memcpy(dst + i * dst_stride, src + i * src_stride, width); } -static void copy_tile_highbd(int width, int height, const uint16_t *src, - int src_stride, uint16_t *dst, int dst_stride) { - for (int i = 0; i < height; ++i) - memcpy(dst + i * dst_stride, src + i * src_stride, width * sizeof(*dst)); -} - static void copy_tile(int width, int height, const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int highbd) { - if (highbd) +#if CONFIG_AV1_HIGHBITDEPTH + if (highbd) { copy_tile_highbd(width, height, CONVERT_TO_SHORTPTR(src), src_stride, CONVERT_TO_SHORTPTR(dst), dst_stride); - else - copy_tile_lowbd(width, height, src, src_stride, dst, dst_stride); + return; + } +#endif + (void)highbd; + copy_tile_lowbd(width, height, src, src_stride, dst, dst_stride); } #define REAL_PTR(hbd, d) ((hbd) ? (uint8_t *)CONVERT_TO_SHORTPTR(d) : (d)) @@ -212,11 +222,10 @@ static void copy_tile(int width, int height, const uint8_t *src, int src_stride, // rules: // // * At a frame boundary, we copy the outermost row of CDEF pixels three times. 
-// This extension is done by a call to extend_frame() at the start of the loop -// restoration process, so the value of copy_above/copy_below doesn't strictly -// matter. -// However, by setting *copy_above = *copy_below = 1 whenever loop filtering -// across tiles is disabled, we can allow +// This extension is done by a call to av1_extend_frame() at the start of the +// loop restoration process, so the value of copy_above/copy_below doesn't +// strictly matter. However, by setting *copy_above = *copy_below = 1 whenever +// loop filtering across tiles is disabled, we can allow // {setup,restore}_processing_stripe_boundary to assume that the top/bottom // data has always been copied, simplifying the behaviour at the left and // right edges of tiles. @@ -620,7 +629,7 @@ static void boxsum(int32_t *src, int width, int height, int src_stride, int r, assert(0 && "Invalid value of r in self-guided filter"); } -void decode_xq(const int *xqd, int *xq, const sgr_params_type *params) { +void av1_decode_xq(const int *xqd, int *xq, const sgr_params_type *params) { if (params->r[0] == 0) { xq[0] = 0; xq[1] = (1 << SGRPROJ_PRJ_BITS) - xqd[1]; @@ -633,7 +642,7 @@ void decode_xq(const int *xqd, int *xq, const sgr_params_type *params) { } } -const int32_t x_by_xplus1[256] = { +const int32_t av1_x_by_xplus1[256] = { // Special case: Map 0 -> 1 (corresponding to a value of 1/256) // instead of 0. 
See comments in selfguided_restoration_internal() for why 1, 128, 171, 192, 205, 213, 219, 224, 228, 230, 233, 235, 236, 238, 239, @@ -656,7 +665,7 @@ const int32_t x_by_xplus1[256] = { 256, }; -const int32_t one_by_x[MAX_NELEM] = { +const int32_t av1_one_by_x[MAX_NELEM] = { 4096, 2048, 1365, 1024, 819, 683, 585, 512, 455, 410, 372, 341, 315, 293, 273, 256, 241, 228, 216, 205, 195, 186, 178, 171, 164, }; @@ -665,7 +674,7 @@ static void calculate_intermediate_result(int32_t *dgd, int width, int height, int dgd_stride, int bit_depth, int sgr_params_idx, int radius_idx, int pass, int32_t *A, int32_t *B) { - const sgr_params_type *const params = &sgr_params[sgr_params_idx]; + const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx]; const int r = params->r[radius_idx]; const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ; const int height_ext = height + 2 * SGRPROJ_BORDER_VERT; @@ -726,7 +735,7 @@ static void calculate_intermediate_result(int32_t *dgd, int width, int height, // Further, in the calculation of B[k] below, if z == 0 and r == 2, // then A[k] "should be" 0. But then we can end up setting B[k] to a value // slightly above 2^(8 + bit depth), due to rounding in the value of - // one_by_x[25-1]. + // av1_one_by_x[25-1]. // // Thus we saturate so that, when z == 0, A[k] is set to 1 instead of 0. // This fixes the above issues (256 - A[k] fits in a uint8, and we can't @@ -738,17 +747,17 @@ static void calculate_intermediate_result(int32_t *dgd, int width, int height, // would be a bad idea, as that corresponds to the case where the image // is very variable, when we want to preserve the local pixel value as // much as possible. 
- A[k] = x_by_xplus1[AOMMIN(z, 255)]; // in range [1, 256] + A[k] = av1_x_by_xplus1[AOMMIN(z, 255)]; // in range [1, 256] // SGRPROJ_SGR - A[k] < 2^8 (from above), B[k] < 2^(bit_depth) * n, - // one_by_x[n - 1] = round(2^12 / n) + // av1_one_by_x[n - 1] = round(2^12 / n) // => the product here is < 2^(20 + bit_depth) <= 2^32, // and B[k] is set to a value < 2^(8 + bit depth) - // This holds even with the rounding in one_by_x and in the overall + // This holds even with the rounding in av1_one_by_x and in the overall // result, as long as SGRPROJ_SGR - A[k] is strictly less than 2^8. B[k] = (int32_t)ROUND_POWER_OF_TWO((uint32_t)(SGRPROJ_SGR - A[k]) * (uint32_t)B[k] * - (uint32_t)one_by_x[n - 1], + (uint32_t)av1_one_by_x[n - 1], SGRPROJ_RECIP_BITS); } } @@ -757,7 +766,7 @@ static void calculate_intermediate_result(int32_t *dgd, int width, int height, static void selfguided_restoration_fast_internal( int32_t *dgd, int width, int height, int dgd_stride, int32_t *dst, int dst_stride, int bit_depth, int sgr_params_idx, int radius_idx) { - const sgr_params_type *const params = &sgr_params[sgr_params_idx]; + const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx]; const int r = params->r[radius_idx]; const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ; // Adjusting the stride of A and B here appears to avoid bad cache effects, @@ -883,7 +892,7 @@ int av1_selfguided_restoration_c(const uint8_t *dgd8, int width, int height, } } - const sgr_params_type *const params = &sgr_params[sgr_params_idx]; + const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx]; // If params->r == 0 we skip the corresponding filter. We only allow one of // the radii to be 0, as having both equal to 0 would be equivalent to // skipping SGR entirely. 
@@ -899,11 +908,11 @@ int av1_selfguided_restoration_c(const uint8_t *dgd8, int width, int height, return 0; } -void apply_selfguided_restoration_c(const uint8_t *dat8, int width, int height, - int stride, int eps, const int *xqd, - uint8_t *dst8, int dst_stride, - int32_t *tmpbuf, int bit_depth, - int highbd) { +void av1_apply_selfguided_restoration_c(const uint8_t *dat8, int width, + int height, int stride, int eps, + const int *xqd, uint8_t *dst8, + int dst_stride, int32_t *tmpbuf, + int bit_depth, int highbd) { int32_t *flt0 = tmpbuf; int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX; assert(width * height <= RESTORATION_UNITPELS_MAX); @@ -912,9 +921,9 @@ void apply_selfguided_restoration_c(const uint8_t *dat8, int width, int height, dat8, width, height, stride, flt0, flt1, width, eps, bit_depth, highbd); (void)ret; assert(!ret); - const sgr_params_type *const params = &sgr_params[eps]; + const sgr_params_type *const params = &av1_sgr_params[eps]; int xq[2]; - decode_xq(xqd, xq, params); + av1_decode_xq(xqd, xq, params); for (int i = 0; i < height; ++i) { for (int j = 0; j < width; ++j) { const int k = i * width + j; @@ -950,12 +959,13 @@ static void sgrproj_filter_stripe(const RestorationUnitInfo *rui, for (int j = 0; j < stripe_width; j += procunit_width) { int w = AOMMIN(procunit_width, stripe_width - j); - apply_selfguided_restoration(src + j, w, stripe_height, src_stride, - rui->sgrproj_info.ep, rui->sgrproj_info.xqd, - dst + j, dst_stride, tmpbuf, bit_depth, 0); + av1_apply_selfguided_restoration( + src + j, w, stripe_height, src_stride, rui->sgrproj_info.ep, + rui->sgrproj_info.xqd, dst + j, dst_stride, tmpbuf, bit_depth, 0); } } +#if CONFIG_AV1_HIGHBITDEPTH static void wiener_filter_stripe_highbd(const RestorationUnitInfo *rui, int stripe_width, int stripe_height, int procunit_width, const uint8_t *src8, @@ -984,11 +994,12 @@ static void sgrproj_filter_stripe_highbd(const RestorationUnitInfo *rui, int32_t *tmpbuf, int bit_depth) { for (int j = 0; j < 
stripe_width; j += procunit_width) { int w = AOMMIN(procunit_width, stripe_width - j); - apply_selfguided_restoration(src8 + j, w, stripe_height, src_stride, - rui->sgrproj_info.ep, rui->sgrproj_info.xqd, - dst8 + j, dst_stride, tmpbuf, bit_depth, 1); + av1_apply_selfguided_restoration( + src8 + j, w, stripe_height, src_stride, rui->sgrproj_info.ep, + rui->sgrproj_info.xqd, dst8 + j, dst_stride, tmpbuf, bit_depth, 1); } } +#endif // CONFIG_AV1_HIGHBITDEPTH typedef void (*stripe_filter_fun)(const RestorationUnitInfo *rui, int stripe_width, int stripe_height, @@ -996,12 +1007,18 @@ typedef void (*stripe_filter_fun)(const RestorationUnitInfo *rui, int src_stride, uint8_t *dst, int dst_stride, int32_t *tmpbuf, int bit_depth); +#if CONFIG_AV1_HIGHBITDEPTH #define NUM_STRIPE_FILTERS 4 - static const stripe_filter_fun stripe_filters[NUM_STRIPE_FILTERS] = { wiener_filter_stripe, sgrproj_filter_stripe, wiener_filter_stripe_highbd, sgrproj_filter_stripe_highbd }; +#else +#define NUM_STRIPE_FILTERS 2 +static const stripe_filter_fun stripe_filters[NUM_STRIPE_FILTERS] = { + wiener_filter_stripe, sgrproj_filter_stripe +}; +#endif // CONFIG_AV1_HIGHBITDEPTH // Filter one restoration unit void av1_loop_restoration_filter_unit( @@ -1072,13 +1089,6 @@ void av1_loop_restoration_filter_unit( } } -static void filter_frame_on_tile(int tile_row, int tile_col, void *priv, - AV1_COMMON *cm) { - (void)tile_col; - FilterFrameCtxt *ctxt = (FilterFrameCtxt *)priv; - ctxt->tile_stripe0 = (tile_row == 0) ? 
0 : cm->rst_end_stripe[tile_row - 1]; -} - static void filter_frame_on_unit(const RestorationTileLimits *limits, const AV1PixelRect *tile_rect, int rest_unit_idx, void *priv, int32_t *tmpbuf, @@ -1106,8 +1116,8 @@ void av1_loop_restoration_filter_frame_init(AV1LrStruct *lr_ctxt, const int frame_height = frame->crop_heights[0]; if (aom_realloc_frame_buffer( lr_ctxt->dst, frame_width, frame_height, seq_params->subsampling_x, - seq_params->subsampling_y, highbd, AOM_BORDER_IN_PIXELS, - cm->byte_alignment, NULL, NULL, NULL) < 0) + seq_params->subsampling_y, highbd, AOM_RESTORATION_FRAME_BORDER, + cm->features.byte_alignment, NULL, NULL, NULL) < 0) aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate restoration dst buffer"); @@ -1127,9 +1137,9 @@ void av1_loop_restoration_filter_frame_init(AV1LrStruct *lr_ctxt, const int plane_height = frame->crop_heights[is_uv]; FilterFrameCtxt *lr_plane_ctxt = &lr_ctxt->ctxt[plane]; - extend_frame(frame->buffers[plane], plane_width, plane_height, - frame->strides[is_uv], RESTORATION_BORDER, RESTORATION_BORDER, - highbd); + av1_extend_frame(frame->buffers[plane], plane_width, plane_height, + frame->strides[is_uv], RESTORATION_BORDER, + RESTORATION_BORDER, highbd); lr_plane_ctxt->rsi = rsi; lr_plane_ctxt->ss_x = is_uv && seq_params->subsampling_x; @@ -1141,7 +1151,7 @@ void av1_loop_restoration_filter_frame_init(AV1LrStruct *lr_ctxt, lr_plane_ctxt->data_stride = frame->strides[is_uv]; lr_plane_ctxt->dst_stride = lr_ctxt->dst->strides[is_uv]; lr_plane_ctxt->tile_rect = av1_whole_frame_rect(cm, is_uv); - filter_frame_on_tile(LR_TILE_ROW, LR_TILE_COL, lr_plane_ctxt, cm); + lr_plane_ctxt->tile_stripe0 = 0; } } @@ -1150,10 +1160,10 @@ void av1_loop_restoration_copy_planes(AV1LrStruct *loop_rest_ctxt, typedef void (*copy_fun)(const YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int hstart, int hend, int vstart, int vend); - static const copy_fun copy_funs[3] = { - aom_yv12_partial_copy_y, 
aom_yv12_partial_copy_u, aom_yv12_partial_copy_v - }; - + static const copy_fun copy_funs[3] = { aom_yv12_partial_coloc_copy_y, + aom_yv12_partial_coloc_copy_u, + aom_yv12_partial_coloc_copy_v }; + assert(num_planes <= 3); for (int plane = 0; plane < num_planes; ++plane) { if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) continue; AV1PixelRect tile_rect = loop_rest_ctxt->ctxt[plane].tile_rect; @@ -1180,7 +1190,7 @@ static void foreach_rest_unit_in_planes(AV1LrStruct *lr_ctxt, AV1_COMMON *cm, void av1_loop_restoration_filter_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, int optimized_lr, void *lr_ctxt) { - assert(!cm->all_lossless); + assert(!cm->features.all_lossless); const int num_planes = av1_num_planes(cm); AV1LrStruct *loop_rest_ctxt = (AV1LrStruct *)lr_ctxt; @@ -1308,7 +1318,7 @@ int av1_loop_restoration_corners_in_sb(const struct AV1Common *cm, int plane, if (bsize != cm->seq_params.sb_size) return 0; if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) return 0; - assert(!cm->all_lossless); + assert(!cm->features.all_lossless); const int is_uv = plane > 0; diff --git a/media/libaom/src/av1/common/restoration.h b/media/libaom/src/av1/common/restoration.h index d834f9270..3b80dd5a9 100644 --- a/media/libaom/src/av1/common/restoration.h +++ b/media/libaom/src/av1/common/restoration.h @@ -22,6 +22,8 @@ extern "C" { #endif +// Border for Loop restoration buffer +#define AOM_RESTORATION_FRAME_BORDER 32 #define CLIP(x, lo, hi) ((x) < (lo) ? (lo) : (x) > (hi) ? (hi) : (x)) #define RINT(x) ((x) < 0 ? (int)((x)-0.5) : (int)((x) + 0.5)) @@ -120,6 +122,7 @@ extern "C" { // If WIENER_WIN_CHROMA == WIENER_WIN - 2, that implies 5x5 filters are used for // chroma. To use 7x7 for chroma set WIENER_WIN_CHROMA to WIENER_WIN. 
#define WIENER_WIN_CHROMA (WIENER_WIN - 2) +#define WIENER_WIN_REDUCED (WIENER_WIN - 2) #define WIENER_WIN2_CHROMA ((WIENER_WIN_CHROMA) * (WIENER_WIN_CHROMA)) #define WIENER_FILT_PREC_BITS 7 @@ -275,18 +278,18 @@ typedef struct AV1LrStruct { YV12_BUFFER_CONFIG *dst; } AV1LrStruct; -extern const sgr_params_type sgr_params[SGRPROJ_PARAMS]; +extern const sgr_params_type av1_sgr_params[SGRPROJ_PARAMS]; extern int sgrproj_mtable[SGRPROJ_PARAMS][2]; -extern const int32_t x_by_xplus1[256]; -extern const int32_t one_by_x[MAX_NELEM]; +extern const int32_t av1_x_by_xplus1[256]; +extern const int32_t av1_one_by_x[MAX_NELEM]; void av1_alloc_restoration_struct(struct AV1Common *cm, RestorationInfo *rsi, int is_uv); void av1_free_restoration_struct(RestorationInfo *rst_info); -void extend_frame(uint8_t *data, int width, int height, int stride, - int border_horz, int border_vert, int highbd); -void decode_xq(const int *xqd, int *xq, const sgr_params_type *params); +void av1_extend_frame(uint8_t *data, int width, int height, int stride, + int border_horz, int border_vert, int highbd); +void av1_decode_xq(const int *xqd, int *xq, const sgr_params_type *params); // Filter a single loop restoration unit. // diff --git a/media/libaom/src/av1/common/scale.c b/media/libaom/src/av1/common/scale.c index c525fe229..3b14c0a2c 100644 --- a/media/libaom/src/av1/common/scale.c +++ b/media/libaom/src/av1/common/scale.c @@ -37,7 +37,7 @@ static INLINE int scaled_y(int val, const struct scale_factors *sf) { // Note: Expect val to be in q4 precision static int unscaled_value(int val, const struct scale_factors *sf) { (void)sf; - return val << SCALE_EXTRA_BITS; + return val * (1 << SCALE_EXTRA_BITS); } static int get_fixed_point_scale_factor(int other_size, int this_size) { @@ -88,39 +88,41 @@ void av1_setup_scale_factors_for_frame(struct scale_factors *sf, int other_w, // AV1 convolve functions // Special case convolve functions should produce the same result as // av1_convolve_2d. 
- // subpel_x_q4 == 0 && subpel_y_q4 == 0 + // subpel_x_qn == 0 && subpel_y_qn == 0 sf->convolve[0][0][0] = av1_convolve_2d_copy_sr; - // subpel_x_q4 == 0 + // subpel_x_qn == 0 sf->convolve[0][1][0] = av1_convolve_y_sr; - // subpel_y_q4 == 0 + // subpel_y_qn == 0 sf->convolve[1][0][0] = av1_convolve_x_sr; - // subpel_x_q4 != 0 && subpel_y_q4 != 0 + // subpel_x_qn != 0 && subpel_y_qn != 0 sf->convolve[1][1][0] = av1_convolve_2d_sr; - // subpel_x_q4 == 0 && subpel_y_q4 == 0 - sf->convolve[0][0][1] = av1_jnt_convolve_2d_copy; - // subpel_x_q4 == 0 - sf->convolve[0][1][1] = av1_jnt_convolve_y; - // subpel_y_q4 == 0 - sf->convolve[1][0][1] = av1_jnt_convolve_x; - // subpel_x_q4 != 0 && subpel_y_q4 != 0 - sf->convolve[1][1][1] = av1_jnt_convolve_2d; + // subpel_x_qn == 0 && subpel_y_qn == 0 + sf->convolve[0][0][1] = av1_dist_wtd_convolve_2d_copy; + // subpel_x_qn == 0 + sf->convolve[0][1][1] = av1_dist_wtd_convolve_y; + // subpel_y_qn == 0 + sf->convolve[1][0][1] = av1_dist_wtd_convolve_x; + // subpel_x_qn != 0 && subpel_y_qn != 0 + sf->convolve[1][1][1] = av1_dist_wtd_convolve_2d; +#if CONFIG_AV1_HIGHBITDEPTH // AV1 High BD convolve functions // Special case convolve functions should produce the same result as // av1_highbd_convolve_2d. 
- // subpel_x_q4 == 0 && subpel_y_q4 == 0 + // subpel_x_qn == 0 && subpel_y_qn == 0 sf->highbd_convolve[0][0][0] = av1_highbd_convolve_2d_copy_sr; - // subpel_x_q4 == 0 + // subpel_x_qn == 0 sf->highbd_convolve[0][1][0] = av1_highbd_convolve_y_sr; - // subpel_y_q4 == 0 + // subpel_y_qn == 0 sf->highbd_convolve[1][0][0] = av1_highbd_convolve_x_sr; - // subpel_x_q4 != 0 && subpel_y_q4 != 0 + // subpel_x_qn != 0 && subpel_y_qn != 0 sf->highbd_convolve[1][1][0] = av1_highbd_convolve_2d_sr; - // subpel_x_q4 == 0 && subpel_y_q4 == 0 - sf->highbd_convolve[0][0][1] = av1_highbd_jnt_convolve_2d_copy; - // subpel_x_q4 == 0 - sf->highbd_convolve[0][1][1] = av1_highbd_jnt_convolve_y; - // subpel_y_q4 == 0 - sf->highbd_convolve[1][0][1] = av1_highbd_jnt_convolve_x; - // subpel_x_q4 != 0 && subpel_y_q4 != 0 - sf->highbd_convolve[1][1][1] = av1_highbd_jnt_convolve_2d; + // subpel_x_qn == 0 && subpel_y_qn == 0 + sf->highbd_convolve[0][0][1] = av1_highbd_dist_wtd_convolve_2d_copy; + // subpel_x_qn == 0 + sf->highbd_convolve[0][1][1] = av1_highbd_dist_wtd_convolve_y; + // subpel_y_qn == 0 + sf->highbd_convolve[1][0][1] = av1_highbd_dist_wtd_convolve_x; + // subpel_x_qn != 0 && subpel_y_qn != 0 + sf->highbd_convolve[1][1][1] = av1_highbd_dist_wtd_convolve_2d; +#endif } diff --git a/media/libaom/src/av1/common/scale.h b/media/libaom/src/av1/common/scale.h index 748e958c3..16b40bde8 100644 --- a/media/libaom/src/av1/common/scale.h +++ b/media/libaom/src/av1/common/scale.h @@ -45,11 +45,13 @@ void av1_setup_scale_factors_for_frame(struct scale_factors *sf, int other_w, int other_h, int this_w, int this_h); static INLINE int av1_is_valid_scale(const struct scale_factors *sf) { + assert(sf != NULL); return sf->x_scale_fp != REF_INVALID_SCALE && sf->y_scale_fp != REF_INVALID_SCALE; } static INLINE int av1_is_scaled(const struct scale_factors *sf) { + assert(sf != NULL); return av1_is_valid_scale(sf) && (sf->x_scale_fp != REF_NO_SCALE || sf->y_scale_fp != REF_NO_SCALE); } diff --git 
a/media/libaom/src/av1/common/scan.c b/media/libaom/src/av1/common/scan.c index 31a787b53..c1d4f3581 100644 --- a/media/libaom/src/av1/common/scan.c +++ b/media/libaom/src/av1/common/scan.c @@ -14,9 +14,9 @@ #include "av1/common/common_data.h" #include "av1/common/scan.h" -DECLARE_ALIGNED(16, static const int16_t, default_scan_4x4[16]) = { - 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15 -}; +DECLARE_ALIGNED(16, static const int16_t, + default_scan_4x4[16]) = { 0, 1, 4, 8, 5, 2, 3, 6, + 9, 12, 13, 10, 7, 11, 14, 15 }; DECLARE_ALIGNED(16, static const int16_t, mcol_scan_4x4[16]) = { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15, @@ -839,1546 +839,9 @@ DECLARE_ALIGNED(16, static const int16_t, default_scan_32x32[1024]) = { 927, 958, 989, 1020, 1021, 990, 959, 991, 1022, 1023 }; -// Neighborhood 2-tuples for various scans and blocksizes, -// in {top, left} order for each position in corresponding scan order. -DECLARE_ALIGNED(16, static const int16_t, - default_scan_4x4_neighbors[17 * MAX_NEIGHBORS]) = { - 0, 0, 0, 0, 0, 0, 4, 4, 1, 4, 1, 1, 2, 2, 2, 5, 5, - 8, 8, 8, 9, 12, 6, 9, 3, 6, 7, 10, 10, 13, 11, 14, 0, 0 -}; - -DECLARE_ALIGNED(16, static const int16_t, - mcol_scan_4x4_neighbors[17 * MAX_NEIGHBORS]) = { - 0, 0, 0, 0, 4, 4, 8, 8, 0, 0, 1, 4, 5, 8, 9, 12, 1, - 1, 2, 5, 6, 9, 10, 13, 2, 2, 3, 6, 7, 10, 11, 14, 0, 0, -}; - -DECLARE_ALIGNED(16, static const int16_t, - mrow_scan_4x4_neighbors[17 * MAX_NEIGHBORS]) = { - 0, 0, 0, 0, 1, 1, 2, 2, 0, 0, 1, 4, 2, 5, 3, 6, 4, - 4, 5, 8, 6, 9, 7, 10, 8, 8, 9, 12, 10, 13, 11, 14, 0, 0, -}; - -DECLARE_ALIGNED(16, static const int16_t, - default_scan_4x8_neighbors[33 * MAX_NEIGHBORS]) = { - 0, 0, 0, 0, 0, 0, 1, 4, 1, 1, 4, 4, 2, 5, 5, 8, 6, - 9, 2, 2, 8, 8, 3, 6, 9, 12, 7, 10, 10, 13, 12, 12, 13, 16, - 11, 14, 14, 17, 15, 18, 16, 16, 17, 20, 18, 21, 19, 22, 20, 20, 21, - 24, 22, 25, 23, 26, 24, 24, 25, 28, 26, 29, 27, 30, 0, 0 -}; - -DECLARE_ALIGNED(16, static const int16_t, - mcol_scan_4x8_neighbors[33 * 
MAX_NEIGHBORS]) = { - 0, 0, 0, 0, 4, 4, 8, 8, 12, 12, 16, 16, 20, 20, 24, 24, 0, - 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 1, 1, - 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 2, 2, 3, - 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 0, 0 -}; - -DECLARE_ALIGNED(16, static const int16_t, - mrow_scan_4x8_neighbors[33 * MAX_NEIGHBORS]) = { - 0, 0, 0, 0, 1, 1, 2, 2, 0, 0, 1, 4, 2, 5, 3, 6, 4, - 4, 5, 8, 6, 9, 7, 10, 8, 8, 9, 12, 10, 13, 11, 14, 12, 12, - 13, 16, 14, 17, 15, 18, 16, 16, 17, 20, 18, 21, 19, 22, 20, 20, 21, - 24, 22, 25, 23, 26, 24, 24, 25, 28, 26, 29, 27, 30, 0, 0 -}; - -DECLARE_ALIGNED(16, static const int16_t, - default_scan_8x4_neighbors[33 * MAX_NEIGHBORS]) = { - 0, 0, 0, 0, 0, 0, 1, 8, 1, 1, 8, 8, 2, 9, 9, 16, 10, - 17, 2, 2, 16, 16, 3, 10, 17, 24, 11, 18, 18, 25, 3, 3, 4, 11, - 19, 26, 12, 19, 4, 4, 20, 27, 5, 12, 13, 20, 21, 28, 5, 5, 6, - 13, 14, 21, 22, 29, 6, 6, 7, 14, 15, 22, 23, 30, 0, 0 -}; - -DECLARE_ALIGNED(16, static const int16_t, - mcol_scan_8x4_neighbors[33 * MAX_NEIGHBORS]) = { - 0, 0, 0, 0, 8, 8, 16, 16, 0, 0, 1, 8, 9, 16, 17, 24, 1, - 1, 2, 9, 10, 17, 18, 25, 2, 2, 3, 10, 11, 18, 19, 26, 3, 3, - 4, 11, 12, 19, 20, 27, 4, 4, 5, 12, 13, 20, 21, 28, 5, 5, 6, - 13, 14, 21, 22, 29, 6, 6, 7, 14, 15, 22, 23, 30, 0, 0 -}; - -DECLARE_ALIGNED(16, static const int16_t, - mrow_scan_8x4_neighbors[33 * MAX_NEIGHBORS]) = { - 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 0, - 0, 1, 8, 2, 9, 3, 10, 4, 11, 5, 12, 6, 13, 7, 14, 8, 8, - 9, 16, 10, 17, 11, 18, 12, 19, 13, 20, 14, 21, 15, 22, 16, 16, 17, - 24, 18, 25, 19, 26, 20, 27, 21, 28, 22, 29, 23, 30, 0, 0 -}; - -DECLARE_ALIGNED(16, static const int16_t, - default_scan_4x16_neighbors[65 * MAX_NEIGHBORS]) = { - 0, 0, 0, 0, 0, 0, 1, 4, 1, 1, 4, 4, 2, 5, 5, 8, 6, 9, 2, - 2, 8, 8, 3, 6, 9, 12, 7, 10, 10, 13, 12, 12, 13, 16, 11, 14, 14, 17, - 15, 18, 16, 16, 17, 20, 18, 21, 19, 22, 20, 20, 21, 24, 22, 25, 23, 26, 24, - 24, 25, 28, 26, 29, 27, 30, 28, 28, 29, 32, 30, 33, 31, 34, 
32, 32, 33, 36, - 34, 37, 35, 38, 36, 36, 37, 40, 38, 41, 39, 42, 40, 40, 41, 44, 42, 45, 43, - 46, 44, 44, 45, 48, 46, 49, 47, 50, 48, 48, 49, 52, 50, 53, 51, 54, 52, 52, - 53, 56, 54, 57, 55, 58, 56, 56, 57, 60, 58, 61, 59, 62, 0, 0 -}; - -DECLARE_ALIGNED(16, static const int16_t, - default_scan_16x4_neighbors[65 * MAX_NEIGHBORS]) = { - 0, 0, 0, 0, 0, 0, 1, 16, 1, 1, 16, 16, 2, 17, 17, 32, 18, 33, 2, - 2, 32, 32, 3, 18, 33, 48, 19, 34, 34, 49, 3, 3, 4, 19, 35, 50, 20, 35, - 4, 4, 36, 51, 5, 20, 21, 36, 37, 52, 5, 5, 6, 21, 22, 37, 38, 53, 6, - 6, 7, 22, 23, 38, 39, 54, 7, 7, 8, 23, 24, 39, 40, 55, 8, 8, 9, 24, - 25, 40, 41, 56, 9, 9, 10, 25, 26, 41, 42, 57, 10, 10, 11, 26, 27, 42, 43, - 58, 11, 11, 12, 27, 28, 43, 44, 59, 12, 12, 13, 28, 29, 44, 45, 60, 13, 13, - 14, 29, 30, 45, 46, 61, 14, 14, 15, 30, 31, 46, 47, 62, 0, 0 -}; - -DECLARE_ALIGNED(16, static const int16_t, - mrow_scan_4x16_neighbors[65 * MAX_NEIGHBORS]) = { - 0, 0, 0, 0, 1, 1, 2, 2, 0, 0, 1, 4, 2, 5, 3, 6, 4, 4, 5, - 8, 6, 9, 7, 10, 8, 8, 9, 12, 10, 13, 11, 14, 12, 12, 13, 16, 14, 17, - 15, 18, 16, 16, 17, 20, 18, 21, 19, 22, 20, 20, 21, 24, 22, 25, 23, 26, 24, - 24, 25, 28, 26, 29, 27, 30, 28, 28, 29, 32, 30, 33, 31, 34, 32, 32, 33, 36, - 34, 37, 35, 38, 36, 36, 37, 40, 38, 41, 39, 42, 40, 40, 41, 44, 42, 45, 43, - 46, 44, 44, 45, 48, 46, 49, 47, 50, 48, 48, 49, 52, 50, 53, 51, 54, 52, 52, - 53, 56, 54, 57, 55, 58, 56, 56, 57, 60, 58, 61, 59, 62, 0, 0 -}; - -DECLARE_ALIGNED(16, static const int16_t, - mrow_scan_16x4_neighbors[65 * MAX_NEIGHBORS]) = { - 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, - 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 0, 0, 1, 16, 2, 17, - 3, 18, 4, 19, 5, 20, 6, 21, 7, 22, 8, 23, 9, 24, 10, 25, 11, 26, 12, - 27, 13, 28, 14, 29, 15, 30, 16, 16, 17, 32, 18, 33, 19, 34, 20, 35, 21, 36, - 22, 37, 23, 38, 24, 39, 25, 40, 26, 41, 27, 42, 28, 43, 29, 44, 30, 45, 31, - 46, 32, 32, 33, 48, 34, 49, 35, 50, 36, 51, 37, 52, 38, 53, 39, 54, 40, 55, - 41, 56, 42, 57, 
43, 58, 44, 59, 45, 60, 46, 61, 47, 62, 0, 0 -}; - -DECLARE_ALIGNED(16, static const int16_t, - mcol_scan_4x16_neighbors[65 * MAX_NEIGHBORS]) = { - 0, 0, 0, 0, 4, 4, 8, 8, 12, 12, 16, 16, 20, 20, 24, 24, 28, 28, 32, - 32, 36, 36, 40, 40, 44, 44, 48, 48, 52, 52, 56, 56, 0, 0, 1, 4, 5, 8, - 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29, 32, 33, 36, 37, 40, 41, 44, 45, - 48, 49, 52, 53, 56, 57, 60, 1, 1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, - 22, 25, 26, 29, 30, 33, 34, 37, 38, 41, 42, 45, 46, 49, 50, 53, 54, 57, 58, - 61, 2, 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31, 34, - 35, 38, 39, 42, 43, 46, 47, 50, 51, 54, 55, 58, 59, 62, 0, 0 -}; - -DECLARE_ALIGNED(16, static const int16_t, - mcol_scan_16x4_neighbors[65 * MAX_NEIGHBORS]) = { - 0, 0, 0, 0, 16, 16, 32, 32, 0, 0, 1, 16, 17, 32, 33, 48, 1, 1, 2, - 17, 18, 33, 34, 49, 2, 2, 3, 18, 19, 34, 35, 50, 3, 3, 4, 19, 20, 35, - 36, 51, 4, 4, 5, 20, 21, 36, 37, 52, 5, 5, 6, 21, 22, 37, 38, 53, 6, - 6, 7, 22, 23, 38, 39, 54, 7, 7, 8, 23, 24, 39, 40, 55, 8, 8, 9, 24, - 25, 40, 41, 56, 9, 9, 10, 25, 26, 41, 42, 57, 10, 10, 11, 26, 27, 42, 43, - 58, 11, 11, 12, 27, 28, 43, 44, 59, 12, 12, 13, 28, 29, 44, 45, 60, 13, 13, - 14, 29, 30, 45, 46, 61, 14, 14, 15, 30, 31, 46, 47, 62, 0, 0 -}; - -DECLARE_ALIGNED(16, static const int16_t, - default_scan_8x32_neighbors[257 * MAX_NEIGHBORS]) = { - 0, 0, 0, 0, 0, 0, 1, 1, 1, 8, 8, 8, 2, 2, 2, - 9, 9, 16, 16, 16, 3, 3, 3, 10, 10, 17, 17, 24, 24, 24, - 4, 4, 4, 11, 11, 18, 18, 25, 25, 32, 32, 32, 5, 5, 5, - 12, 12, 19, 19, 26, 26, 33, 33, 40, 40, 40, 6, 6, 6, 13, - 13, 20, 20, 27, 27, 34, 34, 41, 41, 48, 48, 48, 7, 14, 14, - 21, 21, 28, 28, 35, 35, 42, 42, 49, 49, 56, 56, 56, 15, 22, - 22, 29, 29, 36, 36, 43, 43, 50, 50, 57, 57, 64, 64, 64, 23, - 30, 30, 37, 37, 44, 44, 51, 51, 58, 58, 65, 65, 72, 72, 72, - 31, 38, 38, 45, 45, 52, 52, 59, 59, 66, 66, 73, 73, 80, 80, - 80, 39, 46, 46, 53, 53, 60, 60, 67, 67, 74, 74, 81, 81, 88, - 88, 88, 47, 54, 54, 61, 61, 68, 68, 75, 75, 82, 82, 
89, 89, - 96, 96, 96, 55, 62, 62, 69, 69, 76, 76, 83, 83, 90, 90, 97, - 97, 104, 104, 104, 63, 70, 70, 77, 77, 84, 84, 91, 91, 98, 98, - 105, 105, 112, 112, 112, 71, 78, 78, 85, 85, 92, 92, 99, 99, 106, - 106, 113, 113, 120, 120, 120, 79, 86, 86, 93, 93, 100, 100, 107, 107, - 114, 114, 121, 121, 128, 128, 128, 87, 94, 94, 101, 101, 108, 108, 115, - 115, 122, 122, 129, 129, 136, 136, 136, 95, 102, 102, 109, 109, 116, 116, - 123, 123, 130, 130, 137, 137, 144, 144, 144, 103, 110, 110, 117, 117, 124, - 124, 131, 131, 138, 138, 145, 145, 152, 152, 152, 111, 118, 118, 125, 125, - 132, 132, 139, 139, 146, 146, 153, 153, 160, 160, 160, 119, 126, 126, 133, - 133, 140, 140, 147, 147, 154, 154, 161, 161, 168, 168, 168, 127, 134, 134, - 141, 141, 148, 148, 155, 155, 162, 162, 169, 169, 176, 176, 176, 135, 142, - 142, 149, 149, 156, 156, 163, 163, 170, 170, 177, 177, 184, 184, 184, 143, - 150, 150, 157, 157, 164, 164, 171, 171, 178, 178, 185, 185, 192, 192, 192, - 151, 158, 158, 165, 165, 172, 172, 179, 179, 186, 186, 193, 193, 200, 200, - 200, 159, 166, 166, 173, 173, 180, 180, 187, 187, 194, 194, 201, 201, 208, - 208, 208, 167, 174, 174, 181, 181, 188, 188, 195, 195, 202, 202, 209, 209, - 216, 216, 216, 175, 182, 182, 189, 189, 196, 196, 203, 203, 210, 210, 217, - 217, 224, 224, 224, 183, 190, 190, 197, 197, 204, 204, 211, 211, 218, 218, - 225, 225, 232, 232, 232, 191, 198, 198, 205, 205, 212, 212, 219, 219, 226, - 226, 233, 233, 240, 240, 240, 199, 206, 206, 213, 213, 220, 220, 227, 227, - 234, 234, 241, 241, 248, 207, 214, 214, 221, 221, 228, 228, 235, 235, 242, - 242, 249, 215, 222, 222, 229, 229, 236, 236, 243, 243, 250, 223, 230, 230, - 237, 237, 244, 244, 251, 231, 238, 238, 245, 245, 252, 239, 246, 246, 253, - 247, 254, 0, 0 -}; - -DECLARE_ALIGNED(16, static const int16_t, - default_scan_32x8_neighbors[257 * MAX_NEIGHBORS]) = { - 0, 0, 0, 0, 0, 0, 1, 1, 1, 32, 32, 32, 2, 2, 2, - 33, 33, 64, 64, 64, 3, 3, 3, 34, 34, 65, 65, 96, 96, 96, - 4, 4, 4, 35, 35, 66, 66, 97, 97, 
128, 128, 128, 5, 5, 5, - 36, 36, 67, 67, 98, 98, 129, 129, 160, 160, 160, 6, 6, 6, 37, - 37, 68, 68, 99, 99, 130, 130, 161, 161, 192, 192, 192, 7, 7, 7, - 38, 38, 69, 69, 100, 100, 131, 131, 162, 162, 193, 193, 224, 8, 8, - 8, 39, 39, 70, 70, 101, 101, 132, 132, 163, 163, 194, 194, 225, 9, - 9, 9, 40, 40, 71, 71, 102, 102, 133, 133, 164, 164, 195, 195, 226, - 10, 10, 10, 41, 41, 72, 72, 103, 103, 134, 134, 165, 165, 196, 196, - 227, 11, 11, 11, 42, 42, 73, 73, 104, 104, 135, 135, 166, 166, 197, - 197, 228, 12, 12, 12, 43, 43, 74, 74, 105, 105, 136, 136, 167, 167, - 198, 198, 229, 13, 13, 13, 44, 44, 75, 75, 106, 106, 137, 137, 168, - 168, 199, 199, 230, 14, 14, 14, 45, 45, 76, 76, 107, 107, 138, 138, - 169, 169, 200, 200, 231, 15, 15, 15, 46, 46, 77, 77, 108, 108, 139, - 139, 170, 170, 201, 201, 232, 16, 16, 16, 47, 47, 78, 78, 109, 109, - 140, 140, 171, 171, 202, 202, 233, 17, 17, 17, 48, 48, 79, 79, 110, - 110, 141, 141, 172, 172, 203, 203, 234, 18, 18, 18, 49, 49, 80, 80, - 111, 111, 142, 142, 173, 173, 204, 204, 235, 19, 19, 19, 50, 50, 81, - 81, 112, 112, 143, 143, 174, 174, 205, 205, 236, 20, 20, 20, 51, 51, - 82, 82, 113, 113, 144, 144, 175, 175, 206, 206, 237, 21, 21, 21, 52, - 52, 83, 83, 114, 114, 145, 145, 176, 176, 207, 207, 238, 22, 22, 22, - 53, 53, 84, 84, 115, 115, 146, 146, 177, 177, 208, 208, 239, 23, 23, - 23, 54, 54, 85, 85, 116, 116, 147, 147, 178, 178, 209, 209, 240, 24, - 24, 24, 55, 55, 86, 86, 117, 117, 148, 148, 179, 179, 210, 210, 241, - 25, 25, 25, 56, 56, 87, 87, 118, 118, 149, 149, 180, 180, 211, 211, - 242, 26, 26, 26, 57, 57, 88, 88, 119, 119, 150, 150, 181, 181, 212, - 212, 243, 27, 27, 27, 58, 58, 89, 89, 120, 120, 151, 151, 182, 182, - 213, 213, 244, 28, 28, 28, 59, 59, 90, 90, 121, 121, 152, 152, 183, - 183, 214, 214, 245, 29, 29, 29, 60, 60, 91, 91, 122, 122, 153, 153, - 184, 184, 215, 215, 246, 30, 30, 30, 61, 61, 92, 92, 123, 123, 154, - 154, 185, 185, 216, 216, 247, 31, 62, 62, 93, 93, 124, 124, 155, 155, - 186, 186, 217, 
217, 248, 63, 94, 94, 125, 125, 156, 156, 187, 187, 218, - 218, 249, 95, 126, 126, 157, 157, 188, 188, 219, 219, 250, 127, 158, 158, - 189, 189, 220, 220, 251, 159, 190, 190, 221, 221, 252, 191, 222, 222, 253, - 223, 254, 0, 0 -}; - -DECLARE_ALIGNED(16, static const int16_t, - mrow_scan_8x32_neighbors[257 * MAX_NEIGHBORS]) = { - 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, - 6, 0, 0, 1, 8, 2, 9, 3, 10, 4, 11, 5, 12, 6, 13, - 7, 14, 8, 8, 9, 16, 10, 17, 11, 18, 12, 19, 13, 20, 14, - 21, 15, 22, 16, 16, 17, 24, 18, 25, 19, 26, 20, 27, 21, 28, - 22, 29, 23, 30, 24, 24, 25, 32, 26, 33, 27, 34, 28, 35, 29, - 36, 30, 37, 31, 38, 32, 32, 33, 40, 34, 41, 35, 42, 36, 43, - 37, 44, 38, 45, 39, 46, 40, 40, 41, 48, 42, 49, 43, 50, 44, - 51, 45, 52, 46, 53, 47, 54, 48, 48, 49, 56, 50, 57, 51, 58, - 52, 59, 53, 60, 54, 61, 55, 62, 56, 56, 57, 64, 58, 65, 59, - 66, 60, 67, 61, 68, 62, 69, 63, 70, 64, 64, 65, 72, 66, 73, - 67, 74, 68, 75, 69, 76, 70, 77, 71, 78, 72, 72, 73, 80, 74, - 81, 75, 82, 76, 83, 77, 84, 78, 85, 79, 86, 80, 80, 81, 88, - 82, 89, 83, 90, 84, 91, 85, 92, 86, 93, 87, 94, 88, 88, 89, - 96, 90, 97, 91, 98, 92, 99, 93, 100, 94, 101, 95, 102, 96, 96, - 97, 104, 98, 105, 99, 106, 100, 107, 101, 108, 102, 109, 103, 110, 104, - 104, 105, 112, 106, 113, 107, 114, 108, 115, 109, 116, 110, 117, 111, 118, - 112, 112, 113, 120, 114, 121, 115, 122, 116, 123, 117, 124, 118, 125, 119, - 126, 120, 120, 121, 128, 122, 129, 123, 130, 124, 131, 125, 132, 126, 133, - 127, 134, 128, 128, 129, 136, 130, 137, 131, 138, 132, 139, 133, 140, 134, - 141, 135, 142, 136, 136, 137, 144, 138, 145, 139, 146, 140, 147, 141, 148, - 142, 149, 143, 150, 144, 144, 145, 152, 146, 153, 147, 154, 148, 155, 149, - 156, 150, 157, 151, 158, 152, 152, 153, 160, 154, 161, 155, 162, 156, 163, - 157, 164, 158, 165, 159, 166, 160, 160, 161, 168, 162, 169, 163, 170, 164, - 171, 165, 172, 166, 173, 167, 174, 168, 168, 169, 176, 170, 177, 171, 178, - 172, 179, 173, 180, 174, 181, 175, 182, 176, 176, 177, 184, 
178, 185, 179, - 186, 180, 187, 181, 188, 182, 189, 183, 190, 184, 184, 185, 192, 186, 193, - 187, 194, 188, 195, 189, 196, 190, 197, 191, 198, 192, 192, 193, 200, 194, - 201, 195, 202, 196, 203, 197, 204, 198, 205, 199, 206, 200, 200, 201, 208, - 202, 209, 203, 210, 204, 211, 205, 212, 206, 213, 207, 214, 208, 208, 209, - 216, 210, 217, 211, 218, 212, 219, 213, 220, 214, 221, 215, 222, 216, 216, - 217, 224, 218, 225, 219, 226, 220, 227, 221, 228, 222, 229, 223, 230, 224, - 224, 225, 232, 226, 233, 227, 234, 228, 235, 229, 236, 230, 237, 231, 238, - 232, 232, 233, 240, 234, 241, 235, 242, 236, 243, 237, 244, 238, 245, 239, - 246, 240, 240, 241, 248, 242, 249, 243, 250, 244, 251, 245, 252, 246, 253, - 247, 254, 0, 0 -}; - -DECLARE_ALIGNED(16, static const int16_t, - mrow_scan_32x8_neighbors[257 * MAX_NEIGHBORS]) = { - 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, - 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, - 14, 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 19, 20, 20, 21, - 21, 22, 22, 23, 23, 24, 24, 25, 25, 26, 26, 27, 27, 28, 28, - 29, 29, 30, 30, 0, 0, 1, 32, 2, 33, 3, 34, 4, 35, 5, - 36, 6, 37, 7, 38, 8, 39, 9, 40, 10, 41, 11, 42, 12, 43, - 13, 44, 14, 45, 15, 46, 16, 47, 17, 48, 18, 49, 19, 50, 20, - 51, 21, 52, 22, 53, 23, 54, 24, 55, 25, 56, 26, 57, 27, 58, - 28, 59, 29, 60, 30, 61, 31, 62, 32, 32, 33, 64, 34, 65, 35, - 66, 36, 67, 37, 68, 38, 69, 39, 70, 40, 71, 41, 72, 42, 73, - 43, 74, 44, 75, 45, 76, 46, 77, 47, 78, 48, 79, 49, 80, 50, - 81, 51, 82, 52, 83, 53, 84, 54, 85, 55, 86, 56, 87, 57, 88, - 58, 89, 59, 90, 60, 91, 61, 92, 62, 93, 63, 94, 64, 64, 65, - 96, 66, 97, 67, 98, 68, 99, 69, 100, 70, 101, 71, 102, 72, 103, - 73, 104, 74, 105, 75, 106, 76, 107, 77, 108, 78, 109, 79, 110, 80, - 111, 81, 112, 82, 113, 83, 114, 84, 115, 85, 116, 86, 117, 87, 118, - 88, 119, 89, 120, 90, 121, 91, 122, 92, 123, 93, 124, 94, 125, 95, - 126, 96, 96, 97, 128, 98, 129, 99, 130, 100, 131, 101, 132, 102, 133, - 103, 134, 104, 135, 105, 136, 106, 137, 107, 138, 
108, 139, 109, 140, 110, - 141, 111, 142, 112, 143, 113, 144, 114, 145, 115, 146, 116, 147, 117, 148, - 118, 149, 119, 150, 120, 151, 121, 152, 122, 153, 123, 154, 124, 155, 125, - 156, 126, 157, 127, 158, 128, 128, 129, 160, 130, 161, 131, 162, 132, 163, - 133, 164, 134, 165, 135, 166, 136, 167, 137, 168, 138, 169, 139, 170, 140, - 171, 141, 172, 142, 173, 143, 174, 144, 175, 145, 176, 146, 177, 147, 178, - 148, 179, 149, 180, 150, 181, 151, 182, 152, 183, 153, 184, 154, 185, 155, - 186, 156, 187, 157, 188, 158, 189, 159, 190, 160, 160, 161, 192, 162, 193, - 163, 194, 164, 195, 165, 196, 166, 197, 167, 198, 168, 199, 169, 200, 170, - 201, 171, 202, 172, 203, 173, 204, 174, 205, 175, 206, 176, 207, 177, 208, - 178, 209, 179, 210, 180, 211, 181, 212, 182, 213, 183, 214, 184, 215, 185, - 216, 186, 217, 187, 218, 188, 219, 189, 220, 190, 221, 191, 222, 192, 192, - 193, 224, 194, 225, 195, 226, 196, 227, 197, 228, 198, 229, 199, 230, 200, - 231, 201, 232, 202, 233, 203, 234, 204, 235, 205, 236, 206, 237, 207, 238, - 208, 239, 209, 240, 210, 241, 211, 242, 212, 243, 213, 244, 214, 245, 215, - 246, 216, 247, 217, 248, 218, 249, 219, 250, 220, 251, 221, 252, 222, 253, - 223, 254, 0, 0 -}; - -DECLARE_ALIGNED(16, static const int16_t, - mcol_scan_8x32_neighbors[257 * MAX_NEIGHBORS]) = { - 0, 0, 0, 0, 8, 8, 16, 16, 24, 24, 32, 32, 40, 40, 48, - 48, 56, 56, 64, 64, 72, 72, 80, 80, 88, 88, 96, 96, 104, 104, - 112, 112, 120, 120, 128, 128, 136, 136, 144, 144, 152, 152, 160, 160, 168, - 168, 176, 176, 184, 184, 192, 192, 200, 200, 208, 208, 216, 216, 224, 224, - 232, 232, 240, 240, 0, 0, 1, 8, 9, 16, 17, 24, 25, 32, 33, - 40, 41, 48, 49, 56, 57, 64, 65, 72, 73, 80, 81, 88, 89, 96, - 97, 104, 105, 112, 113, 120, 121, 128, 129, 136, 137, 144, 145, 152, 153, - 160, 161, 168, 169, 176, 177, 184, 185, 192, 193, 200, 201, 208, 209, 216, - 217, 224, 225, 232, 233, 240, 241, 248, 1, 1, 2, 9, 10, 17, 18, - 25, 26, 33, 34, 41, 42, 49, 50, 57, 58, 65, 66, 73, 74, 81, - 82, 89, 90, 97, 98, 
105, 106, 113, 114, 121, 122, 129, 130, 137, 138, - 145, 146, 153, 154, 161, 162, 169, 170, 177, 178, 185, 186, 193, 194, 201, - 202, 209, 210, 217, 218, 225, 226, 233, 234, 241, 242, 249, 2, 2, 3, - 10, 11, 18, 19, 26, 27, 34, 35, 42, 43, 50, 51, 58, 59, 66, - 67, 74, 75, 82, 83, 90, 91, 98, 99, 106, 107, 114, 115, 122, 123, - 130, 131, 138, 139, 146, 147, 154, 155, 162, 163, 170, 171, 178, 179, 186, - 187, 194, 195, 202, 203, 210, 211, 218, 219, 226, 227, 234, 235, 242, 243, - 250, 3, 3, 4, 11, 12, 19, 20, 27, 28, 35, 36, 43, 44, 51, - 52, 59, 60, 67, 68, 75, 76, 83, 84, 91, 92, 99, 100, 107, 108, - 115, 116, 123, 124, 131, 132, 139, 140, 147, 148, 155, 156, 163, 164, 171, - 172, 179, 180, 187, 188, 195, 196, 203, 204, 211, 212, 219, 220, 227, 228, - 235, 236, 243, 244, 251, 4, 4, 5, 12, 13, 20, 21, 28, 29, 36, - 37, 44, 45, 52, 53, 60, 61, 68, 69, 76, 77, 84, 85, 92, 93, - 100, 101, 108, 109, 116, 117, 124, 125, 132, 133, 140, 141, 148, 149, 156, - 157, 164, 165, 172, 173, 180, 181, 188, 189, 196, 197, 204, 205, 212, 213, - 220, 221, 228, 229, 236, 237, 244, 245, 252, 5, 5, 6, 13, 14, 21, - 22, 29, 30, 37, 38, 45, 46, 53, 54, 61, 62, 69, 70, 77, 78, - 85, 86, 93, 94, 101, 102, 109, 110, 117, 118, 125, 126, 133, 134, 141, - 142, 149, 150, 157, 158, 165, 166, 173, 174, 181, 182, 189, 190, 197, 198, - 205, 206, 213, 214, 221, 222, 229, 230, 237, 238, 245, 246, 253, 6, 6, - 7, 14, 15, 22, 23, 30, 31, 38, 39, 46, 47, 54, 55, 62, 63, - 70, 71, 78, 79, 86, 87, 94, 95, 102, 103, 110, 111, 118, 119, 126, - 127, 134, 135, 142, 143, 150, 151, 158, 159, 166, 167, 174, 175, 182, 183, - 190, 191, 198, 199, 206, 207, 214, 215, 222, 223, 230, 231, 238, 239, 246, - 247, 254, 0, 0 -}; - -DECLARE_ALIGNED(16, static const int16_t, - mcol_scan_32x8_neighbors[257 * MAX_NEIGHBORS]) = { - 0, 0, 0, 0, 32, 32, 64, 64, 96, 96, 128, 128, 160, 160, 192, 192, - 0, 0, 1, 32, 33, 64, 65, 96, 97, 128, 129, 160, 161, 192, 193, 224, - 1, 1, 2, 33, 34, 65, 66, 97, 98, 129, 130, 161, 162, 193, 194, 
225, - 2, 2, 3, 34, 35, 66, 67, 98, 99, 130, 131, 162, 163, 194, 195, 226, - 3, 3, 4, 35, 36, 67, 68, 99, 100, 131, 132, 163, 164, 195, 196, 227, - 4, 4, 5, 36, 37, 68, 69, 100, 101, 132, 133, 164, 165, 196, 197, 228, - 5, 5, 6, 37, 38, 69, 70, 101, 102, 133, 134, 165, 166, 197, 198, 229, - 6, 6, 7, 38, 39, 70, 71, 102, 103, 134, 135, 166, 167, 198, 199, 230, - 7, 7, 8, 39, 40, 71, 72, 103, 104, 135, 136, 167, 168, 199, 200, 231, - 8, 8, 9, 40, 41, 72, 73, 104, 105, 136, 137, 168, 169, 200, 201, 232, - 9, 9, 10, 41, 42, 73, 74, 105, 106, 137, 138, 169, 170, 201, 202, 233, - 10, 10, 11, 42, 43, 74, 75, 106, 107, 138, 139, 170, 171, 202, 203, 234, - 11, 11, 12, 43, 44, 75, 76, 107, 108, 139, 140, 171, 172, 203, 204, 235, - 12, 12, 13, 44, 45, 76, 77, 108, 109, 140, 141, 172, 173, 204, 205, 236, - 13, 13, 14, 45, 46, 77, 78, 109, 110, 141, 142, 173, 174, 205, 206, 237, - 14, 14, 15, 46, 47, 78, 79, 110, 111, 142, 143, 174, 175, 206, 207, 238, - 15, 15, 16, 47, 48, 79, 80, 111, 112, 143, 144, 175, 176, 207, 208, 239, - 16, 16, 17, 48, 49, 80, 81, 112, 113, 144, 145, 176, 177, 208, 209, 240, - 17, 17, 18, 49, 50, 81, 82, 113, 114, 145, 146, 177, 178, 209, 210, 241, - 18, 18, 19, 50, 51, 82, 83, 114, 115, 146, 147, 178, 179, 210, 211, 242, - 19, 19, 20, 51, 52, 83, 84, 115, 116, 147, 148, 179, 180, 211, 212, 243, - 20, 20, 21, 52, 53, 84, 85, 116, 117, 148, 149, 180, 181, 212, 213, 244, - 21, 21, 22, 53, 54, 85, 86, 117, 118, 149, 150, 181, 182, 213, 214, 245, - 22, 22, 23, 54, 55, 86, 87, 118, 119, 150, 151, 182, 183, 214, 215, 246, - 23, 23, 24, 55, 56, 87, 88, 119, 120, 151, 152, 183, 184, 215, 216, 247, - 24, 24, 25, 56, 57, 88, 89, 120, 121, 152, 153, 184, 185, 216, 217, 248, - 25, 25, 26, 57, 58, 89, 90, 121, 122, 153, 154, 185, 186, 217, 218, 249, - 26, 26, 27, 58, 59, 90, 91, 122, 123, 154, 155, 186, 187, 218, 219, 250, - 27, 27, 28, 59, 60, 91, 92, 123, 124, 155, 156, 187, 188, 219, 220, 251, - 28, 28, 29, 60, 61, 92, 93, 124, 125, 156, 157, 188, 189, 220, 221, 
252, - 29, 29, 30, 61, 62, 93, 94, 125, 126, 157, 158, 189, 190, 221, 222, 253, - 30, 30, 31, 62, 63, 94, 95, 126, 127, 158, 159, 190, 191, 222, 223, 254, - 0, 0 -}; - -DECLARE_ALIGNED(16, static const int16_t, - mcol_scan_8x8_neighbors[65 * MAX_NEIGHBORS]) = { - 0, 0, 0, 0, 8, 8, 16, 16, 24, 24, 32, 32, 40, 40, 48, 48, 0, 0, 1, - 8, 9, 16, 17, 24, 25, 32, 33, 40, 41, 48, 49, 56, 1, 1, 2, 9, 10, 17, - 18, 25, 26, 33, 34, 41, 42, 49, 50, 57, 2, 2, 3, 10, 11, 18, 19, 26, 27, - 34, 35, 42, 43, 50, 51, 58, 3, 3, 4, 11, 12, 19, 20, 27, 28, 35, 36, 43, - 44, 51, 52, 59, 4, 4, 5, 12, 13, 20, 21, 28, 29, 36, 37, 44, 45, 52, 53, - 60, 5, 5, 6, 13, 14, 21, 22, 29, 30, 37, 38, 45, 46, 53, 54, 61, 6, 6, - 7, 14, 15, 22, 23, 30, 31, 38, 39, 46, 47, 54, 55, 62, 0, 0, -}; - -DECLARE_ALIGNED(16, static const int16_t, - mrow_scan_8x8_neighbors[65 * MAX_NEIGHBORS]) = { - 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 0, 0, 1, - 8, 2, 9, 3, 10, 4, 11, 5, 12, 6, 13, 7, 14, 8, 8, 9, 16, 10, 17, - 11, 18, 12, 19, 13, 20, 14, 21, 15, 22, 16, 16, 17, 24, 18, 25, 19, 26, 20, - 27, 21, 28, 22, 29, 23, 30, 24, 24, 25, 32, 26, 33, 27, 34, 28, 35, 29, 36, - 30, 37, 31, 38, 32, 32, 33, 40, 34, 41, 35, 42, 36, 43, 37, 44, 38, 45, 39, - 46, 40, 40, 41, 48, 42, 49, 43, 50, 44, 51, 45, 52, 46, 53, 47, 54, 48, 48, - 49, 56, 50, 57, 51, 58, 52, 59, 53, 60, 54, 61, 55, 62, 0, 0, -}; - -DECLARE_ALIGNED(16, static const int16_t, - default_scan_8x8_neighbors[65 * MAX_NEIGHBORS]) = { - 0, 0, 0, 0, 0, 0, 8, 8, 1, 8, 1, 1, 2, 2, 2, 9, 9, 16, 16, - 16, 24, 24, 17, 24, 10, 17, 3, 10, 3, 3, 4, 4, 4, 11, 11, 18, 18, 25, - 25, 32, 32, 32, 40, 40, 33, 40, 26, 33, 19, 26, 12, 19, 5, 12, 5, 5, 6, - 6, 6, 13, 13, 20, 20, 27, 27, 34, 34, 41, 41, 48, 48, 48, 49, 56, 42, 49, - 35, 42, 28, 35, 21, 28, 14, 21, 7, 14, 15, 22, 22, 29, 29, 36, 36, 43, 43, - 50, 50, 57, 51, 58, 44, 51, 37, 44, 30, 37, 23, 30, 31, 38, 38, 45, 45, 52, - 52, 59, 53, 60, 46, 53, 39, 46, 47, 54, 54, 61, 55, 62, 0, 0 -}; - -DECLARE_ALIGNED(16, 
static const int16_t, - default_scan_8x16_neighbors[129 * MAX_NEIGHBORS]) = { - 0, 0, 0, 0, 0, 0, 1, 1, 1, 8, 8, 8, 2, 2, 2, - 9, 9, 16, 16, 16, 3, 3, 3, 10, 10, 17, 17, 24, 24, 24, - 4, 4, 4, 11, 11, 18, 18, 25, 25, 32, 32, 32, 5, 5, 5, - 12, 12, 19, 19, 26, 26, 33, 33, 40, 40, 40, 6, 6, 6, 13, - 13, 20, 20, 27, 27, 34, 34, 41, 41, 48, 48, 48, 7, 14, 14, - 21, 21, 28, 28, 35, 35, 42, 42, 49, 49, 56, 56, 56, 15, 22, - 22, 29, 29, 36, 36, 43, 43, 50, 50, 57, 57, 64, 64, 64, 23, - 30, 30, 37, 37, 44, 44, 51, 51, 58, 58, 65, 65, 72, 72, 72, - 31, 38, 38, 45, 45, 52, 52, 59, 59, 66, 66, 73, 73, 80, 80, - 80, 39, 46, 46, 53, 53, 60, 60, 67, 67, 74, 74, 81, 81, 88, - 88, 88, 47, 54, 54, 61, 61, 68, 68, 75, 75, 82, 82, 89, 89, - 96, 96, 96, 55, 62, 62, 69, 69, 76, 76, 83, 83, 90, 90, 97, - 97, 104, 104, 104, 63, 70, 70, 77, 77, 84, 84, 91, 91, 98, 98, - 105, 105, 112, 112, 112, 71, 78, 78, 85, 85, 92, 92, 99, 99, 106, - 106, 113, 113, 120, 79, 86, 86, 93, 93, 100, 100, 107, 107, 114, 114, - 121, 87, 94, 94, 101, 101, 108, 108, 115, 115, 122, 95, 102, 102, 109, - 109, 116, 116, 123, 103, 110, 110, 117, 117, 124, 111, 118, 118, 125, 119, - 126, 0, 0 -}; - -DECLARE_ALIGNED(16, static const int16_t, - default_scan_16x8_neighbors[129 * MAX_NEIGHBORS]) = { - 0, 0, 0, 0, 0, 0, 1, 1, 1, 16, 16, 16, 2, 2, 2, - 17, 17, 32, 32, 32, 3, 3, 3, 18, 18, 33, 33, 48, 48, 48, - 4, 4, 4, 19, 19, 34, 34, 49, 49, 64, 64, 64, 5, 5, 5, - 20, 20, 35, 35, 50, 50, 65, 65, 80, 80, 80, 6, 6, 6, 21, - 21, 36, 36, 51, 51, 66, 66, 81, 81, 96, 96, 96, 7, 7, 7, - 22, 22, 37, 37, 52, 52, 67, 67, 82, 82, 97, 97, 112, 8, 8, - 8, 23, 23, 38, 38, 53, 53, 68, 68, 83, 83, 98, 98, 113, 9, - 9, 9, 24, 24, 39, 39, 54, 54, 69, 69, 84, 84, 99, 99, 114, - 10, 10, 10, 25, 25, 40, 40, 55, 55, 70, 70, 85, 85, 100, 100, - 115, 11, 11, 11, 26, 26, 41, 41, 56, 56, 71, 71, 86, 86, 101, - 101, 116, 12, 12, 12, 27, 27, 42, 42, 57, 57, 72, 72, 87, 87, - 102, 102, 117, 13, 13, 13, 28, 28, 43, 43, 58, 58, 73, 73, 88, - 88, 103, 
103, 118, 14, 14, 14, 29, 29, 44, 44, 59, 59, 74, 74, - 89, 89, 104, 104, 119, 15, 30, 30, 45, 45, 60, 60, 75, 75, 90, - 90, 105, 105, 120, 31, 46, 46, 61, 61, 76, 76, 91, 91, 106, 106, - 121, 47, 62, 62, 77, 77, 92, 92, 107, 107, 122, 63, 78, 78, 93, - 93, 108, 108, 123, 79, 94, 94, 109, 109, 124, 95, 110, 110, 125, 111, - 126, 0, 0 -}; - -DECLARE_ALIGNED(16, static const int16_t, - mcol_scan_8x16_neighbors[129 * MAX_NEIGHBORS]) = { - 0, 0, 0, 0, 8, 8, 16, 16, 24, 24, 32, 32, 40, 40, 48, 48, - 56, 56, 64, 64, 72, 72, 80, 80, 88, 88, 96, 96, 104, 104, 112, 112, - 0, 0, 1, 8, 9, 16, 17, 24, 25, 32, 33, 40, 41, 48, 49, 56, - 57, 64, 65, 72, 73, 80, 81, 88, 89, 96, 97, 104, 105, 112, 113, 120, - 1, 1, 2, 9, 10, 17, 18, 25, 26, 33, 34, 41, 42, 49, 50, 57, - 58, 65, 66, 73, 74, 81, 82, 89, 90, 97, 98, 105, 106, 113, 114, 121, - 2, 2, 3, 10, 11, 18, 19, 26, 27, 34, 35, 42, 43, 50, 51, 58, - 59, 66, 67, 74, 75, 82, 83, 90, 91, 98, 99, 106, 107, 114, 115, 122, - 3, 3, 4, 11, 12, 19, 20, 27, 28, 35, 36, 43, 44, 51, 52, 59, - 60, 67, 68, 75, 76, 83, 84, 91, 92, 99, 100, 107, 108, 115, 116, 123, - 4, 4, 5, 12, 13, 20, 21, 28, 29, 36, 37, 44, 45, 52, 53, 60, - 61, 68, 69, 76, 77, 84, 85, 92, 93, 100, 101, 108, 109, 116, 117, 124, - 5, 5, 6, 13, 14, 21, 22, 29, 30, 37, 38, 45, 46, 53, 54, 61, - 62, 69, 70, 77, 78, 85, 86, 93, 94, 101, 102, 109, 110, 117, 118, 125, - 6, 6, 7, 14, 15, 22, 23, 30, 31, 38, 39, 46, 47, 54, 55, 62, - 63, 70, 71, 78, 79, 86, 87, 94, 95, 102, 103, 110, 111, 118, 119, 126, - 0, 0 -}; - -DECLARE_ALIGNED(16, static const int16_t, - mcol_scan_16x8_neighbors[129 * MAX_NEIGHBORS]) = { - 0, 0, 0, 0, 16, 16, 32, 32, 48, 48, 64, 64, 80, 80, 96, 96, - 0, 0, 1, 16, 17, 32, 33, 48, 49, 64, 65, 80, 81, 96, 97, 112, - 1, 1, 2, 17, 18, 33, 34, 49, 50, 65, 66, 81, 82, 97, 98, 113, - 2, 2, 3, 18, 19, 34, 35, 50, 51, 66, 67, 82, 83, 98, 99, 114, - 3, 3, 4, 19, 20, 35, 36, 51, 52, 67, 68, 83, 84, 99, 100, 115, - 4, 4, 5, 20, 21, 36, 37, 52, 53, 68, 69, 84, 85, 100, 101, 
116, - 5, 5, 6, 21, 22, 37, 38, 53, 54, 69, 70, 85, 86, 101, 102, 117, - 6, 6, 7, 22, 23, 38, 39, 54, 55, 70, 71, 86, 87, 102, 103, 118, - 7, 7, 8, 23, 24, 39, 40, 55, 56, 71, 72, 87, 88, 103, 104, 119, - 8, 8, 9, 24, 25, 40, 41, 56, 57, 72, 73, 88, 89, 104, 105, 120, - 9, 9, 10, 25, 26, 41, 42, 57, 58, 73, 74, 89, 90, 105, 106, 121, - 10, 10, 11, 26, 27, 42, 43, 58, 59, 74, 75, 90, 91, 106, 107, 122, - 11, 11, 12, 27, 28, 43, 44, 59, 60, 75, 76, 91, 92, 107, 108, 123, - 12, 12, 13, 28, 29, 44, 45, 60, 61, 76, 77, 92, 93, 108, 109, 124, - 13, 13, 14, 29, 30, 45, 46, 61, 62, 77, 78, 93, 94, 109, 110, 125, - 14, 14, 15, 30, 31, 46, 47, 62, 63, 78, 79, 94, 95, 110, 111, 126, - 0, 0 -}; - -DECLARE_ALIGNED(16, static const int16_t, - mrow_scan_8x16_neighbors[129 * MAX_NEIGHBORS]) = { - 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, - 6, 0, 0, 1, 8, 2, 9, 3, 10, 4, 11, 5, 12, 6, 13, - 7, 14, 8, 8, 9, 16, 10, 17, 11, 18, 12, 19, 13, 20, 14, - 21, 15, 22, 16, 16, 17, 24, 18, 25, 19, 26, 20, 27, 21, 28, - 22, 29, 23, 30, 24, 24, 25, 32, 26, 33, 27, 34, 28, 35, 29, - 36, 30, 37, 31, 38, 32, 32, 33, 40, 34, 41, 35, 42, 36, 43, - 37, 44, 38, 45, 39, 46, 40, 40, 41, 48, 42, 49, 43, 50, 44, - 51, 45, 52, 46, 53, 47, 54, 48, 48, 49, 56, 50, 57, 51, 58, - 52, 59, 53, 60, 54, 61, 55, 62, 56, 56, 57, 64, 58, 65, 59, - 66, 60, 67, 61, 68, 62, 69, 63, 70, 64, 64, 65, 72, 66, 73, - 67, 74, 68, 75, 69, 76, 70, 77, 71, 78, 72, 72, 73, 80, 74, - 81, 75, 82, 76, 83, 77, 84, 78, 85, 79, 86, 80, 80, 81, 88, - 82, 89, 83, 90, 84, 91, 85, 92, 86, 93, 87, 94, 88, 88, 89, - 96, 90, 97, 91, 98, 92, 99, 93, 100, 94, 101, 95, 102, 96, 96, - 97, 104, 98, 105, 99, 106, 100, 107, 101, 108, 102, 109, 103, 110, 104, - 104, 105, 112, 106, 113, 107, 114, 108, 115, 109, 116, 110, 117, 111, 118, - 112, 112, 113, 120, 114, 121, 115, 122, 116, 123, 117, 124, 118, 125, 119, - 126, 0, 0 -}; - -DECLARE_ALIGNED(16, static const int16_t, - mrow_scan_16x8_neighbors[129 * MAX_NEIGHBORS]) = { - 0, 0, 0, 0, 1, 1, 2, 2, 
3, 3, 4, 4, 5, 5, 6, - 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, - 14, 14, 0, 0, 1, 16, 2, 17, 3, 18, 4, 19, 5, 20, 6, - 21, 7, 22, 8, 23, 9, 24, 10, 25, 11, 26, 12, 27, 13, 28, - 14, 29, 15, 30, 16, 16, 17, 32, 18, 33, 19, 34, 20, 35, 21, - 36, 22, 37, 23, 38, 24, 39, 25, 40, 26, 41, 27, 42, 28, 43, - 29, 44, 30, 45, 31, 46, 32, 32, 33, 48, 34, 49, 35, 50, 36, - 51, 37, 52, 38, 53, 39, 54, 40, 55, 41, 56, 42, 57, 43, 58, - 44, 59, 45, 60, 46, 61, 47, 62, 48, 48, 49, 64, 50, 65, 51, - 66, 52, 67, 53, 68, 54, 69, 55, 70, 56, 71, 57, 72, 58, 73, - 59, 74, 60, 75, 61, 76, 62, 77, 63, 78, 64, 64, 65, 80, 66, - 81, 67, 82, 68, 83, 69, 84, 70, 85, 71, 86, 72, 87, 73, 88, - 74, 89, 75, 90, 76, 91, 77, 92, 78, 93, 79, 94, 80, 80, 81, - 96, 82, 97, 83, 98, 84, 99, 85, 100, 86, 101, 87, 102, 88, 103, - 89, 104, 90, 105, 91, 106, 92, 107, 93, 108, 94, 109, 95, 110, 96, - 96, 97, 112, 98, 113, 99, 114, 100, 115, 101, 116, 102, 117, 103, 118, - 104, 119, 105, 120, 106, 121, 107, 122, 108, 123, 109, 124, 110, 125, 111, - 126, 0, 0 -}; - -DECLARE_ALIGNED(16, static const int16_t, - default_scan_16x32_neighbors[513 * MAX_NEIGHBORS]) = { - 0, 0, 0, 0, 0, 0, 1, 1, 1, 16, 16, 16, 2, 2, 2, - 17, 17, 32, 32, 32, 3, 3, 3, 18, 18, 33, 33, 48, 48, 48, - 4, 4, 4, 19, 19, 34, 34, 49, 49, 64, 64, 64, 5, 5, 5, - 20, 20, 35, 35, 50, 50, 65, 65, 80, 80, 80, 6, 6, 6, 21, - 21, 36, 36, 51, 51, 66, 66, 81, 81, 96, 96, 96, 7, 7, 7, - 22, 22, 37, 37, 52, 52, 67, 67, 82, 82, 97, 97, 112, 112, 112, - 8, 8, 8, 23, 23, 38, 38, 53, 53, 68, 68, 83, 83, 98, 98, - 113, 113, 128, 128, 128, 9, 9, 9, 24, 24, 39, 39, 54, 54, 69, - 69, 84, 84, 99, 99, 114, 114, 129, 129, 144, 144, 144, 10, 10, 10, - 25, 25, 40, 40, 55, 55, 70, 70, 85, 85, 100, 100, 115, 115, 130, - 130, 145, 145, 160, 160, 160, 11, 11, 11, 26, 26, 41, 41, 56, 56, - 71, 71, 86, 86, 101, 101, 116, 116, 131, 131, 146, 146, 161, 161, 176, - 176, 176, 12, 12, 12, 27, 27, 42, 42, 57, 57, 72, 72, 87, 87, - 102, 102, 117, 117, 132, 132, 147, 
147, 162, 162, 177, 177, 192, 192, 192, - 13, 13, 13, 28, 28, 43, 43, 58, 58, 73, 73, 88, 88, 103, 103, - 118, 118, 133, 133, 148, 148, 163, 163, 178, 178, 193, 193, 208, 208, 208, - 14, 14, 14, 29, 29, 44, 44, 59, 59, 74, 74, 89, 89, 104, 104, - 119, 119, 134, 134, 149, 149, 164, 164, 179, 179, 194, 194, 209, 209, 224, - 224, 224, 15, 30, 30, 45, 45, 60, 60, 75, 75, 90, 90, 105, 105, - 120, 120, 135, 135, 150, 150, 165, 165, 180, 180, 195, 195, 210, 210, 225, - 225, 240, 240, 240, 31, 46, 46, 61, 61, 76, 76, 91, 91, 106, 106, - 121, 121, 136, 136, 151, 151, 166, 166, 181, 181, 196, 196, 211, 211, 226, - 226, 241, 241, 256, 256, 256, 47, 62, 62, 77, 77, 92, 92, 107, 107, - 122, 122, 137, 137, 152, 152, 167, 167, 182, 182, 197, 197, 212, 212, 227, - 227, 242, 242, 257, 257, 272, 272, 272, 63, 78, 78, 93, 93, 108, 108, - 123, 123, 138, 138, 153, 153, 168, 168, 183, 183, 198, 198, 213, 213, 228, - 228, 243, 243, 258, 258, 273, 273, 288, 288, 288, 79, 94, 94, 109, 109, - 124, 124, 139, 139, 154, 154, 169, 169, 184, 184, 199, 199, 214, 214, 229, - 229, 244, 244, 259, 259, 274, 274, 289, 289, 304, 304, 304, 95, 110, 110, - 125, 125, 140, 140, 155, 155, 170, 170, 185, 185, 200, 200, 215, 215, 230, - 230, 245, 245, 260, 260, 275, 275, 290, 290, 305, 305, 320, 320, 320, 111, - 126, 126, 141, 141, 156, 156, 171, 171, 186, 186, 201, 201, 216, 216, 231, - 231, 246, 246, 261, 261, 276, 276, 291, 291, 306, 306, 321, 321, 336, 336, - 336, 127, 142, 142, 157, 157, 172, 172, 187, 187, 202, 202, 217, 217, 232, - 232, 247, 247, 262, 262, 277, 277, 292, 292, 307, 307, 322, 322, 337, 337, - 352, 352, 352, 143, 158, 158, 173, 173, 188, 188, 203, 203, 218, 218, 233, - 233, 248, 248, 263, 263, 278, 278, 293, 293, 308, 308, 323, 323, 338, 338, - 353, 353, 368, 368, 368, 159, 174, 174, 189, 189, 204, 204, 219, 219, 234, - 234, 249, 249, 264, 264, 279, 279, 294, 294, 309, 309, 324, 324, 339, 339, - 354, 354, 369, 369, 384, 384, 384, 175, 190, 190, 205, 205, 220, 220, 235, - 235, 250, 250, 
265, 265, 280, 280, 295, 295, 310, 310, 325, 325, 340, 340, - 355, 355, 370, 370, 385, 385, 400, 400, 400, 191, 206, 206, 221, 221, 236, - 236, 251, 251, 266, 266, 281, 281, 296, 296, 311, 311, 326, 326, 341, 341, - 356, 356, 371, 371, 386, 386, 401, 401, 416, 416, 416, 207, 222, 222, 237, - 237, 252, 252, 267, 267, 282, 282, 297, 297, 312, 312, 327, 327, 342, 342, - 357, 357, 372, 372, 387, 387, 402, 402, 417, 417, 432, 432, 432, 223, 238, - 238, 253, 253, 268, 268, 283, 283, 298, 298, 313, 313, 328, 328, 343, 343, - 358, 358, 373, 373, 388, 388, 403, 403, 418, 418, 433, 433, 448, 448, 448, - 239, 254, 254, 269, 269, 284, 284, 299, 299, 314, 314, 329, 329, 344, 344, - 359, 359, 374, 374, 389, 389, 404, 404, 419, 419, 434, 434, 449, 449, 464, - 464, 464, 255, 270, 270, 285, 285, 300, 300, 315, 315, 330, 330, 345, 345, - 360, 360, 375, 375, 390, 390, 405, 405, 420, 420, 435, 435, 450, 450, 465, - 465, 480, 480, 480, 271, 286, 286, 301, 301, 316, 316, 331, 331, 346, 346, - 361, 361, 376, 376, 391, 391, 406, 406, 421, 421, 436, 436, 451, 451, 466, - 466, 481, 481, 496, 287, 302, 302, 317, 317, 332, 332, 347, 347, 362, 362, - 377, 377, 392, 392, 407, 407, 422, 422, 437, 437, 452, 452, 467, 467, 482, - 482, 497, 303, 318, 318, 333, 333, 348, 348, 363, 363, 378, 378, 393, 393, - 408, 408, 423, 423, 438, 438, 453, 453, 468, 468, 483, 483, 498, 319, 334, - 334, 349, 349, 364, 364, 379, 379, 394, 394, 409, 409, 424, 424, 439, 439, - 454, 454, 469, 469, 484, 484, 499, 335, 350, 350, 365, 365, 380, 380, 395, - 395, 410, 410, 425, 425, 440, 440, 455, 455, 470, 470, 485, 485, 500, 351, - 366, 366, 381, 381, 396, 396, 411, 411, 426, 426, 441, 441, 456, 456, 471, - 471, 486, 486, 501, 367, 382, 382, 397, 397, 412, 412, 427, 427, 442, 442, - 457, 457, 472, 472, 487, 487, 502, 383, 398, 398, 413, 413, 428, 428, 443, - 443, 458, 458, 473, 473, 488, 488, 503, 399, 414, 414, 429, 429, 444, 444, - 459, 459, 474, 474, 489, 489, 504, 415, 430, 430, 445, 445, 460, 460, 475, - 475, 490, 
490, 505, 431, 446, 446, 461, 461, 476, 476, 491, 491, 506, 447, - 462, 462, 477, 477, 492, 492, 507, 463, 478, 478, 493, 493, 508, 479, 494, - 494, 509, 495, 510, 0, 0 -}; - DECLARE_ALIGNED(16, static const int16_t, - default_scan_32x16_neighbors[513 * MAX_NEIGHBORS]) = { - 0, 0, 0, 0, 0, 0, 1, 1, 1, 32, 32, 32, 2, 2, 2, - 33, 33, 64, 64, 64, 3, 3, 3, 34, 34, 65, 65, 96, 96, 96, - 4, 4, 4, 35, 35, 66, 66, 97, 97, 128, 128, 128, 5, 5, 5, - 36, 36, 67, 67, 98, 98, 129, 129, 160, 160, 160, 6, 6, 6, 37, - 37, 68, 68, 99, 99, 130, 130, 161, 161, 192, 192, 192, 7, 7, 7, - 38, 38, 69, 69, 100, 100, 131, 131, 162, 162, 193, 193, 224, 224, 224, - 8, 8, 8, 39, 39, 70, 70, 101, 101, 132, 132, 163, 163, 194, 194, - 225, 225, 256, 256, 256, 9, 9, 9, 40, 40, 71, 71, 102, 102, 133, - 133, 164, 164, 195, 195, 226, 226, 257, 257, 288, 288, 288, 10, 10, 10, - 41, 41, 72, 72, 103, 103, 134, 134, 165, 165, 196, 196, 227, 227, 258, - 258, 289, 289, 320, 320, 320, 11, 11, 11, 42, 42, 73, 73, 104, 104, - 135, 135, 166, 166, 197, 197, 228, 228, 259, 259, 290, 290, 321, 321, 352, - 352, 352, 12, 12, 12, 43, 43, 74, 74, 105, 105, 136, 136, 167, 167, - 198, 198, 229, 229, 260, 260, 291, 291, 322, 322, 353, 353, 384, 384, 384, - 13, 13, 13, 44, 44, 75, 75, 106, 106, 137, 137, 168, 168, 199, 199, - 230, 230, 261, 261, 292, 292, 323, 323, 354, 354, 385, 385, 416, 416, 416, - 14, 14, 14, 45, 45, 76, 76, 107, 107, 138, 138, 169, 169, 200, 200, - 231, 231, 262, 262, 293, 293, 324, 324, 355, 355, 386, 386, 417, 417, 448, - 448, 448, 15, 15, 15, 46, 46, 77, 77, 108, 108, 139, 139, 170, 170, - 201, 201, 232, 232, 263, 263, 294, 294, 325, 325, 356, 356, 387, 387, 418, - 418, 449, 449, 480, 16, 16, 16, 47, 47, 78, 78, 109, 109, 140, 140, - 171, 171, 202, 202, 233, 233, 264, 264, 295, 295, 326, 326, 357, 357, 388, - 388, 419, 419, 450, 450, 481, 17, 17, 17, 48, 48, 79, 79, 110, 110, - 141, 141, 172, 172, 203, 203, 234, 234, 265, 265, 296, 296, 327, 327, 358, - 358, 389, 389, 420, 420, 451, 451, 482, 
18, 18, 18, 49, 49, 80, 80, - 111, 111, 142, 142, 173, 173, 204, 204, 235, 235, 266, 266, 297, 297, 328, - 328, 359, 359, 390, 390, 421, 421, 452, 452, 483, 19, 19, 19, 50, 50, - 81, 81, 112, 112, 143, 143, 174, 174, 205, 205, 236, 236, 267, 267, 298, - 298, 329, 329, 360, 360, 391, 391, 422, 422, 453, 453, 484, 20, 20, 20, - 51, 51, 82, 82, 113, 113, 144, 144, 175, 175, 206, 206, 237, 237, 268, - 268, 299, 299, 330, 330, 361, 361, 392, 392, 423, 423, 454, 454, 485, 21, - 21, 21, 52, 52, 83, 83, 114, 114, 145, 145, 176, 176, 207, 207, 238, - 238, 269, 269, 300, 300, 331, 331, 362, 362, 393, 393, 424, 424, 455, 455, - 486, 22, 22, 22, 53, 53, 84, 84, 115, 115, 146, 146, 177, 177, 208, - 208, 239, 239, 270, 270, 301, 301, 332, 332, 363, 363, 394, 394, 425, 425, - 456, 456, 487, 23, 23, 23, 54, 54, 85, 85, 116, 116, 147, 147, 178, - 178, 209, 209, 240, 240, 271, 271, 302, 302, 333, 333, 364, 364, 395, 395, - 426, 426, 457, 457, 488, 24, 24, 24, 55, 55, 86, 86, 117, 117, 148, - 148, 179, 179, 210, 210, 241, 241, 272, 272, 303, 303, 334, 334, 365, 365, - 396, 396, 427, 427, 458, 458, 489, 25, 25, 25, 56, 56, 87, 87, 118, - 118, 149, 149, 180, 180, 211, 211, 242, 242, 273, 273, 304, 304, 335, 335, - 366, 366, 397, 397, 428, 428, 459, 459, 490, 26, 26, 26, 57, 57, 88, - 88, 119, 119, 150, 150, 181, 181, 212, 212, 243, 243, 274, 274, 305, 305, - 336, 336, 367, 367, 398, 398, 429, 429, 460, 460, 491, 27, 27, 27, 58, - 58, 89, 89, 120, 120, 151, 151, 182, 182, 213, 213, 244, 244, 275, 275, - 306, 306, 337, 337, 368, 368, 399, 399, 430, 430, 461, 461, 492, 28, 28, - 28, 59, 59, 90, 90, 121, 121, 152, 152, 183, 183, 214, 214, 245, 245, - 276, 276, 307, 307, 338, 338, 369, 369, 400, 400, 431, 431, 462, 462, 493, - 29, 29, 29, 60, 60, 91, 91, 122, 122, 153, 153, 184, 184, 215, 215, - 246, 246, 277, 277, 308, 308, 339, 339, 370, 370, 401, 401, 432, 432, 463, - 463, 494, 30, 30, 30, 61, 61, 92, 92, 123, 123, 154, 154, 185, 185, - 216, 216, 247, 247, 278, 278, 309, 309, 340, 340, 
371, 371, 402, 402, 433, - 433, 464, 464, 495, 31, 62, 62, 93, 93, 124, 124, 155, 155, 186, 186, - 217, 217, 248, 248, 279, 279, 310, 310, 341, 341, 372, 372, 403, 403, 434, - 434, 465, 465, 496, 63, 94, 94, 125, 125, 156, 156, 187, 187, 218, 218, - 249, 249, 280, 280, 311, 311, 342, 342, 373, 373, 404, 404, 435, 435, 466, - 466, 497, 95, 126, 126, 157, 157, 188, 188, 219, 219, 250, 250, 281, 281, - 312, 312, 343, 343, 374, 374, 405, 405, 436, 436, 467, 467, 498, 127, 158, - 158, 189, 189, 220, 220, 251, 251, 282, 282, 313, 313, 344, 344, 375, 375, - 406, 406, 437, 437, 468, 468, 499, 159, 190, 190, 221, 221, 252, 252, 283, - 283, 314, 314, 345, 345, 376, 376, 407, 407, 438, 438, 469, 469, 500, 191, - 222, 222, 253, 253, 284, 284, 315, 315, 346, 346, 377, 377, 408, 408, 439, - 439, 470, 470, 501, 223, 254, 254, 285, 285, 316, 316, 347, 347, 378, 378, - 409, 409, 440, 440, 471, 471, 502, 255, 286, 286, 317, 317, 348, 348, 379, - 379, 410, 410, 441, 441, 472, 472, 503, 287, 318, 318, 349, 349, 380, 380, - 411, 411, 442, 442, 473, 473, 504, 319, 350, 350, 381, 381, 412, 412, 443, - 443, 474, 474, 505, 351, 382, 382, 413, 413, 444, 444, 475, 475, 506, 383, - 414, 414, 445, 445, 476, 476, 507, 415, 446, 446, 477, 477, 508, 447, 478, - 478, 509, 479, 510, 0, 0 -}; - -DECLARE_ALIGNED(16, static const int16_t, - mcol_scan_16x32_neighbors[513 * MAX_NEIGHBORS]) = { - 0, 0, 0, 0, 16, 16, 32, 32, 48, 48, 64, 64, 80, 80, 96, - 96, 112, 112, 128, 128, 144, 144, 160, 160, 176, 176, 192, 192, 208, 208, - 224, 224, 240, 240, 256, 256, 272, 272, 288, 288, 304, 304, 320, 320, 336, - 336, 352, 352, 368, 368, 384, 384, 400, 400, 416, 416, 432, 432, 448, 448, - 464, 464, 480, 480, 0, 0, 1, 16, 17, 32, 33, 48, 49, 64, 65, - 80, 81, 96, 97, 112, 113, 128, 129, 144, 145, 160, 161, 176, 177, 192, - 193, 208, 209, 224, 225, 240, 241, 256, 257, 272, 273, 288, 289, 304, 305, - 320, 321, 336, 337, 352, 353, 368, 369, 384, 385, 400, 401, 416, 417, 432, - 433, 448, 449, 464, 465, 480, 481, 496, 
1, 1, 2, 17, 18, 33, 34, - 49, 50, 65, 66, 81, 82, 97, 98, 113, 114, 129, 130, 145, 146, 161, - 162, 177, 178, 193, 194, 209, 210, 225, 226, 241, 242, 257, 258, 273, 274, - 289, 290, 305, 306, 321, 322, 337, 338, 353, 354, 369, 370, 385, 386, 401, - 402, 417, 418, 433, 434, 449, 450, 465, 466, 481, 482, 497, 2, 2, 3, - 18, 19, 34, 35, 50, 51, 66, 67, 82, 83, 98, 99, 114, 115, 130, - 131, 146, 147, 162, 163, 178, 179, 194, 195, 210, 211, 226, 227, 242, 243, - 258, 259, 274, 275, 290, 291, 306, 307, 322, 323, 338, 339, 354, 355, 370, - 371, 386, 387, 402, 403, 418, 419, 434, 435, 450, 451, 466, 467, 482, 483, - 498, 3, 3, 4, 19, 20, 35, 36, 51, 52, 67, 68, 83, 84, 99, - 100, 115, 116, 131, 132, 147, 148, 163, 164, 179, 180, 195, 196, 211, 212, - 227, 228, 243, 244, 259, 260, 275, 276, 291, 292, 307, 308, 323, 324, 339, - 340, 355, 356, 371, 372, 387, 388, 403, 404, 419, 420, 435, 436, 451, 452, - 467, 468, 483, 484, 499, 4, 4, 5, 20, 21, 36, 37, 52, 53, 68, - 69, 84, 85, 100, 101, 116, 117, 132, 133, 148, 149, 164, 165, 180, 181, - 196, 197, 212, 213, 228, 229, 244, 245, 260, 261, 276, 277, 292, 293, 308, - 309, 324, 325, 340, 341, 356, 357, 372, 373, 388, 389, 404, 405, 420, 421, - 436, 437, 452, 453, 468, 469, 484, 485, 500, 5, 5, 6, 21, 22, 37, - 38, 53, 54, 69, 70, 85, 86, 101, 102, 117, 118, 133, 134, 149, 150, - 165, 166, 181, 182, 197, 198, 213, 214, 229, 230, 245, 246, 261, 262, 277, - 278, 293, 294, 309, 310, 325, 326, 341, 342, 357, 358, 373, 374, 389, 390, - 405, 406, 421, 422, 437, 438, 453, 454, 469, 470, 485, 486, 501, 6, 6, - 7, 22, 23, 38, 39, 54, 55, 70, 71, 86, 87, 102, 103, 118, 119, - 134, 135, 150, 151, 166, 167, 182, 183, 198, 199, 214, 215, 230, 231, 246, - 247, 262, 263, 278, 279, 294, 295, 310, 311, 326, 327, 342, 343, 358, 359, - 374, 375, 390, 391, 406, 407, 422, 423, 438, 439, 454, 455, 470, 471, 486, - 487, 502, 7, 7, 8, 23, 24, 39, 40, 55, 56, 71, 72, 87, 88, - 103, 104, 119, 120, 135, 136, 151, 152, 167, 168, 183, 184, 199, 200, 215, - 
216, 231, 232, 247, 248, 263, 264, 279, 280, 295, 296, 311, 312, 327, 328, - 343, 344, 359, 360, 375, 376, 391, 392, 407, 408, 423, 424, 439, 440, 455, - 456, 471, 472, 487, 488, 503, 8, 8, 9, 24, 25, 40, 41, 56, 57, - 72, 73, 88, 89, 104, 105, 120, 121, 136, 137, 152, 153, 168, 169, 184, - 185, 200, 201, 216, 217, 232, 233, 248, 249, 264, 265, 280, 281, 296, 297, - 312, 313, 328, 329, 344, 345, 360, 361, 376, 377, 392, 393, 408, 409, 424, - 425, 440, 441, 456, 457, 472, 473, 488, 489, 504, 9, 9, 10, 25, 26, - 41, 42, 57, 58, 73, 74, 89, 90, 105, 106, 121, 122, 137, 138, 153, - 154, 169, 170, 185, 186, 201, 202, 217, 218, 233, 234, 249, 250, 265, 266, - 281, 282, 297, 298, 313, 314, 329, 330, 345, 346, 361, 362, 377, 378, 393, - 394, 409, 410, 425, 426, 441, 442, 457, 458, 473, 474, 489, 490, 505, 10, - 10, 11, 26, 27, 42, 43, 58, 59, 74, 75, 90, 91, 106, 107, 122, - 123, 138, 139, 154, 155, 170, 171, 186, 187, 202, 203, 218, 219, 234, 235, - 250, 251, 266, 267, 282, 283, 298, 299, 314, 315, 330, 331, 346, 347, 362, - 363, 378, 379, 394, 395, 410, 411, 426, 427, 442, 443, 458, 459, 474, 475, - 490, 491, 506, 11, 11, 12, 27, 28, 43, 44, 59, 60, 75, 76, 91, - 92, 107, 108, 123, 124, 139, 140, 155, 156, 171, 172, 187, 188, 203, 204, - 219, 220, 235, 236, 251, 252, 267, 268, 283, 284, 299, 300, 315, 316, 331, - 332, 347, 348, 363, 364, 379, 380, 395, 396, 411, 412, 427, 428, 443, 444, - 459, 460, 475, 476, 491, 492, 507, 12, 12, 13, 28, 29, 44, 45, 60, - 61, 76, 77, 92, 93, 108, 109, 124, 125, 140, 141, 156, 157, 172, 173, - 188, 189, 204, 205, 220, 221, 236, 237, 252, 253, 268, 269, 284, 285, 300, - 301, 316, 317, 332, 333, 348, 349, 364, 365, 380, 381, 396, 397, 412, 413, - 428, 429, 444, 445, 460, 461, 476, 477, 492, 493, 508, 13, 13, 14, 29, - 30, 45, 46, 61, 62, 77, 78, 93, 94, 109, 110, 125, 126, 141, 142, - 157, 158, 173, 174, 189, 190, 205, 206, 221, 222, 237, 238, 253, 254, 269, - 270, 285, 286, 301, 302, 317, 318, 333, 334, 349, 350, 365, 366, 381, 382, - 
397, 398, 413, 414, 429, 430, 445, 446, 461, 462, 477, 478, 493, 494, 509, - 14, 14, 15, 30, 31, 46, 47, 62, 63, 78, 79, 94, 95, 110, 111, - 126, 127, 142, 143, 158, 159, 174, 175, 190, 191, 206, 207, 222, 223, 238, - 239, 254, 255, 270, 271, 286, 287, 302, 303, 318, 319, 334, 335, 350, 351, - 366, 367, 382, 383, 398, 399, 414, 415, 430, 431, 446, 447, 462, 463, 478, - 479, 494, 495, 510, 0, 0 -}; - -DECLARE_ALIGNED(16, static const int16_t, - mcol_scan_32x16_neighbors[513 * MAX_NEIGHBORS]) = { - 0, 0, 0, 0, 32, 32, 64, 64, 96, 96, 128, 128, 160, 160, 192, - 192, 224, 224, 256, 256, 288, 288, 320, 320, 352, 352, 384, 384, 416, 416, - 448, 448, 0, 0, 1, 32, 33, 64, 65, 96, 97, 128, 129, 160, 161, - 192, 193, 224, 225, 256, 257, 288, 289, 320, 321, 352, 353, 384, 385, 416, - 417, 448, 449, 480, 1, 1, 2, 33, 34, 65, 66, 97, 98, 129, 130, - 161, 162, 193, 194, 225, 226, 257, 258, 289, 290, 321, 322, 353, 354, 385, - 386, 417, 418, 449, 450, 481, 2, 2, 3, 34, 35, 66, 67, 98, 99, - 130, 131, 162, 163, 194, 195, 226, 227, 258, 259, 290, 291, 322, 323, 354, - 355, 386, 387, 418, 419, 450, 451, 482, 3, 3, 4, 35, 36, 67, 68, - 99, 100, 131, 132, 163, 164, 195, 196, 227, 228, 259, 260, 291, 292, 323, - 324, 355, 356, 387, 388, 419, 420, 451, 452, 483, 4, 4, 5, 36, 37, - 68, 69, 100, 101, 132, 133, 164, 165, 196, 197, 228, 229, 260, 261, 292, - 293, 324, 325, 356, 357, 388, 389, 420, 421, 452, 453, 484, 5, 5, 6, - 37, 38, 69, 70, 101, 102, 133, 134, 165, 166, 197, 198, 229, 230, 261, - 262, 293, 294, 325, 326, 357, 358, 389, 390, 421, 422, 453, 454, 485, 6, - 6, 7, 38, 39, 70, 71, 102, 103, 134, 135, 166, 167, 198, 199, 230, - 231, 262, 263, 294, 295, 326, 327, 358, 359, 390, 391, 422, 423, 454, 455, - 486, 7, 7, 8, 39, 40, 71, 72, 103, 104, 135, 136, 167, 168, 199, - 200, 231, 232, 263, 264, 295, 296, 327, 328, 359, 360, 391, 392, 423, 424, - 455, 456, 487, 8, 8, 9, 40, 41, 72, 73, 104, 105, 136, 137, 168, - 169, 200, 201, 232, 233, 264, 265, 296, 297, 328, 329, 360, 361, 
392, 393, - 424, 425, 456, 457, 488, 9, 9, 10, 41, 42, 73, 74, 105, 106, 137, - 138, 169, 170, 201, 202, 233, 234, 265, 266, 297, 298, 329, 330, 361, 362, - 393, 394, 425, 426, 457, 458, 489, 10, 10, 11, 42, 43, 74, 75, 106, - 107, 138, 139, 170, 171, 202, 203, 234, 235, 266, 267, 298, 299, 330, 331, - 362, 363, 394, 395, 426, 427, 458, 459, 490, 11, 11, 12, 43, 44, 75, - 76, 107, 108, 139, 140, 171, 172, 203, 204, 235, 236, 267, 268, 299, 300, - 331, 332, 363, 364, 395, 396, 427, 428, 459, 460, 491, 12, 12, 13, 44, - 45, 76, 77, 108, 109, 140, 141, 172, 173, 204, 205, 236, 237, 268, 269, - 300, 301, 332, 333, 364, 365, 396, 397, 428, 429, 460, 461, 492, 13, 13, - 14, 45, 46, 77, 78, 109, 110, 141, 142, 173, 174, 205, 206, 237, 238, - 269, 270, 301, 302, 333, 334, 365, 366, 397, 398, 429, 430, 461, 462, 493, - 14, 14, 15, 46, 47, 78, 79, 110, 111, 142, 143, 174, 175, 206, 207, - 238, 239, 270, 271, 302, 303, 334, 335, 366, 367, 398, 399, 430, 431, 462, - 463, 494, 15, 15, 16, 47, 48, 79, 80, 111, 112, 143, 144, 175, 176, - 207, 208, 239, 240, 271, 272, 303, 304, 335, 336, 367, 368, 399, 400, 431, - 432, 463, 464, 495, 16, 16, 17, 48, 49, 80, 81, 112, 113, 144, 145, - 176, 177, 208, 209, 240, 241, 272, 273, 304, 305, 336, 337, 368, 369, 400, - 401, 432, 433, 464, 465, 496, 17, 17, 18, 49, 50, 81, 82, 113, 114, - 145, 146, 177, 178, 209, 210, 241, 242, 273, 274, 305, 306, 337, 338, 369, - 370, 401, 402, 433, 434, 465, 466, 497, 18, 18, 19, 50, 51, 82, 83, - 114, 115, 146, 147, 178, 179, 210, 211, 242, 243, 274, 275, 306, 307, 338, - 339, 370, 371, 402, 403, 434, 435, 466, 467, 498, 19, 19, 20, 51, 52, - 83, 84, 115, 116, 147, 148, 179, 180, 211, 212, 243, 244, 275, 276, 307, - 308, 339, 340, 371, 372, 403, 404, 435, 436, 467, 468, 499, 20, 20, 21, - 52, 53, 84, 85, 116, 117, 148, 149, 180, 181, 212, 213, 244, 245, 276, - 277, 308, 309, 340, 341, 372, 373, 404, 405, 436, 437, 468, 469, 500, 21, - 21, 22, 53, 54, 85, 86, 117, 118, 149, 150, 181, 182, 213, 214, 245, - 
246, 277, 278, 309, 310, 341, 342, 373, 374, 405, 406, 437, 438, 469, 470, - 501, 22, 22, 23, 54, 55, 86, 87, 118, 119, 150, 151, 182, 183, 214, - 215, 246, 247, 278, 279, 310, 311, 342, 343, 374, 375, 406, 407, 438, 439, - 470, 471, 502, 23, 23, 24, 55, 56, 87, 88, 119, 120, 151, 152, 183, - 184, 215, 216, 247, 248, 279, 280, 311, 312, 343, 344, 375, 376, 407, 408, - 439, 440, 471, 472, 503, 24, 24, 25, 56, 57, 88, 89, 120, 121, 152, - 153, 184, 185, 216, 217, 248, 249, 280, 281, 312, 313, 344, 345, 376, 377, - 408, 409, 440, 441, 472, 473, 504, 25, 25, 26, 57, 58, 89, 90, 121, - 122, 153, 154, 185, 186, 217, 218, 249, 250, 281, 282, 313, 314, 345, 346, - 377, 378, 409, 410, 441, 442, 473, 474, 505, 26, 26, 27, 58, 59, 90, - 91, 122, 123, 154, 155, 186, 187, 218, 219, 250, 251, 282, 283, 314, 315, - 346, 347, 378, 379, 410, 411, 442, 443, 474, 475, 506, 27, 27, 28, 59, - 60, 91, 92, 123, 124, 155, 156, 187, 188, 219, 220, 251, 252, 283, 284, - 315, 316, 347, 348, 379, 380, 411, 412, 443, 444, 475, 476, 507, 28, 28, - 29, 60, 61, 92, 93, 124, 125, 156, 157, 188, 189, 220, 221, 252, 253, - 284, 285, 316, 317, 348, 349, 380, 381, 412, 413, 444, 445, 476, 477, 508, - 29, 29, 30, 61, 62, 93, 94, 125, 126, 157, 158, 189, 190, 221, 222, - 253, 254, 285, 286, 317, 318, 349, 350, 381, 382, 413, 414, 445, 446, 477, - 478, 509, 30, 30, 31, 62, 63, 94, 95, 126, 127, 158, 159, 190, 191, - 222, 223, 254, 255, 286, 287, 318, 319, 350, 351, 382, 383, 414, 415, 446, - 447, 478, 479, 510, 0, 0 -}; - -DECLARE_ALIGNED(16, static const int16_t, - mrow_scan_16x32_neighbors[513 * MAX_NEIGHBORS]) = { - 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, - 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, - 14, 14, 0, 0, 1, 16, 2, 17, 3, 18, 4, 19, 5, 20, 6, - 21, 7, 22, 8, 23, 9, 24, 10, 25, 11, 26, 12, 27, 13, 28, - 14, 29, 15, 30, 16, 16, 17, 32, 18, 33, 19, 34, 20, 35, 21, - 36, 22, 37, 23, 38, 24, 39, 25, 40, 26, 41, 27, 42, 28, 43, - 29, 44, 30, 45, 31, 46, 32, 32, 33, 48, 34, 49, 35, 
50, 36, - 51, 37, 52, 38, 53, 39, 54, 40, 55, 41, 56, 42, 57, 43, 58, - 44, 59, 45, 60, 46, 61, 47, 62, 48, 48, 49, 64, 50, 65, 51, - 66, 52, 67, 53, 68, 54, 69, 55, 70, 56, 71, 57, 72, 58, 73, - 59, 74, 60, 75, 61, 76, 62, 77, 63, 78, 64, 64, 65, 80, 66, - 81, 67, 82, 68, 83, 69, 84, 70, 85, 71, 86, 72, 87, 73, 88, - 74, 89, 75, 90, 76, 91, 77, 92, 78, 93, 79, 94, 80, 80, 81, - 96, 82, 97, 83, 98, 84, 99, 85, 100, 86, 101, 87, 102, 88, 103, - 89, 104, 90, 105, 91, 106, 92, 107, 93, 108, 94, 109, 95, 110, 96, - 96, 97, 112, 98, 113, 99, 114, 100, 115, 101, 116, 102, 117, 103, 118, - 104, 119, 105, 120, 106, 121, 107, 122, 108, 123, 109, 124, 110, 125, 111, - 126, 112, 112, 113, 128, 114, 129, 115, 130, 116, 131, 117, 132, 118, 133, - 119, 134, 120, 135, 121, 136, 122, 137, 123, 138, 124, 139, 125, 140, 126, - 141, 127, 142, 128, 128, 129, 144, 130, 145, 131, 146, 132, 147, 133, 148, - 134, 149, 135, 150, 136, 151, 137, 152, 138, 153, 139, 154, 140, 155, 141, - 156, 142, 157, 143, 158, 144, 144, 145, 160, 146, 161, 147, 162, 148, 163, - 149, 164, 150, 165, 151, 166, 152, 167, 153, 168, 154, 169, 155, 170, 156, - 171, 157, 172, 158, 173, 159, 174, 160, 160, 161, 176, 162, 177, 163, 178, - 164, 179, 165, 180, 166, 181, 167, 182, 168, 183, 169, 184, 170, 185, 171, - 186, 172, 187, 173, 188, 174, 189, 175, 190, 176, 176, 177, 192, 178, 193, - 179, 194, 180, 195, 181, 196, 182, 197, 183, 198, 184, 199, 185, 200, 186, - 201, 187, 202, 188, 203, 189, 204, 190, 205, 191, 206, 192, 192, 193, 208, - 194, 209, 195, 210, 196, 211, 197, 212, 198, 213, 199, 214, 200, 215, 201, - 216, 202, 217, 203, 218, 204, 219, 205, 220, 206, 221, 207, 222, 208, 208, - 209, 224, 210, 225, 211, 226, 212, 227, 213, 228, 214, 229, 215, 230, 216, - 231, 217, 232, 218, 233, 219, 234, 220, 235, 221, 236, 222, 237, 223, 238, - 224, 224, 225, 240, 226, 241, 227, 242, 228, 243, 229, 244, 230, 245, 231, - 246, 232, 247, 233, 248, 234, 249, 235, 250, 236, 251, 237, 252, 238, 253, - 239, 254, 240, 240, 
241, 256, 242, 257, 243, 258, 244, 259, 245, 260, 246, - 261, 247, 262, 248, 263, 249, 264, 250, 265, 251, 266, 252, 267, 253, 268, - 254, 269, 255, 270, 256, 256, 257, 272, 258, 273, 259, 274, 260, 275, 261, - 276, 262, 277, 263, 278, 264, 279, 265, 280, 266, 281, 267, 282, 268, 283, - 269, 284, 270, 285, 271, 286, 272, 272, 273, 288, 274, 289, 275, 290, 276, - 291, 277, 292, 278, 293, 279, 294, 280, 295, 281, 296, 282, 297, 283, 298, - 284, 299, 285, 300, 286, 301, 287, 302, 288, 288, 289, 304, 290, 305, 291, - 306, 292, 307, 293, 308, 294, 309, 295, 310, 296, 311, 297, 312, 298, 313, - 299, 314, 300, 315, 301, 316, 302, 317, 303, 318, 304, 304, 305, 320, 306, - 321, 307, 322, 308, 323, 309, 324, 310, 325, 311, 326, 312, 327, 313, 328, - 314, 329, 315, 330, 316, 331, 317, 332, 318, 333, 319, 334, 320, 320, 321, - 336, 322, 337, 323, 338, 324, 339, 325, 340, 326, 341, 327, 342, 328, 343, - 329, 344, 330, 345, 331, 346, 332, 347, 333, 348, 334, 349, 335, 350, 336, - 336, 337, 352, 338, 353, 339, 354, 340, 355, 341, 356, 342, 357, 343, 358, - 344, 359, 345, 360, 346, 361, 347, 362, 348, 363, 349, 364, 350, 365, 351, - 366, 352, 352, 353, 368, 354, 369, 355, 370, 356, 371, 357, 372, 358, 373, - 359, 374, 360, 375, 361, 376, 362, 377, 363, 378, 364, 379, 365, 380, 366, - 381, 367, 382, 368, 368, 369, 384, 370, 385, 371, 386, 372, 387, 373, 388, - 374, 389, 375, 390, 376, 391, 377, 392, 378, 393, 379, 394, 380, 395, 381, - 396, 382, 397, 383, 398, 384, 384, 385, 400, 386, 401, 387, 402, 388, 403, - 389, 404, 390, 405, 391, 406, 392, 407, 393, 408, 394, 409, 395, 410, 396, - 411, 397, 412, 398, 413, 399, 414, 400, 400, 401, 416, 402, 417, 403, 418, - 404, 419, 405, 420, 406, 421, 407, 422, 408, 423, 409, 424, 410, 425, 411, - 426, 412, 427, 413, 428, 414, 429, 415, 430, 416, 416, 417, 432, 418, 433, - 419, 434, 420, 435, 421, 436, 422, 437, 423, 438, 424, 439, 425, 440, 426, - 441, 427, 442, 428, 443, 429, 444, 430, 445, 431, 446, 432, 432, 433, 448, - 434, 449, 435, 
450, 436, 451, 437, 452, 438, 453, 439, 454, 440, 455, 441, - 456, 442, 457, 443, 458, 444, 459, 445, 460, 446, 461, 447, 462, 448, 448, - 449, 464, 450, 465, 451, 466, 452, 467, 453, 468, 454, 469, 455, 470, 456, - 471, 457, 472, 458, 473, 459, 474, 460, 475, 461, 476, 462, 477, 463, 478, - 464, 464, 465, 480, 466, 481, 467, 482, 468, 483, 469, 484, 470, 485, 471, - 486, 472, 487, 473, 488, 474, 489, 475, 490, 476, 491, 477, 492, 478, 493, - 479, 494, 480, 480, 481, 496, 482, 497, 483, 498, 484, 499, 485, 500, 486, - 501, 487, 502, 488, 503, 489, 504, 490, 505, 491, 506, 492, 507, 493, 508, - 494, 509, 495, 510, 0, 0 -}; - -DECLARE_ALIGNED(16, static const int16_t, - mrow_scan_32x16_neighbors[513 * MAX_NEIGHBORS]) = { - 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, - 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, - 14, 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 19, 20, 20, 21, - 21, 22, 22, 23, 23, 24, 24, 25, 25, 26, 26, 27, 27, 28, 28, - 29, 29, 30, 30, 0, 0, 1, 32, 2, 33, 3, 34, 4, 35, 5, - 36, 6, 37, 7, 38, 8, 39, 9, 40, 10, 41, 11, 42, 12, 43, - 13, 44, 14, 45, 15, 46, 16, 47, 17, 48, 18, 49, 19, 50, 20, - 51, 21, 52, 22, 53, 23, 54, 24, 55, 25, 56, 26, 57, 27, 58, - 28, 59, 29, 60, 30, 61, 31, 62, 32, 32, 33, 64, 34, 65, 35, - 66, 36, 67, 37, 68, 38, 69, 39, 70, 40, 71, 41, 72, 42, 73, - 43, 74, 44, 75, 45, 76, 46, 77, 47, 78, 48, 79, 49, 80, 50, - 81, 51, 82, 52, 83, 53, 84, 54, 85, 55, 86, 56, 87, 57, 88, - 58, 89, 59, 90, 60, 91, 61, 92, 62, 93, 63, 94, 64, 64, 65, - 96, 66, 97, 67, 98, 68, 99, 69, 100, 70, 101, 71, 102, 72, 103, - 73, 104, 74, 105, 75, 106, 76, 107, 77, 108, 78, 109, 79, 110, 80, - 111, 81, 112, 82, 113, 83, 114, 84, 115, 85, 116, 86, 117, 87, 118, - 88, 119, 89, 120, 90, 121, 91, 122, 92, 123, 93, 124, 94, 125, 95, - 126, 96, 96, 97, 128, 98, 129, 99, 130, 100, 131, 101, 132, 102, 133, - 103, 134, 104, 135, 105, 136, 106, 137, 107, 138, 108, 139, 109, 140, 110, - 141, 111, 142, 112, 143, 113, 144, 114, 145, 115, 146, 116, 147, 117, 
148, - 118, 149, 119, 150, 120, 151, 121, 152, 122, 153, 123, 154, 124, 155, 125, - 156, 126, 157, 127, 158, 128, 128, 129, 160, 130, 161, 131, 162, 132, 163, - 133, 164, 134, 165, 135, 166, 136, 167, 137, 168, 138, 169, 139, 170, 140, - 171, 141, 172, 142, 173, 143, 174, 144, 175, 145, 176, 146, 177, 147, 178, - 148, 179, 149, 180, 150, 181, 151, 182, 152, 183, 153, 184, 154, 185, 155, - 186, 156, 187, 157, 188, 158, 189, 159, 190, 160, 160, 161, 192, 162, 193, - 163, 194, 164, 195, 165, 196, 166, 197, 167, 198, 168, 199, 169, 200, 170, - 201, 171, 202, 172, 203, 173, 204, 174, 205, 175, 206, 176, 207, 177, 208, - 178, 209, 179, 210, 180, 211, 181, 212, 182, 213, 183, 214, 184, 215, 185, - 216, 186, 217, 187, 218, 188, 219, 189, 220, 190, 221, 191, 222, 192, 192, - 193, 224, 194, 225, 195, 226, 196, 227, 197, 228, 198, 229, 199, 230, 200, - 231, 201, 232, 202, 233, 203, 234, 204, 235, 205, 236, 206, 237, 207, 238, - 208, 239, 209, 240, 210, 241, 211, 242, 212, 243, 213, 244, 214, 245, 215, - 246, 216, 247, 217, 248, 218, 249, 219, 250, 220, 251, 221, 252, 222, 253, - 223, 254, 224, 224, 225, 256, 226, 257, 227, 258, 228, 259, 229, 260, 230, - 261, 231, 262, 232, 263, 233, 264, 234, 265, 235, 266, 236, 267, 237, 268, - 238, 269, 239, 270, 240, 271, 241, 272, 242, 273, 243, 274, 244, 275, 245, - 276, 246, 277, 247, 278, 248, 279, 249, 280, 250, 281, 251, 282, 252, 283, - 253, 284, 254, 285, 255, 286, 256, 256, 257, 288, 258, 289, 259, 290, 260, - 291, 261, 292, 262, 293, 263, 294, 264, 295, 265, 296, 266, 297, 267, 298, - 268, 299, 269, 300, 270, 301, 271, 302, 272, 303, 273, 304, 274, 305, 275, - 306, 276, 307, 277, 308, 278, 309, 279, 310, 280, 311, 281, 312, 282, 313, - 283, 314, 284, 315, 285, 316, 286, 317, 287, 318, 288, 288, 289, 320, 290, - 321, 291, 322, 292, 323, 293, 324, 294, 325, 295, 326, 296, 327, 297, 328, - 298, 329, 299, 330, 300, 331, 301, 332, 302, 333, 303, 334, 304, 335, 305, - 336, 306, 337, 307, 338, 308, 339, 309, 340, 310, 341, 311, 342, 
312, 343, - 313, 344, 314, 345, 315, 346, 316, 347, 317, 348, 318, 349, 319, 350, 320, - 320, 321, 352, 322, 353, 323, 354, 324, 355, 325, 356, 326, 357, 327, 358, - 328, 359, 329, 360, 330, 361, 331, 362, 332, 363, 333, 364, 334, 365, 335, - 366, 336, 367, 337, 368, 338, 369, 339, 370, 340, 371, 341, 372, 342, 373, - 343, 374, 344, 375, 345, 376, 346, 377, 347, 378, 348, 379, 349, 380, 350, - 381, 351, 382, 352, 352, 353, 384, 354, 385, 355, 386, 356, 387, 357, 388, - 358, 389, 359, 390, 360, 391, 361, 392, 362, 393, 363, 394, 364, 395, 365, - 396, 366, 397, 367, 398, 368, 399, 369, 400, 370, 401, 371, 402, 372, 403, - 373, 404, 374, 405, 375, 406, 376, 407, 377, 408, 378, 409, 379, 410, 380, - 411, 381, 412, 382, 413, 383, 414, 384, 384, 385, 416, 386, 417, 387, 418, - 388, 419, 389, 420, 390, 421, 391, 422, 392, 423, 393, 424, 394, 425, 395, - 426, 396, 427, 397, 428, 398, 429, 399, 430, 400, 431, 401, 432, 402, 433, - 403, 434, 404, 435, 405, 436, 406, 437, 407, 438, 408, 439, 409, 440, 410, - 441, 411, 442, 412, 443, 413, 444, 414, 445, 415, 446, 416, 416, 417, 448, - 418, 449, 419, 450, 420, 451, 421, 452, 422, 453, 423, 454, 424, 455, 425, - 456, 426, 457, 427, 458, 428, 459, 429, 460, 430, 461, 431, 462, 432, 463, - 433, 464, 434, 465, 435, 466, 436, 467, 437, 468, 438, 469, 439, 470, 440, - 471, 441, 472, 442, 473, 443, 474, 444, 475, 445, 476, 446, 477, 447, 478, - 448, 448, 449, 480, 450, 481, 451, 482, 452, 483, 453, 484, 454, 485, 455, - 486, 456, 487, 457, 488, 458, 489, 459, 490, 460, 491, 461, 492, 462, 493, - 463, 494, 464, 495, 465, 496, 466, 497, 467, 498, 468, 499, 469, 500, 470, - 501, 471, 502, 472, 503, 473, 504, 474, 505, 475, 506, 476, 507, 477, 508, - 478, 509, 479, 510, 0, 0 -}; - -DECLARE_ALIGNED(16, static const int16_t, - mcol_scan_16x16_neighbors[257 * MAX_NEIGHBORS]) = { - 0, 0, 0, 0, 16, 16, 32, 32, 48, 48, 64, 64, 80, 80, 96, - 96, 112, 112, 128, 128, 144, 144, 160, 160, 176, 176, 192, 192, 208, 208, - 224, 224, 0, 0, 1, 16, 17, 
32, 33, 48, 49, 64, 65, 80, 81, - 96, 97, 112, 113, 128, 129, 144, 145, 160, 161, 176, 177, 192, 193, 208, - 209, 224, 225, 240, 1, 1, 2, 17, 18, 33, 34, 49, 50, 65, 66, - 81, 82, 97, 98, 113, 114, 129, 130, 145, 146, 161, 162, 177, 178, 193, - 194, 209, 210, 225, 226, 241, 2, 2, 3, 18, 19, 34, 35, 50, 51, - 66, 67, 82, 83, 98, 99, 114, 115, 130, 131, 146, 147, 162, 163, 178, - 179, 194, 195, 210, 211, 226, 227, 242, 3, 3, 4, 19, 20, 35, 36, - 51, 52, 67, 68, 83, 84, 99, 100, 115, 116, 131, 132, 147, 148, 163, - 164, 179, 180, 195, 196, 211, 212, 227, 228, 243, 4, 4, 5, 20, 21, - 36, 37, 52, 53, 68, 69, 84, 85, 100, 101, 116, 117, 132, 133, 148, - 149, 164, 165, 180, 181, 196, 197, 212, 213, 228, 229, 244, 5, 5, 6, - 21, 22, 37, 38, 53, 54, 69, 70, 85, 86, 101, 102, 117, 118, 133, - 134, 149, 150, 165, 166, 181, 182, 197, 198, 213, 214, 229, 230, 245, 6, - 6, 7, 22, 23, 38, 39, 54, 55, 70, 71, 86, 87, 102, 103, 118, - 119, 134, 135, 150, 151, 166, 167, 182, 183, 198, 199, 214, 215, 230, 231, - 246, 7, 7, 8, 23, 24, 39, 40, 55, 56, 71, 72, 87, 88, 103, - 104, 119, 120, 135, 136, 151, 152, 167, 168, 183, 184, 199, 200, 215, 216, - 231, 232, 247, 8, 8, 9, 24, 25, 40, 41, 56, 57, 72, 73, 88, - 89, 104, 105, 120, 121, 136, 137, 152, 153, 168, 169, 184, 185, 200, 201, - 216, 217, 232, 233, 248, 9, 9, 10, 25, 26, 41, 42, 57, 58, 73, - 74, 89, 90, 105, 106, 121, 122, 137, 138, 153, 154, 169, 170, 185, 186, - 201, 202, 217, 218, 233, 234, 249, 10, 10, 11, 26, 27, 42, 43, 58, - 59, 74, 75, 90, 91, 106, 107, 122, 123, 138, 139, 154, 155, 170, 171, - 186, 187, 202, 203, 218, 219, 234, 235, 250, 11, 11, 12, 27, 28, 43, - 44, 59, 60, 75, 76, 91, 92, 107, 108, 123, 124, 139, 140, 155, 156, - 171, 172, 187, 188, 203, 204, 219, 220, 235, 236, 251, 12, 12, 13, 28, - 29, 44, 45, 60, 61, 76, 77, 92, 93, 108, 109, 124, 125, 140, 141, - 156, 157, 172, 173, 188, 189, 204, 205, 220, 221, 236, 237, 252, 13, 13, - 14, 29, 30, 45, 46, 61, 62, 77, 78, 93, 94, 109, 110, 125, 126, - 141, 142, 
157, 158, 173, 174, 189, 190, 205, 206, 221, 222, 237, 238, 253, - 14, 14, 15, 30, 31, 46, 47, 62, 63, 78, 79, 94, 95, 110, 111, - 126, 127, 142, 143, 158, 159, 174, 175, 190, 191, 206, 207, 222, 223, 238, - 239, 254, 0, 0, -}; - -DECLARE_ALIGNED(16, static const int16_t, - mrow_scan_16x16_neighbors[257 * MAX_NEIGHBORS]) = { - 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, - 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, - 14, 14, 0, 0, 1, 16, 2, 17, 3, 18, 4, 19, 5, 20, 6, - 21, 7, 22, 8, 23, 9, 24, 10, 25, 11, 26, 12, 27, 13, 28, - 14, 29, 15, 30, 16, 16, 17, 32, 18, 33, 19, 34, 20, 35, 21, - 36, 22, 37, 23, 38, 24, 39, 25, 40, 26, 41, 27, 42, 28, 43, - 29, 44, 30, 45, 31, 46, 32, 32, 33, 48, 34, 49, 35, 50, 36, - 51, 37, 52, 38, 53, 39, 54, 40, 55, 41, 56, 42, 57, 43, 58, - 44, 59, 45, 60, 46, 61, 47, 62, 48, 48, 49, 64, 50, 65, 51, - 66, 52, 67, 53, 68, 54, 69, 55, 70, 56, 71, 57, 72, 58, 73, - 59, 74, 60, 75, 61, 76, 62, 77, 63, 78, 64, 64, 65, 80, 66, - 81, 67, 82, 68, 83, 69, 84, 70, 85, 71, 86, 72, 87, 73, 88, - 74, 89, 75, 90, 76, 91, 77, 92, 78, 93, 79, 94, 80, 80, 81, - 96, 82, 97, 83, 98, 84, 99, 85, 100, 86, 101, 87, 102, 88, 103, - 89, 104, 90, 105, 91, 106, 92, 107, 93, 108, 94, 109, 95, 110, 96, - 96, 97, 112, 98, 113, 99, 114, 100, 115, 101, 116, 102, 117, 103, 118, - 104, 119, 105, 120, 106, 121, 107, 122, 108, 123, 109, 124, 110, 125, 111, - 126, 112, 112, 113, 128, 114, 129, 115, 130, 116, 131, 117, 132, 118, 133, - 119, 134, 120, 135, 121, 136, 122, 137, 123, 138, 124, 139, 125, 140, 126, - 141, 127, 142, 128, 128, 129, 144, 130, 145, 131, 146, 132, 147, 133, 148, - 134, 149, 135, 150, 136, 151, 137, 152, 138, 153, 139, 154, 140, 155, 141, - 156, 142, 157, 143, 158, 144, 144, 145, 160, 146, 161, 147, 162, 148, 163, - 149, 164, 150, 165, 151, 166, 152, 167, 153, 168, 154, 169, 155, 170, 156, - 171, 157, 172, 158, 173, 159, 174, 160, 160, 161, 176, 162, 177, 163, 178, - 164, 179, 165, 180, 166, 181, 167, 182, 168, 183, 169, 184, 170, 185, 
171, - 186, 172, 187, 173, 188, 174, 189, 175, 190, 176, 176, 177, 192, 178, 193, - 179, 194, 180, 195, 181, 196, 182, 197, 183, 198, 184, 199, 185, 200, 186, - 201, 187, 202, 188, 203, 189, 204, 190, 205, 191, 206, 192, 192, 193, 208, - 194, 209, 195, 210, 196, 211, 197, 212, 198, 213, 199, 214, 200, 215, 201, - 216, 202, 217, 203, 218, 204, 219, 205, 220, 206, 221, 207, 222, 208, 208, - 209, 224, 210, 225, 211, 226, 212, 227, 213, 228, 214, 229, 215, 230, 216, - 231, 217, 232, 218, 233, 219, 234, 220, 235, 221, 236, 222, 237, 223, 238, - 224, 224, 225, 240, 226, 241, 227, 242, 228, 243, 229, 244, 230, 245, 231, - 246, 232, 247, 233, 248, 234, 249, 235, 250, 236, 251, 237, 252, 238, 253, - 239, 254, 0, 0, -}; - -DECLARE_ALIGNED(16, static const int16_t, - default_scan_16x16_neighbors[257 * MAX_NEIGHBORS]) = { - 0, 0, 0, 0, 0, 0, 16, 16, 1, 16, 1, 1, 2, 2, 2, - 17, 17, 32, 32, 32, 48, 48, 33, 48, 18, 33, 3, 18, 3, 3, - 4, 4, 4, 19, 19, 34, 34, 49, 49, 64, 64, 64, 80, 80, 65, - 80, 50, 65, 35, 50, 20, 35, 5, 20, 5, 5, 6, 6, 6, 21, - 21, 36, 36, 51, 51, 66, 66, 81, 81, 96, 96, 96, 112, 112, 97, - 112, 82, 97, 67, 82, 52, 67, 37, 52, 22, 37, 7, 22, 7, 7, - 8, 8, 8, 23, 23, 38, 38, 53, 53, 68, 68, 83, 83, 98, 98, - 113, 113, 128, 128, 128, 144, 144, 129, 144, 114, 129, 99, 114, 84, 99, - 69, 84, 54, 69, 39, 54, 24, 39, 9, 24, 9, 9, 10, 10, 10, - 25, 25, 40, 40, 55, 55, 70, 70, 85, 85, 100, 100, 115, 115, 130, - 130, 145, 145, 160, 160, 160, 176, 176, 161, 176, 146, 161, 131, 146, 116, - 131, 101, 116, 86, 101, 71, 86, 56, 71, 41, 56, 26, 41, 11, 26, - 11, 11, 12, 12, 12, 27, 27, 42, 42, 57, 57, 72, 72, 87, 87, - 102, 102, 117, 117, 132, 132, 147, 147, 162, 162, 177, 177, 192, 192, 192, - 208, 208, 193, 208, 178, 193, 163, 178, 148, 163, 133, 148, 118, 133, 103, - 118, 88, 103, 73, 88, 58, 73, 43, 58, 28, 43, 13, 28, 13, 13, - 14, 14, 14, 29, 29, 44, 44, 59, 59, 74, 74, 89, 89, 104, 104, - 119, 119, 134, 134, 149, 149, 164, 164, 179, 179, 194, 194, 209, 209, 224, - 224, 
224, 225, 240, 210, 225, 195, 210, 180, 195, 165, 180, 150, 165, 135, - 150, 120, 135, 105, 120, 90, 105, 75, 90, 60, 75, 45, 60, 30, 45, - 15, 30, 31, 46, 46, 61, 61, 76, 76, 91, 91, 106, 106, 121, 121, - 136, 136, 151, 151, 166, 166, 181, 181, 196, 196, 211, 211, 226, 226, 241, - 227, 242, 212, 227, 197, 212, 182, 197, 167, 182, 152, 167, 137, 152, 122, - 137, 107, 122, 92, 107, 77, 92, 62, 77, 47, 62, 63, 78, 78, 93, - 93, 108, 108, 123, 123, 138, 138, 153, 153, 168, 168, 183, 183, 198, 198, - 213, 213, 228, 228, 243, 229, 244, 214, 229, 199, 214, 184, 199, 169, 184, - 154, 169, 139, 154, 124, 139, 109, 124, 94, 109, 79, 94, 95, 110, 110, - 125, 125, 140, 140, 155, 155, 170, 170, 185, 185, 200, 200, 215, 215, 230, - 230, 245, 231, 246, 216, 231, 201, 216, 186, 201, 171, 186, 156, 171, 141, - 156, 126, 141, 111, 126, 127, 142, 142, 157, 157, 172, 172, 187, 187, 202, - 202, 217, 217, 232, 232, 247, 233, 248, 218, 233, 203, 218, 188, 203, 173, - 188, 158, 173, 143, 158, 159, 174, 174, 189, 189, 204, 204, 219, 219, 234, - 234, 249, 235, 250, 220, 235, 205, 220, 190, 205, 175, 190, 191, 206, 206, - 221, 221, 236, 236, 251, 237, 252, 222, 237, 207, 222, 223, 238, 238, 253, - 239, 254, 0, 0 -}; - -DECLARE_ALIGNED(16, static const int16_t, - mcol_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]) = { - 0, 0, 0, 0, 32, 32, 64, 64, 96, 96, 128, 128, 160, 160, - 192, 192, 224, 224, 256, 256, 288, 288, 320, 320, 352, 352, 384, 384, - 416, 416, 448, 448, 480, 480, 512, 512, 544, 544, 576, 576, 608, 608, - 640, 640, 672, 672, 704, 704, 736, 736, 768, 768, 800, 800, 832, 832, - 864, 864, 896, 896, 928, 928, 960, 960, 0, 0, 1, 32, 33, 64, - 65, 96, 97, 128, 129, 160, 161, 192, 193, 224, 225, 256, 257, 288, - 289, 320, 321, 352, 353, 384, 385, 416, 417, 448, 449, 480, 481, 512, - 513, 544, 545, 576, 577, 608, 609, 640, 641, 672, 673, 704, 705, 736, - 737, 768, 769, 800, 801, 832, 833, 864, 865, 896, 897, 928, 929, 960, - 961, 992, 1, 1, 2, 33, 34, 65, 66, 97, 98, 129, 130, 161, - 162, 
193, 194, 225, 226, 257, 258, 289, 290, 321, 322, 353, 354, 385, - 386, 417, 418, 449, 450, 481, 482, 513, 514, 545, 546, 577, 578, 609, - 610, 641, 642, 673, 674, 705, 706, 737, 738, 769, 770, 801, 802, 833, - 834, 865, 866, 897, 898, 929, 930, 961, 962, 993, 2, 2, 3, 34, - 35, 66, 67, 98, 99, 130, 131, 162, 163, 194, 195, 226, 227, 258, - 259, 290, 291, 322, 323, 354, 355, 386, 387, 418, 419, 450, 451, 482, - 483, 514, 515, 546, 547, 578, 579, 610, 611, 642, 643, 674, 675, 706, - 707, 738, 739, 770, 771, 802, 803, 834, 835, 866, 867, 898, 899, 930, - 931, 962, 963, 994, 3, 3, 4, 35, 36, 67, 68, 99, 100, 131, - 132, 163, 164, 195, 196, 227, 228, 259, 260, 291, 292, 323, 324, 355, - 356, 387, 388, 419, 420, 451, 452, 483, 484, 515, 516, 547, 548, 579, - 580, 611, 612, 643, 644, 675, 676, 707, 708, 739, 740, 771, 772, 803, - 804, 835, 836, 867, 868, 899, 900, 931, 932, 963, 964, 995, 4, 4, - 5, 36, 37, 68, 69, 100, 101, 132, 133, 164, 165, 196, 197, 228, - 229, 260, 261, 292, 293, 324, 325, 356, 357, 388, 389, 420, 421, 452, - 453, 484, 485, 516, 517, 548, 549, 580, 581, 612, 613, 644, 645, 676, - 677, 708, 709, 740, 741, 772, 773, 804, 805, 836, 837, 868, 869, 900, - 901, 932, 933, 964, 965, 996, 5, 5, 6, 37, 38, 69, 70, 101, - 102, 133, 134, 165, 166, 197, 198, 229, 230, 261, 262, 293, 294, 325, - 326, 357, 358, 389, 390, 421, 422, 453, 454, 485, 486, 517, 518, 549, - 550, 581, 582, 613, 614, 645, 646, 677, 678, 709, 710, 741, 742, 773, - 774, 805, 806, 837, 838, 869, 870, 901, 902, 933, 934, 965, 966, 997, - 6, 6, 7, 38, 39, 70, 71, 102, 103, 134, 135, 166, 167, 198, - 199, 230, 231, 262, 263, 294, 295, 326, 327, 358, 359, 390, 391, 422, - 423, 454, 455, 486, 487, 518, 519, 550, 551, 582, 583, 614, 615, 646, - 647, 678, 679, 710, 711, 742, 743, 774, 775, 806, 807, 838, 839, 870, - 871, 902, 903, 934, 935, 966, 967, 998, 7, 7, 8, 39, 40, 71, - 72, 103, 104, 135, 136, 167, 168, 199, 200, 231, 232, 263, 264, 295, - 296, 327, 328, 359, 360, 391, 392, 423, 424, 455, 
456, 487, 488, 519, - 520, 551, 552, 583, 584, 615, 616, 647, 648, 679, 680, 711, 712, 743, - 744, 775, 776, 807, 808, 839, 840, 871, 872, 903, 904, 935, 936, 967, - 968, 999, 8, 8, 9, 40, 41, 72, 73, 104, 105, 136, 137, 168, - 169, 200, 201, 232, 233, 264, 265, 296, 297, 328, 329, 360, 361, 392, - 393, 424, 425, 456, 457, 488, 489, 520, 521, 552, 553, 584, 585, 616, - 617, 648, 649, 680, 681, 712, 713, 744, 745, 776, 777, 808, 809, 840, - 841, 872, 873, 904, 905, 936, 937, 968, 969, 1000, 9, 9, 10, 41, - 42, 73, 74, 105, 106, 137, 138, 169, 170, 201, 202, 233, 234, 265, - 266, 297, 298, 329, 330, 361, 362, 393, 394, 425, 426, 457, 458, 489, - 490, 521, 522, 553, 554, 585, 586, 617, 618, 649, 650, 681, 682, 713, - 714, 745, 746, 777, 778, 809, 810, 841, 842, 873, 874, 905, 906, 937, - 938, 969, 970, 1001, 10, 10, 11, 42, 43, 74, 75, 106, 107, 138, - 139, 170, 171, 202, 203, 234, 235, 266, 267, 298, 299, 330, 331, 362, - 363, 394, 395, 426, 427, 458, 459, 490, 491, 522, 523, 554, 555, 586, - 587, 618, 619, 650, 651, 682, 683, 714, 715, 746, 747, 778, 779, 810, - 811, 842, 843, 874, 875, 906, 907, 938, 939, 970, 971, 1002, 11, 11, - 12, 43, 44, 75, 76, 107, 108, 139, 140, 171, 172, 203, 204, 235, - 236, 267, 268, 299, 300, 331, 332, 363, 364, 395, 396, 427, 428, 459, - 460, 491, 492, 523, 524, 555, 556, 587, 588, 619, 620, 651, 652, 683, - 684, 715, 716, 747, 748, 779, 780, 811, 812, 843, 844, 875, 876, 907, - 908, 939, 940, 971, 972, 1003, 12, 12, 13, 44, 45, 76, 77, 108, - 109, 140, 141, 172, 173, 204, 205, 236, 237, 268, 269, 300, 301, 332, - 333, 364, 365, 396, 397, 428, 429, 460, 461, 492, 493, 524, 525, 556, - 557, 588, 589, 620, 621, 652, 653, 684, 685, 716, 717, 748, 749, 780, - 781, 812, 813, 844, 845, 876, 877, 908, 909, 940, 941, 972, 973, 1004, - 13, 13, 14, 45, 46, 77, 78, 109, 110, 141, 142, 173, 174, 205, - 206, 237, 238, 269, 270, 301, 302, 333, 334, 365, 366, 397, 398, 429, - 430, 461, 462, 493, 494, 525, 526, 557, 558, 589, 590, 621, 622, 653, - 
654, 685, 686, 717, 718, 749, 750, 781, 782, 813, 814, 845, 846, 877, - 878, 909, 910, 941, 942, 973, 974, 1005, 14, 14, 15, 46, 47, 78, - 79, 110, 111, 142, 143, 174, 175, 206, 207, 238, 239, 270, 271, 302, - 303, 334, 335, 366, 367, 398, 399, 430, 431, 462, 463, 494, 495, 526, - 527, 558, 559, 590, 591, 622, 623, 654, 655, 686, 687, 718, 719, 750, - 751, 782, 783, 814, 815, 846, 847, 878, 879, 910, 911, 942, 943, 974, - 975, 1006, 15, 15, 16, 47, 48, 79, 80, 111, 112, 143, 144, 175, - 176, 207, 208, 239, 240, 271, 272, 303, 304, 335, 336, 367, 368, 399, - 400, 431, 432, 463, 464, 495, 496, 527, 528, 559, 560, 591, 592, 623, - 624, 655, 656, 687, 688, 719, 720, 751, 752, 783, 784, 815, 816, 847, - 848, 879, 880, 911, 912, 943, 944, 975, 976, 1007, 16, 16, 17, 48, - 49, 80, 81, 112, 113, 144, 145, 176, 177, 208, 209, 240, 241, 272, - 273, 304, 305, 336, 337, 368, 369, 400, 401, 432, 433, 464, 465, 496, - 497, 528, 529, 560, 561, 592, 593, 624, 625, 656, 657, 688, 689, 720, - 721, 752, 753, 784, 785, 816, 817, 848, 849, 880, 881, 912, 913, 944, - 945, 976, 977, 1008, 17, 17, 18, 49, 50, 81, 82, 113, 114, 145, - 146, 177, 178, 209, 210, 241, 242, 273, 274, 305, 306, 337, 338, 369, - 370, 401, 402, 433, 434, 465, 466, 497, 498, 529, 530, 561, 562, 593, - 594, 625, 626, 657, 658, 689, 690, 721, 722, 753, 754, 785, 786, 817, - 818, 849, 850, 881, 882, 913, 914, 945, 946, 977, 978, 1009, 18, 18, - 19, 50, 51, 82, 83, 114, 115, 146, 147, 178, 179, 210, 211, 242, - 243, 274, 275, 306, 307, 338, 339, 370, 371, 402, 403, 434, 435, 466, - 467, 498, 499, 530, 531, 562, 563, 594, 595, 626, 627, 658, 659, 690, - 691, 722, 723, 754, 755, 786, 787, 818, 819, 850, 851, 882, 883, 914, - 915, 946, 947, 978, 979, 1010, 19, 19, 20, 51, 52, 83, 84, 115, - 116, 147, 148, 179, 180, 211, 212, 243, 244, 275, 276, 307, 308, 339, - 340, 371, 372, 403, 404, 435, 436, 467, 468, 499, 500, 531, 532, 563, - 564, 595, 596, 627, 628, 659, 660, 691, 692, 723, 724, 755, 756, 787, - 788, 819, 820, 851, 
852, 883, 884, 915, 916, 947, 948, 979, 980, 1011, - 20, 20, 21, 52, 53, 84, 85, 116, 117, 148, 149, 180, 181, 212, - 213, 244, 245, 276, 277, 308, 309, 340, 341, 372, 373, 404, 405, 436, - 437, 468, 469, 500, 501, 532, 533, 564, 565, 596, 597, 628, 629, 660, - 661, 692, 693, 724, 725, 756, 757, 788, 789, 820, 821, 852, 853, 884, - 885, 916, 917, 948, 949, 980, 981, 1012, 21, 21, 22, 53, 54, 85, - 86, 117, 118, 149, 150, 181, 182, 213, 214, 245, 246, 277, 278, 309, - 310, 341, 342, 373, 374, 405, 406, 437, 438, 469, 470, 501, 502, 533, - 534, 565, 566, 597, 598, 629, 630, 661, 662, 693, 694, 725, 726, 757, - 758, 789, 790, 821, 822, 853, 854, 885, 886, 917, 918, 949, 950, 981, - 982, 1013, 22, 22, 23, 54, 55, 86, 87, 118, 119, 150, 151, 182, - 183, 214, 215, 246, 247, 278, 279, 310, 311, 342, 343, 374, 375, 406, - 407, 438, 439, 470, 471, 502, 503, 534, 535, 566, 567, 598, 599, 630, - 631, 662, 663, 694, 695, 726, 727, 758, 759, 790, 791, 822, 823, 854, - 855, 886, 887, 918, 919, 950, 951, 982, 983, 1014, 23, 23, 24, 55, - 56, 87, 88, 119, 120, 151, 152, 183, 184, 215, 216, 247, 248, 279, - 280, 311, 312, 343, 344, 375, 376, 407, 408, 439, 440, 471, 472, 503, - 504, 535, 536, 567, 568, 599, 600, 631, 632, 663, 664, 695, 696, 727, - 728, 759, 760, 791, 792, 823, 824, 855, 856, 887, 888, 919, 920, 951, - 952, 983, 984, 1015, 24, 24, 25, 56, 57, 88, 89, 120, 121, 152, - 153, 184, 185, 216, 217, 248, 249, 280, 281, 312, 313, 344, 345, 376, - 377, 408, 409, 440, 441, 472, 473, 504, 505, 536, 537, 568, 569, 600, - 601, 632, 633, 664, 665, 696, 697, 728, 729, 760, 761, 792, 793, 824, - 825, 856, 857, 888, 889, 920, 921, 952, 953, 984, 985, 1016, 25, 25, - 26, 57, 58, 89, 90, 121, 122, 153, 154, 185, 186, 217, 218, 249, - 250, 281, 282, 313, 314, 345, 346, 377, 378, 409, 410, 441, 442, 473, - 474, 505, 506, 537, 538, 569, 570, 601, 602, 633, 634, 665, 666, 697, - 698, 729, 730, 761, 762, 793, 794, 825, 826, 857, 858, 889, 890, 921, - 922, 953, 954, 985, 986, 1017, 26, 26, 
27, 58, 59, 90, 91, 122, - 123, 154, 155, 186, 187, 218, 219, 250, 251, 282, 283, 314, 315, 346, - 347, 378, 379, 410, 411, 442, 443, 474, 475, 506, 507, 538, 539, 570, - 571, 602, 603, 634, 635, 666, 667, 698, 699, 730, 731, 762, 763, 794, - 795, 826, 827, 858, 859, 890, 891, 922, 923, 954, 955, 986, 987, 1018, - 27, 27, 28, 59, 60, 91, 92, 123, 124, 155, 156, 187, 188, 219, - 220, 251, 252, 283, 284, 315, 316, 347, 348, 379, 380, 411, 412, 443, - 444, 475, 476, 507, 508, 539, 540, 571, 572, 603, 604, 635, 636, 667, - 668, 699, 700, 731, 732, 763, 764, 795, 796, 827, 828, 859, 860, 891, - 892, 923, 924, 955, 956, 987, 988, 1019, 28, 28, 29, 60, 61, 92, - 93, 124, 125, 156, 157, 188, 189, 220, 221, 252, 253, 284, 285, 316, - 317, 348, 349, 380, 381, 412, 413, 444, 445, 476, 477, 508, 509, 540, - 541, 572, 573, 604, 605, 636, 637, 668, 669, 700, 701, 732, 733, 764, - 765, 796, 797, 828, 829, 860, 861, 892, 893, 924, 925, 956, 957, 988, - 989, 1020, 29, 29, 30, 61, 62, 93, 94, 125, 126, 157, 158, 189, - 190, 221, 222, 253, 254, 285, 286, 317, 318, 349, 350, 381, 382, 413, - 414, 445, 446, 477, 478, 509, 510, 541, 542, 573, 574, 605, 606, 637, - 638, 669, 670, 701, 702, 733, 734, 765, 766, 797, 798, 829, 830, 861, - 862, 893, 894, 925, 926, 957, 958, 989, 990, 1021, 30, 30, 31, 62, - 63, 94, 95, 126, 127, 158, 159, 190, 191, 222, 223, 254, 255, 286, - 287, 318, 319, 350, 351, 382, 383, 414, 415, 446, 447, 478, 479, 510, - 511, 542, 543, 574, 575, 606, 607, 638, 639, 670, 671, 702, 703, 734, - 735, 766, 767, 798, 799, 830, 831, 862, 863, 894, 895, 926, 927, 958, - 959, 990, 991, 1022, 0, 0, -}; - -DECLARE_ALIGNED(16, static const int16_t, - mrow_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]) = { - 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, - 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, - 13, 13, 14, 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 19, - 20, 20, 21, 21, 22, 22, 23, 23, 24, 24, 25, 25, 26, 26, - 27, 27, 28, 28, 29, 29, 30, 30, 0, 0, 1, 32, 2, 33, - 3, 34, 4, 35, 
5, 36, 6, 37, 7, 38, 8, 39, 9, 40, - 10, 41, 11, 42, 12, 43, 13, 44, 14, 45, 15, 46, 16, 47, - 17, 48, 18, 49, 19, 50, 20, 51, 21, 52, 22, 53, 23, 54, - 24, 55, 25, 56, 26, 57, 27, 58, 28, 59, 29, 60, 30, 61, - 31, 62, 32, 32, 33, 64, 34, 65, 35, 66, 36, 67, 37, 68, - 38, 69, 39, 70, 40, 71, 41, 72, 42, 73, 43, 74, 44, 75, - 45, 76, 46, 77, 47, 78, 48, 79, 49, 80, 50, 81, 51, 82, - 52, 83, 53, 84, 54, 85, 55, 86, 56, 87, 57, 88, 58, 89, - 59, 90, 60, 91, 61, 92, 62, 93, 63, 94, 64, 64, 65, 96, - 66, 97, 67, 98, 68, 99, 69, 100, 70, 101, 71, 102, 72, 103, - 73, 104, 74, 105, 75, 106, 76, 107, 77, 108, 78, 109, 79, 110, - 80, 111, 81, 112, 82, 113, 83, 114, 84, 115, 85, 116, 86, 117, - 87, 118, 88, 119, 89, 120, 90, 121, 91, 122, 92, 123, 93, 124, - 94, 125, 95, 126, 96, 96, 97, 128, 98, 129, 99, 130, 100, 131, - 101, 132, 102, 133, 103, 134, 104, 135, 105, 136, 106, 137, 107, 138, - 108, 139, 109, 140, 110, 141, 111, 142, 112, 143, 113, 144, 114, 145, - 115, 146, 116, 147, 117, 148, 118, 149, 119, 150, 120, 151, 121, 152, - 122, 153, 123, 154, 124, 155, 125, 156, 126, 157, 127, 158, 128, 128, - 129, 160, 130, 161, 131, 162, 132, 163, 133, 164, 134, 165, 135, 166, - 136, 167, 137, 168, 138, 169, 139, 170, 140, 171, 141, 172, 142, 173, - 143, 174, 144, 175, 145, 176, 146, 177, 147, 178, 148, 179, 149, 180, - 150, 181, 151, 182, 152, 183, 153, 184, 154, 185, 155, 186, 156, 187, - 157, 188, 158, 189, 159, 190, 160, 160, 161, 192, 162, 193, 163, 194, - 164, 195, 165, 196, 166, 197, 167, 198, 168, 199, 169, 200, 170, 201, - 171, 202, 172, 203, 173, 204, 174, 205, 175, 206, 176, 207, 177, 208, - 178, 209, 179, 210, 180, 211, 181, 212, 182, 213, 183, 214, 184, 215, - 185, 216, 186, 217, 187, 218, 188, 219, 189, 220, 190, 221, 191, 222, - 192, 192, 193, 224, 194, 225, 195, 226, 196, 227, 197, 228, 198, 229, - 199, 230, 200, 231, 201, 232, 202, 233, 203, 234, 204, 235, 205, 236, - 206, 237, 207, 238, 208, 239, 209, 240, 210, 241, 211, 242, 212, 243, - 213, 244, 214, 245, 215, 
246, 216, 247, 217, 248, 218, 249, 219, 250, - 220, 251, 221, 252, 222, 253, 223, 254, 224, 224, 225, 256, 226, 257, - 227, 258, 228, 259, 229, 260, 230, 261, 231, 262, 232, 263, 233, 264, - 234, 265, 235, 266, 236, 267, 237, 268, 238, 269, 239, 270, 240, 271, - 241, 272, 242, 273, 243, 274, 244, 275, 245, 276, 246, 277, 247, 278, - 248, 279, 249, 280, 250, 281, 251, 282, 252, 283, 253, 284, 254, 285, - 255, 286, 256, 256, 257, 288, 258, 289, 259, 290, 260, 291, 261, 292, - 262, 293, 263, 294, 264, 295, 265, 296, 266, 297, 267, 298, 268, 299, - 269, 300, 270, 301, 271, 302, 272, 303, 273, 304, 274, 305, 275, 306, - 276, 307, 277, 308, 278, 309, 279, 310, 280, 311, 281, 312, 282, 313, - 283, 314, 284, 315, 285, 316, 286, 317, 287, 318, 288, 288, 289, 320, - 290, 321, 291, 322, 292, 323, 293, 324, 294, 325, 295, 326, 296, 327, - 297, 328, 298, 329, 299, 330, 300, 331, 301, 332, 302, 333, 303, 334, - 304, 335, 305, 336, 306, 337, 307, 338, 308, 339, 309, 340, 310, 341, - 311, 342, 312, 343, 313, 344, 314, 345, 315, 346, 316, 347, 317, 348, - 318, 349, 319, 350, 320, 320, 321, 352, 322, 353, 323, 354, 324, 355, - 325, 356, 326, 357, 327, 358, 328, 359, 329, 360, 330, 361, 331, 362, - 332, 363, 333, 364, 334, 365, 335, 366, 336, 367, 337, 368, 338, 369, - 339, 370, 340, 371, 341, 372, 342, 373, 343, 374, 344, 375, 345, 376, - 346, 377, 347, 378, 348, 379, 349, 380, 350, 381, 351, 382, 352, 352, - 353, 384, 354, 385, 355, 386, 356, 387, 357, 388, 358, 389, 359, 390, - 360, 391, 361, 392, 362, 393, 363, 394, 364, 395, 365, 396, 366, 397, - 367, 398, 368, 399, 369, 400, 370, 401, 371, 402, 372, 403, 373, 404, - 374, 405, 375, 406, 376, 407, 377, 408, 378, 409, 379, 410, 380, 411, - 381, 412, 382, 413, 383, 414, 384, 384, 385, 416, 386, 417, 387, 418, - 388, 419, 389, 420, 390, 421, 391, 422, 392, 423, 393, 424, 394, 425, - 395, 426, 396, 427, 397, 428, 398, 429, 399, 430, 400, 431, 401, 432, - 402, 433, 403, 434, 404, 435, 405, 436, 406, 437, 407, 438, 408, 439, - 409, 
440, 410, 441, 411, 442, 412, 443, 413, 444, 414, 445, 415, 446, - 416, 416, 417, 448, 418, 449, 419, 450, 420, 451, 421, 452, 422, 453, - 423, 454, 424, 455, 425, 456, 426, 457, 427, 458, 428, 459, 429, 460, - 430, 461, 431, 462, 432, 463, 433, 464, 434, 465, 435, 466, 436, 467, - 437, 468, 438, 469, 439, 470, 440, 471, 441, 472, 442, 473, 443, 474, - 444, 475, 445, 476, 446, 477, 447, 478, 448, 448, 449, 480, 450, 481, - 451, 482, 452, 483, 453, 484, 454, 485, 455, 486, 456, 487, 457, 488, - 458, 489, 459, 490, 460, 491, 461, 492, 462, 493, 463, 494, 464, 495, - 465, 496, 466, 497, 467, 498, 468, 499, 469, 500, 470, 501, 471, 502, - 472, 503, 473, 504, 474, 505, 475, 506, 476, 507, 477, 508, 478, 509, - 479, 510, 480, 480, 481, 512, 482, 513, 483, 514, 484, 515, 485, 516, - 486, 517, 487, 518, 488, 519, 489, 520, 490, 521, 491, 522, 492, 523, - 493, 524, 494, 525, 495, 526, 496, 527, 497, 528, 498, 529, 499, 530, - 500, 531, 501, 532, 502, 533, 503, 534, 504, 535, 505, 536, 506, 537, - 507, 538, 508, 539, 509, 540, 510, 541, 511, 542, 512, 512, 513, 544, - 514, 545, 515, 546, 516, 547, 517, 548, 518, 549, 519, 550, 520, 551, - 521, 552, 522, 553, 523, 554, 524, 555, 525, 556, 526, 557, 527, 558, - 528, 559, 529, 560, 530, 561, 531, 562, 532, 563, 533, 564, 534, 565, - 535, 566, 536, 567, 537, 568, 538, 569, 539, 570, 540, 571, 541, 572, - 542, 573, 543, 574, 544, 544, 545, 576, 546, 577, 547, 578, 548, 579, - 549, 580, 550, 581, 551, 582, 552, 583, 553, 584, 554, 585, 555, 586, - 556, 587, 557, 588, 558, 589, 559, 590, 560, 591, 561, 592, 562, 593, - 563, 594, 564, 595, 565, 596, 566, 597, 567, 598, 568, 599, 569, 600, - 570, 601, 571, 602, 572, 603, 573, 604, 574, 605, 575, 606, 576, 576, - 577, 608, 578, 609, 579, 610, 580, 611, 581, 612, 582, 613, 583, 614, - 584, 615, 585, 616, 586, 617, 587, 618, 588, 619, 589, 620, 590, 621, - 591, 622, 592, 623, 593, 624, 594, 625, 595, 626, 596, 627, 597, 628, - 598, 629, 599, 630, 600, 631, 601, 632, 602, 633, 603, 634, 
604, 635, - 605, 636, 606, 637, 607, 638, 608, 608, 609, 640, 610, 641, 611, 642, - 612, 643, 613, 644, 614, 645, 615, 646, 616, 647, 617, 648, 618, 649, - 619, 650, 620, 651, 621, 652, 622, 653, 623, 654, 624, 655, 625, 656, - 626, 657, 627, 658, 628, 659, 629, 660, 630, 661, 631, 662, 632, 663, - 633, 664, 634, 665, 635, 666, 636, 667, 637, 668, 638, 669, 639, 670, - 640, 640, 641, 672, 642, 673, 643, 674, 644, 675, 645, 676, 646, 677, - 647, 678, 648, 679, 649, 680, 650, 681, 651, 682, 652, 683, 653, 684, - 654, 685, 655, 686, 656, 687, 657, 688, 658, 689, 659, 690, 660, 691, - 661, 692, 662, 693, 663, 694, 664, 695, 665, 696, 666, 697, 667, 698, - 668, 699, 669, 700, 670, 701, 671, 702, 672, 672, 673, 704, 674, 705, - 675, 706, 676, 707, 677, 708, 678, 709, 679, 710, 680, 711, 681, 712, - 682, 713, 683, 714, 684, 715, 685, 716, 686, 717, 687, 718, 688, 719, - 689, 720, 690, 721, 691, 722, 692, 723, 693, 724, 694, 725, 695, 726, - 696, 727, 697, 728, 698, 729, 699, 730, 700, 731, 701, 732, 702, 733, - 703, 734, 704, 704, 705, 736, 706, 737, 707, 738, 708, 739, 709, 740, - 710, 741, 711, 742, 712, 743, 713, 744, 714, 745, 715, 746, 716, 747, - 717, 748, 718, 749, 719, 750, 720, 751, 721, 752, 722, 753, 723, 754, - 724, 755, 725, 756, 726, 757, 727, 758, 728, 759, 729, 760, 730, 761, - 731, 762, 732, 763, 733, 764, 734, 765, 735, 766, 736, 736, 737, 768, - 738, 769, 739, 770, 740, 771, 741, 772, 742, 773, 743, 774, 744, 775, - 745, 776, 746, 777, 747, 778, 748, 779, 749, 780, 750, 781, 751, 782, - 752, 783, 753, 784, 754, 785, 755, 786, 756, 787, 757, 788, 758, 789, - 759, 790, 760, 791, 761, 792, 762, 793, 763, 794, 764, 795, 765, 796, - 766, 797, 767, 798, 768, 768, 769, 800, 770, 801, 771, 802, 772, 803, - 773, 804, 774, 805, 775, 806, 776, 807, 777, 808, 778, 809, 779, 810, - 780, 811, 781, 812, 782, 813, 783, 814, 784, 815, 785, 816, 786, 817, - 787, 818, 788, 819, 789, 820, 790, 821, 791, 822, 792, 823, 793, 824, - 794, 825, 795, 826, 796, 827, 797, 828, 
798, 829, 799, 830, 800, 800, - 801, 832, 802, 833, 803, 834, 804, 835, 805, 836, 806, 837, 807, 838, - 808, 839, 809, 840, 810, 841, 811, 842, 812, 843, 813, 844, 814, 845, - 815, 846, 816, 847, 817, 848, 818, 849, 819, 850, 820, 851, 821, 852, - 822, 853, 823, 854, 824, 855, 825, 856, 826, 857, 827, 858, 828, 859, - 829, 860, 830, 861, 831, 862, 832, 832, 833, 864, 834, 865, 835, 866, - 836, 867, 837, 868, 838, 869, 839, 870, 840, 871, 841, 872, 842, 873, - 843, 874, 844, 875, 845, 876, 846, 877, 847, 878, 848, 879, 849, 880, - 850, 881, 851, 882, 852, 883, 853, 884, 854, 885, 855, 886, 856, 887, - 857, 888, 858, 889, 859, 890, 860, 891, 861, 892, 862, 893, 863, 894, - 864, 864, 865, 896, 866, 897, 867, 898, 868, 899, 869, 900, 870, 901, - 871, 902, 872, 903, 873, 904, 874, 905, 875, 906, 876, 907, 877, 908, - 878, 909, 879, 910, 880, 911, 881, 912, 882, 913, 883, 914, 884, 915, - 885, 916, 886, 917, 887, 918, 888, 919, 889, 920, 890, 921, 891, 922, - 892, 923, 893, 924, 894, 925, 895, 926, 896, 896, 897, 928, 898, 929, - 899, 930, 900, 931, 901, 932, 902, 933, 903, 934, 904, 935, 905, 936, - 906, 937, 907, 938, 908, 939, 909, 940, 910, 941, 911, 942, 912, 943, - 913, 944, 914, 945, 915, 946, 916, 947, 917, 948, 918, 949, 919, 950, - 920, 951, 921, 952, 922, 953, 923, 954, 924, 955, 925, 956, 926, 957, - 927, 958, 928, 928, 929, 960, 930, 961, 931, 962, 932, 963, 933, 964, - 934, 965, 935, 966, 936, 967, 937, 968, 938, 969, 939, 970, 940, 971, - 941, 972, 942, 973, 943, 974, 944, 975, 945, 976, 946, 977, 947, 978, - 948, 979, 949, 980, 950, 981, 951, 982, 952, 983, 953, 984, 954, 985, - 955, 986, 956, 987, 957, 988, 958, 989, 959, 990, 960, 960, 961, 992, - 962, 993, 963, 994, 964, 995, 965, 996, 966, 997, 967, 998, 968, 999, - 969, 1000, 970, 1001, 971, 1002, 972, 1003, 973, 1004, 974, 1005, 975, 1006, - 976, 1007, 977, 1008, 978, 1009, 979, 1010, 980, 1011, 981, 1012, 982, 1013, - 983, 1014, 984, 1015, 985, 1016, 986, 1017, 987, 1018, 988, 1019, 989, 1020, - 
990, 1021, 991, 1022, 0, 0, -}; - -DECLARE_ALIGNED(16, static const int16_t, - default_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]) = { - 0, 0, 0, 0, 0, 0, 32, 32, 1, 32, 1, 1, 2, 2, - 2, 33, 33, 64, 64, 64, 96, 96, 65, 96, 34, 65, 3, 34, - 3, 3, 4, 4, 4, 35, 35, 66, 66, 97, 97, 128, 128, 128, - 160, 160, 129, 160, 98, 129, 67, 98, 36, 67, 5, 36, 5, 5, - 6, 6, 6, 37, 37, 68, 68, 99, 99, 130, 130, 161, 161, 192, - 192, 192, 224, 224, 193, 224, 162, 193, 131, 162, 100, 131, 69, 100, - 38, 69, 7, 38, 7, 7, 8, 8, 8, 39, 39, 70, 70, 101, - 101, 132, 132, 163, 163, 194, 194, 225, 225, 256, 256, 256, 288, 288, - 257, 288, 226, 257, 195, 226, 164, 195, 133, 164, 102, 133, 71, 102, - 40, 71, 9, 40, 9, 9, 10, 10, 10, 41, 41, 72, 72, 103, - 103, 134, 134, 165, 165, 196, 196, 227, 227, 258, 258, 289, 289, 320, - 320, 320, 352, 352, 321, 352, 290, 321, 259, 290, 228, 259, 197, 228, - 166, 197, 135, 166, 104, 135, 73, 104, 42, 73, 11, 42, 11, 11, - 12, 12, 12, 43, 43, 74, 74, 105, 105, 136, 136, 167, 167, 198, - 198, 229, 229, 260, 260, 291, 291, 322, 322, 353, 353, 384, 384, 384, - 416, 416, 385, 416, 354, 385, 323, 354, 292, 323, 261, 292, 230, 261, - 199, 230, 168, 199, 137, 168, 106, 137, 75, 106, 44, 75, 13, 44, - 13, 13, 14, 14, 14, 45, 45, 76, 76, 107, 107, 138, 138, 169, - 169, 200, 200, 231, 231, 262, 262, 293, 293, 324, 324, 355, 355, 386, - 386, 417, 417, 448, 448, 448, 480, 480, 449, 480, 418, 449, 387, 418, - 356, 387, 325, 356, 294, 325, 263, 294, 232, 263, 201, 232, 170, 201, - 139, 170, 108, 139, 77, 108, 46, 77, 15, 46, 15, 15, 16, 16, - 16, 47, 47, 78, 78, 109, 109, 140, 140, 171, 171, 202, 202, 233, - 233, 264, 264, 295, 295, 326, 326, 357, 357, 388, 388, 419, 419, 450, - 450, 481, 481, 512, 512, 512, 544, 544, 513, 544, 482, 513, 451, 482, - 420, 451, 389, 420, 358, 389, 327, 358, 296, 327, 265, 296, 234, 265, - 203, 234, 172, 203, 141, 172, 110, 141, 79, 110, 48, 79, 17, 48, - 17, 17, 18, 18, 18, 49, 49, 80, 80, 111, 111, 142, 142, 173, - 173, 204, 204, 235, 
235, 266, 266, 297, 297, 328, 328, 359, 359, 390, - 390, 421, 421, 452, 452, 483, 483, 514, 514, 545, 545, 576, 576, 576, - 608, 608, 577, 608, 546, 577, 515, 546, 484, 515, 453, 484, 422, 453, - 391, 422, 360, 391, 329, 360, 298, 329, 267, 298, 236, 267, 205, 236, - 174, 205, 143, 174, 112, 143, 81, 112, 50, 81, 19, 50, 19, 19, - 20, 20, 20, 51, 51, 82, 82, 113, 113, 144, 144, 175, 175, 206, - 206, 237, 237, 268, 268, 299, 299, 330, 330, 361, 361, 392, 392, 423, - 423, 454, 454, 485, 485, 516, 516, 547, 547, 578, 578, 609, 609, 640, - 640, 640, 672, 672, 641, 672, 610, 641, 579, 610, 548, 579, 517, 548, - 486, 517, 455, 486, 424, 455, 393, 424, 362, 393, 331, 362, 300, 331, - 269, 300, 238, 269, 207, 238, 176, 207, 145, 176, 114, 145, 83, 114, - 52, 83, 21, 52, 21, 21, 22, 22, 22, 53, 53, 84, 84, 115, - 115, 146, 146, 177, 177, 208, 208, 239, 239, 270, 270, 301, 301, 332, - 332, 363, 363, 394, 394, 425, 425, 456, 456, 487, 487, 518, 518, 549, - 549, 580, 580, 611, 611, 642, 642, 673, 673, 704, 704, 704, 736, 736, - 705, 736, 674, 705, 643, 674, 612, 643, 581, 612, 550, 581, 519, 550, - 488, 519, 457, 488, 426, 457, 395, 426, 364, 395, 333, 364, 302, 333, - 271, 302, 240, 271, 209, 240, 178, 209, 147, 178, 116, 147, 85, 116, - 54, 85, 23, 54, 23, 23, 24, 24, 24, 55, 55, 86, 86, 117, - 117, 148, 148, 179, 179, 210, 210, 241, 241, 272, 272, 303, 303, 334, - 334, 365, 365, 396, 396, 427, 427, 458, 458, 489, 489, 520, 520, 551, - 551, 582, 582, 613, 613, 644, 644, 675, 675, 706, 706, 737, 737, 768, - 768, 768, 800, 800, 769, 800, 738, 769, 707, 738, 676, 707, 645, 676, - 614, 645, 583, 614, 552, 583, 521, 552, 490, 521, 459, 490, 428, 459, - 397, 428, 366, 397, 335, 366, 304, 335, 273, 304, 242, 273, 211, 242, - 180, 211, 149, 180, 118, 149, 87, 118, 56, 87, 25, 56, 25, 25, - 26, 26, 26, 57, 57, 88, 88, 119, 119, 150, 150, 181, 181, 212, - 212, 243, 243, 274, 274, 305, 305, 336, 336, 367, 367, 398, 398, 429, - 429, 460, 460, 491, 491, 522, 522, 553, 553, 584, 584, 615, 
615, 646, - 646, 677, 677, 708, 708, 739, 739, 770, 770, 801, 801, 832, 832, 832, - 864, 864, 833, 864, 802, 833, 771, 802, 740, 771, 709, 740, 678, 709, - 647, 678, 616, 647, 585, 616, 554, 585, 523, 554, 492, 523, 461, 492, - 430, 461, 399, 430, 368, 399, 337, 368, 306, 337, 275, 306, 244, 275, - 213, 244, 182, 213, 151, 182, 120, 151, 89, 120, 58, 89, 27, 58, - 27, 27, 28, 28, 28, 59, 59, 90, 90, 121, 121, 152, 152, 183, - 183, 214, 214, 245, 245, 276, 276, 307, 307, 338, 338, 369, 369, 400, - 400, 431, 431, 462, 462, 493, 493, 524, 524, 555, 555, 586, 586, 617, - 617, 648, 648, 679, 679, 710, 710, 741, 741, 772, 772, 803, 803, 834, - 834, 865, 865, 896, 896, 896, 928, 928, 897, 928, 866, 897, 835, 866, - 804, 835, 773, 804, 742, 773, 711, 742, 680, 711, 649, 680, 618, 649, - 587, 618, 556, 587, 525, 556, 494, 525, 463, 494, 432, 463, 401, 432, - 370, 401, 339, 370, 308, 339, 277, 308, 246, 277, 215, 246, 184, 215, - 153, 184, 122, 153, 91, 122, 60, 91, 29, 60, 29, 29, 30, 30, - 30, 61, 61, 92, 92, 123, 123, 154, 154, 185, 185, 216, 216, 247, - 247, 278, 278, 309, 309, 340, 340, 371, 371, 402, 402, 433, 433, 464, - 464, 495, 495, 526, 526, 557, 557, 588, 588, 619, 619, 650, 650, 681, - 681, 712, 712, 743, 743, 774, 774, 805, 805, 836, 836, 867, 867, 898, - 898, 929, 929, 960, 960, 960, 961, 992, 930, 961, 899, 930, 868, 899, - 837, 868, 806, 837, 775, 806, 744, 775, 713, 744, 682, 713, 651, 682, - 620, 651, 589, 620, 558, 589, 527, 558, 496, 527, 465, 496, 434, 465, - 403, 434, 372, 403, 341, 372, 310, 341, 279, 310, 248, 279, 217, 248, - 186, 217, 155, 186, 124, 155, 93, 124, 62, 93, 31, 62, 63, 94, - 94, 125, 125, 156, 156, 187, 187, 218, 218, 249, 249, 280, 280, 311, - 311, 342, 342, 373, 373, 404, 404, 435, 435, 466, 466, 497, 497, 528, - 528, 559, 559, 590, 590, 621, 621, 652, 652, 683, 683, 714, 714, 745, - 745, 776, 776, 807, 807, 838, 838, 869, 869, 900, 900, 931, 931, 962, - 962, 993, 963, 994, 932, 963, 901, 932, 870, 901, 839, 870, 808, 839, - 777, 
808, 746, 777, 715, 746, 684, 715, 653, 684, 622, 653, 591, 622, - 560, 591, 529, 560, 498, 529, 467, 498, 436, 467, 405, 436, 374, 405, - 343, 374, 312, 343, 281, 312, 250, 281, 219, 250, 188, 219, 157, 188, - 126, 157, 95, 126, 127, 158, 158, 189, 189, 220, 220, 251, 251, 282, - 282, 313, 313, 344, 344, 375, 375, 406, 406, 437, 437, 468, 468, 499, - 499, 530, 530, 561, 561, 592, 592, 623, 623, 654, 654, 685, 685, 716, - 716, 747, 747, 778, 778, 809, 809, 840, 840, 871, 871, 902, 902, 933, - 933, 964, 964, 995, 965, 996, 934, 965, 903, 934, 872, 903, 841, 872, - 810, 841, 779, 810, 748, 779, 717, 748, 686, 717, 655, 686, 624, 655, - 593, 624, 562, 593, 531, 562, 500, 531, 469, 500, 438, 469, 407, 438, - 376, 407, 345, 376, 314, 345, 283, 314, 252, 283, 221, 252, 190, 221, - 159, 190, 191, 222, 222, 253, 253, 284, 284, 315, 315, 346, 346, 377, - 377, 408, 408, 439, 439, 470, 470, 501, 501, 532, 532, 563, 563, 594, - 594, 625, 625, 656, 656, 687, 687, 718, 718, 749, 749, 780, 780, 811, - 811, 842, 842, 873, 873, 904, 904, 935, 935, 966, 966, 997, 967, 998, - 936, 967, 905, 936, 874, 905, 843, 874, 812, 843, 781, 812, 750, 781, - 719, 750, 688, 719, 657, 688, 626, 657, 595, 626, 564, 595, 533, 564, - 502, 533, 471, 502, 440, 471, 409, 440, 378, 409, 347, 378, 316, 347, - 285, 316, 254, 285, 223, 254, 255, 286, 286, 317, 317, 348, 348, 379, - 379, 410, 410, 441, 441, 472, 472, 503, 503, 534, 534, 565, 565, 596, - 596, 627, 627, 658, 658, 689, 689, 720, 720, 751, 751, 782, 782, 813, - 813, 844, 844, 875, 875, 906, 906, 937, 937, 968, 968, 999, 969, 1000, - 938, 969, 907, 938, 876, 907, 845, 876, 814, 845, 783, 814, 752, 783, - 721, 752, 690, 721, 659, 690, 628, 659, 597, 628, 566, 597, 535, 566, - 504, 535, 473, 504, 442, 473, 411, 442, 380, 411, 349, 380, 318, 349, - 287, 318, 319, 350, 350, 381, 381, 412, 412, 443, 443, 474, 474, 505, - 505, 536, 536, 567, 567, 598, 598, 629, 629, 660, 660, 691, 691, 722, - 722, 753, 753, 784, 784, 815, 815, 846, 846, 877, 877, 908, 
908, 939, - 939, 970, 970, 1001, 971, 1002, 940, 971, 909, 940, 878, 909, 847, 878, - 816, 847, 785, 816, 754, 785, 723, 754, 692, 723, 661, 692, 630, 661, - 599, 630, 568, 599, 537, 568, 506, 537, 475, 506, 444, 475, 413, 444, - 382, 413, 351, 382, 383, 414, 414, 445, 445, 476, 476, 507, 507, 538, - 538, 569, 569, 600, 600, 631, 631, 662, 662, 693, 693, 724, 724, 755, - 755, 786, 786, 817, 817, 848, 848, 879, 879, 910, 910, 941, 941, 972, - 972, 1003, 973, 1004, 942, 973, 911, 942, 880, 911, 849, 880, 818, 849, - 787, 818, 756, 787, 725, 756, 694, 725, 663, 694, 632, 663, 601, 632, - 570, 601, 539, 570, 508, 539, 477, 508, 446, 477, 415, 446, 447, 478, - 478, 509, 509, 540, 540, 571, 571, 602, 602, 633, 633, 664, 664, 695, - 695, 726, 726, 757, 757, 788, 788, 819, 819, 850, 850, 881, 881, 912, - 912, 943, 943, 974, 974, 1005, 975, 1006, 944, 975, 913, 944, 882, 913, - 851, 882, 820, 851, 789, 820, 758, 789, 727, 758, 696, 727, 665, 696, - 634, 665, 603, 634, 572, 603, 541, 572, 510, 541, 479, 510, 511, 542, - 542, 573, 573, 604, 604, 635, 635, 666, 666, 697, 697, 728, 728, 759, - 759, 790, 790, 821, 821, 852, 852, 883, 883, 914, 914, 945, 945, 976, - 976, 1007, 977, 1008, 946, 977, 915, 946, 884, 915, 853, 884, 822, 853, - 791, 822, 760, 791, 729, 760, 698, 729, 667, 698, 636, 667, 605, 636, - 574, 605, 543, 574, 575, 606, 606, 637, 637, 668, 668, 699, 699, 730, - 730, 761, 761, 792, 792, 823, 823, 854, 854, 885, 885, 916, 916, 947, - 947, 978, 978, 1009, 979, 1010, 948, 979, 917, 948, 886, 917, 855, 886, - 824, 855, 793, 824, 762, 793, 731, 762, 700, 731, 669, 700, 638, 669, - 607, 638, 639, 670, 670, 701, 701, 732, 732, 763, 763, 794, 794, 825, - 825, 856, 856, 887, 887, 918, 918, 949, 949, 980, 980, 1011, 981, 1012, - 950, 981, 919, 950, 888, 919, 857, 888, 826, 857, 795, 826, 764, 795, - 733, 764, 702, 733, 671, 702, 703, 734, 734, 765, 765, 796, 796, 827, - 827, 858, 858, 889, 889, 920, 920, 951, 951, 982, 982, 1013, 983, 1014, - 952, 983, 921, 952, 890, 921, 
859, 890, 828, 859, 797, 828, 766, 797, - 735, 766, 767, 798, 798, 829, 829, 860, 860, 891, 891, 922, 922, 953, - 953, 984, 984, 1015, 985, 1016, 954, 985, 923, 954, 892, 923, 861, 892, - 830, 861, 799, 830, 831, 862, 862, 893, 893, 924, 924, 955, 955, 986, - 986, 1017, 987, 1018, 956, 987, 925, 956, 894, 925, 863, 894, 895, 926, - 926, 957, 957, 988, 988, 1019, 989, 1020, 958, 989, 927, 958, 959, 990, - 990, 1021, 991, 1022, 0, 0 -}; - -DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_4x4[16]) = { - 0, 1, 5, 6, 2, 4, 7, 12, 3, 8, 11, 13, 9, 10, 14, 15 -}; + av1_default_iscan_4x4[16]) = { 0, 1, 5, 6, 2, 4, 7, 12, + 3, 8, 11, 13, 9, 10, 14, 15 }; DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_4x4[16]) = { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15, @@ -3201,535 +1664,385 @@ DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_32x32[1024]) = { }; const SCAN_ORDER av1_default_scan_orders[TX_SIZES] = { - { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors }, - { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors }, - { default_scan_16x16, av1_default_iscan_16x16, default_scan_16x16_neighbors }, - { default_scan_32x32, av1_default_iscan_32x32, default_scan_32x32_neighbors }, + { default_scan_4x4, av1_default_iscan_4x4 }, + { default_scan_8x8, av1_default_iscan_8x8 }, + { default_scan_16x16, av1_default_iscan_16x16 }, + { default_scan_32x32, av1_default_iscan_32x32 }, // Half of the coefficients of tx64 at higher frequencies are set to // zeros. So tx32's scan order is used. 
- { default_scan_32x32, av1_default_iscan_32x32, default_scan_32x32_neighbors }, + { default_scan_32x32, av1_default_iscan_32x32 }, }; const SCAN_ORDER av1_scan_orders[TX_SIZES_ALL][TX_TYPES] = { { // TX_4X4 - { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors }, - { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors }, - { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors }, - { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors }, - { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors }, - { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors }, - { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors }, - { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors }, - { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors }, - { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors }, - { mrow_scan_4x4, av1_mrow_iscan_4x4, mrow_scan_4x4_neighbors }, - { mcol_scan_4x4, av1_mcol_iscan_4x4, mcol_scan_4x4_neighbors }, - { mrow_scan_4x4, av1_mrow_iscan_4x4, mrow_scan_4x4_neighbors }, - { mcol_scan_4x4, av1_mcol_iscan_4x4, mcol_scan_4x4_neighbors }, - { mrow_scan_4x4, av1_mrow_iscan_4x4, mrow_scan_4x4_neighbors }, - { mcol_scan_4x4, av1_mcol_iscan_4x4, mcol_scan_4x4_neighbors }, + { default_scan_4x4, av1_default_iscan_4x4 }, + { default_scan_4x4, av1_default_iscan_4x4 }, + { default_scan_4x4, av1_default_iscan_4x4 }, + { default_scan_4x4, av1_default_iscan_4x4 }, + { default_scan_4x4, av1_default_iscan_4x4 }, + { default_scan_4x4, av1_default_iscan_4x4 }, + { default_scan_4x4, av1_default_iscan_4x4 }, + { default_scan_4x4, av1_default_iscan_4x4 }, + { default_scan_4x4, av1_default_iscan_4x4 }, + { default_scan_4x4, av1_default_iscan_4x4 }, + { mrow_scan_4x4, av1_mrow_iscan_4x4 }, + { mcol_scan_4x4, av1_mcol_iscan_4x4 }, + { mrow_scan_4x4, av1_mrow_iscan_4x4 }, + { mcol_scan_4x4, av1_mcol_iscan_4x4 }, + { 
mrow_scan_4x4, av1_mrow_iscan_4x4 }, + { mcol_scan_4x4, av1_mcol_iscan_4x4 }, }, { // TX_8X8 - { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors }, - { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors }, - { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors }, - { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors }, - { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors }, - { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors }, - { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors }, - { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors }, - { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors }, - { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors }, - { mrow_scan_8x8, av1_mrow_iscan_8x8, mrow_scan_8x8_neighbors }, - { mcol_scan_8x8, av1_mcol_iscan_8x8, mcol_scan_8x8_neighbors }, - { mrow_scan_8x8, av1_mrow_iscan_8x8, mrow_scan_8x8_neighbors }, - { mcol_scan_8x8, av1_mcol_iscan_8x8, mcol_scan_8x8_neighbors }, - { mrow_scan_8x8, av1_mrow_iscan_8x8, mrow_scan_8x8_neighbors }, - { mcol_scan_8x8, av1_mcol_iscan_8x8, mcol_scan_8x8_neighbors }, + { default_scan_8x8, av1_default_iscan_8x8 }, + { default_scan_8x8, av1_default_iscan_8x8 }, + { default_scan_8x8, av1_default_iscan_8x8 }, + { default_scan_8x8, av1_default_iscan_8x8 }, + { default_scan_8x8, av1_default_iscan_8x8 }, + { default_scan_8x8, av1_default_iscan_8x8 }, + { default_scan_8x8, av1_default_iscan_8x8 }, + { default_scan_8x8, av1_default_iscan_8x8 }, + { default_scan_8x8, av1_default_iscan_8x8 }, + { default_scan_8x8, av1_default_iscan_8x8 }, + { mrow_scan_8x8, av1_mrow_iscan_8x8 }, + { mcol_scan_8x8, av1_mcol_iscan_8x8 }, + { mrow_scan_8x8, av1_mrow_iscan_8x8 }, + { mcol_scan_8x8, av1_mcol_iscan_8x8 }, + { mrow_scan_8x8, av1_mrow_iscan_8x8 }, + { mcol_scan_8x8, av1_mcol_iscan_8x8 }, }, { // TX_16X16 - { default_scan_16x16, 
av1_default_iscan_16x16, - default_scan_16x16_neighbors }, - { default_scan_16x16, av1_default_iscan_16x16, - default_scan_16x16_neighbors }, - { default_scan_16x16, av1_default_iscan_16x16, - default_scan_16x16_neighbors }, - { default_scan_16x16, av1_default_iscan_16x16, - default_scan_16x16_neighbors }, - { default_scan_16x16, av1_default_iscan_16x16, - default_scan_16x16_neighbors }, - { default_scan_16x16, av1_default_iscan_16x16, - default_scan_16x16_neighbors }, - { default_scan_16x16, av1_default_iscan_16x16, - default_scan_16x16_neighbors }, - { default_scan_16x16, av1_default_iscan_16x16, - default_scan_16x16_neighbors }, - { default_scan_16x16, av1_default_iscan_16x16, - default_scan_16x16_neighbors }, - { default_scan_16x16, av1_default_iscan_16x16, - default_scan_16x16_neighbors }, - { mrow_scan_16x16, av1_mrow_iscan_16x16, mrow_scan_16x16_neighbors }, - { mcol_scan_16x16, av1_mcol_iscan_16x16, mcol_scan_16x16_neighbors }, - { mrow_scan_16x16, av1_mrow_iscan_16x16, mrow_scan_16x16_neighbors }, - { mcol_scan_16x16, av1_mcol_iscan_16x16, mcol_scan_16x16_neighbors }, - { mrow_scan_16x16, av1_mrow_iscan_16x16, mrow_scan_16x16_neighbors }, - { mcol_scan_16x16, av1_mcol_iscan_16x16, mcol_scan_16x16_neighbors }, + { default_scan_16x16, av1_default_iscan_16x16 }, + { default_scan_16x16, av1_default_iscan_16x16 }, + { default_scan_16x16, av1_default_iscan_16x16 }, + { default_scan_16x16, av1_default_iscan_16x16 }, + { default_scan_16x16, av1_default_iscan_16x16 }, + { default_scan_16x16, av1_default_iscan_16x16 }, + { default_scan_16x16, av1_default_iscan_16x16 }, + { default_scan_16x16, av1_default_iscan_16x16 }, + { default_scan_16x16, av1_default_iscan_16x16 }, + { default_scan_16x16, av1_default_iscan_16x16 }, + { mrow_scan_16x16, av1_mrow_iscan_16x16 }, + { mcol_scan_16x16, av1_mcol_iscan_16x16 }, + { mrow_scan_16x16, av1_mrow_iscan_16x16 }, + { mcol_scan_16x16, av1_mcol_iscan_16x16 }, + { mrow_scan_16x16, av1_mrow_iscan_16x16 }, + { mcol_scan_16x16, 
av1_mcol_iscan_16x16 }, }, { // TX_32X32 - { default_scan_32x32, av1_default_iscan_32x32, - default_scan_32x32_neighbors }, - { default_scan_32x32, av1_default_iscan_32x32, - default_scan_32x32_neighbors }, - { default_scan_32x32, av1_default_iscan_32x32, - default_scan_32x32_neighbors }, - { default_scan_32x32, av1_default_iscan_32x32, - default_scan_32x32_neighbors }, - { default_scan_32x32, av1_default_iscan_32x32, - default_scan_32x32_neighbors }, - { default_scan_32x32, av1_default_iscan_32x32, - default_scan_32x32_neighbors }, - { default_scan_32x32, av1_default_iscan_32x32, - default_scan_32x32_neighbors }, - { default_scan_32x32, av1_default_iscan_32x32, - default_scan_32x32_neighbors }, - { default_scan_32x32, av1_default_iscan_32x32, - default_scan_32x32_neighbors }, - { default_scan_32x32, av1_default_iscan_32x32, - default_scan_32x32_neighbors }, - { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors }, - { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors }, - { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors }, - { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors }, - { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors }, - { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { mrow_scan_32x32, av1_mrow_iscan_32x32 }, + { mcol_scan_32x32, av1_mcol_iscan_32x32 }, + { mrow_scan_32x32, av1_mrow_iscan_32x32 }, + { mcol_scan_32x32, av1_mcol_iscan_32x32 }, + { 
mrow_scan_32x32, av1_mrow_iscan_32x32 }, + { mcol_scan_32x32, av1_mcol_iscan_32x32 }, }, { // TX_64X64 // Half of the coefficients of tx64 at higher frequencies are set to // zeros. So tx32's scan order is used. - { default_scan_32x32, av1_default_iscan_32x32, - default_scan_32x32_neighbors }, - { default_scan_32x32, av1_default_iscan_32x32, - default_scan_32x32_neighbors }, - { default_scan_32x32, av1_default_iscan_32x32, - default_scan_32x32_neighbors }, - { default_scan_32x32, av1_default_iscan_32x32, - default_scan_32x32_neighbors }, - { default_scan_32x32, av1_default_iscan_32x32, - default_scan_32x32_neighbors }, - { default_scan_32x32, av1_default_iscan_32x32, - default_scan_32x32_neighbors }, - { default_scan_32x32, av1_default_iscan_32x32, - default_scan_32x32_neighbors }, - { default_scan_32x32, av1_default_iscan_32x32, - default_scan_32x32_neighbors }, - { default_scan_32x32, av1_default_iscan_32x32, - default_scan_32x32_neighbors }, - { default_scan_32x32, av1_default_iscan_32x32, - default_scan_32x32_neighbors }, - { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors }, - { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors }, - { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors }, - { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors }, - { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors }, - { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { 
mrow_scan_32x32, av1_mrow_iscan_32x32 }, + { mcol_scan_32x32, av1_mcol_iscan_32x32 }, + { mrow_scan_32x32, av1_mrow_iscan_32x32 }, + { mcol_scan_32x32, av1_mcol_iscan_32x32 }, + { mrow_scan_32x32, av1_mrow_iscan_32x32 }, + { mcol_scan_32x32, av1_mcol_iscan_32x32 }, }, { // TX_4X8 - { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors }, - { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors }, - { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors }, - { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors }, - { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors }, - { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors }, - { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors }, - { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors }, - { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors }, - { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors }, - { mrow_scan_4x8, av1_mrow_iscan_4x8, mrow_scan_4x8_neighbors }, - { mcol_scan_4x8, av1_mcol_iscan_4x8, mcol_scan_4x8_neighbors }, - { mrow_scan_4x8, av1_mrow_iscan_4x8, mrow_scan_4x8_neighbors }, - { mcol_scan_4x8, av1_mcol_iscan_4x8, mcol_scan_4x8_neighbors }, - { mrow_scan_4x8, av1_mrow_iscan_4x8, mrow_scan_4x8_neighbors }, - { mcol_scan_4x8, av1_mcol_iscan_4x8, mcol_scan_4x8_neighbors }, + { default_scan_4x8, av1_default_iscan_4x8 }, + { default_scan_4x8, av1_default_iscan_4x8 }, + { default_scan_4x8, av1_default_iscan_4x8 }, + { default_scan_4x8, av1_default_iscan_4x8 }, + { default_scan_4x8, av1_default_iscan_4x8 }, + { default_scan_4x8, av1_default_iscan_4x8 }, + { default_scan_4x8, av1_default_iscan_4x8 }, + { default_scan_4x8, av1_default_iscan_4x8 }, + { default_scan_4x8, av1_default_iscan_4x8 }, + { default_scan_4x8, av1_default_iscan_4x8 }, + { mrow_scan_4x8, av1_mrow_iscan_4x8 }, + { mcol_scan_4x8, av1_mcol_iscan_4x8 }, + { mrow_scan_4x8, 
av1_mrow_iscan_4x8 }, + { mcol_scan_4x8, av1_mcol_iscan_4x8 }, + { mrow_scan_4x8, av1_mrow_iscan_4x8 }, + { mcol_scan_4x8, av1_mcol_iscan_4x8 }, }, { // TX_8X4 - { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors }, - { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors }, - { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors }, - { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors }, - { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors }, - { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors }, - { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors }, - { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors }, - { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors }, - { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors }, - { mrow_scan_8x4, av1_mrow_iscan_8x4, mrow_scan_8x4_neighbors }, - { mcol_scan_8x4, av1_mcol_iscan_8x4, mcol_scan_8x4_neighbors }, - { mrow_scan_8x4, av1_mrow_iscan_8x4, mrow_scan_8x4_neighbors }, - { mcol_scan_8x4, av1_mcol_iscan_8x4, mcol_scan_8x4_neighbors }, - { mrow_scan_8x4, av1_mrow_iscan_8x4, mrow_scan_8x4_neighbors }, - { mcol_scan_8x4, av1_mcol_iscan_8x4, mcol_scan_8x4_neighbors }, + { default_scan_8x4, av1_default_iscan_8x4 }, + { default_scan_8x4, av1_default_iscan_8x4 }, + { default_scan_8x4, av1_default_iscan_8x4 }, + { default_scan_8x4, av1_default_iscan_8x4 }, + { default_scan_8x4, av1_default_iscan_8x4 }, + { default_scan_8x4, av1_default_iscan_8x4 }, + { default_scan_8x4, av1_default_iscan_8x4 }, + { default_scan_8x4, av1_default_iscan_8x4 }, + { default_scan_8x4, av1_default_iscan_8x4 }, + { default_scan_8x4, av1_default_iscan_8x4 }, + { mrow_scan_8x4, av1_mrow_iscan_8x4 }, + { mcol_scan_8x4, av1_mcol_iscan_8x4 }, + { mrow_scan_8x4, av1_mrow_iscan_8x4 }, + { mcol_scan_8x4, av1_mcol_iscan_8x4 }, + { mrow_scan_8x4, av1_mrow_iscan_8x4 }, + { mcol_scan_8x4, 
av1_mcol_iscan_8x4 }, }, { // TX_8X16 - { default_scan_8x16, av1_default_iscan_8x16, - default_scan_8x16_neighbors }, - { default_scan_8x16, av1_default_iscan_8x16, - default_scan_8x16_neighbors }, - { default_scan_8x16, av1_default_iscan_8x16, - default_scan_8x16_neighbors }, - { default_scan_8x16, av1_default_iscan_8x16, - default_scan_8x16_neighbors }, - { default_scan_8x16, av1_default_iscan_8x16, - default_scan_8x16_neighbors }, - { default_scan_8x16, av1_default_iscan_8x16, - default_scan_8x16_neighbors }, - { default_scan_8x16, av1_default_iscan_8x16, - default_scan_8x16_neighbors }, - { default_scan_8x16, av1_default_iscan_8x16, - default_scan_8x16_neighbors }, - { default_scan_8x16, av1_default_iscan_8x16, - default_scan_8x16_neighbors }, - { default_scan_8x16, av1_default_iscan_8x16, - default_scan_8x16_neighbors }, - { mrow_scan_8x16, av1_mrow_iscan_8x16, mrow_scan_8x16_neighbors }, - { mcol_scan_8x16, av1_mcol_iscan_8x16, mcol_scan_8x16_neighbors }, - { mrow_scan_8x16, av1_mrow_iscan_8x16, mrow_scan_8x16_neighbors }, - { mcol_scan_8x16, av1_mcol_iscan_8x16, mcol_scan_8x16_neighbors }, - { mrow_scan_8x16, av1_mrow_iscan_8x16, mrow_scan_8x16_neighbors }, - { mcol_scan_8x16, av1_mcol_iscan_8x16, mcol_scan_8x16_neighbors }, + { default_scan_8x16, av1_default_iscan_8x16 }, + { default_scan_8x16, av1_default_iscan_8x16 }, + { default_scan_8x16, av1_default_iscan_8x16 }, + { default_scan_8x16, av1_default_iscan_8x16 }, + { default_scan_8x16, av1_default_iscan_8x16 }, + { default_scan_8x16, av1_default_iscan_8x16 }, + { default_scan_8x16, av1_default_iscan_8x16 }, + { default_scan_8x16, av1_default_iscan_8x16 }, + { default_scan_8x16, av1_default_iscan_8x16 }, + { default_scan_8x16, av1_default_iscan_8x16 }, + { mrow_scan_8x16, av1_mrow_iscan_8x16 }, + { mcol_scan_8x16, av1_mcol_iscan_8x16 }, + { mrow_scan_8x16, av1_mrow_iscan_8x16 }, + { mcol_scan_8x16, av1_mcol_iscan_8x16 }, + { mrow_scan_8x16, av1_mrow_iscan_8x16 }, + { mcol_scan_8x16, av1_mcol_iscan_8x16 }, 
}, { // TX_16X8 - { default_scan_16x8, av1_default_iscan_16x8, - default_scan_16x8_neighbors }, - { default_scan_16x8, av1_default_iscan_16x8, - default_scan_16x8_neighbors }, - { default_scan_16x8, av1_default_iscan_16x8, - default_scan_16x8_neighbors }, - { default_scan_16x8, av1_default_iscan_16x8, - default_scan_16x8_neighbors }, - { default_scan_16x8, av1_default_iscan_16x8, - default_scan_16x8_neighbors }, - { default_scan_16x8, av1_default_iscan_16x8, - default_scan_16x8_neighbors }, - { default_scan_16x8, av1_default_iscan_16x8, - default_scan_16x8_neighbors }, - { default_scan_16x8, av1_default_iscan_16x8, - default_scan_16x8_neighbors }, - { default_scan_16x8, av1_default_iscan_16x8, - default_scan_16x8_neighbors }, - { default_scan_16x8, av1_default_iscan_16x8, - default_scan_16x8_neighbors }, - { mrow_scan_16x8, av1_mrow_iscan_16x8, mrow_scan_16x8_neighbors }, - { mcol_scan_16x8, av1_mcol_iscan_16x8, mcol_scan_16x8_neighbors }, - { mrow_scan_16x8, av1_mrow_iscan_16x8, mrow_scan_16x8_neighbors }, - { mcol_scan_16x8, av1_mcol_iscan_16x8, mcol_scan_16x8_neighbors }, - { mrow_scan_16x8, av1_mrow_iscan_16x8, mrow_scan_16x8_neighbors }, - { mcol_scan_16x8, av1_mcol_iscan_16x8, mcol_scan_16x8_neighbors }, + { default_scan_16x8, av1_default_iscan_16x8 }, + { default_scan_16x8, av1_default_iscan_16x8 }, + { default_scan_16x8, av1_default_iscan_16x8 }, + { default_scan_16x8, av1_default_iscan_16x8 }, + { default_scan_16x8, av1_default_iscan_16x8 }, + { default_scan_16x8, av1_default_iscan_16x8 }, + { default_scan_16x8, av1_default_iscan_16x8 }, + { default_scan_16x8, av1_default_iscan_16x8 }, + { default_scan_16x8, av1_default_iscan_16x8 }, + { default_scan_16x8, av1_default_iscan_16x8 }, + { mrow_scan_16x8, av1_mrow_iscan_16x8 }, + { mcol_scan_16x8, av1_mcol_iscan_16x8 }, + { mrow_scan_16x8, av1_mrow_iscan_16x8 }, + { mcol_scan_16x8, av1_mcol_iscan_16x8 }, + { mrow_scan_16x8, av1_mrow_iscan_16x8 }, + { mcol_scan_16x8, av1_mcol_iscan_16x8 }, }, { // TX_16X32 - { 
default_scan_16x32, av1_default_iscan_16x32, - default_scan_16x32_neighbors }, - { default_scan_16x32, av1_default_iscan_16x32, - default_scan_16x32_neighbors }, - { default_scan_16x32, av1_default_iscan_16x32, - default_scan_16x32_neighbors }, - { default_scan_16x32, av1_default_iscan_16x32, - default_scan_16x32_neighbors }, - { default_scan_16x32, av1_default_iscan_16x32, - default_scan_16x32_neighbors }, - { default_scan_16x32, av1_default_iscan_16x32, - default_scan_16x32_neighbors }, - { default_scan_16x32, av1_default_iscan_16x32, - default_scan_16x32_neighbors }, - { default_scan_16x32, av1_default_iscan_16x32, - default_scan_16x32_neighbors }, - { default_scan_16x32, av1_default_iscan_16x32, - default_scan_16x32_neighbors }, - { default_scan_16x32, av1_default_iscan_16x32, - default_scan_16x32_neighbors }, - { mrow_scan_16x32, av1_mrow_iscan_16x32, mrow_scan_16x32_neighbors }, - { mcol_scan_16x32, av1_mcol_iscan_16x32, mcol_scan_16x32_neighbors }, - { mrow_scan_16x32, av1_mrow_iscan_16x32, mrow_scan_16x32_neighbors }, - { mcol_scan_16x32, av1_mcol_iscan_16x32, mcol_scan_16x32_neighbors }, - { mrow_scan_16x32, av1_mrow_iscan_16x32, mrow_scan_16x32_neighbors }, - { mcol_scan_16x32, av1_mcol_iscan_16x32, mcol_scan_16x32_neighbors }, + { default_scan_16x32, av1_default_iscan_16x32 }, + { default_scan_16x32, av1_default_iscan_16x32 }, + { default_scan_16x32, av1_default_iscan_16x32 }, + { default_scan_16x32, av1_default_iscan_16x32 }, + { default_scan_16x32, av1_default_iscan_16x32 }, + { default_scan_16x32, av1_default_iscan_16x32 }, + { default_scan_16x32, av1_default_iscan_16x32 }, + { default_scan_16x32, av1_default_iscan_16x32 }, + { default_scan_16x32, av1_default_iscan_16x32 }, + { default_scan_16x32, av1_default_iscan_16x32 }, + { mrow_scan_16x32, av1_mrow_iscan_16x32 }, + { mcol_scan_16x32, av1_mcol_iscan_16x32 }, + { mrow_scan_16x32, av1_mrow_iscan_16x32 }, + { mcol_scan_16x32, av1_mcol_iscan_16x32 }, + { mrow_scan_16x32, av1_mrow_iscan_16x32 }, + { 
mcol_scan_16x32, av1_mcol_iscan_16x32 }, }, { // TX_32X16 - { default_scan_32x16, av1_default_iscan_32x16, - default_scan_32x16_neighbors }, - { default_scan_32x16, av1_default_iscan_32x16, - default_scan_32x16_neighbors }, - { default_scan_32x16, av1_default_iscan_32x16, - default_scan_32x16_neighbors }, - { default_scan_32x16, av1_default_iscan_32x16, - default_scan_32x16_neighbors }, - { default_scan_32x16, av1_default_iscan_32x16, - default_scan_32x16_neighbors }, - { default_scan_32x16, av1_default_iscan_32x16, - default_scan_32x16_neighbors }, - { default_scan_32x16, av1_default_iscan_32x16, - default_scan_32x16_neighbors }, - { default_scan_32x16, av1_default_iscan_32x16, - default_scan_32x16_neighbors }, - { default_scan_32x16, av1_default_iscan_32x16, - default_scan_32x16_neighbors }, - { default_scan_32x16, av1_default_iscan_32x16, - default_scan_32x16_neighbors }, - { mrow_scan_32x16, av1_mrow_iscan_32x16, mrow_scan_32x16_neighbors }, - { mcol_scan_32x16, av1_mcol_iscan_32x16, mcol_scan_32x16_neighbors }, - { mrow_scan_32x16, av1_mrow_iscan_32x16, mrow_scan_32x16_neighbors }, - { mcol_scan_32x16, av1_mcol_iscan_32x16, mcol_scan_32x16_neighbors }, - { mrow_scan_32x16, av1_mrow_iscan_32x16, mrow_scan_32x16_neighbors }, - { mcol_scan_32x16, av1_mcol_iscan_32x16, mcol_scan_32x16_neighbors }, + { default_scan_32x16, av1_default_iscan_32x16 }, + { default_scan_32x16, av1_default_iscan_32x16 }, + { default_scan_32x16, av1_default_iscan_32x16 }, + { default_scan_32x16, av1_default_iscan_32x16 }, + { default_scan_32x16, av1_default_iscan_32x16 }, + { default_scan_32x16, av1_default_iscan_32x16 }, + { default_scan_32x16, av1_default_iscan_32x16 }, + { default_scan_32x16, av1_default_iscan_32x16 }, + { default_scan_32x16, av1_default_iscan_32x16 }, + { default_scan_32x16, av1_default_iscan_32x16 }, + { mrow_scan_32x16, av1_mrow_iscan_32x16 }, + { mcol_scan_32x16, av1_mcol_iscan_32x16 }, + { mrow_scan_32x16, av1_mrow_iscan_32x16 }, + { mcol_scan_32x16, 
av1_mcol_iscan_32x16 }, + { mrow_scan_32x16, av1_mrow_iscan_32x16 }, + { mcol_scan_32x16, av1_mcol_iscan_32x16 }, }, { // TX_32X64 // Half of the coefficients of tx64 at higher frequencies are set to // zeros. So tx32's scan order is used. - { default_scan_32x32, av1_default_iscan_32x32, - default_scan_32x32_neighbors }, - { default_scan_32x32, av1_default_iscan_32x32, - default_scan_32x32_neighbors }, - { default_scan_32x32, av1_default_iscan_32x32, - default_scan_32x32_neighbors }, - { default_scan_32x32, av1_default_iscan_32x32, - default_scan_32x32_neighbors }, - { default_scan_32x32, av1_default_iscan_32x32, - default_scan_32x32_neighbors }, - { default_scan_32x32, av1_default_iscan_32x32, - default_scan_32x32_neighbors }, - { default_scan_32x32, av1_default_iscan_32x32, - default_scan_32x32_neighbors }, - { default_scan_32x32, av1_default_iscan_32x32, - default_scan_32x32_neighbors }, - { default_scan_32x32, av1_default_iscan_32x32, - default_scan_32x32_neighbors }, - { default_scan_32x32, av1_default_iscan_32x32, - default_scan_32x32_neighbors }, - { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors }, - { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors }, - { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors }, - { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors }, - { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors }, - { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, 
av1_default_iscan_32x32 }, + { mrow_scan_32x32, av1_mrow_iscan_32x32 }, + { mcol_scan_32x32, av1_mcol_iscan_32x32 }, + { mrow_scan_32x32, av1_mrow_iscan_32x32 }, + { mcol_scan_32x32, av1_mcol_iscan_32x32 }, + { mrow_scan_32x32, av1_mrow_iscan_32x32 }, + { mcol_scan_32x32, av1_mcol_iscan_32x32 }, }, { // TX_64X32 // Half of the coefficients of tx64 at higher frequencies are set to // zeros. So tx32's scan order is used. - { default_scan_32x32, av1_default_iscan_32x32, - default_scan_32x32_neighbors }, - { default_scan_32x32, av1_default_iscan_32x32, - default_scan_32x32_neighbors }, - { default_scan_32x32, av1_default_iscan_32x32, - default_scan_32x32_neighbors }, - { default_scan_32x32, av1_default_iscan_32x32, - default_scan_32x32_neighbors }, - { default_scan_32x32, av1_default_iscan_32x32, - default_scan_32x32_neighbors }, - { default_scan_32x32, av1_default_iscan_32x32, - default_scan_32x32_neighbors }, - { default_scan_32x32, av1_default_iscan_32x32, - default_scan_32x32_neighbors }, - { default_scan_32x32, av1_default_iscan_32x32, - default_scan_32x32_neighbors }, - { default_scan_32x32, av1_default_iscan_32x32, - default_scan_32x32_neighbors }, - { default_scan_32x32, av1_default_iscan_32x32, - default_scan_32x32_neighbors }, - { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors }, - { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors }, - { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors }, - { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors }, - { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors }, - { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { 
default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { mrow_scan_32x32, av1_mrow_iscan_32x32 }, + { mcol_scan_32x32, av1_mcol_iscan_32x32 }, + { mrow_scan_32x32, av1_mrow_iscan_32x32 }, + { mcol_scan_32x32, av1_mcol_iscan_32x32 }, + { mrow_scan_32x32, av1_mrow_iscan_32x32 }, + { mcol_scan_32x32, av1_mcol_iscan_32x32 }, }, { // TX_4X16 - { default_scan_4x16, av1_default_iscan_4x16, - default_scan_4x16_neighbors }, - { default_scan_4x16, av1_default_iscan_4x16, - default_scan_4x16_neighbors }, - { default_scan_4x16, av1_default_iscan_4x16, - default_scan_4x16_neighbors }, - { default_scan_4x16, av1_default_iscan_4x16, - default_scan_4x16_neighbors }, - { default_scan_4x16, av1_default_iscan_4x16, - default_scan_4x16_neighbors }, - { default_scan_4x16, av1_default_iscan_4x16, - default_scan_4x16_neighbors }, - { default_scan_4x16, av1_default_iscan_4x16, - default_scan_4x16_neighbors }, - { default_scan_4x16, av1_default_iscan_4x16, - default_scan_4x16_neighbors }, - { default_scan_4x16, av1_default_iscan_4x16, - default_scan_4x16_neighbors }, - { default_scan_4x16, av1_default_iscan_4x16, - default_scan_4x16_neighbors }, - { mrow_scan_4x16, av1_mrow_iscan_4x16, mrow_scan_4x16_neighbors }, - { mcol_scan_4x16, av1_mcol_iscan_4x16, mcol_scan_4x16_neighbors }, - { mrow_scan_4x16, av1_mrow_iscan_4x16, mrow_scan_4x16_neighbors }, - { mcol_scan_4x16, av1_mcol_iscan_4x16, mcol_scan_4x16_neighbors }, - { mrow_scan_4x16, av1_mrow_iscan_4x16, mrow_scan_4x16_neighbors }, - { mcol_scan_4x16, av1_mcol_iscan_4x16, mcol_scan_4x16_neighbors }, + { default_scan_4x16, av1_default_iscan_4x16 }, + { default_scan_4x16, av1_default_iscan_4x16 }, + { default_scan_4x16, av1_default_iscan_4x16 }, + { default_scan_4x16, av1_default_iscan_4x16 }, + { default_scan_4x16, av1_default_iscan_4x16 }, + { default_scan_4x16, av1_default_iscan_4x16 }, + { 
default_scan_4x16, av1_default_iscan_4x16 }, + { default_scan_4x16, av1_default_iscan_4x16 }, + { default_scan_4x16, av1_default_iscan_4x16 }, + { default_scan_4x16, av1_default_iscan_4x16 }, + { mrow_scan_4x16, av1_mrow_iscan_4x16 }, + { mcol_scan_4x16, av1_mcol_iscan_4x16 }, + { mrow_scan_4x16, av1_mrow_iscan_4x16 }, + { mcol_scan_4x16, av1_mcol_iscan_4x16 }, + { mrow_scan_4x16, av1_mrow_iscan_4x16 }, + { mcol_scan_4x16, av1_mcol_iscan_4x16 }, }, { // TX_16X4 - { default_scan_16x4, av1_default_iscan_16x4, - default_scan_16x4_neighbors }, - { default_scan_16x4, av1_default_iscan_16x4, - default_scan_16x4_neighbors }, - { default_scan_16x4, av1_default_iscan_16x4, - default_scan_16x4_neighbors }, - { default_scan_16x4, av1_default_iscan_16x4, - default_scan_16x4_neighbors }, - { default_scan_16x4, av1_default_iscan_16x4, - default_scan_16x4_neighbors }, - { default_scan_16x4, av1_default_iscan_16x4, - default_scan_16x4_neighbors }, - { default_scan_16x4, av1_default_iscan_16x4, - default_scan_16x4_neighbors }, - { default_scan_16x4, av1_default_iscan_16x4, - default_scan_16x4_neighbors }, - { default_scan_16x4, av1_default_iscan_16x4, - default_scan_16x4_neighbors }, - { default_scan_16x4, av1_default_iscan_16x4, - default_scan_16x4_neighbors }, - { mrow_scan_16x4, av1_mrow_iscan_16x4, mrow_scan_16x4_neighbors }, - { mcol_scan_16x4, av1_mcol_iscan_16x4, mcol_scan_16x4_neighbors }, - { mrow_scan_16x4, av1_mrow_iscan_16x4, mrow_scan_16x4_neighbors }, - { mcol_scan_16x4, av1_mcol_iscan_16x4, mcol_scan_16x4_neighbors }, - { mrow_scan_16x4, av1_mrow_iscan_16x4, mrow_scan_16x4_neighbors }, - { mcol_scan_16x4, av1_mcol_iscan_16x4, mcol_scan_16x4_neighbors }, + { default_scan_16x4, av1_default_iscan_16x4 }, + { default_scan_16x4, av1_default_iscan_16x4 }, + { default_scan_16x4, av1_default_iscan_16x4 }, + { default_scan_16x4, av1_default_iscan_16x4 }, + { default_scan_16x4, av1_default_iscan_16x4 }, + { default_scan_16x4, av1_default_iscan_16x4 }, + { default_scan_16x4, 
av1_default_iscan_16x4 }, + { default_scan_16x4, av1_default_iscan_16x4 }, + { default_scan_16x4, av1_default_iscan_16x4 }, + { default_scan_16x4, av1_default_iscan_16x4 }, + { mrow_scan_16x4, av1_mrow_iscan_16x4 }, + { mcol_scan_16x4, av1_mcol_iscan_16x4 }, + { mrow_scan_16x4, av1_mrow_iscan_16x4 }, + { mcol_scan_16x4, av1_mcol_iscan_16x4 }, + { mrow_scan_16x4, av1_mrow_iscan_16x4 }, + { mcol_scan_16x4, av1_mcol_iscan_16x4 }, }, { // TX_8X32 - { default_scan_8x32, av1_default_iscan_8x32, - default_scan_8x32_neighbors }, - { default_scan_8x32, av1_default_iscan_8x32, - default_scan_8x32_neighbors }, - { default_scan_8x32, av1_default_iscan_8x32, - default_scan_8x32_neighbors }, - { default_scan_8x32, av1_default_iscan_8x32, - default_scan_8x32_neighbors }, - { default_scan_8x32, av1_default_iscan_8x32, - default_scan_8x32_neighbors }, - { default_scan_8x32, av1_default_iscan_8x32, - default_scan_8x32_neighbors }, - { default_scan_8x32, av1_default_iscan_8x32, - default_scan_8x32_neighbors }, - { default_scan_8x32, av1_default_iscan_8x32, - default_scan_8x32_neighbors }, - { default_scan_8x32, av1_default_iscan_8x32, - default_scan_8x32_neighbors }, - { default_scan_8x32, av1_default_iscan_8x32, - default_scan_8x32_neighbors }, - { mrow_scan_8x32, av1_mrow_iscan_8x32, mrow_scan_8x32_neighbors }, - { mcol_scan_8x32, av1_mcol_iscan_8x32, mcol_scan_8x32_neighbors }, - { mrow_scan_8x32, av1_mrow_iscan_8x32, mrow_scan_8x32_neighbors }, - { mcol_scan_8x32, av1_mcol_iscan_8x32, mcol_scan_8x32_neighbors }, - { mrow_scan_8x32, av1_mrow_iscan_8x32, mrow_scan_8x32_neighbors }, - { mcol_scan_8x32, av1_mcol_iscan_8x32, mcol_scan_8x32_neighbors }, + { default_scan_8x32, av1_default_iscan_8x32 }, + { default_scan_8x32, av1_default_iscan_8x32 }, + { default_scan_8x32, av1_default_iscan_8x32 }, + { default_scan_8x32, av1_default_iscan_8x32 }, + { default_scan_8x32, av1_default_iscan_8x32 }, + { default_scan_8x32, av1_default_iscan_8x32 }, + { default_scan_8x32, 
av1_default_iscan_8x32 }, + { default_scan_8x32, av1_default_iscan_8x32 }, + { default_scan_8x32, av1_default_iscan_8x32 }, + { default_scan_8x32, av1_default_iscan_8x32 }, + { mrow_scan_8x32, av1_mrow_iscan_8x32 }, + { mcol_scan_8x32, av1_mcol_iscan_8x32 }, + { mrow_scan_8x32, av1_mrow_iscan_8x32 }, + { mcol_scan_8x32, av1_mcol_iscan_8x32 }, + { mrow_scan_8x32, av1_mrow_iscan_8x32 }, + { mcol_scan_8x32, av1_mcol_iscan_8x32 }, }, { // TX_32X8 - { default_scan_32x8, av1_default_iscan_32x8, - default_scan_32x8_neighbors }, - { default_scan_32x8, av1_default_iscan_32x8, - default_scan_32x8_neighbors }, - { default_scan_32x8, av1_default_iscan_32x8, - default_scan_32x8_neighbors }, - { default_scan_32x8, av1_default_iscan_32x8, - default_scan_32x8_neighbors }, - { default_scan_32x8, av1_default_iscan_32x8, - default_scan_32x8_neighbors }, - { default_scan_32x8, av1_default_iscan_32x8, - default_scan_32x8_neighbors }, - { default_scan_32x8, av1_default_iscan_32x8, - default_scan_32x8_neighbors }, - { default_scan_32x8, av1_default_iscan_32x8, - default_scan_32x8_neighbors }, - { default_scan_32x8, av1_default_iscan_32x8, - default_scan_32x8_neighbors }, - { default_scan_32x8, av1_default_iscan_32x8, - default_scan_32x8_neighbors }, - { mrow_scan_32x8, av1_mrow_iscan_32x8, mrow_scan_32x8_neighbors }, - { mcol_scan_32x8, av1_mcol_iscan_32x8, mcol_scan_32x8_neighbors }, - { mrow_scan_32x8, av1_mrow_iscan_32x8, mrow_scan_32x8_neighbors }, - { mcol_scan_32x8, av1_mcol_iscan_32x8, mcol_scan_32x8_neighbors }, - { mrow_scan_32x8, av1_mrow_iscan_32x8, mrow_scan_32x8_neighbors }, - { mcol_scan_32x8, av1_mcol_iscan_32x8, mcol_scan_32x8_neighbors }, + { default_scan_32x8, av1_default_iscan_32x8 }, + { default_scan_32x8, av1_default_iscan_32x8 }, + { default_scan_32x8, av1_default_iscan_32x8 }, + { default_scan_32x8, av1_default_iscan_32x8 }, + { default_scan_32x8, av1_default_iscan_32x8 }, + { default_scan_32x8, av1_default_iscan_32x8 }, + { default_scan_32x8, 
av1_default_iscan_32x8 }, + { default_scan_32x8, av1_default_iscan_32x8 }, + { default_scan_32x8, av1_default_iscan_32x8 }, + { default_scan_32x8, av1_default_iscan_32x8 }, + { mrow_scan_32x8, av1_mrow_iscan_32x8 }, + { mcol_scan_32x8, av1_mcol_iscan_32x8 }, + { mrow_scan_32x8, av1_mrow_iscan_32x8 }, + { mcol_scan_32x8, av1_mcol_iscan_32x8 }, + { mrow_scan_32x8, av1_mrow_iscan_32x8 }, + { mcol_scan_32x8, av1_mcol_iscan_32x8 }, }, { // TX_16X64 // Half of the coefficients of tx64 at higher frequencies are set to // zeros. So tx32's scan order is used. - { default_scan_16x32, av1_default_iscan_16x32, - default_scan_16x32_neighbors }, - { default_scan_16x32, av1_default_iscan_16x32, - default_scan_16x32_neighbors }, - { default_scan_16x32, av1_default_iscan_16x32, - default_scan_16x32_neighbors }, - { default_scan_16x32, av1_default_iscan_16x32, - default_scan_16x32_neighbors }, - { default_scan_16x32, av1_default_iscan_16x32, - default_scan_16x32_neighbors }, - { default_scan_16x32, av1_default_iscan_16x32, - default_scan_16x32_neighbors }, - { default_scan_16x32, av1_default_iscan_16x32, - default_scan_16x32_neighbors }, - { default_scan_16x32, av1_default_iscan_16x32, - default_scan_16x32_neighbors }, - { default_scan_16x32, av1_default_iscan_16x32, - default_scan_16x32_neighbors }, - { default_scan_16x32, av1_default_iscan_16x32, - default_scan_16x32_neighbors }, - { mrow_scan_16x32, av1_mrow_iscan_16x32, mrow_scan_16x32_neighbors }, - { mcol_scan_16x32, av1_mcol_iscan_16x32, mcol_scan_16x32_neighbors }, - { mrow_scan_16x32, av1_mrow_iscan_16x32, mrow_scan_16x32_neighbors }, - { mcol_scan_16x32, av1_mcol_iscan_16x32, mcol_scan_16x32_neighbors }, - { mrow_scan_16x32, av1_mrow_iscan_16x32, mrow_scan_16x32_neighbors }, - { mcol_scan_16x32, av1_mcol_iscan_16x32, mcol_scan_16x32_neighbors }, + { default_scan_16x32, av1_default_iscan_16x32 }, + { default_scan_16x32, av1_default_iscan_16x32 }, + { default_scan_16x32, av1_default_iscan_16x32 }, + { default_scan_16x32, 
av1_default_iscan_16x32 }, + { default_scan_16x32, av1_default_iscan_16x32 }, + { default_scan_16x32, av1_default_iscan_16x32 }, + { default_scan_16x32, av1_default_iscan_16x32 }, + { default_scan_16x32, av1_default_iscan_16x32 }, + { default_scan_16x32, av1_default_iscan_16x32 }, + { default_scan_16x32, av1_default_iscan_16x32 }, + { mrow_scan_16x32, av1_mrow_iscan_16x32 }, + { mcol_scan_16x32, av1_mcol_iscan_16x32 }, + { mrow_scan_16x32, av1_mrow_iscan_16x32 }, + { mcol_scan_16x32, av1_mcol_iscan_16x32 }, + { mrow_scan_16x32, av1_mrow_iscan_16x32 }, + { mcol_scan_16x32, av1_mcol_iscan_16x32 }, }, { // TX_64X16 // Half of the coefficients of tx64 at higher frequencies are set to // zeros. So tx32's scan order is used. - { default_scan_32x16, av1_default_iscan_32x16, - default_scan_32x16_neighbors }, - { default_scan_32x16, av1_default_iscan_32x16, - default_scan_32x16_neighbors }, - { default_scan_32x16, av1_default_iscan_32x16, - default_scan_32x16_neighbors }, - { default_scan_32x16, av1_default_iscan_32x16, - default_scan_32x16_neighbors }, - { default_scan_32x16, av1_default_iscan_32x16, - default_scan_32x16_neighbors }, - { default_scan_32x16, av1_default_iscan_32x16, - default_scan_32x16_neighbors }, - { default_scan_32x16, av1_default_iscan_32x16, - default_scan_32x16_neighbors }, - { default_scan_32x16, av1_default_iscan_32x16, - default_scan_32x16_neighbors }, - { default_scan_32x16, av1_default_iscan_32x16, - default_scan_32x16_neighbors }, - { default_scan_32x16, av1_default_iscan_32x16, - default_scan_32x16_neighbors }, - { mrow_scan_32x16, av1_mrow_iscan_32x16, mrow_scan_32x16_neighbors }, - { mcol_scan_32x16, av1_mcol_iscan_32x16, mcol_scan_32x16_neighbors }, - { mrow_scan_32x16, av1_mrow_iscan_32x16, mrow_scan_32x16_neighbors }, - { mcol_scan_32x16, av1_mcol_iscan_32x16, mcol_scan_32x16_neighbors }, - { mrow_scan_32x16, av1_mrow_iscan_32x16, mrow_scan_32x16_neighbors }, - { mcol_scan_32x16, av1_mcol_iscan_32x16, mcol_scan_32x16_neighbors }, + { 
default_scan_32x16, av1_default_iscan_32x16 }, + { default_scan_32x16, av1_default_iscan_32x16 }, + { default_scan_32x16, av1_default_iscan_32x16 }, + { default_scan_32x16, av1_default_iscan_32x16 }, + { default_scan_32x16, av1_default_iscan_32x16 }, + { default_scan_32x16, av1_default_iscan_32x16 }, + { default_scan_32x16, av1_default_iscan_32x16 }, + { default_scan_32x16, av1_default_iscan_32x16 }, + { default_scan_32x16, av1_default_iscan_32x16 }, + { default_scan_32x16, av1_default_iscan_32x16 }, + { mrow_scan_32x16, av1_mrow_iscan_32x16 }, + { mcol_scan_32x16, av1_mcol_iscan_32x16 }, + { mrow_scan_32x16, av1_mrow_iscan_32x16 }, + { mcol_scan_32x16, av1_mcol_iscan_32x16 }, + { mrow_scan_32x16, av1_mrow_iscan_32x16 }, + { mcol_scan_32x16, av1_mcol_iscan_32x16 }, }, }; diff --git a/media/libaom/src/av1/common/scan.h b/media/libaom/src/av1/common/scan.h index 233dc0efa..d9620e1c5 100644 --- a/media/libaom/src/av1/common/scan.h +++ b/media/libaom/src/av1/common/scan.h @@ -15,9 +15,9 @@ #include "aom/aom_integer.h" #include "aom_ports/mem.h" -#include "av1/common/enums.h" -#include "av1/common/onyxc_int.h" +#include "av1/common/av1_common_int.h" #include "av1/common/blockd.h" +#include "av1/common/enums.h" #ifdef __cplusplus extern "C" { @@ -25,14 +25,14 @@ extern "C" { #define MAX_NEIGHBORS 2 -typedef enum SCAN_MODE { +enum { SCAN_MODE_ZIG_ZAG, SCAN_MODE_COL_DIAG, SCAN_MODE_ROW_DIAG, SCAN_MODE_COL_1D, SCAN_MODE_ROW_1D, SCAN_MODES -} SCAN_MODE; +} UENUM1BYTE(SCAN_MODE); extern const SCAN_ORDER av1_default_scan_orders[TX_SIZES]; extern const SCAN_ORDER av1_scan_orders[TX_SIZES_ALL][TX_TYPES]; diff --git a/media/libaom/src/av1/common/seg_common.c b/media/libaom/src/av1/common/seg_common.c index cd189ad76..60b185161 100644 --- a/media/libaom/src/av1/common/seg_common.c +++ b/media/libaom/src/av1/common/seg_common.c @@ -16,12 +16,19 @@ #include "av1/common/seg_common.h" #include "av1/common/quant_common.h" -static const int seg_feature_data_signed[SEG_LVL_MAX] = { 1, 1, 
1, 1, 1, 0, 0 }; - -static const int seg_feature_data_max[SEG_LVL_MAX] = { - MAXQ, MAX_LOOP_FILTER, MAX_LOOP_FILTER, MAX_LOOP_FILTER, MAX_LOOP_FILTER, 7, 0 +static const int seg_feature_data_signed[SEG_LVL_MAX] = { + 1, 1, 1, 1, 1, 0, 0, 0 }; +static const int seg_feature_data_max[SEG_LVL_MAX] = { MAXQ, + MAX_LOOP_FILTER, + MAX_LOOP_FILTER, + MAX_LOOP_FILTER, + MAX_LOOP_FILTER, + 7, + 0, + 0 }; + // These functions provide access to new segment level features. // Eventually these function may be "optimized out" but for the moment, // the coding mechanism is still subject to change so these provide a @@ -32,7 +39,7 @@ void av1_clearall_segfeatures(struct segmentation *seg) { av1_zero(seg->feature_mask); } -void calculate_segdata(struct segmentation *seg) { +void av1_calculate_segdata(struct segmentation *seg) { seg->segid_preskip = 0; seg->last_active_segid = 0; for (int i = 0; i < MAX_SEGMENTS; i++) { diff --git a/media/libaom/src/av1/common/seg_common.h b/media/libaom/src/av1/common/seg_common.h index 8c35bba86..aeb9c1768 100644 --- a/media/libaom/src/av1/common/seg_common.h +++ b/media/libaom/src/av1/common/seg_common.h @@ -24,7 +24,7 @@ extern "C" { #define SEG_TEMPORAL_PRED_CTXS 3 #define SPATIAL_PREDICTION_PROBS 3 -typedef enum { +enum { SEG_LVL_ALT_Q, // Use alternate Quantizer .... 
SEG_LVL_ALT_LF_Y_V, // Use alternate loop filter value on y plane vertical SEG_LVL_ALT_LF_Y_H, // Use alternate loop filter value on y plane horizontal @@ -34,7 +34,7 @@ typedef enum { SEG_LVL_SKIP, // Optional Segment (0,0) + skip mode SEG_LVL_GLOBALMV, SEG_LVL_MAX -} SEG_LVL_FEATURES; +} UENUM1BYTE(SEG_LVL_FEATURES); struct segmentation { uint8_t enabled; @@ -83,7 +83,7 @@ void av1_clearall_segfeatures(struct segmentation *seg); void av1_enable_segfeature(struct segmentation *seg, int segment_id, SEG_LVL_FEATURES feature_id); -void calculate_segdata(struct segmentation *seg); +void av1_calculate_segdata(struct segmentation *seg); int av1_seg_feature_data_max(SEG_LVL_FEATURES feature_id); diff --git a/media/libaom/src/av1/common/thread_common.c b/media/libaom/src/av1/common/thread_common.c index 8df4c9a09..f3c8795f8 100644 --- a/media/libaom/src/av1/common/thread_common.c +++ b/media/libaom/src/av1/common/thread_common.c @@ -205,7 +205,11 @@ static INLINE void sync_write(AV1LfSync *const lf_sync, int r, int c, } static void enqueue_lf_jobs(AV1LfSync *lf_sync, AV1_COMMON *cm, int start, - int stop, int plane_start, int plane_end) { + int stop, +#if CONFIG_LPF_MASK + int is_decoding, +#endif + int plane_start, int plane_end) { int mi_row, plane, dir; AV1LfMTInfo *lf_job_queue = lf_sync->job_queue; lf_sync->jobs_enqueued = 0; @@ -219,7 +223,16 @@ static void enqueue_lf_jobs(AV1LfSync *lf_sync, AV1_COMMON *cm, int start, continue; else if (plane == 2 && !(cm->lf.filter_level_v)) continue; - for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) { +#if CONFIG_LPF_MASK + int step = MAX_MIB_SIZE; + if (is_decoding) { + step = MI_SIZE_64X64; + } + for (mi_row = start; mi_row < stop; mi_row += step) +#else + for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) +#endif + { lf_job_queue->mi_row = mi_row; lf_job_queue->plane = plane; lf_job_queue->dir = dir; @@ -230,7 +243,7 @@ static void enqueue_lf_jobs(AV1LfSync *lf_sync, AV1_COMMON *cm, int start, } } 
-AV1LfMTInfo *get_lf_job_info(AV1LfSync *lf_sync) { +static AV1LfMTInfo *get_lf_job_info(AV1LfSync *lf_sync) { AV1LfMTInfo *cur_job_info = NULL; #if CONFIG_MULTITHREAD @@ -255,7 +268,8 @@ static INLINE void thread_loop_filter_rows( struct macroblockd_plane *planes, MACROBLOCKD *xd, AV1LfSync *const lf_sync) { const int sb_cols = - ALIGN_POWER_OF_TWO(cm->mi_cols, MAX_MIB_SIZE_LOG2) >> MAX_MIB_SIZE_LOG2; + ALIGN_POWER_OF_TWO(cm->mi_params.mi_cols, MAX_MIB_SIZE_LOG2) >> + MAX_MIB_SIZE_LOG2; int mi_row, mi_col, plane, dir; int r, c; @@ -269,7 +283,8 @@ static INLINE void thread_loop_filter_rows( r = mi_row >> MAX_MIB_SIZE_LOG2; if (dir == 0) { - for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MAX_MIB_SIZE) { + for (mi_col = 0; mi_col < cm->mi_params.mi_cols; + mi_col += MAX_MIB_SIZE) { c = mi_col >> MAX_MIB_SIZE_LOG2; av1_setup_dst_planes(planes, cm->seq_params.sb_size, frame_buffer, @@ -280,7 +295,8 @@ static INLINE void thread_loop_filter_rows( sync_write(lf_sync, r, c, sb_cols, plane); } } else if (dir == 1) { - for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MAX_MIB_SIZE) { + for (mi_col = 0; mi_col < cm->mi_params.mi_cols; + mi_col += MAX_MIB_SIZE) { c = mi_col >> MAX_MIB_SIZE_LOG2; // Wait for vertical edge filtering of the top-right block to be @@ -312,15 +328,98 @@ static int loop_filter_row_worker(void *arg1, void *arg2) { return 1; } +#if CONFIG_LPF_MASK +static INLINE void thread_loop_filter_bitmask_rows( + const YV12_BUFFER_CONFIG *const frame_buffer, AV1_COMMON *const cm, + struct macroblockd_plane *planes, MACROBLOCKD *xd, + AV1LfSync *const lf_sync) { + const int sb_cols = + ALIGN_POWER_OF_TWO(cm->mi_params.mi_cols, MIN_MIB_SIZE_LOG2) >> + MIN_MIB_SIZE_LOG2; + int mi_row, mi_col, plane, dir; + int r, c; + (void)xd; + + while (1) { + AV1LfMTInfo *cur_job_info = get_lf_job_info(lf_sync); + + if (cur_job_info != NULL) { + mi_row = cur_job_info->mi_row; + plane = cur_job_info->plane; + dir = cur_job_info->dir; + r = mi_row >> MIN_MIB_SIZE_LOG2; + + if (dir 
== 0) { + for (mi_col = 0; mi_col < cm->mi_params.mi_cols; + mi_col += MI_SIZE_64X64) { + c = mi_col >> MIN_MIB_SIZE_LOG2; + + av1_setup_dst_planes(planes, BLOCK_64X64, frame_buffer, mi_row, + mi_col, plane, plane + 1); + + av1_filter_block_plane_bitmask_vert(cm, &planes[plane], plane, mi_row, + mi_col); + sync_write(lf_sync, r, c, sb_cols, plane); + } + } else if (dir == 1) { + for (mi_col = 0; mi_col < cm->mi_params.mi_cols; + mi_col += MI_SIZE_64X64) { + c = mi_col >> MIN_MIB_SIZE_LOG2; + + // Wait for vertical edge filtering of the top-right block to be + // completed + sync_read(lf_sync, r, c, plane); + + // Wait for vertical edge filtering of the right block to be + // completed + sync_read(lf_sync, r + 1, c, plane); + + av1_setup_dst_planes(planes, BLOCK_64X64, frame_buffer, mi_row, + mi_col, plane, plane + 1); + av1_filter_block_plane_bitmask_horz(cm, &planes[plane], plane, mi_row, + mi_col); + } + } + } else { + break; + } + } +} + +// Row-based multi-threaded loopfilter hook +static int loop_filter_bitmask_row_worker(void *arg1, void *arg2) { + AV1LfSync *const lf_sync = (AV1LfSync *)arg1; + LFWorkerData *const lf_data = (LFWorkerData *)arg2; + thread_loop_filter_bitmask_rows(lf_data->frame_buffer, lf_data->cm, + lf_data->planes, lf_data->xd, lf_sync); + return 1; +} +#endif // CONFIG_LPF_MASK + static void loop_filter_rows_mt(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, MACROBLOCKD *xd, int start, int stop, int plane_start, int plane_end, +#if CONFIG_LPF_MASK + int is_decoding, +#endif AVxWorker *workers, int nworkers, AV1LfSync *lf_sync) { const AVxWorkerInterface *const winterface = aom_get_worker_interface(); +#if CONFIG_LPF_MASK + int sb_rows; + if (is_decoding) { + sb_rows = ALIGN_POWER_OF_TWO(cm->mi_params.mi_rows, MIN_MIB_SIZE_LOG2) >> + MIN_MIB_SIZE_LOG2; + } else { + sb_rows = ALIGN_POWER_OF_TWO(cm->mi_params.mi_rows, MAX_MIB_SIZE_LOG2) >> + MAX_MIB_SIZE_LOG2; + } +#else // Number of superblock rows and cols const int sb_rows = - 
ALIGN_POWER_OF_TWO(cm->mi_rows, MAX_MIB_SIZE_LOG2) >> MAX_MIB_SIZE_LOG2; + ALIGN_POWER_OF_TWO(cm->mi_params.mi_rows, MAX_MIB_SIZE_LOG2) >> + MAX_MIB_SIZE_LOG2; +#endif const int num_workers = nworkers; int i; @@ -336,14 +435,26 @@ static void loop_filter_rows_mt(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, sizeof(*(lf_sync->cur_sb_col[i])) * sb_rows); } - enqueue_lf_jobs(lf_sync, cm, start, stop, plane_start, plane_end); + enqueue_lf_jobs(lf_sync, cm, start, stop, +#if CONFIG_LPF_MASK + is_decoding, +#endif + plane_start, plane_end); // Set up loopfilter thread data. for (i = 0; i < num_workers; ++i) { AVxWorker *const worker = &workers[i]; LFWorkerData *const lf_data = &lf_sync->lfdata[i]; +#if CONFIG_LPF_MASK + if (is_decoding) { + worker->hook = loop_filter_bitmask_row_worker; + } else { + worker->hook = loop_filter_row_worker; + } +#else worker->hook = loop_filter_row_worker; +#endif worker->data1 = lf_sync; worker->data2 = lf_data; @@ -366,22 +477,55 @@ static void loop_filter_rows_mt(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, void av1_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, MACROBLOCKD *xd, int plane_start, int plane_end, - int partial_frame, AVxWorker *workers, - int num_workers, AV1LfSync *lf_sync) { + int partial_frame, +#if CONFIG_LPF_MASK + int is_decoding, +#endif + AVxWorker *workers, int num_workers, + AV1LfSync *lf_sync) { int start_mi_row, end_mi_row, mi_rows_to_filter; start_mi_row = 0; - mi_rows_to_filter = cm->mi_rows; - if (partial_frame && cm->mi_rows > 8) { - start_mi_row = cm->mi_rows >> 1; + mi_rows_to_filter = cm->mi_params.mi_rows; + if (partial_frame && cm->mi_params.mi_rows > 8) { + start_mi_row = cm->mi_params.mi_rows >> 1; start_mi_row &= 0xfffffff8; - mi_rows_to_filter = AOMMAX(cm->mi_rows / 8, 8); + mi_rows_to_filter = AOMMAX(cm->mi_params.mi_rows / 8, 8); } end_mi_row = start_mi_row + mi_rows_to_filter; av1_loop_filter_frame_init(cm, plane_start, plane_end); +#if CONFIG_LPF_MASK + if (is_decoding) { + 
cm->is_decoding = is_decoding; + // TODO(chengchen): currently use one thread to build bitmasks for the + // frame. Make it support multi-thread later. + for (int plane = plane_start; plane < plane_end; plane++) { + if (plane == 0 && !(cm->lf.filter_level[0]) && !(cm->lf.filter_level[1])) + break; + else if (plane == 1 && !(cm->lf.filter_level_u)) + continue; + else if (plane == 2 && !(cm->lf.filter_level_v)) + continue; + + // TODO(chengchen): can we remove this? + struct macroblockd_plane *pd = xd->plane; + av1_setup_dst_planes(pd, cm->seq_params.sb_size, frame, 0, 0, plane, + plane + 1); + + av1_build_bitmask_vert_info(cm, &pd[plane], plane); + av1_build_bitmask_horz_info(cm, &pd[plane], plane); + } + loop_filter_rows_mt(frame, cm, xd, start_mi_row, end_mi_row, plane_start, + plane_end, 1, workers, num_workers, lf_sync); + } else { + loop_filter_rows_mt(frame, cm, xd, start_mi_row, end_mi_row, plane_start, + plane_end, 0, workers, num_workers, lf_sync); + } +#else loop_filter_rows_mt(frame, cm, xd, start_mi_row, end_mi_row, plane_start, plane_end, workers, num_workers, lf_sync); +#endif } static INLINE void lr_sync_read(void *const lr_sync, int r, int c, int plane) { @@ -630,7 +774,7 @@ static void enqueue_lr_jobs(AV1LrSync *lr_sync, AV1LrStruct *lr_ctxt, } } -AV1LrMTInfo *get_lr_job_info(AV1LrSync *lr_sync) { +static AV1LrMTInfo *get_lr_job_info(AV1LrSync *lr_sync) { AV1LrMTInfo *cur_job_info = NULL; #if CONFIG_MULTITHREAD @@ -664,9 +808,9 @@ static int loop_restoration_row_worker(void *arg1, void *arg2) { typedef void (*copy_fun)(const YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int hstart, int hend, int vstart, int vend); - static const copy_fun copy_funs[3] = { - aom_yv12_partial_copy_y, aom_yv12_partial_copy_u, aom_yv12_partial_copy_v - }; + static const copy_fun copy_funs[3] = { aom_yv12_partial_coloc_copy_y, + aom_yv12_partial_coloc_copy_u, + aom_yv12_partial_coloc_copy_v }; while (1) { AV1LrMTInfo *cur_job_info = get_lr_job_info(lr_sync); 
@@ -772,7 +916,7 @@ void av1_loop_restoration_filter_frame_mt(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, int optimized_lr, AVxWorker *workers, int num_workers, AV1LrSync *lr_sync, void *lr_ctxt) { - assert(!cm->all_lossless); + assert(!cm->features.all_lossless); const int num_planes = av1_num_planes(cm); diff --git a/media/libaom/src/av1/common/thread_common.h b/media/libaom/src/av1/common/thread_common.h index 23d61d72a..7397f1c54 100644 --- a/media/libaom/src/av1/common/thread_common.h +++ b/media/libaom/src/av1/common/thread_common.h @@ -101,8 +101,11 @@ typedef struct AV1LrSyncData { void av1_loop_filter_dealloc(AV1LfSync *lf_sync); void av1_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame, struct AV1Common *cm, - struct macroblockd *mbd, int plane_start, + struct macroblockd *xd, int plane_start, int plane_end, int partial_frame, +#if CONFIG_LPF_MASK + int is_decoding, +#endif AVxWorker *workers, int num_workers, AV1LfSync *lf_sync); void av1_loop_restoration_filter_frame_mt(YV12_BUFFER_CONFIG *frame, diff --git a/media/libaom/src/av1/common/tile_common.c b/media/libaom/src/av1/common/tile_common.c index 1b413487f..1b11bd760 100644 --- a/media/libaom/src/av1/common/tile_common.c +++ b/media/libaom/src/av1/common/tile_common.c @@ -9,9 +9,9 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ -#include "av1/common/tile_common.h" -#include "av1/common/onyxc_int.h" +#include "av1/common/av1_common_int.h" #include "av1/common/resize.h" +#include "av1/common/tile_common.h" #include "aom_dsp/aom_dsp_common.h" void av1_tile_init(TileInfo *tile, const AV1_COMMON *cm, int row, int col) { @@ -28,102 +28,126 @@ static int tile_log2(int blk_size, int target) { } void av1_get_tile_limits(AV1_COMMON *const cm) { - int mi_cols = ALIGN_POWER_OF_TWO(cm->mi_cols, cm->seq_params.mib_size_log2); - int mi_rows = ALIGN_POWER_OF_TWO(cm->mi_rows, cm->seq_params.mib_size_log2); - int sb_cols = mi_cols >> cm->seq_params.mib_size_log2; - int sb_rows = mi_rows >> cm->seq_params.mib_size_log2; - - int sb_size_log2 = cm->seq_params.mib_size_log2 + MI_SIZE_LOG2; - cm->max_tile_width_sb = MAX_TILE_WIDTH >> sb_size_log2; - int max_tile_area_sb = MAX_TILE_AREA >> (2 * sb_size_log2); - - cm->min_log2_tile_cols = tile_log2(cm->max_tile_width_sb, sb_cols); - cm->max_log2_tile_cols = tile_log2(1, AOMMIN(sb_cols, MAX_TILE_COLS)); - cm->max_log2_tile_rows = tile_log2(1, AOMMIN(sb_rows, MAX_TILE_ROWS)); - cm->min_log2_tiles = tile_log2(max_tile_area_sb, sb_cols * sb_rows); - cm->min_log2_tiles = AOMMAX(cm->min_log2_tiles, cm->min_log2_tile_cols); + const SequenceHeader *const seq_params = &cm->seq_params; + CommonTileParams *const tiles = &cm->tiles; + const int mi_cols = + ALIGN_POWER_OF_TWO(cm->mi_params.mi_cols, seq_params->mib_size_log2); + const int mi_rows = + ALIGN_POWER_OF_TWO(cm->mi_params.mi_rows, seq_params->mib_size_log2); + const int sb_cols = mi_cols >> seq_params->mib_size_log2; + const int sb_rows = mi_rows >> seq_params->mib_size_log2; + + const int sb_size_log2 = seq_params->mib_size_log2 + MI_SIZE_LOG2; + tiles->max_width_sb = MAX_TILE_WIDTH >> sb_size_log2; + const int max_tile_area_sb = MAX_TILE_AREA >> (2 * sb_size_log2); + + tiles->min_log2_cols = tile_log2(tiles->max_width_sb, sb_cols); + tiles->max_log2_cols = tile_log2(1, AOMMIN(sb_cols, MAX_TILE_COLS)); + 
tiles->max_log2_rows = tile_log2(1, AOMMIN(sb_rows, MAX_TILE_ROWS)); + tiles->min_log2 = tile_log2(max_tile_area_sb, sb_cols * sb_rows); + tiles->min_log2 = AOMMAX(tiles->min_log2, tiles->min_log2_cols); } -void av1_calculate_tile_cols(AV1_COMMON *const cm) { - int mi_cols = ALIGN_POWER_OF_TWO(cm->mi_cols, cm->seq_params.mib_size_log2); - int mi_rows = ALIGN_POWER_OF_TWO(cm->mi_rows, cm->seq_params.mib_size_log2); - int sb_cols = mi_cols >> cm->seq_params.mib_size_log2; - int sb_rows = mi_rows >> cm->seq_params.mib_size_log2; +void av1_calculate_tile_cols(const SequenceHeader *const seq_params, + int cm_mi_rows, int cm_mi_cols, + CommonTileParams *const tiles) { + int mi_cols = ALIGN_POWER_OF_TWO(cm_mi_cols, seq_params->mib_size_log2); + int mi_rows = ALIGN_POWER_OF_TWO(cm_mi_rows, seq_params->mib_size_log2); + int sb_cols = mi_cols >> seq_params->mib_size_log2; + int sb_rows = mi_rows >> seq_params->mib_size_log2; int i; - if (cm->uniform_tile_spacing_flag) { + // This will be overridden if there is at least two columns of tiles + // (otherwise there is no inner tile width) + tiles->min_inner_width = -1; + + if (tiles->uniform_spacing) { int start_sb; - int size_sb = ALIGN_POWER_OF_TWO(sb_cols, cm->log2_tile_cols); - size_sb >>= cm->log2_tile_cols; + int size_sb = ALIGN_POWER_OF_TWO(sb_cols, tiles->log2_cols); + size_sb >>= tiles->log2_cols; assert(size_sb > 0); for (i = 0, start_sb = 0; start_sb < sb_cols; i++) { - cm->tile_col_start_sb[i] = start_sb; + tiles->col_start_sb[i] = start_sb; start_sb += size_sb; } - cm->tile_cols = i; - cm->tile_col_start_sb[i] = sb_cols; - cm->min_log2_tile_rows = AOMMAX(cm->min_log2_tiles - cm->log2_tile_cols, 0); - cm->max_tile_height_sb = sb_rows >> cm->min_log2_tile_rows; - - cm->tile_width = size_sb << cm->seq_params.mib_size_log2; - cm->tile_width = AOMMIN(cm->tile_width, cm->mi_cols); + tiles->cols = i; + tiles->col_start_sb[i] = sb_cols; + tiles->min_log2_rows = AOMMAX(tiles->min_log2 - tiles->log2_cols, 0); + 
tiles->max_height_sb = sb_rows >> tiles->min_log2_rows; + + tiles->width = size_sb << seq_params->mib_size_log2; + tiles->width = AOMMIN(tiles->width, cm_mi_cols); + if (tiles->cols > 1) { + tiles->min_inner_width = tiles->width; + } } else { int max_tile_area_sb = (sb_rows * sb_cols); int widest_tile_sb = 1; - cm->log2_tile_cols = tile_log2(1, cm->tile_cols); - for (i = 0; i < cm->tile_cols; i++) { - int size_sb = cm->tile_col_start_sb[i + 1] - cm->tile_col_start_sb[i]; + int narrowest_inner_tile_sb = 65536; + tiles->log2_cols = tile_log2(1, tiles->cols); + for (i = 0; i < tiles->cols; i++) { + int size_sb = tiles->col_start_sb[i + 1] - tiles->col_start_sb[i]; widest_tile_sb = AOMMAX(widest_tile_sb, size_sb); + // ignore the rightmost tile in frame for determining the narrowest + if (i < tiles->cols - 1) + narrowest_inner_tile_sb = AOMMIN(narrowest_inner_tile_sb, size_sb); } - if (cm->min_log2_tiles) { - max_tile_area_sb >>= (cm->min_log2_tiles + 1); + if (tiles->min_log2) { + max_tile_area_sb >>= (tiles->min_log2 + 1); + } + tiles->max_height_sb = AOMMAX(max_tile_area_sb / widest_tile_sb, 1); + if (tiles->cols > 1) { + tiles->min_inner_width = narrowest_inner_tile_sb + << seq_params->mib_size_log2; } - cm->max_tile_height_sb = AOMMAX(max_tile_area_sb / widest_tile_sb, 1); } } -void av1_calculate_tile_rows(AV1_COMMON *const cm) { - int mi_rows = ALIGN_POWER_OF_TWO(cm->mi_rows, cm->seq_params.mib_size_log2); - int sb_rows = mi_rows >> cm->seq_params.mib_size_log2; +void av1_calculate_tile_rows(const SequenceHeader *const seq_params, + int cm_mi_rows, CommonTileParams *const tiles) { + int mi_rows = ALIGN_POWER_OF_TWO(cm_mi_rows, seq_params->mib_size_log2); + int sb_rows = mi_rows >> seq_params->mib_size_log2; int start_sb, size_sb, i; - if (cm->uniform_tile_spacing_flag) { - size_sb = ALIGN_POWER_OF_TWO(sb_rows, cm->log2_tile_rows); - size_sb >>= cm->log2_tile_rows; + if (tiles->uniform_spacing) { + size_sb = ALIGN_POWER_OF_TWO(sb_rows, tiles->log2_rows); + size_sb 
>>= tiles->log2_rows; assert(size_sb > 0); for (i = 0, start_sb = 0; start_sb < sb_rows; i++) { - cm->tile_row_start_sb[i] = start_sb; + tiles->row_start_sb[i] = start_sb; start_sb += size_sb; } - cm->tile_rows = i; - cm->tile_row_start_sb[i] = sb_rows; + tiles->rows = i; + tiles->row_start_sb[i] = sb_rows; - cm->tile_height = size_sb << cm->seq_params.mib_size_log2; - cm->tile_height = AOMMIN(cm->tile_height, cm->mi_rows); + tiles->height = size_sb << seq_params->mib_size_log2; + tiles->height = AOMMIN(tiles->height, cm_mi_rows); } else { - cm->log2_tile_rows = tile_log2(1, cm->tile_rows); + tiles->log2_rows = tile_log2(1, tiles->rows); } } void av1_tile_set_row(TileInfo *tile, const AV1_COMMON *cm, int row) { - assert(row < cm->tile_rows); - int mi_row_start = cm->tile_row_start_sb[row] << cm->seq_params.mib_size_log2; - int mi_row_end = cm->tile_row_start_sb[row + 1] + assert(row < cm->tiles.rows); + int mi_row_start = cm->tiles.row_start_sb[row] + << cm->seq_params.mib_size_log2; + int mi_row_end = cm->tiles.row_start_sb[row + 1] << cm->seq_params.mib_size_log2; tile->tile_row = row; tile->mi_row_start = mi_row_start; - tile->mi_row_end = AOMMIN(mi_row_end, cm->mi_rows); + tile->mi_row_end = AOMMIN(mi_row_end, cm->mi_params.mi_rows); assert(tile->mi_row_end > tile->mi_row_start); } void av1_tile_set_col(TileInfo *tile, const AV1_COMMON *cm, int col) { - assert(col < cm->tile_cols); - int mi_col_start = cm->tile_col_start_sb[col] << cm->seq_params.mib_size_log2; - int mi_col_end = cm->tile_col_start_sb[col + 1] + assert(col < cm->tiles.cols); + int mi_col_start = cm->tiles.col_start_sb[col] + << cm->seq_params.mib_size_log2; + int mi_col_end = cm->tiles.col_start_sb[col + 1] << cm->seq_params.mib_size_log2; tile->tile_col = col; tile->mi_col_start = mi_col_start; - tile->mi_col_end = AOMMIN(mi_col_end, cm->mi_cols); + tile->mi_col_end = AOMMIN(mi_col_end, cm->mi_params.mi_cols); assert(tile->mi_col_end > tile->mi_col_start); } @@ -143,30 +167,6 @@ int 
av1_get_sb_cols_in_tile(AV1_COMMON *cm, TileInfo tile) { return sb_cols; } -int get_tile_size(int mi_frame_size, int log2_tile_num, int *ntiles) { - // Round the frame up to a whole number of max superblocks - mi_frame_size = ALIGN_POWER_OF_TWO(mi_frame_size, MAX_MIB_SIZE_LOG2); - - // Divide by the signalled number of tiles, rounding up to the multiple of - // the max superblock size. To do this, shift right (and round up) to get the - // tile size in max super-blocks and then shift left again to convert it to - // mi units. - const int shift = log2_tile_num + MAX_MIB_SIZE_LOG2; - const int max_sb_tile_size = - ALIGN_POWER_OF_TWO(mi_frame_size, shift) >> shift; - const int mi_tile_size = max_sb_tile_size << MAX_MIB_SIZE_LOG2; - - // The actual number of tiles is the ceiling of the frame size in mi units - // divided by mi_size. This is at most 1 << log2_tile_num but might be - // strictly less if max_sb_tile_size got rounded up significantly. - if (ntiles) { - *ntiles = (mi_frame_size + mi_tile_size - 1) / mi_tile_size; - assert(*ntiles <= (1 << log2_tile_num)); - } - - return mi_tile_size; -} - AV1PixelRect av1_get_tile_rect(const TileInfo *tile_info, const AV1_COMMON *cm, int is_uv) { AV1PixelRect r; @@ -205,3 +205,35 @@ AV1PixelRect av1_get_tile_rect(const TileInfo *tile_info, const AV1_COMMON *cm, return r; } + +void av1_get_uniform_tile_size(const AV1_COMMON *cm, int *w, int *h) { + const CommonTileParams *const tiles = &cm->tiles; + if (tiles->uniform_spacing) { + *w = tiles->width; + *h = tiles->height; + } else { + for (int i = 0; i < tiles->cols; ++i) { + const int tile_width_sb = + tiles->col_start_sb[i + 1] - tiles->col_start_sb[i]; + const int tile_w = tile_width_sb * cm->seq_params.mib_size; + assert(i == 0 || tile_w == *w); // ensure all tiles have same dimension + *w = tile_w; + } + + for (int i = 0; i < tiles->rows; ++i) { + const int tile_height_sb = + tiles->row_start_sb[i + 1] - tiles->row_start_sb[i]; + const int tile_h = tile_height_sb * 
cm->seq_params.mib_size; + assert(i == 0 || tile_h == *h); // ensure all tiles have same dimension + *h = tile_h; + } + } +} + +int av1_is_min_tile_width_satisfied(const AV1_COMMON *cm) { + // Disable check if there is a single tile col in the frame + if (cm->tiles.cols == 1) return 1; + + return ((cm->tiles.min_inner_width << MI_SIZE_LOG2) >= + (64 << av1_superres_scaled(cm))); +} diff --git a/media/libaom/src/av1/common/tile_common.h b/media/libaom/src/av1/common/tile_common.h index c03553dc6..ca7c5f496 100644 --- a/media/libaom/src/av1/common/tile_common.h +++ b/media/libaom/src/av1/common/tile_common.h @@ -19,13 +19,14 @@ extern "C" { #include "config/aom_config.h" struct AV1Common; +struct SequenceHeader; +struct CommonTileParams; #define DEFAULT_MAX_NUM_TG 1 typedef struct TileInfo { int mi_row_start, mi_row_end; int mi_col_start, mi_col_end; - int tg_horz_boundary; int tile_row; int tile_col; } TileInfo; @@ -37,12 +38,6 @@ void av1_tile_init(TileInfo *tile, const struct AV1Common *cm, int row, void av1_tile_set_row(TileInfo *tile, const struct AV1Common *cm, int row); void av1_tile_set_col(TileInfo *tile, const struct AV1Common *cm, int col); -void av1_get_tile_n_bits(int mi_cols, int *min_log2_tile_cols, - int *max_log2_tile_cols); - -// Calculate the correct tile size (width or height) for (1 << log2_tile_num) -// tiles horizontally or vertically in the frame. 
-int get_tile_size(int mi_frame_size, int log2_tile_num, int *ntiles); int av1_get_sb_rows_in_tile(struct AV1Common *cm, TileInfo tile); int av1_get_sb_cols_in_tile(struct AV1Common *cm, TileInfo tile); @@ -61,9 +56,17 @@ AV1PixelRect av1_get_tile_rect(const TileInfo *tile_info, #define MAX_TILE_WIDTH (4096) // Max Tile width in pixels #define MAX_TILE_AREA (4096 * 2304) // Maximum tile area in pixels +void av1_get_uniform_tile_size(const struct AV1Common *cm, int *w, int *h); void av1_get_tile_limits(struct AV1Common *const cm); -void av1_calculate_tile_cols(struct AV1Common *const cm); -void av1_calculate_tile_rows(struct AV1Common *const cm); +void av1_calculate_tile_cols(const struct SequenceHeader *const seq_params, + int cm_mi_rows, int cm_mi_cols, + struct CommonTileParams *const tiles); +void av1_calculate_tile_rows(const struct SequenceHeader *const seq_params, + int cm_mi_rows, + struct CommonTileParams *const tiles); + +// Checks if the minimum tile_width requirement is satisfied +int av1_is_min_tile_width_satisfied(const struct AV1Common *cm); #ifdef __cplusplus } // extern "C" diff --git a/media/libaom/src/av1/common/timing.c b/media/libaom/src/av1/common/timing.c index 49dbde78f..a959cdf76 100644 --- a/media/libaom/src/av1/common/timing.c +++ b/media/libaom/src/av1/common/timing.c @@ -15,22 +15,35 @@ * The tables are in Kbps instead of Mbps in the specification. * Note that depending on the profile, a multiplier is needed. */ +#define UNDEFINED_RATE \ + (1 << 21) // Placeholder rate for levels with undefined rate +#define INVALID_RATE \ + (0) // For invalid profile-level configuration, set rate to 0 /* Max Bitrates for levels of Main Tier in kbps. Bitrate in main_kbps [31] */ /* is a dummy value. The decoder model is not applicable for level 31. 
*/ static int32_t main_kbps[1 << LEVEL_BITS] = { - 1500, 3000, 0, 0, 6000, 10000, 0, 0, 12000, 20000, 0, - 0, 30000, 40000, 60000, 60000, 60000, 100000, 160000, 160000, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, (1 << 26) + 1500, 3000, UNDEFINED_RATE, UNDEFINED_RATE, + 6000, 10000, UNDEFINED_RATE, UNDEFINED_RATE, + 12000, 20000, UNDEFINED_RATE, UNDEFINED_RATE, + 30000, 40000, 60000, 60000, + 60000, 100000, 160000, 160000, + UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE, + UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE, + UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE }; /* Max Bitrates for levels of High Tier in kbps. Bitrate in high_kbps [31] */ /* is a dummy value. The decoder model is not applicable for level 31. */ static int32_t high_kbps[1 << LEVEL_BITS] = { - 0, 0, 0, 0, 0, 0, 0, 0, - 30000, 50000, 0, 0, 100000, 160000, 240000, 240000, - 240000, 480000, 800000, 800000, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, (1 << 26) + INVALID_RATE, INVALID_RATE, INVALID_RATE, INVALID_RATE, + INVALID_RATE, INVALID_RATE, INVALID_RATE, INVALID_RATE, + 30000, 50000, UNDEFINED_RATE, UNDEFINED_RATE, + 100000, 160000, 240000, 240000, + 240000, 480000, 800000, 800000, + UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE, + UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE, + UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE }; /* BitrateProfileFactor */ @@ -38,8 +51,8 @@ static int bitrate_profile_factor[1 << PROFILE_BITS] = { 1, 2, 3, 0, 0, 0, 0, 0 }; -int64_t max_level_bitrate(BITSTREAM_PROFILE seq_profile, int seq_level_idx, - int seq_tier) { +int64_t av1_max_level_bitrate(BITSTREAM_PROFILE seq_profile, int seq_level_idx, + int seq_tier) { int64_t bitrate; if (seq_tier) { @@ -51,13 +64,13 @@ int64_t max_level_bitrate(BITSTREAM_PROFILE seq_profile, int seq_level_idx, return bitrate * 1000; } -void set_aom_dec_model_info(aom_dec_model_info_t *decoder_model) { +void 
av1_set_aom_dec_model_info(aom_dec_model_info_t *decoder_model) { decoder_model->encoder_decoder_buffer_delay_length = 16; decoder_model->buffer_removal_time_length = 10; decoder_model->frame_presentation_time_length = 10; } -void set_dec_model_op_parameters(aom_dec_model_op_parameters_t *op_params) { +void av1_set_dec_model_op_parameters(aom_dec_model_op_parameters_t *op_params) { op_params->decoder_model_param_present_flag = 1; op_params->decoder_buffer_delay = 90000 >> 1; // 0.5 s op_params->encoder_buffer_delay = 90000 >> 1; // 0.5 s @@ -66,7 +79,7 @@ void set_dec_model_op_parameters(aom_dec_model_op_parameters_t *op_params) { op_params->initial_display_delay = 8; // 8 frames delay } -void set_resource_availability_parameters( +void av1_set_resource_availability_parameters( aom_dec_model_op_parameters_t *op_params) { op_params->decoder_model_param_present_flag = 0; op_params->decoder_buffer_delay = diff --git a/media/libaom/src/av1/common/timing.h b/media/libaom/src/av1/common/timing.h index 06939ae43..9192124f7 100644 --- a/media/libaom/src/av1/common/timing.h +++ b/media/libaom/src/av1/common/timing.h @@ -42,18 +42,14 @@ typedef struct aom_dec_model_op_parameters { int initial_display_delay; } aom_dec_model_op_parameters_t; -typedef struct aom_op_timing_info_t { - uint32_t buffer_removal_time; -} aom_op_timing_info_t; +void av1_set_aom_dec_model_info(aom_dec_model_info_t *decoder_model); -void set_aom_dec_model_info(aom_dec_model_info_t *decoder_model); +void av1_set_dec_model_op_parameters(aom_dec_model_op_parameters_t *op_params); -void set_dec_model_op_parameters(aom_dec_model_op_parameters_t *op_params); - -void set_resource_availability_parameters( +void av1_set_resource_availability_parameters( aom_dec_model_op_parameters_t *op_params); -int64_t max_level_bitrate(BITSTREAM_PROFILE seq_profile, int seq_level_idx, - int seq_tier); +int64_t av1_max_level_bitrate(BITSTREAM_PROFILE seq_profile, int seq_level_idx, + int seq_tier); #endif // 
AOM_AV1_COMMON_TIMING_H_ diff --git a/media/libaom/src/av1/common/token_cdfs.h b/media/libaom/src/av1/common/token_cdfs.h index 53e956450..f1edda58d 100644 --- a/media/libaom/src/av1/common/token_cdfs.h +++ b/media/libaom/src/av1/common/token_cdfs.h @@ -1707,1687 +1707,1687 @@ static const aom_cdf_prob av1_default_coeff_lps_multi_cdfs static const aom_cdf_prob av1_default_coeff_base_multi_cdfs [TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES][SIG_COEF_CONTEXTS] - [CDF_SIZE(NUM_BASE_LEVELS + 2)] = - { { { { { AOM_CDF4(4034, 8930, 12727) }, - { AOM_CDF4(18082, 29741, 31877) }, - { AOM_CDF4(12596, 26124, 30493) }, - { AOM_CDF4(9446, 21118, 27005) }, - { AOM_CDF4(6308, 15141, 21279) }, - { AOM_CDF4(2463, 6357, 9783) }, - { AOM_CDF4(20667, 30546, 31929) }, - { AOM_CDF4(13043, 26123, 30134) }, - { AOM_CDF4(8151, 18757, 24778) }, - { AOM_CDF4(5255, 12839, 18632) }, - { AOM_CDF4(2820, 7206, 11161) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(15736, 27553, 30604) }, - { AOM_CDF4(11210, 23794, 28787) }, - { AOM_CDF4(5947, 13874, 19701) }, - { AOM_CDF4(4215, 9323, 13891) }, - { AOM_CDF4(2833, 6462, 10059) }, - { AOM_CDF4(19605, 30393, 31582) }, - { AOM_CDF4(13523, 26252, 30248) }, - { AOM_CDF4(8446, 18622, 24512) }, - { AOM_CDF4(3818, 10343, 15974) }, - { AOM_CDF4(1481, 4117, 6796) }, - { AOM_CDF4(22649, 31302, 32190) }, - { AOM_CDF4(14829, 27127, 30449) }, - { AOM_CDF4(8313, 17702, 23304) }, - { AOM_CDF4(3022, 8301, 12786) }, - { AOM_CDF4(1536, 4412, 7184) }, - { AOM_CDF4(22354, 29774, 31372) }, - { AOM_CDF4(14723, 25472, 29214) }, - { AOM_CDF4(6673, 13745, 18662) }, - { AOM_CDF4(2068, 5766, 9322) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 
16384, 24576) } }, - { { AOM_CDF4(6302, 16444, 21761) }, - { AOM_CDF4(23040, 31538, 32475) }, - { AOM_CDF4(15196, 28452, 31496) }, - { AOM_CDF4(10020, 22946, 28514) }, - { AOM_CDF4(6533, 16862, 23501) }, - { AOM_CDF4(3538, 9816, 15076) }, - { AOM_CDF4(24444, 31875, 32525) }, - { AOM_CDF4(15881, 28924, 31635) }, - { AOM_CDF4(9922, 22873, 28466) }, - { AOM_CDF4(6527, 16966, 23691) }, - { AOM_CDF4(4114, 11303, 17220) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(20201, 30770, 32209) }, - { AOM_CDF4(14754, 28071, 31258) }, - { AOM_CDF4(8378, 20186, 26517) }, - { AOM_CDF4(5916, 15299, 21978) }, - { AOM_CDF4(4268, 11583, 17901) }, - { AOM_CDF4(24361, 32025, 32581) }, - { AOM_CDF4(18673, 30105, 31943) }, - { AOM_CDF4(10196, 22244, 27576) }, - { AOM_CDF4(5495, 14349, 20417) }, - { AOM_CDF4(2676, 7415, 11498) }, - { AOM_CDF4(24678, 31958, 32585) }, - { AOM_CDF4(18629, 29906, 31831) }, - { AOM_CDF4(9364, 20724, 26315) }, - { AOM_CDF4(4641, 12318, 18094) }, - { AOM_CDF4(2758, 7387, 11579) }, - { AOM_CDF4(25433, 31842, 32469) }, - { AOM_CDF4(18795, 29289, 31411) }, - { AOM_CDF4(7644, 17584, 23592) }, - { AOM_CDF4(3408, 9014, 15047) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) } } }, - { { { AOM_CDF4(4536, 10072, 14001) }, - { AOM_CDF4(25459, 31416, 32206) }, - { AOM_CDF4(16605, 28048, 30818) }, - { AOM_CDF4(11008, 22857, 27719) }, - { AOM_CDF4(6915, 16268, 22315) }, - { AOM_CDF4(2625, 6812, 10537) }, - { AOM_CDF4(24257, 31788, 32499) }, - { AOM_CDF4(16880, 29454, 31879) }, - { AOM_CDF4(11958, 25054, 29778) }, - { AOM_CDF4(7916, 18718, 25084) }, - { AOM_CDF4(3383, 8777, 13446) }, - { AOM_CDF4(22720, 31603, 32393) }, - { 
AOM_CDF4(14960, 28125, 31335) }, - { AOM_CDF4(9731, 22210, 27928) }, - { AOM_CDF4(6304, 15832, 22277) }, - { AOM_CDF4(2910, 7818, 12166) }, - { AOM_CDF4(20375, 30627, 32131) }, - { AOM_CDF4(13904, 27284, 30887) }, - { AOM_CDF4(9368, 21558, 27144) }, - { AOM_CDF4(5937, 14966, 21119) }, - { AOM_CDF4(2667, 7225, 11319) }, - { AOM_CDF4(23970, 31470, 32378) }, - { AOM_CDF4(17173, 29734, 32018) }, - { AOM_CDF4(12795, 25441, 29965) }, - { AOM_CDF4(8981, 19680, 25893) }, - { AOM_CDF4(4728, 11372, 16902) }, - { AOM_CDF4(24287, 31797, 32439) }, - { AOM_CDF4(16703, 29145, 31696) }, - { AOM_CDF4(10833, 23554, 28725) }, - { AOM_CDF4(6468, 16566, 23057) }, - { AOM_CDF4(2415, 6562, 10278) }, - { AOM_CDF4(26610, 32395, 32659) }, - { AOM_CDF4(18590, 30498, 32117) }, - { AOM_CDF4(12420, 25756, 29950) }, - { AOM_CDF4(7639, 18746, 24710) }, - { AOM_CDF4(3001, 8086, 12347) }, - { AOM_CDF4(25076, 32064, 32580) }, - { AOM_CDF4(17946, 30128, 32028) }, - { AOM_CDF4(12024, 24985, 29378) }, - { AOM_CDF4(7517, 18390, 24304) }, - { AOM_CDF4(3243, 8781, 13331) }, - { AOM_CDF4(8192, 16384, 24576) } }, - { { AOM_CDF4(6037, 16771, 21957) }, - { AOM_CDF4(24774, 31704, 32426) }, - { AOM_CDF4(16830, 28589, 31056) }, - { AOM_CDF4(10602, 22828, 27760) }, - { AOM_CDF4(6733, 16829, 23071) }, - { AOM_CDF4(3250, 8914, 13556) }, - { AOM_CDF4(25582, 32220, 32668) }, - { AOM_CDF4(18659, 30342, 32223) }, - { AOM_CDF4(12546, 26149, 30515) }, - { AOM_CDF4(8420, 20451, 26801) }, - { AOM_CDF4(4636, 12420, 18344) }, - { AOM_CDF4(27581, 32362, 32639) }, - { AOM_CDF4(18987, 30083, 31978) }, - { AOM_CDF4(11327, 24248, 29084) }, - { AOM_CDF4(7264, 17719, 24120) }, - { AOM_CDF4(3995, 10768, 16169) }, - { AOM_CDF4(25893, 31831, 32487) }, - { AOM_CDF4(16577, 28587, 31379) }, - { AOM_CDF4(10189, 22748, 28182) }, - { AOM_CDF4(6832, 17094, 23556) }, - { AOM_CDF4(3708, 10110, 15334) }, - { AOM_CDF4(25904, 32282, 32656) }, - { AOM_CDF4(19721, 30792, 32276) }, - { AOM_CDF4(12819, 26243, 30411) }, - { AOM_CDF4(8572, 20614, 
26891) }, - { AOM_CDF4(5364, 14059, 20467) }, - { AOM_CDF4(26580, 32438, 32677) }, - { AOM_CDF4(20852, 31225, 32340) }, - { AOM_CDF4(12435, 25700, 29967) }, - { AOM_CDF4(8691, 20825, 26976) }, - { AOM_CDF4(4446, 12209, 17269) }, - { AOM_CDF4(27350, 32429, 32696) }, - { AOM_CDF4(21372, 30977, 32272) }, - { AOM_CDF4(12673, 25270, 29853) }, - { AOM_CDF4(9208, 20925, 26640) }, - { AOM_CDF4(5018, 13351, 18732) }, - { AOM_CDF4(27351, 32479, 32713) }, - { AOM_CDF4(21398, 31209, 32387) }, - { AOM_CDF4(12162, 25047, 29842) }, - { AOM_CDF4(7896, 18691, 25319) }, - { AOM_CDF4(4670, 12882, 18881) }, - { AOM_CDF4(8192, 16384, 24576) } } }, - { { { AOM_CDF4(5487, 10460, 13708) }, - { AOM_CDF4(21597, 28303, 30674) }, - { AOM_CDF4(11037, 21953, 26476) }, - { AOM_CDF4(8147, 17962, 22952) }, - { AOM_CDF4(5242, 13061, 18532) }, - { AOM_CDF4(1889, 5208, 8182) }, - { AOM_CDF4(26774, 32133, 32590) }, - { AOM_CDF4(17844, 29564, 31767) }, - { AOM_CDF4(11690, 24438, 29171) }, - { AOM_CDF4(7542, 18215, 24459) }, - { AOM_CDF4(2993, 8050, 12319) }, - { AOM_CDF4(28023, 32328, 32591) }, - { AOM_CDF4(18651, 30126, 31954) }, - { AOM_CDF4(12164, 25146, 29589) }, - { AOM_CDF4(7762, 18530, 24771) }, - { AOM_CDF4(3492, 9183, 13920) }, - { AOM_CDF4(27591, 32008, 32491) }, - { AOM_CDF4(17149, 28853, 31510) }, - { AOM_CDF4(11485, 24003, 28860) }, - { AOM_CDF4(7697, 18086, 24210) }, - { AOM_CDF4(3075, 7999, 12218) }, - { AOM_CDF4(28268, 32482, 32654) }, - { AOM_CDF4(19631, 31051, 32404) }, - { AOM_CDF4(13860, 27260, 31020) }, - { AOM_CDF4(9605, 21613, 27594) }, - { AOM_CDF4(4876, 12162, 17908) }, - { AOM_CDF4(27248, 32316, 32576) }, - { AOM_CDF4(18955, 30457, 32075) }, - { AOM_CDF4(11824, 23997, 28795) }, - { AOM_CDF4(7346, 18196, 24647) }, - { AOM_CDF4(3403, 9247, 14111) }, - { AOM_CDF4(29711, 32655, 32735) }, - { AOM_CDF4(21169, 31394, 32417) }, - { AOM_CDF4(13487, 27198, 30957) }, - { AOM_CDF4(8828, 21683, 27614) }, - { AOM_CDF4(4270, 11451, 17038) }, - { AOM_CDF4(28708, 32578, 32731) }, - { 
AOM_CDF4(20120, 31241, 32482) }, - { AOM_CDF4(13692, 27550, 31321) }, - { AOM_CDF4(9418, 22514, 28439) }, - { AOM_CDF4(4999, 13283, 19462) }, - { AOM_CDF4(8192, 16384, 24576) } }, - { { AOM_CDF4(5673, 14302, 19711) }, - { AOM_CDF4(26251, 30701, 31834) }, - { AOM_CDF4(12782, 23783, 27803) }, - { AOM_CDF4(9127, 20657, 25808) }, - { AOM_CDF4(6368, 16208, 21462) }, - { AOM_CDF4(2465, 7177, 10822) }, - { AOM_CDF4(29961, 32563, 32719) }, - { AOM_CDF4(18318, 29891, 31949) }, - { AOM_CDF4(11361, 24514, 29357) }, - { AOM_CDF4(7900, 19603, 25607) }, - { AOM_CDF4(4002, 10590, 15546) }, - { AOM_CDF4(29637, 32310, 32595) }, - { AOM_CDF4(18296, 29913, 31809) }, - { AOM_CDF4(10144, 21515, 26871) }, - { AOM_CDF4(5358, 14322, 20394) }, - { AOM_CDF4(3067, 8362, 13346) }, - { AOM_CDF4(28652, 32470, 32676) }, - { AOM_CDF4(17538, 30771, 32209) }, - { AOM_CDF4(13924, 26882, 30494) }, - { AOM_CDF4(10496, 22837, 27869) }, - { AOM_CDF4(7236, 16396, 21621) }, - { AOM_CDF4(30743, 32687, 32746) }, - { AOM_CDF4(23006, 31676, 32489) }, - { AOM_CDF4(14494, 27828, 31120) }, - { AOM_CDF4(10174, 22801, 28352) }, - { AOM_CDF4(6242, 15281, 21043) }, - { AOM_CDF4(25817, 32243, 32720) }, - { AOM_CDF4(18618, 31367, 32325) }, - { AOM_CDF4(13997, 28318, 31878) }, - { AOM_CDF4(12255, 26534, 31383) }, - { AOM_CDF4(9561, 21588, 28450) }, - { AOM_CDF4(28188, 32635, 32724) }, - { AOM_CDF4(22060, 32365, 32728) }, - { AOM_CDF4(18102, 30690, 32528) }, - { AOM_CDF4(14196, 28864, 31999) }, - { AOM_CDF4(12262, 25792, 30865) }, - { AOM_CDF4(24176, 32109, 32628) }, - { AOM_CDF4(18280, 29681, 31963) }, - { AOM_CDF4(10205, 23703, 29664) }, - { AOM_CDF4(7889, 20025, 27676) }, - { AOM_CDF4(6060, 16743, 23970) }, - { AOM_CDF4(8192, 16384, 24576) } } }, - { { { AOM_CDF4(5141, 7096, 8260) }, - { AOM_CDF4(27186, 29022, 29789) }, - { AOM_CDF4(6668, 12568, 15682) }, - { AOM_CDF4(2172, 6181, 8638) }, - { AOM_CDF4(1126, 3379, 4531) }, - { AOM_CDF4(443, 1361, 2254) }, - { AOM_CDF4(26083, 31153, 32436) }, - { AOM_CDF4(13486, 24603, 
28483) }, - { AOM_CDF4(6508, 14840, 19910) }, - { AOM_CDF4(3386, 8800, 13286) }, - { AOM_CDF4(1530, 4322, 7054) }, - { AOM_CDF4(29639, 32080, 32548) }, - { AOM_CDF4(15897, 27552, 30290) }, - { AOM_CDF4(8588, 20047, 25383) }, - { AOM_CDF4(4889, 13339, 19269) }, - { AOM_CDF4(2240, 6871, 10498) }, - { AOM_CDF4(28165, 32197, 32517) }, - { AOM_CDF4(20735, 30427, 31568) }, - { AOM_CDF4(14325, 24671, 27692) }, - { AOM_CDF4(5119, 12554, 17805) }, - { AOM_CDF4(1810, 5441, 8261) }, - { AOM_CDF4(31212, 32724, 32748) }, - { AOM_CDF4(23352, 31766, 32545) }, - { AOM_CDF4(14669, 27570, 31059) }, - { AOM_CDF4(8492, 20894, 27272) }, - { AOM_CDF4(3644, 10194, 15204) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) } }, - { { AOM_CDF4(2461, 7013, 9371) }, - { AOM_CDF4(24749, 29600, 30986) }, - { AOM_CDF4(9466, 19037, 22417) }, - { AOM_CDF4(3584, 9280, 14400) }, - { AOM_CDF4(1505, 3929, 5433) }, - { AOM_CDF4(677, 1500, 2736) }, - { AOM_CDF4(23987, 30702, 32117) }, - { AOM_CDF4(13554, 24571, 29263) }, - { AOM_CDF4(6211, 14556, 21155) }, - { AOM_CDF4(3135, 10972, 15625) }, - { AOM_CDF4(2435, 7127, 11427) }, - { AOM_CDF4(31300, 32532, 32550) }, - { AOM_CDF4(14757, 30365, 31954) }, - { AOM_CDF4(4405, 11612, 18553) }, - { AOM_CDF4(580, 4132, 7322) }, - { AOM_CDF4(1695, 10169, 14124) }, - { AOM_CDF4(30008, 32282, 32591) }, - { AOM_CDF4(19244, 30108, 31748) }, - { AOM_CDF4(11180, 24158, 29555) }, - { AOM_CDF4(5650, 14972, 19209) }, - { AOM_CDF4(2114, 5109, 8456) }, - { 
AOM_CDF4(31856, 32716, 32748) }, - { AOM_CDF4(23012, 31664, 32572) }, - { AOM_CDF4(13694, 26656, 30636) }, - { AOM_CDF4(8142, 19508, 26093) }, - { AOM_CDF4(4253, 10955, 16724) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) } } }, - { { { AOM_CDF4(601, 983, 1311) }, - { AOM_CDF4(18725, 23406, 28087) }, - { AOM_CDF4(5461, 8192, 10923) }, - { AOM_CDF4(3781, 15124, 21425) }, - { AOM_CDF4(2587, 7761, 12072) }, - { AOM_CDF4(106, 458, 810) }, - { AOM_CDF4(22282, 29710, 31894) }, - { AOM_CDF4(8508, 20926, 25984) }, - { AOM_CDF4(3726, 12713, 18083) }, - { AOM_CDF4(1620, 7112, 10893) }, - { AOM_CDF4(729, 2236, 3495) }, - { AOM_CDF4(30163, 32474, 32684) }, - { AOM_CDF4(18304, 30464, 32000) }, - { AOM_CDF4(11443, 26526, 29647) }, - { AOM_CDF4(6007, 15292, 21299) }, - { AOM_CDF4(2234, 6703, 8937) }, - { AOM_CDF4(30954, 32177, 32571) }, - { AOM_CDF4(17363, 29562, 31076) }, - { AOM_CDF4(9686, 22464, 27410) }, - { AOM_CDF4(8192, 16384, 21390) }, - { AOM_CDF4(1755, 8046, 11264) }, - { AOM_CDF4(31168, 32734, 32748) }, - { AOM_CDF4(22486, 31441, 32471) }, - { AOM_CDF4(12833, 25627, 29738) }, - { AOM_CDF4(6980, 17379, 23122) }, - { AOM_CDF4(3111, 8887, 13479) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 
16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) } }, - { { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) } } } }, - { { { { AOM_CDF4(6041, 11854, 15927) }, - { AOM_CDF4(20326, 30905, 32251) }, - { AOM_CDF4(14164, 26831, 30725) }, - { AOM_CDF4(9760, 20647, 26585) }, - { AOM_CDF4(6416, 14953, 21219) }, - { AOM_CDF4(2966, 
7151, 10891) }, - { AOM_CDF4(23567, 31374, 32254) }, - { AOM_CDF4(14978, 27416, 30946) }, - { AOM_CDF4(9434, 20225, 26254) }, - { AOM_CDF4(6658, 14558, 20535) }, - { AOM_CDF4(3916, 8677, 12989) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(18088, 29545, 31587) }, - { AOM_CDF4(13062, 25843, 30073) }, - { AOM_CDF4(8940, 16827, 22251) }, - { AOM_CDF4(7654, 13220, 17973) }, - { AOM_CDF4(5733, 10316, 14456) }, - { AOM_CDF4(22879, 31388, 32114) }, - { AOM_CDF4(15215, 27993, 30955) }, - { AOM_CDF4(9397, 19445, 24978) }, - { AOM_CDF4(3442, 9813, 15344) }, - { AOM_CDF4(1368, 3936, 6532) }, - { AOM_CDF4(25494, 32033, 32406) }, - { AOM_CDF4(16772, 27963, 30718) }, - { AOM_CDF4(9419, 18165, 23260) }, - { AOM_CDF4(2677, 7501, 11797) }, - { AOM_CDF4(1516, 4344, 7170) }, - { AOM_CDF4(26556, 31454, 32101) }, - { AOM_CDF4(17128, 27035, 30108) }, - { AOM_CDF4(8324, 15344, 20249) }, - { AOM_CDF4(1903, 5696, 9469) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) } }, - { { AOM_CDF4(8455, 19003, 24368) }, - { AOM_CDF4(23563, 32021, 32604) }, - { AOM_CDF4(16237, 29446, 31935) }, - { AOM_CDF4(10724, 23999, 29358) }, - { AOM_CDF4(6725, 17528, 24416) }, - { AOM_CDF4(3927, 10927, 16825) }, - { AOM_CDF4(26313, 32288, 32634) }, - { AOM_CDF4(17430, 30095, 32095) }, - { AOM_CDF4(11116, 24606, 29679) }, - { AOM_CDF4(7195, 18384, 25269) }, - { AOM_CDF4(4726, 12852, 19315) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 
24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(22822, 31648, 32483) }, - { AOM_CDF4(16724, 29633, 31929) }, - { AOM_CDF4(10261, 23033, 28725) }, - { AOM_CDF4(7029, 17840, 24528) }, - { AOM_CDF4(4867, 13886, 21502) }, - { AOM_CDF4(25298, 31892, 32491) }, - { AOM_CDF4(17809, 29330, 31512) }, - { AOM_CDF4(9668, 21329, 26579) }, - { AOM_CDF4(4774, 12956, 18976) }, - { AOM_CDF4(2322, 7030, 11540) }, - { AOM_CDF4(25472, 31920, 32543) }, - { AOM_CDF4(17957, 29387, 31632) }, - { AOM_CDF4(9196, 20593, 26400) }, - { AOM_CDF4(4680, 12705, 19202) }, - { AOM_CDF4(2917, 8456, 13436) }, - { AOM_CDF4(26471, 32059, 32574) }, - { AOM_CDF4(18458, 29783, 31909) }, - { AOM_CDF4(8400, 19464, 25956) }, - { AOM_CDF4(3812, 10973, 17206) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) } } }, - { { { AOM_CDF4(6779, 13743, 17678) }, - { AOM_CDF4(24806, 31797, 32457) }, - { AOM_CDF4(17616, 29047, 31372) }, - { AOM_CDF4(11063, 23175, 28003) }, - { AOM_CDF4(6521, 16110, 22324) }, - { AOM_CDF4(2764, 7504, 11654) }, - { AOM_CDF4(25266, 32367, 32637) }, - { AOM_CDF4(19054, 30553, 32175) }, - { AOM_CDF4(12139, 25212, 29807) }, - { AOM_CDF4(7311, 18162, 24704) }, - { AOM_CDF4(3397, 9164, 14074) }, - { AOM_CDF4(25988, 32208, 32522) }, - { AOM_CDF4(16253, 28912, 31526) }, - { AOM_CDF4(9151, 21387, 27372) }, - { AOM_CDF4(5688, 14915, 21496) }, - { AOM_CDF4(2717, 7627, 12004) }, - { AOM_CDF4(23144, 31855, 32443) }, - { AOM_CDF4(16070, 28491, 31325) }, - { AOM_CDF4(8702, 20467, 26517) }, - { AOM_CDF4(5243, 13956, 20367) }, - { AOM_CDF4(2621, 7335, 11567) }, - { AOM_CDF4(26636, 32340, 32630) }, - { AOM_CDF4(19990, 31050, 32341) }, - { AOM_CDF4(13243, 26105, 30315) }, - { AOM_CDF4(8588, 19521, 25918) }, - { AOM_CDF4(4717, 11585, 17304) }, - { AOM_CDF4(25844, 32292, 32582) }, - { AOM_CDF4(19090, 30635, 32097) }, - { AOM_CDF4(11963, 24546, 28939) }, - { AOM_CDF4(6218, 16087, 22354) }, - { AOM_CDF4(2340, 6608, 10426) }, - { AOM_CDF4(28046, 
32576, 32694) }, - { AOM_CDF4(21178, 31313, 32296) }, - { AOM_CDF4(13486, 26184, 29870) }, - { AOM_CDF4(7149, 17871, 23723) }, - { AOM_CDF4(2833, 7958, 12259) }, - { AOM_CDF4(27710, 32528, 32686) }, - { AOM_CDF4(20674, 31076, 32268) }, - { AOM_CDF4(12413, 24955, 29243) }, - { AOM_CDF4(6676, 16927, 23097) }, - { AOM_CDF4(2966, 8333, 12919) }, - { AOM_CDF4(8192, 16384, 24576) } }, - { { AOM_CDF4(8639, 19339, 24429) }, - { AOM_CDF4(24404, 31837, 32525) }, - { AOM_CDF4(16997, 29425, 31784) }, - { AOM_CDF4(11253, 24234, 29149) }, - { AOM_CDF4(6751, 17394, 24028) }, - { AOM_CDF4(3490, 9830, 15191) }, - { AOM_CDF4(26283, 32471, 32714) }, - { AOM_CDF4(19599, 31168, 32442) }, - { AOM_CDF4(13146, 26954, 30893) }, - { AOM_CDF4(8214, 20588, 26890) }, - { AOM_CDF4(4699, 13081, 19300) }, - { AOM_CDF4(28212, 32458, 32669) }, - { AOM_CDF4(18594, 30316, 32100) }, - { AOM_CDF4(11219, 24408, 29234) }, - { AOM_CDF4(6865, 17656, 24149) }, - { AOM_CDF4(3678, 10362, 16006) }, - { AOM_CDF4(25825, 32136, 32616) }, - { AOM_CDF4(17313, 29853, 32021) }, - { AOM_CDF4(11197, 24471, 29472) }, - { AOM_CDF4(6947, 17781, 24405) }, - { AOM_CDF4(3768, 10660, 16261) }, - { AOM_CDF4(27352, 32500, 32706) }, - { AOM_CDF4(20850, 31468, 32469) }, - { AOM_CDF4(14021, 27707, 31133) }, - { AOM_CDF4(8964, 21748, 27838) }, - { AOM_CDF4(5437, 14665, 21187) }, - { AOM_CDF4(26304, 32492, 32698) }, - { AOM_CDF4(20409, 31380, 32385) }, - { AOM_CDF4(13682, 27222, 30632) }, - { AOM_CDF4(8974, 21236, 26685) }, - { AOM_CDF4(4234, 11665, 16934) }, - { AOM_CDF4(26273, 32357, 32711) }, - { AOM_CDF4(20672, 31242, 32441) }, - { AOM_CDF4(14172, 27254, 30902) }, - { AOM_CDF4(9870, 21898, 27275) }, - { AOM_CDF4(5164, 13506, 19270) }, - { AOM_CDF4(26725, 32459, 32728) }, - { AOM_CDF4(20991, 31442, 32527) }, - { AOM_CDF4(13071, 26434, 30811) }, - { AOM_CDF4(8184, 20090, 26742) }, - { AOM_CDF4(4803, 13255, 19895) }, - { AOM_CDF4(8192, 16384, 24576) } } }, - { { { AOM_CDF4(7555, 14942, 18501) }, - { AOM_CDF4(24410, 31178, 32287) }, 
- { AOM_CDF4(14394, 26738, 30253) }, - { AOM_CDF4(8413, 19554, 25195) }, - { AOM_CDF4(4766, 12924, 18785) }, - { AOM_CDF4(2029, 5806, 9207) }, - { AOM_CDF4(26776, 32364, 32663) }, - { AOM_CDF4(18732, 29967, 31931) }, - { AOM_CDF4(11005, 23786, 28852) }, - { AOM_CDF4(6466, 16909, 23510) }, - { AOM_CDF4(3044, 8638, 13419) }, - { AOM_CDF4(29208, 32582, 32704) }, - { AOM_CDF4(20068, 30857, 32208) }, - { AOM_CDF4(12003, 25085, 29595) }, - { AOM_CDF4(6947, 17750, 24189) }, - { AOM_CDF4(3245, 9103, 14007) }, - { AOM_CDF4(27359, 32465, 32669) }, - { AOM_CDF4(19421, 30614, 32174) }, - { AOM_CDF4(11915, 25010, 29579) }, - { AOM_CDF4(6950, 17676, 24074) }, - { AOM_CDF4(3007, 8473, 13096) }, - { AOM_CDF4(29002, 32676, 32735) }, - { AOM_CDF4(22102, 31849, 32576) }, - { AOM_CDF4(14408, 28009, 31405) }, - { AOM_CDF4(9027, 21679, 27931) }, - { AOM_CDF4(4694, 12678, 18748) }, - { AOM_CDF4(28216, 32528, 32682) }, - { AOM_CDF4(20849, 31264, 32318) }, - { AOM_CDF4(12756, 25815, 29751) }, - { AOM_CDF4(7565, 18801, 24923) }, - { AOM_CDF4(3509, 9533, 14477) }, - { AOM_CDF4(30133, 32687, 32739) }, - { AOM_CDF4(23063, 31910, 32515) }, - { AOM_CDF4(14588, 28051, 31132) }, - { AOM_CDF4(9085, 21649, 27457) }, - { AOM_CDF4(4261, 11654, 17264) }, - { AOM_CDF4(29518, 32691, 32748) }, - { AOM_CDF4(22451, 31959, 32613) }, - { AOM_CDF4(14864, 28722, 31700) }, - { AOM_CDF4(9695, 22964, 28716) }, - { AOM_CDF4(4932, 13358, 19502) }, - { AOM_CDF4(8192, 16384, 24576) } }, - { { AOM_CDF4(6465, 16958, 21688) }, - { AOM_CDF4(25199, 31514, 32360) }, - { AOM_CDF4(14774, 27149, 30607) }, - { AOM_CDF4(9257, 21438, 26972) }, - { AOM_CDF4(5723, 15183, 21882) }, - { AOM_CDF4(3150, 8879, 13731) }, - { AOM_CDF4(26989, 32262, 32682) }, - { AOM_CDF4(17396, 29937, 32085) }, - { AOM_CDF4(11387, 24901, 29784) }, - { AOM_CDF4(7289, 18821, 25548) }, - { AOM_CDF4(3734, 10577, 16086) }, - { AOM_CDF4(29728, 32501, 32695) }, - { AOM_CDF4(17431, 29701, 31903) }, - { AOM_CDF4(9921, 22826, 28300) }, - { AOM_CDF4(5896, 15434, 
22068) }, - { AOM_CDF4(3430, 9646, 14757) }, - { AOM_CDF4(28614, 32511, 32705) }, - { AOM_CDF4(19364, 30638, 32263) }, - { AOM_CDF4(13129, 26254, 30402) }, - { AOM_CDF4(8754, 20484, 26440) }, - { AOM_CDF4(4378, 11607, 17110) }, - { AOM_CDF4(30292, 32671, 32744) }, - { AOM_CDF4(21780, 31603, 32501) }, - { AOM_CDF4(14314, 27829, 31291) }, - { AOM_CDF4(9611, 22327, 28263) }, - { AOM_CDF4(4890, 13087, 19065) }, - { AOM_CDF4(25862, 32567, 32733) }, - { AOM_CDF4(20794, 32050, 32567) }, - { AOM_CDF4(17243, 30625, 32254) }, - { AOM_CDF4(13283, 27628, 31474) }, - { AOM_CDF4(9669, 22532, 28918) }, - { AOM_CDF4(27435, 32697, 32748) }, - { AOM_CDF4(24922, 32390, 32714) }, - { AOM_CDF4(21449, 31504, 32536) }, - { AOM_CDF4(16392, 29729, 31832) }, - { AOM_CDF4(11692, 24884, 29076) }, - { AOM_CDF4(24193, 32290, 32735) }, - { AOM_CDF4(18909, 31104, 32563) }, - { AOM_CDF4(12236, 26841, 31403) }, - { AOM_CDF4(8171, 21840, 29082) }, - { AOM_CDF4(7224, 17280, 25275) }, - { AOM_CDF4(8192, 16384, 24576) } } }, - { { { AOM_CDF4(3078, 6839, 9890) }, - { AOM_CDF4(13837, 20450, 24479) }, - { AOM_CDF4(5914, 14222, 19328) }, - { AOM_CDF4(3866, 10267, 14762) }, - { AOM_CDF4(2612, 7208, 11042) }, - { AOM_CDF4(1067, 2991, 4776) }, - { AOM_CDF4(25817, 31646, 32529) }, - { AOM_CDF4(13708, 26338, 30385) }, - { AOM_CDF4(7328, 18585, 24870) }, - { AOM_CDF4(4691, 13080, 19276) }, - { AOM_CDF4(1825, 5253, 8352) }, - { AOM_CDF4(29386, 32315, 32624) }, - { AOM_CDF4(17160, 29001, 31360) }, - { AOM_CDF4(9602, 21862, 27396) }, - { AOM_CDF4(5915, 15772, 22148) }, - { AOM_CDF4(2786, 7779, 12047) }, - { AOM_CDF4(29246, 32450, 32663) }, - { AOM_CDF4(18696, 29929, 31818) }, - { AOM_CDF4(10510, 23369, 28560) }, - { AOM_CDF4(6229, 16499, 23125) }, - { AOM_CDF4(2608, 7448, 11705) }, - { AOM_CDF4(30753, 32710, 32748) }, - { AOM_CDF4(21638, 31487, 32503) }, - { AOM_CDF4(12937, 26854, 30870) }, - { AOM_CDF4(8182, 20596, 26970) }, - { AOM_CDF4(3637, 10269, 15497) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 
16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) } }, - { { AOM_CDF4(5244, 12150, 16906) }, - { AOM_CDF4(20486, 26858, 29701) }, - { AOM_CDF4(7756, 18317, 23735) }, - { AOM_CDF4(3452, 9256, 13146) }, - { AOM_CDF4(2020, 5206, 8229) }, - { AOM_CDF4(1801, 4993, 7903) }, - { AOM_CDF4(27051, 31858, 32531) }, - { AOM_CDF4(15988, 27531, 30619) }, - { AOM_CDF4(9188, 21484, 26719) }, - { AOM_CDF4(6273, 17186, 23800) }, - { AOM_CDF4(3108, 9355, 14764) }, - { AOM_CDF4(31076, 32520, 32680) }, - { AOM_CDF4(18119, 30037, 31850) }, - { AOM_CDF4(10244, 22969, 27472) }, - { AOM_CDF4(4692, 14077, 19273) }, - { AOM_CDF4(3694, 11677, 17556) }, - { AOM_CDF4(30060, 32581, 32720) }, - { AOM_CDF4(21011, 30775, 32120) }, - { AOM_CDF4(11931, 24820, 29289) }, - { AOM_CDF4(7119, 17662, 24356) }, - { AOM_CDF4(3833, 10706, 16304) }, - { AOM_CDF4(31954, 32731, 32748) }, - { AOM_CDF4(23913, 31724, 32489) }, - { AOM_CDF4(15520, 28060, 31286) }, - { AOM_CDF4(11517, 23008, 28571) }, - { AOM_CDF4(6193, 14508, 20629) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 
24576) }, - { AOM_CDF4(8192, 16384, 24576) } } }, - { { { AOM_CDF4(1035, 2807, 4156) }, - { AOM_CDF4(13162, 18138, 20939) }, - { AOM_CDF4(2696, 6633, 8755) }, - { AOM_CDF4(1373, 4161, 6853) }, - { AOM_CDF4(1099, 2746, 4716) }, - { AOM_CDF4(340, 1021, 1599) }, - { AOM_CDF4(22826, 30419, 32135) }, - { AOM_CDF4(10395, 21762, 26942) }, - { AOM_CDF4(4726, 12407, 17361) }, - { AOM_CDF4(2447, 7080, 10593) }, - { AOM_CDF4(1227, 3717, 6011) }, - { AOM_CDF4(28156, 31424, 31934) }, - { AOM_CDF4(16915, 27754, 30373) }, - { AOM_CDF4(9148, 20990, 26431) }, - { AOM_CDF4(5950, 15515, 21148) }, - { AOM_CDF4(2492, 7327, 11526) }, - { AOM_CDF4(30602, 32477, 32670) }, - { AOM_CDF4(20026, 29955, 31568) }, - { AOM_CDF4(11220, 23628, 28105) }, - { AOM_CDF4(6652, 17019, 22973) }, - { AOM_CDF4(3064, 8536, 13043) }, - { AOM_CDF4(31769, 32724, 32748) }, - { AOM_CDF4(22230, 30887, 32373) }, - { AOM_CDF4(12234, 25079, 29731) }, - { AOM_CDF4(7326, 18816, 25353) }, - { AOM_CDF4(3933, 10907, 16616) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) } }, - { { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - 
{ AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) } } } }, - { { { { AOM_CDF4(8896, 16227, 20630) }, - { AOM_CDF4(23629, 31782, 32527) }, - { AOM_CDF4(15173, 27755, 31321) }, - { AOM_CDF4(10158, 21233, 27382) }, - { AOM_CDF4(6420, 14857, 21558) }, - { AOM_CDF4(3269, 8155, 12646) }, - { AOM_CDF4(24835, 32009, 32496) }, - { AOM_CDF4(16509, 28421, 31579) }, - { AOM_CDF4(10957, 21514, 27418) }, - { AOM_CDF4(7881, 15930, 22096) }, - { AOM_CDF4(5388, 10960, 15918) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(20745, 30773, 32093) }, - { AOM_CDF4(15200, 27221, 30861) }, - { AOM_CDF4(13032, 20873, 25667) }, - { AOM_CDF4(12285, 18663, 23494) }, 
- { AOM_CDF4(11563, 17481, 21489) }, - { AOM_CDF4(26260, 31982, 32320) }, - { AOM_CDF4(15397, 28083, 31100) }, - { AOM_CDF4(9742, 19217, 24824) }, - { AOM_CDF4(3261, 9629, 15362) }, - { AOM_CDF4(1480, 4322, 7499) }, - { AOM_CDF4(27599, 32256, 32460) }, - { AOM_CDF4(16857, 27659, 30774) }, - { AOM_CDF4(9551, 18290, 23748) }, - { AOM_CDF4(3052, 8933, 14103) }, - { AOM_CDF4(2021, 5910, 9787) }, - { AOM_CDF4(29005, 32015, 32392) }, - { AOM_CDF4(17677, 27694, 30863) }, - { AOM_CDF4(9204, 17356, 23219) }, - { AOM_CDF4(2403, 7516, 12814) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) } }, - { { AOM_CDF4(10808, 22056, 26896) }, - { AOM_CDF4(25739, 32313, 32676) }, - { AOM_CDF4(17288, 30203, 32221) }, - { AOM_CDF4(11359, 24878, 29896) }, - { AOM_CDF4(6949, 17767, 24893) }, - { AOM_CDF4(4287, 11796, 18071) }, - { AOM_CDF4(27880, 32521, 32705) }, - { AOM_CDF4(19038, 31004, 32414) }, - { AOM_CDF4(12564, 26345, 30768) }, - { AOM_CDF4(8269, 19947, 26779) }, - { AOM_CDF4(5674, 14657, 21674) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(25742, 32319, 32671) }, - { AOM_CDF4(19557, 31164, 32454) }, - { AOM_CDF4(13381, 26381, 30755) }, - { AOM_CDF4(10101, 21466, 26722) }, - { AOM_CDF4(9209, 19650, 26825) }, - { AOM_CDF4(27107, 31917, 32432) }, - { AOM_CDF4(18056, 28893, 31203) }, - { AOM_CDF4(10200, 21434, 26764) }, - { AOM_CDF4(4660, 12913, 19502) }, - { AOM_CDF4(2368, 6930, 12504) }, - { AOM_CDF4(26960, 32158, 32613) }, - { AOM_CDF4(18628, 30005, 32031) }, - { AOM_CDF4(10233, 22442, 28232) }, - { AOM_CDF4(5471, 14630, 21516) }, - { AOM_CDF4(3235, 10767, 17109) }, - { AOM_CDF4(27696, 32440, 32692) }, - { AOM_CDF4(20032, 31167, 32438) 
}, - { AOM_CDF4(8700, 21341, 28442) }, - { AOM_CDF4(5662, 14831, 21795) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) } } }, - { { { AOM_CDF4(9704, 17294, 21132) }, - { AOM_CDF4(26762, 32278, 32633) }, - { AOM_CDF4(18382, 29620, 31819) }, - { AOM_CDF4(10891, 23475, 28723) }, - { AOM_CDF4(6358, 16583, 23309) }, - { AOM_CDF4(3248, 9118, 14141) }, - { AOM_CDF4(27204, 32573, 32699) }, - { AOM_CDF4(19818, 30824, 32329) }, - { AOM_CDF4(11772, 25120, 30041) }, - { AOM_CDF4(6995, 18033, 25039) }, - { AOM_CDF4(3752, 10442, 16098) }, - { AOM_CDF4(27222, 32256, 32559) }, - { AOM_CDF4(15356, 28399, 31475) }, - { AOM_CDF4(8821, 20635, 27057) }, - { AOM_CDF4(5511, 14404, 21239) }, - { AOM_CDF4(2935, 8222, 13051) }, - { AOM_CDF4(24875, 32120, 32529) }, - { AOM_CDF4(15233, 28265, 31445) }, - { AOM_CDF4(8605, 20570, 26932) }, - { AOM_CDF4(5431, 14413, 21196) }, - { AOM_CDF4(2994, 8341, 13223) }, - { AOM_CDF4(28201, 32604, 32700) }, - { AOM_CDF4(21041, 31446, 32456) }, - { AOM_CDF4(13221, 26213, 30475) }, - { AOM_CDF4(8255, 19385, 26037) }, - { AOM_CDF4(4930, 12585, 18830) }, - { AOM_CDF4(28768, 32448, 32627) }, - { AOM_CDF4(19705, 30561, 32021) }, - { AOM_CDF4(11572, 23589, 28220) }, - { AOM_CDF4(5532, 15034, 21446) }, - { AOM_CDF4(2460, 7150, 11456) }, - { AOM_CDF4(29874, 32619, 32699) }, - { AOM_CDF4(21621, 31071, 32201) }, - { AOM_CDF4(12511, 24747, 28992) }, - { AOM_CDF4(6281, 16395, 22748) }, - { AOM_CDF4(3246, 9278, 14497) }, - { AOM_CDF4(29715, 32625, 32712) }, - { AOM_CDF4(20958, 31011, 32283) }, - { AOM_CDF4(11233, 23671, 28806) }, - { AOM_CDF4(6012, 16128, 22868) }, - { AOM_CDF4(3427, 9851, 15414) }, - { AOM_CDF4(8192, 16384, 24576) } }, - { { AOM_CDF4(11016, 22111, 26794) }, - { AOM_CDF4(25946, 32357, 32677) }, - { AOM_CDF4(17890, 30452, 32252) }, - { AOM_CDF4(11678, 25142, 29816) }, - { AOM_CDF4(6720, 17534, 24584) }, - { AOM_CDF4(4230, 11665, 17820) }, - { AOM_CDF4(28400, 32623, 32747) }, - { AOM_CDF4(21164, 31668, 32575) }, - { 
AOM_CDF4(13572, 27388, 31182) }, - { AOM_CDF4(8234, 20750, 27358) }, - { AOM_CDF4(5065, 14055, 20897) }, - { AOM_CDF4(28981, 32547, 32705) }, - { AOM_CDF4(18681, 30543, 32239) }, - { AOM_CDF4(10919, 24075, 29286) }, - { AOM_CDF4(6431, 17199, 24077) }, - { AOM_CDF4(3819, 10464, 16618) }, - { AOM_CDF4(26870, 32467, 32693) }, - { AOM_CDF4(19041, 30831, 32347) }, - { AOM_CDF4(11794, 25211, 30016) }, - { AOM_CDF4(6888, 18019, 24970) }, - { AOM_CDF4(4370, 12363, 18992) }, - { AOM_CDF4(29578, 32670, 32744) }, - { AOM_CDF4(23159, 32007, 32613) }, - { AOM_CDF4(15315, 28669, 31676) }, - { AOM_CDF4(9298, 22607, 28782) }, - { AOM_CDF4(6144, 15913, 22968) }, - { AOM_CDF4(28110, 32499, 32669) }, - { AOM_CDF4(21574, 30937, 32015) }, - { AOM_CDF4(12759, 24818, 28727) }, - { AOM_CDF4(6545, 16761, 23042) }, - { AOM_CDF4(3649, 10597, 16833) }, - { AOM_CDF4(28163, 32552, 32728) }, - { AOM_CDF4(22101, 31469, 32464) }, - { AOM_CDF4(13160, 25472, 30143) }, - { AOM_CDF4(7303, 18684, 25468) }, - { AOM_CDF4(5241, 13975, 20955) }, - { AOM_CDF4(28400, 32631, 32744) }, - { AOM_CDF4(22104, 31793, 32603) }, - { AOM_CDF4(13557, 26571, 30846) }, - { AOM_CDF4(7749, 19861, 26675) }, - { AOM_CDF4(4873, 14030, 21234) }, - { AOM_CDF4(8192, 16384, 24576) } } }, - { { { AOM_CDF4(9800, 17635, 21073) }, - { AOM_CDF4(26153, 31885, 32527) }, - { AOM_CDF4(15038, 27852, 31006) }, - { AOM_CDF4(8718, 20564, 26486) }, - { AOM_CDF4(5128, 14076, 20514) }, - { AOM_CDF4(2636, 7566, 11925) }, - { AOM_CDF4(27551, 32504, 32701) }, - { AOM_CDF4(18310, 30054, 32100) }, - { AOM_CDF4(10211, 23420, 29082) }, - { AOM_CDF4(6222, 16876, 23916) }, - { AOM_CDF4(3462, 9954, 15498) }, - { AOM_CDF4(29991, 32633, 32721) }, - { AOM_CDF4(19883, 30751, 32201) }, - { AOM_CDF4(11141, 24184, 29285) }, - { AOM_CDF4(6420, 16940, 23774) }, - { AOM_CDF4(3392, 9753, 15118) }, - { AOM_CDF4(28465, 32616, 32712) }, - { AOM_CDF4(19850, 30702, 32244) }, - { AOM_CDF4(10983, 24024, 29223) }, - { AOM_CDF4(6294, 16770, 23582) }, - { AOM_CDF4(3244, 9283, 
14509) }, - { AOM_CDF4(30023, 32717, 32748) }, - { AOM_CDF4(22940, 32032, 32626) }, - { AOM_CDF4(14282, 27928, 31473) }, - { AOM_CDF4(8562, 21327, 27914) }, - { AOM_CDF4(4846, 13393, 19919) }, - { AOM_CDF4(29981, 32590, 32695) }, - { AOM_CDF4(20465, 30963, 32166) }, - { AOM_CDF4(11479, 23579, 28195) }, - { AOM_CDF4(5916, 15648, 22073) }, - { AOM_CDF4(3031, 8605, 13398) }, - { AOM_CDF4(31146, 32691, 32739) }, - { AOM_CDF4(23106, 31724, 32444) }, - { AOM_CDF4(13783, 26738, 30439) }, - { AOM_CDF4(7852, 19468, 25807) }, - { AOM_CDF4(3860, 11124, 16853) }, - { AOM_CDF4(31014, 32724, 32748) }, - { AOM_CDF4(23629, 32109, 32628) }, - { AOM_CDF4(14747, 28115, 31403) }, - { AOM_CDF4(8545, 21242, 27478) }, - { AOM_CDF4(4574, 12781, 19067) }, - { AOM_CDF4(8192, 16384, 24576) } }, - { { AOM_CDF4(9185, 19694, 24688) }, - { AOM_CDF4(26081, 31985, 32621) }, - { AOM_CDF4(16015, 29000, 31787) }, - { AOM_CDF4(10542, 23690, 29206) }, - { AOM_CDF4(6732, 17945, 24677) }, - { AOM_CDF4(3916, 11039, 16722) }, - { AOM_CDF4(28224, 32566, 32744) }, - { AOM_CDF4(19100, 31138, 32485) }, - { AOM_CDF4(12528, 26620, 30879) }, - { AOM_CDF4(7741, 20277, 26885) }, - { AOM_CDF4(4566, 12845, 18990) }, - { AOM_CDF4(29933, 32593, 32718) }, - { AOM_CDF4(17670, 30333, 32155) }, - { AOM_CDF4(10385, 23600, 28909) }, - { AOM_CDF4(6243, 16236, 22407) }, - { AOM_CDF4(3976, 10389, 16017) }, - { AOM_CDF4(28377, 32561, 32738) }, - { AOM_CDF4(19366, 31175, 32482) }, - { AOM_CDF4(13327, 27175, 31094) }, - { AOM_CDF4(8258, 20769, 27143) }, - { AOM_CDF4(4703, 13198, 19527) }, - { AOM_CDF4(31086, 32706, 32748) }, - { AOM_CDF4(22853, 31902, 32583) }, - { AOM_CDF4(14759, 28186, 31419) }, - { AOM_CDF4(9284, 22382, 28348) }, - { AOM_CDF4(5585, 15192, 21868) }, - { AOM_CDF4(28291, 32652, 32746) }, - { AOM_CDF4(19849, 32107, 32571) }, - { AOM_CDF4(14834, 26818, 29214) }, - { AOM_CDF4(10306, 22594, 28672) }, - { AOM_CDF4(6615, 17384, 23384) }, - { AOM_CDF4(28947, 32604, 32745) }, - { AOM_CDF4(25625, 32289, 32646) }, - { 
AOM_CDF4(18758, 28672, 31403) }, - { AOM_CDF4(10017, 23430, 28523) }, - { AOM_CDF4(6862, 15269, 22131) }, - { AOM_CDF4(23933, 32509, 32739) }, - { AOM_CDF4(19927, 31495, 32631) }, - { AOM_CDF4(11903, 26023, 30621) }, - { AOM_CDF4(7026, 20094, 27252) }, - { AOM_CDF4(5998, 18106, 24437) }, - { AOM_CDF4(8192, 16384, 24576) } } }, - { { { AOM_CDF4(4456, 11274, 15533) }, - { AOM_CDF4(21219, 29079, 31616) }, - { AOM_CDF4(11173, 23774, 28567) }, - { AOM_CDF4(7282, 18293, 24263) }, - { AOM_CDF4(4890, 13286, 19115) }, - { AOM_CDF4(1890, 5508, 8659) }, - { AOM_CDF4(26651, 32136, 32647) }, - { AOM_CDF4(14630, 28254, 31455) }, - { AOM_CDF4(8716, 21287, 27395) }, - { AOM_CDF4(5615, 15331, 22008) }, - { AOM_CDF4(2675, 7700, 12150) }, - { AOM_CDF4(29954, 32526, 32690) }, - { AOM_CDF4(16126, 28982, 31633) }, - { AOM_CDF4(9030, 21361, 27352) }, - { AOM_CDF4(5411, 14793, 21271) }, - { AOM_CDF4(2943, 8422, 13163) }, - { AOM_CDF4(29539, 32601, 32730) }, - { AOM_CDF4(18125, 30385, 32201) }, - { AOM_CDF4(10422, 24090, 29468) }, - { AOM_CDF4(6468, 17487, 24438) }, - { AOM_CDF4(2970, 8653, 13531) }, - { AOM_CDF4(30912, 32715, 32748) }, - { AOM_CDF4(20666, 31373, 32497) }, - { AOM_CDF4(12509, 26640, 30917) }, - { AOM_CDF4(8058, 20629, 27290) }, - { AOM_CDF4(4231, 12006, 18052) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) } }, - { { AOM_CDF4(10202, 20633, 25484) }, - { AOM_CDF4(27336, 31445, 32352) }, - { AOM_CDF4(12420, 24384, 28552) }, - { AOM_CDF4(7648, 18115, 23856) 
}, - { AOM_CDF4(5662, 14341, 19902) }, - { AOM_CDF4(3611, 10328, 15390) }, - { AOM_CDF4(30945, 32616, 32736) }, - { AOM_CDF4(18682, 30505, 32253) }, - { AOM_CDF4(11513, 25336, 30203) }, - { AOM_CDF4(7449, 19452, 26148) }, - { AOM_CDF4(4482, 13051, 18886) }, - { AOM_CDF4(32022, 32690, 32747) }, - { AOM_CDF4(18578, 30501, 32146) }, - { AOM_CDF4(11249, 23368, 28631) }, - { AOM_CDF4(5645, 16958, 22158) }, - { AOM_CDF4(5009, 11444, 16637) }, - { AOM_CDF4(31357, 32710, 32748) }, - { AOM_CDF4(21552, 31494, 32504) }, - { AOM_CDF4(13891, 27677, 31340) }, - { AOM_CDF4(9051, 22098, 28172) }, - { AOM_CDF4(5190, 13377, 19486) }, - { AOM_CDF4(32364, 32740, 32748) }, - { AOM_CDF4(24839, 31907, 32551) }, - { AOM_CDF4(17160, 28779, 31696) }, - { AOM_CDF4(12452, 24137, 29602) }, - { AOM_CDF4(6165, 15389, 22477) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) } } }, - { { { AOM_CDF4(2575, 7281, 11077) }, - { AOM_CDF4(14002, 20866, 25402) }, - { AOM_CDF4(6343, 15056, 19658) }, - { AOM_CDF4(4474, 11858, 17041) }, - { AOM_CDF4(2865, 8299, 12534) }, - { AOM_CDF4(1344, 3949, 6391) }, - { AOM_CDF4(24720, 31239, 32459) }, - { AOM_CDF4(12585, 25356, 29968) }, - { AOM_CDF4(7181, 18246, 24444) }, - { AOM_CDF4(5025, 13667, 19885) }, - { AOM_CDF4(2521, 7304, 11605) }, - { AOM_CDF4(29908, 32252, 32584) }, - { AOM_CDF4(17421, 29156, 31575) }, - { AOM_CDF4(9889, 22188, 27782) }, - { AOM_CDF4(5878, 15647, 22123) }, - { AOM_CDF4(2814, 8665, 13323) }, - { AOM_CDF4(30183, 32568, 
32713) }, - { AOM_CDF4(18528, 30195, 32049) }, - { AOM_CDF4(10982, 24606, 29657) }, - { AOM_CDF4(6957, 18165, 25231) }, - { AOM_CDF4(3508, 10118, 15468) }, - { AOM_CDF4(31761, 32736, 32748) }, - { AOM_CDF4(21041, 31328, 32546) }, - { AOM_CDF4(12568, 26732, 31166) }, - { AOM_CDF4(8052, 20720, 27733) }, - { AOM_CDF4(4336, 12192, 18396) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) } }, - { { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, 
- { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) } } } }, - { { { { AOM_CDF4(7062, 16472, 22319) }, - { AOM_CDF4(24538, 32261, 32674) }, - { AOM_CDF4(13675, 28041, 31779) }, - { AOM_CDF4(8590, 20674, 27631) }, - { AOM_CDF4(5685, 14675, 22013) }, - { AOM_CDF4(3655, 9898, 15731) }, - { AOM_CDF4(26493, 32418, 32658) }, - { AOM_CDF4(16376, 29342, 32090) }, - { AOM_CDF4(10594, 22649, 28970) }, - { AOM_CDF4(8176, 17170, 24303) }, - { AOM_CDF4(5605, 12694, 19139) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(23888, 31902, 32542) }, - { AOM_CDF4(18612, 29687, 31987) }, - { AOM_CDF4(16245, 24852, 29249) }, - { AOM_CDF4(15765, 22608, 27559) }, - { AOM_CDF4(19895, 24699, 27510) }, - { AOM_CDF4(28401, 32212, 32457) }, - { AOM_CDF4(15274, 27825, 30980) }, - { AOM_CDF4(9364, 18128, 24332) }, - { AOM_CDF4(2283, 8193, 15082) }, - { AOM_CDF4(1228, 3972, 7881) }, - { AOM_CDF4(29455, 32469, 32620) }, - { AOM_CDF4(17981, 28245, 31388) }, - { AOM_CDF4(10921, 20098, 26240) }, - { AOM_CDF4(3743, 11829, 18657) }, - { AOM_CDF4(2374, 9593, 15715) }, - { AOM_CDF4(31068, 32466, 32635) }, - { AOM_CDF4(20321, 29572, 31971) }, - { AOM_CDF4(10771, 20255, 27119) }, - { AOM_CDF4(2795, 10410, 17361) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) } }, - { { AOM_CDF4(9320, 22102, 
27840) }, - { AOM_CDF4(27057, 32464, 32724) }, - { AOM_CDF4(16331, 30268, 32309) }, - { AOM_CDF4(10319, 23935, 29720) }, - { AOM_CDF4(6189, 16448, 24106) }, - { AOM_CDF4(3589, 10884, 18808) }, - { AOM_CDF4(29026, 32624, 32748) }, - { AOM_CDF4(19226, 31507, 32587) }, - { AOM_CDF4(12692, 26921, 31203) }, - { AOM_CDF4(7049, 19532, 27635) }, - { AOM_CDF4(7727, 15669, 23252) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(28056, 32625, 32748) }, - { AOM_CDF4(22383, 32075, 32669) }, - { AOM_CDF4(15417, 27098, 31749) }, - { AOM_CDF4(18127, 26493, 27190) }, - { AOM_CDF4(5461, 16384, 21845) }, - { AOM_CDF4(27982, 32091, 32584) }, - { AOM_CDF4(19045, 29868, 31972) }, - { AOM_CDF4(10397, 22266, 27932) }, - { AOM_CDF4(5990, 13697, 21500) }, - { AOM_CDF4(1792, 6912, 15104) }, - { AOM_CDF4(28198, 32501, 32718) }, - { AOM_CDF4(21534, 31521, 32569) }, - { AOM_CDF4(11109, 25217, 30017) }, - { AOM_CDF4(5671, 15124, 26151) }, - { AOM_CDF4(4681, 14043, 18725) }, - { AOM_CDF4(28688, 32580, 32741) }, - { AOM_CDF4(22576, 32079, 32661) }, - { AOM_CDF4(10627, 22141, 28340) }, - { AOM_CDF4(9362, 14043, 28087) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) } } }, - { { { AOM_CDF4(7754, 16948, 22142) }, - { AOM_CDF4(25670, 32330, 32691) }, - { AOM_CDF4(15663, 29225, 31994) }, - { AOM_CDF4(9878, 23288, 29158) }, - { AOM_CDF4(6419, 17088, 24336) }, - { AOM_CDF4(3859, 11003, 17039) }, - { AOM_CDF4(27562, 32595, 32725) }, - { AOM_CDF4(17575, 30588, 32399) }, - { AOM_CDF4(10819, 24838, 30309) }, - { AOM_CDF4(7124, 18686, 25916) }, - { AOM_CDF4(4479, 12688, 19340) }, - { AOM_CDF4(28385, 32476, 32673) }, - { AOM_CDF4(15306, 29005, 31938) }, - { 
AOM_CDF4(8937, 21615, 28322) }, - { AOM_CDF4(5982, 15603, 22786) }, - { AOM_CDF4(3620, 10267, 16136) }, - { AOM_CDF4(27280, 32464, 32667) }, - { AOM_CDF4(15607, 29160, 32004) }, - { AOM_CDF4(9091, 22135, 28740) }, - { AOM_CDF4(6232, 16632, 24020) }, - { AOM_CDF4(4047, 11377, 17672) }, - { AOM_CDF4(29220, 32630, 32718) }, - { AOM_CDF4(19650, 31220, 32462) }, - { AOM_CDF4(13050, 26312, 30827) }, - { AOM_CDF4(9228, 20870, 27468) }, - { AOM_CDF4(6146, 15149, 21971) }, - { AOM_CDF4(30169, 32481, 32623) }, - { AOM_CDF4(17212, 29311, 31554) }, - { AOM_CDF4(9911, 21311, 26882) }, - { AOM_CDF4(4487, 13314, 20372) }, - { AOM_CDF4(2570, 7772, 12889) }, - { AOM_CDF4(30924, 32613, 32708) }, - { AOM_CDF4(19490, 30206, 32107) }, - { AOM_CDF4(11232, 23998, 29276) }, - { AOM_CDF4(6769, 17955, 25035) }, - { AOM_CDF4(4398, 12623, 19214) }, - { AOM_CDF4(30609, 32627, 32722) }, - { AOM_CDF4(19370, 30582, 32287) }, - { AOM_CDF4(10457, 23619, 29409) }, - { AOM_CDF4(6443, 17637, 24834) }, - { AOM_CDF4(4645, 13236, 20106) }, - { AOM_CDF4(8192, 16384, 24576) } }, - { { AOM_CDF4(8626, 20271, 26216) }, - { AOM_CDF4(26707, 32406, 32711) }, - { AOM_CDF4(16999, 30329, 32286) }, - { AOM_CDF4(11445, 25123, 30286) }, - { AOM_CDF4(6411, 18828, 25601) }, - { AOM_CDF4(6801, 12458, 20248) }, - { AOM_CDF4(29918, 32682, 32748) }, - { AOM_CDF4(20649, 31739, 32618) }, - { AOM_CDF4(12879, 27773, 31581) }, - { AOM_CDF4(7896, 21751, 28244) }, - { AOM_CDF4(5260, 14870, 23698) }, - { AOM_CDF4(29252, 32593, 32731) }, - { AOM_CDF4(17072, 30460, 32294) }, - { AOM_CDF4(10653, 24143, 29365) }, - { AOM_CDF4(6536, 17490, 23983) }, - { AOM_CDF4(4929, 13170, 20085) }, - { AOM_CDF4(28137, 32518, 32715) }, - { AOM_CDF4(18171, 30784, 32407) }, - { AOM_CDF4(11437, 25436, 30459) }, - { AOM_CDF4(7252, 18534, 26176) }, - { AOM_CDF4(4126, 13353, 20978) }, - { AOM_CDF4(31162, 32726, 32748) }, - { AOM_CDF4(23017, 32222, 32701) }, - { AOM_CDF4(15629, 29233, 32046) }, - { AOM_CDF4(9387, 22621, 29480) }, - { AOM_CDF4(6922, 17616, 
25010) }, - { AOM_CDF4(28838, 32265, 32614) }, - { AOM_CDF4(19701, 30206, 31920) }, - { AOM_CDF4(11214, 22410, 27933) }, - { AOM_CDF4(5320, 14177, 23034) }, - { AOM_CDF4(5049, 12881, 17827) }, - { AOM_CDF4(27484, 32471, 32734) }, - { AOM_CDF4(21076, 31526, 32561) }, - { AOM_CDF4(12707, 26303, 31211) }, - { AOM_CDF4(8169, 21722, 28219) }, - { AOM_CDF4(6045, 19406, 27042) }, - { AOM_CDF4(27753, 32572, 32745) }, - { AOM_CDF4(20832, 31878, 32653) }, - { AOM_CDF4(13250, 27356, 31674) }, - { AOM_CDF4(7718, 21508, 29858) }, - { AOM_CDF4(7209, 18350, 25559) }, - { AOM_CDF4(8192, 16384, 24576) } } }, - { { { AOM_CDF4(7876, 16901, 21741) }, - { AOM_CDF4(24001, 31898, 32625) }, - { AOM_CDF4(14529, 27959, 31451) }, - { AOM_CDF4(8273, 20818, 27258) }, - { AOM_CDF4(5278, 14673, 21510) }, - { AOM_CDF4(2983, 8843, 14039) }, - { AOM_CDF4(28016, 32574, 32732) }, - { AOM_CDF4(17471, 30306, 32301) }, - { AOM_CDF4(10224, 24063, 29728) }, - { AOM_CDF4(6602, 17954, 25052) }, - { AOM_CDF4(4002, 11585, 17759) }, - { AOM_CDF4(30190, 32634, 32739) }, - { AOM_CDF4(17497, 30282, 32270) }, - { AOM_CDF4(10229, 23729, 29538) }, - { AOM_CDF4(6344, 17211, 24440) }, - { AOM_CDF4(3849, 11189, 17108) }, - { AOM_CDF4(28570, 32583, 32726) }, - { AOM_CDF4(17521, 30161, 32238) }, - { AOM_CDF4(10153, 23565, 29378) }, - { AOM_CDF4(6455, 17341, 24443) }, - { AOM_CDF4(3907, 11042, 17024) }, - { AOM_CDF4(30689, 32715, 32748) }, - { AOM_CDF4(21546, 31840, 32610) }, - { AOM_CDF4(13547, 27581, 31459) }, - { AOM_CDF4(8912, 21757, 28309) }, - { AOM_CDF4(5548, 15080, 22046) }, - { AOM_CDF4(30783, 32540, 32685) }, - { AOM_CDF4(17540, 29528, 31668) }, - { AOM_CDF4(10160, 21468, 26783) }, - { AOM_CDF4(4724, 13393, 20054) }, - { AOM_CDF4(2702, 8174, 13102) }, - { AOM_CDF4(31648, 32686, 32742) }, - { AOM_CDF4(20954, 31094, 32337) }, - { AOM_CDF4(12420, 25698, 30179) }, - { AOM_CDF4(7304, 19320, 26248) }, - { AOM_CDF4(4366, 12261, 18864) }, - { AOM_CDF4(31581, 32723, 32748) }, - { AOM_CDF4(21373, 31586, 32525) }, - { 
AOM_CDF4(12744, 26625, 30885) }, - { AOM_CDF4(7431, 20322, 26950) }, - { AOM_CDF4(4692, 13323, 20111) }, - { AOM_CDF4(8192, 16384, 24576) } }, - { { AOM_CDF4(7833, 18369, 24095) }, - { AOM_CDF4(26650, 32273, 32702) }, - { AOM_CDF4(16371, 29961, 32191) }, - { AOM_CDF4(11055, 24082, 29629) }, - { AOM_CDF4(6892, 18644, 25400) }, - { AOM_CDF4(5006, 13057, 19240) }, - { AOM_CDF4(29834, 32666, 32748) }, - { AOM_CDF4(19577, 31335, 32570) }, - { AOM_CDF4(12253, 26509, 31122) }, - { AOM_CDF4(7991, 20772, 27711) }, - { AOM_CDF4(5677, 15910, 23059) }, - { AOM_CDF4(30109, 32532, 32720) }, - { AOM_CDF4(16747, 30166, 32252) }, - { AOM_CDF4(10134, 23542, 29184) }, - { AOM_CDF4(5791, 16176, 23556) }, - { AOM_CDF4(4362, 10414, 17284) }, - { AOM_CDF4(29492, 32626, 32748) }, - { AOM_CDF4(19894, 31402, 32525) }, - { AOM_CDF4(12942, 27071, 30869) }, - { AOM_CDF4(8346, 21216, 27405) }, - { AOM_CDF4(6572, 17087, 23859) }, - { AOM_CDF4(32035, 32735, 32748) }, - { AOM_CDF4(22957, 31838, 32618) }, - { AOM_CDF4(14724, 28572, 31772) }, - { AOM_CDF4(10364, 23999, 29553) }, - { AOM_CDF4(7004, 18433, 25655) }, - { AOM_CDF4(27528, 32277, 32681) }, - { AOM_CDF4(16959, 31171, 32096) }, - { AOM_CDF4(10486, 23593, 27962) }, - { AOM_CDF4(8192, 16384, 23211) }, - { AOM_CDF4(8937, 17873, 20852) }, - { AOM_CDF4(27715, 32002, 32615) }, - { AOM_CDF4(15073, 29491, 31676) }, - { AOM_CDF4(11264, 24576, 28672) }, - { AOM_CDF4(2341, 18725, 23406) }, - { AOM_CDF4(7282, 18204, 25486) }, - { AOM_CDF4(28547, 32213, 32657) }, - { AOM_CDF4(20788, 29773, 32239) }, - { AOM_CDF4(6780, 21469, 30508) }, - { AOM_CDF4(5958, 14895, 23831) }, - { AOM_CDF4(16384, 21845, 27307) }, - { AOM_CDF4(8192, 16384, 24576) } } }, - { { { AOM_CDF4(5992, 14304, 19765) }, - { AOM_CDF4(22612, 31238, 32456) }, - { AOM_CDF4(13456, 27162, 31087) }, - { AOM_CDF4(8001, 20062, 26504) }, - { AOM_CDF4(5168, 14105, 20764) }, - { AOM_CDF4(2632, 7771, 12385) }, - { AOM_CDF4(27034, 32344, 32709) }, - { AOM_CDF4(15850, 29415, 31997) }, - { AOM_CDF4(9494, 
22776, 28841) }, - { AOM_CDF4(6151, 16830, 23969) }, - { AOM_CDF4(3461, 10039, 15722) }, - { AOM_CDF4(30134, 32569, 32731) }, - { AOM_CDF4(15638, 29422, 31945) }, - { AOM_CDF4(9150, 21865, 28218) }, - { AOM_CDF4(5647, 15719, 22676) }, - { AOM_CDF4(3402, 9772, 15477) }, - { AOM_CDF4(28530, 32586, 32735) }, - { AOM_CDF4(17139, 30298, 32292) }, - { AOM_CDF4(10200, 24039, 29685) }, - { AOM_CDF4(6419, 17674, 24786) }, - { AOM_CDF4(3544, 10225, 15824) }, - { AOM_CDF4(31333, 32726, 32748) }, - { AOM_CDF4(20618, 31487, 32544) }, - { AOM_CDF4(12901, 27217, 31232) }, - { AOM_CDF4(8624, 21734, 28171) }, - { AOM_CDF4(5104, 14191, 20748) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) } }, - { { AOM_CDF4(11206, 21090, 26561) }, - { AOM_CDF4(28759, 32279, 32671) }, - { AOM_CDF4(14171, 27952, 31569) }, - { AOM_CDF4(9743, 22907, 29141) }, - { AOM_CDF4(6871, 17886, 24868) }, - { AOM_CDF4(4960, 13152, 19315) }, - { AOM_CDF4(31077, 32661, 32748) }, - { AOM_CDF4(19400, 31195, 32515) }, - { AOM_CDF4(12752, 26858, 31040) }, - { AOM_CDF4(8370, 22098, 28591) }, - { AOM_CDF4(5457, 15373, 22298) }, - { AOM_CDF4(31697, 32706, 32748) }, - { AOM_CDF4(17860, 30657, 32333) }, - { AOM_CDF4(12510, 24812, 29261) }, - { AOM_CDF4(6180, 19124, 24722) }, - { AOM_CDF4(5041, 13548, 17959) }, - { AOM_CDF4(31552, 32716, 32748) }, - { AOM_CDF4(21908, 31769, 32623) }, - { AOM_CDF4(14470, 28201, 31565) }, - { AOM_CDF4(9493, 22982, 28608) }, - { AOM_CDF4(6858, 17240, 24137) }, - { 
AOM_CDF4(32543, 32752, 32756) }, - { AOM_CDF4(24286, 32097, 32666) }, - { AOM_CDF4(15958, 29217, 32024) }, - { AOM_CDF4(10207, 24234, 29958) }, - { AOM_CDF4(6929, 18305, 25652) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) } } }, - { { { AOM_CDF4(4137, 10847, 15682) }, - { AOM_CDF4(17824, 27001, 30058) }, - { AOM_CDF4(10204, 22796, 28291) }, - { AOM_CDF4(6076, 15935, 22125) }, - { AOM_CDF4(3852, 10937, 16816) }, - { AOM_CDF4(2252, 6324, 10131) }, - { AOM_CDF4(25840, 32016, 32662) }, - { AOM_CDF4(15109, 28268, 31531) }, - { AOM_CDF4(9385, 22231, 28340) }, - { AOM_CDF4(6082, 16672, 23479) }, - { AOM_CDF4(3318, 9427, 14681) }, - { AOM_CDF4(30594, 32574, 32718) }, - { AOM_CDF4(16836, 29552, 31859) }, - { AOM_CDF4(9556, 22542, 28356) }, - { AOM_CDF4(6305, 16725, 23540) }, - { AOM_CDF4(3376, 9895, 15184) }, - { AOM_CDF4(29383, 32617, 32745) }, - { AOM_CDF4(18891, 30809, 32401) }, - { AOM_CDF4(11688, 25942, 30687) }, - { AOM_CDF4(7468, 19469, 26651) }, - { AOM_CDF4(3909, 11358, 17012) }, - { AOM_CDF4(31564, 32736, 32748) }, - { AOM_CDF4(20906, 31611, 32600) }, - { AOM_CDF4(13191, 27621, 31537) }, - { AOM_CDF4(8768, 22029, 28676) }, - { AOM_CDF4(5079, 14109, 20906) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - 
{ AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) } }, - { { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) }, - { AOM_CDF4(8192, 16384, 24576) } } } } }; + [CDF_SIZE(NUM_BASE_LEVELS + + 2)] = { { { { { AOM_CDF4(4034, 8930, 12727) }, + { AOM_CDF4(18082, 29741, 31877) }, + { AOM_CDF4(12596, 26124, 30493) }, + { AOM_CDF4(9446, 21118, 27005) }, 
+ { AOM_CDF4(6308, 15141, 21279) }, + { AOM_CDF4(2463, 6357, 9783) }, + { AOM_CDF4(20667, 30546, 31929) }, + { AOM_CDF4(13043, 26123, 30134) }, + { AOM_CDF4(8151, 18757, 24778) }, + { AOM_CDF4(5255, 12839, 18632) }, + { AOM_CDF4(2820, 7206, 11161) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(15736, 27553, 30604) }, + { AOM_CDF4(11210, 23794, 28787) }, + { AOM_CDF4(5947, 13874, 19701) }, + { AOM_CDF4(4215, 9323, 13891) }, + { AOM_CDF4(2833, 6462, 10059) }, + { AOM_CDF4(19605, 30393, 31582) }, + { AOM_CDF4(13523, 26252, 30248) }, + { AOM_CDF4(8446, 18622, 24512) }, + { AOM_CDF4(3818, 10343, 15974) }, + { AOM_CDF4(1481, 4117, 6796) }, + { AOM_CDF4(22649, 31302, 32190) }, + { AOM_CDF4(14829, 27127, 30449) }, + { AOM_CDF4(8313, 17702, 23304) }, + { AOM_CDF4(3022, 8301, 12786) }, + { AOM_CDF4(1536, 4412, 7184) }, + { AOM_CDF4(22354, 29774, 31372) }, + { AOM_CDF4(14723, 25472, 29214) }, + { AOM_CDF4(6673, 13745, 18662) }, + { AOM_CDF4(2068, 5766, 9322) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(6302, 16444, 21761) }, + { AOM_CDF4(23040, 31538, 32475) }, + { AOM_CDF4(15196, 28452, 31496) }, + { AOM_CDF4(10020, 22946, 28514) }, + { AOM_CDF4(6533, 16862, 23501) }, + { AOM_CDF4(3538, 9816, 15076) }, + { AOM_CDF4(24444, 31875, 32525) }, + { AOM_CDF4(15881, 28924, 31635) }, + { AOM_CDF4(9922, 22873, 28466) }, + { AOM_CDF4(6527, 16966, 23691) }, + { AOM_CDF4(4114, 11303, 17220) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { 
AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(20201, 30770, 32209) }, + { AOM_CDF4(14754, 28071, 31258) }, + { AOM_CDF4(8378, 20186, 26517) }, + { AOM_CDF4(5916, 15299, 21978) }, + { AOM_CDF4(4268, 11583, 17901) }, + { AOM_CDF4(24361, 32025, 32581) }, + { AOM_CDF4(18673, 30105, 31943) }, + { AOM_CDF4(10196, 22244, 27576) }, + { AOM_CDF4(5495, 14349, 20417) }, + { AOM_CDF4(2676, 7415, 11498) }, + { AOM_CDF4(24678, 31958, 32585) }, + { AOM_CDF4(18629, 29906, 31831) }, + { AOM_CDF4(9364, 20724, 26315) }, + { AOM_CDF4(4641, 12318, 18094) }, + { AOM_CDF4(2758, 7387, 11579) }, + { AOM_CDF4(25433, 31842, 32469) }, + { AOM_CDF4(18795, 29289, 31411) }, + { AOM_CDF4(7644, 17584, 23592) }, + { AOM_CDF4(3408, 9014, 15047) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } } }, + { { { AOM_CDF4(4536, 10072, 14001) }, + { AOM_CDF4(25459, 31416, 32206) }, + { AOM_CDF4(16605, 28048, 30818) }, + { AOM_CDF4(11008, 22857, 27719) }, + { AOM_CDF4(6915, 16268, 22315) }, + { AOM_CDF4(2625, 6812, 10537) }, + { AOM_CDF4(24257, 31788, 32499) }, + { AOM_CDF4(16880, 29454, 31879) }, + { AOM_CDF4(11958, 25054, 29778) }, + { AOM_CDF4(7916, 18718, 25084) }, + { AOM_CDF4(3383, 8777, 13446) }, + { AOM_CDF4(22720, 31603, 32393) }, + { AOM_CDF4(14960, 28125, 31335) }, + { AOM_CDF4(9731, 22210, 27928) }, + { AOM_CDF4(6304, 15832, 22277) }, + { AOM_CDF4(2910, 7818, 12166) }, + { AOM_CDF4(20375, 30627, 32131) }, + { AOM_CDF4(13904, 27284, 30887) }, + { AOM_CDF4(9368, 21558, 27144) }, + { AOM_CDF4(5937, 14966, 21119) }, + { AOM_CDF4(2667, 7225, 11319) }, + { AOM_CDF4(23970, 31470, 32378) }, + { AOM_CDF4(17173, 29734, 32018) }, + { AOM_CDF4(12795, 25441, 29965) }, + { AOM_CDF4(8981, 19680, 25893) }, + { AOM_CDF4(4728, 11372, 16902) }, + { AOM_CDF4(24287, 31797, 32439) }, + { AOM_CDF4(16703, 29145, 31696) }, + { AOM_CDF4(10833, 23554, 28725) }, + { AOM_CDF4(6468, 16566, 23057) 
}, + { AOM_CDF4(2415, 6562, 10278) }, + { AOM_CDF4(26610, 32395, 32659) }, + { AOM_CDF4(18590, 30498, 32117) }, + { AOM_CDF4(12420, 25756, 29950) }, + { AOM_CDF4(7639, 18746, 24710) }, + { AOM_CDF4(3001, 8086, 12347) }, + { AOM_CDF4(25076, 32064, 32580) }, + { AOM_CDF4(17946, 30128, 32028) }, + { AOM_CDF4(12024, 24985, 29378) }, + { AOM_CDF4(7517, 18390, 24304) }, + { AOM_CDF4(3243, 8781, 13331) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(6037, 16771, 21957) }, + { AOM_CDF4(24774, 31704, 32426) }, + { AOM_CDF4(16830, 28589, 31056) }, + { AOM_CDF4(10602, 22828, 27760) }, + { AOM_CDF4(6733, 16829, 23071) }, + { AOM_CDF4(3250, 8914, 13556) }, + { AOM_CDF4(25582, 32220, 32668) }, + { AOM_CDF4(18659, 30342, 32223) }, + { AOM_CDF4(12546, 26149, 30515) }, + { AOM_CDF4(8420, 20451, 26801) }, + { AOM_CDF4(4636, 12420, 18344) }, + { AOM_CDF4(27581, 32362, 32639) }, + { AOM_CDF4(18987, 30083, 31978) }, + { AOM_CDF4(11327, 24248, 29084) }, + { AOM_CDF4(7264, 17719, 24120) }, + { AOM_CDF4(3995, 10768, 16169) }, + { AOM_CDF4(25893, 31831, 32487) }, + { AOM_CDF4(16577, 28587, 31379) }, + { AOM_CDF4(10189, 22748, 28182) }, + { AOM_CDF4(6832, 17094, 23556) }, + { AOM_CDF4(3708, 10110, 15334) }, + { AOM_CDF4(25904, 32282, 32656) }, + { AOM_CDF4(19721, 30792, 32276) }, + { AOM_CDF4(12819, 26243, 30411) }, + { AOM_CDF4(8572, 20614, 26891) }, + { AOM_CDF4(5364, 14059, 20467) }, + { AOM_CDF4(26580, 32438, 32677) }, + { AOM_CDF4(20852, 31225, 32340) }, + { AOM_CDF4(12435, 25700, 29967) }, + { AOM_CDF4(8691, 20825, 26976) }, + { AOM_CDF4(4446, 12209, 17269) }, + { AOM_CDF4(27350, 32429, 32696) }, + { AOM_CDF4(21372, 30977, 32272) }, + { AOM_CDF4(12673, 25270, 29853) }, + { AOM_CDF4(9208, 20925, 26640) }, + { AOM_CDF4(5018, 13351, 18732) }, + { AOM_CDF4(27351, 32479, 32713) }, + { AOM_CDF4(21398, 31209, 32387) }, + { AOM_CDF4(12162, 25047, 29842) }, + { AOM_CDF4(7896, 18691, 25319) }, + { AOM_CDF4(4670, 12882, 18881) }, + { AOM_CDF4(8192, 16384, 24576) } } }, + { { { 
AOM_CDF4(5487, 10460, 13708) }, + { AOM_CDF4(21597, 28303, 30674) }, + { AOM_CDF4(11037, 21953, 26476) }, + { AOM_CDF4(8147, 17962, 22952) }, + { AOM_CDF4(5242, 13061, 18532) }, + { AOM_CDF4(1889, 5208, 8182) }, + { AOM_CDF4(26774, 32133, 32590) }, + { AOM_CDF4(17844, 29564, 31767) }, + { AOM_CDF4(11690, 24438, 29171) }, + { AOM_CDF4(7542, 18215, 24459) }, + { AOM_CDF4(2993, 8050, 12319) }, + { AOM_CDF4(28023, 32328, 32591) }, + { AOM_CDF4(18651, 30126, 31954) }, + { AOM_CDF4(12164, 25146, 29589) }, + { AOM_CDF4(7762, 18530, 24771) }, + { AOM_CDF4(3492, 9183, 13920) }, + { AOM_CDF4(27591, 32008, 32491) }, + { AOM_CDF4(17149, 28853, 31510) }, + { AOM_CDF4(11485, 24003, 28860) }, + { AOM_CDF4(7697, 18086, 24210) }, + { AOM_CDF4(3075, 7999, 12218) }, + { AOM_CDF4(28268, 32482, 32654) }, + { AOM_CDF4(19631, 31051, 32404) }, + { AOM_CDF4(13860, 27260, 31020) }, + { AOM_CDF4(9605, 21613, 27594) }, + { AOM_CDF4(4876, 12162, 17908) }, + { AOM_CDF4(27248, 32316, 32576) }, + { AOM_CDF4(18955, 30457, 32075) }, + { AOM_CDF4(11824, 23997, 28795) }, + { AOM_CDF4(7346, 18196, 24647) }, + { AOM_CDF4(3403, 9247, 14111) }, + { AOM_CDF4(29711, 32655, 32735) }, + { AOM_CDF4(21169, 31394, 32417) }, + { AOM_CDF4(13487, 27198, 30957) }, + { AOM_CDF4(8828, 21683, 27614) }, + { AOM_CDF4(4270, 11451, 17038) }, + { AOM_CDF4(28708, 32578, 32731) }, + { AOM_CDF4(20120, 31241, 32482) }, + { AOM_CDF4(13692, 27550, 31321) }, + { AOM_CDF4(9418, 22514, 28439) }, + { AOM_CDF4(4999, 13283, 19462) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(5673, 14302, 19711) }, + { AOM_CDF4(26251, 30701, 31834) }, + { AOM_CDF4(12782, 23783, 27803) }, + { AOM_CDF4(9127, 20657, 25808) }, + { AOM_CDF4(6368, 16208, 21462) }, + { AOM_CDF4(2465, 7177, 10822) }, + { AOM_CDF4(29961, 32563, 32719) }, + { AOM_CDF4(18318, 29891, 31949) }, + { AOM_CDF4(11361, 24514, 29357) }, + { AOM_CDF4(7900, 19603, 25607) }, + { AOM_CDF4(4002, 10590, 15546) }, + { AOM_CDF4(29637, 32310, 32595) }, + { AOM_CDF4(18296, 29913, 31809) 
}, + { AOM_CDF4(10144, 21515, 26871) }, + { AOM_CDF4(5358, 14322, 20394) }, + { AOM_CDF4(3067, 8362, 13346) }, + { AOM_CDF4(28652, 32470, 32676) }, + { AOM_CDF4(17538, 30771, 32209) }, + { AOM_CDF4(13924, 26882, 30494) }, + { AOM_CDF4(10496, 22837, 27869) }, + { AOM_CDF4(7236, 16396, 21621) }, + { AOM_CDF4(30743, 32687, 32746) }, + { AOM_CDF4(23006, 31676, 32489) }, + { AOM_CDF4(14494, 27828, 31120) }, + { AOM_CDF4(10174, 22801, 28352) }, + { AOM_CDF4(6242, 15281, 21043) }, + { AOM_CDF4(25817, 32243, 32720) }, + { AOM_CDF4(18618, 31367, 32325) }, + { AOM_CDF4(13997, 28318, 31878) }, + { AOM_CDF4(12255, 26534, 31383) }, + { AOM_CDF4(9561, 21588, 28450) }, + { AOM_CDF4(28188, 32635, 32724) }, + { AOM_CDF4(22060, 32365, 32728) }, + { AOM_CDF4(18102, 30690, 32528) }, + { AOM_CDF4(14196, 28864, 31999) }, + { AOM_CDF4(12262, 25792, 30865) }, + { AOM_CDF4(24176, 32109, 32628) }, + { AOM_CDF4(18280, 29681, 31963) }, + { AOM_CDF4(10205, 23703, 29664) }, + { AOM_CDF4(7889, 20025, 27676) }, + { AOM_CDF4(6060, 16743, 23970) }, + { AOM_CDF4(8192, 16384, 24576) } } }, + { { { AOM_CDF4(5141, 7096, 8260) }, + { AOM_CDF4(27186, 29022, 29789) }, + { AOM_CDF4(6668, 12568, 15682) }, + { AOM_CDF4(2172, 6181, 8638) }, + { AOM_CDF4(1126, 3379, 4531) }, + { AOM_CDF4(443, 1361, 2254) }, + { AOM_CDF4(26083, 31153, 32436) }, + { AOM_CDF4(13486, 24603, 28483) }, + { AOM_CDF4(6508, 14840, 19910) }, + { AOM_CDF4(3386, 8800, 13286) }, + { AOM_CDF4(1530, 4322, 7054) }, + { AOM_CDF4(29639, 32080, 32548) }, + { AOM_CDF4(15897, 27552, 30290) }, + { AOM_CDF4(8588, 20047, 25383) }, + { AOM_CDF4(4889, 13339, 19269) }, + { AOM_CDF4(2240, 6871, 10498) }, + { AOM_CDF4(28165, 32197, 32517) }, + { AOM_CDF4(20735, 30427, 31568) }, + { AOM_CDF4(14325, 24671, 27692) }, + { AOM_CDF4(5119, 12554, 17805) }, + { AOM_CDF4(1810, 5441, 8261) }, + { AOM_CDF4(31212, 32724, 32748) }, + { AOM_CDF4(23352, 31766, 32545) }, + { AOM_CDF4(14669, 27570, 31059) }, + { AOM_CDF4(8492, 20894, 27272) }, + { AOM_CDF4(3644, 10194, 
15204) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(2461, 7013, 9371) }, + { AOM_CDF4(24749, 29600, 30986) }, + { AOM_CDF4(9466, 19037, 22417) }, + { AOM_CDF4(3584, 9280, 14400) }, + { AOM_CDF4(1505, 3929, 5433) }, + { AOM_CDF4(677, 1500, 2736) }, + { AOM_CDF4(23987, 30702, 32117) }, + { AOM_CDF4(13554, 24571, 29263) }, + { AOM_CDF4(6211, 14556, 21155) }, + { AOM_CDF4(3135, 10972, 15625) }, + { AOM_CDF4(2435, 7127, 11427) }, + { AOM_CDF4(31300, 32532, 32550) }, + { AOM_CDF4(14757, 30365, 31954) }, + { AOM_CDF4(4405, 11612, 18553) }, + { AOM_CDF4(580, 4132, 7322) }, + { AOM_CDF4(1695, 10169, 14124) }, + { AOM_CDF4(30008, 32282, 32591) }, + { AOM_CDF4(19244, 30108, 31748) }, + { AOM_CDF4(11180, 24158, 29555) }, + { AOM_CDF4(5650, 14972, 19209) }, + { AOM_CDF4(2114, 5109, 8456) }, + { AOM_CDF4(31856, 32716, 32748) }, + { AOM_CDF4(23012, 31664, 32572) }, + { AOM_CDF4(13694, 26656, 30636) }, + { AOM_CDF4(8142, 19508, 26093) }, + { AOM_CDF4(4253, 10955, 16724) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { 
AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } } }, + { { { AOM_CDF4(601, 983, 1311) }, + { AOM_CDF4(18725, 23406, 28087) }, + { AOM_CDF4(5461, 8192, 10923) }, + { AOM_CDF4(3781, 15124, 21425) }, + { AOM_CDF4(2587, 7761, 12072) }, + { AOM_CDF4(106, 458, 810) }, + { AOM_CDF4(22282, 29710, 31894) }, + { AOM_CDF4(8508, 20926, 25984) }, + { AOM_CDF4(3726, 12713, 18083) }, + { AOM_CDF4(1620, 7112, 10893) }, + { AOM_CDF4(729, 2236, 3495) }, + { AOM_CDF4(30163, 32474, 32684) }, + { AOM_CDF4(18304, 30464, 32000) }, + { AOM_CDF4(11443, 26526, 29647) }, + { AOM_CDF4(6007, 15292, 21299) }, + { AOM_CDF4(2234, 6703, 8937) }, + { AOM_CDF4(30954, 32177, 32571) }, + { AOM_CDF4(17363, 29562, 31076) }, + { AOM_CDF4(9686, 22464, 27410) }, + { AOM_CDF4(8192, 16384, 21390) }, + { AOM_CDF4(1755, 8046, 11264) }, + { AOM_CDF4(31168, 32734, 32748) }, + { AOM_CDF4(22486, 31441, 32471) }, + { AOM_CDF4(12833, 25627, 29738) }, + { AOM_CDF4(6980, 17379, 23122) }, + { AOM_CDF4(3111, 8887, 13479) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 
16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } } } }, + { { { { AOM_CDF4(6041, 11854, 15927) }, + { AOM_CDF4(20326, 30905, 32251) }, + { AOM_CDF4(14164, 26831, 30725) }, + { AOM_CDF4(9760, 20647, 26585) }, + { AOM_CDF4(6416, 14953, 21219) }, + { AOM_CDF4(2966, 7151, 10891) }, + { AOM_CDF4(23567, 31374, 32254) }, + { AOM_CDF4(14978, 27416, 30946) }, + { AOM_CDF4(9434, 20225, 26254) }, + { AOM_CDF4(6658, 14558, 20535) }, + { AOM_CDF4(3916, 8677, 12989) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(18088, 29545, 31587) }, + { AOM_CDF4(13062, 25843, 30073) }, + { AOM_CDF4(8940, 
16827, 22251) }, + { AOM_CDF4(7654, 13220, 17973) }, + { AOM_CDF4(5733, 10316, 14456) }, + { AOM_CDF4(22879, 31388, 32114) }, + { AOM_CDF4(15215, 27993, 30955) }, + { AOM_CDF4(9397, 19445, 24978) }, + { AOM_CDF4(3442, 9813, 15344) }, + { AOM_CDF4(1368, 3936, 6532) }, + { AOM_CDF4(25494, 32033, 32406) }, + { AOM_CDF4(16772, 27963, 30718) }, + { AOM_CDF4(9419, 18165, 23260) }, + { AOM_CDF4(2677, 7501, 11797) }, + { AOM_CDF4(1516, 4344, 7170) }, + { AOM_CDF4(26556, 31454, 32101) }, + { AOM_CDF4(17128, 27035, 30108) }, + { AOM_CDF4(8324, 15344, 20249) }, + { AOM_CDF4(1903, 5696, 9469) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(8455, 19003, 24368) }, + { AOM_CDF4(23563, 32021, 32604) }, + { AOM_CDF4(16237, 29446, 31935) }, + { AOM_CDF4(10724, 23999, 29358) }, + { AOM_CDF4(6725, 17528, 24416) }, + { AOM_CDF4(3927, 10927, 16825) }, + { AOM_CDF4(26313, 32288, 32634) }, + { AOM_CDF4(17430, 30095, 32095) }, + { AOM_CDF4(11116, 24606, 29679) }, + { AOM_CDF4(7195, 18384, 25269) }, + { AOM_CDF4(4726, 12852, 19315) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(22822, 31648, 32483) }, + { AOM_CDF4(16724, 29633, 31929) }, + { AOM_CDF4(10261, 23033, 28725) }, + { AOM_CDF4(7029, 17840, 24528) }, + { AOM_CDF4(4867, 13886, 21502) }, + { AOM_CDF4(25298, 31892, 32491) }, + { AOM_CDF4(17809, 29330, 31512) }, + { AOM_CDF4(9668, 21329, 26579) }, + { AOM_CDF4(4774, 12956, 18976) }, + { AOM_CDF4(2322, 7030, 11540) }, + { AOM_CDF4(25472, 31920, 32543) }, + { AOM_CDF4(17957, 29387, 31632) }, + { AOM_CDF4(9196, 20593, 26400) }, + { AOM_CDF4(4680, 12705, 19202) }, + { AOM_CDF4(2917, 8456, 13436) }, + { AOM_CDF4(26471, 
32059, 32574) }, + { AOM_CDF4(18458, 29783, 31909) }, + { AOM_CDF4(8400, 19464, 25956) }, + { AOM_CDF4(3812, 10973, 17206) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } } }, + { { { AOM_CDF4(6779, 13743, 17678) }, + { AOM_CDF4(24806, 31797, 32457) }, + { AOM_CDF4(17616, 29047, 31372) }, + { AOM_CDF4(11063, 23175, 28003) }, + { AOM_CDF4(6521, 16110, 22324) }, + { AOM_CDF4(2764, 7504, 11654) }, + { AOM_CDF4(25266, 32367, 32637) }, + { AOM_CDF4(19054, 30553, 32175) }, + { AOM_CDF4(12139, 25212, 29807) }, + { AOM_CDF4(7311, 18162, 24704) }, + { AOM_CDF4(3397, 9164, 14074) }, + { AOM_CDF4(25988, 32208, 32522) }, + { AOM_CDF4(16253, 28912, 31526) }, + { AOM_CDF4(9151, 21387, 27372) }, + { AOM_CDF4(5688, 14915, 21496) }, + { AOM_CDF4(2717, 7627, 12004) }, + { AOM_CDF4(23144, 31855, 32443) }, + { AOM_CDF4(16070, 28491, 31325) }, + { AOM_CDF4(8702, 20467, 26517) }, + { AOM_CDF4(5243, 13956, 20367) }, + { AOM_CDF4(2621, 7335, 11567) }, + { AOM_CDF4(26636, 32340, 32630) }, + { AOM_CDF4(19990, 31050, 32341) }, + { AOM_CDF4(13243, 26105, 30315) }, + { AOM_CDF4(8588, 19521, 25918) }, + { AOM_CDF4(4717, 11585, 17304) }, + { AOM_CDF4(25844, 32292, 32582) }, + { AOM_CDF4(19090, 30635, 32097) }, + { AOM_CDF4(11963, 24546, 28939) }, + { AOM_CDF4(6218, 16087, 22354) }, + { AOM_CDF4(2340, 6608, 10426) }, + { AOM_CDF4(28046, 32576, 32694) }, + { AOM_CDF4(21178, 31313, 32296) }, + { AOM_CDF4(13486, 26184, 29870) }, + { AOM_CDF4(7149, 17871, 23723) }, + { AOM_CDF4(2833, 7958, 12259) }, + { AOM_CDF4(27710, 32528, 32686) }, + { AOM_CDF4(20674, 31076, 32268) }, + { AOM_CDF4(12413, 24955, 29243) }, + { AOM_CDF4(6676, 16927, 23097) }, + { AOM_CDF4(2966, 8333, 12919) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(8639, 19339, 24429) }, + { AOM_CDF4(24404, 31837, 32525) }, + { AOM_CDF4(16997, 29425, 31784) }, + { AOM_CDF4(11253, 24234, 29149) }, + { AOM_CDF4(6751, 17394, 24028) }, + { AOM_CDF4(3490, 9830, 15191) }, + { AOM_CDF4(26283, 32471, 32714) }, + { 
AOM_CDF4(19599, 31168, 32442) }, + { AOM_CDF4(13146, 26954, 30893) }, + { AOM_CDF4(8214, 20588, 26890) }, + { AOM_CDF4(4699, 13081, 19300) }, + { AOM_CDF4(28212, 32458, 32669) }, + { AOM_CDF4(18594, 30316, 32100) }, + { AOM_CDF4(11219, 24408, 29234) }, + { AOM_CDF4(6865, 17656, 24149) }, + { AOM_CDF4(3678, 10362, 16006) }, + { AOM_CDF4(25825, 32136, 32616) }, + { AOM_CDF4(17313, 29853, 32021) }, + { AOM_CDF4(11197, 24471, 29472) }, + { AOM_CDF4(6947, 17781, 24405) }, + { AOM_CDF4(3768, 10660, 16261) }, + { AOM_CDF4(27352, 32500, 32706) }, + { AOM_CDF4(20850, 31468, 32469) }, + { AOM_CDF4(14021, 27707, 31133) }, + { AOM_CDF4(8964, 21748, 27838) }, + { AOM_CDF4(5437, 14665, 21187) }, + { AOM_CDF4(26304, 32492, 32698) }, + { AOM_CDF4(20409, 31380, 32385) }, + { AOM_CDF4(13682, 27222, 30632) }, + { AOM_CDF4(8974, 21236, 26685) }, + { AOM_CDF4(4234, 11665, 16934) }, + { AOM_CDF4(26273, 32357, 32711) }, + { AOM_CDF4(20672, 31242, 32441) }, + { AOM_CDF4(14172, 27254, 30902) }, + { AOM_CDF4(9870, 21898, 27275) }, + { AOM_CDF4(5164, 13506, 19270) }, + { AOM_CDF4(26725, 32459, 32728) }, + { AOM_CDF4(20991, 31442, 32527) }, + { AOM_CDF4(13071, 26434, 30811) }, + { AOM_CDF4(8184, 20090, 26742) }, + { AOM_CDF4(4803, 13255, 19895) }, + { AOM_CDF4(8192, 16384, 24576) } } }, + { { { AOM_CDF4(7555, 14942, 18501) }, + { AOM_CDF4(24410, 31178, 32287) }, + { AOM_CDF4(14394, 26738, 30253) }, + { AOM_CDF4(8413, 19554, 25195) }, + { AOM_CDF4(4766, 12924, 18785) }, + { AOM_CDF4(2029, 5806, 9207) }, + { AOM_CDF4(26776, 32364, 32663) }, + { AOM_CDF4(18732, 29967, 31931) }, + { AOM_CDF4(11005, 23786, 28852) }, + { AOM_CDF4(6466, 16909, 23510) }, + { AOM_CDF4(3044, 8638, 13419) }, + { AOM_CDF4(29208, 32582, 32704) }, + { AOM_CDF4(20068, 30857, 32208) }, + { AOM_CDF4(12003, 25085, 29595) }, + { AOM_CDF4(6947, 17750, 24189) }, + { AOM_CDF4(3245, 9103, 14007) }, + { AOM_CDF4(27359, 32465, 32669) }, + { AOM_CDF4(19421, 30614, 32174) }, + { AOM_CDF4(11915, 25010, 29579) }, + { AOM_CDF4(6950, 
17676, 24074) }, + { AOM_CDF4(3007, 8473, 13096) }, + { AOM_CDF4(29002, 32676, 32735) }, + { AOM_CDF4(22102, 31849, 32576) }, + { AOM_CDF4(14408, 28009, 31405) }, + { AOM_CDF4(9027, 21679, 27931) }, + { AOM_CDF4(4694, 12678, 18748) }, + { AOM_CDF4(28216, 32528, 32682) }, + { AOM_CDF4(20849, 31264, 32318) }, + { AOM_CDF4(12756, 25815, 29751) }, + { AOM_CDF4(7565, 18801, 24923) }, + { AOM_CDF4(3509, 9533, 14477) }, + { AOM_CDF4(30133, 32687, 32739) }, + { AOM_CDF4(23063, 31910, 32515) }, + { AOM_CDF4(14588, 28051, 31132) }, + { AOM_CDF4(9085, 21649, 27457) }, + { AOM_CDF4(4261, 11654, 17264) }, + { AOM_CDF4(29518, 32691, 32748) }, + { AOM_CDF4(22451, 31959, 32613) }, + { AOM_CDF4(14864, 28722, 31700) }, + { AOM_CDF4(9695, 22964, 28716) }, + { AOM_CDF4(4932, 13358, 19502) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(6465, 16958, 21688) }, + { AOM_CDF4(25199, 31514, 32360) }, + { AOM_CDF4(14774, 27149, 30607) }, + { AOM_CDF4(9257, 21438, 26972) }, + { AOM_CDF4(5723, 15183, 21882) }, + { AOM_CDF4(3150, 8879, 13731) }, + { AOM_CDF4(26989, 32262, 32682) }, + { AOM_CDF4(17396, 29937, 32085) }, + { AOM_CDF4(11387, 24901, 29784) }, + { AOM_CDF4(7289, 18821, 25548) }, + { AOM_CDF4(3734, 10577, 16086) }, + { AOM_CDF4(29728, 32501, 32695) }, + { AOM_CDF4(17431, 29701, 31903) }, + { AOM_CDF4(9921, 22826, 28300) }, + { AOM_CDF4(5896, 15434, 22068) }, + { AOM_CDF4(3430, 9646, 14757) }, + { AOM_CDF4(28614, 32511, 32705) }, + { AOM_CDF4(19364, 30638, 32263) }, + { AOM_CDF4(13129, 26254, 30402) }, + { AOM_CDF4(8754, 20484, 26440) }, + { AOM_CDF4(4378, 11607, 17110) }, + { AOM_CDF4(30292, 32671, 32744) }, + { AOM_CDF4(21780, 31603, 32501) }, + { AOM_CDF4(14314, 27829, 31291) }, + { AOM_CDF4(9611, 22327, 28263) }, + { AOM_CDF4(4890, 13087, 19065) }, + { AOM_CDF4(25862, 32567, 32733) }, + { AOM_CDF4(20794, 32050, 32567) }, + { AOM_CDF4(17243, 30625, 32254) }, + { AOM_CDF4(13283, 27628, 31474) }, + { AOM_CDF4(9669, 22532, 28918) }, + { AOM_CDF4(27435, 32697, 32748) }, + { 
AOM_CDF4(24922, 32390, 32714) }, + { AOM_CDF4(21449, 31504, 32536) }, + { AOM_CDF4(16392, 29729, 31832) }, + { AOM_CDF4(11692, 24884, 29076) }, + { AOM_CDF4(24193, 32290, 32735) }, + { AOM_CDF4(18909, 31104, 32563) }, + { AOM_CDF4(12236, 26841, 31403) }, + { AOM_CDF4(8171, 21840, 29082) }, + { AOM_CDF4(7224, 17280, 25275) }, + { AOM_CDF4(8192, 16384, 24576) } } }, + { { { AOM_CDF4(3078, 6839, 9890) }, + { AOM_CDF4(13837, 20450, 24479) }, + { AOM_CDF4(5914, 14222, 19328) }, + { AOM_CDF4(3866, 10267, 14762) }, + { AOM_CDF4(2612, 7208, 11042) }, + { AOM_CDF4(1067, 2991, 4776) }, + { AOM_CDF4(25817, 31646, 32529) }, + { AOM_CDF4(13708, 26338, 30385) }, + { AOM_CDF4(7328, 18585, 24870) }, + { AOM_CDF4(4691, 13080, 19276) }, + { AOM_CDF4(1825, 5253, 8352) }, + { AOM_CDF4(29386, 32315, 32624) }, + { AOM_CDF4(17160, 29001, 31360) }, + { AOM_CDF4(9602, 21862, 27396) }, + { AOM_CDF4(5915, 15772, 22148) }, + { AOM_CDF4(2786, 7779, 12047) }, + { AOM_CDF4(29246, 32450, 32663) }, + { AOM_CDF4(18696, 29929, 31818) }, + { AOM_CDF4(10510, 23369, 28560) }, + { AOM_CDF4(6229, 16499, 23125) }, + { AOM_CDF4(2608, 7448, 11705) }, + { AOM_CDF4(30753, 32710, 32748) }, + { AOM_CDF4(21638, 31487, 32503) }, + { AOM_CDF4(12937, 26854, 30870) }, + { AOM_CDF4(8182, 20596, 26970) }, + { AOM_CDF4(3637, 10269, 15497) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(5244, 12150, 16906) }, + { AOM_CDF4(20486, 26858, 29701) }, + { AOM_CDF4(7756, 18317, 23735) }, + 
{ AOM_CDF4(3452, 9256, 13146) }, + { AOM_CDF4(2020, 5206, 8229) }, + { AOM_CDF4(1801, 4993, 7903) }, + { AOM_CDF4(27051, 31858, 32531) }, + { AOM_CDF4(15988, 27531, 30619) }, + { AOM_CDF4(9188, 21484, 26719) }, + { AOM_CDF4(6273, 17186, 23800) }, + { AOM_CDF4(3108, 9355, 14764) }, + { AOM_CDF4(31076, 32520, 32680) }, + { AOM_CDF4(18119, 30037, 31850) }, + { AOM_CDF4(10244, 22969, 27472) }, + { AOM_CDF4(4692, 14077, 19273) }, + { AOM_CDF4(3694, 11677, 17556) }, + { AOM_CDF4(30060, 32581, 32720) }, + { AOM_CDF4(21011, 30775, 32120) }, + { AOM_CDF4(11931, 24820, 29289) }, + { AOM_CDF4(7119, 17662, 24356) }, + { AOM_CDF4(3833, 10706, 16304) }, + { AOM_CDF4(31954, 32731, 32748) }, + { AOM_CDF4(23913, 31724, 32489) }, + { AOM_CDF4(15520, 28060, 31286) }, + { AOM_CDF4(11517, 23008, 28571) }, + { AOM_CDF4(6193, 14508, 20629) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } } }, + { { { AOM_CDF4(1035, 2807, 4156) }, + { AOM_CDF4(13162, 18138, 20939) }, + { AOM_CDF4(2696, 6633, 8755) }, + { AOM_CDF4(1373, 4161, 6853) }, + { AOM_CDF4(1099, 2746, 4716) }, + { AOM_CDF4(340, 1021, 1599) }, + { AOM_CDF4(22826, 30419, 32135) }, + { AOM_CDF4(10395, 21762, 26942) }, + { AOM_CDF4(4726, 12407, 17361) }, + { AOM_CDF4(2447, 7080, 10593) }, + { AOM_CDF4(1227, 3717, 6011) }, + { AOM_CDF4(28156, 31424, 31934) }, + { AOM_CDF4(16915, 27754, 30373) }, + { AOM_CDF4(9148, 20990, 26431) }, + { AOM_CDF4(5950, 15515, 21148) }, + { AOM_CDF4(2492, 7327, 11526) }, + { 
AOM_CDF4(30602, 32477, 32670) }, + { AOM_CDF4(20026, 29955, 31568) }, + { AOM_CDF4(11220, 23628, 28105) }, + { AOM_CDF4(6652, 17019, 22973) }, + { AOM_CDF4(3064, 8536, 13043) }, + { AOM_CDF4(31769, 32724, 32748) }, + { AOM_CDF4(22230, 30887, 32373) }, + { AOM_CDF4(12234, 25079, 29731) }, + { AOM_CDF4(7326, 18816, 25353) }, + { AOM_CDF4(3933, 10907, 16616) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { 
AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } } } }, + { { { { AOM_CDF4(8896, 16227, 20630) }, + { AOM_CDF4(23629, 31782, 32527) }, + { AOM_CDF4(15173, 27755, 31321) }, + { AOM_CDF4(10158, 21233, 27382) }, + { AOM_CDF4(6420, 14857, 21558) }, + { AOM_CDF4(3269, 8155, 12646) }, + { AOM_CDF4(24835, 32009, 32496) }, + { AOM_CDF4(16509, 28421, 31579) }, + { AOM_CDF4(10957, 21514, 27418) }, + { AOM_CDF4(7881, 15930, 22096) }, + { AOM_CDF4(5388, 10960, 15918) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(20745, 30773, 32093) }, + { AOM_CDF4(15200, 27221, 30861) }, + { AOM_CDF4(13032, 20873, 25667) }, + { AOM_CDF4(12285, 18663, 23494) }, + { AOM_CDF4(11563, 17481, 21489) }, + { AOM_CDF4(26260, 31982, 32320) }, + { AOM_CDF4(15397, 28083, 31100) }, + { AOM_CDF4(9742, 19217, 24824) }, + { AOM_CDF4(3261, 9629, 15362) }, + { AOM_CDF4(1480, 4322, 7499) }, + { AOM_CDF4(27599, 32256, 32460) }, + { AOM_CDF4(16857, 27659, 30774) }, + { AOM_CDF4(9551, 18290, 23748) }, + { AOM_CDF4(3052, 8933, 14103) }, + { AOM_CDF4(2021, 5910, 9787) }, + { AOM_CDF4(29005, 32015, 32392) }, + { AOM_CDF4(17677, 27694, 30863) }, + { AOM_CDF4(9204, 17356, 23219) }, + { AOM_CDF4(2403, 7516, 12814) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } }, 
+ { { AOM_CDF4(10808, 22056, 26896) }, + { AOM_CDF4(25739, 32313, 32676) }, + { AOM_CDF4(17288, 30203, 32221) }, + { AOM_CDF4(11359, 24878, 29896) }, + { AOM_CDF4(6949, 17767, 24893) }, + { AOM_CDF4(4287, 11796, 18071) }, + { AOM_CDF4(27880, 32521, 32705) }, + { AOM_CDF4(19038, 31004, 32414) }, + { AOM_CDF4(12564, 26345, 30768) }, + { AOM_CDF4(8269, 19947, 26779) }, + { AOM_CDF4(5674, 14657, 21674) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(25742, 32319, 32671) }, + { AOM_CDF4(19557, 31164, 32454) }, + { AOM_CDF4(13381, 26381, 30755) }, + { AOM_CDF4(10101, 21466, 26722) }, + { AOM_CDF4(9209, 19650, 26825) }, + { AOM_CDF4(27107, 31917, 32432) }, + { AOM_CDF4(18056, 28893, 31203) }, + { AOM_CDF4(10200, 21434, 26764) }, + { AOM_CDF4(4660, 12913, 19502) }, + { AOM_CDF4(2368, 6930, 12504) }, + { AOM_CDF4(26960, 32158, 32613) }, + { AOM_CDF4(18628, 30005, 32031) }, + { AOM_CDF4(10233, 22442, 28232) }, + { AOM_CDF4(5471, 14630, 21516) }, + { AOM_CDF4(3235, 10767, 17109) }, + { AOM_CDF4(27696, 32440, 32692) }, + { AOM_CDF4(20032, 31167, 32438) }, + { AOM_CDF4(8700, 21341, 28442) }, + { AOM_CDF4(5662, 14831, 21795) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } } }, + { { { AOM_CDF4(9704, 17294, 21132) }, + { AOM_CDF4(26762, 32278, 32633) }, + { AOM_CDF4(18382, 29620, 31819) }, + { AOM_CDF4(10891, 23475, 28723) }, + { AOM_CDF4(6358, 16583, 23309) }, + { AOM_CDF4(3248, 9118, 14141) }, + { AOM_CDF4(27204, 32573, 32699) }, + { AOM_CDF4(19818, 30824, 32329) }, + { AOM_CDF4(11772, 25120, 30041) }, + { AOM_CDF4(6995, 18033, 25039) }, + { AOM_CDF4(3752, 10442, 16098) }, + { AOM_CDF4(27222, 32256, 32559) }, + { AOM_CDF4(15356, 
28399, 31475) }, + { AOM_CDF4(8821, 20635, 27057) }, + { AOM_CDF4(5511, 14404, 21239) }, + { AOM_CDF4(2935, 8222, 13051) }, + { AOM_CDF4(24875, 32120, 32529) }, + { AOM_CDF4(15233, 28265, 31445) }, + { AOM_CDF4(8605, 20570, 26932) }, + { AOM_CDF4(5431, 14413, 21196) }, + { AOM_CDF4(2994, 8341, 13223) }, + { AOM_CDF4(28201, 32604, 32700) }, + { AOM_CDF4(21041, 31446, 32456) }, + { AOM_CDF4(13221, 26213, 30475) }, + { AOM_CDF4(8255, 19385, 26037) }, + { AOM_CDF4(4930, 12585, 18830) }, + { AOM_CDF4(28768, 32448, 32627) }, + { AOM_CDF4(19705, 30561, 32021) }, + { AOM_CDF4(11572, 23589, 28220) }, + { AOM_CDF4(5532, 15034, 21446) }, + { AOM_CDF4(2460, 7150, 11456) }, + { AOM_CDF4(29874, 32619, 32699) }, + { AOM_CDF4(21621, 31071, 32201) }, + { AOM_CDF4(12511, 24747, 28992) }, + { AOM_CDF4(6281, 16395, 22748) }, + { AOM_CDF4(3246, 9278, 14497) }, + { AOM_CDF4(29715, 32625, 32712) }, + { AOM_CDF4(20958, 31011, 32283) }, + { AOM_CDF4(11233, 23671, 28806) }, + { AOM_CDF4(6012, 16128, 22868) }, + { AOM_CDF4(3427, 9851, 15414) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(11016, 22111, 26794) }, + { AOM_CDF4(25946, 32357, 32677) }, + { AOM_CDF4(17890, 30452, 32252) }, + { AOM_CDF4(11678, 25142, 29816) }, + { AOM_CDF4(6720, 17534, 24584) }, + { AOM_CDF4(4230, 11665, 17820) }, + { AOM_CDF4(28400, 32623, 32747) }, + { AOM_CDF4(21164, 31668, 32575) }, + { AOM_CDF4(13572, 27388, 31182) }, + { AOM_CDF4(8234, 20750, 27358) }, + { AOM_CDF4(5065, 14055, 20897) }, + { AOM_CDF4(28981, 32547, 32705) }, + { AOM_CDF4(18681, 30543, 32239) }, + { AOM_CDF4(10919, 24075, 29286) }, + { AOM_CDF4(6431, 17199, 24077) }, + { AOM_CDF4(3819, 10464, 16618) }, + { AOM_CDF4(26870, 32467, 32693) }, + { AOM_CDF4(19041, 30831, 32347) }, + { AOM_CDF4(11794, 25211, 30016) }, + { AOM_CDF4(6888, 18019, 24970) }, + { AOM_CDF4(4370, 12363, 18992) }, + { AOM_CDF4(29578, 32670, 32744) }, + { AOM_CDF4(23159, 32007, 32613) }, + { AOM_CDF4(15315, 28669, 31676) }, + { AOM_CDF4(9298, 22607, 28782) }, + { 
AOM_CDF4(6144, 15913, 22968) }, + { AOM_CDF4(28110, 32499, 32669) }, + { AOM_CDF4(21574, 30937, 32015) }, + { AOM_CDF4(12759, 24818, 28727) }, + { AOM_CDF4(6545, 16761, 23042) }, + { AOM_CDF4(3649, 10597, 16833) }, + { AOM_CDF4(28163, 32552, 32728) }, + { AOM_CDF4(22101, 31469, 32464) }, + { AOM_CDF4(13160, 25472, 30143) }, + { AOM_CDF4(7303, 18684, 25468) }, + { AOM_CDF4(5241, 13975, 20955) }, + { AOM_CDF4(28400, 32631, 32744) }, + { AOM_CDF4(22104, 31793, 32603) }, + { AOM_CDF4(13557, 26571, 30846) }, + { AOM_CDF4(7749, 19861, 26675) }, + { AOM_CDF4(4873, 14030, 21234) }, + { AOM_CDF4(8192, 16384, 24576) } } }, + { { { AOM_CDF4(9800, 17635, 21073) }, + { AOM_CDF4(26153, 31885, 32527) }, + { AOM_CDF4(15038, 27852, 31006) }, + { AOM_CDF4(8718, 20564, 26486) }, + { AOM_CDF4(5128, 14076, 20514) }, + { AOM_CDF4(2636, 7566, 11925) }, + { AOM_CDF4(27551, 32504, 32701) }, + { AOM_CDF4(18310, 30054, 32100) }, + { AOM_CDF4(10211, 23420, 29082) }, + { AOM_CDF4(6222, 16876, 23916) }, + { AOM_CDF4(3462, 9954, 15498) }, + { AOM_CDF4(29991, 32633, 32721) }, + { AOM_CDF4(19883, 30751, 32201) }, + { AOM_CDF4(11141, 24184, 29285) }, + { AOM_CDF4(6420, 16940, 23774) }, + { AOM_CDF4(3392, 9753, 15118) }, + { AOM_CDF4(28465, 32616, 32712) }, + { AOM_CDF4(19850, 30702, 32244) }, + { AOM_CDF4(10983, 24024, 29223) }, + { AOM_CDF4(6294, 16770, 23582) }, + { AOM_CDF4(3244, 9283, 14509) }, + { AOM_CDF4(30023, 32717, 32748) }, + { AOM_CDF4(22940, 32032, 32626) }, + { AOM_CDF4(14282, 27928, 31473) }, + { AOM_CDF4(8562, 21327, 27914) }, + { AOM_CDF4(4846, 13393, 19919) }, + { AOM_CDF4(29981, 32590, 32695) }, + { AOM_CDF4(20465, 30963, 32166) }, + { AOM_CDF4(11479, 23579, 28195) }, + { AOM_CDF4(5916, 15648, 22073) }, + { AOM_CDF4(3031, 8605, 13398) }, + { AOM_CDF4(31146, 32691, 32739) }, + { AOM_CDF4(23106, 31724, 32444) }, + { AOM_CDF4(13783, 26738, 30439) }, + { AOM_CDF4(7852, 19468, 25807) }, + { AOM_CDF4(3860, 11124, 16853) }, + { AOM_CDF4(31014, 32724, 32748) }, + { AOM_CDF4(23629, 32109, 
32628) }, + { AOM_CDF4(14747, 28115, 31403) }, + { AOM_CDF4(8545, 21242, 27478) }, + { AOM_CDF4(4574, 12781, 19067) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(9185, 19694, 24688) }, + { AOM_CDF4(26081, 31985, 32621) }, + { AOM_CDF4(16015, 29000, 31787) }, + { AOM_CDF4(10542, 23690, 29206) }, + { AOM_CDF4(6732, 17945, 24677) }, + { AOM_CDF4(3916, 11039, 16722) }, + { AOM_CDF4(28224, 32566, 32744) }, + { AOM_CDF4(19100, 31138, 32485) }, + { AOM_CDF4(12528, 26620, 30879) }, + { AOM_CDF4(7741, 20277, 26885) }, + { AOM_CDF4(4566, 12845, 18990) }, + { AOM_CDF4(29933, 32593, 32718) }, + { AOM_CDF4(17670, 30333, 32155) }, + { AOM_CDF4(10385, 23600, 28909) }, + { AOM_CDF4(6243, 16236, 22407) }, + { AOM_CDF4(3976, 10389, 16017) }, + { AOM_CDF4(28377, 32561, 32738) }, + { AOM_CDF4(19366, 31175, 32482) }, + { AOM_CDF4(13327, 27175, 31094) }, + { AOM_CDF4(8258, 20769, 27143) }, + { AOM_CDF4(4703, 13198, 19527) }, + { AOM_CDF4(31086, 32706, 32748) }, + { AOM_CDF4(22853, 31902, 32583) }, + { AOM_CDF4(14759, 28186, 31419) }, + { AOM_CDF4(9284, 22382, 28348) }, + { AOM_CDF4(5585, 15192, 21868) }, + { AOM_CDF4(28291, 32652, 32746) }, + { AOM_CDF4(19849, 32107, 32571) }, + { AOM_CDF4(14834, 26818, 29214) }, + { AOM_CDF4(10306, 22594, 28672) }, + { AOM_CDF4(6615, 17384, 23384) }, + { AOM_CDF4(28947, 32604, 32745) }, + { AOM_CDF4(25625, 32289, 32646) }, + { AOM_CDF4(18758, 28672, 31403) }, + { AOM_CDF4(10017, 23430, 28523) }, + { AOM_CDF4(6862, 15269, 22131) }, + { AOM_CDF4(23933, 32509, 32739) }, + { AOM_CDF4(19927, 31495, 32631) }, + { AOM_CDF4(11903, 26023, 30621) }, + { AOM_CDF4(7026, 20094, 27252) }, + { AOM_CDF4(5998, 18106, 24437) }, + { AOM_CDF4(8192, 16384, 24576) } } }, + { { { AOM_CDF4(4456, 11274, 15533) }, + { AOM_CDF4(21219, 29079, 31616) }, + { AOM_CDF4(11173, 23774, 28567) }, + { AOM_CDF4(7282, 18293, 24263) }, + { AOM_CDF4(4890, 13286, 19115) }, + { AOM_CDF4(1890, 5508, 8659) }, + { AOM_CDF4(26651, 32136, 32647) }, + { AOM_CDF4(14630, 28254, 31455) }, + { 
AOM_CDF4(8716, 21287, 27395) }, + { AOM_CDF4(5615, 15331, 22008) }, + { AOM_CDF4(2675, 7700, 12150) }, + { AOM_CDF4(29954, 32526, 32690) }, + { AOM_CDF4(16126, 28982, 31633) }, + { AOM_CDF4(9030, 21361, 27352) }, + { AOM_CDF4(5411, 14793, 21271) }, + { AOM_CDF4(2943, 8422, 13163) }, + { AOM_CDF4(29539, 32601, 32730) }, + { AOM_CDF4(18125, 30385, 32201) }, + { AOM_CDF4(10422, 24090, 29468) }, + { AOM_CDF4(6468, 17487, 24438) }, + { AOM_CDF4(2970, 8653, 13531) }, + { AOM_CDF4(30912, 32715, 32748) }, + { AOM_CDF4(20666, 31373, 32497) }, + { AOM_CDF4(12509, 26640, 30917) }, + { AOM_CDF4(8058, 20629, 27290) }, + { AOM_CDF4(4231, 12006, 18052) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(10202, 20633, 25484) }, + { AOM_CDF4(27336, 31445, 32352) }, + { AOM_CDF4(12420, 24384, 28552) }, + { AOM_CDF4(7648, 18115, 23856) }, + { AOM_CDF4(5662, 14341, 19902) }, + { AOM_CDF4(3611, 10328, 15390) }, + { AOM_CDF4(30945, 32616, 32736) }, + { AOM_CDF4(18682, 30505, 32253) }, + { AOM_CDF4(11513, 25336, 30203) }, + { AOM_CDF4(7449, 19452, 26148) }, + { AOM_CDF4(4482, 13051, 18886) }, + { AOM_CDF4(32022, 32690, 32747) }, + { AOM_CDF4(18578, 30501, 32146) }, + { AOM_CDF4(11249, 23368, 28631) }, + { AOM_CDF4(5645, 16958, 22158) }, + { AOM_CDF4(5009, 11444, 16637) }, + { AOM_CDF4(31357, 32710, 32748) }, + { AOM_CDF4(21552, 31494, 32504) }, + { AOM_CDF4(13891, 27677, 31340) }, + { AOM_CDF4(9051, 22098, 28172) }, + { AOM_CDF4(5190, 13377, 19486) }, + 
{ AOM_CDF4(32364, 32740, 32748) }, + { AOM_CDF4(24839, 31907, 32551) }, + { AOM_CDF4(17160, 28779, 31696) }, + { AOM_CDF4(12452, 24137, 29602) }, + { AOM_CDF4(6165, 15389, 22477) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } } }, + { { { AOM_CDF4(2575, 7281, 11077) }, + { AOM_CDF4(14002, 20866, 25402) }, + { AOM_CDF4(6343, 15056, 19658) }, + { AOM_CDF4(4474, 11858, 17041) }, + { AOM_CDF4(2865, 8299, 12534) }, + { AOM_CDF4(1344, 3949, 6391) }, + { AOM_CDF4(24720, 31239, 32459) }, + { AOM_CDF4(12585, 25356, 29968) }, + { AOM_CDF4(7181, 18246, 24444) }, + { AOM_CDF4(5025, 13667, 19885) }, + { AOM_CDF4(2521, 7304, 11605) }, + { AOM_CDF4(29908, 32252, 32584) }, + { AOM_CDF4(17421, 29156, 31575) }, + { AOM_CDF4(9889, 22188, 27782) }, + { AOM_CDF4(5878, 15647, 22123) }, + { AOM_CDF4(2814, 8665, 13323) }, + { AOM_CDF4(30183, 32568, 32713) }, + { AOM_CDF4(18528, 30195, 32049) }, + { AOM_CDF4(10982, 24606, 29657) }, + { AOM_CDF4(6957, 18165, 25231) }, + { AOM_CDF4(3508, 10118, 15468) }, + { AOM_CDF4(31761, 32736, 32748) }, + { AOM_CDF4(21041, 31328, 32546) }, + { AOM_CDF4(12568, 26732, 31166) }, + { AOM_CDF4(8052, 20720, 27733) }, + { AOM_CDF4(4336, 12192, 18396) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { 
AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } } } }, + { { { { AOM_CDF4(7062, 16472, 22319) }, + { AOM_CDF4(24538, 32261, 32674) }, + { AOM_CDF4(13675, 28041, 31779) }, + { AOM_CDF4(8590, 20674, 27631) }, + { AOM_CDF4(5685, 14675, 22013) }, + { 
AOM_CDF4(3655, 9898, 15731) }, + { AOM_CDF4(26493, 32418, 32658) }, + { AOM_CDF4(16376, 29342, 32090) }, + { AOM_CDF4(10594, 22649, 28970) }, + { AOM_CDF4(8176, 17170, 24303) }, + { AOM_CDF4(5605, 12694, 19139) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(23888, 31902, 32542) }, + { AOM_CDF4(18612, 29687, 31987) }, + { AOM_CDF4(16245, 24852, 29249) }, + { AOM_CDF4(15765, 22608, 27559) }, + { AOM_CDF4(19895, 24699, 27510) }, + { AOM_CDF4(28401, 32212, 32457) }, + { AOM_CDF4(15274, 27825, 30980) }, + { AOM_CDF4(9364, 18128, 24332) }, + { AOM_CDF4(2283, 8193, 15082) }, + { AOM_CDF4(1228, 3972, 7881) }, + { AOM_CDF4(29455, 32469, 32620) }, + { AOM_CDF4(17981, 28245, 31388) }, + { AOM_CDF4(10921, 20098, 26240) }, + { AOM_CDF4(3743, 11829, 18657) }, + { AOM_CDF4(2374, 9593, 15715) }, + { AOM_CDF4(31068, 32466, 32635) }, + { AOM_CDF4(20321, 29572, 31971) }, + { AOM_CDF4(10771, 20255, 27119) }, + { AOM_CDF4(2795, 10410, 17361) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(9320, 22102, 27840) }, + { AOM_CDF4(27057, 32464, 32724) }, + { AOM_CDF4(16331, 30268, 32309) }, + { AOM_CDF4(10319, 23935, 29720) }, + { AOM_CDF4(6189, 16448, 24106) }, + { AOM_CDF4(3589, 10884, 18808) }, + { AOM_CDF4(29026, 32624, 32748) }, + { AOM_CDF4(19226, 31507, 32587) }, + { AOM_CDF4(12692, 26921, 31203) }, + { AOM_CDF4(7049, 19532, 27635) }, + { AOM_CDF4(7727, 15669, 23252) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + 
{ AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(28056, 32625, 32748) }, + { AOM_CDF4(22383, 32075, 32669) }, + { AOM_CDF4(15417, 27098, 31749) }, + { AOM_CDF4(18127, 26493, 27190) }, + { AOM_CDF4(5461, 16384, 21845) }, + { AOM_CDF4(27982, 32091, 32584) }, + { AOM_CDF4(19045, 29868, 31972) }, + { AOM_CDF4(10397, 22266, 27932) }, + { AOM_CDF4(5990, 13697, 21500) }, + { AOM_CDF4(1792, 6912, 15104) }, + { AOM_CDF4(28198, 32501, 32718) }, + { AOM_CDF4(21534, 31521, 32569) }, + { AOM_CDF4(11109, 25217, 30017) }, + { AOM_CDF4(5671, 15124, 26151) }, + { AOM_CDF4(4681, 14043, 18725) }, + { AOM_CDF4(28688, 32580, 32741) }, + { AOM_CDF4(22576, 32079, 32661) }, + { AOM_CDF4(10627, 22141, 28340) }, + { AOM_CDF4(9362, 14043, 28087) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } } }, + { { { AOM_CDF4(7754, 16948, 22142) }, + { AOM_CDF4(25670, 32330, 32691) }, + { AOM_CDF4(15663, 29225, 31994) }, + { AOM_CDF4(9878, 23288, 29158) }, + { AOM_CDF4(6419, 17088, 24336) }, + { AOM_CDF4(3859, 11003, 17039) }, + { AOM_CDF4(27562, 32595, 32725) }, + { AOM_CDF4(17575, 30588, 32399) }, + { AOM_CDF4(10819, 24838, 30309) }, + { AOM_CDF4(7124, 18686, 25916) }, + { AOM_CDF4(4479, 12688, 19340) }, + { AOM_CDF4(28385, 32476, 32673) }, + { AOM_CDF4(15306, 29005, 31938) }, + { AOM_CDF4(8937, 21615, 28322) }, + { AOM_CDF4(5982, 15603, 22786) }, + { AOM_CDF4(3620, 10267, 16136) }, + { AOM_CDF4(27280, 32464, 32667) }, + { AOM_CDF4(15607, 29160, 32004) }, + { AOM_CDF4(9091, 22135, 28740) }, + { AOM_CDF4(6232, 16632, 24020) }, + { AOM_CDF4(4047, 11377, 17672) }, + { AOM_CDF4(29220, 32630, 32718) }, + { AOM_CDF4(19650, 31220, 32462) }, + { AOM_CDF4(13050, 26312, 30827) }, + { AOM_CDF4(9228, 20870, 27468) }, + { AOM_CDF4(6146, 15149, 21971) }, + { AOM_CDF4(30169, 32481, 32623) }, + { AOM_CDF4(17212, 29311, 31554) }, + { AOM_CDF4(9911, 21311, 26882) }, + { AOM_CDF4(4487, 13314, 20372) }, + { AOM_CDF4(2570, 
7772, 12889) }, + { AOM_CDF4(30924, 32613, 32708) }, + { AOM_CDF4(19490, 30206, 32107) }, + { AOM_CDF4(11232, 23998, 29276) }, + { AOM_CDF4(6769, 17955, 25035) }, + { AOM_CDF4(4398, 12623, 19214) }, + { AOM_CDF4(30609, 32627, 32722) }, + { AOM_CDF4(19370, 30582, 32287) }, + { AOM_CDF4(10457, 23619, 29409) }, + { AOM_CDF4(6443, 17637, 24834) }, + { AOM_CDF4(4645, 13236, 20106) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(8626, 20271, 26216) }, + { AOM_CDF4(26707, 32406, 32711) }, + { AOM_CDF4(16999, 30329, 32286) }, + { AOM_CDF4(11445, 25123, 30286) }, + { AOM_CDF4(6411, 18828, 25601) }, + { AOM_CDF4(6801, 12458, 20248) }, + { AOM_CDF4(29918, 32682, 32748) }, + { AOM_CDF4(20649, 31739, 32618) }, + { AOM_CDF4(12879, 27773, 31581) }, + { AOM_CDF4(7896, 21751, 28244) }, + { AOM_CDF4(5260, 14870, 23698) }, + { AOM_CDF4(29252, 32593, 32731) }, + { AOM_CDF4(17072, 30460, 32294) }, + { AOM_CDF4(10653, 24143, 29365) }, + { AOM_CDF4(6536, 17490, 23983) }, + { AOM_CDF4(4929, 13170, 20085) }, + { AOM_CDF4(28137, 32518, 32715) }, + { AOM_CDF4(18171, 30784, 32407) }, + { AOM_CDF4(11437, 25436, 30459) }, + { AOM_CDF4(7252, 18534, 26176) }, + { AOM_CDF4(4126, 13353, 20978) }, + { AOM_CDF4(31162, 32726, 32748) }, + { AOM_CDF4(23017, 32222, 32701) }, + { AOM_CDF4(15629, 29233, 32046) }, + { AOM_CDF4(9387, 22621, 29480) }, + { AOM_CDF4(6922, 17616, 25010) }, + { AOM_CDF4(28838, 32265, 32614) }, + { AOM_CDF4(19701, 30206, 31920) }, + { AOM_CDF4(11214, 22410, 27933) }, + { AOM_CDF4(5320, 14177, 23034) }, + { AOM_CDF4(5049, 12881, 17827) }, + { AOM_CDF4(27484, 32471, 32734) }, + { AOM_CDF4(21076, 31526, 32561) }, + { AOM_CDF4(12707, 26303, 31211) }, + { AOM_CDF4(8169, 21722, 28219) }, + { AOM_CDF4(6045, 19406, 27042) }, + { AOM_CDF4(27753, 32572, 32745) }, + { AOM_CDF4(20832, 31878, 32653) }, + { AOM_CDF4(13250, 27356, 31674) }, + { AOM_CDF4(7718, 21508, 29858) }, + { AOM_CDF4(7209, 18350, 25559) }, + { AOM_CDF4(8192, 16384, 24576) } } }, + { { { AOM_CDF4(7876, 16901, 21741) 
}, + { AOM_CDF4(24001, 31898, 32625) }, + { AOM_CDF4(14529, 27959, 31451) }, + { AOM_CDF4(8273, 20818, 27258) }, + { AOM_CDF4(5278, 14673, 21510) }, + { AOM_CDF4(2983, 8843, 14039) }, + { AOM_CDF4(28016, 32574, 32732) }, + { AOM_CDF4(17471, 30306, 32301) }, + { AOM_CDF4(10224, 24063, 29728) }, + { AOM_CDF4(6602, 17954, 25052) }, + { AOM_CDF4(4002, 11585, 17759) }, + { AOM_CDF4(30190, 32634, 32739) }, + { AOM_CDF4(17497, 30282, 32270) }, + { AOM_CDF4(10229, 23729, 29538) }, + { AOM_CDF4(6344, 17211, 24440) }, + { AOM_CDF4(3849, 11189, 17108) }, + { AOM_CDF4(28570, 32583, 32726) }, + { AOM_CDF4(17521, 30161, 32238) }, + { AOM_CDF4(10153, 23565, 29378) }, + { AOM_CDF4(6455, 17341, 24443) }, + { AOM_CDF4(3907, 11042, 17024) }, + { AOM_CDF4(30689, 32715, 32748) }, + { AOM_CDF4(21546, 31840, 32610) }, + { AOM_CDF4(13547, 27581, 31459) }, + { AOM_CDF4(8912, 21757, 28309) }, + { AOM_CDF4(5548, 15080, 22046) }, + { AOM_CDF4(30783, 32540, 32685) }, + { AOM_CDF4(17540, 29528, 31668) }, + { AOM_CDF4(10160, 21468, 26783) }, + { AOM_CDF4(4724, 13393, 20054) }, + { AOM_CDF4(2702, 8174, 13102) }, + { AOM_CDF4(31648, 32686, 32742) }, + { AOM_CDF4(20954, 31094, 32337) }, + { AOM_CDF4(12420, 25698, 30179) }, + { AOM_CDF4(7304, 19320, 26248) }, + { AOM_CDF4(4366, 12261, 18864) }, + { AOM_CDF4(31581, 32723, 32748) }, + { AOM_CDF4(21373, 31586, 32525) }, + { AOM_CDF4(12744, 26625, 30885) }, + { AOM_CDF4(7431, 20322, 26950) }, + { AOM_CDF4(4692, 13323, 20111) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(7833, 18369, 24095) }, + { AOM_CDF4(26650, 32273, 32702) }, + { AOM_CDF4(16371, 29961, 32191) }, + { AOM_CDF4(11055, 24082, 29629) }, + { AOM_CDF4(6892, 18644, 25400) }, + { AOM_CDF4(5006, 13057, 19240) }, + { AOM_CDF4(29834, 32666, 32748) }, + { AOM_CDF4(19577, 31335, 32570) }, + { AOM_CDF4(12253, 26509, 31122) }, + { AOM_CDF4(7991, 20772, 27711) }, + { AOM_CDF4(5677, 15910, 23059) }, + { AOM_CDF4(30109, 32532, 32720) }, + { AOM_CDF4(16747, 30166, 32252) }, + { AOM_CDF4(10134, 
23542, 29184) }, + { AOM_CDF4(5791, 16176, 23556) }, + { AOM_CDF4(4362, 10414, 17284) }, + { AOM_CDF4(29492, 32626, 32748) }, + { AOM_CDF4(19894, 31402, 32525) }, + { AOM_CDF4(12942, 27071, 30869) }, + { AOM_CDF4(8346, 21216, 27405) }, + { AOM_CDF4(6572, 17087, 23859) }, + { AOM_CDF4(32035, 32735, 32748) }, + { AOM_CDF4(22957, 31838, 32618) }, + { AOM_CDF4(14724, 28572, 31772) }, + { AOM_CDF4(10364, 23999, 29553) }, + { AOM_CDF4(7004, 18433, 25655) }, + { AOM_CDF4(27528, 32277, 32681) }, + { AOM_CDF4(16959, 31171, 32096) }, + { AOM_CDF4(10486, 23593, 27962) }, + { AOM_CDF4(8192, 16384, 23211) }, + { AOM_CDF4(8937, 17873, 20852) }, + { AOM_CDF4(27715, 32002, 32615) }, + { AOM_CDF4(15073, 29491, 31676) }, + { AOM_CDF4(11264, 24576, 28672) }, + { AOM_CDF4(2341, 18725, 23406) }, + { AOM_CDF4(7282, 18204, 25486) }, + { AOM_CDF4(28547, 32213, 32657) }, + { AOM_CDF4(20788, 29773, 32239) }, + { AOM_CDF4(6780, 21469, 30508) }, + { AOM_CDF4(5958, 14895, 23831) }, + { AOM_CDF4(16384, 21845, 27307) }, + { AOM_CDF4(8192, 16384, 24576) } } }, + { { { AOM_CDF4(5992, 14304, 19765) }, + { AOM_CDF4(22612, 31238, 32456) }, + { AOM_CDF4(13456, 27162, 31087) }, + { AOM_CDF4(8001, 20062, 26504) }, + { AOM_CDF4(5168, 14105, 20764) }, + { AOM_CDF4(2632, 7771, 12385) }, + { AOM_CDF4(27034, 32344, 32709) }, + { AOM_CDF4(15850, 29415, 31997) }, + { AOM_CDF4(9494, 22776, 28841) }, + { AOM_CDF4(6151, 16830, 23969) }, + { AOM_CDF4(3461, 10039, 15722) }, + { AOM_CDF4(30134, 32569, 32731) }, + { AOM_CDF4(15638, 29422, 31945) }, + { AOM_CDF4(9150, 21865, 28218) }, + { AOM_CDF4(5647, 15719, 22676) }, + { AOM_CDF4(3402, 9772, 15477) }, + { AOM_CDF4(28530, 32586, 32735) }, + { AOM_CDF4(17139, 30298, 32292) }, + { AOM_CDF4(10200, 24039, 29685) }, + { AOM_CDF4(6419, 17674, 24786) }, + { AOM_CDF4(3544, 10225, 15824) }, + { AOM_CDF4(31333, 32726, 32748) }, + { AOM_CDF4(20618, 31487, 32544) }, + { AOM_CDF4(12901, 27217, 31232) }, + { AOM_CDF4(8624, 21734, 28171) }, + { AOM_CDF4(5104, 14191, 20748) }, + { 
AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(11206, 21090, 26561) }, + { AOM_CDF4(28759, 32279, 32671) }, + { AOM_CDF4(14171, 27952, 31569) }, + { AOM_CDF4(9743, 22907, 29141) }, + { AOM_CDF4(6871, 17886, 24868) }, + { AOM_CDF4(4960, 13152, 19315) }, + { AOM_CDF4(31077, 32661, 32748) }, + { AOM_CDF4(19400, 31195, 32515) }, + { AOM_CDF4(12752, 26858, 31040) }, + { AOM_CDF4(8370, 22098, 28591) }, + { AOM_CDF4(5457, 15373, 22298) }, + { AOM_CDF4(31697, 32706, 32748) }, + { AOM_CDF4(17860, 30657, 32333) }, + { AOM_CDF4(12510, 24812, 29261) }, + { AOM_CDF4(6180, 19124, 24722) }, + { AOM_CDF4(5041, 13548, 17959) }, + { AOM_CDF4(31552, 32716, 32748) }, + { AOM_CDF4(21908, 31769, 32623) }, + { AOM_CDF4(14470, 28201, 31565) }, + { AOM_CDF4(9493, 22982, 28608) }, + { AOM_CDF4(6858, 17240, 24137) }, + { AOM_CDF4(32543, 32752, 32756) }, + { AOM_CDF4(24286, 32097, 32666) }, + { AOM_CDF4(15958, 29217, 32024) }, + { AOM_CDF4(10207, 24234, 29958) }, + { AOM_CDF4(6929, 18305, 25652) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { 
AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } } }, + { { { AOM_CDF4(4137, 10847, 15682) }, + { AOM_CDF4(17824, 27001, 30058) }, + { AOM_CDF4(10204, 22796, 28291) }, + { AOM_CDF4(6076, 15935, 22125) }, + { AOM_CDF4(3852, 10937, 16816) }, + { AOM_CDF4(2252, 6324, 10131) }, + { AOM_CDF4(25840, 32016, 32662) }, + { AOM_CDF4(15109, 28268, 31531) }, + { AOM_CDF4(9385, 22231, 28340) }, + { AOM_CDF4(6082, 16672, 23479) }, + { AOM_CDF4(3318, 9427, 14681) }, + { AOM_CDF4(30594, 32574, 32718) }, + { AOM_CDF4(16836, 29552, 31859) }, + { AOM_CDF4(9556, 22542, 28356) }, + { AOM_CDF4(6305, 16725, 23540) }, + { AOM_CDF4(3376, 9895, 15184) }, + { AOM_CDF4(29383, 32617, 32745) }, + { AOM_CDF4(18891, 30809, 32401) }, + { AOM_CDF4(11688, 25942, 30687) }, + { AOM_CDF4(7468, 19469, 26651) }, + { AOM_CDF4(3909, 11358, 17012) }, + { AOM_CDF4(31564, 32736, 32748) }, + { AOM_CDF4(20906, 31611, 32600) }, + { AOM_CDF4(13191, 27621, 31537) }, + { AOM_CDF4(8768, 22029, 28676) }, + { AOM_CDF4(5079, 14109, 20906) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + 
{ AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } } } } }; static const aom_cdf_prob av1_default_coeff_base_eob_multi_cdfs [TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES][SIG_COEF_CONTEXTS_EOB][CDF_SIZE( diff --git a/media/libaom/src/av1/common/txb_common.c b/media/libaom/src/av1/common/txb_common.c index c96d37cca..4eef319cd 100644 --- a/media/libaom/src/av1/common/txb_common.c +++ b/media/libaom/src/av1/common/txb_common.c @@ -9,7 +9,7 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include "aom/aom_integer.h" -#include "av1/common/onyxc_int.h" +#include "av1/common/av1_common_int.h" #include "av1/common/txb_common.h" const int8_t av1_coeff_band_4x4[16] = { 0, 1, 2, 3, 4, 5, 6, 7, @@ -453,23 +453,6 @@ const int8_t *av1_nz_map_ctx_offset[19] = { av1_nz_map_ctx_offset_64x32, // TX_64x16 }; -void av1_init_lv_map(AV1_COMMON *cm) { - LV_MAP_CTX_TABLE *coeff_ctx_table = &cm->coeff_ctx_table; - for (int row = 0; row < 2; ++row) { - for (int col = 0; col < 2; ++col) { - for (int sig_mag = 0; sig_mag < 3; ++sig_mag) { - for (int count = 0; count < BASE_CONTEXT_POSITION_NUM + 1; ++count) { - if (row == 0 && col == 0 && count > 5) continue; - if ((row == 0 || col == 0) && count > 8) continue; - - coeff_ctx_table->base_ctx_table[row][col][sig_mag][count] = - get_base_ctx_from_count_mag(row, col, count, sig_mag); - } - } - } - } -} - -const int16_t k_eob_group_start[12] = { 0, 1, 2, 3, 5, 9, - 17, 33, 65, 129, 257, 513 }; -const int16_t k_eob_offset_bits[12] = { 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 }; +const int16_t av1_eob_group_start[12] = { 0, 1, 2, 3, 5, 9, + 17, 33, 65, 129, 257, 513 }; +const int16_t av1_eob_offset_bits[12] = { 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 }; diff --git a/media/libaom/src/av1/common/txb_common.h b/media/libaom/src/av1/common/txb_common.h index 1dda51f8b..5a62fa89b 100644 --- a/media/libaom/src/av1/common/txb_common.h +++ b/media/libaom/src/av1/common/txb_common.h @@ -12,8 +12,10 @@ #ifndef AOM_AV1_COMMON_TXB_COMMON_H_ #define AOM_AV1_COMMON_TXB_COMMON_H_ -extern const int16_t k_eob_group_start[12]; -extern const int16_t k_eob_offset_bits[12]; +#include "av1/common/av1_common_int.h" + +extern const int16_t av1_eob_group_start[12]; +extern const int16_t av1_eob_offset_bits[12]; extern const int8_t av1_coeff_band_4x4[16]; @@ -157,6 +159,19 @@ static INLINE int get_br_ctx_2d(const uint8_t *const levels, return mag + 14; } +static AOM_FORCE_INLINE int get_br_ctx_eob(const int c, // raster order + const int bwl, + const TX_CLASS 
tx_class) { + const int row = c >> bwl; + const int col = c - (row << bwl); + if (c == 0) return 0; + if ((tx_class == TX_CLASS_2D && row < 2 && col < 2) || + (tx_class == TX_CLASS_HORIZ && col == 0) || + (tx_class == TX_CLASS_VERT && row == 0)) + return 7; + return 14; +} + static AOM_FORCE_INLINE int get_br_ctx(const uint8_t *const levels, const int c, // raster order const int bwl, const TX_CLASS tx_class) { @@ -270,12 +285,10 @@ static AOM_FORCE_INLINE int get_nz_map_ctx_from_stats( const int row = coeff_idx >> bwl; const int col = coeff_idx - (row << bwl); return ctx + nz_map_ctx_offset_1d[col]; - break; } case TX_CLASS_VERT: { const int row = coeff_idx >> bwl; return ctx + nz_map_ctx_offset_1d[row]; - break; } default: break; } @@ -373,7 +386,9 @@ static INLINE void get_txb_ctx(const BLOCK_SIZE plane_bsize, if (plane_bsize == txsize_to_bsize[tx_size]) { txb_ctx->txb_skip_ctx = 0; } else { - // This is the algorithm to generate table skip_contexts[min][max]. + // This is the algorithm to generate table skip_contexts[top][left]. + // const int max = AOMMIN(top | left, 4); + // const int min = AOMMIN(AOMMIN(top, left), 4); // if (!max) // txb_skip_ctx = 1; // else if (!min) @@ -385,10 +400,15 @@ static INLINE void get_txb_ctx(const BLOCK_SIZE plane_bsize, // else // txb_skip_ctx = 6; static const uint8_t skip_contexts[5][5] = { { 1, 2, 2, 2, 3 }, - { 1, 4, 4, 4, 5 }, - { 1, 4, 4, 4, 5 }, - { 1, 4, 4, 4, 5 }, - { 1, 4, 4, 4, 6 } }; + { 2, 4, 4, 4, 5 }, + { 2, 4, 4, 4, 5 }, + { 2, 4, 4, 4, 5 }, + { 3, 5, 5, 5, 6 } }; + // For top and left, we only care about which of the following three + // categories they belong to: { 0 }, { 1, 2, 3 }, or { 4, 5, ... }. The + // spec calculates top and left with the Max() function. We can calculate + // an approximate max with bitwise OR because the real max and the + // approximate max belong to the same category. 
int top = 0; int left = 0; @@ -397,16 +417,16 @@ static INLINE void get_txb_ctx(const BLOCK_SIZE plane_bsize, top |= a[k]; } while (++k < txb_w_unit); top &= COEFF_CONTEXT_MASK; + top = AOMMIN(top, 4); k = 0; do { left |= l[k]; } while (++k < txb_h_unit); left &= COEFF_CONTEXT_MASK; - const int max = AOMMIN(top | left, 4); - const int min = AOMMIN(AOMMIN(top, left), 4); + left = AOMMIN(left, 4); - txb_ctx->txb_skip_ctx = skip_contexts[min][max]; + txb_ctx->txb_skip_ctx = skip_contexts[top][left]; } } else { const int ctx_base = get_entropy_context(tx_size, a, l); @@ -419,6 +439,4 @@ static INLINE void get_txb_ctx(const BLOCK_SIZE plane_bsize, #undef MAX_TX_SIZE_UNIT } -void av1_init_lv_map(AV1_COMMON *cm); - #endif // AOM_AV1_COMMON_TXB_COMMON_H_ diff --git a/media/libaom/src/av1/common/warped_motion.c b/media/libaom/src/av1/common/warped_motion.c index 4144c4389..4e9fab9bd 100644 --- a/media/libaom/src/av1/common/warped_motion.c +++ b/media/libaom/src/av1/common/warped_motion.c @@ -20,85 +20,13 @@ #include "av1/common/warped_motion.h" #include "av1/common/scale.h" -#define WARP_ERROR_BLOCK 32 - -/* clang-format off */ -static const int error_measure_lut[512] = { - // pow 0.7 - 16384, 16339, 16294, 16249, 16204, 16158, 16113, 16068, - 16022, 15977, 15932, 15886, 15840, 15795, 15749, 15703, - 15657, 15612, 15566, 15520, 15474, 15427, 15381, 15335, - 15289, 15242, 15196, 15149, 15103, 15056, 15010, 14963, - 14916, 14869, 14822, 14775, 14728, 14681, 14634, 14587, - 14539, 14492, 14445, 14397, 14350, 14302, 14254, 14206, - 14159, 14111, 14063, 14015, 13967, 13918, 13870, 13822, - 13773, 13725, 13676, 13628, 13579, 13530, 13481, 13432, - 13383, 13334, 13285, 13236, 13187, 13137, 13088, 13038, - 12988, 12939, 12889, 12839, 12789, 12739, 12689, 12639, - 12588, 12538, 12487, 12437, 12386, 12335, 12285, 12234, - 12183, 12132, 12080, 12029, 11978, 11926, 11875, 11823, - 11771, 11719, 11667, 11615, 11563, 11511, 11458, 11406, - 11353, 11301, 11248, 11195, 11142, 11089, 11036, 
10982, - 10929, 10875, 10822, 10768, 10714, 10660, 10606, 10552, - 10497, 10443, 10388, 10333, 10279, 10224, 10168, 10113, - 10058, 10002, 9947, 9891, 9835, 9779, 9723, 9666, - 9610, 9553, 9497, 9440, 9383, 9326, 9268, 9211, - 9153, 9095, 9037, 8979, 8921, 8862, 8804, 8745, - 8686, 8627, 8568, 8508, 8449, 8389, 8329, 8269, - 8208, 8148, 8087, 8026, 7965, 7903, 7842, 7780, - 7718, 7656, 7593, 7531, 7468, 7405, 7341, 7278, - 7214, 7150, 7086, 7021, 6956, 6891, 6826, 6760, - 6695, 6628, 6562, 6495, 6428, 6361, 6293, 6225, - 6157, 6089, 6020, 5950, 5881, 5811, 5741, 5670, - 5599, 5527, 5456, 5383, 5311, 5237, 5164, 5090, - 5015, 4941, 4865, 4789, 4713, 4636, 4558, 4480, - 4401, 4322, 4242, 4162, 4080, 3998, 3916, 3832, - 3748, 3663, 3577, 3490, 3402, 3314, 3224, 3133, - 3041, 2948, 2854, 2758, 2661, 2562, 2461, 2359, - 2255, 2148, 2040, 1929, 1815, 1698, 1577, 1452, - 1323, 1187, 1045, 894, 731, 550, 339, 0, - 339, 550, 731, 894, 1045, 1187, 1323, 1452, - 1577, 1698, 1815, 1929, 2040, 2148, 2255, 2359, - 2461, 2562, 2661, 2758, 2854, 2948, 3041, 3133, - 3224, 3314, 3402, 3490, 3577, 3663, 3748, 3832, - 3916, 3998, 4080, 4162, 4242, 4322, 4401, 4480, - 4558, 4636, 4713, 4789, 4865, 4941, 5015, 5090, - 5164, 5237, 5311, 5383, 5456, 5527, 5599, 5670, - 5741, 5811, 5881, 5950, 6020, 6089, 6157, 6225, - 6293, 6361, 6428, 6495, 6562, 6628, 6695, 6760, - 6826, 6891, 6956, 7021, 7086, 7150, 7214, 7278, - 7341, 7405, 7468, 7531, 7593, 7656, 7718, 7780, - 7842, 7903, 7965, 8026, 8087, 8148, 8208, 8269, - 8329, 8389, 8449, 8508, 8568, 8627, 8686, 8745, - 8804, 8862, 8921, 8979, 9037, 9095, 9153, 9211, - 9268, 9326, 9383, 9440, 9497, 9553, 9610, 9666, - 9723, 9779, 9835, 9891, 9947, 10002, 10058, 10113, - 10168, 10224, 10279, 10333, 10388, 10443, 10497, 10552, - 10606, 10660, 10714, 10768, 10822, 10875, 10929, 10982, - 11036, 11089, 11142, 11195, 11248, 11301, 11353, 11406, - 11458, 11511, 11563, 11615, 11667, 11719, 11771, 11823, - 11875, 11926, 11978, 12029, 12080, 12132, 12183, 
12234, - 12285, 12335, 12386, 12437, 12487, 12538, 12588, 12639, - 12689, 12739, 12789, 12839, 12889, 12939, 12988, 13038, - 13088, 13137, 13187, 13236, 13285, 13334, 13383, 13432, - 13481, 13530, 13579, 13628, 13676, 13725, 13773, 13822, - 13870, 13918, 13967, 14015, 14063, 14111, 14159, 14206, - 14254, 14302, 14350, 14397, 14445, 14492, 14539, 14587, - 14634, 14681, 14728, 14775, 14822, 14869, 14916, 14963, - 15010, 15056, 15103, 15149, 15196, 15242, 15289, 15335, - 15381, 15427, 15474, 15520, 15566, 15612, 15657, 15703, - 15749, 15795, 15840, 15886, 15932, 15977, 16022, 16068, - 16113, 16158, 16204, 16249, 16294, 16339, 16384, 16384, -}; -/* clang-format on */ - // For warping, we really use a 6-tap filter, but we do blocks of 8 pixels // at a time. The zoom/rotation/shear in the model are applied to the // "fractional" position of each pixel, which therefore varies within // [-1, 2) * WARPEDPIXEL_PREC_SHIFTS. // We need an extra 2 taps to fit this in, for a total of 8 taps. /* clang-format off */ -const int16_t warped_filter[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8] = { +const int16_t av1_warped_filter[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8] = { #if WARPEDPIXEL_PREC_BITS == 6 // [-1, 0) { 0, 0, 127, 1, 0, 0, 0, 0 }, { 0, - 1, 127, 2, 0, 0, 0, 0 }, @@ -345,7 +273,7 @@ static int is_affine_shear_allowed(int16_t alpha, int16_t beta, int16_t gamma, } // Returns 1 on success or 0 on an invalid affine set -int get_shear_params(WarpedMotionParams *wm) { +int av1_get_shear_params(WarpedMotionParams *wm) { const int32_t *mat = wm->wmmat; if (!is_affine_valid(wm)) return 0; wm->alpha = @@ -376,6 +304,7 @@ int get_shear_params(WarpedMotionParams *wm) { return 1; } +#if CONFIG_AV1_HIGHBITDEPTH static INLINE int highbd_error_measure(int err, int bd) { const int b = bd - 8; const int bmask = (1 << b) - 1; @@ -447,7 +376,7 @@ void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref, const int offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) + WARPEDPIXEL_PREC_SHIFTS; 
assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3); - const int16_t *coeffs = warped_filter[offs]; + const int16_t *coeffs = av1_warped_filter[offs]; int32_t sum = 1 << offset_bits_horiz; for (int m = 0; m < 8; ++m) { @@ -468,7 +397,7 @@ void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref, const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) + WARPEDPIXEL_PREC_SHIFTS; assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3); - const int16_t *coeffs = warped_filter[offs]; + const int16_t *coeffs = av1_warped_filter[offs]; int32_t sum = 1 << offset_bits_vert; for (int m = 0; m < 8; ++m) { @@ -485,7 +414,7 @@ void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref, uint16_t *dst16 = &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)]; int32_t tmp32 = *p; - if (conv_params->use_jnt_comp_avg) { + if (conv_params->use_dist_wtd_comp_avg) { tmp32 = tmp32 * conv_params->fwd_offset + sum * conv_params->bck_offset; tmp32 = tmp32 >> DIST_PRECISION_BITS; @@ -514,12 +443,11 @@ void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref, } } -static void highbd_warp_plane(WarpedMotionParams *wm, const uint8_t *const ref8, - int width, int height, int stride, - const uint8_t *const pred8, int p_col, int p_row, - int p_width, int p_height, int p_stride, - int subsampling_x, int subsampling_y, int bd, - ConvolveParams *conv_params) { +void highbd_warp_plane(WarpedMotionParams *wm, const uint16_t *const ref, + int width, int height, int stride, uint16_t *const pred, + int p_col, int p_row, int p_width, int p_height, + int p_stride, int subsampling_x, int subsampling_y, + int bd, ConvolveParams *conv_params) { assert(wm->wmtype <= AFFINE); if (wm->wmtype == ROTZOOM) { wm->wmmat[5] = wm->wmmat[2]; @@ -531,17 +459,15 @@ static void highbd_warp_plane(WarpedMotionParams *wm, const uint8_t *const ref8, const int16_t gamma = wm->gamma; const int16_t delta = wm->delta; - const uint16_t *const ref = CONVERT_TO_SHORTPTR(ref8); - 
uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); av1_highbd_warp_affine(mat, ref, width, height, stride, pred, p_col, p_row, p_width, p_height, p_stride, subsampling_x, subsampling_y, bd, conv_params, alpha, beta, gamma, delta); } -static int64_t highbd_frame_error(const uint16_t *const ref, int stride, - const uint16_t *const dst, int p_width, - int p_height, int p_stride, int bd) { +int64_t av1_calc_highbd_frame_error(const uint16_t *const ref, int stride, + const uint16_t *const dst, int p_width, + int p_height, int p_stride, int bd) { int64_t sum_error = 0; for (int i = 0; i < p_height; ++i) { for (int j = 0; j < p_width; ++j) { @@ -552,41 +478,33 @@ static int64_t highbd_frame_error(const uint16_t *const ref, int stride, return sum_error; } -static int64_t highbd_warp_error( - WarpedMotionParams *wm, const uint8_t *const ref8, int width, int height, - int stride, const uint8_t *const dst8, int p_col, int p_row, int p_width, - int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, - int64_t best_error) { - int64_t gm_sumerr = 0; +static int64_t highbd_segmented_frame_error( + const uint16_t *const ref, int stride, const uint16_t *const dst, + int p_width, int p_height, int p_stride, int bd, uint8_t *segment_map, + int segment_map_stride) { + int patch_w, patch_h; const int error_bsize_w = AOMMIN(p_width, WARP_ERROR_BLOCK); const int error_bsize_h = AOMMIN(p_height, WARP_ERROR_BLOCK); - uint16_t tmp[WARP_ERROR_BLOCK * WARP_ERROR_BLOCK]; - - ConvolveParams conv_params = get_conv_params(0, 0, bd); - conv_params.use_jnt_comp_avg = 0; - for (int i = p_row; i < p_row + p_height; i += WARP_ERROR_BLOCK) { - for (int j = p_col; j < p_col + p_width; j += WARP_ERROR_BLOCK) { - // avoid warping extra 8x8 blocks in the padded region of the frame - // when p_width and p_height are not multiples of WARP_ERROR_BLOCK - const int warp_w = AOMMIN(error_bsize_w, p_col + p_width - j); - const int warp_h = AOMMIN(error_bsize_h, p_row + p_height - i); - 
highbd_warp_plane(wm, ref8, width, height, stride, - CONVERT_TO_BYTEPTR(tmp), j, i, warp_w, warp_h, - WARP_ERROR_BLOCK, subsampling_x, subsampling_y, bd, - &conv_params); - - gm_sumerr += highbd_frame_error( - tmp, WARP_ERROR_BLOCK, CONVERT_TO_SHORTPTR(dst8) + j + i * p_stride, - warp_w, warp_h, p_stride, bd); - if (gm_sumerr > best_error) return gm_sumerr; + int64_t sum_error = 0; + for (int i = 0; i < p_height; i += WARP_ERROR_BLOCK) { + for (int j = 0; j < p_width; j += WARP_ERROR_BLOCK) { + int seg_x = j >> WARP_ERROR_BLOCK_LOG; + int seg_y = i >> WARP_ERROR_BLOCK_LOG; + // Only compute the error if this block contains inliers from the motion + // model + if (!segment_map[seg_y * segment_map_stride + seg_x]) continue; + + // avoid computing error into the frame padding + patch_w = AOMMIN(error_bsize_w, p_width - j); + patch_h = AOMMIN(error_bsize_h, p_height - i); + sum_error += av1_calc_highbd_frame_error(ref + j + i * stride, stride, + dst + j + i * p_stride, patch_w, + patch_h, p_stride, bd); } } - return gm_sumerr; -} - -static INLINE int error_measure(int err) { - return error_measure_lut[255 + err]; + return sum_error; } +#endif // CONFIG_AV1_HIGHBITDEPTH /* The warp filter for ROTZOOM and AFFINE models works as follows: * Split the input into 8x8 blocks @@ -732,7 +650,7 @@ void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width, const int offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) + WARPEDPIXEL_PREC_SHIFTS; assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3); - const int16_t *coeffs = warped_filter[offs]; + const int16_t *coeffs = av1_warped_filter[offs]; int32_t sum = 1 << offset_bits_horiz; for (int m = 0; m < 8; ++m) { @@ -756,7 +674,7 @@ void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width, const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) + WARPEDPIXEL_PREC_SHIFTS; assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3); - const int16_t *coeffs = warped_filter[offs]; + const int16_t 
*coeffs = av1_warped_filter[offs]; int32_t sum = 1 << offset_bits_vert; for (int m = 0; m < 8; ++m) { @@ -773,7 +691,7 @@ void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width, uint8_t *dst8 = &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)]; int32_t tmp32 = *p; - if (conv_params->use_jnt_comp_avg) { + if (conv_params->use_dist_wtd_comp_avg) { tmp32 = tmp32 * conv_params->fwd_offset + sum * conv_params->bck_offset; tmp32 = tmp32 >> DIST_PRECISION_BITS; @@ -801,11 +719,10 @@ void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width, } } -static void warp_plane(WarpedMotionParams *wm, const uint8_t *const ref, - int width, int height, int stride, uint8_t *pred, - int p_col, int p_row, int p_width, int p_height, - int p_stride, int subsampling_x, int subsampling_y, - ConvolveParams *conv_params) { +void warp_plane(WarpedMotionParams *wm, const uint8_t *const ref, int width, + int height, int stride, uint8_t *pred, int p_col, int p_row, + int p_width, int p_height, int p_stride, int subsampling_x, + int subsampling_y, ConvolveParams *conv_params) { assert(wm->wmtype <= AFFINE); if (wm->wmtype == ROTZOOM) { wm->wmmat[5] = wm->wmmat[2]; @@ -821,9 +738,9 @@ static void warp_plane(WarpedMotionParams *wm, const uint8_t *const ref, alpha, beta, gamma, delta); } -static int64_t frame_error(const uint8_t *const ref, int stride, - const uint8_t *const dst, int p_width, int p_height, - int p_stride) { +int64_t av1_calc_frame_error_c(const uint8_t *const ref, int stride, + const uint8_t *const dst, int p_width, + int p_height, int p_stride) { int64_t sum_error = 0; for (int i = 0; i < p_height; ++i) { for (int j = 0; j < p_width; ++j) { @@ -834,61 +751,64 @@ static int64_t frame_error(const uint8_t *const ref, int stride, return sum_error; } -static int64_t warp_error(WarpedMotionParams *wm, const uint8_t *const ref, - int width, int height, int stride, - const uint8_t *const dst, int p_col, int p_row, - int p_width, int p_height, int 
p_stride, - int subsampling_x, int subsampling_y, - int64_t best_error) { - int64_t gm_sumerr = 0; - int warp_w, warp_h; - int error_bsize_w = AOMMIN(p_width, WARP_ERROR_BLOCK); - int error_bsize_h = AOMMIN(p_height, WARP_ERROR_BLOCK); - uint8_t tmp[WARP_ERROR_BLOCK * WARP_ERROR_BLOCK]; - ConvolveParams conv_params = get_conv_params(0, 0, 8); - conv_params.use_jnt_comp_avg = 0; - - for (int i = p_row; i < p_row + p_height; i += WARP_ERROR_BLOCK) { - for (int j = p_col; j < p_col + p_width; j += WARP_ERROR_BLOCK) { - // avoid warping extra 8x8 blocks in the padded region of the frame - // when p_width and p_height are not multiples of WARP_ERROR_BLOCK - warp_w = AOMMIN(error_bsize_w, p_col + p_width - j); - warp_h = AOMMIN(error_bsize_h, p_row + p_height - i); - warp_plane(wm, ref, width, height, stride, tmp, j, i, warp_w, warp_h, - WARP_ERROR_BLOCK, subsampling_x, subsampling_y, &conv_params); - - gm_sumerr += frame_error(tmp, WARP_ERROR_BLOCK, dst + j + i * p_stride, - warp_w, warp_h, p_stride); - if (gm_sumerr > best_error) return gm_sumerr; +static int64_t segmented_frame_error(const uint8_t *const ref, int stride, + const uint8_t *const dst, int p_width, + int p_height, int p_stride, + uint8_t *segment_map, + int segment_map_stride) { + int patch_w, patch_h; + const int error_bsize_w = AOMMIN(p_width, WARP_ERROR_BLOCK); + const int error_bsize_h = AOMMIN(p_height, WARP_ERROR_BLOCK); + int64_t sum_error = 0; + for (int i = 0; i < p_height; i += WARP_ERROR_BLOCK) { + for (int j = 0; j < p_width; j += WARP_ERROR_BLOCK) { + int seg_x = j >> WARP_ERROR_BLOCK_LOG; + int seg_y = i >> WARP_ERROR_BLOCK_LOG; + // Only compute the error if this block contains inliers from the motion + // model + if (!segment_map[seg_y * segment_map_stride + seg_x]) continue; + + // avoid computing error into the frame padding + patch_w = AOMMIN(error_bsize_w, p_width - j); + patch_h = AOMMIN(error_bsize_h, p_height - i); + sum_error += av1_calc_frame_error(ref + j + i * stride, stride, + 
dst + j + i * p_stride, patch_w, + patch_h, p_stride); } } - return gm_sumerr; + return sum_error; } int64_t av1_frame_error(int use_hbd, int bd, const uint8_t *ref, int stride, uint8_t *dst, int p_width, int p_height, int p_stride) { +#if CONFIG_AV1_HIGHBITDEPTH if (use_hbd) { - return highbd_frame_error(CONVERT_TO_SHORTPTR(ref), stride, - CONVERT_TO_SHORTPTR(dst), p_width, p_height, - p_stride, bd); + return av1_calc_highbd_frame_error(CONVERT_TO_SHORTPTR(ref), stride, + CONVERT_TO_SHORTPTR(dst), p_width, + p_height, p_stride, bd); } - return frame_error(ref, stride, dst, p_width, p_height, p_stride); +#endif + (void)use_hbd; + (void)bd; + return av1_calc_frame_error(ref, stride, dst, p_width, p_height, p_stride); } -int64_t av1_warp_error(WarpedMotionParams *wm, int use_hbd, int bd, - const uint8_t *ref, int width, int height, int stride, - uint8_t *dst, int p_col, int p_row, int p_width, - int p_height, int p_stride, int subsampling_x, - int subsampling_y, int64_t best_error) { - if (wm->wmtype <= AFFINE) - if (!get_shear_params(wm)) return 1; - if (use_hbd) - return highbd_warp_error(wm, ref, width, height, stride, dst, p_col, p_row, - p_width, p_height, p_stride, subsampling_x, - subsampling_y, bd, best_error); - return warp_error(wm, ref, width, height, stride, dst, p_col, p_row, p_width, - p_height, p_stride, subsampling_x, subsampling_y, - best_error); +int64_t av1_segmented_frame_error(int use_hbd, int bd, const uint8_t *ref, + int stride, uint8_t *dst, int p_width, + int p_height, int p_stride, + uint8_t *segment_map, + int segment_map_stride) { +#if CONFIG_AV1_HIGHBITDEPTH + if (use_hbd) { + return highbd_segmented_frame_error( + CONVERT_TO_SHORTPTR(ref), stride, CONVERT_TO_SHORTPTR(dst), p_width, + p_height, p_stride, bd, segment_map, segment_map_stride); + } +#endif + (void)use_hbd; + (void)bd; + return segmented_frame_error(ref, stride, dst, p_width, p_height, p_stride, + segment_map, segment_map_stride); } void av1_warp_plane(WarpedMotionParams *wm, 
int use_hbd, int bd, @@ -896,13 +816,21 @@ void av1_warp_plane(WarpedMotionParams *wm, int use_hbd, int bd, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params) { +#if CONFIG_AV1_HIGHBITDEPTH if (use_hbd) - highbd_warp_plane(wm, ref, width, height, stride, pred, p_col, p_row, - p_width, p_height, p_stride, subsampling_x, subsampling_y, - bd, conv_params); + highbd_warp_plane(wm, CONVERT_TO_SHORTPTR(ref), width, height, stride, + CONVERT_TO_SHORTPTR(pred), p_col, p_row, p_width, + p_height, p_stride, subsampling_x, subsampling_y, bd, + conv_params); else warp_plane(wm, ref, width, height, stride, pred, p_col, p_row, p_width, p_height, p_stride, subsampling_x, subsampling_y, conv_params); +#else + (void)use_hbd; + (void)bd; + warp_plane(wm, ref, width, height, stride, pred, p_col, p_row, p_width, + p_height, p_stride, subsampling_x, subsampling_y, conv_params); +#endif } #define LS_MV_MAX 256 // max mv in 1/8-pel @@ -1023,18 +951,15 @@ static int find_affine_int(int np, const int *pts1, const int *pts2, int32_t A[2][2] = { { 0, 0 }, { 0, 0 } }; int32_t Bx[2] = { 0, 0 }; int32_t By[2] = { 0, 0 }; - int i; const int bw = block_size_wide[bsize]; const int bh = block_size_high[bsize]; - const int rsuy = (AOMMAX(bh, MI_SIZE) / 2 - 1); - const int rsux = (AOMMAX(bw, MI_SIZE) / 2 - 1); + const int rsuy = bh / 2 - 1; + const int rsux = bw / 2 - 1; const int suy = rsuy * 8; const int sux = rsux * 8; const int duy = suy + mvy; const int dux = sux + mvx; - const int isuy = (mi_row * MI_SIZE + rsuy); - const int isux = (mi_col * MI_SIZE + rsux); // Assume the center pixel of the block has exactly the same motion vector // as transmitted for the block. 
First shift the origin of the source @@ -1059,7 +984,7 @@ static int find_affine_int(int np, const int *pts1, const int *pts2, // The loop below computes: A = P'P, Bx = P'q, By = P'r // We need to just compute inv(A).Bx and inv(A).By for the solutions. // Contribution from neighbor block - for (i = 0; i < np; i++) { + for (int i = 0; i < np; i++) { const int dx = pts2[i * 2] - dux; const int dy = pts2[i * 2 + 1] - duy; const int sx = pts1[i * 2] - sux; @@ -1087,13 +1012,12 @@ static int find_affine_int(int np, const int *pts1, const int *pts2, assert(By[0] >= LS_MAT_MIN && By[0] <= LS_MAT_MAX); assert(By[1] >= LS_MAT_MIN && By[1] <= LS_MAT_MAX); - int64_t Det; - int16_t iDet, shift; - // Compute Determinant of A - Det = (int64_t)A[0][0] * A[1][1] - (int64_t)A[0][1] * A[0][1]; + const int64_t Det = (int64_t)A[0][0] * A[1][1] - (int64_t)A[0][1] * A[0][1]; if (Det == 0) return 1; - iDet = resolve_divisor_64(llabs(Det), &shift) * (Det < 0 ? -1 : 1); + + int16_t shift; + int16_t iDet = resolve_divisor_64(llabs(Det), &shift) * (Det < 0 ? -1 : 1); shift -= WARPEDMODEL_PREC_BITS; if (shift < 0) { iDet <<= (-shift); @@ -1101,7 +1025,6 @@ static int find_affine_int(int np, const int *pts1, const int *pts2, } int64_t Px[2], Py[2]; - // These divided by the Det, are the least squares solutions Px[0] = (int64_t)A[1][1] * Bx[0] - (int64_t)A[0][1] * Bx[1]; Px[1] = -(int64_t)A[0][1] * Bx[0] + (int64_t)A[0][0] * Bx[1]; @@ -1113,16 +1036,18 @@ static int find_affine_int(int np, const int *pts1, const int *pts2, wm->wmmat[4] = get_mult_shift_ndiag(Py[0], iDet, shift); wm->wmmat[5] = get_mult_shift_diag(Py[1], iDet, shift); + const int isuy = (mi_row * MI_SIZE + rsuy); + const int isux = (mi_col * MI_SIZE + rsux); // Note: In the vx, vy expressions below, the max value of each of the // 2nd and 3rd terms are (2^16 - 1) * (2^13 - 1). That leaves enough room // for the first term so that the overall sum in the worst case fits // within 32 bits overall. 
- int32_t vx = mvx * (1 << (WARPEDMODEL_PREC_BITS - 3)) - - (isux * (wm->wmmat[2] - (1 << WARPEDMODEL_PREC_BITS)) + - isuy * wm->wmmat[3]); - int32_t vy = mvy * (1 << (WARPEDMODEL_PREC_BITS - 3)) - - (isux * wm->wmmat[4] + - isuy * (wm->wmmat[5] - (1 << WARPEDMODEL_PREC_BITS))); + const int32_t vx = mvx * (1 << (WARPEDMODEL_PREC_BITS - 3)) - + (isux * (wm->wmmat[2] - (1 << WARPEDMODEL_PREC_BITS)) + + isuy * wm->wmmat[3]); + const int32_t vy = mvy * (1 << (WARPEDMODEL_PREC_BITS - 3)) - + (isux * wm->wmmat[4] + + isuy * (wm->wmmat[5] - (1 << WARPEDMODEL_PREC_BITS))); wm->wmmat[0] = clamp(vx, -WARPEDMODEL_TRANS_CLAMP, WARPEDMODEL_TRANS_CLAMP - 1); wm->wmmat[1] = @@ -1132,9 +1057,9 @@ static int find_affine_int(int np, const int *pts1, const int *pts2, return 0; } -int find_projection(int np, int *pts1, int *pts2, BLOCK_SIZE bsize, int mvy, - int mvx, WarpedMotionParams *wm_params, int mi_row, - int mi_col) { +int av1_find_projection(int np, const int *pts1, const int *pts2, + BLOCK_SIZE bsize, int mvy, int mvx, + WarpedMotionParams *wm_params, int mi_row, int mi_col) { assert(wm_params->wmtype == AFFINE); if (find_affine_int(np, pts1, pts2, bsize, mvy, mvx, wm_params, mi_row, @@ -1142,7 +1067,7 @@ int find_projection(int np, int *pts1, int *pts2, BLOCK_SIZE bsize, int mvy, return 1; // check compatibility with the fast warp filter - if (!get_shear_params(wm_params)) return 1; + if (!av1_get_shear_params(wm_params)) return 1; return 0; } diff --git a/media/libaom/src/av1/common/warped_motion.h b/media/libaom/src/av1/common/warped_motion.h index a1a4f067d..14dc0fe47 100644 --- a/media/libaom/src/av1/common/warped_motion.h +++ b/media/libaom/src/av1/common/warped_motion.h @@ -31,8 +31,83 @@ #define SAMPLES_ARRAY_SIZE (LEAST_SQUARES_SAMPLES_MAX * 2) #define WARPED_MOTION_DEBUG 0 #define DEFAULT_WMTYPE AFFINE +#define WARP_ERROR_BLOCK_LOG 5 +#define WARP_ERROR_BLOCK (1 << WARP_ERROR_BLOCK_LOG) -extern const int16_t warped_filter[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8]; +extern 
const int16_t av1_warped_filter[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8]; + +DECLARE_ALIGNED(8, extern const int8_t, + av1_filter_8bit[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8]); + +/* clang-format off */ +static const int error_measure_lut[512] = { + // pow 0.7 + 16384, 16339, 16294, 16249, 16204, 16158, 16113, 16068, + 16022, 15977, 15932, 15886, 15840, 15795, 15749, 15703, + 15657, 15612, 15566, 15520, 15474, 15427, 15381, 15335, + 15289, 15242, 15196, 15149, 15103, 15056, 15010, 14963, + 14916, 14869, 14822, 14775, 14728, 14681, 14634, 14587, + 14539, 14492, 14445, 14397, 14350, 14302, 14254, 14206, + 14159, 14111, 14063, 14015, 13967, 13918, 13870, 13822, + 13773, 13725, 13676, 13628, 13579, 13530, 13481, 13432, + 13383, 13334, 13285, 13236, 13187, 13137, 13088, 13038, + 12988, 12939, 12889, 12839, 12789, 12739, 12689, 12639, + 12588, 12538, 12487, 12437, 12386, 12335, 12285, 12234, + 12183, 12132, 12080, 12029, 11978, 11926, 11875, 11823, + 11771, 11719, 11667, 11615, 11563, 11511, 11458, 11406, + 11353, 11301, 11248, 11195, 11142, 11089, 11036, 10982, + 10929, 10875, 10822, 10768, 10714, 10660, 10606, 10552, + 10497, 10443, 10388, 10333, 10279, 10224, 10168, 10113, + 10058, 10002, 9947, 9891, 9835, 9779, 9723, 9666, + 9610, 9553, 9497, 9440, 9383, 9326, 9268, 9211, + 9153, 9095, 9037, 8979, 8921, 8862, 8804, 8745, + 8686, 8627, 8568, 8508, 8449, 8389, 8329, 8269, + 8208, 8148, 8087, 8026, 7965, 7903, 7842, 7780, + 7718, 7656, 7593, 7531, 7468, 7405, 7341, 7278, + 7214, 7150, 7086, 7021, 6956, 6891, 6826, 6760, + 6695, 6628, 6562, 6495, 6428, 6361, 6293, 6225, + 6157, 6089, 6020, 5950, 5881, 5811, 5741, 5670, + 5599, 5527, 5456, 5383, 5311, 5237, 5164, 5090, + 5015, 4941, 4865, 4789, 4713, 4636, 4558, 4480, + 4401, 4322, 4242, 4162, 4080, 3998, 3916, 3832, + 3748, 3663, 3577, 3490, 3402, 3314, 3224, 3133, + 3041, 2948, 2854, 2758, 2661, 2562, 2461, 2359, + 2255, 2148, 2040, 1929, 1815, 1698, 1577, 1452, + 1323, 1187, 1045, 894, 731, 550, 339, 0, + 339, 550, 731, 894, 
1045, 1187, 1323, 1452, + 1577, 1698, 1815, 1929, 2040, 2148, 2255, 2359, + 2461, 2562, 2661, 2758, 2854, 2948, 3041, 3133, + 3224, 3314, 3402, 3490, 3577, 3663, 3748, 3832, + 3916, 3998, 4080, 4162, 4242, 4322, 4401, 4480, + 4558, 4636, 4713, 4789, 4865, 4941, 5015, 5090, + 5164, 5237, 5311, 5383, 5456, 5527, 5599, 5670, + 5741, 5811, 5881, 5950, 6020, 6089, 6157, 6225, + 6293, 6361, 6428, 6495, 6562, 6628, 6695, 6760, + 6826, 6891, 6956, 7021, 7086, 7150, 7214, 7278, + 7341, 7405, 7468, 7531, 7593, 7656, 7718, 7780, + 7842, 7903, 7965, 8026, 8087, 8148, 8208, 8269, + 8329, 8389, 8449, 8508, 8568, 8627, 8686, 8745, + 8804, 8862, 8921, 8979, 9037, 9095, 9153, 9211, + 9268, 9326, 9383, 9440, 9497, 9553, 9610, 9666, + 9723, 9779, 9835, 9891, 9947, 10002, 10058, 10113, + 10168, 10224, 10279, 10333, 10388, 10443, 10497, 10552, + 10606, 10660, 10714, 10768, 10822, 10875, 10929, 10982, + 11036, 11089, 11142, 11195, 11248, 11301, 11353, 11406, + 11458, 11511, 11563, 11615, 11667, 11719, 11771, 11823, + 11875, 11926, 11978, 12029, 12080, 12132, 12183, 12234, + 12285, 12335, 12386, 12437, 12487, 12538, 12588, 12639, + 12689, 12739, 12789, 12839, 12889, 12939, 12988, 13038, + 13088, 13137, 13187, 13236, 13285, 13334, 13383, 13432, + 13481, 13530, 13579, 13628, 13676, 13725, 13773, 13822, + 13870, 13918, 13967, 14015, 14063, 14111, 14159, 14206, + 14254, 14302, 14350, 14397, 14445, 14492, 14539, 14587, + 14634, 14681, 14728, 14775, 14822, 14869, 14916, 14963, + 15010, 15056, 15103, 15149, 15196, 15242, 15289, 15335, + 15381, 15427, 15474, 15520, 15566, 15612, 15657, 15703, + 15749, 15795, 15840, 15886, 15932, 15977, 16022, 16068, + 16113, 16158, 16204, 16249, 16294, 16339, 16384, 16384, +}; +/* clang-format on */ static const uint8_t warp_pad_left[14][16] = { { 1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, @@ -68,28 +143,44 @@ static const uint8_t warp_pad_right[14][16] = { { 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 } }; -// Returns the error between the 
result of applying motion 'wm' to the frame -// described by 'ref' and the frame described by 'dst'. -int64_t av1_warp_error(WarpedMotionParams *wm, int use_hbd, int bd, - const uint8_t *ref, int width, int height, int stride, - uint8_t *dst, int p_col, int p_row, int p_width, - int p_height, int p_stride, int subsampling_x, - int subsampling_y, int64_t best_error); +static INLINE int error_measure(int err) { + return error_measure_lut[255 + err]; +} // Returns the error between the frame described by 'ref' and the frame // described by 'dst'. int64_t av1_frame_error(int use_hbd, int bd, const uint8_t *ref, int stride, uint8_t *dst, int p_width, int p_height, int p_stride); +int64_t av1_segmented_frame_error(int use_hbd, int bd, const uint8_t *ref, + int stride, uint8_t *dst, int p_width, + int p_height, int p_stride, + uint8_t *segment_map, int segment_map_stride); + +int64_t av1_calc_highbd_frame_error(const uint16_t *const ref, int stride, + const uint16_t *const dst, int p_width, + int p_height, int p_stride, int bd); + +void highbd_warp_plane(WarpedMotionParams *wm, const uint16_t *const ref, + int width, int height, int stride, uint16_t *const pred, + int p_col, int p_row, int p_width, int p_height, + int p_stride, int subsampling_x, int subsampling_y, + int bd, ConvolveParams *conv_params); + +void warp_plane(WarpedMotionParams *wm, const uint8_t *const ref, int width, + int height, int stride, uint8_t *pred, int p_col, int p_row, + int p_width, int p_height, int p_stride, int subsampling_x, + int subsampling_y, ConvolveParams *conv_params); + void av1_warp_plane(WarpedMotionParams *wm, int use_hbd, int bd, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params); -int find_projection(int np, int *pts1, int *pts2, BLOCK_SIZE bsize, int mvy, - int mvx, WarpedMotionParams *wm_params, int mi_row, - int mi_col); +int 
av1_find_projection(int np, const int *pts1, const int *pts2, + BLOCK_SIZE bsize, int mvy, int mvx, + WarpedMotionParams *wm_params, int mi_row, int mi_col); -int get_shear_params(WarpedMotionParams *wm); +int av1_get_shear_params(WarpedMotionParams *wm); #endif // AOM_AV1_COMMON_WARPED_MOTION_H_ diff --git a/media/libaom/src/av1/common/x86/av1_convolve_scale_sse4.c b/media/libaom/src/av1/common/x86/av1_convolve_scale_sse4.c index d9fb53785..196618176 100644 --- a/media/libaom/src/av1/common/x86/av1_convolve_scale_sse4.c +++ b/media/libaom/src/av1/common/x86/av1_convolve_scale_sse4.c @@ -129,8 +129,8 @@ static void vfilter8(const int16_t *src, int src_stride, uint8_t *dst, const int w0 = conv_params->fwd_offset; const int w1 = conv_params->bck_offset; - const __m128i wt0 = _mm_set1_epi16(w0); - const __m128i wt1 = _mm_set1_epi16(w1); + const __m128i wt0 = _mm_set1_epi16((short)w0); + const __m128i wt1 = _mm_set1_epi16((short)w1); const __m128i wt = _mm_unpacklo_epi16(wt0, wt1); int y_qn = subpel_y_qn; @@ -175,7 +175,7 @@ static void vfilter8(const int16_t *src, int src_stride, uint8_t *dst, if (conv_params->is_compound) { if (conv_params->do_average) { const __m128i p_16 = _mm_loadl_epi64((__m128i *)dst_16_x); - if (conv_params->use_jnt_comp_avg) { + if (conv_params->use_dist_wtd_comp_avg) { const __m128i p_16_lo = _mm_unpacklo_epi16(p_16, shifted_16); const __m128i wt_res_lo = _mm_madd_epi16(p_16_lo, wt); const __m128i shifted_32 = @@ -207,7 +207,7 @@ static void vfilter8(const int16_t *src, int src_stride, uint8_t *dst, if (conv_params->is_compound) { if (conv_params->do_average) { int32_t tmp = dst16[y * dst16_stride + x]; - if (conv_params->use_jnt_comp_avg) { + if (conv_params->use_dist_wtd_comp_avg) { tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; tmp = tmp >> DIST_PRECISION_BITS; } else { @@ -236,8 +236,7 @@ void av1_convolve_2d_scale_sse4_1(const uint8_t *src, int src_stride, const int subpel_x_qn, const int x_step_qn, const int 
subpel_y_qn, const int y_step_qn, ConvolveParams *conv_params) { - // TODO(yaowu): remove unnecessary initializations - int16_t tmp[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE] = { 0 }; + int16_t tmp[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]; int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) + filter_params_y->taps; @@ -408,7 +407,7 @@ static void highbd_vfilter8(const int16_t *src, int src_stride, uint16_t *dst, __m128i p_32 = _mm_cvtepu16_epi32(_mm_loadl_epi64((__m128i *)dst_16_x)); - if (conv_params->use_jnt_comp_avg) { + if (conv_params->use_dist_wtd_comp_avg) { shifted = _mm_add_epi32(_mm_mullo_epi32(p_32, wt0), _mm_mullo_epi32(shifted, wt1)); shifted = _mm_srai_epi32(shifted, DIST_PRECISION_BITS); @@ -443,7 +442,7 @@ static void highbd_vfilter8(const int16_t *src, int src_stride, uint16_t *dst, if (conv_params->is_compound) { if (conv_params->do_average) { int32_t tmp = dst16[y * dst16_stride + x]; - if (conv_params->use_jnt_comp_avg) { + if (conv_params->use_dist_wtd_comp_avg) { tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; tmp = tmp >> DIST_PRECISION_BITS; } else { diff --git a/media/libaom/src/av1/common/x86/av1_highbd_convolve_sse4.c b/media/libaom/src/av1/common/x86/av1_highbd_convolve_sse4.c deleted file mode 100644 index 212d3bd72..000000000 --- a/media/libaom/src/av1/common/x86/av1_highbd_convolve_sse4.c +++ /dev/null @@ -1,205 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#include <assert.h> -#include <smmintrin.h> - -#include "config/av1_rtcd.h" - -#include "av1/common/filter.h" - -typedef void (*TransposeSave)(int width, int pixelsNum, uint32_t *src, - int src_stride, uint16_t *dst, int dst_stride, - int bd); - -// pixelsNum 0: write all 4 pixels -// 1/2/3: residual pixels 1/2/3 -static void writePixel(__m128i *u, int width, int pixelsNum, uint16_t *dst, - int dst_stride) { - if (2 == width) { - if (0 == pixelsNum) { - *(int *)dst = _mm_cvtsi128_si32(u[0]); - *(int *)(dst + dst_stride) = _mm_cvtsi128_si32(u[1]); - *(int *)(dst + 2 * dst_stride) = _mm_cvtsi128_si32(u[2]); - *(int *)(dst + 3 * dst_stride) = _mm_cvtsi128_si32(u[3]); - } else if (1 == pixelsNum) { - *(int *)dst = _mm_cvtsi128_si32(u[0]); - } else if (2 == pixelsNum) { - *(int *)dst = _mm_cvtsi128_si32(u[0]); - *(int *)(dst + dst_stride) = _mm_cvtsi128_si32(u[1]); - } else if (3 == pixelsNum) { - *(int *)dst = _mm_cvtsi128_si32(u[0]); - *(int *)(dst + dst_stride) = _mm_cvtsi128_si32(u[1]); - *(int *)(dst + 2 * dst_stride) = _mm_cvtsi128_si32(u[2]); - } - } else { - if (0 == pixelsNum) { - _mm_storel_epi64((__m128i *)dst, u[0]); - _mm_storel_epi64((__m128i *)(dst + dst_stride), u[1]); - _mm_storel_epi64((__m128i *)(dst + 2 * dst_stride), u[2]); - _mm_storel_epi64((__m128i *)(dst + 3 * dst_stride), u[3]); - } else if (1 == pixelsNum) { - _mm_storel_epi64((__m128i *)dst, u[0]); - } else if (2 == pixelsNum) { - _mm_storel_epi64((__m128i *)dst, u[0]); - _mm_storel_epi64((__m128i *)(dst + dst_stride), u[1]); - } else if (3 == pixelsNum) { - _mm_storel_epi64((__m128i *)dst, u[0]); - _mm_storel_epi64((__m128i *)(dst + dst_stride), u[1]); - _mm_storel_epi64((__m128i *)(dst + 2 * dst_stride), u[2]); - } - } -} - -// 16-bit pixels clip with bd (10/12) -static void highbd_clip(__m128i *p, int numVecs, int bd) { - const __m128i zero = _mm_setzero_si128(); - const __m128i one = _mm_set1_epi16(1); - const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one); - __m128i 
clamped, mask; - int i; - - for (i = 0; i < numVecs; i++) { - mask = _mm_cmpgt_epi16(p[i], max); - clamped = _mm_andnot_si128(mask, p[i]); - mask = _mm_and_si128(mask, max); - clamped = _mm_or_si128(mask, clamped); - mask = _mm_cmpgt_epi16(clamped, zero); - p[i] = _mm_and_si128(clamped, mask); - } -} - -static void transClipPixel(uint32_t *src, int src_stride, __m128i *u, int bd) { - __m128i v0, v1; - __m128i rnd = _mm_set1_epi32(1 << (FILTER_BITS - 1)); - - u[0] = _mm_loadu_si128((__m128i const *)src); - u[1] = _mm_loadu_si128((__m128i const *)(src + src_stride)); - u[2] = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride)); - u[3] = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride)); - - u[0] = _mm_add_epi32(u[0], rnd); - u[1] = _mm_add_epi32(u[1], rnd); - u[2] = _mm_add_epi32(u[2], rnd); - u[3] = _mm_add_epi32(u[3], rnd); - - u[0] = _mm_srai_epi32(u[0], FILTER_BITS); - u[1] = _mm_srai_epi32(u[1], FILTER_BITS); - u[2] = _mm_srai_epi32(u[2], FILTER_BITS); - u[3] = _mm_srai_epi32(u[3], FILTER_BITS); - - u[0] = _mm_packus_epi32(u[0], u[1]); - u[1] = _mm_packus_epi32(u[2], u[3]); - - highbd_clip(u, 2, bd); - - v0 = _mm_unpacklo_epi16(u[0], u[1]); - v1 = _mm_unpackhi_epi16(u[0], u[1]); - - u[0] = _mm_unpacklo_epi16(v0, v1); - u[2] = _mm_unpackhi_epi16(v0, v1); - - u[1] = _mm_srli_si128(u[0], 8); - u[3] = _mm_srli_si128(u[2], 8); -} - -// pixelsNum = 0 : all 4 rows of pixels will be saved. -// pixelsNum = 1/2/3 : residual 1/2/4 rows of pixels will be saved. 
-void trans_save_4x4(int width, int pixelsNum, uint32_t *src, int src_stride, - uint16_t *dst, int dst_stride, int bd) { - __m128i u[4]; - transClipPixel(src, src_stride, u, bd); - writePixel(u, width, pixelsNum, dst, dst_stride); -} - -void trans_accum_save_4x4(int width, int pixelsNum, uint32_t *src, - int src_stride, uint16_t *dst, int dst_stride, - int bd) { - __m128i u[4], v[4]; - const __m128i ones = _mm_set1_epi16(1); - - transClipPixel(src, src_stride, u, bd); - - v[0] = _mm_loadl_epi64((__m128i const *)dst); - v[1] = _mm_loadl_epi64((__m128i const *)(dst + dst_stride)); - v[2] = _mm_loadl_epi64((__m128i const *)(dst + 2 * dst_stride)); - v[3] = _mm_loadl_epi64((__m128i const *)(dst + 3 * dst_stride)); - - u[0] = _mm_add_epi16(u[0], v[0]); - u[1] = _mm_add_epi16(u[1], v[1]); - u[2] = _mm_add_epi16(u[2], v[2]); - u[3] = _mm_add_epi16(u[3], v[3]); - - u[0] = _mm_add_epi16(u[0], ones); - u[1] = _mm_add_epi16(u[1], ones); - u[2] = _mm_add_epi16(u[2], ones); - u[3] = _mm_add_epi16(u[3], ones); - - u[0] = _mm_srai_epi16(u[0], 1); - u[1] = _mm_srai_epi16(u[1], 1); - u[2] = _mm_srai_epi16(u[2], 1); - u[3] = _mm_srai_epi16(u[3], 1); - - writePixel(u, width, pixelsNum, dst, dst_stride); -} - -// Vertical convolutional filter - -typedef void (*WritePixels)(__m128i *u, int bd, uint16_t *dst); - -static void highbdRndingPacks(__m128i *u) { - __m128i rnd = _mm_set1_epi32(1 << (FILTER_BITS - 1)); - u[0] = _mm_add_epi32(u[0], rnd); - u[0] = _mm_srai_epi32(u[0], FILTER_BITS); - u[0] = _mm_packus_epi32(u[0], u[0]); -} - -static void write2pixelsOnly(__m128i *u, int bd, uint16_t *dst) { - highbdRndingPacks(u); - highbd_clip(u, 1, bd); - *(uint32_t *)dst = _mm_cvtsi128_si32(u[0]); -} - -static void write2pixelsAccum(__m128i *u, int bd, uint16_t *dst) { - __m128i v = _mm_loadl_epi64((__m128i const *)dst); - const __m128i ones = _mm_set1_epi16(1); - - highbdRndingPacks(u); - highbd_clip(u, 1, bd); - - v = _mm_add_epi16(v, u[0]); - v = _mm_add_epi16(v, ones); - v = 
_mm_srai_epi16(v, 1); - *(uint32_t *)dst = _mm_cvtsi128_si32(v); -} - -WritePixels write2pixelsTab[2] = { write2pixelsOnly, write2pixelsAccum }; - -static void write4pixelsOnly(__m128i *u, int bd, uint16_t *dst) { - highbdRndingPacks(u); - highbd_clip(u, 1, bd); - _mm_storel_epi64((__m128i *)dst, u[0]); -} - -static void write4pixelsAccum(__m128i *u, int bd, uint16_t *dst) { - __m128i v = _mm_loadl_epi64((__m128i const *)dst); - const __m128i ones = _mm_set1_epi16(1); - - highbdRndingPacks(u); - highbd_clip(u, 1, bd); - - v = _mm_add_epi16(v, u[0]); - v = _mm_add_epi16(v, ones); - v = _mm_srai_epi16(v, 1); - _mm_storel_epi64((__m128i *)dst, v); -} - -WritePixels write4pixelsTab[2] = { write4pixelsOnly, write4pixelsAccum }; diff --git a/media/libaom/src/av1/common/x86/av1_inv_txfm_avx2.c b/media/libaom/src/av1/common/x86/av1_inv_txfm_avx2.c index 5db2ccf6c..0fbd5eae4 100644 --- a/media/libaom/src/av1/common/x86/av1_inv_txfm_avx2.c +++ b/media/libaom/src/av1/common/x86/av1_inv_txfm_avx2.c @@ -61,8 +61,7 @@ static INLINE void idct16_stage7_avx2(__m256i *output, __m256i *x1) { btf_16_adds_subs_out_avx2(&output[7], &output[8], x1[7], x1[8]); } -static void idct16_new_avx2(const __m256i *input, __m256i *output, - int8_t cos_bit) { +static void idct16_avx2(const __m256i *input, __m256i *output, int8_t cos_bit) { (void)(cos_bit); const int32_t *cospi = cospi_arr(INV_COS_BIT); const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); @@ -133,8 +132,8 @@ static void idct16_new_avx2(const __m256i *input, __m256i *output, idct16_stage7_avx2(output, x1); } -static void idct16_low8_new_avx2(const __m256i *input, __m256i *output, - int8_t cos_bit) { +static void idct16_low8_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { (void)(cos_bit); const int32_t *cospi = cospi_arr(INV_COS_BIT); const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); @@ -181,8 +180,8 @@ static void idct16_low8_new_avx2(const __m256i *input, __m256i *output, 
idct16_stage7_avx2(output, x1); } -static void idct16_low1_new_avx2(const __m256i *input, __m256i *output, - int8_t cos_bit) { +static void idct16_low1_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { (void)(cos_bit); const int32_t *cospi = cospi_arr(INV_COS_BIT); @@ -303,8 +302,8 @@ static INLINE void iadst16_stage9_avx2(__m256i *output, __m256i *x1) { output[15] = _mm256_subs_epi16(__zero, x1[1]); } -static void iadst16_new_avx2(const __m256i *input, __m256i *output, - int8_t cos_bit) { +static void iadst16_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { (void)(cos_bit); const int32_t *cospi = cospi_arr(INV_COS_BIT); @@ -365,8 +364,8 @@ static void iadst16_new_avx2(const __m256i *input, __m256i *output, iadst16_stage9_avx2(output, x1); } -static void iadst16_low8_new_avx2(const __m256i *input, __m256i *output, - int8_t cos_bit) { +static void iadst16_low8_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { (void)(cos_bit); const int32_t *cospi = cospi_arr(INV_COS_BIT); const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); @@ -401,8 +400,8 @@ static void iadst16_low8_new_avx2(const __m256i *input, __m256i *output, iadst16_stage9_avx2(output, x1); } -static void iadst16_low1_new_avx2(const __m256i *input, __m256i *output, - int8_t cos_bit) { +static void iadst16_low1_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { (void)(cos_bit); const int32_t *cospi = cospi_arr(INV_COS_BIT); const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); @@ -568,8 +567,8 @@ static INLINE void idct32_stage9_avx2(__m256i *output, __m256i *x) { btf_16_adds_subs_out_avx2(&output[15], &output[16], x[15], x[16]); } -static void idct32_low1_new_avx2(const __m256i *input, __m256i *output, - int8_t cos_bit) { +static void idct32_low1_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { (void)cos_bit; const int32_t *cospi = cospi_arr(INV_COS_BIT); @@ -621,8 +620,8 @@ static void idct32_low1_new_avx2(const __m256i 
*input, __m256i *output, output[16] = x[0]; } -static void idct32_low8_new_avx2(const __m256i *input, __m256i *output, - int8_t cos_bit) { +static void idct32_low8_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { (void)cos_bit; const int32_t *cospi = cospi_arr(INV_COS_BIT); const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); @@ -679,8 +678,8 @@ static void idct32_low8_new_avx2(const __m256i *input, __m256i *output, idct32_stage9_avx2(output, x); } -static void idct32_low16_new_avx2(const __m256i *input, __m256i *output, - int8_t cos_bit) { +static void idct32_low16_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { (void)cos_bit; const int32_t *cospi = cospi_arr(INV_COS_BIT); const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); @@ -746,8 +745,7 @@ static void idct32_low16_new_avx2(const __m256i *input, __m256i *output, idct32_stage9_avx2(output, x); } -static void idct32_new_avx2(const __m256i *input, __m256i *output, - int8_t cos_bit) { +static void idct32_avx2(const __m256i *input, __m256i *output, int8_t cos_bit) { (void)(cos_bit); const int32_t *cospi = cospi_arr(INV_COS_BIT); const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); @@ -1104,8 +1102,8 @@ static INLINE void idct64_stage11_avx2(__m256i *output, __m256i *x) { btf_16_adds_subs_out_avx2(&output[31], &output[32], x[31], x[32]); } -static void idct64_low1_new_avx2(const __m256i *input, __m256i *output, - int8_t cos_bit) { +static void idct64_low1_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { (void)cos_bit; const int32_t *cospi = cospi_arr(INV_COS_BIT); @@ -1191,8 +1189,8 @@ static void idct64_low1_new_avx2(const __m256i *input, __m256i *output, output[32] = x[0]; } -static void idct64_low8_new_avx2(const __m256i *input, __m256i *output, - int8_t cos_bit) { +static void idct64_low8_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { (void)cos_bit; const int32_t *cospi = cospi_arr(INV_COS_BIT); const __m256i _r = 
_mm256_set1_epi32(1 << (INV_COS_BIT - 1)); @@ -1302,7 +1300,6 @@ static void idct64_low8_new_avx2(const __m256i *input, __m256i *output, x[6] = x[1]; x[5] = x[2]; x[4] = x[3]; - x[9] = x[9]; btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r, cos_bit); btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r, cos_bit); idct64_stage8_high48_avx2(x, cospi, _r, cos_bit); @@ -1312,8 +1309,8 @@ static void idct64_low8_new_avx2(const __m256i *input, __m256i *output, idct64_stage11_avx2(output, x); } -static void idct64_low16_new_avx2(const __m256i *input, __m256i *output, - int8_t cos_bit) { +static void idct64_low16_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { (void)cos_bit; const int32_t *cospi = cospi_arr(INV_COS_BIT); const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); @@ -1428,8 +1425,8 @@ static void idct64_low16_new_avx2(const __m256i *input, __m256i *output, idct64_stage11_avx2(output, x); } -static void idct64_low32_new_avx2(const __m256i *input, __m256i *output, - int8_t cos_bit) { +static void idct64_low32_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { (void)cos_bit; const int32_t *cospi = cospi_arr(INV_COS_BIT); const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); @@ -1577,6 +1574,9 @@ static void idct64_low32_new_avx2(const __m256i *input, __m256i *output, idct64_stage11_avx2(output, x); } +typedef void (*transform_1d_avx2)(const __m256i *input, __m256i *output, + int8_t cos_bit); + // 1D functions process 16 pixels at one time. 
static const transform_1d_avx2 lowbd_txfm_all_1d_zeros_w16_arr[TX_SIZES][ITX_TYPES_1D][4] = { @@ -1589,17 +1589,15 @@ static const transform_1d_avx2 { NULL, NULL, NULL, NULL }, { NULL, NULL, NULL, NULL } }, { - { idct16_low1_new_avx2, idct16_low8_new_avx2, idct16_new_avx2, NULL }, - { iadst16_low1_new_avx2, iadst16_low8_new_avx2, iadst16_new_avx2, - NULL }, + { idct16_low1_avx2, idct16_low8_avx2, idct16_avx2, NULL }, + { iadst16_low1_avx2, iadst16_low8_avx2, iadst16_avx2, NULL }, { NULL, NULL, NULL, NULL }, }, - { { idct32_low1_new_avx2, idct32_low8_new_avx2, idct32_low16_new_avx2, - idct32_new_avx2 }, + { { idct32_low1_avx2, idct32_low8_avx2, idct32_low16_avx2, idct32_avx2 }, { NULL, NULL, NULL, NULL }, { NULL, NULL, NULL, NULL } }, - { { idct64_low1_new_avx2, idct64_low8_new_avx2, idct64_low16_new_avx2, - idct64_low32_new_avx2 }, + { { idct64_low1_avx2, idct64_low8_avx2, idct64_low16_avx2, + idct64_low32_avx2 }, { NULL, NULL, NULL, NULL }, { NULL, NULL, NULL, NULL } } }; @@ -1611,11 +1609,11 @@ static INLINE void lowbd_inv_txfm2d_add_no_identity_avx2( __m256i buf1[64 * 16]; int eobx, eoby; get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob); - const int8_t *shift = inv_txfm_shift_ls[tx_size]; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; const int txw_idx = get_txw_idx(tx_size); const int txh_idx = get_txh_idx(tx_size); - const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx]; - const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx]; + const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx]; const int txfm_size_col = tx_size_wide[tx_size]; const int txfm_size_row = tx_size_high[tx_size]; const int buf_size_w_div16 = txfm_size_col >> 4; @@ -1635,6 +1633,7 @@ static INLINE void lowbd_inv_txfm2d_add_no_identity_avx2( assert(row_txfm != NULL); int ud_flip, lr_flip; get_flip_cfg(tx_type, &ud_flip, &lr_flip); + const __m256i scale0 = _mm256_set1_epi16(1 << (15 + shift[0])); for 
(int i = 0; i < buf_size_nonzero_h_div16; i++) { __m256i buf0[64]; const int32_t *input_row = input + (i << 4) * input_stride; @@ -1649,7 +1648,9 @@ static INLINE void lowbd_inv_txfm2d_add_no_identity_avx2( round_shift_avx2(buf0, buf0, input_stride); // rect special code } row_txfm(buf0, buf0, cos_bit_row); - round_shift_16bit_w16_avx2(buf0, txfm_size_col, shift[0]); + for (int j = 0; j < txfm_size_col; ++j) { + buf0[j] = _mm256_mulhrs_epi16(buf0[j], scale0); + } __m256i *buf1_cur = buf1 + (i << 4); if (lr_flip) { @@ -1665,10 +1666,13 @@ static INLINE void lowbd_inv_txfm2d_add_no_identity_avx2( } } } + const __m256i scale1 = _mm256_set1_epi16(1 << (15 + shift[1])); for (int i = 0; i < buf_size_w_div16; i++) { __m256i *buf1_cur = buf1 + i * txfm_size_row; col_txfm(buf1_cur, buf1_cur, cos_bit_col); - round_shift_16bit_w16_avx2(buf1_cur, txfm_size_row, shift[1]); + for (int j = 0; j < txfm_size_row; ++j) { + buf1_cur[j] = _mm256_mulhrs_epi16(buf1_cur[j], scale1); + } } for (int i = 0; i < buf_size_w_div16; i++) { lowbd_write_buffer_16xn_avx2(buf1 + i * txfm_size_row, output + 16 * i, @@ -1745,7 +1749,7 @@ static INLINE void lowbd_inv_txfm2d_add_idtx_avx2(const int32_t *input, TX_SIZE tx_size, int32_t eob) { (void)eob; - const int8_t *shift = inv_txfm_shift_ls[tx_size]; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; const int txw_idx = get_txw_idx(tx_size); const int txh_idx = get_txh_idx(tx_size); const int txfm_size_col = tx_size_wide[tx_size]; @@ -1767,10 +1771,10 @@ static INLINE void lowbd_inv_txfm2d_add_h_identity_avx2( TX_SIZE tx_size, int eob) { int eobx, eoby; get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob); - const int8_t *shift = inv_txfm_shift_ls[tx_size]; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; const int txw_idx = get_txw_idx(tx_size); const int txh_idx = get_txh_idx(tx_size); - const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx]; const int 
txfm_size_col = tx_size_wide[tx_size]; const int txfm_size_row = tx_size_high[tx_size]; const int txfm_size_col_notzero = AOMMIN(32, txfm_size_col); @@ -1807,10 +1811,10 @@ static INLINE void lowbd_inv_txfm2d_add_v_identity_avx2( __m256i buf1[64]; int eobx, eoby; get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob); - const int8_t *shift = inv_txfm_shift_ls[tx_size]; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; const int txw_idx = get_txw_idx(tx_size); const int txh_idx = get_txh_idx(tx_size); - const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx]; + const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx]; const int txfm_size_col = tx_size_wide[tx_size]; const int txfm_size_row = tx_size_high[tx_size]; const int buf_size_w_div16 = txfm_size_col >> 4; diff --git a/media/libaom/src/av1/common/x86/av1_inv_txfm_ssse3.c b/media/libaom/src/av1/common/x86/av1_inv_txfm_ssse3.c index 995bc3da4..46c051ff8 100644 --- a/media/libaom/src/av1/common/x86/av1_inv_txfm_ssse3.c +++ b/media/libaom/src/av1/common/x86/av1_inv_txfm_ssse3.c @@ -24,8 +24,7 @@ static int32_t NewSqrt2list[TX_SIZES] = { 5793, 2 * 4096, 2 * 5793, 4 * 4096, // TODO(binpengsmail@gmail.com): replace some for loop with do {} while -static void idct4_new_sse2(const __m128i *input, __m128i *output, - int8_t cos_bit) { +static void idct4_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { (void)cos_bit; const int32_t *cospi = cospi_arr(INV_COS_BIT); const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); @@ -51,7 +50,8 @@ static void idct4_new_sse2(const __m128i *input, __m128i *output, btf_16_adds_subs_out_sse2(output[1], output[2], x[1], x[2]); } -void idct4_w4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { +static void idct4_w4_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { (void)cos_bit; const int32_t *cospi = cospi_arr(INV_COS_BIT); const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); @@ -77,8 +77,8 @@ void 
idct4_w4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { btf_16_adds_subs_out_sse2(output[1], output[2], x[1], x[2]); } -void idct8_low1_new_ssse3(const __m128i *input, __m128i *output, - int8_t cos_bit) { +static void idct8_low1_ssse3(const __m128i *input, __m128i *output, + int8_t cos_bit) { (void)cos_bit; const int32_t *cospi = cospi_arr(INV_COS_BIT); @@ -102,7 +102,7 @@ void idct8_low1_new_ssse3(const __m128i *input, __m128i *output, output[4] = x[0]; } -void idct8_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { +static void idct8_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { (void)cos_bit; const int32_t *cospi = cospi_arr(INV_COS_BIT); const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); @@ -150,7 +150,8 @@ void idct8_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { btf_16_adds_subs_out_sse2(output[3], output[4], x[3], x[4]); } -void idct8_w4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { +static void idct8_w4_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { (void)cos_bit; const int32_t *cospi = cospi_arr(INV_COS_BIT); const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); @@ -236,8 +237,8 @@ static INLINE void idct16_stage7_sse2(__m128i *output, __m128i *x) { btf_16_adds_subs_out_sse2(output[7], output[8], x[7], x[8]); } -static void idct16_low1_new_ssse3(const __m128i *input, __m128i *output, - int8_t cos_bit) { +static void idct16_low1_ssse3(const __m128i *input, __m128i *output, + int8_t cos_bit) { (void)cos_bit; const int32_t *cospi = cospi_arr(INV_COS_BIT); @@ -271,8 +272,8 @@ static void idct16_low1_new_ssse3(const __m128i *input, __m128i *output, output[8] = x[0]; } -static void idct16_low8_new_ssse3(const __m128i *input, __m128i *output, - int8_t cos_bit) { +static void idct16_low8_ssse3(const __m128i *input, __m128i *output, + int8_t cos_bit) { (void)cos_bit; const int32_t *cospi = cospi_arr(INV_COS_BIT); const __m128i 
__rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); @@ -318,7 +319,7 @@ static void idct16_low8_new_ssse3(const __m128i *input, __m128i *output, idct16_stage7_sse2(output, x); } -void idct16_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { +static void idct16_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { (void)cos_bit; const int32_t *cospi = cospi_arr(INV_COS_BIT); const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); @@ -390,7 +391,8 @@ void idct16_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { idct16_stage7_sse2(output, x); } -void idct16_w4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { +static void idct16_w4_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { (void)cos_bit; const int32_t *cospi = cospi_arr(INV_COS_BIT); const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); @@ -600,8 +602,8 @@ static INLINE void idct32_stage9_sse2(__m128i *output, __m128i *x) { btf_16_adds_subs_out_sse2(output[15], output[16], x[15], x[16]); } -static void idct32_low1_new_ssse3(const __m128i *input, __m128i *output, - int8_t cos_bit) { +static void idct32_low1_ssse3(const __m128i *input, __m128i *output, + int8_t cos_bit) { (void)cos_bit; const int32_t *cospi = cospi_arr(INV_COS_BIT); @@ -653,8 +655,8 @@ static void idct32_low1_new_ssse3(const __m128i *input, __m128i *output, output[16] = x[0]; } -static void idct32_low8_new_ssse3(const __m128i *input, __m128i *output, - int8_t cos_bit) { +static void idct32_low8_ssse3(const __m128i *input, __m128i *output, + int8_t cos_bit) { (void)cos_bit; const int32_t *cospi = cospi_arr(INV_COS_BIT); const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); @@ -711,8 +713,8 @@ static void idct32_low8_new_ssse3(const __m128i *input, __m128i *output, idct32_stage9_sse2(output, x); } -static void idct32_low16_new_ssse3(const __m128i *input, __m128i *output, - int8_t cos_bit) { +static void idct32_low16_ssse3(const __m128i 
*input, __m128i *output, + int8_t cos_bit) { (void)cos_bit; const int32_t *cospi = cospi_arr(INV_COS_BIT); const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); @@ -778,8 +780,7 @@ static void idct32_low16_new_ssse3(const __m128i *input, __m128i *output, idct32_stage9_sse2(output, x); } -static void idct32_new_sse2(const __m128i *input, __m128i *output, - int8_t cos_bit) { +static void idct32_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { (void)cos_bit; const int32_t *cospi = cospi_arr(INV_COS_BIT); const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); @@ -1138,8 +1139,8 @@ static INLINE void idct64_stage11_sse2(__m128i *output, __m128i *x) { btf_16_adds_subs_out_sse2(output[31], output[32], x[31], x[32]); } -static void idct64_low1_new_ssse3(const __m128i *input, __m128i *output, - int8_t cos_bit) { +static void idct64_low1_ssse3(const __m128i *input, __m128i *output, + int8_t cos_bit) { (void)cos_bit; const int32_t *cospi = cospi_arr(INV_COS_BIT); @@ -1225,8 +1226,8 @@ static void idct64_low1_new_ssse3(const __m128i *input, __m128i *output, output[32] = x[0]; } -static void idct64_low8_new_ssse3(const __m128i *input, __m128i *output, - int8_t cos_bit) { +static void idct64_low8_ssse3(const __m128i *input, __m128i *output, + int8_t cos_bit) { (void)cos_bit; const int32_t *cospi = cospi_arr(INV_COS_BIT); const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); @@ -1336,7 +1337,6 @@ static void idct64_low8_new_ssse3(const __m128i *input, __m128i *output, x[6] = x[1]; x[5] = x[2]; x[4] = x[3]; - x[9] = x[9]; btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]); btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]); idct64_stage8_high48_sse2(x, cospi, __rounding, cos_bit); @@ -1346,8 +1346,8 @@ static void idct64_low8_new_ssse3(const __m128i *input, __m128i *output, idct64_stage11_sse2(output, x); } -static void idct64_low16_new_ssse3(const __m128i *input, __m128i *output, - int8_t 
cos_bit) { +static void idct64_low16_ssse3(const __m128i *input, __m128i *output, + int8_t cos_bit) { (void)cos_bit; const int32_t *cospi = cospi_arr(INV_COS_BIT); const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); @@ -1462,8 +1462,8 @@ static void idct64_low16_new_ssse3(const __m128i *input, __m128i *output, idct64_stage11_sse2(output, x); } -static void idct64_low32_new_ssse3(const __m128i *input, __m128i *output, - int8_t cos_bit) { +static void idct64_low32_ssse3(const __m128i *input, __m128i *output, + int8_t cos_bit) { (void)cos_bit; const int32_t *cospi = cospi_arr(INV_COS_BIT); const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); @@ -1611,7 +1611,7 @@ static void idct64_low32_new_ssse3(const __m128i *input, __m128i *output, idct64_stage11_sse2(output, x); } -void iadst4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { +static void iadst4_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { (void)cos_bit; const int32_t *sinpi = sinpi_arr(INV_COS_BIT); const __m128i sinpi_p01_p04 = pair_set_epi16(sinpi[1], sinpi[4]); @@ -1672,10 +1672,8 @@ void iadst4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { } } -// TODO(binpengsmail@gmail.com): -// To explore the reuse of VP9 versions of corresponding SSE2 functions and -// evaluate whether there is a possibility for further speedup. 
-void iadst4_w4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { +static void iadst4_w4_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { (void)cos_bit; const int32_t *sinpi = sinpi_arr(INV_COS_BIT); const __m128i sinpi_p01_p04 = pair_set_epi16(sinpi[1], sinpi[4]); @@ -1720,8 +1718,8 @@ void iadst4_w4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { } } -static void iadst8_low1_new_ssse3(const __m128i *input, __m128i *output, - int8_t cos_bit) { +static void iadst8_low1_ssse3(const __m128i *input, __m128i *output, + int8_t cos_bit) { (void)cos_bit; const int32_t *cospi = cospi_arr(INV_COS_BIT); const __m128i __zero = _mm_setzero_si128(); @@ -1767,7 +1765,7 @@ static void iadst8_low1_new_ssse3(const __m128i *input, __m128i *output, output[7] = _mm_subs_epi16(__zero, x[1]); } -void iadst8_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { +static void iadst8_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { (void)cos_bit; const int32_t *cospi = cospi_arr(INV_COS_BIT); const __m128i __zero = _mm_setzero_si128(); @@ -1835,7 +1833,8 @@ void iadst8_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { output[7] = _mm_subs_epi16(__zero, x[1]); } -void iadst8_w4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { +static void iadst8_w4_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { (void)cos_bit; const int32_t *cospi = cospi_arr(INV_COS_BIT); const __m128i __zero = _mm_setzero_si128(); @@ -1994,8 +1993,8 @@ static INLINE void iadst16_stage9_ssse3(__m128i *output, __m128i *x) { output[15] = _mm_subs_epi16(__zero, x[1]); } -static void iadst16_low1_new_ssse3(const __m128i *input, __m128i *output, - int8_t cos_bit) { +static void iadst16_low1_ssse3(const __m128i *input, __m128i *output, + int8_t cos_bit) { (void)cos_bit; const int32_t *cospi = cospi_arr(INV_COS_BIT); const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); @@ -2043,8 +2042,8 @@ 
static void iadst16_low1_new_ssse3(const __m128i *input, __m128i *output, iadst16_stage9_ssse3(output, x); } -static void iadst16_low8_new_ssse3(const __m128i *input, __m128i *output, - int8_t cos_bit) { +static void iadst16_low8_ssse3(const __m128i *input, __m128i *output, + int8_t cos_bit) { (void)cos_bit; const int32_t *cospi = cospi_arr(INV_COS_BIT); const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); @@ -2079,7 +2078,8 @@ static void iadst16_low8_new_ssse3(const __m128i *input, __m128i *output, iadst16_stage8_ssse3(x, cospi, __rounding, cos_bit); iadst16_stage9_ssse3(output, x); } -void iadst16_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { +static void iadst16_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { (void)cos_bit; const int32_t *cospi = cospi_arr(INV_COS_BIT); const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); @@ -2139,8 +2139,8 @@ void iadst16_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { iadst16_stage9_ssse3(output, x); } -void iadst16_w4_new_sse2(const __m128i *input, __m128i *output, - int8_t cos_bit) { +static void iadst16_w4_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { (void)cos_bit; const int32_t *cospi = cospi_arr(INV_COS_BIT); const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); @@ -2233,8 +2233,8 @@ void iadst16_w4_new_sse2(const __m128i *input, __m128i *output, iadst16_stage9_ssse3(output, x); } -static void iidentity4_new_ssse3(const __m128i *input, __m128i *output, - int8_t cos_bit) { +static void iidentity4_ssse3(const __m128i *input, __m128i *output, + int8_t cos_bit) { (void)cos_bit; const int16_t scale_fractional = (NewSqrt2 - (1 << NewSqrt2Bits)); const __m128i scale = _mm_set1_epi16(scale_fractional << (15 - NewSqrt2Bits)); @@ -2244,16 +2244,16 @@ static void iidentity4_new_ssse3(const __m128i *input, __m128i *output, } } -static void iidentity8_new_sse2(const __m128i *input, __m128i *output, - int8_t cos_bit) { 
+static void iidentity8_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { (void)cos_bit; for (int i = 0; i < 8; ++i) { output[i] = _mm_adds_epi16(input[i], input[i]); } } -static void iidentity16_new_ssse3(const __m128i *input, __m128i *output, - int8_t cos_bit) { +static void iidentity16_ssse3(const __m128i *input, __m128i *output, + int8_t cos_bit) { (void)cos_bit; const int16_t scale_fractional = 2 * (NewSqrt2 - (1 << NewSqrt2Bits)); const __m128i scale = _mm_set1_epi16(scale_fractional << (15 - NewSqrt2Bits)); @@ -2300,11 +2300,11 @@ static INLINE void lowbd_write_buffer_8xn_sse2(__m128i *in, uint8_t *output, // 1D functions process process 8 pixels at one time. static const transform_1d_ssse3 lowbd_txfm_all_1d_w8_arr[TX_SIZES][ITX_TYPES_1D] = { - { idct4_new_sse2, iadst4_new_sse2, iidentity4_new_ssse3 }, - { idct8_new_sse2, iadst8_new_sse2, iidentity8_new_sse2 }, - { idct16_new_sse2, iadst16_new_sse2, iidentity16_new_ssse3 }, - { idct32_new_sse2, NULL, NULL }, - { idct64_low32_new_ssse3, NULL, NULL }, + { idct4_sse2, iadst4_sse2, iidentity4_ssse3 }, + { idct8_sse2, iadst8_sse2, iidentity8_sse2 }, + { idct16_sse2, iadst16_sse2, iidentity16_ssse3 }, + { idct32_sse2, NULL, NULL }, + { idct64_low32_ssse3, NULL, NULL }, }; // functions for blocks with eob at DC and within @@ -2312,26 +2312,24 @@ static const transform_1d_ssse3 static const transform_1d_ssse3 lowbd_txfm_all_1d_zeros_w8_arr[TX_SIZES][ITX_TYPES_1D][4] = { { - { idct4_new_sse2, idct4_new_sse2, NULL, NULL }, - { iadst4_new_sse2, iadst4_new_sse2, NULL, NULL }, - { iidentity4_new_ssse3, iidentity4_new_ssse3, NULL, NULL }, + { idct4_sse2, idct4_sse2, NULL, NULL }, + { iadst4_sse2, iadst4_sse2, NULL, NULL }, + { iidentity4_ssse3, iidentity4_ssse3, NULL, NULL }, }, - { { idct8_low1_new_ssse3, idct8_new_sse2, NULL, NULL }, - { iadst8_low1_new_ssse3, iadst8_new_sse2, NULL, NULL }, - { iidentity8_new_sse2, iidentity8_new_sse2, NULL, NULL } }, + { { idct8_low1_ssse3, idct8_sse2, NULL, NULL }, + { 
iadst8_low1_ssse3, iadst8_sse2, NULL, NULL }, + { iidentity8_sse2, iidentity8_sse2, NULL, NULL } }, { - { idct16_low1_new_ssse3, idct16_low8_new_ssse3, idct16_new_sse2, - NULL }, - { iadst16_low1_new_ssse3, iadst16_low8_new_ssse3, iadst16_new_sse2, - NULL }, + { idct16_low1_ssse3, idct16_low8_ssse3, idct16_sse2, NULL }, + { iadst16_low1_ssse3, iadst16_low8_ssse3, iadst16_sse2, NULL }, { NULL, NULL, NULL, NULL }, }, - { { idct32_low1_new_ssse3, idct32_low8_new_ssse3, idct32_low16_new_ssse3, - idct32_new_sse2 }, + { { idct32_low1_ssse3, idct32_low8_ssse3, idct32_low16_ssse3, + idct32_sse2 }, { NULL, NULL, NULL, NULL }, { NULL, NULL, NULL, NULL } }, - { { idct64_low1_new_ssse3, idct64_low8_new_ssse3, idct64_low16_new_ssse3, - idct64_low32_new_ssse3 }, + { { idct64_low1_ssse3, idct64_low8_ssse3, idct64_low16_ssse3, + idct64_low32_ssse3 }, { NULL, NULL, NULL, NULL }, { NULL, NULL, NULL, NULL } } }; @@ -2340,9 +2338,9 @@ static const transform_1d_ssse3 // used in 4x4, 4x8, 4x16, 8x4, 16x4 static const transform_1d_ssse3 lowbd_txfm_all_1d_w4_arr[TX_SIZES][ITX_TYPES_1D] = { - { idct4_w4_new_sse2, iadst4_w4_new_sse2, iidentity4_new_ssse3 }, - { idct8_w4_new_sse2, iadst8_w4_new_sse2, iidentity8_new_sse2 }, - { idct16_w4_new_sse2, iadst16_w4_new_sse2, iidentity16_new_ssse3 }, + { idct4_w4_sse2, iadst4_w4_sse2, iidentity4_ssse3 }, + { idct8_w4_sse2, iadst8_w4_sse2, iidentity8_sse2 }, + { idct16_w4_sse2, iadst16_w4_sse2, iidentity16_ssse3 }, { NULL, NULL, NULL }, { NULL, NULL, NULL }, }; @@ -2419,7 +2417,7 @@ static INLINE void iidentity_col_8xn_ssse3(uint8_t *output, int stride, static INLINE void lowbd_inv_txfm2d_add_idtx_ssse3(const int32_t *input, uint8_t *output, int stride, TX_SIZE tx_size) { - const int8_t *shift = inv_txfm_shift_ls[tx_size]; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; const int txw_idx = get_txw_idx(tx_size); const int txh_idx = get_txh_idx(tx_size); const int txfm_size_col = tx_size_wide[tx_size]; @@ -2437,18 +2435,19 @@ static INLINE void 
lowbd_inv_txfm2d_add_idtx_ssse3(const int32_t *input, } } -void lowbd_inv_txfm2d_add_4x4_ssse3(const int32_t *input, uint8_t *output, - int stride, TX_TYPE tx_type, - TX_SIZE tx_size_, int eob) { +static void lowbd_inv_txfm2d_add_4x4_ssse3(const int32_t *input, + uint8_t *output, int stride, + TX_TYPE tx_type, TX_SIZE tx_size_, + int eob) { (void)tx_size_; (void)eob; __m128i buf[4]; const TX_SIZE tx_size = TX_4X4; - const int8_t *shift = inv_txfm_shift_ls[tx_size]; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; const int txw_idx = get_txw_idx(tx_size); const int txh_idx = get_txh_idx(tx_size); - const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx]; - const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx]; + const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx]; const int txfm_size_col = tx_size_wide[tx_size]; const int txfm_size_row = tx_size_high[tx_size]; @@ -2510,11 +2509,11 @@ static INLINE void lowbd_inv_txfm2d_add_no_identity_ssse3( __m128i buf1[64 * 8]; int eobx, eoby; get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob); - const int8_t *shift = inv_txfm_shift_ls[tx_size]; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; const int txw_idx = get_txw_idx(tx_size); const int txh_idx = get_txh_idx(tx_size); - const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx]; - const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx]; + const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx]; const int txfm_size_col = tx_size_wide[tx_size]; const int txfm_size_row = tx_size_high[tx_size]; const int buf_size_w_div8 = txfm_size_col >> 3; @@ -2580,12 +2579,12 @@ static INLINE void lowbd_inv_txfm2d_add_no_identity_ssse3( static INLINE void lowbd_inv_txfm2d_add_h_identity_ssse3( const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, TX_SIZE tx_size, int eob) { - const int8_t *shift = 
inv_txfm_shift_ls[tx_size]; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; int eobx, eoby; get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob); const int txw_idx = get_txw_idx(tx_size); const int txh_idx = get_txh_idx(tx_size); - const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx]; const int txfm_size_col = tx_size_wide[tx_size]; const int txfm_size_row = tx_size_high[tx_size]; const int buf_size_w_div8 = (eobx + 8) >> 3; @@ -2626,10 +2625,10 @@ static INLINE void lowbd_inv_txfm2d_add_v_identity_ssse3( __m128i buf1[64]; int eobx, eoby; get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob); - const int8_t *shift = inv_txfm_shift_ls[tx_size]; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; const int txw_idx = get_txw_idx(tx_size); const int txh_idx = get_txh_idx(tx_size); - const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx]; + const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx]; const int txfm_size_col = tx_size_wide[tx_size]; const int txfm_size_row = tx_size_high[tx_size]; const int buf_size_w_div8 = txfm_size_col >> 3; @@ -2708,18 +2707,19 @@ static INLINE void lowbd_inv_txfm2d_add_universe_ssse3( } } -void lowbd_inv_txfm2d_add_4x8_ssse3(const int32_t *input, uint8_t *output, - int stride, TX_TYPE tx_type, - TX_SIZE tx_size_, int eob) { +static void lowbd_inv_txfm2d_add_4x8_ssse3(const int32_t *input, + uint8_t *output, int stride, + TX_TYPE tx_type, TX_SIZE tx_size_, + int eob) { (void)tx_size_; (void)eob; __m128i buf[8]; const TX_SIZE tx_size = TX_4X8; - const int8_t *shift = inv_txfm_shift_ls[tx_size]; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; const int txw_idx = get_txw_idx(tx_size); const int txh_idx = get_txh_idx(tx_size); - const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx]; - const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx]; + const int cos_bit_col = 
av1_inv_cos_bit_col[txw_idx][txh_idx]; const int txfm_size_col = tx_size_wide[tx_size]; const int txfm_size_row = tx_size_high[tx_size]; @@ -2747,18 +2747,19 @@ void lowbd_inv_txfm2d_add_4x8_ssse3(const int32_t *input, uint8_t *output, lowbd_write_buffer_4xn_sse2(buf, output, stride, ud_flip, txfm_size_row); } -void lowbd_inv_txfm2d_add_8x4_ssse3(const int32_t *input, uint8_t *output, - int stride, TX_TYPE tx_type, - TX_SIZE tx_size_, int eob) { +static void lowbd_inv_txfm2d_add_8x4_ssse3(const int32_t *input, + uint8_t *output, int stride, + TX_TYPE tx_type, TX_SIZE tx_size_, + int eob) { (void)tx_size_; (void)eob; __m128i buf[8]; const TX_SIZE tx_size = TX_8X4; - const int8_t *shift = inv_txfm_shift_ls[tx_size]; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; const int txw_idx = get_txw_idx(tx_size); const int txh_idx = get_txh_idx(tx_size); - const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx]; - const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx]; + const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx]; const int txfm_size_col = tx_size_wide[tx_size]; const int txfm_size_row = tx_size_high[tx_size]; @@ -2786,18 +2787,19 @@ void lowbd_inv_txfm2d_add_8x4_ssse3(const int32_t *input, uint8_t *output, lowbd_write_buffer_8xn_sse2(buf, output, stride, ud_flip, txfm_size_row); } -void lowbd_inv_txfm2d_add_4x16_ssse3(const int32_t *input, uint8_t *output, - int stride, TX_TYPE tx_type, - TX_SIZE tx_size_, int eob) { +static void lowbd_inv_txfm2d_add_4x16_ssse3(const int32_t *input, + uint8_t *output, int stride, + TX_TYPE tx_type, TX_SIZE tx_size_, + int eob) { (void)tx_size_; (void)eob; __m128i buf[16]; const TX_SIZE tx_size = TX_4X16; - const int8_t *shift = inv_txfm_shift_ls[tx_size]; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; const int txw_idx = get_txw_idx(tx_size); const int txh_idx = get_txh_idx(tx_size); - const int cos_bit_row = 
inv_cos_bit_row[txw_idx][txh_idx]; - const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx]; + const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx]; const int txfm_size_col = tx_size_wide[tx_size]; const int txfm_size_row = tx_size_high[tx_size]; @@ -2816,8 +2818,22 @@ void lowbd_inv_txfm2d_add_4x16_ssse3(const int32_t *input, uint8_t *output, load_buffer_32bit_to_16bit_w4(input_cur, txfm_size_col, buf_cur, row_one_loop); transpose_16bit_4x8(buf_cur, buf_cur); - row_txfm(buf_cur, buf_cur, cos_bit_row); - round_shift_16bit_ssse3(buf_cur, row_one_loop, shift[0]); + if (row_txfm == iidentity4_ssse3) { + const __m128i scale = pair_set_epi16(NewSqrt2, 3 << (NewSqrt2Bits - 1)); + const __m128i ones = _mm_set1_epi16(1); + for (int j = 0; j < 4; ++j) { + const __m128i buf_lo = _mm_unpacklo_epi16(buf_cur[j], ones); + const __m128i buf_hi = _mm_unpackhi_epi16(buf_cur[j], ones); + const __m128i buf_32_lo = + _mm_srai_epi32(_mm_madd_epi16(buf_lo, scale), (NewSqrt2Bits + 1)); + const __m128i buf_32_hi = + _mm_srai_epi32(_mm_madd_epi16(buf_hi, scale), (NewSqrt2Bits + 1)); + buf_cur[j] = _mm_packs_epi32(buf_32_lo, buf_32_hi); + } + } else { + row_txfm(buf_cur, buf_cur, cos_bit_row); + round_shift_16bit_ssse3(buf_cur, row_one_loop, shift[0]); + } if (lr_flip) { __m128i temp[8]; flip_buf_sse2(buf_cur, temp, txfm_size_col); @@ -2831,18 +2847,19 @@ void lowbd_inv_txfm2d_add_4x16_ssse3(const int32_t *input, uint8_t *output, lowbd_write_buffer_4xn_sse2(buf, output, stride, ud_flip, txfm_size_row); } -void lowbd_inv_txfm2d_add_16x4_ssse3(const int32_t *input, uint8_t *output, - int stride, TX_TYPE tx_type, - TX_SIZE tx_size_, int eob) { +static void lowbd_inv_txfm2d_add_16x4_ssse3(const int32_t *input, + uint8_t *output, int stride, + TX_TYPE tx_type, TX_SIZE tx_size_, + int eob) { (void)tx_size_; (void)eob; __m128i buf[16]; const TX_SIZE tx_size = TX_16X4; - const int8_t *shift = inv_txfm_shift_ls[tx_size]; + 
const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; const int txw_idx = get_txw_idx(tx_size); const int txh_idx = get_txh_idx(tx_size); - const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx]; - const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx]; + const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx]; const int txfm_size_col = tx_size_wide[tx_size]; const int txfm_size_row = tx_size_high[tx_size]; const int buf_size_w_div8 = txfm_size_col >> 3; @@ -2862,8 +2879,22 @@ void lowbd_inv_txfm2d_add_16x4_ssse3(const int32_t *input, uint8_t *output, txfm_size_row); transpose_16bit_8x4(buf_cur, buf_cur); } - row_txfm(buf, buf, cos_bit_row); - round_shift_16bit_ssse3(buf, txfm_size_col, shift[0]); + if (row_txfm == iidentity16_ssse3) { + const __m128i scale = pair_set_epi16(2 * NewSqrt2, 3 << (NewSqrt2Bits - 1)); + const __m128i ones = _mm_set1_epi16(1); + for (int j = 0; j < 16; ++j) { + const __m128i buf_lo = _mm_unpacklo_epi16(buf[j], ones); + const __m128i buf_hi = _mm_unpackhi_epi16(buf[j], ones); + const __m128i buf_32_lo = + _mm_srai_epi32(_mm_madd_epi16(buf_lo, scale), (NewSqrt2Bits + 1)); + const __m128i buf_32_hi = + _mm_srai_epi32(_mm_madd_epi16(buf_hi, scale), (NewSqrt2Bits + 1)); + buf[j] = _mm_packs_epi32(buf_32_lo, buf_32_hi); + } + } else { + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit_ssse3(buf, txfm_size_col, shift[0]); + } if (lr_flip) { __m128i temp[16]; flip_buf_sse2(buf, temp, 16); @@ -2911,12 +2942,14 @@ void av1_lowbd_inv_txfm2d_add_ssse3(const int32_t *input, uint8_t *output, break; } } + void av1_inv_txfm_add_ssse3(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param) { - const TX_TYPE tx_type = txfm_param->tx_type; if (!txfm_param->lossless) { + const TX_TYPE tx_type = txfm_param->tx_type; av1_lowbd_inv_txfm2d_add_ssse3(dqcoeff, dst, stride, tx_type, txfm_param->tx_size, txfm_param->eob); + } else { av1_inv_txfm_add_c(dqcoeff, 
dst, stride, txfm_param); } diff --git a/media/libaom/src/av1/common/x86/av1_inv_txfm_ssse3.h b/media/libaom/src/av1/common/x86/av1_inv_txfm_ssse3.h index 66bd339d1..7d5055deb 100644 --- a/media/libaom/src/av1/common/x86/av1_inv_txfm_ssse3.h +++ b/media/libaom/src/av1/common/x86/av1_inv_txfm_ssse3.h @@ -72,13 +72,13 @@ static INLINE void round_shift_16bit_ssse3(__m128i *in, int size, int bit) { } // 1D itx types -typedef enum ATTRIBUTE_PACKED { +enum { IDCT_1D, IADST_1D, IFLIPADST_1D = IADST_1D, IIDENTITY_1D, ITX_TYPES_1D, -} ITX_TYPE_1D; +} UENUM1BYTE(ITX_TYPE_1D); static const ITX_TYPE_1D vitx_1d_tab[TX_TYPES] = { IDCT_1D, IADST_1D, IDCT_1D, IADST_1D, diff --git a/media/libaom/src/av1/common/x86/av1_txfm_sse4.c b/media/libaom/src/av1/common/x86/av1_txfm_sse4.c index 90b9879cc..65ccd1952 100644 --- a/media/libaom/src/av1/common/x86/av1_txfm_sse4.c +++ b/media/libaom/src/av1/common/x86/av1_txfm_sse4.c @@ -9,7 +9,7 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" #include "av1/common/av1_txfm.h" #include "av1/common/x86/av1_txfm_sse4.h" diff --git a/media/libaom/src/av1/common/x86/cfl_avx2.c b/media/libaom/src/av1/common/x86/cfl_avx2.c index a8bfdcce6..d9c6f99d5 100644 --- a/media/libaom/src/av1/common/x86/cfl_avx2.c +++ b/media/libaom/src/av1/common/x86/cfl_avx2.c @@ -16,34 +16,34 @@ #include "av1/common/x86/cfl_simd.h" -#define CFL_GET_SUBSAMPLE_FUNCTION_AVX2(sub, bd) \ - CFL_SUBSAMPLE(avx2, sub, bd, 32, 32) \ - CFL_SUBSAMPLE(avx2, sub, bd, 32, 16) \ - CFL_SUBSAMPLE(avx2, sub, bd, 32, 8) \ - cfl_subsample_##bd##_fn cfl_get_luma_subsampling_##sub##_##bd##_avx2( \ - TX_SIZE tx_size) { \ - static const cfl_subsample_##bd##_fn subfn_##sub[TX_SIZES_ALL] = { \ - subsample_##bd##_##sub##_4x4_ssse3, /* 4x4 */ \ - subsample_##bd##_##sub##_8x8_ssse3, /* 8x8 */ \ - subsample_##bd##_##sub##_16x16_ssse3, /* 16x16 */ \ - subsample_##bd##_##sub##_32x32_avx2, /* 32x32 */ \ - 
cfl_subsample_##bd##_null, /* 64x64 (invalid CFL size) */ \ - subsample_##bd##_##sub##_4x8_ssse3, /* 4x8 */ \ - subsample_##bd##_##sub##_8x4_ssse3, /* 8x4 */ \ - subsample_##bd##_##sub##_8x16_ssse3, /* 8x16 */ \ - subsample_##bd##_##sub##_16x8_ssse3, /* 16x8 */ \ - subsample_##bd##_##sub##_16x32_ssse3, /* 16x32 */ \ - subsample_##bd##_##sub##_32x16_avx2, /* 32x16 */ \ - cfl_subsample_##bd##_null, /* 32x64 (invalid CFL size) */ \ - cfl_subsample_##bd##_null, /* 64x32 (invalid CFL size) */ \ - subsample_##bd##_##sub##_4x16_ssse3, /* 4x16 */ \ - subsample_##bd##_##sub##_16x4_ssse3, /* 16x4 */ \ - subsample_##bd##_##sub##_8x32_ssse3, /* 8x32 */ \ - subsample_##bd##_##sub##_32x8_avx2, /* 32x8 */ \ - cfl_subsample_##bd##_null, /* 16x64 (invalid CFL size) */ \ - cfl_subsample_##bd##_null, /* 64x16 (invalid CFL size) */ \ - }; \ - return subfn_##sub[tx_size]; \ +#define CFL_GET_SUBSAMPLE_FUNCTION_AVX2(sub, bd) \ + CFL_SUBSAMPLE(avx2, sub, bd, 32, 32) \ + CFL_SUBSAMPLE(avx2, sub, bd, 32, 16) \ + CFL_SUBSAMPLE(avx2, sub, bd, 32, 8) \ + cfl_subsample_##bd##_fn cfl_get_luma_subsampling_##sub##_##bd##_avx2( \ + TX_SIZE tx_size) { \ + static const cfl_subsample_##bd##_fn subfn_##sub[TX_SIZES_ALL] = { \ + cfl_subsample_##bd##_##sub##_4x4_ssse3, /* 4x4 */ \ + cfl_subsample_##bd##_##sub##_8x8_ssse3, /* 8x8 */ \ + cfl_subsample_##bd##_##sub##_16x16_ssse3, /* 16x16 */ \ + cfl_subsample_##bd##_##sub##_32x32_avx2, /* 32x32 */ \ + NULL, /* 64x64 (invalid CFL size) */ \ + cfl_subsample_##bd##_##sub##_4x8_ssse3, /* 4x8 */ \ + cfl_subsample_##bd##_##sub##_8x4_ssse3, /* 8x4 */ \ + cfl_subsample_##bd##_##sub##_8x16_ssse3, /* 8x16 */ \ + cfl_subsample_##bd##_##sub##_16x8_ssse3, /* 16x8 */ \ + cfl_subsample_##bd##_##sub##_16x32_ssse3, /* 16x32 */ \ + cfl_subsample_##bd##_##sub##_32x16_avx2, /* 32x16 */ \ + NULL, /* 32x64 (invalid CFL size) */ \ + NULL, /* 64x32 (invalid CFL size) */ \ + cfl_subsample_##bd##_##sub##_4x16_ssse3, /* 4x16 */ \ + cfl_subsample_##bd##_##sub##_16x4_ssse3, /* 16x4 */ 
\ + cfl_subsample_##bd##_##sub##_8x32_ssse3, /* 8x32 */ \ + cfl_subsample_##bd##_##sub##_32x8_avx2, /* 32x8 */ \ + NULL, /* 16x64 (invalid CFL size) */ \ + NULL, /* 64x16 (invalid CFL size) */ \ + }; \ + return subfn_##sub[tx_size]; \ } /** @@ -147,6 +147,7 @@ static void cfl_luma_subsampling_444_lbd_avx2(const uint8_t *input, CFL_GET_SUBSAMPLE_FUNCTION_AVX2(444, lbd) +#if CONFIG_AV1_HIGHBITDEPTH /** * Adds 4 pixels (in a 2x2 grid) and multiplies them by 2. Resulting in a more * precise version of a box filter 4:2:0 pixel subsampling in Q3. @@ -238,6 +239,7 @@ static void cfl_luma_subsampling_444_hbd_avx2(const uint16_t *input, } CFL_GET_SUBSAMPLE_FUNCTION_AVX2(444, hbd) +#endif // CONFIG_AV1_HIGHBITDEPTH static INLINE __m256i predict_unclipped(const __m256i *input, __m256i alpha_q12, __m256i alpha_sign, __m256i dc_q0) { @@ -273,33 +275,34 @@ CFL_PREDICT_X(avx2, 32, 8, lbd); CFL_PREDICT_X(avx2, 32, 16, lbd); CFL_PREDICT_X(avx2, 32, 32, lbd); -cfl_predict_lbd_fn get_predict_lbd_fn_avx2(TX_SIZE tx_size) { +cfl_predict_lbd_fn cfl_get_predict_lbd_fn_avx2(TX_SIZE tx_size) { static const cfl_predict_lbd_fn pred[TX_SIZES_ALL] = { - predict_lbd_4x4_ssse3, /* 4x4 */ - predict_lbd_8x8_ssse3, /* 8x8 */ - predict_lbd_16x16_ssse3, /* 16x16 */ - predict_lbd_32x32_avx2, /* 32x32 */ - cfl_predict_lbd_null, /* 64x64 (invalid CFL size) */ - predict_lbd_4x8_ssse3, /* 4x8 */ - predict_lbd_8x4_ssse3, /* 8x4 */ - predict_lbd_8x16_ssse3, /* 8x16 */ - predict_lbd_16x8_ssse3, /* 16x8 */ - predict_lbd_16x32_ssse3, /* 16x32 */ - predict_lbd_32x16_avx2, /* 32x16 */ - cfl_predict_lbd_null, /* 32x64 (invalid CFL size) */ - cfl_predict_lbd_null, /* 64x32 (invalid CFL size) */ - predict_lbd_4x16_ssse3, /* 4x16 */ - predict_lbd_16x4_ssse3, /* 16x4 */ - predict_lbd_8x32_ssse3, /* 8x32 */ - predict_lbd_32x8_avx2, /* 32x8 */ - cfl_predict_lbd_null, /* 16x64 (invalid CFL size) */ - cfl_predict_lbd_null, /* 64x16 (invalid CFL size) */ + cfl_predict_lbd_4x4_ssse3, /* 4x4 */ + cfl_predict_lbd_8x8_ssse3, 
/* 8x8 */ + cfl_predict_lbd_16x16_ssse3, /* 16x16 */ + cfl_predict_lbd_32x32_avx2, /* 32x32 */ + NULL, /* 64x64 (invalid CFL size) */ + cfl_predict_lbd_4x8_ssse3, /* 4x8 */ + cfl_predict_lbd_8x4_ssse3, /* 8x4 */ + cfl_predict_lbd_8x16_ssse3, /* 8x16 */ + cfl_predict_lbd_16x8_ssse3, /* 16x8 */ + cfl_predict_lbd_16x32_ssse3, /* 16x32 */ + cfl_predict_lbd_32x16_avx2, /* 32x16 */ + NULL, /* 32x64 (invalid CFL size) */ + NULL, /* 64x32 (invalid CFL size) */ + cfl_predict_lbd_4x16_ssse3, /* 4x16 */ + cfl_predict_lbd_16x4_ssse3, /* 16x4 */ + cfl_predict_lbd_8x32_ssse3, /* 8x32 */ + cfl_predict_lbd_32x8_avx2, /* 32x8 */ + NULL, /* 16x64 (invalid CFL size) */ + NULL, /* 64x16 (invalid CFL size) */ }; // Modulo TX_SIZES_ALL to ensure that an attacker won't be able to index the // function pointer array out of bounds. return pred[tx_size % TX_SIZES_ALL]; } +#if CONFIG_AV1_HIGHBITDEPTH static __m256i highbd_max_epi16(int bd) { const __m256i neg_one = _mm256_set1_epi16(-1); // (1 << bd) - 1 => -(-1 << bd) -1 => -1 - (-1 << bd) => -1 ^ (-1 << bd) @@ -346,32 +349,33 @@ CFL_PREDICT_X(avx2, 32, 8, hbd) CFL_PREDICT_X(avx2, 32, 16, hbd) CFL_PREDICT_X(avx2, 32, 32, hbd) -cfl_predict_hbd_fn get_predict_hbd_fn_avx2(TX_SIZE tx_size) { +cfl_predict_hbd_fn cfl_get_predict_hbd_fn_avx2(TX_SIZE tx_size) { static const cfl_predict_hbd_fn pred[TX_SIZES_ALL] = { - predict_hbd_4x4_ssse3, /* 4x4 */ - predict_hbd_8x8_ssse3, /* 8x8 */ - predict_hbd_16x16_avx2, /* 16x16 */ - predict_hbd_32x32_avx2, /* 32x32 */ - cfl_predict_hbd_null, /* 64x64 (invalid CFL size) */ - predict_hbd_4x8_ssse3, /* 4x8 */ - predict_hbd_8x4_ssse3, /* 8x4 */ - predict_hbd_8x16_ssse3, /* 8x16 */ - predict_hbd_16x8_avx2, /* 16x8 */ - predict_hbd_16x32_avx2, /* 16x32 */ - predict_hbd_32x16_avx2, /* 32x16 */ - cfl_predict_hbd_null, /* 32x64 (invalid CFL size) */ - cfl_predict_hbd_null, /* 64x32 (invalid CFL size) */ - predict_hbd_4x16_ssse3, /* 4x16 */ - predict_hbd_16x4_avx2, /* 16x4 */ - predict_hbd_8x32_ssse3, /* 8x32 */ - 
predict_hbd_32x8_avx2, /* 32x8 */ - cfl_predict_hbd_null, /* 16x64 (invalid CFL size) */ - cfl_predict_hbd_null, /* 64x16 (invalid CFL size) */ + cfl_predict_hbd_4x4_ssse3, /* 4x4 */ + cfl_predict_hbd_8x8_ssse3, /* 8x8 */ + cfl_predict_hbd_16x16_avx2, /* 16x16 */ + cfl_predict_hbd_32x32_avx2, /* 32x32 */ + NULL, /* 64x64 (invalid CFL size) */ + cfl_predict_hbd_4x8_ssse3, /* 4x8 */ + cfl_predict_hbd_8x4_ssse3, /* 8x4 */ + cfl_predict_hbd_8x16_ssse3, /* 8x16 */ + cfl_predict_hbd_16x8_avx2, /* 16x8 */ + cfl_predict_hbd_16x32_avx2, /* 16x32 */ + cfl_predict_hbd_32x16_avx2, /* 32x16 */ + NULL, /* 32x64 (invalid CFL size) */ + NULL, /* 64x32 (invalid CFL size) */ + cfl_predict_hbd_4x16_ssse3, /* 4x16 */ + cfl_predict_hbd_16x4_avx2, /* 16x4 */ + cfl_predict_hbd_8x32_ssse3, /* 8x32 */ + cfl_predict_hbd_32x8_avx2, /* 32x8 */ + NULL, /* 16x64 (invalid CFL size) */ + NULL, /* 64x16 (invalid CFL size) */ }; // Modulo TX_SIZES_ALL to ensure that an attacker won't be able to index the // function pointer array out of bounds. return pred[tx_size % TX_SIZES_ALL]; } +#endif // CONFIG_AV1_HIGHBITDEPTH // Returns a vector where all the (32-bits) elements are the sum of all the // lanes in a. @@ -463,27 +467,27 @@ CFL_SUB_AVG_X(avx2, 32, 32, 512, 10) // Based on the observation that for small blocks AVX2 does not outperform // SSE2, we call the SSE2 code for block widths 4 and 8. 
-cfl_subtract_average_fn get_subtract_average_fn_avx2(TX_SIZE tx_size) { +cfl_subtract_average_fn cfl_get_subtract_average_fn_avx2(TX_SIZE tx_size) { static const cfl_subtract_average_fn sub_avg[TX_SIZES_ALL] = { - subtract_average_4x4_sse2, /* 4x4 */ - subtract_average_8x8_sse2, /* 8x8 */ - subtract_average_16x16_avx2, /* 16x16 */ - subtract_average_32x32_avx2, /* 32x32 */ - cfl_subtract_average_null, /* 64x64 (invalid CFL size) */ - subtract_average_4x8_sse2, /* 4x8 */ - subtract_average_8x4_sse2, /* 8x4 */ - subtract_average_8x16_sse2, /* 8x16 */ - subtract_average_16x8_avx2, /* 16x8 */ - subtract_average_16x32_avx2, /* 16x32 */ - subtract_average_32x16_avx2, /* 32x16 */ - cfl_subtract_average_null, /* 32x64 (invalid CFL size) */ - cfl_subtract_average_null, /* 64x32 (invalid CFL size) */ - subtract_average_4x16_sse2, /* 4x16 */ - subtract_average_16x4_avx2, /* 16x4 */ - subtract_average_8x32_sse2, /* 8x32 */ - subtract_average_32x8_avx2, /* 32x8 */ - cfl_subtract_average_null, /* 16x64 (invalid CFL size) */ - cfl_subtract_average_null, /* 64x16 (invalid CFL size) */ + cfl_subtract_average_4x4_sse2, /* 4x4 */ + cfl_subtract_average_8x8_sse2, /* 8x8 */ + cfl_subtract_average_16x16_avx2, /* 16x16 */ + cfl_subtract_average_32x32_avx2, /* 32x32 */ + NULL, /* 64x64 (invalid CFL size) */ + cfl_subtract_average_4x8_sse2, /* 4x8 */ + cfl_subtract_average_8x4_sse2, /* 8x4 */ + cfl_subtract_average_8x16_sse2, /* 8x16 */ + cfl_subtract_average_16x8_avx2, /* 16x8 */ + cfl_subtract_average_16x32_avx2, /* 16x32 */ + cfl_subtract_average_32x16_avx2, /* 32x16 */ + NULL, /* 32x64 (invalid CFL size) */ + NULL, /* 64x32 (invalid CFL size) */ + cfl_subtract_average_4x16_sse2, /* 4x16 */ + cfl_subtract_average_16x4_avx2, /* 16x4 */ + cfl_subtract_average_8x32_sse2, /* 8x32 */ + cfl_subtract_average_32x8_avx2, /* 32x8 */ + NULL, /* 16x64 (invalid CFL size) */ + NULL, /* 64x16 (invalid CFL size) */ }; // Modulo TX_SIZES_ALL to ensure that an attacker won't be able to // index the 
function pointer array out of bounds. diff --git a/media/libaom/src/av1/common/x86/cfl_simd.h b/media/libaom/src/av1/common/x86/cfl_simd.h index 3b342cd4e..03ae02a92 100644 --- a/media/libaom/src/av1/common/x86/cfl_simd.h +++ b/media/libaom/src/av1/common/x86/cfl_simd.h @@ -15,229 +15,232 @@ #include "av1/common/blockd.h" // SSSE3 version is optimal for with == 4, we reuse them in AVX2 -void subsample_lbd_420_4x4_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); -void subsample_lbd_420_4x8_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); -void subsample_lbd_420_4x16_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); +void cfl_subsample_lbd_420_4x4_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_420_4x8_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_420_4x16_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); // SSSE3 version is optimal for with == 8, we reuse it in AVX2 -void subsample_lbd_420_8x4_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); -void subsample_lbd_420_8x8_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); -void subsample_lbd_420_8x16_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); -void subsample_lbd_420_8x32_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); +void cfl_subsample_lbd_420_8x4_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_420_8x8_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_420_8x16_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_420_8x32_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); // SSSE3 version is optimal for with == 16, we reuse it in AVX2 -void subsample_lbd_420_16x4_ssse3(const uint8_t *input, 
int input_stride, - uint16_t *output_q3); -void subsample_lbd_420_16x8_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); -void subsample_lbd_420_16x16_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); -void subsample_lbd_420_16x32_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); +void cfl_subsample_lbd_420_16x4_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_420_16x8_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_420_16x16_ssse3(const uint8_t *cfl_type, + int input_stride, uint16_t *output_q3); +void cfl_subsample_lbd_420_16x32_ssse3(const uint8_t *cfl_type, + int input_stride, uint16_t *output_q3); // SSSE3 version is optimal for with == 4, we reuse them in AVX2 -void subsample_lbd_422_4x4_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); -void subsample_lbd_422_4x8_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); -void subsample_lbd_422_4x16_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); +void cfl_subsample_lbd_422_4x4_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_422_4x8_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_422_4x16_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); // SSSE3 version is optimal for with == 8, we reuse it in AVX2 -void subsample_lbd_422_8x4_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); -void subsample_lbd_422_8x8_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); -void subsample_lbd_422_8x16_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); -void subsample_lbd_422_8x32_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); +void cfl_subsample_lbd_422_8x4_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t 
*output_q3); +void cfl_subsample_lbd_422_8x8_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_422_8x16_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_422_8x32_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); // SSSE3 version is optimal for with == 16, we reuse it in AVX2 -void subsample_lbd_422_16x4_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); -void subsample_lbd_422_16x8_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); -void subsample_lbd_422_16x16_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); -void subsample_lbd_422_16x32_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); +void cfl_subsample_lbd_422_16x4_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_422_16x8_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_422_16x16_ssse3(const uint8_t *cfl_type, + int input_stride, uint16_t *output_q3); +void cfl_subsample_lbd_422_16x32_ssse3(const uint8_t *cfl_type, + int input_stride, uint16_t *output_q3); // SSSE3 version is optimal for with == 4, we reuse them in AVX2 -void subsample_lbd_444_4x4_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); -void subsample_lbd_444_4x8_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); -void subsample_lbd_444_4x16_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); +void cfl_subsample_lbd_444_4x4_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_444_4x8_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_444_4x16_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); // SSSE3 version is optimal for with == 8, we reuse it in AVX2 -void subsample_lbd_444_8x4_ssse3(const 
uint8_t *input, int input_stride, - uint16_t *output_q3); -void subsample_lbd_444_8x8_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); -void subsample_lbd_444_8x16_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); -void subsample_lbd_444_8x32_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); +void cfl_subsample_lbd_444_8x4_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_444_8x8_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_444_8x16_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_444_8x32_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); // SSSE3 version is optimal for with == 16, we reuse it in AVX2 -void subsample_lbd_444_16x4_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); -void subsample_lbd_444_16x8_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); -void subsample_lbd_444_16x16_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); -void subsample_lbd_444_16x32_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); - -void subsample_hbd_420_4x4_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); -void subsample_hbd_420_4x8_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); -void subsample_hbd_420_4x16_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); +void cfl_subsample_lbd_444_16x4_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_444_16x8_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_444_16x16_ssse3(const uint8_t *cfl_type, + int input_stride, uint16_t *output_q3); +void cfl_subsample_lbd_444_16x32_ssse3(const uint8_t *cfl_type, + int input_stride, uint16_t *output_q3); + +#if CONFIG_AV1_HIGHBITDEPTH 
+void cfl_subsample_hbd_420_4x4_ssse3(const uint16_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_hbd_420_4x8_ssse3(const uint16_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_hbd_420_4x16_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); // SSSE3 version is optimal for with == 8, we reuse it in AVX2 -void subsample_hbd_420_8x4_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); -void subsample_hbd_420_8x8_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); -void subsample_hbd_420_8x16_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); -void subsample_hbd_420_8x32_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); +void cfl_subsample_hbd_420_8x4_ssse3(const uint16_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_hbd_420_8x8_ssse3(const uint16_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_hbd_420_8x16_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); +void cfl_subsample_hbd_420_8x32_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); // SSSE3 version is faster for with == 16, we reuse it in AVX2 -void subsample_hbd_420_16x4_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); -void subsample_hbd_420_16x8_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); -void subsample_hbd_420_16x16_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); -void subsample_hbd_420_16x32_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); - -void subsample_hbd_422_4x4_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); -void subsample_hbd_422_4x8_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); -void subsample_hbd_422_4x16_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); +void 
cfl_subsample_hbd_420_16x4_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); +void cfl_subsample_hbd_420_16x8_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); +void cfl_subsample_hbd_420_16x16_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); +void cfl_subsample_hbd_420_16x32_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); + +void cfl_subsample_hbd_422_4x4_ssse3(const uint16_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_hbd_422_4x8_ssse3(const uint16_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_hbd_422_4x16_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); // SSSE3 version is optimal for with == 8, we reuse it in AVX2 -void subsample_hbd_422_8x4_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); -void subsample_hbd_422_8x8_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); -void subsample_hbd_422_8x16_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); -void subsample_hbd_422_8x32_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); +void cfl_subsample_hbd_422_8x4_ssse3(const uint16_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_hbd_422_8x8_ssse3(const uint16_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_hbd_422_8x16_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); +void cfl_subsample_hbd_422_8x32_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); // SSSE3 version is faster for with == 16, we reuse it in AVX2 -void subsample_hbd_422_16x4_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); -void subsample_hbd_422_16x8_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); -void subsample_hbd_422_16x16_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); 
-void subsample_hbd_422_16x32_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); - -void subsample_hbd_444_4x4_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); -void subsample_hbd_444_4x8_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); -void subsample_hbd_444_4x16_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); +void cfl_subsample_hbd_422_16x4_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); +void cfl_subsample_hbd_422_16x8_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); +void cfl_subsample_hbd_422_16x16_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); +void cfl_subsample_hbd_422_16x32_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); + +void cfl_subsample_hbd_444_4x4_ssse3(const uint16_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_hbd_444_4x8_ssse3(const uint16_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_hbd_444_4x16_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); // SSSE3 version is optimal for with == 8, we reuse it in AVX2 -void subsample_hbd_444_8x4_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); -void subsample_hbd_444_8x8_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); -void subsample_hbd_444_8x16_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); -void subsample_hbd_444_8x32_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); +void cfl_subsample_hbd_444_8x4_ssse3(const uint16_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_hbd_444_8x8_ssse3(const uint16_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_hbd_444_8x16_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); +void cfl_subsample_hbd_444_8x32_ssse3(const uint16_t *cfl_type, + 
int input_stride, uint16_t *output_q3); // SSSE3 version is faster for with == 16, we reuse it in AVX2 -void subsample_hbd_444_16x4_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); -void subsample_hbd_444_16x8_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); -void subsample_hbd_444_16x16_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); -void subsample_hbd_444_16x32_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); +void cfl_subsample_hbd_444_16x4_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); +void cfl_subsample_hbd_444_16x8_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); +void cfl_subsample_hbd_444_16x16_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); +void cfl_subsample_hbd_444_16x32_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); +#endif // CONFIG_AV1_HIGHBITDEPTH // SSE2 version is optimal for with == 4, we reuse them in AVX2 -void subtract_average_4x4_sse2(const uint16_t *src, int16_t *dst); -void subtract_average_4x8_sse2(const uint16_t *src, int16_t *dst); -void subtract_average_4x16_sse2(const uint16_t *src, int16_t *dst); +void cfl_subtract_average_4x4_sse2(const uint16_t *src, int16_t *dst); +void cfl_subtract_average_4x8_sse2(const uint16_t *src, int16_t *dst); +void cfl_subtract_average_4x16_sse2(const uint16_t *src, int16_t *dst); // SSE2 version is optimal for with == 8, we reuse them in AVX2 -void subtract_average_8x4_sse2(const uint16_t *src, int16_t *dst); -void subtract_average_8x8_sse2(const uint16_t *src, int16_t *dst); -void subtract_average_8x16_sse2(const uint16_t *src, int16_t *dst); -void subtract_average_8x32_sse2(const uint16_t *src, int16_t *dst); - -void predict_lbd_4x4_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, - int dst_stride, int alpha_q3); -void predict_lbd_4x8_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, - int dst_stride, int 
alpha_q3); -void predict_lbd_4x16_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, - int dst_stride, int alpha_q3); - -void predict_lbd_8x4_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, - int dst_stride, int alpha_q3); -void predict_lbd_8x8_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, - int dst_stride, int alpha_q3); -void predict_lbd_8x16_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, - int dst_stride, int alpha_q3); -void predict_lbd_8x32_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, - int dst_stride, int alpha_q3); - -void predict_lbd_16x4_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, - int dst_stride, int alpha_q3); -void predict_lbd_16x8_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, - int dst_stride, int alpha_q3); -void predict_lbd_16x16_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, - int dst_stride, int alpha_q3); -void predict_lbd_16x32_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, - int dst_stride, int alpha_q3); - -void predict_hbd_4x4_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, - int dst_stride, int alpha_q3, int bd); -void predict_hbd_4x8_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, - int dst_stride, int alpha_q3, int bd); -void predict_hbd_4x16_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, - int dst_stride, int alpha_q3, int bd); - -void predict_hbd_8x4_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, - int dst_stride, int alpha_q3, int bd); -void predict_hbd_8x8_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, - int dst_stride, int alpha_q3, int bd); -void predict_hbd_8x16_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, - int dst_stride, int alpha_q3, int bd); -void predict_hbd_8x32_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, - int dst_stride, int alpha_q3, int bd); - -void predict_hbd_16x4_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, - int dst_stride, int alpha_q3, int bd); -void predict_hbd_16x8_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, - int dst_stride, int alpha_q3, int bd); -void 
predict_hbd_16x16_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, - int dst_stride, int alpha_q3, int bd); -void predict_hbd_16x32_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, - int dst_stride, int alpha_q3, int bd); - +void cfl_subtract_average_8x4_sse2(const uint16_t *src, int16_t *dst); +void cfl_subtract_average_8x8_sse2(const uint16_t *src, int16_t *dst); +void cfl_subtract_average_8x16_sse2(const uint16_t *src, int16_t *dst); +void cfl_subtract_average_8x32_sse2(const uint16_t *src, int16_t *dst); + +void cfl_predict_lbd_4x4_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, + int dst_stride, int alpha_q3); +void cfl_predict_lbd_4x8_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, + int dst_stride, int alpha_q3); +void cfl_predict_lbd_4x16_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, + int dst_stride, int alpha_q3); + +void cfl_predict_lbd_8x4_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, + int dst_stride, int alpha_q3); +void cfl_predict_lbd_8x8_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, + int dst_stride, int alpha_q3); +void cfl_predict_lbd_8x16_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, + int dst_stride, int alpha_q3); +void cfl_predict_lbd_8x32_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, + int dst_stride, int alpha_q3); + +void cfl_predict_lbd_16x4_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, + int dst_stride, int alpha_q3); +void cfl_predict_lbd_16x8_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, + int dst_stride, int alpha_q3); +void cfl_predict_lbd_16x16_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, + int dst_stride, int alpha_q3); +void cfl_predict_lbd_16x32_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, + int dst_stride, int alpha_q3); + +#if CONFIG_AV1_HIGHBITDEPTH +void cfl_predict_hbd_4x4_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, + int dst_stride, int alpha_q3, int bd); +void cfl_predict_hbd_4x8_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, + int dst_stride, int alpha_q3, int bd); +void 
cfl_predict_hbd_4x16_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, + int dst_stride, int alpha_q3, int bd); + +void cfl_predict_hbd_8x4_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, + int dst_stride, int alpha_q3, int bd); +void cfl_predict_hbd_8x8_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, + int dst_stride, int alpha_q3, int bd); +void cfl_predict_hbd_8x16_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, + int dst_stride, int alpha_q3, int bd); +void cfl_predict_hbd_8x32_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, + int dst_stride, int alpha_q3, int bd); + +void cfl_predict_hbd_16x4_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, + int dst_stride, int alpha_q3, int bd); +void cfl_predict_hbd_16x8_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, + int dst_stride, int alpha_q3, int bd); +void cfl_predict_hbd_16x16_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, + int dst_stride, int alpha_q3, int bd); +void cfl_predict_hbd_16x32_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, + int dst_stride, int alpha_q3, int bd); +#endif // CONFIG_AV1_HIGHBITDEPTH #endif // AOM_AV1_COMMON_X86_CFL_SIMD_H_ diff --git a/media/libaom/src/av1/common/x86/cfl_ssse3.c b/media/libaom/src/av1/common/x86/cfl_ssse3.c index bbf007295..476b6609a 100644 --- a/media/libaom/src/av1/common/x86/cfl_ssse3.c +++ b/media/libaom/src/av1/common/x86/cfl_ssse3.c @@ -168,6 +168,7 @@ static INLINE void cfl_luma_subsampling_444_lbd_ssse3(const uint8_t *input, } while (pred_buf_m128i < end); } +#if CONFIG_AV1_HIGHBITDEPTH /** * Adds 4 pixels (in a 2x2 grid) and multiplies them by 2. Resulting in a more * precise version of a box filter 4:2:0 pixel subsampling in Q3. 
@@ -296,6 +297,7 @@ static INLINE void cfl_luma_subsampling_444_hbd_ssse3(const uint16_t *input, pred_buf_q3 += CFL_BUF_LINE; } while (pred_buf_q3 < end); } +#endif // CONFIG_AV1_HIGHBITDEPTH CFL_GET_SUBSAMPLE_FUNCTION(ssse3) @@ -341,6 +343,7 @@ static INLINE void cfl_predict_lbd_ssse3(const int16_t *pred_buf_q3, CFL_PREDICT_FN(ssse3, lbd) +#if CONFIG_AV1_HIGHBITDEPTH static INLINE __m128i highbd_max_epi16(int bd) { const __m128i neg_one = _mm_set1_epi16(-1); // (1 << bd) - 1 => -(-1 << bd) -1 => -1 - (-1 << bd) => -1 ^ (-1 << bd) @@ -391,3 +394,4 @@ static INLINE void cfl_predict_hbd_ssse3(const int16_t *pred_buf_q3, } CFL_PREDICT_FN(ssse3, hbd) +#endif // CONFIG_AV1_HIGHBITDEPTH diff --git a/media/libaom/src/av1/common/x86/convolve_2d_avx2.c b/media/libaom/src/av1/common/x86/convolve_2d_avx2.c index 0acafd044..e19575d72 100644 --- a/media/libaom/src/av1/common/x86/convolve_2d_avx2.c +++ b/media/libaom/src/av1/common/x86/convolve_2d_avx2.c @@ -24,34 +24,18 @@ void av1_convolve_2d_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, + const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params) { const int bd = 8; - - DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]); - int im_h = h + filter_params_y->taps - 1; int im_stride = 8; - int i, j; - const int fo_vert = filter_params_y->taps / 2 - 1; - const int fo_horiz = filter_params_x->taps / 2 - 1; - const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; - + int i, is_horiz_4tap = 0, is_vert_4tap = 0; + DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]); const int bits = FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1; const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; - __m256i filt[4], coeffs_h[4], coeffs_v[4]; - assert(conv_params->round_0 
> 0); - filt[0] = _mm256_load_si256((__m256i const *)filt_global_avx2); - filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32)); - filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); - filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3)); - - prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs_h); - prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_v); - const __m256i round_const_h = _mm256_set1_epi16( ((1 << (conv_params->round_0 - 1)) >> 1) + (1 << (bd + FILTER_BITS - 2))); const __m128i round_shift_h = _mm_cvtsi32_si128(conv_params->round_0 - 1); @@ -65,58 +49,96 @@ void av1_convolve_2d_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, ((1 << (offset_bits - conv_params->round_1)) >> 1)); const __m128i round_shift_v = _mm_cvtsi32_si128(bits); - for (j = 0; j < w; j += 8) { - for (i = 0; i < im_h; i += 2) { - __m256i data = _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j])); + __m256i filt[4], coeffs_h[4], coeffs_v[4]; + + filt[0] = _mm256_load_si256((__m256i const *)(filt_global_avx2)); + filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32)); + + prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs_h); + prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_v); + + // Condition for checking valid horz_filt taps + if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs_h[0], coeffs_h[3]), 0))) + is_horiz_4tap = 1; + + // Condition for checking valid vert_filt taps + if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs_v[0], coeffs_v[3]), 0))) + is_vert_4tap = 1; - // Load the next line - if (i + 1 < im_h) + // horz_filt as 4 tap and vert_filt as 8 tap + if (is_horiz_4tap) { + int im_h = h + filter_params_y->taps - 1; + const int fo_vert = filter_params_y->taps / 2 - 1; + const int fo_horiz = 1; + const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; + + // horz-filter + for (int j = 0; j < w; j += 8) { + for (i = 0; i < (im_h - 
2); i += 2) { + __m256i data = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j])); + + // Load the next line data = _mm256_inserti128_si256( data, _mm_loadu_si128( (__m128i *)&src_ptr[(i * src_stride) + j + src_stride]), 1); + __m256i res = convolve_lowbd_x_4tap(data, coeffs_h + 1, filt); + + res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), + round_shift_h); + _mm256_store_si256((__m256i *)&im_block[i * im_stride], res); + } - __m256i res = convolve_lowbd_x(data, coeffs_h, filt); + __m256i data_1 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j])); + __m256i res = convolve_lowbd_x_4tap(data_1, coeffs_h + 1, filt); res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h); - _mm256_store_si256((__m256i *)&im_block[i * im_stride], res); - } - /* Vertical filter */ - { + // vert filter + CONVOLVE_SR_VERTICAL_FILTER_8TAP; + } + } else if (is_vert_4tap) { + int im_h = h + 3; + const int fo_vert = 1; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; + + filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); + filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3)); + + for (int j = 0; j < w; j += 8) { + // horz_filter + CONVOLVE_SR_HORIZONTAL_FILTER_8TAP; + // vert_filter + __m256i s[6]; __m256i src_0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride)); __m256i src_1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride)); __m256i src_2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride)); __m256i src_3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride)); - __m256i src_4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride)); - __m256i src_5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride)); - __m256i s[8]; s[0] = _mm256_unpacklo_epi16(src_0, src_1); s[1] = _mm256_unpacklo_epi16(src_2, src_3); - s[2] = 
_mm256_unpacklo_epi16(src_4, src_5); - - s[4] = _mm256_unpackhi_epi16(src_0, src_1); - s[5] = _mm256_unpackhi_epi16(src_2, src_3); - s[6] = _mm256_unpackhi_epi16(src_4, src_5); + s[3] = _mm256_unpackhi_epi16(src_0, src_1); + s[4] = _mm256_unpackhi_epi16(src_2, src_3); for (i = 0; i < h; i += 2) { const int16_t *data = &im_block[i * im_stride]; - const __m256i s6 = - _mm256_loadu_si256((__m256i *)(data + 6 * im_stride)); - const __m256i s7 = - _mm256_loadu_si256((__m256i *)(data + 7 * im_stride)); + const __m256i s4 = + _mm256_loadu_si256((__m256i *)(data + 4 * im_stride)); + const __m256i s5 = + _mm256_loadu_si256((__m256i *)(data + 5 * im_stride)); - s[3] = _mm256_unpacklo_epi16(s6, s7); - s[7] = _mm256_unpackhi_epi16(s6, s7); + s[2] = _mm256_unpacklo_epi16(s4, s5); + s[5] = _mm256_unpackhi_epi16(s4, s5); - __m256i res_a = convolve(s, coeffs_v); - __m256i res_b = convolve(s + 4, coeffs_v); + __m256i res_a = convolve_4tap(s, coeffs_v + 1); + __m256i res_b = convolve_4tap(s + 3, coeffs_v + 1); // Combine V round and 2F-H-V round into a single rounding res_a = @@ -154,13 +176,25 @@ void av1_convolve_2d_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, s[0] = s[1]; s[1] = s[2]; - s[2] = s[3]; - + s[3] = s[4]; s[4] = s[5]; - s[5] = s[6]; - s[6] = s[7]; } } + } else { + int j; + int im_h = h + filter_params_y->taps - 1; + const int fo_vert = filter_params_y->taps / 2 - 1; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; + + filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); + filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3)); + + for (j = 0; j < w; j += 8) { + CONVOLVE_SR_HORIZONTAL_FILTER_8TAP; + + CONVOLVE_SR_VERTICAL_FILTER_8TAP; + } } } @@ -180,12 +214,12 @@ void av1_convolve_2d_copy_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const 
InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, + const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params) { (void)filter_params_x; (void)filter_params_y; - (void)subpel_x_q4; - (void)subpel_y_q4; + (void)subpel_x_qn; + (void)subpel_y_qn; (void)conv_params; if (w >= 16) { @@ -195,20 +229,20 @@ void av1_convolve_2d_copy_sr_avx2(const uint8_t *src, int src_stride, if (w == 2) { do { - memcpy(dst, src, 2 * sizeof(*src)); + memmove(dst, src, 2 * sizeof(*src)); src += src_stride; dst += dst_stride; - memcpy(dst, src, 2 * sizeof(*src)); + memmove(dst, src, 2 * sizeof(*src)); src += src_stride; dst += dst_stride; h -= 2; } while (h); } else if (w == 4) { do { - memcpy(dst, src, 4 * sizeof(*src)); + memmove(dst, src, 4 * sizeof(*src)); src += src_stride; dst += dst_stride; - memcpy(dst, src, 4 * sizeof(*src)); + memmove(dst, src, 4 * sizeof(*src)); src += src_stride; dst += dst_stride; h -= 2; diff --git a/media/libaom/src/av1/common/x86/convolve_2d_sse2.c b/media/libaom/src/av1/common/x86/convolve_2d_sse2.c index b1a62a4f6..5376ea79b 100644 --- a/media/libaom/src/av1/common/x86/convolve_2d_sse2.c +++ b/media/libaom/src/av1/common/x86/convolve_2d_sse2.c @@ -22,7 +22,7 @@ void av1_convolve_2d_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, + const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params) { const int bd = 8; @@ -45,7 +45,7 @@ void av1_convolve_2d_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, /* Horizontal filter */ { const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( - filter_params_x, subpel_x_q4 & SUBPEL_MASK); + filter_params_x, subpel_x_qn & SUBPEL_MASK); const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter); // coeffs 0 1 0 1 2 3 2 3 @@ -111,7 +111,7 @@ void 
av1_convolve_2d_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, /* Vertical filter */ { const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( - filter_params_y, subpel_y_q4 & SUBPEL_MASK); + filter_params_y, subpel_y_qn & SUBPEL_MASK); const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter); // coeffs 0 1 0 1 2 3 2 3 @@ -205,7 +205,7 @@ void av1_convolve_2d_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, __m128i *const p = (__m128i *)&dst[i * dst_stride + j]; if (w == 2) { - *(uint16_t *)p = _mm_cvtsi128_si32(res); + *(uint16_t *)p = (uint16_t)_mm_cvtsi128_si32(res); } else if (w == 4) { *(uint32_t *)p = _mm_cvtsi128_si32(res); } else { @@ -240,12 +240,12 @@ void av1_convolve_2d_copy_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, + const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params) { (void)filter_params_x; (void)filter_params_y; - (void)subpel_x_q4; - (void)subpel_y_q4; + (void)subpel_x_qn; + (void)subpel_y_qn; (void)conv_params; if (w >= 16) { @@ -255,20 +255,20 @@ void av1_convolve_2d_copy_sr_sse2(const uint8_t *src, int src_stride, if (w == 2) { do { - memcpy(dst, src, 2 * sizeof(*src)); + memmove(dst, src, 2 * sizeof(*src)); src += src_stride; dst += dst_stride; - memcpy(dst, src, 2 * sizeof(*src)); + memmove(dst, src, 2 * sizeof(*src)); src += src_stride; dst += dst_stride; h -= 2; } while (h); } else if (w == 4) { do { - memcpy(dst, src, 4 * sizeof(*src)); + memmove(dst, src, 4 * sizeof(*src)); src += src_stride; dst += dst_stride; - memcpy(dst, src, 4 * sizeof(*src)); + memmove(dst, src, 4 * sizeof(*src)); src += src_stride; dst += dst_stride; h -= 2; @@ -354,24 +354,23 @@ void av1_convolve_2d_copy_sr_sse2(const uint8_t *src, int src_stride, } } -void av1_jnt_convolve_2d_copy_sse2(const uint8_t *src, int src_stride, - uint8_t 
*dst0, int dst_stride0, int w, int h, - const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, - ConvolveParams *conv_params) { +void av1_dist_wtd_convolve_2d_copy_sse2( + const uint8_t *src, int src_stride, uint8_t *dst0, int dst_stride0, int w, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params) { const int bd = 8; CONV_BUF_TYPE *dst = conv_params->dst; int dst_stride = conv_params->dst_stride; (void)filter_params_x; (void)filter_params_y; - (void)subpel_x_q4; - (void)subpel_y_q4; + (void)subpel_x_qn; + (void)subpel_y_qn; const int bits = FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0; const int do_average = conv_params->do_average; - const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; const __m128i zero = _mm_setzero_si128(); const __m128i left_shift = _mm_cvtsi32_si128(bits); int i, j; @@ -411,14 +410,14 @@ void av1_jnt_convolve_2d_copy_sse2(const uint8_t *src, int src_stride, const __m128i data_ref_0_hi = _mm_loadu_si128((__m128i *)(&dst[j + 8])); - const __m128i comp_avg_res_lo = - comp_avg(&data_ref_0_lo, &res_unsigned_lo, &wt, use_jnt_comp_avg); + const __m128i comp_avg_res_lo = comp_avg( + &data_ref_0_lo, &res_unsigned_lo, &wt, use_dist_wtd_comp_avg); const __m128i round_result_lo = convolve_rounding( &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift); - const __m128i comp_avg_res_hi = - comp_avg(&data_ref_0_hi, &res_unsigned_hi, &wt, use_jnt_comp_avg); + const __m128i comp_avg_res_hi = comp_avg( + &data_ref_0_hi, &res_unsigned_hi, &wt, use_dist_wtd_comp_avg); const __m128i round_result_hi = convolve_rounding( &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift); @@ -449,7 +448,7 @@ void av1_jnt_convolve_2d_copy_sse2(const uint8_t *src, 
int src_stride, const __m128i data_ref_0 = _mm_loadu_si128((__m128i *)(&dst[j])); const __m128i comp_avg_res = - comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg); + comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); const __m128i round_result = convolve_rounding( &comp_avg_res, &offset_const, &rounding_const, rounding_shift); diff --git a/media/libaom/src/av1/common/x86/convolve_avx2.c b/media/libaom/src/av1/common/x86/convolve_avx2.c index 0e91ea947..1d5bc6fbd 100644 --- a/media/libaom/src/av1/common/x86/convolve_avx2.c +++ b/media/libaom/src/av1/common/x86/convolve_avx2.c @@ -21,155 +21,241 @@ void av1_convolve_y_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, + const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params) { - int i, j; - const int fo_vert = filter_params_y->taps / 2 - 1; - const uint8_t *const src_ptr = src - fo_vert * src_stride; - + int i, j, is_vert_4tap = 0; // right shift is F-1 because we are already dividing // filter co-efficients by 2 const int right_shift_bits = (FILTER_BITS - 1); const __m128i right_shift = _mm_cvtsi32_si128(right_shift_bits); const __m256i right_shift_const = _mm256_set1_epi16((1 << right_shift_bits) >> 1); - __m256i coeffs[4], s[8]; assert(conv_params->round_0 <= FILTER_BITS); assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) || ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS))); - prepare_coeffs_lowbd(filter_params_y, subpel_y_q4, coeffs); - (void)filter_params_x; - (void)subpel_x_q4; + (void)subpel_x_qn; (void)conv_params; + __m256i coeffs[4], s[8]; + __m128i d[6]; - for (j = 0; j < w; j += 16) { - const uint8_t *data = &src_ptr[j]; - __m256i src6; - - // Load lines a and b. 
Line a to lower 128, line b to upper 128 - const __m256i src_01a = _mm256_permute2x128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 0 * src_stride))), - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 1 * src_stride))), - 0x20); - - const __m256i src_12a = _mm256_permute2x128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 1 * src_stride))), - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 2 * src_stride))), - 0x20); - - const __m256i src_23a = _mm256_permute2x128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 2 * src_stride))), - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 3 * src_stride))), - 0x20); - - const __m256i src_34a = _mm256_permute2x128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 3 * src_stride))), - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 4 * src_stride))), - 0x20); - - const __m256i src_45a = _mm256_permute2x128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 4 * src_stride))), - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 5 * src_stride))), - 0x20); - - src6 = _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 6 * src_stride))); - const __m256i src_56a = _mm256_permute2x128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 5 * src_stride))), - src6, 0x20); - - s[0] = _mm256_unpacklo_epi8(src_01a, src_12a); - s[1] = _mm256_unpacklo_epi8(src_23a, src_34a); - s[2] = _mm256_unpacklo_epi8(src_45a, src_56a); - - s[4] = _mm256_unpackhi_epi8(src_01a, src_12a); - s[5] = _mm256_unpackhi_epi8(src_23a, src_34a); - s[6] = _mm256_unpackhi_epi8(src_45a, src_56a); - - for (i = 0; i < h; i += 2) { - data = &src_ptr[i * src_stride + j]; - const __m256i src_67a = _mm256_permute2x128_si256( - src6, - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 7 * src_stride))), - 0x20); + prepare_coeffs_lowbd(filter_params_y, subpel_y_qn, 
coeffs); - src6 = _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 8 * src_stride))); - const __m256i src_78a = _mm256_permute2x128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 7 * src_stride))), - src6, 0x20); + // Condition for checking valid vert_filt taps + if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs[0], coeffs[3]), 0))) + is_vert_4tap = 1; + + // vert_filt as 4 tap + if (is_vert_4tap) { + const int fo_vert = 1; + const uint8_t *const src_ptr = src - fo_vert * src_stride; + for (j = 0; j < w; j += 16) { + const uint8_t *data = &src_ptr[j]; + d[0] = _mm_loadu_si128((__m128i *)(data + 0 * src_stride)); + d[1] = _mm_loadu_si128((__m128i *)(data + 1 * src_stride)); + d[2] = _mm_loadu_si128((__m128i *)(data + 2 * src_stride)); + d[3] = _mm_loadu_si128((__m128i *)(data + 3 * src_stride)); + d[4] = _mm_loadu_si128((__m128i *)(data + 4 * src_stride)); + + // Load lines a and b. Line a to lower 128, line b to upper 128 + const __m256i src_01a = _mm256_permute2x128_si256( + _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20); + + const __m256i src_12a = _mm256_permute2x128_si256( + _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20); + + const __m256i src_23a = _mm256_permute2x128_si256( + _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20); + + const __m256i src_34a = _mm256_permute2x128_si256( + _mm256_castsi128_si256(d[3]), _mm256_castsi128_si256(d[4]), 0x20); - s[3] = _mm256_unpacklo_epi8(src_67a, src_78a); - s[7] = _mm256_unpackhi_epi8(src_67a, src_78a); + s[0] = _mm256_unpacklo_epi8(src_01a, src_12a); + s[1] = _mm256_unpacklo_epi8(src_23a, src_34a); - const __m256i res_lo = convolve_lowbd(s, coeffs); + s[3] = _mm256_unpackhi_epi8(src_01a, src_12a); + s[4] = _mm256_unpackhi_epi8(src_23a, src_34a); - /* rounding code */ - // shift by F - 1 - const __m256i res_16b_lo = _mm256_sra_epi16( - _mm256_add_epi16(res_lo, right_shift_const), right_shift); - // 8 bit conversion and saturation 
to uint8 - __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo); + for (i = 0; i < h; i += 2) { + data = &src_ptr[i * src_stride + j]; + d[5] = _mm_loadu_si128((__m128i *)(data + 5 * src_stride)); + const __m256i src_45a = _mm256_permute2x128_si256( + _mm256_castsi128_si256(d[4]), _mm256_castsi128_si256(d[5]), 0x20); - if (w - j > 8) { - const __m256i res_hi = convolve_lowbd(s + 4, coeffs); + d[4] = _mm_loadu_si128((__m128i *)(data + 6 * src_stride)); + const __m256i src_56a = _mm256_permute2x128_si256( + _mm256_castsi128_si256(d[5]), _mm256_castsi128_si256(d[4]), 0x20); + s[2] = _mm256_unpacklo_epi8(src_45a, src_56a); + s[5] = _mm256_unpackhi_epi8(src_45a, src_56a); + + const __m256i res_lo = convolve_lowbd_4tap(s, coeffs + 1); /* rounding code */ // shift by F - 1 - const __m256i res_16b_hi = _mm256_sra_epi16( - _mm256_add_epi16(res_hi, right_shift_const), right_shift); + const __m256i res_16b_lo = _mm256_sra_epi16( + _mm256_add_epi16(res_lo, right_shift_const), right_shift); // 8 bit conversion and saturation to uint8 - __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi); - - __m256i res_a = _mm256_unpacklo_epi64(res_8b_lo, res_8b_hi); - - const __m128i res_0 = _mm256_castsi256_si128(res_a); - const __m128i res_1 = _mm256_extracti128_si256(res_a, 1); - - _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_0); - _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride], - res_1); - } else { - const __m128i res_0 = _mm256_castsi256_si128(res_8b_lo); - const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1); - if (w - j > 4) { - _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0); - _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride], + __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo); + + if (w - j > 8) { + const __m256i res_hi = convolve_lowbd_4tap(s + 3, coeffs + 1); + + /* rounding code */ + // shift by F - 1 + const __m256i res_16b_hi = _mm256_sra_epi16( + _mm256_add_epi16(res_hi, 
right_shift_const), right_shift); + // 8 bit conversion and saturation to uint8 + __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi); + + __m256i res_a = _mm256_unpacklo_epi64(res_8b_lo, res_8b_hi); + + const __m128i res_0 = _mm256_castsi256_si128(res_a); + const __m128i res_1 = _mm256_extracti128_si256(res_a, 1); + + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_0); + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride], res_1); - } else if (w - j > 2) { - xx_storel_32(&dst[i * dst_stride + j], res_0); - xx_storel_32(&dst[i * dst_stride + j + dst_stride], res_1); } else { - __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j]; - __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + j + dst_stride]; - *(uint16_t *)p_0 = _mm_cvtsi128_si32(res_0); - *(uint16_t *)p_1 = _mm_cvtsi128_si32(res_1); + const __m128i res_0 = _mm256_castsi256_si128(res_8b_lo); + const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1); + if (w - j > 4) { + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0); + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride], + res_1); + } else if (w - j > 2) { + xx_storel_32(&dst[i * dst_stride + j], res_0); + xx_storel_32(&dst[i * dst_stride + j + dst_stride], res_1); + } else { + __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j]; + __m128i *const p_1 = + (__m128i *)&dst[i * dst_stride + j + dst_stride]; + *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0); + *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1); + } } + s[0] = s[1]; + s[1] = s[2]; + + s[3] = s[4]; + s[4] = s[5]; } + } + } else { + const int fo_vert = filter_params_y->taps / 2 - 1; + const uint8_t *const src_ptr = src - fo_vert * src_stride; + + for (j = 0; j < w; j += 16) { + const uint8_t *data = &src_ptr[j]; + __m256i src6; + + d[0] = _mm_loadu_si128((__m128i *)(data + 0 * src_stride)); + d[1] = _mm_loadu_si128((__m128i *)(data + 1 * src_stride)); + d[2] = _mm_loadu_si128((__m128i *)(data + 2 * 
src_stride)); + d[3] = _mm_loadu_si128((__m128i *)(data + 3 * src_stride)); + d[4] = _mm_loadu_si128((__m128i *)(data + 4 * src_stride)); + d[5] = _mm_loadu_si128((__m128i *)(data + 5 * src_stride)); + // Load lines a and b. Line a to lower 128, line b to upper 128 + const __m256i src_01a = _mm256_permute2x128_si256( + _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20); + + const __m256i src_12a = _mm256_permute2x128_si256( + _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20); + + const __m256i src_23a = _mm256_permute2x128_si256( + _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20); + + const __m256i src_34a = _mm256_permute2x128_si256( + _mm256_castsi128_si256(d[3]), _mm256_castsi128_si256(d[4]), 0x20); + + const __m256i src_45a = _mm256_permute2x128_si256( + _mm256_castsi128_si256(d[4]), _mm256_castsi128_si256(d[5]), 0x20); + + src6 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 6 * src_stride))); + const __m256i src_56a = + _mm256_permute2x128_si256(_mm256_castsi128_si256(d[5]), src6, 0x20); + + s[0] = _mm256_unpacklo_epi8(src_01a, src_12a); + s[1] = _mm256_unpacklo_epi8(src_23a, src_34a); + s[2] = _mm256_unpacklo_epi8(src_45a, src_56a); + + s[4] = _mm256_unpackhi_epi8(src_01a, src_12a); + s[5] = _mm256_unpackhi_epi8(src_23a, src_34a); + s[6] = _mm256_unpackhi_epi8(src_45a, src_56a); + + for (i = 0; i < h; i += 2) { + data = &src_ptr[i * src_stride + j]; + const __m256i src_67a = _mm256_permute2x128_si256( + src6, + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 7 * src_stride))), + 0x20); + + src6 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 8 * src_stride))); + const __m256i src_78a = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 7 * src_stride))), + src6, 0x20); + + s[3] = _mm256_unpacklo_epi8(src_67a, src_78a); + s[7] = _mm256_unpackhi_epi8(src_67a, src_78a); + + const __m256i res_lo = convolve_lowbd(s, coeffs); + + /* 
rounding code */ + // shift by F - 1 + const __m256i res_16b_lo = _mm256_sra_epi16( + _mm256_add_epi16(res_lo, right_shift_const), right_shift); + // 8 bit conversion and saturation to uint8 + __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo); - s[0] = s[1]; - s[1] = s[2]; - s[2] = s[3]; + if (w - j > 8) { + const __m256i res_hi = convolve_lowbd(s + 4, coeffs); - s[4] = s[5]; - s[5] = s[6]; - s[6] = s[7]; + /* rounding code */ + // shift by F - 1 + const __m256i res_16b_hi = _mm256_sra_epi16( + _mm256_add_epi16(res_hi, right_shift_const), right_shift); + // 8 bit conversion and saturation to uint8 + __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi); + + __m256i res_a = _mm256_unpacklo_epi64(res_8b_lo, res_8b_hi); + + const __m128i res_0 = _mm256_castsi256_si128(res_a); + const __m128i res_1 = _mm256_extracti128_si256(res_a, 1); + + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_0); + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride], + res_1); + } else { + const __m128i res_0 = _mm256_castsi256_si128(res_8b_lo); + const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1); + if (w - j > 4) { + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0); + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride], + res_1); + } else if (w - j > 2) { + xx_storel_32(&dst[i * dst_stride + j], res_0); + xx_storel_32(&dst[i * dst_stride + j + dst_stride], res_1); + } else { + __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j]; + __m128i *const p_1 = + (__m128i *)&dst[i * dst_stride + j + dst_stride]; + *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0); + *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1); + } + } + s[0] = s[1]; + s[1] = s[2]; + s[2] = s[3]; + + s[4] = s[5]; + s[5] = s[6]; + s[6] = s[7]; + } } } } @@ -178,81 +264,119 @@ void av1_convolve_x_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const 
InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, + const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params) { - int i, j; - const int fo_horiz = filter_params_x->taps / 2 - 1; - const uint8_t *const src_ptr = src - fo_horiz; const int bits = FILTER_BITS - conv_params->round_0; - __m256i filt[4], coeffs[4]; - - filt[0] = _mm256_load_si256((__m256i const *)filt_global_avx2); - filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32)); - filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); - filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3)); - - prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs); - const __m256i round_0_const = _mm256_set1_epi16((1 << (conv_params->round_0 - 1)) >> 1); const __m128i round_0_shift = _mm_cvtsi32_si128(conv_params->round_0 - 1); const __m256i round_const = _mm256_set1_epi16((1 << bits) >> 1); const __m128i round_shift = _mm_cvtsi32_si128(bits); - + int i, is_horiz_4tap = 0; (void)filter_params_y; - (void)subpel_y_q4; + (void)subpel_y_qn; assert(bits >= 0); assert((FILTER_BITS - conv_params->round_1) >= 0 || ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS)); assert(conv_params->round_0 > 0); - if (w <= 8) { - for (i = 0; i < h; i += 2) { - const __m256i data = _mm256_permute2x128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))), - _mm256_castsi128_si256(_mm_loadu_si128( - (__m128i *)(&src_ptr[i * src_stride + src_stride]))), - 0x20); - - __m256i res_16b = convolve_lowbd_x(data, coeffs, filt); - - res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const), - round_0_shift); - - res_16b = - _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const), round_shift); - - /* rounding code */ - // 8 bit conversion and saturation to uint8 - __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b); - - const __m128i res_0 = _mm256_castsi256_si128(res_8b); - const 
__m128i res_1 = _mm256_extracti128_si256(res_8b, 1); - if (w > 4) { - _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0); - _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1); - } else if (w > 2) { - xx_storel_32(&dst[i * dst_stride], res_0); - xx_storel_32(&dst[i * dst_stride + dst_stride], res_1); - } else { - __m128i *const p_0 = (__m128i *)&dst[i * dst_stride]; - __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + dst_stride]; - *(uint16_t *)p_0 = _mm_cvtsi128_si32(res_0); - *(uint16_t *)p_1 = _mm_cvtsi128_si32(res_1); + __m256i coeffs[4], filt[4]; + filt[0] = _mm256_load_si256((__m256i const *)(filt_global_avx2)); + filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32)); + + prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs); + + // Condition for checking valid horz_filt taps + if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs[0], coeffs[3]), 0))) + is_horiz_4tap = 1; + + // horz_filt as 4 tap + if (is_horiz_4tap) { + const int fo_horiz = 1; + const uint8_t *const src_ptr = src - fo_horiz; + if (w <= 8) { + for (i = 0; i < h; i += 2) { + const __m256i data = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))), + _mm256_castsi128_si256(_mm_loadu_si128( + (__m128i *)(&src_ptr[i * src_stride + src_stride]))), + 0x20); + + __m256i res_16b = convolve_lowbd_x_4tap(data, coeffs + 1, filt); + + res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const), + round_0_shift); + + res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const), + round_shift); + + /* rounding code */ + // 8 bit conversion and saturation to uint8 + __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b); + + const __m128i res_0 = _mm256_castsi256_si128(res_8b); + const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1); + + if (w > 4) { + _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0); + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1); + } 
else if (w > 2) { + xx_storel_32(&dst[i * dst_stride], res_0); + xx_storel_32(&dst[i * dst_stride + dst_stride], res_1); + } else { + __m128i *const p_0 = (__m128i *)&dst[i * dst_stride]; + __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + dst_stride]; + *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0); + *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1); + } + } + } else { + for (i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 16) { + // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 16 17 + // 18 19 20 21 22 23 + const __m256i data = _mm256_inserti128_si256( + _mm256_loadu_si256((__m256i *)&src_ptr[(i * src_stride) + j]), + _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + (j + 8)]), + 1); + + __m256i res_16b = convolve_lowbd_x_4tap(data, coeffs + 1, filt); + + res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const), + round_0_shift); + + res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const), + round_shift); + + /* rounding code */ + // 8 bit conversion and saturation to uint8 + __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b); + + // Store values into the destination buffer + // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + res_8b = _mm256_permute4x64_epi64(res_8b, 216); + __m128i res = _mm256_castsi256_si128(res_8b); + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res); + } } } } else { - for (i = 0; i < h; ++i) { - for (j = 0; j < w; j += 16) { - // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 16 17 18 - // 19 20 21 22 23 - const __m256i data = _mm256_inserti128_si256( - _mm256_loadu_si256((__m256i *)&src_ptr[(i * src_stride) + j]), - _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + (j + 8)]), - 1); + const int fo_horiz = filter_params_x->taps / 2 - 1; + const uint8_t *const src_ptr = src - fo_horiz; + filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); + filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3)); + + if (w <= 8) { + for 
(i = 0; i < h; i += 2) { + const __m256i data = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))), + _mm256_castsi128_si256(_mm_loadu_si128( + (__m128i *)(&src_ptr[i * src_stride + src_stride]))), + 0x20); __m256i res_16b = convolve_lowbd_x(data, coeffs, filt); @@ -266,11 +390,49 @@ void av1_convolve_x_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, // 8 bit conversion and saturation to uint8 __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b); - // Store values into the destination buffer - // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 - res_8b = _mm256_permute4x64_epi64(res_8b, 216); - __m128i res = _mm256_castsi256_si128(res_8b); - _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res); + const __m128i res_0 = _mm256_castsi256_si128(res_8b); + const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1); + if (w > 4) { + _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0); + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1); + } else if (w > 2) { + xx_storel_32(&dst[i * dst_stride], res_0); + xx_storel_32(&dst[i * dst_stride + dst_stride], res_1); + } else { + __m128i *const p_0 = (__m128i *)&dst[i * dst_stride]; + __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + dst_stride]; + *(uint16_t *)p_0 = _mm_cvtsi128_si32(res_0); + *(uint16_t *)p_1 = _mm_cvtsi128_si32(res_1); + } + } + } else { + for (i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 16) { + // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 16 17 + // 18 19 20 21 22 23 + const __m256i data = _mm256_inserti128_si256( + _mm256_loadu_si256((__m256i *)&src_ptr[(i * src_stride) + j]), + _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + (j + 8)]), + 1); + + __m256i res_16b = convolve_lowbd_x(data, coeffs, filt); + + res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const), + round_0_shift); + + res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const), + round_shift); + + 
/* rounding code */ + // 8 bit conversion and saturation to uint8 + __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b); + + // Store values into the destination buffer + // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + res_8b = _mm256_permute4x64_epi64(res_8b, 216); + __m128i res = _mm256_castsi256_si128(res_8b); + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res); + } } } } diff --git a/media/libaom/src/av1/common/x86/convolve_sse2.c b/media/libaom/src/av1/common/x86/convolve_sse2.c index 5016642de..4323ac4d1 100644 --- a/media/libaom/src/av1/common/x86/convolve_sse2.c +++ b/media/libaom/src/av1/common/x86/convolve_sse2.c @@ -79,7 +79,7 @@ void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, + const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params) { const int fo_vert = filter_params_y->taps / 2 - 1; const uint8_t *src_ptr = src - fo_vert * src_stride; @@ -88,14 +88,14 @@ void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, __m128i coeffs[4]; (void)filter_params_x; - (void)subpel_x_q4; + (void)subpel_x_qn; (void)conv_params; assert(conv_params->round_0 <= FILTER_BITS); assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) || ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS))); - prepare_coeffs(filter_params_y, subpel_y_q4, coeffs); + prepare_coeffs(filter_params_y, subpel_y_qn, coeffs); if (w <= 4) { __m128i s[8], src6, res, res_round, res16; @@ -132,7 +132,7 @@ void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, res_int = _mm_cvtsi128_si32(_mm_packus_epi16(res16, res16)); if (w == 2) - *(uint16_t *)dst = res_int; + *(uint16_t *)dst = (uint16_t)res_int; else *(uint32_t *)dst = res_int; @@ -145,7 +145,7 @@ void av1_convolve_y_sr_sse2(const uint8_t *src, int 
src_stride, uint8_t *dst, res_int = _mm_cvtsi128_si32(_mm_packus_epi16(res16, res16)); if (w == 2) - *(uint16_t *)dst = res_int; + *(uint16_t *)dst = (uint16_t)res_int; else *(uint32_t *)dst = res_int; @@ -240,7 +240,7 @@ void av1_convolve_x_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, + const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params) { const int fo_horiz = filter_params_x->taps / 2 - 1; const uint8_t *src_ptr = src - fo_horiz; @@ -253,13 +253,13 @@ void av1_convolve_x_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, __m128i coeffs[4]; (void)filter_params_y; - (void)subpel_y_q4; + (void)subpel_y_qn; assert(bits >= 0); assert((FILTER_BITS - conv_params->round_1) >= 0 || ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS)); - prepare_coeffs(filter_params_x, subpel_x_q4, coeffs); + prepare_coeffs(filter_params_x, subpel_x_qn, coeffs); if (w <= 4) { do { @@ -284,7 +284,7 @@ void av1_convolve_x_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, uint32_t r = _mm_cvtsi128_si32(res); if (w == 2) - *(uint16_t *)dst = r; + *(uint16_t *)dst = (uint16_t)r; else *(uint32_t *)dst = r; diff --git a/media/libaom/src/av1/common/x86/filterintra_sse4.c b/media/libaom/src/av1/common/x86/filterintra_sse4.c index c11edc1d4..99f4d9967 100644 --- a/media/libaom/src/av1/common/x86/filterintra_sse4.c +++ b/media/libaom/src/av1/common/x86/filterintra_sse4.c @@ -27,10 +27,6 @@ void av1_filter_intra_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, assert(bw <= 32 && bh <= 32); - // The initialization is just for silencing Jenkins static analysis warnings - for (r = 0; r < bh + 1; ++r) - memset(buffer[r], 0, (bw + 1) * sizeof(buffer[0][0])); - for (r = 0; r < bh; ++r) buffer[r + 1][0] = left[r]; memcpy(buffer[0], &above[-1], (bw + 1) * sizeof(uint8_t)); diff --git 
a/media/libaom/src/av1/common/x86/highbd_convolve_2d_avx2.c b/media/libaom/src/av1/common/x86/highbd_convolve_2d_avx2.c index ae68f0bbb..396aed01b 100644 --- a/media/libaom/src/av1/common/x86/highbd_convolve_2d_avx2.c +++ b/media/libaom/src/av1/common/x86/highbd_convolve_2d_avx2.c @@ -24,8 +24,8 @@ void av1_highbd_convolve_2d_sr_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, - const int subpel_x_q4, - const int subpel_y_q4, + const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params, int bd) { DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]); int im_h = h + filter_params_y->taps - 1; @@ -58,8 +58,8 @@ void av1_highbd_convolve_2d_sr_avx2(const uint16_t *src, int src_stride, _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255)); const __m256i zero = _mm256_setzero_si256(); - prepare_coeffs(filter_params_x, subpel_x_q4, coeffs_x); - prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y); + prepare_coeffs(filter_params_x, subpel_x_qn, coeffs_x); + prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y); for (j = 0; j < w; j += 8) { /* Horizontal filter */ @@ -222,12 +222,12 @@ static INLINE void copy_128(const uint16_t *src, uint16_t *dst) { void av1_highbd_convolve_2d_copy_sr_avx2( const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, const int subpel_x_q4, - const int subpel_y_q4, ConvolveParams *conv_params, int bd) { + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params, int bd) { (void)filter_params_x; (void)filter_params_y; - (void)subpel_x_q4; - (void)subpel_y_q4; + (void)subpel_x_qn; + (void)subpel_y_qn; (void)conv_params; (void)bd; @@ -238,10 +238,10 @@ void av1_highbd_convolve_2d_copy_sr_avx2( if (w == 2) { 
do { - memcpy(dst, src, 2 * sizeof(*src)); + memmove(dst, src, 2 * sizeof(*src)); src += src_stride; dst += dst_stride; - memcpy(dst, src, 2 * sizeof(*src)); + memmove(dst, src, 2 * sizeof(*src)); src += src_stride; dst += dst_stride; h -= 2; diff --git a/media/libaom/src/av1/common/x86/highbd_convolve_2d_sse2.c b/media/libaom/src/av1/common/x86/highbd_convolve_2d_sse2.c index 15f8872c1..f758775ee 100644 --- a/media/libaom/src/av1/common/x86/highbd_convolve_2d_sse2.c +++ b/media/libaom/src/av1/common/x86/highbd_convolve_2d_sse2.c @@ -74,12 +74,12 @@ static INLINE void copy_128(const uint16_t *src, uint16_t *dst) { void av1_highbd_convolve_2d_copy_sr_sse2( const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, const int subpel_x_q4, - const int subpel_y_q4, ConvolveParams *conv_params, int bd) { + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params, int bd) { (void)filter_params_x; (void)filter_params_y; - (void)subpel_x_q4; - (void)subpel_y_q4; + (void)subpel_x_qn; + (void)subpel_y_qn; (void)conv_params; (void)bd; if (w >= 16) { diff --git a/media/libaom/src/av1/common/x86/highbd_convolve_2d_sse4.c b/media/libaom/src/av1/common/x86/highbd_convolve_2d_sse4.c index 3f8dafb4b..d2ff47c1f 100644 --- a/media/libaom/src/av1/common/x86/highbd_convolve_2d_sse4.c +++ b/media/libaom/src/av1/common/x86/highbd_convolve_2d_sse4.c @@ -21,23 +21,23 @@ #include "aom_dsp/x86/convolve_sse4_1.h" #include "av1/common/convolve.h" -void av1_highbd_jnt_convolve_2d_copy_sse4_1( +void av1_highbd_dist_wtd_convolve_2d_copy_sse4_1( const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w, int h, const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, const int subpel_x_q4, - const int subpel_y_q4, ConvolveParams *conv_params, int bd) { + const InterpFilterParams 
*filter_params_y, const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params, int bd) { CONV_BUF_TYPE *dst = conv_params->dst; int dst_stride = conv_params->dst_stride; (void)filter_params_x; (void)filter_params_y; - (void)subpel_x_q4; - (void)subpel_y_q4; + (void)subpel_x_qn; + (void)subpel_y_qn; const int bits = FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0; const __m128i left_shift = _mm_cvtsi32_si128(bits); const int do_average = conv_params->do_average; - const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; const int w0 = conv_params->fwd_offset; const int w1 = conv_params->bck_offset; const __m128i wt0 = _mm_set1_epi32(w0); @@ -75,15 +75,17 @@ void av1_highbd_jnt_convolve_2d_copy_sse4_1( const __m128i res_unsigned_lo = _mm_add_epi32(res_32b_lo, offset_const); - const __m128i comp_avg_res_lo = highbd_comp_avg_sse4_1( - &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg); + const __m128i comp_avg_res_lo = + highbd_comp_avg_sse4_1(&data_ref_0_lo, &res_unsigned_lo, &wt0, + &wt1, use_dist_wtd_comp_avg); const __m128i res_32b_hi = _mm_unpackhi_epi16(res, zero); const __m128i res_unsigned_hi = _mm_add_epi32(res_32b_hi, offset_const); - const __m128i comp_avg_res_hi = highbd_comp_avg_sse4_1( - &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg); + const __m128i comp_avg_res_hi = + highbd_comp_avg_sse4_1(&data_ref_0_hi, &res_unsigned_hi, &wt0, + &wt1, use_dist_wtd_comp_avg); const __m128i round_result_lo = highbd_convolve_rounding_sse2( &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift); @@ -132,9 +134,9 @@ void av1_highbd_jnt_convolve_2d_copy_sse4_1( _mm_add_epi32(res_32b_hi, offset_const); const __m128i comp_avg_res_lo = highbd_comp_avg_sse4_1( - &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg); + &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_dist_wtd_comp_avg); const __m128i comp_avg_res_hi = 
highbd_comp_avg_sse4_1( - &data_ref_1, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg); + &data_ref_1, &res_unsigned_hi, &wt0, &wt1, use_dist_wtd_comp_avg); const __m128i round_result_lo = highbd_convolve_rounding_sse2( &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift); @@ -166,11 +168,11 @@ void av1_highbd_jnt_convolve_2d_copy_sse4_1( } } -void av1_highbd_jnt_convolve_2d_sse4_1( +void av1_highbd_dist_wtd_convolve_2d_sse4_1( const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w, int h, const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, const int subpel_x_q4, - const int subpel_y_q4, ConvolveParams *conv_params, int bd) { + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params, int bd) { DECLARE_ALIGNED(16, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]); CONV_BUF_TYPE *dst = conv_params->dst; @@ -179,7 +181,7 @@ void av1_highbd_jnt_convolve_2d_sse4_1( int im_stride = MAX_SB_SIZE; int i, j; const int do_average = conv_params->do_average; - const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; const int fo_vert = filter_params_y->taps / 2 - 1; const int fo_horiz = filter_params_x->taps / 2 - 1; const uint16_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; @@ -206,7 +208,7 @@ void av1_highbd_jnt_convolve_2d_sse4_1( /* Horizontal filter */ { const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( - filter_params_x, subpel_x_q4 & SUBPEL_MASK); + filter_params_x, subpel_x_qn & SUBPEL_MASK); const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter); // coeffs 0 1 0 1 2 3 2 3 @@ -273,7 +275,7 @@ void av1_highbd_jnt_convolve_2d_sse4_1( /* Vertical filter */ { const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( - filter_params_y, subpel_y_q4 & SUBPEL_MASK); + filter_params_y, subpel_y_qn & SUBPEL_MASK); const 
__m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter); // coeffs 0 1 0 1 2 3 2 3 @@ -359,8 +361,9 @@ void av1_highbd_jnt_convolve_2d_sse4_1( const __m128i data_ref_0 = _mm_cvtepu16_epi32(data_0); - const __m128i comp_avg_res = highbd_comp_avg_sse4_1( - &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg); + const __m128i comp_avg_res = + highbd_comp_avg_sse4_1(&data_ref_0, &res_unsigned_lo, &wt0, + &wt1, use_dist_wtd_comp_avg); const __m128i round_result = highbd_convolve_rounding_sse2( &comp_avg_res, &offset_const, &rounding_const, rounding_shift); @@ -391,10 +394,12 @@ void av1_highbd_jnt_convolve_2d_sse4_1( const __m128i data_ref_0_lo = _mm_cvtepu16_epi32(data_lo); const __m128i data_ref_0_hi = _mm_cvtepu16_epi32(data_hi); - const __m128i comp_avg_res_lo = highbd_comp_avg_sse4_1( - &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg); - const __m128i comp_avg_res_hi = highbd_comp_avg_sse4_1( - &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg); + const __m128i comp_avg_res_lo = + highbd_comp_avg_sse4_1(&data_ref_0_lo, &res_unsigned_lo, &wt0, + &wt1, use_dist_wtd_comp_avg); + const __m128i comp_avg_res_hi = + highbd_comp_avg_sse4_1(&data_ref_0_hi, &res_unsigned_hi, &wt0, + &wt1, use_dist_wtd_comp_avg); const __m128i round_result_lo = highbd_convolve_rounding_sse2(&comp_avg_res_lo, &offset_const, diff --git a/media/libaom/src/av1/common/x86/highbd_convolve_2d_ssse3.c b/media/libaom/src/av1/common/x86/highbd_convolve_2d_ssse3.c index 1d029db39..5318fcaa8 100644 --- a/media/libaom/src/av1/common/x86/highbd_convolve_2d_ssse3.c +++ b/media/libaom/src/av1/common/x86/highbd_convolve_2d_ssse3.c @@ -22,8 +22,8 @@ void av1_highbd_convolve_2d_sr_ssse3( const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, const int subpel_x_q4, - const int subpel_y_q4, ConvolveParams *conv_params, int bd) { + const InterpFilterParams 
*filter_params_y, const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params, int bd) { DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]); int im_h = h + filter_params_y->taps - 1; int im_stride = 8; @@ -54,8 +54,8 @@ void av1_highbd_convolve_2d_sr_ssse3( _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255)); const __m128i zero = _mm_setzero_si128(); - prepare_coeffs(filter_params_x, subpel_x_q4, coeffs_x); - prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y); + prepare_coeffs(filter_params_x, subpel_x_qn, coeffs_x); + prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y); for (j = 0; j < w; j += 8) { /* Horizontal filter */ diff --git a/media/libaom/src/av1/common/x86/highbd_inv_txfm_avx2.c b/media/libaom/src/av1/common/x86/highbd_inv_txfm_avx2.c index ade2af03e..93e98e4b3 100644 --- a/media/libaom/src/av1/common/x86/highbd_inv_txfm_avx2.c +++ b/media/libaom/src/av1/common/x86/highbd_inv_txfm_avx2.c @@ -18,6 +18,7 @@ #include "av1/common/idct.h" #include "av1/common/x86/av1_inv_txfm_ssse3.h" #include "av1/common/x86/highbd_txfm_utility_sse4.h" +#include "aom_dsp/x86/txfm_common_avx2.h" // Note: // Total 32x4 registers to represent 32x32 block coefficients. 
@@ -46,6 +47,47 @@ static INLINE __m256i highbd_clamp_epi16_avx2(__m256i u, int bd) { return clamped; } +static INLINE void round_shift_4x4_avx2(__m256i *in, int shift) { + if (shift != 0) { + __m256i rnding = _mm256_set1_epi32(1 << (shift - 1)); + in[0] = _mm256_add_epi32(in[0], rnding); + in[1] = _mm256_add_epi32(in[1], rnding); + in[2] = _mm256_add_epi32(in[2], rnding); + in[3] = _mm256_add_epi32(in[3], rnding); + + in[0] = _mm256_srai_epi32(in[0], shift); + in[1] = _mm256_srai_epi32(in[1], shift); + in[2] = _mm256_srai_epi32(in[2], shift); + in[3] = _mm256_srai_epi32(in[3], shift); + } +} + +static INLINE void round_shift_8x8_avx2(__m256i *in, int shift) { + round_shift_4x4_avx2(in, shift); + round_shift_4x4_avx2(in + 4, shift); + round_shift_4x4_avx2(in + 8, shift); + round_shift_4x4_avx2(in + 12, shift); +} + +static void highbd_clamp_epi32_avx2(__m256i *in, __m256i *out, + const __m256i *clamp_lo, + const __m256i *clamp_hi, int size) { + __m256i a0, a1; + for (int i = 0; i < size; i += 4) { + a0 = _mm256_max_epi32(in[i], *clamp_lo); + out[i] = _mm256_min_epi32(a0, *clamp_hi); + + a1 = _mm256_max_epi32(in[i + 1], *clamp_lo); + out[i + 1] = _mm256_min_epi32(a1, *clamp_hi); + + a0 = _mm256_max_epi32(in[i + 2], *clamp_lo); + out[i + 2] = _mm256_min_epi32(a0, *clamp_hi); + + a1 = _mm256_max_epi32(in[i + 3], *clamp_lo); + out[i + 3] = _mm256_min_epi32(a1, *clamp_hi); + } +} + static INLINE __m256i highbd_get_recon_16x8_avx2(const __m256i pred, __m256i res0, __m256i res1, const int bd) { @@ -72,30 +114,48 @@ static INLINE void highbd_write_buffer_16xn_avx2(__m256i *in, uint16_t *output, _mm256_storeu_si256((__m256i *)(output + i * stride), u); } } - -static INLINE __m256i av1_round_shift_32_avx2(__m256i vec, int bit) { - __m256i tmp, round; - round = _mm256_set1_epi32(1 << (bit - 1)); - tmp = _mm256_add_epi32(vec, round); - return _mm256_srai_epi32(tmp, bit); +static INLINE __m256i highbd_get_recon_8x8_avx2(const __m256i pred, __m256i res, + const int bd) { + 
__m256i x0 = pred; + x0 = _mm256_add_epi32(res, x0); + x0 = _mm256_packus_epi32(x0, x0); + x0 = _mm256_permute4x64_epi64(x0, 0xd8); + x0 = highbd_clamp_epi16_avx2(x0, bd); + return x0; } -static INLINE void av1_round_shift_array_32_avx2(__m256i *input, - __m256i *output, - const int size, - const int bit) { - if (bit > 0) { - int i; - for (i = 0; i < size; i++) { - output[i] = av1_round_shift_32_avx2(input[i], bit); - } - } else { - int i; - for (i = 0; i < size; i++) { - output[i] = _mm256_slli_epi32(input[i], -bit); - } +static INLINE void highbd_write_buffer_8xn_avx2(__m256i *in, uint16_t *output, + int stride, int flipud, + int height, const int bd) { + int j = flipud ? (height - 1) : 0; + __m128i temp; + const int step = flipud ? -1 : 1; + for (int i = 0; i < height; ++i, j += step) { + temp = _mm_loadu_si128((__m128i const *)(output + i * stride)); + __m256i v = _mm256_cvtepi16_epi32(temp); + __m256i u = highbd_get_recon_8x8_avx2(v, in[j], bd); + __m128i u1 = _mm256_castsi256_si128(u); + _mm_storeu_si128((__m128i *)(output + i * stride), u1); } } +static void neg_shift_avx2(const __m256i in0, const __m256i in1, __m256i *out0, + __m256i *out1, const __m256i *clamp_lo, + const __m256i *clamp_hi, int shift) { + __m256i offset = _mm256_set1_epi32((1 << shift) >> 1); + __m256i a0 = _mm256_add_epi32(offset, in0); + __m256i a1 = _mm256_sub_epi32(offset, in1); + + a0 = _mm256_sra_epi32(a0, _mm_cvtsi32_si128(shift)); + a1 = _mm256_sra_epi32(a1, _mm_cvtsi32_si128(shift)); + + a0 = _mm256_max_epi32(a0, *clamp_lo); + a0 = _mm256_min_epi32(a0, *clamp_hi); + a1 = _mm256_max_epi32(a1, *clamp_lo); + a1 = _mm256_min_epi32(a1, *clamp_hi); + + *out0 = a0; + *out1 = a1; +} static void transpose_8x8_avx2(const __m256i *in, __m256i *out) { __m256i u0, u1, u2, u3, u4, u5, u6, u7; @@ -134,6 +194,43 @@ static void transpose_8x8_avx2(const __m256i *in, __m256i *out) { out[7] = _mm256_permute2f128_si256(x0, x1, 0x31); } +static void transpose_8x8_flip_avx2(const __m256i *in, __m256i 
*out) { + __m256i u0, u1, u2, u3, u4, u5, u6, u7; + __m256i x0, x1; + + u0 = _mm256_unpacklo_epi32(in[7], in[6]); + u1 = _mm256_unpackhi_epi32(in[7], in[6]); + + u2 = _mm256_unpacklo_epi32(in[5], in[4]); + u3 = _mm256_unpackhi_epi32(in[5], in[4]); + + u4 = _mm256_unpacklo_epi32(in[3], in[2]); + u5 = _mm256_unpackhi_epi32(in[3], in[2]); + + u6 = _mm256_unpacklo_epi32(in[1], in[0]); + u7 = _mm256_unpackhi_epi32(in[1], in[0]); + + x0 = _mm256_unpacklo_epi64(u0, u2); + x1 = _mm256_unpacklo_epi64(u4, u6); + out[0] = _mm256_permute2f128_si256(x0, x1, 0x20); + out[4] = _mm256_permute2f128_si256(x0, x1, 0x31); + + x0 = _mm256_unpackhi_epi64(u0, u2); + x1 = _mm256_unpackhi_epi64(u4, u6); + out[1] = _mm256_permute2f128_si256(x0, x1, 0x20); + out[5] = _mm256_permute2f128_si256(x0, x1, 0x31); + + x0 = _mm256_unpacklo_epi64(u1, u3); + x1 = _mm256_unpacklo_epi64(u5, u7); + out[2] = _mm256_permute2f128_si256(x0, x1, 0x20); + out[6] = _mm256_permute2f128_si256(x0, x1, 0x31); + + x0 = _mm256_unpackhi_epi64(u1, u3); + x1 = _mm256_unpackhi_epi64(u5, u7); + out[3] = _mm256_permute2f128_si256(x0, x1, 0x20); + out[7] = _mm256_permute2f128_si256(x0, x1, 0x31); +} + static void load_buffer_32x32(const int32_t *coeff, __m256i *in, int input_stiride, int size) { int i; @@ -179,36 +276,6 @@ static void addsub_avx2(const __m256i in0, const __m256i in1, __m256i *out0, *out1 = a1; } -static void addsub_no_clamp_avx2(const __m256i in0, const __m256i in1, - __m256i *out0, __m256i *out1) { - __m256i a0 = _mm256_add_epi32(in0, in1); - __m256i a1 = _mm256_sub_epi32(in0, in1); - - *out0 = a0; - *out1 = a1; -} - -static void addsub_shift_avx2(const __m256i in0, const __m256i in1, - __m256i *out0, __m256i *out1, - const __m256i *clamp_lo, const __m256i *clamp_hi, - int shift) { - __m256i offset = _mm256_set1_epi32((1 << shift) >> 1); - __m256i in0_w_offset = _mm256_add_epi32(in0, offset); - __m256i a0 = _mm256_add_epi32(in0_w_offset, in1); - __m256i a1 = _mm256_sub_epi32(in0_w_offset, in1); - - a0 = 
_mm256_sra_epi32(a0, _mm_cvtsi32_si128(shift)); - a1 = _mm256_sra_epi32(a1, _mm_cvtsi32_si128(shift)); - - a0 = _mm256_max_epi32(a0, *clamp_lo); - a0 = _mm256_min_epi32(a0, *clamp_hi); - a1 = _mm256_max_epi32(a1, *clamp_lo); - a1 = _mm256_min_epi32(a1, *clamp_hi); - - *out0 = a0; - *out1 = a1; -} - static INLINE void idct32_stage4_avx2( __m256i *bf1, const __m256i *cospim8, const __m256i *cospi56, const __m256i *cospi8, const __m256i *cospim56, const __m256i *cospim40, @@ -344,63 +411,32 @@ static INLINE void idct32_stage8_avx2(__m256i *bf1, const __m256i *cospim32, static INLINE void idct32_stage9_avx2(__m256i *bf1, __m256i *out, const int do_cols, const int bd, const int out_shift, - const int log_range) { - if (do_cols) { - addsub_no_clamp_avx2(bf1[0], bf1[31], out + 0, out + 31); - addsub_no_clamp_avx2(bf1[1], bf1[30], out + 1, out + 30); - addsub_no_clamp_avx2(bf1[2], bf1[29], out + 2, out + 29); - addsub_no_clamp_avx2(bf1[3], bf1[28], out + 3, out + 28); - addsub_no_clamp_avx2(bf1[4], bf1[27], out + 4, out + 27); - addsub_no_clamp_avx2(bf1[5], bf1[26], out + 5, out + 26); - addsub_no_clamp_avx2(bf1[6], bf1[25], out + 6, out + 25); - addsub_no_clamp_avx2(bf1[7], bf1[24], out + 7, out + 24); - addsub_no_clamp_avx2(bf1[8], bf1[23], out + 8, out + 23); - addsub_no_clamp_avx2(bf1[9], bf1[22], out + 9, out + 22); - addsub_no_clamp_avx2(bf1[10], bf1[21], out + 10, out + 21); - addsub_no_clamp_avx2(bf1[11], bf1[20], out + 11, out + 20); - addsub_no_clamp_avx2(bf1[12], bf1[19], out + 12, out + 19); - addsub_no_clamp_avx2(bf1[13], bf1[18], out + 13, out + 18); - addsub_no_clamp_avx2(bf1[14], bf1[17], out + 14, out + 17); - addsub_no_clamp_avx2(bf1[15], bf1[16], out + 15, out + 16); - } else { + const __m256i *clamp_lo, + const __m256i *clamp_hi) { + addsub_avx2(bf1[0], bf1[31], out + 0, out + 31, clamp_lo, clamp_hi); + addsub_avx2(bf1[1], bf1[30], out + 1, out + 30, clamp_lo, clamp_hi); + addsub_avx2(bf1[2], bf1[29], out + 2, out + 29, clamp_lo, clamp_hi); + 
addsub_avx2(bf1[3], bf1[28], out + 3, out + 28, clamp_lo, clamp_hi); + addsub_avx2(bf1[4], bf1[27], out + 4, out + 27, clamp_lo, clamp_hi); + addsub_avx2(bf1[5], bf1[26], out + 5, out + 26, clamp_lo, clamp_hi); + addsub_avx2(bf1[6], bf1[25], out + 6, out + 25, clamp_lo, clamp_hi); + addsub_avx2(bf1[7], bf1[24], out + 7, out + 24, clamp_lo, clamp_hi); + addsub_avx2(bf1[8], bf1[23], out + 8, out + 23, clamp_lo, clamp_hi); + addsub_avx2(bf1[9], bf1[22], out + 9, out + 22, clamp_lo, clamp_hi); + addsub_avx2(bf1[10], bf1[21], out + 10, out + 21, clamp_lo, clamp_hi); + addsub_avx2(bf1[11], bf1[20], out + 11, out + 20, clamp_lo, clamp_hi); + addsub_avx2(bf1[12], bf1[19], out + 12, out + 19, clamp_lo, clamp_hi); + addsub_avx2(bf1[13], bf1[18], out + 13, out + 18, clamp_lo, clamp_hi); + addsub_avx2(bf1[14], bf1[17], out + 14, out + 17, clamp_lo, clamp_hi); + addsub_avx2(bf1[15], bf1[16], out + 15, out + 16, clamp_lo, clamp_hi); + if (!do_cols) { const int log_range_out = AOMMAX(16, bd + 6); - const __m256i clamp_lo_out = _mm256_set1_epi32(AOMMAX( - -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift)))); - const __m256i clamp_hi_out = _mm256_set1_epi32(AOMMIN( - (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift)))); - - addsub_shift_avx2(bf1[0], bf1[31], out + 0, out + 31, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_avx2(bf1[1], bf1[30], out + 1, out + 30, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_avx2(bf1[2], bf1[29], out + 2, out + 29, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_avx2(bf1[3], bf1[28], out + 3, out + 28, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_avx2(bf1[4], bf1[27], out + 4, out + 27, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_avx2(bf1[5], bf1[26], out + 5, out + 26, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_avx2(bf1[6], bf1[25], out + 6, out + 25, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_avx2(bf1[7], bf1[24], out + 
7, out + 24, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_avx2(bf1[8], bf1[23], out + 8, out + 23, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_avx2(bf1[9], bf1[22], out + 9, out + 22, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_avx2(bf1[10], bf1[21], out + 10, out + 21, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_avx2(bf1[11], bf1[20], out + 11, out + 20, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_avx2(bf1[12], bf1[19], out + 12, out + 19, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_avx2(bf1[13], bf1[18], out + 13, out + 18, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_avx2(bf1[14], bf1[17], out + 14, out + 17, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_avx2(bf1[15], bf1[16], out + 15, out + 16, &clamp_lo_out, - &clamp_hi_out, out_shift); + const __m256i clamp_lo_out = _mm256_set1_epi32(-(1 << (log_range_out - 1))); + const __m256i clamp_hi_out = + _mm256_set1_epi32((1 << (log_range_out - 1)) - 1); + round_shift_8x8_avx2(out, out_shift); + round_shift_8x8_avx2(out + 16, out_shift); + highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 32); } } @@ -410,8 +446,8 @@ static void idct32_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols, const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1)); const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); - const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1))); - const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1); + __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1))); + __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1); __m256i x; // stage 0 // stage 1 @@ -427,22 +463,16 @@ static void idct32_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols, // stage 7 // stage 8 // stage 9 - if (do_cols) { - x = _mm256_max_epi32(x, clamp_lo); - x = _mm256_min_epi32(x, clamp_hi); - } else { + if (!do_cols) { const int log_range_out = AOMMAX(16, bd + 6); - const __m256i clamp_lo_out = _mm256_set1_epi32(AOMMAX( - -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift)))); - const __m256i clamp_hi_out = _mm256_set1_epi32(AOMMIN( - (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift)))); __m256i offset = _mm256_set1_epi32((1 << out_shift) >> 1); + clamp_lo = _mm256_set1_epi32(-(1 << (log_range_out - 1))); + clamp_hi = _mm256_set1_epi32((1 << (log_range_out - 1)) - 1); x = _mm256_add_epi32(offset, x); x = _mm256_sra_epi32(x, _mm_cvtsi32_si128(out_shift)); - x = _mm256_max_epi32(x, clamp_lo_out); - x = _mm256_min_epi32(x, clamp_hi_out); } - + x = _mm256_max_epi32(x, clamp_lo); + x = _mm256_min_epi32(x, clamp_hi); out[0] = x; out[1] = x; out[2] = x; @@ -586,7 +616,7 @@ static void idct32_low8_avx2(__m256i *in, __m256i *out, int bit, int do_cols, &rounding, bit); // stage 9 - idct32_stage9_avx2(bf1, out, do_cols, bd, out_shift, log_range); + idct32_stage9_avx2(bf1, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi); } } @@ -736,7 +766,7 @@ static void idct32_low16_avx2(__m256i *in, __m256i *out, int bit, int do_cols, &rounding, bit); // stage 9 - idct32_stage9_avx2(bf1, out, do_cols, bd, out_shift, log_range); + idct32_stage9_avx2(bf1, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi); } } @@ -1094,66 +1124,2958 @@ static void idct32_avx2(__m256i *in, __m256i *out, int bit, int do_cols, int 
bd, bf0[31] = bf1[31]; // stage 9 + addsub_avx2(bf0[0], bf0[31], out + 0, out + 31, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[1], bf0[30], out + 1, out + 30, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[2], bf0[29], out + 2, out + 29, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[3], bf0[28], out + 3, out + 28, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[4], bf0[27], out + 4, out + 27, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[5], bf0[26], out + 5, out + 26, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[6], bf0[25], out + 6, out + 25, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[7], bf0[24], out + 7, out + 24, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[8], bf0[23], out + 8, out + 23, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[9], bf0[22], out + 9, out + 22, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[10], bf0[21], out + 10, out + 21, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[11], bf0[20], out + 11, out + 20, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[12], bf0[19], out + 12, out + 19, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[13], bf0[18], out + 13, out + 18, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[14], bf0[17], out + 14, out + 17, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[15], bf0[16], out + 15, out + 16, &clamp_lo, &clamp_hi); + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + const __m256i clamp_lo_out = + _mm256_set1_epi32(-(1 << (log_range_out - 1))); + const __m256i clamp_hi_out = + _mm256_set1_epi32((1 << (log_range_out - 1)) - 1); + round_shift_8x8_avx2(out, out_shift); + round_shift_8x8_avx2(out + 16, out_shift); + highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 32); + } + } +} +static void idct16_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); + const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); + __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1))); + __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1); + + { + // stage 0 + // stage 1 + // stage 2 + // stage 3 + // stage 4 + in[0] = _mm256_mullo_epi32(in[0], cospi32); + in[0] = _mm256_add_epi32(in[0], rnding); + in[0] = _mm256_srai_epi32(in[0], bit); + + // stage 5 + // stage 6 + // stage 7 + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + clamp_lo = _mm256_set1_epi32(-(1 << (log_range_out - 1))); + clamp_hi = _mm256_set1_epi32((1 << (log_range_out - 1)) - 1); + __m256i offset = _mm256_set1_epi32((1 << out_shift) >> 1); + in[0] = _mm256_add_epi32(in[0], offset); + in[0] = _mm256_sra_epi32(in[0], _mm_cvtsi32_si128(out_shift)); + } + in[0] = _mm256_max_epi32(in[0], clamp_lo); + in[0] = _mm256_min_epi32(in[0], clamp_hi); + out[0] = in[0]; + out[1] = in[0]; + out[2] = in[0]; + out[3] = in[0]; + out[4] = in[0]; + out[5] = in[0]; + out[6] = in[0]; + out[7] = in[0]; + out[8] = in[0]; + out[9] = in[0]; + out[10] = in[0]; + out[11] = in[0]; + out[12] = in[0]; + out[13] = in[0]; + out[14] = in[0]; + out[15] = in[0]; + } +} + +static void idct16_low8_avx2(__m256i *in, __m256i *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m256i cospi60 = _mm256_set1_epi32(cospi[60]); + const __m256i cospi28 = _mm256_set1_epi32(cospi[28]); + const __m256i cospi44 = _mm256_set1_epi32(cospi[44]); + const __m256i cospi20 = _mm256_set1_epi32(cospi[20]); + const __m256i cospi12 = _mm256_set1_epi32(cospi[12]); + const __m256i cospi4 = _mm256_set1_epi32(cospi[4]); + const __m256i cospi56 = _mm256_set1_epi32(cospi[56]); + const __m256i cospi24 = _mm256_set1_epi32(cospi[24]); + const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]); + const __m256i cospi8 = _mm256_set1_epi32(cospi[8]); + const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); + const __m256i cospi48 = _mm256_set1_epi32(cospi[48]); + const __m256i cospi16 = 
_mm256_set1_epi32(cospi[16]); + const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]); + const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]); + const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]); + const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]); + const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1))); + const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1); + __m256i u[16], x, y; + + { + // stage 0 + // stage 1 + u[0] = in[0]; + u[2] = in[4]; + u[4] = in[2]; + u[6] = in[6]; + u[8] = in[1]; + u[10] = in[5]; + u[12] = in[3]; + u[14] = in[7]; + + // stage 2 + u[15] = half_btf_0_avx2(&cospi4, &u[8], &rnding, bit); + u[8] = half_btf_0_avx2(&cospi60, &u[8], &rnding, bit); + + u[9] = half_btf_0_avx2(&cospim36, &u[14], &rnding, bit); + u[14] = half_btf_0_avx2(&cospi28, &u[14], &rnding, bit); + + u[13] = half_btf_0_avx2(&cospi20, &u[10], &rnding, bit); + u[10] = half_btf_0_avx2(&cospi44, &u[10], &rnding, bit); + + u[11] = half_btf_0_avx2(&cospim52, &u[12], &rnding, bit); + u[12] = half_btf_0_avx2(&cospi12, &u[12], &rnding, bit); + + // stage 3 + u[7] = half_btf_0_avx2(&cospi8, &u[4], &rnding, bit); + u[4] = half_btf_0_avx2(&cospi56, &u[4], &rnding, bit); + u[5] = half_btf_0_avx2(&cospim40, &u[6], &rnding, bit); + u[6] = half_btf_0_avx2(&cospi24, &u[6], &rnding, bit); + + addsub_avx2(u[8], u[9], &u[8], &u[9], &clamp_lo, &clamp_hi); + addsub_avx2(u[11], u[10], &u[11], &u[10], &clamp_lo, &clamp_hi); + addsub_avx2(u[12], u[13], &u[12], &u[13], &clamp_lo, &clamp_hi); + addsub_avx2(u[15], u[14], &u[15], &u[14], &clamp_lo, &clamp_hi); + + // stage 4 + x = _mm256_mullo_epi32(u[0], cospi32); + u[0] = _mm256_add_epi32(x, rnding); + u[0] = _mm256_srai_epi32(u[0], bit); + u[1] = u[0]; + + u[3] = half_btf_0_avx2(&cospi16, &u[2], &rnding, bit); + u[2] = half_btf_0_avx2(&cospi48, &u[2], &rnding, bit); + + addsub_avx2(u[4], 
u[5], &u[4], &u[5], &clamp_lo, &clamp_hi); + addsub_avx2(u[7], u[6], &u[7], &u[6], &clamp_lo, &clamp_hi); + + x = half_btf_avx2(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit); + u[14] = half_btf_avx2(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit); + u[9] = x; + y = half_btf_avx2(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit); + u[13] = half_btf_avx2(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit); + u[10] = y; + + // stage 5 + addsub_avx2(u[0], u[3], &u[0], &u[3], &clamp_lo, &clamp_hi); + addsub_avx2(u[1], u[2], &u[1], &u[2], &clamp_lo, &clamp_hi); + + x = _mm256_mullo_epi32(u[5], cospi32); + y = _mm256_mullo_epi32(u[6], cospi32); + u[5] = _mm256_sub_epi32(y, x); + u[5] = _mm256_add_epi32(u[5], rnding); + u[5] = _mm256_srai_epi32(u[5], bit); + + u[6] = _mm256_add_epi32(y, x); + u[6] = _mm256_add_epi32(u[6], rnding); + u[6] = _mm256_srai_epi32(u[6], bit); + + addsub_avx2(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi); + addsub_avx2(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi); + addsub_avx2(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi); + addsub_avx2(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi); + + // stage 6 + addsub_avx2(u[0], u[7], &u[0], &u[7], &clamp_lo, &clamp_hi); + addsub_avx2(u[1], u[6], &u[1], &u[6], &clamp_lo, &clamp_hi); + addsub_avx2(u[2], u[5], &u[2], &u[5], &clamp_lo, &clamp_hi); + addsub_avx2(u[3], u[4], &u[3], &u[4], &clamp_lo, &clamp_hi); + + x = _mm256_mullo_epi32(u[10], cospi32); + y = _mm256_mullo_epi32(u[13], cospi32); + u[10] = _mm256_sub_epi32(y, x); + u[10] = _mm256_add_epi32(u[10], rnding); + u[10] = _mm256_srai_epi32(u[10], bit); + + u[13] = _mm256_add_epi32(x, y); + u[13] = _mm256_add_epi32(u[13], rnding); + u[13] = _mm256_srai_epi32(u[13], bit); + + x = _mm256_mullo_epi32(u[11], cospi32); + y = _mm256_mullo_epi32(u[12], cospi32); + u[11] = _mm256_sub_epi32(y, x); + u[11] = _mm256_add_epi32(u[11], rnding); + u[11] = _mm256_srai_epi32(u[11], bit); + + u[12] = _mm256_add_epi32(x, y); + u[12] = 
_mm256_add_epi32(u[12], rnding); + u[12] = _mm256_srai_epi32(u[12], bit); + // stage 7 + addsub_avx2(u[0], u[15], out + 0, out + 15, &clamp_lo, &clamp_hi); + addsub_avx2(u[1], u[14], out + 1, out + 14, &clamp_lo, &clamp_hi); + addsub_avx2(u[2], u[13], out + 2, out + 13, &clamp_lo, &clamp_hi); + addsub_avx2(u[3], u[12], out + 3, out + 12, &clamp_lo, &clamp_hi); + addsub_avx2(u[4], u[11], out + 4, out + 11, &clamp_lo, &clamp_hi); + addsub_avx2(u[5], u[10], out + 5, out + 10, &clamp_lo, &clamp_hi); + addsub_avx2(u[6], u[9], out + 6, out + 9, &clamp_lo, &clamp_hi); + addsub_avx2(u[7], u[8], out + 7, out + 8, &clamp_lo, &clamp_hi); + + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + const __m256i clamp_lo_out = + _mm256_set1_epi32(-(1 << (log_range_out - 1))); + const __m256i clamp_hi_out = + _mm256_set1_epi32((1 << (log_range_out - 1)) - 1); + round_shift_8x8_avx2(out, out_shift); + highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 16); + } + } +} + +static void idct16_avx2(__m256i *in, __m256i *out, int bit, int do_cols, int bd, + int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m256i cospi60 = _mm256_set1_epi32(cospi[60]); + const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]); + const __m256i cospi28 = _mm256_set1_epi32(cospi[28]); + const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]); + const __m256i cospi44 = _mm256_set1_epi32(cospi[44]); + const __m256i cospi20 = _mm256_set1_epi32(cospi[20]); + const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]); + const __m256i cospi12 = _mm256_set1_epi32(cospi[12]); + const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]); + const __m256i cospi52 = _mm256_set1_epi32(cospi[52]); + const __m256i cospi36 = _mm256_set1_epi32(cospi[36]); + const __m256i cospi4 = _mm256_set1_epi32(cospi[4]); + const __m256i cospi56 = _mm256_set1_epi32(cospi[56]); + const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]); + const __m256i cospi24 = _mm256_set1_epi32(cospi[24]); + const __m256i 
cospim40 = _mm256_set1_epi32(-cospi[40]); + const __m256i cospi40 = _mm256_set1_epi32(cospi[40]); + const __m256i cospi8 = _mm256_set1_epi32(cospi[8]); + const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); + const __m256i cospi48 = _mm256_set1_epi32(cospi[48]); + const __m256i cospi16 = _mm256_set1_epi32(cospi[16]); + const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]); + const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]); + const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1))); + const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1); + __m256i u[16], v[16], x, y; + + { + // stage 0 + // stage 1 + u[0] = in[0]; + u[1] = in[8]; + u[2] = in[4]; + u[3] = in[12]; + u[4] = in[2]; + u[5] = in[10]; + u[6] = in[6]; + u[7] = in[14]; + u[8] = in[1]; + u[9] = in[9]; + u[10] = in[5]; + u[11] = in[13]; + u[12] = in[3]; + u[13] = in[11]; + u[14] = in[7]; + u[15] = in[15]; + + // stage 2 + v[0] = u[0]; + v[1] = u[1]; + v[2] = u[2]; + v[3] = u[3]; + v[4] = u[4]; + v[5] = u[5]; + v[6] = u[6]; + v[7] = u[7]; + + v[8] = half_btf_avx2(&cospi60, &u[8], &cospim4, &u[15], &rnding, bit); + v[9] = half_btf_avx2(&cospi28, &u[9], &cospim36, &u[14], &rnding, bit); + v[10] = half_btf_avx2(&cospi44, &u[10], &cospim20, &u[13], &rnding, bit); + v[11] = half_btf_avx2(&cospi12, &u[11], &cospim52, &u[12], &rnding, bit); + v[12] = half_btf_avx2(&cospi52, &u[11], &cospi12, &u[12], &rnding, bit); + v[13] = half_btf_avx2(&cospi20, &u[10], &cospi44, &u[13], &rnding, bit); + v[14] = half_btf_avx2(&cospi36, &u[9], &cospi28, &u[14], &rnding, bit); + v[15] = half_btf_avx2(&cospi4, &u[8], &cospi60, &u[15], &rnding, bit); + + // stage 3 + u[0] = v[0]; + u[1] = v[1]; + u[2] = v[2]; + u[3] = v[3]; + u[4] = half_btf_avx2(&cospi56, &v[4], &cospim8, &v[7], &rnding, bit); + u[5] = half_btf_avx2(&cospi24, &v[5], &cospim40, &v[6], &rnding, bit); + u[6] = 
half_btf_avx2(&cospi40, &v[5], &cospi24, &v[6], &rnding, bit); + u[7] = half_btf_avx2(&cospi8, &v[4], &cospi56, &v[7], &rnding, bit); + addsub_avx2(v[8], v[9], &u[8], &u[9], &clamp_lo, &clamp_hi); + addsub_avx2(v[11], v[10], &u[11], &u[10], &clamp_lo, &clamp_hi); + addsub_avx2(v[12], v[13], &u[12], &u[13], &clamp_lo, &clamp_hi); + addsub_avx2(v[15], v[14], &u[15], &u[14], &clamp_lo, &clamp_hi); + + // stage 4 + x = _mm256_mullo_epi32(u[0], cospi32); + y = _mm256_mullo_epi32(u[1], cospi32); + v[0] = _mm256_add_epi32(x, y); + v[0] = _mm256_add_epi32(v[0], rnding); + v[0] = _mm256_srai_epi32(v[0], bit); + + v[1] = _mm256_sub_epi32(x, y); + v[1] = _mm256_add_epi32(v[1], rnding); + v[1] = _mm256_srai_epi32(v[1], bit); + + v[2] = half_btf_avx2(&cospi48, &u[2], &cospim16, &u[3], &rnding, bit); + v[3] = half_btf_avx2(&cospi16, &u[2], &cospi48, &u[3], &rnding, bit); + addsub_avx2(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi); + addsub_avx2(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi); + v[8] = u[8]; + v[9] = half_btf_avx2(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit); + v[10] = half_btf_avx2(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit); + v[11] = u[11]; + v[12] = u[12]; + v[13] = half_btf_avx2(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit); + v[14] = half_btf_avx2(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit); + v[15] = u[15]; + + // stage 5 + addsub_avx2(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi); + addsub_avx2(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi); + u[4] = v[4]; + + x = _mm256_mullo_epi32(v[5], cospi32); + y = _mm256_mullo_epi32(v[6], cospi32); + u[5] = _mm256_sub_epi32(y, x); + u[5] = _mm256_add_epi32(u[5], rnding); + u[5] = _mm256_srai_epi32(u[5], bit); + + u[6] = _mm256_add_epi32(y, x); + u[6] = _mm256_add_epi32(u[6], rnding); + u[6] = _mm256_srai_epi32(u[6], bit); + + u[7] = v[7]; + addsub_avx2(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi); + addsub_avx2(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi); + addsub_avx2(v[15], 
v[12], &u[15], &u[12], &clamp_lo, &clamp_hi); + addsub_avx2(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi); + + // stage 6 + addsub_avx2(u[0], u[7], &v[0], &v[7], &clamp_lo, &clamp_hi); + addsub_avx2(u[1], u[6], &v[1], &v[6], &clamp_lo, &clamp_hi); + addsub_avx2(u[2], u[5], &v[2], &v[5], &clamp_lo, &clamp_hi); + addsub_avx2(u[3], u[4], &v[3], &v[4], &clamp_lo, &clamp_hi); + v[8] = u[8]; + v[9] = u[9]; + + x = _mm256_mullo_epi32(u[10], cospi32); + y = _mm256_mullo_epi32(u[13], cospi32); + v[10] = _mm256_sub_epi32(y, x); + v[10] = _mm256_add_epi32(v[10], rnding); + v[10] = _mm256_srai_epi32(v[10], bit); + + v[13] = _mm256_add_epi32(x, y); + v[13] = _mm256_add_epi32(v[13], rnding); + v[13] = _mm256_srai_epi32(v[13], bit); + + x = _mm256_mullo_epi32(u[11], cospi32); + y = _mm256_mullo_epi32(u[12], cospi32); + v[11] = _mm256_sub_epi32(y, x); + v[11] = _mm256_add_epi32(v[11], rnding); + v[11] = _mm256_srai_epi32(v[11], bit); + + v[12] = _mm256_add_epi32(x, y); + v[12] = _mm256_add_epi32(v[12], rnding); + v[12] = _mm256_srai_epi32(v[12], bit); + + v[14] = u[14]; + v[15] = u[15]; + + // stage 7 + addsub_avx2(v[0], v[15], out + 0, out + 15, &clamp_lo, &clamp_hi); + addsub_avx2(v[1], v[14], out + 1, out + 14, &clamp_lo, &clamp_hi); + addsub_avx2(v[2], v[13], out + 2, out + 13, &clamp_lo, &clamp_hi); + addsub_avx2(v[3], v[12], out + 3, out + 12, &clamp_lo, &clamp_hi); + addsub_avx2(v[4], v[11], out + 4, out + 11, &clamp_lo, &clamp_hi); + addsub_avx2(v[5], v[10], out + 5, out + 10, &clamp_lo, &clamp_hi); + addsub_avx2(v[6], v[9], out + 6, out + 9, &clamp_lo, &clamp_hi); + addsub_avx2(v[7], v[8], out + 7, out + 8, &clamp_lo, &clamp_hi); + + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + const __m256i clamp_lo_out = + _mm256_set1_epi32(-(1 << (log_range_out - 1))); + const __m256i clamp_hi_out = + _mm256_set1_epi32((1 << (log_range_out - 1)) - 1); + round_shift_8x8_avx2(out, out_shift); + highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 
16); + } + } +} + +static void iadst16_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m256i cospi2 = _mm256_set1_epi32(cospi[2]); + const __m256i cospi62 = _mm256_set1_epi32(cospi[62]); + const __m256i cospi8 = _mm256_set1_epi32(cospi[8]); + const __m256i cospi56 = _mm256_set1_epi32(cospi[56]); + const __m256i cospi48 = _mm256_set1_epi32(cospi[48]); + const __m256i cospi16 = _mm256_set1_epi32(cospi[16]); + const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); + const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1)); + const __m256i zero = _mm256_setzero_si256(); + __m256i v[16], x, y, temp1, temp2; + + // Calculate the column 0, 1, 2, 3 + { + // stage 0 + // stage 1 + // stage 2 + x = _mm256_mullo_epi32(in[0], cospi62); + v[0] = _mm256_add_epi32(x, rnding); + v[0] = _mm256_srai_epi32(v[0], bit); + + x = _mm256_mullo_epi32(in[0], cospi2); + v[1] = _mm256_sub_epi32(zero, x); + v[1] = _mm256_add_epi32(v[1], rnding); + v[1] = _mm256_srai_epi32(v[1], bit); + + // stage 3 + v[8] = v[0]; + v[9] = v[1]; + + // stage 4 + temp1 = _mm256_mullo_epi32(v[8], cospi8); + x = _mm256_mullo_epi32(v[9], cospi56); + temp1 = _mm256_add_epi32(temp1, x); + temp1 = _mm256_add_epi32(temp1, rnding); + temp1 = _mm256_srai_epi32(temp1, bit); + + temp2 = _mm256_mullo_epi32(v[8], cospi56); + x = _mm256_mullo_epi32(v[9], cospi8); + temp2 = _mm256_sub_epi32(temp2, x); + temp2 = _mm256_add_epi32(temp2, rnding); + temp2 = _mm256_srai_epi32(temp2, bit); + v[8] = temp1; + v[9] = temp2; + + // stage 5 + v[4] = v[0]; + v[5] = v[1]; + v[12] = v[8]; + v[13] = v[9]; + + // stage 6 + temp1 = _mm256_mullo_epi32(v[4], cospi16); + x = _mm256_mullo_epi32(v[5], cospi48); + temp1 = _mm256_add_epi32(temp1, x); + temp1 = _mm256_add_epi32(temp1, rnding); + temp1 = _mm256_srai_epi32(temp1, bit); + + temp2 = _mm256_mullo_epi32(v[4], cospi48); + x = _mm256_mullo_epi32(v[5], cospi16); + temp2 = _mm256_sub_epi32(temp2, x); + temp2 
= _mm256_add_epi32(temp2, rnding); + temp2 = _mm256_srai_epi32(temp2, bit); + v[4] = temp1; + v[5] = temp2; + + temp1 = _mm256_mullo_epi32(v[12], cospi16); + x = _mm256_mullo_epi32(v[13], cospi48); + temp1 = _mm256_add_epi32(temp1, x); + temp1 = _mm256_add_epi32(temp1, rnding); + temp1 = _mm256_srai_epi32(temp1, bit); + + temp2 = _mm256_mullo_epi32(v[12], cospi48); + x = _mm256_mullo_epi32(v[13], cospi16); + temp2 = _mm256_sub_epi32(temp2, x); + temp2 = _mm256_add_epi32(temp2, rnding); + temp2 = _mm256_srai_epi32(temp2, bit); + v[12] = temp1; + v[13] = temp2; + + // stage 7 + v[2] = v[0]; + v[3] = v[1]; + v[6] = v[4]; + v[7] = v[5]; + v[10] = v[8]; + v[11] = v[9]; + v[14] = v[12]; + v[15] = v[13]; + + // stage 8 + y = _mm256_mullo_epi32(v[2], cospi32); + x = _mm256_mullo_epi32(v[3], cospi32); + v[2] = _mm256_add_epi32(y, x); + v[2] = _mm256_add_epi32(v[2], rnding); + v[2] = _mm256_srai_epi32(v[2], bit); + + v[3] = _mm256_sub_epi32(y, x); + v[3] = _mm256_add_epi32(v[3], rnding); + v[3] = _mm256_srai_epi32(v[3], bit); + + y = _mm256_mullo_epi32(v[6], cospi32); + x = _mm256_mullo_epi32(v[7], cospi32); + v[6] = _mm256_add_epi32(y, x); + v[6] = _mm256_add_epi32(v[6], rnding); + v[6] = _mm256_srai_epi32(v[6], bit); + + v[7] = _mm256_sub_epi32(y, x); + v[7] = _mm256_add_epi32(v[7], rnding); + v[7] = _mm256_srai_epi32(v[7], bit); + + y = _mm256_mullo_epi32(v[10], cospi32); + x = _mm256_mullo_epi32(v[11], cospi32); + v[10] = _mm256_add_epi32(y, x); + v[10] = _mm256_add_epi32(v[10], rnding); + v[10] = _mm256_srai_epi32(v[10], bit); + + v[11] = _mm256_sub_epi32(y, x); + v[11] = _mm256_add_epi32(v[11], rnding); + v[11] = _mm256_srai_epi32(v[11], bit); + + y = _mm256_mullo_epi32(v[14], cospi32); + x = _mm256_mullo_epi32(v[15], cospi32); + v[14] = _mm256_add_epi32(y, x); + v[14] = _mm256_add_epi32(v[14], rnding); + v[14] = _mm256_srai_epi32(v[14], bit); + + v[15] = _mm256_sub_epi32(y, x); + v[15] = _mm256_add_epi32(v[15], rnding); + v[15] = _mm256_srai_epi32(v[15], bit); + + // 
stage 9 + if (do_cols) { + out[0] = v[0]; + out[1] = _mm256_sub_epi32(_mm256_setzero_si256(), v[8]); + out[2] = v[12]; + out[3] = _mm256_sub_epi32(_mm256_setzero_si256(), v[4]); + out[4] = v[6]; + out[5] = _mm256_sub_epi32(_mm256_setzero_si256(), v[14]); + out[6] = v[10]; + out[7] = _mm256_sub_epi32(_mm256_setzero_si256(), v[2]); + out[8] = v[3]; + out[9] = _mm256_sub_epi32(_mm256_setzero_si256(), v[11]); + out[10] = v[15]; + out[11] = _mm256_sub_epi32(_mm256_setzero_si256(), v[7]); + out[12] = v[5]; + out[13] = _mm256_sub_epi32(_mm256_setzero_si256(), v[13]); + out[14] = v[9]; + out[15] = _mm256_sub_epi32(_mm256_setzero_si256(), v[1]); + } else { + const int log_range_out = AOMMAX(16, bd + 6); + const __m256i clamp_lo_out = + _mm256_set1_epi32(-(1 << (log_range_out - 1))); + const __m256i clamp_hi_out = + _mm256_set1_epi32((1 << (log_range_out - 1)) - 1); + + neg_shift_avx2(v[0], v[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_avx2(v[12], v[4], out + 2, out + 3, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_avx2(v[6], v[14], out + 4, out + 5, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_avx2(v[10], v[2], out + 6, out + 7, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_avx2(v[3], v[11], out + 8, out + 9, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_avx2(v[15], v[7], out + 10, out + 11, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_avx2(v[5], v[13], out + 12, out + 13, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_avx2(v[9], v[1], out + 14, out + 15, &clamp_lo_out, + &clamp_hi_out, out_shift); + } + } +} + +static void iadst16_low8_avx2(__m256i *in, __m256i *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m256i cospi2 = _mm256_set1_epi32(cospi[2]); + const __m256i cospi62 = _mm256_set1_epi32(cospi[62]); + const __m256i cospi10 = _mm256_set1_epi32(cospi[10]); + const __m256i cospi54 = _mm256_set1_epi32(cospi[54]); + 
const __m256i cospi18 = _mm256_set1_epi32(cospi[18]); + const __m256i cospi46 = _mm256_set1_epi32(cospi[46]); + const __m256i cospi26 = _mm256_set1_epi32(cospi[26]); + const __m256i cospi38 = _mm256_set1_epi32(cospi[38]); + const __m256i cospi34 = _mm256_set1_epi32(cospi[34]); + const __m256i cospi30 = _mm256_set1_epi32(cospi[30]); + const __m256i cospi42 = _mm256_set1_epi32(cospi[42]); + const __m256i cospi22 = _mm256_set1_epi32(cospi[22]); + const __m256i cospi50 = _mm256_set1_epi32(cospi[50]); + const __m256i cospi14 = _mm256_set1_epi32(cospi[14]); + const __m256i cospi58 = _mm256_set1_epi32(cospi[58]); + const __m256i cospi6 = _mm256_set1_epi32(cospi[6]); + const __m256i cospi8 = _mm256_set1_epi32(cospi[8]); + const __m256i cospi56 = _mm256_set1_epi32(cospi[56]); + const __m256i cospi40 = _mm256_set1_epi32(cospi[40]); + const __m256i cospi24 = _mm256_set1_epi32(cospi[24]); + const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]); + const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]); + const __m256i cospi48 = _mm256_set1_epi32(cospi[48]); + const __m256i cospi16 = _mm256_set1_epi32(cospi[16]); + const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]); + const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); + const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); + const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1))); + const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1); + __m256i u[16], x, y; + + { + // stage 0 + // stage 1 + // stage 2 + __m256i zero = _mm256_setzero_si256(); + x = _mm256_mullo_epi32(in[0], cospi62); + u[0] = _mm256_add_epi32(x, rnding); + u[0] = _mm256_srai_epi32(u[0], bit); + + x = _mm256_mullo_epi32(in[0], cospi2); + u[1] = _mm256_sub_epi32(zero, x); + u[1] = _mm256_add_epi32(u[1], rnding); + u[1] = _mm256_srai_epi32(u[1], bit); + + x = _mm256_mullo_epi32(in[2], cospi54); + u[2] = _mm256_add_epi32(x, rnding); + u[2] = _mm256_srai_epi32(u[2], bit); + + x = _mm256_mullo_epi32(in[2], cospi10); + u[3] = _mm256_sub_epi32(zero, x); + u[3] = _mm256_add_epi32(u[3], rnding); + u[3] = _mm256_srai_epi32(u[3], bit); + + x = _mm256_mullo_epi32(in[4], cospi46); + u[4] = _mm256_add_epi32(x, rnding); + u[4] = _mm256_srai_epi32(u[4], bit); + + x = _mm256_mullo_epi32(in[4], cospi18); + u[5] = _mm256_sub_epi32(zero, x); + u[5] = _mm256_add_epi32(u[5], rnding); + u[5] = _mm256_srai_epi32(u[5], bit); + + x = _mm256_mullo_epi32(in[6], cospi38); + u[6] = _mm256_add_epi32(x, rnding); + u[6] = _mm256_srai_epi32(u[6], bit); + + x = _mm256_mullo_epi32(in[6], cospi26); + u[7] = _mm256_sub_epi32(zero, x); + u[7] = _mm256_add_epi32(u[7], rnding); + u[7] = _mm256_srai_epi32(u[7], bit); + + u[8] = _mm256_mullo_epi32(in[7], cospi34); + u[8] = _mm256_add_epi32(u[8], rnding); + u[8] = _mm256_srai_epi32(u[8], bit); + + u[9] = _mm256_mullo_epi32(in[7], cospi30); + u[9] = _mm256_add_epi32(u[9], rnding); + u[9] = _mm256_srai_epi32(u[9], bit); + + u[10] = _mm256_mullo_epi32(in[5], cospi42); + u[10] = _mm256_add_epi32(u[10], rnding); + u[10] = _mm256_srai_epi32(u[10], bit); + + u[11] = _mm256_mullo_epi32(in[5], cospi22); + u[11] = _mm256_add_epi32(u[11], rnding); + u[11] = _mm256_srai_epi32(u[11], bit); + + u[12] = _mm256_mullo_epi32(in[3], cospi50); + u[12] = _mm256_add_epi32(u[12], rnding); + 
u[12] = _mm256_srai_epi32(u[12], bit); + + u[13] = _mm256_mullo_epi32(in[3], cospi14); + u[13] = _mm256_add_epi32(u[13], rnding); + u[13] = _mm256_srai_epi32(u[13], bit); + + u[14] = _mm256_mullo_epi32(in[1], cospi58); + u[14] = _mm256_add_epi32(u[14], rnding); + u[14] = _mm256_srai_epi32(u[14], bit); + + u[15] = _mm256_mullo_epi32(in[1], cospi6); + u[15] = _mm256_add_epi32(u[15], rnding); + u[15] = _mm256_srai_epi32(u[15], bit); + + // stage 3 + addsub_avx2(u[0], u[8], &u[0], &u[8], &clamp_lo, &clamp_hi); + addsub_avx2(u[1], u[9], &u[1], &u[9], &clamp_lo, &clamp_hi); + addsub_avx2(u[2], u[10], &u[2], &u[10], &clamp_lo, &clamp_hi); + addsub_avx2(u[3], u[11], &u[3], &u[11], &clamp_lo, &clamp_hi); + addsub_avx2(u[4], u[12], &u[4], &u[12], &clamp_lo, &clamp_hi); + addsub_avx2(u[5], u[13], &u[5], &u[13], &clamp_lo, &clamp_hi); + addsub_avx2(u[6], u[14], &u[6], &u[14], &clamp_lo, &clamp_hi); + addsub_avx2(u[7], u[15], &u[7], &u[15], &clamp_lo, &clamp_hi); + + // stage 4 + y = _mm256_mullo_epi32(u[8], cospi56); + x = _mm256_mullo_epi32(u[9], cospi56); + u[8] = _mm256_mullo_epi32(u[8], cospi8); + u[8] = _mm256_add_epi32(u[8], x); + u[8] = _mm256_add_epi32(u[8], rnding); + u[8] = _mm256_srai_epi32(u[8], bit); + + x = _mm256_mullo_epi32(u[9], cospi8); + u[9] = _mm256_sub_epi32(y, x); + u[9] = _mm256_add_epi32(u[9], rnding); + u[9] = _mm256_srai_epi32(u[9], bit); + + x = _mm256_mullo_epi32(u[11], cospi24); + y = _mm256_mullo_epi32(u[10], cospi24); + u[10] = _mm256_mullo_epi32(u[10], cospi40); + u[10] = _mm256_add_epi32(u[10], x); + u[10] = _mm256_add_epi32(u[10], rnding); + u[10] = _mm256_srai_epi32(u[10], bit); + + x = _mm256_mullo_epi32(u[11], cospi40); + u[11] = _mm256_sub_epi32(y, x); + u[11] = _mm256_add_epi32(u[11], rnding); + u[11] = _mm256_srai_epi32(u[11], bit); + + x = _mm256_mullo_epi32(u[13], cospi8); + y = _mm256_mullo_epi32(u[12], cospi8); + u[12] = _mm256_mullo_epi32(u[12], cospim56); + u[12] = _mm256_add_epi32(u[12], x); + u[12] = _mm256_add_epi32(u[12], 
rnding); + u[12] = _mm256_srai_epi32(u[12], bit); + + x = _mm256_mullo_epi32(u[13], cospim56); + u[13] = _mm256_sub_epi32(y, x); + u[13] = _mm256_add_epi32(u[13], rnding); + u[13] = _mm256_srai_epi32(u[13], bit); + + x = _mm256_mullo_epi32(u[15], cospi40); + y = _mm256_mullo_epi32(u[14], cospi40); + u[14] = _mm256_mullo_epi32(u[14], cospim24); + u[14] = _mm256_add_epi32(u[14], x); + u[14] = _mm256_add_epi32(u[14], rnding); + u[14] = _mm256_srai_epi32(u[14], bit); + + x = _mm256_mullo_epi32(u[15], cospim24); + u[15] = _mm256_sub_epi32(y, x); + u[15] = _mm256_add_epi32(u[15], rnding); + u[15] = _mm256_srai_epi32(u[15], bit); + + // stage 5 + addsub_avx2(u[0], u[4], &u[0], &u[4], &clamp_lo, &clamp_hi); + addsub_avx2(u[1], u[5], &u[1], &u[5], &clamp_lo, &clamp_hi); + addsub_avx2(u[2], u[6], &u[2], &u[6], &clamp_lo, &clamp_hi); + addsub_avx2(u[3], u[7], &u[3], &u[7], &clamp_lo, &clamp_hi); + addsub_avx2(u[8], u[12], &u[8], &u[12], &clamp_lo, &clamp_hi); + addsub_avx2(u[9], u[13], &u[9], &u[13], &clamp_lo, &clamp_hi); + addsub_avx2(u[10], u[14], &u[10], &u[14], &clamp_lo, &clamp_hi); + addsub_avx2(u[11], u[15], &u[11], &u[15], &clamp_lo, &clamp_hi); + + // stage 6 + x = _mm256_mullo_epi32(u[5], cospi48); + y = _mm256_mullo_epi32(u[4], cospi48); + u[4] = _mm256_mullo_epi32(u[4], cospi16); + u[4] = _mm256_add_epi32(u[4], x); + u[4] = _mm256_add_epi32(u[4], rnding); + u[4] = _mm256_srai_epi32(u[4], bit); + + x = _mm256_mullo_epi32(u[5], cospi16); + u[5] = _mm256_sub_epi32(y, x); + u[5] = _mm256_add_epi32(u[5], rnding); + u[5] = _mm256_srai_epi32(u[5], bit); + + x = _mm256_mullo_epi32(u[7], cospi16); + y = _mm256_mullo_epi32(u[6], cospi16); + u[6] = _mm256_mullo_epi32(u[6], cospim48); + u[6] = _mm256_add_epi32(u[6], x); + u[6] = _mm256_add_epi32(u[6], rnding); + u[6] = _mm256_srai_epi32(u[6], bit); + + x = _mm256_mullo_epi32(u[7], cospim48); + u[7] = _mm256_sub_epi32(y, x); + u[7] = _mm256_add_epi32(u[7], rnding); + u[7] = _mm256_srai_epi32(u[7], bit); + + x = 
_mm256_mullo_epi32(u[13], cospi48); + y = _mm256_mullo_epi32(u[12], cospi48); + u[12] = _mm256_mullo_epi32(u[12], cospi16); + u[12] = _mm256_add_epi32(u[12], x); + u[12] = _mm256_add_epi32(u[12], rnding); + u[12] = _mm256_srai_epi32(u[12], bit); + + x = _mm256_mullo_epi32(u[13], cospi16); + u[13] = _mm256_sub_epi32(y, x); + u[13] = _mm256_add_epi32(u[13], rnding); + u[13] = _mm256_srai_epi32(u[13], bit); + + x = _mm256_mullo_epi32(u[15], cospi16); + y = _mm256_mullo_epi32(u[14], cospi16); + u[14] = _mm256_mullo_epi32(u[14], cospim48); + u[14] = _mm256_add_epi32(u[14], x); + u[14] = _mm256_add_epi32(u[14], rnding); + u[14] = _mm256_srai_epi32(u[14], bit); + + x = _mm256_mullo_epi32(u[15], cospim48); + u[15] = _mm256_sub_epi32(y, x); + u[15] = _mm256_add_epi32(u[15], rnding); + u[15] = _mm256_srai_epi32(u[15], bit); + + // stage 7 + addsub_avx2(u[0], u[2], &u[0], &u[2], &clamp_lo, &clamp_hi); + addsub_avx2(u[1], u[3], &u[1], &u[3], &clamp_lo, &clamp_hi); + addsub_avx2(u[4], u[6], &u[4], &u[6], &clamp_lo, &clamp_hi); + addsub_avx2(u[5], u[7], &u[5], &u[7], &clamp_lo, &clamp_hi); + addsub_avx2(u[8], u[10], &u[8], &u[10], &clamp_lo, &clamp_hi); + addsub_avx2(u[9], u[11], &u[9], &u[11], &clamp_lo, &clamp_hi); + addsub_avx2(u[12], u[14], &u[12], &u[14], &clamp_lo, &clamp_hi); + addsub_avx2(u[13], u[15], &u[13], &u[15], &clamp_lo, &clamp_hi); + + // stage 8 + y = _mm256_mullo_epi32(u[2], cospi32); + x = _mm256_mullo_epi32(u[3], cospi32); + u[2] = _mm256_add_epi32(y, x); + u[2] = _mm256_add_epi32(u[2], rnding); + u[2] = _mm256_srai_epi32(u[2], bit); + + u[3] = _mm256_sub_epi32(y, x); + u[3] = _mm256_add_epi32(u[3], rnding); + u[3] = _mm256_srai_epi32(u[3], bit); + y = _mm256_mullo_epi32(u[6], cospi32); + x = _mm256_mullo_epi32(u[7], cospi32); + u[6] = _mm256_add_epi32(y, x); + u[6] = _mm256_add_epi32(u[6], rnding); + u[6] = _mm256_srai_epi32(u[6], bit); + + u[7] = _mm256_sub_epi32(y, x); + u[7] = _mm256_add_epi32(u[7], rnding); + u[7] = _mm256_srai_epi32(u[7], bit); + + y = 
_mm256_mullo_epi32(u[10], cospi32); + x = _mm256_mullo_epi32(u[11], cospi32); + u[10] = _mm256_add_epi32(y, x); + u[10] = _mm256_add_epi32(u[10], rnding); + u[10] = _mm256_srai_epi32(u[10], bit); + + u[11] = _mm256_sub_epi32(y, x); + u[11] = _mm256_add_epi32(u[11], rnding); + u[11] = _mm256_srai_epi32(u[11], bit); + + y = _mm256_mullo_epi32(u[14], cospi32); + x = _mm256_mullo_epi32(u[15], cospi32); + u[14] = _mm256_add_epi32(y, x); + u[14] = _mm256_add_epi32(u[14], rnding); + u[14] = _mm256_srai_epi32(u[14], bit); + + u[15] = _mm256_sub_epi32(y, x); + u[15] = _mm256_add_epi32(u[15], rnding); + u[15] = _mm256_srai_epi32(u[15], bit); + + // stage 9 + if (do_cols) { + out[0] = u[0]; + out[1] = _mm256_sub_epi32(_mm256_setzero_si256(), u[8]); + out[2] = u[12]; + out[3] = _mm256_sub_epi32(_mm256_setzero_si256(), u[4]); + out[4] = u[6]; + out[5] = _mm256_sub_epi32(_mm256_setzero_si256(), u[14]); + out[6] = u[10]; + out[7] = _mm256_sub_epi32(_mm256_setzero_si256(), u[2]); + out[8] = u[3]; + out[9] = _mm256_sub_epi32(_mm256_setzero_si256(), u[11]); + out[10] = u[15]; + out[11] = _mm256_sub_epi32(_mm256_setzero_si256(), u[7]); + out[12] = u[5]; + out[13] = _mm256_sub_epi32(_mm256_setzero_si256(), u[13]); + out[14] = u[9]; + out[15] = _mm256_sub_epi32(_mm256_setzero_si256(), u[1]); + } else { + const int log_range_out = AOMMAX(16, bd + 6); + const __m256i clamp_lo_out = + _mm256_set1_epi32(-(1 << (log_range_out - 1))); + const __m256i clamp_hi_out = + _mm256_set1_epi32((1 << (log_range_out - 1)) - 1); + + neg_shift_avx2(u[0], u[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_avx2(u[12], u[4], out + 2, out + 3, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_avx2(u[6], u[14], out + 4, out + 5, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_avx2(u[10], u[2], out + 6, out + 7, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_avx2(u[3], u[11], out + 8, out + 9, &clamp_lo_out, + &clamp_hi_out, out_shift); + 
neg_shift_avx2(u[15], u[7], out + 10, out + 11, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_avx2(u[5], u[13], out + 12, out + 13, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_avx2(u[9], u[1], out + 14, out + 15, &clamp_lo_out, + &clamp_hi_out, out_shift); + } + } +} + +static void iadst16_avx2(__m256i *in, __m256i *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m256i cospi2 = _mm256_set1_epi32(cospi[2]); + const __m256i cospi62 = _mm256_set1_epi32(cospi[62]); + const __m256i cospi10 = _mm256_set1_epi32(cospi[10]); + const __m256i cospi54 = _mm256_set1_epi32(cospi[54]); + const __m256i cospi18 = _mm256_set1_epi32(cospi[18]); + const __m256i cospi46 = _mm256_set1_epi32(cospi[46]); + const __m256i cospi26 = _mm256_set1_epi32(cospi[26]); + const __m256i cospi38 = _mm256_set1_epi32(cospi[38]); + const __m256i cospi34 = _mm256_set1_epi32(cospi[34]); + const __m256i cospi30 = _mm256_set1_epi32(cospi[30]); + const __m256i cospi42 = _mm256_set1_epi32(cospi[42]); + const __m256i cospi22 = _mm256_set1_epi32(cospi[22]); + const __m256i cospi50 = _mm256_set1_epi32(cospi[50]); + const __m256i cospi14 = _mm256_set1_epi32(cospi[14]); + const __m256i cospi58 = _mm256_set1_epi32(cospi[58]); + const __m256i cospi6 = _mm256_set1_epi32(cospi[6]); + const __m256i cospi8 = _mm256_set1_epi32(cospi[8]); + const __m256i cospi56 = _mm256_set1_epi32(cospi[56]); + const __m256i cospi40 = _mm256_set1_epi32(cospi[40]); + const __m256i cospi24 = _mm256_set1_epi32(cospi[24]); + const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]); + const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]); + const __m256i cospi48 = _mm256_set1_epi32(cospi[48]); + const __m256i cospi16 = _mm256_set1_epi32(cospi[16]); + const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]); + const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); + const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols 
? 6 : 8)); + const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1))); + const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1); + __m256i u[16], v[16], x, y; + + { + // stage 0 + // stage 1 + // stage 2 + v[0] = _mm256_mullo_epi32(in[15], cospi2); + x = _mm256_mullo_epi32(in[0], cospi62); + v[0] = _mm256_add_epi32(v[0], x); + v[0] = _mm256_add_epi32(v[0], rnding); + v[0] = _mm256_srai_epi32(v[0], bit); + + v[1] = _mm256_mullo_epi32(in[15], cospi62); + x = _mm256_mullo_epi32(in[0], cospi2); + v[1] = _mm256_sub_epi32(v[1], x); + v[1] = _mm256_add_epi32(v[1], rnding); + v[1] = _mm256_srai_epi32(v[1], bit); + + v[2] = _mm256_mullo_epi32(in[13], cospi10); + x = _mm256_mullo_epi32(in[2], cospi54); + v[2] = _mm256_add_epi32(v[2], x); + v[2] = _mm256_add_epi32(v[2], rnding); + v[2] = _mm256_srai_epi32(v[2], bit); + + v[3] = _mm256_mullo_epi32(in[13], cospi54); + x = _mm256_mullo_epi32(in[2], cospi10); + v[3] = _mm256_sub_epi32(v[3], x); + v[3] = _mm256_add_epi32(v[3], rnding); + v[3] = _mm256_srai_epi32(v[3], bit); + + v[4] = _mm256_mullo_epi32(in[11], cospi18); + x = _mm256_mullo_epi32(in[4], cospi46); + v[4] = _mm256_add_epi32(v[4], x); + v[4] = _mm256_add_epi32(v[4], rnding); + v[4] = _mm256_srai_epi32(v[4], bit); + + v[5] = _mm256_mullo_epi32(in[11], cospi46); + x = _mm256_mullo_epi32(in[4], cospi18); + v[5] = _mm256_sub_epi32(v[5], x); + v[5] = _mm256_add_epi32(v[5], rnding); + v[5] = _mm256_srai_epi32(v[5], bit); + + v[6] = _mm256_mullo_epi32(in[9], cospi26); + x = _mm256_mullo_epi32(in[6], cospi38); + v[6] = _mm256_add_epi32(v[6], x); + v[6] = _mm256_add_epi32(v[6], rnding); + v[6] = _mm256_srai_epi32(v[6], bit); + + v[7] = _mm256_mullo_epi32(in[9], cospi38); + x = _mm256_mullo_epi32(in[6], cospi26); + v[7] = _mm256_sub_epi32(v[7], x); + v[7] = _mm256_add_epi32(v[7], rnding); + v[7] = _mm256_srai_epi32(v[7], bit); + + v[8] = _mm256_mullo_epi32(in[7], cospi34); + x = _mm256_mullo_epi32(in[8], cospi30); + v[8] = _mm256_add_epi32(v[8], x); + 
v[8] = _mm256_add_epi32(v[8], rnding); + v[8] = _mm256_srai_epi32(v[8], bit); + + v[9] = _mm256_mullo_epi32(in[7], cospi30); + x = _mm256_mullo_epi32(in[8], cospi34); + v[9] = _mm256_sub_epi32(v[9], x); + v[9] = _mm256_add_epi32(v[9], rnding); + v[9] = _mm256_srai_epi32(v[9], bit); + + v[10] = _mm256_mullo_epi32(in[5], cospi42); + x = _mm256_mullo_epi32(in[10], cospi22); + v[10] = _mm256_add_epi32(v[10], x); + v[10] = _mm256_add_epi32(v[10], rnding); + v[10] = _mm256_srai_epi32(v[10], bit); + + v[11] = _mm256_mullo_epi32(in[5], cospi22); + x = _mm256_mullo_epi32(in[10], cospi42); + v[11] = _mm256_sub_epi32(v[11], x); + v[11] = _mm256_add_epi32(v[11], rnding); + v[11] = _mm256_srai_epi32(v[11], bit); + + v[12] = _mm256_mullo_epi32(in[3], cospi50); + x = _mm256_mullo_epi32(in[12], cospi14); + v[12] = _mm256_add_epi32(v[12], x); + v[12] = _mm256_add_epi32(v[12], rnding); + v[12] = _mm256_srai_epi32(v[12], bit); + + v[13] = _mm256_mullo_epi32(in[3], cospi14); + x = _mm256_mullo_epi32(in[12], cospi50); + v[13] = _mm256_sub_epi32(v[13], x); + v[13] = _mm256_add_epi32(v[13], rnding); + v[13] = _mm256_srai_epi32(v[13], bit); + + v[14] = _mm256_mullo_epi32(in[1], cospi58); + x = _mm256_mullo_epi32(in[14], cospi6); + v[14] = _mm256_add_epi32(v[14], x); + v[14] = _mm256_add_epi32(v[14], rnding); + v[14] = _mm256_srai_epi32(v[14], bit); + + v[15] = _mm256_mullo_epi32(in[1], cospi6); + x = _mm256_mullo_epi32(in[14], cospi58); + v[15] = _mm256_sub_epi32(v[15], x); + v[15] = _mm256_add_epi32(v[15], rnding); + v[15] = _mm256_srai_epi32(v[15], bit); + + // stage 3 + addsub_avx2(v[0], v[8], &u[0], &u[8], &clamp_lo, &clamp_hi); + addsub_avx2(v[1], v[9], &u[1], &u[9], &clamp_lo, &clamp_hi); + addsub_avx2(v[2], v[10], &u[2], &u[10], &clamp_lo, &clamp_hi); + addsub_avx2(v[3], v[11], &u[3], &u[11], &clamp_lo, &clamp_hi); + addsub_avx2(v[4], v[12], &u[4], &u[12], &clamp_lo, &clamp_hi); + addsub_avx2(v[5], v[13], &u[5], &u[13], &clamp_lo, &clamp_hi); + addsub_avx2(v[6], v[14], &u[6], 
&u[14], &clamp_lo, &clamp_hi); + addsub_avx2(v[7], v[15], &u[7], &u[15], &clamp_lo, &clamp_hi); + + // stage 4 + v[0] = u[0]; + v[1] = u[1]; + v[2] = u[2]; + v[3] = u[3]; + v[4] = u[4]; + v[5] = u[5]; + v[6] = u[6]; + v[7] = u[7]; + + v[8] = _mm256_mullo_epi32(u[8], cospi8); + x = _mm256_mullo_epi32(u[9], cospi56); + v[8] = _mm256_add_epi32(v[8], x); + v[8] = _mm256_add_epi32(v[8], rnding); + v[8] = _mm256_srai_epi32(v[8], bit); + + v[9] = _mm256_mullo_epi32(u[8], cospi56); + x = _mm256_mullo_epi32(u[9], cospi8); + v[9] = _mm256_sub_epi32(v[9], x); + v[9] = _mm256_add_epi32(v[9], rnding); + v[9] = _mm256_srai_epi32(v[9], bit); + + v[10] = _mm256_mullo_epi32(u[10], cospi40); + x = _mm256_mullo_epi32(u[11], cospi24); + v[10] = _mm256_add_epi32(v[10], x); + v[10] = _mm256_add_epi32(v[10], rnding); + v[10] = _mm256_srai_epi32(v[10], bit); + + v[11] = _mm256_mullo_epi32(u[10], cospi24); + x = _mm256_mullo_epi32(u[11], cospi40); + v[11] = _mm256_sub_epi32(v[11], x); + v[11] = _mm256_add_epi32(v[11], rnding); + v[11] = _mm256_srai_epi32(v[11], bit); + + v[12] = _mm256_mullo_epi32(u[12], cospim56); + x = _mm256_mullo_epi32(u[13], cospi8); + v[12] = _mm256_add_epi32(v[12], x); + v[12] = _mm256_add_epi32(v[12], rnding); + v[12] = _mm256_srai_epi32(v[12], bit); + + v[13] = _mm256_mullo_epi32(u[12], cospi8); + x = _mm256_mullo_epi32(u[13], cospim56); + v[13] = _mm256_sub_epi32(v[13], x); + v[13] = _mm256_add_epi32(v[13], rnding); + v[13] = _mm256_srai_epi32(v[13], bit); + + v[14] = _mm256_mullo_epi32(u[14], cospim24); + x = _mm256_mullo_epi32(u[15], cospi40); + v[14] = _mm256_add_epi32(v[14], x); + v[14] = _mm256_add_epi32(v[14], rnding); + v[14] = _mm256_srai_epi32(v[14], bit); + + v[15] = _mm256_mullo_epi32(u[14], cospi40); + x = _mm256_mullo_epi32(u[15], cospim24); + v[15] = _mm256_sub_epi32(v[15], x); + v[15] = _mm256_add_epi32(v[15], rnding); + v[15] = _mm256_srai_epi32(v[15], bit); + + // stage 5 + addsub_avx2(v[0], v[4], &u[0], &u[4], &clamp_lo, &clamp_hi); + 
addsub_avx2(v[1], v[5], &u[1], &u[5], &clamp_lo, &clamp_hi); + addsub_avx2(v[2], v[6], &u[2], &u[6], &clamp_lo, &clamp_hi); + addsub_avx2(v[3], v[7], &u[3], &u[7], &clamp_lo, &clamp_hi); + addsub_avx2(v[8], v[12], &u[8], &u[12], &clamp_lo, &clamp_hi); + addsub_avx2(v[9], v[13], &u[9], &u[13], &clamp_lo, &clamp_hi); + addsub_avx2(v[10], v[14], &u[10], &u[14], &clamp_lo, &clamp_hi); + addsub_avx2(v[11], v[15], &u[11], &u[15], &clamp_lo, &clamp_hi); + + // stage 6 + v[0] = u[0]; + v[1] = u[1]; + v[2] = u[2]; + v[3] = u[3]; + + v[4] = _mm256_mullo_epi32(u[4], cospi16); + x = _mm256_mullo_epi32(u[5], cospi48); + v[4] = _mm256_add_epi32(v[4], x); + v[4] = _mm256_add_epi32(v[4], rnding); + v[4] = _mm256_srai_epi32(v[4], bit); + + v[5] = _mm256_mullo_epi32(u[4], cospi48); + x = _mm256_mullo_epi32(u[5], cospi16); + v[5] = _mm256_sub_epi32(v[5], x); + v[5] = _mm256_add_epi32(v[5], rnding); + v[5] = _mm256_srai_epi32(v[5], bit); + + v[6] = _mm256_mullo_epi32(u[6], cospim48); + x = _mm256_mullo_epi32(u[7], cospi16); + v[6] = _mm256_add_epi32(v[6], x); + v[6] = _mm256_add_epi32(v[6], rnding); + v[6] = _mm256_srai_epi32(v[6], bit); + + v[7] = _mm256_mullo_epi32(u[6], cospi16); + x = _mm256_mullo_epi32(u[7], cospim48); + v[7] = _mm256_sub_epi32(v[7], x); + v[7] = _mm256_add_epi32(v[7], rnding); + v[7] = _mm256_srai_epi32(v[7], bit); + + v[8] = u[8]; + v[9] = u[9]; + v[10] = u[10]; + v[11] = u[11]; + + v[12] = _mm256_mullo_epi32(u[12], cospi16); + x = _mm256_mullo_epi32(u[13], cospi48); + v[12] = _mm256_add_epi32(v[12], x); + v[12] = _mm256_add_epi32(v[12], rnding); + v[12] = _mm256_srai_epi32(v[12], bit); + + v[13] = _mm256_mullo_epi32(u[12], cospi48); + x = _mm256_mullo_epi32(u[13], cospi16); + v[13] = _mm256_sub_epi32(v[13], x); + v[13] = _mm256_add_epi32(v[13], rnding); + v[13] = _mm256_srai_epi32(v[13], bit); + + v[14] = _mm256_mullo_epi32(u[14], cospim48); + x = _mm256_mullo_epi32(u[15], cospi16); + v[14] = _mm256_add_epi32(v[14], x); + v[14] = _mm256_add_epi32(v[14], 
rnding); + v[14] = _mm256_srai_epi32(v[14], bit); + + v[15] = _mm256_mullo_epi32(u[14], cospi16); + x = _mm256_mullo_epi32(u[15], cospim48); + v[15] = _mm256_sub_epi32(v[15], x); + v[15] = _mm256_add_epi32(v[15], rnding); + v[15] = _mm256_srai_epi32(v[15], bit); + + // stage 7 + addsub_avx2(v[0], v[2], &u[0], &u[2], &clamp_lo, &clamp_hi); + addsub_avx2(v[1], v[3], &u[1], &u[3], &clamp_lo, &clamp_hi); + addsub_avx2(v[4], v[6], &u[4], &u[6], &clamp_lo, &clamp_hi); + addsub_avx2(v[5], v[7], &u[5], &u[7], &clamp_lo, &clamp_hi); + addsub_avx2(v[8], v[10], &u[8], &u[10], &clamp_lo, &clamp_hi); + addsub_avx2(v[9], v[11], &u[9], &u[11], &clamp_lo, &clamp_hi); + addsub_avx2(v[12], v[14], &u[12], &u[14], &clamp_lo, &clamp_hi); + addsub_avx2(v[13], v[15], &u[13], &u[15], &clamp_lo, &clamp_hi); + + // stage 8 + v[0] = u[0]; + v[1] = u[1]; + + y = _mm256_mullo_epi32(u[2], cospi32); + x = _mm256_mullo_epi32(u[3], cospi32); + v[2] = _mm256_add_epi32(y, x); + v[2] = _mm256_add_epi32(v[2], rnding); + v[2] = _mm256_srai_epi32(v[2], bit); + + v[3] = _mm256_sub_epi32(y, x); + v[3] = _mm256_add_epi32(v[3], rnding); + v[3] = _mm256_srai_epi32(v[3], bit); + + v[4] = u[4]; + v[5] = u[5]; + + y = _mm256_mullo_epi32(u[6], cospi32); + x = _mm256_mullo_epi32(u[7], cospi32); + v[6] = _mm256_add_epi32(y, x); + v[6] = _mm256_add_epi32(v[6], rnding); + v[6] = _mm256_srai_epi32(v[6], bit); + + v[7] = _mm256_sub_epi32(y, x); + v[7] = _mm256_add_epi32(v[7], rnding); + v[7] = _mm256_srai_epi32(v[7], bit); + + v[8] = u[8]; + v[9] = u[9]; + + y = _mm256_mullo_epi32(u[10], cospi32); + x = _mm256_mullo_epi32(u[11], cospi32); + v[10] = _mm256_add_epi32(y, x); + v[10] = _mm256_add_epi32(v[10], rnding); + v[10] = _mm256_srai_epi32(v[10], bit); + + v[11] = _mm256_sub_epi32(y, x); + v[11] = _mm256_add_epi32(v[11], rnding); + v[11] = _mm256_srai_epi32(v[11], bit); + + v[12] = u[12]; + v[13] = u[13]; + + y = _mm256_mullo_epi32(u[14], cospi32); + x = _mm256_mullo_epi32(u[15], cospi32); + v[14] = 
_mm256_add_epi32(y, x); + v[14] = _mm256_add_epi32(v[14], rnding); + v[14] = _mm256_srai_epi32(v[14], bit); + + v[15] = _mm256_sub_epi32(y, x); + v[15] = _mm256_add_epi32(v[15], rnding); + v[15] = _mm256_srai_epi32(v[15], bit); + + // stage 9 if (do_cols) { - addsub_no_clamp_avx2(bf0[0], bf0[31], out + 0, out + 31); - addsub_no_clamp_avx2(bf0[1], bf0[30], out + 1, out + 30); - addsub_no_clamp_avx2(bf0[2], bf0[29], out + 2, out + 29); - addsub_no_clamp_avx2(bf0[3], bf0[28], out + 3, out + 28); - addsub_no_clamp_avx2(bf0[4], bf0[27], out + 4, out + 27); - addsub_no_clamp_avx2(bf0[5], bf0[26], out + 5, out + 26); - addsub_no_clamp_avx2(bf0[6], bf0[25], out + 6, out + 25); - addsub_no_clamp_avx2(bf0[7], bf0[24], out + 7, out + 24); - addsub_no_clamp_avx2(bf0[8], bf0[23], out + 8, out + 23); - addsub_no_clamp_avx2(bf0[9], bf0[22], out + 9, out + 22); - addsub_no_clamp_avx2(bf0[10], bf0[21], out + 10, out + 21); - addsub_no_clamp_avx2(bf0[11], bf0[20], out + 11, out + 20); - addsub_no_clamp_avx2(bf0[12], bf0[19], out + 12, out + 19); - addsub_no_clamp_avx2(bf0[13], bf0[18], out + 13, out + 18); - addsub_no_clamp_avx2(bf0[14], bf0[17], out + 14, out + 17); - addsub_no_clamp_avx2(bf0[15], bf0[16], out + 15, out + 16); + out[0] = v[0]; + out[1] = _mm256_sub_epi32(_mm256_setzero_si256(), v[8]); + out[2] = v[12]; + out[3] = _mm256_sub_epi32(_mm256_setzero_si256(), v[4]); + out[4] = v[6]; + out[5] = _mm256_sub_epi32(_mm256_setzero_si256(), v[14]); + out[6] = v[10]; + out[7] = _mm256_sub_epi32(_mm256_setzero_si256(), v[2]); + out[8] = v[3]; + out[9] = _mm256_sub_epi32(_mm256_setzero_si256(), v[11]); + out[10] = v[15]; + out[11] = _mm256_sub_epi32(_mm256_setzero_si256(), v[7]); + out[12] = v[5]; + out[13] = _mm256_sub_epi32(_mm256_setzero_si256(), v[13]); + out[14] = v[9]; + out[15] = _mm256_sub_epi32(_mm256_setzero_si256(), v[1]); } else { const int log_range_out = AOMMAX(16, bd + 6); - const __m256i clamp_lo_out = _mm256_set1_epi32(AOMMAX( - -(1 << (log_range_out - 1)), -(1 << 
(log_range - 1 - out_shift)))); - const __m256i clamp_hi_out = _mm256_set1_epi32(AOMMIN( - (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift)))); - - addsub_shift_avx2(bf0[0], bf0[31], out + 0, out + 31, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_avx2(bf0[1], bf0[30], out + 1, out + 30, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_avx2(bf0[2], bf0[29], out + 2, out + 29, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_avx2(bf0[3], bf0[28], out + 3, out + 28, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_avx2(bf0[4], bf0[27], out + 4, out + 27, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_avx2(bf0[5], bf0[26], out + 5, out + 26, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_avx2(bf0[6], bf0[25], out + 6, out + 25, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_avx2(bf0[7], bf0[24], out + 7, out + 24, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_avx2(bf0[8], bf0[23], out + 8, out + 23, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_avx2(bf0[9], bf0[22], out + 9, out + 22, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_avx2(bf0[10], bf0[21], out + 10, out + 21, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_avx2(bf0[11], bf0[20], out + 11, out + 20, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_avx2(bf0[12], bf0[19], out + 12, out + 19, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_avx2(bf0[13], bf0[18], out + 13, out + 18, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_avx2(bf0[14], bf0[17], out + 14, out + 17, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_avx2(bf0[15], bf0[16], out + 15, out + 16, &clamp_lo_out, - &clamp_hi_out, out_shift); + const __m256i clamp_lo_out = + _mm256_set1_epi32(-(1 << (log_range_out - 1))); + const __m256i clamp_hi_out = + _mm256_set1_epi32((1 << (log_range_out - 1)) - 1); + + neg_shift_avx2(v[0], v[8], out + 0, out + 1, 
&clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_avx2(v[12], v[4], out + 2, out + 3, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_avx2(v[6], v[14], out + 4, out + 5, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_avx2(v[10], v[2], out + 6, out + 7, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_avx2(v[3], v[11], out + 8, out + 9, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_avx2(v[15], v[7], out + 10, out + 11, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_avx2(v[5], v[13], out + 12, out + 13, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_avx2(v[9], v[1], out + 14, out + 15, &clamp_lo_out, + &clamp_hi_out, out_shift); + } + } +} +static void idct8x8_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); + const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); + __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1))); + __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1); + __m256i x; + + // stage 0 + // stage 1 + // stage 2 + // stage 3 + x = _mm256_mullo_epi32(in[0], cospi32); + x = _mm256_add_epi32(x, rnding); + x = _mm256_srai_epi32(x, bit); + + // stage 4 + // stage 5 + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + __m256i offset = _mm256_set1_epi32((1 << out_shift) >> 1); + clamp_lo = _mm256_set1_epi32(-(1 << (log_range_out - 1))); + clamp_hi = _mm256_set1_epi32((1 << (log_range_out - 1)) - 1); + x = _mm256_add_epi32(x, offset); + x = _mm256_sra_epi32(x, _mm_cvtsi32_si128(out_shift)); + } + x = _mm256_max_epi32(x, clamp_lo); + x = _mm256_min_epi32(x, clamp_hi); + out[0] = x; + out[1] = x; + out[2] = x; + out[3] = x; + out[4] = x; + out[5] = x; + out[6] = x; + out[7] = x; +} +static void idct8x8_avx2(__m256i *in, __m256i *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m256i cospi56 = _mm256_set1_epi32(cospi[56]); + const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]); + const __m256i cospi24 = _mm256_set1_epi32(cospi[24]); + const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]); + const __m256i cospi40 = _mm256_set1_epi32(cospi[40]); + const __m256i cospi8 = _mm256_set1_epi32(cospi[8]); + const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); + const __m256i cospi48 = _mm256_set1_epi32(cospi[48]); + const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]); + const __m256i cospi16 = _mm256_set1_epi32(cospi[16]); + const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); + const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1))); + const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1); + __m256i u0, u1, u2, u3, u4, u5, u6, u7; + __m256i v0, v1, v2, v3, v4, v5, v6, v7; + __m256i x, y; + + // stage 0 + // stage 1 + // stage 2 + u0 = in[0]; + u1 = in[4]; + u2 = in[2]; + u3 = in[6]; + + x = _mm256_mullo_epi32(in[1], cospi56); + y = _mm256_mullo_epi32(in[7], cospim8); + u4 = _mm256_add_epi32(x, y); + u4 = _mm256_add_epi32(u4, rnding); + u4 = _mm256_srai_epi32(u4, bit); + + x = _mm256_mullo_epi32(in[1], cospi8); + y = _mm256_mullo_epi32(in[7], cospi56); + u7 = _mm256_add_epi32(x, y); + u7 = _mm256_add_epi32(u7, rnding); + u7 = _mm256_srai_epi32(u7, bit); + + x = _mm256_mullo_epi32(in[5], cospi24); + y = _mm256_mullo_epi32(in[3], cospim40); + u5 = _mm256_add_epi32(x, y); + u5 = _mm256_add_epi32(u5, rnding); + u5 = _mm256_srai_epi32(u5, bit); + + x = _mm256_mullo_epi32(in[5], cospi40); + y = _mm256_mullo_epi32(in[3], cospi24); + u6 = _mm256_add_epi32(x, y); + u6 = _mm256_add_epi32(u6, rnding); + u6 = _mm256_srai_epi32(u6, bit); + + // stage 3 + x = _mm256_mullo_epi32(u0, cospi32); + y = _mm256_mullo_epi32(u1, cospi32); + v0 = _mm256_add_epi32(x, y); + v0 = _mm256_add_epi32(v0, rnding); + v0 = _mm256_srai_epi32(v0, bit); + + v1 = _mm256_sub_epi32(x, y); + v1 = _mm256_add_epi32(v1, rnding); + v1 = _mm256_srai_epi32(v1, bit); + + x = _mm256_mullo_epi32(u2, cospi48); + y = _mm256_mullo_epi32(u3, cospim16); + v2 = _mm256_add_epi32(x, y); + v2 = _mm256_add_epi32(v2, rnding); + v2 = _mm256_srai_epi32(v2, bit); + + x = _mm256_mullo_epi32(u2, cospi16); + y = _mm256_mullo_epi32(u3, cospi48); + v3 = _mm256_add_epi32(x, y); + v3 = _mm256_add_epi32(v3, rnding); + v3 = _mm256_srai_epi32(v3, bit); + + addsub_avx2(u4, u5, &v4, &v5, &clamp_lo, &clamp_hi); + addsub_avx2(u7, u6, &v7, &v6, &clamp_lo, &clamp_hi); + + // stage 4 + addsub_avx2(v0, v3, &u0, &u3, &clamp_lo, &clamp_hi); + addsub_avx2(v1, v2, &u1, &u2, &clamp_lo, 
&clamp_hi); + u4 = v4; + u7 = v7; + + x = _mm256_mullo_epi32(v5, cospi32); + y = _mm256_mullo_epi32(v6, cospi32); + u6 = _mm256_add_epi32(y, x); + u6 = _mm256_add_epi32(u6, rnding); + u6 = _mm256_srai_epi32(u6, bit); + + u5 = _mm256_sub_epi32(y, x); + u5 = _mm256_add_epi32(u5, rnding); + u5 = _mm256_srai_epi32(u5, bit); + + addsub_avx2(u0, u7, out + 0, out + 7, &clamp_lo, &clamp_hi); + addsub_avx2(u1, u6, out + 1, out + 6, &clamp_lo, &clamp_hi); + addsub_avx2(u2, u5, out + 2, out + 5, &clamp_lo, &clamp_hi); + addsub_avx2(u3, u4, out + 3, out + 4, &clamp_lo, &clamp_hi); + // stage 5 + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + const __m256i clamp_lo_out = _mm256_set1_epi32(-(1 << (log_range_out - 1))); + const __m256i clamp_hi_out = + _mm256_set1_epi32((1 << (log_range_out - 1)) - 1); + + round_shift_4x4_avx2(out, out_shift); + round_shift_4x4_avx2(out + 4, out_shift); + highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 8); + } +} +static void iadst8x8_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m256i cospi4 = _mm256_set1_epi32(cospi[4]); + const __m256i cospi60 = _mm256_set1_epi32(cospi[60]); + const __m256i cospi16 = _mm256_set1_epi32(cospi[16]); + const __m256i cospi48 = _mm256_set1_epi32(cospi[48]); + const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); + const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1)); + const __m256i kZero = _mm256_setzero_si256(); + __m256i u[8], x; + + // stage 0 + // stage 1 + // stage 2 + + x = _mm256_mullo_epi32(in[0], cospi60); + u[0] = _mm256_add_epi32(x, rnding); + u[0] = _mm256_srai_epi32(u[0], bit); + + x = _mm256_mullo_epi32(in[0], cospi4); + u[1] = _mm256_sub_epi32(kZero, x); + u[1] = _mm256_add_epi32(u[1], rnding); + u[1] = _mm256_srai_epi32(u[1], bit); + + // stage 3 + // stage 4 + __m256i temp1, temp2; + temp1 = _mm256_mullo_epi32(u[0], cospi16); + x = _mm256_mullo_epi32(u[1], cospi48); + 
temp1 = _mm256_add_epi32(temp1, x); + temp1 = _mm256_add_epi32(temp1, rnding); + temp1 = _mm256_srai_epi32(temp1, bit); + u[4] = temp1; + + temp2 = _mm256_mullo_epi32(u[0], cospi48); + x = _mm256_mullo_epi32(u[1], cospi16); + u[5] = _mm256_sub_epi32(temp2, x); + u[5] = _mm256_add_epi32(u[5], rnding); + u[5] = _mm256_srai_epi32(u[5], bit); + + // stage 5 + // stage 6 + temp1 = _mm256_mullo_epi32(u[0], cospi32); + x = _mm256_mullo_epi32(u[1], cospi32); + u[2] = _mm256_add_epi32(temp1, x); + u[2] = _mm256_add_epi32(u[2], rnding); + u[2] = _mm256_srai_epi32(u[2], bit); + + u[3] = _mm256_sub_epi32(temp1, x); + u[3] = _mm256_add_epi32(u[3], rnding); + u[3] = _mm256_srai_epi32(u[3], bit); + + temp1 = _mm256_mullo_epi32(u[4], cospi32); + x = _mm256_mullo_epi32(u[5], cospi32); + u[6] = _mm256_add_epi32(temp1, x); + u[6] = _mm256_add_epi32(u[6], rnding); + u[6] = _mm256_srai_epi32(u[6], bit); + + u[7] = _mm256_sub_epi32(temp1, x); + u[7] = _mm256_add_epi32(u[7], rnding); + u[7] = _mm256_srai_epi32(u[7], bit); + + // stage 7 + if (do_cols) { + out[0] = u[0]; + out[1] = _mm256_sub_epi32(kZero, u[4]); + out[2] = u[6]; + out[3] = _mm256_sub_epi32(kZero, u[2]); + out[4] = u[3]; + out[5] = _mm256_sub_epi32(kZero, u[7]); + out[6] = u[5]; + out[7] = _mm256_sub_epi32(kZero, u[1]); + } else { + const int log_range_out = AOMMAX(16, bd + 6); + const __m256i clamp_lo_out = _mm256_set1_epi32(-(1 << (log_range_out - 1))); + const __m256i clamp_hi_out = + _mm256_set1_epi32((1 << (log_range_out - 1)) - 1); + + neg_shift_avx2(u[0], u[4], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_avx2(u[6], u[2], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_avx2(u[3], u[7], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_avx2(u[5], u[1], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out, + out_shift); + } +} + +static void iadst8x8_avx2(__m256i *in, __m256i *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t 
*cospi = cospi_arr(bit); + const __m256i cospi4 = _mm256_set1_epi32(cospi[4]); + const __m256i cospi60 = _mm256_set1_epi32(cospi[60]); + const __m256i cospi20 = _mm256_set1_epi32(cospi[20]); + const __m256i cospi44 = _mm256_set1_epi32(cospi[44]); + const __m256i cospi36 = _mm256_set1_epi32(cospi[36]); + const __m256i cospi28 = _mm256_set1_epi32(cospi[28]); + const __m256i cospi52 = _mm256_set1_epi32(cospi[52]); + const __m256i cospi12 = _mm256_set1_epi32(cospi[12]); + const __m256i cospi16 = _mm256_set1_epi32(cospi[16]); + const __m256i cospi48 = _mm256_set1_epi32(cospi[48]); + const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]); + const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); + const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1)); + const __m256i kZero = _mm256_setzero_si256(); + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1))); + const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1); + __m256i u[8], v[8], x; + + // stage 0 + // stage 1 + // stage 2 + + u[0] = _mm256_mullo_epi32(in[7], cospi4); + x = _mm256_mullo_epi32(in[0], cospi60); + u[0] = _mm256_add_epi32(u[0], x); + u[0] = _mm256_add_epi32(u[0], rnding); + u[0] = _mm256_srai_epi32(u[0], bit); + + u[1] = _mm256_mullo_epi32(in[7], cospi60); + x = _mm256_mullo_epi32(in[0], cospi4); + u[1] = _mm256_sub_epi32(u[1], x); + u[1] = _mm256_add_epi32(u[1], rnding); + u[1] = _mm256_srai_epi32(u[1], bit); + + u[2] = _mm256_mullo_epi32(in[5], cospi20); + x = _mm256_mullo_epi32(in[2], cospi44); + u[2] = _mm256_add_epi32(u[2], x); + u[2] = _mm256_add_epi32(u[2], rnding); + u[2] = _mm256_srai_epi32(u[2], bit); + + u[3] = _mm256_mullo_epi32(in[5], cospi44); + x = _mm256_mullo_epi32(in[2], cospi20); + u[3] = _mm256_sub_epi32(u[3], x); + u[3] = _mm256_add_epi32(u[3], rnding); + u[3] = _mm256_srai_epi32(u[3], bit); + + u[4] = _mm256_mullo_epi32(in[3], cospi36); + x = _mm256_mullo_epi32(in[4], cospi28); + u[4] = 
_mm256_add_epi32(u[4], x); + u[4] = _mm256_add_epi32(u[4], rnding); + u[4] = _mm256_srai_epi32(u[4], bit); + + u[5] = _mm256_mullo_epi32(in[3], cospi28); + x = _mm256_mullo_epi32(in[4], cospi36); + u[5] = _mm256_sub_epi32(u[5], x); + u[5] = _mm256_add_epi32(u[5], rnding); + u[5] = _mm256_srai_epi32(u[5], bit); + + u[6] = _mm256_mullo_epi32(in[1], cospi52); + x = _mm256_mullo_epi32(in[6], cospi12); + u[6] = _mm256_add_epi32(u[6], x); + u[6] = _mm256_add_epi32(u[6], rnding); + u[6] = _mm256_srai_epi32(u[6], bit); + + u[7] = _mm256_mullo_epi32(in[1], cospi12); + x = _mm256_mullo_epi32(in[6], cospi52); + u[7] = _mm256_sub_epi32(u[7], x); + u[7] = _mm256_add_epi32(u[7], rnding); + u[7] = _mm256_srai_epi32(u[7], bit); + + // stage 3 + addsub_avx2(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi); + addsub_avx2(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi); + addsub_avx2(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi); + addsub_avx2(u[3], u[7], &v[3], &v[7], &clamp_lo, &clamp_hi); + + // stage 4 + u[0] = v[0]; + u[1] = v[1]; + u[2] = v[2]; + u[3] = v[3]; + + u[4] = _mm256_mullo_epi32(v[4], cospi16); + x = _mm256_mullo_epi32(v[5], cospi48); + u[4] = _mm256_add_epi32(u[4], x); + u[4] = _mm256_add_epi32(u[4], rnding); + u[4] = _mm256_srai_epi32(u[4], bit); + + u[5] = _mm256_mullo_epi32(v[4], cospi48); + x = _mm256_mullo_epi32(v[5], cospi16); + u[5] = _mm256_sub_epi32(u[5], x); + u[5] = _mm256_add_epi32(u[5], rnding); + u[5] = _mm256_srai_epi32(u[5], bit); + + u[6] = _mm256_mullo_epi32(v[6], cospim48); + x = _mm256_mullo_epi32(v[7], cospi16); + u[6] = _mm256_add_epi32(u[6], x); + u[6] = _mm256_add_epi32(u[6], rnding); + u[6] = _mm256_srai_epi32(u[6], bit); + + u[7] = _mm256_mullo_epi32(v[6], cospi16); + x = _mm256_mullo_epi32(v[7], cospim48); + u[7] = _mm256_sub_epi32(u[7], x); + u[7] = _mm256_add_epi32(u[7], rnding); + u[7] = _mm256_srai_epi32(u[7], bit); + + // stage 5 + addsub_avx2(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi); + addsub_avx2(u[1], u[3], &v[1], &v[3], 
&clamp_lo, &clamp_hi); + addsub_avx2(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi); + addsub_avx2(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi); + + // stage 6 + u[0] = v[0]; + u[1] = v[1]; + u[4] = v[4]; + u[5] = v[5]; + + v[0] = _mm256_mullo_epi32(v[2], cospi32); + x = _mm256_mullo_epi32(v[3], cospi32); + u[2] = _mm256_add_epi32(v[0], x); + u[2] = _mm256_add_epi32(u[2], rnding); + u[2] = _mm256_srai_epi32(u[2], bit); + + u[3] = _mm256_sub_epi32(v[0], x); + u[3] = _mm256_add_epi32(u[3], rnding); + u[3] = _mm256_srai_epi32(u[3], bit); + + v[0] = _mm256_mullo_epi32(v[6], cospi32); + x = _mm256_mullo_epi32(v[7], cospi32); + u[6] = _mm256_add_epi32(v[0], x); + u[6] = _mm256_add_epi32(u[6], rnding); + u[6] = _mm256_srai_epi32(u[6], bit); + + u[7] = _mm256_sub_epi32(v[0], x); + u[7] = _mm256_add_epi32(u[7], rnding); + u[7] = _mm256_srai_epi32(u[7], bit); + + // stage 7 + if (do_cols) { + out[0] = u[0]; + out[1] = _mm256_sub_epi32(kZero, u[4]); + out[2] = u[6]; + out[3] = _mm256_sub_epi32(kZero, u[2]); + out[4] = u[3]; + out[5] = _mm256_sub_epi32(kZero, u[7]); + out[6] = u[5]; + out[7] = _mm256_sub_epi32(kZero, u[1]); + } else { + const int log_range_out = AOMMAX(16, bd + 6); + const __m256i clamp_lo_out = _mm256_set1_epi32(-(1 << (log_range_out - 1))); + const __m256i clamp_hi_out = + _mm256_set1_epi32((1 << (log_range_out - 1)) - 1); + + neg_shift_avx2(u[0], u[4], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_avx2(u[6], u[2], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_avx2(u[3], u[7], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_avx2(u[5], u[1], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out, + out_shift); + } +} +static INLINE void idct64_stage8_avx2( + __m256i *u, const __m256i *cospim32, const __m256i *cospi32, + const __m256i *cospim16, const __m256i *cospi48, const __m256i *cospi16, + const __m256i *cospim48, const __m256i *clamp_lo, const __m256i *clamp_hi, + const __m256i 
*rnding, int bit) { + int i; + __m256i temp1, temp2, temp3, temp4; + temp1 = half_btf_avx2(cospim32, &u[10], cospi32, &u[13], rnding, bit); + u[13] = half_btf_avx2(cospi32, &u[10], cospi32, &u[13], rnding, bit); + u[10] = temp1; + temp2 = half_btf_avx2(cospim32, &u[11], cospi32, &u[12], rnding, bit); + u[12] = half_btf_avx2(cospi32, &u[11], cospi32, &u[12], rnding, bit); + u[11] = temp2; + + for (i = 16; i < 20; ++i) { + addsub_avx2(u[i], u[i ^ 7], &u[i], &u[i ^ 7], clamp_lo, clamp_hi); + addsub_avx2(u[i ^ 15], u[i ^ 8], &u[i ^ 15], &u[i ^ 8], clamp_lo, clamp_hi); + } + + temp1 = half_btf_avx2(cospim16, &u[36], cospi48, &u[59], rnding, bit); + temp2 = half_btf_avx2(cospim16, &u[37], cospi48, &u[58], rnding, bit); + temp3 = half_btf_avx2(cospim16, &u[38], cospi48, &u[57], rnding, bit); + temp4 = half_btf_avx2(cospim16, &u[39], cospi48, &u[56], rnding, bit); + u[56] = half_btf_avx2(cospi48, &u[39], cospi16, &u[56], rnding, bit); + u[57] = half_btf_avx2(cospi48, &u[38], cospi16, &u[57], rnding, bit); + u[58] = half_btf_avx2(cospi48, &u[37], cospi16, &u[58], rnding, bit); + u[59] = half_btf_avx2(cospi48, &u[36], cospi16, &u[59], rnding, bit); + u[36] = temp1; + u[37] = temp2; + u[38] = temp3; + u[39] = temp4; + + temp1 = half_btf_avx2(cospim48, &u[40], cospim16, &u[55], rnding, bit); + temp2 = half_btf_avx2(cospim48, &u[41], cospim16, &u[54], rnding, bit); + temp3 = half_btf_avx2(cospim48, &u[42], cospim16, &u[53], rnding, bit); + temp4 = half_btf_avx2(cospim48, &u[43], cospim16, &u[52], rnding, bit); + u[52] = half_btf_avx2(cospim16, &u[43], cospi48, &u[52], rnding, bit); + u[53] = half_btf_avx2(cospim16, &u[42], cospi48, &u[53], rnding, bit); + u[54] = half_btf_avx2(cospim16, &u[41], cospi48, &u[54], rnding, bit); + u[55] = half_btf_avx2(cospim16, &u[40], cospi48, &u[55], rnding, bit); + u[40] = temp1; + u[41] = temp2; + u[42] = temp3; + u[43] = temp4; +} + +static INLINE void idct64_stage9_avx2(__m256i *u, const __m256i *cospim32, + const __m256i *cospi32, + const 
__m256i *clamp_lo, + const __m256i *clamp_hi, + const __m256i *rnding, int bit) { + int i; + __m256i temp1, temp2, temp3, temp4; + for (i = 0; i < 8; ++i) { + addsub_avx2(u[i], u[15 - i], &u[i], &u[15 - i], clamp_lo, clamp_hi); + } + + temp1 = half_btf_avx2(cospim32, &u[20], cospi32, &u[27], rnding, bit); + temp2 = half_btf_avx2(cospim32, &u[21], cospi32, &u[26], rnding, bit); + temp3 = half_btf_avx2(cospim32, &u[22], cospi32, &u[25], rnding, bit); + temp4 = half_btf_avx2(cospim32, &u[23], cospi32, &u[24], rnding, bit); + u[24] = half_btf_avx2(cospi32, &u[23], cospi32, &u[24], rnding, bit); + u[25] = half_btf_avx2(cospi32, &u[22], cospi32, &u[25], rnding, bit); + u[26] = half_btf_avx2(cospi32, &u[21], cospi32, &u[26], rnding, bit); + u[27] = half_btf_avx2(cospi32, &u[20], cospi32, &u[27], rnding, bit); + u[20] = temp1; + u[21] = temp2; + u[22] = temp3; + u[23] = temp4; + for (i = 32; i < 40; i++) { + addsub_avx2(u[i], u[i ^ 15], &u[i], &u[i ^ 15], clamp_lo, clamp_hi); + } + + for (i = 48; i < 56; i++) { + addsub_avx2(u[i ^ 15], u[i], &u[i ^ 15], &u[i], clamp_lo, clamp_hi); + } +} + +static INLINE void idct64_stage10_avx2(__m256i *u, const __m256i *cospim32, + const __m256i *cospi32, + const __m256i *clamp_lo, + const __m256i *clamp_hi, + const __m256i *rnding, int bit) { + __m256i temp1, temp2, temp3, temp4; + for (int i = 0; i < 16; i++) { + addsub_avx2(u[i], u[31 - i], &u[i], &u[31 - i], clamp_lo, clamp_hi); + } + + temp1 = half_btf_avx2(cospim32, &u[40], cospi32, &u[55], rnding, bit); + temp2 = half_btf_avx2(cospim32, &u[41], cospi32, &u[54], rnding, bit); + temp3 = half_btf_avx2(cospim32, &u[42], cospi32, &u[53], rnding, bit); + temp4 = half_btf_avx2(cospim32, &u[43], cospi32, &u[52], rnding, bit); + u[52] = half_btf_avx2(cospi32, &u[43], cospi32, &u[52], rnding, bit); + u[53] = half_btf_avx2(cospi32, &u[42], cospi32, &u[53], rnding, bit); + u[54] = half_btf_avx2(cospi32, &u[41], cospi32, &u[54], rnding, bit); + u[55] = half_btf_avx2(cospi32, &u[40], cospi32, 
&u[55], rnding, bit); + u[40] = temp1; + u[41] = temp2; + u[42] = temp3; + u[43] = temp4; + + temp1 = half_btf_avx2(cospim32, &u[44], cospi32, &u[51], rnding, bit); + temp2 = half_btf_avx2(cospim32, &u[45], cospi32, &u[50], rnding, bit); + temp3 = half_btf_avx2(cospim32, &u[46], cospi32, &u[49], rnding, bit); + temp4 = half_btf_avx2(cospim32, &u[47], cospi32, &u[48], rnding, bit); + u[48] = half_btf_avx2(cospi32, &u[47], cospi32, &u[48], rnding, bit); + u[49] = half_btf_avx2(cospi32, &u[46], cospi32, &u[49], rnding, bit); + u[50] = half_btf_avx2(cospi32, &u[45], cospi32, &u[50], rnding, bit); + u[51] = half_btf_avx2(cospi32, &u[44], cospi32, &u[51], rnding, bit); + u[44] = temp1; + u[45] = temp2; + u[46] = temp3; + u[47] = temp4; +} + +static INLINE void idct64_stage11_avx2(__m256i *u, __m256i *out, int do_cols, + int bd, int out_shift, + const __m256i *clamp_lo, + const __m256i *clamp_hi) { + for (int i = 0; i < 32; i++) { + addsub_avx2(u[i], u[63 - i], &out[(i)], &out[(63 - i)], clamp_lo, clamp_hi); + } + + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + const __m256i clamp_lo_out = _mm256_set1_epi32(-(1 << (log_range_out - 1))); + const __m256i clamp_hi_out = + _mm256_set1_epi32((1 << (log_range_out - 1)) - 1); + + round_shift_8x8_avx2(out, out_shift); + round_shift_8x8_avx2(out + 16, out_shift); + round_shift_8x8_avx2(out + 32, out_shift); + round_shift_8x8_avx2(out + 48, out_shift); + highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 64); + } +} + +static void idct64_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); + __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1))); + __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1); + + const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); + + { + __m256i x; + + // stage 1 + // stage 2 + // stage 3 + // stage 4 + // stage 5 + // stage 6 + x = half_btf_0_avx2(&cospi32, &in[0], &rnding, bit); + + // stage 8 + // stage 9 + // stage 10 + // stage 11 + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + clamp_lo = _mm256_set1_epi32(-(1 << (log_range_out - 1))); + clamp_hi = _mm256_set1_epi32((1 << (log_range_out - 1)) - 1); + if (out_shift != 0) { + __m256i offset = _mm256_set1_epi32((1 << out_shift) >> 1); + x = _mm256_add_epi32(x, offset); + x = _mm256_sra_epi32(x, _mm_cvtsi32_si128(out_shift)); + } } + x = _mm256_max_epi32(x, clamp_lo); + x = _mm256_min_epi32(x, clamp_hi); + out[0] = x; + out[1] = x; + out[2] = x; + out[3] = x; + out[4] = x; + out[5] = x; + out[6] = x; + out[7] = x; + out[8] = x; + out[9] = x; + out[10] = x; + out[11] = x; + out[12] = x; + out[13] = x; + out[14] = x; + out[15] = x; + out[16] = x; + out[17] = x; + out[18] = x; + out[19] = x; + out[20] = x; + out[21] = x; + out[22] = x; + out[23] = x; + out[24] = x; + out[25] = x; + out[26] = x; + out[27] = x; + out[28] = x; + out[29] = x; + out[30] = x; + out[31] = x; + out[32] = x; + out[33] = x; + out[34] = x; + out[35] = x; + out[36] = x; + out[37] = x; + out[38] = x; + out[39] = x; + out[40] = x; + out[41] = x; + out[42] = x; + out[43] = x; + out[44] = x; + out[45] = x; + out[46] = x; + out[47] = x; + out[48] = x; + out[49] = x; + out[50] = x; + out[51] = x; + out[52] = x; + out[53] = x; + out[54] = x; + out[55] = x; + out[56] = x; + out[57] = x; + out[58] = x; + out[59] = x; + out[60] = x; + out[61] = x; + out[62] = x; + out[63] = x; } } +static void idct64_low8_avx2(__m256i *in, __m256i *out, int bit, int do_cols, + int bd, int out_shift) { + int i, j; + const int32_t *cospi = cospi_arr(bit); + const __m256i rnding = 
_mm256_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1))); + const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1); + + const __m256i cospi1 = _mm256_set1_epi32(cospi[1]); + const __m256i cospi2 = _mm256_set1_epi32(cospi[2]); + const __m256i cospi3 = _mm256_set1_epi32(cospi[3]); + const __m256i cospi4 = _mm256_set1_epi32(cospi[4]); + const __m256i cospi6 = _mm256_set1_epi32(cospi[6]); + const __m256i cospi8 = _mm256_set1_epi32(cospi[8]); + const __m256i cospi12 = _mm256_set1_epi32(cospi[12]); + const __m256i cospi16 = _mm256_set1_epi32(cospi[16]); + const __m256i cospi20 = _mm256_set1_epi32(cospi[20]); + const __m256i cospi24 = _mm256_set1_epi32(cospi[24]); + const __m256i cospi28 = _mm256_set1_epi32(cospi[28]); + const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); + const __m256i cospi40 = _mm256_set1_epi32(cospi[40]); + const __m256i cospi44 = _mm256_set1_epi32(cospi[44]); + const __m256i cospi48 = _mm256_set1_epi32(cospi[48]); + const __m256i cospi56 = _mm256_set1_epi32(cospi[56]); + const __m256i cospi60 = _mm256_set1_epi32(cospi[60]); + const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]); + const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]); + const __m256i cospim12 = _mm256_set1_epi32(-cospi[12]); + const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]); + const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]); + const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]); + const __m256i cospim28 = _mm256_set1_epi32(-cospi[28]); + const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]); + const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]); + const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]); + const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]); + const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]); + const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]); + const __m256i cospi63 = _mm256_set1_epi32(cospi[63]); + const 
__m256i cospim57 = _mm256_set1_epi32(-cospi[57]); + const __m256i cospi7 = _mm256_set1_epi32(cospi[7]); + const __m256i cospi5 = _mm256_set1_epi32(cospi[5]); + const __m256i cospi59 = _mm256_set1_epi32(cospi[59]); + const __m256i cospim61 = _mm256_set1_epi32(-cospi[61]); + const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]); + const __m256i cospi62 = _mm256_set1_epi32(cospi[62]); + + { + __m256i u[64]; + + // stage 1 + u[0] = in[0]; + u[8] = in[4]; + u[16] = in[2]; + u[24] = in[6]; + u[32] = in[1]; + u[40] = in[5]; + u[48] = in[3]; + u[56] = in[7]; + + // stage 2 + u[63] = half_btf_0_avx2(&cospi1, &u[32], &rnding, bit); + u[32] = half_btf_0_avx2(&cospi63, &u[32], &rnding, bit); + u[39] = half_btf_0_avx2(&cospim57, &u[56], &rnding, bit); + u[56] = half_btf_0_avx2(&cospi7, &u[56], &rnding, bit); + u[55] = half_btf_0_avx2(&cospi5, &u[40], &rnding, bit); + u[40] = half_btf_0_avx2(&cospi59, &u[40], &rnding, bit); + u[47] = half_btf_0_avx2(&cospim61, &u[48], &rnding, bit); + u[48] = half_btf_0_avx2(&cospi3, &u[48], &rnding, bit); + + // stage 3 + u[31] = half_btf_0_avx2(&cospi2, &u[16], &rnding, bit); + u[16] = half_btf_0_avx2(&cospi62, &u[16], &rnding, bit); + u[23] = half_btf_0_avx2(&cospim58, &u[24], &rnding, bit); + u[24] = half_btf_0_avx2(&cospi6, &u[24], &rnding, bit); + u[33] = u[32]; + u[38] = u[39]; + u[41] = u[40]; + u[46] = u[47]; + u[49] = u[48]; + u[54] = u[55]; + u[57] = u[56]; + u[62] = u[63]; + + // stage 4 + __m256i temp1, temp2; + u[15] = half_btf_0_avx2(&cospi4, &u[8], &rnding, bit); + u[8] = half_btf_0_avx2(&cospi60, &u[8], &rnding, bit); + u[17] = u[16]; + u[22] = u[23]; + u[25] = u[24]; + u[30] = u[31]; + + temp1 = half_btf_avx2(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit); + u[62] = half_btf_avx2(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit); + u[33] = temp1; + + temp2 = half_btf_avx2(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit); + u[38] = half_btf_avx2(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit); + u[57] = temp2; + + temp1 
= half_btf_avx2(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit); + u[54] = half_btf_avx2(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit); + u[41] = temp1; + + temp2 = half_btf_avx2(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit); + u[49] = half_btf_avx2(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit); + u[46] = temp2; + + // stage 5 + u[9] = u[8]; + u[14] = u[15]; + + temp1 = half_btf_avx2(&cospim8, &u[17], &cospi56, &u[30], &rnding, bit); + u[30] = half_btf_avx2(&cospi56, &u[17], &cospi8, &u[30], &rnding, bit); + u[17] = temp1; + + temp2 = half_btf_avx2(&cospim24, &u[22], &cospim40, &u[25], &rnding, bit); + u[25] = half_btf_avx2(&cospim40, &u[22], &cospi24, &u[25], &rnding, bit); + u[22] = temp2; + + u[35] = u[32]; + u[34] = u[33]; + u[36] = u[39]; + u[37] = u[38]; + u[43] = u[40]; + u[42] = u[41]; + u[44] = u[47]; + u[45] = u[46]; + u[51] = u[48]; + u[50] = u[49]; + u[52] = u[55]; + u[53] = u[54]; + u[59] = u[56]; + u[58] = u[57]; + u[60] = u[63]; + u[61] = u[62]; + // stage 6 + temp1 = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit); + u[1] = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit); + u[0] = temp1; + + temp2 = half_btf_avx2(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit); + u[14] = half_btf_avx2(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit); + u[9] = temp2; + u[19] = u[16]; + u[18] = u[17]; + u[20] = u[23]; + u[21] = u[22]; + u[27] = u[24]; + u[26] = u[25]; + u[28] = u[31]; + u[29] = u[30]; + + temp1 = half_btf_avx2(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit); + u[61] = half_btf_avx2(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit); + u[34] = temp1; + temp2 = half_btf_avx2(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit); + u[60] = half_btf_avx2(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit); + u[35] = temp2; + temp1 = half_btf_avx2(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit); + u[59] = half_btf_avx2(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit); + u[36] = temp1; + temp2 = half_btf_avx2(&cospim56, &u[37], &cospim8, 
&u[58], &rnding, bit); + u[58] = half_btf_avx2(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit); + u[37] = temp2; + temp1 = half_btf_avx2(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit); + u[53] = half_btf_avx2(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit); + u[42] = temp1; + temp2 = half_btf_avx2(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit); + u[52] = half_btf_avx2(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit); + u[43] = temp2; + temp1 = half_btf_avx2(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit); + u[51] = half_btf_avx2(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit); + u[44] = temp1; + temp2 = half_btf_avx2(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit); + u[50] = half_btf_avx2(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit); + u[45] = temp2; + + // stage 7 + u[3] = u[0]; + u[2] = u[1]; + u[11] = u[8]; + u[10] = u[9]; + u[12] = u[15]; + u[13] = u[14]; + + temp1 = half_btf_avx2(&cospim16, &u[18], &cospi48, &u[29], &rnding, bit); + u[29] = half_btf_avx2(&cospi48, &u[18], &cospi16, &u[29], &rnding, bit); + u[18] = temp1; + temp2 = half_btf_avx2(&cospim16, &u[19], &cospi48, &u[28], &rnding, bit); + u[28] = half_btf_avx2(&cospi48, &u[19], &cospi16, &u[28], &rnding, bit); + u[19] = temp2; + temp1 = half_btf_avx2(&cospim48, &u[20], &cospim16, &u[27], &rnding, bit); + u[27] = half_btf_avx2(&cospim16, &u[20], &cospi48, &u[27], &rnding, bit); + u[20] = temp1; + temp2 = half_btf_avx2(&cospim48, &u[21], &cospim16, &u[26], &rnding, bit); + u[26] = half_btf_avx2(&cospim16, &u[21], &cospi48, &u[26], &rnding, bit); + u[21] = temp2; + for (i = 32; i < 64; i += 16) { + for (j = i; j < i + 4; j++) { + addsub_avx2(u[j], u[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi); + addsub_avx2(u[j ^ 15], u[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo, + &clamp_hi); + } + } + + // stage 8 + u[7] = u[0]; + u[6] = u[1]; + u[5] = u[2]; + u[4] = u[3]; + + idct64_stage8_avx2(u, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16, + &cospim48, &clamp_lo, &clamp_hi, 
&rnding, bit); + + // stage 9 + idct64_stage9_avx2(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding, + bit); + + // stage 10 + idct64_stage10_avx2(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding, + bit); + + // stage 11 + idct64_stage11_avx2(u, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi); + } +} +static void idct64_low16_avx2(__m256i *in, __m256i *out, int bit, int do_cols, + int bd, int out_shift) { + int i, j; + const int32_t *cospi = cospi_arr(bit); + const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1))); + const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1); + + const __m256i cospi1 = _mm256_set1_epi32(cospi[1]); + const __m256i cospi2 = _mm256_set1_epi32(cospi[2]); + const __m256i cospi3 = _mm256_set1_epi32(cospi[3]); + const __m256i cospi4 = _mm256_set1_epi32(cospi[4]); + const __m256i cospi5 = _mm256_set1_epi32(cospi[5]); + const __m256i cospi6 = _mm256_set1_epi32(cospi[6]); + const __m256i cospi7 = _mm256_set1_epi32(cospi[7]); + const __m256i cospi8 = _mm256_set1_epi32(cospi[8]); + const __m256i cospi9 = _mm256_set1_epi32(cospi[9]); + const __m256i cospi10 = _mm256_set1_epi32(cospi[10]); + const __m256i cospi11 = _mm256_set1_epi32(cospi[11]); + const __m256i cospi12 = _mm256_set1_epi32(cospi[12]); + const __m256i cospi13 = _mm256_set1_epi32(cospi[13]); + const __m256i cospi14 = _mm256_set1_epi32(cospi[14]); + const __m256i cospi15 = _mm256_set1_epi32(cospi[15]); + const __m256i cospi16 = _mm256_set1_epi32(cospi[16]); + const __m256i cospi20 = _mm256_set1_epi32(cospi[20]); + const __m256i cospi24 = _mm256_set1_epi32(cospi[24]); + const __m256i cospi28 = _mm256_set1_epi32(cospi[28]); + const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); + const __m256i cospi36 = _mm256_set1_epi32(cospi[36]); + const __m256i cospi40 = _mm256_set1_epi32(cospi[40]); + const __m256i cospi44 = 
_mm256_set1_epi32(cospi[44]); + const __m256i cospi48 = _mm256_set1_epi32(cospi[48]); + const __m256i cospi51 = _mm256_set1_epi32(cospi[51]); + const __m256i cospi52 = _mm256_set1_epi32(cospi[52]); + const __m256i cospi54 = _mm256_set1_epi32(cospi[54]); + const __m256i cospi55 = _mm256_set1_epi32(cospi[55]); + const __m256i cospi56 = _mm256_set1_epi32(cospi[56]); + const __m256i cospi59 = _mm256_set1_epi32(cospi[59]); + const __m256i cospi60 = _mm256_set1_epi32(cospi[60]); + const __m256i cospi62 = _mm256_set1_epi32(cospi[62]); + const __m256i cospi63 = _mm256_set1_epi32(cospi[63]); + + const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]); + const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]); + const __m256i cospim12 = _mm256_set1_epi32(-cospi[12]); + const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]); + const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]); + const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]); + const __m256i cospim28 = _mm256_set1_epi32(-cospi[28]); + const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]); + const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]); + const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]); + const __m256i cospim44 = _mm256_set1_epi32(-cospi[44]); + const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]); + const __m256i cospim49 = _mm256_set1_epi32(-cospi[49]); + const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]); + const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]); + const __m256i cospim53 = _mm256_set1_epi32(-cospi[53]); + const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]); + const __m256i cospim57 = _mm256_set1_epi32(-cospi[57]); + const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]); + const __m256i cospim60 = _mm256_set1_epi32(-cospi[60]); + const __m256i cospim61 = _mm256_set1_epi32(-cospi[61]); + + { + __m256i u[64]; + __m256i tmp1, tmp2, tmp3, tmp4; + // stage 1 + u[0] = in[0]; + u[32] = in[1]; + u[36] = in[9]; + u[40] = in[5]; + u[44] = in[13]; + u[48] = in[3]; + u[52] = in[11]; + 
u[56] = in[7]; + u[60] = in[15]; + u[16] = in[2]; + u[20] = in[10]; + u[24] = in[6]; + u[28] = in[14]; + u[4] = in[8]; + u[8] = in[4]; + u[12] = in[12]; + + // stage 2 + u[63] = half_btf_0_avx2(&cospi1, &u[32], &rnding, bit); + u[32] = half_btf_0_avx2(&cospi63, &u[32], &rnding, bit); + u[35] = half_btf_0_avx2(&cospim49, &u[60], &rnding, bit); + u[60] = half_btf_0_avx2(&cospi15, &u[60], &rnding, bit); + u[59] = half_btf_0_avx2(&cospi9, &u[36], &rnding, bit); + u[36] = half_btf_0_avx2(&cospi55, &u[36], &rnding, bit); + u[39] = half_btf_0_avx2(&cospim57, &u[56], &rnding, bit); + u[56] = half_btf_0_avx2(&cospi7, &u[56], &rnding, bit); + u[55] = half_btf_0_avx2(&cospi5, &u[40], &rnding, bit); + u[40] = half_btf_0_avx2(&cospi59, &u[40], &rnding, bit); + u[43] = half_btf_0_avx2(&cospim53, &u[52], &rnding, bit); + u[52] = half_btf_0_avx2(&cospi11, &u[52], &rnding, bit); + u[47] = half_btf_0_avx2(&cospim61, &u[48], &rnding, bit); + u[48] = half_btf_0_avx2(&cospi3, &u[48], &rnding, bit); + u[51] = half_btf_0_avx2(&cospi13, &u[44], &rnding, bit); + u[44] = half_btf_0_avx2(&cospi51, &u[44], &rnding, bit); + + // stage 3 + u[31] = half_btf_0_avx2(&cospi2, &u[16], &rnding, bit); + u[16] = half_btf_0_avx2(&cospi62, &u[16], &rnding, bit); + u[19] = half_btf_0_avx2(&cospim50, &u[28], &rnding, bit); + u[28] = half_btf_0_avx2(&cospi14, &u[28], &rnding, bit); + u[27] = half_btf_0_avx2(&cospi10, &u[20], &rnding, bit); + u[20] = half_btf_0_avx2(&cospi54, &u[20], &rnding, bit); + u[23] = half_btf_0_avx2(&cospim58, &u[24], &rnding, bit); + u[24] = half_btf_0_avx2(&cospi6, &u[24], &rnding, bit); + u[33] = u[32]; + u[34] = u[35]; + u[37] = u[36]; + u[38] = u[39]; + u[41] = u[40]; + u[42] = u[43]; + u[45] = u[44]; + u[46] = u[47]; + u[49] = u[48]; + u[50] = u[51]; + u[53] = u[52]; + u[54] = u[55]; + u[57] = u[56]; + u[58] = u[59]; + u[61] = u[60]; + u[62] = u[63]; + + // stage 4 + u[15] = half_btf_0_avx2(&cospi4, &u[8], &rnding, bit); + u[8] = half_btf_0_avx2(&cospi60, &u[8], &rnding, bit); 
+ u[11] = half_btf_0_avx2(&cospim52, &u[12], &rnding, bit); + u[12] = half_btf_0_avx2(&cospi12, &u[12], &rnding, bit); + + u[17] = u[16]; + u[18] = u[19]; + u[21] = u[20]; + u[22] = u[23]; + u[25] = u[24]; + u[26] = u[27]; + u[29] = u[28]; + u[30] = u[31]; + + tmp1 = half_btf_avx2(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit); + tmp2 = half_btf_avx2(&cospim60, &u[34], &cospim4, &u[61], &rnding, bit); + tmp3 = half_btf_avx2(&cospim36, &u[37], &cospi28, &u[58], &rnding, bit); + tmp4 = half_btf_avx2(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit); + u[57] = half_btf_avx2(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit); + u[58] = half_btf_avx2(&cospi28, &u[37], &cospi36, &u[58], &rnding, bit); + u[61] = half_btf_avx2(&cospim4, &u[34], &cospi60, &u[61], &rnding, bit); + u[62] = half_btf_avx2(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit); + u[33] = tmp1; + u[34] = tmp2; + u[37] = tmp3; + u[38] = tmp4; + + tmp1 = half_btf_avx2(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit); + tmp2 = half_btf_avx2(&cospim44, &u[42], &cospim20, &u[53], &rnding, bit); + tmp3 = half_btf_avx2(&cospim52, &u[45], &cospi12, &u[50], &rnding, bit); + tmp4 = half_btf_avx2(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit); + u[49] = half_btf_avx2(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit); + u[50] = half_btf_avx2(&cospi12, &u[45], &cospi52, &u[50], &rnding, bit); + u[53] = half_btf_avx2(&cospim20, &u[42], &cospi44, &u[53], &rnding, bit); + u[54] = half_btf_avx2(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit); + u[41] = tmp1; + u[42] = tmp2; + u[45] = tmp3; + u[46] = tmp4; + + // stage 5 + u[7] = half_btf_0_avx2(&cospi8, &u[4], &rnding, bit); + u[4] = half_btf_0_avx2(&cospi56, &u[4], &rnding, bit); + + u[9] = u[8]; + u[10] = u[11]; + u[13] = u[12]; + u[14] = u[15]; + + tmp1 = half_btf_avx2(&cospim8, &u[17], &cospi56, &u[30], &rnding, bit); + tmp2 = half_btf_avx2(&cospim56, &u[18], &cospim8, &u[29], &rnding, bit); + tmp3 = half_btf_avx2(&cospim40, &u[21], &cospi24, &u[26], 
&rnding, bit); + tmp4 = half_btf_avx2(&cospim24, &u[22], &cospim40, &u[25], &rnding, bit); + u[25] = half_btf_avx2(&cospim40, &u[22], &cospi24, &u[25], &rnding, bit); + u[26] = half_btf_avx2(&cospi24, &u[21], &cospi40, &u[26], &rnding, bit); + u[29] = half_btf_avx2(&cospim8, &u[18], &cospi56, &u[29], &rnding, bit); + u[30] = half_btf_avx2(&cospi56, &u[17], &cospi8, &u[30], &rnding, bit); + u[17] = tmp1; + u[18] = tmp2; + u[21] = tmp3; + u[22] = tmp4; + + for (i = 32; i < 64; i += 8) { + addsub_avx2(u[i + 0], u[i + 3], &u[i + 0], &u[i + 3], &clamp_lo, + &clamp_hi); + addsub_avx2(u[i + 1], u[i + 2], &u[i + 1], &u[i + 2], &clamp_lo, + &clamp_hi); + + addsub_avx2(u[i + 7], u[i + 4], &u[i + 7], &u[i + 4], &clamp_lo, + &clamp_hi); + addsub_avx2(u[i + 6], u[i + 5], &u[i + 6], &u[i + 5], &clamp_lo, + &clamp_hi); + } + + // stage 6 + tmp1 = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit); + u[1] = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit); + u[0] = tmp1; + u[5] = u[4]; + u[6] = u[7]; + + tmp1 = half_btf_avx2(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit); + u[14] = half_btf_avx2(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit); + u[9] = tmp1; + tmp2 = half_btf_avx2(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit); + u[13] = half_btf_avx2(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit); + u[10] = tmp2; + + for (i = 16; i < 32; i += 8) { + addsub_avx2(u[i + 0], u[i + 3], &u[i + 0], &u[i + 3], &clamp_lo, + &clamp_hi); + addsub_avx2(u[i + 1], u[i + 2], &u[i + 1], &u[i + 2], &clamp_lo, + &clamp_hi); + + addsub_avx2(u[i + 7], u[i + 4], &u[i + 7], &u[i + 4], &clamp_lo, + &clamp_hi); + addsub_avx2(u[i + 6], u[i + 5], &u[i + 6], &u[i + 5], &clamp_lo, + &clamp_hi); + } + + tmp1 = half_btf_avx2(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit); + tmp2 = half_btf_avx2(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit); + tmp3 = half_btf_avx2(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit); + tmp4 = half_btf_avx2(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit); + u[58] 
= half_btf_avx2(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit); + u[59] = half_btf_avx2(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit); + u[60] = half_btf_avx2(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit); + u[61] = half_btf_avx2(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit); + u[34] = tmp1; + u[35] = tmp2; + u[36] = tmp3; + u[37] = tmp4; + + tmp1 = half_btf_avx2(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit); + tmp2 = half_btf_avx2(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit); + tmp3 = half_btf_avx2(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit); + tmp4 = half_btf_avx2(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit); + u[50] = half_btf_avx2(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit); + u[51] = half_btf_avx2(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit); + u[52] = half_btf_avx2(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit); + u[53] = half_btf_avx2(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit); + u[42] = tmp1; + u[43] = tmp2; + u[44] = tmp3; + u[45] = tmp4; + + // stage 7 + u[3] = u[0]; + u[2] = u[1]; + tmp1 = half_btf_avx2(&cospim32, &u[5], &cospi32, &u[6], &rnding, bit); + u[6] = half_btf_avx2(&cospi32, &u[5], &cospi32, &u[6], &rnding, bit); + u[5] = tmp1; + addsub_avx2(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi); + addsub_avx2(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi); + addsub_avx2(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi); + addsub_avx2(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi); + + tmp1 = half_btf_avx2(&cospim16, &u[18], &cospi48, &u[29], &rnding, bit); + tmp2 = half_btf_avx2(&cospim16, &u[19], &cospi48, &u[28], &rnding, bit); + tmp3 = half_btf_avx2(&cospim48, &u[20], &cospim16, &u[27], &rnding, bit); + tmp4 = half_btf_avx2(&cospim48, &u[21], &cospim16, &u[26], &rnding, bit); + u[26] = half_btf_avx2(&cospim16, &u[21], &cospi48, &u[26], &rnding, bit); + u[27] = half_btf_avx2(&cospim16, &u[20], &cospi48, &u[27], &rnding, bit); + u[28] = half_btf_avx2(&cospi48, &u[19], 
&cospi16, &u[28], &rnding, bit); + u[29] = half_btf_avx2(&cospi48, &u[18], &cospi16, &u[29], &rnding, bit); + u[18] = tmp1; + u[19] = tmp2; + u[20] = tmp3; + u[21] = tmp4; + + for (i = 32; i < 64; i += 16) { + for (j = i; j < i + 4; j++) { + addsub_avx2(u[j], u[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi); + addsub_avx2(u[j ^ 15], u[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo, + &clamp_hi); + } + } + + // stage 8 + for (i = 0; i < 4; ++i) { + addsub_avx2(u[i], u[7 - i], &u[i], &u[7 - i], &clamp_lo, &clamp_hi); + } + + idct64_stage8_avx2(u, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16, + &cospim48, &clamp_lo, &clamp_hi, &rnding, bit); + + // stage 9 + idct64_stage9_avx2(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding, + bit); + + // stage 10 + idct64_stage10_avx2(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding, + bit); + + // stage 11 + idct64_stage11_avx2(u, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi); + } +} +static void idct64_avx2(__m256i *in, __m256i *out, int bit, int do_cols, int bd, + int out_shift) { + int i, j; + const int32_t *cospi = cospi_arr(bit); + const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); + const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1))); + const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1); + + const __m256i cospi1 = _mm256_set1_epi32(cospi[1]); + const __m256i cospi2 = _mm256_set1_epi32(cospi[2]); + const __m256i cospi3 = _mm256_set1_epi32(cospi[3]); + const __m256i cospi4 = _mm256_set1_epi32(cospi[4]); + const __m256i cospi5 = _mm256_set1_epi32(cospi[5]); + const __m256i cospi6 = _mm256_set1_epi32(cospi[6]); + const __m256i cospi7 = _mm256_set1_epi32(cospi[7]); + const __m256i cospi8 = _mm256_set1_epi32(cospi[8]); + const __m256i cospi9 = _mm256_set1_epi32(cospi[9]); + const __m256i cospi10 = _mm256_set1_epi32(cospi[10]); + const __m256i cospi11 = _mm256_set1_epi32(cospi[11]); + const __m256i cospi12 = _mm256_set1_epi32(cospi[12]); + const __m256i cospi13 = _mm256_set1_epi32(cospi[13]); + const __m256i cospi14 = _mm256_set1_epi32(cospi[14]); + const __m256i cospi15 = _mm256_set1_epi32(cospi[15]); + const __m256i cospi16 = _mm256_set1_epi32(cospi[16]); + const __m256i cospi17 = _mm256_set1_epi32(cospi[17]); + const __m256i cospi18 = _mm256_set1_epi32(cospi[18]); + const __m256i cospi19 = _mm256_set1_epi32(cospi[19]); + const __m256i cospi20 = _mm256_set1_epi32(cospi[20]); + const __m256i cospi21 = _mm256_set1_epi32(cospi[21]); + const __m256i cospi22 = _mm256_set1_epi32(cospi[22]); + const __m256i cospi23 = _mm256_set1_epi32(cospi[23]); + const __m256i cospi24 = _mm256_set1_epi32(cospi[24]); + const __m256i cospi25 = _mm256_set1_epi32(cospi[25]); + const __m256i cospi26 = _mm256_set1_epi32(cospi[26]); + const __m256i cospi27 = _mm256_set1_epi32(cospi[27]); + const __m256i cospi28 = _mm256_set1_epi32(cospi[28]); + const __m256i cospi29 = _mm256_set1_epi32(cospi[29]); + const __m256i cospi30 = _mm256_set1_epi32(cospi[30]); + const __m256i cospi31 = _mm256_set1_epi32(cospi[31]); + const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); + const __m256i cospi35 = _mm256_set1_epi32(cospi[35]); + const 
__m256i cospi36 = _mm256_set1_epi32(cospi[36]); + const __m256i cospi38 = _mm256_set1_epi32(cospi[38]); + const __m256i cospi39 = _mm256_set1_epi32(cospi[39]); + const __m256i cospi40 = _mm256_set1_epi32(cospi[40]); + const __m256i cospi43 = _mm256_set1_epi32(cospi[43]); + const __m256i cospi44 = _mm256_set1_epi32(cospi[44]); + const __m256i cospi46 = _mm256_set1_epi32(cospi[46]); + const __m256i cospi47 = _mm256_set1_epi32(cospi[47]); + const __m256i cospi48 = _mm256_set1_epi32(cospi[48]); + const __m256i cospi51 = _mm256_set1_epi32(cospi[51]); + const __m256i cospi52 = _mm256_set1_epi32(cospi[52]); + const __m256i cospi54 = _mm256_set1_epi32(cospi[54]); + const __m256i cospi55 = _mm256_set1_epi32(cospi[55]); + const __m256i cospi56 = _mm256_set1_epi32(cospi[56]); + const __m256i cospi59 = _mm256_set1_epi32(cospi[59]); + const __m256i cospi60 = _mm256_set1_epi32(cospi[60]); + const __m256i cospi62 = _mm256_set1_epi32(cospi[62]); + const __m256i cospi63 = _mm256_set1_epi32(cospi[63]); + + const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]); + const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]); + const __m256i cospim12 = _mm256_set1_epi32(-cospi[12]); + const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]); + const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]); + const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]); + const __m256i cospim28 = _mm256_set1_epi32(-cospi[28]); + const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]); + const __m256i cospim33 = _mm256_set1_epi32(-cospi[33]); + const __m256i cospim34 = _mm256_set1_epi32(-cospi[34]); + const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]); + const __m256i cospim37 = _mm256_set1_epi32(-cospi[37]); + const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]); + const __m256i cospim41 = _mm256_set1_epi32(-cospi[41]); + const __m256i cospim42 = _mm256_set1_epi32(-cospi[42]); + const __m256i cospim44 = _mm256_set1_epi32(-cospi[44]); + const __m256i cospim45 = _mm256_set1_epi32(-cospi[45]); + const __m256i 
cospim48 = _mm256_set1_epi32(-cospi[48]); + const __m256i cospim49 = _mm256_set1_epi32(-cospi[49]); + const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]); + const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]); + const __m256i cospim53 = _mm256_set1_epi32(-cospi[53]); + const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]); + const __m256i cospim57 = _mm256_set1_epi32(-cospi[57]); + const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]); + const __m256i cospim60 = _mm256_set1_epi32(-cospi[60]); + const __m256i cospim61 = _mm256_set1_epi32(-cospi[61]); + + { + __m256i u[64], v[64]; + + // stage 1 + u[32] = in[1]; + u[34] = in[17]; + u[36] = in[9]; + u[38] = in[25]; + u[40] = in[5]; + u[42] = in[21]; + u[44] = in[13]; + u[46] = in[29]; + u[48] = in[3]; + u[50] = in[19]; + u[52] = in[11]; + u[54] = in[27]; + u[56] = in[7]; + u[58] = in[23]; + u[60] = in[15]; + u[62] = in[31]; + + v[16] = in[2]; + v[18] = in[18]; + v[20] = in[10]; + v[22] = in[26]; + v[24] = in[6]; + v[26] = in[22]; + v[28] = in[14]; + v[30] = in[30]; + + u[8] = in[4]; + u[10] = in[20]; + u[12] = in[12]; + u[14] = in[28]; + + v[4] = in[8]; + v[6] = in[24]; + + u[0] = in[0]; + u[2] = in[16]; + + // stage 2 + v[32] = half_btf_0_avx2(&cospi63, &u[32], &rnding, bit); + v[33] = half_btf_0_avx2(&cospim33, &u[62], &rnding, bit); + v[34] = half_btf_0_avx2(&cospi47, &u[34], &rnding, bit); + v[35] = half_btf_0_avx2(&cospim49, &u[60], &rnding, bit); + v[36] = half_btf_0_avx2(&cospi55, &u[36], &rnding, bit); + v[37] = half_btf_0_avx2(&cospim41, &u[58], &rnding, bit); + v[38] = half_btf_0_avx2(&cospi39, &u[38], &rnding, bit); + v[39] = half_btf_0_avx2(&cospim57, &u[56], &rnding, bit); + v[40] = half_btf_0_avx2(&cospi59, &u[40], &rnding, bit); + v[41] = half_btf_0_avx2(&cospim37, &u[54], &rnding, bit); + v[42] = half_btf_0_avx2(&cospi43, &u[42], &rnding, bit); + v[43] = half_btf_0_avx2(&cospim53, &u[52], &rnding, bit); + v[44] = half_btf_0_avx2(&cospi51, &u[44], &rnding, bit); + v[45] = 
half_btf_0_avx2(&cospim45, &u[50], &rnding, bit); + v[46] = half_btf_0_avx2(&cospi35, &u[46], &rnding, bit); + v[47] = half_btf_0_avx2(&cospim61, &u[48], &rnding, bit); + v[48] = half_btf_0_avx2(&cospi3, &u[48], &rnding, bit); + v[49] = half_btf_0_avx2(&cospi29, &u[46], &rnding, bit); + v[50] = half_btf_0_avx2(&cospi19, &u[50], &rnding, bit); + v[51] = half_btf_0_avx2(&cospi13, &u[44], &rnding, bit); + v[52] = half_btf_0_avx2(&cospi11, &u[52], &rnding, bit); + v[53] = half_btf_0_avx2(&cospi21, &u[42], &rnding, bit); + v[54] = half_btf_0_avx2(&cospi27, &u[54], &rnding, bit); + v[55] = half_btf_0_avx2(&cospi5, &u[40], &rnding, bit); + v[56] = half_btf_0_avx2(&cospi7, &u[56], &rnding, bit); + v[57] = half_btf_0_avx2(&cospi25, &u[38], &rnding, bit); + v[58] = half_btf_0_avx2(&cospi23, &u[58], &rnding, bit); + v[59] = half_btf_0_avx2(&cospi9, &u[36], &rnding, bit); + v[60] = half_btf_0_avx2(&cospi15, &u[60], &rnding, bit); + v[61] = half_btf_0_avx2(&cospi17, &u[34], &rnding, bit); + v[62] = half_btf_0_avx2(&cospi31, &u[62], &rnding, bit); + v[63] = half_btf_0_avx2(&cospi1, &u[32], &rnding, bit); + + // stage 3 + u[16] = half_btf_0_avx2(&cospi62, &v[16], &rnding, bit); + u[17] = half_btf_0_avx2(&cospim34, &v[30], &rnding, bit); + u[18] = half_btf_0_avx2(&cospi46, &v[18], &rnding, bit); + u[19] = half_btf_0_avx2(&cospim50, &v[28], &rnding, bit); + u[20] = half_btf_0_avx2(&cospi54, &v[20], &rnding, bit); + u[21] = half_btf_0_avx2(&cospim42, &v[26], &rnding, bit); + u[22] = half_btf_0_avx2(&cospi38, &v[22], &rnding, bit); + u[23] = half_btf_0_avx2(&cospim58, &v[24], &rnding, bit); + u[24] = half_btf_0_avx2(&cospi6, &v[24], &rnding, bit); + u[25] = half_btf_0_avx2(&cospi26, &v[22], &rnding, bit); + u[26] = half_btf_0_avx2(&cospi22, &v[26], &rnding, bit); + u[27] = half_btf_0_avx2(&cospi10, &v[20], &rnding, bit); + u[28] = half_btf_0_avx2(&cospi14, &v[28], &rnding, bit); + u[29] = half_btf_0_avx2(&cospi18, &v[18], &rnding, bit); + u[30] = half_btf_0_avx2(&cospi30, &v[30], 
&rnding, bit); + u[31] = half_btf_0_avx2(&cospi2, &v[16], &rnding, bit); + + for (i = 32; i < 64; i += 4) { + addsub_avx2(v[i + 0], v[i + 1], &u[i + 0], &u[i + 1], &clamp_lo, + &clamp_hi); + addsub_avx2(v[i + 3], v[i + 2], &u[i + 3], &u[i + 2], &clamp_lo, + &clamp_hi); + } + + // stage 4 + v[8] = half_btf_0_avx2(&cospi60, &u[8], &rnding, bit); + v[9] = half_btf_0_avx2(&cospim36, &u[14], &rnding, bit); + v[10] = half_btf_0_avx2(&cospi44, &u[10], &rnding, bit); + v[11] = half_btf_0_avx2(&cospim52, &u[12], &rnding, bit); + v[12] = half_btf_0_avx2(&cospi12, &u[12], &rnding, bit); + v[13] = half_btf_0_avx2(&cospi20, &u[10], &rnding, bit); + v[14] = half_btf_0_avx2(&cospi28, &u[14], &rnding, bit); + v[15] = half_btf_0_avx2(&cospi4, &u[8], &rnding, bit); + + for (i = 16; i < 32; i += 4) { + addsub_avx2(u[i + 0], u[i + 1], &v[i + 0], &v[i + 1], &clamp_lo, + &clamp_hi); + addsub_avx2(u[i + 3], u[i + 2], &v[i + 3], &v[i + 2], &clamp_lo, + &clamp_hi); + } + + for (i = 32; i < 64; i += 4) { + v[i + 0] = u[i + 0]; + v[i + 3] = u[i + 3]; + } + + v[33] = half_btf_avx2(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit); + v[34] = half_btf_avx2(&cospim60, &u[34], &cospim4, &u[61], &rnding, bit); + v[37] = half_btf_avx2(&cospim36, &u[37], &cospi28, &u[58], &rnding, bit); + v[38] = half_btf_avx2(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit); + v[41] = half_btf_avx2(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit); + v[42] = half_btf_avx2(&cospim44, &u[42], &cospim20, &u[53], &rnding, bit); + v[45] = half_btf_avx2(&cospim52, &u[45], &cospi12, &u[50], &rnding, bit); + v[46] = half_btf_avx2(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit); + v[49] = half_btf_avx2(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit); + v[50] = half_btf_avx2(&cospi12, &u[45], &cospi52, &u[50], &rnding, bit); + v[53] = half_btf_avx2(&cospim20, &u[42], &cospi44, &u[53], &rnding, bit); + v[54] = half_btf_avx2(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit); + v[57] = half_btf_avx2(&cospim36, 
&u[38], &cospi28, &u[57], &rnding, bit); + v[58] = half_btf_avx2(&cospi28, &u[37], &cospi36, &u[58], &rnding, bit); + v[61] = half_btf_avx2(&cospim4, &u[34], &cospi60, &u[61], &rnding, bit); + v[62] = half_btf_avx2(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit); + + // stage 5 + u[4] = half_btf_0_avx2(&cospi56, &v[4], &rnding, bit); + u[5] = half_btf_0_avx2(&cospim40, &v[6], &rnding, bit); + u[6] = half_btf_0_avx2(&cospi24, &v[6], &rnding, bit); + u[7] = half_btf_0_avx2(&cospi8, &v[4], &rnding, bit); + + for (i = 8; i < 16; i += 4) { + addsub_avx2(v[i + 0], v[i + 1], &u[i + 0], &u[i + 1], &clamp_lo, + &clamp_hi); + addsub_avx2(v[i + 3], v[i + 2], &u[i + 3], &u[i + 2], &clamp_lo, + &clamp_hi); + } + + for (i = 16; i < 32; i += 4) { + u[i + 0] = v[i + 0]; + u[i + 3] = v[i + 3]; + } + + u[17] = half_btf_avx2(&cospim8, &v[17], &cospi56, &v[30], &rnding, bit); + u[18] = half_btf_avx2(&cospim56, &v[18], &cospim8, &v[29], &rnding, bit); + u[21] = half_btf_avx2(&cospim40, &v[21], &cospi24, &v[26], &rnding, bit); + u[22] = half_btf_avx2(&cospim24, &v[22], &cospim40, &v[25], &rnding, bit); + u[25] = half_btf_avx2(&cospim40, &v[22], &cospi24, &v[25], &rnding, bit); + u[26] = half_btf_avx2(&cospi24, &v[21], &cospi40, &v[26], &rnding, bit); + u[29] = half_btf_avx2(&cospim8, &v[18], &cospi56, &v[29], &rnding, bit); + u[30] = half_btf_avx2(&cospi56, &v[17], &cospi8, &v[30], &rnding, bit); + + for (i = 32; i < 64; i += 8) { + addsub_avx2(v[i + 0], v[i + 3], &u[i + 0], &u[i + 3], &clamp_lo, + &clamp_hi); + addsub_avx2(v[i + 1], v[i + 2], &u[i + 1], &u[i + 2], &clamp_lo, + &clamp_hi); + + addsub_avx2(v[i + 7], v[i + 4], &u[i + 7], &u[i + 4], &clamp_lo, + &clamp_hi); + addsub_avx2(v[i + 6], v[i + 5], &u[i + 6], &u[i + 5], &clamp_lo, + &clamp_hi); + } + + // stage 6 + v[0] = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit); + v[1] = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit); + v[2] = half_btf_0_avx2(&cospi48, &u[2], &rnding, bit); + v[3] = half_btf_0_avx2(&cospi16, &u[2], 
&rnding, bit); + + addsub_avx2(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi); + addsub_avx2(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi); + + for (i = 8; i < 16; i += 4) { + v[i + 0] = u[i + 0]; + v[i + 3] = u[i + 3]; + } + + v[9] = half_btf_avx2(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit); + v[10] = half_btf_avx2(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit); + v[13] = half_btf_avx2(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit); + v[14] = half_btf_avx2(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit); + + for (i = 16; i < 32; i += 8) { + addsub_avx2(u[i + 0], u[i + 3], &v[i + 0], &v[i + 3], &clamp_lo, + &clamp_hi); + addsub_avx2(u[i + 1], u[i + 2], &v[i + 1], &v[i + 2], &clamp_lo, + &clamp_hi); + + addsub_avx2(u[i + 7], u[i + 4], &v[i + 7], &v[i + 4], &clamp_lo, + &clamp_hi); + addsub_avx2(u[i + 6], u[i + 5], &v[i + 6], &v[i + 5], &clamp_lo, + &clamp_hi); + } + + for (i = 32; i < 64; i += 8) { + v[i + 0] = u[i + 0]; + v[i + 1] = u[i + 1]; + v[i + 6] = u[i + 6]; + v[i + 7] = u[i + 7]; + } + + v[34] = half_btf_avx2(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit); + v[35] = half_btf_avx2(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit); + v[36] = half_btf_avx2(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit); + v[37] = half_btf_avx2(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit); + v[42] = half_btf_avx2(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit); + v[43] = half_btf_avx2(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit); + v[44] = half_btf_avx2(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit); + v[45] = half_btf_avx2(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit); + v[50] = half_btf_avx2(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit); + v[51] = half_btf_avx2(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit); + v[52] = half_btf_avx2(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit); + v[53] = half_btf_avx2(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit); + v[58] = half_btf_avx2(&cospim8, &u[37], &cospi56, &u[58], 
&rnding, bit); + v[59] = half_btf_avx2(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit); + v[60] = half_btf_avx2(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit); + v[61] = half_btf_avx2(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit); + + // stage 7 + addsub_avx2(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi); + addsub_avx2(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi); + + u[4] = v[4]; + u[7] = v[7]; + u[5] = half_btf_avx2(&cospim32, &v[5], &cospi32, &v[6], &rnding, bit); + u[6] = half_btf_avx2(&cospi32, &v[5], &cospi32, &v[6], &rnding, bit); + + addsub_avx2(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi); + addsub_avx2(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi); + addsub_avx2(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi); + addsub_avx2(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi); + + for (i = 16; i < 32; i += 8) { + u[i + 0] = v[i + 0]; + u[i + 1] = v[i + 1]; + u[i + 6] = v[i + 6]; + u[i + 7] = v[i + 7]; + } + + u[18] = half_btf_avx2(&cospim16, &v[18], &cospi48, &v[29], &rnding, bit); + u[19] = half_btf_avx2(&cospim16, &v[19], &cospi48, &v[28], &rnding, bit); + u[20] = half_btf_avx2(&cospim48, &v[20], &cospim16, &v[27], &rnding, bit); + u[21] = half_btf_avx2(&cospim48, &v[21], &cospim16, &v[26], &rnding, bit); + u[26] = half_btf_avx2(&cospim16, &v[21], &cospi48, &v[26], &rnding, bit); + u[27] = half_btf_avx2(&cospim16, &v[20], &cospi48, &v[27], &rnding, bit); + u[28] = half_btf_avx2(&cospi48, &v[19], &cospi16, &v[28], &rnding, bit); + u[29] = half_btf_avx2(&cospi48, &v[18], &cospi16, &v[29], &rnding, bit); + + for (i = 32; i < 64; i += 16) { + for (j = i; j < i + 4; j++) { + addsub_avx2(v[j], v[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi); + addsub_avx2(v[j ^ 15], v[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo, + &clamp_hi); + } + } + + // stage 8 + for (i = 0; i < 4; ++i) { + addsub_avx2(u[i], u[7 - i], &v[i], &v[7 - i], &clamp_lo, &clamp_hi); + } + + v[8] = u[8]; + v[9] = u[9]; + v[14] = u[14]; + v[15] = u[15]; + + v[10] = 
half_btf_avx2(&cospim32, &u[10], &cospi32, &u[13], &rnding, bit); + v[11] = half_btf_avx2(&cospim32, &u[11], &cospi32, &u[12], &rnding, bit); + v[12] = half_btf_avx2(&cospi32, &u[11], &cospi32, &u[12], &rnding, bit); + v[13] = half_btf_avx2(&cospi32, &u[10], &cospi32, &u[13], &rnding, bit); + + for (i = 16; i < 20; ++i) { + addsub_avx2(u[i], u[i ^ 7], &v[i], &v[i ^ 7], &clamp_lo, &clamp_hi); + addsub_avx2(u[i ^ 15], u[i ^ 8], &v[i ^ 15], &v[i ^ 8], &clamp_lo, + &clamp_hi); + } + + for (i = 32; i < 36; ++i) { + v[i] = u[i]; + v[i + 12] = u[i + 12]; + v[i + 16] = u[i + 16]; + v[i + 28] = u[i + 28]; + } + + v[36] = half_btf_avx2(&cospim16, &u[36], &cospi48, &u[59], &rnding, bit); + v[37] = half_btf_avx2(&cospim16, &u[37], &cospi48, &u[58], &rnding, bit); + v[38] = half_btf_avx2(&cospim16, &u[38], &cospi48, &u[57], &rnding, bit); + v[39] = half_btf_avx2(&cospim16, &u[39], &cospi48, &u[56], &rnding, bit); + v[40] = half_btf_avx2(&cospim48, &u[40], &cospim16, &u[55], &rnding, bit); + v[41] = half_btf_avx2(&cospim48, &u[41], &cospim16, &u[54], &rnding, bit); + v[42] = half_btf_avx2(&cospim48, &u[42], &cospim16, &u[53], &rnding, bit); + v[43] = half_btf_avx2(&cospim48, &u[43], &cospim16, &u[52], &rnding, bit); + v[52] = half_btf_avx2(&cospim16, &u[43], &cospi48, &u[52], &rnding, bit); + v[53] = half_btf_avx2(&cospim16, &u[42], &cospi48, &u[53], &rnding, bit); + v[54] = half_btf_avx2(&cospim16, &u[41], &cospi48, &u[54], &rnding, bit); + v[55] = half_btf_avx2(&cospim16, &u[40], &cospi48, &u[55], &rnding, bit); + v[56] = half_btf_avx2(&cospi48, &u[39], &cospi16, &u[56], &rnding, bit); + v[57] = half_btf_avx2(&cospi48, &u[38], &cospi16, &u[57], &rnding, bit); + v[58] = half_btf_avx2(&cospi48, &u[37], &cospi16, &u[58], &rnding, bit); + v[59] = half_btf_avx2(&cospi48, &u[36], &cospi16, &u[59], &rnding, bit); + + // stage 9 + for (i = 0; i < 8; ++i) { + addsub_avx2(v[i], v[15 - i], &u[i], &u[15 - i], &clamp_lo, &clamp_hi); + } + + for (i = 16; i < 20; ++i) { + u[i] = v[i]; + u[i 
+ 12] = v[i + 12]; + } + + u[20] = half_btf_avx2(&cospim32, &v[20], &cospi32, &v[27], &rnding, bit); + u[21] = half_btf_avx2(&cospim32, &v[21], &cospi32, &v[26], &rnding, bit); + u[22] = half_btf_avx2(&cospim32, &v[22], &cospi32, &v[25], &rnding, bit); + u[23] = half_btf_avx2(&cospim32, &v[23], &cospi32, &v[24], &rnding, bit); + u[24] = half_btf_avx2(&cospi32, &v[23], &cospi32, &v[24], &rnding, bit); + u[25] = half_btf_avx2(&cospi32, &v[22], &cospi32, &v[25], &rnding, bit); + u[26] = half_btf_avx2(&cospi32, &v[21], &cospi32, &v[26], &rnding, bit); + u[27] = half_btf_avx2(&cospi32, &v[20], &cospi32, &v[27], &rnding, bit); + + for (i = 32; i < 40; i++) { + addsub_avx2(v[i], v[i ^ 15], &u[i], &u[i ^ 15], &clamp_lo, &clamp_hi); + } + + for (i = 48; i < 56; i++) { + addsub_avx2(v[i ^ 15], v[i], &u[i ^ 15], &u[i], &clamp_lo, &clamp_hi); + } + + // stage 10 + for (i = 0; i < 16; i++) { + addsub_avx2(u[i], u[31 - i], &v[i], &v[31 - i], &clamp_lo, &clamp_hi); + } + + for (i = 32; i < 40; i++) v[i] = u[i]; + + v[40] = half_btf_avx2(&cospim32, &u[40], &cospi32, &u[55], &rnding, bit); + v[41] = half_btf_avx2(&cospim32, &u[41], &cospi32, &u[54], &rnding, bit); + v[42] = half_btf_avx2(&cospim32, &u[42], &cospi32, &u[53], &rnding, bit); + v[43] = half_btf_avx2(&cospim32, &u[43], &cospi32, &u[52], &rnding, bit); + v[44] = half_btf_avx2(&cospim32, &u[44], &cospi32, &u[51], &rnding, bit); + v[45] = half_btf_avx2(&cospim32, &u[45], &cospi32, &u[50], &rnding, bit); + v[46] = half_btf_avx2(&cospim32, &u[46], &cospi32, &u[49], &rnding, bit); + v[47] = half_btf_avx2(&cospim32, &u[47], &cospi32, &u[48], &rnding, bit); + v[48] = half_btf_avx2(&cospi32, &u[47], &cospi32, &u[48], &rnding, bit); + v[49] = half_btf_avx2(&cospi32, &u[46], &cospi32, &u[49], &rnding, bit); + v[50] = half_btf_avx2(&cospi32, &u[45], &cospi32, &u[50], &rnding, bit); + v[51] = half_btf_avx2(&cospi32, &u[44], &cospi32, &u[51], &rnding, bit); + v[52] = half_btf_avx2(&cospi32, &u[43], &cospi32, &u[52], &rnding, bit); + 
v[53] = half_btf_avx2(&cospi32, &u[42], &cospi32, &u[53], &rnding, bit); + v[54] = half_btf_avx2(&cospi32, &u[41], &cospi32, &u[54], &rnding, bit); + v[55] = half_btf_avx2(&cospi32, &u[40], &cospi32, &u[55], &rnding, bit); + + for (i = 56; i < 64; i++) v[i] = u[i]; + + // stage 11 + for (i = 0; i < 32; i++) { + addsub_avx2(v[i], v[63 - i], &out[(i)], &out[(63 - i)], &clamp_lo, + &clamp_hi); + } + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + const __m256i clamp_lo_out = + _mm256_set1_epi32(-(1 << (log_range_out - 1))); + const __m256i clamp_hi_out = + _mm256_set1_epi32((1 << (log_range_out - 1)) - 1); + + round_shift_8x8_avx2(out, out_shift); + round_shift_8x8_avx2(out + 16, out_shift); + round_shift_8x8_avx2(out + 32, out_shift); + round_shift_8x8_avx2(out + 48, out_shift); + highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 64); + } + } +} typedef void (*transform_1d_avx2)(__m256i *in, __m256i *out, int bit, int do_cols, int bd, int out_shift); @@ -1164,19 +4086,21 @@ static const transform_1d_avx2 { NULL, NULL, NULL, NULL }, { NULL, NULL, NULL, NULL }, }, - { { NULL, NULL, NULL, NULL }, - { NULL, NULL, NULL, NULL }, - { NULL, NULL, NULL, NULL } }, { + { idct8x8_low1_avx2, idct8x8_avx2, NULL, NULL }, + { iadst8x8_low1_avx2, iadst8x8_avx2, NULL, NULL }, { NULL, NULL, NULL, NULL }, - { NULL, NULL, NULL, NULL }, + }, + { + { idct16_low1_avx2, idct16_low8_avx2, idct16_avx2, NULL }, + { iadst16_low1_avx2, iadst16_low8_avx2, iadst16_avx2, NULL }, { NULL, NULL, NULL, NULL }, }, { { idct32_low1_avx2, idct32_low8_avx2, idct32_low16_avx2, idct32_avx2 }, { NULL, NULL, NULL, NULL }, { NULL, NULL, NULL, NULL } }, - { { NULL, NULL, NULL, NULL }, + { { idct64_low1_avx2, idct64_low8_avx2, idct64_low16_avx2, idct64_avx2 }, { NULL, NULL, NULL, NULL }, { NULL, NULL, NULL, NULL } } }; @@ -1186,10 +4110,10 @@ static void highbd_inv_txfm2d_add_no_identity_avx2(const int32_t *input, TX_TYPE tx_type, TX_SIZE tx_size, int eob, const int bd) { - __m256i 
buf1[64 * 2]; + __m256i buf1[64 * 8]; int eobx, eoby; get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob); - const int8_t *shift = inv_txfm_shift_ls[tx_size]; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; const int txw_idx = get_txw_idx(tx_size); const int txh_idx = get_txh_idx(tx_size); const int txfm_size_col = tx_size_wide[tx_size]; @@ -1198,7 +4122,7 @@ static void highbd_inv_txfm2d_add_no_identity_avx2(const int32_t *input, const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3; const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3; const int input_stride = AOMMIN(32, txfm_size_col); - + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx]; const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby]; const transform_1d_avx2 row_txfm = @@ -1213,7 +4137,7 @@ static void highbd_inv_txfm2d_add_no_identity_avx2(const int32_t *input, // 1st stage: column transform for (int i = 0; i < buf_size_nonzero_h_div8; i++) { - __m256i buf0[32]; + __m256i buf0[64]; const int32_t *input_row = input + i * input_stride * 8; for (int j = 0; j < buf_size_nonzero_w_div8; ++j) { __m256i *buf0_cur = buf0 + j * 8; @@ -1221,18 +4145,29 @@ static void highbd_inv_txfm2d_add_no_identity_avx2(const int32_t *input, transpose_8x8_avx2(&buf0_cur[0], &buf0_cur[0]); } - - row_txfm(buf0, buf0, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]); + if (rect_type == 1 || rect_type == -1) { + av1_round_shift_rect_array_32_avx2( + buf0, buf0, buf_size_nonzero_w_div8 << 3, 0, NewInvSqrt2); + } + row_txfm(buf0, buf0, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, + -shift[0]); __m256i *_buf1 = buf1 + i * 8; - for (int j = 0; j < buf_size_w_div8; ++j) { - transpose_8x8_avx2(&buf0[j * 8], &_buf1[j * txfm_size_row]); + if (lr_flip) { + for (int j = 0; j < buf_size_w_div8; ++j) { + transpose_8x8_flip_avx2( + &buf0[j * 8], &_buf1[(buf_size_w_div8 - 1 - j) * txfm_size_row]); + } + } else { + for (int j = 0; j < buf_size_w_div8; 
++j) { + transpose_8x8_avx2(&buf0[j * 8], &_buf1[j * txfm_size_row]); + } } } // 2nd stage: column transform for (int i = 0; i < buf_size_w_div8; i++) { col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, - inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); av1_round_shift_array_32_avx2(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, txfm_size_row, @@ -1240,12 +4175,15 @@ static void highbd_inv_txfm2d_add_no_identity_avx2(const int32_t *input, } // write to buffer - { + if (txfm_size_col >= 16) { for (int i = 0; i < (txfm_size_col >> 4); i++) { highbd_write_buffer_16xn_avx2(buf1 + i * txfm_size_row * 2, output + 16 * i, stride, ud_flip, txfm_size_row, bd); } + } else if (txfm_size_col == 8) { + highbd_write_buffer_8xn_avx2(buf1, output, stride, ud_flip, txfm_size_row, + bd); } } @@ -1255,95 +4193,54 @@ void av1_highbd_inv_txfm2d_add_universe_avx2(const int32_t *input, int eob, const int bd) { switch (tx_type) { case DCT_DCT: + case ADST_DCT: + case DCT_ADST: + case ADST_ADST: + case FLIPADST_DCT: + case DCT_FLIPADST: + case FLIPADST_FLIPADST: + case ADST_FLIPADST: + case FLIPADST_ADST: highbd_inv_txfm2d_add_no_identity_avx2(input, CONVERT_TO_SHORTPTR(output), stride, tx_type, tx_size, eob, bd); break; - default: assert(0); break; - } -} - -void av1_highbd_inv_txfm_add_32x32_avx2(const tran_low_t *input, uint8_t *dest, - int stride, - const TxfmParam *txfm_param) { - const int bd = txfm_param->bd; - const TX_TYPE tx_type = txfm_param->tx_type; - const int32_t *src = cast_to_int32(input); - switch (tx_type) { - case DCT_DCT: - av1_highbd_inv_txfm2d_add_universe_avx2(input, dest, stride, tx_type, - txfm_param->tx_size, - txfm_param->eob, bd); - break; - // Assembly version doesn't support IDTX, so use C version for it. 
case IDTX: - av1_inv_txfm2d_add_32x32_c(src, CONVERT_TO_SHORTPTR(dest), stride, - tx_type, bd); + case H_DCT: + case H_ADST: + case H_FLIPADST: + case V_DCT: + case V_ADST: + case V_FLIPADST: + av1_highbd_inv_txfm2d_add_universe_sse4_1(input, output, stride, tx_type, + tx_size, eob, bd); break; - - default: assert(0); + default: assert(0); break; } } - void av1_highbd_inv_txfm_add_avx2(const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param) { assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]); const TX_SIZE tx_size = txfm_param->tx_size; switch (tx_size) { - case TX_32X32: - av1_highbd_inv_txfm_add_32x32_avx2(input, dest, stride, txfm_param); - break; - case TX_16X16: - av1_highbd_inv_txfm_add_16x16_sse4_1(input, dest, stride, txfm_param); - break; - case TX_8X8: - av1_highbd_inv_txfm_add_8x8_sse4_1(input, dest, stride, txfm_param); - break; case TX_4X8: - av1_highbd_inv_txfm_add_4x8(input, dest, stride, txfm_param); + av1_highbd_inv_txfm_add_4x8_sse4_1(input, dest, stride, txfm_param); break; case TX_8X4: - av1_highbd_inv_txfm_add_8x4(input, dest, stride, txfm_param); - break; - case TX_8X16: - av1_highbd_inv_txfm_add_8x16_sse4_1(input, dest, stride, txfm_param); - break; - case TX_16X8: - av1_highbd_inv_txfm_add_16x8_sse4_1(input, dest, stride, txfm_param); - break; - case TX_16X32: - av1_highbd_inv_txfm_add_16x32(input, dest, stride, txfm_param); - break; - case TX_32X16: - av1_highbd_inv_txfm_add_32x16(input, dest, stride, txfm_param); - break; - case TX_32X64: - av1_highbd_inv_txfm_add_32x64(input, dest, stride, txfm_param); - break; - case TX_64X32: - av1_highbd_inv_txfm_add_64x32(input, dest, stride, txfm_param); + av1_highbd_inv_txfm_add_8x4_sse4_1(input, dest, stride, txfm_param); break; case TX_4X4: av1_highbd_inv_txfm_add_4x4_sse4_1(input, dest, stride, txfm_param); break; case TX_16X4: - av1_highbd_inv_txfm_add_16x4(input, dest, stride, txfm_param); + av1_highbd_inv_txfm_add_16x4_sse4_1(input, dest, stride, 
txfm_param); break; case TX_4X16: - av1_highbd_inv_txfm_add_4x16(input, dest, stride, txfm_param); - break; - case TX_8X32: - av1_highbd_inv_txfm_add_8x32(input, dest, stride, txfm_param); - break; - case TX_32X8: - av1_highbd_inv_txfm_add_32x8(input, dest, stride, txfm_param); + av1_highbd_inv_txfm_add_4x16_sse4_1(input, dest, stride, txfm_param); break; - case TX_64X64: - case TX_16X64: - case TX_64X16: - av1_highbd_inv_txfm2d_add_universe_sse4_1( + default: + av1_highbd_inv_txfm2d_add_universe_avx2( input, dest, stride, txfm_param->tx_type, txfm_param->tx_size, txfm_param->eob, txfm_param->bd); break; - default: assert(0 && "Invalid transform size"); break; } } diff --git a/media/libaom/src/av1/common/x86/highbd_inv_txfm_sse4.c b/media/libaom/src/av1/common/x86/highbd_inv_txfm_sse4.c index e29e0baf5..03eaef832 100644 --- a/media/libaom/src/av1/common/x86/highbd_inv_txfm_sse4.c +++ b/media/libaom/src/av1/common/x86/highbd_inv_txfm_sse4.c @@ -17,6 +17,7 @@ #include "av1/common/av1_inv_txfm1d_cfg.h" #include "av1/common/idct.h" #include "av1/common/x86/av1_inv_txfm_ssse3.h" +#include "av1/common/x86/av1_txfm_sse2.h" #include "av1/common/x86/av1_txfm_sse4.h" #include "av1/common/x86/highbd_txfm_utility_sse4.h" @@ -36,19 +37,87 @@ static INLINE __m128i highbd_clamp_epi16(__m128i u, int bd) { return clamped; } +static INLINE void round_shift_4x4(__m128i *in, int shift) { + if (shift != 0) { + __m128i rnding = _mm_set1_epi32(1 << (shift - 1)); + in[0] = _mm_add_epi32(in[0], rnding); + in[1] = _mm_add_epi32(in[1], rnding); + in[2] = _mm_add_epi32(in[2], rnding); + in[3] = _mm_add_epi32(in[3], rnding); + + in[0] = _mm_srai_epi32(in[0], shift); + in[1] = _mm_srai_epi32(in[1], shift); + in[2] = _mm_srai_epi32(in[2], shift); + in[3] = _mm_srai_epi32(in[3], shift); + } +} + +static void round_shift_8x8(__m128i *in, int shift) { + round_shift_4x4(&in[0], shift); + round_shift_4x4(&in[4], shift); + round_shift_4x4(&in[8], shift); + round_shift_4x4(&in[12], shift); +} + +static 
void highbd_clamp_epi32_sse4_1(__m128i *in, __m128i *out, + const __m128i *clamp_lo, + const __m128i *clamp_hi, int size) { + __m128i a0, a1; + for (int i = 0; i < size; i += 4) { + a0 = _mm_max_epi32(in[i], *clamp_lo); + out[i] = _mm_min_epi32(a0, *clamp_hi); + + a1 = _mm_max_epi32(in[i + 1], *clamp_lo); + out[i + 1] = _mm_min_epi32(a1, *clamp_hi); + + a0 = _mm_max_epi32(in[i + 2], *clamp_lo); + out[i + 2] = _mm_min_epi32(a0, *clamp_hi); + + a1 = _mm_max_epi32(in[i + 3], *clamp_lo); + out[i + 3] = _mm_min_epi32(a1, *clamp_hi); + } +} + static INLINE __m128i highbd_get_recon_8x8_sse4_1(const __m128i pred, __m128i res0, __m128i res1, const int bd) { __m128i x0 = _mm_cvtepi16_epi32(pred); __m128i x1 = _mm_cvtepi16_epi32(_mm_srli_si128(pred, 8)); - + __m128i min_clip_val = _mm_setzero_si128(); + __m128i max_clip_val = _mm_set1_epi32((1 << bd) - 1); x0 = _mm_add_epi32(res0, x0); x1 = _mm_add_epi32(res1, x1); + x0 = _mm_max_epi32(x0, min_clip_val); + x0 = _mm_min_epi32(x0, max_clip_val); + x1 = _mm_max_epi32(x1, min_clip_val); + x1 = _mm_min_epi32(x1, max_clip_val); x0 = _mm_packus_epi32(x0, x1); + return x0; +} + +static INLINE __m128i highbd_get_recon_4xn_sse4_1(const __m128i pred, + __m128i res0, const int bd) { + __m128i x0 = _mm_cvtepi16_epi32(pred); + + x0 = _mm_add_epi32(res0, x0); + x0 = _mm_packus_epi32(x0, x0); x0 = highbd_clamp_epi16(x0, bd); return x0; } +static INLINE void highbd_write_buffer_4xn_sse4_1(__m128i *in, uint16_t *output, + int stride, int flipud, + int height, const int bd) { + int j = flipud ? (height - 1) : 0; + const int step = flipud ? 
-1 : 1; + for (int i = 0; i < height; ++i, j += step) { + __m128i v = _mm_loadl_epi64((__m128i const *)(output + i * stride)); + __m128i u = highbd_get_recon_4xn_sse4_1(v, in[j], bd); + + _mm_storel_epi64((__m128i *)(output + i * stride), u); + } +} + static INLINE void highbd_write_buffer_8xn_sse4_1(__m128i *in, uint16_t *output, int stride, int flipud, int height, const int bd) { @@ -91,34 +160,23 @@ static void addsub_sse4_1(const __m128i in0, const __m128i in1, __m128i *out0, *out1 = a1; } -static void addsub_no_clamp_sse4_1(const __m128i in0, const __m128i in1, - __m128i *out0, __m128i *out1) { - __m128i a0 = _mm_add_epi32(in0, in1); - __m128i a1 = _mm_sub_epi32(in0, in1); - - *out0 = a0; - *out1 = a1; -} - -static void addsub_shift_sse4_1(const __m128i in0, const __m128i in1, - __m128i *out0, __m128i *out1, - const __m128i *clamp_lo, - const __m128i *clamp_hi, int shift) { +static void shift_and_clamp_sse4_1(__m128i *in0, __m128i *in1, + const __m128i *clamp_lo, + const __m128i *clamp_hi, int shift) { __m128i offset = _mm_set1_epi32((1 << shift) >> 1); - __m128i in0_w_offset = _mm_add_epi32(in0, offset); - __m128i a0 = _mm_add_epi32(in0_w_offset, in1); - __m128i a1 = _mm_sub_epi32(in0_w_offset, in1); + __m128i in0_w_offset = _mm_add_epi32(*in0, offset); + __m128i in1_w_offset = _mm_add_epi32(*in1, offset); - a0 = _mm_sra_epi32(a0, _mm_cvtsi32_si128(shift)); - a1 = _mm_sra_epi32(a1, _mm_cvtsi32_si128(shift)); + in0_w_offset = _mm_sra_epi32(in0_w_offset, _mm_cvtsi32_si128(shift)); + in1_w_offset = _mm_sra_epi32(in1_w_offset, _mm_cvtsi32_si128(shift)); - a0 = _mm_max_epi32(a0, *clamp_lo); - a0 = _mm_min_epi32(a0, *clamp_hi); - a1 = _mm_max_epi32(a1, *clamp_lo); - a1 = _mm_min_epi32(a1, *clamp_hi); + in0_w_offset = _mm_max_epi32(in0_w_offset, *clamp_lo); + in0_w_offset = _mm_min_epi32(in0_w_offset, *clamp_hi); + in1_w_offset = _mm_max_epi32(in1_w_offset, *clamp_lo); + in1_w_offset = _mm_min_epi32(in1_w_offset, *clamp_hi); - *out0 = a0; - *out1 = a1; + *in0 = 
in0_w_offset; + *in1 = in1_w_offset; } static INLINE void idct32_stage4_sse4_1( @@ -274,63 +332,34 @@ static INLINE void idct32_stage8_sse4_1(__m128i *bf1, const __m128i *cospim32, static INLINE void idct32_stage9_sse4_1(__m128i *bf1, __m128i *out, const int do_cols, const int bd, const int out_shift, - const int log_range) { - if (do_cols) { - addsub_no_clamp_sse4_1(bf1[0], bf1[31], out + 0, out + 31); - addsub_no_clamp_sse4_1(bf1[1], bf1[30], out + 1, out + 30); - addsub_no_clamp_sse4_1(bf1[2], bf1[29], out + 2, out + 29); - addsub_no_clamp_sse4_1(bf1[3], bf1[28], out + 3, out + 28); - addsub_no_clamp_sse4_1(bf1[4], bf1[27], out + 4, out + 27); - addsub_no_clamp_sse4_1(bf1[5], bf1[26], out + 5, out + 26); - addsub_no_clamp_sse4_1(bf1[6], bf1[25], out + 6, out + 25); - addsub_no_clamp_sse4_1(bf1[7], bf1[24], out + 7, out + 24); - addsub_no_clamp_sse4_1(bf1[8], bf1[23], out + 8, out + 23); - addsub_no_clamp_sse4_1(bf1[9], bf1[22], out + 9, out + 22); - addsub_no_clamp_sse4_1(bf1[10], bf1[21], out + 10, out + 21); - addsub_no_clamp_sse4_1(bf1[11], bf1[20], out + 11, out + 20); - addsub_no_clamp_sse4_1(bf1[12], bf1[19], out + 12, out + 19); - addsub_no_clamp_sse4_1(bf1[13], bf1[18], out + 13, out + 18); - addsub_no_clamp_sse4_1(bf1[14], bf1[17], out + 14, out + 17); - addsub_no_clamp_sse4_1(bf1[15], bf1[16], out + 15, out + 16); - } else { + const __m128i *clamp_lo, + const __m128i *clamp_hi) { + addsub_sse4_1(bf1[0], bf1[31], out + 0, out + 31, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[1], bf1[30], out + 1, out + 30, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[2], bf1[29], out + 2, out + 29, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[3], bf1[28], out + 3, out + 28, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[4], bf1[27], out + 4, out + 27, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[5], bf1[26], out + 5, out + 26, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[6], bf1[25], out + 6, out + 25, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[7], bf1[24], out + 7, out + 24, clamp_lo, clamp_hi); 
+ addsub_sse4_1(bf1[8], bf1[23], out + 8, out + 23, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[9], bf1[22], out + 9, out + 22, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[10], bf1[21], out + 10, out + 21, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[11], bf1[20], out + 11, out + 20, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[12], bf1[19], out + 12, out + 19, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[13], bf1[18], out + 13, out + 18, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[14], bf1[17], out + 14, out + 17, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[15], bf1[16], out + 15, out + 16, clamp_lo, clamp_hi); + + if (!do_cols) { const int log_range_out = AOMMAX(16, bd + 6); - const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX( - -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift)))); - const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN( - (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift)))); - - addsub_shift_sse4_1(bf1[0], bf1[31], out + 0, out + 31, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(bf1[1], bf1[30], out + 1, out + 30, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(bf1[2], bf1[29], out + 2, out + 29, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(bf1[3], bf1[28], out + 3, out + 28, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(bf1[4], bf1[27], out + 4, out + 27, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(bf1[5], bf1[26], out + 5, out + 26, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(bf1[6], bf1[25], out + 6, out + 25, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(bf1[7], bf1[24], out + 7, out + 24, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(bf1[8], bf1[23], out + 8, out + 23, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(bf1[9], bf1[22], out + 9, out + 22, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(bf1[10], bf1[21], out + 10, out + 21, 
&clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(bf1[11], bf1[20], out + 11, out + 20, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(bf1[12], bf1[19], out + 12, out + 19, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(bf1[13], bf1[18], out + 13, out + 18, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(bf1[14], bf1[17], out + 14, out + 17, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(bf1[15], bf1[16], out + 15, out + 16, &clamp_lo_out, - &clamp_hi_out, out_shift); + const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); + const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + for (int i = 0; i < 32; i += 8) { + round_shift_4x4(out + i, out_shift); + round_shift_4x4(out + i + 4, out_shift); + } + highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 32); } } @@ -354,17 +383,23 @@ static void neg_shift_sse4_1(const __m128i in0, const __m128i in1, *out1 = a1; } -static void idct4x4_sse4_1(__m128i *in, int bit, int do_cols, int bd) { +static void idct4x4_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, + int bd, int out_shift) { const int32_t *cospi = cospi_arr(bit); const __m128i cospi32 = _mm_set1_epi32(cospi[32]); const __m128i cospi48 = _mm_set1_epi32(cospi[48]); const __m128i cospi16 = _mm_set1_epi32(cospi[16]); const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); - + int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); + __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); __m128i u0, u1, u2, u3; __m128i v0, v1, v2, v3, x, y; + // Stage 0 + // Stage 1 + // Stage 2 v0 = _mm_unpacklo_epi32(in[0], in[1]); v1 = _mm_unpackhi_epi32(in[0], in[1]); v2 = _mm_unpacklo_epi32(in[2], in[3]); @@ -397,21 +432,27 @@ static void idct4x4_sse4_1(__m128i *in, int bit, int do_cols, int bd) { v3 = _mm_add_epi32(v3, rnding); v3 = _mm_srai_epi32(v3, bit); - if (do_cols) { - addsub_no_clamp_sse4_1(v0, v3, in + 0, in + 3); - addsub_no_clamp_sse4_1(v1, v2, in + 1, in + 2); - } else { - const int log_range = AOMMAX(16, bd + 6); - const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); - const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); - addsub_sse4_1(v0, v3, in + 0, in + 3, &clamp_lo, &clamp_hi); - addsub_sse4_1(v1, v2, in + 1, in + 2, &clamp_lo, &clamp_hi); + // Stage 3 + addsub_sse4_1(v0, v3, out + 0, out + 3, &clamp_lo, &clamp_hi); + addsub_sse4_1(v1, v2, out + 1, out + 2, &clamp_lo, &clamp_hi); + + if (!do_cols) { + log_range = AOMMAX(16, bd + 6); + clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + + shift_and_clamp_sse4_1(out + 0, out + 3, &clamp_lo, &clamp_hi, out_shift); + shift_and_clamp_sse4_1(out + 1, out + 2, &clamp_lo, &clamp_hi, out_shift); } } -static void iadst4x4_sse4_1(__m128i *in, int bit, int do_cols, int bd) { +static void iadst4x4_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, + int bd, int out_shift) { const int32_t *sinpi = sinpi_arr(bit); - const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + const __m128i zero = _mm_set1_epi32(0); + __m128i rnding = _mm_set1_epi32(1 << (bit + 4 - 1)); + rnding = _mm_unpacklo_epi32(rnding, zero); + const __m128i mul = _mm_set1_epi32(1 << 4); const __m128i sinpi1 = _mm_set1_epi32((int)sinpi[1]); const __m128i sinpi2 = _mm_set1_epi32((int)sinpi[2]); const __m128i sinpi3 
= _mm_set1_epi32((int)sinpi[3]); @@ -421,6 +462,8 @@ static void iadst4x4_sse4_1(__m128i *in, int bit, int do_cols, int bd) { __m128i x0, x1, x2, x3; __m128i u0, u1, u2, u3; __m128i v0, v1, v2, v3; + __m128i u0_low, u1_low, u2_low, u3_low; + __m128i u0_high, u1_high, u2_high, u3_high; v0 = _mm_unpacklo_epi32(in[0], in[1]); v1 = _mm_unpackhi_epi32(in[0], in[1]); @@ -455,51 +498,78 @@ static void iadst4x4_sse4_1(__m128i *in, int bit, int do_cols, int bd) { t = _mm_add_epi32(s0, s1); u3 = _mm_sub_epi32(t, s3); - u0 = _mm_add_epi32(u0, rnding); - u0 = _mm_srai_epi32(u0, bit); + // u0 + u0_low = _mm_mul_epi32(u0, mul); + u0_low = _mm_add_epi64(u0_low, rnding); + + u0 = _mm_srli_si128(u0, 4); + u0_high = _mm_mul_epi32(u0, mul); + u0_high = _mm_add_epi64(u0_high, rnding); + + u0_low = _mm_srli_si128(u0_low, 2); + u0_high = _mm_srli_si128(u0_high, 2); + + u0 = _mm_unpacklo_epi32(u0_low, u0_high); + u0_high = _mm_unpackhi_epi32(u0_low, u0_high); + u0 = _mm_unpacklo_epi64(u0, u0_high); + + // u1 + u1_low = _mm_mul_epi32(u1, mul); + u1_low = _mm_add_epi64(u1_low, rnding); + + u1 = _mm_srli_si128(u1, 4); + u1_high = _mm_mul_epi32(u1, mul); + u1_high = _mm_add_epi64(u1_high, rnding); + + u1_low = _mm_srli_si128(u1_low, 2); + u1_high = _mm_srli_si128(u1_high, 2); + + u1 = _mm_unpacklo_epi32(u1_low, u1_high); + u1_high = _mm_unpackhi_epi32(u1_low, u1_high); + u1 = _mm_unpacklo_epi64(u1, u1_high); + + // u2 + u2_low = _mm_mul_epi32(u2, mul); + u2_low = _mm_add_epi64(u2_low, rnding); + + u2 = _mm_srli_si128(u2, 4); + u2_high = _mm_mul_epi32(u2, mul); + u2_high = _mm_add_epi64(u2_high, rnding); + + u2_low = _mm_srli_si128(u2_low, 2); + u2_high = _mm_srli_si128(u2_high, 2); + + u2 = _mm_unpacklo_epi32(u2_low, u2_high); + u2_high = _mm_unpackhi_epi32(u2_low, u2_high); + u2 = _mm_unpacklo_epi64(u2, u2_high); + + // u3 + u3_low = _mm_mul_epi32(u3, mul); + u3_low = _mm_add_epi64(u3_low, rnding); + + u3 = _mm_srli_si128(u3, 4); + u3_high = _mm_mul_epi32(u3, mul); + u3_high = 
_mm_add_epi64(u3_high, rnding); - u1 = _mm_add_epi32(u1, rnding); - u1 = _mm_srai_epi32(u1, bit); + u3_low = _mm_srli_si128(u3_low, 2); + u3_high = _mm_srli_si128(u3_high, 2); - u2 = _mm_add_epi32(u2, rnding); - u2 = _mm_srai_epi32(u2, bit); + u3 = _mm_unpacklo_epi32(u3_low, u3_high); + u3_high = _mm_unpackhi_epi32(u3_low, u3_high); + u3 = _mm_unpacklo_epi64(u3, u3_high); - u3 = _mm_add_epi32(u3, rnding); - u3 = _mm_srai_epi32(u3, bit); + out[0] = u0; + out[1] = u1; + out[2] = u2; + out[3] = u3; if (!do_cols) { const int log_range = AOMMAX(16, bd + 6); const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); - - u0 = _mm_max_epi32(u0, clamp_lo); - u0 = _mm_min_epi32(u0, clamp_hi); - u1 = _mm_max_epi32(u1, clamp_lo); - u1 = _mm_min_epi32(u1, clamp_hi); - u2 = _mm_max_epi32(u2, clamp_lo); - u2 = _mm_min_epi32(u2, clamp_hi); - u3 = _mm_max_epi32(u3, clamp_lo); - u3 = _mm_min_epi32(u3, clamp_hi); + round_shift_4x4(out, out_shift); + highbd_clamp_epi32_sse4_1(out, out, &clamp_lo, &clamp_hi, 4); } - - in[0] = u0; - in[1] = u1; - in[2] = u2; - in[3] = u3; -} - -static INLINE void round_shift_4x4(__m128i *in, int shift) { - __m128i rnding = _mm_set1_epi32(1 << (shift - 1)); - - in[0] = _mm_add_epi32(in[0], rnding); - in[1] = _mm_add_epi32(in[1], rnding); - in[2] = _mm_add_epi32(in[2], rnding); - in[3] = _mm_add_epi32(in[3], rnding); - - in[0] = _mm_srai_epi32(in[0], shift); - in[1] = _mm_srai_epi32(in[1], shift); - in[2] = _mm_srai_epi32(in[2], shift); - in[3] = _mm_srai_epi32(in[3], shift); } static void write_buffer_4x4(__m128i *in, uint16_t *output, int stride, @@ -556,68 +626,164 @@ static void write_buffer_4x4(__m128i *in, uint16_t *output, int stride, _mm_storel_epi64((__m128i *)(output + 3 * stride), v3); } -void av1_inv_txfm2d_add_4x4_sse4_1(const int32_t *coeff, uint16_t *output, +static void iidentity4_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, + int bd, int out_shift) { + 
(void)bit; + __m128i v[4]; + __m128i zero = _mm_set1_epi32(0); + __m128i fact = _mm_set1_epi32(NewSqrt2); + __m128i offset = _mm_set1_epi32(1 << (NewSqrt2Bits - 1)); + __m128i a0_low, a1_low; + __m128i a0_high, a1_high; + + offset = _mm_unpacklo_epi32(offset, zero); + + for (int i = 0; i < 4; i++) { + a0_low = _mm_mul_epi32(in[i], fact); + a0_low = _mm_add_epi32(a0_low, offset); + a0_low = _mm_srli_epi64(a0_low, NewSqrt2Bits); + + a0_high = _mm_srli_si128(in[i], 4); + a0_high = _mm_mul_epi32(a0_high, fact); + a0_high = _mm_add_epi32(a0_high, offset); + a0_high = _mm_srli_epi64(a0_high, NewSqrt2Bits); + + a1_low = _mm_unpacklo_epi32(a0_low, a0_high); + a1_high = _mm_unpackhi_epi32(a0_low, a0_high); + out[i] = _mm_unpacklo_epi64(a1_low, a1_high); + } + + if (!do_cols) { + const int log_range = AOMMAX(16, bd + 6); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + round_shift_4x4(out, out_shift); + highbd_clamp_epi32_sse4_1(out, out, &clamp_lo, &clamp_hi, 4); + } + + // Transpose for 4x4 + v[0] = _mm_unpacklo_epi32(out[0], out[1]); + v[1] = _mm_unpackhi_epi32(out[0], out[1]); + v[2] = _mm_unpacklo_epi32(out[2], out[3]); + v[3] = _mm_unpackhi_epi32(out[2], out[3]); + + out[0] = _mm_unpacklo_epi64(v[0], v[2]); + out[1] = _mm_unpackhi_epi64(v[0], v[2]); + out[2] = _mm_unpacklo_epi64(v[1], v[3]); + out[3] = _mm_unpackhi_epi64(v[1], v[3]); +} +void av1_inv_txfm2d_add_4x4_sse4_1(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd) { __m128i in[4]; - const int8_t *shift = inv_txfm_shift_ls[TX_4X4]; + const int8_t *shift = av1_inv_txfm_shift_ls[TX_4X4]; const int txw_idx = get_txw_idx(TX_4X4); const int txh_idx = get_txh_idx(TX_4X4); switch (tx_type) { case DCT_DCT: - load_buffer_4x4(coeff, in); - idct4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd); - idct4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd); + load_buffer_4x4(input, in); + 
idct4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0); + idct4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); break; case ADST_DCT: - load_buffer_4x4(coeff, in); - idct4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd); - iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd); + load_buffer_4x4(input, in); + idct4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0); + iadst4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); break; case DCT_ADST: - load_buffer_4x4(coeff, in); - iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd); - idct4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd); + load_buffer_4x4(input, in); + iadst4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0); + idct4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); break; case ADST_ADST: - load_buffer_4x4(coeff, in); - iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd); - iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd); + load_buffer_4x4(input, in); + iadst4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0); + iadst4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); break; case FLIPADST_DCT: - load_buffer_4x4(coeff, in); - idct4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd); - iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd); + load_buffer_4x4(input, in); + idct4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0); + iadst4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd); break; case DCT_FLIPADST: - load_buffer_4x4(coeff, in); - iadst4x4_sse4_1(in, 
inv_cos_bit_row[txw_idx][txh_idx], 0, bd); - idct4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd); + load_buffer_4x4(input, in); + iadst4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0); + idct4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd); break; case FLIPADST_FLIPADST: - load_buffer_4x4(coeff, in); - iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd); - iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd); + load_buffer_4x4(input, in); + iadst4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0); + iadst4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); write_buffer_4x4(in, output, stride, 1, 1, -shift[1], bd); break; case ADST_FLIPADST: - load_buffer_4x4(coeff, in); - iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd); - iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd); + load_buffer_4x4(input, in); + iadst4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0); + iadst4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd); break; case FLIPADST_ADST: - load_buffer_4x4(coeff, in); - iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd); - iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd); + load_buffer_4x4(input, in); + iadst4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0); + iadst4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd); + break; + case IDTX: + load_buffer_4x4(input, in); + iidentity4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, + 0); + iidentity4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, + 0); + write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); + break; + case V_DCT: + load_buffer_4x4(input, in); + iidentity4_sse4_1(in, in, 
av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, + 0); + idct4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); + break; + case H_DCT: + load_buffer_4x4(input, in); + idct4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0); + iidentity4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, + 0); + write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); + break; + case V_ADST: + load_buffer_4x4(input, in); + iidentity4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, + 0); + iadst4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); + break; + case H_ADST: + load_buffer_4x4(input, in); + iadst4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0); + iidentity4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, + 0); + write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); + break; + case V_FLIPADST: + load_buffer_4x4(input, in); + iidentity4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, + 0); + iadst4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd); break; + case H_FLIPADST: + load_buffer_4x4(input, in); + iadst4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0); + iidentity4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, + 0); + write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd); + break; default: assert(0); } } @@ -745,26 +911,22 @@ static void idct8x8_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, u5 = _mm_srai_epi32(u5, bit); // stage 5 - if (do_cols) { - addsub_no_clamp_sse4_1(u0, u7, out + 0 * 2 + col, out + 7 * 2 + col); - addsub_no_clamp_sse4_1(u1, u6, out + 1 * 2 + col, out + 6 * 2 + col); - addsub_no_clamp_sse4_1(u2, u5, out + 2 * 2 + col, out + 5 * 2 + col); - addsub_no_clamp_sse4_1(u3, u4, out + 3 * 2 + 
col, out + 4 * 2 + col); - } else { - const int log_range_out = AOMMAX(16, bd + 6); - const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX( - -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift)))); - const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN( - (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift)))); - addsub_shift_sse4_1(u0, u7, out + 0 * 2 + col, out + 7 * 2 + col, - &clamp_lo_out, &clamp_hi_out, out_shift); - addsub_shift_sse4_1(u1, u6, out + 1 * 2 + col, out + 6 * 2 + col, - &clamp_lo_out, &clamp_hi_out, out_shift); - addsub_shift_sse4_1(u2, u5, out + 2 * 2 + col, out + 5 * 2 + col, - &clamp_lo_out, &clamp_hi_out, out_shift); - addsub_shift_sse4_1(u3, u4, out + 3 * 2 + col, out + 4 * 2 + col, - &clamp_lo_out, &clamp_hi_out, out_shift); - } + addsub_sse4_1(u0, u7, out + 0 * 2 + col, out + 7 * 2 + col, &clamp_lo, + &clamp_hi); + addsub_sse4_1(u1, u6, out + 1 * 2 + col, out + 6 * 2 + col, &clamp_lo, + &clamp_hi); + addsub_sse4_1(u2, u5, out + 2 * 2 + col, out + 5 * 2 + col, &clamp_lo, + &clamp_hi); + addsub_sse4_1(u3, u4, out + 3 * 2 + col, out + 4 * 2 + col, &clamp_lo, + &clamp_hi); + } + + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); + const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + round_shift_8x8(out, out_shift); + highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 16); } } @@ -1089,11 +1251,26 @@ static void iadst8x8_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, } } -static void round_shift_8x8(__m128i *in, int shift) { - round_shift_4x4(&in[0], shift); - round_shift_4x4(&in[4], shift); - round_shift_4x4(&in[8], shift); - round_shift_4x4(&in[12], shift); +static void iidentity8_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, + int bd, int out_shift) { + (void)bit; + out[0] = _mm_add_epi32(in[0], in[0]); + out[1] = _mm_add_epi32(in[1], in[1]); + out[2] = 
_mm_add_epi32(in[2], in[2]); + out[3] = _mm_add_epi32(in[3], in[3]); + out[4] = _mm_add_epi32(in[4], in[4]); + out[5] = _mm_add_epi32(in[5], in[5]); + out[6] = _mm_add_epi32(in[6], in[6]); + out[7] = _mm_add_epi32(in[7], in[7]); + + if (!do_cols) { + const int log_range = AOMMAX(16, bd + 6); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + round_shift_4x4(out, out_shift); + round_shift_4x4(out + 4, out_shift); + highbd_clamp_epi32_sse4_1(out, out, &clamp_lo, &clamp_hi, 8); + } } static __m128i get_recon_8x8(const __m128i pred, __m128i res_lo, __m128i res_hi, @@ -1165,93 +1342,93 @@ static void write_buffer_8x8(__m128i *in, uint16_t *output, int stride, _mm_store_si128((__m128i *)(output + 7 * stride), u7); } -void av1_inv_txfm2d_add_8x8_sse4_1(const int32_t *coeff, uint16_t *output, +void av1_inv_txfm2d_add_8x8_sse4_1(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd) { __m128i in[16], out[16]; - const int8_t *shift = inv_txfm_shift_ls[TX_8X8]; + const int8_t *shift = av1_inv_txfm_shift_ls[TX_8X8]; const int txw_idx = get_txw_idx(TX_8X8); const int txh_idx = get_txh_idx(TX_8X8); switch (tx_type) { case DCT_DCT: - load_buffer_8x8(coeff, in); + load_buffer_8x8(input, in); transpose_8x8(in, out); - idct8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, + idct8x8_sse4_1(out, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]); transpose_8x8(in, out); - idct8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + idct8x8_sse4_1(out, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); write_buffer_8x8(in, output, stride, 0, 0, -shift[1], bd); break; case DCT_ADST: - load_buffer_8x8(coeff, in); + load_buffer_8x8(input, in); transpose_8x8(in, out); - iadst8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, + iadst8x8_sse4_1(out, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]); transpose_8x8(in, out); - 
idct8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + idct8x8_sse4_1(out, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); write_buffer_8x8(in, output, stride, 0, 0, -shift[1], bd); break; case ADST_DCT: - load_buffer_8x8(coeff, in); + load_buffer_8x8(input, in); transpose_8x8(in, out); - idct8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, + idct8x8_sse4_1(out, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]); transpose_8x8(in, out); - iadst8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + iadst8x8_sse4_1(out, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); write_buffer_8x8(in, output, stride, 0, 0, -shift[1], bd); break; case ADST_ADST: - load_buffer_8x8(coeff, in); + load_buffer_8x8(input, in); transpose_8x8(in, out); - iadst8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, + iadst8x8_sse4_1(out, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]); transpose_8x8(in, out); - iadst8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + iadst8x8_sse4_1(out, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); write_buffer_8x8(in, output, stride, 0, 0, -shift[1], bd); break; case FLIPADST_DCT: - load_buffer_8x8(coeff, in); + load_buffer_8x8(input, in); transpose_8x8(in, out); - idct8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, + idct8x8_sse4_1(out, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]); transpose_8x8(in, out); - iadst8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + iadst8x8_sse4_1(out, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); write_buffer_8x8(in, output, stride, 0, 1, -shift[1], bd); break; case DCT_FLIPADST: - load_buffer_8x8(coeff, in); + load_buffer_8x8(input, in); transpose_8x8(in, out); - iadst8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, + iadst8x8_sse4_1(out, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]); transpose_8x8(in, out); - idct8x8_sse4_1(out, in, 
inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + idct8x8_sse4_1(out, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); write_buffer_8x8(in, output, stride, 1, 0, -shift[1], bd); break; case ADST_FLIPADST: - load_buffer_8x8(coeff, in); + load_buffer_8x8(input, in); transpose_8x8(in, out); - iadst8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, + iadst8x8_sse4_1(out, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]); transpose_8x8(in, out); - iadst8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + iadst8x8_sse4_1(out, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); write_buffer_8x8(in, output, stride, 1, 0, -shift[1], bd); break; case FLIPADST_FLIPADST: - load_buffer_8x8(coeff, in); + load_buffer_8x8(input, in); transpose_8x8(in, out); - iadst8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, + iadst8x8_sse4_1(out, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]); transpose_8x8(in, out); - iadst8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + iadst8x8_sse4_1(out, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); write_buffer_8x8(in, output, stride, 1, 1, -shift[1], bd); break; case FLIPADST_ADST: - load_buffer_8x8(coeff, in); + load_buffer_8x8(input, in); transpose_8x8(in, out); - iadst8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, + iadst8x8_sse4_1(out, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]); transpose_8x8(in, out); - iadst8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + iadst8x8_sse4_1(out, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); write_buffer_8x8(in, output, stride, 0, 1, -shift[1], bd); break; default: assert(0); @@ -1264,6 +1441,8 @@ static void idct8x8_low1_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, const __m128i cospi32 = _mm_set1_epi32(cospi[32]); const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); + __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); __m128i x; // stage 0 @@ -1278,18 +1457,16 @@ static void idct8x8_low1_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, // stage 5 if (!do_cols) { const int log_range_out = AOMMAX(16, bd + 6); - const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX( - -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift)))); - const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN( - (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift)))); + clamp_lo = _mm_set1_epi32(-(1 << (log_range_out - 1))); + clamp_hi = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1); x = _mm_add_epi32(x, offset); x = _mm_sra_epi32(x, _mm_cvtsi32_si128(out_shift)); - x = _mm_max_epi32(x, clamp_lo_out); - x = _mm_min_epi32(x, clamp_hi_out); } + x = _mm_max_epi32(x, clamp_lo); + x = _mm_min_epi32(x, clamp_hi); out[0] = x; out[1] = x; out[2] = x; @@ -1396,25 +1573,19 @@ static void idct8x8_new_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, u5 = _mm_srai_epi32(u5, bit); // stage 5 - if (do_cols) { - addsub_no_clamp_sse4_1(u0, u7, out + 0, out + 7); - addsub_no_clamp_sse4_1(u1, u6, out + 1, out + 6); - addsub_no_clamp_sse4_1(u2, u5, out + 2, out + 5); - addsub_no_clamp_sse4_1(u3, u4, out + 3, out + 4); - } else { + addsub_sse4_1(u0, u7, out + 0, out + 7, &clamp_lo, &clamp_hi); + addsub_sse4_1(u1, u6, out + 1, out + 6, &clamp_lo, &clamp_hi); + addsub_sse4_1(u2, u5, out + 2, out + 5, &clamp_lo, &clamp_hi); + addsub_sse4_1(u3, u4, out + 3, out + 4, &clamp_lo, &clamp_hi); + + if (!do_cols) { const int log_range_out = AOMMAX(16, bd + 6); - const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX( - -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift)))); - const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN( - (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift)))); - 
addsub_shift_sse4_1(u0, u7, out + 0, out + 7, &clamp_lo_out, &clamp_hi_out, - out_shift); - addsub_shift_sse4_1(u1, u6, out + 1, out + 6, &clamp_lo_out, &clamp_hi_out, - out_shift); - addsub_shift_sse4_1(u2, u5, out + 2, out + 5, &clamp_lo_out, &clamp_hi_out, - out_shift); - addsub_shift_sse4_1(u3, u4, out + 3, out + 4, &clamp_lo_out, &clamp_hi_out, - out_shift); + const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); + const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + + round_shift_4x4(out, out_shift); + round_shift_4x4(out + 4, out_shift); + highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 8); } } @@ -1683,56 +1854,50 @@ static void idct16x16_low1_sse4_1(__m128i *in, __m128i *out, int bit, const int32_t *cospi = cospi_arr(bit); const __m128i cospi32 = _mm_set1_epi32(cospi[32]); const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); - const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); - const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); - const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); - - { - // stage 0 - // stage 1 - // stage 2 - // stage 3 - // stage 4 - in[0] = _mm_mullo_epi32(in[0], cospi32); - in[0] = _mm_add_epi32(in[0], rnding); - in[0] = _mm_srai_epi32(in[0], bit); + int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); + __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + // stage 0 + // stage 1 + // stage 2 + // stage 3 + // stage 4 + in[0] = _mm_mullo_epi32(in[0], cospi32); + in[0] = _mm_add_epi32(in[0], rnding); + in[0] = _mm_srai_epi32(in[0], bit); - // stage 5 - // stage 6 - // stage 7 - if (do_cols) { - in[0] = _mm_max_epi32(in[0], clamp_lo); - in[0] = _mm_min_epi32(in[0], clamp_hi); - } else { - const int log_range_out = AOMMAX(16, bd + 6); - const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX( - -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift)))); - const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN( - (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift)))); + // stage 5 + // stage 6 + // stage 7 + if (!do_cols) { + log_range = AOMMAX(16, bd + 6); + clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + if (out_shift != 0) { __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1); in[0] = _mm_add_epi32(in[0], offset); in[0] = _mm_sra_epi32(in[0], _mm_cvtsi32_si128(out_shift)); - in[0] = _mm_max_epi32(in[0], clamp_lo_out); - in[0] = _mm_min_epi32(in[0], clamp_hi_out); } - - out[0] = in[0]; - out[1] = in[0]; - out[2] = in[0]; - out[3] = in[0]; - out[4] = in[0]; - out[5] = in[0]; - out[6] = in[0]; - out[7] = in[0]; - out[8] = in[0]; - out[9] = in[0]; - out[10] = in[0]; - out[11] = in[0]; - out[12] = in[0]; - out[13] = in[0]; - out[14] = in[0]; - out[15] = in[0]; } + + in[0] = _mm_max_epi32(in[0], clamp_lo); + in[0] = _mm_min_epi32(in[0], clamp_hi); + out[0] = in[0]; + out[1] = in[0]; + out[2] = in[0]; + out[3] = in[0]; + out[4] = in[0]; + out[5] = in[0]; + out[6] = in[0]; + out[7] = in[0]; + out[8] = in[0]; + out[9] = in[0]; + out[10] = in[0]; + out[11] = in[0]; + out[12] = in[0]; + out[13] = in[0]; + out[14] = in[0]; + out[15] = in[0]; } static void idct16x16_low8_sse4_1(__m128i *in, __m128i *out, int bit, @@ 
-1760,140 +1925,120 @@ static void idct16x16_low8_sse4_1(__m128i *in, __m128i *out, int bit, const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); __m128i u[16], x, y; + // stage 0 + // stage 1 + u[0] = in[0]; + u[2] = in[4]; + u[4] = in[2]; + u[6] = in[6]; + u[8] = in[1]; + u[10] = in[5]; + u[12] = in[3]; + u[14] = in[7]; - { - // stage 0 - // stage 1 - u[0] = in[0]; - u[2] = in[4]; - u[4] = in[2]; - u[6] = in[6]; - u[8] = in[1]; - u[10] = in[5]; - u[12] = in[3]; - u[14] = in[7]; - - // stage 2 - u[15] = half_btf_0_sse4_1(&cospi4, &u[8], &rnding, bit); - u[8] = half_btf_0_sse4_1(&cospi60, &u[8], &rnding, bit); - - u[9] = half_btf_0_sse4_1(&cospim36, &u[14], &rnding, bit); - u[14] = half_btf_0_sse4_1(&cospi28, &u[14], &rnding, bit); - - u[13] = half_btf_0_sse4_1(&cospi20, &u[10], &rnding, bit); - u[10] = half_btf_0_sse4_1(&cospi44, &u[10], &rnding, bit); - - u[11] = half_btf_0_sse4_1(&cospim52, &u[12], &rnding, bit); - u[12] = half_btf_0_sse4_1(&cospi12, &u[12], &rnding, bit); + // stage 2 + u[15] = half_btf_0_sse4_1(&cospi4, &u[8], &rnding, bit); + u[8] = half_btf_0_sse4_1(&cospi60, &u[8], &rnding, bit); - // stage 3 - u[7] = half_btf_0_sse4_1(&cospi8, &u[4], &rnding, bit); - u[4] = half_btf_0_sse4_1(&cospi56, &u[4], &rnding, bit); - u[5] = half_btf_0_sse4_1(&cospim40, &u[6], &rnding, bit); - u[6] = half_btf_0_sse4_1(&cospi24, &u[6], &rnding, bit); + u[9] = half_btf_0_sse4_1(&cospim36, &u[14], &rnding, bit); + u[14] = half_btf_0_sse4_1(&cospi28, &u[14], &rnding, bit); - addsub_sse4_1(u[8], u[9], &u[8], &u[9], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[11], u[10], &u[11], &u[10], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[12], u[13], &u[12], &u[13], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[15], u[14], &u[15], &u[14], &clamp_lo, &clamp_hi); + u[13] = half_btf_0_sse4_1(&cospi20, &u[10], &rnding, bit); + u[10] = half_btf_0_sse4_1(&cospi44, &u[10], &rnding, bit); - // stage 4 - x = 
_mm_mullo_epi32(u[0], cospi32); - u[0] = _mm_add_epi32(x, rnding); - u[0] = _mm_srai_epi32(u[0], bit); - u[1] = u[0]; + u[11] = half_btf_0_sse4_1(&cospim52, &u[12], &rnding, bit); + u[12] = half_btf_0_sse4_1(&cospi12, &u[12], &rnding, bit); - u[3] = half_btf_0_sse4_1(&cospi16, &u[2], &rnding, bit); - u[2] = half_btf_0_sse4_1(&cospi48, &u[2], &rnding, bit); + // stage 3 + u[7] = half_btf_0_sse4_1(&cospi8, &u[4], &rnding, bit); + u[4] = half_btf_0_sse4_1(&cospi56, &u[4], &rnding, bit); + u[5] = half_btf_0_sse4_1(&cospim40, &u[6], &rnding, bit); + u[6] = half_btf_0_sse4_1(&cospi24, &u[6], &rnding, bit); - addsub_sse4_1(u[4], u[5], &u[4], &u[5], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[7], u[6], &u[7], &u[6], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[8], u[9], &u[8], &u[9], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[11], u[10], &u[11], &u[10], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[12], u[13], &u[12], &u[13], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[15], u[14], &u[15], &u[14], &clamp_lo, &clamp_hi); - x = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit); - u[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit); - u[9] = x; - y = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit); - u[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit); - u[10] = y; + // stage 4 + x = _mm_mullo_epi32(u[0], cospi32); + u[0] = _mm_add_epi32(x, rnding); + u[0] = _mm_srai_epi32(u[0], bit); + u[1] = u[0]; - // stage 5 - addsub_sse4_1(u[0], u[3], &u[0], &u[3], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[1], u[2], &u[1], &u[2], &clamp_lo, &clamp_hi); + u[3] = half_btf_0_sse4_1(&cospi16, &u[2], &rnding, bit); + u[2] = half_btf_0_sse4_1(&cospi48, &u[2], &rnding, bit); - x = _mm_mullo_epi32(u[5], cospi32); - y = _mm_mullo_epi32(u[6], cospi32); - u[5] = _mm_sub_epi32(y, x); - u[5] = _mm_add_epi32(u[5], rnding); - u[5] = _mm_srai_epi32(u[5], bit); + addsub_sse4_1(u[4], u[5], &u[4], &u[5], &clamp_lo, &clamp_hi); + 
addsub_sse4_1(u[7], u[6], &u[7], &u[6], &clamp_lo, &clamp_hi); - u[6] = _mm_add_epi32(y, x); - u[6] = _mm_add_epi32(u[6], rnding); - u[6] = _mm_srai_epi32(u[6], bit); + x = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit); + u[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit); + u[9] = x; + y = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit); + u[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit); + u[10] = y; - addsub_sse4_1(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi); + // stage 5 + addsub_sse4_1(u[0], u[3], &u[0], &u[3], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[1], u[2], &u[1], &u[2], &clamp_lo, &clamp_hi); - // stage 6 - addsub_sse4_1(u[0], u[7], &u[0], &u[7], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[1], u[6], &u[1], &u[6], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[2], u[5], &u[2], &u[5], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[3], u[4], &u[3], &u[4], &clamp_lo, &clamp_hi); + x = _mm_mullo_epi32(u[5], cospi32); + y = _mm_mullo_epi32(u[6], cospi32); + u[5] = _mm_sub_epi32(y, x); + u[5] = _mm_add_epi32(u[5], rnding); + u[5] = _mm_srai_epi32(u[5], bit); - x = _mm_mullo_epi32(u[10], cospi32); - y = _mm_mullo_epi32(u[13], cospi32); - u[10] = _mm_sub_epi32(y, x); - u[10] = _mm_add_epi32(u[10], rnding); - u[10] = _mm_srai_epi32(u[10], bit); + u[6] = _mm_add_epi32(y, x); + u[6] = _mm_add_epi32(u[6], rnding); + u[6] = _mm_srai_epi32(u[6], bit); - u[13] = _mm_add_epi32(x, y); - u[13] = _mm_add_epi32(u[13], rnding); - u[13] = _mm_srai_epi32(u[13], bit); + addsub_sse4_1(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[14], u[13], 
&u[14], &u[13], &clamp_lo, &clamp_hi); - x = _mm_mullo_epi32(u[11], cospi32); - y = _mm_mullo_epi32(u[12], cospi32); - u[11] = _mm_sub_epi32(y, x); - u[11] = _mm_add_epi32(u[11], rnding); - u[11] = _mm_srai_epi32(u[11], bit); + // stage 6 + addsub_sse4_1(u[0], u[7], &u[0], &u[7], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[1], u[6], &u[1], &u[6], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[2], u[5], &u[2], &u[5], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[3], u[4], &u[3], &u[4], &clamp_lo, &clamp_hi); + + x = _mm_mullo_epi32(u[10], cospi32); + y = _mm_mullo_epi32(u[13], cospi32); + u[10] = _mm_sub_epi32(y, x); + u[10] = _mm_add_epi32(u[10], rnding); + u[10] = _mm_srai_epi32(u[10], bit); + + u[13] = _mm_add_epi32(x, y); + u[13] = _mm_add_epi32(u[13], rnding); + u[13] = _mm_srai_epi32(u[13], bit); + + x = _mm_mullo_epi32(u[11], cospi32); + y = _mm_mullo_epi32(u[12], cospi32); + u[11] = _mm_sub_epi32(y, x); + u[11] = _mm_add_epi32(u[11], rnding); + u[11] = _mm_srai_epi32(u[11], bit); + + u[12] = _mm_add_epi32(x, y); + u[12] = _mm_add_epi32(u[12], rnding); + u[12] = _mm_srai_epi32(u[12], bit); + // stage 7 + addsub_sse4_1(u[0], u[15], out + 0, out + 15, &clamp_lo, &clamp_hi); + addsub_sse4_1(u[1], u[14], out + 1, out + 14, &clamp_lo, &clamp_hi); + addsub_sse4_1(u[2], u[13], out + 2, out + 13, &clamp_lo, &clamp_hi); + addsub_sse4_1(u[3], u[12], out + 3, out + 12, &clamp_lo, &clamp_hi); + addsub_sse4_1(u[4], u[11], out + 4, out + 11, &clamp_lo, &clamp_hi); + addsub_sse4_1(u[5], u[10], out + 5, out + 10, &clamp_lo, &clamp_hi); + addsub_sse4_1(u[6], u[9], out + 6, out + 9, &clamp_lo, &clamp_hi); + addsub_sse4_1(u[7], u[8], out + 7, out + 8, &clamp_lo, &clamp_hi); - u[12] = _mm_add_epi32(x, y); - u[12] = _mm_add_epi32(u[12], rnding); - u[12] = _mm_srai_epi32(u[12], bit); - // stage 7 - if (do_cols) { - addsub_no_clamp_sse4_1(u[0], u[15], out + 0, out + 15); - addsub_no_clamp_sse4_1(u[1], u[14], out + 1, out + 14); - addsub_no_clamp_sse4_1(u[2], u[13], out + 2, out + 13); - 
addsub_no_clamp_sse4_1(u[3], u[12], out + 3, out + 12); - addsub_no_clamp_sse4_1(u[4], u[11], out + 4, out + 11); - addsub_no_clamp_sse4_1(u[5], u[10], out + 5, out + 10); - addsub_no_clamp_sse4_1(u[6], u[9], out + 6, out + 9); - addsub_no_clamp_sse4_1(u[7], u[8], out + 7, out + 8); - } else { - const int log_range_out = AOMMAX(16, bd + 6); - const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX( - -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift)))); - const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN( - (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift)))); - - addsub_shift_sse4_1(u[0], u[15], out + 0, out + 15, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(u[1], u[14], out + 1, out + 14, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(u[2], u[13], out + 2, out + 13, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(u[3], u[12], out + 3, out + 12, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(u[4], u[11], out + 4, out + 11, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(u[5], u[10], out + 5, out + 10, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(u[6], u[9], out + 6, out + 9, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(u[7], u[8], out + 7, out + 8, &clamp_lo_out, - &clamp_hi_out, out_shift); - } + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); + const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + round_shift_8x8(out, out_shift); + highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 16); } } @@ -1910,167 +2055,162 @@ static void iadst16x16_low1_sse4_1(__m128i *in, __m128i *out, int bit, const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); const __m128i zero = _mm_setzero_si128(); __m128i v[16], x, y, temp1, temp2; + // stage 0 + // stage 1 + // stage 2 + x = 
_mm_mullo_epi32(in[0], cospi62); + v[0] = _mm_add_epi32(x, rnding); + v[0] = _mm_srai_epi32(v[0], bit); - // Calculate the column 0, 1, 2, 3 - { - // stage 0 - // stage 1 - // stage 2 - x = _mm_mullo_epi32(in[0], cospi62); - v[0] = _mm_add_epi32(x, rnding); - v[0] = _mm_srai_epi32(v[0], bit); - - x = _mm_mullo_epi32(in[0], cospi2); - v[1] = _mm_sub_epi32(zero, x); - v[1] = _mm_add_epi32(v[1], rnding); - v[1] = _mm_srai_epi32(v[1], bit); + x = _mm_mullo_epi32(in[0], cospi2); + v[1] = _mm_sub_epi32(zero, x); + v[1] = _mm_add_epi32(v[1], rnding); + v[1] = _mm_srai_epi32(v[1], bit); - // stage 3 - v[8] = v[0]; - v[9] = v[1]; + // stage 3 + v[8] = v[0]; + v[9] = v[1]; - // stage 4 - temp1 = _mm_mullo_epi32(v[8], cospi8); - x = _mm_mullo_epi32(v[9], cospi56); - temp1 = _mm_add_epi32(temp1, x); - temp1 = _mm_add_epi32(temp1, rnding); - temp1 = _mm_srai_epi32(temp1, bit); - - temp2 = _mm_mullo_epi32(v[8], cospi56); - x = _mm_mullo_epi32(v[9], cospi8); - temp2 = _mm_sub_epi32(temp2, x); - temp2 = _mm_add_epi32(temp2, rnding); - temp2 = _mm_srai_epi32(temp2, bit); - v[8] = temp1; - v[9] = temp2; + // stage 4 + temp1 = _mm_mullo_epi32(v[8], cospi8); + x = _mm_mullo_epi32(v[9], cospi56); + temp1 = _mm_add_epi32(temp1, x); + temp1 = _mm_add_epi32(temp1, rnding); + temp1 = _mm_srai_epi32(temp1, bit); - // stage 5 - v[4] = v[0]; - v[5] = v[1]; - v[12] = v[8]; - v[13] = v[9]; + temp2 = _mm_mullo_epi32(v[8], cospi56); + x = _mm_mullo_epi32(v[9], cospi8); + temp2 = _mm_sub_epi32(temp2, x); + temp2 = _mm_add_epi32(temp2, rnding); + temp2 = _mm_srai_epi32(temp2, bit); + v[8] = temp1; + v[9] = temp2; - // stage 6 - temp1 = _mm_mullo_epi32(v[4], cospi16); - x = _mm_mullo_epi32(v[5], cospi48); - temp1 = _mm_add_epi32(temp1, x); - temp1 = _mm_add_epi32(temp1, rnding); - temp1 = _mm_srai_epi32(temp1, bit); - - temp2 = _mm_mullo_epi32(v[4], cospi48); - x = _mm_mullo_epi32(v[5], cospi16); - temp2 = _mm_sub_epi32(temp2, x); - temp2 = _mm_add_epi32(temp2, rnding); - temp2 = 
_mm_srai_epi32(temp2, bit); - v[4] = temp1; - v[5] = temp2; - - temp1 = _mm_mullo_epi32(v[12], cospi16); - x = _mm_mullo_epi32(v[13], cospi48); - temp1 = _mm_add_epi32(temp1, x); - temp1 = _mm_add_epi32(temp1, rnding); - temp1 = _mm_srai_epi32(temp1, bit); - - temp2 = _mm_mullo_epi32(v[12], cospi48); - x = _mm_mullo_epi32(v[13], cospi16); - temp2 = _mm_sub_epi32(temp2, x); - temp2 = _mm_add_epi32(temp2, rnding); - temp2 = _mm_srai_epi32(temp2, bit); - v[12] = temp1; - v[13] = temp2; + // stage 5 + v[4] = v[0]; + v[5] = v[1]; + v[12] = v[8]; + v[13] = v[9]; - // stage 7 - v[2] = v[0]; - v[3] = v[1]; - v[6] = v[4]; - v[7] = v[5]; - v[10] = v[8]; - v[11] = v[9]; - v[14] = v[12]; - v[15] = v[13]; + // stage 6 + temp1 = _mm_mullo_epi32(v[4], cospi16); + x = _mm_mullo_epi32(v[5], cospi48); + temp1 = _mm_add_epi32(temp1, x); + temp1 = _mm_add_epi32(temp1, rnding); + temp1 = _mm_srai_epi32(temp1, bit); - // stage 8 - y = _mm_mullo_epi32(v[2], cospi32); - x = _mm_mullo_epi32(v[3], cospi32); - v[2] = _mm_add_epi32(y, x); - v[2] = _mm_add_epi32(v[2], rnding); - v[2] = _mm_srai_epi32(v[2], bit); + temp2 = _mm_mullo_epi32(v[4], cospi48); + x = _mm_mullo_epi32(v[5], cospi16); + temp2 = _mm_sub_epi32(temp2, x); + temp2 = _mm_add_epi32(temp2, rnding); + temp2 = _mm_srai_epi32(temp2, bit); + v[4] = temp1; + v[5] = temp2; + + temp1 = _mm_mullo_epi32(v[12], cospi16); + x = _mm_mullo_epi32(v[13], cospi48); + temp1 = _mm_add_epi32(temp1, x); + temp1 = _mm_add_epi32(temp1, rnding); + temp1 = _mm_srai_epi32(temp1, bit); - v[3] = _mm_sub_epi32(y, x); - v[3] = _mm_add_epi32(v[3], rnding); - v[3] = _mm_srai_epi32(v[3], bit); + temp2 = _mm_mullo_epi32(v[12], cospi48); + x = _mm_mullo_epi32(v[13], cospi16); + temp2 = _mm_sub_epi32(temp2, x); + temp2 = _mm_add_epi32(temp2, rnding); + temp2 = _mm_srai_epi32(temp2, bit); + v[12] = temp1; + v[13] = temp2; - y = _mm_mullo_epi32(v[6], cospi32); - x = _mm_mullo_epi32(v[7], cospi32); - v[6] = _mm_add_epi32(y, x); - v[6] = _mm_add_epi32(v[6], rnding); 
- v[6] = _mm_srai_epi32(v[6], bit); - - v[7] = _mm_sub_epi32(y, x); - v[7] = _mm_add_epi32(v[7], rnding); - v[7] = _mm_srai_epi32(v[7], bit); - - y = _mm_mullo_epi32(v[10], cospi32); - x = _mm_mullo_epi32(v[11], cospi32); - v[10] = _mm_add_epi32(y, x); - v[10] = _mm_add_epi32(v[10], rnding); - v[10] = _mm_srai_epi32(v[10], bit); + // stage 7 + v[2] = v[0]; + v[3] = v[1]; + v[6] = v[4]; + v[7] = v[5]; + v[10] = v[8]; + v[11] = v[9]; + v[14] = v[12]; + v[15] = v[13]; - v[11] = _mm_sub_epi32(y, x); - v[11] = _mm_add_epi32(v[11], rnding); - v[11] = _mm_srai_epi32(v[11], bit); + // stage 8 + y = _mm_mullo_epi32(v[2], cospi32); + x = _mm_mullo_epi32(v[3], cospi32); + v[2] = _mm_add_epi32(y, x); + v[2] = _mm_add_epi32(v[2], rnding); + v[2] = _mm_srai_epi32(v[2], bit); - y = _mm_mullo_epi32(v[14], cospi32); - x = _mm_mullo_epi32(v[15], cospi32); - v[14] = _mm_add_epi32(y, x); - v[14] = _mm_add_epi32(v[14], rnding); - v[14] = _mm_srai_epi32(v[14], bit); + v[3] = _mm_sub_epi32(y, x); + v[3] = _mm_add_epi32(v[3], rnding); + v[3] = _mm_srai_epi32(v[3], bit); - v[15] = _mm_sub_epi32(y, x); - v[15] = _mm_add_epi32(v[15], rnding); - v[15] = _mm_srai_epi32(v[15], bit); + y = _mm_mullo_epi32(v[6], cospi32); + x = _mm_mullo_epi32(v[7], cospi32); + v[6] = _mm_add_epi32(y, x); + v[6] = _mm_add_epi32(v[6], rnding); + v[6] = _mm_srai_epi32(v[6], bit); + + v[7] = _mm_sub_epi32(y, x); + v[7] = _mm_add_epi32(v[7], rnding); + v[7] = _mm_srai_epi32(v[7], bit); + + y = _mm_mullo_epi32(v[10], cospi32); + x = _mm_mullo_epi32(v[11], cospi32); + v[10] = _mm_add_epi32(y, x); + v[10] = _mm_add_epi32(v[10], rnding); + v[10] = _mm_srai_epi32(v[10], bit); + + v[11] = _mm_sub_epi32(y, x); + v[11] = _mm_add_epi32(v[11], rnding); + v[11] = _mm_srai_epi32(v[11], bit); + + y = _mm_mullo_epi32(v[14], cospi32); + x = _mm_mullo_epi32(v[15], cospi32); + v[14] = _mm_add_epi32(y, x); + v[14] = _mm_add_epi32(v[14], rnding); + v[14] = _mm_srai_epi32(v[14], bit); + + v[15] = _mm_sub_epi32(y, x); + v[15] = 
_mm_add_epi32(v[15], rnding); + v[15] = _mm_srai_epi32(v[15], bit); - // stage 9 - if (do_cols) { - out[0] = v[0]; - out[1] = _mm_sub_epi32(_mm_setzero_si128(), v[8]); - out[2] = v[12]; - out[3] = _mm_sub_epi32(_mm_setzero_si128(), v[4]); - out[4] = v[6]; - out[5] = _mm_sub_epi32(_mm_setzero_si128(), v[14]); - out[6] = v[10]; - out[7] = _mm_sub_epi32(_mm_setzero_si128(), v[2]); - out[8] = v[3]; - out[9] = _mm_sub_epi32(_mm_setzero_si128(), v[11]); - out[10] = v[15]; - out[11] = _mm_sub_epi32(_mm_setzero_si128(), v[7]); - out[12] = v[5]; - out[13] = _mm_sub_epi32(_mm_setzero_si128(), v[13]); - out[14] = v[9]; - out[15] = _mm_sub_epi32(_mm_setzero_si128(), v[1]); - } else { - const int log_range_out = AOMMAX(16, bd + 6); - const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); - const __m128i clamp_hi_out = - _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + // stage 9 + if (do_cols) { + out[0] = v[0]; + out[1] = _mm_sub_epi32(zero, v[8]); + out[2] = v[12]; + out[3] = _mm_sub_epi32(zero, v[4]); + out[4] = v[6]; + out[5] = _mm_sub_epi32(zero, v[14]); + out[6] = v[10]; + out[7] = _mm_sub_epi32(zero, v[2]); + out[8] = v[3]; + out[9] = _mm_sub_epi32(zero, v[11]); + out[10] = v[15]; + out[11] = _mm_sub_epi32(zero, v[7]); + out[12] = v[5]; + out[13] = _mm_sub_epi32(zero, v[13]); + out[14] = v[9]; + out[15] = _mm_sub_epi32(zero, v[1]); + } else { + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); + const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); - neg_shift_sse4_1(v[0], v[8], out + 0, out + 1, &clamp_lo_out, - &clamp_hi_out, out_shift); - neg_shift_sse4_1(v[12], v[4], out + 2, out + 3, &clamp_lo_out, - &clamp_hi_out, out_shift); - neg_shift_sse4_1(v[6], v[14], out + 4, out + 5, &clamp_lo_out, - &clamp_hi_out, out_shift); - neg_shift_sse4_1(v[10], v[2], out + 6, out + 7, &clamp_lo_out, - &clamp_hi_out, out_shift); - neg_shift_sse4_1(v[3], v[11], out + 8, 
out + 9, &clamp_lo_out, - &clamp_hi_out, out_shift); - neg_shift_sse4_1(v[15], v[7], out + 10, out + 11, &clamp_lo_out, - &clamp_hi_out, out_shift); - neg_shift_sse4_1(v[5], v[13], out + 12, out + 13, &clamp_lo_out, - &clamp_hi_out, out_shift); - neg_shift_sse4_1(v[9], v[1], out + 14, out + 15, &clamp_lo_out, - &clamp_hi_out, out_shift); - } + neg_shift_sse4_1(v[0], v[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_sse4_1(v[12], v[4], out + 2, out + 3, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(v[6], v[14], out + 4, out + 5, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(v[10], v[2], out + 6, out + 7, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(v[3], v[11], out + 8, out + 9, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(v[15], v[7], out + 10, out + 11, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(v[5], v[13], out + 12, out + 13, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(v[9], v[1], out + 14, out + 15, &clamp_lo_out, + &clamp_hi_out, out_shift); } } @@ -2107,291 +2247,287 @@ static void iadst16x16_low8_sse4_1(__m128i *in, __m128i *out, int bit, const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + __m128i zero = _mm_setzero_si128(); __m128i u[16], x, y; - // Calculate the column 0, 1, 2, 3 - { - // stage 0 - // stage 1 - // stage 2 - __m128i zero = _mm_setzero_si128(); - x = _mm_mullo_epi32(in[0], cospi62); - u[0] = _mm_add_epi32(x, rnding); - u[0] = _mm_srai_epi32(u[0], bit); - - x = _mm_mullo_epi32(in[0], cospi2); - u[1] = _mm_sub_epi32(zero, x); - u[1] = _mm_add_epi32(u[1], rnding); - u[1] = _mm_srai_epi32(u[1], bit); - - x = _mm_mullo_epi32(in[2], cospi54); - u[2] = _mm_add_epi32(x, rnding); - u[2] = _mm_srai_epi32(u[2], bit); - - x = _mm_mullo_epi32(in[2], cospi10); - u[3] = _mm_sub_epi32(zero, x); - u[3] = _mm_add_epi32(u[3], rnding); - u[3] = _mm_srai_epi32(u[3], bit); - - x = _mm_mullo_epi32(in[4], cospi46); - u[4] = _mm_add_epi32(x, rnding); - u[4] = _mm_srai_epi32(u[4], bit); - - x = _mm_mullo_epi32(in[4], cospi18); - u[5] = _mm_sub_epi32(zero, x); - u[5] = _mm_add_epi32(u[5], rnding); - u[5] = _mm_srai_epi32(u[5], bit); + // stage 0 + // stage 1 + // stage 2 + x = _mm_mullo_epi32(in[0], cospi62); + u[0] = _mm_add_epi32(x, rnding); + u[0] = _mm_srai_epi32(u[0], bit); - x = _mm_mullo_epi32(in[6], cospi38); - u[6] = _mm_add_epi32(x, rnding); - u[6] = _mm_srai_epi32(u[6], bit); + x = _mm_mullo_epi32(in[0], cospi2); + u[1] = _mm_sub_epi32(zero, x); + u[1] = _mm_add_epi32(u[1], rnding); + u[1] = _mm_srai_epi32(u[1], bit); - x = _mm_mullo_epi32(in[6], cospi26); - u[7] = _mm_sub_epi32(zero, x); - u[7] = _mm_add_epi32(u[7], rnding); - u[7] = _mm_srai_epi32(u[7], bit); + x = _mm_mullo_epi32(in[2], cospi54); + u[2] = _mm_add_epi32(x, rnding); + u[2] = _mm_srai_epi32(u[2], bit); - u[8] = _mm_mullo_epi32(in[7], cospi34); - u[8] = _mm_add_epi32(u[8], rnding); - u[8] = _mm_srai_epi32(u[8], bit); + x = _mm_mullo_epi32(in[2], cospi10); + u[3] = _mm_sub_epi32(zero, x); + u[3] = _mm_add_epi32(u[3], rnding); + u[3] = 
_mm_srai_epi32(u[3], bit); - u[9] = _mm_mullo_epi32(in[7], cospi30); - u[9] = _mm_add_epi32(u[9], rnding); - u[9] = _mm_srai_epi32(u[9], bit); + x = _mm_mullo_epi32(in[4], cospi46); + u[4] = _mm_add_epi32(x, rnding); + u[4] = _mm_srai_epi32(u[4], bit); - u[10] = _mm_mullo_epi32(in[5], cospi42); - u[10] = _mm_add_epi32(u[10], rnding); - u[10] = _mm_srai_epi32(u[10], bit); + x = _mm_mullo_epi32(in[4], cospi18); + u[5] = _mm_sub_epi32(zero, x); + u[5] = _mm_add_epi32(u[5], rnding); + u[5] = _mm_srai_epi32(u[5], bit); - u[11] = _mm_mullo_epi32(in[5], cospi22); - u[11] = _mm_add_epi32(u[11], rnding); - u[11] = _mm_srai_epi32(u[11], bit); + x = _mm_mullo_epi32(in[6], cospi38); + u[6] = _mm_add_epi32(x, rnding); + u[6] = _mm_srai_epi32(u[6], bit); - u[12] = _mm_mullo_epi32(in[3], cospi50); - u[12] = _mm_add_epi32(u[12], rnding); - u[12] = _mm_srai_epi32(u[12], bit); + x = _mm_mullo_epi32(in[6], cospi26); + u[7] = _mm_sub_epi32(zero, x); + u[7] = _mm_add_epi32(u[7], rnding); + u[7] = _mm_srai_epi32(u[7], bit); - u[13] = _mm_mullo_epi32(in[3], cospi14); - u[13] = _mm_add_epi32(u[13], rnding); - u[13] = _mm_srai_epi32(u[13], bit); + u[8] = _mm_mullo_epi32(in[7], cospi34); + u[8] = _mm_add_epi32(u[8], rnding); + u[8] = _mm_srai_epi32(u[8], bit); - u[14] = _mm_mullo_epi32(in[1], cospi58); - u[14] = _mm_add_epi32(u[14], rnding); - u[14] = _mm_srai_epi32(u[14], bit); + u[9] = _mm_mullo_epi32(in[7], cospi30); + u[9] = _mm_add_epi32(u[9], rnding); + u[9] = _mm_srai_epi32(u[9], bit); - u[15] = _mm_mullo_epi32(in[1], cospi6); - u[15] = _mm_add_epi32(u[15], rnding); - u[15] = _mm_srai_epi32(u[15], bit); + u[10] = _mm_mullo_epi32(in[5], cospi42); + u[10] = _mm_add_epi32(u[10], rnding); + u[10] = _mm_srai_epi32(u[10], bit); - // stage 3 - addsub_sse4_1(u[0], u[8], &u[0], &u[8], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[1], u[9], &u[1], &u[9], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[2], u[10], &u[2], &u[10], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[3], u[11], &u[3], &u[11], &clamp_lo, 
&clamp_hi); - addsub_sse4_1(u[4], u[12], &u[4], &u[12], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[5], u[13], &u[5], &u[13], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[6], u[14], &u[6], &u[14], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[7], u[15], &u[7], &u[15], &clamp_lo, &clamp_hi); + u[11] = _mm_mullo_epi32(in[5], cospi22); + u[11] = _mm_add_epi32(u[11], rnding); + u[11] = _mm_srai_epi32(u[11], bit); - // stage 4 - y = _mm_mullo_epi32(u[8], cospi56); - x = _mm_mullo_epi32(u[9], cospi56); - u[8] = _mm_mullo_epi32(u[8], cospi8); - u[8] = _mm_add_epi32(u[8], x); - u[8] = _mm_add_epi32(u[8], rnding); - u[8] = _mm_srai_epi32(u[8], bit); - - x = _mm_mullo_epi32(u[9], cospi8); - u[9] = _mm_sub_epi32(y, x); - u[9] = _mm_add_epi32(u[9], rnding); - u[9] = _mm_srai_epi32(u[9], bit); - - x = _mm_mullo_epi32(u[11], cospi24); - y = _mm_mullo_epi32(u[10], cospi24); - u[10] = _mm_mullo_epi32(u[10], cospi40); - u[10] = _mm_add_epi32(u[10], x); - u[10] = _mm_add_epi32(u[10], rnding); - u[10] = _mm_srai_epi32(u[10], bit); - - x = _mm_mullo_epi32(u[11], cospi40); - u[11] = _mm_sub_epi32(y, x); - u[11] = _mm_add_epi32(u[11], rnding); - u[11] = _mm_srai_epi32(u[11], bit); - - x = _mm_mullo_epi32(u[13], cospi8); - y = _mm_mullo_epi32(u[12], cospi8); - u[12] = _mm_mullo_epi32(u[12], cospim56); - u[12] = _mm_add_epi32(u[12], x); - u[12] = _mm_add_epi32(u[12], rnding); - u[12] = _mm_srai_epi32(u[12], bit); - - x = _mm_mullo_epi32(u[13], cospim56); - u[13] = _mm_sub_epi32(y, x); - u[13] = _mm_add_epi32(u[13], rnding); - u[13] = _mm_srai_epi32(u[13], bit); - - x = _mm_mullo_epi32(u[15], cospi40); - y = _mm_mullo_epi32(u[14], cospi40); - u[14] = _mm_mullo_epi32(u[14], cospim24); - u[14] = _mm_add_epi32(u[14], x); - u[14] = _mm_add_epi32(u[14], rnding); - u[14] = _mm_srai_epi32(u[14], bit); - - x = _mm_mullo_epi32(u[15], cospim24); - u[15] = _mm_sub_epi32(y, x); - u[15] = _mm_add_epi32(u[15], rnding); - u[15] = _mm_srai_epi32(u[15], bit); + u[12] = _mm_mullo_epi32(in[3], cospi50); + u[12] = 
_mm_add_epi32(u[12], rnding); + u[12] = _mm_srai_epi32(u[12], bit); - // stage 5 - addsub_sse4_1(u[0], u[4], &u[0], &u[4], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[1], u[5], &u[1], &u[5], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[2], u[6], &u[2], &u[6], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[3], u[7], &u[3], &u[7], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[8], u[12], &u[8], &u[12], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[9], u[13], &u[9], &u[13], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[10], u[14], &u[10], &u[14], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[11], u[15], &u[11], &u[15], &clamp_lo, &clamp_hi); + u[13] = _mm_mullo_epi32(in[3], cospi14); + u[13] = _mm_add_epi32(u[13], rnding); + u[13] = _mm_srai_epi32(u[13], bit); - // stage 6 - x = _mm_mullo_epi32(u[5], cospi48); - y = _mm_mullo_epi32(u[4], cospi48); - u[4] = _mm_mullo_epi32(u[4], cospi16); - u[4] = _mm_add_epi32(u[4], x); - u[4] = _mm_add_epi32(u[4], rnding); - u[4] = _mm_srai_epi32(u[4], bit); - - x = _mm_mullo_epi32(u[5], cospi16); - u[5] = _mm_sub_epi32(y, x); - u[5] = _mm_add_epi32(u[5], rnding); - u[5] = _mm_srai_epi32(u[5], bit); + u[14] = _mm_mullo_epi32(in[1], cospi58); + u[14] = _mm_add_epi32(u[14], rnding); + u[14] = _mm_srai_epi32(u[14], bit); - x = _mm_mullo_epi32(u[7], cospi16); - y = _mm_mullo_epi32(u[6], cospi16); - u[6] = _mm_mullo_epi32(u[6], cospim48); - u[6] = _mm_add_epi32(u[6], x); - u[6] = _mm_add_epi32(u[6], rnding); - u[6] = _mm_srai_epi32(u[6], bit); + u[15] = _mm_mullo_epi32(in[1], cospi6); + u[15] = _mm_add_epi32(u[15], rnding); + u[15] = _mm_srai_epi32(u[15], bit); - x = _mm_mullo_epi32(u[7], cospim48); - u[7] = _mm_sub_epi32(y, x); - u[7] = _mm_add_epi32(u[7], rnding); - u[7] = _mm_srai_epi32(u[7], bit); - - x = _mm_mullo_epi32(u[13], cospi48); - y = _mm_mullo_epi32(u[12], cospi48); - u[12] = _mm_mullo_epi32(u[12], cospi16); - u[12] = _mm_add_epi32(u[12], x); - u[12] = _mm_add_epi32(u[12], rnding); - u[12] = _mm_srai_epi32(u[12], bit); - - x = _mm_mullo_epi32(u[13], 
cospi16); - u[13] = _mm_sub_epi32(y, x); - u[13] = _mm_add_epi32(u[13], rnding); - u[13] = _mm_srai_epi32(u[13], bit); - - x = _mm_mullo_epi32(u[15], cospi16); - y = _mm_mullo_epi32(u[14], cospi16); - u[14] = _mm_mullo_epi32(u[14], cospim48); - u[14] = _mm_add_epi32(u[14], x); - u[14] = _mm_add_epi32(u[14], rnding); - u[14] = _mm_srai_epi32(u[14], bit); - - x = _mm_mullo_epi32(u[15], cospim48); - u[15] = _mm_sub_epi32(y, x); - u[15] = _mm_add_epi32(u[15], rnding); - u[15] = _mm_srai_epi32(u[15], bit); + // stage 3 + addsub_sse4_1(u[0], u[8], &u[0], &u[8], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[1], u[9], &u[1], &u[9], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[2], u[10], &u[2], &u[10], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[3], u[11], &u[3], &u[11], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[4], u[12], &u[4], &u[12], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[5], u[13], &u[5], &u[13], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[6], u[14], &u[6], &u[14], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[7], u[15], &u[7], &u[15], &clamp_lo, &clamp_hi); - // stage 7 - addsub_sse4_1(u[0], u[2], &u[0], &u[2], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[1], u[3], &u[1], &u[3], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[4], u[6], &u[4], &u[6], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[5], u[7], &u[5], &u[7], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[8], u[10], &u[8], &u[10], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[9], u[11], &u[9], &u[11], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[12], u[14], &u[12], &u[14], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[13], u[15], &u[13], &u[15], &clamp_lo, &clamp_hi); + // stage 4 + y = _mm_mullo_epi32(u[8], cospi56); + x = _mm_mullo_epi32(u[9], cospi56); + u[8] = _mm_mullo_epi32(u[8], cospi8); + u[8] = _mm_add_epi32(u[8], x); + u[8] = _mm_add_epi32(u[8], rnding); + u[8] = _mm_srai_epi32(u[8], bit); + + x = _mm_mullo_epi32(u[9], cospi8); + u[9] = _mm_sub_epi32(y, x); + u[9] = _mm_add_epi32(u[9], rnding); + u[9] = _mm_srai_epi32(u[9], bit); + + x = 
_mm_mullo_epi32(u[11], cospi24); + y = _mm_mullo_epi32(u[10], cospi24); + u[10] = _mm_mullo_epi32(u[10], cospi40); + u[10] = _mm_add_epi32(u[10], x); + u[10] = _mm_add_epi32(u[10], rnding); + u[10] = _mm_srai_epi32(u[10], bit); + + x = _mm_mullo_epi32(u[11], cospi40); + u[11] = _mm_sub_epi32(y, x); + u[11] = _mm_add_epi32(u[11], rnding); + u[11] = _mm_srai_epi32(u[11], bit); + + x = _mm_mullo_epi32(u[13], cospi8); + y = _mm_mullo_epi32(u[12], cospi8); + u[12] = _mm_mullo_epi32(u[12], cospim56); + u[12] = _mm_add_epi32(u[12], x); + u[12] = _mm_add_epi32(u[12], rnding); + u[12] = _mm_srai_epi32(u[12], bit); + + x = _mm_mullo_epi32(u[13], cospim56); + u[13] = _mm_sub_epi32(y, x); + u[13] = _mm_add_epi32(u[13], rnding); + u[13] = _mm_srai_epi32(u[13], bit); + + x = _mm_mullo_epi32(u[15], cospi40); + y = _mm_mullo_epi32(u[14], cospi40); + u[14] = _mm_mullo_epi32(u[14], cospim24); + u[14] = _mm_add_epi32(u[14], x); + u[14] = _mm_add_epi32(u[14], rnding); + u[14] = _mm_srai_epi32(u[14], bit); + + x = _mm_mullo_epi32(u[15], cospim24); + u[15] = _mm_sub_epi32(y, x); + u[15] = _mm_add_epi32(u[15], rnding); + u[15] = _mm_srai_epi32(u[15], bit); - // stage 8 - y = _mm_mullo_epi32(u[2], cospi32); - x = _mm_mullo_epi32(u[3], cospi32); - u[2] = _mm_add_epi32(y, x); - u[2] = _mm_add_epi32(u[2], rnding); - u[2] = _mm_srai_epi32(u[2], bit); - - u[3] = _mm_sub_epi32(y, x); - u[3] = _mm_add_epi32(u[3], rnding); - u[3] = _mm_srai_epi32(u[3], bit); - y = _mm_mullo_epi32(u[6], cospi32); - x = _mm_mullo_epi32(u[7], cospi32); - u[6] = _mm_add_epi32(y, x); - u[6] = _mm_add_epi32(u[6], rnding); - u[6] = _mm_srai_epi32(u[6], bit); + // stage 5 + addsub_sse4_1(u[0], u[4], &u[0], &u[4], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[1], u[5], &u[1], &u[5], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[2], u[6], &u[2], &u[6], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[3], u[7], &u[3], &u[7], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[8], u[12], &u[8], &u[12], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[9], u[13], 
&u[9], &u[13], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[10], u[14], &u[10], &u[14], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[11], u[15], &u[11], &u[15], &clamp_lo, &clamp_hi); - u[7] = _mm_sub_epi32(y, x); - u[7] = _mm_add_epi32(u[7], rnding); - u[7] = _mm_srai_epi32(u[7], bit); + // stage 6 + x = _mm_mullo_epi32(u[5], cospi48); + y = _mm_mullo_epi32(u[4], cospi48); + u[4] = _mm_mullo_epi32(u[4], cospi16); + u[4] = _mm_add_epi32(u[4], x); + u[4] = _mm_add_epi32(u[4], rnding); + u[4] = _mm_srai_epi32(u[4], bit); - y = _mm_mullo_epi32(u[10], cospi32); - x = _mm_mullo_epi32(u[11], cospi32); - u[10] = _mm_add_epi32(y, x); - u[10] = _mm_add_epi32(u[10], rnding); - u[10] = _mm_srai_epi32(u[10], bit); + x = _mm_mullo_epi32(u[5], cospi16); + u[5] = _mm_sub_epi32(y, x); + u[5] = _mm_add_epi32(u[5], rnding); + u[5] = _mm_srai_epi32(u[5], bit); + + x = _mm_mullo_epi32(u[7], cospi16); + y = _mm_mullo_epi32(u[6], cospi16); + u[6] = _mm_mullo_epi32(u[6], cospim48); + u[6] = _mm_add_epi32(u[6], x); + u[6] = _mm_add_epi32(u[6], rnding); + u[6] = _mm_srai_epi32(u[6], bit); - u[11] = _mm_sub_epi32(y, x); - u[11] = _mm_add_epi32(u[11], rnding); - u[11] = _mm_srai_epi32(u[11], bit); + x = _mm_mullo_epi32(u[7], cospim48); + u[7] = _mm_sub_epi32(y, x); + u[7] = _mm_add_epi32(u[7], rnding); + u[7] = _mm_srai_epi32(u[7], bit); - y = _mm_mullo_epi32(u[14], cospi32); - x = _mm_mullo_epi32(u[15], cospi32); - u[14] = _mm_add_epi32(y, x); - u[14] = _mm_add_epi32(u[14], rnding); - u[14] = _mm_srai_epi32(u[14], bit); + x = _mm_mullo_epi32(u[13], cospi48); + y = _mm_mullo_epi32(u[12], cospi48); + u[12] = _mm_mullo_epi32(u[12], cospi16); + u[12] = _mm_add_epi32(u[12], x); + u[12] = _mm_add_epi32(u[12], rnding); + u[12] = _mm_srai_epi32(u[12], bit); + + x = _mm_mullo_epi32(u[13], cospi16); + u[13] = _mm_sub_epi32(y, x); + u[13] = _mm_add_epi32(u[13], rnding); + u[13] = _mm_srai_epi32(u[13], bit); + + x = _mm_mullo_epi32(u[15], cospi16); + y = _mm_mullo_epi32(u[14], cospi16); + u[14] = 
_mm_mullo_epi32(u[14], cospim48); + u[14] = _mm_add_epi32(u[14], x); + u[14] = _mm_add_epi32(u[14], rnding); + u[14] = _mm_srai_epi32(u[14], bit); + + x = _mm_mullo_epi32(u[15], cospim48); + u[15] = _mm_sub_epi32(y, x); + u[15] = _mm_add_epi32(u[15], rnding); + u[15] = _mm_srai_epi32(u[15], bit); - u[15] = _mm_sub_epi32(y, x); - u[15] = _mm_add_epi32(u[15], rnding); - u[15] = _mm_srai_epi32(u[15], bit); + // stage 7 + addsub_sse4_1(u[0], u[2], &u[0], &u[2], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[1], u[3], &u[1], &u[3], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[4], u[6], &u[4], &u[6], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[5], u[7], &u[5], &u[7], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[8], u[10], &u[8], &u[10], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[9], u[11], &u[9], &u[11], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[12], u[14], &u[12], &u[14], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[13], u[15], &u[13], &u[15], &clamp_lo, &clamp_hi); - // stage 9 - if (do_cols) { - out[0] = u[0]; - out[1] = _mm_sub_epi32(_mm_setzero_si128(), u[8]); - out[2] = u[12]; - out[3] = _mm_sub_epi32(_mm_setzero_si128(), u[4]); - out[4] = u[6]; - out[5] = _mm_sub_epi32(_mm_setzero_si128(), u[14]); - out[6] = u[10]; - out[7] = _mm_sub_epi32(_mm_setzero_si128(), u[2]); - out[8] = u[3]; - out[9] = _mm_sub_epi32(_mm_setzero_si128(), u[11]); - out[10] = u[15]; - out[11] = _mm_sub_epi32(_mm_setzero_si128(), u[7]); - out[12] = u[5]; - out[13] = _mm_sub_epi32(_mm_setzero_si128(), u[13]); - out[14] = u[9]; - out[15] = _mm_sub_epi32(_mm_setzero_si128(), u[1]); - } else { - const int log_range_out = AOMMAX(16, bd + 6); - const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); - const __m128i clamp_hi_out = - _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + // stage 8 + y = _mm_mullo_epi32(u[2], cospi32); + x = _mm_mullo_epi32(u[3], cospi32); + u[2] = _mm_add_epi32(y, x); + u[2] = _mm_add_epi32(u[2], rnding); + u[2] = _mm_srai_epi32(u[2], bit); - neg_shift_sse4_1(u[0], u[8], out + 
0, out + 1, &clamp_lo_out, - &clamp_hi_out, out_shift); - neg_shift_sse4_1(u[12], u[4], out + 2, out + 3, &clamp_lo_out, - &clamp_hi_out, out_shift); - neg_shift_sse4_1(u[6], u[14], out + 4, out + 5, &clamp_lo_out, - &clamp_hi_out, out_shift); - neg_shift_sse4_1(u[10], u[2], out + 6, out + 7, &clamp_lo_out, - &clamp_hi_out, out_shift); - neg_shift_sse4_1(u[3], u[11], out + 8, out + 9, &clamp_lo_out, - &clamp_hi_out, out_shift); - neg_shift_sse4_1(u[15], u[7], out + 10, out + 11, &clamp_lo_out, - &clamp_hi_out, out_shift); - neg_shift_sse4_1(u[5], u[13], out + 12, out + 13, &clamp_lo_out, - &clamp_hi_out, out_shift); - neg_shift_sse4_1(u[9], u[1], out + 14, out + 15, &clamp_lo_out, - &clamp_hi_out, out_shift); - } + u[3] = _mm_sub_epi32(y, x); + u[3] = _mm_add_epi32(u[3], rnding); + u[3] = _mm_srai_epi32(u[3], bit); + y = _mm_mullo_epi32(u[6], cospi32); + x = _mm_mullo_epi32(u[7], cospi32); + u[6] = _mm_add_epi32(y, x); + u[6] = _mm_add_epi32(u[6], rnding); + u[6] = _mm_srai_epi32(u[6], bit); + + u[7] = _mm_sub_epi32(y, x); + u[7] = _mm_add_epi32(u[7], rnding); + u[7] = _mm_srai_epi32(u[7], bit); + + y = _mm_mullo_epi32(u[10], cospi32); + x = _mm_mullo_epi32(u[11], cospi32); + u[10] = _mm_add_epi32(y, x); + u[10] = _mm_add_epi32(u[10], rnding); + u[10] = _mm_srai_epi32(u[10], bit); + + u[11] = _mm_sub_epi32(y, x); + u[11] = _mm_add_epi32(u[11], rnding); + u[11] = _mm_srai_epi32(u[11], bit); + + y = _mm_mullo_epi32(u[14], cospi32); + x = _mm_mullo_epi32(u[15], cospi32); + u[14] = _mm_add_epi32(y, x); + u[14] = _mm_add_epi32(u[14], rnding); + u[14] = _mm_srai_epi32(u[14], bit); + + u[15] = _mm_sub_epi32(y, x); + u[15] = _mm_add_epi32(u[15], rnding); + u[15] = _mm_srai_epi32(u[15], bit); + + // stage 9 + if (do_cols) { + out[0] = u[0]; + out[1] = _mm_sub_epi32(zero, u[8]); + out[2] = u[12]; + out[3] = _mm_sub_epi32(zero, u[4]); + out[4] = u[6]; + out[5] = _mm_sub_epi32(zero, u[14]); + out[6] = u[10]; + out[7] = _mm_sub_epi32(zero, u[2]); + out[8] = u[3]; + out[9] = 
_mm_sub_epi32(zero, u[11]); + out[10] = u[15]; + out[11] = _mm_sub_epi32(zero, u[7]); + out[12] = u[5]; + out[13] = _mm_sub_epi32(zero, u[13]); + out[14] = u[9]; + out[15] = _mm_sub_epi32(zero, u[1]); + } else { + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); + const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + + neg_shift_sse4_1(u[0], u[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_sse4_1(u[12], u[4], out + 2, out + 3, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(u[6], u[14], out + 4, out + 5, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(u[10], u[2], out + 6, out + 7, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(u[3], u[11], out + 8, out + 9, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(u[15], u[7], out + 10, out + 11, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(u[5], u[13], out + 12, out + 13, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(u[9], u[1], out + 14, out + 15, &clamp_lo_out, + &clamp_hi_out, out_shift); } } @@ -2557,38 +2693,22 @@ static void idct16x16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, v[15] = u[15]; // stage 7 - if (do_cols) { - addsub_no_clamp_sse4_1(v[0], v[15], out + 0, out + 15); - addsub_no_clamp_sse4_1(v[1], v[14], out + 1, out + 14); - addsub_no_clamp_sse4_1(v[2], v[13], out + 2, out + 13); - addsub_no_clamp_sse4_1(v[3], v[12], out + 3, out + 12); - addsub_no_clamp_sse4_1(v[4], v[11], out + 4, out + 11); - addsub_no_clamp_sse4_1(v[5], v[10], out + 5, out + 10); - addsub_no_clamp_sse4_1(v[6], v[9], out + 6, out + 9); - addsub_no_clamp_sse4_1(v[7], v[8], out + 7, out + 8); - } else { + addsub_sse4_1(v[0], v[15], out + 0, out + 15, &clamp_lo, &clamp_hi); + addsub_sse4_1(v[1], v[14], out + 1, out + 14, &clamp_lo, &clamp_hi); + addsub_sse4_1(v[2], v[13], out + 2, out + 13, &clamp_lo, 
&clamp_hi); + addsub_sse4_1(v[3], v[12], out + 3, out + 12, &clamp_lo, &clamp_hi); + addsub_sse4_1(v[4], v[11], out + 4, out + 11, &clamp_lo, &clamp_hi); + addsub_sse4_1(v[5], v[10], out + 5, out + 10, &clamp_lo, &clamp_hi); + addsub_sse4_1(v[6], v[9], out + 6, out + 9, &clamp_lo, &clamp_hi); + addsub_sse4_1(v[7], v[8], out + 7, out + 8, &clamp_lo, &clamp_hi); + + if (!do_cols) { const int log_range_out = AOMMAX(16, bd + 6); - const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX( - -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift)))); - const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN( - (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift)))); - - addsub_shift_sse4_1(v[0], v[15], out + 0, out + 15, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(v[1], v[14], out + 1, out + 14, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(v[2], v[13], out + 2, out + 13, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(v[3], v[12], out + 3, out + 12, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(v[4], v[11], out + 4, out + 11, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(v[5], v[10], out + 5, out + 10, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(v[6], v[9], out + 6, out + 9, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(v[7], v[8], out + 7, out + 8, &clamp_lo_out, - &clamp_hi_out, out_shift); + const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); + const __m128i clamp_hi_out = + _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + round_shift_8x8(out, out_shift); + highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 16); } } } @@ -2626,353 +2746,381 @@ static void iadst16x16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + const __m128i zero = _mm_setzero_si128(); __m128i u[16], v[16], x, y; - // Calculate the column 0, 1, 2, 3 - { - // stage 0 - // stage 1 - // stage 2 - v[0] = _mm_mullo_epi32(in[15], cospi2); - x = _mm_mullo_epi32(in[0], cospi62); - v[0] = _mm_add_epi32(v[0], x); - v[0] = _mm_add_epi32(v[0], rnding); - v[0] = _mm_srai_epi32(v[0], bit); - - v[1] = _mm_mullo_epi32(in[15], cospi62); - x = _mm_mullo_epi32(in[0], cospi2); - v[1] = _mm_sub_epi32(v[1], x); - v[1] = _mm_add_epi32(v[1], rnding); - v[1] = _mm_srai_epi32(v[1], bit); - - v[2] = _mm_mullo_epi32(in[13], cospi10); - x = _mm_mullo_epi32(in[2], cospi54); - v[2] = _mm_add_epi32(v[2], x); - v[2] = _mm_add_epi32(v[2], rnding); - v[2] = _mm_srai_epi32(v[2], bit); - - v[3] = _mm_mullo_epi32(in[13], cospi54); - x = _mm_mullo_epi32(in[2], cospi10); - v[3] = _mm_sub_epi32(v[3], x); - v[3] = _mm_add_epi32(v[3], rnding); - v[3] = _mm_srai_epi32(v[3], bit); - - v[4] = _mm_mullo_epi32(in[11], cospi18); - x = _mm_mullo_epi32(in[4], cospi46); - v[4] = _mm_add_epi32(v[4], x); - v[4] = _mm_add_epi32(v[4], rnding); - v[4] = _mm_srai_epi32(v[4], bit); - - v[5] = _mm_mullo_epi32(in[11], cospi46); - x = _mm_mullo_epi32(in[4], cospi18); - v[5] = _mm_sub_epi32(v[5], x); - v[5] = _mm_add_epi32(v[5], rnding); - v[5] = _mm_srai_epi32(v[5], bit); - - v[6] = _mm_mullo_epi32(in[9], cospi26); - x = _mm_mullo_epi32(in[6], cospi38); - v[6] = _mm_add_epi32(v[6], x); - v[6] = _mm_add_epi32(v[6], rnding); - v[6] = _mm_srai_epi32(v[6], bit); - - v[7] = _mm_mullo_epi32(in[9], cospi38); - x = _mm_mullo_epi32(in[6], cospi26); - v[7] = _mm_sub_epi32(v[7], x); - v[7] = _mm_add_epi32(v[7], rnding); - v[7] = _mm_srai_epi32(v[7], bit); - - v[8] = _mm_mullo_epi32(in[7], cospi34); - x = _mm_mullo_epi32(in[8], cospi30); - v[8] = _mm_add_epi32(v[8], x); - v[8] = _mm_add_epi32(v[8], rnding); - v[8] = 
_mm_srai_epi32(v[8], bit); - - v[9] = _mm_mullo_epi32(in[7], cospi30); - x = _mm_mullo_epi32(in[8], cospi34); - v[9] = _mm_sub_epi32(v[9], x); - v[9] = _mm_add_epi32(v[9], rnding); - v[9] = _mm_srai_epi32(v[9], bit); - - v[10] = _mm_mullo_epi32(in[5], cospi42); - x = _mm_mullo_epi32(in[10], cospi22); - v[10] = _mm_add_epi32(v[10], x); - v[10] = _mm_add_epi32(v[10], rnding); - v[10] = _mm_srai_epi32(v[10], bit); - - v[11] = _mm_mullo_epi32(in[5], cospi22); - x = _mm_mullo_epi32(in[10], cospi42); - v[11] = _mm_sub_epi32(v[11], x); - v[11] = _mm_add_epi32(v[11], rnding); - v[11] = _mm_srai_epi32(v[11], bit); - - v[12] = _mm_mullo_epi32(in[3], cospi50); - x = _mm_mullo_epi32(in[12], cospi14); - v[12] = _mm_add_epi32(v[12], x); - v[12] = _mm_add_epi32(v[12], rnding); - v[12] = _mm_srai_epi32(v[12], bit); - - v[13] = _mm_mullo_epi32(in[3], cospi14); - x = _mm_mullo_epi32(in[12], cospi50); - v[13] = _mm_sub_epi32(v[13], x); - v[13] = _mm_add_epi32(v[13], rnding); - v[13] = _mm_srai_epi32(v[13], bit); - - v[14] = _mm_mullo_epi32(in[1], cospi58); - x = _mm_mullo_epi32(in[14], cospi6); - v[14] = _mm_add_epi32(v[14], x); - v[14] = _mm_add_epi32(v[14], rnding); - v[14] = _mm_srai_epi32(v[14], bit); - - v[15] = _mm_mullo_epi32(in[1], cospi6); - x = _mm_mullo_epi32(in[14], cospi58); - v[15] = _mm_sub_epi32(v[15], x); - v[15] = _mm_add_epi32(v[15], rnding); - v[15] = _mm_srai_epi32(v[15], bit); - - // stage 3 - addsub_sse4_1(v[0], v[8], &u[0], &u[8], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[1], v[9], &u[1], &u[9], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[2], v[10], &u[2], &u[10], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[3], v[11], &u[3], &u[11], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[4], v[12], &u[4], &u[12], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[5], v[13], &u[5], &u[13], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[6], v[14], &u[6], &u[14], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[7], v[15], &u[7], &u[15], &clamp_lo, &clamp_hi); - - // stage 4 - v[0] = u[0]; - v[1] = u[1]; - 
v[2] = u[2]; - v[3] = u[3]; - v[4] = u[4]; - v[5] = u[5]; - v[6] = u[6]; - v[7] = u[7]; - - v[8] = _mm_mullo_epi32(u[8], cospi8); - x = _mm_mullo_epi32(u[9], cospi56); - v[8] = _mm_add_epi32(v[8], x); - v[8] = _mm_add_epi32(v[8], rnding); - v[8] = _mm_srai_epi32(v[8], bit); - - v[9] = _mm_mullo_epi32(u[8], cospi56); - x = _mm_mullo_epi32(u[9], cospi8); - v[9] = _mm_sub_epi32(v[9], x); - v[9] = _mm_add_epi32(v[9], rnding); - v[9] = _mm_srai_epi32(v[9], bit); - - v[10] = _mm_mullo_epi32(u[10], cospi40); - x = _mm_mullo_epi32(u[11], cospi24); - v[10] = _mm_add_epi32(v[10], x); - v[10] = _mm_add_epi32(v[10], rnding); - v[10] = _mm_srai_epi32(v[10], bit); - - v[11] = _mm_mullo_epi32(u[10], cospi24); - x = _mm_mullo_epi32(u[11], cospi40); - v[11] = _mm_sub_epi32(v[11], x); - v[11] = _mm_add_epi32(v[11], rnding); - v[11] = _mm_srai_epi32(v[11], bit); - - v[12] = _mm_mullo_epi32(u[12], cospim56); - x = _mm_mullo_epi32(u[13], cospi8); - v[12] = _mm_add_epi32(v[12], x); - v[12] = _mm_add_epi32(v[12], rnding); - v[12] = _mm_srai_epi32(v[12], bit); - - v[13] = _mm_mullo_epi32(u[12], cospi8); - x = _mm_mullo_epi32(u[13], cospim56); - v[13] = _mm_sub_epi32(v[13], x); - v[13] = _mm_add_epi32(v[13], rnding); - v[13] = _mm_srai_epi32(v[13], bit); - - v[14] = _mm_mullo_epi32(u[14], cospim24); - x = _mm_mullo_epi32(u[15], cospi40); - v[14] = _mm_add_epi32(v[14], x); - v[14] = _mm_add_epi32(v[14], rnding); - v[14] = _mm_srai_epi32(v[14], bit); - - v[15] = _mm_mullo_epi32(u[14], cospi40); - x = _mm_mullo_epi32(u[15], cospim24); - v[15] = _mm_sub_epi32(v[15], x); - v[15] = _mm_add_epi32(v[15], rnding); - v[15] = _mm_srai_epi32(v[15], bit); - - // stage 5 - addsub_sse4_1(v[0], v[4], &u[0], &u[4], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[1], v[5], &u[1], &u[5], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[2], v[6], &u[2], &u[6], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[3], v[7], &u[3], &u[7], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[8], v[12], &u[8], &u[12], &clamp_lo, &clamp_hi); - 
addsub_sse4_1(v[9], v[13], &u[9], &u[13], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[10], v[14], &u[10], &u[14], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[11], v[15], &u[11], &u[15], &clamp_lo, &clamp_hi); - - // stage 6 - v[0] = u[0]; - v[1] = u[1]; - v[2] = u[2]; - v[3] = u[3]; - - v[4] = _mm_mullo_epi32(u[4], cospi16); - x = _mm_mullo_epi32(u[5], cospi48); - v[4] = _mm_add_epi32(v[4], x); - v[4] = _mm_add_epi32(v[4], rnding); - v[4] = _mm_srai_epi32(v[4], bit); - - v[5] = _mm_mullo_epi32(u[4], cospi48); - x = _mm_mullo_epi32(u[5], cospi16); - v[5] = _mm_sub_epi32(v[5], x); - v[5] = _mm_add_epi32(v[5], rnding); - v[5] = _mm_srai_epi32(v[5], bit); - - v[6] = _mm_mullo_epi32(u[6], cospim48); - x = _mm_mullo_epi32(u[7], cospi16); - v[6] = _mm_add_epi32(v[6], x); - v[6] = _mm_add_epi32(v[6], rnding); - v[6] = _mm_srai_epi32(v[6], bit); - - v[7] = _mm_mullo_epi32(u[6], cospi16); - x = _mm_mullo_epi32(u[7], cospim48); - v[7] = _mm_sub_epi32(v[7], x); - v[7] = _mm_add_epi32(v[7], rnding); - v[7] = _mm_srai_epi32(v[7], bit); - - v[8] = u[8]; - v[9] = u[9]; - v[10] = u[10]; - v[11] = u[11]; - - v[12] = _mm_mullo_epi32(u[12], cospi16); - x = _mm_mullo_epi32(u[13], cospi48); - v[12] = _mm_add_epi32(v[12], x); - v[12] = _mm_add_epi32(v[12], rnding); - v[12] = _mm_srai_epi32(v[12], bit); - - v[13] = _mm_mullo_epi32(u[12], cospi48); - x = _mm_mullo_epi32(u[13], cospi16); - v[13] = _mm_sub_epi32(v[13], x); - v[13] = _mm_add_epi32(v[13], rnding); - v[13] = _mm_srai_epi32(v[13], bit); - - v[14] = _mm_mullo_epi32(u[14], cospim48); - x = _mm_mullo_epi32(u[15], cospi16); - v[14] = _mm_add_epi32(v[14], x); - v[14] = _mm_add_epi32(v[14], rnding); - v[14] = _mm_srai_epi32(v[14], bit); - - v[15] = _mm_mullo_epi32(u[14], cospi16); - x = _mm_mullo_epi32(u[15], cospim48); - v[15] = _mm_sub_epi32(v[15], x); - v[15] = _mm_add_epi32(v[15], rnding); - v[15] = _mm_srai_epi32(v[15], bit); - - // stage 7 - addsub_sse4_1(v[0], v[2], &u[0], &u[2], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[1], v[3], 
&u[1], &u[3], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[4], v[6], &u[4], &u[6], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[5], v[7], &u[5], &u[7], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[8], v[10], &u[8], &u[10], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[9], v[11], &u[9], &u[11], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[12], v[14], &u[12], &u[14], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[13], v[15], &u[13], &u[15], &clamp_lo, &clamp_hi); - - // stage 8 - v[0] = u[0]; - v[1] = u[1]; - - y = _mm_mullo_epi32(u[2], cospi32); - x = _mm_mullo_epi32(u[3], cospi32); - v[2] = _mm_add_epi32(y, x); - v[2] = _mm_add_epi32(v[2], rnding); - v[2] = _mm_srai_epi32(v[2], bit); - - v[3] = _mm_sub_epi32(y, x); - v[3] = _mm_add_epi32(v[3], rnding); - v[3] = _mm_srai_epi32(v[3], bit); - - v[4] = u[4]; - v[5] = u[5]; + // stage 0 + // stage 1 + // stage 2 + v[0] = _mm_mullo_epi32(in[15], cospi2); + x = _mm_mullo_epi32(in[0], cospi62); + v[0] = _mm_add_epi32(v[0], x); + v[0] = _mm_add_epi32(v[0], rnding); + v[0] = _mm_srai_epi32(v[0], bit); + + v[1] = _mm_mullo_epi32(in[15], cospi62); + x = _mm_mullo_epi32(in[0], cospi2); + v[1] = _mm_sub_epi32(v[1], x); + v[1] = _mm_add_epi32(v[1], rnding); + v[1] = _mm_srai_epi32(v[1], bit); + + v[2] = _mm_mullo_epi32(in[13], cospi10); + x = _mm_mullo_epi32(in[2], cospi54); + v[2] = _mm_add_epi32(v[2], x); + v[2] = _mm_add_epi32(v[2], rnding); + v[2] = _mm_srai_epi32(v[2], bit); + + v[3] = _mm_mullo_epi32(in[13], cospi54); + x = _mm_mullo_epi32(in[2], cospi10); + v[3] = _mm_sub_epi32(v[3], x); + v[3] = _mm_add_epi32(v[3], rnding); + v[3] = _mm_srai_epi32(v[3], bit); + + v[4] = _mm_mullo_epi32(in[11], cospi18); + x = _mm_mullo_epi32(in[4], cospi46); + v[4] = _mm_add_epi32(v[4], x); + v[4] = _mm_add_epi32(v[4], rnding); + v[4] = _mm_srai_epi32(v[4], bit); + + v[5] = _mm_mullo_epi32(in[11], cospi46); + x = _mm_mullo_epi32(in[4], cospi18); + v[5] = _mm_sub_epi32(v[5], x); + v[5] = _mm_add_epi32(v[5], rnding); + v[5] = _mm_srai_epi32(v[5], bit); + + v[6] = 
_mm_mullo_epi32(in[9], cospi26); + x = _mm_mullo_epi32(in[6], cospi38); + v[6] = _mm_add_epi32(v[6], x); + v[6] = _mm_add_epi32(v[6], rnding); + v[6] = _mm_srai_epi32(v[6], bit); + + v[7] = _mm_mullo_epi32(in[9], cospi38); + x = _mm_mullo_epi32(in[6], cospi26); + v[7] = _mm_sub_epi32(v[7], x); + v[7] = _mm_add_epi32(v[7], rnding); + v[7] = _mm_srai_epi32(v[7], bit); + + v[8] = _mm_mullo_epi32(in[7], cospi34); + x = _mm_mullo_epi32(in[8], cospi30); + v[8] = _mm_add_epi32(v[8], x); + v[8] = _mm_add_epi32(v[8], rnding); + v[8] = _mm_srai_epi32(v[8], bit); + + v[9] = _mm_mullo_epi32(in[7], cospi30); + x = _mm_mullo_epi32(in[8], cospi34); + v[9] = _mm_sub_epi32(v[9], x); + v[9] = _mm_add_epi32(v[9], rnding); + v[9] = _mm_srai_epi32(v[9], bit); + + v[10] = _mm_mullo_epi32(in[5], cospi42); + x = _mm_mullo_epi32(in[10], cospi22); + v[10] = _mm_add_epi32(v[10], x); + v[10] = _mm_add_epi32(v[10], rnding); + v[10] = _mm_srai_epi32(v[10], bit); + + v[11] = _mm_mullo_epi32(in[5], cospi22); + x = _mm_mullo_epi32(in[10], cospi42); + v[11] = _mm_sub_epi32(v[11], x); + v[11] = _mm_add_epi32(v[11], rnding); + v[11] = _mm_srai_epi32(v[11], bit); + + v[12] = _mm_mullo_epi32(in[3], cospi50); + x = _mm_mullo_epi32(in[12], cospi14); + v[12] = _mm_add_epi32(v[12], x); + v[12] = _mm_add_epi32(v[12], rnding); + v[12] = _mm_srai_epi32(v[12], bit); + + v[13] = _mm_mullo_epi32(in[3], cospi14); + x = _mm_mullo_epi32(in[12], cospi50); + v[13] = _mm_sub_epi32(v[13], x); + v[13] = _mm_add_epi32(v[13], rnding); + v[13] = _mm_srai_epi32(v[13], bit); + + v[14] = _mm_mullo_epi32(in[1], cospi58); + x = _mm_mullo_epi32(in[14], cospi6); + v[14] = _mm_add_epi32(v[14], x); + v[14] = _mm_add_epi32(v[14], rnding); + v[14] = _mm_srai_epi32(v[14], bit); + + v[15] = _mm_mullo_epi32(in[1], cospi6); + x = _mm_mullo_epi32(in[14], cospi58); + v[15] = _mm_sub_epi32(v[15], x); + v[15] = _mm_add_epi32(v[15], rnding); + v[15] = _mm_srai_epi32(v[15], bit); - y = _mm_mullo_epi32(u[6], cospi32); - x = 
_mm_mullo_epi32(u[7], cospi32); - v[6] = _mm_add_epi32(y, x); - v[6] = _mm_add_epi32(v[6], rnding); - v[6] = _mm_srai_epi32(v[6], bit); + // stage 3 + addsub_sse4_1(v[0], v[8], &u[0], &u[8], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[1], v[9], &u[1], &u[9], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[2], v[10], &u[2], &u[10], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[3], v[11], &u[3], &u[11], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[4], v[12], &u[4], &u[12], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[5], v[13], &u[5], &u[13], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[6], v[14], &u[6], &u[14], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[7], v[15], &u[7], &u[15], &clamp_lo, &clamp_hi); - v[7] = _mm_sub_epi32(y, x); - v[7] = _mm_add_epi32(v[7], rnding); - v[7] = _mm_srai_epi32(v[7], bit); + // stage 4 + v[0] = u[0]; + v[1] = u[1]; + v[2] = u[2]; + v[3] = u[3]; + v[4] = u[4]; + v[5] = u[5]; + v[6] = u[6]; + v[7] = u[7]; + + v[8] = _mm_mullo_epi32(u[8], cospi8); + x = _mm_mullo_epi32(u[9], cospi56); + v[8] = _mm_add_epi32(v[8], x); + v[8] = _mm_add_epi32(v[8], rnding); + v[8] = _mm_srai_epi32(v[8], bit); + + v[9] = _mm_mullo_epi32(u[8], cospi56); + x = _mm_mullo_epi32(u[9], cospi8); + v[9] = _mm_sub_epi32(v[9], x); + v[9] = _mm_add_epi32(v[9], rnding); + v[9] = _mm_srai_epi32(v[9], bit); + + v[10] = _mm_mullo_epi32(u[10], cospi40); + x = _mm_mullo_epi32(u[11], cospi24); + v[10] = _mm_add_epi32(v[10], x); + v[10] = _mm_add_epi32(v[10], rnding); + v[10] = _mm_srai_epi32(v[10], bit); + + v[11] = _mm_mullo_epi32(u[10], cospi24); + x = _mm_mullo_epi32(u[11], cospi40); + v[11] = _mm_sub_epi32(v[11], x); + v[11] = _mm_add_epi32(v[11], rnding); + v[11] = _mm_srai_epi32(v[11], bit); + + v[12] = _mm_mullo_epi32(u[12], cospim56); + x = _mm_mullo_epi32(u[13], cospi8); + v[12] = _mm_add_epi32(v[12], x); + v[12] = _mm_add_epi32(v[12], rnding); + v[12] = _mm_srai_epi32(v[12], bit); + + v[13] = _mm_mullo_epi32(u[12], cospi8); + x = _mm_mullo_epi32(u[13], cospim56); + v[13] = _mm_sub_epi32(v[13], 
x); + v[13] = _mm_add_epi32(v[13], rnding); + v[13] = _mm_srai_epi32(v[13], bit); + + v[14] = _mm_mullo_epi32(u[14], cospim24); + x = _mm_mullo_epi32(u[15], cospi40); + v[14] = _mm_add_epi32(v[14], x); + v[14] = _mm_add_epi32(v[14], rnding); + v[14] = _mm_srai_epi32(v[14], bit); + + v[15] = _mm_mullo_epi32(u[14], cospi40); + x = _mm_mullo_epi32(u[15], cospim24); + v[15] = _mm_sub_epi32(v[15], x); + v[15] = _mm_add_epi32(v[15], rnding); + v[15] = _mm_srai_epi32(v[15], bit); - v[8] = u[8]; - v[9] = u[9]; + // stage 5 + addsub_sse4_1(v[0], v[4], &u[0], &u[4], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[1], v[5], &u[1], &u[5], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[2], v[6], &u[2], &u[6], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[3], v[7], &u[3], &u[7], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[8], v[12], &u[8], &u[12], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[9], v[13], &u[9], &u[13], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[10], v[14], &u[10], &u[14], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[11], v[15], &u[11], &u[15], &clamp_lo, &clamp_hi); - y = _mm_mullo_epi32(u[10], cospi32); - x = _mm_mullo_epi32(u[11], cospi32); - v[10] = _mm_add_epi32(y, x); - v[10] = _mm_add_epi32(v[10], rnding); - v[10] = _mm_srai_epi32(v[10], bit); + // stage 6 + v[0] = u[0]; + v[1] = u[1]; + v[2] = u[2]; + v[3] = u[3]; + + v[4] = _mm_mullo_epi32(u[4], cospi16); + x = _mm_mullo_epi32(u[5], cospi48); + v[4] = _mm_add_epi32(v[4], x); + v[4] = _mm_add_epi32(v[4], rnding); + v[4] = _mm_srai_epi32(v[4], bit); + + v[5] = _mm_mullo_epi32(u[4], cospi48); + x = _mm_mullo_epi32(u[5], cospi16); + v[5] = _mm_sub_epi32(v[5], x); + v[5] = _mm_add_epi32(v[5], rnding); + v[5] = _mm_srai_epi32(v[5], bit); + + v[6] = _mm_mullo_epi32(u[6], cospim48); + x = _mm_mullo_epi32(u[7], cospi16); + v[6] = _mm_add_epi32(v[6], x); + v[6] = _mm_add_epi32(v[6], rnding); + v[6] = _mm_srai_epi32(v[6], bit); + + v[7] = _mm_mullo_epi32(u[6], cospi16); + x = _mm_mullo_epi32(u[7], cospim48); + v[7] = _mm_sub_epi32(v[7], x); + v[7] 
= _mm_add_epi32(v[7], rnding); + v[7] = _mm_srai_epi32(v[7], bit); + + v[8] = u[8]; + v[9] = u[9]; + v[10] = u[10]; + v[11] = u[11]; + + v[12] = _mm_mullo_epi32(u[12], cospi16); + x = _mm_mullo_epi32(u[13], cospi48); + v[12] = _mm_add_epi32(v[12], x); + v[12] = _mm_add_epi32(v[12], rnding); + v[12] = _mm_srai_epi32(v[12], bit); + + v[13] = _mm_mullo_epi32(u[12], cospi48); + x = _mm_mullo_epi32(u[13], cospi16); + v[13] = _mm_sub_epi32(v[13], x); + v[13] = _mm_add_epi32(v[13], rnding); + v[13] = _mm_srai_epi32(v[13], bit); + + v[14] = _mm_mullo_epi32(u[14], cospim48); + x = _mm_mullo_epi32(u[15], cospi16); + v[14] = _mm_add_epi32(v[14], x); + v[14] = _mm_add_epi32(v[14], rnding); + v[14] = _mm_srai_epi32(v[14], bit); + + v[15] = _mm_mullo_epi32(u[14], cospi16); + x = _mm_mullo_epi32(u[15], cospim48); + v[15] = _mm_sub_epi32(v[15], x); + v[15] = _mm_add_epi32(v[15], rnding); + v[15] = _mm_srai_epi32(v[15], bit); - v[11] = _mm_sub_epi32(y, x); - v[11] = _mm_add_epi32(v[11], rnding); - v[11] = _mm_srai_epi32(v[11], bit); + // stage 7 + addsub_sse4_1(v[0], v[2], &u[0], &u[2], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[1], v[3], &u[1], &u[3], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[4], v[6], &u[4], &u[6], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[5], v[7], &u[5], &u[7], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[8], v[10], &u[8], &u[10], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[9], v[11], &u[9], &u[11], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[12], v[14], &u[12], &u[14], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[13], v[15], &u[13], &u[15], &clamp_lo, &clamp_hi); - v[12] = u[12]; - v[13] = u[13]; + // stage 8 + v[0] = u[0]; + v[1] = u[1]; + + y = _mm_mullo_epi32(u[2], cospi32); + x = _mm_mullo_epi32(u[3], cospi32); + v[2] = _mm_add_epi32(y, x); + v[2] = _mm_add_epi32(v[2], rnding); + v[2] = _mm_srai_epi32(v[2], bit); + + v[3] = _mm_sub_epi32(y, x); + v[3] = _mm_add_epi32(v[3], rnding); + v[3] = _mm_srai_epi32(v[3], bit); + + v[4] = u[4]; + v[5] = u[5]; + + y = 
_mm_mullo_epi32(u[6], cospi32); + x = _mm_mullo_epi32(u[7], cospi32); + v[6] = _mm_add_epi32(y, x); + v[6] = _mm_add_epi32(v[6], rnding); + v[6] = _mm_srai_epi32(v[6], bit); + + v[7] = _mm_sub_epi32(y, x); + v[7] = _mm_add_epi32(v[7], rnding); + v[7] = _mm_srai_epi32(v[7], bit); + + v[8] = u[8]; + v[9] = u[9]; + + y = _mm_mullo_epi32(u[10], cospi32); + x = _mm_mullo_epi32(u[11], cospi32); + v[10] = _mm_add_epi32(y, x); + v[10] = _mm_add_epi32(v[10], rnding); + v[10] = _mm_srai_epi32(v[10], bit); + + v[11] = _mm_sub_epi32(y, x); + v[11] = _mm_add_epi32(v[11], rnding); + v[11] = _mm_srai_epi32(v[11], bit); + + v[12] = u[12]; + v[13] = u[13]; + + y = _mm_mullo_epi32(u[14], cospi32); + x = _mm_mullo_epi32(u[15], cospi32); + v[14] = _mm_add_epi32(y, x); + v[14] = _mm_add_epi32(v[14], rnding); + v[14] = _mm_srai_epi32(v[14], bit); + + v[15] = _mm_sub_epi32(y, x); + v[15] = _mm_add_epi32(v[15], rnding); + v[15] = _mm_srai_epi32(v[15], bit); - y = _mm_mullo_epi32(u[14], cospi32); - x = _mm_mullo_epi32(u[15], cospi32); - v[14] = _mm_add_epi32(y, x); - v[14] = _mm_add_epi32(v[14], rnding); - v[14] = _mm_srai_epi32(v[14], bit); + // stage 9 + if (do_cols) { + out[0] = v[0]; + out[1] = _mm_sub_epi32(zero, v[8]); + out[2] = v[12]; + out[3] = _mm_sub_epi32(zero, v[4]); + out[4] = v[6]; + out[5] = _mm_sub_epi32(zero, v[14]); + out[6] = v[10]; + out[7] = _mm_sub_epi32(zero, v[2]); + out[8] = v[3]; + out[9] = _mm_sub_epi32(zero, v[11]); + out[10] = v[15]; + out[11] = _mm_sub_epi32(zero, v[7]); + out[12] = v[5]; + out[13] = _mm_sub_epi32(zero, v[13]); + out[14] = v[9]; + out[15] = _mm_sub_epi32(zero, v[1]); + } else { + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); + const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); - v[15] = _mm_sub_epi32(y, x); - v[15] = _mm_add_epi32(v[15], rnding); - v[15] = _mm_srai_epi32(v[15], bit); + neg_shift_sse4_1(v[0], v[8], out + 0, out + 1, 
&clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_sse4_1(v[12], v[4], out + 2, out + 3, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(v[6], v[14], out + 4, out + 5, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(v[10], v[2], out + 6, out + 7, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(v[3], v[11], out + 8, out + 9, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(v[15], v[7], out + 10, out + 11, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(v[5], v[13], out + 12, out + 13, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(v[9], v[1], out + 14, out + 15, &clamp_lo_out, + &clamp_hi_out, out_shift); + } +} +static void iidentity16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, + int bd, int out_shift) { + (void)bit; + __m128i fact = _mm_set1_epi32(2 * NewSqrt2); + __m128i offset = _mm_set1_epi32(1 << (NewSqrt2Bits - 1)); + __m128i a0_low, a0_high, a1_low, a1_high; + __m128i zero = _mm_set1_epi32(0); + offset = _mm_unpacklo_epi32(offset, zero); - // stage 9 - if (do_cols) { - out[0] = v[0]; - out[1] = _mm_sub_epi32(_mm_setzero_si128(), v[8]); - out[2] = v[12]; - out[3] = _mm_sub_epi32(_mm_setzero_si128(), v[4]); - out[4] = v[6]; - out[5] = _mm_sub_epi32(_mm_setzero_si128(), v[14]); - out[6] = v[10]; - out[7] = _mm_sub_epi32(_mm_setzero_si128(), v[2]); - out[8] = v[3]; - out[9] = _mm_sub_epi32(_mm_setzero_si128(), v[11]); - out[10] = v[15]; - out[11] = _mm_sub_epi32(_mm_setzero_si128(), v[7]); - out[12] = v[5]; - out[13] = _mm_sub_epi32(_mm_setzero_si128(), v[13]); - out[14] = v[9]; - out[15] = _mm_sub_epi32(_mm_setzero_si128(), v[1]); - } else { - const int log_range_out = AOMMAX(16, bd + 6); - const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); - const __m128i clamp_hi_out = - _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + for (int i = 0; i < 16; i++) { + a0_low = _mm_mul_epi32(in[i], fact); + a0_low = _mm_add_epi32(a0_low, offset); 
+ a0_low = _mm_srli_epi64(a0_low, NewSqrt2Bits); + + a0_high = _mm_srli_si128(in[i], 4); + a0_high = _mm_mul_epi32(a0_high, fact); + a0_high = _mm_add_epi32(a0_high, offset); + a0_high = _mm_srli_epi64(a0_high, NewSqrt2Bits); + + a1_low = _mm_unpacklo_epi32(a0_low, a0_high); + a1_high = _mm_unpackhi_epi32(a0_low, a0_high); + out[i] = _mm_unpacklo_epi64(a1_low, a1_high); + } - neg_shift_sse4_1(v[0], v[8], out + 0, out + 1, &clamp_lo_out, - &clamp_hi_out, out_shift); - neg_shift_sse4_1(v[12], v[4], out + 2, out + 3, &clamp_lo_out, - &clamp_hi_out, out_shift); - neg_shift_sse4_1(v[6], v[14], out + 4, out + 5, &clamp_lo_out, - &clamp_hi_out, out_shift); - neg_shift_sse4_1(v[10], v[2], out + 6, out + 7, &clamp_lo_out, - &clamp_hi_out, out_shift); - neg_shift_sse4_1(v[3], v[11], out + 8, out + 9, &clamp_lo_out, - &clamp_hi_out, out_shift); - neg_shift_sse4_1(v[15], v[7], out + 10, out + 11, &clamp_lo_out, - &clamp_hi_out, out_shift); - neg_shift_sse4_1(v[5], v[13], out + 12, out + 13, &clamp_lo_out, - &clamp_hi_out, out_shift); - neg_shift_sse4_1(v[9], v[1], out + 14, out + 15, &clamp_lo_out, - &clamp_hi_out, out_shift); - } + if (!do_cols) { + const int log_range = AOMMAX(16, bd + 6); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + round_shift_8x8(out, out_shift); + highbd_clamp_epi32_sse4_1(out, out, &clamp_lo, &clamp_hi, 16); } } - static INLINE void idct64_stage8_sse4_1( __m128i *u, const __m128i *cospim32, const __m128i *cospi32, const __m128i *cospim16, const __m128i *cospi48, const __m128i *cospi16, @@ -3091,21 +3239,21 @@ static INLINE void idct64_stage10_sse4_1(__m128i *u, const __m128i *cospim32, static INLINE void idct64_stage11_sse4_1(__m128i *u, __m128i *out, int do_cols, int bd, int out_shift, - const int log_range) { - if (do_cols) { - for (int i = 0; i < 32; i++) { - addsub_no_clamp_sse4_1(u[i], u[63 - i], &out[(i)], &out[(63 - i)]); - } - } else { + const __m128i 
*clamp_lo, + const __m128i *clamp_hi) { + for (int i = 0; i < 32; i++) { + addsub_sse4_1(u[i], u[63 - i], out + i, out + 63 - i, clamp_lo, clamp_hi); + } + + if (!do_cols) { const int log_range_out = AOMMAX(16, bd + 6); - const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX( - -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift)))); - const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN( - (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift)))); - - for (int i = 0; i < 32; i++) { - addsub_shift_sse4_1(u[i], u[63 - i], &out[(i)], &out[(63 - i)], - &clamp_lo_out, &clamp_hi_out, out_shift); + const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); + const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + + for (int i = 0; i < 64; i += 4) { + round_shift_4x4(out + i, out_shift); + highbd_clamp_epi32_sse4_1(out + i, out + i, &clamp_lo_out, &clamp_hi_out, + 4); } } } @@ -3115,8 +3263,8 @@ static void idct64x64_low1_sse4_1(__m128i *in, __m128i *out, int bit, const int32_t *cospi = cospi_arr(bit); const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); - const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); - const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); const __m128i cospi32 = _mm_set1_epi32(cospi[32]); @@ -3135,88 +3283,82 @@ static void idct64x64_low1_sse4_1(__m128i *in, __m128i *out, int bit, // stage 9 // stage 10 // stage 11 - if (do_cols) { - x = _mm_max_epi32(x, clamp_lo); - x = _mm_min_epi32(x, clamp_hi); - } else { + if (!do_cols) { const int log_range_out = AOMMAX(16, bd + 6); - const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX( - -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift)))); - const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN( - (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift)))); - - __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1); - x = _mm_add_epi32(x, offset); - x = _mm_sra_epi32(x, _mm_cvtsi32_si128(out_shift)); - - x = _mm_max_epi32(x, clamp_lo_out); - x = _mm_min_epi32(x, clamp_hi_out); + clamp_lo = _mm_set1_epi32(-(1 << (log_range_out - 1))); + clamp_hi = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + if (out_shift != 0) { + __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1); + x = _mm_add_epi32(x, offset); + x = _mm_sra_epi32(x, _mm_cvtsi32_si128(out_shift)); + } } - + x = _mm_max_epi32(x, clamp_lo); + x = _mm_min_epi32(x, clamp_hi); out[0] = x; - out[63] = x; out[1] = x; - out[62] = x; out[2] = x; - out[61] = x; out[3] = x; - out[60] = x; out[4] = x; - out[59] = x; out[5] = x; - out[58] = x; out[6] = x; - out[57] = x; out[7] = x; - out[56] = x; out[8] = x; - out[55] = x; out[9] = x; - out[54] = x; out[10] = x; - out[53] = x; out[11] = x; - out[52] = x; out[12] = x; - out[51] = x; out[13] = x; - out[50] = x; out[14] = x; - out[49] = x; out[15] = x; - out[48] = x; out[16] = x; - out[47] = x; out[17] = x; - out[46] = x; out[18] = x; - out[45] = x; out[19] = x; - out[44] = 
x; out[20] = x; - out[43] = x; out[21] = x; - out[42] = x; out[22] = x; - out[41] = x; out[23] = x; - out[40] = x; out[24] = x; - out[39] = x; out[25] = x; - out[38] = x; out[26] = x; - out[37] = x; out[27] = x; - out[36] = x; out[28] = x; - out[35] = x; out[29] = x; - out[34] = x; out[30] = x; - out[33] = x; out[31] = x; out[32] = x; + out[33] = x; + out[34] = x; + out[35] = x; + out[36] = x; + out[37] = x; + out[38] = x; + out[39] = x; + out[40] = x; + out[41] = x; + out[42] = x; + out[43] = x; + out[44] = x; + out[45] = x; + out[46] = x; + out[47] = x; + out[48] = x; + out[49] = x; + out[50] = x; + out[51] = x; + out[52] = x; + out[53] = x; + out[54] = x; + out[55] = x; + out[56] = x; + out[57] = x; + out[58] = x; + out[59] = x; + out[60] = x; + out[61] = x; + out[62] = x; + out[63] = x; } } @@ -3434,7 +3576,6 @@ static void idct64x64_low8_sse4_1(__m128i *in, __m128i *out, int bit, u[6] = u[1]; u[5] = u[2]; u[4] = u[3]; - u[9] = u[9]; idct64_stage8_sse4_1(u, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16, &cospim48, &clamp_lo, &clamp_hi, &rnding, bit); @@ -3448,7 +3589,7 @@ static void idct64x64_low8_sse4_1(__m128i *in, __m128i *out, int bit, bit); // stage 11 - idct64_stage11_sse4_1(u, out, do_cols, bd, out_shift, log_range); + idct64_stage11_sse4_1(u, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi); } } @@ -3758,7 +3899,7 @@ static void idct64x64_low16_sse4_1(__m128i *in, __m128i *out, int bit, bit); // stage 11 - idct64_stage11_sse4_1(u, out, do_cols, bd, out_shift, log_range); + idct64_stage11_sse4_1(u, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi); } } @@ -4221,20 +4362,20 @@ static void idct64x64_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, for (i = 56; i < 64; i++) v[i] = u[i]; // stage 11 - if (do_cols) { - for (i = 0; i < 32; i++) { - addsub_no_clamp_sse4_1(v[i], v[63 - i], &out[(i)], &out[(63 - i)]); - } - } else { + for (i = 0; i < 32; i++) { + addsub_sse4_1(v[i], v[63 - i], &out[(i)], &out[(63 - i)], &clamp_lo, + &clamp_hi); + 
} + + if (!do_cols) { const int log_range_out = AOMMAX(16, bd + 6); - const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX( - -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift)))); - const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN( - (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift)))); - - for (i = 0; i < 32; i++) { - addsub_shift_sse4_1(v[i], v[63 - i], &out[(i)], &out[(63 - i)], - &clamp_lo_out, &clamp_hi_out, out_shift); + const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); + const __m128i clamp_hi_out = + _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + for (i = 0; i < 64; i += 4) { + round_shift_4x4(out + i, out_shift); + highbd_clamp_epi32_sse4_1(out + i, out + i, &clamp_lo_out, + &clamp_hi_out, 4); } } } @@ -4246,8 +4387,8 @@ static void idct32x32_low1_sse4_1(__m128i *in, __m128i *out, int bit, const __m128i cospi32 = _mm_set1_epi32(cospi[32]); const __m128i rounding = _mm_set1_epi32(1 << (bit - 1)); const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); - const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); - const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); __m128i bf1; // stage 0 @@ -4269,17 +4410,17 @@ static void idct32x32_low1_sse4_1(__m128i *in, __m128i *out, int bit, bf1 = _mm_min_epi32(bf1, clamp_hi); } else { const int log_range_out = AOMMAX(16, bd + 6); - const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX( - -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift)))); - const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN( - (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift)))); - - __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1); - bf1 = _mm_add_epi32(bf1, offset); - bf1 = _mm_sra_epi32(bf1, _mm_cvtsi32_si128(out_shift)); - bf1 = _mm_max_epi32(bf1, clamp_lo_out); - bf1 = _mm_min_epi32(bf1, clamp_hi_out); + clamp_lo = _mm_set1_epi32(-(1 << (log_range_out - 1))); + clamp_hi = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + if (out_shift != 0) { + __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1); + bf1 = _mm_add_epi32(bf1, offset); + bf1 = _mm_sra_epi32(bf1, _mm_cvtsi32_si128(out_shift)); + } } + + bf1 = _mm_max_epi32(bf1, clamp_lo); + bf1 = _mm_min_epi32(bf1, clamp_hi); out[0] = bf1; out[1] = bf1; out[2] = bf1; @@ -4422,7 +4563,7 @@ static void idct32x32_low8_sse4_1(__m128i *in, __m128i *out, int bit, &rounding, bit); // stage 9 - idct32_stage9_sse4_1(bf1, out, do_cols, bd, out_shift, log_range); + idct32_stage9_sse4_1(bf1, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi); } static void idct32x32_low16_sse4_1(__m128i *in, __m128i *out, int bit, @@ -4568,9 +4709,8 @@ static void idct32x32_low16_sse4_1(__m128i *in, __m128i *out, int bit, // stage 8 idct32_stage8_sse4_1(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rounding, bit); - // stage 9 - idct32_stage9_sse4_1(bf1, out, do_cols, bd, out_shift, 
log_range); + idct32_stage9_sse4_1(bf1, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi); } static void idct32x32_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, @@ -4926,62 +5066,30 @@ static void idct32x32_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, bf0[31] = bf1[31]; // stage 9 - if (do_cols) { - addsub_no_clamp_sse4_1(bf0[0], bf0[31], out + 0, out + 31); - addsub_no_clamp_sse4_1(bf0[1], bf0[30], out + 1, out + 30); - addsub_no_clamp_sse4_1(bf0[2], bf0[29], out + 2, out + 29); - addsub_no_clamp_sse4_1(bf0[3], bf0[28], out + 3, out + 28); - addsub_no_clamp_sse4_1(bf0[4], bf0[27], out + 4, out + 27); - addsub_no_clamp_sse4_1(bf0[5], bf0[26], out + 5, out + 26); - addsub_no_clamp_sse4_1(bf0[6], bf0[25], out + 6, out + 25); - addsub_no_clamp_sse4_1(bf0[7], bf0[24], out + 7, out + 24); - addsub_no_clamp_sse4_1(bf0[8], bf0[23], out + 8, out + 23); - addsub_no_clamp_sse4_1(bf0[9], bf0[22], out + 9, out + 22); - addsub_no_clamp_sse4_1(bf0[10], bf0[21], out + 10, out + 21); - addsub_no_clamp_sse4_1(bf0[11], bf0[20], out + 11, out + 20); - addsub_no_clamp_sse4_1(bf0[12], bf0[19], out + 12, out + 19); - addsub_no_clamp_sse4_1(bf0[13], bf0[18], out + 13, out + 18); - addsub_no_clamp_sse4_1(bf0[14], bf0[17], out + 14, out + 17); - addsub_no_clamp_sse4_1(bf0[15], bf0[16], out + 15, out + 16); - } else { + addsub_sse4_1(bf0[0], bf0[31], out + 0, out + 31, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[1], bf0[30], out + 1, out + 30, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[2], bf0[29], out + 2, out + 29, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[3], bf0[28], out + 3, out + 28, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[4], bf0[27], out + 4, out + 27, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[5], bf0[26], out + 5, out + 26, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[6], bf0[25], out + 6, out + 25, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[7], bf0[24], out + 7, out + 24, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[8], bf0[23], out + 8, out + 23, &clamp_lo, 
&clamp_hi); + addsub_sse4_1(bf0[9], bf0[22], out + 9, out + 22, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[10], bf0[21], out + 10, out + 21, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[11], bf0[20], out + 11, out + 20, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[12], bf0[19], out + 12, out + 19, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[13], bf0[18], out + 13, out + 18, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[14], bf0[17], out + 14, out + 17, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[15], bf0[16], out + 15, out + 16, &clamp_lo, &clamp_hi); + + if (!do_cols) { const int log_range_out = AOMMAX(16, bd + 6); - const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX( - -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift)))); - const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN( - (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift)))); - - addsub_shift_sse4_1(bf0[0], bf0[31], out + 0, out + 31, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(bf0[1], bf0[30], out + 1, out + 30, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(bf0[2], bf0[29], out + 2, out + 29, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(bf0[3], bf0[28], out + 3, out + 28, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(bf0[4], bf0[27], out + 4, out + 27, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(bf0[5], bf0[26], out + 5, out + 26, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(bf0[6], bf0[25], out + 6, out + 25, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(bf0[7], bf0[24], out + 7, out + 24, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(bf0[8], bf0[23], out + 8, out + 23, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(bf0[9], bf0[22], out + 9, out + 22, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(bf0[10], bf0[21], out + 10, out + 21, &clamp_lo_out, - &clamp_hi_out, out_shift); - 
addsub_shift_sse4_1(bf0[11], bf0[20], out + 11, out + 20, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(bf0[12], bf0[19], out + 12, out + 19, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(bf0[13], bf0[18], out + 13, out + 18, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(bf0[14], bf0[17], out + 14, out + 17, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(bf0[15], bf0[16], out + 15, out + 16, &clamp_lo_out, - &clamp_hi_out, out_shift); + const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); + const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + round_shift_8x8(out, out_shift); + round_shift_8x8(out + 16, out_shift); + highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 32); } } @@ -4992,127 +5100,23 @@ void av1_highbd_inv_txfm_add_8x8_sse4_1(const tran_low_t *input, uint8_t *dest, const TX_TYPE tx_type = txfm_param->tx_type; const int32_t *src = cast_to_int32(input); switch (tx_type) { - // Assembly version doesn't support some transform types, so use C version - // for those. - case V_DCT: - case H_DCT: - case V_ADST: - case H_ADST: - case V_FLIPADST: - case H_FLIPADST: case IDTX: - av1_inv_txfm2d_add_8x8_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, - bd); - break; - default: - av1_inv_txfm2d_add_8x8_sse4_1(src, CONVERT_TO_SHORTPTR(dest), stride, - tx_type, bd); - break; - } -} - -void av1_highbd_inv_txfm_add_16x8_sse4_1(const tran_low_t *input, uint8_t *dest, - int stride, - const TxfmParam *txfm_param) { - int bd = txfm_param->bd; - const TX_TYPE tx_type = txfm_param->tx_type; - const int32_t *src = cast_to_int32(input); - switch (tx_type) { - // Assembly version doesn't support some transform types, so use C version - // for those. 
- case V_DCT: case H_DCT: - case V_ADST: case H_ADST: - case V_FLIPADST: case H_FLIPADST: - case IDTX: - av1_inv_txfm2d_add_16x8_c(src, CONVERT_TO_SHORTPTR(dest), stride, - txfm_param->tx_type, txfm_param->bd); - break; - default: - av1_highbd_inv_txfm2d_add_universe_sse4_1(input, dest, stride, tx_type, - txfm_param->tx_size, - txfm_param->eob, bd); - break; - } -} - -void av1_highbd_inv_txfm_add_8x16_sse4_1(const tran_low_t *input, uint8_t *dest, - int stride, - const TxfmParam *txfm_param) { - int bd = txfm_param->bd; - const TX_TYPE tx_type = txfm_param->tx_type; - const int32_t *src = cast_to_int32(input); - switch (tx_type) { - // Assembly version doesn't support some transform types, so use C version - // for those. case V_DCT: - case H_DCT: case V_ADST: - case H_ADST: case V_FLIPADST: - case H_FLIPADST: - case IDTX: - av1_inv_txfm2d_add_8x16_c(src, CONVERT_TO_SHORTPTR(dest), stride, - txfm_param->tx_type, txfm_param->bd); - break; - default: av1_highbd_inv_txfm2d_add_universe_sse4_1(input, dest, stride, tx_type, txfm_param->tx_size, txfm_param->eob, bd); break; - } -} - -void av1_highbd_inv_txfm_add_16x16_sse4_1(const tran_low_t *input, - uint8_t *dest, int stride, - const TxfmParam *txfm_param) { - int bd = txfm_param->bd; - const TX_TYPE tx_type = txfm_param->tx_type; - const int32_t *src = cast_to_int32(input); - switch (tx_type) { - // Assembly version doesn't support some transform types, so use C version - // for those. 
- case V_DCT: - case H_DCT: - case V_ADST: - case H_ADST: - case V_FLIPADST: - case H_FLIPADST: - case IDTX: - av1_inv_txfm2d_add_16x16_c(src, CONVERT_TO_SHORTPTR(dest), stride, - tx_type, bd); - break; default: - av1_highbd_inv_txfm2d_add_universe_sse4_1(input, dest, stride, tx_type, - txfm_param->tx_size, - txfm_param->eob, bd); - break; - } -} - -void av1_highbd_inv_txfm_add_32x32_sse4_1(const tran_low_t *input, - uint8_t *dest, int stride, - const TxfmParam *txfm_param) { - int bd = txfm_param->bd; - const TX_TYPE tx_type = txfm_param->tx_type; - const int32_t *src = cast_to_int32(input); - switch (tx_type) { - case DCT_DCT: - av1_highbd_inv_txfm2d_add_universe_sse4_1(input, dest, stride, tx_type, - txfm_param->tx_size, - txfm_param->eob, bd); - break; - // Assembly version doesn't support IDTX, so use C version for it. - case IDTX: - av1_inv_txfm2d_add_32x32_c(src, CONVERT_TO_SHORTPTR(dest), stride, - tx_type, bd); + av1_inv_txfm2d_add_8x8_sse4_1(src, CONVERT_TO_SHORTPTR(dest), stride, + tx_type, bd); break; - default: assert(0); } } - void av1_highbd_inv_txfm_add_4x4_sse4_1(const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param) { @@ -5127,53 +5131,268 @@ void av1_highbd_inv_txfm_add_4x4_sse4_1(const tran_low_t *input, uint8_t *dest, av1_highbd_iwht4x4_add(input, dest, stride, eob, bd); return; } - switch (tx_type) { - // Assembly version doesn't support some transform types, so use C version - // for those. 
- case V_DCT: - case H_DCT: - case V_ADST: - case H_ADST: - case V_FLIPADST: - case H_FLIPADST: - case IDTX: - av1_inv_txfm2d_add_4x4_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, - bd); - break; - default: - av1_inv_txfm2d_add_4x4_sse4_1(src, CONVERT_TO_SHORTPTR(dest), stride, - tx_type, bd); - break; - } + av1_inv_txfm2d_add_4x4_sse4_1(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, + bd); } +static void iidentity32_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, + int bd, int out_shift) { + (void)bit; + for (int i = 0; i < 32; i += 16) { + out[i] = _mm_slli_epi32(in[i], 2); + out[i + 1] = _mm_slli_epi32(in[i + 1], 2); + out[i + 2] = _mm_slli_epi32(in[i + 2], 2); + out[i + 3] = _mm_slli_epi32(in[i + 3], 2); + out[i + 4] = _mm_slli_epi32(in[i + 4], 2); + out[i + 5] = _mm_slli_epi32(in[i + 5], 2); + out[i + 6] = _mm_slli_epi32(in[i + 6], 2); + out[i + 7] = _mm_slli_epi32(in[i + 7], 2); + out[i + 8] = _mm_slli_epi32(in[i + 8], 2); + out[i + 9] = _mm_slli_epi32(in[i + 9], 2); + out[i + 10] = _mm_slli_epi32(in[i + 10], 2); + out[i + 11] = _mm_slli_epi32(in[i + 11], 2); + out[i + 12] = _mm_slli_epi32(in[i + 12], 2); + out[i + 13] = _mm_slli_epi32(in[i + 13], 2); + out[i + 14] = _mm_slli_epi32(in[i + 14], 2); + out[i + 15] = _mm_slli_epi32(in[i + 15], 2); + } + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); + const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + round_shift_8x8(out, out_shift); + round_shift_8x8(out + 16, out_shift); + highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 32); + } +} static const transform_1d_sse4_1 highbd_txfm_all_1d_zeros_w8_arr[TX_SIZES][ITX_TYPES_1D][4] = { { - { NULL, NULL, NULL, NULL }, - { NULL, NULL, NULL, NULL }, - { NULL, NULL, NULL, NULL }, + { idct4x4_sse4_1, NULL, NULL, NULL }, + { iadst4x4_sse4_1, NULL, NULL, NULL }, + { iidentity4_sse4_1, iidentity4_sse4_1, iidentity4_sse4_1, NULL 
}, }, { { idct8x8_low1_sse4_1, idct8x8_new_sse4_1, NULL, NULL }, { iadst8x8_low1_sse4_1, iadst8x8_new_sse4_1, NULL, NULL }, - { NULL, NULL, NULL, NULL } }, + { iidentity8_sse4_1, iidentity8_sse4_1, NULL, NULL } }, { { idct16x16_low1_sse4_1, idct16x16_low8_sse4_1, idct16x16_sse4_1, NULL }, { iadst16x16_low1_sse4_1, iadst16x16_low8_sse4_1, iadst16x16_sse4_1, NULL }, - { NULL, NULL, NULL, NULL }, + { iidentity16_sse4_1, NULL, iidentity16_sse4_1, NULL }, }, { { idct32x32_low1_sse4_1, idct32x32_low8_sse4_1, idct32x32_low16_sse4_1, idct32x32_sse4_1 }, { NULL, NULL, NULL, NULL }, - { NULL, NULL, NULL, NULL } }, + { iidentity32_sse4_1, NULL, NULL, NULL } }, { { idct64x64_low1_sse4_1, idct64x64_low8_sse4_1, idct64x64_low16_sse4_1, idct64x64_sse4_1 }, { NULL, NULL, NULL, NULL }, { NULL, NULL, NULL, NULL } } }; +static void highbd_inv_txfm2d_add_h_identity_ssse41(const int32_t *input, + uint16_t *output, + int stride, TX_TYPE tx_type, + TX_SIZE tx_size, int eob, + const int bd) { + __m128i buf1[64]; + int eobx, eoby; + get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob); + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int input_stride = AOMMIN(32, txfm_size_col); + const int buf_size_w_div4 = input_stride >> 2; + const int buf_size_h_div8 = (eoby + 8) >> 3; + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eoby]; + const transform_1d_sse4_1 row_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0]; + const transform_1d_sse4_1 col_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx]; + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + for (int i = 0; i < (buf_size_h_div8 << 1); ++i) { + __m128i buf0[16]; + const int32_t 
*input_row = input + i * input_stride * 4; + for (int j = 0; j < buf_size_w_div4; ++j) { + __m128i *buf0_cur = buf0 + j * 4; + load_buffer_32bit_input(input_row + j * 4, input_stride, buf0_cur, 4); + } + if (rect_type == 1 || rect_type == -1) { + av1_round_shift_rect_array_32_sse4_1(buf0, buf0, input_stride, 0, + NewInvSqrt2); + } + row_txfm(buf0, buf0, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, + -shift[0]); + + __m128i *_buf1 = buf1 + i * 4; + + for (int j = 0; j < buf_size_w_div4; ++j) { + _buf1[j * txfm_size_row + 0] = buf0[j * 4 + 0]; + _buf1[j * txfm_size_row + 1] = buf0[j * 4 + 1]; + _buf1[j * txfm_size_row + 2] = buf0[j * 4 + 2]; + _buf1[j * txfm_size_row + 3] = buf0[j * 4 + 3]; + } + } + for (int i = 0; i < buf_size_w_div4; i++) { + col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, + av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + + av1_round_shift_array_32_sse4_1(buf1 + i * txfm_size_row, + buf1 + i * txfm_size_row, txfm_size_row, + -shift[1]); + } + // write to buffer + for (int i = 0; i < (txfm_size_col >> 3); i++) { + highbd_write_buffer_8xn_sse4_1(buf1 + i * txfm_size_row * 2, output + 8 * i, + stride, ud_flip, txfm_size_row, bd); + } +} +static void highbd_inv_txfm2d_add_v_identity_ssse41(const int32_t *input, + uint16_t *output, + int stride, TX_TYPE tx_type, + TX_SIZE tx_size, int eob, + const int bd) { + __m128i buf1[64]; + int eobx, eoby; + get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob); + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int input_stride = AOMMIN(32, txfm_size_col); + const int buf_size_w_div8 = input_stride >> 2; + const int row_max = AOMMIN(32, txfm_size_row); + const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3; + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + const 
int fun_idx = lowbd_txfm_all_1d_zeros_idx[eobx]; + const transform_1d_sse4_1 row_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx]; + const transform_1d_sse4_1 col_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0]; + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + for (int i = 0; i < (row_max >> 2); ++i) { + __m128i buf0[16]; + const int32_t *input_row = input + i * input_stride * 4; + for (int j = 0; j < (buf_size_nonzero_w_div8 << 1); ++j) { + __m128i *buf0_cur = buf0 + j * 4; + load_buffer_32bit_input(input_row + j * 4, input_stride, buf0_cur, 4); + + TRANSPOSE_4X4(buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3], + buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3]); + } + if (rect_type == 1 || rect_type == -1) { + av1_round_shift_rect_array_32_sse4_1( + buf0, buf0, (buf_size_nonzero_w_div8 << 3), 0, NewInvSqrt2); + } + row_txfm(buf0, buf0, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, + -shift[0]); + + __m128i *_buf1 = buf1 + i * 4; + if (lr_flip) { + for (int j = 0; j < buf_size_w_div8; ++j) { + TRANSPOSE_4X4(buf0[4 * j + 3], buf0[4 * j + 2], buf0[4 * j + 1], + buf0[4 * j], + _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 0], + _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 1], + _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 2], + _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 3]); + } + } else { + for (int j = 0; j < buf_size_w_div8; ++j) { + TRANSPOSE_4X4( + buf0[j * 4 + 0], buf0[j * 4 + 1], buf0[j * 4 + 2], buf0[j * 4 + 3], + _buf1[j * txfm_size_row + 0], _buf1[j * txfm_size_row + 1], + _buf1[j * txfm_size_row + 2], _buf1[j * txfm_size_row + 3]); + } + } + } + for (int i = 0; i < buf_size_w_div8; i++) { + col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, + av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + + av1_round_shift_array_32_sse4_1(buf1 + i * txfm_size_row, + buf1 + i * txfm_size_row, txfm_size_row, + -shift[1]); + } + + // write to buffer + { 
+ for (int i = 0; i < (txfm_size_col >> 3); i++) { + highbd_write_buffer_8xn_sse4_1(buf1 + i * txfm_size_row * 2, + output + 8 * i, stride, ud_flip, + txfm_size_row, bd); + } + } +} +static void highbd_inv_txfm2d_add_idtx_ssse41(const int32_t *input, + uint16_t *output, int stride, + TX_TYPE tx_type, TX_SIZE tx_size, + int eob, const int bd) { + (void)eob; + __m128i buf1[64 * 4]; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int input_stride = AOMMIN(32, txfm_size_col); + const int row_max = AOMMIN(32, txfm_size_row); + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + const transform_1d_sse4_1 row_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0]; + const transform_1d_sse4_1 col_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0]; + + for (int i = 0; i < (row_max >> 2); ++i) { + __m128i buf0[32]; + const int32_t *input_row = input + i * input_stride * 4; + for (int j = 0; j < (input_stride >> 2); ++j) { + __m128i *buf0_cur = buf0 + j * 4; + load_buffer_32bit_input(input_row + j * 4, input_stride, buf0_cur, 4); + } + if (rect_type == 1 || rect_type == -1) { + av1_round_shift_rect_array_32_sse4_1(buf0, buf0, input_stride, 0, + NewInvSqrt2); + } + row_txfm(buf0, buf0, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, + -shift[0]); + + __m128i *_buf1 = buf1 + i * 4; + for (int j = 0; j < (input_stride >> 2); ++j) { + _buf1[j * txfm_size_row + 0] = buf0[j * 4 + 0]; + _buf1[j * txfm_size_row + 1] = buf0[j * 4 + 1]; + _buf1[j * txfm_size_row + 2] = buf0[j * 4 + 2]; + _buf1[j * txfm_size_row + 3] = buf0[j * 4 + 3]; + } + } + for (int i = 0; i < (input_stride >> 2); i++) { + col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, + av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + + 
av1_round_shift_array_32_sse4_1(buf1 + i * txfm_size_row, + buf1 + i * txfm_size_row, txfm_size_row, + -shift[1]); + } + + // write to buffer + { + for (int i = 0; i < (txfm_size_col >> 3); i++) { + highbd_write_buffer_8xn_sse4_1(buf1 + i * txfm_size_row * 2, + output + 8 * i, stride, 0, txfm_size_row, + bd); + } + } +} static void highbd_inv_txfm2d_add_no_identity_sse41(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, @@ -5182,7 +5401,7 @@ static void highbd_inv_txfm2d_add_no_identity_sse41(const int32_t *input, __m128i buf1[64 * 16]; int eobx, eoby; get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob); - const int8_t *shift = inv_txfm_shift_ls[tx_size]; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; const int txw_idx = get_txw_idx(tx_size); const int txh_idx = get_txh_idx(tx_size); const int txfm_size_col = tx_size_wide[tx_size]; @@ -5220,7 +5439,8 @@ static void highbd_inv_txfm2d_add_no_identity_sse41(const int32_t *input, av1_round_shift_rect_array_32_sse4_1( buf0, buf0, buf_size_nonzero_w_div8 << 3, 0, NewInvSqrt2); } - row_txfm(buf0, buf0, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]); + row_txfm(buf0, buf0, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, + -shift[0]); __m128i *_buf1 = buf1 + i * 4; if (lr_flip) { @@ -5244,7 +5464,7 @@ static void highbd_inv_txfm2d_add_no_identity_sse41(const int32_t *input, // 2nd stage: column transform for (int i = 0; i < buf_size_w_div8; i++) { col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, - inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); av1_round_shift_array_32_sse4_1(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, txfm_size_row, @@ -5261,6 +5481,230 @@ static void highbd_inv_txfm2d_add_no_identity_sse41(const int32_t *input, } } +static void highbd_inv_txfm2d_add_4x8_sse41(const int32_t *input, + uint16_t *output, int stride, + TX_TYPE tx_type, TX_SIZE tx_size, + int eob, const int bd) { + (void)eob; + __m128i 
buf1[8]; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const transform_1d_sse4_1 row_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0]; + const transform_1d_sse4_1 col_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][1]; + const int input_stride = AOMMIN(32, txfm_size_col); + + assert(col_txfm != NULL); + assert(row_txfm != NULL); + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + // 1st stage: column transform + __m128i buf0[8]; + const int32_t *input_row = input; + __m128i *buf0_cur = buf0; + load_buffer_32bit_input(input_row, input_stride, buf0_cur, txfm_size_row); + av1_round_shift_rect_array_32_sse4_1(buf0, buf0, txfm_size_row, 0, + NewInvSqrt2); + row_txfm(buf0, buf0, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]); + row_txfm(buf0 + 4, buf0 + 4, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, + -shift[0]); + + if (lr_flip) { + TRANSPOSE_4X4(buf0[3], buf0[2], buf0[1], buf0[0], buf1[0], buf1[1], buf1[2], + buf1[3]); + + TRANSPOSE_4X4(buf0[7], buf0[6], buf0[5], buf0[4], buf1[4], buf1[5], buf1[6], + buf1[7]); + } else { + TRANSPOSE_4X4(buf0[0], buf0[1], buf0[2], buf0[3], buf1[0], buf1[1], buf1[2], + buf1[3]); + + TRANSPOSE_4X4(buf0[4], buf0[5], buf0[6], buf0[7], buf1[4], buf1[5], buf1[6], + buf1[7]); + } + + // 2nd stage: column transform + col_txfm(buf1, buf1, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + + av1_round_shift_array_32_sse4_1(buf1, buf1, txfm_size_row, -shift[1]); + + // write to buffer + highbd_write_buffer_4xn_sse4_1(buf1, output, stride, ud_flip, txfm_size_row, + bd); +} + +static void highbd_inv_txfm2d_add_8x4_sse41(const int32_t *input, + uint16_t *output, int stride, + TX_TYPE tx_type, TX_SIZE tx_size, + int eob, const int bd) { + (void)eob; + __m128i buf1[8]; + 
const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const transform_1d_sse4_1 row_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][1]; + const transform_1d_sse4_1 col_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0]; + + assert(col_txfm != NULL); + assert(row_txfm != NULL); + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + // 1st stage: column transform + __m128i buf0[8]; + const int32_t *input_row = input; + load_buffer_32bit_input(input_row, 4, buf0, txfm_size_col); + + TRANSPOSE_4X4(buf0[0], buf0[2], buf0[4], buf0[6], buf1[0], buf1[1], buf1[2], + buf1[3]); + TRANSPOSE_4X4(buf0[1], buf0[3], buf0[5], buf0[7], buf1[4], buf1[5], buf1[6], + buf1[7]); + + av1_round_shift_rect_array_32_sse4_1(buf1, buf0, txfm_size_col, 0, + NewInvSqrt2); + row_txfm(buf0, buf0, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]); + + __m128i *buf1_ptr; + if (lr_flip) { + flip_buf_sse2(buf0, buf1, txfm_size_col); + buf1_ptr = buf1; + } else { + buf1_ptr = buf0; + } + + // 2nd stage: column transform + for (int i = 0; i < 2; i++) { + col_txfm(buf1_ptr + i * txfm_size_row, buf1_ptr + i * txfm_size_row, + av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + } + av1_round_shift_array_32_sse4_1(buf1_ptr, buf1_ptr, txfm_size_col, -shift[1]); + // write to buffer + highbd_write_buffer_8xn_sse4_1(buf1_ptr, output, stride, ud_flip, + txfm_size_row, bd); +} + +static void highbd_inv_txfm2d_add_4x16_sse4_1(const int32_t *input, + uint16_t *output, int stride, + TX_TYPE tx_type, TX_SIZE tx_size, + int eob, const int bd) { + (void)eob; + __m128i buf1[16]; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = 
tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int buf_size_h_div8 = txfm_size_row >> 2; + const transform_1d_sse4_1 row_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0]; + const transform_1d_sse4_1 col_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][2]; + const int input_stride = AOMMIN(32, txfm_size_col); + + assert(col_txfm != NULL); + assert(row_txfm != NULL); + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + // 1st stage: column transform + __m128i buf0[16]; + const int32_t *input_row = input; + __m128i *buf0_cur = buf0; + load_buffer_32bit_input(input_row, input_stride, buf0_cur, txfm_size_row); + for (int i = 0; i < (txfm_size_row >> 2); i++) { + row_txfm(buf0 + (i << 2), buf0 + (i << 2), + av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]); + } + + if (lr_flip) { + for (int j = 0; j < buf_size_h_div8; ++j) { + TRANSPOSE_4X4(buf0[4 * j + 3], buf0[4 * j + 2], buf0[4 * j + 1], + buf0[4 * j], buf1[4 * j], buf1[4 * j + 1], buf1[4 * j + 2], + buf1[4 * j + 3]); + } + } else { + for (int j = 0; j < buf_size_h_div8; ++j) { + TRANSPOSE_4X4(buf0[4 * j], buf0[4 * j + 1], buf0[4 * j + 2], + buf0[4 * j + 3], buf1[4 * j], buf1[4 * j + 1], + buf1[4 * j + 2], buf1[4 * j + 3]); + } + } + + // 2nd stage: column transform + col_txfm(buf1, buf1, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + + av1_round_shift_array_32_sse4_1(buf1, buf1, txfm_size_row, -shift[1]); + + // write to buffer + highbd_write_buffer_4xn_sse4_1(buf1, output, stride, ud_flip, txfm_size_row, + bd); +} + +static void highbd_inv_txfm2d_add_16x4_sse4_1(const int32_t *input, + uint16_t *output, int stride, + TX_TYPE tx_type, TX_SIZE tx_size, + int eob, const int bd) { + (void)eob; + __m128i buf1[16]; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + 
const int txfm_size_row = tx_size_high[tx_size]; + const int buf_size_w_div8 = txfm_size_col >> 2; + const transform_1d_sse4_1 row_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][2]; + const transform_1d_sse4_1 col_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0]; + + assert(col_txfm != NULL); + assert(row_txfm != NULL); + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + // 1st stage: column transform + __m128i buf0[16]; + const int32_t *input_row = input; + load_buffer_32bit_input(input_row, 4, buf0, txfm_size_col); + + for (int j = 0; j < buf_size_w_div8; j++) { + TRANSPOSE_4X4(buf0[j], buf0[j + 4], buf0[j + 8], buf0[j + 12], buf1[4 * j], + buf1[4 * j + 1], buf1[4 * j + 2], buf1[4 * j + 3]); + } + row_txfm(buf1, buf0, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]); + + __m128i *buf1_ptr; + if (lr_flip) { + flip_buf_sse2(buf0, buf1, txfm_size_col); + buf1_ptr = buf1; + } else { + buf1_ptr = buf0; + } + + // 2nd stage: column transform + for (int i = 0; i < buf_size_w_div8; i++) { + col_txfm(buf1_ptr + i * txfm_size_row, buf1_ptr + i * txfm_size_row, + av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + } + av1_round_shift_array_32_sse4_1(buf1_ptr, buf1_ptr, txfm_size_col, -shift[1]); + + // write to buffer + for (int i = 0; i < (txfm_size_col >> 3); i++) { + highbd_write_buffer_8xn_sse4_1(buf1_ptr + i * txfm_size_row * 2, + output + 8 * i, stride, ud_flip, + txfm_size_row, bd); + } +} + void av1_highbd_inv_txfm2d_add_universe_sse4_1(const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, TX_SIZE tx_size, @@ -5279,70 +5723,99 @@ void av1_highbd_inv_txfm2d_add_universe_sse4_1(const int32_t *input, input, CONVERT_TO_SHORTPTR(output), stride, tx_type, tx_size, eob, bd); break; + case V_DCT: + case V_ADST: + case V_FLIPADST: + highbd_inv_txfm2d_add_h_identity_ssse41( + input, CONVERT_TO_SHORTPTR(output), stride, tx_type, tx_size, eob, + bd); + break; + case H_DCT: + case 
H_ADST: + case H_FLIPADST: + highbd_inv_txfm2d_add_v_identity_ssse41( + input, CONVERT_TO_SHORTPTR(output), stride, tx_type, tx_size, eob, + bd); + break; + case IDTX: + highbd_inv_txfm2d_add_idtx_ssse41(input, CONVERT_TO_SHORTPTR(output), + stride, tx_type, tx_size, eob, bd); + break; default: assert(0); break; } } +void av1_highbd_inv_txfm_add_4x8_sse4_1(const tran_low_t *input, uint8_t *dest, + int stride, + const TxfmParam *txfm_param) { + int bd = txfm_param->bd; + const TX_TYPE tx_type = txfm_param->tx_type; + const TX_SIZE tx_size = txfm_param->tx_size; + int eob = txfm_param->eob; + highbd_inv_txfm2d_add_4x8_sse41(input, CONVERT_TO_SHORTPTR(dest), stride, + tx_type, tx_size, eob, bd); +} + +void av1_highbd_inv_txfm_add_8x4_sse4_1(const tran_low_t *input, uint8_t *dest, + int stride, + const TxfmParam *txfm_param) { + int bd = txfm_param->bd; + const TX_TYPE tx_type = txfm_param->tx_type; + const TX_SIZE tx_size = txfm_param->tx_size; + int eob = txfm_param->eob; + highbd_inv_txfm2d_add_8x4_sse41(input, CONVERT_TO_SHORTPTR(dest), stride, + tx_type, tx_size, eob, bd); +} + +void av1_highbd_inv_txfm_add_4x16_sse4_1(const tran_low_t *input, uint8_t *dest, + int stride, + const TxfmParam *txfm_param) { + int bd = txfm_param->bd; + const TX_TYPE tx_type = txfm_param->tx_type; + const TX_SIZE tx_size = txfm_param->tx_size; + int eob = txfm_param->eob; + highbd_inv_txfm2d_add_4x16_sse4_1(input, CONVERT_TO_SHORTPTR(dest), stride, + tx_type, tx_size, eob, bd); +} + +void av1_highbd_inv_txfm_add_16x4_sse4_1(const tran_low_t *input, uint8_t *dest, + int stride, + const TxfmParam *txfm_param) { + int bd = txfm_param->bd; + const TX_TYPE tx_type = txfm_param->tx_type; + const TX_SIZE tx_size = txfm_param->tx_size; + int eob = txfm_param->eob; + highbd_inv_txfm2d_add_16x4_sse4_1(input, CONVERT_TO_SHORTPTR(dest), stride, + tx_type, tx_size, eob, bd); +} + void av1_highbd_inv_txfm_add_sse4_1(const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param) { 
assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]); const TX_SIZE tx_size = txfm_param->tx_size; switch (tx_size) { - case TX_32X32: - av1_highbd_inv_txfm_add_32x32_sse4_1(input, dest, stride, txfm_param); - break; - case TX_16X16: - av1_highbd_inv_txfm_add_16x16_sse4_1(input, dest, stride, txfm_param); - break; case TX_8X8: av1_highbd_inv_txfm_add_8x8_sse4_1(input, dest, stride, txfm_param); break; case TX_4X8: - av1_highbd_inv_txfm_add_4x8(input, dest, stride, txfm_param); + av1_highbd_inv_txfm_add_4x8_sse4_1(input, dest, stride, txfm_param); break; case TX_8X4: - av1_highbd_inv_txfm_add_8x4(input, dest, stride, txfm_param); - break; - case TX_8X16: - av1_highbd_inv_txfm_add_8x16_sse4_1(input, dest, stride, txfm_param); - break; - case TX_16X8: - av1_highbd_inv_txfm_add_16x8_sse4_1(input, dest, stride, txfm_param); - break; - case TX_16X32: - av1_highbd_inv_txfm_add_16x32(input, dest, stride, txfm_param); - break; - case TX_32X16: - av1_highbd_inv_txfm_add_32x16(input, dest, stride, txfm_param); - break; - case TX_32X64: - av1_highbd_inv_txfm_add_32x64(input, dest, stride, txfm_param); - break; - case TX_64X32: - av1_highbd_inv_txfm_add_64x32(input, dest, stride, txfm_param); + av1_highbd_inv_txfm_add_8x4_sse4_1(input, dest, stride, txfm_param); break; case TX_4X4: av1_highbd_inv_txfm_add_4x4_sse4_1(input, dest, stride, txfm_param); break; case TX_16X4: - av1_highbd_inv_txfm_add_16x4(input, dest, stride, txfm_param); + av1_highbd_inv_txfm_add_16x4_sse4_1(input, dest, stride, txfm_param); break; case TX_4X16: - av1_highbd_inv_txfm_add_4x16(input, dest, stride, txfm_param); - break; - case TX_8X32: - av1_highbd_inv_txfm_add_8x32(input, dest, stride, txfm_param); + av1_highbd_inv_txfm_add_4x16_sse4_1(input, dest, stride, txfm_param); break; - case TX_32X8: - av1_highbd_inv_txfm_add_32x8(input, dest, stride, txfm_param); - break; - case TX_64X64: - case TX_16X64: - case TX_64X16: + default: av1_highbd_inv_txfm2d_add_universe_sse4_1( - input, dest, 
stride, txfm_param->tx_type, txfm_param->tx_size, - txfm_param->eob, txfm_param->bd); + input, dest, stride, txfm_param->tx_type, tx_size, txfm_param->eob, + txfm_param->bd); break; - default: assert(0 && "Invalid transform size"); break; } } diff --git a/media/libaom/src/av1/common/x86/highbd_jnt_convolve_avx2.c b/media/libaom/src/av1/common/x86/highbd_jnt_convolve_avx2.c index e298cf653..70f1ec709 100644 --- a/media/libaom/src/av1/common/x86/highbd_jnt_convolve_avx2.c +++ b/media/libaom/src/av1/common/x86/highbd_jnt_convolve_avx2.c @@ -22,23 +22,23 @@ #include "aom_dsp/aom_filter.h" #include "av1/common/convolve.h" -void av1_highbd_jnt_convolve_2d_copy_avx2( +void av1_highbd_dist_wtd_convolve_2d_copy_avx2( const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w, int h, const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, const int subpel_x_q4, - const int subpel_y_q4, ConvolveParams *conv_params, int bd) { + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params, int bd) { CONV_BUF_TYPE *dst = conv_params->dst; int dst_stride = conv_params->dst_stride; (void)filter_params_x; (void)filter_params_y; - (void)subpel_x_q4; - (void)subpel_y_q4; + (void)subpel_x_qn; + (void)subpel_y_qn; const int bits = FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0; const __m128i left_shift = _mm_cvtsi32_si128(bits); const int do_average = conv_params->do_average; - const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; const int w0 = conv_params->fwd_offset; const int w1 = conv_params->bck_offset; const __m256i wt0 = _mm256_set1_epi32(w0); @@ -78,15 +78,17 @@ void av1_highbd_jnt_convolve_2d_copy_avx2( const __m256i res_unsigned_lo = _mm256_add_epi32(res_32b_lo, offset_const); - const __m256i comp_avg_res_lo = highbd_comp_avg( - &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, 
use_jnt_comp_avg); + const __m256i comp_avg_res_lo = + highbd_comp_avg(&data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, + use_dist_wtd_comp_avg); const __m256i res_32b_hi = _mm256_unpackhi_epi16(res, zero); const __m256i res_unsigned_hi = _mm256_add_epi32(res_32b_hi, offset_const); - const __m256i comp_avg_res_hi = highbd_comp_avg( - &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg); + const __m256i comp_avg_res_hi = + highbd_comp_avg(&data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, + use_dist_wtd_comp_avg); const __m256i round_result_lo = highbd_convolve_rounding( &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift); @@ -135,8 +137,9 @@ void av1_highbd_jnt_convolve_2d_copy_avx2( const __m256i res_unsigned_lo = _mm256_add_epi32(res_32b, offset_const); - const __m256i comp_avg_res = highbd_comp_avg( - &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg); + const __m256i comp_avg_res = + highbd_comp_avg(&data_ref_0, &res_unsigned_lo, &wt0, &wt1, + use_dist_wtd_comp_avg); const __m256i round_result = highbd_convolve_rounding( &comp_avg_res, &offset_const, &rounding_const, rounding_shift); @@ -179,15 +182,17 @@ void av1_highbd_jnt_convolve_2d_copy_avx2( const __m256i res_unsigned_lo = _mm256_add_epi32(res_32b_lo, offset_const); - const __m256i comp_avg_res_lo = highbd_comp_avg( - &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg); + const __m256i comp_avg_res_lo = + highbd_comp_avg(&data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, + use_dist_wtd_comp_avg); const __m256i res_32b_hi = _mm256_unpackhi_epi16(res, zero); const __m256i res_unsigned_hi = _mm256_add_epi32(res_32b_hi, offset_const); - const __m256i comp_avg_res_hi = highbd_comp_avg( - &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg); + const __m256i comp_avg_res_hi = + highbd_comp_avg(&data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, + use_dist_wtd_comp_avg); const __m256i round_result_lo = highbd_convolve_rounding(&comp_avg_res_lo, &offset_const, @@ -223,11 
+228,11 @@ void av1_highbd_jnt_convolve_2d_copy_avx2( } } -void av1_highbd_jnt_convolve_2d_avx2( +void av1_highbd_dist_wtd_convolve_2d_avx2( const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w, int h, const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, const int subpel_x_q4, - const int subpel_y_q4, ConvolveParams *conv_params, int bd) { + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params, int bd) { DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]); CONV_BUF_TYPE *dst = conv_params->dst; int dst_stride = conv_params->dst_stride; @@ -244,7 +249,7 @@ void av1_highbd_jnt_convolve_2d_avx2( __m256i s[8], coeffs_y[4], coeffs_x[4]; const int do_average = conv_params->do_average; - const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; const int w0 = conv_params->fwd_offset; const int w1 = conv_params->bck_offset; @@ -272,8 +277,8 @@ void av1_highbd_jnt_convolve_2d_avx2( const __m256i clip_pixel_to_bd = _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 
4095 : 255)); - prepare_coeffs(filter_params_x, subpel_x_q4, coeffs_x); - prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y); + prepare_coeffs(filter_params_x, subpel_x_qn, coeffs_x); + prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y); for (j = 0; j < w; j += 8) { /* Horizontal filter */ @@ -364,8 +369,9 @@ void av1_highbd_jnt_convolve_2d_avx2( const __m256i data_ref_0 = _mm256_unpacklo_epi16(data_01, zero); - const __m256i comp_avg_res = highbd_comp_avg( - &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg); + const __m256i comp_avg_res = + highbd_comp_avg(&data_ref_0, &res_unsigned_lo, &wt0, &wt1, + use_dist_wtd_comp_avg); const __m256i round_result = highbd_convolve_rounding( &comp_avg_res, &offset_const, &rounding_const, rounding_shift); @@ -409,10 +415,12 @@ void av1_highbd_jnt_convolve_2d_avx2( const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_01, zero); const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_01, zero); - const __m256i comp_avg_res_lo = highbd_comp_avg( - &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg); - const __m256i comp_avg_res_hi = highbd_comp_avg( - &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg); + const __m256i comp_avg_res_lo = + highbd_comp_avg(&data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, + use_dist_wtd_comp_avg); + const __m256i comp_avg_res_hi = + highbd_comp_avg(&data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, + use_dist_wtd_comp_avg); const __m256i round_result_lo = highbd_convolve_rounding(&comp_avg_res_lo, &offset_const, @@ -456,24 +464,24 @@ void av1_highbd_jnt_convolve_2d_avx2( } } -void av1_highbd_jnt_convolve_x_avx2( +void av1_highbd_dist_wtd_convolve_x_avx2( const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w, int h, const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, const int subpel_x_q4, - const int subpel_y_q4, ConvolveParams *conv_params, int bd) { + const InterpFilterParams *filter_params_y, const int 
subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params, int bd) { CONV_BUF_TYPE *dst = conv_params->dst; int dst_stride = conv_params->dst_stride; const int fo_horiz = filter_params_x->taps / 2 - 1; const uint16_t *const src_ptr = src - fo_horiz; const int bits = FILTER_BITS - conv_params->round_1; (void)filter_params_y; - (void)subpel_y_q4; + (void)subpel_y_qn; int i, j; __m256i s[4], coeffs_x[4]; const int do_average = conv_params->do_average; - const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; const int w0 = conv_params->fwd_offset; const int w1 = conv_params->bck_offset; const __m256i wt0 = _mm256_set1_epi32(w0); @@ -496,7 +504,7 @@ void av1_highbd_jnt_convolve_x_avx2( _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255)); assert(bits >= 0); - prepare_coeffs(filter_params_x, subpel_x_q4, coeffs_x); + prepare_coeffs(filter_params_x, subpel_x_qn, coeffs_x); for (j = 0; j < w; j += 8) { /* Horizontal filter */ @@ -548,7 +556,7 @@ void av1_highbd_jnt_convolve_x_avx2( const __m256i data_ref_0 = _mm256_unpacklo_epi16(data_01, zero); const __m256i comp_avg_res = highbd_comp_avg( - &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg); + &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_dist_wtd_comp_avg); const __m256i round_result = highbd_convolve_rounding( &comp_avg_res, &offset_const, &rounding_const, rounding_shift); @@ -588,10 +596,12 @@ void av1_highbd_jnt_convolve_x_avx2( const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_01, zero); const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_01, zero); - const __m256i comp_avg_res_lo = highbd_comp_avg( - &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg); - const __m256i comp_avg_res_hi = highbd_comp_avg( - &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg); + const __m256i comp_avg_res_lo = + highbd_comp_avg(&data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, + use_dist_wtd_comp_avg); 
+ const __m256i comp_avg_res_hi = + highbd_comp_avg(&data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, + use_dist_wtd_comp_avg); const __m256i round_result_lo = highbd_convolve_rounding( &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift); @@ -623,24 +633,24 @@ void av1_highbd_jnt_convolve_x_avx2( } } -void av1_highbd_jnt_convolve_y_avx2( +void av1_highbd_dist_wtd_convolve_y_avx2( const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w, int h, const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, const int subpel_x_q4, - const int subpel_y_q4, ConvolveParams *conv_params, int bd) { + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params, int bd) { CONV_BUF_TYPE *dst = conv_params->dst; int dst_stride = conv_params->dst_stride; const int fo_vert = filter_params_y->taps / 2 - 1; const uint16_t *const src_ptr = src - fo_vert * src_stride; const int bits = FILTER_BITS - conv_params->round_0; (void)filter_params_x; - (void)subpel_x_q4; + (void)subpel_x_qn; assert(bits >= 0); int i, j; __m256i s[8], coeffs_y[4]; const int do_average = conv_params->do_average; - const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; const int w0 = conv_params->fwd_offset; const int w1 = conv_params->bck_offset; @@ -662,7 +672,7 @@ void av1_highbd_jnt_convolve_y_avx2( _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 
4095 : 255)); const __m256i zero = _mm256_setzero_si256(); - prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y); + prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y); for (j = 0; j < w; j += 8) { const uint16_t *data = &src_ptr[j]; @@ -753,8 +763,9 @@ void av1_highbd_jnt_convolve_y_avx2( const __m256i data_ref_0 = _mm256_unpacklo_epi16(data_01, zero); - const __m256i comp_avg_res = highbd_comp_avg( - &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg); + const __m256i comp_avg_res = + highbd_comp_avg(&data_ref_0, &res_unsigned_lo, &wt0, &wt1, + use_dist_wtd_comp_avg); const __m256i round_result = highbd_convolve_rounding( &comp_avg_res, &offset_const, &rounding_const, rounding_shift); @@ -799,10 +810,12 @@ void av1_highbd_jnt_convolve_y_avx2( const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_01, zero); const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_01, zero); - const __m256i comp_avg_res_lo = highbd_comp_avg( - &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg); - const __m256i comp_avg_res_hi = highbd_comp_avg( - &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg); + const __m256i comp_avg_res_lo = + highbd_comp_avg(&data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, + use_dist_wtd_comp_avg); + const __m256i comp_avg_res_hi = + highbd_comp_avg(&data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, + use_dist_wtd_comp_avg); const __m256i round_result_lo = highbd_convolve_rounding(&comp_avg_res_lo, &offset_const, diff --git a/media/libaom/src/av1/common/x86/highbd_jnt_convolve_sse4.c b/media/libaom/src/av1/common/x86/highbd_jnt_convolve_sse4.c index 1a29985b5..f033a6f94 100644 --- a/media/libaom/src/av1/common/x86/highbd_jnt_convolve_sse4.c +++ b/media/libaom/src/av1/common/x86/highbd_jnt_convolve_sse4.c @@ -17,23 +17,23 @@ #include "aom_dsp/x86/convolve_sse2.h" #include "aom_dsp/x86/convolve_sse4_1.h" -void av1_highbd_jnt_convolve_y_sse4_1( +void av1_highbd_dist_wtd_convolve_y_sse4_1( const uint16_t *src, int src_stride, 
uint16_t *dst0, int dst_stride0, int w, int h, const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, const int subpel_x_q4, - const int subpel_y_q4, ConvolveParams *conv_params, int bd) { + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params, int bd) { CONV_BUF_TYPE *dst = conv_params->dst; int dst_stride = conv_params->dst_stride; const int fo_vert = filter_params_y->taps / 2 - 1; const uint16_t *const src_ptr = src - fo_vert * src_stride; const int bits = FILTER_BITS - conv_params->round_0; (void)filter_params_x; - (void)subpel_x_q4; + (void)subpel_x_qn; assert(bits >= 0); int i, j; const int do_average = conv_params->do_average; - const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; const int w0 = conv_params->fwd_offset; const int w1 = conv_params->bck_offset; @@ -56,7 +56,7 @@ void av1_highbd_jnt_convolve_y_sse4_1( const __m128i zero = _mm_setzero_si128(); __m128i s[16], coeffs_y[4]; - prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y); + prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y); for (j = 0; j < w; j += 8) { const uint16_t *data = &src_ptr[j]; @@ -121,10 +121,12 @@ void av1_highbd_jnt_convolve_y_sse4_1( const __m128i data_ref_0 = _mm_unpacklo_epi16(data_0, zero); const __m128i data_ref_1 = _mm_unpacklo_epi16(data_1, zero); - const __m128i comp_avg_res_0 = highbd_comp_avg_sse4_1( - &data_ref_0, &res_unsigned_lo_0, &wt0, &wt1, use_jnt_comp_avg); - const __m128i comp_avg_res_1 = highbd_comp_avg_sse4_1( - &data_ref_1, &res_unsigned_lo_1, &wt0, &wt1, use_jnt_comp_avg); + const __m128i comp_avg_res_0 = + highbd_comp_avg_sse4_1(&data_ref_0, &res_unsigned_lo_0, &wt0, + &wt1, use_dist_wtd_comp_avg); + const __m128i comp_avg_res_1 = + highbd_comp_avg_sse4_1(&data_ref_1, &res_unsigned_lo_1, &wt0, + &wt1, use_dist_wtd_comp_avg); const __m128i round_result_0 = 
highbd_convolve_rounding_sse2(&comp_avg_res_0, &offset_const, @@ -186,16 +188,16 @@ void av1_highbd_jnt_convolve_y_sse4_1( const __m128i comp_avg_res_lo_0 = highbd_comp_avg_sse4_1(&data_ref_0_lo_0, &res_unsigned_lo_0, - &wt0, &wt1, use_jnt_comp_avg); + &wt0, &wt1, use_dist_wtd_comp_avg); const __m128i comp_avg_res_lo_1 = highbd_comp_avg_sse4_1(&data_ref_0_lo_1, &res_unsigned_lo_1, - &wt0, &wt1, use_jnt_comp_avg); + &wt0, &wt1, use_dist_wtd_comp_avg); const __m128i comp_avg_res_hi_0 = highbd_comp_avg_sse4_1(&data_ref_0_hi_0, &res_unsigned_hi_0, - &wt0, &wt1, use_jnt_comp_avg); + &wt0, &wt1, use_dist_wtd_comp_avg); const __m128i comp_avg_res_hi_1 = highbd_comp_avg_sse4_1(&data_ref_0_hi_1, &res_unsigned_hi_1, - &wt0, &wt1, use_jnt_comp_avg); + &wt0, &wt1, use_dist_wtd_comp_avg); const __m128i round_result_lo_0 = highbd_convolve_rounding_sse2(&comp_avg_res_lo_0, &offset_const, @@ -257,24 +259,24 @@ void av1_highbd_jnt_convolve_y_sse4_1( } } -void av1_highbd_jnt_convolve_x_sse4_1( +void av1_highbd_dist_wtd_convolve_x_sse4_1( const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w, int h, const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, const int subpel_x_q4, - const int subpel_y_q4, ConvolveParams *conv_params, int bd) { + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params, int bd) { CONV_BUF_TYPE *dst = conv_params->dst; int dst_stride = conv_params->dst_stride; const int fo_horiz = filter_params_x->taps / 2 - 1; const uint16_t *const src_ptr = src - fo_horiz; const int bits = FILTER_BITS - conv_params->round_1; (void)filter_params_y; - (void)subpel_y_q4; + (void)subpel_y_qn; int i, j; __m128i s[4], coeffs_x[4]; const int do_average = conv_params->do_average; - const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; const int w0 = conv_params->fwd_offset; const int w1 = 
conv_params->bck_offset; const __m128i wt0 = _mm_set1_epi32(w0); @@ -297,7 +299,7 @@ void av1_highbd_jnt_convolve_x_sse4_1( _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255)); assert(bits >= 0); - prepare_coeffs(filter_params_x, subpel_x_q4, coeffs_x); + prepare_coeffs(filter_params_x, subpel_x_qn, coeffs_x); for (j = 0; j < w; j += 8) { /* Horizontal filter */ @@ -339,7 +341,7 @@ void av1_highbd_jnt_convolve_x_sse4_1( const __m128i data_ref_0 = _mm_unpacklo_epi16(data_0, zero); const __m128i comp_avg_res = highbd_comp_avg_sse4_1( - &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg); + &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_dist_wtd_comp_avg); const __m128i round_result = highbd_convolve_rounding_sse2( &comp_avg_res, &offset_const, &rounding_const, rounding_shift); @@ -359,10 +361,12 @@ void av1_highbd_jnt_convolve_x_sse4_1( const __m128i data_ref_0_lo = _mm_unpacklo_epi16(data_0, zero); const __m128i data_ref_0_hi = _mm_unpackhi_epi16(data_0, zero); - const __m128i comp_avg_res_lo = highbd_comp_avg_sse4_1( - &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg); - const __m128i comp_avg_res_hi = highbd_comp_avg_sse4_1( - &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg); + const __m128i comp_avg_res_lo = + highbd_comp_avg_sse4_1(&data_ref_0_lo, &res_unsigned_lo, &wt0, + &wt1, use_dist_wtd_comp_avg); + const __m128i comp_avg_res_hi = + highbd_comp_avg_sse4_1(&data_ref_0_hi, &res_unsigned_hi, &wt0, + &wt1, use_dist_wtd_comp_avg); const __m128i round_result_lo = highbd_convolve_rounding_sse2( &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift); diff --git a/media/libaom/src/av1/common/x86/highbd_txfm_utility_sse4.h b/media/libaom/src/av1/common/x86/highbd_txfm_utility_sse4.h index 6f24e5948..5734810f5 100644 --- a/media/libaom/src/av1/common/x86/highbd_txfm_utility_sse4.h +++ b/media/libaom/src/av1/common/x86/highbd_txfm_utility_sse4.h @@ -75,13 +75,20 @@ static INLINE void transpose_16x16(const __m128i 
*in, __m128i *out) { out[63]); } -static INLINE void transpose_32x32(const __m128i *input, __m128i *output) { - for (int j = 0; j < 8; j++) { - for (int i = 0; i < 8; i++) { - TRANSPOSE_4X4(input[i * 32 + j + 0], input[i * 32 + j + 8], - input[i * 32 + j + 16], input[i * 32 + j + 24], - output[j * 32 + i + 0], output[j * 32 + i + 8], - output[j * 32 + i + 16], output[j * 32 + i + 24]); +static INLINE void transpose_8nx8n(const __m128i *input, __m128i *output, + const int width, const int height) { + const int numcol = height >> 2; + const int numrow = width >> 2; + for (int j = 0; j < numrow; j++) { + for (int i = 0; i < numcol; i++) { + TRANSPOSE_4X4(input[i * width + j + (numrow * 0)], + input[i * width + j + (numrow * 1)], + input[i * width + j + (numrow * 2)], + input[i * width + j + (numrow * 3)], + output[j * height + i + (numcol * 0)], + output[j * height + i + (numcol * 1)], + output[j * height + i + (numcol * 2)], + output[j * height + i + (numcol * 3)]); } } } diff --git a/media/libaom/src/av1/common/x86/highbd_warp_plane_sse4.c b/media/libaom/src/av1/common/x86/highbd_warp_plane_sse4.c index 4bcab0564..60a819308 100644 --- a/media/libaom/src/av1/common/x86/highbd_warp_plane_sse4.c +++ b/media/libaom/src/av1/common/x86/highbd_warp_plane_sse4.c @@ -15,9 +15,9 @@ #include "av1/common/warped_motion.h" -static const uint8_t warp_highbd_arrange_bytes[16] = { - 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 -}; +static const uint8_t warp_highbd_arrange_bytes[16] = { 0, 2, 4, 6, 8, 10, + 12, 14, 1, 3, 5, 7, + 9, 11, 13, 15 }; static const uint8_t highbd_shuffle_alpha0_mask0[16] = { 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 @@ -25,24 +25,28 @@ static const uint8_t highbd_shuffle_alpha0_mask0[16] = { static const uint8_t highbd_shuffle_alpha0_mask1[16] = { 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7 }; -static const uint8_t highbd_shuffle_alpha0_mask2[16] = { - 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11 -}; -static const uint8_t 
highbd_shuffle_alpha0_mask3[16] = { - 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15 -}; +static const uint8_t highbd_shuffle_alpha0_mask2[16] = { 8, 9, 10, 11, 8, 9, + 10, 11, 8, 9, 10, 11, + 8, 9, 10, 11 }; +static const uint8_t highbd_shuffle_alpha0_mask3[16] = { 12, 13, 14, 15, 12, 13, + 14, 15, 12, 13, 14, 15, + 12, 13, 14, 15 }; static INLINE void highbd_prepare_horizontal_filter_coeff(int alpha, int sx, __m128i *coeff) { // Filter even-index pixels - const __m128i tmp_0 = _mm_loadu_si128( - (__m128i *)(warped_filter + ((sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS))); - const __m128i tmp_2 = _mm_loadu_si128( - (__m128i *)(warped_filter + ((sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS))); - const __m128i tmp_4 = _mm_loadu_si128( - (__m128i *)(warped_filter + ((sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS))); - const __m128i tmp_6 = _mm_loadu_si128( - (__m128i *)(warped_filter + ((sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_0 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_2 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_4 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_6 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS))); // coeffs 0 1 0 1 2 3 2 3 for pixels 0, 2 const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2); @@ -63,14 +67,18 @@ static INLINE void highbd_prepare_horizontal_filter_coeff(int alpha, int sx, coeff[6] = _mm_unpackhi_epi64(tmp_12, tmp_14); // Filter odd-index pixels - const __m128i tmp_1 = _mm_loadu_si128( - (__m128i *)(warped_filter + ((sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS))); - const __m128i tmp_3 = _mm_loadu_si128( - (__m128i *)(warped_filter + ((sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS))); - const __m128i tmp_5 = _mm_loadu_si128( - (__m128i 
*)(warped_filter + ((sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS))); - const __m128i tmp_7 = _mm_loadu_si128( - (__m128i *)(warped_filter + ((sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_1 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_3 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_5 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_7 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS))); const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3); const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7); @@ -87,7 +95,7 @@ static INLINE void highbd_prepare_horizontal_filter_coeff_alpha0( int sx, __m128i *coeff) { // Filter coeff const __m128i tmp_0 = _mm_loadu_si128( - (__m128i *)(warped_filter + (sx >> WARPEDDIFF_PREC_BITS))); + (__m128i *)(av1_warped_filter + (sx >> WARPEDDIFF_PREC_BITS))); coeff[0] = _mm_shuffle_epi8( tmp_0, _mm_loadu_si128((__m128i *)highbd_shuffle_alpha0_mask0)); @@ -454,16 +462,16 @@ void av1_highbd_warp_affine_sse4_1(const int32_t *mat, const uint16_t *ref, // Filter even-index pixels const __m128i tmp_0 = _mm_loadu_si128( - (__m128i *)(warped_filter + + (__m128i *)(av1_warped_filter + ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS))); const __m128i tmp_2 = _mm_loadu_si128( - (__m128i *)(warped_filter + + (__m128i *)(av1_warped_filter + ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS))); const __m128i tmp_4 = _mm_loadu_si128( - (__m128i *)(warped_filter + + (__m128i *)(av1_warped_filter + ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS))); const __m128i tmp_6 = _mm_loadu_si128( - (__m128i *)(warped_filter + + (__m128i *)(av1_warped_filter + ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS))); const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2); @@ -491,16 +499,16 @@ void 
av1_highbd_warp_affine_sse4_1(const int32_t *mat, const uint16_t *ref, const __m128i src_7 = _mm_unpackhi_epi16(src[6], src[7]); const __m128i tmp_1 = _mm_loadu_si128( - (__m128i *)(warped_filter + + (__m128i *)(av1_warped_filter + ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS))); const __m128i tmp_3 = _mm_loadu_si128( - (__m128i *)(warped_filter + + (__m128i *)(av1_warped_filter + ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS))); const __m128i tmp_5 = _mm_loadu_si128( - (__m128i *)(warped_filter + + (__m128i *)(av1_warped_filter + ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS))); const __m128i tmp_7 = _mm_loadu_si128( - (__m128i *)(warped_filter + + (__m128i *)(av1_warped_filter + ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS))); const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3); @@ -537,7 +545,7 @@ void av1_highbd_warp_affine_sse4_1(const int32_t *mat, const uint16_t *ref, __m128i *const dst16 = (__m128i *)&pred[(i + k + 4) * p_stride + j]; __m128i p_32 = _mm_cvtepu16_epi32(_mm_loadl_epi64(p)); - if (conv_params->use_jnt_comp_avg) { + if (conv_params->use_dist_wtd_comp_avg) { res_lo = _mm_add_epi32(_mm_mullo_epi32(p_32, wt0), _mm_mullo_epi32(res_lo, wt1)); res_lo = _mm_srai_epi32(res_lo, DIST_PRECISION_BITS); @@ -570,7 +578,7 @@ void av1_highbd_warp_affine_sse4_1(const int32_t *mat, const uint16_t *ref, (__m128i *)&pred[(i + k + 4) * p_stride + j + 4]; __m128i p4_32 = _mm_cvtepu16_epi32(_mm_loadl_epi64(p4)); - if (conv_params->use_jnt_comp_avg) { + if (conv_params->use_dist_wtd_comp_avg) { res_hi = _mm_add_epi32(_mm_mullo_epi32(p4_32, wt0), _mm_mullo_epi32(res_hi, wt1)); res_hi = _mm_srai_epi32(res_hi, DIST_PRECISION_BITS); diff --git a/media/libaom/src/av1/common/x86/intra_edge_sse4.c b/media/libaom/src/av1/common/x86/intra_edge_sse4.c index 0c857b583..fc69f41d7 100644 --- a/media/libaom/src/av1/common/x86/intra_edge_sse4.c +++ b/media/libaom/src/av1/common/x86/intra_edge_sse4.c @@ -212,10 +212,10 @@ void av1_upsample_intra_edge_sse4_1(uint8_t *p, int sz) { { -1, 9, 9, -1, 
-1, 9, 9, -1, -1, 9, 9, -1, -1, 9, 9, -1 } }; - DECLARE_ALIGNED(16, static const int8_t, v_const[2][16]) = { - { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }, - { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } - }; + DECLARE_ALIGNED( + 16, static const int8_t, + v_const[2][16]) = { { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }, + { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } }; // Extend first/last samples (upper-left p[-1], last p[sz-1]) // to support 4-tap filter diff --git a/media/libaom/src/av1/common/x86/jnt_convolve_avx2.c b/media/libaom/src/av1/common/x86/jnt_convolve_avx2.c index 9f2e2b457..6de61573e 100644 --- a/media/libaom/src/av1/common/x86/jnt_convolve_avx2.c +++ b/media/libaom/src/av1/common/x86/jnt_convolve_avx2.c @@ -23,8 +23,8 @@ static INLINE __m256i unpack_weights_avx2(ConvolveParams *conv_params) { const int w0 = conv_params->fwd_offset; const int w1 = conv_params->bck_offset; - const __m256i wt0 = _mm256_set1_epi16(w0); - const __m256i wt1 = _mm256_set1_epi16(w1); + const __m256i wt0 = _mm256_set1_epi16((int16_t)w0); + const __m256i wt1 = _mm256_set1_epi16((int16_t)w1); const __m256i wt = _mm256_unpacklo_epi16(wt0, wt1); return wt; } @@ -35,22 +35,20 @@ static INLINE __m256i load_line2_avx2(const void *a, const void *b) { _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)b)), 0x20); } -void av1_jnt_convolve_x_avx2(const uint8_t *src, int src_stride, uint8_t *dst0, - int dst_stride0, int w, int h, - const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, - ConvolveParams *conv_params) { +void av1_dist_wtd_convolve_x_avx2(const uint8_t *src, int src_stride, + uint8_t *dst0, int dst_stride0, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params) { CONV_BUF_TYPE *dst = conv_params->dst; int dst_stride = 
conv_params->dst_stride; const int bd = 8; - int i, j; - const int fo_horiz = filter_params_x->taps / 2 - 1; - const uint8_t *const src_ptr = src - fo_horiz; + int i, j, is_horiz_4tap = 0; const int bits = FILTER_BITS - conv_params->round_1; const __m256i wt = unpack_weights_avx2(conv_params); const int do_average = conv_params->do_average; - const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; const int offset_0 = bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); @@ -58,87 +56,147 @@ void av1_jnt_convolve_x_avx2(const uint8_t *src, int src_stride, uint8_t *dst0, const int rounding_shift = 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; const __m256i rounding_const = _mm256_set1_epi16((1 << rounding_shift) >> 1); - __m256i filt[4], coeffs[4]; assert(bits >= 0); assert(conv_params->round_0 > 0); - filt[0] = _mm256_load_si256((__m256i const *)filt_global_avx2); - filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32)); - filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); - filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3)); - - prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs); - const __m256i round_const = _mm256_set1_epi16((1 << (conv_params->round_0 - 1)) >> 1); const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0 - 1); (void)filter_params_y; - (void)subpel_y_q4; + (void)subpel_y_qn; - for (i = 0; i < h; i += 2) { - const uint8_t *src_data = src_ptr + i * src_stride; - CONV_BUF_TYPE *dst_data = dst + i * dst_stride; - for (j = 0; j < w; j += 8) { - const __m256i data = - load_line2_avx2(&src_data[j], &src_data[j + src_stride]); + __m256i filt[4], coeffs[4]; - __m256i res = convolve_lowbd_x(data, coeffs, filt); + filt[0] = _mm256_load_si256((__m256i const *)filt_global_avx2); + filt[1] = _mm256_load_si256((__m256i 
const *)(filt_global_avx2 + 32)); - res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const), round_shift); + prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs); - res = _mm256_slli_epi16(res, bits); + // Condition for checking valid horz_filt taps + if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs[0], coeffs[3]), 0))) + is_horiz_4tap = 1; - const __m256i res_unsigned = _mm256_add_epi16(res, offset_const); + // horz_filt as 4 tap + if (is_horiz_4tap) { + const int fo_horiz = 1; + const uint8_t *const src_ptr = src - fo_horiz; + for (i = 0; i < h; i += 2) { + const uint8_t *src_data = src_ptr + i * src_stride; + CONV_BUF_TYPE *dst_data = dst + i * dst_stride; + for (j = 0; j < w; j += 8) { + const __m256i data = + load_line2_avx2(&src_data[j], &src_data[j + src_stride]); - // Accumulate values into the destination buffer - if (do_average) { - const __m256i data_ref_0 = - load_line2_avx2(&dst_data[j], &dst_data[j + dst_stride]); - const __m256i comp_avg_res = - comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg); + __m256i res = convolve_lowbd_x_4tap(data, coeffs + 1, filt); + res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const), round_shift); + res = _mm256_slli_epi16(res, bits); - const __m256i round_result = convolve_rounding( - &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + const __m256i res_unsigned = _mm256_add_epi16(res, offset_const); - const __m256i res_8 = _mm256_packus_epi16(round_result, round_result); - const __m128i res_0 = _mm256_castsi256_si128(res_8); - const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); + // Accumulate values into the destination buffer + if (do_average) { + const __m256i data_ref_0 = + load_line2_avx2(&dst_data[j], &dst_data[j + dst_stride]); + const __m256i comp_avg_res = + comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); - if (w > 4) { - _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); - _mm_storel_epi64( - (__m128i *)((&dst0[i * dst_stride0 + j + 
dst_stride0])), res_1); + const __m256i round_result = convolve_rounding( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + + const __m256i res_8 = _mm256_packus_epi16(round_result, round_result); + const __m128i res_0 = _mm256_castsi256_si128(res_8); + const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); + + if (w > 4) { + _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); + _mm_storel_epi64( + (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1); + } else { + *(uint32_t *)(&dst0[i * dst_stride0 + j]) = + _mm_cvtsi128_si32(res_0); + *(uint32_t *)(&dst0[i * dst_stride0 + j + dst_stride0]) = + _mm_cvtsi128_si32(res_1); + } } else { - *(uint32_t *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_0); - *(uint32_t *)(&dst0[i * dst_stride0 + j + dst_stride0]) = - _mm_cvtsi128_si32(res_1); + const __m128i res_0 = _mm256_castsi256_si128(res_unsigned); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); + + const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), + res_1); } - } else { - const __m128i res_0 = _mm256_castsi256_si128(res_unsigned); - _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); + } + } + } else { + const int fo_horiz = filter_params_x->taps / 2 - 1; + const uint8_t *const src_ptr = src - fo_horiz; + + filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); + filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3)); + for (i = 0; i < h; i += 2) { + const uint8_t *src_data = src_ptr + i * src_stride; + CONV_BUF_TYPE *dst_data = dst + i * dst_stride; + for (j = 0; j < w; j += 8) { + const __m256i data = + load_line2_avx2(&src_data[j], &src_data[j + src_stride]); + + __m256i res = convolve_lowbd_x(data, coeffs, filt); + + res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const), round_shift); + + res = _mm256_slli_epi16(res, bits); + + const __m256i res_unsigned = 
_mm256_add_epi16(res, offset_const); + + // Accumulate values into the destination buffer + if (do_average) { + const __m256i data_ref_0 = + load_line2_avx2(&dst_data[j], &dst_data[j + dst_stride]); + const __m256i comp_avg_res = + comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); + + const __m256i round_result = convolve_rounding( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + + const __m256i res_8 = _mm256_packus_epi16(round_result, round_result); + const __m128i res_0 = _mm256_castsi256_si128(res_8); + const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); - const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1); - _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), - res_1); + if (w > 4) { + _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); + _mm_storel_epi64( + (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1); + } else { + *(uint32_t *)(&dst0[i * dst_stride0 + j]) = + _mm_cvtsi128_si32(res_0); + *(uint32_t *)(&dst0[i * dst_stride0 + j + dst_stride0]) = + _mm_cvtsi128_si32(res_1); + } + } else { + const __m128i res_0 = _mm256_castsi256_si128(res_unsigned); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); + + const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), + res_1); + } } } } } -void av1_jnt_convolve_y_avx2(const uint8_t *src, int src_stride, uint8_t *dst0, - int dst_stride0, int w, int h, - const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, - ConvolveParams *conv_params) { +void av1_dist_wtd_convolve_y_avx2(const uint8_t *src, int src_stride, + uint8_t *dst0, int dst_stride0, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params) { CONV_BUF_TYPE *dst = 
conv_params->dst; int dst_stride = conv_params->dst_stride; const int bd = 8; - int i, j; - const int fo_vert = filter_params_y->taps / 2 - 1; - const uint8_t *const src_ptr = src - fo_vert * src_stride; + int i, j, is_vert_4tap = 0; // +1 to compensate for dividing the filter coeffs by 2 const int left_shift = FILTER_BITS - conv_params->round_0 + 1; const __m256i round_const = @@ -146,7 +204,7 @@ void av1_jnt_convolve_y_avx2(const uint8_t *src, int src_stride, uint8_t *dst0, const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1); const __m256i wt = unpack_weights_avx2(conv_params); const int do_average = conv_params->do_average; - const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; const int offset_0 = bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); @@ -162,201 +220,395 @@ void av1_jnt_convolve_y_avx2(const uint8_t *src, int src_stride, uint8_t *dst0, assert((FILTER_BITS - conv_params->round_0) >= 0); - prepare_coeffs_lowbd(filter_params_y, subpel_y_q4, coeffs); + prepare_coeffs_lowbd(filter_params_y, subpel_y_qn, coeffs); (void)conv_params; (void)filter_params_x; - (void)subpel_x_q4; - - for (j = 0; j < w; j += 16) { - const uint8_t *data = &src_ptr[j]; - __m256i src6; - // Load lines a and b. 
Line a to lower 128, line b to upper 128 - { - __m256i src_ab[7]; - __m256i src_a[7]; - src_a[0] = _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data)); - for (int kk = 0; kk < 6; ++kk) { - data += src_stride; - src_a[kk + 1] = - _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data)); - src_ab[kk] = _mm256_permute2x128_si256(src_a[kk], src_a[kk + 1], 0x20); + (void)subpel_x_qn; + + // Condition for checking valid vert_filt taps + if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs[0], coeffs[3]), 0))) + is_vert_4tap = 1; + + if (is_vert_4tap) { + const int fo_vert = 1; + const uint8_t *const src_ptr = src - fo_vert * src_stride; + for (j = 0; j < w; j += 16) { + const uint8_t *data = &src_ptr[j]; + __m256i src4; + // Load lines a and b. Line a to lower 128, line b to upper 128 + { + __m256i src_ab[4]; + __m256i src_a[5]; + src_a[0] = _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data)); + for (int kk = 0; kk < 4; ++kk) { + data += src_stride; + src_a[kk + 1] = + _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data)); + src_ab[kk] = + _mm256_permute2x128_si256(src_a[kk], src_a[kk + 1], 0x20); + } + src4 = src_a[4]; + s[0] = _mm256_unpacklo_epi8(src_ab[0], src_ab[1]); + s[1] = _mm256_unpacklo_epi8(src_ab[2], src_ab[3]); + + s[3] = _mm256_unpackhi_epi8(src_ab[0], src_ab[1]); + s[4] = _mm256_unpackhi_epi8(src_ab[2], src_ab[3]); } - src6 = src_a[6]; - s[0] = _mm256_unpacklo_epi8(src_ab[0], src_ab[1]); - s[1] = _mm256_unpacklo_epi8(src_ab[2], src_ab[3]); - s[2] = _mm256_unpacklo_epi8(src_ab[4], src_ab[5]); - s[4] = _mm256_unpackhi_epi8(src_ab[0], src_ab[1]); - s[5] = _mm256_unpackhi_epi8(src_ab[2], src_ab[3]); - s[6] = _mm256_unpackhi_epi8(src_ab[4], src_ab[5]); - } - for (i = 0; i < h; i += 2) { - data = &src_ptr[(i + 7) * src_stride + j]; - const __m256i src7 = - _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data)); - const __m256i src_67a = _mm256_permute2x128_si256(src6, src7, 0x20); + for (i = 0; i < h; i += 2) { + data = &src_ptr[(i + 5) * src_stride 
+ j]; + const __m256i src5 = + _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data)); + const __m256i src_45a = _mm256_permute2x128_si256(src4, src5, 0x20); - src6 = _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + src_stride))); - const __m256i src_78a = _mm256_permute2x128_si256(src7, src6, 0x20); + src4 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + src_stride))); + const __m256i src_56a = _mm256_permute2x128_si256(src5, src4, 0x20); - s[3] = _mm256_unpacklo_epi8(src_67a, src_78a); - s[7] = _mm256_unpackhi_epi8(src_67a, src_78a); + s[2] = _mm256_unpacklo_epi8(src_45a, src_56a); + s[5] = _mm256_unpackhi_epi8(src_45a, src_56a); - __m256i res_lo = convolve_lowbd(s, coeffs); + __m256i res_lo = convolve_lowbd_4tap(s, coeffs + 1); - res_lo = _mm256_add_epi16(res_lo, offset_const_1); + res_lo = _mm256_add_epi16(res_lo, offset_const_1); - const __m256i res_lo_0_32b = _mm256_unpacklo_epi16(res_lo, zero); - const __m256i res_lo_0_shift = - _mm256_slli_epi32(res_lo_0_32b, left_shift); - const __m256i res_lo_0_round = _mm256_sra_epi32( - _mm256_add_epi32(res_lo_0_shift, round_const), round_shift); + const __m256i res_lo_0_32b = _mm256_unpacklo_epi16(res_lo, zero); + const __m256i res_lo_0_shift = + _mm256_slli_epi32(res_lo_0_32b, left_shift); + const __m256i res_lo_0_round = _mm256_sra_epi32( + _mm256_add_epi32(res_lo_0_shift, round_const), round_shift); - const __m256i res_lo_1_32b = _mm256_unpackhi_epi16(res_lo, zero); - const __m256i res_lo_1_shift = - _mm256_slli_epi32(res_lo_1_32b, left_shift); - const __m256i res_lo_1_round = _mm256_sra_epi32( - _mm256_add_epi32(res_lo_1_shift, round_const), round_shift); + const __m256i res_lo_1_32b = _mm256_unpackhi_epi16(res_lo, zero); + const __m256i res_lo_1_shift = + _mm256_slli_epi32(res_lo_1_32b, left_shift); + const __m256i res_lo_1_round = _mm256_sra_epi32( + _mm256_add_epi32(res_lo_1_shift, round_const), round_shift); - const __m256i res_lo_round = - _mm256_packs_epi32(res_lo_0_round, 
res_lo_1_round); + const __m256i res_lo_round = + _mm256_packs_epi32(res_lo_0_round, res_lo_1_round); - const __m256i res_lo_unsigned = - _mm256_add_epi16(res_lo_round, offset_const_2); + const __m256i res_lo_unsigned = + _mm256_add_epi16(res_lo_round, offset_const_2); - if (w - j < 16) { - if (do_average) { - const __m256i data_ref_0 = load_line2_avx2( - &dst[i * dst_stride + j], &dst[i * dst_stride + j + dst_stride]); - const __m256i comp_avg_res = - comp_avg(&data_ref_0, &res_lo_unsigned, &wt, use_jnt_comp_avg); + if (w - j < 16) { + if (do_average) { + const __m256i data_ref_0 = + load_line2_avx2(&dst[i * dst_stride + j], + &dst[i * dst_stride + j + dst_stride]); + const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_lo_unsigned, + &wt, use_dist_wtd_comp_avg); - const __m256i round_result = convolve_rounding( - &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + const __m256i round_result = convolve_rounding( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); - const __m256i res_8 = _mm256_packus_epi16(round_result, round_result); - const __m128i res_0 = _mm256_castsi256_si128(res_8); - const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); + const __m256i res_8 = + _mm256_packus_epi16(round_result, round_result); + const __m128i res_0 = _mm256_castsi256_si128(res_8); + const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); - if (w - j > 4) { - _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); - _mm_storel_epi64( - (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1); + if (w - j > 4) { + _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); + _mm_storel_epi64( + (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), + res_1); + } else { + *(uint32_t *)(&dst0[i * dst_stride0 + j]) = + _mm_cvtsi128_si32(res_0); + *(uint32_t *)(&dst0[i * dst_stride0 + j + dst_stride0]) = + _mm_cvtsi128_si32(res_1); + } } else { - *(uint32_t *)(&dst0[i * dst_stride0 + j]) = - _mm_cvtsi128_si32(res_0); - 
*(uint32_t *)(&dst0[i * dst_stride0 + j + dst_stride0]) = - _mm_cvtsi128_si32(res_1); + const __m128i res_0 = _mm256_castsi256_si128(res_lo_unsigned); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); + + const __m128i res_1 = _mm256_extracti128_si256(res_lo_unsigned, 1); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), + res_1); } } else { - const __m128i res_0 = _mm256_castsi256_si128(res_lo_unsigned); - _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); + __m256i res_hi = convolve_lowbd_4tap(s + 3, coeffs + 1); - const __m128i res_1 = _mm256_extracti128_si256(res_lo_unsigned, 1); - _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), - res_1); + res_hi = _mm256_add_epi16(res_hi, offset_const_1); + + const __m256i res_hi_0_32b = _mm256_unpacklo_epi16(res_hi, zero); + const __m256i res_hi_0_shift = + _mm256_slli_epi32(res_hi_0_32b, left_shift); + const __m256i res_hi_0_round = _mm256_sra_epi32( + _mm256_add_epi32(res_hi_0_shift, round_const), round_shift); + + const __m256i res_hi_1_32b = _mm256_unpackhi_epi16(res_hi, zero); + const __m256i res_hi_1_shift = + _mm256_slli_epi32(res_hi_1_32b, left_shift); + const __m256i res_hi_1_round = _mm256_sra_epi32( + _mm256_add_epi32(res_hi_1_shift, round_const), round_shift); + + const __m256i res_hi_round = + _mm256_packs_epi32(res_hi_0_round, res_hi_1_round); + + const __m256i res_hi_unsigned = + _mm256_add_epi16(res_hi_round, offset_const_2); + + if (do_average) { + const __m256i data_ref_0_lo = + load_line2_avx2(&dst[i * dst_stride + j], + &dst[i * dst_stride + j + dst_stride]); + + const __m256i data_ref_0_hi = + load_line2_avx2(&dst[i * dst_stride + j + 8], + &dst[i * dst_stride + j + 8 + dst_stride]); + + const __m256i comp_avg_res_lo = comp_avg( + &data_ref_0_lo, &res_lo_unsigned, &wt, use_dist_wtd_comp_avg); + + const __m256i comp_avg_res_hi = comp_avg( + &data_ref_0_hi, &res_hi_unsigned, &wt, use_dist_wtd_comp_avg); + + const __m256i round_result_lo = + 
convolve_rounding(&comp_avg_res_lo, &offset_const, + &rounding_const, rounding_shift); + + const __m256i round_result_hi = + convolve_rounding(&comp_avg_res_hi, &offset_const, + &rounding_const, rounding_shift); + + const __m256i res_8 = + _mm256_packus_epi16(round_result_lo, round_result_hi); + const __m128i res_0 = _mm256_castsi256_si128(res_8); + const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); + + _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); + _mm_store_si128( + (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1); + + } else { + const __m128i res_lo_0 = _mm256_castsi256_si128(res_lo_unsigned); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_lo_0); + + const __m128i res_lo_1 = + _mm256_extracti128_si256(res_lo_unsigned, 1); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), + res_lo_1); + + const __m128i res_hi_0 = _mm256_castsi256_si128(res_hi_unsigned); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + 8]), + res_hi_0); + + const __m128i res_hi_1 = + _mm256_extracti128_si256(res_hi_unsigned, 1); + _mm_store_si128( + (__m128i *)(&dst[i * dst_stride + j + 8 + dst_stride]), + res_hi_1); + } } - } else { - __m256i res_hi = convolve_lowbd(s + 4, coeffs); + s[0] = s[1]; + s[1] = s[2]; - res_hi = _mm256_add_epi16(res_hi, offset_const_1); + s[3] = s[4]; + s[4] = s[5]; + } + } + } else { + const int fo_vert = filter_params_y->taps / 2 - 1; + const uint8_t *const src_ptr = src - fo_vert * src_stride; + for (j = 0; j < w; j += 16) { + const uint8_t *data = &src_ptr[j]; + __m256i src6; + // Load lines a and b. 
Line a to lower 128, line b to upper 128 + { + __m256i src_ab[7]; + __m256i src_a[7]; + src_a[0] = _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data)); + for (int kk = 0; kk < 6; ++kk) { + data += src_stride; + src_a[kk + 1] = + _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data)); + src_ab[kk] = + _mm256_permute2x128_si256(src_a[kk], src_a[kk + 1], 0x20); + } + src6 = src_a[6]; + s[0] = _mm256_unpacklo_epi8(src_ab[0], src_ab[1]); + s[1] = _mm256_unpacklo_epi8(src_ab[2], src_ab[3]); + s[2] = _mm256_unpacklo_epi8(src_ab[4], src_ab[5]); + s[4] = _mm256_unpackhi_epi8(src_ab[0], src_ab[1]); + s[5] = _mm256_unpackhi_epi8(src_ab[2], src_ab[3]); + s[6] = _mm256_unpackhi_epi8(src_ab[4], src_ab[5]); + } - const __m256i res_hi_0_32b = _mm256_unpacklo_epi16(res_hi, zero); - const __m256i res_hi_0_shift = - _mm256_slli_epi32(res_hi_0_32b, left_shift); - const __m256i res_hi_0_round = _mm256_sra_epi32( - _mm256_add_epi32(res_hi_0_shift, round_const), round_shift); + for (i = 0; i < h; i += 2) { + data = &src_ptr[(i + 7) * src_stride + j]; + const __m256i src7 = + _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data)); + const __m256i src_67a = _mm256_permute2x128_si256(src6, src7, 0x20); - const __m256i res_hi_1_32b = _mm256_unpackhi_epi16(res_hi, zero); - const __m256i res_hi_1_shift = - _mm256_slli_epi32(res_hi_1_32b, left_shift); - const __m256i res_hi_1_round = _mm256_sra_epi32( - _mm256_add_epi32(res_hi_1_shift, round_const), round_shift); + src6 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + src_stride))); + const __m256i src_78a = _mm256_permute2x128_si256(src7, src6, 0x20); - const __m256i res_hi_round = - _mm256_packs_epi32(res_hi_0_round, res_hi_1_round); + s[3] = _mm256_unpacklo_epi8(src_67a, src_78a); + s[7] = _mm256_unpackhi_epi8(src_67a, src_78a); - const __m256i res_hi_unsigned = - _mm256_add_epi16(res_hi_round, offset_const_2); + __m256i res_lo = convolve_lowbd(s, coeffs); - if (do_average) { - const __m256i data_ref_0_lo = 
load_line2_avx2( - &dst[i * dst_stride + j], &dst[i * dst_stride + j + dst_stride]); + res_lo = _mm256_add_epi16(res_lo, offset_const_1); - const __m256i data_ref_0_hi = - load_line2_avx2(&dst[i * dst_stride + j + 8], - &dst[i * dst_stride + j + 8 + dst_stride]); + const __m256i res_lo_0_32b = _mm256_unpacklo_epi16(res_lo, zero); + const __m256i res_lo_0_shift = + _mm256_slli_epi32(res_lo_0_32b, left_shift); + const __m256i res_lo_0_round = _mm256_sra_epi32( + _mm256_add_epi32(res_lo_0_shift, round_const), round_shift); - const __m256i comp_avg_res_lo = - comp_avg(&data_ref_0_lo, &res_lo_unsigned, &wt, use_jnt_comp_avg); + const __m256i res_lo_1_32b = _mm256_unpackhi_epi16(res_lo, zero); + const __m256i res_lo_1_shift = + _mm256_slli_epi32(res_lo_1_32b, left_shift); + const __m256i res_lo_1_round = _mm256_sra_epi32( + _mm256_add_epi32(res_lo_1_shift, round_const), round_shift); - const __m256i comp_avg_res_hi = - comp_avg(&data_ref_0_hi, &res_hi_unsigned, &wt, use_jnt_comp_avg); + const __m256i res_lo_round = + _mm256_packs_epi32(res_lo_0_round, res_lo_1_round); - const __m256i round_result_lo = convolve_rounding( - &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift); + const __m256i res_lo_unsigned = + _mm256_add_epi16(res_lo_round, offset_const_2); - const __m256i round_result_hi = convolve_rounding( - &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift); + if (w - j < 16) { + if (do_average) { + const __m256i data_ref_0 = + load_line2_avx2(&dst[i * dst_stride + j], + &dst[i * dst_stride + j + dst_stride]); + const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_lo_unsigned, + &wt, use_dist_wtd_comp_avg); - const __m256i res_8 = - _mm256_packus_epi16(round_result_lo, round_result_hi); - const __m128i res_0 = _mm256_castsi256_si128(res_8); - const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); + const __m256i round_result = convolve_rounding( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + + const __m256i res_8 
= + _mm256_packus_epi16(round_result, round_result); + const __m128i res_0 = _mm256_castsi256_si128(res_8); + const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); - _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); - _mm_store_si128( - (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1); + if (w - j > 4) { + _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); + _mm_storel_epi64( + (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), + res_1); + } else { + *(uint32_t *)(&dst0[i * dst_stride0 + j]) = + _mm_cvtsi128_si32(res_0); + *(uint32_t *)(&dst0[i * dst_stride0 + j + dst_stride0]) = + _mm_cvtsi128_si32(res_1); + } + } else { + const __m128i res_0 = _mm256_castsi256_si128(res_lo_unsigned); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); + const __m128i res_1 = _mm256_extracti128_si256(res_lo_unsigned, 1); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), + res_1); + } } else { - const __m128i res_lo_0 = _mm256_castsi256_si128(res_lo_unsigned); - _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_lo_0); + __m256i res_hi = convolve_lowbd(s + 4, coeffs); - const __m128i res_lo_1 = _mm256_extracti128_si256(res_lo_unsigned, 1); - _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), - res_lo_1); + res_hi = _mm256_add_epi16(res_hi, offset_const_1); + + const __m256i res_hi_0_32b = _mm256_unpacklo_epi16(res_hi, zero); + const __m256i res_hi_0_shift = + _mm256_slli_epi32(res_hi_0_32b, left_shift); + const __m256i res_hi_0_round = _mm256_sra_epi32( + _mm256_add_epi32(res_hi_0_shift, round_const), round_shift); + + const __m256i res_hi_1_32b = _mm256_unpackhi_epi16(res_hi, zero); + const __m256i res_hi_1_shift = + _mm256_slli_epi32(res_hi_1_32b, left_shift); + const __m256i res_hi_1_round = _mm256_sra_epi32( + _mm256_add_epi32(res_hi_1_shift, round_const), round_shift); + + const __m256i res_hi_round = + _mm256_packs_epi32(res_hi_0_round, res_hi_1_round); - const 
__m128i res_hi_0 = _mm256_castsi256_si128(res_hi_unsigned); - _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + 8]), res_hi_0); + const __m256i res_hi_unsigned = + _mm256_add_epi16(res_hi_round, offset_const_2); - const __m128i res_hi_1 = _mm256_extracti128_si256(res_hi_unsigned, 1); - _mm_store_si128( - (__m128i *)(&dst[i * dst_stride + j + 8 + dst_stride]), res_hi_1); + if (do_average) { + const __m256i data_ref_0_lo = + load_line2_avx2(&dst[i * dst_stride + j], + &dst[i * dst_stride + j + dst_stride]); + + const __m256i data_ref_0_hi = + load_line2_avx2(&dst[i * dst_stride + j + 8], + &dst[i * dst_stride + j + 8 + dst_stride]); + + const __m256i comp_avg_res_lo = comp_avg( + &data_ref_0_lo, &res_lo_unsigned, &wt, use_dist_wtd_comp_avg); + + const __m256i comp_avg_res_hi = comp_avg( + &data_ref_0_hi, &res_hi_unsigned, &wt, use_dist_wtd_comp_avg); + + const __m256i round_result_lo = + convolve_rounding(&comp_avg_res_lo, &offset_const, + &rounding_const, rounding_shift); + + const __m256i round_result_hi = + convolve_rounding(&comp_avg_res_hi, &offset_const, + &rounding_const, rounding_shift); + + const __m256i res_8 = + _mm256_packus_epi16(round_result_lo, round_result_hi); + const __m128i res_0 = _mm256_castsi256_si128(res_8); + const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); + + _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); + _mm_store_si128( + (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1); + + } else { + const __m128i res_lo_0 = _mm256_castsi256_si128(res_lo_unsigned); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_lo_0); + + const __m128i res_lo_1 = + _mm256_extracti128_si256(res_lo_unsigned, 1); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), + res_lo_1); + + const __m128i res_hi_0 = _mm256_castsi256_si128(res_hi_unsigned); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + 8]), + res_hi_0); + + const __m128i res_hi_1 = + _mm256_extracti128_si256(res_hi_unsigned, 1); + 
_mm_store_si128( + (__m128i *)(&dst[i * dst_stride + j + 8 + dst_stride]), + res_hi_1); + } } - } - s[0] = s[1]; - s[1] = s[2]; - s[2] = s[3]; + s[0] = s[1]; + s[1] = s[2]; + s[2] = s[3]; - s[4] = s[5]; - s[5] = s[6]; - s[6] = s[7]; + s[4] = s[5]; + s[5] = s[6]; + s[6] = s[7]; + } } } } -void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst0, - int dst_stride0, int w, int h, - const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, - ConvolveParams *conv_params) { +void av1_dist_wtd_convolve_2d_avx2(const uint8_t *src, int src_stride, + uint8_t *dst0, int dst_stride0, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params) { CONV_BUF_TYPE *dst = conv_params->dst; int dst_stride = conv_params->dst_stride; const int bd = 8; DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]); - int im_h = h + filter_params_y->taps - 1; + int im_stride = 8; - int i, j; - const int fo_vert = filter_params_y->taps / 2 - 1; - const int fo_horiz = filter_params_x->taps / 2 - 1; - const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; + int i, is_horiz_4tap = 0, is_vert_4tap = 0; const __m256i wt = unpack_weights_avx2(conv_params); const int do_average = conv_params->do_average; - const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; const int offset_0 = bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); @@ -364,18 +616,9 @@ void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst0, const int rounding_shift = 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; const __m256i rounding_const = _mm256_set1_epi16((1 << rounding_shift) >> 1); - 
__m256i filt[4], s[8], coeffs_x[4], coeffs_y[4]; assert(conv_params->round_0 > 0); - filt[0] = _mm256_load_si256((__m256i const *)filt_global_avx2); - filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32)); - filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); - filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3)); - - prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs_x); - prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y); - const __m256i round_const_h = _mm256_set1_epi16( ((1 << (conv_params->round_0 - 1)) >> 1) + (1 << (bd + FILTER_BITS - 2))); const __m128i round_shift_h = _mm_cvtsi32_si128(conv_params->round_0 - 1); @@ -385,9 +628,29 @@ void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst0, (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1))); const __m128i round_shift_v = _mm_cvtsi32_si128(conv_params->round_1); - for (j = 0; j < w; j += 8) { - /* Horizontal filter */ - { + __m256i filt[4], coeffs_x[4], coeffs_y[4]; + + filt[0] = _mm256_load_si256((__m256i const *)filt_global_avx2); + filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32)); + + prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs_x); + prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y); + + // Condition for checking valid horz_filt taps + if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs_x[0], coeffs_x[3]), 0))) + is_horiz_4tap = 1; + + // Condition for checking valid vert_filt taps + if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs_y[0], coeffs_y[3]), 0))) + is_vert_4tap = 1; + + if (is_horiz_4tap) { + int im_h = h + filter_params_y->taps - 1; + const int fo_vert = filter_params_y->taps / 2 - 1; + const int fo_horiz = 1; + const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; + for (int j = 0; j < w; j += 8) { + /* Horizontal filter */ const uint8_t *src_h = src_ptr + j; for (i = 0; i < im_h; i += 2) { __m256i data = @@ -396,49 +659,59 @@ void 
av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst0, data = _mm256_inserti128_si256( data, _mm_loadu_si128((__m128i *)(src_h + src_stride)), 1); src_h += (src_stride << 1); - __m256i res = convolve_lowbd_x(data, coeffs_x, filt); + __m256i res = convolve_lowbd_x_4tap(data, coeffs_x + 1, filt); res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h); _mm256_store_si256((__m256i *)&im_block[i * im_stride], res); } + DIST_WTD_CONVOLVE_VERTICAL_FILTER_8TAP; } + } else if (is_vert_4tap) { + int im_h = h + 3; + const int fo_vert = 1; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; + + filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); + filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3)); + + for (int j = 0; j < w; j += 8) { + /* Horizontal filter */ + const uint8_t *src_h = src_ptr + j; + DIST_WTD_CONVOLVE_HORIZONTAL_FILTER_8TAP; - /* Vertical filter */ - { + /* Vertical filter */ + __m256i s[6]; __m256i s0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride)); __m256i s1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride)); __m256i s2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride)); __m256i s3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride)); - __m256i s4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride)); - __m256i s5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride)); s[0] = _mm256_unpacklo_epi16(s0, s1); s[1] = _mm256_unpacklo_epi16(s2, s3); - s[2] = _mm256_unpacklo_epi16(s4, s5); - s[4] = _mm256_unpackhi_epi16(s0, s1); - s[5] = _mm256_unpackhi_epi16(s2, s3); - s[6] = _mm256_unpackhi_epi16(s4, s5); + s[3] = _mm256_unpackhi_epi16(s0, s1); + s[4] = _mm256_unpackhi_epi16(s2, s3); for (i = 0; i < h; i += 2) { const int16_t *data = &im_block[i * im_stride]; - const __m256i s6 = - _mm256_loadu_si256((__m256i *)(data + 6 * im_stride)); - const __m256i s7 = 
- _mm256_loadu_si256((__m256i *)(data + 7 * im_stride)); + const __m256i s4 = + _mm256_loadu_si256((__m256i *)(data + 4 * im_stride)); + const __m256i s5 = + _mm256_loadu_si256((__m256i *)(data + 5 * im_stride)); - s[3] = _mm256_unpacklo_epi16(s6, s7); - s[7] = _mm256_unpackhi_epi16(s6, s7); + s[2] = _mm256_unpacklo_epi16(s4, s5); + s[5] = _mm256_unpackhi_epi16(s4, s5); - const __m256i res_a = convolve(s, coeffs_y); + const __m256i res_a = convolve_4tap(s, coeffs_y + 1); const __m256i res_a_round = _mm256_sra_epi32( _mm256_add_epi32(res_a, round_const_v), round_shift_v); if (w - j > 4) { - const __m256i res_b = convolve(s + 4, coeffs_y); + const __m256i res_b = convolve_4tap(s + 3, coeffs_y + 1); const __m256i res_b_round = _mm256_sra_epi32( _mm256_add_epi32(res_b, round_const_v), round_shift_v); const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_b_round); @@ -448,8 +721,8 @@ void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst0, const __m256i data_ref_0 = load_line2_avx2(&dst[i * dst_stride + j], &dst[i * dst_stride + j + dst_stride]); - const __m256i comp_avg_res = - comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg); + const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_unsigned, + &wt, use_dist_wtd_comp_avg); const __m256i round_result = convolve_rounding( &comp_avg_res, &offset_const, &rounding_const, rounding_shift); @@ -479,8 +752,8 @@ void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst0, load_line2_avx2(&dst[i * dst_stride + j], &dst[i * dst_stride + j + dst_stride]); - const __m256i comp_avg_res = - comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg); + const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_unsigned, + &wt, use_dist_wtd_comp_avg); const __m256i round_result = convolve_rounding( &comp_avg_res, &offset_const, &rounding_const, rounding_shift); @@ -504,38 +777,49 @@ void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst0, res_1); } } - 
s[0] = s[1]; s[1] = s[2]; - s[2] = s[3]; - + s[3] = s[4]; s[4] = s[5]; - s[5] = s[6]; - s[6] = s[7]; } } + } else { + int im_h = h + filter_params_y->taps - 1; + const int fo_vert = filter_params_y->taps / 2 - 1; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; + + filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); + filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3)); + + for (int j = 0; j < w; j += 8) { + /* Horizontal filter */ + const uint8_t *src_h = src_ptr + j; + DIST_WTD_CONVOLVE_HORIZONTAL_FILTER_8TAP; + + DIST_WTD_CONVOLVE_VERTICAL_FILTER_8TAP; + } } } -void av1_jnt_convolve_2d_copy_avx2(const uint8_t *src, int src_stride, - uint8_t *dst0, int dst_stride0, int w, int h, - const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, - ConvolveParams *conv_params) { +void av1_dist_wtd_convolve_2d_copy_avx2( + const uint8_t *src, int src_stride, uint8_t *dst0, int dst_stride0, int w, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params) { const int bd = 8; CONV_BUF_TYPE *dst = conv_params->dst; int dst_stride = conv_params->dst_stride; (void)filter_params_x; (void)filter_params_y; - (void)subpel_x_q4; - (void)subpel_y_q4; + (void)subpel_x_qn; + (void)subpel_y_qn; const int bits = FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0; const __m128i left_shift = _mm_cvtsi32_si128(bits); const int do_average = conv_params->do_average; - const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; const __m256i wt = unpack_weights_avx2(conv_params); const __m256i zero = _mm256_setzero_si256(); @@ -562,7 +846,7 @@ void av1_jnt_convolve_2d_copy_avx2(const uint8_t *src, int 
src_stride, _mm256_loadu_si256((__m256i *)(&dst[i * dst_stride + j])); const __m256i comp_avg_res = - comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg); + comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); const __m256i round_result = convolve_rounding( &comp_avg_res, &offset_const, &rounding_const, rounding_shift); @@ -600,7 +884,7 @@ void av1_jnt_convolve_2d_copy_avx2(const uint8_t *src, int src_stride, const __m256i data_ref_0 = load_line2_avx2( &dst[i * dst_stride + j], &dst[i * dst_stride + j + dst_stride]); const __m256i comp_avg_res = - comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg); + comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); const __m256i round_result = convolve_rounding( &comp_avg_res, &offset_const, &rounding_const, rounding_shift); diff --git a/media/libaom/src/av1/common/x86/jnt_convolve_sse2.c b/media/libaom/src/av1/common/x86/jnt_convolve_sse2.c index 87dc3242e..f8f640a11 100644 --- a/media/libaom/src/av1/common/x86/jnt_convolve_sse2.c +++ b/media/libaom/src/av1/common/x86/jnt_convolve_sse2.c @@ -16,12 +16,12 @@ #include "aom_dsp/aom_filter.h" #include "aom_dsp/x86/convolve_sse2.h" -void av1_jnt_convolve_x_sse2(const uint8_t *src, int src_stride, uint8_t *dst0, - int dst_stride0, int w, int h, - const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, - ConvolveParams *conv_params) { +void av1_dist_wtd_convolve_x_sse2(const uint8_t *src, int src_stride, + uint8_t *dst0, int dst_stride0, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params) { const int bd = 8; CONV_BUF_TYPE *dst = conv_params->dst; const int dst_stride = conv_params->dst_stride; @@ -37,7 +37,7 @@ void av1_jnt_convolve_x_sse2(const uint8_t *src, int src_stride, uint8_t *dst0, const __m128i wt1 = _mm_set1_epi16(w1); const 
__m128i wt = _mm_unpacklo_epi16(wt0, wt1); const int do_average = conv_params->do_average; - const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; const int offset_0 = bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); @@ -48,9 +48,9 @@ void av1_jnt_convolve_x_sse2(const uint8_t *src, int src_stride, uint8_t *dst0, __m128i coeffs[4]; (void)filter_params_y; - (void)subpel_y_q4; + (void)subpel_y_qn; - prepare_coeffs(filter_params_x, subpel_x_q4, coeffs); + prepare_coeffs(filter_params_x, subpel_x_qn, coeffs); if (w == 4) { do { @@ -77,7 +77,7 @@ void av1_jnt_convolve_x_sse2(const uint8_t *src, int src_stride, uint8_t *dst0, const __m128i data_ref_0 = _mm_loadu_si128((__m128i *)dst); const __m128i comp_avg_res = - comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg); + comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); const __m128i round_result = convolve_rounding( &comp_avg_res, &offset_const, &rounding_const, rounding_shift); @@ -134,7 +134,7 @@ void av1_jnt_convolve_x_sse2(const uint8_t *src, int src_stride, uint8_t *dst0, _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])); const __m128i comp_avg_res = - comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg); + comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); const __m128i round_result = convolve_rounding( &comp_avg_res, &offset_const, &rounding_const, rounding_shift); @@ -150,12 +150,12 @@ void av1_jnt_convolve_x_sse2(const uint8_t *src, int src_stride, uint8_t *dst0, } } -void av1_jnt_convolve_y_sse2(const uint8_t *src, int src_stride, uint8_t *dst0, - int dst_stride0, int w, int h, - const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, - ConvolveParams *conv_params) { +void av1_dist_wtd_convolve_y_sse2(const uint8_t *src, int 
src_stride, + uint8_t *dst0, int dst_stride0, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params) { const int bd = 8; CONV_BUF_TYPE *dst = conv_params->dst; const int dst_stride = conv_params->dst_stride; @@ -167,7 +167,7 @@ void av1_jnt_convolve_y_sse2(const uint8_t *src, int src_stride, uint8_t *dst0, const __m128i wt1 = _mm_set1_epi16(conv_params->bck_offset); const __m128i wt = _mm_unpacklo_epi16(wt0, wt1); const int do_average = conv_params->do_average; - const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; const int offset_0 = bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); @@ -180,9 +180,9 @@ void av1_jnt_convolve_y_sse2(const uint8_t *src, int src_stride, uint8_t *dst0, __m128i coeffs[4]; (void)filter_params_x; - (void)subpel_x_q4; + (void)subpel_x_qn; - prepare_coeffs(filter_params_y, subpel_y_q4, coeffs); + prepare_coeffs(filter_params_y, subpel_y_qn, coeffs); if (w == 4) { __m128i s[8], src6, res, res_shift; @@ -225,7 +225,7 @@ void av1_jnt_convolve_y_sse2(const uint8_t *src, int src_stride, uint8_t *dst0, const __m128i data_ref_0 = _mm_loadu_si128((__m128i *)dst); const __m128i comp_avg_res = - comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg); + comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); const __m128i round_result = convolve_rounding( &comp_avg_res, &offset_const, &rounding_const, rounding_shift); @@ -254,7 +254,7 @@ void av1_jnt_convolve_y_sse2(const uint8_t *src, int src_stride, uint8_t *dst0, const __m128i data_ref_0 = _mm_loadu_si128((__m128i *)dst); const __m128i comp_avg_res = - comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg); + comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); const __m128i 
round_result = convolve_rounding( &comp_avg_res, &offset_const, &rounding_const, rounding_shift); @@ -331,7 +331,7 @@ void av1_jnt_convolve_y_sse2(const uint8_t *src, int src_stride, uint8_t *dst0, _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])); const __m128i comp_avg_res = - comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg); + comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); const __m128i round_result = convolve_rounding( &comp_avg_res, &offset_const, &rounding_const, rounding_shift); @@ -360,7 +360,7 @@ void av1_jnt_convolve_y_sse2(const uint8_t *src, int src_stride, uint8_t *dst0, _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])); const __m128i comp_avg_res = - comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg); + comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); const __m128i round_result = convolve_rounding( &comp_avg_res, &offset_const, &rounding_const, rounding_shift); @@ -383,3 +383,233 @@ void av1_jnt_convolve_y_sse2(const uint8_t *src, int src_stride, uint8_t *dst0, } while (j < w); } } + +void av1_dist_wtd_convolve_2d_sse2(const uint8_t *src, int src_stride, + uint8_t *dst0, int dst_stride0, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params) { + CONV_BUF_TYPE *dst = conv_params->dst; + int dst_stride = conv_params->dst_stride; + const int bd = 8; + + DECLARE_ALIGNED(16, int16_t, + im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]); + int im_h = h + filter_params_y->taps - 1; + int im_stride = MAX_SB_SIZE; + int i, j; + const int fo_vert = filter_params_y->taps / 2 - 1; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const int do_average = conv_params->do_average; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; + const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; + + const __m128i zero = 
_mm_setzero_si128(); + + const int w0 = conv_params->fwd_offset; + const int w1 = conv_params->bck_offset; + const __m128i wt0 = _mm_set1_epi16(w0); + const __m128i wt1 = _mm_set1_epi16(w1); + const __m128i wt = _mm_unpacklo_epi16(wt0, wt1); + + const int offset_0 = + bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); + const __m128i offset_const = _mm_set1_epi16(offset); + const int rounding_shift = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const __m128i rounding_const = _mm_set1_epi16((1 << rounding_shift) >> 1); + + /* Horizontal filter */ + { + const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( + filter_params_x, subpel_x_qn & SUBPEL_MASK); + const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x); + + // coeffs 0 1 0 1 0 1 0 1 + const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); + // coeffs 2 3 2 3 2 3 2 3 + const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); + // coeffs 4 5 4 5 4 5 4 5 + const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); + // coeffs 6 7 6 7 6 7 6 7 + const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); + + const __m128i round_const = _mm_set1_epi32( + ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1))); + const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0); + + for (i = 0; i < im_h; ++i) { + for (j = 0; j < w; j += 8) { + __m128i temp_lo, temp_hi; + const __m128i data = + _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]); + + const __m128i src_lo = _mm_unpacklo_epi8(data, zero); + const __m128i src_hi = _mm_unpackhi_epi8(data, zero); + + // Filter even-index pixels + const __m128i res_0 = _mm_madd_epi16(src_lo, coeff_01); + temp_lo = _mm_srli_si128(src_lo, 4); + temp_hi = 
_mm_slli_si128(src_hi, 12); + const __m128i src_2 = _mm_or_si128(temp_hi, temp_lo); + const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23); + temp_lo = _mm_srli_si128(src_lo, 8); + temp_hi = _mm_slli_si128(src_hi, 8); + const __m128i src_4 = _mm_or_si128(temp_hi, temp_lo); + const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45); + temp_lo = _mm_srli_si128(src_lo, 12); + temp_hi = _mm_slli_si128(src_hi, 4); + const __m128i src_6 = _mm_or_si128(temp_hi, temp_lo); + const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67); + + __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4), + _mm_add_epi32(res_2, res_6)); + res_even = + _mm_sra_epi32(_mm_add_epi32(res_even, round_const), round_shift); + + // Filter odd-index pixels + temp_lo = _mm_srli_si128(src_lo, 2); + temp_hi = _mm_slli_si128(src_hi, 14); + const __m128i src_1 = _mm_or_si128(temp_hi, temp_lo); + const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01); + temp_lo = _mm_srli_si128(src_lo, 6); + temp_hi = _mm_slli_si128(src_hi, 10); + const __m128i src_3 = _mm_or_si128(temp_hi, temp_lo); + const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23); + temp_lo = _mm_srli_si128(src_lo, 10); + temp_hi = _mm_slli_si128(src_hi, 6); + const __m128i src_5 = _mm_or_si128(temp_hi, temp_lo); + const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45); + temp_lo = _mm_srli_si128(src_lo, 14); + temp_hi = _mm_slli_si128(src_hi, 2); + const __m128i src_7 = _mm_or_si128(temp_hi, temp_lo); + const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67); + + __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5), + _mm_add_epi32(res_3, res_7)); + res_odd = + _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), round_shift); + + // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7 + __m128i res = _mm_packs_epi32(res_even, res_odd); + _mm_store_si128((__m128i *)&im_block[i * im_stride + j], res); + } + } + } + + /* Vertical filter */ + { + const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( + filter_params_y, subpel_y_qn & 
SUBPEL_MASK); + const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y); + + // coeffs 0 1 0 1 0 1 0 1 + const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); + // coeffs 2 3 2 3 2 3 2 3 + const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); + // coeffs 4 5 4 5 4 5 4 5 + const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); + // coeffs 6 7 6 7 6 7 6 7 + const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); + + const __m128i round_const = _mm_set1_epi32( + ((1 << conv_params->round_1) >> 1) - + (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1))); + const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1); + + for (i = 0; i < h; ++i) { + for (j = 0; j < w; j += 8) { + // Filter even-index pixels + const int16_t *data = &im_block[i * im_stride + j]; + const __m128i src_0 = + _mm_unpacklo_epi16(*(__m128i *)(data + 0 * im_stride), + *(__m128i *)(data + 1 * im_stride)); + const __m128i src_2 = + _mm_unpacklo_epi16(*(__m128i *)(data + 2 * im_stride), + *(__m128i *)(data + 3 * im_stride)); + const __m128i src_4 = + _mm_unpacklo_epi16(*(__m128i *)(data + 4 * im_stride), + *(__m128i *)(data + 5 * im_stride)); + const __m128i src_6 = + _mm_unpacklo_epi16(*(__m128i *)(data + 6 * im_stride), + *(__m128i *)(data + 7 * im_stride)); + + const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01); + const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23); + const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45); + const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67); + + const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2), + _mm_add_epi32(res_4, res_6)); + + // Filter odd-index pixels + const __m128i src_1 = + _mm_unpackhi_epi16(*(__m128i *)(data + 0 * im_stride), + *(__m128i *)(data + 1 * im_stride)); + const __m128i src_3 = + 
_mm_unpackhi_epi16(*(__m128i *)(data + 2 * im_stride), + *(__m128i *)(data + 3 * im_stride)); + const __m128i src_5 = + _mm_unpackhi_epi16(*(__m128i *)(data + 4 * im_stride), + *(__m128i *)(data + 5 * im_stride)); + const __m128i src_7 = + _mm_unpackhi_epi16(*(__m128i *)(data + 6 * im_stride), + *(__m128i *)(data + 7 * im_stride)); + + const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01); + const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23); + const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45); + const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67); + + const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3), + _mm_add_epi32(res_5, res_7)); + + // Rearrange pixels back into the order 0 ... 7 + const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); + const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); + + const __m128i res_lo_round = + _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift); + const __m128i res_hi_round = + _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift); + + const __m128i res_16b = _mm_packs_epi32(res_lo_round, res_hi_round); + const __m128i res_unsigned = _mm_add_epi16(res_16b, offset_const); + + // Accumulate values into the destination buffer + if (do_average) { + const __m128i data_ref_0 = + _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])); + + const __m128i comp_avg_res = + comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); + + const __m128i round_result = convolve_rounding( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + + const __m128i res_8 = _mm_packus_epi16(round_result, round_result); + + if (w > 4) + _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_8); + else + *(uint32_t *)(&dst0[i * dst_stride0 + j]) = + _mm_cvtsi128_si32(res_8); + } else { + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_unsigned); + } + } + } + } +} diff --git a/media/libaom/src/av1/common/x86/jnt_convolve_ssse3.c 
b/media/libaom/src/av1/common/x86/jnt_convolve_ssse3.c index 822772782..f45e3b267 100644 --- a/media/libaom/src/av1/common/x86/jnt_convolve_ssse3.c +++ b/media/libaom/src/av1/common/x86/jnt_convolve_ssse3.c @@ -16,12 +16,11 @@ #include "aom_dsp/aom_filter.h" #include "aom_dsp/x86/convolve_sse2.h" -void av1_jnt_convolve_2d_ssse3(const uint8_t *src, int src_stride, - uint8_t *dst0, int dst_stride0, int w, int h, - const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, - ConvolveParams *conv_params) { +void av1_dist_wtd_convolve_2d_ssse3( + const uint8_t *src, int src_stride, uint8_t *dst0, int dst_stride0, int w, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params) { CONV_BUF_TYPE *dst = conv_params->dst; int dst_stride = conv_params->dst_stride; const int bd = 8; @@ -34,7 +33,7 @@ void av1_jnt_convolve_2d_ssse3(const uint8_t *src, int src_stride, const int fo_vert = filter_params_y->taps / 2 - 1; const int fo_horiz = filter_params_x->taps / 2 - 1; const int do_average = conv_params->do_average; - const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; const __m128i zero = _mm_setzero_si128(); @@ -56,7 +55,7 @@ void av1_jnt_convolve_2d_ssse3(const uint8_t *src, int src_stride, /* Horizontal filter */ { const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( - filter_params_x, subpel_x_q4 & SUBPEL_MASK); + filter_params_x, subpel_x_qn & SUBPEL_MASK); const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter); // coeffs 0 1 0 1 2 3 2 3 @@ -124,7 +123,7 @@ void av1_jnt_convolve_2d_ssse3(const uint8_t *src, int src_stride, /* Vertical filter */ { const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( - 
filter_params_y, subpel_y_q4 & SUBPEL_MASK); + filter_params_y, subpel_y_qn & SUBPEL_MASK); const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter); // coeffs 0 1 0 1 2 3 2 3 @@ -211,7 +210,7 @@ void av1_jnt_convolve_2d_ssse3(const uint8_t *src, int src_stride, _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])); const __m128i comp_avg_res = - comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg); + comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); const __m128i round_result = convolve_rounding( &comp_avg_res, &offset_const, &rounding_const, rounding_shift); diff --git a/media/libaom/src/av1/common/x86/reconinter_avx2.c b/media/libaom/src/av1/common/x86/reconinter_avx2.c index f645e0454..a38bd8317 100644 --- a/media/libaom/src/av1/common/x86/reconinter_avx2.c +++ b/media/libaom/src/av1/common/x86/reconinter_avx2.c @@ -28,8 +28,8 @@ static INLINE __m256i calc_mask_avx2(const __m256i mask_base, const __m256i s0, } void av1_build_compound_diffwtd_mask_avx2(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, - const uint8_t *src0, int stride0, - const uint8_t *src1, int stride1, + const uint8_t *src0, int src0_stride, + const uint8_t *src1, int src1_stride, int h, int w) { const int mb = (mask_type == DIFFWTD_38_INV) ? 
AOM_BLEND_A64_MAX_ALPHA : 0; const __m256i y_mask_base = _mm256_set1_epi16(38 - mb); @@ -37,18 +37,18 @@ void av1_build_compound_diffwtd_mask_avx2(uint8_t *mask, if (4 == w) { do { const __m128i s0A = xx_loadl_32(src0); - const __m128i s0B = xx_loadl_32(src0 + stride0); - const __m128i s0C = xx_loadl_32(src0 + stride0 * 2); - const __m128i s0D = xx_loadl_32(src0 + stride0 * 3); + const __m128i s0B = xx_loadl_32(src0 + src0_stride); + const __m128i s0C = xx_loadl_32(src0 + src0_stride * 2); + const __m128i s0D = xx_loadl_32(src0 + src0_stride * 3); const __m128i s0AB = _mm_unpacklo_epi32(s0A, s0B); const __m128i s0CD = _mm_unpacklo_epi32(s0C, s0D); const __m128i s0ABCD = _mm_unpacklo_epi64(s0AB, s0CD); const __m256i s0ABCD_w = _mm256_cvtepu8_epi16(s0ABCD); const __m128i s1A = xx_loadl_32(src1); - const __m128i s1B = xx_loadl_32(src1 + stride1); - const __m128i s1C = xx_loadl_32(src1 + stride1 * 2); - const __m128i s1D = xx_loadl_32(src1 + stride1 * 3); + const __m128i s1B = xx_loadl_32(src1 + src1_stride); + const __m128i s1C = xx_loadl_32(src1 + src1_stride * 2); + const __m128i s1D = xx_loadl_32(src1 + src1_stride * 3); const __m128i s1AB = _mm_unpacklo_epi32(s1A, s1B); const __m128i s1CD = _mm_unpacklo_epi32(s1C, s1D); const __m128i s1ABCD = _mm_unpacklo_epi64(s1AB, s1CD); @@ -58,40 +58,40 @@ void av1_build_compound_diffwtd_mask_avx2(uint8_t *mask, const __m128i x_m8 = _mm256_castsi256_si128(_mm256_permute4x64_epi64(m8, 0xd8)); xx_storeu_128(mask, x_m8); - src0 += (stride0 << 2); - src1 += (stride1 << 2); + src0 += (src0_stride << 2); + src1 += (src1_stride << 2); mask += 16; i += 4; } while (i < h); } else if (8 == w) { do { const __m128i s0A = xx_loadl_64(src0); - const __m128i s0B = xx_loadl_64(src0 + stride0); - const __m128i s0C = xx_loadl_64(src0 + stride0 * 2); - const __m128i s0D = xx_loadl_64(src0 + stride0 * 3); + const __m128i s0B = xx_loadl_64(src0 + src0_stride); + const __m128i s0C = xx_loadl_64(src0 + src0_stride * 2); + const __m128i s0D = 
xx_loadl_64(src0 + src0_stride * 3); const __m256i s0AC_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(s0A, s0C)); const __m256i s0BD_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(s0B, s0D)); const __m128i s1A = xx_loadl_64(src1); - const __m128i s1B = xx_loadl_64(src1 + stride1); - const __m128i s1C = xx_loadl_64(src1 + stride1 * 2); - const __m128i s1D = xx_loadl_64(src1 + stride1 * 3); + const __m128i s1B = xx_loadl_64(src1 + src1_stride); + const __m128i s1C = xx_loadl_64(src1 + src1_stride * 2); + const __m128i s1D = xx_loadl_64(src1 + src1_stride * 3); const __m256i s1AB_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(s1A, s1C)); const __m256i s1CD_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(s1B, s1D)); const __m256i m16AC = calc_mask_avx2(y_mask_base, s0AC_w, s1AB_w); const __m256i m16BD = calc_mask_avx2(y_mask_base, s0BD_w, s1CD_w); const __m256i m8 = _mm256_packus_epi16(m16AC, m16BD); yy_storeu_256(mask, m8); - src0 += stride0 << 2; - src1 += stride1 << 2; + src0 += src0_stride << 2; + src1 += src1_stride << 2; mask += 32; i += 4; } while (i < h); } else if (16 == w) { do { const __m128i s0A = xx_load_128(src0); - const __m128i s0B = xx_load_128(src0 + stride0); + const __m128i s0B = xx_load_128(src0 + src0_stride); const __m128i s1A = xx_load_128(src1); - const __m128i s1B = xx_load_128(src1 + stride1); + const __m128i s1B = xx_load_128(src1 + src1_stride); const __m256i s0AL = _mm256_cvtepu8_epi16(s0A); const __m256i s0BL = _mm256_cvtepu8_epi16(s0B); const __m256i s1AL = _mm256_cvtepu8_epi16(s1A); @@ -103,8 +103,8 @@ void av1_build_compound_diffwtd_mask_avx2(uint8_t *mask, const __m256i m8 = _mm256_permute4x64_epi64(_mm256_packus_epi16(m16AL, m16BL), 0xd8); yy_storeu_256(mask, m8); - src0 += stride0 << 1; - src1 += stride1 << 1; + src0 += src0_stride << 1; + src1 += src1_stride << 1; mask += 32; i += 2; } while (i < h); @@ -127,8 +127,8 @@ void av1_build_compound_diffwtd_mask_avx2(uint8_t *mask, yy_storeu_256(mask + j, m8); j += 32; } while (j < w); - src0 += 
stride0; - src1 += stride1; + src0 += src0_stride; + src1 += src1_stride; mask += w; i += 1; } while (i < h); diff --git a/media/libaom/src/av1/common/x86/selfguided_avx2.c b/media/libaom/src/av1/common/x86/selfguided_avx2.c index 0aaf1f454..3c5558dda 100644 --- a/media/libaom/src/av1/common/x86/selfguided_avx2.c +++ b/media/libaom/src/av1/common/x86/selfguided_avx2.c @@ -219,12 +219,12 @@ static __m256i compute_p(__m256i sum1, __m256i sum2, int bit_depth, int n) { static void calc_ab(int32_t *A, int32_t *B, const int32_t *C, const int32_t *D, int width, int height, int buf_stride, int bit_depth, int sgr_params_idx, int radius_idx) { - const sgr_params_type *const params = &sgr_params[sgr_params_idx]; + const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx]; const int r = params->r[radius_idx]; const int n = (2 * r + 1) * (2 * r + 1); const __m256i s = _mm256_set1_epi32(params->s[radius_idx]); // one_over_n[n-1] is 2^12/n, so easily fits in an int16 - const __m256i one_over_n = _mm256_set1_epi32(one_by_x[n - 1]); + const __m256i one_over_n = _mm256_set1_epi32(av1_one_by_x[n - 1]); const __m256i rnd_z = round_for_shift(SGRPROJ_MTABLE_BITS); const __m256i rnd_res = round_for_shift(SGRPROJ_RECIP_BITS); @@ -263,7 +263,7 @@ static void calc_ab(int32_t *A, int32_t *B, const int32_t *C, const int32_t *D, SGRPROJ_MTABLE_BITS), _mm256_set1_epi32(255)); - const __m256i a_res = _mm256_i32gather_epi32(x_by_xplus1, z, 4); + const __m256i a_res = _mm256_i32gather_epi32(av1_x_by_xplus1, z, 4); yy_storeu_256(A + i * buf_stride + j, a_res); @@ -356,12 +356,12 @@ static void calc_ab_fast(int32_t *A, int32_t *B, const int32_t *C, const int32_t *D, int width, int height, int buf_stride, int bit_depth, int sgr_params_idx, int radius_idx) { - const sgr_params_type *const params = &sgr_params[sgr_params_idx]; + const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx]; const int r = params->r[radius_idx]; const int n = (2 * r + 1) * (2 * r + 1); const __m256i s 
= _mm256_set1_epi32(params->s[radius_idx]); // one_over_n[n-1] is 2^12/n, so easily fits in an int16 - const __m256i one_over_n = _mm256_set1_epi32(one_by_x[n - 1]); + const __m256i one_over_n = _mm256_set1_epi32(av1_one_by_x[n - 1]); const __m256i rnd_z = round_for_shift(SGRPROJ_MTABLE_BITS); const __m256i rnd_res = round_for_shift(SGRPROJ_RECIP_BITS); @@ -400,7 +400,7 @@ static void calc_ab_fast(int32_t *A, int32_t *B, const int32_t *C, SGRPROJ_MTABLE_BITS), _mm256_set1_epi32(255)); - const __m256i a_res = _mm256_i32gather_epi32(x_by_xplus1, z, 4); + const __m256i a_res = _mm256_i32gather_epi32(av1_x_by_xplus1, z, 4); yy_storeu_256(A + i * buf_stride + j, a_res); @@ -604,7 +604,7 @@ int av1_selfguided_restoration_avx2(const uint8_t *dgd8, int width, int height, integral_images(dgd0, dgd_stride, width_ext, height_ext, Ctl, Dtl, buf_stride); - const sgr_params_type *const params = &sgr_params[sgr_params_idx]; + const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx]; // Write to flt0 and flt1 // If params->r == 0 we skip the corresponding filter. 
We only allow one of // the radii to be 0, as having both equal to 0 would be equivalent to @@ -630,11 +630,11 @@ int av1_selfguided_restoration_avx2(const uint8_t *dgd8, int width, int height, return 0; } -void apply_selfguided_restoration_avx2(const uint8_t *dat8, int width, - int height, int stride, int eps, - const int *xqd, uint8_t *dst8, - int dst_stride, int32_t *tmpbuf, - int bit_depth, int highbd) { +void av1_apply_selfguided_restoration_avx2(const uint8_t *dat8, int width, + int height, int stride, int eps, + const int *xqd, uint8_t *dst8, + int dst_stride, int32_t *tmpbuf, + int bit_depth, int highbd) { int32_t *flt0 = tmpbuf; int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX; assert(width * height <= RESTORATION_UNITPELS_MAX); @@ -642,9 +642,9 @@ void apply_selfguided_restoration_avx2(const uint8_t *dat8, int width, dat8, width, height, stride, flt0, flt1, width, eps, bit_depth, highbd); (void)ret; assert(!ret); - const sgr_params_type *const params = &sgr_params[eps]; + const sgr_params_type *const params = &av1_sgr_params[eps]; int xq[2]; - decode_xq(xqd, xq, params); + av1_decode_xq(xqd, xq, params); __m256i xq0 = _mm256_set1_epi32(xq[0]); __m256i xq1 = _mm256_set1_epi32(xq[1]); diff --git a/media/libaom/src/av1/common/x86/selfguided_sse4.c b/media/libaom/src/av1/common/x86/selfguided_sse4.c index ea3f6d942..72c7708f1 100644 --- a/media/libaom/src/av1/common/x86/selfguided_sse4.c +++ b/media/libaom/src/av1/common/x86/selfguided_sse4.c @@ -170,12 +170,12 @@ static __m128i compute_p(__m128i sum1, __m128i sum2, int bit_depth, int n) { static void calc_ab(int32_t *A, int32_t *B, const int32_t *C, const int32_t *D, int width, int height, int buf_stride, int bit_depth, int sgr_params_idx, int radius_idx) { - const sgr_params_type *const params = &sgr_params[sgr_params_idx]; + const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx]; const int r = params->r[radius_idx]; const int n = (2 * r + 1) * (2 * r + 1); const __m128i s = 
_mm_set1_epi32(params->s[radius_idx]); // one_over_n[n-1] is 2^12/n, so easily fits in an int16 - const __m128i one_over_n = _mm_set1_epi32(one_by_x[n - 1]); + const __m128i one_over_n = _mm_set1_epi32(av1_one_by_x[n - 1]); const __m128i rnd_z = round_for_shift(SGRPROJ_MTABLE_BITS); const __m128i rnd_res = round_for_shift(SGRPROJ_RECIP_BITS); @@ -216,10 +216,11 @@ static void calc_ab(int32_t *A, int32_t *B, const int32_t *C, const int32_t *D, // 'Gather' type instructions are not available pre-AVX2, so synthesize a // gather using scalar loads. - const __m128i a_res = _mm_set_epi32(x_by_xplus1[_mm_extract_epi32(z, 3)], - x_by_xplus1[_mm_extract_epi32(z, 2)], - x_by_xplus1[_mm_extract_epi32(z, 1)], - x_by_xplus1[_mm_extract_epi32(z, 0)]); + const __m128i a_res = + _mm_set_epi32(av1_x_by_xplus1[_mm_extract_epi32(z, 3)], + av1_x_by_xplus1[_mm_extract_epi32(z, 2)], + av1_x_by_xplus1[_mm_extract_epi32(z, 1)], + av1_x_by_xplus1[_mm_extract_epi32(z, 0)]); xx_storeu_128(A + i * buf_stride + j, a_res); @@ -310,12 +311,12 @@ static void calc_ab_fast(int32_t *A, int32_t *B, const int32_t *C, const int32_t *D, int width, int height, int buf_stride, int bit_depth, int sgr_params_idx, int radius_idx) { - const sgr_params_type *const params = &sgr_params[sgr_params_idx]; + const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx]; const int r = params->r[radius_idx]; const int n = (2 * r + 1) * (2 * r + 1); const __m128i s = _mm_set1_epi32(params->s[radius_idx]); // one_over_n[n-1] is 2^12/n, so easily fits in an int16 - const __m128i one_over_n = _mm_set1_epi32(one_by_x[n - 1]); + const __m128i one_over_n = _mm_set1_epi32(av1_one_by_x[n - 1]); const __m128i rnd_z = round_for_shift(SGRPROJ_MTABLE_BITS); const __m128i rnd_res = round_for_shift(SGRPROJ_RECIP_BITS); @@ -356,10 +357,11 @@ static void calc_ab_fast(int32_t *A, int32_t *B, const int32_t *C, // 'Gather' type instructions are not available pre-AVX2, so synthesize a // gather using scalar loads. 
- const __m128i a_res = _mm_set_epi32(x_by_xplus1[_mm_extract_epi32(z, 3)], - x_by_xplus1[_mm_extract_epi32(z, 2)], - x_by_xplus1[_mm_extract_epi32(z, 1)], - x_by_xplus1[_mm_extract_epi32(z, 0)]); + const __m128i a_res = + _mm_set_epi32(av1_x_by_xplus1[_mm_extract_epi32(z, 3)], + av1_x_by_xplus1[_mm_extract_epi32(z, 2)], + av1_x_by_xplus1[_mm_extract_epi32(z, 1)], + av1_x_by_xplus1[_mm_extract_epi32(z, 0)]); xx_storeu_128(A + i * buf_stride + j, a_res); @@ -554,7 +556,7 @@ int av1_selfguided_restoration_sse4_1(const uint8_t *dgd8, int width, integral_images(dgd0, dgd_stride, width_ext, height_ext, Ctl, Dtl, buf_stride); - const sgr_params_type *const params = &sgr_params[sgr_params_idx]; + const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx]; // Write to flt0 and flt1 // If params->r == 0 we skip the corresponding filter. We only allow one of // the radii to be 0, as having both equal to 0 would be equivalent to @@ -580,11 +582,11 @@ int av1_selfguided_restoration_sse4_1(const uint8_t *dgd8, int width, return 0; } -void apply_selfguided_restoration_sse4_1(const uint8_t *dat8, int width, - int height, int stride, int eps, - const int *xqd, uint8_t *dst8, - int dst_stride, int32_t *tmpbuf, - int bit_depth, int highbd) { +void av1_apply_selfguided_restoration_sse4_1(const uint8_t *dat8, int width, + int height, int stride, int eps, + const int *xqd, uint8_t *dst8, + int dst_stride, int32_t *tmpbuf, + int bit_depth, int highbd) { int32_t *flt0 = tmpbuf; int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX; assert(width * height <= RESTORATION_UNITPELS_MAX); @@ -592,9 +594,9 @@ void apply_selfguided_restoration_sse4_1(const uint8_t *dat8, int width, dat8, width, height, stride, flt0, flt1, width, eps, bit_depth, highbd); (void)ret; assert(!ret); - const sgr_params_type *const params = &sgr_params[eps]; + const sgr_params_type *const params = &av1_sgr_params[eps]; int xq[2]; - decode_xq(xqd, xq, params); + av1_decode_xq(xqd, xq, params); __m128i xq0 = 
_mm_set1_epi32(xq[0]); __m128i xq1 = _mm_set1_epi32(xq[1]); diff --git a/media/libaom/src/av1/common/x86/warp_plane_avx2.c b/media/libaom/src/av1/common/x86/warp_plane_avx2.c new file mode 100644 index 000000000..53a928d76 --- /dev/null +++ b/media/libaom/src/av1/common/x86/warp_plane_avx2.c @@ -0,0 +1,1318 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <immintrin.h> +#include "config/av1_rtcd.h" +#include "av1/common/warped_motion.h" +#include "aom_dsp/x86/synonyms.h" + +DECLARE_ALIGNED(32, static const uint8_t, shuffle_alpha0_mask01_avx2[32]) = { + 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, + 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 +}; + +DECLARE_ALIGNED(32, static const uint8_t, shuffle_alpha0_mask23_avx2[32]) = { + 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, + 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3 +}; + +DECLARE_ALIGNED(32, static const uint8_t, shuffle_alpha0_mask45_avx2[32]) = { + 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, + 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5 +}; + +DECLARE_ALIGNED(32, static const uint8_t, shuffle_alpha0_mask67_avx2[32]) = { + 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, + 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7 +}; + +DECLARE_ALIGNED(32, static const uint8_t, shuffle_gamma0_mask0_avx2[32]) = { + 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, + 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 +}; + +DECLARE_ALIGNED(32, static const uint8_t, shuffle_gamma0_mask1_avx2[32]) = { + 4, 5, 6, 7, 4, 5, 
6, 7, 4, 5, 6, 7, 4, 5, 6, 7, + 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7 +}; + +DECLARE_ALIGNED(32, static const uint8_t, shuffle_gamma0_mask2_avx2[32]) = { + 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, + 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11 +}; + +DECLARE_ALIGNED(32, static const uint8_t, shuffle_gamma0_mask3_avx2[32]) = { + 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, + 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15 +}; + +DECLARE_ALIGNED(32, static const uint8_t, + shuffle_src0[32]) = { 0, 2, 2, 4, 4, 6, 6, 8, 1, 3, 3, + 5, 5, 7, 7, 9, 0, 2, 2, 4, 4, 6, + 6, 8, 1, 3, 3, 5, 5, 7, 7, 9 }; + +DECLARE_ALIGNED(32, static const uint8_t, + shuffle_src1[32]) = { 4, 6, 6, 8, 8, 10, 10, 12, 5, 7, 7, + 9, 9, 11, 11, 13, 4, 6, 6, 8, 8, 10, + 10, 12, 5, 7, 7, 9, 9, 11, 11, 13 }; + +DECLARE_ALIGNED(32, static const uint8_t, + shuffle_src2[32]) = { 1, 3, 3, 5, 5, 7, 7, 9, 2, 4, 4, + 6, 6, 8, 8, 10, 1, 3, 3, 5, 5, 7, + 7, 9, 2, 4, 4, 6, 6, 8, 8, 10 }; + +DECLARE_ALIGNED(32, static const uint8_t, + shuffle_src3[32]) = { 5, 7, 7, 9, 9, 11, 11, 13, 6, 8, 8, + 10, 10, 12, 12, 14, 5, 7, 7, 9, 9, 11, + 11, 13, 6, 8, 8, 10, 10, 12, 12, 14 }; + +static INLINE void filter_src_pixels_avx2(const __m256i src, __m256i *horz_out, + __m256i *coeff, + const __m256i *shuffle_src, + const __m256i *round_const, + const __m128i *shift, int row) { + const __m256i src_0 = _mm256_shuffle_epi8(src, shuffle_src[0]); + const __m256i src_1 = _mm256_shuffle_epi8(src, shuffle_src[1]); + const __m256i src_2 = _mm256_shuffle_epi8(src, shuffle_src[2]); + const __m256i src_3 = _mm256_shuffle_epi8(src, shuffle_src[3]); + + const __m256i res_02 = _mm256_maddubs_epi16(src_0, coeff[0]); + const __m256i res_46 = _mm256_maddubs_epi16(src_1, coeff[1]); + const __m256i res_13 = _mm256_maddubs_epi16(src_2, coeff[2]); + const __m256i res_57 = _mm256_maddubs_epi16(src_3, coeff[3]); + + const __m256i res_even = _mm256_add_epi16(res_02, res_46); + const 
__m256i res_odd = _mm256_add_epi16(res_13, res_57); + const __m256i res = + _mm256_add_epi16(_mm256_add_epi16(res_even, res_odd), *round_const); + horz_out[row] = _mm256_srl_epi16(res, *shift); +} + +static INLINE void prepare_horizontal_filter_coeff_avx2(int alpha, int beta, + int sx, + __m256i *coeff) { + __m128i tmp_0 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[((unsigned)(sx + 0 * alpha)) >> + WARPEDDIFF_PREC_BITS]); + __m128i tmp_1 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[((unsigned)(sx + 1 * alpha)) >> + WARPEDDIFF_PREC_BITS]); + __m128i tmp_2 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[((unsigned)(sx + 2 * alpha)) >> + WARPEDDIFF_PREC_BITS]); + __m128i tmp_3 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[((unsigned)(sx + 3 * alpha)) >> + WARPEDDIFF_PREC_BITS]); + + __m128i tmp_4 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[((unsigned)(sx + 4 * alpha)) >> + WARPEDDIFF_PREC_BITS]); + __m128i tmp_5 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[((unsigned)(sx + 5 * alpha)) >> + WARPEDDIFF_PREC_BITS]); + __m128i tmp_6 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[((unsigned)(sx + 6 * alpha)) >> + WARPEDDIFF_PREC_BITS]); + __m128i tmp_7 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[((unsigned)(sx + 7 * alpha)) >> + WARPEDDIFF_PREC_BITS]); + + __m256i tmp0_256 = _mm256_castsi128_si256(tmp_0); + __m256i tmp2_256 = _mm256_castsi128_si256(tmp_2); + __m256i tmp1_256 = _mm256_castsi128_si256(tmp_1); + __m256i tmp3_256 = _mm256_castsi128_si256(tmp_3); + + __m256i tmp4_256 = _mm256_castsi128_si256(tmp_4); + __m256i tmp6_256 = _mm256_castsi128_si256(tmp_6); + __m256i tmp5_256 = _mm256_castsi128_si256(tmp_5); + __m256i tmp7_256 = _mm256_castsi128_si256(tmp_7); + + __m128i tmp_8 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 0 * alpha) >> + WARPEDDIFF_PREC_BITS]); + tmp0_256 = _mm256_inserti128_si256(tmp0_256, tmp_8, 1); + + __m128i tmp_9 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 1 * 
alpha) >> + WARPEDDIFF_PREC_BITS]); + tmp1_256 = _mm256_inserti128_si256(tmp1_256, tmp_9, 1); + + __m128i tmp_10 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 2 * alpha) >> + WARPEDDIFF_PREC_BITS]); + tmp2_256 = _mm256_inserti128_si256(tmp2_256, tmp_10, 1); + + __m128i tmp_11 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 3 * alpha) >> + WARPEDDIFF_PREC_BITS]); + tmp3_256 = _mm256_inserti128_si256(tmp3_256, tmp_11, 1); + + tmp_2 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 4 * alpha) >> + WARPEDDIFF_PREC_BITS]); + tmp4_256 = _mm256_inserti128_si256(tmp4_256, tmp_2, 1); + + tmp_3 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 5 * alpha) >> + WARPEDDIFF_PREC_BITS]); + tmp5_256 = _mm256_inserti128_si256(tmp5_256, tmp_3, 1); + + tmp_6 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 6 * alpha) >> + WARPEDDIFF_PREC_BITS]); + tmp6_256 = _mm256_inserti128_si256(tmp6_256, tmp_6, 1); + + tmp_7 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 7 * alpha) >> + WARPEDDIFF_PREC_BITS]); + tmp7_256 = _mm256_inserti128_si256(tmp7_256, tmp_7, 1); + + const __m256i tmp_12 = _mm256_unpacklo_epi16(tmp0_256, tmp2_256); + const __m256i tmp_13 = _mm256_unpacklo_epi16(tmp1_256, tmp3_256); + const __m256i tmp_14 = _mm256_unpacklo_epi16(tmp4_256, tmp6_256); + const __m256i tmp_15 = _mm256_unpacklo_epi16(tmp5_256, tmp7_256); + + const __m256i res_0 = _mm256_unpacklo_epi32(tmp_12, tmp_14); + const __m256i res_1 = _mm256_unpackhi_epi32(tmp_12, tmp_14); + const __m256i res_2 = _mm256_unpacklo_epi32(tmp_13, tmp_15); + const __m256i res_3 = _mm256_unpackhi_epi32(tmp_13, tmp_15); + + coeff[0] = _mm256_unpacklo_epi64(res_0, res_2); + coeff[1] = _mm256_unpackhi_epi64(res_0, res_2); + coeff[2] = _mm256_unpacklo_epi64(res_1, res_3); + coeff[3] = _mm256_unpackhi_epi64(res_1, res_3); +} + +static INLINE void 
prepare_horizontal_filter_coeff_beta0_avx2(int alpha, int sx, + __m256i *coeff) { + __m128i tmp_0 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS]); + __m128i tmp_1 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS]); + __m128i tmp_2 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS]); + __m128i tmp_3 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS]); + __m128i tmp_4 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS]); + __m128i tmp_5 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS]); + __m128i tmp_6 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS]); + __m128i tmp_7 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS]); + + tmp_0 = _mm_unpacklo_epi16(tmp_0, tmp_2); + tmp_1 = _mm_unpacklo_epi16(tmp_1, tmp_3); + tmp_4 = _mm_unpacklo_epi16(tmp_4, tmp_6); + tmp_5 = _mm_unpacklo_epi16(tmp_5, tmp_7); + + const __m256i tmp_12 = _mm256_broadcastsi128_si256(tmp_0); + const __m256i tmp_13 = _mm256_broadcastsi128_si256(tmp_1); + const __m256i tmp_14 = _mm256_broadcastsi128_si256(tmp_4); + const __m256i tmp_15 = _mm256_broadcastsi128_si256(tmp_5); + + const __m256i res_0 = _mm256_unpacklo_epi32(tmp_12, tmp_14); + const __m256i res_1 = _mm256_unpackhi_epi32(tmp_12, tmp_14); + const __m256i res_2 = _mm256_unpacklo_epi32(tmp_13, tmp_15); + const __m256i res_3 = _mm256_unpackhi_epi32(tmp_13, tmp_15); + + coeff[0] = _mm256_unpacklo_epi64(res_0, res_2); + coeff[1] = _mm256_unpackhi_epi64(res_0, res_2); + coeff[2] = _mm256_unpacklo_epi64(res_1, res_3); + coeff[3] = _mm256_unpackhi_epi64(res_1, res_3); +} + +static INLINE void prepare_horizontal_filter_coeff_alpha0_avx2(int beta, int sx, + __m256i *coeff) { + const __m128i tmp_0 = + _mm_loadl_epi64((__m128i 
*)&av1_filter_8bit[sx >> WARPEDDIFF_PREC_BITS]); + const __m128i tmp_1 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + beta) >> WARPEDDIFF_PREC_BITS]); + + const __m256i res_0 = + _mm256_inserti128_si256(_mm256_castsi128_si256(tmp_0), tmp_1, 0x1); + + coeff[0] = _mm256_shuffle_epi8( + res_0, _mm256_load_si256((__m256i *)shuffle_alpha0_mask01_avx2)); + coeff[1] = _mm256_shuffle_epi8( + res_0, _mm256_load_si256((__m256i *)shuffle_alpha0_mask23_avx2)); + coeff[2] = _mm256_shuffle_epi8( + res_0, _mm256_load_si256((__m256i *)shuffle_alpha0_mask45_avx2)); + coeff[3] = _mm256_shuffle_epi8( + res_0, _mm256_load_si256((__m256i *)shuffle_alpha0_mask67_avx2)); +} + +static INLINE void horizontal_filter_avx2(const __m256i src, __m256i *horz_out, + int sx, int alpha, int beta, int row, + const __m256i *shuffle_src, + const __m256i *round_const, + const __m128i *shift) { + __m256i coeff[4]; + prepare_horizontal_filter_coeff_avx2(alpha, beta, sx, coeff); + filter_src_pixels_avx2(src, horz_out, coeff, shuffle_src, round_const, shift, + row); +} +static INLINE void prepare_horizontal_filter_coeff(int alpha, int sx, + __m256i *coeff) { + const __m128i tmp_0 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS]); + const __m128i tmp_1 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS]); + const __m128i tmp_2 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS]); + const __m128i tmp_3 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS]); + const __m128i tmp_4 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS]); + const __m128i tmp_5 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS]); + const __m128i tmp_6 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS]); + const __m128i tmp_7 = _mm_loadl_epi64( + (__m128i 
      *)&av1_filter_8bit[(sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS]);

  // 8x8 16-bit transpose of the loaded taps so each coeff[] register holds
  // one tap pair for all eight output pixels (fed to _mm256_madd_epi16).
  const __m128i tmp_8 = _mm_unpacklo_epi16(tmp_0, tmp_2);
  const __m128i tmp_9 = _mm_unpacklo_epi16(tmp_1, tmp_3);
  const __m128i tmp_10 = _mm_unpacklo_epi16(tmp_4, tmp_6);
  const __m128i tmp_11 = _mm_unpacklo_epi16(tmp_5, tmp_7);

  const __m128i tmp_12 = _mm_unpacklo_epi32(tmp_8, tmp_10);
  const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_8, tmp_10);
  const __m128i tmp_14 = _mm_unpacklo_epi32(tmp_9, tmp_11);
  const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_9, tmp_11);

  coeff[0] = _mm256_castsi128_si256(_mm_unpacklo_epi64(tmp_12, tmp_14));
  coeff[1] = _mm256_castsi128_si256(_mm_unpackhi_epi64(tmp_12, tmp_14));
  coeff[2] = _mm256_castsi128_si256(_mm_unpacklo_epi64(tmp_13, tmp_15));
  coeff[3] = _mm256_castsi128_si256(_mm_unpackhi_epi64(tmp_13, tmp_15));
}

// Horizontal (first) pass of the warp filter, generic case (alpha != 0 and
// beta != 0).  Two source rows (k and k + 1) are packed into the two 128-bit
// lanes of one 256-bit register and filtered together; the final odd row is
// handled after the loop on its own.
static INLINE void warp_horizontal_filter_avx2(
    const uint8_t *ref, __m256i *horz_out, int stride, int32_t ix4, int32_t iy4,
    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
    const __m256i *round_const, const __m128i *shift,
    const __m256i *shuffle_src) {
  int k, iy, sx, row = 0;
  __m256i coeff[4];
  for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) {
    // Row k, clamped to the valid range of the reference frame.
    iy = iy4 + k;
    iy = clamp(iy, 0, height - 1);
    const __m128i src_0 =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
    // Row k + 1.
    iy = iy4 + k + 1;
    iy = clamp(iy, 0, height - 1);
    const __m128i src_1 =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
    const __m256i src_01 =
        _mm256_inserti128_si256(_mm256_castsi128_si256(src_0), src_1, 0x1);
    // sx advances by beta per row; (k + 4) indexes rows from the block top.
    sx = sx4 + beta * (k + 4);
    horizontal_filter_avx2(src_01, horz_out, sx, alpha, beta, row, shuffle_src,
                           round_const, shift);
    row += 1;
  }
  // Last (odd) row: only one row remains, filter it by itself.
  iy = iy4 + k;
  iy = clamp(iy, 0, height - 1);
  const __m256i src_01 = _mm256_castsi128_si256(
      _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)));
  sx = sx4 + beta * (k + 4);
  prepare_horizontal_filter_coeff(alpha, sx, coeff);
  filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const,
                         shift, row);
}

// Horizontal pass, alpha == 0 special case: the taps do not vary across a
// row, so one coefficient set (derived from beta only) is prepared per row
// pair instead of per pixel.
static INLINE void warp_horizontal_filter_alpha0_avx2(
    const uint8_t *ref, __m256i *horz_out, int stride, int32_t ix4, int32_t iy4,
    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
    const __m256i *round_const, const __m128i *shift,
    const __m256i *shuffle_src) {
  (void)alpha;
  int k, iy, sx, row = 0;
  __m256i coeff[4];
  for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) {
    iy = iy4 + k;
    iy = clamp(iy, 0, height - 1);
    const __m128i src_0 =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
    iy = iy4 + k + 1;
    iy = clamp(iy, 0, height - 1);
    const __m128i src_1 =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
    const __m256i src_01 =
        _mm256_inserti128_si256(_mm256_castsi128_si256(src_0), src_1, 0x1);
    sx = sx4 + beta * (k + 4);
    prepare_horizontal_filter_coeff_alpha0_avx2(beta, sx, coeff);
    filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const,
                           shift, row);
    row += 1;
  }
  // Final odd row.
  iy = iy4 + k;
  iy = clamp(iy, 0, height - 1);
  const __m256i src_01 = _mm256_castsi128_si256(
      _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)));
  sx = sx4 + beta * (k + 4);
  prepare_horizontal_filter_coeff_alpha0_avx2(beta, sx, coeff);
  filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const,
                         shift, row);
}

// Horizontal pass, beta == 0 special case: sx does not change from row to
// row, so the coefficients are prepared once, before the loop.
static INLINE void warp_horizontal_filter_beta0_avx2(
    const uint8_t *ref, __m256i *horz_out, int stride, int32_t ix4, int32_t iy4,
    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
    const __m256i *round_const, const __m128i *shift,
    const __m256i *shuffle_src) {
  (void)beta;
  int k, iy, row = 0;
  __m256i coeff[4];
  prepare_horizontal_filter_coeff_beta0_avx2(alpha, sx4, coeff);
  for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) {
    iy = iy4 + k;
    iy = clamp(iy, 0, height - 1);
    const __m128i src_0 =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
    iy = iy4 + k + 1;
    iy = clamp(iy, 0, height - 1);
    const __m128i src_1 =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
    const __m256i src_01 =
        _mm256_inserti128_si256(_mm256_castsi128_si256(src_0), src_1, 0x1);
    filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const,
                           shift, row);
    row += 1;
  }
  // Final odd row.
  iy = iy4 + k;
  iy = clamp(iy, 0, height - 1);
  const __m256i src_01 = _mm256_castsi128_si256(
      _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)));
  filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const,
                         shift, row);
}

// Horizontal pass, alpha == 0 && beta == 0: a single coefficient set is
// valid for the whole block, prepared once before the loop.
static INLINE void warp_horizontal_filter_alpha0_beta0_avx2(
    const uint8_t *ref, __m256i *horz_out, int stride, int32_t ix4, int32_t iy4,
    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
    const __m256i *round_const, const __m128i *shift,
    const __m256i *shuffle_src) {
  (void)alpha;
  int k, iy, row = 0;
  __m256i coeff[4];
  prepare_horizontal_filter_coeff_alpha0_avx2(beta, sx4, coeff);
  for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) {
    iy = iy4 + k;
    iy = clamp(iy, 0, height - 1);
    const __m128i src0 =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
    iy = iy4 + k + 1;
    iy = clamp(iy, 0, height - 1);
    const __m128i src1 =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
    const __m256i src_01 =
        _mm256_inserti128_si256(_mm256_castsi128_si256(src0), src1, 0x1);
    filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const,
                           shift, row);
    row += 1;
  }
  // Final odd row.
  iy = iy4 + k;
  iy = clamp(iy, 0, height - 1);
  const __m256i src_01 = _mm256_castsi128_si256(
      _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)));
  filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const,
                         shift, row);
}

// Build the constants used by the compound (dist-weighted) averaging path.
static INLINE void unpack_weights_and_set_round_const_avx2(
    ConvolveParams *conv_params, const int round_bits, const int offset_bits,
    __m256i *res_sub_const, __m256i
*round_bits_const, __m256i *wt) {
  // Constant subtracted after averaging to remove the intermediate offset,
  // and the rounding constant for the final right shift by round_bits.
  *res_sub_const =
      _mm256_set1_epi16(-(1 << (offset_bits - conv_params->round_1)) -
                        (1 << (offset_bits - conv_params->round_1 - 1)));
  *round_bits_const = _mm256_set1_epi16(((1 << round_bits) >> 1));

  // Interleave the forward/backward weights so a single madd computes the
  // weighted sum of the two predictions.
  const int w0 = conv_params->fwd_offset;
  const int w1 = conv_params->bck_offset;
  const __m256i wt0 = _mm256_set1_epi16((short)w0);
  const __m256i wt1 = _mm256_set1_epi16((short)w1);
  *wt = _mm256_unpacklo_epi16(wt0, wt1);
}

// Load and transpose vertical filter taps for two consecutive output rows:
// lane 0 of every 256-bit register serves row sy, lane 1 serves row
// sy + delta.  coeffs[0..3] hold the even-position taps, coeffs[4..7] the
// odd-position taps.
static INLINE void prepare_vertical_filter_coeffs_avx2(int gamma, int delta,
                                                       int sy,
                                                       __m256i *coeffs) {
  __m128i filt_00 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
  __m128i filt_01 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
  __m128i filt_02 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
  __m128i filt_03 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));

  __m128i filt_10 = _mm_loadu_si128(
      (__m128i *)(av1_warped_filter +
                  (((sy + delta) + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
  __m128i filt_11 = _mm_loadu_si128(
      (__m128i *)(av1_warped_filter +
                  (((sy + delta) + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
  __m128i filt_12 = _mm_loadu_si128(
      (__m128i *)(av1_warped_filter +
                  (((sy + delta) + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
  __m128i filt_13 = _mm_loadu_si128(
      (__m128i *)(av1_warped_filter +
                  (((sy + delta) + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));

  // Pair row-sy taps (low lane) with row-(sy + delta) taps (high lane).
  __m256i filt_0 =
      _mm256_inserti128_si256(_mm256_castsi128_si256(filt_00), filt_10, 0x1);
  __m256i filt_1 =
      _mm256_inserti128_si256(_mm256_castsi128_si256(filt_01), filt_11, 0x1);
  __m256i filt_2 =
      _mm256_inserti128_si256(_mm256_castsi128_si256(filt_02), filt_12, 0x1);
  __m256i filt_3 =
      _mm256_inserti128_si256(_mm256_castsi128_si256(filt_03), filt_13, 0x1);

  __m256i res_0 = _mm256_unpacklo_epi32(filt_0, filt_1);
  __m256i res_1 = _mm256_unpacklo_epi32(filt_2, filt_3);
  __m256i res_2 = _mm256_unpackhi_epi32(filt_0, filt_1);
  __m256i res_3 = _mm256_unpackhi_epi32(filt_2, filt_3);

  coeffs[0] = _mm256_unpacklo_epi64(res_0, res_1);
  coeffs[1] = _mm256_unpackhi_epi64(res_0, res_1);
  coeffs[2] = _mm256_unpacklo_epi64(res_2, res_3);
  coeffs[3] = _mm256_unpackhi_epi64(res_2, res_3);

  // Same transpose for the odd output positions (1, 3, 5, 7).
  filt_00 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
  filt_01 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
  filt_02 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
  filt_03 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));

  filt_10 = _mm_loadu_si128(
      (__m128i *)(av1_warped_filter +
                  (((sy + delta) + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
  filt_11 = _mm_loadu_si128(
      (__m128i *)(av1_warped_filter +
                  (((sy + delta) + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
  filt_12 = _mm_loadu_si128(
      (__m128i *)(av1_warped_filter +
                  (((sy + delta) + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
  filt_13 = _mm_loadu_si128(
      (__m128i *)(av1_warped_filter +
                  (((sy + delta) + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));

  filt_0 =
      _mm256_inserti128_si256(_mm256_castsi128_si256(filt_00), filt_10, 0x1);
  filt_1 =
      _mm256_inserti128_si256(_mm256_castsi128_si256(filt_01), filt_11, 0x1);
  filt_2 =
      _mm256_inserti128_si256(_mm256_castsi128_si256(filt_02), filt_12, 0x1);
  filt_3 =
      _mm256_inserti128_si256(_mm256_castsi128_si256(filt_03), filt_13, 0x1);

  res_0 = _mm256_unpacklo_epi32(filt_0, filt_1);
  res_1 = _mm256_unpacklo_epi32(filt_2, filt_3);
  res_2 = _mm256_unpackhi_epi32(filt_0, filt_1);
  res_3 = _mm256_unpackhi_epi32(filt_2, filt_3);

  coeffs[4] = _mm256_unpacklo_epi64(res_0, res_1);
  coeffs[5] = _mm256_unpackhi_epi64(res_0, res_1);
  coeffs[6] = _mm256_unpacklo_epi64(res_2, res_3);
  coeffs[7] = _mm256_unpackhi_epi64(res_2, res_3);
}

// delta == 0 variant: both output rows use the same taps, so each 128-bit
// tap set is broadcast to both lanes instead of loading two rows' worth.
static INLINE void prepare_vertical_filter_coeffs_delta0_avx2(int gamma, int sy,
                                                              __m256i *coeffs) {
  __m128i filt_00 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
  __m128i filt_01 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
  __m128i filt_02 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
  __m128i filt_03 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));

  __m256i filt_0 = _mm256_broadcastsi128_si256(filt_00);
  __m256i filt_1 = _mm256_broadcastsi128_si256(filt_01);
  __m256i filt_2 = _mm256_broadcastsi128_si256(filt_02);
  __m256i filt_3 = _mm256_broadcastsi128_si256(filt_03);

  __m256i res_0 = _mm256_unpacklo_epi32(filt_0, filt_1);
  __m256i res_1 = _mm256_unpacklo_epi32(filt_2, filt_3);
  __m256i res_2 = _mm256_unpackhi_epi32(filt_0, filt_1);
  __m256i res_3 = _mm256_unpackhi_epi32(filt_2, filt_3);

  coeffs[0] = _mm256_unpacklo_epi64(res_0, res_1);
  coeffs[1] = _mm256_unpackhi_epi64(res_0, res_1);
  coeffs[2] = _mm256_unpacklo_epi64(res_2, res_3);
  coeffs[3] = _mm256_unpackhi_epi64(res_2, res_3);

  filt_00 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
  filt_01 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
  filt_02 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
  filt_03 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));

  filt_0 = _mm256_broadcastsi128_si256(filt_00);
  filt_1 = _mm256_broadcastsi128_si256(filt_01);
  filt_2 = _mm256_broadcastsi128_si256(filt_02);
  filt_3 = _mm256_broadcastsi128_si256(filt_03);

  res_0 = _mm256_unpacklo_epi32(filt_0, filt_1);
  res_1 = _mm256_unpacklo_epi32(filt_2, filt_3);
  res_2 = _mm256_unpackhi_epi32(filt_0, filt_1);
  res_3 = _mm256_unpackhi_epi32(filt_2, filt_3);

  coeffs[4] = _mm256_unpacklo_epi64(res_0, res_1);
  coeffs[5] = _mm256_unpackhi_epi64(res_0, res_1);
  coeffs[6] = _mm256_unpacklo_epi64(res_2, res_3);
  coeffs[7] = _mm256_unpackhi_epi64(res_2, res_3);
}

// gamma == 0 variant: one tap set per row; replicate it across all pixel
// positions with precomputed shuffle masks, then mirror even taps into the
// odd slots (they are identical when gamma is zero).
static INLINE void prepare_vertical_filter_coeffs_gamma0_avx2(int delta, int sy,
                                                              __m256i *coeffs) {
  const __m128i filt_0 = _mm_loadu_si128(
      (__m128i *)(av1_warped_filter + (sy >> WARPEDDIFF_PREC_BITS)));
  const __m128i filt_1 = _mm_loadu_si128(
      (__m128i *)(av1_warped_filter + ((sy + delta) >> WARPEDDIFF_PREC_BITS)));

  __m256i res_0 =
      _mm256_inserti128_si256(_mm256_castsi128_si256(filt_0), filt_1, 0x1);

  coeffs[0] = _mm256_shuffle_epi8(
      res_0, _mm256_load_si256((__m256i *)shuffle_gamma0_mask0_avx2));
  coeffs[1] = _mm256_shuffle_epi8(
      res_0, _mm256_load_si256((__m256i *)shuffle_gamma0_mask1_avx2));
  coeffs[2] = _mm256_shuffle_epi8(
      res_0, _mm256_load_si256((__m256i *)shuffle_gamma0_mask2_avx2));
  coeffs[3] = _mm256_shuffle_epi8(
      res_0, _mm256_load_si256((__m256i *)shuffle_gamma0_mask3_avx2));

  coeffs[4] = coeffs[0];
  coeffs[5] = coeffs[1];
  coeffs[6] = coeffs[2];
  coeffs[7] = coeffs[3];
}

// Apply the 8-tap vertical filter to the interleaved horizontal-pass output.
// src[0..5] are carried in by the caller; src[6]/src[7] are built here from
// horz_out.  Results for pixels 0-3 go to *res_lo, pixels 4-7 to *res_hi.
static INLINE void filter_src_pixels_vertical_avx2(__m256i *horz_out,
                                                   __m256i *src,
                                                   __m256i *coeffs,
                                                   __m256i *res_lo,
                                                   __m256i *res_hi, int row) {
  const __m256i src_6 = horz_out[row + 3];
  const __m256i src_7 =
      _mm256_permute2x128_si256(horz_out[row + 3], horz_out[row + 4], 0x21);

  src[6] = _mm256_unpacklo_epi16(src_6, src_7);

  const __m256i res_0 = _mm256_madd_epi16(src[0], coeffs[0]);
  const __m256i res_2 = _mm256_madd_epi16(src[2], coeffs[1]);
  const __m256i res_4 = _mm256_madd_epi16(src[4], coeffs[2]);
  const __m256i res_6 = _mm256_madd_epi16(src[6], coeffs[3]);

  const __m256i res_even = _mm256_add_epi32(_mm256_add_epi32(res_0,
                                                            res_2),
                                           _mm256_add_epi32(res_4, res_6));

  src[7] = _mm256_unpackhi_epi16(src_6, src_7);

  const __m256i res_1 = _mm256_madd_epi16(src[1], coeffs[4]);
  const __m256i res_3 = _mm256_madd_epi16(src[3], coeffs[5]);
  const __m256i res_5 = _mm256_madd_epi16(src[5], coeffs[6]);
  const __m256i res_7 = _mm256_madd_epi16(src[7], coeffs[7]);

  const __m256i res_odd = _mm256_add_epi32(_mm256_add_epi32(res_1, res_3),
                                           _mm256_add_epi32(res_5, res_7));

  // Rearrange pixels back into the order 0 ... 7
  *res_lo = _mm256_unpacklo_epi32(res_even, res_odd);
  *res_hi = _mm256_unpackhi_epi32(res_even, res_odd);
}

// Round the vertical-pass accumulators and store two output rows (k and
// k + 1).  In the compound path the result is written to conv_params->dst
// as 16-bit, optionally averaged (plain or distance-weighted) with the
// existing prediction into 8-bit 'pred'; otherwise it is rounded straight
// to 8-bit 'pred'.  res_lo covers pixels 0-3, res_hi pixels 4-7.
static INLINE void store_vertical_filter_output_avx2(
    const __m256i *res_lo, const __m256i *res_hi, const __m256i *res_add_const,
    const __m256i *wt, const __m256i *res_sub_const,
    const __m256i *round_bits_const, uint8_t *pred, ConvolveParams *conv_params,
    int i, int j, int k, const int reduce_bits_vert, int p_stride, int p_width,
    const int round_bits) {
  __m256i res_lo_1 = *res_lo;
  __m256i res_hi_1 = *res_hi;

  if (conv_params->is_compound) {
    // 16-bit intermediate destinations for rows k and k + 1.
    __m128i *const p_0 =
        (__m128i *)&conv_params->dst[(i + k + 4) * conv_params->dst_stride + j];
    __m128i *const p_1 =
        (__m128i *)&conv_params
            ->dst[(i + (k + 1) + 4) * conv_params->dst_stride + j];

    res_lo_1 = _mm256_srai_epi32(_mm256_add_epi32(res_lo_1, *res_add_const),
                                 reduce_bits_vert);

    const __m256i temp_lo_16 = _mm256_packus_epi32(res_lo_1, res_lo_1);
    __m256i res_lo_16;
    if (conv_params->do_average) {
      __m128i *const dst8_0 = (__m128i *)&pred[(i + k + 4) * p_stride + j];
      __m128i *const dst8_1 =
          (__m128i *)&pred[(i + (k + 1) + 4) * p_stride + j];
      const __m128i p_16_0 = _mm_loadl_epi64(p_0);
      const __m128i p_16_1 = _mm_loadl_epi64(p_1);
      const __m256i p_16 =
          _mm256_inserti128_si256(_mm256_castsi128_si256(p_16_0), p_16_1, 1);
      if (conv_params->use_dist_wtd_comp_avg) {
        // Distance-weighted average: (p * w0 + cur * w1) >> DIST_PRECISION_BITS
        const __m256i p_16_lo = _mm256_unpacklo_epi16(p_16, temp_lo_16);
        const __m256i wt_res_lo = _mm256_madd_epi16(p_16_lo, *wt);
        const __m256i shifted_32 =
            _mm256_srai_epi32(wt_res_lo, DIST_PRECISION_BITS);
        res_lo_16 = _mm256_packus_epi32(shifted_32, shifted_32);
      } else {
        // Plain average of the two predictions.
        res_lo_16 = _mm256_srai_epi16(_mm256_add_epi16(p_16, temp_lo_16), 1);
      }
      // Remove the intermediate offset and round down to 8-bit range.
      res_lo_16 = _mm256_add_epi16(res_lo_16, *res_sub_const);
      res_lo_16 = _mm256_srai_epi16(
          _mm256_add_epi16(res_lo_16, *round_bits_const), round_bits);
      const __m256i res_8_lo = _mm256_packus_epi16(res_lo_16, res_lo_16);
      const __m128i res_8_lo_0 = _mm256_castsi256_si128(res_8_lo);
      const __m128i res_8_lo_1 = _mm256_extracti128_si256(res_8_lo, 1);
      *(uint32_t *)dst8_0 = _mm_cvtsi128_si32(res_8_lo_0);
      *(uint32_t *)dst8_1 = _mm_cvtsi128_si32(res_8_lo_1);
    } else {
      const __m128i temp_lo_16_0 = _mm256_castsi256_si128(temp_lo_16);
      const __m128i temp_lo_16_1 = _mm256_extracti128_si256(temp_lo_16, 1);
      _mm_storel_epi64(p_0, temp_lo_16_0);
      _mm_storel_epi64(p_1, temp_lo_16_1);
    }
    if (p_width > 4) {
      // Same processing for pixels 4-7.
      __m128i *const p4_0 =
          (__m128i *)&conv_params
              ->dst[(i + k + 4) * conv_params->dst_stride + j + 4];
      __m128i *const p4_1 =
          (__m128i *)&conv_params
              ->dst[(i + (k + 1) + 4) * conv_params->dst_stride + j + 4];
      res_hi_1 = _mm256_srai_epi32(_mm256_add_epi32(res_hi_1, *res_add_const),
                                   reduce_bits_vert);
      const __m256i temp_hi_16 = _mm256_packus_epi32(res_hi_1, res_hi_1);
      __m256i res_hi_16;
      if (conv_params->do_average) {
        __m128i *const dst8_4_0 =
            (__m128i *)&pred[(i + k + 4) * p_stride + j + 4];
        __m128i *const dst8_4_1 =
            (__m128i *)&pred[(i + (k + 1) + 4) * p_stride + j + 4];
        const __m128i p4_16_0 = _mm_loadl_epi64(p4_0);
        const __m128i p4_16_1 = _mm_loadl_epi64(p4_1);
        const __m256i p4_16 = _mm256_inserti128_si256(
            _mm256_castsi128_si256(p4_16_0), p4_16_1, 1);
        if (conv_params->use_dist_wtd_comp_avg) {
          const __m256i p_16_hi = _mm256_unpacklo_epi16(p4_16, temp_hi_16);
          const __m256i wt_res_hi = _mm256_madd_epi16(p_16_hi, *wt);
          const __m256i shifted_32 =
              _mm256_srai_epi32(wt_res_hi, DIST_PRECISION_BITS);
          res_hi_16 = _mm256_packus_epi32(shifted_32, shifted_32);
        } else {
          res_hi_16 = _mm256_srai_epi16(_mm256_add_epi16(p4_16, temp_hi_16), 1);
        }
        res_hi_16 = _mm256_add_epi16(res_hi_16, *res_sub_const);
        res_hi_16 = _mm256_srai_epi16(
            _mm256_add_epi16(res_hi_16, *round_bits_const), round_bits);
        __m256i res_8_hi = _mm256_packus_epi16(res_hi_16, res_hi_16);
        const __m128i res_8_hi_0 = _mm256_castsi256_si128(res_8_hi);
        const __m128i res_8_hi_1 = _mm256_extracti128_si256(res_8_hi, 1);
        *(uint32_t *)dst8_4_0 = _mm_cvtsi128_si32(res_8_hi_0);
        *(uint32_t *)dst8_4_1 = _mm_cvtsi128_si32(res_8_hi_1);
      } else {
        const __m128i temp_hi_16_0 = _mm256_castsi256_si128(temp_hi_16);
        const __m128i temp_hi_16_1 = _mm256_extracti128_si256(temp_hi_16, 1);
        _mm_storel_epi64(p4_0, temp_hi_16_0);
        _mm_storel_epi64(p4_1, temp_hi_16_1);
      }
    }
  } else {
    // Non-compound path: round directly to 8-bit output.
    const __m256i res_lo_round = _mm256_srai_epi32(
        _mm256_add_epi32(res_lo_1, *res_add_const), reduce_bits_vert);
    const __m256i res_hi_round = _mm256_srai_epi32(
        _mm256_add_epi32(res_hi_1, *res_add_const), reduce_bits_vert);

    const __m256i res_16bit = _mm256_packs_epi32(res_lo_round, res_hi_round);
    const __m256i res_8bit = _mm256_packus_epi16(res_16bit, res_16bit);
    const __m128i res_8bit0 = _mm256_castsi256_si128(res_8bit);
    const __m128i res_8bit1 = _mm256_extracti128_si256(res_8bit, 1);

    // Store, blending with 'pred' if needed
    __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j];
    __m128i *const p1 = (__m128i *)&pred[(i + (k + 1) + 4) * p_stride + j];

    if (p_width == 4) {
      *(uint32_t *)p = _mm_cvtsi128_si32(res_8bit0);
      *(uint32_t *)p1 = _mm_cvtsi128_si32(res_8bit1);
    } else {
      _mm_storel_epi64(p, res_8bit0);
      _mm_storel_epi64(p1, res_8bit1);
    }
  }
}

// Vertical (second) pass of the warp filter, generic case: recomputes the
// coefficients for every row pair since both gamma and delta are non-zero.
static INLINE void warp_vertical_filter_avx2(
    uint8_t *pred, __m256i *horz_out, ConvolveParams *conv_params,
    int16_t gamma, int16_t delta, int p_height, int p_stride, int
                                                                  p_width,
    int i, int j, int sy4, const int reduce_bits_vert,
    const __m256i *res_add_const, const int round_bits,
    const __m256i *res_sub_const, const __m256i *round_bits_const,
    const __m256i *wt) {
  int k, row = 0;
  __m256i src[8];
  // Interleave consecutive horizontal-pass rows pairwise; the permute pulls
  // in the next row so each src[] holds a (row, row + 1) 16-bit pair.
  const __m256i src_0 = horz_out[0];
  const __m256i src_1 =
      _mm256_permute2x128_si256(horz_out[0], horz_out[1], 0x21);
  const __m256i src_2 = horz_out[1];
  const __m256i src_3 =
      _mm256_permute2x128_si256(horz_out[1], horz_out[2], 0x21);
  const __m256i src_4 = horz_out[2];
  const __m256i src_5 =
      _mm256_permute2x128_si256(horz_out[2], horz_out[3], 0x21);

  src[0] = _mm256_unpacklo_epi16(src_0, src_1);
  src[2] = _mm256_unpacklo_epi16(src_2, src_3);
  src[4] = _mm256_unpacklo_epi16(src_4, src_5);

  src[1] = _mm256_unpackhi_epi16(src_0, src_1);
  src[3] = _mm256_unpackhi_epi16(src_2, src_3);
  src[5] = _mm256_unpackhi_epi16(src_4, src_5);

  for (k = -4; k < AOMMIN(4, p_height - i - 4); k += 2) {
    int sy = sy4 + delta * (k + 4);
    __m256i coeffs[8];
    prepare_vertical_filter_coeffs_avx2(gamma, delta, sy, coeffs);
    __m256i res_lo, res_hi;
    filter_src_pixels_vertical_avx2(horz_out, src, coeffs, &res_lo, &res_hi,
                                    row);
    store_vertical_filter_output_avx2(&res_lo, &res_hi, res_add_const, wt,
                                      res_sub_const, round_bits_const, pred,
                                      conv_params, i, j, k, reduce_bits_vert,
                                      p_stride, p_width, round_bits);
    // Slide the source window down by two rows for the next iteration.
    src[0] = src[2];
    src[2] = src[4];
    src[4] = src[6];
    src[1] = src[3];
    src[3] = src[5];
    src[5] = src[7];

    row += 1;
  }
}

// Vertical pass, gamma == 0: taps do not vary across the row, so the cheap
// gamma0 coefficient preparation is used (still recomputed per row pair).
static INLINE void warp_vertical_filter_gamma0_avx2(
    uint8_t *pred, __m256i *horz_out, ConvolveParams *conv_params,
    int16_t gamma, int16_t delta, int p_height, int p_stride, int p_width,
    int i, int j, int sy4, const int reduce_bits_vert,
    const __m256i *res_add_const, const int round_bits,
    const __m256i *res_sub_const, const __m256i *round_bits_const,
    const __m256i *wt) {
  (void)gamma;
  int k, row = 0;
  __m256i src[8];
  const __m256i src_0 = horz_out[0];
  const __m256i src_1 =
      _mm256_permute2x128_si256(horz_out[0], horz_out[1], 0x21);
  const __m256i src_2 = horz_out[1];
  const __m256i src_3 =
      _mm256_permute2x128_si256(horz_out[1], horz_out[2], 0x21);
  const __m256i src_4 = horz_out[2];
  const __m256i src_5 =
      _mm256_permute2x128_si256(horz_out[2], horz_out[3], 0x21);

  src[0] = _mm256_unpacklo_epi16(src_0, src_1);
  src[2] = _mm256_unpacklo_epi16(src_2, src_3);
  src[4] = _mm256_unpacklo_epi16(src_4, src_5);

  src[1] = _mm256_unpackhi_epi16(src_0, src_1);
  src[3] = _mm256_unpackhi_epi16(src_2, src_3);
  src[5] = _mm256_unpackhi_epi16(src_4, src_5);

  for (k = -4; k < AOMMIN(4, p_height - i - 4); k += 2) {
    int sy = sy4 + delta * (k + 4);
    __m256i coeffs[8];
    prepare_vertical_filter_coeffs_gamma0_avx2(delta, sy, coeffs);
    __m256i res_lo, res_hi;
    filter_src_pixels_vertical_avx2(horz_out, src, coeffs, &res_lo, &res_hi,
                                    row);
    store_vertical_filter_output_avx2(&res_lo, &res_hi, res_add_const, wt,
                                      res_sub_const, round_bits_const, pred,
                                      conv_params, i, j, k, reduce_bits_vert,
                                      p_stride, p_width, round_bits);
    src[0] = src[2];
    src[2] = src[4];
    src[4] = src[6];
    src[1] = src[3];
    src[3] = src[5];
    src[5] = src[7];
    row += 1;
  }
}

// Vertical pass, delta == 0: sy is constant across rows, so coefficients are
// prepared once before the loop.
static INLINE void warp_vertical_filter_delta0_avx2(
    uint8_t *pred, __m256i *horz_out, ConvolveParams *conv_params,
    int16_t gamma, int16_t delta, int p_height, int p_stride, int p_width,
    int i, int j, int sy4, const int reduce_bits_vert,
    const __m256i *res_add_const, const int round_bits,
    const __m256i *res_sub_const, const __m256i *round_bits_const,
    const __m256i *wt) {
  (void)delta;
  int k, row = 0;
  __m256i src[8], coeffs[8];
  const __m256i src_0 = horz_out[0];
  const __m256i src_1 =
      _mm256_permute2x128_si256(horz_out[0], horz_out[1], 0x21);
  const __m256i src_2 = horz_out[1];
  const __m256i src_3 =
      _mm256_permute2x128_si256(horz_out[1], horz_out[2], 0x21);
  const __m256i src_4 = horz_out[2];
  const __m256i src_5 =
      _mm256_permute2x128_si256(horz_out[2], horz_out[3], 0x21);

  src[0] = _mm256_unpacklo_epi16(src_0, src_1);
  src[2] = _mm256_unpacklo_epi16(src_2, src_3);
  src[4] = _mm256_unpacklo_epi16(src_4, src_5);

  src[1] = _mm256_unpackhi_epi16(src_0, src_1);
  src[3] = _mm256_unpackhi_epi16(src_2, src_3);
  src[5] = _mm256_unpackhi_epi16(src_4, src_5);

  prepare_vertical_filter_coeffs_delta0_avx2(gamma, sy4, coeffs);

  for (k = -4; k < AOMMIN(4, p_height - i - 4); k += 2) {
    __m256i res_lo, res_hi;
    filter_src_pixels_vertical_avx2(horz_out, src, coeffs, &res_lo, &res_hi,
                                    row);
    store_vertical_filter_output_avx2(&res_lo, &res_hi, res_add_const, wt,
                                      res_sub_const, round_bits_const, pred,
                                      conv_params, i, j, k, reduce_bits_vert,
                                      p_stride, p_width, round_bits);
    src[0] = src[2];
    src[2] = src[4];
    src[4] = src[6];
    src[1] = src[3];
    src[3] = src[5];
    src[5] = src[7];
    row += 1;
  }
}

// Vertical pass, gamma == 0 && delta == 0: one coefficient set serves the
// whole block, prepared once with the gamma0 path.
static INLINE void warp_vertical_filter_gamma0_delta0_avx2(
    uint8_t *pred, __m256i *horz_out, ConvolveParams *conv_params,
    int16_t gamma, int16_t delta, int p_height, int p_stride, int p_width,
    int i, int j, int sy4, const int reduce_bits_vert,
    const __m256i *res_add_const, const int round_bits,
    const __m256i *res_sub_const, const __m256i *round_bits_const,
    const __m256i *wt) {
  (void)gamma;
  int k, row = 0;
  __m256i src[8], coeffs[8];
  const __m256i src_0 = horz_out[0];
  const __m256i src_1 =
      _mm256_permute2x128_si256(horz_out[0], horz_out[1], 0x21);
  const __m256i src_2 = horz_out[1];
  const __m256i src_3 =
      _mm256_permute2x128_si256(horz_out[1], horz_out[2], 0x21);
  const __m256i src_4 = horz_out[2];
  const __m256i src_5 =
      _mm256_permute2x128_si256(horz_out[2], horz_out[3], 0x21);

  src[0] = _mm256_unpacklo_epi16(src_0, src_1);
  src[2] = _mm256_unpacklo_epi16(src_2, src_3);
  src[4] = _mm256_unpacklo_epi16(src_4, src_5);

  src[1] = _mm256_unpackhi_epi16(src_0, src_1);
  src[3] = _mm256_unpackhi_epi16(src_2, src_3);
  src[5] = _mm256_unpackhi_epi16(src_4, src_5);

  prepare_vertical_filter_coeffs_gamma0_avx2(delta, sy4, coeffs);

  for (k = -4; k < AOMMIN(4, p_height - i - 4); k += 2) {
    __m256i res_lo, res_hi;
    filter_src_pixels_vertical_avx2(horz_out, src, coeffs, &res_lo, &res_hi,
                                    row);
    store_vertical_filter_output_avx2(&res_lo, &res_hi, res_add_const, wt,
                                      res_sub_const, round_bits_const, pred,
                                      conv_params, i, j, k, reduce_bits_vert,
                                      p_stride, p_width, round_bits);
    src[0] = src[2];
    src[2] = src[4];
    src[4] = src[6];
    src[1] = src[3];
    src[3] = src[5];
    src[5] = src[7];
    row += 1;
  }
}

// Dispatch to the vertical-filter specialization matching which of gamma
// and delta are zero.
static INLINE void prepare_warp_vertical_filter_avx2(
    uint8_t *pred, __m256i *horz_out, ConvolveParams *conv_params,
    int16_t gamma, int16_t delta, int p_height, int p_stride, int p_width,
    int i, int j, int sy4, const int reduce_bits_vert,
    const __m256i *res_add_const, const int round_bits,
    const __m256i *res_sub_const, const __m256i *round_bits_const,
    const __m256i *wt) {
  if (gamma == 0 && delta == 0)
    warp_vertical_filter_gamma0_delta0_avx2(
        pred, horz_out, conv_params, gamma, delta, p_height, p_stride, p_width,
        i, j, sy4, reduce_bits_vert, res_add_const, round_bits, res_sub_const,
        round_bits_const, wt);
  else if (gamma == 0 && delta != 0)
    warp_vertical_filter_gamma0_avx2(
        pred, horz_out, conv_params, gamma, delta, p_height, p_stride, p_width,
        i, j, sy4, reduce_bits_vert, res_add_const, round_bits, res_sub_const,
        round_bits_const, wt);
  else if (gamma != 0 && delta == 0)
    warp_vertical_filter_delta0_avx2(
        pred, horz_out, conv_params, gamma, delta, p_height, p_stride, p_width,
        i, j, sy4, reduce_bits_vert, res_add_const, round_bits, res_sub_const,
        round_bits_const, wt);
  else
    warp_vertical_filter_avx2(pred, horz_out, conv_params, gamma, delta,
                              p_height, p_stride, p_width, i, j, sy4,
                              reduce_bits_vert, res_add_const, round_bits,
                              res_sub_const, round_bits_const, wt);
}

// Dispatch to the horizontal-filter specialization matching which of alpha
// and beta are zero.
static INLINE void prepare_warp_horizontal_filter_avx2(
    const uint8_t *ref, __m256i *horz_out, int stride, int32_t ix4, int32_t iy4,
    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
    const __m256i *round_const, const __m128i *shift,
    const __m256i *shuffle_src) {
  if (alpha == 0 && beta == 0)
    warp_horizontal_filter_alpha0_beta0_avx2(
        ref, horz_out, stride, ix4, iy4, sx4, alpha, beta, p_height, height, i,
        round_const, shift, shuffle_src);
  else if (alpha == 0 && beta != 0)
    warp_horizontal_filter_alpha0_avx2(ref, horz_out, stride, ix4, iy4, sx4,
                                       alpha, beta, p_height, height, i,
                                       round_const, shift, shuffle_src);
  else if (alpha != 0 && beta == 0)
    warp_horizontal_filter_beta0_avx2(ref, horz_out, stride, ix4, iy4, sx4,
                                      alpha, beta, p_height, height, i,
                                      round_const, shift, shuffle_src);
  else
    warp_horizontal_filter_avx2(ref, horz_out, stride, ix4, iy4, sx4, alpha,
                                beta, p_height, height, i, round_const, shift,
                                shuffle_src);
}

// Sum of per-pixel errors between 'ref' and 'dst' over a p_width x p_height
// region.  Processes 4 rows x 16 pixels per iteration; per-pixel differences
// are biased by 255 to index error_measure_lut (range of dst - ref is
// [-255, 255]).  Width/height remainders fall back to the scalar
// error_measure().  NOTE(review): the 16-byte loads are aligned
// (_mm_load_si128) — presumably ref/dst rows are 16-byte aligned here;
// confirm against callers.
int64_t av1_calc_frame_error_avx2(const uint8_t *const ref, int ref_stride,
                                  const uint8_t *const dst, int p_width,
                                  int p_height, int dst_stride) {
  int64_t sum_error = 0;
  int i, j;
  __m256i row_error, col_error;
  __m256i zero = _mm256_set1_epi16(0);
  __m256i dup_255 = _mm256_set1_epi16(255);
  col_error = zero;

  for (i = 0; i < (p_height / 4); i++) {
    row_error = _mm256_set1_epi16(0);
    for (j = 0; j < (p_width / 16); j++) {
      __m256i ref_1_16 = _mm256_cvtepu8_epi16(_mm_load_si128(
          (__m128i *)(ref + (j * 16) + (((i * 4) + 0) * ref_stride))));
      __m256i dst_1_16 = _mm256_cvtepu8_epi16(_mm_load_si128(
          (__m128i *)(dst + (j * 16) + (((i * 4) + 0) * dst_stride))));
      __m256i ref_2_16 = _mm256_cvtepu8_epi16(_mm_load_si128(
          (__m128i *)(ref + (j * 16) + (((i * 4) + 1) * ref_stride))));
      __m256i dst_2_16 = _mm256_cvtepu8_epi16(_mm_load_si128(
          (__m128i *)(dst + (j * 16) + (((i * 4) + 1) * dst_stride))));
      __m256i ref_3_16 = _mm256_cvtepu8_epi16(_mm_load_si128(
          (__m128i *)(ref + (j * 16) + (((i * 4) + 2) * ref_stride))));
      __m256i dst_3_16 = _mm256_cvtepu8_epi16(_mm_load_si128(
          (__m128i *)(dst + (j * 16) + (((i * 4) + 2) * dst_stride))));
      __m256i ref_4_16 = _mm256_cvtepu8_epi16(_mm_load_si128(
          (__m128i *)(ref + (j * 16) + (((i * 4) + 3) * ref_stride))));
      __m256i dst_4_16 = _mm256_cvtepu8_epi16(_mm_load_si128(
          (__m128i *)(dst + (j * 16) + (((i * 4) + 3) * dst_stride))));

      // diff + 255 maps the signed difference into the LUT index range.
      __m256i diff_1 =
          _mm256_add_epi16(_mm256_sub_epi16(dst_1_16, ref_1_16), dup_255);
      __m256i diff_2 =
          _mm256_add_epi16(_mm256_sub_epi16(dst_2_16, ref_2_16), dup_255);
      __m256i diff_3 =
          _mm256_add_epi16(_mm256_sub_epi16(dst_3_16, ref_3_16), dup_255);
      __m256i diff_4 =
          _mm256_add_epi16(_mm256_sub_epi16(dst_4_16, ref_4_16), dup_255);

      // Widen 16-bit indices to 32-bit for the gather.
      __m256i diff_1_lo = _mm256_unpacklo_epi16(diff_1, zero);
      __m256i diff_1_hi = _mm256_unpackhi_epi16(diff_1, zero);
      __m256i diff_2_lo = _mm256_unpacklo_epi16(diff_2, zero);
      __m256i diff_2_hi = _mm256_unpackhi_epi16(diff_2, zero);
      __m256i diff_3_lo = _mm256_unpacklo_epi16(diff_3, zero);
      __m256i diff_3_hi = _mm256_unpackhi_epi16(diff_3, zero);
      __m256i diff_4_lo = _mm256_unpacklo_epi16(diff_4, zero);
      __m256i diff_4_hi = _mm256_unpackhi_epi16(diff_4, zero);

      __m256i error_1_lo =
          _mm256_i32gather_epi32(error_measure_lut, diff_1_lo, 4);
      __m256i error_1_hi =
          _mm256_i32gather_epi32(error_measure_lut, diff_1_hi, 4);
      __m256i error_2_lo =
          _mm256_i32gather_epi32(error_measure_lut, diff_2_lo, 4);
      __m256i error_2_hi =
          _mm256_i32gather_epi32(error_measure_lut, diff_2_hi, 4);
      __m256i error_3_lo =
          _mm256_i32gather_epi32(error_measure_lut, diff_3_lo, 4);
      __m256i error_3_hi =
          _mm256_i32gather_epi32(error_measure_lut, diff_3_hi, 4);
      __m256i error_4_lo =
          _mm256_i32gather_epi32(error_measure_lut, diff_4_lo, 4);
      __m256i error_4_hi =
          _mm256_i32gather_epi32(error_measure_lut, diff_4_hi, 4);

      __m256i error_1 = _mm256_add_epi32(error_1_lo, error_1_hi);
      __m256i error_2 = _mm256_add_epi32(error_2_lo, error_2_hi);
      __m256i error_3 = _mm256_add_epi32(error_3_lo, error_3_hi);
      __m256i error_4 = _mm256_add_epi32(error_4_lo, error_4_hi);

      __m256i error_1_2 = _mm256_add_epi32(error_1, error_2);
      __m256i error_3_4 = _mm256_add_epi32(error_3, error_4);

      __m256i error_1_2_3_4 = _mm256_add_epi32(error_1_2, error_3_4);
      row_error = _mm256_add_epi32(row_error, error_1_2_3_4);
    }
    // Widen the 32-bit row accumulator to 64-bit before it can overflow.
    __m256i col_error_lo = _mm256_unpacklo_epi32(row_error, zero);
    __m256i col_error_hi = _mm256_unpackhi_epi32(row_error, zero);
    __m256i col_error_temp = _mm256_add_epi64(col_error_lo, col_error_hi);
    col_error = _mm256_add_epi64(col_error, col_error_temp);
    // Error summation for remaining width, which is not multiple of 16
    if (p_width & 0xf) {
      for (int k = 0; k < 4; ++k) {
        for (int l = j * 16; l < p_width; ++l) {
          sum_error +=
              (int64_t)error_measure(dst[l + ((i * 4) + k) * dst_stride] -
                                     ref[l + ((i * 4) + k) * ref_stride]);
        }
      }
    }
  }
  // Horizontal reduction of the four 64-bit partial sums.
  __m128i sum_error_q_0 = _mm256_castsi256_si128(col_error);
  __m128i sum_error_q_1 = _mm256_extracti128_si256(col_error, 1);
  sum_error_q_0 = _mm_add_epi64(sum_error_q_0, sum_error_q_1);
  int64_t sum_error_d_0, sum_error_d_1;
  xx_storel_64(&sum_error_d_0, sum_error_q_0);
  xx_storel_64(&sum_error_d_1, _mm_srli_si128(sum_error_q_0, 8));
  sum_error = (sum_error + sum_error_d_0 + sum_error_d_1);
  // Error summation for remaining height, which is not multiple of 4
  if (p_height & 0x3) {
    for (int k = i * 4; k < p_height; ++k) {
      for (int l = 0; l < p_width; ++l) {
        sum_error += (int64_t)error_measure(dst[l + k * dst_stride] -
                                            ref[l + k * ref_stride]);
      }
    }
  }
  return sum_error;
}

// AVX2 implementation of the affine warp prediction (8-bit path).
// Continues past the end of this view.
void av1_warp_affine_avx2(const int32_t *mat, const uint8_t *ref, int width,
                          int height, int stride, uint8_t *pred, int p_col,
                          int p_row, int p_width, int p_height, int p_stride,
                          int subsampling_x, int subsampling_y,
                          ConvolveParams *conv_params, int16_t alpha,
                          int16_t beta, int16_t gamma, int16_t delta) {
  __m256i horz_out[8];
  int i, j, k;
  const int bd = 8;
const int reduce_bits_horiz = conv_params->round_0; + const int reduce_bits_vert = conv_params->is_compound + ? conv_params->round_1 + : 2 * FILTER_BITS - reduce_bits_horiz; + const int offset_bits_horiz = bd + FILTER_BITS - 1; + assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL)); + + const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz; + const __m256i reduce_bits_vert_const = + _mm256_set1_epi32(((1 << reduce_bits_vert) >> 1)); + const __m256i res_add_const = _mm256_set1_epi32(1 << offset_bits_vert); + const int round_bits = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + assert(IMPLIES(conv_params->do_average, conv_params->is_compound)); + + const __m256i round_const = _mm256_set1_epi16( + (1 << offset_bits_horiz) + ((1 << reduce_bits_horiz) >> 1)); + const __m128i shift = _mm_cvtsi32_si128(reduce_bits_horiz); + + __m256i res_sub_const, round_bits_const, wt; + unpack_weights_and_set_round_const_avx2(conv_params, round_bits, offset_bits, + &res_sub_const, &round_bits_const, + &wt); + + __m256i res_add_const_1; + if (conv_params->is_compound == 1) { + res_add_const_1 = _mm256_add_epi32(reduce_bits_vert_const, res_add_const); + } else { + res_add_const_1 = _mm256_set1_epi32(-(1 << (bd + reduce_bits_vert - 1)) + + ((1 << reduce_bits_vert) >> 1)); + } + const int32_t const1 = alpha * (-4) + beta * (-4) + + (1 << (WARPEDDIFF_PREC_BITS - 1)) + + (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS); + const int32_t const2 = gamma * (-4) + delta * (-4) + + (1 << (WARPEDDIFF_PREC_BITS - 1)) + + (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS); + const int32_t const3 = ((1 << WARP_PARAM_REDUCE_BITS) - 1); + const int16_t const4 = (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)); + const int16_t const5 = (1 << (FILTER_BITS - reduce_bits_horiz)); + + __m256i shuffle_src[4]; + shuffle_src[0] = _mm256_load_si256((__m256i *)shuffle_src0); + shuffle_src[1] = 
_mm256_load_si256((__m256i *)shuffle_src1); + shuffle_src[2] = _mm256_load_si256((__m256i *)shuffle_src2); + shuffle_src[3] = _mm256_load_si256((__m256i *)shuffle_src3); + + for (i = 0; i < p_height; i += 8) { + for (j = 0; j < p_width; j += 8) { + const int32_t src_x = (p_col + j + 4) << subsampling_x; + const int32_t src_y = (p_row + i + 4) << subsampling_y; + const int32_t dst_x = mat[2] * src_x + mat[3] * src_y + mat[0]; + const int32_t dst_y = mat[4] * src_x + mat[5] * src_y + mat[1]; + const int32_t x4 = dst_x >> subsampling_x; + const int32_t y4 = dst_y >> subsampling_y; + + int32_t ix4 = x4 >> WARPEDMODEL_PREC_BITS; + int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); + int32_t iy4 = y4 >> WARPEDMODEL_PREC_BITS; + int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); + + // Add in all the constant terms, including rounding and offset + sx4 += const1; + sy4 += const2; + + sx4 &= ~const3; + sy4 &= ~const3; + + // Horizontal filter + // If the block is aligned such that, after clamping, every sample + // would be taken from the leftmost/rightmost column, then we can + // skip the expensive horizontal filter. 
+ + if (ix4 <= -7) { + int iy, row = 0; + for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) { + iy = iy4 + k; + iy = clamp(iy, 0, height - 1); + const __m256i temp_0 = + _mm256_set1_epi16(const4 + ref[iy * stride] * const5); + iy = iy4 + k + 1; + iy = clamp(iy, 0, height - 1); + const __m256i temp_1 = + _mm256_set1_epi16(const4 + ref[iy * stride] * const5); + horz_out[row] = _mm256_blend_epi32(temp_0, temp_1, 0xf0); + row += 1; + } + iy = iy4 + k; + iy = clamp(iy, 0, height - 1); + horz_out[row] = _mm256_set1_epi16(const4 + ref[iy * stride] * const5); + } else if (ix4 >= width + 6) { + int iy, row = 0; + for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) { + iy = iy4 + k; + iy = clamp(iy, 0, height - 1); + const __m256i temp_0 = _mm256_set1_epi16( + const4 + ref[iy * stride + (width - 1)] * const5); + iy = iy4 + k + 1; + iy = clamp(iy, 0, height - 1); + const __m256i temp_1 = _mm256_set1_epi16( + const4 + ref[iy * stride + (width - 1)] * const5); + horz_out[row] = _mm256_blend_epi32(temp_0, temp_1, 0xf0); + row += 1; + } + iy = iy4 + k; + iy = clamp(iy, 0, height - 1); + horz_out[row] = + _mm256_set1_epi16(const4 + ref[iy * stride + (width - 1)] * const5); + } else if (((ix4 - 7) < 0) || ((ix4 + 9) > width)) { + const int out_of_boundary_left = -(ix4 - 6); + const int out_of_boundary_right = (ix4 + 8) - width; + int iy, sx, row = 0; + for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) { + iy = iy4 + k; + iy = clamp(iy, 0, height - 1); + __m128i src0 = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); + iy = iy4 + k + 1; + iy = clamp(iy, 0, height - 1); + __m128i src1 = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); + + if (out_of_boundary_left >= 0) { + const __m128i shuffle_reg_left = + _mm_loadu_si128((__m128i *)warp_pad_left[out_of_boundary_left]); + src0 = _mm_shuffle_epi8(src0, shuffle_reg_left); + src1 = _mm_shuffle_epi8(src1, shuffle_reg_left); + } + if (out_of_boundary_right >= 0) { + const __m128i 
shuffle_reg_right = _mm_loadu_si128( + (__m128i *)warp_pad_right[out_of_boundary_right]); + src0 = _mm_shuffle_epi8(src0, shuffle_reg_right); + src1 = _mm_shuffle_epi8(src1, shuffle_reg_right); + } + sx = sx4 + beta * (k + 4); + const __m256i src_01 = + _mm256_inserti128_si256(_mm256_castsi128_si256(src0), src1, 0x1); + horizontal_filter_avx2(src_01, horz_out, sx, alpha, beta, row, + shuffle_src, &round_const, &shift); + row += 1; + } + iy = iy4 + k; + iy = clamp(iy, 0, height - 1); + __m128i src = _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); + if (out_of_boundary_left >= 0) { + const __m128i shuffle_reg_left = + _mm_loadu_si128((__m128i *)warp_pad_left[out_of_boundary_left]); + src = _mm_shuffle_epi8(src, shuffle_reg_left); + } + if (out_of_boundary_right >= 0) { + const __m128i shuffle_reg_right = + _mm_loadu_si128((__m128i *)warp_pad_right[out_of_boundary_right]); + src = _mm_shuffle_epi8(src, shuffle_reg_right); + } + sx = sx4 + beta * (k + 4); + const __m256i src_01 = _mm256_castsi128_si256(src); + __m256i coeff[4]; + prepare_horizontal_filter_coeff(alpha, sx, coeff); + filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, + &round_const, &shift, row); + } else { + prepare_warp_horizontal_filter_avx2( + ref, horz_out, stride, ix4, iy4, sx4, alpha, beta, p_height, height, + i, &round_const, &shift, shuffle_src); + } + + // Vertical filter + prepare_warp_vertical_filter_avx2( + pred, horz_out, conv_params, gamma, delta, p_height, p_stride, + p_width, i, j, sy4, reduce_bits_vert, &res_add_const_1, round_bits, + &res_sub_const, &round_bits_const, &wt); + } + } +} diff --git a/media/libaom/src/av1/common/x86/warp_plane_sse2.c b/media/libaom/src/av1/common/x86/warp_plane_sse2.c new file mode 100644 index 000000000..6ff666518 --- /dev/null +++ b/media/libaom/src/av1/common/x86/warp_plane_sse2.c @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <emmintrin.h> + +#include "aom_dsp/x86/synonyms.h" +#include "av1/common/warped_motion.h" +#include "config/av1_rtcd.h" + +int64_t av1_calc_frame_error_sse2(const uint8_t *const ref, int ref_stride, + const uint8_t *const dst, int p_width, + int p_height, int dst_stride) { + int64_t sum_error = 0; + int i, j; + __m128i row_error, col_error; + __m128i zero = _mm_set1_epi16(0); + __m128i dup_255 = _mm_set1_epi16(255); + col_error = zero; + for (i = 0; i < (p_height); i++) { + row_error = zero; + for (j = 0; j < (p_width / 16); j++) { + __m128i ref_8 = + _mm_load_si128((__m128i *)(ref + (j * 16) + (i * ref_stride))); + __m128i dst_8 = + _mm_load_si128((__m128i *)(dst + (j * 16) + (i * dst_stride))); + __m128i ref_16_lo = _mm_unpacklo_epi8(ref_8, zero); + __m128i ref_16_hi = _mm_unpackhi_epi8(ref_8, zero); + __m128i dst_16_lo = _mm_unpacklo_epi8(dst_8, zero); + __m128i dst_16_hi = _mm_unpackhi_epi8(dst_8, zero); + + __m128i diff_1 = + _mm_add_epi16(_mm_sub_epi16(dst_16_lo, ref_16_lo), dup_255); + __m128i diff_2 = + _mm_add_epi16(_mm_sub_epi16(dst_16_hi, ref_16_hi), dup_255); + + __m128i error_1_lo = + _mm_set_epi32(error_measure_lut[_mm_extract_epi16(diff_1, 3)], + error_measure_lut[_mm_extract_epi16(diff_1, 2)], + error_measure_lut[_mm_extract_epi16(diff_1, 1)], + error_measure_lut[_mm_extract_epi16(diff_1, 0)]); + __m128i error_1_hi = + _mm_set_epi32(error_measure_lut[_mm_extract_epi16(diff_1, 7)], + error_measure_lut[_mm_extract_epi16(diff_1, 6)], + 
error_measure_lut[_mm_extract_epi16(diff_1, 5)], + error_measure_lut[_mm_extract_epi16(diff_1, 4)]); + __m128i error_2_lo = + _mm_set_epi32(error_measure_lut[_mm_extract_epi16(diff_2, 3)], + error_measure_lut[_mm_extract_epi16(diff_2, 2)], + error_measure_lut[_mm_extract_epi16(diff_2, 1)], + error_measure_lut[_mm_extract_epi16(diff_2, 0)]); + __m128i error_2_hi = + _mm_set_epi32(error_measure_lut[_mm_extract_epi16(diff_2, 7)], + error_measure_lut[_mm_extract_epi16(diff_2, 6)], + error_measure_lut[_mm_extract_epi16(diff_2, 5)], + error_measure_lut[_mm_extract_epi16(diff_2, 4)]); + + __m128i error_1 = _mm_add_epi32(error_1_lo, error_1_hi); + __m128i error_2 = _mm_add_epi32(error_2_lo, error_2_hi); + __m128i error_1_2 = _mm_add_epi32(error_1, error_2); + + row_error = _mm_add_epi32(row_error, error_1_2); + } + __m128i col_error_lo = _mm_unpacklo_epi32(row_error, zero); + __m128i col_error_hi = _mm_unpackhi_epi32(row_error, zero); + __m128i col_error_temp = _mm_add_epi64(col_error_lo, col_error_hi); + col_error = _mm_add_epi64(col_error, col_error_temp); + // Error summation for remaining width, which is not multiple of 16 + if (p_width & 0xf) { + for (int l = j * 16; l < p_width; ++l) { + sum_error += (int64_t)error_measure(dst[l + i * dst_stride] - + ref[l + i * ref_stride]); + } + } + } + int64_t sum_error_d_0, sum_error_d_1; + xx_storel_64(&sum_error_d_0, col_error); + xx_storel_64(&sum_error_d_1, _mm_srli_si128(col_error, 8)); + sum_error = (sum_error + sum_error_d_0 + sum_error_d_1); + return sum_error; +} diff --git a/media/libaom/src/av1/common/x86/warp_plane_sse4.c b/media/libaom/src/av1/common/x86/warp_plane_sse4.c index b810cea2e..10ddf92d0 100644 --- a/media/libaom/src/av1/common/x86/warp_plane_sse4.c +++ b/media/libaom/src/av1/common/x86/warp_plane_sse4.c @@ -16,7 +16,7 @@ #include "av1/common/warped_motion.h" -/* This is a modified version of 'warped_filter' from warped_motion.c: +/* This is a modified version of 'av1_warped_filter' from warped_motion.c: 
* Each coefficient is stored in 8 bits instead of 16 bits * The coefficients are rearranged in the column order 0, 2, 4, 6, 1, 3, 5, 7 @@ -31,8 +31,8 @@ coefficients into the correct order more quickly. */ /* clang-format off */ -DECLARE_ALIGNED(8, static const int8_t, - filter_8bit[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8]) = { +DECLARE_ALIGNED(8, const int8_t, + av1_filter_8bit[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8]) = { #if WARPEDPIXEL_PREC_BITS == 6 // [-1, 0) { 0, 127, 0, 0, 0, 1, 0, 0}, { 0, 127, 0, 0, -1, 2, 0, 0}, @@ -198,40 +198,53 @@ DECLARE_ALIGNED(8, static const int8_t, // in an SSE register into two sequences: // 0, 2, 2, 4, ..., 12, 12, 14, <don't care> // 1, 3, 3, 5, ..., 13, 13, 15, <don't care> -static const uint8_t even_mask[16] = { 0, 2, 2, 4, 4, 6, 6, 8, - 8, 10, 10, 12, 12, 14, 14, 0 }; -static const uint8_t odd_mask[16] = { 1, 3, 3, 5, 5, 7, 7, 9, - 9, 11, 11, 13, 13, 15, 15, 0 }; - -static const uint8_t shuffle_alpha0_mask01[16] = { 0, 1, 0, 1, 0, 1, 0, 1, - 0, 1, 0, 1, 0, 1, 0, 1 }; - -static const uint8_t shuffle_alpha0_mask23[16] = { 2, 3, 2, 3, 2, 3, 2, 3, - 2, 3, 2, 3, 2, 3, 2, 3 }; - -static const uint8_t shuffle_alpha0_mask45[16] = { 4, 5, 4, 5, 4, 5, 4, 5, - 4, 5, 4, 5, 4, 5, 4, 5 }; - -static const uint8_t shuffle_alpha0_mask67[16] = { 6, 7, 6, 7, 6, 7, 6, 7, - 6, 7, 6, 7, 6, 7, 6, 7 }; - -static const uint8_t shuffle_gamma0_mask0[16] = { 0, 1, 2, 3, 0, 1, 2, 3, - 0, 1, 2, 3, 0, 1, 2, 3 }; -static const uint8_t shuffle_gamma0_mask1[16] = { 4, 5, 6, 7, 4, 5, 6, 7, - 4, 5, 6, 7, 4, 5, 6, 7 }; -static const uint8_t shuffle_gamma0_mask2[16] = { 8, 9, 10, 11, 8, 9, 10, 11, - 8, 9, 10, 11, 8, 9, 10, 11 }; -static const uint8_t shuffle_gamma0_mask3[16] = { - 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15 -}; +DECLARE_ALIGNED(16, static const uint8_t, + even_mask[16]) = { 0, 2, 2, 4, 4, 6, 6, 8, + 8, 10, 10, 12, 12, 14, 14, 0 }; + +DECLARE_ALIGNED(16, static const uint8_t, + odd_mask[16]) = { 1, 3, 3, 5, 5, 7, 7, 9, + 9, 11, 11, 13, 
13, 15, 15, 0 }; + +DECLARE_ALIGNED(16, static const uint8_t, + shuffle_alpha0_mask01[16]) = { 0, 1, 0, 1, 0, 1, 0, 1, + 0, 1, 0, 1, 0, 1, 0, 1 }; + +DECLARE_ALIGNED(16, static const uint8_t, + shuffle_alpha0_mask23[16]) = { 2, 3, 2, 3, 2, 3, 2, 3, + 2, 3, 2, 3, 2, 3, 2, 3 }; + +DECLARE_ALIGNED(16, static const uint8_t, + shuffle_alpha0_mask45[16]) = { 4, 5, 4, 5, 4, 5, 4, 5, + 4, 5, 4, 5, 4, 5, 4, 5 }; + +DECLARE_ALIGNED(16, static const uint8_t, + shuffle_alpha0_mask67[16]) = { 6, 7, 6, 7, 6, 7, 6, 7, + 6, 7, 6, 7, 6, 7, 6, 7 }; + +DECLARE_ALIGNED(16, static const uint8_t, + shuffle_gamma0_mask0[16]) = { 0, 1, 2, 3, 0, 1, 2, 3, + 0, 1, 2, 3, 0, 1, 2, 3 }; + +DECLARE_ALIGNED(16, static const uint8_t, + shuffle_gamma0_mask1[16]) = { 4, 5, 6, 7, 4, 5, 6, 7, + 4, 5, 6, 7, 4, 5, 6, 7 }; + +DECLARE_ALIGNED(16, static const uint8_t, + shuffle_gamma0_mask2[16]) = { 8, 9, 10, 11, 8, 9, 10, 11, + 8, 9, 10, 11, 8, 9, 10, 11 }; + +DECLARE_ALIGNED(16, static const uint8_t, + shuffle_gamma0_mask3[16]) = { 12, 13, 14, 15, 12, 13, 14, 15, + 12, 13, 14, 15, 12, 13, 14, 15 }; static INLINE void filter_src_pixels(__m128i src, __m128i *tmp, __m128i *coeff, const int offset_bits_horiz, const int reduce_bits_horiz, int k) { const __m128i src_even = - _mm_shuffle_epi8(src, _mm_loadu_si128((__m128i *)even_mask)); + _mm_shuffle_epi8(src, _mm_load_si128((__m128i *)even_mask)); const __m128i src_odd = - _mm_shuffle_epi8(src, _mm_loadu_si128((__m128i *)odd_mask)); + _mm_shuffle_epi8(src, _mm_load_si128((__m128i *)odd_mask)); // The pixel order we need for 'src' is: // 0 2 2 4 4 6 6 8 1 3 3 5 5 7 7 9 const __m128i src_02 = _mm_unpacklo_epi64(src_even, src_odd); @@ -271,21 +284,21 @@ static INLINE void prepare_horizontal_filter_coeff(int alpha, int sx, __m128i *coeff) { // Filter even-index pixels const __m128i tmp_0 = _mm_loadl_epi64( - (__m128i *)&filter_8bit[(sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS]); + (__m128i *)&av1_filter_8bit[(sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS]); const __m128i 
tmp_1 = _mm_loadl_epi64( - (__m128i *)&filter_8bit[(sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS]); + (__m128i *)&av1_filter_8bit[(sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS]); const __m128i tmp_2 = _mm_loadl_epi64( - (__m128i *)&filter_8bit[(sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS]); + (__m128i *)&av1_filter_8bit[(sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS]); const __m128i tmp_3 = _mm_loadl_epi64( - (__m128i *)&filter_8bit[(sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS]); + (__m128i *)&av1_filter_8bit[(sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS]); const __m128i tmp_4 = _mm_loadl_epi64( - (__m128i *)&filter_8bit[(sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS]); + (__m128i *)&av1_filter_8bit[(sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS]); const __m128i tmp_5 = _mm_loadl_epi64( - (__m128i *)&filter_8bit[(sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS]); + (__m128i *)&av1_filter_8bit[(sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS]); const __m128i tmp_6 = _mm_loadl_epi64( - (__m128i *)&filter_8bit[(sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS]); + (__m128i *)&av1_filter_8bit[(sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS]); const __m128i tmp_7 = _mm_loadl_epi64( - (__m128i *)&filter_8bit[(sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS]); + (__m128i *)&av1_filter_8bit[(sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS]); // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 0 2 const __m128i tmp_8 = _mm_unpacklo_epi16(tmp_0, tmp_2); @@ -319,20 +332,20 @@ static INLINE void prepare_horizontal_filter_coeff_alpha0(int sx, __m128i *coeff) { // Filter even-index pixels const __m128i tmp_0 = - _mm_loadl_epi64((__m128i *)&filter_8bit[sx >> WARPEDDIFF_PREC_BITS]); + _mm_loadl_epi64((__m128i *)&av1_filter_8bit[sx >> WARPEDDIFF_PREC_BITS]); // Coeffs 0 2 for pixels 0 2 4 6 1 3 5 7 - coeff[0] = _mm_shuffle_epi8( - tmp_0, _mm_loadu_si128((__m128i *)shuffle_alpha0_mask01)); + coeff[0] = + _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_alpha0_mask01)); // Coeffs 4 6 for pixels 0 2 4 6 1 3 5 7 - coeff[1] = _mm_shuffle_epi8( - tmp_0, 
_mm_loadu_si128((__m128i *)shuffle_alpha0_mask23)); + coeff[1] = + _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_alpha0_mask23)); // Coeffs 1 3 for pixels 0 2 4 6 1 3 5 7 - coeff[2] = _mm_shuffle_epi8( - tmp_0, _mm_loadu_si128((__m128i *)shuffle_alpha0_mask45)); + coeff[2] = + _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_alpha0_mask45)); // Coeffs 5 7 for pixels 0 2 4 6 1 3 5 7 - coeff[3] = _mm_shuffle_epi8( - tmp_0, _mm_loadu_si128((__m128i *)shuffle_alpha0_mask67)); + coeff[3] = + _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_alpha0_mask67)); } static INLINE void horizontal_filter(__m128i src, __m128i *tmp, int sx, @@ -449,21 +462,25 @@ static INLINE void unpack_weights_and_set_round_const( const int w0 = conv_params->fwd_offset; const int w1 = conv_params->bck_offset; - const __m128i wt0 = _mm_set1_epi16(w0); - const __m128i wt1 = _mm_set1_epi16(w1); + const __m128i wt0 = _mm_set1_epi16((int16_t)w0); + const __m128i wt1 = _mm_set1_epi16((int16_t)w1); *wt = _mm_unpacklo_epi16(wt0, wt1); } static INLINE void prepare_vertical_filter_coeffs(int gamma, int sy, __m128i *coeffs) { - const __m128i tmp_0 = _mm_loadu_si128( - (__m128i *)(warped_filter + ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS))); - const __m128i tmp_2 = _mm_loadu_si128( - (__m128i *)(warped_filter + ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS))); - const __m128i tmp_4 = _mm_loadu_si128( - (__m128i *)(warped_filter + ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS))); - const __m128i tmp_6 = _mm_loadu_si128( - (__m128i *)(warped_filter + ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_0 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_2 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_4 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_6 = + 
_mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS))); const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2); const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6); @@ -476,14 +493,18 @@ static INLINE void prepare_vertical_filter_coeffs(int gamma, int sy, coeffs[2] = _mm_unpacklo_epi64(tmp_12, tmp_14); coeffs[3] = _mm_unpackhi_epi64(tmp_12, tmp_14); - const __m128i tmp_1 = _mm_loadu_si128( - (__m128i *)(warped_filter + ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS))); - const __m128i tmp_3 = _mm_loadu_si128( - (__m128i *)(warped_filter + ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS))); - const __m128i tmp_5 = _mm_loadu_si128( - (__m128i *)(warped_filter + ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS))); - const __m128i tmp_7 = _mm_loadu_si128( - (__m128i *)(warped_filter + ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_1 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_3 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_5 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_7 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS))); const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3); const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7); @@ -500,17 +521,17 @@ static INLINE void prepare_vertical_filter_coeffs(int gamma, int sy, static INLINE void prepare_vertical_filter_coeffs_gamma0(int sy, __m128i *coeffs) { const __m128i tmp_0 = _mm_loadu_si128( - (__m128i *)(warped_filter + (sy >> WARPEDDIFF_PREC_BITS))); + (__m128i *)(av1_warped_filter + (sy >> WARPEDDIFF_PREC_BITS))); // even coeffs coeffs[0] = - _mm_shuffle_epi8(tmp_0, _mm_loadu_si128((__m128i *)shuffle_gamma0_mask0)); + _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_gamma0_mask0)); coeffs[1] = 
- _mm_shuffle_epi8(tmp_0, _mm_loadu_si128((__m128i *)shuffle_gamma0_mask1)); + _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_gamma0_mask1)); coeffs[2] = - _mm_shuffle_epi8(tmp_0, _mm_loadu_si128((__m128i *)shuffle_gamma0_mask2)); + _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_gamma0_mask2)); coeffs[3] = - _mm_shuffle_epi8(tmp_0, _mm_loadu_si128((__m128i *)shuffle_gamma0_mask3)); + _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_gamma0_mask3)); // odd coeffs coeffs[4] = coeffs[0]; @@ -577,7 +598,7 @@ static INLINE void store_vertical_filter_output( __m128i *const dst8 = (__m128i *)&pred[(i + k + 4) * p_stride + j]; const __m128i p_16 = _mm_loadl_epi64(p); - if (conv_params->use_jnt_comp_avg) { + if (conv_params->use_dist_wtd_comp_avg) { const __m128i p_16_lo = _mm_unpacklo_epi16(p_16, temp_lo_16); const __m128i wt_res_lo = _mm_madd_epi16(p_16_lo, *wt); const __m128i shifted_32 = @@ -610,7 +631,7 @@ static INLINE void store_vertical_filter_output( (__m128i *)&pred[(i + k + 4) * p_stride + j + 4]; const __m128i p4_16 = _mm_loadl_epi64(p4); - if (conv_params->use_jnt_comp_avg) { + if (conv_params->use_dist_wtd_comp_avg) { const __m128i p_16_hi = _mm_unpacklo_epi16(p4_16, temp_hi_16); const __m128i wt_res_hi = _mm_madd_epi16(p_16_hi, *wt); const __m128i shifted_32 = diff --git a/media/libaom/src/av1/common/x86/wiener_convolve_avx2.c b/media/libaom/src/av1/common/x86/wiener_convolve_avx2.c index 87a6e1239..b7ac68383 100644 --- a/media/libaom/src/av1/common/x86/wiener_convolve_avx2.c +++ b/media/libaom/src/av1/common/x86/wiener_convolve_avx2.c @@ -17,6 +17,7 @@ #include "av1/common/convolve.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/aom_filter.h" +#include "aom_dsp/x86/convolve_avx2.h" #include "aom_dsp/x86/synonyms.h" #include "aom_dsp/x86/synonyms_avx2.h" @@ -25,6 +26,20 @@ // on the left. // A row of, say, 8-bit pixels with values p0, p1, p2, ..., p30, p31 will be // loaded and stored as [ p31 ... p17 p16 ][ p15 ... 
p1 p0 ]. + +// Exploiting the range of wiener filter coefficients, +// horizontal filtering can be done in 16 bit intermediate precision. +// The details are as follows : +// Consider the horizontal wiener filter coefficients of the following form : +// [C0, C1, C2, 2^(FILTER_BITS) -2 * (C0 + C1 + C2), C2, C1, C0] +// Subtracting 2^(FILTER_BITS) from the centre tap we get the following : +// [C0, C1, C2, -2 * (C0 + C1 + C2), C2, C1, C0] +// The sum of the product "C0 * p0 + C1 * p1 + C2 * p2 -2 * (C0 + C1 + C2) * p3 +// + C2 * p4 + C1 * p5 + C0 * p6" would be in the range of signed 16 bit +// precision. Finally, after rounding the above result by round_0, we multiply +// the centre pixel by 2^(FILTER_BITS - round_0) and add it to get the +// horizontal filter output. + void av1_wiener_convolve_add_src_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, @@ -37,224 +52,190 @@ void av1_wiener_convolve_add_src_avx2(const uint8_t *src, ptrdiff_t src_stride, (void)x_step_q4; (void)y_step_q4; - DECLARE_ALIGNED(32, uint16_t, - temp[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]); - int intermediate_height = h + SUBPEL_TAPS - 2; - memset(temp + (intermediate_height * MAX_SB_SIZE), 0, MAX_SB_SIZE); - const int center_tap = ((SUBPEL_TAPS - 1) / 2); + DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + SUBPEL_TAPS) * 8]); + int im_h = h + SUBPEL_TAPS - 2; + int im_stride = 8; + memset(im_block + (im_h * im_stride), 0, MAX_SB_SIZE); + int i, j; + const int center_tap = (SUBPEL_TAPS - 1) / 2; const uint8_t *const src_ptr = src - center_tap * src_stride - center_tap; - const __m128i zero_128 = _mm_setzero_si128(); - const __m256i zero_256 = _mm256_setzero_si256(); - - // Add an offset to account for the "add_src" part of the convolve function. 
- const __m128i offset = _mm_insert_epi16(zero_128, 1 << FILTER_BITS, 3); - - const __m256i clamp_low = zero_256; + __m256i filt[4], coeffs_h[4], coeffs_v[4], filt_center; + + assert(conv_params->round_0 > 0); + + filt[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2); + filt[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2); + filt[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2); + filt[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2); + + filt_center = _mm256_load_si256((__m256i const *)filt_center_global_avx2); + + const __m128i coeffs_x = _mm_loadu_si128((__m128i *)filter_x); + const __m256i filter_coeffs_x = _mm256_broadcastsi128_si256(coeffs_x); + + // coeffs 0 1 0 1 0 1 0 1 + coeffs_h[0] = + _mm256_shuffle_epi8(filter_coeffs_x, _mm256_set1_epi16(0x0200u)); + // coeffs 2 3 2 3 2 3 2 3 + coeffs_h[1] = + _mm256_shuffle_epi8(filter_coeffs_x, _mm256_set1_epi16(0x0604u)); + // coeffs 4 5 4 5 4 5 4 5 + coeffs_h[2] = + _mm256_shuffle_epi8(filter_coeffs_x, _mm256_set1_epi16(0x0a08u)); + // coeffs 6 7 6 7 6 7 6 7 + coeffs_h[3] = + _mm256_shuffle_epi8(filter_coeffs_x, _mm256_set1_epi16(0x0e0cu)); + + const __m256i round_const_h = + _mm256_set1_epi16((1 << (conv_params->round_0 - 1))); + const __m256i round_const_horz = + _mm256_set1_epi16((1 << (bd + FILTER_BITS - conv_params->round_0 - 1))); + const __m256i clamp_low = _mm256_setzero_si256(); const __m256i clamp_high = _mm256_set1_epi16(WIENER_CLAMP_LIMIT(conv_params->round_0, bd) - 1); + const __m128i round_shift_h = _mm_cvtsi32_si128(conv_params->round_0); - /* Horizontal filter */ - { - // coeffs [ f7 f6 f5 f4 f3 f2 f1 f0 ] - const __m128i coeffs_x = _mm_add_epi16(xx_loadu_128(filter_x), offset); - - // coeffs [ f3 f2 f3 f2 f1 f0 f1 f0 ] - const __m128i coeffs_0123 = _mm_unpacklo_epi32(coeffs_x, coeffs_x); - // coeffs [ f7 f6 f7 f6 f5 f4 f5 f4 ] - const __m128i coeffs_4567 = _mm_unpackhi_epi32(coeffs_x, coeffs_x); - - // coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ] - const __m128i 
coeffs_01_128 = _mm_unpacklo_epi64(coeffs_0123, coeffs_0123); - // coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ] - const __m128i coeffs_23_128 = _mm_unpackhi_epi64(coeffs_0123, coeffs_0123); - // coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ] - const __m128i coeffs_45_128 = _mm_unpacklo_epi64(coeffs_4567, coeffs_4567); - // coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ] - const __m128i coeffs_67_128 = _mm_unpackhi_epi64(coeffs_4567, coeffs_4567); - - // coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ][ f1 f0 f1 f0 f1 f0 f1 f0 ] - const __m256i coeffs_01 = yy_set_m128i(coeffs_01_128, coeffs_01_128); - // coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ][ f3 f2 f3 f2 f3 f2 f3 f2 ] - const __m256i coeffs_23 = yy_set_m128i(coeffs_23_128, coeffs_23_128); - // coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ][ f5 f4 f5 f4 f5 f4 f5 f4 ] - const __m256i coeffs_45 = yy_set_m128i(coeffs_45_128, coeffs_45_128); - // coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ][ f7 f6 f7 f6 f7 f6 f7 f6 ] - const __m256i coeffs_67 = yy_set_m128i(coeffs_67_128, coeffs_67_128); - - const __m256i round_const = _mm256_set1_epi32( - (1 << (conv_params->round_0 - 1)) + (1 << (bd + FILTER_BITS - 1))); - - for (int i = 0; i < intermediate_height; ++i) { - for (int j = 0; j < w; j += 16) { - const uint8_t *data_ij = src_ptr + i * src_stride + j; - - // Load 8-bit src data - const __m128i data_0 = xx_loadu_128(data_ij + 0); - const __m128i data_1 = xx_loadu_128(data_ij + 1); - const __m128i data_2 = xx_loadu_128(data_ij + 2); - const __m128i data_3 = xx_loadu_128(data_ij + 3); - const __m128i data_4 = xx_loadu_128(data_ij + 4); - const __m128i data_5 = xx_loadu_128(data_ij + 5); - const __m128i data_6 = xx_loadu_128(data_ij + 6); - const __m128i data_7 = xx_loadu_128(data_ij + 7); - - // (Zero-)Extend 8-bit data to 16-bit data - const __m256i src_0 = _mm256_cvtepu8_epi16(data_0); - const __m256i src_1 = _mm256_cvtepu8_epi16(data_1); - const __m256i src_2 = _mm256_cvtepu8_epi16(data_2); - const __m256i src_3 = _mm256_cvtepu8_epi16(data_3); - const __m256i src_4 = _mm256_cvtepu8_epi16(data_4); 
- const __m256i src_5 = _mm256_cvtepu8_epi16(data_5); - const __m256i src_6 = _mm256_cvtepu8_epi16(data_6); - const __m256i src_7 = _mm256_cvtepu8_epi16(data_7); - - // Multiply src data by filter coeffs and sum pairs - const __m256i res_0 = _mm256_madd_epi16(src_0, coeffs_01); - const __m256i res_1 = _mm256_madd_epi16(src_1, coeffs_01); - const __m256i res_2 = _mm256_madd_epi16(src_2, coeffs_23); - const __m256i res_3 = _mm256_madd_epi16(src_3, coeffs_23); - const __m256i res_4 = _mm256_madd_epi16(src_4, coeffs_45); - const __m256i res_5 = _mm256_madd_epi16(src_5, coeffs_45); - const __m256i res_6 = _mm256_madd_epi16(src_6, coeffs_67); - const __m256i res_7 = _mm256_madd_epi16(src_7, coeffs_67); - - // Calculate scalar product for even- and odd-indices separately, - // increasing to 32-bit precision - const __m256i res_even_sum = _mm256_add_epi32( - _mm256_add_epi32(res_0, res_4), _mm256_add_epi32(res_2, res_6)); - const __m256i res_odd_sum = _mm256_add_epi32( - _mm256_add_epi32(res_1, res_5), _mm256_add_epi32(res_3, res_7)); - - const __m256i res_even = _mm256_srai_epi32( - _mm256_add_epi32(res_even_sum, round_const), conv_params->round_0); - const __m256i res_odd = _mm256_srai_epi32( - _mm256_add_epi32(res_odd_sum, round_const), conv_params->round_0); - - // Reduce to 16-bit precision and pack even- and odd-index results - // back into one register. The _mm256_packs_epi32 intrinsic returns - // a register with the pixels ordered as follows: - // [ 15 13 11 9 14 12 10 8 ] [ 7 5 3 1 6 4 2 0 ] - const __m256i res = _mm256_packs_epi32(res_even, res_odd); - const __m256i res_clamped = - _mm256_min_epi16(_mm256_max_epi16(res, clamp_low), clamp_high); - - // Store in a temporary array - yy_storeu_256(temp + i * MAX_SB_SIZE + j, res_clamped); - } + // Add an offset to account for the "add_src" part of the convolve function. 
+ const __m128i zero_128 = _mm_setzero_si128(); + const __m128i offset_0 = _mm_insert_epi16(zero_128, 1 << FILTER_BITS, 3); + const __m128i coeffs_y = _mm_add_epi16(xx_loadu_128(filter_y), offset_0); + + const __m256i filter_coeffs_y = _mm256_broadcastsi128_si256(coeffs_y); + + // coeffs 0 1 0 1 0 1 0 1 + coeffs_v[0] = _mm256_shuffle_epi32(filter_coeffs_y, 0x00); + // coeffs 2 3 2 3 2 3 2 3 + coeffs_v[1] = _mm256_shuffle_epi32(filter_coeffs_y, 0x55); + // coeffs 4 5 4 5 4 5 4 5 + coeffs_v[2] = _mm256_shuffle_epi32(filter_coeffs_y, 0xaa); + // coeffs 6 7 6 7 6 7 6 7 + coeffs_v[3] = _mm256_shuffle_epi32(filter_coeffs_y, 0xff); + + const __m256i round_const_v = + _mm256_set1_epi32((1 << (conv_params->round_1 - 1)) - + (1 << (bd + conv_params->round_1 - 1))); + const __m128i round_shift_v = _mm_cvtsi32_si128(conv_params->round_1); + + for (j = 0; j < w; j += 8) { + for (i = 0; i < im_h; i += 2) { + __m256i data = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j])); + + // Load the next line + if (i + 1 < im_h) + data = _mm256_inserti128_si256( + data, + _mm_loadu_si128( + (__m128i *)&src_ptr[(i * src_stride) + j + src_stride]), + 1); + + __m256i res = convolve_lowbd_x(data, coeffs_h, filt); + + res = + _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h); + + __m256i data_0 = _mm256_shuffle_epi8(data, filt_center); + + // multiply the center pixel by 2^(FILTER_BITS - round_0) and add it to + // the result + data_0 = _mm256_slli_epi16(data_0, FILTER_BITS - conv_params->round_0); + res = _mm256_add_epi16(res, data_0); + res = _mm256_add_epi16(res, round_const_horz); + const __m256i res_clamped = + _mm256_min_epi16(_mm256_max_epi16(res, clamp_low), clamp_high); + _mm256_store_si256((__m256i *)&im_block[i * im_stride], res_clamped); } - } - /* Vertical filter */ - { - // coeffs [ g7 g6 g5 g4 g3 g2 g1 g0 ] - const __m128i coeffs_y = _mm_add_epi16(xx_loadu_128(filter_y), offset); - - // coeffs [ g3 g2 g3 g2 g1 g0 g1 g0 ] - 
const __m128i coeffs_0123 = _mm_unpacklo_epi32(coeffs_y, coeffs_y); - // coeffs [ g7 g6 g7 g6 g5 g4 g5 g4 ] - const __m128i coeffs_4567 = _mm_unpackhi_epi32(coeffs_y, coeffs_y); - - // coeffs [ g1 g0 g1 g0 g1 g0 g1 g0 ] - const __m128i coeffs_01_128 = _mm_unpacklo_epi64(coeffs_0123, coeffs_0123); - // coeffs [ g3 g2 g3 g2 g3 g2 g3 g2 ] - const __m128i coeffs_23_128 = _mm_unpackhi_epi64(coeffs_0123, coeffs_0123); - // coeffs [ g5 g4 g5 g4 g5 g4 g5 g4 ] - const __m128i coeffs_45_128 = _mm_unpacklo_epi64(coeffs_4567, coeffs_4567); - // coeffs [ g7 g6 g7 g6 g7 g6 g7 g6 ] - const __m128i coeffs_67_128 = _mm_unpackhi_epi64(coeffs_4567, coeffs_4567); - - // coeffs [ g1 g0 g1 g0 g1 g0 g1 g0 ][ g1 g0 g1 g0 g1 g0 g1 g0 ] - const __m256i coeffs_01 = yy_set_m128i(coeffs_01_128, coeffs_01_128); - // coeffs [ g3 g2 g3 g2 g3 g2 g3 g2 ][ g3 g2 g3 g2 g3 g2 g3 g2 ] - const __m256i coeffs_23 = yy_set_m128i(coeffs_23_128, coeffs_23_128); - // coeffs [ g5 g4 g5 g4 g5 g4 g5 g4 ][ g5 g4 g5 g4 g5 g4 g5 g4 ] - const __m256i coeffs_45 = yy_set_m128i(coeffs_45_128, coeffs_45_128); - // coeffs [ g7 g6 g7 g6 g7 g6 g7 g6 ][ g7 g6 g7 g6 g7 g6 g7 g6 ] - const __m256i coeffs_67 = yy_set_m128i(coeffs_67_128, coeffs_67_128); - - const __m256i round_const = - _mm256_set1_epi32((1 << (conv_params->round_1 - 1)) - - (1 << (bd + conv_params->round_1 - 1))); - - for (int i = 0; i < h; ++i) { - for (int j = 0; j < w; j += 16) { - const uint16_t *data_ij = temp + i * MAX_SB_SIZE + j; - - // Load 16-bit data from the output of the horizontal filter in - // which the pixels are ordered as follows: - // [ 15 13 11 9 14 12 10 8 ] [ 7 5 3 1 6 4 2 0 ] - const __m256i data_0 = yy_loadu_256(data_ij + 0 * MAX_SB_SIZE); - const __m256i data_1 = yy_loadu_256(data_ij + 1 * MAX_SB_SIZE); - const __m256i data_2 = yy_loadu_256(data_ij + 2 * MAX_SB_SIZE); - const __m256i data_3 = yy_loadu_256(data_ij + 3 * MAX_SB_SIZE); - const __m256i data_4 = yy_loadu_256(data_ij + 4 * MAX_SB_SIZE); - const __m256i data_5 = 
yy_loadu_256(data_ij + 5 * MAX_SB_SIZE); - const __m256i data_6 = yy_loadu_256(data_ij + 6 * MAX_SB_SIZE); - const __m256i data_7 = yy_loadu_256(data_ij + 7 * MAX_SB_SIZE); - - // Filter the even-indices, increasing to 32-bit precision - const __m256i src_0 = _mm256_unpacklo_epi16(data_0, data_1); - const __m256i src_2 = _mm256_unpacklo_epi16(data_2, data_3); - const __m256i src_4 = _mm256_unpacklo_epi16(data_4, data_5); - const __m256i src_6 = _mm256_unpacklo_epi16(data_6, data_7); - - const __m256i res_0 = _mm256_madd_epi16(src_0, coeffs_01); - const __m256i res_2 = _mm256_madd_epi16(src_2, coeffs_23); - const __m256i res_4 = _mm256_madd_epi16(src_4, coeffs_45); - const __m256i res_6 = _mm256_madd_epi16(src_6, coeffs_67); - - const __m256i res_even = _mm256_add_epi32( - _mm256_add_epi32(res_0, res_2), _mm256_add_epi32(res_4, res_6)); - - // Filter the odd-indices, increasing to 32-bit precision - const __m256i src_1 = _mm256_unpackhi_epi16(data_0, data_1); - const __m256i src_3 = _mm256_unpackhi_epi16(data_2, data_3); - const __m256i src_5 = _mm256_unpackhi_epi16(data_4, data_5); - const __m256i src_7 = _mm256_unpackhi_epi16(data_6, data_7); - - const __m256i res_1 = _mm256_madd_epi16(src_1, coeffs_01); - const __m256i res_3 = _mm256_madd_epi16(src_3, coeffs_23); - const __m256i res_5 = _mm256_madd_epi16(src_5, coeffs_45); - const __m256i res_7 = _mm256_madd_epi16(src_7, coeffs_67); - - const __m256i res_odd = _mm256_add_epi32( - _mm256_add_epi32(res_1, res_3), _mm256_add_epi32(res_5, res_7)); - - // Pixels are currently in the following order: - // res_even order: [ 14 12 10 8 ] [ 6 4 2 0 ] - // res_odd order: [ 15 13 11 9 ] [ 7 5 3 1 ] - // - // Rearrange the pixels into the following order: - // res_lo order: [ 11 10 9 8 ] [ 3 2 1 0 ] - // res_hi order: [ 15 14 13 12 ] [ 7 6 5 4 ] - const __m256i res_lo = _mm256_unpacklo_epi32(res_even, res_odd); - const __m256i res_hi = _mm256_unpackhi_epi32(res_even, res_odd); - - const __m256i res_lo_round = 
_mm256_srai_epi32( - _mm256_add_epi32(res_lo, round_const), conv_params->round_1); - const __m256i res_hi_round = _mm256_srai_epi32( - _mm256_add_epi32(res_hi, round_const), conv_params->round_1); - - // Reduce to 16-bit precision and pack into the correct order: - // [ 15 14 13 12 11 10 9 8 ][ 7 6 5 4 3 2 1 0 ] - const __m256i res_16bit = - _mm256_packs_epi32(res_lo_round, res_hi_round); - - // Reduce to 8-bit precision. This messes up the order: - // [ - - - - - - - - 15 14 13 12 11 10 9 8 ] - // [ - - - - - - - - 7 6 5 4 3 2 1 0 ] - const __m256i res_8bit = - _mm256_packus_epi16(res_16bit, zero_256 /* don't care value */); - - // Swap the two central 32-bit values to get the order: - // [ - - - - - - - - - - - - - - - - ] - // [ 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 ] - const __m256i res_8bit2 = _mm256_permute4x64_epi64(res_8bit, 0xd8); - - // Store the lower 128-bit lane in the dst array - xx_storeu_128(dst + i * dst_stride + j, - _mm256_castsi256_si128(res_8bit2)); + /* Vertical filter */ + { + __m256i src_0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride)); + __m256i src_1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride)); + __m256i src_2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride)); + __m256i src_3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride)); + __m256i src_4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride)); + __m256i src_5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride)); + + __m256i s[8]; + s[0] = _mm256_unpacklo_epi16(src_0, src_1); + s[1] = _mm256_unpacklo_epi16(src_2, src_3); + s[2] = _mm256_unpacklo_epi16(src_4, src_5); + + s[4] = _mm256_unpackhi_epi16(src_0, src_1); + s[5] = _mm256_unpackhi_epi16(src_2, src_3); + s[6] = _mm256_unpackhi_epi16(src_4, src_5); + + for (i = 0; i < h - 1; i += 2) { + const int16_t *data = &im_block[i * im_stride]; + + const __m256i s6 = + _mm256_loadu_si256((__m256i *)(data + 6 * im_stride)); + const __m256i s7 = + _mm256_loadu_si256((__m256i *)(data + 
7 * im_stride)); + + s[3] = _mm256_unpacklo_epi16(s6, s7); + s[7] = _mm256_unpackhi_epi16(s6, s7); + + __m256i res_a = convolve(s, coeffs_v); + __m256i res_b = convolve(s + 4, coeffs_v); + + const __m256i res_a_round = _mm256_sra_epi32( + _mm256_add_epi32(res_a, round_const_v), round_shift_v); + const __m256i res_b_round = _mm256_sra_epi32( + _mm256_add_epi32(res_b, round_const_v), round_shift_v); + + /* rounding code */ + // 16 bit conversion + const __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round); + // 8 bit conversion and saturation to uint8 + const __m256i res_8b = _mm256_packus_epi16(res_16bit, res_16bit); + + const __m128i res_0 = _mm256_castsi256_si128(res_8b); + const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1); + + // Store values into the destination buffer + __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j]; + __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + j + dst_stride]; + + _mm_storel_epi64(p_0, res_0); + _mm_storel_epi64(p_1, res_1); + + s[0] = s[1]; + s[1] = s[2]; + s[2] = s[3]; + + s[4] = s[5]; + s[5] = s[6]; + s[6] = s[7]; + } + if (h - i) { + s[0] = _mm256_permute2x128_si256(s[0], s[4], 0x20); + s[1] = _mm256_permute2x128_si256(s[1], s[5], 0x20); + s[2] = _mm256_permute2x128_si256(s[2], s[6], 0x20); + + const int16_t *data = &im_block[i * im_stride]; + const __m128i s6_ = _mm_loadu_si128((__m128i *)(data + 6 * im_stride)); + const __m128i s7_ = _mm_loadu_si128((__m128i *)(data + 7 * im_stride)); + + __m128i s3 = _mm_unpacklo_epi16(s6_, s7_); + __m128i s7 = _mm_unpackhi_epi16(s6_, s7_); + + s[3] = _mm256_inserti128_si256(_mm256_castsi128_si256(s3), s7, 1); + __m256i convolveres = convolve(s, coeffs_v); + + const __m256i res_round = _mm256_sra_epi32( + _mm256_add_epi32(convolveres, round_const_v), round_shift_v); + + /* rounding code */ + // 16 bit conversion + __m128i reslo = _mm256_castsi256_si128(res_round); + __m128i reshi = _mm256_extracti128_si256(res_round, 1); + const __m128i res_16bit = 
_mm_packus_epi32(reslo, reshi); + + // 8 bit conversion and saturation to uint8 + const __m128i res_8b = _mm_packus_epi16(res_16bit, res_16bit); + __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j]; + _mm_storel_epi64(p_0, res_8b); } } } |