diff options
Diffstat (limited to 'third_party/aom/av1/encoder')
90 files changed, 10010 insertions, 5857 deletions
diff --git a/third_party/aom/av1/encoder/aq_complexity.c b/third_party/aom/av1/encoder/aq_complexity.c index b721b6d2b..80f8e2e66 100644 --- a/third_party/aom/av1/encoder/aq_complexity.c +++ b/third_party/aom/av1/encoder/aq_complexity.c @@ -143,9 +143,10 @@ void av1_caq_select_segment(const AV1_COMP *cpi, MACROBLOCK *mb, BLOCK_SIZE bs, get_aq_c_strength(cm->base_qindex, cm->seq_params.bit_depth); aom_clear_system_state(); - low_var_thresh = (cpi->oxcf.pass == 2) ? AOMMAX(cpi->twopass.mb_av_energy, - MIN_DEFAULT_LV_THRESH) - : DEFAULT_LV_THRESH; + low_var_thresh = + (cpi->oxcf.pass == 2) + ? AOMMAX(exp(cpi->twopass.mb_av_energy), MIN_DEFAULT_LV_THRESH) + : DEFAULT_LV_THRESH; av1_setup_src_planes(mb, cpi->source, mi_row, mi_col, num_planes); logvar = av1_log_block_var(cpi, mb, bs); diff --git a/third_party/aom/av1/encoder/aq_complexity.h b/third_party/aom/av1/encoder/aq_complexity.h index af525b36d..3421d74c9 100644 --- a/third_party/aom/av1/encoder/aq_complexity.h +++ b/third_party/aom/av1/encoder/aq_complexity.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_ENCODER_AQ_COMPLEXITY_H_ -#define AV1_ENCODER_AQ_COMPLEXITY_H_ +#ifndef AOM_AV1_ENCODER_AQ_COMPLEXITY_H_ +#define AOM_AV1_ENCODER_AQ_COMPLEXITY_H_ #ifdef __cplusplus extern "C" { @@ -34,4 +34,4 @@ void av1_setup_in_frame_q_adj(struct AV1_COMP *cpi); } // extern "C" #endif -#endif // AV1_ENCODER_AQ_COMPLEXITY_H_ +#endif // AOM_AV1_ENCODER_AQ_COMPLEXITY_H_ diff --git a/third_party/aom/av1/encoder/aq_cyclicrefresh.c b/third_party/aom/av1/encoder/aq_cyclicrefresh.c index dec2c730d..f532d48da 100644 --- a/third_party/aom/av1/encoder/aq_cyclicrefresh.c +++ b/third_party/aom/av1/encoder/aq_cyclicrefresh.c @@ -80,9 +80,11 @@ CYCLIC_REFRESH *av1_cyclic_refresh_alloc(int mi_rows, int mi_cols) { } void av1_cyclic_refresh_free(CYCLIC_REFRESH *cr) { - aom_free(cr->map); - aom_free(cr->last_coded_q_map); - aom_free(cr); + if (cr != NULL) { + aom_free(cr->map); + aom_free(cr->last_coded_q_map); + aom_free(cr); + } } // Check if we should turn off cyclic refresh based on bitrate condition. diff --git a/third_party/aom/av1/encoder/aq_cyclicrefresh.h b/third_party/aom/av1/encoder/aq_cyclicrefresh.h index 459ab80b8..b45781983 100644 --- a/third_party/aom/av1/encoder/aq_cyclicrefresh.h +++ b/third_party/aom/av1/encoder/aq_cyclicrefresh.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_ENCODER_AQ_CYCLICREFRESH_H_ -#define AV1_ENCODER_AQ_CYCLICREFRESH_H_ +#ifndef AOM_AV1_ENCODER_AQ_CYCLICREFRESH_H_ +#define AOM_AV1_ENCODER_AQ_CYCLICREFRESH_H_ #include "av1/common/blockd.h" @@ -95,4 +95,4 @@ static INLINE int cyclic_refresh_segment_id(int segment_id) { } // extern "C" #endif -#endif // AV1_ENCODER_AQ_CYCLICREFRESH_H_ +#endif // AOM_AV1_ENCODER_AQ_CYCLICREFRESH_H_ diff --git a/third_party/aom/av1/encoder/aq_variance.c b/third_party/aom/av1/encoder/aq_variance.c index 6cb6adc42..58f906bdc 100644 --- a/third_party/aom/av1/encoder/aq_variance.c +++ b/third_party/aom/av1/encoder/aq_variance.c @@ -14,34 +14,33 @@ #include "aom_ports/mem.h" #include "av1/encoder/aq_variance.h" - #include "av1/common/seg_common.h" +#include "av1/encoder/encodeframe.h" #include "av1/encoder/ratectrl.h" #include "av1/encoder/rd.h" #include "av1/encoder/segmentation.h" #include "av1/encoder/dwt.h" #include "aom_ports/system_state.h" +static const double rate_ratio[MAX_SEGMENTS] = { 2.2, 1.7, 1.3, 1.0, + 0.9, .8, .7, .6 }; + +static const double deltaq_rate_ratio[MAX_SEGMENTS] = { 2.5, 2.0, 1.5, 1.0, + 0.75, 1.0, 1.0, 1.0 }; #define ENERGY_MIN (-4) #define ENERGY_MAX (1) #define ENERGY_SPAN (ENERGY_MAX - ENERGY_MIN + 1) #define ENERGY_IN_BOUNDS(energy) \ assert((energy) >= ENERGY_MIN && (energy) <= ENERGY_MAX) -static const double rate_ratio[MAX_SEGMENTS] = { 2.5, 2.0, 1.5, 1.0, - 0.75, 1.0, 1.0, 1.0 }; -static const int segment_id[ENERGY_SPAN] = { 0, 1, 1, 2, 3, 4 }; - -#define SEGMENT_ID(i) segment_id[(i)-ENERGY_MIN] - DECLARE_ALIGNED(16, static const uint8_t, av1_all_zeros[MAX_SB_SIZE]) = { 0 }; + DECLARE_ALIGNED(16, static const uint16_t, av1_highbd_all_zeros[MAX_SB_SIZE]) = { 0 }; -unsigned int av1_vaq_segment_id(int energy) { - ENERGY_IN_BOUNDS(energy); - return SEGMENT_ID(energy); -} +static const int segment_id[ENERGY_SPAN] = { 0, 1, 1, 2, 3, 4 }; + +#define SEGMENT_ID(i) segment_id[(i)-ENERGY_MIN] void av1_vaq_frame_setup(AV1_COMP *cpi) { AV1_COMMON *cm = &cpi->common; @@ -51,6 +50,12 @@ void av1_vaq_frame_setup(AV1_COMP *cpi) { int resolution_change = cm->prev_frame && (cm->width != cm->prev_frame->width || cm->height != cm->prev_frame->height); + int avg_energy = (int)(cpi->twopass.mb_av_energy - 2); + double avg_ratio; + if (avg_energy > 7) avg_energy = 7; + if (avg_energy < 0) avg_energy = 0; + avg_ratio = rate_ratio[avg_energy]; + if (resolution_change) { memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols); av1_clearall_segfeatures(seg); @@ -69,9 +74,11 @@ void av1_vaq_frame_setup(AV1_COMP *cpi) { aom_clear_system_state(); for (i = 0; i < MAX_SEGMENTS; ++i) { - int qindex_delta = - av1_compute_qdelta_by_rate(&cpi->rc, cm->frame_type, cm->base_qindex, - rate_ratio[i], cm->seq_params.bit_depth); + // Set up avg segment id to be 1.0 and adjust the other segments around + // it. + int qindex_delta = av1_compute_qdelta_by_rate( + &cpi->rc, cm->frame_type, cm->base_qindex, rate_ratio[i] / avg_ratio, + cm->seq_params.bit_depth); // We don't allow qindex 0 in a segment if the base value is not 0. // Q index 0 (lossless) implies 4x4 encoding only and in AQ mode a segment @@ -87,114 +94,58 @@ void av1_vaq_frame_setup(AV1_COMP *cpi) { } } -/* TODO(agrange, paulwilkins): The block_variance calls the unoptimized versions - * of variance() and highbd_8_variance(). It should not. - */ -static void aq_variance(const uint8_t *a, int a_stride, const uint8_t *b, - int b_stride, int w, int h, unsigned int *sse, - int *sum) { - int i, j; - - *sum = 0; - *sse = 0; +int av1_log_block_var(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs) { + // This functions returns a score for the blocks local variance as calculated + // by: sum of the log of the (4x4 variances) of each subblock to the current + // block (x,bs) + // * 32 / number of pixels in the block_size. + // This is used for segmentation because to avoid situations in which a large + // block with a gentle gradient gets marked high variance even though each + // subblock has a low variance. This allows us to assign the same segment + // number for the same sorts of area regardless of how the partitioning goes. - for (i = 0; i < h; i++) { - for (j = 0; j < w; j++) { - const int diff = a[j] - b[j]; - *sum += diff; - *sse += diff * diff; - } - - a += a_stride; - b += b_stride; - } -} - -static void aq_highbd_variance64(const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, int w, int h, - uint64_t *sse, uint64_t *sum) { + MACROBLOCKD *xd = &x->e_mbd; + double var = 0; + unsigned int sse; int i, j; - uint16_t *a = CONVERT_TO_SHORTPTR(a8); - uint16_t *b = CONVERT_TO_SHORTPTR(b8); - *sum = 0; - *sse = 0; - - for (i = 0; i < h; i++) { - for (j = 0; j < w; j++) { - const int diff = a[j] - b[j]; - *sum += diff; - *sse += diff * diff; - } - a += a_stride; - b += b_stride; - } -} - -static void aq_highbd_8_variance(const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, int w, int h, - unsigned int *sse, int *sum) { - uint64_t sse_long = 0; - uint64_t sum_long = 0; - aq_highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long); - *sse = (unsigned int)sse_long; - *sum = (int)sum_long; -} - -static unsigned int block_variance(const AV1_COMP *const cpi, MACROBLOCK *x, - BLOCK_SIZE bs) { - MACROBLOCKD *xd = &x->e_mbd; - unsigned int var, sse; int right_overflow = (xd->mb_to_right_edge < 0) ? ((-xd->mb_to_right_edge) >> 3) : 0; int bottom_overflow = (xd->mb_to_bottom_edge < 0) ? ((-xd->mb_to_bottom_edge) >> 3) : 0; - if (right_overflow || bottom_overflow) { - const int bw = MI_SIZE * mi_size_wide[bs] - right_overflow; - const int bh = MI_SIZE * mi_size_high[bs] - bottom_overflow; - int avg; - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - aq_highbd_8_variance(x->plane[0].src.buf, x->plane[0].src.stride, - CONVERT_TO_BYTEPTR(av1_highbd_all_zeros), 0, bw, bh, - &sse, &avg); - sse >>= 2 * (xd->bd - 8); - avg >>= (xd->bd - 8); - } else { - aq_variance(x->plane[0].src.buf, x->plane[0].src.stride, av1_all_zeros, 0, - bw, bh, &sse, &avg); - } - var = sse - (unsigned int)(((int64_t)avg * avg) / (bw * bh)); - return (unsigned int)((uint64_t)var * 256) / (bw * bh); - } else { - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - var = - cpi->fn_ptr[bs].vf(x->plane[0].src.buf, x->plane[0].src.stride, - CONVERT_TO_BYTEPTR(av1_highbd_all_zeros), 0, &sse); - } else { - var = cpi->fn_ptr[bs].vf(x->plane[0].src.buf, x->plane[0].src.stride, - av1_all_zeros, 0, &sse); + const int bw = MI_SIZE * mi_size_wide[bs] - right_overflow; + const int bh = MI_SIZE * mi_size_high[bs] - bottom_overflow; + + aom_clear_system_state(); + + for (i = 0; i < bh; i += 4) { + for (j = 0; j < bw; j += 4) { + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + var += + log(1.0 + cpi->fn_ptr[BLOCK_4X4].vf( + x->plane[0].src.buf + i * x->plane[0].src.stride + j, + x->plane[0].src.stride, + CONVERT_TO_BYTEPTR(av1_highbd_all_zeros), 0, &sse) / + 16); + } else { + var += + log(1.0 + cpi->fn_ptr[BLOCK_4X4].vf( + x->plane[0].src.buf + i * x->plane[0].src.stride + j, + x->plane[0].src.stride, av1_all_zeros, 0, &sse) / + 16); + } } - return (unsigned int)((uint64_t)var * 256) >> num_pels_log2_lookup[bs]; } -} + // Use average of 4x4 log variance. The range for 8 bit 0 - 9.704121561. + var /= (bw / 4 * bh / 4); + if (var > 7) var = 7; -double av1_log_block_var(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs) { - unsigned int var = block_variance(cpi, x, bs); aom_clear_system_state(); - return log(var + 1.0); + return (int)(var); } #define DEFAULT_E_MIDPOINT 10.0 -int av1_block_energy(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs) { - double energy; - double energy_midpoint; - aom_clear_system_state(); - energy_midpoint = - (cpi->oxcf.pass == 2) ? cpi->twopass.mb_av_energy : DEFAULT_E_MIDPOINT; - energy = av1_log_block_var(cpi, x, bs) - energy_midpoint; - return clamp((int)round(energy), ENERGY_MIN, ENERGY_MAX); -} unsigned int haar_ac_energy(MACROBLOCK *x, BLOCK_SIZE bs) { MACROBLOCKD *xd = &x->e_mbd; @@ -231,17 +182,21 @@ int av1_block_wavelet_energy_level(const AV1_COMP *cpi, MACROBLOCK *x, int av1_compute_deltaq_from_energy_level(const AV1_COMP *const cpi, int block_var_level) { - ENERGY_IN_BOUNDS(block_var_level); - - const int rate_level = SEGMENT_ID(block_var_level); + int rate_level; const AV1_COMMON *const cm = &cpi->common; + + if (DELTAQ_MODULATION == 1) { + ENERGY_IN_BOUNDS(block_var_level); + rate_level = SEGMENT_ID(block_var_level); + } else { + rate_level = block_var_level; + } int qindex_delta = av1_compute_qdelta_by_rate( - &cpi->rc, cm->frame_type, cm->base_qindex, rate_ratio[rate_level], + &cpi->rc, cm->frame_type, cm->base_qindex, deltaq_rate_ratio[rate_level], cm->seq_params.bit_depth); if ((cm->base_qindex != 0) && ((cm->base_qindex + qindex_delta) == 0)) { qindex_delta = -cm->base_qindex + 1; } - return qindex_delta; } diff --git a/third_party/aom/av1/encoder/aq_variance.h b/third_party/aom/av1/encoder/aq_variance.h index b1a8bc38a..2d22b663e 100644 --- a/third_party/aom/av1/encoder/aq_variance.h +++ b/third_party/aom/av1/encoder/aq_variance.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_ENCODER_AQ_VARIANCE_H_ -#define AV1_ENCODER_AQ_VARIANCE_H_ +#ifndef AOM_AV1_ENCODER_AQ_VARIANCE_H_ +#define AOM_AV1_ENCODER_AQ_VARIANCE_H_ #include "av1/encoder/encoder.h" @@ -18,11 +18,9 @@ extern "C" { #endif -unsigned int av1_vaq_segment_id(int energy); void av1_vaq_frame_setup(AV1_COMP *cpi); -int av1_block_energy(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs); -double av1_log_block_var(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs); +int av1_log_block_var(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs); int av1_compute_deltaq_from_energy_level(const AV1_COMP *const cpi, int block_var_level); int av1_block_wavelet_energy_level(const AV1_COMP *cpi, MACROBLOCK *x, @@ -32,4 +30,4 @@ int av1_block_wavelet_energy_level(const AV1_COMP *cpi, MACROBLOCK *x, } // extern "C" #endif -#endif // AV1_ENCODER_AQ_VARIANCE_H_ +#endif // AOM_AV1_ENCODER_AQ_VARIANCE_H_ diff --git a/third_party/aom/av1/encoder/av1_fwd_txfm1d.c b/third_party/aom/av1/encoder/av1_fwd_txfm1d.c index b92b3469f..98505e0b1 100644 --- a/third_party/aom/av1/encoder/av1_fwd_txfm1d.c +++ b/third_party/aom/av1/encoder/av1_fwd_txfm1d.c @@ -11,24 +11,7 @@ #include <stdlib.h> #include "av1/encoder/av1_fwd_txfm1d.h" - -#if CONFIG_COEFFICIENT_RANGE_CHECKING -void range_check_func(int32_t stage, const int32_t *input, const int32_t *buf, - int32_t size, int8_t bit); - -#define range_check(stage, input, buf, size, bit) \ - range_check_func(stage, input, buf, size, bit) -#else // CONFIG_COEFFICIENT_RANGE_CHECKING - -#define range_check(stage, input, buf, size, bit) \ - { \ - (void)stage; \ - (void)input; \ - (void)buf; \ - (void)size; \ - (void)bit; \ - } -#endif // CONFIG_COEFFICIENT_RANGE_CHECKING +#include "av1/common/av1_txfm.h" void av1_fdct4_new(const int32_t *input, int32_t *output, int8_t cos_bit, const int8_t *stage_range) { @@ -40,7 +23,7 @@ void av1_fdct4_new(const int32_t *input, int32_t *output, int8_t cos_bit, int32_t step[4]; // stage 0; - range_check(stage, input, input, size, stage_range[stage]); + av1_range_check_buf(stage, input, input, size, stage_range[stage]); // stage 1; stage++; @@ -49,7 +32,7 @@ void av1_fdct4_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[1] = input[1] + input[2]; bf1[2] = -input[2] + input[1]; bf1[3] = -input[3] + input[0]; - range_check(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 2 stage++; @@ -60,7 +43,7 @@ void av1_fdct4_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit); bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit); bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit); - range_check(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 3 stage++; @@ -70,7 +53,7 @@ void av1_fdct4_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[1] = bf0[2]; bf1[2] = bf0[1]; bf1[3] = bf0[3]; - range_check(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); } void av1_fdct8_new(const int32_t *input, int32_t *output, int8_t cos_bit, @@ -83,7 +66,7 @@ void av1_fdct8_new(const int32_t *input, int32_t *output, int8_t cos_bit, int32_t step[8]; // stage 0; - range_check(stage, input, input, size, stage_range[stage]); + av1_range_check_buf(stage, input, input, size, stage_range[stage]); // stage 1; stage++; @@ -96,7 +79,7 @@ void av1_fdct8_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[5] = -input[5] + input[2]; bf1[6] = -input[6] + input[1]; bf1[7] = -input[7] + input[0]; - range_check(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 2 stage++; @@ -111,7 +94,7 @@ void av1_fdct8_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit); bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit); bf1[7] = bf0[7]; - range_check(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 3 stage++; @@ -126,7 +109,7 @@ void av1_fdct8_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[5] = -bf0[5] + bf0[4]; bf1[6] = -bf0[6] + bf0[7]; bf1[7] = bf0[7] + bf0[6]; - range_check(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 4 stage++; @@ -141,7 +124,7 @@ void av1_fdct8_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit); bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit); bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit); - range_check(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 5 stage++; @@ -155,7 +138,7 @@ void av1_fdct8_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[5] = bf0[5]; bf1[6] = bf0[3]; bf1[7] = bf0[7]; - range_check(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); } void av1_fdct16_new(const int32_t *input, int32_t *output, int8_t cos_bit, @@ -168,7 +151,7 @@ void av1_fdct16_new(const int32_t *input, int32_t *output, int8_t cos_bit, int32_t step[16]; // stage 0; - range_check(stage, input, input, size, stage_range[stage]); + av1_range_check_buf(stage, input, input, size, stage_range[stage]); // stage 1; stage++; @@ -189,7 +172,7 @@ void av1_fdct16_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[13] = -input[13] + input[2]; bf1[14] = -input[14] + input[1]; bf1[15] = -input[15] + input[0]; - range_check(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 2 stage++; @@ -212,7 +195,7 @@ void av1_fdct16_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit); bf1[14] = bf0[14]; bf1[15] = bf0[15]; - range_check(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 3 stage++; @@ -235,7 +218,7 @@ void av1_fdct16_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[13] = -bf0[13] + bf0[14]; bf1[14] = bf0[14] + bf0[13]; bf1[15] = bf0[15] + bf0[12]; - range_check(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 4 stage++; @@ -258,7 +241,7 @@ void av1_fdct16_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit); bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit); bf1[15] = bf0[15]; - range_check(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 5 stage++; @@ -281,7 +264,7 @@ void av1_fdct16_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[13] = -bf0[13] + bf0[12]; bf1[14] = -bf0[14] + bf0[15]; bf1[15] = bf0[15] + bf0[14]; - range_check(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 6 stage++; @@ -304,7 +287,7 @@ void av1_fdct16_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[13] = half_btf(cospi[44], bf0[13], -cospi[20], bf0[10], cos_bit); bf1[14] = half_btf(cospi[28], bf0[14], -cospi[36], bf0[9], cos_bit); bf1[15] = half_btf(cospi[60], bf0[15], -cospi[4], bf0[8], cos_bit); - range_check(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 7 stage++; @@ -326,7 +309,7 @@ void av1_fdct16_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[13] = bf0[11]; bf1[14] = bf0[7]; bf1[15] = bf0[15]; - range_check(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); } void av1_fdct32_new(const int32_t *input, int32_t *output, int8_t cos_bit, @@ -339,7 +322,7 @@ void av1_fdct32_new(const int32_t *input, int32_t *output, int8_t cos_bit, int32_t step[32]; // stage 0; - range_check(stage, input, input, size, stage_range[stage]); + av1_range_check_buf(stage, input, input, size, stage_range[stage]); // stage 1; stage++; @@ -376,7 +359,7 @@ void av1_fdct32_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[29] = -input[29] + input[2]; bf1[30] = -input[30] + input[1]; bf1[31] = -input[31] + input[0]; - range_check(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 2 stage++; @@ -415,7 +398,7 @@ void av1_fdct32_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[29] = bf0[29]; bf1[30] = bf0[30]; bf1[31] = bf0[31]; - range_check(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 3 stage++; @@ -454,7 +437,7 @@ void av1_fdct32_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[29] = bf0[29] + bf0[26]; bf1[30] = bf0[30] + bf0[25]; bf1[31] = bf0[31] + bf0[24]; - range_check(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 4 stage++; @@ -493,7 +476,7 @@ void av1_fdct32_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[29] = half_btf(cospi[16], bf0[29], cospi[48], bf0[18], cos_bit); bf1[30] = bf0[30]; bf1[31] = bf0[31]; - range_check(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 5 stage++; @@ -532,7 +515,7 @@ void av1_fdct32_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[29] = -bf0[29] + bf0[30]; bf1[30] = bf0[30] + bf0[29]; bf1[31] = bf0[31] + bf0[28]; - range_check(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 6 stage++; @@ -571,7 +554,7 @@ void av1_fdct32_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[29] = half_btf(cospi[56], bf0[29], -cospi[8], bf0[18], cos_bit); bf1[30] = half_btf(cospi[8], bf0[30], cospi[56], bf0[17], cos_bit); bf1[31] = bf0[31]; - range_check(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 7 stage++; @@ -610,7 +593,7 @@ void av1_fdct32_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[29] = -bf0[29] + bf0[28]; bf1[30] = -bf0[30] + bf0[31]; bf1[31] = bf0[31] + bf0[30]; - range_check(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 8 stage++; @@ -649,7 +632,7 @@ void av1_fdct32_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[29] = half_btf(cospi[46], bf0[29], -cospi[18], bf0[18], cos_bit); bf1[30] = half_btf(cospi[30], bf0[30], -cospi[34], bf0[17], cos_bit); bf1[31] = half_btf(cospi[62], bf0[31], -cospi[2], bf0[16], cos_bit); - range_check(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 9 stage++; @@ -687,7 +670,7 @@ void av1_fdct32_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[29] = bf0[23]; bf1[30] = bf0[15]; bf1[31] = bf0[31]; - range_check(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); } void av1_fadst4_new(const int32_t *input, int32_t *output, int8_t cos_bit, @@ -698,7 +681,7 @@ void av1_fadst4_new(const int32_t *input, int32_t *output, int8_t cos_bit, int32_t s0, s1, s2, s3, s4, s5, s6, s7; // stage 0 - range_check(0, input, input, 4, stage_range[0]); + av1_range_check_buf(0, input, input, 4, stage_range[0]); x0 = input[0]; x1 = input[1]; x2 = input[2]; @@ -746,7 +729,7 @@ void av1_fadst4_new(const int32_t *input, int32_t *output, int8_t cos_bit, output[1] = round_shift(s1, bit); output[2] = round_shift(s2, bit); output[3] = round_shift(s3, bit); - range_check(6, input, output, 4, stage_range[6]); + av1_range_check_buf(6, input, output, 4, stage_range[6]); } void av1_fadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit, @@ -759,7 +742,7 @@ void av1_fadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit, int32_t step[8]; // stage 0; - range_check(stage, input, input, size, stage_range[stage]); + av1_range_check_buf(stage, input, input, size, stage_range[stage]); // stage 1; stage++; @@ -773,7 +756,7 @@ void av1_fadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[5] = input[6]; bf1[6] = input[2]; bf1[7] = -input[5]; - range_check(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 2 stage++; @@ -788,7 +771,7 @@ void av1_fadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[5] = bf0[5]; bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit); bf1[7] = half_btf(cospi[32], bf0[6], -cospi[32], bf0[7], cos_bit); - range_check(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 3 stage++; @@ -802,7 +785,7 @@ void av1_fadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[5] = bf0[5] + bf0[7]; bf1[6] = bf0[4] - bf0[6]; bf1[7] = bf0[5] - bf0[7]; - range_check(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 4 stage++; @@ -817,7 +800,7 @@ void av1_fadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[5] = half_btf(cospi[48], bf0[4], -cospi[16], bf0[5], cos_bit); bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit); bf1[7] = half_btf(cospi[16], bf0[6], cospi[48], bf0[7], cos_bit); - range_check(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 5 stage++; @@ -831,7 +814,7 @@ void av1_fadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[5] = bf0[1] - bf0[5]; bf1[6] = bf0[2] - bf0[6]; bf1[7] = bf0[3] - bf0[7]; - range_check(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 6 stage++; @@ -846,7 +829,7 @@ void av1_fadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[5] = half_btf(cospi[28], bf0[4], -cospi[36], bf0[5], cos_bit); bf1[6] = half_btf(cospi[52], bf0[6], cospi[12], bf0[7], cos_bit); bf1[7] = half_btf(cospi[12], bf0[6], -cospi[52], bf0[7], cos_bit); - range_check(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 7 stage++; @@ -860,7 +843,7 @@ void av1_fadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[5] = bf0[2]; bf1[6] = bf0[7]; bf1[7] = bf0[0]; - range_check(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); } void av1_fadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit, @@ -873,7 +856,7 @@ void av1_fadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit, int32_t step[16]; // stage 0; - range_check(stage, input, input, size, stage_range[stage]); + av1_range_check_buf(stage, input, input, size, stage_range[stage]); // stage 1; stage++; @@ -895,7 +878,7 @@ void av1_fadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[13] = -input[13]; bf1[14] = -input[5]; bf1[15] = input[10]; - range_check(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 2 stage++; @@ -918,7 +901,7 @@ void av1_fadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[13] = bf0[13]; bf1[14] = half_btf(cospi[32], bf0[14], cospi[32], bf0[15], cos_bit); bf1[15] = half_btf(cospi[32], bf0[14], -cospi[32], bf0[15], cos_bit); - range_check(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 3 stage++; @@ -940,7 +923,7 @@ void av1_fadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[13] = bf0[13] + bf0[15]; bf1[14] = bf0[12] - bf0[14]; bf1[15] = bf0[13] - bf0[15]; - range_check(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 4 stage++; @@ -963,7 +946,7 @@ void av1_fadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[13] = half_btf(cospi[48], bf0[12], -cospi[16], bf0[13], cos_bit); bf1[14] = half_btf(-cospi[48], bf0[14], cospi[16], bf0[15], cos_bit); bf1[15] = half_btf(cospi[16], bf0[14], cospi[48], bf0[15], cos_bit); - range_check(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 5 stage++; @@ -985,7 +968,7 @@ void av1_fadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[13] = bf0[9] - bf0[13]; bf1[14] = bf0[10] - bf0[14]; bf1[15] = bf0[11] - bf0[15]; - range_check(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 6 stage++; @@ -1008,7 +991,7 @@ void av1_fadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[13] = half_btf(cospi[8], bf0[12], cospi[56], bf0[13], cos_bit); bf1[14] = half_btf(-cospi[24], bf0[14], cospi[40], bf0[15], cos_bit); bf1[15] = half_btf(cospi[40], bf0[14], cospi[24], bf0[15], cos_bit); - range_check(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 7 stage++; @@ -1030,7 +1013,7 @@ void av1_fadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[13] = bf0[5] - bf0[13]; bf1[14] = bf0[6] - bf0[14]; bf1[15] = bf0[7] - bf0[15]; - range_check(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 8 stage++; @@ -1053,7 +1036,7 @@ void av1_fadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[13] = half_btf(cospi[14], bf0[12], -cospi[50], bf0[13], cos_bit); bf1[14] = half_btf(cospi[58], bf0[14], cospi[6], bf0[15], cos_bit); bf1[15] = half_btf(cospi[6], bf0[14], -cospi[58], bf0[15], cos_bit); - range_check(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 9 stage++; @@ -1075,7 +1058,7 @@ void av1_fadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[13] = bf0[2]; bf1[14] = bf0[15]; bf1[15] = bf0[0]; - range_check(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); } void av1_fidentity4_c(const int32_t *input, int32_t *output, int8_t cos_bit, @@ -1084,14 +1067,14 @@ void av1_fidentity4_c(const int32_t *input, int32_t *output, int8_t cos_bit, for (int i = 0; i < 4; ++i) output[i] = round_shift((int64_t)input[i] * NewSqrt2, NewSqrt2Bits); assert(stage_range[0] + NewSqrt2Bits <= 32); - range_check(0, input, output, 4, stage_range[0]); + av1_range_check_buf(0, input, output, 4, stage_range[0]); } void av1_fidentity8_c(const int32_t *input, int32_t *output, int8_t cos_bit, const int8_t *stage_range) { (void)cos_bit; for (int i = 0; i < 8; ++i) output[i] = input[i] * 2; - range_check(0, input, output, 8, stage_range[0]); + av1_range_check_buf(0, input, output, 8, stage_range[0]); } void av1_fidentity16_c(const int32_t *input, int32_t *output, int8_t cos_bit, @@ -1100,14 +1083,14 @@ void av1_fidentity16_c(const int32_t *input, int32_t *output, int8_t cos_bit, for (int i = 0; i < 16; ++i) output[i] = round_shift((int64_t)input[i] * 2 * NewSqrt2, NewSqrt2Bits); assert(stage_range[0] + NewSqrt2Bits <= 32); - range_check(0, input, output, 16, stage_range[0]); + av1_range_check_buf(0, input, output, 16, stage_range[0]); } void av1_fidentity32_c(const int32_t *input, int32_t *output, int8_t cos_bit, const int8_t *stage_range) { (void)cos_bit; for (int i = 0; i < 32; ++i) output[i] = input[i] * 4; - range_check(0, input, output, 32, stage_range[0]); + av1_range_check_buf(0, input, output, 32, stage_range[0]); } void av1_fdct64_new(const int32_t *input, int32_t *output, int8_t cos_bit, @@ -1120,7 +1103,7 @@ void av1_fdct64_new(const int32_t *input, int32_t *output, int8_t cos_bit, int32_t step[64]; // stage 0; - range_check(stage, input, input, size, stage_range[stage]); + av1_range_check_buf(stage, input, input, size, stage_range[stage]); // stage 1; stage++; @@ -1189,7 +1172,7 @@ void av1_fdct64_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[61] = -input[61] + input[2]; bf1[62] = -input[62] + input[1]; bf1[63] = -input[63] + input[0]; - range_check(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 2 stage++; @@ -1260,7 +1243,7 @@ void av1_fdct64_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[61] = bf0[61]; bf1[62] = bf0[62]; bf1[63] = bf0[63]; - range_check(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 3 stage++; @@ -1331,7 +1314,7 @@ void av1_fdct64_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[61] = bf0[61] + bf0[50]; bf1[62] = bf0[62] + bf0[49]; bf1[63] = bf0[63] + bf0[48]; - range_check(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 4 stage++; @@ -1402,7 +1385,7 @@ void av1_fdct64_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[61] = bf0[61]; bf1[62] = bf0[62]; bf1[63] = bf0[63]; - range_check(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 5 stage++; @@ -1473,7 +1456,7 @@ void av1_fdct64_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[61] = bf0[61] + bf0[58]; bf1[62] = bf0[62] + bf0[57]; bf1[63] = bf0[63] + bf0[56]; - range_check(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 6 stage++; @@ -1544,7 +1527,7 @@ void av1_fdct64_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[61] = half_btf(cospi[8], bf0[61], cospi[56], bf0[34], cos_bit); bf1[62] = bf0[62]; bf1[63] = bf0[63]; - range_check(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 7 stage++; @@ -1615,7 +1598,7 @@ void av1_fdct64_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[61] = -bf0[61] + bf0[62]; bf1[62] = bf0[62] + bf0[61]; bf1[63] = bf0[63] + bf0[60]; - range_check(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 8 stage++; @@ -1686,7 +1669,7 @@ void av1_fdct64_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[61] = half_btf(cospi[60], bf0[61], -cospi[4], bf0[34], cos_bit); bf1[62] = half_btf(cospi[4], bf0[62], cospi[60], bf0[33], cos_bit); bf1[63] = bf0[63]; - range_check(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 9 stage++; @@ -1757,7 +1740,7 @@ void av1_fdct64_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[61] = -bf0[61] + bf0[60]; bf1[62] = -bf0[62] + bf0[63]; bf1[63] = bf0[63] + bf0[62]; - range_check(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 10 stage++; @@ -1828,7 +1811,7 @@ void av1_fdct64_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[61] = half_btf(cospi[47], bf0[61], -cospi[17], bf0[34], cos_bit); bf1[62] = half_btf(cospi[31], bf0[62], -cospi[33], bf0[33], cos_bit); bf1[63] = half_btf(cospi[63], bf0[63], -cospi[1], bf0[32], cos_bit); - range_check(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 11 stage++; @@ -1898,5 +1881,5 @@ void av1_fdct64_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[61] = bf0[47]; bf1[62] = bf0[31]; bf1[63] = bf0[63]; - range_check(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); } diff --git a/third_party/aom/av1/encoder/av1_fwd_txfm1d.h b/third_party/aom/av1/encoder/av1_fwd_txfm1d.h index 9472af8e6..9dcf16552 100644 --- a/third_party/aom/av1/encoder/av1_fwd_txfm1d.h +++ b/third_party/aom/av1/encoder/av1_fwd_txfm1d.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_FWD_TXFM1D_H_ -#define AV1_FWD_TXFM1D_H_ +#ifndef AOM_AV1_ENCODER_AV1_FWD_TXFM1D_H_ +#define AOM_AV1_ENCODER_AV1_FWD_TXFM1D_H_ #include "av1/common/av1_txfm.h" @@ -46,4 +46,4 @@ void av1_fidentity32_c(const int32_t *input, int32_t *output, int8_t cos_bit, } #endif -#endif // AV1_FWD_TXFM1D_H_ +#endif // AOM_AV1_ENCODER_AV1_FWD_TXFM1D_H_ diff --git a/third_party/aom/av1/encoder/av1_fwd_txfm1d_cfg.h b/third_party/aom/av1/encoder/av1_fwd_txfm1d_cfg.h index 174689a14..98b6530db 100644 --- a/third_party/aom/av1/encoder/av1_fwd_txfm1d_cfg.h +++ b/third_party/aom/av1/encoder/av1_fwd_txfm1d_cfg.h @@ -9,11 +9,11 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_FWD_TXFM2D_CFG_H_ -#define AV1_FWD_TXFM2D_CFG_H_ +#ifndef AOM_AV1_ENCODER_AV1_FWD_TXFM1D_CFG_H_ +#define AOM_AV1_ENCODER_AV1_FWD_TXFM1D_CFG_H_ #include "av1/common/enums.h" #include "av1/encoder/av1_fwd_txfm1d.h" extern const int8_t *fwd_txfm_shift_ls[TX_SIZES_ALL]; extern const int8_t fwd_cos_bit_col[5][5]; extern const int8_t fwd_cos_bit_row[5][5]; -#endif // AV1_FWD_TXFM2D_CFG_H_ +#endif // AOM_AV1_ENCODER_AV1_FWD_TXFM1D_CFG_H_ diff --git a/third_party/aom/av1/encoder/av1_quantize.c b/third_party/aom/av1/encoder/av1_quantize.c index d0477b35b..a0a926005 100644 --- a/third_party/aom/av1/encoder/av1_quantize.c +++ b/third_party/aom/av1/encoder/av1_quantize.c @@ -273,35 +273,32 @@ void av1_quantize_b_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const SCAN_ORDER *sc, const QUANT_PARAM *qparam) { - // obsolete skip_block - const int skip_block = 0; const qm_val_t *qm_ptr = qparam->qmatrix; const qm_val_t *iqm_ptr = qparam->iqmatrix; if (qm_ptr != NULL && iqm_ptr != NULL) { - quantize_b_helper_c(coeff_ptr, n_coeffs, skip_block, p->zbin_QTX, - p->round_QTX, p->quant_QTX, p->quant_shift_QTX, - qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr, - sc->scan, sc->iscan, qm_ptr, iqm_ptr, - qparam->log_scale); + quantize_b_helper_c(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, + p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr, + dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan, + sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale); } else { switch (qparam->log_scale) { case 0: - aom_quantize_b(coeff_ptr, n_coeffs, skip_block, p->zbin_QTX, - p->round_QTX, p->quant_QTX, p->quant_shift_QTX, - qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr, - sc->scan, sc->iscan); + aom_quantize_b(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, + p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr, + dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan, + sc->iscan); break; case 1: - aom_quantize_b_32x32(coeff_ptr, n_coeffs, skip_block, p->zbin_QTX, - p->round_QTX, p->quant_QTX, p->quant_shift_QTX, - qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr, - sc->scan, sc->iscan); + aom_quantize_b_32x32(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, + p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr, + dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan, + sc->iscan); break; case 2: - aom_quantize_b_64x64(coeff_ptr, n_coeffs, skip_block, p->zbin_QTX, - p->round_QTX, p->quant_QTX, p->quant_shift_QTX, - qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr, - sc->scan, sc->iscan); + aom_quantize_b_64x64(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, + p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr, + dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan, + sc->iscan); break; default: assert(0); } @@ -392,28 +389,25 @@ void av1_highbd_quantize_b_facade(const tran_low_t *coeff_ptr, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const SCAN_ORDER *sc, const QUANT_PARAM *qparam) { - // obsolete skip_block - const int skip_block = 0; const qm_val_t *qm_ptr = qparam->qmatrix; const qm_val_t *iqm_ptr = qparam->iqmatrix; if (qm_ptr != NULL && iqm_ptr != NULL) { - highbd_quantize_b_helper_c(coeff_ptr, n_coeffs, skip_block, p->zbin_QTX, - p->round_QTX, p->quant_QTX, p->quant_shift_QTX, - qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr, - sc->scan, sc->iscan, qm_ptr, iqm_ptr, - qparam->log_scale); + highbd_quantize_b_helper_c(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, + p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr, + dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan, + sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale); } else { switch (qparam->log_scale) { case 0: if (LIKELY(n_coeffs >= 8)) { - aom_highbd_quantize_b(coeff_ptr, n_coeffs, skip_block, p->zbin_QTX, - p->round_QTX, p->quant_QTX, p->quant_shift_QTX, - qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, - eob_ptr, sc->scan, sc->iscan); + aom_highbd_quantize_b(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, + p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr, + dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan, + sc->iscan); } else { // TODO(luoyi): Need SIMD (e.g. sse2) for smaller block size // quantization - aom_highbd_quantize_b_c(coeff_ptr, n_coeffs, skip_block, p->zbin_QTX, + aom_highbd_quantize_b_c(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan, sc->iscan); @@ -421,15 +415,15 @@ void av1_highbd_quantize_b_facade(const tran_low_t *coeff_ptr, break; case 1: aom_highbd_quantize_b_32x32( - coeff_ptr, n_coeffs, skip_block, p->zbin_QTX, p->round_QTX, - p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, - p->dequant_QTX, eob_ptr, sc->scan, sc->iscan); + coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX, + p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, + eob_ptr, sc->scan, sc->iscan); break; case 2: aom_highbd_quantize_b_64x64( - coeff_ptr, n_coeffs, skip_block, p->zbin_QTX, p->round_QTX, - p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, - p->dequant_QTX, eob_ptr, sc->scan, sc->iscan); + coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX, + p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, + eob_ptr, sc->scan, sc->iscan); break; default: assert(0); } diff --git a/third_party/aom/av1/encoder/av1_quantize.h b/third_party/aom/av1/encoder/av1_quantize.h index eaf8374de..35af9a67a 100644 --- a/third_party/aom/av1/encoder/av1_quantize.h +++ b/third_party/aom/av1/encoder/av1_quantize.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_ENCODER_QUANTIZE_H_ -#define AV1_ENCODER_QUANTIZE_H_ +#ifndef AOM_AV1_ENCODER_AV1_QUANTIZE_H_ +#define AOM_AV1_ENCODER_AV1_QUANTIZE_H_ #include "config/aom_config.h" @@ -145,4 +145,4 @@ void av1_highbd_quantize_dc_facade(const tran_low_t *coeff_ptr, } // extern "C" #endif -#endif // AV1_ENCODER_QUANTIZE_H_ +#endif // AOM_AV1_ENCODER_AV1_QUANTIZE_H_ diff --git a/third_party/aom/av1/encoder/bitstream.c b/third_party/aom/av1/encoder/bitstream.c index 2070755cd..2c4acdb02 100644 --- a/third_party/aom/av1/encoder/bitstream.c +++ b/third_party/aom/av1/encoder/bitstream.c @@ -18,6 +18,7 @@ #include "aom_dsp/binary_codes_writer.h" #include "aom_dsp/bitwriter_buffer.h" #include "aom_mem/aom_mem.h" +#include "aom_ports/bitops.h" #include "aom_ports/mem_ops.h" #include "aom_ports/system_state.h" #if CONFIG_BITSTREAM_DEBUG @@ -30,7 +31,6 @@ #include "av1/common/entropymode.h" #include "av1/common/entropymv.h" #include "av1/common/mvref_common.h" -#include "av1/common/odintrin.h" #include "av1/common/pred_common.h" #include "av1/common/reconinter.h" #include "av1/common/reconintra.h" @@ -66,11 +66,11 @@ static void loop_restoration_write_sb_coeffs(const AV1_COMMON *const cm, aom_writer *const w, int plane, FRAME_COUNTS *counts); -static void write_intra_mode_kf(FRAME_CONTEXT *frame_ctx, - const MB_MODE_INFO *mi, - const MB_MODE_INFO *above_mi, - const MB_MODE_INFO *left_mi, - PREDICTION_MODE mode, aom_writer *w) { +static void write_intra_y_mode_kf(FRAME_CONTEXT *frame_ctx, + const MB_MODE_INFO *mi, + const MB_MODE_INFO *above_mi, + const MB_MODE_INFO *left_mi, + PREDICTION_MODE mode, aom_writer *w) { assert(!is_intrabc_block(mi)); (void)mi; aom_write_symbol(w, mode, get_y_mode_cdf(frame_ctx, above_mi, left_mi), @@ -297,7 +297,7 @@ static void write_delta_qindex(const MACROBLOCKD *xd, int delta_qindex, DELTA_Q_PROBS + 1); if (!smallval) { - rem_bits = OD_ILOG_NZ(abs - 1) - 1; + rem_bits = get_msb(abs - 1); thr = (1 << rem_bits) + 1; aom_write_literal(w, rem_bits - 1, 3); aom_write_literal(w, abs - thr, rem_bits); @@ -326,7 +326,7 @@ static void write_delta_lflevel(const AV1_COMMON *cm, const MACROBLOCKD *xd, } if (!smallval) { - rem_bits = OD_ILOG_NZ(abs - 1) - 1; + rem_bits = get_msb(abs - 1); thr = (1 << rem_bits) + 1; aom_write_literal(w, rem_bits - 1, 3); aom_write_literal(w, abs - thr, rem_bits); @@ -836,8 +836,8 @@ void av1_write_tx_type(const AV1_COMMON *const cm, const MACROBLOCKD *xd, } } -static void write_intra_mode(FRAME_CONTEXT *frame_ctx, BLOCK_SIZE bsize, - PREDICTION_MODE mode, aom_writer *w) { +static void write_intra_y_mode_nonkf(FRAME_CONTEXT *frame_ctx, BLOCK_SIZE bsize, + PREDICTION_MODE mode, aom_writer *w) { aom_write_symbol(w, mode, frame_ctx->y_mode_cdf[size_group_lookup[bsize]], INTRA_MODES); } @@ -933,45 +933,24 @@ static void write_inter_segment_id(AV1_COMP *cpi, aom_writer *w, } } -static void pack_inter_mode_mvs(AV1_COMP *cpi, const int mi_row, - const int mi_col, aom_writer *w) { +// If delta q is present, writes delta_q index. +// Also writes delta_q loop filter levels, if present. +static void write_delta_q_params(AV1_COMP *cpi, const int mi_row, + const int mi_col, int skip, aom_writer *w) { AV1_COMMON *const cm = &cpi->common; - MACROBLOCK *const x = &cpi->td.mb; - MACROBLOCKD *const xd = &x->e_mbd; - FRAME_CONTEXT *ec_ctx = xd->tile_ctx; - const struct segmentation *const seg = &cm->seg; - struct segmentation_probs *const segp = &ec_ctx->seg; - const MB_MODE_INFO *const mbmi = xd->mi[0]; - const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext; - const PREDICTION_MODE mode = mbmi->mode; - const int segment_id = mbmi->segment_id; - const BLOCK_SIZE bsize = mbmi->sb_type; - const int allow_hp = cm->allow_high_precision_mv; - const int is_inter = is_inter_block(mbmi); - const int is_compound = has_second_ref(mbmi); - int skip, ref; - (void)mi_row; - (void)mi_col; - - write_inter_segment_id(cpi, w, seg, segp, mi_row, mi_col, 0, 1); - - write_skip_mode(cm, xd, segment_id, mbmi, w); - - assert(IMPLIES(mbmi->skip_mode, mbmi->skip)); - skip = mbmi->skip_mode ? 1 : write_skip(cm, xd, segment_id, mbmi, w); - - write_inter_segment_id(cpi, w, seg, segp, mi_row, mi_col, skip, 0); - - write_cdef(cm, xd, w, skip, mi_col, mi_row); - if (cm->delta_q_present_flag) { - int super_block_upper_left = + MACROBLOCK *const x = &cpi->td.mb; + MACROBLOCKD *const xd = &x->e_mbd; + const MB_MODE_INFO *const mbmi = xd->mi[0]; + const BLOCK_SIZE bsize = mbmi->sb_type; + const int super_block_upper_left = ((mi_row & (cm->seq_params.mib_size - 1)) == 0) && ((mi_col & (cm->seq_params.mib_size - 1)) == 0); + if ((bsize != cm->seq_params.sb_size || skip == 0) && super_block_upper_left) { assert(mbmi->current_qindex > 0); - int reduced_delta_qindex = + const int reduced_delta_qindex = (mbmi->current_qindex - xd->current_qindex) / cm->delta_q_res; write_delta_qindex(xd, reduced_delta_qindex, w); xd->current_qindex = mbmi->current_qindex; @@ -996,37 +975,96 @@ static void pack_inter_mode_mvs(AV1_COMP *cpi, const int mi_row, } } } +} - if (!mbmi->skip_mode) write_is_inter(cm, xd, mbmi->segment_id, w, is_inter); +static void write_intra_prediction_modes(AV1_COMP *cpi, const int mi_row, + const int mi_col, int is_keyframe, + aom_writer *w) { + const AV1_COMMON *const cm = &cpi->common; + MACROBLOCK *const x = &cpi->td.mb; + MACROBLOCKD *const xd = &x->e_mbd; + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + const MB_MODE_INFO *const mbmi = xd->mi[0]; + const PREDICTION_MODE mode = mbmi->mode; + const BLOCK_SIZE bsize = mbmi->sb_type; - if (mbmi->skip_mode) return; + // Y mode. + if (is_keyframe) { + const MB_MODE_INFO *const above_mi = xd->above_mbmi; + const MB_MODE_INFO *const left_mi = xd->left_mbmi; + write_intra_y_mode_kf(ec_ctx, mbmi, above_mi, left_mi, mode, w); + } else { + write_intra_y_mode_nonkf(ec_ctx, bsize, mode, w); + } - if (!is_inter) { - write_intra_mode(ec_ctx, bsize, mode, w); - const int use_angle_delta = av1_use_angle_delta(bsize); + // Y angle delta. + const int use_angle_delta = av1_use_angle_delta(bsize); + if (use_angle_delta && av1_is_directional_mode(mode)) { + write_angle_delta(w, mbmi->angle_delta[PLANE_TYPE_Y], + ec_ctx->angle_delta_cdf[mode - V_PRED]); + } - if (use_angle_delta && av1_is_directional_mode(mode)) { - write_angle_delta(w, mbmi->angle_delta[PLANE_TYPE_Y], - ec_ctx->angle_delta_cdf[mode - V_PRED]); + // UV mode and UV angle delta. + if (!cm->seq_params.monochrome && + is_chroma_reference(mi_row, mi_col, bsize, xd->plane[1].subsampling_x, + xd->plane[1].subsampling_y)) { + const UV_PREDICTION_MODE uv_mode = mbmi->uv_mode; + write_intra_uv_mode(ec_ctx, uv_mode, mode, is_cfl_allowed(xd), w); + if (uv_mode == UV_CFL_PRED) + write_cfl_alphas(ec_ctx, mbmi->cfl_alpha_idx, mbmi->cfl_alpha_signs, w); + if (use_angle_delta && av1_is_directional_mode(get_uv_mode(uv_mode))) { + write_angle_delta(w, mbmi->angle_delta[PLANE_TYPE_UV], + ec_ctx->angle_delta_cdf[uv_mode - V_PRED]); } + } - if (!cm->seq_params.monochrome && - is_chroma_reference(mi_row, mi_col, bsize, xd->plane[1].subsampling_x, - xd->plane[1].subsampling_y)) { - const UV_PREDICTION_MODE uv_mode = mbmi->uv_mode; - write_intra_uv_mode(ec_ctx, uv_mode, mode, is_cfl_allowed(xd), w); - if (uv_mode == UV_CFL_PRED) - write_cfl_alphas(ec_ctx, mbmi->cfl_alpha_idx, mbmi->cfl_alpha_signs, w); - if (use_angle_delta && av1_is_directional_mode(get_uv_mode(uv_mode))) { - write_angle_delta(w, mbmi->angle_delta[PLANE_TYPE_UV], - ec_ctx->angle_delta_cdf[uv_mode - V_PRED]); - } - } + // Palette. + if (av1_allow_palette(cm->allow_screen_content_tools, bsize)) { + write_palette_mode_info(cm, xd, mbmi, mi_row, mi_col, w); + } - if (av1_allow_palette(cm->allow_screen_content_tools, bsize)) - write_palette_mode_info(cm, xd, mbmi, mi_row, mi_col, w); + // Filter intra. + write_filter_intra_mode_info(cm, xd, mbmi, w); +} - write_filter_intra_mode_info(cm, xd, mbmi, w); +static void pack_inter_mode_mvs(AV1_COMP *cpi, const int mi_row, + const int mi_col, aom_writer *w) { + AV1_COMMON *const cm = &cpi->common; + MACROBLOCK *const x = &cpi->td.mb; + MACROBLOCKD *const xd = &x->e_mbd; + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + const struct segmentation *const seg = &cm->seg; + struct segmentation_probs *const segp = &ec_ctx->seg; + const MB_MODE_INFO *const mbmi = xd->mi[0]; + const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext; + const PREDICTION_MODE mode = mbmi->mode; + const int segment_id = mbmi->segment_id; + const BLOCK_SIZE bsize = mbmi->sb_type; + const int allow_hp = cm->allow_high_precision_mv; + const int is_inter = is_inter_block(mbmi); + const int is_compound = has_second_ref(mbmi); + int ref; + + write_inter_segment_id(cpi, w, seg, segp, mi_row, mi_col, 0, 1); + + write_skip_mode(cm, xd, segment_id, mbmi, w); + + assert(IMPLIES(mbmi->skip_mode, mbmi->skip)); + const int skip = + mbmi->skip_mode ? 1 : write_skip(cm, xd, segment_id, mbmi, w); + + write_inter_segment_id(cpi, w, seg, segp, mi_row, mi_col, skip, 0); + + write_cdef(cm, xd, w, skip, mi_col, mi_row); + + write_delta_q_params(cpi, mi_row, mi_col, skip, w); + + if (!mbmi->skip_mode) write_is_inter(cm, xd, mbmi->segment_id, w, is_inter); + + if (mbmi->skip_mode) return; + + if (!is_inter) { + write_intra_prediction_modes(cpi, mi_row, mi_col, 0, w); } else { int16_t mode_ctx; @@ -1172,11 +1210,7 @@ static void write_mb_modes_kf(AV1_COMP *cpi, MACROBLOCKD *xd, FRAME_CONTEXT *ec_ctx = xd->tile_ctx; const struct segmentation *const seg = &cm->seg; struct segmentation_probs *const segp = &ec_ctx->seg; - const MB_MODE_INFO *const above_mi = xd->above_mbmi; - const MB_MODE_INFO *const left_mi = xd->left_mbmi; const MB_MODE_INFO *const mbmi = xd->mi[0]; - const BLOCK_SIZE bsize = mbmi->sb_type; - const PREDICTION_MODE mode = mbmi->mode; if (seg->segid_preskip && seg->update_map) write_segment_id(cpi, mbmi, w, seg, segp, mi_row, mi_col, 0); @@ -1188,69 +1222,14 @@ static void write_mb_modes_kf(AV1_COMP *cpi, MACROBLOCKD *xd, write_cdef(cm, xd, w, skip, mi_col, mi_row); - if (cm->delta_q_present_flag) { - int super_block_upper_left = - ((mi_row & (cm->seq_params.mib_size - 1)) == 0) && - ((mi_col & (cm->seq_params.mib_size - 1)) == 0); - if ((bsize != cm->seq_params.sb_size || skip == 0) && - super_block_upper_left) { - assert(mbmi->current_qindex > 0); - int reduced_delta_qindex = - (mbmi->current_qindex - xd->current_qindex) / cm->delta_q_res; - write_delta_qindex(xd, reduced_delta_qindex, w); - xd->current_qindex = mbmi->current_qindex; - if (cm->delta_lf_present_flag) { - if (cm->delta_lf_multi) { - const int frame_lf_count = - av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2; - for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) { - int reduced_delta_lflevel = - (mbmi->delta_lf[lf_id] - xd->delta_lf[lf_id]) / - cm->delta_lf_res; - write_delta_lflevel(cm, xd, lf_id, reduced_delta_lflevel, w); - xd->delta_lf[lf_id] = mbmi->delta_lf[lf_id]; - } - } else { - int reduced_delta_lflevel = - (mbmi->delta_lf_from_base - xd->delta_lf_from_base) / - cm->delta_lf_res; - write_delta_lflevel(cm, xd, -1, reduced_delta_lflevel, w); - xd->delta_lf_from_base = mbmi->delta_lf_from_base; - } - } - } - } + write_delta_q_params(cpi, mi_row, mi_col, skip, w); if (av1_allow_intrabc(cm)) { write_intrabc_info(xd, mbmi_ext, w); if (is_intrabc_block(mbmi)) return; } - write_intra_mode_kf(ec_ctx, mbmi, above_mi, left_mi, mode, w); - - const int use_angle_delta = av1_use_angle_delta(bsize); - if (use_angle_delta && av1_is_directional_mode(mode)) { - write_angle_delta(w, mbmi->angle_delta[PLANE_TYPE_Y], - ec_ctx->angle_delta_cdf[mode - V_PRED]); - } - - if (!cm->seq_params.monochrome && - is_chroma_reference(mi_row, mi_col, bsize, xd->plane[1].subsampling_x, - xd->plane[1].subsampling_y)) { - const UV_PREDICTION_MODE uv_mode = mbmi->uv_mode; - write_intra_uv_mode(ec_ctx, uv_mode, mode, is_cfl_allowed(xd), w); - if (uv_mode == UV_CFL_PRED) - write_cfl_alphas(ec_ctx, mbmi->cfl_alpha_idx, mbmi->cfl_alpha_signs, w); - if (use_angle_delta && av1_is_directional_mode(get_uv_mode(uv_mode))) { - write_angle_delta(w, mbmi->angle_delta[PLANE_TYPE_UV], - ec_ctx->angle_delta_cdf[uv_mode - V_PRED]); - } - } - - if (av1_allow_palette(cm->allow_screen_content_tools, bsize)) - write_palette_mode_info(cm, xd, mbmi, mi_row, mi_col, w); - - write_filter_intra_mode_info(cm, xd, mbmi, w); + write_intra_prediction_modes(cpi, mi_row, mi_col, 1, w); } #if CONFIG_RD_DEBUG @@ -1549,10 +1528,10 @@ static void write_modes_b(AV1_COMP *cpi, const TileInfo *const tile, write_tx_size_vartx(xd, mbmi, max_tx_size, 0, idy, idx, w); } else { write_selected_tx_size(xd, w); - set_txfm_ctxs(mbmi->tx_size, xd->n8_w, xd->n8_h, 0, xd); + set_txfm_ctxs(mbmi->tx_size, xd->n4_w, xd->n4_h, 0, xd); } } else { - set_txfm_ctxs(mbmi->tx_size, xd->n8_w, xd->n8_h, + set_txfm_ctxs(mbmi->tx_size, xd->n4_w, xd->n4_h, skip && is_inter_block(mbmi), xd); } @@ -1694,15 +1673,14 @@ static void write_modes_sb(AV1_COMP *const cpi, const TileInfo *const tile, } static void write_modes(AV1_COMP *const cpi, const TileInfo *const tile, - aom_writer *const w, const TOKENEXTRA **tok, - const TOKENEXTRA *const tok_end) { + aom_writer *const w, int tile_row, int tile_col) { AV1_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; const int mi_row_start = tile->mi_row_start; const int mi_row_end = tile->mi_row_end; const int mi_col_start = tile->mi_col_start; const int mi_col_end = tile->mi_col_end; - int mi_row, mi_col; + int mi_row, mi_col, sb_row_in_tile; av1_zero_above_context(cm, xd, mi_col_start, mi_col_end, tile->tile_row); av1_init_above_context(cm, xd, tile->tile_row); @@ -1716,13 +1694,21 @@ static void write_modes(AV1_COMP *const cpi, const TileInfo *const tile, for (mi_row = mi_row_start; mi_row < mi_row_end; mi_row += cm->seq_params.mib_size) { + sb_row_in_tile = + (mi_row - tile->mi_row_start) >> cm->seq_params.mib_size_log2; + const TOKENEXTRA *tok = + cpi->tplist[tile_row][tile_col][sb_row_in_tile].start; + const TOKENEXTRA *tok_end = + tok + cpi->tplist[tile_row][tile_col][sb_row_in_tile].count; + av1_zero_left_context(xd); for (mi_col = mi_col_start; mi_col < mi_col_end; mi_col += cm->seq_params.mib_size) { - write_modes_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col, + write_modes_sb(cpi, tile, w, &tok, tok_end, mi_row, mi_col, cm->seq_params.sb_size); } + assert(tok == cpi->tplist[tile_row][tile_col][sb_row_in_tile].stop); } } @@ -2220,33 +2206,12 @@ static void write_ext_tile_info(const AV1_COMMON *const cm, } } -#if USE_GF16_MULTI_LAYER -static int get_refresh_mask_gf16(AV1_COMP *cpi) { - if (cpi->common.frame_type == KEY_FRAME || frame_is_sframe(&cpi->common)) - return 0xFF; - - int refresh_mask = 0; - - if (cpi->refresh_last_frame || cpi->refresh_golden_frame || - cpi->refresh_bwd_ref_frame || cpi->refresh_alt2_ref_frame || - cpi->refresh_alt_ref_frame) { - assert(cpi->refresh_fb_idx >= 0 && cpi->refresh_fb_idx < REF_FRAMES); - refresh_mask |= (1 << cpi->refresh_fb_idx); - } - - return refresh_mask; -} -#endif // USE_GF16_MULTI_LAYER - static int get_refresh_mask(AV1_COMP *cpi) { if ((cpi->common.frame_type == KEY_FRAME && cpi->common.show_frame) || frame_is_sframe(&cpi->common)) return 0xFF; int refresh_mask = 0; -#if USE_GF16_MULTI_LAYER - if (cpi->rc.baseline_gf_interval == 16) return get_refresh_mask_gf16(cpi); -#endif // USE_GF16_MULTI_LAYER // NOTE(zoeliu): When LAST_FRAME is to get refreshed, the decoder will be // notified to get LAST3_FRAME refreshed and then the virtual indexes for all @@ -2281,8 +2246,13 @@ static int get_refresh_mask(AV1_COMP *cpi) { // Note: This is highly specific to the use of ARF as a forward reference, // and this needs to be generalized as other uses are implemented // (like RTC/temporal scalability). - return refresh_mask | - (cpi->refresh_golden_frame << cpi->ref_fb_idx[ALTREF_FRAME - 1]); + + if (cpi->preserve_arf_as_gld) { + return refresh_mask; + } else { + return refresh_mask | + (cpi->refresh_golden_frame << cpi->ref_fb_idx[ALTREF_FRAME - 1]); + } } else { const int arf_idx = cpi->ref_fb_idx[ALTREF_FRAME - 1]; return refresh_mask | @@ -2574,9 +2544,9 @@ static void write_film_grain_params(AV1_COMP *cpi, aom_wb_write_literal(wb, pars->random_seed, 16); - pars->random_seed += 3245; // For film grain test vectors purposes + pars->random_seed += 3381; // Changing random seed for film grain if (!pars->random_seed) // Random seed should not be zero - pars->random_seed += 1735; + pars->random_seed += 7391; if (cm->frame_type == INTER_FRAME) aom_wb_write_bit(wb, pars->update_parameters); else @@ -2685,7 +2655,8 @@ static void write_sb_size(SequenceHeader *seq_params, aom_wb_write_bit(wb, seq_params->sb_size == BLOCK_128X128 ? 1 : 0); } -void write_sequence_header(AV1_COMP *cpi, struct aom_write_bit_buffer *wb) { +static void write_sequence_header(AV1_COMP *cpi, + struct aom_write_bit_buffer *wb) { AV1_COMMON *const cm = &cpi->common; SequenceHeader *seq_params = &cm->seq_params; @@ -2695,8 +2666,10 @@ void write_sequence_header(AV1_COMP *cpi, struct aom_write_bit_buffer *wb) { int max_frame_height = cpi->oxcf.forced_max_frame_height ? cpi->oxcf.forced_max_frame_height : cpi->oxcf.height; + // max((int)ceil(log2(max_frame_width)), 1) const int num_bits_width = (max_frame_width > 1) ? get_msb(max_frame_width - 1) + 1 : 1; + // max((int)ceil(log2(max_frame_height)), 1) const int num_bits_height = (max_frame_height > 1) ? get_msb(max_frame_height - 1) + 1 : 1; assert(num_bits_width <= 16); @@ -2954,7 +2927,7 @@ static void write_uncompressed_header_obu(AV1_COMP *cpi, assert(cm->frame_type == KEY_FRAME); } if (!seq_params->reduced_still_picture_hdr) { - if (cm->show_existing_frame) { + if (encode_show_existing_frame(cm)) { RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs; const int frame_to_show = cm->ref_frame_map[cpi->existing_fb_idx_to_show]; @@ -3254,14 +3227,14 @@ static void write_uncompressed_header_obu(AV1_COMP *cpi, if (cm->base_qindex > 0) { aom_wb_write_bit(wb, cm->delta_q_present_flag); if (cm->delta_q_present_flag) { - aom_wb_write_literal(wb, OD_ILOG_NZ(cm->delta_q_res) - 1, 2); + aom_wb_write_literal(wb, get_msb(cm->delta_q_res), 2); xd->current_qindex = cm->base_qindex; if (cm->allow_intrabc) assert(cm->delta_lf_present_flag == 0); else aom_wb_write_bit(wb, cm->delta_lf_present_flag); if (cm->delta_lf_present_flag) { - aom_wb_write_literal(wb, OD_ILOG_NZ(cm->delta_lf_res) - 1, 2); + aom_wb_write_literal(wb, get_msb(cm->delta_lf_res), 2); aom_wb_write_bit(wb, cm->delta_lf_multi); av1_reset_loop_filter_delta(xd, av1_num_planes(cm)); } @@ -3508,7 +3481,7 @@ static void write_bitstream_level(BitstreamLevel bl, aom_wb_write_literal(wb, seq_level_idx, LEVEL_BITS); } -static uint32_t write_sequence_header_obu(AV1_COMP *cpi, uint8_t *const dst) { +uint32_t write_sequence_header_obu(AV1_COMP *cpi, uint8_t *const dst) { AV1_COMMON *const cm = &cpi->common; struct aom_write_bit_buffer wb = { dst, 0 }; uint32_t size = 0; @@ -3619,7 +3592,6 @@ static uint32_t write_tiles_in_tg_obus(AV1_COMP *const cpi, uint8_t *const dst, AV1_COMMON *const cm = &cpi->common; aom_writer mode_bc; int tile_row, tile_col; - TOKENEXTRA *(*const tok_buffers)[MAX_TILE_COLS] = cpi->tile_tok; TileBufferEnc(*const tile_buffers)[MAX_TILE_COLS] = cpi->tile_buffers; uint32_t total_size = 0; const int tile_cols = cm->tile_cols; @@ -3684,8 +3656,6 @@ static uint32_t write_tiles_in_tg_obus(AV1_COMP *const cpi, uint8_t *const dst, for (tile_row = 0; tile_row < tile_rows; tile_row++) { TileBufferEnc *const buf = &tile_buffers[tile_row][tile_col]; - const TOKENEXTRA *tok = tok_buffers[tile_row][tile_col]; - const TOKENEXTRA *tok_end = tok + cpi->tok_count[tile_row][tile_col]; const int data_offset = have_tiles ? 4 : 0; const int tile_idx = tile_row * tile_cols + tile_col; TileDataEnc *this_tile = &cpi->tile_data[tile_idx]; @@ -3703,8 +3673,7 @@ static uint32_t write_tiles_in_tg_obus(AV1_COMP *const cpi, uint8_t *const dst, mode_bc.allow_update_cdf = mode_bc.allow_update_cdf && !cm->disable_cdf_update; aom_start_encode(&mode_bc, buf->data + data_offset); - write_modes(cpi, &tile_info, &mode_bc, &tok, tok_end); - assert(tok == tok_end); + write_modes(cpi, &tile_info, &mode_bc, tile_row, tile_col); aom_stop_encode(&mode_bc); tile_size = mode_bc.pos; buf->size = tile_size; @@ -3794,8 +3763,6 @@ static uint32_t write_tiles_in_tg_obus(AV1_COMP *const cpi, uint8_t *const dst, const int tile_idx = tile_row * tile_cols + tile_col; TileBufferEnc *const buf = &tile_buffers[tile_row][tile_col]; TileDataEnc *this_tile = &cpi->tile_data[tile_idx]; - const TOKENEXTRA *tok = tok_buffers[tile_row][tile_col]; - const TOKENEXTRA *tok_end = tok + cpi->tok_count[tile_row][tile_col]; int is_last_tile_in_tg = 0; if (new_tg) { @@ -3847,7 +3814,7 @@ static uint32_t write_tiles_in_tg_obus(AV1_COMP *const cpi, uint8_t *const dst, av1_reset_loop_restoration(&cpi->td.mb.e_mbd, num_planes); aom_start_encode(&mode_bc, dst + total_size); - write_modes(cpi, &tile_info, &mode_bc, &tok, tok_end); + write_modes(cpi, &tile_info, &mode_bc, tile_row, tile_col); aom_stop_encode(&mode_bc); tile_size = mode_bc.pos; assert(tile_size >= AV1_MIN_TILE_SIZE_BYTES); @@ -3990,7 +3957,8 @@ int av1_pack_bitstream(AV1_COMP *const cpi, uint8_t *dst, size_t *size) { data += obu_header_size + obu_payload_size + length_field_size; } - const int write_frame_header = (cm->num_tg > 1 || cm->show_existing_frame); + const int write_frame_header = + (cm->num_tg > 1 || encode_show_existing_frame(cm)); struct aom_write_bit_buffer saved_wb; if (write_frame_header) { // Write Frame Header OBU. @@ -4017,7 +3985,7 @@ int av1_pack_bitstream(AV1_COMP *const cpi, uint8_t *dst, size_t *size) { saved_wb.bit_buffer += length_field_size; } - if (cm->show_existing_frame) { + if (encode_show_existing_frame(cm)) { data_size = 0; } else { // Each tile group obu will be preceded by 4-byte size of the tile group diff --git a/third_party/aom/av1/encoder/bitstream.h b/third_party/aom/av1/encoder/bitstream.h index 2047b6833..465ccaed5 100644 --- a/third_party/aom/av1/encoder/bitstream.h +++ b/third_party/aom/av1/encoder/bitstream.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_ENCODER_BITSTREAM_H_ -#define AV1_ENCODER_BITSTREAM_H_ +#ifndef AOM_AV1_ENCODER_BITSTREAM_H_ +#define AOM_AV1_ENCODER_BITSTREAM_H_ #ifdef __cplusplus extern "C" { @@ -20,8 +20,13 @@ extern "C" { struct aom_write_bit_buffer; -void write_sequence_header(AV1_COMP *cpi, struct aom_write_bit_buffer *wb); +// Writes only the OBU Sequence Header payload, and returns the size of the +// payload written to 'dst'. This function does not write the OBU header, the +// optional extension, or the OBU size to 'dst'. +uint32_t write_sequence_header_obu(AV1_COMP *cpi, uint8_t *const dst); +// Writes the OBU header byte, and the OBU header extension byte when +// 'obu_extension' is non-zero. Returns number of bytes written to 'dst'. uint32_t write_obu_header(OBU_TYPE obu_type, int obu_extension, uint8_t *const dst); @@ -32,8 +37,7 @@ int av1_pack_bitstream(AV1_COMP *const cpi, uint8_t *dest, size_t *size); static INLINE int av1_preserve_existing_gf(AV1_COMP *cpi) { // Do not swap gf and arf indices for internal overlay frames - return !cpi->multi_arf_allowed && cpi->rc.is_src_frame_alt_ref && - !cpi->rc.is_src_frame_ext_arf; + return cpi->rc.is_src_frame_alt_ref && !cpi->rc.is_src_frame_ext_arf; } void av1_write_tx_type(const AV1_COMMON *const cm, const MACROBLOCKD *xd, @@ -44,4 +48,4 @@ void av1_write_tx_type(const AV1_COMMON *const cm, const MACROBLOCKD *xd, } // extern "C" #endif -#endif // AV1_ENCODER_BITSTREAM_H_ +#endif // AOM_AV1_ENCODER_BITSTREAM_H_ diff --git a/third_party/aom/av1/encoder/block.h b/third_party/aom/av1/encoder/block.h index 003e59e39..0bc5dea82 100644 --- a/third_party/aom/av1/encoder/block.h +++ b/third_party/aom/av1/encoder/block.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_ENCODER_BLOCK_H_ -#define AV1_ENCODER_BLOCK_H_ +#ifndef AOM_AV1_ENCODER_BLOCK_H_ +#define AOM_AV1_ENCODER_BLOCK_H_ #include "av1/common/entropymv.h" #include "av1/common/entropy.h" @@ -170,6 +170,7 @@ typedef struct { InterpFilters filters; int_mv mv[2]; int8_t ref_frames[2]; + COMPOUND_TYPE comp_type; } INTERPOLATION_FILTER_STATS; typedef struct macroblock MACROBLOCK; @@ -254,6 +255,19 @@ struct macroblock { PALETTE_BUFFER *palette_buffer; + CONV_BUF_TYPE *tmp_conv_dst; + uint8_t *tmp_obmc_bufs[2]; + + // buffer for hash value calculation of a block + // used only in av1_get_block_hash_value() + // [first hash/second hash] + // [two buffers used ping-pong] + uint32_t *hash_value_buffer[2][2]; + + CRC_CALCULATOR crc_calculator1; + CRC_CALCULATOR crc_calculator2; + int g_crc_initialized; + // These define limits to motion vector components to prevent them // from extending outside the UMV borders MvLimits mv_limits; @@ -344,7 +358,6 @@ struct macroblock { #if CONFIG_DIST_8X8 int using_dist_8x8; aom_tune_metric tune_metric; - DECLARE_ALIGNED(16, int16_t, pred_luma[MAX_SB_SQUARE]); #endif // CONFIG_DIST_8X8 int comp_idx_cost[COMP_INDEX_CONTEXTS][2]; int comp_group_idx_cost[COMP_GROUP_IDX_CONTEXTS][2]; @@ -352,6 +365,8 @@ struct macroblock { int tx_search_prune[EXT_TX_SET_TYPES]; int must_find_valid_partition; int tx_split_prune_flag; // Flag to skip tx split RD search. + int recalc_luma_mc_data; // Flag to indicate recalculation of MC data during + // interpolation filter search }; static INLINE int is_rect_tx_allowed_bsize(BLOCK_SIZE bsize) { @@ -400,8 +415,38 @@ static INLINE int tx_size_to_depth(TX_SIZE tx_size, BLOCK_SIZE bsize) { return depth; } +static INLINE void set_blk_skip(MACROBLOCK *x, int plane, int blk_idx, + int skip) { + if (skip) + x->blk_skip[blk_idx] |= 1UL << plane; + else + x->blk_skip[blk_idx] &= ~(1UL << plane); +#ifndef NDEBUG + // Set chroma planes to uninitialized states when luma is set to check if + // it will be set later + if (plane == 0) { + x->blk_skip[blk_idx] |= 1UL << (1 + 4); + x->blk_skip[blk_idx] |= 1UL << (2 + 4); + } + + // Clear the initialization checking bit + x->blk_skip[blk_idx] &= ~(1UL << (plane + 4)); +#endif +} + +static INLINE int is_blk_skip(MACROBLOCK *x, int plane, int blk_idx) { +#ifndef NDEBUG + // Check if this is initialized + assert(!(x->blk_skip[blk_idx] & (1UL << (plane + 4)))); + + // The magic number is 0x77, this is to test if there is garbage data + assert((x->blk_skip[blk_idx] & 0x88) == 0); +#endif + return (x->blk_skip[blk_idx] >> plane) & 1; +} + #ifdef __cplusplus } // extern "C" #endif -#endif // AV1_ENCODER_BLOCK_H_ +#endif // AOM_AV1_ENCODER_BLOCK_H_ diff --git a/third_party/aom/av1/encoder/blockiness.c b/third_party/aom/av1/encoder/blockiness.c index 66dedd9ed..f7cff9e53 100644 --- a/third_party/aom/av1/encoder/blockiness.c +++ b/third_party/aom/av1/encoder/blockiness.c @@ -16,7 +16,6 @@ #include "av1/common/common.h" #include "av1/common/filter.h" #include "aom/aom_integer.h" -#include "aom_dsp/aom_convolve.h" #include "aom_dsp/aom_filter.h" #include "aom_ports/mem.h" #include "aom_ports/system_state.h" diff --git a/third_party/aom/av1/encoder/context_tree.c b/third_party/aom/av1/encoder/context_tree.c index d6e556b93..57f59f304 100644 --- a/third_party/aom/av1/encoder/context_tree.c +++ b/third_party/aom/av1/encoder/context_tree.c @@ -175,14 +175,15 @@ void av1_setup_pc_tree(AV1_COMMON *cm, ThreadData *td) { } void av1_free_pc_tree(ThreadData *td, const int num_planes) { - const int tree_nodes_inc = 1024; - - const int tree_nodes = tree_nodes_inc + 256 + 64 + 16 + 4 + 1; - int i; - for (i = 0; i < tree_nodes; ++i) - free_tree_contexts(&td->pc_tree[i], num_planes); - aom_free(td->pc_tree); - td->pc_tree = NULL; + if (td->pc_tree != NULL) { + const int tree_nodes_inc = 1024; + const int tree_nodes = tree_nodes_inc + 256 + 64 + 16 + 4 + 1; + for (int i = 0; i < tree_nodes; ++i) { + free_tree_contexts(&td->pc_tree[i], num_planes); + } + aom_free(td->pc_tree); + td->pc_tree = NULL; + } } void av1_copy_tree_context(PICK_MODE_CONTEXT *dst_ctx, diff --git a/third_party/aom/av1/encoder/context_tree.h b/third_party/aom/av1/encoder/context_tree.h index c05f48a7a..4efc34985 100644 --- a/third_party/aom/av1/encoder/context_tree.h +++ b/third_party/aom/av1/encoder/context_tree.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_ENCODER_CONTEXT_TREE_H_ -#define AV1_ENCODER_CONTEXT_TREE_H_ +#ifndef AOM_AV1_ENCODER_CONTEXT_TREE_H_ +#define AOM_AV1_ENCODER_CONTEXT_TREE_H_ #include "av1/common/blockd.h" #include "av1/encoder/block.h" @@ -56,6 +56,8 @@ typedef struct { int hybrid_pred_diff; int comp_pred_diff; int single_pred_diff; + // Skip certain ref frames during RD search of rectangular partitions. + int skip_ref_frame_mask; // TODO(jingning) Use RD_COST struct here instead. This involves a boarder // scope of refactoring. @@ -109,4 +111,4 @@ void av1_copy_tree_context(PICK_MODE_CONTEXT *dst_ctx, } // extern "C" #endif -#endif /* AV1_ENCODER_CONTEXT_TREE_H_ */ +#endif // AOM_AV1_ENCODER_CONTEXT_TREE_H_ diff --git a/third_party/aom/av1/encoder/corner_detect.h b/third_party/aom/av1/encoder/corner_detect.h index 0317db5b3..cab59a774 100644 --- a/third_party/aom/av1/encoder/corner_detect.h +++ b/third_party/aom/av1/encoder/corner_detect.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_ENCODER_CORNER_DETECT_H_ -#define AV1_ENCODER_CORNER_DETECT_H_ +#ifndef AOM_AV1_ENCODER_CORNER_DETECT_H_ +#define AOM_AV1_ENCODER_CORNER_DETECT_H_ #include <stdio.h> #include <stdlib.h> @@ -19,4 +19,4 @@ int fast_corner_detect(unsigned char *buf, int width, int height, int stride, int *points, int max_points); -#endif // AV1_ENCODER_CORNER_DETECT_H_ +#endif // AOM_AV1_ENCODER_CORNER_DETECT_H_ diff --git a/third_party/aom/av1/encoder/corner_match.h b/third_party/aom/av1/encoder/corner_match.h index 3b16f9efc..535d2faed 100644 --- a/third_party/aom/av1/encoder/corner_match.h +++ b/third_party/aom/av1/encoder/corner_match.h @@ -8,8 +8,8 @@ * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_ENCODER_CORNER_MATCH_H_ -#define AV1_ENCODER_CORNER_MATCH_H_ +#ifndef AOM_AV1_ENCODER_CORNER_MATCH_H_ +#define AOM_AV1_ENCODER_CORNER_MATCH_H_ #include <stdio.h> #include <stdlib.h> @@ -30,4 +30,4 @@ int determine_correspondence(unsigned char *frm, int *frm_corners, int height, int frm_stride, int ref_stride, int *correspondence_pts); -#endif // AV1_ENCODER_CORNER_MATCH_H_ +#endif // AOM_AV1_ENCODER_CORNER_MATCH_H_ diff --git a/third_party/aom/av1/encoder/cost.h b/third_party/aom/av1/encoder/cost.h index 5de7765c5..af5b09837 100644 --- a/third_party/aom/av1/encoder/cost.h +++ b/third_party/aom/av1/encoder/cost.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_ENCODER_COST_H_ -#define AV1_ENCODER_COST_H_ +#ifndef AOM_AV1_ENCODER_COST_H_ +#define AOM_AV1_ENCODER_COST_H_ #include "aom_dsp/prob.h" #include "aom/aom_integer.h" @@ -44,4 +44,4 @@ void av1_cost_tokens_from_cdf(int *costs, const aom_cdf_prob *cdf, } // extern "C" #endif -#endif // AV1_ENCODER_COST_H_ +#endif // AOM_AV1_ENCODER_COST_H_ diff --git a/third_party/aom/av1/encoder/dwt.h b/third_party/aom/av1/encoder/dwt.h index 03318e5b7..37306c6a5 100644 --- a/third_party/aom/av1/encoder/dwt.h +++ b/third_party/aom/av1/encoder/dwt.h @@ -9,6 +9,9 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ +#ifndef AOM_AV1_ENCODER_DWT_H_ +#define AOM_AV1_ENCODER_DWT_H_ + #include "av1/common/common.h" #include "av1/common/enums.h" @@ -18,3 +21,5 @@ void av1_fdwt8x8(tran_low_t *input, tran_low_t *output, int stride); void av1_fdwt8x8_uint8_input_c(uint8_t *input, tran_low_t *output, int stride, int hbd); int av1_haar_ac_sad_8x8_uint8_input(uint8_t *input, int stride, int hbd); + +#endif // AOM_AV1_ENCODER_DWT_H_ diff --git a/third_party/aom/av1/encoder/encodeframe.c b/third_party/aom/av1/encoder/encodeframe.c index 27ca53761..cb226c59e 100644 --- a/third_party/aom/av1/encoder/encodeframe.c +++ b/third_party/aom/av1/encoder/encodeframe.c @@ -40,11 +40,11 @@ #include "av1/common/reconinter.h" #include "av1/common/seg_common.h" #include "av1/common/tile_common.h" +#include "av1/common/warped_motion.h" #include "av1/encoder/aq_complexity.h" #include "av1/encoder/aq_cyclicrefresh.h" #include "av1/encoder/aq_variance.h" -#include "av1/common/warped_motion.h" #include "av1/encoder/global_motion.h" #include "av1/encoder/encodeframe.h" #include "av1/encoder/encodemb.h" @@ -56,6 +56,7 @@ #include "av1/encoder/partition_model_weights.h" #include "av1/encoder/rd.h" #include "av1/encoder/rdopt.h" +#include "av1/encoder/reconinter_enc.h" #include "av1/encoder/segmentation.h" #include "av1/encoder/tokenize.h" @@ -348,8 +349,9 @@ static void reset_tx_size(MACROBLOCK *x, MB_MODE_INFO *mbmi, x->skip = 0; } -static void update_state(const AV1_COMP *const cpi, TileDataEnc *tile_data, - ThreadData *td, PICK_MODE_CONTEXT *ctx, int mi_row, +static void update_state(const AV1_COMP *const cpi, + const TileDataEnc *const tile_data, ThreadData *td, + const PICK_MODE_CONTEXT *const ctx, int mi_row, int mi_col, BLOCK_SIZE bsize, RUN_TYPE dry_run) { int i, x_idx, y; const AV1_COMMON *const cm = &cpi->common; @@ -359,7 +361,7 @@ static void update_state(const AV1_COMP *const cpi, TileDataEnc *tile_data, MACROBLOCKD *const xd = &x->e_mbd; struct macroblock_plane *const p = x->plane; struct macroblockd_plane *const pd = xd->plane; - MB_MODE_INFO *mi = &ctx->mic; + const MB_MODE_INFO *const mi = &ctx->mic; MB_MODE_INFO *const mi_addr = xd->mi[0]; const struct segmentation *const seg = &cm->seg; const int bw = mi_size_wide[mi->sb_type]; @@ -505,12 +507,12 @@ static int set_deltaq_rdmult(const AV1_COMP *const cpi, MACROBLOCKD *const xd) { cpi, cm->base_qindex + xd->delta_qindex + cm->y_dc_delta_q); } -static void rd_pick_sb_modes(const AV1_COMP *const cpi, TileDataEnc *tile_data, +static void rd_pick_sb_modes(AV1_COMP *const cpi, TileDataEnc *tile_data, MACROBLOCK *const x, int mi_row, int mi_col, RD_STATS *rd_cost, PARTITION_TYPE partition, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, int64_t best_rd) { - const AV1_COMMON *const cm = &cpi->common; + AV1_COMMON *const cm = &cpi->common; const int num_planes = av1_num_planes(cm); TileInfo *const tile_info = &tile_data->tile_info; MACROBLOCKD *const xd = &x->e_mbd; @@ -522,6 +524,13 @@ static void rd_pick_sb_modes(const AV1_COMP *const cpi, TileDataEnc *tile_data, const DELTAQ_MODE deltaq_mode = cpi->oxcf.deltaq_mode; int i, orig_rdmult; + if (best_rd < 0) { + ctx->rdcost = INT64_MAX; + ctx->skip = 0; + av1_invalid_rd_stats(rd_cost); + return; + } + aom_clear_system_state(); set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize); @@ -588,9 +597,10 @@ static void rd_pick_sb_modes(const AV1_COMP *const cpi, TileDataEnc *tile_data, if (aq_mode == VARIANCE_AQ) { if (cpi->vaq_refresh) { - const int energy = - bsize <= BLOCK_16X16 ? x->mb_energy : av1_block_energy(cpi, x, bsize); - mbmi->segment_id = av1_vaq_segment_id(energy); + const int energy = bsize <= BLOCK_16X16 + ? x->mb_energy + : av1_log_block_var(cpi, x, bsize); + mbmi->segment_id = energy; } x->rdmult = set_segment_rdmult(cpi, x, mbmi->segment_id); } else if (aq_mode == COMPLEXITY_AQ) { @@ -1407,8 +1417,8 @@ static void save_context(const MACROBLOCK *x, RD_SEARCH_MACROBLOCK_CONTEXT *ctx, static void encode_b(const AV1_COMP *const cpi, TileDataEnc *tile_data, ThreadData *td, TOKENEXTRA **tp, int mi_row, int mi_col, RUN_TYPE dry_run, BLOCK_SIZE bsize, - PARTITION_TYPE partition, PICK_MODE_CONTEXT *ctx, - int *rate) { + PARTITION_TYPE partition, + const PICK_MODE_CONTEXT *const ctx, int *rate) { TileInfo *const tile = &tile_data->tile_info; MACROBLOCK *const x = &td->mb; MACROBLOCKD *xd = &x->e_mbd; @@ -1691,7 +1701,7 @@ static void rd_use_partition(AV1_COMP *cpi, ThreadData *td, if (bsize == BLOCK_16X16 && cpi->vaq_refresh) { set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize); - x->mb_energy = av1_block_energy(cpi, x, bsize); + x->mb_energy = av1_log_block_var(cpi, x, bsize); } if (do_partition_search && @@ -1728,7 +1738,20 @@ static void rd_use_partition(AV1_COMP *cpi, ThreadData *td, pc_tree->partitioning = partition; } } - + for (int b = 0; b < 2; ++b) { + pc_tree->horizontal[b].skip_ref_frame_mask = 0; + pc_tree->vertical[b].skip_ref_frame_mask = 0; + } + for (int b = 0; b < 3; ++b) { + pc_tree->horizontala[b].skip_ref_frame_mask = 0; + pc_tree->horizontalb[b].skip_ref_frame_mask = 0; + pc_tree->verticala[b].skip_ref_frame_mask = 0; + pc_tree->verticalb[b].skip_ref_frame_mask = 0; + } + for (int b = 0; b < 4; ++b) { + pc_tree->horizontal4[b].skip_ref_frame_mask = 0; + pc_tree->vertical4[b].skip_ref_frame_mask = 0; + } switch (partition) { case PARTITION_NONE: rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc, @@ -1741,7 +1764,7 @@ static void rd_use_partition(AV1_COMP *cpi, ThreadData *td, if (last_part_rdc.rate != INT_MAX && bsize >= BLOCK_8X8 && mi_row + hbs < cm->mi_rows) { RD_STATS tmp_rdc; - PICK_MODE_CONTEXT *ctx_h = &pc_tree->horizontal[0]; + const PICK_MODE_CONTEXT *const ctx_h = &pc_tree->horizontal[0]; av1_init_rd_stats(&tmp_rdc); update_state(cpi, tile_data, td, ctx_h, mi_row, mi_col, subsize, 1); encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, mi_row, @@ -1765,7 +1788,7 @@ static void rd_use_partition(AV1_COMP *cpi, ThreadData *td, if (last_part_rdc.rate != INT_MAX && bsize >= BLOCK_8X8 && mi_col + hbs < cm->mi_cols) { RD_STATS tmp_rdc; - PICK_MODE_CONTEXT *ctx_v = &pc_tree->vertical[0]; + const PICK_MODE_CONTEXT *const ctx_v = &pc_tree->vertical[0]; av1_init_rd_stats(&tmp_rdc); update_state(cpi, tile_data, td, ctx_v, mi_row, mi_col, subsize, 1); encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, mi_row, @@ -1812,7 +1835,8 @@ static void rd_use_partition(AV1_COMP *cpi, ThreadData *td, case PARTITION_HORZ_A: case PARTITION_HORZ_B: case PARTITION_HORZ_4: - case PARTITION_VERT_4: assert(0 && "Cannot handle extended partiton types"); + case PARTITION_VERT_4: + assert(0 && "Cannot handle extended partition types"); default: assert(0); break; } @@ -2164,7 +2188,8 @@ static INLINE void store_pred_mv(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) { memcpy(ctx->pred_mv, x->pred_mv, sizeof(x->pred_mv)); } -static INLINE void load_pred_mv(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) { +static INLINE void load_pred_mv(MACROBLOCK *x, + const PICK_MODE_CONTEXT *const ctx) { memcpy(x->pred_mv, ctx->pred_mv, sizeof(x->pred_mv)); } @@ -2221,12 +2246,11 @@ static INLINE int get_motion_inconsistency(MOTION_DIRECTION this_mv, // Try searching for an encoding for the given subblock. Returns zero if the // rdcost is already too high (to tell the caller not to bother searching for // encodings of further subblocks) -static int rd_try_subblock(const AV1_COMP *const cpi, ThreadData *td, - TileDataEnc *tile_data, TOKENEXTRA **tp, - int is_first, int is_last, int mi_row, int mi_col, - BLOCK_SIZE subsize, RD_STATS *best_rdc, - RD_STATS *sum_rdc, RD_STATS *this_rdc, - PARTITION_TYPE partition, +static int rd_try_subblock(AV1_COMP *const cpi, ThreadData *td, + TileDataEnc *tile_data, TOKENEXTRA **tp, int is_last, + int mi_row, int mi_col, BLOCK_SIZE subsize, + RD_STATS *best_rdc, RD_STATS *sum_rdc, + RD_STATS *this_rdc, PARTITION_TYPE partition, PICK_MODE_CONTEXT *prev_ctx, PICK_MODE_CONTEXT *this_ctx) { #define RTS_X_RATE_NOCOEF_ARG @@ -2236,25 +2260,20 @@ static int rd_try_subblock(const AV1_COMP *const cpi, ThreadData *td, if (cpi->sf.adaptive_motion_search) load_pred_mv(x, prev_ctx); - // On the first time around, write the rd stats straight to sum_rdc. Also, we - // should treat sum_rdc as containing zeros (even if it doesn't) to avoid - // having to zero it at the start. - if (is_first) this_rdc = sum_rdc; - const int64_t spent_rdcost = is_first ? 0 : sum_rdc->rdcost; - const int64_t rdcost_remaining = best_rdc->rdcost - spent_rdcost; + const int64_t rdcost_remaining = best_rdc->rdcost == INT64_MAX + ? INT64_MAX + : (best_rdc->rdcost - sum_rdc->rdcost); rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, this_rdc, RTS_X_RATE_NOCOEF_ARG partition, subsize, this_ctx, rdcost_remaining); - if (!is_first) { - if (this_rdc->rate == INT_MAX) { - sum_rdc->rdcost = INT64_MAX; - } else { - sum_rdc->rate += this_rdc->rate; - sum_rdc->dist += this_rdc->dist; - sum_rdc->rdcost += this_rdc->rdcost; - } + if (this_rdc->rate == INT_MAX) { + sum_rdc->rdcost = INT64_MAX; + } else { + sum_rdc->rate += this_rdc->rate; + sum_rdc->dist += this_rdc->dist; + sum_rdc->rdcost += this_rdc->rdcost; } if (sum_rdc->rdcost >= RTS_MAX_RDCOST) return 0; @@ -2271,7 +2290,7 @@ static int rd_try_subblock(const AV1_COMP *const cpi, ThreadData *td, #undef RTS_MAX_RDCOST } -static void rd_test_partition3(const AV1_COMP *const cpi, ThreadData *td, +static void rd_test_partition3(AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data, TOKENEXTRA **tp, PC_TREE *pc_tree, RD_STATS *best_rdc, PICK_MODE_CONTEXT ctxs[3], @@ -2284,13 +2303,16 @@ static void rd_test_partition3(const AV1_COMP *const cpi, ThreadData *td, MACROBLOCKD *const xd = &x->e_mbd; RD_STATS sum_rdc, this_rdc; #define RTP_STX_TRY_ARGS - - if (!rd_try_subblock(cpi, td, tile_data, tp, 1, 0, mi_row0, mi_col0, subsize0, + int pl = partition_plane_context(xd, mi_row, mi_col, bsize); + av1_init_rd_stats(&sum_rdc); + sum_rdc.rate = x->partition_cost[pl][partition]; + sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, 0); + if (!rd_try_subblock(cpi, td, tile_data, tp, 0, mi_row0, mi_col0, subsize0, best_rdc, &sum_rdc, &this_rdc, RTP_STX_TRY_ARGS partition, ctx, &ctxs[0])) return; - if (!rd_try_subblock(cpi, td, tile_data, tp, 0, 0, mi_row1, mi_col1, subsize1, + if (!rd_try_subblock(cpi, td, tile_data, tp, 0, mi_row1, mi_col1, subsize1, best_rdc, &sum_rdc, &this_rdc, RTP_STX_TRY_ARGS partition, &ctxs[0], &ctxs[1])) return; @@ -2302,15 +2324,13 @@ static void rd_test_partition3(const AV1_COMP *const cpi, ThreadData *td, // difference (obviously) doesn't contribute to the error. const int try_block2 = 1; if (try_block2 && - !rd_try_subblock(cpi, td, tile_data, tp, 0, 1, mi_row2, mi_col2, subsize2, + !rd_try_subblock(cpi, td, tile_data, tp, 1, mi_row2, mi_col2, subsize2, best_rdc, &sum_rdc, &this_rdc, RTP_STX_TRY_ARGS partition, &ctxs[1], &ctxs[2])) return; if (sum_rdc.rdcost >= best_rdc->rdcost) return; - int pl = partition_plane_context(xd, mi_row, mi_col, bsize); - sum_rdc.rate += x->partition_cost[pl][partition]; sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist); if (sum_rdc.rdcost >= best_rdc->rdcost) return; @@ -2321,45 +2341,6 @@ static void rd_test_partition3(const AV1_COMP *const cpi, ThreadData *td, #undef RTP_STX_TRY_ARGS } -#if CONFIG_DIST_8X8 -static int64_t dist_8x8_yuv(const AV1_COMP *const cpi, MACROBLOCK *const x, - uint8_t *src_plane_8x8[MAX_MB_PLANE], - uint8_t *dst_plane_8x8[MAX_MB_PLANE]) { - const AV1_COMMON *const cm = &cpi->common; - const int num_planes = av1_num_planes(cm); - MACROBLOCKD *const xd = &x->e_mbd; - int64_t dist_8x8, dist_8x8_uv, total_dist; - const int src_stride = x->plane[0].src.stride; - int plane; - - const int dst_stride = xd->plane[0].dst.stride; - dist_8x8 = - av1_dist_8x8(cpi, x, src_plane_8x8[0], src_stride, dst_plane_8x8[0], - dst_stride, BLOCK_8X8, 8, 8, 8, 8, x->qindex) - << 4; - - // Compute chroma distortion for a luma 8x8 block - dist_8x8_uv = 0; - - if (num_planes > 1) { - for (plane = 1; plane < MAX_MB_PLANE; ++plane) { - unsigned sse; - const int src_stride_uv = x->plane[plane].src.stride; - const int dst_stride_uv = xd->plane[plane].dst.stride; - const int ssx = xd->plane[plane].subsampling_x; - const int ssy = xd->plane[plane].subsampling_y; - const BLOCK_SIZE plane_bsize = get_plane_block_size(BLOCK_8X8, ssx, ssy); - - cpi->fn_ptr[plane_bsize].vf(src_plane_8x8[plane], src_stride_uv, - dst_plane_8x8[plane], dst_stride_uv, &sse); - dist_8x8_uv += (int64_t)sse << 4; - } - } - - return total_dist = dist_8x8 + dist_8x8_uv; -} -#endif // CONFIG_DIST_8X8 - static void reset_partition(PC_TREE *pc_tree, BLOCK_SIZE bsize) { pc_tree->partitioning = PARTITION_NONE; pc_tree->cb_search_range = SEARCH_FULL_PLANE; @@ -2372,7 +2353,7 @@ static void reset_partition(PC_TREE *pc_tree, BLOCK_SIZE bsize) { } } -static void rd_pick_sqr_partition(const AV1_COMP *const cpi, ThreadData *td, +static void rd_pick_sqr_partition(AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data, TOKENEXTRA **tp, int mi_row, int mi_col, BLOCK_SIZE bsize, RD_STATS *rd_cost, int64_t best_rd, @@ -2410,7 +2391,12 @@ static void rd_pick_sqr_partition(const AV1_COMP *const cpi, ThreadData *td, (void)*tp_orig; (void)split_rd; - av1_zero(pc_tree->pc_tree_stats); + if (best_rd < 0) { + pc_tree->none.rdcost = INT64_MAX; + pc_tree->none.skip = 0; + av1_invalid_rd_stats(rd_cost); + return; + } pc_tree->pc_tree_stats.valid = 1; // Override partition costs at the edges of the frame in the same @@ -2441,9 +2427,11 @@ static void rd_pick_sqr_partition(const AV1_COMP *const cpi, ThreadData *td, #ifndef NDEBUG // Nothing should rely on the default value of this array (which is just - // leftover from encoding the previous block. Setting it to magic number + // leftover from encoding the previous block. Setting it to fixed pattern // when debugging. - memset(x->blk_skip, 234, sizeof(x->blk_skip)); + // bit 0, 1, 2 are blk_skip of each plane + // bit 4, 5, 6 are initialization checking of each plane + memset(x->blk_skip, 0x77, sizeof(x->blk_skip)); #endif // NDEBUG assert(mi_size_wide[bsize] == mi_size_high[bsize]); @@ -2456,19 +2444,35 @@ static void rd_pick_sqr_partition(const AV1_COMP *const cpi, ThreadData *td, set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize); if (bsize == BLOCK_16X16 && cpi->vaq_refresh) - x->mb_energy = av1_block_energy(cpi, x, bsize); + x->mb_energy = av1_log_block_var(cpi, x, bsize); xd->above_txfm_context = cm->above_txfm_context[tile_info->tile_row] + mi_col; xd->left_txfm_context = xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK); save_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); +#if CONFIG_DIST_8X8 + if (x->using_dist_8x8) { + if (block_size_high[bsize] <= 8 || block_size_wide[bsize] <= 8) + do_square_split = 0; + } +#endif + // PARTITION_NONE if (partition_none_allowed) { - if (bsize_at_least_8x8) pc_tree->partitioning = PARTITION_NONE; - + int pt_cost = 0; + if (bsize_at_least_8x8) { + pc_tree->partitioning = PARTITION_NONE; + pt_cost = partition_cost[PARTITION_NONE] < INT_MAX + ? partition_cost[PARTITION_NONE] + : 0; + } + int64_t partition_rd_cost = RDCOST(x->rdmult, pt_cost, 0); + int64_t best_remain_rdcost = best_rdc.rdcost == INT64_MAX + ? INT64_MAX + : (best_rdc.rdcost - partition_rd_cost); rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc, - PARTITION_NONE, bsize, ctx_none, best_rdc.rdcost); + PARTITION_NONE, bsize, ctx_none, best_remain_rdcost); pc_tree->pc_tree_stats.rdcost = ctx_none->rdcost; pc_tree->pc_tree_stats.skip = ctx_none->skip; @@ -2476,9 +2480,6 @@ static void rd_pick_sqr_partition(const AV1_COMP *const cpi, ThreadData *td, if (none_rd) *none_rd = this_rdc.rdcost; if (this_rdc.rate != INT_MAX) { if (bsize_at_least_8x8) { - const int pt_cost = partition_cost[PARTITION_NONE] < INT_MAX - ? partition_cost[PARTITION_NONE] - : 0; this_rdc.rate += pt_cost; this_rdc.rdcost = RDCOST(x->rdmult, this_rdc.rate, this_rdc.dist); } @@ -2520,17 +2521,6 @@ static void rd_pick_sqr_partition(const AV1_COMP *const cpi, ThreadData *td, int64_t temp_best_rdcost = best_rdc.rdcost; pn_rdc = best_rdc; -#if CONFIG_DIST_8X8 - uint8_t *src_plane_8x8[MAX_MB_PLANE], *dst_plane_8x8[MAX_MB_PLANE]; - - if (x->using_dist_8x8 && bsize == BLOCK_8X8) { - for (int i = 0; i < MAX_MB_PLANE; i++) { - src_plane_8x8[i] = x->plane[i].src.buf; - dst_plane_8x8[i] = xd->plane[i].dst.buf; - } - } -#endif // CONFIG_DIST_8X8 - // PARTITION_SPLIT if (do_square_split) { int reached_last_index = 0; @@ -2548,6 +2538,8 @@ static void rd_pick_sqr_partition(const AV1_COMP *const cpi, ThreadData *td, pc_tree->split[idx]->index = idx; int64_t *p_split_rd = &split_rd[idx]; + // TODO(Cherma) : Account for partition cost while passing best rd to + // rd_pick_sqr_partition() rd_pick_sqr_partition(cpi, td, tile_data, tp, mi_row + y_idx, mi_col + x_idx, subsize, &this_rdc, temp_best_rdcost - sum_rdc.rdcost, @@ -2568,14 +2560,6 @@ static void rd_pick_sqr_partition(const AV1_COMP *const cpi, ThreadData *td, } reached_last_index = (idx == 4); -#if CONFIG_DIST_8X8 - if (x->using_dist_8x8 && reached_last_index && - sum_rdc.rdcost != INT64_MAX && bsize == BLOCK_8X8) { - sum_rdc.dist = dist_8x8_yuv(cpi, x, src_plane_8x8, dst_plane_8x8); - sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist); - } -#endif // CONFIG_DIST_8X8 - if (reached_last_index && sum_rdc.rdcost < best_rdc.rdcost) { sum_rdc.rate += partition_cost[PARTITION_SPLIT]; sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist); @@ -2634,14 +2618,6 @@ static void rd_pick_sqr_partition(const AV1_COMP *const cpi, ThreadData *td, } } -#if CONFIG_DIST_8X8 - if (x->using_dist_8x8 && best_rdc.rate < INT_MAX && - best_rdc.dist < INT64_MAX && bsize == BLOCK_4X4 && pc_tree->index == 3) { - encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize, - pc_tree, NULL); - } -#endif // CONFIG_DIST_8X8 - if (bsize == cm->seq_params.sb_size) { assert(best_rdc.rate < INT_MAX); assert(best_rdc.dist < INT64_MAX); @@ -2791,6 +2767,99 @@ static int ml_prune_2pass_split_partition(const PC_TREE_STATS *pc_tree_stats, } #undef FEATURE_SIZE +static void ml_prune_rect_partition(const AV1_COMP *const cpi, + const MACROBLOCK *const x, BLOCK_SIZE bsize, + int64_t best_rd, int64_t none_rd, + int64_t *split_rd, + int *const dst_prune_horz, + int *const dst_prune_vert) { + if (bsize < BLOCK_8X8 || best_rd >= 1000000000) return; + best_rd = AOMMAX(best_rd, 1); + const NN_CONFIG *nn_config = NULL; + const float prob_thresholds[5] = { 0.01f, 0.01f, 0.004f, 0.002f, 0.002f }; + float cur_thresh = 0.0f; + switch (bsize) { + case BLOCK_8X8: + nn_config = &av1_rect_partition_nnconfig_8; + cur_thresh = prob_thresholds[0]; + break; + case BLOCK_16X16: + nn_config = &av1_rect_partition_nnconfig_16; + cur_thresh = prob_thresholds[1]; + break; + case BLOCK_32X32: + nn_config = &av1_rect_partition_nnconfig_32; + cur_thresh = prob_thresholds[2]; + break; + case BLOCK_64X64: + nn_config = &av1_rect_partition_nnconfig_64; + cur_thresh = prob_thresholds[3]; + break; + case BLOCK_128X128: + nn_config = &av1_rect_partition_nnconfig_128; + cur_thresh = prob_thresholds[4]; + break; + default: assert(0 && "Unexpected bsize."); + } + if (!nn_config) return; + aom_clear_system_state(); + + // 1. Compute input features + float features[9]; + + // RD cost ratios + for (int i = 0; i < 5; i++) features[i] = 1.0f; + if (none_rd > 0 && none_rd < 1000000000) + features[0] = (float)none_rd / (float)best_rd; + for (int i = 0; i < 4; i++) { + if (split_rd[i] > 0 && split_rd[i] < 1000000000) + features[1 + i] = (float)split_rd[i] / (float)best_rd; + } + + // Variance ratios + const MACROBLOCKD *const xd = &x->e_mbd; + int whole_block_variance; + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + whole_block_variance = av1_high_get_sby_perpixel_variance( + cpi, &x->plane[0].src, bsize, xd->bd); + } else { + whole_block_variance = + av1_get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize); + } + whole_block_variance = AOMMAX(whole_block_variance, 1); + + int split_variance[4]; + const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT); + struct buf_2d buf; + buf.stride = x->plane[0].src.stride; + const int bw = block_size_wide[bsize]; + for (int i = 0; i < 4; ++i) { + const int x_idx = (i & 1) * bw / 2; + const int y_idx = (i >> 1) * bw / 2; + buf.buf = x->plane[0].src.buf + x_idx + y_idx * buf.stride; + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + split_variance[i] = + av1_high_get_sby_perpixel_variance(cpi, &buf, subsize, xd->bd); + } else { + split_variance[i] = av1_get_sby_perpixel_variance(cpi, &buf, subsize); + } + } + + for (int i = 0; i < 4; i++) + features[5 + i] = (float)split_variance[i] / (float)whole_block_variance; + + // 2. Do the prediction and prune 0-2 partitions based on their probabilities + float raw_scores[3] = { 0.0f }; + av1_nn_predict(features, nn_config, raw_scores); + float probs[3] = { 0.0f }; + av1_nn_softmax(raw_scores, probs, 3); + + // probs[0] is the probability of the fact that both rectangular partitions + // are worse than current best_rd + if (probs[1] <= cur_thresh) (*dst_prune_horz) = 1; + if (probs[2] <= cur_thresh) (*dst_prune_vert) = 1; +} + // Use a ML model to predict if horz_a, horz_b, vert_a, and vert_b should be // considered. static void ml_prune_ab_partition(BLOCK_SIZE bsize, int part_ctx, int var_ctx, @@ -2880,13 +2949,14 @@ static void ml_prune_ab_partition(BLOCK_SIZE bsize, int part_ctx, int var_ctx, #define FEATURES 18 #define LABELS 4 // Use a ML model to predict if horz4 and vert4 should be considered. -static void ml_prune_4_partition(const AV1_COMP *const cpi, - const MACROBLOCK *const x, BLOCK_SIZE bsize, - int part_ctx, int64_t best_rd, - int64_t horz_rd[2], int64_t vert_rd[2], - int64_t split_rd[4], +static void ml_prune_4_partition(const AV1_COMP *const cpi, MACROBLOCK *const x, + BLOCK_SIZE bsize, int part_ctx, + int64_t best_rd, int64_t horz_rd[2], + int64_t vert_rd[2], int64_t split_rd[4], int *const partition_horz4_allowed, - int *const partition_vert4_allowed) { + int *const partition_vert4_allowed, + unsigned int pb_source_variance, int mi_row, + int mi_col) { if (best_rd >= 1000000000) return; const NN_CONFIG *nn_config = NULL; switch (bsize) { @@ -2903,7 +2973,7 @@ static void ml_prune_4_partition(const AV1_COMP *const cpi, float features[FEATURES]; int feature_index = 0; features[feature_index++] = (float)part_ctx; - features[feature_index++] = (float)get_unsigned_bits(x->source_variance); + features[feature_index++] = (float)get_unsigned_bits(pb_source_variance); const int rdcost = (int)AOMMIN(INT_MAX, best_rd); int sub_block_rdcost[8] = { 0 }; @@ -2937,6 +3007,8 @@ static void ml_prune_4_partition(const AV1_COMP *const cpi, { BLOCK_SIZE horz_4_bs = get_partition_subsize(bsize, PARTITION_HORZ_4); BLOCK_SIZE vert_4_bs = get_partition_subsize(bsize, PARTITION_VERT_4); + av1_setup_src_planes(x, cpi->source, mi_row, mi_col, + av1_num_planes(&cpi->common)); const int src_stride = x->plane[0].src.stride; const uint8_t *src = x->plane[0].src.buf; const MACROBLOCKD *const xd = &x->e_mbd; @@ -2990,7 +3062,7 @@ static void ml_prune_4_partition(const AV1_COMP *const cpi, } } - const float denom = (float)(x->source_variance + 1); + const float denom = (float)(pb_source_variance + 1); const float low_b = 0.1f; const float high_b = 10.0f; for (int i = 0; i < 4; ++i) { @@ -3022,9 +3094,9 @@ static void ml_prune_4_partition(const AV1_COMP *const cpi, // Make decisions based on the model scores. int thresh = max_score; switch (bsize) { - case BLOCK_16X16: thresh -= 400; break; - case BLOCK_32X32: thresh -= 400; break; - case BLOCK_64X64: thresh -= 100; break; + case BLOCK_16X16: thresh -= 500; break; + case BLOCK_32X32: thresh -= 500; break; + case BLOCK_64X64: thresh -= 200; break; default: break; } *partition_horz4_allowed = 0; @@ -3039,10 +3111,73 @@ static void ml_prune_4_partition(const AV1_COMP *const cpi, #undef FEATURES #undef LABELS +#define FEATURES 4 +// ML-based partition search breakout. +static int ml_predict_breakout(const AV1_COMP *const cpi, BLOCK_SIZE bsize, + const MACROBLOCK *const x, + const RD_STATS *const rd_stats, + unsigned int pb_source_variance) { + const NN_CONFIG *nn_config = NULL; + int thresh = 0; + switch (bsize) { + case BLOCK_8X8: + nn_config = &av1_partition_breakout_nnconfig_8; + thresh = cpi->sf.ml_partition_search_breakout_thresh[0]; + break; + case BLOCK_16X16: + nn_config = &av1_partition_breakout_nnconfig_16; + thresh = cpi->sf.ml_partition_search_breakout_thresh[1]; + break; + case BLOCK_32X32: + nn_config = &av1_partition_breakout_nnconfig_32; + thresh = cpi->sf.ml_partition_search_breakout_thresh[2]; + break; + case BLOCK_64X64: + nn_config = &av1_partition_breakout_nnconfig_64; + thresh = cpi->sf.ml_partition_search_breakout_thresh[3]; + break; + case BLOCK_128X128: + nn_config = &av1_partition_breakout_nnconfig_128; + thresh = cpi->sf.ml_partition_search_breakout_thresh[4]; + break; + default: assert(0 && "Unexpected bsize."); + } + if (!nn_config || thresh < 0) return 0; + + // Generate feature values. + float features[FEATURES]; + int feature_index = 0; + aom_clear_system_state(); + + const int num_pels_log2 = num_pels_log2_lookup[bsize]; + float rate_f = (float)AOMMIN(rd_stats->rate, INT_MAX); + rate_f = ((float)x->rdmult / 128.0f / 512.0f / (float)(1 << num_pels_log2)) * + rate_f; + features[feature_index++] = rate_f; + + const float dist_f = + (float)(AOMMIN(rd_stats->dist, INT_MAX) >> num_pels_log2); + features[feature_index++] = dist_f; + + features[feature_index++] = (float)pb_source_variance; + + const int dc_q = (int)x->plane[0].dequant_QTX[0]; + features[feature_index++] = (float)(dc_q * dc_q) / 256.0f; + assert(feature_index == FEATURES); + + // Calculate score using the NN model. + float score = 0.0f; + av1_nn_predict(features, nn_config, &score); + + // Make decision. + return (int)(score * 100) >= thresh; +} +#undef FEATURES + // TODO(jingning,jimbankoski,rbultje): properly skip partition types that are // unlikely to be selected depending on previous rate-distortion optimization // results, for encoding speed-up. -static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td, +static void rd_pick_partition(AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data, TOKENEXTRA **tp, int mi_row, int mi_col, BLOCK_SIZE bsize, RD_STATS *rd_cost, int64_t best_rd, @@ -3068,6 +3203,7 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td, pl >= 0 ? x->partition_cost[pl] : x->partition_cost[0]; int do_rectangular_split = 1; + int64_t cur_none_rd = 0; int64_t split_rd[4] = { 0, 0, 0, 0 }; int64_t horz_rd[2] = { 0, 0 }; int64_t vert_rd[2] = { 0, 0 }; @@ -3077,6 +3213,12 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td, int vert_ctx_is_ready = 0; BLOCK_SIZE bsize2 = get_partition_subsize(bsize, PARTITION_SPLIT); + if (best_rd < 0) { + pc_tree->none.rdcost = INT64_MAX; + pc_tree->none.skip = 0; + av1_invalid_rd_stats(rd_cost); + return; + } if (bsize == cm->seq_params.sb_size) x->must_find_valid_partition = 0; // Override skipping rectangular partition operations for edge blocks @@ -3129,9 +3271,11 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td, #ifndef NDEBUG // Nothing should rely on the default value of this array (which is just - // leftover from encoding the previous block. Setting it to magic number + // leftover from encoding the previous block. Setting it to fixed pattern // when debugging. - memset(x->blk_skip, 234, sizeof(x->blk_skip)); + // bit 0, 1, 2 are blk_skip of each plane + // bit 4, 5, 6 are initialization checking of each plane + memset(x->blk_skip, 0x77, sizeof(x->blk_skip)); #endif // NDEBUG assert(mi_size_wide[bsize] == mi_size_high[bsize]); @@ -3143,7 +3287,7 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td, set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize); if (bsize == BLOCK_16X16 && cpi->vaq_refresh) - x->mb_energy = av1_block_energy(cpi, x, bsize); + x->mb_energy = av1_log_block_var(cpi, x, bsize); if (cpi->sf.cb_partition_search && bsize == BLOCK_16X16) { const int cb_partition_search_ctrl = @@ -3285,22 +3429,56 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td, } #endif + // Ref frames picked in the [i_th] quarter subblock during square partition + // RD search. It may be used to prune ref frame selection of rect partitions. + int ref_frames_used[4] = { + 0, + }; + BEGIN_PARTITION_SEARCH: if (x->must_find_valid_partition) { partition_none_allowed = has_rows && has_cols; partition_horz_allowed = has_cols && yss <= xss && bsize_at_least_8x8; partition_vert_allowed = has_rows && xss <= yss && bsize_at_least_8x8; } + + // Partition block source pixel variance. + unsigned int pb_source_variance = UINT_MAX; + +#if CONFIG_DIST_8X8 + if (x->using_dist_8x8) { + if (block_size_high[bsize] <= 8) partition_horz_allowed = 0; + if (block_size_wide[bsize] <= 8) partition_vert_allowed = 0; + if (block_size_high[bsize] <= 8 || block_size_wide[bsize] <= 8) + do_square_split = 0; + } +#endif + // PARTITION_NONE if (partition_none_allowed) { + int pt_cost = 0; + if (bsize_at_least_8x8) { + pt_cost = partition_cost[PARTITION_NONE] < INT_MAX + ? partition_cost[PARTITION_NONE] + : 0; + } + int64_t partition_rd_cost = RDCOST(x->rdmult, pt_cost, 0); + int64_t best_remain_rdcost = (best_rdc.rdcost == INT64_MAX) + ? INT64_MAX + : (best_rdc.rdcost - partition_rd_cost); rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc, - PARTITION_NONE, bsize, ctx_none, best_rdc.rdcost); + PARTITION_NONE, bsize, ctx_none, best_remain_rdcost); + pb_source_variance = x->source_variance; if (none_rd) *none_rd = this_rdc.rdcost; + cur_none_rd = this_rdc.rdcost; if (this_rdc.rate != INT_MAX) { + if (cpi->sf.prune_ref_frame_for_rect_partitions) { + const int ref_type = av1_ref_frame_type(ctx_none->mic.ref_frame); + for (int i = 0; i < 4; ++i) { + ref_frames_used[i] |= (1 << ref_type); + } + } if (bsize_at_least_8x8) { - const int pt_cost = partition_cost[PARTITION_NONE] < INT_MAX - ? partition_cost[PARTITION_NONE] - : 0; this_rdc.rate += pt_cost; this_rdc.rdcost = RDCOST(x->rdmult, this_rdc.rate, this_rdc.dist); } @@ -3318,16 +3496,29 @@ BEGIN_PARTITION_SEARCH: best_rdc = this_rdc; if (bsize_at_least_8x8) pc_tree->partitioning = PARTITION_NONE; - // If all y, u, v transform blocks in this partition are skippable, and - // the dist & rate are within the thresholds, the partition search is - // terminated for current branch of the partition search tree. - // The dist & rate thresholds are set to 0 at speed 0 to disable the - // early termination at that speed. - if (!x->e_mbd.lossless[xd->mi[0]->segment_id] && - (ctx_none->skippable && best_rdc.dist < dist_breakout_thr && - best_rdc.rate < rate_breakout_thr)) { - do_square_split = 0; - do_rectangular_split = 0; + if ((do_square_split || do_rectangular_split) && + !x->e_mbd.lossless[xd->mi[0]->segment_id] && ctx_none->skippable) { + const int use_ml_based_breakout = + bsize <= cpi->sf.use_square_partition_only_threshold && + bsize > BLOCK_4X4 && xd->bd == 8; + if (use_ml_based_breakout) { + if (ml_predict_breakout(cpi, bsize, x, &this_rdc, + pb_source_variance)) { + do_square_split = 0; + do_rectangular_split = 0; + } + } + + // If all y, u, v transform blocks in this partition are skippable, + // and the dist & rate are within the thresholds, the partition + // search is terminated for current branch of the partition search + // tree. The dist & rate thresholds are set to 0 at speed 0 to + // disable the early termination at that speed. + if (best_rdc.dist < dist_breakout_thr && + best_rdc.rate < rate_breakout_thr) { + do_square_split = 0; + do_rectangular_split = 0; + } } #if CONFIG_FP_MB_STATS @@ -3384,24 +3575,14 @@ BEGIN_PARTITION_SEARCH: // store estimated motion vector if (cpi->sf.adaptive_motion_search) store_pred_mv(x, ctx_none); -#if CONFIG_DIST_8X8 - uint8_t *src_plane_8x8[MAX_MB_PLANE], *dst_plane_8x8[MAX_MB_PLANE]; - - if (x->using_dist_8x8 && bsize == BLOCK_8X8) { - for (int i = 0; i < num_planes; i++) { - src_plane_8x8[i] = x->plane[i].src.buf; - dst_plane_8x8[i] = xd->plane[i].dst.buf; - } - } -#endif // CONFIG_DIST_8X8 - // PARTITION_SPLIT if (do_square_split) { av1_init_rd_stats(&sum_rdc); - int reached_last_index = 0; subsize = get_partition_subsize(bsize, PARTITION_SPLIT); - int idx; + sum_rdc.rate = partition_cost[PARTITION_SPLIT]; + sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, 0); + int idx; for (idx = 0; idx < 4 && sum_rdc.rdcost < best_rdc.rdcost; ++idx) { const int x_idx = (idx & 1) * mi_step; const int y_idx = (idx >> 1) * mi_step; @@ -3413,8 +3594,13 @@ BEGIN_PARTITION_SEARCH: pc_tree->split[idx]->index = idx; int64_t *p_split_rd = &split_rd[idx]; + int64_t best_remain_rdcost = best_rdc.rdcost == INT64_MAX + ? INT64_MAX + : (best_rdc.rdcost - sum_rdc.rdcost); + if (cpi->sf.prune_ref_frame_for_rect_partitions) + pc_tree->split[idx]->none.rate = INT_MAX; rd_pick_partition(cpi, td, tile_data, tp, mi_row + y_idx, mi_col + x_idx, - subsize, &this_rdc, best_rdc.rdcost - sum_rdc.rdcost, + subsize, &this_rdc, best_remain_rdcost, pc_tree->split[idx], p_split_rd); if (this_rdc.rate == INT_MAX) { @@ -3424,11 +3610,16 @@ BEGIN_PARTITION_SEARCH: sum_rdc.rate += this_rdc.rate; sum_rdc.dist += this_rdc.dist; sum_rdc.rdcost += this_rdc.rdcost; - + if (cpi->sf.prune_ref_frame_for_rect_partitions && + pc_tree->split[idx]->none.rate != INT_MAX) { + const int ref_type = + av1_ref_frame_type(pc_tree->split[idx]->none.mic.ref_frame); + ref_frames_used[idx] |= (1 << ref_type); + } if (idx <= 1 && (bsize <= BLOCK_8X8 || pc_tree->split[idx]->partitioning == PARTITION_NONE)) { - MB_MODE_INFO *const mbmi = &(pc_tree->split[idx]->none.mic); - PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; + const MB_MODE_INFO *const mbmi = &pc_tree->split[idx]->none.mic; + const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; // Neither palette mode nor cfl predicted if (pmi->palette_size[0] == 0 && pmi->palette_size[1] == 0) { if (mbmi->uv_mode != UV_CFL_PRED) split_ctx_is_ready[idx] = 1; @@ -3436,60 +3627,83 @@ BEGIN_PARTITION_SEARCH: } } } - reached_last_index = (idx == 4); - -#if CONFIG_DIST_8X8 - if (x->using_dist_8x8 && reached_last_index && - sum_rdc.rdcost != INT64_MAX && bsize == BLOCK_8X8) { - int64_t dist_8x8; - dist_8x8 = dist_8x8_yuv(cpi, x, src_plane_8x8, dst_plane_8x8); -#ifdef DEBUG_DIST_8X8 - // TODO(anyone): Fix dist-8x8 assert failure here when CFL is enabled - if (x->tune_metric == AOM_TUNE_PSNR && xd->bd == 8 && 0 /*!CONFIG_CFL*/) - assert(sum_rdc.dist == dist_8x8); -#endif // DEBUG_DIST_8X8 - sum_rdc.dist = dist_8x8; - sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist); - } -#endif // CONFIG_DIST_8X8 + const int reached_last_index = (idx == 4); if (reached_last_index && sum_rdc.rdcost < best_rdc.rdcost) { - sum_rdc.rate += partition_cost[PARTITION_SPLIT]; sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist); if (sum_rdc.rdcost < best_rdc.rdcost) { best_rdc = sum_rdc; pc_tree->partitioning = PARTITION_SPLIT; } - } else if (cpi->sf.less_rectangular_check) { + } else if (cpi->sf.less_rectangular_check_level > 0) { // skip rectangular partition test when larger block size // gives better rd cost - do_rectangular_split &= !partition_none_allowed; + if (cpi->sf.less_rectangular_check_level == 2 || idx <= 2) + do_rectangular_split &= !partition_none_allowed; } restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); } // if (do_split) + pc_tree->horizontal[0].skip_ref_frame_mask = 0; + pc_tree->horizontal[1].skip_ref_frame_mask = 0; + pc_tree->vertical[0].skip_ref_frame_mask = 0; + pc_tree->vertical[1].skip_ref_frame_mask = 0; + if (cpi->sf.prune_ref_frame_for_rect_partitions) { + int used_frames; + used_frames = ref_frames_used[0] | ref_frames_used[1]; + if (used_frames) pc_tree->horizontal[0].skip_ref_frame_mask = ~used_frames; + used_frames = ref_frames_used[2] | ref_frames_used[3]; + if (used_frames) pc_tree->horizontal[1].skip_ref_frame_mask = ~used_frames; + used_frames = ref_frames_used[0] | ref_frames_used[2]; + if (used_frames) pc_tree->vertical[0].skip_ref_frame_mask = ~used_frames; + used_frames = ref_frames_used[1] | ref_frames_used[3]; + if (used_frames) pc_tree->vertical[1].skip_ref_frame_mask = ~used_frames; + } + + int prune_horz = 0; + int prune_vert = 0; + if (cpi->sf.ml_prune_rect_partition && !frame_is_intra_only(cm) && + (partition_horz_allowed || partition_vert_allowed)) { + av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes); + ml_prune_rect_partition(cpi, x, bsize, best_rdc.rdcost, cur_none_rd, + split_rd, &prune_horz, &prune_vert); + } + // PARTITION_HORZ - if (partition_horz_allowed && + if (partition_horz_allowed && !prune_horz && (do_rectangular_split || active_h_edge(cpi, mi_row, mi_step))) { av1_init_rd_stats(&sum_rdc); subsize = get_partition_subsize(bsize, PARTITION_HORZ); if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx_none); if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 && - partition_none_allowed) + partition_none_allowed) { pc_tree->horizontal[0].pred_interp_filter = av1_extract_interp_filter(ctx_none->mic.interp_filters, 0); - - rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc, + } + int64_t best_remain_rdcost = best_rdc.rdcost == INT64_MAX + ? INT64_MAX + : (best_rdc.rdcost - sum_rdc.rdcost); + sum_rdc.rate = partition_cost[PARTITION_HORZ]; + sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, 0); + rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc, PARTITION_HORZ, subsize, &pc_tree->horizontal[0], - best_rdc.rdcost); - horz_rd[0] = sum_rdc.rdcost; + best_remain_rdcost); + + if (this_rdc.rate == INT_MAX) { + sum_rdc.rdcost = INT64_MAX; + } else { + sum_rdc.rate += this_rdc.rate; + sum_rdc.dist += this_rdc.dist; + sum_rdc.rdcost += this_rdc.rdcost; + } + horz_rd[0] = this_rdc.rdcost; if (sum_rdc.rdcost < best_rdc.rdcost && has_rows) { - PICK_MODE_CONTEXT *ctx_h = &pc_tree->horizontal[0]; - MB_MODE_INFO *const mbmi = &(pc_tree->horizontal[0].mic); - PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; + const PICK_MODE_CONTEXT *const ctx_h = &pc_tree->horizontal[0]; + const MB_MODE_INFO *const mbmi = &pc_tree->horizontal[0].mic; + const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; // Neither palette mode nor cfl predicted if (pmi->palette_size[0] == 0 && pmi->palette_size[1] == 0) { if (mbmi->uv_mode != UV_CFL_PRED) horz_ctx_is_ready = 1; @@ -3501,24 +3715,15 @@ BEGIN_PARTITION_SEARCH: if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx_h); if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 && - partition_none_allowed) + partition_none_allowed) { pc_tree->horizontal[1].pred_interp_filter = av1_extract_interp_filter(ctx_h->mic.interp_filters, 0); - + } rd_pick_sb_modes(cpi, tile_data, x, mi_row + mi_step, mi_col, &this_rdc, PARTITION_HORZ, subsize, &pc_tree->horizontal[1], best_rdc.rdcost - sum_rdc.rdcost); horz_rd[1] = this_rdc.rdcost; -#if CONFIG_DIST_8X8 - if (x->using_dist_8x8 && this_rdc.rate != INT_MAX && bsize == BLOCK_8X8) { - update_state(cpi, tile_data, td, &pc_tree->horizontal[1], - mi_row + mi_step, mi_col, subsize, DRY_RUN_NORMAL); - encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, - mi_row + mi_step, mi_col, subsize, NULL); - } -#endif // CONFIG_DIST_8X8 - if (this_rdc.rate == INT_MAX) { sum_rdc.rdcost = INT64_MAX; } else { @@ -3526,24 +3731,9 @@ BEGIN_PARTITION_SEARCH: sum_rdc.dist += this_rdc.dist; sum_rdc.rdcost += this_rdc.rdcost; } -#if CONFIG_DIST_8X8 - if (x->using_dist_8x8 && sum_rdc.rdcost != INT64_MAX && - bsize == BLOCK_8X8) { - int64_t dist_8x8; - dist_8x8 = dist_8x8_yuv(cpi, x, src_plane_8x8, dst_plane_8x8); -#ifdef DEBUG_DIST_8X8 - // TODO(anyone): Fix dist-8x8 assert failure here when CFL is enabled - if (x->tune_metric == AOM_TUNE_PSNR && xd->bd == 8 && 0 /*!CONFIG_CFL*/) - assert(sum_rdc.dist == dist_8x8); -#endif // DEBUG_DIST_8X8 - sum_rdc.dist = dist_8x8; - sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist); - } -#endif // CONFIG_DIST_8X8 } if (sum_rdc.rdcost < best_rdc.rdcost) { - sum_rdc.rate += partition_cost[PARTITION_HORZ]; sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist); if (sum_rdc.rdcost < best_rdc.rdcost) { best_rdc = sum_rdc; @@ -3555,7 +3745,7 @@ BEGIN_PARTITION_SEARCH: } // PARTITION_VERT - if (partition_vert_allowed && + if (partition_vert_allowed && !prune_vert && (do_rectangular_split || active_v_edge(cpi, mi_col, mi_step))) { av1_init_rd_stats(&sum_rdc); subsize = get_partition_subsize(bsize, PARTITION_VERT); @@ -3563,18 +3753,31 @@ BEGIN_PARTITION_SEARCH: if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx_none); if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 && - partition_none_allowed) + partition_none_allowed) { pc_tree->vertical[0].pred_interp_filter = av1_extract_interp_filter(ctx_none->mic.interp_filters, 0); - - rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc, + } + sum_rdc.rate = partition_cost[PARTITION_VERT]; + sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, 0); + int64_t best_remain_rdcost = best_rdc.rdcost == INT64_MAX + ? INT64_MAX + : (best_rdc.rdcost - sum_rdc.rdcost); + rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc, PARTITION_VERT, subsize, &pc_tree->vertical[0], - best_rdc.rdcost); - vert_rd[0] = sum_rdc.rdcost; + best_remain_rdcost); + + if (this_rdc.rate == INT_MAX) { + sum_rdc.rdcost = INT64_MAX; + } else { + sum_rdc.rate += this_rdc.rate; + sum_rdc.dist += this_rdc.dist; + sum_rdc.rdcost += this_rdc.rdcost; + } + vert_rd[0] = this_rdc.rdcost; const int64_t vert_max_rdcost = best_rdc.rdcost; if (sum_rdc.rdcost < vert_max_rdcost && has_cols) { - MB_MODE_INFO *const mbmi = &(pc_tree->vertical[0].mic); - PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; + const MB_MODE_INFO *const mbmi = &pc_tree->vertical[0].mic; + const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; // Neither palette mode nor cfl predicted if (pmi->palette_size[0] == 0 && pmi->palette_size[1] == 0) { if (mbmi->uv_mode != UV_CFL_PRED) vert_ctx_is_ready = 1; @@ -3587,24 +3790,15 @@ BEGIN_PARTITION_SEARCH: if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx_none); if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 && - partition_none_allowed) + partition_none_allowed) { pc_tree->vertical[1].pred_interp_filter = av1_extract_interp_filter(ctx_none->mic.interp_filters, 0); - + } rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + mi_step, &this_rdc, PARTITION_VERT, subsize, &pc_tree->vertical[1], best_rdc.rdcost - sum_rdc.rdcost); vert_rd[1] = this_rdc.rdcost; -#if CONFIG_DIST_8X8 - if (x->using_dist_8x8 && this_rdc.rate != INT_MAX && bsize == BLOCK_8X8) { - update_state(cpi, tile_data, td, &pc_tree->vertical[1], mi_row, - mi_col + mi_step, subsize, DRY_RUN_NORMAL); - encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, mi_row, - mi_col + mi_step, subsize, NULL); - } -#endif // CONFIG_DIST_8X8 - if (this_rdc.rate == INT_MAX) { sum_rdc.rdcost = INT64_MAX; } else { @@ -3612,25 +3806,9 @@ BEGIN_PARTITION_SEARCH: sum_rdc.dist += this_rdc.dist; sum_rdc.rdcost += this_rdc.rdcost; } -#if CONFIG_DIST_8X8 - if (x->using_dist_8x8 && sum_rdc.rdcost != INT64_MAX && - bsize == BLOCK_8X8) { - int64_t dist_8x8; - dist_8x8 = dist_8x8_yuv(cpi, x, src_plane_8x8, dst_plane_8x8); -#ifdef DEBUG_DIST_8X8 - // TODO(anyone): Fix dist-8x8 assert failure here when CFL is enabled - if (x->tune_metric == AOM_TUNE_PSNR && xd->bd == 8 && - 0 /* !CONFIG_CFL */) - assert(sum_rdc.dist == dist_8x8); -#endif // DEBUG_DIST_8X8 - sum_rdc.dist = dist_8x8; - sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist); - } -#endif // CONFIG_DIST_8X8 } if (sum_rdc.rdcost < best_rdc.rdcost) { - sum_rdc.rate += partition_cost[PARTITION_VERT]; sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist); if (sum_rdc.rdcost < best_rdc.rdcost) { best_rdc = sum_rdc; @@ -3641,6 +3819,17 @@ BEGIN_PARTITION_SEARCH: restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); } + if (pb_source_variance == UINT_MAX) { + av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes); + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + pb_source_variance = av1_high_get_sby_perpixel_variance( + cpi, &x->plane[0].src, bsize, xd->bd); + } else { + pb_source_variance = + av1_get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize); + } + } + const int ext_partition_allowed = do_rectangular_split && bsize > BLOCK_8X8 && partition_none_allowed; @@ -3649,15 +3838,26 @@ BEGIN_PARTITION_SEARCH: int horzab_partition_allowed = ext_partition_allowed; int vertab_partition_allowed = ext_partition_allowed; +#if CONFIG_DIST_8X8 + if (x->using_dist_8x8) { + if (block_size_high[bsize] <= 8 || block_size_wide[bsize] <= 8) { + horzab_partition_allowed = 0; + vertab_partition_allowed = 0; + } + } +#endif + if (cpi->sf.prune_ext_partition_types_search_level) { if (cpi->sf.prune_ext_partition_types_search_level == 1) { + // TODO(debargha,huisu@google.com): may need to tune the threshold for + // pb_source_variance. horzab_partition_allowed &= (pc_tree->partitioning == PARTITION_HORZ || (pc_tree->partitioning == PARTITION_NONE && - x->source_variance < 32) || + pb_source_variance < 32) || pc_tree->partitioning == PARTITION_SPLIT); vertab_partition_allowed &= (pc_tree->partitioning == PARTITION_VERT || (pc_tree->partitioning == PARTITION_NONE && - x->source_variance < 32) || + pb_source_variance < 32) || pc_tree->partitioning == PARTITION_SPLIT); } else { horzab_partition_allowed &= (pc_tree->partitioning == PARTITION_HORZ || @@ -3712,6 +3912,9 @@ BEGIN_PARTITION_SEARCH: if (cpi->sf.ml_prune_ab_partition && ext_partition_allowed && partition_horz_allowed && partition_vert_allowed) { + // TODO(huisu@google.com): x->source_variance may not be the current block's + // variance. The correct one to use is pb_source_variance. + // Need to re-train the model to fix it. ml_prune_ab_partition(bsize, pc_tree->partitioning, get_unsigned_bits(x->source_variance), best_rdc.rdcost, horz_rd, vert_rd, split_rd, @@ -3736,6 +3939,21 @@ BEGIN_PARTITION_SEARCH: pc_tree->horizontala[1].rd_mode_is_ready = 1; } } + pc_tree->horizontala[0].skip_ref_frame_mask = 0; + pc_tree->horizontala[1].skip_ref_frame_mask = 0; + pc_tree->horizontala[2].skip_ref_frame_mask = 0; + if (cpi->sf.prune_ref_frame_for_rect_partitions) { + int used_frames; + used_frames = ref_frames_used[0]; + if (used_frames) + pc_tree->horizontala[0].skip_ref_frame_mask = ~used_frames; + used_frames = ref_frames_used[1]; + if (used_frames) + pc_tree->horizontala[1].skip_ref_frame_mask = ~used_frames; + used_frames = ref_frames_used[2] | ref_frames_used[3]; + if (used_frames) + pc_tree->horizontala[2].skip_ref_frame_mask = ~used_frames; + } rd_test_partition3(cpi, td, tile_data, tp, pc_tree, &best_rdc, pc_tree->horizontala, ctx_none, mi_row, mi_col, bsize, PARTITION_HORZ_A, mi_row, mi_col, bsize2, mi_row, @@ -3754,6 +3972,21 @@ BEGIN_PARTITION_SEARCH: pc_tree->horizontalb[0].mic.partition = PARTITION_HORZ_B; pc_tree->horizontalb[0].rd_mode_is_ready = 1; } + pc_tree->horizontalb[0].skip_ref_frame_mask = 0; + pc_tree->horizontalb[1].skip_ref_frame_mask = 0; + pc_tree->horizontalb[2].skip_ref_frame_mask = 0; + if (cpi->sf.prune_ref_frame_for_rect_partitions) { + int used_frames; + used_frames = ref_frames_used[0] | ref_frames_used[1]; + if (used_frames) + pc_tree->horizontalb[0].skip_ref_frame_mask = ~used_frames; + used_frames = ref_frames_used[2]; + if (used_frames) + pc_tree->horizontalb[1].skip_ref_frame_mask = ~used_frames; + used_frames = ref_frames_used[3]; + if (used_frames) + pc_tree->horizontalb[2].skip_ref_frame_mask = ~used_frames; + } rd_test_partition3(cpi, td, tile_data, tp, pc_tree, &best_rdc, pc_tree->horizontalb, ctx_none, mi_row, mi_col, bsize, PARTITION_HORZ_B, mi_row, mi_col, subsize, @@ -3773,6 +4006,18 @@ BEGIN_PARTITION_SEARCH: pc_tree->verticala[0].mic.partition = PARTITION_VERT_A; pc_tree->verticala[0].rd_mode_is_ready = 1; } + pc_tree->verticala[0].skip_ref_frame_mask = 0; + pc_tree->verticala[1].skip_ref_frame_mask = 0; + pc_tree->verticala[2].skip_ref_frame_mask = 0; + if (cpi->sf.prune_ref_frame_for_rect_partitions) { + int used_frames; + used_frames = ref_frames_used[0]; + if (used_frames) pc_tree->verticala[0].skip_ref_frame_mask = ~used_frames; + used_frames = ref_frames_used[2]; + if (used_frames) pc_tree->verticala[1].skip_ref_frame_mask = ~used_frames; + used_frames = ref_frames_used[1] | ref_frames_used[3]; + if (used_frames) pc_tree->verticala[2].skip_ref_frame_mask = ~used_frames; + } rd_test_partition3(cpi, td, tile_data, tp, pc_tree, &best_rdc, pc_tree->verticala, ctx_none, mi_row, mi_col, bsize, PARTITION_VERT_A, mi_row, mi_col, bsize2, @@ -3791,6 +4036,18 @@ BEGIN_PARTITION_SEARCH: pc_tree->verticalb[0].mic.partition = PARTITION_VERT_B; pc_tree->verticalb[0].rd_mode_is_ready = 1; } + pc_tree->verticalb[0].skip_ref_frame_mask = 0; + pc_tree->verticalb[1].skip_ref_frame_mask = 0; + pc_tree->verticalb[2].skip_ref_frame_mask = 0; + if (cpi->sf.prune_ref_frame_for_rect_partitions) { + int used_frames; + used_frames = ref_frames_used[0] | ref_frames_used[2]; + if (used_frames) pc_tree->verticalb[0].skip_ref_frame_mask = ~used_frames; + used_frames = ref_frames_used[1]; + if (used_frames) pc_tree->verticalb[1].skip_ref_frame_mask = ~used_frames; + used_frames = ref_frames_used[3]; + if (used_frames) pc_tree->verticalb[2].skip_ref_frame_mask = ~used_frames; + } rd_test_partition3(cpi, td, tile_data, tp, pc_tree, &best_rdc, pc_tree->verticalb, ctx_none, mi_row, mi_col, bsize, PARTITION_VERT_B, mi_row, mi_col, subsize, mi_row, @@ -3823,9 +4080,19 @@ BEGIN_PARTITION_SEARCH: partition_horz_allowed && partition_vert_allowed) { ml_prune_4_partition(cpi, x, bsize, pc_tree->partitioning, best_rdc.rdcost, horz_rd, vert_rd, split_rd, &partition_horz4_allowed, - &partition_vert4_allowed); + &partition_vert4_allowed, pb_source_variance, mi_row, + mi_col); } +#if CONFIG_DIST_8X8 + if (x->using_dist_8x8) { + if (block_size_high[bsize] <= 16 || block_size_wide[bsize] <= 16) { + partition_horz4_allowed = 0; + partition_vert4_allowed = 0; + } + } +#endif + // PARTITION_HORZ_4 if (partition_horz4_allowed && has_rows && (do_rectangular_split || active_h_edge(cpi, mi_row, mi_step))) { @@ -3834,25 +4101,33 @@ BEGIN_PARTITION_SEARCH: PICK_MODE_CONTEXT *ctx_prev = ctx_none; subsize = get_partition_subsize(bsize, PARTITION_HORZ_4); + sum_rdc.rate = partition_cost[PARTITION_HORZ_4]; + sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, 0); for (int i = 0; i < 4; ++i) { - int this_mi_row = mi_row + i * quarter_step; + const int this_mi_row = mi_row + i * quarter_step; if (i > 0 && this_mi_row >= cm->mi_rows) break; PICK_MODE_CONTEXT *ctx_this = &pc_tree->horizontal4[i]; ctx_this->rd_mode_is_ready = 0; - if (!rd_try_subblock(cpi, td, tile_data, tp, (i == 0), (i == 3), - this_mi_row, mi_col, subsize, &best_rdc, &sum_rdc, - &this_rdc, PARTITION_HORZ_4, ctx_prev, ctx_this)) + ctx_this->skip_ref_frame_mask = 0; + if (cpi->sf.prune_ref_frame_for_rect_partitions) { + const int used_frames = i <= 1 + ? (ref_frames_used[0] | ref_frames_used[1]) + : (ref_frames_used[2] | ref_frames_used[3]); + if (used_frames) ctx_this->skip_ref_frame_mask = ~used_frames; + } + if (!rd_try_subblock(cpi, td, tile_data, tp, (i == 3), this_mi_row, + mi_col, subsize, &best_rdc, &sum_rdc, &this_rdc, + PARTITION_HORZ_4, ctx_prev, ctx_this)) break; ctx_prev = ctx_this; } if (sum_rdc.rdcost < best_rdc.rdcost) { - sum_rdc.rate += partition_cost[PARTITION_HORZ_4]; sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist); if (sum_rdc.rdcost < best_rdc.rdcost) { best_rdc = sum_rdc; @@ -3870,16 +4145,25 @@ BEGIN_PARTITION_SEARCH: PICK_MODE_CONTEXT *ctx_prev = ctx_none; subsize = get_partition_subsize(bsize, PARTITION_VERT_4); + sum_rdc.rate = partition_cost[PARTITION_VERT_4]; + sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, 0); for (int i = 0; i < 4; ++i) { - int this_mi_col = mi_col + i * quarter_step; + const int this_mi_col = mi_col + i * quarter_step; if (i > 0 && this_mi_col >= cm->mi_cols) break; PICK_MODE_CONTEXT *ctx_this = &pc_tree->vertical4[i]; ctx_this->rd_mode_is_ready = 0; - if (!rd_try_subblock(cpi, td, tile_data, tp, (i == 0), (i == 3), mi_row, + ctx_this->skip_ref_frame_mask = 0; + if (cpi->sf.prune_ref_frame_for_rect_partitions) { + const int used_frames = i <= 1 + ? (ref_frames_used[0] | ref_frames_used[2]) + : (ref_frames_used[1] | ref_frames_used[3]); + if (used_frames) ctx_this->skip_ref_frame_mask = ~used_frames; + } + if (!rd_try_subblock(cpi, td, tile_data, tp, (i == 3), mi_row, this_mi_col, subsize, &best_rdc, &sum_rdc, &this_rdc, PARTITION_VERT_4, ctx_prev, ctx_this)) break; @@ -3888,7 +4172,6 @@ BEGIN_PARTITION_SEARCH: } if (sum_rdc.rdcost < best_rdc.rdcost) { - sum_rdc.rate += partition_cost[PARTITION_VERT_4]; sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist); if (sum_rdc.rdcost < best_rdc.rdcost) { best_rdc = sum_rdc; @@ -3924,14 +4207,6 @@ BEGIN_PARTITION_SEARCH: } } -#if CONFIG_DIST_8X8 - if (x->using_dist_8x8 && best_rdc.rate < INT_MAX && - best_rdc.dist < INT64_MAX && bsize == BLOCK_4X4 && pc_tree->index == 3) { - encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize, - pc_tree, NULL); - } -#endif // CONFIG_DIST_8X8 - if (bsize == cm->seq_params.sb_size) { assert(best_rdc.rate < INT_MAX); assert(best_rdc.dist < INT64_MAX); @@ -3950,6 +4225,15 @@ static void init_first_partition_pass_stats_tables( } } +// clear pc_tree_stats +static INLINE void clear_pc_tree_stats(PC_TREE *pt) { + if (pt == NULL) return; + pt->pc_tree_stats.valid = 0; + for (int i = 0; i < 4; ++i) { + clear_pc_tree_stats(pt->split[i]); + } +} + // Minimum number of samples to trigger the // mode_pruning_based_on_two_pass_partition_search feature. #define FIRST_PARTITION_PASS_MIN_SAMPLES 16 @@ -3963,7 +4247,6 @@ static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td, MACROBLOCK *const x = &td->mb; MACROBLOCKD *const xd = &x->e_mbd; SPEED_FEATURES *const sf = &cpi->sf; - int mi_col; const int leaf_nodes = 256; // Initialize the left context for the new SB row @@ -3977,26 +4260,16 @@ static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td, } } + PC_TREE *const pc_root = + td->pc_root[cm->seq_params.mib_size_log2 - MIN_MIB_SIZE_LOG2]; // Code each SB in the row - for (mi_col = tile_info->mi_col_start; mi_col < tile_info->mi_col_end; + for (int mi_col = tile_info->mi_col_start; mi_col < tile_info->mi_col_end; mi_col += cm->seq_params.mib_size) { - const struct segmentation *const seg = &cm->seg; - int dummy_rate; - int64_t dummy_dist; - RD_STATS dummy_rdc; - int i; - int seg_skip = 0; - - const int idx_str = cm->mi_stride * mi_row + mi_col; - MB_MODE_INFO **mi = cm->mi_grid_visible + idx_str; - PC_TREE *const pc_root = - td->pc_root[cm->seq_params.mib_size_log2 - MIN_MIB_SIZE_LOG2]; - av1_fill_coeff_costs(&td->mb, xd->tile_ctx, num_planes); av1_fill_mode_rates(cm, x, xd->tile_ctx); if (sf->adaptive_pred_interp_filter) { - for (i = 0; i < leaf_nodes; ++i) { + for (int i = 0; i < leaf_nodes; ++i) { td->pc_tree[i].vertical[0].pred_interp_filter = SWITCHABLE; td->pc_tree[i].vertical[1].pred_interp_filter = SWITCHABLE; td->pc_tree[i].horizontal[0].pred_interp_filter = SWITCHABLE; @@ -4015,10 +4288,12 @@ static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td, av1_zero(x->pred_mv); pc_root->index = 0; + const struct segmentation *const seg = &cm->seg; + int seg_skip = 0; if (seg->enabled) { const uint8_t *const map = seg->update_map ? cpi->segmentation_map : cm->last_frame_seg_map; - int segment_id = + const int segment_id = map ? get_segment_id(cm, map, cm->seq_params.sb_size, mi_row, mi_col) : 0; seg_skip = segfeature_active(seg, segment_id, SEG_LVL_SKIP); @@ -4039,15 +4314,14 @@ static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td, cpi, block_wavelet_energy_level); } else { const int block_var_level = - av1_block_energy(cpi, x, cm->seq_params.sb_size); + av1_log_block_var(cpi, x, cm->seq_params.sb_size); x->sb_energy_level = block_var_level; offset_qindex = av1_compute_deltaq_from_energy_level(cpi, block_var_level); } - int qmask = ~(cm->delta_q_res - 1); + const int qmask = ~(cm->delta_q_res - 1); int current_qindex = clamp(cm->base_qindex + offset_qindex, cm->delta_q_res, 256 - cm->delta_q_res); - current_qindex = ((current_qindex - cm->base_qindex + cm->delta_q_res / 2) & qmask) + cm->base_qindex; @@ -4058,18 +4332,16 @@ static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td, xd->mi[0]->current_qindex = current_qindex; av1_init_plane_quantizers(cpi, x, xd->mi[0]->segment_id); if (cpi->oxcf.deltaq_mode == DELTA_Q_LF) { - int j, k; - int lfmask = ~(cm->delta_lf_res - 1); - int delta_lf_from_base = offset_qindex / 2; - delta_lf_from_base = - ((delta_lf_from_base + cm->delta_lf_res / 2) & lfmask); + const int lfmask = ~(cm->delta_lf_res - 1); + const int delta_lf_from_base = + ((offset_qindex / 2 + cm->delta_lf_res / 2) & lfmask); // pre-set the delta lf for loop filter. Note that this value is set // before mi is assigned for each block in current superblock - for (j = 0; j < AOMMIN(cm->seq_params.mib_size, cm->mi_rows - mi_row); - j++) { - for (k = 0; k < AOMMIN(cm->seq_params.mib_size, cm->mi_cols - mi_col); - k++) { + for (int j = 0; + j < AOMMIN(cm->seq_params.mib_size, cm->mi_rows - mi_row); j++) { + for (int k = 0; + k < AOMMIN(cm->seq_params.mib_size, cm->mi_cols - mi_col); k++) { cm->mi[(mi_row + j) * cm->mi_stride + (mi_col + k)] .delta_lf_from_base = clamp(delta_lf_from_base, -MAX_LOOP_FILTER, MAX_LOOP_FILTER); @@ -4085,19 +4357,24 @@ static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td, } } + int dummy_rate; + int64_t dummy_dist; + RD_STATS dummy_rdc; + const int idx_str = cm->mi_stride * mi_row + mi_col; + MB_MODE_INFO **mi = cm->mi_grid_visible + idx_str; x->source_variance = UINT_MAX; if (sf->partition_search_type == FIXED_PARTITION || seg_skip) { - BLOCK_SIZE bsize; set_offsets(cpi, tile_info, x, mi_row, mi_col, cm->seq_params.sb_size); - bsize = seg_skip ? cm->seq_params.sb_size : sf->always_this_block_size; + const BLOCK_SIZE bsize = + seg_skip ? cm->seq_params.sb_size : sf->always_this_block_size; set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize); rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, cm->seq_params.sb_size, &dummy_rate, &dummy_dist, 1, pc_root); } else if (cpi->partition_search_skippable_frame) { - BLOCK_SIZE bsize; set_offsets(cpi, tile_info, x, mi_row, mi_col, cm->seq_params.sb_size); - bsize = get_rd_var_based_fixed_partition(cpi, x, mi_row, mi_col); + const BLOCK_SIZE bsize = + get_rd_var_based_fixed_partition(cpi, x, mi_row, mi_col); set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize); rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, cm->seq_params.sb_size, &dummy_rate, &dummy_dist, 1, @@ -4113,9 +4390,9 @@ static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td, reset_partition(pc_root, cm->seq_params.sb_size); x->use_cb_search_range = 0; init_first_partition_pass_stats_tables(x->first_partition_pass_stats); + // Do the first pass if we need two pass partition search if (cpi->sf.two_pass_partition_search && - cpi->sf.use_square_partition_only_threshold < - cm->seq_params.sb_size && + cpi->sf.use_square_partition_only_threshold > BLOCK_4X4 && mi_row + mi_size_high[cm->seq_params.sb_size] < cm->mi_rows && mi_col + mi_size_wide[cm->seq_params.sb_size] < cm->mi_cols && cm->frame_type != KEY_FRAME) { @@ -4123,6 +4400,7 @@ static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td, // Reset the stats tables. if (sf->mode_pruning_based_on_two_pass_partition_search) av1_zero(x->first_partition_pass_stats); + clear_pc_tree_stats(pc_root); rd_pick_sqr_partition(cpi, td, tile_data, tp, mi_row, mi_col, cm->seq_params.sb_size, &dummy_rdc, INT64_MAX, pc_root, NULL); @@ -4130,7 +4408,7 @@ static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td, x->source_variance = UINT_MAX; if (sf->adaptive_pred_interp_filter) { - for (i = 0; i < leaf_nodes; ++i) { + for (int i = 0; i < leaf_nodes; ++i) { td->pc_tree[i].vertical[0].pred_interp_filter = SWITCHABLE; td->pc_tree[i].vertical[1].pred_interp_filter = SWITCHABLE; td->pc_tree[i].horizontal[0].pred_interp_filter = SWITCHABLE; @@ -4157,7 +4435,7 @@ static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td, x->use_cb_search_range = 1; if (sf->mode_pruning_based_on_two_pass_partition_search) { - for (i = 0; i < FIRST_PARTITION_PASS_STATS_TABLES; ++i) { + for (int i = 0; i < FIRST_PARTITION_PASS_STATS_TABLES; ++i) { FIRST_PARTITION_PASS_STATS *const stat = &x->first_partition_pass_stats[i]; if (stat->sample_counts < FIRST_PARTITION_PASS_MIN_SAMPLES) { @@ -4174,21 +4452,17 @@ static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td, } } } - - rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, - cm->seq_params.sb_size, &dummy_rdc, INT64_MAX, - pc_root, NULL); - } else { - rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, - cm->seq_params.sb_size, &dummy_rdc, INT64_MAX, - pc_root, NULL); } + + rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, + cm->seq_params.sb_size, &dummy_rdc, INT64_MAX, pc_root, + NULL); } #if CONFIG_COLLECT_INTER_MODE_RD_STATS // TODO(angiebird): Let inter_mode_rd_model_estimation support multi-tile. if (cpi->sf.inter_mode_rd_model_estimation && cm->tile_cols == 1 && cm->tile_rows == 1) { - av1_inter_mode_data_fit(x->rdmult); + av1_inter_mode_data_fit(tile_data, x->rdmult); } #endif } @@ -4233,6 +4507,32 @@ static TX_MODE select_tx_mode(const AV1_COMP *cpi) { return cpi->common.tx_mode; } +void av1_alloc_tile_data(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + const int tile_cols = cm->tile_cols; + const int tile_rows = cm->tile_rows; + int tile_col, tile_row; + + if (cpi->tile_data != NULL) aom_free(cpi->tile_data); + CHECK_MEM_ERROR( + cm, cpi->tile_data, + aom_memalign(32, tile_cols * tile_rows * sizeof(*cpi->tile_data))); + cpi->allocated_tiles = tile_cols * tile_rows; + + for (tile_row = 0; tile_row < tile_rows; ++tile_row) + for (tile_col = 0; tile_col < tile_cols; ++tile_col) { + TileDataEnc *const tile_data = + &cpi->tile_data[tile_row * tile_cols + tile_col]; + int i, j; + for (i = 0; i < BLOCK_SIZES_ALL; ++i) { + for (j = 0; j < MAX_MODES; ++j) { + tile_data->thresh_freq_fact[i][j] = 32; + tile_data->mode_map[i][j] = j; + } + } + } +} + void av1_init_tile_data(AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; const int num_planes = av1_num_planes(cm); @@ -4240,28 +4540,9 @@ void av1_init_tile_data(AV1_COMP *cpi) { const int tile_rows = cm->tile_rows; int tile_col, tile_row; TOKENEXTRA *pre_tok = cpi->tile_tok[0][0]; + TOKENLIST *tplist = cpi->tplist[0][0]; unsigned int tile_tok = 0; - - if (cpi->tile_data == NULL || cpi->allocated_tiles < tile_cols * tile_rows) { - if (cpi->tile_data != NULL) aom_free(cpi->tile_data); - CHECK_MEM_ERROR( - cm, cpi->tile_data, - aom_memalign(32, tile_cols * tile_rows * sizeof(*cpi->tile_data))); - cpi->allocated_tiles = tile_cols * tile_rows; - - for (tile_row = 0; tile_row < tile_rows; ++tile_row) - for (tile_col = 0; tile_col < tile_cols; ++tile_col) { - TileDataEnc *const tile_data = - &cpi->tile_data[tile_row * tile_cols + tile_col]; - int i, j; - for (i = 0; i < BLOCK_SIZES_ALL; ++i) { - for (j = 0; j < MAX_MODES; ++j) { - tile_data->thresh_freq_fact[i][j] = 32; - tile_data->mode_map[i][j] = j; - } - } - } - } + int tplist_count = 0; for (tile_row = 0; tile_row < tile_rows; ++tile_row) { for (tile_col = 0; tile_col < tile_cols; ++tile_col) { @@ -4274,6 +4555,9 @@ void av1_init_tile_data(AV1_COMP *cpi) { pre_tok = cpi->tile_tok[tile_row][tile_col]; tile_tok = allocated_tokens( *tile_info, cm->seq_params.mib_size_log2 + MI_SIZE_LOG2, num_planes); + cpi->tplist[tile_row][tile_col] = tplist + tplist_count; + tplist = cpi->tplist[tile_row][tile_col]; + tplist_count = av1_get_sb_rows_in_tile(cm, tile_data->tile_info); tile_data->allow_update_cdf = !cm->large_scale_tile; tile_data->allow_update_cdf = tile_data->allow_update_cdf && !cm->disable_cdf_update; @@ -4281,15 +4565,56 @@ void av1_init_tile_data(AV1_COMP *cpi) { } } +void av1_encode_sb_row(AV1_COMP *cpi, ThreadData *td, int tile_row, + int tile_col, int mi_row) { + AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + const int tile_cols = cm->tile_cols; + TileDataEnc *this_tile = &cpi->tile_data[tile_row * tile_cols + tile_col]; + const TileInfo *const tile_info = &this_tile->tile_info; + TOKENEXTRA *tok = NULL; + int sb_row_in_tile; + int tile_mb_cols = (tile_info->mi_col_end - tile_info->mi_col_start + 2) >> 2; + + int num_mb_rows_in_sb = + ((1 << (cm->seq_params.mib_size_log2 + MI_SIZE_LOG2)) + 8) >> 4; + + sb_row_in_tile = + (mi_row - tile_info->mi_row_start) >> cm->seq_params.mib_size_log2; + + get_start_tok(cpi, tile_row, tile_col, mi_row, &tok, + cm->seq_params.mib_size_log2 + MI_SIZE_LOG2, num_planes); + cpi->tplist[tile_row][tile_col][sb_row_in_tile].start = tok; + + encode_rd_sb_row(cpi, td, this_tile, mi_row, &tok); + + cpi->tplist[tile_row][tile_col][sb_row_in_tile].stop = tok; + cpi->tplist[tile_row][tile_col][sb_row_in_tile].count = + (unsigned int)(cpi->tplist[tile_row][tile_col][sb_row_in_tile].stop - + cpi->tplist[tile_row][tile_col][sb_row_in_tile].start); + + assert( + (unsigned int)(tok - + cpi->tplist[tile_row][tile_col][sb_row_in_tile].start) <= + get_token_alloc(num_mb_rows_in_sb, tile_mb_cols, + cm->seq_params.mib_size_log2 + MI_SIZE_LOG2, num_planes)); + + (void)tile_mb_cols; + (void)num_mb_rows_in_sb; +} + void av1_encode_tile(AV1_COMP *cpi, ThreadData *td, int tile_row, int tile_col) { AV1_COMMON *const cm = &cpi->common; TileDataEnc *const this_tile = &cpi->tile_data[tile_row * cm->tile_cols + tile_col]; const TileInfo *const tile_info = &this_tile->tile_info; - TOKENEXTRA *tok = cpi->tile_tok[tile_row][tile_col]; int mi_row; +#if CONFIG_COLLECT_INTER_MODE_RD_STATS + av1_inter_mode_data_init(this_tile); +#endif + av1_zero_above_context(cm, &td->mb.e_mbd, tile_info->mi_col_start, tile_info->mi_col_end, tile_row); av1_init_above_context(cm, &td->mb.e_mbd, tile_row); @@ -4310,25 +4635,23 @@ void av1_encode_tile(AV1_COMP *cpi, ThreadData *td, int tile_row, for (mi_row = tile_info->mi_row_start; mi_row < tile_info->mi_row_end; mi_row += cm->seq_params.mib_size) { - encode_rd_sb_row(cpi, td, this_tile, mi_row, &tok); + av1_encode_sb_row(cpi, td, tile_row, tile_col, mi_row); } - - cpi->tok_count[tile_row][tile_col] = - (unsigned int)(tok - cpi->tile_tok[tile_row][tile_col]); - assert(cpi->tok_count[tile_row][tile_col] <= - allocated_tokens(*tile_info, - cm->seq_params.mib_size_log2 + MI_SIZE_LOG2, - av1_num_planes(cm))); } static void encode_tiles(AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; + const int tile_cols = cm->tile_cols; + const int tile_rows = cm->tile_rows; int tile_col, tile_row; + if (cpi->tile_data == NULL || cpi->allocated_tiles < tile_cols * tile_rows) + av1_alloc_tile_data(cpi); + av1_init_tile_data(cpi); - for (tile_row = 0; tile_row < cm->tile_rows; ++tile_row) { - for (tile_col = 0; tile_col < cm->tile_cols; ++tile_col) { + for (tile_row = 0; tile_row < tile_rows; ++tile_row) { + for (tile_col = 0; tile_col < tile_cols; ++tile_col) { av1_encode_tile(cpi, &cpi->td, tile_row, tile_col); cpi->intrabc_used |= cpi->td.intrabc_used_this_tile; } @@ -4616,6 +4939,13 @@ static INLINE int skip_gm_frame(AV1_COMMON *const cm, int ref_frame) { return 0; } +static void set_default_interp_skip_flags(AV1_COMP *cpi) { + const int num_planes = av1_num_planes(&cpi->common); + cpi->default_interp_skip_flags = (num_planes == 1) + ? DEFAULT_LUMA_INTERP_SKIP_FLAG + : DEFAULT_INTERP_SKIP_FLAG; +} + static void encode_frame_internal(AV1_COMP *cpi) { ThreadData *const td = &cpi->td; MACROBLOCK *const x = &td->mb; @@ -4683,41 +5013,41 @@ static void encode_frame_internal(AV1_COMP *cpi) { av1_hash_table_create(&cm->cur_frame->hash_table); av1_generate_block_2x2_hash_value(cpi->source, block_hash_values[0], - is_block_same[0]); + is_block_same[0], &cpi->td.mb); av1_generate_block_hash_value(cpi->source, 4, block_hash_values[0], block_hash_values[1], is_block_same[0], - is_block_same[1]); + is_block_same[1], &cpi->td.mb); av1_add_to_hash_map_by_row_with_precal_data( &cm->cur_frame->hash_table, block_hash_values[1], is_block_same[1][2], pic_width, pic_height, 4); av1_generate_block_hash_value(cpi->source, 8, block_hash_values[1], block_hash_values[0], is_block_same[1], - is_block_same[0]); + is_block_same[0], &cpi->td.mb); av1_add_to_hash_map_by_row_with_precal_data( &cm->cur_frame->hash_table, block_hash_values[0], is_block_same[0][2], pic_width, pic_height, 8); av1_generate_block_hash_value(cpi->source, 16, block_hash_values[0], block_hash_values[1], is_block_same[0], - is_block_same[1]); + is_block_same[1], &cpi->td.mb); av1_add_to_hash_map_by_row_with_precal_data( &cm->cur_frame->hash_table, block_hash_values[1], is_block_same[1][2], pic_width, pic_height, 16); av1_generate_block_hash_value(cpi->source, 32, block_hash_values[1], block_hash_values[0], is_block_same[1], - is_block_same[0]); + is_block_same[0], &cpi->td.mb); av1_add_to_hash_map_by_row_with_precal_data( &cm->cur_frame->hash_table, block_hash_values[0], is_block_same[0][2], pic_width, pic_height, 32); av1_generate_block_hash_value(cpi->source, 64, block_hash_values[0], block_hash_values[1], is_block_same[0], - is_block_same[1]); + is_block_same[1], &cpi->td.mb); av1_add_to_hash_map_by_row_with_precal_data( &cm->cur_frame->hash_table, block_hash_values[1], is_block_same[1][2], pic_width, pic_height, 64); av1_generate_block_hash_value(cpi->source, 128, block_hash_values[1], block_hash_values[0], is_block_same[1], - is_block_same[0]); + is_block_same[0], &cpi->td.mb); av1_add_to_hash_map_by_row_with_precal_data( &cm->cur_frame->hash_table, block_hash_values[0], is_block_same[0][2], pic_width, pic_height, 128); @@ -4769,7 +5099,7 @@ static void encode_frame_internal(AV1_COMP *cpi) { av1_initialize_rd_consts(cpi); av1_initialize_me_consts(cpi, x, cm->base_qindex); init_encode_frame_mb_context(cpi); - + set_default_interp_skip_flags(cpi); if (cm->prev_frame) cm->last_frame_seg_map = cm->prev_frame->seg_map; else @@ -4793,6 +5123,9 @@ static void encode_frame_internal(AV1_COMP *cpi) { av1_zero(rdc->global_motion_used); av1_zero(cpi->gmparams_cost); +#if !CONFIG_GLOBAL_MOTION_SEARCH + cpi->global_motion_search_done = 1; +#endif // !CONFIG_GLOBAL_MOTION_SEARCH if (cpi->common.frame_type == INTER_FRAME && cpi->source && !cpi->global_motion_search_done) { YV12_BUFFER_CONFIG *ref_buf[REF_FRAMES]; @@ -4939,27 +5272,13 @@ static void encode_frame_internal(AV1_COMP *cpi) { } #endif -#if CONFIG_COLLECT_INTER_MODE_RD_STATS - av1_inter_mode_data_init(); -#endif - - // If allowed, encoding tiles in parallel with one thread handling one tile. - // TODO(geza.lore): The multi-threaded encoder is not safe with more than - // 1 tile rows, as it uses the single above_context et al arrays from - // cpi->common - if (AOMMIN(cpi->oxcf.max_threads, cm->tile_cols) > 1 && cm->tile_rows == 1) + if (cpi->row_mt && (cpi->oxcf.max_threads > 1)) + av1_encode_tiles_mt(cpi); + else if (AOMMIN(cpi->oxcf.max_threads, cm->tile_cols * cm->tile_rows) > 1) av1_encode_tiles_mt(cpi); else encode_tiles(cpi); -#if CONFIG_COLLECT_INTER_MODE_RD_STATS -#if INTER_MODE_RD_TEST - if (cpi->sf.inter_mode_rd_model_estimation) { - av1_inter_mode_data_show(cm); - } -#endif -#endif - aom_usec_timer_mark(&emr_timer); cpi->time_encode_sb_row += aom_usec_timer_elapsed(&emr_timer); } @@ -5407,7 +5726,7 @@ static void encode_superblock(const AV1_COMP *const cpi, TileDataEnc *tile_data, tx_size = (bsize > BLOCK_4X4) ? tx_size : TX_4X4; } mbmi->tx_size = tx_size; - set_txfm_ctxs(tx_size, xd->n8_w, xd->n8_h, + set_txfm_ctxs(tx_size, xd->n4_w, xd->n4_h, (mbmi->skip || seg_skip) && is_inter_block(mbmi), xd); } CFL_CTX *const cfl = &xd->cfl; diff --git a/third_party/aom/av1/encoder/encodeframe.h b/third_party/aom/av1/encoder/encodeframe.h index 62141dba4..e8cf9b468 100644 --- a/third_party/aom/av1/encoder/encodeframe.h +++ b/third_party/aom/av1/encoder/encodeframe.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_ENCODER_ENCODEFRAME_H_ -#define AV1_ENCODER_ENCODEFRAME_H_ +#ifndef AOM_AV1_ENCODER_ENCODEFRAME_H_ +#define AOM_AV1_ENCODER_ENCODEFRAME_H_ #include "aom/aom_integer.h" #include "av1/common/blockd.h" @@ -20,7 +20,7 @@ extern "C" { #endif -#define DELTAQ_MODULATION 0 // 0: variance based, 1: wavelet AC energy based +#define DELTAQ_MODULATION 1 // 0: variance based, 1: wavelet AC energy based struct macroblock; struct yv12_buffer_config; @@ -33,12 +33,15 @@ void av1_setup_src_planes(struct macroblock *x, void av1_encode_frame(struct AV1_COMP *cpi); +void av1_alloc_tile_data(struct AV1_COMP *cpi); void av1_init_tile_data(struct AV1_COMP *cpi); void av1_encode_tile(struct AV1_COMP *cpi, struct ThreadData *td, int tile_row, int tile_col); +void av1_encode_sb_row(struct AV1_COMP *cpi, struct ThreadData *td, + int tile_row, int tile_col, int mi_row); #ifdef __cplusplus } // extern "C" #endif -#endif // AV1_ENCODER_ENCODEFRAME_H_ +#endif // AOM_AV1_ENCODER_ENCODEFRAME_H_ diff --git a/third_party/aom/av1/encoder/encodemb.c b/third_party/aom/av1/encoder/encodemb.c index cea8db6f9..ad12577e6 100644 --- a/third_party/aom/av1/encoder/encodemb.c +++ b/third_party/aom/av1/encoder/encodemb.c @@ -222,11 +222,8 @@ static void encode_block(int plane, int block, int blk_row, int blk_col, a = &args->ta[blk_col]; l = &args->tl[blk_row]; - // Assert not magic number (uninitialized). - assert(plane != 0 || x->blk_skip[blk_row * bw + blk_col] != 234); - if ((plane != 0 || x->blk_skip[blk_row * bw + blk_col] == 0) && - !mbmi->skip_mode) { + if (!is_blk_skip(x, plane, blk_row * bw + blk_col) && !mbmi->skip_mode) { TX_TYPE tx_type = av1_get_tx_type(pd->plane_type, xd, blk_row, blk_col, tx_size, cm->reduced_tx_set_used); if (args->enable_optimize_b) { @@ -350,6 +347,66 @@ static void encode_block_inter(int plane, int block, int blk_row, int blk_col, } } +void av1_foreach_transformed_block_in_plane( + const MACROBLOCKD *const xd, BLOCK_SIZE bsize, int plane, + foreach_transformed_block_visitor visit, void *arg) { + const struct macroblockd_plane *const pd = &xd->plane[plane]; + // block and transform sizes, in number of 4x4 blocks log 2 ("*_b") + // 4x4=0, 8x8=2, 16x16=4, 32x32=6, 64x64=8 + // transform size varies per plane, look it up in a common way. + const TX_SIZE tx_size = av1_get_tx_size(plane, xd); + const BLOCK_SIZE plane_bsize = + get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y); + const uint8_t txw_unit = tx_size_wide_unit[tx_size]; + const uint8_t txh_unit = tx_size_high_unit[tx_size]; + const int step = txw_unit * txh_unit; + int i = 0, r, c; + + // If mb_to_right_edge is < 0 we are in a situation in which + // the current block size extends into the UMV and we won't + // visit the sub blocks that are wholly within the UMV. + const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane); + const int max_blocks_high = max_block_high(xd, plane_bsize, plane); + + int blk_row, blk_col; + + const BLOCK_SIZE max_unit_bsize = + get_plane_block_size(BLOCK_64X64, pd->subsampling_x, pd->subsampling_y); + int mu_blocks_wide = block_size_wide[max_unit_bsize] >> tx_size_wide_log2[0]; + int mu_blocks_high = block_size_high[max_unit_bsize] >> tx_size_high_log2[0]; + mu_blocks_wide = AOMMIN(max_blocks_wide, mu_blocks_wide); + mu_blocks_high = AOMMIN(max_blocks_high, mu_blocks_high); + + // Keep track of the row and column of the blocks we use so that we know + // if we are in the unrestricted motion border. + for (r = 0; r < max_blocks_high; r += mu_blocks_high) { + const int unit_height = AOMMIN(mu_blocks_high + r, max_blocks_high); + // Skip visiting the sub blocks that are wholly within the UMV. + for (c = 0; c < max_blocks_wide; c += mu_blocks_wide) { + const int unit_width = AOMMIN(mu_blocks_wide + c, max_blocks_wide); + for (blk_row = r; blk_row < unit_height; blk_row += txh_unit) { + for (blk_col = c; blk_col < unit_width; blk_col += txw_unit) { + visit(plane, i, blk_row, blk_col, plane_bsize, tx_size, arg); + i += step; + } + } + } + } +} + +void av1_foreach_transformed_block(const MACROBLOCKD *const xd, + BLOCK_SIZE bsize, int mi_row, int mi_col, + foreach_transformed_block_visitor visit, + void *arg, const int num_planes) { + for (int plane = 0; plane < num_planes; ++plane) { + if (!is_chroma_reference(mi_row, mi_col, bsize, + xd->plane[plane].subsampling_x, + xd->plane[plane].subsampling_y)) + continue; + av1_foreach_transformed_block_in_plane(xd, bsize, plane, visit, arg); + } +} + typedef struct encode_block_pass1_args { AV1_COMMON *cm; MACROBLOCK *x; @@ -382,7 +439,7 @@ static void encode_block_pass1(int plane, int block, int blk_row, int blk_col, txfm_param.tx_set_type = av1_get_ext_tx_set_type( txfm_param.tx_size, is_inter_block(xd->mi[0]), cm->reduced_tx_set_used); if (txfm_param.is_hbd) { - av1_highbd_inv_txfm_add_4x4(dqcoeff, dst, pd->dst.stride, &txfm_param); + av1_highbd_inv_txfm_add(dqcoeff, dst, pd->dst.stride, &txfm_param); return; } av1_inv_txfm_add(dqcoeff, dst, pd->dst.stride, &txfm_param); @@ -513,9 +570,7 @@ void av1_encode_block_intra(int plane, int block, int blk_row, int blk_col, av1_predict_intra_block_facade(cm, xd, plane, blk_col, blk_row, tx_size); const int bw = block_size_wide[plane_bsize] >> tx_size_wide_log2[0]; - // Assert not magic number (uninitialized). - assert(plane != 0 || x->blk_skip[blk_row * bw + blk_col] != 234); - if (plane == 0 && x->blk_skip[blk_row * bw + blk_col]) { + if (plane == 0 && is_blk_skip(x, plane, blk_row * bw + blk_col)) { *eob = 0; p->txb_entropy_ctx[block] = 0; } else { diff --git a/third_party/aom/av1/encoder/encodemb.h b/third_party/aom/av1/encoder/encodemb.h index 673f87ea7..39080de59 100644 --- a/third_party/aom/av1/encoder/encodemb.h +++ b/third_party/aom/av1/encoder/encodemb.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_ENCODER_ENCODEMB_H_ -#define AV1_ENCODER_ENCODEMB_H_ +#ifndef AOM_AV1_ENCODER_ENCODEMB_H_ +#define AOM_AV1_ENCODER_ENCODEMB_H_ #include "config/aom_config.h" @@ -47,7 +47,18 @@ typedef enum AV1_XFORM_QUANT { void av1_encode_sb(const struct AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row, int mi_col, RUN_TYPE dry_run); + +void av1_foreach_transformed_block_in_plane( + const MACROBLOCKD *const xd, BLOCK_SIZE bsize, int plane, + foreach_transformed_block_visitor visit, void *arg); + +void av1_foreach_transformed_block(const MACROBLOCKD *const xd, + BLOCK_SIZE bsize, int mi_row, int mi_col, + foreach_transformed_block_visitor visit, + void *arg, const int num_planes); + void av1_encode_sby_pass1(AV1_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE bsize); + void av1_xform_quant(const AV1_COMMON *cm, MACROBLOCK *x, int plane, int block, int blk_row, int blk_col, BLOCK_SIZE plane_bsize, TX_SIZE tx_size, TX_TYPE tx_type, @@ -82,4 +93,4 @@ void av1_encode_intra_block_plane(const struct AV1_COMP *cpi, MACROBLOCK *x, } // extern "C" #endif -#endif // AV1_ENCODER_ENCODEMB_H_ +#endif // AOM_AV1_ENCODER_ENCODEMB_H_ diff --git a/third_party/aom/av1/encoder/encodemv.c b/third_party/aom/av1/encoder/encodemv.c index 944e2c53d..42eb5abf6 100644 --- a/third_party/aom/av1/encoder/encodemv.c +++ b/third_party/aom/av1/encoder/encodemv.c @@ -18,19 +18,37 @@ #include "av1/encoder/encodemv.h" #include "aom_dsp/aom_dsp_common.h" +#include "aom_ports/bitops.h" + +static INLINE int mv_class_base(MV_CLASS_TYPE c) { + return c ? CLASS0_SIZE << (c + 2) : 0; +} + +// If n != 0, returns the floor of log base 2 of n. If n == 0, returns 0. +static INLINE uint8_t log_in_base_2(unsigned int n) { + // get_msb() is only valid when n != 0. + return n == 0 ? 0 : get_msb(n); +} + +static INLINE MV_CLASS_TYPE get_mv_class(int z, int *offset) { + const MV_CLASS_TYPE c = (z >= CLASS0_SIZE * 4096) + ? MV_CLASS_10 + : (MV_CLASS_TYPE)log_in_base_2(z >> 3); + if (offset) *offset = z - mv_class_base(c); + return c; +} static void encode_mv_component(aom_writer *w, int comp, nmv_component *mvcomp, MvSubpelPrecision precision) { + assert(comp != 0); int offset; const int sign = comp < 0; const int mag = sign ? -comp : comp; - const int mv_class = av1_get_mv_class(mag - 1, &offset); + const int mv_class = get_mv_class(mag - 1, &offset); const int d = offset >> 3; // int mv data const int fr = (offset >> 1) & 3; // fractional mv data const int hp = offset & 1; // high precision mv data - assert(comp != 0); - // Sign aom_write_symbol(w, sign, mvcomp->sign_cdf, 2); @@ -89,7 +107,7 @@ static void build_nmv_component_cost_table(int *mvcost, for (v = 1; v <= MV_MAX; ++v) { int z, c, o, d, e, f, cost = 0; z = v - 1; - c = av1_get_mv_class(z, &o); + c = get_mv_class(z, &o); cost += class_cost[c]; d = (o >> 3); /* int mv data */ f = (o >> 1) & 3; /* fractional pel mv data */ diff --git a/third_party/aom/av1/encoder/encodemv.h b/third_party/aom/av1/encoder/encodemv.h index 64e9e7162..37ff547c8 100644 --- a/third_party/aom/av1/encoder/encodemv.h +++ b/third_party/aom/av1/encoder/encodemv.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_ENCODER_ENCODEMV_H_ -#define AV1_ENCODER_ENCODEMV_H_ +#ifndef AOM_AV1_ENCODER_ENCODEMV_H_ +#define AOM_AV1_ENCODER_ENCODEMV_H_ #include "av1/encoder/encoder.h" @@ -40,8 +40,16 @@ void av1_find_best_ref_mvs_from_stack(int allow_hp, int_mv *nearest_mv, int_mv *near_mv, int is_integer); +static INLINE MV_JOINT_TYPE av1_get_mv_joint(const MV *mv) { + if (mv->row == 0) { + return mv->col == 0 ? MV_JOINT_ZERO : MV_JOINT_HNZVZ; + } else { + return mv->col == 0 ? MV_JOINT_HZVNZ : MV_JOINT_HNZVNZ; + } +} + #ifdef __cplusplus } // extern "C" #endif -#endif // AV1_ENCODER_ENCODEMV_H_ +#endif // AOM_AV1_ENCODER_ENCODEMV_H_ diff --git a/third_party/aom/av1/encoder/encoder.c b/third_party/aom/av1/encoder/encoder.c index 13ea32e38..a2da2df89 100644 --- a/third_party/aom/av1/encoder/encoder.c +++ b/third_party/aom/av1/encoder/encoder.c @@ -14,9 +14,28 @@ #include <stdio.h> #include "config/aom_config.h" -#include "config/av1_rtcd.h" #include "config/aom_dsp_rtcd.h" #include "config/aom_scale_rtcd.h" +#include "config/av1_rtcd.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/aom_filter.h" +#if CONFIG_DENOISE +#include "aom_dsp/grain_table.h" +#include "aom_dsp/noise_util.h" +#include "aom_dsp/noise_model.h" +#endif +#include "aom_dsp/psnr.h" +#if CONFIG_INTERNAL_STATS +#include "aom_dsp/ssim.h" +#endif +#include "aom_ports/aom_timer.h" +#include "aom_ports/mem.h" +#include "aom_ports/system_state.h" +#include "aom_scale/aom_scale.h" +#if CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG +#include "aom_util/debug_util.h" +#endif // CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG #include "av1/common/alloccommon.h" #include "av1/common/cdef.h" @@ -38,6 +57,7 @@ #include "av1/encoder/encodetxb.h" #include "av1/encoder/ethread.h" #include "av1/encoder/firstpass.h" +#include "av1/encoder/grain_test_vectors.h" #include "av1/encoder/hash_motion.h" #include "av1/encoder/mbgraph.h" #include "av1/encoder/picklpf.h" @@ -49,26 +69,6 @@ #include "av1/encoder/speed_features.h" #include "av1/encoder/temporal_filter.h" -#include "aom_dsp/psnr.h" -#if CONFIG_INTERNAL_STATS -#include "aom_dsp/ssim.h" -#endif -#include "av1/encoder/grain_test_vectors.h" -#include "aom_dsp/aom_dsp_common.h" -#include "aom_dsp/aom_filter.h" -#if CONFIG_DENOISE -#include "aom_dsp/grain_table.h" -#include "aom_dsp/noise_util.h" -#include "aom_dsp/noise_model.h" -#endif -#include "aom_ports/aom_timer.h" -#include "aom_ports/mem.h" -#include "aom_ports/system_state.h" -#include "aom_scale/aom_scale.h" -#if CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG -#include "aom_util/debug_util.h" -#endif // CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG - #define DEFAULT_EXPLICIT_ORDER_HINT_BITS 7 // av1 uses 10,000,000 ticks/second as time stamp @@ -413,18 +413,13 @@ static void swap_mi_and_prev_mi(AV1_COMMON *cm) { } void av1_initialize_enc(void) { - static volatile int init_done = 0; - - if (!init_done) { - av1_rtcd(); - aom_dsp_rtcd(); - aom_scale_rtcd(); - av1_init_intra_predictors(); - av1_init_me_luts(); - av1_rc_init_minq_luts(); - av1_init_wedge_masks(); - init_done = 1; - } + av1_rtcd(); + aom_dsp_rtcd(); + aom_scale_rtcd(); + av1_init_intra_predictors(); + av1_init_me_luts(); + av1_rc_init_minq_luts(); + av1_init_wedge_masks(); } static void dealloc_context_buffers_ext(AV1_COMP *cpi) { @@ -506,6 +501,11 @@ static void dealloc_compressor_data(AV1_COMP *cpi) { aom_free(cpi->td.mb.wsrc_buf); cpi->td.mb.wsrc_buf = NULL; + for (int i = 0; i < 2; i++) + for (int j = 0; j < 2; j++) { + aom_free(cpi->td.mb.hash_value_buffer[i][j]); + cpi->td.mb.hash_value_buffer[i][j] = NULL; + } aom_free(cpi->td.mb.mask_buf); cpi->td.mb.mask_buf = NULL; @@ -527,10 +527,18 @@ static void dealloc_compressor_data(AV1_COMP *cpi) { aom_free(cpi->tile_tok[0][0]); cpi->tile_tok[0][0] = 0; + aom_free(cpi->tplist[0][0]); + cpi->tplist[0][0] = NULL; + av1_free_pc_tree(&cpi->td, num_planes); aom_free(cpi->td.mb.palette_buffer); + aom_free(cpi->td.mb.tmp_conv_dst); + for (int j = 0; j < 2; ++j) { + aom_free(cpi->td.mb.tmp_obmc_bufs[j]); + } + #if CONFIG_DENOISE if (cpi->denoise_and_model) { aom_denoise_and_model_free(cpi->denoise_and_model); @@ -785,6 +793,10 @@ static void alloc_compressor_data(AV1_COMP *cpi) { av1_alloc_context_buffers(cm, cm->width, cm->height); + int mi_rows_aligned_to_sb = + ALIGN_POWER_OF_TWO(cm->mi_rows, cm->seq_params.mib_size_log2); + int sb_rows = mi_rows_aligned_to_sb >> cm->seq_params.mib_size_log2; + av1_alloc_txb_buf(cpi); alloc_context_buffers_ext(cpi); @@ -797,6 +809,11 @@ static void alloc_compressor_data(AV1_COMP *cpi) { CHECK_MEM_ERROR(cm, cpi->tile_tok[0][0], aom_calloc(tokens, sizeof(*cpi->tile_tok[0][0]))); } + aom_free(cpi->tplist[0][0]); + + CHECK_MEM_ERROR(cm, cpi->tplist[0][0], + aom_calloc(sb_rows * MAX_TILE_ROWS * MAX_TILE_COLS, + sizeof(*cpi->tplist[0][0]))); av1_setup_pc_tree(&cpi->common, &cpi->td); } @@ -1067,6 +1084,32 @@ static void init_config(struct AV1_COMP *cpi, AV1EncoderConfig *oxcf) { 10; // Default value (not signaled) } + if (cm->seq_params.monochrome) { + cm->seq_params.subsampling_x = 1; + cm->seq_params.subsampling_y = 1; + } else if (cm->seq_params.color_primaries == AOM_CICP_CP_BT_709 && + cm->seq_params.transfer_characteristics == AOM_CICP_TC_SRGB && + cm->seq_params.matrix_coefficients == AOM_CICP_MC_IDENTITY) { + cm->seq_params.subsampling_x = 0; + cm->seq_params.subsampling_y = 0; + } else { + if (cm->seq_params.profile == 0) { + cm->seq_params.subsampling_x = 1; + cm->seq_params.subsampling_y = 1; + } else if (cm->seq_params.profile == 1) { + cm->seq_params.subsampling_x = 0; + cm->seq_params.subsampling_y = 0; + } else { + if (cm->seq_params.bit_depth == AOM_BITS_12) { + cm->seq_params.subsampling_x = oxcf->chroma_subsampling_x; + cm->seq_params.subsampling_y = oxcf->chroma_subsampling_y; + } else { + cm->seq_params.subsampling_x = 1; + cm->seq_params.subsampling_y = 0; + } + } + } + cm->width = oxcf->width; cm->height = oxcf->height; set_sb_size(&cm->seq_params, @@ -2326,6 +2369,7 @@ void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) { cpi->oxcf = *oxcf; cpi->common.options = oxcf->cfg; + cpi->row_mt = oxcf->row_mt; x->e_mbd.bd = (int)seq_params->bit_depth; x->e_mbd.global_motion = cm->global_motion; @@ -2350,6 +2394,22 @@ void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) { CHECK_MEM_ERROR(cm, x->palette_buffer, aom_memalign(16, sizeof(*x->palette_buffer))); } + + if (x->tmp_conv_dst == NULL) { + CHECK_MEM_ERROR( + cm, x->tmp_conv_dst, + aom_memalign(32, MAX_SB_SIZE * MAX_SB_SIZE * sizeof(*x->tmp_conv_dst))); + x->e_mbd.tmp_conv_dst = x->tmp_conv_dst; + } + for (int i = 0; i < 2; ++i) { + if (x->tmp_obmc_bufs[i] == NULL) { + CHECK_MEM_ERROR(cm, x->tmp_obmc_bufs[i], + aom_memalign(16, 2 * MAX_MB_PLANE * MAX_SB_SQUARE * + sizeof(*x->tmp_obmc_bufs[i]))); + x->e_mbd.tmp_obmc_bufs[i] = x->tmp_obmc_bufs[i]; + } + } + av1_reset_segment_features(cm); set_high_precision_mv(cpi, 1, 0); @@ -2367,11 +2427,7 @@ void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) { rc->worst_quality = cpi->oxcf.worst_allowed_q; rc->best_quality = cpi->oxcf.best_allowed_q; - if (!oxcf->large_scale_tile) - cm->interp_filter = cpi->sf.default_interp_filter; - else - cm->interp_filter = EIGHTTAP_REGULAR; - + cm->interp_filter = oxcf->large_scale_tile ? EIGHTTAP_REGULAR : SWITCHABLE; cm->switchable_motion_mode = 1; if (cpi->oxcf.render_width > 0 && cpi->oxcf.render_height > 0) { @@ -2588,6 +2644,15 @@ AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf, (int32_t *)aom_memalign( 16, MAX_SB_SQUARE * sizeof(*cpi->td.mb.wsrc_buf))); + for (int x = 0; x < 2; x++) + for (int y = 0; y < 2; y++) + CHECK_MEM_ERROR( + cm, cpi->td.mb.hash_value_buffer[x][y], + (uint32_t *)aom_malloc(AOM_BUFFER_SIZE_FOR_BLOCK_HASH * + sizeof(*cpi->td.mb.hash_value_buffer[0][0]))); + + cpi->td.mb.g_crc_initialized = 0; + CHECK_MEM_ERROR(cm, cpi->td.mb.mask_buf, (int32_t *)aom_memalign( 16, MAX_SB_SQUARE * sizeof(*cpi->td.mb.mask_buf))); @@ -2913,9 +2978,19 @@ void av1_remove_compressor(AV1_COMP *cpi) { // Deallocate allocated thread data. if (t < cpi->num_workers - 1) { aom_free(thread_data->td->palette_buffer); + aom_free(thread_data->td->tmp_conv_dst); + for (int j = 0; j < 2; ++j) { + aom_free(thread_data->td->tmp_obmc_bufs[j]); + } aom_free(thread_data->td->above_pred_buf); aom_free(thread_data->td->left_pred_buf); aom_free(thread_data->td->wsrc_buf); + for (int x = 0; x < 2; x++) { + for (int y = 0; y < 2; y++) { + aom_free(thread_data->td->hash_value_buffer[x][y]); + thread_data->td->hash_value_buffer[x][y] = NULL; + } + } aom_free(thread_data->td->mask_buf); aom_free(thread_data->td->counts); av1_free_pc_tree(thread_data->td, num_planes); @@ -3058,53 +3133,7 @@ void aom_write_yuv_frame_420(YV12_BUFFER_CONFIG *s, FILE *f) { } #endif -#if USE_GF16_MULTI_LAYER -static void check_show_existing_frame_gf16(AV1_COMP *cpi) { - const GF_GROUP *const gf_group = &cpi->twopass.gf_group; - AV1_COMMON *const cm = &cpi->common; - const FRAME_UPDATE_TYPE next_frame_update_type = - gf_group->update_type[gf_group->index]; - - if (cm->show_existing_frame == 1) { - cm->show_existing_frame = 0; - } else if (cpi->rc.is_last_bipred_frame) { - cpi->rc.is_last_bipred_frame = 0; - cm->show_existing_frame = 1; - cpi->existing_fb_idx_to_show = cpi->ref_fb_idx[BWDREF_FRAME - 1]; - } else if (next_frame_update_type == OVERLAY_UPDATE || - next_frame_update_type == INTNL_OVERLAY_UPDATE) { - // Check the temporal filtering status for the next OVERLAY frame - const int num_arfs_in_gf = cpi->num_extra_arfs + 1; - int which_arf = 0, arf_idx; - // Identify the index to the next overlay frame. - for (arf_idx = 0; arf_idx < num_arfs_in_gf; arf_idx++) { - if (gf_group->index == cpi->arf_pos_for_ovrly[arf_idx]) { - which_arf = arf_idx; - break; - } - } - assert(arf_idx < num_arfs_in_gf); - if (cpi->is_arf_filter_off[which_arf]) { - cm->show_existing_frame = 1; - cpi->rc.is_src_frame_alt_ref = 1; - cpi->existing_fb_idx_to_show = (next_frame_update_type == OVERLAY_UPDATE) - ? cpi->ref_fb_idx[ALTREF_FRAME - 1] - : cpi->ref_fb_idx[BWDREF_FRAME - 1]; - cpi->is_arf_filter_off[which_arf] = 0; - } - } - cpi->rc.is_src_frame_ext_arf = 0; -} -#endif // USE_GF16_MULTI_LAYER - static void check_show_existing_frame(AV1_COMP *cpi) { -#if USE_GF16_MULTI_LAYER - if (cpi->rc.baseline_gf_interval == 16) { - check_show_existing_frame_gf16(cpi); - return; - } -#endif // USE_GF16_MULTI_LAYER - const GF_GROUP *const gf_group = &cpi->twopass.gf_group; AV1_COMMON *const cm = &cpi->common; const FRAME_UPDATE_TYPE next_frame_update_type = @@ -3350,13 +3379,13 @@ static INLINE void rshift_bwd_ref_frames(AV1_COMP *cpi) { EXTREF_FRAME - 1 }; for (int i = 2; i > 0; --i) { - cpi->ref_fb_idx[ordered_bwd[i]] = cpi->ref_fb_idx[ordered_bwd[i - 1]]; - // [0] is allocated to the current coded frame, i.e. bwdref memcpy( cpi->interp_filter_selected[ordered_bwd[i] + LAST_FRAME], cpi->interp_filter_selected[ordered_bwd[i - 1] + LAST_FRAME], sizeof(cpi->interp_filter_selected[ordered_bwd[i - 1] + LAST_FRAME])); + + cpi->ref_fb_idx[ordered_bwd[i]] = cpi->ref_fb_idx[ordered_bwd[i - 1]]; } } @@ -3370,52 +3399,16 @@ static INLINE void lshift_bwd_ref_frames(AV1_COMP *cpi) { EXTREF_FRAME - 1 }; for (int i = 0; i < 2; ++i) { - cpi->ref_fb_idx[ordered_bwd[i]] = cpi->ref_fb_idx[ordered_bwd[i + 1]]; - // [0] is allocated to the current coded frame, i.e. bwdref memcpy( cpi->interp_filter_selected[ordered_bwd[i] + LAST_FRAME], cpi->interp_filter_selected[ordered_bwd[i + 1] + LAST_FRAME], sizeof(cpi->interp_filter_selected[ordered_bwd[i + 1] + LAST_FRAME])); - } -} -#endif // USE_SYMM_MULTI_LAYER -#if USE_GF16_MULTI_LAYER -static void update_reference_frames_gf16(AV1_COMP *cpi) { - AV1_COMMON *const cm = &cpi->common; - BufferPool *const pool = cm->buffer_pool; - - if (cm->frame_type == KEY_FRAME) { - for (int ref_frame = 0; ref_frame < REF_FRAMES; ++ref_frame) { - ref_cnt_fb(pool->frame_bufs, - &cm->ref_frame_map[cpi->ref_fb_idx[ref_frame]], - cm->new_fb_idx); - } - } else { - if (cpi->refresh_last_frame || cpi->refresh_golden_frame || - cpi->refresh_bwd_ref_frame || cpi->refresh_alt2_ref_frame || - cpi->refresh_alt_ref_frame) { - assert(cpi->refresh_fb_idx >= 0 && cpi->refresh_fb_idx < REF_FRAMES); - ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->refresh_fb_idx], - cm->new_fb_idx); - } - - // TODO(zoeliu): To handle cpi->interp_filter_selected[]. - - // For GF of 16, an additional ref frame index mapping needs to be handled - // if this is the last frame to encode in the current GF group. - const GF_GROUP *const gf_group = &cpi->twopass.gf_group; - if (gf_group->update_type[gf_group->index + 1] == OVERLAY_UPDATE) - av1_ref_frame_map_idx_updates(cpi, gf_group->index + 1); + cpi->ref_fb_idx[ordered_bwd[i]] = cpi->ref_fb_idx[ordered_bwd[i + 1]]; } - -#if DUMP_REF_FRAME_IMAGES == 1 - // Dump out all reference frame images. - dump_ref_frame_images(cpi); -#endif // DUMP_REF_FRAME_IMAGES } -#endif // USE_GF16_MULTI_LAYER +#endif // USE_SYMM_MULTI_LAYER static void update_reference_frames(AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; @@ -3424,12 +3417,20 @@ static void update_reference_frames(AV1_COMP *cpi) { // for the purpose to verify no mismatch between encoder and decoder. if (cm->show_frame) cpi->last_show_frame_buf_idx = cm->new_fb_idx; -#if USE_GF16_MULTI_LAYER - if (cpi->rc.baseline_gf_interval == 16) { - update_reference_frames_gf16(cpi); - return; + // In the case of show_existing frame, we will not send fresh flag + // to decoder. Any change in the reference frame buffer can be done by + // switching the virtual indices. + if (cm->show_existing_frame) { + cpi->refresh_last_frame = 0; + cpi->refresh_golden_frame = 0; + cpi->refresh_bwd_ref_frame = 0; + cpi->refresh_alt2_ref_frame = 0; + cpi->refresh_alt_ref_frame = 0; + + cpi->rc.is_bwd_ref_frame = 0; + cpi->rc.is_last_bipred_frame = 0; + cpi->rc.is_bipred_frame = 0; } -#endif // USE_GF16_MULTI_LAYER BufferPool *const pool = cm->buffer_pool; @@ -3458,9 +3459,15 @@ static void update_reference_frames(AV1_COMP *cpi) { // slot and, if we're updating the GF, the current frame becomes the new GF. int tmp; - ref_cnt_fb(pool->frame_bufs, - &cm->ref_frame_map[cpi->ref_fb_idx[ALTREF_FRAME - 1]], - cm->new_fb_idx); + // ARF in general is a better reference than overlay. We shouldkeep ARF as + // reference instead of replacing it with overlay. + + if (!cpi->preserve_arf_as_gld) { + ref_cnt_fb(pool->frame_bufs, + &cm->ref_frame_map[cpi->ref_fb_idx[ALTREF_FRAME - 1]], + cm->new_fb_idx); + } + tmp = cpi->ref_fb_idx[ALTREF_FRAME - 1]; cpi->ref_fb_idx[ALTREF_FRAME - 1] = cpi->ref_fb_idx[GOLDEN_FRAME - 1]; cpi->ref_fb_idx[GOLDEN_FRAME - 1] = tmp; @@ -3758,7 +3765,7 @@ static void set_size_independent_vars(AV1_COMP *cpi) { av1_set_speed_features_framesize_independent(cpi); av1_set_rd_speed_thresholds(cpi); av1_set_rd_speed_thresholds_sub8x8(cpi); - cpi->common.interp_filter = cpi->sf.default_interp_filter; + cpi->common.interp_filter = SWITCHABLE; cpi->common.switchable_motion_mode = 1; } @@ -3818,7 +3825,8 @@ static void set_restoration_unit_size(int width, int height, int sx, int sy, rst[2].restoration_unit_size = rst[1].restoration_unit_size; } -static void init_ref_frame_bufs(AV1_COMMON *cm) { +static void init_ref_frame_bufs(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; int i; BufferPool *const pool = cm->buffer_pool; cm->new_fb_idx = INVALID_IDX; @@ -3828,7 +3836,7 @@ static void init_ref_frame_bufs(AV1_COMMON *cm) { } if (cm->seq_params.force_screen_content_tools) { for (i = 0; i < FRAME_BUFFERS; ++i) { - av1_hash_table_init(&pool->frame_bufs[i].hash_table); + av1_hash_table_init(&pool->frame_bufs[i].hash_table, &cpi->td.mb); } } } @@ -3846,7 +3854,7 @@ static void check_initial_width(AV1_COMP *cpi, int use_highbitdepth, seq_params->use_highbitdepth = use_highbitdepth; alloc_raw_frame_buffers(cpi); - init_ref_frame_bufs(cm); + init_ref_frame_bufs(cpi); alloc_util_frame_buffers(cpi); init_motion_estimation(cpi); // TODO(agrange) This can be removed. @@ -4220,7 +4228,7 @@ static void loopfilter_frame(AV1_COMP *cpi, AV1_COMMON *cm) { if (lf->filter_level[0] || lf->filter_level[1]) { #if LOOP_FILTER_BITMASK - av1_loop_filter_frame(cm->frame_to_show, cm, xd, 0, num_planes, 0); + av1_loop_filter_frame(cm->frame_to_show, cm, xd, 0, 0, num_planes, 0); #else if (cpi->num_workers > 1) av1_loop_filter_frame_mt(cm->frame_to_show, cm, xd, 0, num_planes, 0, @@ -4587,8 +4595,8 @@ static int encode_with_recode_loop(AV1_COMP *cpi, size_t *size, uint8_t *dest) { rc->projected_frame_size < rc->max_frame_bandwidth) loop = 0; - if (recode_loop_test_global_motion(cpi)) { - loop = 1; + if (!cpi->sf.gm_disable_recode) { + if (recode_loop_test_global_motion(cpi)) loop = 1; } if (loop) { @@ -4716,47 +4724,6 @@ static void set_ext_overrides(AV1_COMP *cpi) { cpi->ext_use_error_resilient && cpi->common.frame_type != KEY_FRAME; } -static int setup_interp_filter_search_mask(AV1_COMP *cpi) { - InterpFilter ifilter; - int ref_total[REF_FRAMES] = { 0 }; - MV_REFERENCE_FRAME ref; - int mask = 0; - int arf_idx = ALTREF_FRAME; - - if (cpi->common.last_frame_type == KEY_FRAME || cpi->refresh_alt_ref_frame || - cpi->refresh_alt2_ref_frame) - return mask; - - for (ref = LAST_FRAME; ref <= ALTREF_FRAME; ++ref) - for (ifilter = EIGHTTAP_REGULAR; ifilter < SWITCHABLE_FILTERS; ++ifilter) - ref_total[ref] += cpi->interp_filter_selected[ref][ifilter]; - - for (ifilter = EIGHTTAP_REGULAR; ifilter < SWITCHABLE_FILTERS; ++ifilter) { - if ((ref_total[LAST_FRAME] && - cpi->interp_filter_selected[LAST_FRAME][ifilter] == 0) && - (ref_total[LAST2_FRAME] == 0 || - cpi->interp_filter_selected[LAST2_FRAME][ifilter] * 50 < - ref_total[LAST2_FRAME]) && - (ref_total[LAST3_FRAME] == 0 || - cpi->interp_filter_selected[LAST3_FRAME][ifilter] * 50 < - ref_total[LAST3_FRAME]) && - (ref_total[GOLDEN_FRAME] == 0 || - cpi->interp_filter_selected[GOLDEN_FRAME][ifilter] * 50 < - ref_total[GOLDEN_FRAME]) && - (ref_total[BWDREF_FRAME] == 0 || - cpi->interp_filter_selected[BWDREF_FRAME][ifilter] * 50 < - ref_total[BWDREF_FRAME]) && - (ref_total[ALTREF2_FRAME] == 0 || - cpi->interp_filter_selected[ALTREF2_FRAME][ifilter] * 50 < - ref_total[ALTREF2_FRAME]) && - (ref_total[ALTREF_FRAME] == 0 || - cpi->interp_filter_selected[arf_idx][ifilter] * 50 < - ref_total[ALTREF_FRAME])) - mask |= 1 << ifilter; - } - return mask; -} - #define DUMP_RECON_FRAMES 0 #if DUMP_RECON_FRAMES == 1 @@ -4914,7 +4881,7 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest, if (cm->current_video_frame > 0) cpi->ref_frame_flags = get_ref_frame_flags(cpi); - if (cm->show_existing_frame) { + if (encode_show_existing_frame(cm)) { // NOTE(zoeliu): In BIDIR_PRED, the existing frame to show is the current // BWDREF_FRAME in the reference frame buffer. if (cm->frame_type == KEY_FRAME) { @@ -4925,20 +4892,6 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest, cm->show_frame = 1; cpi->frame_flags = *frame_flags; - // In the case of show_existing frame, we will not send fresh flag - // to decoder. Any change in the reference frame buffer can be done by - // switching the virtual indices. - - cpi->refresh_last_frame = 0; - cpi->refresh_golden_frame = 0; - cpi->refresh_bwd_ref_frame = 0; - cpi->refresh_alt2_ref_frame = 0; - cpi->refresh_alt_ref_frame = 0; - - cpi->rc.is_bwd_ref_frame = 0; - cpi->rc.is_last_bipred_frame = 0; - cpi->rc.is_bipred_frame = 0; - restore_coding_context(cpi); // Build the bitstream @@ -4990,10 +4943,6 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest, av1_rc_postencode_update(cpi, *size); } - // Decrement count down till next gf - if (cpi->rc.frames_till_gf_update_due > 0) - cpi->rc.frames_till_gf_update_due--; - ++cm->current_video_frame; return AOM_CODEC_OK; @@ -5002,9 +4951,6 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest, // Set default state for segment based loop filter update flags. cm->lf.mode_ref_delta_update = 0; - if (cpi->oxcf.pass == 2 && cpi->sf.adaptive_interp_filter_search) - cpi->sf.interp_filter_search_mask = setup_interp_filter_search_mask(cpi); - // Set various flags etc to special state if it is a key frame. if (frame_is_intra_only(cm) || frame_is_sframe(cm)) { // Reset the loop filter deltas and segmentation map. @@ -5246,15 +5192,6 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest, // takes a space in the gf group. Therefore, even when // it is not shown, we still need update the count down. - // TODO(weitinglin): This is a work-around to handle the condition - // when a frame is drop. We should fix the cm->show_frame flag - // instead of checking the other condition to update the counter properly. - if (cm->show_frame || is_frame_droppable(cpi)) { - // Decrement count down till next gf - if (cpi->rc.frames_till_gf_update_due > 0) - cpi->rc.frames_till_gf_update_due--; - } - if (cm->show_frame) { // TODO(zoeliu): We may only swamp mi and prev_mi for those frames that // are @@ -5279,6 +5216,50 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest, return AOM_CODEC_OK; } +static INLINE void update_keyframe_counters(AV1_COMP *cpi) { + // TODO(zoeliu): To investigate whether we should treat BWDREF_FRAME + // differently here for rc->avg_frame_bandwidth. + if (cpi->common.show_frame || cpi->rc.is_bwd_ref_frame) { + if (!cpi->common.show_existing_frame || cpi->rc.is_src_frame_alt_ref || + cpi->common.frame_type == KEY_FRAME) { + // If this is a show_existing_frame with a source other than altref, + // or if it is not a displayed forward keyframe, the keyframe update + // counters were incremented when it was originally encoded. + cpi->rc.frames_since_key++; + cpi->rc.frames_to_key--; + } + } +} + +static INLINE void update_frames_till_gf_update(AV1_COMP *cpi) { + // TODO(weitinglin): Updating this counter for is_frame_droppable + // is a work-around to handle the condition when a frame is drop. + // We should fix the cpi->common.show_frame flag + // instead of checking the other condition to update the counter properly. + if (cpi->common.show_frame || is_frame_droppable(cpi)) { + // Decrement count down till next gf + if (cpi->rc.frames_till_gf_update_due > 0) + cpi->rc.frames_till_gf_update_due--; + } +} + +static INLINE void update_twopass_gf_group_index(AV1_COMP *cpi) { + // Increment the gf group index ready for the next frame. If this is + // a show_existing_frame with a source other than altref, or if it is not + // a displayed forward keyframe, the index was incremented when it was + // originally encoded. + if (!cpi->common.show_existing_frame || cpi->rc.is_src_frame_alt_ref || + cpi->common.frame_type == KEY_FRAME) { + ++cpi->twopass.gf_group.index; + } +} + +static void update_rc_counts(AV1_COMP *cpi) { + update_keyframe_counters(cpi); + update_frames_till_gf_update(cpi); + if (cpi->oxcf.pass == 2) update_twopass_gf_group_index(cpi); +} + static int Pass0Encode(AV1_COMP *cpi, size_t *size, uint8_t *dest, int skip_adapt, unsigned int *frame_flags) { if (cpi->oxcf.rc_mode == AOM_CBR) { @@ -5290,6 +5271,7 @@ static int Pass0Encode(AV1_COMP *cpi, size_t *size, uint8_t *dest, AOM_CODEC_OK) { return AOM_CODEC_ERROR; } + update_rc_counts(cpi); check_show_existing_frame(cpi); return AOM_CODEC_OK; } @@ -5319,14 +5301,8 @@ static int Pass2Encode(AV1_COMP *cpi, size_t *size, uint8_t *dest, cm->cum_txcoeff_cost_timer); #endif - // Do not do post-encoding update for those frames that do not have a spot - // in - // a gf group, but note that an OVERLAY frame always has a spot in a gf - // group, - // even when show_existing_frame is used. - if (!cpi->common.show_existing_frame || cpi->rc.is_src_frame_alt_ref) { - av1_twopass_postencode_update(cpi); - } + av1_twopass_postencode_update(cpi); + update_rc_counts(cpi); check_show_existing_frame(cpi); return AOM_CODEC_OK; } @@ -5734,7 +5710,7 @@ static int is_integer_mv(AV1_COMP *cpi, const YV12_BUFFER_CONFIG *cur_picture, av1_get_block_hash_value( cur_picture->y_buffer + y_pos * stride_cur + x_pos, stride_cur, block_size, &hash_value_1, &hash_value_2, - (cur_picture->flags & YV12_FLAG_HIGHBITDEPTH)); + (cur_picture->flags & YV12_FLAG_HIGHBITDEPTH), &cpi->td.mb); // Hashing does not work for highbitdepth currently. // TODO(Roger): Make it work for highbitdepth. if (av1_use_hash_me(&cpi->common)) { @@ -5822,13 +5798,6 @@ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags, set_high_precision_mv(cpi, ALTREF_HIGH_PRECISION_MV, 0); - // Is multi-arf enabled. - // Note that at the moment multi_arf is only configured for 2 pass VBR - if ((oxcf->pass == 2) && (cpi->oxcf.enable_auto_arf > 1)) - cpi->multi_arf_allowed = 1; - else - cpi->multi_arf_allowed = 0; - // Normal defaults cm->refresh_frame_context = oxcf->frame_parallel_decoding_mode ? REFRESH_FRAME_CONTEXT_DISABLED @@ -5850,16 +5819,20 @@ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags, struct lookahead_entry *lookahead_src = NULL; if (cm->current_video_frame > 0) lookahead_src = av1_lookahead_peek(cpi->lookahead, 0); - if (lookahead_src != NULL && - ((cpi->oxcf.error_resilient_mode | - ((lookahead_src->flags & AOM_EFLAG_ERROR_RESILIENT) != 0)) || - (cpi->oxcf.s_frame_mode | - ((lookahead_src->flags & AOM_EFLAG_SET_S_FRAME) != 0))) && - !(rc->frames_to_key == 0 || (cpi->frame_flags & FRAMEFLAGS_KEY))) { - cm->show_existing_frame = 0; + + int use_show_existing = 1; + if (lookahead_src != NULL) { + const int is_error_resilient = + cpi->oxcf.error_resilient_mode || + (lookahead_src->flags & AOM_EFLAG_ERROR_RESILIENT); + const int is_s_frame = cpi->oxcf.s_frame_mode || + (lookahead_src->flags & AOM_EFLAG_SET_S_FRAME); + const int is_key_frame = + (rc->frames_to_key == 0) || (cpi->frame_flags & FRAMEFLAGS_KEY); + use_show_existing = !(is_error_resilient || is_s_frame) || is_key_frame; } - if (oxcf->pass == 2 && cm->show_existing_frame) { + if (oxcf->pass == 2 && cm->show_existing_frame && use_show_existing) { // Manage the source buffer and flush out the source frame that has been // coded already; Also get prepared for PSNR calculation if needed. if ((source = av1_lookahead_pop(cpi->lookahead, flush)) == NULL) { @@ -6415,3 +6388,50 @@ int64_t ticks_to_timebase_units(const aom_rational_t *timebase, int64_t n) { const int64_t round = TICKS_PER_SEC * timebase->num / 2 - 1; return (n * timebase->den + round) / timebase->num / TICKS_PER_SEC; } + +aom_fixed_buf_t *av1_get_global_headers(AV1_COMP *cpi) { + if (!cpi) return NULL; + + uint8_t header_buf[512] = { 0 }; + const uint32_t sequence_header_size = + write_sequence_header_obu(cpi, &header_buf[0]); + assert(sequence_header_size <= sizeof(header_buf)); + if (sequence_header_size == 0) return NULL; + + const size_t obu_header_size = 1; + const size_t size_field_size = aom_uleb_size_in_bytes(sequence_header_size); + const size_t payload_offset = obu_header_size + size_field_size; + + if (payload_offset + sequence_header_size > sizeof(header_buf)) return NULL; + memmove(&header_buf[payload_offset], &header_buf[0], sequence_header_size); + + if (write_obu_header(OBU_SEQUENCE_HEADER, 0, &header_buf[0]) != + obu_header_size) { + return NULL; + } + + size_t coded_size_field_size = 0; + if (aom_uleb_encode(sequence_header_size, size_field_size, + &header_buf[obu_header_size], + &coded_size_field_size) != 0) { + return NULL; + } + assert(coded_size_field_size == size_field_size); + + aom_fixed_buf_t *global_headers = + (aom_fixed_buf_t *)malloc(sizeof(*global_headers)); + if (!global_headers) return NULL; + + const size_t global_header_buf_size = + obu_header_size + size_field_size + sequence_header_size; + + global_headers->buf = malloc(global_header_buf_size); + if (!global_headers->buf) { + free(global_headers); + return NULL; + } + + memcpy(global_headers->buf, &header_buf[0], global_header_buf_size); + global_headers->sz = global_header_buf_size; + return global_headers; +} diff --git a/third_party/aom/av1/encoder/encoder.h b/third_party/aom/av1/encoder/encoder.h index 2b7ab711d..ee7fc4637 100644 --- a/third_party/aom/av1/encoder/encoder.h +++ b/third_party/aom/av1/encoder/encoder.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_ENCODER_ENCODER_H_ -#define AV1_ENCODER_ENCODER_H_ +#ifndef AOM_AV1_ENCODER_ENCODER_H_ +#define AOM_AV1_ENCODER_ENCODER_H_ #include <stdio.h> @@ -142,7 +142,6 @@ typedef struct AV1EncoderConfig { int noise_sensitivity; // pre processing blur: recommendation 0 int sharpness; // sharpening output: recommendation 0: int speed; - int dev_sf; // maximum allowed bitrate for any intra frame in % of bitrate target. unsigned int rc_max_intra_bitrate_pct; // maximum allowed bitrate for any inter frame in % of bitrate target. @@ -249,6 +248,7 @@ typedef struct AV1EncoderConfig { int min_gf_interval; int max_gf_interval; + int row_mt; int tile_columns; int tile_rows; int tile_width_count; @@ -309,6 +309,9 @@ typedef struct AV1EncoderConfig { float noise_level; int noise_block_size; #endif + + unsigned int chroma_subsampling_x; + unsigned int chroma_subsampling_y; } AV1EncoderConfig; static INLINE int is_lossless_requested(const AV1EncoderConfig *cfg) { @@ -401,6 +404,43 @@ typedef struct FRAME_COUNTS { [SWITCHABLE_FILTERS]; } FRAME_COUNTS; +#if CONFIG_COLLECT_INTER_MODE_RD_STATS +#define INTER_MODE_RD_DATA_OVERALL_SIZE 6400 + +typedef struct { + int ready; + double a; + double b; + double dist_mean; + double ld_mean; + double sse_mean; + double sse_sse_mean; + double sse_ld_mean; + int num; + double dist_sum; + double ld_sum; + double sse_sum; + double sse_sse_sum; + double sse_ld_sum; +} InterModeRdModel; + +typedef struct { + int idx; + int64_t rd; +} RdIdxPair; +// TODO(angiebird): This is an estimated size. We still need to figure what is +// the maximum number of modes. +#define MAX_INTER_MODES 1024 +typedef struct inter_modes_info { + int num; + MB_MODE_INFO mbmi_arr[MAX_INTER_MODES]; + int mode_rate_arr[MAX_INTER_MODES]; + int64_t sse_arr[MAX_INTER_MODES]; + int64_t est_rd_arr[MAX_INTER_MODES]; + RdIdxPair rd_idx_pair_arr[MAX_INTER_MODES]; +} InterModesInfo; +#endif + // TODO(jingning) All spatially adaptive variables should go to TileDataEnc. typedef struct TileDataEnc { TileInfo tile_info; @@ -411,8 +451,18 @@ typedef struct TileDataEnc { CFL_CTX cfl; DECLARE_ALIGNED(16, FRAME_CONTEXT, tctx); uint8_t allow_update_cdf; +#if CONFIG_COLLECT_INTER_MODE_RD_STATS + InterModeRdModel inter_mode_rd_models[BLOCK_SIZES_ALL]; + InterModesInfo inter_modes_info; +#endif } TileDataEnc; +typedef struct { + TOKENEXTRA *start; + TOKENEXTRA *stop; + unsigned int count; +} TOKENLIST; + typedef struct RD_COUNTS { int64_t comp_pred_diff[REFERENCE_MODES]; // Stores number of 4x4 blocks using global motion per reference frame. @@ -427,11 +477,14 @@ typedef struct ThreadData { FRAME_COUNTS *counts; PC_TREE *pc_tree; PC_TREE *pc_root[MAX_MIB_SIZE_LOG2 - MIN_MIB_SIZE_LOG2 + 1]; + uint32_t *hash_value_buffer[2][2]; int32_t *wsrc_buf; int32_t *mask_buf; uint8_t *above_pred_buf; uint8_t *left_pred_buf; PALETTE_BUFFER *palette_buffer; + CONV_BUF_TYPE *tmp_conv_dst; + uint8_t *tmp_obmc_bufs[2]; int intrabc_used_this_tile; } ThreadData; @@ -502,6 +555,7 @@ typedef struct AV1_COMP { int previous_index; int cur_poc; // DebugInfo + unsigned int row_mt; int scaled_ref_idx[REF_FRAMES]; int ref_fb_idx[REF_FRAMES]; int refresh_fb_idx; // ref frame buffer index to refresh @@ -647,13 +701,12 @@ typedef struct AV1_COMP { search_site_config ss_cfg; - int multi_arf_allowed; - TileDataEnc *tile_data; int allocated_tiles; // Keep track of memory allocated for tiles. TOKENEXTRA *tile_tok[MAX_TILE_ROWS][MAX_TILE_COLS]; unsigned int tok_count[MAX_TILE_ROWS][MAX_TILE_COLS]; + TOKENLIST *tplist[MAX_TILE_ROWS][MAX_TILE_COLS]; TileBufferEnc tile_buffers[MAX_TILE_ROWS][MAX_TILE_COLS]; @@ -703,8 +756,13 @@ typedef struct AV1_COMP { #if CONFIG_DENOISE struct aom_denoise_and_model_t *denoise_and_model; #endif + // Stores the default value of skip flag depending on chroma format + // Set as 1 for monochrome and 3 for other color formats + int default_interp_skip_flags; + int preserve_arf_as_gld; } AV1_COMP; +// Must not be called more than once. void av1_initialize_enc(void); struct AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf, @@ -833,6 +891,22 @@ static INLINE unsigned int allocated_tokens(TileInfo tile, int sb_size_log2, return get_token_alloc(tile_mb_rows, tile_mb_cols, sb_size_log2, num_planes); } +static INLINE void get_start_tok(AV1_COMP *cpi, int tile_row, int tile_col, + int mi_row, TOKENEXTRA **tok, int sb_size_log2, + int num_planes) { + AV1_COMMON *const cm = &cpi->common; + const int tile_cols = cm->tile_cols; + TileDataEnc *this_tile = &cpi->tile_data[tile_row * tile_cols + tile_col]; + const TileInfo *const tile_info = &this_tile->tile_info; + + const int tile_mb_cols = + (tile_info->mi_col_end - tile_info->mi_col_start + 2) >> 2; + const int tile_mb_row = (mi_row - tile_info->mi_row_start + 2) >> 2; + + *tok = cpi->tile_tok[tile_row][tile_col] + + get_token_alloc(tile_mb_row, tile_mb_cols, sb_size_log2, num_planes); +} + void av1_apply_encoding_flags(AV1_COMP *cpi, aom_enc_frame_flags_t flags); #define ALT_MIN_LAG 3 @@ -885,8 +959,27 @@ static INLINE int av1_frame_scaled(const AV1_COMMON *cm) { return !av1_superres_scaled(cm) && av1_resize_scaled(cm); } +// Don't allow a show_existing_frame to coincide with an error resilient +// frame. An exception can be made for a forward keyframe since it has no +// previous dependencies. +static INLINE int encode_show_existing_frame(const AV1_COMMON *cm) { + return cm->show_existing_frame && + (!cm->error_resilient_mode || cm->frame_type == KEY_FRAME); +} + +// Returns a Sequence Header OBU stored in an aom_fixed_buf_t, or NULL upon +// failure. When a non-NULL aom_fixed_buf_t pointer is returned by this +// function, the memory must be freed by the caller. Both the buf member of the +// aom_fixed_buf_t, and the aom_fixed_buf_t pointer itself must be freed. Memory +// returned must be freed via call to free(). +// +// Note: The OBU returned is in Low Overhead Bitstream Format. Specifically, +// the obu_has_size_field bit is set, and the buffer contains the obu_size +// field. +aom_fixed_buf_t *av1_get_global_headers(AV1_COMP *cpi); + #ifdef __cplusplus } // extern "C" #endif -#endif // AV1_ENCODER_ENCODER_H_ +#endif // AOM_AV1_ENCODER_ENCODER_H_ diff --git a/third_party/aom/av1/encoder/encodetxb.c b/third_party/aom/av1/encoder/encodetxb.c index 81f360733..5a31d93d7 100644 --- a/third_party/aom/av1/encoder/encodetxb.c +++ b/third_party/aom/av1/encoder/encodetxb.c @@ -133,6 +133,38 @@ static INLINE int64_t get_coeff_dist(tran_low_t tcoeff, tran_low_t dqcoeff, return error; } +static const int8_t eob_to_pos_small[33] = { + 0, 1, 2, // 0-2 + 3, 3, // 3-4 + 4, 4, 4, 4, // 5-8 + 5, 5, 5, 5, 5, 5, 5, 5, // 9-16 + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6 // 17-32 +}; + +static const int8_t eob_to_pos_large[17] = { + 6, // place holder + 7, // 33-64 + 8, 8, // 65-128 + 9, 9, 9, 9, // 129-256 + 10, 10, 10, 10, 10, 10, 10, 10, // 257-512 + 11 // 513- +}; + +static INLINE int get_eob_pos_token(const int eob, int *const extra) { + int t; + + if (eob < 33) { + t = eob_to_pos_small[eob]; + } else { + const int e = AOMMIN((eob - 1) >> 5, 16); + t = eob_to_pos_large[e]; + } + + *extra = eob - k_eob_group_start[t]; + + return t; +} + #if CONFIG_ENTROPY_STATS void av1_update_eob_context(int cdf_idx, int eob, TX_SIZE tx_size, TX_CLASS tx_class, PLANE_TYPE plane, @@ -464,8 +496,12 @@ void av1_write_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCKD *xd, aom_writer *w, int blk_row, int blk_col, int plane, TX_SIZE tx_size, const tran_low_t *tcoeff, uint16_t eob, TXB_CTX *txb_ctx) { - const PLANE_TYPE plane_type = get_plane_type(plane); const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size); + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + aom_write_symbol(w, eob == 0, + ec_ctx->txb_skip_cdf[txs_ctx][txb_ctx->txb_skip_ctx], 2); + if (eob == 0) return; + const PLANE_TYPE plane_type = get_plane_type(plane); const TX_TYPE tx_type = av1_get_tx_type(plane_type, xd, blk_row, blk_col, tx_size, cm->reduced_tx_set_used); const TX_CLASS tx_class = tx_type_to_class[tx_type]; @@ -475,18 +511,10 @@ void av1_write_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCKD *xd, const int bwl = get_txb_bwl(tx_size); const int width = get_txb_wide(tx_size); const int height = get_txb_high(tx_size); - FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + uint8_t levels_buf[TX_PAD_2D]; uint8_t *const levels = set_levels(levels_buf, width); DECLARE_ALIGNED(16, int8_t, coeff_contexts[MAX_TX_SQUARE]); - - aom_write_symbol(w, eob == 0, - ec_ctx->txb_skip_cdf[txs_ctx][txb_ctx->txb_skip_ctx], 2); - if (plane == 0 && eob == 0) { - assert(tx_type == DCT_DCT); - } - if (eob == 0) return; - av1_txb_init_levels(tcoeff, width, height, levels); av1_write_tx_type(cm, xd, blk_row, blk_col, plane, tx_size, w); diff --git a/third_party/aom/av1/encoder/encodetxb.h b/third_party/aom/av1/encoder/encodetxb.h index 0442cc613..40ae343b0 100644 --- a/third_party/aom/av1/encoder/encodetxb.h +++ b/third_party/aom/av1/encoder/encodetxb.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef ENCODETXB_H_ -#define ENCODETXB_H_ +#ifndef AOM_AV1_ENCODER_ENCODETXB_H_ +#define AOM_AV1_ENCODER_ENCODETXB_H_ #include "config/aom_config.h" @@ -84,4 +84,4 @@ int av1_optimize_txb_new(const struct AV1_COMP *cpi, MACROBLOCK *x, int plane, } #endif -#endif // COEFFS_CODING_H_ +#endif // AOM_AV1_ENCODER_ENCODETXB_H_ diff --git a/third_party/aom/av1/encoder/ethread.c b/third_party/aom/av1/encoder/ethread.c index 637d6824c..e8ac30bb5 100644 --- a/third_party/aom/av1/encoder/ethread.c +++ b/third_party/aom/av1/encoder/ethread.c @@ -27,7 +27,8 @@ static void accumulate_rd_opt(ThreadData *td, ThreadData *td_t) { td->rd_counts.skip_mode_used_flag |= td_t->rd_counts.skip_mode_used_flag; } -static int enc_worker_hook(EncWorkerData *const thread_data, void *unused) { +static int enc_worker_hook(void *arg1, void *unused) { + EncWorkerData *const thread_data = (EncWorkerData *)arg1; AV1_COMP *const cpi = thread_data->cpi; const AV1_COMMON *const cm = &cpi->common; const int tile_cols = cm->tile_cols; @@ -47,88 +48,141 @@ static int enc_worker_hook(EncWorkerData *const thread_data, void *unused) { return 1; } -void av1_encode_tiles_mt(AV1_COMP *cpi) { +static void create_enc_workers(AV1_COMP *cpi, int num_workers) { AV1_COMMON *const cm = &cpi->common; - const int tile_cols = cm->tile_cols; const AVxWorkerInterface *const winterface = aom_get_worker_interface(); - int num_workers = AOMMIN(cpi->oxcf.max_threads, tile_cols); - int i; - av1_init_tile_data(cpi); + CHECK_MEM_ERROR(cm, cpi->workers, + aom_malloc(num_workers * sizeof(*cpi->workers))); - // Only run once to create threads and allocate thread data. - if (cpi->num_workers == 0) { - CHECK_MEM_ERROR(cm, cpi->workers, - aom_malloc(num_workers * sizeof(*cpi->workers))); + CHECK_MEM_ERROR(cm, cpi->tile_thr_data, + aom_calloc(num_workers, sizeof(*cpi->tile_thr_data))); - CHECK_MEM_ERROR(cm, cpi->tile_thr_data, - aom_calloc(num_workers, sizeof(*cpi->tile_thr_data))); + for (int i = 0; i < num_workers; i++) { + AVxWorker *const worker = &cpi->workers[i]; + EncWorkerData *const thread_data = &cpi->tile_thr_data[i]; - for (i = 0; i < num_workers; i++) { - AVxWorker *const worker = &cpi->workers[i]; - EncWorkerData *const thread_data = &cpi->tile_thr_data[i]; + ++cpi->num_workers; + winterface->init(worker); + + thread_data->cpi = cpi; + + if (i < num_workers - 1) { + // Allocate thread data. + CHECK_MEM_ERROR(cm, thread_data->td, + aom_memalign(32, sizeof(*thread_data->td))); + av1_zero(*thread_data->td); + + // Set up pc_tree. + thread_data->td->pc_tree = NULL; + av1_setup_pc_tree(cm, thread_data->td); + + CHECK_MEM_ERROR(cm, thread_data->td->above_pred_buf, + (uint8_t *)aom_memalign( + 16, MAX_MB_PLANE * MAX_SB_SQUARE * + sizeof(*thread_data->td->above_pred_buf))); + CHECK_MEM_ERROR(cm, thread_data->td->left_pred_buf, + (uint8_t *)aom_memalign( + 16, MAX_MB_PLANE * MAX_SB_SQUARE * + sizeof(*thread_data->td->left_pred_buf))); + + CHECK_MEM_ERROR( + cm, thread_data->td->wsrc_buf, + (int32_t *)aom_memalign( + 16, MAX_SB_SQUARE * sizeof(*thread_data->td->wsrc_buf))); + + for (int x = 0; x < 2; x++) + for (int y = 0; y < 2; y++) + CHECK_MEM_ERROR( + cm, thread_data->td->hash_value_buffer[x][y], + (uint32_t *)aom_malloc( + AOM_BUFFER_SIZE_FOR_BLOCK_HASH * + sizeof(*thread_data->td->hash_value_buffer[0][0]))); + + CHECK_MEM_ERROR( + cm, thread_data->td->mask_buf, + (int32_t *)aom_memalign( + 16, MAX_SB_SQUARE * sizeof(*thread_data->td->mask_buf))); + // Allocate frame counters in thread data. + CHECK_MEM_ERROR(cm, thread_data->td->counts, + aom_calloc(1, sizeof(*thread_data->td->counts))); + + // Allocate buffers used by palette coding mode. + CHECK_MEM_ERROR( + cm, thread_data->td->palette_buffer, + aom_memalign(16, sizeof(*thread_data->td->palette_buffer))); + + CHECK_MEM_ERROR( + cm, thread_data->td->tmp_conv_dst, + aom_memalign(32, MAX_SB_SIZE * MAX_SB_SIZE * + sizeof(*thread_data->td->tmp_conv_dst))); + for (int j = 0; j < 2; ++j) { + CHECK_MEM_ERROR( + cm, thread_data->td->tmp_obmc_bufs[j], + aom_memalign(16, 2 * MAX_MB_PLANE * MAX_SB_SQUARE * + sizeof(*thread_data->td->tmp_obmc_bufs[j]))); + } - ++cpi->num_workers; - winterface->init(worker); + // Create threads + if (!winterface->reset(worker)) + aom_internal_error(&cm->error, AOM_CODEC_ERROR, + "Tile encoder thread creation failed"); + } else { + // Main thread acts as a worker and uses the thread data in cpi. + thread_data->td = &cpi->td; + } + winterface->sync(worker); + } +} - thread_data->cpi = cpi; +static void launch_enc_workers(AV1_COMP *cpi, int num_workers) { + const AVxWorkerInterface *const winterface = aom_get_worker_interface(); + // Encode a frame + for (int i = 0; i < num_workers; i++) { + AVxWorker *const worker = &cpi->workers[i]; + EncWorkerData *const thread_data = (EncWorkerData *)worker->data1; - if (i < num_workers - 1) { - // Allocate thread data. - CHECK_MEM_ERROR(cm, thread_data->td, - aom_memalign(32, sizeof(*thread_data->td))); - av1_zero(*thread_data->td); + // Set the starting tile for each thread. + thread_data->start = i; - // Set up pc_tree. - thread_data->td->pc_tree = NULL; - av1_setup_pc_tree(cm, thread_data->td); + if (i == cpi->num_workers - 1) + winterface->execute(worker); + else + winterface->launch(worker); + } +} - CHECK_MEM_ERROR(cm, thread_data->td->above_pred_buf, - (uint8_t *)aom_memalign( - 16, MAX_MB_PLANE * MAX_SB_SQUARE * - sizeof(*thread_data->td->above_pred_buf))); - CHECK_MEM_ERROR(cm, thread_data->td->left_pred_buf, - (uint8_t *)aom_memalign( - 16, MAX_MB_PLANE * MAX_SB_SQUARE * - sizeof(*thread_data->td->left_pred_buf))); +static void sync_enc_workers(AV1_COMP *cpi, int num_workers) { + const AVxWorkerInterface *const winterface = aom_get_worker_interface(); - CHECK_MEM_ERROR( - cm, thread_data->td->wsrc_buf, - (int32_t *)aom_memalign( - 16, MAX_SB_SQUARE * sizeof(*thread_data->td->wsrc_buf))); - CHECK_MEM_ERROR( - cm, thread_data->td->mask_buf, - (int32_t *)aom_memalign( - 16, MAX_SB_SQUARE * sizeof(*thread_data->td->mask_buf))); - // Allocate frame counters in thread data. - CHECK_MEM_ERROR(cm, thread_data->td->counts, - aom_calloc(1, sizeof(*thread_data->td->counts))); - - // Allocate buffers used by palette coding mode. - CHECK_MEM_ERROR( - cm, thread_data->td->palette_buffer, - aom_memalign(16, sizeof(*thread_data->td->palette_buffer))); - - // Create threads - if (!winterface->reset(worker)) - aom_internal_error(&cm->error, AOM_CODEC_ERROR, - "Tile encoder thread creation failed"); - } else { - // Main thread acts as a worker and uses the thread data in cpi. - thread_data->td = &cpi->td; - } + // Encoding ends. + for (int i = 0; i < num_workers; i++) { + AVxWorker *const worker = &cpi->workers[i]; + winterface->sync(worker); + } +} - winterface->sync(worker); +static void accumulate_counters_enc_workers(AV1_COMP *cpi, int num_workers) { + for (int i = 0; i < num_workers; i++) { + AVxWorker *const worker = &cpi->workers[i]; + EncWorkerData *const thread_data = (EncWorkerData *)worker->data1; + cpi->intrabc_used |= thread_data->td->intrabc_used_this_tile; + // Accumulate counters. + if (i < cpi->num_workers - 1) { + av1_accumulate_frame_counts(&cpi->counts, thread_data->td->counts); + accumulate_rd_opt(&cpi->td, thread_data->td); + cpi->td.mb.txb_split_count += thread_data->td->mb.txb_split_count; } - } else { - num_workers = AOMMIN(num_workers, cpi->num_workers); } +} - for (i = 0; i < num_workers; i++) { +static void prepare_enc_workers(AV1_COMP *cpi, AVxWorkerHook hook, + int num_workers) { + for (int i = 0; i < num_workers; i++) { AVxWorker *const worker = &cpi->workers[i]; EncWorkerData *const thread_data = &cpi->tile_thr_data[i]; - worker->hook = (AVxWorkerHook)enc_worker_hook; + worker->hook = hook; worker->data1 = thread_data; worker->data2 = NULL; @@ -139,47 +193,59 @@ void av1_encode_tiles_mt(AV1_COMP *cpi) { thread_data->td->mb.above_pred_buf = thread_data->td->above_pred_buf; thread_data->td->mb.left_pred_buf = thread_data->td->left_pred_buf; thread_data->td->mb.wsrc_buf = thread_data->td->wsrc_buf; + for (int x = 0; x < 2; x++) { + for (int y = 0; y < 2; y++) { + memcpy(thread_data->td->hash_value_buffer[x][y], + cpi->td.mb.hash_value_buffer[x][y], + AOM_BUFFER_SIZE_FOR_BLOCK_HASH * + sizeof(*thread_data->td->hash_value_buffer[0][0])); + thread_data->td->mb.hash_value_buffer[x][y] = + thread_data->td->hash_value_buffer[x][y]; + } + } thread_data->td->mb.mask_buf = thread_data->td->mask_buf; } if (thread_data->td->counts != &cpi->counts) { memcpy(thread_data->td->counts, &cpi->counts, sizeof(cpi->counts)); } - if (i < num_workers - 1) + if (i < num_workers - 1) { thread_data->td->mb.palette_buffer = thread_data->td->palette_buffer; - } - - // Encode a frame - for (i = 0; i < num_workers; i++) { - AVxWorker *const worker = &cpi->workers[i]; - EncWorkerData *const thread_data = (EncWorkerData *)worker->data1; - - // Set the starting tile for each thread. - thread_data->start = i; + thread_data->td->mb.tmp_conv_dst = thread_data->td->tmp_conv_dst; + for (int j = 0; j < 2; ++j) { + thread_data->td->mb.tmp_obmc_bufs[j] = + thread_data->td->tmp_obmc_bufs[j]; + } - if (i == cpi->num_workers - 1) - winterface->execute(worker); - else - winterface->launch(worker); + thread_data->td->mb.e_mbd.tmp_conv_dst = thread_data->td->mb.tmp_conv_dst; + for (int j = 0; j < 2; ++j) { + thread_data->td->mb.e_mbd.tmp_obmc_bufs[j] = + thread_data->td->mb.tmp_obmc_bufs[j]; + } + } } +} - // Encoding ends. - for (i = 0; i < num_workers; i++) { - AVxWorker *const worker = &cpi->workers[i]; - winterface->sync(worker); - } +void av1_encode_tiles_mt(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + const int tile_cols = cm->tile_cols; + const int tile_rows = cm->tile_rows; + int num_workers = AOMMIN(cpi->oxcf.max_threads, tile_cols * tile_rows); - for (i = 0; i < num_workers; i++) { - AVxWorker *const worker = &cpi->workers[i]; - EncWorkerData *const thread_data = (EncWorkerData *)worker->data1; - cpi->intrabc_used |= thread_data->td->intrabc_used_this_tile; - // Accumulate counters. - if (i < cpi->num_workers - 1) { - av1_accumulate_frame_counts(&cpi->counts, thread_data->td->counts); - accumulate_rd_opt(&cpi->td, thread_data->td); - cpi->td.mb.txb_split_count += thread_data->td->mb.txb_split_count; - } + if (cpi->tile_data == NULL || cpi->allocated_tiles < tile_cols * tile_rows) + av1_alloc_tile_data(cpi); + + av1_init_tile_data(cpi); + // Only run once to create threads and allocate thread data. + if (cpi->num_workers == 0) { + create_enc_workers(cpi, num_workers); + } else { + num_workers = AOMMIN(num_workers, cpi->num_workers); } + prepare_enc_workers(cpi, enc_worker_hook, num_workers); + launch_enc_workers(cpi, num_workers); + sync_enc_workers(cpi, num_workers); + accumulate_counters_enc_workers(cpi, num_workers); } // Accumulate frame counts. FRAME_COUNTS consist solely of 'unsigned int' diff --git a/third_party/aom/av1/encoder/ethread.h b/third_party/aom/av1/encoder/ethread.h index b6b1fed4e..5de4b4803 100644 --- a/third_party/aom/av1/encoder/ethread.h +++ b/third_party/aom/av1/encoder/ethread.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_ENCODER_ETHREAD_H_ -#define AV1_ENCODER_ETHREAD_H_ +#ifndef AOM_AV1_ENCODER_ETHREAD_H_ +#define AOM_AV1_ENCODER_ETHREAD_H_ #ifdef __cplusplus extern "C" { @@ -34,4 +34,4 @@ void av1_accumulate_frame_counts(struct FRAME_COUNTS *acc_counts, } // extern "C" #endif -#endif // AV1_ENCODER_ETHREAD_H_ +#endif // AOM_AV1_ENCODER_ETHREAD_H_ diff --git a/third_party/aom/av1/encoder/extend.h b/third_party/aom/av1/encoder/extend.h index 48178b964..e0432cc97 100644 --- a/third_party/aom/av1/encoder/extend.h +++ b/third_party/aom/av1/encoder/extend.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_ENCODER_EXTEND_H_ -#define AV1_ENCODER_EXTEND_H_ +#ifndef AOM_AV1_ENCODER_EXTEND_H_ +#define AOM_AV1_ENCODER_EXTEND_H_ #include "aom_scale/yv12config.h" #include "aom/aom_integer.h" @@ -29,4 +29,4 @@ void av1_copy_and_extend_frame_with_rect(const YV12_BUFFER_CONFIG *src, } // extern "C" #endif -#endif // AV1_ENCODER_EXTEND_H_ +#endif // AOM_AV1_ENCODER_EXTEND_H_ diff --git a/third_party/aom/av1/encoder/firstpass.c b/third_party/aom/av1/encoder/firstpass.c index ef0800c79..69dd20c52 100644 --- a/third_party/aom/av1/encoder/firstpass.c +++ b/third_party/aom/av1/encoder/firstpass.c @@ -31,6 +31,7 @@ #include "av1/encoder/aq_variance.h" #include "av1/encoder/av1_quantize.h" #include "av1/encoder/block.h" +#include "av1/encoder/dwt.h" #include "av1/encoder/encodeframe.h" #include "av1/encoder/encodemb.h" #include "av1/encoder/encodemv.h" @@ -39,7 +40,7 @@ #include "av1/encoder/firstpass.h" #include "av1/encoder/mcomp.h" #include "av1/encoder/rd.h" -#include "av1/encoder/dwt.h" +#include "av1/encoder/reconinter_enc.h" #define OUTPUT_FPF 0 #define ARF_STATS_OUTPUT 0 @@ -51,9 +52,10 @@ #define FACTOR_PT_LOW 0.70 #define FACTOR_PT_HIGH 0.90 #define FIRST_PASS_Q 10.0 -#define GF_MAX_BOOST 96.0 +#define GF_MAX_BOOST 90.0 #define INTRA_MODE_PENALTY 1024 -#define KF_MAX_BOOST 128.0 +#define KF_MIN_FRAME_BOOST 80.0 +#define KF_MAX_FRAME_BOOST 128.0 #define MIN_ARF_GF_BOOST 240 #define MIN_DECAY_FACTOR 0.01 #define MIN_KF_BOOST 300 @@ -62,6 +64,7 @@ #define DEFAULT_GRP_WEIGHT 1.0 #define RC_FACTOR_MIN 0.75 #define RC_FACTOR_MAX 1.75 +#define MIN_FWD_KF_INTERVAL 8 #define NCOUNT_INTRA_THRESH 8192 #define NCOUNT_INTRA_FACTOR 3 @@ -1562,576 +1565,9 @@ static int calculate_boost_bits(int frame_count, int boost, 0); } -#if USE_GF16_MULTI_LAYER -// === GF Group of 16 === -#define GF_INTERVAL_16 16 -#define GF_FRAME_PARAMS (REF_FRAMES + 5) - -// GF Group of 16: multi-layer hierarchical coding structure -// 1st Layer: Frame 0 and Frame 16 (ALTREF) -// 2nd Layer: Frame 8 (ALTREF2) -// 3rd Layer: Frame 4 and 12 (ALTREF2) -// 4th Layer: Frame 2, 6, 10, and 14 (BWDREF) -// 5th Layer: Frame 1, 3, 5, 7, 9, 11, 13, and 15 -static const unsigned char gf16_multi_layer_params[][GF_FRAME_PARAMS] = { - // gf_group->index: coding order - // (Frame #) : display order - { - // gf_group->index == 0 (Frame 0) - OVERLAY_UPDATE, // update_type - 0, // arf_src_offset - 0, // brf_src_offset - // References (previous ===> current) - LAST_FRAME, // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME] - LAST2_FRAME, // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] - LAST3_FRAME, // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] - GOLDEN_FRAME, // cpi->gld_fb_idx (GOLDEN_FRAME) - BWDREF_FRAME, // cpi->bwd_fb_idx (BWDREF_FRAME) - ALTREF2_FRAME, // cpi->alt2_fb_idx (ALTREF2_FRAME) - ALTREF_FRAME, // cpi->alt_fb_idx (ALTREF_FRAME) - REF_FRAMES, // cpi->ext_fb_idx (extra ref frame) - // Refreshment (index, flag) - ALTREF_FRAME, // Index (current) of reference to get updated - GOLDEN_FRAME // cpi->refresh_golden_frame = 1 - }, - { - // gf_group->index == 1 (Frame 16) - ARF_UPDATE, // update_type - GF_INTERVAL_16 - 1, // arf_src_offset - 0, // brf_src_offset - // Reference frame indexes (previous ===> current) - LAST_FRAME, // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME] - LAST2_FRAME, // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] - LAST3_FRAME, // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] - ALTREF_FRAME, // cpi->alt_fb_idx ===> cpi->gld_fb_idx (GOLDEN_FRAME) - BWDREF_FRAME, // cpi->bwd_fb_idx (BWDREF_FRAME) - ALTREF2_FRAME, // cpi->alt2_fb_idx (ALTREF2_FRAME) - GOLDEN_FRAME, // cpi->gld_fb_idx ===> cpi->alt_fb_idx (ALTREF_FRAME) - REF_FRAMES, // cpi->ext_fb_idx (extra ref frame) - // Refreshment (index, flag) - ALTREF_FRAME, // Index (current) of reference to get updated - ALTREF_FRAME // cpi->refresh_alt_ref_frame = 1 - }, - { - // gf_group->index == 2 (Frame 8) - INTNL_ARF_UPDATE, // update_type - (GF_INTERVAL_16 >> 1) - 1, // arf_src_offset - 0, // brf_src_offset - // Reference frame indexes (previous ===> current) - LAST_FRAME, // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME] - LAST2_FRAME, // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] - LAST3_FRAME, // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] - GOLDEN_FRAME, // cpi->gld_fb_idx (GOLDEN_FRAME) - BWDREF_FRAME, // cpi->bwd_fb_idx (BWDREF_FRAME) - ALTREF2_FRAME, // cpi->alt2_fb_idx (ALTREF2_FRAME) - ALTREF_FRAME, // cpi->alt_fb_idx (ALTREF_FRAME) - REF_FRAMES, // cpi->ext_fb_idx (extra ref frame) - // Refreshment (index, flag) - ALTREF2_FRAME, // Index (current) of reference to get updated - ALTREF2_FRAME // cpi->refresh_alt2_ref_frame = 1 - }, - { - // gf_group->index == 3 (Frame 4) - INTNL_ARF_UPDATE, // update_type - (GF_INTERVAL_16 >> 2) - 1, // arf_src_offset - 0, // brf_src_offset - // Reference frame indexes (previous ===> current) - LAST_FRAME, // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME] - LAST2_FRAME, // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] - LAST3_FRAME, // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] - GOLDEN_FRAME, // cpi->gld_fb_idx (GOLDEN_FRAME) - ALTREF2_FRAME, // cpi->alt2_fb_idx ===> cpi->bwd_fb_idx - // (BWDREF_FRAME) - BWDREF_FRAME, // cpi->bwd_fb_idx ===> cpi->alt2_fb_idx - // (ALTREF2_FRAME) - ALTREF_FRAME, // cpi->alt_fb_idx (ALTREF_FRAME) - REF_FRAMES, // cpi->ext_fb_idx (extra ref frame) - // Refreshment (index, flag) - ALTREF2_FRAME, // Index (current) of reference to get updated - ALTREF2_FRAME // cpi->refresh_alt2_ref_frame = 1 - }, - { - // gf_group->index == 4 (Frame 2) - BRF_UPDATE, // update_type - 0, // arf_src_offset - 1, // brf_src_offset - // Reference frame indexes (previous ===> current) - LAST_FRAME, // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME] - LAST2_FRAME, // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] - LAST3_FRAME, // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] - GOLDEN_FRAME, // cpi->gld_fb_idx (GOLDEN_FRAME) - ALTREF2_FRAME, // cpi->alt2_fb_idx ===> cpi->bwd_fb_idx - // (BWDREF_FRAME) - BWDREF_FRAME, // cpi->bwd_fb_idx ===> cpi->alt2_fb_idx - // (ALTREF2_FRAME) - ALTREF_FRAME, // cpi->alt_fb_idx (ALTREF_FRAME) - REF_FRAMES, // cpi->ext_fb_idx (extra ref frame) - // Refreshment (index, flag) - REF_FRAMES, // Index (current) of reference to get updated - BWDREF_FRAME // cpi->refresh_bwd_ref_frame = 1 - }, - { - // gf_group->index == 5 (Frame 1) - LAST_BIPRED_UPDATE, // update_type - 0, // arf_src_offset - 0, // brf_src_offset - // Reference frame indexes (previous ===> current) - LAST_FRAME, // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME] - LAST2_FRAME, // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] - LAST3_FRAME, // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] - GOLDEN_FRAME, // cpi->gld_fb_idx (GOLDEN_FRAME) - REF_FRAMES, // cpi->ext_fb_idx ===> cpi->bwd_fb_idx (BWDREF_FRAME) - BWDREF_FRAME, // cpi->bwd_fb_idx ===> cpi->alt2_fb_idx (ALTREF2_FRAME) - ALTREF2_FRAME, // cpi->alt2_fb_idx ===> cpi->alt_fb_idx (ALTREF_FRAME) - ALTREF_FRAME, // cpi->alt_fb_idx ===> cpi->ext_fb_idx (extra ref frame) - // Refreshment (index, flag) - LAST3_FRAME, // Index (current) of reference to get updated - LAST_FRAME // cpi->refresh_last_frame = 1 - }, - { - // gf_group->index == 6 (Frame 3) - LF_UPDATE, // update_type - 0, // arf_src_offset - 0, // brf_src_offset - // Reference frame indexes (previous ===> current) - BWDREF_FRAME, // cpi->bwd_fb_idx ===> cpi->lst_fb_idxes[LAST_FRAME - - // LAST_FRAME] - LAST3_FRAME, // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] ===> - // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] - LAST_FRAME, // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME] ===> - // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] - GOLDEN_FRAME, // cpi->gld_fb_idx (GOLDEN_FRAME) - ALTREF2_FRAME, // cpi->alt2_fb_idx ===> cpi->bwd_fb_idx (BWDREF_FRAME) - ALTREF_FRAME, // cpi->alt_fb_idx ===> cpi->alt2_fb_idx (ALTREF2_FRAME) - REF_FRAMES, // cpi->ext_fb_idx ===> cpi->alt_fb_idx (ALTREF_FRAME) - LAST2_FRAME, // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] ===> - // cpi->ext_fb_idx (extra ref frame) - // Refreshment (index, flag) - LAST3_FRAME, // Index (current) of reference to get updated - LAST_FRAME // cpi->refresh_last_frame = 1 - }, - { - // gf_group->index == 7 (Frame 4 - OVERLAY) - INTNL_OVERLAY_UPDATE, // update_type - 0, // arf_src_offset - 0, // brf_src_offset - // Reference frame indexes (previous ===> current) - LAST3_FRAME, // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] ===> - // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME] - LAST_FRAME, // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME] ===> - // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] - LAST2_FRAME, // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] ===> - // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] - GOLDEN_FRAME, // cpi->gld_fb_idx (GOLDEN_FRAME) - BWDREF_FRAME, // cpi->bwd_fb_idx (BWDREF_FRAME) - ALTREF2_FRAME, // cpi->alt2_fb_idx (ALTREF2_FRAME) - ALTREF_FRAME, // cpi->alt_fb_idx (ALTREF_FRAME) - REF_FRAMES, // cpi->ext_fb_idx (extra ref frame) - // Refreshment (index, flag) - BWDREF_FRAME, // Index (current) of reference to get updated - ALTREF2_FRAME // cpi->refresh_alt2_ref_frame = 1 - }, - { - // gf_group->index == 8 (Frame 6) - BRF_UPDATE, // update_type - 0, // arf_src_offset - 1, // brf_src_offset - // Reference frame indexes (previous ===> current) - BWDREF_FRAME, // cpi->bwd_fb_idx ===> cpi->lst_fb_idxes[LAST_FRAME - - // LAST_FRAME] - LAST_FRAME, // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME] ===> - // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] - LAST2_FRAME, // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] ===> - // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] - GOLDEN_FRAME, // cpi->gld_fb_idx (GOLDEN_FRAME) - ALTREF2_FRAME, // cpi->alt2_fb_idx -> cpi->bwd_fb_idx (BWDREF_FRAME) - LAST3_FRAME, // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] ===> - // cpi->alt2_fb_idx (ALTREF2_FRAME) - ALTREF_FRAME, // cpi->alt_fb_idx (ALTREF_FRAME) - REF_FRAMES, // cpi->ext_fb_idx (extra ref frame) - // Refreshment (index, flag) - ALTREF2_FRAME, // Index (current) of reference to get updated - BWDREF_FRAME // cpi->refresh_bwd_frame = 1 - }, - { - // gf_group->index == 9 (Frame 5) - LAST_BIPRED_UPDATE, // update_type - 0, // arf_src_offset - 0, // brf_src_offset - // Reference frame indexes (previous ===> current) - LAST_FRAME, // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME] - LAST2_FRAME, // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] - LAST3_FRAME, // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] - GOLDEN_FRAME, // cpi->gld_fb_idx (GOLDEN_FRAME) - ALTREF2_FRAME, // cpi->alt2_fb_idx ===> cpi->bwd_fb_idx (BWDREF_FRAME) - BWDREF_FRAME, // cpi->bwd_fb_idx ===> cpi->alt2_fb_idx (ALTREF2_FRAME) - ALTREF_FRAME, // cpi->alt_fb_idx (ALTREF_FRAME) - REF_FRAMES, // cpi->ext_fb_idx (extra ref frame) - // Refreshment (index, flag) - LAST3_FRAME, // Index (current) of reference to get updated - LAST_FRAME // cpi->refresh_last_frame = 1 - }, - { - // gf_group->index == 10 (Frame 7) - LF_UPDATE, // update_type - 0, // arf_src_offset - 0, // brf_src_offset - // Reference frame indexes (previous ===> current) - BWDREF_FRAME, // cpi->bwd_fb_idx ===> cpi->lst_fb_idxes[LAST_FRAME - - // LAST_FRAME] - LAST3_FRAME, // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] ===> - // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] - LAST_FRAME, // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME] ===> - // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] - GOLDEN_FRAME, // cpi->gld_fb_idx (GOLDEN_FRAME) - ALTREF2_FRAME, // cpi->alt2_fb_idx ===> cpi->bwd_fb_idx (BWDREF_FRAME) - LAST2_FRAME, // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] ===> - // cpi->alt2_fb_idx (ALTREF2_FRAME) - ALTREF_FRAME, // cpi->alt_fb_idx (ALTREF_FRAME) - REF_FRAMES, // cpi->ext_fb_idx (extra ref frame) - // Refreshment (index, flag) - LAST3_FRAME, // Index (current) of reference to get updated - LAST_FRAME // cpi->refresh_last_frame = 1 - }, - { - // gf_group->index == 11 (Frame 8 - OVERLAY) - INTNL_OVERLAY_UPDATE, // update_type - 0, // arf_src_offset - 0, // brf_src_offset - // Reference frame indexes (previous ===> current) - LAST3_FRAME, // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] ===> - // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME] - LAST_FRAME, // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME] ===> - // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] - LAST2_FRAME, // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] ===> - // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] - GOLDEN_FRAME, // cpi->gld_fb_idx (GOLDEN_FRAME) - BWDREF_FRAME, // cpi->bwd_fb_idx (BWDREF_FRAME) - ALTREF2_FRAME, // cpi->alt2_fb_idx (ALTREF2_FRAME) - ALTREF_FRAME, // cpi->alt_fb_idx (ALTREF_FRAME) - REF_FRAMES, // cpi->ext_fb_idx (extra ref frame) - // Refreshment (index, flag) - BWDREF_FRAME, // Index (current) of reference to get updated - ALTREF2_FRAME // cpi->refresh_alt2_ref_frame = 1 - }, - { - // gf_group->index == 12 (Frame 12) - INTNL_ARF_UPDATE, // update_type - (GF_INTERVAL_16 >> 2) - 1, // arf_src_offset - 0, // brf_src_offset - // Reference frame indexes (previous ===> current) - BWDREF_FRAME, // cpi->bwd_fb_idx ===> cpi->lst_fb_idxes[LAST_FRAME - - // LAST_FRAME] - LAST_FRAME, // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME] ===> - // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] - LAST2_FRAME, // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] ===> - // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] - GOLDEN_FRAME, // cpi->gld_fb_idx (GOLDEN_FRAME) - LAST3_FRAME, // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] ===> - // cpi->bwd_fb_idx (BWDREF_FRAME) - ALTREF2_FRAME, // cpi->alt2_fb_idx (ALTREF2_FRAME) - ALTREF_FRAME, // cpi->alt_fb_idx (ALTREF_FRAME) - REF_FRAMES, // cpi->ext_fb_idx (extra ref frame) - // Refreshment (index, flag) - ALTREF2_FRAME, // Index (current) of reference to get updated - ALTREF2_FRAME // cpi->refresh_alt2_ref_frame = 1 - }, - { - // gf_group->index == 13 (Frame 10) - BRF_UPDATE, // update_type - 0, // arf_src_offset - 1, // brf_src_offset - // Reference frame indexes (previous ===> current) - LAST_FRAME, // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME] - LAST2_FRAME, // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] - LAST3_FRAME, // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] - GOLDEN_FRAME, // cpi->gld_fb_idx (GOLDEN_FRAME) - ALTREF2_FRAME, // cpi->alt2_fb_idx ===> cpi->bwd_fb_idx (BWDREF_FRAME) - BWDREF_FRAME, // cpi->bwd_fb_idx ===> cpi->alt2_fb_idx (ALTREF2_FRAME) - ALTREF_FRAME, // cpi->alt_fb_idx (ALTREF_FRAME) - REF_FRAMES, // cpi->ext_fb_idx (extra ref frame) - // Refreshment (index, flag) - ALTREF2_FRAME, // Index (current) of reference to get updated - BWDREF_FRAME // cpi->refresh_bwd_frame = 1 - }, - { - // gf_group->index == 14 (Frame 9) - LAST_BIPRED_UPDATE, // update_type - 0, // arf_src_offset - 0, // brf_src_offset - // Reference frame indexes (previous ===> current) - LAST_FRAME, // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME] - LAST2_FRAME, // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] - LAST3_FRAME, // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] - GOLDEN_FRAME, // cpi->gld_fb_idx (GOLDEN_FRAME) - ALTREF2_FRAME, // cpi->alt2_fb_idx ===> cpi->bwd_fb_idx (BWDREF_FRAME) - BWDREF_FRAME, // cpi->bwd_fb_idx ===> cpi->alt2_fb_idx (ALTREF2_FRAME) - ALTREF_FRAME, // cpi->alt_fb_idx (ALTREF_FRAME) - REF_FRAMES, // cpi->ext_fb_idx (extra ref frame) - // Refreshment (index, flag) - LAST3_FRAME, // Index (current) of reference to get updated - LAST_FRAME // cpi->refresh_last_frame = 1 - }, - { - // gf_group->index == 15 (Frame 11) - LF_UPDATE, // update_type - 0, // arf_src_offset - 0, // brf_src_offset - // Reference frame indexes (previous ===> current) - BWDREF_FRAME, // cpi->bwd_fb_idx ===> cpi->lst_fb_idxes[LAST_FRAME - - // LAST_FRAME] - LAST3_FRAME, // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] ===> - // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] - LAST_FRAME, // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME] ===> - // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] - GOLDEN_FRAME, // cpi->gld_fb_idx (GOLDEN_FRAME) - ALTREF2_FRAME, // cpi->alt2_fb_idx ===> cpi->bwd_fb_idx (BWDREF_FRAME) - LAST2_FRAME, // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] ===> - // cpi->alt2_fb_idx (ALTREF2_FRAME) - ALTREF_FRAME, // cpi->alt_fb_idx (ALTREF_FRAME) - REF_FRAMES, // cpi->ext_fb_idx (extra ref frame) - // Refreshment (index, flag) - LAST3_FRAME, // Index (current) of reference to get updated - LAST_FRAME // cpi->refresh_last_frame = 1 - }, - { - // gf_group->index == 16 (Frame 12 - OVERLAY) - INTNL_OVERLAY_UPDATE, // update_type - 0, // arf_src_offset - 0, // brf_src_offset - // Reference frame indexes (previous ===> current) - LAST3_FRAME, // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] ===> - // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME] - LAST_FRAME, // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME] ===> - // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] - LAST2_FRAME, // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] ===> - // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] - GOLDEN_FRAME, // cpi->gld_fb_idx (GOLDEN_FRAME) - BWDREF_FRAME, // cpi->bwd_fb_idx (BWDREF_FRAME) - ALTREF2_FRAME, // cpi->alt2_fb_idx (ALTREF2_FRAME) - ALTREF_FRAME, // cpi->alt_fb_idx (ALTREF_FRAME) - REF_FRAMES, // cpi->ext_fb_idx (extra ref frame) - // Refreshment (index, flag) - BWDREF_FRAME, // Index (current) of reference to get updated - ALTREF2_FRAME // cpi->refresh_alt2_ref_frame = 1 - }, - { - // gf_group->index == 17 (Frame 14) - BRF_UPDATE, // update_type - 0, // arf_src_offset - 1, // brf_src_offset - // Reference frame indexes (previous ===> current) - BWDREF_FRAME, // cpi->bwd_fb_idx ===> cpi->lst_fb_idxes[LAST_FRAME - - // LAST_FRAME] - LAST_FRAME, // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME] ===> - // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] - LAST2_FRAME, // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] ===> - // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] - GOLDEN_FRAME, // cpi->gld_fb_idx (GOLDEN_FRAME) - LAST3_FRAME, // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] ===> - // cpi->bwd_fb_idx (BWDREF_FRAME) - ALTREF2_FRAME, // cpi->alt2_fb_idx (ALTREF2_FRAME) - ALTREF_FRAME, // cpi->alt_fb_idx (ALTREF_FRAME) - REF_FRAMES, // cpi->ext_fb_idx (extra ref frame) - // Refreshment (index, flag) - BWDREF_FRAME, // Index (current) of reference to get updated - BWDREF_FRAME // cpi->refresh_bwd_frame = 1 - }, - { - // gf_group->index == 18 (Frame 13) - LAST_BIPRED_UPDATE, // update_type - 0, // arf_src_offset - 0, // brf_src_offset - // Reference frame indexes (previous ===> current) - LAST_FRAME, // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME] - LAST2_FRAME, // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] - LAST3_FRAME, // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] - GOLDEN_FRAME, // cpi->gld_fb_idx (GOLDEN_FRAME) - BWDREF_FRAME, // cpi->bwd_fb_idx (BWDREF_FRAME) - ALTREF2_FRAME, // cpi->alt2_fb_idx (ALTREF2_FRAME) - ALTREF_FRAME, // cpi->alt_fb_idx (ALTREF_FRAME) - REF_FRAMES, // cpi->ext_fb_idx (extra ref frame) - // Refreshment (index, flag) - LAST3_FRAME, // Index (current) of reference to get updated - LAST_FRAME // cpi->refresh_last_frame = 1 - }, - { - // gf_group->index == 19 (Frame 15) - LF_UPDATE, // update_type - 0, // arf_src_offset - 0, // brf_src_offset - // Reference frame indexes (previous ===> current) - BWDREF_FRAME, // cpi->bwd_fb_idx ===> cpi->lst_fb_idxes[LAST_FRAME - - // LAST_FRAME] - LAST3_FRAME, // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] ===> - // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] - LAST_FRAME, // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME] ===> - // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] - GOLDEN_FRAME, // cpi->gld_fb_idx (GOLDEN_FRAME) - LAST2_FRAME, // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] ===> - // cpi->bwd_fb_idx (BWDREF_FRAME) - ALTREF2_FRAME, // cpi->alt2_fb_idx (ALTREF2_FRAME) - ALTREF_FRAME, // cpi->alt_fb_idx (ALTREF_FRAME) - REF_FRAMES, // cpi->ext_fb_idx (extra ref frame) - // Refreshment (index, flag) - LAST3_FRAME, // Index (current) of reference to get updated - LAST_FRAME // cpi->refresh_last_frame = 1 - }, - { - // gf_group->index == 20 (Frame 16 - OVERLAY: Belonging to the next GF - // group) - OVERLAY_UPDATE, // update_type - 0, // arf_src_offset - 0, // brf_src_offset - // Reference frame indexes (previous ===> current) - LAST3_FRAME, // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] ===> - // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME] - LAST_FRAME, // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME] ===> - // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] - LAST2_FRAME, // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] ===> - // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] - GOLDEN_FRAME, // cpi->gld_fb_idx (GOLDEN_FRAME) - BWDREF_FRAME, // cpi->bwd_fb_idx (BWDREF_FRAME) - ALTREF2_FRAME, // cpi->alt2_fb_idx (ALTREF2_FRAME) - ALTREF_FRAME, // cpi->alt_fb_idx (ALTREF_FRAME) - REF_FRAMES, // cpi->ext_fb_idx (extra ref frame) - // Refreshment (index, flag) - ALTREF_FRAME, // Index (current) of reference to get updated - GOLDEN_FRAME // cpi->refresh_golden_frame = 1 - } -}; - -// === GF Group of 16 === -static void define_gf_group_structure_16(AV1_COMP *cpi) { - RATE_CONTROL *const rc = &cpi->rc; - TWO_PASS *const twopass = &cpi->twopass; - GF_GROUP *const gf_group = &twopass->gf_group; - const int key_frame = cpi->common.frame_type == KEY_FRAME; - - assert(rc->baseline_gf_interval == GF_INTERVAL_16); - - // Total number of frames to consider for GF group of 16: - // = GF group interval + number of OVERLAY's - // = rc->baseline_gf_interval + MAX_EXT_ARFS + 1 + 1 - // NOTE: The OVERLAY frame for the next GF group also needs to consider to - // prepare for the reference frame index mapping. - - const int gf_update_frames = rc->baseline_gf_interval + MAX_EXT_ARFS + 2; - - for (int frame_index = 0; frame_index < gf_update_frames; ++frame_index) { - int param_idx = 0; - - // Treat KEY_FRAME differently - if (frame_index == 0 && key_frame) { - gf_group->update_type[frame_index] = KF_UPDATE; - - gf_group->rf_level[frame_index] = KF_STD; - gf_group->arf_src_offset[frame_index] = 0; - gf_group->brf_src_offset[frame_index] = 0; - gf_group->bidir_pred_enabled[frame_index] = 0; - for (int ref_idx = 0; ref_idx < REF_FRAMES; ++ref_idx) - gf_group->ref_fb_idx_map[frame_index][ref_idx] = ref_idx; - gf_group->refresh_idx[frame_index] = cpi->ref_fb_idx[LAST_FRAME - 1]; - gf_group->refresh_flag[frame_index] = cpi->ref_fb_idx[LAST_FRAME - 1]; - - continue; - } - - // == update_type == - gf_group->update_type[frame_index] = - gf16_multi_layer_params[frame_index][param_idx++]; - - // == rf_level == - // Derive rf_level from update_type - switch (gf_group->update_type[frame_index]) { - case LF_UPDATE: gf_group->rf_level[frame_index] = INTER_NORMAL; break; - case ARF_UPDATE: gf_group->rf_level[frame_index] = GF_ARF_LOW; break; - case OVERLAY_UPDATE: - gf_group->rf_level[frame_index] = INTER_NORMAL; - break; - case BRF_UPDATE: gf_group->rf_level[frame_index] = GF_ARF_LOW; break; - case LAST_BIPRED_UPDATE: - gf_group->rf_level[frame_index] = INTER_NORMAL; - break; - case BIPRED_UPDATE: gf_group->rf_level[frame_index] = INTER_NORMAL; break; - case INTNL_ARF_UPDATE: - gf_group->rf_level[frame_index] = GF_ARF_LOW; - break; - case INTNL_OVERLAY_UPDATE: - gf_group->rf_level[frame_index] = INTER_NORMAL; - break; - default: gf_group->rf_level[frame_index] = INTER_NORMAL; break; - } - - // == arf_src_offset == - gf_group->arf_src_offset[frame_index] = - gf16_multi_layer_params[frame_index][param_idx++]; - - // == brf_src_offset == - gf_group->brf_src_offset[frame_index] = - gf16_multi_layer_params[frame_index][param_idx++]; - - // == bidir_pred_enabled == - // Derive bidir_pred_enabled from bidir_src_offset - gf_group->bidir_pred_enabled[frame_index] = - gf_group->brf_src_offset[frame_index] ? 1 : 0; - - // == ref_fb_idx_map == - for (int ref_idx = 0; ref_idx < REF_FRAMES; ++ref_idx) - gf_group->ref_fb_idx_map[frame_index][ref_idx] = - gf16_multi_layer_params[frame_index][param_idx++]; - - // == refresh_idx == - gf_group->refresh_idx[frame_index] = - gf16_multi_layer_params[frame_index][param_idx++]; - - // == refresh_flag == - gf_group->refresh_flag[frame_index] = - gf16_multi_layer_params[frame_index][param_idx]; - } - - // Mark the ARF_UPDATE / INTNL_ARF_UPDATE and OVERLAY_UPDATE / - // INTNL_OVERLAY_UPDATE for rate allocation - // NOTE: Indexes are designed in the display order backward: - // ALT[3] .. ALT[2] .. ALT[1] .. ALT[0], - // but their coding order is as follows: - // ALT0-ALT2-ALT3 .. OVERLAY3 .. OVERLAY2-ALT1 .. OVERLAY1 .. OVERLAY0 - - const int num_arfs_in_gf = cpi->num_extra_arfs + 1; - const int sub_arf_interval = rc->baseline_gf_interval / num_arfs_in_gf; - - // == arf_pos_for_ovrly ==: Position for OVERLAY - for (int arf_idx = 0; arf_idx < num_arfs_in_gf; arf_idx++) { - const int prior_num_arfs = - (arf_idx <= 1) ? num_arfs_in_gf : (num_arfs_in_gf - 1); - cpi->arf_pos_for_ovrly[arf_idx] = - sub_arf_interval * (num_arfs_in_gf - arf_idx) + prior_num_arfs; - } - - // == arf_pos_in_gf ==: Position for ALTREF - cpi->arf_pos_in_gf[0] = 1; - cpi->arf_pos_in_gf[1] = cpi->arf_pos_for_ovrly[2] + 1; - cpi->arf_pos_in_gf[2] = 2; - cpi->arf_pos_in_gf[3] = 3; - - // == arf_update_idx == - // == arf_ref_idx == - // NOTE: Due to the hierarchical nature of GF16, these two parameters only - // relect the index to the nearest future overlay. - int start_frame_index = 0; - for (int arf_idx = (num_arfs_in_gf - 1); arf_idx >= 0; --arf_idx) { - const int end_frame_index = cpi->arf_pos_for_ovrly[arf_idx]; - for (int frame_index = start_frame_index; frame_index <= end_frame_index; - ++frame_index) { - gf_group->arf_update_idx[frame_index] = arf_idx; - gf_group->arf_ref_idx[frame_index] = arf_idx; - } - start_frame_index = end_frame_index + 1; - } -} -#endif // USE_GF16_MULTI_LAYER - #if USE_SYMM_MULTI_LAYER +// #define CHCEK_GF_PARAMETER +#ifdef CHCEK_GF_PARAMETER void check_frame_params(GF_GROUP *const gf_group, int gf_interval, int frame_nums) { static const char *update_type_strings[] = { @@ -2149,9 +1585,15 @@ void check_frame_params(GF_GROUP *const gf_group, int gf_interval, gf_group->arf_src_offset[i], gf_group->arf_pos_in_gf[i], gf_group->arf_update_idx[i], gf_group->pyramid_level[i]); } + + fprintf(fid, "number of nodes in each level: \n"); + for (int i = 0; i < MAX_PYRAMID_LVL; ++i) { + fprintf(fid, "lvl %d: %d ", i, gf_group->pyramid_lvl_nodes[i]); + } + fprintf(fid, "\n"); fclose(fid); } - +#endif // CHCEK_GF_PARAMETER static int update_type_2_rf_level(FRAME_UPDATE_TYPE update_type) { // Derive rf_level from update_type switch (update_type) { @@ -2169,14 +1611,17 @@ static int update_type_2_rf_level(FRAME_UPDATE_TYPE update_type) { static void set_multi_layer_params(GF_GROUP *const gf_group, int l, int r, int *frame_ind, int arf_ind, int level) { - if (r - l == 2) { - // leaf node, not a look-ahead frame - gf_group->update_type[*frame_ind] = LF_UPDATE; - gf_group->arf_src_offset[*frame_ind] = 0; - gf_group->arf_pos_in_gf[*frame_ind] = 0; - gf_group->arf_update_idx[*frame_ind] = arf_ind; - gf_group->pyramid_level[*frame_ind] = level; - ++(*frame_ind); + if (r - l < 4) { + while (++l < r) { + // leaf nodes, not a look-ahead frame + gf_group->update_type[*frame_ind] = LF_UPDATE; + gf_group->arf_src_offset[*frame_ind] = 0; + gf_group->arf_pos_in_gf[*frame_ind] = 0; + gf_group->arf_update_idx[*frame_ind] = arf_ind; + gf_group->pyramid_level[*frame_ind] = 0; + ++gf_group->pyramid_lvl_nodes[0]; + ++(*frame_ind); + } } else { int m = (l + r) / 2; int arf_pos_in_gf = *frame_ind; @@ -2186,6 +1631,7 @@ static void set_multi_layer_params(GF_GROUP *const gf_group, int l, int r, gf_group->arf_pos_in_gf[*frame_ind] = 0; gf_group->arf_update_idx[*frame_ind] = 1; // mark all internal ARF 1 gf_group->pyramid_level[*frame_ind] = level; + ++gf_group->pyramid_lvl_nodes[level]; ++(*frame_ind); // set parameters for frames displayed before this frame @@ -2209,7 +1655,7 @@ static INLINE unsigned char get_pyramid_height(int pyramid_width) { assert(pyramid_width <= 16 && pyramid_width >= 4 && "invalid gf interval for pyramid structure"); - return pyramid_width == 16 ? 4 : (pyramid_width >= 8 ? 3 : 2); + return pyramid_width > 12 ? 4 : (pyramid_width > 6 ? 3 : 2); } static int construct_multi_layer_gf_structure(GF_GROUP *const gf_group, @@ -2217,6 +1663,10 @@ static int construct_multi_layer_gf_structure(GF_GROUP *const gf_group, int frame_index = 0; gf_group->pyramid_height = get_pyramid_height(gf_interval); + assert(gf_group->pyramid_height <= MAX_PYRAMID_LVL); + + av1_zero_array(gf_group->pyramid_lvl_nodes, MAX_PYRAMID_LVL); + // At the beginning of each GF group it will be a key or overlay frame, gf_group->update_type[frame_index] = OVERLAY_UPDATE; gf_group->arf_src_offset[frame_index] = 0; @@ -2236,9 +1686,6 @@ static int construct_multi_layer_gf_structure(GF_GROUP *const gf_group, // set parameters for the rest of the frames set_multi_layer_params(gf_group, 0, gf_interval, &frame_index, 0, gf_group->pyramid_height - 1); - - // check_frame_params(gf_group, gf_interval, frame_index); - return frame_index; } @@ -2248,8 +1695,8 @@ void define_customized_gf_group_structure(AV1_COMP *cpi) { GF_GROUP *const gf_group = &twopass->gf_group; const int key_frame = cpi->common.frame_type == KEY_FRAME; - assert(rc->baseline_gf_interval == 4 || rc->baseline_gf_interval == 8 || - rc->baseline_gf_interval == 16); + assert(rc->baseline_gf_interval >= 4 && + rc->baseline_gf_interval <= MAX_PYRAMID_SIZE); const int gf_update_frames = construct_multi_layer_gf_structure(gf_group, rc->baseline_gf_interval); @@ -2305,8 +1752,9 @@ void define_customized_gf_group_structure(AV1_COMP *cpi) { // This parameter is useless? gf_group->arf_ref_idx[frame_index] = 0; - +#ifdef CHCEK_GF_PARAMETER check_frame_params(gf_group, rc->baseline_gf_interval, gf_update_frames); +#endif } // It is an example of how to define a GF stucture manually. The function will @@ -2447,16 +1895,10 @@ static int define_gf_group_structure_4(AV1_COMP *cpi) { static void define_gf_group_structure(AV1_COMP *cpi) { RATE_CONTROL *const rc = &cpi->rc; -#if USE_GF16_MULTI_LAYER - if (rc->baseline_gf_interval == 16) { - define_gf_group_structure_16(cpi); - return; - } -#endif // USE_GF16_MULTI_LAYER #if USE_SYMM_MULTI_LAYER - const int valid_customized_gf_length = rc->baseline_gf_interval == 4 || - rc->baseline_gf_interval == 8 || - rc->baseline_gf_interval == 16; + const int valid_customized_gf_length = + rc->baseline_gf_interval >= 4 && + rc->baseline_gf_interval <= MAX_PYRAMID_SIZE; // used the new structure only if extra_arf is allowed if (valid_customized_gf_length && rc->source_alt_ref_pending && cpi->extra_arf_allowed > 0) { @@ -2685,6 +2127,18 @@ static void define_gf_group_structure(AV1_COMP *cpi) { gf_group->brf_src_offset[frame_index] = 0; } +#if USE_SYMM_MULTI_LAYER +#define LEAF_REDUCTION_FACTOR 0.75f +#define LVL_3_BOOST_FACTOR 0.8f +#define LVL_2_BOOST_FACTOR 0.3f + +static float_t lvl_budget_factor[MAX_PYRAMID_LVL - 1][MAX_PYRAMID_LVL - 1] = { + { 1, 0, 0 }, + { LVL_3_BOOST_FACTOR, 0, 0 }, // Leaking budget works better + { LVL_3_BOOST_FACTOR, (1 - LVL_3_BOOST_FACTOR) * LVL_2_BOOST_FACTOR, + (1 - LVL_3_BOOST_FACTOR) * (1 - LVL_2_BOOST_FACTOR) } +}; +#endif // USE_SYMM_MULTI_LAYER static void allocate_gf_group_bits(AV1_COMP *cpi, int64_t gf_group_bits, double group_error, int gf_arf_bits) { RATE_CONTROL *const rc = &cpi->rc; @@ -2771,20 +2225,39 @@ static void allocate_gf_group_bits(AV1_COMP *cpi, int64_t gf_group_bits, // BIPRED_UPDATE frames need to be further adjusted. gf_group->bit_allocation[frame_index] = target_frame_size; #if USE_SYMM_MULTI_LAYER - } else if (cpi->new_bwdref_update_rule == 1 && + } else if (cpi->new_bwdref_update_rule && gf_group->update_type[frame_index] == INTNL_OVERLAY_UPDATE) { + assert(gf_group->pyramid_height <= MAX_PYRAMID_LVL && + gf_group->pyramid_height >= 0 && + "non-valid height for a pyramid structure"); + int arf_pos = gf_group->arf_pos_in_gf[frame_index]; gf_group->bit_allocation[frame_index] = 0; - // Tried boosting up the allocated bits on backward reference frame - // by (target_frame_size >> 2) as in the original setting. However it - // does not bring gains for pyramid structure with GF length = 16. gf_group->bit_allocation[arf_pos] = target_frame_size; -#endif +#if MULTI_LVL_BOOST_VBR_CQ + const int pyr_h = gf_group->pyramid_height - 2; + const int this_lvl = gf_group->pyramid_level[arf_pos]; + const int dist2top = gf_group->pyramid_height - 1 - this_lvl; + + const float_t budget = + LEAF_REDUCTION_FACTOR * gf_group->pyramid_lvl_nodes[0]; + const float_t lvl_boost = budget * lvl_budget_factor[pyr_h][dist2top] / + gf_group->pyramid_lvl_nodes[this_lvl]; + + gf_group->bit_allocation[arf_pos] += (int)(target_frame_size * lvl_boost); +#endif // MULTI_LVL_BOOST_VBR_CQ +#endif // USE_SYMM_MULTI_LAYER } else { assert(gf_group->update_type[frame_index] == LF_UPDATE || gf_group->update_type[frame_index] == INTNL_OVERLAY_UPDATE); gf_group->bit_allocation[frame_index] = target_frame_size; +#if MULTI_LVL_BOOST_VBR_CQ + if (cpi->new_bwdref_update_rule) { + gf_group->bit_allocation[frame_index] -= + (int)(target_frame_size * LEAF_REDUCTION_FACTOR); + } +#endif // MULTI_LVL_BOOST_VBR_CQ } ++frame_index; @@ -2833,9 +2306,11 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) { int i; double boost_score = 0.0; -#if !FIX_GF_INTERVAL_LENGTH +#if !CONFIG_FIX_GF_LENGTH double old_boost_score = 0.0; double mv_ratio_accumulator_thresh; + int active_max_gf_interval; + int active_min_gf_interval; #endif double gf_group_err = 0.0; #if GROUP_ADAPTIVE_MAXQ @@ -2862,8 +2337,6 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) { int f_boost = 0; int b_boost = 0; int flash_detected; - int active_max_gf_interval; - int active_min_gf_interval; int64_t gf_group_bits; double gf_group_error_left; int gf_arf_bits; @@ -2898,11 +2371,10 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) { gf_group_skip_pct -= this_frame->intra_skip_pct; gf_group_inactive_zone_rows -= this_frame->inactive_zone_rows; } -#if !FIX_GF_INTERVAL_LENGTH +#if !CONFIG_FIX_GF_LENGTH // Motion breakout threshold for loop below depends on image size. mv_ratio_accumulator_thresh = (cpi->initial_height + cpi->initial_width) / 4.0; -#endif // Set a maximum and minimum interval for the GF group. // If the image appears almost completely static we can extend beyond this. { @@ -2915,23 +2387,19 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) { if (active_min_gf_interval > rc->max_gf_interval) active_min_gf_interval = rc->max_gf_interval; - if (cpi->multi_arf_allowed) { + // The value chosen depends on the active Q range. At low Q we have + // bits to spare and are better with a smaller interval and smaller boost. + // At high Q when there are few bits to spare we are better with a longer + // interval to spread the cost of the GF. + active_max_gf_interval = 12 + AOMMIN(4, (int_lbq / 6)); + + // We have: active_min_gf_interval <= rc->max_gf_interval + if (active_max_gf_interval < active_min_gf_interval) + active_max_gf_interval = active_min_gf_interval; + else if (active_max_gf_interval > rc->max_gf_interval) active_max_gf_interval = rc->max_gf_interval; - } else { - // The value chosen depends on the active Q range. At low Q we have - // bits to spare and are better with a smaller interval and smaller boost. - // At high Q when there are few bits to spare we are better with a longer - // interval to spread the cost of the GF. - active_max_gf_interval = 12 + AOMMIN(4, (int_lbq / 6)); - - // We have: active_min_gf_interval <= rc->max_gf_interval - if (active_max_gf_interval < active_min_gf_interval) - active_max_gf_interval = active_min_gf_interval; - else if (active_max_gf_interval > rc->max_gf_interval) - active_max_gf_interval = rc->max_gf_interval; - } } - +#endif // !CONFIG_FIX_GF_LENGTH double avg_sr_coded_error = 0; double avg_raw_err_stdev = 0; int non_zero_stdev_count = 0; @@ -2990,10 +2458,10 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) { boost_score += decay_accumulator * calc_frame_boost(cpi, &next_frame, this_frame_mv_in_out, GF_MAX_BOOST); -#if FIX_GF_INTERVAL_LENGTH +#if CONFIG_FIX_GF_LENGTH if (i == (FIXED_GF_LENGTH + 1)) break; #else - // Skip breaking condition for FIX_GF_INTERVAL_LENGTH + // Skip breaking condition for CONFIG_FIX_GF_LENGTH // Break out conditions. if ( // Break at active_max_gf_interval unless almost totally static. @@ -3017,7 +2485,7 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) { } } old_boost_score = boost_score; -#endif // FIX_GF_INTERVAL_LENGTH +#endif // CONFIG_FIX_GF_LENGTH *this_frame = next_frame; } twopass->gf_zeromotion_pct = (int)(zero_motion_accumulator * 1000.0); @@ -3030,44 +2498,116 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) { assert(num_mbs > 0); if (i) avg_sr_coded_error /= i; + if (non_zero_stdev_count) avg_raw_err_stdev /= non_zero_stdev_count; + + // Disable extra altrefs and backward refs for "still" gf group: + // zero_motion_accumulator: minimum percentage of (0,0) motion; + // avg_sr_coded_error: average of the SSE per pixel of each frame; + // avg_raw_err_stdev: average of the standard deviation of (0,0) + // motion error per block of each frame. + const int disable_bwd_extarf = + (zero_motion_accumulator > MIN_ZERO_MOTION && + avg_sr_coded_error / num_mbs < MAX_SR_CODED_ERROR && + avg_raw_err_stdev < MAX_RAW_ERR_VAR); + + if (disable_bwd_extarf) cpi->extra_arf_allowed = 0; + +#define REDUCE_GF_LENGTH_THRESH 4 +#define REDUCE_GF_LENGTH_TO_KEY_THRESH 9 +#define REDUCE_GF_LENGTH_BY 1 + int alt_offset = 0; +#if REDUCE_LAST_GF_LENGTH + // TODO(weitinglin): The length reduction stretagy is tweaking using AOM_Q + // mode, and hurting the performance of VBR mode. We need to investigate how + // to adjust GF length for other modes. + + int allow_gf_length_reduction = + cpi->oxcf.rc_mode == AOM_Q || cpi->extra_arf_allowed == 0; + + // We are going to have an alt ref, but we don't have do adjustment for + // lossless mode + if (allow_alt_ref && allow_gf_length_reduction && + (i < cpi->oxcf.lag_in_frames) && (i >= rc->min_gf_interval) && + !is_lossless_requested(&cpi->oxcf)) { + // adjust length of this gf group if one of the following condition met + // 1: only one overlay frame left and this gf is too long + // 2: next gf group is too short to have arf compared to the current gf + + // maximum length of next gf group + const int next_gf_len = rc->frames_to_key - i; + const int single_overlay_left = + next_gf_len == 0 && i > REDUCE_GF_LENGTH_THRESH; + // the next gf is probably going to have a ARF but it will be shorter than + // this gf + const int unbalanced_gf = + i > REDUCE_GF_LENGTH_TO_KEY_THRESH && + next_gf_len + 1 < REDUCE_GF_LENGTH_TO_KEY_THRESH && + next_gf_len + 1 >= rc->min_gf_interval; + + if (single_overlay_left || unbalanced_gf) { + // Note: Tried roll_back = DIVIDE_AND_ROUND(i, 8), but is does not work + // better in the current setting + const int roll_back = REDUCE_GF_LENGTH_BY; + alt_offset = -roll_back; + i -= roll_back; + } + } +#endif + // Should we use the alternate reference frame. if (allow_alt_ref && (i < cpi->oxcf.lag_in_frames) && (i >= rc->min_gf_interval)) { // Calculate the boost for alt ref. rc->gfu_boost = - calc_arf_boost(cpi, 0, (i - 1), (i - 1), &f_boost, &b_boost); + calc_arf_boost(cpi, alt_offset, (i - 1), (i - 1), &f_boost, &b_boost); rc->source_alt_ref_pending = 1; + + // do not replace ARFs with overlay frames, and keep it as GOLDEN_REF + cpi->preserve_arf_as_gld = 1; } else { rc->gfu_boost = AOMMAX((int)boost_score, MIN_ARF_GF_BOOST); rc->source_alt_ref_pending = 0; + cpi->preserve_arf_as_gld = 0; } // Set the interval until the next gf. - if (cpi->oxcf.fwd_kf_enabled) { - // Ensure the gf group before the next keyframe will contain an altref - if ((rc->frames_to_key - i < rc->min_gf_interval) && - (rc->frames_to_key != i)) { - rc->baseline_gf_interval = AOMMIN(rc->frames_to_key - rc->min_gf_interval, - rc->static_scene_max_gf_interval); - } else { + // If forward keyframes are enabled, ensure the final gf group obeys the + // MIN_FWD_KF_INTERVAL. + if (cpi->oxcf.fwd_kf_enabled && + ((twopass->stats_in - i + rc->frames_to_key) < twopass->stats_in_end)) { + if (i == rc->frames_to_key) { rc->baseline_gf_interval = i; + // if the last gf group will be smaller than MIN_FWD_KF_INTERVAL + } else if ((rc->frames_to_key - i < + AOMMAX(MIN_FWD_KF_INTERVAL, rc->min_gf_interval)) && + (rc->frames_to_key != i)) { + // if possible, merge the last two gf groups + if (rc->frames_to_key <= MAX_PYRAMID_SIZE) { + rc->baseline_gf_interval = rc->frames_to_key; + // if merging the last two gf groups creates a group that is too long, + // split them and force the last gf group to be the MIN_FWD_KF_INTERVAL + } else { + rc->baseline_gf_interval = rc->frames_to_key - MIN_FWD_KF_INTERVAL; + } + } else { + rc->baseline_gf_interval = + i - (is_key_frame || rc->source_alt_ref_pending); } } else { rc->baseline_gf_interval = i - (is_key_frame || rc->source_alt_ref_pending); } - if (non_zero_stdev_count) avg_raw_err_stdev /= non_zero_stdev_count; - // Disable extra altrefs and backward refs for "still" gf group: - // zero_motion_accumulator: minimum percentage of (0,0) motion; - // avg_sr_coded_error: average of the SSE per pixel of each frame; - // avg_raw_err_stdev: average of the standard deviation of (0,0) - // motion error per block of each frame. - const int disable_bwd_extarf = - (zero_motion_accumulator > MIN_ZERO_MOTION && - avg_sr_coded_error / num_mbs < MAX_SR_CODED_ERROR && - avg_raw_err_stdev < MAX_RAW_ERR_VAR); - - if (disable_bwd_extarf) cpi->extra_arf_allowed = 0; +#if REDUCE_LAST_ALT_BOOST +#define LAST_ALR_BOOST_FACTOR 0.2f + rc->arf_boost_factor = 1.0; + if (rc->source_alt_ref_pending && !is_lossless_requested(&cpi->oxcf)) { + // Reduce the boost of altref in the last gf group + if (rc->frames_to_key - i == REDUCE_GF_LENGTH_BY || + rc->frames_to_key - i == 0) { + rc->arf_boost_factor = LAST_ALR_BOOST_FACTOR; + } + } +#endif if (!cpi->extra_arf_allowed) { cpi->num_extra_arfs = 0; @@ -3439,6 +2979,11 @@ static void find_next_key_frame(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) { // how many bits to spend on it. decay_accumulator = 1.0; boost_score = 0.0; + const double kf_max_boost = + cpi->oxcf.rc_mode == AOM_Q + ? AOMMIN(AOMMAX(rc->frames_to_key * 2.0, KF_MIN_FRAME_BOOST), + KF_MAX_FRAME_BOOST) + : KF_MAX_FRAME_BOOST; for (i = 0; i < (rc->frames_to_key - 1); ++i) { if (EOF == input_stats(twopass, &next_frame)) break; @@ -3450,7 +2995,7 @@ static void find_next_key_frame(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) { if ((i <= rc->max_gf_interval) || ((i <= (rc->max_gf_interval * 4)) && (decay_accumulator > 0.5))) { const double frame_boost = - calc_frame_boost(cpi, this_frame, 0, KF_MAX_BOOST); + calc_frame_boost(cpi, this_frame, 0, kf_max_boost); // How fast is prediction quality decaying. if (!detect_flash(twopass, 0)) { @@ -3513,147 +3058,6 @@ static void find_next_key_frame(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) { twopass->modified_error_left -= kf_group_err; } -#if USE_GF16_MULTI_LAYER -// === GF Group of 16 === -void av1_ref_frame_map_idx_updates(AV1_COMP *cpi, int gf_frame_index) { - TWO_PASS *const twopass = &cpi->twopass; - GF_GROUP *const gf_group = &twopass->gf_group; - - int ref_fb_idx_prev[REF_FRAMES]; - int ref_fb_idx_curr[REF_FRAMES]; - - for (int ref_frame = 0; ref_frame < REF_FRAMES; ++ref_frame) { - ref_fb_idx_prev[ref_frame] = cpi->ref_fb_idx[ref_frame]; - } - - // Update map index for each reference frame - for (int ref_idx = 0; ref_idx < REF_FRAMES; ++ref_idx) { - int ref_frame = gf_group->ref_fb_idx_map[gf_frame_index][ref_idx]; - ref_fb_idx_curr[ref_idx] = ref_fb_idx_prev[ref_frame - LAST_FRAME]; - } - - for (int ref_frame = 0; ref_frame < REF_FRAMES; ++ref_frame) { - cpi->ref_fb_idx[ref_frame] = ref_fb_idx_curr[ref_frame]; - } -} - -// Define the reference buffers that will be updated post encode. -static void configure_buffer_updates_16(AV1_COMP *cpi) { - TWO_PASS *const twopass = &cpi->twopass; - GF_GROUP *const gf_group = &twopass->gf_group; - - if (gf_group->update_type[gf_group->index] == KF_UPDATE) { - cpi->refresh_fb_idx = 0; - - cpi->refresh_last_frame = 1; - cpi->refresh_golden_frame = 1; - cpi->refresh_bwd_ref_frame = 1; - cpi->refresh_alt2_ref_frame = 1; - cpi->refresh_alt_ref_frame = 1; - - return; - } - - // Update reference frame map indexes - av1_ref_frame_map_idx_updates(cpi, gf_group->index); - - // Update refresh index - switch (gf_group->refresh_idx[gf_group->index]) { - case LAST_FRAME: - cpi->refresh_fb_idx = cpi->ref_fb_idx[LAST_FRAME - LAST_FRAME]; - break; - - case LAST2_FRAME: - cpi->refresh_fb_idx = cpi->ref_fb_idx[LAST2_FRAME - LAST_FRAME]; - break; - - case LAST3_FRAME: - cpi->refresh_fb_idx = cpi->ref_fb_idx[LAST3_FRAME - LAST_FRAME]; - break; - - case GOLDEN_FRAME: - cpi->refresh_fb_idx = cpi->ref_fb_idx[GOLDEN_FRAME - 1]; - break; - - case BWDREF_FRAME: - cpi->refresh_fb_idx = cpi->ref_fb_idx[BWDREF_FRAME - 1]; - break; - - case ALTREF2_FRAME: - cpi->refresh_fb_idx = cpi->ref_fb_idx[ALTREF2_FRAME - 1]; - break; - - case ALTREF_FRAME: - cpi->refresh_fb_idx = cpi->ref_fb_idx[ALTREF_FRAME - 1]; - break; - - case REF_FRAMES: - cpi->refresh_fb_idx = cpi->ref_fb_idx[REF_FRAMES - 1]; - break; - - default: assert(0); break; - } - - // Update refresh flags - switch (gf_group->refresh_flag[gf_group->index]) { - case LAST_FRAME: - cpi->refresh_last_frame = 1; - cpi->refresh_golden_frame = 0; - cpi->refresh_bwd_ref_frame = 0; - cpi->refresh_alt2_ref_frame = 0; - cpi->refresh_alt_ref_frame = 0; - break; - - case GOLDEN_FRAME: - cpi->refresh_last_frame = 0; - cpi->refresh_golden_frame = 1; - cpi->refresh_bwd_ref_frame = 0; - cpi->refresh_alt2_ref_frame = 0; - cpi->refresh_alt_ref_frame = 0; - break; - - case BWDREF_FRAME: - cpi->refresh_last_frame = 0; - cpi->refresh_golden_frame = 0; - cpi->refresh_bwd_ref_frame = 1; - cpi->refresh_alt2_ref_frame = 0; - cpi->refresh_alt_ref_frame = 0; - break; - - case ALTREF2_FRAME: - cpi->refresh_last_frame = 0; - cpi->refresh_golden_frame = 0; - cpi->refresh_bwd_ref_frame = 0; - cpi->refresh_alt2_ref_frame = 1; - cpi->refresh_alt_ref_frame = 0; - break; - - case ALTREF_FRAME: - cpi->refresh_last_frame = 0; - cpi->refresh_golden_frame = 0; - cpi->refresh_bwd_ref_frame = 0; - cpi->refresh_alt2_ref_frame = 0; - cpi->refresh_alt_ref_frame = 1; - break; - - default: assert(0); break; - } - - switch (gf_group->update_type[gf_group->index]) { - case BRF_UPDATE: cpi->rc.is_bwd_ref_frame = 1; break; - - case LAST_BIPRED_UPDATE: cpi->rc.is_last_bipred_frame = 1; break; - - case BIPRED_UPDATE: cpi->rc.is_bipred_frame = 1; break; - - case INTNL_OVERLAY_UPDATE: cpi->rc.is_src_frame_ext_arf = 1; - case OVERLAY_UPDATE: cpi->rc.is_src_frame_alt_ref = 1; break; - - default: break; - } -} -#endif // USE_GF16_MULTI_LAYER - // Define the reference buffers that will be updated post encode. static void configure_buffer_updates(AV1_COMP *cpi) { TWO_PASS *const twopass = &cpi->twopass; @@ -3667,14 +3071,6 @@ static void configure_buffer_updates(AV1_COMP *cpi) { cpi->rc.is_bipred_frame = 0; cpi->rc.is_src_frame_ext_arf = 0; -#if USE_GF16_MULTI_LAYER - RATE_CONTROL *const rc = &cpi->rc; - if (rc->baseline_gf_interval == 16) { - configure_buffer_updates_16(cpi); - return; - } -#endif // USE_GF16_MULTI_LAYER - switch (twopass->gf_group.update_type[twopass->gf_group.index]) { case KF_UPDATE: cpi->refresh_last_frame = 1; @@ -3979,8 +3375,7 @@ void av1_rc_get_second_pass_params(AV1_COMP *cpi) { : cpi->common.MBs; // The multiplication by 256 reverses a scaling factor of (>> 8) // applied when combining MB error values for the frame. - twopass->mb_av_energy = - log(((this_frame.intra_error * 256.0) / num_mbs) + 1.0); + twopass->mb_av_energy = log((this_frame.intra_error / num_mbs) + 1.0); twopass->frame_avg_haar_energy = log((this_frame.frame_avg_wavelet_energy / num_mbs) + 1.0); } @@ -4020,9 +3415,6 @@ void av1_twopass_postencode_update(AV1_COMP *cpi) { } twopass->kf_group_bits = AOMMAX(twopass->kf_group_bits, 0); - // Increment the gf group index ready for the next frame. - ++twopass->gf_group.index; - // If the rate control is drifting consider adjustment to min or maxq. if ((cpi->oxcf.rc_mode != AOM_Q) && (cpi->twopass.gf_zeromotion_pct < VLOW_MOTION_THRESHOLD) && diff --git a/third_party/aom/av1/encoder/firstpass.h b/third_party/aom/av1/encoder/firstpass.h index b0c1a21e4..4b7325ae2 100644 --- a/third_party/aom/av1/encoder/firstpass.h +++ b/third_party/aom/av1/encoder/firstpass.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_ENCODER_FIRSTPASS_H_ -#define AV1_ENCODER_FIRSTPASS_H_ +#ifndef AOM_AV1_ENCODER_FIRSTPASS_H_ +#define AOM_AV1_ENCODER_FIRSTPASS_H_ #include "av1/common/enums.h" #include "av1/common/onyxc_int.h" @@ -47,15 +47,7 @@ typedef struct { // number of bi-predictive frames. #define BFG_INTERVAL 2 // The maximum number of extra ALTREF's except ALTREF_FRAME -// NOTE: REF_FRAMES indicates the maximum number of frames that may be buffered -// to serve as references. Currently REF_FRAMES == 8. -#define USE_GF16_MULTI_LAYER 0 - -#if USE_GF16_MULTI_LAYER -#define MAX_EXT_ARFS (REF_FRAMES - BWDREF_FRAME) -#else // !USE_GF16_MULTI_LAYER #define MAX_EXT_ARFS (REF_FRAMES - BWDREF_FRAME - 1) -#endif // USE_GF16_MULTI_LAYER #define MIN_EXT_ARF_INTERVAL 4 @@ -126,6 +118,7 @@ typedef struct { unsigned char arf_pos_in_gf[(MAX_LAG_BUFFERS * 2) + 1]; unsigned char pyramid_level[(MAX_LAG_BUFFERS * 2) + 1]; unsigned char pyramid_height; + unsigned char pyramid_lvl_nodes[MAX_PYRAMID_LVL]; #endif unsigned char brf_src_offset[(MAX_LAG_BUFFERS * 2) + 1]; unsigned char bidir_pred_enabled[(MAX_LAG_BUFFERS * 2) + 1]; @@ -197,10 +190,6 @@ void av1_configure_buffer_updates_firstpass(struct AV1_COMP *cpi, // Post encode update of the rate control parameters for 2-pass void av1_twopass_postencode_update(struct AV1_COMP *cpi); -#if USE_GF16_MULTI_LAYER -void av1_ref_frame_map_idx_updates(struct AV1_COMP *cpi, int gf_frame_index); -#endif // USE_GF16_MULTI_LAYER - static INLINE int get_number_of_extra_arfs(int interval, int arf_pending) { if (arf_pending && MAX_EXT_ARFS > 0) return interval >= MIN_EXT_ARF_INTERVAL * (MAX_EXT_ARFS + 1) @@ -216,4 +205,4 @@ static INLINE int get_number_of_extra_arfs(int interval, int arf_pending) { } // extern "C" #endif -#endif // AV1_ENCODER_FIRSTPASS_H_ +#endif // AOM_AV1_ENCODER_FIRSTPASS_H_ diff --git a/third_party/aom/av1/encoder/global_motion.c b/third_party/aom/av1/encoder/global_motion.c index f07d1bc00..e9f8b0bb4 100644 --- a/third_party/aom/av1/encoder/global_motion.c +++ b/third_party/aom/av1/encoder/global_motion.c @@ -32,8 +32,8 @@ // Border over which to compute the global motion #define ERRORADV_BORDER 0 -static const double erroradv_tr[] = { 0.75, 0.70, 0.65 }; -static const double erroradv_prod_tr[] = { 22000, 20000, 18000 }; +static const double erroradv_tr[] = { 0.65, 0.60, 0.55 }; +static const double erroradv_prod_tr[] = { 20000, 18000, 16000 }; int is_enough_erroradvantage(double best_erroradvantage, int params_cost, int erroradv_type) { diff --git a/third_party/aom/av1/encoder/global_motion.h b/third_party/aom/av1/encoder/global_motion.h index 2c15753fd..c7c016c43 100644 --- a/third_party/aom/av1/encoder/global_motion.h +++ b/third_party/aom/av1/encoder/global_motion.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_ENCODER_GLOBAL_MOTION_H_ -#define AV1_ENCODER_GLOBAL_MOTION_H_ +#ifndef AOM_AV1_ENCODER_GLOBAL_MOTION_H_ +#define AOM_AV1_ENCODER_GLOBAL_MOTION_H_ #include "aom/aom_integer.h" #include "aom_scale/yv12config.h" @@ -61,4 +61,4 @@ int compute_global_motion_feature_based(TransformationType type, #ifdef __cplusplus } // extern "C" #endif -#endif // AV1_ENCODER_GLOBAL_MOTION_H_ +#endif // AOM_AV1_ENCODER_GLOBAL_MOTION_H_ diff --git a/third_party/aom/av1/encoder/grain_test_vectors.h b/third_party/aom/av1/encoder/grain_test_vectors.h index 45632da9b..945dc3733 100644 --- a/third_party/aom/av1/encoder/grain_test_vectors.h +++ b/third_party/aom/av1/encoder/grain_test_vectors.h @@ -8,8 +8,8 @@ * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_GRAIN_TEST_VECTORS_H_ -#define AV1_GRAIN_TEST_VECTORS_H_ +#ifndef AOM_AV1_ENCODER_GRAIN_TEST_VECTORS_H_ +#define AOM_AV1_ENCODER_GRAIN_TEST_VECTORS_H_ /* Test vectors for emulation of different film grain types. * Note that bit depth would be derived from the bitstream and @@ -778,4 +778,4 @@ static aom_film_grain_t film_grain_test_vectors[16] = { 45231 /* random_seed */ }, }; -#endif // AV1_GRAIN_TEST_VECTORS_H_ +#endif // AOM_AV1_ENCODER_GRAIN_TEST_VECTORS_H_ diff --git a/third_party/aom/av1/encoder/hash.h b/third_party/aom/av1/encoder/hash.h index 8b6227540..826c004d6 100644 --- a/third_party/aom/av1/encoder/hash.h +++ b/third_party/aom/av1/encoder/hash.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_ENCODER_HASH_H_ -#define AV1_ENCODER_HASH_H_ +#ifndef AOM_AV1_ENCODER_HASH_H_ +#define AOM_AV1_ENCODER_HASH_H_ #include "config/aom_config.h" @@ -43,8 +43,10 @@ typedef struct _CRC32C { // init table for software version crc32c void av1_crc32c_calculator_init(CRC32C *p_crc32c); +#define AOM_BUFFER_SIZE_FOR_BLOCK_HASH (4096) + #ifdef __cplusplus } // extern "C" #endif -#endif // AV1_ENCODER_HASH_H_ +#endif // AOM_AV1_ENCODER_HASH_H_ diff --git a/third_party/aom/av1/encoder/hash_motion.c b/third_party/aom/av1/encoder/hash_motion.c index f2ff5b495..e85a516e8 100644 --- a/third_party/aom/av1/encoder/hash_motion.c +++ b/third_party/aom/av1/encoder/hash_motion.c @@ -13,14 +13,12 @@ #include "config/av1_rtcd.h" +#include "av1/encoder/block.h" #include "av1/encoder/hash.h" #include "av1/encoder/hash_motion.h" static const int crc_bits = 16; static const int block_size_bits = 3; -static CRC_CALCULATOR crc_calculator1; -static CRC_CALCULATOR crc_calculator2; -static int g_crc_initialized = 0; static void hash_table_clear_all(hash_table *p_hash_table) { if (p_hash_table->p_lookup_table == NULL) { @@ -106,11 +104,11 @@ static int hash_block_size_to_index(int block_size) { } } -void av1_hash_table_init(hash_table *p_hash_table) { - if (g_crc_initialized == 0) { - av1_crc_calculator_init(&crc_calculator1, 24, 0x5D6DCB); - av1_crc_calculator_init(&crc_calculator2, 24, 0x864CFB); - g_crc_initialized = 1; +void av1_hash_table_init(hash_table *p_hash_table, MACROBLOCK *x) { + if (x->g_crc_initialized == 0) { + av1_crc_calculator_init(&x->crc_calculator1, 24, 0x5D6DCB); + av1_crc_calculator_init(&x->crc_calculator2, 24, 0x864CFB); + x->g_crc_initialized = 1; } p_hash_table->p_lookup_table = NULL; } @@ -181,7 +179,8 @@ int32_t av1_has_exact_match(hash_table *p_hash_table, uint32_t hash_value1, void av1_generate_block_2x2_hash_value(const YV12_BUFFER_CONFIG *picture, uint32_t *pic_block_hash[2], - int8_t *pic_block_same_info[3]) { + int8_t *pic_block_same_info[3], + MACROBLOCK *x) { const int width = 2; const int height = 2; const int x_end = picture->y_crop_width - width + 1; @@ -201,9 +200,9 @@ void av1_generate_block_2x2_hash_value(const YV12_BUFFER_CONFIG *picture, pic_block_same_info[1][pos] = is_block16_2x2_col_same_value(p); pic_block_hash[0][pos] = av1_get_crc_value( - &crc_calculator1, (uint8_t *)p, length * sizeof(p[0])); + &x->crc_calculator1, (uint8_t *)p, length * sizeof(p[0])); pic_block_hash[1][pos] = av1_get_crc_value( - &crc_calculator2, (uint8_t *)p, length * sizeof(p[0])); + &x->crc_calculator2, (uint8_t *)p, length * sizeof(p[0])); pos++; } pos += width - 1; @@ -220,9 +219,9 @@ void av1_generate_block_2x2_hash_value(const YV12_BUFFER_CONFIG *picture, pic_block_same_info[1][pos] = is_block_2x2_col_same_value(p); pic_block_hash[0][pos] = - av1_get_crc_value(&crc_calculator1, p, length * sizeof(p[0])); + av1_get_crc_value(&x->crc_calculator1, p, length * sizeof(p[0])); pic_block_hash[1][pos] = - av1_get_crc_value(&crc_calculator2, p, length * sizeof(p[0])); + av1_get_crc_value(&x->crc_calculator2, p, length * sizeof(p[0])); pos++; } pos += width - 1; @@ -235,7 +234,8 @@ void av1_generate_block_hash_value(const YV12_BUFFER_CONFIG *picture, uint32_t *src_pic_block_hash[2], uint32_t *dst_pic_block_hash[2], int8_t *src_pic_block_same_info[3], - int8_t *dst_pic_block_same_info[3]) { + int8_t *dst_pic_block_same_info[3], + MACROBLOCK *x) { const int pic_width = picture->y_crop_width; const int x_end = picture->y_crop_width - block_size + 1; const int y_end = picture->y_crop_height - block_size + 1; @@ -254,14 +254,14 @@ void av1_generate_block_hash_value(const YV12_BUFFER_CONFIG *picture, p[2] = src_pic_block_hash[0][pos + src_size * pic_width]; p[3] = src_pic_block_hash[0][pos + src_size * pic_width + src_size]; dst_pic_block_hash[0][pos] = - av1_get_crc_value(&crc_calculator1, (uint8_t *)p, length); + av1_get_crc_value(&x->crc_calculator1, (uint8_t *)p, length); p[0] = src_pic_block_hash[1][pos]; p[1] = src_pic_block_hash[1][pos + src_size]; p[2] = src_pic_block_hash[1][pos + src_size * pic_width]; p[3] = src_pic_block_hash[1][pos + src_size * pic_width + src_size]; dst_pic_block_hash[1][pos] = - av1_get_crc_value(&crc_calculator2, (uint8_t *)p, length); + av1_get_crc_value(&x->crc_calculator2, (uint8_t *)p, length); dst_pic_block_same_info[0][pos] = src_pic_block_same_info[0][pos] && @@ -388,17 +388,9 @@ int av1_hash_is_vertical_perfect(const YV12_BUFFER_CONFIG *picture, return 1; } -// global buffer for hash value calculation of a block -// used only in av1_get_block_hash_value() -#define AOM_BUFFER_SIZE_FOR_BLOCK_HASH (4096) -// [first hash/second hash] -// [two buffers used ping-pong] -// [num of 2x2 blocks in 128x128] -static uint32_t hash_value_buffer[2][2][AOM_BUFFER_SIZE_FOR_BLOCK_HASH]; - void av1_get_block_hash_value(uint8_t *y_src, int stride, int block_size, uint32_t *hash_value1, uint32_t *hash_value2, - int use_highbitdepth) { + int use_highbitdepth, MACROBLOCK *x) { uint32_t to_hash[4]; const int add_value = hash_block_size_to_index(block_size) << crc_bits; assert(add_value >= 0); @@ -415,10 +407,12 @@ void av1_get_block_hash_value(uint8_t *y_src, int stride, int block_size, get_pixels_in_1D_short_array_by_block_2x2( y16_src + y_pos * stride + x_pos, stride, pixel_to_hash); assert(pos < AOM_BUFFER_SIZE_FOR_BLOCK_HASH); - hash_value_buffer[0][0][pos] = av1_get_crc_value( - &crc_calculator1, (uint8_t *)pixel_to_hash, sizeof(pixel_to_hash)); - hash_value_buffer[1][0][pos] = av1_get_crc_value( - &crc_calculator2, (uint8_t *)pixel_to_hash, sizeof(pixel_to_hash)); + x->hash_value_buffer[0][0][pos] = + av1_get_crc_value(&x->crc_calculator1, (uint8_t *)pixel_to_hash, + sizeof(pixel_to_hash)); + x->hash_value_buffer[1][0][pos] = + av1_get_crc_value(&x->crc_calculator2, (uint8_t *)pixel_to_hash, + sizeof(pixel_to_hash)); } } } else { @@ -429,10 +423,10 @@ void av1_get_block_hash_value(uint8_t *y_src, int stride, int block_size, get_pixels_in_1D_char_array_by_block_2x2(y_src + y_pos * stride + x_pos, stride, pixel_to_hash); assert(pos < AOM_BUFFER_SIZE_FOR_BLOCK_HASH); - hash_value_buffer[0][0][pos] = av1_get_crc_value( - &crc_calculator1, pixel_to_hash, sizeof(pixel_to_hash)); - hash_value_buffer[1][0][pos] = av1_get_crc_value( - &crc_calculator2, pixel_to_hash, sizeof(pixel_to_hash)); + x->hash_value_buffer[0][0][pos] = av1_get_crc_value( + &x->crc_calculator1, pixel_to_hash, sizeof(pixel_to_hash)); + x->hash_value_buffer[1][0][pos] = av1_get_crc_value( + &x->crc_calculator2, pixel_to_hash, sizeof(pixel_to_hash)); } } } @@ -457,24 +451,24 @@ void av1_get_block_hash_value(uint8_t *y_src, int stride, int block_size, assert(srcPos + src_sub_block_in_width + 1 < AOM_BUFFER_SIZE_FOR_BLOCK_HASH); assert(dst_pos < AOM_BUFFER_SIZE_FOR_BLOCK_HASH); - to_hash[0] = hash_value_buffer[0][src_idx][srcPos]; - to_hash[1] = hash_value_buffer[0][src_idx][srcPos + 1]; + to_hash[0] = x->hash_value_buffer[0][src_idx][srcPos]; + to_hash[1] = x->hash_value_buffer[0][src_idx][srcPos + 1]; to_hash[2] = - hash_value_buffer[0][src_idx][srcPos + src_sub_block_in_width]; - to_hash[3] = - hash_value_buffer[0][src_idx][srcPos + src_sub_block_in_width + 1]; + x->hash_value_buffer[0][src_idx][srcPos + src_sub_block_in_width]; + to_hash[3] = x->hash_value_buffer[0][src_idx] + [srcPos + src_sub_block_in_width + 1]; - hash_value_buffer[0][dst_idx][dst_pos] = av1_get_crc_value( - &crc_calculator1, (uint8_t *)to_hash, sizeof(to_hash)); + x->hash_value_buffer[0][dst_idx][dst_pos] = av1_get_crc_value( + &x->crc_calculator1, (uint8_t *)to_hash, sizeof(to_hash)); - to_hash[0] = hash_value_buffer[1][src_idx][srcPos]; - to_hash[1] = hash_value_buffer[1][src_idx][srcPos + 1]; + to_hash[0] = x->hash_value_buffer[1][src_idx][srcPos]; + to_hash[1] = x->hash_value_buffer[1][src_idx][srcPos + 1]; to_hash[2] = - hash_value_buffer[1][src_idx][srcPos + src_sub_block_in_width]; - to_hash[3] = - hash_value_buffer[1][src_idx][srcPos + src_sub_block_in_width + 1]; - hash_value_buffer[1][dst_idx][dst_pos] = av1_get_crc_value( - &crc_calculator2, (uint8_t *)to_hash, sizeof(to_hash)); + x->hash_value_buffer[1][src_idx][srcPos + src_sub_block_in_width]; + to_hash[3] = x->hash_value_buffer[1][src_idx] + [srcPos + src_sub_block_in_width + 1]; + x->hash_value_buffer[1][dst_idx][dst_pos] = av1_get_crc_value( + &x->crc_calculator2, (uint8_t *)to_hash, sizeof(to_hash)); dst_pos++; } } @@ -483,8 +477,6 @@ void av1_get_block_hash_value(uint8_t *y_src, int stride, int block_size, sub_block_in_width >>= 1; } - *hash_value1 = (hash_value_buffer[0][dst_idx][0] & crc_mask) + add_value; - *hash_value2 = hash_value_buffer[1][dst_idx][0]; + *hash_value1 = (x->hash_value_buffer[0][dst_idx][0] & crc_mask) + add_value; + *hash_value2 = x->hash_value_buffer[1][dst_idx][0]; } - -#undef AOM_BUFFER_SIZE_FOR_BLOCK_HASH diff --git a/third_party/aom/av1/encoder/hash_motion.h b/third_party/aom/av1/encoder/hash_motion.h index 8deb92eb6..df3ec3215 100644 --- a/third_party/aom/av1/encoder/hash_motion.h +++ b/third_party/aom/av1/encoder/hash_motion.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_ENCODER_HASH_MOTION_H_ -#define AV1_ENCODER_HASH_MOTION_H_ +#ifndef AOM_AV1_ENCODER_HASH_MOTION_H_ +#define AOM_AV1_ENCODER_HASH_MOTION_H_ #include "config/aom_config.h" @@ -34,7 +34,7 @@ typedef struct _hash_table { Vector **p_lookup_table; } hash_table; -void av1_hash_table_init(hash_table *p_hash_table); +void av1_hash_table_init(hash_table *p_hash_table, struct macroblock *x); void av1_hash_table_destroy(hash_table *p_hash_table); void av1_hash_table_create(hash_table *p_hash_table); int32_t av1_hash_table_count(hash_table *p_hash_table, uint32_t hash_value); @@ -44,13 +44,15 @@ int32_t av1_has_exact_match(hash_table *p_hash_table, uint32_t hash_value1, uint32_t hash_value2); void av1_generate_block_2x2_hash_value(const YV12_BUFFER_CONFIG *picture, uint32_t *pic_block_hash[2], - int8_t *pic_block_same_info[3]); + int8_t *pic_block_same_info[3], + struct macroblock *x); void av1_generate_block_hash_value(const YV12_BUFFER_CONFIG *picture, int block_size, uint32_t *src_pic_block_hash[2], uint32_t *dst_pic_block_hash[2], int8_t *src_pic_block_same_info[3], - int8_t *dst_pic_block_same_info[3]); + int8_t *dst_pic_block_same_info[3], + struct macroblock *x); void av1_add_to_hash_map_by_row_with_precal_data(hash_table *p_hash_table, uint32_t *pic_hash[2], int8_t *pic_is_same, @@ -67,10 +69,10 @@ int av1_hash_is_vertical_perfect(const YV12_BUFFER_CONFIG *picture, int block_size, int x_start, int y_start); void av1_get_block_hash_value(uint8_t *y_src, int stride, int block_size, uint32_t *hash_value1, uint32_t *hash_value2, - int use_highbitdepth); + int use_highbitdepth, struct macroblock *x); #ifdef __cplusplus } // extern "C" #endif -#endif // AV1_ENCODER_HASH_MOTION_H_ +#endif // AOM_AV1_ENCODER_HASH_MOTION_H_ diff --git a/third_party/aom/av1/encoder/hybrid_fwd_txfm.c b/third_party/aom/av1/encoder/hybrid_fwd_txfm.c index 0922557d0..67898fd18 100644 --- a/third_party/aom/av1/encoder/hybrid_fwd_txfm.c +++ b/third_party/aom/av1/encoder/hybrid_fwd_txfm.c @@ -121,15 +121,45 @@ static void highbd_fwd_txfm_8x4(const int16_t *src_diff, tran_low_t *coeff, static void highbd_fwd_txfm_8x16(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param) { int32_t *dst_coeff = (int32_t *)coeff; - av1_fwd_txfm2d_8x16_c(src_diff, dst_coeff, diff_stride, txfm_param->tx_type, - txfm_param->bd); + const TX_TYPE tx_type = txfm_param->tx_type; + const int bd = txfm_param->bd; + switch (tx_type) { + // use the c version for anything including identity for now + case V_DCT: + case H_DCT: + case V_ADST: + case H_ADST: + case V_FLIPADST: + case H_FLIPADST: + case IDTX: + av1_fwd_txfm2d_8x16_c(src_diff, dst_coeff, diff_stride, tx_type, bd); + break; + default: + av1_fwd_txfm2d_8x16(src_diff, dst_coeff, diff_stride, tx_type, bd); + break; + } } static void highbd_fwd_txfm_16x8(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param) { int32_t *dst_coeff = (int32_t *)coeff; - av1_fwd_txfm2d_16x8_c(src_diff, dst_coeff, diff_stride, txfm_param->tx_type, - txfm_param->bd); + const TX_TYPE tx_type = txfm_param->tx_type; + const int bd = txfm_param->bd; + switch (tx_type) { + // use the c version for anything including identity for now + case V_DCT: + case H_DCT: + case V_ADST: + case H_ADST: + case V_FLIPADST: + case H_FLIPADST: + case IDTX: + av1_fwd_txfm2d_16x8_c(src_diff, dst_coeff, diff_stride, tx_type, bd); + break; + default: + av1_fwd_txfm2d_16x8(src_diff, dst_coeff, diff_stride, tx_type, bd); + break; + } } static void highbd_fwd_txfm_16x32(const int16_t *src_diff, tran_low_t *coeff, diff --git a/third_party/aom/av1/encoder/hybrid_fwd_txfm.h b/third_party/aom/av1/encoder/hybrid_fwd_txfm.h index 6155b255a..daabc7119 100644 --- a/third_party/aom/av1/encoder/hybrid_fwd_txfm.h +++ b/third_party/aom/av1/encoder/hybrid_fwd_txfm.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_ENCODER_HYBRID_FWD_TXFM_H_ -#define AV1_ENCODER_HYBRID_FWD_TXFM_H_ +#ifndef AOM_AV1_ENCODER_HYBRID_FWD_TXFM_H_ +#define AOM_AV1_ENCODER_HYBRID_FWD_TXFM_H_ #include "config/aom_config.h" @@ -28,4 +28,4 @@ void av1_highbd_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff, } // extern "C" #endif -#endif // AV1_ENCODER_HYBRID_FWD_TXFM_H_ +#endif // AOM_AV1_ENCODER_HYBRID_FWD_TXFM_H_ diff --git a/third_party/aom/av1/encoder/lookahead.h b/third_party/aom/av1/encoder/lookahead.h index 3897c2a6a..e55224cf7 100644 --- a/third_party/aom/av1/encoder/lookahead.h +++ b/third_party/aom/av1/encoder/lookahead.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_ENCODER_LOOKAHEAD_H_ -#define AV1_ENCODER_LOOKAHEAD_H_ +#ifndef AOM_AV1_ENCODER_LOOKAHEAD_H_ +#define AOM_AV1_ENCODER_LOOKAHEAD_H_ #include "aom_scale/yv12config.h" #include "aom/aom_integer.h" @@ -103,4 +103,4 @@ unsigned int av1_lookahead_depth(struct lookahead_ctx *ctx); } // extern "C" #endif -#endif // AV1_ENCODER_LOOKAHEAD_H_ +#endif // AOM_AV1_ENCODER_LOOKAHEAD_H_ diff --git a/third_party/aom/av1/encoder/mathutils.h b/third_party/aom/av1/encoder/mathutils.h index 23243dd9e..64f936176 100644 --- a/third_party/aom/av1/encoder/mathutils.h +++ b/third_party/aom/av1/encoder/mathutils.h @@ -9,6 +9,9 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ +#ifndef AOM_AV1_ENCODER_MATHUTILS_H_ +#define AOM_AV1_ENCODER_MATHUTILS_H_ + #include <memory.h> #include <math.h> #include <stdio.h> @@ -23,7 +26,7 @@ static INLINE int linsolve(int n, double *A, int stride, double *b, double *x) { double c; // Forward elimination for (k = 0; k < n - 1; k++) { - // Bring the largest magitude to the diagonal position + // Bring the largest magnitude to the diagonal position for (i = n - 1; i > k; i--) { if (fabs(A[(i - 1) * stride + k]) < fabs(A[i * stride + k])) { for (j = 0; j < n; j++) { @@ -352,3 +355,5 @@ static INLINE int SVD(double *U, double *W, double *V, double *matx, int M, return 0; } + +#endif // AOM_AV1_ENCODER_MATHUTILS_H_ diff --git a/third_party/aom/av1/encoder/mbgraph.c b/third_party/aom/av1/encoder/mbgraph.c index 472173634..1a35ff77c 100644 --- a/third_party/aom/av1/encoder/mbgraph.c +++ b/third_party/aom/av1/encoder/mbgraph.c @@ -17,11 +17,12 @@ #include "aom_dsp/aom_dsp_common.h" #include "aom_mem/aom_mem.h" #include "aom_ports/system_state.h" -#include "av1/encoder/segmentation.h" -#include "av1/encoder/mcomp.h" #include "av1/common/blockd.h" #include "av1/common/reconinter.h" #include "av1/common/reconintra.h" +#include "av1/encoder/mcomp.h" +#include "av1/encoder/reconinter_enc.h" +#include "av1/encoder/segmentation.h" static unsigned int do_16x16_motion_iteration(AV1_COMP *cpi, const MV *ref_mv, int mb_row, int mb_col) { @@ -140,7 +141,7 @@ static int find_best_16x16_intra(AV1_COMP *cpi, PREDICTION_MODE *pbest_mode) { // calculate SATD for each intra prediction mode; // we're intentionally not doing 4x4, we just want a rough estimate - for (mode = DC_PRED; mode <= PAETH_PRED; mode++) { + for (mode = INTRA_MODE_START; mode < INTRA_MODE_END; mode++) { unsigned int err; xd->mi[0]->mode = mode; diff --git a/third_party/aom/av1/encoder/mbgraph.h b/third_party/aom/av1/encoder/mbgraph.h index 3e0a4fa9b..ba08476f7 100644 --- a/third_party/aom/av1/encoder/mbgraph.h +++ b/third_party/aom/av1/encoder/mbgraph.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_ENCODER_MBGRAPH_H_ -#define AV1_ENCODER_MBGRAPH_H_ +#ifndef AOM_AV1_ENCODER_MBGRAPH_H_ +#define AOM_AV1_ENCODER_MBGRAPH_H_ #ifdef __cplusplus extern "C" { @@ -38,4 +38,4 @@ void av1_update_mbgraph_stats(struct AV1_COMP *cpi); } // extern "C" #endif -#endif // AV1_ENCODER_MBGRAPH_H_ +#endif // AOM_AV1_ENCODER_MBGRAPH_H_ diff --git a/third_party/aom/av1/encoder/mcomp.c b/third_party/aom/av1/encoder/mcomp.c index c4572a341..8f6de9b53 100644 --- a/third_party/aom/av1/encoder/mcomp.c +++ b/third_party/aom/av1/encoder/mcomp.c @@ -29,6 +29,7 @@ #include "av1/encoder/encodemv.h" #include "av1/encoder/mcomp.h" #include "av1/encoder/rdopt.h" +#include "av1/encoder/reconinter_enc.h" // #define NEW_DIAMOND_SEARCH @@ -219,7 +220,7 @@ static INLINE const uint8_t *pre(const uint8_t *buf, int stride, int r, int c) { thismse = upsampled_pref_error( \ xd, cm, mi_row, mi_col, &this_mv, vfp, src_address, src_stride, \ pre(y, y_stride, r, c), y_stride, sp(c), sp(r), second_pred, mask, \ - mask_stride, invert_mask, w, h, &sse); \ + mask_stride, invert_mask, w, h, &sse, use_accurate_subpel_search); \ v = mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit); \ v += thismse; \ if (v < besterr) { \ @@ -342,19 +343,19 @@ static unsigned int setup_center_error( if (second_pred != NULL) { if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { DECLARE_ALIGNED(16, uint16_t, comp_pred16[MAX_SB_SQUARE]); + uint8_t *comp_pred = CONVERT_TO_BYTEPTR(comp_pred16); if (mask) { - aom_highbd_comp_mask_pred(comp_pred16, second_pred, w, h, y + offset, + aom_highbd_comp_mask_pred(comp_pred, second_pred, w, h, y + offset, y_stride, mask, mask_stride, invert_mask); } else { if (xd->jcp_param.use_jnt_comp_avg) - aom_highbd_jnt_comp_avg_pred(comp_pred16, second_pred, w, h, - y + offset, y_stride, &xd->jcp_param); + aom_highbd_jnt_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, + y_stride, &xd->jcp_param); else - aom_highbd_comp_avg_pred(comp_pred16, second_pred, w, h, y + offset, + aom_highbd_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride); } - besterr = - vfp->vf(CONVERT_TO_BYTEPTR(comp_pred16), w, src, src_stride, sse1); + besterr = vfp->vf(comp_pred, w, src, src_stride, sse1); } else { DECLARE_ALIGNED(16, uint8_t, comp_pred[MAX_SB_SQUARE]); if (mask) { @@ -648,51 +649,54 @@ static int upsampled_pref_error(MACROBLOCKD *xd, const AV1_COMMON *const cm, int subpel_x_q3, int subpel_y_q3, const uint8_t *second_pred, const uint8_t *mask, int mask_stride, int invert_mask, int w, int h, - unsigned int *sse) { + unsigned int *sse, int subpel_search) { unsigned int besterr; if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { DECLARE_ALIGNED(16, uint16_t, pred16[MAX_SB_SQUARE]); + uint8_t *pred8 = CONVERT_TO_BYTEPTR(pred16); if (second_pred != NULL) { if (mask) { aom_highbd_comp_mask_upsampled_pred( - xd, cm, mi_row, mi_col, mv, pred16, second_pred, w, h, subpel_x_q3, - subpel_y_q3, y, y_stride, mask, mask_stride, invert_mask, xd->bd); + xd, cm, mi_row, mi_col, mv, pred8, second_pred, w, h, subpel_x_q3, + subpel_y_q3, y, y_stride, mask, mask_stride, invert_mask, xd->bd, + subpel_search); } else { if (xd->jcp_param.use_jnt_comp_avg) aom_highbd_jnt_comp_avg_upsampled_pred( - xd, cm, mi_row, mi_col, mv, pred16, second_pred, w, h, - subpel_x_q3, subpel_y_q3, y, y_stride, xd->bd, &xd->jcp_param); + xd, cm, mi_row, mi_col, mv, pred8, second_pred, w, h, subpel_x_q3, + subpel_y_q3, y, y_stride, xd->bd, &xd->jcp_param, subpel_search); else - aom_highbd_comp_avg_upsampled_pred(xd, cm, mi_row, mi_col, mv, pred16, - second_pred, w, h, subpel_x_q3, - subpel_y_q3, y, y_stride, xd->bd); + aom_highbd_comp_avg_upsampled_pred( + xd, cm, mi_row, mi_col, mv, pred8, second_pred, w, h, subpel_x_q3, + subpel_y_q3, y, y_stride, xd->bd, subpel_search); } } else { - aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, pred16, w, h, - subpel_x_q3, subpel_y_q3, y, y_stride, xd->bd); + aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, pred8, w, h, + subpel_x_q3, subpel_y_q3, y, y_stride, xd->bd, + subpel_search); } - - besterr = vfp->vf(CONVERT_TO_BYTEPTR(pred16), w, src, src_stride, sse); + besterr = vfp->vf(pred8, w, src, src_stride, sse); } else { DECLARE_ALIGNED(16, uint8_t, pred[MAX_SB_SQUARE]); if (second_pred != NULL) { if (mask) { - aom_comp_mask_upsampled_pred( - xd, cm, mi_row, mi_col, mv, pred, second_pred, w, h, subpel_x_q3, - subpel_y_q3, y, y_stride, mask, mask_stride, invert_mask); + aom_comp_mask_upsampled_pred(xd, cm, mi_row, mi_col, mv, pred, + second_pred, w, h, subpel_x_q3, + subpel_y_q3, y, y_stride, mask, + mask_stride, invert_mask, subpel_search); } else { if (xd->jcp_param.use_jnt_comp_avg) aom_jnt_comp_avg_upsampled_pred( xd, cm, mi_row, mi_col, mv, pred, second_pred, w, h, subpel_x_q3, - subpel_y_q3, y, y_stride, &xd->jcp_param); + subpel_y_q3, y, y_stride, &xd->jcp_param, subpel_search); else aom_comp_avg_upsampled_pred(xd, cm, mi_row, mi_col, mv, pred, second_pred, w, h, subpel_x_q3, - subpel_y_q3, y, y_stride); + subpel_y_q3, y, y_stride, subpel_search); } } else { aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, pred, w, h, subpel_x_q3, - subpel_y_q3, y, y_stride); + subpel_y_q3, y, y_stride, subpel_search); } besterr = vfp->vf(pred, w, src, src_stride, sse); @@ -707,10 +711,11 @@ static unsigned int upsampled_setup_center_error( const int src_stride, const uint8_t *const y, int y_stride, const uint8_t *second_pred, const uint8_t *mask, int mask_stride, int invert_mask, int w, int h, int offset, int *mvjcost, int *mvcost[2], - unsigned int *sse1, int *distortion) { - unsigned int besterr = upsampled_pref_error( - xd, cm, mi_row, mi_col, bestmv, vfp, src, src_stride, y + offset, - y_stride, 0, 0, second_pred, mask, mask_stride, invert_mask, w, h, sse1); + unsigned int *sse1, int *distortion, int subpel_search) { + unsigned int besterr = + upsampled_pref_error(xd, cm, mi_row, mi_col, bestmv, vfp, src, src_stride, + y + offset, y_stride, 0, 0, second_pred, mask, + mask_stride, invert_mask, w, h, sse1, subpel_search); *distortion = besterr; besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit); return besterr; @@ -781,7 +786,8 @@ int av1_find_best_sub_pixel_tree( besterr = upsampled_setup_center_error( xd, cm, mi_row, mi_col, bestmv, ref_mv, error_per_bit, vfp, src_address, src_stride, y, y_stride, second_pred, mask, mask_stride, invert_mask, w, - h, offset, mvjcost, mvcost, sse1, distortion); + h, offset, mvjcost, mvcost, sse1, distortion, + use_accurate_subpel_search); else besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp, src_address, src_stride, y, y_stride, @@ -802,7 +808,8 @@ int av1_find_best_sub_pixel_tree( thismse = upsampled_pref_error( xd, cm, mi_row, mi_col, &this_mv, vfp, src_address, src_stride, pre(y, y_stride, tr, tc), y_stride, sp(tc), sp(tr), second_pred, - mask, mask_stride, invert_mask, w, h, &sse); + mask, mask_stride, invert_mask, w, h, &sse, + use_accurate_subpel_search); } else { thismse = estimate_upsampled_pref_error( xd, vfp, src_address, src_stride, pre(y, y_stride, tr, tc), @@ -837,7 +844,8 @@ int av1_find_best_sub_pixel_tree( thismse = upsampled_pref_error( xd, cm, mi_row, mi_col, &this_mv, vfp, src_address, src_stride, pre(y, y_stride, tr, tc), y_stride, sp(tc), sp(tr), second_pred, - mask, mask_stride, invert_mask, w, h, &sse); + mask, mask_stride, invert_mask, w, h, &sse, + use_accurate_subpel_search); } else { thismse = estimate_upsampled_pref_error( xd, vfp, src_address, src_stride, pre(y, y_stride, tr, tc), @@ -929,8 +937,8 @@ unsigned int av1_refine_warped_mv(const AV1_COMP *cpi, MACROBLOCK *const x, int16_t bc = mbmi->mv[0].as_mv.col; int16_t *tr = &mbmi->mv[0].as_mv.row; int16_t *tc = &mbmi->mv[0].as_mv.col; - WarpedMotionParams best_wm_params = mbmi->wm_params[0]; - int best_num_proj_ref = mbmi->num_proj_ref[0]; + WarpedMotionParams best_wm_params = mbmi->wm_params; + int best_num_proj_ref = mbmi->num_proj_ref; unsigned int bestmse; int minc, maxc, minr, maxr; const int start = cm->allow_high_precision_mv ? 0 : 4; @@ -962,18 +970,18 @@ unsigned int av1_refine_warped_mv(const AV1_COMP *cpi, MACROBLOCK *const x, memcpy(pts, pts0, total_samples * 2 * sizeof(*pts0)); memcpy(pts_inref, pts_inref0, total_samples * 2 * sizeof(*pts_inref0)); if (total_samples > 1) - mbmi->num_proj_ref[0] = + mbmi->num_proj_ref = selectSamples(&this_mv, pts, pts_inref, total_samples, bsize); - if (!find_projection(mbmi->num_proj_ref[0], pts, pts_inref, bsize, *tr, - *tc, &mbmi->wm_params[0], mi_row, mi_col)) { + if (!find_projection(mbmi->num_proj_ref, pts, pts_inref, bsize, *tr, + *tc, &mbmi->wm_params, mi_row, mi_col)) { thismse = av1_compute_motion_cost(cpi, x, bsize, mi_row, mi_col, &this_mv); if (thismse < bestmse) { best_idx = idx; - best_wm_params = mbmi->wm_params[0]; - best_num_proj_ref = mbmi->num_proj_ref[0]; + best_wm_params = mbmi->wm_params; + best_num_proj_ref = mbmi->num_proj_ref; bestmse = thismse; } } @@ -990,8 +998,8 @@ unsigned int av1_refine_warped_mv(const AV1_COMP *cpi, MACROBLOCK *const x, *tr = br; *tc = bc; - mbmi->wm_params[0] = best_wm_params; - mbmi->num_proj_ref[0] = best_num_proj_ref; + mbmi->wm_params = best_wm_params; + mbmi->num_proj_ref = best_num_proj_ref; return bestmse; } @@ -2013,8 +2021,16 @@ int av1_refining_search_8p_c(MACROBLOCK *x, int error_per_bit, int search_range, const uint8_t *mask, int mask_stride, int invert_mask, const MV *center_mv, const uint8_t *second_pred) { - const MV neighbors[8] = { { -1, 0 }, { 0, -1 }, { 0, 1 }, { 1, 0 }, - { -1, -1 }, { 1, -1 }, { -1, 1 }, { 1, 1 } }; + static const search_neighbors neighbors[8] = { + { { -1, 0 }, -1 * SEARCH_GRID_STRIDE_8P + 0 }, + { { 0, -1 }, 0 * SEARCH_GRID_STRIDE_8P - 1 }, + { { 0, 1 }, 0 * SEARCH_GRID_STRIDE_8P + 1 }, + { { 1, 0 }, 1 * SEARCH_GRID_STRIDE_8P + 0 }, + { { -1, -1 }, -1 * SEARCH_GRID_STRIDE_8P - 1 }, + { { 1, -1 }, 1 * SEARCH_GRID_STRIDE_8P - 1 }, + { { -1, 1 }, -1 * SEARCH_GRID_STRIDE_8P + 1 }, + { { 1, 1 }, 1 * SEARCH_GRID_STRIDE_8P + 1 } + }; const MACROBLOCKD *const xd = &x->e_mbd; const struct buf_2d *const what = &x->plane[0].src; const struct buf_2d *const in_what = &xd->plane[0].pre[0]; @@ -2022,6 +2038,10 @@ int av1_refining_search_8p_c(MACROBLOCK *x, int error_per_bit, int search_range, MV *best_mv = &x->best_mv.as_mv; unsigned int best_sad = INT_MAX; int i, j; + uint8_t do_refine_search_grid[SEARCH_GRID_STRIDE_8P * SEARCH_GRID_STRIDE_8P] = + { 0 }; + int grid_center = SEARCH_GRID_CENTER_8P; + int grid_coord = grid_center; clamp_mv(best_mv, x->mv_limits.col_min, x->mv_limits.col_max, x->mv_limits.row_min, x->mv_limits.row_max); @@ -2043,13 +2063,20 @@ int av1_refining_search_8p_c(MACROBLOCK *x, int error_per_bit, int search_range, mvsad_err_cost(x, best_mv, &fcenter_mv, error_per_bit); } + do_refine_search_grid[grid_coord] = 1; + for (i = 0; i < search_range; ++i) { int best_site = -1; for (j = 0; j < 8; ++j) { - const MV mv = { best_mv->row + neighbors[j].row, - best_mv->col + neighbors[j].col }; + grid_coord = grid_center + neighbors[j].coord_offset; + if (do_refine_search_grid[grid_coord] == 1) { + continue; + } + const MV mv = { best_mv->row + neighbors[j].coord.row, + best_mv->col + neighbors[j].coord.col }; + do_refine_search_grid[grid_coord] = 1; if (is_mv_in(&x->mv_limits, &mv)) { unsigned int sad; if (mask) { @@ -2079,8 +2106,9 @@ int av1_refining_search_8p_c(MACROBLOCK *x, int error_per_bit, int search_range, if (best_site == -1) { break; } else { - best_mv->row += neighbors[best_site].row; - best_mv->col += neighbors[best_site].col; + best_mv->row += neighbors[best_site].coord.row; + best_mv->col += neighbors[best_site].coord.col; + grid_center += neighbors[best_site].coord_offset; } } return best_sad; @@ -2099,11 +2127,11 @@ static int is_exhaustive_allowed(const AV1_COMP *const cpi, MACROBLOCK *x) { } int av1_full_pixel_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, - MV *mvp_full, int step_param, int error_per_bit, + MV *mvp_full, int step_param, int method, + int run_mesh_search, int error_per_bit, int *cost_list, const MV *ref_mv, int var_max, int rd, int x_pos, int y_pos, int intra) { const SPEED_FEATURES *const sf = &cpi->sf; - const SEARCH_METHODS method = sf->mv.search_method; const aom_variance_fn_ptr_t *fn_ptr = &cpi->fn_ptr[bsize]; int var = 0; @@ -2168,11 +2196,35 @@ int av1_full_pixel_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, default: assert(0 && "Invalid search method."); } + // Should we allow a follow on exhaustive search? + if (!run_mesh_search) { + if (method == NSTEP) { + if (is_exhaustive_allowed(cpi, x)) { + int exhuastive_thr = sf->exhaustive_searches_thresh; + exhuastive_thr >>= + 10 - (mi_size_wide_log2[bsize] + mi_size_high_log2[bsize]); + // Threshold variance for an exhaustive full search. + if (var > exhuastive_thr) run_mesh_search = 1; + } + } + } + + if (run_mesh_search) { + int var_ex; + MV tmp_mv_ex; + var_ex = full_pixel_exhaustive(cpi, x, &x->best_mv.as_mv, error_per_bit, + cost_list, fn_ptr, ref_mv, &tmp_mv_ex); + if (var_ex < var) { + var = var_ex; + x->best_mv.as_mv = tmp_mv_ex; + } + } + if (method != NSTEP && rd && var < var_max) var = av1_get_mvpred_var(x, &x->best_mv.as_mv, ref_mv, fn_ptr, 1); do { - if (!av1_use_hash_me(&cpi->common)) break; + if (!intra || !av1_use_hash_me(&cpi->common)) break; // already single ME // get block size and original buffer of current block @@ -2195,7 +2247,7 @@ int av1_full_pixel_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, av1_get_block_hash_value( what, what_stride, block_width, &hash_value1, &hash_value2, - x->e_mbd.cur_buf->flags & YV12_FLAG_HIGHBITDEPTH); + x->e_mbd.cur_buf->flags & YV12_FLAG_HIGHBITDEPTH, x); const int count = av1_hash_table_count(ref_frame_hash, hash_value1); // for intra, at lest one matching can be found, itself. @@ -2279,7 +2331,8 @@ int av1_full_pixel_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, MV this_mv = { r, c }; \ thismse = upsampled_obmc_pref_error(xd, cm, mi_row, mi_col, &this_mv, \ mask, vfp, z, pre(y, y_stride, r, c), \ - y_stride, sp(c), sp(r), w, h, &sse); \ + y_stride, sp(c), sp(r), w, h, &sse, \ + use_accurate_subpel_search); \ if ((v = MVC(r, c) + thismse) < besterr) { \ besterr = v; \ br = r; \ @@ -2307,18 +2360,20 @@ static int upsampled_obmc_pref_error( MACROBLOCKD *xd, const AV1_COMMON *const cm, int mi_row, int mi_col, const MV *const mv, const int32_t *mask, const aom_variance_fn_ptr_t *vfp, const int32_t *const wsrc, const uint8_t *const y, int y_stride, - int subpel_x_q3, int subpel_y_q3, int w, int h, unsigned int *sse) { + int subpel_x_q3, int subpel_y_q3, int w, int h, unsigned int *sse, + int subpel_search) { unsigned int besterr; - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - DECLARE_ALIGNED(16, uint16_t, pred16[MAX_SB_SQUARE]); - aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, pred16, w, h, - subpel_x_q3, subpel_y_q3, y, y_stride, xd->bd); - besterr = vfp->ovf(CONVERT_TO_BYTEPTR(pred16), w, wsrc, mask, sse); + DECLARE_ALIGNED(16, uint8_t, pred[2 * MAX_SB_SQUARE]); + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + uint8_t *pred8 = CONVERT_TO_BYTEPTR(pred); + aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, pred8, w, h, + subpel_x_q3, subpel_y_q3, y, y_stride, xd->bd, + subpel_search); + besterr = vfp->ovf(pred8, w, wsrc, mask, sse); } else { - DECLARE_ALIGNED(16, uint8_t, pred[MAX_SB_SQUARE]); aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, pred, w, h, subpel_x_q3, - subpel_y_q3, y, y_stride); + subpel_y_q3, y, y_stride, subpel_search); besterr = vfp->ovf(pred, w, wsrc, mask, sse); } @@ -2330,10 +2385,11 @@ static unsigned int upsampled_setup_obmc_center_error( const int32_t *mask, const MV *bestmv, const MV *ref_mv, int error_per_bit, const aom_variance_fn_ptr_t *vfp, const int32_t *const wsrc, const uint8_t *const y, int y_stride, int w, int h, int offset, - int *mvjcost, int *mvcost[2], unsigned int *sse1, int *distortion) { - unsigned int besterr = - upsampled_obmc_pref_error(xd, cm, mi_row, mi_col, bestmv, mask, vfp, wsrc, - y + offset, y_stride, 0, 0, w, h, sse1); + int *mvjcost, int *mvcost[2], unsigned int *sse1, int *distortion, + int subpel_search) { + unsigned int besterr = upsampled_obmc_pref_error( + xd, cm, mi_row, mi_col, bestmv, mask, vfp, wsrc, y + offset, y_stride, 0, + 0, w, h, sse1, subpel_search); *distortion = besterr; besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit); return besterr; @@ -2388,11 +2444,12 @@ int av1_find_best_obmc_sub_pixel_tree_up( bestmv->row *= 8; bestmv->col *= 8; - // use_accurate_subpel_search can be 0 or 1 + // use_accurate_subpel_search can be 0 or 1 or 2 if (use_accurate_subpel_search) besterr = upsampled_setup_obmc_center_error( xd, cm, mi_row, mi_col, mask, bestmv, ref_mv, error_per_bit, vfp, z, y, - y_stride, w, h, offset, mvjcost, mvcost, sse1, distortion); + y_stride, w, h, offset, mvjcost, mvcost, sse1, distortion, + use_accurate_subpel_search); else besterr = setup_obmc_center_error(mask, bestmv, ref_mv, error_per_bit, vfp, z, y, y_stride, offset, mvjcost, mvcost, @@ -2408,7 +2465,8 @@ int av1_find_best_obmc_sub_pixel_tree_up( if (use_accurate_subpel_search) { thismse = upsampled_obmc_pref_error( xd, cm, mi_row, mi_col, &this_mv, mask, vfp, src_address, - pre(y, y_stride, tr, tc), y_stride, sp(tc), sp(tr), w, h, &sse); + pre(y, y_stride, tr, tc), y_stride, sp(tc), sp(tr), w, h, &sse, + use_accurate_subpel_search); } else { thismse = vfp->osvf(pre(y, y_stride, tr, tc), y_stride, sp(tc), sp(tr), src_address, mask, &sse); @@ -2439,7 +2497,8 @@ int av1_find_best_obmc_sub_pixel_tree_up( if (use_accurate_subpel_search) { thismse = upsampled_obmc_pref_error( xd, cm, mi_row, mi_col, &this_mv, mask, vfp, src_address, - pre(y, y_stride, tr, tc), y_stride, sp(tc), sp(tr), w, h, &sse); + pre(y, y_stride, tr, tc), y_stride, sp(tc), sp(tr), w, h, &sse, + use_accurate_subpel_search); } else { thismse = vfp->osvf(pre(y, y_stride, tr, tc), y_stride, sp(tc), sp(tr), src_address, mask, &sse); @@ -2643,11 +2702,12 @@ int obmc_diamond_search_sad(const MACROBLOCK *x, const search_site_config *cfg, return best_sad; } -int av1_obmc_full_pixel_diamond(const AV1_COMP *cpi, MACROBLOCK *x, - MV *mvp_full, int step_param, int sadpb, - int further_steps, int do_refine, - const aom_variance_fn_ptr_t *fn_ptr, - const MV *ref_mv, MV *dst_mv, int is_second) { +static int obmc_full_pixel_diamond(const AV1_COMP *cpi, MACROBLOCK *x, + MV *mvp_full, int step_param, int sadpb, + int further_steps, int do_refine, + const aom_variance_fn_ptr_t *fn_ptr, + const MV *ref_mv, MV *dst_mv, + int is_second) { const int32_t *wsrc = x->wsrc_buf; const int32_t *mask = x->mask_buf; MV temp_mv; @@ -2704,6 +2764,31 @@ int av1_obmc_full_pixel_diamond(const AV1_COMP *cpi, MACROBLOCK *x, return bestsme; } +int av1_obmc_full_pixel_search(const AV1_COMP *cpi, MACROBLOCK *x, MV *mvp_full, + int step_param, int sadpb, int further_steps, + int do_refine, + const aom_variance_fn_ptr_t *fn_ptr, + const MV *ref_mv, MV *dst_mv, int is_second) { + if (cpi->sf.obmc_full_pixel_search_level == 0) { + return obmc_full_pixel_diamond(cpi, x, mvp_full, step_param, sadpb, + further_steps, do_refine, fn_ptr, ref_mv, + dst_mv, is_second); + } else { + const int32_t *wsrc = x->wsrc_buf; + const int32_t *mask = x->mask_buf; + const int search_range = 8; + *dst_mv = *mvp_full; + clamp_mv(dst_mv, x->mv_limits.col_min, x->mv_limits.col_max, + x->mv_limits.row_min, x->mv_limits.row_max); + int thissme = obmc_refining_search_sad( + x, wsrc, mask, dst_mv, sadpb, search_range, fn_ptr, ref_mv, is_second); + if (thissme < INT_MAX) + thissme = get_obmc_mvpred_var(x, wsrc, mask, dst_mv, ref_mv, fn_ptr, 1, + is_second); + return thissme; + } +} + // Note(yunqingwang): The following 2 functions are only used in the motion // vector unit test, which return extreme motion vectors allowed by the MV // limits. diff --git a/third_party/aom/av1/encoder/mcomp.h b/third_party/aom/av1/encoder/mcomp.h index 539e8f4e4..a975218b0 100644 --- a/third_party/aom/av1/encoder/mcomp.h +++ b/third_party/aom/av1/encoder/mcomp.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_ENCODER_MCOMP_H_ -#define AV1_ENCODER_MCOMP_H_ +#ifndef AOM_AV1_ENCODER_MCOMP_H_ +#define AOM_AV1_ENCODER_MCOMP_H_ #include "av1/encoder/block.h" #include "aom_dsp/variance.h" @@ -31,6 +31,11 @@ extern "C" { // for Block_16x16 #define BORDER_MV_PIXELS_B16 (16 + AOM_INTERP_EXTEND) +#define SEARCH_RANGE_8P 3 +#define SEARCH_GRID_STRIDE_8P (2 * SEARCH_RANGE_8P + 1) +#define SEARCH_GRID_CENTER_8P \ + (SEARCH_RANGE_8P * SEARCH_GRID_STRIDE_8P + SEARCH_RANGE_8P) + // motion search site typedef struct search_site { MV mv; @@ -43,6 +48,11 @@ typedef struct search_site_config { int searches_per_step; } search_site_config; +typedef struct { + MV coord; + int coord_offset; +} search_neighbors; + void av1_init_dsmotion_compensation(search_site_config *cfg, int stride); void av1_init3smotion_compensation(search_site_config *cfg, int stride); @@ -120,14 +130,15 @@ int av1_refining_search_8p_c(MACROBLOCK *x, int error_per_bit, int search_range, int av1_full_pixel_search(const struct AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, MV *mvp_full, int step_param, - int error_per_bit, int *cost_list, const MV *ref_mv, - int var_max, int rd, int x_pos, int y_pos, int intra); - -int av1_obmc_full_pixel_diamond(const struct AV1_COMP *cpi, MACROBLOCK *x, - MV *mvp_full, int step_param, int sadpb, - int further_steps, int do_refine, - const aom_variance_fn_ptr_t *fn_ptr, - const MV *ref_mv, MV *dst_mv, int is_second); + int method, int run_mesh_search, int error_per_bit, + int *cost_list, const MV *ref_mv, int var_max, int rd, + int x_pos, int y_pos, int intra); + +int av1_obmc_full_pixel_search(const struct AV1_COMP *cpi, MACROBLOCK *x, + MV *mvp_full, int step_param, int sadpb, + int further_steps, int do_refine, + const aom_variance_fn_ptr_t *fn_ptr, + const MV *ref_mv, MV *dst_mv, int is_second); int av1_find_best_obmc_sub_pixel_tree_up( MACROBLOCK *x, const AV1_COMMON *const cm, int mi_row, int mi_col, MV *bestmv, const MV *ref_mv, int allow_hp, int error_per_bit, @@ -147,4 +158,4 @@ unsigned int av1_refine_warped_mv(const struct AV1_COMP *cpi, } // extern "C" #endif -#endif // AV1_ENCODER_MCOMP_H_ +#endif // AOM_AV1_ENCODER_MCOMP_H_ diff --git a/third_party/aom/av1/encoder/ml.c b/third_party/aom/av1/encoder/ml.c index 3a27e5845..d21def43a 100644 --- a/third_party/aom/av1/encoder/ml.c +++ b/third_party/aom/av1/encoder/ml.c @@ -10,7 +10,9 @@ */ #include <assert.h> +#include <math.h> +#include "aom_dsp/aom_dsp_common.h" #include "av1/encoder/ml.h" void av1_nn_predict(const float *features, const NN_CONFIG *nn_config, @@ -55,3 +57,17 @@ void av1_nn_predict(const float *features, const NN_CONFIG *nn_config, weights += num_input_nodes; } } + +void av1_nn_softmax(const float *input, float *output, int n) { + // Softmax function is invariant to adding the same constant + // to all input values, so we subtract the maximum input to avoid + // possible overflow. + float max_inp = input[0]; + for (int i = 1; i < n; i++) max_inp = AOMMAX(max_inp, input[i]); + float sum_out = 0.0f; + for (int i = 0; i < n; i++) { + output[i] = (float)exp(input[i] - max_inp); + sum_out += output[i]; + } + for (int i = 0; i < n; i++) output[i] /= sum_out; +} diff --git a/third_party/aom/av1/encoder/ml.h b/third_party/aom/av1/encoder/ml.h index 614cb60bb..cb8ef2871 100644 --- a/third_party/aom/av1/encoder/ml.h +++ b/third_party/aom/av1/encoder/ml.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_ENCODER_ML_H_ -#define AV1_ENCODER_ML_H_ +#ifndef AOM_AV1_ENCODER_ML_H_ +#define AOM_AV1_ENCODER_ML_H_ #ifdef __cplusplus extern "C" { @@ -37,8 +37,13 @@ typedef struct { void av1_nn_predict(const float *features, const NN_CONFIG *nn_config, float *output); +// Applies the softmax normalization function to the input +// to get a valid probability distribution in the output: +// output[i] = exp(input[i]) / sum_{k \in [0,n)}(exp(input[k])) +void av1_nn_softmax(const float *input, float *output, int n); + #ifdef __cplusplus } // extern "C" #endif -#endif // AV1_ENCODER_RD_H_ +#endif // AOM_AV1_ENCODER_ML_H_ diff --git a/third_party/aom/av1/encoder/palette.h b/third_party/aom/av1/encoder/palette.h index bbdd50784..8b88c4755 100644 --- a/third_party/aom/av1/encoder/palette.h +++ b/third_party/aom/av1/encoder/palette.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_ENCODER_PALETTE_H_ -#define AV1_ENCODER_PALETTE_H_ +#ifndef AOM_AV1_ENCODER_PALETTE_H_ +#define AOM_AV1_ENCODER_PALETTE_H_ #include "av1/common/blockd.h" @@ -93,4 +93,4 @@ int av1_palette_color_cost_uv(const PALETTE_MODE_INFO *const pmi, } // extern "C" #endif -#endif /* AV1_ENCODER_PALETTE_H_ */ +#endif // AOM_AV1_ENCODER_PALETTE_H_ diff --git a/third_party/aom/av1/encoder/partition_model_weights.h b/third_party/aom/av1/encoder/partition_model_weights.h index 279d39495..437ea43f9 100644 --- a/third_party/aom/av1/encoder/partition_model_weights.h +++ b/third_party/aom/av1/encoder/partition_model_weights.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_ENCODER_AB_PARTITION_MODEL_WEIGHTS_H_ -#define AV1_ENCODER_AB_PARTITION_MODEL_WEIGHTS_H_ +#ifndef AOM_AV1_ENCODER_PARTITION_MODEL_WEIGHTS_H_ +#define AOM_AV1_ENCODER_PARTITION_MODEL_WEIGHTS_H_ #ifdef __cplusplus extern "C" { @@ -1314,204 +1314,112 @@ static const NN_CONFIG av1_ab_partition_nnconfig_16 = { #define FEATURE_SIZE 18 #define LABEL_SIZE 4 -static const float av1_4_partition_nn_weights_16_layer0[FEATURE_SIZE * 48] = { - 0.121894f, 0.058485f, 0.702226f, 0.015457f, -0.123380f, -0.573450f, - 0.319576f, 0.118808f, 0.166057f, 0.526984f, 0.015211f, -0.025050f, - 0.085717f, -0.028221f, -0.580062f, -0.270530f, -0.092371f, 0.037679f, - 0.083573f, 0.007112f, -0.358623f, -0.264443f, -0.064819f, 0.022013f, - -0.040077f, -0.291967f, -0.293100f, 0.072266f, -0.270572f, -0.292253f, - -0.260105f, -0.294472f, -0.275752f, 0.054315f, 0.000085f, 0.105115f, - -0.363572f, -0.016542f, 0.185943f, -0.359903f, 0.038765f, -0.377668f, - 0.172692f, 0.127749f, -0.031275f, -0.242528f, -0.145880f, -0.055247f, - -0.000265f, -0.355224f, 0.089917f, -0.377841f, -0.209766f, 0.030899f, - 0.039546f, -0.375030f, -0.041605f, 0.137677f, 0.021282f, -0.150442f, - -0.189445f, 0.009293f, -0.316033f, 0.038745f, -0.278761f, 0.005692f, - -0.071763f, -0.302936f, -0.224572f, -0.211841f, 0.057503f, 0.005435f, - -0.930979f, 0.115513f, 0.689958f, 0.221318f, 1.003891f, 0.359540f, - -0.640534f, -0.162373f, -0.118105f, 0.205587f, 0.019710f, 0.025067f, - -0.025344f, 0.002831f, 0.033078f, 0.040175f, -0.007502f, 0.026272f, - 0.083443f, -0.880884f, 0.436948f, 0.293297f, 0.051678f, -0.133328f, - -0.180323f, 0.667835f, 0.070733f, -0.003060f, -0.221804f, 0.146601f, - 0.064024f, 0.056758f, -0.077361f, 0.105587f, -0.185500f, -0.133552f, - 0.138269f, 0.165055f, 0.628284f, 0.846449f, 0.058825f, 0.223157f, - 0.277896f, -0.381303f, 0.408241f, 0.643301f, 0.067494f, 0.120822f, - -0.182491f, -0.111373f, -0.033374f, 0.131387f, -0.114654f, 0.114318f, - 0.094718f, -0.052232f, 0.385903f, 1.212304f, 0.425305f, -0.052993f, - 0.291474f, -0.319730f, 0.023090f, -0.317259f, 0.011181f, -0.034185f, - -0.100671f, 0.186185f, -0.432511f, -0.115957f, -0.067746f, -0.177810f, - -0.226700f, 0.004464f, 0.006809f, 0.171360f, -0.080723f, 0.099826f, - -0.062301f, -0.358755f, -0.202549f, -0.084616f, -0.042313f, -0.325560f, - 0.010452f, -0.341089f, -0.013566f, -0.340129f, 0.034675f, -0.036518f, - -0.036473f, -0.192892f, 0.650235f, 0.609437f, -0.160982f, 0.125535f, - -1.004575f, 0.521969f, 1.318091f, 0.614004f, -0.106622f, -0.077453f, - -0.037328f, -0.081940f, 0.007640f, 0.026654f, -0.080332f, -0.077356f, - -0.288170f, -0.319680f, -0.131712f, -0.150985f, 0.073218f, 0.089502f, - -0.280502f, 0.003941f, -0.249937f, 0.244263f, 0.023269f, 0.080263f, - 0.073172f, -0.200036f, 0.022381f, 0.008592f, -0.339517f, -0.135073f, - 0.177199f, 0.208363f, 0.652360f, 0.272990f, 0.609535f, 0.145805f, - 0.022527f, -0.088378f, 0.205008f, 0.101021f, -0.019673f, -0.252681f, - 0.116034f, -0.062052f, 0.009991f, 0.138933f, -0.182428f, 0.052542f, - -0.350825f, -0.122654f, -0.154687f, 0.066747f, 0.021541f, -0.212169f, - -0.087093f, -0.087488f, 0.178129f, -0.146544f, 0.013919f, -0.273899f, - 0.223753f, -0.187327f, -0.118795f, -0.191892f, -0.355979f, 0.023794f, - -0.135236f, 0.058918f, 0.069080f, 0.279287f, 0.369689f, 1.134526f, - 0.659511f, 0.250223f, 0.286040f, 0.515284f, 0.067791f, -0.156385f, - 0.143283f, 0.050884f, 0.089956f, -0.040850f, -0.003650f, -0.081162f, - 0.086004f, 0.116578f, 0.826254f, 0.504869f, -0.196022f, -0.207279f, - 0.200503f, -0.196801f, 0.008211f, 0.411158f, -0.075855f, -0.036690f, - 0.111519f, -0.057838f, -0.005846f, 0.111067f, 0.174712f, -0.078054f, - 0.765897f, 0.018670f, -0.306960f, -0.020034f, -0.332875f, 0.662707f, - -0.461233f, -1.007542f, -0.693995f, -1.243352f, -0.014745f, 0.004036f, - -0.009141f, 0.003325f, -0.011233f, -0.000819f, 0.006369f, 0.002418f, - -0.035906f, -0.005135f, 1.073830f, 1.020736f, -0.182611f, -1.038976f, - -0.226695f, -0.375663f, 0.364568f, 0.620995f, -0.018615f, 0.011347f, - 0.045786f, 0.041077f, 0.010886f, -0.148428f, 0.028007f, -0.022322f, - -0.165985f, 0.233315f, -0.277531f, -0.329683f, -0.516967f, -0.390750f, - 0.006948f, 0.133744f, -0.375681f, -0.116877f, -0.009441f, -0.008597f, - -0.160679f, 0.102150f, -0.142647f, -0.117501f, 0.035035f, 0.228687f, - -1.117397f, -0.005171f, -0.008708f, 0.413042f, -0.298532f, 0.614909f, - -0.181084f, -0.711770f, 0.344033f, 0.287220f, -0.112848f, -0.052866f, - -0.222466f, 0.025029f, -0.107558f, 0.137036f, -0.276661f, -0.038808f, - -0.057448f, 0.037563f, 0.526020f, 0.447997f, 0.288366f, 0.264815f, - 0.319974f, -0.193091f, 0.353830f, 0.412950f, -0.280454f, 0.092737f, - 0.070919f, 0.043336f, 0.041214f, -0.052147f, 0.010860f, 0.191325f, - 0.079783f, -0.425672f, -0.053469f, -0.005495f, 0.184526f, -0.166171f, - 0.084459f, -0.042165f, -0.261759f, -0.248723f, -0.073483f, -0.377884f, - -0.189614f, -0.054146f, -0.261279f, 0.196347f, -0.087568f, 0.070533f, - -0.145492f, -0.041500f, -0.465861f, 0.077369f, 0.020645f, -0.440232f, - -0.414585f, -0.168627f, -0.050011f, -0.336676f, -0.344943f, -0.288140f, - 0.085513f, -0.200425f, 0.218516f, 0.049604f, -0.280952f, -0.242674f, - -1.969931f, 0.013374f, -0.039643f, 1.113947f, 0.018568f, 0.916330f, - -0.302934f, -0.225816f, 0.189529f, -0.361971f, 0.021073f, -0.050143f, - -0.041415f, 0.015126f, 0.018091f, -0.082401f, 0.017152f, 0.064856f, - 0.156170f, 0.145323f, -0.281409f, 0.213357f, -0.058966f, 0.158668f, - 0.033742f, 0.378820f, -0.662875f, -0.455532f, -0.702928f, 0.234325f, - 0.139627f, -1.360650f, 0.040921f, -0.044373f, -0.059999f, -0.048565f, - 0.115339f, -0.105888f, -0.170567f, -0.206097f, -0.349537f, 0.107941f, - -0.356286f, -0.374928f, 0.143257f, -0.317790f, 0.079875f, -0.359345f, - 0.081321f, -0.219772f, -0.077213f, 0.110624f, -0.252329f, -0.266481f, - 0.190135f, 0.121214f, 0.661064f, -0.037820f, -0.373068f, -0.065209f, - -0.286154f, -0.120695f, -0.110670f, -0.193589f, -0.010867f, -0.048054f, - -0.032010f, 0.110627f, 0.054094f, -0.884309f, -1.171623f, -0.386911f, - -0.756058f, 0.030362f, 0.563628f, -0.334227f, -0.111213f, 1.143898f, - -0.940454f, 0.084510f, 0.671010f, 0.312244f, -0.052592f, -0.014376f, - 0.039965f, -0.010763f, -0.114936f, -0.146020f, 0.015874f, 0.027439f, - -1.702315f, 0.148702f, 0.153021f, 0.363147f, -0.488933f, 0.220772f, - 0.640310f, -0.173911f, -0.169523f, -0.082261f, -0.014854f, 0.024414f, - 0.061041f, -0.013998f, 0.086539f, 0.000466f, 0.037472f, -0.010665f, - -0.326646f, 0.106971f, 0.405589f, 0.555345f, -0.318315f, 0.526498f, - 0.119246f, 0.022213f, 0.171237f, 0.214651f, 0.062904f, -0.023764f, - 0.011831f, 0.079644f, -0.096530f, -0.054373f, -0.306309f, -0.203709f, - -0.353217f, -0.350005f, -0.329549f, 0.062679f, -0.387625f, -0.237111f, - -0.025050f, -0.193987f, 0.002235f, -0.380821f, -0.051036f, -0.136020f, - 0.077989f, -0.361691f, 0.120485f, 0.157746f, 0.073394f, -0.284401f, - 0.113221f, 0.109808f, 0.000197f, 0.122523f, 0.081411f, -0.048544f, - -0.136577f, -0.007158f, -0.208952f, -0.276831f, 0.260479f, -1.392915f, - -0.865248f, 0.114577f, -0.000749f, -0.060338f, -0.091176f, -0.108421f, - 0.221256f, 0.100176f, -0.877560f, -1.248838f, 0.643005f, 0.064580f, - -0.049878f, 0.267988f, -0.434340f, -0.299254f, -0.097572f, 0.009606f, - 0.063810f, -0.090525f, 0.027760f, 0.043484f, 0.041697f, 0.108024f, - -0.359586f, -0.197090f, 0.121397f, 0.152206f, -0.391126f, -0.283145f, - 0.008754f, -0.059022f, -0.218745f, 0.043042f, -0.056716f, 0.153051f, - -0.210372f, -0.029681f, -0.288354f, 0.065242f, -0.189376f, 0.115013f, - -0.251488f, -0.533091f, 0.037768f, -0.319107f, -0.161364f, -0.103967f, - 0.063271f, -0.313289f, -0.312093f, -0.045239f, 0.150607f, 0.001487f, - 0.019602f, -0.338031f, -0.036214f, 0.112736f, -0.367762f, 0.122367f, - 0.094670f, 0.175590f, 0.301041f, -0.135257f, 0.539620f, 0.328619f, - -0.163971f, 0.137256f, 0.238805f, 0.483722f, 0.121353f, 0.083630f, - -0.283568f, 0.291661f, -0.061122f, -0.195295f, 0.153459f, -0.153727f, - -0.238839f, -0.071736f, 0.601437f, -0.664072f, 0.230827f, 0.198753f, - -0.039196f, 0.206751f, 0.529020f, 0.904132f, -0.219471f, 0.186694f, - -0.208608f, -0.093385f, -0.161617f, 0.003930f, -0.429869f, -0.123563f, - 0.626098f, -0.002495f, -0.245511f, -1.069848f, 0.296115f, -0.940267f, - -1.649122f, -0.512937f, -0.802874f, -1.000239f, -0.027629f, 0.020434f, - -0.003030f, 0.035986f, -0.004812f, -0.009193f, -0.004644f, -0.024347f, - 0.068439f, -0.314339f, 0.095057f, -0.212372f, 0.197523f, -0.040878f, - -0.272164f, -0.243326f, -0.204955f, 0.157199f, -0.049964f, -0.091537f, - -0.058012f, -0.306650f, 0.098621f, -0.146778f, -0.154447f, -0.177889f, - -0.009698f, 0.025427f, 0.350576f, -0.448237f, -0.068823f, 1.224960f, - -0.776883f, -0.692167f, -0.948497f, -0.492598f, 0.029440f, -0.056460f, - 0.021654f, 0.004352f, 0.041508f, -0.027179f, 0.006789f, -0.023573f, - 0.207775f, -0.280273f, -0.347984f, -0.129935f, 0.151512f, -0.087294f, - -0.494352f, -0.341424f, 0.044084f, -0.064080f, 0.073091f, -0.145574f, - 0.094715f, -0.258786f, -0.020419f, -0.401823f, 0.009397f, -0.138642f, - -0.034953f, -0.077419f, 0.636610f, 0.314980f, 1.110610f, -0.343368f, - 0.696647f, -0.649667f, 0.653491f, -0.096006f, -0.090469f, -0.066975f, - -0.105864f, -0.015666f, 0.102056f, -0.105344f, -0.273495f, -0.014686f, - 0.122031f, 0.139524f, -1.042029f, -0.562510f, 0.885644f, 1.088059f, - 0.189223f, 0.049404f, -0.167371f, 0.018703f, -0.208390f, -0.159002f, - -0.377130f, -0.151118f, 0.117861f, 0.026986f, -0.032433f, 0.081603f, - -0.106729f, -0.040134f, 0.015161f, 0.290572f, 0.241446f, 1.390085f, - 0.438915f, -0.358097f, -0.171799f, 0.879758f, -0.014110f, 0.029562f, - -0.073583f, -0.125817f, -0.036512f, -0.040275f, 0.037997f, 0.120979f, - 0.064538f, -0.038841f, 0.034797f, 0.110229f, -0.239779f, -0.004558f, - 0.226534f, 0.111286f, -0.268198f, 0.237673f, -0.328237f, -0.090774f, - -0.269690f, -0.202147f, -0.181808f, -0.305238f, 0.110058f, -0.169217f, - -0.300125f, 0.069031f, -0.081358f, -0.376174f, -0.349980f, 0.071443f, - -0.396278f, -0.389503f, -0.190410f, -0.014767f, -0.265229f, -0.099787f, - 0.079847f, -0.214580f, -0.235661f, -0.184227f, 0.111099f, -0.083945f, - -0.153809f, -0.284092f, -0.132497f, -0.154841f, -0.517157f, -0.640603f, - -0.357036f, -0.486142f, -0.182819f, -0.475022f, 0.079282f, 0.081168f, - -0.120831f, -0.016048f, -0.232495f, 0.214329f, -0.055058f, 0.032856f, - 0.061753f, 0.003226f, 0.097028f, 0.084535f, -1.563199f, 0.434928f, - -0.403710f, 0.520696f, -0.401696f, 0.450568f, -0.074121f, 0.076622f, - -0.098421f, 0.167036f, -0.255250f, -0.526313f, -0.933693f, -0.558104f, - 0.194341f, 0.173326f, 0.071112f, -0.651961f, -1.327587f, -0.705289f, - -1.138889f, 0.197167f, -0.714654f, -0.113891f, 0.080158f, 0.000301f, - 0.057905f, 0.060718f, -0.635995f, 0.100026f, -0.038239f, -0.025530f, -}; - -static const float av1_4_partition_nn_bias_16_layer0[48] = { - -0.079252f, -0.083606f, -0.112759f, -0.071622f, 0.444562f, 0.215649f, - -0.337661f, -0.242379f, -0.053829f, 0.165168f, -0.076613f, -0.190579f, - -0.060175f, -0.571661f, -0.454075f, -1.462711f, -0.161563f, -0.088748f, - -0.030279f, -0.456293f, -0.134473f, -0.194976f, 0.044373f, -0.503954f, - -0.083563f, 0.123344f, 0.011821f, 0.085445f, -0.050294f, -0.135194f, - 0.057815f, 0.543558f, -0.090602f, -0.104671f, -0.285075f, 0.354335f, - 1.037007f, -0.023879f, -0.025025f, -0.094408f, -0.101200f, -0.142105f, - -0.380607f, -0.059067f, -0.113017f, -0.137448f, -0.177840f, 0.468505f, -}; - -static const float av1_4_partition_nn_weights_16_layer1[48 * LABEL_SIZE] = { - 0.174954f, -0.239117f, 0.073252f, 0.258881f, 0.579781f, 0.441827f, - 0.372037f, -0.062362f, 0.068477f, 0.376811f, -0.130520f, 0.214951f, - -0.200674f, 0.240347f, 0.152954f, 1.360264f, 0.334630f, -0.064789f, - -0.270826f, 0.212699f, 0.045669f, -0.150852f, -0.412603f, 0.122481f, - -0.230246f, 0.005004f, 0.321417f, -0.554083f, -0.186742f, -0.197687f, - -0.028669f, -0.138559f, -0.117773f, 0.024953f, 0.326367f, -0.109951f, - -1.098959f, -0.136134f, 0.563218f, 0.191799f, 0.126191f, -0.093113f, - 0.185371f, 0.058468f, 0.245247f, -0.138064f, -0.471573f, -0.209372f, - -0.111171f, 0.222275f, -0.350556f, -0.106336f, 0.268877f, 0.090639f, - -0.083008f, -0.190791f, -0.243922f, -0.121182f, -0.133733f, -0.078450f, - 0.099751f, 0.353020f, -0.199079f, -0.463492f, -0.647884f, 0.166611f, - -0.464034f, 0.045096f, -0.312178f, -0.190972f, -0.468297f, 0.662376f, - -0.197071f, -0.653123f, -0.354365f, -0.088501f, -0.302671f, 0.140713f, - 0.885444f, 0.350273f, -0.003345f, 0.217260f, 0.219156f, 0.240653f, - 0.347840f, 0.101849f, -0.244565f, -0.166971f, 0.091056f, 0.319912f, - 0.268459f, 0.250726f, -0.155819f, -0.087588f, 0.010749f, -0.192344f, - 0.344808f, 0.223482f, -0.189563f, -0.067317f, -0.348191f, -0.085265f, - 0.259318f, 0.102408f, 0.096675f, -0.255564f, -0.168480f, -0.068189f, - -0.457704f, 0.010565f, 0.228573f, -0.124421f, 0.202488f, 0.148519f, - 0.002180f, 0.099099f, -0.179019f, 0.245414f, -0.038307f, 0.116897f, - -0.031377f, 0.368533f, -0.793891f, 0.148614f, 0.075441f, 0.102465f, - -0.310002f, -0.355369f, -0.206713f, -0.262276f, 0.068578f, -0.044980f, - 0.092689f, -0.181058f, 0.016279f, 0.155965f, 0.545361f, -0.390699f, - -0.042457f, 0.110238f, 0.114640f, 0.112525f, 0.522221f, 0.533164f, - -0.331720f, -0.212966f, 0.140823f, 0.251311f, -0.006092f, -0.800438f, - 0.007981f, -0.585140f, -0.006526f, 0.541683f, -0.298498f, 0.084322f, - -0.056467f, -0.361806f, -0.256347f, -1.419173f, -0.159093f, 0.023017f, - 0.667915f, -0.176995f, 0.022307f, -0.169493f, 0.581377f, 0.044929f, - 0.044914f, -0.056290f, 0.324196f, 0.648043f, -0.089381f, -0.054971f, - 0.064782f, 0.629356f, -0.003760f, -0.123822f, 0.144133f, -0.378821f, - 1.116858f, 0.128552f, -0.668783f, 0.207194f, -0.437781f, -0.283321f, - -0.549404f, 0.010538f, 0.208997f, 0.231396f, -0.174347f, 0.161910f, +static const float av1_4_partition_nn_weights_16_layer0[FEATURE_SIZE * 24] = { + -2.032866f, 0.056691f, 0.495960f, 0.778785f, 0.548153f, -0.806942f, + 0.481155f, 0.282298f, 0.584980f, 0.504688f, 0.209648f, 0.234616f, + 0.213484f, 0.221969f, 0.205862f, 0.235054f, 0.317863f, 0.257139f, + 0.529478f, 0.098122f, -0.657532f, 0.036296f, 0.327728f, 1.323180f, + -0.813082f, 0.160216f, -0.702030f, 0.722733f, -0.270576f, -0.347416f, + -0.264700f, -0.254248f, 0.159820f, 0.087995f, -0.184163f, 0.117357f, + 0.074194f, -0.667369f, 0.498246f, 0.420506f, 0.072409f, -0.121581f, + 0.315788f, 0.000525f, 0.414986f, 0.678166f, -0.011230f, 0.188131f, + -0.227749f, 0.009564f, 0.108672f, 0.106923f, -0.080695f, -0.279382f, + -0.061339f, -0.297835f, -0.134707f, 0.145865f, -0.009655f, -0.000842f, + -0.047436f, -0.159149f, -0.320353f, -0.089646f, -0.344765f, 0.313416f, + -0.143413f, 0.279668f, 0.000885f, -0.022380f, -0.140194f, -0.310473f, + 0.252699f, 0.066204f, 0.477568f, 0.994609f, -0.276000f, 1.213182f, + 0.277028f, -0.411570f, -0.211559f, 0.377815f, 0.121488f, -0.100559f, + -0.317082f, -0.251039f, -0.335181f, -0.154114f, -0.052726f, -0.332558f, + -0.143196f, -0.334035f, 0.162305f, 0.142279f, -0.001210f, -0.135252f, + -0.033562f, 0.204307f, -0.039757f, -0.394174f, 0.126617f, -0.128648f, + -0.410979f, 0.107641f, -0.117573f, -0.326512f, 0.235166f, 0.084959f, + 0.290063f, -0.005838f, 0.459894f, 1.023709f, -0.196145f, 1.100137f, + -0.319815f, -0.308526f, -0.443389f, -0.272769f, -0.035259f, -0.026932f, + -0.029743f, 0.125113f, -0.131024f, -0.321458f, -0.143996f, 0.008714f, + -0.101234f, 0.079706f, -1.128615f, -0.467381f, 0.220563f, -0.409900f, + -0.435353f, 0.759499f, -0.465799f, -0.394309f, 0.176282f, -0.086275f, + -0.161225f, -0.354814f, 0.562871f, 0.418253f, 0.414361f, 0.445480f, + -0.995903f, -0.086632f, -0.230645f, 0.354656f, -0.317576f, 0.079926f, + 0.424369f, 0.997232f, -0.304388f, 1.071667f, -0.023540f, 0.029677f, + 0.108564f, 0.183581f, -0.201395f, -0.054854f, -0.193039f, -0.049899f, + -0.271949f, -0.358483f, 0.304930f, 0.023823f, -0.009319f, -0.214247f, + 0.100712f, -0.050162f, 0.327103f, -0.212999f, -0.030496f, 0.316380f, + -0.439589f, -0.249959f, 0.229777f, -0.353664f, -0.384559f, 0.114236f, + 0.023119f, 0.007927f, 0.618368f, 0.957759f, -0.019780f, -1.002389f, + 0.564277f, -0.839531f, 1.040445f, 0.054340f, 0.031908f, -0.032893f, + -0.019170f, -0.042011f, 0.568928f, 0.362567f, -0.559999f, -0.605344f, + -0.586146f, -0.290778f, 0.195943f, -0.109580f, -0.088898f, -0.113054f, + 0.293282f, 0.429019f, 0.306136f, 0.863025f, 0.021234f, 0.125770f, + -0.097108f, -0.072659f, -0.137053f, -0.191631f, 0.106281f, 0.064151f, + 0.029883f, 0.076287f, 0.757543f, 0.276713f, -2.529775f, -0.351727f, + -1.832316f, 0.544780f, -0.944529f, 0.509705f, -0.010236f, -0.016181f, + 0.021520f, 0.086417f, 0.041312f, 0.296853f, -0.372378f, 0.354446f, + -1.366762f, 0.048875f, 0.464918f, -0.007450f, 0.750013f, -0.360261f, + 0.518532f, 0.753776f, 0.641448f, 0.710746f, 0.250866f, 0.257063f, + 0.283421f, 0.253585f, 0.170303f, 0.210426f, 0.208842f, 0.158000f, + -0.033144f, 0.130748f, 0.907147f, 0.409248f, -0.854301f, -0.981307f, + 0.294427f, -0.507137f, 1.079967f, 0.203203f, 0.383890f, 0.368278f, + 0.305122f, 0.449288f, -0.044507f, -0.547263f, -0.298245f, -0.497834f, + 0.007016f, -0.101982f, -0.073488f, -0.096111f, -0.479418f, -0.045497f, + 0.033502f, -0.018578f, -0.231531f, 0.177949f, 0.099564f, -0.010233f, + -0.333055f, -0.078586f, -0.417867f, 0.171271f, 0.013662f, -0.143599f, + -0.117296f, 0.135382f, 0.048321f, 0.000924f, -0.055024f, -0.405595f, + -0.068260f, -0.271011f, -0.436425f, 0.206751f, -0.899890f, 0.605510f, + 0.535649f, -0.238919f, -0.037619f, -0.213734f, -0.391360f, -0.132344f, + 0.004660f, 0.176644f, -1.008475f, -0.038895f, 0.155429f, -0.095229f, + -0.680124f, -0.258063f, -0.261901f, 0.110380f, -0.337649f, -0.505870f, + -1.428536f, 0.610629f, 0.254905f, 0.045098f, 0.044109f, 0.172329f, + 0.060001f, -0.234009f, -0.184855f, -0.153028f, -0.140897f, -0.152006f, + -0.312134f, 0.081261f, 0.160166f, 0.112690f, 0.266081f, 0.030175f, + -0.242746f, 0.000754f, -0.341811f, -0.149774f, -0.017484f, -0.301342f, + -0.121466f, 0.067300f, 0.342176f, 0.474538f, 0.085441f, -0.263935f, + 0.479235f, -0.003713f, -0.784840f, 0.119480f, 0.456632f, -0.640082f, + -0.080575f, -0.744403f, 0.259970f, 0.034667f, -0.274641f, -0.257594f, + -1.121124f, -0.003745f, -0.420693f, 0.300441f, -0.100976f, -1.049016f, + 0.201960f, 0.113054f, 0.187010f, 1.237427f, 0.054803f, -0.028673f, + 0.003596f, -0.034724f, 0.117246f, 0.190977f, 0.278915f, 0.224307f, + 0.017852f, -0.336233f, -0.372311f, -0.182284f, -0.143510f, 0.331466f, + 0.045698f, -0.301095f, 0.184447f, 0.348240f, -0.017021f, -0.145064f, + -0.000221f, -0.382256f, -0.302683f, -0.083927f, -0.008070f, 0.217907f, + 0.647597f, -0.050490f, -0.572736f, -0.985748f, -0.289943f, 0.041391f, + -0.795464f, -0.186680f, -0.354062f, -0.617400f, -0.282783f, -0.170450f, + -0.197197f, -0.146496f, -0.173692f, -0.106277f, -0.071004f, -0.124405f, + -0.971412f, 0.038542f, 0.705204f, 0.887113f, 0.150430f, -0.243676f, + 0.638410f, 0.320953f, 0.776676f, 0.527584f, 0.070389f, 0.051554f, + 0.177519f, 0.140451f, 0.128892f, 0.087771f, 0.197660f, 0.194764f, +}; + +static const float av1_4_partition_nn_bias_16_layer0[24] = { + 0.614063f, -0.384872f, 0.084884f, -0.023980f, -0.378765f, -0.082312f, + -0.458271f, 0.189578f, -0.046169f, -0.073308f, -0.372322f, 0.162793f, + 0.148803f, 0.829214f, -0.221162f, -0.111157f, -0.017484f, -0.280596f, + -0.031905f, -0.143459f, 0.078823f, -0.021940f, 0.026834f, 0.257472f, +}; + +static const float av1_4_partition_nn_weights_16_layer1[24 * LABEL_SIZE] = { + -0.985391f, 0.587616f, 0.740683f, 0.192066f, 0.447080f, -0.016585f, + 0.680449f, 0.028983f, 0.643111f, 0.234338f, 0.107148f, 0.328456f, + -0.216394f, 1.106838f, -0.179062f, -0.129108f, -0.121655f, -0.151340f, + -0.306017f, -0.350989f, 0.859284f, -0.372831f, -0.954419f, 0.250495f, + 1.046732f, 0.287923f, -0.421088f, 0.326613f, -0.314396f, -0.084757f, + -0.474228f, 0.687999f, 0.052334f, 0.441708f, -0.630698f, -0.350348f, + -0.602067f, -0.434161f, -0.489824f, -0.313193f, 0.315568f, 0.603119f, + 0.120245f, 0.182920f, -1.117797f, -0.239594f, -0.296296f, -0.718093f, + 0.489497f, -0.527019f, 0.102453f, 0.426731f, 0.034606f, 0.311461f, + -0.012723f, -0.229877f, -0.284290f, 0.383227f, 0.065696f, -0.222400f, + 1.279248f, -0.862190f, 0.629766f, -0.250011f, -0.325060f, -0.360115f, + -0.159540f, -0.291856f, -0.038348f, 0.224639f, 0.600934f, 0.030205f, + 1.337615f, -0.286409f, -0.473710f, -0.418995f, -1.035249f, 0.004359f, + -0.481860f, 0.563625f, -0.154709f, -0.101198f, -0.758796f, -0.507616f, + -0.095253f, -0.711135f, 0.207759f, 0.076313f, -0.056087f, -0.162719f, + -0.232918f, -0.128402f, -0.444620f, -0.447344f, 1.126012f, -1.504446f, }; static const float av1_4_partition_nn_bias_16_layer1[LABEL_SIZE] = { - -0.197883f, - -0.136696f, - 0.094115f, - 0.612799f, + -0.462133f, + 0.465060f, + 0.062211f, + 0.401786f, }; static const NN_CONFIG av1_4_partition_nnconfig_16 = { @@ -1519,7 +1427,7 @@ static const NN_CONFIG av1_4_partition_nnconfig_16 = { LABEL_SIZE, // num_outputs 1, // num_hidden_layers { - 48, // num_hidden_nodes + 24, // num_hidden_nodes }, { av1_4_partition_nn_weights_16_layer0, @@ -1532,143 +1440,143 @@ static const NN_CONFIG av1_4_partition_nnconfig_16 = { }; static const float av1_4_partition_nn_weights_32_layer0[FEATURE_SIZE * 32] = { - 0.114554f, 0.043669f, 0.313291f, 0.167688f, -0.413357f, 0.088232f, - 0.301915f, -0.358117f, 0.267711f, -0.252716f, -0.038531f, -0.032805f, - -0.025382f, 0.023624f, -0.949694f, -0.065480f, -0.375721f, -0.697319f, - -0.117387f, -0.204309f, -0.190797f, -0.223867f, -0.190248f, 0.026668f, - 0.199717f, 0.216902f, -0.239241f, -0.096894f, -0.225046f, 0.246523f, - 0.002333f, -0.254385f, -0.205815f, 0.123139f, -0.476923f, 0.137557f, - 0.059686f, -0.124013f, 0.974675f, 0.889753f, 0.378940f, 0.526413f, - -0.208747f, -0.001913f, 0.094081f, 0.848010f, 0.062042f, 0.159831f, - 0.071016f, 0.024437f, 0.212611f, 0.039501f, -0.149922f, -0.055229f, - -0.229270f, 0.129004f, -0.182803f, 0.291223f, -1.197804f, -0.916991f, - -0.024095f, 0.738729f, -0.300326f, 0.402480f, 0.023944f, -0.022613f, - -0.004554f, 0.001784f, 0.035143f, -0.202237f, 0.080252f, -0.003912f, - -0.040345f, -0.121881f, 0.126672f, 0.093507f, -0.081305f, -0.081099f, - -0.218824f, -0.459254f, -0.055250f, -0.095096f, 0.207278f, 0.245259f, - -0.380849f, -0.334458f, -0.351449f, -0.513045f, -0.407823f, -0.222423f, - 0.103205f, -0.299965f, -0.211472f, -0.348690f, -0.283688f, -0.152743f, - -0.204005f, -0.173636f, 0.020302f, -0.109112f, 0.081203f, -0.137344f, - -0.364582f, -0.343133f, -0.176167f, -0.446541f, 0.144844f, -0.268105f, - -0.003889f, -0.309560f, -0.236092f, -0.299450f, 0.248269f, 0.207510f, - -0.279023f, -0.272472f, -0.166427f, 0.205973f, -0.345692f, -0.238400f, - -0.319178f, -0.327246f, -0.321756f, 0.043191f, -0.027520f, -0.029310f, - 0.161379f, 0.031154f, -0.605365f, -0.230926f, 0.261142f, -0.262678f, - -0.373351f, -0.326245f, 0.279222f, 0.684357f, -0.864302f, 0.036132f, - 0.239307f, 0.136262f, 0.124002f, -0.410379f, -0.172722f, -0.376670f, - -0.195889f, 0.037292f, -0.055295f, 1.022308f, 0.237600f, -0.618435f, - 0.366154f, 0.168308f, -0.473467f, -0.756558f, -0.044830f, 0.019057f, - -0.084214f, -0.007789f, -0.066028f, -0.074562f, 0.002082f, 0.001007f, - -0.269676f, -0.164768f, -0.027271f, -0.098935f, 0.009431f, 0.254431f, - 0.124238f, -0.198181f, 0.142723f, -0.112997f, -0.164224f, -0.355160f, - 0.135330f, -0.379557f, 0.079392f, 0.210607f, -0.354927f, -0.277678f, - -0.931111f, 0.056208f, -0.347710f, -0.355415f, 0.826145f, 0.390625f, - 0.374414f, -0.205685f, 0.562485f, 0.152288f, 0.130635f, 0.056622f, - 0.057972f, 0.095526f, -0.082436f, -0.085938f, -0.070570f, -0.087634f, - 0.335934f, 0.084860f, 0.544424f, -0.278917f, 0.476740f, 0.050927f, - -1.288817f, -0.078320f, -0.553041f, -0.160538f, -0.109365f, -0.127146f, - -0.032524f, -0.105117f, -0.182965f, -0.024723f, 0.083317f, 0.060073f, - -0.042945f, 0.015249f, 1.241504f, 0.662613f, 0.530496f, -0.180519f, - -1.099086f, -0.825844f, 0.551856f, -0.025009f, -0.006619f, -0.001049f, - 0.014828f, -0.035166f, -0.241091f, -0.136364f, -0.003219f, -0.014581f, - -0.379945f, -0.226191f, -0.161241f, -0.496390f, -0.147175f, -0.118004f, - -0.128206f, -0.389770f, -0.184288f, -0.119076f, -0.379211f, 0.236180f, - -0.468730f, -0.175170f, 0.136433f, 0.167739f, -0.377602f, 0.135772f, - 0.040972f, -0.193974f, -0.319475f, -0.016469f, -0.412027f, -0.322605f, - 0.111125f, -0.078456f, -0.387234f, -0.401605f, -0.088717f, -0.340682f, - 0.010556f, 0.058256f, -0.127352f, 0.017665f, 0.072632f, -0.171966f, - -0.117342f, -0.166050f, -0.182689f, -0.073182f, 0.096279f, -0.260229f, - 0.025216f, -0.332236f, -0.218706f, -0.200153f, -0.110303f, 0.073499f, - -0.280123f, 0.132262f, -0.308330f, -0.119036f, -0.303874f, -0.065445f, - -0.412137f, 0.057167f, 0.044582f, -0.330952f, -0.232572f, 0.039732f, - -0.326877f, -0.300569f, -0.467164f, -0.371499f, 0.034430f, 0.058277f, - -0.042485f, -0.409028f, -0.110889f, -0.500758f, -0.343141f, 0.042023f, - -1.071050f, 0.086854f, -0.004932f, -0.259698f, 0.125301f, -0.742663f, - -0.370517f, -0.772840f, 0.193628f, 0.554676f, 0.051283f, -0.196639f, - 0.040344f, 0.027391f, -0.040501f, 0.038303f, 0.032972f, -0.014638f, - 0.097720f, -0.206897f, -0.015480f, 0.008543f, 0.034469f, 0.127234f, - -0.396463f, -0.390189f, 0.117538f, -0.435622f, 0.043420f, -0.241987f, - -0.118254f, -0.190349f, 0.190273f, -0.085625f, -0.141253f, -0.377438f, - -0.249211f, 0.214512f, -0.363191f, -0.754851f, 0.238045f, 1.127635f, - 0.173947f, -0.357620f, 0.073671f, 0.220617f, 0.072067f, -0.076214f, - -0.044583f, -0.018371f, 0.010952f, -0.135116f, 0.076597f, 0.034480f, - -0.070212f, -0.454429f, -0.135215f, 0.163851f, -0.625990f, -0.283991f, - 0.284051f, 0.182935f, -0.048717f, 0.002484f, -0.009086f, 0.321724f, - 0.125162f, -0.069624f, -0.430299f, -0.007224f, -0.284725f, -0.475662f, - 0.123807f, -0.313614f, -0.103142f, 0.072125f, 0.100320f, -0.185558f, - -0.481522f, -0.247311f, -0.386762f, -0.258850f, 0.178844f, -0.381231f, - -0.436001f, -0.374834f, 0.230104f, -0.500679f, 0.170880f, 0.029657f, - -0.105857f, -0.366671f, -0.268833f, 0.036885f, -0.026776f, 0.037837f, - -0.362095f, -0.254933f, 0.129650f, 0.007945f, -0.304715f, -0.100813f, - -0.342849f, -0.269223f, 0.178490f, 0.186735f, -0.353995f, 0.050381f, - -0.440186f, 0.025985f, 1.096969f, 1.132937f, 0.581545f, 0.271734f, - -0.109169f, -0.014239f, 0.688644f, 0.602702f, 0.048616f, 0.022335f, - 0.037545f, 0.081667f, -0.109038f, -0.088565f, -0.002506f, -0.041420f, - -0.132515f, 0.187312f, 0.677273f, 1.111182f, 0.199096f, -0.211551f, - -0.896508f, 0.257981f, 0.007803f, 0.160343f, -0.124864f, -0.097150f, - 0.225090f, 0.242900f, -0.195665f, 0.011310f, 0.160765f, 0.169195f, - -0.081994f, -0.017372f, -0.566190f, -0.902086f, 0.027768f, 0.511419f, - 0.076009f, -0.165861f, 0.240487f, 0.006298f, -0.153334f, 0.041249f, - 0.387092f, 0.313011f, -0.032269f, 0.019024f, 0.052568f, 0.124247f, - 0.197640f, 0.002537f, 0.651044f, 0.829828f, -0.446444f, -0.402042f, - -0.469399f, -0.019842f, 0.371960f, 0.140373f, -0.044808f, 0.008283f, - 0.093791f, 0.052149f, 0.143123f, -0.449571f, -0.868816f, -0.265661f, - -0.225232f, -0.014704f, 0.543836f, -0.374498f, 0.561647f, 1.309445f, - 0.056789f, -0.048447f, 0.255758f, 0.644553f, -0.124802f, 0.097419f, - -0.149336f, 0.021596f, -0.043699f, 0.057591f, -0.000077f, 0.034488f, - -0.049353f, -0.007799f, 0.437914f, 0.509369f, 0.674428f, 1.858949f, - -0.205964f, 0.060776f, 0.184213f, 0.037177f, -0.062535f, -0.115408f, - 0.076498f, 0.010235f, -0.142253f, 0.009983f, 0.073436f, 0.038716f, - -0.369983f, -0.185959f, -0.137867f, 0.032134f, 0.213814f, -0.125571f, - 0.247874f, -0.166871f, -0.160890f, 0.147029f, 0.267143f, -0.298488f, - -0.210203f, -0.188313f, -0.085024f, -0.244962f, -0.189833f, -0.261242f, - 0.399519f, 0.143200f, -0.776419f, -0.374639f, -0.022066f, 0.582904f, - 0.006430f, -0.139134f, -0.491894f, -0.430579f, -0.358221f, -0.231365f, - -0.398255f, -0.173231f, 0.211789f, -0.036121f, -0.266856f, 0.042956f, - -1.138513f, -0.070313f, 0.158803f, 0.406989f, -0.015974f, 0.651020f, - -0.468982f, -0.310019f, 0.416922f, 0.895162f, 0.019921f, 0.004023f, - 0.006962f, 0.000863f, -0.216395f, -0.074913f, -0.002613f, 0.026703f, + -0.219494f, -0.428273f, 0.471006f, 0.448210f, -0.152935f, 0.440435f, + 0.922857f, -0.074436f, 1.002195f, 0.414176f, -0.327202f, -0.380066f, + -0.212346f, 0.061868f, -0.056620f, 0.594134f, 0.617995f, 0.308358f, + 0.232484f, 0.129849f, 1.483593f, -0.071460f, 1.984515f, 1.116422f, + -1.141762f, -0.306220f, 0.089075f, -0.271845f, 0.187524f, 0.050396f, + -0.061025f, 0.030809f, 0.172799f, -0.458151f, -0.318357f, 0.122052f, + -0.414329f, 0.089366f, 0.118898f, -0.376213f, -0.206151f, -0.519946f, + -0.463252f, -0.206694f, -0.254383f, -0.379487f, 0.093059f, -0.245280f, + -0.205044f, -0.280060f, -0.171229f, -0.045389f, -0.179481f, -0.306245f, + -0.500856f, 0.003388f, -0.527397f, -0.449330f, -0.174272f, 0.123769f, + 0.023005f, 0.157273f, 0.073400f, 0.019099f, -0.113848f, -0.098601f, + -0.290946f, -0.046770f, -0.314592f, -0.179914f, -0.391411f, -0.235631f, + -1.282604f, 0.048505f, -0.746382f, 0.093740f, -0.706583f, -0.085729f, + 0.947382f, -0.002961f, 1.175362f, 1.007309f, 0.141638f, -0.037608f, + -0.118807f, -0.021474f, -0.146763f, 0.069363f, -0.074372f, -0.215713f, + -0.004134f, -0.114110f, -0.330438f, -0.031136f, 0.111821f, -0.534598f, + -0.357759f, -0.455950f, 0.139469f, 0.036582f, -0.384743f, -0.168828f, + -0.239250f, 0.003520f, -0.049003f, 0.075702f, -0.025809f, -0.225972f, + -0.228905f, -0.412489f, 0.060570f, -0.328819f, -0.206446f, -0.080231f, + -0.372008f, -0.218118f, -0.011954f, 0.024155f, 0.156014f, 0.020679f, + 0.194398f, -0.283491f, -0.024463f, -0.275099f, 0.028031f, 0.026340f, + -0.254668f, 0.103637f, 2.178693f, 0.552284f, 0.109366f, -0.474806f, + -0.379286f, -0.026315f, 2.487924f, -0.089466f, 0.206428f, 0.114578f, + 0.152248f, 0.184050f, -0.631948f, -0.014793f, -0.283782f, -0.830353f, + 0.009343f, -0.021029f, -0.060534f, -0.025164f, 1.841311f, 1.842748f, + -1.979708f, 0.450985f, -1.606357f, -0.785454f, -0.212679f, -0.344342f, + 0.198991f, -0.258070f, 0.055974f, 0.224069f, 0.453051f, 0.408053f, + 0.027873f, -0.180538f, 0.056609f, 0.207654f, 0.104086f, -0.194426f, + -0.359789f, -0.381143f, -0.331212f, -0.203973f, -0.324313f, -0.160825f, + -0.160439f, -0.044856f, -0.346647f, 0.044859f, 0.231398f, -0.023643f, + -0.140316f, -0.260177f, 0.206965f, -0.425386f, -0.420268f, -0.409748f, + 0.006971f, 0.066186f, -0.034950f, -0.345518f, 0.018633f, -0.122489f, + -0.038506f, -0.330942f, 0.161236f, -0.314119f, -0.050202f, -0.179597f, + 0.731897f, -0.184481f, 0.153598f, -0.539501f, -0.301493f, -0.184967f, + -0.883754f, -0.586959f, -0.136292f, -1.772065f, -0.196276f, -0.053272f, + -0.101083f, -0.064142f, 0.161190f, 0.430826f, 0.355647f, 0.138266f, + 0.051114f, -0.028893f, -0.477673f, -0.238663f, -0.354117f, -0.056747f, + -0.334273f, -0.497688f, -0.486004f, -0.092033f, -0.241304f, -0.373250f, + 0.120193f, 0.011360f, -0.010475f, -0.092739f, -0.159650f, -0.033129f, + -0.259893f, -0.073217f, 0.200128f, 0.103407f, -0.229233f, 0.128831f, + -0.063450f, -0.241732f, -0.408428f, -0.342239f, -0.264326f, -0.105403f, + -0.442879f, -0.310456f, -0.112881f, 0.263696f, -0.205014f, -0.497936f, + -0.261734f, -0.382312f, -0.426807f, -0.021995f, -0.152794f, -0.301494f, + 0.117232f, -0.577809f, 0.154596f, -0.409522f, -0.413113f, -0.359199f, + 0.307294f, -0.008746f, -0.310522f, 0.347620f, -0.384845f, -0.451398f, + -0.226199f, 0.054154f, -0.167608f, 0.046836f, -0.013285f, -0.408119f, + -0.177973f, -0.248293f, -0.465830f, 0.035827f, -0.222208f, -0.221717f, + 0.066392f, -0.349769f, -0.428029f, -0.516692f, 0.022398f, -0.251682f, + 0.134746f, 0.011167f, -2.078787f, 0.173592f, -1.948348f, 0.330060f, + 1.993785f, -0.052859f, -0.004795f, -3.703177f, 0.013450f, -0.011687f, + 0.073079f, 0.034803f, 0.025515f, 0.005994f, 0.101731f, 0.074303f, + -0.109962f, -0.270825f, -0.068273f, -0.163268f, -0.252826f, 0.137190f, + 0.007667f, -0.358453f, 0.027412f, 0.033492f, 0.021197f, -0.049991f, + 0.104468f, -0.012157f, -0.056252f, -0.380756f, -0.338483f, 0.233235f, + -0.048631f, -0.441209f, -0.158482f, -0.148108f, -0.263453f, 0.138847f, + -0.304073f, -0.336312f, -0.017941f, -0.135563f, 0.075137f, -0.246475f, + -0.229144f, -0.087744f, -0.346909f, 0.172611f, 0.004377f, -0.009386f, + -0.023104f, 0.008000f, -0.029390f, -0.317842f, 0.549674f, -0.195337f, + -0.863979f, 0.160889f, -0.269014f, -0.442104f, -1.799191f, 1.396533f, + -0.112837f, 0.881303f, 0.000764f, -0.035415f, -0.141877f, 0.184831f, + -0.363566f, -0.178569f, 0.254134f, -0.326893f, 0.127325f, 0.310620f, + -0.384621f, 0.146058f, -0.287682f, -0.373447f, 0.026930f, 0.251650f, + 0.053817f, 0.227509f, 0.121396f, 0.396514f, -0.278381f, -0.038969f, + -1.538756f, -0.002856f, -0.892900f, 0.363426f, -1.257922f, 0.743795f, + 0.941177f, 0.219345f, 0.684189f, 1.396858f, 0.026299f, -0.093433f, + -0.066182f, 0.057868f, -0.089278f, -0.159680f, -0.262035f, -0.236656f, + 0.005349f, -0.031314f, 0.027917f, -0.182113f, -0.212086f, -0.160774f, + 0.051468f, 0.036787f, 0.183881f, -0.288205f, -0.349691f, 0.162511f, + 0.117878f, -0.294534f, -0.365037f, -0.246313f, 0.073977f, -0.072378f, + -0.173579f, -0.584560f, 0.547194f, 0.259853f, -0.405287f, -0.421146f, + 0.165788f, -0.146964f, 0.257415f, 0.772394f, -0.475302f, -0.310906f, + 0.058723f, 0.276833f, 0.586842f, 0.248998f, -0.061135f, 0.255779f, + 0.152158f, -0.024781f, 2.821834f, 1.365141f, 0.914744f, 0.165752f, + -1.048304f, -0.333891f, 1.804087f, -0.437028f, -0.120211f, -0.020443f, + 0.040077f, 0.258600f, -0.598893f, -0.494579f, -0.281054f, -0.517041f, + 0.005258f, 0.053986f, 0.322755f, 0.429495f, -1.992364f, -0.717192f, + -1.774802f, 2.047362f, -0.016194f, 0.312606f, 0.019331f, 0.060950f, + 0.116428f, 0.168458f, -0.307001f, -0.420734f, 0.475843f, 0.425346f, + -0.107119f, 0.049892f, -1.168619f, 0.010878f, 0.354872f, 0.902717f, + -0.391407f, 0.332772f, -1.335037f, -0.447100f, 0.481719f, -0.101069f, + -1.806565f, 0.925280f, 0.346999f, 0.093809f, 0.006275f, 0.270814f, + -0.691123f, 0.230748f, 0.137033f, 0.068228f, 1.555975f, -0.271637f, + -0.370403f, 0.236131f, 0.367464f, -0.136562f, 0.428838f, 0.181750f, + 0.338762f, 0.292449f, -0.748204f, -0.922731f, -0.959445f, -0.806418f, + -0.140501f, 0.070525f, 1.248748f, 0.637990f, -1.307246f, -0.514055f, + 0.393858f, -1.858727f, 0.713591f, -0.141044f, 0.080723f, 0.120220f, + -0.031175f, 0.224488f, 0.753818f, -0.833351f, -1.099132f, 0.651100f, + -0.135061f, -0.043820f, 0.026983f, -0.059259f, 0.001345f, -0.281775f, + 0.006958f, 0.046103f, -0.246539f, 0.057630f, -0.360778f, -0.160681f, + -0.414870f, -0.301979f, 0.000683f, 0.132957f, -0.477609f, 0.106110f, + -0.637769f, -0.078374f, -0.229494f, 0.583108f, -0.822973f, -0.107540f, + 1.063426f, -0.268346f, 1.105787f, 2.587550f, -0.020314f, -0.002161f, + -0.063836f, -0.099990f, -0.103975f, -0.114078f, -0.094199f, -0.065181f, + -0.019870f, -0.018920f, -0.219732f, 0.035608f, -1.789450f, 0.483032f, + -0.464729f, 1.563277f, -1.054195f, 0.359991f, 0.065204f, 0.135623f, + 0.158380f, -0.103815f, -1.398726f, -1.436666f, -0.356311f, 0.507752f, }; static const float av1_4_partition_nn_bias_32_layer0[32] = { - 0.133615f, -0.113389f, -0.575989f, 0.589389f, -0.193574f, -0.132463f, - 0.000000f, 0.060317f, 0.264577f, -0.060599f, 0.540147f, -0.127782f, - -0.548802f, -0.172235f, -0.193032f, -0.026301f, -0.177527f, 0.267821f, - -0.115455f, -0.137162f, -0.079595f, -0.041443f, -0.043856f, -0.657220f, - -0.448931f, 0.446300f, 0.250002f, 0.223559f, -0.647723f, -0.014369f, - 0.084333f, -0.056270f, + 0.421645f, -0.620548f, -0.187819f, -0.189414f, -0.204975f, -0.189600f, + -0.174917f, -0.651928f, -0.799655f, -0.086105f, -0.163449f, -0.089212f, + -0.214495f, -0.108500f, -0.065777f, -0.127704f, 1.544948f, -0.032831f, + -0.165621f, 0.145844f, -0.032104f, -0.453246f, -0.113444f, 0.321589f, + -0.862375f, -0.108826f, -0.486259f, 0.685325f, 0.072569f, -0.187961f, + 0.109579f, -0.082685f, }; static const float av1_4_partition_nn_weights_32_layer1[32 * LABEL_SIZE] = { - -0.069633f, -0.087239f, 0.365816f, -0.068579f, 0.231198f, -0.067856f, - -0.139892f, -0.100235f, -0.488166f, -0.150112f, -0.005546f, 0.210832f, - 0.778888f, 0.169624f, 0.089968f, -0.243569f, 0.353483f, 0.032296f, - -0.157408f, 0.286885f, -0.063537f, -0.324055f, -0.161464f, 0.430600f, - 0.277707f, -0.196463f, 0.154647f, 0.059804f, 0.176408f, 0.303179f, - -0.040156f, 0.375810f, -0.363032f, -0.186808f, -0.264561f, -0.158937f, - -0.007949f, -0.076394f, 0.056475f, 0.308528f, 0.695387f, 0.051336f, - 0.433063f, -0.229948f, -1.210712f, 0.036286f, 0.183868f, -0.117660f, - 0.230134f, -0.093469f, 0.237918f, 0.625986f, -0.236671f, -0.377172f, - 0.331091f, -0.394004f, -0.214349f, 0.243940f, -0.600348f, 0.069843f, - 0.088325f, 0.225775f, 0.276884f, -0.604493f, 0.769812f, 0.259574f, - 0.086220f, 0.511515f, -0.282584f, -0.157719f, 0.278778f, -0.332732f, - 0.068985f, -0.237236f, -0.006102f, -0.154883f, 0.710288f, -0.245896f, - -0.255895f, -0.398038f, 0.304084f, -0.317065f, 0.192609f, -0.235613f, - 0.461340f, 0.117194f, 0.116817f, 0.196150f, 0.421622f, -0.264495f, - 0.617852f, -0.351756f, -0.310016f, 0.135932f, -0.242622f, -0.073094f, - 0.042077f, 0.039230f, -0.482715f, 0.553187f, 0.360637f, 0.313484f, - -0.131540f, -0.104731f, 0.374704f, 0.222173f, 0.437657f, 0.029827f, - -0.545156f, -0.203176f, 0.267824f, 0.169237f, -0.057871f, 0.552197f, - 0.272243f, 0.025681f, -0.262192f, 0.255934f, -0.202407f, -0.483317f, - -0.204721f, 0.288807f, -0.030735f, -0.047161f, -0.780724f, 0.381939f, - -0.295318f, 0.537378f, + 0.255012f, 0.658860f, 0.216907f, 0.165947f, 0.241182f, 0.340854f, + 0.409445f, 0.165220f, 0.553373f, -0.242385f, -0.209571f, 0.255515f, + 0.222500f, 0.037032f, 0.238590f, 0.061624f, -2.038693f, 0.264167f, + -0.230144f, 0.129952f, -0.027979f, 0.847761f, 0.438922f, 0.462323f, + 0.555345f, 0.030689f, 0.336357f, -0.357326f, -0.113137f, 0.272631f, + 0.421022f, 0.367776f, -0.197094f, 0.157117f, -0.015008f, -0.056123f, + -0.283913f, 0.186417f, 0.178561f, -0.763041f, 0.602038f, 0.341092f, + 0.320453f, -0.312776f, -0.371240f, -0.356279f, 0.220117f, -0.131871f, + 1.517429f, 0.162223f, -0.255069f, 0.451861f, 0.045071f, -0.223257f, + 0.003257f, 0.015734f, -0.630447f, -0.672588f, 0.670164f, 0.571031f, + -0.657948f, 0.034506f, -0.249076f, 0.790293f, 0.066491f, -0.131245f, + 0.355173f, 0.564622f, 0.374048f, 0.033974f, 0.253970f, 0.495498f, + -0.556321f, -0.104651f, 0.276947f, 0.057148f, -0.039126f, -0.170050f, + -0.141542f, 0.158541f, 0.582763f, -0.100992f, 0.096705f, -0.209029f, + 0.008449f, 0.255865f, 0.103565f, 0.317719f, 0.479499f, 0.599126f, + -0.065613f, -0.268614f, 0.508736f, 0.180813f, -0.815868f, 0.051238f, + 0.001223f, -0.305423f, -0.270079f, 0.036180f, 0.304342f, 0.202634f, + 0.218348f, -0.304304f, -0.438297f, 0.241123f, 0.200230f, 0.151804f, + 0.051944f, 0.160422f, -0.262981f, -0.417412f, 1.845729f, -0.086183f, + 0.403517f, 0.059667f, 0.564543f, -0.081752f, 0.114907f, -0.284489f, + -0.673943f, 0.056965f, 0.362221f, 0.403224f, -0.000233f, -0.209552f, + -0.800926f, -0.134132f, }; static const float av1_4_partition_nn_bias_32_layer1[LABEL_SIZE] = { - -0.332518f, - 0.114452f, - 0.098949f, - 0.465896f, + -0.019518f, + 0.198546f, + 0.339015f, + -0.261961f, }; static const NN_CONFIG av1_4_partition_nnconfig_32 = { @@ -1688,82 +1596,112 @@ static const NN_CONFIG av1_4_partition_nnconfig_32 = { }, }; -static const float av1_4_partition_nn_weights_64_layer0[FEATURE_SIZE * 16] = { - 0.256343f, -0.021774f, -0.117102f, 0.416930f, 0.188160f, 0.148768f, - -0.611181f, -0.121607f, -0.394825f, -0.875025f, -0.167071f, 0.016408f, - 0.222769f, -0.199332f, 0.058667f, -0.679529f, 0.081744f, 0.044438f, - -0.182941f, -0.110339f, -0.137822f, -0.096164f, -0.132319f, 0.140036f, - -0.049503f, -0.309894f, -0.323991f, 0.166113f, 0.138104f, -0.263629f, - -0.368460f, -0.273989f, 0.147239f, 0.044566f, -0.363357f, -0.030792f, - 0.020734f, 0.068506f, -0.434214f, 0.581644f, -1.244146f, -0.569162f, - 0.179499f, -0.188900f, 0.078431f, -0.392126f, -0.006431f, 0.112146f, - -0.065892f, -0.051319f, 0.094607f, 0.251700f, -0.000650f, 0.011911f, - 0.080449f, 0.022816f, 0.322382f, 0.577070f, 0.927738f, 0.178707f, - -0.101237f, -0.212521f, 0.560261f, -0.206492f, -0.077591f, -0.069960f, - 0.025727f, 0.041122f, -0.735228f, -0.506091f, -0.600776f, -0.117829f, - 0.103556f, 0.141823f, 0.853448f, 0.339488f, 0.994022f, 0.121693f, - -2.065366f, -0.352510f, -0.174323f, -0.323400f, -0.002193f, 0.004161f, - 0.042469f, -0.005319f, -0.305784f, -0.371353f, 0.011194f, -0.018597f, - 0.209260f, 0.071577f, 0.242470f, -0.856593f, 0.288842f, 1.062608f, - -0.300472f, 0.221623f, -0.813563f, -0.250347f, -0.081455f, -0.092779f, - -0.168132f, -0.180640f, -0.075130f, -0.052906f, -0.015645f, 0.127158f, - -0.006546f, 0.051671f, 0.545608f, 1.101804f, 0.288086f, 1.107046f, - -0.200012f, 0.220182f, -0.189220f, -0.554973f, 0.040711f, -0.058029f, - 0.043737f, 0.016164f, -0.391790f, -0.287770f, -0.046545f, 0.045071f, - 0.190005f, -0.076963f, 0.836839f, 1.633266f, 0.902928f, 0.991972f, - -0.127932f, 0.293680f, -0.035984f, 0.476179f, -0.098024f, 0.068314f, - -0.058365f, 0.096221f, -0.000321f, -0.128840f, 0.136441f, -0.061853f, - 0.270367f, -0.184129f, -0.373670f, -0.177381f, 0.262109f, -0.378013f, - -0.053249f, -0.456389f, 0.222972f, -0.228067f, -0.115210f, -0.277797f, - 0.096913f, -0.014512f, -0.015533f, 0.026389f, -0.360536f, -0.078477f, - -0.203186f, 0.199574f, 0.770476f, 0.595592f, 0.360828f, 0.547721f, - -0.804787f, 0.389690f, -0.437645f, 0.576776f, 0.081903f, 0.082750f, - 0.007166f, -0.143755f, 0.114462f, 0.472432f, -0.058974f, 0.077761f, - -2.015181f, -0.054942f, -0.110894f, 0.529188f, -0.003300f, 0.913895f, - -0.324643f, 0.316135f, -0.291729f, 1.072647f, -0.029236f, 0.045592f, - -0.039399f, 0.043472f, -0.303244f, -0.108761f, -0.011154f, 0.009693f, - -0.374985f, 0.027758f, 0.302075f, -0.295758f, -0.165563f, -0.297259f, - -0.485624f, -0.469310f, -0.028247f, -0.124440f, -0.428082f, 0.096325f, - 0.089003f, -0.301585f, 0.022474f, 0.077477f, -0.032233f, -0.231036f, - 0.143206f, 0.169113f, -0.556486f, 0.346327f, -0.667790f, 0.126983f, - 0.179727f, 0.397307f, -0.490612f, -1.708789f, -0.040336f, -0.028547f, - -0.091319f, -0.119367f, -0.518796f, -0.543383f, 0.037162f, 0.031344f, - -0.131692f, 0.119353f, 0.799313f, 0.443848f, -0.499919f, -1.002983f, - 0.375477f, 0.221096f, -0.238033f, 0.284849f, 0.021897f, 0.023338f, - -0.059067f, 0.117276f, 0.039540f, 0.049630f, 0.175150f, 0.014166f, - -0.071486f, 0.091234f, -1.007432f, -1.417378f, 0.640528f, 1.442576f, - -0.257183f, -0.597016f, 0.861785f, 0.276121f, -0.098017f, 0.120514f, - -0.133184f, 0.106529f, 0.171644f, 0.059513f, 0.215952f, -0.009441f, - -0.505313f, 0.063174f, 0.229148f, -0.344213f, 0.862721f, 1.549941f, - -0.220129f, 0.493094f, 0.264095f, 0.143641f, 0.084968f, -0.078266f, - 0.032335f, -0.019006f, -0.098205f, 0.119213f, -0.103465f, 0.072811f, -}; - -static const float av1_4_partition_nn_bias_64_layer0[16] = { - 0.111611f, -0.067682f, 0.633594f, 0.143559f, -1.051284f, -0.266625f, - -0.829789f, -0.956123f, -0.153484f, -0.787741f, 0.004832f, -0.080769f, - 0.235166f, 0.449468f, 0.294689f, -0.395300f, -}; - -static const float av1_4_partition_nn_weights_64_layer1[16 * LABEL_SIZE] = { - -0.069999f, -0.093710f, -0.423714f, -0.028138f, 0.684415f, 0.141445f, - 0.507161f, 0.435533f, -0.263268f, 0.585105f, 0.235301f, 0.127536f, - -0.688639f, -0.217993f, -0.540066f, 0.406718f, 0.018210f, -0.077349f, - -0.124823f, -0.488220f, -0.957026f, 0.302632f, 0.285490f, -0.411356f, - 0.091089f, 0.103862f, -0.549291f, 0.148628f, 0.640603f, -0.601018f, - 0.178024f, 0.601370f, 0.313780f, 0.051938f, 0.524083f, 0.814631f, - -0.415522f, -0.738849f, 0.477881f, -0.342864f, 0.105181f, 0.040010f, - -0.177521f, 0.400646f, 0.167093f, 0.388279f, -0.898439f, -0.111936f, - 0.469875f, -0.099528f, -0.217370f, 0.283742f, -0.033798f, -0.142797f, - -0.174057f, -1.293311f, -0.038777f, -0.003846f, 0.093642f, -0.527150f, - -0.021259f, 0.194651f, -0.276294f, -0.109514f, +static const float av1_4_partition_nn_weights_64_layer0[FEATURE_SIZE * 24] = { + -0.152649f, 0.074509f, 1.000136f, 0.601661f, -1.416694f, -1.932396f, + -1.163850f, 0.640931f, -0.888625f, -0.345711f, 0.161799f, 0.103165f, + 0.147513f, 0.089956f, 0.204329f, 0.196922f, 0.014927f, 0.283714f, + -0.110422f, 0.062005f, -0.531870f, -0.075287f, -0.448349f, -0.218881f, + -0.005592f, -0.130490f, -0.015779f, 0.093521f, -0.158487f, 0.072241f, + 0.066879f, -0.418566f, -0.206281f, 0.025634f, 0.048334f, -0.534750f, + 0.302081f, 0.028707f, -1.543248f, 0.103799f, -1.214052f, 0.395870f, + 0.394754f, -0.272170f, -0.702953f, -4.057464f, -0.033497f, -0.042142f, + 0.014742f, 0.065263f, 0.000879f, -0.019768f, 0.101275f, 0.163059f, + -0.371392f, -0.283484f, 0.241915f, 0.012684f, -0.210101f, -0.166534f, + -0.024894f, 0.274696f, 0.098993f, 0.104086f, 0.055044f, -0.289378f, + 0.146571f, -0.147441f, 0.004056f, 0.112244f, -0.416162f, -0.033176f, + -0.214836f, -0.213787f, 0.023197f, -0.339043f, 0.301109f, -0.408551f, + 0.284922f, -0.344418f, -0.039255f, 0.158748f, -0.344169f, 0.078286f, + -0.043957f, -0.302162f, -0.310826f, 0.063425f, 0.198166f, -0.285324f, + -0.108252f, 0.038992f, -1.053110f, -1.663290f, -0.417185f, 1.504443f, + 0.643206f, -0.850240f, 0.889641f, -0.733214f, 0.147302f, 0.060291f, + -0.052954f, 0.167453f, 0.111870f, 0.085471f, 0.035107f, 0.064361f, + 0.176053f, 0.184373f, 0.676576f, 0.066164f, 1.455569f, 0.925111f, + -0.640845f, 0.803795f, -0.653782f, -0.201038f, 0.060033f, 0.016964f, + -0.047590f, 0.045908f, 0.354162f, 0.014812f, 0.156978f, 0.058792f, + -0.238119f, 0.002450f, -0.094388f, -0.155229f, 0.194858f, -0.355429f, + -0.187098f, -0.119264f, -0.088694f, -0.102845f, 0.184905f, -0.425339f, + -0.157808f, -0.104599f, -0.393248f, -0.379842f, 0.027741f, -0.185816f, + -0.317294f, 0.002453f, -0.498241f, -0.204302f, -0.079093f, 0.020646f, + -0.412850f, -0.426039f, -0.177050f, -0.419304f, -0.064478f, -0.191802f, + -0.146812f, 0.171111f, 0.090261f, -0.367033f, -0.299051f, -0.322132f, + 0.428192f, -0.252613f, 0.488498f, -0.559682f, 0.486720f, -0.511084f, + 0.992506f, 0.346765f, -0.118697f, -0.065127f, -0.376612f, -0.345137f, + -0.426517f, -0.516836f, 0.307083f, 0.609362f, 0.369555f, 0.093775f, + -0.375664f, -0.221595f, -0.025465f, 0.134374f, -0.387031f, 0.096236f, + 0.337465f, -0.124029f, -0.157340f, -0.368790f, -0.104490f, -0.279507f, + -0.247705f, 0.146559f, -0.236206f, -0.036073f, 0.064206f, -0.330919f, + 0.516591f, -0.013492f, 1.269568f, 1.182530f, -0.455390f, -1.328091f, + -0.200950f, -0.380513f, -0.195532f, -0.341479f, 0.016064f, 0.021176f, + 0.169119f, 0.103707f, -0.174504f, -0.462719f, -0.079445f, -0.247128f, + 0.459111f, 0.036129f, 0.769570f, -0.080405f, 1.667107f, 0.355567f, + -2.433896f, 0.627572f, -0.600090f, -0.651872f, -0.059769f, -0.041945f, + -0.009933f, 0.014864f, -0.049378f, -0.041561f, 0.075180f, 0.138307f, + 0.122366f, -0.160756f, 0.215327f, 0.013572f, 0.198194f, -0.762650f, + 0.054466f, 1.110332f, 1.692853f, 0.658654f, -0.409549f, 0.506085f, + 0.330962f, -0.223008f, 0.007448f, -0.289062f, -0.476231f, -0.228359f, + 0.013977f, -0.000609f, -0.673604f, 0.275996f, 0.405291f, 1.693561f, + -1.079768f, 1.122516f, -0.203227f, 0.099265f, -0.165207f, -0.323899f, + -0.269973f, -0.080122f, 0.127700f, 0.190201f, 0.219527f, 0.306194f, + 0.026049f, -0.003779f, 1.107357f, 1.720315f, 1.017908f, 0.078664f, + -1.599813f, -0.482636f, -0.117450f, 0.122249f, 0.030220f, 0.039794f, + 0.176350f, 0.129715f, -0.305755f, -0.274044f, -0.299640f, -0.187335f, + -0.073616f, -0.564507f, -0.127758f, 0.044855f, -0.191090f, 0.039095f, + 0.115378f, 0.969352f, -0.088360f, 0.301443f, 0.065726f, -0.019740f, + -0.102350f, -0.084913f, -0.194615f, 0.118582f, 0.920789f, -0.171615f, + -1.436553f, -0.026419f, -0.730864f, 0.615697f, -0.795079f, 0.119701f, + 0.601782f, 0.792902f, 0.184920f, 1.635090f, -0.085860f, -0.033187f, + -0.166883f, 0.008487f, -0.128300f, -0.089923f, -0.108781f, -0.133719f, + -0.011988f, -0.239816f, -0.092563f, -0.238471f, -0.339722f, 0.177432f, + -0.063101f, -0.121002f, 0.058072f, -0.031166f, 0.086413f, -0.016203f, + -0.305075f, -0.005420f, -0.168796f, 0.148745f, -0.116737f, -0.050222f, + -0.287952f, -0.290982f, -0.090449f, 0.076098f, -0.345632f, -0.061309f, + 0.142218f, 0.035692f, 0.304517f, -0.228031f, 0.119608f, -0.120350f, + 0.163404f, -0.105605f, -0.305462f, -0.176657f, 0.210070f, -0.227600f, + -0.081965f, -0.464027f, -0.053782f, -0.018367f, 0.119159f, 0.017162f, + -0.069792f, 0.305768f, -0.421095f, 0.187740f, -0.032059f, 0.575115f, + -0.064283f, -0.091828f, 0.772648f, -0.393189f, -0.297098f, 0.141420f, + 0.826389f, -0.071586f, -0.893968f, -0.346793f, -1.151655f, 0.039393f, + 1.546000f, -0.094029f, -0.005786f, -0.195764f, -0.169724f, -0.133167f, + -0.129312f, -0.418860f, -0.026553f, -0.053667f, -0.091976f, -0.106275f, + -0.492625f, 0.025350f, -0.332075f, -0.475638f, -0.076667f, -0.065779f, + 0.108957f, 0.246298f, -0.289007f, -0.442552f, -0.206692f, -0.257453f, + 0.073806f, -0.458606f, -0.410390f, -0.312674f, -0.144813f, 0.170128f, + 0.018810f, -0.098241f, 1.027369f, 0.479328f, 1.129707f, 0.484813f, + -0.085207f, 0.621873f, -0.520981f, 0.236175f, 0.273487f, 0.061426f, + 0.306085f, 0.161487f, 0.220991f, 0.223783f, -0.091826f, 0.391031f, +}; + +static const float av1_4_partition_nn_bias_64_layer0[24] = { + 0.580225f, -0.191304f, 1.091767f, -0.134522f, -0.089361f, 0.398750f, + -0.882708f, -0.213102f, -0.119981f, 0.378296f, -0.075719f, 0.426598f, + -2.015505f, 0.202534f, -1.044792f, -0.841519f, 0.266421f, -0.047115f, + -0.131147f, -0.075066f, -0.009441f, 0.853007f, -0.175606f, -0.868306f, +}; + +static const float av1_4_partition_nn_weights_64_layer1[24 * LABEL_SIZE] = { + -0.851937f, -0.211148f, -2.289513f, -0.275071f, 0.251340f, -0.340847f, + 0.498032f, 0.308652f, -0.051574f, 0.323146f, -0.097547f, -0.040269f, + 1.909655f, 0.098348f, 0.588136f, 0.568112f, 0.313297f, 0.920848f, + -0.014486f, 0.386014f, 0.029199f, -0.537330f, -0.021502f, 0.349073f, + -0.524715f, -0.351848f, 1.565454f, -0.297148f, 0.020177f, 0.648369f, + 0.027321f, -0.096052f, -0.363163f, -0.132642f, 0.024292f, -0.734176f, + -0.782700f, 0.408299f, 0.476945f, -0.489512f, -0.728318f, -0.632042f, + 0.405417f, 0.184086f, -0.400730f, 0.359032f, 0.019710f, -0.217409f, + 0.519159f, -0.136316f, 0.993592f, -0.147128f, 0.097495f, 0.426189f, + -0.295233f, 0.278799f, 0.080667f, -0.025052f, -0.307757f, 0.418716f, + -0.853388f, -0.374878f, -0.322725f, 0.696335f, -0.380649f, -0.160356f, + -0.140060f, 0.502455f, 0.656728f, -0.095023f, -0.184198f, -0.347069f, + 0.456372f, -0.029754f, 0.907923f, 0.265710f, -0.065505f, 0.226763f, + -0.277798f, 0.413292f, -0.593899f, -0.060740f, -0.313358f, -0.249944f, + -0.627329f, -0.327151f, -0.853788f, -1.163807f, -0.388944f, -0.228788f, + -0.057382f, 0.334741f, -0.283083f, 0.368280f, -0.407197f, -0.441849f, }; static const float av1_4_partition_nn_bias_64_layer1[LABEL_SIZE] = { - -0.688947f, - 0.121075f, - 0.289597f, - 0.948091f, + -0.478735f, + 0.292948f, + 0.293172f, + 0.040013f, }; static const NN_CONFIG av1_4_partition_nnconfig_64 = { @@ -1771,7 +1709,7 @@ static const NN_CONFIG av1_4_partition_nnconfig_64 = { LABEL_SIZE, // num_outputs 1, // num_hidden_layers { - 16, // num_hidden_nodes + 24, // num_hidden_nodes }, { av1_4_partition_nn_weights_64_layer0, @@ -1786,8 +1724,725 @@ static const NN_CONFIG av1_4_partition_nnconfig_64 = { #undef FEATURE_SIZE #undef LABEL_SIZE +#define FEATURE_SIZE 4 +static const float + av1_partition_breakout_nn_weights_128_layer0[FEATURE_SIZE * 32] = { + -0.331785f, 0.068675f, -0.323814f, 0.033714f, -0.237835f, 0.166316f, + -0.498766f, -0.545634f, -0.266173f, -0.476957f, -0.120409f, -0.021042f, + 0.124056f, -0.278750f, -0.110120f, -0.372812f, 4.547939f, 0.097618f, + -0.002710f, -0.064169f, -1.841173f, -0.403833f, 0.005536f, 0.067188f, + -0.434935f, -0.227421f, -0.000011f, -0.139961f, -0.174056f, -0.652384f, + -0.000015f, -0.262847f, -3.319706f, -0.947693f, 0.002981f, 0.016717f, + -10.408850f, -0.014568f, -0.000018f, 0.019084f, 1.523383f, 0.074525f, + -0.002076f, -0.020734f, 4.881495f, 0.002799f, 0.000342f, -0.019623f, + 1.786154f, 0.037462f, -0.019037f, 0.052833f, 11.408153f, -0.044602f, + 0.026155f, -0.518627f, -0.474499f, -0.427430f, -0.442733f, -0.011116f, + -22.379410f, -0.000549f, -0.001418f, 0.008090f, -0.295090f, -0.230268f, + -0.337278f, -0.001127f, -0.644282f, -0.598783f, -0.539417f, -0.003303f, + 9.189824f, 0.038066f, -0.004097f, -0.460045f, -0.308858f, -0.242691f, + -0.230835f, -0.273057f, 0.152226f, 0.179239f, -0.146382f, -0.004655f, + -0.242940f, -0.718862f, -0.001685f, -0.214736f, 3.263186f, 0.079463f, + -0.003854f, -0.187461f, -0.599144f, -0.419808f, -0.000597f, -0.136980f, + 0.184813f, -0.319525f, -0.007246f, 0.079709f, -0.883229f, -0.343748f, + -0.000077f, -0.172214f, -0.548759f, -0.194674f, -0.144786f, 0.043896f, + -0.176364f, -0.248394f, -0.090215f, -0.294743f, -0.280980f, -0.181436f, + -0.115681f, -0.071915f, -13.035494f, -0.075623f, 0.017052f, -0.171152f, + 5.910803f, 0.128344f, 0.010256f, -1.073301f, 2.387826f, 0.166183f, + -0.007193f, -0.257836f, + }; + +static const float av1_partition_breakout_nn_bias_128_layer0[32] = { + 0.115591f, -0.100178f, -0.165523f, -0.122997f, 11.045759f, 1.034761f, + -0.323672f, -0.189087f, 2.850950f, 7.010029f, -21.447067f, 1.877031f, + 0.437442f, 5.929414f, -0.117274f, 4.462253f, -0.135198f, -0.145927f, + 8.727211f, 0.000000f, -3.532987f, -0.405898f, 11.364439f, -0.141728f, + -5.994947f, -0.362574f, 1.857687f, -0.100400f, -0.130312f, 0.006080f, + 0.429660f, -8.439470f, +}; + +static const float av1_partition_breakout_nn_weights_128_layer1[32] = { + -0.013738f, 0.022052f, -0.074437f, -0.211377f, -0.080433f, 0.015543f, + 0.002091f, 0.014252f, 0.134834f, 0.190263f, 0.244175f, -0.031747f, + 0.020068f, -0.068326f, 0.185471f, 0.660268f, -0.134898f, -0.010376f, + -0.276023f, -0.282921f, -0.022769f, 0.007070f, -0.186235f, 0.024407f, + -0.024837f, 0.005764f, 0.016599f, -0.040077f, 0.020990f, 0.095054f, + -0.039662f, 0.131499f, +}; + +static const float av1_partition_breakout_nn_bias_128_layer1[1] = { + 0.86678213f, +}; + +static const NN_CONFIG av1_partition_breakout_nnconfig_128 = { + FEATURE_SIZE, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 32, // num_hidden_nodes + }, + { + av1_partition_breakout_nn_weights_128_layer0, + av1_partition_breakout_nn_weights_128_layer1, + }, + { + av1_partition_breakout_nn_bias_128_layer0, + av1_partition_breakout_nn_bias_128_layer1, + }, +}; + +static const float + av1_partition_breakout_nn_weights_64_layer0[FEATURE_SIZE * 16] = { + 0.872892f, -0.235539f, -0.412159f, -0.142533f, -2.251479f, -0.057073f, + -0.001373f, 0.112147f, 5.281734f, 0.060704f, 0.000838f, -0.961554f, + 0.244995f, 0.154515f, -0.292654f, -0.167177f, -3.759112f, -0.486347f, + 0.003208f, -0.418226f, 2.618152f, 0.026832f, 0.003988f, -0.404406f, + -0.405434f, 0.102791f, -0.033406f, -0.029820f, -4.492342f, -0.154291f, + 0.012947f, -0.195075f, 0.009311f, -0.411410f, -0.010986f, -0.554822f, + 0.160576f, 0.020796f, -0.457230f, -0.191111f, -7.759542f, -0.065039f, + -0.001322f, 0.055691f, 0.291924f, -0.053076f, -0.148379f, -0.298383f, + 1.022023f, -0.033668f, -0.000804f, -0.825778f, -3.902254f, -0.085812f, + -0.052520f, -0.035012f, -0.465468f, -0.319231f, -0.497529f, -0.183068f, + -2.407131f, -0.062304f, 0.000874f, 0.108786f, + }; + +static const float av1_partition_breakout_nn_bias_64_layer0[16] = { + 0.081425f, -14.404084f, 11.511393f, -0.930053f, 1.841889f, 15.020920f, + -1.872288f, 5.392535f, -0.329335f, -0.005358f, 12.600776f, 0.000000f, + -0.337413f, 4.492778f, 0.000000f, 17.043072f, +}; + +static const float av1_partition_breakout_nn_weights_64_layer1[16] = { + -0.465338f, -0.103023f, -0.174808f, -0.005156f, -0.016366f, -0.172494f, + 0.014185f, 0.067030f, -0.001939f, -0.175049f, 0.245992f, -0.181660f, + -0.038572f, 0.307899f, -0.294283f, 0.118323f, +}; + +static const float av1_partition_breakout_nn_bias_64_layer1[1] = { + -1.33438122f, +}; + +static const NN_CONFIG av1_partition_breakout_nnconfig_64 = { + FEATURE_SIZE, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 16, // num_hidden_nodes + }, + { + av1_partition_breakout_nn_weights_64_layer0, + av1_partition_breakout_nn_weights_64_layer1, + }, + { + av1_partition_breakout_nn_bias_64_layer0, + av1_partition_breakout_nn_bias_64_layer1, + }, +}; + +static const float + av1_partition_breakout_nn_weights_32_layer0[FEATURE_SIZE * 16] = { + -4.825528f, -0.145737f, 0.001907f, 0.145415f, -1.858153f, -0.080744f, + 0.000601f, 0.211991f, 0.384265f, -0.043945f, -0.521332f, -0.170622f, + -0.046866f, -0.600506f, -0.001216f, -0.332760f, -0.447677f, -0.605844f, + -0.121008f, -0.119936f, -0.215739f, -0.269665f, -0.668587f, 0.071318f, + -1.202551f, -0.729727f, -0.370084f, 0.088215f, -1.926800f, -0.086519f, + 0.000359f, 0.215120f, 0.718749f, 0.022942f, 0.003840f, -0.176518f, + 1.213451f, 0.080786f, 0.001557f, -1.053430f, 0.202698f, -0.583919f, + -0.535512f, -0.239927f, -0.110151f, -0.128832f, -0.441087f, -0.145575f, + -0.178518f, -0.585784f, 0.000029f, -0.833014f, -0.331358f, -0.520297f, + -0.088676f, -0.178487f, -1.430755f, 0.022981f, -0.106931f, 0.015573f, + -0.520814f, -0.045386f, -0.443123f, -0.484209f, + }; + +static const float av1_partition_breakout_nn_bias_32_layer0[16] = { + 11.747026f, -9.337718f, 0.341648f, -0.155847f, -0.104005f, 4.666283f, + 6.669584f, 16.625504f, 9.885626f, 15.439183f, -0.346080f, 0.000000f, + -0.423808f, 0.000000f, 6.352258f, -0.155787f, +}; + +static const float av1_partition_breakout_nn_weights_32_layer1[16] = { + 0.168561f, -0.122519f, 0.524667f, 0.032474f, 0.059097f, 0.011900f, + 0.166445f, 0.127256f, -0.034838f, -0.212586f, -0.317973f, 0.348419f, + -0.004171f, 0.157694f, 0.117845f, 0.272115f, +}; + +static const float av1_partition_breakout_nn_bias_32_layer1[1] = { + 0.09049262f, +}; + +static const NN_CONFIG av1_partition_breakout_nnconfig_32 = { + FEATURE_SIZE, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 16, // num_hidden_nodes + }, + { + av1_partition_breakout_nn_weights_32_layer0, + av1_partition_breakout_nn_weights_32_layer1, + }, + { + av1_partition_breakout_nn_bias_32_layer0, + av1_partition_breakout_nn_bias_32_layer1, + }, +}; + +static const float + av1_partition_breakout_nn_weights_16_layer0[FEATURE_SIZE * 16] = { + 0.209371f, 0.028758f, 0.005764f, -0.384401f, -0.625777f, -0.005647f, + -0.316867f, 0.042985f, 0.127344f, 0.025461f, 0.011465f, -0.071043f, + -0.295977f, -0.076093f, -0.209681f, -0.311653f, -0.147538f, 0.009910f, + -0.130997f, -0.012326f, 0.024124f, -0.323578f, -0.005790f, -0.085664f, + -1.575066f, -0.119221f, 0.015018f, 0.187204f, 0.238117f, 0.084924f, + -0.004444f, -1.271538f, -0.709860f, -0.006226f, -0.903111f, 0.090573f, + -0.278642f, -0.011114f, 0.021162f, 0.081290f, -0.467486f, -0.040771f, + -0.224069f, -0.714390f, -0.281905f, -0.001336f, -0.761212f, -0.060385f, + -0.814479f, -0.050450f, -0.003666f, 0.085668f, -0.272589f, 0.057330f, + -0.206540f, -0.303418f, 0.075335f, -0.180468f, -0.064872f, -0.755948f, + -0.509287f, -0.048877f, -0.001512f, 0.077086f, + }; + +static const float av1_partition_breakout_nn_bias_16_layer0[16] = { + 16.421495f, 4.012273f, -1.828571f, 0.000000f, -0.263564f, -0.201972f, + 6.564987f, 14.651000f, -3.227779f, 2.241833f, -0.137116f, 0.762876f, + 5.625762f, 0.615822f, 0.040057f, 16.668884f, +}; + +static const float av1_partition_breakout_nn_weights_16_layer1[16] = { + -0.096440f, 0.184316f, -0.021148f, 0.424974f, 0.003743f, 0.006310f, + 0.046266f, -0.219224f, -0.087004f, 0.024623f, -0.275798f, 0.120164f, + 0.269773f, -0.021105f, -0.146698f, 0.188764f, +}; + +static const float av1_partition_breakout_nn_bias_16_layer1[1] = { + 1.60751927f, +}; + +static const NN_CONFIG av1_partition_breakout_nnconfig_16 = { + FEATURE_SIZE, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 16, // num_hidden_nodes + }, + { + av1_partition_breakout_nn_weights_16_layer0, + av1_partition_breakout_nn_weights_16_layer1, + }, + { + av1_partition_breakout_nn_bias_16_layer0, + av1_partition_breakout_nn_bias_16_layer1, + }, +}; + +static const float + av1_partition_breakout_nn_weights_8_layer0[FEATURE_SIZE * 16] = { + -0.255885f, 0.109548f, -0.111054f, -0.476119f, -1.083031f, -0.342003f, + 0.048241f, -0.356013f, -0.085054f, 0.124908f, 0.000084f, -0.149906f, + -0.729829f, 0.133535f, -0.002125f, 0.207516f, -0.210163f, -0.567365f, + -0.590103f, 0.045308f, -0.539406f, 0.130550f, -0.663879f, -0.170549f, + 0.017587f, -0.054187f, 0.000550f, 0.038297f, -0.112891f, -0.012751f, + -0.048067f, 0.095564f, 0.079892f, 0.077285f, -0.749708f, -0.286312f, + -0.054334f, 0.132242f, -0.004152f, -0.209758f, -0.073407f, 0.082306f, + -0.001034f, -0.090990f, 0.122823f, -0.109794f, -0.230066f, -0.391155f, + -0.262245f, -0.004744f, -0.232246f, 0.099290f, -0.637484f, 0.111937f, + -0.548556f, -0.598344f, 0.123265f, -0.281395f, -0.399711f, -0.525671f, + -0.596269f, 0.098494f, -0.005765f, 0.173652f, + }; + +static const float av1_partition_breakout_nn_bias_8_layer0[16] = { + 0.194141f, -0.111223f, 2.503733f, -7.155602f, -0.695068f, 0.114874f, + 2.056990f, 5.284306f, 0.639643f, -2.792049f, -2.232339f, -0.232209f, + 2.336705f, -0.278834f, 0.231905f, 7.954366f, +}; + +static const float av1_partition_breakout_nn_weights_8_layer1[16] = { + -0.014439f, 0.010171f, 0.048116f, -0.090659f, -0.081235f, -0.021840f, + -0.017360f, 0.031063f, -0.031737f, -0.023439f, -0.037725f, 0.021954f, + 0.055858f, 0.230970f, -0.056466f, 0.119780f, +}; + +static const float av1_partition_breakout_nn_bias_8_layer1[1] = { + 1.27784479f, +}; + +static const NN_CONFIG av1_partition_breakout_nnconfig_8 = { + FEATURE_SIZE, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 16, // num_hidden_nodes + }, + { + av1_partition_breakout_nn_weights_8_layer0, + av1_partition_breakout_nn_weights_8_layer1, + }, + { + av1_partition_breakout_nn_bias_8_layer0, + av1_partition_breakout_nn_bias_8_layer1, + }, +}; +#undef FEATURE_SIZE + +#define FEATURE_SIZE 9 // Input layer size +#define NUM_NODES 32 // Hidden layer size +#define LABEL_SIZE 3 // Output layer size + +static const float av1_rect_partition_nn_weights_8_layer0[FEATURE_SIZE * + NUM_NODES] = { + 0.22151f, 0.99424f, 0.23415f, -1.13841f, -0.11277f, 0.09530f, 0.14769f, + -1.18895f, -0.96640f, -0.21421f, -0.13974f, 0.03236f, 0.15777f, -0.03176f, + 0.02729f, -0.37344f, -0.01727f, -0.05469f, 0.19402f, -3.45508f, 0.90106f, + -2.91557f, 0.19379f, 0.14356f, -0.13291f, 0.05734f, -0.03032f, -0.13060f, + 0.35744f, 1.31630f, -1.54493f, -0.20749f, -0.24413f, -0.04524f, -0.12400f, + 1.08305f, -0.21596f, 0.76244f, 1.10616f, -1.71706f, 0.05768f, 0.10966f, + 0.00949f, -0.12680f, 0.00699f, -0.11522f, -0.38566f, 0.34283f, -0.35266f, + -0.40643f, -0.22462f, 0.32300f, -0.39737f, -0.20587f, -0.16096f, 1.07543f, + 0.30314f, -1.35659f, -0.38212f, 0.45857f, 0.76615f, 0.16819f, -1.24459f, + 0.39677f, 0.87436f, -2.33757f, 1.27471f, 0.27488f, 0.01019f, -0.01221f, + -0.07461f, -0.14577f, -0.01231f, -0.64426f, -1.02733f, -1.96242f, 0.95143f, + -0.06777f, -1.13868f, 0.01354f, -0.75590f, -0.78222f, -0.07453f, 0.61788f, + 0.56899f, 1.17144f, 0.70899f, 0.48568f, 0.11266f, 0.81579f, -0.03929f, + 0.01088f, 0.33599f, -0.22401f, -0.49654f, -0.02598f, 0.04509f, -0.08217f, + -0.30687f, 0.19851f, -2.96860f, -2.30698f, 0.01848f, 0.11801f, 0.06614f, + 0.01673f, -0.11002f, -0.08168f, 0.09204f, -0.06379f, 0.27972f, -0.31716f, + -0.00566f, -0.13651f, -0.37276f, 0.01511f, -0.23697f, 0.21696f, -0.19480f, + 0.60758f, -0.43506f, -0.02247f, -1.45073f, 0.84442f, -0.94018f, 0.32550f, + 0.03985f, -0.06581f, 0.21665f, 0.79472f, -2.41080f, 0.04788f, -0.09492f, + -0.10677f, 0.07250f, 0.14329f, -0.37319f, 0.53043f, -0.49108f, 0.25792f, + -0.36569f, -0.28669f, -0.18416f, -0.52385f, -1.17081f, -1.32153f, -1.13403f, + -0.26196f, 0.93379f, 0.72115f, 0.54464f, 0.27642f, 0.04757f, 2.01629f, + 1.55787f, -0.11665f, 1.00722f, -0.24352f, 0.53308f, 0.57719f, 0.39344f, + 0.19174f, 0.06339f, -0.02530f, 0.07724f, -0.32416f, -0.26992f, -0.35887f, + -0.35285f, -0.33379f, -0.37475f, -0.77335f, 1.70027f, -1.52153f, -0.26503f, + 0.97552f, -2.96705f, -0.91220f, -0.11827f, 0.00406f, -0.14514f, 0.18417f, + -0.20874f, 0.27293f, -0.34072f, -0.34838f, -0.19054f, -0.29806f, -0.27960f, + -0.19293f, -0.18275f, -0.05902f, 0.58625f, -0.05470f, -0.48814f, -0.45382f, + -0.05959f, 2.01250f, -0.30014f, 0.69546f, -1.24180f, 1.34923f, 0.20337f, + 0.16850f, 0.07187f, 0.72630f, -0.15380f, -2.40973f, -2.73561f, -1.71375f, + -1.61695f, 0.50052f, 0.09730f, 0.00579f, 0.06133f, -0.06512f, -0.61439f, + -1.16173f, -0.58716f, 1.60438f, 0.23242f, 0.91847f, 0.49041f, -0.16277f, + -0.02574f, -0.64593f, 1.17028f, 0.46852f, 0.14926f, 0.73853f, -0.78521f, + 0.05959f, -0.35590f, 0.02039f, 0.10812f, -0.28650f, 1.34038f, -0.72188f, + 0.62385f, -0.35271f, -0.39599f, 0.41543f, 0.53124f, -0.23510f, -0.15480f, + -0.05066f, -0.33529f, 0.05238f, -0.35311f, -0.26983f, -0.39764f, 0.01085f, + 0.26593f, -0.18411f, -0.29945f, 0.50090f, -0.03397f, 0.78562f, -0.33068f, + 1.21308f, -2.23273f, -0.33366f, -0.15164f, -1.13270f, 0.17394f, 0.65567f, + 0.76496f, 0.44325f, 0.01368f, -0.33619f, -0.64256f, 0.64478f, 0.84553f, + 1.74183f, 0.22563f, -0.14550f, -0.16258f, 0.03010f, 0.49922f, 0.64575f, + -0.29187f, -0.10348f, -1.43619f, -0.56540f, -0.14779f, 0.04616f, 0.87411f, + -1.08228f, +}; + +static const float av1_rect_partition_nn_bias_8_layer0[NUM_NODES] = { + 0.33919f, -0.03003f, 0.79073f, -0.18508f, 0.00668f, -0.12017f, 0.35362f, + -0.51642f, 0.06536f, 0.41668f, -0.06509f, 0.94606f, -0.15385f, 0.14936f, + 1.46274f, -0.06961f, 2.82537f, -1.95576f, -0.09457f, 0.02042f, -0.07480f, + -0.55083f, 0.26170f, 4.39883f, 0.33999f, -0.10502f, 0.70884f, -0.06992f, + -0.22638f, 1.40940f, -0.09309f, 0.05828f, +}; + +static const float av1_rect_partition_nn_weights_8_layer1[NUM_NODES * + LABEL_SIZE] = { + 0.09209f, 0.26236f, 0.62136f, 0.76324f, -1.14678f, 0.42289f, -0.08895f, + -0.97267f, 2.05958f, 0.00843f, 0.35335f, 1.12096f, -0.11679f, 0.07350f, + -1.23231f, -0.61990f, 1.51379f, -1.99450f, 0.22441f, 2.41974f, -0.30488f, + -0.37869f, 0.47168f, -3.70132f, 0.00061f, 0.19432f, 0.11512f, 0.26200f, + -0.35285f, 0.37985f, 0.90571f, 0.27344f, 0.74840f, -0.17965f, -2.51433f, + 0.59235f, 1.16670f, -0.53446f, 0.67897f, 0.04505f, -0.86874f, 0.45361f, + -0.35033f, 1.21283f, 0.31426f, -0.20841f, 0.56757f, 0.45909f, -1.23683f, + 0.09835f, -0.17214f, -0.96323f, 0.01138f, -0.50233f, 0.30104f, 2.01814f, + 1.15821f, -0.11947f, 0.74574f, -0.30714f, -0.39646f, -1.30086f, -0.88541f, + -0.12259f, -0.54977f, 0.30069f, 1.84299f, -0.95141f, -0.65887f, -0.25888f, + -0.63265f, 1.29531f, -0.56672f, 0.10837f, -0.21297f, -2.19131f, 0.01156f, + 0.51912f, 0.46704f, 0.42810f, -0.59271f, 0.98469f, -0.17914f, -1.91163f, + -0.32807f, 0.48199f, -0.99525f, 1.67108f, -0.87631f, -0.60258f, -0.78731f, + -0.32877f, 0.44237f, 0.01087f, 0.07489f, -0.28224f, +}; + +static const float av1_rect_partition_nn_bias_8_layer1[LABEL_SIZE] = { + 1.70665f, + -0.77954f, + -0.92709f, +}; + +static const NN_CONFIG av1_rect_partition_nnconfig_8 = { + FEATURE_SIZE, // num_inputs + LABEL_SIZE, // num_outputs + 1, // num_hidden_layers + { + NUM_NODES, + }, // num_hidden_nodes + { av1_rect_partition_nn_weights_8_layer0, + av1_rect_partition_nn_weights_8_layer1 }, + { av1_rect_partition_nn_bias_8_layer0, av1_rect_partition_nn_bias_8_layer1 } +}; + +static const float av1_rect_partition_nn_weights_16_layer0[FEATURE_SIZE * + NUM_NODES] = { + -0.18480f, -0.05410f, -0.18957f, 0.15451f, -0.38649f, -0.26162f, -0.22727f, + -0.38555f, -0.36738f, 0.74384f, -1.85999f, 0.98491f, -0.72119f, 1.77321f, + 0.39983f, 0.96314f, 0.23695f, 0.30200f, 0.30629f, -0.47617f, -1.43320f, + -1.81730f, 0.36554f, -0.07142f, -1.27242f, -1.27697f, 0.00110f, -0.32179f, + 0.27460f, 0.45428f, 0.15308f, -0.73906f, -0.28577f, -0.01238f, -0.16958f, + -0.85390f, 1.05484f, -1.62812f, 0.77632f, -0.27327f, -0.32527f, 0.32726f, + 1.73255f, 0.53763f, 0.59121f, -0.39068f, -0.32451f, -0.31869f, 0.17777f, + 0.07519f, -0.18066f, -0.11250f, -0.14616f, -0.16882f, -0.04099f, -0.67959f, + 0.39674f, -0.08596f, 0.18587f, -2.04097f, -1.73993f, 1.57212f, 1.42410f, + -1.36762f, -0.41485f, -1.12103f, 0.56959f, 0.11500f, 0.48945f, -0.13585f, + 1.22125f, 0.67071f, -1.11812f, -0.20660f, -0.52856f, 0.70663f, 0.74382f, + 0.61114f, -0.11454f, 1.14687f, 0.80322f, -0.45965f, -0.44466f, -0.05830f, + 0.13206f, -0.53750f, -0.11324f, -0.37971f, -0.13491f, -0.21268f, 1.93407f, + 1.34433f, 2.49427f, 2.91955f, 1.71730f, 0.03295f, 0.03587f, -0.14550f, + 0.08189f, -0.38655f, -0.35432f, -0.62706f, -0.01849f, -0.57882f, -0.60438f, + -1.01334f, -0.57302f, 0.22592f, 0.05916f, -0.05305f, -0.89824f, -0.52969f, + -0.24542f, 0.27029f, -0.40924f, -0.82452f, -0.60665f, -5.03025f, 0.83302f, + 1.83695f, 2.19716f, 2.31001f, 0.03657f, 0.00063f, -0.04379f, 0.05835f, + -0.08623f, 0.20557f, -0.17791f, 0.07874f, -0.25456f, -0.19513f, -0.27753f, + -0.31982f, 0.00245f, -0.33183f, 0.26059f, -0.22165f, 0.37582f, -0.30411f, + -0.22639f, -0.14739f, -0.20201f, -0.37507f, -1.30653f, 0.49570f, 1.03673f, + 0.66139f, 0.44941f, -0.44461f, -0.50376f, -0.49664f, 0.18608f, -0.26175f, + 0.14844f, 0.78715f, -0.70344f, -0.87624f, -0.98535f, -0.35346f, 0.37094f, + -0.43135f, -0.22571f, 3.46263f, 3.13580f, -1.33203f, -0.15247f, -0.15866f, + -0.11214f, 0.12211f, 0.03964f, -1.87597f, -4.81597f, -4.80195f, -4.98096f, + -5.62336f, -0.05337f, -0.00943f, 0.00792f, 0.02742f, 1.05679f, 2.41455f, + 0.85382f, 1.42504f, 0.58096f, 0.21443f, 1.02694f, 1.06746f, 1.20242f, + 0.60767f, 1.98667f, -0.80879f, -0.63495f, 1.95508f, 0.23952f, -0.15019f, + -0.16097f, 0.30155f, -3.42407f, -1.34998f, 9.07689f, -2.22559f, 2.22562f, + -0.03348f, -0.05229f, 0.05931f, 0.03042f, -0.18068f, -0.05732f, -0.33010f, + -0.32279f, -0.26607f, -0.02723f, -0.04067f, 0.08700f, -0.16366f, -0.24935f, + -0.69124f, 0.58508f, 0.50654f, 0.04492f, 1.38340f, -1.51487f, 1.72889f, + -1.95618f, -3.65013f, -1.38525f, -3.05516f, -2.40448f, 2.47467f, 0.03784f, + 0.08052f, -0.01971f, -0.08918f, -0.84997f, -0.55302f, -1.07861f, -0.62626f, + 0.61751f, -0.11012f, -0.24185f, -0.39201f, -1.85390f, -0.31261f, -0.11927f, + 0.15671f, -0.23450f, -0.14916f, -0.31715f, -0.19350f, 0.01795f, -0.11533f, + -0.05799f, -0.03142f, 0.20218f, -0.39499f, -0.33859f, -0.13201f, -0.19527f, + -0.28459f, -0.20346f, 0.89457f, -2.22103f, -2.37455f, -2.00221f, 2.44553f, + 0.33915f, 0.50047f, -0.34625f, -0.19667f, -0.56333f, -0.84328f, 1.25767f, + -1.70297f, 1.00482f, -0.00103f, -1.40813f, 0.21311f, 0.39230f, -0.07302f, + -3.49100f, 1.60675f, -2.90692f, 0.11022f, 0.13507f, -0.13308f, 0.15201f, + -0.05573f, +}; + +static const float av1_rect_partition_nn_bias_16_layer0[NUM_NODES] = { + -0.16783f, -0.16023f, 0.52215f, -0.04109f, 2.00122f, -0.11633f, 0.25535f, + 1.80638f, 1.69273f, -0.25998f, -6.83550f, -0.79682f, -1.03466f, 1.42721f, + 0.00000f, -0.00000f, -0.11665f, -0.12047f, -1.01497f, 7.27181f, -0.78548f, + -1.39335f, -5.42248f, -0.10388f, 0.07634f, 2.81012f, -0.57429f, -0.15629f, + -0.12044f, 1.65478f, -0.75153f, 1.18441f, +}; + +static const float av1_rect_partition_nn_weights_16_layer1[NUM_NODES * + LABEL_SIZE] = { + -0.26407f, 0.06322f, 0.87932f, 0.17772f, 0.71686f, -0.12283f, 0.08454f, + 0.20098f, -0.31763f, -0.33178f, -4.59535f, -0.04367f, 0.17099f, 3.80486f, + 0.16750f, 0.29218f, 0.57234f, -0.96550f, -0.10599f, -4.91130f, -0.14658f, + 0.95803f, -4.13925f, 0.24567f, 0.25708f, 1.60547f, -1.03251f, -0.31053f, + -0.05659f, -0.94121f, -0.68926f, -0.24738f, -0.38019f, 0.98950f, 0.13689f, + 0.24504f, 0.49623f, 0.19980f, 0.38349f, 0.37481f, 0.54540f, -0.02198f, + 3.43385f, 1.02543f, -0.40921f, -3.07235f, 0.02996f, 0.00323f, -0.35414f, + 0.71099f, 1.39334f, 2.43741f, -1.11007f, -0.22739f, -4.21757f, 0.11905f, + 0.00353f, -1.69637f, 0.45944f, -0.19884f, 0.03624f, 0.25729f, 0.23659f, + -2.08405f, 0.08573f, -0.53393f, -1.28103f, -0.53970f, -0.65465f, 0.31821f, + -0.09884f, -0.69026f, -0.37284f, 0.04622f, 1.32973f, -0.15414f, 0.19138f, + -0.67927f, -0.17658f, 0.36008f, -0.51832f, 0.09887f, -1.94414f, 2.95227f, + 1.76937f, -0.26687f, 8.50976f, 0.26247f, 0.60262f, -0.27910f, 0.30061f, + -0.05117f, 0.16018f, 0.71195f, 0.57871f, 1.57794f, +}; + +static const float av1_rect_partition_nn_bias_16_layer1[3] = { + 2.68750f, + -1.31894f, + -1.36768f, +}; + +static const NN_CONFIG av1_rect_partition_nnconfig_16 = { + FEATURE_SIZE, // num_inputs + LABEL_SIZE, // num_outputs + 1, // num_hidden_layers + { + NUM_NODES, + }, // num_hidden_nodes + { av1_rect_partition_nn_weights_16_layer0, + av1_rect_partition_nn_weights_16_layer1 }, + { av1_rect_partition_nn_bias_16_layer0, av1_rect_partition_nn_bias_16_layer1 } +}; + +static const float av1_rect_partition_nn_weights_32_layer0[FEATURE_SIZE * + NUM_NODES] = { + -0.54654f, -0.43537f, -0.10620f, -0.48051f, -0.43543f, -0.22737f, -0.15429f, + -0.09858f, -0.09438f, 0.37306f, 0.23934f, -1.86375f, -1.18307f, -0.32995f, + -0.09745f, 0.05431f, -0.13799f, 0.14734f, -0.33219f, 0.18057f, -0.23792f, + -0.28126f, 0.02977f, -0.07431f, 0.07860f, 0.00067f, -0.01927f, 1.01841f, + -0.57739f, 0.08412f, -1.33843f, -1.05563f, -0.28693f, -0.39425f, -0.69572f, + -0.16703f, 0.02808f, 0.11994f, -0.26267f, 0.19706f, -0.29707f, -0.25305f, + -0.07050f, -0.02704f, -0.31528f, -0.42301f, 0.22496f, -0.37001f, -0.23319f, + -0.11139f, -0.30513f, 0.04213f, -0.12550f, 0.02504f, 0.33245f, 0.01102f, + -0.35950f, -0.05949f, -0.19590f, -0.27457f, -0.28339f, -0.15676f, -0.21538f, + 0.65066f, 0.28443f, -1.24943f, -3.00246f, -1.01897f, 0.09304f, 0.70052f, + -0.12877f, 0.21120f, -0.37476f, 0.23261f, -0.28401f, 0.09837f, 0.00020f, + -0.12106f, -0.32354f, -0.02472f, -0.19772f, 1.01886f, 0.16596f, -0.06532f, + 1.72938f, 1.57754f, 0.55963f, 0.33246f, -0.20023f, 0.30715f, 0.08629f, + 0.18945f, -0.45988f, -1.22610f, -0.05152f, -0.48859f, -1.02104f, -0.27315f, + -0.57698f, 0.04157f, -0.92428f, -1.31268f, 1.78210f, 0.10291f, 1.55042f, + -1.26793f, 1.39042f, -1.43729f, 0.25600f, 5.21263f, 5.31955f, 5.19316f, + 5.43430f, 0.00294f, -0.00970f, -0.02333f, 0.00250f, 1.17672f, 6.27544f, + 4.95973f, 3.54009f, 4.51269f, 0.30750f, 0.78780f, -0.44741f, -0.76442f, + 0.75050f, 0.58799f, 0.03400f, -2.09859f, 1.67313f, 0.12503f, 0.28609f, + 1.15809f, 2.46530f, -0.04898f, 0.23072f, -0.12635f, -0.82097f, -0.63827f, + 2.16779f, 1.77132f, 0.15434f, -1.06427f, 0.06206f, -0.87732f, -0.61897f, + -0.44593f, -0.77131f, -0.15979f, -0.02282f, -0.74381f, 0.66052f, -0.22992f, + 1.74638f, 1.29199f, -0.55464f, 0.98316f, 0.06665f, 0.50254f, -0.66292f, + 0.17113f, -0.32633f, -1.85803f, -0.92759f, 4.44965f, 1.33057f, 0.02135f, + -0.27446f, -0.26018f, -0.12613f, -0.14470f, -0.23355f, -0.09717f, -0.24123f, + -0.05535f, -0.19146f, -0.36222f, -0.30458f, -0.40323f, 0.21779f, 0.14248f, + -0.48630f, 0.18840f, 0.11040f, 0.17287f, -0.51880f, 1.12466f, -0.38888f, + -0.16421f, -0.31784f, -0.36112f, -0.25386f, -0.01636f, 0.10029f, -0.26881f, + -0.17051f, -0.30903f, -0.08573f, -0.28774f, -0.01173f, -0.09706f, -0.23089f, + -0.12922f, -0.17463f, -0.12433f, -0.23074f, 0.15220f, 1.29826f, 0.23788f, + 0.04189f, 2.66416f, 0.48815f, -0.06803f, 0.96742f, 1.27165f, -0.70348f, + -0.09941f, -0.42948f, -0.20243f, -0.02364f, -0.26689f, -0.40629f, -0.68217f, + -0.48073f, 2.43657f, -2.60191f, -1.82837f, 0.50440f, 0.71829f, 0.76491f, + 0.28293f, 0.20568f, 0.92642f, -0.02496f, 1.43637f, -0.24474f, -1.21030f, + 0.54084f, 1.05130f, 1.29572f, 0.03750f, -0.36894f, 0.74548f, -1.33857f, + -0.84858f, 1.35230f, 0.80175f, 0.66136f, 1.06473f, 0.18701f, 1.42413f, + 0.04661f, -0.07820f, 0.64990f, -0.43595f, 1.18304f, -0.11437f, -0.06365f, + 0.03558f, 0.78260f, -1.74890f, 1.56217f, -1.23424f, 4.59193f, -3.35072f, + 0.01180f, -0.18296f, -0.20870f, 0.04510f, 1.52595f, -1.37402f, -0.33123f, + -0.85957f, 0.80598f, 0.03743f, 0.02354f, 0.37707f, 1.62095f, -0.29627f, + -0.31778f, -0.45789f, -0.14906f, 0.25315f, -0.10817f, -0.32610f, -0.40890f, + 0.33984f, +}; + +static const float av1_rect_partition_nn_bias_32_layer0[NUM_NODES] = { + -0.17482f, 0.39042f, 0.00000f, 1.69677f, 0.08792f, -0.09301f, 0.13809f, + 4.84061f, 0.00000f, 0.40515f, 0.46246f, 0.20644f, -5.77478f, -1.54510f, + 0.05660f, -0.32013f, 0.23649f, 0.03778f, -2.53710f, -0.27869f, 0.45623f, + -0.04155f, -0.18445f, -0.73405f, -0.50243f, 2.23191f, 1.93272f, -1.07032f, + -0.27602f, -1.98063f, 0.20816f, -0.01315f, +}; + +static const float av1_rect_partition_nn_weights_32_layer1[NUM_NODES * + LABEL_SIZE] = { + 0.02827f, 1.02560f, -0.07137f, -0.31911f, 0.11365f, 0.13684f, -0.07816f, + -5.23036f, -0.34340f, 0.84526f, -1.51845f, 0.07017f, -8.12570f, 6.24061f, + 0.35739f, -0.09937f, -0.30978f, 0.22032f, 0.74968f, -0.34557f, 0.45547f, + -0.16512f, 0.07118f, 1.66415f, 0.41320f, -1.81533f, -1.96004f, 1.04666f, + 0.84049f, 4.31009f, 0.68850f, 0.26322f, -0.24634f, -1.25889f, 0.31952f, + 0.63632f, 0.05801f, -0.10664f, -0.21992f, 2.44386f, 0.19526f, -0.09838f, + 1.53049f, -0.26630f, 3.54126f, -3.40574f, 0.72730f, 0.04557f, 0.92652f, + 0.15522f, 2.35895f, -0.13347f, 0.56907f, 0.15352f, 0.01823f, -0.73939f, + 0.43104f, 1.90321f, 0.31267f, -0.51972f, 0.50094f, -3.98372f, -3.41518f, + -0.48183f, 0.26661f, 0.64146f, 0.14500f, -0.01695f, 0.16653f, -0.37846f, + 0.08412f, 2.69714f, -0.20258f, -0.75786f, 0.11201f, 0.61878f, 4.22231f, + -3.55330f, -1.14137f, -0.37722f, -0.28000f, -0.72581f, -2.62827f, -0.19448f, + -0.59398f, -0.30136f, -0.17725f, -0.69630f, -0.41132f, 0.12208f, 2.11441f, + -1.08794f, -1.41694f, 0.02620f, 2.18792f, 0.04271f, +}; + +static const float av1_rect_partition_nn_bias_32_layer1[3] = { + 2.47332f, + -1.65756f, + -0.81573f, +}; + +static const NN_CONFIG av1_rect_partition_nnconfig_32 = { + FEATURE_SIZE, // num_inputs + LABEL_SIZE, // num_outputs + 1, // num_hidden_layers + { + NUM_NODES, + }, // num_hidden_nodes + { av1_rect_partition_nn_weights_32_layer0, + av1_rect_partition_nn_weights_32_layer1 }, + { av1_rect_partition_nn_bias_32_layer0, av1_rect_partition_nn_bias_32_layer1 } +}; + +static const float av1_rect_partition_nn_weights_64_layer0[FEATURE_SIZE * + NUM_NODES] = { + 0.08972f, 4.09095f, -0.31398f, -2.43631f, -0.74767f, 1.42471f, 1.60926f, + 1.44721f, 1.88259f, 2.35375f, 1.88299f, 2.01109f, 0.98679f, 2.24131f, + 0.06279f, -0.08315f, 0.32107f, 0.91334f, -0.36569f, 5.55049f, 5.44943f, + 5.20471f, 5.39099f, -0.01943f, -0.00284f, 0.02203f, -0.01309f, 1.41917f, + 6.68460f, -6.15986f, 6.41341f, -3.20630f, -0.00567f, -0.00038f, 0.05960f, + 0.04308f, 0.95366f, 3.48535f, 2.98266f, 4.11784f, 3.44255f, 0.61630f, + 0.71405f, 0.63945f, -0.00713f, 0.39193f, 1.91621f, 3.32755f, 0.71674f, + -0.11647f, 2.07090f, 2.64191f, 0.07949f, -0.05023f, 0.99935f, 0.83145f, + 0.75898f, -0.98764f, -0.58731f, 1.21734f, -0.08076f, -3.26780f, 1.66278f, + 0.04189f, -0.33177f, -1.58648f, 1.00883f, -0.56132f, -2.34877f, 0.67056f, + -2.32297f, -0.91641f, -1.02909f, 4.19781f, 3.87484f, 4.32778f, -1.97171f, + -0.24734f, 0.00822f, 0.05892f, 0.12697f, -3.62915f, -2.93127f, 7.94856f, + -3.29311f, 3.26001f, -0.02231f, 0.02741f, 0.05919f, 0.08190f, -1.49344f, + -0.64475f, -0.24627f, 4.03324f, -1.14799f, -0.18465f, -0.17829f, 0.10394f, + 0.08580f, -5.74721f, 4.42467f, 3.63964f, 3.00258f, -1.22744f, -0.29408f, + 0.00767f, 0.12305f, 0.05249f, -0.17166f, -0.20120f, -0.32941f, -0.31901f, + 0.04628f, -0.35249f, -0.18272f, 0.03956f, -0.19329f, -0.33564f, 0.09856f, + -0.00173f, -0.31751f, -0.05702f, -0.20558f, -0.31464f, -0.02488f, -0.00729f, + -0.35854f, -0.14762f, -0.34897f, -0.12746f, 0.04011f, -0.24918f, -0.53516f, + -0.28440f, -0.36789f, -1.34889f, -9.10044f, -9.19238f, 4.48042f, 6.54429f, + -0.00226f, 0.00430f, 0.00321f, 0.00442f, 0.87551f, -0.16224f, -0.22832f, + -0.60640f, -0.28738f, 0.18062f, 0.22008f, -0.47406f, 0.80302f, 0.12149f, + 1.49530f, 1.05069f, -2.02985f, -0.92833f, 0.25616f, 0.12852f, 3.51840f, + 0.25226f, -2.63283f, -4.04386f, 8.46300f, -2.93408f, 0.44069f, 0.08276f, + 0.34482f, -0.22615f, 0.28666f, 3.02962f, -1.20055f, -1.04832f, -0.97632f, + -0.99530f, 1.44196f, 1.68550f, 0.49360f, 1.08155f, -0.26059f, -0.02876f, + -0.27492f, -0.06205f, -0.09496f, -0.12314f, -0.30228f, -0.07453f, -0.38857f, + 1.17443f, 2.41497f, 1.90537f, 2.37716f, 2.91495f, -0.44455f, -0.51176f, + 0.48195f, 0.53032f, 0.23696f, -1.06211f, 1.47459f, -0.89029f, 0.29521f, + 0.66291f, -0.42653f, 1.82308f, -1.30372f, -0.36192f, -3.40388f, -1.61476f, + -2.29745f, -0.66886f, -2.08252f, -0.54552f, -4.06849f, 0.02948f, 0.27297f, + -4.81472f, 4.60404f, -0.11053f, 0.14765f, 0.02826f, -0.14688f, -0.07066f, + -0.01224f, 1.20377f, 7.02725f, -6.02627f, 6.87255f, -3.14257f, 0.01074f, + 0.02397f, -0.02359f, 0.01901f, 0.14956f, -1.67671f, 2.26714f, 2.57043f, + -0.45888f, -1.60265f, -2.11475f, -2.74029f, -2.74658f, -0.35630f, -2.63013f, + -2.14814f, -0.67266f, -1.56850f, 0.57137f, -1.14428f, -0.34265f, -0.12521f, + 0.01220f, -0.74906f, -0.19270f, 0.68110f, -0.24737f, -0.70568f, -1.64826f, + -0.35847f, -0.15984f, -1.17932f, -8.72306f, -8.72834f, 3.93701f, 6.17812f, + -0.03191f, -0.00104f, 0.01402f, -0.00046f, -0.94517f, 1.51266f, -0.56318f, + 0.72260f, -0.09253f, -0.09069f, -2.16695f, -0.23653f, 0.24418f, 2.21148f, + -1.47954f, -1.01439f, 0.31536f, 0.77238f, -0.85083f, -0.15758f, -0.50886f, + 0.09101f, +}; + +static const float av1_rect_partition_nn_bias_64_layer0[NUM_NODES] = { + 0.91706f, -1.31328f, -5.16196f, 1.13191f, -0.98044f, -1.61122f, 1.03039f, + -0.98537f, -4.45568f, -4.34802f, -0.92116f, 0.66836f, -0.10752f, -0.13065f, + -0.35567f, -0.35693f, 1.74941f, 1.17379f, -3.45555f, 5.66321f, -0.24917f, + -1.11940f, -0.73656f, -0.19299f, -0.04181f, 1.11010f, -2.97859f, -0.16774f, + 0.59835f, -0.31269f, -0.30585f, -1.66212f, +}; + +static const float av1_rect_partition_nn_weights_64_layer1[NUM_NODES * + LABEL_SIZE] = { + 0.58963f, 4.20320f, -8.62465f, -6.54014f, 5.41108f, 2.33581f, -0.10354f, + -1.17753f, -3.45909f, -2.24722f, 2.20881f, 3.21971f, -0.09087f, -0.21624f, + 0.16529f, -8.40985f, -1.60205f, -1.41538f, 4.41826f, -4.63069f, -0.27742f, + 4.08710f, 0.26439f, -1.46028f, 0.51234f, 6.25212f, -3.35650f, -1.21348f, + 1.37201f, 8.89151f, 0.28859f, -0.97328f, -0.36196f, -2.71701f, 4.54196f, + -0.62476f, -2.43814f, -1.34209f, 0.12850f, 1.73859f, 3.09809f, -4.42434f, + -1.82552f, -3.66420f, -0.31535f, 0.00968f, -0.02019f, 9.66824f, 0.58835f, + 1.50425f, 2.84487f, 2.55522f, 0.01409f, -2.27594f, -0.31800f, 0.91076f, + -0.66808f, 0.33120f, -0.12460f, 0.64457f, -0.36416f, -10.30843f, 1.51013f, + 2.06861f, -0.20989f, -0.87119f, 3.68642f, 7.33662f, -2.88037f, -0.52414f, + -0.35036f, -0.45947f, -0.07406f, 6.46346f, -0.16031f, 0.27071f, 0.38845f, + -0.21940f, 0.08583f, -1.39526f, 0.50554f, 0.45279f, -6.61856f, 1.84069f, + -0.19149f, -1.77235f, 0.75136f, 1.11797f, 0.32677f, -7.10427f, 3.82908f, + 1.04238f, -0.91435f, 1.93317f, -1.84946f, -0.48909f, +}; + +static const float av1_rect_partition_nn_bias_64_layer1[3] = { + 0.32215f, + -0.57522f, + 0.25314f, +}; + +static const NN_CONFIG av1_rect_partition_nnconfig_64 = { + FEATURE_SIZE, // num_inputs + LABEL_SIZE, // num_outputs + 1, // num_hidden_layers + { + NUM_NODES, + }, // num_hidden_nodes + { av1_rect_partition_nn_weights_64_layer0, + av1_rect_partition_nn_weights_64_layer1 }, + { av1_rect_partition_nn_bias_64_layer0, av1_rect_partition_nn_bias_64_layer1 } +}; + +static const float av1_rect_partition_nn_weights_128_layer0[FEATURE_SIZE * + NUM_NODES] = { + -0.70901f, -3.03481f, 3.30604f, -1.28803f, -0.08610f, -0.33320f, -0.30716f, + 0.25100f, 0.14323f, -0.98422f, -0.89084f, -0.24508f, -1.10785f, -0.82524f, + 0.11766f, -0.42777f, 1.08965f, 4.35125f, -1.19388f, 4.22042f, 4.96306f, + 6.32406f, 3.29899f, -0.90768f, 0.05203f, 0.38467f, 1.74257f, -0.19918f, + -0.11335f, 0.00140f, -0.42303f, -0.04419f, 0.03583f, -0.05441f, -0.19586f, + 0.01484f, -1.19964f, 0.25497f, 3.04502f, 0.05446f, -0.23253f, 0.00266f, + 0.07117f, -2.78986f, -4.62953f, 1.45331f, 0.43923f, 0.92298f, -0.47736f, + 1.49165f, 0.45942f, -1.99787f, 3.33510f, 0.17234f, 0.04024f, -1.42780f, + 0.23566f, -0.90970f, 1.18041f, -1.45865f, 2.30878f, -1.28507f, 1.87290f, + 1.91186f, 4.74826f, -3.70735f, 4.49808f, -4.72275f, -0.02696f, -0.02642f, + -0.06093f, -0.01121f, -0.70683f, 2.69737f, -1.88563f, 2.48637f, 1.10922f, + 0.74624f, 0.40308f, 2.06396f, 1.39289f, 0.00909f, -2.05271f, -1.53539f, + -1.38323f, 0.83303f, -0.32250f, 0.51172f, 3.91249f, 1.66373f, 1.13184f, + -2.22874f, -1.13448f, -0.11185f, 0.19387f, 0.36770f, -0.58933f, 0.22789f, + 1.17307f, 0.77461f, 0.20817f, 0.33417f, 0.54037f, 0.32961f, -0.18456f, + -9.78171f, -0.17216f, -3.44703f, -2.42158f, 0.51946f, 4.35949f, -0.73335f, + -1.61515f, -0.29622f, -0.37617f, -0.42316f, 0.74922f, 1.44386f, 3.92704f, + -3.76274f, 4.19775f, -3.86958f, 0.00074f, -0.02418f, -0.12944f, 0.05857f, + -0.85507f, 5.42546f, 5.40338f, 5.54347f, 5.59791f, -0.01611f, 0.01618f, + -0.01654f, -0.00270f, -0.39608f, -0.40410f, -0.24551f, 0.09124f, -0.34413f, + -0.11504f, 0.12793f, -0.31523f, 0.09148f, -0.08567f, -0.05140f, -0.13310f, + -0.81200f, 0.06882f, -0.52537f, -12.74048f, -0.45395f, -4.04775f, -1.84887f, + -1.02573f, 0.32788f, 1.06828f, -1.25503f, -0.42693f, 2.01413f, -2.29103f, + 0.62271f, 1.11764f, -1.83113f, -1.32325f, -1.65651f, -2.87826f, 1.46910f, + 0.60885f, 0.16079f, 0.00171f, -0.25658f, -0.25465f, -0.14149f, 0.19497f, + -0.07866f, -0.37080f, -0.05778f, -0.08870f, -0.20491f, 0.84521f, -0.18214f, + -1.38441f, -1.08932f, -1.76627f, 0.73172f, 0.05967f, 1.28057f, 3.42722f, + 1.69287f, 0.77169f, 0.44528f, 1.85513f, 0.07840f, 1.31252f, 2.89948f, + 1.49489f, 0.15281f, 0.54708f, -1.14185f, -2.51063f, 0.36618f, -0.55322f, + 0.96671f, 1.59470f, 1.38252f, 1.99697f, 0.03266f, -0.23200f, -0.01127f, + -0.18918f, -0.37598f, -0.03119f, -0.36039f, -0.21192f, -0.11565f, -4.22635f, + 1.41252f, 0.56608f, -0.08867f, 3.11924f, -0.54597f, -0.12504f, -0.05289f, + -0.28665f, -0.58297f, -1.18362f, -0.76201f, -1.22011f, -0.58756f, 0.14740f, + 1.43971f, 0.98381f, -0.02998f, -0.40678f, -0.23047f, -0.12979f, 0.04003f, + -0.22081f, -0.09294f, -0.15955f, -0.10379f, -0.10192f, -1.51316f, 2.39482f, + -1.69975f, 3.58976f, -0.91032f, -0.03498f, 0.48982f, -0.13418f, 0.76256f, + 1.61003f, -2.01676f, -1.24430f, -3.25763f, 1.12314f, 2.00740f, 0.04613f, + -0.14746f, -0.57374f, 3.44511f, -0.56767f, -4.08432f, -2.04894f, 2.35951f, + -0.00458f, 0.18512f, 0.09916f, -0.04084f, -1.56207f, 1.38034f, 4.17302f, + -1.47326f, -2.03530f, -0.00210f, 0.27469f, -0.17423f, 0.86860f, 2.76195f, + 2.43269f, -3.57331f, 2.08715f, -1.44171f, -0.17389f, 2.26157f, -0.07852f, + 2.02519f, +}; + +static const float av1_rect_partition_nn_bias_128_layer0[NUM_NODES] = { + 2.53427f, 1.66678f, -0.84914f, -0.15070f, -1.74769f, 0.45218f, -0.26067f, + 2.05916f, 0.08978f, 5.30984f, 2.66243f, -1.62740f, 0.70018f, 1.96403f, + -4.97152f, -0.05425f, -3.84474f, -1.28006f, 3.47490f, -0.08373f, 0.00225f, + -1.40692f, -0.27569f, -0.30253f, 0.77377f, -0.67636f, -0.26379f, 1.82348f, + 0.66120f, 0.61119f, -1.42293f, 0.32676f, +}; + +static const float av1_rect_partition_nn_weights_128_layer1[NUM_NODES * + LABEL_SIZE] = { + 1.53453f, -0.23707f, 7.88368f, 0.33340f, 0.97523f, 1.38538f, -0.16746f, + 4.42070f, 3.18678f, -5.03545f, -2.27029f, -3.75719f, -0.26850f, -4.93432f, + -8.75673f, 0.27398f, -5.77882f, -0.91616f, -2.62725f, -0.23961f, 0.31249f, + 3.32134f, 0.25375f, -0.00394f, 2.30213f, -0.14183f, 0.14544f, -1.42830f, + 1.31101f, 3.99389f, -0.00017f, -2.90184f, -2.11444f, 2.16734f, -3.05133f, + 0.39206f, 4.61489f, -2.88181f, -0.47745f, 2.86649f, -1.20621f, 3.70550f, + 1.58029f, -4.58731f, -2.29350f, -0.76930f, 5.19135f, -0.22521f, -5.08782f, + 2.17316f, 1.30563f, 0.16777f, -2.17767f, -2.09904f, 1.37001f, 0.25091f, + -1.76743f, 1.57940f, 0.30544f, -2.39895f, -0.08532f, -1.77122f, 1.84010f, + -0.88449f, 0.79299f, -1.35368f, -4.54110f, 0.02244f, -5.11580f, 1.60883f, + 0.29352f, -6.47042f, -1.81426f, 1.24013f, 0.90980f, 7.93977f, 2.12555f, + 5.24720f, 4.19508f, 0.21499f, 11.06045f, -0.74752f, 0.89396f, 0.26422f, + 1.72332f, -1.25113f, -1.71136f, 0.13676f, -0.07867f, -0.96929f, 0.19911f, + 3.58233f, -0.76470f, -2.24162f, -2.87465f, 3.18736f, +}; + +static const float av1_rect_partition_nn_bias_128_layer1[3] = { + 1.09014f, + -0.53317f, + -0.55668f, +}; + +static const NN_CONFIG av1_rect_partition_nnconfig_128 = { + FEATURE_SIZE, // num_inputs + LABEL_SIZE, // num_outputs + 1, // num_hidden_layers + { + NUM_NODES, + }, // num_hidden_nodes + { av1_rect_partition_nn_weights_128_layer0, + av1_rect_partition_nn_weights_128_layer1 }, + { av1_rect_partition_nn_bias_128_layer0, + av1_rect_partition_nn_bias_128_layer1 } +}; +#undef FEATURE_SIZE +#undef NUM_NODES +#undef LABEL_SIZE + #ifdef __cplusplus } // extern "C" #endif -#endif // AV1_ENCODER_AB_PARTITION_MODEL_WEIGHTS_H_ +#endif // AOM_AV1_ENCODER_PARTITION_MODEL_WEIGHTS_H_ diff --git a/third_party/aom/av1/encoder/picklpf.c b/third_party/aom/av1/encoder/picklpf.c index 461c3af83..c5508e25c 100644 --- a/third_party/aom/av1/encoder/picklpf.c +++ b/third_party/aom/av1/encoder/picklpf.c @@ -70,7 +70,7 @@ static int64_t try_filter_frame(const YV12_BUFFER_CONFIG *sd, // TODO(any): please enable multi-thread and remove the flag when loop // filter mask is compatible with multi-thread. #if LOOP_FILTER_BITMASK - av1_loop_filter_frame(cm->frame_to_show, cm, &cpi->td.mb.e_mbd, plane, + av1_loop_filter_frame(cm->frame_to_show, cm, &cpi->td.mb.e_mbd, 0, plane, plane + 1, partial_frame); #else if (cpi->num_workers > 1) @@ -193,6 +193,7 @@ void av1_pick_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi, (void)sd; lf->sharpness_level = 0; + cpi->td.mb.rdmult = cpi->rd.RDMULT; if (method == LPF_PICK_MINIMAL_LPF) { lf->filter_level[0] = 0; diff --git a/third_party/aom/av1/encoder/picklpf.h b/third_party/aom/av1/encoder/picklpf.h index 2a168358e..357097ae1 100644 --- a/third_party/aom/av1/encoder/picklpf.h +++ b/third_party/aom/av1/encoder/picklpf.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_ENCODER_PICKLPF_H_ -#define AV1_ENCODER_PICKLPF_H_ +#ifndef AOM_AV1_ENCODER_PICKLPF_H_ +#define AOM_AV1_ENCODER_PICKLPF_H_ #ifdef __cplusplus extern "C" { @@ -27,4 +27,4 @@ void av1_pick_filter_level(const struct yv12_buffer_config *sd, } // extern "C" #endif -#endif // AV1_ENCODER_PICKLPF_H_ +#endif // AOM_AV1_ENCODER_PICKLPF_H_ diff --git a/third_party/aom/av1/encoder/pickrst.c b/third_party/aom/av1/encoder/pickrst.c index 28b693b08..e7804f6b4 100644 --- a/third_party/aom/av1/encoder/pickrst.c +++ b/third_party/aom/av1/encoder/pickrst.c @@ -15,6 +15,7 @@ #include <math.h> #include "config/aom_scale_rtcd.h" +#include "config/av1_rtcd.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/binary_codes_writer.h" @@ -22,7 +23,6 @@ #include "aom_mem/aom_mem.h" #include "aom_ports/mem.h" #include "aom_ports/system_state.h" - #include "av1/common/onyxc_int.h" #include "av1/common/quant_common.h" #include "av1/common/restoration.h" @@ -181,6 +181,77 @@ static int64_t try_restoration_unit(const RestSearchCtxt *rsc, return sse_restoration_unit(limits, rsc->src, rsc->dst, plane, highbd); } +int64_t av1_lowbd_pixel_proj_error_c(const uint8_t *src8, int width, int height, + int src_stride, const uint8_t *dat8, + int dat_stride, int32_t *flt0, + int flt0_stride, int32_t *flt1, + int flt1_stride, int xq[2], + const sgr_params_type *params) { + int i, j; + const uint8_t *src = src8; + const uint8_t *dat = dat8; + int64_t err = 0; + if (params->r[0] > 0 && params->r[1] > 0) { + for (i = 0; i < height; ++i) { + for (j = 0; j < width; ++j) { + assert(flt1[j] < (1 << 15) && flt1[j] > -(1 << 15)); + assert(flt0[j] < (1 << 15) && flt0[j] > -(1 << 15)); + const int32_t u = (int32_t)(dat[j] << SGRPROJ_RST_BITS); + int32_t v = u << SGRPROJ_PRJ_BITS; + v += xq[0] * (flt0[j] - u) + xq[1] * (flt1[j] - u); + const int32_t e = + ROUND_POWER_OF_TWO(v, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS) - src[j]; + err += e * e; + } + dat += dat_stride; + src += src_stride; + flt0 += flt0_stride; + flt1 += flt1_stride; + } + } else if (params->r[0] > 0) { + for (i = 0; i < height; ++i) { + for (j = 0; j < width; ++j) { + assert(flt0[j] < (1 << 15) && flt0[j] > -(1 << 15)); + const int32_t u = (int32_t)(dat[j] << SGRPROJ_RST_BITS); + int32_t v = u << SGRPROJ_PRJ_BITS; + v += xq[0] * (flt0[j] - u); + const int32_t e = + ROUND_POWER_OF_TWO(v, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS) - src[j]; + err += e * e; + } + dat += dat_stride; + src += src_stride; + flt0 += flt0_stride; + } + } else if (params->r[1] > 0) { + for (i = 0; i < height; ++i) { + for (j = 0; j < width; ++j) { + assert(flt1[j] < (1 << 15) && flt1[j] > -(1 << 15)); + const int32_t u = (int32_t)(dat[j] << SGRPROJ_RST_BITS); + int32_t v = u << SGRPROJ_PRJ_BITS; + v += xq[1] * (flt1[j] - u); + const int32_t e = + ROUND_POWER_OF_TWO(v, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS) - src[j]; + err += e * e; + } + dat += dat_stride; + src += src_stride; + flt1 += flt1_stride; + } + } else { + for (i = 0; i < height; ++i) { + for (j = 0; j < width; ++j) { + const int32_t e = (int32_t)(dat[j]) - src[j]; + err += e * e; + } + dat += dat_stride; + src += src_stride; + } + } + + return err; +} + static int64_t get_pixel_proj_error(const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int use_highbitdepth, @@ -192,21 +263,9 @@ static int64_t get_pixel_proj_error(const uint8_t *src8, int width, int height, int xq[2]; decode_xq(xqd, xq, params); if (!use_highbitdepth) { - const uint8_t *src = src8; - const uint8_t *dat = dat8; - for (i = 0; i < height; ++i) { - for (j = 0; j < width; ++j) { - const int32_t u = - (int32_t)(dat[i * dat_stride + j] << SGRPROJ_RST_BITS); - int32_t v = u << SGRPROJ_PRJ_BITS; - if (params->r[0] > 0) v += xq[0] * (flt0[i * flt0_stride + j] - u); - if (params->r[1] > 0) v += xq[1] * (flt1[i * flt1_stride + j] - u); - const int32_t e = - ROUND_POWER_OF_TWO(v, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS) - - src[i * src_stride + j]; - err += e * e; - } - } + err = av1_lowbd_pixel_proj_error(src8, width, height, src_stride, dat8, + dat_stride, flt0, flt0_stride, flt1, + flt1_stride, xq, params); } else { const uint16_t *src = CONVERT_TO_SHORTPTR(src8); const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8); @@ -463,9 +522,11 @@ static void apply_sgr(int sgr_params_idx, const uint8_t *dat8, int width, // Iterate over the stripe in blocks of width pu_width for (int j = 0; j < width; j += pu_width) { const int w = AOMMIN(pu_width, width - j); - av1_selfguided_restoration(dat8_row + j, w, h, dat_stride, flt0_row + j, - flt1_row + j, flt_stride, sgr_params_idx, - bit_depth, use_highbd); + const int ret = av1_selfguided_restoration( + dat8_row + j, w, h, dat_stride, flt0_row + j, flt1_row + j, + flt_stride, sgr_params_idx, bit_depth, use_highbd); + (void)ret; + assert(!ret); } } } @@ -588,22 +649,9 @@ static void search_sgrproj(const RestorationTileLimits *limits, if (cost_sgr < cost_none) rsc->sgrproj = rusi->sgrproj; } -static double find_average(const uint8_t *src, int h_start, int h_end, - int v_start, int v_end, int stride) { - uint64_t sum = 0; - double avg = 0; - int i, j; - aom_clear_system_state(); - for (i = v_start; i < v_end; i++) - for (j = h_start; j < h_end; j++) sum += src[i * stride + j]; - avg = (double)sum / ((v_end - v_start) * (h_end - h_start)); - return avg; -} - -static void compute_stats(int wiener_win, const uint8_t *dgd, - const uint8_t *src, int h_start, int h_end, - int v_start, int v_end, int dgd_stride, - int src_stride, double *M, double *H) { +void av1_compute_stats_c(int wiener_win, const uint8_t *dgd, const uint8_t *src, + int h_start, int h_end, int v_start, int v_end, + int dgd_stride, int src_stride, double *M, double *H) { int i, j, k, l; double Y[WIENER_WIN2]; const int wiener_win2 = wiener_win * wiener_win; @@ -626,8 +674,7 @@ static void compute_stats(int wiener_win, const uint8_t *dgd, assert(idx == wiener_win2); for (k = 0; k < wiener_win2; ++k) { M[k] += Y[k] * X; - H[k * wiener_win2 + k] += Y[k] * Y[k]; - for (l = k + 1; l < wiener_win2; ++l) { + for (l = k; l < wiener_win2; ++l) { // H is a symmetric matrix, so we only need to fill out the upper // triangle here. We can copy it down to the lower triangle outside // the (i, j) loops. @@ -1073,9 +1120,9 @@ static void search_wiener(const RestorationTileLimits *limits, limits->h_start, limits->h_end, limits->v_start, limits->v_end, rsc->dgd_stride, rsc->src_stride, M, H); } else { - compute_stats(wiener_win, rsc->dgd_buffer, rsc->src_buffer, limits->h_start, - limits->h_end, limits->v_start, limits->v_end, - rsc->dgd_stride, rsc->src_stride, M, H); + av1_compute_stats(wiener_win, rsc->dgd_buffer, rsc->src_buffer, + limits->h_start, limits->h_end, limits->v_start, + limits->v_end, rsc->dgd_stride, rsc->src_stride, M, H); } const MACROBLOCK *const x = rsc->x; @@ -1266,6 +1313,7 @@ void av1_pick_filter_restoration(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi) { // problem, as these elements are ignored later, but in order to quiet // Valgrind's warnings we initialise the array below. memset(rusi, 0, sizeof(*rusi) * ntiles[0]); + cpi->td.mb.rdmult = cpi->rd.RDMULT; RestSearchCtxt rsc; const int plane_start = AOM_PLANE_Y; diff --git a/third_party/aom/av1/encoder/pickrst.h b/third_party/aom/av1/encoder/pickrst.h index 179b89ff9..3fec0c34b 100644 --- a/third_party/aom/av1/encoder/pickrst.h +++ b/third_party/aom/av1/encoder/pickrst.h @@ -8,22 +8,39 @@ * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_ENCODER_PICKRST_H_ -#define AV1_ENCODER_PICKRST_H_ +#ifndef AOM_AV1_ENCODER_PICKRST_H_ +#define AOM_AV1_ENCODER_PICKRST_H_ #ifdef __cplusplus extern "C" { #endif #include "av1/encoder/encoder.h" +#include "aom_ports/system_state.h" struct yv12_buffer_config; struct AV1_COMP; +static const uint8_t g_shuffle_stats_data[16] = { + 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, +}; + +static INLINE double find_average(const uint8_t *src, int h_start, int h_end, + int v_start, int v_end, int stride) { + uint64_t sum = 0; + double avg = 0; + int i, j; + aom_clear_system_state(); + for (i = v_start; i < v_end; i++) + for (j = h_start; j < h_end; j++) sum += src[i * stride + j]; + avg = (double)sum / ((v_end - v_start) * (h_end - h_start)); + return avg; +} + void av1_pick_filter_restoration(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi); #ifdef __cplusplus } // extern "C" #endif -#endif // AV1_ENCODER_PICKRST_H_ +#endif // AOM_AV1_ENCODER_PICKRST_H_ diff --git a/third_party/aom/av1/encoder/pustats.h b/third_party/aom/av1/encoder/pustats.h index 42a4c590b..40dd46768 100644 --- a/third_party/aom/av1/encoder/pustats.h +++ b/third_party/aom/av1/encoder/pustats.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_ENCODER_PUSTATS_H_ -#define AV1_ENCODER_PUSTATS_H_ +#ifndef AOM_AV1_ENCODER_PUSTATS_H_ +#define AOM_AV1_ENCODER_PUSTATS_H_ #ifdef __cplusplus extern "C" { @@ -18,83 +18,78 @@ extern "C" { #include "av1/encoder/ml.h" -#define NUM_FEATURES 11 +#define NUM_FEATURES_PUSTATS 8 #define NUM_HIDDEN_LAYERS 2 #define HIDDEN_LAYERS_0_NODES 12 #define HIDDEN_LAYERS_1_NODES 10 #define LOGITS_NODES 1 static const float - av1_pustats_rate_hiddenlayer_0_kernel[NUM_FEATURES * + av1_pustats_rate_hiddenlayer_0_kernel[NUM_FEATURES_PUSTATS * HIDDEN_LAYERS_0_NODES] = { - 21.5067f, 22.6709f, 0.0049f, 0.9288f, -0.0100f, 0.0060f, -0.0071f, - -0.0085f, 0.0348f, -0.1273f, 10.1154f, 6.3405f, 7.8589f, -0.0652f, - -4.6352f, 0.0445f, -3.2748f, 0.1025f, -0.0385f, -0.4505f, 1.1320f, - 3.2634f, 23.2420f, -7.9056f, 0.0522f, -18.1555f, 0.0977f, 0.1155f, - -0.0138f, 0.0267f, -0.3992f, 0.2735f, 22.8063f, 35.1043f, 3.8140f, - -0.0295f, 0.0771f, -0.6938f, 0.0302f, -0.0266f, 0.0989f, -0.0794f, - 0.2981f, 33.3333f, -24.1150f, 1.4986f, -0.0975f, -15.3938f, -0.0858f, - -0.0845f, -0.0869f, -0.0858f, 0.3542f, 0.0155f, -18.2629f, 9.6688f, - -11.9643f, -0.2904f, -5.3026f, -0.1011f, -0.1202f, 0.0127f, -0.0269f, - 0.3434f, 0.0595f, 16.6800f, 41.4730f, 6.9269f, -0.0512f, -1.4540f, - 0.0468f, 0.0077f, 0.0983f, 0.1265f, -0.5234f, 0.9477f, 36.6470f, - -0.4838f, -0.2269f, -0.1143f, -0.3907f, -0.5005f, -0.0179f, -0.1057f, - 0.1233f, -0.4412f, -0.0474f, 0.1140f, -21.6813f, -0.9077f, -0.0078f, - -3.3306f, 0.0417f, 0.0412f, 0.0427f, 0.0418f, -0.1699f, 0.0072f, - -22.3335f, 16.1203f, -10.1220f, -0.0019f, 0.0005f, -0.0054f, -0.0155f, - -0.0302f, -0.0379f, 0.1276f, 0.1568f, 21.6175f, 12.2919f, 11.0327f, - -0.2000f, -8.6691f, -0.5593f, -0.5952f, -0.4203f, -0.4857f, -1.1239f, - 3.1404f, -13.1098f, -5.9165f, 22.2060f, -0.0312f, -3.9642f, -0.0344f, - -0.0656f, -0.0273f, -0.0465f, 0.1412f, -6.1974f, 9.3661f, + -0.1758f, -0.0499f, -10.0069f, -2.2838f, -0.3359f, 0.3459f, -0.3285f, + -0.0515f, -0.5417f, 0.2357f, -0.0575f, -69.0782f, 0.5348f, 1.4068f, + 0.2213f, -1.0490f, -0.0636f, 0.1654f, 1.1002f, 33.4924f, 0.4358f, + 1.2499f, 0.1143f, 0.0592f, -1.6335f, -0.0092f, 1.2207f, -28.4543f, + -0.4973f, 0.4368f, 0.2341f, -0.1623f, -3.8986f, 0.1311f, -1.8789f, + -3.9079f, -0.8158f, -0.8420f, 1.4295f, -2.3629f, -1.4825f, 0.6498f, + -5.3669f, 6.4434f, 1.8393f, -35.0678f, 3.7459f, -2.8504f, 2.0502f, + -0.1812f, -3.9011f, -1.0155f, 1.8375f, -1.4517f, 1.3917f, 3.8664f, + 0.8345f, -0.3472f, 5.7740f, -1.1196f, -0.3264f, -1.2481f, -0.9284f, + -4.9657f, 2.2831f, 0.7337f, 2.3176f, 0.6416f, 0.8804f, 1.9988f, + -1.3426f, 1.2728f, 1.2249f, -0.1551f, 5.6045f, 0.2046f, -2.1464f, + -2.4922f, -0.5334f, 12.1055f, 7.2467f, -0.0070f, 0.0234f, 0.0021f, + 0.0215f, -0.0098f, -0.0682f, -6.1494f, -0.3176f, -1.6069f, -0.2119f, + -1.0533f, -0.3566f, 0.5294f, -0.4335f, 0.1626f, }; static const float av1_pustats_rate_hiddenlayer_0_bias[HIDDEN_LAYERS_0_NODES] = { - -14.3065f, 2.059f, -62.9916f, -50.1209f, 57.643f, -59.3737f, - -30.4737f, -0.1112f, 72.5427f, 55.402f, 24.9523f, 18.5834f, + 10.5266f, 5.3268f, -1.0678f, 7.7411f, 8.7164f, -0.3235f, + 7.3028f, 9.0874f, -6.4594f, -1.0102f, -1.1146f, 10.8419f, }; static const float av1_pustats_rate_hiddenlayer_1_kernel[HIDDEN_LAYERS_0_NODES * HIDDEN_LAYERS_1_NODES] = { - 0.3883f, -0.2784f, -0.2850f, 0.4894f, -2.2450f, 0.4511f, -0.1969f, - -0.0077f, -1.4924f, 0.1138f, -2.9848f, 1.0211f, -0.1712f, -0.1952f, - -0.4774f, 0.0761f, -0.3186f, -0.1002f, 0.8663f, 0.5026f, 1.1920f, - 0.9337f, 0.3911f, -0.3841f, -0.0037f, 0.7295f, -0.3183f, 0.1829f, - -1.3670f, -0.1046f, 0.6629f, 0.0619f, -0.1551f, 0.8174f, 2.1521f, - -1.3323f, -0.0527f, -0.5772f, 0.2001f, -0.6270f, -1.0625f, 0.3342f, - 0.6676f, 0.4605f, -2.0049f, 0.7781f, 0.0713f, -0.0824f, -0.4529f, - 0.1757f, -0.1338f, -0.2319f, -0.2864f, 0.1248f, 0.3887f, -0.1676f, - 1.8422f, 0.6435f, 1.2123f, -0.5667f, -0.2423f, -0.0314f, 0.2411f, - -0.5013f, 0.0422f, 0.2559f, 0.4435f, -0.1223f, 1.5167f, 0.3939f, - 1.0898f, 0.0795f, -0.9251f, -0.0813f, -0.5929f, -0.0741f, 4.0687f, - -0.4368f, -0.0984f, 0.0837f, 3.6169f, 0.0662f, -0.1679f, -0.8090f, - -0.2610f, -0.5791f, 0.0642f, -0.2979f, -0.9036f, 0.2898f, 0.3265f, - 0.4660f, -1.6358f, -0.0347f, 0.1087f, 0.0353f, 0.5687f, -0.5242f, - -0.4895f, 0.7693f, -1.3829f, -0.2244f, -0.2880f, 0.0575f, 2.0563f, - -0.2322f, -1.1597f, 1.6125f, -0.0925f, 1.3540f, 0.1432f, 0.3993f, - -0.0303f, -1.1438f, -1.7323f, -0.4329f, 2.9443f, -0.5724f, 0.0122f, - -1.0829f, + 10.5932f, 2.5192f, -0.0015f, 5.9479f, 5.2426f, -0.4091f, 5.3220f, + 6.0469f, 0.7200f, 3.3241f, 5.5006f, 12.8290f, -1.6396f, 0.5743f, + -0.8370f, 1.9956f, -4.9270f, -1.5295f, 2.1350f, -9.4415f, -0.7094f, + 5.1822f, 19.7287f, -3.0444f, -0.3320f, 0.0031f, -0.2709f, -0.5249f, + 0.3281f, -0.2240f, 0.2225f, -0.2386f, -0.4370f, -0.2438f, -0.4928f, + -0.2842f, -2.1772f, 9.2570f, -17.6655f, 3.5448f, -2.8394f, -1.0167f, + -0.5115f, -1.9260f, -0.2111f, -0.7528f, -1.2387f, -0.0401f, 5.0716f, + -3.3763f, -0.2898f, -0.4956f, -7.9993f, 0.1526f, -0.0242f, 0.7354f, + 6.0432f, 4.8043f, 7.4790f, -0.6295f, 1.7565f, 3.7197f, -2.3963f, + 6.8945f, 2.9717f, -3.1623f, 3.4241f, 4.4676f, -1.8154f, -2.9401f, + -8.5657f, -3.0240f, -1.4661f, 8.1145f, -12.7858f, 3.3624f, -1.0819f, + -4.2856f, 1.1801f, -0.5587f, -1.6062f, -1.1813f, -3.5882f, -0.2490f, + -24.9566f, -0.4140f, -0.1113f, 3.5537f, 4.4112f, 0.1367f, -1.5876f, + 1.6605f, 1.3903f, -0.0253f, -2.1419f, -2.2197f, -0.7659f, -0.4249f, + -0.0424f, 0.1486f, 0.4643f, -0.9068f, -0.3619f, -0.7624f, -0.9132f, + -0.4947f, -0.3527f, -0.5445f, -0.4768f, -1.7761f, -1.0686f, 0.5462f, + 1.3371f, 4.3116f, 0.0777f, -2.7216f, -1.8908f, 3.4989f, 7.7269f, + -2.7566f, }; static const float av1_pustats_rate_hiddenlayer_1_bias[HIDDEN_LAYERS_1_NODES] = { - -10.3717f, 37.304f, -36.7221f, -52.7572f, 44.0877f, - 41.1631f, 36.3299f, -48.6087f, -4.5189f, 13.0611f, + 13.2435f, -8.5477f, -0.0998f, -1.5131f, -12.0187f, + 6.1715f, 0.5094f, 7.6433f, -0.3992f, -1.3555f, }; static const float av1_pustats_rate_logits_kernel[HIDDEN_LAYERS_1_NODES * LOGITS_NODES] = { - 0.8362f, 1.0615f, -1.5178f, -1.2959f, 1.3233f, - 1.4909f, 1.3554f, -0.8626f, -0.618f, -0.9458f, + 4.3078f, -17.3497f, 0.0195f, 34.6032f, -5.0127f, + 5.3079f, 10.0077f, -13.129f, 0.0087f, -8.4009f, }; static const float av1_pustats_rate_logits_bias[LOGITS_NODES] = { - 30.6878f, + 4.5103f, }; static const NN_CONFIG av1_pustats_rate_nnconfig = { - NUM_FEATURES, // num_inputs + NUM_FEATURES_PUSTATS, // num_inputs LOGITS_NODES, // num_outputs NUM_HIDDEN_LAYERS, // num_hidden_layers { HIDDEN_LAYERS_0_NODES, HIDDEN_LAYERS_1_NODES }, // num_hidden_nodes @@ -111,76 +106,71 @@ static const NN_CONFIG av1_pustats_rate_nnconfig = { }; static const float - av1_pustats_dist_hiddenlayer_0_kernel[NUM_FEATURES * + av1_pustats_dist_hiddenlayer_0_kernel[NUM_FEATURES_PUSTATS * HIDDEN_LAYERS_0_NODES] = { - 0.7770f, 1.0881f, 0.0177f, 0.4939f, -0.2541f, -0.2672f, -0.1705f, - -0.1940f, -0.6395f, 1.2928f, 3.6240f, 2.4445f, 1.6790f, 0.0265f, - 0.1897f, 0.1776f, 0.0422f, 0.0197f, -0.0466f, 0.0462f, -1.0827f, - 2.0231f, 1.8044f, 2.7022f, 0.0064f, 0.2255f, -0.0552f, -0.1010f, - -0.0581f, -0.0781f, 0.2614f, -3.4085f, 1.7478f, 0.1155f, -0.1458f, - -0.0031f, -0.1797f, -0.4378f, -0.0539f, 0.0607f, -0.1347f, -0.3142f, - -0.2014f, -0.4484f, -0.2808f, 1.5913f, 0.0046f, -0.0610f, -0.6479f, - -0.7278f, -0.5592f, -0.6695f, -0.8120f, 2.9056f, -1.1501f, 9.3618f, - 4.2486f, 0.0011f, -0.1499f, -0.0834f, 0.1282f, 0.0409f, 0.1670f, - -0.1398f, -0.4661f, 13.7700f, 8.2061f, -0.0685f, 0.0061f, -0.2951f, - 0.0169f, 0.0520f, 0.0040f, 0.0374f, 0.0467f, -0.0107f, 14.2664f, - -2.2489f, -0.2516f, -0.0061f, -0.9921f, 0.1223f, 0.1212f, 0.1199f, - 0.1185f, -0.4867f, 0.0325f, -5.0757f, -8.7853f, 1.0450f, 0.0169f, - 0.5462f, 0.0051f, 0.1330f, 0.0143f, 0.1429f, -0.0258f, 0.2769f, - -12.8839f, 22.3093f, 1.2761f, 0.0037f, -1.2459f, -0.0466f, 0.0003f, - -0.0464f, -0.0067f, 0.2361f, 0.0355f, 23.3833f, 10.9218f, 2.6811f, - 0.0222f, -1.1055f, 0.1825f, 0.0575f, 0.0114f, -0.1259f, 0.3148f, - -2.0047f, 11.9559f, 5.7375f, 0.8802f, 0.0042f, -0.2469f, -0.1040f, - -1.5679f, 0.1969f, -0.0184f, 0.0157f, 0.6688f, 3.4492f, + -0.2560f, 0.1105f, -0.8434f, -0.0132f, -8.9371f, -1.1176f, -0.3655f, + 0.4885f, 1.7518f, 0.4985f, 0.5582f, -0.3739f, 0.9403f, 0.3874f, + 0.3265f, 1.7383f, 3.1747f, 0.0285f, 3.3942f, -0.0123f, 0.5057f, + 0.1584f, 0.2697f, 4.6151f, 3.6251f, -0.0121f, -1.0047f, -0.0037f, + 0.0127f, 0.1935f, -0.5277f, -2.7144f, 0.0729f, -0.1457f, -0.0816f, + -0.5462f, 0.4738f, 0.3599f, -0.0564f, 0.0910f, 0.0126f, -0.0310f, + -2.1311f, -0.4666f, -0.0074f, -0.0765f, 0.0287f, -0.2662f, -0.0999f, + -0.2983f, -0.4899f, -0.2314f, 0.2873f, -0.3614f, 0.1783f, -0.1210f, + 0.3569f, 0.5436f, -8.0536f, -0.0044f, -1.5255f, -0.8247f, -0.4556f, + 1.9045f, 0.5463f, 0.1102f, -0.9293f, -0.0185f, -0.8302f, -0.4378f, + -0.3531f, -1.3095f, 0.6099f, 0.7977f, 4.1950f, -0.0067f, -0.2762f, + -0.1574f, -0.2149f, 0.6104f, -1.7053f, 0.1904f, 4.2402f, -0.2671f, + 0.8940f, 0.6820f, 0.2241f, -0.9459f, 1.4571f, 0.5255f, 2.3352f, + -0.0806f, 0.5231f, 0.3928f, 0.4146f, 2.0956f, }; static const float av1_pustats_dist_hiddenlayer_0_bias[HIDDEN_LAYERS_0_NODES] = { - 4.5051f, -4.5858f, 1.4693f, 0.f, 3.7968f, -3.6292f, - -7.3112f, 10.9743f, 8.027f, -2.2692f, -8.748f, -1.3689f, + 1.1597f, 0.0836f, -0.7471f, -0.2439f, -0.0438f, 2.4626f, + 0.f, 1.1485f, 2.7085f, -4.7897f, 1.4093f, -1.657f, }; static const float av1_pustats_dist_hiddenlayer_1_kernel[HIDDEN_LAYERS_0_NODES * HIDDEN_LAYERS_1_NODES] = { - -0.0182f, -0.0925f, -0.0311f, -0.2962f, 0.1177f, -0.0027f, -0.2136f, - -1.2094f, 0.0935f, -0.1403f, -0.1477f, -0.0752f, 0.1519f, -0.4726f, - -0.3521f, 0.4199f, -0.0168f, -0.2927f, -0.2510f, 0.0706f, -0.2920f, - 0.2046f, -0.0400f, -0.2114f, 0.4240f, -0.7070f, 0.4964f, 0.4471f, - 0.3841f, -0.0918f, -0.6140f, 0.6056f, -0.1123f, 0.3944f, -0.0178f, - -1.7702f, -0.4434f, 0.0560f, 0.1565f, -0.0793f, -0.0041f, 0.0052f, - -0.1843f, 0.2400f, -0.0605f, 0.3196f, -0.0286f, -0.0002f, -0.0595f, - -0.0493f, -0.2636f, -0.3994f, -0.1871f, -0.3298f, -0.0788f, -1.0685f, - 0.1900f, -0.5549f, -0.1350f, -0.0153f, -0.1195f, -0.5874f, 1.0468f, - 0.0212f, -0.2306f, -0.2677f, -0.3000f, -1.0702f, -0.1725f, -0.0656f, - -0.0226f, 0.0616f, -0.3453f, 0.0810f, 0.4838f, -0.3780f, -1.4486f, - 0.7777f, -0.0459f, -0.6568f, 0.0589f, -1.0286f, -0.6001f, 0.0826f, - 0.4794f, -0.0586f, -0.1759f, 0.3811f, -0.1313f, 0.3829f, -0.0968f, - -2.0445f, -0.3566f, -0.1491f, -0.0745f, -0.0202f, 0.0839f, 0.0470f, - -0.2432f, 0.3013f, -0.0743f, -0.3479f, 0.0749f, -5.2490f, 0.0209f, - -0.1653f, -0.0826f, -0.0535f, 0.3225f, -0.3786f, -0.0104f, 0.3091f, - 0.3652f, 0.1757f, -0.3252f, -1.1022f, -0.0574f, -0.4473f, 0.3469f, - -0.5539f, + -0.5203f, -1.3468f, 0.3865f, -0.6859f, 0.0058f, 4.0682f, 0.4807f, + -0.1380f, 0.6050f, 0.8958f, 0.7748f, -0.1311f, 1.7317f, 1.1265f, + 0.0827f, 0.1407f, -0.3605f, 0.5429f, 0.1880f, -0.1439f, 0.2837f, + 1.6477f, 0.0832f, 0.0593f, -1.8464f, -0.7241f, -1.0672f, -0.3546f, + -0.3842f, -2.3637f, 0.2514f, 0.8263f, -0.1872f, 0.5774f, -0.3610f, + -0.0205f, 1.3977f, -0.1083f, 0.6923f, 1.3039f, -0.2870f, 1.0622f, + -0.0566f, 0.2697f, -0.5429f, -0.6193f, 1.7559f, 0.3246f, 1.9159f, + 0.3744f, 0.0686f, 1.0191f, -0.4212f, 1.9591f, -0.0691f, -0.1085f, + -1.2034f, 0.0606f, 1.0116f, 0.5565f, -0.1874f, -0.7898f, 0.4796f, + 0.2290f, 0.4334f, -0.5817f, -0.2949f, 0.1367f, -0.2932f, -1.1265f, + 0.0133f, -0.5309f, -3.3191f, 0.0939f, 0.3895f, -2.5812f, -0.0066f, + -3.0063f, -0.2982f, 0.7309f, -0.2422f, -0.2770f, -0.7152f, 0.1700f, + 1.9630f, 0.1988f, 0.4194f, 0.8762f, 0.3402f, 0.1051f, -0.1598f, + 0.2405f, 0.0392f, 1.1256f, 1.5245f, 0.0950f, 0.2160f, -0.5023f, + 0.2584f, 0.2074f, 0.2218f, 0.3966f, -0.0921f, -0.2435f, -0.4560f, + -1.1923f, -0.3716f, -0.3286f, -1.3225f, 0.1896f, -0.3342f, -0.7888f, + -0.4488f, -1.7168f, 0.3341f, 0.1146f, 0.5226f, 0.2610f, -0.4574f, + -0.4164f, }; static const float av1_pustats_dist_hiddenlayer_1_bias[HIDDEN_LAYERS_1_NODES] = { - 11.9337f, -0.3681f, -6.1324f, 12.674f, 9.0956f, - 4.6069f, -4.4158f, -12.4848f, 10.8473f, 5.7633f, + -2.3014f, -2.4292f, 1.3317f, -3.2361f, -1.918f, + 2.7149f, -2.5649f, 2.7765f, 2.9617f, 2.7684f, }; static const float av1_pustats_dist_logits_kernel[HIDDEN_LAYERS_1_NODES * LOGITS_NODES] = { - 0.3245f, 0.2979f, -0.157f, -0.1441f, 0.1413f, - -0.7496f, -0.1737f, -0.5322f, 0.0748f, 0.2518f, + -0.6868f, -0.6715f, 0.449f, -1.293f, 0.6214f, + 0.9894f, -0.4342f, 0.7002f, 1.4363f, 0.6951f, }; static const float av1_pustats_dist_logits_bias[LOGITS_NODES] = { - 4.6065f, + 2.3371f, }; static const NN_CONFIG av1_pustats_dist_nnconfig = { - NUM_FEATURES, // num_inputs + NUM_FEATURES_PUSTATS, // num_inputs LOGITS_NODES, // num_outputs NUM_HIDDEN_LAYERS, // num_hidden_layers { HIDDEN_LAYERS_0_NODES, HIDDEN_LAYERS_1_NODES }, // num_hidden_nodes @@ -196,7 +186,6 @@ static const NN_CONFIG av1_pustats_dist_nnconfig = { }, }; -#undef NUM_FEATURES #undef NUM_HIDDEN_LAYERS #undef HIDDEN_LAYERS_0_NODES #undef HIDDEN_LAYERS_1_NODES @@ -206,4 +195,4 @@ static const NN_CONFIG av1_pustats_dist_nnconfig = { } // extern "C" #endif -#endif // AV1_ENCODER_PUSTATS_H_ +#endif // AOM_AV1_ENCODER_PUSTATS_H_ diff --git a/third_party/aom/av1/encoder/random.h b/third_party/aom/av1/encoder/random.h index 9b2dac965..0bca39102 100644 --- a/third_party/aom/av1/encoder/random.h +++ b/third_party/aom/av1/encoder/random.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_ENCODER_RANDOM_H_ -#define AV1_ENCODER_RANDOM_H_ +#ifndef AOM_AV1_ENCODER_RANDOM_H_ +#define AOM_AV1_ENCODER_RANDOM_H_ #ifdef __cplusplus extern "C" { @@ -26,4 +26,4 @@ static INLINE unsigned int lcg_rand16(unsigned int *state) { } // extern "C" #endif -#endif // AV1_ENCODER_RANDOM_H_ +#endif // AOM_AV1_ENCODER_RANDOM_H_ diff --git a/third_party/aom/av1/encoder/ransac.h b/third_party/aom/av1/encoder/ransac.h index 1019055ed..c429f2ce5 100644 --- a/third_party/aom/av1/encoder/ransac.h +++ b/third_party/aom/av1/encoder/ransac.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_ENCODER_RANSAC_H_ -#define AV1_ENCODER_RANSAC_H_ +#ifndef AOM_AV1_ENCODER_RANSAC_H_ +#define AOM_AV1_ENCODER_RANSAC_H_ #include <stdio.h> #include <stdlib.h> @@ -32,4 +32,4 @@ int ransac_rotzoom(int *matched_points, int npoints, int *num_inliers_by_motion, int ransac_translation(int *matched_points, int npoints, int *num_inliers_by_motion, double *params_by_motion, int num_motions); -#endif // AV1_ENCODER_RANSAC_H_ +#endif // AOM_AV1_ENCODER_RANSAC_H_ diff --git a/third_party/aom/av1/encoder/rate_distortion_model_params.h b/third_party/aom/av1/encoder/rate_distortion_model_params.h index 14d23f10f..7cd0962c5 100644 --- a/third_party/aom/av1/encoder/rate_distortion_model_params.h +++ b/third_party/aom/av1/encoder/rate_distortion_model_params.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_ENCODER_RATE_DISTORTION_MODEL_PARAMS_H_ -#define AV1_ENCODER_RATE_DISTORTION_MODEL_PARAMS_H_ +#ifndef AOM_AV1_ENCODER_RATE_DISTORTION_MODEL_PARAMS_H_ +#define AOM_AV1_ENCODER_RATE_DISTORTION_MODEL_PARAMS_H_ #ifdef __cplusplus extern "C" { @@ -588,4 +588,4 @@ static const NN_CONFIG av1_rdcost_model_nnconfig = { } // extern "C" #endif -#endif // AV1_ENCODER_RATE_DISTORTION_MODEL_PARAMS_H_ +#endif // AOM_AV1_ENCODER_RATE_DISTORTION_MODEL_PARAMS_H_ diff --git a/third_party/aom/av1/encoder/ratectrl.c b/third_party/aom/av1/encoder/ratectrl.c index 3aae0144e..2597fb990 100644 --- a/third_party/aom/av1/encoder/ratectrl.c +++ b/third_party/aom/av1/encoder/ratectrl.c @@ -117,7 +117,7 @@ static void init_minq_luts(int *kf_low_m, int *kf_high_m, int *arfgf_low, for (i = 0; i < QINDEX_RANGE; i++) { const double maxq = av1_convert_qindex_to_q(i, bit_depth); kf_low_m[i] = get_minq_index(maxq, 0.000001, -0.0004, 0.150, bit_depth); - kf_high_m[i] = get_minq_index(maxq, 0.0000021, -0.00125, 0.55, bit_depth); + kf_high_m[i] = get_minq_index(maxq, 0.0000021, -0.00125, 0.45, bit_depth); arfgf_low[i] = get_minq_index(maxq, 0.0000015, -0.0009, 0.30, bit_depth); arfgf_high[i] = get_minq_index(maxq, 0.0000021, -0.00125, 0.55, bit_depth); inter[i] = get_minq_index(maxq, 0.00000271, -0.00113, 0.90, bit_depth); @@ -253,6 +253,9 @@ int av1_rc_get_default_min_gf_interval(int width, int height, int av1_rc_get_default_max_gf_interval(double framerate, int min_gf_interval) { int interval = AOMMIN(MAX_GF_INTERVAL, (int)(framerate * 0.75)); interval += (interval & 0x01); // Round to even value +#if CONFIG_FIX_GF_LENGTH + interval = AOMMAX(FIXED_GF_LENGTH, interval); +#endif return AOMMAX(interval, min_gf_interval); } @@ -299,9 +302,9 @@ void av1_rc_init(const AV1EncoderConfig *oxcf, int pass, RATE_CONTROL *rc) { rc->avg_q = av1_convert_qindex_to_q(oxcf->worst_allowed_q, oxcf->bit_depth); for (i = 0; i < RATE_FACTOR_LEVELS; ++i) { - rc->rate_correction_factors[i] = 1.0; + rc->rate_correction_factors[i] = 0.7; } - + rc->rate_correction_factors[KF_STD] = 1.0; rc->min_gf_interval = oxcf->min_gf_interval; rc->max_gf_interval = oxcf->max_gf_interval; if (rc->min_gf_interval == 0) @@ -556,6 +559,14 @@ static int get_gf_active_quality(const RATE_CONTROL *const rc, int q, arfgf_low_motion_minq, arfgf_high_motion_minq); } +#if REDUCE_LAST_ALT_BOOST +static int get_gf_high_motion_quality(int q, aom_bit_depth_t bit_depth) { + int *arfgf_high_motion_minq; + ASSIGN_MINQ_TABLE(bit_depth, arfgf_high_motion_minq); + return arfgf_high_motion_minq[q]; +} +#endif + static int calc_active_worst_quality_one_pass_vbr(const AV1_COMP *cpi) { const RATE_CONTROL *const rc = &cpi->rc; const unsigned int curr_frame = cpi->common.current_video_frame; @@ -918,7 +929,7 @@ int av1_frame_type_qdelta(const AV1_COMP *cpi, int rf_level, int q) { #define STATIC_MOTION_THRESH 95 static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int width, int height, int *bottom_index, - int *top_index) { + int *top_index, int *arf_q) { const AV1_COMMON *const cm = &cpi->common; const RATE_CONTROL *const rc = &cpi->rc; const AV1EncoderConfig *const oxcf = &cpi->oxcf; @@ -959,7 +970,7 @@ static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int width, qindex = rc->last_boosted_qindex; last_boosted_q = av1_convert_qindex_to_q(qindex, bit_depth); delta_qindex = av1_compute_qdelta(rc, last_boosted_q, - last_boosted_q * 0.75, bit_depth); + last_boosted_q * 0.5, bit_depth); active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality); } } else { @@ -1000,17 +1011,49 @@ static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int width, // For constrained quality dont allow Q less than the cq level if (oxcf->rc_mode == AOM_CQ) { if (q < cq_level) q = cq_level; +#if USE_SYMM_MULTI_LAYER && MULTI_LVL_BOOST_VBR_CQ + if (gf_group->update_type[gf_group->index] == ARF_UPDATE || + (is_intrl_arf_boost && !cpi->new_bwdref_update_rule)) { +#endif // USE_SYMM_MULTI_LAYER && MULTI_LVL_BOOST_VBR_CQ + active_best_quality = get_gf_active_quality(rc, q, bit_depth); - active_best_quality = get_gf_active_quality(rc, q, bit_depth); - - // Constrained quality use slightly lower active best. - active_best_quality = active_best_quality * 15 / 16; + // Constrained quality use slightly lower active best. + active_best_quality = active_best_quality * 15 / 16; +#if REDUCE_LAST_ALT_BOOST + if (gf_group->update_type[gf_group->index] == ARF_UPDATE) { + const int min_boost = get_gf_high_motion_quality(q, bit_depth); + const int boost = min_boost - active_best_quality; + active_best_quality = min_boost - (int)(boost * rc->arf_boost_factor); + } +#endif + *arf_q = active_best_quality; +#if USE_SYMM_MULTI_LAYER && MULTI_LVL_BOOST_VBR_CQ + } else { + active_best_quality = rc->arf_q; + int this_height = gf_group->pyramid_level[gf_group->index]; + while (this_height < gf_group->pyramid_height) { + active_best_quality = (active_best_quality + cq_level + 1) / 2; + ++this_height; + } + } +#endif // USE_SYMM_MULTI_LAYER && MULTI_LVL_BOOST_VBR_CQ } else if (oxcf->rc_mode == AOM_Q) { if (!cpi->refresh_alt_ref_frame && !is_intrl_arf_boost) { active_best_quality = cq_level; } else { - active_best_quality = get_gf_active_quality(rc, q, bit_depth); + if (gf_group->update_type[gf_group->index] == ARF_UPDATE) { + active_best_quality = get_gf_active_quality(rc, q, bit_depth); + *arf_q = active_best_quality; +#if REDUCE_LAST_ALT_BOOST + const int min_boost = get_gf_high_motion_quality(q, bit_depth); + const int boost = min_boost - active_best_quality; + + active_best_quality = min_boost - (int)(boost * rc->arf_boost_factor); +#endif + } else { + active_best_quality = rc->arf_q; + } #if USE_SYMM_MULTI_LAYER if (cpi->new_bwdref_update_rule && is_intrl_arf_boost) { int this_height = gf_group->pyramid_level[gf_group->index]; @@ -1030,6 +1073,12 @@ static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int width, } } else { active_best_quality = get_gf_active_quality(rc, q, bit_depth); +#if REDUCE_LAST_ALT_BOOST + const int min_boost = get_gf_high_motion_quality(q, bit_depth); + const int boost = min_boost - active_best_quality; + + active_best_quality = min_boost - (int)(boost * rc->arf_boost_factor); +#endif #if USE_SYMM_MULTI_LAYER if (cpi->new_bwdref_update_rule && is_intrl_arf_boost) { int this_height = gf_group->pyramid_level[gf_group->index]; @@ -1104,7 +1153,8 @@ static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int width, if (cpi->twopass.last_kfgroup_zeromotion_pct >= STATIC_MOTION_THRESH) { q = AOMMIN(rc->last_kf_qindex, rc->last_boosted_qindex); } else { - q = rc->last_boosted_qindex; + q = AOMMIN(rc->last_boosted_qindex, + (active_best_quality + active_worst_quality) / 2); } } else { q = av1_rc_regulate_q(cpi, rc->this_frame_target, active_best_quality, @@ -1129,7 +1179,7 @@ static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int width, return q; } -int av1_rc_pick_q_and_bounds(const AV1_COMP *cpi, int width, int height, +int av1_rc_pick_q_and_bounds(AV1_COMP *cpi, int width, int height, int *bottom_index, int *top_index) { int q; if (cpi->oxcf.pass == 0) { @@ -1140,8 +1190,17 @@ int av1_rc_pick_q_and_bounds(const AV1_COMP *cpi, int width, int height, q = rc_pick_q_and_bounds_one_pass_vbr(cpi, width, height, bottom_index, top_index); } else { + assert(cpi->oxcf.pass == 2 && "invalid encode pass"); + + GF_GROUP *gf_group = &cpi->twopass.gf_group; + int arf_q = 0; + q = rc_pick_q_and_bounds_two_pass(cpi, width, height, bottom_index, - top_index); + top_index, &arf_q); + + if (gf_group->update_type[gf_group->index] == ARF_UPDATE) { + cpi->rc.arf_q = arf_q; + } } return q; @@ -1327,13 +1386,6 @@ void av1_rc_postencode_update(AV1_COMP *cpi, uint64_t bytes_used) { update_golden_frame_stats(cpi); if (cm->frame_type == KEY_FRAME) rc->frames_since_key = 0; - - // TODO(zoeliu): To investigate whether we should treat BWDREF_FRAME - // differently here for rc->avg_frame_bandwidth. - if (cm->show_frame || rc->is_bwd_ref_frame) { - rc->frames_since_key++; - rc->frames_to_key--; - } // if (cm->current_video_frame == 1 && cm->show_frame) /* rc->this_frame_target = @@ -1635,10 +1687,6 @@ void av1_rc_set_gf_interval_range(const AV1_COMP *const cpi, if (rc->max_gf_interval > rc->static_scene_max_gf_interval) rc->max_gf_interval = rc->static_scene_max_gf_interval; -#if FIX_GF_INTERVAL_LENGTH - rc->max_gf_interval = FIXED_GF_LENGTH + 1; -#endif - // Clamp min to max rc->min_gf_interval = AOMMIN(rc->min_gf_interval, rc->max_gf_interval); } diff --git a/third_party/aom/av1/encoder/ratectrl.h b/third_party/aom/av1/encoder/ratectrl.h index f0508da9e..198ecab97 100644 --- a/third_party/aom/av1/encoder/ratectrl.h +++ b/third_party/aom/av1/encoder/ratectrl.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_ENCODER_RATECTRL_H_ -#define AV1_ENCODER_RATECTRL_H_ +#ifndef AOM_AV1_ENCODER_RATECTRL_H_ +#define AOM_AV1_ENCODER_RATECTRL_H_ #include "aom/aom_codec.h" #include "aom/aom_integer.h" @@ -25,13 +25,27 @@ extern "C" { #define BPER_MB_NORMBITS 9 #define CUSTOMIZED_GF 1 -#define FIX_GF_INTERVAL_LENGTH 0 -#if FIX_GF_INTERVAL_LENGTH +#if CONFIG_FIX_GF_LENGTH #define FIXED_GF_LENGTH 16 +#define MAX_PYRAMID_LVL 4 +// We allow a frame to have at most two left/right descendants before changing +// them into to a subtree, i.e., we allow the following structure: +/* OUT_OF_ORDER_FRAME + / / \ \ +(two left children) F F F F (two right children) */ +// Therefore the max gf size supported by 4 layer structure is +// 1 (KEY/OVERLAY) + 1 + 2 + 4 + 16 (two children on both side of their parent) +#define MAX_PYRAMID_SIZE 24 #define USE_SYMM_MULTI_LAYER 1 +#define REDUCE_LAST_ALT_BOOST 1 +#define REDUCE_LAST_GF_LENGTH 1 +#define MULTI_LVL_BOOST_VBR_CQ 1 #else #define USE_SYMM_MULTI_LAYER 0 +#define REDUCE_LAST_ALT_BOOST 0 +#define REDUCE_LAST_GF_LENGTH 0 +#define MULTI_LVL_BOOST_VBR_CQ 0 #endif #if USE_SYMM_MULTI_LAYER @@ -159,6 +173,9 @@ typedef struct { // Auto frame-scaling variables. int rf_level_maxq[RATE_FACTOR_LEVELS]; + float_t arf_boost_factor; + // Q index used for ALT frame + int arf_q; } RATE_CONTROL; struct AV1_COMP; @@ -228,7 +245,7 @@ void av1_rc_compute_frame_size_bounds(const struct AV1_COMP *cpi, int *frame_over_shoot_limit); // Picks q and q bounds given the target for bits -int av1_rc_pick_q_and_bounds(const struct AV1_COMP *cpi, int width, int height, +int av1_rc_pick_q_and_bounds(struct AV1_COMP *cpi, int width, int height, int *bottom_index, int *top_index); // Estimates q to achieve a target bits per frame @@ -275,4 +292,4 @@ int av1_resize_one_pass_cbr(struct AV1_COMP *cpi); } // extern "C" #endif -#endif // AV1_ENCODER_RATECTRL_H_ +#endif // AOM_AV1_ENCODER_RATECTRL_H_ diff --git a/third_party/aom/av1/encoder/ratectrl_xiph.c b/third_party/aom/av1/encoder/ratectrl_xiph.c deleted file mode 100644 index e69de29bb..000000000 --- a/third_party/aom/av1/encoder/ratectrl_xiph.c +++ /dev/null diff --git a/third_party/aom/av1/encoder/ratectrl_xiph.h b/third_party/aom/av1/encoder/ratectrl_xiph.h deleted file mode 100644 index e69de29bb..000000000 --- a/third_party/aom/av1/encoder/ratectrl_xiph.h +++ /dev/null diff --git a/third_party/aom/av1/encoder/rd.c b/third_party/aom/av1/encoder/rd.c index c4d4777bf..b87d89e50 100644 --- a/third_party/aom/av1/encoder/rd.c +++ b/third_party/aom/av1/encoder/rd.c @@ -648,6 +648,473 @@ void av1_model_rd_from_var_lapndz(int64_t var, unsigned int n_log2, } } +static double interp_cubic(const double *p, double x) { + return p[1] + 0.5 * x * + (p[2] - p[0] + + x * (2.0 * p[0] - 5.0 * p[1] + 4.0 * p[2] - p[3] + + x * (3.0 * (p[1] - p[2]) + p[3] - p[0]))); +} + +static double interp_bicubic(const double *p, int p_stride, double x, + double y) { + double q[4]; + q[0] = interp_cubic(p, x); + q[1] = interp_cubic(p + p_stride, x); + q[2] = interp_cubic(p + 2 * p_stride, x); + q[3] = interp_cubic(p + 3 * p_stride, x); + return interp_cubic(q, y); +} + +static const double interp_rgrid_surf[65 * 18] = { + 0.104019, 0.245714, 0.293686, 0.358635, 0.382167, 0.412446, + 0.419955, 0.421388, 0.426672, 0.427990, 0.428531, 0.456868, + 0.569880, 0.638822, 1.016319, 2.143453, 3.565229, 4.720880, + 0.124618, 0.294211, 0.352023, 0.429991, 0.458206, 0.494510, + 0.503513, 0.505232, 0.511566, 0.513234, 0.519365, 0.570225, + 0.697373, 0.840624, 1.462198, 3.289054, 6.256517, 6.852788, + 0.118630, 0.269669, 0.346620, 0.430999, 0.459385, 0.495783, + 0.504808, 0.506532, 0.512884, 0.514988, 0.543437, 0.662772, + 0.795876, 1.313596, 2.403841, 4.163098, 7.440589, 8.616275, + 0.093329, 0.168205, 0.321320, 0.430607, 0.459385, 0.495783, + 0.504813, 0.506548, 0.512975, 0.520662, 0.571659, 0.701841, + 1.010727, 2.138851, 3.460626, 6.317955, 10.098127, 14.418553, + 0.087021, 0.142905, 0.315011, 0.430509, 0.459385, 0.495787, + 0.505075, 0.507599, 0.513584, 0.543182, 0.669941, 0.825620, + 1.362800, 2.572187, 4.205047, 7.498399, 12.303118, 16.641735, + 0.086923, 0.142513, 0.314913, 0.430508, 0.459385, 0.495803, + 0.506126, 0.511816, 0.514810, 0.549705, 0.725350, 1.127334, + 2.168597, 3.463686, 6.318605, 10.162284, 18.556041, 19.847042, + 0.086923, 0.142513, 0.314913, 0.430506, 0.459376, 0.495805, + 0.506388, 0.512954, 0.520772, 0.580215, 0.810474, 1.391548, + 2.579442, 4.205160, 7.498399, 12.381597, 21.703618, 24.015457, + 0.086923, 0.142513, 0.314911, 0.430353, 0.458765, 0.495652, + 0.506391, 0.513406, 0.544098, 0.702950, 1.121860, 2.168961, + 3.463798, 6.318607, 10.162284, 18.685361, 28.188192, 37.638872, + 0.086923, 0.142513, 0.314901, 0.429742, 0.456313, 0.495045, + 0.506484, 0.519195, 0.580104, 0.810126, 1.391462, 2.579441, + 4.205160, 7.498399, 12.381597, 21.848607, 33.367199, 42.623190, + 0.086923, 0.142513, 0.314899, 0.429589, 0.455706, 0.495155, + 0.507882, 0.542426, 0.702360, 1.119921, 2.168478, 3.463791, + 6.318607, 10.162284, 18.685361, 28.345760, 47.802028, 49.163533, + 0.086924, 0.142548, 0.315086, 0.429842, 0.455870, 0.496336, + 0.512412, 0.556953, 0.773373, 1.266396, 2.548277, 4.204676, + 7.498399, 12.381597, 21.848607, 33.548250, 54.301011, 56.262859, + 0.087067, 0.144957, 0.327436, 0.446616, 0.466362, 0.505706, + 0.522077, 0.610747, 0.972543, 1.666916, 3.338812, 6.316669, + 10.162284, 18.685361, 28.345760, 48.065311, 66.145302, 78.396020, + 0.094295, 0.164235, 0.393722, 0.534219, 0.530922, 0.579308, + 0.603889, 0.760870, 1.229961, 2.423214, 4.173513, 7.497916, + 12.381597, 21.848607, 33.548250, 54.589585, 74.875848, 86.468182, + 0.124096, 0.213005, 0.497188, 0.665176, 0.685973, 0.800200, + 0.911394, 1.077971, 1.677290, 3.332129, 6.314960, 10.162257, + 18.685361, 28.345760, 48.065311, 66.453506, 98.275189, 96.862588, + 0.140999, 0.270140, 0.658212, 0.867661, 0.970183, 1.149516, + 1.480599, 1.664833, 2.421893, 3.857981, 7.418830, 12.380371, + 21.848607, 33.548250, 54.589585, 75.188867, 106.657971, 99.762997, + 0.178353, 0.398001, 0.988462, 1.241473, 1.340967, 1.713568, + 2.335030, 2.701432, 3.348532, 5.077158, 9.829903, 18.676528, + 28.345700, 48.065311, 66.453506, 98.588283, 117.057193, 101.130722, + 0.281079, 0.548300, 1.395825, 1.780770, 2.000508, 2.702964, + 3.638454, 4.573843, 5.051641, 7.079129, 11.293332, 21.594861, + 33.544335, 54.589585, 75.188867, 106.971065, 119.957601, 101.466632, + 0.476762, 0.842189, 2.019678, 2.723895, 3.188467, 4.011610, + 5.545111, 7.508984, 8.176339, 9.774504, 14.720782, 27.334416, + 48.049609, 66.453506, 98.588283, 117.370357, 121.329855, 101.509242, + 0.993999, 1.520111, 3.013605, 4.203530, 4.982992, 6.074944, + 8.583581, 11.818375, 14.192544, 14.937517, 21.258160, 33.305953, + 54.585735, 75.188867, 106.971135, 120.279824, 121.976055, 102.690130, + 1.776487, 2.613655, 4.356487, 6.161726, 7.622196, 9.464193, + 13.077233, 18.051656, 23.221051, 24.080068, 30.085038, 48.345269, + 66.457698, 98.588353, 117.379415, 121.976128, 124.356210, 107.713202, + 3.191085, 4.495201, 5.686033, 8.365566, 11.275339, 14.706437, + 20.300969, 28.152237, 35.688355, 39.341382, 41.030743, 55.752262, + 75.211764, 106.980285, 120.608403, 124.680746, 130.222528, 112.260098, + 6.136611, 7.305215, 7.272532, 10.646713, 15.630815, 22.383168, + 31.349131, 42.419822, 52.301680, 58.983454, 58.915405, 69.161305, + 98.992460, 117.713855, 124.344836, 130.623638, 138.442401, 127.846670, + 11.707980, 13.490761, 11.640845, 14.176132, 22.131124, 33.776462, + 47.365711, 61.603834, 75.281056, 83.463985, 85.510533, 86.026513, + 108.787480, 123.031136, 130.607284, 138.954406, 160.867784, 158.958882, + 27.062874, 32.195139, 24.147297, 22.114632, 35.580506, 52.551674, + 71.652956, 88.606776, 102.107193, 110.703186, 114.398733, 111.118539, + 121.503578, 132.455924, 139.490806, 161.412674, 193.563210, 172.203945, + 35.625692, 47.953028, 42.639820, 42.276254, 58.815664, 84.977282, + 110.656412, 126.168446, 134.658126, 140.604482, 144.006012, 141.702382, + 140.125323, 153.122630, 164.748041, 194.156197, 206.854650, 174.013079, + 49.516447, 65.335381, 71.738306, 81.872819, 98.400740, 136.840488, + 163.775802, 169.440078, 172.747876, 171.222919, 171.679604, 172.173550, + 168.200129, 187.617133, 199.683394, 207.768200, 210.062520, 175.478356, + 60.341673, 92.487135, 119.907299, 136.068010, 144.778950, 189.443534, + 220.120077, 219.641635, 214.616503, 205.894657, 198.453924, 200.013069, + 195.938103, 206.118661, 210.447375, 212.061379, 216.078218, 181.162805, + 78.422159, 112.242899, 158.416312, 181.404320, 193.188690, 229.296967, + 270.461799, 275.168977, 256.511701, 244.706786, 231.344608, 226.065087, + 222.248618, 218.662324, 217.966722, 218.248574, 218.818588, 182.740573, + 88.713664, 123.594164, 172.928179, 213.781414, 245.800351, 252.063414, + 313.283141, 331.703831, 305.866639, 285.177142, 269.759635, 251.988739, + 245.998388, 232.688076, 230.588702, 230.882657, 230.319053, 192.120741, + 102.540561, 152.905927, 189.137131, 241.806756, 273.868497, 284.258017, + 339.689853, 373.561104, 362.657463, 326.291984, 311.922687, 290.460189, + 276.774381, 273.012072, 277.751792, 279.123748, 278.820447, 233.813798, + 132.983118, 176.307242, 197.415684, 243.307787, 280.893995, 332.922370, + 340.329043, 404.530166, 419.475405, 375.775209, 351.300889, 340.042759, + 315.683832, 306.123530, 306.359319, 306.733063, 307.609556, 261.647847, + 149.579109, 185.925581, 207.937033, 245.159084, 301.890957, 350.040480, + 352.250771, 418.742329, 458.112686, 430.125208, 386.460441, 380.346839, + 354.679150, 337.305620, 334.504124, 335.889932, 341.060725, 286.898578, + 153.576812, 202.105624, 219.366967, 248.524506, 314.255692, 350.607526, + 390.567688, 408.629209, 488.000213, 480.563823, 432.461799, 410.412624, + 398.607371, 400.188740, 402.780916, 408.853470, 430.449735, 363.777088, + 161.353129, 214.848904, 231.549852, 258.536466, 313.163177, 368.140577, + 412.136393, 413.409032, 499.838438, 519.571063, 485.833867, 444.562715, + 435.738129, 442.358549, 450.166531, 453.208524, 458.424358, 385.823139, + 175.109034, 227.608058, 250.069563, 286.101747, 312.256740, 378.421485, + 413.344147, 435.058646, 476.960941, 542.448886, 530.189154, 495.408402, + 475.326752, 465.017144, 464.694045, 465.144689, 466.905382, 398.669138, + 184.750180, 240.766694, 283.240772, 305.480150, 322.409001, 374.526162, + 427.141326, 452.840323, 472.604139, 545.366105, 567.676694, 541.666203, + 509.591873, 492.044219, 492.778569, 493.765684, 493.235693, 413.684325, + 194.728357, 254.928927, 289.991157, 300.193195, 324.194589, 371.563147, + 439.226438, 468.295088, 495.654854, 533.506353, 587.476353, 578.298989, + 548.041942, 527.393885, 538.965146, 545.070442, 544.295454, 454.012211, + 205.195287, 283.135677, 297.921431, 319.295927, 355.621830, 392.466463, + 446.696167, 485.053519, 516.426615, 532.264584, 588.481600, 615.906737, + 589.319634, 555.754316, 558.389367, 569.094521, 569.779764, 475.384946, + 218.552054, 298.511016, 319.188338, 351.781666, 372.789510, 412.827434, + 464.569387, 506.270203, 533.049810, 553.347364, 580.644599, 632.759854, + 622.235843, 569.960552, 580.799340, 586.553714, 579.488366, 491.826482, + 244.803348, 299.790203, 324.187975, 363.280782, 403.710443, 441.724083, + 492.732682, 534.722691, 552.193622, 575.112647, 586.097705, 635.224970, + 644.642944, 606.017786, 640.321218, 642.316989, 616.397020, 548.300111, + 256.957358, 318.638991, 355.063346, 389.889307, 433.607315, 468.209001, + 515.178157, 573.556591, 578.113115, 587.246475, 601.762801, 638.454644, + 656.574853, 641.184609, 676.908189, 684.198162, 678.387412, 574.805864, + 251.211502, 323.448532, 364.227424, 411.792704, 462.226488, 503.572288, + 549.299249, 599.124071, 601.227977, 597.118176, 613.247552, 633.278532, + 658.074755, 664.930719, 685.731531, 693.632845, 693.076350, 578.326477, + 267.695377, 354.273736, 389.976833, 438.518178, 493.332686, 544.343027, + 588.895829, 620.206193, 628.327410, 606.067827, 620.998532, 657.985256, + 683.936059, 691.345257, 693.894723, 695.175306, 693.618786, 578.517148, + 274.290725, 363.465288, 411.808596, 463.369805, 515.310226, 581.009306, + 613.070738, 636.638714, 647.333929, 629.867603, 644.646319, 687.796202, + 702.859596, 713.495479, 704.068069, 704.991807, 704.188594, 587.283658, + 302.538449, 389.174737, 438.518422, 493.398902, 547.662399, 601.981814, + 624.773046, 641.629484, 644.699451, 645.848784, 668.033340, 703.643523, + 707.422408, 717.329600, 726.298973, 744.127507, 745.365167, 617.954068, + 310.328188, 410.984766, 463.369805, 515.315010, 581.309832, 613.787792, + 634.988538, 654.145284, 662.632978, 668.413496, 706.494057, 750.545471, + 730.724808, 730.002100, 743.625262, 750.801609, 745.308457, 606.505800, + 329.948756, 437.600191, 493.398902, 547.661910, 601.917884, 622.557745, + 633.244395, 644.055898, 648.224221, 665.062911, 763.555733, 812.391078, + 769.063582, 744.865168, 727.579796, 724.950408, 722.179707, 598.564510, + 350.848328, 462.437458, 515.315010, 581.309823, 613.779123, 634.465309, + 652.056257, 662.179143, 671.466297, 726.881256, 819.824030, 880.232789, + 810.371672, 754.246481, 725.053473, 724.253390, 723.503395, 603.394909, + 373.704088, 492.408266, 547.661910, 601.917884, 622.557620, 633.236320, + 644.023513, 648.232514, 666.381639, 785.498283, 929.441612, 999.772800, + 890.339033, 775.852504, 731.840181, 726.905100, 725.251844, 604.899901, + 394.473422, 514.261306, 581.309823, 613.779123, 634.465309, 652.056257, + 662.179143, 671.466557, 727.134512, 835.764144, 981.747089, 1018.462934, + 939.686967, 811.276731, 739.398459, 727.365647, 725.285425, 604.923525, + 419.976505, 546.538939, 601.917884, 622.557620, 633.236320, 644.023513, + 648.232514, 666.381639, 785.545191, 932.841398, 1036.609617, 1026.945092, + 963.822765, 840.827315, 755.532423, 730.241865, 725.366847, 604.924155, + 437.281359, 580.116337, 613.779123, 634.465309, 652.056257, 662.179143, + 671.466557, 727.134512, 835.764859, 981.996194, 1031.896881, 1002.544732, + 881.157178, 828.151494, 799.340975, 751.314325, 728.316587, 605.005504, + 464.713920, 600.649281, 622.557620, 633.236320, 644.023513, 648.232514, + 666.381639, 785.545191, 932.841398, 1036.735329, 1035.037004, 995.478339, + 858.093733, 823.471976, 819.881754, 798.749289, 749.440463, 607.955244, + 495.880237, 612.473139, 634.465309, 652.056257, 662.179143, 671.466557, + 727.134512, 835.764859, 981.996194, 1032.339788, 1031.105117, 995.303259, + 857.733663, 823.435877, 822.822791, 819.873050, 796.882480, 629.038445, + 510.391280, 621.158273, 633.236320, 644.023513, 648.232514, 666.381639, + 785.545191, 932.841398, 1036.735329, 1035.566013, 1029.599350, 994.926093, + 857.645648, 823.435143, 822.904139, 822.822791, 817.965681, 673.856962, + 514.588176, 632.947715, 652.056257, 662.179143, 671.466557, 727.134512, + 835.764859, 981.996194, 1032.339788, 1031.547475, 1023.835377, 972.158629, + 851.968626, 823.347128, 822.904770, 822.904139, 820.752301, 684.418900, + 520.013294, 631.668183, 644.023513, 648.232514, 666.381639, 785.545191, + 932.841398, 1036.735329, 1035.567378, 1029.776746, 1001.044108, 880.853721, + 829.201546, 822.994150, 822.904770, 822.904770, 820.792975, 684.582020, + 531.253628, 650.479606, 662.179143, 671.466557, 727.134512, 835.764859, + 981.996194, 1032.339788, 1031.636855, 1029.601779, 995.366703, 858.086641, + 823.524524, 822.906135, 822.904770, 822.904770, 820.792975, 684.582020, + 528.531744, 642.424501, 648.232514, 666.381639, 785.545191, 932.841398, + 1036.735329, 1035.567378, 1030.219103, 1029.576226, 995.278687, 857.733663, + 823.436508, 822.904770, 822.904770, 822.904770, 820.792975, 684.582020, + 545.401164, 660.550678, 671.508859, 727.304161, 835.807162, 981.996850, + 1032.339788, 1031.636855, 1030.130788, 1029.487827, 994.925709, 857.645648, + 823.435143, 822.904770, 822.904770, 822.904770, 820.792975, 684.582020, + 537.684760, 646.650947, 669.110131, 796.487512, 935.569890, 1036.777631, + 1035.567378, 1030.219103, 1030.018584, 1023.810805, 972.158629, 851.968626, + 823.347128, 822.904770, 822.904770, 822.904770, 820.792975, 684.582020, + 552.408370, 670.001885, 738.246482, 879.690154, 992.939171, 1032.509436, + 1031.636855, 1030.132153, 1029.665223, 1001.043724, 880.853721, 829.201546, + 822.994150, 822.904770, 822.904770, 822.904770, 820.792975, 684.582020, + 539.835902, 667.496388, 799.216004, 946.512211, 1039.506123, 1035.609680, + 1030.219103, 1030.107964, 1029.577207, 995.366703, 858.086641, 823.524524, + 822.906135, 822.904770, 822.904770, 822.904770, 820.792975, 684.582020, + 558.362529, 734.277451, 877.197218, 990.478243, 1029.908393, 1028.993978, + 1027.488620, 1027.464048, 1026.933674, 992.724534, 855.532488, 821.323349, + 820.792975, 820.792975, 820.792975, 820.792975, 818.686600, 682.825198, + 453.127195, 649.075095, 780.278390, 867.165890, 862.469711, 857.067460, + 856.956321, 856.955937, 856.513579, 827.981461, 713.556496, 685.024378, + 684.582020, 684.582020, 684.582020, 684.582020, 682.825198, 569.510056, +}; + +static const double interp_dgrid_surf[65 * 18] = { + 10.650434, 12.204694, 12.040917, 11.843008, 11.845578, 12.051535, 12.103583, + 12.136780, 12.266709, 12.299107, 12.299673, 12.303120, 12.316337, 12.293431, + 12.092165, 11.602421, 11.141559, 8.864495, 12.770003, 14.634889, 14.437149, + 14.199413, 14.202487, 14.449423, 14.511827, 14.551629, 14.707410, 14.746265, + 14.747610, 14.753705, 14.762194, 14.699395, 14.390525, 13.690970, 12.874168, + 10.367121, 12.832328, 14.790730, 14.503765, 14.236403, 14.239028, 14.486600, + 14.549164, 14.589069, 14.745250, 14.784258, 14.788320, 14.801930, 14.762798, + 14.499088, 14.021544, 13.469684, 12.661560, 10.108384, 12.950520, 15.264726, + 14.621957, 14.238236, 14.239028, 14.486601, 14.549264, 14.589469, 14.745361, + 14.784949, 14.791572, 14.798652, 14.660251, 14.119394, 13.651131, 12.935657, + 12.176082, 9.228999, 12.979992, 15.382918, 14.651428, 14.238693, 14.239028, + 14.486701, 14.555710, 14.615321, 14.751849, 14.787700, 14.797104, 14.743189, + 14.475057, 13.944406, 13.450468, 12.687876, 11.824993, 8.906683, 12.980449, + 15.384750, 14.651885, 14.238700, 14.239028, 14.487102, 14.581562, 14.718998, + 14.777721, 14.788445, 14.778661, 14.582790, 14.099785, 13.649637, 12.935359, + 12.201859, 10.891931, 8.482221, 12.980449, 15.384750, 14.651886, 14.238801, + 14.239434, 14.487303, 14.588010, 14.744860, 14.784773, 14.786094, 14.735647, + 14.455704, 13.939591, 13.450393, 12.687876, 11.849334, 10.476658, 8.043672, + 12.980449, 15.384750, 14.651987, 14.245320, 14.265579, 14.493824, 14.588211, + 14.745312, 14.787263, 14.775934, 14.582036, 14.099475, 13.649563, 12.935358, + 12.201859, 10.911285, 9.730570, 6.696921, 12.980449, 15.384750, 14.652393, + 14.271466, 14.370434, 14.520069, 14.589027, 14.746028, 14.785482, 14.735605, + 14.455693, 13.939590, 13.450393, 12.687876, 11.849334, 10.494514, 9.195398, + 6.215460, 12.980449, 15.384750, 14.652494, 14.277985, 14.396679, 14.533035, + 14.615021, 14.754825, 14.775610, 14.582796, 14.099664, 13.649565, 12.935358, + 12.201859, 10.911285, 9.747361, 7.779960, 5.617541, 12.980448, 15.384731, + 14.652415, 14.278078, 14.397578, 14.559053, 14.718657, 14.776398, 14.747044, + 14.504690, 13.951810, 13.450583, 12.687876, 11.849334, 10.494514, 9.210817, + 7.210003, 5.164575, 12.980446, 15.383448, 14.647073, 14.277541, 14.403813, + 14.569546, 14.744956, 14.765103, 14.629073, 14.296161, 13.698573, 12.936118, + 12.201859, 10.911285, 9.747361, 7.790897, 6.322998, 3.931551, 12.981550, + 15.376916, 14.615597, 14.274820, 14.437479, 14.575942, 14.707492, 14.734111, + 14.515975, 14.000806, 13.462803, 12.688066, 11.849334, 10.494514, 9.210817, + 7.219566, 5.781392, 3.486081, 12.991899, 15.376201, 14.579444, 14.296898, + 14.473361, 14.522910, 14.491600, 14.543267, 14.288580, 13.700311, 12.936579, + 12.201867, 10.911285, 9.747361, 7.790897, 6.331506, 4.480348, 2.923138, + 13.019848, 15.383477, 14.582260, 14.385262, 14.452673, 14.436019, 14.238174, + 14.255993, 13.977481, 13.532342, 12.705591, 11.849605, 10.494514, 9.210817, + 7.219566, 5.789642, 4.018194, 2.766222, 13.028558, 15.315782, 14.439141, + 14.326286, 14.452429, 14.311731, 14.033235, 13.922587, 13.665868, 13.207897, + 12.274375, 10.912967, 9.747371, 7.790897, 6.331506, 4.488594, 3.454993, + 2.692682, 12.992752, 15.321471, 14.409573, 14.236340, 14.322969, 14.049072, + 13.764823, 13.479242, 13.250105, 12.759133, 12.019174, 10.532951, 9.211409, + 7.219566, 5.789642, 4.026440, 3.298077, 2.674624, 12.945493, 15.276596, + 14.315745, 14.026198, 14.085774, 13.844563, 13.447576, 12.964935, 12.735525, + 12.288592, 11.511693, 9.900227, 7.793270, 6.331506, 4.488594, 3.463236, + 3.224318, 2.672433, 12.757570, 15.056661, 14.095011, 13.722362, 13.812624, + 13.608480, 13.021206, 12.367627, 11.937931, 11.581049, 10.599552, 9.247860, + 7.220151, 5.789642, 4.026437, 3.305882, 3.191260, 2.615317, 12.581293, + 14.824658, 13.909074, 13.496158, 13.491402, 13.221550, 12.514140, 11.677229, + 10.936895, 10.619912, 9.634779, 7.763570, 6.331082, 4.488590, 3.462798, + 3.216460, 3.076315, 2.373499, 12.283499, 14.455760, 13.890593, 13.427587, + 13.183783, 12.763833, 11.861006, 10.740618, 9.820756, 9.354945, 8.669862, + 7.123268, 5.787860, 4.025994, 3.290000, 3.084410, 2.810905, 2.222916, + 12.010893, 14.300919, 13.986624, 13.484026, 13.025385, 12.224281, 11.064265, + 9.631040, 8.594396, 8.003736, 7.561587, 6.274418, 4.466637, 3.446574, + 3.102467, 2.816989, 2.598688, 1.951541, 11.581477, 13.831132, 13.632027, + 13.380414, 12.807880, 11.665651, 10.218236, 8.562237, 7.222614, 6.611808, + 6.261676, 5.402793, 3.938544, 3.174375, 2.818166, 2.602758, 2.213911, + 1.434763, 11.050735, 12.893449, 12.363152, 12.712829, 12.012961, 10.887854, + 9.109699, 7.421701, 5.965603, 5.272129, 4.991435, 4.423000, 3.369988, + 2.800371, 2.593901, 2.217431, 1.670917, 1.215265, 10.641194, 11.766277, + 10.777082, 10.972917, 10.689298, 9.701545, 7.719947, 6.145654, 4.872442, + 4.099600, 3.880934, 3.514159, 2.786474, 2.368963, 2.162376, 1.673670, + 1.450770, 1.185424, 10.071964, 11.107701, 9.172361, 8.551313, 8.412080, + 7.641397, 6.174246, 4.853916, 3.904549, 3.246810, 2.959903, 2.785066, + 2.240001, 1.793166, 1.585520, 1.449824, 1.405368, 1.168856, 9.213182, + 9.173278, 7.219231, 6.242951, 5.626013, 5.768007, 4.908666, 3.809589, + 3.115109, 2.617899, 2.274793, 2.172960, 1.838597, 1.505915, 1.414333, + 1.392666, 1.338173, 1.105611, 7.365015, 7.471370, 5.622346, 4.520127, + 3.936272, 4.208822, 3.623024, 2.977794, 2.450003, 2.097261, 1.824090, + 1.643270, 1.473525, 1.351388, 1.327504, 1.323865, 1.307894, 1.088234, + 6.198210, 6.580712, 4.682511, 3.416952, 2.941929, 2.766637, 2.650686, + 2.315439, 1.925838, 1.659784, 1.464419, 1.252806, 1.162722, 1.197518, + 1.199875, 1.197365, 1.194040, 0.995797, 5.402507, 5.055466, 3.728724, + 2.624359, 2.165810, 1.943189, 1.918190, 1.738078, 1.516328, 1.290520, + 1.155793, 1.015962, 0.881900, 0.807203, 0.754242, 0.743378, 0.740288, + 0.614158, 3.937867, 3.862507, 2.884664, 2.088147, 1.648496, 1.473584, + 1.340123, 1.291769, 1.165381, 1.000224, 0.893316, 0.821333, 0.691363, + 0.610501, 0.586766, 0.583762, 0.577840, 0.468733, 3.104660, 3.181078, + 2.420208, 1.747442, 1.297956, 1.109835, 0.970385, 0.943229, 0.876923, + 0.777584, 0.678183, 0.628623, 0.553745, 0.523430, 0.519490, 0.514394, + 0.492259, 0.403172, 2.593833, 2.533720, 2.010452, 1.480944, 1.060302, + 0.846383, 0.738703, 0.673144, 0.658010, 0.592449, 0.518236, 0.470335, + 0.425088, 0.393168, 0.378116, 0.355846, 0.275469, 0.213128, 2.176988, + 2.089575, 1.671284, 1.225008, 0.895382, 0.672008, 0.566241, 0.496746, + 0.488005, 0.449874, 0.400899, 0.354002, 0.318150, 0.281533, 0.238545, + 0.224159, 0.202399, 0.160681, 1.874679, 1.769165, 1.430124, 1.068727, + 0.780272, 0.557801, 0.441643, 0.377256, 0.352957, 0.338452, 0.304965, + 0.273172, 0.240052, 0.208724, 0.193431, 0.190845, 0.185025, 0.138166, + 1.590226, 1.502830, 1.193127, 0.917885, 0.670432, 0.474546, 0.355420, + 0.292305, 0.259035, 0.249937, 0.232079, 0.208943, 0.181936, 0.160038, + 0.152257, 0.151235, 0.149583, 0.120747, 1.331730, 1.255907, 1.012871, + 0.778422, 0.578977, 0.412432, 0.293155, 0.231824, 0.197187, 0.183921, + 0.174876, 0.157252, 0.140263, 0.127050, 0.110244, 0.105041, 0.104323, + 0.086944, 1.153994, 1.118771, 0.822355, 0.612321, 0.478249, 0.348222, + 0.247408, 0.186141, 0.152714, 0.135445, 0.129810, 0.119994, 0.115619, + 0.131626, 0.095612, 0.079343, 0.077502, 0.064550, 0.946317, 0.925894, + 0.677969, 0.499906, 0.397101, 0.297931, 0.214467, 0.152333, 0.120731, + 0.102686, 0.095062, 0.090361, 0.122319, 0.240194, 0.112687, 0.070690, + 0.070461, 0.054194, 0.824155, 0.787241, 0.581856, 0.419228, 0.313167, + 0.245582, 0.183500, 0.128101, 0.096577, 0.080267, 0.071022, 0.066851, + 0.085754, 0.154163, 0.075884, 0.052401, 0.054270, 0.026656, 0.716310, + 0.671378, 0.489580, 0.349569, 0.256155, 0.206343, 0.157853, 0.111950, + 0.079271, 0.062518, 0.053441, 0.049660, 0.051400, 0.063778, 0.039993, + 0.029133, 0.023382, 0.013725, 0.614125, 0.579096, 0.417126, 0.299465, + 0.217849, 0.165515, 0.129040, 0.093127, 0.065612, 0.049543, 0.041429, + 0.036850, 0.034416, 0.033989, 0.024216, 0.017377, 0.014833, 0.011987, + 0.520407, 0.487239, 0.349473, 0.251741, 0.184897, 0.135813, 0.107098, + 0.073607, 0.053938, 0.040531, 0.032931, 0.028876, 0.025759, 0.022168, + 0.016739, 0.014638, 0.014333, 0.011947, 0.449954, 0.415124, 0.299452, + 0.216942, 0.158874, 0.115334, 0.088821, 0.060105, 0.042610, 0.032566, + 0.026903, 0.023123, 0.019913, 0.016835, 0.014306, 0.013625, 0.013535, + 0.011284, 0.377618, 0.347773, 0.251741, 0.184839, 0.132857, 0.095439, + 0.070462, 0.052244, 0.036078, 0.026025, 0.021518, 0.018487, 0.015361, + 0.012905, 0.011470, 0.010569, 0.010283, 0.008297, 0.319953, 0.297976, + 0.216942, 0.158842, 0.113280, 0.080426, 0.057367, 0.041987, 0.030135, + 0.022295, 0.017901, 0.015121, 0.012224, 0.010035, 0.009353, 0.009108, + 0.008695, 0.006139, 0.267864, 0.250502, 0.184839, 0.132851, 0.095039, + 0.068220, 0.049135, 0.035315, 0.025144, 0.018237, 0.013857, 0.012094, + 0.009715, 0.007743, 0.006937, 0.006446, 0.006243, 0.004929, 0.230449, + 0.215895, 0.158842, 0.113280, 0.080417, 0.057174, 0.041304, 0.029959, + 0.021866, 0.015673, 0.012133, 0.010083, 0.007801, 0.006053, 0.005401, + 0.003834, 0.003429, 0.002851, 0.193984, 0.183963, 0.132851, 0.095039, + 0.068220, 0.049133, 0.035305, 0.025140, 0.018150, 0.013175, 0.010422, + 0.008491, 0.006397, 0.004567, 0.003494, 0.002933, 0.002825, 0.002355, + 0.167298, 0.158088, 0.113280, 0.080417, 0.057174, 0.041304, 0.029959, + 0.021866, 0.015669, 0.011955, 0.009257, 0.007051, 0.005543, 0.003905, + 0.002984, 0.002825, 0.002814, 0.002347, 0.143228, 0.132220, 0.095039, + 0.068220, 0.049133, 0.035305, 0.025140, 0.018150, 0.013174, 0.010394, + 0.008403, 0.006661, 0.005378, 0.003545, 0.002876, 0.002818, 0.002814, + 0.002347, 0.122934, 0.112735, 0.080417, 0.057174, 0.041304, 0.029959, + 0.021866, 0.015669, 0.011955, 0.009258, 0.007182, 0.006012, 0.003762, + 0.002866, 0.002739, 0.002788, 0.002810, 0.002347, 0.101934, 0.094569, + 0.068220, 0.049133, 0.035305, 0.025140, 0.018150, 0.013174, 0.010394, + 0.008405, 0.006797, 0.005845, 0.003333, 0.002703, 0.002695, 0.002723, + 0.002781, 0.002343, 0.086702, 0.080014, 0.057174, 0.041304, 0.029959, + 0.021866, 0.015669, 0.011955, 0.009258, 0.007190, 0.006533, 0.005839, + 0.003326, 0.002700, 0.002690, 0.002694, 0.002716, 0.002314, 0.073040, + 0.067886, 0.049133, 0.035305, 0.025140, 0.018150, 0.013174, 0.010394, + 0.008405, 0.006807, 0.006468, 0.005831, 0.003325, 0.002700, 0.002690, + 0.002690, 0.002687, 0.002253, 0.061685, 0.056890, 0.041304, 0.029959, + 0.021866, 0.015669, 0.011955, 0.009258, 0.007190, 0.006542, 0.006360, + 0.005416, 0.003221, 0.002698, 0.002690, 0.002690, 0.002683, 0.002238, + 0.052465, 0.048894, 0.035305, 0.025140, 0.018150, 0.013174, 0.010394, + 0.008405, 0.006807, 0.006472, 0.005943, 0.003748, 0.002805, 0.002692, + 0.002690, 0.002690, 0.002683, 0.002238, 0.043838, 0.041101, 0.029959, + 0.021866, 0.015669, 0.011955, 0.009258, 0.007190, 0.006543, 0.006465, + 0.005839, 0.003333, 0.002702, 0.002690, 0.002690, 0.002690, 0.002683, + 0.002238, 0.037824, 0.035133, 0.025140, 0.018150, 0.013174, 0.010394, + 0.008405, 0.006807, 0.006480, 0.006464, 0.005838, 0.003326, 0.002700, + 0.002690, 0.002690, 0.002690, 0.002683, 0.002238, 0.031865, 0.029815, + 0.021866, 0.015668, 0.011955, 0.009258, 0.007190, 0.006543, 0.006475, + 0.006462, 0.005831, 0.003325, 0.002700, 0.002690, 0.002690, 0.002690, + 0.002683, 0.002238, 0.027150, 0.025016, 0.018128, 0.013083, 0.010371, + 0.008405, 0.006807, 0.006480, 0.006472, 0.006359, 0.005416, 0.003221, + 0.002698, 0.002690, 0.002690, 0.002690, 0.002683, 0.002238, 0.023094, + 0.021760, 0.015577, 0.011590, 0.009167, 0.007188, 0.006543, 0.006475, + 0.006466, 0.005943, 0.003748, 0.002805, 0.002692, 0.002690, 0.002690, + 0.002690, 0.002683, 0.002238, 0.019269, 0.018038, 0.013060, 0.010280, + 0.008382, 0.006806, 0.006480, 0.006474, 0.006464, 0.005839, 0.003333, + 0.002702, 0.002690, 0.002690, 0.002690, 0.002690, 0.002683, 0.002238, + 0.016874, 0.015472, 0.011566, 0.009148, 0.007171, 0.006527, 0.006458, + 0.006457, 0.006447, 0.005823, 0.003318, 0.002693, 0.002683, 0.002683, + 0.002683, 0.002683, 0.002676, 0.002232, 0.011968, 0.011056, 0.008762, + 0.007219, 0.005717, 0.005391, 0.005386, 0.005386, 0.005377, 0.004856, + 0.002767, 0.002246, 0.002238, 0.002238, 0.002238, 0.002238, 0.002232, + 0.001862, +}; + +void av1_model_rd_surffit(double xm, double yl, double *rate_f, + double *dist_f) { + const double x_start = -0.5; + const double x_end = 16.5; + const double x_step = 1; + const double y_start = -15.5; + const double y_end = 16.5; + const double y_step = 0.5; + const double epsilon = 1e-6; + const int stride = (int)rint((x_end - x_start) / x_step) + 1; + (void)y_end; + + xm = AOMMAX(xm, x_start + x_step + epsilon); + xm = AOMMIN(xm, x_end - x_step - epsilon); + yl = AOMMAX(yl, y_start + y_step + epsilon); + yl = AOMMIN(yl, y_end - y_step - epsilon); + + const double y = (yl - y_start) / y_step; + const double x = (xm - x_start) / x_step; + + const int yi = (int)floor(y); + const int xi = (int)floor(x); + assert(xi > 0); + assert(yi > 0); + + const double yo = y - yi; + const double xo = x - xi; + const double *prate = &interp_rgrid_surf[(yi - 1) * stride + (xi - 1)]; + const double *pdist = &interp_dgrid_surf[(yi - 1) * stride + (xi - 1)]; + *rate_f = interp_bicubic(prate, stride, xo, yo); + *dist_f = interp_bicubic(pdist, stride, xo, yo); +} + +static const double interp_rgrid_curv[65] = { + 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, + 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, + 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 4.759876, + 8.132086, 13.651828, 21.908271, 33.522054, 48.782376, 71.530983, + 106.728649, 151.942795, 199.893011, 242.850965, 283.933923, 322.154203, + 360.684608, 394.801656, 426.879017, 460.234313, 484.103987, 508.261495, + 536.486763, 558.196737, 586.285894, 614.764511, 634.166333, 647.706472, + 658.211478, 681.360407, 701.052141, 727.007310, 768.663973, 804.407660, + 884.627751, 1065.658131, 1238.875214, 1440.185176, 1678.377931, 1962.243390, + 2300.571467, 2702.152072, 3175.775119, 3730.230519, 4374.308184, 5116.798028, + 5966.489961, 6932.173897, 8022.639747, 9246.677424, 10613.076839, +}; + +static const double interp_dgrid_curv[65] = { + 14.604855, 14.604855, 14.604855, 14.604855, 14.604855, 14.604855, 14.604855, + 14.604855, 14.604855, 14.604855, 14.604855, 14.604855, 14.555776, 14.533692, + 14.439920, 14.257791, 13.977230, 13.623229, 13.064884, 12.355411, 11.560773, + 10.728960, 9.861975, 8.643612, 6.916021, 5.154769, 3.734940, 2.680051, + 1.925506, 1.408410, 1.042223, 0.767641, 0.565392, 0.420116, 0.310427, + 0.231711, 0.172999, 0.128293, 0.094992, 0.072171, 0.052972, 0.039354, + 0.029555, 0.022857, 0.016832, 0.013297, 0.000000, 0.000000, 0.000000, + 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, + 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, + 0.000000, 0.000000, +}; + +void av1_model_rd_curvfit(double xqr, double *rate_f, double *distbysse_f) { + const double x_start = -15.5; + const double x_end = 16.5; + const double x_step = 0.5; + const double epsilon = 1e-6; + (void)x_end; + + xqr = AOMMAX(xqr, x_start + x_step + epsilon); + xqr = AOMMIN(xqr, x_end - x_step - epsilon); + const double x = (xqr - x_start) / x_step; + const int xi = (int)floor(x); + const double xo = x - xi; + + assert(xi > 0); + + const double *prate = &interp_rgrid_curv[(xi - 1)]; + const double *pdist = &interp_dgrid_curv[(xi - 1)]; + *rate_f = interp_cubic(prate, xo); + *distbysse_f = interp_cubic(pdist, xo); +} + static void get_entropy_contexts_plane(BLOCK_SIZE plane_bsize, const struct macroblockd_plane *pd, ENTROPY_CONTEXT t_above[MAX_MIB_SIZE], @@ -814,8 +1281,6 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) { rd->thresh_mult[THR_NEARESTG] = 0; } - rd->thresh_mult[THR_DC] += 1000; - rd->thresh_mult[THR_NEWMV] += 1000; rd->thresh_mult[THR_NEWL2] += 1000; rd->thresh_mult[THR_NEWL3] += 1000; @@ -840,8 +1305,6 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) { rd->thresh_mult[THR_GLOBALG] += 2000; rd->thresh_mult[THR_GLOBALA] += 2000; - rd->thresh_mult[THR_PAETH] += 1000; - rd->thresh_mult[THR_COMP_NEAREST_NEARESTLA] += 1000; rd->thresh_mult[THR_COMP_NEAREST_NEARESTL2A] += 1000; rd->thresh_mult[THR_COMP_NEAREST_NEARESTL3A] += 1000; @@ -956,15 +1419,6 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) { rd->thresh_mult[THR_COMP_NEW_NEWGA2] += 2000; rd->thresh_mult[THR_COMP_GLOBAL_GLOBALGA2] += 2500; - rd->thresh_mult[THR_H_PRED] += 2000; - rd->thresh_mult[THR_V_PRED] += 2000; - rd->thresh_mult[THR_D135_PRED] += 2500; - rd->thresh_mult[THR_D203_PRED] += 2500; - rd->thresh_mult[THR_D157_PRED] += 2500; - rd->thresh_mult[THR_D67_PRED] += 2500; - rd->thresh_mult[THR_D113_PRED] += 2500; - rd->thresh_mult[THR_D45_PRED] += 2500; - rd->thresh_mult[THR_COMP_NEAR_NEARLL2] += 1600; rd->thresh_mult[THR_COMP_NEAREST_NEWLL2] += 2000; rd->thresh_mult[THR_COMP_NEW_NEARESTLL2] += 2000; @@ -996,6 +1450,20 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) { rd->thresh_mult[THR_COMP_NEW_NEARBA] += 2200; rd->thresh_mult[THR_COMP_NEW_NEWBA] += 2400; rd->thresh_mult[THR_COMP_GLOBAL_GLOBALBA] += 3200; + + rd->thresh_mult[THR_DC] += 1000; + rd->thresh_mult[THR_PAETH] += 1000; + rd->thresh_mult[THR_SMOOTH] += 2000; + rd->thresh_mult[THR_SMOOTH_V] += 2000; + rd->thresh_mult[THR_SMOOTH_H] += 2000; + rd->thresh_mult[THR_H_PRED] += 2000; + rd->thresh_mult[THR_V_PRED] += 2000; + rd->thresh_mult[THR_D135_PRED] += 2500; + rd->thresh_mult[THR_D203_PRED] += 2500; + rd->thresh_mult[THR_D157_PRED] += 2500; + rd->thresh_mult[THR_D67_PRED] += 2500; + rd->thresh_mult[THR_D113_PRED] += 2500; + rd->thresh_mult[THR_D45_PRED] += 2500; } void av1_set_rd_speed_thresholds_sub8x8(AV1_COMP *cpi) { diff --git a/third_party/aom/av1/encoder/rd.h b/third_party/aom/av1/encoder/rd.h index 692367d7a..755b61df5 100644 --- a/third_party/aom/av1/encoder/rd.h +++ b/third_party/aom/av1/encoder/rd.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_ENCODER_RD_H_ -#define AV1_ENCODER_RD_H_ +#ifndef AOM_AV1_ENCODER_RD_H_ +#define AOM_AV1_ENCODER_RD_H_ #include <limits.h> @@ -57,8 +57,6 @@ typedef enum { THR_NEARESTA, THR_NEARESTG, - THR_DC, - THR_NEWMV, THR_NEWL2, THR_NEWL3, @@ -100,12 +98,6 @@ typedef enum { THR_COMP_NEAREST_NEARESTLG, THR_COMP_NEAREST_NEARESTBA, - THR_PAETH, - - THR_SMOOTH, - THR_SMOOTH_V, - THR_SMOOTH_H, - THR_COMP_NEAR_NEARLA, THR_COMP_NEW_NEARESTLA, THR_COMP_NEAREST_NEWLA, @@ -202,15 +194,6 @@ typedef enum { THR_COMP_NEW_NEWGA2, THR_COMP_GLOBAL_GLOBALGA2, - THR_H_PRED, - THR_V_PRED, - THR_D135_PRED, - THR_D203_PRED, - THR_D157_PRED, - THR_D67_PRED, - THR_D113_PRED, - THR_D45_PRED, - THR_COMP_NEAR_NEARLL2, THR_COMP_NEW_NEARESTLL2, THR_COMP_NEAREST_NEWLL2, @@ -243,7 +226,26 @@ typedef enum { THR_COMP_NEW_NEWBA, THR_COMP_GLOBAL_GLOBALBA, - MAX_MODES + THR_DC, + THR_PAETH, + THR_SMOOTH, + THR_SMOOTH_V, + THR_SMOOTH_H, + THR_H_PRED, + THR_V_PRED, + THR_D135_PRED, + THR_D203_PRED, + THR_D157_PRED, + THR_D67_PRED, + THR_D113_PRED, + THR_D45_PRED, + + MAX_MODES, + + LAST_SINGLE_REF_MODES = THR_GLOBALG, + MAX_SINGLE_REF_MODES = LAST_SINGLE_REF_MODES + 1, + LAST_COMP_REF_MODES = THR_COMP_GLOBAL_GLOBALBA, + MAX_COMP_REF_MODES = LAST_COMP_REF_MODES + 1 } THR_MODES; typedef enum { @@ -392,6 +394,10 @@ void av1_initialize_me_consts(const struct AV1_COMP *cpi, MACROBLOCK *x, void av1_model_rd_from_var_lapndz(int64_t var, unsigned int n, unsigned int qstep, int *rate, int64_t *dist); +void av1_model_rd_curvfit(double xqr, double *rate_f, double *distbysse_f); +void av1_model_rd_surffit(double xm, double yl, double *rate_f, + double *distbysse_f); + int av1_get_switchable_rate(const AV1_COMMON *const cm, MACROBLOCK *x, const MACROBLOCKD *xd); @@ -455,4 +461,4 @@ void av1_fill_coeff_costs(MACROBLOCK *x, FRAME_CONTEXT *fc, } // extern "C" #endif -#endif // AV1_ENCODER_RD_H_ +#endif // AOM_AV1_ENCODER_RD_H_ diff --git a/third_party/aom/av1/encoder/rdopt.c b/third_party/aom/av1/encoder/rdopt.c index fef6d2875..c2d15534f 100644 --- a/third_party/aom/av1/encoder/rdopt.c +++ b/third_party/aom/av1/encoder/rdopt.c @@ -55,17 +55,97 @@ #include "av1/encoder/ratectrl.h" #include "av1/encoder/rd.h" #include "av1/encoder/rdopt.h" +#include "av1/encoder/reconinter_enc.h" #include "av1/encoder/tokenize.h" #include "av1/encoder/tx_prune_model_weights.h" -#define DNN_BASED_RD_INTERP_FILTER 0 +typedef void (*model_rd_for_sb_type)( + const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd, + int plane_from, int plane_to, int mi_row, int mi_col, int *out_rate_sum, + int64_t *out_dist_sum, int *skip_txfm_sb, int64_t *skip_sse_sb, + int *plane_rate, int64_t *plane_sse, int64_t *plane_dist); +typedef void (*model_rd_from_sse_type)(const AV1_COMP *const cpi, + const MACROBLOCK *const x, + BLOCK_SIZE plane_bsize, int plane, + int64_t sse, int num_samples, int *rate, + int64_t *dist); -// Set this macro as 1 to collect data about tx size selection. -#define COLLECT_TX_SIZE_DATA 0 +static void model_rd_for_sb(const AV1_COMP *const cpi, BLOCK_SIZE bsize, + MACROBLOCK *x, MACROBLOCKD *xd, int plane_from, + int plane_to, int mi_row, int mi_col, + int *out_rate_sum, int64_t *out_dist_sum, + int *skip_txfm_sb, int64_t *skip_sse_sb, + int *plane_rate, int64_t *plane_sse, + int64_t *plane_dist); +static void model_rd_for_sb_with_curvfit( + const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd, + int plane_from, int plane_to, int mi_row, int mi_col, int *out_rate_sum, + int64_t *out_dist_sum, int *skip_txfm_sb, int64_t *skip_sse_sb, + int *plane_rate, int64_t *plane_sse, int64_t *plane_dist); +static void model_rd_for_sb_with_surffit( + const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd, + int plane_from, int plane_to, int mi_row, int mi_col, int *out_rate_sum, + int64_t *out_dist_sum, int *skip_txfm_sb, int64_t *skip_sse_sb, + int *plane_rate, int64_t *plane_sse, int64_t *plane_dist); +static void model_rd_for_sb_with_dnn( + const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd, + int plane_from, int plane_to, int mi_row, int mi_col, int *out_rate_sum, + int64_t *out_dist_sum, int *skip_txfm_sb, int64_t *skip_sse_sb, + int *plane_rate, int64_t *plane_sse, int64_t *plane_dist); +static void model_rd_for_sb_with_fullrdy( + const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd, + int plane_from, int plane_to, int mi_row, int mi_col, int *out_rate_sum, + int64_t *out_dist_sum, int *skip_txfm_sb, int64_t *skip_sse_sb, + int *plane_rate, int64_t *plane_sse, int64_t *plane_dist); +static void model_rd_from_sse(const AV1_COMP *const cpi, + const MACROBLOCK *const x, BLOCK_SIZE plane_bsize, + int plane, int64_t sse, int num_samples, + int *rate, int64_t *dist); +static void model_rd_with_dnn(const AV1_COMP *const cpi, + const MACROBLOCK *const x, BLOCK_SIZE plane_bsize, + int plane, int64_t sse, int num_samples, + int *rate, int64_t *dist); +static void model_rd_with_curvfit(const AV1_COMP *const cpi, + const MACROBLOCK *const x, + BLOCK_SIZE plane_bsize, int plane, + int64_t sse, int num_samples, int *rate, + int64_t *dist); +static void model_rd_with_surffit(const AV1_COMP *const cpi, + const MACROBLOCK *const x, + BLOCK_SIZE plane_bsize, int plane, + int64_t sse, int num_samples, int *rate, + int64_t *dist); -#if COLLECT_TX_SIZE_DATA -static const char av1_tx_size_data_output_file[] = "tx_size_data.txt"; -#endif +typedef enum { + MODELRD_LEGACY, + MODELRD_CURVFIT, + MODELRD_SUFFIT, + MODELRD_DNN, + MODELRD_FULLRDY, + MODELRD_TYPES +} ModelRdType; + +static model_rd_for_sb_type model_rd_sb_fn[MODELRD_TYPES] = { + model_rd_for_sb, model_rd_for_sb_with_curvfit, model_rd_for_sb_with_surffit, + model_rd_for_sb_with_dnn, model_rd_for_sb_with_fullrdy +}; + +static model_rd_from_sse_type model_rd_sse_fn[MODELRD_TYPES] = { + model_rd_from_sse, model_rd_with_curvfit, model_rd_with_surffit, + model_rd_with_dnn, NULL +}; + +// 0: Legacy model +// 1: Curve fit model +// 2: Surface fit model +// 3: DNN regression model +// 4: Full rd model +#define MODELRD_TYPE_INTERP_FILTER 1 +#define MODELRD_TYPE_TX_SEARCH_PRUNE 2 +#define MODELRD_TYPE_MASKED_COMPOUND 1 +#define MODELRD_TYPE_INTERINTRA 1 +#define MODELRD_TYPE_INTRA 1 +#define MODELRD_TYPE_JNT_COMPOUND 1 #define DUAL_FILTER_SET_SIZE (SWITCHABLE_FILTERS * SWITCHABLE_FILTERS) static const InterpFilters filter_sets[DUAL_FILTER_SET_SIZE] = { @@ -103,6 +183,16 @@ typedef enum { FTXS_USE_TRANSFORM_DOMAIN = 1 << 2 } FAST_TX_SEARCH_MODE; +static void select_tx_type_yrd(const AV1_COMP *cpi, MACROBLOCK *x, + RD_STATS *rd_stats, BLOCK_SIZE bsize, int mi_row, + int mi_col, int64_t ref_best_rd); + +static int inter_block_uvrd(const AV1_COMP *cpi, MACROBLOCK *x, + RD_STATS *rd_stats, BLOCK_SIZE bsize, + int64_t non_skip_ref_best_rd, + int64_t skip_ref_best_rd, + FAST_TX_SEARCH_MODE ftxs_mode); + struct rdcost_block_args { const AV1_COMP *cpi; MACROBLOCK *x; @@ -112,6 +202,7 @@ struct rdcost_block_args { int64_t this_rd; int64_t best_rd; int exit_early; + int incomplete_exit; int use_fast_coef_costing; FAST_TX_SEARCH_MODE ftxs_mode; }; @@ -126,8 +217,6 @@ static const MODE_DEFINITION av1_mode_order[MAX_MODES] = { { NEARESTMV, { ALTREF_FRAME, NONE_FRAME } }, { NEARESTMV, { GOLDEN_FRAME, NONE_FRAME } }, - { DC_PRED, { INTRA_FRAME, NONE_FRAME } }, - { NEWMV, { LAST_FRAME, NONE_FRAME } }, { NEWMV, { LAST2_FRAME, NONE_FRAME } }, { NEWMV, { LAST3_FRAME, NONE_FRAME } }, @@ -172,12 +261,6 @@ static const MODE_DEFINITION av1_mode_order[MAX_MODES] = { { NEAREST_NEARESTMV, { LAST_FRAME, GOLDEN_FRAME } }, { NEAREST_NEARESTMV, { BWDREF_FRAME, ALTREF_FRAME } }, - { PAETH_PRED, { INTRA_FRAME, NONE_FRAME } }, - - { SMOOTH_PRED, { INTRA_FRAME, NONE_FRAME } }, - { SMOOTH_V_PRED, { INTRA_FRAME, NONE_FRAME } }, - { SMOOTH_H_PRED, { INTRA_FRAME, NONE_FRAME } }, - { NEAR_NEARMV, { LAST_FRAME, ALTREF_FRAME } }, { NEW_NEARESTMV, { LAST_FRAME, ALTREF_FRAME } }, { NEAREST_NEWMV, { LAST_FRAME, ALTREF_FRAME } }, @@ -274,15 +357,6 @@ static const MODE_DEFINITION av1_mode_order[MAX_MODES] = { { NEW_NEWMV, { GOLDEN_FRAME, ALTREF2_FRAME } }, { GLOBAL_GLOBALMV, { GOLDEN_FRAME, ALTREF2_FRAME } }, - { H_PRED, { INTRA_FRAME, NONE_FRAME } }, - { V_PRED, { INTRA_FRAME, NONE_FRAME } }, - { D135_PRED, { INTRA_FRAME, NONE_FRAME } }, - { D203_PRED, { INTRA_FRAME, NONE_FRAME } }, - { D157_PRED, { INTRA_FRAME, NONE_FRAME } }, - { D67_PRED, { INTRA_FRAME, NONE_FRAME } }, - { D113_PRED, { INTRA_FRAME, NONE_FRAME } }, - { D45_PRED, { INTRA_FRAME, NONE_FRAME } }, - { NEAR_NEARMV, { LAST_FRAME, LAST2_FRAME } }, { NEW_NEARESTMV, { LAST_FRAME, LAST2_FRAME } }, { NEAREST_NEWMV, { LAST_FRAME, LAST2_FRAME } }, @@ -314,6 +388,21 @@ static const MODE_DEFINITION av1_mode_order[MAX_MODES] = { { NEAR_NEWMV, { BWDREF_FRAME, ALTREF_FRAME } }, { NEW_NEWMV, { BWDREF_FRAME, ALTREF_FRAME } }, { GLOBAL_GLOBALMV, { BWDREF_FRAME, ALTREF_FRAME } }, + + // intra modes + { DC_PRED, { INTRA_FRAME, NONE_FRAME } }, + { PAETH_PRED, { INTRA_FRAME, NONE_FRAME } }, + { SMOOTH_PRED, { INTRA_FRAME, NONE_FRAME } }, + { SMOOTH_V_PRED, { INTRA_FRAME, NONE_FRAME } }, + { SMOOTH_H_PRED, { INTRA_FRAME, NONE_FRAME } }, + { H_PRED, { INTRA_FRAME, NONE_FRAME } }, + { V_PRED, { INTRA_FRAME, NONE_FRAME } }, + { D135_PRED, { INTRA_FRAME, NONE_FRAME } }, + { D203_PRED, { INTRA_FRAME, NONE_FRAME } }, + { D157_PRED, { INTRA_FRAME, NONE_FRAME } }, + { D67_PRED, { INTRA_FRAME, NONE_FRAME } }, + { D113_PRED, { INTRA_FRAME, NONE_FRAME } }, + { D45_PRED, { INTRA_FRAME, NONE_FRAME } }, }; static const int16_t intra_to_mode_idx[INTRA_MODE_NUM] = { @@ -451,7 +540,6 @@ static int get_prediction_mode_idx(PREDICTION_MODE this_mode, if (this_mode >= SINGLE_INTER_MODE_START && this_mode < SINGLE_INTER_MODE_END) { assert((ref_frame > INTRA_FRAME) && (ref_frame <= ALTREF_FRAME)); - assert(second_ref_frame == NONE_FRAME); return single_inter_to_mode_idx[this_mode - SINGLE_INTER_MODE_START] [ref_frame]; } @@ -479,6 +567,12 @@ static const UV_PREDICTION_MODE uv_rd_search_mode_order[UV_INTRA_MODES] = { UV_D113_PRED, UV_D45_PRED, }; +typedef struct SingleInterModeState { + int64_t rd; + MV_REFERENCE_FRAME ref_frame; + int valid; +} SingleInterModeState; + typedef struct InterModeSearchState { int64_t best_rd; MB_MODE_INFO best_mbmode; @@ -510,32 +604,21 @@ typedef struct InterModeSearchState { int_mv single_newmv[MAX_REF_MV_SERCH][REF_FRAMES]; int single_newmv_rate[MAX_REF_MV_SERCH][REF_FRAMES]; int single_newmv_valid[MAX_REF_MV_SERCH][REF_FRAMES]; - int64_t modelled_rd[MB_MODE_COUNT][REF_FRAMES]; + int64_t modelled_rd[MB_MODE_COUNT][MAX_REF_MV_SERCH][REF_FRAMES]; + // The rd of simple translation in single inter modes + int64_t simple_rd[MB_MODE_COUNT][MAX_REF_MV_SERCH][REF_FRAMES]; + + // Single search results by [directions][modes][reference frames] + SingleInterModeState single_state[2][SINGLE_INTER_MODE_NUM][FWD_REFS]; + int single_state_cnt[2][SINGLE_INTER_MODE_NUM]; + SingleInterModeState single_state_modelled[2][SINGLE_INTER_MODE_NUM] + [FWD_REFS]; + int single_state_modelled_cnt[2][SINGLE_INTER_MODE_NUM]; + + MV_REFERENCE_FRAME single_rd_order[2][SINGLE_INTER_MODE_NUM][FWD_REFS]; } InterModeSearchState; #if CONFIG_COLLECT_INTER_MODE_RD_STATS - -typedef struct InterModeRdModel { - int ready; - double a; - double b; - double dist_mean; - int skip_count; - int non_skip_count; - int fp_skip_count; - int bracket_idx; -} InterModeRdModel; - -InterModeRdModel inter_mode_rd_models[BLOCK_SIZES_ALL]; - -#define INTER_MODE_RD_DATA_OVERALL_SIZE 6400 -static int inter_mode_data_idx[4]; -static int64_t inter_mode_data_sse[4][INTER_MODE_RD_DATA_OVERALL_SIZE]; -static int64_t inter_mode_data_dist[4][INTER_MODE_RD_DATA_OVERALL_SIZE]; -static int inter_mode_data_residue_cost[4][INTER_MODE_RD_DATA_OVERALL_SIZE]; -static int inter_mode_data_all_cost[4][INTER_MODE_RD_DATA_OVERALL_SIZE]; -static int64_t inter_mode_data_ref_best_rd[4][INTER_MODE_RD_DATA_OVERALL_SIZE]; - int inter_mode_data_block_idx(BLOCK_SIZE bsize) { if (bsize == BLOCK_8X8) return 1; if (bsize == BLOCK_16X16) return 2; @@ -543,137 +626,152 @@ int inter_mode_data_block_idx(BLOCK_SIZE bsize) { return -1; } -void av1_inter_mode_data_init() { +void av1_inter_mode_data_init(TileDataEnc *tile_data) { for (int i = 0; i < BLOCK_SIZES_ALL; ++i) { - const int block_idx = inter_mode_data_block_idx(i); - if (block_idx != -1) inter_mode_data_idx[block_idx] = 0; - InterModeRdModel *md = &inter_mode_rd_models[i]; + InterModeRdModel *md = &tile_data->inter_mode_rd_models[i]; md->ready = 0; - md->skip_count = 0; - md->non_skip_count = 0; - md->fp_skip_count = 0; - md->bracket_idx = 0; + md->num = 0; + md->dist_sum = 0; + md->ld_sum = 0; + md->sse_sum = 0; + md->sse_sse_sum = 0; + md->sse_ld_sum = 0; } } -void av1_inter_mode_data_show(const AV1_COMMON *cm) { - printf("frame_offset %d\n", cm->frame_offset); - for (int i = 0; i < BLOCK_SIZES_ALL; ++i) { - const int block_idx = inter_mode_data_block_idx(i); - if (block_idx != -1) inter_mode_data_idx[block_idx] = 0; - InterModeRdModel *md = &inter_mode_rd_models[i]; - if (md->ready) { - printf("bsize %d non_skip_count %d skip_count %d fp_skip_count %d\n", i, - md->non_skip_count, md->skip_count, md->fp_skip_count); +static int get_est_rate_dist(TileDataEnc *tile_data, BLOCK_SIZE bsize, + int64_t sse, int *est_residue_cost, + int64_t *est_dist) { + aom_clear_system_state(); + const InterModeRdModel *md = &tile_data->inter_mode_rd_models[bsize]; + if (md->ready) { + const double est_ld = md->a * sse + md->b; + if (sse < md->dist_mean) { + *est_residue_cost = 0; + *est_dist = sse; + } else { + *est_residue_cost = (int)round((sse - md->dist_mean) / est_ld); + *est_dist = (int64_t)round(md->dist_mean); } + return 1; } + return 0; } -static int64_t get_est_rd(BLOCK_SIZE bsize, int rdmult, int64_t sse, - int curr_cost) { - aom_clear_system_state(); - InterModeRdModel *md = &inter_mode_rd_models[bsize]; - if (md->ready) { - const double est_ld = md->a * sse + md->b; - const double est_residue_cost = (sse - md->dist_mean) / est_ld; - const int64_t est_cost = (int64_t)round(est_residue_cost) + curr_cost; - const int64_t int64_dist_mean = (int64_t)round(md->dist_mean); - const int64_t est_rd = RDCOST(rdmult, est_cost, int64_dist_mean); +static int64_t get_est_rd(TileDataEnc *tile_data, BLOCK_SIZE bsize, int rdmult, + int64_t sse, int curr_cost) { + int est_residue_cost; + int64_t est_dist; + if (get_est_rate_dist(tile_data, bsize, sse, &est_residue_cost, &est_dist)) { + int rate = est_residue_cost + curr_cost; + int64_t est_rd = RDCOST(rdmult, rate, est_dist); return est_rd; } return 0; } -#define DATA_BRACKETS 7 -static const int data_num_threshold[DATA_BRACKETS] = { - 200, 400, 800, 1600, 3200, 6400, INT32_MAX -}; - -void av1_inter_mode_data_fit(int rdmult) { +void av1_inter_mode_data_fit(TileDataEnc *tile_data, int rdmult) { aom_clear_system_state(); for (int bsize = 0; bsize < BLOCK_SIZES_ALL; ++bsize) { const int block_idx = inter_mode_data_block_idx(bsize); - InterModeRdModel *md = &inter_mode_rd_models[bsize]; + InterModeRdModel *md = &tile_data->inter_mode_rd_models[bsize]; if (block_idx == -1) continue; - int data_num = inter_mode_data_idx[block_idx]; - if (data_num < data_num_threshold[md->bracket_idx]) { + if ((md->ready == 0 && md->num < 200) || (md->ready == 1 && md->num < 64)) { continue; + } else { + if (md->ready == 0) { + md->dist_mean = md->dist_sum / md->num; + md->ld_mean = md->ld_sum / md->num; + md->sse_mean = md->sse_sum / md->num; + md->sse_sse_mean = md->sse_sse_sum / md->num; + md->sse_ld_mean = md->sse_ld_sum / md->num; + } else { + const double factor = 3; + md->dist_mean = + (md->dist_mean * factor + (md->dist_sum / md->num)) / (factor + 1); + md->ld_mean = + (md->ld_mean * factor + (md->ld_sum / md->num)) / (factor + 1); + md->sse_mean = + (md->sse_mean * factor + (md->sse_sum / md->num)) / (factor + 1); + md->sse_sse_mean = + (md->sse_sse_mean * factor + (md->sse_sse_sum / md->num)) / + (factor + 1); + md->sse_ld_mean = + (md->sse_ld_mean * factor + (md->sse_ld_sum / md->num)) / + (factor + 1); + } + + const double my = md->ld_mean; + const double mx = md->sse_mean; + const double dx = sqrt(md->sse_sse_mean); + const double dxy = md->sse_ld_mean; + + md->a = (dxy - mx * my) / (dx * dx - mx * mx); + md->b = my - md->a * mx; + md->ready = 1; + + md->num = 0; + md->dist_sum = 0; + md->ld_sum = 0; + md->sse_sum = 0; + md->sse_sse_sum = 0; + md->sse_ld_sum = 0; } - double my = 0; - double mx = 0; - double dx = 0; - double dxy = 0; - double dist_mean = 0; - const int train_num = data_num; - for (int i = 0; i < train_num; ++i) { - const double sse = (double)inter_mode_data_sse[block_idx][i]; - const double dist = (double)inter_mode_data_dist[block_idx][i]; - const double residue_cost = inter_mode_data_residue_cost[block_idx][i]; - const double ld = (sse - dist) / residue_cost; - dist_mean += dist; - my += ld; - mx += sse; - dx += sse * sse; - dxy += sse * ld; - } - dist_mean = dist_mean / data_num; - my = my / train_num; - mx = mx / train_num; - dx = sqrt(dx / train_num); - dxy = dxy / train_num; - - md->dist_mean = dist_mean; - md->a = (dxy - mx * my) / (dx * dx - mx * mx); - md->b = my - md->a * mx; - ++md->bracket_idx; - md->ready = 1; - assert(md->bracket_idx < DATA_BRACKETS); - (void)rdmult; -#if 0 - int skip_count = 0; - int fp_skip_count = 0; - double avg_error = 0; - const int test_num = data_num; - for (int i = 0; i < data_num; ++i) { - const int64_t sse = inter_mode_data_sse[block_idx][i]; - const int64_t dist = inter_mode_data_dist[block_idx][i]; - const int64_t residue_cost = inter_mode_data_residue_cost[block_idx][i]; - const int64_t all_cost = inter_mode_data_all_cost[block_idx][i]; - const int64_t est_rd = - get_est_rd(bsize, rdmult, sse, all_cost - residue_cost); - const int64_t real_rd = RDCOST(rdmult, all_cost, dist); - const int64_t ref_best_rd = inter_mode_data_ref_best_rd[block_idx][i]; - if (est_rd > ref_best_rd) { - ++skip_count; - if (real_rd < ref_best_rd) { - ++fp_skip_count; - } - } - avg_error += abs(est_rd - real_rd) * 100. / real_rd; - } - avg_error /= test_num; - printf("test_num %d bsize %d avg_error %f skip_count %d fp_skip_count %d\n", - test_num, bsize, avg_error, skip_count, fp_skip_count); -#endif } } -static void inter_mode_data_push(BLOCK_SIZE bsize, int64_t sse, int64_t dist, - int residue_cost, int all_cost, - int64_t ref_best_rd) { +static void inter_mode_data_push(TileDataEnc *tile_data, BLOCK_SIZE bsize, + int64_t sse, int64_t dist, int residue_cost) { if (residue_cost == 0 || sse == dist) return; const int block_idx = inter_mode_data_block_idx(bsize); if (block_idx == -1) return; - if (inter_mode_data_idx[block_idx] < INTER_MODE_RD_DATA_OVERALL_SIZE) { - const int data_idx = inter_mode_data_idx[block_idx]; - inter_mode_data_sse[block_idx][data_idx] = sse; - inter_mode_data_dist[block_idx][data_idx] = dist; - inter_mode_data_residue_cost[block_idx][data_idx] = residue_cost; - inter_mode_data_all_cost[block_idx][data_idx] = all_cost; - inter_mode_data_ref_best_rd[block_idx][data_idx] = ref_best_rd; - ++inter_mode_data_idx[block_idx]; + InterModeRdModel *rd_model = &tile_data->inter_mode_rd_models[bsize]; + if (rd_model->num < INTER_MODE_RD_DATA_OVERALL_SIZE) { + aom_clear_system_state(); + const double ld = (sse - dist) * 1. / residue_cost; + ++rd_model->num; + rd_model->dist_sum += dist; + rd_model->ld_sum += ld; + rd_model->sse_sum += sse; + rd_model->sse_sse_sum += sse * sse; + rd_model->sse_ld_sum += sse * ld; + } +} + +static void inter_modes_info_push(InterModesInfo *inter_modes_info, + int mode_rate, int64_t sse, int64_t est_rd, + const MB_MODE_INFO *mbmi) { + const int num = inter_modes_info->num; + assert(num < MAX_INTER_MODES); + inter_modes_info->mbmi_arr[num] = *mbmi; + inter_modes_info->mode_rate_arr[num] = mode_rate; + inter_modes_info->sse_arr[num] = sse; + inter_modes_info->est_rd_arr[num] = est_rd; + ++inter_modes_info->num; +} + +static int compare_rd_idx_pair(const void *a, const void *b) { + if (((RdIdxPair *)a)->rd == ((RdIdxPair *)b)->rd) { + return 0; + } else if (((const RdIdxPair *)a)->rd > ((const RdIdxPair *)b)->rd) { + return 1; + } else { + return -1; + } +} + +static void inter_modes_info_sort(const InterModesInfo *inter_modes_info, + RdIdxPair *rd_idx_pair_arr) { + if (inter_modes_info->num == 0) { + return; + } + for (int i = 0; i < inter_modes_info->num; ++i) { + rd_idx_pair_arr[i].idx = i; + rd_idx_pair_arr[i].rd = inter_modes_info->est_rd_arr[i]; } + qsort(rd_idx_pair_arr, inter_modes_info->num, sizeof(rd_idx_pair_arr[0]), + compare_rd_idx_pair); } #endif // CONFIG_COLLECT_INTER_MODE_RD_STATS @@ -1528,13 +1626,13 @@ static void score_2D_transform_pow8(float *scores_2D, float shift) { // will lead to pruning i+1 TX types on average static const float *prune_2D_adaptive_thresholds[] = { // TX_4X4 - (float[]){ 0.02014f, 0.02722f, 0.03430f, 0.04114f, 0.04724f, 0.05212f, - 0.05627f, 0.06018f, 0.06409f, 0.06824f, 0.07312f, 0.07849f, - 0.08606f, 0.09827f }, + (float[]){ 0.00549f, 0.01306f, 0.02039f, 0.02747f, 0.03406f, 0.04065f, + 0.04724f, 0.05383f, 0.06067f, 0.06799f, 0.07605f, 0.08533f, + 0.09778f, 0.11780f }, // TX_8X8 - (float[]){ 0.00745f, 0.01355f, 0.02039f, 0.02795f, 0.03625f, 0.04407f, - 0.05042f, 0.05579f, 0.06067f, 0.06604f, 0.07239f, 0.08093f, - 0.09363f, 0.11682f }, + (float[]){ 0.00037f, 0.00183f, 0.00525f, 0.01038f, 0.01697f, 0.02502f, + 0.03381f, 0.04333f, 0.05286f, 0.06287f, 0.07434f, 0.08850f, + 0.10803f, 0.14124f }, // TX_16X16 (float[]){ 0.01404f, 0.02820f, 0.04211f, 0.05164f, 0.05798f, 0.06335f, 0.06897f, 0.07629f, 0.08875f, 0.11169f }, @@ -1543,35 +1641,37 @@ static const float *prune_2D_adaptive_thresholds[] = { // TX_64X64 NULL, // TX_4X8 - (float[]){ 0.01282f, 0.02087f, 0.02844f, 0.03601f, 0.04285f, 0.04871f, - 0.05359f, 0.05823f, 0.06287f, 0.06799f, 0.07361f, 0.08093f, - 0.09119f, 0.10828f }, + (float[]){ 0.00183f, 0.00745f, 0.01428f, 0.02185f, 0.02966f, 0.03723f, + 0.04456f, 0.05188f, 0.05920f, 0.06702f, 0.07605f, 0.08704f, + 0.10168f, 0.12585f }, // TX_8X4 - (float[]){ 0.01184f, 0.01941f, 0.02722f, 0.03503f, 0.04187f, 0.04822f, - 0.05359f, 0.05823f, 0.06287f, 0.06799f, 0.07361f, 0.08093f, - 0.09167f, 0.10974f }, + (float[]){ 0.00085f, 0.00476f, 0.01135f, 0.01892f, 0.02698f, 0.03528f, + 0.04358f, 0.05164f, 0.05994f, 0.06848f, 0.07849f, 0.09021f, + 0.10583f, 0.13123f }, // TX_8X16 - (float[]){ 0.00525f, 0.01135f, 0.01819f, 0.02576f, 0.03357f, 0.04114f, - 0.04773f, 0.05383f, 0.05920f, 0.06506f, 0.07190f, 0.08118f, - 0.09509f, 0.12097f }, + (float[]){ 0.00037f, 0.00232f, 0.00671f, 0.01257f, 0.01965f, 0.02722f, + 0.03552f, 0.04382f, 0.05237f, 0.06189f, 0.07336f, 0.08728f, + 0.10730f, 0.14221f }, // TX_16X8 - (float[]){ 0.00525f, 0.01160f, 0.01819f, 0.02527f, 0.03308f, 0.04065f, - 0.04773f, 0.05383f, 0.05969f, 0.06531f, 0.07214f, 0.08118f, - 0.09485f, 0.12048f }, + (float[]){ 0.00061f, 0.00330f, 0.00818f, 0.01453f, 0.02185f, 0.02966f, + 0.03772f, 0.04578f, 0.05383f, 0.06262f, 0.07288f, 0.08582f, + 0.10339f, 0.13464f }, // TX_16X32 - (float[]){ 0.01257f, 0.02576f, 0.03723f, 0.04578f, 0.05212f, 0.05798f, - 0.06506f, 0.07385f, 0.08606f, 0.10925f }, + NULL, // TX_32X16 - (float[]){ 0.01233f, 0.02527f, 0.03699f, 0.04602f, 0.05286f, 0.05896f, - 0.06531f, 0.07336f, 0.08582f, 0.11072f }, + NULL, // TX_32X64 NULL, // TX_64X32 NULL, // TX_4X16 - NULL, + (float[]){ 0.00232f, 0.00671f, 0.01257f, 0.01941f, 0.02673f, 0.03430f, + 0.04211f, 0.04968f, 0.05750f, 0.06580f, 0.07507f, 0.08655f, + 0.10242f, 0.12878f }, // TX_16X4 - NULL, + (float[]){ 0.00110f, 0.00525f, 0.01208f, 0.01990f, 0.02795f, 0.03601f, + 0.04358f, 0.05115f, 0.05896f, 0.06702f, 0.07629f, 0.08752f, + 0.10217f, 0.12610f }, // TX_8X32 NULL, // TX_32X8 @@ -1631,7 +1731,18 @@ static uint16_t prune_tx_2D(MACROBLOCK *x, BLOCK_SIZE bsize, TX_SIZE tx_size, cur_scores_2D[3]; } score_2D_average /= 16; - score_2D_transform_pow8(scores_2D, (20 - score_2D_average)); + + const int prune_aggr_table[2][2] = { { 6, 4 }, { 10, 7 } }; + int pruning_aggressiveness = 1; + if (tx_set_type == EXT_TX_SET_ALL16) { + score_2D_transform_pow8(scores_2D, (10 - score_2D_average)); + pruning_aggressiveness = + prune_aggr_table[prune_mode - PRUNE_2D_ACCURATE][0]; + } else if (tx_set_type == EXT_TX_SET_DTT9_IDTX_1DDCT) { + score_2D_transform_pow8(scores_2D, (20 - score_2D_average)); + pruning_aggressiveness = + prune_aggr_table[prune_mode - PRUNE_2D_ACCURATE][1]; + } // Always keep the TX type with the highest score, prune all others with // score below score_thresh. @@ -1645,18 +1756,6 @@ static uint16_t prune_tx_2D(MACROBLOCK *x, BLOCK_SIZE bsize, TX_SIZE tx_size, } } - int pruning_aggressiveness = 0; - if (prune_mode == PRUNE_2D_ACCURATE) { - if (tx_set_type == EXT_TX_SET_ALL16) - pruning_aggressiveness = 6; - else if (tx_set_type == EXT_TX_SET_DTT9_IDTX_1DDCT) - pruning_aggressiveness = 4; - } else if (prune_mode == PRUNE_2D_FAST) { - if (tx_set_type == EXT_TX_SET_ALL16) - pruning_aggressiveness = 10; - else if (tx_set_type == EXT_TX_SET_DTT9_IDTX_1DDCT) - pruning_aggressiveness = 7; - } const float score_thresh = prune_2D_adaptive_thresholds[tx_size][pruning_aggressiveness - 1]; @@ -1724,9 +1823,11 @@ static void prune_tx(const AV1_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x, } static void model_rd_from_sse(const AV1_COMP *const cpi, - const MACROBLOCKD *const xd, BLOCK_SIZE bsize, - int plane, int64_t sse, int *rate, - int64_t *dist) { + const MACROBLOCK *const x, BLOCK_SIZE plane_bsize, + int plane, int64_t sse, int num_samples, + int *rate, int64_t *dist) { + (void)num_samples; + const MACROBLOCKD *const xd = &x->e_mbd; const struct macroblockd_plane *const pd = &xd->plane[plane]; const int dequant_shift = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd - 5 : 3; @@ -1734,15 +1835,17 @@ static void model_rd_from_sse(const AV1_COMP *const cpi, // Fast approximate the modelling function. if (cpi->sf.simple_model_rd_from_var) { const int64_t square_error = sse; - int quantizer = (pd->dequant_Q3[1] >> dequant_shift); + int quantizer = pd->dequant_Q3[1] >> dequant_shift; if (quantizer < 120) - *rate = (int)((square_error * (280 - quantizer)) >> - (16 - AV1_PROB_COST_SHIFT)); + *rate = (int)AOMMIN( + (square_error * (280 - quantizer)) >> (16 - AV1_PROB_COST_SHIFT), + INT_MAX); else *rate = 0; + assert(*rate >= 0); *dist = (square_error * quantizer) >> 8; } else { - av1_model_rd_from_var_lapndz(sse, num_pels_log2_lookup[bsize], + av1_model_rd_from_var_lapndz(sse, num_pels_log2_lookup[plane_bsize], pd->dequant_Q3[1] >> dequant_shift, rate, dist); } @@ -1776,22 +1879,23 @@ static int64_t get_sse(const AV1_COMP *cpi, const MACROBLOCK *x) { static void model_rd_for_sb(const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd, int plane_from, - int plane_to, int *out_rate_sum, - int64_t *out_dist_sum, int *skip_txfm_sb, - int64_t *skip_sse_sb, int *plane_rate, - int64_t *plane_sse, int64_t *plane_dist) { + int plane_to, int mi_row, int mi_col, + int *out_rate_sum, int64_t *out_dist_sum, + int *skip_txfm_sb, int64_t *skip_sse_sb, + int *plane_rate, int64_t *plane_sse, + int64_t *plane_dist) { // Note our transform coeffs are 8 times an orthogonal transform. // Hence quantizer step is also 8 times. To get effective quantizer // we need to divide by 8 before sending to modeling function. int plane; + (void)mi_row; + (void)mi_col; const int ref = xd->mi[0]->ref_frame[0]; int64_t rate_sum = 0; int64_t dist_sum = 0; int64_t total_sse = 0; - x->pred_sse[ref] = 0; - for (plane = plane_from; plane <= plane_to; ++plane) { struct macroblock_plane *const p = &x->plane[plane]; struct macroblockd_plane *const pd = &xd->plane[plane]; @@ -1805,26 +1909,31 @@ static void model_rd_for_sb(const AV1_COMP *const cpi, BLOCK_SIZE bsize, if (x->skip_chroma_rd && plane) continue; - // TODO(geza): Write direct sse functions that do not compute - // variance as well. - sse = aom_sum_squares_2d_i16(p->src_diff, bw, bw, bh); + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + sse = aom_highbd_sse(p->src.buf, p->src.stride, pd->dst.buf, + pd->dst.stride, bw, bh); + } else { + sse = aom_sse(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, bw, + bh); + } sse = ROUND_POWER_OF_TWO(sse, (xd->bd - 8) * 2); + model_rd_from_sse(cpi, x, plane_bsize, plane, sse, bw * bh, &rate, &dist); + if (plane == 0) x->pred_sse[ref] = (unsigned int)AOMMIN(sse, UINT_MAX); total_sse += sse; - - model_rd_from_sse(cpi, xd, plane_bsize, plane, sse, &rate, &dist); - rate_sum += rate; dist_sum += dist; if (plane_rate) plane_rate[plane] = rate; if (plane_sse) plane_sse[plane] = sse; if (plane_dist) plane_dist[plane] = dist; + assert(rate_sum >= 0); } if (skip_txfm_sb) *skip_txfm_sb = total_sse == 0; if (skip_sse_sb) *skip_sse_sb = total_sse << 4; + rate_sum = AOMMIN(rate_sum, INT_MAX); *out_rate_sum = (int)rate_sum; *out_dist_sum = dist_sum; } @@ -1949,7 +2058,7 @@ static unsigned pixel_dist(const AV1_COMP *const cpi, const MACROBLOCK *x, assert(visible_cols > 0); #if CONFIG_DIST_8X8 - if (x->using_dist_8x8 && plane == 0 && txb_cols >= 8 && txb_rows >= 8) + if (x->using_dist_8x8 && plane == 0) return (unsigned)av1_dist_8x8(cpi, x, src, src_stride, dst, dst_stride, tx_bsize, txb_cols, txb_rows, visible_cols, visible_rows, x->qindex); @@ -1967,8 +2076,7 @@ static unsigned pixel_dist(const AV1_COMP *const cpi, const MACROBLOCK *x, static INLINE int64_t pixel_diff_dist(const MACROBLOCK *x, int plane, int blk_row, int blk_col, const BLOCK_SIZE plane_bsize, - const BLOCK_SIZE tx_bsize, - int force_sse) { + const BLOCK_SIZE tx_bsize) { int visible_rows, visible_cols; const MACROBLOCKD *xd = &x->e_mbd; get_txb_dimensions(xd, plane, plane_bsize, blk_row, blk_col, tx_bsize, NULL, @@ -1978,8 +2086,7 @@ static INLINE int64_t pixel_diff_dist(const MACROBLOCK *x, int plane, #if CONFIG_DIST_8X8 int txb_height = block_size_high[tx_bsize]; int txb_width = block_size_wide[tx_bsize]; - if (!force_sse && x->using_dist_8x8 && plane == 0 && txb_width >= 8 && - txb_height >= 8) { + if (x->using_dist_8x8 && plane == 0) { const int src_stride = x->plane[plane].src.stride; const int src_idx = (blk_row * src_stride + blk_col) << tx_size_wide_log2[0]; @@ -2145,29 +2252,7 @@ static INLINE int64_t dist_block_px_domain(const AV1_COMP *cpi, MACROBLOCK *x, av1_inverse_transform_block(xd, dqcoeff, plane, tx_type, tx_size, recon, MAX_TX_SIZE, eob, cpi->common.reduced_tx_set_used); -#if CONFIG_DIST_8X8 - if (x->using_dist_8x8 && plane == 0 && (bsw < 8 || bsh < 8)) { - // Save decoded pixels for inter block in pd->pred to avoid - // block_8x8_rd_txfm_daala_dist() need to produce them - // by calling av1_inverse_transform_block() again. - const int pred_stride = block_size_wide[plane_bsize]; - const int pred_idx = (blk_row * pred_stride + blk_col) - << tx_size_wide_log2[0]; - int16_t *pred = &x->pred_luma[pred_idx]; - int i, j; - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - for (j = 0; j < bsh; j++) - for (i = 0; i < bsw; i++) - pred[j * pred_stride + i] = - CONVERT_TO_SHORTPTR(recon)[j * MAX_TX_SIZE + i]; - } else { - for (j = 0; j < bsh; j++) - for (i = 0; i < bsw; i++) - pred[j * pred_stride + i] = recon[j * MAX_TX_SIZE + i]; - } - } -#endif // CONFIG_DIST_8X8 return 16 * pixel_dist(cpi, x, plane, src, src_stride, recon, MAX_TX_SIZE, blk_row, blk_col, plane_bsize, tx_bsize); } @@ -2258,11 +2343,11 @@ static void get_2x2_normalized_sses_and_sads( } } +// NOTE: CONFIG_COLLECT_RD_STATS has 3 possible values +// 0: Do not collect any RD stats +// 1: Collect RD stats for transform units +// 2: Collect RD stats for partition units #if CONFIG_COLLECT_RD_STATS - // NOTE: CONFIG_COLLECT_RD_STATS has 3 possible values - // 0: Do not collect any RD stats - // 1: Collect RD stats for transform units - // 2: Collect RD stats for partition units #if CONFIG_COLLECT_RD_STATS == 1 static void PrintTransformUnitStats(const AV1_COMP *const cpi, MACROBLOCK *x, @@ -2274,7 +2359,7 @@ static void PrintTransformUnitStats(const AV1_COMP *const cpi, MACROBLOCK *x, // Generate small sample to restrict output size. static unsigned int seed = 21743; - if (lcg_rand16(&seed) % 100 > 0) return; + if (lcg_rand16(&seed) % 256 > 0) return; const char output_file[] = "tu_stats.txt"; FILE *fout = fopen(output_file, "a"); @@ -2336,7 +2421,8 @@ static void PrintTransformUnitStats(const AV1_COMP *const cpi, MACROBLOCK *x, int model_rate; int64_t model_dist; - model_rd_from_sse(cpi, xd, tx_bsize, plane, sse, &model_rate, &model_dist); + model_rd_sse_fn[MODELRD_CURVFIT](cpi, x, tx_bsize, plane, sse, num_samples, + &model_rate, &model_dist); const double model_rate_norm = (double)model_rate / num_samples; const double model_dist_norm = (double)model_dist / num_samples; fprintf(fout, " %g %g", model_rate_norm, model_dist_norm); @@ -2360,7 +2446,7 @@ static void PrintTransformUnitStats(const AV1_COMP *const cpi, MACROBLOCK *x, } #endif // CONFIG_COLLECT_RD_STATS == 1 -#if CONFIG_COLLECT_RD_STATS == 2 +#if CONFIG_COLLECT_RD_STATS >= 2 static void PrintPredictionUnitStats(const AV1_COMP *const cpi, MACROBLOCK *x, const RD_STATS *const rd_stats, BLOCK_SIZE plane_bsize) { @@ -2369,7 +2455,7 @@ static void PrintPredictionUnitStats(const AV1_COMP *const cpi, MACROBLOCK *x, // Generate small sample to restrict output size. static unsigned int seed = 95014; - if (lcg_rand16(&seed) % 100 > 0) return; + if (lcg_rand16(&seed) % 256 > 0) return; const char output_file[] = "pu_stats.txt"; FILE *fout = fopen(output_file, "a"); @@ -2390,8 +2476,10 @@ static void PrintPredictionUnitStats(const AV1_COMP *const cpi, MACROBLOCK *x, const double rate_norm = (double)rd_stats->rate / num_samples; const double dist_norm = (double)rd_stats->dist / num_samples; + const double rdcost_norm = + (double)RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist) / num_samples; - fprintf(fout, "%g %g", rate_norm, dist_norm); + fprintf(fout, "%g %g %g", rate_norm, dist_norm, rdcost_norm); const int src_stride = p->src.stride; const uint8_t *const src = p->src.buf; @@ -2426,14 +2514,18 @@ static void PrintPredictionUnitStats(const AV1_COMP *const cpi, MACROBLOCK *x, fprintf(fout, " %g", sad_norm_arr[i]); } - fprintf(fout, " %d %d %d", q_step, bw, bh); + fprintf(fout, " %d %d %d %d", q_step, x->rdmult, bw, bh); int model_rate; int64_t model_dist; - model_rd_from_sse(cpi, xd, plane_bsize, plane, sse, &model_rate, &model_dist); + model_rd_sse_fn[MODELRD_CURVFIT](cpi, x, plane_bsize, plane, sse, num_samples, + &model_rate, &model_dist); + const double model_rdcost_norm = + (double)RDCOST(x->rdmult, model_rate, model_dist) / num_samples; const double model_rate_norm = (double)model_rate / num_samples; const double model_dist_norm = (double)model_dist / num_samples; - fprintf(fout, " %g %g", model_rate_norm, model_dist_norm); + fprintf(fout, " %g %g %g", model_rate_norm, model_dist_norm, + model_rdcost_norm); double mean = get_mean(src_diff, diff_stride, bw, bh); mean /= (1 << shift); @@ -2450,53 +2542,51 @@ static void PrintPredictionUnitStats(const AV1_COMP *const cpi, MACROBLOCK *x, fprintf(fout, "\n"); fclose(fout); } -#endif // CONFIG_COLLECT_RD_STATS == 2 +#endif // CONFIG_COLLECT_RD_STATS >= 2 #endif // CONFIG_COLLECT_RD_STATS -static void model_rd_with_dnn(const AV1_COMP *const cpi, MACROBLOCK *const x, - BLOCK_SIZE plane_bsize, int plane, int64_t *rsse, +static void model_rd_with_dnn(const AV1_COMP *const cpi, + const MACROBLOCK *const x, BLOCK_SIZE plane_bsize, + int plane, int64_t sse, int num_samples, int *rate, int64_t *dist) { const MACROBLOCKD *const xd = &x->e_mbd; const struct macroblockd_plane *const pd = &xd->plane[plane]; const int log_numpels = num_pels_log2_lookup[plane_bsize]; + const int dequant_shift = + (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd - 5 : 3; + const int q_step = AOMMAX(pd->dequant_Q3[1] >> dequant_shift, 1); + const struct macroblock_plane *const p = &x->plane[plane]; int bw, bh; - const int diff_stride = block_size_wide[plane_bsize]; get_txb_dimensions(xd, plane, plane_bsize, 0, 0, plane_bsize, NULL, NULL, &bw, &bh); - const int num_samples = bw * bh; - const int dequant_shift = - (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd - 5 : 3; - const int q_step = pd->dequant_Q3[1] >> dequant_shift; - const int src_stride = p->src.stride; const uint8_t *const src = p->src.buf; const int dst_stride = pd->dst.stride; const uint8_t *const dst = pd->dst.buf; const int16_t *const src_diff = p->src_diff; + const int diff_stride = block_size_wide[plane_bsize]; const int shift = (xd->bd - 8); - int64_t sse = aom_sum_squares_2d_i16(p->src_diff, diff_stride, bw, bh); - sse = ROUND_POWER_OF_TWO(sse, shift * 2); - const double sse_norm = (double)sse / num_samples; if (sse == 0) { if (rate) *rate = 0; if (dist) *dist = 0; - if (rsse) *rsse = sse; return; } if (plane) { int model_rate; int64_t model_dist; - model_rd_from_sse(cpi, xd, plane_bsize, plane, sse, &model_rate, - &model_dist); + model_rd_with_curvfit(cpi, x, plane_bsize, plane, sse, num_samples, + &model_rate, &model_dist); if (rate) *rate = model_rate; if (dist) *dist = model_dist; - if (rsse) *rsse = sse; return; } + aom_clear_system_state(); + const double sse_norm = (double)sse / num_samples; + double sse_norm_arr[4]; get_2x2_normalized_sses_and_sads(cpi, plane_bsize, src, src_stride, dst, dst_stride, src_diff, diff_stride, @@ -2506,25 +2596,26 @@ static void model_rd_with_dnn(const AV1_COMP *const cpi, MACROBLOCK *const x, for (int k = 0; k < 4; ++k) sse_norm_arr[k] /= (1 << (2 * shift)); mean /= (1 << shift); } - const double variance = sse_norm - mean * mean; - assert(variance >= 0.0); + double sse_norm_sum = 0.0, sse_frac_arr[3]; + for (int k = 0; k < 4; ++k) sse_norm_sum += sse_norm_arr[k]; + for (int k = 0; k < 3; ++k) + sse_frac_arr[k] = + sse_norm_sum > 0.0 ? sse_norm_arr[k] / sse_norm_sum : 0.25; const double q_sqr = (double)(q_step * q_step); const double q_sqr_by_sse_norm = q_sqr / (sse_norm + 1.0); + const double mean_sqr_by_sse_norm = mean * mean / (sse_norm + 1.0); double hor_corr, vert_corr; get_horver_correlation(src_diff, diff_stride, bw, bh, &hor_corr, &vert_corr); - float features[11]; + float features[NUM_FEATURES_PUSTATS]; features[0] = (float)hor_corr; features[1] = (float)log_numpels; - features[2] = (float)q_sqr; + features[2] = (float)mean_sqr_by_sse_norm; features[3] = (float)q_sqr_by_sse_norm; - features[4] = (float)sse_norm_arr[0]; - features[5] = (float)sse_norm_arr[1]; - features[6] = (float)sse_norm_arr[2]; - features[7] = (float)sse_norm_arr[3]; - features[8] = (float)sse_norm; - features[9] = (float)variance; - features[10] = (float)vert_corr; + features[4] = (float)sse_frac_arr[0]; + features[5] = (float)sse_frac_arr[1]; + features[6] = (float)sse_frac_arr[2]; + features[7] = (float)vert_corr; float rate_f, dist_by_sse_norm_f; av1_nn_predict(features, &av1_pustats_dist_nnconfig, &dist_by_sse_norm_f); @@ -2532,27 +2623,29 @@ static void model_rd_with_dnn(const AV1_COMP *const cpi, MACROBLOCK *const x, const float dist_f = (float)((double)dist_by_sse_norm_f * (1.0 + sse_norm)); int rate_i = (int)(AOMMAX(0.0, rate_f * num_samples) + 0.5); int64_t dist_i = (int64_t)(AOMMAX(0.0, dist_f * num_samples) + 0.5); + aom_clear_system_state(); // Check if skip is better - if (RDCOST(x->rdmult, rate_i, dist_i) >= RDCOST(x->rdmult, 0, (sse << 4))) { + if (rate_i == 0) { dist_i = sse << 4; + } else if (RDCOST(x->rdmult, rate_i, dist_i) >= + RDCOST(x->rdmult, 0, sse << 4)) { rate_i = 0; - } else if (rate_i == 0) { dist_i = sse << 4; } if (rate) *rate = rate_i; if (dist) *dist = dist_i; - if (rsse) *rsse = sse; return; } -void model_rd_for_sb_with_dnn(const AV1_COMP *const cpi, BLOCK_SIZE bsize, - MACROBLOCK *x, MACROBLOCKD *xd, int plane_from, - int plane_to, int *out_rate_sum, - int64_t *out_dist_sum, int *skip_txfm_sb, - int64_t *skip_sse_sb, int *plane_rate, - int64_t *plane_sse, int64_t *plane_dist) { +static void model_rd_for_sb_with_dnn( + const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd, + int plane_from, int plane_to, int mi_row, int mi_col, int *out_rate_sum, + int64_t *out_dist_sum, int *skip_txfm_sb, int64_t *skip_sse_sb, + int *plane_rate, int64_t *plane_sse, int64_t *plane_dist) { + (void)mi_row; + (void)mi_col; // Note our transform coeffs are 8 times an orthogonal transform. // Hence quantizer step is also 8 times. To get effective quantizer // we need to divide by 8 before sending to modeling function. @@ -2562,19 +2655,306 @@ void model_rd_for_sb_with_dnn(const AV1_COMP *const cpi, BLOCK_SIZE bsize, int64_t dist_sum = 0; int64_t total_sse = 0; - x->pred_sse[ref] = 0; + for (int plane = plane_from; plane <= plane_to; ++plane) { + struct macroblockd_plane *const pd = &xd->plane[plane]; + const BLOCK_SIZE plane_bsize = + get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y); + int64_t dist, sse; + int rate; + + if (x->skip_chroma_rd && plane) continue; + + const struct macroblock_plane *const p = &x->plane[plane]; + const int shift = (xd->bd - 8); + int bw, bh; + get_txb_dimensions(xd, plane, plane_bsize, 0, 0, plane_bsize, NULL, NULL, + &bw, &bh); + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + sse = aom_highbd_sse(p->src.buf, p->src.stride, pd->dst.buf, + pd->dst.stride, bw, bh); + } else { + sse = aom_sse(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, bw, + bh); + } + sse = ROUND_POWER_OF_TWO(sse, shift * 2); + + model_rd_with_dnn(cpi, x, plane_bsize, plane, sse, bw * bh, &rate, &dist); + + if (plane == 0) x->pred_sse[ref] = (unsigned int)AOMMIN(sse, UINT_MAX); + + total_sse += sse; + rate_sum += rate; + dist_sum += dist; + + if (plane_rate) plane_rate[plane] = rate; + if (plane_sse) plane_sse[plane] = sse; + if (plane_dist) plane_dist[plane] = dist; + } + + if (skip_txfm_sb) *skip_txfm_sb = total_sse == 0; + if (skip_sse_sb) *skip_sse_sb = total_sse << 4; + *out_rate_sum = (int)rate_sum; + *out_dist_sum = dist_sum; +} + +// Fits a surface for rate and distortion using as features: +// log2(sse_norm + 1) and log2(sse_norm/qstep^2) +static void model_rd_with_surffit(const AV1_COMP *const cpi, + const MACROBLOCK *const x, + BLOCK_SIZE plane_bsize, int plane, + int64_t sse, int num_samples, int *rate, + int64_t *dist) { + (void)cpi; + (void)plane_bsize; + const MACROBLOCKD *const xd = &x->e_mbd; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const int dequant_shift = + (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd - 5 : 3; + const int qstep = AOMMAX(pd->dequant_Q3[1] >> dequant_shift, 1); + if (sse == 0) { + if (rate) *rate = 0; + if (dist) *dist = 0; + return; + } + aom_clear_system_state(); + const double sse_norm = (double)sse / num_samples; + const double qstepsqr = (double)qstep * qstep; + const double xm = log(sse_norm + 1.0) / log(2.0); + const double yl = log(sse_norm / qstepsqr) / log(2.0); + double rate_f, dist_by_sse_norm_f; + + av1_model_rd_surffit(xm, yl, &rate_f, &dist_by_sse_norm_f); + + const double dist_f = dist_by_sse_norm_f * sse_norm; + int rate_i = (int)(AOMMAX(0.0, rate_f * num_samples) + 0.5); + int64_t dist_i = (int64_t)(AOMMAX(0.0, dist_f * num_samples) + 0.5); + aom_clear_system_state(); + + // Check if skip is better + if (rate_i == 0) { + dist_i = sse << 4; + } else if (RDCOST(x->rdmult, rate_i, dist_i) >= + RDCOST(x->rdmult, 0, sse << 4)) { + rate_i = 0; + dist_i = sse << 4; + } + + if (rate) *rate = rate_i; + if (dist) *dist = dist_i; +} + +static void model_rd_for_sb_with_surffit( + const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd, + int plane_from, int plane_to, int mi_row, int mi_col, int *out_rate_sum, + int64_t *out_dist_sum, int *skip_txfm_sb, int64_t *skip_sse_sb, + int *plane_rate, int64_t *plane_sse, int64_t *plane_dist) { + (void)mi_row; + (void)mi_col; + // Note our transform coeffs are 8 times an orthogonal transform. + // Hence quantizer step is also 8 times. To get effective quantizer + // we need to divide by 8 before sending to modeling function. + const int ref = xd->mi[0]->ref_frame[0]; + + int64_t rate_sum = 0; + int64_t dist_sum = 0; + int64_t total_sse = 0; + + for (int plane = plane_from; plane <= plane_to; ++plane) { + struct macroblockd_plane *const pd = &xd->plane[plane]; + const BLOCK_SIZE plane_bsize = + get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y); + int64_t dist, sse; + int rate; + + if (x->skip_chroma_rd && plane) continue; + + int bw, bh; + const struct macroblock_plane *const p = &x->plane[plane]; + const int shift = (xd->bd - 8); + get_txb_dimensions(xd, plane, plane_bsize, 0, 0, plane_bsize, NULL, NULL, + &bw, &bh); + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + sse = aom_highbd_sse(p->src.buf, p->src.stride, pd->dst.buf, + pd->dst.stride, bw, bh); + } else { + sse = aom_sse(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, bw, + bh); + } + sse = ROUND_POWER_OF_TWO(sse, shift * 2); + + model_rd_with_surffit(cpi, x, plane_bsize, plane, sse, bw * bh, &rate, + &dist); + + if (plane == 0) x->pred_sse[ref] = (unsigned int)AOMMIN(sse, UINT_MAX); + + total_sse += sse; + rate_sum += rate; + dist_sum += dist; + + if (plane_rate) plane_rate[plane] = rate; + if (plane_sse) plane_sse[plane] = sse; + if (plane_dist) plane_dist[plane] = dist; + } + + if (skip_txfm_sb) *skip_txfm_sb = total_sse == 0; + if (skip_sse_sb) *skip_sse_sb = total_sse << 4; + *out_rate_sum = (int)rate_sum; + *out_dist_sum = dist_sum; +} + +// Fits a curve for rate and distortion using as feature: +// log2(sse_norm/qstep^2) +static void model_rd_with_curvfit(const AV1_COMP *const cpi, + const MACROBLOCK *const x, + BLOCK_SIZE plane_bsize, int plane, + int64_t sse, int num_samples, int *rate, + int64_t *dist) { + (void)cpi; + (void)plane_bsize; + const MACROBLOCKD *const xd = &x->e_mbd; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const int dequant_shift = + (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd - 5 : 3; + const int qstep = AOMMAX(pd->dequant_Q3[1] >> dequant_shift, 1); + + if (sse == 0) { + if (rate) *rate = 0; + if (dist) *dist = 0; + return; + } + aom_clear_system_state(); + const double sse_norm = (double)sse / num_samples; + const double qstepsqr = (double)qstep * qstep; + const double xqr = log(sse_norm / qstepsqr) / log(2.0); + + double rate_f, dist_by_sse_norm_f; + av1_model_rd_curvfit(xqr, &rate_f, &dist_by_sse_norm_f); + + const double dist_f = dist_by_sse_norm_f * sse_norm; + int rate_i = (int)(AOMMAX(0.0, rate_f * num_samples) + 0.5); + int64_t dist_i = (int64_t)(AOMMAX(0.0, dist_f * num_samples) + 0.5); + aom_clear_system_state(); + + // Check if skip is better + if (rate_i == 0) { + dist_i = sse << 4; + } else if (RDCOST(x->rdmult, rate_i, dist_i) >= + RDCOST(x->rdmult, 0, sse << 4)) { + rate_i = 0; + dist_i = sse << 4; + } + + if (rate) *rate = rate_i; + if (dist) *dist = dist_i; +} + +static void model_rd_for_sb_with_curvfit( + const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd, + int plane_from, int plane_to, int mi_row, int mi_col, int *out_rate_sum, + int64_t *out_dist_sum, int *skip_txfm_sb, int64_t *skip_sse_sb, + int *plane_rate, int64_t *plane_sse, int64_t *plane_dist) { + (void)mi_row; + (void)mi_col; + // Note our transform coeffs are 8 times an orthogonal transform. + // Hence quantizer step is also 8 times. To get effective quantizer + // we need to divide by 8 before sending to modeling function. + const int ref = xd->mi[0]->ref_frame[0]; + + int64_t rate_sum = 0; + int64_t dist_sum = 0; + int64_t total_sse = 0; + + for (int plane = plane_from; plane <= plane_to; ++plane) { + struct macroblockd_plane *const pd = &xd->plane[plane]; + const BLOCK_SIZE plane_bsize = + get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y); + int64_t dist, sse; + int rate; + + if (x->skip_chroma_rd && plane) continue; + + int bw, bh; + const struct macroblock_plane *const p = &x->plane[plane]; + const int shift = (xd->bd - 8); + get_txb_dimensions(xd, plane, plane_bsize, 0, 0, plane_bsize, NULL, NULL, + &bw, &bh); + + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + sse = aom_highbd_sse(p->src.buf, p->src.stride, pd->dst.buf, + pd->dst.stride, bw, bh); + } else { + sse = aom_sse(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, bw, + bh); + } + + sse = ROUND_POWER_OF_TWO(sse, shift * 2); + model_rd_with_curvfit(cpi, x, plane_bsize, plane, sse, bw * bh, &rate, + &dist); + + if (plane == 0) x->pred_sse[ref] = (unsigned int)AOMMIN(sse, UINT_MAX); + + total_sse += sse; + rate_sum += rate; + dist_sum += dist; + + if (plane_rate) plane_rate[plane] = rate; + if (plane_sse) plane_sse[plane] = sse; + if (plane_dist) plane_dist[plane] = dist; + } + + if (skip_txfm_sb) *skip_txfm_sb = total_sse == 0; + if (skip_sse_sb) *skip_sse_sb = total_sse << 4; + *out_rate_sum = (int)rate_sum; + *out_dist_sum = dist_sum; +} + +static void model_rd_for_sb_with_fullrdy( + const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd, + int plane_from, int plane_to, int mi_row, int mi_col, int *out_rate_sum, + int64_t *out_dist_sum, int *skip_txfm_sb, int64_t *skip_sse_sb, + int *plane_rate, int64_t *plane_sse, int64_t *plane_dist) { + const int ref = xd->mi[0]->ref_frame[0]; + + int64_t rate_sum = 0; + int64_t dist_sum = 0; + int64_t total_sse = 0; for (int plane = plane_from; plane <= plane_to; ++plane) { + struct macroblock_plane *const p = &x->plane[plane]; struct macroblockd_plane *const pd = &xd->plane[plane]; const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y); + const int bw = block_size_wide[plane_bsize]; + const int bh = block_size_high[plane_bsize]; int64_t sse; int rate; int64_t dist; if (x->skip_chroma_rd && plane) continue; - model_rd_with_dnn(cpi, x, plane_bsize, plane, &sse, &rate, &dist); + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + sse = aom_highbd_sse(p->src.buf, p->src.stride, pd->dst.buf, + pd->dst.stride, bw, bh); + } else { + sse = aom_sse(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, bw, + bh); + } + sse = ROUND_POWER_OF_TWO(sse, (xd->bd - 8) * 2); + + RD_STATS rd_stats; + if (plane == 0) { + select_tx_type_yrd(cpi, x, &rd_stats, bsize, mi_row, mi_col, INT64_MAX); + if (rd_stats.invalid_rate) { + rate = 0; + dist = sse << 4; + } else { + rate = rd_stats.rate; + dist = rd_stats.dist; + } + } else { + model_rd_with_curvfit(cpi, x, plane_bsize, plane, sse, bw * bh, &rate, + &dist); + } if (plane == 0) x->pred_sse[ref] = (unsigned int)AOMMIN(sse, UINT_MAX); @@ -2638,8 +3018,7 @@ static int64_t search_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane, intra_txb_rd_info = &x->txb_rd_record_intra.tx_rd_info[intra_hash_idx]; cur_joint_ctx = (txb_ctx->dc_sign_ctx << 8) + txb_ctx->txb_skip_ctx; - if (intra_hash_idx > 0 && - intra_txb_rd_info->entropy_context == cur_joint_ctx && + if (intra_txb_rd_info->entropy_context == cur_joint_ctx && x->txb_rd_record_intra.tx_rd_info[intra_hash_idx].valid) { mbmi->txk_type[txk_type_idx] = intra_txb_rd_info->tx_type; const TX_TYPE ref_tx_type = @@ -2727,7 +3106,6 @@ static int64_t search_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane, #if CONFIG_DIST_8X8 if (x->using_dist_8x8) use_transform_domain_distortion = 0; #endif - int calc_pixel_domain_distortion_final = cpi->sf.use_transform_domain_distortion == 1 && use_transform_domain_distortion && x->rd_model != LOW_TXFM_RD && @@ -2740,7 +3118,7 @@ static int64_t search_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane, const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size]; int64_t block_sse = - pixel_diff_dist(x, plane, blk_row, blk_col, plane_bsize, tx_bsize, 1); + pixel_diff_dist(x, plane, blk_row, blk_col, plane_bsize, tx_bsize); if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) block_sse = ROUND_POWER_OF_TWO(block_sse, (xd->bd - 8) * 2); block_sse *= 16; @@ -2834,7 +3212,6 @@ static int64_t search_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane, assert(best_rd != INT64_MAX); best_rd_stats->skip = best_eob == 0; - if (best_eob == 0) best_tx_type = DCT_DCT; if (plane == 0) { update_txk_array(mbmi->txk_type, plane_bsize, blk_row, blk_col, tx_size, best_tx_type); @@ -2914,24 +3291,12 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col, int64_t rd1, rd2, rd; RD_STATS this_rd_stats; -#if CONFIG_DIST_8X8 - // If sub8x8 tx, 8x8 or larger partition, and luma channel, - // dist-8x8 disables early skip, because the distortion metrics for - // sub8x8 tx (MSE) and reference distortion from 8x8 or larger partition - // (new distortion metric) are different. - // Exception is: dist-8x8 is enabled but still MSE is used, - // i.e. "--tune=" encoder option is not used. - int bw = block_size_wide[plane_bsize]; - int bh = block_size_high[plane_bsize]; - int disable_early_skip = - x->using_dist_8x8 && plane == AOM_PLANE_Y && bw >= 8 && bh >= 8 && - (tx_size == TX_4X4 || tx_size == TX_4X8 || tx_size == TX_8X4) && - x->tune_metric != AOM_TUNE_PSNR; -#endif // CONFIG_DIST_8X8 - av1_init_rd_stats(&this_rd_stats); - if (args->exit_early) return; + if (args->exit_early) { + args->incomplete_exit = 1; + return; + } if (!is_inter_block(mbmi)) { av1_predict_intra_block_facade(cm, xd, plane, blk_col, blk_row, tx_size); @@ -2954,11 +3319,14 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col, #endif // CONFIG_RD_DEBUG av1_set_txb_context(x, plane, block, tx_size, a, l); - if (plane == 0) { - x->blk_skip[blk_row * - (block_size_wide[plane_bsize] >> tx_size_wide_log2[0]) + - blk_col] = (x->plane[plane].eobs[block] == 0); - } + const int blk_idx = + blk_row * (block_size_wide[plane_bsize] >> tx_size_wide_log2[0]) + + blk_col; + + if (plane == 0) + set_blk_skip(x, plane, blk_idx, x->plane[plane].eobs[block] == 0); + else + set_blk_skip(x, plane, blk_idx, 0); rd1 = RDCOST(x->rdmult, this_rd_stats.rate, this_rd_stats.dist); rd2 = RDCOST(x->rdmult, 0, this_rd_stats.sse); @@ -2972,100 +3340,11 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col, args->this_rd += rd; -#if CONFIG_DIST_8X8 - if (!disable_early_skip) -#endif - if (args->this_rd > args->best_rd) { - args->exit_early = 1; - return; - } -} - -#if CONFIG_DIST_8X8 -static void dist_8x8_sub8x8_txfm_rd(const AV1_COMP *const cpi, MACROBLOCK *x, - BLOCK_SIZE bsize, - struct rdcost_block_args *args) { - MACROBLOCKD *const xd = &x->e_mbd; - const struct macroblockd_plane *const pd = &xd->plane[0]; - const struct macroblock_plane *const p = &x->plane[0]; - MB_MODE_INFO *const mbmi = xd->mi[0]; - const int src_stride = p->src.stride; - const int dst_stride = pd->dst.stride; - const uint8_t *src = &p->src.buf[0]; - const uint8_t *dst = &pd->dst.buf[0]; - const int16_t *pred = &x->pred_luma[0]; - int bw = block_size_wide[bsize]; - int bh = block_size_high[bsize]; - int visible_w = bw; - int visible_h = bh; - - int i, j; - int64_t rd, rd1, rd2; - int64_t sse = INT64_MAX, dist = INT64_MAX; - int qindex = x->qindex; - - assert((bw & 0x07) == 0); - assert((bh & 0x07) == 0); - - get_txb_dimensions(xd, 0, bsize, 0, 0, bsize, &bw, &bh, &visible_w, - &visible_h); - - const int diff_stride = block_size_wide[bsize]; - const int16_t *diff = p->src_diff; - sse = dist_8x8_diff(x, src, src_stride, diff, diff_stride, bw, bh, visible_w, - visible_h, qindex); - sse = ROUND_POWER_OF_TWO(sse, (xd->bd - 8) * 2); - sse *= 16; - - if (!is_inter_block(mbmi)) { - dist = av1_dist_8x8(cpi, x, src, src_stride, dst, dst_stride, bsize, bw, bh, - visible_w, visible_h, qindex); - dist *= 16; - } else { - // For inter mode, the decoded pixels are provided in x->pred_luma, - // while the predicted pixels are in dst. - uint8_t *pred8; - DECLARE_ALIGNED(16, uint16_t, pred16[MAX_SB_SQUARE]); - - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) - pred8 = CONVERT_TO_BYTEPTR(pred16); - else - pred8 = (uint8_t *)pred16; - - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - for (j = 0; j < bh; j++) - for (i = 0; i < bw; i++) - CONVERT_TO_SHORTPTR(pred8)[j * bw + i] = pred[j * bw + i]; - } else { - for (j = 0; j < bh; j++) - for (i = 0; i < bw; i++) pred8[j * bw + i] = (uint8_t)pred[j * bw + i]; - } - - dist = av1_dist_8x8(cpi, x, src, src_stride, pred8, bw, bsize, bw, bh, - visible_w, visible_h, qindex); - dist *= 16; - } - -#ifdef DEBUG_DIST_8X8 - if (x->tune_metric == AOM_TUNE_PSNR && xd->bd == 8) { - assert(args->rd_stats.sse == sse); - assert(args->rd_stats.dist == dist); + if (args->this_rd > args->best_rd) { + args->exit_early = 1; + return; } -#endif // DEBUG_DIST_8X8 - - args->rd_stats.sse = sse; - args->rd_stats.dist = dist; - - rd1 = RDCOST(x->rdmult, args->rd_stats.rate, args->rd_stats.dist); - rd2 = RDCOST(x->rdmult, 0, args->rd_stats.sse); - rd = AOMMIN(rd1, rd2); - - args->rd_stats.rdcost = rd; - args->this_rd = rd; - - if (args->this_rd > args->best_rd) args->exit_early = 1; } -#endif // CONFIG_DIST_8X8 static void txfm_rd_in_plane(MACROBLOCK *x, const AV1_COMP *cpi, RD_STATS *rd_stats, int64_t ref_best_rd, int plane, @@ -3089,16 +3368,12 @@ static void txfm_rd_in_plane(MACROBLOCK *x, const AV1_COMP *cpi, av1_foreach_transformed_block_in_plane(xd, bsize, plane, block_rd_txfm, &args); -#if CONFIG_DIST_8X8 - int bw = block_size_wide[bsize]; - int bh = block_size_high[bsize]; - if (x->using_dist_8x8 && !args.exit_early && plane == 0 && bw >= 8 && - bh >= 8 && (tx_size == TX_4X4 || tx_size == TX_4X8 || tx_size == TX_8X4)) - dist_8x8_sub8x8_txfm_rd(cpi, x, bsize, &args); -#endif + MB_MODE_INFO *const mbmi = xd->mi[0]; + const int is_inter = is_inter_block(mbmi); + const int invalid_rd = is_inter ? args.incomplete_exit : args.exit_early; - if (args.exit_early) { + if (invalid_rd) { av1_invalid_rd_stats(rd_stats); } else { *rd_stats = args.rd_stats; @@ -3269,6 +3544,11 @@ static void choose_tx_size_type_from_rd(const AV1_COMP *const cpi, prune_tx(cpi, bs, x, xd, EXT_TX_SET_ALL16); for (n = start_tx; depth <= MAX_TX_DEPTH; depth++, n = sub_tx_size_map[n]) { +#if CONFIG_DIST_8X8 + if (x->using_dist_8x8) { + if (tx_size_wide[n] < 8 || tx_size_high[n] < 8) continue; + } +#endif RD_STATS this_rd_stats; if (mbmi->ref_mv_idx > 0) x->rd_model = LOW_TXFM_RD; rd = txfm_yrd(cpi, x, &this_rd_stats, ref_best_rd, bs, n, FTXS_NONE); @@ -3284,10 +3564,13 @@ static void choose_tx_size_type_from_rd(const AV1_COMP *const cpi, } if (n == TX_4X4) break; } - mbmi->tx_size = best_tx_size; - memcpy(mbmi->txk_type, best_txk_type, - sizeof(best_txk_type[0]) * TXK_TYPE_BUF_LEN); - memcpy(x->blk_skip, best_blk_skip, sizeof(best_blk_skip[0]) * n4); + + if (rd_stats->rate != INT_MAX) { + mbmi->tx_size = best_tx_size; + memcpy(mbmi->txk_type, best_txk_type, + sizeof(best_txk_type[0]) * TXK_TYPE_BUF_LEN); + memcpy(x->blk_skip, best_blk_skip, sizeof(best_blk_skip[0]) * n4); + } // Reset the pruning flags. av1_zero(x->tx_search_prune); @@ -3429,7 +3712,8 @@ static int conditional_skipintra(PREDICTION_MODE mode, // Model based RD estimation for luma intra blocks. static int64_t intra_model_yrd(const AV1_COMP *const cpi, MACROBLOCK *const x, - BLOCK_SIZE bsize, int mode_cost) { + BLOCK_SIZE bsize, int mode_cost, int mi_row, + int mi_col) { const AV1_COMMON *cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = xd->mi[0]; @@ -3450,10 +3734,9 @@ static int64_t intra_model_yrd(const AV1_COMP *const cpi, MACROBLOCK *const x, } } // RD estimation. - av1_subtract_plane(x, bsize, 0); - model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &this_rd_stats.rate, - &this_rd_stats.dist, &this_rd_stats.skip, &temp_sse, NULL, - NULL, NULL); + model_rd_sb_fn[MODELRD_TYPE_INTRA]( + cpi, bsize, x, xd, 0, 0, mi_row, mi_col, &this_rd_stats.rate, + &this_rd_stats.dist, &this_rd_stats.skip, &temp_sse, NULL, NULL, NULL); if (av1_is_directional_mode(mbmi->mode) && av1_use_angle_delta(bsize)) { mode_cost += x->angle_delta_cost[mbmi->mode - V_PRED] @@ -3519,13 +3802,16 @@ static void optimize_palette_colors(uint16_t *color_cache, int n_cache, // Given the base colors as specified in centroids[], calculate the RD cost // of palette mode. -static void palette_rd_y( - const AV1_COMP *const cpi, MACROBLOCK *x, MB_MODE_INFO *mbmi, - BLOCK_SIZE bsize, int dc_mode_cost, const int *data, int *centroids, int n, - uint16_t *color_cache, int n_cache, MB_MODE_INFO *best_mbmi, - uint8_t *best_palette_color_map, int64_t *best_rd, int64_t *best_model_rd, - int *rate, int *rate_tokenonly, int *rate_overhead, int64_t *distortion, - int *skippable, PICK_MODE_CONTEXT *ctx, uint8_t *blk_skip) { +static void palette_rd_y(const AV1_COMP *const cpi, MACROBLOCK *x, + MB_MODE_INFO *mbmi, BLOCK_SIZE bsize, int mi_row, + int mi_col, int dc_mode_cost, const int *data, + int *centroids, int n, uint16_t *color_cache, + int n_cache, MB_MODE_INFO *best_mbmi, + uint8_t *best_palette_color_map, int64_t *best_rd, + int64_t *best_model_rd, int *rate, int *rate_tokenonly, + int *rate_overhead, int64_t *distortion, + int *skippable, PICK_MODE_CONTEXT *ctx, + uint8_t *blk_skip) { optimize_palette_colors(color_cache, n_cache, n, 1, centroids); int k = av1_remove_duplicates(centroids, n); if (k < PALETTE_MIN_SIZE) { @@ -3551,7 +3837,8 @@ static void palette_rd_y( extend_palette_color_map(color_map, cols, rows, block_width, block_height); const int palette_mode_cost = intra_mode_info_cost_y(cpi, x, mbmi, bsize, dc_mode_cost); - int64_t this_model_rd = intra_model_yrd(cpi, x, bsize, palette_mode_cost); + int64_t this_model_rd = + intra_model_yrd(cpi, x, bsize, palette_mode_cost, mi_row, mi_col); if (*best_model_rd != INT64_MAX && this_model_rd > *best_model_rd + (*best_model_rd >> 1)) return; @@ -3580,11 +3867,11 @@ static void palette_rd_y( } static int rd_pick_palette_intra_sby( - const AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bsize, - int dc_mode_cost, MB_MODE_INFO *best_mbmi, uint8_t *best_palette_color_map, - int64_t *best_rd, int64_t *best_model_rd, int *rate, int *rate_tokenonly, - int64_t *distortion, int *skippable, PICK_MODE_CONTEXT *ctx, - uint8_t *best_blk_skip) { + const AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row, + int mi_col, int dc_mode_cost, MB_MODE_INFO *best_mbmi, + uint8_t *best_palette_color_map, int64_t *best_rd, int64_t *best_model_rd, + int *rate, int *rate_tokenonly, int64_t *distortion, int *skippable, + PICK_MODE_CONTEXT *ctx, uint8_t *best_blk_skip) { int rate_overhead = 0; MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = xd->mi[0]; @@ -3668,10 +3955,11 @@ static int rd_pick_palette_intra_sby( // where the dominant colors and the k-means results are similar. for (n = AOMMIN(colors, PALETTE_MAX_SIZE); n >= 2; --n) { for (i = 0; i < n; ++i) centroids[i] = top_colors[i]; - palette_rd_y(cpi, x, mbmi, bsize, dc_mode_cost, data, centroids, n, - color_cache, n_cache, best_mbmi, best_palette_color_map, - best_rd, best_model_rd, rate, rate_tokenonly, &rate_overhead, - distortion, skippable, ctx, best_blk_skip); + palette_rd_y(cpi, x, mbmi, bsize, mi_row, mi_col, dc_mode_cost, data, + centroids, n, color_cache, n_cache, best_mbmi, + best_palette_color_map, best_rd, best_model_rd, rate, + rate_tokenonly, &rate_overhead, distortion, skippable, ctx, + best_blk_skip); } // K-means clustering. @@ -3688,10 +3976,11 @@ static int rd_pick_palette_intra_sby( } av1_k_means(data, centroids, color_map, rows * cols, n, 1, max_itr); } - palette_rd_y(cpi, x, mbmi, bsize, dc_mode_cost, data, centroids, n, - color_cache, n_cache, best_mbmi, best_palette_color_map, - best_rd, best_model_rd, rate, rate_tokenonly, &rate_overhead, - distortion, skippable, ctx, best_blk_skip); + palette_rd_y(cpi, x, mbmi, bsize, mi_row, mi_col, dc_mode_cost, data, + centroids, n, color_cache, n_cache, best_mbmi, + best_palette_color_map, best_rd, best_model_rd, rate, + rate_tokenonly, &rate_overhead, distortion, skippable, ctx, + best_blk_skip); } } @@ -3705,10 +3994,11 @@ static int rd_pick_palette_intra_sby( // Return 1 if an filter intra mode is selected; return 0 otherwise. static int rd_pick_filter_intra_sby(const AV1_COMP *const cpi, MACROBLOCK *x, - int *rate, int *rate_tokenonly, - int64_t *distortion, int *skippable, - BLOCK_SIZE bsize, int mode_cost, - int64_t *best_rd, int64_t *best_model_rd, + int mi_row, int mi_col, int *rate, + int *rate_tokenonly, int64_t *distortion, + int *skippable, BLOCK_SIZE bsize, + int mode_cost, int64_t *best_rd, + int64_t *best_model_rd, PICK_MODE_CONTEXT *ctx) { MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *mbmi = xd->mi[0]; @@ -3727,7 +4017,7 @@ static int rd_pick_filter_intra_sby(const AV1_COMP *const cpi, MACROBLOCK *x, int64_t this_rd, this_model_rd; RD_STATS tokenonly_rd_stats; mbmi->filter_intra_mode_info.filter_intra_mode = mode; - this_model_rd = intra_model_yrd(cpi, x, bsize, mode_cost); + this_model_rd = intra_model_yrd(cpi, x, bsize, mode_cost, mi_row, mi_col); if (*best_model_rd != INT64_MAX && this_model_rd > *best_model_rd + (*best_model_rd >> 1)) continue; @@ -3770,20 +4060,18 @@ static int rd_pick_filter_intra_sby(const AV1_COMP *const cpi, MACROBLOCK *x, // Run RD calculation with given luma intra prediction angle., and return // the RD cost. Update the best mode info. if the RD cost is the best so far. static int64_t calc_rd_given_intra_angle( - const AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int mode_cost, - int64_t best_rd_in, int8_t angle_delta, int max_angle_delta, int *rate, - RD_STATS *rd_stats, int *best_angle_delta, TX_SIZE *best_tx_size, - int64_t *best_rd, int64_t *best_model_rd, TX_TYPE *best_txk_type, - uint8_t *best_blk_skip) { - int this_rate; + const AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row, + int mi_col, int mode_cost, int64_t best_rd_in, int8_t angle_delta, + int max_angle_delta, int *rate, RD_STATS *rd_stats, int *best_angle_delta, + TX_SIZE *best_tx_size, int64_t *best_rd, int64_t *best_model_rd, + TX_TYPE *best_txk_type, uint8_t *best_blk_skip) { RD_STATS tokenonly_rd_stats; int64_t this_rd, this_model_rd; MB_MODE_INFO *mbmi = x->e_mbd.mi[0]; const int n4 = bsize_to_num_blk(bsize); assert(!is_inter_block(mbmi)); - mbmi->angle_delta[PLANE_TYPE_Y] = angle_delta; - this_model_rd = intra_model_yrd(cpi, x, bsize, mode_cost); + this_model_rd = intra_model_yrd(cpi, x, bsize, mode_cost, mi_row, mi_col); if (*best_model_rd != INT64_MAX && this_model_rd > *best_model_rd + (*best_model_rd >> 1)) return INT64_MAX; @@ -3791,10 +4079,9 @@ static int64_t calc_rd_given_intra_angle( super_block_yrd(cpi, x, &tokenonly_rd_stats, bsize, best_rd_in); if (tokenonly_rd_stats.rate == INT_MAX) return INT64_MAX; - this_rate = - tokenonly_rd_stats.rate + mode_cost + - x->angle_delta_cost[mbmi->mode - V_PRED] - [max_angle_delta + mbmi->angle_delta[PLANE_TYPE_Y]]; + int this_rate = + mode_cost + tokenonly_rd_stats.rate + + x->angle_delta_cost[mbmi->mode - V_PRED][max_angle_delta + angle_delta]; this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist); if (this_rd < *best_rd) { @@ -3815,32 +4102,32 @@ static int64_t calc_rd_given_intra_angle( // With given luma directional intra prediction mode, pick the best angle delta // Return the RD cost corresponding to the best angle delta. static int64_t rd_pick_intra_angle_sby(const AV1_COMP *const cpi, MACROBLOCK *x, - int *rate, RD_STATS *rd_stats, - BLOCK_SIZE bsize, int mode_cost, - int64_t best_rd, + int mi_row, int mi_col, int *rate, + RD_STATS *rd_stats, BLOCK_SIZE bsize, + int mode_cost, int64_t best_rd, int64_t *best_model_rd) { - MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *mbmi = xd->mi[0]; + MB_MODE_INFO *mbmi = x->e_mbd.mi[0]; assert(!is_inter_block(mbmi)); - int i, angle_delta, best_angle_delta = 0; - int first_try = 1; - int64_t this_rd, best_rd_in, rd_cost[2 * (MAX_ANGLE_DELTA + 2)]; + + int best_angle_delta = 0; + int64_t rd_cost[2 * (MAX_ANGLE_DELTA + 2)]; TX_SIZE best_tx_size = mbmi->tx_size; - const int n4 = bsize_to_num_blk(bsize); TX_TYPE best_txk_type[TXK_TYPE_BUF_LEN]; uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE]; - for (i = 0; i < 2 * (MAX_ANGLE_DELTA + 2); ++i) rd_cost[i] = INT64_MAX; + for (int i = 0; i < 2 * (MAX_ANGLE_DELTA + 2); ++i) rd_cost[i] = INT64_MAX; - for (angle_delta = 0; angle_delta <= MAX_ANGLE_DELTA; angle_delta += 2) { - for (i = 0; i < 2; ++i) { - best_rd_in = (best_rd == INT64_MAX) - ? INT64_MAX - : (best_rd + (best_rd >> (first_try ? 3 : 5))); - this_rd = calc_rd_given_intra_angle( - cpi, x, bsize, mode_cost, best_rd_in, (1 - 2 * i) * angle_delta, - MAX_ANGLE_DELTA, rate, rd_stats, &best_angle_delta, &best_tx_size, - &best_rd, best_model_rd, best_txk_type, best_blk_skip); + int first_try = 1; + for (int angle_delta = 0; angle_delta <= MAX_ANGLE_DELTA; angle_delta += 2) { + for (int i = 0; i < 2; ++i) { + const int64_t best_rd_in = + (best_rd == INT64_MAX) ? INT64_MAX + : (best_rd + (best_rd >> (first_try ? 3 : 5))); + const int64_t this_rd = calc_rd_given_intra_angle( + cpi, x, bsize, mi_row, mi_col, mode_cost, best_rd_in, + (1 - 2 * i) * angle_delta, MAX_ANGLE_DELTA, rate, rd_stats, + &best_angle_delta, &best_tx_size, &best_rd, best_model_rd, + best_txk_type, best_blk_skip); rd_cost[2 * angle_delta + i] = this_rd; if (first_try && this_rd == INT64_MAX) return best_rd; first_try = 0; @@ -3852,28 +4139,31 @@ static int64_t rd_pick_intra_angle_sby(const AV1_COMP *const cpi, MACROBLOCK *x, } assert(best_rd != INT64_MAX); - for (angle_delta = 1; angle_delta <= MAX_ANGLE_DELTA; angle_delta += 2) { - int64_t rd_thresh; - for (i = 0; i < 2; ++i) { + for (int angle_delta = 1; angle_delta <= MAX_ANGLE_DELTA; angle_delta += 2) { + for (int i = 0; i < 2; ++i) { int skip_search = 0; - rd_thresh = best_rd + (best_rd >> 5); + const int64_t rd_thresh = best_rd + (best_rd >> 5); if (rd_cost[2 * (angle_delta + 1) + i] > rd_thresh && rd_cost[2 * (angle_delta - 1) + i] > rd_thresh) skip_search = 1; if (!skip_search) { - calc_rd_given_intra_angle( - cpi, x, bsize, mode_cost, best_rd, (1 - 2 * i) * angle_delta, - MAX_ANGLE_DELTA, rate, rd_stats, &best_angle_delta, &best_tx_size, - &best_rd, best_model_rd, best_txk_type, best_blk_skip); + calc_rd_given_intra_angle(cpi, x, bsize, mi_row, mi_col, mode_cost, + best_rd, (1 - 2 * i) * angle_delta, + MAX_ANGLE_DELTA, rate, rd_stats, + &best_angle_delta, &best_tx_size, &best_rd, + best_model_rd, best_txk_type, best_blk_skip); } } } - mbmi->tx_size = best_tx_size; - mbmi->angle_delta[PLANE_TYPE_Y] = best_angle_delta; - memcpy(mbmi->txk_type, best_txk_type, - sizeof(*best_txk_type) * TXK_TYPE_BUF_LEN); - memcpy(x->blk_skip, best_blk_skip, sizeof(best_blk_skip[0]) * n4); + if (rd_stats->rate != INT_MAX) { + mbmi->tx_size = best_tx_size; + mbmi->angle_delta[PLANE_TYPE_Y] = best_angle_delta; + memcpy(mbmi->txk_type, best_txk_type, + sizeof(*best_txk_type) * TXK_TYPE_BUF_LEN); + memcpy(x->blk_skip, best_blk_skip, + sizeof(best_blk_skip[0]) * bsize_to_num_blk(bsize)); + } return best_rd; } @@ -4052,10 +4342,10 @@ static void intra_block_yrd(const AV1_COMP *const cpi, MACROBLOCK *x, // This function is used only for intra_only frames static int64_t rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x, - int *rate, int *rate_tokenonly, - int64_t *distortion, int *skippable, - BLOCK_SIZE bsize, int64_t best_rd, - PICK_MODE_CONTEXT *ctx) { + int mi_row, int mi_col, int *rate, + int *rate_tokenonly, int64_t *distortion, + int *skippable, BLOCK_SIZE bsize, + int64_t best_rd, PICK_MODE_CONTEXT *ctx) { MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = xd->mi[0]; assert(!is_inter_block(mbmi)); @@ -4098,13 +4388,14 @@ static int64_t rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x, MB_MODE_INFO best_mbmi = *mbmi; /* Y Search for intra prediction mode */ - for (int mode_idx = DC_PRED; mode_idx < INTRA_MODES; ++mode_idx) { + for (int mode_idx = INTRA_MODE_START; mode_idx < INTRA_MODE_END; ++mode_idx) { RD_STATS this_rd_stats; int this_rate, this_rate_tokenonly, s; int64_t this_distortion, this_rd, this_model_rd; mbmi->mode = intra_rd_search_mode_order[mode_idx]; mbmi->angle_delta[PLANE_TYPE_Y] = 0; - this_model_rd = intra_model_yrd(cpi, x, bsize, bmode_costs[mbmi->mode]); + this_model_rd = + intra_model_yrd(cpi, x, bsize, bmode_costs[mbmi->mode], mi_row, mi_col); if (best_model_rd != INT64_MAX && this_model_rd > best_model_rd + (best_model_rd >> 1)) continue; @@ -4113,8 +4404,9 @@ static int64_t rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x, if (is_directional_mode && directional_mode_skip_mask[mbmi->mode]) continue; if (is_directional_mode && av1_use_angle_delta(bsize)) { this_rd_stats.rate = INT_MAX; - rd_pick_intra_angle_sby(cpi, x, &this_rate, &this_rd_stats, bsize, - bmode_costs[mbmi->mode], best_rd, &best_model_rd); + rd_pick_intra_angle_sby(cpi, x, mi_row, mi_col, &this_rate, + &this_rd_stats, bsize, bmode_costs[mbmi->mode], + best_rd, &best_model_rd); } else { super_block_yrd(cpi, x, &this_rd_stats, bsize, best_rd); } @@ -4151,16 +4443,16 @@ static int64_t rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x, } if (try_palette) { - rd_pick_palette_intra_sby(cpi, x, bsize, bmode_costs[DC_PRED], &best_mbmi, - best_palette_color_map, &best_rd, &best_model_rd, - rate, rate_tokenonly, distortion, skippable, ctx, - ctx->blk_skip); + rd_pick_palette_intra_sby( + cpi, x, bsize, mi_row, mi_col, bmode_costs[DC_PRED], &best_mbmi, + best_palette_color_map, &best_rd, &best_model_rd, rate, rate_tokenonly, + distortion, skippable, ctx, ctx->blk_skip); } if (beat_best_rd && av1_filter_intra_allowed_bsize(&cpi->common, bsize)) { - if (rd_pick_filter_intra_sby(cpi, x, rate, rate_tokenonly, distortion, - skippable, bsize, bmode_costs[DC_PRED], - &best_rd, &best_model_rd, ctx)) { + if (rd_pick_filter_intra_sby( + cpi, x, mi_row, mi_col, rate, rate_tokenonly, distortion, skippable, + bsize, bmode_costs[DC_PRED], &best_rd, &best_model_rd, ctx)) { best_mbmi = *mbmi; } } @@ -4230,16 +4522,12 @@ static int super_block_uvrd(const AV1_COMP *const cpi, MACROBLOCK *x, static void tx_block_rd_b(const AV1_COMP *cpi, MACROBLOCK *x, TX_SIZE tx_size, int blk_row, int blk_col, int plane, int block, - int plane_bsize, const ENTROPY_CONTEXT *a, - const ENTROPY_CONTEXT *l, RD_STATS *rd_stats, + int plane_bsize, TXB_CTX *txb_ctx, RD_STATS *rd_stats, FAST_TX_SEARCH_MODE ftxs_mode, int64_t ref_rdcost, TXB_RD_INFO *rd_info_array) { const struct macroblock_plane *const p = &x->plane[plane]; - TXB_CTX txb_ctx; - get_txb_ctx(plane_bsize, tx_size, plane, a, l, &txb_ctx); const uint16_t cur_joint_ctx = - (txb_ctx.dc_sign_ctx << 8) + txb_ctx.txb_skip_ctx; - + (txb_ctx->dc_sign_ctx << 8) + txb_ctx->txb_skip_ctx; const int txk_type_idx = av1_get_txk_type_index(plane_bsize, blk_row, blk_col); // Look up RD and terminate early in case when we've already processed exactly @@ -4264,7 +4552,7 @@ static void tx_block_rd_b(const AV1_COMP *cpi, MACROBLOCK *x, TX_SIZE tx_size, RD_STATS this_rd_stats; search_txk_type(cpi, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, - &txb_ctx, ftxs_mode, 0, ref_rdcost, &this_rd_stats); + txb_ctx, ftxs_mode, 0, ref_rdcost, &this_rd_stats); av1_merge_rd_stats(rd_stats, &this_rd_stats); @@ -4428,8 +4716,8 @@ static void try_tx_block_no_split( rd_stats->zero_rate = zero_blk_rate; const int index = av1_get_txb_size_index(plane_bsize, blk_row, blk_col); mbmi->inter_tx_size[index] = tx_size; - tx_block_rd_b(cpi, x, tx_size, blk_row, blk_col, 0, block, plane_bsize, pta, - ptl, rd_stats, ftxs_mode, ref_best_rd, + tx_block_rd_b(cpi, x, tx_size, blk_row, blk_col, 0, block, plane_bsize, + &txb_ctx, rd_stats, ftxs_mode, ref_best_rd, rd_info_node != NULL ? rd_info_node->rd_info_array : NULL); assert(rd_stats->rate < INT_MAX); @@ -4444,12 +4732,12 @@ static void try_tx_block_no_split( rd_stats->rate = zero_blk_rate; rd_stats->dist = rd_stats->sse; rd_stats->skip = 1; - x->blk_skip[blk_row * bw + blk_col] = 1; + set_blk_skip(x, 0, blk_row * bw + blk_col, 1); p->eobs[block] = 0; update_txk_array(mbmi->txk_type, plane_bsize, blk_row, blk_col, tx_size, DCT_DCT); } else { - x->blk_skip[blk_row * bw + blk_col] = 0; + set_blk_skip(x, 0, blk_row * bw + blk_col, 0); rd_stats->skip = 0; } @@ -4482,7 +4770,6 @@ static void try_tx_block_split( MACROBLOCKD *const xd = &x->e_mbd; const int max_blocks_high = max_block_high(xd, plane_bsize, 0); const int max_blocks_wide = max_block_wide(xd, plane_bsize, 0); - struct macroblock_plane *const p = &x->plane[0]; const TX_SIZE sub_txs = sub_tx_size_map[tx_size]; const int bsw = tx_size_wide_unit[sub_txs]; const int bsh = tx_size_high_unit[sub_txs]; @@ -4490,10 +4777,7 @@ static void try_tx_block_split( RD_STATS this_rd_stats; int this_cost_valid = 1; int64_t tmp_rd = 0; -#if CONFIG_DIST_8X8 - int sub8x8_eob[4] = { 0, 0, 0, 0 }; - struct macroblockd_plane *const pd = &xd->plane[0]; -#endif + split_rd_stats->rate = x->txfm_partition_cost[txfm_partition_ctx][1]; assert(tx_size < TX_SIZES_ALL); @@ -4511,123 +4795,22 @@ static void try_tx_block_split( &this_cost_valid, ftxs_mode, (rd_info_node != NULL) ? rd_info_node->children[blk_idx] : NULL); -#if CONFIG_DIST_8X8 - if (!x->using_dist_8x8) -#endif - if (!this_cost_valid) goto LOOP_EXIT; -#if CONFIG_DIST_8X8 - if (x->using_dist_8x8 && tx_size == TX_8X8) { - sub8x8_eob[2 * (r / bsh) + (c / bsw)] = p->eobs[block]; - } -#endif // CONFIG_DIST_8X8 + if (!this_cost_valid) goto LOOP_EXIT; + av1_merge_rd_stats(split_rd_stats, &this_rd_stats); tmp_rd = RDCOST(x->rdmult, split_rd_stats->rate, split_rd_stats->dist); -#if CONFIG_DIST_8X8 - if (!x->using_dist_8x8) -#endif - if (no_split_rd < tmp_rd) { - this_cost_valid = 0; - goto LOOP_EXIT; - } + + if (no_split_rd < tmp_rd) { + this_cost_valid = 0; + goto LOOP_EXIT; + } block += sub_step; } } LOOP_EXIT : {} -#if CONFIG_DIST_8X8 - if (x->using_dist_8x8 && this_cost_valid && tx_size == TX_8X8) { - const int src_stride = p->src.stride; - const int dst_stride = pd->dst.stride; - - const uint8_t *src = - &p->src.buf[(blk_row * src_stride + blk_col) << tx_size_wide_log2[0]]; - const uint8_t *dst = - &pd->dst.buf[(blk_row * dst_stride + blk_col) << tx_size_wide_log2[0]]; - - int64_t dist_8x8; - const int qindex = x->qindex; - const int pred_stride = block_size_wide[plane_bsize]; - const int pred_idx = (blk_row * pred_stride + blk_col) - << tx_size_wide_log2[0]; - const int16_t *pred = &x->pred_luma[pred_idx]; - int i, j; - int row, col; - - uint8_t *pred8; - DECLARE_ALIGNED(16, uint16_t, pred8_16[8 * 8]); - - dist_8x8 = av1_dist_8x8(cpi, x, src, src_stride, dst, dst_stride, BLOCK_8X8, - 8, 8, 8, 8, qindex) * - 16; - -#ifdef DEBUG_DIST_8X8 - if (x->tune_metric == AOM_TUNE_PSNR && xd->bd == 8) - assert(sum_rd_stats.sse == dist_8x8); -#endif // DEBUG_DIST_8X8 - - split_rd_stats->sse = dist_8x8; - - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) - pred8 = CONVERT_TO_BYTEPTR(pred8_16); - else - pred8 = (uint8_t *)pred8_16; - - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - for (row = 0; row < 2; ++row) { - for (col = 0; col < 2; ++col) { - int idx = row * 2 + col; - int eob = sub8x8_eob[idx]; - - if (eob > 0) { - for (j = 0; j < 4; j++) - for (i = 0; i < 4; i++) - CONVERT_TO_SHORTPTR(pred8) - [(row * 4 + j) * 8 + 4 * col + i] = - pred[(row * 4 + j) * pred_stride + 4 * col + i]; - } else { - for (j = 0; j < 4; j++) - for (i = 0; i < 4; i++) - CONVERT_TO_SHORTPTR(pred8) - [(row * 4 + j) * 8 + 4 * col + i] = CONVERT_TO_SHORTPTR( - dst)[(row * 4 + j) * dst_stride + 4 * col + i]; - } - } - } - } else { - for (row = 0; row < 2; ++row) { - for (col = 0; col < 2; ++col) { - int idx = row * 2 + col; - int eob = sub8x8_eob[idx]; - - if (eob > 0) { - for (j = 0; j < 4; j++) - for (i = 0; i < 4; i++) - pred8[(row * 4 + j) * 8 + 4 * col + i] = - (uint8_t)pred[(row * 4 + j) * pred_stride + 4 * col + i]; - } else { - for (j = 0; j < 4; j++) - for (i = 0; i < 4; i++) - pred8[(row * 4 + j) * 8 + 4 * col + i] = - dst[(row * 4 + j) * dst_stride + 4 * col + i]; - } - } - } - } - dist_8x8 = av1_dist_8x8(cpi, x, src, src_stride, pred8, 8, BLOCK_8X8, 8, 8, - 8, 8, qindex) * - 16; - -#ifdef DEBUG_DIST_8X8 - if (x->tune_metric == AOM_TUNE_PSNR && xd->bd == 8) - assert(sum_rd_stats.dist == dist_8x8); -#endif // DEBUG_DIST_8X8 - - split_rd_stats->dist = dist_8x8; - tmp_rd = RDCOST(x->rdmult, split_rd_stats->rate, split_rd_stats->dist); - } -#endif // CONFIG_DIST_8X8 if (this_cost_valid) *split_rd = tmp_rd; } @@ -4660,7 +4843,10 @@ static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, const int try_no_split = 1; int try_split = tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH; - +#if CONFIG_DIST_8X8 + if (x->using_dist_8x8) + try_split &= tx_size_wide[tx_size] >= 16 && tx_size_high[tx_size] >= 16; +#endif TxCandidateInfo no_split = { INT64_MAX, 0, TX_TYPES }; // TX no split @@ -4691,11 +4877,6 @@ static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, } } -#if COLLECT_TX_SIZE_DATA - // Do not skip tx_split when collecting tx size data. - try_split = 1; -#endif - // TX split int64_t split_rd = INT64_MAX; RD_STATS split_rd_stats; @@ -4707,54 +4888,6 @@ static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, rd_info_node, &split_rd_stats, &split_rd); } -#if COLLECT_TX_SIZE_DATA - do { - if (tx_size <= TX_4X4 || depth >= MAX_VARTX_DEPTH) break; - -#if 0 - // Randomly select blocks to collect data to reduce output file size. - const int rnd_val = rand() % 2; - if (rnd_val) break; -#endif - - const int mi_row = -xd->mb_to_top_edge >> (3 + MI_SIZE_LOG2); - const int mi_col = -xd->mb_to_left_edge >> (3 + MI_SIZE_LOG2); - const int within_border = - mi_row >= xd->tile.mi_row_start && - (mi_row + mi_size_high[plane_bsize] < xd->tile.mi_row_end) && - mi_col >= xd->tile.mi_col_start && - (mi_col + mi_size_wide[plane_bsize] < xd->tile.mi_col_end); - if (!within_border) break; - - FILE *fp = fopen(av1_tx_size_data_output_file, "a"); - if (!fp) break; - - // Split decision, RD cost, block type(inter/intra), q-index, rdmult, - // and block size. - const int split_selected = sum_rd < this_rd; - const int is_inter = 1; - const int txb_w = tx_size_wide[tx_size]; - const int txb_h = tx_size_high[tx_size]; - fprintf(fp, "%d,%lld,%lld,%d,%d,%d,%d,%d,", split_selected, - (long long)this_rd, (long long)sum_rd, cpi->common.base_qindex, - x->rdmult, is_inter, txb_w, txb_h); - - // Residue signal. - const int diff_stride = block_size_wide[plane_bsize]; - const int16_t *src_diff = - &p->src_diff[(blk_row * diff_stride + blk_col) * 4]; - for (int r = 0; r < txb_h; ++r) { - for (int c = 0; c < txb_w; ++c) { - fprintf(fp, "%d,", src_diff[c]); - } - src_diff += diff_stride; - } - fprintf(fp, "\n"); - - fclose(fp); - } while (0); -#endif // COLLECT_TX_SIZE_DATA - if (no_split.rd < split_rd) { ENTROPY_CONTEXT *pta = ta + blk_col; ENTROPY_CONTEXT *ptl = tl + blk_row; @@ -4773,7 +4906,7 @@ static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, mbmi->tx_size = tx_size_selected; update_txk_array(mbmi->txk_type, plane_bsize, blk_row, blk_col, tx_size, no_split.tx_type); - x->blk_skip[blk_row * bw + blk_col] = rd_stats->skip; + set_blk_skip(x, 0, blk_row * bw + blk_col, rd_stats->skip); } else { *rd_stats = split_rd_stats; if (split_rd == INT64_MAX) *is_cost_valid = 0; @@ -4787,7 +4920,7 @@ static void select_inter_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x, TXB_RD_INFO_NODE *rd_info_tree) { MACROBLOCKD *const xd = &x->e_mbd; int is_cost_valid = 1; - int64_t this_rd = 0; + int64_t this_rd = 0, skip_rd = 0; if (ref_best_rd < 0) is_cost_valid = 0; @@ -4818,42 +4951,39 @@ static void select_inter_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x, av1_get_entropy_contexts(bsize, pd, ctxa, ctxl); memcpy(tx_above, xd->above_txfm_context, sizeof(TXFM_CONTEXT) * mi_width); memcpy(tx_left, xd->left_txfm_context, sizeof(TXFM_CONTEXT) * mi_height); + const int skip_ctx = av1_get_skip_context(xd); + const int s0 = x->skip_cost[skip_ctx][0]; + const int s1 = x->skip_cost[skip_ctx][1]; + skip_rd = RDCOST(x->rdmult, s1, 0); + this_rd = RDCOST(x->rdmult, s0, 0); for (idy = 0; idy < mi_height; idy += bh) { for (idx = 0; idx < mi_width; idx += bw) { + int64_t best_rd_sofar = (ref_best_rd - (AOMMIN(skip_rd, this_rd))); select_tx_block(cpi, x, idy, idx, block, max_tx_size, init_depth, plane_bsize, ctxa, ctxl, tx_above, tx_left, - &pn_rd_stats, ref_best_rd - this_rd, &is_cost_valid, - ftxs_mode, rd_info_tree); + &pn_rd_stats, best_rd_sofar, &is_cost_valid, ftxs_mode, + rd_info_tree); if (!is_cost_valid || pn_rd_stats.rate == INT_MAX) { av1_invalid_rd_stats(rd_stats); return; } av1_merge_rd_stats(rd_stats, &pn_rd_stats); - this_rd += - AOMMIN(RDCOST(x->rdmult, pn_rd_stats.rate, pn_rd_stats.dist), - RDCOST(x->rdmult, pn_rd_stats.zero_rate, pn_rd_stats.sse)); + skip_rd = RDCOST(x->rdmult, s1, rd_stats->sse); + this_rd = RDCOST(x->rdmult, rd_stats->rate + s0, rd_stats->dist); block += step; if (rd_info_tree != NULL) rd_info_tree += 1; } } + if (skip_rd <= this_rd) { + rd_stats->rate = 0; + rd_stats->dist = rd_stats->sse; + rd_stats->skip = 1; + } else { + rd_stats->skip = 0; + } } - const int skip_ctx = av1_get_skip_context(xd); - const int s0 = x->skip_cost[skip_ctx][0]; - const int s1 = x->skip_cost[skip_ctx][1]; - int64_t skip_rd = RDCOST(x->rdmult, s1, rd_stats->sse); - this_rd = RDCOST(x->rdmult, rd_stats->rate + s0, rd_stats->dist); - if (skip_rd <= this_rd) { - this_rd = skip_rd; - rd_stats->rate = 0; - rd_stats->dist = rd_stats->sse; - rd_stats->skip = 1; - } else { - rd_stats->skip = 0; - } - if (this_rd > ref_best_rd) is_cost_valid = 0; - if (!is_cost_valid) { // reset cost value av1_invalid_rd_stats(rd_stats); @@ -4945,8 +5075,8 @@ static void tx_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, .txb_skip_cost[txb_ctx.txb_skip_ctx][1]; rd_stats->zero_rate = zero_blk_rate; rd_stats->ref_rdcost = ref_best_rd; - tx_block_rd_b(cpi, x, tx_size, blk_row, blk_col, 0, block, plane_bsize, ta, - tl, rd_stats, ftxs_mode, ref_best_rd, NULL); + tx_block_rd_b(cpi, x, tx_size, blk_row, blk_col, 0, block, plane_bsize, + &txb_ctx, rd_stats, ftxs_mode, ref_best_rd, NULL); const int mi_width = block_size_wide[plane_bsize] >> tx_size_wide_log2[0]; if (RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist) >= RDCOST(x->rdmult, zero_blk_rate, rd_stats->sse) || @@ -4954,14 +5084,14 @@ static void tx_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, rd_stats->rate = zero_blk_rate; rd_stats->dist = rd_stats->sse; rd_stats->skip = 1; - x->blk_skip[blk_row * mi_width + blk_col] = 1; + set_blk_skip(x, 0, blk_row * mi_width + blk_col, 1); x->plane[0].eobs[block] = 0; x->plane[0].txb_entropy_ctx[block] = 0; update_txk_array(mbmi->txk_type, plane_bsize, blk_row, blk_col, tx_size, DCT_DCT); } else { rd_stats->skip = 0; - x->blk_skip[blk_row * mi_width + blk_col] = 0; + set_blk_skip(x, 0, blk_row * mi_width + blk_col, 0); } if (tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH) rd_stats->rate += x->txfm_partition_cost[ctx][0]; @@ -5128,12 +5258,13 @@ static void fetch_tx_rd_info(int n4, const MB_RD_INFO *const tx_rd_info, static int find_tx_size_rd_info(TXB_RD_RECORD *cur_record, const uint32_t hash) { // Linear search through the circular buffer to find matching hash. - int index; - for (int i = cur_record->num - 1; i >= 0; i--) { - index = (cur_record->index_start + i) % TX_SIZE_RD_RECORD_BUFFER_LEN; - if (cur_record->hash_vals[index] == hash) return index; + for (int i = cur_record->index_start - 1; i >= 0; i--) { + if (cur_record->hash_vals[i] == hash) return i; } - + for (int i = cur_record->num - 1; i >= cur_record->index_start; i--) { + if (cur_record->hash_vals[i] == hash) return i; + } + int index; // If not found - add new RD info into the buffer and return its index if (cur_record->num < TX_SIZE_RD_RECORD_BUFFER_LEN) { index = (cur_record->index_start + cur_record->num) % @@ -5150,6 +5281,155 @@ static int find_tx_size_rd_info(TXB_RD_RECORD *cur_record, return index; } +typedef struct { + int leaf; + int8_t children[4]; +} RD_RECORD_IDX_NODE; + +static const RD_RECORD_IDX_NODE rd_record_tree_8x8[] = { + { 1, { 0 } }, +}; + +static const RD_RECORD_IDX_NODE rd_record_tree_8x16[] = { + { 0, { 1, 2, -1, -1 } }, + { 1, { 0, 0, 0, 0 } }, + { 1, { 0, 0, 0, 0 } }, +}; + +static const RD_RECORD_IDX_NODE rd_record_tree_16x8[] = { + { 0, { 1, 2, -1, -1 } }, + { 1, { 0 } }, + { 1, { 0 } }, +}; + +static const RD_RECORD_IDX_NODE rd_record_tree_16x16[] = { + { 0, { 1, 2, 3, 4 } }, { 1, { 0 } }, { 1, { 0 } }, { 1, { 0 } }, { 1, { 0 } }, +}; + +static const RD_RECORD_IDX_NODE rd_record_tree_1_2[] = { + { 0, { 1, 2, -1, -1 } }, + { 0, { 3, 4, 5, 6 } }, + { 0, { 7, 8, 9, 10 } }, +}; + +static const RD_RECORD_IDX_NODE rd_record_tree_2_1[] = { + { 0, { 1, 2, -1, -1 } }, + { 0, { 3, 4, 7, 8 } }, + { 0, { 5, 6, 9, 10 } }, +}; + +static const RD_RECORD_IDX_NODE rd_record_tree_sqr[] = { + { 0, { 1, 2, 3, 4 } }, { 0, { 5, 6, 9, 10 } }, { 0, { 7, 8, 11, 12 } }, + { 0, { 13, 14, 17, 18 } }, { 0, { 15, 16, 19, 20 } }, +}; + +static const RD_RECORD_IDX_NODE rd_record_tree_64x128[] = { + { 0, { 2, 3, 4, 5 } }, { 0, { 6, 7, 8, 9 } }, + { 0, { 10, 11, 14, 15 } }, { 0, { 12, 13, 16, 17 } }, + { 0, { 18, 19, 22, 23 } }, { 0, { 20, 21, 24, 25 } }, + { 0, { 26, 27, 30, 31 } }, { 0, { 28, 29, 32, 33 } }, + { 0, { 34, 35, 38, 39 } }, { 0, { 36, 37, 40, 41 } }, +}; + +static const RD_RECORD_IDX_NODE rd_record_tree_128x64[] = { + { 0, { 2, 3, 6, 7 } }, { 0, { 4, 5, 8, 9 } }, + { 0, { 10, 11, 18, 19 } }, { 0, { 12, 13, 20, 21 } }, + { 0, { 14, 15, 22, 23 } }, { 0, { 16, 17, 24, 25 } }, + { 0, { 26, 27, 34, 35 } }, { 0, { 28, 29, 36, 37 } }, + { 0, { 30, 31, 38, 39 } }, { 0, { 32, 33, 40, 41 } }, +}; + +static const RD_RECORD_IDX_NODE rd_record_tree_128x128[] = { + { 0, { 4, 5, 8, 9 } }, { 0, { 6, 7, 10, 11 } }, + { 0, { 12, 13, 16, 17 } }, { 0, { 14, 15, 18, 19 } }, + { 0, { 20, 21, 28, 29 } }, { 0, { 22, 23, 30, 31 } }, + { 0, { 24, 25, 32, 33 } }, { 0, { 26, 27, 34, 35 } }, + { 0, { 36, 37, 44, 45 } }, { 0, { 38, 39, 46, 47 } }, + { 0, { 40, 41, 48, 49 } }, { 0, { 42, 43, 50, 51 } }, + { 0, { 52, 53, 60, 61 } }, { 0, { 54, 55, 62, 63 } }, + { 0, { 56, 57, 64, 65 } }, { 0, { 58, 59, 66, 67 } }, + { 0, { 68, 69, 76, 77 } }, { 0, { 70, 71, 78, 79 } }, + { 0, { 72, 73, 80, 81 } }, { 0, { 74, 75, 82, 83 } }, +}; + +static const RD_RECORD_IDX_NODE rd_record_tree_1_4[] = { + { 0, { 1, -1, 2, -1 } }, + { 0, { 3, 4, -1, -1 } }, + { 0, { 5, 6, -1, -1 } }, +}; + +static const RD_RECORD_IDX_NODE rd_record_tree_4_1[] = { + { 0, { 1, 2, -1, -1 } }, + { 0, { 3, 4, -1, -1 } }, + { 0, { 5, 6, -1, -1 } }, +}; + +static const RD_RECORD_IDX_NODE *rd_record_tree[BLOCK_SIZES_ALL] = { + NULL, // BLOCK_4X4 + NULL, // BLOCK_4X8 + NULL, // BLOCK_8X4 + rd_record_tree_8x8, // BLOCK_8X8 + rd_record_tree_8x16, // BLOCK_8X16 + rd_record_tree_16x8, // BLOCK_16X8 + rd_record_tree_16x16, // BLOCK_16X16 + rd_record_tree_1_2, // BLOCK_16X32 + rd_record_tree_2_1, // BLOCK_32X16 + rd_record_tree_sqr, // BLOCK_32X32 + rd_record_tree_1_2, // BLOCK_32X64 + rd_record_tree_2_1, // BLOCK_64X32 + rd_record_tree_sqr, // BLOCK_64X64 + rd_record_tree_64x128, // BLOCK_64X128 + rd_record_tree_128x64, // BLOCK_128X64 + rd_record_tree_128x128, // BLOCK_128X128 + NULL, // BLOCK_4X16 + NULL, // BLOCK_16X4 + rd_record_tree_1_4, // BLOCK_8X32 + rd_record_tree_4_1, // BLOCK_32X8 + rd_record_tree_1_4, // BLOCK_16X64 + rd_record_tree_4_1, // BLOCK_64X16 +}; + +static const int rd_record_tree_size[BLOCK_SIZES_ALL] = { + 0, // BLOCK_4X4 + 0, // BLOCK_4X8 + 0, // BLOCK_8X4 + sizeof(rd_record_tree_8x8) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_8X8 + sizeof(rd_record_tree_8x16) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_8X16 + sizeof(rd_record_tree_16x8) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_16X8 + sizeof(rd_record_tree_16x16) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_16X16 + sizeof(rd_record_tree_1_2) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_16X32 + sizeof(rd_record_tree_2_1) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_32X16 + sizeof(rd_record_tree_sqr) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_32X32 + sizeof(rd_record_tree_1_2) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_32X64 + sizeof(rd_record_tree_2_1) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_64X32 + sizeof(rd_record_tree_sqr) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_64X64 + sizeof(rd_record_tree_64x128) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_64X128 + sizeof(rd_record_tree_128x64) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_128X64 + sizeof(rd_record_tree_128x128) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_128X128 + 0, // BLOCK_4X16 + 0, // BLOCK_16X4 + sizeof(rd_record_tree_1_4) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_8X32 + sizeof(rd_record_tree_4_1) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_32X8 + sizeof(rd_record_tree_1_4) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_16X64 + sizeof(rd_record_tree_4_1) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_64X16 +}; + +static INLINE void init_rd_record_tree(TXB_RD_INFO_NODE *tree, + BLOCK_SIZE bsize) { + const RD_RECORD_IDX_NODE *rd_record = rd_record_tree[bsize]; + const int size = rd_record_tree_size[bsize]; + for (int i = 0; i < size; ++i) { + if (rd_record[i].leaf) { + av1_zero(tree[i].children); + } else { + for (int j = 0; j < 4; ++j) { + const int8_t idx = rd_record[i].children[j]; + tree[i].children[j] = idx > 0 ? &tree[idx] : NULL; + } + } + } +} + // Go through all TX blocks that could be used in TX size search, compute // residual hash values for them and find matching RD info that stores previous // RD search results for these TX blocks. The idea is to prevent repeated @@ -5168,26 +5448,23 @@ static int find_tx_size_rd_records(MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row, // Hashing is performed only for square TX sizes larger than TX_4X4 if (max_square_tx_size < TX_8X8) return 0; - - const int bw_mi = mi_size_wide[bsize]; const int diff_stride = bw; const struct macroblock_plane *const p = &x->plane[0]; const int16_t *diff = &p->src_diff[0]; - + init_rd_record_tree(dst_rd_info, bsize); // Coordinates of the top-left corner of current block within the superblock // measured in pixels: const int mi_row_in_sb = (mi_row % MAX_MIB_SIZE) << MI_SIZE_LOG2; const int mi_col_in_sb = (mi_col % MAX_MIB_SIZE) << MI_SIZE_LOG2; int cur_rd_info_idx = 0; int cur_tx_depth = 0; - uint8_t parent_idx_buf[MAX_MIB_SIZE * MAX_MIB_SIZE] = { 0 }; - uint8_t child_idx_buf[MAX_MIB_SIZE * MAX_MIB_SIZE] = { 0 }; TX_SIZE cur_tx_size = max_txsize_rect_lookup[bsize]; while (cur_tx_depth <= MAX_VARTX_DEPTH) { const int cur_tx_bw = tx_size_wide[cur_tx_size]; const int cur_tx_bh = tx_size_high[cur_tx_size]; if (cur_tx_bw < 8 || cur_tx_bh < 8) break; const TX_SIZE next_tx_size = sub_tx_size_map[cur_tx_size]; + const int tx_size_idx = cur_tx_size - TX_8X8; for (int row = 0; row < bh; row += cur_tx_bh) { for (int col = 0; col < bw; col += cur_tx_bw) { if (cur_tx_bw != cur_tx_bh) { @@ -5211,48 +5488,13 @@ static int find_tx_size_rd_records(MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row, const int hash = av1_get_crc32c_value(&x->mb_rd_record.crc_calculator, (uint8_t *)hash_data, 2 * cur_tx_bw * cur_tx_bh); - // Find corresponding RD info based on the hash value. - const int rd_record_idx = - row_in_sb * (MAX_MIB_SIZE >> (cur_tx_size + 1 - TX_8X8)) + - col_in_sb; - - int idx = find_tx_size_rd_info( - &rd_records_table[cur_tx_size - TX_8X8][rd_record_idx], hash); + const int record_idx = + row_in_sb * (MAX_MIB_SIZE >> (tx_size_idx + 1)) + col_in_sb; + TXB_RD_RECORD *records = &rd_records_table[tx_size_idx][record_idx]; + int idx = find_tx_size_rd_info(records, hash); dst_rd_info[cur_rd_info_idx].rd_info_array = - &rd_records_table[cur_tx_size - TX_8X8][rd_record_idx] - .tx_rd_info[idx]; - } - - // Update the output quadtree RD info structure. - av1_zero(dst_rd_info[cur_rd_info_idx].children); - const int this_mi_row = row / MI_SIZE; - const int this_mi_col = col / MI_SIZE; - if (cur_tx_depth > 0) { // Set up child pointers. - const int mi_index = this_mi_row * bw_mi + this_mi_col; - const int child_idx = child_idx_buf[mi_index]; - assert(child_idx < 4); - dst_rd_info[parent_idx_buf[mi_index]].children[child_idx] = - &dst_rd_info[cur_rd_info_idx]; - } - if (cur_tx_depth < MAX_VARTX_DEPTH) { // Set up parent and child idx. - const int tx_bh_mi = cur_tx_bh / MI_SIZE; - const int tx_bw_mi = cur_tx_bw / MI_SIZE; - for (int i = this_mi_row; i < this_mi_row + tx_bh_mi; ++i) { - memset(parent_idx_buf + i * bw_mi + this_mi_col, cur_rd_info_idx, - tx_bw_mi); - } - int child_idx = 0; - const int next_tx_bh_mi = tx_size_wide_unit[next_tx_size]; - const int next_tx_bw_mi = tx_size_wide_unit[next_tx_size]; - for (int i = this_mi_row; i < this_mi_row + tx_bh_mi; - i += next_tx_bh_mi) { - for (int j = this_mi_col; j < this_mi_col + tx_bw_mi; - j += next_tx_bw_mi) { - assert(child_idx < 4); - child_idx_buf[i * bw_mi + j] = child_idx++; - } - } + &records->tx_rd_info[idx]; } ++cur_rd_info_idx; } @@ -5300,7 +5542,7 @@ static int predict_skip_flag(MACROBLOCK *x, BLOCK_SIZE bsize, int64_t *dist, const MACROBLOCKD *xd = &x->e_mbd; const int16_t dc_q = av1_dc_quant_QTX(x->qindex, 0, xd->bd); - *dist = pixel_diff_dist(x, 0, 0, 0, bsize, bsize, 1); + *dist = pixel_diff_dist(x, 0, 0, 0, bsize, bsize); const int64_t mse = *dist / bw / bh; // Normalized quantizer takes the transform upscaling factor (8 for tx size // smaller than 32) into account. @@ -5354,7 +5596,7 @@ static void set_skip_flag(MACROBLOCK *x, RD_STATS *rd_stats, int bsize, memset(mbmi->txk_type, DCT_DCT, sizeof(mbmi->txk_type[0]) * TXK_TYPE_BUF_LEN); memset(mbmi->inter_tx_size, tx_size, sizeof(mbmi->inter_tx_size)); mbmi->tx_size = tx_size; - memset(x->blk_skip, 1, sizeof(x->blk_skip[0]) * n4); + for (int i = 0; i < n4; ++i) set_blk_skip(x, 0, i, 1); rd_stats->skip = 1; rd_stats->rate = 0; if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) @@ -5388,17 +5630,21 @@ static void select_tx_type_yrd(const AV1_COMP *cpi, MACROBLOCK *x, int model_rate; int64_t model_dist; int model_skip; - model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &model_rate, &model_dist, - &model_skip, NULL, NULL, NULL, NULL); + model_rd_sb_fn[MODELRD_TYPE_TX_SEARCH_PRUNE]( + cpi, bsize, x, xd, 0, 0, mi_row, mi_col, &model_rate, &model_dist, + &model_skip, NULL, NULL, NULL, NULL); const int64_t model_rd = RDCOST(x->rdmult, model_rate, model_dist); // If the modeled rd is a lot worse than the best so far, breakout. // TODO(debargha, urvang): Improve the model and make the check below // tighter. assert(cpi->sf.model_based_prune_tx_search_level >= 0 && cpi->sf.model_based_prune_tx_search_level <= 2); + static const int prune_factor_by8[] = { 2 + MODELRD_TYPE_TX_SEARCH_PRUNE, + 4 + MODELRD_TYPE_TX_SEARCH_PRUNE }; if (!model_skip && - model_rd / (5 - cpi->sf.model_based_prune_tx_search_level) > - ref_best_rd) + ((model_rd * + prune_factor_by8[cpi->sf.model_based_prune_tx_search_level - 1]) >> + 3) > ref_best_rd) return; } @@ -5431,7 +5677,7 @@ static void select_tx_type_yrd(const AV1_COMP *cpi, MACROBLOCK *x, // Precompute residual hashes and find existing or add new RD records to // store and reuse rate and distortion values to speed up TX size search. - TXB_RD_INFO_NODE matched_rd_info[16 + 64 + 256]; + TXB_RD_INFO_NODE matched_rd_info[4 + 16 + 64]; int found_rd_info = 0; if (ref_best_rd != INT64_MAX && within_border && cpi->sf.use_inter_txb_hash) { found_rd_info = @@ -5479,34 +5725,61 @@ static void tx_block_uvrd(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, assert(plane > 0); assert(tx_size < TX_SIZES_ALL); MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; const int max_blocks_high = max_block_high(xd, plane_bsize, plane); const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane); if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return; ENTROPY_CONTEXT *ta = above_ctx + blk_col; ENTROPY_CONTEXT *tl = left_ctx + blk_row; + TXB_CTX txb_ctx; + get_txb_ctx(plane_bsize, tx_size, plane, ta, tl, &txb_ctx); + const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size); + const int zero_blk_rate = x->coeff_costs[txs_ctx][PLANE_TYPE_UV] + .txb_skip_cost[txb_ctx.txb_skip_ctx][1]; tx_block_rd_b(cpi, x, tx_size, blk_row, blk_col, plane, block, plane_bsize, - ta, tl, rd_stats, ftxs_mode, INT64_MAX, NULL); + &txb_ctx, rd_stats, ftxs_mode, INT64_MAX, NULL); + + const int mi_width = block_size_wide[plane_bsize] >> tx_size_wide_log2[0]; + const int blk_idx = blk_row * mi_width + blk_col; + av1_set_txb_context(x, plane, block, tx_size, ta, tl); + if ((RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist) >= + RDCOST(x->rdmult, zero_blk_rate, rd_stats->sse) || + rd_stats->skip == 1) && + !xd->lossless[mbmi->segment_id]) { + rd_stats->rate = zero_blk_rate; + rd_stats->dist = rd_stats->sse; + } + + // Set chroma blk_skip to 0 + set_blk_skip(x, plane, blk_idx, 0); } // Return value 0: early termination triggered, no valid rd cost available; // 1: rd cost values are valid. static int inter_block_uvrd(const AV1_COMP *cpi, MACROBLOCK *x, RD_STATS *rd_stats, BLOCK_SIZE bsize, - int64_t ref_best_rd, + int64_t non_skip_ref_best_rd, + int64_t skip_ref_best_rd, FAST_TX_SEARCH_MODE ftxs_mode) { MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = xd->mi[0]; int plane; int is_cost_valid = 1; int64_t this_rd = 0; + int64_t skip_rd = 0; - if (ref_best_rd < 0) is_cost_valid = 0; + if ((non_skip_ref_best_rd < 0) && (skip_ref_best_rd < 0)) is_cost_valid = 0; av1_init_rd_stats(rd_stats); - if (x->skip_chroma_rd) return is_cost_valid; + if (x->skip_chroma_rd) { + if (!is_cost_valid) av1_invalid_rd_stats(rd_stats); + + return is_cost_valid; + } + const BLOCK_SIZE bsizec = scale_chroma_bsize( bsize, xd->plane[1].subsampling_x, xd->plane[1].subsampling_y); @@ -5531,36 +5804,31 @@ static int inter_block_uvrd(const AV1_COMP *cpi, MACROBLOCK *x, const int step = bh * bw; ENTROPY_CONTEXT ta[MAX_MIB_SIZE]; ENTROPY_CONTEXT tl[MAX_MIB_SIZE]; - RD_STATS pn_rd_stats; - av1_init_rd_stats(&pn_rd_stats); av1_get_entropy_contexts(bsizec, pd, ta, tl); for (idy = 0; idy < mi_height; idy += bh) { for (idx = 0; idx < mi_width; idx += bw) { + RD_STATS pn_rd_stats; + av1_init_rd_stats(&pn_rd_stats); tx_block_uvrd(cpi, x, idy, idx, plane, block, max_tx_size, plane_bsize, ta, tl, &pn_rd_stats, ftxs_mode); + if (pn_rd_stats.rate == INT_MAX) { + av1_invalid_rd_stats(rd_stats); + return 0; + } + av1_merge_rd_stats(rd_stats, &pn_rd_stats); + this_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); + skip_rd = RDCOST(x->rdmult, 0, rd_stats->sse); + if ((this_rd > non_skip_ref_best_rd) && + (skip_rd > skip_ref_best_rd)) { + av1_invalid_rd_stats(rd_stats); + return 0; + } block += step; } } - - if (pn_rd_stats.rate == INT_MAX) { - is_cost_valid = 0; - break; - } - - av1_merge_rd_stats(rd_stats, &pn_rd_stats); - - this_rd = AOMMIN(RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist), - RDCOST(x->rdmult, rd_stats->zero_rate, rd_stats->sse)); - - if (this_rd > ref_best_rd) { - is_cost_valid = 0; - break; - } } - } - - if (!is_cost_valid) { + } else { // reset cost value av1_invalid_rd_stats(rd_stats); } @@ -6137,9 +6405,9 @@ static INLINE int mv_check_bounds(const MvLimits *mv_limits, const MV *mv) { (mv->col >> 3) > mv_limits->col_max; } -static INLINE int get_single_mode(int this_mode, int ref_idx, - int is_comp_pred) { - int single_mode; +static INLINE PREDICTION_MODE get_single_mode(PREDICTION_MODE this_mode, + int ref_idx, int is_comp_pred) { + PREDICTION_MODE single_mode; if (is_comp_pred) { single_mode = ref_idx ? compound_ref1_mode(this_mode) : compound_ref0_mode(this_mode); @@ -6149,63 +6417,6 @@ static INLINE int get_single_mode(int this_mode, int ref_idx, return single_mode; } -/* If the current mode shares the same mv with other modes with higher prority, - * skip this mode. This priority order is nearest > global > near. */ -static int skip_repeated_mv(const AV1_COMMON *const cm, - const MACROBLOCK *const x, int this_mode, - const MV_REFERENCE_FRAME ref_frames[2]) { - const int is_comp_pred = ref_frames[1] > INTRA_FRAME; - const uint8_t ref_frame_type = av1_ref_frame_type(ref_frames); - const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext; - if (!is_comp_pred) { - if (this_mode == NEARMV) { - if (mbmi_ext->ref_mv_count[ref_frame_type] == 0) { - // NEARMV has the same motion vector as NEARESTMV - return 1; - } - if (mbmi_ext->ref_mv_count[ref_frame_type] == 1 && - cm->global_motion[ref_frames[0]].wmtype <= TRANSLATION) { - // NEARMV has the same motion vector as GLOBALMV - return 1; - } - } - if (this_mode == GLOBALMV) { - if (mbmi_ext->ref_mv_count[ref_frame_type] == 0 && - cm->global_motion[ref_frames[0]].wmtype <= TRANSLATION) { - // GLOBALMV has the same motion vector as NEARESTMV - return 1; - } - } - } else { - for (int i = 0; i < 2; ++i) { - const int single_mode = get_single_mode(this_mode, i, is_comp_pred); - if (single_mode == NEARMV) { - if (mbmi_ext->ref_mv_count[ref_frame_type] == 0) { - // NEARMV has the same motion vector as NEARESTMV in compound mode - return 1; - } - } - } - if (this_mode == NEAR_NEARMV) { - if (mbmi_ext->ref_mv_count[ref_frame_type] == 1 && - cm->global_motion[ref_frames[0]].wmtype <= TRANSLATION && - cm->global_motion[ref_frames[1]].wmtype <= TRANSLATION) { - // NEAR_NEARMV has the same motion vector as GLOBAL_GLOBALMV - return 1; - } - } - if (this_mode == GLOBAL_GLOBALMV) { - if (mbmi_ext->ref_mv_count[ref_frame_type] == 0 && - cm->global_motion[ref_frames[0]].wmtype <= TRANSLATION && - cm->global_motion[ref_frames[1]].wmtype <= TRANSLATION) { - // GLOBAL_GLOBALMV has the same motion vector as NEARST_NEARSTMV - return 1; - } - } - } - return 0; -} - static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int_mv *cur_mv, int mi_row, int mi_col, int_mv *ref_mv_sub8x8[2], @@ -6215,10 +6426,12 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x, const int num_planes = av1_num_planes(cm); const int pw = block_size_wide[bsize]; const int ph = block_size_high[bsize]; + const int plane = 0; MACROBLOCKD *xd = &x->e_mbd; MB_MODE_INFO *mbmi = xd->mi[0]; // This function should only ever be called for compound modes assert(has_second_ref(mbmi)); + const int_mv init_mv[2] = { cur_mv[0], cur_mv[1] }; const int refs[2] = { mbmi->ref_frame[0], mbmi->ref_frame[1] }; int_mv ref_mv[2]; int ite, ref; @@ -6228,11 +6441,16 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x, struct macroblockd_plane *const pd = &xd->plane[0]; const int p_col = ((mi_col * MI_SIZE) >> pd->subsampling_x) + 4 * ic; const int p_row = ((mi_row * MI_SIZE) >> pd->subsampling_y) + 4 * ir; - int is_global[2]; + + ConvolveParams conv_params = get_conv_params(0, plane, xd->bd); + conv_params.use_jnt_comp_avg = 0; + WarpTypesAllowed warp_types[2]; for (ref = 0; ref < 2; ++ref) { const WarpedMotionParams *const wm = &xd->global_motion[xd->mi[0]->ref_frame[ref]]; - is_global[ref] = is_global_mv_block(xd->mi[0], wm->wmtype); + const int is_global = is_global_mv_block(xd->mi[0], wm->wmtype); + warp_types[ref].global_warp_allowed = is_global; + warp_types[ref].local_warp_allowed = mbmi->motion_mode == WARPED_CAUSAL; } // Do joint motion search in compound mode to get more accurate mv. @@ -6244,30 +6462,38 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x, }; // Prediction buffer from second frame. - DECLARE_ALIGNED(16, uint16_t, second_pred_alloc_16[MAX_SB_SQUARE]); - uint8_t *second_pred; + DECLARE_ALIGNED(16, uint8_t, second_pred16[MAX_SB_SQUARE * sizeof(uint16_t)]); + uint8_t *second_pred = get_buf_by_bd(xd, second_pred16); (void)ref_mv_sub8x8; + const int have_newmv = have_nearmv_in_inter_mode(mbmi->mode); + const int ref_mv_idx = mbmi->ref_mv_idx + (have_newmv ? 1 : 0); + MV *const best_mv = &x->best_mv.as_mv; + const int search_range = SEARCH_RANGE_8P; + const int sadpb = x->sadperbit16; // Allow joint search multiple times iteratively for each reference frame // and break out of the search loop if it couldn't find a better mv. for (ite = 0; ite < 4; ite++) { struct buf_2d ref_yv12[2]; int bestsme = INT_MAX; - int sadpb = x->sadperbit16; - MV *const best_mv = &x->best_mv.as_mv; - int search_range = 3; - MvLimits tmp_mv_limits = x->mv_limits; int id = ite % 2; // Even iterations search in the first reference frame, // odd iterations search in the second. The predictor // found for the 'other' reference frame is factored in. - const int plane = 0; - ConvolveParams conv_params = get_conv_params(!id, 0, plane, xd->bd); - conv_params.use_jnt_comp_avg = 0; - WarpTypesAllowed warp_types; - warp_types.global_warp_allowed = is_global[!id]; - warp_types.local_warp_allowed = mbmi->motion_mode == WARPED_CAUSAL; - + if (ite >= 2 && cur_mv[!id].as_int == init_mv[!id].as_int) { + if (cur_mv[id].as_int == init_mv[id].as_int) { + break; + } else { + int_mv cur_int_mv, init_int_mv; + cur_int_mv.as_mv.col = cur_mv[id].as_mv.col >> 3; + cur_int_mv.as_mv.row = cur_mv[id].as_mv.col >> 3; + init_int_mv.as_mv.row = init_mv[id].as_mv.row >> 3; + init_int_mv.as_mv.col = init_mv[id].as_mv.col >> 3; + if (cur_int_mv.as_int == init_int_mv.as_int) { + break; + } + } + } for (ref = 0; ref < 2; ++ref) { ref_mv[ref] = av1_get_ref_mv(x, ref); // Swap out the reference frame for a version that's been scaled to @@ -6294,26 +6520,16 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x, ref_yv12[1] = xd->plane[plane].pre[1]; // Get the prediction block from the 'other' reference frame. - InterpFilters interp_filters = EIGHTTAP_REGULAR; + const InterpFilters interp_filters = EIGHTTAP_REGULAR; // Since we have scaled the reference frames to match the size of the // current frame we must use a unit scaling factor during mode selection. - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - second_pred = CONVERT_TO_BYTEPTR(second_pred_alloc_16); - av1_highbd_build_inter_predictor( - ref_yv12[!id].buf, ref_yv12[!id].stride, second_pred, pw, - &cur_mv[!id].as_mv, &cm->sf_identity, pw, ph, 0, interp_filters, - &warp_types, p_col, p_row, plane, MV_PRECISION_Q3, mi_col * MI_SIZE, - mi_row * MI_SIZE, xd, cm->allow_warped_motion); - } else { - second_pred = (uint8_t *)second_pred_alloc_16; - av1_build_inter_predictor(ref_yv12[!id].buf, ref_yv12[!id].stride, - second_pred, pw, &cur_mv[!id].as_mv, - &cm->sf_identity, pw, ph, &conv_params, - interp_filters, &warp_types, p_col, p_row, - plane, !id, MV_PRECISION_Q3, mi_col * MI_SIZE, - mi_row * MI_SIZE, xd, cm->allow_warped_motion); - } + av1_build_inter_predictor(ref_yv12[!id].buf, ref_yv12[!id].stride, + second_pred, pw, &cur_mv[!id].as_mv, + &cm->sf_identity, pw, ph, &conv_params, + interp_filters, &warp_types[!id], p_col, p_row, + plane, !id, MV_PRECISION_Q3, mi_col * MI_SIZE, + mi_row * MI_SIZE, xd, cm->allow_warped_motion); const int order_idx = id != 0; av1_jnt_comp_weight_assign(cm, mbmi, order_idx, &xd->jcp_param.fwd_offset, @@ -6325,15 +6541,12 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x, av1_set_mv_search_range(&x->mv_limits, &ref_mv[id].as_mv); // Use the mv result from the single mode as mv predictor. - // Use the mv result from the single mode as mv predictor. *best_mv = cur_mv[id].as_mv; best_mv->col >>= 3; best_mv->row >>= 3; - av1_set_mvcost( - x, id, - mbmi->ref_mv_idx + (have_nearmv_in_inter_mode(mbmi->mode) ? 1 : 0)); + av1_set_mvcost(x, id, ref_mv_idx); // Small-range full-pixel motion search. bestsme = av1_refining_search_8p_c(x, sadpb, search_range, @@ -6385,7 +6598,6 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x, // Restore the pointer to the first prediction buffer. if (id) xd->plane[plane].pre[0] = ref_yv12[0]; - if (bestsme < last_besterr[id]) { cur_mv[id].as_mv = *best_mv; last_besterr[id] = bestsme; @@ -6397,10 +6609,7 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x, *rate_mv = 0; for (ref = 0; ref < 2; ++ref) { - av1_set_mvcost( - x, ref, - mbmi->ref_mv_idx + (have_nearmv_in_inter_mode(mbmi->mode) ? 1 : 0)); - + av1_set_mvcost(x, ref, ref_mv_idx); const int_mv curr_ref_mv = av1_get_ref_mv(x, ref); *rate_mv += av1_mv_bit_cost(&cur_mv[ref].as_mv, &curr_ref_mv.as_mv, x->nmvjointcost, x->mvcost, MV_COST_WEIGHT); @@ -6710,16 +6919,16 @@ static void single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x, switch (mbmi->motion_mode) { case SIMPLE_TRANSLATION: - bestsme = av1_full_pixel_search(cpi, x, bsize, &mvp_full, step_param, - sadpb, cond_cost_list(cpi, cost_list), - &ref_mv, INT_MAX, 1, (MI_SIZE * mi_col), - (MI_SIZE * mi_row), 0); + bestsme = av1_full_pixel_search( + cpi, x, bsize, &mvp_full, step_param, cpi->sf.mv.search_method, 0, + sadpb, cond_cost_list(cpi, cost_list), &ref_mv, INT_MAX, 1, + (MI_SIZE * mi_col), (MI_SIZE * mi_row), 0); break; case OBMC_CAUSAL: - bestsme = av1_obmc_full_pixel_diamond( - cpi, x, &mvp_full, step_param, sadpb, - MAX_MVSEARCH_STEPS - 1 - step_param, 1, &cpi->fn_ptr[bsize], &ref_mv, - &(x->best_mv.as_mv), 0); + bestsme = av1_obmc_full_pixel_search(cpi, x, &mvp_full, step_param, sadpb, + MAX_MVSEARCH_STEPS - 1 - step_param, + 1, &cpi->fn_ptr[bsize], &ref_mv, + &(x->best_mv.as_mv), 0); break; default: assert(0 && "Invalid motion mode!\n"); } @@ -6850,25 +7059,17 @@ static void build_second_inter_pred(const AV1_COMP *cpi, MACROBLOCK *x, av1_setup_scale_factors_for_frame(&sf, ref_yv12.width, ref_yv12.height, cm->width, cm->height); - ConvolveParams conv_params = get_conv_params(!ref_idx, 0, plane, xd->bd); + ConvolveParams conv_params = get_conv_params(0, plane, xd->bd); WarpTypesAllowed warp_types; warp_types.global_warp_allowed = is_global; warp_types.local_warp_allowed = mbmi->motion_mode == WARPED_CAUSAL; // Get the prediction block from the 'other' reference frame. - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - av1_highbd_build_inter_predictor( - ref_yv12.buf, ref_yv12.stride, second_pred, pw, other_mv, &sf, pw, ph, - 0, mbmi->interp_filters, &warp_types, p_col, p_row, plane, - MV_PRECISION_Q3, mi_col * MI_SIZE, mi_row * MI_SIZE, xd, - cm->allow_warped_motion); - } else { - av1_build_inter_predictor( - ref_yv12.buf, ref_yv12.stride, second_pred, pw, other_mv, &sf, pw, ph, - &conv_params, mbmi->interp_filters, &warp_types, p_col, p_row, plane, - !ref_idx, MV_PRECISION_Q3, mi_col * MI_SIZE, mi_row * MI_SIZE, xd, - cm->allow_warped_motion); - } + av1_build_inter_predictor(ref_yv12.buf, ref_yv12.stride, second_pred, pw, + other_mv, &sf, pw, ph, &conv_params, + mbmi->interp_filters, &warp_types, p_col, p_row, + plane, !ref_idx, MV_PRECISION_Q3, mi_col * MI_SIZE, + mi_row * MI_SIZE, xd, cm->allow_warped_motion); av1_jnt_comp_weight_assign(cm, mbmi, 0, &xd->jcp_param.fwd_offset, &xd->jcp_param.bck_offset, @@ -6921,7 +7122,7 @@ static void compound_single_motion_search(const AV1_COMP *cpi, MACROBLOCK *x, int bestsme = INT_MAX; int sadpb = x->sadperbit16; MV *const best_mv = &x->best_mv.as_mv; - int search_range = 3; + int search_range = SEARCH_RANGE_8P; MvLimits tmp_mv_limits = x->mv_limits; @@ -7056,12 +7257,12 @@ static void do_masked_motion_search_indexed( // near mv modes to reduce distortion in subsequent blocks and also improve // visual quality. #define NEW_MV_DISCOUNT_FACTOR 8 -static INLINE void get_this_mv(int_mv *this_mv, int this_mode, int ref_idx, - int ref_mv_idx, +static INLINE void get_this_mv(int_mv *this_mv, PREDICTION_MODE this_mode, + int ref_idx, int ref_mv_idx, const MV_REFERENCE_FRAME *ref_frame, const MB_MODE_INFO_EXT *mbmi_ext); static int discount_newmv_test(const AV1_COMP *const cpi, const MACROBLOCK *x, - int this_mode, int_mv this_mv) { + PREDICTION_MODE this_mode, int_mv this_mv) { if (this_mode == NEWMV && this_mv.as_int != 0 && !cpi->rc.is_src_frame_alt_ref) { // Only discount new_mv when nearst_mv and all near_mv are zero, and the @@ -7176,6 +7377,7 @@ static int64_t pick_wedge(const AV1_COMP *const cpi, const MACROBLOCK *const x, const int bw = block_size_wide[bsize]; const int bh = block_size_high[bsize]; const int N = bw * bh; + assert(N >= 64); int rate; int64_t dist; int64_t rd, best_rd = INT64_MAX; @@ -7199,28 +7401,27 @@ static int64_t pick_wedge(const AV1_COMP *const cpi, const MACROBLOCK *const x, (int64_t)aom_sum_squares_i16(residual1, N)) * (1 << WEDGE_WEIGHT_BITS) / 2; int16_t *ds = residual0; - if (N < 64) - av1_wedge_compute_delta_squares_c(ds, residual0, residual1, N); - else - av1_wedge_compute_delta_squares(ds, residual0, residual1, N); + + av1_wedge_compute_delta_squares(ds, residual0, residual1, N); for (wedge_index = 0; wedge_index < wedge_types; ++wedge_index) { mask = av1_get_contiguous_soft_mask(wedge_index, 0, bsize); - // TODO(jingning): Make sse2 functions support N = 16 case - if (N < 64) - wedge_sign = av1_wedge_sign_from_residuals_c(ds, mask, N, sign_limit); - else - wedge_sign = av1_wedge_sign_from_residuals(ds, mask, N, sign_limit); + wedge_sign = av1_wedge_sign_from_residuals(ds, mask, N, sign_limit); mask = av1_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize); - if (N < 64) - sse = av1_wedge_sse_from_residuals_c(residual1, diff10, mask, N); - else - sse = av1_wedge_sse_from_residuals(residual1, diff10, mask, N); + sse = av1_wedge_sse_from_residuals(residual1, diff10, mask, N); sse = ROUND_POWER_OF_TWO(sse, bd_round); - model_rd_from_sse(cpi, xd, bsize, 0, sse, &rate, &dist); + model_rd_sse_fn[MODELRD_TYPE_MASKED_COMPOUND](cpi, x, bsize, 0, sse, N, + &rate, &dist); + // int rate2; + // int64_t dist2; + // model_rd_with_curvfit(cpi, x, bsize, 0, sse, N, &rate2, &dist2); + // printf("sse %"PRId64": leagacy: %d %"PRId64", curvfit %d %"PRId64"\n", + // sse, rate, dist, rate2, dist2); dist = dist2; + // rate = rate2; + rate += x->wedge_idx_cost[bsize][wedge_index]; rd = RDCOST(x->rdmult, rate, dist); @@ -7248,6 +7449,7 @@ static int64_t pick_wedge_fixed_sign(const AV1_COMP *const cpi, const int bw = block_size_wide[bsize]; const int bh = block_size_high[bsize]; const int N = bw * bh; + assert(N >= 64); int rate; int64_t dist; int64_t rd, best_rd = INT64_MAX; @@ -7259,13 +7461,11 @@ static int64_t pick_wedge_fixed_sign(const AV1_COMP *const cpi, const int bd_round = hbd ? (xd->bd - 8) * 2 : 0; for (wedge_index = 0; wedge_index < wedge_types; ++wedge_index) { mask = av1_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize); - if (N < 64) - sse = av1_wedge_sse_from_residuals_c(residual1, diff10, mask, N); - else - sse = av1_wedge_sse_from_residuals(residual1, diff10, mask, N); + sse = av1_wedge_sse_from_residuals(residual1, diff10, mask, N); sse = ROUND_POWER_OF_TWO(sse, bd_round); - model_rd_from_sse(cpi, xd, bsize, 0, sse, &rate, &dist); + model_rd_sse_fn[MODELRD_TYPE_MASKED_COMPOUND](cpi, x, bsize, 0, sse, N, + &rate, &dist); rate += x->wedge_idx_cost[bsize][wedge_index]; rd = RDCOST(x->rdmult, rate, dist); @@ -7317,50 +7517,45 @@ static int64_t pick_interinter_seg(const AV1_COMP *const cpi, MB_MODE_INFO *const mbmi = xd->mi[0]; const int bw = block_size_wide[bsize]; const int bh = block_size_high[bsize]; - const int N = bw * bh; + const int N = 1 << num_pels_log2_lookup[bsize]; int rate; - uint64_t sse; int64_t dist; - int64_t rd0; DIFFWTD_MASK_TYPE cur_mask_type; int64_t best_rd = INT64_MAX; DIFFWTD_MASK_TYPE best_mask_type = 0; const int hbd = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH; const int bd_round = hbd ? (xd->bd - 8) * 2 : 0; + DECLARE_ALIGNED(16, uint8_t, seg_mask[2 * MAX_SB_SQUARE]); + uint8_t *tmp_mask[2] = { xd->seg_mask, seg_mask }; // try each mask type and its inverse for (cur_mask_type = 0; cur_mask_type < DIFFWTD_MASK_TYPES; cur_mask_type++) { // build mask and inverse if (hbd) av1_build_compound_diffwtd_mask_highbd( - xd->seg_mask, cur_mask_type, CONVERT_TO_BYTEPTR(p0), bw, + tmp_mask[cur_mask_type], cur_mask_type, CONVERT_TO_BYTEPTR(p0), bw, CONVERT_TO_BYTEPTR(p1), bw, bh, bw, xd->bd); else - av1_build_compound_diffwtd_mask(xd->seg_mask, cur_mask_type, p0, bw, p1, - bw, bh, bw); + av1_build_compound_diffwtd_mask(tmp_mask[cur_mask_type], cur_mask_type, + p0, bw, p1, bw, bh, bw); // compute rd for mask - sse = av1_wedge_sse_from_residuals(residual1, diff10, xd->seg_mask, N); + uint64_t sse = av1_wedge_sse_from_residuals(residual1, diff10, + tmp_mask[cur_mask_type], N); sse = ROUND_POWER_OF_TWO(sse, bd_round); - model_rd_from_sse(cpi, xd, bsize, 0, sse, &rate, &dist); - rd0 = RDCOST(x->rdmult, rate, dist); + model_rd_sse_fn[MODELRD_TYPE_MASKED_COMPOUND](cpi, x, bsize, 0, sse, N, + &rate, &dist); + const int64_t rd0 = RDCOST(x->rdmult, rate, dist); if (rd0 < best_rd) { best_mask_type = cur_mask_type; best_rd = rd0; } } - - // make final mask mbmi->interinter_comp.mask_type = best_mask_type; - if (hbd) - av1_build_compound_diffwtd_mask_highbd( - xd->seg_mask, mbmi->interinter_comp.mask_type, CONVERT_TO_BYTEPTR(p0), - bw, CONVERT_TO_BYTEPTR(p1), bw, bh, bw, xd->bd); - else - av1_build_compound_diffwtd_mask( - xd->seg_mask, mbmi->interinter_comp.mask_type, p0, bw, p1, bw, bh, bw); - + if (best_mask_type == DIFFWTD_38_INV) { + memcpy(xd->seg_mask, seg_mask, N * 2); + } return best_rd; } @@ -7413,9 +7608,12 @@ static int64_t pick_interinter_mask(const AV1_COMP *const cpi, MACROBLOCK *x, } } -static int interinter_compound_motion_search( - const AV1_COMP *const cpi, MACROBLOCK *x, const int_mv *const cur_mv, - const BLOCK_SIZE bsize, const int this_mode, int mi_row, int mi_col) { +static int interinter_compound_motion_search(const AV1_COMP *const cpi, + MACROBLOCK *x, + const int_mv *const cur_mv, + const BLOCK_SIZE bsize, + const PREDICTION_MODE this_mode, + int mi_row, int mi_col) { MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = xd->mi[0]; int_mv tmp_mv[2]; @@ -7440,11 +7638,40 @@ static int interinter_compound_motion_search( return tmp_rate_mv; } +static void get_inter_predictors_masked_compound( + const AV1_COMP *const cpi, MACROBLOCK *x, const BLOCK_SIZE bsize, + int mi_row, int mi_col, uint8_t **preds0, uint8_t **preds1, + int16_t *residual1, int16_t *diff10, int *strides) { + const AV1_COMMON *cm = &cpi->common; + MACROBLOCKD *xd = &x->e_mbd; + const int bw = block_size_wide[bsize]; + const int bh = block_size_high[bsize]; + int can_use_previous = cm->allow_warped_motion; + // get inter predictors to use for masked compound modes + av1_build_inter_predictors_for_planes_single_buf( + xd, bsize, 0, 0, mi_row, mi_col, 0, preds0, strides, can_use_previous); + av1_build_inter_predictors_for_planes_single_buf( + xd, bsize, 0, 0, mi_row, mi_col, 1, preds1, strides, can_use_previous); + const struct buf_2d *const src = &x->plane[0].src; + if (get_bitdepth_data_path_index(xd)) { + aom_highbd_subtract_block(bh, bw, residual1, bw, src->buf, src->stride, + CONVERT_TO_BYTEPTR(*preds1), bw, xd->bd); + aom_highbd_subtract_block(bh, bw, diff10, bw, CONVERT_TO_BYTEPTR(*preds1), + bw, CONVERT_TO_BYTEPTR(*preds0), bw, xd->bd); + } else { + aom_subtract_block(bh, bw, residual1, bw, src->buf, src->stride, *preds1, + bw); + aom_subtract_block(bh, bw, diff10, bw, *preds1, bw, *preds0, bw); + } +} + static int64_t build_and_cost_compound_type( const AV1_COMP *const cpi, MACROBLOCK *x, const int_mv *const cur_mv, - const BLOCK_SIZE bsize, const int this_mode, int *rs2, int rate_mv, - BUFFER_SET *ctx, int *out_rate_mv, uint8_t **preds0, uint8_t **preds1, - int16_t *residual1, int16_t *diff10, int *strides, int mi_row, int mi_col) { + const BLOCK_SIZE bsize, const PREDICTION_MODE this_mode, int *rs2, + int rate_mv, BUFFER_SET *ctx, int *out_rate_mv, uint8_t **preds0, + uint8_t **preds1, int16_t *residual1, int16_t *diff10, int *strides, + int mi_row, int mi_col, int mode_rate, int64_t ref_best_rd, + int *calc_pred_masked_compound) { const AV1_COMMON *const cm = &cpi->common; MACROBLOCKD *xd = &x->e_mbd; MB_MODE_INFO *const mbmi = xd->mi[0]; @@ -7456,19 +7683,30 @@ static int64_t build_and_cost_compound_type( int64_t tmp_skip_sse_sb; const COMPOUND_TYPE compound_type = mbmi->interinter_comp.type; + if (*calc_pred_masked_compound) { + get_inter_predictors_masked_compound(cpi, x, bsize, mi_row, mi_col, preds0, + preds1, residual1, diff10, strides); + *calc_pred_masked_compound = 0; + } + best_rd_cur = pick_interinter_mask(cpi, x, bsize, *preds0, *preds1, residual1, diff10); *rs2 += get_interinter_compound_mask_rate(x, mbmi); best_rd_cur += RDCOST(x->rdmult, *rs2 + rate_mv, 0); - if (have_newmv_in_inter_mode(this_mode) && - use_masked_motion_search(compound_type)) { + // Although the true rate_mv might be different after motion search, but it + // is unlikely to be the best mode considering the transform rd cost and other + // mode overhead cost + int64_t mode_rd = RDCOST(x->rdmult, *rs2 + mode_rate, 0); + if (mode_rd > ref_best_rd) return INT64_MAX; + + if (have_newmv_in_inter_mode(this_mode) && compound_type == COMPOUND_WEDGE) { *out_rate_mv = interinter_compound_motion_search(cpi, x, cur_mv, bsize, this_mode, mi_row, mi_col); av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, ctx, bsize); - av1_subtract_plane(x, bsize, 0); - model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &rate_sum, &dist_sum, - &tmp_skip_txfm_sb, &tmp_skip_sse_sb, NULL, NULL, NULL); + model_rd_sb_fn[MODELRD_TYPE_MASKED_COMPOUND]( + cpi, bsize, x, xd, 0, 0, mi_row, mi_col, &rate_sum, &dist_sum, + &tmp_skip_txfm_sb, &tmp_skip_sse_sb, NULL, NULL, NULL); rd = RDCOST(x->rdmult, *rs2 + *out_rate_mv + rate_sum, dist_sum); if (rd >= best_rd_cur) { mbmi->mv[0].as_int = cur_mv[0].as_int; @@ -7508,12 +7746,72 @@ typedef struct { int (*single_newmv_valid)[REF_FRAMES]; // Pointer to array of predicted rate-distortion // Should point to first of 2 arrays in 2D array - int64_t (*modelled_rd)[REF_FRAMES]; + int64_t (*modelled_rd)[MAX_REF_MV_SERCH][REF_FRAMES]; InterpFilter single_filter[MB_MODE_COUNT][REF_FRAMES]; int ref_frame_cost; int single_comp_cost; + int64_t (*simple_rd)[MAX_REF_MV_SERCH][REF_FRAMES]; + int skip_motion_mode; + INTERINTRA_MODE *inter_intra_mode; } HandleInterModeArgs; +/* If the current mode shares the same mv with other modes with higher cost, + * skip this mode. */ +static int skip_repeated_mv(const AV1_COMMON *const cm, + const MACROBLOCK *const x, + PREDICTION_MODE this_mode, + const MV_REFERENCE_FRAME ref_frames[2], + InterModeSearchState *search_state) { + const int is_comp_pred = ref_frames[1] > INTRA_FRAME; + const uint8_t ref_frame_type = av1_ref_frame_type(ref_frames); + const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext; + const int ref_mv_count = mbmi_ext->ref_mv_count[ref_frame_type]; + PREDICTION_MODE compare_mode = MB_MODE_COUNT; + if (!is_comp_pred) { + if (this_mode == NEARMV) { + if (ref_mv_count == 0) { + // NEARMV has the same motion vector as NEARESTMV + compare_mode = NEARESTMV; + } + if (ref_mv_count == 1 && + cm->global_motion[ref_frames[0]].wmtype <= TRANSLATION) { + // NEARMV has the same motion vector as GLOBALMV + compare_mode = GLOBALMV; + } + } + if (this_mode == GLOBALMV) { + if (ref_mv_count == 0 && + cm->global_motion[ref_frames[0]].wmtype <= TRANSLATION) { + // GLOBALMV has the same motion vector as NEARESTMV + compare_mode = NEARESTMV; + } + if (ref_mv_count == 1) { + // GLOBALMV has the same motion vector as NEARMV + compare_mode = NEARMV; + } + } + + if (compare_mode != MB_MODE_COUNT) { + // Use modelled_rd to check whether compare mode was searched + if (search_state->modelled_rd[compare_mode][0][ref_frames[0]] != + INT64_MAX) { + const int16_t mode_ctx = + av1_mode_context_analyzer(mbmi_ext->mode_context, ref_frames); + const int compare_cost = cost_mv_ref(x, compare_mode, mode_ctx); + const int this_cost = cost_mv_ref(x, this_mode, mode_ctx); + + // Only skip if the mode cost is larger than compare mode cost + if (this_cost > compare_cost) { + search_state->modelled_rd[this_mode][0][ref_frames[0]] = + search_state->modelled_rd[compare_mode][0][ref_frames[0]]; + return 1; + } + } + } + } + return 0; +} + static INLINE int clamp_and_check_mv(int_mv *out_mv, int_mv in_mv, const AV1_COMMON *cm, const MACROBLOCK *x) { @@ -7640,62 +7938,97 @@ static INLINE int64_t interpolation_filter_rd( const int num_planes = av1_num_planes(cm); MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = xd->mi[0]; - int tmp_rate, tmp_skip_sb = 0; - int64_t tmp_dist, tmp_skip_sse = INT64_MAX; + int tmp_rate[2], tmp_skip_sb[2] = { 1, 1 }; + int64_t tmp_dist[2], tmp_skip_sse[2] = { 0, 0 }; const InterpFilters last_best = mbmi->interp_filters; mbmi->interp_filters = filter_sets[filter_idx]; const int tmp_rs = get_switchable_rate(x, mbmi->interp_filters, switchable_ctx); - if (!skip_pred) { - av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, orig_dst, bsize); - av1_subtract_plane(x, bsize, 0); -#if DNN_BASED_RD_INTERP_FILTER - model_rd_for_sb_with_dnn(cpi, bsize, x, xd, 0, 0, &tmp_rate, &tmp_dist, - &tmp_skip_sb, &tmp_skip_sse, NULL, NULL, NULL); -#else - model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &tmp_rate, &tmp_dist, &tmp_skip_sb, - &tmp_skip_sse, NULL, NULL, NULL); -#endif + assert(skip_pred != 2); + assert((skip_pred >= 0) && (skip_pred <= cpi->default_interp_skip_flags)); + assert(rate[0] >= 0); + assert(dist[0] >= 0); + assert((skip_txfm_sb[0] == 0) || (skip_txfm_sb[0] == 1)); + assert(skip_sse_sb[0] >= 0); + assert(rate[1] >= 0); + assert(dist[1] >= 0); + assert((skip_txfm_sb[1] == 0) || (skip_txfm_sb[1] == 1)); + assert(skip_sse_sb[1] >= 0); + + if (skip_pred != cpi->default_interp_skip_flags) { + if (skip_pred != DEFAULT_LUMA_INTERP_SKIP_FLAG) { + av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, orig_dst, bsize); +#if CONFIG_COLLECT_RD_STATS == 3 + RD_STATS rd_stats_y; + select_tx_type_yrd(cpi, x, &rd_stats_y, bsize, mi_row, mi_col, INT64_MAX); + PrintPredictionUnitStats(cpi, x, &rd_stats_y, bsize); +#endif // CONFIG_COLLECT_RD_STATS == 3 + model_rd_sb_fn[MODELRD_TYPE_INTERP_FILTER]( + cpi, bsize, x, xd, 0, 0, mi_row, mi_col, &tmp_rate[0], &tmp_dist[0], + &tmp_skip_sb[0], &tmp_skip_sse[0], NULL, NULL, NULL); + tmp_rate[1] = tmp_rate[0]; + tmp_dist[1] = tmp_dist[0]; + } else { + // only luma MC is skipped + tmp_rate[1] = rate[0]; + tmp_dist[1] = dist[0]; + } if (num_planes > 1) { - int64_t tmp_y_rd = RDCOST(x->rdmult, tmp_rs + tmp_rate, tmp_dist); - if (tmp_y_rd > *rd) { - mbmi->interp_filters = last_best; - return 0; + for (int plane = 1; plane < num_planes; ++plane) { + int tmp_rate_uv, tmp_skip_sb_uv; + int64_t tmp_dist_uv, tmp_skip_sse_uv; + int64_t tmp_rd = RDCOST(x->rdmult, tmp_rs + tmp_rate[1], tmp_dist[1]); + if (tmp_rd >= *rd) { + mbmi->interp_filters = last_best; + return 0; + } + av1_build_inter_predictors_sbp(cm, xd, mi_row, mi_col, orig_dst, bsize, + plane); + model_rd_sb_fn[MODELRD_TYPE_INTERP_FILTER]( + cpi, bsize, x, xd, plane, plane, mi_row, mi_col, &tmp_rate_uv, + &tmp_dist_uv, &tmp_skip_sb_uv, &tmp_skip_sse_uv, NULL, NULL, NULL); + tmp_rate[1] = + (int)AOMMIN(((int64_t)tmp_rate[1] + (int64_t)tmp_rate_uv), INT_MAX); + tmp_dist[1] += tmp_dist_uv; + tmp_skip_sb[1] &= tmp_skip_sb_uv; + tmp_skip_sse[1] += tmp_skip_sse_uv; } - int tmp_rate_uv, tmp_skip_sb_uv; - int64_t tmp_dist_uv, tmp_skip_sse_uv; - av1_build_inter_predictors_sbuv(cm, xd, mi_row, mi_col, orig_dst, bsize); - for (int plane = 1; plane < num_planes; ++plane) - av1_subtract_plane(x, bsize, plane); -#if DNN_BASED_RD_INTERP_FILTER - model_rd_for_sb_with_dnn(cpi, bsize, x, xd, 1, num_planes - 1, - &tmp_rate_uv, &tmp_dist_uv, &tmp_skip_sb_uv, - &tmp_skip_sse_uv, NULL, NULL, NULL); -#else - model_rd_for_sb(cpi, bsize, x, xd, 1, num_planes - 1, &tmp_rate_uv, - &tmp_dist_uv, &tmp_skip_sb_uv, &tmp_skip_sse_uv, NULL, - NULL, NULL); -#endif - tmp_rate += tmp_rate_uv; - tmp_skip_sb &= tmp_skip_sb_uv; - tmp_dist += tmp_dist_uv; - tmp_skip_sse += tmp_skip_sse_uv; } } else { - tmp_rate = *rate; - tmp_dist = *dist; + // both luma and chroma MC is skipped + tmp_rate[1] = rate[1]; + tmp_dist[1] = dist[1]; } - int64_t tmp_rd = RDCOST(x->rdmult, tmp_rs + tmp_rate, tmp_dist); + int64_t tmp_rd = RDCOST(x->rdmult, tmp_rs + tmp_rate[1], tmp_dist[1]); + if (tmp_rd < *rd) { *rd = tmp_rd; *switchable_rate = tmp_rs; - *skip_txfm_sb = tmp_skip_sb; - *skip_sse_sb = tmp_skip_sse; - *rate = tmp_rate; - *dist = tmp_dist; - if (!skip_pred) { + if (skip_pred != cpi->default_interp_skip_flags) { + if (skip_pred == 0) { + // Overwrite the data as current filter is the best one + tmp_skip_sb[1] = tmp_skip_sb[0] & tmp_skip_sb[1]; + tmp_skip_sse[1] = tmp_skip_sse[0] + tmp_skip_sse[1]; + memcpy(rate, tmp_rate, sizeof(*rate) * 2); + memcpy(dist, tmp_dist, sizeof(*dist) * 2); + memcpy(skip_txfm_sb, tmp_skip_sb, sizeof(*skip_txfm_sb) * 2); + memcpy(skip_sse_sb, tmp_skip_sse, sizeof(*skip_sse_sb) * 2); + // As luma MC data is computed, no need to recompute after the search + x->recalc_luma_mc_data = 0; + } else if (skip_pred == DEFAULT_LUMA_INTERP_SKIP_FLAG) { + // As luma MC data is not computed, update of luma data can be skipped + rate[1] = tmp_rate[1]; + dist[1] = tmp_dist[1]; + skip_txfm_sb[1] = skip_txfm_sb[0] & tmp_skip_sb[1]; + skip_sse_sb[1] = skip_sse_sb[0] + tmp_skip_sse[1]; + // As luma MC data is not recomputed and current filter is the best, + // indicate the possibility of recomputing MC data + // If current buffer contains valid MC data, toggle to indicate that + // luma MC data needs to be recomputed + x->recalc_luma_mc_data ^= 1; + } swap_dst_buf(xd, dst_bufs, num_planes); } return 1; @@ -7715,8 +8048,8 @@ static INLINE int find_best_horiz_interp_filter_rd( int i; const int bw = block_size_wide[bsize]; assert(best_dual_mode == 0); - if ((bw <= 4) && (!skip_hor)) { - int skip_pred = 1; + if ((bw <= 4) && (skip_hor != cpi->default_interp_skip_flags)) { + int skip_pred = cpi->default_interp_skip_flags; // Process the filters in reverse order to enable reusing rate and // distortion (calcuated during EIGHTTAP_REGULAR) for MULTITAP_SHARP for (i = (SWITCHABLE_FILTERS - 1); i >= 1; --i) { @@ -7726,7 +8059,7 @@ static INLINE int find_best_horiz_interp_filter_rd( dist)) { best_dual_mode = i; } - skip_pred = 0; + skip_pred = skip_hor; } } else { for (i = 1; i < SWITCHABLE_FILTERS; ++i) { @@ -7751,8 +8084,8 @@ static INLINE void find_best_vert_interp_filter_rd( int best_dual_mode, int filter_set_size) { int i; const int bh = block_size_high[bsize]; - if ((bh <= 4) && (!skip_ver)) { - int skip_pred = 1; + if ((bh <= 4) && (skip_ver != cpi->default_interp_skip_flags)) { + int skip_pred = cpi->default_interp_skip_flags; // Process the filters in reverse order to enable reusing rate and // distortion (calcuated during EIGHTTAP_REGULAR) for MULTITAP_SHARP assert(filter_set_size == DUAL_FILTER_SET_SIZE); @@ -7762,7 +8095,7 @@ static INLINE void find_best_vert_interp_filter_rd( switchable_rate, skip_txfm_sb, skip_sse_sb, dst_bufs, i, switchable_ctx, skip_pred, rate, dist); - skip_pred = 0; + skip_pred = skip_ver; } } else { for (i = best_dual_mode + SWITCHABLE_FILTERS; i < filter_set_size; @@ -7784,6 +8117,7 @@ static INLINE int is_interp_filter_match(const INTERPOLATION_FILTER_STATS *st, return 0; } } + if (has_second_ref(mi) && st->comp_type != mi->interinter_comp.type) return 0; return 1; } @@ -7806,11 +8140,11 @@ static INLINE void save_interp_filter_search_stat(MACROBLOCK *x, const int comp_idx = mbmi->compound_idx; const int offset = x->interp_filter_stats_idx[comp_idx]; if (offset < MAX_INTERP_FILTER_STATS) { - INTERPOLATION_FILTER_STATS stat = { - mbmi->interp_filters, - { mbmi->mv[0], mbmi->mv[1] }, - { mbmi->ref_frame[0], mbmi->ref_frame[1] }, - }; + INTERPOLATION_FILTER_STATS stat = { mbmi->interp_filters, + { mbmi->mv[0], mbmi->mv[1] }, + { mbmi->ref_frame[0], + mbmi->ref_frame[1] }, + mbmi->interinter_comp.type }; x->interp_filter_stats[comp_idx][offset] = stat; x->interp_filter_stats_idx[comp_idx]++; } @@ -7821,15 +8155,22 @@ static int64_t interpolation_filter_search( int mi_row, int mi_col, const BUFFER_SET *const tmp_dst, BUFFER_SET *const orig_dst, InterpFilter (*const single_filter)[REF_FRAMES], int64_t *const rd, int *const switchable_rate, int *const skip_txfm_sb, - int64_t *const skip_sse_sb) { + int64_t *const skip_sse_sb, const int skip_build_pred, + HandleInterModeArgs *args, int64_t ref_best_rd) { const AV1_COMMON *cm = &cpi->common; const int num_planes = av1_num_planes(cm); MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = xd->mi[0]; const int need_search = av1_is_interp_needed(xd) && av1_is_interp_search_needed(xd); - int i, tmp_rate; - int64_t tmp_dist; + int i; + // Index 0 corresponds to luma rd data and index 1 corresponds to cummulative + // data of all planes + int tmp_rate[2] = { 0, 0 }; + int64_t tmp_dist[2] = { 0, 0 }; + int best_skip_txfm_sb[2] = { 1, 1 }; + int64_t best_skip_sse_sb[2] = { 0, 0 }; + const int ref_frame = xd->mi[0]->ref_frame[0]; (void)single_filter; int match_found = -1; @@ -7845,18 +8186,32 @@ static int64_t interpolation_filter_search( switchable_ctx[1] = av1_get_pred_context_switchable_interp(xd, 1); *switchable_rate = get_switchable_rate(x, mbmi->interp_filters, switchable_ctx); - av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, orig_dst, bsize); - for (int plane = 0; plane < num_planes; ++plane) - av1_subtract_plane(x, bsize, plane); -#if DNN_BASED_RD_INTERP_FILTER - model_rd_for_sb_with_dnn(cpi, bsize, x, xd, 0, num_planes - 1, &tmp_rate, - &tmp_dist, skip_txfm_sb, skip_sse_sb, NULL, NULL, - NULL); -#else - model_rd_for_sb(cpi, bsize, x, xd, 0, num_planes - 1, &tmp_rate, &tmp_dist, - skip_txfm_sb, skip_sse_sb, NULL, NULL, NULL); -#endif // DNN_BASED_RD_INTERP_FILTER - *rd = RDCOST(x->rdmult, *switchable_rate + tmp_rate, tmp_dist); + if (!skip_build_pred) + av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, orig_dst, bsize); + +#if CONFIG_COLLECT_RD_STATS == 3 + RD_STATS rd_stats_y; + select_tx_type_yrd(cpi, x, &rd_stats_y, bsize, mi_row, mi_col, INT64_MAX); + PrintPredictionUnitStats(cpi, x, &rd_stats_y, bsize); +#endif // CONFIG_COLLECT_RD_STATS == 3 + model_rd_sb_fn[MODELRD_TYPE_INTERP_FILTER]( + cpi, bsize, x, xd, 0, 0, mi_row, mi_col, &tmp_rate[0], &tmp_dist[0], + &best_skip_txfm_sb[0], &best_skip_sse_sb[0], NULL, NULL, NULL); + if (num_planes > 1) + model_rd_sb_fn[MODELRD_TYPE_INTERP_FILTER]( + cpi, bsize, x, xd, 1, num_planes - 1, mi_row, mi_col, &tmp_rate[1], + &tmp_dist[1], &best_skip_txfm_sb[1], &best_skip_sse_sb[1], NULL, NULL, + NULL); + tmp_rate[1] = + (int)AOMMIN((int64_t)tmp_rate[0] + (int64_t)tmp_rate[1], INT_MAX); + assert(tmp_rate[1] >= 0); + tmp_dist[1] = tmp_dist[0] + tmp_dist[1]; + best_skip_txfm_sb[1] = best_skip_txfm_sb[0] & best_skip_txfm_sb[1]; + best_skip_sse_sb[1] = best_skip_sse_sb[0] + best_skip_sse_sb[1]; + *rd = RDCOST(x->rdmult, (*switchable_rate + tmp_rate[1]), tmp_dist[1]); + *skip_txfm_sb = best_skip_txfm_sb[1]; + *skip_sse_sb = best_skip_sse_sb[1]; + x->pred_sse[ref_frame] = (unsigned int)(best_skip_sse_sb[0] >> 4); if (assign_filter != SWITCHABLE || match_found != -1) { return 0; @@ -7866,22 +8221,71 @@ static int64_t interpolation_filter_search( av1_broadcast_interp_filter(EIGHTTAP_REGULAR)); return 0; } - int skip_hor = 1; - int skip_ver = 1; + if (args->modelled_rd != NULL) { + if (has_second_ref(mbmi)) { + const int ref_mv_idx = mbmi->ref_mv_idx; + int refs[2] = { mbmi->ref_frame[0], + (mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]) }; + const int mode0 = compound_ref0_mode(mbmi->mode); + const int mode1 = compound_ref1_mode(mbmi->mode); + const int64_t mrd = AOMMIN(args->modelled_rd[mode0][ref_mv_idx][refs[0]], + args->modelled_rd[mode1][ref_mv_idx][refs[1]]); + if ((*rd >> 1) > mrd && ref_best_rd < INT64_MAX) { + return INT64_MAX; + } + } + } + + x->recalc_luma_mc_data = 0; + // skip_flag=xx (in binary form) + // Setting 0th flag corresonds to skipping luma MC and setting 1st bt + // corresponds to skipping chroma MC skip_flag=0 corresponds to "Don't skip + // luma and chroma MC" Skip flag=1 corresponds to "Skip Luma MC only" + // Skip_flag=2 is not a valid case + // skip_flag=3 corresponds to "Skip both luma and chroma MC" + int skip_hor = cpi->default_interp_skip_flags; + int skip_ver = cpi->default_interp_skip_flags; const int is_compound = has_second_ref(mbmi); - for (int k = 0; k < num_planes - 1; ++k) { - struct macroblockd_plane *const pd = &xd->plane[k]; - const int bw = pd->width; - const int bh = pd->height; - for (int j = 0; j < 1 + is_compound; ++j) { - const MV mv = mbmi->mv[j].as_mv; + assert(is_intrabc_block(mbmi) == 0); + for (int j = 0; j < 1 + is_compound; ++j) { + const RefBuffer *ref_buf = &cm->frame_refs[mbmi->ref_frame[j] - LAST_FRAME]; + const struct scale_factors *const sf = &ref_buf->sf; + // TODO(any): Refine skip flag calculation considering scaling + if (av1_is_scaled(sf)) { + skip_hor = 0; + skip_ver = 0; + break; + } + const MV mv = mbmi->mv[j].as_mv; + int skip_hor_plane = 0; + int skip_ver_plane = 0; + for (int k = 0; k < AOMMAX(1, (num_planes - 1)); ++k) { + struct macroblockd_plane *const pd = &xd->plane[k]; + const int bw = pd->width; + const int bh = pd->height; const MV mv_q4 = clamp_mv_to_umv_border_sb( xd, &mv, bw, bh, pd->subsampling_x, pd->subsampling_y); const int sub_x = (mv_q4.col & SUBPEL_MASK) << SCALE_EXTRA_BITS; const int sub_y = (mv_q4.row & SUBPEL_MASK) << SCALE_EXTRA_BITS; - skip_hor &= (sub_x == 0); - skip_ver &= (sub_y == 0); - } + skip_hor_plane |= ((sub_x == 0) << k); + skip_ver_plane |= ((sub_y == 0) << k); + } + skip_hor = skip_hor & skip_hor_plane; + skip_ver = skip_ver & skip_ver_plane; + // It is not valid that "luma MV is sub-pel, whereas chroma MV is not" + assert(skip_hor != 2); + assert(skip_ver != 2); + } + // When compond prediction type is compound segment wedge, luma MC and chroma + // MC need to go hand in hand as mask generated during luma MC is reuired for + // chroma MC. If skip_hor = 0 and skip_ver = 1, mask used for chroma MC during + // vertical filter decision may be incorrect as temporary MC evaluation + // overwrites the mask. Make skip_ver as 0 for this case so that mask is + // populated during luma MC + if (is_compound && mbmi->compound_idx == 1 && + mbmi->interinter_comp.type == COMPOUND_DIFFWTD) { + assert(mbmi->comp_group_idx == 1); + if (skip_hor == 0 && skip_ver == 1) skip_ver = 0; } // do interp_filter search const int filter_set_size = DUAL_FILTER_SET_SIZE; @@ -7895,14 +8299,14 @@ static int64_t interpolation_filter_search( // EIGHTTAP_REGULAR mode is calculated beforehand best_dual_mode = find_best_horiz_interp_filter_rd( x, cpi, bsize, mi_row, mi_col, orig_dst, rd, switchable_rate, - skip_txfm_sb, skip_sse_sb, dst_bufs, switchable_ctx, skip_hor, - &tmp_rate, &tmp_dist, best_dual_mode); + best_skip_txfm_sb, best_skip_sse_sb, dst_bufs, switchable_ctx, skip_hor, + tmp_rate, tmp_dist, best_dual_mode); // From best of horizontal EIGHTTAP_REGULAR modes, check vertical modes find_best_vert_interp_filter_rd( x, cpi, bsize, mi_row, mi_col, orig_dst, rd, switchable_rate, - skip_txfm_sb, skip_sse_sb, dst_bufs, switchable_ctx, skip_ver, - &tmp_rate, &tmp_dist, best_dual_mode, filter_set_size); + best_skip_txfm_sb, best_skip_sse_sb, dst_bufs, switchable_ctx, skip_ver, + tmp_rate, tmp_dist, best_dual_mode, filter_set_size); } else { // EIGHTTAP_REGULAR mode is calculated beforehand for (i = 1; i < filter_set_size; ++i) { @@ -7912,12 +8316,25 @@ static int64_t interpolation_filter_search( if (filter_x != filter_y) continue; } interpolation_filter_rd(x, cpi, bsize, mi_row, mi_col, orig_dst, rd, - switchable_rate, skip_txfm_sb, skip_sse_sb, - dst_bufs, i, switchable_ctx, 0, &tmp_rate, - &tmp_dist); + switchable_rate, best_skip_txfm_sb, + best_skip_sse_sb, dst_bufs, i, switchable_ctx, 0, + tmp_rate, tmp_dist); + assert(x->recalc_luma_mc_data == 0); } } swap_dst_buf(xd, dst_bufs, num_planes); + // Recompute final MC data if required + if (x->recalc_luma_mc_data == 1) { + // Recomputing final luma MC data is required only if the same was skipped + // in either of the directions Condition below is necessary, but not + // sufficient + assert((skip_hor == 1) || (skip_ver == 1)); + av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, orig_dst, bsize); + } + *skip_txfm_sb = best_skip_txfm_sb[1]; + *skip_sse_sb = best_skip_sse_sb[1]; + x->pred_sse[ref_frame] = (unsigned int)(best_skip_sse_sb[0] >> 4); + // save search results if (cpi->sf.skip_repeat_interpolation_filter_search) { assert(match_found == -1); @@ -7926,6 +8343,301 @@ static int64_t interpolation_filter_search( return 0; } +static int txfm_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, + int mi_row, int mi_col, RD_STATS *rd_stats, + RD_STATS *rd_stats_y, RD_STATS *rd_stats_uv, + int mode_rate, int64_t ref_best_rd) { + /* + * This function combines y and uv planes' transform search processes + * together, when the prediction is generated. It first does subtration to + * obtain the prediction error. Then it calls + * select_tx_type_yrd/super_block_yrd and inter_block_uvrd sequentially and + * handles the early terminations happen in those functions. At the end, it + * computes the rd_stats/_y/_uv accordingly. + */ + const AV1_COMMON *cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + int skip_txfm_sb = 0; + const int num_planes = av1_num_planes(cm); + const int ref_frame_1 = mbmi->ref_frame[1]; + const int64_t mode_rd = RDCOST(x->rdmult, mode_rate, 0); + const int64_t rd_thresh = + ref_best_rd == INT64_MAX ? INT64_MAX : ref_best_rd - mode_rd; + const int skip_ctx = av1_get_skip_context(xd); + const int64_t min_header_rate = + mode_rate + AOMMIN(x->skip_cost[skip_ctx][0], x->skip_cost[skip_ctx][1]); + // Account for minimum skip and non_skip rd. + // Eventually either one of them will be added to mode_rate + const int64_t min_header_rd_possible = RDCOST(x->rdmult, min_header_rate, 0); + + if (min_header_rd_possible > ref_best_rd) { + av1_invalid_rd_stats(rd_stats_y); + av1_invalid_rd_stats(rd_stats); + return 0; + } + + av1_init_rd_stats(rd_stats); + av1_init_rd_stats(rd_stats_y); + av1_init_rd_stats(rd_stats_uv); + rd_stats->rate = mode_rate; + + if (!cpi->common.all_lossless) + check_block_skip(cpi, bsize, x, xd, 0, num_planes - 1, &skip_txfm_sb); + if (!skip_txfm_sb) { + int64_t non_skip_rdcosty = INT64_MAX; + int64_t skip_rdcosty = INT64_MAX; + int64_t min_rdcosty = INT64_MAX; + int is_cost_valid_uv = 0; + + // cost and distortion + av1_subtract_plane(x, bsize, 0); + if (cm->tx_mode == TX_MODE_SELECT && !xd->lossless[mbmi->segment_id]) { + // Motion mode + select_tx_type_yrd(cpi, x, rd_stats_y, bsize, mi_row, mi_col, rd_thresh); +#if CONFIG_COLLECT_RD_STATS == 2 + PrintPredictionUnitStats(cpi, x, rd_stats_y, bsize); +#endif // CONFIG_COLLECT_RD_STATS == 2 + } else { + super_block_yrd(cpi, x, rd_stats_y, bsize, rd_thresh); + memset(mbmi->inter_tx_size, mbmi->tx_size, sizeof(mbmi->inter_tx_size)); + for (int i = 0; i < xd->n4_h * xd->n4_w; ++i) + set_blk_skip(x, 0, i, rd_stats_y->skip); + } + + if (rd_stats_y->rate == INT_MAX) { + av1_invalid_rd_stats(rd_stats); + // TODO(angiebird): check if we need this + // restore_dst_buf(xd, *orig_dst, num_planes); + mbmi->ref_frame[1] = ref_frame_1; + return 0; + } + + av1_merge_rd_stats(rd_stats, rd_stats_y); + + non_skip_rdcosty = RDCOST( + x->rdmult, rd_stats->rate + x->skip_cost[skip_ctx][0], rd_stats->dist); + skip_rdcosty = + RDCOST(x->rdmult, mode_rate + x->skip_cost[skip_ctx][1], rd_stats->sse); + min_rdcosty = AOMMIN(non_skip_rdcosty, skip_rdcosty); + + if (min_rdcosty > ref_best_rd) { + int64_t tokenonly_rdy = + AOMMIN(RDCOST(x->rdmult, rd_stats_y->rate, rd_stats_y->dist), + RDCOST(x->rdmult, 0, rd_stats_y->sse)); + // Invalidate rd_stats_y to skip the rest of the motion modes search + if (tokenonly_rdy - (tokenonly_rdy >> cpi->sf.adaptive_txb_search_level) > + rd_thresh) + av1_invalid_rd_stats(rd_stats_y); + mbmi->ref_frame[1] = ref_frame_1; + return 0; + } + + if (num_planes > 1) { + /* clang-format off */ + is_cost_valid_uv = + inter_block_uvrd(cpi, x, rd_stats_uv, bsize, + ref_best_rd - non_skip_rdcosty, + ref_best_rd - skip_rdcosty, FTXS_NONE); + if (!is_cost_valid_uv) { + mbmi->ref_frame[1] = ref_frame_1; + return 0; + } + /* clang-format on */ + av1_merge_rd_stats(rd_stats, rd_stats_uv); + } else { + av1_init_rd_stats(rd_stats_uv); + } + if (rd_stats->skip) { + rd_stats->rate -= rd_stats_uv->rate + rd_stats_y->rate; + rd_stats_y->rate = 0; + rd_stats_uv->rate = 0; + rd_stats->rate += x->skip_cost[skip_ctx][1]; + mbmi->skip = 0; + // here mbmi->skip temporarily plays a role as what this_skip2 does + + int64_t tmprd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); + if (tmprd > ref_best_rd) { + mbmi->ref_frame[1] = ref_frame_1; + return 0; + } + } else if (!xd->lossless[mbmi->segment_id] && + (RDCOST(x->rdmult, + rd_stats_y->rate + rd_stats_uv->rate + + x->skip_cost[skip_ctx][0], + rd_stats->dist) >= + RDCOST(x->rdmult, x->skip_cost[skip_ctx][1], rd_stats->sse))) { + rd_stats->rate -= rd_stats_uv->rate + rd_stats_y->rate; + rd_stats->rate += x->skip_cost[skip_ctx][1]; + rd_stats->dist = rd_stats->sse; + rd_stats_y->rate = 0; + rd_stats_uv->rate = 0; + mbmi->skip = 1; + } else { + rd_stats->rate += x->skip_cost[skip_ctx][0]; + mbmi->skip = 0; + } + } else { + x->skip = 1; + mbmi->tx_size = tx_size_from_tx_mode(bsize, cm->tx_mode); + // The cost of skip bit needs to be added. + mbmi->skip = 0; + rd_stats->rate += x->skip_cost[skip_ctx][1]; + + rd_stats->dist = 0; + rd_stats->sse = 0; + rd_stats_y->rate = 0; + rd_stats_uv->rate = 0; + rd_stats->skip = 1; + int64_t tmprd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); + if (tmprd > ref_best_rd) { + mbmi->ref_frame[1] = ref_frame_1; + return 0; + } + } + return 1; +} + +static int handle_inter_intra_mode(const AV1_COMP *const cpi, + MACROBLOCK *const x, BLOCK_SIZE bsize, + int mi_row, int mi_col, MB_MODE_INFO *mbmi, + HandleInterModeArgs *args, + int64_t ref_best_rd, int *rate_mv, + int *tmp_rate2, BUFFER_SET *orig_dst) { + const AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + MACROBLOCKD *xd = &x->e_mbd; + + INTERINTRA_MODE best_interintra_mode = II_DC_PRED; + int64_t rd, best_interintra_rd = INT64_MAX; + int rmode, rate_sum; + int64_t dist_sum; + int tmp_rate_mv = 0; + int tmp_skip_txfm_sb; + int bw = block_size_wide[bsize]; + int64_t tmp_skip_sse_sb; + DECLARE_ALIGNED(16, uint8_t, tmp_buf_[2 * MAX_INTERINTRA_SB_SQUARE]); + DECLARE_ALIGNED(16, uint8_t, intrapred_[2 * MAX_INTERINTRA_SB_SQUARE]); + uint8_t *tmp_buf = get_buf_by_bd(xd, tmp_buf_); + uint8_t *intrapred = get_buf_by_bd(xd, intrapred_); + const int *const interintra_mode_cost = + x->interintra_mode_cost[size_group_lookup[bsize]]; + const int_mv mv0 = mbmi->mv[0]; + const int is_wedge_used = is_interintra_wedge_used(bsize); + int rwedge = is_wedge_used ? x->wedge_interintra_cost[bsize][0] : 0; + mbmi->ref_frame[1] = NONE_FRAME; + xd->plane[0].dst.buf = tmp_buf; + xd->plane[0].dst.stride = bw; + av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, NULL, bsize); + + restore_dst_buf(xd, *orig_dst, num_planes); + mbmi->ref_frame[1] = INTRA_FRAME; + mbmi->use_wedge_interintra = 0; + best_interintra_mode = args->inter_intra_mode[mbmi->ref_frame[0]]; + int j = 0; + if (cpi->sf.reuse_inter_intra_mode == 0 || + best_interintra_mode == INTERINTRA_MODES) { + for (j = 0; j < INTERINTRA_MODES; ++j) { + mbmi->interintra_mode = (INTERINTRA_MODE)j; + rmode = interintra_mode_cost[mbmi->interintra_mode]; + av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst, + intrapred, bw); + av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw); + model_rd_sb_fn[MODELRD_TYPE_INTERINTRA]( + cpi, bsize, x, xd, 0, 0, mi_row, mi_col, &rate_sum, &dist_sum, + &tmp_skip_txfm_sb, &tmp_skip_sse_sb, NULL, NULL, NULL); + rd = RDCOST(x->rdmult, tmp_rate_mv + rate_sum + rmode, dist_sum); + if (rd < best_interintra_rd) { + best_interintra_rd = rd; + best_interintra_mode = mbmi->interintra_mode; + } + } + args->inter_intra_mode[mbmi->ref_frame[0]] = best_interintra_mode; + } + if (j == 0 || best_interintra_mode != II_SMOOTH_PRED) { + mbmi->interintra_mode = best_interintra_mode; + rmode = interintra_mode_cost[mbmi->interintra_mode]; + av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst, + intrapred, bw); + av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw); + } + rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum, + &tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX); + if (rd != INT64_MAX) + rd = RDCOST(x->rdmult, *rate_mv + rmode + rate_sum + rwedge, dist_sum); + best_interintra_rd = rd; + if (ref_best_rd < INT64_MAX && (best_interintra_rd >> 1) > ref_best_rd) { + return -1; + } + if (is_wedge_used) { + int64_t best_interintra_rd_nowedge = rd; + int64_t best_interintra_rd_wedge = INT64_MAX; + int_mv tmp_mv; + // Disable wedge search if source variance is small + if (x->source_variance > cpi->sf.disable_wedge_search_var_thresh) { + mbmi->use_wedge_interintra = 1; + + rwedge = av1_cost_literal(get_interintra_wedge_bits(bsize)) + + x->wedge_interintra_cost[bsize][1]; + + best_interintra_rd_wedge = + pick_interintra_wedge(cpi, x, bsize, intrapred_, tmp_buf_); + + best_interintra_rd_wedge += + RDCOST(x->rdmult, rmode + *rate_mv + rwedge, 0); + rd = INT64_MAX; + // Refine motion vector. + if (have_newmv_in_inter_mode(mbmi->mode)) { + // get negative of mask + const uint8_t *mask = av1_get_contiguous_soft_mask( + mbmi->interintra_wedge_index, 1, bsize); + tmp_mv = mbmi->mv[0]; + compound_single_motion_search(cpi, x, bsize, &tmp_mv.as_mv, mi_row, + mi_col, intrapred, mask, bw, &tmp_rate_mv, + 0); + if (mbmi->mv[0].as_int != tmp_mv.as_int) { + mbmi->mv[0].as_int = tmp_mv.as_int; + av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, orig_dst, + bsize); + model_rd_sb_fn[MODELRD_TYPE_MASKED_COMPOUND]( + cpi, bsize, x, xd, 0, 0, mi_row, mi_col, &rate_sum, &dist_sum, + &tmp_skip_txfm_sb, &tmp_skip_sse_sb, NULL, NULL, NULL); + rd = RDCOST(x->rdmult, tmp_rate_mv + rmode + rate_sum + rwedge, + dist_sum); + } + } + if (rd >= best_interintra_rd_wedge) { + tmp_mv.as_int = mv0.as_int; + tmp_rate_mv = *rate_mv; + av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw); + } + // Evaluate closer to true rd + rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum, + &tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX); + if (rd != INT64_MAX) + rd = RDCOST(x->rdmult, rmode + tmp_rate_mv + rwedge + rate_sum, + dist_sum); + best_interintra_rd_wedge = rd; + if (best_interintra_rd_wedge < best_interintra_rd_nowedge) { + mbmi->use_wedge_interintra = 1; + mbmi->mv[0].as_int = tmp_mv.as_int; + *tmp_rate2 += tmp_rate_mv - *rate_mv; + *rate_mv = tmp_rate_mv; + } else { + mbmi->use_wedge_interintra = 0; + mbmi->mv[0].as_int = mv0.as_int; + av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, orig_dst, bsize); + } + } else { + mbmi->use_wedge_interintra = 0; + } + } // if (is_interintra_wedge_used(bsize)) + if (num_planes > 1) { + av1_build_inter_predictors_sbuv(cm, xd, mi_row, mi_col, orig_dst, bsize); + } + return 0; +} + // TODO(afergs): Refactor the MBMI references in here - there's four // TODO(afergs): Refactor optional args - add them to a struct or remove static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x, @@ -7933,11 +8645,12 @@ static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x, RD_STATS *rd_stats_y, RD_STATS *rd_stats_uv, int *disable_skip, int mi_row, int mi_col, HandleInterModeArgs *const args, - int64_t ref_best_rd, const int *refs, int rate_mv, - BUFFER_SET *orig_dst + int64_t ref_best_rd, const int *refs, + int *rate_mv, BUFFER_SET *orig_dst #if CONFIG_COLLECT_INTER_MODE_RD_STATS , - int64_t *best_est_rd + TileDataEnc *tile_data, int64_t *best_est_rd, + int do_tx_search, InterModesInfo *inter_modes_info #endif ) { const AV1_COMMON *const cm = &cpi->common; @@ -7946,41 +8659,49 @@ static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x, MB_MODE_INFO *mbmi = xd->mi[0]; const int is_comp_pred = has_second_ref(mbmi); const PREDICTION_MODE this_mode = mbmi->mode; - int rate2_nocoeff = 0, best_xskip, best_disable_skip = 0; + const int rate2_nocoeff = rd_stats->rate; + int best_xskip, best_disable_skip = 0; RD_STATS best_rd_stats, best_rd_stats_y, best_rd_stats_uv; MB_MODE_INFO base_mbmi, best_mbmi; uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE]; + const int rate_mv0 = *rate_mv; + int interintra_allowed = cm->seq_params.enable_interintra_compound && is_interintra_allowed(mbmi) && mbmi->compound_idx; int pts0[SAMPLES_ARRAY_SIZE], pts_inref0[SAMPLES_ARRAY_SIZE]; - int total_samples; - - (void)rate_mv; + assert(mbmi->ref_frame[1] != INTRA_FRAME); + const MV_REFERENCE_FRAME ref_frame_1 = mbmi->ref_frame[1]; av1_invalid_rd_stats(&best_rd_stats); - aom_clear_system_state(); - mbmi->num_proj_ref[0] = findSamples(cm, xd, mi_row, mi_col, pts0, pts_inref0); - total_samples = mbmi->num_proj_ref[0]; - rate2_nocoeff = rd_stats->rate; + mbmi->num_proj_ref = 1; // assume num_proj_ref >=1 + MOTION_MODE last_motion_mode_allowed = SIMPLE_TRANSLATION; + if (cm->switchable_motion_mode) { + last_motion_mode_allowed = motion_mode_allowed(xd->global_motion, xd, mbmi, + cm->allow_warped_motion); + } + if (last_motion_mode_allowed == WARPED_CAUSAL) { + mbmi->num_proj_ref = findSamples(cm, xd, mi_row, mi_col, pts0, pts_inref0); + } + int total_samples = mbmi->num_proj_ref; + if (total_samples == 0) { + last_motion_mode_allowed = OBMC_CAUSAL; + } base_mbmi = *mbmi; - MOTION_MODE last_motion_mode_allowed = - cm->switchable_motion_mode - ? motion_mode_allowed(xd->global_motion, xd, mbmi, - cm->allow_warped_motion) - : SIMPLE_TRANSLATION; - assert(mbmi->ref_frame[1] != INTRA_FRAME); - const MV_REFERENCE_FRAME ref_frame_1 = mbmi->ref_frame[1]; + const int switchable_rate = + av1_is_interp_needed(xd) ? av1_get_switchable_rate(cm, x, xd) : 0; int64_t best_rd = INT64_MAX; - + int best_rate_mv = rate_mv0; for (int mode_index = (int)SIMPLE_TRANSLATION; mode_index <= (int)last_motion_mode_allowed + interintra_allowed; mode_index++) { + if (args->skip_motion_mode && mode_index) continue; int64_t tmp_rd = INT64_MAX; int tmp_rate2 = rate2_nocoeff; int is_interintra_mode = mode_index > (int)last_motion_mode_allowed; int skip_txfm_sb = 0; + int tmp_rate_mv = rate_mv0; *mbmi = base_mbmi; if (is_interintra_mode) { @@ -7995,10 +8716,9 @@ static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x, // The prediction is calculated before motion_mode_rd() is called in // handle_inter_mode() } else if (mbmi->motion_mode == OBMC_CAUSAL) { - mbmi->motion_mode = OBMC_CAUSAL; - if (!is_comp_pred && have_newmv_in_inter_mode(this_mode)) { - int tmp_rate_mv = 0; - + uint32_t cur_mv = mbmi->mv[0].as_int; + assert(!is_comp_pred); + if (have_newmv_in_inter_mode(this_mode)) { single_motion_search(cpi, x, bsize, mi_row, mi_col, 0, &tmp_rate_mv); mbmi->mv[0].as_int = x->best_mv.as_int; #if USE_DISCOUNT_NEWMV_TEST @@ -8006,36 +8726,38 @@ static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x, tmp_rate_mv = AOMMAX((tmp_rate_mv / NEW_MV_DISCOUNT_FACTOR), 1); } #endif - tmp_rate2 = rate2_nocoeff - rate_mv + tmp_rate_mv; + tmp_rate2 = rate2_nocoeff - rate_mv0 + tmp_rate_mv; + } + if (mbmi->mv[0].as_int != cur_mv) { + av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, orig_dst, bsize); } - av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, orig_dst, bsize); av1_build_obmc_inter_prediction( cm, xd, mi_row, mi_col, args->above_pred_buf, args->above_pred_stride, args->left_pred_buf, args->left_pred_stride); } else if (mbmi->motion_mode == WARPED_CAUSAL) { int pts[SAMPLES_ARRAY_SIZE], pts_inref[SAMPLES_ARRAY_SIZE]; mbmi->motion_mode = WARPED_CAUSAL; - mbmi->wm_params[0].wmtype = DEFAULT_WMTYPE; + mbmi->wm_params.wmtype = DEFAULT_WMTYPE; mbmi->interp_filters = av1_broadcast_interp_filter( av1_unswitchable_filter(cm->interp_filter)); memcpy(pts, pts0, total_samples * 2 * sizeof(*pts0)); memcpy(pts_inref, pts_inref0, total_samples * 2 * sizeof(*pts_inref0)); // Select the samples according to motion vector difference - if (mbmi->num_proj_ref[0] > 1) { - mbmi->num_proj_ref[0] = selectSamples( - &mbmi->mv[0].as_mv, pts, pts_inref, mbmi->num_proj_ref[0], bsize); + if (mbmi->num_proj_ref > 1) { + mbmi->num_proj_ref = selectSamples(&mbmi->mv[0].as_mv, pts, pts_inref, + mbmi->num_proj_ref, bsize); } - if (!find_projection(mbmi->num_proj_ref[0], pts, pts_inref, bsize, + if (!find_projection(mbmi->num_proj_ref, pts, pts_inref, bsize, mbmi->mv[0].as_mv.row, mbmi->mv[0].as_mv.col, - &mbmi->wm_params[0], mi_row, mi_col)) { + &mbmi->wm_params, mi_row, mi_col)) { // Refine MV for NEWMV mode - if (!is_comp_pred && have_newmv_in_inter_mode(this_mode)) { - int tmp_rate_mv = 0; + assert(!is_comp_pred); + if (have_newmv_in_inter_mode(this_mode)) { const int_mv mv0 = mbmi->mv[0]; - const WarpedMotionParams wm_params0 = mbmi->wm_params[0]; - int num_proj_ref0 = mbmi->num_proj_ref[0]; + const WarpedMotionParams wm_params0 = mbmi->wm_params; + int num_proj_ref0 = mbmi->num_proj_ref; // Refine MV in a small range. av1_refine_warped_mv(cpi, x, bsize, mi_row, mi_col, pts0, pts_inref0, @@ -8057,12 +8779,12 @@ static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x, tmp_rate_mv = AOMMAX((tmp_rate_mv / NEW_MV_DISCOUNT_FACTOR), 1); } #endif - tmp_rate2 = rate2_nocoeff - rate_mv + tmp_rate_mv; + tmp_rate2 = rate2_nocoeff - rate_mv0 + tmp_rate_mv; } else { // Restore the old MV and WM parameters. mbmi->mv[0] = mv0; - mbmi->wm_params[0] = wm_params0; - mbmi->num_proj_ref[0] = num_proj_ref0; + mbmi->wm_params = wm_params0; + mbmi->num_proj_ref = num_proj_ref0; } } @@ -8071,144 +8793,10 @@ static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x, continue; } } else if (is_interintra_mode) { - INTERINTRA_MODE best_interintra_mode = II_DC_PRED; - int64_t rd, best_interintra_rd = INT64_MAX; - int rmode, rate_sum; - int64_t dist_sum; - int j; - int tmp_rate_mv = 0; - int tmp_skip_txfm_sb; - int bw = block_size_wide[bsize]; - int64_t tmp_skip_sse_sb; - DECLARE_ALIGNED(16, uint8_t, intrapred_[2 * MAX_INTERINTRA_SB_SQUARE]); - DECLARE_ALIGNED(16, uint8_t, tmp_buf_[2 * MAX_INTERINTRA_SB_SQUARE]); - uint8_t *tmp_buf, *intrapred; - const int *const interintra_mode_cost = - x->interintra_mode_cost[size_group_lookup[bsize]]; - - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - tmp_buf = CONVERT_TO_BYTEPTR(tmp_buf_); - intrapred = CONVERT_TO_BYTEPTR(intrapred_); - } else { - tmp_buf = tmp_buf_; - intrapred = intrapred_; - } - const int_mv mv0 = mbmi->mv[0]; - - mbmi->ref_frame[1] = NONE_FRAME; - xd->plane[0].dst.buf = tmp_buf; - xd->plane[0].dst.stride = bw; - av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, NULL, bsize); - - restore_dst_buf(xd, *orig_dst, num_planes); - mbmi->ref_frame[1] = INTRA_FRAME; - mbmi->use_wedge_interintra = 0; - for (j = 0; j < INTERINTRA_MODES; ++j) { - mbmi->interintra_mode = (INTERINTRA_MODE)j; - rmode = interintra_mode_cost[mbmi->interintra_mode]; - av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst, - intrapred, bw); - av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw); - av1_subtract_plane(x, bsize, 0); - model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &rate_sum, &dist_sum, - &tmp_skip_txfm_sb, &tmp_skip_sse_sb, NULL, NULL, NULL); - rd = RDCOST(x->rdmult, tmp_rate_mv + rate_sum + rmode, dist_sum); - if (rd < best_interintra_rd) { - best_interintra_rd = rd; - best_interintra_mode = mbmi->interintra_mode; - } - } - mbmi->interintra_mode = best_interintra_mode; - rmode = interintra_mode_cost[mbmi->interintra_mode]; - av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst, - intrapred, bw); - av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw); - rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum, - &tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX); - if (rd != INT64_MAX) - rd = RDCOST(x->rdmult, rate_mv + rmode + rate_sum, dist_sum); - best_interintra_rd = rd; - - if (ref_best_rd < INT64_MAX && (best_interintra_rd >> 1) > ref_best_rd) { - // restore ref_frame[1] - mbmi->ref_frame[1] = ref_frame_1; - continue; - } - - if (is_interintra_wedge_used(bsize)) { - int64_t best_interintra_rd_nowedge = INT64_MAX; - int64_t best_interintra_rd_wedge = INT64_MAX; - int_mv tmp_mv; - InterpFilters backup_interp_filters = mbmi->interp_filters; - int rwedge = x->wedge_interintra_cost[bsize][0]; - if (rd != INT64_MAX) - rd = RDCOST(x->rdmult, rate_mv + rmode + rate_sum + rwedge, dist_sum); - best_interintra_rd_nowedge = rd; - - // Disable wedge search if source variance is small - if (x->source_variance > cpi->sf.disable_wedge_search_var_thresh) { - mbmi->use_wedge_interintra = 1; - - rwedge = av1_cost_literal(get_interintra_wedge_bits(bsize)) + - x->wedge_interintra_cost[bsize][1]; - - best_interintra_rd_wedge = - pick_interintra_wedge(cpi, x, bsize, intrapred_, tmp_buf_); - - best_interintra_rd_wedge += - RDCOST(x->rdmult, rmode + rate_mv + rwedge, 0); - // Refine motion vector. - if (have_newmv_in_inter_mode(mbmi->mode)) { - // get negative of mask - const uint8_t *mask = av1_get_contiguous_soft_mask( - mbmi->interintra_wedge_index, 1, bsize); - tmp_mv = av1_get_ref_mv(x, 0); - compound_single_motion_search(cpi, x, bsize, &tmp_mv.as_mv, mi_row, - mi_col, intrapred, mask, bw, - &tmp_rate_mv, 0); - mbmi->mv[0].as_int = tmp_mv.as_int; - av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, orig_dst, - bsize); - av1_subtract_plane(x, bsize, 0); - model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &rate_sum, &dist_sum, - &tmp_skip_txfm_sb, &tmp_skip_sse_sb, NULL, NULL, - NULL); - rd = RDCOST(x->rdmult, tmp_rate_mv + rmode + rate_sum + rwedge, - dist_sum); - if (rd >= best_interintra_rd_wedge) { - tmp_mv.as_int = mv0.as_int; - tmp_rate_mv = rate_mv; - mbmi->interp_filters = backup_interp_filters; - av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw); - } - } else { - tmp_mv.as_int = mv0.as_int; - tmp_rate_mv = rate_mv; - av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw); - } - // Evaluate closer to true rd - rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum, - &tmp_skip_txfm_sb, &tmp_skip_sse_sb, - INT64_MAX); - if (rd != INT64_MAX) - rd = RDCOST(x->rdmult, rmode + tmp_rate_mv + rwedge + rate_sum, - dist_sum); - best_interintra_rd_wedge = rd; - if (best_interintra_rd_wedge < best_interintra_rd_nowedge) { - mbmi->use_wedge_interintra = 1; - mbmi->mv[0].as_int = tmp_mv.as_int; - tmp_rate2 += tmp_rate_mv - rate_mv; - } else { - mbmi->use_wedge_interintra = 0; - mbmi->mv[0].as_int = mv0.as_int; - mbmi->interp_filters = backup_interp_filters; - } - } else { - mbmi->use_wedge_interintra = 0; - } - } // if (is_interintra_wedge_used(bsize)) - restore_dst_buf(xd, *orig_dst, num_planes); - av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, orig_dst, bsize); + const int ret = handle_inter_intra_mode( + cpi, x, bsize, mi_row, mi_col, mbmi, args, ref_best_rd, &tmp_rate_mv, + &tmp_rate2, orig_dst); + if (ret < 0) continue; } if (!cpi->common.all_lossless) @@ -8220,8 +8808,7 @@ static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x, rd_stats->sse = 0; rd_stats->skip = 1; rd_stats->rate = tmp_rate2; - if (av1_is_interp_needed(xd)) - rd_stats->rate += av1_get_switchable_rate(cm, x, xd); + if (mbmi->motion_mode != WARPED_CAUSAL) rd_stats->rate += switchable_rate; if (interintra_allowed) { rd_stats->rate += x->interintra_cost[size_group_lookup[bsize]] [mbmi->ref_frame[1] == INTRA_FRAME]; @@ -8246,167 +8833,86 @@ static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x, rd_stats->rate += x->motion_mode_cost1[bsize][mbmi->motion_mode]; } } + if (!skip_txfm_sb) { #if CONFIG_COLLECT_INTER_MODE_RD_STATS int64_t est_rd = 0; int est_skip = 0; - if (cpi->sf.inter_mode_rd_model_estimation) { - InterModeRdModel *md = &inter_mode_rd_models[mbmi->sb_type]; + if (cpi->sf.inter_mode_rd_model_estimation && cm->tile_cols == 1 && + cm->tile_rows == 1) { + InterModeRdModel *md = &tile_data->inter_mode_rd_models[mbmi->sb_type]; if (md->ready) { const int64_t curr_sse = get_sse(cpi, x); - est_rd = - get_est_rd(mbmi->sb_type, x->rdmult, curr_sse, rd_stats->rate); + est_rd = get_est_rd(tile_data, mbmi->sb_type, x->rdmult, curr_sse, + rd_stats->rate); est_skip = est_rd * 0.8 > *best_est_rd; -#if INTER_MODE_RD_TEST - if (est_rd < *best_est_rd) { - *best_est_rd = est_rd; - } -#else // INTER_MODE_RD_TEST if (est_skip) { - ++md->skip_count; mbmi->ref_frame[1] = ref_frame_1; continue; } else { if (est_rd < *best_est_rd) { *best_est_rd = est_rd; } - ++md->non_skip_count; } -#endif // INTER_MODE_RD_TEST } } #endif // CONFIG_COLLECT_INTER_MODE_RD_STATS + } - int64_t rdcosty = INT64_MAX; - int is_cost_valid_uv = 0; - - // cost and distortion - av1_subtract_plane(x, bsize, 0); - if (cm->tx_mode == TX_MODE_SELECT && !xd->lossless[mbmi->segment_id]) { - // Motion mode - select_tx_type_yrd(cpi, x, rd_stats_y, bsize, mi_row, mi_col, - ref_best_rd); -#if CONFIG_COLLECT_RD_STATS == 2 - PrintPredictionUnitStats(cpi, x, rd_stats_y, bsize); -#endif // CONFIG_COLLECT_RD_STATS == 2 +#if CONFIG_COLLECT_INTER_MODE_RD_STATS + if (!do_tx_search) { + const int64_t curr_sse = get_sse(cpi, x); + int est_residue_cost = 0; + int64_t est_dist = 0; + const int has_est_rd = get_est_rate_dist(tile_data, bsize, curr_sse, + &est_residue_cost, &est_dist); + (void)has_est_rd; + assert(has_est_rd); + const int mode_rate = rd_stats->rate; + rd_stats->rate += est_residue_cost; + rd_stats->dist = est_dist; + rd_stats->rdcost = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); + if (cm->reference_mode == SINGLE_REFERENCE) { + if (!is_comp_pred) { + inter_modes_info_push(inter_modes_info, mode_rate, curr_sse, + rd_stats->rdcost, mbmi); + } } else { - super_block_yrd(cpi, x, rd_stats_y, bsize, ref_best_rd); - memset(mbmi->inter_tx_size, mbmi->tx_size, sizeof(mbmi->inter_tx_size)); - memset(x->blk_skip, rd_stats_y->skip, - sizeof(x->blk_skip[0]) * xd->n8_h * xd->n8_w); + inter_modes_info_push(inter_modes_info, mode_rate, curr_sse, + rd_stats->rdcost, mbmi); } - - if (rd_stats_y->rate == INT_MAX) { - av1_invalid_rd_stats(rd_stats); - if (mbmi->motion_mode != SIMPLE_TRANSLATION || - mbmi->ref_frame[1] == INTRA_FRAME) { - mbmi->ref_frame[1] = ref_frame_1; - continue; - } else { - restore_dst_buf(xd, *orig_dst, num_planes); - mbmi->ref_frame[1] = ref_frame_1; + } else { +#endif + int mode_rate = rd_stats->rate; + if (!txfm_search(cpi, x, bsize, mi_row, mi_col, rd_stats, rd_stats_y, + rd_stats_uv, mode_rate, ref_best_rd)) { + if (rd_stats_y->rate == INT_MAX && mode_index == 0) { return INT64_MAX; } + continue; } - - av1_merge_rd_stats(rd_stats, rd_stats_y); - - rdcosty = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); - rdcosty = AOMMIN(rdcosty, RDCOST(x->rdmult, 0, rd_stats->sse)); - if (num_planes > 1) { - /* clang-format off */ - is_cost_valid_uv = - inter_block_uvrd(cpi, x, rd_stats_uv, bsize, ref_best_rd - rdcosty, - FTXS_NONE); - if (!is_cost_valid_uv) { - mbmi->ref_frame[1] = ref_frame_1; - continue; + if (!skip_txfm_sb) { + const int64_t curr_rd = + RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); + if (curr_rd < ref_best_rd) { + ref_best_rd = curr_rd; } - /* clang-format on */ - av1_merge_rd_stats(rd_stats, rd_stats_uv); - } else { - av1_init_rd_stats(rd_stats_uv); - } -#if CONFIG_RD_DEBUG - // record transform block coefficient cost - // TODO(angiebird): So far rd_debug tool only detects discrepancy of - // coefficient cost. Therefore, it is fine to copy rd_stats into mbmi - // here because we already collect the coefficient cost. Move this part to - // other place when we need to compare non-coefficient cost. - mbmi->rd_stats = *rd_stats; -#endif // CONFIG_RD_DEBUG - const int skip_ctx = av1_get_skip_context(xd); - if (rd_stats->skip) { - rd_stats->rate -= rd_stats_uv->rate + rd_stats_y->rate; - rd_stats_y->rate = 0; - rd_stats_uv->rate = 0; - rd_stats->rate += x->skip_cost[skip_ctx][1]; - mbmi->skip = 0; - // here mbmi->skip temporarily plays a role as what this_skip2 does - } else if (!xd->lossless[mbmi->segment_id] && - (RDCOST(x->rdmult, - rd_stats_y->rate + rd_stats_uv->rate + - x->skip_cost[skip_ctx][0], - rd_stats->dist) >= RDCOST(x->rdmult, - x->skip_cost[skip_ctx][1], - rd_stats->sse))) { - rd_stats->rate -= rd_stats_uv->rate + rd_stats_y->rate; - rd_stats->rate += x->skip_cost[skip_ctx][1]; - rd_stats->dist = rd_stats->sse; - rd_stats_y->rate = 0; - rd_stats_uv->rate = 0; - mbmi->skip = 1; - } else { - rd_stats->rate += x->skip_cost[skip_ctx][0]; - mbmi->skip = 0; - } - *disable_skip = 0; + *disable_skip = 0; #if CONFIG_COLLECT_INTER_MODE_RD_STATS - if (cpi->sf.inter_mode_rd_model_estimation && cm->tile_cols == 1 && - cm->tile_rows == 1) { -#if INTER_MODE_RD_TEST - if (md->ready) { - int64_t real_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); - if (est_skip) { - ++md->skip_count; - if (real_rd < ref_best_rd) { - ++md->fp_skip_count; - } - // int fp_skip = real_rd < ref_best_rd; - // printf("est_skip %d fp_skip %d est_rd %ld best_est_rd %ld real_rd - // %ld ref_best_rd %ld\n", - // est_skip, fp_skip, est_rd, *best_est_rd, real_rd, - // ref_best_rd); - } else { - ++md->non_skip_count; - } + if (cpi->sf.inter_mode_rd_model_estimation) { + const int skip_ctx = av1_get_skip_context(xd); + inter_mode_data_push(tile_data, mbmi->sb_type, rd_stats->sse, + rd_stats->dist, + rd_stats_y->rate + rd_stats_uv->rate + + x->skip_cost[skip_ctx][mbmi->skip]); } -#endif // INTER_MODE_RD_TEST - inter_mode_data_push(mbmi->sb_type, rd_stats->sse, rd_stats->dist, - rd_stats_y->rate + rd_stats_uv->rate + - x->skip_cost[skip_ctx][mbmi->skip], - rd_stats->rate, ref_best_rd); - } #endif // CONFIG_COLLECT_INTER_MODE_RD_STATS - int64_t curr_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); - if (curr_rd < ref_best_rd) { - ref_best_rd = curr_rd; + } else { + *disable_skip = 1; } - } else { - x->skip = 1; - *disable_skip = 1; - mbmi->tx_size = tx_size_from_tx_mode(bsize, cm->tx_mode); - - // The cost of skip bit needs to be added. - mbmi->skip = 0; - rd_stats->rate += x->skip_cost[av1_get_skip_context(xd)][1]; - - rd_stats->dist = 0; - rd_stats->sse = 0; - rd_stats_y->rate = 0; - rd_stats_uv->rate = 0; - rd_stats->skip = 1; +#if CONFIG_COLLECT_INTER_MODE_RD_STATS } +#endif if (this_mode == GLOBALMV || this_mode == GLOBAL_GLOBALMV) { if (is_nontrans_global_motion(xd, xd->mi[0])) { @@ -8416,23 +8922,24 @@ static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x, } tmp_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); - if ((mbmi->motion_mode == SIMPLE_TRANSLATION && - mbmi->ref_frame[1] != INTRA_FRAME) || - (tmp_rd < best_rd)) { + if (mode_index == 0) + args->simple_rd[this_mode][mbmi->ref_mv_idx][mbmi->ref_frame[0]] = tmp_rd; + if ((mode_index == 0) || (tmp_rd < best_rd)) { best_mbmi = *mbmi; best_rd = tmp_rd; best_rd_stats = *rd_stats; best_rd_stats_y = *rd_stats_y; + best_rate_mv = tmp_rate_mv; if (num_planes > 1) best_rd_stats_uv = *rd_stats_uv; memcpy(best_blk_skip, x->blk_skip, - sizeof(x->blk_skip[0]) * xd->n8_h * xd->n8_w); + sizeof(x->blk_skip[0]) * xd->n4_h * xd->n4_w); best_xskip = x->skip; best_disable_skip = *disable_skip; if (best_xskip) break; } } mbmi->ref_frame[1] = ref_frame_1; - + *rate_mv = best_rate_mv; if (best_rd == INT64_MAX) { av1_invalid_rd_stats(rd_stats); restore_dst_buf(xd, *orig_dst, num_planes); @@ -8443,7 +8950,7 @@ static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x, *rd_stats_y = best_rd_stats_y; if (num_planes > 1) *rd_stats_uv = best_rd_stats_uv; memcpy(x->blk_skip, best_blk_skip, - sizeof(x->blk_skip[0]) * xd->n8_h * xd->n8_w); + sizeof(x->blk_skip[0]) * xd->n4_h * xd->n4_w); x->skip = best_xskip; *disable_skip = best_disable_skip; @@ -8482,15 +8989,9 @@ static int64_t skip_mode_rd(RD_STATS *rd_stats, const AV1_COMP *const cpi, return 0; } -#ifndef NDEBUG -static INLINE int is_single_inter_mode(int this_mode) { - return this_mode >= SINGLE_INTER_MODE_START && - this_mode < SINGLE_INTER_MODE_END; -} -#endif - -static INLINE int get_ref_mv_offset(int single_mode, uint8_t ref_mv_idx) { - assert(is_single_inter_mode(single_mode)); +static INLINE int get_ref_mv_offset(PREDICTION_MODE single_mode, + uint8_t ref_mv_idx) { + assert(is_inter_singleref_mode(single_mode)); int ref_mv_offset; if (single_mode == NEARESTMV) { ref_mv_offset = 0; @@ -8502,14 +9003,15 @@ static INLINE int get_ref_mv_offset(int single_mode, uint8_t ref_mv_idx) { return ref_mv_offset; } -static INLINE void get_this_mv(int_mv *this_mv, int this_mode, int ref_idx, - int ref_mv_idx, +static INLINE void get_this_mv(int_mv *this_mv, PREDICTION_MODE this_mode, + int ref_idx, int ref_mv_idx, const MV_REFERENCE_FRAME *ref_frame, const MB_MODE_INFO_EXT *mbmi_ext) { const uint8_t ref_frame_type = av1_ref_frame_type(ref_frame); const int is_comp_pred = ref_frame[1] > INTRA_FRAME; - const int single_mode = get_single_mode(this_mode, ref_idx, is_comp_pred); - assert(is_single_inter_mode(single_mode)); + const PREDICTION_MODE single_mode = + get_single_mode(this_mode, ref_idx, is_comp_pred); + assert(is_inter_singleref_mode(single_mode)); if (single_mode == NEWMV) { this_mv->as_int = INVALID_MV; } else if (single_mode == GLOBALMV) { @@ -8533,7 +9035,7 @@ static INLINE void get_this_mv(int_mv *this_mv, int this_mode, int ref_idx, } // This function update the non-new mv for the current prediction mode -static INLINE int build_cur_mv(int_mv *cur_mv, int this_mode, +static INLINE int build_cur_mv(int_mv *cur_mv, PREDICTION_MODE this_mode, const AV1_COMMON *cm, const MACROBLOCK *x) { const MACROBLOCKD *xd = &x->e_mbd; const MB_MODE_INFO *mbmi = xd->mi[0]; @@ -8543,7 +9045,8 @@ static INLINE int build_cur_mv(int_mv *cur_mv, int this_mode, int_mv this_mv; get_this_mv(&this_mv, this_mode, i, mbmi->ref_mv_idx, mbmi->ref_frame, x->mbmi_ext); - const int single_mode = get_single_mode(this_mode, i, is_comp_pred); + const PREDICTION_MODE single_mode = + get_single_mode(this_mode, i, is_comp_pred); if (single_mode == NEWMV) { cur_mv[i] = this_mv; } else { @@ -8584,18 +9087,29 @@ static INLINE int get_drl_cost(const MB_MODE_INFO *mbmi, return cost; } -static INLINE int compound_type_rd(const AV1_COMP *const cpi, MACROBLOCK *x, - BLOCK_SIZE bsize, int mi_col, int mi_row, - int_mv *cur_mv, int masked_compound_used, - BUFFER_SET *orig_dst, BUFFER_SET *tmp_dst, - int *rate_mv, int64_t *rd, - RD_STATS *rd_stats, int64_t ref_best_rd) { +// Struct for buffers used by compound_type_rd() function. +// For sizes and alignment of these arrays, refer to +// alloc_compound_type_rd_buffers() function. +typedef struct { + uint8_t *pred0; + uint8_t *pred1; + int16_t *residual1; // src - pred1 + int16_t *diff10; // pred1 - pred0 + uint8_t *tmp_best_mask_buf; // backup of the best segmentation mask +} CompoundTypeRdBuffers; + +static int compound_type_rd(const AV1_COMP *const cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, int mi_col, int mi_row, + int_mv *cur_mv, int masked_compound_used, + BUFFER_SET *orig_dst, const BUFFER_SET *tmp_dst, + CompoundTypeRdBuffers *buffers, int *rate_mv, + int64_t *rd, RD_STATS *rd_stats, + int64_t ref_best_rd) { const AV1_COMMON *cm = &cpi->common; MACROBLOCKD *xd = &x->e_mbd; MB_MODE_INFO *mbmi = xd->mi[0]; - const int this_mode = mbmi->mode; + const PREDICTION_MODE this_mode = mbmi->mode; const int bw = block_size_wide[bsize]; - const int bh = block_size_high[bsize]; int rate_sum, rs2; int64_t dist_sum; @@ -8605,45 +9119,19 @@ static INLINE int compound_type_rd(const AV1_COMP *const cpi, MACROBLOCK *x, int64_t tmp_skip_sse_sb; INTERINTER_COMPOUND_DATA best_compound_data; best_compound_data.type = COMPOUND_AVERAGE; - DECLARE_ALIGNED(16, uint8_t, pred0[2 * MAX_SB_SQUARE]); - DECLARE_ALIGNED(16, uint8_t, pred1[2 * MAX_SB_SQUARE]); - DECLARE_ALIGNED(32, int16_t, residual1[MAX_SB_SQUARE]); // src - pred1 - DECLARE_ALIGNED(32, int16_t, diff10[MAX_SB_SQUARE]); // pred1 - pred0 - uint8_t tmp_best_mask_buf[2 * MAX_SB_SQUARE]; - uint8_t *preds0[1] = { pred0 }; - uint8_t *preds1[1] = { pred1 }; + uint8_t *preds0[1] = { buffers->pred0 }; + uint8_t *preds1[1] = { buffers->pred1 }; int strides[1] = { bw }; int tmp_rate_mv; const int num_pix = 1 << num_pels_log2_lookup[bsize]; const int mask_len = 2 * num_pix * sizeof(uint8_t); COMPOUND_TYPE cur_type; int best_compmode_interinter_cost = 0; - int can_use_previous = cm->allow_warped_motion; + int calc_pred_masked_compound = 1; best_mv[0].as_int = cur_mv[0].as_int; best_mv[1].as_int = cur_mv[1].as_int; *rd = INT64_MAX; - if (masked_compound_used) { - // get inter predictors to use for masked compound modes - av1_build_inter_predictors_for_planes_single_buf( - xd, bsize, 0, 0, mi_row, mi_col, 0, preds0, strides, can_use_previous); - av1_build_inter_predictors_for_planes_single_buf( - xd, bsize, 0, 0, mi_row, mi_col, 1, preds1, strides, can_use_previous); - const struct buf_2d *const src = &x->plane[0].src; - if (get_bitdepth_data_path_index(xd)) { - aom_highbd_subtract_block(bh, bw, residual1, bw, src->buf, src->stride, - CONVERT_TO_BYTEPTR(pred1), bw, xd->bd); - aom_highbd_subtract_block(bh, bw, diff10, bw, CONVERT_TO_BYTEPTR(pred1), - bw, CONVERT_TO_BYTEPTR(pred0), bw, xd->bd); - } else { - aom_subtract_block(bh, bw, residual1, bw, src->buf, src->stride, pred1, - bw); - aom_subtract_block(bh, bw, diff10, bw, pred1, bw, pred0, bw); - } - } - const int orig_is_best = xd->plane[0].dst.buf == orig_dst->plane[0]; - const BUFFER_SET *backup_buf = orig_is_best ? tmp_dst : orig_dst; - const BUFFER_SET *best_buf = orig_is_best ? orig_dst : tmp_dst; for (cur_type = COMPOUND_AVERAGE; cur_type < COMPOUND_TYPES; cur_type++) { if (cur_type != COMPOUND_AVERAGE && !masked_compound_used) break; if (!is_interinter_compound_used(cur_type, bsize)) continue; @@ -8662,17 +9150,17 @@ static INLINE int compound_type_rd(const AV1_COMP *const cpi, MACROBLOCK *x, } masked_type_cost += x->comp_idx_cost[comp_index_ctx][1]; rs2 = masked_type_cost; - // No need to call av1_build_inter_predictors_sby here - // 1. COMPOUND_AVERAGE is always the first candidate - // 2. av1_build_inter_predictors_sby has been called by - // interpolation_filter_search - int64_t est_rd = - estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum, - &tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX); + const int64_t mode_rd = RDCOST(x->rdmult, rs2 + rd_stats->rate, 0); + if (mode_rd < ref_best_rd) { + av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, orig_dst, bsize); + int64_t est_rd = + estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum, + &tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX); + if (est_rd != INT64_MAX) + best_rd_cur = RDCOST(x->rdmult, rs2 + *rate_mv + rate_sum, dist_sum); + } // use spare buffer for following compound type try - restore_dst_buf(xd, *backup_buf, 1); - if (est_rd != INT64_MAX) - best_rd_cur = RDCOST(x->rdmult, rs2 + *rate_mv + rate_sum, dist_sum); + restore_dst_buf(xd, *tmp_dst, 1); } else { mbmi->comp_group_idx = 1; masked_type_cost += x->comp_group_idx_cost[comp_group_idx_ctx][1]; @@ -8682,19 +9170,20 @@ static INLINE int compound_type_rd(const AV1_COMP *const cpi, MACROBLOCK *x, *rd / 3 < ref_best_rd) { best_rd_cur = build_and_cost_compound_type( cpi, x, cur_mv, bsize, this_mode, &rs2, *rate_mv, orig_dst, - &tmp_rate_mv, preds0, preds1, residual1, diff10, strides, mi_row, - mi_col); + &tmp_rate_mv, preds0, preds1, buffers->residual1, buffers->diff10, + strides, mi_row, mi_col, rd_stats->rate, ref_best_rd, + &calc_pred_masked_compound); } } if (best_rd_cur < *rd) { *rd = best_rd_cur; best_compound_data = mbmi->interinter_comp; if (masked_compound_used && cur_type != COMPOUND_TYPES - 1) { - memcpy(tmp_best_mask_buf, xd->seg_mask, mask_len); + memcpy(buffers->tmp_best_mask_buf, xd->seg_mask, mask_len); } best_compmode_interinter_cost = rs2; if (have_newmv_in_inter_mode(this_mode)) { - if (use_masked_motion_search(cur_type)) { + if (cur_type == COMPOUND_WEDGE) { best_tmp_rate_mv = tmp_rate_mv; best_mv[0].as_int = mbmi->mv[0].as_int; best_mv[1].as_int = mbmi->mv[1].as_int; @@ -8712,28 +9201,69 @@ static INLINE int compound_type_rd(const AV1_COMP *const cpi, MACROBLOCK *x, mbmi->comp_group_idx = (best_compound_data.type == COMPOUND_AVERAGE) ? 0 : 1; mbmi->interinter_comp = best_compound_data; - memcpy(xd->seg_mask, tmp_best_mask_buf, mask_len); + memcpy(xd->seg_mask, buffers->tmp_best_mask_buf, mask_len); } if (have_newmv_in_inter_mode(this_mode)) { mbmi->mv[0].as_int = best_mv[0].as_int; mbmi->mv[1].as_int = best_mv[1].as_int; - if (use_masked_motion_search(mbmi->interinter_comp.type)) { + if (mbmi->interinter_comp.type == COMPOUND_WEDGE) { rd_stats->rate += best_tmp_rate_mv - *rate_mv; *rate_mv = best_tmp_rate_mv; } } - restore_dst_buf(xd, *best_buf, 1); + restore_dst_buf(xd, *orig_dst, 1); return best_compmode_interinter_cost; } +static INLINE int is_single_newmv_valid(HandleInterModeArgs *args, + MB_MODE_INFO *mbmi, + PREDICTION_MODE this_mode) { + for (int ref_idx = 0; ref_idx < 2; ++ref_idx) { + const PREDICTION_MODE single_mode = get_single_mode(this_mode, ref_idx, 1); + const MV_REFERENCE_FRAME ref = mbmi->ref_frame[ref_idx]; + if (single_mode == NEWMV && + args->single_newmv_valid[mbmi->ref_mv_idx][ref] == 0) { + return 0; + } + } + return 1; +} + +static int get_drl_refmv_count(const MACROBLOCK *const x, + const MV_REFERENCE_FRAME *ref_frame, + PREDICTION_MODE mode) { + MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext; + const int8_t ref_frame_type = av1_ref_frame_type(ref_frame); + const int has_nearmv = have_nearmv_in_inter_mode(mode) ? 1 : 0; + const int ref_mv_count = mbmi_ext->ref_mv_count[ref_frame_type]; + const int only_newmv = (mode == NEWMV || mode == NEW_NEWMV); + const int has_drl = + (has_nearmv && ref_mv_count > 2) || (only_newmv && ref_mv_count > 1); + const int ref_set = + has_drl ? AOMMIN(MAX_REF_MV_SERCH, ref_mv_count - has_nearmv) : 1; + + return ref_set; +} + +typedef struct { + int64_t rd; + int drl_cost; + int rate_mv; + int_mv mv; +} inter_mode_info; + static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bsize, RD_STATS *rd_stats, RD_STATS *rd_stats_y, RD_STATS *rd_stats_uv, int *disable_skip, int mi_row, int mi_col, - HandleInterModeArgs *args, int64_t ref_best_rd + HandleInterModeArgs *args, int64_t ref_best_rd, + uint8_t *const tmp_buf, + CompoundTypeRdBuffers *rd_buffers #if CONFIG_COLLECT_INTER_MODE_RD_STATS , - int64_t *best_est_rd + TileDataEnc *tile_data, int64_t *best_est_rd, + const int do_tx_search, + InterModesInfo *inter_modes_info #endif ) { const AV1_COMMON *cm = &cpi->common; @@ -8742,15 +9272,26 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x, MB_MODE_INFO *mbmi = xd->mi[0]; MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext; const int is_comp_pred = has_second_ref(mbmi); - const int this_mode = mbmi->mode; + const PREDICTION_MODE this_mode = mbmi->mode; int i; int refs[2] = { mbmi->ref_frame[0], (mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]) }; int rate_mv = 0; - DECLARE_ALIGNED(32, uint8_t, tmp_buf_[2 * MAX_MB_PLANE * MAX_SB_SQUARE]); - uint8_t *tmp_buf = get_buf_by_bd(xd, tmp_buf_); int64_t rd = INT64_MAX; - BUFFER_SET orig_dst, tmp_dst; + + // do first prediction into the destination buffer. Do the next + // prediction into a temporary buffer. Then keep track of which one + // of these currently holds the best predictor, and use the other + // one for future predictions. In the end, copy from tmp_buf to + // dst if necessary. + struct macroblockd_plane *p = xd->plane; + BUFFER_SET orig_dst = { + { p[0].dst.buf, p[1].dst.buf, p[2].dst.buf }, + { p[0].dst.stride, p[1].dst.stride, p[2].dst.stride }, + }; + const BUFFER_SET tmp_dst = { { tmp_buf, tmp_buf + 1 * MAX_SB_SQUARE, + tmp_buf + 2 * MAX_SB_SQUARE }, + { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE } }; int skip_txfm_sb = 0; int64_t skip_sse_sb = INT64_MAX; @@ -8765,36 +9306,29 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x, MB_MODE_INFO best_mbmi = *mbmi; int best_disable_skip; int best_xskip; - int plane_rate[MAX_MB_PLANE] = { 0 }; - int64_t plane_sse[MAX_MB_PLANE] = { 0 }; - int64_t plane_dist[MAX_MB_PLANE] = { 0 }; int64_t newmv_ret_val = INT64_MAX; int_mv backup_mv[2] = { { 0 } }; int backup_rate_mv = 0; + inter_mode_info mode_info[MAX_REF_MV_SERCH]; int comp_idx; const int search_jnt_comp = is_comp_pred & cm->seq_params.enable_jnt_comp & (mbmi->mode != GLOBAL_GLOBALMV); - const int has_drl = (have_nearmv_in_inter_mode(mbmi->mode) && - mbmi_ext->ref_mv_count[ref_frame_type] > 2) || - ((mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV) && - mbmi_ext->ref_mv_count[ref_frame_type] > 1); - // TODO(jingning): This should be deprecated shortly. - const int idx_offset = have_nearmv_in_inter_mode(mbmi->mode) ? 1 : 0; - const int ref_set = - has_drl ? AOMMIN(MAX_REF_MV_SERCH, - mbmi_ext->ref_mv_count[ref_frame_type] - idx_offset) - : 1; + const int has_nearmv = have_nearmv_in_inter_mode(mbmi->mode) ? 1 : 0; + const int ref_set = get_drl_refmv_count(x, mbmi->ref_frame, this_mode); for (int ref_mv_idx = 0; ref_mv_idx < ref_set; ++ref_mv_idx) { + mode_info[ref_mv_idx].mv.as_int = INVALID_MV; + mode_info[ref_mv_idx].rd = INT64_MAX; + if (cpi->sf.reduce_inter_modes && ref_mv_idx > 0) { if (mbmi->ref_frame[0] == LAST2_FRAME || mbmi->ref_frame[0] == LAST3_FRAME || mbmi->ref_frame[1] == LAST2_FRAME || mbmi->ref_frame[1] == LAST3_FRAME) { - if (mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx + idx_offset] + if (mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx + has_nearmv] .weight < REF_CAT_LEVEL) { continue; } @@ -8811,41 +9345,40 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x, mode_ctx = av1_mode_context_analyzer(mbmi_ext->mode_context, mbmi->ref_frame); - mbmi->num_proj_ref[0] = 0; - mbmi->num_proj_ref[1] = 0; + mbmi->num_proj_ref = 0; mbmi->motion_mode = SIMPLE_TRANSLATION; mbmi->ref_mv_idx = ref_mv_idx; - if (is_comp_pred) { - for (int ref_idx = 0; ref_idx < is_comp_pred + 1; ++ref_idx) { - const int single_mode = - get_single_mode(this_mode, ref_idx, is_comp_pred); - if (single_mode == NEWMV && - args->single_newmv[mbmi->ref_mv_idx][mbmi->ref_frame[ref_idx]] - .as_int == INVALID_MV) - continue; - } + if (is_comp_pred && (!is_single_newmv_valid(args, mbmi, this_mode))) { + continue; } rd_stats->rate += args->ref_frame_cost + args->single_comp_cost; - rd_stats->rate += + const int drl_cost = get_drl_cost(mbmi, mbmi_ext, x->drl_mode_cost0, ref_frame_type); + rd_stats->rate += drl_cost; + mode_info[ref_mv_idx].drl_cost = drl_cost; + + if (RDCOST(x->rdmult, rd_stats->rate, 0) > ref_best_rd && + mbmi->mode != NEARESTMV && mbmi->mode != NEAREST_NEARESTMV) { + continue; + } - const RD_STATS backup_rd_stats = *rd_stats; - const MB_MODE_INFO backup_mbmi = *mbmi; int64_t best_rd2 = INT64_MAX; + const RD_STATS backup_rd_stats = *rd_stats; // If !search_jnt_comp, we need to force mbmi->compound_idx = 1. for (comp_idx = 1; comp_idx >= !search_jnt_comp; --comp_idx) { int rs = 0; int compmode_interinter_cost = 0; - *rd_stats = backup_rd_stats; - *mbmi = backup_mbmi; mbmi->compound_idx = comp_idx; - if (is_comp_pred && comp_idx == 0) { + *rd_stats = backup_rd_stats; + mbmi->interinter_comp.type = COMPOUND_AVERAGE; + if (mbmi->ref_frame[1] == INTRA_FRAME) mbmi->ref_frame[1] = NONE_FRAME; + mbmi->num_proj_ref = 0; + mbmi->motion_mode = SIMPLE_TRANSLATION; mbmi->comp_group_idx = 0; - mbmi->compound_idx = 0; const int comp_group_idx_ctx = get_comp_group_idx_context(xd); const int comp_index_ctx = get_comp_index_context(cm, xd); @@ -8885,32 +9418,69 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x, } else { rd_stats->rate += rate_mv; } + + if (cpi->sf.skip_repeated_newmv) { + if (!is_comp_pred && this_mode == NEWMV && ref_mv_idx > 0) { + int skip = 0; + int this_rate_mv = 0; + for (i = 0; i < ref_mv_idx; ++i) { + // Check if the motion search result same as previous results + if (cur_mv[0].as_int == args->single_newmv[i][refs[0]].as_int) { + // If the compared mode has no valid rd, it is unlikely this + // mode will be the best mode + if (mode_info[i].rd == INT64_MAX) { + skip = 1; + break; + } + // Compare the cost difference including drl cost and mv cost + if (mode_info[i].mv.as_int != INVALID_MV) { + const int compare_cost = + mode_info[i].rate_mv + mode_info[i].drl_cost; + const int_mv ref_mv = av1_get_ref_mv(x, 0); + this_rate_mv = av1_mv_bit_cost(&mode_info[i].mv.as_mv, + &ref_mv.as_mv, x->nmvjointcost, + x->mvcost, MV_COST_WEIGHT); + const int this_cost = this_rate_mv + drl_cost; + + if (compare_cost < this_cost) { + skip = 1; + break; + } else { + // If the cost is less than current best result, make this + // the best and update corresponding variables + if (best_mbmi.ref_mv_idx == i) { + assert(best_rd != INT64_MAX); + best_mbmi.ref_mv_idx = ref_mv_idx; + best_rd_stats.rate += this_cost - compare_cost; + best_rd = RDCOST(x->rdmult, best_rd_stats.rate, + best_rd_stats.dist); + if (best_rd < ref_best_rd) ref_best_rd = best_rd; + + skip = 1; + break; + } + } + } + } + } + if (skip) { + args->modelled_rd[this_mode][ref_mv_idx][refs[0]] = + args->modelled_rd[this_mode][i][refs[0]]; + args->simple_rd[this_mode][ref_mv_idx][refs[0]] = + args->simple_rd[this_mode][i][refs[0]]; + mode_info[ref_mv_idx].rd = mode_info[i].rd; + mode_info[ref_mv_idx].rate_mv = this_rate_mv; + mode_info[ref_mv_idx].mv.as_int = mode_info[i].mv.as_int; + + restore_dst_buf(xd, orig_dst, num_planes); + continue; + } + } + } } for (i = 0; i < is_comp_pred + 1; ++i) { mbmi->mv[i].as_int = cur_mv[i].as_int; } - - // Initialise tmp_dst and orig_dst buffers to prevent "may be used - // uninitialized" warnings in GCC when the stream is monochrome. - memset(tmp_dst.plane, 0, sizeof(tmp_dst.plane)); - memset(tmp_dst.stride, 0, sizeof(tmp_dst.stride)); - memset(orig_dst.plane, 0, sizeof(tmp_dst.plane)); - memset(orig_dst.stride, 0, sizeof(tmp_dst.stride)); - - // do first prediction into the destination buffer. Do the next - // prediction into a temporary buffer. Then keep track of which one - // of these currently holds the best predictor, and use the other - // one for future predictions. In the end, copy from tmp_buf to - // dst if necessary. - for (i = 0; i < num_planes; i++) { - tmp_dst.plane[i] = tmp_buf + i * MAX_SB_SQUARE; - tmp_dst.stride[i] = MAX_SB_SIZE; - } - for (i = 0; i < num_planes; i++) { - orig_dst.plane[i] = xd->plane[i].dst.buf; - orig_dst.stride[i] = xd->plane[i].dst.stride; - } - const int ref_mv_cost = cost_mv_ref(x, this_mode, mode_ctx); #if USE_DISCOUNT_NEWMV_TEST // We don't include the cost of the second reference here, because there @@ -8937,47 +9507,62 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x, continue; } - ret_val = interpolation_filter_search( - x, cpi, bsize, mi_row, mi_col, &tmp_dst, &orig_dst, - args->single_filter, &rd, &rs, &skip_txfm_sb, &skip_sse_sb); - if (ret_val != 0) { - restore_dst_buf(xd, orig_dst, num_planes); - continue; - } else if (cpi->sf.model_based_post_interp_filter_breakout && - ref_best_rd != INT64_MAX && (rd / 6 > ref_best_rd)) { - restore_dst_buf(xd, orig_dst, num_planes); - if ((rd >> 4) > ref_best_rd) break; - continue; - } - + int skip_build_pred = 0; if (is_comp_pred && comp_idx) { + // Find matching interp filter or set to default interp filter + const int need_search = + av1_is_interp_needed(xd) && av1_is_interp_search_needed(xd); + int match_found = -1; + const InterpFilter assign_filter = cm->interp_filter; + if (cpi->sf.skip_repeat_interpolation_filter_search && need_search) { + match_found = find_interp_filter_in_stats(x, mbmi); + } + if (!need_search || match_found == -1) { + set_default_interp_filters(mbmi, assign_filter); + } + int64_t best_rd_compound; compmode_interinter_cost = compound_type_rd( cpi, x, bsize, mi_col, mi_row, cur_mv, masked_compound_used, - &orig_dst, &tmp_dst, &rate_mv, &best_rd_compound, rd_stats, - ref_best_rd); + &orig_dst, &tmp_dst, rd_buffers, &rate_mv, &best_rd_compound, + rd_stats, ref_best_rd); if (ref_best_rd < INT64_MAX && best_rd_compound / 3 > ref_best_rd) { restore_dst_buf(xd, orig_dst, num_planes); continue; } - if (mbmi->interinter_comp.type != COMPOUND_AVERAGE) { - int tmp_rate; - int64_t tmp_dist; - av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, &orig_dst, - bsize); - for (int plane = 0; plane < num_planes; ++plane) - av1_subtract_plane(x, bsize, plane); - model_rd_for_sb(cpi, bsize, x, xd, 0, num_planes - 1, &tmp_rate, - &tmp_dist, &skip_txfm_sb, &skip_sse_sb, plane_rate, - plane_sse, plane_dist); - rd = RDCOST(x->rdmult, rs + tmp_rate, tmp_dist); + // No need to call av1_build_inter_predictors_sby if + // COMPOUND_AVERAGE is selected because it is the first + // candidate in compound_type_rd, and the following + // compound types searching uses tmp_dst buffer + if (mbmi->interinter_comp.type == COMPOUND_AVERAGE) { + if (num_planes > 1) + av1_build_inter_predictors_sbuv(cm, xd, mi_row, mi_col, &orig_dst, + bsize); + skip_build_pred = 1; } } + ret_val = interpolation_filter_search( + x, cpi, bsize, mi_row, mi_col, &tmp_dst, &orig_dst, + args->single_filter, &rd, &rs, &skip_txfm_sb, &skip_sse_sb, + skip_build_pred, args, ref_best_rd); + if (args->modelled_rd != NULL && !is_comp_pred) { + args->modelled_rd[this_mode][ref_mv_idx][refs[0]] = rd; + } + if (ret_val != 0) { + restore_dst_buf(xd, orig_dst, num_planes); + continue; + } else if (cpi->sf.model_based_post_interp_filter_breakout && + ref_best_rd != INT64_MAX && (rd >> 3) * 3 > ref_best_rd) { + restore_dst_buf(xd, orig_dst, num_planes); + if ((rd >> 3) * 2 > ref_best_rd) break; + continue; + } + if (search_jnt_comp) { // if 1/2 model rd is larger than best_rd in jnt_comp mode, // use jnt_comp mode, save additional search - if ((rd >> 1) > best_rd) { + if ((rd >> 3) * 4 > best_rd) { restore_dst_buf(xd, orig_dst, num_planes); continue; } @@ -8991,31 +9576,31 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x, if (is_comp_pred) { const int mode0 = compound_ref0_mode(this_mode); const int mode1 = compound_ref1_mode(this_mode); - const int64_t mrd = AOMMIN(args->modelled_rd[mode0][refs[0]], - args->modelled_rd[mode1][refs[1]]); - if (rd / 4 * 3 > mrd && ref_best_rd < INT64_MAX) { + const int64_t mrd = + AOMMIN(args->modelled_rd[mode0][ref_mv_idx][refs[0]], + args->modelled_rd[mode1][ref_mv_idx][refs[1]]); + if ((rd >> 3) * 6 > mrd && ref_best_rd < INT64_MAX) { restore_dst_buf(xd, orig_dst, num_planes); continue; } - } else { - args->modelled_rd[this_mode][refs[0]] = rd; - } - } - - if (cpi->sf.use_rd_breakout && ref_best_rd < INT64_MAX) { - // if current pred_error modeled rd is substantially more than the best - // so far, do not bother doing full rd - if (rd / 2 > ref_best_rd) { - restore_dst_buf(xd, orig_dst, num_planes); - continue; } } - rd_stats->rate += compmode_interinter_cost; if (search_jnt_comp && cpi->sf.jnt_comp_fast_tx_search && comp_idx == 0) { // TODO(chengchen): this speed feature introduces big loss. // Need better estimation of rate distortion. + int dummy_rate; + int64_t dummy_dist; + int plane_rate[MAX_MB_PLANE] = { 0 }; + int64_t plane_sse[MAX_MB_PLANE] = { 0 }; + int64_t plane_dist[MAX_MB_PLANE] = { 0 }; + + model_rd_sb_fn[MODELRD_TYPE_JNT_COMPOUND]( + cpi, bsize, x, xd, 0, num_planes - 1, mi_row, mi_col, &dummy_rate, + &dummy_dist, &skip_txfm_sb, &skip_sse_sb, plane_rate, plane_sse, + plane_dist); + rd_stats->rate += rs; rd_stats->rate += plane_rate[0] + plane_rate[1] + plane_rate[2]; rd_stats_y->rate = plane_rate[0]; @@ -9028,18 +9613,21 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x, rd_stats_uv->dist = plane_dist[1] + plane_dist[2]; } else { #if CONFIG_COLLECT_INTER_MODE_RD_STATS - ret_val = - motion_mode_rd(cpi, x, bsize, rd_stats, rd_stats_y, rd_stats_uv, - disable_skip, mi_row, mi_col, args, ref_best_rd, - refs, rate_mv, &orig_dst, best_est_rd); + ret_val = motion_mode_rd( + cpi, x, bsize, rd_stats, rd_stats_y, rd_stats_uv, disable_skip, + mi_row, mi_col, args, ref_best_rd, refs, &rate_mv, &orig_dst, + tile_data, best_est_rd, do_tx_search, inter_modes_info); #else ret_val = motion_mode_rd(cpi, x, bsize, rd_stats, rd_stats_y, rd_stats_uv, disable_skip, mi_row, mi_col, - args, ref_best_rd, refs, rate_mv, &orig_dst); + args, ref_best_rd, refs, &rate_mv, &orig_dst); #endif } + mode_info[ref_mv_idx].mv.as_int = mbmi->mv[0].as_int; + mode_info[ref_mv_idx].rate_mv = rate_mv; if (ret_val != INT64_MAX) { int64_t tmp_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); + mode_info[ref_mv_idx].rd = tmp_rd; if (tmp_rd < best_rd) { best_rd_stats = *rd_stats; best_rd_stats_y = *rd_stats_y; @@ -9049,7 +9637,7 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x, best_disable_skip = *disable_skip; best_xskip = x->skip; memcpy(best_blk_skip, x->blk_skip, - sizeof(best_blk_skip[0]) * xd->n8_h * xd->n8_w); + sizeof(best_blk_skip[0]) * xd->n4_h * xd->n4_w); } if (tmp_rd < best_rd2) { @@ -9062,8 +9650,6 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x, } restore_dst_buf(xd, orig_dst, num_planes); } - - args->modelled_rd = NULL; } if (best_rd == INT64_MAX) return INT64_MAX; @@ -9078,7 +9664,7 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x, assert(IMPLIES(mbmi->comp_group_idx == 1, mbmi->interinter_comp.type != COMPOUND_AVERAGE)); memcpy(x->blk_skip, best_blk_skip, - sizeof(best_blk_skip[0]) * xd->n8_h * xd->n8_w); + sizeof(best_blk_skip[0]) * xd->n4_h * xd->n4_w); return RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); } @@ -9186,8 +9772,8 @@ static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x, int sadpb = x->sadperbit16; int cost_list[5]; int bestsme = av1_full_pixel_search( - cpi, x, bsize, &mvp_full, step_param, sadpb, - cond_cost_list(cpi, cost_list), &dv_ref.as_mv, INT_MAX, 1, + cpi, x, bsize, &mvp_full, step_param, cpi->sf.mv.search_method, 0, + sadpb, cond_cost_list(cpi, cost_list), &dv_ref.as_mv, INT_MAX, 1, (MI_SIZE * mi_col), (MI_SIZE * mi_row), 1); x->mv_limits = tmp_mv_limits; @@ -9229,8 +9815,8 @@ static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x, } else { super_block_yrd(cpi, x, &rd_stats, bsize, INT64_MAX); memset(mbmi->inter_tx_size, mbmi->tx_size, sizeof(mbmi->inter_tx_size)); - memset(x->blk_skip, rd_stats.skip, - sizeof(x->blk_skip[0]) * xd->n8_h * xd->n8_w); + for (int i = 0; i < xd->n4_h * xd->n4_w; ++i) + set_blk_skip(x, 0, i, rd_stats.skip); } if (num_planes > 1) { super_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX); @@ -9254,7 +9840,7 @@ static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x, best_skip = x->skip; best_rdcost = rdc_noskip; memcpy(best_blk_skip, x->blk_skip, - sizeof(x->blk_skip[0]) * xd->n8_h * xd->n8_w); + sizeof(x->blk_skip[0]) * xd->n4_h * xd->n4_w); } if (!xd->lossless[mbmi->segment_id]) { @@ -9271,7 +9857,7 @@ static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x, best_skip = x->skip; best_rdcost = rdc_skip; memcpy(best_blk_skip, x->blk_skip, - sizeof(x->blk_skip[0]) * xd->n8_h * xd->n8_w); + sizeof(x->blk_skip[0]) * xd->n4_h * xd->n4_w); } } } @@ -9279,7 +9865,7 @@ static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x, *rd_cost = best_rdcost; x->skip = best_skip; memcpy(x->blk_skip, best_blk_skip, - sizeof(x->blk_skip[0]) * xd->n8_h * xd->n8_w); + sizeof(x->blk_skip[0]) * xd->n4_h * xd->n4_w); return best_rd; } @@ -9302,8 +9888,8 @@ void av1_rd_pick_intra_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x, int mi_row, mbmi->mv[0].as_int = 0; const int64_t intra_yrd = - rd_pick_intra_sby_mode(cpi, x, &rate_y, &rate_y_tokenonly, &dist_y, - &y_skip, bsize, best_rd, ctx); + rd_pick_intra_sby_mode(cpi, x, mi_row, mi_col, &rate_y, &rate_y_tokenonly, + &dist_y, &y_skip, bsize, best_rd, ctx); if (intra_yrd < best_rd) { // Only store reconstructed luma when there's chroma RDO. When there's no @@ -9447,6 +10033,17 @@ static void rd_pick_skip_mode(RD_STATS *rd_cost, mbmi->uv_mode = UV_DC_PRED; mbmi->ref_frame[0] = ref_frame; mbmi->ref_frame[1] = second_ref_frame; + const uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame); + if (x->mbmi_ext->ref_mv_count[ref_frame_type] == UINT8_MAX) { + if (x->mbmi_ext->ref_mv_count[ref_frame] == UINT8_MAX || + x->mbmi_ext->ref_mv_count[second_ref_frame] == UINT8_MAX) { + return; + } + MB_MODE_INFO_EXT *mbmi_ext = x->mbmi_ext; + av1_find_mv_refs(cm, xd, mbmi, ref_frame_type, mbmi_ext->ref_mv_count, + mbmi_ext->ref_mv_stack, NULL, mbmi_ext->global_mvs, mi_row, + mi_col, mbmi_ext->mode_context); + } assert(this_mode == NEAREST_NEARESTMV); if (!build_cur_mv(mbmi->mv, this_mode, cm, x)) { @@ -9508,7 +10105,7 @@ static void rd_pick_skip_mode(RD_STATS *rd_cost, memset(search_state->best_mbmode.inter_tx_size, search_state->best_mbmode.tx_size, sizeof(search_state->best_mbmode.inter_tx_size)); - set_txfm_ctxs(search_state->best_mbmode.tx_size, xd->n8_w, xd->n8_h, + set_txfm_ctxs(search_state->best_mbmode.tx_size, xd->n4_w, xd->n4_h, search_state->best_mbmode.skip && is_inter_block(mbmi), xd); // Set up color-related variables for skip mode. @@ -9595,11 +10192,12 @@ static void sf_refine_fast_tx_type_search( } else { super_block_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX); memset(mbmi->inter_tx_size, mbmi->tx_size, sizeof(mbmi->inter_tx_size)); - memset(x->blk_skip, rd_stats_y.skip, - sizeof(x->blk_skip[0]) * xd->n8_h * xd->n8_w); + for (int i = 0; i < xd->n4_h * xd->n4_w; ++i) + set_blk_skip(x, 0, i, rd_stats_y.skip); } if (num_planes > 1) { - inter_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX, FTXS_NONE); + inter_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX, INT64_MAX, + FTXS_NONE); } else { av1_init_rd_stats(&rd_stats_uv); } @@ -9647,7 +10245,7 @@ static void sf_refine_fast_tx_type_search( static void set_params_rd_pick_inter_mode( const AV1_COMP *cpi, MACROBLOCK *x, HandleInterModeArgs *args, BLOCK_SIZE bsize, int mi_row, int mi_col, uint16_t ref_frame_skip_mask[2], - uint32_t mode_skip_mask[REF_FRAMES], + uint32_t mode_skip_mask[REF_FRAMES], int skip_ref_frame_mask, unsigned int ref_costs_single[REF_FRAMES], unsigned int ref_costs_comp[REF_FRAMES][REF_FRAMES], struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE]) { @@ -9700,18 +10298,45 @@ static void set_params_rd_pick_inter_mode( x->pred_mv_sad[ref_frame] = INT_MAX; x->mbmi_ext->mode_context[ref_frame] = 0; x->mbmi_ext->compound_mode_context[ref_frame] = 0; + mbmi_ext->ref_mv_count[ref_frame] = UINT8_MAX; if (cpi->ref_frame_flags & ref_frame_flag_list[ref_frame]) { + if (mbmi->partition != PARTITION_NONE && + mbmi->partition != PARTITION_SPLIT) { + if (skip_ref_frame_mask & (1 << ref_frame)) { + int skip = 1; + for (int r = ALTREF_FRAME + 1; r < MODE_CTX_REF_FRAMES; ++r) { + if (!(skip_ref_frame_mask & (1 << r))) { + const MV_REFERENCE_FRAME *rf = ref_frame_map[r - REF_FRAMES]; + if (rf[0] == ref_frame || rf[1] == ref_frame) { + skip = 0; + break; + } + } + } + if (skip) continue; + } + } assert(get_ref_frame_buffer(cpi, ref_frame) != NULL); setup_buffer_ref_mvs_inter(cpi, x, ref_frame, bsize, mi_row, mi_col, yv12_mb); } } - - // TODO(zoeliu@google.com): To further optimize the obtaining of motion vector - // references for compound prediction, as not every pair of reference frames - // woud be examined for the RD evaluation. + // ref_frame = ALTREF_FRAME for (; ref_frame < MODE_CTX_REF_FRAMES; ++ref_frame) { x->mbmi_ext->mode_context[ref_frame] = 0; + mbmi_ext->ref_mv_count[ref_frame] = UINT8_MAX; + const MV_REFERENCE_FRAME *rf = ref_frame_map[ref_frame - REF_FRAMES]; + if (!((cpi->ref_frame_flags & ref_frame_flag_list[rf[0]]) && + (cpi->ref_frame_flags & ref_frame_flag_list[rf[1]]))) { + continue; + } + + if (mbmi->partition != PARTITION_NONE && + mbmi->partition != PARTITION_SPLIT) { + if (skip_ref_frame_mask & (1 << ref_frame)) { + continue; + } + } av1_find_mv_refs(cm, xd, mbmi, ref_frame, mbmi_ext->ref_mv_count, mbmi_ext->ref_mv_stack, NULL, mbmi_ext->global_mvs, mi_row, mi_col, mbmi_ext->mode_context); @@ -9838,9 +10463,10 @@ static void set_params_rd_pick_inter_mode( } } -static void search_palette_mode(const AV1_COMP *cpi, MACROBLOCK *x, - RD_STATS *rd_cost, PICK_MODE_CONTEXT *ctx, - BLOCK_SIZE bsize, MB_MODE_INFO *const mbmi, +static void search_palette_mode(const AV1_COMP *cpi, MACROBLOCK *x, int mi_row, + int mi_col, RD_STATS *rd_cost, + PICK_MODE_CONTEXT *ctx, BLOCK_SIZE bsize, + MB_MODE_INFO *const mbmi, PALETTE_MODE_INFO *const pmi, unsigned int *ref_costs_single, InterModeSearchState *search_state) { @@ -9867,9 +10493,9 @@ static void search_palette_mode(const AV1_COMP *cpi, MACROBLOCK *x, mbmi->ref_frame[0] = INTRA_FRAME; mbmi->ref_frame[1] = NONE_FRAME; rate_overhead_palette = rd_pick_palette_intra_sby( - cpi, x, bsize, intra_mode_cost[DC_PRED], &best_mbmi_palette, - best_palette_color_map, &best_rd_palette, &best_model_rd_palette, NULL, - NULL, NULL, NULL, ctx, best_blk_skip); + cpi, x, bsize, mi_row, mi_col, intra_mode_cost[DC_PRED], + &best_mbmi_palette, best_palette_color_map, &best_rd_palette, + &best_model_rd_palette, NULL, NULL, NULL, NULL, ctx, best_blk_skip); if (pmi->palette_size[0] == 0) return; memcpy(x->blk_skip, best_blk_skip, @@ -9986,15 +10612,49 @@ static void init_inter_mode_search_state(InterModeSearchState *search_state, av1_zero(search_state->single_newmv); av1_zero(search_state->single_newmv_rate); av1_zero(search_state->single_newmv_valid); - for (int i = 0; i < MB_MODE_COUNT; ++i) - for (int ref_frame = 0; ref_frame < REF_FRAMES; ++ref_frame) - search_state->modelled_rd[i][ref_frame] = INT64_MAX; + for (int i = 0; i < MB_MODE_COUNT; ++i) { + for (int j = 0; j < MAX_REF_MV_SERCH; ++j) { + for (int ref_frame = 0; ref_frame < REF_FRAMES; ++ref_frame) { + search_state->modelled_rd[i][j][ref_frame] = INT64_MAX; + search_state->simple_rd[i][j][ref_frame] = INT64_MAX; + } + } + } + + for (int dir = 0; dir < 2; ++dir) { + for (int mode = 0; mode < SINGLE_INTER_MODE_NUM; ++mode) { + for (int ref_frame = 0; ref_frame < FWD_REFS; ++ref_frame) { + SingleInterModeState *state; + + state = &search_state->single_state[dir][mode][ref_frame]; + state->ref_frame = NONE_FRAME; + state->rd = INT64_MAX; + + state = &search_state->single_state_modelled[dir][mode][ref_frame]; + state->ref_frame = NONE_FRAME; + state->rd = INT64_MAX; + } + } + } + for (int dir = 0; dir < 2; ++dir) { + for (int mode = 0; mode < SINGLE_INTER_MODE_NUM; ++mode) { + for (int ref_frame = 0; ref_frame < FWD_REFS; ++ref_frame) { + search_state->single_rd_order[dir][mode][ref_frame] = NONE_FRAME; + } + } + } + av1_zero(search_state->single_state_cnt); + av1_zero(search_state->single_state_modelled_cnt); } +// Case 1: return 0, means don't skip this mode +// Case 2: return 1, means skip this mode completely +// Case 3: return 2, means skip compound only, but still try single motion modes static int inter_mode_search_order_independent_skip( - const AV1_COMP *cpi, const MACROBLOCK *x, BLOCK_SIZE bsize, int mode_index, - int mi_row, int mi_col, uint32_t *mode_skip_mask, - uint16_t *ref_frame_skip_mask) { + const AV1_COMP *cpi, const PICK_MODE_CONTEXT *ctx, const MACROBLOCK *x, + BLOCK_SIZE bsize, int mode_index, int mi_row, int mi_col, + uint32_t *mode_skip_mask, uint16_t *ref_frame_skip_mask, + InterModeSearchState *search_state) { const SPEED_FEATURES *const sf = &cpi->sf; const AV1_COMMON *const cm = &cpi->common; const struct segmentation *const seg = &cm->seg; @@ -10003,6 +10663,32 @@ static int inter_mode_search_order_independent_skip( const unsigned char segment_id = mbmi->segment_id; const MV_REFERENCE_FRAME *ref_frame = av1_mode_order[mode_index].ref_frame; const PREDICTION_MODE this_mode = av1_mode_order[mode_index].mode; + int skip_motion_mode = 0; + if (mbmi->partition != PARTITION_NONE && mbmi->partition != PARTITION_SPLIT) { + const int ref_type = av1_ref_frame_type(ref_frame); + int skip_ref = ctx->skip_ref_frame_mask & (1 << ref_type); + if (ref_type <= ALTREF_FRAME && skip_ref) { + // Since the compound ref modes depends on the motion estimation result of + // two single ref modes( best mv of single ref modes as the start point ) + // If current single ref mode is marked skip, we need to check if it will + // be used in compound ref modes. + for (int r = ALTREF_FRAME + 1; r < MODE_CTX_REF_FRAMES; ++r) { + if (!(ctx->skip_ref_frame_mask & (1 << r))) { + const MV_REFERENCE_FRAME *rf = ref_frame_map[r - REF_FRAMES]; + if (rf[0] == ref_type || rf[1] == ref_type) { + // Found a not skipped compound ref mode which contains current + // single ref. So this single ref can't be skipped completly + // Just skip it's motion mode search, still try it's simple + // transition mode. + skip_motion_mode = 1; + skip_ref = 0; + break; + } + } + } + } + if (skip_ref) return 1; + } if (cpi->sf.mode_pruning_based_on_two_pass_partition_search && !x->cb_partition_scan) { @@ -10115,9 +10801,12 @@ static int inter_mode_search_order_independent_skip( return 1; } - if (skip_repeated_mv(cm, x, this_mode, ref_frame)) { + if (skip_repeated_mv(cm, x, this_mode, ref_frame, search_state)) { return 1; } + if (skip_motion_mode) { + return 2; + } return 0; } @@ -10139,12 +10828,13 @@ static INLINE void init_mbmi(MB_MODE_INFO *mbmi, int mode_index, set_default_interp_filters(mbmi, cm->interp_filter); } -static int handle_intra_mode(InterModeSearchState *search_state, - const AV1_COMP *cpi, MACROBLOCK *x, - BLOCK_SIZE bsize, int ref_frame_cost, - const PICK_MODE_CONTEXT *ctx, int disable_skip, - RD_STATS *rd_stats, RD_STATS *rd_stats_y, - RD_STATS *rd_stats_uv) { +static int64_t handle_intra_mode(InterModeSearchState *search_state, + const AV1_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, int mi_row, int mi_col, + int ref_frame_cost, + const PICK_MODE_CONTEXT *ctx, int disable_skip, + RD_STATS *rd_stats, RD_STATS *rd_stats_y, + RD_STATS *rd_stats_uv) { const AV1_COMMON *cm = &cpi->common; const SPEED_FEATURES *const sf = &cpi->sf; MACROBLOCKD *const xd = &x->e_mbd; @@ -10159,9 +10849,19 @@ static int handle_intra_mode(InterModeSearchState *search_state, const int rows = block_size_high[bsize]; const int cols = block_size_wide[bsize]; const int num_planes = av1_num_planes(cm); - av1_init_rd_stats(rd_stats); - av1_init_rd_stats(rd_stats_y); - av1_init_rd_stats(rd_stats_uv); + const int skip_ctx = av1_get_skip_context(xd); + + int known_rate = intra_mode_cost[mbmi->mode]; + known_rate += ref_frame_cost; + if (mbmi->mode != DC_PRED && mbmi->mode != PAETH_PRED) + known_rate += intra_cost_penalty; + known_rate += AOMMIN(x->skip_cost[skip_ctx][0], x->skip_cost[skip_ctx][1]); + const int64_t known_rd = RDCOST(x->rdmult, known_rate, 0); + if (known_rd > search_state->best_rd) { + search_state->skip_intra_modes = 1; + return INT64_MAX; + } + TX_SIZE uv_tx; int is_directional_mode = av1_is_directional_mode(mbmi->mode); if (is_directional_mode && av1_use_angle_delta(bsize)) { @@ -10178,20 +10878,33 @@ static int handle_intra_mode(InterModeSearchState *search_state, search_state->directional_mode_skip_mask); search_state->angle_stats_ready = 1; } - if (search_state->directional_mode_skip_mask[mbmi->mode]) return 0; + if (search_state->directional_mode_skip_mask[mbmi->mode]) return INT64_MAX; + av1_init_rd_stats(rd_stats_y); rd_stats_y->rate = INT_MAX; - rd_pick_intra_angle_sby(cpi, x, &rate_dummy, rd_stats_y, bsize, - intra_mode_cost[mbmi->mode], search_state->best_rd, - &model_rd); + rd_pick_intra_angle_sby(cpi, x, mi_row, mi_col, &rate_dummy, rd_stats_y, + bsize, intra_mode_cost[mbmi->mode], + search_state->best_rd, &model_rd); } else { + av1_init_rd_stats(rd_stats_y); mbmi->angle_delta[PLANE_TYPE_Y] = 0; super_block_yrd(cpi, x, rd_stats_y, bsize, search_state->best_rd); } uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE]; memcpy(best_blk_skip, x->blk_skip, sizeof(best_blk_skip[0]) * ctx->num_4x4_blk); - + int try_filter_intra = 0; + int64_t best_rd_tmp = INT64_MAX; if (mbmi->mode == DC_PRED && av1_filter_intra_allowed_bsize(cm, bsize)) { + if (rd_stats_y->rate != INT_MAX) { + const int tmp_rate = rd_stats_y->rate + x->filter_intra_cost[bsize][0] + + intra_mode_cost[mbmi->mode]; + best_rd_tmp = RDCOST(x->rdmult, tmp_rate, rd_stats_y->dist); + try_filter_intra = !((best_rd_tmp / 2) > search_state->best_rd); + } else { + try_filter_intra = !(search_state->best_mbmode.skip); + } + } + if (try_filter_intra) { RD_STATS rd_stats_y_fi; int filter_intra_selected_flag = 0; TX_SIZE best_tx_size = mbmi->tx_size; @@ -10199,20 +10912,12 @@ static int handle_intra_mode(InterModeSearchState *search_state, memcpy(best_txk_type, mbmi->txk_type, sizeof(*best_txk_type) * TXK_TYPE_BUF_LEN); FILTER_INTRA_MODE best_fi_mode = FILTER_DC_PRED; - int64_t best_rd_tmp = INT64_MAX; - if (rd_stats_y->rate != INT_MAX) { - best_rd_tmp = RDCOST(x->rdmult, - rd_stats_y->rate + x->filter_intra_cost[bsize][0] + - intra_mode_cost[mbmi->mode], - rd_stats_y->dist); - } mbmi->filter_intra_mode_info.use_filter_intra = 1; for (FILTER_INTRA_MODE fi_mode = FILTER_DC_PRED; fi_mode < FILTER_INTRA_MODES; ++fi_mode) { int64_t this_rd_tmp; mbmi->filter_intra_mode_info.filter_intra_mode = fi_mode; - super_block_yrd(cpi, x, &rd_stats_y_fi, bsize, search_state->best_rd); if (rd_stats_y_fi.rate == INT_MAX) { continue; @@ -10223,6 +10928,9 @@ static int handle_intra_mode(InterModeSearchState *search_state, intra_mode_cost[mbmi->mode]); this_rd_tmp = RDCOST(x->rdmult, this_rate_tmp, rd_stats_y_fi.dist); + if (this_rd_tmp != INT64_MAX && this_rd_tmp / 2 > search_state->best_rd) { + break; + } if (this_rd_tmp < best_rd_tmp) { best_tx_size = mbmi->tx_size; memcpy(best_txk_type, mbmi->txk_type, @@ -10249,12 +10957,23 @@ static int handle_intra_mode(InterModeSearchState *search_state, mbmi->filter_intra_mode_info.use_filter_intra = 0; } } - - if (rd_stats_y->rate == INT_MAX) return 0; - + if (rd_stats_y->rate == INT_MAX) return INT64_MAX; + const int mode_cost_y = + intra_mode_info_cost_y(cpi, x, mbmi, bsize, intra_mode_cost[mbmi->mode]); + av1_init_rd_stats(rd_stats); + av1_init_rd_stats(rd_stats_uv); if (num_planes > 1) { uv_tx = av1_get_tx_size(AOM_PLANE_U, xd); if (search_state->rate_uv_intra[uv_tx] == INT_MAX) { + int rate_y = + rd_stats_y->skip ? x->skip_cost[skip_ctx][1] : rd_stats_y->rate; + const int64_t rdy = + RDCOST(x->rdmult, rate_y + mode_cost_y, rd_stats_y->dist); + if (search_state->best_rd < (INT64_MAX / 2) && + rdy > (search_state->best_rd + (search_state->best_rd >> 2))) { + search_state->skip_intra_modes = 1; + return INT64_MAX; + } choose_intra_uv_mode( cpi, x, bsize, uv_tx, &search_state->rate_uv_intra[uv_tx], &search_state->rate_uv_tokenonly[uv_tx], @@ -10262,6 +10981,14 @@ static int handle_intra_mode(InterModeSearchState *search_state, &search_state->mode_uv[uv_tx]); if (try_palette) search_state->pmi_uv[uv_tx] = *pmi; search_state->uv_angle_delta[uv_tx] = mbmi->angle_delta[PLANE_TYPE_UV]; + + const int uv_rate = search_state->rate_uv_tokenonly[uv_tx]; + const int64_t uv_dist = search_state->dist_uvs[uv_tx]; + const int64_t uv_rd = RDCOST(x->rdmult, uv_rate, uv_dist); + if (uv_rd > search_state->best_rd) { + search_state->skip_intra_modes = 1; + return INT64_MAX; + } } rd_stats_uv->rate = search_state->rate_uv_tokenonly[uv_tx]; @@ -10277,10 +11004,7 @@ static int handle_intra_mode(InterModeSearchState *search_state, } mbmi->angle_delta[PLANE_TYPE_UV] = search_state->uv_angle_delta[uv_tx]; } - - rd_stats->rate = - rd_stats_y->rate + - intra_mode_info_cost_y(cpi, x, mbmi, bsize, intra_mode_cost[mbmi->mode]); + rd_stats->rate = rd_stats_y->rate + mode_cost_y; if (!xd->lossless[mbmi->segment_id] && block_signals_txsize(bsize)) { // super_block_yrd above includes the cost of the tx_size in the // tokenonly rate, but for intra blocks, tx_size is always coded @@ -10308,14 +11032,13 @@ static int handle_intra_mode(InterModeSearchState *search_state, rd_stats_y->rate = 0; rd_stats_uv->rate = 0; // Cost the skip mb case - rd_stats->rate += x->skip_cost[av1_get_skip_context(xd)][1]; + rd_stats->rate += x->skip_cost[skip_ctx][1]; } else { // Add in the cost of the no skip flag. - rd_stats->rate += x->skip_cost[av1_get_skip_context(xd)][0]; + rd_stats->rate += x->skip_cost[skip_ctx][0]; } // Calculate the final RD estimate for this mode. - int64_t this_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); - + const int64_t this_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); // Keep record of best intra rd if (this_rd < search_state->best_intra_rd) { search_state->best_intra_rd = this_rd; @@ -10333,14 +11056,322 @@ static int handle_intra_mode(InterModeSearchState *search_state, search_state->best_pred_rd[i] = AOMMIN(search_state->best_pred_rd[i], this_rd); } - return 1; + return this_rd; } -void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, +static void collect_single_states(MACROBLOCK *x, + InterModeSearchState *search_state, + const MB_MODE_INFO *const mbmi) { + int i, j; + const MV_REFERENCE_FRAME ref_frame = mbmi->ref_frame[0]; + const PREDICTION_MODE this_mode = mbmi->mode; + const int dir = ref_frame <= GOLDEN_FRAME ? 0 : 1; + const int mode_offset = INTER_OFFSET(this_mode); + const int ref_set = get_drl_refmv_count(x, mbmi->ref_frame, this_mode); + + // Simple rd + int64_t simple_rd = search_state->simple_rd[this_mode][0][ref_frame]; + for (int ref_mv_idx = 1; ref_mv_idx < ref_set; ++ref_mv_idx) { + int64_t rd = search_state->simple_rd[this_mode][ref_mv_idx][ref_frame]; + if (rd < simple_rd) simple_rd = rd; + } + + // Insertion sort of single_state + SingleInterModeState this_state_s = { simple_rd, ref_frame, 1 }; + SingleInterModeState *state_s = search_state->single_state[dir][mode_offset]; + i = search_state->single_state_cnt[dir][mode_offset]; + for (j = i; j > 0 && state_s[j - 1].rd > this_state_s.rd; --j) + state_s[j] = state_s[j - 1]; + state_s[j] = this_state_s; + search_state->single_state_cnt[dir][mode_offset]++; + + // Modelled rd + int64_t modelled_rd = search_state->modelled_rd[this_mode][0][ref_frame]; + for (int ref_mv_idx = 1; ref_mv_idx < ref_set; ++ref_mv_idx) { + int64_t rd = search_state->modelled_rd[this_mode][ref_mv_idx][ref_frame]; + if (rd < modelled_rd) modelled_rd = rd; + } + + // Insertion sort of single_state_modelled + SingleInterModeState this_state_m = { modelled_rd, ref_frame, 1 }; + SingleInterModeState *state_m = + search_state->single_state_modelled[dir][mode_offset]; + i = search_state->single_state_modelled_cnt[dir][mode_offset]; + for (j = i; j > 0 && state_m[j - 1].rd > this_state_m.rd; --j) + state_m[j] = state_m[j - 1]; + state_m[j] = this_state_m; + search_state->single_state_modelled_cnt[dir][mode_offset]++; +} + +static void analyze_single_states(const AV1_COMP *cpi, + InterModeSearchState *search_state) { + int i, j, dir, mode; + if (cpi->sf.prune_comp_search_by_single_result >= 1) { + for (dir = 0; dir < 2; ++dir) { + int64_t best_rd; + SingleInterModeState(*state)[FWD_REFS]; + + // Use the best rd of GLOBALMV or NEWMV to prune the unlikely + // reference frames for all the modes (NEARESTMV and NEARMV may not + // have same motion vectors). Always keep the best of each mode + // because it might form the best possible combination with other mode. + state = search_state->single_state[dir]; + best_rd = AOMMIN(state[INTER_OFFSET(NEWMV)][0].rd, + state[INTER_OFFSET(GLOBALMV)][0].rd); + for (mode = 0; mode < SINGLE_INTER_MODE_NUM; ++mode) { + for (i = 1; i < search_state->single_state_cnt[dir][mode]; ++i) { + if (state[mode][i].rd != INT64_MAX && + (state[mode][i].rd >> 1) > best_rd) { + state[mode][i].valid = 0; + } + } + } + + state = search_state->single_state_modelled[dir]; + best_rd = AOMMIN(state[INTER_OFFSET(NEWMV)][0].rd, + state[INTER_OFFSET(GLOBALMV)][0].rd); + for (mode = 0; mode < SINGLE_INTER_MODE_NUM; ++mode) { + for (i = 1; i < search_state->single_state_modelled_cnt[dir][mode]; + ++i) { + if (state[mode][i].rd != INT64_MAX && + (state[mode][i].rd >> 1) > best_rd) { + state[mode][i].valid = 0; + } + } + } + } + } + + // Ordering by simple rd first, then by modelled rd + for (dir = 0; dir < 2; ++dir) { + for (mode = 0; mode < SINGLE_INTER_MODE_NUM; ++mode) { + const int state_cnt_s = search_state->single_state_cnt[dir][mode]; + const int state_cnt_m = + search_state->single_state_modelled_cnt[dir][mode]; + SingleInterModeState *state_s = search_state->single_state[dir][mode]; + SingleInterModeState *state_m = + search_state->single_state_modelled[dir][mode]; + int count = 0; + const int max_candidates = AOMMAX(state_cnt_s, state_cnt_m); + for (i = 0; i < state_cnt_s; ++i) { + if (state_s[i].rd == INT64_MAX) break; + if (state_s[i].valid) + search_state->single_rd_order[dir][mode][count++] = + state_s[i].ref_frame; + } + if (count < max_candidates) { + for (i = 0; i < state_cnt_m; ++i) { + if (state_m[i].rd == INT64_MAX) break; + if (state_m[i].valid) { + int ref_frame = state_m[i].ref_frame; + int match = 0; + // Check if existing already + for (j = 0; j < count; ++j) { + if (search_state->single_rd_order[dir][mode][j] == ref_frame) { + match = 1; + break; + } + } + if (!match) { + // Check if this ref_frame is removed in simple rd + int valid = 1; + for (j = 0; j < state_cnt_s; j++) { + if (ref_frame == state_s[j].ref_frame && !state_s[j].valid) { + valid = 0; + break; + } + } + if (valid) + search_state->single_rd_order[dir][mode][count++] = ref_frame; + } + if (count >= max_candidates) break; + } + } + } + } + } +} + +static int compound_skip_get_candidates( + const AV1_COMP *cpi, const InterModeSearchState *search_state, + const int dir, const PREDICTION_MODE mode) { + const int mode_offset = INTER_OFFSET(mode); + const SingleInterModeState *state = + search_state->single_state[dir][mode_offset]; + const SingleInterModeState *state_modelled = + search_state->single_state_modelled[dir][mode_offset]; + int max_candidates = 0; + int candidates; + + for (int i = 0; i < FWD_REFS; ++i) { + if (search_state->single_rd_order[dir][mode_offset][i] == NONE_FRAME) break; + max_candidates++; + } + + candidates = max_candidates; + if (cpi->sf.prune_comp_search_by_single_result >= 2) { + candidates = AOMMIN(2, max_candidates); + } + if (cpi->sf.prune_comp_search_by_single_result >= 3) { + if (state[0].rd != INT64_MAX && state_modelled[0].rd != INT64_MAX && + state[0].ref_frame == state_modelled[0].ref_frame) + candidates = 1; + if (mode == NEARMV || mode == GLOBALMV) candidates = 1; + } + return candidates; +} + +static int compound_skip_by_single_states( + const AV1_COMP *cpi, const InterModeSearchState *search_state, + const PREDICTION_MODE this_mode, const MV_REFERENCE_FRAME ref_frame, + const MV_REFERENCE_FRAME second_ref_frame, const MACROBLOCK *x) { + const MV_REFERENCE_FRAME refs[2] = { ref_frame, second_ref_frame }; + const int mode[2] = { compound_ref0_mode(this_mode), + compound_ref1_mode(this_mode) }; + const int mode_offset[2] = { INTER_OFFSET(mode[0]), INTER_OFFSET(mode[1]) }; + const int mode_dir[2] = { refs[0] <= GOLDEN_FRAME ? 0 : 1, + refs[1] <= GOLDEN_FRAME ? 0 : 1 }; + int ref_searched[2] = { 0, 0 }; + int ref_mv_match[2] = { 1, 1 }; + int i, j; + + for (i = 0; i < 2; ++i) { + const SingleInterModeState *state = + search_state->single_state[mode_dir[i]][mode_offset[i]]; + const int state_cnt = + search_state->single_state_cnt[mode_dir[i]][mode_offset[i]]; + for (j = 0; j < state_cnt; ++j) { + if (state[j].ref_frame == refs[i]) { + ref_searched[i] = 1; + break; + } + } + } + + const int ref_set = get_drl_refmv_count(x, refs, this_mode); + for (i = 0; i < 2; ++i) { + if (mode[i] == NEARESTMV || mode[i] == NEARMV) { + const MV_REFERENCE_FRAME single_refs[2] = { refs[i], NONE_FRAME }; + int idential = 1; + for (int ref_mv_idx = 0; ref_mv_idx < ref_set; ref_mv_idx++) { + int_mv single_mv; + int_mv comp_mv; + get_this_mv(&single_mv, mode[i], 0, ref_mv_idx, single_refs, + x->mbmi_ext); + get_this_mv(&comp_mv, this_mode, i, ref_mv_idx, refs, x->mbmi_ext); + + idential &= (single_mv.as_int == comp_mv.as_int); + if (!idential) { + ref_mv_match[i] = 0; + break; + } + } + } + } + + for (i = 0; i < 2; ++i) { + if (ref_searched[i] && ref_mv_match[i]) { + const int candidates = + compound_skip_get_candidates(cpi, search_state, mode_dir[i], mode[i]); + const MV_REFERENCE_FRAME *ref_order = + search_state->single_rd_order[mode_dir[i]][mode_offset[i]]; + int match = 0; + for (j = 0; j < candidates; ++j) { + if (refs[i] == ref_order[j]) { + match = 1; + break; + } + } + if (!match) return 1; + } + } + + return 0; +} + +static INLINE int sf_check_is_drop_ref(const MODE_DEFINITION *mode, + InterModeSearchState *search_state) { + const MV_REFERENCE_FRAME ref_frame = mode->ref_frame[0]; + const MV_REFERENCE_FRAME second_ref_frame = mode->ref_frame[1]; + if (search_state->num_available_refs > 2) { + if ((ref_frame == search_state->dist_order_refs[0] && + second_ref_frame == search_state->dist_order_refs[1]) || + (ref_frame == search_state->dist_order_refs[1] && + second_ref_frame == search_state->dist_order_refs[0])) + return 1; // drop this pair of refs + } + return 0; +} + +static INLINE void sf_drop_ref_analyze(InterModeSearchState *search_state, + const MODE_DEFINITION *mode, + int64_t distortion2) { + const PREDICTION_MODE this_mode = mode->mode; + MV_REFERENCE_FRAME ref_frame = mode->ref_frame[0]; + const int idx = ref_frame - LAST_FRAME; + if (idx && distortion2 > search_state->dist_refs[idx]) { + search_state->dist_refs[idx] = distortion2; + search_state->dist_order_refs[idx] = ref_frame; + } + + // Reach the last single ref prediction mode + if (ref_frame == ALTREF_FRAME && this_mode == GLOBALMV) { + // bubble sort dist_refs and the order index + for (int i = 0; i < REF_FRAMES; ++i) { + for (int k = i + 1; k < REF_FRAMES; ++k) { + if (search_state->dist_refs[i] < search_state->dist_refs[k]) { + int64_t tmp_dist = search_state->dist_refs[i]; + search_state->dist_refs[i] = search_state->dist_refs[k]; + search_state->dist_refs[k] = tmp_dist; + + int tmp_idx = search_state->dist_order_refs[i]; + search_state->dist_order_refs[i] = search_state->dist_order_refs[k]; + search_state->dist_order_refs[k] = tmp_idx; + } + } + } + for (int i = 0; i < REF_FRAMES; ++i) { + if (search_state->dist_refs[i] == -1) break; + search_state->num_available_refs = i; + } + search_state->num_available_refs++; + } +} + +static void alloc_compound_type_rd_buffers(AV1_COMMON *const cm, + CompoundTypeRdBuffers *const bufs) { + CHECK_MEM_ERROR( + cm, bufs->pred0, + (uint8_t *)aom_memalign(16, 2 * MAX_SB_SQUARE * sizeof(*bufs->pred0))); + CHECK_MEM_ERROR( + cm, bufs->pred1, + (uint8_t *)aom_memalign(16, 2 * MAX_SB_SQUARE * sizeof(*bufs->pred1))); + CHECK_MEM_ERROR( + cm, bufs->residual1, + (int16_t *)aom_memalign(32, MAX_SB_SQUARE * sizeof(*bufs->residual1))); + CHECK_MEM_ERROR( + cm, bufs->diff10, + (int16_t *)aom_memalign(32, MAX_SB_SQUARE * sizeof(*bufs->diff10))); + CHECK_MEM_ERROR(cm, bufs->tmp_best_mask_buf, + (uint8_t *)aom_malloc(2 * MAX_SB_SQUARE * + sizeof(*bufs->tmp_best_mask_buf))); +} + +static void release_compound_type_rd_buffers( + CompoundTypeRdBuffers *const bufs) { + aom_free(bufs->pred0); + aom_free(bufs->pred1); + aom_free(bufs->residual1); + aom_free(bufs->diff10); + aom_free(bufs->tmp_best_mask_buf); + av1_zero(*bufs); // Set all pointers to NULL for safety. +} + +void av1_rd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data, MACROBLOCK *x, int mi_row, int mi_col, RD_STATS *rd_cost, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, int64_t best_rd_so_far) { - const AV1_COMMON *const cm = &cpi->common; + AV1_COMMON *const cm = &cpi->common; const int num_planes = av1_num_planes(cm); const SPEED_FEATURES *const sf = &cpi->sf; MACROBLOCKD *const xd = &x->e_mbd; @@ -10350,9 +11381,8 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; const struct segmentation *const seg = &cm->seg; PREDICTION_MODE this_mode; - MV_REFERENCE_FRAME ref_frame, second_ref_frame; unsigned char segment_id = mbmi->segment_id; - int i, k; + int i; struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE]; unsigned int ref_costs_single[REF_FRAMES]; unsigned int ref_costs_comp[REF_FRAMES][REF_FRAMES]; @@ -10364,28 +11394,57 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, InterModeSearchState search_state; init_inter_mode_search_state(&search_state, cpi, tile_data, x, bsize, best_rd_so_far); - + INTERINTRA_MODE interintra_modes[REF_FRAMES] = { + INTERINTRA_MODES, INTERINTRA_MODES, INTERINTRA_MODES, INTERINTRA_MODES, + INTERINTRA_MODES, INTERINTRA_MODES, INTERINTRA_MODES, INTERINTRA_MODES + }; HandleInterModeArgs args = { { NULL }, { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }, { NULL }, { MAX_SB_SIZE >> 1, MAX_SB_SIZE >> 1, MAX_SB_SIZE >> 1 }, NULL, NULL, - NULL, NULL, + NULL, search_state.modelled_rd, { { 0 } }, INT_MAX, - INT_MAX + INT_MAX, search_state.simple_rd, + 0, interintra_modes }; for (i = 0; i < REF_FRAMES; ++i) x->pred_sse[i] = INT_MAX; av1_invalid_rd_stats(rd_cost); // init params, set frame modes, speed features - set_params_rd_pick_inter_mode(cpi, x, &args, bsize, mi_row, mi_col, - ref_frame_skip_mask, mode_skip_mask, - ref_costs_single, ref_costs_comp, yv12_mb); + set_params_rd_pick_inter_mode( + cpi, x, &args, bsize, mi_row, mi_col, ref_frame_skip_mask, mode_skip_mask, + ctx->skip_ref_frame_mask, ref_costs_single, ref_costs_comp, yv12_mb); #if CONFIG_COLLECT_INTER_MODE_RD_STATS int64_t best_est_rd = INT64_MAX; + // TODO(angiebird): Turn this on when this speed feature is well tested +#if 1 + const InterModeRdModel *md = &tile_data->inter_mode_rd_models[bsize]; + const int do_tx_search = !md->ready; +#else + const int do_tx_search = 1; +#endif + InterModesInfo *inter_modes_info = &tile_data->inter_modes_info; + inter_modes_info->num = 0; #endif + int intra_mode_num = 0; + int intra_mode_idx_ls[MAX_MODES]; + int reach_first_comp_mode = 0; + + // Temporary buffers used by handle_inter_mode(). + // We allocate them once and reuse it in every call to that function. + // Note: Must be allocated on the heap due to large size of the arrays. + uint8_t *tmp_buf_orig; + CHECK_MEM_ERROR( + cm, tmp_buf_orig, + (uint8_t *)aom_memalign(32, 2 * MAX_MB_PLANE * MAX_SB_SQUARE)); + uint8_t *const tmp_buf = get_buf_by_bd(xd, tmp_buf_orig); + + CompoundTypeRdBuffers rd_buffers; + alloc_compound_type_rd_buffers(cm, &rd_buffers); + for (int midx = 0; midx < MAX_MODES; ++midx) { int mode_index = mode_map[midx]; int64_t this_rd = INT64_MAX; @@ -10394,42 +11453,44 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, int64_t distortion2 = 0; int skippable = 0; int this_skip2 = 0; - - this_mode = av1_mode_order[mode_index].mode; - ref_frame = av1_mode_order[mode_index].ref_frame[0]; - second_ref_frame = av1_mode_order[mode_index].ref_frame[1]; + const MODE_DEFINITION *mode_order = &av1_mode_order[mode_index]; + const MV_REFERENCE_FRAME ref_frame = mode_order->ref_frame[0]; + const MV_REFERENCE_FRAME second_ref_frame = mode_order->ref_frame[1]; + const int comp_pred = second_ref_frame > INTRA_FRAME; + this_mode = mode_order->mode; init_mbmi(mbmi, mode_index, cm); x->skip = 0; set_ref_ptrs(cm, xd, ref_frame, second_ref_frame); - if (inter_mode_search_order_independent_skip(cpi, x, bsize, mode_index, - mi_row, mi_col, mode_skip_mask, - ref_frame_skip_mask)) - continue; - - if (ref_frame == INTRA_FRAME) { - if (sf->skip_intra_in_interframe && search_state.skip_intra_modes) - continue; + // Reach the first compound prediction mode + if (sf->prune_comp_search_by_single_result > 0 && comp_pred && + reach_first_comp_mode == 0) { + analyze_single_states(cpi, &search_state); + reach_first_comp_mode = 1; } + const int ret = inter_mode_search_order_independent_skip( + cpi, ctx, x, bsize, mode_index, mi_row, mi_col, mode_skip_mask, + ref_frame_skip_mask, &search_state); + if (ret == 1) continue; + args.skip_motion_mode = (ret == 2); - if (sf->drop_ref) { - if (ref_frame > INTRA_FRAME && second_ref_frame > INTRA_FRAME) { - if (search_state.num_available_refs > 2) { - if ((ref_frame == search_state.dist_order_refs[0] && - second_ref_frame == search_state.dist_order_refs[1]) || - (ref_frame == search_state.dist_order_refs[1] && - second_ref_frame == search_state.dist_order_refs[0])) - continue; - } + if (sf->drop_ref && comp_pred) { + if (sf_check_is_drop_ref(mode_order, &search_state)) { + continue; } } if (search_state.best_rd < search_state.mode_threshold[mode_index]) continue; - const int comp_pred = second_ref_frame > INTRA_FRAME; + if (sf->prune_comp_search_by_single_result > 0 && comp_pred) { + if (compound_skip_by_single_states(cpi, &search_state, this_mode, + ref_frame, second_ref_frame, x)) + continue; + } + const int ref_frame_cost = comp_pred ? ref_costs_comp[ref_frame][second_ref_frame] : ref_costs_single[ref_frame]; @@ -10474,18 +11535,8 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, } if (ref_frame == INTRA_FRAME) { - RD_STATS intra_rd_stats, intra_rd_stats_y, intra_rd_stats_uv; - const int ret = handle_intra_mode( - &search_state, cpi, x, bsize, ref_frame_cost, ctx, disable_skip, - &intra_rd_stats, &intra_rd_stats_y, &intra_rd_stats_uv); - if (!ret) { - continue; - } - rate2 = intra_rd_stats.rate; - distortion2 = intra_rd_stats.dist; - this_rd = RDCOST(x->rdmult, rate2, distortion2); - skippable = intra_rd_stats.skip; - rate_y = intra_rd_stats_y.rate; + intra_mode_idx_ls[intra_mode_num++] = mode_index; + continue; } else { mbmi->angle_delta[PLANE_TYPE_Y] = 0; mbmi->angle_delta[PLANE_TYPE_UV] = 0; @@ -10501,17 +11552,17 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, args.single_newmv = search_state.single_newmv; args.single_newmv_rate = search_state.single_newmv_rate; args.single_newmv_valid = search_state.single_newmv_valid; - args.modelled_rd = search_state.modelled_rd; args.single_comp_cost = real_compmode_cost; args.ref_frame_cost = ref_frame_cost; #if CONFIG_COLLECT_INTER_MODE_RD_STATS - this_rd = handle_inter_mode(cpi, x, bsize, &rd_stats, &rd_stats_y, - &rd_stats_uv, &disable_skip, mi_row, mi_col, - &args, ref_best_rd, &best_est_rd); + this_rd = handle_inter_mode( + cpi, x, bsize, &rd_stats, &rd_stats_y, &rd_stats_uv, &disable_skip, + mi_row, mi_col, &args, ref_best_rd, tmp_buf, &rd_buffers, tile_data, + &best_est_rd, do_tx_search, inter_modes_info); #else this_rd = handle_inter_mode(cpi, x, bsize, &rd_stats, &rd_stats_y, &rd_stats_uv, &disable_skip, mi_row, mi_col, - &args, ref_best_rd); + &args, ref_best_rd, tmp_buf, &rd_buffers); #endif rate2 = rd_stats.rate; skippable = rd_stats.skip; @@ -10520,6 +11571,11 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, rate_uv = rd_stats_uv.rate; } + if (sf->prune_comp_search_by_single_result > 0 && + is_inter_singleref_mode(this_mode)) { + collect_single_states(x, &search_state, mbmi); + } + if (this_rd == INT64_MAX) continue; this_skip2 = mbmi->skip; @@ -10554,10 +11610,24 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, search_state.best_mbmode = *mbmi; search_state.best_skip2 = this_skip2; search_state.best_mode_skippable = skippable; +#if CONFIG_COLLECT_INTER_MODE_RD_STATS + if (do_tx_search) { + // When do_tx_search == 0, handle_inter_mode won't provide correct + // rate_y and rate_uv because txfm_search process is replaced by + // rd estimation. + // Therfore, we should avoid updating best_rate_y and best_rate_uv + // here. These two values will be updated when txfm_search is called + search_state.best_rate_y = + rate_y + + x->skip_cost[av1_get_skip_context(xd)][this_skip2 || skippable]; + search_state.best_rate_uv = rate_uv; + } +#else // CONFIG_COLLECT_INTER_MODE_RD_STATS search_state.best_rate_y = rate_y + x->skip_cost[av1_get_skip_context(xd)][this_skip2 || skippable]; search_state.best_rate_uv = rate_uv; +#endif // CONFIG_COLLECT_INTER_MODE_RD_STATS memcpy(ctx->blk_skip, x->blk_skip, sizeof(x->blk_skip[0]) * ctx->num_4x4_blk); } @@ -10588,43 +11658,124 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, if (hybrid_rd < search_state.best_pred_rd[REFERENCE_MODE_SELECT]) search_state.best_pred_rd[REFERENCE_MODE_SELECT] = hybrid_rd; } + if (sf->drop_ref && second_ref_frame == NONE_FRAME) { + // Collect data from single ref mode, and analyze data. + sf_drop_ref_analyze(&search_state, mode_order, distortion2); + } - if (sf->drop_ref) { - if (second_ref_frame == NONE_FRAME) { - const int idx = ref_frame - LAST_FRAME; - if (idx && distortion2 > search_state.dist_refs[idx]) { - search_state.dist_refs[idx] = distortion2; - search_state.dist_order_refs[idx] = ref_frame; - } + if (x->skip && !comp_pred) break; + } - // Reach the last single ref prediction mode - if (ref_frame == ALTREF_FRAME && this_mode == GLOBALMV) { - // bubble sort dist_refs and the order index - for (i = 0; i < REF_FRAMES; ++i) { - for (k = i + 1; k < REF_FRAMES; ++k) { - if (search_state.dist_refs[i] < search_state.dist_refs[k]) { - int64_t tmp_dist = search_state.dist_refs[i]; - search_state.dist_refs[i] = search_state.dist_refs[k]; - search_state.dist_refs[k] = tmp_dist; - - int tmp_idx = search_state.dist_order_refs[i]; - search_state.dist_order_refs[i] = - search_state.dist_order_refs[k]; - search_state.dist_order_refs[k] = tmp_idx; - } - } - } + aom_free(tmp_buf_orig); + tmp_buf_orig = NULL; + release_compound_type_rd_buffers(&rd_buffers); - for (i = 0; i < REF_FRAMES; ++i) { - if (search_state.dist_refs[i] == -1) break; - search_state.num_available_refs = i; - } - search_state.num_available_refs++; - } +#if CONFIG_COLLECT_INTER_MODE_RD_STATS + if (!do_tx_search) { + inter_modes_info_sort(inter_modes_info, inter_modes_info->rd_idx_pair_arr); + search_state.best_rd = INT64_MAX; + + int64_t top_est_rd = + inter_modes_info->est_rd_arr[inter_modes_info->rd_idx_pair_arr[0].idx]; + for (int j = 0; j < inter_modes_info->num; ++j) { + const int data_idx = inter_modes_info->rd_idx_pair_arr[j].idx; + *mbmi = inter_modes_info->mbmi_arr[data_idx]; + int64_t curr_est_rd = inter_modes_info->est_rd_arr[data_idx]; + if (curr_est_rd * 0.9 > top_est_rd) { + continue; + } + const int mode_rate = inter_modes_info->mode_rate_arr[data_idx]; + + x->skip = 0; + set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]); + + // Select prediction reference frames. + const int is_comp_pred = mbmi->ref_frame[1] > INTRA_FRAME; + for (i = 0; i < num_planes; i++) { + xd->plane[i].pre[0] = yv12_mb[mbmi->ref_frame[0]][i]; + if (is_comp_pred) xd->plane[i].pre[1] = yv12_mb[mbmi->ref_frame[1]][i]; + } + + RD_STATS rd_stats; + RD_STATS rd_stats_y; + RD_STATS rd_stats_uv; + + av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, NULL, bsize); + if (mbmi->motion_mode == OBMC_CAUSAL) + av1_build_obmc_inter_predictors_sb(cm, xd, mi_row, mi_col); + + if (!txfm_search(cpi, x, bsize, mi_row, mi_col, &rd_stats, &rd_stats_y, + &rd_stats_uv, mode_rate, search_state.best_rd)) { + continue; + } else { + const int skip_ctx = av1_get_skip_context(xd); + inter_mode_data_push(tile_data, mbmi->sb_type, rd_stats.sse, + rd_stats.dist, + rd_stats_y.rate + rd_stats_uv.rate + + x->skip_cost[skip_ctx][mbmi->skip]); + } + rd_stats.rdcost = RDCOST(x->rdmult, rd_stats.rate, rd_stats.dist); + + if (rd_stats.rdcost < search_state.best_rd) { + search_state.best_rd = rd_stats.rdcost; + // Note index of best mode so far + const int mode_index = get_prediction_mode_idx( + mbmi->mode, mbmi->ref_frame[0], mbmi->ref_frame[1]); + search_state.best_mode_index = mode_index; + *rd_cost = rd_stats; + search_state.best_rd = rd_stats.rdcost; + search_state.best_mbmode = *mbmi; + search_state.best_skip2 = mbmi->skip; + search_state.best_mode_skippable = rd_stats.skip; + search_state.best_rate_y = + rd_stats_y.rate + + x->skip_cost[av1_get_skip_context(xd)][rd_stats.skip || mbmi->skip]; + search_state.best_rate_uv = rd_stats_uv.rate; + memcpy(ctx->blk_skip, x->blk_skip, + sizeof(x->blk_skip[0]) * ctx->num_4x4_blk); } } + } +#endif - if (x->skip && !comp_pred) break; + for (int j = 0; j < intra_mode_num; ++j) { + const int mode_index = intra_mode_idx_ls[j]; + const MV_REFERENCE_FRAME ref_frame = + av1_mode_order[mode_index].ref_frame[0]; + assert(av1_mode_order[mode_index].ref_frame[1] == NONE_FRAME); + assert(ref_frame == INTRA_FRAME); + if (sf->skip_intra_in_interframe && search_state.skip_intra_modes) break; + init_mbmi(mbmi, mode_index, cm); + x->skip = 0; + set_ref_ptrs(cm, xd, INTRA_FRAME, NONE_FRAME); + + // Select prediction reference frames. + for (i = 0; i < num_planes; i++) { + xd->plane[i].pre[0] = yv12_mb[ref_frame][i]; + } + + RD_STATS intra_rd_stats, intra_rd_stats_y, intra_rd_stats_uv; + + const int ref_frame_cost = ref_costs_single[ref_frame]; + intra_rd_stats.rdcost = handle_intra_mode( + &search_state, cpi, x, bsize, mi_row, mi_col, ref_frame_cost, ctx, 0, + &intra_rd_stats, &intra_rd_stats_y, &intra_rd_stats_uv); + if (intra_rd_stats.rdcost < search_state.best_rd) { + search_state.best_rd = intra_rd_stats.rdcost; + // Note index of best mode so far + search_state.best_mode_index = mode_index; + *rd_cost = intra_rd_stats; + search_state.best_rd = intra_rd_stats.rdcost; + search_state.best_mbmode = *mbmi; + search_state.best_skip2 = 0; + search_state.best_mode_skippable = intra_rd_stats.skip; + search_state.best_rate_y = + intra_rd_stats_y.rate + + x->skip_cost[av1_get_skip_context(xd)][intra_rd_stats.skip]; + search_state.best_rate_uv = intra_rd_stats_uv.rate; + memcpy(ctx->blk_skip, x->blk_skip, + sizeof(x->blk_skip[0]) * ctx->num_4x4_blk); + } } // In effect only when speed >= 2. @@ -10635,7 +11786,7 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, // Only try palette mode when the best mode so far is an intra mode. if (try_palette && !is_inter_mode(search_state.best_mbmode.mode)) { - search_palette_mode(cpi, x, rd_cost, ctx, bsize, mbmi, pmi, + search_palette_mode(cpi, x, mi_row, mi_col, rd_cost, ctx, bsize, mbmi, pmi, ref_costs_single, &search_state); } @@ -10776,11 +11927,11 @@ void av1_rd_pick_inter_mode_sb_seg_skip(const AV1_COMP *cpi, av1_count_overlappable_neighbors(cm, xd, mi_row, mi_col); if (is_motion_variation_allowed_bsize(bsize) && !has_second_ref(mbmi)) { int pts[SAMPLES_ARRAY_SIZE], pts_inref[SAMPLES_ARRAY_SIZE]; - mbmi->num_proj_ref[0] = findSamples(cm, xd, mi_row, mi_col, pts, pts_inref); + mbmi->num_proj_ref = findSamples(cm, xd, mi_row, mi_col, pts, pts_inref); // Select the samples according to motion vector difference - if (mbmi->num_proj_ref[0] > 1) - mbmi->num_proj_ref[0] = selectSamples(&mbmi->mv[0].as_mv, pts, pts_inref, - mbmi->num_proj_ref[0], bsize); + if (mbmi->num_proj_ref > 1) + mbmi->num_proj_ref = selectSamples(&mbmi->mv[0].as_mv, pts, pts_inref, + mbmi->num_proj_ref, bsize); } set_default_interp_filters(mbmi, cm->interp_filter); @@ -10853,7 +12004,7 @@ static INLINE void calc_target_weighted_pred_above( struct calc_target_weighted_pred_ctxt *ctxt = (struct calc_target_weighted_pred_ctxt *)fun_ctxt; - const int bw = xd->n8_w << MI_SIZE_LOG2; + const int bw = xd->n4_w << MI_SIZE_LOG2; const uint8_t *const mask1d = av1_get_obmc_mask(ctxt->overlap); int32_t *wsrc = ctxt->x->wsrc_buf + (rel_mi_col * MI_SIZE); @@ -10899,7 +12050,7 @@ static INLINE void calc_target_weighted_pred_left( struct calc_target_weighted_pred_ctxt *ctxt = (struct calc_target_weighted_pred_ctxt *)fun_ctxt; - const int bw = xd->n8_w << MI_SIZE_LOG2; + const int bw = xd->n4_w << MI_SIZE_LOG2; const uint8_t *const mask1d = av1_get_obmc_mask(ctxt->overlap); int32_t *wsrc = ctxt->x->wsrc_buf + (rel_mi_row * MI_SIZE * bw); @@ -10982,8 +12133,8 @@ static void calc_target_weighted_pred(const AV1_COMMON *cm, const MACROBLOCK *x, int above_stride, const uint8_t *left, int left_stride) { const BLOCK_SIZE bsize = xd->mi[0]->sb_type; - const int bw = xd->n8_w << MI_SIZE_LOG2; - const int bh = xd->n8_h << MI_SIZE_LOG2; + const int bw = xd->n4_w << MI_SIZE_LOG2; + const int bh = xd->n4_h << MI_SIZE_LOG2; int32_t *mask_buf = x->mask_buf; int32_t *wsrc_buf = x->wsrc_buf; diff --git a/third_party/aom/av1/encoder/rdopt.h b/third_party/aom/av1/encoder/rdopt.h index 12df472c1..4c11f90b8 100644 --- a/third_party/aom/av1/encoder/rdopt.h +++ b/third_party/aom/av1/encoder/rdopt.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_ENCODER_RDOPT_H_ -#define AV1_ENCODER_RDOPT_H_ +#ifndef AOM_AV1_ENCODER_RDOPT_H_ +#define AOM_AV1_ENCODER_RDOPT_H_ #include "av1/common/blockd.h" #include "av1/common/txb_common.h" @@ -25,6 +25,10 @@ extern "C" { #endif #define MAX_REF_MV_SERCH 3 +#define DEFAULT_LUMA_INTERP_SKIP_FLAG 1 +#define DEFAULT_CHROMA_INTERP_SKIP_FLAG 2 +#define DEFAULT_INTERP_SKIP_FLAG \ + (DEFAULT_LUMA_INTERP_SKIP_FLAG | DEFAULT_CHROMA_INTERP_SKIP_FLAG) struct TileInfo; struct macroblock; @@ -111,7 +115,7 @@ unsigned int av1_high_get_sby_perpixel_variance(const struct AV1_COMP *cpi, const struct buf_2d *ref, BLOCK_SIZE bs, int bd); -void av1_rd_pick_inter_mode_sb(const struct AV1_COMP *cpi, +void av1_rd_pick_inter_mode_sb(struct AV1_COMP *cpi, struct TileDataEnc *tile_data, struct macroblock *x, int mi_row, int mi_col, struct RD_STATS *rd_cost, BLOCK_SIZE bsize, @@ -123,14 +127,12 @@ void av1_rd_pick_inter_mode_sb_seg_skip( BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, int64_t best_rd_so_far); #if CONFIG_COLLECT_INTER_MODE_RD_STATS -#define INTER_MODE_RD_TEST 0 -void av1_inter_mode_data_init(); -void av1_inter_mode_data_fit(int rdmult); -void av1_inter_mode_data_show(const AV1_COMMON *cm); +void av1_inter_mode_data_init(struct TileDataEnc *tile_data); +void av1_inter_mode_data_fit(TileDataEnc *tile_data, int rdmult); #endif #ifdef __cplusplus } // extern "C" #endif -#endif // AV1_ENCODER_RDOPT_H_ +#endif // AOM_AV1_ENCODER_RDOPT_H_ diff --git a/third_party/aom/av1/encoder/reconinter_enc.c b/third_party/aom/av1/encoder/reconinter_enc.c new file mode 100644 index 000000000..23d920fc3 --- /dev/null +++ b/third_party/aom/av1/encoder/reconinter_enc.c @@ -0,0 +1,627 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <assert.h> +#include <stdio.h> +#include <limits.h> + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" +#include "config/aom_scale_rtcd.h" + +#include "aom/aom_integer.h" +#include "aom_dsp/blend.h" + +#include "av1/common/blockd.h" +#include "av1/common/mvref_common.h" +#include "av1/common/reconinter.h" +#include "av1/common/reconintra.h" +#include "av1/common/onyxc_int.h" +#include "av1/common/obmc.h" +#include "av1/encoder/reconinter_enc.h" + +static INLINE void calc_subpel_params( + MACROBLOCKD *xd, const struct scale_factors *const sf, const MV mv, + int plane, const int pre_x, const int pre_y, int x, int y, + struct buf_2d *const pre_buf, uint8_t **pre, SubpelParams *subpel_params, + int bw, int bh) { + struct macroblockd_plane *const pd = &xd->plane[plane]; + const int is_scaled = av1_is_scaled(sf); + if (is_scaled) { + int ssx = pd->subsampling_x; + int ssy = pd->subsampling_y; + int orig_pos_y = (pre_y + y) << SUBPEL_BITS; + orig_pos_y += mv.row * (1 << (1 - ssy)); + int orig_pos_x = (pre_x + x) << SUBPEL_BITS; + orig_pos_x += mv.col * (1 << (1 - ssx)); + int pos_y = sf->scale_value_y(orig_pos_y, sf); + int pos_x = sf->scale_value_x(orig_pos_x, sf); + pos_x += SCALE_EXTRA_OFF; + pos_y += SCALE_EXTRA_OFF; + + const int top = -AOM_LEFT_TOP_MARGIN_SCALED(ssy); + const int left = -AOM_LEFT_TOP_MARGIN_SCALED(ssx); + const int bottom = (pre_buf->height + AOM_INTERP_EXTEND) + << SCALE_SUBPEL_BITS; + const int right = (pre_buf->width + AOM_INTERP_EXTEND) << SCALE_SUBPEL_BITS; + pos_y = clamp(pos_y, top, bottom); + pos_x = clamp(pos_x, left, right); + + *pre = pre_buf->buf0 + (pos_y >> SCALE_SUBPEL_BITS) * pre_buf->stride + + (pos_x >> SCALE_SUBPEL_BITS); + subpel_params->subpel_x = pos_x & SCALE_SUBPEL_MASK; + subpel_params->subpel_y = pos_y & SCALE_SUBPEL_MASK; + subpel_params->xs = sf->x_step_q4; + subpel_params->ys = sf->y_step_q4; + } else { + const MV mv_q4 = clamp_mv_to_umv_border_sb( + xd, &mv, bw, bh, pd->subsampling_x, pd->subsampling_y); + subpel_params->xs = subpel_params->ys = SCALE_SUBPEL_SHIFTS; + subpel_params->subpel_x = (mv_q4.col & SUBPEL_MASK) << SCALE_EXTRA_BITS; + subpel_params->subpel_y = (mv_q4.row & SUBPEL_MASK) << SCALE_EXTRA_BITS; + *pre = pre_buf->buf + (y + (mv_q4.row >> SUBPEL_BITS)) * pre_buf->stride + + (x + (mv_q4.col >> SUBPEL_BITS)); + } +} + +static INLINE void build_inter_predictors(const AV1_COMMON *cm, MACROBLOCKD *xd, + int plane, const MB_MODE_INFO *mi, + int build_for_obmc, int bw, int bh, + int mi_x, int mi_y) { + struct macroblockd_plane *const pd = &xd->plane[plane]; + int is_compound = has_second_ref(mi); + int ref; + const int is_intrabc = is_intrabc_block(mi); + assert(IMPLIES(is_intrabc, !is_compound)); + int is_global[2] = { 0, 0 }; + for (ref = 0; ref < 1 + is_compound; ++ref) { + const WarpedMotionParams *const wm = &xd->global_motion[mi->ref_frame[ref]]; + is_global[ref] = is_global_mv_block(mi, wm->wmtype); + } + + const BLOCK_SIZE bsize = mi->sb_type; + const int ss_x = pd->subsampling_x; + const int ss_y = pd->subsampling_y; + int sub8x8_inter = (block_size_wide[bsize] < 8 && ss_x) || + (block_size_high[bsize] < 8 && ss_y); + + if (is_intrabc) sub8x8_inter = 0; + + // For sub8x8 chroma blocks, we may be covering more than one luma block's + // worth of pixels. Thus (mi_x, mi_y) may not be the correct coordinates for + // the top-left corner of the prediction source - the correct top-left corner + // is at (pre_x, pre_y). + const int row_start = + (block_size_high[bsize] == 4) && ss_y && !build_for_obmc ? -1 : 0; + const int col_start = + (block_size_wide[bsize] == 4) && ss_x && !build_for_obmc ? -1 : 0; + const int pre_x = (mi_x + MI_SIZE * col_start) >> ss_x; + const int pre_y = (mi_y + MI_SIZE * row_start) >> ss_y; + + sub8x8_inter = sub8x8_inter && !build_for_obmc; + if (sub8x8_inter) { + for (int row = row_start; row <= 0 && sub8x8_inter; ++row) { + for (int col = col_start; col <= 0; ++col) { + const MB_MODE_INFO *this_mbmi = xd->mi[row * xd->mi_stride + col]; + if (!is_inter_block(this_mbmi)) sub8x8_inter = 0; + if (is_intrabc_block(this_mbmi)) sub8x8_inter = 0; + } + } + } + + if (sub8x8_inter) { + // block size + const int b4_w = block_size_wide[bsize] >> ss_x; + const int b4_h = block_size_high[bsize] >> ss_y; + const BLOCK_SIZE plane_bsize = scale_chroma_bsize(bsize, ss_x, ss_y); + const int b8_w = block_size_wide[plane_bsize] >> ss_x; + const int b8_h = block_size_high[plane_bsize] >> ss_y; + assert(!is_compound); + + const struct buf_2d orig_pred_buf[2] = { pd->pre[0], pd->pre[1] }; + + int row = row_start; + for (int y = 0; y < b8_h; y += b4_h) { + int col = col_start; + for (int x = 0; x < b8_w; x += b4_w) { + MB_MODE_INFO *this_mbmi = xd->mi[row * xd->mi_stride + col]; + is_compound = has_second_ref(this_mbmi); + int tmp_dst_stride = 8; + assert(bw < 8 || bh < 8); + ConvolveParams conv_params = get_conv_params_no_round( + 0, plane, xd->tmp_conv_dst, tmp_dst_stride, is_compound, xd->bd); + conv_params.use_jnt_comp_avg = 0; + struct buf_2d *const dst_buf = &pd->dst; + uint8_t *dst = dst_buf->buf + dst_buf->stride * y + x; + + ref = 0; + const RefBuffer *ref_buf = + &cm->frame_refs[this_mbmi->ref_frame[ref] - LAST_FRAME]; + + pd->pre[ref].buf0 = + (plane == 1) ? ref_buf->buf->u_buffer : ref_buf->buf->v_buffer; + pd->pre[ref].buf = + pd->pre[ref].buf0 + scaled_buffer_offset(pre_x, pre_y, + ref_buf->buf->uv_stride, + &ref_buf->sf); + pd->pre[ref].width = ref_buf->buf->uv_crop_width; + pd->pre[ref].height = ref_buf->buf->uv_crop_height; + pd->pre[ref].stride = ref_buf->buf->uv_stride; + + const struct scale_factors *const sf = + is_intrabc ? &cm->sf_identity : &ref_buf->sf; + struct buf_2d *const pre_buf = is_intrabc ? dst_buf : &pd->pre[ref]; + + const MV mv = this_mbmi->mv[ref].as_mv; + + uint8_t *pre; + SubpelParams subpel_params; + WarpTypesAllowed warp_types; + warp_types.global_warp_allowed = is_global[ref]; + warp_types.local_warp_allowed = this_mbmi->motion_mode == WARPED_CAUSAL; + + calc_subpel_params(xd, sf, mv, plane, pre_x, pre_y, x, y, pre_buf, &pre, + &subpel_params, bw, bh); + conv_params.do_average = ref; + if (is_masked_compound_type(mi->interinter_comp.type)) { + // masked compound type has its own average mechanism + conv_params.do_average = 0; + } + + av1_make_inter_predictor( + pre, pre_buf->stride, dst, dst_buf->stride, &subpel_params, sf, + b4_w, b4_h, &conv_params, this_mbmi->interp_filters, &warp_types, + (mi_x >> pd->subsampling_x) + x, (mi_y >> pd->subsampling_y) + y, + plane, ref, mi, build_for_obmc, xd, cm->allow_warped_motion); + + ++col; + } + ++row; + } + + for (ref = 0; ref < 2; ++ref) pd->pre[ref] = orig_pred_buf[ref]; + return; + } + + { + ConvolveParams conv_params = get_conv_params_no_round( + 0, plane, xd->tmp_conv_dst, MAX_SB_SIZE, is_compound, xd->bd); + av1_jnt_comp_weight_assign(cm, mi, 0, &conv_params.fwd_offset, + &conv_params.bck_offset, + &conv_params.use_jnt_comp_avg, is_compound); + + struct buf_2d *const dst_buf = &pd->dst; + uint8_t *const dst = dst_buf->buf; + for (ref = 0; ref < 1 + is_compound; ++ref) { + const struct scale_factors *const sf = + is_intrabc ? &cm->sf_identity : &xd->block_refs[ref]->sf; + struct buf_2d *const pre_buf = is_intrabc ? dst_buf : &pd->pre[ref]; + const MV mv = mi->mv[ref].as_mv; + + uint8_t *pre; + SubpelParams subpel_params; + calc_subpel_params(xd, sf, mv, plane, pre_x, pre_y, 0, 0, pre_buf, &pre, + &subpel_params, bw, bh); + + WarpTypesAllowed warp_types; + warp_types.global_warp_allowed = is_global[ref]; + warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL; + + if (ref && is_masked_compound_type(mi->interinter_comp.type)) { + // masked compound type has its own average mechanism + conv_params.do_average = 0; + av1_make_masked_inter_predictor( + pre, pre_buf->stride, dst, dst_buf->stride, &subpel_params, sf, bw, + bh, &conv_params, mi->interp_filters, plane, &warp_types, + mi_x >> pd->subsampling_x, mi_y >> pd->subsampling_y, ref, xd, + cm->allow_warped_motion); + } else { + conv_params.do_average = ref; + av1_make_inter_predictor( + pre, pre_buf->stride, dst, dst_buf->stride, &subpel_params, sf, bw, + bh, &conv_params, mi->interp_filters, &warp_types, + mi_x >> pd->subsampling_x, mi_y >> pd->subsampling_y, plane, ref, + mi, build_for_obmc, xd, cm->allow_warped_motion); + } + } + } +} + +static void build_inter_predictors_for_planes(const AV1_COMMON *cm, + MACROBLOCKD *xd, BLOCK_SIZE bsize, + int mi_row, int mi_col, + int plane_from, int plane_to) { + int plane; + const int mi_x = mi_col * MI_SIZE; + const int mi_y = mi_row * MI_SIZE; + for (plane = plane_from; plane <= plane_to; ++plane) { + const struct macroblockd_plane *pd = &xd->plane[plane]; + const int bw = pd->width; + const int bh = pd->height; + + if (!is_chroma_reference(mi_row, mi_col, bsize, pd->subsampling_x, + pd->subsampling_y)) + continue; + + build_inter_predictors(cm, xd, plane, xd->mi[0], 0, bw, bh, mi_x, mi_y); + } +} + +void av1_build_inter_predictors_sby(const AV1_COMMON *cm, MACROBLOCKD *xd, + int mi_row, int mi_col, BUFFER_SET *ctx, + BLOCK_SIZE bsize) { + av1_build_inter_predictors_sbp(cm, xd, mi_row, mi_col, ctx, bsize, 0); +} + +void av1_build_inter_predictors_sbuv(const AV1_COMMON *cm, MACROBLOCKD *xd, + int mi_row, int mi_col, BUFFER_SET *ctx, + BLOCK_SIZE bsize) { + for (int plane_idx = 1; plane_idx < MAX_MB_PLANE; plane_idx++) { + av1_build_inter_predictors_sbp(cm, xd, mi_row, mi_col, ctx, bsize, + plane_idx); + } +} + +void av1_build_inter_predictors_sbp(const AV1_COMMON *cm, MACROBLOCKD *xd, + int mi_row, int mi_col, BUFFER_SET *ctx, + BLOCK_SIZE bsize, int plane_idx) { + build_inter_predictors_for_planes(cm, xd, bsize, mi_row, mi_col, plane_idx, + plane_idx); + + if (is_interintra_pred(xd->mi[0])) { + BUFFER_SET default_ctx = { { NULL, NULL, NULL }, { 0, 0, 0 } }; + if (!ctx) { + default_ctx.plane[plane_idx] = xd->plane[plane_idx].dst.buf; + default_ctx.stride[plane_idx] = xd->plane[plane_idx].dst.stride; + ctx = &default_ctx; + } + av1_build_interintra_predictors_sbp(cm, xd, xd->plane[plane_idx].dst.buf, + xd->plane[plane_idx].dst.stride, ctx, + plane_idx, bsize); + } +} + +void av1_build_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd, + int mi_row, int mi_col, BUFFER_SET *ctx, + BLOCK_SIZE bsize) { + const int num_planes = av1_num_planes(cm); + av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, ctx, bsize); + if (num_planes > 1) + av1_build_inter_predictors_sbuv(cm, xd, mi_row, mi_col, ctx, bsize); +} + +// TODO(sarahparker): +// av1_build_inter_predictor should be combined with +// av1_make_inter_predictor +void av1_build_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, const MV *src_mv, + const struct scale_factors *sf, int w, int h, + ConvolveParams *conv_params, + InterpFilters interp_filters, + const WarpTypesAllowed *warp_types, int p_col, + int p_row, int plane, int ref, + enum mv_precision precision, int x, int y, + const MACROBLOCKD *xd, int can_use_previous) { + const int is_q4 = precision == MV_PRECISION_Q4; + const MV mv_q4 = { is_q4 ? src_mv->row : src_mv->row * 2, + is_q4 ? src_mv->col : src_mv->col * 2 }; + MV32 mv = av1_scale_mv(&mv_q4, x, y, sf); + mv.col += SCALE_EXTRA_OFF; + mv.row += SCALE_EXTRA_OFF; + + const SubpelParams subpel_params = { sf->x_step_q4, sf->y_step_q4, + mv.col & SCALE_SUBPEL_MASK, + mv.row & SCALE_SUBPEL_MASK }; + src += (mv.row >> SCALE_SUBPEL_BITS) * src_stride + + (mv.col >> SCALE_SUBPEL_BITS); + + av1_make_inter_predictor(src, src_stride, dst, dst_stride, &subpel_params, sf, + w, h, conv_params, interp_filters, warp_types, p_col, + p_row, plane, ref, xd->mi[0], 0, xd, + can_use_previous); +} + +static INLINE void build_prediction_by_above_pred( + MACROBLOCKD *xd, int rel_mi_col, uint8_t above_mi_width, + MB_MODE_INFO *above_mbmi, void *fun_ctxt, const int num_planes) { + struct build_prediction_ctxt *ctxt = (struct build_prediction_ctxt *)fun_ctxt; + const int above_mi_col = ctxt->mi_col + rel_mi_col; + int mi_x, mi_y; + MB_MODE_INFO backup_mbmi = *above_mbmi; + + av1_setup_build_prediction_by_above_pred(xd, rel_mi_col, above_mi_width, + above_mbmi, ctxt, num_planes); + mi_x = above_mi_col << MI_SIZE_LOG2; + mi_y = ctxt->mi_row << MI_SIZE_LOG2; + + const BLOCK_SIZE bsize = xd->mi[0]->sb_type; + + for (int j = 0; j < num_planes; ++j) { + const struct macroblockd_plane *pd = &xd->plane[j]; + int bw = (above_mi_width * MI_SIZE) >> pd->subsampling_x; + int bh = clamp(block_size_high[bsize] >> (pd->subsampling_y + 1), 4, + block_size_high[BLOCK_64X64] >> (pd->subsampling_y + 1)); + + if (av1_skip_u4x4_pred_in_obmc(bsize, pd, 0)) continue; + build_inter_predictors(ctxt->cm, xd, j, above_mbmi, 1, bw, bh, mi_x, mi_y); + } + *above_mbmi = backup_mbmi; +} + +void av1_build_prediction_by_above_preds(const AV1_COMMON *cm, MACROBLOCKD *xd, + int mi_row, int mi_col, + uint8_t *tmp_buf[MAX_MB_PLANE], + int tmp_width[MAX_MB_PLANE], + int tmp_height[MAX_MB_PLANE], + int tmp_stride[MAX_MB_PLANE]) { + if (!xd->up_available) return; + + // Adjust mb_to_bottom_edge to have the correct value for the OBMC + // prediction block. This is half the height of the original block, + // except for 128-wide blocks, where we only use a height of 32. + int this_height = xd->n4_h * MI_SIZE; + int pred_height = AOMMIN(this_height / 2, 32); + xd->mb_to_bottom_edge += (this_height - pred_height) * 8; + + struct build_prediction_ctxt ctxt = { cm, mi_row, + mi_col, tmp_buf, + tmp_width, tmp_height, + tmp_stride, xd->mb_to_right_edge }; + BLOCK_SIZE bsize = xd->mi[0]->sb_type; + foreach_overlappable_nb_above(cm, xd, mi_col, + max_neighbor_obmc[mi_size_wide_log2[bsize]], + build_prediction_by_above_pred, &ctxt); + + xd->mb_to_left_edge = -((mi_col * MI_SIZE) * 8); + xd->mb_to_right_edge = ctxt.mb_to_far_edge; + xd->mb_to_bottom_edge -= (this_height - pred_height) * 8; +} + +static INLINE void build_prediction_by_left_pred( + MACROBLOCKD *xd, int rel_mi_row, uint8_t left_mi_height, + MB_MODE_INFO *left_mbmi, void *fun_ctxt, const int num_planes) { + struct build_prediction_ctxt *ctxt = (struct build_prediction_ctxt *)fun_ctxt; + const int left_mi_row = ctxt->mi_row + rel_mi_row; + int mi_x, mi_y; + MB_MODE_INFO backup_mbmi = *left_mbmi; + + av1_setup_build_prediction_by_left_pred(xd, rel_mi_row, left_mi_height, + left_mbmi, ctxt, num_planes); + mi_x = ctxt->mi_col << MI_SIZE_LOG2; + mi_y = left_mi_row << MI_SIZE_LOG2; + const BLOCK_SIZE bsize = xd->mi[0]->sb_type; + + for (int j = 0; j < num_planes; ++j) { + const struct macroblockd_plane *pd = &xd->plane[j]; + int bw = clamp(block_size_wide[bsize] >> (pd->subsampling_x + 1), 4, + block_size_wide[BLOCK_64X64] >> (pd->subsampling_x + 1)); + int bh = (left_mi_height << MI_SIZE_LOG2) >> pd->subsampling_y; + + if (av1_skip_u4x4_pred_in_obmc(bsize, pd, 1)) continue; + build_inter_predictors(ctxt->cm, xd, j, left_mbmi, 1, bw, bh, mi_x, mi_y); + } + *left_mbmi = backup_mbmi; +} + +void av1_build_prediction_by_left_preds(const AV1_COMMON *cm, MACROBLOCKD *xd, + int mi_row, int mi_col, + uint8_t *tmp_buf[MAX_MB_PLANE], + int tmp_width[MAX_MB_PLANE], + int tmp_height[MAX_MB_PLANE], + int tmp_stride[MAX_MB_PLANE]) { + if (!xd->left_available) return; + + // Adjust mb_to_right_edge to have the correct value for the OBMC + // prediction block. This is half the width of the original block, + // except for 128-wide blocks, where we only use a width of 32. + int this_width = xd->n4_w * MI_SIZE; + int pred_width = AOMMIN(this_width / 2, 32); + xd->mb_to_right_edge += (this_width - pred_width) * 8; + + struct build_prediction_ctxt ctxt = { cm, mi_row, + mi_col, tmp_buf, + tmp_width, tmp_height, + tmp_stride, xd->mb_to_bottom_edge }; + BLOCK_SIZE bsize = xd->mi[0]->sb_type; + foreach_overlappable_nb_left(cm, xd, mi_row, + max_neighbor_obmc[mi_size_high_log2[bsize]], + build_prediction_by_left_pred, &ctxt); + + xd->mb_to_top_edge = -((mi_row * MI_SIZE) * 8); + xd->mb_to_right_edge -= (this_width - pred_width) * 8; + xd->mb_to_bottom_edge = ctxt.mb_to_far_edge; +} + +void av1_build_obmc_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd, + int mi_row, int mi_col) { + const int num_planes = av1_num_planes(cm); + uint8_t *dst_buf1[MAX_MB_PLANE], *dst_buf2[MAX_MB_PLANE]; + int dst_stride1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; + int dst_stride2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; + int dst_width1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; + int dst_width2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; + int dst_height1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; + int dst_height2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; + + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + int len = sizeof(uint16_t); + dst_buf1[0] = CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[0]); + dst_buf1[1] = + CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE * len); + dst_buf1[2] = + CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE * 2 * len); + dst_buf2[0] = CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[1]); + dst_buf2[1] = + CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE * len); + dst_buf2[2] = + CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE * 2 * len); + } else { + dst_buf1[0] = xd->tmp_obmc_bufs[0]; + dst_buf1[1] = xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE; + dst_buf1[2] = xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE * 2; + dst_buf2[0] = xd->tmp_obmc_bufs[1]; + dst_buf2[1] = xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE; + dst_buf2[2] = xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE * 2; + } + av1_build_prediction_by_above_preds(cm, xd, mi_row, mi_col, dst_buf1, + dst_width1, dst_height1, dst_stride1); + av1_build_prediction_by_left_preds(cm, xd, mi_row, mi_col, dst_buf2, + dst_width2, dst_height2, dst_stride2); + av1_setup_dst_planes(xd->plane, xd->mi[0]->sb_type, get_frame_new_buffer(cm), + mi_row, mi_col, 0, num_planes); + av1_build_obmc_inter_prediction(cm, xd, mi_row, mi_col, dst_buf1, dst_stride1, + dst_buf2, dst_stride2); +} + +// Builds the inter-predictor for the single ref case +// for use in the encoder to search the wedges efficiently. +static void build_inter_predictors_single_buf(MACROBLOCKD *xd, int plane, + int bw, int bh, int x, int y, + int w, int h, int mi_x, int mi_y, + int ref, uint8_t *const ext_dst, + int ext_dst_stride, + int can_use_previous) { + struct macroblockd_plane *const pd = &xd->plane[plane]; + const MB_MODE_INFO *mi = xd->mi[0]; + + const struct scale_factors *const sf = &xd->block_refs[ref]->sf; + struct buf_2d *const pre_buf = &pd->pre[ref]; + uint8_t *const dst = get_buf_by_bd(xd, ext_dst) + ext_dst_stride * y + x; + const MV mv = mi->mv[ref].as_mv; + + ConvolveParams conv_params = get_conv_params(0, plane, xd->bd); + WarpTypesAllowed warp_types; + const WarpedMotionParams *const wm = &xd->global_motion[mi->ref_frame[ref]]; + warp_types.global_warp_allowed = is_global_mv_block(mi, wm->wmtype); + warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL; + const int pre_x = (mi_x) >> pd->subsampling_x; + const int pre_y = (mi_y) >> pd->subsampling_y; + uint8_t *pre; + SubpelParams subpel_params; + calc_subpel_params(xd, sf, mv, plane, pre_x, pre_y, x, y, pre_buf, &pre, + &subpel_params, bw, bh); + + av1_make_inter_predictor(pre, pre_buf->stride, dst, ext_dst_stride, + &subpel_params, sf, w, h, &conv_params, + mi->interp_filters, &warp_types, pre_x + x, + pre_y + y, plane, ref, mi, 0, xd, can_use_previous); +} + +void av1_build_inter_predictors_for_planes_single_buf( + MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane_from, int plane_to, int mi_row, + int mi_col, int ref, uint8_t *ext_dst[3], int ext_dst_stride[3], + int can_use_previous) { + int plane; + const int mi_x = mi_col * MI_SIZE; + const int mi_y = mi_row * MI_SIZE; + for (plane = plane_from; plane <= plane_to; ++plane) { + const BLOCK_SIZE plane_bsize = get_plane_block_size( + bsize, xd->plane[plane].subsampling_x, xd->plane[plane].subsampling_y); + const int bw = block_size_wide[plane_bsize]; + const int bh = block_size_high[plane_bsize]; + build_inter_predictors_single_buf(xd, plane, bw, bh, 0, 0, bw, bh, mi_x, + mi_y, ref, ext_dst[plane], + ext_dst_stride[plane], can_use_previous); + } +} + +static void build_masked_compound( + uint8_t *dst, int dst_stride, const uint8_t *src0, int src0_stride, + const uint8_t *src1, int src1_stride, + const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE sb_type, int h, + int w) { + // Derive subsampling from h and w passed in. May be refactored to + // pass in subsampling factors directly. + const int subh = (2 << mi_size_high_log2[sb_type]) == h; + const int subw = (2 << mi_size_wide_log2[sb_type]) == w; + const uint8_t *mask = av1_get_compound_type_mask(comp_data, sb_type); + aom_blend_a64_mask(dst, dst_stride, src0, src0_stride, src1, src1_stride, + mask, block_size_wide[sb_type], w, h, subw, subh); +} + +static void build_masked_compound_highbd( + uint8_t *dst_8, int dst_stride, const uint8_t *src0_8, int src0_stride, + const uint8_t *src1_8, int src1_stride, + const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE sb_type, int h, + int w, int bd) { + // Derive subsampling from h and w passed in. May be refactored to + // pass in subsampling factors directly. + const int subh = (2 << mi_size_high_log2[sb_type]) == h; + const int subw = (2 << mi_size_wide_log2[sb_type]) == w; + const uint8_t *mask = av1_get_compound_type_mask(comp_data, sb_type); + // const uint8_t *mask = + // av1_get_contiguous_soft_mask(wedge_index, wedge_sign, sb_type); + aom_highbd_blend_a64_mask(dst_8, dst_stride, src0_8, src0_stride, src1_8, + src1_stride, mask, block_size_wide[sb_type], w, h, + subw, subh, bd); +} + +static void build_wedge_inter_predictor_from_buf( + MACROBLOCKD *xd, int plane, int x, int y, int w, int h, uint8_t *ext_dst0, + int ext_dst_stride0, uint8_t *ext_dst1, int ext_dst_stride1) { + MB_MODE_INFO *const mbmi = xd->mi[0]; + const int is_compound = has_second_ref(mbmi); + MACROBLOCKD_PLANE *const pd = &xd->plane[plane]; + struct buf_2d *const dst_buf = &pd->dst; + uint8_t *const dst = dst_buf->buf + dst_buf->stride * y + x; + mbmi->interinter_comp.seg_mask = xd->seg_mask; + const INTERINTER_COMPOUND_DATA *comp_data = &mbmi->interinter_comp; + + if (is_compound && is_masked_compound_type(comp_data->type)) { + if (!plane && comp_data->type == COMPOUND_DIFFWTD) { + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) + av1_build_compound_diffwtd_mask_highbd( + comp_data->seg_mask, comp_data->mask_type, + CONVERT_TO_BYTEPTR(ext_dst0), ext_dst_stride0, + CONVERT_TO_BYTEPTR(ext_dst1), ext_dst_stride1, h, w, xd->bd); + else + av1_build_compound_diffwtd_mask( + comp_data->seg_mask, comp_data->mask_type, ext_dst0, + ext_dst_stride0, ext_dst1, ext_dst_stride1, h, w); + } + + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) + build_masked_compound_highbd( + dst, dst_buf->stride, CONVERT_TO_BYTEPTR(ext_dst0), ext_dst_stride0, + CONVERT_TO_BYTEPTR(ext_dst1), ext_dst_stride1, comp_data, + mbmi->sb_type, h, w, xd->bd); + else + build_masked_compound(dst, dst_buf->stride, ext_dst0, ext_dst_stride0, + ext_dst1, ext_dst_stride1, comp_data, mbmi->sb_type, + h, w); + } else { + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) + aom_highbd_convolve_copy(CONVERT_TO_BYTEPTR(ext_dst0), ext_dst_stride0, + dst, dst_buf->stride, NULL, 0, NULL, 0, w, h, + xd->bd); + else + aom_convolve_copy(ext_dst0, ext_dst_stride0, dst, dst_buf->stride, NULL, + 0, NULL, 0, w, h); + } +} + +void av1_build_wedge_inter_predictor_from_buf(MACROBLOCKD *xd, BLOCK_SIZE bsize, + int plane_from, int plane_to, + uint8_t *ext_dst0[3], + int ext_dst_stride0[3], + uint8_t *ext_dst1[3], + int ext_dst_stride1[3]) { + int plane; + for (plane = plane_from; plane <= plane_to; ++plane) { + const BLOCK_SIZE plane_bsize = get_plane_block_size( + bsize, xd->plane[plane].subsampling_x, xd->plane[plane].subsampling_y); + const int bw = block_size_wide[plane_bsize]; + const int bh = block_size_high[plane_bsize]; + build_wedge_inter_predictor_from_buf( + xd, plane, 0, 0, bw, bh, ext_dst0[plane], ext_dst_stride0[plane], + ext_dst1[plane], ext_dst_stride1[plane]); + } +} diff --git a/third_party/aom/av1/encoder/reconinter_enc.h b/third_party/aom/av1/encoder/reconinter_enc.h new file mode 100644 index 000000000..10d5e8c28 --- /dev/null +++ b/third_party/aom/av1/encoder/reconinter_enc.h @@ -0,0 +1,127 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_RECONINTER_ENC_H_ +#define AOM_AV1_ENCODER_RECONINTER_ENC_H_ + +#include "aom/aom_integer.h" +#include "av1/common/filter.h" +#include "av1/common/blockd.h" +#include "av1/common/onyxc_int.h" +#include "av1/common/convolve.h" +#include "av1/common/warped_motion.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void av1_build_inter_predictors_sby(const AV1_COMMON *cm, MACROBLOCKD *xd, + int mi_row, int mi_col, BUFFER_SET *ctx, + BLOCK_SIZE bsize); + +void av1_build_inter_predictors_sbuv(const AV1_COMMON *cm, MACROBLOCKD *xd, + int mi_row, int mi_col, BUFFER_SET *ctx, + BLOCK_SIZE bsize); + +void av1_build_inter_predictors_sbp(const AV1_COMMON *cm, MACROBLOCKD *xd, + int mi_row, int mi_col, BUFFER_SET *ctx, + BLOCK_SIZE bsize, int plane_idx); + +void av1_build_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd, + int mi_row, int mi_col, BUFFER_SET *ctx, + BLOCK_SIZE bsize); + +void av1_build_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, const MV *src_mv, + const struct scale_factors *sf, int w, int h, + ConvolveParams *conv_params, + InterpFilters interp_filters, + const WarpTypesAllowed *warp_types, int p_col, + int p_row, int plane, int ref, + enum mv_precision precision, int x, int y, + const MACROBLOCKD *xd, int can_use_previous); + +// Detect if the block have sub-pixel level motion vectors +// per component. +#define CHECK_SUBPEL 0 +static INLINE int has_subpel_mv_component(const MB_MODE_INFO *const mbmi, + const MACROBLOCKD *const xd, + int dir) { +#if CHECK_SUBPEL + const BLOCK_SIZE bsize = mbmi->sb_type; + int plane; + int ref = (dir >> 1); + + if (dir & 0x01) { + if (mbmi->mv[ref].as_mv.col & SUBPEL_MASK) return 1; + } else { + if (mbmi->mv[ref].as_mv.row & SUBPEL_MASK) return 1; + } + + return 0; +#else + (void)mbmi; + (void)xd; + (void)dir; + return 1; +#endif +} + +static INLINE int av1_is_interp_search_needed(const MACROBLOCKD *const xd) { + MB_MODE_INFO *const mi = xd->mi[0]; + const int is_compound = has_second_ref(mi); + int ref; + for (ref = 0; ref < 1 + is_compound; ++ref) { + int row_col; + for (row_col = 0; row_col < 2; ++row_col) { + const int dir = (ref << 1) + row_col; + if (has_subpel_mv_component(mi, xd, dir)) { + return 1; + } + } + } + return 0; +} + +void av1_build_prediction_by_above_preds(const AV1_COMMON *cm, MACROBLOCKD *xd, + int mi_row, int mi_col, + uint8_t *tmp_buf[MAX_MB_PLANE], + int tmp_width[MAX_MB_PLANE], + int tmp_height[MAX_MB_PLANE], + int tmp_stride[MAX_MB_PLANE]); + +void av1_build_prediction_by_left_preds(const AV1_COMMON *cm, MACROBLOCKD *xd, + int mi_row, int mi_col, + uint8_t *tmp_buf[MAX_MB_PLANE], + int tmp_width[MAX_MB_PLANE], + int tmp_height[MAX_MB_PLANE], + int tmp_stride[MAX_MB_PLANE]); + +void av1_build_obmc_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd, + int mi_row, int mi_col); + +void av1_build_inter_predictors_for_planes_single_buf( + MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane_from, int plane_to, int mi_row, + int mi_col, int ref, uint8_t *ext_dst[3], int ext_dst_stride[3], + int can_use_previous); + +void av1_build_wedge_inter_predictor_from_buf(MACROBLOCKD *xd, BLOCK_SIZE bsize, + int plane_from, int plane_to, + uint8_t *ext_dst0[3], + int ext_dst_stride0[3], + uint8_t *ext_dst1[3], + int ext_dst_stride1[3]); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_RECONINTER_ENC_H_ diff --git a/third_party/aom/av1/encoder/segmentation.h b/third_party/aom/av1/encoder/segmentation.h index a207b0f26..1ad13d66a 100644 --- a/third_party/aom/av1/encoder/segmentation.h +++ b/third_party/aom/av1/encoder/segmentation.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_ENCODER_SEGMENTATION_H_ -#define AV1_ENCODER_SEGMENTATION_H_ +#ifndef AOM_AV1_ENCODER_SEGMENTATION_H_ +#define AOM_AV1_ENCODER_SEGMENTATION_H_ #include "av1/common/blockd.h" #include "av1/encoder/encoder.h" @@ -35,4 +35,4 @@ void av1_reset_segment_features(AV1_COMMON *cm); } // extern "C" #endif -#endif // AV1_ENCODER_SEGMENTATION_H_ +#endif // AOM_AV1_ENCODER_SEGMENTATION_H_ diff --git a/third_party/aom/av1/encoder/speed_features.c b/third_party/aom/av1/encoder/speed_features.c index d4b4b19c4..4c35baae0 100644 --- a/third_party/aom/av1/encoder/speed_features.c +++ b/third_party/aom/av1/encoder/speed_features.c @@ -98,6 +98,15 @@ static void set_good_speed_feature_framesize_dependent(AV1_COMP *cpi, sf->use_square_partition_only_threshold = BLOCK_64X64; } + // TODO(huisu@google.com): train models for 720P and above. + if (!is_720p_or_larger) { + sf->ml_partition_search_breakout_thresh[0] = 200; // BLOCK_8X8 + sf->ml_partition_search_breakout_thresh[1] = 250; // BLOCK_16X16 + sf->ml_partition_search_breakout_thresh[2] = 300; // BLOCK_32X32 + sf->ml_partition_search_breakout_thresh[3] = 500; // BLOCK_64X64 + sf->ml_partition_search_breakout_thresh[4] = -1; // BLOCK_128X128 + } + if (speed >= 1) { if (is_720p_or_larger) { sf->use_square_partition_only_threshold = BLOCK_128X128; @@ -106,6 +115,14 @@ static void set_good_speed_feature_framesize_dependent(AV1_COMP *cpi, } else { sf->use_square_partition_only_threshold = BLOCK_32X32; } + + if (!is_720p_or_larger) { + sf->ml_partition_search_breakout_thresh[0] = 200; // BLOCK_8X8 + sf->ml_partition_search_breakout_thresh[1] = 250; // BLOCK_16X16 + sf->ml_partition_search_breakout_thresh[2] = 300; // BLOCK_32X32 + sf->ml_partition_search_breakout_thresh[3] = 300; // BLOCK_64X64 + sf->ml_partition_search_breakout_thresh[4] = -1; // BLOCK_128X128 + } } if (speed >= 2) { @@ -126,13 +143,11 @@ static void set_good_speed_feature_framesize_dependent(AV1_COMP *cpi, if (speed >= 3) { if (is_720p_or_larger) { sf->disable_split_mask = DISABLE_ALL_SPLIT; - sf->schedule_mode_search = cm->base_qindex < 220 ? 1 : 0; sf->partition_search_breakout_dist_thr = (1 << 25); sf->partition_search_breakout_rate_thr = 200; } else { sf->max_intra_bsize = BLOCK_32X32; sf->disable_split_mask = DISABLE_ALL_INTER_SPLIT; - sf->schedule_mode_search = cm->base_qindex < 175 ? 1 : 0; sf->partition_search_breakout_dist_thr = (1 << 23); sf->partition_search_breakout_rate_thr = 120; } @@ -166,6 +181,7 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi, // Speed 0 for all speed features that give neutral coding performance change. sf->reduce_inter_modes = 1; sf->prune_ext_partition_types_search_level = 1; + sf->ml_prune_rect_partition = 1; sf->ml_prune_ab_partition = 1; sf->ml_prune_4_partition = 1; sf->adaptive_txb_search_level = 1; @@ -173,6 +189,11 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi, sf->model_based_prune_tx_search_level = 1; sf->model_based_post_interp_filter_breakout = 1; sf->inter_mode_rd_model_estimation = 1; + sf->prune_ref_frame_for_rect_partitions = + !(boosted || cpi->refresh_bwd_ref_frame || cpi->refresh_alt2_ref_frame); + sf->less_rectangular_check_level = 1; + sf->gm_search_type = GM_REDUCED_REF_SEARCH; + sf->gm_disable_recode = 1; if (speed >= 1) { sf->gm_erroradv_type = GM_ERRORADV_TR_1; @@ -182,8 +203,10 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi, sf->intra_tx_size_search_init_depth_rect = 1; sf->intra_tx_size_search_init_depth_sqr = 1; sf->tx_size_search_lgr_block = 1; - sf->two_pass_partition_search = 1; - sf->mode_pruning_based_on_two_pass_partition_search = 1; + if (speed >= CONFIG_2PASS_PARTITION_SEARCH_LVL) { + sf->two_pass_partition_search = 1; + sf->mode_pruning_based_on_two_pass_partition_search = 1; + } sf->prune_ext_partition_types_search_level = 2; sf->use_fast_interpolation_filter_search = 1; sf->skip_repeat_interpolation_filter_search = 1; @@ -198,6 +221,11 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi, sf->use_intra_txb_hash = 1; sf->optimize_b_precheck = 1; sf->dual_sgr_penalty_level = 1; + sf->use_accurate_subpel_search = 1; + sf->reuse_inter_intra_mode = 1; + sf->prune_comp_search_by_single_result = 1; + sf->skip_repeated_newmv = 1; + sf->obmc_full_pixel_search_level = 1; } if (speed >= 2) { @@ -206,7 +234,6 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi, sf->selective_ref_frame = 2; sf->fast_cdef_search = 1; - sf->use_rd_breakout = 1; sf->adaptive_rd_thresh = 1; sf->mv.auto_mv_step_size = 1; sf->mv.subpel_iters_per_step = 1; @@ -224,8 +251,7 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi, if (speed >= 3) { sf->tx_size_search_method = boosted ? USE_FULL_RD : USE_LARGESTALL; - sf->less_rectangular_check = 1; - sf->mode_skip_start = 10; + sf->less_rectangular_check_level = 2; sf->adaptive_pred_interp_filter = 1; // adaptive_motion_search breaks encoder multi-thread tests. // The values in x->pred_mv[] differ for single and multi-thread cases. @@ -237,6 +263,7 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi, sf->adaptive_rd_thresh = 2; sf->tx_type_search.prune_mode = PRUNE_2D_FAST; sf->gm_search_type = GM_DISABLE_SEARCH; + sf->prune_comp_search_by_single_result = 2; } if (speed >= 4) { @@ -250,10 +277,7 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi, sf->adaptive_pred_interp_filter = 0; sf->adaptive_mode_search = 1; sf->cb_partition_search = !boosted; - sf->cb_pred_filter_search = 1; sf->alt_ref_search_fp = 1; - sf->mode_skip_start = 6; - sf->adaptive_interp_filter_search = 1; } if (speed >= 5) { @@ -276,7 +300,6 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi, FLAG_SKIP_COMP_BESTINTRA | FLAG_SKIP_INTRA_LOWVAR | FLAG_EARLY_TERMINATE; sf->disable_filter_search_var_thresh = 200; - sf->use_fast_coef_updates = ONE_LOOP_REDUCED; sf->use_fast_coef_costing = 1; sf->partition_search_breakout_rate_thr = 300; sf->use_transform_domain_distortion = 2; @@ -296,33 +319,17 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi, sf->simple_model_rd_from_var = 1; } if (speed >= 7) { - const int is_keyframe = cm->frame_type == KEY_FRAME; - const int frames_since_key = is_keyframe ? 0 : cpi->rc.frames_since_key; sf->default_max_partition_size = BLOCK_32X32; sf->default_min_partition_size = BLOCK_8X8; sf->intra_y_mode_mask[TX_64X64] = INTRA_DC; sf->intra_y_mode_mask[TX_32X32] = INTRA_DC; sf->frame_parameter_update = 0; sf->mv.search_method = FAST_HEX; - sf->inter_mode_mask[BLOCK_32X32] = INTER_NEAREST_NEAR_NEW; - sf->inter_mode_mask[BLOCK_32X64] = INTER_NEAREST; - sf->inter_mode_mask[BLOCK_64X32] = INTER_NEAREST; - sf->inter_mode_mask[BLOCK_64X64] = INTER_NEAREST; - sf->inter_mode_mask[BLOCK_64X128] = INTER_NEAREST; - sf->inter_mode_mask[BLOCK_128X64] = INTER_NEAREST; - sf->inter_mode_mask[BLOCK_128X128] = INTER_NEAREST; sf->partition_search_type = REFERENCE_PARTITION; - sf->reuse_inter_pred_sby = 1; - sf->force_frame_boost = - is_keyframe || - (frames_since_key % (sf->last_partitioning_redo_frequency << 1) == 1); - sf->max_delta_qindex = is_keyframe ? 20 : 15; - sf->coeff_prob_appx_step = 4; sf->mode_search_skip_flags |= FLAG_SKIP_INTRA_DIRMISMATCH; } if (speed >= 8) { sf->mv.search_method = FAST_DIAMOND; - sf->mv.fullpel_search_step_param = 10; sf->mv.subpel_force_stop = 2; sf->lpf_pick = LPF_PICK_MINIMAL_LPF; } @@ -356,54 +363,6 @@ void av1_set_speed_features_framesize_dependent(AV1_COMP *cpi) { cpi->find_fractional_mv_step = av1_return_min_sub_pixel_mv; } -static void set_dev_sf(AV1_COMP *cpi, SPEED_FEATURES *sf, int speed) { - AV1_COMMON *const cm = &cpi->common; - - if (speed & TXFM_CODING_SF) { - sf->inter_tx_size_search_init_depth_rect = 1; - sf->inter_tx_size_search_init_depth_sqr = 1; - sf->intra_tx_size_search_init_depth_rect = 1; - sf->intra_tx_size_search_init_depth_sqr = 1; - sf->tx_size_search_method = USE_FAST_RD; - sf->tx_type_search.fast_intra_tx_type_search = 1; - sf->tx_type_search.fast_inter_tx_type_search = 1; - } - - if (speed & INTER_PRED_SF) { - sf->selective_ref_frame = 2; - // sf->adaptive_motion_search = 1; - sf->mv.auto_mv_step_size = 1; - sf->adaptive_rd_thresh = 1; - sf->mv.subpel_iters_per_step = 1; - sf->adaptive_pred_interp_filter = 1; - } - - if (speed & INTRA_PRED_SF) { - sf->max_intra_bsize = BLOCK_32X32; - } - - if (speed & PARTITION_SF) { - if ((cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) || - has_internal_image_edge(cpi)) { - sf->use_square_partition_only_threshold = - frame_is_boosted(cpi) ? BLOCK_128X128 : BLOCK_4X4; - } else { - sf->use_square_partition_only_threshold = - frame_is_intra_only(cm) ? BLOCK_128X128 : BLOCK_4X4; - } - sf->less_rectangular_check = 1; - sf->prune_ext_partition_types_search_level = 2; - } - - if (speed & LOOP_FILTER_SF) { - sf->fast_cdef_search = 1; - } - - if (speed & RD_SKIP_SF) { - sf->use_rd_breakout = 1; - } -} - void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; SPEED_FEATURES *const sf = &cpi->sf; @@ -432,9 +391,7 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) { #endif // DISABLE_TRELLISQ_SEARCH sf->gm_erroradv_type = GM_ERRORADV_TR_0; sf->mv.reduce_first_step_size = 0; - sf->coeff_prob_appx_step = 1; sf->mv.auto_mv_step_size = 0; - sf->mv.fullpel_search_step_param = 6; sf->comp_inter_joint_search_thresh = BLOCK_4X4; sf->adaptive_rd_thresh = 0; sf->tx_size_search_method = USE_FULL_RD; @@ -450,7 +407,6 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) { sf->adaptive_motion_search = 0; sf->adaptive_pred_interp_filter = 0; sf->adaptive_mode_search = 0; - sf->cb_pred_filter_search = 0; sf->cb_partition_search = 0; sf->alt_ref_search_fp = 0; sf->partition_search_type = SEARCH_PARTITION; @@ -461,22 +417,19 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) { sf->tx_type_search.fast_inter_tx_type_search = 0; sf->tx_type_search.skip_tx_search = 0; sf->selective_ref_frame = 0; - sf->less_rectangular_check = 0; + sf->less_rectangular_check_level = 0; sf->use_square_partition_only_threshold = BLOCK_128X128; + sf->prune_ref_frame_for_rect_partitions = 0; sf->auto_min_max_partition_size = NOT_IN_USE; sf->rd_auto_partition_min_limit = BLOCK_4X4; sf->default_max_partition_size = BLOCK_LARGEST; sf->default_min_partition_size = BLOCK_4X4; sf->adjust_partitioning_from_last_frame = 0; - sf->last_partitioning_redo_frequency = 4; sf->disable_split_mask = 0; sf->mode_search_skip_flags = 0; - sf->force_frame_boost = 0; - sf->max_delta_qindex = 0; sf->disable_filter_search_var_thresh = 0; - sf->adaptive_interp_filter_search = 0; sf->allow_partition_search_skip = 0; - sf->use_accurate_subpel_search = 1; + sf->use_accurate_subpel_search = 2; sf->disable_wedge_search_var_thresh = 0; sf->fast_wedge_sign_estimate = 0; sf->drop_ref = 0; @@ -491,48 +444,46 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) { sf->optimize_b_precheck = 0; sf->jnt_comp_fast_tx_search = 0; sf->jnt_comp_skip_mv_search = 0; + sf->reuse_inter_intra_mode = 0; for (i = 0; i < TX_SIZES; i++) { sf->intra_y_mode_mask[i] = INTRA_ALL; sf->intra_uv_mode_mask[i] = UV_INTRA_ALL; } - sf->use_rd_breakout = 0; sf->lpf_pick = LPF_PICK_FROM_FULL_IMAGE; - sf->use_fast_coef_updates = TWO_LOOP; sf->use_fast_coef_costing = 0; - sf->mode_skip_start = MAX_MODES; // Mode index at which mode skip mask set - sf->schedule_mode_search = 0; - for (i = 0; i < BLOCK_SIZES_ALL; ++i) sf->inter_mode_mask[i] = INTER_ALL; sf->max_intra_bsize = BLOCK_LARGEST; - sf->reuse_inter_pred_sby = 0; // This setting only takes effect when partition_search_type is set // to FIXED_PARTITION. sf->always_this_block_size = BLOCK_16X16; - sf->search_type_check_frequency = 50; // Recode loop tolerance %. sf->recode_tolerance = 25; - sf->default_interp_filter = SWITCHABLE; sf->partition_search_breakout_dist_thr = 0; sf->partition_search_breakout_rate_thr = 0; sf->simple_model_rd_from_var = 0; sf->prune_ext_partition_types_search_level = 0; + sf->ml_prune_rect_partition = 0; sf->ml_prune_ab_partition = 0; sf->ml_prune_4_partition = 0; sf->fast_cdef_search = 0; + for (i = 0; i < PARTITION_BLOCK_SIZES; ++i) + sf->ml_partition_search_breakout_thresh[i] = -1; // -1 means not enabled. // Set this at the appropriate speed levels sf->use_transform_domain_distortion = 0; sf->gm_search_type = GM_FULL_SEARCH; + sf->gm_disable_recode = 0; sf->use_fast_interpolation_filter_search = 0; sf->skip_repeat_interpolation_filter_search = 0; sf->use_hash_based_trellis = 0; + sf->prune_comp_search_by_single_result = 0; + sf->skip_repeated_newmv = 0; // Set decoder side speed feature to use less dual sgr modes sf->dual_sgr_penalty_level = 0; sf->inter_mode_rd_model_estimation = 0; - - set_dev_sf(cpi, sf, oxcf->dev_sf); + sf->obmc_full_pixel_search_level = 0; if (oxcf->mode == GOOD) set_good_speed_features_framesize_independent(cpi, sf, oxcf->speed); @@ -599,10 +550,6 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) { x->min_partition_size = sf->default_min_partition_size; x->max_partition_size = sf->default_max_partition_size; - if (!cpi->oxcf.frame_periodic_boost) { - sf->max_delta_qindex = 0; - } - // This is only used in motion vector unit test. if (cpi->oxcf.motion_vector_unit_test == 1) cpi->find_fractional_mv_step = av1_return_max_sub_pixel_mv; @@ -611,5 +558,7 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) { #if CONFIG_DIST_8X8 if (sf->use_transform_domain_distortion > 0) cpi->oxcf.using_dist_8x8 = 0; + + if (cpi->oxcf.using_dist_8x8) x->min_partition_size = BLOCK_8X8; #endif // CONFIG_DIST_8X8 } diff --git a/third_party/aom/av1/encoder/speed_features.h b/third_party/aom/av1/encoder/speed_features.h index d0408ba2f..41013b2e7 100644 --- a/third_party/aom/av1/encoder/speed_features.h +++ b/third_party/aom/av1/encoder/speed_features.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_ENCODER_SPEED_FEATURES_H_ -#define AV1_ENCODER_SPEED_FEATURES_H_ +#ifndef AOM_AV1_ENCODER_SPEED_FEATURES_H_ +#define AOM_AV1_ENCODER_SPEED_FEATURES_H_ #include "av1/common/enums.h" @@ -54,25 +54,6 @@ enum { (1 << NEWMV) | (1 << NEAREST_NEARESTMV) | (1 << NEAR_NEARMV) | (1 << NEW_NEWMV) | (1 << NEAREST_NEWMV) | (1 << NEAR_NEWMV) | (1 << NEW_NEARMV) | (1 << NEW_NEARESTMV) | (1 << GLOBAL_GLOBALMV), - INTER_NEAREST = (1 << NEARESTMV) | (1 << NEAREST_NEARESTMV) | - (1 << NEW_NEARESTMV) | (1 << NEAREST_NEWMV), - INTER_NEAREST_NEW = (1 << NEARESTMV) | (1 << NEWMV) | - (1 << NEAREST_NEARESTMV) | (1 << NEW_NEWMV) | - (1 << NEW_NEARESTMV) | (1 << NEAREST_NEWMV) | - (1 << NEW_NEARMV) | (1 << NEAR_NEWMV), - INTER_NEAREST_ZERO = (1 << NEARESTMV) | (1 << GLOBALMV) | - (1 << NEAREST_NEARESTMV) | (1 << GLOBAL_GLOBALMV) | - (1 << NEAREST_NEWMV) | (1 << NEW_NEARESTMV), - INTER_NEAREST_NEW_ZERO = (1 << NEARESTMV) | (1 << GLOBALMV) | (1 << NEWMV) | - (1 << NEAREST_NEARESTMV) | (1 << GLOBAL_GLOBALMV) | - (1 << NEW_NEWMV) | (1 << NEW_NEARESTMV) | - (1 << NEAREST_NEWMV) | (1 << NEW_NEARMV) | - (1 << NEAR_NEWMV), - INTER_NEAREST_NEAR_NEW = (1 << NEARESTMV) | (1 << NEARMV) | (1 << NEWMV) | - (1 << NEAREST_NEARESTMV) | (1 << NEW_NEWMV) | - (1 << NEW_NEARESTMV) | (1 << NEAREST_NEWMV) | - (1 << NEW_NEARMV) | (1 << NEAR_NEWMV) | - (1 << NEAR_NEARMV), INTER_NEAREST_NEAR_ZERO = (1 << NEARESTMV) | (1 << NEARMV) | (1 << GLOBALMV) | (1 << NEAREST_NEARESTMV) | (1 << GLOBAL_GLOBALMV) | (1 << NEAREST_NEWMV) | (1 << NEW_NEARESTMV) | @@ -133,11 +114,6 @@ typedef enum { } SUBPEL_SEARCH_METHODS; typedef enum { - NO_MOTION_THRESHOLD = 0, - LOW_MOTION_THRESHOLD = 7 -} MOTION_THRESHOLD; - -typedef enum { USE_FULL_RD = 0, USE_FAST_RD, USE_LARGESTALL, @@ -179,12 +155,6 @@ typedef enum { } MODE_SEARCH_SKIP_LOGIC; typedef enum { - FLAG_SKIP_EIGHTTAP_REGULAR = 1 << EIGHTTAP_REGULAR, - FLAG_SKIP_EIGHTTAP_SMOOTH = 1 << EIGHTTAP_SMOOTH, - FLAG_SKIP_MULTITAP_SHARP = 1 << MULTITAP_SHARP, -} INTERP_FILTER_MASK; - -typedef enum { NO_PRUNE = 0, // eliminates one tx type in vertical and horizontal direction PRUNE_ONE = 1, @@ -224,16 +194,6 @@ typedef enum { REFERENCE_PARTITION } PARTITION_SEARCH_TYPE; -typedef enum { - // Does a dry run to see if any of the contexts need to be updated or not, - // before the final run. - TWO_LOOP = 0, - - // No dry run, also only half the coef contexts and bands are updated. - // The rest are not updated at all. - ONE_LOOP_REDUCED = 1 -} FAST_COEFF_UPDATE; - typedef struct MV_SPEED_FEATURES { // Motion search method (Diamond, NSTEP, Hex, Big Diamond, Square, etc). SEARCH_METHODS search_method; @@ -257,9 +217,6 @@ typedef struct MV_SPEED_FEATURES { // Control when to stop subpel search int subpel_force_stop; - - // This variable sets the step_param used in full pel motion search. - int fullpel_search_step_param; } MV_SPEED_FEATURES; #define MAX_MESH_STEP 4 @@ -332,13 +289,6 @@ typedef struct SPEED_FEATURES { // mode to be evaluated. A high value means we will be faster. int adaptive_rd_thresh; - // Coefficient probability model approximation step size - int coeff_prob_appx_step; - - // The threshold is to determine how slow the motino is, it is used when - // use_lastframe_partitioning is set to LAST_FRAME_PARTITION_LOW_MOTION - MOTION_THRESHOLD lf_motion_threshold; - // Determine which method we use to determine transform size. We can choose // between options like full rd, largest for prediction size, largest // for intra and model coefs for the rest. @@ -355,11 +305,6 @@ typedef struct SPEED_FEATURES { // largest transform only, since the largest transform block size is 64x64. int tx_size_search_lgr_block; - // After looking at the first set of modes (set by index here), skip - // checking modes for reference frames that don't match the reference frame - // of the best so far. - int mode_skip_start; - PARTITION_SEARCH_TYPE partition_search_type; TX_TYPE_SEARCH tx_type_search; @@ -397,6 +342,9 @@ typedef struct SPEED_FEATURES { // aggressiveness of pruning in order. int prune_ext_partition_types_search_level; + // Use a ML model to prune horz and vert partitions + int ml_prune_rect_partition; + // Use a ML model to prune horz_a, horz_b, vert_a and vert_b partitions. int ml_prune_ab_partition; @@ -413,12 +361,16 @@ typedef struct SPEED_FEATURES { int mode_pruning_based_on_two_pass_partition_search; // Skip rectangular partition test when partition type none gives better - // rd than partition type split. - int less_rectangular_check; + // rd than partition type split. Can take values 0 - 2, 0 referring to no + // skipping, and 1 - 2 increasing aggressiveness of skipping in order. + int less_rectangular_check_level; // Use square partition only beyond this block size. BLOCK_SIZE use_square_partition_only_threshold; + // Prune reference frames for rectangular partitions. + int prune_ref_frame_for_rect_partitions; + // Sets min and max partition sizes for this superblock based on the // same superblock in last encoded frame, and the left and above neighbor. AUTO_MIN_MAX_MODE auto_min_max_partition_size; @@ -435,10 +387,6 @@ typedef struct SPEED_FEATURES { // frame's partitioning. Only used if use_lastframe_partitioning is set. int adjust_partitioning_from_last_frame; - // How frequently we re do the partitioning from scratch. Only used if - // use_lastframe_partitioning is set. - int last_partitioning_redo_frequency; - // Disables sub 8x8 blocksizes in different scenarios: Choices are to disable // it always, to allow it for only Last frame and Intra, disable it for all // inter modes or to enable it always. @@ -461,8 +409,6 @@ typedef struct SPEED_FEATURES { // Pattern to be used for any exhaustive mesh searches. MESH_PATTERN mesh_patterns[MAX_MESH_STEP]; - int schedule_mode_search; - // Allows sub 8x8 modes to use the prediction filter that was determined // best for 8x8 mode. If set to 0 we always re check all the filters for // sizes less than 8x8, 1 means we check all filter modes if no 8x8 filter @@ -472,20 +418,10 @@ typedef struct SPEED_FEATURES { // Adaptive prediction mode search int adaptive_mode_search; - // Chessboard pattern prediction filter type search - int cb_pred_filter_search; - int cb_partition_search; int alt_ref_search_fp; - // Use finer quantizer in every other few frames that run variable block - // partition type search. - int force_frame_boost; - - // Maximally allowed base quantization index fluctuation. - int max_delta_qindex; - // Implements various heuristics to skip searching modes // The heuristics selected are based on flags // defined in the MODE_SEARCH_SKIP_HEURISTICS enum @@ -506,22 +442,9 @@ typedef struct SPEED_FEATURES { int intra_y_mode_mask[TX_SIZES]; int intra_uv_mode_mask[TX_SIZES]; - // This variable enables an early break out of mode testing if the model for - // rd built from the prediction signal indicates a value that's much - // higher than the best rd we've seen so far. - int use_rd_breakout; - // This feature controls how the loop filter level is determined. LPF_PICK_METHOD lpf_pick; - // This feature limits the number of coefficients updates we actually do - // by only looking at counts from 1/2 the bands. - FAST_COEFF_UPDATE use_fast_coef_updates; - - // A binary mask indicating if NEARESTMV, NEARMV, GLOBALMV, NEWMV - // modes are used in order from LSB to MSB for each BLOCK_SIZE. - int inter_mode_mask[BLOCK_SIZES_ALL]; - // This feature controls whether we do the expensive context update and // calculation in the rd coefficient costing loop. int use_fast_coef_costing; @@ -535,28 +458,13 @@ typedef struct SPEED_FEATURES { // TODO(aconverse): Fold this into one of the other many mode skips BLOCK_SIZE max_intra_bsize; - // The frequency that we check if - // FIXED_PARTITION search type should be used. - int search_type_check_frequency; - - // When partition is pre-set, the inter prediction result from pick_inter_mode - // can be reused in final block encoding process. It is enabled only for real- - // time mode speed 6. - int reuse_inter_pred_sby; - - // default interp filter choice - InterpFilter default_interp_filter; - - // adaptive interp_filter search to allow skip of certain filter types. - int adaptive_interp_filter_search; - - // mask for skip evaluation of certain interp_filter type. - INTERP_FILTER_MASK interp_filter_search_mask; - // Partition search early breakout thresholds. int64_t partition_search_breakout_dist_thr; int partition_search_breakout_rate_thr; + // Thresholds for ML based partition search breakout. + int ml_partition_search_breakout_thresh[PARTITION_BLOCK_SIZES]; + // Allow skipping partition search for still image frame int allow_partition_search_skip; @@ -577,6 +485,9 @@ typedef struct SPEED_FEATURES { GM_SEARCH_TYPE gm_search_type; + // whether to disable the global motion recode loop + int gm_disable_recode; + // Do limited interpolation filter search for dual filters, since best choice // usually includes EIGHTTAP_REGULAR. int use_fast_interpolation_filter_search; @@ -624,6 +535,25 @@ typedef struct SPEED_FEATURES { // Dynamically estimate final rd from prediction error and mode cost int inter_mode_rd_model_estimation; + + // Skip some ref frames in compound motion search by single motion search + // result. Has three levels for now: 0 referring to no skipping, and 1 - 3 + // increasing aggressiveness of skipping in order. + // Note: The search order might affect the result. It is better to search same + // single inter mode as a group. + int prune_comp_search_by_single_result; + + // Reuse the inter_intra_mode search result from NEARESTMV mode to other + // single ref modes + int reuse_inter_intra_mode; + + // Set the full pixel search level of obmc + // 0: obmc_full_pixel_diamond + // 1: obmc_refining_search_sad (faster) + int obmc_full_pixel_search_level; + + // flag to skip NEWMV mode in drl if the motion search result is the same + int skip_repeated_newmv; } SPEED_FEATURES; struct AV1_COMP; @@ -635,4 +565,4 @@ void av1_set_speed_features_framesize_dependent(struct AV1_COMP *cpi); } // extern "C" #endif -#endif // AV1_ENCODER_SPEED_FEATURES_H_ +#endif // AOM_AV1_ENCODER_SPEED_FEATURES_H_ diff --git a/third_party/aom/av1/encoder/temporal_filter.c b/third_party/aom/av1/encoder/temporal_filter.c index d7e4f4eb3..75fdf02a5 100644 --- a/third_party/aom/av1/encoder/temporal_filter.c +++ b/third_party/aom/av1/encoder/temporal_filter.c @@ -25,6 +25,7 @@ #include "av1/encoder/mcomp.h" #include "av1/encoder/encoder.h" #include "av1/encoder/ratectrl.h" +#include "av1/encoder/reconinter_enc.h" #include "av1/encoder/segmentation.h" #include "av1/encoder/temporal_filter.h" #include "aom_dsp/aom_dsp_common.h" @@ -37,13 +38,12 @@ static void temporal_filter_predictors_mb_c( MACROBLOCKD *xd, uint8_t *y_mb_ptr, uint8_t *u_mb_ptr, uint8_t *v_mb_ptr, int stride, int uv_block_width, int uv_block_height, int mv_row, int mv_col, uint8_t *pred, struct scale_factors *scale, int x, int y, - int can_use_previous) { - const int which_mv = 0; + int can_use_previous, int num_planes) { const MV mv = { mv_row, mv_col }; enum mv_precision mv_precision_uv; int uv_stride; // TODO(angiebird): change plane setting accordingly - ConvolveParams conv_params = get_conv_params(which_mv, 0, 0, xd->bd); + ConvolveParams conv_params = get_conv_params(0, 0, xd->bd); const InterpFilters interp_filters = xd->mi[0]->interp_filters; WarpTypesAllowed warp_types; memset(&warp_types, 0, sizeof(WarpTypesAllowed)); @@ -55,37 +55,21 @@ static void temporal_filter_predictors_mb_c( uv_stride = stride; mv_precision_uv = MV_PRECISION_Q3; } - - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - av1_highbd_build_inter_predictor(y_mb_ptr, stride, &pred[0], 16, &mv, scale, - 16, 16, which_mv, interp_filters, - &warp_types, x, y, 0, MV_PRECISION_Q3, x, - y, xd, can_use_previous); - - av1_highbd_build_inter_predictor( - u_mb_ptr, uv_stride, &pred[256], uv_block_width, &mv, scale, - uv_block_width, uv_block_height, which_mv, interp_filters, &warp_types, - x, y, 1, mv_precision_uv, x, y, xd, can_use_previous); - - av1_highbd_build_inter_predictor( - v_mb_ptr, uv_stride, &pred[512], uv_block_width, &mv, scale, - uv_block_width, uv_block_height, which_mv, interp_filters, &warp_types, - x, y, 2, mv_precision_uv, x, y, xd, can_use_previous); - return; - } av1_build_inter_predictor(y_mb_ptr, stride, &pred[0], 16, &mv, scale, 16, 16, &conv_params, interp_filters, &warp_types, x, y, 0, 0, MV_PRECISION_Q3, x, y, xd, can_use_previous); - av1_build_inter_predictor(u_mb_ptr, uv_stride, &pred[256], uv_block_width, - &mv, scale, uv_block_width, uv_block_height, - &conv_params, interp_filters, &warp_types, x, y, 1, - 0, mv_precision_uv, x, y, xd, can_use_previous); + if (num_planes > 1) { + av1_build_inter_predictor( + u_mb_ptr, uv_stride, &pred[256], uv_block_width, &mv, scale, + uv_block_width, uv_block_height, &conv_params, interp_filters, + &warp_types, x, y, 1, 0, mv_precision_uv, x, y, xd, can_use_previous); - av1_build_inter_predictor(v_mb_ptr, uv_stride, &pred[512], uv_block_width, - &mv, scale, uv_block_width, uv_block_height, - &conv_params, interp_filters, &warp_types, x, y, 2, - 0, mv_precision_uv, x, y, xd, can_use_previous); + av1_build_inter_predictor( + v_mb_ptr, uv_stride, &pred[512], uv_block_width, &mv, scale, + uv_block_width, uv_block_height, &conv_params, interp_filters, + &warp_types, x, y, 2, 0, mv_precision_uv, x, y, xd, can_use_previous); + } } void av1_temporal_filter_apply_c(uint8_t *frame1, unsigned int stride, @@ -214,7 +198,8 @@ void av1_highbd_temporal_filter_apply_c( static int temporal_filter_find_matching_mb_c(AV1_COMP *cpi, uint8_t *arf_frame_buf, uint8_t *frame_ptr_buf, - int stride) { + int stride, int x_pos, + int y_pos) { MACROBLOCK *const x = &cpi->td.mb; MACROBLOCKD *const xd = &x->e_mbd; const MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv; @@ -250,11 +235,9 @@ static int temporal_filter_find_matching_mb_c(AV1_COMP *cpi, x->mvcost = x->mv_cost_stack; x->nmvjointcost = x->nmv_vec_cost; - // Use mv costing from x->mvcost directly - av1_hex_search(x, &best_ref_mv1_full, step_param, sadpb, 1, - cond_cost_list(cpi, cost_list), &cpi->fn_ptr[BLOCK_16X16], 0, - &best_ref_mv1); - + av1_full_pixel_search(cpi, x, BLOCK_16X16, &best_ref_mv1_full, step_param, + NSTEP, 1, sadpb, cond_cost_list(cpi, cost_list), + &best_ref_mv1, 0, 0, x_pos, y_pos, 0); x->mv_limits = tmp_mv_limits; // Ignore mv costing by sending NULL pointer instead of cost array @@ -370,7 +353,8 @@ static void temporal_filter_iterate_c(AV1_COMP *cpi, // Find best match in this frame by MC int err = temporal_filter_find_matching_mb_c( cpi, frames[alt_ref_index]->y_buffer + mb_y_offset, - frames[frame]->y_buffer + mb_y_offset, frames[frame]->y_stride); + frames[frame]->y_buffer + mb_y_offset, frames[frame]->y_stride, + mb_col * 16, mb_row * 16); // Assign higher weight to matching MB if it's error // score is lower. If not applying MC default behavior @@ -386,7 +370,7 @@ static void temporal_filter_iterate_c(AV1_COMP *cpi, frames[frame]->v_buffer + mb_uv_offset, frames[frame]->y_stride, mb_uv_width, mb_uv_height, mbd->mi[0]->mv[0].as_mv.row, mbd->mi[0]->mv[0].as_mv.col, predictor, scale, mb_col * 16, - mb_row * 16, cm->allow_warped_motion); + mb_row * 16, cm->allow_warped_motion, num_planes); // Apply the filter (YUV) if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { @@ -556,14 +540,6 @@ static void adjust_arnr_filter(AV1_COMP *cpi, int distance, int group_boost, strength = group_boost / 300; } - // Adjustments for second level arf in multi arf case. - if (cpi->oxcf.pass == 2 && cpi->multi_arf_allowed) { - const GF_GROUP *const gf_group = &cpi->twopass.gf_group; - if (gf_group->rf_level[gf_group->index] != GF_ARF_STD) { - strength >>= 1; - } - } - *arnr_frames = frames; *arnr_strength = strength; } @@ -593,21 +569,6 @@ void av1_temporal_filter(AV1_COMP *cpi, int distance) { int which_arf = gf_group->arf_update_idx[gf_group->index]; -#if USE_GF16_MULTI_LAYER - if (cpi->rc.baseline_gf_interval == 16) { - // Identify the index to the current ARF. - const int num_arfs_in_gf = cpi->num_extra_arfs + 1; - int arf_idx; - for (arf_idx = 0; arf_idx < num_arfs_in_gf; arf_idx++) { - if (gf_group->index == cpi->arf_pos_in_gf[arf_idx]) { - which_arf = arf_idx; - break; - } - } - assert(arf_idx < num_arfs_in_gf); - } -#endif // USE_GF16_MULTI_LAYER - // Set the temporal filtering status for the corresponding OVERLAY frame if (strength == 0 && frames_to_blur == 1) cpi->is_arf_filter_off[which_arf] = 1; diff --git a/third_party/aom/av1/encoder/temporal_filter.h b/third_party/aom/av1/encoder/temporal_filter.h index bc0863a63..2ddc68b2c 100644 --- a/third_party/aom/av1/encoder/temporal_filter.h +++ b/third_party/aom/av1/encoder/temporal_filter.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_ENCODER_TEMPORAL_FILTER_H_ -#define AV1_ENCODER_TEMPORAL_FILTER_H_ +#ifndef AOM_AV1_ENCODER_TEMPORAL_FILTER_H_ +#define AOM_AV1_ENCODER_TEMPORAL_FILTER_H_ #ifdef __cplusplus extern "C" { @@ -22,4 +22,4 @@ void av1_temporal_filter(AV1_COMP *cpi, int distance); } // extern "C" #endif -#endif // AV1_ENCODER_TEMPORAL_FILTER_H_ +#endif // AOM_AV1_ENCODER_TEMPORAL_FILTER_H_ diff --git a/third_party/aom/av1/encoder/tokenize.h b/third_party/aom/av1/encoder/tokenize.h index de1cbe99c..63b505f36 100644 --- a/third_party/aom/av1/encoder/tokenize.h +++ b/third_party/aom/av1/encoder/tokenize.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_ENCODER_TOKENIZE_H_ -#define AV1_ENCODER_TOKENIZE_H_ +#ifndef AOM_AV1_ENCODER_TOKENIZE_H_ +#define AOM_AV1_ENCODER_TOKENIZE_H_ #include "av1/common/entropy.h" #include "av1/encoder/block.h" @@ -70,4 +70,4 @@ static INLINE int av1_get_tx_eob(const struct segmentation *seg, int segment_id, } // extern "C" #endif -#endif // AV1_ENCODER_TOKENIZE_H_ +#endif // AOM_AV1_ENCODER_TOKENIZE_H_ diff --git a/third_party/aom/av1/encoder/tx_prune_model_weights.h b/third_party/aom/av1/encoder/tx_prune_model_weights.h index 69063b801..405bc9e6e 100644 --- a/third_party/aom/av1/encoder/tx_prune_model_weights.h +++ b/third_party/aom/av1/encoder/tx_prune_model_weights.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_ENCODER_TX_PRUNE_MODEL_WEIGHTS_H_ -#define AV1_ENCODER_TX_PRUNE_MODEL_WEIGHTS_H_ +#ifndef AOM_AV1_ENCODER_TX_PRUNE_MODEL_WEIGHTS_H_ +#define AOM_AV1_ENCODER_TX_PRUNE_MODEL_WEIGHTS_H_ #ifdef __cplusplus extern "C" { @@ -19,79 +19,114 @@ extern "C" { #include "av1/encoder/ml.h" // Tx type model for 4x4 block. -static const float av1_tx_type_nn_weights_4x4_layer0[32] = { - 0.72406f, -0.40019f, 0.51795f, -0.43881f, -0.49746f, -0.41780f, -0.39409f, - -0.16183f, -1.00135f, -0.41733f, -0.96534f, 0.93272f, 1.06229f, 0.04188f, - 0.60919f, 0.92405f, -0.39359f, 0.70570f, 0.75375f, 1.11966f, -1.86360f, - -0.35421f, 0.18743f, 0.13346f, -0.21262f, 0.07050f, 0.10533f, -0.47402f, - 1.33417f, 1.72899f, 1.17983f, 0.10552f, +static const float av1_tx_type_nn_weights_4x4_hor_layer0[32] = { + -1.64947f, -1.54497f, -1.62832f, -0.17774f, -2.89498f, -0.72498f, 0.72036f, + 0.17996f, 1.20000f, -0.27654f, 0.77396f, 1.21684f, -1.75909f, -0.51272f, + -1.25923f, 0.35005f, -0.04257f, -0.23389f, -0.41841f, -0.08229f, 0.09503f, + 2.73144f, -0.16875f, -0.23482f, 0.02194f, -0.26427f, 0.28049f, 0.21260f, + 1.35792f, 0.27733f, 0.88660f, -0.68304f, }; -static const float av1_tx_type_nn_bias_4x4_layer0[8] = { - 1.96273f, -0.69845f, -0.10999f, -1.11311f, - 1.35101f, 0.43842f, -0.29264f, -1.15376f, +static const float av1_tx_type_nn_bias_4x4_hor_layer0[8] = { + 1.38742f, 0.59540f, -1.37622f, 1.92114f, + 0.00000f, -0.38998f, -0.32726f, -0.15650f, }; -static const float av1_tx_type_nn_weights_4x4_layer1[32] = { - 0.79770f, 0.08520f, 0.23298f, 0.05285f, 0.87506f, -0.90784f, -0.06197f, - -1.00580f, 0.68639f, -0.34881f, 0.15366f, -1.64658f, 0.80755f, -0.26293f, - 0.10253f, -0.23915f, 1.14696f, -0.10928f, -1.61377f, 0.00863f, 0.98599f, - -0.43872f, 0.61196f, -0.03787f, 1.01060f, 0.17643f, -0.00208f, -0.15738f, - 0.06517f, 0.72885f, 0.24387f, 1.28535f, +static const float av1_tx_type_nn_weights_4x4_hor_layer1[32] = { + 1.65254f, 1.00915f, -0.89318f, -2.05142f, -0.23235f, 0.96781f, -0.37145f, + -0.21056f, 1.13891f, 0.38675f, 0.87739f, -1.42697f, 0.48015f, 0.61883f, + -0.03979f, 0.11487f, 0.48042f, 0.45200f, -0.23242f, 0.75166f, 0.55458f, + 0.39452f, -0.35285f, 1.59120f, -1.49221f, -0.48349f, -0.64692f, 1.49297f, + -0.26782f, -0.65416f, -0.10648f, 0.05568f, }; -static const float av1_tx_type_nn_bias_4x4_layer1[4] = { - 1.23769f, - 1.40308f, - 0.09871f, - 1.82070f, +static const float av1_tx_type_nn_bias_4x4_hor_layer1[4] = { + 4.07177f, + 3.26961f, + 0.58083f, + 1.21199f, }; -static const NN_CONFIG av1_tx_type_nnconfig_4x4 = { +static const NN_CONFIG av1_tx_type_nnconfig_4x4_hor = { 4, // num_inputs 4, // num_outputs 1, // num_hidden_layers { 8, }, // num_hidden_nodes + { av1_tx_type_nn_weights_4x4_hor_layer0, + av1_tx_type_nn_weights_4x4_hor_layer1 }, + { av1_tx_type_nn_bias_4x4_hor_layer0, av1_tx_type_nn_bias_4x4_hor_layer1 } +}; + +static const float av1_tx_type_nn_weights_4x4_ver_layer0[32] = { + -0.02032f, 2.61610f, 0.02098f, -0.30217f, 0.12637f, 0.11017f, -3.01996f, + 0.35144f, 1.93776f, -0.20463f, 1.64102f, -1.41986f, -3.66717f, -0.51655f, + 0.43910f, 0.37778f, -1.02634f, 0.85337f, -0.69753f, 1.00206f, 2.11784f, + 1.89427f, 1.92919f, 0.43201f, -1.67358f, -1.67035f, -1.54623f, 0.16714f, + -0.06589f, -0.28142f, -0.33118f, 1.72227f, +}; + +static const float av1_tx_type_nn_bias_4x4_ver_layer0[8] = { + -0.33685f, 0.22025f, 0.28140f, 0.56138f, + 0.93489f, -1.77048f, 1.34989f, -0.93747f, +}; + +static const float av1_tx_type_nn_weights_4x4_ver_layer1[32] = { + -1.39506f, -1.06271f, -1.10886f, -1.69719f, 0.19699f, -2.39850f, -1.26457f, + 0.75328f, -1.26005f, -0.82738f, -0.12015f, -1.02702f, 1.40828f, -2.37739f, + -0.65639f, -0.71992f, -0.90453f, -1.12510f, -2.41362f, -1.16061f, -1.85577f, + -0.99165f, -1.91366f, 0.16785f, 0.34776f, 0.58154f, -0.18217f, -0.29257f, + -0.86315f, -0.53336f, 0.30320f, -1.32331f, +}; + +static const float av1_tx_type_nn_bias_4x4_ver_layer1[4] = { + -1.31519f, + -3.26321f, + 1.71794f, + -1.90778f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_4x4_ver = { + 4, // num_inputs + 4, // num_outputs + 1, // num_hidden_layers { - av1_tx_type_nn_weights_4x4_layer0, - av1_tx_type_nn_weights_4x4_layer1, - }, - { - av1_tx_type_nn_bias_4x4_layer0, - av1_tx_type_nn_bias_4x4_layer1, - }, + 8, + }, // num_hidden_nodes + { av1_tx_type_nn_weights_4x4_ver_layer0, + av1_tx_type_nn_weights_4x4_ver_layer1 }, + { av1_tx_type_nn_bias_4x4_ver_layer0, av1_tx_type_nn_bias_4x4_ver_layer1 } }; /******************************************************************************/ // Tx type model for 4x8 block. static const float av1_tx_type_nn_weights_4x8_hor_layer0[32] = { - 0.68355f, -0.06887f, 0.68525f, -0.86048f, -0.35906f, -0.28597f, -0.21108f, - 0.12591f, -1.13025f, -0.65695f, -0.25658f, 0.39155f, 0.89011f, 0.19258f, - 0.28316f, 0.61172f, 0.52587f, 0.99182f, 0.75704f, 0.66788f, -1.61814f, - -1.23483f, -0.62868f, -0.11902f, 0.33295f, 0.64796f, 0.92345f, -0.71821f, - 0.07575f, 0.34687f, 0.20518f, -0.87850f, + 0.00218f, -0.41880f, -0.61215f, -0.92588f, 0.54291f, -0.10898f, 0.70691f, + 0.46819f, -1.61598f, -0.08834f, -0.96839f, 1.18489f, -0.45171f, -0.65445f, + -0.32179f, -0.10399f, 1.04379f, 0.91895f, 0.85589f, 0.08267f, 1.35388f, + -2.03096f, 0.08168f, -0.06372f, -0.26732f, -0.48262f, -0.08682f, 2.44071f, + -1.35896f, -1.17121f, 1.68866f, 0.10357f, }; static const float av1_tx_type_nn_bias_4x8_hor_layer0[8] = { - 1.14049f, -0.18583f, 1.92114f, -0.72057f, - 1.32715f, 0.96713f, 1.09877f, -0.64345f, + 2.93391f, 0.66831f, -0.21419f, 0.00000f, + -0.72878f, 0.15127f, -1.46755f, 0.16658f, }; static const float av1_tx_type_nn_weights_4x8_hor_layer1[32] = { - 0.71978f, 0.06896f, 1.48617f, 0.97124f, -0.02487f, -0.95359f, 0.68983f, - -0.16313f, 0.51324f, -0.33770f, 0.45938f, -1.08238f, 0.72938f, 0.42300f, - 0.85691f, -0.03783f, 1.12617f, -0.04034f, 0.36923f, 0.25638f, 1.10167f, - 0.41633f, 0.72602f, -0.14797f, 0.66888f, 0.11437f, -0.99797f, -0.20725f, - 1.01163f, 2.06308f, 1.23331f, -0.15481f, + -1.52077f, -1.06243f, 0.35319f, -0.49207f, 0.54524f, 0.44271f, 1.37117f, + -0.38957f, -1.28889f, -0.57133f, 0.04658f, 0.62278f, 0.37984f, 0.33247f, + 1.65547f, -0.56806f, -1.38645f, -0.76258f, 0.67926f, 0.08783f, -0.01443f, + 0.34950f, 1.45812f, -0.51332f, -1.41331f, -0.16453f, 0.05755f, 0.31405f, + -0.50191f, 0.18219f, 1.83664f, -0.75276f, }; static const float av1_tx_type_nn_bias_4x8_hor_layer1[4] = { - 2.14443f, - 1.98356f, - 0.74616f, - 2.58795f, + -1.17455f, + -2.26089f, + -1.79863f, + -2.26333f, }; static const NN_CONFIG av1_tx_type_nnconfig_4x8_hor = { @@ -101,62 +136,57 @@ static const NN_CONFIG av1_tx_type_nnconfig_4x8_hor = { { 8, }, // num_hidden_nodes - { - av1_tx_type_nn_weights_4x8_hor_layer0, - av1_tx_type_nn_weights_4x8_hor_layer1, - }, - { - av1_tx_type_nn_bias_4x8_hor_layer0, - av1_tx_type_nn_bias_4x8_hor_layer1, - }, + { av1_tx_type_nn_weights_4x8_hor_layer0, + av1_tx_type_nn_weights_4x8_hor_layer1 }, + { av1_tx_type_nn_bias_4x8_hor_layer0, av1_tx_type_nn_bias_4x8_hor_layer1 } }; static const float av1_tx_type_nn_weights_4x8_ver_layer0[128] = { - 0.88859f, 1.02796f, 1.15509f, 0.61719f, 0.85804f, 1.17581f, 0.93524f, - 0.06546f, 0.08018f, -0.78562f, -0.36614f, 0.14149f, -0.30069f, -0.52647f, - -0.82789f, 0.60527f, -1.74026f, -0.20271f, 0.09875f, 0.03708f, 0.09430f, - -0.24043f, -0.38433f, 1.21014f, 1.42443f, 0.69586f, 1.07812f, 1.21748f, - 1.10989f, 0.93122f, 1.04127f, 0.39424f, 0.95592f, 0.12904f, 0.46330f, - 0.49722f, 0.46303f, 0.36979f, 0.60227f, 0.39345f, -2.01632f, -0.05706f, - 0.07766f, -0.01271f, -0.16577f, -0.21957f, -0.14800f, 0.24898f, 0.27662f, - 0.42028f, 0.44748f, 1.14585f, 1.38805f, 0.46182f, -0.22982f, -0.07324f, - 0.29886f, -0.46959f, -0.04228f, -0.01064f, 0.24260f, -0.32282f, -0.23804f, - 1.44466f, -0.42190f, -0.36385f, 0.39746f, 0.38557f, -0.09624f, -0.21540f, - 0.57385f, -0.72878f, -0.39677f, -0.00717f, 0.60499f, 1.33849f, 1.05337f, - 1.11947f, 0.38487f, 0.86534f, -0.33970f, 0.71140f, 0.20772f, 0.61132f, - 0.06181f, -0.20027f, 0.13736f, -0.72321f, 0.64586f, -0.56740f, -0.90912f, - -0.20452f, 0.15381f, -0.84346f, 0.19550f, 0.63164f, 1.35441f, 0.63218f, - 0.82883f, 0.38803f, -0.23874f, -0.02962f, 0.23846f, -0.06822f, -0.40159f, - -0.17850f, -0.69524f, 1.12299f, -0.08286f, -0.14150f, -0.28456f, -0.41519f, - -0.12792f, -0.55286f, 0.51655f, 0.06636f, 0.73759f, 0.70072f, 0.12616f, - 0.31282f, 0.17130f, -1.34233f, 0.37221f, 0.95838f, 0.16286f, 1.04301f, - 0.73600f, -0.11233f, + -0.00952f, -0.98858f, -0.93181f, 1.39594f, 0.96559f, 0.18162f, -0.76064f, + -0.06066f, 0.07907f, -0.09365f, -0.21313f, -0.02187f, -2.61707f, -2.68702f, + -0.10982f, 0.18559f, 1.17049f, 1.11387f, 1.12697f, 1.05804f, 1.12764f, + 1.06318f, 1.12052f, 0.17406f, 1.83157f, 0.19362f, 0.46910f, 0.39608f, + 0.33342f, 0.40083f, 0.27645f, 1.06864f, -4.06645f, -0.38775f, -0.11070f, + 0.03781f, -0.09141f, 0.06185f, -0.04852f, 0.20163f, 0.16784f, 0.16641f, + -0.50941f, -0.61087f, 2.07008f, -0.82381f, -0.85558f, 0.05528f, -0.10535f, + -2.81150f, 0.67038f, 0.43643f, 0.49062f, -0.04465f, 0.90438f, 0.00977f, + 0.46272f, 1.59751f, 0.95234f, 0.35086f, 0.85624f, 0.73149f, 1.67779f, + -2.21511f, -1.24746f, -1.09014f, -0.92441f, -1.22591f, -1.06961f, -0.95897f, + -1.24956f, 0.73797f, 1.23275f, -0.60064f, -0.07851f, 0.14397f, 0.22110f, + -0.04422f, 0.14350f, 0.75926f, 0.35032f, 0.48104f, 2.81408f, 0.34662f, + 0.42090f, 0.35521f, -1.36804f, -0.14974f, -0.47696f, -0.07892f, 0.36910f, + 0.32299f, 0.23916f, 0.06032f, -0.17844f, -0.17558f, -1.42746f, -0.55828f, + -1.00418f, -0.64823f, -0.73654f, -0.85197f, -1.50989f, 1.69385f, -0.04973f, + -0.09273f, 1.04249f, 0.79235f, 1.13229f, 0.99617f, 0.03851f, 0.56334f, + 0.90795f, 1.08296f, 0.58519f, 1.74765f, 0.63971f, 1.35951f, 0.07803f, + -0.05127f, 0.26514f, -0.84629f, -0.66343f, -2.10630f, 0.11017f, 2.18528f, + -0.21958f, 0.05970f, }; static const float av1_tx_type_nn_bias_4x8_ver_layer0[16] = { - -0.89131f, 0.09124f, -0.71678f, -1.19929f, 0.98963f, 0.16896f, - -0.44943f, -0.97532f, -0.13997f, 1.07136f, -0.46362f, -0.45253f, - -0.63015f, -0.20008f, 1.24048f, -0.21265f, + 0.04205f, 0.22260f, -1.03870f, -1.19568f, 0.44283f, 0.01143f, + 0.00235f, 4.26772f, 0.44364f, -0.33199f, -0.39076f, -0.35129f, + 0.08288f, 0.18195f, -0.79890f, 0.10047f, }; static const float av1_tx_type_nn_weights_4x8_ver_layer1[64] = { - -0.79795f, 0.45973f, -0.54188f, -1.05095f, 0.64404f, -0.56470f, -0.57018f, - 0.61644f, 0.50229f, 1.14006f, 0.13805f, -0.42058f, -0.07468f, 0.66203f, - 0.93180f, -0.59662f, -0.25152f, 0.00336f, 1.09769f, -1.11921f, 0.15151f, - 0.58750f, -0.42480f, -0.95908f, -0.10980f, 1.31715f, 0.06665f, -0.52371f, - 0.37228f, -0.12364f, 0.54876f, -0.32698f, 0.39863f, -0.97669f, -1.06351f, - 1.82755f, 1.02851f, 0.10322f, -0.08322f, 0.08891f, -0.05715f, 0.93503f, - 0.02096f, -0.39506f, -0.99330f, -0.09407f, 0.75108f, -0.30104f, 1.78314f, - -0.01786f, -0.17392f, 0.00461f, 0.41394f, 0.92566f, 1.11251f, -0.71380f, - -0.04907f, 0.12736f, 0.00208f, 0.94451f, -0.31783f, -0.19655f, 0.64619f, - 0.50359f, + -0.38193f, -0.12095f, 1.57802f, 0.34932f, -0.47333f, -0.12304f, -0.01736f, + -2.52445f, 0.18983f, -0.64707f, -0.60889f, -0.53750f, 0.91666f, -0.62823f, + -0.13377f, -0.43594f, -0.38618f, -0.01328f, 0.97457f, 1.48589f, -1.03238f, + -0.33459f, -0.35108f, -2.42417f, 0.60229f, 0.06824f, -0.75495f, 0.26902f, + 0.65311f, -0.23887f, -0.44604f, -0.55800f, -0.33842f, 0.04259f, -0.59589f, + 0.49738f, -0.62301f, -0.30896f, -0.29602f, -2.57052f, 2.00943f, -0.66490f, + -0.76312f, 0.28256f, 1.06311f, -0.38364f, -0.63508f, -0.57609f, -0.88765f, + -1.04403f, -0.46531f, 0.34084f, -1.20498f, -0.68352f, -0.72251f, -2.63242f, + -0.68736f, -0.37904f, -1.32371f, 0.47288f, 1.51904f, 0.78372f, -1.01830f, + -1.01848f, }; static const float av1_tx_type_nn_bias_4x8_ver_layer1[4] = { - 0.39274f, - 1.27276f, - 0.30322f, - 2.55238f, + -1.45955f, + -2.08949f, + -1.24813f, + -1.55368f, }; static const NN_CONFIG av1_tx_type_nnconfig_4x8_ver = { @@ -166,64 +196,59 @@ static const NN_CONFIG av1_tx_type_nnconfig_4x8_ver = { { 16, }, // num_hidden_nodes - { - av1_tx_type_nn_weights_4x8_ver_layer0, - av1_tx_type_nn_weights_4x8_ver_layer1, - }, - { - av1_tx_type_nn_bias_4x8_ver_layer0, - av1_tx_type_nn_bias_4x8_ver_layer1, - }, + { av1_tx_type_nn_weights_4x8_ver_layer0, + av1_tx_type_nn_weights_4x8_ver_layer1 }, + { av1_tx_type_nn_bias_4x8_ver_layer0, av1_tx_type_nn_bias_4x8_ver_layer1 } }; /******************************************************************************/ // Tx type model for 8x4 block. static const float av1_tx_type_nn_weights_8x4_hor_layer0[128] = { - 0.64828f, 0.61618f, 0.98975f, -0.14562f, 0.26957f, 1.80872f, 0.58299f, - -0.06917f, 0.00937f, -0.74073f, -0.66045f, -0.04576f, -0.39802f, -0.76960f, - -0.85166f, 0.88799f, -0.70694f, -0.34366f, -0.54906f, -0.39502f, -0.29465f, - -0.49650f, -0.32171f, 1.37181f, 1.30432f, 0.71843f, 1.01916f, 1.01582f, - 0.90999f, 0.86334f, 1.04603f, 0.40734f, 0.96187f, 0.53742f, 0.07510f, - 0.44167f, 0.02049f, -0.02874f, 0.97191f, 1.03647f, -2.62751f, -0.01390f, - -0.09282f, -0.02522f, -0.30849f, -0.19386f, -0.51356f, 0.52778f, 0.77191f, - 0.75416f, 0.69067f, 0.93561f, 1.35982f, 0.76193f, 0.57869f, 0.00251f, - -0.87244f, -0.26922f, -0.06682f, 0.07176f, 0.51142f, 0.58948f, 0.13914f, - 0.71165f, -0.40329f, -0.33201f, 0.35293f, 0.33437f, -0.01812f, -0.24765f, - 0.26810f, -0.77088f, 1.35707f, 0.22243f, 0.78402f, 0.66191f, 0.79890f, - 1.90669f, 0.73189f, 0.24222f, -0.34682f, 0.66990f, 0.19554f, 0.58414f, - 0.05060f, -0.21271f, 0.11656f, -0.74907f, 0.68837f, -0.39147f, -1.78263f, - -0.69918f, -0.06838f, -0.26927f, 0.38502f, 0.08305f, 1.29848f, 0.67328f, - 0.67269f, 0.65805f, -0.47778f, -1.02617f, 0.16523f, 0.12223f, -0.35294f, - -0.15866f, -0.56224f, 1.25895f, -0.21422f, -0.33518f, -0.33519f, -0.37414f, - 0.55122f, 0.14806f, 0.44312f, -0.07865f, 0.75295f, 0.10766f, 0.59922f, - 0.48837f, -0.19099f, -2.07991f, 0.35755f, 0.87813f, 0.07559f, 1.00724f, - 0.25223f, -0.06761f, + -0.22492f, 0.13341f, -4.03243f, -0.64015f, 0.02783f, 0.60466f, -0.13335f, + 0.16828f, 0.12336f, 0.52904f, 1.18455f, -0.32425f, 0.13052f, 0.93810f, + -3.71165f, 0.02990f, -4.63558f, 0.05666f, 0.03524f, -0.07449f, -0.44006f, + -0.33215f, -0.33713f, 0.08097f, 0.60873f, 0.29582f, 0.21696f, -0.78729f, + -0.16757f, -0.26567f, -0.00720f, -1.11226f, 1.58189f, 1.58463f, 1.48536f, + 1.54374f, 1.60069f, 1.46125f, 1.53932f, 0.05974f, -1.82192f, 0.47043f, + 0.38090f, 0.20833f, -0.05637f, 0.05183f, 0.01323f, -0.25662f, 0.78634f, + -0.55069f, -0.02975f, -1.29294f, -0.77192f, -2.34299f, -1.28074f, 0.77894f, + -1.69740f, -1.66032f, -1.44323f, -1.55063f, -1.50845f, -1.23690f, -1.80663f, + 0.75079f, 2.32551f, 0.05878f, 0.80438f, 0.88584f, 0.69153f, 0.89060f, + 0.73660f, 0.87259f, -0.00745f, -1.30044f, -0.59430f, 2.07270f, 1.03307f, + -0.84697f, -1.19393f, 0.17549f, -0.24978f, -3.67234f, 0.20781f, -0.53946f, + -0.05068f, 0.88274f, 1.30371f, 0.10288f, 0.07585f, 0.12259f, -0.30815f, + 0.25437f, -2.82096f, -2.69482f, 0.02370f, 0.12500f, -0.21019f, -0.49220f, + 0.03638f, -0.29795f, 0.28645f, -0.48432f, -0.38584f, -0.32148f, -0.47197f, + 0.32437f, 0.32528f, -0.19437f, 0.30383f, -0.31879f, 0.26359f, -0.12164f, + -0.43647f, -0.08288f, -0.33438f, -0.63608f, -0.46647f, -0.46574f, 0.47806f, + -0.49012f, -1.51234f, -1.13502f, -1.20470f, -1.02913f, -1.09182f, -0.93921f, + -1.85523f, 0.92532f, }; static const float av1_tx_type_nn_bias_8x4_hor_layer0[16] = { - -0.54227f, 0.08599f, -0.77447f, -1.10920f, 0.89298f, 0.05454f, - -0.73681f, 0.21048f, -0.41041f, 1.25690f, -0.60918f, 0.14661f, - -0.65392f, -0.25881f, 1.67995f, -0.03550f, + 0.36631f, 0.02901f, 0.64305f, 1.53074f, -1.40229f, 0.03852f, + -0.05043f, 0.89632f, -1.23312f, 0.07036f, 0.17070f, 0.56250f, + -0.28958f, -0.32869f, -0.01704f, 0.68171f, }; static const float av1_tx_type_nn_weights_8x4_hor_layer1[64] = { - -0.22312f, 0.73552f, 0.48399f, -0.66996f, 0.36527f, -0.42228f, -1.10793f, - 0.31167f, 0.16177f, 1.69315f, -0.06287f, -0.35804f, -0.24889f, 0.80824f, - 1.08952f, -0.62838f, 0.30066f, -0.19043f, -0.00518f, -1.31005f, 0.65797f, - 1.07714f, -0.24253f, 0.49779f, 0.05848f, 1.08914f, 0.08015f, -0.38853f, - 0.35108f, -0.11026f, 0.64528f, -0.37615f, 0.39995f, -0.58117f, -1.29627f, - 1.74169f, 0.75558f, -0.04910f, 0.35020f, 0.04556f, 0.12634f, 1.27223f, - 0.02608f, -0.19687f, -0.78649f, -0.22746f, 1.02589f, -0.28411f, 1.42443f, - -0.42115f, -0.21153f, -0.01733f, 0.62001f, 0.87167f, 1.66008f, -0.39179f, - -0.06293f, 0.27012f, 0.16871f, 0.64597f, 0.67358f, -0.20053f, 0.95830f, - 0.44232f, + -0.49441f, -0.31960f, -0.84946f, -0.85800f, -2.37767f, 0.81373f, -0.73172f, + -0.69337f, 0.88807f, -0.49242f, -0.44717f, -0.11436f, 0.09978f, 0.15393f, + 0.17083f, 1.44850f, -0.20582f, -0.04906f, 0.42990f, -0.61939f, -1.09692f, + -1.14885f, -1.36879f, -1.30828f, -0.59558f, -0.30903f, -0.08906f, 0.06953f, + 0.15383f, -0.04193f, -0.54858f, 1.82676f, -0.22411f, 0.05264f, -0.45848f, + -0.72985f, 0.87553f, 0.04116f, -1.29774f, -2.63018f, 1.09089f, -0.36048f, + -0.16725f, 0.11627f, 0.49918f, 0.07539f, 0.00763f, 0.73706f, 0.87800f, + 0.57049f, 0.60969f, 1.02779f, 1.53339f, -0.35915f, 0.06410f, 1.44582f, + 0.09698f, 0.71888f, 0.60594f, 0.84103f, -0.50440f, -0.38825f, 0.15626f, + -1.10654f, }; static const float av1_tx_type_nn_bias_8x4_hor_layer1[4] = { - 0.14889f, - 1.74197f, - 0.53696f, - 2.87574f, + -0.92861f, + -1.45151f, + -1.33588f, + -4.33853f, }; static const NN_CONFIG av1_tx_type_nnconfig_8x4_hor = { @@ -233,42 +258,37 @@ static const NN_CONFIG av1_tx_type_nnconfig_8x4_hor = { { 16, }, // num_hidden_nodes - { - av1_tx_type_nn_weights_8x4_hor_layer0, - av1_tx_type_nn_weights_8x4_hor_layer1, - }, - { - av1_tx_type_nn_bias_8x4_hor_layer0, - av1_tx_type_nn_bias_8x4_hor_layer1, - }, + { av1_tx_type_nn_weights_8x4_hor_layer0, + av1_tx_type_nn_weights_8x4_hor_layer1 }, + { av1_tx_type_nn_bias_8x4_hor_layer0, av1_tx_type_nn_bias_8x4_hor_layer1 } }; static const float av1_tx_type_nn_weights_8x4_ver_layer0[32] = { - 0.81919f, 0.15527f, 0.60055f, -0.54617f, -0.35510f, -0.28223f, -0.20478f, - 0.15001f, -1.84806f, -0.30274f, -0.00865f, 0.33939f, 1.11970f, 0.44630f, - 0.32074f, 0.39637f, 0.08149f, 1.28070f, 0.86703f, 0.76503f, -1.83991f, - -1.13575f, -0.68605f, -0.23690f, 0.07099f, 0.64960f, 0.82543f, -0.72028f, - 0.08220f, 0.34338f, 0.20245f, -0.88920f, + -1.10946f, 1.86574f, -1.59343f, 0.27018f, -1.70676f, -0.73982f, -0.19021f, + -1.94208f, -2.29759f, -1.44402f, 0.28700f, -1.18340f, -1.50158f, -0.44175f, + -1.36831f, 1.00374f, 2.59312f, 0.50291f, -0.71042f, -0.12238f, -0.15901f, + -0.22807f, -0.67376f, -0.30215f, 0.54407f, -0.45538f, 1.18262f, 2.28687f, + 1.66212f, 1.70826f, 1.55182f, 0.12230f, }; static const float av1_tx_type_nn_bias_8x4_ver_layer0[8] = { - 1.14995f, -0.16021f, 2.38325f, -0.65179f, - 1.09624f, 1.07662f, 0.63837f, -0.64847f, + 0.10943f, 2.09789f, 2.16578f, 0.15766f, + -0.42461f, 0.00000f, 1.22090f, -1.28717f, }; static const float av1_tx_type_nn_weights_8x4_ver_layer1[32] = { - 0.10278f, 0.06819f, 1.73885f, 1.29889f, -0.18482f, -1.06132f, 0.67003f, - -0.23280f, 0.50181f, -0.33890f, 0.43524f, -1.03147f, 1.09640f, 0.66332f, - 0.47652f, -0.02251f, 0.94245f, -0.03861f, 0.84776f, 0.28377f, 0.92044f, - 0.23572f, 0.52082f, -0.16266f, 0.45290f, 0.11342f, -0.50310f, -0.92633f, - 1.46345f, 1.84714f, 1.06804f, -0.13610f, + 1.20426f, -1.23237f, 2.41053f, -0.72488f, 1.25249f, 0.18018f, -0.09586f, + 2.17901f, 0.15364f, 1.21535f, -0.38263f, -0.74309f, 0.50551f, -0.54208f, + 0.59139f, 1.16095f, 0.55919f, -0.60183f, 1.18949f, 1.60787f, 0.54002f, + -0.10712f, -0.16153f, 0.16207f, -0.32338f, 2.68712f, -2.83483f, -0.27086f, + -1.15005f, -0.39311f, 1.51236f, -1.68973f, }; static const float av1_tx_type_nn_bias_8x4_ver_layer1[4] = { - 2.41028f, - 1.95675f, - 0.82387f, - 2.41923f, + 1.81013f, + 1.10517f, + 2.90059f, + 0.95391f, }; static const NN_CONFIG av1_tx_type_nnconfig_8x4_ver = { @@ -278,131 +298,181 @@ static const NN_CONFIG av1_tx_type_nnconfig_8x4_ver = { { 8, }, // num_hidden_nodes - { - av1_tx_type_nn_weights_8x4_ver_layer0, - av1_tx_type_nn_weights_8x4_ver_layer1, - }, - { - av1_tx_type_nn_bias_8x4_ver_layer0, - av1_tx_type_nn_bias_8x4_ver_layer1, - }, + { av1_tx_type_nn_weights_8x4_ver_layer0, + av1_tx_type_nn_weights_8x4_ver_layer1 }, + { av1_tx_type_nn_bias_8x4_ver_layer0, av1_tx_type_nn_bias_8x4_ver_layer1 } }; /******************************************************************************/ // Tx type model for 8x8 block. -static const float av1_tx_type_nn_weights_8x8_layer0[128] = { - 0.98214f, 1.05643f, 0.91173f, 0.24165f, 0.39961f, 0.25736f, 0.68593f, - 0.10553f, 0.13353f, -0.49687f, -1.66413f, 1.16584f, 2.25147f, -0.72247f, - -2.65486f, -0.03628f, -1.47746f, -1.07644f, -1.25551f, -0.91260f, -1.26199f, - -1.06022f, -1.42138f, 1.10500f, 2.96552f, -0.40638f, 0.02258f, -0.23137f, - 0.34922f, -0.01454f, 0.41251f, 0.35944f, -1.56742f, 0.01406f, 0.88114f, - 1.42462f, 0.87243f, 0.02439f, 0.07035f, 0.34303f, -3.16843f, 0.25798f, - 0.07494f, 0.38926f, -0.12267f, 0.09049f, -0.36711f, 0.01551f, 1.41269f, - 1.33505f, 1.43627f, 1.41909f, 1.44605f, 1.43008f, 1.36721f, 0.19443f, - -0.08606f, 0.17285f, 0.63692f, 0.92092f, 0.61007f, 0.87100f, -0.33631f, - 1.98025f, -0.40686f, -0.33808f, 0.34919f, 0.33817f, -0.01807f, -0.25259f, - 0.26442f, -0.76979f, 1.07788f, -1.38747f, 1.34315f, 2.79947f, 2.02838f, - -0.25062f, 0.00174f, 1.25888f, 0.17344f, 0.20897f, 1.28765f, 1.95749f, - 1.62351f, 1.04556f, 0.43858f, 0.12463f, 1.66399f, 0.03971f, 0.36614f, - 0.56932f, 0.15982f, 0.11587f, 0.21402f, 1.89386f, -0.91267f, -0.79781f, - 1.79155f, 0.60147f, -0.90118f, -4.32718f, -0.58154f, -0.02181f, -0.40734f, - -0.11409f, -0.79470f, 0.69697f, -0.16588f, -0.16090f, -0.21236f, -0.52776f, - -0.64455f, 0.09173f, 0.80766f, 0.76097f, 0.20295f, -0.93467f, -0.43509f, - 0.59659f, 0.07788f, -3.79459f, 0.16268f, 0.47343f, 0.05106f, -0.24880f, - 1.18941f, 0.10346f, -}; - -static const float av1_tx_type_nn_bias_8x8_layer0[16] = { - 0.75780f, 0.25628f, 0.19911f, -0.41384f, 1.33909f, 0.31498f, - -1.37171f, -1.09561f, -0.44056f, 0.49001f, -0.65804f, -1.96031f, - 0.64806f, -0.52520f, 1.38838f, 0.15519f, -}; - -static const float av1_tx_type_nn_weights_8x8_layer1[64] = { - -0.63856f, -2.02670f, -0.92947f, 0.00216f, 1.47710f, -2.01099f, -2.11289f, - -0.92288f, 0.19296f, 1.37866f, -0.85975f, -0.78624f, -2.10392f, 0.13976f, - 1.06968f, -2.04120f, 0.57991f, -1.84941f, -0.81512f, -2.08254f, -0.47334f, - 0.12256f, -1.39594f, -1.02829f, 0.06134f, 2.25646f, -1.25196f, -2.65317f, - -1.94473f, 0.10989f, 0.55446f, -1.76557f, 0.33455f, -1.85556f, -3.01878f, - -0.25100f, 1.65520f, -1.61409f, 1.16336f, -1.15560f, 0.13631f, 1.50733f, - -1.07538f, -0.91200f, -1.93132f, 0.09271f, 0.24425f, -1.80655f, -0.01138f, - -1.36421f, -0.62970f, -0.84766f, -0.34714f, -0.50531f, 1.91005f, -1.60316f, - -0.02495f, 1.04938f, 0.28411f, -0.79809f, -1.48232f, 0.00766f, 0.94016f, - -1.11974f, -}; - -static const float av1_tx_type_nn_bias_8x8_layer1[4] = { - 0.53574f, - 1.57736f, - -0.13698f, - 2.64613f, -}; - -static const NN_CONFIG av1_tx_type_nnconfig_8x8 = { +static const float av1_tx_type_nn_weights_8x8_hor_layer0[128] = { + -0.85529f, 0.37619f, 0.12754f, 0.08622f, 0.45278f, 0.54929f, 1.60651f, + -0.62654f, -0.54929f, -0.10131f, -0.17569f, 0.13948f, 0.31695f, -0.05616f, + 0.20483f, -0.36448f, 2.27203f, -0.33087f, 0.47679f, 0.86888f, 0.39370f, + 0.46239f, 0.01113f, 1.50327f, -1.48226f, -1.69621f, -1.49777f, -1.38885f, + -1.37753f, -1.22681f, -1.70576f, 0.51329f, -1.65662f, 1.74197f, -0.13579f, + -0.13133f, -0.58396f, -0.55510f, -1.10709f, -2.34975f, 0.22445f, -0.56491f, + -0.83432f, 0.13492f, 1.32147f, 2.85285f, 0.13819f, 0.03792f, -1.30792f, + 0.04155f, -0.70644f, -0.43430f, -0.16212f, -0.86945f, -1.16976f, 1.68339f, + 0.29540f, 0.01137f, -0.25335f, -0.16856f, 0.12028f, 0.05207f, 0.39357f, + -0.01545f, -0.21980f, -1.94091f, -1.01315f, -0.68270f, -0.40590f, -0.67111f, + 2.08283f, 0.19291f, -4.81426f, -0.65044f, -0.24598f, 0.06371f, -0.10272f, + -0.14502f, -0.06821f, 0.45202f, 0.21091f, -0.80864f, 0.39255f, 1.79189f, + 1.80453f, 1.10484f, 1.17608f, 0.96901f, -0.35871f, -0.94311f, 0.63147f, + 2.95157f, 0.45917f, -0.42849f, -0.55643f, -0.06097f, 3.49299f, -0.50972f, + 0.11075f, -0.08405f, -0.09274f, -0.22694f, -0.42426f, 0.48632f, -1.61074f, + 1.82998f, 0.37623f, -1.20330f, -0.01142f, -1.33307f, -0.27492f, -2.23621f, + 1.38846f, 1.42085f, 1.42568f, 1.36152f, 1.46910f, 1.27473f, 1.34752f, + 0.12753f, -1.08197f, -1.08280f, -0.79489f, -1.12338f, -1.06795f, -0.87857f, + -0.99892f, 1.09823f, +}; + +static const float av1_tx_type_nn_bias_8x8_hor_layer0[16] = { + -0.49232f, -0.29685f, -1.44020f, 1.10940f, 1.16452f, -0.34862f, + -0.38761f, -0.36243f, 0.21776f, 0.28234f, 2.34269f, -0.04104f, + -0.26319f, 2.65579f, -1.30137f, -0.01487f, +}; + +static const float av1_tx_type_nn_weights_8x8_hor_layer1[64] = { + -0.38058f, -0.41295f, -1.26884f, -0.75560f, -1.57450f, 0.56072f, -1.42322f, + -0.29106f, 0.07228f, 0.04391f, 1.61388f, -0.03055f, 0.81637f, 2.06045f, + 0.27119f, -0.48328f, -0.45528f, -0.60534f, -1.61209f, -0.78157f, -1.65034f, + 0.60958f, -1.30523f, 0.25143f, 0.11398f, 0.37860f, 1.54829f, 0.02309f, + 0.67288f, 2.11447f, 0.44845f, -0.70406f, -0.67897f, -0.38759f, -1.30383f, + -1.22646f, -1.54571f, 0.60552f, -1.52565f, 0.11469f, 0.17344f, 0.08622f, + 1.57906f, -0.00909f, 0.81634f, 2.04909f, 1.26466f, -1.45741f, -0.75229f, + 0.06200f, -1.05835f, -0.66257f, -1.73766f, 0.99923f, -1.87082f, 0.14580f, + 0.49525f, 0.46839f, 1.32203f, 0.33923f, 0.97001f, 2.38584f, 1.58811f, + 0.06161f, +}; + +static const float av1_tx_type_nn_bias_8x8_hor_layer1[4] = { + 1.70385f, + 1.82373f, + 1.78496f, + 1.80826f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_8x8_hor = { 8, // num_inputs 4, // num_outputs 1, // num_hidden_layers { 16, }, // num_hidden_nodes + { av1_tx_type_nn_weights_8x8_hor_layer0, + av1_tx_type_nn_weights_8x8_hor_layer1 }, + { av1_tx_type_nn_bias_8x8_hor_layer0, av1_tx_type_nn_bias_8x8_hor_layer1 } +}; + +static const float av1_tx_type_nn_weights_8x8_ver_layer0[128] = { + -0.67016f, -1.72366f, -1.86576f, -1.50962f, -1.70419f, -1.73964f, -1.84615f, + 2.09681f, -0.05081f, -0.61030f, 2.02541f, 0.60222f, 0.99936f, 2.02114f, + -0.53893f, -0.23757f, 0.73566f, 0.25443f, 0.00132f, -0.74036f, -0.75351f, + -0.76964f, -1.71007f, -0.15770f, 1.60982f, 2.17638f, 0.90681f, 0.64973f, + 0.85914f, 0.58786f, -1.46228f, 0.05187f, 1.18804f, 0.30850f, 0.29512f, + 0.40526f, 0.37635f, 0.32311f, 0.37471f, 1.12346f, 3.41856f, -0.36653f, + 0.42537f, -0.19240f, 0.00155f, 0.30826f, -0.02116f, -0.53435f, -0.34829f, + -0.52466f, -0.11521f, -0.29163f, -2.05689f, -2.87372f, -0.62626f, 0.09585f, + -0.75257f, 0.10057f, 1.43474f, 0.89450f, 0.75900f, 1.11147f, 1.00558f, + 0.25886f, 2.22095f, -0.17926f, 0.57161f, 0.39546f, 0.47846f, 0.40452f, + 0.54298f, 0.45814f, -3.62788f, -3.02374f, 0.03716f, -0.13937f, -0.09415f, + -0.12463f, 0.05682f, 0.03672f, 1.20746f, 1.25003f, 1.27071f, 1.31883f, + 1.27473f, 1.34943f, 1.23158f, 0.09039f, 0.19388f, 0.63420f, 2.79612f, + 0.93803f, -0.11323f, -0.02027f, 0.41286f, -0.05979f, -3.80705f, -0.52451f, + -0.77098f, -0.68132f, -0.65559f, -0.60975f, -1.26165f, 0.25582f, 0.05346f, + 0.61403f, 0.32140f, -2.39831f, -1.42355f, 1.30541f, 1.02361f, 0.12930f, + -1.61469f, -0.77036f, -0.59144f, 1.27769f, 1.52068f, 0.82137f, 1.83159f, + -0.66626f, -0.69806f, -1.00564f, -0.85995f, -0.90889f, -0.84412f, -0.85712f, + -1.29848f, 0.39308f, +}; + +static const float av1_tx_type_nn_bias_8x8_ver_layer0[16] = { + -0.14868f, -0.48343f, 3.94416f, -0.78037f, -1.33789f, -0.60611f, + 0.51793f, 0.44030f, -0.71563f, 0.22561f, -1.19083f, -0.46149f, + 0.83015f, 0.06024f, 1.17180f, 0.65122f, +}; + +static const float av1_tx_type_nn_weights_8x8_ver_layer1[64] = { + -1.42711f, -0.21683f, 2.12061f, 0.20489f, -0.50228f, -0.24770f, 0.23391f, + 1.03470f, -0.44847f, -0.63225f, -0.21583f, -0.06467f, -0.21892f, -0.07786f, + 1.43322f, 0.00280f, -1.53057f, -0.18912f, 1.95333f, 0.31151f, -2.07601f, + 0.06776f, 0.25529f, 0.94800f, -1.11453f, -0.20594f, -0.13281f, 0.01485f, + 0.17650f, -0.07955f, 1.43734f, -0.23193f, -2.06463f, -0.21238f, 2.13707f, + 0.30351f, 0.27594f, -0.36245f, 0.19539f, 0.91045f, -0.24068f, -0.37616f, + 0.88792f, 0.02947f, -0.16903f, -0.04932f, 1.51293f, -0.95967f, -1.62903f, + 0.05326f, 2.30703f, 0.64445f, -1.09464f, -0.16623f, 1.00240f, 0.07548f, + -0.50406f, 0.63854f, 1.02340f, 0.49833f, 0.13671f, 0.26722f, 2.09516f, + -0.41305f, +}; + +static const float av1_tx_type_nn_bias_8x8_ver_layer1[4] = { + 2.14067f, + 2.76699f, + 2.04233f, + 1.34803f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_8x8_ver = { + 8, // num_inputs + 4, // num_outputs + 1, // num_hidden_layers { - av1_tx_type_nn_weights_8x8_layer0, - av1_tx_type_nn_weights_8x8_layer1, - }, - { - av1_tx_type_nn_bias_8x8_layer0, - av1_tx_type_nn_bias_8x8_layer1, - }, + 16, + }, // num_hidden_nodes + { av1_tx_type_nn_weights_8x8_ver_layer0, + av1_tx_type_nn_weights_8x8_ver_layer1 }, + { av1_tx_type_nn_bias_8x8_ver_layer0, av1_tx_type_nn_bias_8x8_ver_layer1 } }; /******************************************************************************/ // Tx type model for 8x16 block. static const float av1_tx_type_nn_weights_8x16_hor_layer0[128] = { - 1.36274f, 1.37313f, 1.26859f, 1.26459f, 1.37979f, 1.47217f, 1.29710f, - 0.15765f, 0.31552f, -0.05727f, 0.25562f, 0.47925f, -0.32913f, -0.55757f, - -0.98010f, 0.08568f, -0.62754f, 0.12834f, -0.03717f, 0.06286f, 0.26159f, - 0.26023f, -0.62605f, 1.34500f, 1.47720f, 0.47937f, 0.84793f, 0.87866f, - 0.81260f, 0.74761f, 0.84217f, 0.53321f, -0.78232f, 0.35321f, 0.41240f, - 0.45002f, 0.88973f, 0.51055f, 0.91115f, -0.45512f, -2.37418f, -0.25205f, - 0.05893f, -0.15685f, -0.25156f, -0.17104f, -0.12230f, 0.17802f, 0.18796f, - -0.05797f, 0.26484f, 1.23515f, 1.70393f, 0.46022f, -0.14354f, 0.08501f, - -0.84625f, -0.42578f, -0.29345f, -0.51797f, -0.56515f, -0.47465f, 0.23970f, - 1.59912f, -0.40332f, -0.33209f, 0.37274f, 0.36831f, -0.00248f, -0.24295f, - 0.29539f, -0.76136f, -0.22531f, 0.12371f, 0.37889f, 1.02639f, 1.73330f, - 1.09686f, 1.04111f, 0.69006f, -1.27157f, 0.94013f, 0.61621f, 0.62274f, - 0.48759f, 0.55672f, 0.62597f, -0.38846f, 1.72124f, 0.08214f, -0.06650f, - 0.32617f, 0.10958f, 0.24650f, 0.10740f, 1.16861f, 0.50701f, 0.45383f, - 0.90016f, -0.00695f, -0.11986f, -0.07834f, 0.20346f, 0.25863f, -0.40889f, - -0.11344f, -0.79108f, 0.76259f, -0.14562f, -0.15459f, -0.20954f, -0.51306f, - 0.02743f, -0.82456f, -0.00861f, -0.27274f, 0.28762f, 0.07282f, 0.26410f, - 0.53413f, -0.22208f, -0.85031f, -1.39129f, -0.74519f, 0.09771f, 0.80313f, - 1.07698f, 0.02531f, + -1.61872f, -1.58520f, -1.41236f, -1.53255f, -1.59794f, -1.25769f, -1.90043f, + 0.73431f, 1.10135f, 0.47054f, 0.43230f, -0.43009f, -0.09135f, -0.07289f, + -0.38785f, 1.23775f, -0.35312f, 0.73789f, 0.88864f, 0.75957f, 0.62579f, + 0.46974f, 0.21851f, 1.63821f, -2.27289f, -0.68522f, -0.69814f, -0.84368f, + -0.91320f, -0.63055f, -1.03296f, 0.55778f, -0.00071f, 1.27539f, 1.60068f, + 1.40975f, 0.97372f, 0.92843f, 1.90853f, 0.12626f, 1.71953f, 1.41978f, + -0.12234f, -1.27058f, 0.76207f, 0.02495f, -0.67038f, -0.05255f, 1.72923f, + 1.47630f, 1.47058f, 1.47614f, 1.49354f, 1.66131f, 1.50801f, 0.17145f, + -2.30947f, -2.10850f, -1.25636f, -0.24900f, 0.72602f, 1.26572f, 0.97865f, + -0.65466f, 1.31129f, 0.26916f, 0.12139f, -0.12761f, -0.39143f, -0.28134f, + 0.06584f, 2.24418f, 0.22516f, 0.05011f, -0.01671f, -0.29476f, -0.40326f, + 0.21138f, -0.11573f, -0.31154f, -0.36828f, 0.03694f, -0.07172f, -0.63419f, + -3.14351f, -1.23125f, 0.65311f, -0.11406f, 1.97287f, -0.10422f, 0.83896f, + 0.85033f, 0.49724f, 0.80482f, 0.51454f, 1.06447f, 0.76693f, 0.72599f, + -0.78573f, -0.53950f, 0.40894f, 0.00086f, 0.10784f, -0.70498f, 1.16395f, + 1.14597f, 1.13496f, 1.12177f, 1.02100f, -1.37574f, -2.97144f, 0.33899f, + 0.42013f, 0.86327f, 2.31983f, 2.04008f, 0.95503f, 0.15081f, 0.11530f, + -0.02574f, -4.77119f, 0.13257f, -0.01704f, -0.23087f, -0.00825f, 0.07029f, + -0.28136f, 0.42556f, }; static const float av1_tx_type_nn_bias_8x16_hor_layer0[16] = { - -1.30434f, -1.19259f, -0.43467f, -0.85386f, 0.96584f, 0.29276f, - -0.41990f, -0.96924f, -0.30933f, 0.95264f, -0.25330f, -1.19584f, - 1.46564f, -0.42959f, 1.55720f, 0.18479f, + 0.93617f, -0.24000f, -1.26821f, 0.78780f, 0.13690f, -0.21948f, + -1.45162f, 0.44584f, -1.92582f, -0.23169f, 0.56004f, -1.19937f, + 1.81560f, -1.02643f, -0.81690f, 0.08302f, }; static const float av1_tx_type_nn_weights_8x16_hor_layer1[64] = { - -1.72959f, -0.21670f, 0.10616f, -0.02006f, 0.15084f, -0.85303f, -0.27535f, - 0.58704f, 0.23683f, 1.19743f, 0.77971f, 0.49874f, 0.19508f, 0.19641f, - 1.47895f, -0.52173f, -0.56746f, -0.50761f, 0.15864f, -0.95168f, 0.48103f, - 0.91904f, -0.11700f, 0.62863f, 0.06526f, 1.63803f, -0.72325f, -1.80449f, - 0.66373f, 0.12831f, 0.27139f, -0.26346f, 1.50852f, 0.25079f, -0.54255f, - 1.78815f, 1.39691f, -0.44989f, -0.18511f, -1.52903f, 0.13983f, 1.06906f, - -0.30184f, 0.37566f, 0.46209f, 0.10440f, 0.64695f, -0.34002f, 1.96990f, - 0.21189f, -0.91248f, -0.11263f, 0.26708f, 1.27405f, 1.89776f, 0.02081f, - -0.06977f, -0.02584f, 0.47733f, 0.27117f, 1.33315f, -0.09175f, 0.48747f, - 1.16772f, + 0.06696f, -0.11538f, -1.42029f, 0.32965f, 0.81046f, 0.01146f, 1.20945f, + -0.16899f, 0.53224f, -0.40232f, 0.01786f, -0.73242f, 1.29750f, 1.95185f, + 0.70143f, 1.43287f, 0.76220f, 0.79937f, -1.79011f, -1.15178f, 0.42526f, + -0.67519f, 0.77267f, -0.30697f, 2.46004f, -0.49828f, 0.02875f, 1.09972f, + 1.47662f, 0.61719f, 0.61417f, -0.12363f, 2.53048f, 0.00418f, -1.38964f, + 0.88117f, 0.39239f, -0.19347f, -2.58600f, -0.33715f, 1.09323f, -0.32127f, + 0.02456f, -0.19125f, 1.12728f, 0.66502f, 0.34296f, 1.14897f, 0.29967f, + 1.19209f, 0.22108f, -0.11975f, 1.49776f, -1.34624f, -2.58478f, -1.34632f, + 1.53207f, 0.45634f, -1.48476f, 0.17489f, 0.71790f, -2.12086f, -1.21778f, + -1.31243f, }; static const float av1_tx_type_nn_bias_8x16_hor_layer1[4] = { - 1.25783f, - 1.19452f, - 0.69964f, - 2.41982f, + 0.83359f, + 1.06875f, + 1.77645f, + 1.49570f, }; static const NN_CONFIG av1_tx_type_nnconfig_8x16_hor = { @@ -412,62 +482,57 @@ static const NN_CONFIG av1_tx_type_nnconfig_8x16_hor = { { 16, }, // num_hidden_nodes - { - av1_tx_type_nn_weights_8x16_hor_layer0, - av1_tx_type_nn_weights_8x16_hor_layer1, - }, - { - av1_tx_type_nn_bias_8x16_hor_layer0, - av1_tx_type_nn_bias_8x16_hor_layer1, - }, + { av1_tx_type_nn_weights_8x16_hor_layer0, + av1_tx_type_nn_weights_8x16_hor_layer1 }, + { av1_tx_type_nn_bias_8x16_hor_layer0, av1_tx_type_nn_bias_8x16_hor_layer1 } }; static const float av1_tx_type_nn_weights_8x16_ver_layer0[128] = { - 0.90888f, 0.86305f, 0.81674f, 0.75352f, 1.07834f, 0.99048f, 0.96355f, - 0.13836f, -0.51334f, 0.19906f, 1.84608f, 0.67828f, 0.45876f, 0.08325f, - 0.28190f, -0.01958f, -1.96553f, 0.27837f, -0.05929f, 0.13491f, 0.21036f, - 0.05797f, -0.01373f, 0.73765f, 1.39603f, -0.53767f, 0.10362f, 0.03420f, - 0.41909f, 0.09510f, 0.32284f, 0.83860f, 0.13954f, 0.48434f, 1.47762f, - 0.45891f, 0.23613f, 0.13013f, 0.82097f, -0.03251f, -1.89757f, 0.21589f, - -0.10370f, 0.02530f, -0.25659f, 0.01466f, -0.23661f, 0.22783f, 0.92100f, - 1.02915f, 1.20358f, 1.17251f, 0.97749f, 1.04696f, 0.91333f, 0.54576f, - -0.52792f, 0.02217f, 0.25652f, 0.31405f, -0.18398f, 0.04572f, -0.81359f, - 1.82883f, -0.40047f, -0.33056f, 0.35255f, 0.34448f, -0.00339f, -0.23857f, - 0.28925f, -0.77175f, -0.24325f, -0.21420f, 1.11451f, 1.39553f, 0.51573f, - 0.05476f, 1.13791f, 0.94959f, -0.35710f, 0.67467f, 0.16722f, 0.61213f, - 0.07683f, -0.20613f, 0.13440f, -0.72131f, -0.15418f, -0.17688f, -0.16510f, - -0.19226f, 0.09270f, -2.43559f, -0.12669f, 0.05074f, 0.30414f, 0.00927f, - 0.60630f, 0.00801f, -1.07310f, -0.06227f, 2.10607f, 0.02382f, -0.39891f, - -0.09149f, -0.78596f, 0.83966f, -0.14802f, -0.14083f, -0.20831f, -0.55136f, - 0.08566f, -0.00647f, 0.07044f, 0.53408f, 0.85720f, -0.07393f, 0.24476f, - 0.43767f, 0.30519f, -1.89430f, 0.23252f, 1.63790f, 0.17316f, -0.03903f, - 0.25269f, 0.01562f, + 0.32858f, -1.28887f, 0.25632f, -0.05262f, 2.69203f, -0.07004f, 1.37337f, + -0.05725f, -0.05659f, 0.05592f, 0.01039f, -0.29343f, 1.58628f, -0.30003f, + -3.43118f, 0.00272f, 1.70928f, -0.76348f, 0.05889f, -0.03263f, -0.07724f, + 0.03523f, -0.19890f, 1.18005f, -0.03605f, -0.20530f, -4.00733f, 0.10210f, + -0.05368f, -0.17650f, -0.15317f, 0.06499f, 0.56705f, 1.04341f, 0.62890f, + 0.73451f, -0.22199f, 0.86659f, 0.78443f, -0.61664f, -0.50606f, 0.30247f, + 0.14455f, 0.39276f, 0.49203f, 0.65019f, 0.12269f, 1.64080f, 1.68289f, + 1.42694f, 1.60825f, 1.58501f, 1.47252f, 1.62589f, 1.48218f, 0.17726f, + -0.04884f, 0.35376f, -0.04796f, 0.32589f, 0.35087f, 0.35258f, -0.46103f, + -0.31176f, -0.05203f, 0.07247f, -0.26756f, 0.22019f, 0.03412f, 0.33773f, + 0.29811f, -0.11140f, 0.12831f, -0.44673f, -0.09858f, 0.07889f, 0.15137f, + 0.00347f, -0.23394f, 0.08886f, -0.31201f, -0.79912f, -0.51092f, 0.14123f, + -1.09599f, -4.26020f, -0.68675f, -0.02842f, -1.54538f, -1.28977f, -1.30558f, + -1.21074f, -1.37142f, -1.14743f, -1.85397f, 0.82985f, -0.30681f, 0.04494f, + -0.24023f, -4.18053f, -0.16096f, -0.55492f, -0.27882f, 0.05829f, -0.41224f, + -2.52088f, -0.56162f, -1.04547f, -1.70685f, -0.28842f, -1.43673f, -0.01468f, + -3.20585f, -0.69120f, -0.43931f, -0.46270f, -0.65885f, -0.55884f, -0.75138f, + 0.36381f, -5.70858f, -0.14548f, -0.15745f, -0.11812f, -0.07605f, -0.07693f, + -0.12236f, 0.16075f, }; static const float av1_tx_type_nn_bias_8x16_ver_layer0[16] = { - -0.83370f, -0.20704f, -0.60437f, -0.81664f, 1.16998f, 0.16745f, - -1.34680f, -1.07083f, -0.34649f, 0.65598f, -0.56278f, 0.22660f, - -0.25956f, -0.29608f, 1.24359f, -0.09167f, + -0.35385f, 0.30491f, -0.90011f, 0.42941f, 1.20928f, -0.88331f, + -1.48818f, -0.34785f, -0.32668f, -0.22695f, 0.89188f, 0.65521f, + 0.57598f, 0.99819f, 0.75175f, 0.17044f, }; static const float av1_tx_type_nn_weights_8x16_ver_layer1[64] = { - -0.71147f, -0.63964f, -0.69220f, 0.22326f, 0.67191f, -0.58894f, -0.98464f, - 0.23583f, 0.22824f, 1.39838f, 0.09920f, -0.59411f, -0.67101f, 0.19088f, - 0.83025f, -0.66991f, -0.42889f, -0.49969f, 1.39532f, -1.02000f, 0.62101f, - 0.57175f, -0.83226f, 0.01551f, 0.05604f, 1.23028f, 0.02030f, -0.55995f, - -0.42349f, 0.15375f, 0.52132f, -0.52421f, 0.89586f, -0.73778f, -0.10911f, - 0.22447f, 1.16858f, -0.48169f, 1.73890f, -0.69860f, 0.12504f, 1.10492f, - 0.04391f, -0.85670f, -0.49257f, 0.09616f, 0.76518f, -0.44854f, 1.50938f, - 0.62246f, -0.40366f, -0.11182f, -0.01680f, 0.59724f, 1.32170f, -1.09061f, - -0.04278f, -0.02449f, 0.25024f, 1.26239f, 0.42345f, -0.10031f, 0.80871f, - 0.44198f, + -0.62913f, -0.34304f, 0.42963f, -0.17440f, -1.44092f, 0.69142f, -1.36067f, + 0.52211f, 0.44658f, -0.26501f, -0.41657f, 0.34428f, -0.34390f, -0.58567f, + -0.84097f, -1.96311f, -0.37215f, -0.22250f, -1.23811f, -0.07247f, -0.81731f, + 0.58755f, -1.30559f, 0.39551f, 0.41743f, -0.09940f, -0.33230f, 0.14458f, + -0.25139f, -0.54517f, 0.13469f, -0.38157f, -0.39109f, -0.18205f, 0.06834f, + -0.08395f, -0.92187f, 0.56724f, 1.44381f, 0.53226f, -0.22356f, 0.12285f, + -0.29418f, -1.86749f, -0.22372f, -0.60204f, -0.87746f, -1.16936f, 0.56884f, + 0.62641f, -0.11823f, 1.00395f, 1.64794f, -0.64535f, 2.29322f, -0.23397f, + 0.17251f, -0.35927f, 0.65631f, -0.26812f, 0.80128f, 0.85748f, 0.47404f, + 2.20547f, }; static const float av1_tx_type_nn_bias_8x16_ver_layer1[4] = { - 0.68329f, - 1.33555f, - 0.25943f, - 3.23439f, + -0.44080f, + -1.67455f, + -1.46332f, + -6.13206f, }; static const NN_CONFIG av1_tx_type_nnconfig_8x16_ver = { @@ -477,64 +542,59 @@ static const NN_CONFIG av1_tx_type_nnconfig_8x16_ver = { { 16, }, // num_hidden_nodes - { - av1_tx_type_nn_weights_8x16_ver_layer0, - av1_tx_type_nn_weights_8x16_ver_layer1, - }, - { - av1_tx_type_nn_bias_8x16_ver_layer0, - av1_tx_type_nn_bias_8x16_ver_layer1, - }, + { av1_tx_type_nn_weights_8x16_ver_layer0, + av1_tx_type_nn_weights_8x16_ver_layer1 }, + { av1_tx_type_nn_bias_8x16_ver_layer0, av1_tx_type_nn_bias_8x16_ver_layer1 } }; /******************************************************************************/ // Tx type model for 16x8 block. static const float av1_tx_type_nn_weights_16x8_hor_layer0[128] = { - 0.89821f, 0.90804f, 1.13052f, 0.74855f, 1.02053f, 0.91260f, 0.97102f, - 0.16808f, -0.19982f, -0.33296f, -0.22490f, -0.22481f, -0.09332f, -2.44338f, - -0.12236f, -0.03158f, -1.43561f, 0.07794f, 0.16586f, 0.09731f, 0.12967f, - 0.09725f, -0.16826f, 1.26640f, 0.88004f, 0.27312f, -0.07993f, 0.33640f, - 0.11732f, 0.33384f, 0.97066f, -0.61744f, -0.48545f, 0.44622f, 0.73744f, - 0.32262f, -0.05713f, 0.42280f, 1.10378f, 0.18540f, -2.07906f, 0.11443f, - 0.37877f, 0.24136f, -0.12524f, -0.12434f, 0.02116f, 0.11716f, 1.28267f, - 1.01508f, 1.26184f, 1.22545f, 1.29582f, 1.18855f, 1.27564f, 0.42001f, - -0.41481f, 0.06725f, -0.13133f, -0.24801f, 0.16515f, 0.16228f, 0.35197f, - 0.53610f, -0.39805f, -0.32584f, 0.40096f, 0.38621f, -0.00030f, -0.23434f, - 0.29149f, -0.76542f, 0.04996f, -0.30036f, 1.48687f, 0.90852f, -0.03083f, - -0.15953f, 1.19259f, 0.87690f, -1.08977f, 0.78757f, 0.81149f, 0.54089f, - 0.35400f, 0.37919f, 0.84997f, -0.20449f, 0.39601f, -0.37596f, 0.64748f, - 0.26021f, 0.37354f, 0.23593f, 0.16335f, 1.70681f, 0.31800f, -0.00964f, - 0.82687f, -0.78372f, -1.47438f, 0.32410f, 1.37436f, 0.07476f, -0.40574f, - -0.10353f, -0.79300f, 0.74381f, -0.15601f, -0.14380f, -0.20961f, -0.52697f, - 0.04669f, -0.00870f, 0.05624f, -0.09036f, 0.25701f, 0.30336f, 0.24199f, - 0.45579f, 0.66330f, -1.81834f, 0.74965f, 1.22747f, 0.25072f, 0.25100f, - 0.43289f, -0.00362f, + 0.02600f, 0.09786f, -1.05107f, -0.35594f, -0.15658f, 2.99828f, -0.07106f, + -0.10101f, -0.14412f, -0.83790f, -0.19434f, 2.28368f, 1.91727f, -0.00956f, + -0.90640f, 0.09174f, 1.58895f, 1.38945f, 1.49431f, 1.51381f, 1.44803f, + 1.53544f, 1.44694f, 0.17753f, 1.69735f, -0.78652f, 0.31092f, -0.23736f, + 0.02231f, -0.09884f, -0.00493f, 1.21189f, -1.94382f, -0.34629f, -0.58309f, + 0.72291f, -0.30056f, 0.90660f, -0.57495f, 3.07809f, 0.73644f, 1.43050f, + 1.34356f, -0.66554f, 0.50102f, -0.64305f, 0.42044f, -1.66165f, -0.05733f, + -2.51402f, -1.01067f, -0.33390f, -0.32986f, -0.92431f, 1.86281f, -0.07290f, + -0.26290f, -0.68941f, 1.81156f, 0.66125f, -2.09974f, 0.17032f, -0.67461f, + -0.00876f, -1.50154f, 1.17153f, 1.00377f, 0.33022f, 0.74689f, 0.42878f, + 0.61725f, -0.83967f, 0.09467f, -0.39892f, 0.33863f, 0.10656f, -0.09249f, + -0.39757f, 0.48481f, -0.35162f, 1.47014f, 1.67827f, -1.84051f, 0.16291f, + -0.50135f, -2.29911f, -0.42217f, -0.13358f, 1.45899f, -0.14743f, -0.02763f, + -0.28003f, -0.01364f, 0.21014f, -0.29026f, -0.20198f, 1.38782f, 0.56731f, + 0.27489f, 0.43227f, 0.41326f, 0.42721f, 0.87720f, -1.90067f, -5.04951f, + -0.17638f, -0.58119f, -0.08954f, -0.13692f, -0.12325f, -0.38548f, 0.66462f, + -1.42377f, -1.21917f, -1.38193f, -1.36539f, -1.39378f, -1.19629f, -1.59812f, + 0.28689f, 0.32394f, 0.52128f, 0.01013f, -0.28948f, -0.26293f, -0.44331f, + -0.36570f, -0.50757f, }; static const float av1_tx_type_nn_bias_16x8_hor_layer0[16] = { - -0.87643f, 0.36754f, -0.86409f, 1.37761f, 1.22688f, 0.09074f, - -1.47139f, -1.06100f, -0.24087f, 1.10382f, -0.32837f, -1.39592f, - -0.14741f, -0.43954f, 1.72137f, -0.21704f, + -0.08696f, -0.22110f, -1.43604f, -1.00451f, -1.51029f, 0.63736f, + 0.45260f, 0.16229f, 4.01393f, -0.21748f, 0.36411f, -0.08764f, + -0.12329f, 0.08986f, 1.08117f, -0.00220f, }; static const float av1_tx_type_nn_weights_16x8_hor_layer1[64] = { - -0.81860f, -0.80745f, -0.43612f, 0.58656f, 0.37455f, -0.56519f, -1.71536f, - 0.23278f, 0.23951f, 1.09610f, 0.49986f, 0.43375f, -0.53182f, 0.17376f, - 1.05626f, -0.61743f, -0.71777f, -0.66943f, 1.40091f, 0.34426f, 1.14295f, - 0.45571f, -0.52504f, -0.00303f, 0.06044f, 0.66119f, -0.60340f, -1.14344f, - -0.28045f, 0.12742f, 0.61484f, -0.41016f, 1.36102f, -0.86969f, -0.52728f, - 1.01725f, 0.67083f, -0.10138f, 1.36406f, 0.34066f, 0.12498f, 0.86595f, - -0.39636f, -0.27888f, -0.40244f, 0.09847f, 0.81178f, -0.45313f, 1.39127f, - 0.99865f, -0.57908f, 0.55072f, 0.49638f, 1.11524f, 1.85504f, -0.28316f, - -0.05195f, -0.23284f, 0.26461f, -1.28120f, 0.60707f, -0.06110f, 0.74085f, - 0.63304f, + 0.55824f, -0.14648f, 0.81947f, -0.45867f, -1.86078f, -0.17291f, 0.34849f, + 0.15153f, 1.75625f, -0.25760f, 0.72015f, -0.30059f, -0.57975f, 0.07609f, + -0.02036f, 0.07912f, 0.57080f, -0.13792f, 0.74184f, -0.87669f, -1.87572f, + -0.27270f, 0.39751f, 0.19652f, 2.03514f, -0.32944f, 0.76251f, 0.04399f, + -0.63175f, 0.37420f, 0.08309f, 0.04466f, 0.60255f, -0.12820f, 1.66065f, + -0.59496f, -1.94794f, -0.14847f, 0.39424f, 0.16273f, 1.80587f, 0.41197f, + 0.74691f, -0.21217f, -0.63173f, 0.09510f, -0.35538f, -0.04407f, 0.92847f, + 0.20141f, 1.68680f, -0.56528f, -2.26960f, 0.12978f, 0.73748f, 0.42438f, + 2.00673f, -0.40189f, 0.95423f, 0.23234f, -0.80953f, 0.65814f, 0.49444f, + -0.23347f, }; static const float av1_tx_type_nn_bias_16x8_hor_layer1[4] = { - 0.71765f, - 1.40400f, - 0.32221f, - 3.07234f, + 3.57175f, + 2.42612f, + 3.31259f, + 2.08287f, }; static const NN_CONFIG av1_tx_type_nnconfig_16x8_hor = { @@ -544,62 +604,57 @@ static const NN_CONFIG av1_tx_type_nnconfig_16x8_hor = { { 16, }, // num_hidden_nodes - { - av1_tx_type_nn_weights_16x8_hor_layer0, - av1_tx_type_nn_weights_16x8_hor_layer1, - }, - { - av1_tx_type_nn_bias_16x8_hor_layer0, - av1_tx_type_nn_bias_16x8_hor_layer1, - }, + { av1_tx_type_nn_weights_16x8_hor_layer0, + av1_tx_type_nn_weights_16x8_hor_layer1 }, + { av1_tx_type_nn_bias_16x8_hor_layer0, av1_tx_type_nn_bias_16x8_hor_layer1 } }; static const float av1_tx_type_nn_weights_16x8_ver_layer0[128] = { - 1.20497f, 1.23691f, 1.23738f, 1.07773f, 1.15264f, 1.31959f, 1.15365f, - 0.17179f, 0.68612f, 0.55636f, 0.57145f, 0.67022f, 0.19636f, -1.27420f, - -1.36428f, -0.16706f, -1.20934f, -0.87794f, -0.97146f, -0.74722f, -1.14493f, - -1.02689f, -0.88153f, 0.83857f, 1.53355f, 0.13601f, 0.35451f, 0.53750f, - 0.62381f, 0.32438f, 0.59405f, 0.33090f, -1.52948f, -0.46094f, 0.42634f, - 0.48763f, 0.30707f, 0.52553f, 0.71427f, -0.31287f, -2.37106f, -0.18756f, - 0.16561f, -0.00431f, -0.13747f, -0.09336f, -0.16511f, 0.13454f, 0.45010f, - -0.00317f, -0.06403f, 0.95442f, 1.59636f, 0.30602f, -0.05515f, 0.05467f, - -0.21758f, -0.19192f, -0.17935f, -0.00545f, 0.35409f, 0.26141f, -0.32174f, - 1.78129f, -0.40161f, -0.33158f, 0.38084f, 0.38081f, 0.01053f, -0.23567f, - 0.29239f, -0.76159f, -0.19373f, 0.13649f, 0.66949f, 1.19733f, 1.92557f, - 1.16691f, 0.94955f, 0.62324f, -0.85434f, -0.07699f, 0.87683f, 0.95911f, - 0.86106f, 0.57959f, 0.40146f, -0.35851f, 1.55427f, 0.15349f, -0.01582f, - 0.32517f, 0.03784f, 0.15916f, 0.09024f, 1.43187f, 0.56160f, 0.11521f, - 0.52476f, -0.26107f, -0.38167f, -0.31596f, 0.31304f, -0.65366f, -0.40680f, - -0.11082f, -0.78585f, 0.77906f, -0.13322f, -0.13747f, -0.21001f, -0.53204f, - -0.06752f, -0.84741f, -0.53442f, -0.16284f, 0.54027f, 0.13586f, -0.42001f, - 0.85388f, 0.08300f, -0.89325f, -1.73681f, -0.70473f, 0.23151f, 0.69549f, - 0.72124f, 0.12769f, + 0.46633f, 1.55328f, -0.11230f, -0.29571f, 0.18814f, -1.52430f, -2.34660f, + 0.08644f, -1.97718f, -1.29140f, -1.12262f, -1.12985f, -1.25911f, -0.96506f, + -1.57129f, 0.96021f, 1.34192f, 1.28623f, 1.21655f, 1.28758f, 1.25482f, + 1.30195f, 1.19190f, 0.09310f, 0.52072f, 0.91487f, 1.24100f, 1.61236f, + 1.72166f, 2.20750f, 1.62379f, -1.43936f, 0.50665f, 0.40213f, 0.66502f, + -1.66699f, -3.07618f, 0.05877f, 0.60987f, -0.09995f, -0.10916f, 0.48049f, + 0.23812f, 0.39847f, -0.21682f, -0.63455f, 0.33453f, -0.67939f, -4.14355f, + -0.62756f, -0.22502f, -0.17215f, 0.01062f, 0.27049f, -0.10748f, 0.30945f, + 2.72445f, -0.89181f, -0.06800f, 0.20595f, -0.73385f, 0.04071f, -1.30294f, + 1.83507f, 0.92570f, 0.69609f, 0.76285f, 0.69892f, 0.76409f, 0.63104f, + 0.73397f, 1.09575f, -0.20129f, -0.24022f, -0.24599f, -0.59107f, -0.88755f, + -0.68987f, -0.75495f, -1.31002f, -1.30237f, -0.94093f, -2.15678f, -1.49303f, + -1.17498f, -1.39952f, -0.91270f, -0.05587f, 1.02381f, -0.75580f, -0.65263f, + -0.78996f, -0.71075f, -0.71018f, -0.70350f, -1.26196f, 2.34208f, -0.53611f, + 0.19752f, -0.16842f, -0.24828f, 0.21857f, 0.08222f, -2.55894f, -1.75702f, + 0.11394f, 1.03083f, 0.79972f, -1.54112f, -1.82341f, -0.57597f, -0.02077f, + -0.39616f, -0.00995f, -0.12809f, 0.01188f, -0.25117f, 0.09202f, 0.09336f, + -0.05614f, -0.30039f, 0.25834f, 1.19944f, 1.22533f, 0.92330f, 0.75967f, + -0.81945f, -0.41647f, }; static const float av1_tx_type_nn_bias_16x8_ver_layer0[16] = { - -1.15644f, -0.31062f, 0.20697f, -0.60304f, -1.19498f, 0.21451f, - -0.42825f, -0.71800f, -0.25816f, 1.47408f, -0.24423f, -1.45773f, - -0.55834f, -0.36938f, 1.56759f, 0.07238f, + 0.17841f, 0.67315f, -1.24450f, 3.13859f, 0.16203f, -0.14992f, + 0.29553f, -1.15567f, -0.71421f, 1.15977f, 1.14585f, 3.02460f, + -0.04510f, 0.48000f, -0.09354f, -0.42422f, }; static const float av1_tx_type_nn_weights_16x8_ver_layer1[64] = { - -1.45227f, -0.67141f, 0.75237f, 0.32681f, -0.70528f, -0.76730f, -0.49777f, - 0.02418f, 0.25096f, 1.14840f, 0.23548f, 0.48755f, 0.33164f, 0.21050f, - 1.41651f, -0.28888f, -0.76668f, 0.04439f, 0.67538f, -1.06438f, 0.68128f, - 0.95824f, 0.08530f, -0.03635f, 0.06820f, 1.38621f, -0.50424f, -1.72992f, - -0.20949f, 0.13400f, 0.93366f, -0.05324f, 1.41593f, -0.75119f, -1.80912f, - 1.05440f, 0.62580f, -0.30867f, -0.07025f, -0.34654f, 0.13621f, 1.74426f, - -0.22417f, 0.47031f, -0.08142f, 0.10151f, 0.42498f, 0.06635f, 1.50623f, - 1.04130f, 0.85107f, 0.23382f, 0.69800f, 1.10856f, 1.18767f, -0.69395f, - -0.07985f, 0.50412f, 0.46019f, 0.49214f, 0.44219f, -0.09502f, 0.75745f, - 0.99208f, + 0.29912f, -0.10009f, -1.11478f, 1.76812f, -0.27719f, 0.52148f, 0.17622f, + -1.17116f, 0.73397f, -0.69279f, -0.11080f, 1.53751f, -1.42003f, 0.14731f, + 0.13592f, -0.04883f, 0.39186f, -0.13655f, -0.43994f, 1.82759f, -0.25601f, + -0.15018f, 0.51920f, -1.56070f, 0.31683f, -0.79367f, -0.02904f, 1.28637f, + -1.15203f, 0.26627f, 0.42828f, -0.24258f, 0.38647f, -0.83352f, 0.32553f, + 2.09522f, -0.26822f, -0.42191f, 0.32825f, -1.30748f, 1.50551f, -0.52669f, + 0.20045f, 1.69318f, -1.47839f, 0.30802f, -0.07290f, -0.28106f, 0.68192f, + -0.15522f, 1.12579f, 2.21921f, 0.09720f, -0.50265f, 0.83165f, -1.31721f, + 0.72422f, -1.24952f, 0.61653f, 2.04117f, -1.42406f, 0.52568f, -0.46180f, + -0.00873f, }; static const float av1_tx_type_nn_bias_16x8_ver_layer1[4] = { - 0.68774f, - 0.88572f, - 0.77462f, - 3.05667f, + 3.34981f, + 3.74710f, + 1.38339f, + 0.45176f, }; static const NN_CONFIG av1_tx_type_nnconfig_16x8_ver = { @@ -609,14 +664,9 @@ static const NN_CONFIG av1_tx_type_nnconfig_16x8_ver = { { 16, }, // num_hidden_nodes - { - av1_tx_type_nn_weights_16x8_ver_layer0, - av1_tx_type_nn_weights_16x8_ver_layer1, - }, - { - av1_tx_type_nn_bias_16x8_ver_layer0, - av1_tx_type_nn_bias_16x8_ver_layer1, - }, + { av1_tx_type_nn_weights_16x8_ver_layer0, + av1_tx_type_nn_weights_16x8_ver_layer1 }, + { av1_tx_type_nn_bias_16x8_ver_layer0, av1_tx_type_nn_bias_16x8_ver_layer1 } }; /******************************************************************************/ @@ -687,445 +737,253 @@ static const NN_CONFIG av1_tx_type_nnconfig_16x16 = { }; /******************************************************************************/ -// Tx type model for 16x32 block. -static const float av1_tx_type_nn_weights_16x32_hor_layer0[128] = { - 0.89821f, 0.90804f, 1.13052f, 0.74855f, 1.02053f, 0.91260f, 0.97102f, - 0.16808f, -0.19982f, -0.33296f, -0.22490f, -0.22481f, -0.09332f, -2.44338f, - -0.12236f, -0.03158f, -1.43561f, 0.07794f, 0.16586f, 0.09731f, 0.12967f, - 0.09725f, -0.16826f, 1.26640f, 0.88004f, 0.27312f, -0.07993f, 0.33640f, - 0.11732f, 0.33384f, 0.97066f, -0.61744f, -0.48545f, 0.44622f, 0.73744f, - 0.32262f, -0.05713f, 0.42280f, 1.10378f, 0.18540f, -2.07906f, 0.11443f, - 0.37877f, 0.24136f, -0.12524f, -0.12434f, 0.02116f, 0.11716f, 1.28267f, - 1.01508f, 1.26184f, 1.22545f, 1.29582f, 1.18855f, 1.27564f, 0.42001f, - -0.41481f, 0.06725f, -0.13133f, -0.24801f, 0.16515f, 0.16228f, 0.35197f, - 0.53610f, -0.39805f, -0.32584f, 0.40096f, 0.38621f, -0.00030f, -0.23434f, - 0.29149f, -0.76542f, 0.04996f, -0.30036f, 1.48687f, 0.90852f, -0.03083f, - -0.15953f, 1.19259f, 0.87690f, -1.08977f, 0.78757f, 0.81149f, 0.54089f, - 0.35400f, 0.37919f, 0.84997f, -0.20449f, 0.39601f, -0.37596f, 0.64748f, - 0.26021f, 0.37354f, 0.23593f, 0.16335f, 1.70681f, 0.31800f, -0.00964f, - 0.82687f, -0.78372f, -1.47438f, 0.32410f, 1.37436f, 0.07476f, -0.40574f, - -0.10353f, -0.79300f, 0.74381f, -0.15601f, -0.14380f, -0.20961f, -0.52697f, - 0.04669f, -0.00870f, 0.05624f, -0.09036f, 0.25701f, 0.30336f, 0.24199f, - 0.45579f, 0.66330f, -1.81834f, 0.74965f, 1.22747f, 0.25072f, 0.25100f, - 0.43289f, -0.00362f, -}; - -static const float av1_tx_type_nn_bias_16x32_hor_layer0[16] = { - -0.87643f, 0.36754f, -0.86409f, 1.37761f, 1.22688f, 0.09074f, - -1.47139f, -1.06100f, -0.24087f, 1.10382f, -0.32837f, -1.39592f, - -0.14741f, -0.43954f, 1.72137f, -0.21704f, -}; - -static const float av1_tx_type_nn_weights_16x32_hor_layer1[64] = { - -0.81860f, -0.80745f, -0.43612f, 0.58656f, 0.37455f, -0.56519f, -1.71536f, - 0.23278f, 0.23951f, 1.09610f, 0.49986f, 0.43375f, -0.53182f, 0.17376f, - 1.05626f, -0.61743f, -0.71777f, -0.66943f, 1.40091f, 0.34426f, 1.14295f, - 0.45571f, -0.52504f, -0.00303f, 0.06044f, 0.66119f, -0.60340f, -1.14344f, - -0.28045f, 0.12742f, 0.61484f, -0.41016f, 1.36102f, -0.86969f, -0.52728f, - 1.01725f, 0.67083f, -0.10138f, 1.36406f, 0.34066f, 0.12498f, 0.86595f, - -0.39636f, -0.27888f, -0.40244f, 0.09847f, 0.81178f, -0.45313f, 1.39127f, - 0.99865f, -0.57908f, 0.55072f, 0.49638f, 1.11524f, 1.85504f, -0.28316f, - -0.05195f, -0.23284f, 0.26461f, -1.28120f, 0.60707f, -0.06110f, 0.74085f, - 0.63304f, -}; - -static const float av1_tx_type_nn_bias_16x32_hor_layer1[4] = { - 0.71765f, - 1.40400f, - 0.32221f, - 3.07234f, -}; - -static const NN_CONFIG av1_tx_type_nnconfig_16x32_hor = { +// Tx type model for 4x16 block. +static const float av1_tx_type_nn_weights_4x16_hor_layer0[32] = { + 0.36539f, 0.25667f, 0.01491f, -0.21959f, 2.55105f, 0.17615f, 1.79884f, + 1.65936f, -0.44363f, 0.00706f, -0.68004f, -0.64360f, 1.75760f, 1.91906f, + 1.47682f, 0.09650f, -3.59244f, -0.35004f, 0.93295f, 0.25806f, -0.08154f, + 0.79332f, 0.79535f, 1.09467f, 1.57855f, -0.51359f, 0.90553f, -1.67744f, + -1.74563f, -0.88830f, -1.77603f, 2.15935f, +}; + +static const float av1_tx_type_nn_bias_4x16_hor_layer0[8] = { + -0.36435f, -2.22731f, -0.00837f, -1.34546f, + 0.62806f, -0.20675f, 4.91940f, -0.56079f, +}; + +static const float av1_tx_type_nn_weights_4x16_hor_layer1[32] = { + -0.57191f, -1.46418f, 0.67331f, -1.15027f, 0.46288f, 0.81251f, 2.51768f, + -0.27147f, 0.00761f, -2.15214f, -0.69650f, -0.50808f, 0.92832f, 0.45668f, + 2.34201f, -0.52941f, 0.51008f, -1.55496f, -0.01371f, -0.12356f, 0.66624f, + 0.88043f, 2.64862f, -1.28024f, -0.17578f, -1.80034f, -0.32217f, 0.89519f, + 1.28413f, -0.30326f, 2.45329f, -0.83335f, +}; + +static const float av1_tx_type_nn_bias_4x16_hor_layer1[4] = { + 2.33198f, + 3.36245f, + 1.62603f, + 2.91056f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_4x16_hor = { + 4, // num_inputs + 4, // num_outputs + 1, // num_hidden_layers + { + 8, + }, // num_hidden_nodes + { av1_tx_type_nn_weights_4x16_hor_layer0, + av1_tx_type_nn_weights_4x16_hor_layer1 }, + { av1_tx_type_nn_bias_4x16_hor_layer0, av1_tx_type_nn_bias_4x16_hor_layer1 } +}; + +static const float av1_tx_type_nn_weights_4x16_ver_layer0[128] = { + 1.61392f, 1.41239f, 1.47646f, 1.47325f, 1.46110f, 1.49208f, 1.49414f, + 0.12835f, -0.76986f, 0.07087f, -0.24572f, -0.93168f, 3.07935f, -0.18183f, + -0.09831f, -0.07703f, -0.03222f, -0.25473f, -0.06090f, 2.93713f, -0.38711f, + -0.12884f, -0.18329f, -0.06262f, -0.00327f, -0.02930f, -0.01641f, -0.00622f, + -0.03305f, -4.07069f, -2.76643f, 0.04413f, -1.03176f, -0.19217f, -0.44980f, + -2.48615f, -2.58112f, -0.87695f, 0.16187f, -0.04891f, -0.06854f, 1.08104f, + 0.75245f, 1.49302f, 0.63363f, 1.45715f, 0.92574f, 1.72029f, 0.33326f, + 3.86646f, 0.04422f, 0.41019f, 0.36212f, 0.56600f, -1.01552f, 0.05128f, + 0.40454f, -1.05100f, -0.47461f, -1.33168f, -0.46145f, -1.36870f, -0.88838f, + -1.05358f, -0.18537f, -0.34357f, -0.03698f, 0.68905f, 0.41010f, 0.31223f, + -0.43382f, -0.74715f, 2.03366f, -0.30419f, 0.45747f, 0.09526f, 0.31678f, + 0.22915f, 0.21832f, 1.26385f, -0.06814f, -0.71417f, -1.18947f, 0.03762f, + 0.10936f, 2.97396f, -0.42638f, -0.03123f, -5.49756f, -0.17029f, -0.11323f, + 0.05173f, -0.44274f, -0.15738f, 0.11311f, 0.43872f, 0.16837f, -0.52849f, + 2.90050f, -0.54735f, -0.29591f, 1.24030f, 0.21696f, -0.04443f, -1.60877f, + -1.36365f, -1.27432f, -1.52060f, -1.34397f, -1.13371f, -1.87554f, 0.80123f, + 0.42820f, -0.14157f, -2.73963f, -0.68040f, -0.35236f, 0.14490f, 2.23477f, + 0.01370f, -0.20426f, -1.51411f, -0.72293f, 0.64516f, 0.97638f, 0.32616f, + -0.27975f, -0.01149f, +}; + +static const float av1_tx_type_nn_bias_4x16_ver_layer0[16] = { + -1.37863f, -0.05763f, -0.07041f, 0.15306f, 0.96026f, -1.42105f, + -0.55822f, 1.04845f, -0.17662f, -1.25345f, -0.11927f, 0.49845f, + -0.32530f, 0.73483f, 0.08322f, -0.23890f, +}; + +static const float av1_tx_type_nn_weights_4x16_ver_layer1[64] = { + 0.27194f, 0.50607f, 0.49229f, -0.48192f, 0.15667f, -1.38891f, 0.38102f, + -0.58825f, -0.07337f, -0.52909f, 0.36975f, 0.28710f, 0.34992f, -0.73630f, + 0.30386f, -0.58822f, 0.36127f, 0.57950f, 0.55878f, -0.42796f, 0.19967f, + -1.45517f, 0.42529f, -0.54630f, -0.38169f, -0.84899f, 0.41622f, 0.46935f, + 0.39077f, -0.75448f, 0.31698f, -0.76187f, 0.97765f, 0.57052f, 0.55825f, + -0.54273f, 0.20466f, -1.46347f, 0.41813f, -0.55019f, -0.19948f, -0.57982f, + 0.41206f, 0.32373f, 0.38537f, -1.11657f, 0.32887f, -0.76911f, 1.12259f, + 0.72163f, 0.82603f, 0.37786f, 0.34976f, -1.86642f, 0.59961f, -0.16329f, + -0.36631f, -0.56814f, 0.60410f, 0.53158f, 0.56389f, -0.70508f, 0.51009f, + -0.56513f, +}; + +static const float av1_tx_type_nn_bias_4x16_ver_layer1[4] = { + 4.60896f, + 4.53551f, + 4.53124f, + 4.27435f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_4x16_ver = { 8, // num_inputs 4, // num_outputs 1, // num_hidden_layers { 16, }, // num_hidden_nodes - { - av1_tx_type_nn_weights_16x32_hor_layer0, - av1_tx_type_nn_weights_16x32_hor_layer1, - }, - { - av1_tx_type_nn_bias_16x32_hor_layer0, - av1_tx_type_nn_bias_16x32_hor_layer1, - }, + { av1_tx_type_nn_weights_4x16_ver_layer0, + av1_tx_type_nn_weights_4x16_ver_layer1 }, + { av1_tx_type_nn_bias_4x16_ver_layer0, av1_tx_type_nn_bias_4x16_ver_layer1 } }; +/******************************************************************************/ -static const float av1_tx_type_nn_weights_16x32_ver_layer0[512] = { - -0.01219f, 0.51494f, 0.25450f, 0.45788f, -0.87277f, 0.32954f, -0.04851f, - -0.24321f, -0.40000f, 0.21915f, 0.14108f, 0.98268f, 0.18989f, 0.54298f, - 0.36349f, 0.38931f, 1.08124f, 0.87199f, 1.03553f, 1.14777f, 1.04254f, - 1.11336f, 0.92198f, 0.84715f, 1.89363f, 1.21587f, 0.72377f, 1.25097f, - 0.84231f, 0.95529f, 1.12346f, 0.19113f, -0.04559f, 0.56859f, 0.59747f, - 0.60176f, 0.82465f, 0.59009f, 0.67240f, 1.58674f, -0.92951f, -0.23449f, - 0.11923f, -0.19151f, -0.15914f, 0.03146f, -0.16541f, 0.17181f, -0.21834f, - 0.21906f, 0.96708f, 0.36085f, -0.42380f, -2.25681f, -0.48812f, 0.72875f, - 0.06585f, 0.18818f, -0.02109f, -0.10996f, 0.00187f, -0.02078f, 0.04484f, - -0.07171f, 0.94773f, -0.33466f, 0.28484f, 0.14791f, 0.30274f, 0.13377f, - 0.40970f, 0.45133f, 1.69265f, -0.36422f, -0.15889f, 0.07670f, 0.44675f, - -0.28665f, -0.07097f, 1.03803f, -0.83274f, -0.24571f, 0.08039f, -0.23790f, - -0.23276f, -0.28031f, 0.26451f, -0.18513f, -2.23336f, -0.62073f, 0.32495f, - -0.67644f, -0.08559f, -0.36382f, -0.24515f, -0.01899f, 0.09012f, 0.19723f, - 0.04017f, 0.31624f, 0.58369f, 0.30411f, -0.81165f, -2.58541f, -0.20491f, - 0.68089f, -0.14799f, 0.13925f, 0.12867f, 0.15229f, 0.06887f, -0.03784f, - 0.02288f, -0.28712f, 0.14107f, 0.29485f, -0.11662f, 0.25239f, 0.30311f, - -0.07377f, -0.10962f, 0.59856f, 0.47967f, 0.01847f, -0.27889f, 0.46786f, - 0.18118f, 0.09355f, -2.10076f, 0.38823f, 0.28202f, 0.29104f, 0.86977f, - 0.52377f, 0.21161f, 0.72888f, -0.00952f, 0.15982f, -0.14651f, 0.28763f, - -0.14155f, 0.00093f, 0.08351f, 0.34685f, -0.22066f, 0.20378f, 0.25416f, - 0.03423f, -0.11068f, -0.41612f, 0.56913f, -0.06697f, -0.12585f, -0.21033f, - -0.14513f, -0.04477f, -0.35778f, 0.03437f, 0.06956f, -0.25356f, -1.46010f, - -0.08142f, 0.11926f, -0.63551f, -0.13882f, 0.34164f, 0.10821f, 1.07323f, - -0.62435f, -0.27116f, 0.25971f, 0.11952f, -0.39480f, -0.05474f, -0.12582f, - 0.28289f, 0.13723f, 0.58369f, 0.41865f, 0.28574f, 1.01357f, 0.46661f, - 0.61717f, 0.85708f, -0.03930f, -0.38013f, -0.33888f, -0.20561f, -0.19087f, - -0.01041f, 0.12119f, -0.20786f, 0.55915f, 0.67511f, 0.55554f, 0.56540f, - 0.76647f, 0.54766f, 0.45166f, 0.61384f, 0.95407f, -0.06811f, -0.62132f, - 0.12713f, 0.63713f, 2.04090f, 1.17054f, 0.00469f, -0.93692f, -0.24136f, - -0.04281f, -0.15787f, 0.37956f, -0.09174f, -0.72494f, 0.55285f, -1.40996f, - -0.54077f, 0.38445f, -0.08258f, 0.64259f, -0.54058f, -0.49865f, 1.41371f, - 0.89014f, 0.78788f, 0.37919f, 0.87447f, -0.00760f, -0.00947f, 0.16323f, - -0.36632f, -1.38115f, -0.24619f, 0.40490f, -0.08871f, -0.25365f, -0.60842f, - 0.11128f, 0.18658f, -0.86001f, -0.28271f, 0.39572f, -0.29930f, -0.10110f, - 0.33706f, 0.21731f, 0.15383f, -0.01707f, 0.02812f, 0.31192f, 0.39742f, - 0.38260f, -0.48263f, 0.57385f, 0.53239f, -0.60013f, -0.63211f, -0.45140f, - -0.73520f, -0.95260f, -0.70633f, -0.96190f, 0.01747f, -0.05195f, -0.07138f, - -1.09535f, -0.63548f, -1.55700f, -0.35721f, -0.18923f, 0.77568f, 0.09419f, - 0.36919f, -0.32761f, -0.06597f, -0.38988f, -0.43674f, -0.24284f, 0.36906f, - 0.28414f, 0.19273f, -0.68516f, 0.09514f, -0.45381f, 0.19917f, -0.32377f, - 1.32549f, 0.08244f, -0.64405f, 0.13195f, 2.85307f, 0.47631f, -0.33408f, - 0.04168f, 0.18585f, -0.18029f, 0.07986f, -0.08816f, -0.00703f, -0.01515f, - -0.13164f, 0.00571f, 0.05676f, 1.51425f, 0.73360f, 0.43486f, -0.08223f, - -0.06183f, -0.57098f, -0.29948f, 0.05945f, 0.19238f, -0.47980f, -0.35902f, - -0.19931f, 0.43443f, 0.67436f, 0.78573f, 0.25703f, 1.01863f, 0.99047f, - 0.95228f, 1.02429f, 1.19264f, 0.29935f, -0.26583f, -0.98749f, -0.46167f, - -0.29727f, -0.10515f, -0.39790f, -0.59321f, -0.61925f, -0.95452f, 0.04292f, - -0.48273f, -0.91195f, -0.45971f, -0.46355f, -0.88319f, -0.51712f, -0.47682f, - -0.86110f, -0.59178f, -0.57163f, -0.94946f, 0.19627f, -0.18699f, 0.11037f, - 1.39110f, 0.05715f, 3.00762f, 1.52243f, 0.25028f, 0.12779f, -0.12871f, - 0.04764f, 0.08288f, -0.16572f, -0.06580f, 0.05845f, -0.01474f, 0.04886f, - -0.10000f, 0.12911f, -0.01416f, -0.12472f, 0.14358f, 0.16554f, 0.08853f, - 0.13418f, -0.05408f, -0.13871f, -0.00049f, 0.20725f, -0.05603f, 0.27885f, - -0.14277f, 0.29653f, -0.24739f, 0.10101f, -0.17068f, -2.43802f, 0.41834f, - 0.49784f, 0.34949f, 0.98487f, 0.16792f, 1.07355f, 0.32546f, 1.32377f, - -0.08584f, 0.85214f, -0.05721f, 0.90307f, 0.20167f, 0.52664f, -0.14478f, - 0.64997f, 0.06846f, 0.32475f, 0.64453f, 0.70143f, -0.03091f, -0.24958f, - -0.39021f, -0.57693f, -0.18319f, 0.11793f, -0.05948f, 0.36670f, -0.27932f, - 0.14800f, -0.55459f, -0.89673f, 0.65922f, 0.54308f, -0.16731f, -0.59731f, - -0.20705f, -0.18183f, -0.05645f, -0.06829f, -0.40210f, -0.27955f, 0.28054f, - 0.57665f, 0.14171f, 0.54693f, -0.22144f, -0.59664f, 0.13295f, 0.07057f, - -0.19698f, 0.03328f, -0.09687f, -0.32390f, -0.11506f, -0.40406f, -0.11473f, - 0.10399f, -0.29824f, 0.16028f, 0.00053f, 0.22699f, 0.04203f, -0.43880f, - -0.12654f, 0.12172f, 0.21087f, -0.46350f, -0.22081f, -0.06173f, -0.23287f, - 0.90314f, 0.04466f, -0.06149f, 0.32682f, 0.16609f, -0.58991f, -0.03786f, - -0.41329f, 0.02632f, 0.23411f, 0.25344f, 0.16468f, 0.31007f, 0.21845f, - 0.32462f, 0.33945f, 0.11527f, -0.35926f, -0.18584f, 0.29340f, 0.78199f, - 2.39287f, 0.53838f, -1.55085f, 0.02238f, -0.26153f, -0.42498f, -0.02460f, - 0.19261f, -0.10870f, -0.08453f, -0.39561f, 0.08600f, 0.36310f, 0.58439f, - -0.59526f, 0.13104f, -0.06703f, -0.17529f, -0.41431f, -0.23121f, -0.32394f, - -0.33324f, -0.21405f, -0.41702f, -0.29236f, -0.31766f, -0.33512f, -0.22679f, - -0.13680f, -0.00118f, -1.81744f, -2.34798f, -1.08048f, -0.29883f, -0.29123f, - -0.01752f, -}; - -static const float av1_tx_type_nn_bias_16x32_ver_layer0[32] = { - 1.02458f, -1.02185f, -0.18978f, 0.05981f, -0.94931f, 0.34544f, 0.04415f, - -0.60036f, -0.11368f, -0.14154f, 1.23438f, 0.51640f, -0.57587f, -0.91380f, - 0.95720f, 0.68298f, -0.06353f, -2.14960f, -0.11080f, 0.79380f, -0.94199f, - 0.43040f, 0.01358f, 0.07201f, -0.49689f, -0.14839f, -0.80132f, -0.13925f, - -0.11834f, -0.24998f, -0.33976f, 0.35497f, -}; - -static const float av1_tx_type_nn_weights_16x32_ver_layer1[128] = { - 0.87367f, -1.06469f, -0.50829f, -0.70540f, 1.14596f, -1.12346f, -0.94467f, - 0.01380f, -0.18911f, 0.07961f, -0.18626f, 0.61902f, -0.64423f, 1.21545f, - 1.01149f, 0.26309f, 1.50380f, 1.93940f, -0.64064f, 1.03987f, -1.88000f, - -0.44574f, -1.53303f, 1.36307f, 1.00292f, 0.37031f, 0.21594f, 0.16758f, - 0.02592f, -0.77431f, -0.31797f, -1.53826f, 1.14013f, -1.21957f, 0.04571f, - -0.22168f, 0.32299f, 0.25949f, -0.13306f, 0.17850f, 0.92494f, 0.19999f, - 0.07494f, -0.03362f, -0.53453f, 1.02970f, -0.22947f, 0.73964f, 1.08445f, - 0.16855f, -0.02686f, 0.25254f, 0.05952f, 0.02194f, 0.05649f, 0.39195f, - 0.14139f, 0.53843f, -0.06959f, -0.06993f, -0.14151f, -0.53147f, 0.17481f, - -1.21977f, 0.62932f, 1.07173f, 0.24049f, -0.51574f, 0.97492f, -0.28169f, - -0.15406f, -0.05441f, -0.25415f, 0.16583f, 0.43674f, -0.00593f, -0.09277f, - 0.61402f, 1.35562f, -0.03926f, 0.18967f, -0.29548f, -0.55509f, 0.23661f, - 0.05023f, 0.36226f, -0.83314f, 0.39357f, 0.19943f, -0.63431f, -0.03847f, - 0.12213f, 0.62024f, -0.11704f, -0.22483f, 0.96624f, 0.18518f, 0.09181f, - -0.63068f, 0.66797f, 0.74107f, 0.40624f, 0.70636f, -0.06921f, 0.34175f, - -0.15513f, 2.07844f, 0.22126f, 0.52919f, 0.26793f, -0.50018f, 1.10549f, - 0.10970f, 0.05831f, 0.82842f, -1.22975f, 1.78377f, 0.92679f, 2.01480f, - -1.19011f, -0.53381f, 0.38533f, 0.45579f, -0.10683f, -0.40828f, 0.31398f, - 0.14978f, 0.91325f, -}; - -static const float av1_tx_type_nn_bias_16x32_ver_layer1[4] = { - 1.03659f, - 1.80249f, - 1.25710f, - 1.32000f, -}; - -static const NN_CONFIG av1_tx_type_nnconfig_16x32_ver = { - 16, // num_inputs - 4, // num_outputs - 1, // num_hidden_layers +// Tx type model for 16x4 block. +static const float av1_tx_type_nn_weights_16x4_hor_layer0[128] = { + 1.45347f, -0.15743f, 0.44236f, 0.25808f, 0.33944f, 0.38678f, 0.24428f, + 1.67287f, 0.09539f, -0.42940f, -0.31507f, -0.00154f, -2.98755f, -2.27744f, + -0.49183f, 0.09333f, -0.99026f, -0.22157f, 0.53701f, 0.60447f, 0.15686f, + -0.04646f, 0.26341f, 2.12361f, 0.27090f, -1.14716f, -0.64146f, -0.91604f, + -0.75335f, -0.60056f, -1.25084f, 1.68473f, -3.24075f, -4.03867f, -2.07877f, + -0.02347f, 0.00333f, -0.01259f, -0.00465f, 0.02526f, 0.36286f, -0.10324f, + 2.12780f, -0.74584f, -1.05052f, 1.78467f, -0.55065f, -0.03326f, 2.46781f, + 1.18349f, 0.96015f, 1.01696f, 1.10584f, 1.07263f, 1.11531f, -1.06413f, + 0.32389f, -1.87360f, -0.14435f, 1.77926f, 1.09966f, -0.12680f, -0.61386f, + -0.09724f, -0.33095f, 1.12122f, 1.00791f, 1.52416f, 1.35004f, 1.32657f, + 0.60950f, -1.13538f, -0.38654f, 0.06473f, 2.10669f, 0.27734f, -0.38359f, + -1.91455f, -1.22676f, 0.05786f, 0.97432f, 2.19967f, 0.50457f, 0.78976f, + 0.95183f, -0.32414f, 0.49437f, -0.04506f, 0.18993f, -0.07971f, 0.23889f, + -0.09872f, -0.66036f, 0.05377f, 2.69638f, -0.08259f, -0.69210f, -1.08296f, + -1.96504f, -2.31947f, -0.80161f, -0.80456f, -1.35556f, -0.05323f, -4.42658f, + -0.30732f, -0.12043f, 0.11126f, 0.10771f, -0.14956f, -0.02218f, 0.41016f, + 1.16599f, 1.14629f, 1.12881f, 1.18676f, 1.24677f, 1.28695f, 1.11270f, + 0.08233f, 1.75440f, 0.49228f, -0.34858f, -0.17032f, 0.29288f, 0.47175f, + 0.19055f, -1.56413f, +}; + +static const float av1_tx_type_nn_bias_16x4_hor_layer0[16] = { + -1.71227f, 0.47291f, -0.97536f, -0.66216f, 0.11729f, -0.21451f, + 2.75281f, 0.04318f, 2.03965f, 0.14618f, -0.70483f, -0.24517f, + 1.14048f, 0.33308f, -1.10886f, 0.41184f, +}; + +static const float av1_tx_type_nn_weights_16x4_hor_layer1[64] = { + -1.17079f, 0.19096f, -1.05753f, -0.30803f, -1.21680f, -0.67255f, 1.60115f, + 0.05972f, 1.44759f, -0.04068f, -0.26331f, 0.31400f, 0.96923f, 0.33443f, + -0.77215f, -0.91316f, -1.78928f, 0.21483f, -1.24008f, -0.46190f, -0.12127f, + -0.62144f, 1.37593f, 0.08373f, 1.56215f, 0.00279f, -0.14556f, 0.38710f, + 0.96228f, 0.66433f, -0.51798f, -0.80738f, -0.18539f, 0.19377f, -1.03090f, + -1.51044f, -0.59485f, -0.62589f, 1.90742f, 0.09078f, 1.49113f, 0.00205f, + -0.15918f, 0.40827f, 1.08553f, 0.43431f, 0.33519f, -1.12669f, -1.10274f, + 0.80004f, -1.83599f, -0.53134f, 2.00515f, -0.32670f, 1.37124f, 0.51136f, + 1.62563f, 0.24787f, 0.31757f, 0.81751f, 1.57262f, 0.83214f, 1.04661f, + -0.43819f, +}; + +static const float av1_tx_type_nn_bias_16x4_hor_layer1[4] = { + 2.32575f, + 2.75703f, + 1.12304f, + 2.15567f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_16x4_hor = { + 8, // num_inputs + 4, // num_outputs + 1, // num_hidden_layers { - 32, + 16, }, // num_hidden_nodes - { - av1_tx_type_nn_weights_16x32_ver_layer0, - av1_tx_type_nn_weights_16x32_ver_layer1, - }, - { - av1_tx_type_nn_bias_16x32_ver_layer0, - av1_tx_type_nn_bias_16x32_ver_layer1, - }, + { av1_tx_type_nn_weights_16x4_hor_layer0, + av1_tx_type_nn_weights_16x4_hor_layer1 }, + { av1_tx_type_nn_bias_16x4_hor_layer0, av1_tx_type_nn_bias_16x4_hor_layer1 } }; -/******************************************************************************/ -// Tx type model for 32x16 block. -static const float av1_tx_type_nn_weights_32x16_hor_layer0[512] = { - -0.07289f, 0.30798f, 0.41881f, 0.33434f, -0.01599f, 0.85307f, -0.16060f, - -0.07922f, -0.04693f, 0.29186f, 0.44117f, 1.02417f, 0.12447f, 0.46321f, - 0.40060f, 0.50140f, 0.48338f, 0.47298f, 0.36585f, 0.42821f, 0.41289f, - 0.47534f, 0.42900f, 0.26061f, 0.45887f, 0.38163f, 0.17302f, 1.00888f, - 1.79910f, 1.36140f, 0.24471f, 0.04557f, 1.10823f, 0.74325f, 0.91210f, - 0.81387f, 0.98865f, -0.09874f, 0.55146f, 0.19385f, -0.50752f, -0.17249f, - 0.27261f, -0.02763f, -0.03286f, 0.09122f, 0.07015f, 0.20012f, 0.68983f, - -1.25345f, -0.00145f, 0.71567f, 0.54948f, -0.56154f, -0.28918f, 0.11997f, - -0.09907f, 0.09195f, 0.05768f, 0.15558f, 0.11284f, -0.35195f, -0.08723f, - -0.03571f, 0.94031f, 0.63737f, 0.98202f, 0.93826f, 0.87126f, 0.88530f, - 0.97697f, 0.55283f, 0.58670f, 0.86502f, 0.97008f, 0.99709f, 0.66214f, - 0.96660f, 0.99890f, 0.31945f, -1.00301f, 0.13215f, -0.03950f, 0.21148f, - 0.05128f, 0.10955f, 0.44839f, -0.33438f, -2.09773f, 0.13908f, 0.58669f, - 0.25268f, -0.24006f, 0.01286f, -0.05732f, 0.03401f, -0.06896f, 0.35397f, - 0.05133f, -0.21449f, -0.38437f, -0.32326f, -0.38731f, -0.44419f, 0.25968f, - -0.29422f, -0.12553f, -0.08896f, -0.16400f, -0.22309f, 0.21380f, -0.26912f, - 0.06866f, -0.25694f, 0.17632f, 0.32032f, -0.10666f, 0.26278f, 0.31877f, - -0.09338f, -0.14289f, 0.54232f, 0.46070f, 0.00059f, -0.27914f, 0.45177f, - 0.16274f, -0.08811f, -0.45791f, 0.53946f, -0.16794f, 0.16229f, 0.11840f, - -0.24435f, 0.26894f, -0.33180f, -0.47314f, 0.34061f, -0.13939f, 0.13321f, - -0.05208f, -0.18139f, -0.35234f, 1.37298f, -0.19360f, 0.21728f, 0.26088f, - 0.04045f, -0.10763f, -0.40470f, 0.50026f, -0.06726f, -0.12871f, -0.20963f, - -0.14583f, -0.04711f, -0.35988f, 0.03091f, 0.06491f, -0.31668f, -0.52190f, - 0.23397f, -0.13984f, -0.15207f, -0.49977f, 0.51205f, 0.12559f, -0.03631f, - 0.33447f, -0.36684f, 0.17533f, 0.15671f, -0.00096f, 0.06817f, 0.20922f, - 0.34006f, 0.71260f, 0.45024f, 0.53033f, 0.15645f, 0.76019f, 0.56870f, - 0.83066f, 0.63022f, 1.74436f, -0.24798f, 0.06795f, -0.00749f, 0.17795f, - 0.10371f, 0.06527f, 0.41054f, 0.49003f, 0.34630f, 0.02615f, 0.30320f, - -0.47133f, -0.49584f, 0.21775f, 0.27530f, -0.29977f, -0.64269f, 0.52627f, - -0.02492f, 0.08077f, 0.40786f, -0.36015f, -0.70714f, -1.98185f, -0.28187f, - 0.35018f, -0.06105f, -0.12710f, 0.06606f, -0.27805f, 0.44630f, -0.84731f, - -0.26699f, 0.25856f, 0.06194f, -0.18674f, -0.11560f, -0.43277f, 1.10579f, - 0.95876f, 0.17415f, 0.56386f, 0.68426f, 0.50180f, 0.24844f, 0.12347f, - 0.15281f, -0.19089f, 0.52279f, 0.41860f, -0.05270f, -0.17029f, -0.03542f, - 0.10621f, -0.25088f, 0.24070f, -0.08951f, 0.29950f, -0.36720f, 0.02151f, - 0.20129f, -0.70066f, -0.23144f, -0.20070f, -0.39262f, -0.01597f, -0.05591f, - 0.23814f, -0.25991f, 0.05812f, 0.60554f, -0.06106f, -0.58326f, 0.28762f, - -0.18747f, 0.08232f, -0.04243f, -0.03293f, 0.14722f, -0.13017f, -0.67263f, - 0.38698f, -0.18207f, -0.11496f, -0.27976f, -0.55345f, 1.42872f, 0.04684f, - 0.04214f, 0.00030f, 0.02410f, 0.19966f, -0.04246f, 0.00442f, 0.23121f, - 0.13364f, 0.21548f, -0.12748f, -0.14066f, -0.28354f, 0.59937f, -0.27553f, - 1.57503f, -0.01050f, -0.17724f, 0.44110f, -0.80334f, 0.72064f, 1.00501f, - -0.72638f, 0.02774f, 0.48540f, -0.72016f, -0.27721f, 0.31559f, 0.07322f, - 0.20279f, -0.19647f, 0.02352f, 0.12662f, 0.19743f, 0.30543f, 0.25712f, - 0.44702f, 0.16417f, 0.17888f, -2.58469f, 0.20555f, 0.57782f, -0.10892f, - 0.14527f, 0.82251f, 0.04200f, 0.44626f, 0.10818f, 0.71204f, 0.62903f, - 0.69178f, 0.73603f, 0.52717f, 0.83020f, 0.48824f, 1.03270f, -0.00152f, - 0.07958f, 0.24181f, -0.78839f, -0.74214f, -0.72998f, -1.58694f, 0.17735f, - 0.56318f, 0.32580f, -0.58503f, -0.33673f, -0.00838f, 0.48924f, 0.43362f, - 0.12750f, 0.00295f, 0.38624f, 0.17037f, 0.00729f, -0.26256f, -0.41669f, - 0.36847f, 0.22424f, 1.33334f, 0.18112f, 0.37682f, 0.49173f, -0.45240f, - -0.04857f, -0.35038f, -0.83099f, -0.01988f, 0.03497f, 0.38033f, 0.13685f, - 0.17597f, 0.28668f, 0.31193f, -0.43281f, 0.43267f, -0.50495f, 0.01969f, - 0.14131f, -0.09326f, -0.39425f, -0.62048f, -0.09119f, -0.28306f, -0.52671f, - -0.38584f, -0.10953f, 0.19669f, 0.34540f, -0.49941f, 0.04605f, -0.43535f, - 0.27519f, 0.03659f, -0.31961f, 0.13330f, 0.87009f, 0.20101f, -0.70392f, - -0.27883f, 0.33874f, -0.34308f, 0.67760f, 0.88195f, 0.55752f, -0.26563f, - 0.17875f, 0.06964f, 0.87607f, 1.47616f, 0.46747f, -0.56408f, -0.39352f, - -0.16427f, -0.41185f, 0.14187f, 0.19265f, -0.58613f, 0.56345f, -0.17729f, - -0.11320f, 0.08752f, -0.01329f, 1.20981f, 0.45170f, -0.20571f, -0.01150f, - 0.26476f, 0.13508f, 0.22020f, -0.42684f, -0.22499f, -1.51212f, 0.86648f, - 0.21776f, 0.24666f, 0.71339f, 0.42742f, -0.00952f, 0.14762f, 0.07693f, - -0.19599f, 0.03075f, -0.09703f, -0.32483f, -0.11616f, -0.40461f, -0.11693f, - 0.10038f, -0.30038f, 0.14686f, 0.00548f, 0.20350f, 0.00763f, -0.43756f, - -0.01997f, 0.00902f, 0.07470f, -0.41441f, -0.20605f, 0.07626f, -0.34973f, - 0.47455f, -0.15251f, -0.05325f, 0.04964f, 0.32477f, -0.54604f, 0.25273f, - -0.18461f, -0.30841f, 0.64908f, 0.60752f, 0.64148f, 0.72788f, 0.71232f, - 0.58597f, 0.73017f, 0.58857f, 0.71908f, 0.59860f, 0.61849f, 0.99398f, - 0.39572f, -0.36165f, -1.88646f, 0.14384f, -0.60541f, -0.21380f, -0.55498f, - -0.50960f, -0.08801f, 0.51892f, 0.19126f, 0.57879f, 1.19447f, 0.25673f, - -0.21631f, -0.43562f, -0.27497f, -0.02206f, -0.56169f, 0.58952f, -0.60983f, - -0.64088f, -0.69087f, -0.56261f, -0.74089f, -0.65063f, -0.66978f, -0.60836f, - -0.92770f, -0.77182f, -1.61031f, -0.70007f, -0.68402f, -0.42242f, -0.66722f, - -0.14533f, -}; - -static const float av1_tx_type_nn_bias_32x16_hor_layer0[32] = { - 1.53781f, -0.49320f, -0.31646f, 0.02826f, -1.05554f, 0.06559f, -0.12399f, - -0.61671f, -0.28956f, -0.15419f, 0.87189f, -0.43375f, -1.08477f, -0.66006f, - 0.36233f, 0.82678f, -0.92342f, -1.47101f, -0.02937f, -0.16497f, -0.75457f, - 0.50173f, -0.07560f, 0.71598f, 1.50795f, -0.04745f, -0.14008f, -0.18510f, - -0.14988f, -0.67044f, 0.79659f, 0.70610f, -}; - -static const float av1_tx_type_nn_weights_32x16_hor_layer1[128] = { - 0.84983f, -0.62530f, -0.82600f, -0.52563f, -0.11942f, -0.50279f, -0.13425f, - -0.02850f, 0.50767f, 0.10252f, 0.24540f, 0.67748f, -0.43483f, -0.22242f, - 0.23431f, 0.57287f, 0.69560f, 1.13814f, -0.47427f, -0.55858f, -1.47072f, - 0.26587f, -0.36335f, 0.83060f, 1.01645f, -0.52895f, -0.11614f, 0.17390f, - -0.13664f, -0.83098f, -0.07985f, -1.36820f, 0.47759f, -0.55612f, 0.46852f, - 0.07406f, -0.80467f, 0.23059f, 0.09992f, -0.06164f, 0.13541f, 0.06135f, - 0.83605f, -0.53224f, -0.13867f, 0.93838f, -0.61290f, 0.27732f, -0.46688f, - -0.41810f, 0.12885f, 0.13619f, -0.24612f, 0.07215f, 0.98866f, 0.10993f, - 1.05799f, -0.27146f, -0.00079f, -0.08585f, 0.08322f, -0.33809f, 0.67598f, - -1.06515f, 1.28866f, 0.61028f, -0.31704f, -0.59905f, 1.62151f, 0.10969f, - 0.20671f, -0.17818f, 0.14170f, 0.19322f, 0.30602f, 0.93111f, 0.19011f, - -0.45609f, 0.82506f, 0.32936f, -0.07858f, -0.27106f, -0.31638f, 0.23299f, - 0.81491f, 0.32584f, -0.52093f, -0.32472f, 0.53643f, -0.42605f, 0.01641f, - 0.09002f, 0.15832f, -0.08790f, 0.05511f, 1.00730f, 0.46309f, 0.68166f, - -0.18835f, 0.64512f, -1.00540f, 0.86802f, 0.18981f, -0.06982f, -0.24514f, - -0.08027f, 0.61199f, -0.20830f, 0.72001f, 0.17477f, 0.06511f, 0.00801f, - -0.43590f, 0.37257f, 0.70323f, 0.60233f, 1.62541f, 0.74383f, -0.22254f, - -0.33892f, 0.22881f, 0.62817f, 0.68915f, -0.06417f, 0.00969f, 1.65869f, - 0.89060f, 0.75948f, -}; - -static const float av1_tx_type_nn_bias_32x16_hor_layer1[4] = { - 0.95359f, - 1.56043f, - 1.06017f, - 2.54520f, -}; - -static const NN_CONFIG av1_tx_type_nnconfig_32x16_hor = { - 16, // num_inputs - 4, // num_outputs - 1, // num_hidden_layers - { - 32, - }, // num_hidden_nodes - { - av1_tx_type_nn_weights_32x16_hor_layer0, - av1_tx_type_nn_weights_32x16_hor_layer1, - }, - { - av1_tx_type_nn_bias_32x16_hor_layer0, - av1_tx_type_nn_bias_32x16_hor_layer1, - }, +static const float av1_tx_type_nn_weights_16x4_ver_layer0[32] = { + 0.26047f, 0.99930f, 1.16484f, -0.28196f, -2.67483f, -0.21456f, -0.16854f, + 0.46375f, 1.47951f, 1.13735f, 1.12356f, 0.27385f, 0.50978f, 2.09967f, + -1.47386f, 0.01950f, -0.06362f, 0.26014f, 1.04544f, -0.03099f, 0.07478f, + -0.39701f, 0.05545f, 2.73633f, -0.56305f, -0.02208f, -0.44517f, -0.00897f, + -0.17967f, -0.96622f, 0.42635f, -1.04784f, }; -static const float av1_tx_type_nn_weights_32x16_ver_layer0[128] = { - 1.30219f, 1.30548f, 1.33334f, 1.20560f, 1.01572f, 1.38100f, 1.37504f, - 0.12599f, -0.96957f, 0.19400f, 0.75734f, 0.11295f, -0.40447f, -1.53062f, - -0.82980f, 0.02168f, -1.11289f, -0.66861f, -0.83663f, -0.91455f, -0.78618f, - -0.87176f, -1.10711f, 0.71207f, 1.49689f, -0.12715f, 0.29357f, 0.35234f, - 0.61016f, 0.80708f, 0.83564f, 1.05961f, -0.99842f, 0.82004f, 0.02638f, - 0.44606f, 0.32298f, 0.21321f, 0.47290f, -0.71442f, -2.81050f, -0.02520f, - -0.08919f, 0.00369f, -0.05257f, -0.07011f, -0.16394f, 0.06290f, 0.80086f, - 0.32349f, 0.47411f, 1.36126f, 1.68162f, 0.91325f, -0.27495f, 0.00262f, - 0.06025f, 0.42832f, 0.36965f, 0.38063f, 0.32772f, 0.40914f, 0.44510f, - 3.02239f, -1.84077f, 0.49536f, -0.27340f, -0.10437f, -0.34293f, -0.08047f, - -0.29651f, -0.97111f, -0.34187f, 0.52869f, 1.27240f, 1.20306f, 1.19121f, - 1.28742f, 0.26393f, -0.62319f, 0.92285f, -0.08303f, -0.33118f, -0.13053f, - 0.24875f, -0.52089f, 0.44691f, -1.08908f, 1.20921f, 0.36538f, -0.46792f, - -0.18855f, -0.13443f, -0.28472f, -0.10353f, 0.06911f, 0.68519f, 0.08228f, - -0.49027f, -0.34381f, 0.04719f, -0.33298f, 0.72525f, 0.09538f, -0.29216f, - -0.07260f, -0.55827f, 0.54542f, -0.10144f, -0.09292f, -0.14427f, -0.38361f, - -0.41559f, 0.75338f, -0.04530f, 0.27944f, 0.06932f, -0.11537f, 0.29568f, - 1.92155f, -0.98996f, -0.08841f, 0.49386f, 0.15947f, 0.53290f, 1.46747f, - 0.59360f, 0.25468f, -}; - -static const float av1_tx_type_nn_bias_32x16_ver_layer0[16] = { - -1.19673f, 0.33043f, 0.24408f, 0.46221f, 2.00646f, 0.19031f, - -0.64944f, -0.43452f, 1.04400f, 1.47371f, 0.52460f, -1.39577f, - 0.83852f, -0.25536f, 1.33200f, -0.24444f, -}; - -static const float av1_tx_type_nn_weights_32x16_ver_layer1[64] = { - -1.31447f, -0.86455f, 0.85217f, 1.00048f, 0.37395f, -1.35713f, -0.54032f, - 0.82803f, 0.89606f, 1.57696f, 0.68067f, 0.42512f, -0.26250f, 0.14621f, - 0.93249f, -0.77690f, -0.93652f, -0.44488f, 0.68360f, -0.88178f, 1.89111f, - 0.67700f, -0.29310f, 0.91604f, -1.21881f, 1.11188f, 0.45045f, -0.86119f, - -0.09294f, 0.09360f, 0.80794f, 0.41027f, 1.80399f, -0.50362f, -1.44689f, - 0.85148f, 0.90707f, -0.18458f, 0.14165f, 1.17367f, 0.70869f, 1.57147f, - 0.24692f, 0.16626f, 0.56794f, 0.07313f, 0.14728f, -0.74296f, 1.74127f, - 1.26560f, 0.17753f, 1.10194f, 0.56435f, 1.73779f, 1.42841f, -1.16773f, - 0.24584f, 0.10813f, -0.60187f, 0.79802f, 0.75229f, -0.06112f, 1.77282f, - 1.01058f, -}; - -static const float av1_tx_type_nn_bias_32x16_ver_layer1[4] = { - 0.83082f, - 2.03845f, - 0.59627f, - 2.31341f, -}; - -static const NN_CONFIG av1_tx_type_nnconfig_32x16_ver = { - 8, // num_inputs +static const float av1_tx_type_nn_bias_16x4_ver_layer0[8] = { + -0.52088f, 0.52844f, -1.03655f, -0.30974f, + 2.59952f, -1.93604f, 0.00000f, 2.51787f, +}; + +static const float av1_tx_type_nn_weights_16x4_ver_layer1[32] = { + 0.10916f, -0.21219f, -0.51340f, 0.69161f, 1.45988f, -1.36942f, -0.40899f, + 1.05136f, -0.08486f, 0.10008f, -0.55304f, 0.88012f, 1.61177f, -1.64507f, + 0.63428f, 1.15130f, -0.17287f, -0.18592f, -0.01143f, 0.88293f, 1.73326f, + -1.63624f, 0.09359f, 1.18393f, 0.26531f, 0.22378f, 0.15170f, 1.06965f, + 1.26814f, -1.93873f, -0.00768f, 1.58309f, +}; + +static const float av1_tx_type_nn_bias_16x4_ver_layer1[4] = { + 2.34713f, + 1.68667f, + 1.25488f, + 1.69812f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_16x4_ver = { + 4, // num_inputs 4, // num_outputs 1, // num_hidden_layers { - 16, + 8, }, // num_hidden_nodes - { - av1_tx_type_nn_weights_32x16_ver_layer0, - av1_tx_type_nn_weights_32x16_ver_layer1, - }, - { - av1_tx_type_nn_bias_32x16_ver_layer0, - av1_tx_type_nn_bias_32x16_ver_layer1, - }, + { av1_tx_type_nn_weights_16x4_ver_layer0, + av1_tx_type_nn_weights_16x4_ver_layer1 }, + { av1_tx_type_nn_bias_16x4_ver_layer0, av1_tx_type_nn_bias_16x4_ver_layer1 } }; /******************************************************************************/ // Map tx_size to its corresponding neural net model for tx type prediction. static const NN_CONFIG *av1_tx_type_nnconfig_map_hor[] = { - &av1_tx_type_nnconfig_4x4, // 4x4 - &av1_tx_type_nnconfig_8x8, // 8x8 - &av1_tx_type_nnconfig_16x16, // 16x16 - NULL, // 32x32 - NULL, // 64x64 - &av1_tx_type_nnconfig_4x8_hor, // 4x8 - &av1_tx_type_nnconfig_8x4_hor, // 8x4 - &av1_tx_type_nnconfig_8x16_hor, // 8x16 - &av1_tx_type_nnconfig_16x8_hor, // 16x8 - &av1_tx_type_nnconfig_16x32_hor, // 16x32 - &av1_tx_type_nnconfig_32x16_hor, // 32x16 - NULL, // 32x64 - NULL, // 64x32 - NULL, // 4x16 - NULL, // 16x4 - NULL, // 8x32 - NULL, // 32x8 - NULL, // 16x64 - NULL, // 64x16 + &av1_tx_type_nnconfig_4x4_hor, // 4x4 transform + &av1_tx_type_nnconfig_8x8_hor, // 8x8 transform + &av1_tx_type_nnconfig_16x16, // 16x16 transform + NULL, // 32x32 transform + NULL, // 64x64 transform + &av1_tx_type_nnconfig_4x8_hor, // 4x8 transform + &av1_tx_type_nnconfig_8x4_hor, // 8x4 transform + &av1_tx_type_nnconfig_8x16_hor, // 8x16 transform + &av1_tx_type_nnconfig_16x8_hor, // 16x8 transform + NULL, // 16x32 transform + NULL, // 32x16 transform + NULL, // 32x64 transform + NULL, // 64x32 transform + &av1_tx_type_nnconfig_4x16_hor, // 4x16 transform + &av1_tx_type_nnconfig_16x4_hor, // 16x4 transform + NULL, // 8x32 transform + NULL, // 32x8 transform + NULL, // 16x64 transform + NULL, // 64x16 transform }; static const NN_CONFIG *av1_tx_type_nnconfig_map_ver[] = { - &av1_tx_type_nnconfig_4x4, // 4x4 transform - &av1_tx_type_nnconfig_8x8, // 8x8 transform - &av1_tx_type_nnconfig_16x16, // 16x16 transform - NULL, // 32x32 transform - NULL, // 64x64 transform - &av1_tx_type_nnconfig_4x8_ver, // 4x8 transform - &av1_tx_type_nnconfig_8x4_ver, // 8x4 transform - &av1_tx_type_nnconfig_8x16_ver, // 8x16 transform - &av1_tx_type_nnconfig_16x8_ver, // 16x8 transform - &av1_tx_type_nnconfig_16x32_ver, // 16x32 transform - &av1_tx_type_nnconfig_32x16_ver, // 32x16 transform - NULL, // 32x64 transform - NULL, // 64x32 transform - NULL, // 4x16 transform - NULL, // 16x4 transform - NULL, // 8x32 transform - NULL, // 32x8 transform - NULL, // 16x64 transform - NULL, // 64x16 transform + &av1_tx_type_nnconfig_4x4_ver, // 4x4 transform + &av1_tx_type_nnconfig_8x8_ver, // 8x8 transform + &av1_tx_type_nnconfig_16x16, // 16x16 transform + NULL, // 32x32 transform + NULL, // 64x64 transform + &av1_tx_type_nnconfig_4x8_ver, // 4x8 transform + &av1_tx_type_nnconfig_8x4_ver, // 8x4 transform + &av1_tx_type_nnconfig_8x16_ver, // 8x16 transform + &av1_tx_type_nnconfig_16x8_ver, // 16x8 transform + NULL, // 16x32 transform + NULL, // 32x16 transform + NULL, // 32x64 transform + NULL, // 64x32 transform + &av1_tx_type_nnconfig_4x16_ver, // 4x16 transform + &av1_tx_type_nnconfig_16x4_ver, // 16x4 transform + NULL, // 8x32 transform + NULL, // 32x8 transform + NULL, // 16x64 transform + NULL, // 64x16 transform }; // Tx split model for 4x8 block. @@ -2083,4 +1941,4 @@ static const NN_CONFIG *av1_tx_split_nnconfig_map[TX_SIZES_ALL] = { } // extern "C" #endif -#endif // AV1_ENCODER_TX_PRUNE_MODEL_WEIGHTS_H_ +#endif // AOM_AV1_ENCODER_TX_PRUNE_MODEL_WEIGHTS_H_ diff --git a/third_party/aom/av1/encoder/x86/av1_fwd_txfm1d_sse4.c b/third_party/aom/av1/encoder/x86/av1_fwd_txfm1d_sse4.c index c71f2e74c..07615543c 100644 --- a/third_party/aom/av1/encoder/x86/av1_fwd_txfm1d_sse4.c +++ b/third_party/aom/av1/encoder/x86/av1_fwd_txfm1d_sse4.c @@ -395,7 +395,8 @@ void av1_fadst4_new_sse4_1(const __m128i *input, __m128i *output, } void av1_fdct64_new_sse4_1(const __m128i *input, __m128i *output, - int8_t cos_bit) { + int8_t cos_bit, const int instride, + const int outstride) { const int32_t *cospi = cospi_arr(cos_bit); const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); @@ -480,70 +481,70 @@ void av1_fdct64_new_sse4_1(const __m128i *input, __m128i *output, // stage 1 __m128i x1[64]; - x1[0] = _mm_add_epi32(input[0], input[63]); - x1[63] = _mm_sub_epi32(input[0], input[63]); - x1[1] = _mm_add_epi32(input[1], input[62]); - x1[62] = _mm_sub_epi32(input[1], input[62]); - x1[2] = _mm_add_epi32(input[2], input[61]); - x1[61] = _mm_sub_epi32(input[2], input[61]); - x1[3] = _mm_add_epi32(input[3], input[60]); - x1[60] = _mm_sub_epi32(input[3], input[60]); - x1[4] = _mm_add_epi32(input[4], input[59]); - x1[59] = _mm_sub_epi32(input[4], input[59]); - x1[5] = _mm_add_epi32(input[5], input[58]); - x1[58] = _mm_sub_epi32(input[5], input[58]); - x1[6] = _mm_add_epi32(input[6], input[57]); - x1[57] = _mm_sub_epi32(input[6], input[57]); - x1[7] = _mm_add_epi32(input[7], input[56]); - x1[56] = _mm_sub_epi32(input[7], input[56]); - x1[8] = _mm_add_epi32(input[8], input[55]); - x1[55] = _mm_sub_epi32(input[8], input[55]); - x1[9] = _mm_add_epi32(input[9], input[54]); - x1[54] = _mm_sub_epi32(input[9], input[54]); - x1[10] = _mm_add_epi32(input[10], input[53]); - x1[53] = _mm_sub_epi32(input[10], input[53]); - x1[11] = _mm_add_epi32(input[11], input[52]); - x1[52] = _mm_sub_epi32(input[11], input[52]); - x1[12] = _mm_add_epi32(input[12], input[51]); - x1[51] = _mm_sub_epi32(input[12], input[51]); - x1[13] = _mm_add_epi32(input[13], input[50]); - x1[50] = _mm_sub_epi32(input[13], input[50]); - x1[14] = _mm_add_epi32(input[14], input[49]); - x1[49] = _mm_sub_epi32(input[14], input[49]); - x1[15] = _mm_add_epi32(input[15], input[48]); - x1[48] = _mm_sub_epi32(input[15], input[48]); - x1[16] = _mm_add_epi32(input[16], input[47]); - x1[47] = _mm_sub_epi32(input[16], input[47]); - x1[17] = _mm_add_epi32(input[17], input[46]); - x1[46] = _mm_sub_epi32(input[17], input[46]); - x1[18] = _mm_add_epi32(input[18], input[45]); - x1[45] = _mm_sub_epi32(input[18], input[45]); - x1[19] = _mm_add_epi32(input[19], input[44]); - x1[44] = _mm_sub_epi32(input[19], input[44]); - x1[20] = _mm_add_epi32(input[20], input[43]); - x1[43] = _mm_sub_epi32(input[20], input[43]); - x1[21] = _mm_add_epi32(input[21], input[42]); - x1[42] = _mm_sub_epi32(input[21], input[42]); - x1[22] = _mm_add_epi32(input[22], input[41]); - x1[41] = _mm_sub_epi32(input[22], input[41]); - x1[23] = _mm_add_epi32(input[23], input[40]); - x1[40] = _mm_sub_epi32(input[23], input[40]); - x1[24] = _mm_add_epi32(input[24], input[39]); - x1[39] = _mm_sub_epi32(input[24], input[39]); - x1[25] = _mm_add_epi32(input[25], input[38]); - x1[38] = _mm_sub_epi32(input[25], input[38]); - x1[26] = _mm_add_epi32(input[26], input[37]); - x1[37] = _mm_sub_epi32(input[26], input[37]); - x1[27] = _mm_add_epi32(input[27], input[36]); - x1[36] = _mm_sub_epi32(input[27], input[36]); - x1[28] = _mm_add_epi32(input[28], input[35]); - x1[35] = _mm_sub_epi32(input[28], input[35]); - x1[29] = _mm_add_epi32(input[29], input[34]); - x1[34] = _mm_sub_epi32(input[29], input[34]); - x1[30] = _mm_add_epi32(input[30], input[33]); - x1[33] = _mm_sub_epi32(input[30], input[33]); - x1[31] = _mm_add_epi32(input[31], input[32]); - x1[32] = _mm_sub_epi32(input[31], input[32]); + x1[0] = _mm_add_epi32(input[0 * instride], input[63 * instride]); + x1[63] = _mm_sub_epi32(input[0 * instride], input[63 * instride]); + x1[1] = _mm_add_epi32(input[1 * instride], input[62 * instride]); + x1[62] = _mm_sub_epi32(input[1 * instride], input[62 * instride]); + x1[2] = _mm_add_epi32(input[2 * instride], input[61 * instride]); + x1[61] = _mm_sub_epi32(input[2 * instride], input[61 * instride]); + x1[3] = _mm_add_epi32(input[3 * instride], input[60 * instride]); + x1[60] = _mm_sub_epi32(input[3 * instride], input[60 * instride]); + x1[4] = _mm_add_epi32(input[4 * instride], input[59 * instride]); + x1[59] = _mm_sub_epi32(input[4 * instride], input[59 * instride]); + x1[5] = _mm_add_epi32(input[5 * instride], input[58 * instride]); + x1[58] = _mm_sub_epi32(input[5 * instride], input[58 * instride]); + x1[6] = _mm_add_epi32(input[6 * instride], input[57 * instride]); + x1[57] = _mm_sub_epi32(input[6 * instride], input[57 * instride]); + x1[7] = _mm_add_epi32(input[7 * instride], input[56 * instride]); + x1[56] = _mm_sub_epi32(input[7 * instride], input[56 * instride]); + x1[8] = _mm_add_epi32(input[8 * instride], input[55 * instride]); + x1[55] = _mm_sub_epi32(input[8 * instride], input[55 * instride]); + x1[9] = _mm_add_epi32(input[9 * instride], input[54 * instride]); + x1[54] = _mm_sub_epi32(input[9 * instride], input[54 * instride]); + x1[10] = _mm_add_epi32(input[10 * instride], input[53 * instride]); + x1[53] = _mm_sub_epi32(input[10 * instride], input[53 * instride]); + x1[11] = _mm_add_epi32(input[11 * instride], input[52 * instride]); + x1[52] = _mm_sub_epi32(input[11 * instride], input[52 * instride]); + x1[12] = _mm_add_epi32(input[12 * instride], input[51 * instride]); + x1[51] = _mm_sub_epi32(input[12 * instride], input[51 * instride]); + x1[13] = _mm_add_epi32(input[13 * instride], input[50 * instride]); + x1[50] = _mm_sub_epi32(input[13 * instride], input[50 * instride]); + x1[14] = _mm_add_epi32(input[14 * instride], input[49 * instride]); + x1[49] = _mm_sub_epi32(input[14 * instride], input[49 * instride]); + x1[15] = _mm_add_epi32(input[15 * instride], input[48 * instride]); + x1[48] = _mm_sub_epi32(input[15 * instride], input[48 * instride]); + x1[16] = _mm_add_epi32(input[16 * instride], input[47 * instride]); + x1[47] = _mm_sub_epi32(input[16 * instride], input[47 * instride]); + x1[17] = _mm_add_epi32(input[17 * instride], input[46 * instride]); + x1[46] = _mm_sub_epi32(input[17 * instride], input[46 * instride]); + x1[18] = _mm_add_epi32(input[18 * instride], input[45 * instride]); + x1[45] = _mm_sub_epi32(input[18 * instride], input[45 * instride]); + x1[19] = _mm_add_epi32(input[19 * instride], input[44 * instride]); + x1[44] = _mm_sub_epi32(input[19 * instride], input[44 * instride]); + x1[20] = _mm_add_epi32(input[20 * instride], input[43 * instride]); + x1[43] = _mm_sub_epi32(input[20 * instride], input[43 * instride]); + x1[21] = _mm_add_epi32(input[21 * instride], input[42 * instride]); + x1[42] = _mm_sub_epi32(input[21 * instride], input[42 * instride]); + x1[22] = _mm_add_epi32(input[22 * instride], input[41 * instride]); + x1[41] = _mm_sub_epi32(input[22 * instride], input[41 * instride]); + x1[23] = _mm_add_epi32(input[23 * instride], input[40 * instride]); + x1[40] = _mm_sub_epi32(input[23 * instride], input[40 * instride]); + x1[24] = _mm_add_epi32(input[24 * instride], input[39 * instride]); + x1[39] = _mm_sub_epi32(input[24 * instride], input[39 * instride]); + x1[25] = _mm_add_epi32(input[25 * instride], input[38 * instride]); + x1[38] = _mm_sub_epi32(input[25 * instride], input[38 * instride]); + x1[26] = _mm_add_epi32(input[26 * instride], input[37 * instride]); + x1[37] = _mm_sub_epi32(input[26 * instride], input[37 * instride]); + x1[27] = _mm_add_epi32(input[27 * instride], input[36 * instride]); + x1[36] = _mm_sub_epi32(input[27 * instride], input[36 * instride]); + x1[28] = _mm_add_epi32(input[28 * instride], input[35 * instride]); + x1[35] = _mm_sub_epi32(input[28 * instride], input[35 * instride]); + x1[29] = _mm_add_epi32(input[29 * instride], input[34 * instride]); + x1[34] = _mm_sub_epi32(input[29 * instride], input[34 * instride]); + x1[30] = _mm_add_epi32(input[30 * instride], input[33 * instride]); + x1[33] = _mm_sub_epi32(input[30 * instride], input[33 * instride]); + x1[31] = _mm_add_epi32(input[31 * instride], input[32 * instride]); + x1[32] = _mm_sub_epi32(input[31 * instride], input[32 * instride]); // stage 2 __m128i x2[64]; @@ -1149,68 +1150,68 @@ void av1_fdct64_new_sse4_1(const __m128i *input, __m128i *output, x10[48], __rounding, cos_bit); // stage 11 - output[0] = x10[0]; - output[1] = x10[32]; - output[2] = x10[16]; - output[3] = x10[48]; - output[4] = x10[8]; - output[5] = x10[40]; - output[6] = x10[24]; - output[7] = x10[56]; - output[8] = x10[4]; - output[9] = x10[36]; - output[10] = x10[20]; - output[11] = x10[52]; - output[12] = x10[12]; - output[13] = x10[44]; - output[14] = x10[28]; - output[15] = x10[60]; - output[16] = x10[2]; - output[17] = x10[34]; - output[18] = x10[18]; - output[19] = x10[50]; - output[20] = x10[10]; - output[21] = x10[42]; - output[22] = x10[26]; - output[23] = x10[58]; - output[24] = x10[6]; - output[25] = x10[38]; - output[26] = x10[22]; - output[27] = x10[54]; - output[28] = x10[14]; - output[29] = x10[46]; - output[30] = x10[30]; - output[31] = x10[62]; - output[32] = x10[1]; - output[33] = x10[33]; - output[34] = x10[17]; - output[35] = x10[49]; - output[36] = x10[9]; - output[37] = x10[41]; - output[38] = x10[25]; - output[39] = x10[57]; - output[40] = x10[5]; - output[41] = x10[37]; - output[42] = x10[21]; - output[43] = x10[53]; - output[44] = x10[13]; - output[45] = x10[45]; - output[46] = x10[29]; - output[47] = x10[61]; - output[48] = x10[3]; - output[49] = x10[35]; - output[50] = x10[19]; - output[51] = x10[51]; - output[52] = x10[11]; - output[53] = x10[43]; - output[54] = x10[27]; - output[55] = x10[59]; - output[56] = x10[7]; - output[57] = x10[39]; - output[58] = x10[23]; - output[59] = x10[55]; - output[60] = x10[15]; - output[61] = x10[47]; - output[62] = x10[31]; - output[63] = x10[63]; + output[0 * outstride] = x10[0]; + output[1 * outstride] = x10[32]; + output[2 * outstride] = x10[16]; + output[3 * outstride] = x10[48]; + output[4 * outstride] = x10[8]; + output[5 * outstride] = x10[40]; + output[6 * outstride] = x10[24]; + output[7 * outstride] = x10[56]; + output[8 * outstride] = x10[4]; + output[9 * outstride] = x10[36]; + output[10 * outstride] = x10[20]; + output[11 * outstride] = x10[52]; + output[12 * outstride] = x10[12]; + output[13 * outstride] = x10[44]; + output[14 * outstride] = x10[28]; + output[15 * outstride] = x10[60]; + output[16 * outstride] = x10[2]; + output[17 * outstride] = x10[34]; + output[18 * outstride] = x10[18]; + output[19 * outstride] = x10[50]; + output[20 * outstride] = x10[10]; + output[21 * outstride] = x10[42]; + output[22 * outstride] = x10[26]; + output[23 * outstride] = x10[58]; + output[24 * outstride] = x10[6]; + output[25 * outstride] = x10[38]; + output[26 * outstride] = x10[22]; + output[27 * outstride] = x10[54]; + output[28 * outstride] = x10[14]; + output[29 * outstride] = x10[46]; + output[30 * outstride] = x10[30]; + output[31 * outstride] = x10[62]; + output[32 * outstride] = x10[1]; + output[33 * outstride] = x10[33]; + output[34 * outstride] = x10[17]; + output[35 * outstride] = x10[49]; + output[36 * outstride] = x10[9]; + output[37 * outstride] = x10[41]; + output[38 * outstride] = x10[25]; + output[39 * outstride] = x10[57]; + output[40 * outstride] = x10[5]; + output[41 * outstride] = x10[37]; + output[42 * outstride] = x10[21]; + output[43 * outstride] = x10[53]; + output[44 * outstride] = x10[13]; + output[45 * outstride] = x10[45]; + output[46 * outstride] = x10[29]; + output[47 * outstride] = x10[61]; + output[48 * outstride] = x10[3]; + output[49 * outstride] = x10[35]; + output[50 * outstride] = x10[19]; + output[51 * outstride] = x10[51]; + output[52 * outstride] = x10[11]; + output[53 * outstride] = x10[43]; + output[54 * outstride] = x10[27]; + output[55 * outstride] = x10[59]; + output[56 * outstride] = x10[7]; + output[57 * outstride] = x10[39]; + output[58 * outstride] = x10[23]; + output[59 * outstride] = x10[55]; + output[60 * outstride] = x10[15]; + output[61 * outstride] = x10[47]; + output[62 * outstride] = x10[31]; + output[63 * outstride] = x10[63]; } diff --git a/third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_sse4.c b/third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_sse4.c index abb95f31e..8ec0256eb 100644 --- a/third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_sse4.c +++ b/third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_sse4.c @@ -14,6 +14,7 @@ #include "av1/common/enums.h" #include "av1/common/av1_txfm.h" #include "av1/common/x86/av1_txfm_sse2.h" +#include "av1/common/x86/highbd_txfm_utility_sse4.h" #include "av1/encoder/av1_fwd_txfm1d_cfg.h" #include "av1/encoder/x86/av1_txfm1d_sse4.h" #include "av1/encoder/x86/av1_fwd_txfm_sse2.h" @@ -52,9 +53,22 @@ static void fdct32_new_sse4_1(const __m128i *input, __m128i *output, } } +static void fdct64_new_sse4_1(const __m128i *input, __m128i *output, + const int8_t cos_bit, const int8_t *stage_range) { + const int txfm_size = 64; + const int num_per_128 = 4; + int col_num = txfm_size / num_per_128; + (void)stage_range; + for (int col = 0; col < col_num; col++) { + av1_fdct64_new_sse4_1((input + col), (output + col), cos_bit, col_num, + col_num); + } +} + static INLINE TxfmFuncSSE2 fwd_txfm_type_to_func(TXFM_TYPE txfm_type) { switch (txfm_type) { case TXFM_TYPE_DCT32: return fdct32_new_sse4_1; break; + case TXFM_TYPE_DCT64: return fdct64_new_sse4_1; break; default: assert(0); } return NULL; @@ -95,6 +109,42 @@ static INLINE void fwd_txfm2d_sse4_1(const int16_t *input, int32_t *output, transpose_32(txfm_size, buf_128, out_128); } +static INLINE void fwd_txfm2d_64x64_sse4_1(const int16_t *input, + int32_t *output, const int stride, + const TXFM_2D_FLIP_CFG *cfg, + int32_t *txfm_buf) { + assert(cfg->tx_size < TX_SIZES); + const int txfm_size = tx_size_wide[cfg->tx_size]; + const int8_t *shift = cfg->shift; + const int8_t *stage_range_col = cfg->stage_range_col; + const int8_t cos_bit_col = cfg->cos_bit_col; + const int8_t cos_bit_row = cfg->cos_bit_row; + const TxfmFuncSSE2 txfm_func_col = fwd_txfm_type_to_func(cfg->txfm_type_col); + __m128i *buf_128 = (__m128i *)txfm_buf; + __m128i *out_128 = (__m128i *)output; + + const int num_per_128 = 4; + int txfm2d_size_128 = txfm_size * txfm_size / num_per_128; + int col_num = txfm_size / num_per_128; + + int16_array_with_stride_to_int32_array_without_stride(input, stride, output, + txfm_size); + /*col wise transform*/ + txfm_func_col(out_128, buf_128, cos_bit_col, stage_range_col); + av1_round_shift_array_32_sse4_1(buf_128, out_128, txfm2d_size_128, -shift[1]); + transpose_32(txfm_size, out_128, buf_128); + + /*row wise transform*/ + for (int col = 0; col < (col_num >> 1); col++) { + av1_fdct64_new_sse4_1((buf_128 + col), (out_128 + col), cos_bit_row, + col_num, (col_num >> 1)); + } + + txfm2d_size_128 = (col_num >> 1) * (txfm_size >> 1); + av1_round_shift_array_32_sse4_1(out_128, buf_128, txfm2d_size_128, -shift[2]); + transpose_32x32(buf_128, out_128); +} + void av1_fwd_txfm2d_32x32_sse4_1(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd) { DECLARE_ALIGNED(16, int32_t, txfm_buf[1024]); @@ -104,6 +154,15 @@ void av1_fwd_txfm2d_32x32_sse4_1(const int16_t *input, int32_t *output, fwd_txfm2d_sse4_1(input, output, stride, &cfg, txfm_buf); } +void av1_fwd_txfm2d_64x64_sse4_1(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(16, int32_t, txfm_buf[4096]); + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_64X64, &cfg); + (void)bd; + fwd_txfm2d_64x64_sse4_1(input, output, stride, &cfg, txfm_buf); +} + static INLINE void transpose_32_4x4x2(int stride, const __m128i *inputA, const __m128i *inputB, __m128i *output) { __m128i temp0 = _mm_unpacklo_epi32(inputA[0], inputA[2]); @@ -162,8 +221,8 @@ static void lowbd_fwd_txfm2d_64x64_sse4_1(const int16_t *input, int32_t *output, bufA[j] = _mm_cvtepi16_epi32(buf[j]); bufB[j] = _mm_cvtepi16_epi32(_mm_unpackhi_epi64(buf[j], buf[j])); } - av1_fdct64_new_sse4_1(bufA, bufA, cos_bit_row); - av1_fdct64_new_sse4_1(bufB, bufB, cos_bit_row); + av1_fdct64_new_sse4_1(bufA, bufA, cos_bit_row, 1, 1); + av1_fdct64_new_sse4_1(bufB, bufB, cos_bit_row, 1, 1); av1_round_shift_array_32_sse4_1(bufA, bufA, 32, -shift[2]); av1_round_shift_array_32_sse4_1(bufB, bufB, 32, -shift[2]); @@ -209,10 +268,10 @@ static void lowbd_fwd_txfm2d_64x32_sse4_1(const int16_t *input, int32_t *output, bufA[j] = _mm_cvtepi16_epi32(buf[j]); bufB[j] = _mm_cvtepi16_epi32(_mm_unpackhi_epi64(buf[j], buf[j])); } - av1_fdct64_new_sse4_1(bufA, bufA, cos_bit_row); - av1_fdct64_new_sse4_1(bufB, bufB, cos_bit_row); - av1_round_shift_rect_array_32_sse4_1(bufA, bufA, 32, -shift[2]); - av1_round_shift_rect_array_32_sse4_1(bufB, bufB, 32, -shift[2]); + av1_fdct64_new_sse4_1(bufA, bufA, cos_bit_row, 1, 1); + av1_fdct64_new_sse4_1(bufB, bufB, cos_bit_row, 1, 1); + av1_round_shift_rect_array_32_sse4_1(bufA, bufA, 32, -shift[2], NewSqrt2); + av1_round_shift_rect_array_32_sse4_1(bufB, bufB, 32, -shift[2], NewSqrt2); int32_t *output8 = output + 8 * 32 * i; for (int j = 0; j < width_div8; ++j) { @@ -260,8 +319,8 @@ static void lowbd_fwd_txfm2d_32x64_sse4_1(const int16_t *input, int32_t *output, } av1_fdct32_new_sse4_1(bufA, bufA, cos_bit_row); av1_fdct32_new_sse4_1(bufB, bufB, cos_bit_row); - av1_round_shift_rect_array_32_sse4_1(bufA, bufA, 32, -shift[2]); - av1_round_shift_rect_array_32_sse4_1(bufB, bufB, 32, -shift[2]); + av1_round_shift_rect_array_32_sse4_1(bufA, bufA, 32, -shift[2], NewSqrt2); + av1_round_shift_rect_array_32_sse4_1(bufB, bufB, 32, -shift[2], NewSqrt2); int32_t *output8 = output + 8 * 32 * i; for (int j = 0; j < (32 / 4); ++j) { diff --git a/third_party/aom/av1/encoder/x86/av1_fwd_txfm_avx2.h b/third_party/aom/av1/encoder/x86/av1_fwd_txfm_avx2.h index c582ca0e3..38707137c 100644 --- a/third_party/aom/av1/encoder/x86/av1_fwd_txfm_avx2.h +++ b/third_party/aom/av1/encoder/x86/av1_fwd_txfm_avx2.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_FWD_TXFM_AVX2_H_ -#define AV1_FWD_TXFM_AVX2_H_ +#ifndef AOM_AV1_ENCODER_X86_AV1_FWD_TXFM_AVX2_H_ +#define AOM_AV1_ENCODER_X86_AV1_FWD_TXFM_AVX2_H_ #include <immintrin.h> static INLINE __m256i av1_round_shift_32_avx2(__m256i vec, int bit) { @@ -100,4 +100,4 @@ static INLINE void btf_32_avx2_type1_new(const __m256i ww0, const __m256i ww1, *in1 = _mm256_srai_epi32(temp1, cos_bit); } -#endif // AV1_FWD_TXFM_AVX2_H_ +#endif // AOM_AV1_ENCODER_X86_AV1_FWD_TXFM_AVX2_H_ diff --git a/third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.h b/third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.h index aa14d3ade..99a6b9082 100644 --- a/third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.h +++ b/third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.h @@ -8,8 +8,8 @@ * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_COMMON_X86_AV1_FWD_TXFM_SSE2_H_ -#define AV1_COMMON_X86_AV1_FWD_TXFM_SSE2_H_ +#ifndef AOM_AV1_ENCODER_X86_AV1_FWD_TXFM_SSE2_H_ +#define AOM_AV1_ENCODER_X86_AV1_FWD_TXFM_SSE2_H_ #include <immintrin.h> @@ -114,4 +114,4 @@ static const transform_1d_sse2 col_txfm8x32_arr[TX_TYPES] = { } #endif -#endif // AV1_COMMON_X86_AV1_FWD_TXFM_SSE2_H_ +#endif // AOM_AV1_ENCODER_X86_AV1_FWD_TXFM_SSE2_H_ diff --git a/third_party/aom/av1/encoder/x86/av1_txfm1d_sse4.h b/third_party/aom/av1/encoder/x86/av1_txfm1d_sse4.h index 0adefecdb..6df2a8bdb 100644 --- a/third_party/aom/av1/encoder/x86/av1_txfm1d_sse4.h +++ b/third_party/aom/av1/encoder/x86/av1_txfm1d_sse4.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_TXMF1D_SSE2_H_ -#define AV1_TXMF1D_SSE2_H_ +#ifndef AOM_AV1_ENCODER_X86_AV1_TXFM1D_SSE4_H_ +#define AOM_AV1_ENCODER_X86_AV1_TXFM1D_SSE4_H_ #include <smmintrin.h> #include "av1/common/av1_txfm.h" @@ -29,7 +29,8 @@ void av1_fdct16_new_sse4_1(const __m128i *input, __m128i *output, void av1_fdct32_new_sse4_1(const __m128i *input, __m128i *output, int8_t cos_bit); void av1_fdct64_new_sse4_1(const __m128i *input, __m128i *output, - int8_t cos_bit); + int8_t cos_bit, const int instride, + const int outstride); void av1_fadst4_new_sse4_1(const __m128i *input, __m128i *output, const int8_t cos_bit, const int8_t *stage_range); @@ -138,4 +139,4 @@ static INLINE void transpose_32(int txfm_size, const __m128i *input, } #endif -#endif // AV1_TXMF1D_SSE2_H_ +#endif // AOM_AV1_ENCODER_X86_AV1_TXFM1D_SSE4_H_ diff --git a/third_party/aom/av1/encoder/x86/encodetxb_avx2.c b/third_party/aom/av1/encoder/x86/encodetxb_avx2.c new file mode 100644 index 000000000..7642f57d1 --- /dev/null +++ b/third_party/aom/av1/encoder/x86/encodetxb_avx2.c @@ -0,0 +1,130 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <assert.h> +#include <emmintrin.h> // SSE2 +#include <smmintrin.h> /* SSE4.1 */ +#include <immintrin.h> /* AVX2 */ + +#include "aom/aom_integer.h" +#include "aom_dsp/x86/mem_sse2.h" +#include "av1/common/onyxc_int.h" +#include "av1/common/txb_common.h" +#include "aom_dsp/x86/synonyms.h" +#include "aom_dsp/x86/synonyms_avx2.h" + +void av1_txb_init_levels_avx2(const tran_low_t *const coeff, const int width, + const int height, uint8_t *const levels) { + const int stride = width + TX_PAD_HOR; + const __m256i y_zeros = _mm256_setzero_si256(); + + const int32_t pre_len = sizeof(*levels) * TX_PAD_TOP * stride; + uint8_t *pre_buf = levels - TX_PAD_TOP * stride; + uint8_t *pre_buf_end = pre_buf + pre_len; + do { + yy_storeu_256(pre_buf, y_zeros); + pre_buf += 32; + } while (pre_buf < pre_buf_end); + + const int32_t bottom_len = sizeof(*levels) * (TX_PAD_BOTTOM * stride); + uint8_t *bottom_buf_end = levels + (height + TX_PAD_BOTTOM) * stride; + uint8_t *bottom_buf = bottom_buf_end - ((bottom_len + 31) & (~31)); + + do { + yy_storeu_256(bottom_buf, y_zeros); + bottom_buf += 32; + } while (bottom_buf < bottom_buf_end); + + int i = 0; + uint8_t *ls = levels; + const tran_low_t *cf = coeff; + if (width == 4) { + do { + const __m256i c0 = yy_loadu_256(cf); + const __m256i c1 = yy_loadu_256(cf + 8); + const __m256i abs01 = _mm256_abs_epi16(_mm256_packs_epi32(c0, c1)); + const __m256i abs01_8 = _mm256_packs_epi16(abs01, y_zeros); + const __m256i res_ = _mm256_shuffle_epi32(abs01_8, 0xd8); + const __m256i res = _mm256_permute4x64_epi64(res_, 0xd8); + yy_storeu_256(ls, res); + ls += 32; + cf += 16; + i += 4; + } while (i < height); + } else if (width == 8) { + do { + const __m256i coeffA = yy_loadu_256(cf); + const __m256i coeffB = yy_loadu_256(cf + 8); + const __m256i coeffC = yy_loadu_256(cf + 16); + const __m256i coeffD = yy_loadu_256(cf + 24); + const __m256i coeffAB = _mm256_packs_epi32(coeffA, coeffB); + const __m256i coeffCD = _mm256_packs_epi32(coeffC, coeffD); + const __m256i absAB = _mm256_abs_epi16(coeffAB); + const __m256i absCD = _mm256_abs_epi16(coeffCD); + const __m256i absABCD = _mm256_packs_epi16(absAB, absCD); + const __m256i res_ = _mm256_permute4x64_epi64(absABCD, 0xd8); + const __m256i res = _mm256_shuffle_epi32(res_, 0xd8); + const __m128i res0 = _mm256_castsi256_si128(res); + const __m128i res1 = _mm256_extracti128_si256(res, 1); + xx_storel_64(ls, res0); + *(int32_t *)(ls + width) = 0; + xx_storel_64(ls + stride, _mm_srli_si128(res0, 8)); + *(int32_t *)(ls + width + stride) = 0; + xx_storel_64(ls + stride * 2, res1); + *(int32_t *)(ls + width + stride * 2) = 0; + xx_storel_64(ls + stride * 3, _mm_srli_si128(res1, 8)); + *(int32_t *)(ls + width + stride * 3) = 0; + cf += 32; + ls += stride << 2; + i += 4; + } while (i < height); + } else if (width == 16) { + do { + const __m256i coeffA = yy_loadu_256(cf); + const __m256i coeffB = yy_loadu_256(cf + 8); + const __m256i coeffC = yy_loadu_256(cf + 16); + const __m256i coeffD = yy_loadu_256(cf + 24); + const __m256i coeffAB = _mm256_packs_epi32(coeffA, coeffB); + const __m256i coeffCD = _mm256_packs_epi32(coeffC, coeffD); + const __m256i absAB = _mm256_abs_epi16(coeffAB); + const __m256i absCD = _mm256_abs_epi16(coeffCD); + const __m256i absABCD = _mm256_packs_epi16(absAB, absCD); + const __m256i res_ = _mm256_permute4x64_epi64(absABCD, 0xd8); + const __m256i res = _mm256_shuffle_epi32(res_, 0xd8); + xx_storeu_128(ls, _mm256_castsi256_si128(res)); + xx_storeu_128(ls + stride, _mm256_extracti128_si256(res, 1)); + cf += 32; + *(int32_t *)(ls + width) = 0; + *(int32_t *)(ls + stride + width) = 0; + ls += stride << 1; + i += 2; + } while (i < height); + } else { + do { + const __m256i coeffA = yy_loadu_256(cf); + const __m256i coeffB = yy_loadu_256(cf + 8); + const __m256i coeffC = yy_loadu_256(cf + 16); + const __m256i coeffD = yy_loadu_256(cf + 24); + const __m256i coeffAB = _mm256_packs_epi32(coeffA, coeffB); + const __m256i coeffCD = _mm256_packs_epi32(coeffC, coeffD); + const __m256i absAB = _mm256_abs_epi16(coeffAB); + const __m256i absCD = _mm256_abs_epi16(coeffCD); + const __m256i absABCD = _mm256_packs_epi16(absAB, absCD); + const __m256i res_ = _mm256_permute4x64_epi64(absABCD, 0xd8); + const __m256i res = _mm256_shuffle_epi32(res_, 0xd8); + yy_storeu_256(ls, res); + cf += 32; + *(int32_t *)(ls + width) = 0; + ls += stride; + i += 1; + } while (i < height); + } +} diff --git a/third_party/aom/av1/encoder/x86/encodetxb_sse4.c b/third_party/aom/av1/encoder/x86/encodetxb_sse4.c index b3a879b0f..5e0687cd3 100644 --- a/third_party/aom/av1/encoder/x86/encodetxb_sse4.c +++ b/third_party/aom/av1/encoder/x86/encodetxb_sse4.c @@ -14,43 +14,55 @@ #include <smmintrin.h> /* SSE4.1 */ #include "aom/aom_integer.h" -#include "aom_dsp/x86/mem_sse2.h" #include "av1/common/onyxc_int.h" #include "av1/common/txb_common.h" +#include "aom_dsp/x86/synonyms.h" void av1_txb_init_levels_sse4_1(const tran_low_t *const coeff, const int width, const int height, uint8_t *const levels) { const int stride = width + TX_PAD_HOR; - memset(levels - TX_PAD_TOP * stride, 0, - sizeof(*levels) * TX_PAD_TOP * stride); - memset(levels + stride * height, 0, - sizeof(*levels) * (TX_PAD_BOTTOM * stride + TX_PAD_END)); - const __m128i zeros = _mm_setzero_si128(); + + const int32_t pre_len = sizeof(*levels) * TX_PAD_TOP * stride; + uint8_t *pre_buf = levels - TX_PAD_TOP * stride; + uint8_t *pre_buf_end = pre_buf + pre_len; + do { + _mm_storeu_si128((__m128i *)(pre_buf), zeros); + pre_buf += 16; + } while (pre_buf < pre_buf_end); + + const int32_t bottom_len = sizeof(*levels) * (TX_PAD_BOTTOM * stride); + uint8_t *bottom_buf = levels + stride * height; + uint8_t *bottom_buf_end = bottom_buf + bottom_len; + do { + _mm_storeu_si128((__m128i *)(bottom_buf), zeros); + bottom_buf += 16; + } while (bottom_buf < bottom_buf_end); + int i = 0; uint8_t *ls = levels; const tran_low_t *cf = coeff; if (width == 4) { do { - const __m128i coeffA = _mm_load_si128((__m128i *)(cf)); - const __m128i coeffB = _mm_load_si128((__m128i *)(cf + width)); + const __m128i coeffA = xx_loadu_128(cf); + const __m128i coeffB = xx_loadu_128(cf + 4); const __m128i coeffAB = _mm_packs_epi32(coeffA, coeffB); const __m128i absAB = _mm_abs_epi16(coeffAB); const __m128i absAB8 = _mm_packs_epi16(absAB, zeros); const __m128i lsAB = _mm_unpacklo_epi32(absAB8, zeros); - _mm_storeu_si128((__m128i *)ls, lsAB); + xx_storeu_128(ls, lsAB); ls += (stride << 1); cf += (width << 1); i += 2; } while (i < height); } else if (width == 8) { do { - const __m128i coeffA = _mm_load_si128((__m128i *)(cf)); - const __m128i coeffB = _mm_load_si128((__m128i *)(cf + 4)); + const __m128i coeffA = xx_loadu_128(cf); + const __m128i coeffB = xx_loadu_128(cf + 4); const __m128i coeffAB = _mm_packs_epi32(coeffA, coeffB); const __m128i absAB = _mm_abs_epi16(coeffAB); const __m128i absAB8 = _mm_packs_epi16(absAB, zeros); - _mm_storeu_si128((__m128i *)ls, absAB8); + xx_storeu_128(ls, absAB8); ls += stride; cf += width; i += 1; @@ -59,16 +71,16 @@ void av1_txb_init_levels_sse4_1(const tran_low_t *const coeff, const int width, do { int j = 0; do { - const __m128i coeffA = _mm_load_si128((__m128i *)(cf)); - const __m128i coeffB = _mm_load_si128((__m128i *)(cf + 4)); - const __m128i coeffC = _mm_load_si128((__m128i *)(cf + 8)); - const __m128i coeffD = _mm_load_si128((__m128i *)(cf + 12)); + const __m128i coeffA = xx_loadu_128(cf); + const __m128i coeffB = xx_loadu_128(cf + 4); + const __m128i coeffC = xx_loadu_128(cf + 8); + const __m128i coeffD = xx_loadu_128(cf + 12); const __m128i coeffAB = _mm_packs_epi32(coeffA, coeffB); const __m128i coeffCD = _mm_packs_epi32(coeffC, coeffD); const __m128i absAB = _mm_abs_epi16(coeffAB); const __m128i absCD = _mm_abs_epi16(coeffCD); const __m128i absABCD = _mm_packs_epi16(absAB, absCD); - _mm_storeu_si128((__m128i *)(ls + j), absABCD); + xx_storeu_128(ls + j, absABCD); j += 16; cf += 16; } while (j < width); diff --git a/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c b/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c index 4cd6371a6..535485ae8 100644 --- a/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c +++ b/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c @@ -17,6 +17,7 @@ #include "av1/common/av1_txfm.h" #include "av1/common/x86/highbd_txfm_utility_sse4.h" #include "av1/encoder/av1_fwd_txfm1d_cfg.h" +#include "av1/encoder/x86/av1_txfm1d_sse4.h" #include "aom_dsp/txfm_common.h" #include "aom_dsp/x86/txfm_common_sse2.h" #include "aom_ports/mem.h" @@ -393,7 +394,32 @@ static INLINE void write_buffer_8x8(const __m128i *res, int32_t *output) { _mm_store_si128((__m128i *)(output + 15 * 4), res[15]); } -static void fdct8x8_sse4_1(__m128i *in, __m128i *out, int bit) { +static INLINE void write_buffer_16x8(const __m128i *res, int32_t *output, + const int stride) { + _mm_storeu_si128((__m128i *)(output), res[0]); + _mm_storeu_si128((__m128i *)(output + 4), res[1]); + _mm_storeu_si128((__m128i *)(output + stride), res[2]); + _mm_storeu_si128((__m128i *)(output + stride + 4), res[3]); + + _mm_storeu_si128((__m128i *)(output + (stride * 2)), res[4]); + _mm_storeu_si128((__m128i *)(output + (stride * 2) + 4), res[5]); + _mm_storeu_si128((__m128i *)(output + (stride * 3)), res[6]); + _mm_storeu_si128((__m128i *)(output + (stride * 3) + 4), res[7]); + + _mm_storeu_si128((__m128i *)(output + (stride * 4)), res[8]); + _mm_storeu_si128((__m128i *)(output + (stride * 4) + 4), res[9]); + _mm_storeu_si128((__m128i *)(output + (stride * 5)), res[10]); + _mm_storeu_si128((__m128i *)(output + (stride * 5) + 4), res[11]); + + _mm_storeu_si128((__m128i *)(output + (stride * 6)), res[12]); + _mm_storeu_si128((__m128i *)(output + (stride * 6) + 4), res[13]); + _mm_storeu_si128((__m128i *)(output + (stride * 7)), res[14]); + _mm_storeu_si128((__m128i *)(output + (stride * 7) + 4), res[15]); +} + +static void fdct8x8_sse4_1(__m128i *in, __m128i *out, int bit, + const int col_num) { + (void)(col_num); const int32_t *cospi = cospi_arr(bit); const __m128i cospi32 = _mm_set1_epi32(cospi[32]); const __m128i cospim32 = _mm_set1_epi32(-cospi[32]); @@ -589,7 +615,9 @@ static void fdct8x8_sse4_1(__m128i *in, __m128i *out, int bit) { out[13] = u[3]; // buf0[3] } -static void fadst8x8_sse4_1(__m128i *in, __m128i *out, int bit) { +static void fadst8x8_sse4_1(__m128i *in, __m128i *out, int bit, + const int col_num) { + (void)(col_num); const int32_t *cospi = cospi_arr(bit); const __m128i cospi32 = _mm_set1_epi32(cospi[32]); const __m128i cospi16 = _mm_set1_epi32(cospi[16]); @@ -780,82 +808,82 @@ void av1_fwd_txfm2d_8x8_sse4_1(const int16_t *input, int32_t *coeff, int stride, switch (tx_type) { case DCT_DCT: load_buffer_8x8(input, in, stride, 0, 0, shift[0]); - fdct8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]); + fdct8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 0); col_txfm_8x8_rounding(out, -shift[1]); transpose_8x8(out, in); - fdct8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]); + fdct8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 0); transpose_8x8(out, in); write_buffer_8x8(in, coeff); break; case ADST_DCT: load_buffer_8x8(input, in, stride, 0, 0, shift[0]); - fadst8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]); + fadst8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 0); col_txfm_8x8_rounding(out, -shift[1]); transpose_8x8(out, in); - fdct8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]); + fdct8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 0); transpose_8x8(out, in); write_buffer_8x8(in, coeff); break; case DCT_ADST: load_buffer_8x8(input, in, stride, 0, 0, shift[0]); - fdct8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]); + fdct8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 0); col_txfm_8x8_rounding(out, -shift[1]); transpose_8x8(out, in); - fadst8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]); + fadst8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 0); transpose_8x8(out, in); write_buffer_8x8(in, coeff); break; case ADST_ADST: load_buffer_8x8(input, in, stride, 0, 0, shift[0]); - fadst8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]); + fadst8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 0); col_txfm_8x8_rounding(out, -shift[1]); transpose_8x8(out, in); - fadst8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]); + fadst8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 0); transpose_8x8(out, in); write_buffer_8x8(in, coeff); break; case FLIPADST_DCT: load_buffer_8x8(input, in, stride, 1, 0, shift[0]); - fadst8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]); + fadst8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 0); col_txfm_8x8_rounding(out, -shift[1]); transpose_8x8(out, in); - fdct8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]); + fdct8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 0); transpose_8x8(out, in); write_buffer_8x8(in, coeff); break; case DCT_FLIPADST: load_buffer_8x8(input, in, stride, 0, 1, shift[0]); - fdct8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]); + fdct8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 0); col_txfm_8x8_rounding(out, -shift[1]); transpose_8x8(out, in); - fadst8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]); + fadst8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 0); transpose_8x8(out, in); write_buffer_8x8(in, coeff); break; case FLIPADST_FLIPADST: load_buffer_8x8(input, in, stride, 1, 1, shift[0]); - fadst8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]); + fadst8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 0); col_txfm_8x8_rounding(out, -shift[1]); transpose_8x8(out, in); - fadst8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]); + fadst8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 0); transpose_8x8(out, in); write_buffer_8x8(in, coeff); break; case ADST_FLIPADST: load_buffer_8x8(input, in, stride, 0, 1, shift[0]); - fadst8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]); + fadst8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 0); col_txfm_8x8_rounding(out, -shift[1]); transpose_8x8(out, in); - fadst8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]); + fadst8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 0); transpose_8x8(out, in); write_buffer_8x8(in, coeff); break; case FLIPADST_ADST: load_buffer_8x8(input, in, stride, 1, 0, shift[0]); - fadst8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]); + fadst8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 0); col_txfm_8x8_rounding(out, -shift[1]); transpose_8x8(out, in); - fadst8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]); + fadst8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 0); transpose_8x8(out, in); write_buffer_8x8(in, coeff); break; @@ -940,7 +968,26 @@ static INLINE void load_buffer_16x16(const int16_t *input, __m128i *out, convert_8x8_to_16x16(in, out); } -static void fdct16x16_sse4_1(__m128i *in, __m128i *out, int bit) { +static INLINE void load_buffer_8x16(const int16_t *input, __m128i *out, + int stride, int flipud, int fliplr, + int shift) { + const int16_t *topL = input; + const int16_t *botL = input + 8 * stride; + + const int16_t *tmp; + + if (flipud) { + tmp = topL; + topL = botL; + botL = tmp; + } + + load_buffer_8x8(topL, out, stride, flipud, fliplr, shift); + load_buffer_8x8(botL, out + 16, stride, flipud, fliplr, shift); +} + +static void fdct16x16_sse4_1(__m128i *in, __m128i *out, int bit, + const int col_num) { const int32_t *cospi = cospi_arr(bit); const __m128i cospi32 = _mm_set1_epi32(cospi[32]); const __m128i cospim32 = _mm_set1_epi32(-cospi[32]); @@ -962,7 +1009,6 @@ static void fdct16x16_sse4_1(__m128i *in, __m128i *out, int bit) { const __m128i cospi52 = _mm_set1_epi32(cospi[52]); const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); __m128i u[16], v[16], x; - const int col_num = 4; int col; // Calculate the column 0, 1, 2, 3 @@ -1226,7 +1272,8 @@ static void fdct16x16_sse4_1(__m128i *in, __m128i *out, int bit) { } } -static void fadst16x16_sse4_1(__m128i *in, __m128i *out, int bit) { +static void fadst16x16_sse4_1(__m128i *in, __m128i *out, int bit, + const int num_cols) { const int32_t *cospi = cospi_arr(bit); const __m128i cospi32 = _mm_set1_epi32(cospi[32]); const __m128i cospi48 = _mm_set1_epi32(cospi[48]); @@ -1271,25 +1318,25 @@ static void fadst16x16_sse4_1(__m128i *in, __m128i *out, int bit) { __m128i u[16], v[16], x, y; int col; - for (col = 0; col < 4; ++col) { + for (col = 0; col < num_cols; ++col) { // stage 0 // stage 1 - u[0] = in[0 * 4 + col]; - u[1] = _mm_sub_epi32(zero, in[15 * 4 + col]); - u[2] = _mm_sub_epi32(zero, in[7 * 4 + col]); - u[3] = in[8 * 4 + col]; - u[4] = _mm_sub_epi32(zero, in[3 * 4 + col]); - u[5] = in[12 * 4 + col]; - u[6] = in[4 * 4 + col]; - u[7] = _mm_sub_epi32(zero, in[11 * 4 + col]); - u[8] = _mm_sub_epi32(zero, in[1 * 4 + col]); - u[9] = in[14 * 4 + col]; - u[10] = in[6 * 4 + col]; - u[11] = _mm_sub_epi32(zero, in[9 * 4 + col]); - u[12] = in[2 * 4 + col]; - u[13] = _mm_sub_epi32(zero, in[13 * 4 + col]); - u[14] = _mm_sub_epi32(zero, in[5 * 4 + col]); - u[15] = in[10 * 4 + col]; + u[0] = in[0 * num_cols + col]; + u[1] = _mm_sub_epi32(zero, in[15 * num_cols + col]); + u[2] = _mm_sub_epi32(zero, in[7 * num_cols + col]); + u[3] = in[8 * num_cols + col]; + u[4] = _mm_sub_epi32(zero, in[3 * num_cols + col]); + u[5] = in[12 * num_cols + col]; + u[6] = in[4 * num_cols + col]; + u[7] = _mm_sub_epi32(zero, in[11 * num_cols + col]); + u[8] = _mm_sub_epi32(zero, in[1 * num_cols + col]); + u[9] = in[14 * num_cols + col]; + u[10] = in[6 * num_cols + col]; + u[11] = _mm_sub_epi32(zero, in[9 * num_cols + col]); + u[12] = in[2 * num_cols + col]; + u[13] = _mm_sub_epi32(zero, in[13 * num_cols + col]); + u[14] = _mm_sub_epi32(zero, in[5 * num_cols + col]); + u[15] = in[10 * num_cols + col]; // stage 2 v[0] = u[0]; @@ -1453,22 +1500,22 @@ static void fadst16x16_sse4_1(__m128i *in, __m128i *out, int bit) { v[15] = half_btf_sse4_1(&cospi6, &u[14], &cospim58, &u[15], &rnding, bit); // stage 9 - out[0 * 4 + col] = v[1]; - out[1 * 4 + col] = v[14]; - out[2 * 4 + col] = v[3]; - out[3 * 4 + col] = v[12]; - out[4 * 4 + col] = v[5]; - out[5 * 4 + col] = v[10]; - out[6 * 4 + col] = v[7]; - out[7 * 4 + col] = v[8]; - out[8 * 4 + col] = v[9]; - out[9 * 4 + col] = v[6]; - out[10 * 4 + col] = v[11]; - out[11 * 4 + col] = v[4]; - out[12 * 4 + col] = v[13]; - out[13 * 4 + col] = v[2]; - out[14 * 4 + col] = v[15]; - out[15 * 4 + col] = v[0]; + out[0 * num_cols + col] = v[1]; + out[1 * num_cols + col] = v[14]; + out[2 * num_cols + col] = v[3]; + out[3 * num_cols + col] = v[12]; + out[4 * num_cols + col] = v[5]; + out[5 * num_cols + col] = v[10]; + out[6 * num_cols + col] = v[7]; + out[7 * num_cols + col] = v[8]; + out[8 * num_cols + col] = v[9]; + out[9 * num_cols + col] = v[6]; + out[10 * num_cols + col] = v[11]; + out[11 * num_cols + col] = v[4]; + out[12 * num_cols + col] = v[13]; + out[13 * num_cols + col] = v[2]; + out[14 * num_cols + col] = v[15]; + out[15 * num_cols + col] = v[0]; } } @@ -1482,6 +1529,11 @@ static void col_txfm_16x16_rounding(__m128i *in, int shift) { col_txfm_8x8_rounding(&in[48], shift); } +static void col_txfm_8x16_rounding(__m128i *in, int shift) { + col_txfm_8x8_rounding(&in[0], shift); + col_txfm_8x8_rounding(&in[16], shift); +} + static void write_buffer_16x16(const __m128i *in, int32_t *output) { const int size_8x8 = 16 * 4; write_buffer_8x8(&in[0], output); @@ -1499,85 +1551,86 @@ void av1_fwd_txfm2d_16x16_sse4_1(const int16_t *input, int32_t *coeff, const int8_t *shift = fwd_txfm_shift_ls[TX_16X16]; const int txw_idx = get_txw_idx(TX_16X16); const int txh_idx = get_txh_idx(TX_16X16); + const int col_num = 4; switch (tx_type) { case DCT_DCT: load_buffer_16x16(input, in, stride, 0, 0, shift[0]); - fdct16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]); + fdct16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num); col_txfm_16x16_rounding(out, -shift[1]); transpose_16x16(out, in); - fdct16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]); + fdct16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num); transpose_16x16(out, in); write_buffer_16x16(in, coeff); break; case ADST_DCT: load_buffer_16x16(input, in, stride, 0, 0, shift[0]); - fadst16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]); + fadst16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num); col_txfm_16x16_rounding(out, -shift[1]); transpose_16x16(out, in); - fdct16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]); + fdct16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num); transpose_16x16(out, in); write_buffer_16x16(in, coeff); break; case DCT_ADST: load_buffer_16x16(input, in, stride, 0, 0, shift[0]); - fdct16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]); + fdct16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num); col_txfm_16x16_rounding(out, -shift[1]); transpose_16x16(out, in); - fadst16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]); + fadst16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num); transpose_16x16(out, in); write_buffer_16x16(in, coeff); break; case ADST_ADST: load_buffer_16x16(input, in, stride, 0, 0, shift[0]); - fadst16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]); + fadst16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num); col_txfm_16x16_rounding(out, -shift[1]); transpose_16x16(out, in); - fadst16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]); + fadst16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num); transpose_16x16(out, in); write_buffer_16x16(in, coeff); break; case FLIPADST_DCT: load_buffer_16x16(input, in, stride, 1, 0, shift[0]); - fadst16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]); + fadst16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num); col_txfm_16x16_rounding(out, -shift[1]); transpose_16x16(out, in); - fdct16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]); + fdct16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num); transpose_16x16(out, in); write_buffer_16x16(in, coeff); break; case DCT_FLIPADST: load_buffer_16x16(input, in, stride, 0, 1, shift[0]); - fdct16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]); + fdct16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num); col_txfm_16x16_rounding(out, -shift[1]); transpose_16x16(out, in); - fadst16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]); + fadst16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num); transpose_16x16(out, in); write_buffer_16x16(in, coeff); break; case FLIPADST_FLIPADST: load_buffer_16x16(input, in, stride, 1, 1, shift[0]); - fadst16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]); + fadst16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num); col_txfm_16x16_rounding(out, -shift[1]); transpose_16x16(out, in); - fadst16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]); + fadst16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num); transpose_16x16(out, in); write_buffer_16x16(in, coeff); break; case ADST_FLIPADST: load_buffer_16x16(input, in, stride, 0, 1, shift[0]); - fadst16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]); + fadst16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num); col_txfm_16x16_rounding(out, -shift[1]); transpose_16x16(out, in); - fadst16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]); + fadst16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num); transpose_16x16(out, in); write_buffer_16x16(in, coeff); break; case FLIPADST_ADST: load_buffer_16x16(input, in, stride, 1, 0, shift[0]); - fadst16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]); + fadst16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num); col_txfm_16x16_rounding(out, -shift[1]); transpose_16x16(out, in); - fadst16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]); + fadst16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num); transpose_16x16(out, in); write_buffer_16x16(in, coeff); break; @@ -1585,3 +1638,146 @@ void av1_fwd_txfm2d_16x16_sse4_1(const int16_t *input, int32_t *coeff, } (void)bd; } + +static INLINE void flip_buf_sse4_1(__m128i *in, __m128i *out, int size) { + for (int i = 0; i < size; i += 2) in[30 - i] = out[i]; + for (int i = 1; i < size; i += 2) in[size - i] = out[i]; +} + +static const fwd_transform_1d_sse4_1 col_highbd_txfm8x8_arr[TX_TYPES] = { + fdct8x8_sse4_1, // DCT_DCT + fadst8x8_sse4_1, // ADST_DCT + fdct8x8_sse4_1, // DCT_ADST + fadst8x8_sse4_1, // ADST_ADST + fadst8x8_sse4_1, // FLIPADST_DCT + fdct8x8_sse4_1, // DCT_FLIPADST + fadst8x8_sse4_1, // FLIPADST_FLIPADST + fadst8x8_sse4_1, // ADST_FLIPADST + fadst8x8_sse4_1, // FLIPADST_ADST + NULL, // IDTX + NULL, // V_DCT + NULL, // H_DCT + NULL, // V_ADST + NULL, // H_ADST + NULL, // V_FLIPADST + NULL // H_FLIPADST +}; + +static const fwd_transform_1d_sse4_1 row_highbd_txfm8x16_arr[TX_TYPES] = { + fdct16x16_sse4_1, // DCT_DCT + fdct16x16_sse4_1, // ADST_DCT + fadst16x16_sse4_1, // DCT_ADST + fadst16x16_sse4_1, // ADST_ADST + fdct16x16_sse4_1, // FLIPADST_DCT + fadst16x16_sse4_1, // DCT_FLIPADST + fadst16x16_sse4_1, // FLIPADST_FLIPADST + fadst16x16_sse4_1, // ADST_FLIPADST + fadst16x16_sse4_1, // FLIPADST_ADST + NULL, // IDTX + NULL, // V_DCT + NULL, // H_DCT + NULL, // V_ADST + NULL, // H_ADST + NULL, // V_FLIPADST + NULL // H_FLIPADST +}; + +static const fwd_transform_1d_sse4_1 col_highbd_txfm8x16_arr[TX_TYPES] = { + fdct16x16_sse4_1, // DCT_DCT + fadst16x16_sse4_1, // ADST_DCT + fdct16x16_sse4_1, // DCT_ADST + fadst16x16_sse4_1, // ADST_ADST + fadst16x16_sse4_1, // FLIPADST_DCT + fdct16x16_sse4_1, // DCT_FLIPADST + fadst16x16_sse4_1, // FLIPADST_FLIPADST + fadst16x16_sse4_1, // ADST_FLIPADST + fadst16x16_sse4_1, // FLIPADST_ADST + NULL, // IDTX + NULL, // V_DCT + NULL, // H_DCT + NULL, // V_ADST + NULL, // H_ADST + NULL, // V_FLIPADST + NULL // H_FLIPADST +}; +static const fwd_transform_1d_sse4_1 row_highbd_txfm8x8_arr[TX_TYPES] = { + fdct8x8_sse4_1, // DCT_DCT + fdct8x8_sse4_1, // ADST_DCT + fadst8x8_sse4_1, // DCT_ADST + fadst8x8_sse4_1, // ADST_ADST + fdct8x8_sse4_1, // FLIPADST_DCT + fadst8x8_sse4_1, // DCT_FLIPADST + fadst8x8_sse4_1, // FLIPADST_FLIPADST + fadst8x8_sse4_1, // ADST_FLIPADST + fadst8x8_sse4_1, // FLIPADST_ADST + NULL, // IDTX + NULL, // V_DCT + NULL, // H_DCT + NULL, // V_ADST + NULL, // H_ADST + NULL, // V_FLIPADST + NULL // H_FLIPADST +}; + +void av1_fwd_txfm2d_16x8_sse4_1(const int16_t *input, int32_t *coeff, + int stride, TX_TYPE tx_type, int bd) { + __m128i in[32], out[32]; + const int8_t *shift = fwd_txfm_shift_ls[TX_16X8]; + const int txw_idx = get_txw_idx(TX_16X8); + const int txh_idx = get_txh_idx(TX_16X8); + const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm8x8_arr[tx_type]; + const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm8x16_arr[tx_type]; + int bit = fwd_cos_bit_col[txw_idx][txh_idx]; + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + for (int i = 0; i < 2; i++) { + load_buffer_8x8(input + i * 8, in, stride, ud_flip, 0, shift[0]); + col_txfm(in, in, bit, 0); + col_txfm_8x8_rounding(in, -shift[1]); + transpose_8x8(in, out + i * 16); + } + + if (lr_flip) { + flip_buf_sse4_1(in, out, 32); + row_txfm(in, out, bit, 2); + } else { + row_txfm(out, out, bit, 2); + } + + for (int i = 0; i < 2; i++) { + transpose_8x8(out + i * 16, in); + av1_round_shift_rect_array_32_sse4_1(in, in, 16, -shift[2], NewSqrt2); + write_buffer_16x8(in, coeff + i * 8, 16); + } + + (void)bd; +} + +void av1_fwd_txfm2d_8x16_sse4_1(const int16_t *input, int32_t *coeff, + int stride, TX_TYPE tx_type, int bd) { + __m128i in[32], out[32]; + const int8_t *shift = fwd_txfm_shift_ls[TX_8X16]; + const int txw_idx = get_txw_idx(TX_8X16); + const int txh_idx = get_txh_idx(TX_8X16); + const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm8x16_arr[tx_type]; + const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm8x8_arr[tx_type]; + int bit = fwd_cos_bit_col[txw_idx][txh_idx]; + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + load_buffer_8x16(input, in, stride, ud_flip, lr_flip, shift[0]); + col_txfm(in, in, bit, 2); + col_txfm_8x16_rounding(in, -shift[1]); + transpose_8x8(in, out); + transpose_8x8(in + 16, out + 16); + + for (int i = 0; i < 2; i++) { + row_txfm(out + i * 16, out, bit, 0); + transpose_8x8(out, in); + av1_round_shift_rect_array_32_sse4_1(in, in, 16, -shift[2], NewSqrt2); + write_buffer_8x8(in, coeff + i * 64); + } + + (void)bd; +} diff --git a/third_party/aom/av1/encoder/x86/pickrst_avx2.c b/third_party/aom/av1/encoder/x86/pickrst_avx2.c new file mode 100644 index 000000000..06aaaa7ee --- /dev/null +++ b/third_party/aom/av1/encoder/x86/pickrst_avx2.c @@ -0,0 +1,403 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <immintrin.h> // AVX2 +#include "aom_dsp/x86/synonyms.h" +#include "aom_dsp/x86/synonyms_avx2.h" +#include "aom_dsp/x86/transpose_sse2.h" + +#include "config/av1_rtcd.h" +#include "av1/common/restoration.h" +#include "av1/encoder/pickrst.h" + +static INLINE void acc_stat_avx2(int32_t *dst, const uint8_t *src, + const __m128i *shuffle, const __m256i *kl) { + const __m128i s = _mm_shuffle_epi8(xx_loadu_128(src), *shuffle); + const __m256i d0 = _mm256_madd_epi16(*kl, _mm256_cvtepu8_epi16(s)); + const __m256i dst0 = yy_loadu_256(dst); + const __m256i r0 = _mm256_add_epi32(dst0, d0); + yy_storeu_256(dst, r0); +} + +static INLINE void acc_stat_win7_one_line_avx2( + const uint8_t *dgd, const uint8_t *src, int h_start, int h_end, + int dgd_stride, const __m128i *shuffle, int32_t *sumX, + int32_t sumY[WIENER_WIN][WIENER_WIN], int32_t M_int[WIENER_WIN][WIENER_WIN], + int32_t H_int[WIENER_WIN2][WIENER_WIN * 8]) { + int j, k, l; + const int wiener_win = WIENER_WIN; + for (j = h_start; j < h_end; j += 2) { + const uint8_t X1 = src[j]; + const uint8_t X2 = src[j + 1]; + *sumX += X1 + X2; + const uint8_t *dgd_ij = dgd + j; + for (k = 0; k < wiener_win; k++) { + const uint8_t *dgd_ijk = dgd_ij + k * dgd_stride; + for (l = 0; l < wiener_win; l++) { + int32_t *H_ = &H_int[(l * wiener_win + k)][0]; + const uint8_t D1 = dgd_ijk[l]; + const uint8_t D2 = dgd_ijk[l + 1]; + sumY[k][l] += D1 + D2; + M_int[k][l] += D1 * X1 + D2 * X2; + + const __m256i kl = + _mm256_cvtepu8_epi16(_mm_set1_epi16(*((uint16_t *)(dgd_ijk + l)))); + acc_stat_avx2(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &kl); + acc_stat_avx2(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &kl); + acc_stat_avx2(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &kl); + acc_stat_avx2(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle, &kl); + acc_stat_avx2(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle, &kl); + acc_stat_avx2(H_ + 5 * 8, dgd_ij + 5 * dgd_stride, shuffle, &kl); + acc_stat_avx2(H_ + 6 * 8, dgd_ij + 6 * dgd_stride, shuffle, &kl); + } + } + } +} + +static INLINE void compute_stats_win7_opt_avx2( + const uint8_t *dgd, const uint8_t *src, int h_start, int h_end, int v_start, + int v_end, int dgd_stride, int src_stride, double *M, double *H) { + int i, j, k, l, m, n; + const int wiener_win = WIENER_WIN; + const int pixel_count = (h_end - h_start) * (v_end - v_start); + const int wiener_win2 = wiener_win * wiener_win; + const int wiener_halfwin = (wiener_win >> 1); + const double avg = + find_average(dgd, h_start, h_end, v_start, v_end, dgd_stride); + + int32_t M_int32[WIENER_WIN][WIENER_WIN] = { { 0 } }; + int64_t M_int64[WIENER_WIN][WIENER_WIN] = { { 0 } }; + int32_t H_int32[WIENER_WIN2][WIENER_WIN * 8] = { { 0 } }; + int64_t H_int64[WIENER_WIN2][WIENER_WIN * 8] = { { 0 } }; + int32_t sumY[WIENER_WIN][WIENER_WIN] = { { 0 } }; + int32_t sumX = 0; + const uint8_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin; + + const __m128i shuffle = xx_loadu_128(g_shuffle_stats_data); + for (j = v_start; j < v_end; j += 64) { + const int vert_end = AOMMIN(64, v_end - j) + j; + for (i = j; i < vert_end; i++) { + acc_stat_win7_one_line_avx2( + dgd_win + i * dgd_stride, src + i * src_stride, h_start, h_end, + dgd_stride, &shuffle, &sumX, sumY, M_int32, H_int32); + } + for (k = 0; k < wiener_win; ++k) { + for (l = 0; l < wiener_win; ++l) { + M_int64[k][l] += M_int32[k][l]; + M_int32[k][l] = 0; + } + } + for (k = 0; k < WIENER_WIN2; ++k) { + for (l = 0; l < WIENER_WIN * 8; ++l) { + H_int64[k][l] += H_int32[k][l]; + H_int32[k][l] = 0; + } + } + } + + const double avg_square_sum = avg * avg * pixel_count; + for (k = 0; k < wiener_win; k++) { + for (l = 0; l < wiener_win; l++) { + const int32_t idx0 = l * wiener_win + k; + M[idx0] = M_int64[k][l] + avg_square_sum - avg * (sumX + sumY[k][l]); + double *H_ = H + idx0 * wiener_win2; + int64_t *H_int_ = &H_int64[idx0][0]; + for (m = 0; m < wiener_win; m++) { + for (n = 0; n < wiener_win; n++) { + H_[m * wiener_win + n] = H_int_[n * 8 + m] + avg_square_sum - + avg * (sumY[k][l] + sumY[n][m]); + } + } + } + } +} + +static INLINE void acc_stat_win5_one_line_avx2( + const uint8_t *dgd, const uint8_t *src, int h_start, int h_end, + int dgd_stride, const __m128i *shuffle, int32_t *sumX, + int32_t sumY[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA], + int32_t M_int[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA], + int32_t H_int[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8]) { + int j, k, l; + const int wiener_win = WIENER_WIN_CHROMA; + for (j = h_start; j < h_end; j += 2) { + const uint8_t X1 = src[j]; + const uint8_t X2 = src[j + 1]; + *sumX += X1 + X2; + const uint8_t *dgd_ij = dgd + j; + for (k = 0; k < wiener_win; k++) { + const uint8_t *dgd_ijk = dgd_ij + k * dgd_stride; + for (l = 0; l < wiener_win; l++) { + int32_t *H_ = &H_int[(l * wiener_win + k)][0]; + const uint8_t D1 = dgd_ijk[l]; + const uint8_t D2 = dgd_ijk[l + 1]; + sumY[k][l] += D1 + D2; + M_int[k][l] += D1 * X1 + D2 * X2; + + const __m256i kl = + _mm256_cvtepu8_epi16(_mm_set1_epi16(*((uint16_t *)(dgd_ijk + l)))); + acc_stat_avx2(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &kl); + acc_stat_avx2(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &kl); + acc_stat_avx2(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &kl); + acc_stat_avx2(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle, &kl); + acc_stat_avx2(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle, &kl); + } + } + } +} + +static INLINE void compute_stats_win5_opt_avx2( + const uint8_t *dgd, const uint8_t *src, int h_start, int h_end, int v_start, + int v_end, int dgd_stride, int src_stride, double *M, double *H) { + int i, j, k, l, m, n; + const int wiener_win = WIENER_WIN_CHROMA; + const int pixel_count = (h_end - h_start) * (v_end - v_start); + const int wiener_win2 = wiener_win * wiener_win; + const int wiener_halfwin = (wiener_win >> 1); + const double avg = + find_average(dgd, h_start, h_end, v_start, v_end, dgd_stride); + + int32_t M_int32[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } }; + int64_t M_int64[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } }; + int32_t H_int32[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8] = { { 0 } }; + int64_t H_int64[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8] = { { 0 } }; + int32_t sumY[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } }; + int32_t sumX = 0; + const uint8_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin; + + const __m128i shuffle = xx_loadu_128(g_shuffle_stats_data); + for (j = v_start; j < v_end; j += 64) { + const int vert_end = AOMMIN(64, v_end - j) + j; + for (i = j; i < vert_end; i++) { + acc_stat_win5_one_line_avx2( + dgd_win + i * dgd_stride, src + i * src_stride, h_start, h_end, + dgd_stride, &shuffle, &sumX, sumY, M_int32, H_int32); + } + for (k = 0; k < wiener_win; ++k) { + for (l = 0; l < wiener_win; ++l) { + M_int64[k][l] += M_int32[k][l]; + M_int32[k][l] = 0; + } + } + for (k = 0; k < WIENER_WIN2_CHROMA; ++k) { + for (l = 0; l < WIENER_WIN_CHROMA * 8; ++l) { + H_int64[k][l] += H_int32[k][l]; + H_int32[k][l] = 0; + } + } + } + + const double avg_square_sum = avg * avg * pixel_count; + for (k = 0; k < wiener_win; k++) { + for (l = 0; l < wiener_win; l++) { + const int32_t idx0 = l * wiener_win + k; + M[idx0] = M_int64[k][l] + avg_square_sum - avg * (sumX + sumY[k][l]); + double *H_ = H + idx0 * wiener_win2; + int64_t *H_int_ = &H_int64[idx0][0]; + for (m = 0; m < wiener_win; m++) { + for (n = 0; n < wiener_win; n++) { + H_[m * wiener_win + n] = H_int_[n * 8 + m] + avg_square_sum - + avg * (sumY[k][l] + sumY[n][m]); + } + } + } + } +} + +void av1_compute_stats_avx2(int wiener_win, const uint8_t *dgd, + const uint8_t *src, int h_start, int h_end, + int v_start, int v_end, int dgd_stride, + int src_stride, double *M, double *H) { + if (wiener_win == WIENER_WIN) { + compute_stats_win7_opt_avx2(dgd, src, h_start, h_end, v_start, v_end, + dgd_stride, src_stride, M, H); + } else if (wiener_win == WIENER_WIN_CHROMA) { + compute_stats_win5_opt_avx2(dgd, src, h_start, h_end, v_start, v_end, + dgd_stride, src_stride, M, H); + } else { + av1_compute_stats_c(wiener_win, dgd, src, h_start, h_end, v_start, v_end, + dgd_stride, src_stride, M, H); + } +} + +static INLINE __m256i pair_set_epi16(uint16_t a, uint16_t b) { + return _mm256_set1_epi32( + (int32_t)(((uint16_t)(a)) | (((uint32_t)(b)) << 16))); +} + +int64_t av1_lowbd_pixel_proj_error_avx2( + const uint8_t *src8, int width, int height, int src_stride, + const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, + int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params) { + int i, j, k; + const int32_t shift = SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS; + const __m256i rounding = _mm256_set1_epi32(1 << (shift - 1)); + __m256i sum64 = _mm256_setzero_si256(); + const uint8_t *src = src8; + const uint8_t *dat = dat8; + int64_t err = 0; + if (params->r[0] > 0 && params->r[1] > 0) { + __m256i xq_coeff = pair_set_epi16(xq[0], xq[1]); + for (i = 0; i < height; ++i) { + __m256i sum32 = _mm256_setzero_si256(); + for (j = 0; j <= width - 16; j += 16) { + const __m256i d0 = _mm256_cvtepu8_epi16(xx_loadu_128(dat + j)); + const __m256i s0 = _mm256_cvtepu8_epi16(xx_loadu_128(src + j)); + const __m256i flt0_16b = _mm256_permute4x64_epi64( + _mm256_packs_epi32(yy_loadu_256(flt0 + j), + yy_loadu_256(flt0 + j + 8)), + 0xd8); + const __m256i flt1_16b = _mm256_permute4x64_epi64( + _mm256_packs_epi32(yy_loadu_256(flt1 + j), + yy_loadu_256(flt1 + j + 8)), + 0xd8); + const __m256i u0 = _mm256_slli_epi16(d0, SGRPROJ_RST_BITS); + const __m256i flt0_0_sub_u = _mm256_sub_epi16(flt0_16b, u0); + const __m256i flt1_0_sub_u = _mm256_sub_epi16(flt1_16b, u0); + const __m256i v0 = _mm256_madd_epi16( + xq_coeff, _mm256_unpacklo_epi16(flt0_0_sub_u, flt1_0_sub_u)); + const __m256i v1 = _mm256_madd_epi16( + xq_coeff, _mm256_unpackhi_epi16(flt0_0_sub_u, flt1_0_sub_u)); + const __m256i vr0 = + _mm256_srai_epi32(_mm256_add_epi32(v0, rounding), shift); + const __m256i vr1 = + _mm256_srai_epi32(_mm256_add_epi32(v1, rounding), shift); + const __m256i e0 = _mm256_sub_epi16( + _mm256_add_epi16(_mm256_packs_epi32(vr0, vr1), d0), s0); + const __m256i err0 = _mm256_madd_epi16(e0, e0); + sum32 = _mm256_add_epi32(sum32, err0); + } + for (k = j; k < width; ++k) { + const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS); + int32_t v = xq[0] * (flt0[k] - u) + xq[1] * (flt1[k] - u); + const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k]; + err += e * e; + } + dat += dat_stride; + src += src_stride; + flt0 += flt0_stride; + flt1 += flt1_stride; + const __m256i sum64_0 = + _mm256_cvtepi32_epi64(_mm256_castsi256_si128(sum32)); + const __m256i sum64_1 = + _mm256_cvtepi32_epi64(_mm256_extracti128_si256(sum32, 1)); + sum64 = _mm256_add_epi64(sum64, sum64_0); + sum64 = _mm256_add_epi64(sum64, sum64_1); + } + } else if (params->r[0] > 0) { + __m256i xq_coeff = + pair_set_epi16(xq[0], (-xq[0] * (1 << SGRPROJ_RST_BITS))); + for (i = 0; i < height; ++i) { + __m256i sum32 = _mm256_setzero_si256(); + for (j = 0; j <= width - 16; j += 16) { + const __m256i d0 = _mm256_cvtepu8_epi16(xx_loadu_128(dat + j)); + const __m256i s0 = _mm256_cvtepu8_epi16(xx_loadu_128(src + j)); + const __m256i flt0_16b = _mm256_permute4x64_epi64( + _mm256_packs_epi32(yy_loadu_256(flt0 + j), + yy_loadu_256(flt0 + j + 8)), + 0xd8); + const __m256i v0 = + _mm256_madd_epi16(xq_coeff, _mm256_unpacklo_epi16(flt0_16b, d0)); + const __m256i v1 = + _mm256_madd_epi16(xq_coeff, _mm256_unpackhi_epi16(flt0_16b, d0)); + const __m256i vr0 = + _mm256_srai_epi32(_mm256_add_epi32(v0, rounding), shift); + const __m256i vr1 = + _mm256_srai_epi32(_mm256_add_epi32(v1, rounding), shift); + const __m256i e0 = _mm256_sub_epi16( + _mm256_add_epi16(_mm256_packs_epi32(vr0, vr1), d0), s0); + const __m256i err0 = _mm256_madd_epi16(e0, e0); + sum32 = _mm256_add_epi32(sum32, err0); + } + for (k = j; k < width; ++k) { + const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS); + int32_t v = xq[0] * (flt0[k] - u); + const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k]; + err += e * e; + } + dat += dat_stride; + src += src_stride; + flt0 += flt0_stride; + const __m256i sum64_0 = + _mm256_cvtepi32_epi64(_mm256_castsi256_si128(sum32)); + const __m256i sum64_1 = + _mm256_cvtepi32_epi64(_mm256_extracti128_si256(sum32, 1)); + sum64 = _mm256_add_epi64(sum64, sum64_0); + sum64 = _mm256_add_epi64(sum64, sum64_1); + } + } else if (params->r[1] > 0) { + __m256i xq_coeff = pair_set_epi16(xq[1], -(xq[1] << SGRPROJ_RST_BITS)); + for (i = 0; i < height; ++i) { + __m256i sum32 = _mm256_setzero_si256(); + for (j = 0; j <= width - 16; j += 16) { + const __m256i d0 = _mm256_cvtepu8_epi16(xx_loadu_128(dat + j)); + const __m256i s0 = _mm256_cvtepu8_epi16(xx_loadu_128(src + j)); + const __m256i flt1_16b = _mm256_permute4x64_epi64( + _mm256_packs_epi32(yy_loadu_256(flt1 + j), + yy_loadu_256(flt1 + j + 8)), + 0xd8); + const __m256i v0 = + _mm256_madd_epi16(xq_coeff, _mm256_unpacklo_epi16(flt1_16b, d0)); + const __m256i v1 = + _mm256_madd_epi16(xq_coeff, _mm256_unpackhi_epi16(flt1_16b, d0)); + const __m256i vr0 = + _mm256_srai_epi32(_mm256_add_epi32(v0, rounding), shift); + const __m256i vr1 = + _mm256_srai_epi32(_mm256_add_epi32(v1, rounding), shift); + const __m256i e0 = _mm256_sub_epi16( + _mm256_add_epi16(_mm256_packs_epi32(vr0, vr1), d0), s0); + const __m256i err0 = _mm256_madd_epi16(e0, e0); + sum32 = _mm256_add_epi32(sum32, err0); + } + for (k = j; k < width; ++k) { + const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS); + int32_t v = xq[1] * (flt1[k] - u); + const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k]; + err += e * e; + } + dat += dat_stride; + src += src_stride; + flt1 += flt1_stride; + const __m256i sum64_0 = + _mm256_cvtepi32_epi64(_mm256_castsi256_si128(sum32)); + const __m256i sum64_1 = + _mm256_cvtepi32_epi64(_mm256_extracti128_si256(sum32, 1)); + sum64 = _mm256_add_epi64(sum64, sum64_0); + sum64 = _mm256_add_epi64(sum64, sum64_1); + } + } else { + __m256i sum32 = _mm256_setzero_si256(); + for (i = 0; i < height; ++i) { + for (j = 0; j <= width - 16; j += 16) { + const __m256i d0 = _mm256_cvtepu8_epi16(xx_loadu_128(dat + j)); + const __m256i s0 = _mm256_cvtepu8_epi16(xx_loadu_128(src + j)); + const __m256i diff0 = _mm256_sub_epi16(d0, s0); + const __m256i err0 = _mm256_madd_epi16(diff0, diff0); + sum32 = _mm256_add_epi32(sum32, err0); + } + for (k = j; k < width; ++k) { + const int32_t e = (int32_t)(dat[k]) - src[k]; + err += e * e; + } + dat += dat_stride; + src += src_stride; + } + const __m256i sum64_0 = + _mm256_cvtepi32_epi64(_mm256_castsi256_si128(sum32)); + const __m256i sum64_1 = + _mm256_cvtepi32_epi64(_mm256_extracti128_si256(sum32, 1)); + sum64 = _mm256_add_epi64(sum64_0, sum64_1); + } + int64_t sum[4]; + yy_storeu_256(sum, sum64); + err += sum[0] + sum[1] + sum[2] + sum[3]; + return err; +} diff --git a/third_party/aom/av1/encoder/x86/pickrst_sse4.c b/third_party/aom/av1/encoder/x86/pickrst_sse4.c new file mode 100644 index 000000000..04e4d1afc --- /dev/null +++ b/third_party/aom/av1/encoder/x86/pickrst_sse4.c @@ -0,0 +1,389 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <assert.h> +#include <emmintrin.h> +#include "aom_dsp/x86/synonyms.h" + +#include "config/av1_rtcd.h" +#include "av1/common/restoration.h" +#include "av1/encoder/pickrst.h" + +static INLINE void acc_stat_sse41(int32_t *dst, const uint8_t *src, + const __m128i *shuffle, const __m128i *kl) { + const __m128i s = _mm_shuffle_epi8(xx_loadu_128(src), *shuffle); + const __m128i d0 = _mm_madd_epi16(*kl, _mm_cvtepu8_epi16(s)); + const __m128i d1 = + _mm_madd_epi16(*kl, _mm_cvtepu8_epi16(_mm_srli_si128(s, 8))); + const __m128i dst0 = xx_loadu_128(dst); + const __m128i dst1 = xx_loadu_128(dst + 4); + const __m128i r0 = _mm_add_epi32(dst0, d0); + const __m128i r1 = _mm_add_epi32(dst1, d1); + xx_storeu_128(dst, r0); + xx_storeu_128(dst + 4, r1); +} + +static INLINE void acc_stat_win7_one_line_sse4_1( + const uint8_t *dgd, const uint8_t *src, int h_start, int h_end, + int dgd_stride, const __m128i *shuffle, int32_t *sumX, + int32_t sumY[WIENER_WIN][WIENER_WIN], int32_t M_int[WIENER_WIN][WIENER_WIN], + int32_t H_int[WIENER_WIN2][WIENER_WIN * 8]) { + const int wiener_win = 7; + int j, k, l; + for (j = h_start; j < h_end; j += 2) { + const uint8_t *dgd_ij = dgd + j; + const uint8_t X1 = src[j]; + const uint8_t X2 = src[j + 1]; + *sumX += X1 + X2; + for (k = 0; k < wiener_win; k++) { + const uint8_t *dgd_ijk = dgd_ij + k * dgd_stride; + for (l = 0; l < wiener_win; l++) { + int32_t *H_ = &H_int[(l * wiener_win + k)][0]; + const uint8_t D1 = dgd_ijk[l]; + const uint8_t D2 = dgd_ijk[l + 1]; + sumY[k][l] += D1 + D2; + M_int[k][l] += D1 * X1 + D2 * X2; + + const __m128i kl = + _mm_cvtepu8_epi16(_mm_set1_epi16(*((uint16_t *)(dgd_ijk + l)))); + acc_stat_sse41(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &kl); + acc_stat_sse41(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &kl); + acc_stat_sse41(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &kl); + acc_stat_sse41(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle, &kl); + acc_stat_sse41(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle, &kl); + acc_stat_sse41(H_ + 5 * 8, dgd_ij + 5 * dgd_stride, shuffle, &kl); + acc_stat_sse41(H_ + 6 * 8, dgd_ij + 6 * dgd_stride, shuffle, &kl); + } + } + } +} + +static INLINE void compute_stats_win7_opt_sse4_1( + const uint8_t *dgd, const uint8_t *src, int h_start, int h_end, int v_start, + int v_end, int dgd_stride, int src_stride, double *M, double *H) { + int i, j, k, l, m, n; + const int wiener_win = WIENER_WIN; + const int pixel_count = (h_end - h_start) * (v_end - v_start); + const int wiener_win2 = wiener_win * wiener_win; + const int wiener_halfwin = (wiener_win >> 1); + const double avg = + find_average(dgd, h_start, h_end, v_start, v_end, dgd_stride); + + int32_t M_int32[WIENER_WIN][WIENER_WIN] = { { 0 } }; + int64_t M_int64[WIENER_WIN][WIENER_WIN] = { { 0 } }; + int32_t H_int32[WIENER_WIN2][WIENER_WIN * 8] = { { 0 } }; + int64_t H_int64[WIENER_WIN2][WIENER_WIN * 8] = { { 0 } }; + int32_t sumY[WIENER_WIN][WIENER_WIN] = { { 0 } }; + int32_t sumX = 0; + const uint8_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin; + + const __m128i shuffle = xx_loadu_128(g_shuffle_stats_data); + for (j = v_start; j < v_end; j += 64) { + const int vert_end = AOMMIN(64, v_end - j) + j; + for (i = j; i < vert_end; i++) { + acc_stat_win7_one_line_sse4_1( + dgd_win + i * dgd_stride, src + i * src_stride, h_start, h_end, + dgd_stride, &shuffle, &sumX, sumY, M_int32, H_int32); + } + for (k = 0; k < wiener_win; ++k) { + for (l = 0; l < wiener_win; ++l) { + M_int64[k][l] += M_int32[k][l]; + M_int32[k][l] = 0; + } + } + for (k = 0; k < WIENER_WIN2; ++k) { + for (l = 0; l < WIENER_WIN * 8; ++l) { + H_int64[k][l] += H_int32[k][l]; + H_int32[k][l] = 0; + } + } + } + + const double avg_square_sum = avg * avg * pixel_count; + for (k = 0; k < wiener_win; k++) { + for (l = 0; l < wiener_win; l++) { + const int32_t idx0 = l * wiener_win + k; + M[idx0] = M_int64[k][l] + avg_square_sum - avg * (sumX + sumY[k][l]); + double *H_ = H + idx0 * wiener_win2; + int64_t *H_int_ = &H_int64[idx0][0]; + for (m = 0; m < wiener_win; m++) { + for (n = 0; n < wiener_win; n++) { + H_[m * wiener_win + n] = H_int_[n * 8 + m] + avg_square_sum - + avg * (sumY[k][l] + sumY[n][m]); + } + } + } + } +} + +static INLINE void acc_stat_win5_one_line_sse4_1( + const uint8_t *dgd, const uint8_t *src, int h_start, int h_end, + int dgd_stride, const __m128i *shuffle, int32_t *sumX, + int32_t sumY[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA], + int32_t M_int[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA], + int32_t H_int[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8]) { + const int wiener_win = WIENER_WIN_CHROMA; + int j, k, l; + for (j = h_start; j < h_end; j += 2) { + const uint8_t *dgd_ij = dgd + j; + const uint8_t X1 = src[j]; + const uint8_t X2 = src[j + 1]; + *sumX += X1 + X2; + for (k = 0; k < wiener_win; k++) { + const uint8_t *dgd_ijk = dgd_ij + k * dgd_stride; + for (l = 0; l < wiener_win; l++) { + int32_t *H_ = &H_int[(l * wiener_win + k)][0]; + const uint8_t D1 = dgd_ijk[l]; + const uint8_t D2 = dgd_ijk[l + 1]; + sumY[k][l] += D1 + D2; + M_int[k][l] += D1 * X1 + D2 * X2; + + const __m128i kl = + _mm_cvtepu8_epi16(_mm_set1_epi16(*((uint16_t *)(dgd_ijk + l)))); + acc_stat_sse41(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &kl); + acc_stat_sse41(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &kl); + acc_stat_sse41(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &kl); + acc_stat_sse41(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle, &kl); + acc_stat_sse41(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle, &kl); + } + } + } +} + +static INLINE void compute_stats_win5_opt_sse4_1( + const uint8_t *dgd, const uint8_t *src, int h_start, int h_end, int v_start, + int v_end, int dgd_stride, int src_stride, double *M, double *H) { + int i, j, k, l, m, n; + const int wiener_win = WIENER_WIN_CHROMA; + const int pixel_count = (h_end - h_start) * (v_end - v_start); + const int wiener_win2 = wiener_win * wiener_win; + const int wiener_halfwin = (wiener_win >> 1); + const double avg = + find_average(dgd, h_start, h_end, v_start, v_end, dgd_stride); + + int32_t M_int32[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } }; + int64_t M_int64[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } }; + int32_t H_int32[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8] = { { 0 } }; + int64_t H_int64[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8] = { { 0 } }; + int32_t sumY[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } }; + int32_t sumX = 0; + const uint8_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin; + + const __m128i shuffle = xx_loadu_128(g_shuffle_stats_data); + for (j = v_start; j < v_end; j += 64) { + const int vert_end = AOMMIN(64, v_end - j) + j; + for (i = j; i < vert_end; i++) { + acc_stat_win5_one_line_sse4_1( + dgd_win + i * dgd_stride, src + i * src_stride, h_start, h_end, + dgd_stride, &shuffle, &sumX, sumY, M_int32, H_int32); + } + for (k = 0; k < wiener_win; ++k) { + for (l = 0; l < wiener_win; ++l) { + M_int64[k][l] += M_int32[k][l]; + M_int32[k][l] = 0; + } + } + for (k = 0; k < WIENER_WIN_CHROMA * WIENER_WIN_CHROMA; ++k) { + for (l = 0; l < WIENER_WIN_CHROMA * 8; ++l) { + H_int64[k][l] += H_int32[k][l]; + H_int32[k][l] = 0; + } + } + } + + const double avg_square_sum = avg * avg * pixel_count; + for (k = 0; k < wiener_win; k++) { + for (l = 0; l < wiener_win; l++) { + const int32_t idx0 = l * wiener_win + k; + M[idx0] = M_int64[k][l] + avg_square_sum - avg * (sumX + sumY[k][l]); + double *H_ = H + idx0 * wiener_win2; + int64_t *H_int_ = &H_int64[idx0][0]; + for (m = 0; m < wiener_win; m++) { + for (n = 0; n < wiener_win; n++) { + H_[m * wiener_win + n] = H_int_[n * 8 + m] + avg_square_sum - + avg * (sumY[k][l] + sumY[n][m]); + } + } + } + } +} +void av1_compute_stats_sse4_1(int wiener_win, const uint8_t *dgd, + const uint8_t *src, int h_start, int h_end, + int v_start, int v_end, int dgd_stride, + int src_stride, double *M, double *H) { + if (wiener_win == WIENER_WIN) { + compute_stats_win7_opt_sse4_1(dgd, src, h_start, h_end, v_start, v_end, + dgd_stride, src_stride, M, H); + } else if (wiener_win == WIENER_WIN_CHROMA) { + compute_stats_win5_opt_sse4_1(dgd, src, h_start, h_end, v_start, v_end, + dgd_stride, src_stride, M, H); + } else { + av1_compute_stats_c(wiener_win, dgd, src, h_start, h_end, v_start, v_end, + dgd_stride, src_stride, M, H); + } +} + +static INLINE __m128i pair_set_epi16(uint16_t a, uint16_t b) { + return _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(b)) << 16))); +} + +int64_t av1_lowbd_pixel_proj_error_sse4_1( + const uint8_t *src8, int width, int height, int src_stride, + const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, + int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params) { + int i, j, k; + const int32_t shift = SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS; + const __m128i rounding = _mm_set1_epi32(1 << (shift - 1)); + __m128i sum64 = _mm_setzero_si128(); + const uint8_t *src = src8; + const uint8_t *dat = dat8; + int64_t err = 0; + if (params->r[0] > 0 && params->r[1] > 0) { + __m128i xq_coeff = pair_set_epi16(xq[0], xq[1]); + for (i = 0; i < height; ++i) { + __m128i sum32 = _mm_setzero_si128(); + for (j = 0; j < width - 8; j += 8) { + const __m128i d0 = _mm_cvtepu8_epi16(xx_loadl_64(dat + j)); + const __m128i s0 = _mm_cvtepu8_epi16(xx_loadl_64(src + j)); + const __m128i flt0_16b = + _mm_packs_epi32(xx_loadu_128(flt0 + j), xx_loadu_128(flt0 + j + 4)); + const __m128i flt1_16b = + _mm_packs_epi32(xx_loadu_128(flt1 + j), xx_loadu_128(flt1 + j + 4)); + const __m128i u0 = _mm_slli_epi16(d0, SGRPROJ_RST_BITS); + const __m128i flt0_0_sub_u = _mm_sub_epi16(flt0_16b, u0); + const __m128i flt1_0_sub_u = _mm_sub_epi16(flt1_16b, u0); + const __m128i v0 = _mm_madd_epi16( + xq_coeff, _mm_unpacklo_epi16(flt0_0_sub_u, flt1_0_sub_u)); + const __m128i v1 = _mm_madd_epi16( + xq_coeff, _mm_unpackhi_epi16(flt0_0_sub_u, flt1_0_sub_u)); + const __m128i vr0 = _mm_srai_epi32(_mm_add_epi32(v0, rounding), shift); + const __m128i vr1 = _mm_srai_epi32(_mm_add_epi32(v1, rounding), shift); + const __m128i e0 = + _mm_sub_epi16(_mm_add_epi16(_mm_packs_epi32(vr0, vr1), d0), s0); + const __m128i err0 = _mm_madd_epi16(e0, e0); + sum32 = _mm_add_epi32(sum32, err0); + } + for (k = j; k < width; ++k) { + const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS); + int32_t v = xq[0] * (flt0[k] - u) + xq[1] * (flt1[k] - u); + const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k]; + err += e * e; + } + dat += dat_stride; + src += src_stride; + flt0 += flt0_stride; + flt1 += flt1_stride; + const __m128i sum64_0 = _mm_cvtepi32_epi64(sum32); + const __m128i sum64_1 = _mm_cvtepi32_epi64(_mm_srli_si128(sum32, 8)); + sum64 = _mm_add_epi64(sum64, sum64_0); + sum64 = _mm_add_epi64(sum64, sum64_1); + } + } else if (params->r[0] > 0) { + __m128i xq_coeff = pair_set_epi16(xq[0], -(xq[0] << SGRPROJ_RST_BITS)); + for (i = 0; i < height; ++i) { + __m128i sum32 = _mm_setzero_si128(); + for (j = 0; j < width - 8; j += 8) { + const __m128i d0 = _mm_cvtepu8_epi16(xx_loadl_64(dat + j)); + const __m128i s0 = _mm_cvtepu8_epi16(xx_loadl_64(src + j)); + const __m128i flt0_16b = + _mm_packs_epi32(xx_loadu_128(flt0 + j), xx_loadu_128(flt0 + j + 4)); + const __m128i v0 = + _mm_madd_epi16(xq_coeff, _mm_unpacklo_epi16(flt0_16b, d0)); + const __m128i v1 = + _mm_madd_epi16(xq_coeff, _mm_unpackhi_epi16(flt0_16b, d0)); + const __m128i vr0 = _mm_srai_epi32(_mm_add_epi32(v0, rounding), shift); + const __m128i vr1 = _mm_srai_epi32(_mm_add_epi32(v1, rounding), shift); + const __m128i e0 = + _mm_sub_epi16(_mm_add_epi16(_mm_packs_epi32(vr0, vr1), d0), s0); + const __m128i err0 = _mm_madd_epi16(e0, e0); + sum32 = _mm_add_epi32(sum32, err0); + } + for (k = j; k < width; ++k) { + const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS); + int32_t v = xq[0] * (flt0[k] - u); + const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k]; + err += e * e; + } + dat += dat_stride; + src += src_stride; + flt0 += flt0_stride; + const __m128i sum64_0 = _mm_cvtepi32_epi64(sum32); + const __m128i sum64_1 = _mm_cvtepi32_epi64(_mm_srli_si128(sum32, 8)); + sum64 = _mm_add_epi64(sum64, sum64_0); + sum64 = _mm_add_epi64(sum64, sum64_1); + } + } else if (params->r[1] > 0) { + __m128i xq_coeff = pair_set_epi16(xq[1], -(xq[1] << SGRPROJ_RST_BITS)); + for (i = 0; i < height; ++i) { + __m128i sum32 = _mm_setzero_si128(); + for (j = 0; j < width - 8; j += 8) { + const __m128i d0 = _mm_cvtepu8_epi16(xx_loadl_64(dat + j)); + const __m128i s0 = _mm_cvtepu8_epi16(xx_loadl_64(src + j)); + const __m128i flt1_16b = + _mm_packs_epi32(xx_loadu_128(flt1 + j), xx_loadu_128(flt1 + j + 4)); + const __m128i v0 = + _mm_madd_epi16(xq_coeff, _mm_unpacklo_epi16(flt1_16b, d0)); + const __m128i v1 = + _mm_madd_epi16(xq_coeff, _mm_unpackhi_epi16(flt1_16b, d0)); + const __m128i vr0 = _mm_srai_epi32(_mm_add_epi32(v0, rounding), shift); + const __m128i vr1 = _mm_srai_epi32(_mm_add_epi32(v1, rounding), shift); + const __m128i e0 = + _mm_sub_epi16(_mm_add_epi16(_mm_packs_epi32(vr0, vr1), d0), s0); + const __m128i err0 = _mm_madd_epi16(e0, e0); + sum32 = _mm_add_epi32(sum32, err0); + } + for (k = j; k < width; ++k) { + const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS); + int32_t v = xq[1] * (flt1[k] - u); + const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k]; + err += e * e; + } + dat += dat_stride; + src += src_stride; + flt1 += flt1_stride; + const __m128i sum64_0 = _mm_cvtepi32_epi64(sum32); + const __m128i sum64_1 = _mm_cvtepi32_epi64(_mm_srli_si128(sum32, 8)); + sum64 = _mm_add_epi64(sum64, sum64_0); + sum64 = _mm_add_epi64(sum64, sum64_1); + } + } else { + __m128i sum32 = _mm_setzero_si128(); + for (i = 0; i < height; ++i) { + for (j = 0; j < width - 16; j += 16) { + const __m128i d = xx_loadu_128(dat + j); + const __m128i s = xx_loadu_128(src + j); + const __m128i d0 = _mm_cvtepu8_epi16(d); + const __m128i d1 = _mm_cvtepu8_epi16(_mm_srli_si128(d, 8)); + const __m128i s0 = _mm_cvtepu8_epi16(s); + const __m128i s1 = _mm_cvtepu8_epi16(_mm_srli_si128(s, 8)); + const __m128i diff0 = _mm_sub_epi16(d0, s0); + const __m128i diff1 = _mm_sub_epi16(d1, s1); + const __m128i err0 = _mm_madd_epi16(diff0, diff0); + const __m128i err1 = _mm_madd_epi16(diff1, diff1); + sum32 = _mm_add_epi32(sum32, err0); + sum32 = _mm_add_epi32(sum32, err1); + } + for (k = j; k < width; ++k) { + const int32_t e = (int32_t)(dat[k]) - src[k]; + err += e * e; + } + dat += dat_stride; + src += src_stride; + } + const __m128i sum64_0 = _mm_cvtepi32_epi64(sum32); + const __m128i sum64_1 = _mm_cvtepi32_epi64(_mm_srli_si128(sum32, 8)); + sum64 = _mm_add_epi64(sum64_0, sum64_1); + } + int64_t sum[2]; + xx_storeu_128(sum, sum64); + err += sum[0] + sum[1]; + return err; +} diff --git a/third_party/aom/av1/encoder/x86/wedge_utils_avx2.c b/third_party/aom/av1/encoder/x86/wedge_utils_avx2.c index f776e84c7..2a792f14e 100644 --- a/third_party/aom/av1/encoder/x86/wedge_utils_avx2.c +++ b/third_party/aom/av1/encoder/x86/wedge_utils_avx2.c @@ -14,7 +14,7 @@ #include <smmintrin.h> #include "aom_dsp/x86/synonyms.h" - +#include "aom_dsp/x86/synonyms_avx2.h" #include "aom/aom_integer.h" #include "av1/common/reconinter.h" @@ -31,7 +31,7 @@ uint64_t av1_wedge_sse_from_residuals_avx2(const int16_t *r1, const int16_t *d, uint64_t csse; const __m256i v_mask_max_w = _mm256_set1_epi16(MAX_MASK_VALUE); - const __m256i v_zext_q = _mm256_set1_epi64x(0xffffffff); + const __m256i v_zext_q = yy_set1_64_from_32i(0xffffffff); __m256i v_acc0_q = _mm256_setzero_si256(); |