summaryrefslogtreecommitdiff
path: root/third_party/aom/av1/encoder
diff options
context:
space:
mode:
authortrav90 <travawine@palemoon.org>2018-10-15 21:45:30 -0500
committertrav90 <travawine@palemoon.org>2018-10-15 21:45:30 -0500
commit9f611df3f188082ac5b9b3d87d2cc866c56a00ff (patch)
treed960f017cd7eba3f125b7e8a813789ee2e076310 /third_party/aom/av1/encoder
parent349df80246557244b6529f6a4e6685e5c537b045 (diff)
downloaduxp-9f611df3f188082ac5b9b3d87d2cc866c56a00ff.tar.gz
Import aom library
This is the reference implementation for the Alliance for Open Media's AV1 video codec. The commit used was 4d668d7feb1f8abd809d1bca0418570a7f142a36.
Diffstat (limited to 'third_party/aom/av1/encoder')
-rw-r--r--third_party/aom/av1/encoder/aq_complexity.c163
-rw-r--r--third_party/aom/av1/encoder/aq_complexity.h37
-rw-r--r--third_party/aom/av1/encoder/aq_cyclicrefresh.c566
-rw-r--r--third_party/aom/av1/encoder/aq_cyclicrefresh.h98
-rw-r--r--third_party/aom/av1/encoder/aq_variance.c207
-rw-r--r--third_party/aom/av1/encoder/aq_variance.h31
-rw-r--r--third_party/aom/av1/encoder/arm/neon/dct_neon.c36
-rw-r--r--third_party/aom/av1/encoder/arm/neon/error_neon.c42
-rw-r--r--third_party/aom/av1/encoder/arm/neon/quantize_neon.c118
-rw-r--r--third_party/aom/av1/encoder/av1_quantize.c1790
-rw-r--r--third_party/aom/av1/encoder/av1_quantize.h184
-rw-r--r--third_party/aom/av1/encoder/bitstream.c5399
-rw-r--r--third_party/aom/av1/encoder/bitstream.h53
-rw-r--r--third_party/aom/av1/encoder/block.h241
-rw-r--r--third_party/aom/av1/encoder/blockiness.c142
-rw-r--r--third_party/aom/av1/encoder/context_tree.c331
-rw-r--r--third_party/aom/av1/encoder/context_tree.h111
-rw-r--r--third_party/aom/av1/encoder/corner_detect.c37
-rw-r--r--third_party/aom/av1/encoder/corner_detect.h22
-rw-r--r--third_party/aom/av1/encoder/corner_match.c193
-rw-r--r--third_party/aom/av1/encoder/corner_match.h29
-rw-r--r--third_party/aom/av1/encoder/cost.c67
-rw-r--r--third_party/aom/av1/encoder/cost.h63
-rw-r--r--third_party/aom/av1/encoder/daala_compat_enc.c30
-rw-r--r--third_party/aom/av1/encoder/dct.c2228
-rw-r--r--third_party/aom/av1/encoder/encint.h51
-rw-r--r--third_party/aom/av1/encoder/encodeframe.c7160
-rw-r--r--third_party/aom/av1/encoder/encodeframe.h58
-rw-r--r--third_party/aom/av1/encoder/encodemb.c1671
-rw-r--r--third_party/aom/av1/encoder/encodemb.h92
-rw-r--r--third_party/aom/av1/encoder/encodemv.c497
-rw-r--r--third_party/aom/av1/encoder/encodemv.h43
-rw-r--r--third_party/aom/av1/encoder/encoder.c5980
-rw-r--r--third_party/aom/av1/encoder/encoder.h883
-rw-r--r--third_party/aom/av1/encoder/encodetxb.c784
-rw-r--r--third_party/aom/av1/encoder/encodetxb.h53
-rw-r--r--third_party/aom/av1/encoder/ethread.c176
-rw-r--r--third_party/aom/av1/encoder/ethread.h34
-rw-r--r--third_party/aom/av1/encoder/extend.c192
-rw-r--r--third_party/aom/av1/encoder/extend.h32
-rw-r--r--third_party/aom/av1/encoder/firstpass.c3026
-rw-r--r--third_party/aom/av1/encoder/firstpass.h202
-rw-r--r--third_party/aom/av1/encoder/generic_encoder.c157
-rw-r--r--third_party/aom/av1/encoder/global_motion.c319
-rw-r--r--third_party/aom/av1/encoder/global_motion.h62
-rw-r--r--third_party/aom/av1/encoder/hybrid_fwd_txfm.c499
-rw-r--r--third_party/aom/av1/encoder/hybrid_fwd_txfm.h44
-rw-r--r--third_party/aom/av1/encoder/laplace_encoder.c107
-rw-r--r--third_party/aom/av1/encoder/lookahead.c225
-rw-r--r--third_party/aom/av1/encoder/lookahead.h114
-rw-r--r--third_party/aom/av1/encoder/mbgraph.c398
-rw-r--r--third_party/aom/av1/encoder/mbgraph.h39
-rw-r--r--third_party/aom/av1/encoder/mcomp.c3493
-rw-r--r--third_party/aom/av1/encoder/mcomp.h163
-rw-r--r--third_party/aom/av1/encoder/mips/msa/error_msa.c108
-rw-r--r--third_party/aom/av1/encoder/mips/msa/fdct16x16_msa.c436
-rw-r--r--third_party/aom/av1/encoder/mips/msa/fdct4x4_msa.c98
-rw-r--r--third_party/aom/av1/encoder/mips/msa/fdct8x8_msa.c65
-rw-r--r--third_party/aom/av1/encoder/mips/msa/fdct_msa.h117
-rw-r--r--third_party/aom/av1/encoder/mips/msa/temporal_filter_msa.c284
-rw-r--r--third_party/aom/av1/encoder/palette.c277
-rw-r--r--third_party/aom/av1/encoder/palette.h73
-rw-r--r--third_party/aom/av1/encoder/pickcdef.c490
-rw-r--r--third_party/aom/av1/encoder/picklpf.c211
-rw-r--r--third_party/aom/av1/encoder/picklpf.h32
-rw-r--r--third_party/aom/av1/encoder/pickrst.c1269
-rw-r--r--third_party/aom/av1/encoder/pickrst.h30
-rw-r--r--third_party/aom/av1/encoder/pvq_encoder.c988
-rw-r--r--third_party/aom/av1/encoder/pvq_encoder.h53
-rw-r--r--third_party/aom/av1/encoder/ransac.c1210
-rw-r--r--third_party/aom/av1/encoder/ransac.h44
-rw-r--r--third_party/aom/av1/encoder/ratectrl.c1759
-rw-r--r--third_party/aom/av1/encoder/ratectrl.h284
-rw-r--r--third_party/aom/av1/encoder/ratectrl_xiph.c1244
-rw-r--r--third_party/aom/av1/encoder/ratectrl_xiph.h200
-rw-r--r--third_party/aom/av1/encoder/rd.c1204
-rw-r--r--third_party/aom/av1/encoder/rd.h505
-rw-r--r--third_party/aom/av1/encoder/rdopt.c12713
-rw-r--r--third_party/aom/av1/encoder/rdopt.h142
-rw-r--r--third_party/aom/av1/encoder/segmentation.c394
-rw-r--r--third_party/aom/av1/encoder/segmentation.h51
-rw-r--r--third_party/aom/av1/encoder/speed_features.c506
-rw-r--r--third_party/aom/av1/encoder/speed_features.h484
-rw-r--r--third_party/aom/av1/encoder/subexp.c282
-rw-r--r--third_party/aom/av1/encoder/subexp.h49
-rw-r--r--third_party/aom/av1/encoder/temporal_filter.c719
-rw-r--r--third_party/aom/av1/encoder/temporal_filter.h25
-rw-r--r--third_party/aom/av1/encoder/tokenize.c887
-rw-r--r--third_party/aom/av1/encoder/tokenize.h151
-rw-r--r--third_party/aom/av1/encoder/treewriter.c59
-rw-r--r--third_party/aom/av1/encoder/treewriter.h42
-rw-r--r--third_party/aom/av1/encoder/variance_tree.c61
-rw-r--r--third_party/aom/av1/encoder/variance_tree.h96
-rw-r--r--third_party/aom/av1/encoder/wedge_utils.c125
-rw-r--r--third_party/aom/av1/encoder/x86/av1_highbd_quantize_sse4.c193
-rw-r--r--third_party/aom/av1/encoder/x86/av1_quantize_sse2.c211
-rw-r--r--third_party/aom/av1/encoder/x86/av1_quantize_ssse3_x86_64.asm204
-rw-r--r--third_party/aom/av1/encoder/x86/av1_ssim_opt_x86_64.asm219
-rw-r--r--third_party/aom/av1/encoder/x86/dct_intrin_sse2.c3884
-rw-r--r--third_party/aom/av1/encoder/x86/dct_sse2.asm87
-rw-r--r--third_party/aom/av1/encoder/x86/dct_ssse3.c469
-rw-r--r--third_party/aom/av1/encoder/x86/error_intrin_avx2.c73
-rw-r--r--third_party/aom/av1/encoder/x86/error_sse2.asm125
-rw-r--r--third_party/aom/av1/encoder/x86/highbd_block_error_intrin_sse2.c72
-rw-r--r--third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c1895
-rw-r--r--third_party/aom/av1/encoder/x86/hybrid_fwd_txfm_avx2.c1678
-rw-r--r--third_party/aom/av1/encoder/x86/temporal_filter_apply_sse2.asm215
-rw-r--r--third_party/aom/av1/encoder/x86/wedge_utils_sse2.c254
108 files changed, 76118 insertions, 0 deletions
diff --git a/third_party/aom/av1/encoder/aq_complexity.c b/third_party/aom/av1/encoder/aq_complexity.c
new file mode 100644
index 0000000000..054b0e062b
--- /dev/null
+++ b/third_party/aom/av1/encoder/aq_complexity.c
@@ -0,0 +1,163 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <limits.h>
+#include <math.h>
+
+#include "av1/encoder/aq_complexity.h"
+#include "av1/encoder/aq_variance.h"
+#include "av1/encoder/encodeframe.h"
+#include "av1/common/seg_common.h"
+#include "av1/encoder/segmentation.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_ports/system_state.h"
+
+#define AQ_C_SEGMENTS 5
+#define DEFAULT_AQ2_SEG 3 // Neutral Q segment
+#define AQ_C_STRENGTHS 3
+static const double aq_c_q_adj_factor[AQ_C_STRENGTHS][AQ_C_SEGMENTS] = {
+ { 1.75, 1.25, 1.05, 1.00, 0.90 },
+ { 2.00, 1.50, 1.15, 1.00, 0.85 },
+ { 2.50, 1.75, 1.25, 1.00, 0.80 }
+};
+static const double aq_c_transitions[AQ_C_STRENGTHS][AQ_C_SEGMENTS] = {
+ { 0.15, 0.30, 0.55, 2.00, 100.0 },
+ { 0.20, 0.40, 0.65, 2.00, 100.0 },
+ { 0.25, 0.50, 0.75, 2.00, 100.0 }
+};
+static const double aq_c_var_thresholds[AQ_C_STRENGTHS][AQ_C_SEGMENTS] = {
+ { -4.0, -3.0, -2.0, 100.00, 100.0 },
+ { -3.5, -2.5, -1.5, 100.00, 100.0 },
+ { -3.0, -2.0, -1.0, 100.00, 100.0 }
+};
+
+#define DEFAULT_COMPLEXITY 64
+
+static int get_aq_c_strength(int q_index, aom_bit_depth_t bit_depth) {
+ // Approximate base quantizer (truncated to int).
+ const int base_quant = av1_ac_quant(q_index, 0, bit_depth) / 4;
+ return (base_quant > 10) + (base_quant > 25);  // Strength index 0, 1 or 2.
+}
+
+void av1_setup_in_frame_q_adj(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ struct segmentation *const seg = &cm->seg;
+
+ // Make SURE use of floating point in this function is safe.
+ aom_clear_system_state();
+
+ if (frame_is_intra_only(cm) || cm->error_resilient_mode ||
+ cpi->refresh_alt_ref_frame ||
+ (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref)) {
+ int segment;
+ const int aq_strength = get_aq_c_strength(cm->base_qindex, cm->bit_depth);
+
+ // Reset every entry of the segment map to the neutral segment.
+ memset(cpi->segmentation_map, DEFAULT_AQ2_SEG, cm->mi_rows * cm->mi_cols);
+
+ av1_clearall_segfeatures(seg);
+
+ // Segmentation only makes sense if the target bits per SB is above a
+ // threshold. Below this the overheads will usually outweigh any benefit.
+ if (cpi->rc.sb64_target_rate < 256) {
+ av1_disable_segmentation(seg);
+ return;
+ }
+
+ av1_enable_segmentation(seg);
+
+ // Select delta coding method.
+ seg->abs_delta = SEGMENT_DELTADATA;
+
+ // Default segment "Q" feature is disabled so it defaults to the baseline Q.
+ av1_disable_segfeature(seg, DEFAULT_AQ2_SEG, SEG_LVL_ALT_Q);
+
+ // Use some of the segments for in frame Q adjustment.
+ for (segment = 0; segment < AQ_C_SEGMENTS; ++segment) {
+ int qindex_delta;
+
+ if (segment == DEFAULT_AQ2_SEG) continue;
+
+ qindex_delta = av1_compute_qdelta_by_rate(
+ &cpi->rc, cm->frame_type, cm->base_qindex,
+ aq_c_q_adj_factor[aq_strength][segment], cm->bit_depth);
+
+ // For AQ complexity mode, we don't allow Q0 in a segment if the base
+ // Q is not 0. Q0 (lossless) implies 4x4 only and in AQ mode 2 a segment
+ // Q delta is sometimes applied without going back around the rd loop.
+ // This could lead to an illegal combination of partition size and q.
+ if ((cm->base_qindex != 0) && ((cm->base_qindex + qindex_delta) == 0)) {
+ qindex_delta = -cm->base_qindex + 1;  // Land on qindex 1 instead of 0.
+ }
+ if ((cm->base_qindex + qindex_delta) > 0) {
+ av1_enable_segfeature(seg, segment, SEG_LVL_ALT_Q);
+ av1_set_segdata(seg, segment, SEG_LVL_ALT_Q, qindex_delta);
+ }
+ }
+ }
+}
+
+#define DEFAULT_LV_THRESH 10.0
+#define MIN_DEFAULT_LV_THRESH 8.0
+#define VAR_STRENGTH_STEP 0.25
+// Select a segment for the current block.
+// The choice of segment for a block depends on the ratio of the projected
+// bits for the block vs a target average and its spatial complexity.
+void av1_caq_select_segment(const AV1_COMP *cpi, MACROBLOCK *mb, BLOCK_SIZE bs,
+ int mi_row, int mi_col, int projected_rate) {
+ const AV1_COMMON *const cm = &cpi->common;
+
+ const int mi_offset = mi_row * cm->mi_cols + mi_col;
+ const int xmis = AOMMIN(cm->mi_cols - mi_col, mi_size_wide[bs]);
+ const int ymis = AOMMIN(cm->mi_rows - mi_row, mi_size_high[bs]);
+ int x, y;
+ int i;
+ unsigned char segment;
+
+ if (0) {  // Never taken: the else branch below always runs.
+ segment = DEFAULT_AQ2_SEG;
+ } else {
+ // Rate depends on fraction of a SB64 in frame (xmis * ymis / bw * bh).
+ // It is converted to bits * 256 units.
+ const int64_t num = (int64_t)cpi->rc.sb64_target_rate * xmis * ymis * 256;
+ const int denom = cm->mib_size * cm->mib_size;
+ const int target_rate = (int)(num / denom);
+ double logvar;
+ double low_var_thresh;
+ const int aq_strength = get_aq_c_strength(cm->base_qindex, cm->bit_depth);
+
+ aom_clear_system_state();
+ low_var_thresh = (cpi->oxcf.pass == 2) ? AOMMAX(cpi->twopass.mb_av_energy,
+ MIN_DEFAULT_LV_THRESH)
+ : DEFAULT_LV_THRESH;
+
+ av1_setup_src_planes(mb, cpi->source, mi_row, mi_col);
+ logvar = av1_log_block_var(cpi, mb, bs);
+
+ segment = AQ_C_SEGMENTS - 1; // Fallback if no threshold pair matches below.
+ for (i = 0; i < AQ_C_SEGMENTS; ++i) {
+ // Test rate against a threshold value and variance against a threshold.
+ // Increasing segment number (higher variance and complexity) = higher Q.
+ if ((projected_rate < target_rate * aq_c_transitions[aq_strength][i]) &&
+ (logvar < (low_var_thresh + aq_c_var_thresholds[aq_strength][i]))) {
+ segment = i;  // The first (lowest) matching segment wins.
+ break;
+ }
+ }
+ }
+
+ // Fill in the entries in the segment map corresponding to this block.
+ for (y = 0; y < ymis; y++) {
+ for (x = 0; x < xmis; x++) {
+ cpi->segmentation_map[mi_offset + y * cm->mi_cols + x] = segment;
+ }
+ }
+}
diff --git a/third_party/aom/av1/encoder/aq_complexity.h b/third_party/aom/av1/encoder/aq_complexity.h
new file mode 100644
index 0000000000..af525b36de
--- /dev/null
+++ b/third_party/aom/av1/encoder/aq_complexity.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_ENCODER_AQ_COMPLEXITY_H_
+#define AV1_ENCODER_AQ_COMPLEXITY_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "av1/common/enums.h"
+
+struct AV1_COMP;
+struct macroblock;
+
+// Select a segment for the current Block.
+void av1_caq_select_segment(const struct AV1_COMP *cpi, struct macroblock *,
+ BLOCK_SIZE bs, int mi_row, int mi_col,
+ int projected_rate);
+
+// This function sets up a set of segments with delta Q values around
+// the baseline frame quantizer.
+void av1_setup_in_frame_q_adj(struct AV1_COMP *cpi);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AV1_ENCODER_AQ_COMPLEXITY_H_
diff --git a/third_party/aom/av1/encoder/aq_cyclicrefresh.c b/third_party/aom/av1/encoder/aq_cyclicrefresh.c
new file mode 100644
index 0000000000..e41c608b64
--- /dev/null
+++ b/third_party/aom/av1/encoder/aq_cyclicrefresh.c
@@ -0,0 +1,566 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <limits.h>
+#include <math.h>
+
+#include "av1/common/seg_common.h"
+#include "av1/encoder/aq_cyclicrefresh.h"
+#include "av1/encoder/ratectrl.h"
+#include "av1/encoder/segmentation.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_ports/system_state.h"
+
+struct CYCLIC_REFRESH {
+ // Percentage of blocks per frame that are targeted as candidates
+ // for cyclic refresh.
+ int percent_refresh;
+ // Maximum q-delta as percentage of base q.
+ int max_qdelta_perc;
+ // Superblock starting index for cycling through the frame.
+ int sb_index;
+ // Controls how long block will need to wait to be refreshed again, in
+ // excess of the cycle time, i.e., in the case of all zero motion, block
+ // will be refreshed every (100/percent_refresh + time_for_refresh) frames.
+ int time_for_refresh;
+ // Target number of (8x8) blocks that are set for delta-q.
+ int target_num_seg_blocks;
+ // Actual number of (8x8) blocks that were applied delta-q.
+ int actual_num_seg1_blocks;
+ int actual_num_seg2_blocks;
+ // RD mult. parameters for segment 1.
+ int rdmult;
+ // Cyclic refresh map.
+ signed char *map;
+ // Map of the last q a block was coded at.
+ uint8_t *last_coded_q_map;
+ // Thresholds applied to the projected rate/distortion of the coding block,
+ // when deciding whether block should be refreshed.
+ int64_t thresh_rate_sb;
+ int64_t thresh_dist_sb;
+ // Threshold applied to the motion vector (in units of 1/8 pel) of the
+ // coding block, when deciding whether block should be refreshed.
+ int16_t motion_thresh;
+ // Rate target ratio to set q delta.
+ double rate_ratio_qdelta;
+ // Boost factor for rate target ratio, for segment CR_SEGMENT_ID_BOOST2.
+ int rate_boost_fac;
+ double low_content_avg;  // Smoothed per-frame low-content fraction.
+ int qindex_delta[3];  // Q-index delta, indexed by segment id.
+};
+
+CYCLIC_REFRESH *av1_cyclic_refresh_alloc(int mi_rows, int mi_cols) {
+ size_t last_coded_q_map_size;
+ CYCLIC_REFRESH *const cr = aom_calloc(1, sizeof(*cr));
+ if (cr == NULL) return NULL;
+
+ cr->map = aom_calloc(mi_rows * mi_cols, sizeof(*cr->map));
+ if (cr->map == NULL) {
+ av1_cyclic_refresh_free(cr);  // Also releases cr itself.
+ return NULL;
+ }
+ last_coded_q_map_size = mi_rows * mi_cols * sizeof(*cr->last_coded_q_map);
+ cr->last_coded_q_map = aom_malloc(last_coded_q_map_size);
+ if (cr->last_coded_q_map == NULL) {
+ av1_cyclic_refresh_free(cr);  // Releases cr->map and cr.
+ return NULL;
+ }
+ assert(MAXQ <= 255);  // Must fit the uint8_t entries set by memset below.
+ memset(cr->last_coded_q_map, MAXQ, last_coded_q_map_size);
+
+ return cr;
+}
+
+void av1_cyclic_refresh_free(CYCLIC_REFRESH *cr) {  // cr must not be NULL.
+ aom_free(cr->map);
+ aom_free(cr->last_coded_q_map);
+ aom_free(cr);
+}
+
+// Check if we should turn off cyclic refresh based on bitrate condition.
+static int apply_cyclic_refresh_bitrate(const AV1_COMMON *cm,
+ const RATE_CONTROL *rc) {
+ // Turn off cyclic refresh if bits available per frame is not sufficiently
+ // larger than bit cost of segmentation. Segment map bit cost should scale
+ // with number of seg blocks, so compare available bits to number of blocks.
+ // Average bits available per frame = avg_frame_bandwidth
+ // Number of (8x8) blocks in frame = mi_rows * mi_cols;
+ const float factor = 0.25;
+ const int number_blocks = cm->mi_rows * cm->mi_cols;
+ // The condition below corresponds to turning off at target bitrates:
+ // (at 30fps), ~12kbps for CIF, 36kbps for VGA, 100kbps for HD/720p.
+ // Also turn off at very small frame sizes, to avoid too large fraction of
+ // superblocks to be refreshed per frame. Threshold below is less than QCIF.
+ if (rc->avg_frame_bandwidth < factor * number_blocks ||
+ number_blocks / 64 < 5)
+ return 0;  // Cyclic refresh should be turned off.
+ else
+ return 1;  // Cyclic refresh may be applied.
+}
+
+// Check if this coding block, of size bsize, should be considered for refresh
+// (lower-qp coding). Decision can be based on various factors, such as
+// size of the coding block (i.e., below min_block size rejected), coding
+// mode, and rate/distortion.
+static int candidate_refresh_aq(const CYCLIC_REFRESH *cr,
+ const MB_MODE_INFO *mbmi, int64_t rate,
+ int64_t dist, int bsize) {
+ MV mv = mbmi->mv[0].as_mv;
+ // Reject the block for lower-qp coding if projected distortion
+ // is above the threshold, and any of the following is true:
+ // 1) mode uses large mv
+ // 2) mode is an intra-mode
+ // Otherwise accept for refresh (as BOOST2 or BOOST1, decided below).
+ if (dist > cr->thresh_dist_sb &&
+ (mv.row > cr->motion_thresh || mv.row < -cr->motion_thresh ||
+ mv.col > cr->motion_thresh || mv.col < -cr->motion_thresh ||
+ !is_inter_block(mbmi)))
+ return CR_SEGMENT_ID_BASE;
+ else if (bsize >= BLOCK_16X16 && rate < cr->thresh_rate_sb &&
+ is_inter_block(mbmi) && mbmi->mv[0].as_int == 0 &&
+ cr->rate_boost_fac > 10)
+ // More aggressive delta-q for bigger blocks with zero motion.
+ return CR_SEGMENT_ID_BOOST2;
+ else
+ return CR_SEGMENT_ID_BOOST1;
+}
+
+// Compute the segment delta-q, capping any decrease at max_qdelta_perc% of q.
+static int compute_deltaq(const AV1_COMP *cpi, int q, double rate_factor) {
+ const CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+ const RATE_CONTROL *const rc = &cpi->rc;
+ int deltaq = av1_compute_qdelta_by_rate(rc, cpi->common.frame_type, q,
+ rate_factor, cpi->common.bit_depth);
+ if ((-deltaq) > cr->max_qdelta_perc * q / 100) {
+ deltaq = -cr->max_qdelta_perc * q / 100;
+ }
+ return deltaq;
+}
+
+// For the just encoded frame, estimate the bits, incorporating the delta-q
+// from the non-base segments (1 and 2), each weighted by its share of the
+// frame's 8x8 blocks. Note this function is called in the postencode
+// (called from rc_update_rate_correction_factors()).
+int av1_cyclic_refresh_estimate_bits_at_q(const AV1_COMP *cpi,
+ double correction_factor) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+ int estimated_bits;
+ int mbs = cm->MBs;
+ int num8x8bl = mbs << 2;  // Four times the MB count.
+ // Weight for non-base segments: use actual number of blocks refreshed in
+ // previous/just encoded frame. Note number of blocks here is in 8x8 units.
+ double weight_segment1 = (double)cr->actual_num_seg1_blocks / num8x8bl;
+ double weight_segment2 = (double)cr->actual_num_seg2_blocks / num8x8bl;
+ // Take segment weighted average for estimated bits.
+ estimated_bits =
+ (int)((1.0 - weight_segment1 - weight_segment2) *
+ av1_estimate_bits_at_q(cm->frame_type, cm->base_qindex, mbs,
+ correction_factor, cm->bit_depth) +
+ weight_segment1 *
+ av1_estimate_bits_at_q(cm->frame_type,
+ cm->base_qindex + cr->qindex_delta[1],
+ mbs, correction_factor, cm->bit_depth) +
+ weight_segment2 *
+ av1_estimate_bits_at_q(cm->frame_type,
+ cm->base_qindex + cr->qindex_delta[2],
+ mbs, correction_factor, cm->bit_depth));
+ return estimated_bits;
+}
+
+// Prior to encoding the frame, estimate the bits per mb, for a given q = i and
+// a corresponding delta-q (for segment 1). This function is called in the
+// rc_regulate_q() to set the base qp index.
+// Note: the segment map is set to either 0/CR_SEGMENT_ID_BASE (no refresh) or
+// to 1/CR_SEGMENT_ID_BOOST1 (refresh) for each superblock, prior to encoding.
+int av1_cyclic_refresh_rc_bits_per_mb(const AV1_COMP *cpi, int i,
+ double correction_factor) {
+ const AV1_COMMON *const cm = &cpi->common;
+ CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+ int bits_per_mb;
+ int num8x8bl = cm->MBs << 2;  // Four times the MB count.
+ // Weight for segment prior to encoding: take the average of the target
+ // number for the frame to be encoded and the actual from the previous frame.
+ double weight_segment =
+ (double)((cr->target_num_seg_blocks + cr->actual_num_seg1_blocks +
+ cr->actual_num_seg2_blocks) >>
+ 1) /
+ num8x8bl;
+ // Compute delta-q corresponding to qindex i.
+ int deltaq = compute_deltaq(cpi, i, cr->rate_ratio_qdelta);
+ // Take segment weighted average for bits per mb.
+ bits_per_mb = (int)((1.0 - weight_segment) *
+ av1_rc_bits_per_mb(cm->frame_type, i,
+ correction_factor, cm->bit_depth) +
+ weight_segment *
+ av1_rc_bits_per_mb(cm->frame_type, i + deltaq,
+ correction_factor, cm->bit_depth));
+ return bits_per_mb;
+}
+
+// Prior to coding a given prediction block, of size bsize at (mi_row, mi_col),
+// check if we should reset the segment_id, and update the cyclic_refresh map
+// and segmentation map.
+void av1_cyclic_refresh_update_segment(const AV1_COMP *cpi,
+ MB_MODE_INFO *const mbmi, int mi_row,
+ int mi_col, BLOCK_SIZE bsize,
+ int64_t rate, int64_t dist, int skip) {
+ const AV1_COMMON *const cm = &cpi->common;
+ CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+ const int bw = mi_size_wide[bsize];
+ const int bh = mi_size_high[bsize];
+ const int xmis = AOMMIN(cm->mi_cols - mi_col, bw);
+ const int ymis = AOMMIN(cm->mi_rows - mi_row, bh);
+ const int block_index = mi_row * cm->mi_cols + mi_col;
+ const int refresh_this_block =
+ candidate_refresh_aq(cr, mbmi, rate, dist, bsize);
+ // Default is to not update the refresh map.
+ int new_map_value = cr->map[block_index];
+ int x = 0;
+ int y = 0;
+
+ // If this block is labeled for refresh, check if we should reset the
+ // segment_id.
+ if (cyclic_refresh_segment_id_boosted(mbmi->segment_id)) {
+ mbmi->segment_id = refresh_this_block;
+ // Reset segment_id to the base segment if the block will be skipped.
+ if (skip) mbmi->segment_id = CR_SEGMENT_ID_BASE;
+ }
+
+ // Update the cyclic refresh map, to be used for setting segmentation map
+ // for the next frame. If the block will be refreshed this frame, mark it
+ // as clean. The magnitude of the negative value influences how long before
+ // we consider it for refresh again.
+ if (cyclic_refresh_segment_id_boosted(mbmi->segment_id)) {
+ new_map_value = -cr->time_for_refresh;
+ } else if (refresh_this_block) {
+ // Else if it is accepted as candidate for refresh, and has not already
+ // been refreshed (marked as 1) then mark it as a candidate for cleanup
+ // for future time (marked as 0), otherwise don't update it.
+ if (cr->map[block_index] == 1) new_map_value = 0;
+ } else {
+ // Leave it marked as block that is not candidate for refresh.
+ new_map_value = 1;
+ }
+
+ // Update entries in the cyclic refresh map with new_map_value, and
+ // copy mbmi->segment_id into global segmentation map.
+ for (y = 0; y < ymis; y++)
+ for (x = 0; x < xmis; x++) {
+ int map_offset = block_index + y * cm->mi_cols + x;
+ cr->map[map_offset] = new_map_value;
+ cpi->segmentation_map[map_offset] = mbmi->segment_id;
+ // Inter skip blocks were clearly not coded at the current qindex, so
+ // don't update the map for them. For cases where motion is non-zero or
+ // the reference frame isn't the previous frame, the previous value in
+ // the map for this spatial location is not entirely correct.
+ if ((!is_inter_block(mbmi) || !skip) &&
+ mbmi->segment_id <= CR_SEGMENT_ID_BOOST2) {
+ cr->last_coded_q_map[map_offset] = clamp(
+ cm->base_qindex + cr->qindex_delta[mbmi->segment_id], 0, MAXQ);
+ } else if (is_inter_block(mbmi) && skip &&
+ mbmi->segment_id <= CR_SEGMENT_ID_BOOST2) {
+ cr->last_coded_q_map[map_offset] =
+ AOMMIN(clamp(cm->base_qindex + cr->qindex_delta[mbmi->segment_id],
+ 0, MAXQ),
+ cr->last_coded_q_map[map_offset]);
+ }
+ }
+}
+
+// Recount the blocks labeled BOOST1 and BOOST2 in the segmentation map.
+void av1_cyclic_refresh_postencode(AV1_COMP *const cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+ unsigned char *const seg_map = cpi->segmentation_map;
+ int mi_row, mi_col;
+ cr->actual_num_seg1_blocks = 0;
+ cr->actual_num_seg2_blocks = 0;
+ for (mi_row = 0; mi_row < cm->mi_rows; mi_row++)
+ for (mi_col = 0; mi_col < cm->mi_cols; mi_col++) {
+ if (cyclic_refresh_segment_id(seg_map[mi_row * cm->mi_cols + mi_col]) ==
+ CR_SEGMENT_ID_BOOST1)
+ cr->actual_num_seg1_blocks++;
+ else if (cyclic_refresh_segment_id(
+ seg_map[mi_row * cm->mi_cols + mi_col]) ==
+ CR_SEGMENT_ID_BOOST2)
+ cr->actual_num_seg2_blocks++;
+ }
+}
+
+// Set golden frame update interval, for 1 pass CBR mode.
+void av1_cyclic_refresh_set_golden_update(AV1_COMP *const cpi) {
+ RATE_CONTROL *const rc = &cpi->rc;
+ CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+ // Set minimum gf_interval for GF update to a multiple (== 4) of refresh
+ // period. Depending on past encoding stats, GF flag may be reset and update
+ // may not occur until next baseline_gf_interval.
+ if (cr->percent_refresh > 0)
+ rc->baseline_gf_interval = 4 * (100 / cr->percent_refresh);
+ else
+ rc->baseline_gf_interval = 40;  // Fixed interval when refresh is off.
+}
+
+// Update some encoding stats (from the just encoded frame). If this frame's
+// background has high motion, refresh the golden frame. Otherwise, if the
+// golden reference is to be updated check if we should NOT update the golden
+// ref. Writes cpi->refresh_golden_frame, rc gf fields and cr->low_content_avg.
+void av1_cyclic_refresh_check_golden_update(AV1_COMP *const cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+ int mi_row, mi_col;
+ double fraction_low = 0.0;
+ int low_content_frame = 0;
+
+ MODE_INFO **mi;
+ RATE_CONTROL *const rc = &cpi->rc;
+ const int rows = cm->mi_rows, cols = cm->mi_cols;
+ int cnt1 = 0, cnt2 = 0;  // cnt1: |mv| <= 16 blocks; cnt2: zero-mv blocks.
+ int force_gf_refresh = 0;
+
+ for (mi_row = 0; mi_row < rows; mi_row++) {
+ mi = cm->mi_grid_visible + mi_row * cm->mi_stride;
+
+ for (mi_col = 0; mi_col < cols; mi_col++) {
+ int16_t abs_mvr = mi[0]->mbmi.mv[0].as_mv.row >= 0
+ ? mi[0]->mbmi.mv[0].as_mv.row
+ : -1 * mi[0]->mbmi.mv[0].as_mv.row;
+ int16_t abs_mvc = mi[0]->mbmi.mv[0].as_mv.col >= 0
+ ? mi[0]->mbmi.mv[0].as_mv.col
+ : -1 * mi[0]->mbmi.mv[0].as_mv.col;
+
+ // Calculate the motion of the background.
+ if (abs_mvr <= 16 && abs_mvc <= 16) {
+ cnt1++;
+ if (abs_mvr == 0 && abs_mvc == 0) cnt2++;
+ }
+ mi++;
+
+ // Count blocks whose refresh-map value is below 1 (low content).
+ if (cr->map[mi_row * cols + mi_col] < 1) low_content_frame++;
+ }
+ }
+
+ // For video conference clips, if the background has high motion in current
+ // frame because of the camera movement, set this frame as the golden frame.
+ // Thresholds: cnt1 above 70% of all blocks, and cnt2 below 5% of cnt1.
+ // Also, force this frame as a golden update frame if this frame will change
+ // the resolution (resize_pending != 0).
+ if (cpi->resize_pending != 0 ||
+ (cnt1 * 100 > (70 * rows * cols) && cnt2 * 20 < cnt1)) {
+ av1_cyclic_refresh_set_golden_update(cpi);
+ rc->frames_till_gf_update_due = rc->baseline_gf_interval;
+
+ if (rc->frames_till_gf_update_due > rc->frames_to_key)
+ rc->frames_till_gf_update_due = rc->frames_to_key;
+ cpi->refresh_golden_frame = 1;
+ force_gf_refresh = 1;
+ }
+
+ fraction_low = (double)low_content_frame / (rows * cols);
+ // Update the recursive average (3/4 weight on the previous value).
+ cr->low_content_avg = (fraction_low + 3 * cr->low_content_avg) / 4;
+ if (!force_gf_refresh && cpi->refresh_golden_frame == 1) {
+ // Don't update golden reference if the amount of low_content for the
+ // current encoded frame is small, or if the recursive average of the
+ // low_content over the update interval window falls below threshold.
+ if (fraction_low < 0.8 || cr->low_content_avg < 0.7)
+ cpi->refresh_golden_frame = 0;
+ // Reset for next interval.
+ cr->low_content_avg = fraction_low;
+ }
+}
+
+// Update the segmentation map, and related quantities: cyclic refresh map,
+// refresh sb_index, and target number of blocks to be refreshed.
+// The map is set to either 0/CR_SEGMENT_ID_BASE (no refresh) or to
+// 1/CR_SEGMENT_ID_BOOST1 (refresh) for each superblock.
+// Blocks labeled as BOOST1 may later get set to BOOST2 (during the
+// encoding of the superblock).
+static void cyclic_refresh_update_map(AV1_COMP *const cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+ unsigned char *const seg_map = cpi->segmentation_map;
+ int i, block_count, bl_index, sb_rows, sb_cols, sbs_in_frame;
+ int xmis, ymis, x, y;
+ memset(seg_map, CR_SEGMENT_ID_BASE, cm->mi_rows * cm->mi_cols);
+ sb_cols = (cm->mi_cols + cm->mib_size - 1) / cm->mib_size;
+ sb_rows = (cm->mi_rows + cm->mib_size - 1) / cm->mib_size;
+ sbs_in_frame = sb_cols * sb_rows;
+ // Number of target blocks to get the q delta (segment 1).
+ block_count = cr->percent_refresh * cm->mi_rows * cm->mi_cols / 100;
+ // Set the segmentation map: cycle through the superblocks, starting at
+ // cr->sb_index, and stopping when either block_count blocks have been found
+ // to be refreshed, or we have passed through the whole frame.
+ assert(cr->sb_index < sbs_in_frame);
+ i = cr->sb_index;
+ cr->target_num_seg_blocks = 0;
+ do {
+ int sum_map = 0;
+ // Get the mi_row/mi_col corresponding to superblock index i.
+ int sb_row_index = (i / sb_cols);
+ int sb_col_index = i - sb_row_index * sb_cols;
+ int mi_row = sb_row_index * cm->mib_size;
+ int mi_col = sb_col_index * cm->mib_size;
+ int qindex_thresh =
+ cpi->oxcf.content == AOM_CONTENT_SCREEN
+ ? av1_get_qindex(&cm->seg, CR_SEGMENT_ID_BOOST2, cm->base_qindex)
+ : 0;
+ assert(mi_row >= 0 && mi_row < cm->mi_rows);
+ assert(mi_col >= 0 && mi_col < cm->mi_cols);
+ bl_index = mi_row * cm->mi_cols + mi_col;
+ // Loop through all MI blocks in superblock and update map.
+ xmis = AOMMIN(cm->mi_cols - mi_col, cm->mib_size);
+ ymis = AOMMIN(cm->mi_rows - mi_row, cm->mib_size);
+ for (y = 0; y < ymis; y++) {
+ for (x = 0; x < xmis; x++) {
+ const int bl_index2 = bl_index + y * cm->mi_cols + x;
+ // If the block is a candidate for clean up then mark it
+ // for possible boost/refresh (segment 1). The segment id may get
+ // reset to 0 later if block gets coded anything other than ZEROMV.
+ if (cr->map[bl_index2] == 0) {
+ if (cr->last_coded_q_map[bl_index2] > qindex_thresh) sum_map++;
+ } else if (cr->map[bl_index2] < 0) {
+ cr->map[bl_index2]++;  // Step a negative entry one closer to 0.
+ }
+ }
+ }
+ // Enforce constant segment over superblock.
+ // If candidates cover at least half of the superblock, set all to BOOST1.
+ if (sum_map >= xmis * ymis / 2) {
+ for (y = 0; y < ymis; y++)
+ for (x = 0; x < xmis; x++) {
+ seg_map[bl_index + y * cm->mi_cols + x] = CR_SEGMENT_ID_BOOST1;
+ }
+ cr->target_num_seg_blocks += xmis * ymis;
+ }
+ i++;
+ if (i == sbs_in_frame) {
+ i = 0;  // Wrap around to the first superblock.
+ }
+ } while (cr->target_num_seg_blocks < block_count && i != cr->sb_index);
+ cr->sb_index = i;  // The next call resumes from this superblock.
+}
+
+// Set the frame-level cyclic refresh parameters: refresh percentage, maximum
+// q-delta percentage, the rate ratio used to derive the segment q-delta, and
+// the motion threshold / rate boost factor.
+void av1_cyclic_refresh_update_parameters(AV1_COMP *const cpi) {
+  CYCLIC_REFRESH *const refresh = cpi->cyclic_refresh;
+  const AV1_COMMON *const common = &cpi->common;
+  const RATE_CONTROL *const rate_ctrl = &cpi->rc;
+  // Low resolution (at most 352x288) combined with a low average frame
+  // bandwidth selects the smaller thresholds below.
+  const int low_res_low_rate = common->width <= 352 &&
+                               common->height <= 288 &&
+                               rate_ctrl->avg_frame_bandwidth < 3400;
+
+  refresh->percent_refresh = 10;
+  refresh->max_qdelta_perc = 50;
+  refresh->time_for_refresh = 0;
+
+  // Use a larger delta-qp (higher rate_ratio_qdelta) for the first few (~4)
+  // periods of the refresh cycle after a key frame.
+  refresh->rate_ratio_qdelta =
+      (rate_ctrl->frames_since_key < 4 * refresh->percent_refresh) ? 3.0 : 2.0;
+
+  refresh->motion_thresh = low_res_low_rate ? 4 : 32;
+  refresh->rate_boost_fac = low_res_low_rate ? 10 : 17;
+}
+
+// Setup cyclic background refresh: set delta q and segmentation map.
+// When refresh is not applied (see the first branch below) the segmentation
+// map is zeroed and segmentation is disabled; otherwise the BOOST1/BOOST2
+// segment q deltas and cr->rdmult are computed and the refresh map is updated.
+void av1_cyclic_refresh_setup(AV1_COMP *const cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ const RATE_CONTROL *const rc = &cpi->rc;
+ CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+ struct segmentation *const seg = &cm->seg;
+ const int apply_cyclic_refresh = apply_cyclic_refresh_bitrate(cm, rc);
+ if (cm->current_video_frame == 0) cr->low_content_avg = 0.0;
+ // Don't apply refresh on key frames, or when apply_cyclic_refresh_bitrate()
+ // returns 0.
+ if (!apply_cyclic_refresh || cm->frame_type == KEY_FRAME) {
+ // Set segmentation map to 0 and disable.
+ unsigned char *const seg_map = cpi->segmentation_map;
+ memset(seg_map, 0, cm->mi_rows * cm->mi_cols);
+ av1_disable_segmentation(&cm->seg);
+ if (cm->frame_type == KEY_FRAME) {
+ // On key frames also reset the last-coded-q map (every byte set to MAXQ)
+ // and restart the superblock scan from index 0.
+ memset(cr->last_coded_q_map, MAXQ,
+ cm->mi_rows * cm->mi_cols * sizeof(*cr->last_coded_q_map));
+ cr->sb_index = 0;
+ }
+ return;
+ } else {
+ int qindex_delta = 0;
+ int qindex2;
+ const double q = av1_convert_qindex_to_q(cm->base_qindex, cm->bit_depth);
+ aom_clear_system_state();
+ // Set rate threshold to a multiple of the target rate: sb64_target_rate
+ // scaled by 256 (<< 8) and then multiplied by 4 (<< 2).
+ cr->thresh_rate_sb = ((int64_t)(rc->sb64_target_rate) << 8) << 2;
+ // Distortion threshold, quadratic in Q (4 * q^2), scale factor to be
+ // adjusted.
+ // q will not exceed 457, so (q * q) is within 32bit; see:
+ // av1_convert_qindex_to_q(), av1_ac_quant(), ac_qlookup*[].
+ cr->thresh_dist_sb = ((int64_t)(q * q)) << 2;
+
+ // Set up segmentation.
+ // Enable segmentation and clear all segment features.
+ av1_enable_segmentation(&cm->seg);
+ av1_clearall_segfeatures(seg);
+ // Select delta coding method.
+ seg->abs_delta = SEGMENT_DELTADATA;
+
+ // Note: setting temporal_update has no effect, as the seg-map coding method
+ // (temporal or spatial) is determined in
+ // av1_choose_segmap_coding_method(),
+ // based on the coding cost of each method. For error_resilient mode on the
+ // last_frame_seg_map is set to 0, so if temporal coding is used, it is
+ // relative to 0 previous map.
+ // seg->temporal_update = 0;
+
+ // Segment BASE "Q" feature is disabled so it defaults to the baseline Q.
+ av1_disable_segfeature(seg, CR_SEGMENT_ID_BASE, SEG_LVL_ALT_Q);
+ // Use segment BOOST1 for in-frame Q adjustment.
+ av1_enable_segfeature(seg, CR_SEGMENT_ID_BOOST1, SEG_LVL_ALT_Q);
+ // Use segment BOOST2 for more aggressive in-frame Q adjustment.
+ av1_enable_segfeature(seg, CR_SEGMENT_ID_BOOST2, SEG_LVL_ALT_Q);
+
+ // Set the q delta for segment BOOST1.
+ qindex_delta = compute_deltaq(cpi, cm->base_qindex, cr->rate_ratio_qdelta);
+ cr->qindex_delta[1] = qindex_delta;
+
+ // Compute rd-mult for segment BOOST1.
+ qindex2 = clamp(cm->base_qindex + cm->y_dc_delta_q + qindex_delta, 0, MAXQ);
+
+ cr->rdmult = av1_compute_rd_mult(cpi, qindex2);
+
+ av1_set_segdata(seg, CR_SEGMENT_ID_BOOST1, SEG_LVL_ALT_Q, qindex_delta);
+
+ // Set a more aggressive (higher) q delta for segment BOOST2.
+ // The rate ratio is 0.1 * rate_boost_fac * rate_ratio_qdelta, capped at
+ // CR_MAX_RATE_TARGET_RATIO.
+ qindex_delta = compute_deltaq(
+ cpi, cm->base_qindex,
+ AOMMIN(CR_MAX_RATE_TARGET_RATIO,
+ 0.1 * cr->rate_boost_fac * cr->rate_ratio_qdelta));
+ cr->qindex_delta[2] = qindex_delta;
+ av1_set_segdata(seg, CR_SEGMENT_ID_BOOST2, SEG_LVL_ALT_Q, qindex_delta);
+
+ // Update the segmentation and refresh map.
+ cyclic_refresh_update_map(cpi);
+ }
+}
+
+// Returns the rd multiplier currently stored in cr->rdmult.
+int av1_cyclic_refresh_get_rdmult(const CYCLIC_REFRESH *cr) {
+  const int rdmult = cr->rdmult;
+  return rdmult;
+}
+
+// Reset cyclic refresh state: zero the refresh map, restart the superblock
+// scan at index 0 and request a golden frame refresh.
+void av1_cyclic_refresh_reset_resize(AV1_COMP *const cpi) {
+  CYCLIC_REFRESH *const refresh = cpi->cyclic_refresh;
+  const AV1_COMMON *const common = &cpi->common;
+
+  refresh->sb_index = 0;
+  memset(refresh->map, 0, common->mi_rows * common->mi_cols);
+  cpi->refresh_golden_frame = 1;
+}
diff --git a/third_party/aom/av1/encoder/aq_cyclicrefresh.h b/third_party/aom/av1/encoder/aq_cyclicrefresh.h
new file mode 100644
index 0000000000..459ab80b8e
--- /dev/null
+++ b/third_party/aom/av1/encoder/aq_cyclicrefresh.h
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_ENCODER_AQ_CYCLICREFRESH_H_
+#define AV1_ENCODER_AQ_CYCLICREFRESH_H_
+
+#include "av1/common/blockd.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// The segment ids used in cyclic refresh: from base (no boost) to increasing
+// boost (higher delta-qp).
+#define CR_SEGMENT_ID_BASE 0
+#define CR_SEGMENT_ID_BOOST1 1
+#define CR_SEGMENT_ID_BOOST2 2
+
+// Maximum rate target ratio for setting segment delta-qp.
+#define CR_MAX_RATE_TARGET_RATIO 4.0
+
+struct AV1_COMP;
+
+struct CYCLIC_REFRESH;
+typedef struct CYCLIC_REFRESH CYCLIC_REFRESH;
+
+CYCLIC_REFRESH *av1_cyclic_refresh_alloc(int mi_rows, int mi_cols);
+
+void av1_cyclic_refresh_free(CYCLIC_REFRESH *cr);
+
+// Estimate the bits, incorporating the delta-q from segment 1, after encoding
+// the frame.
+int av1_cyclic_refresh_estimate_bits_at_q(const struct AV1_COMP *cpi,
+ double correction_factor);
+
+// Estimate the bits per mb, for a given q = i and a corresponding delta-q
+// (for segment 1), prior to encoding the frame.
+int av1_cyclic_refresh_rc_bits_per_mb(const struct AV1_COMP *cpi, int i,
+ double correction_factor);
+
+// Prior to coding a given prediction block, of size bsize at (mi_row, mi_col),
+// check if we should reset the segment_id, and update the cyclic_refresh map
+// and segmentation map.
+void av1_cyclic_refresh_update_segment(const struct AV1_COMP *cpi,
+ MB_MODE_INFO *const mbmi, int mi_row,
+ int mi_col, BLOCK_SIZE bsize,
+ int64_t rate, int64_t dist, int skip);
+
+// Update the segmentation map, and related quantities: cyclic refresh map,
+// refresh sb_index, and target number of blocks to be refreshed.
+// NOTE(review): this name contains a double underscore and does not match the
+// static cyclic_refresh_update_map() in aq_cyclicrefresh.c; it looks like a
+// stale prototype without a definition -- confirm before calling it.
+void av1_cyclic_refresh_update__map(struct AV1_COMP *const cpi);
+
+// Update the actual number of blocks that were applied the segment delta q.
+void av1_cyclic_refresh_postencode(struct AV1_COMP *const cpi);
+
+// Set golden frame update interval, for 1 pass CBR mode.
+void av1_cyclic_refresh_set_golden_update(struct AV1_COMP *const cpi);
+
+// Check if we should not update golden reference, based on past refresh stats.
+void av1_cyclic_refresh_check_golden_update(struct AV1_COMP *const cpi);
+
+// Set/update global/frame level refresh parameters.
+void av1_cyclic_refresh_update_parameters(struct AV1_COMP *const cpi);
+
+// Setup cyclic background refresh: set delta q and segmentation map.
+void av1_cyclic_refresh_setup(struct AV1_COMP *const cpi);
+
+int av1_cyclic_refresh_get_rdmult(const CYCLIC_REFRESH *cr);
+
+void av1_cyclic_refresh_reset_resize(struct AV1_COMP *const cpi);
+
+// Returns 1 when segment_id is CR_SEGMENT_ID_BOOST1 or CR_SEGMENT_ID_BOOST2,
+// and 0 for any other value.
+static INLINE int cyclic_refresh_segment_id_boosted(int segment_id) {
+  switch (segment_id) {
+    case CR_SEGMENT_ID_BOOST1:
+    case CR_SEGMENT_ID_BOOST2: return 1;
+    default: return 0;
+  }
+}
+
+// Maps segment_id onto the cyclic refresh segment ids: BOOST1 and BOOST2 are
+// returned unchanged, every other value becomes CR_SEGMENT_ID_BASE.
+static INLINE int cyclic_refresh_segment_id(int segment_id) {
+  const int is_boosted = segment_id == CR_SEGMENT_ID_BOOST1 ||
+                         segment_id == CR_SEGMENT_ID_BOOST2;
+  return is_boosted ? segment_id : CR_SEGMENT_ID_BASE;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AV1_ENCODER_AQ_CYCLICREFRESH_H_
diff --git a/third_party/aom/av1/encoder/aq_variance.c b/third_party/aom/av1/encoder/aq_variance.c
new file mode 100644
index 0000000000..ab9b3790bf
--- /dev/null
+++ b/third_party/aom/av1/encoder/aq_variance.c
@@ -0,0 +1,207 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+
+#include "aom_ports/mem.h"
+
+#include "av1/encoder/aq_variance.h"
+
+#include "av1/common/seg_common.h"
+#include "av1/encoder/ratectrl.h"
+#include "av1/encoder/rd.h"
+#include "av1/encoder/segmentation.h"
+#include "aom_ports/system_state.h"
+
+#define ENERGY_MIN (-4)
+#define ENERGY_MAX (1)
+#define ENERGY_SPAN (ENERGY_MAX - ENERGY_MIN + 1)
+#define ENERGY_IN_BOUNDS(energy) \
+ assert((energy) >= ENERGY_MIN && (energy) <= ENERGY_MAX)
+
+static const double rate_ratio[MAX_SEGMENTS] = { 2.5, 2.0, 1.5, 1.0,
+ 0.75, 1.0, 1.0, 1.0 };
+static const int segment_id[ENERGY_SPAN] = { 0, 1, 1, 2, 3, 4 };
+
+#define SEGMENT_ID(i) segment_id[(i)-ENERGY_MIN]
+
+DECLARE_ALIGNED(16, static const uint8_t, av1_all_zeros[MAX_SB_SIZE]) = { 0 };
+#if CONFIG_HIGHBITDEPTH
+DECLARE_ALIGNED(16, static const uint16_t,
+ av1_highbd_all_zeros[MAX_SB_SIZE]) = { 0 };
+#endif
+
+// Looks up the segment id for an energy level. energy must lie in
+// [ENERGY_MIN, ENERGY_MAX] (asserted); the table is indexed from ENERGY_MIN.
+unsigned int av1_vaq_segment_id(int energy) {
+  int table_index;
+  ENERGY_IN_BOUNDS(energy);
+  table_index = energy - ENERGY_MIN;
+  return segment_id[table_index];
+}
+
+// Per-frame setup for variance based AQ. When the frame is intra only, error
+// resilient mode is on, the alt-ref frame is being refreshed, or the golden
+// frame is being refreshed with !rc.is_src_frame_alt_ref, this sets
+// cpi->vaq_refresh and programs a SEG_LVL_ALT_Q delta for every segment whose
+// rate_ratio[] entry differs from 1.0. Otherwise nothing is changed.
+void av1_vaq_frame_setup(AV1_COMP *cpi) {
+  AV1_COMMON *const common = &cpi->common;
+  struct segmentation *const seg = &common->seg;
+  int seg_idx;
+  const int wants_refresh =
+      frame_is_intra_only(common) || common->error_resilient_mode ||
+      cpi->refresh_alt_ref_frame ||
+      (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref);
+
+  if (!wants_refresh) return;
+
+  cpi->vaq_refresh = 1;
+
+  av1_enable_segmentation(seg);
+  av1_clearall_segfeatures(seg);
+
+  seg->abs_delta = SEGMENT_DELTADATA;
+
+  aom_clear_system_state();
+
+  for (seg_idx = 0; seg_idx < MAX_SEGMENTS; ++seg_idx) {
+    const double ratio = rate_ratio[seg_idx];
+    int delta = av1_compute_qdelta_by_rate(&cpi->rc, common->frame_type,
+                                           common->base_qindex, ratio,
+                                           common->bit_depth);
+
+    // We don't allow qindex 0 in a segment if the base value is not 0.
+    // Q index 0 (lossless) implies 4x4 encoding only and in AQ mode a segment
+    // Q delta is sometimes applied without going back around the rd loop.
+    // This could lead to an illegal combination of partition size and q.
+    if (common->base_qindex != 0 && common->base_qindex + delta == 0) {
+      delta = -common->base_qindex + 1;
+    }
+
+    // A ratio of exactly 1.0 needs no SEG_LVL_ALT_Q feature for the segment.
+    if (ratio != 1.0) {
+      av1_set_segdata(seg, seg_idx, SEG_LVL_ALT_Q, delta);
+      av1_enable_segfeature(seg, seg_idx, SEG_LVL_ALT_Q);
+    }
+  }
+}
+
+/* TODO(agrange, paulwilkins): The block_variance calls the unoptimized versions
+ * of variance() and highbd_8_variance(). It should not.
+ */
+// Accumulates, over a w x h region, the sum of the per-pixel differences
+// a - b into *sum and the sum of their squares into *sse. a and b are walked
+// row by row using a_stride and b_stride respectively.
+static void aq_variance(const uint8_t *a, int a_stride, const uint8_t *b,
+                        int b_stride, int w, int h, unsigned int *sse,
+                        int *sum) {
+  int row, col;
+  int diff_total = 0;
+  unsigned int square_total = 0;
+
+  for (row = 0; row < h; ++row) {
+    const uint8_t *const a_row = a + row * a_stride;
+    const uint8_t *const b_row = b + row * b_stride;
+    for (col = 0; col < w; ++col) {
+      const int diff = a_row[col] - b_row[col];
+      diff_total += diff;
+      square_total += diff * diff;
+    }
+  }
+
+  *sum = diff_total;
+  *sse = square_total;
+}
+
+#if CONFIG_HIGHBITDEPTH
+// High bit depth variant of aq_variance(): a8 and b8 are converted with
+// CONVERT_TO_SHORTPTR() and read as 16-bit samples. Over a w x h region the
+// per-pixel differences a - b are accumulated into *sum and their squares into
+// *sse, both as 64-bit values. A negative difference is added to the unsigned
+// *sum modulo 2^64.
+static void aq_highbd_variance64(const uint8_t *a8, int a_stride,
+                                 const uint8_t *b8, int b_stride, int w, int h,
+                                 uint64_t *sse, uint64_t *sum) {
+  int i, j;
+
+  uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+  uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+  *sum = 0;
+  *sse = 0;
+
+  for (i = 0; i < h; i++) {
+    for (j = 0; j < w; j++) {
+      const int diff = a[j] - b[j];
+      *sum += diff;
+      // Square in 64 bits: with 16-bit samples |diff| can reach 65535, whose
+      // square does not fit in a 32-bit int (signed overflow is undefined).
+      *sse += (uint64_t)((int64_t)diff * diff);
+    }
+    a += a_stride;
+    b += b_stride;
+  }
+}
+
+// Wrapper around aq_highbd_variance64() that narrows its 64-bit results to the
+// unsigned int / int outputs expected by the caller.
+static void aq_highbd_8_variance(const uint8_t *a8, int a_stride,
+                                 const uint8_t *b8, int b_stride, int w, int h,
+                                 unsigned int *sse, int *sum) {
+  uint64_t wide_sse = 0;
+  uint64_t wide_sum = 0;
+
+  aq_highbd_variance64(a8, a_stride, b8, b_stride, w, h, &wide_sse, &wide_sum);
+
+  *sum = (int)wide_sum;
+  *sse = (unsigned int)wide_sse;
+}
+#endif // CONFIG_HIGHBITDEPTH
+
+// Returns the variance of plane 0 of the source block, multiplied by 256 and
+// normalized by the number of pixels measured.
+//
+// The source is compared against an all-zero reference with stride 0, so the
+// sse/sum outputs are the sum of squares and the plain sum of the pixels.
+// When the block overhangs the right or bottom edge, only the bw x bh part is
+// measured with the C helpers above; otherwise the variance function for
+// block size bs (cpi->fn_ptr[bs].vf) is used.
+//
+// The scaling by 256 is done in 64 bits and narrowed only after the
+// normalization. A cast binds tighter than '/' and '>>', so narrowing first
+// would truncate var * 256 to 32 bits for var >= 2^24.
+static unsigned int block_variance(const AV1_COMP *const cpi, MACROBLOCK *x,
+                                   BLOCK_SIZE bs) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  unsigned int var, sse;
+  // A negative mb_to_*_edge is treated as an overhang; >> 3 scales it down
+  // (presumably from 1/8 pixel units to pixels -- TODO confirm).
+  int right_overflow =
+      (xd->mb_to_right_edge < 0) ? ((-xd->mb_to_right_edge) >> 3) : 0;
+  int bottom_overflow =
+      (xd->mb_to_bottom_edge < 0) ? ((-xd->mb_to_bottom_edge) >> 3) : 0;
+
+  if (right_overflow || bottom_overflow) {
+    const int bw = 8 * mi_size_wide[bs] - right_overflow;
+    const int bh = 8 * mi_size_high[bs] - bottom_overflow;
+    // Despite its name, avg receives the sum of the pixel values.
+    int avg;
+#if CONFIG_HIGHBITDEPTH
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      aq_highbd_8_variance(x->plane[0].src.buf, x->plane[0].src.stride,
+                           CONVERT_TO_BYTEPTR(av1_highbd_all_zeros), 0, bw, bh,
+                           &sse, &avg);
+      // Scale the high bit depth statistics back to an 8-bit range.
+      sse >>= 2 * (xd->bd - 8);
+      avg >>= (xd->bd - 8);
+    } else {
+      aq_variance(x->plane[0].src.buf, x->plane[0].src.stride, av1_all_zeros, 0,
+                  bw, bh, &sse, &avg);
+    }
+#else
+    aq_variance(x->plane[0].src.buf, x->plane[0].src.stride, av1_all_zeros, 0,
+                bw, bh, &sse, &avg);
+#endif  // CONFIG_HIGHBITDEPTH
+    var = sse - (unsigned int)(((int64_t)avg * avg) / (bw * bh));
+    return (unsigned int)(((uint64_t)var * 256) / (bw * bh));
+  } else {
+#if CONFIG_HIGHBITDEPTH
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      var =
+          cpi->fn_ptr[bs].vf(x->plane[0].src.buf, x->plane[0].src.stride,
+                             CONVERT_TO_BYTEPTR(av1_highbd_all_zeros), 0, &sse);
+    } else {
+      var = cpi->fn_ptr[bs].vf(x->plane[0].src.buf, x->plane[0].src.stride,
+                               av1_all_zeros, 0, &sse);
+    }
+#else
+    var = cpi->fn_ptr[bs].vf(x->plane[0].src.buf, x->plane[0].src.stride,
+                             av1_all_zeros, 0, &sse);
+#endif  // CONFIG_HIGHBITDEPTH
+    return (unsigned int)(((uint64_t)var * 256) >> num_pels_log2_lookup[bs]);
+  }
+}
+
+// Returns log(block_variance + 1) for the given block. The floating point
+// work is done only after aom_clear_system_state() has been called.
+double av1_log_block_var(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs) {
+  unsigned int variance;
+  double shifted;
+
+  variance = block_variance(cpi, x, bs);
+  aom_clear_system_state();
+  shifted = variance + 1.0;
+  return log(shifted);
+}
+
+#define DEFAULT_E_MIDPOINT 10.0
+// Returns the block energy: the log block variance relative to a midpoint,
+// rounded to the nearest integer and clamped to [ENERGY_MIN, ENERGY_MAX].
+// The midpoint is cpi->twopass.mb_av_energy when oxcf.pass == 2 and
+// DEFAULT_E_MIDPOINT otherwise.
+int av1_block_energy(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs) {
+  double midpoint;
+  double offset;
+
+  aom_clear_system_state();
+  if (cpi->oxcf.pass == 2) {
+    midpoint = cpi->twopass.mb_av_energy;
+  } else {
+    midpoint = DEFAULT_E_MIDPOINT;
+  }
+  offset = av1_log_block_var(cpi, x, bs) - midpoint;
+  return clamp((int)round(offset), ENERGY_MIN, ENERGY_MAX);
+}
diff --git a/third_party/aom/av1/encoder/aq_variance.h b/third_party/aom/av1/encoder/aq_variance.h
new file mode 100644
index 0000000000..05725c5def
--- /dev/null
+++ b/third_party/aom/av1/encoder/aq_variance.h
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_ENCODER_AQ_VARIANCE_H_
+#define AV1_ENCODER_AQ_VARIANCE_H_
+
+#include "av1/encoder/encoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+unsigned int av1_vaq_segment_id(int energy);
+void av1_vaq_frame_setup(AV1_COMP *cpi);
+
+int av1_block_energy(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs);
+double av1_log_block_var(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AV1_ENCODER_AQ_VARIANCE_H_
diff --git a/third_party/aom/av1/encoder/arm/neon/dct_neon.c b/third_party/aom/av1/encoder/arm/neon/dct_neon.c
new file mode 100644
index 0000000000..f6ce24a3dd
--- /dev/null
+++ b/third_party/aom/av1/encoder/arm/neon/dct_neon.c
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "./av1_rtcd.h"
+#include "./aom_config.h"
+#include "./aom_dsp_rtcd.h"
+
+#include "av1/common/blockd.h"
+#include "aom_dsp/txfm_common.h"
+
+// Runs aom_fdct8x8_neon() on input into a local 64-entry buffer and then
+// quantizes that buffer with av1_quantize_fp_neon(), which writes qcoeff_ptr,
+// dqcoeff_ptr and *eob_ptr. The transform output is not written to coeff_ptr;
+// that parameter is unused.
+void av1_fdct8x8_quant_neon(const int16_t *input, int stride,
+ int16_t *coeff_ptr, intptr_t n_coeffs,
+ int skip_block, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr,
+ int16_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+ uint16_t *eob_ptr, const int16_t *scan_ptr,
+ const int16_t *iscan_ptr) {
+ // Scratch space for the 8x8 transform coefficients.
+ int16_t temp_buffer[64];
+ (void)coeff_ptr;
+
+ aom_fdct8x8_neon(input, temp_buffer, stride);
+ av1_quantize_fp_neon(temp_buffer, n_coeffs, skip_block, zbin_ptr, round_ptr,
+ quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr,
+ dequant_ptr, eob_ptr, scan_ptr, iscan_ptr);
+}
diff --git a/third_party/aom/av1/encoder/arm/neon/error_neon.c b/third_party/aom/av1/encoder/arm/neon/error_neon.c
new file mode 100644
index 0000000000..fe5233f89f
--- /dev/null
+++ b/third_party/aom/av1/encoder/arm/neon/error_neon.c
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./av1_rtcd.h"
+
+// Returns the sum of squared differences between coeff and dqcoeff over
+// block_size 16-bit elements. Eight elements are processed per iteration, so
+// block_size must be a non-zero multiple of 8 (asserted below).
+int64_t av1_block_error_fp_neon(const int16_t *coeff, const int16_t *dqcoeff,
+ int block_size) {
+ // Two 64-bit partial sums, combined at the end.
+ int64x2_t error = vdupq_n_s64(0);
+
+ assert(block_size >= 8);
+ assert((block_size % 8) == 0);
+
+ do {
+ const int16x8_t c = vld1q_s16(coeff);
+ const int16x8_t d = vld1q_s16(dqcoeff);
+ const int16x8_t diff = vsubq_s16(c, d);
+ const int16x4_t diff_lo = vget_low_s16(diff);
+ const int16x4_t diff_hi = vget_high_s16(diff);
+ // diff is 15-bits, the squares 30, so we can store 2 in 31-bits before
+ // accumulating them in 64-bits.
+ const int32x4_t err0 = vmull_s16(diff_lo, diff_lo);
+ const int32x4_t err1 = vmlal_s16(err0, diff_hi, diff_hi);
+ // Widen the four 32-bit sums to two 64-bit sums and accumulate.
+ const int64x2_t err2 = vaddl_s32(vget_low_s32(err1), vget_high_s32(err1));
+ error = vaddq_s64(error, err2);
+ coeff += 8;
+ dqcoeff += 8;
+ block_size -= 8;
+ } while (block_size != 0);
+
+ return vgetq_lane_s64(error, 0) + vgetq_lane_s64(error, 1);
+}
diff --git a/third_party/aom/av1/encoder/arm/neon/quantize_neon.c b/third_party/aom/av1/encoder/arm/neon/quantize_neon.c
new file mode 100644
index 0000000000..36e7d33702
--- /dev/null
+++ b/third_party/aom/av1/encoder/arm/neon/quantize_neon.c
@@ -0,0 +1,118 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include <math.h>
+
+#include "aom_mem/aom_mem.h"
+
+#include "av1/common/quant_common.h"
+#include "av1/common/seg_common.h"
+
+#include "av1/encoder/av1_quantize.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/rd.h"
+
+// NEON quantizer. For each coefficient the quantized magnitude is
+// ((|coeff| + round) * quant) >> 16, the sign of coeff is re-applied to form
+// qcoeff, and dqcoeff = qcoeff * dequant. Index 0 of round_ptr, quant_ptr and
+// dequant_ptr is used for the first coefficient (dc) and index 1 for all
+// others. *eob_ptr is set to the largest iscan[] value + 1 among coefficients
+// that quantize to non-zero, or 0 if there are none. When skip_block is set,
+// both outputs are zeroed and *eob_ptr is 0.
+// Coefficients are processed 8 at a time; count is presumably a multiple of
+// 8 -- TODO confirm against callers.
+// zbin_ptr, quant_shift_ptr and scan are unused.
+void av1_quantize_fp_neon(const int16_t *coeff_ptr, intptr_t count,
+ int skip_block, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr,
+ int16_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+ uint16_t *eob_ptr, const int16_t *scan,
+ const int16_t *iscan) {
+ // TODO(jingning) Decide the need of these arguments after the
+ // quantization process is completed.
+ (void)zbin_ptr;
+ (void)quant_shift_ptr;
+ (void)scan;
+
+ if (!skip_block) {
+ // Quantization pass: All coefficients with index >= zero_flag are
+ // skippable. Note: zero_flag can be zero.
+ // NOTE(review): no zero_flag variable exists in this function; the comment
+ // above appears to be stale.
+ int i;
+ const int16x8_t v_zero = vdupq_n_s16(0);
+ const int16x8_t v_one = vdupq_n_s16(1);
+ // Running per-lane maximum of (iscan + 1) over non-zero quantized coeffs.
+ int16x8_t v_eobmax_76543210 = vdupq_n_s16(-1);
+ int16x8_t v_round = vmovq_n_s16(round_ptr[1]);
+ int16x8_t v_quant = vmovq_n_s16(quant_ptr[1]);
+ int16x8_t v_dequant = vmovq_n_s16(dequant_ptr[1]);
+ // adjust for dc
+ v_round = vsetq_lane_s16(round_ptr[0], v_round, 0);
+ v_quant = vsetq_lane_s16(quant_ptr[0], v_quant, 0);
+ v_dequant = vsetq_lane_s16(dequant_ptr[0], v_dequant, 0);
+ // process dc and the first seven ac coeffs
+ {
+ const int16x8_t v_iscan = vld1q_s16(&iscan[0]);
+ const int16x8_t v_coeff = vld1q_s16(&coeff_ptr[0]);
+ // All ones for negative coefficients, zero otherwise.
+ const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
+ // round + |coeff|
+ const int16x8_t v_tmp = vabaq_s16(v_round, v_coeff, v_zero);
+ const int32x4_t v_tmp_lo =
+ vmull_s16(vget_low_s16(v_tmp), vget_low_s16(v_quant));
+ const int32x4_t v_tmp_hi =
+ vmull_s16(vget_high_s16(v_tmp), vget_high_s16(v_quant));
+ // Quantized magnitude: the 32-bit products shifted right by 16.
+ const int16x8_t v_tmp2 =
+ vcombine_s16(vshrn_n_s32(v_tmp_lo, 16), vshrn_n_s32(v_tmp_hi, 16));
+ // Despite its name, v_nz_mask is set where the quantized value IS zero.
+ const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero);
+ const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one);
+ const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, v_zero, v_iscan_plus1);
+ // Re-apply the sign: (x ^ sign) - sign.
+ const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign);
+ const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign);
+ const int16x8_t v_dqcoeff = vmulq_s16(v_qcoeff, v_dequant);
+ v_eobmax_76543210 = vmaxq_s16(v_eobmax_76543210, v_nz_iscan);
+ vst1q_s16(&qcoeff_ptr[0], v_qcoeff);
+ vst1q_s16(&dqcoeff_ptr[0], v_dqcoeff);
+ // Switch all lanes to the ac parameters for the remaining coefficients.
+ v_round = vmovq_n_s16(round_ptr[1]);
+ v_quant = vmovq_n_s16(quant_ptr[1]);
+ v_dequant = vmovq_n_s16(dequant_ptr[1]);
+ }
+ // now process the rest of the ac coeffs
+ for (i = 8; i < count; i += 8) {
+ const int16x8_t v_iscan = vld1q_s16(&iscan[i]);
+ const int16x8_t v_coeff = vld1q_s16(&coeff_ptr[i]);
+ const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
+ const int16x8_t v_tmp = vabaq_s16(v_round, v_coeff, v_zero);
+ const int32x4_t v_tmp_lo =
+ vmull_s16(vget_low_s16(v_tmp), vget_low_s16(v_quant));
+ const int32x4_t v_tmp_hi =
+ vmull_s16(vget_high_s16(v_tmp), vget_high_s16(v_quant));
+ const int16x8_t v_tmp2 =
+ vcombine_s16(vshrn_n_s32(v_tmp_lo, 16), vshrn_n_s32(v_tmp_hi, 16));
+ const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero);
+ const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one);
+ const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, v_zero, v_iscan_plus1);
+ const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign);
+ const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign);
+ const int16x8_t v_dqcoeff = vmulq_s16(v_qcoeff, v_dequant);
+ v_eobmax_76543210 = vmaxq_s16(v_eobmax_76543210, v_nz_iscan);
+ vst1q_s16(&qcoeff_ptr[i], v_qcoeff);
+ vst1q_s16(&dqcoeff_ptr[i], v_dqcoeff);
+ }
+ // Horizontal max: fold the 8 lanes down to lane 0 (8 -> 4 -> 2 -> 1).
+ {
+ const int16x4_t v_eobmax_3210 = vmax_s16(
+ vget_low_s16(v_eobmax_76543210), vget_high_s16(v_eobmax_76543210));
+ const int64x1_t v_eobmax_xx32 =
+ vshr_n_s64(vreinterpret_s64_s16(v_eobmax_3210), 32);
+ const int16x4_t v_eobmax_tmp =
+ vmax_s16(v_eobmax_3210, vreinterpret_s16_s64(v_eobmax_xx32));
+ const int64x1_t v_eobmax_xxx3 =
+ vshr_n_s64(vreinterpret_s64_s16(v_eobmax_tmp), 16);
+ const int16x4_t v_eobmax_final =
+ vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3));
+
+ *eob_ptr = (uint16_t)vget_lane_s16(v_eobmax_final, 0);
+ }
+ } else {
+ memset(qcoeff_ptr, 0, count * sizeof(int16_t));
+ memset(dqcoeff_ptr, 0, count * sizeof(int16_t));
+ *eob_ptr = 0;
+ }
+}
diff --git a/third_party/aom/av1/encoder/av1_quantize.c b/third_party/aom/av1/encoder/av1_quantize.c
new file mode 100644
index 0000000000..6cffac264b
--- /dev/null
+++ b/third_party/aom/av1/encoder/av1_quantize.c
@@ -0,0 +1,1790 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+#include "./aom_dsp_rtcd.h"
+#include "aom_dsp/quantize.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+
+#include "av1/common/idct.h"
+#include "av1/common/quant_common.h"
+#include "av1/common/scan.h"
+#include "av1/common/seg_common.h"
+
+#include "av1/encoder/av1_quantize.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/rd.h"
+
+#if CONFIG_NEW_QUANT
+// Quantizes a single coefficient with the non-uniform quantizer (NUQ).
+// The magnitude (clamped to the int16 range) is compared against the
+// thresholds in cuml_bins_ptr; the index of the first threshold that exceeds
+// it becomes the level. When no threshold exceeds it, the excess over the
+// last threshold is quantized with quant / quant_shift and added to
+// NUQ_KNOTS. The signed level is stored in *qcoeff_ptr, the signed
+// reconstruction in *dqcoeff_ptr. Returns 1 if the level is nonzero, else 0.
+static INLINE int quantize_coeff_nuq(
+    const tran_low_t coeffv, const int16_t quant, const int16_t quant_shift,
+    const int16_t dequant, const tran_low_t *cuml_bins_ptr,
+    const tran_low_t *dequant_val, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr) {
+  const int value = coeffv;
+  // Sign mask: relies on >> of a negative int being an arithmetic shift.
+  const int sign_mask = value >> 31;
+  int level = clamp((value ^ sign_mask) - sign_mask, INT16_MIN, INT16_MAX);
+  int bin = 0;
+  int q;
+
+  // Locate the first knot whose threshold lies above the magnitude.
+  while (bin < NUQ_KNOTS && level >= cuml_bins_ptr[bin]) ++bin;
+
+  if (bin < NUQ_KNOTS) {
+    q = bin;
+  } else {
+    // Past the last knot: quantize the remainder.
+    level -= cuml_bins_ptr[NUQ_KNOTS - 1];
+    q = NUQ_KNOTS + (((((level * quant) >> 16) + level) * quant_shift) >> 16);
+  }
+
+  if (q == 0) {
+    *qcoeff_ptr = 0;
+    *dqcoeff_ptr = 0;
+  } else {
+    const tran_low_t abs_dq = av1_dequant_abscoeff_nuq(q, dequant, dequant_val);
+    const tran_low_t signed_q = (q ^ sign_mask) - sign_mask;
+    *qcoeff_ptr = signed_q;
+    *dqcoeff_ptr = signed_q < 0 ? -abs_dq : abs_dq;
+  }
+  return q != 0;
+}
+
+// Variant of the single-coefficient NUQ quantizer that takes an extra shift,
+// logsizeby16. Each threshold from cuml_bins_ptr is rounded down by
+// logsizeby16 bits before comparison, the final shift of the uniform part is
+// reduced by logsizeby16, and the reconstruction value is rounded down by
+// logsizeby16 bits. Stores the signed level / reconstruction through
+// qcoeff_ptr / dqcoeff_ptr and returns 1 if the level is nonzero, else 0.
+static INLINE int quantize_coeff_bigtx_nuq(
+    const tran_low_t coeffv, const int16_t quant, const int16_t quant_shift,
+    const int16_t dequant, const tran_low_t *cuml_bins_ptr,
+    const tran_low_t *dequant_val, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, int logsizeby16) {
+  const int value = coeffv;
+  // Sign mask: relies on >> of a negative int being an arithmetic shift.
+  const int sign_mask = value >> 31;
+  int level = clamp((value ^ sign_mask) - sign_mask, INT16_MIN, INT16_MAX);
+  int bin = 0;
+  int q;
+
+  // Locate the first knot whose scaled threshold lies above the magnitude.
+  while (bin < NUQ_KNOTS &&
+         level >= ROUND_POWER_OF_TWO(cuml_bins_ptr[bin], logsizeby16)) {
+    ++bin;
+  }
+
+  if (bin < NUQ_KNOTS) {
+    q = bin;
+  } else {
+    // Past the last knot: quantize the remainder.
+    level -= ROUND_POWER_OF_TWO(cuml_bins_ptr[NUQ_KNOTS - 1], logsizeby16);
+    q = NUQ_KNOTS +
+        (((((level * quant) >> 16) + level) * quant_shift) >>
+         (16 - logsizeby16));
+  }
+
+  if (q == 0) {
+    *qcoeff_ptr = 0;
+    *dqcoeff_ptr = 0;
+  } else {
+    const tran_low_t abs_dq = ROUND_POWER_OF_TWO(
+        av1_dequant_abscoeff_nuq(q, dequant, dequant_val), logsizeby16);
+    const tran_low_t signed_q = (q ^ sign_mask) - sign_mask;
+    *qcoeff_ptr = signed_q;
+    *dqcoeff_ptr = signed_q < 0 ? -abs_dq : abs_dq;
+  }
+  return q != 0;
+}
+
+// "fp" flavour of the single-coefficient NUQ quantizer: identical bin search
+// to quantize_coeff_nuq(), but the part beyond the last knot is a single
+// 64-bit multiply by quant followed by >> 16 (no quant_shift stage).
+// Stores the signed level / reconstruction through qcoeff_ptr / dqcoeff_ptr
+// and returns 1 if the level is nonzero, else 0.
+static INLINE int quantize_coeff_fp_nuq(
+    const tran_low_t coeffv, const int16_t quant, const int16_t dequant,
+    const tran_low_t *cuml_bins_ptr, const tran_low_t *dequant_val,
+    tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr) {
+  const int value = coeffv;
+  // Sign mask: relies on >> of a negative int being an arithmetic shift.
+  const int sign_mask = value >> 31;
+  const int level =
+      clamp((value ^ sign_mask) - sign_mask, INT16_MIN, INT16_MAX);
+  int bin = 0;
+  int q;
+
+  // Locate the first knot whose threshold lies above the magnitude.
+  while (bin < NUQ_KNOTS && level >= cuml_bins_ptr[bin]) ++bin;
+
+  if (bin < NUQ_KNOTS) {
+    q = bin;
+  } else {
+    q = NUQ_KNOTS +
+        ((((int64_t)level - cuml_bins_ptr[NUQ_KNOTS - 1]) * quant) >> 16);
+  }
+
+  if (q == 0) {
+    *qcoeff_ptr = 0;
+    *dqcoeff_ptr = 0;
+  } else {
+    const tran_low_t abs_dq = av1_dequant_abscoeff_nuq(q, dequant, dequant_val);
+    const tran_low_t signed_q = (q ^ sign_mask) - sign_mask;
+    *qcoeff_ptr = signed_q;
+    *dqcoeff_ptr = signed_q < 0 ? -abs_dq : abs_dq;
+  }
+  return q != 0;
+}
+
+// "fp" single-coefficient NUQ quantizer with an extra shift, logsizeby16.
+// Thresholds are rounded down by logsizeby16 bits before comparison, the
+// shift after the 64-bit multiply by quant is (16 - logsizeby16), and the
+// reconstruction value is rounded down by logsizeby16 bits.
+// Stores the signed level / reconstruction through qcoeff_ptr / dqcoeff_ptr
+// and returns 1 if the level is nonzero, else 0.
+static INLINE int quantize_coeff_bigtx_fp_nuq(
+    const tran_low_t coeffv, const int16_t quant, const int16_t dequant,
+    const tran_low_t *cuml_bins_ptr, const tran_low_t *dequant_val,
+    tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, int logsizeby16) {
+  const int value = coeffv;
+  // Sign mask: relies on >> of a negative int being an arithmetic shift.
+  const int sign_mask = value >> 31;
+  const int level =
+      clamp((value ^ sign_mask) - sign_mask, INT16_MIN, INT16_MAX);
+  int bin = 0;
+  int q;
+
+  // Locate the first knot whose scaled threshold lies above the magnitude.
+  while (bin < NUQ_KNOTS &&
+         level >= ROUND_POWER_OF_TWO(cuml_bins_ptr[bin], logsizeby16)) {
+    ++bin;
+  }
+
+  if (bin < NUQ_KNOTS) {
+    q = bin;
+  } else {
+    q = NUQ_KNOTS +
+        ((((int64_t)level -
+           ROUND_POWER_OF_TWO(cuml_bins_ptr[NUQ_KNOTS - 1], logsizeby16)) *
+          quant) >>
+         (16 - logsizeby16));
+  }
+
+  if (q == 0) {
+    *qcoeff_ptr = 0;
+    *dqcoeff_ptr = 0;
+  } else {
+    const tran_low_t abs_dq = ROUND_POWER_OF_TWO(
+        av1_dequant_abscoeff_nuq(q, dequant, dequant_val), logsizeby16);
+    const tran_low_t signed_q = (q ^ sign_mask) - sign_mask;
+    *qcoeff_ptr = signed_q;
+    *dqcoeff_ptr = signed_q < 0 ? -abs_dq : abs_dq;
+  }
+  return q != 0;
+}
+
+// DC-only NUQ quantization. Clears n_coeffs entries of qcoeff_ptr and
+// dqcoeff_ptr, then, unless skip_block is set, quantizes coeff_ptr[0] with
+// quantize_coeff_nuq() into element 0 of the outputs. *eob_ptr becomes 1
+// when that level is nonzero and 0 otherwise.
+void quantize_dc_nuq(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                     int skip_block, const int16_t quant,
+                     const int16_t quant_shift, const int16_t dequant,
+                     const tran_low_t *cuml_bins_ptr,
+                     const tran_low_t *dequant_val, tran_low_t *qcoeff_ptr,
+                     tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr) {
+  int dc_is_nonzero = 0;
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+  if (!skip_block) {
+    dc_is_nonzero =
+        quantize_coeff_nuq(coeff_ptr[0], quant, quant_shift, dequant,
+                           cuml_bins_ptr, dequant_val, qcoeff_ptr,
+                           dqcoeff_ptr) != 0;
+  }
+  *eob_ptr = dc_is_nonzero ? 1 : 0;
+}
+
+// DC-only "fp" NUQ quantization. Clears n_coeffs entries of both outputs,
+// then, unless skip_block is set, quantizes coeff_ptr[0] with
+// quantize_coeff_fp_nuq() into element 0 of the outputs. *eob_ptr becomes 1
+// when that level is nonzero and 0 otherwise.
+void quantize_dc_fp_nuq(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                        int skip_block, const int16_t quant,
+                        const int16_t dequant, const tran_low_t *cuml_bins_ptr,
+                        const tran_low_t *dequant_val, tran_low_t *qcoeff_ptr,
+                        tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr) {
+  int dc_is_nonzero = 0;
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+  if (!skip_block) {
+    dc_is_nonzero =
+        quantize_coeff_fp_nuq(coeff_ptr[0], quant, dequant, cuml_bins_ptr,
+                              dequant_val, qcoeff_ptr, dqcoeff_ptr) != 0;
+  }
+  *eob_ptr = dc_is_nonzero ? 1 : 0;
+}
+
+// DC-only NUQ quantization for 32x32 transforms. Clears n_coeffs entries of
+// both outputs, then, unless skip_block is set, quantizes coeff_ptr[0] with
+// quantize_coeff_bigtx_nuq() using av1_get_tx_scale(TX_32X32) as the shift.
+// *eob_ptr becomes 1 when that level is nonzero and 0 otherwise.
+void quantize_dc_32x32_nuq(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                           int skip_block, const int16_t quant,
+                           const int16_t quant_shift, const int16_t dequant,
+                           const tran_low_t *cuml_bins_ptr,
+                           const tran_low_t *dequant_val,
+                           tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                           uint16_t *eob_ptr) {
+  int dc_is_nonzero = 0;
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+  if (!skip_block) {
+    dc_is_nonzero = quantize_coeff_bigtx_nuq(
+                        coeff_ptr[0], quant, quant_shift, dequant,
+                        cuml_bins_ptr, dequant_val, qcoeff_ptr, dqcoeff_ptr,
+                        av1_get_tx_scale(TX_32X32)) != 0;
+  }
+  *eob_ptr = dc_is_nonzero ? 1 : 0;
+}
+
+// DC-only "fp" NUQ quantization for 32x32 transforms. Clears n_coeffs
+// entries of both outputs, then, unless skip_block is set, quantizes
+// coeff_ptr[0] with quantize_coeff_bigtx_fp_nuq() using
+// av1_get_tx_scale(TX_32X32) as the shift. *eob_ptr becomes 1 when that
+// level is nonzero and 0 otherwise.
+void quantize_dc_32x32_fp_nuq(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                              int skip_block, const int16_t quant,
+                              const int16_t dequant,
+                              const tran_low_t *cuml_bins_ptr,
+                              const tran_low_t *dequant_val,
+                              tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                              uint16_t *eob_ptr) {
+  int dc_is_nonzero = 0;
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+  if (!skip_block) {
+    dc_is_nonzero = quantize_coeff_bigtx_fp_nuq(
+                        coeff_ptr[0], quant, dequant, cuml_bins_ptr,
+                        dequant_val, qcoeff_ptr, dqcoeff_ptr,
+                        av1_get_tx_scale(TX_32X32)) != 0;
+  }
+  *eob_ptr = dc_is_nonzero ? 1 : 0;
+}
+
+#if CONFIG_TX64X64
+// DC-only NUQ quantization for 64x64 transforms. Clears n_coeffs entries of
+// both outputs, then, unless skip_block is set, quantizes coeff_ptr[0] with
+// quantize_coeff_bigtx_nuq() using av1_get_tx_scale(TX_64X64) as the shift.
+// *eob_ptr becomes 1 when that level is nonzero and 0 otherwise.
+void quantize_dc_64x64_nuq(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                           int skip_block, const int16_t quant,
+                           const int16_t quant_shift, const int16_t dequant,
+                           const tran_low_t *cuml_bins_ptr,
+                           const tran_low_t *dequant_val,
+                           tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                           uint16_t *eob_ptr) {
+  int dc_is_nonzero = 0;
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+  if (!skip_block) {
+    dc_is_nonzero = quantize_coeff_bigtx_nuq(
+                        coeff_ptr[0], quant, quant_shift, dequant,
+                        cuml_bins_ptr, dequant_val, qcoeff_ptr, dqcoeff_ptr,
+                        av1_get_tx_scale(TX_64X64)) != 0;
+  }
+  *eob_ptr = dc_is_nonzero ? 1 : 0;
+}
+
+// DC-only "fp" NUQ quantization for 64x64 transforms. Clears n_coeffs
+// entries of both outputs, then, unless skip_block is set, quantizes
+// coeff_ptr[0] with quantize_coeff_bigtx_fp_nuq() using
+// av1_get_tx_scale(TX_64X64) as the shift. *eob_ptr becomes 1 when that
+// level is nonzero and 0 otherwise.
+void quantize_dc_64x64_fp_nuq(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                              int skip_block, const int16_t quant,
+                              const int16_t dequant,
+                              const tran_low_t *cuml_bins_ptr,
+                              const tran_low_t *dequant_val,
+                              tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                              uint16_t *eob_ptr) {
+  int dc_is_nonzero = 0;
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+  if (!skip_block) {
+    dc_is_nonzero = quantize_coeff_bigtx_fp_nuq(
+                        coeff_ptr[0], quant, dequant, cuml_bins_ptr,
+                        dequant_val, qcoeff_ptr, dqcoeff_ptr,
+                        av1_get_tx_scale(TX_64X64)) != 0;
+  }
+  *eob_ptr = dc_is_nonzero ? 1 : 0;
+}
+#endif // CONFIG_TX64X64
+
+// C implementation of block NUQ quantization. Clears n_coeffs entries of
+// both outputs; unless skip_block is set, walks the coefficients in scan
+// order and quantizes each with quantize_coeff_nuq(). Index 0 of quant_ptr /
+// quant_shift_ptr / dequant_ptr is used for raster position 0 and index 1
+// for every other position; band[] selects the per-band bin and dequant
+// tables. *eob_ptr receives one plus the last scan position that produced a
+// nonzero level (0 if none did).
+void quantize_nuq_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                    int skip_block, const int16_t *quant_ptr,
+                    const int16_t *quant_shift_ptr, const int16_t *dequant_ptr,
+                    const cuml_bins_type_nuq *cuml_bins_ptr,
+                    const dequant_val_type_nuq *dequant_val,
+                    tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                    uint16_t *eob_ptr, const int16_t *scan,
+                    const uint8_t *band) {
+  int pos;
+  int num_coded = 0;
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+  if (skip_block) {
+    *eob_ptr = 0;
+    return;
+  }
+  for (pos = 0; pos < n_coeffs; ++pos) {
+    const int rc = scan[pos];
+    const int is_ac = (rc != 0);
+    const int b = band[pos];
+    if (quantize_coeff_nuq(coeff_ptr[rc], quant_ptr[is_ac],
+                           quant_shift_ptr[is_ac], dequant_ptr[is_ac],
+                           cuml_bins_ptr[b], dequant_val[b], &qcoeff_ptr[rc],
+                           &dqcoeff_ptr[rc])) {
+      num_coded = pos + 1;
+    }
+  }
+  *eob_ptr = num_coded;
+}
+
+// C implementation of block "fp" NUQ quantization. Clears n_coeffs entries
+// of both outputs; unless skip_block is set, walks the coefficients in scan
+// order and quantizes each with quantize_coeff_fp_nuq(). Index 0 of
+// quant_ptr / dequant_ptr is used for raster position 0 and index 1 for
+// every other position; band[] selects the per-band tables. *eob_ptr
+// receives one plus the last scan position that produced a nonzero level
+// (0 if none did).
+void quantize_fp_nuq_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                       int skip_block, const int16_t *quant_ptr,
+                       const int16_t *dequant_ptr,
+                       const cuml_bins_type_nuq *cuml_bins_ptr,
+                       const dequant_val_type_nuq *dequant_val,
+                       tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                       uint16_t *eob_ptr, const int16_t *scan,
+                       const uint8_t *band) {
+  int pos;
+  int num_coded = 0;
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+  if (skip_block) {
+    *eob_ptr = 0;
+    return;
+  }
+  for (pos = 0; pos < n_coeffs; ++pos) {
+    const int rc = scan[pos];
+    const int is_ac = (rc != 0);
+    const int b = band[pos];
+    if (quantize_coeff_fp_nuq(coeff_ptr[rc], quant_ptr[is_ac],
+                              dequant_ptr[is_ac], cuml_bins_ptr[b],
+                              dequant_val[b], &qcoeff_ptr[rc],
+                              &dqcoeff_ptr[rc])) {
+      num_coded = pos + 1;
+    }
+  }
+  *eob_ptr = num_coded;
+}
+
+// C implementation of block NUQ quantization for 32x32 transforms. Same
+// scan-order walk as the generic version, but each coefficient goes through
+// quantize_coeff_bigtx_nuq() with av1_get_tx_scale(TX_32X32) as the shift.
+// *eob_ptr receives one plus the last scan position that produced a nonzero
+// level (0 if none did, or if skip_block is set).
+void quantize_32x32_nuq_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                          int skip_block, const int16_t *quant_ptr,
+                          const int16_t *quant_shift_ptr,
+                          const int16_t *dequant_ptr,
+                          const cuml_bins_type_nuq *cuml_bins_ptr,
+                          const dequant_val_type_nuq *dequant_val,
+                          tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                          uint16_t *eob_ptr, const int16_t *scan,
+                          const uint8_t *band) {
+  int pos;
+  int num_coded = 0;
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+  if (skip_block) {
+    *eob_ptr = 0;
+    return;
+  }
+  for (pos = 0; pos < n_coeffs; ++pos) {
+    const int rc = scan[pos];
+    const int is_ac = (rc != 0);
+    const int b = band[pos];
+    if (quantize_coeff_bigtx_nuq(coeff_ptr[rc], quant_ptr[is_ac],
+                                 quant_shift_ptr[is_ac], dequant_ptr[is_ac],
+                                 cuml_bins_ptr[b], dequant_val[b],
+                                 &qcoeff_ptr[rc], &dqcoeff_ptr[rc],
+                                 av1_get_tx_scale(TX_32X32))) {
+      num_coded = pos + 1;
+    }
+  }
+  *eob_ptr = num_coded;
+}
+
+// C implementation of block "fp" NUQ quantization for 32x32 transforms.
+// Each coefficient, visited in scan order, goes through
+// quantize_coeff_bigtx_fp_nuq() with av1_get_tx_scale(TX_32X32) as the
+// shift. *eob_ptr receives one plus the last scan position that produced a
+// nonzero level (0 if none did, or if skip_block is set).
+void quantize_32x32_fp_nuq_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                             int skip_block, const int16_t *quant_ptr,
+                             const int16_t *dequant_ptr,
+                             const cuml_bins_type_nuq *cuml_bins_ptr,
+                             const dequant_val_type_nuq *dequant_val,
+                             tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                             uint16_t *eob_ptr, const int16_t *scan,
+                             const uint8_t *band) {
+  int pos;
+  int num_coded = 0;
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+  if (skip_block) {
+    *eob_ptr = 0;
+    return;
+  }
+  for (pos = 0; pos < n_coeffs; ++pos) {
+    const int rc = scan[pos];
+    const int is_ac = (rc != 0);
+    const int b = band[pos];
+    if (quantize_coeff_bigtx_fp_nuq(coeff_ptr[rc], quant_ptr[is_ac],
+                                    dequant_ptr[is_ac], cuml_bins_ptr[b],
+                                    dequant_val[b], &qcoeff_ptr[rc],
+                                    &dqcoeff_ptr[rc],
+                                    av1_get_tx_scale(TX_32X32))) {
+      num_coded = pos + 1;
+    }
+  }
+  *eob_ptr = num_coded;
+}
+
+#if CONFIG_TX64X64
+// C implementation of block NUQ quantization for 64x64 transforms. Each
+// coefficient, visited in scan order, goes through
+// quantize_coeff_bigtx_nuq() with av1_get_tx_scale(TX_64X64) as the shift.
+// *eob_ptr receives one plus the last scan position that produced a nonzero
+// level (0 if none did, or if skip_block is set).
+void quantize_64x64_nuq_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                          int skip_block, const int16_t *quant_ptr,
+                          const int16_t *quant_shift_ptr,
+                          const int16_t *dequant_ptr,
+                          const cuml_bins_type_nuq *cuml_bins_ptr,
+                          const dequant_val_type_nuq *dequant_val,
+                          tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                          uint16_t *eob_ptr, const int16_t *scan,
+                          const uint8_t *band) {
+  int pos;
+  int num_coded = 0;
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+  if (skip_block) {
+    *eob_ptr = 0;
+    return;
+  }
+  for (pos = 0; pos < n_coeffs; ++pos) {
+    const int rc = scan[pos];
+    const int is_ac = (rc != 0);
+    const int b = band[pos];
+    if (quantize_coeff_bigtx_nuq(coeff_ptr[rc], quant_ptr[is_ac],
+                                 quant_shift_ptr[is_ac], dequant_ptr[is_ac],
+                                 cuml_bins_ptr[b], dequant_val[b],
+                                 &qcoeff_ptr[rc], &dqcoeff_ptr[rc],
+                                 av1_get_tx_scale(TX_64X64))) {
+      num_coded = pos + 1;
+    }
+  }
+  *eob_ptr = num_coded;
+}
+
+// C implementation of block "fp" NUQ quantization for 64x64 transforms.
+// Each coefficient, visited in scan order, goes through
+// quantize_coeff_bigtx_fp_nuq() with av1_get_tx_scale(TX_64X64) as the
+// shift. *eob_ptr receives one plus the last scan position that produced a
+// nonzero level (0 if none did, or if skip_block is set).
+void quantize_64x64_fp_nuq_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                             int skip_block, const int16_t *quant_ptr,
+                             const int16_t *dequant_ptr,
+                             const cuml_bins_type_nuq *cuml_bins_ptr,
+                             const dequant_val_type_nuq *dequant_val,
+                             tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                             uint16_t *eob_ptr, const int16_t *scan,
+                             const uint8_t *band) {
+  int pos;
+  int num_coded = 0;
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+  if (skip_block) {
+    *eob_ptr = 0;
+    return;
+  }
+  for (pos = 0; pos < n_coeffs; ++pos) {
+    const int rc = scan[pos];
+    const int is_ac = (rc != 0);
+    const int b = band[pos];
+    if (quantize_coeff_bigtx_fp_nuq(coeff_ptr[rc], quant_ptr[is_ac],
+                                    dequant_ptr[is_ac], cuml_bins_ptr[b],
+                                    dequant_val[b], &qcoeff_ptr[rc],
+                                    &dqcoeff_ptr[rc],
+                                    av1_get_tx_scale(TX_64X64))) {
+      num_coded = pos + 1;
+    }
+  }
+  *eob_ptr = num_coded;
+}
+#endif // CONFIG_TX64X64
+#endif // CONFIG_NEW_QUANT
+
+// Produces an all-zero quantization result: clears n_coeffs entries of
+// qcoeff_ptr and dqcoeff_ptr and reports an end-of-block position of 0.
+void av1_quantize_skip(intptr_t n_coeffs, tran_low_t *qcoeff_ptr,
+                       tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr) {
+  const size_t q_bytes = n_coeffs * sizeof(*qcoeff_ptr);
+  const size_t dq_bytes = n_coeffs * sizeof(*dqcoeff_ptr);
+
+  memset(qcoeff_ptr, 0, q_bytes);
+  memset(dqcoeff_ptr, 0, dq_bytes);
+  *eob_ptr = 0;
+}
+
+// Shared C implementation of the "fp" quantizer.
+//
+// Clears n_coeffs entries of qcoeff_ptr and dqcoeff_ptr, then (unless
+// skip_block is set) visits the coefficients in scan order. A coefficient is
+// quantized only when its magnitude reaches the dequant-derived threshold;
+// otherwise its outputs stay zero. Index 0 of round_ptr / quant_ptr /
+// dequant_ptr is used for raster position 0 and index 1 for all others.
+// log_scale scales the rounding offset, the quantization shift and the
+// reconstruction divisor. With CONFIG_AOM_QM, qm_ptr / iqm_ptr supply
+// per-position weights for quantization / dequantization.
+//
+// zbin_ptr, quant_shift_ptr and iscan are accepted for signature
+// compatibility and are not used.
+//
+// *eob_ptr receives one plus the last scan position whose quantized
+// magnitude is nonzero (0 if there is none).
+static void quantize_fp_helper_c(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
+    const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr,
+    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+    const int16_t *scan, const int16_t *iscan,
+#if CONFIG_AOM_QM
+    const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr,
+#endif
+    int log_scale) {
+  int i, eob = -1;
+  // TODO(jingning) Decide the need of these arguments after the
+  // quantization process is completed.
+  (void)zbin_ptr;
+  (void)quant_shift_ptr;
+  (void)iscan;
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  if (!skip_block) {
+    // Quantization pass over all coefficients in scan order.
+    for (i = 0; i < n_coeffs; i++) {
+      const int rc = scan[i];
+      const int coeff = coeff_ptr[rc];
+#if CONFIG_AOM_QM
+      const qm_val_t wt = qm_ptr[rc];
+      const qm_val_t iwt = iqm_ptr[rc];
+      const int dequant =
+          (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >>
+          AOM_QM_BITS;
+#endif
+      const int coeff_sign = (coeff >> 31);
+      int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+      int tmp32 = 0;
+#if CONFIG_AOM_QM
+      if (abs_coeff * wt >=
+          (dequant_ptr[rc != 0] << (AOM_QM_BITS - (1 + log_scale)))) {
+#else
+      if (abs_coeff >= (dequant_ptr[rc != 0] >> (1 + log_scale))) {
+#endif
+        abs_coeff += ROUND_POWER_OF_TWO(round_ptr[rc != 0], log_scale);
+        abs_coeff = clamp(abs_coeff, INT16_MIN, INT16_MAX);
+#if CONFIG_AOM_QM
+        // abs_coeff can be as large as INT16_MAX and quant_ptr[] is int16_t,
+        // so their product alone approaches 2^30; multiplying by the weight
+        // as well overflows int. Evaluate the product in 64 bits.
+        tmp32 = (int)(((int64_t)abs_coeff * wt * quant_ptr[rc != 0]) >>
+                      ((16 - log_scale) + AOM_QM_BITS));
+        qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign;
+        dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant / (1 << log_scale);
+#else
+        tmp32 = (int)((abs_coeff * quant_ptr[rc != 0]) >> (16 - log_scale));
+        qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign;
+        dqcoeff_ptr[rc] =
+            qcoeff_ptr[rc] * dequant_ptr[rc != 0] / (1 << log_scale);
+#endif
+      }
+
+      if (tmp32) eob = i;
+    }
+  }
+  *eob_ptr = eob + 1;
+}
+
+// C entry point for the "fp" quantizer at log_scale 0: forwards every
+// argument unchanged to quantize_fp_helper_c() and passes 0 as log_scale.
+// The qm_ptr / iqm_ptr parameters exist only when CONFIG_AOM_QM is enabled.
+void av1_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ int skip_block, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+ uint16_t *eob_ptr, const int16_t *scan,
+ const int16_t *iscan
+#if CONFIG_AOM_QM
+ ,
+ const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr
+#endif
+ ) {
+ quantize_fp_helper_c(coeff_ptr, n_coeffs, skip_block, zbin_ptr, round_ptr,
+ quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr,
+ dequant_ptr, eob_ptr, scan, iscan,
+#if CONFIG_AOM_QM
+ qm_ptr, iqm_ptr,
+#endif
+ 0);
+}
+
+// C entry point for the 32x32 "fp" quantizer: forwards every argument
+// unchanged to quantize_fp_helper_c() and passes 1 as log_scale.
+// The qm_ptr / iqm_ptr parameters exist only when CONFIG_AOM_QM is enabled.
+void av1_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ int skip_block, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan
+#if CONFIG_AOM_QM
+ ,
+ const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr
+#endif
+ ) {
+ quantize_fp_helper_c(coeff_ptr, n_coeffs, skip_block, zbin_ptr, round_ptr,
+ quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr,
+ dequant_ptr, eob_ptr, scan, iscan,
+#if CONFIG_AOM_QM
+ qm_ptr, iqm_ptr,
+#endif
+ 1);
+}
+
+#if CONFIG_TX64X64
+// C entry point for the 64x64 "fp" quantizer: forwards every argument
+// unchanged to quantize_fp_helper_c() and passes 2 as log_scale.
+// The qm_ptr / iqm_ptr parameters exist only when CONFIG_AOM_QM is enabled.
+void av1_quantize_fp_64x64_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ int skip_block, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan
+#if CONFIG_AOM_QM
+ ,
+ const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr
+#endif
+ ) {
+ quantize_fp_helper_c(coeff_ptr, n_coeffs, skip_block, zbin_ptr, round_ptr,
+ quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr,
+ dequant_ptr, eob_ptr, scan, iscan,
+#if CONFIG_AOM_QM
+ qm_ptr, iqm_ptr,
+#endif
+ 2);
+}
+#endif // CONFIG_TX64X64
+
+// Facade for the "fp" quantizer: unpacks the quantizer tables from p
+// (zbin, round_fp, quant_fp, quant_shift), the dequant table from pd and the
+// scan tables from sc, then dispatches on qparam->log_scale:
+//   0 -> quantize_fp_helper_c() when n_coeffs < 16, else av1_quantize_fp()
+//   1 -> av1_quantize_fp_32x32()
+//   2 -> av1_quantize_fp_64x64() (only when CONFIG_TX64X64 is enabled)
+// Any other value reaches assert(0). skip_block is always passed as 0.
+// With CONFIG_AOM_QM, qparam->qmatrix / qparam->iqmatrix are forwarded too.
+void av1_quantize_fp_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr,
+ const MACROBLOCKD_PLANE *pd,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+ const SCAN_ORDER *sc, const QUANT_PARAM *qparam) {
+ // obsolete skip_block
+ const int skip_block = 0;
+#if CONFIG_AOM_QM
+ const qm_val_t *qm_ptr = qparam->qmatrix;
+ const qm_val_t *iqm_ptr = qparam->iqmatrix;
+#endif // CONFIG_AOM_QM
+
+ switch (qparam->log_scale) {
+ case 0:
+ if (n_coeffs < 16) {
+ // TODO(jingning): Need SIMD implementation for smaller block size
+ // quantization.
+ quantize_fp_helper_c(coeff_ptr, n_coeffs, skip_block, p->zbin,
+ p->round_fp, p->quant_fp, p->quant_shift,
+ qcoeff_ptr, dqcoeff_ptr, pd->dequant, eob_ptr,
+ sc->scan, sc->iscan,
+#if CONFIG_AOM_QM
+ qm_ptr, iqm_ptr,
+#endif
+ qparam->log_scale);
+ } else {
+ av1_quantize_fp(coeff_ptr, n_coeffs, skip_block, p->zbin, p->round_fp,
+ p->quant_fp, p->quant_shift, qcoeff_ptr, dqcoeff_ptr,
+ pd->dequant, eob_ptr, sc->scan, sc->iscan
+#if CONFIG_AOM_QM
+ ,
+ qm_ptr, iqm_ptr
+#endif
+ );
+ }
+ break;
+ case 1:
+ av1_quantize_fp_32x32(coeff_ptr, n_coeffs, skip_block, p->zbin,
+ p->round_fp, p->quant_fp, p->quant_shift,
+ qcoeff_ptr, dqcoeff_ptr, pd->dequant, eob_ptr,
+ sc->scan, sc->iscan
+#if CONFIG_AOM_QM
+ ,
+ qm_ptr, iqm_ptr
+#endif
+ );
+ break;
+#if CONFIG_TX64X64
+ case 2:
+ av1_quantize_fp_64x64(coeff_ptr, n_coeffs, skip_block, p->zbin,
+ p->round_fp, p->quant_fp, p->quant_shift,
+ qcoeff_ptr, dqcoeff_ptr, pd->dequant, eob_ptr,
+ sc->scan, sc->iscan
+#if CONFIG_AOM_QM
+ ,
+ qm_ptr, iqm_ptr
+#endif
+ );
+ break;
+#endif // CONFIG_TX64X64
+ default: assert(0);
+ }
+}
+
+// Facade for the "b" quantizer: unpacks the quantizer tables from p
+// (zbin, round, quant, quant_shift), the dequant table from pd and the scan
+// tables from sc, then dispatches on qparam->log_scale:
+//   0 -> aom_quantize_b()
+//   1 -> aom_quantize_b_32x32()
+//   2 -> aom_quantize_b_64x64() (only when CONFIG_TX64X64 is enabled)
+// Any other value reaches assert(0). skip_block is always passed as 0.
+// With CONFIG_AOM_QM, qparam->qmatrix / qparam->iqmatrix are forwarded too.
+void av1_quantize_b_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr,
+ const MACROBLOCKD_PLANE *pd, tran_low_t *dqcoeff_ptr,
+ uint16_t *eob_ptr, const SCAN_ORDER *sc,
+ const QUANT_PARAM *qparam) {
+ // obsolete skip_block
+ const int skip_block = 0;
+#if CONFIG_AOM_QM
+ const qm_val_t *qm_ptr = qparam->qmatrix;
+ const qm_val_t *iqm_ptr = qparam->iqmatrix;
+#endif // CONFIG_AOM_QM
+
+ switch (qparam->log_scale) {
+ case 0:
+ aom_quantize_b(coeff_ptr, n_coeffs, skip_block, p->zbin, p->round,
+ p->quant, p->quant_shift, qcoeff_ptr, dqcoeff_ptr,
+ pd->dequant, eob_ptr, sc->scan, sc->iscan
+#if CONFIG_AOM_QM
+ ,
+ qm_ptr, iqm_ptr
+#endif
+ );
+ break;
+ case 1:
+ aom_quantize_b_32x32(coeff_ptr, n_coeffs, skip_block, p->zbin, p->round,
+ p->quant, p->quant_shift, qcoeff_ptr, dqcoeff_ptr,
+ pd->dequant, eob_ptr, sc->scan, sc->iscan
+#if CONFIG_AOM_QM
+ ,
+ qm_ptr, iqm_ptr
+#endif
+ );
+ break;
+#if CONFIG_TX64X64
+ case 2:
+ aom_quantize_b_64x64(coeff_ptr, n_coeffs, skip_block, p->zbin, p->round,
+ p->quant, p->quant_shift, qcoeff_ptr, dqcoeff_ptr,
+ pd->dequant, eob_ptr, sc->scan, sc->iscan
+#if CONFIG_AOM_QM
+ ,
+ qm_ptr, iqm_ptr
+#endif
+ );
+ break;
+#endif // CONFIG_TX64X64
+ default: assert(0);
+ }
+}
+
+// Facade for DC-only quantization. Takes the rounding table and DC quantizer
+// (p->round, p->quant_fp[0]) from p and the DC dequantizer (pd->dequant[0])
+// from pd, then dispatches on qparam->log_scale:
+//   0 -> aom_quantize_dc()
+//   1 -> aom_quantize_dc_32x32()
+//   2 -> aom_quantize_dc_64x64() (only when CONFIG_TX64X64 is enabled)
+// Any other value reaches assert(0). skip_block is always passed as 0 and
+// sc is unused. With CONFIG_AOM_QM, qparam->qmatrix / qparam->iqmatrix are
+// forwarded too.
+void av1_quantize_dc_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                            const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr,
+                            const MACROBLOCKD_PLANE *pd,
+                            tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+                            const SCAN_ORDER *sc, const QUANT_PARAM *qparam) {
+  // obsolete skip_block
+  const int skip_block = 0;
+#if CONFIG_AOM_QM
+  const qm_val_t *qm_ptr = qparam->qmatrix;
+  const qm_val_t *iqm_ptr = qparam->iqmatrix;
+#endif  // CONFIG_AOM_QM
+
+  (void)sc;
+
+  switch (qparam->log_scale) {
+    case 0:
+      aom_quantize_dc(coeff_ptr, (int)n_coeffs, skip_block, p->round,
+                      p->quant_fp[0], qcoeff_ptr, dqcoeff_ptr, pd->dequant[0],
+                      eob_ptr
+#if CONFIG_AOM_QM
+                      ,
+                      qm_ptr, iqm_ptr
+#endif
+                      );
+      break;
+    case 1:
+      aom_quantize_dc_32x32(coeff_ptr, skip_block, p->round, p->quant_fp[0],
+                            qcoeff_ptr, dqcoeff_ptr, pd->dequant[0], eob_ptr
+#if CONFIG_AOM_QM
+                            ,
+                            qm_ptr, iqm_ptr
+#endif
+                            );
+      break;
+#if CONFIG_TX64X64
+    // The label has to precede the call: placed after it, the call is
+    // unreachable and log_scale == 2 leaves the outputs unwritten.
+    case 2:
+      aom_quantize_dc_64x64(coeff_ptr, skip_block, p->round, p->quant_fp[0],
+                            qcoeff_ptr, dqcoeff_ptr, pd->dequant[0], eob_ptr
+#if CONFIG_AOM_QM
+                            ,
+                            qm_ptr, iqm_ptr
+#endif
+                            );
+      break;
+#endif  // CONFIG_TX64X64
+    default: assert(0);
+  }
+}
+
+#if CONFIG_NEW_QUANT
+// Facade for the "b" NUQ quantizer. Looks up the band table for
+// qparam->tx_size, selects the NUQ bin / dequant tables with qparam->dq, and
+// dispatches on qparam->log_scale:
+//   0 -> quantize_nuq()
+//   1 -> quantize_32x32_nuq()
+//   2 -> quantize_64x64_nuq() (only when CONFIG_TX64X64 is enabled)
+// Any other value reaches assert(0). skip_block is always passed as 0.
+void av1_quantize_b_nuq_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                               const MACROBLOCK_PLANE *p,
+                               tran_low_t *qcoeff_ptr,
+                               const MACROBLOCKD_PLANE *pd,
+                               tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+                               const SCAN_ORDER *sc,
+                               const QUANT_PARAM *qparam) {
+  // obsolete skip_block
+  const int skip_block = 0;
+  const uint8_t *band_lut = get_band_translate(qparam->tx_size);
+  int dq_idx = qparam->dq;
+
+  if (qparam->log_scale == 0) {
+    quantize_nuq(coeff_ptr, n_coeffs, skip_block, p->quant, p->quant_shift,
+                 pd->dequant,
+                 (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq_idx],
+                 (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq_idx],
+                 qcoeff_ptr, dqcoeff_ptr, eob_ptr, sc->scan, band_lut);
+  } else if (qparam->log_scale == 1) {
+    quantize_32x32_nuq(
+        coeff_ptr, n_coeffs, skip_block, p->quant, p->quant_shift,
+        pd->dequant, (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq_idx],
+        (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq_idx], qcoeff_ptr,
+        dqcoeff_ptr, eob_ptr, sc->scan, band_lut);
+#if CONFIG_TX64X64
+  } else if (qparam->log_scale == 2) {
+    quantize_64x64_nuq(
+        coeff_ptr, n_coeffs, skip_block, p->quant, p->quant_shift,
+        pd->dequant, (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq_idx],
+        (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq_idx], qcoeff_ptr,
+        dqcoeff_ptr, eob_ptr, sc->scan, band_lut);
+#endif  // CONFIG_TX64X64
+  } else {
+    assert(0);
+  }
+}
+
+// Facade for the "fp" NUQ quantizer. Looks up the band table for
+// qparam->tx_size, selects the NUQ bin / dequant tables with qparam->dq, and
+// dispatches on qparam->log_scale:
+//   0 -> quantize_fp_nuq()
+//   1 -> quantize_32x32_fp_nuq()
+//   2 -> quantize_64x64_fp_nuq() (only when CONFIG_TX64X64 is enabled)
+// Any other value reaches assert(0). skip_block is always passed as 0.
+void av1_quantize_fp_nuq_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                                const MACROBLOCK_PLANE *p,
+                                tran_low_t *qcoeff_ptr,
+                                const MACROBLOCKD_PLANE *pd,
+                                tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+                                const SCAN_ORDER *sc,
+                                const QUANT_PARAM *qparam) {
+  // obsolete skip_block
+  const int skip_block = 0;
+  const uint8_t *band_lut = get_band_translate(qparam->tx_size);
+  int dq_idx = qparam->dq;
+
+  if (qparam->log_scale == 0) {
+    quantize_fp_nuq(coeff_ptr, n_coeffs, skip_block, p->quant_fp, pd->dequant,
+                    (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq_idx],
+                    (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq_idx],
+                    qcoeff_ptr, dqcoeff_ptr, eob_ptr, sc->scan, band_lut);
+  } else if (qparam->log_scale == 1) {
+    quantize_32x32_fp_nuq(
+        coeff_ptr, n_coeffs, skip_block, p->quant_fp, pd->dequant,
+        (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq_idx],
+        (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq_idx], qcoeff_ptr,
+        dqcoeff_ptr, eob_ptr, sc->scan, band_lut);
+#if CONFIG_TX64X64
+  } else if (qparam->log_scale == 2) {
+    quantize_64x64_fp_nuq(
+        coeff_ptr, n_coeffs, skip_block, p->quant_fp, pd->dequant,
+        (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq_idx],
+        (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq_idx], qcoeff_ptr,
+        dqcoeff_ptr, eob_ptr, sc->scan, band_lut);
+#endif  // CONFIG_TX64X64
+  } else {
+    assert(0);
+  }
+}
+
+// Facade for DC-only NUQ quantization. Uses the DC entries p->quant_fp[0]
+// and pd->dequant[0] together with row 0 of the NUQ tables selected by
+// qparam->dq, and dispatches on qparam->log_scale:
+//   0 -> quantize_dc_fp_nuq()
+//   1 -> quantize_dc_32x32_fp_nuq()
+//   2 -> quantize_dc_64x64_fp_nuq() (only when CONFIG_TX64X64 is enabled)
+// Any other value reaches assert(0). skip_block is always passed as 0 and
+// sc is unused.
+void av1_quantize_dc_nuq_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                                const MACROBLOCK_PLANE *p,
+                                tran_low_t *qcoeff_ptr,
+                                const MACROBLOCKD_PLANE *pd,
+                                tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+                                const SCAN_ORDER *sc,
+                                const QUANT_PARAM *qparam) {
+  // obsolete skip_block
+  const int skip_block = 0;
+  int dq_idx = qparam->dq;
+  (void)sc;
+
+  if (qparam->log_scale == 0) {
+    quantize_dc_fp_nuq(coeff_ptr, n_coeffs, skip_block, p->quant_fp[0],
+                       pd->dequant[0], p->cuml_bins_nuq[dq_idx][0],
+                       pd->dequant_val_nuq[dq_idx][0], qcoeff_ptr, dqcoeff_ptr,
+                       eob_ptr);
+  } else if (qparam->log_scale == 1) {
+    quantize_dc_32x32_fp_nuq(coeff_ptr, n_coeffs, skip_block, p->quant_fp[0],
+                             pd->dequant[0], p->cuml_bins_nuq[dq_idx][0],
+                             pd->dequant_val_nuq[dq_idx][0], qcoeff_ptr,
+                             dqcoeff_ptr, eob_ptr);
+#if CONFIG_TX64X64
+  } else if (qparam->log_scale == 2) {
+    quantize_dc_64x64_fp_nuq(coeff_ptr, n_coeffs, skip_block, p->quant_fp[0],
+                             pd->dequant[0], p->cuml_bins_nuq[dq_idx][0],
+                             pd->dequant_val_nuq[dq_idx][0], qcoeff_ptr,
+                             dqcoeff_ptr, eob_ptr);
+#endif  // CONFIG_TX64X64
+  } else {
+    assert(0);
+  }
+}
+#endif // CONFIG_NEW_QUANT
+
+#if CONFIG_HIGHBITDEPTH
+// High-bit-depth facade for the "fp" quantizer. Blocks with fewer than 16
+// coefficients go to the C routine av1_highbd_quantize_fp_c(); all others go
+// to av1_highbd_quantize_fp(). Both receive the same arguments, including
+// qparam->log_scale; skip_block is always passed as 0. With CONFIG_AOM_QM,
+// qparam->qmatrix / qparam->iqmatrix are forwarded too.
+void av1_highbd_quantize_fp_facade(const tran_low_t *coeff_ptr,
+                                   intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
+                                   tran_low_t *qcoeff_ptr,
+                                   const MACROBLOCKD_PLANE *pd,
+                                   tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+                                   const SCAN_ORDER *sc,
+                                   const QUANT_PARAM *qparam) {
+  // obsolete skip_block
+  const int skip_block = 0;
+#if CONFIG_AOM_QM
+  const qm_val_t *qm_ptr = qparam->qmatrix;
+  const qm_val_t *iqm_ptr = qparam->iqmatrix;
+#endif  // CONFIG_AOM_QM
+
+  if (n_coeffs >= 16) {
+    av1_highbd_quantize_fp(coeff_ptr, n_coeffs, skip_block, p->zbin,
+                           p->round_fp, p->quant_fp, p->quant_shift,
+                           qcoeff_ptr, dqcoeff_ptr, pd->dequant, eob_ptr,
+                           sc->scan, sc->iscan,
+#if CONFIG_AOM_QM
+                           qm_ptr, iqm_ptr,
+#endif
+                           qparam->log_scale);
+  } else {
+    // TODO(jingning): Need SIMD implementation for smaller block size
+    // quantization.
+    av1_highbd_quantize_fp_c(coeff_ptr, n_coeffs, skip_block, p->zbin,
+                             p->round_fp, p->quant_fp, p->quant_shift,
+                             qcoeff_ptr, dqcoeff_ptr, pd->dequant, eob_ptr,
+                             sc->scan, sc->iscan,
+#if CONFIG_AOM_QM
+                             qm_ptr, iqm_ptr,
+#endif
+                             qparam->log_scale);
+  }
+}
+
+// High-bit-depth facade for the "b" quantizer: unpacks the quantizer tables
+// from p (zbin, round, quant, quant_shift), the dequant table from pd and the
+// scan tables from sc, then dispatches on qparam->log_scale:
+//   0 -> aom_highbd_quantize_b()
+//   1 -> aom_highbd_quantize_b_32x32()
+//   2 -> aom_highbd_quantize_b_64x64() (only when CONFIG_TX64X64 is enabled)
+// Any other value reaches assert(0). skip_block is always passed as 0.
+// With CONFIG_AOM_QM, qparam->qmatrix / qparam->iqmatrix are forwarded too.
+void av1_highbd_quantize_b_facade(const tran_low_t *coeff_ptr,
+ intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
+ tran_low_t *qcoeff_ptr,
+ const MACROBLOCKD_PLANE *pd,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+ const SCAN_ORDER *sc,
+ const QUANT_PARAM *qparam) {
+ // obsolete skip_block
+ const int skip_block = 0;
+#if CONFIG_AOM_QM
+ const qm_val_t *qm_ptr = qparam->qmatrix;
+ const qm_val_t *iqm_ptr = qparam->iqmatrix;
+#endif // CONFIG_AOM_QM
+
+ switch (qparam->log_scale) {
+ case 0:
+ aom_highbd_quantize_b(coeff_ptr, n_coeffs, skip_block, p->zbin, p->round,
+ p->quant, p->quant_shift, qcoeff_ptr, dqcoeff_ptr,
+ pd->dequant, eob_ptr, sc->scan, sc->iscan
+#if CONFIG_AOM_QM
+ ,
+ qm_ptr, iqm_ptr
+#endif
+ );
+ break;
+ case 1:
+ aom_highbd_quantize_b_32x32(coeff_ptr, n_coeffs, skip_block, p->zbin,
+ p->round, p->quant, p->quant_shift,
+ qcoeff_ptr, dqcoeff_ptr, pd->dequant, eob_ptr,
+ sc->scan, sc->iscan
+#if CONFIG_AOM_QM
+ ,
+ qm_ptr, iqm_ptr
+#endif
+ );
+ break;
+#if CONFIG_TX64X64
+ case 2:
+ aom_highbd_quantize_b_64x64(coeff_ptr, n_coeffs, skip_block, p->zbin,
+ p->round, p->quant, p->quant_shift,
+ qcoeff_ptr, dqcoeff_ptr, pd->dequant, eob_ptr,
+ sc->scan, sc->iscan
+#if CONFIG_AOM_QM
+ ,
+ qm_ptr, iqm_ptr
+#endif
+ );
+ break;
+#endif // CONFIG_TX64X64
+ default: assert(0);
+ }
+}
+
+#if CONFIG_HIGHBITDEPTH
+// High-bit-depth DC-only quantizer. Clears n_coeffs entries of both outputs;
+// unless skip_block is set, adds round_ptr[0] to the magnitude of
+// coeff_ptr[0], multiplies by quant in 64 bits and shifts right by
+// (16 - log_scale) to get the level. The reconstruction is the signed level
+// times dequant_ptr divided by (1 << log_scale). *eob_ptr is 1 when the
+// level is nonzero and 0 otherwise. qm_ptr / iqm_ptr (CONFIG_AOM_QM only)
+// are accepted but unused.
+static INLINE void highbd_quantize_dc(
+    const tran_low_t *coeff_ptr, int n_coeffs, int skip_block,
+    const int16_t *round_ptr, const int16_t quant, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr, uint16_t *eob_ptr,
+#if CONFIG_AOM_QM
+    const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr,
+#endif
+    const int log_scale) {
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+#if CONFIG_AOM_QM
+  (void)qm_ptr;
+  (void)iqm_ptr;
+#endif
+  if (skip_block) {
+    *eob_ptr = 0;
+  } else {
+    const int dc = coeff_ptr[0];
+    // Sign mask: relies on >> of a negative int being an arithmetic shift.
+    const int sign_mask = dc >> 31;
+    const int magnitude = (dc ^ sign_mask) - sign_mask;
+    const int64_t rounded = magnitude + round_ptr[0];
+    const uint32_t abs_level =
+        (uint32_t)((rounded * quant) >> (16 - log_scale));
+    qcoeff_ptr[0] = (tran_low_t)((abs_level ^ sign_mask) - sign_mask);
+    dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant_ptr / (1 << log_scale);
+    *eob_ptr = abs_level ? 1 : 0;
+  }
+}
+#endif // CONFIG_HIGHBITDEPTH
+
+// High-bit-depth facade for DC-only quantization: calls highbd_quantize_dc()
+// with the rounding table p->round, the DC quantizer p->quant_fp[0], the DC
+// dequantizer pd->dequant[0] and qparam->log_scale. n_coeffs is narrowed to
+// int, skip_block is always passed as 0 and sc is unused. With
+// CONFIG_AOM_QM, qparam->qmatrix / qparam->iqmatrix are forwarded too.
+void av1_highbd_quantize_dc_facade(const tran_low_t *coeff_ptr,
+ intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
+ tran_low_t *qcoeff_ptr,
+ const MACROBLOCKD_PLANE *pd,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+ const SCAN_ORDER *sc,
+ const QUANT_PARAM *qparam) {
+ // obsolete skip_block
+ const int skip_block = 0;
+#if CONFIG_AOM_QM
+ const qm_val_t *qm_ptr = qparam->qmatrix;
+ const qm_val_t *iqm_ptr = qparam->iqmatrix;
+#endif // CONFIG_AOM_QM
+
+ (void)sc;
+
+ highbd_quantize_dc(coeff_ptr, (int)n_coeffs, skip_block, p->round,
+ p->quant_fp[0], qcoeff_ptr, dqcoeff_ptr, pd->dequant[0],
+ eob_ptr,
+#if CONFIG_AOM_QM
+ qm_ptr, iqm_ptr,
+#endif
+ qparam->log_scale);
+}
+
+#if CONFIG_NEW_QUANT
+// High-bit-depth single-coefficient NUQ quantizer. Works like the 8-bit
+// version but keeps the magnitude in 64 bits (clamped only to the int32
+// range) so the multiply by quant / quant_shift is done in 64-bit
+// arithmetic. Stores the signed level / reconstruction through qcoeff_ptr /
+// dqcoeff_ptr and returns 1 if the level is nonzero, else 0.
+static INLINE int highbd_quantize_coeff_nuq(
+    const tran_low_t coeffv, const int16_t quant, const int16_t quant_shift,
+    const int16_t dequant, const tran_low_t *cuml_bins_ptr,
+    const tran_low_t *dequant_val, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr) {
+  const int value = coeffv;
+  // Sign mask: relies on >> of a negative int being an arithmetic shift.
+  const int sign_mask = value >> 31;
+  int64_t level = clamp((value ^ sign_mask) - sign_mask, INT32_MIN, INT32_MAX);
+  int bin = 0;
+  int q;
+
+  // Locate the first knot whose threshold lies above the magnitude.
+  while (bin < NUQ_KNOTS && level >= cuml_bins_ptr[bin]) ++bin;
+
+  if (bin < NUQ_KNOTS) {
+    q = bin;
+  } else {
+    // Past the last knot: quantize the remainder.
+    level -= cuml_bins_ptr[NUQ_KNOTS - 1];
+    q = NUQ_KNOTS +
+        (int)(((((level * quant) >> 16) + level) * quant_shift) >> 16);
+  }
+
+  if (q == 0) {
+    *qcoeff_ptr = 0;
+    *dqcoeff_ptr = 0;
+  } else {
+    const tran_low_t abs_dq = av1_dequant_abscoeff_nuq(q, dequant, dequant_val);
+    const tran_low_t signed_q = (q ^ sign_mask) - sign_mask;
+    *qcoeff_ptr = signed_q;
+    *dqcoeff_ptr = signed_q < 0 ? -abs_dq : abs_dq;
+  }
+  return q != 0;
+}
+
+// High-bit-depth "fp" single-coefficient NUQ quantizer. The magnitude is
+// kept in 64 bits (clamped only to the int32 range); beyond the last knot
+// the excess is multiplied by quant and shifted right by 16. Stores the
+// signed level / reconstruction through qcoeff_ptr / dqcoeff_ptr and returns
+// 1 if the level is nonzero, else 0.
+static INLINE int highbd_quantize_coeff_fp_nuq(
+    const tran_low_t coeffv, const int16_t quant, const int16_t dequant,
+    const tran_low_t *cuml_bins_ptr, const tran_low_t *dequant_val,
+    tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr) {
+  const int value = coeffv;
+  // Sign mask: relies on >> of a negative int being an arithmetic shift.
+  const int sign_mask = value >> 31;
+  const int64_t level =
+      clamp((value ^ sign_mask) - sign_mask, INT32_MIN, INT32_MAX);
+  int bin = 0;
+  int q;
+
+  // Locate the first knot whose threshold lies above the magnitude.
+  while (bin < NUQ_KNOTS && level >= cuml_bins_ptr[bin]) ++bin;
+
+  if (bin < NUQ_KNOTS) {
+    q = bin;
+  } else {
+    q = NUQ_KNOTS +
+        (int)(((level - cuml_bins_ptr[NUQ_KNOTS - 1]) * quant) >> 16);
+  }
+
+  if (q == 0) {
+    *qcoeff_ptr = 0;
+    *dqcoeff_ptr = 0;
+  } else {
+    const tran_low_t abs_dq = av1_dequant_abscoeff_nuq(q, dequant, dequant_val);
+    const tran_low_t signed_q = (q ^ sign_mask) - sign_mask;
+    *qcoeff_ptr = signed_q;
+    *dqcoeff_ptr = signed_q < 0 ? -abs_dq : abs_dq;
+  }
+  return q != 0;
+}
+
+/* Large-transform "fp" flavour of the single-coefficient non-uniform
+ * quantizer.
+ *
+ * Every knot threshold and the reconstruction value are rounded down in
+ * scale by `logsizeby16` bits (ROUND_POWER_OF_TWO), and the linear stage
+ * shifts by (16 - logsizeby16) instead of 16. Writes the signed level to
+ * *qcoeff_ptr and the signed reconstruction to *dqcoeff_ptr.
+ *
+ * Returns 1 when the level is nonzero, 0 otherwise.
+ */
+static INLINE int highbd_quantize_coeff_bigtx_fp_nuq(
+    const tran_low_t coeffv, const int16_t quant, const int16_t dequant,
+    const tran_low_t *cuml_bins_ptr, const tran_low_t *dequant_val,
+    tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, int logsizeby16) {
+  const int coeff = coeffv;
+  // Sign-extending shift: used as a mask to take and later restore the sign.
+  const int sign_mask = (coeff >> 31);
+  const int magnitude = (coeff ^ sign_mask) - sign_mask;
+  const int64_t abs64 = clamp(magnitude, INT32_MIN, INT32_MAX);
+  int knot = 0;
+  int level;
+
+  // Find the first knot whose scaled threshold exceeds the magnitude.
+  while (knot < NUQ_KNOTS &&
+         abs64 >= ROUND_POWER_OF_TWO(cuml_bins_ptr[knot], logsizeby16)) {
+    ++knot;
+  }
+
+  if (knot < NUQ_KNOTS) {
+    level = knot;
+  } else {
+    level =
+        NUQ_KNOTS +
+        (int)(((abs64 -
+                ROUND_POWER_OF_TWO(cuml_bins_ptr[NUQ_KNOTS - 1], logsizeby16)) *
+               quant) >>
+              (16 - logsizeby16));
+  }
+
+  if (level == 0) {
+    *qcoeff_ptr = 0;
+    *dqcoeff_ptr = 0;
+    return 0;
+  }
+
+  *dqcoeff_ptr = ROUND_POWER_OF_TWO(
+      av1_dequant_abscoeff_nuq(level, dequant, dequant_val), logsizeby16);
+  *qcoeff_ptr = (level ^ sign_mask) - sign_mask;
+  if (*qcoeff_ptr < 0) *dqcoeff_ptr = -*dqcoeff_ptr;
+  return 1;
+}
+
+/* Large-transform flavour of the single-coefficient non-uniform quantizer.
+ *
+ * Knot thresholds and the reconstruction value are scaled down by
+ * `logsizeby16` bits (ROUND_POWER_OF_TWO). Beyond the last knot the remainder
+ * goes through the quant / quant_shift stages, with the final shift reduced
+ * to (16 - logsizeby16). Writes the signed level to *qcoeff_ptr and the
+ * signed reconstruction to *dqcoeff_ptr.
+ *
+ * Returns 1 when the level is nonzero, 0 otherwise.
+ */
+static INLINE int highbd_quantize_coeff_bigtx_nuq(
+    const tran_low_t coeffv, const int16_t quant, const int16_t quant_shift,
+    const int16_t dequant, const tran_low_t *cuml_bins_ptr,
+    const tran_low_t *dequant_val, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, int logsizeby16) {
+  const int coeff = coeffv;
+  // Sign-extending shift: used as a mask to take and later restore the sign.
+  const int sign_mask = (coeff >> 31);
+  const int magnitude = (coeff ^ sign_mask) - sign_mask;
+  int64_t abs64 = clamp(magnitude, INT32_MIN, INT32_MAX);
+  int knot = 0;
+  int level;
+
+  // Find the first knot whose scaled threshold exceeds the magnitude.
+  while (knot < NUQ_KNOTS &&
+         abs64 >= ROUND_POWER_OF_TWO(cuml_bins_ptr[knot], logsizeby16)) {
+    ++knot;
+  }
+
+  if (knot < NUQ_KNOTS) {
+    level = knot;
+  } else {
+    // Past the last knot: quantize only the part beyond it.
+    abs64 -= ROUND_POWER_OF_TWO(cuml_bins_ptr[NUQ_KNOTS - 1], logsizeby16);
+    level = NUQ_KNOTS +
+            (int)(((((abs64 * quant) >> 16) + abs64) * quant_shift) >>
+                  (16 - logsizeby16));
+  }
+
+  if (level == 0) {
+    *qcoeff_ptr = 0;
+    *dqcoeff_ptr = 0;
+    return 0;
+  }
+
+  *dqcoeff_ptr = ROUND_POWER_OF_TWO(
+      av1_dequant_abscoeff_nuq(level, dequant, dequant_val), logsizeby16);
+  *qcoeff_ptr = (level ^ sign_mask) - sign_mask;
+  if (*qcoeff_ptr < 0) *dqcoeff_ptr = -*dqcoeff_ptr;
+  return 1;
+}
+
+/* Non-uniform quantization of the DC coefficient only.
+ *
+ * Clears all n_coeffs entries of both output buffers, then (unless
+ * skip_block is set) quantizes coeff_ptr[0] into qcoeff_ptr[0] /
+ * dqcoeff_ptr[0]. *eob_ptr is set to 1 if the DC level is nonzero and to 0
+ * otherwise.
+ */
+void highbd_quantize_dc_nuq(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                            int skip_block, const int16_t quant,
+                            const int16_t quant_shift, const int16_t dequant,
+                            const tran_low_t *cuml_bins_ptr,
+                            const tran_low_t *dequant_val,
+                            tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                            uint16_t *eob_ptr) {
+  int num_coded = 0;
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  if (!skip_block &&
+      highbd_quantize_coeff_nuq(coeff_ptr[0], quant, quant_shift, dequant,
+                                cuml_bins_ptr, dequant_val, qcoeff_ptr,
+                                dqcoeff_ptr)) {
+    num_coded = 1;
+  }
+  *eob_ptr = num_coded;
+}
+
+/* "fp" non-uniform quantization of the DC coefficient only.
+ *
+ * Clears all n_coeffs entries of both output buffers, then (unless
+ * skip_block is set) quantizes coeff_ptr[0] with
+ * highbd_quantize_coeff_fp_nuq(). *eob_ptr is set to 1 if the DC level is
+ * nonzero and to 0 otherwise.
+ */
+void highbd_quantize_dc_fp_nuq(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                               int skip_block, const int16_t quant,
+                               const int16_t dequant,
+                               const tran_low_t *cuml_bins_ptr,
+                               const tran_low_t *dequant_val,
+                               tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                               uint16_t *eob_ptr) {
+  int num_coded = 0;
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  if (!skip_block &&
+      highbd_quantize_coeff_fp_nuq(coeff_ptr[0], quant, dequant, cuml_bins_ptr,
+                                   dequant_val, qcoeff_ptr, dqcoeff_ptr)) {
+    num_coded = 1;
+  }
+  *eob_ptr = num_coded;
+}
+
+/* Non-uniform quantization of a whole block, visited in scan order.
+ *
+ * Both output buffers are cleared first. Unless skip_block is set, each
+ * coefficient is quantized with highbd_quantize_coeff_nuq(): entry 0 of the
+ * quant / quant_shift / dequant tables is used for raster position 0 and
+ * entry 1 for every other position, while band[i] selects the bin and
+ * reconstruction tables for scan position i. *eob_ptr receives one plus the
+ * last scan position with a nonzero level, or 0 if there is none.
+ */
+void highbd_quantize_nuq_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                           int skip_block, const int16_t *quant_ptr,
+                           const int16_t *quant_shift_ptr,
+                           const int16_t *dequant_ptr,
+                           const cuml_bins_type_nuq *cuml_bins_ptr,
+                           const dequant_val_type_nuq *dequant_val,
+                           tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                           uint16_t *eob_ptr, const int16_t *scan,
+                           const uint8_t *band) {
+  int end = 0;
+  int pos;
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  if (skip_block) {
+    *eob_ptr = 0;
+    return;
+  }
+
+  for (pos = 0; pos < n_coeffs; pos++) {
+    const int rc = scan[pos];
+    const int is_ac = (rc != 0);
+    const int band_idx = band[pos];
+    if (highbd_quantize_coeff_nuq(coeff_ptr[rc], quant_ptr[is_ac],
+                                  quant_shift_ptr[is_ac], dequant_ptr[is_ac],
+                                  cuml_bins_ptr[band_idx],
+                                  dequant_val[band_idx], &qcoeff_ptr[rc],
+                                  &dqcoeff_ptr[rc])) {
+      end = pos + 1;
+    }
+  }
+  *eob_ptr = end;
+}
+
+/* Non-uniform block quantization for the 32x32 transform.
+ *
+ * Same traversal as highbd_quantize_nuq_c(), but each coefficient goes
+ * through highbd_quantize_coeff_bigtx_nuq() with
+ * av1_get_tx_scale(TX_32X32) as the scale argument. *eob_ptr receives one
+ * plus the last scan position with a nonzero level, or 0 if there is none.
+ */
+void highbd_quantize_32x32_nuq_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                                 int skip_block, const int16_t *quant_ptr,
+                                 const int16_t *quant_shift_ptr,
+                                 const int16_t *dequant_ptr,
+                                 const cuml_bins_type_nuq *cuml_bins_ptr,
+                                 const dequant_val_type_nuq *dequant_val,
+                                 tran_low_t *qcoeff_ptr,
+                                 tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+                                 const int16_t *scan, const uint8_t *band) {
+  int end = 0;
+  int pos;
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  if (skip_block) {
+    *eob_ptr = 0;
+    return;
+  }
+
+  for (pos = 0; pos < n_coeffs; pos++) {
+    const int rc = scan[pos];
+    const int is_ac = (rc != 0);
+    const int band_idx = band[pos];
+    if (highbd_quantize_coeff_bigtx_nuq(
+            coeff_ptr[rc], quant_ptr[is_ac], quant_shift_ptr[is_ac],
+            dequant_ptr[is_ac], cuml_bins_ptr[band_idx],
+            dequant_val[band_idx], &qcoeff_ptr[rc], &dqcoeff_ptr[rc],
+            av1_get_tx_scale(TX_32X32))) {
+      end = pos + 1;
+    }
+  }
+  *eob_ptr = end;
+}
+
+/* "fp" non-uniform block quantization for the 32x32 transform.
+ *
+ * Clears both output buffers and, unless skip_block is set, quantizes every
+ * coefficient in scan order with highbd_quantize_coeff_bigtx_fp_nuq(),
+ * passing av1_get_tx_scale(TX_32X32) as the scale argument. *eob_ptr
+ * receives one plus the last scan position with a nonzero level, or 0 if
+ * there is none.
+ */
+void highbd_quantize_32x32_fp_nuq_c(const tran_low_t *coeff_ptr,
+                                    intptr_t n_coeffs, int skip_block,
+                                    const int16_t *quant_ptr,
+                                    const int16_t *dequant_ptr,
+                                    const cuml_bins_type_nuq *cuml_bins_ptr,
+                                    const dequant_val_type_nuq *dequant_val,
+                                    tran_low_t *qcoeff_ptr,
+                                    tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+                                    const int16_t *scan, const uint8_t *band) {
+  int end = 0;
+  int pos;
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  if (skip_block) {
+    *eob_ptr = 0;
+    return;
+  }
+
+  for (pos = 0; pos < n_coeffs; pos++) {
+    const int rc = scan[pos];
+    const int is_ac = (rc != 0);
+    const int band_idx = band[pos];
+    if (highbd_quantize_coeff_bigtx_fp_nuq(
+            coeff_ptr[rc], quant_ptr[is_ac], dequant_ptr[is_ac],
+            cuml_bins_ptr[band_idx], dequant_val[band_idx], &qcoeff_ptr[rc],
+            &dqcoeff_ptr[rc], av1_get_tx_scale(TX_32X32))) {
+      end = pos + 1;
+    }
+  }
+  *eob_ptr = end;
+}
+
+#if CONFIG_TX64X64
+/* Non-uniform block quantization for the 64x64 transform.
+ *
+ * Clears both output buffers and, unless skip_block is set, quantizes every
+ * coefficient in scan order with highbd_quantize_coeff_bigtx_nuq(), passing
+ * av1_get_tx_scale(TX_64X64) as the scale argument. *eob_ptr receives one
+ * plus the last scan position with a nonzero level, or 0 if there is none.
+ */
+void highbd_quantize_64x64_nuq_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                                 int skip_block, const int16_t *quant_ptr,
+                                 const int16_t *quant_shift_ptr,
+                                 const int16_t *dequant_ptr,
+                                 const cuml_bins_type_nuq *cuml_bins_ptr,
+                                 const dequant_val_type_nuq *dequant_val,
+                                 tran_low_t *qcoeff_ptr,
+                                 tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+                                 const int16_t *scan, const uint8_t *band) {
+  int end = 0;
+  int pos;
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  if (skip_block) {
+    *eob_ptr = 0;
+    return;
+  }
+
+  for (pos = 0; pos < n_coeffs; pos++) {
+    const int rc = scan[pos];
+    const int is_ac = (rc != 0);
+    const int band_idx = band[pos];
+    if (highbd_quantize_coeff_bigtx_nuq(
+            coeff_ptr[rc], quant_ptr[is_ac], quant_shift_ptr[is_ac],
+            dequant_ptr[is_ac], cuml_bins_ptr[band_idx],
+            dequant_val[band_idx], &qcoeff_ptr[rc], &dqcoeff_ptr[rc],
+            av1_get_tx_scale(TX_64X64))) {
+      end = pos + 1;
+    }
+  }
+  *eob_ptr = end;
+}
+
+/* "fp" non-uniform block quantization for the 64x64 transform.
+ *
+ * Clears both output buffers and, unless skip_block is set, quantizes every
+ * coefficient in scan order with highbd_quantize_coeff_bigtx_fp_nuq(),
+ * passing av1_get_tx_scale(TX_64X64) as the scale argument. *eob_ptr
+ * receives one plus the last scan position with a nonzero level, or 0 if
+ * there is none.
+ */
+void highbd_quantize_64x64_fp_nuq_c(const tran_low_t *coeff_ptr,
+                                    intptr_t n_coeffs, int skip_block,
+                                    const int16_t *quant_ptr,
+                                    const int16_t *dequant_ptr,
+                                    const cuml_bins_type_nuq *cuml_bins_ptr,
+                                    const dequant_val_type_nuq *dequant_val,
+                                    tran_low_t *qcoeff_ptr,
+                                    tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+                                    const int16_t *scan, const uint8_t *band) {
+  int end = 0;
+  int pos;
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  if (skip_block) {
+    *eob_ptr = 0;
+    return;
+  }
+
+  for (pos = 0; pos < n_coeffs; pos++) {
+    const int rc = scan[pos];
+    const int is_ac = (rc != 0);
+    const int band_idx = band[pos];
+    if (highbd_quantize_coeff_bigtx_fp_nuq(
+            coeff_ptr[rc], quant_ptr[is_ac], dequant_ptr[is_ac],
+            cuml_bins_ptr[band_idx], dequant_val[band_idx], &qcoeff_ptr[rc],
+            &dqcoeff_ptr[rc], av1_get_tx_scale(TX_64X64))) {
+      end = pos + 1;
+    }
+  }
+  *eob_ptr = end;
+}
+#endif // CONFIG_TX64X64
+
+/* "fp" non-uniform quantization of a whole block, visited in scan order.
+ *
+ * Clears both output buffers and, unless skip_block is set, quantizes every
+ * coefficient with highbd_quantize_coeff_fp_nuq(): entry 0 of the quant /
+ * dequant tables is used for raster position 0 and entry 1 elsewhere, while
+ * band[i] selects the bin and reconstruction tables for scan position i.
+ * *eob_ptr receives one plus the last scan position with a nonzero level,
+ * or 0 if there is none.
+ */
+void highbd_quantize_fp_nuq_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                              int skip_block, const int16_t *quant_ptr,
+                              const int16_t *dequant_ptr,
+                              const cuml_bins_type_nuq *cuml_bins_ptr,
+                              const dequant_val_type_nuq *dequant_val,
+                              tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                              uint16_t *eob_ptr, const int16_t *scan,
+                              const uint8_t *band) {
+  int end = 0;
+  int pos;
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  if (skip_block) {
+    *eob_ptr = 0;
+    return;
+  }
+
+  for (pos = 0; pos < n_coeffs; pos++) {
+    const int rc = scan[pos];
+    const int is_ac = (rc != 0);
+    const int band_idx = band[pos];
+    if (highbd_quantize_coeff_fp_nuq(
+            coeff_ptr[rc], quant_ptr[is_ac], dequant_ptr[is_ac],
+            cuml_bins_ptr[band_idx], dequant_val[band_idx], &qcoeff_ptr[rc],
+            &dqcoeff_ptr[rc])) {
+      end = pos + 1;
+    }
+  }
+  *eob_ptr = end;
+}
+
+/* DC-only non-uniform quantization for the 32x32 transform.
+ *
+ * Clears both output buffers and, unless skip_block is set, quantizes
+ * coeff_ptr[0] with highbd_quantize_coeff_bigtx_nuq() using
+ * av1_get_tx_scale(TX_32X32). *eob_ptr is 1 if the DC level is nonzero,
+ * else 0.
+ */
+void highbd_quantize_dc_32x32_nuq(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
+    const int16_t quant, const int16_t quant_shift, const int16_t dequant,
+    const tran_low_t *cuml_bins_ptr, const tran_low_t *dequant_val,
+    tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr) {
+  int num_coded = 0;
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  if (!skip_block &&
+      highbd_quantize_coeff_bigtx_nuq(
+          coeff_ptr[0], quant, quant_shift, dequant, cuml_bins_ptr,
+          dequant_val, qcoeff_ptr, dqcoeff_ptr, av1_get_tx_scale(TX_32X32))) {
+    num_coded = 1;
+  }
+  *eob_ptr = num_coded;
+}
+
+/* DC-only "fp" non-uniform quantization for the 32x32 transform.
+ *
+ * Clears both output buffers and, unless skip_block is set, quantizes
+ * coeff_ptr[0] with highbd_quantize_coeff_bigtx_fp_nuq() using
+ * av1_get_tx_scale(TX_32X32). *eob_ptr is 1 if the DC level is nonzero,
+ * else 0.
+ */
+void highbd_quantize_dc_32x32_fp_nuq(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
+    const int16_t quant, const int16_t dequant, const tran_low_t *cuml_bins_ptr,
+    const tran_low_t *dequant_val, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr) {
+  int num_coded = 0;
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  if (!skip_block &&
+      highbd_quantize_coeff_bigtx_fp_nuq(
+          coeff_ptr[0], quant, dequant, cuml_bins_ptr, dequant_val, qcoeff_ptr,
+          dqcoeff_ptr, av1_get_tx_scale(TX_32X32))) {
+    num_coded = 1;
+  }
+  *eob_ptr = num_coded;
+}
+
+#if CONFIG_TX64X64
+/* DC-only non-uniform quantization for the 64x64 transform.
+ *
+ * Clears both output buffers and, unless skip_block is set, quantizes
+ * coeff_ptr[0] with highbd_quantize_coeff_bigtx_nuq() using
+ * av1_get_tx_scale(TX_64X64). *eob_ptr is 1 if the DC level is nonzero,
+ * else 0.
+ */
+void highbd_quantize_dc_64x64_nuq(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
+    const int16_t quant, const int16_t quant_shift, const int16_t dequant,
+    const tran_low_t *cuml_bins_ptr, const tran_low_t *dequant_val,
+    tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr) {
+  int num_coded = 0;
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  if (!skip_block &&
+      highbd_quantize_coeff_bigtx_nuq(
+          coeff_ptr[0], quant, quant_shift, dequant, cuml_bins_ptr,
+          dequant_val, qcoeff_ptr, dqcoeff_ptr, av1_get_tx_scale(TX_64X64))) {
+    num_coded = 1;
+  }
+  *eob_ptr = num_coded;
+}
+
+/* DC-only "fp" non-uniform quantization for the 64x64 transform.
+ *
+ * Clears both output buffers and, unless skip_block is set, quantizes
+ * coeff_ptr[0] with highbd_quantize_coeff_bigtx_fp_nuq() using
+ * av1_get_tx_scale(TX_64X64). *eob_ptr is 1 if the DC level is nonzero,
+ * else 0.
+ */
+void highbd_quantize_dc_64x64_fp_nuq(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
+    const int16_t quant, const int16_t dequant, const tran_low_t *cuml_bins_ptr,
+    const tran_low_t *dequant_val, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr) {
+  int num_coded = 0;
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  if (!skip_block &&
+      highbd_quantize_coeff_bigtx_fp_nuq(
+          coeff_ptr[0], quant, dequant, cuml_bins_ptr, dequant_val, qcoeff_ptr,
+          dqcoeff_ptr, av1_get_tx_scale(TX_64X64))) {
+    num_coded = 1;
+  }
+  *eob_ptr = num_coded;
+}
+#endif // CONFIG_TX64X64
+
+/* High-bitdepth non-uniform quantizer facade (quant / quant_shift path).
+ *
+ * Picks the block quantizer from qparam->log_scale: 0 -> highbd_quantize_nuq,
+ * 1 -> highbd_quantize_32x32_nuq, 2 -> highbd_quantize_64x64_nuq (only with
+ * CONFIG_TX64X64). Any other value trips assert(0). The bin and
+ * reconstruction tables are selected by qparam->dq, and the band map comes
+ * from get_band_translate(qparam->tx_size).
+ */
+void av1_highbd_quantize_b_nuq_facade(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
+    tran_low_t *qcoeff_ptr, const MACROBLOCKD_PLANE *pd,
+    tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const SCAN_ORDER *sc,
+    const QUANT_PARAM *qparam) {
+  // skip_block is obsolete; blocks are always quantized.
+  const int no_skip = 0;
+  const uint8_t *const band_map = get_band_translate(qparam->tx_size);
+  const int profile = qparam->dq;
+  const cuml_bins_type_nuq *const bins =
+      (const cuml_bins_type_nuq *)p->cuml_bins_nuq[profile];
+  const dequant_val_type_nuq *const recon_vals =
+      (const dequant_val_type_nuq *)pd->dequant_val_nuq[profile];
+
+  switch (qparam->log_scale) {
+    case 0:
+      highbd_quantize_nuq(coeff_ptr, n_coeffs, no_skip, p->quant,
+                          p->quant_shift, pd->dequant, bins, recon_vals,
+                          qcoeff_ptr, dqcoeff_ptr, eob_ptr, sc->scan, band_map);
+      break;
+    case 1:
+      highbd_quantize_32x32_nuq(coeff_ptr, n_coeffs, no_skip, p->quant,
+                                p->quant_shift, pd->dequant, bins, recon_vals,
+                                qcoeff_ptr, dqcoeff_ptr, eob_ptr, sc->scan,
+                                band_map);
+      break;
+#if CONFIG_TX64X64
+    case 2:
+      highbd_quantize_64x64_nuq(coeff_ptr, n_coeffs, no_skip, p->quant,
+                                p->quant_shift, pd->dequant, bins, recon_vals,
+                                qcoeff_ptr, dqcoeff_ptr, eob_ptr, sc->scan,
+                                band_map);
+      break;
+#endif  // CONFIG_TX64X64
+    default: assert(0);
+  }
+}
+
+/* High-bitdepth non-uniform quantizer facade ("fp" path, uses p->quant_fp).
+ *
+ * Picks the block quantizer from qparam->log_scale:
+ * 0 -> highbd_quantize_fp_nuq, 1 -> highbd_quantize_32x32_fp_nuq,
+ * 2 -> highbd_quantize_64x64_fp_nuq (only with CONFIG_TX64X64). Any other
+ * value trips assert(0). The bin and reconstruction tables are selected by
+ * qparam->dq, and the band map comes from
+ * get_band_translate(qparam->tx_size).
+ */
+void av1_highbd_quantize_fp_nuq_facade(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
+    tran_low_t *qcoeff_ptr, const MACROBLOCKD_PLANE *pd,
+    tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const SCAN_ORDER *sc,
+    const QUANT_PARAM *qparam) {
+  // skip_block is obsolete; blocks are always quantized.
+  const int no_skip = 0;
+  const uint8_t *const band_map = get_band_translate(qparam->tx_size);
+  const int profile = qparam->dq;
+  const cuml_bins_type_nuq *const bins =
+      (const cuml_bins_type_nuq *)p->cuml_bins_nuq[profile];
+  const dequant_val_type_nuq *const recon_vals =
+      (const dequant_val_type_nuq *)pd->dequant_val_nuq[profile];
+
+  switch (qparam->log_scale) {
+    case 0:
+      highbd_quantize_fp_nuq(coeff_ptr, n_coeffs, no_skip, p->quant_fp,
+                             pd->dequant, bins, recon_vals, qcoeff_ptr,
+                             dqcoeff_ptr, eob_ptr, sc->scan, band_map);
+      break;
+    case 1:
+      highbd_quantize_32x32_fp_nuq(coeff_ptr, n_coeffs, no_skip, p->quant_fp,
+                                   pd->dequant, bins, recon_vals, qcoeff_ptr,
+                                   dqcoeff_ptr, eob_ptr, sc->scan, band_map);
+      break;
+#if CONFIG_TX64X64
+    case 2:
+      highbd_quantize_64x64_fp_nuq(coeff_ptr, n_coeffs, no_skip, p->quant_fp,
+                                   pd->dequant, bins, recon_vals, qcoeff_ptr,
+                                   dqcoeff_ptr, eob_ptr, sc->scan, band_map);
+      break;
+#endif  // CONFIG_TX64X64
+    default: assert(0);
+  }
+}
+
+/* High-bitdepth DC-only non-uniform quantizer facade.
+ *
+ * Dispatches on qparam->log_scale to the "fp" DC quantizers
+ * (highbd_quantize_dc_fp_nuq, ..._dc_32x32_fp_nuq, ..._dc_64x64_fp_nuq with
+ * CONFIG_TX64X64), passing the DC entries of p->quant_fp and pd->dequant and
+ * the band-0 bin / reconstruction tables of profile qparam->dq. Any other
+ * log_scale trips assert(0). The scan order `sc` is not used.
+ */
+void av1_highbd_quantize_dc_nuq_facade(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
+    tran_low_t *qcoeff_ptr, const MACROBLOCKD_PLANE *pd,
+    tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const SCAN_ORDER *sc,
+    const QUANT_PARAM *qparam) {
+  // skip_block is obsolete; blocks are always quantized.
+  const int no_skip = 0;
+  const int profile = qparam->dq;
+  const tran_low_t *const dc_bins = p->cuml_bins_nuq[profile][0];
+  const tran_low_t *const dc_recon_vals = pd->dequant_val_nuq[profile][0];
+  (void)sc;
+
+  switch (qparam->log_scale) {
+    case 0:
+      highbd_quantize_dc_fp_nuq(coeff_ptr, n_coeffs, no_skip, p->quant_fp[0],
+                                pd->dequant[0], dc_bins, dc_recon_vals,
+                                qcoeff_ptr, dqcoeff_ptr, eob_ptr);
+      break;
+    case 1:
+      highbd_quantize_dc_32x32_fp_nuq(coeff_ptr, n_coeffs, no_skip,
+                                      p->quant_fp[0], pd->dequant[0], dc_bins,
+                                      dc_recon_vals, qcoeff_ptr, dqcoeff_ptr,
+                                      eob_ptr);
+      break;
+#if CONFIG_TX64X64
+    case 2:
+      highbd_quantize_dc_64x64_fp_nuq(coeff_ptr, n_coeffs, no_skip,
+                                      p->quant_fp[0], pd->dequant[0], dc_bins,
+                                      dc_recon_vals, qcoeff_ptr, dqcoeff_ptr,
+                                      eob_ptr);
+      break;
+#endif  // CONFIG_TX64X64
+    default: assert(0);
+  }
+}
+#endif // CONFIG_NEW_QUANT
+#endif // CONFIG_HIGHBITDEPTH
+
+#if CONFIG_HIGHBITDEPTH
+/* C reference implementation of the high-bitdepth "fp" quantizer.
+ *
+ * Clears both output buffers, then (unless skip_block is set) visits the
+ * coefficients in scan order. For each one the magnitude plus the rounding
+ * term is multiplied by the quantizer and shifted right by
+ * (16 - log_scale); with CONFIG_AOM_QM the product is additionally weighted
+ * by qm_ptr[rc] and the dequantizer by iqm_ptr[rc]. The reconstruction is
+ * level * dequant / (1 << log_scale). Index 0 of the round / quant / dequant
+ * tables is used for raster position 0 and index 1 for all others.
+ *
+ * *eob_ptr receives one plus the last scan position with a nonzero level,
+ * or 0 if there is none. zbin_ptr, quant_shift_ptr and iscan are unused.
+ */
+void av1_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t count,
+                              int skip_block, const int16_t *zbin_ptr,
+                              const int16_t *round_ptr,
+                              const int16_t *quant_ptr,
+                              const int16_t *quant_shift_ptr,
+                              tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                              const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                              const int16_t *scan, const int16_t *iscan,
+#if CONFIG_AOM_QM
+                              const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr,
+#endif
+                              int log_scale) {
+  const int scale = 1 << log_scale;
+  const int shift = 16 - log_scale;
+  int end = 0;
+  int pos;
+  // TODO(jingning) Decide the need of these arguments after the
+  // quantization process is completed.
+  (void)zbin_ptr;
+  (void)quant_shift_ptr;
+  (void)iscan;
+
+  memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr));
+
+  if (skip_block) {
+    *eob_ptr = 0;
+    return;
+  }
+
+  for (pos = 0; pos < count; pos++) {
+    const int rc = scan[pos];
+    const int is_ac = (rc != 0);
+    const int coeff = coeff_ptr[rc];
+    // Sign-extending shift: used as a mask to take and restore the sign.
+    const int coeff_sign = (coeff >> 31);
+    const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+    const int64_t tmp = abs_coeff + round_ptr[is_ac];
+#if CONFIG_AOM_QM
+    const qm_val_t wt = qm_ptr[rc];
+    const qm_val_t iwt = iqm_ptr[rc];
+    const int dequant =
+        (dequant_ptr[is_ac] * iwt + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS;
+    const uint32_t abs_qcoeff =
+        (uint32_t)((tmp * quant_ptr[is_ac] * wt) >> (shift + AOM_QM_BITS));
+#else
+    const int dequant = dequant_ptr[is_ac];
+    const uint32_t abs_qcoeff = (uint32_t)((tmp * quant_ptr[is_ac]) >> shift);
+#endif
+    qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+    dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant / scale;
+    if (abs_qcoeff) end = pos + 1;
+  }
+  *eob_ptr = end;
+}
+
+#endif // CONFIG_HIGHBITDEPTH
+
+/* Derives the multiplier / shift pair used to divide by quantizer step `d`.
+ *
+ * With l = floor(log2(d)) for d >= 1 and m = 1 + 2^(16 + l) / d, stores
+ * m - 2^16 (truncated to int16_t) in *quant and 2^(16 - l) in *shift.
+ */
+static void invert_quant(int16_t *quant, int16_t *shift, int d) {
+  uint32_t rem = d;
+  int log2_d = 0;
+  int recip;
+
+  // Count the halvings needed to bring d down to 1.
+  while (rem > 1) {
+    rem >>= 1;
+    ++log2_d;
+  }
+
+  recip = 1 + (1 << (16 + log2_d)) / d;
+  *quant = (int16_t)(recip - (1 << 16));
+  *shift = 1 << (16 - log2_d);
+}
+
+/* Returns the zero-bin factor for qindex `q`.
+ *
+ * The result is 64 when q == 0. Otherwise it is 84 when the DC quantizer for
+ * `q` is below a bit-depth dependent threshold (148 / 592 / 2368 for
+ * 8 / 10 / 12 bits) and 80 when it is not. Without CONFIG_HIGHBITDEPTH the
+ * 8-bit threshold is always used. An unknown bit depth asserts and
+ * returns -1.
+ */
+static int get_qzbin_factor(int q, aom_bit_depth_t bit_depth) {
+  const int dc_step = av1_dc_quant(q, 0, bit_depth);
+  int threshold;
+
+#if CONFIG_HIGHBITDEPTH
+  switch (bit_depth) {
+    case AOM_BITS_8: threshold = 148; break;
+    case AOM_BITS_10: threshold = 592; break;
+    case AOM_BITS_12: threshold = 2368; break;
+    default:
+      assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12");
+      return -1;
+  }
+#else
+  (void)bit_depth;
+  threshold = 148;
+#endif
+
+  if (q == 0) return 64;
+  return dc_step < threshold ? 84 : 80;
+}
+
+/* Fills the encoder's quantizer tables for every qindex.
+ *
+ * For each q in [0, QINDEX_RANGE) the DC (index 0) and AC (index 1) luma and
+ * chroma step sizes are looked up and used to derive the quant, quant_shift,
+ * quant_fp, round_fp, zbin, round and dequant entries. With CONFIG_NEW_QUANT
+ * the per-band non-uniform tables are generated too. Finally the AC entry is
+ * replicated into indices 2..7 so each row spans the SIMD width.
+ */
+void av1_init_quantizer(AV1_COMP *cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  QUANTS *const quants = &cpi->quants;
+  int q;
+
+  for (q = 0; q < QINDEX_RANGE; q++) {
+    const int qzbin_factor = get_qzbin_factor(q, cm->bit_depth);
+    const int qrounding_factor = q == 0 ? 64 : 48;
+    const int qrounding_factor_fp = 64;
+    int i;
+#if CONFIG_NEW_QUANT
+    int dq;
+    int band;
+#endif
+
+    // Index 0 is the DC entry, index 1 the AC entry.
+    for (i = 0; i < 2; ++i) {
+      const int is_dc = (i == 0);
+      int step;
+
+      // y
+      step = is_dc ? av1_dc_quant(q, cm->y_dc_delta_q, cm->bit_depth)
+                   : av1_ac_quant(q, 0, cm->bit_depth);
+      invert_quant(&quants->y_quant[q][i], &quants->y_quant_shift[q][i], step);
+      quants->y_quant_fp[q][i] = (1 << 16) / step;
+      quants->y_round_fp[q][i] = (qrounding_factor_fp * step) >> 7;
+      quants->y_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * step, 7);
+      quants->y_round[q][i] = (qrounding_factor * step) >> 7;
+      cpi->y_dequant[q][i] = step;
+
+      // uv
+      step = is_dc ? av1_dc_quant(q, cm->uv_dc_delta_q, cm->bit_depth)
+                   : av1_ac_quant(q, cm->uv_ac_delta_q, cm->bit_depth);
+      invert_quant(&quants->uv_quant[q][i], &quants->uv_quant_shift[q][i],
+                   step);
+      quants->uv_quant_fp[q][i] = (1 << 16) / step;
+      quants->uv_round_fp[q][i] = (qrounding_factor_fp * step) >> 7;
+      quants->uv_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * step, 7);
+      quants->uv_round[q][i] = (qrounding_factor * step) >> 7;
+      cpi->uv_dequant[q][i] = step;
+    }
+
+#if CONFIG_NEW_QUANT
+    for (dq = 0; dq < QUANT_PROFILES; dq++) {
+      for (band = 0; band < COEF_BANDS; band++) {
+        // Band 0 uses the DC step size, every other band the AC step size.
+        const int y_step = cpi->y_dequant[q][band != 0];
+        const int uv_step = cpi->uv_dequant[q][band != 0];
+        av1_get_dequant_val_nuq(y_step, band,
+                                cpi->y_dequant_val_nuq[dq][q][band],
+                                quants->y_cuml_bins_nuq[dq][q][band], dq);
+        av1_get_dequant_val_nuq(uv_step, band,
+                                cpi->uv_dequant_val_nuq[dq][q][band],
+                                quants->uv_cuml_bins_nuq[dq][q][band], dq);
+      }
+    }
+#endif  // CONFIG_NEW_QUANT
+
+    // Repeat the AC entry up to the SIMD width (8 lanes).
+    for (i = 2; i < 8; i++) {
+      quants->y_quant[q][i] = quants->y_quant[q][1];
+      quants->y_quant_fp[q][i] = quants->y_quant_fp[q][1];
+      quants->y_round_fp[q][i] = quants->y_round_fp[q][1];
+      quants->y_quant_shift[q][i] = quants->y_quant_shift[q][1];
+      quants->y_zbin[q][i] = quants->y_zbin[q][1];
+      quants->y_round[q][i] = quants->y_round[q][1];
+      cpi->y_dequant[q][i] = cpi->y_dequant[q][1];
+
+      quants->uv_quant[q][i] = quants->uv_quant[q][1];
+      quants->uv_quant_fp[q][i] = quants->uv_quant_fp[q][1];
+      quants->uv_round_fp[q][i] = quants->uv_round_fp[q][1];
+      quants->uv_quant_shift[q][i] = quants->uv_quant_shift[q][1];
+      quants->uv_zbin[q][i] = quants->uv_zbin[q][1];
+      quants->uv_round[q][i] = quants->uv_round[q][1];
+      cpi->uv_dequant[q][i] = cpi->uv_dequant[q][1];
+    }
+  }
+}
+
+/* Points the macroblock's per-plane quantizer state at the table rows for
+ * the qindex of `segment_id`.
+ *
+ * The qindex comes from av1_get_qindex(); with CONFIG_DELTA_Q the base
+ * qindex is first offset by xd->delta_qindex (when delta-q is active) and
+ * clamped to [0, QINDEX_RANGE - 1]. Plane 0 uses the luma tables and planes
+ * 1 and 2 the chroma tables. With CONFIG_AOM_QM the segment's quantization
+ * matrices are copied in, and with CONFIG_NEW_QUANT the non-uniform tables
+ * are hooked up for every profile. Finally skip_block, qindex, the
+ * error-per-bit value and the motion-estimation constants are refreshed.
+ */
+void av1_init_plane_quantizers(const AV1_COMP *cpi, MACROBLOCK *x,
+                               int segment_id) {
+  const AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const QUANTS *const quants = &cpi->quants;
+
+#if CONFIG_DELTA_Q
+#if CONFIG_EXT_DELTA_Q
+  int current_q_index = AOMMAX(
+      0, AOMMIN(QINDEX_RANGE - 1, cpi->oxcf.deltaq_mode != NO_DELTA_Q
+                                      ? cm->base_qindex + xd->delta_qindex
+                                      : cm->base_qindex));
+#else
+  int current_q_index = AOMMAX(
+      0, AOMMIN(QINDEX_RANGE - 1, cm->delta_q_present_flag
+                                      ? cm->base_qindex + xd->delta_qindex
+                                      : cm->base_qindex));
+#endif
+  const int qindex = av1_get_qindex(&cm->seg, segment_id, current_q_index);
+#else
+  const int qindex = av1_get_qindex(&cm->seg, segment_id, cm->base_qindex);
+#endif
+  const int rdmult = av1_compute_rd_mult(cpi, qindex + cm->y_dc_delta_q);
+  int plane;
+#if CONFIG_AOM_QM
+  int minqm = cm->min_qmlevel;
+  int maxqm = cm->max_qmlevel;
+  // Quant matrix only depends on the base QP so there is only one set per frame
+  int qmlevel = (xd->lossless[segment_id] || cm->using_qmatrix == 0)
+                    ? NUM_QM_LEVELS - 1
+                    : aom_get_qmlevel(cm->base_qindex, minqm, maxqm);
+#endif
+#if CONFIG_NEW_QUANT
+  int profile;
+#endif
+
+  // Luma plane.
+  x->plane[0].quant = quants->y_quant[qindex];
+  x->plane[0].quant_fp = quants->y_quant_fp[qindex];
+  x->plane[0].round_fp = quants->y_round_fp[qindex];
+  x->plane[0].quant_shift = quants->y_quant_shift[qindex];
+  x->plane[0].zbin = quants->y_zbin[qindex];
+  x->plane[0].round = quants->y_round[qindex];
+#if CONFIG_AOM_QM
+  memcpy(&xd->plane[0].seg_qmatrix[segment_id], cm->gqmatrix[qmlevel][0],
+         sizeof(cm->gqmatrix[qmlevel][0]));
+  memcpy(&xd->plane[0].seg_iqmatrix[segment_id], cm->giqmatrix[qmlevel][0],
+         sizeof(cm->giqmatrix[qmlevel][0]));
+#endif
+  xd->plane[0].dequant = cpi->y_dequant[qindex];
+#if CONFIG_NEW_QUANT
+  for (profile = 0; profile < QUANT_PROFILES; profile++) {
+    x->plane[0].cuml_bins_nuq[profile] =
+        quants->y_cuml_bins_nuq[profile][qindex];
+    xd->plane[0].dequant_val_nuq[profile] =
+        cpi->y_dequant_val_nuq[profile][qindex];
+  }
+#endif  // CONFIG_NEW_QUANT
+
+  // Both chroma planes share the uv tables.
+  for (plane = 1; plane < 3; plane++) {
+    x->plane[plane].quant = quants->uv_quant[qindex];
+    x->plane[plane].quant_fp = quants->uv_quant_fp[qindex];
+    x->plane[plane].round_fp = quants->uv_round_fp[qindex];
+    x->plane[plane].quant_shift = quants->uv_quant_shift[qindex];
+    x->plane[plane].zbin = quants->uv_zbin[qindex];
+    x->plane[plane].round = quants->uv_round[qindex];
+#if CONFIG_AOM_QM
+    memcpy(&xd->plane[plane].seg_qmatrix[segment_id], cm->gqmatrix[qmlevel][1],
+           sizeof(cm->gqmatrix[qmlevel][1]));
+    memcpy(&xd->plane[plane].seg_iqmatrix[segment_id],
+           cm->giqmatrix[qmlevel][1], sizeof(cm->giqmatrix[qmlevel][1]));
+#endif
+    xd->plane[plane].dequant = cpi->uv_dequant[qindex];
+#if CONFIG_NEW_QUANT
+    for (profile = 0; profile < QUANT_PROFILES; profile++) {
+      x->plane[plane].cuml_bins_nuq[profile] =
+          quants->uv_cuml_bins_nuq[profile][qindex];
+      xd->plane[plane].dequant_val_nuq[profile] =
+          cpi->uv_dequant_val_nuq[profile][qindex];
+    }
+#endif  // CONFIG_NEW_QUANT
+  }
+
+  x->skip_block = segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP);
+  x->qindex = qindex;
+
+  set_error_per_bit(x, rdmult);
+
+  av1_initialize_me_consts(cpi, x, qindex);
+}
+
+/* Initializes the plane quantizers of the encoder's own macroblock
+ * (cpi->td.mb), using the segment id stored in its first mode-info entry.
+ */
+void av1_frame_init_quantizer(AV1_COMP *cpi) {
+  MACROBLOCK *const mb = &cpi->td.mb;
+  const int segment_id = mb->e_mbd.mi[0]->mbmi.segment_id;
+
+  av1_init_plane_quantizers(cpi, mb, segment_id);
+}
+
+/* Sets the frame's base qindex to `q` and resets all three delta-q offsets
+ * (luma DC, chroma DC, chroma AC) to zero.
+ *
+ * Note: the quantizer tables have to be rebuilt with av1_init_quantizer()
+ * whenever any delta_q value changes.
+ */
+void av1_set_quantizer(AV1_COMMON *cm, int q) {
+  cm->base_qindex = q;
+
+  cm->y_dc_delta_q = 0;
+  cm->uv_dc_delta_q = 0;
+  cm->uv_ac_delta_q = 0;
+}
+
+// Table that converts the 0-63 Q-range values passed in from outside to the
+// qindex range used internally.
+static const int quantizer_to_qindex[] = {
+  0,   4,   8,   12,  16,  20,  24,  28,  32,  36,  40,  44,  48,
+  52,  56,  60,  64,  68,  72,  76,  80,  84,  88,  92,  96,  100,
+  104, 108, 112, 116, 120, 124, 128, 132, 136, 140, 144, 148, 152,
+  156, 160, 164, 168, 172, 176, 180, 184, 188, 192, 196, 200, 204,
+  208, 212, 216, 220, 224, 228, 232, 236, 240, 244, 249, 255,
+};
+
+/* Maps an external quantizer value to the internal qindex.
+ *
+ * Values outside the table are saturated to its first / last entry instead
+ * of indexing out of bounds, mirroring how av1_qindex_to_quantizer()
+ * saturates its result.
+ */
+int av1_quantizer_to_qindex(int quantizer) {
+  const int last =
+      (int)(sizeof(quantizer_to_qindex) / sizeof(quantizer_to_qindex[0])) - 1;
+
+  if (quantizer < 0) return quantizer_to_qindex[0];
+  if (quantizer > last) return quantizer_to_qindex[last];
+  return quantizer_to_qindex[quantizer];
+}
+
+/* Maps an internal qindex back to the smallest external quantizer value
+ * whose table entry is >= qindex; returns the last quantizer value when
+ * qindex exceeds every entry.
+ */
+int av1_qindex_to_quantizer(int qindex) {
+  const int count =
+      (int)(sizeof(quantizer_to_qindex) / sizeof(quantizer_to_qindex[0]));
+  int quantizer;
+
+  for (quantizer = 0; quantizer < count; ++quantizer) {
+    if (quantizer_to_qindex[quantizer] >= qindex) return quantizer;
+  }
+
+  return count - 1;
+}
diff --git a/third_party/aom/av1/encoder/av1_quantize.h b/third_party/aom/av1/encoder/av1_quantize.h
new file mode 100644
index 0000000000..c87b6b7dc0
--- /dev/null
+++ b/third_party/aom/av1/encoder/av1_quantize.h
@@ -0,0 +1,184 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_ENCODER_QUANTIZE_H_
+#define AV1_ENCODER_QUANTIZE_H_
+
+#include "./aom_config.h"
+#include "av1/common/quant_common.h"
+#include "av1/common/scan.h"
+#include "av1/encoder/block.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct QUANT_PARAM {  // Per-call settings read by the quantize facades.
+ int log_scale;  // The NUQ facades dispatch on this: 0, 1, or (with CONFIG_TX64X64) 2.
+#if CONFIG_NEW_QUANT
+ TX_SIZE tx_size;  // Passed to get_band_translate() by the NUQ block facades.
+ int dq;  // Index into the plane's cuml_bins_nuq / dequant_val_nuq tables.
+#endif // CONFIG_NEW_QUANT
+#if CONFIG_AOM_QM
+ const qm_val_t *qmatrix;  // Forwarded to the quantizer as qm_ptr.
+ const qm_val_t *iqmatrix;  // Forwarded to the quantizer as iqm_ptr.
+#endif // CONFIG_AOM_QM
+} QUANT_PARAM;
+
+typedef void (*AV1_QUANT_FACADE)(const tran_low_t *coeff_ptr, intptr_t n_coeffs,  // Function-pointer type with the parameter list shared by the *_facade functions declared below.
+ const MACROBLOCK_PLANE *p,
+ tran_low_t *qcoeff_ptr,
+ const MACROBLOCKD_PLANE *pd,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+ const SCAN_ORDER *sc,
+ const QUANT_PARAM *qparam);
+
+typedef struct {  // Encoder quantizer tables, one row per qindex; filled in by av1_init_quantizer().
+#if CONFIG_NEW_QUANT
+ DECLARE_ALIGNED(
+ 16, tran_low_t,
+ y_cuml_bins_nuq[QUANT_PROFILES][QINDEX_RANGE][COEF_BANDS][NUQ_KNOTS]);
+ DECLARE_ALIGNED(
+ 16, tran_low_t,
+ uv_cuml_bins_nuq[QUANT_PROFILES][QINDEX_RANGE][COEF_BANDS][NUQ_KNOTS]);
+#endif // CONFIG_NEW_QUANT
+ // 0: dc 1: ac 2-7: ac repeated to SIMD width
+ DECLARE_ALIGNED(16, int16_t, y_quant[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, y_quant_shift[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, y_zbin[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, y_round[QINDEX_RANGE][8]);
+
+ // TODO(jingning): in progress of re-working the quantization. will decide
+ // if we want to deprecate the current use of y_quant.
+ DECLARE_ALIGNED(16, int16_t, y_quant_fp[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, uv_quant_fp[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, y_round_fp[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, uv_round_fp[QINDEX_RANGE][8]);
+
+ DECLARE_ALIGNED(16, int16_t, uv_quant[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, uv_quant_shift[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, uv_zbin[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, uv_round[QINDEX_RANGE][8]);
+} QUANTS;
+
+struct AV1_COMP;
+struct AV1Common;
+
+void av1_frame_init_quantizer(struct AV1_COMP *cpi);  // Runs av1_init_plane_quantizers() on cpi->td.mb with the segment id of its first mode-info entry.
+
+void av1_init_plane_quantizers(const struct AV1_COMP *cpi, MACROBLOCK *x,
+ int segment_id);  // Points x's per-plane quantizer tables at the rows for the qindex of segment_id.
+
+void av1_init_quantizer(struct AV1_COMP *cpi);  // Fills cpi->quants and the dequant tables for every qindex.
+
+void av1_set_quantizer(struct AV1Common *cm, int q);  // Sets base_qindex to q and zeroes the delta_q fields.
+
+int av1_quantizer_to_qindex(int quantizer);  // Table lookup from the external 0-63 range to qindex.
+
+int av1_qindex_to_quantizer(int qindex);  // Smallest quantizer whose table entry is >= qindex, else 63.
+
+void av1_quantize_skip(intptr_t n_coeffs, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr);  // NOTE(review): definition is outside this excerpt; presumably produces an all-zero block — confirm.
+
+void av1_quantize_fp_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr,
+ const MACROBLOCKD_PLANE *pd,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+ const SCAN_ORDER *sc, const QUANT_PARAM *qparam);  // Same parameter list as AV1_QUANT_FACADE.
+
+void av1_quantize_b_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr,
+ const MACROBLOCKD_PLANE *pd, tran_low_t *dqcoeff_ptr,
+ uint16_t *eob_ptr, const SCAN_ORDER *sc,
+ const QUANT_PARAM *qparam);  // Same parameter list as AV1_QUANT_FACADE.
+
+void av1_quantize_dc_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr,
+ const MACROBLOCKD_PLANE *pd,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+ const SCAN_ORDER *sc, const QUANT_PARAM *qparam);  // Same parameter list as AV1_QUANT_FACADE.
+
+#if CONFIG_NEW_QUANT
+void av1_quantize_fp_nuq_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const MACROBLOCK_PLANE *p,
+ tran_low_t *qcoeff_ptr,
+ const MACROBLOCKD_PLANE *pd,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+ const SCAN_ORDER *sc,
+ const QUANT_PARAM *qparam);  // Same parameter list as AV1_QUANT_FACADE.
+
+void av1_quantize_b_nuq_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const MACROBLOCK_PLANE *p,
+ tran_low_t *qcoeff_ptr,
+ const MACROBLOCKD_PLANE *pd,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+ const SCAN_ORDER *sc, const QUANT_PARAM *qparam);  // Same parameter list as AV1_QUANT_FACADE.
+
+void av1_quantize_dc_nuq_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const MACROBLOCK_PLANE *p,
+ tran_low_t *qcoeff_ptr,
+ const MACROBLOCKD_PLANE *pd,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+ const SCAN_ORDER *sc,
+ const QUANT_PARAM *qparam);  // Same parameter list as AV1_QUANT_FACADE.
+#endif // CONFIG_NEW_QUANT
+
+#if CONFIG_HIGHBITDEPTH
+void av1_highbd_quantize_fp_facade(const tran_low_t *coeff_ptr,
+ intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
+ tran_low_t *qcoeff_ptr,
+ const MACROBLOCKD_PLANE *pd,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+ const SCAN_ORDER *sc,
+ const QUANT_PARAM *qparam);  // Same parameter list as AV1_QUANT_FACADE.
+
+void av1_highbd_quantize_b_facade(const tran_low_t *coeff_ptr,
+ intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
+ tran_low_t *qcoeff_ptr,
+ const MACROBLOCKD_PLANE *pd,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+ const SCAN_ORDER *sc,
+ const QUANT_PARAM *qparam);  // Same parameter list as AV1_QUANT_FACADE.
+
+void av1_highbd_quantize_dc_facade(const tran_low_t *coeff_ptr,
+ intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
+ tran_low_t *qcoeff_ptr,
+ const MACROBLOCKD_PLANE *pd,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+ const SCAN_ORDER *sc,
+ const QUANT_PARAM *qparam);  // Forwards to highbd_quantize_dc() with the DC quant_fp / dequant entries; sc is unused.
+
+#if CONFIG_NEW_QUANT
+void av1_highbd_quantize_fp_nuq_facade(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
+ tran_low_t *qcoeff_ptr, const MACROBLOCKD_PLANE *pd,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const SCAN_ORDER *sc,
+ const QUANT_PARAM *qparam);  // Dispatches on qparam->log_scale to the highbd_quantize_*fp_nuq block quantizers.
+
+void av1_highbd_quantize_b_nuq_facade(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
+ tran_low_t *qcoeff_ptr, const MACROBLOCKD_PLANE *pd,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const SCAN_ORDER *sc,
+ const QUANT_PARAM *qparam);  // Dispatches on qparam->log_scale to the highbd_quantize_*nuq block quantizers.
+
+void av1_highbd_quantize_dc_nuq_facade(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
+ tran_low_t *qcoeff_ptr, const MACROBLOCKD_PLANE *pd,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const SCAN_ORDER *sc,
+ const QUANT_PARAM *qparam);  // Dispatches on qparam->log_scale to the highbd_quantize_dc_*fp_nuq quantizers; sc is unused.
+#endif // CONFIG_NEW_QUANT
+#endif // CONFIG_HIGHBITDEPTH
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AV1_ENCODER_QUANTIZE_H_
diff --git a/third_party/aom/av1/encoder/bitstream.c b/third_party/aom/av1/encoder/bitstream.c
new file mode 100644
index 0000000000..7cc6179ead
--- /dev/null
+++ b/third_party/aom/av1/encoder/bitstream.c
@@ -0,0 +1,5399 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <limits.h>
+#include <stdio.h>
+
+#include "aom/aom_encoder.h"
+#include "aom_dsp/bitwriter_buffer.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/binary_codes_writer.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem_ops.h"
+#include "aom_ports/system_state.h"
+#if CONFIG_BITSTREAM_DEBUG
+#include "aom_util/debug_util.h"
+#endif // CONFIG_BITSTREAM_DEBUG
+
+#if CONFIG_CDEF
+#include "av1/common/cdef.h"
+#include "av1/common/clpf.h"
+#endif // CONFIG_CDEF
+#include "av1/common/entropy.h"
+#include "av1/common/entropymode.h"
+#include "av1/common/entropymv.h"
+#include "av1/common/mvref_common.h"
+#include "av1/common/odintrin.h"
+#include "av1/common/pred_common.h"
+#include "av1/common/reconinter.h"
+#if CONFIG_EXT_INTRA
+#include "av1/common/reconintra.h"
+#endif // CONFIG_EXT_INTRA
+#include "av1/common/seg_common.h"
+#include "av1/common/tile_common.h"
+
+#if CONFIG_ANS
+#include "aom_dsp/buf_ans.h"
+#endif // CONFIG_ANS
+#if CONFIG_LV_MAP
+#include "av1/encoder/encodetxb.h"
+#endif // CONFIG_LV_MAP
+#include "av1/encoder/bitstream.h"
+#include "av1/encoder/cost.h"
+#include "av1/encoder/encodemv.h"
+#include "av1/encoder/mcomp.h"
+#if CONFIG_PALETTE && CONFIG_PALETTE_DELTA_ENCODING
+#include "av1/encoder/palette.h"
+#endif // CONFIG_PALETTE && CONFIG_PALETTE_DELTA_ENCODING
+#include "av1/encoder/segmentation.h"
+#include "av1/encoder/subexp.h"
+#include "av1/encoder/tokenize.h"
+#if CONFIG_PVQ
+#include "av1/encoder/pvq_encoder.h"
+#endif
+
+// File-scope token (codeword) tables used by the writers in this file.
+// The non-const tables are filled in by av1_encode_token_init() through
+// av1_tokens_from_tree(); the const tables are hard-coded here.
+// NOTE(review): the brace initializers look like { value, len } pairs --
+// confirm against the declaration of struct av1_token.
+static struct av1_token intra_mode_encodings[INTRA_MODES];
+static struct av1_token switchable_interp_encodings[SWITCHABLE_FILTERS];
+#if CONFIG_EXT_PARTITION_TYPES && !CONFIG_EC_MULTISYMBOL
+static const struct av1_token ext_partition_encodings[EXT_PARTITION_TYPES] = {
+ { 0, 1 }, { 4, 3 }, { 12, 4 }, { 7, 3 },
+ { 10, 4 }, { 11, 4 }, { 26, 5 }, { 27, 5 }
+};
+#endif
+static struct av1_token partition_encodings[PARTITION_TYPES];
+#if !CONFIG_REF_MV
+static struct av1_token inter_mode_encodings[INTER_MODES];
+#endif
+#if CONFIG_EXT_INTER
+static const struct av1_token
+ inter_compound_mode_encodings[INTER_COMPOUND_MODES] = {
+ { 2, 2 }, { 50, 6 }, { 51, 6 }, { 24, 5 }, { 52, 6 },
+ { 53, 6 }, { 54, 6 }, { 55, 6 }, { 0, 1 }, { 7, 3 }
+ };
+#endif // CONFIG_EXT_INTER
+#if CONFIG_PALETTE
+static struct av1_token palette_size_encodings[PALETTE_SIZES];
+static struct av1_token palette_color_index_encodings[PALETTE_SIZES]
+ [PALETTE_COLORS];
+#endif // CONFIG_PALETTE
+#if !CONFIG_EC_MULTISYMBOL
+// Indexed as [tx size category][depth] by write_selected_tx_size(); each row
+// covers one maximum transform size.
+static const struct av1_token tx_size_encodings[MAX_TX_DEPTH][TX_SIZES] = {
+ { { 0, 1 }, { 1, 1 } }, // Max tx_size is 8X8
+ { { 0, 1 }, { 2, 2 }, { 3, 2 } }, // Max tx_size is 16X16
+ { { 0, 1 }, { 2, 2 }, { 6, 3 }, { 7, 3 } }, // Max tx_size is 32X32
+#if CONFIG_TX64X64
+ { { 0, 1 }, { 2, 2 }, { 6, 3 }, { 14, 4 }, { 15, 4 } }, // Max tx_size 64X64
+#endif // CONFIG_TX64X64
+};
+#endif
+
+#if CONFIG_EXT_INTRA || CONFIG_FILTER_INTRA || CONFIG_PALETTE
+// Writes `v` with a near-uniform code sized for `n` values. With
+// l = get_unsigned_bits(n) and m = (1 << l) - n, values below m are written
+// with l - 1 bits; every other value is written as an (l - 1)-bit prefix
+// followed by one extra bit. Nothing is written when l is 0.
+static INLINE void write_uniform(aom_writer *w, int n, int v) {
+  const int num_bits = get_unsigned_bits(n);
+  const int num_short = (1 << num_bits) - n;
+
+  if (num_bits == 0) return;
+
+  if (v >= num_short) {
+    // Long codeword: shared prefix, then the low bit of the excess.
+    const int excess = v - num_short;
+    aom_write_literal(w, num_short + (excess >> 1), num_bits - 1);
+    aom_write_literal(w, excess & 1, 1);
+    return;
+  }
+
+  // Short codeword.
+  aom_write_literal(w, v, num_bits - 1);
+}
+#endif // CONFIG_EXT_INTRA || CONFIG_FILTER_INTRA || CONFIG_PALETTE
+
+// More token tables; all of the tables in this group are filled in by
+// av1_encode_token_init() through av1_tokens_from_tree(). For the two
+// CONFIG_EXT_TX tables only the rows for sets s >= 1 are filled in there.
+#if CONFIG_EXT_TX
+static struct av1_token ext_tx_inter_encodings[EXT_TX_SETS_INTER][TX_TYPES];
+static struct av1_token ext_tx_intra_encodings[EXT_TX_SETS_INTRA][TX_TYPES];
+#else
+static struct av1_token ext_tx_encodings[TX_TYPES];
+#endif // CONFIG_EXT_TX
+#if CONFIG_GLOBAL_MOTION
+static struct av1_token global_motion_types_encodings[GLOBAL_TRANS_TYPES];
+#endif // CONFIG_GLOBAL_MOTION
+#if CONFIG_EXT_INTRA
+#if CONFIG_INTRA_INTERP
+static struct av1_token intra_filter_encodings[INTRA_FILTERS];
+#endif // CONFIG_INTRA_INTERP
+#endif // CONFIG_EXT_INTRA
+#if CONFIG_EXT_INTER
+static struct av1_token interintra_mode_encodings[INTERINTRA_MODES];
+#if CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE
+static struct av1_token compound_type_encodings[COMPOUND_TYPES];
+#endif // CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE
+#endif // CONFIG_EXT_INTER
+#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+static struct av1_token motion_mode_encodings[MOTION_MODES];
+#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+#if CONFIG_LOOP_RESTORATION
+static struct av1_token switchable_restore_encodings[RESTORE_SWITCHABLE_TYPES];
+#endif // CONFIG_LOOP_RESTORATION
+// Forward declarations of static helpers; their definitions are not in this
+// part of the file.
+static void write_uncompressed_header(AV1_COMP *cpi,
+ struct aom_write_bit_buffer *wb);
+static uint32_t write_compressed_header(AV1_COMP *cpi, uint8_t *data);
+static int remux_tiles(const AV1_COMMON *const cm, uint8_t *dst,
+ const uint32_t data_size, const uint32_t max_tile_size,
+ const uint32_t max_tile_col_size,
+ int *const tile_size_bytes,
+ int *const tile_col_size_bytes);
+
+// Fills in the file-scope token tables from their coding trees with
+// av1_tokens_from_tree(). When CONFIG_EC_MULTISYMBOL is set it also builds
+// the av1_*_ind / av1_*_inv index tables with av1_indices_from_tree().
+// Which tables are built depends on the CONFIG_* flags. The extended
+// transform loops start at set 1, so set 0 is never initialized here.
+void av1_encode_token_init(void) {
+#if CONFIG_EXT_TX || CONFIG_PALETTE
+ int s;
+#endif // CONFIG_EXT_TX || CONFIG_PALETTE
+#if CONFIG_EXT_TX
+ for (s = 1; s < EXT_TX_SETS_INTER; ++s) {
+ av1_tokens_from_tree(ext_tx_inter_encodings[s], av1_ext_tx_inter_tree[s]);
+ }
+ for (s = 1; s < EXT_TX_SETS_INTRA; ++s) {
+ av1_tokens_from_tree(ext_tx_intra_encodings[s], av1_ext_tx_intra_tree[s]);
+ }
+#else
+ av1_tokens_from_tree(ext_tx_encodings, av1_ext_tx_tree);
+#endif // CONFIG_EXT_TX
+ av1_tokens_from_tree(intra_mode_encodings, av1_intra_mode_tree);
+ av1_tokens_from_tree(switchable_interp_encodings, av1_switchable_interp_tree);
+ av1_tokens_from_tree(partition_encodings, av1_partition_tree);
+#if !CONFIG_REF_MV
+ av1_tokens_from_tree(inter_mode_encodings, av1_inter_mode_tree);
+#endif
+
+#if CONFIG_PALETTE
+ av1_tokens_from_tree(palette_size_encodings, av1_palette_size_tree);
+ // One colour-index table per palette size.
+ for (s = 0; s < PALETTE_SIZES; ++s) {
+ av1_tokens_from_tree(palette_color_index_encodings[s],
+ av1_palette_color_index_tree[s]);
+ }
+#endif // CONFIG_PALETTE
+
+#if CONFIG_EXT_INTRA && CONFIG_INTRA_INTERP
+ av1_tokens_from_tree(intra_filter_encodings, av1_intra_filter_tree);
+#endif // CONFIG_EXT_INTRA && CONFIG_INTRA_INTERP
+#if CONFIG_EXT_INTER
+ av1_tokens_from_tree(interintra_mode_encodings, av1_interintra_mode_tree);
+#if CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE
+ av1_tokens_from_tree(compound_type_encodings, av1_compound_type_tree);
+#endif // CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE
+#endif // CONFIG_EXT_INTER
+#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+ av1_tokens_from_tree(motion_mode_encodings, av1_motion_mode_tree);
+#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+#if CONFIG_GLOBAL_MOTION
+ av1_tokens_from_tree(global_motion_types_encodings,
+ av1_global_motion_types_tree);
+#endif // CONFIG_GLOBAL_MOTION
+#if CONFIG_LOOP_RESTORATION
+ av1_tokens_from_tree(switchable_restore_encodings,
+ av1_switchable_restore_tree);
+#endif // CONFIG_LOOP_RESTORATION
+
+#if CONFIG_EC_MULTISYMBOL
+ /* This hack is necessary when CONFIG_DUAL_FILTER is enabled because the five
+ SWITCHABLE_FILTERS are not consecutive, e.g., 0, 1, 2, 3, 4, when doing
+ an in-order traversal of the av1_switchable_interp_tree structure. */
+ av1_indices_from_tree(av1_switchable_interp_ind, av1_switchable_interp_inv,
+ av1_switchable_interp_tree);
+ /* This hack is necessary because the four TX_TYPES are not consecutive,
+ e.g., 0, 1, 2, 3, when doing an in-order traversal of the av1_ext_tx_tree
+ structure. */
+#if CONFIG_EXT_TX
+ for (s = 1; s < EXT_TX_SETS_INTRA; ++s)
+ av1_indices_from_tree(av1_ext_tx_intra_ind[s], av1_ext_tx_intra_inv[s],
+ av1_ext_tx_intra_tree[s]);
+ for (s = 1; s < EXT_TX_SETS_INTER; ++s)
+ av1_indices_from_tree(av1_ext_tx_inter_ind[s], av1_ext_tx_inter_inv[s],
+ av1_ext_tx_inter_tree[s]);
+#else
+ av1_indices_from_tree(av1_ext_tx_ind, av1_ext_tx_inv, av1_ext_tx_tree);
+#endif
+ // The intra and inter mode trees get the same index/inverse tables.
+ av1_indices_from_tree(av1_intra_mode_ind, av1_intra_mode_inv,
+ av1_intra_mode_tree);
+ av1_indices_from_tree(av1_inter_mode_ind, av1_inter_mode_inv,
+ av1_inter_mode_tree);
+#endif
+}
+
+// Writes the intra prediction mode `mode` for sub-block `block` of `mi`.
+// The coding model is looked up from `mi` and its above/left neighbours:
+// a CDF from `frame_ctx` when CONFIG_EC_MULTISYMBOL is set, otherwise a
+// probability tree reached through `cm`. Intra block copy blocks must not
+// reach this function (asserted under CONFIG_INTRABC).
+static void write_intra_mode_kf(const AV1_COMMON *cm, FRAME_CONTEXT *frame_ctx,
+                                const MODE_INFO *mi, const MODE_INFO *above_mi,
+                                const MODE_INFO *left_mi, int block,
+                                PREDICTION_MODE mode, aom_writer *w) {
+#if CONFIG_INTRABC
+  assert(!is_intrabc_block(&mi->mbmi));
+#endif  // CONFIG_INTRABC
+
+#if CONFIG_EC_MULTISYMBOL
+  (void)cm;  // Only the probability-tree path reads cm.
+  aom_write_symbol(w, av1_intra_mode_ind[mode],
+                   get_y_mode_cdf(frame_ctx, mi, above_mi, left_mi, block),
+                   INTRA_MODES);
+#else
+  (void)frame_ctx;  // Only the CDF path reads frame_ctx.
+  av1_write_token(w, av1_intra_mode_tree,
+                  get_y_mode_probs(cm, mi, above_mi, left_mi, block),
+                  &intra_mode_encodings[mode]);
+#endif
+}
+
+#if CONFIG_EXT_INTER
+// Writes `mode` as a token of av1_interintra_mode_tree using `probs`.
+static void write_interintra_mode(aom_writer *w, INTERINTRA_MODE mode,
+                                  const aom_prob *probs) {
+  const struct av1_token *const code = &interintra_mode_encodings[mode];
+
+  av1_write_token(w, av1_interintra_mode_tree, probs, code);
+}
+#endif // CONFIG_EXT_INTER
+
+// Writes the inter prediction mode `mode` to `w`.
+//
+// With CONFIG_REF_MV the mode is sent as up to three binary decisions, each
+// with a probability selected from bit fields of `mode_ctx`:
+//   1. NEWMV or not (newmv_prob);
+//   2. ZEROMV or not (zeromv_prob) -- skipped when the ALL_ZERO flag is set
+//      in `mode_ctx`, in which case the mode is asserted to be ZEROMV;
+//   3. NEARESTMV or not (refmv_prob).
+// Without CONFIG_REF_MV a single symbol (CONFIG_EC_MULTISYMBOL) or tree
+// token is written using the model indexed by `mode_ctx`.
+static void write_inter_mode(aom_writer *w, PREDICTION_MODE mode,
+ FRAME_CONTEXT *ec_ctx, const int16_t mode_ctx) {
+#if CONFIG_REF_MV
+ const int16_t newmv_ctx = mode_ctx & NEWMV_CTX_MASK;
+ const aom_prob newmv_prob = ec_ctx->newmv_prob[newmv_ctx];
+
+// Function-local helper; undefined again before the #else below.
+#define IS_NEWMV_MODE(mode) ((mode) == NEWMV)
+ aom_write(w, !IS_NEWMV_MODE(mode), newmv_prob);
+
+ if (!IS_NEWMV_MODE(mode)) {
+ const int16_t zeromv_ctx = (mode_ctx >> ZEROMV_OFFSET) & ZEROMV_CTX_MASK;
+ const aom_prob zeromv_prob = ec_ctx->zeromv_prob[zeromv_ctx];
+
+ // With the all-zero flag set nothing more is written.
+ if (mode_ctx & (1 << ALL_ZERO_FLAG_OFFSET)) {
+ assert(mode == ZEROMV);
+ return;
+ }
+
+ aom_write(w, mode != ZEROMV, zeromv_prob);
+
+ if (mode != ZEROMV) {
+ int16_t refmv_ctx = (mode_ctx >> REFMV_OFFSET) & REFMV_CTX_MASK;
+ aom_prob refmv_prob;
+
+ // The SKIP_* flags override the context. They are tested in sequence, so
+ // when several are set the last assignment wins.
+ if (mode_ctx & (1 << SKIP_NEARESTMV_OFFSET)) refmv_ctx = 6;
+ if (mode_ctx & (1 << SKIP_NEARMV_OFFSET)) refmv_ctx = 7;
+ if (mode_ctx & (1 << SKIP_NEARESTMV_SUB8X8_OFFSET)) refmv_ctx = 8;
+
+ refmv_prob = ec_ctx->refmv_prob[refmv_ctx];
+ aom_write(w, mode != NEARESTMV, refmv_prob);
+ }
+ }
+
+#undef IS_NEWMV_MODE
+
+#else // !CONFIG_REF_MV
+ assert(is_inter_mode(mode));
+#if CONFIG_EC_MULTISYMBOL
+ aom_write_symbol(w, av1_inter_mode_ind[INTER_OFFSET(mode)],
+ ec_ctx->inter_mode_cdf[mode_ctx], INTER_MODES);
+#else
+ {
+ const aom_prob *const inter_probs = ec_ctx->inter_mode_probs[mode_ctx];
+ av1_write_token(w, av1_inter_mode_tree, inter_probs,
+ &inter_mode_encodings[INTER_OFFSET(mode)]);
+ }
+#endif
+#endif
+}
+
+#if CONFIG_REF_MV
+// Writes mbmi->ref_mv_idx as a run of binary flags.
+//
+// Up to two candidate positions are visited. A flag is written only when
+// the reference MV stack for the block's reference frame type holds more
+// than (position + 1) entries, and writing stops at the first position that
+// matches ref_mv_idx. NEWMV (and NEW_NEWMV under CONFIG_EXT_INTER) scans
+// positions 0..1; modes accepted by have_nearmv_in_inter_mode() scan
+// positions 1..2 and compare against (position - 1). Nothing is written
+// for any other mode.
+static void write_drl_idx(const AV1_COMMON *cm, const MB_MODE_INFO *mbmi,
+                          const MB_MODE_INFO_EXT *mbmi_ext, aom_writer *w) {
+  const uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
+  int first_pos;
+  int pos;
+
+  assert(mbmi->ref_mv_idx < 3);
+
+#if CONFIG_EXT_INTER
+  if (mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV) {
+#else
+  if (mbmi->mode == NEWMV) {
+#endif
+    first_pos = 0;
+  } else if (have_nearmv_in_inter_mode(mbmi->mode)) {
+    // TODO(jingning): Temporary solution to compensate the NEARESTMV offset.
+    first_pos = 1;
+  } else {
+    return;
+  }
+
+  for (pos = first_pos; pos < first_pos + 2; ++pos) {
+    if (mbmi_ext->ref_mv_count[ref_frame_type] > pos + 1) {
+      const uint8_t drl_ctx =
+          av1_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type], pos);
+      const int is_selected = mbmi->ref_mv_idx == (pos - first_pos);
+
+      aom_write(w, !is_selected, cm->fc->drl_prob[drl_ctx]);
+      if (is_selected) return;
+    }
+  }
+}
+#endif
+
+#if CONFIG_EXT_INTER
+// Writes the compound inter mode `mode` as a token of
+// av1_inter_compound_mode_tree, using the frame-context probabilities
+// selected by `mode_ctx`.
+static void write_inter_compound_mode(AV1_COMMON *cm, aom_writer *w,
+                                      PREDICTION_MODE mode,
+                                      const int16_t mode_ctx) {
+  assert(is_inter_compound_mode(mode));
+
+  av1_write_token(w, av1_inter_compound_mode_tree,
+                  cm->fc->inter_compound_mode_probs[mode_ctx],
+                  &inter_compound_mode_encodings[INTER_COMPOUND_OFFSET(mode)]);
+}
+#endif // CONFIG_EXT_INTER
+
+// Writes `data` to the bit buffer as a literal whose width is
+// get_unsigned_bits(max).
+static void encode_unsigned_max(struct aom_write_bit_buffer *wb, int data,
+                                int max) {
+  const int width = get_unsigned_bits(max);
+
+  aom_wb_write_literal(wb, data, width);
+}
+
+#if !CONFIG_EC_ADAPT || \
+ (CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION || CONFIG_EXT_INTER)
+// Converts the symbol `counts` of an n-symbol tree into per-node branch
+// counts and runs av1_cond_prob_diff_update() on each of the n - 1 node
+// probabilities in `probs`. At most 32 symbols are supported (asserted).
+static void prob_diff_update(const aom_tree_index *tree,
+                             aom_prob probs[/*n - 1*/],
+                             const unsigned int counts[/*n - 1*/], int n,
+                             int probwt, aom_writer *w) {
+  unsigned int branch_ct[32][2];
+  int node = 0;
+
+  // Assuming max number of probabilities <= 32
+  assert(n <= 32);
+
+  av1_tree_probs_from_distribution(tree, branch_ct, counts);
+
+  while (node < n - 1) {
+    av1_cond_prob_diff_update(w, &probs[node], branch_ct[node], probwt);
+    ++node;
+  }
+}
+#endif
+
+#if CONFIG_EXT_INTER || !CONFIG_EC_ADAPT
+// Returns the sum of av1_cond_prob_diff_update_savings() over the n - 1
+// node probabilities of an n-symbol tree, using branch counts derived from
+// `counts`. At most 32 symbols are supported (asserted).
+static int prob_diff_update_savings(const aom_tree_index *tree,
+                                    aom_prob probs[/*n - 1*/],
+                                    const unsigned int counts[/*n - 1*/], int n,
+                                    int probwt) {
+  unsigned int branch_ct[32][2];
+  int total = 0;
+  int node = 0;
+
+  // Assuming max number of probabilities <= 32
+  assert(n <= 32);
+
+  av1_tree_probs_from_distribution(tree, branch_ct, counts);
+
+  while (node < n - 1) {
+    total += av1_cond_prob_diff_update_savings(&probs[node], branch_ct[node],
+                                               probwt);
+    ++node;
+  }
+  return total;
+}
+#endif // CONFIG_EXT_INTER || !CONFIG_EC_ADAPT
+
+#if CONFIG_VAR_TX
+// Recursively writes the transform-partition flags for the transform block
+// of size `tx_size` at (blk_row, blk_col), and updates the above/left
+// transform contexts in `xd` as it goes.
+//
+// - Positions at or beyond the limits returned by max_block_high() /
+//   max_block_wide() write nothing and leave the contexts untouched.
+// - At depth MAX_VARTX_DEPTH no flag is written; only the contexts are
+//   updated.
+// - Otherwise one flag is written: 0 when `tx_size` equals the size stored
+//   in mbmi->inter_tx_size for this position, 1 when the block is split.
+//   A split TX_8X8 stops there; larger sizes recurse into four sub-blocks.
+//
+// The partition context is looked up only on the path that writes a flag.
+// NOTE(review): this relies on txfm_partition_context() having no side
+// effects -- confirm against its definition.
+static void write_tx_size_vartx(const AV1_COMMON *cm, const MACROBLOCKD *xd,
+                                const MB_MODE_INFO *mbmi, TX_SIZE tx_size,
+                                int depth, int blk_row, int blk_col,
+                                aom_writer *w) {
+  const int tx_row = blk_row >> 1;
+  const int tx_col = blk_col >> 1;
+  const int max_blocks_high = max_block_high(xd, mbmi->sb_type, 0);
+  const int max_blocks_wide = max_block_wide(xd, mbmi->sb_type, 0);
+  int ctx;
+
+  if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
+
+  if (depth == MAX_VARTX_DEPTH) {
+    txfm_partition_update(xd->above_txfm_context + tx_col,
+                          xd->left_txfm_context + tx_row, tx_size, tx_size);
+    return;
+  }
+
+  // Must be read before any txfm_partition_update() call below.
+  ctx = txfm_partition_context(xd->above_txfm_context + tx_col,
+                               xd->left_txfm_context + tx_row, mbmi->sb_type,
+                               tx_size);
+
+  if (tx_size == mbmi->inter_tx_size[tx_row][tx_col]) {
+    aom_write(w, 0, cm->fc->txfm_partition_prob[ctx]);
+    txfm_partition_update(xd->above_txfm_context + tx_col,
+                          xd->left_txfm_context + tx_row, tx_size, tx_size);
+  } else {
+    const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
+    const int bsl = tx_size_wide_unit[sub_txs];
+    int i;
+
+    aom_write(w, 1, cm->fc->txfm_partition_prob[ctx]);
+
+    if (tx_size == TX_8X8) {
+      txfm_partition_update(xd->above_txfm_context + tx_col,
+                            xd->left_txfm_context + tx_row, sub_txs, tx_size);
+      return;
+    }
+
+    assert(bsl > 0);
+    // Visit the four quadrants: (0,0), (0,1), (1,0), (1,1).
+    for (i = 0; i < 4; ++i) {
+      int offsetr = blk_row + (i >> 1) * bsl;
+      int offsetc = blk_col + (i & 0x01) * bsl;
+      write_tx_size_vartx(cm, xd, mbmi, sub_txs, depth + 1, offsetr, offsetc,
+                          w);
+    }
+  }
+}
+
+// Runs av1_cond_prob_diff_update() on each of the TXFM_PARTITION_CONTEXTS
+// transform-partition probabilities in the frame context.
+static void update_txfm_partition_probs(AV1_COMMON *cm, aom_writer *w,
+                                        FRAME_COUNTS *counts, int probwt) {
+  int ctx = 0;
+
+  while (ctx < TXFM_PARTITION_CONTEXTS) {
+    av1_cond_prob_diff_update(w, &cm->fc->txfm_partition_prob[ctx],
+                              counts->txfm_partition[ctx], probwt);
+    ++ctx;
+  }
+}
+#endif
+
+// Writes the transform size of the current block (xd->mi[0]) as a depth
+// value. Nothing is written for blocks below the size threshold chosen by
+// the preprocessor branch below. The entropy context comes from the tile
+// context under CONFIG_EC_ADAPT and from the frame context otherwise.
+static void write_selected_tx_size(const AV1_COMMON *cm, const MACROBLOCKD *xd,
+ aom_writer *w) {
+ const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+ const BLOCK_SIZE bsize = mbmi->sb_type;
+#if CONFIG_EC_ADAPT
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+ (void)cm;
+#else
+ FRAME_CONTEXT *ec_ctx = cm->fc;
+#endif
+// For sub8x8 blocks the tx_size symbol does not need to be sent
+#if CONFIG_CB4X4 && (CONFIG_VAR_TX || CONFIG_EXT_TX) && CONFIG_RECT_TX
+ if (bsize > BLOCK_4X4) {
+#else
+ if (bsize >= BLOCK_8X8) {
+#endif
+ const TX_SIZE tx_size = mbmi->tx_size;
+ const int is_inter = is_inter_block(mbmi);
+ const int tx_size_ctx = get_tx_size_context(xd);
+ // The size category is looked up per block size, separately for inter
+ // and intra blocks.
+ const int tx_size_cat = is_inter ? inter_tx_size_cat_lookup[bsize]
+ : intra_tx_size_cat_lookup[bsize];
+ // The coded value is the depth of txsize_sqr_up_map[tx_size], not of
+ // tx_size itself.
+ const TX_SIZE coded_tx_size = txsize_sqr_up_map[tx_size];
+ const int depth = tx_size_to_depth(coded_tx_size);
+#if CONFIG_EXT_TX && CONFIG_RECT_TX
+ assert(IMPLIES(is_rect_tx(tx_size), is_rect_tx_allowed(xd, mbmi)));
+ assert(
+ IMPLIES(is_rect_tx(tx_size), tx_size == max_txsize_rect_lookup[bsize]));
+#endif // CONFIG_EXT_TX && CONFIG_RECT_TX
+
+#if CONFIG_EC_MULTISYMBOL
+ // The symbol alphabet has tx_size_cat + 2 entries.
+ aom_write_symbol(w, depth, ec_ctx->tx_size_cdf[tx_size_cat][tx_size_ctx],
+ tx_size_cat + 2);
+#else
+ av1_write_token(w, av1_tx_size_tree[tx_size_cat],
+ ec_ctx->tx_size_probs[tx_size_cat][tx_size_ctx],
+ &tx_size_encodings[tx_size_cat][depth]);
+#endif
+ }
+}
+
+#if CONFIG_REF_MV
+// Runs av1_cond_prob_diff_update() on the four families of binary
+// inter-mode probabilities in the frame context, in this order: newmv,
+// zeromv, refmv, drl. The weight passed along is cm->num_tg when
+// CONFIG_TILE_GROUPS is set and 1 otherwise.
+static void update_inter_mode_probs(AV1_COMMON *cm, aom_writer *w,
+                                    FRAME_COUNTS *counts) {
+#if CONFIG_TILE_GROUPS
+  const int probwt = cm->num_tg;
+#else
+  const int probwt = 1;
+#endif
+  FRAME_CONTEXT *const fc = cm->fc;
+  int ctx;
+
+  for (ctx = 0; ctx < NEWMV_MODE_CONTEXTS; ++ctx) {
+    av1_cond_prob_diff_update(w, &fc->newmv_prob[ctx], counts->newmv_mode[ctx],
+                              probwt);
+  }
+  for (ctx = 0; ctx < ZEROMV_MODE_CONTEXTS; ++ctx) {
+    av1_cond_prob_diff_update(w, &fc->zeromv_prob[ctx],
+                              counts->zeromv_mode[ctx], probwt);
+  }
+  for (ctx = 0; ctx < REFMV_MODE_CONTEXTS; ++ctx) {
+    av1_cond_prob_diff_update(w, &fc->refmv_prob[ctx], counts->refmv_mode[ctx],
+                              probwt);
+  }
+  for (ctx = 0; ctx < DRL_MODE_CONTEXTS; ++ctx) {
+    av1_cond_prob_diff_update(w, &fc->drl_prob[ctx], counts->drl_mode[ctx],
+                              probwt);
+  }
+}
+#endif
+
+#if CONFIG_EXT_INTER
+// Group update of the compound inter-mode probabilities. The savings of
+// updating every context are summed; a single flag coded with
+// GROUP_DIFF_UPDATE_PROB says whether updates follow. Updates are written
+// only when the summed savings exceed the cost difference between coding
+// the flag as 1 and as 0.
+static void update_inter_compound_mode_probs(AV1_COMMON *cm, int probwt,
+                                             aom_writer *w) {
+  const int savings_thresh = av1_cost_one(GROUP_DIFF_UPDATE_PROB) -
+                             av1_cost_zero(GROUP_DIFF_UPDATE_PROB);
+  int total_savings = 0;
+  int ctx;
+
+  for (ctx = 0; ctx < INTER_MODE_CONTEXTS; ++ctx) {
+    total_savings += prob_diff_update_savings(
+        av1_inter_compound_mode_tree, cm->fc->inter_compound_mode_probs[ctx],
+        cm->counts.inter_compound_mode[ctx], INTER_COMPOUND_MODES, probwt);
+  }
+
+  if (total_savings <= savings_thresh) {
+    aom_write(w, 0, GROUP_DIFF_UPDATE_PROB);
+    return;
+  }
+
+  aom_write(w, 1, GROUP_DIFF_UPDATE_PROB);
+  for (ctx = 0; ctx < INTER_MODE_CONTEXTS; ++ctx) {
+    prob_diff_update(
+        av1_inter_compound_mode_tree, cm->fc->inter_compound_mode_probs[ctx],
+        cm->counts.inter_compound_mode[ctx], INTER_COMPOUND_MODES, probwt, w);
+  }
+}
+#endif // CONFIG_EXT_INTER
+
+// Writes the skip flag of `mi` and returns its value. When the segment has
+// the SEG_LVL_SKIP feature active nothing is written and 1 is returned.
+static int write_skip(const AV1_COMMON *cm, const MACROBLOCKD *xd,
+                      int segment_id, const MODE_INFO *mi, aom_writer *w) {
+  int skip;
+
+  if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) return 1;
+
+  skip = mi->mbmi.skip;
+  aom_write(w, skip, av1_get_skip_prob(cm, xd));
+  return skip;
+}
+
+#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+// Writes the motion mode of `mi`, depending on the last motion mode that
+// motion_mode_allowed() permits for the block:
+//   - SIMPLE_TRANSLATION: nothing is written;
+//   - OBMC_CAUSAL (only when both CONFIG_MOTION_VAR and
+//     CONFIG_WARPED_MOTION are set): one bit telling whether OBMC is used;
+//   - otherwise: a token of av1_motion_mode_tree.
+// Probabilities are indexed by the block size.
+static void write_motion_mode(const AV1_COMMON *cm, const MODE_INFO *mi,
+                              aom_writer *w) {
+  const MB_MODE_INFO *mbmi = &mi->mbmi;
+  MOTION_MODE last_allowed = motion_mode_allowed(
+#if CONFIG_GLOBAL_MOTION && SEPARATE_GLOBAL_MOTION
+      0, cm->global_motion,
+#endif  // CONFIG_GLOBAL_MOTION && SEPARATE_GLOBAL_MOTION
+      mi);
+
+  if (last_allowed == SIMPLE_TRANSLATION) return;
+
+#if CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION
+  if (last_allowed == OBMC_CAUSAL) {
+    aom_write(w, mbmi->motion_mode == OBMC_CAUSAL,
+              cm->fc->obmc_prob[mbmi->sb_type]);
+    return;
+  }
+#endif  // CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION
+
+  av1_write_token(w, av1_motion_mode_tree,
+                  cm->fc->motion_mode_prob[mbmi->sb_type],
+                  &motion_mode_encodings[mbmi->motion_mode]);
+}
+#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+
+#if CONFIG_DELTA_Q
+// Writes the signed value `delta_qindex`.
+//
+// The magnitude goes first, capped at DELTA_Q_SMALL: as one symbol under
+// CONFIG_EC_MULTISYMBOL, otherwise as a run of "more" bits ended by a 0
+// bit or by reaching DELTA_Q_SMALL bits. Magnitudes of DELTA_Q_SMALL or
+// more then add a 3-bit length `rem_bits` and a `rem_bits`-wide remainder
+// relative to (1 << rem_bits) + 1. A sign bit follows any non-zero value.
+static void write_delta_qindex(const AV1_COMMON *cm, const MACROBLOCKD *xd,
+                               int delta_qindex, aom_writer *w) {
+  const int is_negative = delta_qindex < 0;
+  const int magnitude = is_negative ? -delta_qindex : delta_qindex;
+#if CONFIG_EC_ADAPT
+  FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+  (void)cm;
+#else
+  FRAME_CONTEXT *ec_ctx = cm->fc;
+  (void)xd;
+#endif
+
+#if CONFIG_EC_MULTISYMBOL
+  aom_write_symbol(w, AOMMIN(magnitude, DELTA_Q_SMALL), ec_ctx->delta_q_cdf,
+                   DELTA_Q_PROBS + 1);
+#else
+  {
+    int i;
+    for (i = 0; i < DELTA_Q_SMALL && i <= magnitude; ++i)
+      aom_write(w, i < magnitude, ec_ctx->delta_q_prob[i]);
+  }
+#endif
+
+  if (magnitude >= DELTA_Q_SMALL) {
+    const int rem_bits = OD_ILOG_NZ(magnitude - 1) - 1;
+    const int thr = (1 << rem_bits) + 1;
+    aom_write_literal(w, rem_bits, 3);
+    aom_write_literal(w, magnitude - thr, rem_bits);
+  }
+
+  if (magnitude > 0) aom_write_bit(w, is_negative);
+}
+
+#if !CONFIG_EC_ADAPT
+// Runs av1_cond_prob_diff_update() on each of the DELTA_Q_PROBS delta-q
+// probabilities in the frame context. Under CONFIG_EXT_DELTA_Q nothing is
+// written when cm->delta_q_present_flag is clear.
+static void update_delta_q_probs(AV1_COMMON *cm, aom_writer *w,
+                                 FRAME_COUNTS *counts) {
+#if CONFIG_TILE_GROUPS
+  const int probwt = cm->num_tg;
+#else
+  const int probwt = 1;
+#endif
+  FRAME_CONTEXT *const fc = cm->fc;
+  int idx;
+
+#if CONFIG_EXT_DELTA_Q
+  if (!cm->delta_q_present_flag) return;
+#endif  // CONFIG_EXT_DELTA_Q
+
+  for (idx = 0; idx < DELTA_Q_PROBS; ++idx)
+    av1_cond_prob_diff_update(w, &fc->delta_q_prob[idx], counts->delta_q[idx],
+                              probwt);
+}
+#endif // !CONFIG_EC_ADAPT
+
+#if CONFIG_EXT_DELTA_Q
+// Writes the signed value `delta_lflevel`.
+//
+// The magnitude goes first, capped at DELTA_LF_SMALL: as one symbol under
+// CONFIG_EC_MULTISYMBOL, otherwise as a run of "more" bits ended by a 0
+// bit or by reaching DELTA_LF_SMALL bits. Magnitudes of DELTA_LF_SMALL or
+// more then add a 3-bit length `rem_bits` and a `rem_bits`-wide remainder
+// relative to (1 << rem_bits) + 1. A sign bit follows any non-zero value.
+static void write_delta_lflevel(const AV1_COMMON *cm, const MACROBLOCKD *xd,
+                                int delta_lflevel, aom_writer *w) {
+  const int is_negative = delta_lflevel < 0;
+  const int magnitude = is_negative ? -delta_lflevel : delta_lflevel;
+#if CONFIG_EC_ADAPT
+  FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+  (void)cm;
+#else
+  FRAME_CONTEXT *ec_ctx = cm->fc;
+  (void)xd;
+#endif
+
+#if CONFIG_EC_MULTISYMBOL
+  aom_write_symbol(w, AOMMIN(magnitude, DELTA_LF_SMALL), ec_ctx->delta_lf_cdf,
+                   DELTA_LF_PROBS + 1);
+#else
+  {
+    int i;
+    for (i = 0; i < DELTA_LF_SMALL && i <= magnitude; ++i)
+      aom_write(w, i < magnitude, ec_ctx->delta_lf_prob[i]);
+  }
+#endif  // CONFIG_EC_MULTISYMBOL
+
+  if (magnitude >= DELTA_LF_SMALL) {
+    const int rem_bits = OD_ILOG_NZ(magnitude - 1) - 1;
+    const int thr = (1 << rem_bits) + 1;
+    aom_write_literal(w, rem_bits, 3);
+    aom_write_literal(w, magnitude - thr, rem_bits);
+  }
+
+  if (magnitude > 0) aom_write_bit(w, is_negative);
+}
+
+#if !CONFIG_EC_ADAPT
+// Runs av1_cond_prob_diff_update() on each of the DELTA_LF_PROBS delta-lf
+// probabilities in the frame context. Nothing is written when
+// cm->delta_lf_present_flag is clear.
+static void update_delta_lf_probs(AV1_COMMON *cm, aom_writer *w,
+                                  FRAME_COUNTS *counts) {
+#if CONFIG_TILE_GROUPS
+  const int probwt = cm->num_tg;
+#else
+  const int probwt = 1;
+#endif
+  FRAME_CONTEXT *const fc = cm->fc;
+  int idx;
+
+  if (!cm->delta_lf_present_flag) return;
+
+  for (idx = 0; idx < DELTA_LF_PROBS; ++idx)
+    av1_cond_prob_diff_update(w, &fc->delta_lf_prob[idx],
+                              counts->delta_lf[idx], probwt);
+}
+#endif // !CONFIG_EC_ADAPT
+#endif // CONFIG_EXT_DELTA_Q
+#endif // CONFIG_DELTA_Q
+
+// Runs av1_cond_prob_diff_update() on each of the SKIP_CONTEXTS skip
+// probabilities in the frame context. The weight passed along is
+// cm->num_tg when CONFIG_TILE_GROUPS is set and 1 otherwise.
+static void update_skip_probs(AV1_COMMON *cm, aom_writer *w,
+                              FRAME_COUNTS *counts) {
+#if CONFIG_TILE_GROUPS
+  const int probwt = cm->num_tg;
+#else
+  const int probwt = 1;
+#endif
+  FRAME_CONTEXT *const fc = cm->fc;
+  int ctx;
+
+  for (ctx = 0; ctx < SKIP_CONTEXTS; ++ctx)
+    av1_cond_prob_diff_update(w, &fc->skip_probs[ctx], counts->skip[ctx],
+                              probwt);
+}
+
+#if !CONFIG_EC_ADAPT
+// Runs prob_diff_update() on the switchable interpolation filter tree
+// probabilities of every filter context. The weight passed along is
+// cm->num_tg when CONFIG_TILE_GROUPS is set and 1 otherwise.
+static void update_switchable_interp_probs(AV1_COMMON *cm, aom_writer *w,
+                                           FRAME_COUNTS *counts) {
+#if CONFIG_TILE_GROUPS
+  const int probwt = cm->num_tg;
+#else
+  const int probwt = 1;
+#endif
+  int ctx;
+
+  for (ctx = 0; ctx < SWITCHABLE_FILTER_CONTEXTS; ++ctx)
+    prob_diff_update(av1_switchable_interp_tree,
+                     cm->fc->switchable_interp_prob[ctx],
+                     counts->switchable_interp[ctx], SWITCHABLE_FILTERS, probwt,
+                     w);
+}
+#endif
+
+#if !CONFIG_EC_ADAPT
+#if CONFIG_EXT_TX
+// Group update of the extended transform type probabilities
+// (CONFIG_EXT_TX variant).
+//
+// For every inter set s >= 1, and then every intra set s >= 1, the savings
+// of updating are summed over the transform sizes enabled for that set.
+// One flag coded with GROUP_DIFF_UPDATE_PROB is written per set, and the
+// differential updates follow only when the flag is 1. Intra probabilities
+// are additionally indexed by intra mode.
+static void update_ext_tx_probs(AV1_COMMON *cm, aom_writer *w) {
+ // Cost difference between coding the group flag as 1 and as 0; the
+ // summed savings must exceed it for an update to be written.
+ const int savings_thresh = av1_cost_one(GROUP_DIFF_UPDATE_PROB) -
+ av1_cost_zero(GROUP_DIFF_UPDATE_PROB);
+ int i, j;
+ int s;
+#if CONFIG_TILE_GROUPS
+ const int probwt = cm->num_tg;
+#else
+ const int probwt = 1;
+#endif
+ // Inter sets. Set 0 is skipped.
+ for (s = 1; s < EXT_TX_SETS_INTER; ++s) {
+ int savings = 0;
+ int do_update = 0;
+ for (i = TX_4X4; i < EXT_TX_SIZES; ++i) {
+ if (!use_inter_ext_tx_for_txsize[s][i]) continue;
+ savings += prob_diff_update_savings(
+ av1_ext_tx_inter_tree[s], cm->fc->inter_ext_tx_prob[s][i],
+ cm->counts.inter_ext_tx[s][i],
+ num_ext_tx_set[ext_tx_set_type_inter[s]], probwt);
+ }
+ do_update = savings > savings_thresh;
+ aom_write(w, do_update, GROUP_DIFF_UPDATE_PROB);
+ if (do_update) {
+ for (i = TX_4X4; i < EXT_TX_SIZES; ++i) {
+ if (!use_inter_ext_tx_for_txsize[s][i]) continue;
+ prob_diff_update(av1_ext_tx_inter_tree[s],
+ cm->fc->inter_ext_tx_prob[s][i],
+ cm->counts.inter_ext_tx[s][i],
+ num_ext_tx_set[ext_tx_set_type_inter[s]], probwt, w);
+ }
+ }
+ }
+
+ // Intra sets. Set 0 is skipped.
+ for (s = 1; s < EXT_TX_SETS_INTRA; ++s) {
+ int savings = 0;
+ int do_update = 0;
+ for (i = TX_4X4; i < EXT_TX_SIZES; ++i) {
+ if (!use_intra_ext_tx_for_txsize[s][i]) continue;
+ for (j = 0; j < INTRA_MODES; ++j)
+ savings += prob_diff_update_savings(
+ av1_ext_tx_intra_tree[s], cm->fc->intra_ext_tx_prob[s][i][j],
+ cm->counts.intra_ext_tx[s][i][j],
+ num_ext_tx_set[ext_tx_set_type_intra[s]], probwt);
+ }
+ do_update = savings > savings_thresh;
+ aom_write(w, do_update, GROUP_DIFF_UPDATE_PROB);
+ if (do_update) {
+ for (i = TX_4X4; i < EXT_TX_SIZES; ++i) {
+ if (!use_intra_ext_tx_for_txsize[s][i]) continue;
+ for (j = 0; j < INTRA_MODES; ++j)
+ prob_diff_update(av1_ext_tx_intra_tree[s],
+ cm->fc->intra_ext_tx_prob[s][i][j],
+ cm->counts.intra_ext_tx[s][i][j],
+ num_ext_tx_set[ext_tx_set_type_intra[s]], probwt, w);
+ }
+ }
+ }
+}
+
+#else
+// Group update of the transform type probabilities (variant without
+// CONFIG_EXT_TX).
+//
+// The intra probabilities go first, then the inter probabilities. For
+// each group the savings of updating are summed over all transform sizes.
+// One flag coded with GROUP_DIFF_UPDATE_PROB is written, and the
+// differential updates follow only when the flag is 1.
+static void update_ext_tx_probs(AV1_COMMON *cm, aom_writer *w) {
+ // Cost difference between coding the group flag as 1 and as 0; the
+ // summed savings must exceed it for an update to be written.
+ const int savings_thresh = av1_cost_one(GROUP_DIFF_UPDATE_PROB) -
+ av1_cost_zero(GROUP_DIFF_UPDATE_PROB);
+ int i, j;
+
+ int savings = 0;
+ int do_update = 0;
+#if CONFIG_TILE_GROUPS
+ const int probwt = cm->num_tg;
+#else
+ const int probwt = 1;
+#endif
+ // Intra group: the second index runs over TX_TYPES entries.
+ for (i = TX_4X4; i < EXT_TX_SIZES; ++i) {
+ for (j = 0; j < TX_TYPES; ++j)
+ savings += prob_diff_update_savings(
+ av1_ext_tx_tree, cm->fc->intra_ext_tx_prob[i][j],
+ cm->counts.intra_ext_tx[i][j], TX_TYPES, probwt);
+ }
+ do_update = savings > savings_thresh;
+ aom_write(w, do_update, GROUP_DIFF_UPDATE_PROB);
+ if (do_update) {
+ for (i = TX_4X4; i < EXT_TX_SIZES; ++i) {
+ for (j = 0; j < TX_TYPES; ++j) {
+ prob_diff_update(av1_ext_tx_tree, cm->fc->intra_ext_tx_prob[i][j],
+ cm->counts.intra_ext_tx[i][j], TX_TYPES, probwt, w);
+ }
+ }
+ }
+
+ // Inter group: the savings accumulator is restarted.
+ savings = 0;
+ for (i = TX_4X4; i < EXT_TX_SIZES; ++i) {
+ savings +=
+ prob_diff_update_savings(av1_ext_tx_tree, cm->fc->inter_ext_tx_prob[i],
+ cm->counts.inter_ext_tx[i], TX_TYPES, probwt);
+ }
+ do_update = savings > savings_thresh;
+ aom_write(w, do_update, GROUP_DIFF_UPDATE_PROB);
+ if (do_update) {
+ for (i = TX_4X4; i < EXT_TX_SIZES; ++i) {
+ prob_diff_update(av1_ext_tx_tree, cm->fc->inter_ext_tx_prob[i],
+ cm->counts.inter_ext_tx[i], TX_TYPES, probwt, w);
+ }
+ }
+}
+#endif // CONFIG_EXT_TX
+#endif // !CONFIG_EC_ADAPT
+#if CONFIG_PALETTE
+// Writes `num` palette colour-index tokens starting at *tp, using the tree
+// and codeword table for palette size `n`, then advances *tp past them.
+static void pack_palette_tokens(aom_writer *w, const TOKENEXTRA **tp, int n,
+                                int num) {
+  const TOKENEXTRA *tok = *tp;
+  const int size_idx = n - PALETTE_MIN_SIZE;
+  int remaining;
+
+  for (remaining = num; remaining > 0; --remaining, ++tok) {
+    av1_write_token(w, av1_palette_color_index_tree[size_idx],
+                    tok->context_tree,
+                    &palette_color_index_encodings[size_idx][tok->token]);
+  }
+
+  *tp = tok;
+}
+#endif // CONFIG_PALETTE
+
+#if !CONFIG_PVQ
+#if CONFIG_SUPERTX
+// Group update of the supertx probabilities. The savings of updating every
+// (partition context, transform size >= TX_8X8) entry are summed; a single
+// flag coded with GROUP_DIFF_UPDATE_PROB says whether updates follow.
+// Updates are written only when the summed savings exceed the cost
+// difference between coding the flag as 1 and as 0.
+static void update_supertx_probs(AV1_COMMON *cm, int probwt, aom_writer *w) {
+  const int savings_thresh = av1_cost_one(GROUP_DIFF_UPDATE_PROB) -
+                             av1_cost_zero(GROUP_DIFF_UPDATE_PROB);
+  int total_savings = 0;
+  int ctx, tx;
+
+  for (ctx = 0; ctx < PARTITION_SUPERTX_CONTEXTS; ++ctx) {
+    for (tx = TX_8X8; tx < TX_SIZES; ++tx) {
+      total_savings += av1_cond_prob_diff_update_savings(
+          &cm->fc->supertx_prob[ctx][tx], cm->counts.supertx[ctx][tx], probwt);
+    }
+  }
+
+  if (total_savings <= savings_thresh) {
+    aom_write(w, 0, GROUP_DIFF_UPDATE_PROB);
+    return;
+  }
+
+  aom_write(w, 1, GROUP_DIFF_UPDATE_PROB);
+  for (ctx = 0; ctx < PARTITION_SUPERTX_CONTEXTS; ++ctx) {
+    for (tx = TX_8X8; tx < TX_SIZES; ++tx) {
+      av1_cond_prob_diff_update(w, &cm->fc->supertx_prob[ctx][tx],
+                                cm->counts.supertx[ctx][tx], probwt);
+    }
+  }
+}
+#endif // CONFIG_SUPERTX
+
+#if CONFIG_NEW_MULTISYMBOL
+// Writes the low `n` bits of `val`, least significant group first, in
+// groups of up to 4 bits. Group g is coded with cdf[g] as a symbol from an
+// alphabet of 1 << (group size) values.
+static INLINE void write_coeff_extra(const aom_cdf_prob *const *cdf, int val,
+                                     int n, aom_writer *w) {
+  int group = 0;
+  int remaining;
+
+  for (remaining = n; remaining > 0; ++group) {
+    const int size = AOMMIN(remaining, 4);
+    const int nsymbs = 1 << size;
+
+    aom_write_cdf(w, val & (nsymbs - 1), cdf[group], nsymbs);
+    val >>= size;
+    remaining -= size;
+  }
+}
+#else
+// Writes the extra bits of `value` one bit at a time, most significant
+// first. Of the `num_bits` positions the leading `skip_bits` are not
+// written; position i (counted from the MSB) is coded with pb[i].
+static INLINE void write_coeff_extra(const aom_prob *pb, int value,
+                                     int num_bits, int skip_bits, aom_writer *w,
+                                     TOKEN_STATS *token_stats) {
+  int shift;
+
+  for (shift = num_bits - skip_bits - 1; shift >= 0; --shift) {
+    const int index = num_bits - 1 - shift;
+
+    aom_write_record(w, (value >> shift) & 1, pb[index], token_stats);
+  }
+}
+#endif
+
+#if CONFIG_NEW_TOKENSET && !CONFIG_LV_MAP
+// Writes coefficient tokens starting at *tp and advances *tp past the
+// tokens consumed (CONFIG_NEW_TOKENSET variant).
+//
+// Tokens are consumed until `stop` or an EOSB_TOKEN is reached. Under
+// CONFIG_VAR_TX the loop also ends after an EOB_TOKEN or once
+// tx_size_2d[tx_size] tokens have been counted; a BLOCK_Z_TOKEN takes the
+// `continue` path and is therefore not counted.
+//
+// Per token:
+//   - BLOCK_Z_TOKEN: head symbol 0 only;
+//   - otherwise either a single bit (when eob_val is LAST_EOB) or a
+//     combined head symbol built from the token, eob_val and first_val;
+//   - tokens above ONE_TOKEN add a tail symbol;
+//   - tokens with a non-zero base value add their extra bits and a final
+//     sign bit (bit 0 of p->extra).
+static void pack_mb_tokens(aom_writer *w, const TOKENEXTRA **tp,
+                           const TOKENEXTRA *const stop,
+                           aom_bit_depth_t bit_depth, const TX_SIZE tx_size,
+                           TOKEN_STATS *token_stats) {
+  const TOKENEXTRA *p = *tp;
+#if CONFIG_VAR_TX
+  int count = 0;
+  const int seg_eob = tx_size_2d[tx_size];
+#endif
+
+  while (p < stop && p->token != EOSB_TOKEN) {
+    const int token = p->token;
+    if (token == BLOCK_Z_TOKEN) {
+      aom_write_symbol(w, 0, *p->head_cdf, HEAD_TOKENS + 1);
+      p++;
+      continue;
+    }
+
+    const av1_extra_bit *const extra_bits = &av1_extra_bits[token];
+    if (p->eob_val == LAST_EOB) {
+      // Just code a flag indicating whether the value is >1 or 1.
+      aom_write_bit(w, token != ONE_TOKEN);
+    } else {
+      int comb_symb = 2 * AOMMIN(token, TWO_TOKEN) - p->eob_val + p->first_val;
+      aom_write_symbol(w, comb_symb, *p->head_cdf, HEAD_TOKENS + p->first_val);
+    }
+    if (token > ONE_TOKEN) {
+      aom_write_symbol(w, token - TWO_TOKEN, *p->tail_cdf, TAIL_TOKENS);
+    }
+
+    if (extra_bits->base_val) {
+      const int bit_string = p->extra;
+      // Length of extra bits to be written excluding the sign bit.
+      const int bit_string_length = extra_bits->len;
+      const int is_cat6 = (extra_bits->base_val == CAT6_MIN_VAL);
+      // For category 6 the leading bits beyond the size reported for this
+      // transform size and bit depth are skipped.
+      int skip_bits = is_cat6
+                          ? (int)sizeof(av1_cat6_prob) -
+                                av1_get_cat6_extrabits_size(tx_size, bit_depth)
+                          : 0;
+
+      assert(!(bit_string >> (bit_string_length - skip_bits + 1)));
+      // Braced so the guarded statement stays obvious whichever
+      // preprocessor branch supplies it.
+      if (bit_string_length > 0) {
+#if CONFIG_NEW_MULTISYMBOL
+        write_coeff_extra(extra_bits->cdf, bit_string >> 1,
+                          bit_string_length - skip_bits, w);
+#else
+        write_coeff_extra(extra_bits->prob, bit_string >> 1, bit_string_length,
+                          skip_bits, w, token_stats);
+#endif
+      }
+
+      // Sign bit.
+      aom_write_bit_record(w, bit_string & 1, token_stats);
+    }
+    ++p;
+
+#if CONFIG_VAR_TX
+    ++count;
+    if (token == EOB_TOKEN || count == seg_eob) break;
+#endif
+  }
+
+  *tp = p;
+}
+#else // CONFIG_NEW_TOKENSET
+#if !CONFIG_LV_MAP
+// Writes coefficient tokens starting at *tp and advances *tp past the
+// tokens consumed (variant used without CONFIG_NEW_TOKENSET).
+//
+// Tokens are consumed until `stop` or an EOSB_TOKEN is reached. Under
+// CONFIG_VAR_TX the loop also ends after an EOB_TOKEN or once
+// tx_size_2d[tx_size] tokens have been written.
+//
+// Each token is coded as a chain of decisions using p->context_tree:
+// EOB or not (omitted when p->skip_eob_node is set), zero or not, then the
+// token itself -- one symbol under CONFIG_EC_MULTISYMBOL, otherwise a
+// "one or not" bit followed by a tree codeword for larger tokens. Tokens
+// with a non-zero base value add their extra bits and a final sign bit
+// (bit 0 of p->extra).
+static void pack_mb_tokens(aom_writer *w, const TOKENEXTRA **tp,
+ const TOKENEXTRA *const stop,
+ aom_bit_depth_t bit_depth, const TX_SIZE tx_size,
+ TOKEN_STATS *token_stats) {
+ const TOKENEXTRA *p = *tp;
+#if CONFIG_VAR_TX
+ int count = 0;
+ const int seg_eob = tx_size_2d[tx_size];
+#endif
+
+ while (p < stop && p->token != EOSB_TOKEN) {
+ const int token = p->token;
+#if !CONFIG_EC_MULTISYMBOL
+ const struct av1_token *const coef_encoding = &av1_coef_encodings[token];
+ int coef_value = coef_encoding->value;
+ int coef_length = coef_encoding->len;
+#endif // !CONFIG_EC_MULTISYMBOL
+ const av1_extra_bit *const extra_bits = &av1_extra_bits[token];
+
+#if CONFIG_EC_MULTISYMBOL
+ /* skip one or two nodes */
+ if (!p->skip_eob_node)
+ aom_write_record(w, token != EOB_TOKEN, p->context_tree[0], token_stats);
+ if (token != EOB_TOKEN) {
+ aom_write_record(w, token != ZERO_TOKEN, p->context_tree[1], token_stats);
+ if (token != ZERO_TOKEN) {
+ // Every remaining token, ONE_TOKEN through CATEGORY6_TOKEN, is one symbol.
+ aom_write_symbol(w, token - ONE_TOKEN, *p->token_cdf,
+ CATEGORY6_TOKEN - ONE_TOKEN + 1);
+ }
+ }
+#else
+ /* skip one or two nodes */
+ if (p->skip_eob_node)
+ coef_length -= p->skip_eob_node;
+ else
+ aom_write_record(w, token != EOB_TOKEN, p->context_tree[0], token_stats);
+
+ if (token != EOB_TOKEN) {
+ aom_write_record(w, token != ZERO_TOKEN, p->context_tree[1], token_stats);
+
+ if (token != ZERO_TOKEN) {
+ aom_write_record(w, token != ONE_TOKEN, p->context_tree[2],
+ token_stats);
+
+ if (token != ONE_TOKEN) {
+ // The nodes already written above are excluded from the codeword
+ // length passed to the tree writer.
+ const int unconstrained_len = UNCONSTRAINED_NODES - p->skip_eob_node;
+ aom_write_tree_record(
+ w, av1_coef_con_tree,
+ av1_pareto8_full[p->context_tree[PIVOT_NODE] - 1], coef_value,
+ coef_length - unconstrained_len, 0, token_stats);
+ }
+ }
+ }
+#endif // CONFIG_EC_MULTISYMBOL
+
+ if (extra_bits->base_val) {
+ const int bit_string = p->extra;
+ const int bit_string_length = extra_bits->len; // Length of extra bits to
+ // be written excluding
+ // the sign bit.
+ // For category 6 the leading bits beyond the size reported for this
+ // transform size and bit depth are skipped.
+ int skip_bits = (extra_bits->base_val == CAT6_MIN_VAL)
+ ? (int)sizeof(av1_cat6_prob) -
+ av1_get_cat6_extrabits_size(tx_size, bit_depth)
+ : 0;
+
+ assert(!(bit_string >> (bit_string_length - skip_bits + 1)));
+ if (bit_string_length > 0) {
+#if CONFIG_NEW_MULTISYMBOL
+ // Round the skip count down to a multiple of 4 before using it.
+ skip_bits &= ~3;
+ write_coeff_extra(extra_bits->cdf, bit_string >> 1,
+ bit_string_length - skip_bits, w);
+#else
+ write_coeff_extra(extra_bits->prob, bit_string >> 1, bit_string_length,
+ skip_bits, w, token_stats);
+#endif
+ }
+ // Sign bit.
+ aom_write_bit_record(w, bit_string & 1, token_stats);
+ }
+ ++p;
+
+#if CONFIG_VAR_TX
+ ++count;
+ if (token == EOB_TOKEN || count == seg_eob) break;
+#endif
+ }
+
+ *tp = p;
+}
+#endif // !CONFIG_LV_MAP
+#endif // CONFIG_NEW_TOKENSET
+#else // !CONFIG_PVQ
+ // Returns the queue entry at the current read position of pvq_q and advances
+ // that position by one. The asserts require the position to be no greater
+ // than last_pos and strictly inside the buffer (buf_len).
+ static PVQ_INFO *get_pvq_block(PVQ_QUEUE *pvq_q) {
+   assert(pvq_q->curr_pos <= pvq_q->last_pos);
+   assert(pvq_q->curr_pos < pvq_q->buf_len);
+
+   PVQ_INFO *const entry = &pvq_q->buf[pvq_q->curr_pos];
+   pvq_q->curr_pos += 1;
+   return entry;
+ }
+
+ /* Writes the queued PVQ data of one plane of the current block to w.
+  *
+  * The plane block (bsize clamped to at least BLOCK_8X8, then mapped through
+  * get_plane_block_size()) is walked in steps of (1 << tx_size) up to the
+  * limits returned by max_block_wide()/max_block_high(). For every step the
+  * next PVQ_INFO is taken from x->pvq_q and, in this order, the following is
+  * written: the ac_dc_coded symbol, the AC band partitions plus skip_dir (only
+  * when AC_CODED is set), the DC residue magnitude, and the DC sign bit.
+  * The adaptation state comes from x->daala_enc.state.adapt.
+  */ static void pack_pvq_tokens(aom_writer *w, MACROBLOCK *const x,
+ MACROBLOCKD *const xd, int plane, BLOCK_SIZE bsize,
+ const TX_SIZE tx_size) {
+ PVQ_INFO *pvq;
+ int idx, idy;
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ od_adapt_ctx *adapt;
+ int max_blocks_wide;
+ int max_blocks_high;
+ int step = (1 << tx_size);
+ const BLOCK_SIZE plane_bsize =
+ get_plane_block_size(AOMMAX(bsize, BLOCK_8X8), pd);
+
+ adapt = x->daala_enc.state.adapt;
+
+ max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
+ max_blocks_high = max_block_high(xd, plane_bsize, plane);
+
+ for (idy = 0; idy < max_blocks_high; idy += step) {
+ for (idx = 0; idx < max_blocks_wide; idx += step) {
+ const int is_keyframe = 0; // fixed to 0 on this path
+ const int encode_flip = 0; // fixed to 0 on this path
+ const int flip = 0; // fixed to 0 on this path
+ int i;
+ const int has_dc_skip = 1; // fixed to 1, so the DC test below reduces to the DC_CODED flag
+ int *exg = &adapt->pvq.pvq_exg[plane][tx_size][0];
+ int *ext = adapt->pvq.pvq_ext + tx_size * PVQ_MAX_PARTITIONS;
+ generic_encoder *model = adapt->pvq.pvq_param_model;
+
+ pvq = get_pvq_block(x->pvq_q); // consumes one queue entry per transform block
+
+ // encode block skip info
+ aom_write_symbol(w, pvq->ac_dc_coded,
+ adapt->skip_cdf[2 * tx_size + (plane != 0)], 4);
+
+ // AC coeffs coded?
+ if (pvq->ac_dc_coded & AC_CODED) {
+ assert(pvq->bs == tx_size);
+ for (i = 0; i < pvq->nb_bands; i++) {
+ if (i == 0 ||
+ (!pvq->skip_rest && !(pvq->skip_dir & (1 << ((i - 1) % 3))))) { // band 0 is always coded; later bands only if not skipped
+ pvq_encode_partition(
+ w, pvq->qg[i], pvq->theta[i], pvq->y + pvq->off[i],
+ pvq->size[i], pvq->k[i], model, adapt, exg + i, ext + i,
+ (plane != 0) * OD_TXSIZES * PVQ_MAX_PARTITIONS +
+ pvq->bs * PVQ_MAX_PARTITIONS + i,
+ is_keyframe, i == 0 && (i < pvq->nb_bands - 1), pvq->skip_rest,
+ encode_flip, flip);
+ }
+ if (i == 0 && !pvq->skip_rest && pvq->bs > 0) { // skip_dir is sent once, right after band 0
+ aom_write_symbol(
+ w, pvq->skip_dir,
+ &adapt->pvq
+ .pvq_skip_dir_cdf[(plane != 0) + 2 * (pvq->bs - 1)][0],
+ 7);
+ }
+ }
+ }
+ // Encode residue of DC coeff, if exist.
+ if (!has_dc_skip || (pvq->ac_dc_coded & DC_CODED)) {
+ generic_encode(w, &adapt->model_dc[plane],
+ abs(pvq->dq_dc_residue) - has_dc_skip,
+ &adapt->ex_dc[plane][pvq->bs][0], 2);
+ }
+ if ((pvq->ac_dc_coded & DC_CODED)) {
+ aom_write_bit(w, pvq->dq_dc_residue < 0); // sign of the DC residue
+ }
+ }
+ } // for (idy = 0;
+ }
+#endif // !CONFIG_PVQ
+
+#if CONFIG_VAR_TX && !CONFIG_COEF_INTERLEAVE
+ /* Recursively packs the tokens of one transform block of an inter block.
+  *
+  * Returns immediately when (blk_row, blk_col) lies outside the limits given
+  * by max_block_high()/max_block_wide(). Otherwise the transform size chosen
+  * for this position is read from mbmi->inter_tx_size (mapped through
+  * uv_txsize_lookup for plane != 0). If it equals tx_size the tokens are
+  * written with pack_mb_tokens() (or pack_pvq_tokens() under CONFIG_PVQ);
+  * if not, the block is split into four sub-blocks of
+  * sub_tx_size_map[tx_size] and each one is handled by a recursive call.
+  * Under CONFIG_RD_DEBUG the cost of a written block is added to token_stats.
+  */ static void pack_txb_tokens(aom_writer *w, const TOKENEXTRA **tp,
+ const TOKENEXTRA *const tok_end,
+#if CONFIG_PVQ
+ MACROBLOCK *const x,
+#endif
+ MACROBLOCKD *xd, MB_MODE_INFO *mbmi, int plane,
+ BLOCK_SIZE plane_bsize, aom_bit_depth_t bit_depth,
+ int block, int blk_row, int blk_col,
+ TX_SIZE tx_size, TOKEN_STATS *token_stats) {
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const BLOCK_SIZE bsize = txsize_to_bsize[tx_size];
+ const int tx_row = blk_row >> (1 - pd->subsampling_y);
+ const int tx_col = blk_col >> (1 - pd->subsampling_x);
+ TX_SIZE plane_tx_size;
+ const int max_blocks_high = max_block_high(xd, plane_bsize, plane);
+ const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
+
+ if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
+
+ plane_tx_size =
+ plane ? uv_txsize_lookup[bsize][mbmi->inter_tx_size[tx_row][tx_col]][0][0]
+ : mbmi->inter_tx_size[tx_row][tx_col];
+
+ if (tx_size == plane_tx_size) {
+ TOKEN_STATS tmp_token_stats; // per-block stats, merged into token_stats only under CONFIG_RD_DEBUG
+ init_token_stats(&tmp_token_stats);
+#if !CONFIG_PVQ
+ pack_mb_tokens(w, tp, tok_end, bit_depth, tx_size, &tmp_token_stats);
+#else
+ pack_pvq_tokens(w, x, xd, plane, bsize, tx_size);
+#endif
+#if CONFIG_RD_DEBUG
+ token_stats->txb_coeff_cost_map[blk_row][blk_col] = tmp_token_stats.cost;
+ token_stats->cost += tmp_token_stats.cost;
+#endif
+ } else {
+ const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
+ const int bsl = tx_size_wide_unit[sub_txs]; // sub-block width in tx units; used for both row and column offsets below
+ int i;
+
+ assert(bsl > 0);
+
+ for (i = 0; i < 4; ++i) { // the four quadrants, visited in raster order
+ const int offsetr = blk_row + (i >> 1) * bsl;
+ const int offsetc = blk_col + (i & 0x01) * bsl;
+ const int step = tx_size_wide_unit[sub_txs] * tx_size_high_unit[sub_txs];
+
+ if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue;
+
+ pack_txb_tokens(w, tp, tok_end,
+#if CONFIG_PVQ
+ x,
+#endif
+ xd, mbmi, plane, plane_bsize, bit_depth, block, offsetr,
+ offsetc, sub_txs, token_stats);
+ block += step; // block is only advanced and forwarded here; it is not read otherwise in this function
+ }
+ }
+ }
+#endif
+
+ // Writes segment_id to w, but only when segmentation is enabled and the
+ // segment map is being updated; otherwise nothing is written.
+ // The id is coded as a MAX_SEGMENTS-ary symbol (CONFIG_EC_MULTISYMBOL) or
+ // through av1_segment_tree.
+ static void write_segment_id(aom_writer *w, const struct segmentation *seg,
+                              struct segmentation_probs *segp, int segment_id) {
+   if (!seg->enabled || !seg->update_map) return;
+
+#if CONFIG_EC_MULTISYMBOL
+   aom_write_symbol(w, segment_id, segp->tree_cdf, MAX_SEGMENTS);
+#else
+   aom_write_tree(w, av1_segment_tree, segp->tree_probs, segment_id, 3, 0);
+#endif
+ }
+
+ /* Encodes the reference frame(s) of the current block (xd->mi[0]) into w.
+  *
+  * Nothing is written when the block's segment has SEG_LVL_REF_FRAME active.
+  * Otherwise the compound/single flag is written when cm->reference_mode is
+  * REFERENCE_MODE_SELECT (and, unless SUB8X8_COMP_REF, the block is at least
+  * BLOCK_8X8), followed by a sequence of binary decisions that identify
+  * ref_frame[0] (and ref_frame[1] for compound blocks under CONFIG_EXT_REFS).
+  */
+ static void write_ref_frames(const AV1_COMMON *cm, const MACROBLOCKD *xd,
+ aom_writer *w) {
+ const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+ const int is_compound = has_second_ref(mbmi);
+ const int segment_id = mbmi->segment_id;
+
+ // When the segment pins the reference frame (SEG_LVL_REF_FRAME active) nothing
+ // is written; the asserts only check that the block agrees with the segment.
+ if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) {
+ assert(!is_compound);
+ assert(mbmi->ref_frame[0] ==
+ get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME));
+ } else {
+ // does the feature use compound prediction or not
+ // (if not specified at the frame/segment level)
+ if (cm->reference_mode == REFERENCE_MODE_SELECT) {
+#if SUB8X8_COMP_REF
+ aom_write(w, is_compound, av1_get_reference_mode_prob(cm, xd));
+#else
+ if (mbmi->sb_type >= BLOCK_8X8)
+ aom_write(w, is_compound, av1_get_reference_mode_prob(cm, xd));
+#endif
+ } else {
+ assert((!is_compound) == (cm->reference_mode == SINGLE_REFERENCE));
+ }
+
+ if (is_compound) {
+#if CONFIG_EXT_REFS
+ const int bit = (mbmi->ref_frame[0] == GOLDEN_FRAME ||
+ mbmi->ref_frame[0] == LAST3_FRAME);
+ const int bit_bwd = mbmi->ref_frame[1] == ALTREF_FRAME;
+#else // CONFIG_EXT_REFS
+ const int bit = mbmi->ref_frame[0] == GOLDEN_FRAME;
+#endif // CONFIG_EXT_REFS
+
+ aom_write(w, bit, av1_get_pred_prob_comp_ref_p(cm, xd));
+
+#if CONFIG_EXT_REFS
+ if (!bit) {
+ const int bit1 = mbmi->ref_frame[0] == LAST_FRAME;
+ aom_write(w, bit1, av1_get_pred_prob_comp_ref_p1(cm, xd));
+ } else {
+ const int bit2 = mbmi->ref_frame[0] == GOLDEN_FRAME;
+ aom_write(w, bit2, av1_get_pred_prob_comp_ref_p2(cm, xd));
+ }
+ aom_write(w, bit_bwd, av1_get_pred_prob_comp_bwdref_p(cm, xd)); // second (backward) reference: ALTREF or not
+#endif // CONFIG_EXT_REFS
+ } else {
+#if CONFIG_EXT_REFS
+ const int bit0 = (mbmi->ref_frame[0] == ALTREF_FRAME ||
+ mbmi->ref_frame[0] == BWDREF_FRAME);
+ aom_write(w, bit0, av1_get_pred_prob_single_ref_p1(cm, xd));
+
+ if (bit0) {
+ const int bit1 = mbmi->ref_frame[0] == ALTREF_FRAME;
+ aom_write(w, bit1, av1_get_pred_prob_single_ref_p2(cm, xd));
+ } else {
+ const int bit2 = (mbmi->ref_frame[0] == LAST3_FRAME ||
+ mbmi->ref_frame[0] == GOLDEN_FRAME);
+ aom_write(w, bit2, av1_get_pred_prob_single_ref_p3(cm, xd));
+
+ if (!bit2) {
+ const int bit3 = mbmi->ref_frame[0] != LAST_FRAME;
+ aom_write(w, bit3, av1_get_pred_prob_single_ref_p4(cm, xd));
+ } else {
+ const int bit4 = mbmi->ref_frame[0] != LAST3_FRAME;
+ aom_write(w, bit4, av1_get_pred_prob_single_ref_p5(cm, xd));
+ }
+ }
+#else // CONFIG_EXT_REFS
+ const int bit0 = mbmi->ref_frame[0] != LAST_FRAME;
+ aom_write(w, bit0, av1_get_pred_prob_single_ref_p1(cm, xd));
+
+ if (bit0) {
+ const int bit1 = mbmi->ref_frame[0] != GOLDEN_FRAME;
+ aom_write(w, bit1, av1_get_pred_prob_single_ref_p2(cm, xd));
+ }
+#endif // CONFIG_EXT_REFS
+ }
+ }
+ }
+
+#if CONFIG_FILTER_INTRA
+ // Writes the filter-intra syntax for luma (index 0) and then chroma (index 1).
+ // For each of the two, the use flag is written only when the corresponding
+ // prediction mode is DC_PRED and (with CONFIG_PALETTE) no palette is in use
+ // for it; when the flag is set the filter mode follows as a uniform code over
+ // FILTER_INTRA_MODES values.
+ static void write_filter_intra_mode_info(const AV1_COMMON *const cm,
+                                          const MB_MODE_INFO *const mbmi,
+                                          aom_writer *w) {
+   for (int plane = 0; plane < 2; ++plane) {
+     const int pred_mode = plane ? mbmi->uv_mode : mbmi->mode;
+     if (pred_mode != DC_PRED) continue;
+#if CONFIG_PALETTE
+     if (mbmi->palette_mode_info.palette_size[plane] != 0) continue;
+#endif  // CONFIG_PALETTE
+
+     const int use_filter =
+         mbmi->filter_intra_mode_info.use_filter_intra_mode[plane];
+     aom_write(w, use_filter, cm->fc->filter_intra_probs[plane]);
+     if (use_filter) {
+       const FILTER_INTRA_MODE filter_mode =
+           mbmi->filter_intra_mode_info.filter_intra_mode[plane];
+       write_uniform(w, FILTER_INTRA_MODES, filter_mode);
+     }
+   }
+ }
+#endif // CONFIG_FILTER_INTRA
+
+#if CONFIG_EXT_INTRA
+ /* Writes the intra angle information of the current block (xd->mi[0]).
+  *
+  * Nothing is written for blocks smaller than BLOCK_8X8. For a directional
+  * luma mode the angle delta is written as a uniform code over
+  * 2 * MAX_ANGLE_DELTA + 1 values (offset by MAX_ANGLE_DELTA); with
+  * CONFIG_INTRA_INTERP the intra filter follows when
+  * av1_is_intra_filter_switchable() accepts the resulting angle. The chroma
+  * angle delta is written the same way when uv_mode is directional.
+  * ec_ctx is only read under CONFIG_INTRA_INTERP.
+  */ static void write_intra_angle_info(const MACROBLOCKD *xd,
+ FRAME_CONTEXT *const ec_ctx, aom_writer *w) {
+ const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+ const BLOCK_SIZE bsize = mbmi->sb_type;
+#if CONFIG_INTRA_INTERP
+ const int intra_filter_ctx = av1_get_pred_context_intra_interp(xd);
+ int p_angle;
+#endif // CONFIG_INTRA_INTERP
+
+ (void)ec_ctx; // silences the unused-parameter warning when CONFIG_INTRA_INTERP is off
+ if (bsize < BLOCK_8X8) return;
+
+ if (av1_is_directional_mode(mbmi->mode, bsize)) {
+ write_uniform(w, 2 * MAX_ANGLE_DELTA + 1,
+ MAX_ANGLE_DELTA + mbmi->angle_delta[0]);
+#if CONFIG_INTRA_INTERP
+ p_angle = mode_to_angle_map[mbmi->mode] + mbmi->angle_delta[0] * ANGLE_STEP;
+ if (av1_is_intra_filter_switchable(p_angle)) {
+#if CONFIG_EC_MULTISYMBOL
+ aom_write_symbol(w, mbmi->intra_filter,
+ ec_ctx->intra_filter_cdf[intra_filter_ctx],
+ INTRA_FILTERS);
+#else
+ av1_write_token(w, av1_intra_filter_tree,
+ ec_ctx->intra_filter_probs[intra_filter_ctx],
+ &intra_filter_encodings[mbmi->intra_filter]);
+#endif // CONFIG_EC_MULTISYMBOL
+ }
+#endif // CONFIG_INTRA_INTERP
+ }
+
+ if (av1_is_directional_mode(mbmi->uv_mode, bsize)) {
+ write_uniform(w, 2 * MAX_ANGLE_DELTA + 1,
+ MAX_ANGLE_DELTA + mbmi->angle_delta[1]);
+ }
+ }
+#endif // CONFIG_EXT_INTRA
+
+ /* Writes the interpolation filter choice of the current block (xd->mi[0]).
+  *
+  * When av1_is_interp_needed(xd) is false nothing is written and the asserts
+  * check that the block carries the frame filter (EIGHTTAP_REGULAR if the
+  * frame filter is SWITCHABLE). Otherwise a filter is written only when
+  * cm->interp_filter is SWITCHABLE: one per direction with a sub-pel MV
+  * component under CONFIG_DUAL_FILTER, a single one otherwise. Each written
+  * filter also increments cpi->interp_filter_selected[0][filter].
+  * The entropy context is xd->tile_ctx under CONFIG_EC_ADAPT, else cm->fc.
+  */ static void write_mb_interp_filter(AV1_COMP *cpi, const MACROBLOCKD *xd,
+ aom_writer *w) {
+ AV1_COMMON *const cm = &cpi->common;
+ const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+#if CONFIG_EC_ADAPT
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+#else
+ FRAME_CONTEXT *ec_ctx = cm->fc;
+#endif
+
+ if (!av1_is_interp_needed(xd)) {
+#if CONFIG_DUAL_FILTER
+ for (int i = 0; i < 4; ++i)
+ assert(mbmi->interp_filter[i] == (cm->interp_filter == SWITCHABLE
+ ? EIGHTTAP_REGULAR
+ : cm->interp_filter));
+#else
+ assert(mbmi->interp_filter == (cm->interp_filter == SWITCHABLE
+ ? EIGHTTAP_REGULAR
+ : cm->interp_filter));
+#endif // CONFIG_DUAL_FILTER
+ return;
+ }
+ if (cm->interp_filter == SWITCHABLE) {
+#if CONFIG_DUAL_FILTER
+ int dir;
+ for (dir = 0; dir < 2; ++dir) {
+ if (has_subpel_mv_component(xd->mi[0], xd, dir) ||
+ (mbmi->ref_frame[1] > INTRA_FRAME &&
+ has_subpel_mv_component(xd->mi[0], xd, dir + 2))) { // dir + 2 is tested only when a second inter reference exists
+ const int ctx = av1_get_pred_context_switchable_interp(xd, dir);
+#if CONFIG_EC_MULTISYMBOL
+ aom_write_symbol(w, av1_switchable_interp_ind[mbmi->interp_filter[dir]],
+ ec_ctx->switchable_interp_cdf[ctx],
+ SWITCHABLE_FILTERS);
+#else
+ av1_write_token(w, av1_switchable_interp_tree,
+ ec_ctx->switchable_interp_prob[ctx],
+ &switchable_interp_encodings[mbmi->interp_filter[dir]]);
+#endif
+ ++cpi->interp_filter_selected[0][mbmi->interp_filter[dir]];
+ } else {
+ assert(mbmi->interp_filter[dir] == EIGHTTAP_REGULAR); // an unsignalled direction must use the regular filter
+ }
+ }
+#else
+ {
+ const int ctx = av1_get_pred_context_switchable_interp(xd);
+#if CONFIG_EC_MULTISYMBOL
+ aom_write_symbol(w, av1_switchable_interp_ind[mbmi->interp_filter],
+ ec_ctx->switchable_interp_cdf[ctx], SWITCHABLE_FILTERS);
+#else
+ av1_write_token(w, av1_switchable_interp_tree,
+ ec_ctx->switchable_interp_prob[ctx],
+ &switchable_interp_encodings[mbmi->interp_filter]);
+#endif
+ ++cpi->interp_filter_selected[0][mbmi->interp_filter];
+ }
+#endif // CONFIG_DUAL_FILTER
+ }
+ }
+
+#if CONFIG_PALETTE
+#if CONFIG_PALETTE_DELTA_ENCODING
+ // Delta-codes the luma palette colors of pmi into w.
+ // Layout: 2 bits giving (delta width - min_bits), the first color as a
+ // bit_depth-bit literal, then for every further color the value
+ // (color - previous color - 1) using the current delta width. After each
+ // color the width is capped by av1_ceil_log2((1 << bit_depth) - color).
+ // The "- 1" relies on the luma palette being sorted with distinct entries.
+ static void write_palette_colors_y(const PALETTE_MODE_INFO *const pmi,
+                                    int bit_depth, aom_writer *w) {
+   const int num_colors = pmi->palette_size[0];
+   int min_bits;
+   int delta_bits = av1_get_palette_delta_bits_y(pmi, bit_depth, &min_bits);
+
+   aom_write_literal(w, delta_bits - min_bits, 2);
+   aom_write_literal(w, pmi->palette_colors[0], bit_depth);
+
+   for (int k = 1; k < num_colors; ++k) {
+     const int gap = pmi->palette_colors[k] - pmi->palette_colors[k - 1] - 1;
+     aom_write_literal(w, gap, delta_bits);
+     delta_bits = AOMMIN(
+         delta_bits, av1_ceil_log2((1 << bit_depth) - pmi->palette_colors[k]));
+   }
+ }
+
+ // Writes the chroma palette colors of pmi into w.
+ // U channel: always delta coded (2 bits of width info, first color as a
+ // literal, then color-to-color differences with a shrinking width).
+ // V channel: one bit selects delta coding (1) or raw literals (0), whichever
+ // has the lower estimated rate. A V delta is sent as magnitude plus sign bit,
+ // taking the shorter way around modulo (1 << bit_depth); a zero delta carries
+ // no sign bit.
+ static void write_palette_colors_uv(const PALETTE_MODE_INFO *const pmi,
+                                     int bit_depth, aom_writer *w) {
+   const int num_colors = pmi->palette_size[1];
+#if CONFIG_HIGHBITDEPTH
+   const uint16_t *const u_vals = pmi->palette_colors + PALETTE_MAX_SIZE;
+   const uint16_t *const v_vals = pmi->palette_colors + 2 * PALETTE_MAX_SIZE;
+#else
+   const uint8_t *const u_vals = pmi->palette_colors + PALETTE_MAX_SIZE;
+   const uint8_t *const v_vals = pmi->palette_colors + 2 * PALETTE_MAX_SIZE;
+#endif  // CONFIG_HIGHBITDEPTH
+
+   // U channel colors.
+   int u_min_bits = 0;
+   int u_bits = av1_get_palette_delta_bits_u(pmi, bit_depth, &u_min_bits);
+   aom_write_literal(w, u_bits - u_min_bits, 2);
+   aom_write_literal(w, u_vals[0], bit_depth);
+   for (int k = 1; k < num_colors; ++k) {
+     aom_write_literal(w, u_vals[k] - u_vals[k - 1], u_bits);
+     u_bits = AOMMIN(u_bits, av1_ceil_log2(1 + (1 << bit_depth) - u_vals[k]));
+   }
+
+   // V channel colors.
+   const int wrap = 1 << bit_depth;
+   int v_zero_deltas = 0;
+   int v_min_bits = 0;
+   const int v_bits = av1_get_palette_delta_bits_v(pmi, bit_depth,
+                                                   &v_zero_deltas, &v_min_bits);
+   const int delta_cost =
+       2 + bit_depth + (v_bits + 1) * (num_colors - 1) - v_zero_deltas;
+   const int raw_cost = bit_depth * num_colors;
+
+   if (!(delta_cost < raw_cost)) {
+     // Raw values are no more expensive: send them directly.
+     aom_write_bit(w, 0);
+     for (int k = 0; k < num_colors; ++k)
+       aom_write_literal(w, v_vals[k], bit_depth);
+     return;
+   }
+
+   // Delta encoding.
+   aom_write_bit(w, 1);
+   aom_write_literal(w, v_bits - v_min_bits, 2);
+   aom_write_literal(w, v_vals[0], bit_depth);
+   for (int k = 1; k < num_colors; ++k) {
+     const int step = (int)v_vals[k] - (int)v_vals[k - 1];
+     if (step == 0) {
+       // Equal neighbours: magnitude only, no sign bit.
+       aom_write_literal(w, 0, v_bits);
+       continue;
+     }
+     const int magnitude = abs(step);
+     const int negative = step < 0;
+     // Non-zero when going around the value range is the shorter path.
+     const int around = magnitude > wrap - magnitude;
+     aom_write_literal(w, around ? wrap - magnitude : magnitude, v_bits);
+     aom_write_bit(w, around ? !negative : negative);
+   }
+ }
+#endif // CONFIG_PALETTE_DELTA_ENCODING
+
+ // Writes the palette syntax of the block described by mi.
+ // Luma part (only when mode is DC_PRED): a palette-used flag whose context
+ // counts the above/left neighbours that use a luma palette, then, if used,
+ // the palette size, the colors and the first color index.
+ // Chroma part (only when uv_mode is DC_PRED): same layout, with the flag
+ // context derived from whether this block has a luma palette.
+ // The probability tables are indexed with (sb_type - BLOCK_8X8).
+ static void write_palette_mode_info(const AV1_COMMON *cm, const MACROBLOCKD *xd,
+                                     const MODE_INFO *const mi, aom_writer *w) {
+   const MB_MODE_INFO *const mbmi = &mi->mbmi;
+   const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+   const int bsize_ctx = mbmi->sb_type - BLOCK_8X8;
+
+   if (mbmi->mode == DC_PRED) {
+     const int y_colors = pmi->palette_size[0];
+     const MODE_INFO *const neighbors[2] = { xd->above_mi, xd->left_mi };
+     int y_mode_ctx = 0;
+     for (int k = 0; k < 2; ++k) {
+       if (neighbors[k] &&
+           neighbors[k]->mbmi.palette_mode_info.palette_size[0] > 0)
+         ++y_mode_ctx;
+     }
+
+     aom_write(w, y_colors > 0,
+               av1_default_palette_y_mode_prob[bsize_ctx][y_mode_ctx]);
+     if (y_colors > 0) {
+       av1_write_token(w, av1_palette_size_tree,
+                       av1_default_palette_y_size_prob[bsize_ctx],
+                       &palette_size_encodings[y_colors - PALETTE_MIN_SIZE]);
+#if CONFIG_PALETTE_DELTA_ENCODING
+       write_palette_colors_y(pmi, cm->bit_depth, w);
+#else
+       for (int c = 0; c < y_colors; ++c)
+         aom_write_literal(w, pmi->palette_colors[c], cm->bit_depth);
+#endif  // CONFIG_PALETTE_DELTA_ENCODING
+       write_uniform(w, y_colors, pmi->palette_first_color_idx[0]);
+     }
+   }
+
+   if (mbmi->uv_mode == DC_PRED) {
+     const int uv_colors = pmi->palette_size[1];
+     const int uv_mode_ctx = (pmi->palette_size[0] > 0);
+
+     aom_write(w, uv_colors > 0, av1_default_palette_uv_mode_prob[uv_mode_ctx]);
+     if (uv_colors > 0) {
+       av1_write_token(w, av1_palette_size_tree,
+                       av1_default_palette_uv_size_prob[bsize_ctx],
+                       &palette_size_encodings[uv_colors - PALETTE_MIN_SIZE]);
+#if CONFIG_PALETTE_DELTA_ENCODING
+       write_palette_colors_uv(pmi, cm->bit_depth, w);
+#else
+       // U and V values are interleaved color by color.
+       for (int c = 0; c < uv_colors; ++c) {
+         aom_write_literal(w, pmi->palette_colors[PALETTE_MAX_SIZE + c],
+                           cm->bit_depth);
+         aom_write_literal(w, pmi->palette_colors[2 * PALETTE_MAX_SIZE + c],
+                           cm->bit_depth);
+       }
+#endif  // CONFIG_PALETTE_DELTA_ENCODING
+       write_uniform(w, uv_colors, pmi->palette_first_color_idx[1]);
+     }
+   }
+ }
+#endif // CONFIG_PALETTE
+
+ /* Writes the transform type of the current block (xd->mi[0]) to w.
+  *
+  * The transform size considered is mbmi->tx_size, or mbmi->min_tx_size for
+  * inter blocks under CONFIG_VAR_TX. With CONFIG_TXK_SEL the type is looked up
+  * per block via get_tx_type() and nothing is written for plane > 0; otherwise
+  * mbmi->tx_type is used. Nothing is written when FIXED_TX_TYPE is set, when
+  * the effective qindex is 0, when the block is skipped (mbmi->skip or an
+  * active SEG_LVL_SKIP), or, under CONFIG_SUPERTX, when supertx_enabled is set.
+  * The entropy context is xd->tile_ctx under CONFIG_EC_ADAPT, else cm->fc.
+  */ void av1_write_tx_type(const AV1_COMMON *const cm, const MACROBLOCKD *xd,
+#if CONFIG_SUPERTX
+ const int supertx_enabled,
+#endif
+#if CONFIG_TXK_SEL
+ int block, int plane,
+#endif
+ aom_writer *w) {
+ MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+ const int is_inter = is_inter_block(mbmi);
+#if CONFIG_VAR_TX
+ const TX_SIZE tx_size = is_inter ? mbmi->min_tx_size : mbmi->tx_size;
+#else
+ const TX_SIZE tx_size = mbmi->tx_size;
+#endif // CONFIG_VAR_TX
+#if CONFIG_EC_ADAPT
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+#else
+ FRAME_CONTEXT *ec_ctx = cm->fc;
+#endif
+
+#if !CONFIG_TXK_SEL
+ TX_TYPE tx_type = mbmi->tx_type;
+#else
+ // Only y plane's tx_type is transmitted
+ if (plane > 0) return;
+ PLANE_TYPE plane_type = get_plane_type(plane);
+ TX_TYPE tx_type = get_tx_type(plane_type, xd, block, tx_size);
+#endif
+
+ if (!FIXED_TX_TYPE) {
+#if CONFIG_EXT_TX
+ const TX_SIZE square_tx_size = txsize_sqr_map[tx_size];
+ const BLOCK_SIZE bsize = mbmi->sb_type;
+ if (get_ext_tx_types(tx_size, bsize, is_inter, cm->reduced_tx_set_used) >
+ 1 &&
+ ((!cm->seg.enabled && cm->base_qindex > 0) ||
+ (cm->seg.enabled && xd->qindex[mbmi->segment_id] > 0)) &&
+ !mbmi->skip &&
+#if CONFIG_SUPERTX
+ !supertx_enabled &&
+#endif // CONFIG_SUPERTX
+ !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
+ const int eset =
+ get_ext_tx_set(tx_size, bsize, is_inter, cm->reduced_tx_set_used);
+ if (is_inter) {
+ assert(ext_tx_used_inter[eset][tx_type]);
+ if (eset > 0) { // set 0 writes nothing
+#if CONFIG_EC_MULTISYMBOL
+ aom_write_symbol(w, av1_ext_tx_inter_ind[eset][tx_type],
+ ec_ctx->inter_ext_tx_cdf[eset][square_tx_size],
+ ext_tx_cnt_inter[eset]);
+#else
+ av1_write_token(w, av1_ext_tx_inter_tree[eset],
+ ec_ctx->inter_ext_tx_prob[eset][square_tx_size],
+ &ext_tx_inter_encodings[eset][tx_type]);
+#endif
+ }
+ } else if (ALLOW_INTRA_EXT_TX) {
+ assert(ext_tx_used_intra[eset][tx_type]);
+ if (eset > 0) { // set 0 writes nothing
+#if CONFIG_EC_MULTISYMBOL
+ aom_write_symbol(
+ w, av1_ext_tx_intra_ind[eset][tx_type],
+ ec_ctx->intra_ext_tx_cdf[eset][square_tx_size][mbmi->mode],
+ ext_tx_cnt_intra[eset]);
+#else
+ av1_write_token(
+ w, av1_ext_tx_intra_tree[eset],
+ ec_ctx->intra_ext_tx_prob[eset][square_tx_size][mbmi->mode],
+ &ext_tx_intra_encodings[eset][tx_type]);
+#endif
+ }
+ }
+ }
+#else
+ if (tx_size < TX_32X32 &&
+ ((!cm->seg.enabled && cm->base_qindex > 0) ||
+ (cm->seg.enabled && xd->qindex[mbmi->segment_id] > 0)) &&
+ !mbmi->skip &&
+#if CONFIG_SUPERTX
+ !supertx_enabled &&
+#endif // CONFIG_SUPERTX
+ !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
+ if (is_inter) {
+#if CONFIG_EC_MULTISYMBOL
+ aom_write_symbol(w, av1_ext_tx_ind[tx_type],
+ ec_ctx->inter_ext_tx_cdf[tx_size], TX_TYPES);
+#else
+ av1_write_token(w, av1_ext_tx_tree, ec_ctx->inter_ext_tx_prob[tx_size],
+ &ext_tx_encodings[tx_type]);
+#endif
+ } else {
+#if CONFIG_EC_MULTISYMBOL
+ aom_write_symbol(
+ w, av1_ext_tx_ind[tx_type],
+ ec_ctx->intra_ext_tx_cdf[tx_size]
+ [intra_mode_to_tx_type_context[mbmi->mode]],
+ TX_TYPES);
+#else
+ av1_write_token(
+ w, av1_ext_tx_tree,
+ ec_ctx
+ ->intra_ext_tx_prob[tx_size]
+ [intra_mode_to_tx_type_context[mbmi->mode]],
+ &ext_tx_encodings[tx_type]);
+#endif
+ }
+ }
+#endif // CONFIG_EXT_TX
+ }
+ }
+
+ // Writes the luma intra prediction mode. The probability model is selected
+ // by the size group of bsize (size_group_lookup).
+ static void write_intra_mode(FRAME_CONTEXT *frame_ctx, BLOCK_SIZE bsize,
+                              PREDICTION_MODE mode, aom_writer *w) {
+   const int size_group = size_group_lookup[bsize];
+#if CONFIG_EC_MULTISYMBOL
+   aom_write_symbol(w, av1_intra_mode_ind[mode],
+                    frame_ctx->y_mode_cdf[size_group], INTRA_MODES);
+#else
+   av1_write_token(w, av1_intra_mode_tree, frame_ctx->y_mode_prob[size_group],
+                   &intra_mode_encodings[mode]);
+#endif
+ }
+
+ // Writes the chroma intra prediction mode. The probability model is selected
+ // by the luma mode (y_mode) of the same block.
+ static void write_intra_uv_mode(FRAME_CONTEXT *frame_ctx,
+                                 PREDICTION_MODE uv_mode, PREDICTION_MODE y_mode,
+                                 aom_writer *w) {
+#if CONFIG_EC_MULTISYMBOL
+   const int symbol = av1_intra_mode_ind[uv_mode];
+   aom_write_symbol(w, symbol, frame_ctx->uv_mode_cdf[y_mode], INTRA_MODES);
+#else
+   const struct av1_token *const code = &intra_mode_encodings[uv_mode];
+   av1_write_token(w, av1_intra_mode_tree, frame_ctx->uv_mode_prob[y_mode],
+                   code);
+#endif
+ }
+
+static void pack_inter_mode_mvs(AV1_COMP *cpi, const int mi_row,
+ const int mi_col,
+#if CONFIG_SUPERTX
+ int supertx_enabled,
+#endif
+ aom_writer *w) {
+ AV1_COMMON *const cm = &cpi->common;
+#if CONFIG_DELTA_Q || CONFIG_EC_ADAPT
+ MACROBLOCK *const x = &cpi->td.mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+#else
+ const MACROBLOCK *x = &cpi->td.mb;
+ const MACROBLOCKD *xd = &x->e_mbd;
+#endif
+#if CONFIG_EC_ADAPT
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+#else
+ FRAME_CONTEXT *ec_ctx = cm->fc;
+#endif
+#if !CONFIG_REF_MV
+ nmv_context *nmvc = &ec_ctx->nmvc;
+#endif
+ const MODE_INFO *mi = xd->mi[0];
+
+ const struct segmentation *const seg = &cm->seg;
+ struct segmentation_probs *const segp = &cm->fc->seg;
+ const MB_MODE_INFO *const mbmi = &mi->mbmi;
+ const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
+ const PREDICTION_MODE mode = mbmi->mode;
+ const int segment_id = mbmi->segment_id;
+ const BLOCK_SIZE bsize = mbmi->sb_type;
+ const int allow_hp = cm->allow_high_precision_mv;
+ const int is_inter = is_inter_block(mbmi);
+ const int is_compound = has_second_ref(mbmi);
+ int skip, ref;
+#if CONFIG_CB4X4
+ const int unify_bsize = 1;
+#else
+ const int unify_bsize = 0;
+#endif
+ (void)mi_row;
+ (void)mi_col;
+
+ if (seg->update_map) {
+ if (seg->temporal_update) {
+ const int pred_flag = mbmi->seg_id_predicted;
+ aom_prob pred_prob = av1_get_pred_prob_seg_id(segp, xd);
+ aom_write(w, pred_flag, pred_prob);
+ if (!pred_flag) write_segment_id(w, seg, segp, segment_id);
+ } else {
+ write_segment_id(w, seg, segp, segment_id);
+ }
+ }
+
+#if CONFIG_SUPERTX
+ if (supertx_enabled)
+ skip = mbmi->skip;
+ else
+ skip = write_skip(cm, xd, segment_id, mi, w);
+#else
+ skip = write_skip(cm, xd, segment_id, mi, w);
+#endif // CONFIG_SUPERTX
+#if CONFIG_DELTA_Q
+ if (cm->delta_q_present_flag) {
+ int super_block_upper_left =
+ ((mi_row & MAX_MIB_MASK) == 0) && ((mi_col & MAX_MIB_MASK) == 0);
+ if ((bsize != BLOCK_LARGEST || skip == 0) && super_block_upper_left) {
+ assert(mbmi->current_q_index > 0);
+ int reduced_delta_qindex =
+ (mbmi->current_q_index - xd->prev_qindex) / cm->delta_q_res;
+ write_delta_qindex(cm, xd, reduced_delta_qindex, w);
+ xd->prev_qindex = mbmi->current_q_index;
+#if CONFIG_EXT_DELTA_Q
+ if (cm->delta_lf_present_flag) {
+ int reduced_delta_lflevel =
+ (mbmi->current_delta_lf_from_base - xd->prev_delta_lf_from_base) /
+ cm->delta_lf_res;
+ write_delta_lflevel(cm, xd, reduced_delta_lflevel, w);
+ xd->prev_delta_lf_from_base = mbmi->current_delta_lf_from_base;
+ }
+#endif // CONFIG_EXT_DELTA_Q
+ }
+ }
+#endif
+
+#if CONFIG_SUPERTX
+ if (!supertx_enabled)
+#endif // CONFIG_SUPERTX
+ if (!segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME))
+ aom_write(w, is_inter, av1_get_intra_inter_prob(cm, xd));
+
+ if (cm->tx_mode == TX_MODE_SELECT &&
+#if CONFIG_CB4X4 && (CONFIG_VAR_TX || CONFIG_RECT_TX)
+#if CONFIG_RECT_TX
+ bsize > BLOCK_4X4 &&
+#else
+ (bsize >= BLOCK_8X8 || (bsize > BLOCK_4X4 && is_inter)) &&
+#endif // CONFIG_RECT_TX
+#else
+ bsize >= BLOCK_8X8 &&
+#endif
+#if CONFIG_SUPERTX
+ !supertx_enabled &&
+#endif // CONFIG_SUPERTX
+ !(is_inter && skip) && !xd->lossless[segment_id]) {
+#if CONFIG_VAR_TX
+ if (is_inter) { // This implies skip flag is 0.
+ const TX_SIZE max_tx_size = get_vartx_max_txsize(mbmi, bsize);
+ const int bh = tx_size_high_unit[max_tx_size];
+ const int bw = tx_size_wide_unit[max_tx_size];
+ const int width = block_size_wide[bsize] >> tx_size_wide_log2[0];
+ const int height = block_size_high[bsize] >> tx_size_wide_log2[0];
+ int idx, idy;
+ for (idy = 0; idy < height; idy += bh)
+ for (idx = 0; idx < width; idx += bw)
+ write_tx_size_vartx(cm, xd, mbmi, max_tx_size, height != width, idy,
+ idx, w);
+ } else {
+ set_txfm_ctxs(mbmi->tx_size, xd->n8_w, xd->n8_h, skip, xd);
+ write_selected_tx_size(cm, xd, w);
+ }
+ } else {
+ set_txfm_ctxs(mbmi->tx_size, xd->n8_w, xd->n8_h, skip, xd);
+#else
+ write_selected_tx_size(cm, xd, w);
+#endif
+ }
+
+ if (!is_inter) {
+ if (bsize >= BLOCK_8X8 || unify_bsize) {
+ write_intra_mode(ec_ctx, bsize, mode, w);
+ } else {
+ int idx, idy;
+ const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize];
+ const int num_4x4_h = num_4x4_blocks_high_lookup[bsize];
+ for (idy = 0; idy < 2; idy += num_4x4_h) {
+ for (idx = 0; idx < 2; idx += num_4x4_w) {
+ const PREDICTION_MODE b_mode = mi->bmi[idy * 2 + idx].as_mode;
+ write_intra_mode(ec_ctx, bsize, b_mode, w);
+ }
+ }
+ }
+#if CONFIG_CB4X4
+ if (is_chroma_reference(mi_row, mi_col, bsize, xd->plane[1].subsampling_x,
+ xd->plane[1].subsampling_y))
+ write_intra_uv_mode(ec_ctx, mbmi->uv_mode, mode, w);
+#else // !CONFIG_CB4X4
+ write_intra_uv_mode(ec_ctx, mbmi->uv_mode, mode, w);
+#endif // CONFIG_CB4X4
+
+#if CONFIG_EXT_INTRA
+ write_intra_angle_info(xd, ec_ctx, w);
+#endif // CONFIG_EXT_INTRA
+#if CONFIG_PALETTE
+ if (bsize >= BLOCK_8X8 && cm->allow_screen_content_tools)
+ write_palette_mode_info(cm, xd, mi, w);
+#endif // CONFIG_PALETTE
+#if CONFIG_FILTER_INTRA
+ if (bsize >= BLOCK_8X8 || unify_bsize)
+ write_filter_intra_mode_info(cm, mbmi, w);
+#endif // CONFIG_FILTER_INTRA
+ } else {
+ int16_t mode_ctx;
+ write_ref_frames(cm, xd, w);
+
+#if CONFIG_REF_MV
+#if CONFIG_EXT_INTER
+ if (is_compound)
+ mode_ctx = mbmi_ext->compound_mode_context[mbmi->ref_frame[0]];
+ else
+#endif // CONFIG_EXT_INTER
+ mode_ctx = av1_mode_context_analyzer(mbmi_ext->mode_context,
+ mbmi->ref_frame, bsize, -1);
+#else // CONFIG_REF_MV
+ mode_ctx = mbmi_ext->mode_context[mbmi->ref_frame[0]];
+#endif // CONFIG_REF_MV
+
+ // If segment skip is not enabled code the mode.
+ if (!segfeature_active(seg, segment_id, SEG_LVL_SKIP)) {
+ if (bsize >= BLOCK_8X8 || unify_bsize) {
+#if CONFIG_EXT_INTER
+ if (is_inter_compound_mode(mode))
+ write_inter_compound_mode(cm, w, mode, mode_ctx);
+ else if (is_inter_singleref_mode(mode))
+#endif // CONFIG_EXT_INTER
+ write_inter_mode(w, mode, ec_ctx, mode_ctx);
+
+#if CONFIG_REF_MV
+#if CONFIG_EXT_INTER
+ if (mode == NEWMV || mode == NEW_NEWMV ||
+ have_nearmv_in_inter_mode(mode))
+#else
+ if (mode == NEARMV || mode == NEWMV)
+#endif
+ write_drl_idx(cm, mbmi, mbmi_ext, w);
+ else
+ assert(mbmi->ref_mv_idx == 0);
+#endif
+ }
+ }
+
+#if !CONFIG_DUAL_FILTER && !CONFIG_WARPED_MOTION && !CONFIG_GLOBAL_MOTION
+ write_mb_interp_filter(cpi, xd, w);
+#endif // !CONFIG_DUAL_FILTER && !CONFIG_WARPED_MOTION && !CONFIG_GLOBAL_MOTION
+
+ if (bsize < BLOCK_8X8 && !unify_bsize) {
+ const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize];
+ const int num_4x4_h = num_4x4_blocks_high_lookup[bsize];
+ int idx, idy;
+ for (idy = 0; idy < 2; idy += num_4x4_h) {
+ for (idx = 0; idx < 2; idx += num_4x4_w) {
+ const int j = idy * 2 + idx;
+ const PREDICTION_MODE b_mode = mi->bmi[j].as_mode;
+#if CONFIG_REF_MV
+#if CONFIG_EXT_INTER
+ if (!is_compound)
+#endif // CONFIG_EXT_INTER
+ mode_ctx = av1_mode_context_analyzer(mbmi_ext->mode_context,
+ mbmi->ref_frame, bsize, j);
+#endif
+#if CONFIG_EXT_INTER
+ if (is_inter_compound_mode(b_mode))
+ write_inter_compound_mode(cm, w, b_mode, mode_ctx);
+ else if (is_inter_singleref_mode(b_mode))
+#endif // CONFIG_EXT_INTER
+ write_inter_mode(w, b_mode, ec_ctx, mode_ctx);
+
+#if CONFIG_EXT_INTER
+ if (b_mode == NEWMV || b_mode == NEW_NEWMV) {
+#else
+ if (b_mode == NEWMV) {
+#endif // CONFIG_EXT_INTER
+ for (ref = 0; ref < 1 + is_compound; ++ref) {
+#if CONFIG_REF_MV
+ int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame);
+ int nmv_ctx = av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type],
+ mbmi_ext->ref_mv_stack[rf_type], ref,
+ mbmi->ref_mv_idx);
+ nmv_context *nmvc = &ec_ctx->nmvc[nmv_ctx];
+#endif
+ av1_encode_mv(cpi, w, &mi->bmi[j].as_mv[ref].as_mv,
+#if CONFIG_EXT_INTER
+ &mi->bmi[j].ref_mv[ref].as_mv,
+#else
+#if CONFIG_REF_MV
+ &mi->bmi[j].pred_mv[ref].as_mv,
+#else
+ &mbmi_ext->ref_mvs[mbmi->ref_frame[ref]][0].as_mv,
+#endif // CONFIG_REF_MV
+#endif // CONFIG_EXT_INTER
+ nmvc, allow_hp);
+ }
+ }
+#if CONFIG_EXT_INTER
+ else if (b_mode == NEAREST_NEWMV || b_mode == NEAR_NEWMV) {
+#if CONFIG_REF_MV
+ int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame);
+ int nmv_ctx = av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type],
+ mbmi_ext->ref_mv_stack[rf_type], 1,
+ mbmi->ref_mv_idx);
+ nmv_context *nmvc = &ec_ctx->nmvc[nmv_ctx];
+#endif
+ av1_encode_mv(cpi, w, &mi->bmi[j].as_mv[1].as_mv,
+ &mi->bmi[j].ref_mv[1].as_mv, nmvc, allow_hp);
+ } else if (b_mode == NEW_NEARESTMV || b_mode == NEW_NEARMV) {
+#if CONFIG_REF_MV
+ int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame);
+ int nmv_ctx = av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type],
+ mbmi_ext->ref_mv_stack[rf_type], 0,
+ mbmi->ref_mv_idx);
+ nmv_context *nmvc = &ec_ctx->nmvc[nmv_ctx];
+#endif
+ av1_encode_mv(cpi, w, &mi->bmi[j].as_mv[0].as_mv,
+ &mi->bmi[j].ref_mv[0].as_mv, nmvc, allow_hp);
+ }
+#endif // CONFIG_EXT_INTER
+ }
+ }
+ } else {
+#if CONFIG_EXT_INTER
+ if (mode == NEWMV || mode == NEW_NEWMV) {
+#else
+ if (mode == NEWMV) {
+#endif // CONFIG_EXT_INTER
+ int_mv ref_mv;
+ for (ref = 0; ref < 1 + is_compound; ++ref) {
+#if CONFIG_REF_MV
+ int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame);
+ int nmv_ctx = av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type],
+ mbmi_ext->ref_mv_stack[rf_type], ref,
+ mbmi->ref_mv_idx);
+ nmv_context *nmvc = &ec_ctx->nmvc[nmv_ctx];
+#endif
+ ref_mv = mbmi_ext->ref_mvs[mbmi->ref_frame[ref]][0];
+ av1_encode_mv(cpi, w, &mbmi->mv[ref].as_mv, &ref_mv.as_mv, nmvc,
+ allow_hp);
+ }
+#if CONFIG_EXT_INTER
+ } else if (mode == NEAREST_NEWMV || mode == NEAR_NEWMV) {
+#if CONFIG_REF_MV
+ int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame);
+ int nmv_ctx =
+ av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type],
+ mbmi_ext->ref_mv_stack[rf_type], 1, mbmi->ref_mv_idx);
+ nmv_context *nmvc = &ec_ctx->nmvc[nmv_ctx];
+#endif
+ av1_encode_mv(cpi, w, &mbmi->mv[1].as_mv,
+ &mbmi_ext->ref_mvs[mbmi->ref_frame[1]][0].as_mv, nmvc,
+ allow_hp);
+ } else if (mode == NEW_NEARESTMV || mode == NEW_NEARMV) {
+#if CONFIG_REF_MV
+ int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame);
+ int nmv_ctx =
+ av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type],
+ mbmi_ext->ref_mv_stack[rf_type], 0, mbmi->ref_mv_idx);
+ nmv_context *nmvc = &ec_ctx->nmvc[nmv_ctx];
+#endif
+ av1_encode_mv(cpi, w, &mbmi->mv[0].as_mv,
+ &mbmi_ext->ref_mvs[mbmi->ref_frame[0]][0].as_mv, nmvc,
+ allow_hp);
+#endif // CONFIG_EXT_INTER
+ }
+ }
+
+#if CONFIG_EXT_INTER
+ if (cpi->common.reference_mode != COMPOUND_REFERENCE &&
+#if CONFIG_SUPERTX
+ !supertx_enabled &&
+#endif // CONFIG_SUPERTX
+ is_interintra_allowed(mbmi)) {
+ const int interintra = mbmi->ref_frame[1] == INTRA_FRAME;
+ const int bsize_group = size_group_lookup[bsize];
+ aom_write(w, interintra, cm->fc->interintra_prob[bsize_group]);
+ if (interintra) {
+ write_interintra_mode(w, mbmi->interintra_mode,
+ cm->fc->interintra_mode_prob[bsize_group]);
+ if (is_interintra_wedge_used(bsize)) {
+ aom_write(w, mbmi->use_wedge_interintra,
+ cm->fc->wedge_interintra_prob[bsize]);
+ if (mbmi->use_wedge_interintra) {
+ aom_write_literal(w, mbmi->interintra_wedge_index,
+ get_wedge_bits_lookup(bsize));
+ assert(mbmi->interintra_wedge_sign == 0);
+ }
+ }
+ }
+ }
+#endif // CONFIG_EXT_INTER
+
+#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+#if CONFIG_SUPERTX
+ if (!supertx_enabled)
+#endif // CONFIG_SUPERTX
+#if CONFIG_EXT_INTER
+ if (mbmi->ref_frame[1] != INTRA_FRAME)
+#endif // CONFIG_EXT_INTER
+ write_motion_mode(cm, mi, w);
+#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+
+#if CONFIG_EXT_INTER
+ if (cpi->common.reference_mode != SINGLE_REFERENCE &&
+ is_inter_compound_mode(mbmi->mode)
+#if CONFIG_MOTION_VAR
+ && mbmi->motion_mode == SIMPLE_TRANSLATION
+#endif // CONFIG_MOTION_VAR
+ && is_any_masked_compound_used(bsize)) {
+#if CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE
+ av1_write_token(w, av1_compound_type_tree,
+ cm->fc->compound_type_prob[bsize],
+ &compound_type_encodings[mbmi->interinter_compound_type]);
+#endif // CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE
+#if CONFIG_WEDGE
+ if (mbmi->interinter_compound_type == COMPOUND_WEDGE) {
+ aom_write_literal(w, mbmi->wedge_index, get_wedge_bits_lookup(bsize));
+ aom_write_bit(w, mbmi->wedge_sign);
+ }
+#endif // CONFIG_WEDGE
+#if CONFIG_COMPOUND_SEGMENT
+ if (mbmi->interinter_compound_type == COMPOUND_SEG) {
+ aom_write_literal(w, mbmi->mask_type, MAX_SEG_MASK_BITS);
+ }
+#endif // CONFIG_COMPOUND_SEGMENT
+ }
+#endif // CONFIG_EXT_INTER
+
+#if CONFIG_DUAL_FILTER || CONFIG_WARPED_MOTION || CONFIG_GLOBAL_MOTION
+ write_mb_interp_filter(cpi, xd, w);
+#endif // CONFIG_DUAL_FILTER || CONFIG_WARPED_MOTION || CONFIG_GLOBAL_MOTION
+ }
+
+#if !CONFIG_TXK_SEL
+ av1_write_tx_type(cm, xd,
+#if CONFIG_SUPERTX
+ supertx_enabled,
+#endif
+ w);
+#endif // !CONFIG_TXK_SEL
+}
+
+#if CONFIG_DELTA_Q /* write_mb_modes_kf(): writes the mode info of the block
+ * at xd->mi[0]; write_mbmi_b() calls it when frame_is_intra_only(cm).
+ * Fields are emitted in this order: segment id (only if seg->update_map),
+ * skip flag, delta-q / delta-lf (CONFIG_DELTA_Q), selected tx size, intrabc
+ * info (CONFIG_INTRABC), luma intra mode(s), chroma intra mode, then the
+ * optional angle / palette / filter-intra info and finally the tx type.
+ * xd is non-const in this variant because xd->prev_qindex (and
+ * xd->prev_delta_lf_from_base) are updated below. */
+static void write_mb_modes_kf(AV1_COMMON *cm, MACROBLOCKD *xd, const int mi_row,
+ const int mi_col, aom_writer *w) {
+ int skip;
+#else
+static void write_mb_modes_kf(AV1_COMMON *cm, const MACROBLOCKD *xd,
+ const int mi_row, const int mi_col,
+ aom_writer *w) {
+#endif
+ const struct segmentation *const seg = &cm->seg;
+ struct segmentation_probs *const segp = &cm->fc->seg;
+ const MODE_INFO *const mi = xd->mi[0];
+ const MODE_INFO *const above_mi = xd->above_mi;
+ const MODE_INFO *const left_mi = xd->left_mi;
+ const MB_MODE_INFO *const mbmi = &mi->mbmi;
+ const BLOCK_SIZE bsize = mbmi->sb_type;
+#if CONFIG_CB4X4
+ const int unify_bsize = 1; // sub-8x8 blocks take the same path as larger ones
+#else
+ const int unify_bsize = 0;
+#endif
+ (void)mi_row; // mi_row / mi_col are only read under some CONFIG_* flags
+ (void)mi_col;
+
+#if CONFIG_EC_ADAPT // context handed to the intra-mode writers below
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+#else
+ FRAME_CONTEXT *ec_ctx = cm->fc;
+#endif
+
+ if (seg->update_map) write_segment_id(w, seg, segp, mbmi->segment_id);
+
+#if CONFIG_DELTA_Q
+ skip = write_skip(cm, xd, mbmi->segment_id, mi, w);
+ if (cm->delta_q_present_flag) {
+ int super_block_upper_left = // true when mi_row and mi_col have no MAX_MIB_MASK bits set
+ ((mi_row & MAX_MIB_MASK) == 0) && ((mi_col & MAX_MIB_MASK) == 0);
+ if ((bsize != BLOCK_LARGEST || skip == 0) && super_block_upper_left) { // not sent for a skipped BLOCK_LARGEST block
+ assert(mbmi->current_q_index > 0);
+ int reduced_delta_qindex = // change since xd->prev_qindex, divided by cm->delta_q_res
+ (mbmi->current_q_index - xd->prev_qindex) / cm->delta_q_res;
+ write_delta_qindex(cm, xd, reduced_delta_qindex, w);
+ xd->prev_qindex = mbmi->current_q_index; // reference for the next delta
+#if CONFIG_EXT_DELTA_Q
+ if (cm->delta_lf_present_flag) {
+ int reduced_delta_lflevel = // same scheme as above, using cm->delta_lf_res
+ (mbmi->current_delta_lf_from_base - xd->prev_delta_lf_from_base) /
+ cm->delta_lf_res;
+ write_delta_lflevel(cm, xd, reduced_delta_lflevel, w);
+ xd->prev_delta_lf_from_base = mbmi->current_delta_lf_from_base;
+ }
+#endif // CONFIG_EXT_DELTA_Q
+ }
+ }
+#else
+ write_skip(cm, xd, mbmi->segment_id, mi, w);
+#endif
+
+ if (cm->tx_mode == TX_MODE_SELECT && // tx size is written only in TX_MODE_SELECT, above a minimum block size, for non-lossless segments
+#if CONFIG_CB4X4 && (CONFIG_VAR_TX || CONFIG_RECT_TX)
+#if CONFIG_RECT_TX
+ bsize > BLOCK_4X4 &&
+#else
+ bsize >= BLOCK_8X8 &&
+#endif // CONFIG_RECT_TX
+#else
+ bsize >= BLOCK_8X8 &&
+#endif
+ !xd->lossless[mbmi->segment_id])
+ write_selected_tx_size(cm, xd, w);
+
+#if CONFIG_INTRABC
+ if (bsize >= BLOCK_8X8 && cm->allow_screen_content_tools) {
+ int use_intrabc = is_intrabc_block(mbmi);
+ aom_write(w, use_intrabc, INTRABC_PROB);
+ if (use_intrabc) {
+ assert(mbmi->mode == DC_PRED);
+ assert(mbmi->uv_mode == DC_PRED);
+ int_mv dv_ref;
+ av1_find_ref_dv(&dv_ref, mi_row, mi_col);
+ av1_encode_dv(w, &mbmi->mv[0].as_mv, &dv_ref.as_mv, &ec_ctx->ndvc);
+ return; // nothing else is written for an intrabc block (this also skips av1_write_tx_type below)
+ }
+ }
+#endif // CONFIG_INTRABC
+
+ if (bsize >= BLOCK_8X8 || unify_bsize) {
+ write_intra_mode_kf(cm, ec_ctx, mi, above_mi, left_mi, 0, mbmi->mode, w);
+ } else {
+ const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize];
+ const int num_4x4_h = num_4x4_blocks_high_lookup[bsize];
+ int idx, idy;
+
+ for (idy = 0; idy < 2; idy += num_4x4_h) { // one luma mode per sub-block of the 2x2 grid; the step is the sub-block size
+ for (idx = 0; idx < 2; idx += num_4x4_w) {
+ const int block = idy * 2 + idx;
+ write_intra_mode_kf(cm, ec_ctx, mi, above_mi, left_mi, block,
+ mi->bmi[block].as_mode, w);
+ }
+ }
+ }
+
+#if CONFIG_CB4X4
+ if (is_chroma_reference(mi_row, mi_col, bsize, xd->plane[1].subsampling_x, // chroma mode is written only when this check passes
+ xd->plane[1].subsampling_y))
+ write_intra_uv_mode(ec_ctx, mbmi->uv_mode, mbmi->mode, w);
+#else // !CONFIG_CB4X4
+ write_intra_uv_mode(ec_ctx, mbmi->uv_mode, mbmi->mode, w);
+#endif // CONFIG_CB4X4
+
+#if CONFIG_EXT_INTRA
+ write_intra_angle_info(xd, ec_ctx, w);
+#endif // CONFIG_EXT_INTRA
+#if CONFIG_PALETTE
+ if (bsize >= BLOCK_8X8 && cm->allow_screen_content_tools)
+ write_palette_mode_info(cm, xd, mi, w);
+#endif // CONFIG_PALETTE
+#if CONFIG_FILTER_INTRA
+ if (bsize >= BLOCK_8X8 || unify_bsize)
+ write_filter_intra_mode_info(cm, mbmi, w);
+#endif // CONFIG_FILTER_INTRA
+
+#if !CONFIG_TXK_SEL
+ av1_write_tx_type(cm, xd,
+#if CONFIG_SUPERTX
+ 0, // supertx_enabled argument: always 0 on this path
+#endif
+ w);
+#endif // !CONFIG_TXK_SEL
+}
+
+#if CONFIG_SUPERTX // write_modes_b() takes supertx_enabled only in this configuration
+#define write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, \
+ mi_row, mi_col) \
+ write_modes_b(cpi, tile, w, tok, tok_end, supertx_enabled, mi_row, mi_col)
+#else // the wrapper drops supertx_enabled, so call sites need no #if of their own
+#define write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, \
+ mi_row, mi_col) \
+ write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col)
+#endif // CONFIG_SUPERTX
+
+#if CONFIG_RD_DEBUG
+// Debug helper: prints the position, block size, transform size and
+// prediction mode stored in *mi to stdout. Every line after the first is
+// prefixed with "&&". For blocks smaller than BLOCK_8X8 the mode of the
+// first sub-block (bmi[0]) is printed instead of mbmi.mode.
+static void dump_mode_info(MODE_INFO *mi) {
+  const MB_MODE_INFO *const mbmi = &mi->mbmi;
+
+  printf("\nmi->mbmi.mi_row == %d\n", mbmi->mi_row);
+  printf("&& mi->mbmi.mi_col == %d\n", mbmi->mi_col);
+  printf("&& mi->mbmi.sb_type == %d\n", mbmi->sb_type);
+  printf("&& mi->mbmi.tx_size == %d\n", mbmi->tx_size);
+
+  if (mbmi->sb_type < BLOCK_8X8) {
+    printf("&& mi->bmi[0].as_mode == %d\n", mi->bmi[0].as_mode);
+    return;
+  }
+  printf("&& mi->mbmi.mode == %d\n", mbmi->mode);
+}
+// Debug helper: compares the coefficient cost recorded in rd_stats for
+// `plane` with the cost accumulated in token_stats.
+// Returns 0 when they are equal. Otherwise prints both costs (and, with
+// CONFIG_VAR_TX, both txb_coeff_cost_map tables) to stdout and returns 1.
+static int rd_token_stats_mismatch(RD_STATS *rd_stats, TOKEN_STATS *token_stats,
+                                   int plane) {
+  // Costs agree: nothing to report.
+  if (rd_stats->txb_coeff_cost[plane] == token_stats->cost) return 0;
+
+  printf("\nplane %d rd_stats->txb_coeff_cost %d token_stats->cost %d\n",
+         plane, rd_stats->txb_coeff_cost[plane], token_stats->cost);
+#if CONFIG_VAR_TX
+  int row, col;
+
+  printf("rd txb_coeff_cost_map\n");
+  for (row = 0; row < TXB_COEFF_COST_MAP_SIZE; ++row) {
+    for (col = 0; col < TXB_COEFF_COST_MAP_SIZE; ++col)
+      printf("%d ", rd_stats->txb_coeff_cost_map[plane][row][col]);
+    printf("\n");
+  }
+
+  printf("pack txb_coeff_cost_map\n");
+  for (row = 0; row < TXB_COEFF_COST_MAP_SIZE; ++row) {
+    for (col = 0; col < TXB_COEFF_COST_MAP_SIZE; ++col)
+      printf("%d ", token_stats->txb_coeff_cost_map[row][col]);
+    printf("\n");
+  }
+#endif  // CONFIG_VAR_TX
+  return 1;
+}
+#endif
+
+// Positions xd and cpi->td.mb.mbmi_ext on the block at (mi_row, mi_col) and
+// writes that block's mode info to w: write_mb_modes_kf() when
+// frame_is_intra_only(cm), pack_inter_mode_mvs() otherwise.
+static void write_mbmi_b(AV1_COMP *cpi, const TileInfo *const tile,
+                         aom_writer *w,
+#if CONFIG_SUPERTX
+                         int supertx_enabled,
+#endif
+                         int mi_row, int mi_col) {
+  AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+
+  xd->mi = cm->mi_grid_visible + (mi_row * cm->mi_stride + mi_col);
+  MODE_INFO *const m = xd->mi[0];
+
+  assert(m->mbmi.sb_type <= cm->sb_size);
+
+  cpi->td.mb.mbmi_ext = cpi->mbmi_ext_base + (mi_row * cm->mi_cols + mi_col);
+
+  // Block height and width are passed in mi units.
+  set_mi_row_col(xd, tile, mi_row, mi_size_high[m->mbmi.sb_type], mi_col,
+                 mi_size_wide[m->mbmi.sb_type],
+#if CONFIG_DEPENDENT_HORZTILES
+                 cm->dependent_horz_tiles,
+#endif  // CONFIG_DEPENDENT_HORZTILES
+                 cm->mi_rows, cm->mi_cols);
+
+  if (frame_is_intra_only(cm)) {
+    write_mb_modes_kf(cm, xd, mi_row, mi_col, w);
+    return;
+  }
+
+#if CONFIG_VAR_TX
+  xd->above_txfm_context = cm->above_txfm_context + mi_col;
+  xd->left_txfm_context =
+      xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+#endif
+#if CONFIG_DUAL_FILTER
+  // has_subpel_mv_component needs the ref frame buffers set up to look
+  // up if they are scaled. has_subpel_mv_component is in turn needed by
+  // write_switchable_interp_filter, which is called by pack_inter_mode_mvs.
+  set_ref_ptrs(cm, xd, m->mbmi.ref_frame[0], m->mbmi.ref_frame[1]);
+#endif  // CONFIG_DUAL_FILTER
+#if 0
+  // NOTE(zoeliu): For debug
+  if (cm->current_video_frame == FRAME_TO_CHECK && cm->show_frame == 1) {
+    const PREDICTION_MODE mode = m->mbmi.mode;
+    const int segment_id = m->mbmi.segment_id;
+    const BLOCK_SIZE bsize = m->mbmi.sb_type;
+
+    // For sub8x8, simply dump out the first sub8x8 block info
+    const PREDICTION_MODE b_mode =
+        (bsize < BLOCK_8X8) ? m->bmi[0].as_mode : -1;
+    const int mv_x = (bsize < BLOCK_8X8) ?
+        m->bmi[0].as_mv[0].as_mv.row : m->mbmi.mv[0].as_mv.row;
+    const int mv_y = (bsize < BLOCK_8X8) ?
+        m->bmi[0].as_mv[0].as_mv.col : m->mbmi.mv[0].as_mv.col;
+
+    printf("Before pack_inter_mode_mvs(): "
+           "Frame=%d, (mi_row,mi_col)=(%d,%d), "
+           "mode=%d, segment_id=%d, bsize=%d, b_mode=%d, "
+           "mv[0]=(%d, %d), ref[0]=%d, ref[1]=%d\n",
+           cm->current_video_frame, mi_row, mi_col,
+           mode, segment_id, bsize, b_mode, mv_x, mv_y,
+           m->mbmi.ref_frame[0], m->mbmi.ref_frame[1]);
+  }
+#endif  // 0
+  pack_inter_mode_mvs(cpi, mi_row, mi_col,
+#if CONFIG_SUPERTX
+                      supertx_enabled,
+#endif
+                      w);
+}
+
+// Writes the coefficient tokens (and, with CONFIG_PALETTE, the palette
+// tokens) of the block at (mi_row, mi_col) to w, advancing *tok.
+//
+// The block's MODE_INFO is looked up in cm->mi_grid_visible at
+// (mi_row, mi_col) instead of through xd->mi[0]: xd->mi is only repositioned
+// further down, so reading it first would pick up whichever block xd was
+// left on by the previous call (this matters when the function is driven by
+// write_tokens_sb() after all modes of a superblock have been written).
+//
+// Side effects: xd->mi, cpi->td.mb.mbmi_ext and the position state set by
+// set_mi_row_col() are updated to describe this block.
+static void write_tokens_b(AV1_COMP *cpi, const TileInfo *const tile,
+                           aom_writer *w, const TOKENEXTRA **tok,
+                           const TOKENEXTRA *const tok_end, int mi_row,
+                           int mi_col) {
+  AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+  const int mi_offset = mi_row * cm->mi_stride + mi_col;
+  // Mode info of the block being written; do not go through xd->mi here.
+  MODE_INFO *const m = cm->mi_grid_visible[mi_offset];
+  MB_MODE_INFO *const mbmi = &m->mbmi;
+  int plane;
+  int bh, bw;
+#if CONFIG_PVQ || CONFIG_LV_MAP
+  MACROBLOCK *const x = &cpi->td.mb;
+  (void)tok;
+  (void)tok_end;
+#endif
+  xd->mi = cm->mi_grid_visible + mi_offset;
+
+  assert(mbmi->sb_type <= cm->sb_size);
+
+  bh = mi_size_high[mbmi->sb_type];
+  bw = mi_size_wide[mbmi->sb_type];
+  cpi->td.mb.mbmi_ext = cpi->mbmi_ext_base + (mi_row * cm->mi_cols + mi_col);
+
+  set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw,
+#if CONFIG_DEPENDENT_HORZTILES
+                 cm->dependent_horz_tiles,
+#endif  // CONFIG_DEPENDENT_HORZTILES
+                 cm->mi_rows, cm->mi_cols);
+
+#if CONFIG_PALETTE
+  // Palette tokens come first, for plane 0 and plane 1.
+  for (plane = 0; plane <= 1; ++plane) {
+    const uint8_t palette_size_plane =
+        mbmi->palette_mode_info.palette_size[plane];
+    if (palette_size_plane > 0) {
+#if CONFIG_INTRABC
+      assert(mbmi->use_intrabc == 0);
+#endif
+      int rows, cols;
+      assert(mbmi->sb_type >= BLOCK_8X8);
+      av1_get_block_dimensions(mbmi->sb_type, plane, xd, NULL, NULL, &rows,
+                               &cols);
+      assert(*tok < tok_end);
+      pack_palette_tokens(w, tok, palette_size_plane, rows * cols - 1);
+      assert(*tok < tok_end + mbmi->skip);
+    }
+  }
+#endif  // CONFIG_PALETTE
+
+#if CONFIG_COEF_INTERLEAVE
+  if (!mbmi->skip) {
+    const struct macroblockd_plane *const pd_y = &xd->plane[0];
+    const struct macroblockd_plane *const pd_c = &xd->plane[1];
+    const TX_SIZE tx_log2_y = mbmi->tx_size;
+    const TX_SIZE tx_log2_c = get_uv_tx_size(mbmi, pd_c);
+    const int tx_sz_y = (1 << tx_log2_y);
+    const int tx_sz_c = (1 << tx_log2_c);
+
+    const BLOCK_SIZE plane_bsize_y =
+        get_plane_block_size(AOMMAX(mbmi->sb_type, 3), pd_y);
+    const BLOCK_SIZE plane_bsize_c =
+        get_plane_block_size(AOMMAX(mbmi->sb_type, 3), pd_c);
+
+    const int num_4x4_w_y = num_4x4_blocks_wide_lookup[plane_bsize_y];
+    const int num_4x4_w_c = num_4x4_blocks_wide_lookup[plane_bsize_c];
+    const int num_4x4_h_y = num_4x4_blocks_high_lookup[plane_bsize_y];
+    const int num_4x4_h_c = num_4x4_blocks_high_lookup[plane_bsize_c];
+
+    const int max_4x4_w_y = get_max_4x4_size(num_4x4_w_y, xd->mb_to_right_edge,
+                                             pd_y->subsampling_x);
+    const int max_4x4_h_y = get_max_4x4_size(num_4x4_h_y, xd->mb_to_bottom_edge,
+                                             pd_y->subsampling_y);
+    const int max_4x4_w_c = get_max_4x4_size(num_4x4_w_c, xd->mb_to_right_edge,
+                                             pd_c->subsampling_x);
+    const int max_4x4_h_c = get_max_4x4_size(num_4x4_h_c, xd->mb_to_bottom_edge,
+                                             pd_c->subsampling_y);
+
+    // The max_4x4_w/h may be smaller than tx_sz in some corner cases,
+    // e.g. when the SB is split by tile boundaries.
+    const int tu_num_w_y = (max_4x4_w_y + tx_sz_y - 1) / tx_sz_y;
+    const int tu_num_h_y = (max_4x4_h_y + tx_sz_y - 1) / tx_sz_y;
+    const int tu_num_w_c = (max_4x4_w_c + tx_sz_c - 1) / tx_sz_c;
+    const int tu_num_h_c = (max_4x4_h_c + tx_sz_c - 1) / tx_sz_c;
+    const int tu_num_y = tu_num_w_y * tu_num_h_y;
+    const int tu_num_c = tu_num_w_c * tu_num_h_c;
+
+    int tu_idx_y = 0, tu_idx_c = 0;
+    TOKEN_STATS token_stats;
+    init_token_stats(&token_stats);
+
+    assert(*tok < tok_end);
+
+    // One luma pack, then (while chroma units remain) two chroma packs.
+    while (tu_idx_y < tu_num_y) {
+      pack_mb_tokens(w, tok, tok_end, cm->bit_depth, tx_log2_y, &token_stats);
+      assert(*tok < tok_end && (*tok)->token == EOSB_TOKEN);
+      (*tok)++;
+      tu_idx_y++;
+
+      if (tu_idx_c < tu_num_c) {
+        pack_mb_tokens(w, tok, tok_end, cm->bit_depth, tx_log2_c, &token_stats);
+        assert(*tok < tok_end && (*tok)->token == EOSB_TOKEN);
+        (*tok)++;
+
+        pack_mb_tokens(w, tok, tok_end, cm->bit_depth, tx_log2_c, &token_stats);
+        assert(*tok < tok_end && (*tok)->token == EOSB_TOKEN);
+        (*tok)++;
+
+        tu_idx_c++;
+      }
+    }
+
+    // In the 422 case, it's possible that chroma has more TUs than luma.
+    while (tu_idx_c < tu_num_c) {
+      pack_mb_tokens(w, tok, tok_end, cm->bit_depth, tx_log2_c, &token_stats);
+      assert(*tok < tok_end && (*tok)->token == EOSB_TOKEN);
+      (*tok)++;
+
+      pack_mb_tokens(w, tok, tok_end, cm->bit_depth, tx_log2_c, &token_stats);
+      assert(*tok < tok_end && (*tok)->token == EOSB_TOKEN);
+      (*tok)++;
+
+      tu_idx_c++;
+    }
+  }
+#else  // CONFIG_COEF_INTERLEAVE
+  if (!mbmi->skip) {
+#if !CONFIG_PVQ && !CONFIG_LV_MAP
+    assert(*tok < tok_end);
+#endif
+    for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+#if CONFIG_CB4X4
+      // This block carries nothing for the plane: step over one token.
+      if (!is_chroma_reference(mi_row, mi_col, mbmi->sb_type,
+                               xd->plane[plane].subsampling_x,
+                               xd->plane[plane].subsampling_y)) {
+        (*tok)++;
+        continue;
+      }
+#endif
+#if CONFIG_VAR_TX
+      const struct macroblockd_plane *const pd = &xd->plane[plane];
+      BLOCK_SIZE bsize = mbmi->sb_type;
+#if CONFIG_CB4X4
+#if CONFIG_CHROMA_2X2
+      const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
+#else
+      const BLOCK_SIZE plane_bsize =
+          AOMMAX(BLOCK_4X4, get_plane_block_size(bsize, pd));
+#endif
+#else
+      const BLOCK_SIZE plane_bsize =
+          get_plane_block_size(AOMMAX(bsize, BLOCK_8X8), pd);
+#endif
+
+      const int num_4x4_w =
+          block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
+      const int num_4x4_h =
+          block_size_high[plane_bsize] >> tx_size_wide_log2[0];
+      int row, col;
+      TOKEN_STATS token_stats;
+      init_token_stats(&token_stats);
+
+      if (is_inter_block(mbmi)) {
+        const TX_SIZE max_tx_size = get_vartx_max_txsize(mbmi, plane_bsize);
+        int block = 0;
+        const int step =
+            tx_size_wide_unit[max_tx_size] * tx_size_high_unit[max_tx_size];
+        const int bkw = tx_size_wide_unit[max_tx_size];
+        const int bkh = tx_size_high_unit[max_tx_size];
+        for (row = 0; row < num_4x4_h; row += bkh) {
+          for (col = 0; col < num_4x4_w; col += bkw) {
+            pack_txb_tokens(w, tok, tok_end,
+#if CONFIG_PVQ
+                            x,
+#endif
+                            xd, mbmi, plane, plane_bsize, cm->bit_depth, block,
+                            row, col, max_tx_size, &token_stats);
+            block += step;
+          }
+        }
+#if CONFIG_RD_DEBUG
+        if (mbmi->sb_type >= BLOCK_8X8 &&
+            rd_token_stats_mismatch(&mbmi->rd_stats, &token_stats, plane)) {
+          dump_mode_info(m);
+          assert(0);
+        }
+#endif  // CONFIG_RD_DEBUG
+      } else {
+        TX_SIZE tx = get_tx_size(plane, xd);
+#if CONFIG_CB4X4 && !CONFIG_CHROMA_2X2
+        tx = AOMMAX(TX_4X4, tx);
+#endif
+        const int bkw = tx_size_wide_unit[tx];
+        const int bkh = tx_size_high_unit[tx];
+        for (row = 0; row < num_4x4_h; row += bkh) {
+          for (col = 0; col < num_4x4_w; col += bkw) {
+#if !CONFIG_PVQ
+            pack_mb_tokens(w, tok, tok_end, cm->bit_depth, tx, &token_stats);
+#else
+            pack_pvq_tokens(w, x, xd, plane, bsize, tx);
+#endif
+          }
+        }
+      }
+#else
+      TX_SIZE tx = get_tx_size(plane, xd);
+      TOKEN_STATS token_stats;
+#if !CONFIG_PVQ
+      init_token_stats(&token_stats);
+#if CONFIG_LV_MAP
+      (void)tx;
+      av1_write_coeffs_mb(cm, x, w, plane);
+#else   // CONFIG_LV_MAP
+      pack_mb_tokens(w, tok, tok_end, cm->bit_depth, tx, &token_stats);
+#endif  // CONFIG_LV_MAP
+
+#else
+      (void)token_stats;
+      pack_pvq_tokens(w, x, xd, plane, mbmi->sb_type, tx);
+#endif
+#if CONFIG_RD_DEBUG
+      if (is_inter_block(mbmi) && mbmi->sb_type >= BLOCK_8X8 &&
+          rd_token_stats_mismatch(&mbmi->rd_stats, &token_stats, plane)) {
+        dump_mode_info(m);
+        assert(0);
+      }
+#endif  // CONFIG_RD_DEBUG
+#endif  // CONFIG_VAR_TX
+
+#if !CONFIG_PVQ && !CONFIG_LV_MAP
+      // Each plane's tokens are terminated by an EOSB_TOKEN; consume it.
+      assert(*tok < tok_end && (*tok)->token == EOSB_TOKEN);
+      (*tok)++;
+#endif
+    }
+  }
+#endif  // CONFIG_COEF_INTERLEAVE
+}
+
+#if CONFIG_MOTION_VAR && CONFIG_NCOBMC
+// Recursively walks the partition tree of the bsize block at
+// (mi_row, mi_col) and calls write_tokens_b() for its coded blocks, in the
+// order listed in each partition case. Returns immediately when
+// (mi_row, mi_col) lies outside the frame.
+static void write_tokens_sb(AV1_COMP *cpi, const TileInfo *const tile,
+                            aom_writer *w, const TOKENEXTRA **tok,
+                            const TOKENEXTRA *const tok_end, int mi_row,
+                            int mi_col, BLOCK_SIZE bsize) {
+  const AV1_COMMON *const cm = &cpi->common;
+#if CONFIG_CB4X4
+  const int unify_bsize = 1;
+#else
+  const int unify_bsize = 0;
+#endif
+
+  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
+
+  const PARTITION_TYPE partition = get_partition(cm, mi_row, mi_col, bsize);
+  const BLOCK_SIZE subsize = get_subsize(bsize, partition);
+  // Position of the second half of the block, in mi units.
+  const int half = mi_size_wide[bsize] / 2;
+  const int row_half = mi_row + half;
+  const int col_half = mi_col + half;
+
+  // Sub-8x8 partitions are written as a single block unless sizes are unified.
+  if (subsize < BLOCK_8X8 && !unify_bsize) {
+    write_tokens_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
+    return;
+  }
+
+  switch (partition) {
+    case PARTITION_NONE:
+      write_tokens_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
+      break;
+    case PARTITION_HORZ:
+      write_tokens_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
+      if (row_half < cm->mi_rows)
+        write_tokens_b(cpi, tile, w, tok, tok_end, row_half, mi_col);
+      break;
+    case PARTITION_VERT:
+      write_tokens_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
+      if (col_half < cm->mi_cols)
+        write_tokens_b(cpi, tile, w, tok, tok_end, mi_row, col_half);
+      break;
+    case PARTITION_SPLIT:
+      write_tokens_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col, subsize);
+      write_tokens_sb(cpi, tile, w, tok, tok_end, mi_row, col_half, subsize);
+      write_tokens_sb(cpi, tile, w, tok, tok_end, row_half, mi_col, subsize);
+      write_tokens_sb(cpi, tile, w, tok, tok_end, row_half, col_half, subsize);
+      break;
+#if CONFIG_EXT_PARTITION_TYPES
+    case PARTITION_HORZ_A:
+      write_tokens_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
+      write_tokens_b(cpi, tile, w, tok, tok_end, mi_row, col_half);
+      write_tokens_b(cpi, tile, w, tok, tok_end, row_half, mi_col);
+      break;
+    case PARTITION_HORZ_B:
+      write_tokens_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
+      write_tokens_b(cpi, tile, w, tok, tok_end, row_half, mi_col);
+      write_tokens_b(cpi, tile, w, tok, tok_end, row_half, col_half);
+      break;
+    case PARTITION_VERT_A:
+      write_tokens_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
+      write_tokens_b(cpi, tile, w, tok, tok_end, row_half, mi_col);
+      write_tokens_b(cpi, tile, w, tok, tok_end, mi_row, col_half);
+      break;
+    case PARTITION_VERT_B:
+      write_tokens_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
+      write_tokens_b(cpi, tile, w, tok, tok_end, mi_row, col_half);
+      write_tokens_b(cpi, tile, w, tok, tok_end, row_half, col_half);
+      break;
+#endif  // CONFIG_EXT_PARTITION_TYPES
+    default: assert(0);
+  }
+}
+#endif
+
+// Writes one block at (mi_row, mi_col): its mode info via write_mbmi_b(),
+// followed by its tokens via write_tokens_b().
+// The token call is compiled out when CONFIG_MOTION_VAR && CONFIG_NCOBMC,
+// and (for !CONFIG_PVQ && CONFIG_SUPERTX builds) skipped at run time when
+// supertx_enabled is set.
+static void write_modes_b(AV1_COMP *cpi, const TileInfo *const tile,
+                          aom_writer *w, const TOKENEXTRA **tok,
+                          const TOKENEXTRA *const tok_end,
+#if CONFIG_SUPERTX
+                          int supertx_enabled,
+#endif
+                          int mi_row, int mi_col) {
+  write_mbmi_b(cpi, tile, w,
+#if CONFIG_SUPERTX
+               supertx_enabled,
+#endif
+               mi_row, mi_col);
+
+#if !(CONFIG_MOTION_VAR && CONFIG_NCOBMC)
+#if !CONFIG_PVQ && CONFIG_SUPERTX
+  if (supertx_enabled) return;
+#endif
+  write_tokens_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
+#else
+  // Tokens are not written from here in this configuration.
+  (void)tok;
+  (void)tok_end;
+#endif
+}
+
+/* Writes partition type p for the bsize block at (mi_row, mi_col). hbs is
+ * the offset added to mi_row / mi_col to test whether the lower / right
+ * half of the block starts inside the frame (has_rows / has_cols).
+ *  - bsize < BLOCK_8X8: nothing is written.
+ *  - has_rows && has_cols: the full partition symbol/token is written.
+ *  - exactly one of them: a single "p == PARTITION_SPLIT" bit is written.
+ *  - neither: nothing is written and p must be PARTITION_SPLIT.
+ */ static void write_partition(const AV1_COMMON *const cm,
+ const MACROBLOCKD *const xd, int hbs, int mi_row,
+ int mi_col, PARTITION_TYPE p, BLOCK_SIZE bsize,
+ aom_writer *w) {
+ const int has_rows = (mi_row + hbs) < cm->mi_rows;
+ const int has_cols = (mi_col + hbs) < cm->mi_cols;
+ const int is_partition_point = bsize >= BLOCK_8X8; // smaller blocks carry no partition info
+ const int ctx = is_partition_point
+ ? partition_plane_context(xd, mi_row, mi_col,
+#if CONFIG_UNPOISON_PARTITION_CTX
+ has_rows, has_cols,
+#endif
+ bsize)
+ : 0;
+#if CONFIG_UNPOISON_PARTITION_CTX // NOTE(review): probs can be NULL here, yet probs[1] / probs[2] are read in the frame-edge branches below - confirm ctx < PARTITION_CONTEXTS holds there
+ const aom_prob *const probs =
+ ctx < PARTITION_CONTEXTS ? cm->fc->partition_prob[ctx] : NULL;
+#else
+ const aom_prob *const probs = cm->fc->partition_prob[ctx];
+#endif
+
+#if CONFIG_EC_ADAPT
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+ (void)cm;
+#elif CONFIG_EC_MULTISYMBOL
+ FRAME_CONTEXT *ec_ctx = cm->fc;
+#endif
+
+ if (!is_partition_point) return;
+
+ if (has_rows && has_cols) {
+#if CONFIG_EXT_PARTITION_TYPES
+ if (bsize <= BLOCK_8X8) // at 8x8 the symbol uses PARTITION_TYPES / av1_partition_tree rather than the extended set
+#if CONFIG_EC_MULTISYMBOL
+ aom_write_symbol(w, p, ec_ctx->partition_cdf[ctx], PARTITION_TYPES);
+#else
+ av1_write_token(w, av1_partition_tree, probs, &partition_encodings[p]);
+#endif
+ else
+#if CONFIG_EC_MULTISYMBOL
+ aom_write_symbol(w, p, ec_ctx->partition_cdf[ctx], EXT_PARTITION_TYPES);
+#else
+ av1_write_token(w, av1_ext_partition_tree, probs,
+ &ext_partition_encodings[p]);
+#endif // CONFIG_EC_MULTISYMBOL
+#else
+#if CONFIG_EC_MULTISYMBOL
+ aom_write_symbol(w, p, ec_ctx->partition_cdf[ctx], PARTITION_TYPES);
+#else
+ av1_write_token(w, av1_partition_tree, probs, &partition_encodings[p]);
+#endif
+#endif // CONFIG_EXT_PARTITION_TYPES
+ } else if (!has_rows && has_cols) { // lower half is outside the frame: only SPLIT or HORZ are valid
+ assert(p == PARTITION_SPLIT || p == PARTITION_HORZ);
+ aom_write(w, p == PARTITION_SPLIT, probs[1]);
+ } else if (has_rows && !has_cols) { // right half is outside the frame: only SPLIT or VERT are valid
+ assert(p == PARTITION_SPLIT || p == PARTITION_VERT);
+ aom_write(w, p == PARTITION_SPLIT, probs[2]);
+ } else { // both outside: SPLIT is the only valid value, so nothing is coded
+ assert(p == PARTITION_SPLIT);
+ }
+}
+
+#if CONFIG_SUPERTX // write_modes_sb() takes supertx_enabled only in this configuration
+#define write_modes_sb_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, \
+ mi_row, mi_col, bsize) \
+ write_modes_sb(cpi, tile, w, tok, tok_end, supertx_enabled, mi_row, mi_col, \
+ bsize)
+#else // the wrapper drops supertx_enabled, so call sites need no #if of their own
+#define write_modes_sb_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, \
+ mi_row, mi_col, bsize) \
+ write_modes_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col, bsize)
+#endif // CONFIG_SUPERTX
+
+// Recursively writes the bsize block at (mi_row, mi_col): its partition type,
+// then every block of that partition (write_modes_b) or, for
+// PARTITION_SPLIT, each of the four sub-blocks (write_modes_sb).
+// Afterwards the partition context is updated and, with CONFIG_CDEF, the
+// cdef strength is written at superblock level when the superblock is not
+// entirely skipped. Returns immediately when (mi_row, mi_col) lies outside
+// the frame.
+//
+// With CONFIG_SUPERTX the function may additionally signal supertx for this
+// block; when it does, the skip flag, the transform type and the tokens of
+// the whole block are written here, after the modes of its sub-blocks.
+static void write_modes_sb(AV1_COMP *const cpi, const TileInfo *const tile,
+                           aom_writer *const w, const TOKENEXTRA **tok,
+                           const TOKENEXTRA *const tok_end,
+#if CONFIG_SUPERTX
+                           int supertx_enabled,
+#endif
+                           int mi_row, int mi_col, BLOCK_SIZE bsize) {
+  const AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+  const int hbs = mi_size_wide[bsize] / 2;
+  const PARTITION_TYPE partition = get_partition(cm, mi_row, mi_col, bsize);
+  const BLOCK_SIZE subsize = get_subsize(bsize, partition);
+#if CONFIG_CB4X4
+  const int unify_bsize = 1;
+#else
+  const int unify_bsize = 0;
+#endif
+
+#if CONFIG_SUPERTX
+  const int mi_offset = mi_row * cm->mi_stride + mi_col;
+  MB_MODE_INFO *mbmi;
+  // Tokens are packed at this level only if supertx was not already enabled
+  // by a caller further up the recursion.
+  const int pack_token = !supertx_enabled;
+  TX_SIZE supertx_size;
+  int plane;
+#endif
+
+  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
+
+  write_partition(cm, xd, hbs, mi_row, mi_col, partition, bsize, w);
+#if CONFIG_SUPERTX
+  mbmi = &cm->mi_grid_visible[mi_offset]->mbmi;
+  xd->mi = cm->mi_grid_visible + mi_offset;
+  set_mi_row_col(xd, tile, mi_row, mi_size_high[bsize], mi_col,
+                 mi_size_wide[bsize],
+#if CONFIG_DEPENDENT_HORZTILES
+                 cm->dependent_horz_tiles,
+#endif  // CONFIG_DEPENDENT_HORZTILES
+                 cm->mi_rows, cm->mi_cols);
+  if (!supertx_enabled && !frame_is_intra_only(cm) &&
+      partition != PARTITION_NONE && bsize <= MAX_SUPERTX_BLOCK_SIZE &&
+      !xd->lossless[0]) {
+    aom_prob prob;
+    supertx_size = max_txsize_lookup[bsize];
+    prob = cm->fc->supertx_prob[partition_supertx_context_lookup[partition]]
+                               [supertx_size];
+    supertx_enabled = (xd->mi[0]->mbmi.tx_size == supertx_size);
+    aom_write(w, supertx_enabled, prob);
+  }
+#endif  // CONFIG_SUPERTX
+  if (subsize < BLOCK_8X8 && !unify_bsize) {
+    write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, mi_row,
+                          mi_col);
+  } else {
+    switch (partition) {
+      case PARTITION_NONE:
+        write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
+                              mi_row, mi_col);
+        break;
+      case PARTITION_HORZ:
+        write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
+                              mi_row, mi_col);
+        if (mi_row + hbs < cm->mi_rows)
+          write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
+                                mi_row + hbs, mi_col);
+        break;
+      case PARTITION_VERT:
+        write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
+                              mi_row, mi_col);
+        if (mi_col + hbs < cm->mi_cols)
+          write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
+                                mi_row, mi_col + hbs);
+        break;
+      case PARTITION_SPLIT:
+        write_modes_sb_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
+                               mi_row, mi_col, subsize);
+        write_modes_sb_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
+                               mi_row, mi_col + hbs, subsize);
+        write_modes_sb_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
+                               mi_row + hbs, mi_col, subsize);
+        write_modes_sb_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
+                               mi_row + hbs, mi_col + hbs, subsize);
+        break;
+#if CONFIG_EXT_PARTITION_TYPES
+      case PARTITION_HORZ_A:
+        write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
+                              mi_row, mi_col);
+        write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
+                              mi_row, mi_col + hbs);
+        write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
+                              mi_row + hbs, mi_col);
+        break;
+      case PARTITION_HORZ_B:
+        write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
+                              mi_row, mi_col);
+        write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
+                              mi_row + hbs, mi_col);
+        write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
+                              mi_row + hbs, mi_col + hbs);
+        break;
+      case PARTITION_VERT_A:
+        write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
+                              mi_row, mi_col);
+        write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
+                              mi_row + hbs, mi_col);
+        write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
+                              mi_row, mi_col + hbs);
+        break;
+      case PARTITION_VERT_B:
+        write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
+                              mi_row, mi_col);
+        write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
+                              mi_row, mi_col + hbs);
+        write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
+                              mi_row + hbs, mi_col + hbs);
+        break;
+#endif  // CONFIG_EXT_PARTITION_TYPES
+      default: assert(0);
+    }
+  }
+#if CONFIG_SUPERTX
+  if (partition != PARTITION_NONE && supertx_enabled && pack_token) {
+    int skip;
+    const int bsw = mi_size_wide[bsize];
+    const int bsh = mi_size_high[bsize];
+
+    // The calls above moved xd; point it back at this block.
+    xd->mi = cm->mi_grid_visible + mi_offset;
+    supertx_size = mbmi->tx_size;
+    set_mi_row_col(xd, tile, mi_row, bsh, mi_col, bsw,
+#if CONFIG_DEPENDENT_HORZTILES
+                   cm->dependent_horz_tiles,
+#endif  // CONFIG_DEPENDENT_HORZTILES
+                   cm->mi_rows, cm->mi_cols);
+
+    assert(IMPLIES(!cm->seg.enabled, mbmi->segment_id_supertx == 0));
+    assert(mbmi->segment_id_supertx < MAX_SEGMENTS);
+
+    skip = write_skip(cm, xd, mbmi->segment_id_supertx, xd->mi[0], w);
+#if CONFIG_EXT_TX
+    if (get_ext_tx_types(supertx_size, bsize, 1, cm->reduced_tx_set_used) > 1 &&
+        !skip) {
+      const int eset =
+          get_ext_tx_set(supertx_size, bsize, 1, cm->reduced_tx_set_used);
+      if (eset > 0) {
+#if CONFIG_EC_MULTISYMBOL
+#if CONFIG_EC_ADAPT
+        FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+#else
+        FRAME_CONTEXT *ec_ctx = cm->fc;
+#endif
+        aom_write_symbol(w, av1_ext_tx_inter_ind[eset][mbmi->tx_type],
+                         ec_ctx->inter_ext_tx_cdf[eset][supertx_size],
+                         ext_tx_cnt_inter[eset]);
+#else
+        av1_write_token(w, av1_ext_tx_inter_tree[eset],
+                        cm->fc->inter_ext_tx_prob[eset][supertx_size],
+                        &ext_tx_inter_encodings[eset][mbmi->tx_type]);
+#endif
+      }
+    }
+#else
+    if (supertx_size < TX_32X32 && !skip) {
+      av1_write_token(w, av1_ext_tx_tree,
+                      cm->fc->inter_ext_tx_prob[supertx_size],
+                      &ext_tx_encodings[mbmi->tx_type]);
+    }
+#endif  // CONFIG_EXT_TX
+
+    if (!skip) {
+      assert(*tok < tok_end);
+      for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+        const struct macroblockd_plane *const pd = &xd->plane[plane];
+        const int mbmi_txb_size = txsize_to_bsize[mbmi->tx_size];
+        const BLOCK_SIZE plane_bsize = get_plane_block_size(mbmi_txb_size, pd);
+
+        const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
+        const int max_blocks_high = max_block_high(xd, plane_bsize, plane);
+
+        int row, col;
+        const TX_SIZE tx = get_tx_size(plane, xd);
+
+        // Step by the transform size. tx_size_*_unit[] are indexed by
+        // TX_SIZE (as write_tokens_b() does), not by a BLOCK_SIZE.
+        const int stepr = tx_size_high_unit[tx];
+        const int stepc = tx_size_wide_unit[tx];
+
+        TOKEN_STATS token_stats;
+        token_stats.cost = 0;
+        for (row = 0; row < max_blocks_high; row += stepr)
+          for (col = 0; col < max_blocks_wide; col += stepc)
+            pack_mb_tokens(w, tok, tok_end, cm->bit_depth, tx, &token_stats);
+        assert(*tok < tok_end && (*tok)->token == EOSB_TOKEN);
+        (*tok)++;
+      }
+    }
+#if CONFIG_VAR_TX
+    xd->above_txfm_context = cm->above_txfm_context + mi_col;
+    xd->left_txfm_context =
+        xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+    set_txfm_ctxs(xd->mi[0]->mbmi.tx_size, bsw, bsh, skip, xd);
+#endif
+  }
+#endif  // CONFIG_SUPERTX
+
+// update partition context
+#if CONFIG_EXT_PARTITION_TYPES
+  update_ext_partition_context(xd, mi_row, mi_col, subsize, bsize, partition);
+#else
+  if (bsize >= BLOCK_8X8 &&
+      (bsize == BLOCK_8X8 || partition != PARTITION_SPLIT))
+    update_partition_context(xd, mi_row, mi_col, subsize, bsize);
+#endif  // CONFIG_EXT_PARTITION_TYPES
+
+#if CONFIG_CDEF
+#if CONFIG_EXT_PARTITION
+  if (cm->sb_size == BLOCK_128X128 && bsize == BLOCK_128X128 &&
+      !sb_all_skip(cm, mi_row, mi_col)) {
+    aom_write_literal(w, cm->mi_grid_visible[mi_row * cm->mi_stride + mi_col]
+                             ->mbmi.cdef_strength,
+                      cm->cdef_bits);
+  } else if (cm->sb_size == BLOCK_64X64 && bsize == BLOCK_64X64 &&
+#else
+  if (bsize == BLOCK_64X64 &&
+#endif  // CONFIG_EXT_PARTITION
+             !sb_all_skip(cm, mi_row, mi_col)) {
+    if (cm->cdef_bits != 0)
+      aom_write_literal(w, cm->mi_grid_visible[mi_row * cm->mi_stride + mi_col]
+                               ->mbmi.cdef_strength,
+                        cm->cdef_bits);
+  }
+#endif
+}
+
+ // Writes every superblock of |tile| to |w|. Superblocks are visited row by
+ // row, left to right, stepping cm->mib_size mode-info units at a time.
+ //   cpi      encoder instance (common state and macroblock context are used)
+ //   tile     bounds of the tile, in mode-info units
+ //   w        writer that receives the symbols
+ //   tok      cursor into the token list, forwarded to the superblock writer
+ //   tok_end  end of the token list
+ static void write_modes(AV1_COMP *const cpi, const TileInfo *const tile,
+                         aom_writer *const w, const TOKENEXTRA **tok,
+                         const TOKENEXTRA *const tok_end) {
+   AV1_COMMON *const cm = &cpi->common;
+   MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+   const int row_begin = tile->mi_row_start;
+   const int row_limit = tile->mi_row_end;
+   const int col_begin = tile->mi_col_start;
+   const int col_limit = tile->mi_col_end;
+   int mi_row, mi_col;
+   // Non-zero when the above context has to be cleared for this tile.
+   int reset_above = 1;
+
+ #if CONFIG_DEPENDENT_HORZTILES
+   // With dependent horizontal tiles the above context is kept, except on the
+   // first row (and, with tile groups, on a tile-group horizontal boundary).
+   reset_above = !cm->dependent_horz_tiles || row_begin == 0;
+ #if CONFIG_TILE_GROUPS
+   reset_above = reset_above || tile->tg_horz_boundary;
+ #endif
+ #endif
+   if (reset_above) av1_zero_above_context(cm, col_begin, col_limit);
+
+ #if CONFIG_PVQ
+   assert(cpi->td.mb.pvq_q->curr_pos == 0);
+ #endif
+ #if CONFIG_DELTA_Q
+   if (cm->delta_q_present_flag) {
+     xd->prev_qindex = cm->base_qindex;
+ #if CONFIG_EXT_DELTA_Q
+     if (cm->delta_lf_present_flag) xd->prev_delta_lf_from_base = 0;
+ #endif  // CONFIG_EXT_DELTA_Q
+   }
+ #endif
+
+   for (mi_row = row_begin; mi_row < row_limit; mi_row += cm->mib_size) {
+     // The left context is cleared at the start of every superblock row.
+     av1_zero_left_context(xd);
+
+     for (mi_col = col_begin; mi_col < col_limit; mi_col += cm->mib_size) {
+       write_modes_sb_wrapper(cpi, tile, w, tok, tok_end, 0, mi_row, mi_col,
+                              cm->sb_size);
+ #if CONFIG_MOTION_VAR && CONFIG_NCOBMC
+       write_tokens_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col, cm->sb_size);
+ #endif
+     }
+   }
+
+ #if CONFIG_PVQ
+   // The number of PVQ blocks encoded must match the number written to the
+   // bitstream.
+   assert(cpi->td.mb.pvq_q->curr_pos == cpi->td.mb.pvq_q->last_pos);
+   // Rewind curr_pos in case the bitstream gets repacked.
+   cpi->td.mb.pvq_q->curr_pos = 0;
+ #endif
+ }
+
+#if !CONFIG_LV_MAP
+#if !CONFIG_PVQ && !(CONFIG_EC_ADAPT && CONFIG_NEW_TOKENSET)
+ // Fills |coef_branch_ct| and |coef_probs| for |tx_size| from the coefficient
+ // and end-of-block counts held in |cpi|.
+ //   coef_branch_ct  output: per-node branch counts of the coefficient tree
+ //   coef_probs      output: binary probability of each of the first
+ //                   UNCONSTRAINED_NODES nodes
+ static void build_tree_distribution(AV1_COMP *cpi, TX_SIZE tx_size,
+                                     av1_coeff_stats *coef_branch_ct,
+                                     av1_coeff_probs_model *coef_probs) {
+   av1_coeff_count *const token_counts = cpi->td.rd_counts.coef_counts[tx_size];
+   unsigned int(*const eob_counts)[REF_TYPES][COEF_BANDS][COEFF_CONTEXTS] =
+       cpi->common.counts.eob_branch[tx_size];
+   int plane, ref, band, ctx, node;
+ #if CONFIG_RECT_TX
+   assert(!is_rect_tx(tx_size));
+ #endif  // CONFIG_RECT_TX
+
+   for (plane = 0; plane < PLANE_TYPES; ++plane) {
+     for (ref = 0; ref < REF_TYPES; ++ref) {
+       for (band = 0; band < COEF_BANDS; ++band) {
+         for (ctx = 0; ctx < BAND_COEFF_CONTEXTS(band); ++ctx) {
+           av1_tree_probs_from_distribution(
+               av1_coef_tree, coef_branch_ct[plane][ref][band][ctx],
+               token_counts[plane][ref][band][ctx]);
+           // The second branch of node 0 is replaced by the end-of-block count
+           // minus the first branch of that node.
+           coef_branch_ct[plane][ref][band][ctx][0][1] =
+               eob_counts[plane][ref][band][ctx] -
+               coef_branch_ct[plane][ref][band][ctx][0][0];
+           for (node = 0; node < UNCONSTRAINED_NODES; ++node) {
+             coef_probs[plane][ref][band][ctx][node] =
+                 get_binary_prob(coef_branch_ct[plane][ref][band][ctx][node][0],
+                                 coef_branch_ct[plane][ref][band][ctx][node][1]);
+           }
+         }
+       }
+     }
+   }
+ }
+
+#if !(CONFIG_EC_ADAPT && CONFIG_NEW_TOKENSET)
+ // Writes the coefficient-probability updates for one transform size.
+ //
+ // For every (plane type, ref type, band, context, node) entry, the candidate
+ // probability in |new_coef_probs| is compared against the current value in
+ // cpi->common.fc->coef_probs[tx_size] using the savings-search helpers. An
+ // entry is updated when the helper returns a positive value and the resulting
+ // probability differs from the old one; updated entries are written with
+ // av1_write_prob_diff_update() and stored back into the frame context.
+ //
+ //   bc               writer receiving the update flags and probabilities
+ //   cpi              encoder instance (speed features and frame context)
+ //   tx_size          transform size being processed (must not be rectangular)
+ //   frame_branch_ct  branch counts passed to the savings search
+ //   new_coef_probs   candidate probabilities for this frame
+ //
+ // The strategy is chosen by cpi->sf.use_fast_coef_updates:
+ //   TWO_LOOP          dry run first, then write only if worthwhile
+ //   ONE_LOOP_REDUCED  single pass, the leading "update present" bit is
+ //                     deferred until the first real update is found
+ static void update_coef_probs_common(aom_writer *const bc, AV1_COMP *cpi,
+ TX_SIZE tx_size,
+ av1_coeff_stats *frame_branch_ct,
+ av1_coeff_probs_model *new_coef_probs) {
+ av1_coeff_probs_model *old_coef_probs = cpi->common.fc->coef_probs[tx_size];
+ const aom_prob upd = DIFF_UPDATE_PROB;
+ // With CONFIG_EC_ADAPT one node fewer is considered for updates.
+ #if CONFIG_EC_ADAPT
+ const int entropy_nodes_update = UNCONSTRAINED_NODES - 1;
+ #else
+ const int entropy_nodes_update = UNCONSTRAINED_NODES;
+ #endif
+ int i, j, k, l, t;
+ int stepsize = cpi->sf.coeff_prob_appx_step;
+ // Weight handed to the savings search: the tile-group count when tile groups
+ // are enabled, otherwise 1.
+ #if CONFIG_TILE_GROUPS
+ const int probwt = cpi->common.num_tg;
+ #else
+ const int probwt = 1;
+ #endif
+ #if CONFIG_RECT_TX
+ assert(!is_rect_tx(tx_size));
+ #endif // CONFIG_RECT_TX
+
+ switch (cpi->sf.use_fast_coef_updates) {
+ case TWO_LOOP: {
+ /* dry run to see if there is any update at all needed */
+ int savings = 0;
+ // update[0] counts entries left alone, update[1] counts entries updated.
+ int update[2] = { 0, 0 };
+ for (i = 0; i < PLANE_TYPES; ++i) {
+ for (j = 0; j < REF_TYPES; ++j) {
+ for (k = 0; k < COEF_BANDS; ++k) {
+ for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) {
+ for (t = 0; t < entropy_nodes_update; ++t) {
+ aom_prob newp = new_coef_probs[i][j][k][l][t];
+ const aom_prob oldp = old_coef_probs[i][j][k][l][t];
+ int s;
+ int u = 0;
+ // The pivot node uses the model-based search, which is given the
+ // branch counts starting at node 0 and the approximation step.
+ if (t == PIVOT_NODE)
+ s = av1_prob_diff_update_savings_search_model(
+ frame_branch_ct[i][j][k][l][0], oldp, &newp, upd,
+ stepsize, probwt);
+ else
+ s = av1_prob_diff_update_savings_search(
+ frame_branch_ct[i][j][k][l][t], oldp, &newp, upd, probwt);
+
+ if (s > 0 && newp != oldp) u = 1;
+ // Every entry pays av1_cost_zero(upd); updated entries also gain s.
+ if (u)
+ savings += s - (int)(av1_cost_zero(upd));
+ else
+ savings -= (int)(av1_cost_zero(upd));
+ update[u]++;
+ }
+ }
+ }
+ }
+ }
+
+ /* Is coef updated at all */
+ if (update[1] == 0 || savings < 0) {
+ aom_write_bit(bc, 0);
+ return;
+ }
+ aom_write_bit(bc, 1);
+ // Second pass: repeat the search and this time write each decision.
+ for (i = 0; i < PLANE_TYPES; ++i) {
+ for (j = 0; j < REF_TYPES; ++j) {
+ for (k = 0; k < COEF_BANDS; ++k) {
+ for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) {
+ // calc probs and branch cts for this frame only
+ for (t = 0; t < entropy_nodes_update; ++t) {
+ aom_prob newp = new_coef_probs[i][j][k][l][t];
+ aom_prob *oldp = old_coef_probs[i][j][k][l] + t;
+ int s;
+ int u = 0;
+ if (t == PIVOT_NODE)
+ s = av1_prob_diff_update_savings_search_model(
+ frame_branch_ct[i][j][k][l][0], *oldp, &newp, upd,
+ stepsize, probwt);
+ else
+ s = av1_prob_diff_update_savings_search(
+ frame_branch_ct[i][j][k][l][t], *oldp, &newp, upd,
+ probwt);
+ if (s > 0 && newp != *oldp) u = 1;
+ aom_write(bc, u, upd);
+ if (u) {
+ /* send/use new probability */
+ av1_write_prob_diff_update(bc, newp, *oldp);
+ *oldp = newp;
+ }
+ }
+ }
+ }
+ }
+ }
+ return;
+ }
+
+ case ONE_LOOP_REDUCED: {
+ // updates: number of entries updated so far.
+ // noupdates_before_first: "no update" flags withheld until the first update.
+ int updates = 0;
+ int noupdates_before_first = 0;
+ for (i = 0; i < PLANE_TYPES; ++i) {
+ for (j = 0; j < REF_TYPES; ++j) {
+ for (k = 0; k < COEF_BANDS; ++k) {
+ for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) {
+ // calc probs and branch cts for this frame only
+ for (t = 0; t < entropy_nodes_update; ++t) {
+ aom_prob newp = new_coef_probs[i][j][k][l][t];
+ aom_prob *oldp = old_coef_probs[i][j][k][l] + t;
+ int s;
+ int u = 0;
+ if (t == PIVOT_NODE) {
+ s = av1_prob_diff_update_savings_search_model(
+ frame_branch_ct[i][j][k][l][0], *oldp, &newp, upd,
+ stepsize, probwt);
+ } else {
+ s = av1_prob_diff_update_savings_search(
+ frame_branch_ct[i][j][k][l][t], *oldp, &newp, upd,
+ probwt);
+ }
+
+ if (s > 0 && newp != *oldp) u = 1;
+ updates += u;
+ // Nothing has been written yet: just remember this "no update".
+ if (u == 0 && updates == 0) {
+ noupdates_before_first++;
+ continue;
+ }
+ if (u == 1 && updates == 1) {
+ int v;
+ // first update
+ aom_write_bit(bc, 1);
+ // Flush the withheld "no update" flags.
+ for (v = 0; v < noupdates_before_first; ++v)
+ aom_write(bc, 0, upd);
+ }
+ aom_write(bc, u, upd);
+ if (u) {
+ /* send/use new probability */
+ av1_write_prob_diff_update(bc, newp, *oldp);
+ *oldp = newp;
+ }
+ }
+ }
+ }
+ }
+ }
+ if (updates == 0) {
+ aom_write_bit(bc, 0); // no updates
+ }
+ return;
+ }
+ default: assert(0);
+ }
+ }
+#endif
+#if CONFIG_SUBFRAME_PROB_UPDATE
+ // Computes the end-of-block and token counts accumulated between subframe
+ // update |index| and the following one, and stores them in |eob_counts| and
+ // |coef_counts|. For the last update (index == coef_probs_update_idx) the
+ // interval ends at the counts currently held in |cpi|; otherwise it ends at
+ // the snapshot buffered for |index| + 1.
+ static void get_coef_counts_diff(
+     AV1_COMP *cpi, int index,
+     av1_coeff_count coef_counts[TX_SIZES][PLANE_TYPES],
+     unsigned int eob_counts[TX_SIZES][PLANE_TYPES][REF_TYPES][COEF_BANDS]
+                            [COEFF_CONTEXTS]) {
+   int plane, ref, band, ctx, token, tx_size, val;
+   const int max_idx = cpi->common.coef_probs_update_idx;
+   const TX_MODE tx_mode = cpi->common.tx_mode;
+   const int max_tx_size = tx_mode_to_biggest_tx_size[tx_mode];
+   const SUBFRAME_STATS *subframe_stats = &cpi->subframe_stats;
+   // Non-zero when |index| designates the most recent subframe update.
+   const int is_last = (index == max_idx);
+
+   assert(max_idx < COEF_PROBS_BUFS);
+
+   for (tx_size = 0; tx_size <= max_tx_size; ++tx_size) {
+     for (plane = 0; plane < PLANE_TYPES; ++plane) {
+       for (ref = 0; ref < REF_TYPES; ++ref) {
+         for (band = 0; band < COEF_BANDS; ++band) {
+           for (ctx = 0; ctx < BAND_COEFF_CONTEXTS(band); ++ctx) {
+             val =
+                 (is_last
+                      ? cpi->common.counts
+                            .eob_branch[tx_size][plane][ref][band][ctx]
+                      : subframe_stats->eob_counts_buf[index + 1][tx_size]
+                                                      [plane][ref][band][ctx]) -
+                 subframe_stats
+                     ->eob_counts_buf[index][tx_size][plane][ref][band][ctx];
+             assert(val >= 0);
+             eob_counts[tx_size][plane][ref][band][ctx] = val;
+
+             for (token = 0; token < ENTROPY_TOKENS; ++token) {
+               val =
+                   (is_last
+                        ? cpi->td.rd_counts.coef_counts[tx_size][plane][ref]
+                                                       [band][ctx][token]
+                        : subframe_stats
+                              ->coef_counts_buf[index + 1][tx_size][plane][ref]
+                                               [band][ctx][token]) -
+                   subframe_stats->coef_counts_buf[index][tx_size][plane][ref]
+                                                  [band][ctx][token];
+               assert(val >= 0);
+               coef_counts[tx_size][plane][ref][band][ctx][token] = val;
+             }
+           }
+         }
+       }
+     }
+   }
+ }
+
+ // Subframe variant of the coefficient-probability update for one transform
+ // size. It follows the same TWO_LOOP / ONE_LOOP_REDUCED strategies as the
+ // whole-frame version, but the search helpers receive one set of branch
+ // counts per subframe update (indices 0..coef_probs_update_idx).
+ //
+ //   bc              writer receiving the update flags and probabilities
+ //   cpi             encoder instance (speed features and frame context)
+ //   tx_size         transform size being processed
+ //   branch_ct       per-update branch counts, indexed [update][tx_size]...
+ //   new_coef_probs  candidate probabilities for this frame
+ //
+ // Updated probabilities are written with av1_write_prob_diff_update() and
+ // stored back into cpi->common.fc->coef_probs[tx_size].
+ static void update_coef_probs_subframe(
+ aom_writer *const bc, AV1_COMP *cpi, TX_SIZE tx_size,
+ av1_coeff_stats branch_ct[COEF_PROBS_BUFS][TX_SIZES][PLANE_TYPES],
+ av1_coeff_probs_model *new_coef_probs) {
+ av1_coeff_probs_model *old_coef_probs = cpi->common.fc->coef_probs[tx_size];
+ const aom_prob upd = DIFF_UPDATE_PROB;
+ const int entropy_nodes_update = UNCONSTRAINED_NODES;
+ int i, j, k, l, t;
+ int stepsize = cpi->sf.coeff_prob_appx_step;
+ const int max_idx = cpi->common.coef_probs_update_idx;
+ int idx;
+ // Branch counts of the current (i, j, k, l) entry, regrouped as
+ // [node][update][branch] before being handed to the search helpers.
+ unsigned int this_branch_ct[ENTROPY_NODES][COEF_PROBS_BUFS][2];
+
+ switch (cpi->sf.use_fast_coef_updates) {
+ case TWO_LOOP: {
+ /* dry run to see if there is any update at all needed */
+ int savings = 0;
+ // update[0] counts entries left alone, update[1] counts entries updated.
+ int update[2] = { 0, 0 };
+ for (i = 0; i < PLANE_TYPES; ++i) {
+ for (j = 0; j < REF_TYPES; ++j) {
+ for (k = 0; k < COEF_BANDS; ++k) {
+ for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) {
+ // Gather the counts of every node for each buffered update.
+ for (t = 0; t < ENTROPY_NODES; ++t) {
+ for (idx = 0; idx <= max_idx; ++idx) {
+ memcpy(this_branch_ct[t][idx],
+ branch_ct[idx][tx_size][i][j][k][l][t],
+ 2 * sizeof(this_branch_ct[t][idx][0]));
+ }
+ }
+ for (t = 0; t < entropy_nodes_update; ++t) {
+ aom_prob newp = new_coef_probs[i][j][k][l][t];
+ const aom_prob oldp = old_coef_probs[i][j][k][l][t];
+ int s, u = 0;
+
+ // The pivot node uses the model-based search, which is given the
+ // counts and old probabilities of all nodes.
+ if (t == PIVOT_NODE)
+ s = av1_prob_update_search_model_subframe(
+ this_branch_ct, old_coef_probs[i][j][k][l], &newp, upd,
+ stepsize, max_idx);
+ else
+ s = av1_prob_update_search_subframe(this_branch_ct[t], oldp,
+ &newp, upd, max_idx);
+ if (s > 0 && newp != oldp) u = 1;
+ // Every entry pays av1_cost_zero(upd); updated entries also gain s.
+ if (u)
+ savings += s - (int)(av1_cost_zero(upd));
+ else
+ savings -= (int)(av1_cost_zero(upd));
+ update[u]++;
+ }
+ }
+ }
+ }
+ }
+
+ /* Is coef updated at all */
+ if (update[1] == 0 || savings < 0) {
+ aom_write_bit(bc, 0);
+ return;
+ }
+ aom_write_bit(bc, 1);
+ // Second pass: repeat the search and this time write each decision.
+ for (i = 0; i < PLANE_TYPES; ++i) {
+ for (j = 0; j < REF_TYPES; ++j) {
+ for (k = 0; k < COEF_BANDS; ++k) {
+ for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) {
+ for (t = 0; t < ENTROPY_NODES; ++t) {
+ for (idx = 0; idx <= max_idx; ++idx) {
+ memcpy(this_branch_ct[t][idx],
+ branch_ct[idx][tx_size][i][j][k][l][t],
+ 2 * sizeof(this_branch_ct[t][idx][0]));
+ }
+ }
+ for (t = 0; t < entropy_nodes_update; ++t) {
+ aom_prob newp = new_coef_probs[i][j][k][l][t];
+ aom_prob *oldp = old_coef_probs[i][j][k][l] + t;
+ int s;
+ int u = 0;
+
+ if (t == PIVOT_NODE)
+ s = av1_prob_update_search_model_subframe(
+ this_branch_ct, old_coef_probs[i][j][k][l], &newp, upd,
+ stepsize, max_idx);
+ else
+ s = av1_prob_update_search_subframe(this_branch_ct[t], *oldp,
+ &newp, upd, max_idx);
+ if (s > 0 && newp != *oldp) u = 1;
+ aom_write(bc, u, upd);
+ if (u) {
+ /* send/use new probability */
+ av1_write_prob_diff_update(bc, newp, *oldp);
+ *oldp = newp;
+ }
+ }
+ }
+ }
+ }
+ }
+ return;
+ }
+
+ case ONE_LOOP_REDUCED: {
+ // updates: number of entries updated so far.
+ // noupdates_before_first: "no update" flags withheld until the first update.
+ int updates = 0;
+ int noupdates_before_first = 0;
+ for (i = 0; i < PLANE_TYPES; ++i) {
+ for (j = 0; j < REF_TYPES; ++j) {
+ for (k = 0; k < COEF_BANDS; ++k) {
+ for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) {
+ for (t = 0; t < ENTROPY_NODES; ++t) {
+ for (idx = 0; idx <= max_idx; ++idx) {
+ memcpy(this_branch_ct[t][idx],
+ branch_ct[idx][tx_size][i][j][k][l][t],
+ 2 * sizeof(this_branch_ct[t][idx][0]));
+ }
+ }
+ for (t = 0; t < entropy_nodes_update; ++t) {
+ aom_prob newp = new_coef_probs[i][j][k][l][t];
+ aom_prob *oldp = old_coef_probs[i][j][k][l] + t;
+ int s;
+ int u = 0;
+
+ if (t == PIVOT_NODE)
+ s = av1_prob_update_search_model_subframe(
+ this_branch_ct, old_coef_probs[i][j][k][l], &newp, upd,
+ stepsize, max_idx);
+ else
+ s = av1_prob_update_search_subframe(this_branch_ct[t], *oldp,
+ &newp, upd, max_idx);
+ if (s > 0 && newp != *oldp) u = 1;
+ updates += u;
+ // Nothing has been written yet: just remember this "no update".
+ if (u == 0 && updates == 0) {
+ noupdates_before_first++;
+ continue;
+ }
+ if (u == 1 && updates == 1) {
+ int v;
+ // first update
+ aom_write_bit(bc, 1);
+ // Flush the withheld "no update" flags.
+ for (v = 0; v < noupdates_before_first; ++v)
+ aom_write(bc, 0, upd);
+ }
+ aom_write(bc, u, upd);
+ if (u) {
+ /* send/use new probability */
+ av1_write_prob_diff_update(bc, newp, *oldp);
+ *oldp = newp;
+ }
+ }
+ }
+ }
+ }
+ }
+ if (updates == 0) {
+ aom_write_bit(bc, 0); // no updates
+ }
+ return;
+ }
+ default: assert(0);
+ }
+ }
+#endif // CONFIG_SUBFRAME_PROB_UPDATE
+
+#if !(CONFIG_EC_ADAPT && CONFIG_NEW_TOKENSET)
+ // Writes the coefficient-probability updates for every transform size from 0
+ // up to the largest one allowed by the frame's tx_mode.
+ //
+ // For each size, a single 0 bit is written (no update) when any of these
+ // holds: at most 20 blocks used that size, CONFIG_RD_DEBUG is set, or the
+ // size is TX_16X16 or larger while the tx-size search method is USE_TX_8X8.
+ // Otherwise the tree distribution is built and the updates are written by
+ // update_coef_probs_common() or, when subframe updates apply, by
+ // update_coef_probs_subframe().
+ //
+ //   cpi  encoder instance holding the counts and the frame context
+ //   w    writer receiving the update bits
+ static void update_coef_probs(AV1_COMP *cpi, aom_writer *w) {
+ const TX_MODE tx_mode = cpi->common.tx_mode;
+ const TX_SIZE max_tx_size = tx_mode_to_biggest_tx_size[tx_mode];
+ TX_SIZE tx_size;
+ #if CONFIG_SUBFRAME_PROB_UPDATE
+ AV1_COMMON *cm = &cpi->common;
+ SUBFRAME_STATS *subframe_stats = &cpi->subframe_stats;
+ int i;
+ // Scratch output for build_tree_distribution() calls whose probabilities
+ // are not used.
+ av1_coeff_probs_model dummy_frame_coef_probs[PLANE_TYPES];
+
+ // When subframe updates are used with backward adaptation, start again from
+ // the probabilities saved in enc_starting_coef_probs and compute the counts
+ // of each update interval.
+ if (cm->do_subframe_update &&
+ cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) {
+ av1_copy(cpi->common.fc->coef_probs,
+ subframe_stats->enc_starting_coef_probs);
+ for (i = 0; i <= cpi->common.coef_probs_update_idx; ++i) {
+ get_coef_counts_diff(cpi, i, cpi->wholeframe_stats.coef_counts_buf[i],
+ cpi->wholeframe_stats.eob_counts_buf[i]);
+ }
+ }
+ #endif // CONFIG_SUBFRAME_PROB_UPDATE
+
+ for (tx_size = 0; tx_size <= max_tx_size; ++tx_size) {
+ av1_coeff_stats frame_branch_ct[PLANE_TYPES];
+ av1_coeff_probs_model frame_coef_probs[PLANE_TYPES];
+ if (cpi->td.counts->tx_size_totals[tx_size] <= 20 || CONFIG_RD_DEBUG ||
+ (tx_size >= TX_16X16 && cpi->sf.tx_size_search_method == USE_TX_8X8)) {
+ aom_write_bit(w, 0);
+ } else {
+ #if CONFIG_SUBFRAME_PROB_UPDATE
+ if (cm->do_subframe_update &&
+ cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) {
+ // build_tree_distribution() reads the counts stored in cpi, so the
+ // per-interval counts are swapped in one at a time and the whole-frame
+ // counts are restored afterwards.
+ unsigned int this_eob_counts_copy[PLANE_TYPES][REF_TYPES][COEF_BANDS]
+ [COEFF_CONTEXTS];
+ av1_coeff_count coef_counts_copy[PLANE_TYPES];
+ av1_copy(this_eob_counts_copy, cpi->common.counts.eob_branch[tx_size]);
+ av1_copy(coef_counts_copy, cpi->td.rd_counts.coef_counts[tx_size]);
+ build_tree_distribution(cpi, tx_size, frame_branch_ct,
+ frame_coef_probs);
+ for (i = 0; i <= cpi->common.coef_probs_update_idx; ++i) {
+ av1_copy(cpi->common.counts.eob_branch[tx_size],
+ cpi->wholeframe_stats.eob_counts_buf[i][tx_size]);
+ av1_copy(cpi->td.rd_counts.coef_counts[tx_size],
+ cpi->wholeframe_stats.coef_counts_buf[i][tx_size]);
+ build_tree_distribution(cpi, tx_size, cpi->branch_ct_buf[i][tx_size],
+ dummy_frame_coef_probs);
+ }
+ av1_copy(cpi->common.counts.eob_branch[tx_size], this_eob_counts_copy);
+ av1_copy(cpi->td.rd_counts.coef_counts[tx_size], coef_counts_copy);
+
+ update_coef_probs_subframe(w, cpi, tx_size, cpi->branch_ct_buf,
+ frame_coef_probs);
+ } else {
+ #endif // CONFIG_SUBFRAME_PROB_UPDATE
+ build_tree_distribution(cpi, tx_size, frame_branch_ct,
+ frame_coef_probs);
+ update_coef_probs_common(w, cpi, tx_size, frame_branch_ct,
+ frame_coef_probs);
+ #if CONFIG_SUBFRAME_PROB_UPDATE
+ }
+ #endif // CONFIG_SUBFRAME_PROB_UPDATE
+ }
+ }
+
+ #if CONFIG_SUBFRAME_PROB_UPDATE
+ // Record the probabilities in effect after the updates above, both as the
+ // frame's starting point and as snapshot 0.
+ av1_copy(cm->starting_coef_probs, cm->fc->coef_probs);
+ av1_copy(subframe_stats->coef_probs_buf[0], cm->fc->coef_probs);
+ if (cm->do_subframe_update &&
+ cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) {
+ // Build snapshots 1..coef_probs_update_idx by running the partial
+ // adaptation on the buffered counts, then restore the frame context and
+ // the end-of-block counts.
+ unsigned int eob_counts_copy[TX_SIZES][PLANE_TYPES][REF_TYPES][COEF_BANDS]
+ [COEFF_CONTEXTS];
+ av1_copy(eob_counts_copy, cm->counts.eob_branch);
+ for (i = 1; i <= cpi->common.coef_probs_update_idx; ++i) {
+ for (tx_size = 0; tx_size <= max_tx_size; ++tx_size)
+ av1_full_to_model_counts(cm->counts.coef[tx_size],
+ subframe_stats->coef_counts_buf[i][tx_size]);
+ av1_copy(cm->counts.eob_branch, subframe_stats->eob_counts_buf[i]);
+ av1_partial_adapt_probs(cm, 0, 0);
+ av1_copy(subframe_stats->coef_probs_buf[i], cm->fc->coef_probs);
+ }
+ av1_copy(cm->fc->coef_probs, subframe_stats->coef_probs_buf[0]);
+ av1_copy(cm->counts.eob_branch, eob_counts_copy);
+ }
+ #endif // CONFIG_SUBFRAME_PROB_UPDATE
+ }
+#endif // !(CONFIG_EC_ADAPT && CONFIG_NEW_TOKENSET)
+ #endif // !CONFIG_PVQ && !(CONFIG_EC_ADAPT && CONFIG_NEW_TOKENSET)
+#endif // !CONFIG_LV_MAP
+
+#if CONFIG_LOOP_RESTORATION
+ // Writes the frame-level loop-restoration types to the bit buffer |wb|.
+ //   plane 0:    two bits  (NONE 0,0 / WIENER 1,0 / SGRPROJ 1,1 /
+ //               SWITCHABLE 0,1)
+ //   planes 1+:  one bit   (NONE 0 / WIENER 1)
+ // If any of planes 0..2 uses restoration, the tile size of plane 0 follows,
+ // coded with one or two bits.
+ static void encode_restoration_mode(AV1_COMMON *cm,
+                                     struct aom_write_bit_buffer *wb) {
+   int plane;
+   int first_bit = 0;
+   int second_bit = 0;
+   int type_is_valid = 1;
+   const RestorationInfo *rsi = &cm->rst_info[0];
+
+   switch (rsi->frame_restoration_type) {
+     case RESTORE_NONE: break;
+     case RESTORE_WIENER: first_bit = 1; break;
+     case RESTORE_SGRPROJ:
+       first_bit = 1;
+       second_bit = 1;
+       break;
+     case RESTORE_SWITCHABLE: second_bit = 1; break;
+     default:
+       type_is_valid = 0;
+       assert(0);
+   }
+   // Nothing is written for an unexpected type (reachable only without asserts).
+   if (type_is_valid) {
+     aom_wb_write_bit(wb, first_bit);
+     aom_wb_write_bit(wb, second_bit);
+   }
+
+   for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
+     rsi = &cm->rst_info[plane];
+     if (rsi->frame_restoration_type == RESTORE_NONE) {
+       aom_wb_write_bit(wb, 0);
+     } else if (rsi->frame_restoration_type == RESTORE_WIENER) {
+       aom_wb_write_bit(wb, 1);
+     } else {
+       assert(0);
+     }
+   }
+
+   // No tile size is sent when restoration is off for planes 0, 1 and 2.
+   if (cm->rst_info[0].frame_restoration_type == RESTORE_NONE &&
+       cm->rst_info[1].frame_restoration_type == RESTORE_NONE &&
+       cm->rst_info[2].frame_restoration_type == RESTORE_NONE) {
+     return;
+   }
+
+   rsi = &cm->rst_info[0];
+   {
+     const int below_max =
+         rsi->restoration_tilesize != RESTORATION_TILESIZE_MAX;
+     aom_wb_write_bit(wb, below_max);
+     if (below_max) {
+       aom_wb_write_bit(
+           wb, rsi->restoration_tilesize != (RESTORATION_TILESIZE_MAX >> 1));
+     }
+   }
+ }
+
+ // Writes the first three vertical taps, then the first three horizontal taps,
+ // of |wiener_info|. Each tap is coded relative to the same tap of
+ // |ref_wiener_info|, both offset by the tap's minimum value. Afterwards
+ // |wiener_info| is copied into |ref_wiener_info| so it becomes the reference
+ // for the next call.
+ static void write_wiener_filter(WienerInfo *wiener_info,
+                                 WienerInfo *ref_wiener_info, aom_writer *wb) {
+   // Size of the [MINV, MAXV] range of each tap position.
+   const int tap0_range = WIENER_FILT_TAP0_MAXV - WIENER_FILT_TAP0_MINV + 1;
+   const int tap1_range = WIENER_FILT_TAP1_MAXV - WIENER_FILT_TAP1_MINV + 1;
+   const int tap2_range = WIENER_FILT_TAP2_MAXV - WIENER_FILT_TAP2_MINV + 1;
+
+   // Vertical filter.
+   aom_write_primitive_refsubexpfin(
+       wb, tap0_range, WIENER_FILT_TAP0_SUBEXP_K,
+       ref_wiener_info->vfilter[0] - WIENER_FILT_TAP0_MINV,
+       wiener_info->vfilter[0] - WIENER_FILT_TAP0_MINV);
+   aom_write_primitive_refsubexpfin(
+       wb, tap1_range, WIENER_FILT_TAP1_SUBEXP_K,
+       ref_wiener_info->vfilter[1] - WIENER_FILT_TAP1_MINV,
+       wiener_info->vfilter[1] - WIENER_FILT_TAP1_MINV);
+   aom_write_primitive_refsubexpfin(
+       wb, tap2_range, WIENER_FILT_TAP2_SUBEXP_K,
+       ref_wiener_info->vfilter[2] - WIENER_FILT_TAP2_MINV,
+       wiener_info->vfilter[2] - WIENER_FILT_TAP2_MINV);
+
+   // Horizontal filter.
+   aom_write_primitive_refsubexpfin(
+       wb, tap0_range, WIENER_FILT_TAP0_SUBEXP_K,
+       ref_wiener_info->hfilter[0] - WIENER_FILT_TAP0_MINV,
+       wiener_info->hfilter[0] - WIENER_FILT_TAP0_MINV);
+   aom_write_primitive_refsubexpfin(
+       wb, tap1_range, WIENER_FILT_TAP1_SUBEXP_K,
+       ref_wiener_info->hfilter[1] - WIENER_FILT_TAP1_MINV,
+       wiener_info->hfilter[1] - WIENER_FILT_TAP1_MINV);
+   aom_write_primitive_refsubexpfin(
+       wb, tap2_range, WIENER_FILT_TAP2_SUBEXP_K,
+       ref_wiener_info->hfilter[2] - WIENER_FILT_TAP2_MINV,
+       wiener_info->hfilter[2] - WIENER_FILT_TAP2_MINV);
+
+   // The filter just written is the reference for the next one.
+   memcpy(ref_wiener_info, wiener_info, sizeof(*ref_wiener_info));
+ }
+
+ // Writes the self-guided projection parameters of |sgrproj_info|: the |ep|
+ // field as a SGRPROJ_PARAMS_BITS-bit literal, then xqd[0] and xqd[1], each
+ // coded relative to the same entry of |ref_sgrproj_info|. Afterwards
+ // |sgrproj_info| is copied into |ref_sgrproj_info| so it becomes the
+ // reference for the next call.
+ static void write_sgrproj_filter(SgrprojInfo *sgrproj_info,
+                                  SgrprojInfo *ref_sgrproj_info,
+                                  aom_writer *wb) {
+   // Size of the [MIN, MAX] range of each projection coefficient.
+   const int prj0_range = SGRPROJ_PRJ_MAX0 - SGRPROJ_PRJ_MIN0 + 1;
+   const int prj1_range = SGRPROJ_PRJ_MAX1 - SGRPROJ_PRJ_MIN1 + 1;
+
+   aom_write_literal(wb, sgrproj_info->ep, SGRPROJ_PARAMS_BITS);
+   aom_write_primitive_refsubexpfin(
+       wb, prj0_range, SGRPROJ_PRJ_SUBEXP_K,
+       ref_sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0,
+       sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0);
+   aom_write_primitive_refsubexpfin(
+       wb, prj1_range, SGRPROJ_PRJ_SUBEXP_K,
+       ref_sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1,
+       sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1);
+
+   // The parameters just written are the reference for the next set.
+   memcpy(ref_sgrproj_info, sgrproj_info, sizeof(*ref_sgrproj_info));
+ }
+
+ // Writes the per-tile loop-restoration data of all planes to |wb|.
+ // Plane 0 may be SWITCHABLE (a type token per tile), WIENER or SGRPROJ (an
+ // on/off flag per tile); the filter parameters follow each tile that uses a
+ // filter. Planes 1 and up only support WIENER, and their on/off flag is
+ // written only when there is more than one restoration tile.
+ static void encode_restoration(AV1_COMMON *cm, aom_writer *wb) {
+   int tile_idx, plane;
+   const int ntiles = av1_get_rest_ntiles(cm->width, cm->height,
+                                          cm->rst_info[0].restoration_tilesize,
+                                          NULL, NULL, NULL, NULL);
+   // Reference parameters used for differential coding; each written filter
+   // replaces them.
+   WienerInfo wiener_ref;
+   SgrprojInfo sgrproj_ref;
+   set_default_wiener(&wiener_ref);
+   set_default_sgrproj(&sgrproj_ref);
+   const int ntiles_uv = av1_get_rest_ntiles(
+       ROUND_POWER_OF_TWO(cm->width, cm->subsampling_x),
+       ROUND_POWER_OF_TWO(cm->height, cm->subsampling_y),
+       cm->rst_info[1].restoration_tilesize, NULL, NULL, NULL, NULL);
+   RestorationInfo *rsi = &cm->rst_info[0];
+
+   switch (rsi->frame_restoration_type) {
+     case RESTORE_SWITCHABLE:
+       for (tile_idx = 0; tile_idx < ntiles; ++tile_idx) {
+         av1_write_token(
+             wb, av1_switchable_restore_tree, cm->fc->switchable_restore_prob,
+             &switchable_restore_encodings[rsi->restoration_type[tile_idx]]);
+         if (rsi->restoration_type[tile_idx] == RESTORE_WIENER) {
+           write_wiener_filter(&rsi->wiener_info[tile_idx], &wiener_ref, wb);
+         } else if (rsi->restoration_type[tile_idx] == RESTORE_SGRPROJ) {
+           write_sgrproj_filter(&rsi->sgrproj_info[tile_idx], &sgrproj_ref, wb);
+         }
+       }
+       break;
+     case RESTORE_WIENER:
+       for (tile_idx = 0; tile_idx < ntiles; ++tile_idx) {
+         aom_write(wb, rsi->restoration_type[tile_idx] != RESTORE_NONE,
+                   RESTORE_NONE_WIENER_PROB);
+         if (rsi->restoration_type[tile_idx] != RESTORE_NONE) {
+           write_wiener_filter(&rsi->wiener_info[tile_idx], &wiener_ref, wb);
+         }
+       }
+       break;
+     case RESTORE_SGRPROJ:
+       for (tile_idx = 0; tile_idx < ntiles; ++tile_idx) {
+         aom_write(wb, rsi->restoration_type[tile_idx] != RESTORE_NONE,
+                   RESTORE_NONE_SGRPROJ_PROB);
+         if (rsi->restoration_type[tile_idx] != RESTORE_NONE) {
+           write_sgrproj_filter(&rsi->sgrproj_info[tile_idx], &sgrproj_ref, wb);
+         }
+       }
+       break;
+     default:
+       // RESTORE_NONE (or any other value): nothing to write for plane 0.
+       break;
+   }
+
+   for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
+     // Each remaining plane starts again from the default Wiener reference.
+     set_default_wiener(&wiener_ref);
+     rsi = &cm->rst_info[plane];
+     if (rsi->frame_restoration_type == RESTORE_WIENER) {
+       for (tile_idx = 0; tile_idx < ntiles_uv; ++tile_idx) {
+         if (ntiles_uv > 1) {
+           aom_write(wb, rsi->restoration_type[tile_idx] != RESTORE_NONE,
+                     RESTORE_NONE_WIENER_PROB);
+         }
+         if (rsi->restoration_type[tile_idx] != RESTORE_NONE) {
+           write_wiener_filter(&rsi->wiener_info[tile_idx], &wiener_ref, wb);
+         }
+       }
+     } else if (rsi->frame_restoration_type != RESTORE_NONE) {
+       assert(0);
+     }
+   }
+ }
+#endif // CONFIG_LOOP_RESTORATION
+
+ // Writes the frame-level loop-filter parameters to |wb|: the filter level
+ // (6 bits), the sharpness level (3 bits), and, when enabled and flagged for
+ // update, the reference-frame and mode deltas that differ from the values
+ // last sent. The last_ref_deltas / last_mode_deltas fields of cm->lf are
+ // updated to match what was written.
+ static void encode_loopfilter(AV1_COMMON *cm, struct aom_write_bit_buffer *wb) {
+   struct loopfilter *const lf = &cm->lf;
+   int idx;
+
+   // Filter level and type.
+   aom_wb_write_literal(wb, lf->filter_level, 6);
+   aom_wb_write_literal(wb, lf->sharpness_level, 3);
+
+   // Deltas applied at the MB level based on mode or reference frame are only
+   // sent when they are enabled.
+   aom_wb_write_bit(wb, lf->mode_ref_delta_enabled);
+   if (!lf->mode_ref_delta_enabled) return;
+
+   aom_wb_write_bit(wb, lf->mode_ref_delta_update);
+   if (!lf->mode_ref_delta_update) return;
+
+   for (idx = 0; idx < TOTAL_REFS_PER_FRAME; idx++) {
+     const int ref_delta = lf->ref_deltas[idx];
+     const int differs = ref_delta != lf->last_ref_deltas[idx];
+     aom_wb_write_bit(wb, differs);
+     if (!differs) continue;
+     lf->last_ref_deltas[idx] = ref_delta;
+     aom_wb_write_inv_signed_literal(wb, ref_delta, 6);
+   }
+
+   for (idx = 0; idx < MAX_MODE_LF_DELTAS; idx++) {
+     const int mode_delta = lf->mode_deltas[idx];
+     const int differs = mode_delta != lf->last_mode_deltas[idx];
+     aom_wb_write_bit(wb, differs);
+     if (!differs) continue;
+     lf->last_mode_deltas[idx] = mode_delta;
+     aom_wb_write_inv_signed_literal(wb, mode_delta, 6);
+   }
+ }
+
+#if CONFIG_CDEF
+ // Writes the frame-level CDEF parameters to |wb|: the dering damping minus 5
+ // (1 bit), the CLPF damping minus 3 (2 bits), cdef_bits (2 bits), then one
+ // pair of strengths (first cdef_strengths, then cdef_uv_strengths) for each
+ // of the cm->nb_cdef_strengths entries.
+ static void encode_cdef(const AV1_COMMON *cm, struct aom_write_bit_buffer *wb) {
+   int idx = 0;
+
+   aom_wb_write_literal(wb, cm->cdef_dering_damping - 5, 1);
+   aom_wb_write_literal(wb, cm->cdef_clpf_damping - 3, 2);
+   aom_wb_write_literal(wb, cm->cdef_bits, 2);
+
+   while (idx < cm->nb_cdef_strengths) {
+     aom_wb_write_literal(wb, cm->cdef_strengths[idx], CDEF_STRENGTH_BITS);
+     aom_wb_write_literal(wb, cm->cdef_uv_strengths[idx], CDEF_STRENGTH_BITS);
+     ++idx;
+   }
+ }
+#endif
+
+ // Writes a quantizer delta to |wb|: a presence bit, followed by the value as
+ // a 6-bit signed literal only when |delta_q| is non-zero.
+ static void write_delta_q(struct aom_write_bit_buffer *wb, int delta_q) {
+   const int is_nonzero = (delta_q != 0);
+
+   aom_wb_write_bit(wb, is_nonzero);
+   if (is_nonzero) aom_wb_write_inv_signed_literal(wb, delta_q, 6);
+ }
+
+ // Writes the frame quantization parameters to |wb|: the base q index in
+ // QINDEX_BITS bits, then the Y DC, UV DC and UV AC deltas in that order.
+ // With CONFIG_AOM_QM, a flag for quantization matrices follows, and the
+ // minimum and maximum matrix levels when that flag is set.
+ static void encode_quantization(const AV1_COMMON *const cm,
+                                 struct aom_write_bit_buffer *wb) {
+   aom_wb_write_literal(wb, cm->base_qindex, QINDEX_BITS);
+
+   write_delta_q(wb, cm->y_dc_delta_q);
+   write_delta_q(wb, cm->uv_dc_delta_q);
+   write_delta_q(wb, cm->uv_ac_delta_q);
+
+ #if CONFIG_AOM_QM
+   aom_wb_write_bit(wb, cm->using_qmatrix);
+   if (!cm->using_qmatrix) return;
+
+   aom_wb_write_literal(wb, cm->min_qmlevel, QM_LEVEL_BITS);
+   aom_wb_write_literal(wb, cm->max_qmlevel, QM_LEVEL_BITS);
+ #endif
+ }
+
+ // Writes the segmentation header to |wb|: the enabled flag, the map-update
+ // and temporal-update flags (sent only for frames that are neither intra-only
+ // nor error resilient), and, when the data is updated, an active flag plus
+ // value for every segment/feature pair.
+ static void encode_segmentation(AV1_COMMON *cm, MACROBLOCKD *xd,
+                                 struct aom_write_bit_buffer *wb) {
+   const struct segmentation *seg = &cm->seg;
+   int seg_id, feature;
+
+   aom_wb_write_bit(wb, seg->enabled);
+   if (!seg->enabled) return;
+
+   // Segmentation map: the update flag is implicit (must be 1) on intra-only
+   // or error-resilient frames.
+   if (frame_is_intra_only(cm) || cm->error_resilient_mode) {
+     assert(seg->update_map == 1);
+   } else {
+     aom_wb_write_bit(wb, seg->update_map);
+   }
+
+   if (seg->update_map) {
+     // Pick the coding strategy (temporal or spatial) before signalling it.
+     av1_choose_segmap_coding_method(cm, xd);
+
+     // The chosen method is implicit (must be 0) on intra-only or
+     // error-resilient frames.
+     if (frame_is_intra_only(cm) || cm->error_resilient_mode) {
+       assert(seg->temporal_update == 0);
+     } else {
+       aom_wb_write_bit(wb, seg->temporal_update);
+     }
+   }
+
+   // Segmentation data
+   aom_wb_write_bit(wb, seg->update_data);
+   if (!seg->update_data) return;
+
+   aom_wb_write_bit(wb, seg->abs_delta);
+
+   for (seg_id = 0; seg_id < MAX_SEGMENTS; seg_id++) {
+     for (feature = 0; feature < SEG_LVL_MAX; feature++) {
+       const int is_active = segfeature_active(seg, seg_id, feature);
+       aom_wb_write_bit(wb, is_active);
+       if (!is_active) continue;
+
+       {
+         const int value = get_segdata(seg, seg_id, feature);
+         const int value_max = av1_seg_feature_data_max(feature);
+
+         if (av1_is_segfeature_signed(feature)) {
+           // Signed features: magnitude first, then the sign bit.
+           encode_unsigned_max(wb, abs(value), value_max);
+           aom_wb_write_bit(wb, value < 0);
+         } else {
+           encode_unsigned_max(wb, value, value_max);
+         }
+       }
+     }
+   }
+ }
+
+#if !CONFIG_EC_ADAPT
+ // Writes the segmentation probability updates to |w|. Nothing is written
+ // unless segmentation is enabled and the map is being updated. With temporal
+ // update, the prediction probabilities are updated first and the tree
+ // probabilities use the mispredicted counts; otherwise the tree probabilities
+ // use the total counts.
+ static void update_seg_probs(AV1_COMP *cpi, aom_writer *w) {
+   AV1_COMMON *cm = &cpi->common;
+   // Weight handed to the update helpers: the tile-group count when tile
+   // groups are enabled, otherwise 1.
+ #if CONFIG_TILE_GROUPS
+   const int probwt = cm->num_tg;
+ #else
+   const int probwt = 1;
+ #endif
+   int pred_idx;
+
+   if (!(cm->seg.enabled && cm->seg.update_map)) return;
+
+   if (!cm->seg.temporal_update) {
+     prob_diff_update(av1_segment_tree, cm->fc->seg.tree_probs,
+                      cm->counts.seg.tree_total, MAX_SEGMENTS, probwt, w);
+     return;
+   }
+
+   for (pred_idx = 0; pred_idx < PREDICTION_PROBS; pred_idx++) {
+     av1_cond_prob_diff_update(w, &cm->fc->seg.pred_probs[pred_idx],
+                               cm->counts.seg.pred[pred_idx], probwt);
+   }
+
+   prob_diff_update(av1_segment_tree, cm->fc->seg.tree_probs,
+                    cm->counts.seg.tree_mispred, MAX_SEGMENTS, probwt, w);
+ }
+#endif
+
+ // Writes the frame transform mode to |wb|.
+ // When the frame is entirely lossless (every segment if segmentation is
+ // enabled, otherwise segment 0), nothing is written and |*mode| is forced to
+ // ONLY_4X4. Otherwise a bit tells whether the mode is TX_MODE_SELECT, and if
+ // it is not, the mode itself follows.
+ static void write_tx_mode(AV1_COMMON *cm, MACROBLOCKD *xd, TX_MODE *mode,
+                           struct aom_write_bit_buffer *wb) {
+   int all_lossless;
+   int is_select;
+
+   if (!cm->seg.enabled) {
+     all_lossless = xd->lossless[0];
+   } else {
+     // Stop at the first segment that is not lossless.
+     int seg_id = 0;
+     while (seg_id < MAX_SEGMENTS && xd->lossless[seg_id]) ++seg_id;
+     all_lossless = (seg_id == MAX_SEGMENTS);
+   }
+
+   if (all_lossless) {
+     *mode = ONLY_4X4;
+     return;
+   }
+
+   is_select = (*mode == TX_MODE_SELECT);
+   aom_wb_write_bit(wb, is_select);
+   if (is_select) return;
+
+ #if CONFIG_TX64X64
+   // Modes above ALLOW_32X32 are clamped into the 2-bit field; an extra bit
+   // then distinguishes ALLOW_64X64.
+   aom_wb_write_literal(wb, AOMMIN(*mode, ALLOW_32X32), 2);
+   if (*mode >= ALLOW_32X32) aom_wb_write_bit(wb, *mode == ALLOW_64X64);
+ #else
+   aom_wb_write_literal(wb, *mode, 2);
+ #endif  // CONFIG_TX64X64
+ }
+
+#if !CONFIG_EC_ADAPT
+ // Writes the transform-size probability updates to |w|. Nothing is written
+ // unless the frame uses TX_MODE_SELECT. One update is issued for every
+ // (depth, context) pair, using the matching counts from |counts|.
+ static void update_txfm_probs(AV1_COMMON *cm, aom_writer *w,
+                               FRAME_COUNTS *counts) {
+   // Weight handed to the update helper: the tile-group count when tile groups
+   // are enabled, otherwise 1.
+ #if CONFIG_TILE_GROUPS
+   const int probwt = cm->num_tg;
+ #else
+   const int probwt = 1;
+ #endif
+   int depth, ctx;
+
+   if (cm->tx_mode != TX_MODE_SELECT) return;
+
+   for (depth = 0; depth < MAX_TX_DEPTH; ++depth) {
+     for (ctx = 0; ctx < TX_SIZE_CONTEXTS; ++ctx) {
+       prob_diff_update(av1_tx_size_tree[depth], cm->fc->tx_size_probs[depth][ctx],
+                        counts->tx_size[depth][ctx], depth + 2, probwt, w);
+     }
+   }
+ }
+#endif
+
+ // Writes the frame-level interpolation filter to |wb|: one bit telling
+ // whether the filter is SWITCHABLE, followed by the filter itself in
+ // LOG_SWITCHABLE_FILTERS bits when it is not.
+ static void write_frame_interp_filter(InterpFilter filter,
+                                       struct aom_write_bit_buffer *wb) {
+   const int is_switchable = (filter == SWITCHABLE);
+
+   aom_wb_write_bit(wb, is_switchable);
+   if (!is_switchable) aom_wb_write_literal(wb, filter, LOG_SWITCHABLE_FILTERS);
+ }
+
+ // When the frame filter is SWITCHABLE but the counts in |counts| show that a
+ // single filter was actually used, sets that filter at the frame level. In
+ // builds with motion variation plus warped or global motion, the change is
+ // made only for EIGHTTAP_REGULAR or when the matching WARP_*_WITH_OBMC macro
+ // is non-zero.
+ static void fix_interp_filter(AV1_COMMON *cm, FRAME_COUNTS *counts) {
+   int filter, ctx;
+   int filters_in_use = 0;  // filters with a positive total count
+   int chosen = -1;         // first filter whose total count is non-zero
+
+   if (cm->interp_filter != SWITCHABLE) return;
+
+   // Total the usage of each filter over all contexts.
+   for (filter = 0; filter < SWITCHABLE_FILTERS; ++filter) {
+     int total = 0;
+     for (ctx = 0; ctx < SWITCHABLE_FILTER_CONTEXTS; ++ctx)
+       total += counts->switchable_interp[ctx][filter];
+     filters_in_use += (total > 0);
+     if (total && chosen < 0) chosen = filter;
+   }
+
+   // More than one filter (or none at all) in use: keep SWITCHABLE.
+   if (filters_in_use != 1) return;
+
+ #if CONFIG_MOTION_VAR && (CONFIG_WARPED_MOTION || CONFIG_GLOBAL_MOTION)
+ #if CONFIG_WARPED_MOTION
+   if (chosen == EIGHTTAP_REGULAR || WARP_WM_NEIGHBORS_WITH_OBMC)
+ #else
+   if (chosen == EIGHTTAP_REGULAR || WARP_GM_NEIGHBORS_WITH_OBMC)
+ #endif  // CONFIG_WARPED_MOTION
+ #endif  // CONFIG_MOTION_VAR && (CONFIG_WARPED_MOTION || CONFIG_GLOBAL_MOTION)
+     cm->interp_filter = chosen;
+ }
+
+ // Writes the tile configuration to |wb|.
+ // With CONFIG_EXT_TILE: the tile encoding mode (1 bit), then the tile width
+ // and height minus one, in superblock units, using 5 bits each for 128x128
+ // superblocks and 6 bits otherwise.
+ // Without it: the tile-column log2 as a run of 1 bits above the minimum,
+ // terminated by a 0 unless the maximum is reached, then the tile-row log2 in
+ // one or two bits.
+ // Optional flags for dependent horizontal tiles and loop filtering across
+ // tiles follow when those features are compiled in.
+ static void write_tile_info(const AV1_COMMON *const cm,
+                             struct aom_write_bit_buffer *wb) {
+ #if CONFIG_EXT_TILE
+   const int tile_width =
+       ALIGN_POWER_OF_TWO(cm->tile_width, cm->mib_size_log2) >>
+       cm->mib_size_log2;
+   const int tile_height =
+       ALIGN_POWER_OF_TWO(cm->tile_height, cm->mib_size_log2) >>
+       cm->mib_size_log2;
+   // Number of bits used for each tile dimension.
+   int size_bits = 6;
+
+   assert(tile_width > 0);
+   assert(tile_height > 0);
+
+   aom_wb_write_literal(wb, cm->tile_encoding_mode, 1);
+
+   // Write the tile sizes
+ #if CONFIG_EXT_PARTITION
+   if (cm->sb_size == BLOCK_128X128) size_bits = 5;
+ #endif  // CONFIG_EXT_PARTITION
+   assert(tile_width <= (1 << size_bits));
+   assert(tile_height <= (1 << size_bits));
+   aom_wb_write_literal(wb, tile_width - 1, size_bits);
+   aom_wb_write_literal(wb, tile_height - 1, size_bits);
+ #else
+   int min_log2_tile_cols, max_log2_tile_cols, ones;
+   av1_get_tile_n_bits(cm->mi_cols, &min_log2_tile_cols, &max_log2_tile_cols);
+
+   // columns
+   for (ones = cm->log2_tile_cols - min_log2_tile_cols; ones != 0; --ones)
+     aom_wb_write_bit(wb, 1);
+
+   if (cm->log2_tile_cols < max_log2_tile_cols) aom_wb_write_bit(wb, 0);
+
+   // rows
+   {
+     const int has_tile_rows = (cm->log2_tile_rows != 0);
+     aom_wb_write_bit(wb, has_tile_rows);
+     if (has_tile_rows) aom_wb_write_bit(wb, cm->log2_tile_rows != 1);
+   }
+ #endif  // CONFIG_EXT_TILE
+
+ #if CONFIG_DEPENDENT_HORZTILES
+   if (cm->log2_tile_rows != 0) aom_wb_write_bit(wb, cm->dependent_horz_tiles);
+ #endif
+
+ #if CONFIG_LOOPFILTERING_ACROSS_TILES
+   aom_wb_write_bit(wb, cm->loop_filter_across_tiles_enabled);
+ #endif  // CONFIG_LOOPFILTERING_ACROSS_TILES
+ }
+
+ // Builds the bit mask of frame-buffer slots refreshed by the current frame.
+ // Each refresh flag of |cpi| is shifted to the virtual index of the slot it
+ // targets and the results are OR-ed together.
+ static int get_refresh_mask(AV1_COMP *cpi) {
+   int mask = 0;
+
+ #if CONFIG_EXT_REFS
+   // NOTE(zoeliu): When LAST_FRAME is to get refreshed, the decoder will be
+   // notified to get LAST3_FRAME refreshed and then the virtual indexes for all
+   // the 3 LAST reference frames will be updated accordingly, i.e.:
+   // (1) The original virtual index for LAST3_FRAME will become the new virtual
+   //     index for LAST_FRAME; and
+   // (2) The original virtual indexes for LAST_FRAME and LAST2_FRAME will be
+   //     shifted and become the new virtual indexes for LAST2_FRAME and
+   //     LAST3_FRAME.
+   mask |= (cpi->refresh_last_frame << cpi->lst_fb_idxes[LAST_REF_FRAMES - 1]);
+   {
+     // arf_map[0] is used when the virtual indices have been swapped.
+     const int bwd_slot = (cpi->rc.is_bwd_ref_frame && cpi->num_extra_arfs)
+                              ? cpi->arf_map[0]
+                              : cpi->bwd_fb_idx;
+     mask |= (cpi->refresh_bwd_ref_frame << bwd_slot);
+   }
+ #else
+   mask |= (cpi->refresh_last_frame << cpi->lst_fb_idx);
+ #endif  // CONFIG_EXT_REFS
+
+   if (av1_preserve_existing_gf(cpi)) {
+     // We have decided to preserve the previously existing golden frame as our
+     // new ARF frame. However, in the short term we leave it in the GF slot and,
+     // if we're updating the GF with the current decoded frame, we save it
+     // instead to the ARF slot.
+     // Later, in the function av1_encoder.c:av1_update_reference_frames() we
+     // will swap gld_fb_idx and alt_fb_idx to achieve our objective. We do it
+     // there so that it can be done outside of the recode loop.
+     // Note: This is highly specific to the use of ARF as a forward reference,
+     // and this needs to be generalized as other uses are implemented
+     // (like RTC/temporal scalability).
+     return mask | (cpi->refresh_golden_frame << cpi->alt_fb_idx);
+   }
+
+   {
+ #if CONFIG_EXT_REFS
+     const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+     int arf_idx = cpi->arf_map[gf_group->arf_update_idx[gf_group->index]];
+ #else
+     int arf_idx = cpi->alt_fb_idx;
+     // In two-pass mode with multiple ARFs allowed, the slot comes from the
+     // golden-frame group instead.
+     if ((cpi->oxcf.pass == 2) && cpi->multi_arf_allowed) {
+       const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+       arf_idx = gf_group->arf_update_idx[gf_group->index];
+     }
+ #endif  // CONFIG_EXT_REFS
+     return mask | (cpi->refresh_golden_frame << cpi->gld_fb_idx) |
+            (cpi->refresh_alt_ref_frame << arf_idx);
+   }
+ }
+
+#if CONFIG_EXT_TILE
+// Looks for an already written tile whose payload is byte-for-byte identical
+// to the tile at (tile_row, tile_col).
+//
+// tile_buffers holds, for every tile, a pointer to its 4-byte little-endian
+// header followed by the payload, plus the payload size. The array bound
+// matches the declaration used by write_tiles() for cpi->tile_buffers.
+//
+// Returns the row distance (1..127) to the matching tile, or 0 when there is
+// no match. Only the tile directly above is considered at the moment.
+static INLINE int find_identical_tile(
+    const int tile_row, const int tile_col,
+    TileBufferEnc (*const tile_buffers)[MAX_TILE_COLS]) {
+  const MV32 candidate_offset[1] = { { 1, 0 } };
+  const int num_candidates =
+      (int)(sizeof(candidate_offset) / sizeof(candidate_offset[0]));
+  const uint8_t *const cur_tile_data =
+      tile_buffers[tile_row][tile_col].data + 4;
+  const size_t cur_tile_size = tile_buffers[tile_row][tile_col].size;
+  int i;
+
+  if (tile_row == 0) return 0;
+
+  // (TODO: yunqingwang) For now, only above tile is checked and used.
+  // More candidates such as left tile can be added later.
+  for (i = 0; i < num_candidates; i++) {
+    int row_offset = candidate_offset[i].row;
+    const int col_offset = candidate_offset[i].col;
+    int row = tile_row - row_offset;
+    const int col = tile_col - col_offset;
+    uint8_t tile_hdr;
+    const TileBufferEnc *candidate;
+
+    if (row < 0 || col < 0) continue;
+
+    // write_tiles() stores a copy-tile header as ((offset | 0x80) << 24) with
+    // mem_put_le32(), so the flag and the offset live in the most significant
+    // byte, i.e. byte 3 of the still uncompacted 4-byte header. Byte 0 only
+    // holds the low bits of an ordinary tile size and must not be used here.
+    tile_hdr = tile_buffers[row][col].data[3];
+
+    // Read out tcm bit
+    if ((tile_hdr >> 7) == 1) {
+      // The candidate is a copy tile itself: follow it to the tile that
+      // actually carries the data.
+      row_offset += tile_hdr & 0x7f;
+      row = tile_row - row_offset;
+      // Defensive: never index above the first tile row.
+      if (row < 0) continue;
+    }
+
+    candidate = &tile_buffers[row][col];
+
+    // The offset has to fit in the 7 bits available in the header.
+    if (row_offset >= 128 || candidate->size != cur_tile_size) continue;
+
+    if (memcmp(candidate->data + 4, cur_tile_data, cur_tile_size) != 0)
+      continue;
+
+    // Identical tile found
+    assert(row_offset > 0);
+    return row_offset;
+  }
+
+  // No identical tile found
+  return 0;
+}
+#endif // CONFIG_EXT_TILE
+
+#if CONFIG_TILE_GROUPS
+static uint32_t write_tiles(AV1_COMP *const cpi,
+ struct aom_write_bit_buffer *wb,
+ unsigned int *max_tile_size,
+ unsigned int *max_tile_col_size) {
+#else
+static uint32_t write_tiles(AV1_COMP *const cpi, uint8_t *const dst,
+ unsigned int *max_tile_size,
+ unsigned int *max_tile_col_size) {
+#endif /* write_tiles: entropy-codes every tile of the frame into the output
+ * buffer and returns the total number of bytes produced.
+ *
+ * With CONFIG_TILE_GROUPS the output buffer is taken from wb and this
+ * function also writes the uncompressed and compressed frame headers in
+ * front of the tile data (repeating them for every additional tile group);
+ * otherwise tile data is written starting at dst.
+ *
+ * max_tile_size and max_tile_col_size are reset to 0 and then raised to the
+ * largest tile size / tile column size recorded while writing.
+ *
+ * Each tile's start address and coded size are stored in cpi->tile_buffers.
+ */
+ const AV1_COMMON *const cm = &cpi->common;
+#if CONFIG_ANS
+ struct BufAnsCoder *buf_ans = &cpi->buf_ans;
+#else
+ aom_writer mode_bc;
+#endif // CONFIG_ANS
+ int tile_row, tile_col;
+ TOKENEXTRA *(*const tok_buffers)[MAX_TILE_COLS] = cpi->tile_tok;
+ TileBufferEnc(*const tile_buffers)[MAX_TILE_COLS] = cpi->tile_buffers;
+ uint32_t total_size = 0; // running byte offset into dst
+ const int tile_cols = cm->tile_cols;
+ const int tile_rows = cm->tile_rows;
+ unsigned int tile_size = 0;
+#if CONFIG_TILE_GROUPS
+ const int n_log2_tiles = cm->log2_tile_rows + cm->log2_tile_cols;
+ const int have_tiles = n_log2_tiles > 0;
+ uint32_t comp_hdr_size;
+ // Fixed size tile groups for the moment
+ const int num_tg_hdrs = cm->num_tg;
+ const int tg_size = (tile_rows * tile_cols + num_tg_hdrs - 1) / num_tg_hdrs;
+ int tile_count = 0; // tiles written so far in the current tile group
+ int tg_count = 1;
+ int tile_size_bytes = 4;
+ int tile_col_size_bytes;
+ uint32_t uncompressed_hdr_size = 0;
+ uint8_t *dst = NULL;
+ struct aom_write_bit_buffer comp_hdr_len_wb;
+ struct aom_write_bit_buffer tg_params_wb;
+ struct aom_write_bit_buffer tile_size_bytes_wb;
+ uint32_t saved_offset;
+ int mtu_size = cpi->oxcf.mtu;
+ int curr_tg_data_size = 0;
+ int hdr_size;
+#endif
+#if CONFIG_EXT_TILE
+ const int have_tiles = tile_cols * tile_rows > 1;
+#endif // CONFIG_EXT_TILE
+
+ *max_tile_size = 0;
+ *max_tile_col_size = 0;
+
+// All tile size fields are output on 4 bytes. A call to remux_tiles will
+// later compact the data if smaller headers are adequate.
+
+#if CONFIG_EXT_TILE
+ // Tiles are written column by column; every column except the last one is
+ // preceded by a 4-byte column size field.
+ for (tile_col = 0; tile_col < tile_cols; tile_col++) {
+ TileInfo tile_info;
+ const int is_last_col = (tile_col == tile_cols - 1);
+ const uint32_t col_offset = total_size;
+
+ av1_tile_set_col(&tile_info, cm, tile_col);
+
+ // The last column does not have a column header
+ if (!is_last_col) total_size += 4;
+
+ for (tile_row = 0; tile_row < tile_rows; tile_row++) {
+ TileBufferEnc *const buf = &tile_buffers[tile_row][tile_col];
+ const TOKENEXTRA *tok = tok_buffers[tile_row][tile_col];
+ const TOKENEXTRA *tok_end = tok + cpi->tok_count[tile_row][tile_col];
+ const int data_offset = have_tiles ? 4 : 0;
+#if CONFIG_EC_ADAPT
+ const int tile_idx = tile_row * tile_cols + tile_col;
+ TileDataEnc *this_tile = &cpi->tile_data[tile_idx];
+#endif
+ av1_tile_set_row(&tile_info, cm, tile_row);
+
+ buf->data = dst + total_size;
+
+ // If CONFIG_EXT_TILE = 1, every tile in the row has a header,
+ // even for the last one, unless no tiling is used at all.
+ total_size += data_offset;
+#if CONFIG_EC_ADAPT
+ // Initialise tile context from the frame context
+ this_tile->tctx = *cm->fc;
+ cpi->td.mb.e_mbd.tile_ctx = &this_tile->tctx;
+#endif
+#if CONFIG_PVQ
+ cpi->td.mb.pvq_q = &this_tile->pvq_q;
+ cpi->td.mb.daala_enc.state.adapt = &this_tile->tctx.pvq_context;
+#endif // CONFIG_PVQ
+#if !CONFIG_ANS
+ aom_start_encode(&mode_bc, buf->data + data_offset);
+ write_modes(cpi, &tile_info, &mode_bc, &tok, tok_end);
+ assert(tok == tok_end);
+ aom_stop_encode(&mode_bc);
+ tile_size = mode_bc.pos;
+#else
+ buf_ans_write_init(buf_ans, buf->data + data_offset);
+ write_modes(cpi, &tile_info, buf_ans, &tok, tok_end);
+ assert(tok == tok_end);
+ aom_buf_ans_flush(buf_ans);
+ tile_size = buf_ans_write_end(buf_ans);
+#endif // !CONFIG_ANS
+#if CONFIG_PVQ
+ cpi->td.mb.pvq_q = NULL;
+#endif
+ buf->size = tile_size;
+
+ // Record the maximum tile size we see, so we can compact headers later.
+ *max_tile_size = AOMMAX(*max_tile_size, tile_size);
+
+ if (have_tiles) {
+ // tile header: size of this tile, or copy offset
+ uint32_t tile_header = tile_size;
+
+ // If the tile_encoding_mode is 1 (i.e. TILE_VR), check if this tile is
+ // a copy tile.
+ // Very low chances to have copy tiles on the key frames, so don't
+ // search on key frames to reduce unnecessary search.
+ if (cm->frame_type != KEY_FRAME && cm->tile_encoding_mode) {
+ const int idendical_tile_offset =
+ find_identical_tile(tile_row, tile_col, tile_buffers);
+
+ if (idendical_tile_offset > 0) {
+ tile_size = 0; // a copy tile adds no payload bytes to total_size
+ tile_header = idendical_tile_offset | 0x80; // copy flag + row offset
+ tile_header <<= 24; // kept in the top byte of the 32-bit header
+ }
+ }
+
+ mem_put_le32(buf->data, tile_header);
+ }
+
+ total_size += tile_size;
+ }
+
+ if (!is_last_col) {
+ uint32_t col_size = total_size - col_offset - 4; // excludes the size field
+ mem_put_le32(dst + col_offset, col_size);
+
+ // If it is not final packing, record the maximum tile column size we see,
+ // otherwise, check if the tile size is out of the range.
+ *max_tile_col_size = AOMMAX(*max_tile_col_size, col_size);
+ }
+ }
+#else
+#if CONFIG_TILE_GROUPS
+ write_uncompressed_header(cpi, wb);
+
+#if CONFIG_EXT_REFS
+ // A show_existing_frame header carries no tile data: stop here.
+ if (cm->show_existing_frame) {
+ total_size = aom_wb_bytes_written(wb);
+ return (uint32_t)total_size;
+ }
+#endif // CONFIG_EXT_REFS
+
+ // Write the tile length code
+ tile_size_bytes_wb = *wb; // remembers where the code sits for the final patch
+ aom_wb_write_literal(wb, 3, 2);
+
+ /* Write a placeholder for the number of tiles in each tile group */
+ tg_params_wb = *wb;
+ saved_offset = wb->bit_offset;
+ if (have_tiles) {
+ aom_wb_overwrite_literal(wb, 3, n_log2_tiles);
+ aom_wb_overwrite_literal(wb, (1 << n_log2_tiles) - 1, n_log2_tiles);
+ }
+
+ /* Write a placeholder for the compressed header length */
+ comp_hdr_len_wb = *wb;
+ aom_wb_write_literal(wb, 0, 16);
+
+ uncompressed_hdr_size = aom_wb_bytes_written(wb);
+ dst = wb->bit_buffer;
+ comp_hdr_size = write_compressed_header(cpi, dst + uncompressed_hdr_size);
+ aom_wb_overwrite_literal(&comp_hdr_len_wb, (int)(comp_hdr_size), 16);
+ hdr_size = uncompressed_hdr_size + comp_hdr_size;
+ total_size += hdr_size;
+#endif
+
+ // Tiles are written in raster order (row by row).
+ for (tile_row = 0; tile_row < tile_rows; tile_row++) {
+ TileInfo tile_info;
+ const int is_last_row = (tile_row == tile_rows - 1);
+ av1_tile_set_row(&tile_info, cm, tile_row);
+
+ for (tile_col = 0; tile_col < tile_cols; tile_col++) {
+ const int tile_idx = tile_row * tile_cols + tile_col;
+ TileBufferEnc *const buf = &tile_buffers[tile_row][tile_col];
+#if CONFIG_PVQ || CONFIG_EC_ADAPT
+ TileDataEnc *this_tile = &cpi->tile_data[tile_idx];
+#endif
+ const TOKENEXTRA *tok = tok_buffers[tile_row][tile_col];
+ const TOKENEXTRA *tok_end = tok + cpi->tok_count[tile_row][tile_col];
+ const int is_last_col = (tile_col == tile_cols - 1);
+ const int is_last_tile = is_last_col && is_last_row;
+#if !CONFIG_TILE_GROUPS
+ (void)tile_idx;
+#else
+
+ // Start a new tile group either when the fixed group size is exceeded (no
+ // MTU configured) or when the current group has reached the MTU size.
+ if ((!mtu_size && tile_count > tg_size) ||
+ (mtu_size && tile_count && curr_tg_data_size >= mtu_size)) {
+ // New tile group
+ tg_count++;
+ // We've exceeded the packet size
+ if (tile_count > 1) {
+ /* The last tile exceeded the packet size. The tile group size
+ should therefore be tile_count-1.
+ Move the last tile and insert headers before it
+ */
+ uint32_t old_total_size = total_size - tile_size - 4; // start of last tile
+ memmove(dst + old_total_size + hdr_size, dst + old_total_size,
+ (tile_size + 4) * sizeof(uint8_t));
+ // Copy uncompressed header
+ memmove(dst + old_total_size, dst,
+ uncompressed_hdr_size * sizeof(uint8_t));
+ // Write the number of tiles in the group into the last uncompressed
+ // header before the one we've just inserted
+ aom_wb_overwrite_literal(&tg_params_wb, tile_idx - tile_count,
+ n_log2_tiles);
+ aom_wb_overwrite_literal(&tg_params_wb, tile_count - 2, n_log2_tiles);
+ // Update the pointer to the last TG params
+ tg_params_wb.bit_offset = saved_offset + 8 * old_total_size;
+ // Copy compressed header
+ memmove(dst + old_total_size + uncompressed_hdr_size,
+ dst + uncompressed_hdr_size, comp_hdr_size * sizeof(uint8_t));
+ total_size += hdr_size;
+ tile_count = 1; // the moved tile is the first one of the new group
+ curr_tg_data_size = hdr_size + tile_size + 4;
+
+ } else {
+ // We exceeded the packet size in just one tile
+ // Copy uncompressed header
+ memmove(dst + total_size, dst,
+ uncompressed_hdr_size * sizeof(uint8_t));
+ // Write the number of tiles in the group into the last uncompressed
+ // header
+ aom_wb_overwrite_literal(&tg_params_wb, tile_idx - tile_count,
+ n_log2_tiles);
+ aom_wb_overwrite_literal(&tg_params_wb, tile_count - 1, n_log2_tiles);
+ tg_params_wb.bit_offset = saved_offset + 8 * total_size;
+ // Copy compressed header
+ memmove(dst + total_size + uncompressed_hdr_size,
+ dst + uncompressed_hdr_size, comp_hdr_size * sizeof(uint8_t));
+ total_size += hdr_size;
+ tile_count = 0;
+ curr_tg_data_size = hdr_size;
+ }
+ }
+ tile_count++;
+#endif
+ av1_tile_set_col(&tile_info, cm, tile_col);
+
+#if CONFIG_DEPENDENT_HORZTILES && CONFIG_TILE_GROUPS
+ av1_tile_set_tg_boundary(&tile_info, cm, tile_row, tile_col);
+#endif
+ buf->data = dst + total_size;
+
+ // The last tile does not have a header.
+ if (!is_last_tile) total_size += 4;
+
+#if CONFIG_EC_ADAPT
+ // Initialise tile context from the frame context
+ this_tile->tctx = *cm->fc;
+ cpi->td.mb.e_mbd.tile_ctx = &this_tile->tctx;
+#endif
+#if CONFIG_PVQ
+ cpi->td.mb.pvq_q = &this_tile->pvq_q;
+ cpi->td.mb.daala_enc.state.adapt = &this_tile->tctx.pvq_context;
+#endif // CONFIG_PVQ
+#if CONFIG_ANS
+ buf_ans_write_init(buf_ans, dst + total_size);
+ write_modes(cpi, &tile_info, buf_ans, &tok, tok_end);
+ assert(tok == tok_end);
+ aom_buf_ans_flush(buf_ans);
+ tile_size = buf_ans_write_end(buf_ans);
+#else
+ aom_start_encode(&mode_bc, dst + total_size);
+ write_modes(cpi, &tile_info, &mode_bc, &tok, tok_end);
+#if !CONFIG_LV_MAP
+ assert(tok == tok_end);
+#endif // !CONFIG_LV_MAP
+ aom_stop_encode(&mode_bc);
+ tile_size = mode_bc.pos;
+#endif // CONFIG_ANS
+#if CONFIG_PVQ
+ cpi->td.mb.pvq_q = NULL;
+#endif
+
+ assert(tile_size > 0);
+
+#if CONFIG_TILE_GROUPS
+ curr_tg_data_size += tile_size + 4;
+#endif
+ buf->size = tile_size;
+
+ if (!is_last_tile) {
+ *max_tile_size = AOMMAX(*max_tile_size, tile_size);
+ // size of this tile
+ mem_put_le32(buf->data, tile_size);
+ }
+
+ total_size += tile_size;
+ }
+ }
+#if CONFIG_TILE_GROUPS
+ // Write the final tile group size
+ if (n_log2_tiles) {
+ aom_wb_overwrite_literal(&tg_params_wb, (1 << n_log2_tiles) - tile_count,
+ n_log2_tiles);
+ aom_wb_overwrite_literal(&tg_params_wb, tile_count - 1, n_log2_tiles);
+ }
+ // Remux if possible. TODO (Thomas Davies): do this for more than one tile
+ // group
+ if (have_tiles && tg_count == 1) {
+ int data_size = total_size - (uncompressed_hdr_size + comp_hdr_size);
+ data_size = remux_tiles(cm, dst + uncompressed_hdr_size + comp_hdr_size,
+ data_size, *max_tile_size, *max_tile_col_size,
+ &tile_size_bytes, &tile_col_size_bytes);
+ total_size = data_size + uncompressed_hdr_size + comp_hdr_size;
+ // Patch the tile length code written before the tile data.
+ aom_wb_overwrite_literal(&tile_size_bytes_wb, tile_size_bytes - 1, 2);
+ }
+
+#endif
+#endif // CONFIG_EXT_TILE
+ return (uint32_t)total_size;
+}
+
+// Signals the render size: one flag bit telling whether it differs from the
+// coded frame size and, only when it does, render width and height (each
+// minus one) on 16 bits.
+static void write_render_size(const AV1_COMMON *cm,
+                              struct aom_write_bit_buffer *wb) {
+  const int same_as_coded =
+      cm->width == cm->render_width && cm->height == cm->render_height;
+
+  aom_wb_write_bit(wb, !same_as_coded);
+  if (same_as_coded) return;
+
+  aom_wb_write_literal(wb, cm->render_width - 1, 16);
+  aom_wb_write_literal(wb, cm->render_height - 1, 16);
+}
+
+#if CONFIG_FRAME_SUPERRES
+// Signals the superres scale: one flag bit, followed by the scale numerator
+// (as an offset from SUPERRES_SCALE_NUMERATOR_MIN) when scaling is in use.
+static void write_superres_scale(const AV1_COMMON *const cm,
+                                 struct aom_write_bit_buffer *wb) {
+  const int is_scaled =
+      cm->superres_scale_numerator != SUPERRES_SCALE_DENOMINATOR;
+
+  // Render-size scaling and frame superres are probably incompatible
+  assert(cm->width == cm->render_width && cm->height == cm->render_height);
+
+  // First bit is whether to scale or not
+  aom_wb_write_bit(wb, is_scaled);
+  if (!is_scaled) return;
+
+  // TODO(afergs): write factor to the compressed header instead
+  aom_wb_write_literal(
+      wb, cm->superres_scale_numerator - SUPERRES_SCALE_NUMERATOR_MIN,
+      SUPERRES_SCALE_BITS);
+}
+#endif // CONFIG_FRAME_SUPERRES
+
+// Writes the frame dimensions (each minus one, on 16 bits), followed by the
+// render size and, with CONFIG_FRAME_SUPERRES, the superres scale.
+static void write_frame_size(const AV1_COMMON *cm,
+                             struct aom_write_bit_buffer *wb) {
+  int signalled_width = cm->width;
+  int signalled_height = cm->height;
+
+#if CONFIG_FRAME_SUPERRES
+  // If SUPERRES scaling is happening, write the full resolution instead of
+  // the downscaled resolution. The decoder will reduce this resolution
+  // itself.
+  if (cm->superres_scale_numerator != SUPERRES_SCALE_DENOMINATOR) {
+    signalled_width = cm->superres_width;
+    signalled_height = cm->superres_height;
+  }
+#endif  // CONFIG_FRAME_SUPERRES
+
+  aom_wb_write_literal(wb, signalled_width - 1, 16);
+  aom_wb_write_literal(wb, signalled_height - 1, 16);
+
+  // TODO(afergs): Also write something different to render_size?
+  //               When superres scales, they'll be almost guaranteed to be
+  //               different on the other side.
+  write_render_size(cm, wb);
+#if CONFIG_FRAME_SUPERRES
+  write_superres_scale(cm, wb);
+#endif  // CONFIG_FRAME_SUPERRES
+}
+
+// Signals the frame size by reference where possible: for each reference
+// frame in turn one bit tells whether the current frame has the same coded
+// and render dimensions as that reference. Stops at the first match; when no
+// reference matches, the size is written explicitly.
+static void write_frame_size_with_refs(AV1_COMP *cpi,
+                                       struct aom_write_bit_buffer *wb) {
+  AV1_COMMON *const cm = &cpi->common;
+  MV_REFERENCE_FRAME ref;
+  int size_matches = 0;
+
+  for (ref = LAST_FRAME; ref <= ALTREF_FRAME && !size_matches; ++ref) {
+    const YV12_BUFFER_CONFIG *const ref_buf = get_ref_frame_buffer(cpi, ref);
+
+    // A reference without a buffer can never match (flag stays 0).
+    if (ref_buf != NULL) {
+      size_matches = cm->width == ref_buf->y_crop_width &&
+                     cm->height == ref_buf->y_crop_height &&
+                     cm->render_width == ref_buf->render_width &&
+                     cm->render_height == ref_buf->render_height;
+    }
+    aom_wb_write_bit(wb, size_matches);
+  }
+
+  if (size_matches) return;
+  write_frame_size(cm, wb);
+}
+
+// Emits the three sync code bytes, in order, 8 bits each.
+static void write_sync_code(struct aom_write_bit_buffer *wb) {
+  const int sync_bytes[3] = { AV1_SYNC_CODE_0, AV1_SYNC_CODE_1,
+                              AV1_SYNC_CODE_2 };
+  int i;
+
+  for (i = 0; i < 3; ++i) aom_wb_write_literal(wb, sync_bytes[i], 8);
+}
+
+// Writes the bitstream profile. Profiles 0-2 use a 2-bit code and profile 3
+// a 3-bit code; an unknown profile asserts and writes nothing.
+static void write_profile(BITSTREAM_PROFILE profile,
+                          struct aom_write_bit_buffer *wb) {
+  int code;
+  int num_bits = 2;
+
+  switch (profile) {
+    case PROFILE_0: code = 0; break;
+    case PROFILE_1: code = 2; break;
+    case PROFILE_2: code = 1; break;
+    case PROFILE_3:
+      code = 6;
+      num_bits = 3;
+      break;
+    default: assert(0); return;
+  }
+
+  aom_wb_write_literal(wb, code, num_bits);
+}
+
+// Writes bit depth (profiles >= 2 only), color space and, depending on the
+// color space and profile, color range and chroma subsampling.
+static void write_bitdepth_colorspace_sampling(
+    AV1_COMMON *const cm, struct aom_write_bit_buffer *wb) {
+  // Profiles 1 and 3 are the ones that carry explicit subsampling bits.
+  const int explicit_sampling =
+      cm->profile == PROFILE_1 || cm->profile == PROFILE_3;
+
+  if (cm->profile >= PROFILE_2) {
+    assert(cm->bit_depth > AOM_BITS_8);
+    // 0 for 10-bit, 1 otherwise
+    aom_wb_write_bit(wb, cm->bit_depth != AOM_BITS_10);
+  }
+
+  aom_wb_write_literal(wb, cm->color_space, 3);
+
+  if (cm->color_space == AOM_CS_SRGB) {
+    assert(explicit_sampling);
+    aom_wb_write_bit(wb, 0);  // unused
+    return;
+  }
+
+  // 0: [16, 235] (i.e. xvYCC), 1: [0, 255]
+  aom_wb_write_bit(wb, cm->color_range);
+
+  if (!explicit_sampling) {
+    assert(cm->subsampling_x == 1 && cm->subsampling_y == 1);
+    return;
+  }
+
+  assert(cm->subsampling_x != 1 || cm->subsampling_y != 1);
+  aom_wb_write_bit(wb, cm->subsampling_x);
+  aom_wb_write_bit(wb, cm->subsampling_y);
+  aom_wb_write_bit(wb, 0);  // unused
+}
+
+#if CONFIG_REFERENCE_BUFFER
+/* Fills the frame id related fields of seq_params from compile-time
+ * constants. Nothing is written to the bitstream yet: this is a placeholder
+ * for the real sequence header writer. */
+void write_sequence_header(SequenceHeader *seq_params) {
+  seq_params->delta_frame_id_length_minus2 = DELTA_FRAME_ID_LENGTH_MINUS2;
+  seq_params->frame_id_length_minus7 = FRAME_ID_LENGTH_MINUS7;
+  seq_params->frame_id_numbers_present_flag = FRAME_ID_NUMBERS_PRESENT_FLAG;
+}
+#endif
+
+static void write_uncompressed_header(AV1_COMP *cpi,
+ struct aom_write_bit_buffer *wb) { /* Writes the uncompressed frame header
+ * into wb. In order: frame marker, profile, the show_existing_frame flag
+ * (with CONFIG_EXT_REFS the header ends right after it when an existing
+ * frame is shown), frame type / show_frame / error resilient flags, then
+ * the key-frame, intra-only or inter specific fields, followed by the
+ * frame context signalling, superblock size, loop filter, quantization,
+ * segmentation, tx mode, reference mode and tile info.
+ *
+ * Besides writing bits, this function updates encoder state depending on
+ * the build configuration: cm->is_reference_frame,
+ * cpi->refresh_frame_mask, cm->refresh_mask,
+ * cm->invalid_delta_frame_id_minus1, cm->superres_scale_numerator,
+ * xd->prev_qindex and xd->prev_delta_lf_from_base.
+ */
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+
+#if CONFIG_REFERENCE_BUFFER
+ /* TODO: Move outside frame loop or inside key-frame branch */
+ write_sequence_header(&cpi->seq_params);
+#endif
+
+ aom_wb_write_literal(wb, AOM_FRAME_MARKER, 2);
+
+ write_profile(cm->profile, wb);
+
+#if CONFIG_EXT_REFS
+ // NOTE: By default all coded frames to be used as a reference
+ cm->is_reference_frame = 1;
+
+ if (cm->show_existing_frame) {
+ RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
+ const int frame_to_show = cm->ref_frame_map[cpi->existing_fb_idx_to_show];
+
+ if (frame_to_show < 0 || frame_bufs[frame_to_show].ref_count < 1) {
+ aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+ "Buffer %d does not contain a reconstructed frame",
+ frame_to_show);
+ }
+ ref_cnt_fb(frame_bufs, &cm->new_fb_idx, frame_to_show);
+
+ aom_wb_write_bit(wb, 1); // show_existing_frame
+ aom_wb_write_literal(wb, cpi->existing_fb_idx_to_show, 3);
+
+#if CONFIG_REFERENCE_BUFFER
+ if (cpi->seq_params.frame_id_numbers_present_flag) {
+ int frame_id_len = cpi->seq_params.frame_id_length_minus7 + 7;
+ int display_frame_id = cm->ref_frame_id[cpi->existing_fb_idx_to_show];
+ aom_wb_write_literal(wb, display_frame_id, frame_id_len);
+ /* Add a zero byte to prevent emulation of superframe marker */
+ /* Same logic as when terminating the entropy coder */
+ /* Consider to have this logic only one place */
+ aom_wb_write_literal(wb, 0, 8);
+ }
+#endif
+
+ return; // nothing else is coded when an existing frame is shown
+ } else {
+#endif // CONFIG_EXT_REFS
+ aom_wb_write_bit(wb, 0); // show_existing_frame
+#if CONFIG_EXT_REFS
+ }
+#endif // CONFIG_EXT_REFS
+
+ aom_wb_write_bit(wb, cm->frame_type);
+ aom_wb_write_bit(wb, cm->show_frame);
+ aom_wb_write_bit(wb, cm->error_resilient_mode);
+
+#if CONFIG_REFERENCE_BUFFER
+ cm->invalid_delta_frame_id_minus1 = 0;
+ if (cpi->seq_params.frame_id_numbers_present_flag) {
+ int frame_id_len = cpi->seq_params.frame_id_length_minus7 + 7;
+ aom_wb_write_literal(wb, cm->current_frame_id, frame_id_len);
+ }
+#endif
+
+#if CONFIG_FRAME_SUPERRES
+ // TODO(afergs): Remove - this is just to stop superres from breaking
+ cm->superres_scale_numerator = SUPERRES_SCALE_DENOMINATOR;
+#endif // CONFIG_FRAME_SUPERRES
+
+ if (cm->frame_type == KEY_FRAME) {
+ write_sync_code(wb);
+ write_bitdepth_colorspace_sampling(cm, wb);
+ write_frame_size(cm, wb);
+#if CONFIG_ANS && ANS_MAX_SYMBOLS
+ assert(cpi->common.ans_window_size_log2 >= 8);
+ assert(cpi->common.ans_window_size_log2 < 24);
+ aom_wb_write_literal(wb, cpi->common.ans_window_size_log2 - 8, 4);
+#endif // CONFIG_ANS && ANS_MAX_SYMBOLS
+#if CONFIG_PALETTE
+ aom_wb_write_bit(wb, cm->allow_screen_content_tools);
+#endif // CONFIG_PALETTE
+ } else {
+ // intra_only is only signalled for frames that are not shown.
+ if (!cm->show_frame) aom_wb_write_bit(wb, cm->intra_only);
+#if CONFIG_PALETTE
+ if (cm->intra_only) aom_wb_write_bit(wb, cm->allow_screen_content_tools);
+#endif // CONFIG_PALETTE
+ if (!cm->error_resilient_mode) {
+ // reset_frame_context: one bit for intra-only frames, otherwise a
+ // "reset at all" bit optionally followed by a "reset all contexts" bit.
+ if (cm->intra_only) {
+ aom_wb_write_bit(wb,
+ cm->reset_frame_context == RESET_FRAME_CONTEXT_ALL);
+ } else {
+ aom_wb_write_bit(wb,
+ cm->reset_frame_context != RESET_FRAME_CONTEXT_NONE);
+ if (cm->reset_frame_context != RESET_FRAME_CONTEXT_NONE)
+ aom_wb_write_bit(wb,
+ cm->reset_frame_context == RESET_FRAME_CONTEXT_ALL);
+ }
+ }
+
+#if CONFIG_EXT_REFS
+ cpi->refresh_frame_mask = get_refresh_mask(cpi);
+#endif // CONFIG_EXT_REFS
+
+ if (cm->intra_only) {
+ write_sync_code(wb);
+ write_bitdepth_colorspace_sampling(cm, wb);
+
+#if CONFIG_EXT_REFS
+ aom_wb_write_literal(wb, cpi->refresh_frame_mask, REF_FRAMES);
+#else
+ aom_wb_write_literal(wb, get_refresh_mask(cpi), REF_FRAMES);
+#endif // CONFIG_EXT_REFS
+ write_frame_size(cm, wb);
+
+#if CONFIG_ANS && ANS_MAX_SYMBOLS
+ assert(cpi->common.ans_window_size_log2 >= 8);
+ assert(cpi->common.ans_window_size_log2 < 24);
+ aom_wb_write_literal(wb, cpi->common.ans_window_size_log2 - 8, 4);
+#endif // CONFIG_ANS && ANS_MAX_SYMBOLS
+ } else {
+ MV_REFERENCE_FRAME ref_frame;
+
+#if CONFIG_EXT_REFS
+ aom_wb_write_literal(wb, cpi->refresh_frame_mask, REF_FRAMES);
+#else
+ aom_wb_write_literal(wb, get_refresh_mask(cpi), REF_FRAMES);
+#endif // CONFIG_EXT_REFS
+
+#if CONFIG_EXT_REFS
+ if (!cpi->refresh_frame_mask) {
+ // NOTE: "cpi->refresh_frame_mask == 0" indicates that the coded frame
+ // will not be used as a reference
+ cm->is_reference_frame = 0;
+ }
+#endif // CONFIG_EXT_REFS
+
+ // For every reference: its buffer map index and sign bias, plus the frame
+ // id delta when frame id numbers are present.
+ for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+ assert(get_ref_frame_map_idx(cpi, ref_frame) != INVALID_IDX);
+ aom_wb_write_literal(wb, get_ref_frame_map_idx(cpi, ref_frame),
+ REF_FRAMES_LOG2);
+ aom_wb_write_bit(wb, cm->ref_frame_sign_bias[ref_frame]);
+#if CONFIG_REFERENCE_BUFFER
+ if (cpi->seq_params.frame_id_numbers_present_flag) {
+ int i = get_ref_frame_map_idx(cpi, ref_frame);
+ int frame_id_len = cpi->seq_params.frame_id_length_minus7 + 7;
+ int diff_len = cpi->seq_params.delta_frame_id_length_minus2 + 2;
+ // Distance from the reference's frame id to the current one, modulo
+ // 2^frame_id_len, minus one.
+ int delta_frame_id_minus1 =
+ ((cm->current_frame_id - cm->ref_frame_id[i] +
+ (1 << frame_id_len)) %
+ (1 << frame_id_len)) -
+ 1;
+ // Flag deltas that do not fit in diff_len bits; the value is written
+ // regardless.
+ if (delta_frame_id_minus1 < 0 ||
+ delta_frame_id_minus1 >= (1 << diff_len))
+ cm->invalid_delta_frame_id_minus1 = 1;
+ aom_wb_write_literal(wb, delta_frame_id_minus1, diff_len);
+ }
+#endif
+ }
+
+#if CONFIG_FRAME_SIZE
+ if (cm->error_resilient_mode == 0) {
+ write_frame_size_with_refs(cpi, wb);
+ } else {
+ write_frame_size(cm, wb);
+ }
+#else
+ write_frame_size_with_refs(cpi, wb);
+#endif
+
+ aom_wb_write_bit(wb, cm->allow_high_precision_mv);
+
+ fix_interp_filter(cm, cpi->td.counts);
+ write_frame_interp_filter(cm->interp_filter, wb);
+#if CONFIG_TEMPMV_SIGNALING
+ if (!cm->error_resilient_mode) {
+ aom_wb_write_bit(wb, cm->use_prev_frame_mvs);
+ }
+#endif
+ }
+ }
+
+#if CONFIG_REFERENCE_BUFFER
+ // Key frames refresh all slots (0xFF).
+ cm->refresh_mask = cm->frame_type == KEY_FRAME ? 0xFF : get_refresh_mask(cpi);
+#endif
+
+ if (!cm->error_resilient_mode) {
+ aom_wb_write_bit(
+ wb, cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_FORWARD);
+ }
+
+ aom_wb_write_literal(wb, cm->frame_context_idx, FRAME_CONTEXTS_LOG2);
+
+ assert(cm->mib_size == mi_size_wide[cm->sb_size]);
+ assert(cm->mib_size == 1 << cm->mib_size_log2);
+#if CONFIG_EXT_PARTITION
+ assert(cm->sb_size == BLOCK_128X128 || cm->sb_size == BLOCK_64X64);
+ aom_wb_write_bit(wb, cm->sb_size == BLOCK_128X128 ? 1 : 0);
+#else
+ assert(cm->sb_size == BLOCK_64X64);
+#endif // CONFIG_EXT_PARTITION
+
+ encode_loopfilter(cm, wb);
+#if CONFIG_CDEF
+ encode_cdef(cm, wb);
+#endif
+#if CONFIG_LOOP_RESTORATION
+ encode_restoration_mode(cm, wb);
+#endif // CONFIG_LOOP_RESTORATION
+ encode_quantization(cm, wb);
+ encode_segmentation(cm, xd, wb);
+#if CONFIG_DELTA_Q
+ {
+ int i;
+ struct segmentation *const seg = &cm->seg;
+ int segment_quantizer_active = 0;
+ // Delta-q signalling is only written when no segment uses the ALT_Q
+ // feature and the base q index is non-zero.
+ for (i = 0; i < MAX_SEGMENTS; i++) {
+ if (segfeature_active(seg, i, SEG_LVL_ALT_Q)) {
+ segment_quantizer_active = 1;
+ }
+ }
+
+ if (cm->delta_q_present_flag)
+ assert(segment_quantizer_active == 0 && cm->base_qindex > 0);
+ if (segment_quantizer_active == 0 && cm->base_qindex > 0) {
+ aom_wb_write_bit(wb, cm->delta_q_present_flag);
+ if (cm->delta_q_present_flag) {
+ aom_wb_write_literal(wb, OD_ILOG_NZ(cm->delta_q_res) - 1, 2);
+ xd->prev_qindex = cm->base_qindex;
+#if CONFIG_EXT_DELTA_Q
+ assert(seg->abs_delta == SEGMENT_DELTADATA);
+ aom_wb_write_bit(wb, cm->delta_lf_present_flag);
+ if (cm->delta_lf_present_flag) {
+ aom_wb_write_literal(wb, OD_ILOG_NZ(cm->delta_lf_res) - 1, 2);
+ xd->prev_delta_lf_from_base = 0;
+ }
+#endif // CONFIG_EXT_DELTA_Q
+ }
+ }
+ }
+#endif
+
+ write_tx_mode(cm, xd, &cm->tx_mode, wb);
+
+ if (cpi->allow_comp_inter_inter) {
+ const int use_hybrid_pred = cm->reference_mode == REFERENCE_MODE_SELECT;
+#if !CONFIG_REF_ADAPT
+ const int use_compound_pred = cm->reference_mode != SINGLE_REFERENCE;
+#endif // !CONFIG_REF_ADAPT
+
+ aom_wb_write_bit(wb, use_hybrid_pred);
+#if !CONFIG_REF_ADAPT
+ if (!use_hybrid_pred) aom_wb_write_bit(wb, use_compound_pred);
+#endif // !CONFIG_REF_ADAPT
+ }
+
+#if CONFIG_EXT_TX
+ aom_wb_write_bit(wb, cm->reduced_tx_set_used);
+#endif // CONFIG_EXT_TX
+
+ write_tile_info(cm, wb);
+}
+
+#if CONFIG_GLOBAL_MOTION
+static void write_global_motion_params(WarpedMotionParams *params,
+ WarpedMotionParams *ref_params,
+ aom_prob *probs, aom_writer *w,
+ int allow_hp) { /* Codes one global motion model: first the model type
+ * (params->wmtype) as a tree token using probs, then the wmmat[] entries
+ * that this type carries. Each entry is written as a signed value coded
+ * relative to the matching entry of ref_params, after both have been
+ * right-shifted by the precision difference used for that entry.
+ *
+ * The switch cases fall through from the most general types down to
+ * TRANSLATION, so a richer model also emits the entries of the simpler
+ * ones. allow_hp only changes the bit count and shift used for a pure
+ * TRANSLATION model.
+ */
+ TransformationType type = params->wmtype;
+ int trans_bits;
+ int trans_prec_diff;
+ av1_write_token(w, av1_global_motion_types_tree, probs,
+ &global_motion_types_encodings[type]);
+ switch (type) {
+ case HOMOGRAPHY:
+ case HORTRAPEZOID:
+ case VERTRAPEZOID:
+ // wmmat[6] is skipped for HORTRAPEZOID, wmmat[7] for VERTRAPEZOID.
+ if (type != HORTRAPEZOID)
+ aom_write_signed_primitive_refsubexpfin(
+ w, GM_ROW3HOMO_MAX + 1, SUBEXPFIN_K,
+ (ref_params->wmmat[6] >> GM_ROW3HOMO_PREC_DIFF),
+ (params->wmmat[6] >> GM_ROW3HOMO_PREC_DIFF));
+ if (type != VERTRAPEZOID)
+ aom_write_signed_primitive_refsubexpfin(
+ w, GM_ROW3HOMO_MAX + 1, SUBEXPFIN_K,
+ (ref_params->wmmat[7] >> GM_ROW3HOMO_PREC_DIFF),
+ (params->wmmat[7] >> GM_ROW3HOMO_PREC_DIFF));
+ // fallthrough intended
+ case AFFINE:
+ case ROTZOOM:
+ // wmmat[2] (and wmmat[5] below) are coded as an offset from
+ // 1 << GM_ALPHA_PREC_BITS.
+ aom_write_signed_primitive_refsubexpfin(
+ w, GM_ALPHA_MAX + 1, SUBEXPFIN_K,
+ (ref_params->wmmat[2] >> GM_ALPHA_PREC_DIFF) -
+ (1 << GM_ALPHA_PREC_BITS),
+ (params->wmmat[2] >> GM_ALPHA_PREC_DIFF) - (1 << GM_ALPHA_PREC_BITS));
+ if (type != VERTRAPEZOID)
+ aom_write_signed_primitive_refsubexpfin(
+ w, GM_ALPHA_MAX + 1, SUBEXPFIN_K,
+ (ref_params->wmmat[3] >> GM_ALPHA_PREC_DIFF),
+ (params->wmmat[3] >> GM_ALPHA_PREC_DIFF));
+ // wmmat[4] and wmmat[5] are only written for types >= AFFINE.
+ if (type >= AFFINE) {
+ if (type != HORTRAPEZOID)
+ aom_write_signed_primitive_refsubexpfin(
+ w, GM_ALPHA_MAX + 1, SUBEXPFIN_K,
+ (ref_params->wmmat[4] >> GM_ALPHA_PREC_DIFF),
+ (params->wmmat[4] >> GM_ALPHA_PREC_DIFF));
+ aom_write_signed_primitive_refsubexpfin(
+ w, GM_ALPHA_MAX + 1, SUBEXPFIN_K,
+ (ref_params->wmmat[5] >> GM_ALPHA_PREC_DIFF) -
+ (1 << GM_ALPHA_PREC_BITS),
+ (params->wmmat[5] >> GM_ALPHA_PREC_DIFF) -
+ (1 << GM_ALPHA_PREC_BITS));
+ }
+ // fallthrough intended
+ case TRANSLATION:
+ // A pure translation uses its own bit count and precision shift, both
+ // adjusted by one when allow_hp is 0.
+ trans_bits = (type == TRANSLATION) ? GM_ABS_TRANS_ONLY_BITS - !allow_hp
+ : GM_ABS_TRANS_BITS;
+ trans_prec_diff = (type == TRANSLATION)
+ ? GM_TRANS_ONLY_PREC_DIFF + !allow_hp
+ : GM_TRANS_PREC_DIFF;
+ aom_write_signed_primitive_refsubexpfin(
+ w, (1 << trans_bits) + 1, SUBEXPFIN_K,
+ (ref_params->wmmat[0] >> trans_prec_diff),
+ (params->wmmat[0] >> trans_prec_diff));
+ aom_write_signed_primitive_refsubexpfin(
+ w, (1 << trans_bits) + 1, SUBEXPFIN_K,
+ (ref_params->wmmat[1] >> trans_prec_diff),
+ (params->wmmat[1] >> trans_prec_diff));
+ break;
+ case IDENTITY: break; // nothing beyond the type token
+ default: assert(0);
+ }
+}
+
+// Writes the global motion model of every reference frame (LAST_FRAME up to
+// ALTREF_FRAME), each one coded relative to the model the previous frame
+// stored for the same reference.
+static void write_global_motion(AV1_COMP *cpi, aom_writer *w) {
+  AV1_COMMON *const cm = &cpi->common;
+  int ref;
+
+  for (ref = LAST_FRAME; ref <= ALTREF_FRAME; ++ref) {
+    WarpedMotionParams *const cur_model = &cm->global_motion[ref];
+    WarpedMotionParams *const prev_model = &cm->prev_frame->global_motion[ref];
+
+#if !CONFIG_REF_MV
+    // With ref-mv, clearing unused global motion models here is
+    // unsafe, and we need to rely on the recode loop to do it
+    // instead. See av1_find_mv_refs for details.
+    if (!cpi->td.rd_counts.global_motion_used[ref])
+      set_default_warp_params(cur_model);
+#endif
+
+    write_global_motion_params(cur_model, prev_model,
+                               cm->fc->global_motion_types_prob, w,
+                               cm->allow_high_precision_mv);
+  }
+}
+#endif
+
+static uint32_t write_compressed_header(AV1_COMP *cpi, uint8_t *data) {
+ AV1_COMMON *const cm = &cpi->common;
+#if CONFIG_SUPERTX
+ MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+#endif // CONFIG_SUPERTX
+ FRAME_CONTEXT *const fc = cm->fc;
+ FRAME_COUNTS *counts = cpi->td.counts;
+ aom_writer *header_bc;
+ int i, j;
+
+#if CONFIG_TILE_GROUPS
+ const int probwt = cm->num_tg;
+#else
+ const int probwt = 1;
+#endif
+
+#if CONFIG_ANS
+ int header_size;
+ header_bc = &cpi->buf_ans;
+ buf_ans_write_init(header_bc, data);
+#else
+ aom_writer real_header_bc;
+ header_bc = &real_header_bc;
+ aom_start_encode(header_bc, data);
+#endif
+
+#if CONFIG_LOOP_RESTORATION
+ encode_restoration(cm, header_bc);
+#endif // CONFIG_LOOP_RESTORATION
+#if !CONFIG_EC_ADAPT
+ update_txfm_probs(cm, header_bc, counts);
+#endif
+#if CONFIG_LV_MAP
+ av1_write_txb_probs(cpi, header_bc);
+#else
+#if !CONFIG_PVQ
+#if !(CONFIG_EC_ADAPT && CONFIG_NEW_TOKENSET)
+ update_coef_probs(cpi, header_bc);
+#endif // !(CONFIG_EC_ADAPT && CONFIG_NEW_TOKENSET)
+#endif // CONFIG_PVQ
+#endif // CONFIG_LV_MAP
+
+#if CONFIG_VAR_TX
+ update_txfm_partition_probs(cm, header_bc, counts, probwt);
+#endif
+
+ update_skip_probs(cm, header_bc, counts);
+#if !CONFIG_EC_ADAPT && CONFIG_DELTA_Q
+ update_delta_q_probs(cm, header_bc, counts);
+#if CONFIG_EXT_DELTA_Q
+ update_delta_lf_probs(cm, header_bc, counts);
+#endif
+#endif
+#if !CONFIG_EC_ADAPT
+ update_seg_probs(cpi, header_bc);
+
+ for (i = 0; i < INTRA_MODES; ++i) {
+ prob_diff_update(av1_intra_mode_tree, fc->uv_mode_prob[i],
+ counts->uv_mode[i], INTRA_MODES, probwt, header_bc);
+ }
+
+#if CONFIG_EXT_PARTITION_TYPES
+ for (i = 0; i < PARTITION_PLOFFSET; ++i)
+ prob_diff_update(av1_partition_tree, fc->partition_prob[i],
+ counts->partition[i], PARTITION_TYPES, probwt, header_bc);
+ for (; i < PARTITION_CONTEXTS_PRIMARY; ++i)
+ prob_diff_update(av1_ext_partition_tree, fc->partition_prob[i],
+ counts->partition[i], EXT_PARTITION_TYPES, probwt,
+ header_bc);
+#else
+ for (i = 0; i < PARTITION_CONTEXTS_PRIMARY; ++i)
+ prob_diff_update(av1_partition_tree, fc->partition_prob[i],
+ counts->partition[i], PARTITION_TYPES, probwt, header_bc);
+#endif // CONFIG_EXT_PARTITION_TYPES
+#if CONFIG_UNPOISON_PARTITION_CTX
+ for (; i < PARTITION_CONTEXTS_PRIMARY + PARTITION_BLOCK_SIZES; ++i) {
+ unsigned int ct[2] = { counts->partition[i][PARTITION_VERT],
+ counts->partition[i][PARTITION_SPLIT] };
+ assert(counts->partition[i][PARTITION_NONE] == 0);
+ assert(counts->partition[i][PARTITION_HORZ] == 0);
+ assert(fc->partition_prob[i][PARTITION_NONE] == 0);
+ assert(fc->partition_prob[i][PARTITION_HORZ] == 0);
+ av1_cond_prob_diff_update(header_bc, &fc->partition_prob[i][PARTITION_VERT],
+ ct, probwt);
+ }
+ for (; i < PARTITION_CONTEXTS_PRIMARY + 2 * PARTITION_BLOCK_SIZES; ++i) {
+ unsigned int ct[2] = { counts->partition[i][PARTITION_HORZ],
+ counts->partition[i][PARTITION_SPLIT] };
+ assert(counts->partition[i][PARTITION_NONE] == 0);
+ assert(counts->partition[i][PARTITION_VERT] == 0);
+ assert(fc->partition_prob[i][PARTITION_NONE] == 0);
+ assert(fc->partition_prob[i][PARTITION_VERT] == 0);
+ av1_cond_prob_diff_update(header_bc, &fc->partition_prob[i][PARTITION_HORZ],
+ ct, probwt);
+ }
+#endif
+#if CONFIG_EXT_INTRA && CONFIG_INTRA_INTERP
+ for (i = 0; i < INTRA_FILTERS + 1; ++i)
+ prob_diff_update(av1_intra_filter_tree, fc->intra_filter_probs[i],
+ counts->intra_filter[i], INTRA_FILTERS, probwt, header_bc);
+#endif // CONFIG_EXT_INTRA && CONFIG_INTRA_INTERP
+#endif // !CONFIG_EC_ADAPT
+
+ if (frame_is_intra_only(cm)) {
+ av1_copy(cm->kf_y_prob, av1_kf_y_mode_prob);
+#if CONFIG_EC_MULTISYMBOL
+ av1_copy(cm->fc->kf_y_cdf, av1_kf_y_mode_cdf);
+#endif
+
+#if !CONFIG_EC_ADAPT
+ for (i = 0; i < INTRA_MODES; ++i)
+ for (j = 0; j < INTRA_MODES; ++j)
+ prob_diff_update(av1_intra_mode_tree, cm->kf_y_prob[i][j],
+ counts->kf_y_mode[i][j], INTRA_MODES, probwt,
+ header_bc);
+#endif // CONFIG_EC_ADAPT
+ } else {
+#if CONFIG_REF_MV
+ update_inter_mode_probs(cm, header_bc, counts);
+#else
+#if !CONFIG_EC_ADAPT
+ for (i = 0; i < INTER_MODE_CONTEXTS; ++i) {
+ prob_diff_update(av1_inter_mode_tree, cm->fc->inter_mode_probs[i],
+ counts->inter_mode[i], INTER_MODES, probwt, header_bc);
+ }
+#endif
+#endif
+#if CONFIG_EXT_INTER
+ update_inter_compound_mode_probs(cm, probwt, header_bc);
+
+ if (cm->reference_mode != COMPOUND_REFERENCE) {
+ for (i = 0; i < BLOCK_SIZE_GROUPS; i++) {
+ if (is_interintra_allowed_bsize_group(i)) {
+ av1_cond_prob_diff_update(header_bc, &fc->interintra_prob[i],
+ cm->counts.interintra[i], probwt);
+ }
+ }
+ for (i = 0; i < BLOCK_SIZE_GROUPS; i++) {
+ prob_diff_update(
+ av1_interintra_mode_tree, cm->fc->interintra_mode_prob[i],
+ counts->interintra_mode[i], INTERINTRA_MODES, probwt, header_bc);
+ }
+ for (i = 0; i < BLOCK_SIZES; i++) {
+ if (is_interintra_allowed_bsize(i) && is_interintra_wedge_used(i))
+ av1_cond_prob_diff_update(header_bc, &fc->wedge_interintra_prob[i],
+ cm->counts.wedge_interintra[i], probwt);
+ }
+ }
+#if CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE
+ if (cm->reference_mode != SINGLE_REFERENCE) {
+ for (i = 0; i < BLOCK_SIZES; i++)
+ prob_diff_update(av1_compound_type_tree, fc->compound_type_prob[i],
+ cm->counts.compound_interinter[i], COMPOUND_TYPES,
+ probwt, header_bc);
+ }
+#endif // CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE
+#endif // CONFIG_EXT_INTER
+
+#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+ for (i = BLOCK_8X8; i < BLOCK_SIZES; ++i)
+ prob_diff_update(av1_motion_mode_tree, fc->motion_mode_prob[i],
+ counts->motion_mode[i], MOTION_MODES, probwt, header_bc);
+#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+#if !CONFIG_EC_ADAPT
+ if (cm->interp_filter == SWITCHABLE)
+ update_switchable_interp_probs(cm, header_bc, counts);
+#endif
+
+ for (i = 0; i < INTRA_INTER_CONTEXTS; i++)
+ av1_cond_prob_diff_update(header_bc, &fc->intra_inter_prob[i],
+ counts->intra_inter[i], probwt);
+
+ if (cpi->allow_comp_inter_inter) {
+ const int use_hybrid_pred = cm->reference_mode == REFERENCE_MODE_SELECT;
+ if (use_hybrid_pred)
+ for (i = 0; i < COMP_INTER_CONTEXTS; i++)
+ av1_cond_prob_diff_update(header_bc, &fc->comp_inter_prob[i],
+ counts->comp_inter[i], probwt);
+ }
+
+ if (cm->reference_mode != COMPOUND_REFERENCE) {
+ for (i = 0; i < REF_CONTEXTS; i++) {
+ for (j = 0; j < (SINGLE_REFS - 1); j++) {
+ av1_cond_prob_diff_update(header_bc, &fc->single_ref_prob[i][j],
+ counts->single_ref[i][j], probwt);
+ }
+ }
+ }
+ if (cm->reference_mode != SINGLE_REFERENCE) {
+ for (i = 0; i < REF_CONTEXTS; i++) {
+#if CONFIG_EXT_REFS
+ for (j = 0; j < (FWD_REFS - 1); j++) {
+ av1_cond_prob_diff_update(header_bc, &fc->comp_ref_prob[i][j],
+ counts->comp_ref[i][j], probwt);
+ }
+ for (j = 0; j < (BWD_REFS - 1); j++) {
+ av1_cond_prob_diff_update(header_bc, &fc->comp_bwdref_prob[i][j],
+ counts->comp_bwdref[i][j], probwt);
+ }
+#else
+ for (j = 0; j < (COMP_REFS - 1); j++) {
+ av1_cond_prob_diff_update(header_bc, &fc->comp_ref_prob[i][j],
+ counts->comp_ref[i][j], probwt);
+ }
+#endif // CONFIG_EXT_REFS
+ }
+ }
+
+#if !CONFIG_EC_ADAPT
+ for (i = 0; i < BLOCK_SIZE_GROUPS; ++i) {
+ prob_diff_update(av1_intra_mode_tree, cm->fc->y_mode_prob[i],
+ counts->y_mode[i], INTRA_MODES, probwt, header_bc);
+ }
+#endif
+
+ av1_write_nmv_probs(cm, cm->allow_high_precision_mv, header_bc,
+#if CONFIG_REF_MV
+ counts->mv);
+#else
+ &counts->mv);
+#endif
+#if !CONFIG_EC_ADAPT
+ update_ext_tx_probs(cm, header_bc);
+#endif
+#if CONFIG_SUPERTX
+ if (!xd->lossless[0]) update_supertx_probs(cm, probwt, header_bc);
+#endif // CONFIG_SUPERTX
+#if CONFIG_GLOBAL_MOTION
+ write_global_motion(cpi, header_bc);
+#endif // CONFIG_GLOBAL_MOTION
+ }
+#if CONFIG_EC_MULTISYMBOL
+#if !CONFIG_EC_ADAPT
+#if CONFIG_NEW_TOKENSET
+ av1_coef_head_cdfs(fc);
+#endif
+ av1_coef_pareto_cdfs(fc);
+#if CONFIG_REF_MV
+ for (i = 0; i < NMV_CONTEXTS; ++i) av1_set_mv_cdfs(&fc->nmvc[i]);
+#else
+ av1_set_mv_cdfs(&fc->nmvc);
+#endif
+#if CONFIG_EC_MULTISYMBOL
+ av1_set_mode_cdfs(cm);
+#endif
+#endif // !CONFIG_EC_ADAPT
+#endif
+#if CONFIG_ANS
+ aom_buf_ans_flush(header_bc);
+ header_size = buf_ans_write_end(header_bc);
+ assert(header_size <= 0xffff);
+ return header_size;
+#else
+ aom_stop_encode(header_bc);
+ assert(header_bc->pos <= 0xffff);
+ return header_bc->pos;
+#endif // CONFIG_ANS
+}
+
+static int choose_size_bytes(uint32_t size, int spare_msbs) {
+ // Choose the number of bytes (1..4) required to represent size, without
+ // using the 'spare_msbs' number of most significant bits.
+ // Returns -1 when size does not fit in the remaining (32 - spare_msbs) bits.
+ // Make sure we will fit in 4 bytes to start with; the spare_msbs > 0 test also keeps the shift count below 32.
+ if (spare_msbs > 0 && size >> (32 - spare_msbs) != 0) return -1;
+
+ // Normalise to 32 bits: left-align so the byte tests below account for the reserved MSBs.
+ size <<= spare_msbs;
+
+ if (size >> 24 != 0)
+ return 4;
+ else if (size >> 16 != 0)
+ return 3;
+ else if (size >> 8 != 0)
+ return 2;
+ else
+ return 1;
+}
+
+static void mem_put_varsize(uint8_t *const dst, const int sz, const int val) {  // Stores val at dst using sz bytes (sz must be 1..4).
+ switch (sz) {
+ case 1: dst[0] = (uint8_t)(val & 0xff); break;  // single byte: keep only the low 8 bits
+ case 2: mem_put_le16(dst, val); break;
+ case 3: mem_put_le24(dst, val); break;
+ case 4: mem_put_le32(dst, val); break;
+ default: assert(0 && "Invalid size"); break;  // any other width is a caller error; nothing is written
+ }
+}
+static int remux_tiles(const AV1_COMMON *const cm, uint8_t *dst,  // Re-packs the tile data at dst in place so size fields use fewer than 4 bytes where possible.
+ const uint32_t data_size, const uint32_t max_tile_size,  // data_size: bytes currently at dst; max_tile_size selects the tile size field width (tsb).
+ const uint32_t max_tile_col_size,  // selects the tile column size field width (tcsb); only used with CONFIG_EXT_TILE
+ int *const tile_size_bytes,  // out: receives tsb
+ int *const tile_col_size_bytes) {  // out: receives tcsb. Returns the data size in bytes after re-packing.
+// Choose the tile size bytes (tsb) and tile column size bytes (tcsb)
+#if CONFIG_EXT_TILE
+ // The top bit in the tile size field indicates tile copy mode, so we
+ // have 1 less bit to code the tile size
+ const int tsb = choose_size_bytes(max_tile_size, 1);
+ const int tcsb = choose_size_bytes(max_tile_col_size, 0);
+#else
+ const int tsb = choose_size_bytes(max_tile_size, 0);
+ const int tcsb = 4; // This is ignored
+ (void)max_tile_col_size;
+#endif // CONFIG_EXT_TILE
+
+ assert(tsb > 0);
+ assert(tcsb > 0);
+
+ *tile_size_bytes = tsb;
+ *tile_col_size_bytes = tcsb;
+
+ if (tsb == 4 && tcsb == 4) {  // fields keep their 4-byte width: nothing to rewrite
+ return data_size;
+ } else {
+ uint32_t wpos = 0;  // write cursor into dst (compacted stream)
+ uint32_t rpos = 0;  // read cursor into dst (stream with 4-byte fields)
+
+#if CONFIG_EXT_TILE
+ int tile_row;
+ int tile_col;
+
+ for (tile_col = 0; tile_col < cm->tile_cols; tile_col++) {
+ // All but the last column has a column header
+ if (tile_col < cm->tile_cols - 1) {
+ uint32_t tile_col_size = mem_get_le32(dst + rpos);
+ rpos += 4;
+
+ // Adjust the tile column size by the number of bytes removed
+ // from the tile size fields.
+ tile_col_size -= (4 - tsb) * cm->tile_rows;
+
+ mem_put_varsize(dst + wpos, tcsb, tile_col_size);
+ wpos += tcsb;
+ }
+
+ for (tile_row = 0; tile_row < cm->tile_rows; tile_row++) {
+ // All, including the last row has a header
+ uint32_t tile_header = mem_get_le32(dst + rpos);
+ rpos += 4;
+
+ // If this is a copy tile, we need to shift the MSB to the
+ // top bit of the new width, and there is no data to copy.
+ if (tile_header >> 31 != 0) {
+ if (tsb < 4) tile_header >>= 32 - 8 * tsb;
+ mem_put_varsize(dst + wpos, tsb, tile_header);
+ wpos += tsb;
+ } else {
+ mem_put_varsize(dst + wpos, tsb, tile_header);
+ wpos += tsb;
+
+ memmove(dst + wpos, dst + rpos, tile_header);  // memmove: source and destination ranges are in the same buffer
+ rpos += tile_header;
+ wpos += tile_header;
+ }
+ }
+ }
+#else
+ const int n_tiles = cm->tile_cols * cm->tile_rows;
+ int n;
+
+ for (n = 0; n < n_tiles; n++) {
+ int tile_size;
+
+ if (n == n_tiles - 1) {  // the last tile has no size field; it extends to the end of the data
+ tile_size = data_size - rpos;
+ } else {
+ tile_size = mem_get_le32(dst + rpos);
+ rpos += 4;
+ mem_put_varsize(dst + wpos, tsb, tile_size);
+ wpos += tsb;
+ }
+
+ memmove(dst + wpos, dst + rpos, tile_size);  // memmove: source and destination ranges are in the same buffer
+
+ rpos += tile_size;
+ wpos += tile_size;
+ }
+#endif // CONFIG_EXT_TILE
+
+ assert(rpos > wpos);  // at least one field must have shrunk on this path
+ assert(rpos == data_size);  // every input byte must have been consumed
+
+ return wpos;
+ }
+}
+
+void av1_pack_bitstream(AV1_COMP *const cpi, uint8_t *dst, size_t *size) {  // Writes the frame's headers and tile data to dst and stores the number of bytes written in *size.
+ uint8_t *data = dst;  // advances past each section as it is written
+#if !CONFIG_TILE_GROUPS
+ uint32_t compressed_header_size;
+ uint32_t uncompressed_header_size;
+ struct aom_write_bit_buffer saved_wb;
+#endif
+ uint32_t data_size;
+ struct aom_write_bit_buffer wb = { data, 0 };
+
+ unsigned int max_tile_size;
+ unsigned int max_tile_col_size;
+
+#if CONFIG_BITSTREAM_DEBUG
+ bitstream_queue_reset_write();
+#endif
+
+#if !CONFIG_TILE_GROUPS
+ int tile_size_bytes;  // only set (by remux_tiles) and read when have_tiles is true
+ int tile_col_size_bytes;  // only set (by remux_tiles) and read when have_tiles is true
+ AV1_COMMON *const cm = &cpi->common;
+ const int have_tiles = cm->tile_cols * cm->tile_rows > 1;
+
+ // Write the uncompressed header
+ write_uncompressed_header(cpi, &wb);
+
+#if CONFIG_EXT_REFS
+ if (cm->show_existing_frame) {  // only the uncompressed header is emitted in this case
+ *size = aom_wb_bytes_written(&wb);
+ return;
+ }
+#endif // CONFIG_EXT_REFS
+
+ // We do not know these in advance. Output placeholder bit.
+ saved_wb = wb;  // remembers the bit position of the placeholders so they can be overwritten later
+ // Write tile size magnitudes
+ if (have_tiles) {
+// Note that the last item in the uncompressed header is the data
+// describing tile configuration.
+#if CONFIG_EXT_TILE
+ // Number of bytes in tile column size - 1
+ aom_wb_write_literal(&wb, 0, 2);
+#endif // CONFIG_EXT_TILE
+ // Number of bytes in tile size - 1
+ aom_wb_write_literal(&wb, 0, 2);
+ }
+ // Size of compressed header
+ aom_wb_write_literal(&wb, 0, 16);
+
+ uncompressed_header_size = (uint32_t)aom_wb_bytes_written(&wb);
+ data += uncompressed_header_size;
+
+ aom_clear_system_state();
+
+ // Write the compressed header
+ compressed_header_size = write_compressed_header(cpi, data);
+ data += compressed_header_size;
+
+ // Write the encoded tile data
+ data_size = write_tiles(cpi, data, &max_tile_size, &max_tile_col_size);
+#else
+ data_size = write_tiles(cpi, &wb, &max_tile_size, &max_tile_col_size);
+#endif
+#if !CONFIG_TILE_GROUPS
+ if (have_tiles) {  // shrink the 4-byte size fields written by write_tiles where possible
+ data_size =
+ remux_tiles(cm, data, data_size, max_tile_size, max_tile_col_size,
+ &tile_size_bytes, &tile_col_size_bytes);
+ }
+
+ data += data_size;
+
+ // Now fill in the gaps in the uncompressed header.
+ if (have_tiles) {
+#if CONFIG_EXT_TILE
+ assert(tile_col_size_bytes >= 1 && tile_col_size_bytes <= 4);
+ aom_wb_write_literal(&saved_wb, tile_col_size_bytes - 1, 2);
+#endif // CONFIG_EXT_TILE
+ assert(tile_size_bytes >= 1 && tile_size_bytes <= 4);
+ aom_wb_write_literal(&saved_wb, tile_size_bytes - 1, 2);
+ }
+ // TODO(jbb): Figure out what to do if compressed_header_size > 16 bits.
+ assert(compressed_header_size <= 0xffff);
+ aom_wb_write_literal(&saved_wb, compressed_header_size, 16);
+#else
+ data += data_size;
+#endif
+#if CONFIG_ANS && ANS_REVERSE
+ // Avoid aliasing the superframe index
+ *data++ = 0;
+#endif
+ *size = data - dst;
+}
diff --git a/third_party/aom/av1/encoder/bitstream.h b/third_party/aom/av1/encoder/bitstream.h
new file mode 100644
index 0000000000..c75d80891b
--- /dev/null
+++ b/third_party/aom/av1/encoder/bitstream.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_ENCODER_BITSTREAM_H_
+#define AV1_ENCODER_BITSTREAM_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "av1/encoder/encoder.h"
+
+#if CONFIG_REFERENCE_BUFFER
+void write_sequence_header(SequenceHeader *seq_params);
+#endif
+
+void av1_pack_bitstream(AV1_COMP *const cpi, uint8_t *dest, size_t *size);
+
+void av1_encode_token_init(void);
+
+static INLINE int av1_preserve_existing_gf(AV1_COMP *cpi) {  // Nonzero only when multi-ARF is not allowed and the source frame is an alt-ref, plus one config-specific condition below.
+#if CONFIG_EXT_REFS
+ // Do not swap gf and arf indices for internal overlay frames
+ return !cpi->multi_arf_allowed && cpi->rc.is_src_frame_alt_ref &&
+ !cpi->rc.is_src_frame_ext_arf;
+#else
+ return !cpi->multi_arf_allowed && cpi->refresh_golden_frame &&  // without EXT_REFS the golden frame must also be refreshing
+ cpi->rc.is_src_frame_alt_ref;
+#endif // CONFIG_EXT_REFS
+}
+
+void av1_write_tx_type(const AV1_COMMON *const cm, const MACROBLOCKD *xd,
+#if CONFIG_SUPERTX
+ const int supertx_enabled,
+#endif
+#if CONFIG_TXK_SEL
+ int block, int plane,
+#endif
+ aom_writer *w);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AV1_ENCODER_BITSTREAM_H_
diff --git a/third_party/aom/av1/encoder/block.h b/third_party/aom/av1/encoder/block.h
new file mode 100644
index 0000000000..39e08d5b4c
--- /dev/null
+++ b/third_party/aom/av1/encoder/block.h
@@ -0,0 +1,241 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_ENCODER_BLOCK_H_
+#define AV1_ENCODER_BLOCK_H_
+
+#include "av1/common/entropymv.h"
+#include "av1/common/entropy.h"
+#if CONFIG_PVQ
+#include "av1/encoder/encint.h"
+#endif
+#if CONFIG_REF_MV
+#include "av1/common/mvref_common.h"
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if CONFIG_PVQ
+// Maximum possible # of tx blocks in luma plane, which is currently 256,
+// since there can be 16x16 of 4x4 tx.
+#define MAX_PVQ_BLOCKS_IN_SB (MAX_SB_SQUARE >> 2 * OD_LOG_BSIZE0)
+#endif
+
+typedef struct {  // NOTE(review): presumably per-block difference statistics; its users are not visible here — confirm.
+ unsigned int sse;  // presumably sum of squared errors — confirm
+ int sum;
+ unsigned int var;  // presumably variance — confirm
+} DIFF;
+
+typedef struct macroblock_plane {  // Per-plane encoder state: difference buffer, coefficient/eob pointers, source buffer and quantizer table pointers.
+ DECLARE_ALIGNED(16, int16_t, src_diff[MAX_SB_SQUARE]);
+#if CONFIG_PVQ
+ DECLARE_ALIGNED(16, int16_t, src_int16[MAX_SB_SQUARE]);
+#endif
+ tran_low_t *qcoeff;
+ tran_low_t *coeff;
+ uint16_t *eobs;
+#if CONFIG_LV_MAP
+ uint8_t *txb_entropy_ctx;
+#endif
+ struct buf_2d src;
+
+ // Quantizer settings
+ const int16_t *quant_fp;
+ const int16_t *round_fp;
+ const int16_t *quant;
+ const int16_t *quant_shift;
+ const int16_t *zbin;
+ const int16_t *round;
+#if CONFIG_NEW_QUANT
+ const cuml_bins_type_nuq *cuml_bins_nuq[QUANT_PROFILES];
+#endif // CONFIG_NEW_QUANT
+} MACROBLOCK_PLANE;
+
+/* The [2] dimension is for whether we skip the EOB node (i.e. if previous
+ * coefficient in this block was zero) or not. */
+typedef unsigned int av1_coeff_cost[PLANE_TYPES][REF_TYPES][COEF_BANDS][2]
+ [COEFF_CONTEXTS][ENTROPY_TOKENS];
+
+typedef struct {  // Extra per-block mode information: reference MV candidates and mode contexts per reference frame, plus config-dependent buffers.
+ int_mv ref_mvs[MODE_CTX_REF_FRAMES][MAX_MV_REF_CANDIDATES];
+ int16_t mode_context[MODE_CTX_REF_FRAMES];
+#if CONFIG_LV_MAP
+ // TODO(angiebird): Reduce the buffer size according to sb_type
+ tran_low_t tcoeff[MAX_MB_PLANE][MAX_SB_SQUARE];
+ uint16_t eobs[MAX_MB_PLANE][MAX_SB_SQUARE / (TX_SIZE_W_MIN * TX_SIZE_H_MIN)];  // one entry per minimum-size transform block of a superblock
+ uint8_t txb_skip_ctx[MAX_MB_PLANE]
+ [MAX_SB_SQUARE / (TX_SIZE_W_MIN * TX_SIZE_H_MIN)];
+ int dc_sign_ctx[MAX_MB_PLANE]
+ [MAX_SB_SQUARE / (TX_SIZE_W_MIN * TX_SIZE_H_MIN)];
+#endif
+#if CONFIG_REF_MV
+ uint8_t ref_mv_count[MODE_CTX_REF_FRAMES];
+ CANDIDATE_MV ref_mv_stack[MODE_CTX_REF_FRAMES][MAX_REF_MV_STACK_SIZE];
+#if CONFIG_EXT_INTER
+ int16_t compound_mode_context[MODE_CTX_REF_FRAMES];
+#endif // CONFIG_EXT_INTER
+#endif
+} MB_MODE_INFO_EXT;
+
+typedef struct {  // Inclusive-looking min/max bounds for the column and row components; used as the type of MACROBLOCK::mv_limits. NOTE(review): units and inclusiveness are not shown here — confirm.
+ int col_min;
+ int col_max;
+ int row_min;
+ int row_max;
+} MvLimits;
+
+#if CONFIG_PALETTE
+typedef struct {  // Superblock-sized scratch buffers for palette mode (CONFIG_PALETTE); presumably used during palette search — confirm.
+ uint8_t best_palette_color_map[MAX_SB_SQUARE];
+ float kmeans_data_buf[2 * MAX_SB_SQUARE];  // twice the superblock area; NOTE(review): the reason for the factor 2 is not visible here
+} PALETTE_BUFFER;
+#endif // CONFIG_PALETTE
+
+typedef struct macroblock MACROBLOCK;
+struct macroblock {  // Encoder-side state for the block being coded: per-plane data, RD parameters, MV cost tables and config-dependent scratch data.
+ struct macroblock_plane plane[MAX_MB_PLANE];
+
+ MACROBLOCKD e_mbd;
+ MB_MODE_INFO_EXT *mbmi_ext;
+ int skip_block;
+ int qindex;
+
+ // The equivalent error at the current rdmult of one whole bit (not one
+ // bitcost unit).
+ int errorperbit;
+ // The equivalent SAD error of one (whole) bit at the current quantizer
+ // for large blocks.
+ int sadperbit16;
+ // The equivalent SAD error of one (whole) bit at the current quantizer
+ // for sub-8x8 blocks.
+ int sadperbit4;
+ int rddiv;
+ int rdmult;
+ int mb_energy;
+ int *m_search_count_ptr;
+ int *ex_search_count_ptr;
+
+#if CONFIG_VAR_TX
+ unsigned int txb_split_count;
+#endif
+
+ // These are set to their default values at the beginning, and then adjusted
+ // further in the encoding process.
+ BLOCK_SIZE min_partition_size;
+ BLOCK_SIZE max_partition_size;
+
+ int mv_best_ref_index[TOTAL_REFS_PER_FRAME];
+ unsigned int max_mv_context[TOTAL_REFS_PER_FRAME];
+ unsigned int source_variance;
+ unsigned int pred_sse[TOTAL_REFS_PER_FRAME];
+ int pred_mv_sad[TOTAL_REFS_PER_FRAME];
+
+#if CONFIG_REF_MV
+ int *nmvjointcost;
+ int nmv_vec_cost[NMV_CONTEXTS][MV_JOINTS];
+ int *nmvcost[NMV_CONTEXTS][2];
+ int *nmvcost_hp[NMV_CONTEXTS][2];
+ int **mv_cost_stack[NMV_CONTEXTS];
+ int *nmvjointsadcost;
+#else
+ int nmvjointcost[MV_JOINTS];
+ int *nmvcost[2];
+ int *nmvcost_hp[2];
+ int nmvjointsadcost[MV_JOINTS];
+#endif
+
+ int **mvcost;
+ int *nmvsadcost[2];
+ int *nmvsadcost_hp[2];
+ int **mvsadcost;
+#if CONFIG_MOTION_VAR
+ int32_t *wsrc_buf;
+ int32_t *mask_buf;
+#endif // CONFIG_MOTION_VAR
+
+#if CONFIG_PALETTE
+ PALETTE_BUFFER *palette_buffer;
+#endif // CONFIG_PALETTE
+
+ // These define limits to motion vector components to prevent them
+ // from extending outside the UMV borders
+ MvLimits mv_limits;
+
+#if CONFIG_VAR_TX
+ uint8_t blk_skip[MAX_MB_PLANE][MAX_MIB_SIZE * MAX_MIB_SIZE * 8];
+#if CONFIG_REF_MV
+ uint8_t blk_skip_drl[MAX_MB_PLANE][MAX_MIB_SIZE * MAX_MIB_SIZE * 8];
+#endif
+#endif
+
+ int skip;
+
+#if CONFIG_CB4X4
+ int skip_chroma_rd;
+#endif
+
+ // note that token_costs is the cost when eob node is skipped
+ av1_coeff_cost token_costs[TX_SIZES];
+
+ int optimize;
+
+ // Used to store sub partition's choices.
+ MV pred_mv[TOTAL_REFS_PER_FRAME];
+
+ // Store the best motion vector during motion search
+ int_mv best_mv;
+ // Store the second best motion vector during full-pixel motion search
+ int_mv second_best_mv;
+
+ // use default transform and skip transform type search for intra modes
+ int use_default_intra_tx_type;
+ // use default transform and skip transform type search for inter modes
+ int use_default_inter_tx_type;
+#if CONFIG_PVQ
+ int rate;
+ // 1 if neither AC nor DC is coded. Only used during RDO.
+ int pvq_skip[MAX_MB_PLANE];
+ PVQ_QUEUE *pvq_q;
+
+ // Storage for PVQ tx block encodings in a superblock.
+ // There can be max 16x16 of 4x4 blocks (and YUV) encoded by PVQ
+ // 256 is the max # of 4x4 blocks in a SB (64x64), which comes from:
+ // 1) Since PVQ is applied to each transformed block
+ // 2) 4x4 is the smallest tx size in AV1
+ // 3) AV1 allows using smaller tx size than block (i.e. partition) size
+ // TODO(yushin) : The memory usage could be improved a lot, since this has
+ // storage for 10 bands and 128 coefficients for every 4x4 block,
+ PVQ_INFO pvq[MAX_PVQ_BLOCKS_IN_SB][MAX_MB_PLANE];
+ daala_enc_ctx daala_enc;
+ int pvq_speed;
+ int pvq_coded; // Indicates whether pvq_info needs to be stored to tokenize
+#endif
+#if CONFIG_DAALA_DIST
+ // Keep rate of each 4x4 block in the current macroblock during RDO
+ // This is needed when using the 8x8 Daala distortion metric during RDO,
+ // because it evaluates distortion in a different order than the underlying
+ // 4x4 blocks are coded.
+ int rate_4x4[256];
+#endif
+#if CONFIG_CFL
+ // Whether luma needs to be stored during RDO.
+ int cfl_store_y;
+#endif
+};
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AV1_ENCODER_BLOCK_H_
diff --git a/third_party/aom/av1/encoder/blockiness.c b/third_party/aom/av1/encoder/blockiness.c
new file mode 100644
index 0000000000..113ceb29d2
--- /dev/null
+++ b/third_party/aom/av1/encoder/blockiness.c
@@ -0,0 +1,142 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "./av1_rtcd.h"
+#include "./aom_config.h"
+#include "./aom_dsp_rtcd.h"
+#include "av1/common/common.h"
+#include "av1/common/filter.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_convolve.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_ports/mem.h"
+#include "aom_ports/system_state.h"
+
+static int horizontal_filter(const uint8_t *s) {  // Applies taps [-2, 6, -6, 2] to s[-2], s[-1], s[0], s[1] (same row).
+ return (s[1] - s[-2]) * 2 + (s[-1] - s[0]) * 6;
+}
+
+static int vertical_filter(const uint8_t *s, int p) {  // Applies taps [-2, 6, -6, 2] to s[-2p], s[-p], s[0], s[p]; p is the offset between vertically adjacent samples.
+ return (s[p] - s[-2 * p]) * 2 + (s[-p] - s[0]) * 6;
+}
+
+static int variance(int sum, int sum_squared, int size) {  // Integer variance estimate: mean of squares minus squared mean; each division truncates.
+ return sum_squared / size - (sum / size) * (sum / size);
+}
+// Calculate a blockiness level for a vertical block edge.
+// This function returns a new blockiness metric that's defined as
+
+// p0 p1 p2 p3
+// q0 q1 q2 q3
+// block edge ->
+// r0 r1 r2 r3
+// s0 s1 s2 s3
+
+// blockiness = p0*-2+q0*6+r0*-6+s0*2 +
+// p1*-2+q1*6+r1*-6+s1*2 +
+// p2*-2+q2*6+r2*-6+s2*2 +
+// p3*-2+q3*6+r3*-6+s3*2 ;
+
+// reconstructed_blockiness = max(abs(blockiness from reconstructed buffer) -
+// abs(blockiness from source buffer), 0)
+//
+// I make the assumption that flat blocks are much more visible than high
+// contrast blocks. As such, I scale the result of the blockiness calc
+// by dividing the blockiness by the variance of the pixels on either side
+// of the edge as follows:
+// var_0 = (q0^2+q1^2+q2^2+q3^2) / 4 - ((q0 + q1 + q2 + q3) / 4 )^2
+// var_1 = (r0^2+r1^2+r2^2+r3^2) / 4 - ((r0 + r1 + r2 + r3) / 4 )^2
+// The returned blockiness is the scaled value
+// Reconstructed blockiness / ( 1 + var_0 + var_1 ) ;
+static int blockiness_vertical(const uint8_t *s, int sp, const uint8_t *r,  // s/sp and r/rp: the two buffers and their row strides; size: number of rows along the edge
+ int rp, int size) {
+ int s_blockiness = 0;
+ int r_blockiness = 0;
+ int sum_0 = 0;
+ int sum_sq_0 = 0;
+ int sum_1 = 0;
+ int sum_sq_1 = 0;
+ int i;
+ int var_0;
+ int var_1;
+ for (i = 0; i < size; ++i, s += sp, r += rp) {  // step down one row per iteration in both buffers
+ s_blockiness += horizontal_filter(s);
+ r_blockiness += horizontal_filter(r);
+ sum_0 += s[0];  // variance statistics are taken from the s buffer only
+ sum_sq_0 += s[0] * s[0];
+ sum_1 += s[-1];  // the sample immediately left of the edge
+ sum_sq_1 += s[-1] * s[-1];
+ }
+ var_0 = variance(sum_0, sum_sq_0, size);
+ var_1 = variance(sum_1, sum_sq_1, size);
+ r_blockiness = abs(r_blockiness);
+ s_blockiness = abs(s_blockiness);
+
+ if (r_blockiness > s_blockiness)  // only blockiness in r that exceeds the level in s is reported
+ return (r_blockiness - s_blockiness) / (1 + var_0 + var_1);
+ else
+ return 0;
+}
+
+// Calculate a blockiness level for a horizontal block edge
+// same as above, but filtering vertically and walking along the row.
+static int blockiness_horizontal(const uint8_t *s, int sp, const uint8_t *r,  // s/sp and r/rp: the two buffers and their row strides; size: number of columns along the edge
+ int rp, int size) {
+ int s_blockiness = 0;
+ int r_blockiness = 0;
+ int sum_0 = 0;
+ int sum_sq_0 = 0;
+ int sum_1 = 0;
+ int sum_sq_1 = 0;
+ int i;
+ int var_0;
+ int var_1;
+ for (i = 0; i < size; ++i, ++s, ++r) {  // step right one column per iteration in both buffers
+ s_blockiness += vertical_filter(s, sp);
+ r_blockiness += vertical_filter(r, rp);
+ sum_0 += s[0];  // variance statistics are taken from the s buffer only
+ sum_sq_0 += s[0] * s[0];
+ sum_1 += s[-sp];  // the sample immediately above the edge
+ sum_sq_1 += s[-sp] * s[-sp];
+ }
+ var_0 = variance(sum_0, sum_sq_0, size);
+ var_1 = variance(sum_1, sum_sq_1, size);
+ r_blockiness = abs(r_blockiness);
+ s_blockiness = abs(s_blockiness);
+
+ if (r_blockiness > s_blockiness)  // only blockiness in r that exceeds the level in s is reported
+ return (r_blockiness - s_blockiness) / (1 + var_0 + var_1);
+ else
+ return 0;
+}
+
+// This function returns the blockiness for the entire frame, currently by
+// looking at all borders in steps of 4. img1 is passed as 's' and img2 as 'r' to the edge helpers.
+double av1_get_blockiness(const unsigned char *img1, int img1_pitch,
+ const unsigned char *img2, int img2_pitch, int width,
+ int height) {
+ double blockiness = 0;
+ int i, j;
+ aom_clear_system_state();
+ for (i = 0; i < height;
+ i += 4, img1 += img1_pitch * 4, img2 += img2_pitch * 4) {  // both image pointers advance four rows per iteration
+ for (j = 0; j < width; j += 4) {
+ if (i > 0 && i < height && j > 0 && j < width) {  // skips row 0 and column 0 (the filters read up to 2 samples above/left); the '<' tests repeat the loop bounds
+ blockiness +=
+ blockiness_vertical(img1 + j, img1_pitch, img2 + j, img2_pitch, 4);
+ blockiness += blockiness_horizontal(img1 + j, img1_pitch, img2 + j,
+ img2_pitch, 4);
+ }
+ }
+ }
+ blockiness /= width * height / 16;  // NOTE(review): integer divisor (number of 4x4 cells); it is 0 when width * height < 16 — confirm callers never pass such sizes
+ return blockiness;
+}
diff --git a/third_party/aom/av1/encoder/context_tree.c b/third_party/aom/av1/encoder/context_tree.c
new file mode 100644
index 0000000000..4c7d6ff00a
--- /dev/null
+++ b/third_party/aom/av1/encoder/context_tree.c
@@ -0,0 +1,331 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/encoder/context_tree.h"
+#include "av1/encoder/encoder.h"
+
+static const BLOCK_SIZE square[MAX_SB_SIZE_LOG2 - 1] = {  // Square block sizes in increasing order; the first and last entries depend on the build configuration.
+#if CONFIG_CB4X4
+ BLOCK_4X4,
+#endif
+ BLOCK_8X8, BLOCK_16X16, BLOCK_32X32, BLOCK_64X64,
+#if CONFIG_EXT_PARTITION
+ BLOCK_128X128,
+#endif // CONFIG_EXT_PARTITION
+};
+
+static void alloc_mode_context(AV1_COMMON *cm, int num_4x4_blk,  // Allocates ctx's per-plane buffers, sized from num_4x4_blk; allocation failures are reported through CHECK_MEM_ERROR on cm.
+#if CONFIG_EXT_PARTITION_TYPES
+ PARTITION_TYPE partition,
+#endif
+ PICK_MODE_CONTEXT *ctx) {
+ const int num_blk = (num_4x4_blk < 4 ? 4 : num_4x4_blk);  // never size for fewer than four blocks
+ const int num_pix = num_blk * tx_size_2d[0];  // coefficient count; presumably tx_size_2d[0] is the sample count of the smallest transform — confirm
+ int i;
+#if CONFIG_CB4X4 && CONFIG_VAR_TX
+ ctx->num_4x4_blk = num_blk / 4;
+#else
+ ctx->num_4x4_blk = num_blk;
+#endif
+
+#if CONFIG_EXT_PARTITION_TYPES
+ ctx->partition = partition;
+#endif
+
+ for (i = 0; i < MAX_MB_PLANE; ++i) {
+#if CONFIG_VAR_TX
+ CHECK_MEM_ERROR(cm, ctx->blk_skip[i], aom_calloc(num_blk, sizeof(uint8_t)));  // the only zero-initialised buffer in this function
+#endif
+ CHECK_MEM_ERROR(cm, ctx->coeff[i],
+ aom_memalign(32, num_pix * sizeof(*ctx->coeff[i])));
+ CHECK_MEM_ERROR(cm, ctx->qcoeff[i],
+ aom_memalign(32, num_pix * sizeof(*ctx->qcoeff[i])));
+ CHECK_MEM_ERROR(cm, ctx->dqcoeff[i],
+ aom_memalign(32, num_pix * sizeof(*ctx->dqcoeff[i])));
+ CHECK_MEM_ERROR(cm, ctx->eobs[i],
+ aom_memalign(32, num_blk * sizeof(*ctx->eobs[i])));
+#if CONFIG_LV_MAP
+ CHECK_MEM_ERROR(
+ cm, ctx->txb_entropy_ctx[i],
+ aom_memalign(32, num_blk * sizeof(*ctx->txb_entropy_ctx[i])));
+#endif
+
+#if CONFIG_PVQ
+ CHECK_MEM_ERROR(cm, ctx->pvq_ref_coeff[i],
+ aom_memalign(32, num_pix * sizeof(*ctx->pvq_ref_coeff[i])));
+#endif
+ }
+
+#if CONFIG_PALETTE
+ if (cm->allow_screen_content_tools) {  // the color index maps are allocated only when screen content tools are enabled
+ for (i = 0; i < 2; ++i) {
+ CHECK_MEM_ERROR(
+ cm, ctx->color_index_map[i],
+ aom_memalign(32, num_pix * sizeof(*ctx->color_index_map[i])));
+ }
+ }
+#endif // CONFIG_PALETTE
+}
+
+static void free_mode_context(PICK_MODE_CONTEXT *ctx) {  // Frees every buffer alloc_mode_context may have allocated and clears the pointers.
+ int i;
+ for (i = 0; i < MAX_MB_PLANE; ++i) {
+#if CONFIG_VAR_TX
+ aom_free(ctx->blk_skip[i]);
+ ctx->blk_skip[i] = 0;
+#endif
+ aom_free(ctx->coeff[i]);
+ ctx->coeff[i] = 0;
+ aom_free(ctx->qcoeff[i]);
+ ctx->qcoeff[i] = 0;
+ aom_free(ctx->dqcoeff[i]);
+ ctx->dqcoeff[i] = 0;
+#if CONFIG_PVQ
+ aom_free(ctx->pvq_ref_coeff[i]);
+ ctx->pvq_ref_coeff[i] = 0;
+#endif
+ aom_free(ctx->eobs[i]);
+ ctx->eobs[i] = 0;
+#if CONFIG_LV_MAP
+ aom_free(ctx->txb_entropy_ctx[i]);
+ ctx->txb_entropy_ctx[i] = 0;
+#endif
+ }
+
+#if CONFIG_PALETTE
+ for (i = 0; i < 2; ++i) {  // freed unconditionally although allocation is conditional; presumably relies on aom_free accepting a null pointer — confirm
+ aom_free(ctx->color_index_map[i]);
+ ctx->color_index_map[i] = 0;
+ }
+#endif // CONFIG_PALETTE
+}
+
+static void alloc_tree_contexts(AV1_COMMON *cm, PC_TREE *tree,  // Allocates the mode contexts of one PC_TREE node; each context is sized as a fraction of num_4x4_blk.
+ int num_4x4_blk) {
+#if CONFIG_EXT_PARTITION_TYPES
+ alloc_mode_context(cm, num_4x4_blk, PARTITION_NONE, &tree->none);
+ alloc_mode_context(cm, num_4x4_blk / 2, PARTITION_HORZ, &tree->horizontal[0]);
+ alloc_mode_context(cm, num_4x4_blk / 2, PARTITION_VERT, &tree->vertical[0]);
+ alloc_mode_context(cm, num_4x4_blk / 2, PARTITION_HORZ, &tree->horizontal[1]);  // both horizontal halves carry PARTITION_HORZ
+ alloc_mode_context(cm, num_4x4_blk / 2, PARTITION_VERT, &tree->vertical[1]);
+
+ alloc_mode_context(cm, num_4x4_blk / 4, PARTITION_HORZ_A,  // the A/B shapes mix two quarter-size contexts with one half-size context
+ &tree->horizontala[0]);
+ alloc_mode_context(cm, num_4x4_blk / 4, PARTITION_HORZ_A,
+ &tree->horizontala[1]);
+ alloc_mode_context(cm, num_4x4_blk / 2, PARTITION_HORZ_A,
+ &tree->horizontala[2]);
+ alloc_mode_context(cm, num_4x4_blk / 2, PARTITION_HORZ_B,
+ &tree->horizontalb[0]);
+ alloc_mode_context(cm, num_4x4_blk / 4, PARTITION_HORZ_B,
+ &tree->horizontalb[1]);
+ alloc_mode_context(cm, num_4x4_blk / 4, PARTITION_HORZ_B,
+ &tree->horizontalb[2]);
+ alloc_mode_context(cm, num_4x4_blk / 4, PARTITION_VERT_A,
+ &tree->verticala[0]);
+ alloc_mode_context(cm, num_4x4_blk / 4, PARTITION_VERT_A,
+ &tree->verticala[1]);
+ alloc_mode_context(cm, num_4x4_blk / 2, PARTITION_VERT_A,
+ &tree->verticala[2]);
+ alloc_mode_context(cm, num_4x4_blk / 2, PARTITION_VERT_B,
+ &tree->verticalb[0]);
+ alloc_mode_context(cm, num_4x4_blk / 4, PARTITION_VERT_B,
+ &tree->verticalb[1]);
+ alloc_mode_context(cm, num_4x4_blk / 4, PARTITION_VERT_B,
+ &tree->verticalb[2]);
+#ifdef CONFIG_SUPERTX  // NOTE(review): #ifdef, whereas other CONFIG_ flags in this file use #if; if the flag is always defined as 0 or 1 this is always compiled — confirm
+ alloc_mode_context(cm, num_4x4_blk, PARTITION_HORZ,
+ &tree->horizontal_supertx);
+ alloc_mode_context(cm, num_4x4_blk, PARTITION_VERT, &tree->vertical_supertx);
+ alloc_mode_context(cm, num_4x4_blk, PARTITION_SPLIT, &tree->split_supertx);
+ alloc_mode_context(cm, num_4x4_blk, PARTITION_HORZ_A,
+ &tree->horizontala_supertx);
+ alloc_mode_context(cm, num_4x4_blk, PARTITION_HORZ_B,
+ &tree->horizontalb_supertx);
+ alloc_mode_context(cm, num_4x4_blk, PARTITION_VERT_A,
+ &tree->verticala_supertx);
+ alloc_mode_context(cm, num_4x4_blk, PARTITION_VERT_B,
+ &tree->verticalb_supertx);
+#endif // CONFIG_SUPERTX
+#else
+ alloc_mode_context(cm, num_4x4_blk, &tree->none);
+ alloc_mode_context(cm, num_4x4_blk / 2, &tree->horizontal[0]);
+ alloc_mode_context(cm, num_4x4_blk / 2, &tree->vertical[0]);
+#ifdef CONFIG_SUPERTX  // NOTE(review): same #ifdef-vs-#if question as in the branch above
+ alloc_mode_context(cm, num_4x4_blk, &tree->horizontal_supertx);
+ alloc_mode_context(cm, num_4x4_blk, &tree->vertical_supertx);
+ alloc_mode_context(cm, num_4x4_blk, &tree->split_supertx);
+#endif
+
+ if (num_4x4_blk > 4) {  // the second halves get their own buffers only above this size
+ alloc_mode_context(cm, num_4x4_blk / 2, &tree->horizontal[1]);
+ alloc_mode_context(cm, num_4x4_blk / 2, &tree->vertical[1]);
+ } else {  // otherwise zero them so that free_mode_context sees null pointers
+ memset(&tree->horizontal[1], 0, sizeof(tree->horizontal[1]));
+ memset(&tree->vertical[1], 0, sizeof(tree->vertical[1]));
+ }
+#endif // CONFIG_EXT_PARTITION_TYPES
+}
+
+static void free_tree_contexts(PC_TREE *tree) {  // Releases every mode context held by one PC_TREE node via free_mode_context.
+#if CONFIG_EXT_PARTITION_TYPES
+ int i;
+ for (i = 0; i < 3; i++) {  // each extended partition shape holds three contexts
+ free_mode_context(&tree->horizontala[i]);
+ free_mode_context(&tree->horizontalb[i]);
+ free_mode_context(&tree->verticala[i]);
+ free_mode_context(&tree->verticalb[i]);
+ }
+#endif // CONFIG_EXT_PARTITION_TYPES
+ free_mode_context(&tree->none);
+ free_mode_context(&tree->horizontal[0]);
+ free_mode_context(&tree->horizontal[1]);
+ free_mode_context(&tree->vertical[0]);
+ free_mode_context(&tree->vertical[1]);
+#ifdef CONFIG_SUPERTX  // NOTE(review): #ifdef, whereas other CONFIG_ flags in this file use #if — confirm this is intended
+ free_mode_context(&tree->horizontal_supertx);
+ free_mode_context(&tree->vertical_supertx);
+ free_mode_context(&tree->split_supertx);
+#if CONFIG_EXT_PARTITION_TYPES
+ free_mode_context(&tree->horizontala_supertx);
+ free_mode_context(&tree->horizontalb_supertx);
+ free_mode_context(&tree->verticala_supertx);
+ free_mode_context(&tree->verticalb_supertx);
+#endif // CONFIG_EXT_PARTITION_TYPES
+#endif // CONFIG_SUPERTX
+}
+
+// Builds the partition-search context tree for one thread.
+//
+// Every square partition level gets a PC_TREE node that holds mode contexts
+// (allocated by alloc_tree_contexts()) and a block_size taken from square[].
+// Leaf-level nodes reach one shared PICK_MODE_CONTEXT through leaf_split[];
+// higher-level nodes reach four lower-level nodes through split[].
+// td->pc_root[] is finally pointed at the root node for each superblock size.
+//
+// Allocation failures are reported through CHECK_MEM_ERROR on |cm|.
+// NOTE(review): the aom_free() calls below release only the old arrays, not
+// the buffers inside their contexts -- presumably callers run
+// av1_free_pc_tree() first or pass a |td| with NULL trees; confirm.
+void av1_setup_pc_tree(AV1_COMMON *cm, ThreadData *td) {
+ int i, j;
+// TODO(jingning): The pc_tree allocation is redundant. We can take out all
+// the leaf nodes after cb4x4 mode is enabled.
+#if CONFIG_CB4X4
+#if CONFIG_EXT_PARTITION
+ const int tree_nodes_inc = 1024;
+#else
+ const int tree_nodes_inc = 256;
+#endif // CONFIG_EXT_PARTITION
+ const int leaf_factor = 4;
+#else
+ const int tree_nodes_inc = 0;
+ const int leaf_factor = 1;
+#endif
+ // NOTE: av1_free_pc_tree() recomputes leaf_nodes/tree_nodes with the same
+ // preprocessor logic; the two must be kept in sync.
+#if CONFIG_EXT_PARTITION
+ const int leaf_nodes = 256 * leaf_factor;
+ const int tree_nodes = tree_nodes_inc + 256 + 64 + 16 + 4 + 1;
+#else
+ const int leaf_nodes = 64 * leaf_factor;
+ const int tree_nodes = tree_nodes_inc + 64 + 16 + 4 + 1;
+#endif // CONFIG_EXT_PARTITION
+ int pc_tree_index = 0;
+ PC_TREE *this_pc;
+ PICK_MODE_CONTEXT *this_leaf;
+ int square_index = 1;
+ int nodes;
+
+ aom_free(td->leaf_tree);
+ CHECK_MEM_ERROR(cm, td->leaf_tree,
+ aom_calloc(leaf_nodes, sizeof(*td->leaf_tree)));
+ aom_free(td->pc_tree);
+ CHECK_MEM_ERROR(cm, td->pc_tree,
+ aom_calloc(tree_nodes, sizeof(*td->pc_tree)));
+
+ // Cursors into the two arrays: this_leaf hands out leaf contexts, this_pc
+ // hands out already-initialised nodes as children of the level above.
+ this_pc = &td->pc_tree[0];
+ this_leaf = &td->leaf_tree[0];
+
+ // 4x4 blocks smaller than 8x8 but in the same 8x8 block share the same
+ // context so we only need to allocate 1 for each 8x8 block.
+ for (i = 0; i < leaf_nodes; ++i) {
+#if CONFIG_EXT_PARTITION_TYPES
+ alloc_mode_context(cm, 4, PARTITION_NONE, &td->leaf_tree[i]);
+#else
+ alloc_mode_context(cm, 16, &td->leaf_tree[i]);
+#endif
+ }
+
+ // Sets up all the leaf nodes in the tree.
+ // The first leaf_nodes entries of pc_tree are the leaf-level nodes; all four
+ // leaf_split[] pointers of a node alias the same leaf context.
+ for (pc_tree_index = 0; pc_tree_index < leaf_nodes; ++pc_tree_index) {
+ PC_TREE *const tree = &td->pc_tree[pc_tree_index];
+ tree->block_size = square[0];
+#if CONFIG_CB4X4
+ alloc_tree_contexts(cm, tree, 16);
+#else
+ alloc_tree_contexts(cm, tree, 4);
+#endif
+ tree->leaf_split[0] = this_leaf++;
+ for (j = 1; j < 4; j++) tree->leaf_split[j] = tree->leaf_split[0];
+ }
+
+ // Each node has 4 leaf nodes, fill each block_size level of the tree
+ // from leafs to the root.
+ // Each level has a quarter of the nodes of the level below and four times
+ // its num_4x4_blk (hence the shift by 2 * square_index). this_pc trails
+ // pc_tree_index, so every new node adopts the next four unclaimed nodes.
+ for (nodes = leaf_nodes >> 2; nodes > 0; nodes >>= 2) {
+ for (i = 0; i < nodes; ++i) {
+ PC_TREE *const tree = &td->pc_tree[pc_tree_index];
+#if CONFIG_CB4X4
+ alloc_tree_contexts(cm, tree, 16 << (2 * square_index));
+#else
+ alloc_tree_contexts(cm, tree, 4 << (2 * square_index));
+#endif
+ tree->block_size = square[square_index];
+ for (j = 0; j < 4; j++) tree->split[j] = this_pc++;
+ ++pc_tree_index;
+ }
+ ++square_index;
+ }
+
+ // Set up the root node for the largest superblock size
+ // (the last node written above is the single top-level node).
+ // NOTE(review): the meaning of best_mode_index == 2 is not visible here.
+ i = MAX_MIB_SIZE_LOG2 - MIN_MIB_SIZE_LOG2;
+ td->pc_root[i] = &td->pc_tree[tree_nodes - 1];
+ td->pc_root[i]->none.best_mode_index = 2;
+ // Set up the root nodes for the rest of the possible superblock sizes
+ // by following the first child of the next larger root.
+ while (--i >= 0) {
+ td->pc_root[i] = td->pc_root[i + 1]->split[0];
+ td->pc_root[i]->none.best_mode_index = 2;
+ }
+}
+
+// Releases everything av1_setup_pc_tree() allocated for |td|: the buffers
+// inside each leaf context and each tree node, then the two arrays
+// themselves.
+//
+// Either array may be NULL (set-up never ran, or aborted between the two
+// allocations); in that case its per-element pass is skipped instead of
+// indexing a NULL pointer. Both td->pc_tree and td->leaf_tree are NULL on
+// return, so calling this twice is harmless.
+void av1_free_pc_tree(ThreadData *td) {
+  // These sizes must mirror the ones computed in av1_setup_pc_tree().
+#if CONFIG_CB4X4
+#if CONFIG_EXT_PARTITION
+  const int tree_nodes_inc = 1024;
+#else
+  const int tree_nodes_inc = 256;
+#endif  // CONFIG_EXT_PARTITION
+  const int leaf_factor = 4;
+#else
+  const int tree_nodes_inc = 0;
+  const int leaf_factor = 1;
+#endif
+
+#if CONFIG_EXT_PARTITION
+  const int leaf_nodes = 256 * leaf_factor;
+  const int tree_nodes = tree_nodes_inc + 256 + 64 + 16 + 4 + 1;
+#else
+  const int leaf_nodes = 64 * leaf_factor;
+  const int tree_nodes = tree_nodes_inc + 64 + 16 + 4 + 1;
+#endif  // CONFIG_EXT_PARTITION
+  int i;
+
+  // Free the buffers owned by every leaf mode context.
+  if (td->leaf_tree != NULL) {
+    for (i = 0; i < leaf_nodes; ++i) free_mode_context(&td->leaf_tree[i]);
+  }
+
+  // Free the mode contexts embedded in every tree node.
+  if (td->pc_tree != NULL) {
+    for (i = 0; i < tree_nodes; ++i) free_tree_contexts(&td->pc_tree[i]);
+  }
+
+  aom_free(td->pc_tree);
+  td->pc_tree = NULL;
+  aom_free(td->leaf_tree);
+  td->leaf_tree = NULL;
+}
diff --git a/third_party/aom/av1/encoder/context_tree.h b/third_party/aom/av1/encoder/context_tree.h
new file mode 100644
index 0000000000..67954126c6
--- /dev/null
+++ b/third_party/aom/av1/encoder/context_tree.h
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_ENCODER_CONTEXT_TREE_H_
+#define AV1_ENCODER_CONTEXT_TREE_H_
+
+#include "av1/common/blockd.h"
+#include "av1/encoder/block.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct AV1_COMP;
+struct AV1Common;
+struct ThreadData;
+
+// Structure to hold snapshot of coding context during the mode picking process
+// for one candidate block. Instances live inside PC_TREE nodes and in the
+// per-thread leaf array.
+typedef struct {
+ MODE_INFO mic;
+ MB_MODE_INFO_EXT mbmi_ext;
+#if CONFIG_PALETTE
+ uint8_t *color_index_map[2];
+#endif // CONFIG_PALETTE
+#if CONFIG_VAR_TX
+ uint8_t *blk_skip[MAX_MB_PLANE];
+#endif
+
+ // dual buffer pointers, 0: in use, 1: best in store
+ // NOTE(review): the arrays below are sized by MAX_MB_PLANE, so the "0/1"
+ // wording above may be stale -- confirm.
+ tran_low_t *coeff[MAX_MB_PLANE];
+ tran_low_t *qcoeff[MAX_MB_PLANE];
+ tran_low_t *dqcoeff[MAX_MB_PLANE];
+#if CONFIG_PVQ
+ tran_low_t *pvq_ref_coeff[MAX_MB_PLANE];
+#endif
+ uint16_t *eobs[MAX_MB_PLANE];
+#if CONFIG_LV_MAP
+ uint8_t *txb_entropy_ctx[MAX_MB_PLANE];
+#endif
+
+ // Block count passed to alloc_mode_context() for this context (presumably
+ // it sizes the buffers above -- confirm).
+ int num_4x4_blk;
+ int skip;
+ int pred_pixel_ready;
+ // For current partition, only if all Y, U, and V transform blocks'
+ // coefficients are quantized to 0, skippable is set to 0.
+ // NOTE(review): "set to 0" looks inverted (one would expect the flag to be
+ // raised when everything quantizes to zero) -- confirm against the writer.
+ int skippable;
+ int best_mode_index;
+ int hybrid_pred_diff;
+ int comp_pred_diff;
+ int single_pred_diff;
+
+ // TODO(jingning) Use RD_COST struct here instead. This involves a broader
+ // scope of refactoring.
+ int rate;
+ int64_t dist;
+
+ // motion vector cache for adaptive motion search control in partition
+ // search loop
+ MV pred_mv[TOTAL_REFS_PER_FRAME];
+ InterpFilter pred_interp_filter;
+#if CONFIG_EXT_PARTITION_TYPES
+ PARTITION_TYPE partition;
+#endif
+} PICK_MODE_CONTEXT;
+
+// One node of the partition-search tree: the mode contexts for every way of
+// partitioning a square block of size block_size, plus links to the next
+// level down.
+typedef struct PC_TREE {
+ int index;
+ PARTITION_TYPE partitioning;
+ BLOCK_SIZE block_size;
+ PICK_MODE_CONTEXT none;
+ PICK_MODE_CONTEXT horizontal[2];
+ PICK_MODE_CONTEXT vertical[2];
+#if CONFIG_EXT_PARTITION_TYPES
+ PICK_MODE_CONTEXT horizontala[3];
+ PICK_MODE_CONTEXT horizontalb[3];
+ PICK_MODE_CONTEXT verticala[3];
+ PICK_MODE_CONTEXT verticalb[3];
+#endif
+ // Children. av1_setup_pc_tree() fills leaf_split[] on leaf-level nodes and
+ // split[] on every higher level; only one member is meaningful per node.
+ union {
+ struct PC_TREE *split[4];
+ PICK_MODE_CONTEXT *leaf_split[4];
+ };
+ // NOTE(review): this is #ifdef while the other CONFIG_ flags here use #if;
+ // if CONFIG_SUPERTX is always defined (as 0 or 1) these fields exist
+ // unconditionally -- confirm that is intended.
+#ifdef CONFIG_SUPERTX
+ PICK_MODE_CONTEXT horizontal_supertx;
+ PICK_MODE_CONTEXT vertical_supertx;
+ PICK_MODE_CONTEXT split_supertx;
+#if CONFIG_EXT_PARTITION_TYPES
+ PICK_MODE_CONTEXT horizontala_supertx;
+ PICK_MODE_CONTEXT horizontalb_supertx;
+ PICK_MODE_CONTEXT verticala_supertx;
+ PICK_MODE_CONTEXT verticalb_supertx;
+#endif
+#endif
+} PC_TREE;
+
+void av1_setup_pc_tree(struct AV1Common *cm, struct ThreadData *td);
+void av1_free_pc_tree(struct ThreadData *td);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif /* AV1_ENCODER_CONTEXT_TREE_H_ */
diff --git a/third_party/aom/av1/encoder/corner_detect.c b/third_party/aom/av1/encoder/corner_detect.c
new file mode 100644
index 0000000000..e4c59dd9c6
--- /dev/null
+++ b/third_party/aom/av1/encoder/corner_detect.c
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <memory.h>
+#include <math.h>
+#include <assert.h>
+
+#include "third_party/fastfeat/fast.h"
+
+#include "av1/encoder/corner_detect.h"
+
+// Fast_9 wrapper
+#define FAST_BARRIER 18
+// Runs fast9_detect_nonmax() on |buf| with threshold FAST_BARRIER and copies
+// at most |max_points| of the detected corners into |points|. Each corner
+// occupies sizeof(xy) bytes of |points| (presumably an (x, y) int pair --
+// confirm against fast.h). Returns the number of corners copied, or 0 when
+// the detector produced none. The detector's buffer is always freed here.
+int fast_corner_detect(unsigned char *buf, int width, int height, int stride,
+                       int *points, int max_points) {
+  int num_points;
+  int copied = 0;
+  xy *const frm_corners_xy = fast9_detect_nonmax(buf, width, height, stride,
+                                                 FAST_BARRIER, &num_points);
+
+  // Never hand back more corners than the caller has room for.
+  if (num_points > max_points) num_points = max_points;
+
+  if (num_points > 0 && frm_corners_xy) {
+    memcpy(points, frm_corners_xy, sizeof(*frm_corners_xy) * num_points);
+    copied = num_points;
+  }
+  free(frm_corners_xy);
+  return copied;
+}
diff --git a/third_party/aom/av1/encoder/corner_detect.h b/third_party/aom/av1/encoder/corner_detect.h
new file mode 100644
index 0000000000..0317db5b31
--- /dev/null
+++ b/third_party/aom/av1/encoder/corner_detect.h
@@ -0,0 +1,22 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_ENCODER_CORNER_DETECT_H_
+#define AV1_ENCODER_CORNER_DETECT_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+
+int fast_corner_detect(unsigned char *buf, int width, int height, int stride,
+ int *points, int max_points);
+
+#endif // AV1_ENCODER_CORNER_DETECT_H_
diff --git a/third_party/aom/av1/encoder/corner_match.c b/third_party/aom/av1/encoder/corner_match.c
new file mode 100644
index 0000000000..64ee0c5ae1
--- /dev/null
+++ b/third_party/aom/av1/encoder/corner_match.c
@@ -0,0 +1,193 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+#include <math.h>
+
+#include "av1/encoder/corner_match.h"
+
+#define MATCH_SZ 13
+#define MATCH_SZ_BY2 ((MATCH_SZ - 1) / 2)
+#define MATCH_SZ_SQ (MATCH_SZ * MATCH_SZ)
+#define SEARCH_SZ 9
+#define SEARCH_SZ_BY2 ((SEARCH_SZ - 1) / 2)
+
+#define THRESHOLD_NCC 0.75
+
+/* Returns MATCH_SZ_SQ * sum(p * p) - sum(p) * sum(p) over the MATCH_SZ by
+   MATCH_SZ window of im centered at (x, y). That is the population variance
+   of the window scaled by MATCH_SZ_SQ * MATCH_SZ_SQ, so its square root is
+   MATCH_SZ_SQ * stddev. The whole window must lie inside the image.
+*/
+static double compute_variance(unsigned char *im, int stride, int x, int y) {
+  int total = 0;
+  int total_sq = 0;
+  int r, c;
+  for (r = 0; r < MATCH_SZ; ++r) {
+    const unsigned char *const row =
+        im + (r + y - MATCH_SZ_BY2) * stride + (x - MATCH_SZ_BY2);
+    for (c = 0; c < MATCH_SZ; ++c) {
+      const int pix = row[c];
+      total += pix;
+      total_sq += pix * pix;
+    }
+  }
+  return (double)(total_sq * MATCH_SZ_SQ - total * total);
+}
+
+/* Returns cov / sqrt(var2), where over MATCH_SZ by MATCH_SZ windows centered
+   at (x1, y1) in im1 and (x2, y2) in im2:
+     cov  = MATCH_SZ_SQ * sum(v1 * v2) - sum(v1) * sum(v2)
+     var2 = MATCH_SZ_SQ * sum(v2 * v2) - sum(v2) * sum(v2)
+   This equals corr(im1, im2) * MATCH_SZ_SQ * stddev(im1), i.e. the
+   correlation scaled by sqrt(compute_variance(im1, ...)).
+   A flat im2 window gives var2 == 0 and cov == 0, so the quotient is not a
+   finite number; callers only compare the result with '>'.
+*/
+static double compute_cross_correlation(unsigned char *im1, int stride1, int x1,
+                                        int y1, unsigned char *im2, int stride2,
+                                        int x2, int y2) {
+  int sum1 = 0;
+  int sum2 = 0;
+  int sumsq2 = 0;
+  int cross = 0;
+  int var2, cov;
+  int r, c;
+  for (r = 0; r < MATCH_SZ; ++r) {
+    const unsigned char *const row1 =
+        im1 + (r + y1 - MATCH_SZ_BY2) * stride1 + (x1 - MATCH_SZ_BY2);
+    const unsigned char *const row2 =
+        im2 + (r + y2 - MATCH_SZ_BY2) * stride2 + (x2 - MATCH_SZ_BY2);
+    for (c = 0; c < MATCH_SZ; ++c) {
+      const int v1 = row1[c];
+      const int v2 = row2[c];
+      sum1 += v1;
+      sum2 += v2;
+      sumsq2 += v2 * v2;
+      cross += v1 * v2;
+    }
+  }
+  var2 = sumsq2 * MATCH_SZ_SQ - sum2 * sum2;
+  cov = cross * MATCH_SZ_SQ - sum1 * sum2;
+  return cov / sqrt((double)var2);
+}
+
+// Returns 1 when the MATCH_SZ by MATCH_SZ window centered at (pointx, pointy)
+// lies entirely inside a width by height image, 0 otherwise.
+static int is_eligible_point(int pointx, int pointy, int width, int height) {
+  if (pointx < MATCH_SZ_BY2 || pointy < MATCH_SZ_BY2) return 0;
+  if (pointx + MATCH_SZ_BY2 >= width) return 0;
+  return pointy + MATCH_SZ_BY2 < height;
+}
+
+// Returns 1 when the two points are no further apart (Euclidean distance)
+// than 1/16th of the longer image side, rounded down; 0 otherwise.
+static int is_eligible_distance(int point1x, int point1y, int point2x,
+                                int point2y, int width, int height) {
+  const int longer_side = width < height ? height : width;
+  const int thresh = longer_side >> 4;
+  const int dx = point1x - point2x;
+  const int dy = point1y - point2y;
+  // Compare squared distances to avoid a square root.
+  return dx * dx + dy * dy <= thresh * thresh;
+}
+
+// Refines each correspondence in place with two local searches over a
+// SEARCH_SZ by SEARCH_SZ neighbourhood:
+//   pass 1 keeps the frame point (x, y) fixed and moves the reference point
+//          (rx, ry) to the offset with the highest cross-correlation;
+//   pass 2 keeps the updated (rx, ry) fixed and moves (x, y) the same way.
+// Candidates whose matching window leaves the image, or that violate
+// is_eligible_distance(), are ignored. A point only moves when some candidate
+// scores strictly above 0; ties keep the first candidate in scan order.
+static void improve_correspondence(unsigned char *frm, unsigned char *ref,
+                                   int width, int height, int frm_stride,
+                                   int ref_stride,
+                                   Correspondence *correspondences,
+                                   int num_correspondences) {
+  int i;
+
+  // Pass 1: adjust the reference-side point.
+  for (i = 0; i < num_correspondences; ++i) {
+    Correspondence *const c = &correspondences[i];
+    int dx, dy, best_dx = 0, best_dy = 0;
+    double best_ncc = 0.0;
+    for (dy = -SEARCH_SZ_BY2; dy <= SEARCH_SZ_BY2; ++dy) {
+      for (dx = -SEARCH_SZ_BY2; dx <= SEARCH_SZ_BY2; ++dx) {
+        const int cand_x = c->rx + dx;
+        const int cand_y = c->ry + dy;
+        double ncc;
+        if (!is_eligible_point(cand_x, cand_y, width, height)) continue;
+        if (!is_eligible_distance(c->x, c->y, cand_x, cand_y, width, height))
+          continue;
+        ncc = compute_cross_correlation(frm, frm_stride, c->x, c->y, ref,
+                                        ref_stride, cand_x, cand_y);
+        if (ncc > best_ncc) {
+          best_ncc = ncc;
+          best_dy = dy;
+          best_dx = dx;
+        }
+      }
+    }
+    c->rx += best_dx;
+    c->ry += best_dy;
+  }
+
+  // Pass 2: adjust the frame-side point against the refined reference point.
+  for (i = 0; i < num_correspondences; ++i) {
+    Correspondence *const c = &correspondences[i];
+    int dx, dy, best_dx = 0, best_dy = 0;
+    double best_ncc = 0.0;
+    for (dy = -SEARCH_SZ_BY2; dy <= SEARCH_SZ_BY2; ++dy) {
+      for (dx = -SEARCH_SZ_BY2; dx <= SEARCH_SZ_BY2; ++dx) {
+        const int cand_x = c->x + dx;
+        const int cand_y = c->y + dy;
+        double ncc;
+        if (!is_eligible_point(cand_x, cand_y, width, height)) continue;
+        if (!is_eligible_distance(cand_x, cand_y, c->rx, c->ry, width, height))
+          continue;
+        ncc = compute_cross_correlation(ref, ref_stride, c->rx, c->ry, frm,
+                                        frm_stride, cand_x, cand_y);
+        if (ncc > best_ncc) {
+          best_ncc = ncc;
+          best_dy = dy;
+          best_dx = dx;
+        }
+      }
+    }
+    c->x += best_dx;
+    c->y += best_dy;
+  }
+}
+
+// Matches corners of |frm| to corners of |ref|.
+//
+// frm_corners / ref_corners hold (x, y) pairs, two ints per corner. For each
+// eligible frame corner the eligible, close-enough reference corner with the
+// highest cross-correlation is chosen, and the pair is kept when that
+// correlation exceeds THRESHOLD_NCC. Kept pairs are written to
+// |correspondence_pts|, viewed as an array of Correspondence (four ints
+// each), so it needs room for up to num_frm_corners entries. The pairs are
+// then refined by improve_correspondence(). Returns the number of pairs.
+int determine_correspondence(unsigned char *frm, int *frm_corners,
+                             int num_frm_corners, unsigned char *ref,
+                             int *ref_corners, int num_ref_corners, int width,
+                             int height, int frm_stride, int ref_stride,
+                             int *correspondence_pts) {
+  // TODO(sarahparker) Improve this to include 2-way match
+  Correspondence *const correspondences = (Correspondence *)correspondence_pts;
+  int num_correspondences = 0;
+  int i, j;
+  for (i = 0; i < num_frm_corners; ++i) {
+    const int fx = frm_corners[2 * i];
+    const int fy = frm_corners[2 * i + 1];
+    double best_match_ncc = 0.0;
+    double template_norm;
+    int best_match_j = -1;
+    if (!is_eligible_point(fx, fy, width, height)) continue;
+    for (j = 0; j < num_ref_corners; ++j) {
+      const int cand_x = ref_corners[2 * j];
+      const int cand_y = ref_corners[2 * j + 1];
+      double match_ncc;
+      if (!is_eligible_point(cand_x, cand_y, width, height)) continue;
+      if (!is_eligible_distance(fx, fy, cand_x, cand_y, width, height))
+        continue;
+      match_ncc = compute_cross_correlation(frm, frm_stride, fx, fy, ref,
+                                            ref_stride, cand_x, cand_y);
+      if (match_ncc > best_match_ncc) {
+        best_match_ncc = match_ncc;
+        best_match_j = j;
+      }
+    }
+    // Note: We want to test if the best correlation is >= THRESHOLD_NCC,
+    // but need to account for the normalization in compute_cross_correlation.
+    template_norm = compute_variance(frm, frm_stride, fx, fy);
+    // A positive best_match_ncc implies best_match_j was set in the loop.
+    if (best_match_ncc > THRESHOLD_NCC * sqrt(template_norm)) {
+      Correspondence *const out = &correspondences[num_correspondences];
+      out->x = fx;
+      out->y = fy;
+      out->rx = ref_corners[2 * best_match_j];
+      out->ry = ref_corners[2 * best_match_j + 1];
+      num_correspondences++;
+    }
+  }
+  improve_correspondence(frm, ref, width, height, frm_stride, ref_stride,
+                         correspondences, num_correspondences);
+  return num_correspondences;
+}
diff --git a/third_party/aom/av1/encoder/corner_match.h b/third_party/aom/av1/encoder/corner_match.h
new file mode 100644
index 0000000000..c0458642c1
--- /dev/null
+++ b/third_party/aom/av1/encoder/corner_match.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AV1_ENCODER_CORNER_MATCH_H_
+#define AV1_ENCODER_CORNER_MATCH_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+
+// One matched corner pair produced by determine_correspondence():
+// (x, y) is the corner in the current frame, (rx, ry) the matching corner in
+// the reference frame. The output buffer of determine_correspondence() is an
+// int array reinterpreted as an array of this struct (four ints per entry).
+typedef struct {
+ int x, y;
+ int rx, ry;
+} Correspondence;
+
+int determine_correspondence(unsigned char *frm, int *frm_corners,
+ int num_frm_corners, unsigned char *ref,
+ int *ref_corners, int num_ref_corners, int width,
+ int height, int frm_stride, int ref_stride,
+ int *correspondence_pts);
+
+#endif // AV1_ENCODER_CORNER_MATCH_H_
diff --git a/third_party/aom/av1/encoder/cost.c b/third_party/aom/av1/encoder/cost.c
new file mode 100644
index 0000000000..e3151a5973
--- /dev/null
+++ b/third_party/aom/av1/encoder/cost.c
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <assert.h>
+
+#include "av1/encoder/cost.h"
+#include "av1/common/entropy.h"
+
+/* round(-log2(i/256.) * (1 << AV1_PROB_COST_SHIFT))
+ Begins with a bogus entry for simpler addressing.
+ Entry i is the cost of coding a symbol whose probability is i/256, in units
+ of 1/(1 << AV1_PROB_COST_SHIFT) bit (e.g. entry 128 is 512, i.e. one bit).
+ Entry 0 has no defined value and simply repeats entry 1. */
+const uint16_t av1_prob_cost[256] = {
+ 4096, 4096, 3584, 3284, 3072, 2907, 2772, 2659, 2560, 2473, 2395, 2325, 2260,
+ 2201, 2147, 2096, 2048, 2003, 1961, 1921, 1883, 1847, 1813, 1780, 1748, 1718,
+ 1689, 1661, 1635, 1609, 1584, 1559, 1536, 1513, 1491, 1470, 1449, 1429, 1409,
+ 1390, 1371, 1353, 1335, 1318, 1301, 1284, 1268, 1252, 1236, 1221, 1206, 1192,
+ 1177, 1163, 1149, 1136, 1123, 1110, 1097, 1084, 1072, 1059, 1047, 1036, 1024,
+ 1013, 1001, 990, 979, 968, 958, 947, 937, 927, 917, 907, 897, 887,
+ 878, 868, 859, 850, 841, 832, 823, 814, 806, 797, 789, 780, 772,
+ 764, 756, 748, 740, 732, 724, 717, 709, 702, 694, 687, 680, 673,
+ 665, 658, 651, 644, 637, 631, 624, 617, 611, 604, 598, 591, 585,
+ 578, 572, 566, 560, 554, 547, 541, 535, 530, 524, 518, 512, 506,
+ 501, 495, 489, 484, 478, 473, 467, 462, 456, 451, 446, 441, 435,
+ 430, 425, 420, 415, 410, 405, 400, 395, 390, 385, 380, 375, 371,
+ 366, 361, 356, 352, 347, 343, 338, 333, 329, 324, 320, 316, 311,
+ 307, 302, 298, 294, 289, 285, 281, 277, 273, 268, 264, 260, 256,
+ 252, 248, 244, 240, 236, 232, 228, 224, 220, 216, 212, 209, 205,
+ 201, 197, 194, 190, 186, 182, 179, 175, 171, 168, 164, 161, 157,
+ 153, 150, 146, 143, 139, 136, 132, 129, 125, 122, 119, 115, 112,
+ 109, 105, 102, 99, 95, 92, 89, 86, 82, 79, 76, 73, 70,
+ 66, 63, 60, 57, 54, 51, 48, 45, 42, 38, 35, 32, 29,
+ 26, 23, 20, 18, 15, 12, 9, 6, 3
+};
+
+// Recursively walks |tree| from the node pair starting at index |i|, with |c|
+// the cost accumulated so far. For each of the two branches the bit cost under
+// probs[i / 2] is added; a non-positive tree entry is a leaf whose total cost
+// is stored at costs[-entry], a positive entry is the index to recurse into.
+static void cost(int *costs, aom_tree tree, const aom_prob *probs, int i,
+                 int c) {
+  const aom_prob node_prob = probs[i / 2];
+  int bit;
+
+  assert(node_prob != 0);
+  for (bit = 0; bit < 2; ++bit) {
+    const aom_tree_index next = tree[i + bit];
+    const int branch_cost = c + av1_cost_bit(node_prob, bit);
+
+    if (next > 0) {
+      cost(costs, tree, probs, next, branch_cost);
+    } else {
+      costs[-next] = branch_cost;
+    }
+  }
+}
+
+// Fills |costs| with the cost of every leaf token of |tree| under |probs|,
+// walking from the root (index 0) with an initial cost of 0. |costs| is
+// indexed by the negated leaf entries of |tree|.
+void av1_cost_tokens(int *costs, const aom_prob *probs, aom_tree tree) {
+ cost(costs, tree, probs, 0, 0);
+}
+
+// Variant of av1_cost_tokens() for trees whose root has a leaf on branch 0 and
+// an internal node on branch 1 (asserted below). The leaf gets the cost of a
+// 0 bit under probs[0]; every other token is costed from index 2 starting at
+// cost 0, so the root's 1 bit is not included in those costs.
+// NOTE(review): the recursion starts at the literal index 2 rather than
+// tree[1]; presumably tree[1] == 2 for every caller -- confirm.
+void av1_cost_tokens_skip(int *costs, const aom_prob *probs, aom_tree tree) {
+ assert(tree[0] <= 0 && tree[1] > 0);
+
+ costs[-tree[0]] = av1_cost_bit(probs[0], 0);
+ cost(costs, tree, probs, 2, 0);
+}
diff --git a/third_party/aom/av1/encoder/cost.h b/third_party/aom/av1/encoder/cost.h
new file mode 100644
index 0000000000..d8fb357e6d
--- /dev/null
+++ b/third_party/aom/av1/encoder/cost.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_ENCODER_COST_H_
+#define AV1_ENCODER_COST_H_
+
+#include "aom_dsp/prob.h"
+#include "aom/aom_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern const uint16_t av1_prob_cost[256];
+
+// The factor to scale from cost in bits to cost in av1_prob_cost units.
+#define AV1_PROB_COST_SHIFT 9
+
+#define av1_cost_zero(prob) (av1_prob_cost[prob])
+
+#define av1_cost_one(prob) av1_cost_zero(256 - (prob))
+
+#define av1_cost_bit(prob, bit) av1_cost_zero((bit) ? 256 - (prob) : (prob))
+
+// Cost of coding an n bit literal, using 128 (i.e. 50%) probability
+// for each bit.
+#define av1_cost_literal(n) ((n) * (1 << AV1_PROB_COST_SHIFT))
+
+// Total cost, in av1_prob_cost units, of coding ct[0] zero bits and ct[1] one
+// bits when a zero has probability p / 256.
+static INLINE unsigned int cost_branch256(const unsigned int ct[2],
+                                          aom_prob p) {
+  const unsigned int zeros_cost = ct[0] * av1_cost_zero(p);
+  const unsigned int ones_cost = ct[1] * av1_cost_one(p);
+  return zeros_cost + ones_cost;
+}
+
+// Returns the cost of coding the low |len| bits of |bits|, most significant
+// first, by walking |tree| from its root: each bit is costed with the
+// probability of the current node and then selects the next node.
+// |len| must be at least 1, because the first bit is consumed before |len|
+// is tested.
+static INLINE int treed_cost(aom_tree tree, const aom_prob *probs, int bits,
+                             int len) {
+  aom_tree_index node = 0;
+  int total = 0;
+
+  do {
+    int bit;
+    --len;
+    bit = (bits >> len) & 1;
+    total += av1_cost_bit(probs[node >> 1], bit);
+    node = tree[node + bit];
+  } while (len != 0);
+
+  return total;
+}
+
+void av1_cost_tokens(int *costs, const aom_prob *probs, aom_tree tree);
+void av1_cost_tokens_skip(int *costs, const aom_prob *probs, aom_tree tree);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AV1_ENCODER_COST_H_
diff --git a/third_party/aom/av1/encoder/daala_compat_enc.c b/third_party/aom/av1/encoder/daala_compat_enc.c
new file mode 100644
index 0000000000..3df424cac2
--- /dev/null
+++ b/third_party/aom/av1/encoder/daala_compat_enc.c
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "encint.h"
+
+// Saves the encoder state needed for a later od_encode_rollback() into |rbuf|:
+// the entropy coder state (via od_ec_enc_checkpoint) and one element of the
+// adaptation state that enc->state.adapt points to.
+// Builds without CONFIG_DAALA_EC are rejected at compile time.
+void od_encode_checkpoint(const daala_enc_ctx *enc, od_rollback_buffer *rbuf) {
+#if CONFIG_DAALA_EC
+ od_ec_enc_checkpoint(&rbuf->ec, &enc->w.ec);
+#else
+#error "CONFIG_PVQ currently requires CONFIG_DAALA_EC."
+#endif
+ // presumably OD_COPY(dst, src, n) copies n elements -- confirm in encint.h
+ OD_COPY(&rbuf->adapt, enc->state.adapt, 1);
+}
+
+// Restores the state captured by od_encode_checkpoint() from |rbuf| back into
+// |enc|: the entropy coder state (via od_ec_enc_rollback) and one element of
+// the adaptation state that enc->state.adapt points to.
+// Builds without CONFIG_DAALA_EC are rejected at compile time.
+void od_encode_rollback(daala_enc_ctx *enc, const od_rollback_buffer *rbuf) {
+#if CONFIG_DAALA_EC
+ od_ec_enc_rollback(&enc->w.ec, &rbuf->ec);
+#else
+#error "CONFIG_PVQ currently requires CONFIG_DAALA_EC."
+#endif
+ // presumably OD_COPY(dst, src, n) copies n elements -- confirm in encint.h
+ OD_COPY(enc->state.adapt, &rbuf->adapt, 1);
+}
diff --git a/third_party/aom/av1/encoder/dct.c b/third_party/aom/av1/encoder/dct.c
new file mode 100644
index 0000000000..09e1b05637
--- /dev/null
+++ b/third_party/aom/av1/encoder/dct.c
@@ -0,0 +1,2228 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <math.h>
+
+#include "./aom_config.h"
+#include "./aom_dsp_rtcd.h"
+#include "./av1_rtcd.h"
+#include "aom_dsp/fwd_txfm.h"
+#include "aom_ports/mem.h"
+#include "av1/common/blockd.h"
+#include "av1/common/av1_fwd_txfm1d.h"
+#include "av1/common/av1_fwd_txfm2d_cfg.h"
+#include "av1/common/idct.h"
+
+// Debug hook meant to assert that every one of the |size| values in |input|
+// has magnitude below 1 << |bit|. The check is compiled out by the "#if 0"
+// below, so the function currently does nothing; the (void) casts only
+// silence unused-parameter warnings.
+static INLINE void range_check(const tran_low_t *input, const int size,
+ const int bit) {
+#if 0 // CONFIG_COEFFICIENT_RANGE_CHECKING
+// TODO(angiebird): the range_check is not used because the bit range
+// in fdct# is not correct. Since we are going to merge in a new version
+// of fdct# from nextgenv2, we won't fix the incorrect bit range now.
+ int i;
+ for (i = 0; i < size; ++i) {
+ assert(abs(input[i]) < (1 << bit));
+ }
+#else
+ (void)input;
+ (void)size;
+ (void)bit;
+#endif
+}
+
+// 4-point forward transform of input[0..3] into output[0..3], computed as a
+// staged butterfly. |output| doubles as scratch space between stages, so the
+// statement order matters. Products with the cospi_* constants are
+// accumulated in tran_high_t and reduced with fdct_round_shift().
+// The range_check() calls are no-ops while that helper is compiled out.
+static void fdct4(const tran_low_t *input, tran_low_t *output) {
+ tran_high_t temp;
+ tran_low_t step[4];
+
+ // stage 0
+ range_check(input, 4, 14);
+
+ // stage 1
+ // sums and differences of mirrored input pairs
+ output[0] = input[0] + input[3];
+ output[1] = input[1] + input[2];
+ output[2] = input[1] - input[2];
+ output[3] = input[0] - input[3];
+
+ range_check(output, 4, 15);
+
+ // stage 2
+ // rotations: (0,1) by cospi_16_64, (2,3) by cospi_24_64 / cospi_8_64
+ temp = output[0] * cospi_16_64 + output[1] * cospi_16_64;
+ step[0] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[1] * -cospi_16_64 + output[0] * cospi_16_64;
+ step[1] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[2] * cospi_24_64 + output[3] * cospi_8_64;
+ step[2] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[3] * cospi_24_64 + output[2] * -cospi_8_64;
+ step[3] = (tran_low_t)fdct_round_shift(temp);
+
+ range_check(step, 4, 16);
+
+ // stage 3
+ // reorder: step[1] and step[2] swap places in the output
+ output[0] = step[0];
+ output[1] = step[2];
+ output[2] = step[1];
+ output[3] = step[3];
+
+ range_check(output, 4, 16);
+}
+
+// 8-point forward transform of input[0..7] into output[0..7], computed as a
+// staged butterfly. |output| and the local |step| array alternate as source
+// and destination from stage to stage, so the statement order matters.
+// Products with the cospi_* constants are accumulated in tran_high_t and
+// reduced with fdct_round_shift(). The range_check() calls are no-ops while
+// that helper is compiled out.
+static void fdct8(const tran_low_t *input, tran_low_t *output) {
+ tran_high_t temp;
+ tran_low_t step[8];
+
+ // stage 0
+ range_check(input, 8, 13);
+
+ // stage 1
+ // sums (0..3) and differences (4..7) of mirrored input pairs
+ output[0] = input[0] + input[7];
+ output[1] = input[1] + input[6];
+ output[2] = input[2] + input[5];
+ output[3] = input[3] + input[4];
+ output[4] = input[3] - input[4];
+ output[5] = input[2] - input[5];
+ output[6] = input[1] - input[6];
+ output[7] = input[0] - input[7];
+
+ range_check(output, 8, 14);
+
+ // stage 2
+ // the sum half repeats the 4-point stage 1; (5,6) is rotated by cospi_16_64
+ step[0] = output[0] + output[3];
+ step[1] = output[1] + output[2];
+ step[2] = output[1] - output[2];
+ step[3] = output[0] - output[3];
+ step[4] = output[4];
+ temp = output[5] * -cospi_16_64 + output[6] * cospi_16_64;
+ step[5] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[6] * cospi_16_64 + output[5] * cospi_16_64;
+ step[6] = (tran_low_t)fdct_round_shift(temp);
+ step[7] = output[7];
+
+ range_check(step, 8, 15);
+
+ // stage 3
+ temp = step[0] * cospi_16_64 + step[1] * cospi_16_64;
+ output[0] = (tran_low_t)fdct_round_shift(temp);
+ temp = step[1] * -cospi_16_64 + step[0] * cospi_16_64;
+ output[1] = (tran_low_t)fdct_round_shift(temp);
+ temp = step[2] * cospi_24_64 + step[3] * cospi_8_64;
+ output[2] = (tran_low_t)fdct_round_shift(temp);
+ temp = step[3] * cospi_24_64 + step[2] * -cospi_8_64;
+ output[3] = (tran_low_t)fdct_round_shift(temp);
+ output[4] = step[4] + step[5];
+ output[5] = step[4] - step[5];
+ output[6] = step[7] - step[6];
+ output[7] = step[7] + step[6];
+
+ range_check(output, 8, 16);
+
+ // stage 4
+ // entries 0..3 pass through; (4,7) and (5,6) are rotated
+ step[0] = output[0];
+ step[1] = output[1];
+ step[2] = output[2];
+ step[3] = output[3];
+ temp = output[4] * cospi_28_64 + output[7] * cospi_4_64;
+ step[4] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[5] * cospi_12_64 + output[6] * cospi_20_64;
+ step[5] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[6] * cospi_12_64 + output[5] * -cospi_20_64;
+ step[6] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[7] * cospi_28_64 + output[4] * -cospi_4_64;
+ step[7] = (tran_low_t)fdct_round_shift(temp);
+
+ range_check(step, 8, 16);
+
+ // stage 5
+ // final permutation of step[] into the output order
+ output[0] = step[0];
+ output[1] = step[4];
+ output[2] = step[2];
+ output[3] = step[6];
+ output[4] = step[1];
+ output[5] = step[5];
+ output[6] = step[3];
+ output[7] = step[7];
+
+ range_check(output, 8, 16);
+}
+
+// 16-point forward DCT (per the function name), evaluated as a fixed sequence
+// of butterfly stages.
+//
+// input:  16 coefficients; only read during stage 1.
+// output: receives the 16 results. It is also used as scratch between stages,
+//         so it must not alias input (stage 1 writes output[k] while input[k]
+//         is still needed to form output[15 - k]).
+//
+// Values ping-pong between output[] and the local step[] buffer. Rotations are
+// accumulated in the wider tran_high_t `temp` and narrowed back through
+// fdct_round_shift(). range_check() is called after every stage;
+// NOTE(review): its last argument looks like the bit width allowed at that
+// point - confirm against range_check's definition.
+static void fdct16(const tran_low_t *input, tran_low_t *output) {
+ tran_high_t temp;
+ tran_low_t step[16];
+
+ // stage 0: validate the input range
+ range_check(input, 16, 13);
+
+ // stage 1: fold the input about its midpoint
+ // (sums land in 0-7, differences in 8-15)
+ output[0] = input[0] + input[15];
+ output[1] = input[1] + input[14];
+ output[2] = input[2] + input[13];
+ output[3] = input[3] + input[12];
+ output[4] = input[4] + input[11];
+ output[5] = input[5] + input[10];
+ output[6] = input[6] + input[9];
+ output[7] = input[7] + input[8];
+ output[8] = input[7] - input[8];
+ output[9] = input[6] - input[9];
+ output[10] = input[5] - input[10];
+ output[11] = input[4] - input[11];
+ output[12] = input[3] - input[12];
+ output[13] = input[2] - input[13];
+ output[14] = input[1] - input[14];
+ output[15] = input[0] - input[15];
+
+ range_check(output, 16, 14);
+
+ // stage 2: fold 0-7 again; 10-13 are rotated with cospi_16_64;
+ // 8, 9, 14 and 15 pass through unchanged
+ step[0] = output[0] + output[7];
+ step[1] = output[1] + output[6];
+ step[2] = output[2] + output[5];
+ step[3] = output[3] + output[4];
+ step[4] = output[3] - output[4];
+ step[5] = output[2] - output[5];
+ step[6] = output[1] - output[6];
+ step[7] = output[0] - output[7];
+ step[8] = output[8];
+ step[9] = output[9];
+ temp = output[10] * -cospi_16_64 + output[13] * cospi_16_64;
+ step[10] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[11] * -cospi_16_64 + output[12] * cospi_16_64;
+ step[11] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[12] * cospi_16_64 + output[11] * cospi_16_64;
+ step[12] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[13] * cospi_16_64 + output[10] * cospi_16_64;
+ step[13] = (tran_low_t)fdct_round_shift(temp);
+ step[14] = output[14];
+ step[15] = output[15];
+
+ range_check(step, 16, 15);
+
+ // stage 3: fold 0-3, rotate 5/6, add/subtract butterflies on 8-15
+ output[0] = step[0] + step[3];
+ output[1] = step[1] + step[2];
+ output[2] = step[1] - step[2];
+ output[3] = step[0] - step[3];
+ output[4] = step[4];
+ temp = step[5] * -cospi_16_64 + step[6] * cospi_16_64;
+ output[5] = (tran_low_t)fdct_round_shift(temp);
+ temp = step[6] * cospi_16_64 + step[5] * cospi_16_64;
+ output[6] = (tran_low_t)fdct_round_shift(temp);
+ output[7] = step[7];
+ output[8] = step[8] + step[11];
+ output[9] = step[9] + step[10];
+ output[10] = step[9] - step[10];
+ output[11] = step[8] - step[11];
+ output[12] = step[15] - step[12];
+ output[13] = step[14] - step[13];
+ output[14] = step[14] + step[13];
+ output[15] = step[15] + step[12];
+
+ range_check(output, 16, 16);
+
+ // stage 4: rotations on 0-3 and on the 9/14 and 10/13 pairs;
+ // add/subtract butterflies on 4-7
+ temp = output[0] * cospi_16_64 + output[1] * cospi_16_64;
+ step[0] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[1] * -cospi_16_64 + output[0] * cospi_16_64;
+ step[1] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[2] * cospi_24_64 + output[3] * cospi_8_64;
+ step[2] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[3] * cospi_24_64 + output[2] * -cospi_8_64;
+ step[3] = (tran_low_t)fdct_round_shift(temp);
+ step[4] = output[4] + output[5];
+ step[5] = output[4] - output[5];
+ step[6] = output[7] - output[6];
+ step[7] = output[7] + output[6];
+ step[8] = output[8];
+ temp = output[9] * -cospi_8_64 + output[14] * cospi_24_64;
+ step[9] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[10] * -cospi_24_64 + output[13] * -cospi_8_64;
+ step[10] = (tran_low_t)fdct_round_shift(temp);
+ step[11] = output[11];
+ step[12] = output[12];
+ temp = output[13] * cospi_24_64 + output[10] * -cospi_8_64;
+ step[13] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[14] * cospi_8_64 + output[9] * cospi_24_64;
+ step[14] = (tran_low_t)fdct_round_shift(temp);
+ step[15] = output[15];
+
+ range_check(step, 16, 16);
+
+ // stage 5: 0-3 are final and pass through; rotate 4-7;
+ // add/subtract butterflies on 8-15
+ output[0] = step[0];
+ output[1] = step[1];
+ output[2] = step[2];
+ output[3] = step[3];
+ temp = step[4] * cospi_28_64 + step[7] * cospi_4_64;
+ output[4] = (tran_low_t)fdct_round_shift(temp);
+ temp = step[5] * cospi_12_64 + step[6] * cospi_20_64;
+ output[5] = (tran_low_t)fdct_round_shift(temp);
+ temp = step[6] * cospi_12_64 + step[5] * -cospi_20_64;
+ output[6] = (tran_low_t)fdct_round_shift(temp);
+ temp = step[7] * cospi_28_64 + step[4] * -cospi_4_64;
+ output[7] = (tran_low_t)fdct_round_shift(temp);
+ output[8] = step[8] + step[9];
+ output[9] = step[8] - step[9];
+ output[10] = step[11] - step[10];
+ output[11] = step[11] + step[10];
+ output[12] = step[12] + step[13];
+ output[13] = step[12] - step[13];
+ output[14] = step[15] - step[14];
+ output[15] = step[15] + step[14];
+
+ range_check(output, 16, 16);
+
+ // stage 6: 0-7 pass through; rotate the pairs (8,15), (9,14), (10,13), (11,12)
+ step[0] = output[0];
+ step[1] = output[1];
+ step[2] = output[2];
+ step[3] = output[3];
+ step[4] = output[4];
+ step[5] = output[5];
+ step[6] = output[6];
+ step[7] = output[7];
+ temp = output[8] * cospi_30_64 + output[15] * cospi_2_64;
+ step[8] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[9] * cospi_14_64 + output[14] * cospi_18_64;
+ step[9] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[10] * cospi_22_64 + output[13] * cospi_10_64;
+ step[10] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[11] * cospi_6_64 + output[12] * cospi_26_64;
+ step[11] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[12] * cospi_6_64 + output[11] * -cospi_26_64;
+ step[12] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[13] * cospi_22_64 + output[10] * -cospi_10_64;
+ step[13] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[14] * cospi_14_64 + output[9] * -cospi_18_64;
+ step[14] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[15] * cospi_30_64 + output[8] * -cospi_2_64;
+ step[15] = (tran_low_t)fdct_round_shift(temp);
+
+ range_check(step, 16, 16);
+
+ // stage 7: pure permutation - output[k] takes step[] at the 4-bit
+ // bit-reversal of k (1 <- 8, 2 <- 4, 3 <- 12, ...)
+ output[0] = step[0];
+ output[1] = step[8];
+ output[2] = step[4];
+ output[3] = step[12];
+ output[4] = step[2];
+ output[5] = step[10];
+ output[6] = step[6];
+ output[7] = step[14];
+ output[8] = step[1];
+ output[9] = step[9];
+ output[10] = step[5];
+ output[11] = step[13];
+ output[12] = step[3];
+ output[13] = step[11];
+ output[14] = step[7];
+ output[15] = step[15];
+
+ range_check(output, 16, 16);
+}
+
+// 32-point forward DCT (per the function name), evaluated as nine butterfly
+// stages.
+//
+// input:  32 coefficients; only read during stage 1.
+// output: receives the 32 results. It doubles as scratch between stages, so it
+//         must not alias input (stage 1 writes output[k] while input[k] is
+//         still needed to form output[31 - k]).
+//
+// Values ping-pong between output[] and the local step[] buffer. Rotations are
+// accumulated in the wider tran_high_t `temp` and narrowed back through
+// fdct_round_shift(). range_check() is called after every stage;
+// NOTE(review): its last argument looks like the bit width allowed at that
+// point - confirm against range_check's definition.
+static void fdct32(const tran_low_t *input, tran_low_t *output) {
+ tran_high_t temp;
+ tran_low_t step[32];
+
+ // stage 0: validate the input range
+ range_check(input, 32, 14);
+
+ // stage 1: fold the input about its midpoint
+ // (sums land in 0-15, differences in 16-31)
+ output[0] = input[0] + input[31];
+ output[1] = input[1] + input[30];
+ output[2] = input[2] + input[29];
+ output[3] = input[3] + input[28];
+ output[4] = input[4] + input[27];
+ output[5] = input[5] + input[26];
+ output[6] = input[6] + input[25];
+ output[7] = input[7] + input[24];
+ output[8] = input[8] + input[23];
+ output[9] = input[9] + input[22];
+ output[10] = input[10] + input[21];
+ output[11] = input[11] + input[20];
+ output[12] = input[12] + input[19];
+ output[13] = input[13] + input[18];
+ output[14] = input[14] + input[17];
+ output[15] = input[15] + input[16];
+ output[16] = input[15] - input[16];
+ output[17] = input[14] - input[17];
+ output[18] = input[13] - input[18];
+ output[19] = input[12] - input[19];
+ output[20] = input[11] - input[20];
+ output[21] = input[10] - input[21];
+ output[22] = input[9] - input[22];
+ output[23] = input[8] - input[23];
+ output[24] = input[7] - input[24];
+ output[25] = input[6] - input[25];
+ output[26] = input[5] - input[26];
+ output[27] = input[4] - input[27];
+ output[28] = input[3] - input[28];
+ output[29] = input[2] - input[29];
+ output[30] = input[1] - input[30];
+ output[31] = input[0] - input[31];
+
+ range_check(output, 32, 15);
+
+ // stage 2: fold 0-15 again; 20-27 are rotated with cospi_16_64;
+ // 16-19 and 28-31 pass through unchanged
+ step[0] = output[0] + output[15];
+ step[1] = output[1] + output[14];
+ step[2] = output[2] + output[13];
+ step[3] = output[3] + output[12];
+ step[4] = output[4] + output[11];
+ step[5] = output[5] + output[10];
+ step[6] = output[6] + output[9];
+ step[7] = output[7] + output[8];
+ step[8] = output[7] - output[8];
+ step[9] = output[6] - output[9];
+ step[10] = output[5] - output[10];
+ step[11] = output[4] - output[11];
+ step[12] = output[3] - output[12];
+ step[13] = output[2] - output[13];
+ step[14] = output[1] - output[14];
+ step[15] = output[0] - output[15];
+ step[16] = output[16];
+ step[17] = output[17];
+ step[18] = output[18];
+ step[19] = output[19];
+ temp = output[20] * -cospi_16_64 + output[27] * cospi_16_64;
+ step[20] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[21] * -cospi_16_64 + output[26] * cospi_16_64;
+ step[21] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[22] * -cospi_16_64 + output[25] * cospi_16_64;
+ step[22] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[23] * -cospi_16_64 + output[24] * cospi_16_64;
+ step[23] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[24] * cospi_16_64 + output[23] * cospi_16_64;
+ step[24] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[25] * cospi_16_64 + output[22] * cospi_16_64;
+ step[25] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[26] * cospi_16_64 + output[21] * cospi_16_64;
+ step[26] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[27] * cospi_16_64 + output[20] * cospi_16_64;
+ step[27] = (tran_low_t)fdct_round_shift(temp);
+ step[28] = output[28];
+ step[29] = output[29];
+ step[30] = output[30];
+ step[31] = output[31];
+
+ range_check(step, 32, 16);
+
+ // stage 3: fold 0-7, rotate 10-13 with cospi_16_64,
+ // add/subtract butterflies on 16-31
+ output[0] = step[0] + step[7];
+ output[1] = step[1] + step[6];
+ output[2] = step[2] + step[5];
+ output[3] = step[3] + step[4];
+ output[4] = step[3] - step[4];
+ output[5] = step[2] - step[5];
+ output[6] = step[1] - step[6];
+ output[7] = step[0] - step[7];
+ output[8] = step[8];
+ output[9] = step[9];
+ temp = step[10] * -cospi_16_64 + step[13] * cospi_16_64;
+ output[10] = (tran_low_t)fdct_round_shift(temp);
+ temp = step[11] * -cospi_16_64 + step[12] * cospi_16_64;
+ output[11] = (tran_low_t)fdct_round_shift(temp);
+ temp = step[12] * cospi_16_64 + step[11] * cospi_16_64;
+ output[12] = (tran_low_t)fdct_round_shift(temp);
+ temp = step[13] * cospi_16_64 + step[10] * cospi_16_64;
+ output[13] = (tran_low_t)fdct_round_shift(temp);
+ output[14] = step[14];
+ output[15] = step[15];
+ output[16] = step[16] + step[23];
+ output[17] = step[17] + step[22];
+ output[18] = step[18] + step[21];
+ output[19] = step[19] + step[20];
+ output[20] = step[19] - step[20];
+ output[21] = step[18] - step[21];
+ output[22] = step[17] - step[22];
+ output[23] = step[16] - step[23];
+ output[24] = step[31] - step[24];
+ output[25] = step[30] - step[25];
+ output[26] = step[29] - step[26];
+ output[27] = step[28] - step[27];
+ output[28] = step[28] + step[27];
+ output[29] = step[29] + step[26];
+ output[30] = step[30] + step[25];
+ output[31] = step[31] + step[24];
+
+ range_check(output, 32, 17);
+
+ // stage 4: fold 0-3, rotate 5/6, butterflies on 8-15,
+ // rotations with cospi_8_64 / cospi_24_64 on 18-21 and 26-29
+ step[0] = output[0] + output[3];
+ step[1] = output[1] + output[2];
+ step[2] = output[1] - output[2];
+ step[3] = output[0] - output[3];
+ step[4] = output[4];
+ temp = output[5] * -cospi_16_64 + output[6] * cospi_16_64;
+ step[5] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[6] * cospi_16_64 + output[5] * cospi_16_64;
+ step[6] = (tran_low_t)fdct_round_shift(temp);
+ step[7] = output[7];
+ step[8] = output[8] + output[11];
+ step[9] = output[9] + output[10];
+ step[10] = output[9] - output[10];
+ step[11] = output[8] - output[11];
+ step[12] = output[15] - output[12];
+ step[13] = output[14] - output[13];
+ step[14] = output[14] + output[13];
+ step[15] = output[15] + output[12];
+ step[16] = output[16];
+ step[17] = output[17];
+ temp = output[18] * -cospi_8_64 + output[29] * cospi_24_64;
+ step[18] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[19] * -cospi_8_64 + output[28] * cospi_24_64;
+ step[19] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[20] * -cospi_24_64 + output[27] * -cospi_8_64;
+ step[20] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[21] * -cospi_24_64 + output[26] * -cospi_8_64;
+ step[21] = (tran_low_t)fdct_round_shift(temp);
+ step[22] = output[22];
+ step[23] = output[23];
+ step[24] = output[24];
+ step[25] = output[25];
+ temp = output[26] * cospi_24_64 + output[21] * -cospi_8_64;
+ step[26] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[27] * cospi_24_64 + output[20] * -cospi_8_64;
+ step[27] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[28] * cospi_8_64 + output[19] * cospi_24_64;
+ step[28] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[29] * cospi_8_64 + output[18] * cospi_24_64;
+ step[29] = (tran_low_t)fdct_round_shift(temp);
+ step[30] = output[30];
+ step[31] = output[31];
+
+ range_check(step, 32, 18);
+
+ // stage 5: rotations on 0-3 and on the 9/14 and 10/13 pairs;
+ // add/subtract butterflies on 4-7 and 16-31
+ temp = step[0] * cospi_16_64 + step[1] * cospi_16_64;
+ output[0] = (tran_low_t)fdct_round_shift(temp);
+ temp = step[1] * -cospi_16_64 + step[0] * cospi_16_64;
+ output[1] = (tran_low_t)fdct_round_shift(temp);
+ temp = step[2] * cospi_24_64 + step[3] * cospi_8_64;
+ output[2] = (tran_low_t)fdct_round_shift(temp);
+ temp = step[3] * cospi_24_64 + step[2] * -cospi_8_64;
+ output[3] = (tran_low_t)fdct_round_shift(temp);
+ output[4] = step[4] + step[5];
+ output[5] = step[4] - step[5];
+ output[6] = step[7] - step[6];
+ output[7] = step[7] + step[6];
+ output[8] = step[8];
+ temp = step[9] * -cospi_8_64 + step[14] * cospi_24_64;
+ output[9] = (tran_low_t)fdct_round_shift(temp);
+ temp = step[10] * -cospi_24_64 + step[13] * -cospi_8_64;
+ output[10] = (tran_low_t)fdct_round_shift(temp);
+ output[11] = step[11];
+ output[12] = step[12];
+ temp = step[13] * cospi_24_64 + step[10] * -cospi_8_64;
+ output[13] = (tran_low_t)fdct_round_shift(temp);
+ temp = step[14] * cospi_8_64 + step[9] * cospi_24_64;
+ output[14] = (tran_low_t)fdct_round_shift(temp);
+ output[15] = step[15];
+ output[16] = step[16] + step[19];
+ output[17] = step[17] + step[18];
+ output[18] = step[17] - step[18];
+ output[19] = step[16] - step[19];
+ output[20] = step[23] - step[20];
+ output[21] = step[22] - step[21];
+ output[22] = step[22] + step[21];
+ output[23] = step[23] + step[20];
+ output[24] = step[24] + step[27];
+ output[25] = step[25] + step[26];
+ output[26] = step[25] - step[26];
+ output[27] = step[24] - step[27];
+ output[28] = step[31] - step[28];
+ output[29] = step[30] - step[29];
+ output[30] = step[30] + step[29];
+ output[31] = step[31] + step[28];
+
+ range_check(output, 32, 18);
+
+ // stage 6: 0-3 pass through; rotate 4-7; butterflies on 8-15;
+ // rotations on the 17/30, 18/29, 21/26 and 22/25 pairs
+ step[0] = output[0];
+ step[1] = output[1];
+ step[2] = output[2];
+ step[3] = output[3];
+ temp = output[4] * cospi_28_64 + output[7] * cospi_4_64;
+ step[4] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[5] * cospi_12_64 + output[6] * cospi_20_64;
+ step[5] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[6] * cospi_12_64 + output[5] * -cospi_20_64;
+ step[6] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[7] * cospi_28_64 + output[4] * -cospi_4_64;
+ step[7] = (tran_low_t)fdct_round_shift(temp);
+ step[8] = output[8] + output[9];
+ step[9] = output[8] - output[9];
+ step[10] = output[11] - output[10];
+ step[11] = output[11] + output[10];
+ step[12] = output[12] + output[13];
+ step[13] = output[12] - output[13];
+ step[14] = output[15] - output[14];
+ step[15] = output[15] + output[14];
+ step[16] = output[16];
+ temp = output[17] * -cospi_4_64 + output[30] * cospi_28_64;
+ step[17] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[18] * -cospi_28_64 + output[29] * -cospi_4_64;
+ step[18] = (tran_low_t)fdct_round_shift(temp);
+ step[19] = output[19];
+ step[20] = output[20];
+ temp = output[21] * -cospi_20_64 + output[26] * cospi_12_64;
+ step[21] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[22] * -cospi_12_64 + output[25] * -cospi_20_64;
+ step[22] = (tran_low_t)fdct_round_shift(temp);
+ step[23] = output[23];
+ step[24] = output[24];
+ temp = output[25] * cospi_12_64 + output[22] * -cospi_20_64;
+ step[25] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[26] * cospi_20_64 + output[21] * cospi_12_64;
+ step[26] = (tran_low_t)fdct_round_shift(temp);
+ step[27] = output[27];
+ step[28] = output[28];
+ temp = output[29] * cospi_28_64 + output[18] * -cospi_4_64;
+ step[29] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[30] * cospi_4_64 + output[17] * cospi_28_64;
+ step[30] = (tran_low_t)fdct_round_shift(temp);
+ step[31] = output[31];
+
+ range_check(step, 32, 18);
+
+ // stage 7: 0-7 pass through; rotate the pairs (8,15) ... (11,12);
+ // add/subtract butterflies on 16-31
+ output[0] = step[0];
+ output[1] = step[1];
+ output[2] = step[2];
+ output[3] = step[3];
+ output[4] = step[4];
+ output[5] = step[5];
+ output[6] = step[6];
+ output[7] = step[7];
+ temp = step[8] * cospi_30_64 + step[15] * cospi_2_64;
+ output[8] = (tran_low_t)fdct_round_shift(temp);
+ temp = step[9] * cospi_14_64 + step[14] * cospi_18_64;
+ output[9] = (tran_low_t)fdct_round_shift(temp);
+ temp = step[10] * cospi_22_64 + step[13] * cospi_10_64;
+ output[10] = (tran_low_t)fdct_round_shift(temp);
+ temp = step[11] * cospi_6_64 + step[12] * cospi_26_64;
+ output[11] = (tran_low_t)fdct_round_shift(temp);
+ temp = step[12] * cospi_6_64 + step[11] * -cospi_26_64;
+ output[12] = (tran_low_t)fdct_round_shift(temp);
+ temp = step[13] * cospi_22_64 + step[10] * -cospi_10_64;
+ output[13] = (tran_low_t)fdct_round_shift(temp);
+ temp = step[14] * cospi_14_64 + step[9] * -cospi_18_64;
+ output[14] = (tran_low_t)fdct_round_shift(temp);
+ temp = step[15] * cospi_30_64 + step[8] * -cospi_2_64;
+ output[15] = (tran_low_t)fdct_round_shift(temp);
+ output[16] = step[16] + step[17];
+ output[17] = step[16] - step[17];
+ output[18] = step[19] - step[18];
+ output[19] = step[19] + step[18];
+ output[20] = step[20] + step[21];
+ output[21] = step[20] - step[21];
+ output[22] = step[23] - step[22];
+ output[23] = step[23] + step[22];
+ output[24] = step[24] + step[25];
+ output[25] = step[24] - step[25];
+ output[26] = step[27] - step[26];
+ output[27] = step[27] + step[26];
+ output[28] = step[28] + step[29];
+ output[29] = step[28] - step[29];
+ output[30] = step[31] - step[30];
+ output[31] = step[31] + step[30];
+
+ range_check(output, 32, 18);
+
+ // stage 8: 0-15 pass through; rotate the pairs (16,31), (17,30) ... (23,24)
+ // with the odd-indexed cospi constants
+ step[0] = output[0];
+ step[1] = output[1];
+ step[2] = output[2];
+ step[3] = output[3];
+ step[4] = output[4];
+ step[5] = output[5];
+ step[6] = output[6];
+ step[7] = output[7];
+ step[8] = output[8];
+ step[9] = output[9];
+ step[10] = output[10];
+ step[11] = output[11];
+ step[12] = output[12];
+ step[13] = output[13];
+ step[14] = output[14];
+ step[15] = output[15];
+ temp = output[16] * cospi_31_64 + output[31] * cospi_1_64;
+ step[16] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[17] * cospi_15_64 + output[30] * cospi_17_64;
+ step[17] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[18] * cospi_23_64 + output[29] * cospi_9_64;
+ step[18] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[19] * cospi_7_64 + output[28] * cospi_25_64;
+ step[19] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[20] * cospi_27_64 + output[27] * cospi_5_64;
+ step[20] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[21] * cospi_11_64 + output[26] * cospi_21_64;
+ step[21] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[22] * cospi_19_64 + output[25] * cospi_13_64;
+ step[22] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[23] * cospi_3_64 + output[24] * cospi_29_64;
+ step[23] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[24] * cospi_3_64 + output[23] * -cospi_29_64;
+ step[24] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[25] * cospi_19_64 + output[22] * -cospi_13_64;
+ step[25] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[26] * cospi_11_64 + output[21] * -cospi_21_64;
+ step[26] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[27] * cospi_27_64 + output[20] * -cospi_5_64;
+ step[27] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[28] * cospi_7_64 + output[19] * -cospi_25_64;
+ step[28] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[29] * cospi_23_64 + output[18] * -cospi_9_64;
+ step[29] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[30] * cospi_15_64 + output[17] * -cospi_17_64;
+ step[30] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[31] * cospi_31_64 + output[16] * -cospi_1_64;
+ step[31] = (tran_low_t)fdct_round_shift(temp);
+
+ range_check(step, 32, 18);
+
+ // stage 9: pure permutation - output[k] takes step[] at the 5-bit
+ // bit-reversal of k (1 <- 16, 2 <- 8, 3 <- 24, ...)
+ output[0] = step[0];
+ output[1] = step[16];
+ output[2] = step[8];
+ output[3] = step[24];
+ output[4] = step[4];
+ output[5] = step[20];
+ output[6] = step[12];
+ output[7] = step[28];
+ output[8] = step[2];
+ output[9] = step[18];
+ output[10] = step[10];
+ output[11] = step[26];
+ output[12] = step[6];
+ output[13] = step[22];
+ output[14] = step[14];
+ output[15] = step[30];
+ output[16] = step[1];
+ output[17] = step[17];
+ output[18] = step[9];
+ output[19] = step[25];
+ output[20] = step[5];
+ output[21] = step[21];
+ output[22] = step[13];
+ output[23] = step[29];
+ output[24] = step[3];
+ output[25] = step[19];
+ output[26] = step[11];
+ output[27] = step[27];
+ output[28] = step[7];
+ output[29] = step[23];
+ output[30] = step[15];
+ output[31] = step[31];
+
+ range_check(output, 32, 18);
+}
+
+#ifndef AV1_DCT_GTEST
+
+// 4-point forward ADST (per the function name).
+//
+// All four inputs are latched into wide locals before anything is written, and
+// an all-zero input short-circuits to an all-zero output. Otherwise the inputs
+// are combined with the sinpi_k_9 constants and each result is narrowed
+// through fdct_round_shift().
+static void fadst4(const tran_low_t *input, tran_low_t *output) {
+  const tran_high_t in0 = input[0];
+  const tran_high_t in1 = input[1];
+  const tran_high_t in2 = input[2];
+  const tran_high_t in3 = input[3];
+  tran_high_t acc_lo, acc_mid, acc_hi, in2_term;
+
+  if ((in0 | in1 | in2 | in3) == 0) {
+    output[0] = 0;
+    output[1] = 0;
+    output[2] = 0;
+    output[3] = 0;
+    return;
+  }
+
+  // Weighted sums of inputs 0, 1 and 3; input 2 only ever appears scaled by
+  // sinpi_3_9.
+  acc_lo = sinpi_1_9 * in0 + sinpi_2_9 * in1 + sinpi_4_9 * in3;
+  acc_mid = sinpi_3_9 * (in0 + in1 - in3);
+  acc_hi = sinpi_4_9 * in0 - sinpi_1_9 * in1 + sinpi_2_9 * in3;
+  in2_term = sinpi_3_9 * in2;
+
+  // 1-D transform scaling factor is sqrt(2).
+  output[0] = (tran_low_t)fdct_round_shift(acc_lo + in2_term);
+  output[1] = (tran_low_t)fdct_round_shift(acc_mid);
+  output[2] = (tran_low_t)fdct_round_shift(acc_hi - in2_term);
+  output[3] = (tran_low_t)fdct_round_shift(acc_hi - acc_lo + in2_term);
+}
+
+// 8-point forward ADST (per the function name).
+//
+// The eight inputs are loaded into x0..x7 in the permuted order
+// 7, 0, 5, 2, 3, 4, 1, 6 and then pushed through three rotation stages. The
+// s* variables hold un-shifted products; x* values are narrowed with
+// fdct_round_shift() as soon as no further addition at full precision is
+// needed. Outputs are written in a permuted order with x4, x2, x7 and x1
+// negated.
+static void fadst8(const tran_low_t *input, tran_low_t *output) {
+ tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
+
+ // Load in permuted order; all reads of input happen here.
+ tran_high_t x0 = input[7];
+ tran_high_t x1 = input[0];
+ tran_high_t x2 = input[5];
+ tran_high_t x3 = input[2];
+ tran_high_t x4 = input[3];
+ tran_high_t x5 = input[4];
+ tran_high_t x6 = input[1];
+ tran_high_t x7 = input[6];
+
+ // stage 1: rotate each (even, odd) pair
+ s0 = cospi_2_64 * x0 + cospi_30_64 * x1;
+ s1 = cospi_30_64 * x0 - cospi_2_64 * x1;
+ s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
+ s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
+ s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
+ s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
+ s6 = cospi_26_64 * x6 + cospi_6_64 * x7;
+ s7 = cospi_6_64 * x6 - cospi_26_64 * x7;
+
+ // The sums x0..x3 stay un-shifted here; they are shifted once in stage 2.
+ x0 = s0 + s4;
+ x1 = s1 + s5;
+ x2 = s2 + s6;
+ x3 = s3 + s7;
+ x4 = fdct_round_shift(s0 - s4);
+ x5 = fdct_round_shift(s1 - s5);
+ x6 = fdct_round_shift(s2 - s6);
+ x7 = fdct_round_shift(s3 - s7);
+
+ // stage 2: 0-3 pass through, 4-7 are rotated with cospi_8_64 / cospi_24_64
+ s0 = x0;
+ s1 = x1;
+ s2 = x2;
+ s3 = x3;
+ s4 = cospi_8_64 * x4 + cospi_24_64 * x5;
+ s5 = cospi_24_64 * x4 - cospi_8_64 * x5;
+ s6 = -cospi_24_64 * x6 + cospi_8_64 * x7;
+ s7 = cospi_8_64 * x6 + cospi_24_64 * x7;
+
+ x0 = fdct_round_shift(s0 + s2);
+ x1 = fdct_round_shift(s1 + s3);
+ x2 = fdct_round_shift(s0 - s2);
+ x3 = fdct_round_shift(s1 - s3);
+ x4 = fdct_round_shift(s4 + s6);
+ x5 = fdct_round_shift(s5 + s7);
+ x6 = fdct_round_shift(s4 - s6);
+ x7 = fdct_round_shift(s5 - s7);
+
+ // stage 3: sum/difference of (x2, x3) and (x6, x7), scaled by cospi_16_64
+ s2 = cospi_16_64 * (x2 + x3);
+ s3 = cospi_16_64 * (x2 - x3);
+ s6 = cospi_16_64 * (x6 + x7);
+ s7 = cospi_16_64 * (x6 - x7);
+
+ x2 = fdct_round_shift(s2);
+ x3 = fdct_round_shift(s3);
+ x6 = fdct_round_shift(s6);
+ x7 = fdct_round_shift(s7);
+
+ // Write results in output order; odd output positions are negated.
+ output[0] = (tran_low_t)x0;
+ output[1] = (tran_low_t)-x4;
+ output[2] = (tran_low_t)x6;
+ output[3] = (tran_low_t)-x2;
+ output[4] = (tran_low_t)x3;
+ output[5] = (tran_low_t)-x7;
+ output[6] = (tran_low_t)x5;
+ output[7] = (tran_low_t)-x1;
+}
+
+// 16-point forward ADST (per the function name).
+//
+// The sixteen inputs are loaded into x0..x15 in the permuted order
+// 15, 0, 13, 2, 11, 4, 9, 6, 7, 8, 5, 10, 3, 12, 1, 14 and then pushed through
+// four rotation stages. The s* variables hold un-shifted products; x* values
+// are narrowed with fdct_round_shift() once no further full-precision addition
+// is pending. Outputs are written in a permuted order with x8, x4, x13 and x1
+// negated.
+static void fadst16(const tran_low_t *input, tran_low_t *output) {
+ tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
+ tran_high_t s9, s10, s11, s12, s13, s14, s15;
+
+ // Load in permuted order; all reads of input happen here.
+ tran_high_t x0 = input[15];
+ tran_high_t x1 = input[0];
+ tran_high_t x2 = input[13];
+ tran_high_t x3 = input[2];
+ tran_high_t x4 = input[11];
+ tran_high_t x5 = input[4];
+ tran_high_t x6 = input[9];
+ tran_high_t x7 = input[6];
+ tran_high_t x8 = input[7];
+ tran_high_t x9 = input[8];
+ tran_high_t x10 = input[5];
+ tran_high_t x11 = input[10];
+ tran_high_t x12 = input[3];
+ tran_high_t x13 = input[12];
+ tran_high_t x14 = input[1];
+ tran_high_t x15 = input[14];
+
+ // stage 1: rotate each (even, odd) pair with the odd-indexed cospi constants
+ s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
+ s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
+ s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
+ s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
+ s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
+ s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
+ s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
+ s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
+ s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
+ s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
+ s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
+ s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
+ s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
+ s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
+ s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
+ s15 = x14 * cospi_3_64 - x15 * cospi_29_64;
+
+ // The sums x0..x7 stay un-shifted; they are first shifted in stage 2 or 3.
+ x0 = s0 + s8;
+ x1 = s1 + s9;
+ x2 = s2 + s10;
+ x3 = s3 + s11;
+ x4 = s4 + s12;
+ x5 = s5 + s13;
+ x6 = s6 + s14;
+ x7 = s7 + s15;
+
+ x8 = fdct_round_shift(s0 - s8);
+ x9 = fdct_round_shift(s1 - s9);
+ x10 = fdct_round_shift(s2 - s10);
+ x11 = fdct_round_shift(s3 - s11);
+ x12 = fdct_round_shift(s4 - s12);
+ x13 = fdct_round_shift(s5 - s13);
+ x14 = fdct_round_shift(s6 - s14);
+ x15 = fdct_round_shift(s7 - s15);
+
+ // stage 2: 0-7 pass through; 8-15 are rotated
+ s0 = x0;
+ s1 = x1;
+ s2 = x2;
+ s3 = x3;
+ s4 = x4;
+ s5 = x5;
+ s6 = x6;
+ s7 = x7;
+ s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
+ s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
+ s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
+ s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
+ s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
+ s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
+ s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
+ s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
+
+ x0 = s0 + s4;
+ x1 = s1 + s5;
+ x2 = s2 + s6;
+ x3 = s3 + s7;
+ x4 = fdct_round_shift(s0 - s4);
+ x5 = fdct_round_shift(s1 - s5);
+ x6 = fdct_round_shift(s2 - s6);
+ x7 = fdct_round_shift(s3 - s7);
+
+ x8 = s8 + s12;
+ x9 = s9 + s13;
+ x10 = s10 + s14;
+ x11 = s11 + s15;
+ x12 = fdct_round_shift(s8 - s12);
+ x13 = fdct_round_shift(s9 - s13);
+ x14 = fdct_round_shift(s10 - s14);
+ x15 = fdct_round_shift(s11 - s15);
+
+ // stage 3: 0-3 and 8-11 pass through; 4-7 and 12-15 are rotated with
+ // cospi_8_64 / cospi_24_64
+ s0 = x0;
+ s1 = x1;
+ s2 = x2;
+ s3 = x3;
+ s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
+ s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
+ s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
+ s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
+ s8 = x8;
+ s9 = x9;
+ s10 = x10;
+ s11 = x11;
+ s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
+ s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
+ s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
+ s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
+
+ x0 = fdct_round_shift(s0 + s2);
+ x1 = fdct_round_shift(s1 + s3);
+ x2 = fdct_round_shift(s0 - s2);
+ x3 = fdct_round_shift(s1 - s3);
+
+ x4 = fdct_round_shift(s4 + s6);
+ x5 = fdct_round_shift(s5 + s7);
+ x6 = fdct_round_shift(s4 - s6);
+ x7 = fdct_round_shift(s5 - s7);
+
+ x8 = fdct_round_shift(s8 + s10);
+ x9 = fdct_round_shift(s9 + s11);
+ x10 = fdct_round_shift(s8 - s10);
+ x11 = fdct_round_shift(s9 - s11);
+
+ x12 = fdct_round_shift(s12 + s14);
+ x13 = fdct_round_shift(s13 + s15);
+ x14 = fdct_round_shift(s12 - s14);
+ x15 = fdct_round_shift(s13 - s15);
+
+ // stage 4: sum/difference of the remaining pairs, scaled by +/-cospi_16_64.
+ // The signs are folded in here, so these values are written out un-negated.
+ s2 = (-cospi_16_64) * (x2 + x3);
+ s3 = cospi_16_64 * (x2 - x3);
+ s6 = cospi_16_64 * (x6 + x7);
+ s7 = cospi_16_64 * (-x6 + x7);
+ s10 = cospi_16_64 * (x10 + x11);
+ s11 = cospi_16_64 * (-x10 + x11);
+ s14 = (-cospi_16_64) * (x14 + x15);
+ s15 = cospi_16_64 * (x14 - x15);
+
+ x2 = fdct_round_shift(s2);
+ x3 = fdct_round_shift(s3);
+ x6 = fdct_round_shift(s6);
+ x7 = fdct_round_shift(s7);
+ x10 = fdct_round_shift(s10);
+ x11 = fdct_round_shift(s11);
+ x14 = fdct_round_shift(s14);
+ x15 = fdct_round_shift(s15);
+
+ // Write results in output order; x8, x4, x13 and x1 are negated.
+ output[0] = (tran_low_t)x0;
+ output[1] = (tran_low_t)-x8;
+ output[2] = (tran_low_t)x12;
+ output[3] = (tran_low_t)-x4;
+ output[4] = (tran_low_t)x6;
+ output[5] = (tran_low_t)x14;
+ output[6] = (tran_low_t)x10;
+ output[7] = (tran_low_t)x2;
+ output[8] = (tran_low_t)x3;
+ output[9] = (tran_low_t)x11;
+ output[10] = (tran_low_t)x15;
+ output[11] = (tran_low_t)x7;
+ output[12] = (tran_low_t)x5;
+ output[13] = (tran_low_t)-x13;
+ output[14] = (tran_low_t)x9;
+ output[15] = (tran_low_t)-x1;
+}
+
+// Stand-in used where a 32-point ADST would otherwise be needed.
+//
+// The first 16 inputs are copied, multiplied by 4, into the upper half of
+// output. The last 16 inputs are multiplied by sqrt(2) and run through
+// fdct16(), which fills the lower half of output.
+static void fhalfright32(const tran_low_t *input, tran_low_t *output) {
+  tran_low_t upper_scaled[16];
+  int k;
+
+  // output[16..31] <- 4 * input[0..15]
+  for (k = 0; k < 16; ++k) {
+    output[k + 16] = input[k] * 4;
+  }
+
+  // upper_scaled[0..15] <- sqrt(2) * input[16..31]
+  for (k = 16; k < 32; ++k) {
+    upper_scaled[k - 16] = (tran_low_t)fdct_round_shift(input[k] * Sqrt2);
+  }
+
+  // Note overall scaling factor is 4 times orthogonal
+  fdct16(upper_scaled, output);
+}
+
+#if CONFIG_EXT_TX
+// 4-point identity "transform": every input is multiplied by Sqrt2 and
+// narrowed through fdct_round_shift().
+static void fidtx4(const tran_low_t *input, tran_low_t *output) {
+  output[0] = (tran_low_t)fdct_round_shift(input[0] * Sqrt2);
+  output[1] = (tran_low_t)fdct_round_shift(input[1] * Sqrt2);
+  output[2] = (tran_low_t)fdct_round_shift(input[2] * Sqrt2);
+  output[3] = (tran_low_t)fdct_round_shift(input[3] * Sqrt2);
+}
+
+// 8-point identity "transform": every input is doubled.
+static void fidtx8(const tran_low_t *input, tran_low_t *output) {
+  int k = 0;
+  while (k < 8) {
+    output[k] = input[k] * 2;
+    ++k;
+  }
+}
+
+// 16-point identity "transform": every input is multiplied by 2 * Sqrt2 and
+// narrowed through fdct_round_shift().
+static void fidtx16(const tran_low_t *input, tran_low_t *output) {
+  int k = 0;
+  while (k < 16) {
+    output[k] = (tran_low_t)fdct_round_shift(input[k] * 2 * Sqrt2);
+    ++k;
+  }
+}
+
+// 32-point identity "transform": every input is multiplied by 4.
+static void fidtx32(const tran_low_t *input, tran_low_t *output) {
+  int k = 0;
+  while (k < 32) {
+    output[k] = input[k] * 4;
+    ++k;
+  }
+}
+
+// Copies an l-row by w-column block of int16_t samples from src to dest.
+// Consecutive rows are src_stride / dest_stride elements apart. Each row is
+// moved with memcpy(), so the two blocks must not overlap.
+static void copy_block(const int16_t *src, int src_stride, int l, int w,
+                       int16_t *dest, int dest_stride) {
+  int row = 0;
+  while (row < l) {
+    const int16_t *const from = &src[src_stride * row];
+    int16_t *const to = &dest[dest_stride * row];
+    memcpy(to, from, w * sizeof(int16_t));
+    ++row;
+  }
+}
+
+// Mirrors an l-row by w-column block left-to-right, in place. Rows are
+// `stride` elements apart; within each row the two ends are swapped while
+// walking inward, and the centre column of an odd-width row stays put.
+static void fliplr(int16_t *dest, int stride, int l, int w) {
+  int row;
+  for (row = 0; row < l; ++row) {
+    const int base = row * stride;
+    int left = 0;
+    int right = w - 1;
+    while (left < right) {
+      const int16_t held = dest[base + left];
+      dest[base + left] = dest[base + right];
+      dest[base + right] = held;
+      ++left;
+      --right;
+    }
+  }
+}
+
+// Mirrors an l-row by w-column block top-to-bottom, in place. Rows are
+// `stride` elements apart; row k is exchanged with row l - 1 - k, and the
+// centre row of an odd-height block stays put.
+static void flipud(int16_t *dest, int stride, int l, int w) {
+  int top = 0;
+  int bottom = l - 1;
+  while (top < bottom) {
+    int col;
+    for (col = 0; col < w; ++col) {
+      const int16_t held = dest[top * stride + col];
+      dest[top * stride + col] = dest[bottom * stride + col];
+      dest[bottom * stride + col] = held;
+    }
+    ++top;
+    --bottom;
+  }
+}
+
+// Rotates an l-row by w-column block by 180 degrees in place, i.e. mirrors it
+// both left-to-right and top-to-bottom. Rows are `stride` elements apart.
+//
+// Element (i, j) is exchanged with element (l - 1 - i, w - 1 - j) for every
+// row in the upper half. When l is odd, the middle row is its own partner
+// row, so it is reversed in place separately; without that step it would be
+// left un-mirrored.
+static void fliplrud(int16_t *dest, int stride, int l, int w) {
+  int i, j;
+  for (i = 0; i < l / 2; ++i) {
+    for (j = 0; j < w; ++j) {
+      const int16_t tmp = dest[i * stride + j];
+      dest[i * stride + j] = dest[(l - 1 - i) * stride + w - 1 - j];
+      dest[(l - 1 - i) * stride + w - 1 - j] = tmp;
+    }
+  }
+  if (l > 0 && (l & 1)) {
+    // Odd height: reverse the centre row, which the loop above never visits.
+    const int mid = (l / 2) * stride;
+    for (j = 0; j < w / 2; ++j) {
+      const int16_t tmp = dest[mid + j];
+      dest[mid + j] = dest[mid + w - 1 - j];
+      dest[mid + w - 1 - j] = tmp;
+    }
+  }
+}
+
+// Copies an l-row by w-column block from src (row pitch src_stride) into dest
+// (row pitch dest_stride) with copy_block(), then mirrors the copy
+// left-to-right in place with fliplr(). src itself is not modified.
+static void copy_fliplr(const int16_t *src, int src_stride, int l, int w,
+ int16_t *dest, int dest_stride) {
+ copy_block(src, src_stride, l, w, dest, dest_stride);
+ fliplr(dest, dest_stride, l, w);
+}
+
+// Copies an l-row by w-column block from src (row pitch src_stride) into dest
+// (row pitch dest_stride) with copy_block(), then mirrors the copy
+// top-to-bottom in place with flipud(). src itself is not modified.
+static void copy_flipud(const int16_t *src, int src_stride, int l, int w,
+ int16_t *dest, int dest_stride) {
+ copy_block(src, src_stride, l, w, dest, dest_stride);
+ flipud(dest, dest_stride, l, w);
+}
+
+// Copies an l-row by w-column block from src (row pitch src_stride) into dest
+// (row pitch dest_stride) with copy_block(), then applies fliplrud() to the
+// copy in place. src itself is not modified.
+static void copy_fliplrud(const int16_t *src, int src_stride, int l, int w,
+ int16_t *dest, int dest_stride) {
+ copy_block(src, src_stride, l, w, dest, dest_stride);
+ fliplrud(dest, dest_stride, l, w);
+}
+
+// For the FLIPADST transform types, writes a flipped copy of the l x w source
+// block into buff (row pitch w) and redirects *src / *src_stride to that copy.
+// For every other known type the caller's pointer and stride are left alone.
+//
+//   FLIPADST_DCT, FLIPADST_ADST, V_FLIPADST  -> copy_flipud()
+//   DCT_FLIPADST, ADST_FLIPADST, H_FLIPADST  -> copy_fliplr()
+//   FLIPADST_FLIPADST                        -> copy_fliplrud()
+//
+// An unknown tx_type trips assert(0) and otherwise changes nothing.
+static void maybe_flip_input(const int16_t **src, int *src_stride, int l, int w,
+                             int16_t *buff, int tx_type) {
+  switch (tx_type) {
+    case FLIPADST_DCT:
+    case FLIPADST_ADST:
+    case V_FLIPADST:
+      copy_flipud(*src, *src_stride, l, w, buff, w);
+      break;
+    case DCT_FLIPADST:
+    case ADST_FLIPADST:
+    case H_FLIPADST:
+      copy_fliplr(*src, *src_stride, l, w, buff, w);
+      break;
+    case FLIPADST_FLIPADST:
+      copy_fliplrud(*src, *src_stride, l, w, buff, w);
+      break;
+    case DCT_DCT:
+    case ADST_DCT:
+    case DCT_ADST:
+    case ADST_ADST:
+    case IDTX:
+    case V_DCT:
+    case H_DCT:
+    case V_ADST:
+    case H_ADST:
+      // No flip needed: keep reading from the caller's buffer.
+      return;
+    default:
+      assert(0);
+      return;
+  }
+
+  // A flipped copy was produced; hand it back in place of the original.
+  *src = buff;
+  *src_stride = w;
+}
+#endif // CONFIG_EXT_TX
+
+// Forward 4x4 hybrid transform (C reference path).
+//
+// input:   4x4 block of int16_t samples, rows `stride` elements apart.
+// output:  16 coefficients, written row-major.
+// tx_type: selects the column/row 1-D transform pair from the FHT table.
+//          DCT_DCT is handed off to aom_fdct4x4_c(); tx_type is used as a
+//          table index without a bounds check.
+//
+// For the non-DCT_DCT types the columns are transformed first (inputs
+// pre-multiplied by 16), then the rows, and each result is reduced with
+// (x + 1) >> 2.
+void av1_fht4x4_c(const int16_t *input, tran_low_t *output, int stride,
+                  int tx_type) {
+  if (tx_type != DCT_DCT) {
+    static const transform_2d FHT[] = {
+      { fdct4, fdct4 },    // DCT_DCT
+      { fadst4, fdct4 },   // ADST_DCT
+      { fdct4, fadst4 },   // DCT_ADST
+      { fadst4, fadst4 },  // ADST_ADST
+#if CONFIG_EXT_TX
+      { fadst4, fdct4 },   // FLIPADST_DCT
+      { fdct4, fadst4 },   // DCT_FLIPADST
+      { fadst4, fadst4 },  // FLIPADST_FLIPADST
+      { fadst4, fadst4 },  // ADST_FLIPADST
+      { fadst4, fadst4 },  // FLIPADST_ADST
+      { fidtx4, fidtx4 },  // IDTX
+      { fdct4, fidtx4 },   // V_DCT
+      { fidtx4, fdct4 },   // H_DCT
+      { fadst4, fidtx4 },  // V_ADST
+      { fidtx4, fadst4 },  // H_ADST
+      { fadst4, fidtx4 },  // V_FLIPADST
+      { fidtx4, fadst4 },  // H_FLIPADST
+#endif  // CONFIG_EXT_TX
+    };
+    const transform_2d ht = FHT[tx_type];
+    tran_low_t col_pass[4 * 4];
+    tran_low_t line_in[4], line_out[4];
+    int r, c;
+
+#if CONFIG_EXT_TX
+    int16_t flipped_input[4 * 4];
+    maybe_flip_input(&input, &stride, 4, 4, flipped_input, tx_type);
+#endif
+
+    // Column pass: gather column c, scale by 16, transform, scatter back.
+    for (c = 0; c < 4; ++c) {
+      for (r = 0; r < 4; ++r) {
+        line_in[r] = input[r * stride + c] * 16;
+      }
+      // The very first sample of the block gets +1 when it is non-zero.
+      if (c == 0 && line_in[0] != 0) {
+        line_in[0] += 1;
+      }
+      ht.cols(line_in, line_out);
+      for (r = 0; r < 4; ++r) {
+        col_pass[r * 4 + c] = line_out[r];
+      }
+    }
+
+    // Row pass: transform each row of the intermediate and scale down.
+    for (r = 0; r < 4; ++r) {
+      for (c = 0; c < 4; ++c) {
+        line_in[c] = col_pass[c + r * 4];
+      }
+      ht.rows(line_in, line_out);
+      for (c = 0; c < 4; ++c) {
+        output[c + r * 4] = (line_out[c] + 1) >> 2;
+      }
+    }
+  } else {
+    aom_fdct4x4_c(input, output, stride);
+  }
+}
+
+// Forward hybrid transform for a block 4 samples wide and 8 samples tall
+// (C reference path).
+//
+// input:   int16_t samples, 8 rows of 4, rows `stride` elements apart.
+// output:  32 coefficients, written with a row pitch of 4.
+// tx_type: selects the (8-point column, 4-point row) transform pair from the
+//          FHT table; it is used as a table index without a bounds check.
+//
+// Rows are transformed first, with each sample pre-scaled by 4 * Sqrt2 and
+// narrowed through fdct_round_shift(). Columns follow, and each result is
+// halved with (x + (x < 0)) >> 1.
+void av1_fht4x8_c(const int16_t *input, tran_low_t *output, int stride,
+                  int tx_type) {
+  static const transform_2d FHT[] = {
+    { fdct8, fdct4 },    // DCT_DCT
+    { fadst8, fdct4 },   // ADST_DCT
+    { fdct8, fadst4 },   // DCT_ADST
+    { fadst8, fadst4 },  // ADST_ADST
+#if CONFIG_EXT_TX
+    { fadst8, fdct4 },   // FLIPADST_DCT
+    { fdct8, fadst4 },   // DCT_FLIPADST
+    { fadst8, fadst4 },  // FLIPADST_FLIPADST
+    { fadst8, fadst4 },  // ADST_FLIPADST
+    { fadst8, fadst4 },  // FLIPADST_ADST
+    { fidtx8, fidtx4 },  // IDTX
+    { fdct8, fidtx4 },   // V_DCT
+    { fidtx8, fdct4 },   // H_DCT
+    { fadst8, fidtx4 },  // V_ADST
+    { fidtx8, fadst4 },  // H_ADST
+    { fadst8, fidtx4 },  // V_FLIPADST
+    { fidtx8, fadst4 },  // H_FLIPADST
+#endif
+  };
+  const transform_2d ht = FHT[tx_type];
+  const int width = 4;
+  const int height = 8;
+  tran_low_t row_pass[8 * 4];
+  tran_low_t line_in[8], line_out[8];
+  int r, c;
+#if CONFIG_EXT_TX
+  int16_t flipped_input[8 * 4];
+  maybe_flip_input(&input, &stride, height, width, flipped_input, tx_type);
+#endif
+
+  // Row pass: results are stored transposed so each column is contiguous.
+  for (r = 0; r < height; ++r) {
+    for (c = 0; c < width; ++c) {
+      line_in[c] =
+          (tran_low_t)fdct_round_shift(input[r * stride + c] * 4 * Sqrt2);
+    }
+    ht.rows(line_in, line_out);
+    for (c = 0; c < width; ++c) {
+      row_pass[c * height + r] = line_out[c];
+    }
+  }
+
+  // Column pass: transform each stored column and halve the result.
+  for (c = 0; c < width; ++c) {
+    for (r = 0; r < height; ++r) {
+      line_in[r] = row_pass[r + c * height];
+    }
+    ht.cols(line_in, line_out);
+    for (r = 0; r < height; ++r) {
+      output[c + r * width] = (line_out[r] + (line_out[r] < 0)) >> 1;
+    }
+  }
+  // Note: overall scale factor of transform is 8 times unitary
+}
+/* Forward hybrid transform for a block 8 samples wide and 4 samples tall.
+ *
+ * input is read as input[row * stride + col]; output is written as
+ * output[col + row * 8]. tx_type indexes FHT[] directly (no range check).
+ * The column pass runs first, then the row pass.
+ */
+void av1_fht8x4_c(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+ static const transform_2d FHT[] = {
+ { fdct4, fdct8 }, // DCT_DCT
+ { fadst4, fdct8 }, // ADST_DCT
+ { fdct4, fadst8 }, // DCT_ADST
+ { fadst4, fadst8 }, // ADST_ADST
+#if CONFIG_EXT_TX
+ { fadst4, fdct8 }, // FLIPADST_DCT
+ { fdct4, fadst8 }, // DCT_FLIPADST
+ { fadst4, fadst8 }, // FLIPADST_FLIPADST
+ { fadst4, fadst8 }, // ADST_FLIPADST
+ { fadst4, fadst8 }, // FLIPADST_ADST
+ { fidtx4, fidtx8 }, // IDTX
+ { fdct4, fidtx8 }, // V_DCT
+ { fidtx4, fdct8 }, // H_DCT
+ { fadst4, fidtx8 }, // V_ADST
+ { fidtx4, fadst8 }, // H_ADST
+ { fadst4, fidtx8 }, // V_FLIPADST
+ { fidtx4, fadst8 }, // H_FLIPADST
+#endif
+ };
+ const transform_2d ht = FHT[tx_type];
+ const int n = 4;
+ const int n2 = 8;
+ tran_low_t out[8 * 4];
+ tran_low_t temp_in[8], temp_out[8];
+ int i, j;
+#if CONFIG_EXT_TX
+ int16_t flipped_input[8 * 4];
+ maybe_flip_input(&input, &stride, n, n2, flipped_input, tx_type);
+#endif
+
+ /* Columns: each of the 8 input columns (4 samples) is scaled by 4 * Sqrt2
+  * through fdct_round_shift(), transformed with ht.cols and stored in the
+  * same column of the row-major 4x8 buffer out[]. */
+ for (i = 0; i < n2; ++i) {
+ for (j = 0; j < n; ++j)
+ temp_in[j] =
+ (tran_low_t)fdct_round_shift(input[j * stride + i] * 4 * Sqrt2);
+ ht.cols(temp_in, temp_out);
+ for (j = 0; j < n; ++j) out[j * n2 + i] = temp_out[j];
+ }
+
+ /* Rows: run ht.rows on each 8-entry row of out[]; the result is written as
+  * (x + (x < 0)) >> 1. */
+ for (i = 0; i < n; ++i) {
+ for (j = 0; j < n2; ++j) temp_in[j] = out[j + i * n2];
+ ht.rows(temp_in, temp_out);
+ for (j = 0; j < n2; ++j)
+ output[j + i * n2] = (temp_out[j] + (temp_out[j] < 0)) >> 1;
+ }
+ // Note: overall scale factor of transform is 8 times unitary
+}
+/* Forward hybrid transform for a block 4 samples wide and 16 samples tall.
+ *
+ * input is read as input[row * stride + col]; output is written as
+ * output[col + row * 4]. tx_type indexes FHT[] directly (no range check).
+ * The row pass runs before the column pass.
+ */
+void av1_fht4x16_c(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+ static const transform_2d FHT[] = {
+ { fdct16, fdct4 }, // DCT_DCT
+ { fadst16, fdct4 }, // ADST_DCT
+ { fdct16, fadst4 }, // DCT_ADST
+ { fadst16, fadst4 }, // ADST_ADST
+#if CONFIG_EXT_TX
+ { fadst16, fdct4 }, // FLIPADST_DCT
+ { fdct16, fadst4 }, // DCT_FLIPADST
+ { fadst16, fadst4 }, // FLIPADST_FLIPADST
+ { fadst16, fadst4 }, // ADST_FLIPADST
+ { fadst16, fadst4 }, // FLIPADST_ADST
+ { fidtx16, fidtx4 }, // IDTX
+ { fdct16, fidtx4 }, // V_DCT
+ { fidtx16, fdct4 }, // H_DCT
+ { fadst16, fidtx4 }, // V_ADST
+ { fidtx16, fadst4 }, // H_ADST
+ { fadst16, fidtx4 }, // V_FLIPADST
+ { fidtx16, fadst4 }, // H_FLIPADST
+#endif
+ };
+ const transform_2d ht = FHT[tx_type];
+ const int n = 4;
+ const int n4 = 16;
+ tran_low_t out[16 * 4];
+ tran_low_t temp_in[16], temp_out[16];
+ int i, j;
+#if CONFIG_EXT_TX
+ int16_t flipped_input[16 * 4];
+ maybe_flip_input(&input, &stride, n4, n, flipped_input, tx_type);
+#endif
+
+ /* Rows: each of the 16 input rows (4 samples) is scaled by 4, transformed
+  * with ht.rows and stored transposed, so out[] holds one 16-entry run per
+  * output column. */
+ for (i = 0; i < n4; ++i) {
+ for (j = 0; j < n; ++j) temp_in[j] = input[i * stride + j] * 4;
+ ht.rows(temp_in, temp_out);
+ for (j = 0; j < n; ++j) out[j * n4 + i] = temp_out[j];
+ }
+
+ /* Columns: run ht.cols on each 16-entry run of out[]; the result is written
+  * as (x + (x < 0)) >> 1. */
+ for (i = 0; i < n; ++i) {
+ for (j = 0; j < n4; ++j) temp_in[j] = out[j + i * n4];
+ ht.cols(temp_in, temp_out);
+ for (j = 0; j < n4; ++j)
+ output[i + j * n] = (temp_out[j] + (temp_out[j] < 0)) >> 1;
+ }
+ // Note: overall scale factor of transform is 8 times unitary
+}
+/* Forward hybrid transform for a block 16 samples wide and 4 samples tall.
+ *
+ * input is read as input[row * stride + col]; output is written as
+ * output[col + row * 16]. tx_type indexes FHT[] directly (no range check).
+ * The column pass runs first, then the row pass.
+ */
+void av1_fht16x4_c(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+ static const transform_2d FHT[] = {
+ { fdct4, fdct16 }, // DCT_DCT
+ { fadst4, fdct16 }, // ADST_DCT
+ { fdct4, fadst16 }, // DCT_ADST
+ { fadst4, fadst16 }, // ADST_ADST
+#if CONFIG_EXT_TX
+ { fadst4, fdct16 }, // FLIPADST_DCT
+ { fdct4, fadst16 }, // DCT_FLIPADST
+ { fadst4, fadst16 }, // FLIPADST_FLIPADST
+ { fadst4, fadst16 }, // ADST_FLIPADST
+ { fadst4, fadst16 }, // FLIPADST_ADST
+ { fidtx4, fidtx16 }, // IDTX
+ { fdct4, fidtx16 }, // V_DCT
+ { fidtx4, fdct16 }, // H_DCT
+ { fadst4, fidtx16 }, // V_ADST
+ { fidtx4, fadst16 }, // H_ADST
+ { fadst4, fidtx16 }, // V_FLIPADST
+ { fidtx4, fadst16 }, // H_FLIPADST
+#endif
+ };
+ const transform_2d ht = FHT[tx_type];
+ const int n = 4;
+ const int n4 = 16;
+ tran_low_t out[16 * 4];
+ tran_low_t temp_in[16], temp_out[16];
+ int i, j;
+#if CONFIG_EXT_TX
+ int16_t flipped_input[16 * 4];
+ maybe_flip_input(&input, &stride, n, n4, flipped_input, tx_type);
+#endif
+
+ /* Columns: each of the 16 input columns (4 samples) is scaled by 4,
+  * transformed with ht.cols and stored in the same column of the row-major
+  * 4x16 buffer out[]. */
+ for (i = 0; i < n4; ++i) {
+ for (j = 0; j < n; ++j) temp_in[j] = input[j * stride + i] * 4;
+ ht.cols(temp_in, temp_out);
+ for (j = 0; j < n; ++j) out[j * n4 + i] = temp_out[j];
+ }
+
+ /* Rows: run ht.rows on each 16-entry row of out[]; the result is written as
+  * (x + (x < 0)) >> 1. */
+ for (i = 0; i < n; ++i) {
+ for (j = 0; j < n4; ++j) temp_in[j] = out[j + i * n4];
+ ht.rows(temp_in, temp_out);
+ for (j = 0; j < n4; ++j)
+ output[j + i * n4] = (temp_out[j] + (temp_out[j] < 0)) >> 1;
+ }
+ // Note: overall scale factor of transform is 8 times unitary
+}
+/* Forward hybrid transform for a block 8 samples wide and 16 samples tall.
+ *
+ * input is read as input[row * stride + col]; output is written as
+ * output[col + row * 8]. tx_type indexes FHT[] directly (no range check).
+ * The row pass runs before the column pass; all rounding happens between
+ * the two passes.
+ */
+void av1_fht8x16_c(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+ static const transform_2d FHT[] = {
+ { fdct16, fdct8 }, // DCT_DCT
+ { fadst16, fdct8 }, // ADST_DCT
+ { fdct16, fadst8 }, // DCT_ADST
+ { fadst16, fadst8 }, // ADST_ADST
+#if CONFIG_EXT_TX
+ { fadst16, fdct8 }, // FLIPADST_DCT
+ { fdct16, fadst8 }, // DCT_FLIPADST
+ { fadst16, fadst8 }, // FLIPADST_FLIPADST
+ { fadst16, fadst8 }, // ADST_FLIPADST
+ { fadst16, fadst8 }, // FLIPADST_ADST
+ { fidtx16, fidtx8 }, // IDTX
+ { fdct16, fidtx8 }, // V_DCT
+ { fidtx16, fdct8 }, // H_DCT
+ { fadst16, fidtx8 }, // V_ADST
+ { fidtx16, fadst8 }, // H_ADST
+ { fadst16, fidtx8 }, // V_FLIPADST
+ { fidtx16, fadst8 }, // H_FLIPADST
+#endif
+ };
+ const transform_2d ht = FHT[tx_type];
+ const int n = 8;
+ const int n2 = 16;
+ tran_low_t out[16 * 8];
+ tran_low_t temp_in[16], temp_out[16];
+ int i, j;
+#if CONFIG_EXT_TX
+ int16_t flipped_input[16 * 8];
+ maybe_flip_input(&input, &stride, n2, n, flipped_input, tx_type);
+#endif
+
+ /* Rows: each of the 16 input rows (8 samples) is scaled by 4 * Sqrt2
+  * through fdct_round_shift() and transformed with ht.rows; the result is
+  * reduced with ROUND_POWER_OF_TWO_SIGNED(x, 2) and stored transposed. */
+ for (i = 0; i < n2; ++i) {
+ for (j = 0; j < n; ++j)
+ temp_in[j] =
+ (tran_low_t)fdct_round_shift(input[i * stride + j] * 4 * Sqrt2);
+ ht.rows(temp_in, temp_out);
+ for (j = 0; j < n; ++j)
+ out[j * n2 + i] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2);
+ }
+
+ // Columns: run ht.cols on each 16-entry run of out[]; stored without further rounding.
+ for (i = 0; i < n; ++i) {
+ for (j = 0; j < n2; ++j) temp_in[j] = out[j + i * n2];
+ ht.cols(temp_in, temp_out);
+ for (j = 0; j < n2; ++j) output[i + j * n] = temp_out[j];
+ }
+ // Note: overall scale factor of transform is 8 times unitary
+}
+/* Forward hybrid transform for a block 16 samples wide and 8 samples tall.
+ *
+ * input is read as input[row * stride + col]; output is written as
+ * output[col + row * 16]. tx_type indexes FHT[] directly (no range check).
+ * The column pass runs first; all rounding happens between the two passes.
+ */
+void av1_fht16x8_c(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+ static const transform_2d FHT[] = {
+ { fdct8, fdct16 }, // DCT_DCT
+ { fadst8, fdct16 }, // ADST_DCT
+ { fdct8, fadst16 }, // DCT_ADST
+ { fadst8, fadst16 }, // ADST_ADST
+#if CONFIG_EXT_TX
+ { fadst8, fdct16 }, // FLIPADST_DCT
+ { fdct8, fadst16 }, // DCT_FLIPADST
+ { fadst8, fadst16 }, // FLIPADST_FLIPADST
+ { fadst8, fadst16 }, // ADST_FLIPADST
+ { fadst8, fadst16 }, // FLIPADST_ADST
+ { fidtx8, fidtx16 }, // IDTX
+ { fdct8, fidtx16 }, // V_DCT
+ { fidtx8, fdct16 }, // H_DCT
+ { fadst8, fidtx16 }, // V_ADST
+ { fidtx8, fadst16 }, // H_ADST
+ { fadst8, fidtx16 }, // V_FLIPADST
+ { fidtx8, fadst16 }, // H_FLIPADST
+#endif
+ };
+ const transform_2d ht = FHT[tx_type];
+ const int n = 8;
+ const int n2 = 16;
+ tran_low_t out[16 * 8];
+ tran_low_t temp_in[16], temp_out[16];
+ int i, j;
+#if CONFIG_EXT_TX
+ int16_t flipped_input[16 * 8];
+ maybe_flip_input(&input, &stride, n, n2, flipped_input, tx_type);
+#endif
+
+ /* Columns: each of the 16 input columns (8 samples) is scaled by 4 * Sqrt2
+  * through fdct_round_shift() and transformed with ht.cols; the result is
+  * reduced with ROUND_POWER_OF_TWO_SIGNED(x, 2) and stored in the same
+  * column of the row-major 8x16 buffer out[]. */
+ for (i = 0; i < n2; ++i) {
+ for (j = 0; j < n; ++j)
+ temp_in[j] =
+ (tran_low_t)fdct_round_shift(input[j * stride + i] * 4 * Sqrt2);
+ ht.cols(temp_in, temp_out);
+ for (j = 0; j < n; ++j)
+ out[j * n2 + i] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2);
+ }
+
+ // Rows: run ht.rows on each 16-entry row of out[]; stored without further rounding.
+ for (i = 0; i < n; ++i) {
+ for (j = 0; j < n2; ++j) temp_in[j] = out[j + i * n2];
+ ht.rows(temp_in, temp_out);
+ for (j = 0; j < n2; ++j) output[j + i * n2] = temp_out[j];
+ }
+ // Note: overall scale factor of transform is 8 times unitary
+}
+/* Forward hybrid transform for a block 8 samples wide and 32 samples tall.
+ *
+ * input is read as input[row * stride + col]; output is written as
+ * output[col + row * 8]. tx_type indexes FHT[] directly (no range check).
+ * The 32-point slots labelled ADST/FLIPADST in the table use fhalfright32.
+ * The row pass runs before the column pass.
+ */
+void av1_fht8x32_c(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+ static const transform_2d FHT[] = {
+ { fdct32, fdct8 }, // DCT_DCT
+ { fhalfright32, fdct8 }, // ADST_DCT
+ { fdct32, fadst8 }, // DCT_ADST
+ { fhalfright32, fadst8 }, // ADST_ADST
+#if CONFIG_EXT_TX
+ { fhalfright32, fdct8 }, // FLIPADST_DCT
+ { fdct32, fadst8 }, // DCT_FLIPADST
+ { fhalfright32, fadst8 }, // FLIPADST_FLIPADST
+ { fhalfright32, fadst8 }, // ADST_FLIPADST
+ { fhalfright32, fadst8 }, // FLIPADST_ADST
+ { fidtx32, fidtx8 }, // IDTX
+ { fdct32, fidtx8 }, // V_DCT
+ { fidtx32, fdct8 }, // H_DCT
+ { fhalfright32, fidtx8 }, // V_ADST
+ { fidtx32, fadst8 }, // H_ADST
+ { fhalfright32, fidtx8 }, // V_FLIPADST
+ { fidtx32, fadst8 }, // H_FLIPADST
+#endif
+ };
+ const transform_2d ht = FHT[tx_type];
+ const int n = 8;
+ const int n4 = 32;
+ tran_low_t out[32 * 8];
+ tran_low_t temp_in[32], temp_out[32];
+ int i, j;
+#if CONFIG_EXT_TX
+ int16_t flipped_input[32 * 8];
+ maybe_flip_input(&input, &stride, n4, n, flipped_input, tx_type);
+#endif
+
+ /* Rows: each of the 32 input rows (8 samples) is scaled by 4, transformed
+  * with ht.rows and stored transposed, so out[] holds one 32-entry run per
+  * output column. */
+ for (i = 0; i < n4; ++i) {
+ for (j = 0; j < n; ++j) temp_in[j] = input[i * stride + j] * 4;
+ ht.rows(temp_in, temp_out);
+ for (j = 0; j < n; ++j) out[j * n4 + i] = temp_out[j];
+ }
+
+ /* Columns: run ht.cols on each 32-entry run of out[]; the result is reduced
+  * with ROUND_POWER_OF_TWO_SIGNED(x, 2) on the way to output. */
+ for (i = 0; i < n; ++i) {
+ for (j = 0; j < n4; ++j) temp_in[j] = out[j + i * n4];
+ ht.cols(temp_in, temp_out);
+ for (j = 0; j < n4; ++j)
+ output[i + j * n] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2);
+ }
+ // Note: overall scale factor of transform is 4 times unitary
+}
+/* Forward hybrid transform for a block 32 samples wide and 8 samples tall.
+ *
+ * input is read as input[row * stride + col]; output is written as
+ * output[col + row * 32]. tx_type indexes FHT[] directly (no range check).
+ * The 32-point slots labelled ADST/FLIPADST in the table use fhalfright32.
+ * The column pass runs first, then the row pass.
+ */
+void av1_fht32x8_c(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+ static const transform_2d FHT[] = {
+ { fdct8, fdct32 }, // DCT_DCT
+ { fadst8, fdct32 }, // ADST_DCT
+ { fdct8, fhalfright32 }, // DCT_ADST
+ { fadst8, fhalfright32 }, // ADST_ADST
+#if CONFIG_EXT_TX
+ { fadst8, fdct32 }, // FLIPADST_DCT
+ { fdct8, fhalfright32 }, // DCT_FLIPADST
+ { fadst8, fhalfright32 }, // FLIPADST_FLIPADST
+ { fadst8, fhalfright32 }, // ADST_FLIPADST
+ { fadst8, fhalfright32 }, // FLIPADST_ADST
+ { fidtx8, fidtx32 }, // IDTX
+ { fdct8, fidtx32 }, // V_DCT
+ { fidtx8, fdct32 }, // H_DCT
+ { fadst8, fidtx32 }, // V_ADST
+ { fidtx8, fhalfright32 }, // H_ADST
+ { fadst8, fidtx32 }, // V_FLIPADST
+ { fidtx8, fhalfright32 }, // H_FLIPADST
+#endif
+ };
+ const transform_2d ht = FHT[tx_type];
+ const int n = 8;
+ const int n4 = 32;
+ tran_low_t out[32 * 8];
+ tran_low_t temp_in[32], temp_out[32];
+ int i, j;
+#if CONFIG_EXT_TX
+ int16_t flipped_input[32 * 8];
+ maybe_flip_input(&input, &stride, n, n4, flipped_input, tx_type);
+#endif
+
+ /* Columns: each of the 32 input columns (8 samples) is scaled by 4,
+  * transformed with ht.cols and stored in the same column of the row-major
+  * 8x32 buffer out[]. */
+ for (i = 0; i < n4; ++i) {
+ for (j = 0; j < n; ++j) temp_in[j] = input[j * stride + i] * 4;
+ ht.cols(temp_in, temp_out);
+ for (j = 0; j < n; ++j) out[j * n4 + i] = temp_out[j];
+ }
+
+ /* Rows: run ht.rows on each 32-entry row of out[]; the result is reduced
+  * with ROUND_POWER_OF_TWO_SIGNED(x, 2) on the way to output. */
+ for (i = 0; i < n; ++i) {
+ for (j = 0; j < n4; ++j) temp_in[j] = out[j + i * n4];
+ ht.rows(temp_in, temp_out);
+ for (j = 0; j < n4; ++j)
+ output[j + i * n4] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2);
+ }
+ // Note: overall scale factor of transform is 4 times unitary
+}
+/* Forward hybrid transform for a block 16 samples wide and 32 samples tall.
+ *
+ * input is read as input[row * stride + col]; output is written as
+ * output[col + row * 16]. tx_type indexes FHT[] directly (no range check).
+ * The 32-point slots labelled ADST/FLIPADST in the table use fhalfright32.
+ * The row pass runs before the column pass; all rounding happens between
+ * the two passes.
+ */
+void av1_fht16x32_c(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+ static const transform_2d FHT[] = {
+ { fdct32, fdct16 }, // DCT_DCT
+ { fhalfright32, fdct16 }, // ADST_DCT
+ { fdct32, fadst16 }, // DCT_ADST
+ { fhalfright32, fadst16 }, // ADST_ADST
+#if CONFIG_EXT_TX
+ { fhalfright32, fdct16 }, // FLIPADST_DCT
+ { fdct32, fadst16 }, // DCT_FLIPADST
+ { fhalfright32, fadst16 }, // FLIPADST_FLIPADST
+ { fhalfright32, fadst16 }, // ADST_FLIPADST
+ { fhalfright32, fadst16 }, // FLIPADST_ADST
+ { fidtx32, fidtx16 }, // IDTX
+ { fdct32, fidtx16 }, // V_DCT
+ { fidtx32, fdct16 }, // H_DCT
+ { fhalfright32, fidtx16 }, // V_ADST
+ { fidtx32, fadst16 }, // H_ADST
+ { fhalfright32, fidtx16 }, // V_FLIPADST
+ { fidtx32, fadst16 }, // H_FLIPADST
+#endif
+ };
+ const transform_2d ht = FHT[tx_type];
+ const int n = 16;
+ const int n2 = 32;
+ tran_low_t out[32 * 16];
+ tran_low_t temp_in[32], temp_out[32];
+ int i, j;
+#if CONFIG_EXT_TX
+ int16_t flipped_input[32 * 16];
+ maybe_flip_input(&input, &stride, n2, n, flipped_input, tx_type);
+#endif
+
+ /* Rows: each of the 32 input rows (16 samples) is scaled by 4 * Sqrt2
+  * through fdct_round_shift() and transformed with ht.rows; the result is
+  * reduced with ROUND_POWER_OF_TWO_SIGNED(x, 4) and stored transposed. */
+ for (i = 0; i < n2; ++i) {
+ for (j = 0; j < n; ++j)
+ temp_in[j] =
+ (tran_low_t)fdct_round_shift(input[i * stride + j] * 4 * Sqrt2);
+ ht.rows(temp_in, temp_out);
+ for (j = 0; j < n; ++j)
+ out[j * n2 + i] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 4);
+ }
+
+ // Columns: run ht.cols on each 32-entry run of out[]; stored without further rounding.
+ for (i = 0; i < n; ++i) {
+ for (j = 0; j < n2; ++j) temp_in[j] = out[j + i * n2];
+ ht.cols(temp_in, temp_out);
+ for (j = 0; j < n2; ++j) output[i + j * n] = temp_out[j];
+ }
+ // Note: overall scale factor of transform is 4 times unitary
+}
+/* Forward hybrid transform for a block 32 samples wide and 16 samples tall.
+ *
+ * input is read as input[row * stride + col]; output is written as
+ * output[col + row * 32]. tx_type indexes FHT[] directly (no range check).
+ * The 32-point slots labelled ADST/FLIPADST in the table use fhalfright32.
+ * The column pass runs first; all rounding happens between the two passes.
+ */
+void av1_fht32x16_c(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+ static const transform_2d FHT[] = {
+ { fdct16, fdct32 }, // DCT_DCT
+ { fadst16, fdct32 }, // ADST_DCT
+ { fdct16, fhalfright32 }, // DCT_ADST
+ { fadst16, fhalfright32 }, // ADST_ADST
+#if CONFIG_EXT_TX
+ { fadst16, fdct32 }, // FLIPADST_DCT
+ { fdct16, fhalfright32 }, // DCT_FLIPADST
+ { fadst16, fhalfright32 }, // FLIPADST_FLIPADST
+ { fadst16, fhalfright32 }, // ADST_FLIPADST
+ { fadst16, fhalfright32 }, // FLIPADST_ADST
+ { fidtx16, fidtx32 }, // IDTX
+ { fdct16, fidtx32 }, // V_DCT
+ { fidtx16, fdct32 }, // H_DCT
+ { fadst16, fidtx32 }, // V_ADST
+ { fidtx16, fhalfright32 }, // H_ADST
+ { fadst16, fidtx32 }, // V_FLIPADST
+ { fidtx16, fhalfright32 }, // H_FLIPADST
+#endif
+ };
+ const transform_2d ht = FHT[tx_type];
+ const int n = 16;
+ const int n2 = 32;
+ tran_low_t out[32 * 16];
+ tran_low_t temp_in[32], temp_out[32];
+ int i, j;
+#if CONFIG_EXT_TX
+ int16_t flipped_input[32 * 16];
+ maybe_flip_input(&input, &stride, n, n2, flipped_input, tx_type);
+#endif
+
+ /* Columns: each of the 32 input columns (16 samples) is scaled by 4 * Sqrt2
+  * through fdct_round_shift() and transformed with ht.cols; the result is
+  * reduced with ROUND_POWER_OF_TWO_SIGNED(x, 4) and stored in the same
+  * column of the row-major 16x32 buffer out[]. */
+ for (i = 0; i < n2; ++i) {
+ for (j = 0; j < n; ++j)
+ temp_in[j] =
+ (tran_low_t)fdct_round_shift(input[j * stride + i] * 4 * Sqrt2);
+ ht.cols(temp_in, temp_out);
+ for (j = 0; j < n; ++j)
+ out[j * n2 + i] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 4);
+ }
+
+ // Rows: run ht.rows on each 32-entry row of out[]; stored without further rounding.
+ for (i = 0; i < n; ++i) {
+ for (j = 0; j < n2; ++j) temp_in[j] = out[j + i * n2];
+ ht.rows(temp_in, temp_out);
+ for (j = 0; j < n2; ++j) output[j + i * n2] = temp_out[j];
+ }
+ // Note: overall scale factor of transform is 4 times unitary
+}
+/* Fused 8x8 forward DCT and quantizer (reference C implementation).
+ *
+ * Transform: the column pass is an inlined 8-point DCT on the input scaled
+ * by 4, written to intermediate[]; the row pass calls fdct8() on each row
+ * and halves every result. Coefficients end up in coeff_ptr as an 8x8
+ * row-major block.
+ *
+ * Quantization: qcoeff_ptr and dqcoeff_ptr are zeroed for n_coeffs entries.
+ * Unless skip_block is set, each position scan[i] is quantized with
+ * round_ptr/quant_ptr and reconstructed with dequant_ptr; entry [0] of those
+ * tables is used for position 0 and entry [1] for every other position.
+ * With CONFIG_AOM_QM the per-position weights qm_ptr/iqm_ptr are applied too.
+ * *eob_ptr receives one past the last scan index whose quantized value is
+ * non-zero (0 if there is none, or if skip_block is set).
+ *
+ * zbin_ptr, quant_shift_ptr and iscan are accepted but not used.
+ */
+void av1_fdct8x8_quant_c(const int16_t *input, int stride,
+ tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ int skip_block, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+ uint16_t *eob_ptr, const int16_t *scan,
+ const int16_t *iscan
+#if CONFIG_AOM_QM
+ ,
+ const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr
+#endif
+ ) {
+ int eob = -1;
+
+ int i, j;
+ tran_low_t intermediate[64];
+
+ // Transform columns: one input column per iteration, results written down the matching column of intermediate[].
+ {
+ tran_low_t *output = intermediate;
+ tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16
+ tran_high_t t0, t1, t2, t3; // needs32
+ tran_high_t x0, x1, x2, x3; // canbe16
+
+ for (i = 0; i < 8; i++) {
+ // stage 1: sums and differences of mirrored samples, scaled by 4
+ s0 = (input[0 * stride] + input[7 * stride]) * 4;
+ s1 = (input[1 * stride] + input[6 * stride]) * 4;
+ s2 = (input[2 * stride] + input[5 * stride]) * 4;
+ s3 = (input[3 * stride] + input[4 * stride]) * 4;
+ s4 = (input[3 * stride] - input[4 * stride]) * 4;
+ s5 = (input[2 * stride] - input[5 * stride]) * 4;
+ s6 = (input[1 * stride] - input[6 * stride]) * 4;
+ s7 = (input[0 * stride] - input[7 * stride]) * 4;
+
+ // fdct4(step, step); the sums s0..s3 produce the even-indexed outputs
+ x0 = s0 + s3;
+ x1 = s1 + s2;
+ x2 = s1 - s2;
+ x3 = s0 - s3;
+ t0 = (x0 + x1) * cospi_16_64;
+ t1 = (x0 - x1) * cospi_16_64;
+ t2 = x2 * cospi_24_64 + x3 * cospi_8_64;
+ t3 = -x2 * cospi_8_64 + x3 * cospi_24_64;
+ output[0 * 8] = (tran_low_t)fdct_round_shift(t0);
+ output[2 * 8] = (tran_low_t)fdct_round_shift(t2);
+ output[4 * 8] = (tran_low_t)fdct_round_shift(t1);
+ output[6 * 8] = (tran_low_t)fdct_round_shift(t3);
+
+ // stage 2: the differences s4..s7 produce the odd-indexed outputs
+ t0 = (s6 - s5) * cospi_16_64;
+ t1 = (s6 + s5) * cospi_16_64;
+ t2 = fdct_round_shift(t0);
+ t3 = fdct_round_shift(t1);
+
+ // stage 3
+ x0 = s4 + t2;
+ x1 = s4 - t2;
+ x2 = s7 - t3;
+ x3 = s7 + t3;
+
+ // stage 4
+ t0 = x0 * cospi_28_64 + x3 * cospi_4_64;
+ t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
+ t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
+ t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
+ output[1 * 8] = (tran_low_t)fdct_round_shift(t0);
+ output[3 * 8] = (tran_low_t)fdct_round_shift(t2);
+ output[5 * 8] = (tran_low_t)fdct_round_shift(t1);
+ output[7 * 8] = (tran_low_t)fdct_round_shift(t3);
+ input++;  // advance to the next input column
+ output++;
+ }
+ }
+
+ // Rows: fdct8() on each row of intermediate[], then every coefficient is divided by 2.
+ for (i = 0; i < 8; ++i) {
+ fdct8(&intermediate[i * 8], &coeff_ptr[i * 8]);
+ for (j = 0; j < 8; ++j) coeff_ptr[j + i * 8] /= 2;
+ }
+
+ // TODO(jingning) Decide the need of these arguments after the
+ // quantization process is completed.
+ (void)zbin_ptr;
+ (void)quant_shift_ptr;
+ (void)iscan;
+
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+ if (!skip_block) {
+ // Quantization pass: visit the n_coeffs positions in scan order and
+ // remember the last scan index that quantizes to a non-zero value.
+ for (i = 0; i < n_coeffs; i++) {
+ const int rc = scan[i];
+ const int coeff = coeff_ptr[rc];
+#if CONFIG_AOM_QM
+ const qm_val_t wt = qm_ptr[rc];
+ const qm_val_t iwt = iqm_ptr[rc];
+ const int dequant =
+ (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >>
+ AOM_QM_BITS;
+#endif
+ const int coeff_sign = (coeff >> 31);  // 0 or -1; relies on an arithmetic shift of a 32-bit int
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;  // branch-free |coeff|
+
+ int64_t tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
+ int tmp32;
+#if CONFIG_AOM_QM
+ tmp32 = (int)((tmp * quant_ptr[rc != 0] * wt) >> (16 + AOM_QM_BITS));
+ qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign;
+ dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant;
+#else
+ tmp32 = (int)((tmp * quant_ptr[rc != 0]) >> 16);
+ qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign;  // re-apply the sign of coeff
+ dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
+#endif
+
+ if (tmp32) eob = i;
+ }
+ }
+ *eob_ptr = eob + 1;
+}
+/* Forward 8x8 hybrid transform (reference C implementation).
+ *
+ * input is read as input[row * stride + col]; output is written as
+ * output[col + row * 8]. DCT_DCT is handed to aom_fdct8x8_c(); any other
+ * tx_type indexes FHT[] directly (no range check) to pick the column and
+ * row transforms.
+ */
+void av1_fht8x8_c(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+ if (tx_type == DCT_DCT) {
+ aom_fdct8x8_c(input, output, stride);
+ } else {
+ static const transform_2d FHT[] = {
+ { fdct8, fdct8 }, // DCT_DCT
+ { fadst8, fdct8 }, // ADST_DCT
+ { fdct8, fadst8 }, // DCT_ADST
+ { fadst8, fadst8 }, // ADST_ADST
+#if CONFIG_EXT_TX
+ { fadst8, fdct8 }, // FLIPADST_DCT
+ { fdct8, fadst8 }, // DCT_FLIPADST
+ { fadst8, fadst8 }, // FLIPADST_FLIPADST
+ { fadst8, fadst8 }, // ADST_FLIPADST
+ { fadst8, fadst8 }, // FLIPADST_ADST
+ { fidtx8, fidtx8 }, // IDTX
+ { fdct8, fidtx8 }, // V_DCT
+ { fidtx8, fdct8 }, // H_DCT
+ { fadst8, fidtx8 }, // V_ADST
+ { fidtx8, fadst8 }, // H_ADST
+ { fadst8, fidtx8 }, // V_FLIPADST
+ { fidtx8, fadst8 }, // H_FLIPADST
+#endif // CONFIG_EXT_TX
+ };
+ const transform_2d ht = FHT[tx_type];
+ tran_low_t out[64];
+ int i, j;
+ tran_low_t temp_in[8], temp_out[8];
+
+#if CONFIG_EXT_TX
+ int16_t flipped_input[8 * 8];
+ maybe_flip_input(&input, &stride, 8, 8, flipped_input, tx_type);
+#endif
+
+ // Columns: scale each input column by 4, run ht.cols, store in the same column of out[].
+ for (i = 0; i < 8; ++i) {
+ for (j = 0; j < 8; ++j) temp_in[j] = input[j * stride + i] * 4;
+ ht.cols(temp_in, temp_out);
+ for (j = 0; j < 8; ++j) out[j * 8 + i] = temp_out[j];
+ }
+
+ // Rows: run ht.rows on each row of out[]; the result is written as (x + (x < 0)) >> 1.
+ for (i = 0; i < 8; ++i) {
+ for (j = 0; j < 8; ++j) temp_in[j] = out[j + i * 8];
+ ht.rows(temp_in, temp_out);
+ for (j = 0; j < 8; ++j)
+ output[j + i * 8] = (temp_out[j] + (temp_out[j] < 0)) >> 1;
+ }
+ }
+}
+
+/* 4-point reversible, orthonormal Walsh-Hadamard in 3.5 adds, 0.5 shifts per
+ pixel.
+ Pass 1 reads one input column per iteration (input[row * stride + col])
+ and writes its four results down the same column of output. Pass 2 then
+ transforms each row of output in place and multiplies every result by
+ UNIT_QUANT_FACTOR. Both passes use the same sequence of add/subtract/shift
+ steps; the statement order inside each pass matters because later steps
+ reuse the updated a1..d1 values. */
+void av1_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride) {
+ int i;
+ tran_high_t a1, b1, c1, d1, e1;
+ const int16_t *ip_pass0 = input;
+ const tran_low_t *ip = NULL;
+ tran_low_t *op = output;
+ // Pass 1: columns of input -> columns of output.
+ for (i = 0; i < 4; i++) {
+ a1 = ip_pass0[0 * stride];
+ b1 = ip_pass0[1 * stride];
+ c1 = ip_pass0[2 * stride];
+ d1 = ip_pass0[3 * stride];
+
+ a1 += b1;
+ d1 = d1 - c1;
+ e1 = (a1 - d1) >> 1;
+ b1 = e1 - b1;
+ c1 = e1 - c1;
+ a1 -= c1;
+ d1 += b1;
+ op[0] = (tran_low_t)a1;
+ op[4] = (tran_low_t)c1;
+ op[8] = (tran_low_t)d1;
+ op[12] = (tran_low_t)b1;
+
+ ip_pass0++;
+ op++;
+ }
+ ip = output;  // pass 2 reads back what pass 1 wrote
+ op = output;
+ // Pass 2: rows of output, transformed in place.
+ for (i = 0; i < 4; i++) {
+ a1 = ip[0];
+ b1 = ip[1];
+ c1 = ip[2];
+ d1 = ip[3];
+
+ a1 += b1;
+ d1 -= c1;
+ e1 = (a1 - d1) >> 1;
+ b1 = e1 - b1;
+ c1 = e1 - c1;
+ a1 -= c1;
+ d1 += b1;
+ op[0] = (tran_low_t)(a1 * UNIT_QUANT_FACTOR);
+ op[1] = (tran_low_t)(c1 * UNIT_QUANT_FACTOR);
+ op[2] = (tran_low_t)(d1 * UNIT_QUANT_FACTOR);
+ op[3] = (tran_low_t)(b1 * UNIT_QUANT_FACTOR);
+
+ ip += 4;
+ op += 4;
+ }
+}
+/* Forward 16x16 hybrid transform (reference C implementation).
+ *
+ * input is read as input[row * stride + col]; output is written as
+ * output[col + row * 16]. tx_type indexes FHT[] directly (no range check),
+ * including DCT_DCT, which has no separate fast path here.
+ */
+void av1_fht16x16_c(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+ static const transform_2d FHT[] = {
+ { fdct16, fdct16 }, // DCT_DCT
+ { fadst16, fdct16 }, // ADST_DCT
+ { fdct16, fadst16 }, // DCT_ADST
+ { fadst16, fadst16 }, // ADST_ADST
+#if CONFIG_EXT_TX
+ { fadst16, fdct16 }, // FLIPADST_DCT
+ { fdct16, fadst16 }, // DCT_FLIPADST
+ { fadst16, fadst16 }, // FLIPADST_FLIPADST
+ { fadst16, fadst16 }, // ADST_FLIPADST
+ { fadst16, fadst16 }, // FLIPADST_ADST
+ { fidtx16, fidtx16 }, // IDTX
+ { fdct16, fidtx16 }, // V_DCT
+ { fidtx16, fdct16 }, // H_DCT
+ { fadst16, fidtx16 }, // V_ADST
+ { fidtx16, fadst16 }, // H_ADST
+ { fadst16, fidtx16 }, // V_FLIPADST
+ { fidtx16, fadst16 }, // H_FLIPADST
+#endif // CONFIG_EXT_TX
+ };
+
+ const transform_2d ht = FHT[tx_type];
+ tran_low_t out[256];
+ int i, j;
+ tran_low_t temp_in[16], temp_out[16];
+
+#if CONFIG_EXT_TX
+ int16_t flipped_input[16 * 16];
+ maybe_flip_input(&input, &stride, 16, 16, flipped_input, tx_type);
+#endif
+
+ /* Columns: scale each input column by 4, run ht.cols, and store
+  * (x + 1 + (x < 0)) >> 2 in the same column of out[]. */
+ for (i = 0; i < 16; ++i) {
+ for (j = 0; j < 16; ++j) temp_in[j] = input[j * stride + i] * 4;
+ ht.cols(temp_in, temp_out);
+ for (j = 0; j < 16; ++j)
+ out[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
+ }
+
+ // Rows: run ht.rows on each row of out[]; stored without further rounding.
+ for (i = 0; i < 16; ++i) {
+ for (j = 0; j < 16; ++j) temp_in[j] = out[j + i * 16];
+ ht.rows(temp_in, temp_out);
+ for (j = 0; j < 16; ++j) output[j + i * 16] = temp_out[j];
+ }
+}
+
+#if CONFIG_HIGHBITDEPTH  // Each av1_highbd_* function in this region forwards its arguments unchanged to the same-sized av1_*_c function.
+void av1_highbd_fht4x4_c(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+ av1_fht4x4_c(input, output, stride, tx_type);
+}
+
+void av1_highbd_fht4x8_c(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+ av1_fht4x8_c(input, output, stride, tx_type);
+}
+
+void av1_highbd_fht8x4_c(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+ av1_fht8x4_c(input, output, stride, tx_type);
+}
+
+void av1_highbd_fht8x16_c(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+ av1_fht8x16_c(input, output, stride, tx_type);
+}
+
+void av1_highbd_fht16x8_c(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+ av1_fht16x8_c(input, output, stride, tx_type);
+}
+
+void av1_highbd_fht16x32_c(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+ av1_fht16x32_c(input, output, stride, tx_type);
+}
+
+void av1_highbd_fht32x16_c(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+ av1_fht32x16_c(input, output, stride, tx_type);
+}
+
+void av1_highbd_fht4x16_c(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+ av1_fht4x16_c(input, output, stride, tx_type);
+}
+
+void av1_highbd_fht16x4_c(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+ av1_fht16x4_c(input, output, stride, tx_type);
+}
+
+void av1_highbd_fht8x32_c(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+ av1_fht8x32_c(input, output, stride, tx_type);
+}
+
+void av1_highbd_fht32x8_c(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+ av1_fht32x8_c(input, output, stride, tx_type);
+}
+
+void av1_highbd_fht8x8_c(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+ av1_fht8x8_c(input, output, stride, tx_type);
+}
+
+void av1_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output,
+ int stride) {
+ av1_fwht4x4_c(input, output, stride);
+}
+
+void av1_highbd_fht16x16_c(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+ av1_fht16x16_c(input, output, stride, tx_type);
+}
+#endif // CONFIG_HIGHBITDEPTH
+/* Forward 32x32 hybrid transform (reference C implementation).
+ *
+ * input is read as input[row * stride + col]; output is written as
+ * output[col + row * 32]. tx_type indexes FHT[] directly (no range check).
+ * Without CONFIG_EXT_TX the table holds only the DCT_DCT entry, so any other
+ * tx_type would read past it. The slots labelled ADST/FLIPADST use
+ * fhalfright32.
+ */
+void av1_fht32x32_c(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+ static const transform_2d FHT[] = {
+ { fdct32, fdct32 }, // DCT_DCT
+#if CONFIG_EXT_TX
+ { fhalfright32, fdct32 }, // ADST_DCT
+ { fdct32, fhalfright32 }, // DCT_ADST
+ { fhalfright32, fhalfright32 }, // ADST_ADST
+ { fhalfright32, fdct32 }, // FLIPADST_DCT
+ { fdct32, fhalfright32 }, // DCT_FLIPADST
+ { fhalfright32, fhalfright32 }, // FLIPADST_FLIPADST
+ { fhalfright32, fhalfright32 }, // ADST_FLIPADST
+ { fhalfright32, fhalfright32 }, // FLIPADST_ADST
+ { fidtx32, fidtx32 }, // IDTX
+ { fdct32, fidtx32 }, // V_DCT
+ { fidtx32, fdct32 }, // H_DCT
+ { fhalfright32, fidtx32 }, // V_ADST
+ { fidtx32, fhalfright32 }, // H_ADST
+ { fhalfright32, fidtx32 }, // V_FLIPADST
+ { fidtx32, fhalfright32 }, // H_FLIPADST
+#endif
+ };
+ const transform_2d ht = FHT[tx_type];
+ tran_low_t out[1024];
+ int i, j;
+ tran_low_t temp_in[32], temp_out[32];
+
+#if CONFIG_EXT_TX
+ int16_t flipped_input[32 * 32];
+ maybe_flip_input(&input, &stride, 32, 32, flipped_input, tx_type);
+#endif
+
+ /* Columns: scale each input column by 4, run ht.cols, and store
+  * ROUND_POWER_OF_TWO_SIGNED(x, 4) in the same column of out[]. */
+ for (i = 0; i < 32; ++i) {
+ for (j = 0; j < 32; ++j) temp_in[j] = input[j * stride + i] * 4;
+ ht.cols(temp_in, temp_out);
+ for (j = 0; j < 32; ++j)
+ out[j * 32 + i] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 4);
+ }
+
+ // Rows: run ht.rows on each row of out[]; stored without further rounding.
+ for (i = 0; i < 32; ++i) {
+ for (j = 0; j < 32; ++j) temp_in[j] = out[j + i * 32];
+ ht.rows(temp_in, temp_out);
+ for (j = 0; j < 32; ++j) output[j + i * 32] = temp_out[j];
+ }
+}
+
+#if CONFIG_TX64X64
+#if CONFIG_EXT_TX
+static void fidtx64(const tran_low_t *input, tran_low_t *output) {
+  // 64-point identity-style transform: samples are not mixed with each
+  // other; each one is only rescaled by 4 * Sqrt2 via fdct_round_shift().
+  int k = 0;
+  while (k < 64) {
+    output[k] = (tran_low_t)fdct_round_shift(input[k] * 4 * Sqrt2);
+    ++k;
+  }
+}
+
+// Stand-in for a 64-point ADST.
+static void fhalfright64(const tran_low_t *input, tran_low_t *output) {
+  tran_low_t scaled_upper[32];
+  const tran_low_t *const lower_in = input;
+  const tran_low_t *const upper_in = input + 32;
+  tran_low_t *const upper_out = output + 32;
+  int k;
+
+  // Inputs 0..31 skip the DCT: rescaled, they become outputs 32..63.
+  for (k = 0; k < 32; ++k) {
+    upper_out[k] = (tran_low_t)fdct_round_shift(lower_in[k] * 4 * Sqrt2);
+  }
+
+  // Inputs 32..63 are multiplied by sqrt(2) into a scratch buffer ...
+  for (k = 0; k < 32; ++k) {
+    scaled_upper[k] = (tran_low_t)fdct_round_shift(upper_in[k] * Sqrt2);
+  }
+
+  // ... whose 32-point DCT fills outputs 0..31.
+  fdct32(scaled_upper, output);
+  // Note overall scaling factor is 2 times unitary
+}
+#endif // CONFIG_EXT_TX
+
+static void fdct64_col(const tran_low_t *input, tran_low_t *output) {
+  // Adapter giving av1_fdct64_new() the two-pointer signature the transform
+  // table expects: copy into an int32_t buffer, transform with the column
+  // cos-bit / stage-range tables, copy the result back out.
+  int32_t src[64];
+  int32_t dst[64];
+  int k;
+
+  for (k = 0; k < 64; ++k) {
+    src[k] = (int32_t)input[k];
+  }
+
+  av1_fdct64_new(src, dst, fwd_cos_bit_col_dct_dct_64,
+                 fwd_stage_range_col_dct_dct_64);
+
+  for (k = 0; k < 64; ++k) {
+    output[k] = (tran_low_t)dst[k];
+  }
+}
+
+static void fdct64_row(const tran_low_t *input, tran_low_t *output) {
+  // Adapter giving av1_fdct64_new() the two-pointer signature the transform
+  // table expects: copy into an int32_t buffer, transform with the row
+  // cos-bit / stage-range tables, copy the result back out.
+  int32_t src[64];
+  int32_t dst[64];
+  int k;
+
+  for (k = 0; k < 64; ++k) {
+    src[k] = (int32_t)input[k];
+  }
+
+  av1_fdct64_new(src, dst, fwd_cos_bit_row_dct_dct_64,
+                 fwd_stage_range_row_dct_dct_64);
+
+  for (k = 0; k < 64; ++k) {
+    output[k] = (tran_low_t)dst[k];
+  }
+}
+/* Forward 64x64 hybrid transform (reference C implementation).
+ *
+ * input is read as input[row * stride + col]; output is written as
+ * output[col + row * 64]. tx_type indexes FHT[] directly (no range check).
+ * Without CONFIG_EXT_TX the table holds only the DCT_DCT entry. The DCT
+ * uses separate column/row wrappers (fdct64_col / fdct64_row); the slots
+ * labelled ADST/FLIPADST use fhalfright64.
+ */
+void av1_fht64x64_c(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+ static const transform_2d FHT[] = {
+ { fdct64_col, fdct64_row }, // DCT_DCT
+#if CONFIG_EXT_TX
+ { fhalfright64, fdct64_row }, // ADST_DCT
+ { fdct64_col, fhalfright64 }, // DCT_ADST
+ { fhalfright64, fhalfright64 }, // ADST_ADST
+ { fhalfright64, fdct64_row }, // FLIPADST_DCT
+ { fdct64_col, fhalfright64 }, // DCT_FLIPADST
+ { fhalfright64, fhalfright64 }, // FLIPADST_FLIPADST
+ { fhalfright64, fhalfright64 }, // ADST_FLIPADST
+ { fhalfright64, fhalfright64 }, // FLIPADST_ADST
+ { fidtx64, fidtx64 }, // IDTX
+ { fdct64_col, fidtx64 }, // V_DCT
+ { fidtx64, fdct64_row }, // H_DCT
+ { fhalfright64, fidtx64 }, // V_ADST
+ { fidtx64, fhalfright64 }, // H_ADST
+ { fhalfright64, fidtx64 }, // V_FLIPADST
+ { fidtx64, fhalfright64 }, // H_FLIPADST
+#endif
+ };
+ const transform_2d ht = FHT[tx_type];
+ tran_low_t out[4096];
+ int i, j;
+ tran_low_t temp_in[64], temp_out[64];
+#if CONFIG_EXT_TX
+ int16_t flipped_input[64 * 64];
+ maybe_flip_input(&input, &stride, 64, 64, flipped_input, tx_type);
+#endif
+ /* Columns: the input is used unscaled here; each column goes through
+  * ht.cols and is stored as (x + 1 + (x > 0)) >> 2.
+  * NOTE(review): this pass adds the extra 1 for positive values (x > 0),
+  * whereas the row pass below adds it for negative values (x < 0) -
+  * confirm the asymmetry is intended before touching either. */
+ for (i = 0; i < 64; ++i) {
+ for (j = 0; j < 64; ++j) temp_in[j] = input[j * stride + i];
+ ht.cols(temp_in, temp_out);
+ for (j = 0; j < 64; ++j)
+ out[j * 64 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
+ }
+
+ // Rows: run ht.rows on each row of out[]; the result is written as (x + 1 + (x < 0)) >> 2.
+ for (i = 0; i < 64; ++i) {
+ for (j = 0; j < 64; ++j) temp_in[j] = out[j + i * 64];
+ ht.rows(temp_in, temp_out);
+ for (j = 0; j < 64; ++j)
+ output[j + i * 64] =
+ (tran_low_t)((temp_out[j] + 1 + (temp_out[j] < 0)) >> 2);
+ }
+}
+#endif // CONFIG_TX64X64
+
+#if CONFIG_EXT_TX
+// Forward identity transform on a bs x bs block: every residual sample is
+// multiplied by a power of two chosen from the block size (8 below 32, 4
+// below 64, otherwise 2). coeff is written densely (row pitch bs) while
+// src_diff is read with row pitch stride. Nothing is written unless tx_type
+// is IDTX.
+void av1_fwd_idtx_c(const int16_t *src_diff, tran_low_t *coeff, int stride,
+                    int bs, int tx_type) {
+  int scale_log2;
+  int row, col;
+
+  if (tx_type != IDTX) return;
+
+  if (bs < 32) {
+    scale_log2 = 3;
+  } else if (bs < 64) {
+    scale_log2 = 2;
+  } else {
+    scale_log2 = 1;
+  }
+
+  for (row = 0; row < bs; ++row) {
+    const int16_t *const src_row = src_diff + row * stride;
+    tran_low_t *const dst_row = coeff + row * bs;
+    for (col = 0; col < bs; ++col) {
+      dst_row[col] = src_row[col] * (1 << scale_log2);
+    }
+  }
+}
+#endif // CONFIG_EXT_TX
+
+#if CONFIG_HIGHBITDEPTH  // High-bitdepth entry points for the 32x32 and 64x64 transforms; both forward unchanged to the regular versions.
+void av1_highbd_fht32x32_c(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+ av1_fht32x32_c(input, output, stride, tx_type);
+}
+
+#if CONFIG_TX64X64
+void av1_highbd_fht64x64_c(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+ av1_fht64x64_c(input, output, stride, tx_type);
+}
+#endif // CONFIG_TX64X64
+#endif // CONFIG_HIGHBITDEPTH
+#endif // !AV1_DCT_GTEST
diff --git a/third_party/aom/av1/encoder/encint.h b/third_party/aom/av1/encoder/encint.h
new file mode 100644
index 0000000000..30ea8521fb
--- /dev/null
+++ b/third_party/aom/av1/encoder/encint.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+/* clang-format off */
+
+#if !defined(_encint_H)
+# define _encint_H (1)
+
+typedef struct daala_enc_ctx od_enc_ctx;
+typedef struct od_params_ctx od_params_ctx;
+typedef struct od_rollback_buffer od_rollback_buffer;
+
+# include "aom_dsp/entenc.h"
+# include "av1/common/odintrin.h"
+# include "av1/common/pvq_state.h"
+/* Encoder context; also reachable as od_enc_ctx and daala_enc_ctx through
+ * the typedefs in this header. */
+struct daala_enc_ctx{
+ /* Stores context-adaptive CDFs for PVQ. */
+ od_state state;
+ /* AOM entropy encoder. */
+ aom_writer w;
+ int use_activity_masking;
+ /* Quantization matrix mode: FLAT (0) or HVS (1). */
+ int qm;
+ /*Normalized PVQ lambda for use where we've already performed
+ quantization.*/
+ double pvq_norm_lambda;
+ double pvq_norm_lambda_dc;
+};
+
+// from daalaenc.h
+/**The encoder context.*/
+typedef struct daala_enc_ctx daala_enc_ctx;
+
+/** Holds important encoder information so we can roll back decisions:
+    an od_ec_enc and an od_adapt_ctx. This is the buffer type taken by
+    od_encode_checkpoint() and od_encode_rollback(), declared in this header;
+    presumably the former fills it and the latter restores from it. */
+struct od_rollback_buffer {
+ od_ec_enc ec;
+ od_adapt_ctx adapt;
+};
+
+void od_encode_checkpoint(const daala_enc_ctx *enc, od_rollback_buffer *rbuf);
+void od_encode_rollback(daala_enc_ctx *enc, const od_rollback_buffer *rbuf);
+
+#endif
diff --git a/third_party/aom/av1/encoder/encodeframe.c b/third_party/aom/av1/encoder/encodeframe.c
new file mode 100644
index 0000000000..d254157e72
--- /dev/null
+++ b/third_party/aom/av1/encoder/encodeframe.c
@@ -0,0 +1,7160 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
+
+#include "./av1_rtcd.h"
+#include "./aom_dsp_rtcd.h"
+#include "./aom_config.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/binary_codes_writer.h"
+#include "aom_ports/mem.h"
+#include "aom_ports/aom_timer.h"
+#include "aom_ports/system_state.h"
+
+#include "av1/common/common.h"
+#include "av1/common/entropy.h"
+#include "av1/common/entropymode.h"
+#include "av1/common/idct.h"
+#include "av1/common/mv.h"
+#include "av1/common/mvref_common.h"
+#include "av1/common/pred_common.h"
+#include "av1/common/quant_common.h"
+#include "av1/common/reconintra.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/seg_common.h"
+#include "av1/common/tile_common.h"
+
+#include "av1/encoder/aq_complexity.h"
+#include "av1/encoder/aq_cyclicrefresh.h"
+#include "av1/encoder/aq_variance.h"
+#if CONFIG_SUPERTX
+#include "av1/encoder/cost.h"
+#endif
+#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+#include "av1/common/warped_motion.h"
+#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+#if CONFIG_GLOBAL_MOTION
+#include "av1/encoder/global_motion.h"
+#endif // CONFIG_GLOBAL_MOTION
+#include "av1/encoder/encodeframe.h"
+#include "av1/encoder/encodemb.h"
+#include "av1/encoder/encodemv.h"
+#if CONFIG_LV_MAP
+#include "av1/encoder/encodetxb.h"
+#endif
+#include "av1/encoder/ethread.h"
+#include "av1/encoder/extend.h"
+#include "av1/encoder/rd.h"
+#include "av1/encoder/rdopt.h"
+#include "av1/encoder/segmentation.h"
+#include "av1/encoder/tokenize.h"
+#if CONFIG_PVQ
+#include "av1/common/pvq.h"
+#include "av1/encoder/pvq_encoder.h"
+#endif
+#if CONFIG_HIGHBITDEPTH
+#define IF_HBD(...) __VA_ARGS__
+#else
+#define IF_HBD(...)
+#endif // CONFIG_HIGHBITDEPTH
+
+static void encode_superblock(const AV1_COMP *const cpi, ThreadData *td,
+ TOKENEXTRA **t, RUN_TYPE dry_run, int mi_row,
+ int mi_col, BLOCK_SIZE bsize,
+ PICK_MODE_CONTEXT *ctx, int *rate);
+
+#if CONFIG_SUPERTX
+static int check_intra_b(PICK_MODE_CONTEXT *ctx);
+
+static int check_intra_sb(const AV1_COMP *cpi, const TileInfo *const tile,
+ int mi_row, int mi_col, BLOCK_SIZE bsize,
+ PC_TREE *pc_tree);
+static void predict_superblock(const AV1_COMP *const cpi, ThreadData *td,
+#if CONFIG_EXT_INTER
+ int mi_row_ori, int mi_col_ori,
+#endif // CONFIG_EXT_INTER
+ int mi_row_pred, int mi_col_pred,
+ BLOCK_SIZE bsize_pred, int b_sub8x8, int block);
+static int check_supertx_sb(BLOCK_SIZE bsize, TX_SIZE supertx_size,
+ PC_TREE *pc_tree);
+static void predict_sb_complex(const AV1_COMP *const cpi, ThreadData *td,
+ const TileInfo *const tile, int mi_row,
+ int mi_col, int mi_row_ori, int mi_col_ori,
+ RUN_TYPE dry_run, BLOCK_SIZE bsize,
+ BLOCK_SIZE top_bsize, uint8_t *dst_buf[3],
+ int dst_stride[3], PC_TREE *pc_tree);
+static void update_state_sb_supertx(const AV1_COMP *const cpi, ThreadData *td,
+ const TileInfo *const tile, int mi_row,
+ int mi_col, BLOCK_SIZE bsize,
+ RUN_TYPE dry_run, PC_TREE *pc_tree);
+static void rd_supertx_sb(const AV1_COMP *const cpi, ThreadData *td,
+ const TileInfo *const tile, int mi_row, int mi_col,
+ BLOCK_SIZE bsize, int *tmp_rate, int64_t *tmp_dist,
+ TX_TYPE *best_tx, PC_TREE *pc_tree);
+#endif // CONFIG_SUPERTX
+
+// This is used as a reference when computing the source variance for the
+// purposes of activity masking.
+// Eventually this should be replaced by custom no-reference routines,
+// which will be faster.
+static const uint8_t AV1_VAR_OFFS[MAX_SB_SIZE] = {
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+#if CONFIG_EXT_PARTITION
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128
+#endif // CONFIG_EXT_PARTITION
+};
+
+#if CONFIG_HIGHBITDEPTH
+static const uint16_t AV1_HIGH_VAR_OFFS_8[MAX_SB_SIZE] = {
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+#if CONFIG_EXT_PARTITION
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128
+#endif // CONFIG_EXT_PARTITION
+};
+
+static const uint16_t AV1_HIGH_VAR_OFFS_10[MAX_SB_SIZE] = {
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+#if CONFIG_EXT_PARTITION
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4
+#endif // CONFIG_EXT_PARTITION
+};
+
+static const uint16_t AV1_HIGH_VAR_OFFS_12[MAX_SB_SIZE] = {
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16,
+#if CONFIG_EXT_PARTITION
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16
+#endif // CONFIG_EXT_PARTITION
+};
+#endif // CONFIG_HIGHBITDEPTH
+
+unsigned int av1_get_sby_perpixel_variance(const AV1_COMP *cpi,
+ const struct buf_2d *ref,
+ BLOCK_SIZE bs) {
+ unsigned int sse;
+ const unsigned int var =
+ cpi->fn_ptr[bs].vf(ref->buf, ref->stride, AV1_VAR_OFFS, 0, &sse);
+ return ROUND_POWER_OF_TWO(var, num_pels_log2_lookup[bs]);
+}
+
+#if CONFIG_HIGHBITDEPTH
+unsigned int av1_high_get_sby_perpixel_variance(const AV1_COMP *cpi,
+ const struct buf_2d *ref,
+ BLOCK_SIZE bs, int bd) {
+ unsigned int var, sse;
+ switch (bd) {
+ case 10:
+ var =
+ cpi->fn_ptr[bs].vf(ref->buf, ref->stride,
+ CONVERT_TO_BYTEPTR(AV1_HIGH_VAR_OFFS_10), 0, &sse);
+ break;
+ case 12:
+ var =
+ cpi->fn_ptr[bs].vf(ref->buf, ref->stride,
+ CONVERT_TO_BYTEPTR(AV1_HIGH_VAR_OFFS_12), 0, &sse);
+ break;
+ case 8:
+ default:
+ var =
+ cpi->fn_ptr[bs].vf(ref->buf, ref->stride,
+ CONVERT_TO_BYTEPTR(AV1_HIGH_VAR_OFFS_8), 0, &sse);
+ break;
+ }
+ return ROUND_POWER_OF_TWO(var, num_pels_log2_lookup[bs]);
+}
+#endif // CONFIG_HIGHBITDEPTH
+
+static unsigned int get_sby_perpixel_diff_variance(const AV1_COMP *const cpi,
+ const struct buf_2d *ref,
+ int mi_row, int mi_col,
+ BLOCK_SIZE bs) {
+ unsigned int sse, var;
+ uint8_t *last_y;
+ const YV12_BUFFER_CONFIG *last = get_ref_frame_buffer(cpi, LAST_FRAME);
+
+ assert(last != NULL);
+ last_y =
+ &last->y_buffer[mi_row * MI_SIZE * last->y_stride + mi_col * MI_SIZE];
+ var = cpi->fn_ptr[bs].vf(ref->buf, ref->stride, last_y, last->y_stride, &sse);
+ return ROUND_POWER_OF_TWO(var, num_pels_log2_lookup[bs]);
+}
+
+static BLOCK_SIZE get_rd_var_based_fixed_partition(AV1_COMP *cpi, MACROBLOCK *x,
+ int mi_row, int mi_col) {
+ // Map the per-pixel variance that get_sby_perpixel_diff_variance() reports
+ // for the 64x64 block at (mi_row, mi_col) onto a fixed partition size:
+ // the larger the variance, the smaller the block that is returned.
+ const unsigned int diff_var = get_sby_perpixel_diff_variance(
+ cpi, &x->plane[0].src, mi_row, mi_col, BLOCK_64X64);
+ BLOCK_SIZE fixed_bsize = BLOCK_8X8;
+ if (diff_var < 2048) fixed_bsize = BLOCK_16X16;
+ if (diff_var < 128) fixed_bsize = BLOCK_32X32;
+ if (diff_var < 8) fixed_bsize = BLOCK_64X64;
+ return fixed_bsize;
+}
+
+// Lighter version of set_offsets that only sets the mode info
+// pointers.
+static void set_mode_info_offsets(const AV1_COMP *const cpi,
+ MACROBLOCK *const x, MACROBLOCKD *const xd,
+ int mi_row, int mi_col) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int idx_str = xd->mi_stride * mi_row + mi_col;
+ xd->mi = cm->mi_grid_visible + idx_str;
+ xd->mi[0] = cm->mi + idx_str;
+ x->mbmi_ext = cpi->mbmi_ext_base + (mi_row * cm->mi_cols + mi_col);
+}
+
+static void set_offsets_without_segment_id(const AV1_COMP *const cpi,
+ const TileInfo *const tile,
+ MACROBLOCK *const x, int mi_row,
+ int mi_col, BLOCK_SIZE bsize) {
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const int mi_width = mi_size_wide[bsize];
+ const int mi_height = mi_size_high[bsize];
+
+ set_skip_context(xd, mi_row, mi_col);
+
+ set_mode_info_offsets(cpi, x, xd, mi_row, mi_col);
+
+#if CONFIG_VAR_TX
+ xd->above_txfm_context = cm->above_txfm_context + mi_col;
+ xd->left_txfm_context =
+ xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+ xd->max_tx_size = max_txsize_lookup[bsize];
+#endif
+
+ // Set up destination pointers.
+ av1_setup_dst_planes(xd->plane, bsize, get_frame_new_buffer(cm), mi_row,
+ mi_col);
+
+ // Set up limit values for MV components.
+ // Mv beyond the range do not produce new/different prediction block.
+ x->mv_limits.row_min =
+ -(((mi_row + mi_height) * MI_SIZE) + AOM_INTERP_EXTEND);
+ x->mv_limits.col_min = -(((mi_col + mi_width) * MI_SIZE) + AOM_INTERP_EXTEND);
+ x->mv_limits.row_max = (cm->mi_rows - mi_row) * MI_SIZE + AOM_INTERP_EXTEND;
+ x->mv_limits.col_max = (cm->mi_cols - mi_col) * MI_SIZE + AOM_INTERP_EXTEND;
+
+ set_plane_n4(xd, mi_width, mi_height);
+
+ // Set up distance of MB to edge of frame in 1/8th pel units.
+ assert(!(mi_col & (mi_width - 1)) && !(mi_row & (mi_height - 1)));
+ set_mi_row_col(xd, tile, mi_row, mi_height, mi_col, mi_width,
+#if CONFIG_DEPENDENT_HORZTILES
+ cm->dependent_horz_tiles,
+#endif // CONFIG_DEPENDENT_HORZTILES
+ cm->mi_rows, cm->mi_cols);
+
+ // Set up source buffers.
+ av1_setup_src_planes(x, cpi->source, mi_row, mi_col);
+
+ // R/D setup.
+ x->rddiv = cpi->rd.RDDIV;
+ x->rdmult = cpi->rd.RDMULT;
+
+ // required by av1_append_sub8x8_mvs_for_idx() and av1_find_best_ref_mvs()
+ xd->tile = *tile;
+}
+
+static void set_offsets(const AV1_COMP *const cpi, const TileInfo *const tile,
+ MACROBLOCK *const x, int mi_row, int mi_col,
+ BLOCK_SIZE bsize) {
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi;
+ const struct segmentation *const seg = &cm->seg;
+
+ set_offsets_without_segment_id(cpi, tile, x, mi_row, mi_col, bsize);
+
+ mbmi = &xd->mi[0]->mbmi;
+
+ // Setup segment ID.
+ if (seg->enabled) {
+ if (!cpi->vaq_refresh) {
+ const uint8_t *const map =
+ seg->update_map ? cpi->segmentation_map : cm->last_frame_seg_map;
+ mbmi->segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col);
+ }
+ av1_init_plane_quantizers(cpi, x, mbmi->segment_id);
+ } else {
+ mbmi->segment_id = 0;
+ }
+
+#if CONFIG_SUPERTX
+ mbmi->segment_id_supertx = MAX_SEGMENTS;
+#endif // CONFIG_SUPERTX
+}
+
+#if CONFIG_SUPERTX
+static void set_offsets_supertx(const AV1_COMP *const cpi, ThreadData *td,
+ const TileInfo *const tile, int mi_row,
+ int mi_col, BLOCK_SIZE bsize) {
+ MACROBLOCK *const x = &td->mb;
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const int mi_width = mi_size_wide[bsize];
+ const int mi_height = mi_size_high[bsize];
+ // Point the mode info pointers at (mi_row, mi_col).
+ // set_mode_info_offsets() takes five arguments in every configuration, so
+ // the call must not depend on CONFIG_DEPENDENT_HORZTILES: passing
+ // cm->dependent_horz_tiles as a sixth argument does not compile.
+ set_mode_info_offsets(cpi, x, xd, mi_row, mi_col);
+
+ // Set up distance of MB to edge of frame in 1/8th pel units.
+ assert(!(mi_col & (mi_width - 1)) && !(mi_row & (mi_height - 1)));
+ set_mi_row_col(xd, tile, mi_row, mi_height, mi_col, mi_width,
+#if CONFIG_DEPENDENT_HORZTILES
+ cm->dependent_horz_tiles,
+#endif // CONFIG_DEPENDENT_HORZTILES
+ cm->mi_rows, cm->mi_cols);
+}
+
+static void set_offsets_extend(const AV1_COMP *const cpi, ThreadData *td,
+ const TileInfo *const tile, int mi_row_pred,
+ int mi_col_pred, int mi_row_ori, int mi_col_ori,
+ BLOCK_SIZE bsize_pred) {
+ // Used in supertx
+ // (mi_row_ori, mi_col_ori, bsize_ori): region for mv
+ // (mi_row_pred, mi_col_pred, bsize_pred): region to predict
+ MACROBLOCK *const x = &td->mb;
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const int mi_width = mi_size_wide[bsize_pred];
+ const int mi_height = mi_size_high[bsize_pred];
+
+ // The mode info pointers follow the original (mv) position, not the
+ // predicted one. set_mode_info_offsets() takes five arguments in every
+ // configuration, so the call must not depend on
+ // CONFIG_DEPENDENT_HORZTILES: passing cm->dependent_horz_tiles as a
+ // sixth argument does not compile.
+ set_mode_info_offsets(cpi, x, xd, mi_row_ori, mi_col_ori);
+
+ // Set up limit values for MV components.
+ // Mv beyond the range do not produce new/different prediction block.
+ x->mv_limits.row_min =
+ -(((mi_row_pred + mi_height) * MI_SIZE) + AOM_INTERP_EXTEND);
+ x->mv_limits.col_min =
+ -(((mi_col_pred + mi_width) * MI_SIZE) + AOM_INTERP_EXTEND);
+ x->mv_limits.row_max =
+ (cm->mi_rows - mi_row_pred) * MI_SIZE + AOM_INTERP_EXTEND;
+ x->mv_limits.col_max =
+ (cm->mi_cols - mi_col_pred) * MI_SIZE + AOM_INTERP_EXTEND;
+
+// Set up distance of MB to edge of frame in 1/8th pel units.
+#if !CONFIG_CB4X4
+ assert(!(mi_col_pred & (mi_width - mi_size_wide[BLOCK_8X8])) &&
+ !(mi_row_pred & (mi_height - mi_size_high[BLOCK_8X8])));
+#endif
+ set_mi_row_col(xd, tile, mi_row_pred, mi_height, mi_col_pred, mi_width,
+#if CONFIG_DEPENDENT_HORZTILES
+ cm->dependent_horz_tiles,
+#endif // CONFIG_DEPENDENT_HORZTILES
+ cm->mi_rows, cm->mi_cols);
+ xd->up_available = (mi_row_ori > tile->mi_row_start);
+ xd->left_available = (mi_col_ori > tile->mi_col_start);
+
+ // R/D setup.
+ x->rddiv = cpi->rd.RDDIV;
+ x->rdmult = cpi->rd.RDMULT;
+}
+
+static void set_segment_id_supertx(const AV1_COMP *const cpi,
+ MACROBLOCK *const x, const int mi_row,
+ const int mi_col, const BLOCK_SIZE bsize) {
+ const AV1_COMMON *cm = &cpi->common;
+ const struct segmentation *seg = &cm->seg;
+ const int miw = AOMMIN(mi_size_wide[bsize], cm->mi_cols - mi_col);
+ const int mih = AOMMIN(mi_size_high[bsize], cm->mi_rows - mi_row);
+ const int mi_offset = mi_row * cm->mi_stride + mi_col;
+ MODE_INFO **const mip = cm->mi_grid_visible + mi_offset;
+ int r, c;
+ int seg_id_supertx = MAX_SEGMENTS;
+
+ if (!seg->enabled) {
+ seg_id_supertx = 0;
+ } else {
+ // Find the minimum segment_id over the in-frame part of the block
+ for (r = 0; r < mih; r++)
+ for (c = 0; c < miw; c++)
+ seg_id_supertx =
+ AOMMIN(mip[r * cm->mi_stride + c]->mbmi.segment_id, seg_id_supertx);
+ assert(0 <= seg_id_supertx && seg_id_supertx < MAX_SEGMENTS);
+
+ // Initialize plane quantisers for that segment
+ av1_init_plane_quantizers(cpi, x, seg_id_supertx);
+ }
+
+ // Write the chosen segment_id to segment_id_supertx of every covered entry
+ for (r = 0; r < mih; r++)
+ for (c = 0; c < miw; c++)
+ mip[r * cm->mi_stride + c]->mbmi.segment_id_supertx = seg_id_supertx;
+}
+#endif // CONFIG_SUPERTX
+
+static void set_block_size(AV1_COMP *const cpi, MACROBLOCK *const x,
+ MACROBLOCKD *const xd, int mi_row, int mi_col,
+ BLOCK_SIZE bsize) {
+ if (cpi->common.mi_cols > mi_col && cpi->common.mi_rows > mi_row) {
+ const int mi_width = AOMMAX(mi_size_wide[bsize], mi_size_wide[BLOCK_8X8]);
+ const int mi_height = AOMMAX(mi_size_high[bsize], mi_size_high[BLOCK_8X8]);
+ for (int r = 0; r < mi_height; ++r) {
+ for (int c = 0; c < mi_width; ++c) {
+ set_mode_info_offsets(cpi, x, xd, mi_row + r, mi_col + c);
+ xd->mi[0]->mbmi.sb_type = bsize;
+ }
+ }
+ }
+}
+
+static void set_vt_partitioning(AV1_COMP *cpi, MACROBLOCK *const x,
+ MACROBLOCKD *const xd, VAR_TREE *vt, int mi_row,
+ int mi_col, const int64_t *const threshold,
+ const BLOCK_SIZE *const bsize_min) {
+ AV1_COMMON *const cm = &cpi->common;
+ const int hbw = mi_size_wide[vt->bsize] / 2;
+ const int hbh = mi_size_high[vt->bsize] / 2;
+ const int has_cols = mi_col + hbw < cm->mi_cols;
+ const int has_rows = mi_row + hbh < cm->mi_rows;
+
+ if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
+
+ assert(vt->bsize >= BLOCK_8X8);
+
+ assert(hbh == hbw);
+
+ if (vt->bsize == BLOCK_8X8 && cm->frame_type != KEY_FRAME) {
+ set_block_size(cpi, x, xd, mi_row, mi_col, BLOCK_8X8);
+ return;
+ }
+
+ if (vt->force_split || (!has_cols && !has_rows)) goto split;
+
+ // For bsize=bsize_min (16x16/8x8 for 8x8/4x4 downsampling), select if
+ // variance is below threshold, otherwise split will be selected.
+ // No check for vert/horiz split as too few samples for variance.
+ if (vt->bsize == bsize_min[0]) {
+ if (has_cols && has_rows && vt->variances.none.variance < threshold[0]) {
+ set_block_size(cpi, x, xd, mi_row, mi_col, vt->bsize);
+ return;
+ } else {
+ BLOCK_SIZE subsize = get_subsize(vt->bsize, PARTITION_SPLIT);
+ set_block_size(cpi, x, xd, mi_row, mi_col, subsize);
+ if (vt->bsize > BLOCK_8X8) {
+ set_block_size(cpi, x, xd, mi_row, mi_col + hbw, subsize);
+ set_block_size(cpi, x, xd, mi_row + hbh, mi_col, subsize);
+ set_block_size(cpi, x, xd, mi_row + hbh, mi_col + hbw, subsize);
+ }
+ return;
+ }
+ } else if (vt->bsize > bsize_min[0]) {
+ // For key frame: take split for bsize above 32X32 or very high variance.
+ if (cm->frame_type == KEY_FRAME &&
+ (vt->bsize > BLOCK_32X32 ||
+ vt->variances.none.variance > (threshold[0] << 4))) {
+ goto split;
+ }
+ // If variance is low, take the bsize (no split).
+ if (has_cols && has_rows && vt->variances.none.variance < threshold[0]) {
+ set_block_size(cpi, x, xd, mi_row, mi_col, vt->bsize);
+ return;
+ }
+
+ // Check vertical split.
+ if (has_rows) {
+ BLOCK_SIZE subsize = get_subsize(vt->bsize, PARTITION_VERT);
+ if (vt->variances.vert[0].variance < threshold[0] &&
+ vt->variances.vert[1].variance < threshold[0] &&
+ get_plane_block_size(subsize, &xd->plane[1]) < BLOCK_INVALID) {
+ set_block_size(cpi, x, xd, mi_row, mi_col, subsize);
+ set_block_size(cpi, x, xd, mi_row, mi_col + hbw, subsize);
+ return;
+ }
+ }
+ // Check horizontal split.
+ if (has_cols) {
+ BLOCK_SIZE subsize = get_subsize(vt->bsize, PARTITION_HORZ);
+ if (vt->variances.horz[0].variance < threshold[0] &&
+ vt->variances.horz[1].variance < threshold[0] &&
+ get_plane_block_size(subsize, &xd->plane[1]) < BLOCK_INVALID) {
+ set_block_size(cpi, x, xd, mi_row, mi_col, subsize);
+ set_block_size(cpi, x, xd, mi_row + hbh, mi_col, subsize);
+ return;
+ }
+ }
+ }
+
+split : {
+ set_vt_partitioning(cpi, x, xd, vt->split[0], mi_row, mi_col, threshold + 1,
+ bsize_min + 1);
+ set_vt_partitioning(cpi, x, xd, vt->split[1], mi_row, mi_col + hbw,
+ threshold + 1, bsize_min + 1);
+ set_vt_partitioning(cpi, x, xd, vt->split[2], mi_row + hbh, mi_col,
+ threshold + 1, bsize_min + 1);
+ set_vt_partitioning(cpi, x, xd, vt->split[3], mi_row + hbh, mi_col + hbw,
+ threshold + 1, bsize_min + 1);
+ return;
+}
+}
+
+// Set the variance split thresholds for the following block sizes:
+// 0 - superblocks above 64x64 (always INT64_MIN), 1 - 64x64, 2 - 32x32,
+// 3 - 16x16, 4 - 8x8. The 8x8 threshold (to split to 4x4 partition) is
+// currently only set on key frames.
+static void set_vbp_thresholds(AV1_COMP *cpi, int64_t thresholds[], int q) {
+ AV1_COMMON *const cm = &cpi->common;
+ const int is_key_frame = (cm->frame_type == KEY_FRAME);
+ const int threshold_multiplier = is_key_frame ? 20 : 1;
+ const int64_t threshold_base =
+ (int64_t)(threshold_multiplier * cpi->y_dequant[q][1]);
+ if (is_key_frame) {
+ thresholds[1] = threshold_base;
+ thresholds[2] = threshold_base >> 2;
+ thresholds[3] = threshold_base >> 2;
+ thresholds[4] = threshold_base << 2;
+ } else {
+ thresholds[2] = threshold_base;
+ if (cm->width <= 352 && cm->height <= 288) {
+ thresholds[1] = threshold_base >> 2;
+ thresholds[3] = threshold_base << 3;
+ } else {
+ thresholds[1] = threshold_base;
+ thresholds[2] = (5 * threshold_base) >> 2;
+ if (cm->width >= 1920 && cm->height >= 1080)
+ thresholds[2] = (7 * threshold_base) >> 2;
+ thresholds[3] = threshold_base << cpi->oxcf.speed;
+ }
+ }
+ thresholds[0] = INT64_MIN;
+}
+
+void av1_set_variance_partition_thresholds(AV1_COMP *cpi, int q) {
+ // Refresh the variance-based-partition thresholds held in cpi for
+ // index q. Does nothing unless the partition search type is
+ // VAR_BASED_PARTITION or REFERENCE_PARTITION.
+ AV1_COMMON *const cm = &cpi->common;
+ const SPEED_FEATURES *const speed = &cpi->sf;
+ const int use_vbp = speed->partition_search_type == VAR_BASED_PARTITION ||
+ speed->partition_search_type == REFERENCE_PARTITION;
+ if (!use_vbp) return;
+
+ set_vbp_thresholds(cpi, cpi->vbp_thresholds, q);
+ // The thresholds below are not changed locally.
+ if (cm->frame_type == KEY_FRAME) {
+ cpi->vbp_threshold_sad = 0;
+ cpi->vbp_bsize_min = BLOCK_8X8;
+ } else {
+ const int small_frame = cm->width <= 352 && cm->height <= 288;
+ const int dequant_x2 = cpi->y_dequant[q][1] << 1;
+ // Small frames use a fixed SAD threshold; larger ones use twice
+ // y_dequant[q][1], but never less than 1000.
+ cpi->vbp_threshold_sad = small_frame ? 100 : AOMMAX(dequant_x2, 1000);
+ cpi->vbp_bsize_min = BLOCK_16X16;
+ }
+ cpi->vbp_threshold_minmax = 15 + (q >> 3);
+}
+
+// Return (largest - smallest) of the per-subblock (max - min) values.
+static int compute_minmax_8x8(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+#if CONFIG_HIGHBITDEPTH
+ int highbd,
+#endif
+ int pixels_wide, int pixels_high) {
+ int k;
+ int minmax_max = 0;
+ int minmax_min = 255; // NOTE(review): seed looks 8-bit; confirm for highbd
+ // Loop over the 4 8x8 subblocks.
+ for (k = 0; k < 4; k++) {
+ const int x8_idx = ((k & 1) << 3);
+ const int y8_idx = ((k >> 1) << 3);
+ int min = 0;
+ int max = 0;
+ if (x8_idx < pixels_wide && y8_idx < pixels_high) {
+ const int src_offset = y8_idx * src_stride + x8_idx;
+ const int ref_offset = y8_idx * ref_stride + x8_idx;
+#if CONFIG_HIGHBITDEPTH
+ if (highbd) {
+ aom_highbd_minmax_8x8(src + src_offset, src_stride, ref + ref_offset,
+ ref_stride, &min, &max);
+ } else {
+ aom_minmax_8x8(src + src_offset, src_stride, ref + ref_offset,
+ ref_stride, &min, &max);
+ }
+#else
+ aom_minmax_8x8(src + src_offset, src_stride, ref + ref_offset, ref_stride,
+ &min, &max);
+#endif
+ if ((max - min) > minmax_max) minmax_max = (max - min);
+ if ((max - min) < minmax_min) minmax_min = (max - min);
+ }
+ }
+ return (minmax_max - minmax_min);
+}
+
+#if CONFIG_HIGHBITDEPTH
+static INLINE int avg_4x4(const uint8_t *const src, const int stride,
+ const int highbd) {
+ if (highbd) {
+ return aom_highbd_avg_4x4(src, stride);
+ } else {
+ return aom_avg_4x4(src, stride);
+ }
+}
+#else
+static INLINE int avg_4x4(const uint8_t *const src, const int stride) {
+ return aom_avg_4x4(src, stride);
+}
+#endif
+
+#if CONFIG_HIGHBITDEPTH
+static INLINE int avg_8x8(const uint8_t *const src, const int stride,
+ const int highbd) {
+ if (highbd) {
+ return aom_highbd_avg_8x8(src, stride);
+ } else {
+ return aom_avg_8x8(src, stride);
+ }
+}
+#else
+static INLINE int avg_8x8(const uint8_t *const src, const int stride) {
+ return aom_avg_8x8(src, stride);
+}
+#endif
+
+static void init_variance_tree(VAR_TREE *const vt,
+#if CONFIG_HIGHBITDEPTH
+ const int highbd,
+#endif
+ BLOCK_SIZE bsize, BLOCK_SIZE leaf_size,
+ const int width, const int height,
+ const uint8_t *const src, const int src_stride,
+ const uint8_t *const ref, const int ref_stride) {
+ assert(bsize >= leaf_size);
+
+ vt->bsize = bsize;
+
+ vt->force_split = 0;
+
+ vt->src = src;
+ vt->src_stride = src_stride;
+ vt->ref = ref;
+ vt->ref_stride = ref_stride;
+
+ vt->width = width;
+ vt->height = height;
+
+#if CONFIG_HIGHBITDEPTH
+ vt->highbd = highbd;
+#endif // CONFIG_HIGHBITDEPTH
+
+ if (bsize > leaf_size) {
+ const BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_SPLIT);
+ const int px = block_size_wide[subsize];
+
+ init_variance_tree(vt->split[0],
+#if CONFIG_HIGHBITDEPTH
+ highbd,
+#endif // CONFIG_HIGHBITDEPTH
+ subsize, leaf_size, AOMMIN(px, width),
+ AOMMIN(px, height), src, src_stride, ref, ref_stride);
+ init_variance_tree(vt->split[1],
+#if CONFIG_HIGHBITDEPTH
+ highbd,
+#endif // CONFIG_HIGHBITDEPTH
+ subsize, leaf_size, width - px, AOMMIN(px, height),
+ src + px, src_stride, ref + px, ref_stride);
+ init_variance_tree(vt->split[2],
+#if CONFIG_HIGHBITDEPTH
+ highbd,
+#endif // CONFIG_HIGHBITDEPTH
+ subsize, leaf_size, AOMMIN(px, width), height - px,
+ src + px * src_stride, src_stride, ref + px * ref_stride,
+ ref_stride);
+ init_variance_tree(vt->split[3],
+#if CONFIG_HIGHBITDEPTH
+ highbd,
+#endif // CONFIG_HIGHBITDEPTH
+ subsize, leaf_size, width - px, height - px,
+ src + px * src_stride + px, src_stride,
+ ref + px * ref_stride + px, ref_stride);
+ }
+}
+
+// Fill the variance tree based on averaging pixel values (sub-sampling), at
+// the leaf node size.
+static void fill_variance_tree(VAR_TREE *const vt, const BLOCK_SIZE leaf_size) {
+ if (vt->bsize > leaf_size) {
+ fill_variance_tree(vt->split[0], leaf_size);
+ fill_variance_tree(vt->split[1], leaf_size);
+ fill_variance_tree(vt->split[2], leaf_size);
+ fill_variance_tree(vt->split[3], leaf_size);
+ fill_variance_node(vt);
+ } else if (vt->width <= 0 || vt->height <= 0) {
+ fill_variance(0, 0, 0, &vt->variances.none);
+ } else {
+ unsigned int sse = 0;
+ int sum = 0;
+ int src_avg;
+ int ref_avg;
+ assert(leaf_size == BLOCK_4X4 || leaf_size == BLOCK_8X8);
+ if (leaf_size == BLOCK_4X4) {
+ src_avg = avg_4x4(vt->src, vt->src_stride IF_HBD(, vt->highbd));
+ ref_avg = avg_4x4(vt->ref, vt->ref_stride IF_HBD(, vt->highbd));
+ } else {
+ src_avg = avg_8x8(vt->src, vt->src_stride IF_HBD(, vt->highbd));
+ ref_avg = avg_8x8(vt->ref, vt->ref_stride IF_HBD(, vt->highbd));
+ }
+ sum = src_avg - ref_avg;
+ sse = sum * sum;
+ fill_variance(sse, sum, 0, &vt->variances.none);
+ }
+}
+
+static void refine_variance_tree(VAR_TREE *const vt, const int64_t threshold) {
+ if (vt->bsize >= BLOCK_8X8) {
+ if (vt->bsize == BLOCK_16X16) {
+ if (vt->variances.none.variance <= threshold)
+ return;
+ else
+ vt->force_split = 0;
+ }
+
+ refine_variance_tree(vt->split[0], threshold);
+ refine_variance_tree(vt->split[1], threshold);
+ refine_variance_tree(vt->split[2], threshold);
+ refine_variance_tree(vt->split[3], threshold);
+
+ if (vt->bsize <= BLOCK_16X16) fill_variance_node(vt);
+ } else if (vt->width <= 0 || vt->height <= 0) {
+ fill_variance(0, 0, 0, &vt->variances.none);
+ } else {
+ const int src_avg = avg_4x4(vt->src, vt->src_stride IF_HBD(, vt->highbd));
+ const int ref_avg = avg_4x4(vt->ref, vt->ref_stride IF_HBD(, vt->highbd));
+ const int sum = src_avg - ref_avg;
+ const unsigned int sse = sum * sum;
+ assert(vt->bsize == BLOCK_4X4);
+ fill_variance(sse, sum, 0, &vt->variances.none);
+ }
+}
+
+static int check_split_key_frame(VAR_TREE *const vt, const int64_t threshold) {
+ if (vt->bsize == BLOCK_32X32) {
+ vt->force_split = vt->variances.none.variance > threshold;
+ } else {
+ vt->force_split |= check_split_key_frame(vt->split[0], threshold);
+ vt->force_split |= check_split_key_frame(vt->split[1], threshold);
+ vt->force_split |= check_split_key_frame(vt->split[2], threshold);
+ vt->force_split |= check_split_key_frame(vt->split[3], threshold);
+ }
+ return vt->force_split;
+}
+
+static int check_split(AV1_COMP *const cpi, VAR_TREE *const vt,
+ const int segment_id, const int64_t *const thresholds) {
+ if (vt->bsize == BLOCK_16X16) {
+ vt->force_split = vt->variances.none.variance > thresholds[0];
+ if (!vt->force_split && vt->variances.none.variance > thresholds[-1] &&
+ !cyclic_refresh_segment_id_boosted(segment_id)) {
+ // We have some nominal amount of 16x16 variance (based on average),
+ // compute the minmax over the 8x8 sub-blocks, and if above threshold,
+ // force split to 8x8 block for this 16x16 block.
+ int minmax =
+ compute_minmax_8x8(vt->src, vt->src_stride, vt->ref, vt->ref_stride,
+#if CONFIG_HIGHBITDEPTH
+ vt->highbd,
+#endif
+ vt->width, vt->height);
+ vt->force_split = minmax > cpi->vbp_threshold_minmax;
+ }
+ } else {
+ vt->force_split |=
+ check_split(cpi, vt->split[0], segment_id, thresholds + 1);
+ vt->force_split |=
+ check_split(cpi, vt->split[1], segment_id, thresholds + 1);
+ vt->force_split |=
+ check_split(cpi, vt->split[2], segment_id, thresholds + 1);
+ vt->force_split |=
+ check_split(cpi, vt->split[3], segment_id, thresholds + 1);
+
+ if (vt->bsize == BLOCK_32X32 && !vt->force_split) {
+ vt->force_split = vt->variances.none.variance > thresholds[0];
+ }
+ }
+
+ return vt->force_split;
+}
+
+ // This function chooses partitioning based on the variance between source and
+ // reconstructed last (or golden), where variance is computed for down-sampled
+ // inputs.
+ //
+ // Outline, as visible in this function:
+ //  - thresholds start as a copy of cpi->vbp_thresholds; when cyclic-refresh AQ
+ //    is active and the superblock's segment is boosted, they are recomputed
+ //    by set_vbp_thresholds() for that segment's q index;
+ //  - on non-key frames a predictor is built from LAST_FRAME or GOLDEN_FRAME
+ //    (golden only when its SAD is strictly lower) and its dst buffer is used
+ //    as the variance reference; on key frames a constant offset buffer with
+ //    stride 0 is used instead;
+ //  - the variance tree rooted at vt is initialised, filled and checked for
+ //    splits, after which set_vt_partitioning() is called with the result.
+ //
+ // mi_row / mi_col give the position of the superblock being processed.
+ static void choose_partitioning(AV1_COMP *const cpi, ThreadData *const td,
+ const TileInfo *const tile, MACROBLOCK *const x,
+ const int mi_row, const int mi_col) {
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ VAR_TREE *const vt = td->var_root[cm->mib_size_log2 - MIN_MIB_SIZE_LOG2];
+ #if CONFIG_DUAL_FILTER
+ int i;
+ #endif
+ const uint8_t *src;
+ const uint8_t *ref;
+ int src_stride;
+ int ref_stride;
+ int pixels_wide = MI_SIZE * mi_size_wide[cm->sb_size];
+ int pixels_high = MI_SIZE * mi_size_high[cm->sb_size];
+ int64_t thresholds[5] = {
+ cpi->vbp_thresholds[0], cpi->vbp_thresholds[1], cpi->vbp_thresholds[2],
+ cpi->vbp_thresholds[3], cpi->vbp_thresholds[4],
+ };
+ BLOCK_SIZE bsize_min[5] = { BLOCK_16X16, BLOCK_16X16, BLOCK_16X16,
+ cpi->vbp_bsize_min, BLOCK_8X8 };
+ // With a 64x64 superblock the first entry of both tables is skipped.
+ const int start_level = cm->sb_size == BLOCK_64X64 ? 1 : 0;
+ const int64_t *const thre = thresholds + start_level;
+ const BLOCK_SIZE *const bmin = bsize_min + start_level;
+
+ const int is_key_frame = (cm->frame_type == KEY_FRAME);
+ const int low_res = (cm->width <= 352 && cm->height <= 288);
+
+ int segment_id = CR_SEGMENT_ID_BASE;
+
+ if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled) {
+ const uint8_t *const map =
+ cm->seg.update_map ? cpi->segmentation_map : cm->last_frame_seg_map;
+ segment_id = get_segment_id(cm, map, cm->sb_size, mi_row, mi_col);
+
+ if (cyclic_refresh_segment_id_boosted(segment_id)) {
+ int q = av1_get_qindex(&cm->seg, segment_id, cm->base_qindex);
+ set_vbp_thresholds(cpi, thresholds, q);
+ }
+ }
+
+ set_offsets(cpi, tile, x, mi_row, mi_col, cm->sb_size);
+
+ // A negative edge distance reduces the measured area accordingly.
+ // NOTE(review): the >> 3 presumably converts from 1/8-pel units - confirm.
+ if (xd->mb_to_right_edge < 0) pixels_wide += (xd->mb_to_right_edge >> 3);
+ if (xd->mb_to_bottom_edge < 0) pixels_high += (xd->mb_to_bottom_edge >> 3);
+
+ src = x->plane[0].src.buf;
+ src_stride = x->plane[0].src.stride;
+
+ if (!is_key_frame) {
+ MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+ const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, LAST_FRAME);
+ const YV12_BUFFER_CONFIG *yv12_g = get_ref_frame_buffer(cpi, GOLDEN_FRAME);
+ unsigned int y_sad, y_sad_g;
+
+ // Pick the SAD block size from how the superblock straddles the frame
+ // boundary: halve it in each dimension whose second half starts outside.
+ const int hbs = cm->mib_size / 2;
+ const int split_vert = mi_col + hbs >= cm->mi_cols;
+ const int split_horz = mi_row + hbs >= cm->mi_rows;
+ BLOCK_SIZE bsize;
+
+ if (split_vert && split_horz)
+ bsize = get_subsize(cm->sb_size, PARTITION_SPLIT);
+ else if (split_vert)
+ bsize = get_subsize(cm->sb_size, PARTITION_VERT);
+ else if (split_horz)
+ bsize = get_subsize(cm->sb_size, PARTITION_HORZ);
+ else
+ bsize = cm->sb_size;
+
+ assert(yv12 != NULL);
+
+ if (yv12_g && yv12_g != yv12) {
+ av1_setup_pre_planes(xd, 0, yv12_g, mi_row, mi_col,
+ &cm->frame_refs[GOLDEN_FRAME - 1].sf);
+ y_sad_g = cpi->fn_ptr[bsize].sdf(
+ x->plane[0].src.buf, x->plane[0].src.stride, xd->plane[0].pre[0].buf,
+ xd->plane[0].pre[0].stride);
+ } else {
+ // No distinct golden buffer: UINT_MAX can never be strictly below y_sad,
+ // so the golden branch further down is not taken.
+ y_sad_g = UINT_MAX;
+ }
+
+ av1_setup_pre_planes(xd, 0, yv12, mi_row, mi_col,
+ &cm->frame_refs[LAST_FRAME - 1].sf);
+ mbmi->ref_frame[0] = LAST_FRAME;
+ mbmi->ref_frame[1] = NONE_FRAME;
+ mbmi->sb_type = cm->sb_size;
+ mbmi->mv[0].as_int = 0;
+ #if CONFIG_DUAL_FILTER
+ for (i = 0; i < 4; ++i) mbmi->interp_filter[i] = BILINEAR;
+ #else
+ mbmi->interp_filter = BILINEAR;
+ #endif
+
+ y_sad = av1_int_pro_motion_estimation(cpi, x, bsize, mi_row, mi_col);
+
+ if (y_sad_g < y_sad) {
+ // Golden with a zero motion vector beats the LAST_FRAME estimate.
+ av1_setup_pre_planes(xd, 0, yv12_g, mi_row, mi_col,
+ &cm->frame_refs[GOLDEN_FRAME - 1].sf);
+ mbmi->ref_frame[0] = GOLDEN_FRAME;
+ mbmi->mv[0].as_int = 0;
+ y_sad = y_sad_g;
+ } else {
+ x->pred_mv[LAST_FRAME] = mbmi->mv[0].as_mv;
+ }
+
+ av1_build_inter_predictors_sb(xd, mi_row, mi_col, NULL, cm->sb_size);
+
+ // The prediction just built in the dst buffer is the variance reference.
+ ref = xd->plane[0].dst.buf;
+ ref_stride = xd->plane[0].dst.stride;
+
+ // If the y_sad is very small, take the largest partition and exit.
+ // Don't check on boosted segment for now, as largest is suppressed there.
+ if (segment_id == CR_SEGMENT_ID_BASE && y_sad < cpi->vbp_threshold_sad) {
+ if (!split_vert && !split_horz) {
+ set_block_size(cpi, x, xd, mi_row, mi_col, cm->sb_size);
+ return;
+ }
+ }
+ } else {
+ // Key frame: compare the source against a constant buffer (stride 0).
+ ref = AV1_VAR_OFFS;
+ ref_stride = 0;
+ #if CONFIG_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ switch (xd->bd) {
+ case 10: ref = CONVERT_TO_BYTEPTR(AV1_HIGH_VAR_OFFS_10); break;
+ case 12: ref = CONVERT_TO_BYTEPTR(AV1_HIGH_VAR_OFFS_12); break;
+ case 8:
+ default: ref = CONVERT_TO_BYTEPTR(AV1_HIGH_VAR_OFFS_8); break;
+ }
+ }
+ #endif // CONFIG_HIGHBITDEPTH
+ }
+
+ init_variance_tree(
+ vt,
+ #if CONFIG_HIGHBITDEPTH
+ xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH,
+ #endif // CONFIG_HIGHBITDEPTH
+ cm->sb_size, (is_key_frame || low_res) ? BLOCK_4X4 : BLOCK_8X8,
+ pixels_wide, pixels_high, src, src_stride, ref, ref_stride);
+
+ // Fill in the entire tree of variances and compute splits.
+ if (is_key_frame) {
+ fill_variance_tree(vt, BLOCK_4X4);
+ check_split_key_frame(vt, thre[1]);
+ } else {
+ fill_variance_tree(vt, BLOCK_8X8);
+ check_split(cpi, vt, segment_id, thre);
+ if (low_res) {
+ refine_variance_tree(vt, thre[1] << 1);
+ }
+ }
+
+ // A superblock that extends past the right or bottom frame edge is always
+ // split at the root.
+ vt->force_split |= mi_col + cm->mib_size > cm->mi_cols ||
+ mi_row + cm->mib_size > cm->mi_rows;
+
+ // Now go through the entire structure, splitting every block size until
+ // we get to one that's got a variance lower than our threshold.
+ set_vt_partitioning(cpi, x, xd, vt, mi_row, mi_col, thre, bmin);
+ }
+
+#if CONFIG_DUAL_FILTER
+ // For each filter direction d in {0, 1}: when has_subpel_mv_component()
+ // reports no sub-pel component for d, and the block either has no second
+ // reference or direction d + 2 has none either, the filter for d is reset to
+ // the frame-level filter (EIGHTTAP_REGULAR when that is SWITCHABLE).
+ // The filter at d + 2 is then ALWAYS set to mirror the filter at d, whether
+ // or not a reset took place.
+ static void reset_intmv_filter_type(const AV1_COMMON *const cm, MACROBLOCKD *xd,
+                                     MB_MODE_INFO *mbmi) {
+   int d = 0;
+   while (d < 2) {
+     int no_subpel = !has_subpel_mv_component(xd->mi[0], xd, d);
+     // The second reference is only consulted for compound blocks.
+     if (no_subpel && mbmi->ref_frame[1] != NONE_FRAME) {
+       no_subpel = !has_subpel_mv_component(xd->mi[0], xd, d + 2);
+     }
+     if (no_subpel) {
+       if (cm->interp_filter == SWITCHABLE) {
+         mbmi->interp_filter[d] = EIGHTTAP_REGULAR;
+       } else {
+         mbmi->interp_filter[d] = cm->interp_filter;
+       }
+     }
+     // Unconditional: keep the paired entry in sync with entry d.
+     mbmi->interp_filter[d + 2] = mbmi->interp_filter[d];
+     ++d;
+   }
+ }
+
+ // Increments counts->switchable_interp for each filter direction (0 and 1)
+ // in which has_subpel_mv_component() reports a sub-pel component, either for
+ // that direction itself or, when ref_frame[1] is above INTRA_FRAME, for
+ // direction + 2. The counter is indexed by the prediction context for the
+ // direction and by the block's filter for that direction.
+ static void update_filter_type_count(FRAME_COUNTS *counts,
+                                      const MACROBLOCKD *xd,
+                                      const MB_MODE_INFO *mbmi) {
+   int d;
+   for (d = 0; d < 2; ++d) {
+     int filter_ctx;
+     int has_subpel = has_subpel_mv_component(xd->mi[0], xd, d) != 0;
+     if (!has_subpel && mbmi->ref_frame[1] > INTRA_FRAME) {
+       has_subpel = has_subpel_mv_component(xd->mi[0], xd, d + 2) != 0;
+     }
+     if (!has_subpel) continue;
+
+     filter_ctx = av1_get_pred_context_switchable_interp(xd, d);
+     ++counts->switchable_interp[filter_ctx][mbmi->interp_filter[d]];
+   }
+ }
+#endif
+#if CONFIG_GLOBAL_MOTION
+ // When mode is ZEROMV (or ZERO_ZEROMV with CONFIG_EXT_INTER), adds the number
+ // of 4x4 units covered by bsize to rdc->global_motion_used for every
+ // reference frame the block uses. Any other mode leaves rdc untouched.
+ static void update_global_motion_used(PREDICTION_MODE mode, BLOCK_SIZE bsize,
+                                       const MB_MODE_INFO *mbmi,
+                                       RD_COUNTS *rdc) {
+   int is_zero_mode = (mode == ZEROMV);
+ #if CONFIG_EXT_INTER
+   if (!is_zero_mode) is_zero_mode = (mode == ZERO_ZEROMV);
+ #endif
+   if (!is_zero_mode) return;
+
+   {
+     const int units_4x4 =
+         num_4x4_blocks_wide_lookup[bsize] * num_4x4_blocks_high_lookup[bsize];
+     int r = 0;
+     while (r < 1 + has_second_ref(mbmi)) {
+       rdc->global_motion_used[mbmi->ref_frame[r]] += units_4x4;
+       ++r;
+     }
+   }
+ }
+#endif // CONFIG_GLOBAL_MOTION
+
+ // Re-derives mbmi->tx_size after the segment id may have changed:
+ //  - a lossless segment always gets TX_4X4;
+ //  - otherwise, unless tx_mode is TX_MODE_SELECT (in which case tx_size is
+ //    left alone), the size is taken from tx_size_from_tx_mode().
+ static void reset_tx_size(MACROBLOCKD *xd, MB_MODE_INFO *mbmi,
+                           const TX_MODE tx_mode) {
+   if (xd->lossless[mbmi->segment_id]) {
+     mbmi->tx_size = TX_4X4;
+     return;
+   }
+   if (tx_mode == TX_MODE_SELECT) return;
+
+   mbmi->tx_size =
+       tx_size_from_tx_mode(mbmi->sb_type, tx_mode, is_inter_block(mbmi));
+ }
+
+#if CONFIG_REF_MV
+ // For blocks coded with a NEWMV component, takes the candidate selected by
+ // mbmi->ref_mv_idx from the reference MV stack for rf_type, clamps it with
+ // clamp_mv_ref(), and stores it as the reference MV (slot 0 of
+ // mbmi_ext->ref_mvs for the reference frame), as mbmi->pred_mv and in
+ // mi_pred_mv. Blocks without a NEWMV component are left unchanged.
+ static void set_ref_and_pred_mvs(MACROBLOCK *const x, int_mv *const mi_pred_mv,
+                                  int8_t rf_type) {
+   MACROBLOCKD *const xd = &x->e_mbd;
+   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+   MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
+   CANDIDATE_MV *const mv_stack = mbmi_ext->ref_mv_stack[rf_type];
+   const int blk_w = xd->n8_w << MI_SIZE_LOG2;
+   const int blk_h = xd->n8_h << MI_SIZE_LOG2;
+   int stack_idx = mbmi->ref_mv_idx;
+   int r;
+
+ #if CONFIG_EXT_INTER
+   if (has_second_ref(mbmi)) {
+     // Special case: NEAR_NEWMV and NEW_NEARMV modes use 1 + mbmi->ref_mv_idx
+     // (like NEARMV) instead
+     const int uses_next_entry =
+         (mbmi->mode == NEAR_NEWMV) || (mbmi->mode == NEW_NEARMV);
+     if (uses_next_entry) ++stack_idx;
+
+     if (compound_ref0_mode(mbmi->mode) == NEWMV) {
+       int_mv mv0 = mv_stack[stack_idx].this_mv;
+       clamp_mv_ref(&mv0.as_mv, blk_w, blk_h, xd);
+       mbmi_ext->ref_mvs[mbmi->ref_frame[0]][0] = mv0;
+       mbmi->pred_mv[0] = mv0;
+       mi_pred_mv[0] = mv0;
+     }
+     if (compound_ref1_mode(mbmi->mode) == NEWMV) {
+       int_mv mv1 = mv_stack[stack_idx].comp_mv;
+       clamp_mv_ref(&mv1.as_mv, blk_w, blk_h, xd);
+       mbmi_ext->ref_mvs[mbmi->ref_frame[1]][0] = mv1;
+       mbmi->pred_mv[1] = mv1;
+       mi_pred_mv[1] = mv1;
+     }
+     return;
+   }
+ #endif  // CONFIG_EXT_INTER
+
+   if (mbmi->mode != NEWMV) return;
+
+   // Reference 0 uses the candidate's this_mv, reference 1 its comp_mv.
+   for (r = 0; r < 1 + has_second_ref(mbmi); ++r) {
+     int_mv cand;
+     if (r == 0) {
+       cand = mv_stack[stack_idx].this_mv;
+     } else {
+       cand = mv_stack[stack_idx].comp_mv;
+     }
+     clamp_mv_ref(&cand.as_mv, blk_w, blk_h, xd);
+     mbmi_ext->ref_mvs[mbmi->ref_frame[r]][0] = cand;
+     mbmi->pred_mv[r] = cand;
+     mi_pred_mv[r] = cand;
+   }
+ }
+#endif // CONFIG_REF_MV
+
+ // Commits the mode decision held in ctx for the block of size bsize at
+ // (mi_row, mi_col):
+ //  - copies ctx->mic into the current mode-info entry (xd->mi[0]) and
+ //    ctx->mbmi_ext into x->mbmi_ext;
+ //  - with segmentation enabled, refreshes the segment id for COMPLEXITY_AQ /
+ //    CYCLIC_REFRESH_AQ and re-derives the transform size;
+ //  - points the per-plane coefficient buffers at the ones stored in ctx;
+ //  - makes every in-frame mode-info pointer covered by the block refer to the
+ //    same entry.
+ // When dry_run is non-zero the function returns at that point. Otherwise it
+ // also updates the mode / MV / filter counters and writes the block's
+ // reference frames and motion vectors into cm->cur_frame->mvs.
+ static void update_state(const AV1_COMP *const cpi, ThreadData *td,
+ PICK_MODE_CONTEXT *ctx, int mi_row, int mi_col,
+ BLOCK_SIZE bsize, RUN_TYPE dry_run) {
+ int i, x_idx, y;
+ const AV1_COMMON *const cm = &cpi->common;
+ RD_COUNTS *const rdc = &td->rd_counts;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ struct macroblock_plane *const p = x->plane;
+ struct macroblockd_plane *const pd = xd->plane;
+ MODE_INFO *mi = &ctx->mic;
+ MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+ MODE_INFO *mi_addr = xd->mi[0];
+ const struct segmentation *const seg = &cm->seg;
+ const int bw = mi_size_wide[mi->mbmi.sb_type];
+ const int bh = mi_size_high[mi->mbmi.sb_type];
+ // Clip the block extent to the frame for the MV buffer update at the end.
+ const int x_mis = AOMMIN(bw, cm->mi_cols - mi_col);
+ const int y_mis = AOMMIN(bh, cm->mi_rows - mi_row);
+ MV_REF *const frame_mvs = cm->cur_frame->mvs + mi_row * cm->mi_cols + mi_col;
+ int w, h;
+
+ const int mis = cm->mi_stride;
+ const int mi_width = mi_size_wide[bsize];
+ const int mi_height = mi_size_high[bsize];
+ const int unify_bsize = CONFIG_CB4X4;
+
+ #if CONFIG_REF_MV
+ int8_t rf_type;
+ #endif
+
+ #if !CONFIG_SUPERTX
+ assert(mi->mbmi.sb_type == bsize);
+ #endif
+
+ // From here on mbmi (== &xd->mi[0]->mbmi) holds the picked mode.
+ *mi_addr = *mi;
+ *x->mbmi_ext = ctx->mbmi_ext;
+
+ #if CONFIG_DUAL_FILTER
+ reset_intmv_filter_type(cm, xd, mbmi);
+ #endif
+
+ #if CONFIG_REF_MV
+ rf_type = av1_ref_frame_type(mbmi->ref_frame);
+ if (x->mbmi_ext->ref_mv_count[rf_type] > 1 &&
+ (mbmi->sb_type >= BLOCK_8X8 || unify_bsize)) {
+ set_ref_and_pred_mvs(x, mi->mbmi.pred_mv, rf_type);
+ }
+ #endif // CONFIG_REF_MV
+
+ // If segmentation in use
+ if (seg->enabled) {
+ // For in frame complexity AQ copy the segment id from the segment map.
+ if (cpi->oxcf.aq_mode == COMPLEXITY_AQ) {
+ const uint8_t *const map =
+ seg->update_map ? cpi->segmentation_map : cm->last_frame_seg_map;
+ mi_addr->mbmi.segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col);
+ reset_tx_size(xd, &mi_addr->mbmi, cm->tx_mode);
+ }
+ // Else for cyclic refresh mode update the segment map, set the segment id
+ // and then update the quantizer.
+ if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) {
+ av1_cyclic_refresh_update_segment(cpi, &xd->mi[0]->mbmi, mi_row, mi_col,
+ bsize, ctx->rate, ctx->dist, x->skip);
+ reset_tx_size(xd, &mi_addr->mbmi, cm->tx_mode);
+ }
+ }
+
+ // Point the working coefficient buffers at the ones saved with this context.
+ for (i = 0; i < MAX_MB_PLANE; ++i) {
+ p[i].coeff = ctx->coeff[i];
+ p[i].qcoeff = ctx->qcoeff[i];
+ pd[i].dqcoeff = ctx->dqcoeff[i];
+ #if CONFIG_PVQ
+ pd[i].pvq_ref_coeff = ctx->pvq_ref_coeff[i];
+ #endif
+ p[i].eobs = ctx->eobs[i];
+ #if CONFIG_LV_MAP
+ p[i].txb_entropy_ctx = ctx->txb_entropy_ctx[i];
+ #endif // CONFIG_LV_MAP
+ }
+ #if CONFIG_PALETTE
+ for (i = 0; i < 2; ++i) pd[i].color_index_map = ctx->color_index_map[i];
+ #endif // CONFIG_PALETTE
+
+ // Restore the coding context of the MB to that that was in place
+ // when the mode was picked for it
+ for (y = 0; y < mi_height; y++)
+ for (x_idx = 0; x_idx < mi_width; x_idx++)
+ if ((xd->mb_to_right_edge >> (3 + MI_SIZE_LOG2)) + mi_width > x_idx &&
+ (xd->mb_to_bottom_edge >> (3 + MI_SIZE_LOG2)) + mi_height > y) {
+ xd->mi[x_idx + y * mis] = mi_addr;
+ }
+
+ #if CONFIG_DELTA_Q && !CONFIG_EXT_DELTA_Q
+ if (cpi->oxcf.aq_mode > NO_AQ && cpi->oxcf.aq_mode < DELTA_AQ)
+ av1_init_plane_quantizers(cpi, x, xd->mi[0]->mbmi.segment_id);
+ #else
+ if (cpi->oxcf.aq_mode)
+ av1_init_plane_quantizers(cpi, x, xd->mi[0]->mbmi.segment_id);
+ #endif
+
+ // Sub-8x8 inter blocks: the block-level MV is taken from sub-block 3.
+ if (is_inter_block(mbmi) && mbmi->sb_type < BLOCK_8X8 && !unify_bsize) {
+ mbmi->mv[0].as_int = mi->bmi[3].as_mv[0].as_int;
+ mbmi->mv[1].as_int = mi->bmi[3].as_mv[1].as_int;
+ }
+
+ x->skip = ctx->skip;
+
+ #if CONFIG_VAR_TX
+ for (i = 0; i < 1; ++i)
+ memcpy(x->blk_skip[i], ctx->blk_skip[i],
+ sizeof(uint8_t) * ctx->num_4x4_blk);
+ #endif
+
+ // Everything below only runs when dry_run is zero.
+ if (dry_run) return;
+
+ #if CONFIG_INTERNAL_STATS
+ {
+ unsigned int *const mode_chosen_counts =
+ (unsigned int *)cpi->mode_chosen_counts; // Cast const away.
+ if (frame_is_intra_only(cm)) {
+ static const int kf_mode_index[] = {
+ THR_DC /*DC_PRED*/,
+ THR_V_PRED /*V_PRED*/,
+ THR_H_PRED /*H_PRED*/,
+ THR_D45_PRED /*D45_PRED*/,
+ THR_D135_PRED /*D135_PRED*/,
+ THR_D117_PRED /*D117_PRED*/,
+ THR_D153_PRED /*D153_PRED*/,
+ THR_D207_PRED /*D207_PRED*/,
+ THR_D63_PRED /*D63_PRED*/,
+ #if CONFIG_ALT_INTRA
+ THR_SMOOTH, /*SMOOTH_PRED*/
+ #endif // CONFIG_ALT_INTRA
+ THR_TM /*TM_PRED*/,
+ };
+ ++mode_chosen_counts[kf_mode_index[mbmi->mode]];
+ } else {
+ // Note how often each mode chosen as best
+ ++mode_chosen_counts[ctx->best_mode_index];
+ }
+ }
+ #endif
+ if (!frame_is_intra_only(cm)) {
+ if (is_inter_block(mbmi)) {
+ av1_update_mv_count(td);
+ #if CONFIG_GLOBAL_MOTION
+ if (bsize >= BLOCK_8X8) {
+ // TODO(sarahparker): global motion stats need to be handled per-tile
+ // to be compatible with tile-based threading.
+ update_global_motion_used(mbmi->mode, bsize, mbmi, rdc);
+ } else {
+ // Below 8x8 the mode is stored per sub-block in mi->bmi[].
+ const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize];
+ const int num_4x4_h = num_4x4_blocks_high_lookup[bsize];
+ int idx, idy;
+ for (idy = 0; idy < 2; idy += num_4x4_h) {
+ for (idx = 0; idx < 2; idx += num_4x4_w) {
+ const int j = idy * 2 + idx;
+ update_global_motion_used(mi->bmi[j].as_mode, bsize, mbmi, rdc);
+ }
+ }
+ }
+ #endif // CONFIG_GLOBAL_MOTION
+ if (cm->interp_filter == SWITCHABLE
+ #if CONFIG_WARPED_MOTION
+ && mbmi->motion_mode != WARPED_CAUSAL
+ #endif // CONFIG_WARPED_MOTION
+ #if CONFIG_GLOBAL_MOTION
+ && !is_nontrans_global_motion(xd)
+ #endif // CONFIG_GLOBAL_MOTION
+ ) {
+ #if CONFIG_DUAL_FILTER
+ update_filter_type_count(td->counts, xd, mbmi);
+ #else
+ const int switchable_ctx = av1_get_pred_context_switchable_interp(xd);
+ ++td->counts->switchable_interp[switchable_ctx][mbmi->interp_filter];
+ #endif
+ }
+ }
+
+ rdc->comp_pred_diff[SINGLE_REFERENCE] += ctx->single_pred_diff;
+ rdc->comp_pred_diff[COMPOUND_REFERENCE] += ctx->comp_pred_diff;
+ rdc->comp_pred_diff[REFERENCE_MODE_SELECT] += ctx->hybrid_pred_diff;
+ }
+
+ // Store the block's reference frames and MVs for every in-frame mi unit it
+ // covers. Note the values come from ctx->mic (mi), not from mbmi.
+ for (h = 0; h < y_mis; ++h) {
+ MV_REF *const frame_mv = frame_mvs + h * cm->mi_cols;
+ for (w = 0; w < x_mis; ++w) {
+ MV_REF *const mv = frame_mv + w;
+ mv->ref_frame[0] = mi->mbmi.ref_frame[0];
+ mv->ref_frame[1] = mi->mbmi.ref_frame[1];
+ mv->mv[0].as_int = mi->mbmi.mv[0].as_int;
+ mv->mv[1].as_int = mi->mbmi.mv[1].as_int;
+ }
+ }
+ }
+
+#if CONFIG_SUPERTX
+ // Supertx counterpart of update_state(): commits the mode decision held in
+ // ctx for the block of size bsize at (mi_row, mi_col).
+ // Differences visible here compared with update_state():
+ //  - the block is asserted to be inter coded;
+ //  - the segment id is chosen from VAQ energy, cyclic refresh or the segment
+ //    map, and segment_id_supertx is set to MAX_SEGMENTS;
+ //  - the per-plane coefficient pointers are not touched;
+ //  - motion_mode is forced to SIMPLE_TRANSLATION.
+ // When dry_run is non-zero the function returns before any counters or the
+ // frame MV buffer are updated.
+ static void update_state_supertx(const AV1_COMP *const cpi, ThreadData *td,
+ PICK_MODE_CONTEXT *ctx, int mi_row, int mi_col,
+ BLOCK_SIZE bsize, RUN_TYPE dry_run) {
+ int y, x_idx;
+ #if CONFIG_VAR_TX
+ int i;
+ #endif
+ const AV1_COMMON *const cm = &cpi->common;
+ RD_COUNTS *const rdc = &td->rd_counts;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MODE_INFO *mi = &ctx->mic;
+ MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+ MODE_INFO *mi_addr = xd->mi[0];
+ const struct segmentation *const seg = &cm->seg;
+ const int mis = cm->mi_stride;
+ const int mi_width = mi_size_wide[bsize];
+ const int mi_height = mi_size_high[bsize];
+ // Clip the block extent to the frame for the MV buffer update at the end.
+ const int x_mis = AOMMIN(mi_width, cm->mi_cols - mi_col);
+ const int y_mis = AOMMIN(mi_height, cm->mi_rows - mi_row);
+ const int unify_bsize = CONFIG_CB4X4;
+ MV_REF *const frame_mvs = cm->cur_frame->mvs + mi_row * cm->mi_cols + mi_col;
+ int w, h;
+
+ #if CONFIG_REF_MV
+ int8_t rf_type;
+ #endif
+
+ // From here on mbmi (== &xd->mi[0]->mbmi) holds the picked mode.
+ *mi_addr = *mi;
+ *x->mbmi_ext = ctx->mbmi_ext;
+ assert(is_inter_block(mbmi));
+ assert(mbmi->tx_size == ctx->mic.mbmi.tx_size);
+
+ #if CONFIG_DUAL_FILTER
+ reset_intmv_filter_type(cm, xd, mbmi);
+ #endif
+
+ #if CONFIG_REF_MV
+ rf_type = av1_ref_frame_type(mbmi->ref_frame);
+ if (x->mbmi_ext->ref_mv_count[rf_type] > 1 &&
+ (mbmi->sb_type >= BLOCK_8X8 || unify_bsize)) {
+ set_ref_and_pred_mvs(x, mi->mbmi.pred_mv, rf_type);
+ }
+ #endif // CONFIG_REF_MV
+
+ // If segmentation in use
+ if (seg->enabled) {
+ if (cpi->vaq_refresh) {
+ // Blocks up to 16x16 reuse x->mb_energy; larger ones compute their own.
+ const int energy =
+ bsize <= BLOCK_16X16 ? x->mb_energy : av1_block_energy(cpi, x, bsize);
+ mi_addr->mbmi.segment_id = av1_vaq_segment_id(energy);
+ } else if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) {
+ // For cyclic refresh mode, now update the segment map
+ // and set the segment id.
+ av1_cyclic_refresh_update_segment(cpi, &xd->mi[0]->mbmi, mi_row, mi_col,
+ bsize, ctx->rate, ctx->dist, 1);
+ } else {
+ // Otherwise just set the segment id based on the current segment map
+ const uint8_t *const map =
+ seg->update_map ? cpi->segmentation_map : cm->last_frame_seg_map;
+ mi_addr->mbmi.segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col);
+ }
+ mi_addr->mbmi.segment_id_supertx = MAX_SEGMENTS;
+ }
+
+ // Restore the coding context of the MB to that that was in place
+ // when the mode was picked for it
+ for (y = 0; y < mi_height; y++)
+ for (x_idx = 0; x_idx < mi_width; x_idx++)
+ if ((xd->mb_to_right_edge >> (3 + MI_SIZE_LOG2)) + mi_width > x_idx &&
+ (xd->mb_to_bottom_edge >> (3 + MI_SIZE_LOG2)) + mi_height > y) {
+ xd->mi[x_idx + y * mis] = mi_addr;
+ }
+
+ #if !CONFIG_CB4X4
+ // Sub-8x8 inter blocks: the block-level MV is taken from sub-block 3.
+ if (is_inter_block(mbmi) && mbmi->sb_type < BLOCK_8X8) {
+ mbmi->mv[0].as_int = mi->bmi[3].as_mv[0].as_int;
+ mbmi->mv[1].as_int = mi->bmi[3].as_mv[1].as_int;
+ }
+ #endif
+
+ x->skip = ctx->skip;
+
+ #if CONFIG_VAR_TX
+ for (i = 0; i < 1; ++i)
+ memcpy(x->blk_skip[i], ctx->blk_skip[i],
+ sizeof(uint8_t) * ctx->num_4x4_blk);
+
+ if (!is_inter_block(mbmi) || mbmi->skip)
+ mbmi->min_tx_size = get_min_tx_size(mbmi->tx_size);
+ #endif // CONFIG_VAR_TX
+
+ #if CONFIG_VAR_TX
+ {
+ // Fill the inter_tx_size entries covered by the block's transform with
+ // that single transform size; entry [0][0] is always written.
+ const TX_SIZE mtx = mbmi->tx_size;
+ const int num_4x4_blocks_wide = tx_size_wide_unit[mtx] >> 1;
+ const int num_4x4_blocks_high = tx_size_high_unit[mtx] >> 1;
+ int idy, idx;
+ mbmi->inter_tx_size[0][0] = mtx;
+ for (idy = 0; idy < num_4x4_blocks_high; ++idy)
+ for (idx = 0; idx < num_4x4_blocks_wide; ++idx)
+ mbmi->inter_tx_size[idy][idx] = mtx;
+ }
+ #endif // CONFIG_VAR_TX
+ // Turn motion variation off for supertx
+ mbmi->motion_mode = SIMPLE_TRANSLATION;
+
+ // Everything below only runs when dry_run is zero.
+ if (dry_run) return;
+
+ if (!frame_is_intra_only(cm)) {
+ av1_update_mv_count(td);
+
+ #if CONFIG_GLOBAL_MOTION
+ if (is_inter_block(mbmi)) {
+ if (bsize >= BLOCK_8X8) {
+ // TODO(sarahparker): global motion stats need to be handled per-tile
+ // to be compatible with tile-based threading.
+ update_global_motion_used(mbmi->mode, bsize, mbmi, rdc);
+ } else {
+ // Below 8x8 the mode is stored per sub-block in mi->bmi[].
+ const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize];
+ const int num_4x4_h = num_4x4_blocks_high_lookup[bsize];
+ int idx, idy;
+ for (idy = 0; idy < 2; idy += num_4x4_h) {
+ for (idx = 0; idx < 2; idx += num_4x4_w) {
+ const int j = idy * 2 + idx;
+ update_global_motion_used(mi->bmi[j].as_mode, bsize, mbmi, rdc);
+ }
+ }
+ }
+ }
+ #endif // CONFIG_GLOBAL_MOTION
+
+ if (cm->interp_filter == SWITCHABLE
+ #if CONFIG_GLOBAL_MOTION
+ && !is_nontrans_global_motion(xd)
+ #endif // CONFIG_GLOBAL_MOTION
+ ) {
+ #if CONFIG_DUAL_FILTER
+ update_filter_type_count(td->counts, xd, mbmi);
+ #else
+ const int pred_ctx = av1_get_pred_context_switchable_interp(xd);
+ ++td->counts->switchable_interp[pred_ctx][mbmi->interp_filter];
+ #endif
+ }
+
+ rdc->comp_pred_diff[SINGLE_REFERENCE] += ctx->single_pred_diff;
+ rdc->comp_pred_diff[COMPOUND_REFERENCE] += ctx->comp_pred_diff;
+ rdc->comp_pred_diff[REFERENCE_MODE_SELECT] += ctx->hybrid_pred_diff;
+ }
+
+ // Store the block's reference frames and MVs for every in-frame mi unit it
+ // covers. Note the values come from ctx->mic (mi), not from mbmi.
+ for (h = 0; h < y_mis; ++h) {
+ MV_REF *const frame_mv = frame_mvs + h * cm->mi_cols;
+ for (w = 0; w < x_mis; ++w) {
+ MV_REF *const mv = frame_mv + w;
+ mv->ref_frame[0] = mi->mbmi.ref_frame[0];
+ mv->ref_frame[1] = mi->mbmi.ref_frame[1];
+ mv->mv[0].as_int = mi->mbmi.mv[0].as_int;
+ mv->mv[1].as_int = mi->mbmi.mv[1].as_int;
+ }
+ }
+ }
+
+ // Walks the partition tree pc_tree for the block of size bsize at
+ // (mi_row, mi_col) and calls update_state_supertx() on every leaf context,
+ // recursing into the four quadrants for PARTITION_SPLIT. Each leaf call is
+ // preceded by set_offsets_supertx() for that leaf's position and size.
+ // Afterwards the per-plane coefficient pointers are set from the *_supertx
+ // context that matches the partition type; for PARTITION_NONE no such context
+ // is selected and the pointers are set to NULL.
+ // Returns immediately when the block starts outside the frame.
+ static void update_state_sb_supertx(const AV1_COMP *const cpi, ThreadData *td,
+ const TileInfo *const tile, int mi_row,
+ int mi_col, BLOCK_SIZE bsize,
+ RUN_TYPE dry_run, PC_TREE *pc_tree) {
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ struct macroblock_plane *const p = x->plane;
+ struct macroblockd_plane *const pd = xd->plane;
+ // Half the block width in mi units; also used as the vertical offset.
+ int hbs = mi_size_wide[bsize] / 2;
+ #if CONFIG_CB4X4
+ const int unify_bsize = 1;
+ #else
+ const int unify_bsize = 0;
+ #endif
+ PARTITION_TYPE partition = pc_tree->partitioning;
+ BLOCK_SIZE subsize = get_subsize(bsize, partition);
+ int i;
+ #if CONFIG_EXT_PARTITION_TYPES
+ BLOCK_SIZE bsize2 = get_subsize(bsize, PARTITION_SPLIT);
+ #endif
+ // Supertx context for this partition type; stays NULL for PARTITION_NONE.
+ PICK_MODE_CONTEXT *pmc = NULL;
+
+ if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
+
+ if (bsize == BLOCK_16X16 && cpi->vaq_refresh)
+ x->mb_energy = av1_block_energy(cpi, x, bsize);
+
+ switch (partition) {
+ case PARTITION_NONE:
+ set_offsets_supertx(cpi, td, tile, mi_row, mi_col, subsize);
+ update_state_supertx(cpi, td, &pc_tree->none, mi_row, mi_col, subsize,
+ dry_run);
+ break;
+ case PARTITION_VERT:
+ set_offsets_supertx(cpi, td, tile, mi_row, mi_col, subsize);
+ update_state_supertx(cpi, td, &pc_tree->vertical[0], mi_row, mi_col,
+ subsize, dry_run);
+ // The second half is only processed when it starts inside the frame.
+ if (mi_col + hbs < cm->mi_cols && (bsize > BLOCK_8X8 || unify_bsize)) {
+ set_offsets_supertx(cpi, td, tile, mi_row, mi_col + hbs, subsize);
+ update_state_supertx(cpi, td, &pc_tree->vertical[1], mi_row,
+ mi_col + hbs, subsize, dry_run);
+ }
+ pmc = &pc_tree->vertical_supertx;
+ break;
+ case PARTITION_HORZ:
+ set_offsets_supertx(cpi, td, tile, mi_row, mi_col, subsize);
+ update_state_supertx(cpi, td, &pc_tree->horizontal[0], mi_row, mi_col,
+ subsize, dry_run);
+ // The second half is only processed when it starts inside the frame.
+ if (mi_row + hbs < cm->mi_rows && (bsize > BLOCK_8X8 || unify_bsize)) {
+ set_offsets_supertx(cpi, td, tile, mi_row + hbs, mi_col, subsize);
+ update_state_supertx(cpi, td, &pc_tree->horizontal[1], mi_row + hbs,
+ mi_col, subsize, dry_run);
+ }
+ pmc = &pc_tree->horizontal_supertx;
+ break;
+ case PARTITION_SPLIT:
+ if (bsize == BLOCK_8X8 && !unify_bsize) {
+ // 8x8 split without CB4X4: a single leaf context covers the block.
+ set_offsets_supertx(cpi, td, tile, mi_row, mi_col, subsize);
+ update_state_supertx(cpi, td, pc_tree->leaf_split[0], mi_row, mi_col,
+ subsize, dry_run);
+ } else {
+ set_offsets_supertx(cpi, td, tile, mi_row, mi_col, subsize);
+ update_state_sb_supertx(cpi, td, tile, mi_row, mi_col, subsize, dry_run,
+ pc_tree->split[0]);
+ set_offsets_supertx(cpi, td, tile, mi_row, mi_col + hbs, subsize);
+ update_state_sb_supertx(cpi, td, tile, mi_row, mi_col + hbs, subsize,
+ dry_run, pc_tree->split[1]);
+ set_offsets_supertx(cpi, td, tile, mi_row + hbs, mi_col, subsize);
+ update_state_sb_supertx(cpi, td, tile, mi_row + hbs, mi_col, subsize,
+ dry_run, pc_tree->split[2]);
+ set_offsets_supertx(cpi, td, tile, mi_row + hbs, mi_col + hbs, subsize);
+ update_state_sb_supertx(cpi, td, tile, mi_row + hbs, mi_col + hbs,
+ subsize, dry_run, pc_tree->split[3]);
+ }
+ pmc = &pc_tree->split_supertx;
+ break;
+ #if CONFIG_EXT_PARTITION_TYPES
+ case PARTITION_HORZ_A:
+ set_offsets_supertx(cpi, td, tile, mi_row, mi_col, bsize2);
+ update_state_supertx(cpi, td, &pc_tree->horizontala[0], mi_row, mi_col,
+ bsize2, dry_run);
+ set_offsets_supertx(cpi, td, tile, mi_row, mi_col + hbs, bsize2);
+ update_state_supertx(cpi, td, &pc_tree->horizontala[1], mi_row,
+ mi_col + hbs, bsize2, dry_run);
+ set_offsets_supertx(cpi, td, tile, mi_row + hbs, mi_col, subsize);
+ update_state_supertx(cpi, td, &pc_tree->horizontala[2], mi_row + hbs,
+ mi_col, subsize, dry_run);
+ pmc = &pc_tree->horizontala_supertx;
+ break;
+ case PARTITION_HORZ_B:
+ set_offsets_supertx(cpi, td, tile, mi_row, mi_col, subsize);
+ update_state_supertx(cpi, td, &pc_tree->horizontalb[0], mi_row, mi_col,
+ subsize, dry_run);
+ set_offsets_supertx(cpi, td, tile, mi_row + hbs, mi_col, bsize2);
+ update_state_supertx(cpi, td, &pc_tree->horizontalb[1], mi_row + hbs,
+ mi_col, bsize2, dry_run);
+ set_offsets_supertx(cpi, td, tile, mi_row + hbs, mi_col + hbs, bsize2);
+ update_state_supertx(cpi, td, &pc_tree->horizontalb[2], mi_row + hbs,
+ mi_col + hbs, bsize2, dry_run);
+ pmc = &pc_tree->horizontalb_supertx;
+ break;
+ case PARTITION_VERT_A:
+ set_offsets_supertx(cpi, td, tile, mi_row, mi_col, bsize2);
+ update_state_supertx(cpi, td, &pc_tree->verticala[0], mi_row, mi_col,
+ bsize2, dry_run);
+ set_offsets_supertx(cpi, td, tile, mi_row + hbs, mi_col, bsize2);
+ update_state_supertx(cpi, td, &pc_tree->verticala[1], mi_row + hbs,
+ mi_col, bsize2, dry_run);
+ set_offsets_supertx(cpi, td, tile, mi_row, mi_col + hbs, subsize);
+ update_state_supertx(cpi, td, &pc_tree->verticala[2], mi_row,
+ mi_col + hbs, subsize, dry_run);
+ pmc = &pc_tree->verticala_supertx;
+ break;
+ case PARTITION_VERT_B:
+ set_offsets_supertx(cpi, td, tile, mi_row, mi_col, subsize);
+ update_state_supertx(cpi, td, &pc_tree->verticalb[0], mi_row, mi_col,
+ subsize, dry_run);
+ set_offsets_supertx(cpi, td, tile, mi_row, mi_col + hbs, bsize2);
+ update_state_supertx(cpi, td, &pc_tree->verticalb[1], mi_row,
+ mi_col + hbs, bsize2, dry_run);
+ set_offsets_supertx(cpi, td, tile, mi_row + hbs, mi_col + hbs, bsize2);
+ update_state_supertx(cpi, td, &pc_tree->verticalb[2], mi_row + hbs,
+ mi_col + hbs, bsize2, dry_run);
+ pmc = &pc_tree->verticalb_supertx;
+ break;
+ #endif // CONFIG_EXT_PARTITION_TYPES
+ default: assert(0);
+ }
+
+ // Point the working coefficient buffers at the selected supertx context.
+ for (i = 0; i < MAX_MB_PLANE; ++i) {
+ if (pmc != NULL) {
+ p[i].coeff = pmc->coeff[i];
+ p[i].qcoeff = pmc->qcoeff[i];
+ pd[i].dqcoeff = pmc->dqcoeff[i];
+ p[i].eobs = pmc->eobs[i];
+ } else {
+ // These should never be used
+ p[i].coeff = NULL;
+ p[i].qcoeff = NULL;
+ pd[i].dqcoeff = NULL;
+ p[i].eobs = NULL;
+ }
+ }
+ }
+
+ // Records the supertx decision in a single leaf context: the transform size
+ // and type that were chosen, plus the current skip flag of the macroblock
+ // (and, with CONFIG_VAR_TX, its blk_skip[0] map and the matching minimum
+ // transform size).
+ static void update_supertx_param(ThreadData *td, PICK_MODE_CONTEXT *ctx,
+                                  int best_tx, TX_SIZE supertx_size) {
+   MACROBLOCK *const mb = &td->mb;
+
+ #if CONFIG_VAR_TX
+   // Only the first blk_skip plane is copied.
+   memcpy(ctx->blk_skip[0], mb->blk_skip[0],
+          sizeof(uint8_t) * ctx->num_4x4_blk);
+   ctx->mic.mbmi.min_tx_size = get_min_tx_size(supertx_size);
+ #endif  // CONFIG_VAR_TX
+
+   ctx->mic.mbmi.tx_size = supertx_size;
+   ctx->skip = mb->skip;
+   ctx->mic.mbmi.tx_type = best_tx;
+ }
+
+ // Applies update_supertx_param() to every leaf context of pc_tree for the
+ // block of size bsize at (mi_row, mi_col), recursing into the four quadrants
+ // for PARTITION_SPLIT. Blocks that start outside the frame are ignored, and
+ // the second half of a VERT / HORZ partition is skipped when it starts
+ // outside the frame.
+ static void update_supertx_param_sb(const AV1_COMP *const cpi, ThreadData *td,
+                                     int mi_row, int mi_col, BLOCK_SIZE bsize,
+                                     int best_tx, TX_SIZE supertx_size,
+                                     PC_TREE *pc_tree) {
+   const AV1_COMMON *const cm = &cpi->common;
+   // Half the block width in mi units; also used as the vertical offset.
+   const int half = mi_size_wide[bsize] / 2;
+   PARTITION_TYPE partition = pc_tree->partitioning;
+   BLOCK_SIZE subsize = get_subsize(bsize, partition);
+ #if CONFIG_CB4X4
+   const int unify_bsize = 1;
+ #else
+   const int unify_bsize = 0;
+ #endif
+   // True when a VERT / HORZ partition of this block has a second half.
+   const int has_second_half = (bsize > BLOCK_8X8) || unify_bsize;
+ #if CONFIG_EXT_PARTITION_TYPES
+   int k;
+ #endif
+
+   if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
+
+   switch (partition) {
+     case PARTITION_NONE:
+       update_supertx_param(td, &pc_tree->none, best_tx, supertx_size);
+       break;
+     case PARTITION_VERT:
+       update_supertx_param(td, &pc_tree->vertical[0], best_tx, supertx_size);
+       if (mi_col + half < cm->mi_cols && has_second_half) {
+         update_supertx_param(td, &pc_tree->vertical[1], best_tx, supertx_size);
+       }
+       break;
+     case PARTITION_HORZ:
+       update_supertx_param(td, &pc_tree->horizontal[0], best_tx, supertx_size);
+       if (mi_row + half < cm->mi_rows && has_second_half) {
+         update_supertx_param(td, &pc_tree->horizontal[1], best_tx,
+                              supertx_size);
+       }
+       break;
+     case PARTITION_SPLIT:
+       if (unify_bsize || bsize != BLOCK_8X8) {
+         update_supertx_param_sb(cpi, td, mi_row, mi_col, subsize, best_tx,
+                                 supertx_size, pc_tree->split[0]);
+         update_supertx_param_sb(cpi, td, mi_row, mi_col + half, subsize,
+                                 best_tx, supertx_size, pc_tree->split[1]);
+         update_supertx_param_sb(cpi, td, mi_row + half, mi_col, subsize,
+                                 best_tx, supertx_size, pc_tree->split[2]);
+         update_supertx_param_sb(cpi, td, mi_row + half, mi_col + half, subsize,
+                                 best_tx, supertx_size, pc_tree->split[3]);
+       } else {
+         // 8x8 split without CB4X4: a single leaf context covers the block.
+         update_supertx_param(td, pc_tree->leaf_split[0], best_tx, supertx_size);
+       }
+       break;
+ #if CONFIG_EXT_PARTITION_TYPES
+     case PARTITION_HORZ_A:
+       for (k = 0; k < 3; k++) {
+         update_supertx_param(td, &pc_tree->horizontala[k], best_tx,
+                              supertx_size);
+       }
+       break;
+     case PARTITION_HORZ_B:
+       for (k = 0; k < 3; k++) {
+         update_supertx_param(td, &pc_tree->horizontalb[k], best_tx,
+                              supertx_size);
+       }
+       break;
+     case PARTITION_VERT_A:
+       for (k = 0; k < 3; k++) {
+         update_supertx_param(td, &pc_tree->verticala[k], best_tx, supertx_size);
+       }
+       break;
+     case PARTITION_VERT_B:
+       for (k = 0; k < 3; k++) {
+         update_supertx_param(td, &pc_tree->verticalb[k], best_tx, supertx_size);
+       }
+       break;
+ #endif  // CONFIG_EXT_PARTITION_TYPES
+     default: assert(0);
+   }
+ }
+#endif // CONFIG_SUPERTX
+
+#if CONFIG_MOTION_VAR && CONFIG_NCOBMC
+ // Positions the encoder on the block of size bsize at (mi_row, mi_col) via
+ // set_offsets() and then commits ctx through update_state(), passing 1 as
+ // the dry_run argument.
+ static void set_mode_info_b(const AV1_COMP *const cpi,
+                             const TileInfo *const tile, ThreadData *td,
+                             int mi_row, int mi_col, BLOCK_SIZE bsize,
+                             PICK_MODE_CONTEXT *ctx) {
+   set_offsets(cpi, tile, &td->mb, mi_row, mi_col, bsize);
+   update_state(cpi, td, ctx, mi_row, mi_col, bsize, 1);
+ }
+
+ // Walks the partition tree pc_tree for the block of size bsize at
+ // (mi_row, mi_col) and calls set_mode_info_b() on every leaf context,
+ // recursing into the four quadrants for PARTITION_SPLIT.
+ // Returns immediately when the block starts outside the frame.
+ // tp is not used here other than being passed on to the recursive calls.
+ static void set_mode_info_sb(const AV1_COMP *const cpi, ThreadData *td,
+ const TileInfo *const tile, TOKENEXTRA **tp,
+ int mi_row, int mi_col, BLOCK_SIZE bsize,
+ PC_TREE *pc_tree) {
+ const AV1_COMMON *const cm = &cpi->common;
+ // Half the block width in mi units; also used as the vertical offset.
+ const int hbs = mi_size_wide[bsize] / 2;
+ const PARTITION_TYPE partition = pc_tree->partitioning;
+ BLOCK_SIZE subsize = get_subsize(bsize, partition);
+ #if CONFIG_EXT_PARTITION_TYPES
+ const BLOCK_SIZE bsize2 = get_subsize(bsize, PARTITION_SPLIT);
+ #endif
+ #if CONFIG_CB4X4
+ const int unify_bsize = 1;
+ #else
+ const int unify_bsize = 0;
+ assert(bsize >= BLOCK_8X8);
+ #endif
+
+ if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
+
+ switch (partition) {
+ case PARTITION_NONE:
+ set_mode_info_b(cpi, tile, td, mi_row, mi_col, subsize, &pc_tree->none);
+ break;
+ case PARTITION_VERT:
+ set_mode_info_b(cpi, tile, td, mi_row, mi_col, subsize,
+ &pc_tree->vertical[0]);
+ // The second half is only processed when it starts inside the frame.
+ if (mi_col + hbs < cm->mi_cols && (bsize > BLOCK_8X8 || unify_bsize)) {
+ set_mode_info_b(cpi, tile, td, mi_row, mi_col + hbs, subsize,
+ &pc_tree->vertical[1]);
+ }
+ break;
+ case PARTITION_HORZ:
+ set_mode_info_b(cpi, tile, td, mi_row, mi_col, subsize,
+ &pc_tree->horizontal[0]);
+ // The second half is only processed when it starts inside the frame.
+ if (mi_row + hbs < cm->mi_rows && (bsize > BLOCK_8X8 || unify_bsize)) {
+ set_mode_info_b(cpi, tile, td, mi_row + hbs, mi_col, subsize,
+ &pc_tree->horizontal[1]);
+ }
+ break;
+ case PARTITION_SPLIT:
+ if (bsize == BLOCK_8X8 && !unify_bsize) {
+ // 8x8 split without CB4X4: a single leaf context covers the block.
+ set_mode_info_b(cpi, tile, td, mi_row, mi_col, subsize,
+ pc_tree->leaf_split[0]);
+ } else {
+ set_mode_info_sb(cpi, td, tile, tp, mi_row, mi_col, subsize,
+ pc_tree->split[0]);
+ set_mode_info_sb(cpi, td, tile, tp, mi_row, mi_col + hbs, subsize,
+ pc_tree->split[1]);
+ set_mode_info_sb(cpi, td, tile, tp, mi_row + hbs, mi_col, subsize,
+ pc_tree->split[2]);
+ set_mode_info_sb(cpi, td, tile, tp, mi_row + hbs, mi_col + hbs, subsize,
+ pc_tree->split[3]);
+ }
+ break;
+ #if CONFIG_EXT_PARTITION_TYPES
+ case PARTITION_HORZ_A:
+ set_mode_info_b(cpi, tile, td, mi_row, mi_col, bsize2,
+ &pc_tree->horizontala[0]);
+ set_mode_info_b(cpi, tile, td, mi_row, mi_col + hbs, bsize2,
+ &pc_tree->horizontala[1]);
+ set_mode_info_b(cpi, tile, td, mi_row + hbs, mi_col, subsize,
+ &pc_tree->horizontala[2]);
+ break;
+ case PARTITION_HORZ_B:
+ set_mode_info_b(cpi, tile, td, mi_row, mi_col, subsize,
+ &pc_tree->horizontalb[0]);
+ set_mode_info_b(cpi, tile, td, mi_row + hbs, mi_col, bsize2,
+ &pc_tree->horizontalb[1]);
+ set_mode_info_b(cpi, tile, td, mi_row + hbs, mi_col + hbs, bsize2,
+ &pc_tree->horizontalb[2]);
+ break;
+ case PARTITION_VERT_A:
+ set_mode_info_b(cpi, tile, td, mi_row, mi_col, bsize2,
+ &pc_tree->verticala[0]);
+ set_mode_info_b(cpi, tile, td, mi_row + hbs, mi_col, bsize2,
+ &pc_tree->verticala[1]);
+ set_mode_info_b(cpi, tile, td, mi_row, mi_col + hbs, subsize,
+ &pc_tree->verticala[2]);
+ break;
+ case PARTITION_VERT_B:
+ set_mode_info_b(cpi, tile, td, mi_row, mi_col, subsize,
+ &pc_tree->verticalb[0]);
+ set_mode_info_b(cpi, tile, td, mi_row, mi_col + hbs, bsize2,
+ &pc_tree->verticalb[1]);
+ set_mode_info_b(cpi, tile, td, mi_row + hbs, mi_col + hbs, bsize2,
+ &pc_tree->verticalb[2]);
+ break;
+ #endif // CONFIG_EXT_PARTITION_TYPES
+ default: assert(0 && "Invalid partition type."); break;
+ }
+ }
+#endif
+
+ // Makes src the current source frame of x and sets up the source buffer of
+ // every plane for the block at (mi_row, mi_col). The Y plane uses the luma
+ // buffer, crop size and stride; the U and V planes share the chroma crop
+ // size and stride but use their own buffers.
+ void av1_setup_src_planes(MACROBLOCK *x, const YV12_BUFFER_CONFIG *src,
+                           int mi_row, int mi_col) {
+   // Per-plane lookup tables, indexed Y, U, V.
+   uint8_t *const plane_buf[3] = { src->y_buffer, src->u_buffer,
+                                   src->v_buffer };
+   const int plane_w[3] = { src->y_crop_width, src->uv_crop_width,
+                            src->uv_crop_width };
+   const int plane_h[3] = { src->y_crop_height, src->uv_crop_height,
+                            src->uv_crop_height };
+   const int plane_stride[3] = { src->y_stride, src->uv_stride,
+                                 src->uv_stride };
+   int plane = 0;
+
+   // Set current frame pointer.
+   x->e_mbd.cur_buf = src;
+
+   while (plane < MAX_MB_PLANE) {
+     setup_pred_plane(&x->plane[plane].src, x->e_mbd.mi[0]->mbmi.sb_type,
+                      plane_buf[plane], plane_w[plane], plane_h[plane],
+                      plane_stride[plane], mi_row, mi_col, NULL,
+                      x->e_mbd.plane[plane].subsampling_x,
+                      x->e_mbd.plane[plane].subsampling_y);
+     plane++;
+   }
+ }
+
+ // Re-initialises the plane quantizers of x for segment_id and returns the
+ // value av1_compute_rd_mult() gives for that segment's q index offset by the
+ // frame's y_dc_delta_q.
+ static int set_segment_rdmult(const AV1_COMP *const cpi, MACROBLOCK *const x,
+                               int8_t segment_id) {
+   int qindex;
+
+   av1_init_plane_quantizers(cpi, x, segment_id);
+   aom_clear_system_state();
+
+   qindex =
+       av1_get_qindex(&cpi->common.seg, segment_id, cpi->common.base_qindex);
+   return av1_compute_rd_mult(cpi, qindex + cpi->common.y_dc_delta_q);
+ }
+ // Picks the coding mode for the bsize block at (mi_row, mi_col): intra-only frames use av1_rd_pick_intra_mode_sb(), other frames one of the inter searches. The resulting rate/dist are copied into ctx, and x->rdmult is restored before returning.
+static void rd_pick_sb_modes(const AV1_COMP *const cpi, TileDataEnc *tile_data,
+ MACROBLOCK *const x, int mi_row, int mi_col,
+ RD_STATS *rd_cost, // out: search result; when rate is INT_MAX, rdcost is forced to INT64_MAX
+#if CONFIG_SUPERTX
+ int *totalrate_nocoef,
+#endif
+#if CONFIG_EXT_PARTITION_TYPES
+ PARTITION_TYPE partition,
+#endif
+ BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
+ int64_t best_rd) { // best_rd is forwarded unchanged to the mode search routines
+ const AV1_COMMON *const cm = &cpi->common;
+ TileInfo *const tile_info = &tile_data->tile_info;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi;
+ struct macroblock_plane *const p = x->plane;
+ struct macroblockd_plane *const pd = xd->plane;
+ const AQ_MODE aq_mode = cpi->oxcf.aq_mode;
+ int i, orig_rdmult;
+ const int unify_bsize = CONFIG_CB4X4;
+
+ aom_clear_system_state();
+
+#if CONFIG_PVQ
+ x->pvq_speed = 1;
+ x->pvq_coded = 0;
+#endif
+#if CONFIG_CFL
+ // Don't store luma during RDO (we will store the best mode later).
+ x->cfl_store_y = 0;
+#endif
+
+ set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
+ mbmi = &xd->mi[0]->mbmi;
+ mbmi->sb_type = bsize;
+#if CONFIG_RD_DEBUG
+ mbmi->mi_row = mi_row;
+ mbmi->mi_col = mi_col;
+#endif
+#if CONFIG_SUPERTX
+ // We set tx_size here as skip blocks would otherwise not set it.
+ // tx_size needs to be set at this point as supertx_enable in
+ // write_modes_sb is computed based on this, and if the garbage in memory
+ // just happens to be the supertx_size, then the packer will code this
+ // block as a supertx block, even if rdopt did not pick it as such.
+ mbmi->tx_size = max_txsize_lookup[bsize];
+#endif
+#if CONFIG_EXT_PARTITION_TYPES
+ mbmi->partition = partition;
+#endif
+ // Point the per-plane coefficient and eob buffers at the storage held by ctx.
+ for (i = 0; i < MAX_MB_PLANE; ++i) {
+ p[i].coeff = ctx->coeff[i];
+ p[i].qcoeff = ctx->qcoeff[i];
+ pd[i].dqcoeff = ctx->dqcoeff[i];
+#if CONFIG_PVQ
+ pd[i].pvq_ref_coeff = ctx->pvq_ref_coeff[i];
+#endif
+ p[i].eobs = ctx->eobs[i];
+#if CONFIG_LV_MAP
+ p[i].txb_entropy_ctx = ctx->txb_entropy_ctx[i];
+#endif
+ }
+
+#if CONFIG_PALETTE
+ for (i = 0; i < 2; ++i) pd[i].color_index_map = ctx->color_index_map[i];
+#endif // CONFIG_PALETTE
+
+ ctx->skippable = 0;
+ ctx->pred_pixel_ready = 0;
+
+ // Set to zero to make sure we do not use the previous encoded frame stats
+ mbmi->skip = 0;
+
+#if CONFIG_CB4X4
+ x->skip_chroma_rd =
+ !is_chroma_reference(mi_row, mi_col, bsize, xd->plane[1].subsampling_x,
+ xd->plane[1].subsampling_y);
+#endif
+ // Per-pixel variance of the plane-0 source block; the high-bitdepth variant is used when the frame buffer carries YV12_FLAG_HIGHBITDEPTH.
+#if CONFIG_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ x->source_variance = av1_high_get_sby_perpixel_variance(
+ cpi, &x->plane[0].src, bsize, xd->bd);
+ } else {
+ x->source_variance =
+ av1_get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize);
+ }
+#else
+ x->source_variance =
+ av1_get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize);
+#endif // CONFIG_HIGHBITDEPTH
+
+ // Save rdmult before it might be changed, so it can be restored later.
+ orig_rdmult = x->rdmult;
+ // Depending on the AQ mode, switch x->rdmult to a segment-specific value for this search.
+ if (aq_mode == VARIANCE_AQ) {
+ if (cpi->vaq_refresh) {
+ const int energy =
+ bsize <= BLOCK_16X16 ? x->mb_energy : av1_block_energy(cpi, x, bsize);
+ mbmi->segment_id = av1_vaq_segment_id(energy);
+ // Re-initialise quantiser
+ av1_init_plane_quantizers(cpi, x, mbmi->segment_id);
+ }
+ x->rdmult = set_segment_rdmult(cpi, x, mbmi->segment_id);
+ } else if (aq_mode == COMPLEXITY_AQ) {
+ x->rdmult = set_segment_rdmult(cpi, x, mbmi->segment_id);
+ } else if (aq_mode == CYCLIC_REFRESH_AQ) {
+ // If segment is boosted, use rdmult for that segment.
+ if (cyclic_refresh_segment_id_boosted(mbmi->segment_id))
+ x->rdmult = av1_cyclic_refresh_get_rdmult(cpi->cyclic_refresh);
+ }
+
+ // Find best coding mode & reconstruct the MB so it is available
+ // as a predictor for MBs that follow in the SB
+ if (frame_is_intra_only(cm)) {
+ av1_rd_pick_intra_mode_sb(cpi, x, rd_cost, bsize, ctx, best_rd);
+#if CONFIG_SUPERTX
+ *totalrate_nocoef = 0;
+#endif // CONFIG_SUPERTX
+ } else {
+ if (bsize >= BLOCK_8X8 || unify_bsize) {
+ if (segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
+ av1_rd_pick_inter_mode_sb_seg_skip(cpi, tile_data, x, mi_row, mi_col,
+ rd_cost, bsize, ctx, best_rd);
+#if CONFIG_SUPERTX
+ *totalrate_nocoef = rd_cost->rate;
+#endif // CONFIG_SUPERTX
+ } else {
+ av1_rd_pick_inter_mode_sb(cpi, tile_data, x, mi_row, mi_col, rd_cost,
+#if CONFIG_SUPERTX
+ totalrate_nocoef,
+#endif // CONFIG_SUPERTX
+ bsize, ctx, best_rd);
+#if CONFIG_SUPERTX
+ assert(*totalrate_nocoef >= 0);
+#endif // CONFIG_SUPERTX
+ }
+ } else {
+ if (segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
+ // The decoder rejects sub8x8 partitions when SEG_LVL_SKIP is set.
+ rd_cost->rate = INT_MAX;
+ } else {
+ av1_rd_pick_inter_mode_sub8x8(cpi, tile_data, x, mi_row, mi_col,
+ rd_cost,
+#if CONFIG_SUPERTX
+ totalrate_nocoef,
+#endif // CONFIG_SUPERTX
+ bsize, ctx, best_rd);
+#if CONFIG_SUPERTX
+ assert(*totalrate_nocoef >= 0);
+#endif // CONFIG_SUPERTX
+ }
+ }
+ }
+
+ // Examine the resulting rate and for AQ mode 2 make a segment choice.
+ if ((rd_cost->rate != INT_MAX) && (aq_mode == COMPLEXITY_AQ) &&
+ (bsize >= BLOCK_16X16) &&
+ (cm->frame_type == KEY_FRAME || cpi->refresh_alt_ref_frame ||
+ (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref))) {
+ av1_caq_select_segment(cpi, x, bsize, mi_row, mi_col, rd_cost->rate);
+ }
+ // Undo any AQ-specific rdmult change made above.
+ x->rdmult = orig_rdmult;
+
+ // TODO(jingning) The rate-distortion optimization flow needs to be
+ // refactored to provide proper exit/return handle.
+ if (rd_cost->rate == INT_MAX) rd_cost->rdcost = INT64_MAX;
+
+ ctx->rate = rd_cost->rate;
+ ctx->dist = rd_cost->dist;
+}
+
+#if CONFIG_REF_MV
+// Walks the NEWMV -> ZEROMV -> NEARESTMV/other decision chain for `mode` and
+// bumps the matching binary counter at each step, using the contexts packed
+// into `mode_context`.
+static void update_inter_mode_stats(FRAME_COUNTS *counts, PREDICTION_MODE mode,
+                                    int16_t mode_context) {
+  const int16_t newmv_ctx = mode_context & NEWMV_CTX_MASK;
+  int16_t zeromv_ctx;
+  int16_t refmv_ctx;
+
+  // Step 1: NEWMV or not.
+  if (mode == NEWMV) {
+    ++counts->newmv_mode[newmv_ctx][0];
+    return;
+  }
+  ++counts->newmv_mode[newmv_ctx][1];
+
+  // With the all-zero flag set no further counters are updated.
+  if (mode_context & (1 << ALL_ZERO_FLAG_OFFSET)) return;
+
+  // Step 2: ZEROMV or not.
+  zeromv_ctx = (mode_context >> ZEROMV_OFFSET) & ZEROMV_CTX_MASK;
+  if (mode == ZEROMV) {
+    ++counts->zeromv_mode[zeromv_ctx][0];
+    return;
+  }
+  ++counts->zeromv_mode[zeromv_ctx][1];
+
+  // Step 3: NEARESTMV or anything else. The SKIP_* flags override the
+  // context; later checks take precedence over earlier ones.
+  refmv_ctx = (mode_context >> REFMV_OFFSET) & REFMV_CTX_MASK;
+  if (mode_context & (1 << SKIP_NEARESTMV_OFFSET)) refmv_ctx = 6;
+  if (mode_context & (1 << SKIP_NEARMV_OFFSET)) refmv_ctx = 7;
+  if (mode_context & (1 << SKIP_NEARESTMV_SUB8X8_OFFSET)) refmv_ctx = 8;
+
+  ++counts->refmv_mode[refmv_ctx][mode != NEARESTMV];
+}
+#endif
+ // Updates the symbol counters in td->counts (delta-q/delta-lf, reference frames, motion modes and inter prediction modes) for the block currently referenced by td->mb.e_mbd.mi[0].
+static void update_stats(const AV1_COMMON *const cm, ThreadData *td, int mi_row,
+ int mi_col
+#if CONFIG_SUPERTX
+ ,
+ int supertx_enabled
+#endif
+ ) {
+#if CONFIG_DELTA_Q
+ MACROBLOCK *x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+#else
+ const MACROBLOCK *x = &td->mb;
+ const MACROBLOCKD *const xd = &x->e_mbd;
+#endif
+ const MODE_INFO *const mi = xd->mi[0];
+ const MB_MODE_INFO *const mbmi = &mi->mbmi;
+ const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
+ const BLOCK_SIZE bsize = mbmi->sb_type;
+ const int unify_bsize = CONFIG_CB4X4;
+
+#if CONFIG_DELTA_Q
+ // delta quant applies to both intra and inter
+ const int super_block_upper_left = ((mi_row & 7) == 0) && ((mi_col & 7) == 0);
+ // Delta-q symbols are counted only where mi_row and mi_col are both multiples of 8, and not for a skipped BLOCK_64X64.
+ if (cm->delta_q_present_flag && (bsize != BLOCK_64X64 || !mbmi->skip) &&
+ super_block_upper_left) {
+ const int dq = (mbmi->current_q_index - xd->prev_qindex) / cm->delta_q_res;
+ const int absdq = abs(dq);
+ int i;
+ for (i = 0; i < AOMMIN(absdq, DELTA_Q_SMALL); ++i) { // one [i][1] count per step of |dq|, capped at DELTA_Q_SMALL
+ td->counts->delta_q[i][1]++;
+ }
+ if (absdq < DELTA_Q_SMALL) td->counts->delta_q[absdq][0]++;
+ xd->prev_qindex = mbmi->current_q_index;
+#if CONFIG_EXT_DELTA_Q
+ if (cm->delta_lf_present_flag) {
+ const int dlf =
+ (mbmi->current_delta_lf_from_base - xd->prev_delta_lf_from_base) /
+ cm->delta_lf_res;
+ const int absdlf = abs(dlf);
+ for (i = 0; i < AOMMIN(absdlf, DELTA_LF_SMALL); ++i) {
+ td->counts->delta_lf[i][1]++;
+ }
+ if (absdlf < DELTA_LF_SMALL) td->counts->delta_lf[absdlf][0]++;
+ xd->prev_delta_lf_from_base = mbmi->current_delta_lf_from_base;
+ }
+#endif
+ }
+#else
+ (void)mi_row;
+ (void)mi_col;
+#endif
+ if (!frame_is_intra_only(cm)) { // everything below is gathered only for frames that are not intra-only
+ FRAME_COUNTS *const counts = td->counts;
+ const int inter_block = is_inter_block(mbmi);
+ const int seg_ref_active =
+ segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_REF_FRAME);
+ if (!seg_ref_active) {
+#if CONFIG_SUPERTX
+ if (!supertx_enabled)
+#endif
+ counts->intra_inter[av1_get_intra_inter_context(xd)][inter_block]++;
+ // If the segment reference feature is enabled we have only a single
+ // reference frame allowed for the segment so exclude it from
+ // the reference frame counts used to work out probabilities.
+ if (inter_block) {
+ const MV_REFERENCE_FRAME ref0 = mbmi->ref_frame[0];
+#if CONFIG_EXT_REFS
+ const MV_REFERENCE_FRAME ref1 = mbmi->ref_frame[1];
+#endif // CONFIG_EXT_REFS
+
+ if (cm->reference_mode == REFERENCE_MODE_SELECT) {
+#if !SUB8X8_COMP_REF
+ if (mbmi->sb_type >= BLOCK_8X8)
+ counts->comp_inter[av1_get_reference_mode_context(cm, xd)]
+ [has_second_ref(mbmi)]++;
+#else
+ counts->comp_inter[av1_get_reference_mode_context(cm, xd)]
+ [has_second_ref(mbmi)]++;
+#endif
+ }
+
+ if (has_second_ref(mbmi)) {
+#if CONFIG_EXT_REFS
+ const int bit = (ref0 == GOLDEN_FRAME || ref0 == LAST3_FRAME);
+
+ counts->comp_ref[av1_get_pred_context_comp_ref_p(cm, xd)][0][bit]++;
+ if (!bit) {
+ counts->comp_ref[av1_get_pred_context_comp_ref_p1(cm, xd)][1]
+ [ref0 == LAST_FRAME]++;
+ } else {
+ counts->comp_ref[av1_get_pred_context_comp_ref_p2(cm, xd)][2]
+ [ref0 == GOLDEN_FRAME]++;
+ }
+
+ counts->comp_bwdref[av1_get_pred_context_comp_bwdref_p(cm, xd)][0]
+ [ref1 == ALTREF_FRAME]++;
+#else
+ counts->comp_ref[av1_get_pred_context_comp_ref_p(cm, xd)][0]
+ [ref0 == GOLDEN_FRAME]++;
+#endif // CONFIG_EXT_REFS
+ } else {
+#if CONFIG_EXT_REFS
+ const int bit = (ref0 == ALTREF_FRAME || ref0 == BWDREF_FRAME);
+
+ counts->single_ref[av1_get_pred_context_single_ref_p1(xd)][0][bit]++;
+ if (bit) {
+ counts->single_ref[av1_get_pred_context_single_ref_p2(xd)][1]
+ [ref0 != BWDREF_FRAME]++;
+ } else {
+ const int bit1 = !(ref0 == LAST2_FRAME || ref0 == LAST_FRAME);
+ counts
+ ->single_ref[av1_get_pred_context_single_ref_p3(xd)][2][bit1]++;
+ if (!bit1) {
+ counts->single_ref[av1_get_pred_context_single_ref_p4(xd)][3]
+ [ref0 != LAST_FRAME]++;
+ } else {
+ counts->single_ref[av1_get_pred_context_single_ref_p5(xd)][4]
+ [ref0 != LAST3_FRAME]++;
+ }
+ }
+#else
+ counts->single_ref[av1_get_pred_context_single_ref_p1(xd)][0]
+ [ref0 != LAST_FRAME]++;
+ if (ref0 != LAST_FRAME) {
+ counts->single_ref[av1_get_pred_context_single_ref_p2(xd)][1]
+ [ref0 != GOLDEN_FRAME]++;
+ }
+#endif // CONFIG_EXT_REFS
+ }
+
+#if CONFIG_EXT_INTER
+ if (cm->reference_mode != COMPOUND_REFERENCE &&
+#if CONFIG_SUPERTX
+ !supertx_enabled &&
+#endif
+ is_interintra_allowed(mbmi)) {
+ const int bsize_group = size_group_lookup[bsize];
+ if (mbmi->ref_frame[1] == INTRA_FRAME) {
+ counts->interintra[bsize_group][1]++;
+ counts->interintra_mode[bsize_group][mbmi->interintra_mode]++;
+ if (is_interintra_wedge_used(bsize))
+ counts->wedge_interintra[bsize][mbmi->use_wedge_interintra]++;
+ } else {
+ counts->interintra[bsize_group][0]++;
+ }
+ }
+#endif // CONFIG_EXT_INTER
+
+#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+ const MOTION_MODE motion_allowed = motion_mode_allowed(
+#if CONFIG_GLOBAL_MOTION && SEPARATE_GLOBAL_MOTION
+ 0, xd->global_motion,
+#endif // CONFIG_GLOBAL_MOTION && SEPARATE_GLOBAL_MOTION
+ mi);
+#if CONFIG_SUPERTX
+ if (!supertx_enabled)
+#endif // CONFIG_SUPERTX
+#if CONFIG_EXT_INTER
+ if (mbmi->ref_frame[1] != INTRA_FRAME)
+#endif // CONFIG_EXT_INTER
+#if CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION
+ {
+ if (motion_allowed == WARPED_CAUSAL)
+ counts->motion_mode[mbmi->sb_type][mbmi->motion_mode]++;
+ else if (motion_allowed == OBMC_CAUSAL)
+ counts->obmc[mbmi->sb_type][mbmi->motion_mode == OBMC_CAUSAL]++;
+ }
+#else
+ if (motion_allowed > SIMPLE_TRANSLATION)
+ counts->motion_mode[mbmi->sb_type][mbmi->motion_mode]++;
+#endif // CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION
+#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+
+#if CONFIG_EXT_INTER
+ if (cm->reference_mode != SINGLE_REFERENCE &&
+ is_inter_compound_mode(mbmi->mode)
+#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+ && mbmi->motion_mode == SIMPLE_TRANSLATION
+#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+ ) {
+ counts->compound_interinter[bsize][mbmi->interinter_compound_type]++;
+ }
+#endif // CONFIG_EXT_INTER
+ }
+ }
+ // Inter prediction mode counts; not gathered when SEG_LVL_SKIP is active for the block's segment.
+ if (inter_block &&
+ !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
+ int16_t mode_ctx;
+#if !CONFIG_REF_MV
+ mode_ctx = mbmi_ext->mode_context[mbmi->ref_frame[0]];
+#endif
+ if (bsize >= BLOCK_8X8 || unify_bsize) {
+ const PREDICTION_MODE mode = mbmi->mode;
+#if CONFIG_REF_MV
+#if CONFIG_EXT_INTER
+ if (has_second_ref(mbmi)) {
+ mode_ctx = mbmi_ext->compound_mode_context[mbmi->ref_frame[0]];
+ ++counts->inter_compound_mode[mode_ctx][INTER_COMPOUND_OFFSET(mode)];
+ } else {
+#endif // CONFIG_EXT_INTER
+ mode_ctx = av1_mode_context_analyzer(mbmi_ext->mode_context,
+ mbmi->ref_frame, bsize, -1);
+ update_inter_mode_stats(counts, mode, mode_ctx);
+#if CONFIG_EXT_INTER
+ }
+#endif // CONFIG_EXT_INTER
+ // For NEWMV-type modes, count one drl_mode decision per candidate index (0, 1) until ref_mv_idx is reached.
+#if CONFIG_EXT_INTER
+ if (mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV) {
+#else
+ if (mbmi->mode == NEWMV) {
+#endif
+ uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
+ int idx;
+
+ for (idx = 0; idx < 2; ++idx) {
+ if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) {
+ uint8_t drl_ctx =
+ av1_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type], idx);
+ ++counts->drl_mode[drl_ctx][mbmi->ref_mv_idx != idx];
+
+ if (mbmi->ref_mv_idx == idx) break;
+ }
+ }
+ }
+ // Same for NEARMV-type modes, with the candidate index shifted by one (idx 1..2 is compared against ref_mv_idx + 1).
+#if CONFIG_EXT_INTER
+ if (have_nearmv_in_inter_mode(mbmi->mode)) {
+#else
+ if (mbmi->mode == NEARMV) {
+#endif
+ uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
+ int idx;
+
+ for (idx = 1; idx < 3; ++idx) {
+ if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) {
+ uint8_t drl_ctx =
+ av1_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type], idx);
+ ++counts->drl_mode[drl_ctx][mbmi->ref_mv_idx != idx - 1];
+
+ if (mbmi->ref_mv_idx == idx - 1) break;
+ }
+ }
+ }
+#else
+#if CONFIG_EXT_INTER
+ if (is_inter_compound_mode(mode))
+ ++counts->inter_compound_mode[mode_ctx][INTER_COMPOUND_OFFSET(mode)];
+ else
+#endif // CONFIG_EXT_INTER
+ ++counts->inter_mode[mode_ctx][INTER_OFFSET(mode)];
+#endif
+ } else { // sub-8x8 block without CB4X4: count the mode of each 4x4-unit sub-block from mi->bmi[]
+ const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize];
+ const int num_4x4_h = num_4x4_blocks_high_lookup[bsize];
+ int idx, idy;
+ for (idy = 0; idy < 2; idy += num_4x4_h) {
+ for (idx = 0; idx < 2; idx += num_4x4_w) {
+ const int j = idy * 2 + idx;
+ const PREDICTION_MODE b_mode = mi->bmi[j].as_mode;
+#if CONFIG_REF_MV
+#if CONFIG_EXT_INTER
+ if (has_second_ref(mbmi)) {
+ mode_ctx = mbmi_ext->compound_mode_context[mbmi->ref_frame[0]];
+ ++counts->inter_compound_mode[mode_ctx]
+ [INTER_COMPOUND_OFFSET(b_mode)];
+ } else {
+#endif // CONFIG_EXT_INTER
+ mode_ctx = av1_mode_context_analyzer(mbmi_ext->mode_context,
+ mbmi->ref_frame, bsize, j);
+ update_inter_mode_stats(counts, b_mode, mode_ctx);
+#if CONFIG_EXT_INTER
+ }
+#endif // CONFIG_EXT_INTER
+#else
+#if CONFIG_EXT_INTER
+ if (is_inter_compound_mode(b_mode))
+ ++counts->inter_compound_mode[mode_ctx]
+ [INTER_COMPOUND_OFFSET(b_mode)];
+ else
+#endif // CONFIG_EXT_INTER
+ ++counts->inter_mode[mode_ctx][INTER_OFFSET(b_mode)];
+#endif
+ }
+ }
+ }
+ }
+ }
+}
+ // Snapshot of the above/left entropy, partition and (under CONFIG_VAR_TX) txfm contexts around one block; filled by save_context() and written back by restore_context().
+typedef struct {
+ ENTROPY_CONTEXT a[2 * MAX_MIB_SIZE * MAX_MB_PLANE]; // copies of xd->above_context[p], one run per plane
+ ENTROPY_CONTEXT l[2 * MAX_MIB_SIZE * MAX_MB_PLANE]; // copies of xd->left_context[p], one run per plane
+ PARTITION_CONTEXT sa[MAX_MIB_SIZE]; // copy of xd->above_seg_context for the block's columns
+ PARTITION_CONTEXT sl[MAX_MIB_SIZE]; // copy of xd->left_seg_context for the block's rows
+#if CONFIG_VAR_TX
+ TXFM_CONTEXT *p_ta; // value of xd->above_txfm_context at save time
+ TXFM_CONTEXT *p_tl; // value of xd->left_txfm_context at save time
+ TXFM_CONTEXT ta[MAX_MIB_SIZE]; // contents read through xd->above_txfm_context
+ TXFM_CONTEXT tl[MAX_MIB_SIZE]; // contents read through xd->left_txfm_context
+#endif
+} RD_SEARCH_MACROBLOCK_CONTEXT;
+
+// Writes the contexts captured in `ctx` back into x->e_mbd for the bsize
+// block at (mi_row, mi_col). Mirror image of save_context().
+static void restore_context(MACROBLOCK *x,
+                            const RD_SEARCH_MACROBLOCK_CONTEXT *ctx, int mi_row,
+                            int mi_col,
+#if CONFIG_PVQ
+                            od_rollback_buffer *rdo_buf,
+#endif
+                            BLOCK_SIZE bsize) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const int cols_4x4 = block_size_wide[bsize] >> tx_size_wide_log2[0];
+  const int rows_4x4 = block_size_high[bsize] >> tx_size_high_log2[0];
+  const int cols_mi = mi_size_wide[bsize];
+  const int rows_mi = mi_size_high[bsize];
+  // Left-side contexts are indexed relative to the masked row.
+  const int row_in_sb = mi_row & MAX_MIB_MASK;
+  int plane;
+
+  // Entropy contexts: offsets and byte counts shrink with the plane's
+  // subsampling shift.
+  for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+    const int ss_x = xd->plane[plane].subsampling_x;
+    const int ss_y = xd->plane[plane].subsampling_y;
+
+    memcpy(xd->above_context[plane] + ((mi_col * 2) >> ss_x),
+           ctx->a + cols_4x4 * plane,
+           (sizeof(ENTROPY_CONTEXT) * cols_4x4) >> ss_x);
+    memcpy(xd->left_context[plane] + ((row_in_sb * 2) >> ss_y),
+           ctx->l + rows_4x4 * plane,
+           (sizeof(ENTROPY_CONTEXT) * rows_4x4) >> ss_y);
+  }
+
+  // Partition contexts.
+  memcpy(xd->above_seg_context + mi_col, ctx->sa,
+         sizeof(*xd->above_seg_context) * cols_mi);
+  memcpy(xd->left_seg_context + row_in_sb, ctx->sl,
+         sizeof(xd->left_seg_context[0]) * rows_mi);
+#if CONFIG_VAR_TX
+  // Reinstate the saved pointers first, then refill what they point at.
+  xd->above_txfm_context = ctx->p_ta;
+  xd->left_txfm_context = ctx->p_tl;
+  memcpy(xd->above_txfm_context, ctx->ta,
+         sizeof(*xd->above_txfm_context) * cols_mi);
+  memcpy(xd->left_txfm_context, ctx->tl,
+         sizeof(*xd->left_txfm_context) * rows_mi);
+#endif
+#if CONFIG_PVQ
+  od_encode_rollback(&x->daala_enc, rdo_buf);
+#endif
+}
+
+// Copies the above/left contexts surrounding the bsize block at
+// (mi_row, mi_col) from x->e_mbd into `ctx` so that restore_context() can put
+// them back later.
+static void save_context(const MACROBLOCK *x, RD_SEARCH_MACROBLOCK_CONTEXT *ctx,
+                         int mi_row, int mi_col,
+#if CONFIG_PVQ
+                         od_rollback_buffer *rdo_buf,
+#endif
+                         BLOCK_SIZE bsize) {
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const int cols_4x4 = block_size_wide[bsize] >> tx_size_wide_log2[0];
+  const int rows_4x4 = block_size_high[bsize] >> tx_size_high_log2[0];
+  const int cols_mi = mi_size_wide[bsize];
+  const int rows_mi = mi_size_high[bsize];
+  // Left-side contexts are indexed relative to the masked row.
+  const int row_in_sb = mi_row & MAX_MIB_MASK;
+  int plane;
+
+  // Entropy contexts: offsets and byte counts shrink with the plane's
+  // subsampling shift.
+  for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+    const int ss_x = xd->plane[plane].subsampling_x;
+    const int ss_y = xd->plane[plane].subsampling_y;
+
+    memcpy(ctx->a + cols_4x4 * plane,
+           xd->above_context[plane] + ((mi_col * 2) >> ss_x),
+           (sizeof(ENTROPY_CONTEXT) * cols_4x4) >> ss_x);
+    memcpy(ctx->l + rows_4x4 * plane,
+           xd->left_context[plane] + ((row_in_sb * 2) >> ss_y),
+           (sizeof(ENTROPY_CONTEXT) * rows_4x4) >> ss_y);
+  }
+
+  // Partition contexts.
+  memcpy(ctx->sa, xd->above_seg_context + mi_col,
+         sizeof(*xd->above_seg_context) * cols_mi);
+  memcpy(ctx->sl, xd->left_seg_context + row_in_sb,
+         sizeof(xd->left_seg_context[0]) * rows_mi);
+#if CONFIG_VAR_TX
+  // Keep both the pointed-to contents and the pointers themselves.
+  memcpy(ctx->ta, xd->above_txfm_context,
+         sizeof(*xd->above_txfm_context) * cols_mi);
+  memcpy(ctx->tl, xd->left_txfm_context,
+         sizeof(*xd->left_txfm_context) * rows_mi);
+  ctx->p_ta = xd->above_txfm_context;
+  ctx->p_tl = xd->left_txfm_context;
+#endif
+#if CONFIG_PVQ
+  od_encode_checkpoint(&x->daala_enc, rdo_buf);
+#endif
+}
+ // Encodes one bsize block at (mi_row, mi_col): calls update_state() with the decision held in ctx, then encode_superblock(), and on non-dry runs update_stats().
+static void encode_b(const AV1_COMP *const cpi, const TileInfo *const tile,
+ ThreadData *td, TOKENEXTRA **tp, int mi_row, int mi_col,
+ RUN_TYPE dry_run, BLOCK_SIZE bsize,
+#if CONFIG_EXT_PARTITION_TYPES
+ PARTITION_TYPE partition,
+#endif
+ PICK_MODE_CONTEXT *ctx, int *rate) { // rate is passed through to encode_superblock()
+ MACROBLOCK *const x = &td->mb;
+#if (CONFIG_MOTION_VAR && CONFIG_NCOBMC) | CONFIG_EXT_DELTA_Q
+ MACROBLOCKD *xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi;
+#if CONFIG_MOTION_VAR && CONFIG_NCOBMC
+ int check_ncobmc;
+#endif
+#endif
+
+ set_offsets(cpi, tile, x, mi_row, mi_col, bsize);
+#if CONFIG_EXT_PARTITION_TYPES
+ x->e_mbd.mi[0]->mbmi.partition = partition;
+#endif
+ update_state(cpi, td, ctx, mi_row, mi_col, bsize, dry_run);
+#if CONFIG_MOTION_VAR && CONFIG_NCOBMC
+ mbmi = &xd->mi[0]->mbmi;
+ const MOTION_MODE motion_allowed = motion_mode_allowed(
+#if CONFIG_GLOBAL_MOTION && SEPARATE_GLOBAL_MOTION
+ 0, xd->global_motion,
+#endif // CONFIG_GLOBAL_MOTION && SEPARATE_GLOBAL_MOTION
+ xd->mi[0]);
+ check_ncobmc = is_inter_block(mbmi) && motion_allowed >= OBMC_CAUSAL;
+ if (!dry_run && check_ncobmc) { // the NCOBMC check runs only on non-dry runs, for inter blocks where OBMC or higher is allowed
+ av1_check_ncobmc_rd(cpi, x, mi_row, mi_col);
+ av1_setup_dst_planes(x->e_mbd.plane, bsize,
+ get_frame_new_buffer(&cpi->common), mi_row, mi_col);
+ }
+#endif
+ encode_superblock(cpi, td, tp, dry_run, mi_row, mi_col, bsize, ctx, rate);
+ // Delta-lf bookkeeping and symbol statistics happen only on non-dry runs.
+ if (!dry_run) {
+#if CONFIG_EXT_DELTA_Q
+ mbmi = &xd->mi[0]->mbmi;
+ if (bsize == BLOCK_64X64 && mbmi->skip == 1 && is_inter_block(mbmi) &&
+ cpi->common.delta_lf_present_flag) {
+ mbmi->current_delta_lf_from_base = xd->prev_delta_lf_from_base; // a skipped inter BLOCK_64X64 takes over the previous delta-lf value
+ }
+#endif
+#if CONFIG_SUPERTX
+ update_stats(&cpi->common, td, mi_row, mi_col, 0);
+#else
+ update_stats(&cpi->common, td, mi_row, mi_col);
+#endif
+ }
+}
+ // Recursively encodes the partition tree pc_tree for the bsize block at (mi_row, mi_col): PARTITION_SPLIT recurses into encode_sb(), every other partition type calls encode_b() per sub-block; the partition context is updated at the end.
+static void encode_sb(const AV1_COMP *const cpi, ThreadData *td,
+ const TileInfo *const tile, TOKENEXTRA **tp, int mi_row,
+ int mi_col, RUN_TYPE dry_run, BLOCK_SIZE bsize,
+ PC_TREE *pc_tree, int *rate) {
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const int hbs = mi_size_wide[bsize] / 2; // half the block width in mi units; offset of the second row/column of sub-blocks
+ const int is_partition_root = bsize >= BLOCK_8X8;
+ const int ctx = is_partition_root
+ ? partition_plane_context(xd, mi_row, mi_col,
+#if CONFIG_UNPOISON_PARTITION_CTX
+ mi_row + hbs < cm->mi_rows,
+ mi_col + hbs < cm->mi_cols,
+#endif
+ bsize)
+ : -1;
+ const PARTITION_TYPE partition = pc_tree->partitioning;
+ const BLOCK_SIZE subsize = get_subsize(bsize, partition);
+#if CONFIG_EXT_PARTITION_TYPES
+ const BLOCK_SIZE bsize2 = get_subsize(bsize, PARTITION_SPLIT);
+#endif
+
+#if CONFIG_CB4X4
+ const int unify_bsize = 1;
+#else
+ const int unify_bsize = 0;
+ assert(bsize >= BLOCK_8X8);
+#endif
+ // Nothing to do for a block whose origin is at or beyond cm->mi_rows / cm->mi_cols.
+ if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
+ // Count the partition type on non-dry runs when a partition context was computed (ctx is -1 below BLOCK_8X8).
+ if (!dry_run && ctx >= 0) td->counts->partition[ctx][partition]++;
+
+#if CONFIG_SUPERTX
+ if (!frame_is_intra_only(cm) && bsize <= MAX_SUPERTX_BLOCK_SIZE &&
+ partition != PARTITION_NONE && !xd->lossless[0]) {
+ int supertx_enabled;
+ TX_SIZE supertx_size = max_txsize_lookup[bsize];
+ supertx_enabled = check_supertx_sb(bsize, supertx_size, pc_tree);
+ if (supertx_enabled) {
+ const int mi_width = mi_size_wide[bsize];
+ const int mi_height = mi_size_high[bsize];
+ int x_idx, y_idx, i;
+ uint8_t *dst_buf[3];
+ int dst_stride[3];
+ set_skip_context(xd, mi_row, mi_col);
+ set_mode_info_offsets(cpi, x, xd, mi_row, mi_col);
+ update_state_sb_supertx(cpi, td, tile, mi_row, mi_col, bsize, dry_run,
+ pc_tree);
+
+ av1_setup_dst_planes(xd->plane, bsize, get_frame_new_buffer(cm), mi_row,
+ mi_col);
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ dst_buf[i] = xd->plane[i].dst.buf;
+ dst_stride[i] = xd->plane[i].dst.stride;
+ }
+ predict_sb_complex(cpi, td, tile, mi_row, mi_col, mi_row, mi_col, dry_run,
+ bsize, bsize, dst_buf, dst_stride, pc_tree);
+
+ set_offsets_without_segment_id(cpi, tile, x, mi_row, mi_col, bsize);
+ set_segment_id_supertx(cpi, x, mi_row, mi_col, bsize);
+
+ if (!x->skip) {
+ int this_rate = 0;
+ av1_encode_sb_supertx((AV1_COMMON *)cm, x, bsize);
+ av1_tokenize_sb_supertx(cpi, td, tp, dry_run, bsize, rate);
+ if (rate) *rate += this_rate; // NOTE(review): this_rate is never modified after its initialisation, so this adds 0
+ } else {
+ xd->mi[0]->mbmi.skip = 1;
+ if (!dry_run) td->counts->skip[av1_get_skip_context(xd)][1]++;
+ reset_skip_context(xd, bsize);
+ }
+ if (!dry_run) {
+ for (y_idx = 0; y_idx < mi_height; y_idx++) // copy the skip flag of mi[0] to the other mi entries of the block, bounded by mb_to_right_edge / mb_to_bottom_edge
+ for (x_idx = 0; x_idx < mi_width; x_idx++) {
+ if ((xd->mb_to_right_edge >> (3 + MI_SIZE_LOG2)) + mi_width >
+ x_idx &&
+ (xd->mb_to_bottom_edge >> (3 + MI_SIZE_LOG2)) + mi_height >
+ y_idx) {
+ xd->mi[x_idx + y_idx * cm->mi_stride]->mbmi.skip =
+ xd->mi[0]->mbmi.skip;
+ }
+ }
+ td->counts->supertx[partition_supertx_context_lookup[partition]]
+ [supertx_size][1]++;
+ td->counts->supertx_size[supertx_size]++;
+#if CONFIG_EXT_TX
+ if (get_ext_tx_types(supertx_size, bsize, 1, cm->reduced_tx_set_used) >
+ 1 &&
+ !xd->mi[0]->mbmi.skip) {
+ const int eset =
+ get_ext_tx_set(supertx_size, bsize, 1, cm->reduced_tx_set_used);
+ if (eset > 0) {
+ ++td->counts
+ ->inter_ext_tx[eset][supertx_size][xd->mi[0]->mbmi.tx_type];
+ }
+ }
+#else
+ if (supertx_size < TX_32X32 && !xd->mi[0]->mbmi.skip) {
+ ++td->counts->inter_ext_tx[supertx_size][xd->mi[0]->mbmi.tx_type];
+ }
+#endif // CONFIG_EXT_TX
+ }
+#if CONFIG_EXT_PARTITION_TYPES
+ update_ext_partition_context(xd, mi_row, mi_col, subsize, bsize,
+ partition);
+#else
+ if (partition != PARTITION_SPLIT || bsize == BLOCK_8X8)
+ update_partition_context(xd, mi_row, mi_col, subsize, bsize);
+#endif
+#if CONFIG_VAR_TX
+ set_txfm_ctxs(supertx_size, mi_width, mi_height, xd->mi[0]->mbmi.skip,
+ xd);
+#endif // CONFIG_VAR_TX
+ return; // the supertx path handled the whole block; the per-partition switch below is skipped
+ } else {
+ if (!dry_run) {
+ td->counts->supertx[partition_supertx_context_lookup[partition]]
+ [supertx_size][0]++;
+ }
+ }
+ }
+#endif // CONFIG_SUPERTX
+ // Encode each sub-block of the chosen partition; second halves of VERT/HORZ are skipped when they start outside cm->mi_cols / cm->mi_rows.
+ switch (partition) {
+ case PARTITION_NONE:
+ encode_b(cpi, tile, td, tp, mi_row, mi_col, dry_run, subsize,
+#if CONFIG_EXT_PARTITION_TYPES
+ partition,
+#endif
+ &pc_tree->none, rate);
+ break;
+ case PARTITION_VERT:
+ encode_b(cpi, tile, td, tp, mi_row, mi_col, dry_run, subsize,
+#if CONFIG_EXT_PARTITION_TYPES
+ partition,
+#endif
+ &pc_tree->vertical[0], rate);
+ if (mi_col + hbs < cm->mi_cols && (bsize > BLOCK_8X8 || unify_bsize)) {
+ encode_b(cpi, tile, td, tp, mi_row, mi_col + hbs, dry_run, subsize,
+#if CONFIG_EXT_PARTITION_TYPES
+ partition,
+#endif
+ &pc_tree->vertical[1], rate);
+ }
+ break;
+ case PARTITION_HORZ:
+ encode_b(cpi, tile, td, tp, mi_row, mi_col, dry_run, subsize,
+#if CONFIG_EXT_PARTITION_TYPES
+ partition,
+#endif
+ &pc_tree->horizontal[0], rate);
+ if (mi_row + hbs < cm->mi_rows && (bsize > BLOCK_8X8 || unify_bsize)) {
+ encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col, dry_run, subsize,
+#if CONFIG_EXT_PARTITION_TYPES
+ partition,
+#endif
+ &pc_tree->horizontal[1], rate);
+ }
+ break;
+ case PARTITION_SPLIT:
+ if (bsize == BLOCK_8X8 && !unify_bsize) { // without CB4X4 an 8x8 split is a single leaf; otherwise recurse into the four quadrants
+ encode_b(cpi, tile, td, tp, mi_row, mi_col, dry_run, subsize,
+#if CONFIG_EXT_PARTITION_TYPES
+ partition,
+#endif
+ pc_tree->leaf_split[0], rate);
+ } else {
+ encode_sb(cpi, td, tile, tp, mi_row, mi_col, dry_run, subsize,
+ pc_tree->split[0], rate);
+ encode_sb(cpi, td, tile, tp, mi_row, mi_col + hbs, dry_run, subsize,
+ pc_tree->split[1], rate);
+ encode_sb(cpi, td, tile, tp, mi_row + hbs, mi_col, dry_run, subsize,
+ pc_tree->split[2], rate);
+ encode_sb(cpi, td, tile, tp, mi_row + hbs, mi_col + hbs, dry_run,
+ subsize, pc_tree->split[3], rate);
+ }
+ break;
+#if CONFIG_EXT_PARTITION_TYPES
+ case PARTITION_HORZ_A:
+ encode_b(cpi, tile, td, tp, mi_row, mi_col, dry_run, bsize2, partition,
+ &pc_tree->horizontala[0], rate);
+ encode_b(cpi, tile, td, tp, mi_row, mi_col + hbs, dry_run, bsize2,
+ partition, &pc_tree->horizontala[1], rate);
+ encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col, dry_run, subsize,
+ partition, &pc_tree->horizontala[2], rate);
+ break;
+ case PARTITION_HORZ_B:
+ encode_b(cpi, tile, td, tp, mi_row, mi_col, dry_run, subsize, partition,
+ &pc_tree->horizontalb[0], rate);
+ encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col, dry_run, bsize2,
+ partition, &pc_tree->horizontalb[1], rate);
+ encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col + hbs, dry_run, bsize2,
+ partition, &pc_tree->horizontalb[2], rate);
+ break;
+ case PARTITION_VERT_A:
+ encode_b(cpi, tile, td, tp, mi_row, mi_col, dry_run, bsize2, partition,
+ &pc_tree->verticala[0], rate);
+ encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col, dry_run, bsize2,
+ partition, &pc_tree->verticala[1], rate);
+ encode_b(cpi, tile, td, tp, mi_row, mi_col + hbs, dry_run, subsize,
+ partition, &pc_tree->verticala[2], rate);
+
+ break;
+ case PARTITION_VERT_B:
+ encode_b(cpi, tile, td, tp, mi_row, mi_col, dry_run, subsize, partition,
+ &pc_tree->verticalb[0], rate);
+ encode_b(cpi, tile, td, tp, mi_row, mi_col + hbs, dry_run, bsize2,
+ partition, &pc_tree->verticalb[1], rate);
+ encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col + hbs, dry_run, bsize2,
+ partition, &pc_tree->verticalb[2], rate);
+ break;
+#endif // CONFIG_EXT_PARTITION_TYPES
+ default: assert(0 && "Invalid partition type."); break;
+ }
+
+#if CONFIG_EXT_PARTITION_TYPES
+ update_ext_partition_context(xd, mi_row, mi_col, subsize, bsize, partition);
+#else
+ if (partition != PARTITION_SPLIT || bsize == BLOCK_8X8)
+ update_partition_context(xd, mi_row, mi_col, subsize, bsize);
+#endif // CONFIG_EXT_PARTITION_TYPES
+}
+
+// Check to see if the given partition size is allowed for a specified number
+// of mi block rows and columns remaining in the image.
+// If not then return the largest allowed partition size
+static BLOCK_SIZE find_partition_size(BLOCK_SIZE bsize, int rows_left,
+                                      int cols_left, int *bh, int *bw) {
+  // Nothing left in at least one direction: clamp to at most BLOCK_8X8 and
+  // leave *bh / *bw untouched.
+  if (rows_left <= 0 || cols_left <= 0) return AOMMIN(bsize, BLOCK_8X8);
+
+  // Walk down the BLOCK_SIZE enum three entries at a time (presumably from
+  // one square size to the next smaller one -- confirm against the enum)
+  // until the block fits. *bh / *bw always hold the dimensions of the last
+  // size that was examined.
+  while (bsize > 0) {
+    *bh = mi_size_high[bsize];
+    *bw = mi_size_wide[bsize];
+    if (*bh <= rows_left && *bw <= cols_left) break;
+    bsize -= 3;
+  }
+  return bsize;
+}
+
+// Fills `mib` for a superblock that is only partly available: each visited
+// entry is pointed at the matching element of `mi` and given the block size
+// returned by find_partition_size() for the rows/columns still remaining.
+static void set_partial_sb_partition(const AV1_COMMON *const cm, MODE_INFO *mi,
+                                     int bh_in, int bw_in,
+                                     int mi_rows_remaining,
+                                     int mi_cols_remaining, BLOCK_SIZE bsize,
+                                     MODE_INFO **mib) {
+  // Both strides are rewritten by find_partition_size() through the pointers
+  // passed below, so the step taken after each entry depends on the size that
+  // was just chosen.
+  int row_step = bh_in;
+  int row = 0;
+
+  while (row < cm->mib_size) {
+    int col_step = bw_in;
+    int col = 0;
+
+    while (col < cm->mib_size) {
+      const int offset = row * cm->mi_stride + col;
+      MODE_INFO *const cur = mi + offset;
+
+      mib[offset] = cur;
+      cur->mbmi.sb_type =
+          find_partition_size(bsize, mi_rows_remaining - row,
+                              mi_cols_remaining - col, &row_step, &col_step);
+      col += col_step;
+    }
+    row += row_step;
+  }
+}
+
+// This function attempts to set all mode info entries in a given superblock
+// to the same block partition size.
+// However, at the bottom and right borders of the image the requested size
+// may not be allowed in which case this code attempts to choose the largest
+// allowable partition.
+static void set_fixed_partitioning(AV1_COMP *cpi, const TileInfo *const tile,
+                                   MODE_INFO **mib, int mi_row, int mi_col,
+                                   BLOCK_SIZE bsize) {
+  AV1_COMMON *const cm = &cpi->common;
+  const int mi_rows_remaining = tile->mi_row_end - mi_row;
+  const int mi_cols_remaining = tile->mi_col_end - mi_col;
+  MODE_INFO *const sb_origin = cm->mi + mi_row * cm->mi_stride + mi_col;
+  const int step_rows = mi_size_high[bsize];
+  const int step_cols = mi_size_wide[bsize];
+  int r, c;
+
+  assert((mi_rows_remaining > 0) && (mi_cols_remaining > 0));
+
+  // Partial superblock: fewer than mib_size rows or columns remain in the
+  // tile, so let the helper pick sizes that fit.
+  if (mi_cols_remaining < cm->mib_size || mi_rows_remaining < cm->mib_size) {
+    set_partial_sb_partition(cm, sb_origin, step_rows, step_cols,
+                             mi_rows_remaining, mi_cols_remaining, bsize, mib);
+    return;
+  }
+
+  // Whole superblock available: stamp the requested size on a regular grid.
+  for (r = 0; r < cm->mib_size; r += step_rows) {
+    for (c = 0; c < cm->mib_size; c += step_cols) {
+      const int offset = r * cm->mi_stride + c;
+      mib[offset] = sb_origin + offset;
+      mib[offset]->mbmi.sb_type = bsize;
+    }
+  }
+}
+ /* Costs the block at (mi_row, mi_col) of size bsize using the partition type
+  * that get_partition() reports for it, recursing into the four quadrants for
+  * PARTITION_SPLIT.
+  *
+  * When cpi->sf.partition_search_type == SEARCH_PARTITION and
+  * cpi->sf.adjust_partitioning_from_last_frame is set, up to two alternatives
+  * are costed as well: the whole block as PARTITION_NONE, and a one-level
+  * split whose four quadrants are each coded as PARTITION_NONE. Whichever
+  * candidate has the lowest rdcost is left in pc_tree->partitioning.
+  *
+  * mib      - mode-info grid entry for (mi_row, mi_col); mib[0] supplies the
+  *            sb_type saved on entry.
+  * rate,
+  * dist     - receive the chosen candidate's rate and distortion (likewise
+  *            rate_nocoef under CONFIG_SUPERTX). They are NOT written when
+  *            the block origin lies outside the frame.
+  * do_recon - when nonzero, encode_sb() is run on the chosen partitioning:
+  *            OUTPUT_ENABLED if bsize == cm->sb_size, DRY_RUN_NORMAL otherwise.
+  */
+static void rd_use_partition(AV1_COMP *cpi, ThreadData *td,
+ TileDataEnc *tile_data, MODE_INFO **mib,
+ TOKENEXTRA **tp, int mi_row, int mi_col,
+ BLOCK_SIZE bsize, int *rate, int64_t *dist,
+#if CONFIG_SUPERTX
+ int *rate_nocoef,
+#endif
+ int do_recon, PC_TREE *pc_tree) {
+ AV1_COMMON *const cm = &cpi->common;
+ TileInfo *const tile_info = &tile_data->tile_info;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const int bs = mi_size_wide[bsize];
+ const int hbs = bs / 2;  // half the block width; added to mi_row / mi_col below
+ int i;
+ const int pl = (bsize >= BLOCK_8X8)
+ ? partition_plane_context(xd, mi_row, mi_col,
+#if CONFIG_UNPOISON_PARTITION_CTX
+ mi_row + hbs < cm->mi_rows,
+ mi_col + hbs < cm->mi_cols,
+#endif
+ bsize)
+ : 0;
+ const PARTITION_TYPE partition =
+ (bsize >= BLOCK_8X8) ? get_partition(cm, mi_row, mi_col, bsize)
+ : PARTITION_NONE;
+ const BLOCK_SIZE subsize = get_subsize(bsize, partition);
+ RD_SEARCH_MACROBLOCK_CONTEXT x_ctx;
+ RD_STATS last_part_rdc, none_rdc, chosen_rdc;
+ BLOCK_SIZE sub_subsize = BLOCK_4X4;
+ int splits_below = 0;
+ BLOCK_SIZE bs_type = mib[0]->mbmi.sb_type;  // sb_type on entry; written back after the PARTITION_NONE trial
+ int do_partition_search = 1;  // never cleared in this function
+ PICK_MODE_CONTEXT *ctx_none = &pc_tree->none;
+ const int unify_bsize = CONFIG_CB4X4;  // nonzero disables the 8x8 leaf_split special case below
+#if CONFIG_SUPERTX
+ int last_part_rate_nocoef = INT_MAX;
+ int none_rate_nocoef = INT_MAX;
+ int chosen_rate_nocoef = INT_MAX;
+#endif
+#if CONFIG_PVQ
+ od_rollback_buffer pre_rdo_buf;
+#endif
+ if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;  // origin outside the frame: outputs stay unwritten
+
+ assert(num_4x4_blocks_wide_lookup[bsize] ==
+ num_4x4_blocks_high_lookup[bsize]);
+
+ av1_invalid_rd_stats(&last_part_rdc);
+ av1_invalid_rd_stats(&none_rdc);
+ av1_invalid_rd_stats(&chosen_rdc);
+
+ pc_tree->partitioning = partition;
+
+#if CONFIG_VAR_TX
+ xd->above_txfm_context = cm->above_txfm_context + mi_col;
+ xd->left_txfm_context =
+ xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+#endif
+#if !CONFIG_PVQ
+ save_context(x, &x_ctx, mi_row, mi_col, bsize);
+#else
+ save_context(x, &x_ctx, mi_row, mi_col, &pre_rdo_buf, bsize);
+#endif
+
+ if (bsize == BLOCK_16X16 && cpi->vaq_refresh) {
+ set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
+ x->mb_energy = av1_block_energy(cpi, x, bsize);
+ }
+
+ if (do_partition_search &&
+ cpi->sf.partition_search_type == SEARCH_PARTITION &&
+ cpi->sf.adjust_partitioning_from_last_frame) {
+ // Check if any of the sub blocks are further split.
+ if (partition == PARTITION_SPLIT && subsize > BLOCK_8X8) {
+ sub_subsize = get_subsize(subsize, PARTITION_SPLIT);
+ splits_below = 1;
+ for (i = 0; i < 4; i++) {
+ int jj = i >> 1, ii = i & 0x01;
+ MODE_INFO *this_mi = mib[jj * hbs * cm->mi_stride + ii * hbs];
+ if (this_mi && this_mi->mbmi.sb_type >= sub_subsize) {
+ splits_below = 0;  // this quadrant is at least sub_subsize, i.e. not split further
+ }
+ }
+ }
+
+ // If partition is not none try none unless each of the 4 splits are split
+ // even further..
+ if (partition != PARTITION_NONE && !splits_below &&
+ mi_row + hbs < cm->mi_rows && mi_col + hbs < cm->mi_cols) {
+ pc_tree->partitioning = PARTITION_NONE;
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &none_rdc,
+#if CONFIG_SUPERTX
+ &none_rate_nocoef,
+#endif
+#if CONFIG_EXT_PARTITION_TYPES
+ PARTITION_NONE,
+#endif
+ bsize, ctx_none, INT64_MAX);
+
+ if (none_rdc.rate < INT_MAX) {
+ none_rdc.rate += cpi->partition_cost[pl][PARTITION_NONE];
+ none_rdc.rdcost =
+ RDCOST(x->rdmult, x->rddiv, none_rdc.rate, none_rdc.dist);
+#if CONFIG_SUPERTX
+ none_rate_nocoef += cpi->partition_cost[pl][PARTITION_NONE];
+#endif
+ }
+
+#if !CONFIG_PVQ
+ restore_context(x, &x_ctx, mi_row, mi_col, bsize);
+#else
+ restore_context(x, &x_ctx, mi_row, mi_col, &pre_rdo_buf, bsize);
+#endif
+ mib[0]->mbmi.sb_type = bs_type;  // put back the sb_type saved on entry
+ pc_tree->partitioning = partition;
+ }
+ }
+
+ switch (partition) {
+ case PARTITION_NONE:
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
+#if CONFIG_SUPERTX
+ &last_part_rate_nocoef,
+#endif
+#if CONFIG_EXT_PARTITION_TYPES
+ PARTITION_NONE,
+#endif
+ bsize, ctx_none, INT64_MAX);
+ break;
+ case PARTITION_HORZ:
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
+#if CONFIG_SUPERTX
+ &last_part_rate_nocoef,
+#endif
+#if CONFIG_EXT_PARTITION_TYPES
+ PARTITION_HORZ,
+#endif
+ subsize, &pc_tree->horizontal[0], INT64_MAX);
+ if (last_part_rdc.rate != INT_MAX && bsize >= BLOCK_8X8 &&
+ mi_row + hbs < cm->mi_rows) {
+ RD_STATS tmp_rdc;
+#if CONFIG_SUPERTX
+ int rt_nocoef = 0;
+#endif
+ PICK_MODE_CONTEXT *ctx_h = &pc_tree->horizontal[0];
+ av1_init_rd_stats(&tmp_rdc);
+ update_state(cpi, td, ctx_h, mi_row, mi_col, subsize, 1);  // dry-run code the top half before searching the bottom half
+ encode_superblock(cpi, td, tp, DRY_RUN_NORMAL, mi_row, mi_col, subsize,
+ ctx_h, NULL);
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row + hbs, mi_col, &tmp_rdc,
+#if CONFIG_SUPERTX
+ &rt_nocoef,
+#endif
+#if CONFIG_EXT_PARTITION_TYPES
+ PARTITION_HORZ,
+#endif
+ subsize, &pc_tree->horizontal[1], INT64_MAX);
+ if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
+ av1_invalid_rd_stats(&last_part_rdc);
+#if CONFIG_SUPERTX
+ last_part_rate_nocoef = INT_MAX;
+#endif
+ break;
+ }
+ last_part_rdc.rate += tmp_rdc.rate;
+ last_part_rdc.dist += tmp_rdc.dist;
+ last_part_rdc.rdcost += tmp_rdc.rdcost;
+#if CONFIG_SUPERTX
+ last_part_rate_nocoef += rt_nocoef;
+#endif
+ }
+ break;
+ case PARTITION_VERT:
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
+#if CONFIG_SUPERTX
+ &last_part_rate_nocoef,
+#endif
+#if CONFIG_EXT_PARTITION_TYPES
+ PARTITION_VERT,
+#endif
+ subsize, &pc_tree->vertical[0], INT64_MAX);
+ if (last_part_rdc.rate != INT_MAX && bsize >= BLOCK_8X8 &&
+ mi_col + hbs < cm->mi_cols) {
+ RD_STATS tmp_rdc;
+#if CONFIG_SUPERTX
+ int rt_nocoef = 0;
+#endif
+ PICK_MODE_CONTEXT *ctx_v = &pc_tree->vertical[0];
+ av1_init_rd_stats(&tmp_rdc);
+ update_state(cpi, td, ctx_v, mi_row, mi_col, subsize, 1);  // dry-run code the left half before searching the right half
+ encode_superblock(cpi, td, tp, DRY_RUN_NORMAL, mi_row, mi_col, subsize,
+ ctx_v, NULL);
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + hbs, &tmp_rdc,
+#if CONFIG_SUPERTX
+ &rt_nocoef,
+#endif
+#if CONFIG_EXT_PARTITION_TYPES
+ PARTITION_VERT,
+#endif
+ subsize, &pc_tree->vertical[bsize > BLOCK_8X8],  /* index 1 unless bsize == BLOCK_8X8, then 0.
+ * NOTE(review): the PARTITION_HORZ case uses horizontal[1] unconditionally -- confirm the asymmetry. */
+ INT64_MAX);
+ if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
+ av1_invalid_rd_stats(&last_part_rdc);
+#if CONFIG_SUPERTX
+ last_part_rate_nocoef = INT_MAX;
+#endif
+ break;
+ }
+ last_part_rdc.rate += tmp_rdc.rate;
+ last_part_rdc.dist += tmp_rdc.dist;
+ last_part_rdc.rdcost += tmp_rdc.rdcost;
+#if CONFIG_SUPERTX
+ last_part_rate_nocoef += rt_nocoef;
+#endif
+ }
+ break;
+ case PARTITION_SPLIT:
+ if (bsize == BLOCK_8X8 && !unify_bsize) {
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
+#if CONFIG_SUPERTX
+ &last_part_rate_nocoef,
+#endif
+#if CONFIG_EXT_PARTITION_TYPES
+ PARTITION_SPLIT,
+#endif
+ subsize, pc_tree->leaf_split[0], INT64_MAX);
+ break;
+ }
+ last_part_rdc.rate = 0;
+ last_part_rdc.dist = 0;
+ last_part_rdc.rdcost = 0;
+#if CONFIG_SUPERTX
+ last_part_rate_nocoef = 0;
+#endif
+ for (i = 0; i < 4; i++) {
+ int x_idx = (i & 1) * hbs;
+ int y_idx = (i >> 1) * hbs;
+ int jj = i >> 1, ii = i & 0x01;
+ RD_STATS tmp_rdc;
+#if CONFIG_SUPERTX
+ int rt_nocoef;
+#endif
+ if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols))
+ continue;
+
+ av1_init_rd_stats(&tmp_rdc);
+ rd_use_partition(cpi, td, tile_data,
+ mib + jj * hbs * cm->mi_stride + ii * hbs, tp,
+ mi_row + y_idx, mi_col + x_idx, subsize, &tmp_rdc.rate,
+ &tmp_rdc.dist,
+#if CONFIG_SUPERTX
+ &rt_nocoef,
+#endif
+ i != 3, pc_tree->split[i]);  // do_recon is set for every quadrant except the last
+ if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
+ av1_invalid_rd_stats(&last_part_rdc);
+#if CONFIG_SUPERTX
+ last_part_rate_nocoef = INT_MAX;
+#endif
+ break;
+ }
+ last_part_rdc.rate += tmp_rdc.rate;
+ last_part_rdc.dist += tmp_rdc.dist;  // rdcost is not summed here; it is recomputed from rate and dist after the switch
+#if CONFIG_SUPERTX
+ last_part_rate_nocoef += rt_nocoef;
+#endif
+ }
+ break;
+#if CONFIG_EXT_PARTITION_TYPES
+ case PARTITION_VERT_A:
+ case PARTITION_VERT_B:
+ case PARTITION_HORZ_A:
+ case PARTITION_HORZ_B: assert(0 && "Cannot handle extended partiton types");
+#endif // CONFIG_EXT_PARTITION_TYPES
+ default: assert(0); break;  // the extended-partition cases above fall through to here
+ }
+ // Add the partition-type cost and derive rdcost from the accumulated rate and dist.
+ if (last_part_rdc.rate < INT_MAX) {
+ last_part_rdc.rate += cpi->partition_cost[pl][partition];
+ last_part_rdc.rdcost =
+ RDCOST(x->rdmult, x->rddiv, last_part_rdc.rate, last_part_rdc.dist);
+#if CONFIG_SUPERTX
+ last_part_rate_nocoef += cpi->partition_cost[pl][partition];
+#endif
+ }
+ /* Alternative candidate: when the current partition is not already a split,
+  * cost a one-level split in which each of the four quadrants is coded as
+  * PARTITION_NONE. The result accumulates in chosen_rdc. */
+ if (do_partition_search && cpi->sf.adjust_partitioning_from_last_frame &&
+ cpi->sf.partition_search_type == SEARCH_PARTITION &&
+ partition != PARTITION_SPLIT && bsize > BLOCK_8X8 &&
+ (mi_row + bs < cm->mi_rows || mi_row + hbs == cm->mi_rows) &&
+ (mi_col + bs < cm->mi_cols || mi_col + hbs == cm->mi_cols)) {
+ BLOCK_SIZE split_subsize = get_subsize(bsize, PARTITION_SPLIT);
+ chosen_rdc.rate = 0;
+ chosen_rdc.dist = 0;
+#if CONFIG_SUPERTX
+ chosen_rate_nocoef = 0;
+#endif
+#if !CONFIG_PVQ
+ restore_context(x, &x_ctx, mi_row, mi_col, bsize);
+#else
+ restore_context(x, &x_ctx, mi_row, mi_col, &pre_rdo_buf, bsize);
+#endif
+ pc_tree->partitioning = PARTITION_SPLIT;
+
+ // Split partition.
+ for (i = 0; i < 4; i++) {
+ int x_idx = (i & 1) * hbs;
+ int y_idx = (i >> 1) * hbs;
+ RD_STATS tmp_rdc;
+#if CONFIG_SUPERTX
+ int rt_nocoef = 0;
+#endif
+#if CONFIG_PVQ
+ od_rollback_buffer buf;
+#endif
+ if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols))
+ continue;
+
+#if !CONFIG_PVQ
+ save_context(x, &x_ctx, mi_row, mi_col, bsize);  // NOTE(review): re-saves into x_ctx every iteration, replacing the snapshot taken on entry -- confirm intended
+#else
+ save_context(x, &x_ctx, mi_row, mi_col, &buf, bsize);
+#endif
+ pc_tree->split[i]->partitioning = PARTITION_NONE;
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row + y_idx, mi_col + x_idx,
+ &tmp_rdc,
+#if CONFIG_SUPERTX
+ &rt_nocoef,
+#endif
+#if CONFIG_EXT_PARTITION_TYPES
+ PARTITION_SPLIT,
+#endif
+ split_subsize, &pc_tree->split[i]->none, INT64_MAX);
+
+#if !CONFIG_PVQ
+ restore_context(x, &x_ctx, mi_row, mi_col, bsize);
+#else
+ restore_context(x, &x_ctx, mi_row, mi_col, &buf, bsize);
+#endif
+ if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
+ av1_invalid_rd_stats(&chosen_rdc);
+#if CONFIG_SUPERTX
+ chosen_rate_nocoef = INT_MAX;
+#endif
+ break;
+ }
+
+ chosen_rdc.rate += tmp_rdc.rate;
+ chosen_rdc.dist += tmp_rdc.dist;
+#if CONFIG_SUPERTX
+ chosen_rate_nocoef += rt_nocoef;
+#endif
+
+ if (i != 3)
+ encode_sb(cpi, td, tile_info, tp, mi_row + y_idx, mi_col + x_idx,
+ OUTPUT_ENABLED, split_subsize, pc_tree->split[i], NULL);  /* NOTE(review): OUTPUT_ENABLED inside a trial
+ * loop, whereas the other trial encodes in this function use DRY_RUN_NORMAL -- confirm. */
+
+ chosen_rdc.rate += cpi->partition_cost[pl][PARTITION_NONE];
+#if CONFIG_SUPERTX
+ chosen_rate_nocoef += cpi->partition_cost[pl][PARTITION_SPLIT];  /* NOTE(review): rate adds the NONE cost per
+ * quadrant and the SPLIT cost once after the loop; rate_nocoef does the reverse -- confirm. */
+#endif
+ }
+ if (chosen_rdc.rate < INT_MAX) {
+ chosen_rdc.rate += cpi->partition_cost[pl][PARTITION_SPLIT];
+ chosen_rdc.rdcost =
+ RDCOST(x->rdmult, x->rddiv, chosen_rdc.rate, chosen_rdc.dist);
+#if CONFIG_SUPERTX
+ chosen_rate_nocoef += cpi->partition_cost[pl][PARTITION_NONE];
+#endif
+ }
+ }
+
+ // If last_part is better set the partitioning to that.
+ if (last_part_rdc.rdcost < chosen_rdc.rdcost) {
+ mib[0]->mbmi.sb_type = bsize;
+ if (bsize >= BLOCK_8X8) pc_tree->partitioning = partition;
+ chosen_rdc = last_part_rdc;
+#if CONFIG_SUPERTX
+ chosen_rate_nocoef = last_part_rate_nocoef;
+#endif
+ }
+ // If none was better set the partitioning to that.
+ if (none_rdc.rdcost < chosen_rdc.rdcost) {
+ if (bsize >= BLOCK_8X8) pc_tree->partitioning = PARTITION_NONE;
+ chosen_rdc = none_rdc;
+#if CONFIG_SUPERTX
+ chosen_rate_nocoef = none_rate_nocoef;
+#endif
+ }
+
+#if !CONFIG_PVQ
+ restore_context(x, &x_ctx, mi_row, mi_col, bsize);
+#else
+ restore_context(x, &x_ctx, mi_row, mi_col, &pre_rdo_buf, bsize);
+#endif
+
+ // We must have chosen a partitioning and encoding or we'll fail later on.
+ // No other opportunities for success.
+ if (bsize == cm->sb_size)
+ assert(chosen_rdc.rate < INT_MAX && chosen_rdc.dist < INT64_MAX);
+
+ if (do_recon) {
+ if (bsize == cm->sb_size) {
+ // NOTE: To get estimate for rate due to the tokens, use:
+ // int rate_coeffs = 0;
+ // encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, DRY_RUN_COSTCOEFFS,
+ // bsize, pc_tree, &rate_coeffs);
+ encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, OUTPUT_ENABLED, bsize,
+ pc_tree, NULL);
+ } else {
+ encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize,
+ pc_tree, NULL);
+ }
+ }
+
+ *rate = chosen_rdc.rate;
+ *dist = chosen_rdc.dist;
+#if CONFIG_SUPERTX
+ *rate_nocoef = chosen_rate_nocoef;
+#endif
+}
+ /* Lookup tables indexed by BLOCK_SIZE; the trailing comment on each row
+  * names the block sizes that row's entries correspond to.
+  *
+  * min_partition_size[b] / max_partition_size[b]: replacement lower / upper
+  * bound applied to an observed partition-size range. rd_auto_partition_range()
+  * uses them when cpi->sf.auto_min_max_partition_size is
+  * RELAXED_NEIGHBORING_MIN_MAX, and set_partition_range() uses them when the
+  * observed minimum and maximum are equal.
+  *
+  * next_square_size[b]: used by rd_auto_partition_range() to cap the minimum
+  * size when cpi->sf.use_square_partition_only is set. */
+/* clang-format off */
+static const BLOCK_SIZE min_partition_size[BLOCK_SIZES] = {
+#if CONFIG_CB4X4
+ BLOCK_2X2, BLOCK_2X2, BLOCK_2X2, // 2x2, 2x4, 4x2
+#endif
+ BLOCK_4X4, // 4x4
+ BLOCK_4X4, BLOCK_4X4, BLOCK_4X4, // 4x8, 8x4, 8x8
+ BLOCK_4X4, BLOCK_4X4, BLOCK_8X8, // 8x16, 16x8, 16x16
+ BLOCK_8X8, BLOCK_8X8, BLOCK_16X16, // 16x32, 32x16, 32x32
+ BLOCK_16X16, BLOCK_16X16, BLOCK_16X16, // 32x64, 64x32, 64x64
+#if CONFIG_EXT_PARTITION
+ BLOCK_16X16, BLOCK_16X16, BLOCK_16X16 // 64x128, 128x64, 128x128
+#endif // CONFIG_EXT_PARTITION
+};
+
+static const BLOCK_SIZE max_partition_size[BLOCK_SIZES] = {
+#if CONFIG_CB4X4
+ BLOCK_4X4, BLOCK_4X4, BLOCK_4X4, // 2x2, 2x4, 4x2
+#endif
+ BLOCK_8X8, // 4x4
+ BLOCK_16X16, BLOCK_16X16, BLOCK_16X16, // 4x8, 8x4, 8x8
+ BLOCK_32X32, BLOCK_32X32, BLOCK_32X32, // 8x16, 16x8, 16x16
+ BLOCK_64X64, BLOCK_64X64, BLOCK_64X64, // 16x32, 32x16, 32x32
+ BLOCK_LARGEST, BLOCK_LARGEST, BLOCK_LARGEST, // 32x64, 64x32, 64x64
+#if CONFIG_EXT_PARTITION
+ BLOCK_LARGEST, BLOCK_LARGEST, BLOCK_LARGEST // 64x128, 128x64, 128x128
+#endif // CONFIG_EXT_PARTITION
+};
+
+// Largest square block size that is less than or equal to the current block
+// size.
+static const BLOCK_SIZE next_square_size[BLOCK_SIZES] = {
+#if CONFIG_CB4X4
+ BLOCK_2X2, BLOCK_2X2, BLOCK_2X2, // 2x2, 2x4, 4x2
+#endif
+ BLOCK_4X4, // 4x4
+ BLOCK_4X4, BLOCK_4X4, BLOCK_8X8, // 4x8, 8x4, 8x8
+ BLOCK_8X8, BLOCK_8X8, BLOCK_16X16, // 8x16, 16x8, 16x16
+ BLOCK_16X16, BLOCK_16X16, BLOCK_32X32, // 16x32, 32x16, 32x32
+ BLOCK_32X32, BLOCK_32X32, BLOCK_64X64, // 32x64, 64x32, 64x64
+#if CONFIG_EXT_PARTITION
+ BLOCK_64X64, BLOCK_64X64, BLOCK_128X128 // 64x128, 128x64, 128x128
+#endif // CONFIG_EXT_PARTITION
+};
+/* clang-format on */
+
+// Folds the sb_type of every mode-info entry in a cm->mib_size x cm->mib_size
+// region into a running minimum and maximum.
+//
+// mib points at the region's top-left grid entry; consecutive rows are
+// xd->mi_stride entries apart. A NULL grid entry is counted as BLOCK_4X4.
+//
+// *min_block_size and *max_block_size are read as well as written, so the
+// caller must initialise them first. This lets several calls accumulate one
+// range over more than one superblock.
+static void get_sb_partition_size_range(const AV1_COMMON *const cm,
+                                        MACROBLOCKD *xd, MODE_INFO **mib,
+                                        BLOCK_SIZE *min_block_size,
+                                        BLOCK_SIZE *max_block_size) {
+  int row;
+
+  for (row = 0; row < cm->mib_size; ++row) {
+    MODE_INFO **const row_entries = mib + row * xd->mi_stride;
+    int col;
+
+    for (col = 0; col < cm->mib_size; ++col) {
+      const MODE_INFO *const entry = row_entries[col];
+      const BLOCK_SIZE observed = entry ? entry->mbmi.sb_type : BLOCK_4X4;
+
+      if (observed < *min_block_size) *min_block_size = observed;
+      if (observed > *max_block_size) *max_block_size = observed;
+    }
+  }
+}
+
+// Chooses the partition-size search range for the superblock at
+// (mi_row, mi_col) from the sizes used around it.
+//
+// The range is gathered from up to three superblock-sized regions: the same
+// position in cm->prev_mi_grid_visible (non-key frames only), the superblock
+// to the left and the superblock above (each only when available and
+// non-NULL). If none of those applies, the range defaults to
+// [BLOCK_4X4, BLOCK_LARGEST]. The results, clamped to cm->sb_size, are
+// written to *min_block_size and *max_block_size.
+static void rd_auto_partition_range(AV1_COMP *cpi, const TileInfo *const tile,
+                                    MACROBLOCKD *const xd, int mi_row,
+                                    int mi_col, BLOCK_SIZE *min_block_size,
+                                    BLOCK_SIZE *max_block_size) {
+  AV1_COMMON *const cm = &cpi->common;
+  MODE_INFO **const cur_mi = xd->mi;
+  const int have_left = xd->left_available && cur_mi[-1];
+  const int have_above = xd->up_available && cur_mi[-xd->mi_stride];
+  const int not_key_frame = cm->frame_type != KEY_FRAME;
+  const int rows_left = tile->mi_row_end - mi_row;
+  const int cols_left = tile->mi_col_end - mi_col;
+  int fitted_bh, fitted_bw;  // filled in by find_partition_size(); unused here
+  BLOCK_SIZE lo = BLOCK_4X4;
+  BLOCK_SIZE hi = BLOCK_LARGEST;
+
+  // Only gather a range when at least one source of prediction exists.
+  if (have_left || have_above || not_key_frame) {
+    // Start inverted so that the first observation sets both bounds; each
+    // get_sb_partition_size_range() call then widens the running range.
+    lo = BLOCK_LARGEST;
+    hi = BLOCK_4X4;
+
+    // Co-located region of the previous frame.
+    if (not_key_frame) {
+      MODE_INFO **colocated =
+          &cm->prev_mi_grid_visible[mi_row * xd->mi_stride + mi_col];
+      get_sb_partition_size_range(cm, xd, colocated, &lo, &hi);
+    }
+
+    // Superblock to the left.
+    if (have_left) {
+      MODE_INFO **left_sb = &cur_mi[-cm->mib_size];
+      get_sb_partition_size_range(cm, xd, left_sb, &lo, &hi);
+    }
+
+    // Superblock above.
+    if (have_above) {
+      MODE_INFO **above_sb = &cur_mi[-xd->mi_stride * cm->mib_size];
+      get_sb_partition_size_range(cm, xd, above_sb, &lo, &hi);
+    }
+
+    // "Relaxed" mode replaces both observed bounds via the lookup tables.
+    if (cpi->sf.auto_min_max_partition_size == RELAXED_NEIGHBORING_MIN_MAX) {
+      lo = min_partition_size[lo];
+      hi = max_partition_size[hi];
+    }
+  }
+
+  // Near the tile border the bounds taken from neighbours may not be legal.
+  hi = find_partition_size(hi, rows_left, cols_left, &fitted_bh, &fitted_bw);
+  lo = AOMMIN(lo, hi);
+
+  // Superblocks on the edge of the active image (the real image edge, or the
+  // start of formatting bars) may go all the way down to 4x4.
+  if (av1_active_edge_sb(cpi, mi_row, mi_col))
+    lo = BLOCK_4X4;
+  else
+    lo = AOMMIN(cpi->sf.rd_auto_partition_min_limit, lo);
+
+  // With use_square_partition_only, keep at least one square partition
+  // searchable by not letting the minimum exceed the next square size at or
+  // below the maximum.
+  if (cpi->sf.use_square_partition_only)
+    lo = AOMMIN(lo, next_square_size[hi]);
+
+  *min_block_size = AOMMIN(lo, cm->sb_size);
+  *max_block_size = AOMMIN(hi, cm->sb_size);
+}
+
+// TODO(jingning) refactor functions setting partition search range
+//
+// Derives a partition-size search range for the block at (mi_row, mi_col) of
+// size bsize from the sb_type values found in:
+//   - the co-located mi_width x mi_height region of cm->prev_mi_grid_visible,
+//   - the column of entries immediately to the left (if xd->left_available),
+//   - the row of entries immediately above (if xd->up_available).
+// NULL grid entries are counted as bsize. If the observed minimum and maximum
+// coincide, both are replaced through min_partition_size[] /
+// max_partition_size[]. The results, clamped to cm->sb_size, are written to
+// *min_bs and *max_bs.
+static void set_partition_range(const AV1_COMMON *const cm,
+                                const MACROBLOCKD *const xd, int mi_row,
+                                int mi_col, BLOCK_SIZE bsize,
+                                BLOCK_SIZE *const min_bs,
+                                BLOCK_SIZE *const max_bs) {
+  const int mi_width = mi_size_wide[bsize];
+  const int mi_height = mi_size_high[bsize];
+  int idx, idy;
+
+  const int idx_str = cm->mi_stride * mi_row + mi_col;
+  // Test the grid pointer itself before indexing into it. The address of a
+  // grid element is not a meaningful NULL test: it is only NULL when the grid
+  // is NULL and the offset is zero, and forming it from a NULL grid is
+  // undefined.
+  MODE_INFO **const prev_mi =
+      cm->prev_mi_grid_visible ? &cm->prev_mi_grid_visible[idx_str] : NULL;
+  // Inverted starting bounds so that observations narrow them.
+  // NOTE(review): the lower bound starts at BLOCK_64X64 rather than
+  // BLOCK_LARGEST -- confirm this is intended with CONFIG_EXT_PARTITION.
+  BLOCK_SIZE min_size = BLOCK_64X64;
+  BLOCK_SIZE max_size = BLOCK_4X4;
+
+  // Co-located region in the previous frame's mode-info grid.
+  if (prev_mi) {
+    for (idy = 0; idy < mi_height; ++idy) {
+      for (idx = 0; idx < mi_width; ++idx) {
+        const MODE_INFO *const mi = prev_mi[idy * cm->mi_stride + idx];
+        const BLOCK_SIZE bs = mi ? mi->mbmi.sb_type : bsize;
+        min_size = AOMMIN(min_size, bs);
+        max_size = AOMMAX(max_size, bs);
+      }
+    }
+  }
+
+  // Column of entries immediately to the left of the block.
+  if (xd->left_available) {
+    for (idy = 0; idy < mi_height; ++idy) {
+      const MODE_INFO *const mi = xd->mi[idy * cm->mi_stride - 1];
+      const BLOCK_SIZE bs = mi ? mi->mbmi.sb_type : bsize;
+      min_size = AOMMIN(min_size, bs);
+      max_size = AOMMAX(max_size, bs);
+    }
+  }
+
+  // Row of entries immediately above the block.
+  if (xd->up_available) {
+    for (idx = 0; idx < mi_width; ++idx) {
+      const MODE_INFO *const mi = xd->mi[idx - cm->mi_stride];
+      const BLOCK_SIZE bs = mi ? mi->mbmi.sb_type : bsize;
+      min_size = AOMMIN(min_size, bs);
+      max_size = AOMMAX(max_size, bs);
+    }
+  }
+
+  // A single observed size gives a degenerate range; replace both bounds
+  // through the lookup tables.
+  if (min_size == max_size) {
+    min_size = min_partition_size[min_size];
+    max_size = max_partition_size[max_size];
+  }
+
+  *min_bs = AOMMIN(min_size, cm->sb_size);
+  *max_bs = AOMMIN(max_size, cm->sb_size);
+}
+
+// Copies x->pred_mv into ctx->pred_mv; the byte count is sizeof(x->pred_mv).
+static INLINE void store_pred_mv(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) {
+  const size_t pred_mv_bytes = sizeof(x->pred_mv);
+
+  memcpy(ctx->pred_mv, x->pred_mv, pred_mv_bytes);
+}
+
+// Copies ctx->pred_mv back into x->pred_mv; the byte count is
+// sizeof(x->pred_mv).
+static INLINE void load_pred_mv(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) {
+  const size_t pred_mv_bytes = sizeof(x->pred_mv);
+
+  memcpy(x->pred_mv, ctx->pred_mv, pred_mv_bytes);
+}
+
+#if CONFIG_FP_MB_STATS
+const int qindex_skip_threshold_lookup[BLOCK_SIZES] = { /* Sized by BLOCK_SIZES.
+ * NOTE(review): 13 initializers (16 with CONFIG_EXT_PARTITION); unlike the
+ * partition-size tables earlier in this file there are no extra leading
+ * entries under CONFIG_CB4X4 -- confirm the indexing still lines up. The same
+ * applies to the two tables that follow. */
+ 0,
+ 10,
+ 10,
+ 30,
+ 40,
+ 40,
+ 60,
+ 80,
+ 80,
+ 90,
+ 100,
+ 100,
+ 120,
+#if CONFIG_EXT_PARTITION
+ // TODO(debargha): What are the correct numbers here?
+ 130,
+ 130,
+ 150
+#endif // CONFIG_EXT_PARTITION
+};
+const int qindex_split_threshold_lookup[BLOCK_SIZES] = {  // rd_pick_partition() compares cm->base_qindex against the entry for bsize
+ 0,
+ 3,
+ 3,
+ 7,
+ 15,
+ 15,
+ 30,
+ 40,
+ 40,
+ 60,
+ 80,
+ 80,
+ 120,
+#if CONFIG_EXT_PARTITION
+ // TODO(debargha): What are the correct numbers here?
+ 160,
+ 160,
+ 240
+#endif // CONFIG_EXT_PARTITION
+};
+const int complexity_16x16_blocks_threshold[BLOCK_SIZES] = {  // rd_pick_partition() disallows PARTITION_NONE when none_complexity exceeds the entry for bsize
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 4,
+ 4,
+ 6,
+#if CONFIG_EXT_PARTITION
+ // TODO(debargha): What are the correct numbers here?
+ 8,
+ 8,
+ 10
+#endif // CONFIG_EXT_PARTITION
+};
+
+typedef enum {
+ MV_ZERO = 0,  // returned by get_motion_direction_fp() when FPMB_MOTION_ZERO_MASK is set
+ MV_LEFT = 1,
+ MV_UP = 2,
+ MV_RIGHT = 3,
+ MV_DOWN = 4,  // also the fallback when none of the tested flag bits is set
+ MV_INVALID  // implicit value 5; never returned by get_motion_direction_fp()
+} MOTION_DIRECTION;  // the numbering matters: get_motion_inconsistency() compares value differences
+
+// Maps a byte of first-pass flags to a MOTION_DIRECTION.
+// The flag bits are tested in a fixed priority order -- zero, left, right,
+// up -- and the first one that is set decides the result. If none of them is
+// set the result is MV_DOWN.
+static INLINE MOTION_DIRECTION get_motion_direction_fp(uint8_t fp_byte) {
+  if (fp_byte & FPMB_MOTION_ZERO_MASK) return MV_ZERO;
+  if (fp_byte & FPMB_MOTION_LEFT_MASK) return MV_LEFT;
+  if (fp_byte & FPMB_MOTION_RIGHT_MASK) return MV_RIGHT;
+  if (fp_byte & FPMB_MOTION_UP_MASK) return MV_UP;
+  return MV_DOWN;
+}
+
+// Scores how much two motion directions disagree:
+//   0 - identical,
+//   2 - their enum values differ by exactly 2 (MV_LEFT/MV_RIGHT and
+//       MV_UP/MV_DOWN, but also MV_ZERO/MV_UP),
+//   1 - anything else.
+static INLINE int get_motion_inconsistency(MOTION_DIRECTION this_mv,
+                                           MOTION_DIRECTION that_mv) {
+  const int gap = abs(this_mv - that_mv);
+
+  if (gap == 0) return 0;
+  return (gap == 2) ? 2 : 1;
+}
+#endif
+
+#if CONFIG_EXT_PARTITION_TYPES
+// Costs one three-block partition candidate for the block at (mi_row, mi_col)
+// of size bsize.
+//
+// The three sub-blocks -- (mi_rowK, mi_colK) of size subsizeK, using ctxs[K],
+// for K = 0..2 -- are searched in order with rd_pick_sb_modes(). After
+// sub-blocks 0 and 1 are searched, each is dry-run encoded (update_state() +
+// encode_superblock(DRY_RUN_NORMAL)) before the next one is searched. The
+// search stops early once the running cost can no longer beat
+// best_rdc->rdcost (without CONFIG_SUPERTX) or becomes invalid.
+//
+// If the total, including the cost of signalling `partition`, is below
+// best_rdc->rdcost, then *best_rdc is replaced and pc_tree->partitioning is
+// set to `partition` (and *best_rate_nocoef is updated under CONFIG_SUPERTX).
+// Otherwise the outputs are left untouched.
+//
+// ctx supplies the pred_mv state loaded before the first sub-block when
+// cpi->sf.adaptive_motion_search is set.
+static void rd_test_partition3(
+    const AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data,
+    TOKENEXTRA **tp, PC_TREE *pc_tree, RD_STATS *best_rdc,
+    PICK_MODE_CONTEXT ctxs[3], PICK_MODE_CONTEXT *ctx, int mi_row, int mi_col,
+    BLOCK_SIZE bsize, PARTITION_TYPE partition,
+#if CONFIG_SUPERTX
+    int64_t best_rd, int *best_rate_nocoef, RD_SEARCH_MACROBLOCK_CONTEXT *x_ctx,
+#endif
+    int mi_row0, int mi_col0, BLOCK_SIZE subsize0, int mi_row1, int mi_col1,
+    BLOCK_SIZE subsize1, int mi_row2, int mi_col2, BLOCK_SIZE subsize2) {
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  RD_STATS this_rdc, sum_rdc;
+#if CONFIG_SUPERTX
+  const AV1_COMMON *const cm = &cpi->common;
+  TileInfo *const tile_info = &tile_data->tile_info;
+  int this_rate_nocoef, sum_rate_nocoef;
+  int abort_flag;
+  const int supertx_allowed = !frame_is_intra_only(cm) &&
+                              bsize <= MAX_SUPERTX_BLOCK_SIZE &&
+                              !xd->lossless[0];
+#endif
+  if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx);
+
+  // First sub-block.
+  rd_pick_sb_modes(cpi, tile_data, x, mi_row0, mi_col0, &sum_rdc,
+#if CONFIG_SUPERTX
+                   &sum_rate_nocoef,
+#endif
+#if CONFIG_EXT_PARTITION_TYPES
+                   partition,
+#endif
+                   subsize0, &ctxs[0], best_rdc->rdcost);
+#if CONFIG_SUPERTX
+  abort_flag = sum_rdc.rdcost >= best_rd;
+#endif
+
+#if CONFIG_SUPERTX
+  if (sum_rdc.rdcost < INT64_MAX) {
+#else
+  if (sum_rdc.rdcost < best_rdc->rdcost) {
+#endif
+    PICK_MODE_CONTEXT *ctx_0 = &ctxs[0];
+    // Dry-run encode sub-block 0 before searching sub-block 1.
+    update_state(cpi, td, ctx_0, mi_row0, mi_col0, subsize0, 1);
+    encode_superblock(cpi, td, tp, DRY_RUN_NORMAL, mi_row0, mi_col0, subsize0,
+                      ctx_0, NULL);
+
+    if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx_0);
+
+    // Second sub-block.
+#if CONFIG_SUPERTX
+    rd_pick_sb_modes(cpi, tile_data, x, mi_row1, mi_col1, &this_rdc,
+                     &this_rate_nocoef,
+#if CONFIG_EXT_PARTITION_TYPES
+                     partition,
+#endif
+                     subsize1, &ctxs[1], INT64_MAX - sum_rdc.rdcost);
+#else
+    rd_pick_sb_modes(cpi, tile_data, x, mi_row1, mi_col1, &this_rdc,
+#if CONFIG_EXT_PARTITION_TYPES
+                     partition,
+#endif
+                     subsize1, &ctxs[1], best_rdc->rdcost - sum_rdc.rdcost);
+#endif  // CONFIG_SUPERTX
+
+    if (this_rdc.rate == INT_MAX) {
+      sum_rdc.rdcost = INT64_MAX;
+#if CONFIG_SUPERTX
+      sum_rate_nocoef = INT_MAX;
+#endif
+    } else {
+      sum_rdc.rate += this_rdc.rate;
+      sum_rdc.dist += this_rdc.dist;
+      sum_rdc.rdcost += this_rdc.rdcost;
+#if CONFIG_SUPERTX
+      sum_rate_nocoef += this_rate_nocoef;
+#endif
+    }
+
+#if CONFIG_SUPERTX
+    if (sum_rdc.rdcost < INT64_MAX) {
+#else
+    if (sum_rdc.rdcost < best_rdc->rdcost) {
+#endif
+      PICK_MODE_CONTEXT *ctx_1 = &ctxs[1];
+      // Dry-run encode sub-block 1 before searching sub-block 2.
+      update_state(cpi, td, ctx_1, mi_row1, mi_col1, subsize1, 1);
+      encode_superblock(cpi, td, tp, DRY_RUN_NORMAL, mi_row1, mi_col1, subsize1,
+                        ctx_1, NULL);
+
+      if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx_1);
+
+      // Third sub-block.
+#if CONFIG_SUPERTX
+      rd_pick_sb_modes(cpi, tile_data, x, mi_row2, mi_col2, &this_rdc,
+                       &this_rate_nocoef,
+#if CONFIG_EXT_PARTITION_TYPES
+                       partition,
+#endif
+                       subsize2, &ctxs[2], INT64_MAX - sum_rdc.rdcost);
+#else
+      rd_pick_sb_modes(cpi, tile_data, x, mi_row2, mi_col2, &this_rdc,
+#if CONFIG_EXT_PARTITION_TYPES
+                       partition,
+#endif
+                       subsize2, &ctxs[2], best_rdc->rdcost - sum_rdc.rdcost);
+#endif  // CONFIG_SUPERTX
+
+      if (this_rdc.rate == INT_MAX) {
+        sum_rdc.rdcost = INT64_MAX;
+#if CONFIG_SUPERTX
+        sum_rate_nocoef = INT_MAX;
+#endif
+      } else {
+        sum_rdc.rate += this_rdc.rate;
+        sum_rdc.dist += this_rdc.dist;
+        sum_rdc.rdcost += this_rdc.rdcost;
+#if CONFIG_SUPERTX
+        sum_rate_nocoef += this_rate_nocoef;
+#endif
+      }
+
+#if CONFIG_SUPERTX
+      // Compare the candidate coded without supertx (flag bit 0) against the
+      // same candidate coded with supertx (flag bit 1).
+      // pc_tree->partitioning is switched to `partition` for the duration of
+      // the check and put back afterwards.
+      if (supertx_allowed && !abort_flag && sum_rdc.rdcost < INT64_MAX) {
+        TX_SIZE supertx_size = max_txsize_lookup[bsize];
+        const PARTITION_TYPE best_partition = pc_tree->partitioning;
+        pc_tree->partitioning = partition;
+        sum_rdc.rate += av1_cost_bit(
+            cm->fc->supertx_prob[partition_supertx_context_lookup[partition]]
+                                [supertx_size],
+            0);
+        sum_rdc.rdcost =
+            RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist);
+
+        if (!check_intra_sb(cpi, tile_info, mi_row, mi_col, bsize, pc_tree)) {
+          TX_TYPE best_tx = DCT_DCT;
+          RD_STATS tmp_rdc = { sum_rate_nocoef, 0, 0 };
+
+          restore_context(x, x_ctx, mi_row, mi_col, bsize);
+
+          rd_supertx_sb(cpi, td, tile_info, mi_row, mi_col, bsize,
+                        &tmp_rdc.rate, &tmp_rdc.dist, &best_tx, pc_tree);
+
+          tmp_rdc.rate += av1_cost_bit(
+              cm->fc->supertx_prob[partition_supertx_context_lookup[partition]]
+                                  [supertx_size],
+              1);
+          tmp_rdc.rdcost =
+              RDCOST(x->rdmult, x->rddiv, tmp_rdc.rate, tmp_rdc.dist);
+          if (tmp_rdc.rdcost < sum_rdc.rdcost) {
+            sum_rdc = tmp_rdc;
+            update_supertx_param_sb(cpi, td, mi_row, mi_col, bsize, best_tx,
+                                    supertx_size, pc_tree);
+          }
+        }
+
+        pc_tree->partitioning = best_partition;
+      }
+#endif  // CONFIG_SUPERTX
+
+      if (sum_rdc.rdcost < best_rdc->rdcost) {
+#if CONFIG_UNPOISON_PARTITION_CTX
+        // has_rows / has_cols are not parameters of this function, so derive
+        // them here the same way rd_pick_partition() does.
+        const int hbs = mi_size_wide[bsize] / 2;
+        const int has_rows = mi_row + hbs < cpi->common.mi_rows;
+        const int has_cols = mi_col + hbs < cpi->common.mi_cols;
+#endif
+        int pl = partition_plane_context(xd, mi_row, mi_col,
+#if CONFIG_UNPOISON_PARTITION_CTX
+                                         has_rows, has_cols,
+#endif
+                                         bsize);
+        // Add the cost of signalling this partition type, then re-test.
+        sum_rdc.rate += cpi->partition_cost[pl][partition];
+        sum_rdc.rdcost =
+            RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist);
+#if CONFIG_SUPERTX
+        sum_rate_nocoef += cpi->partition_cost[pl][partition];
+#endif
+        if (sum_rdc.rdcost < best_rdc->rdcost) {
+#if CONFIG_SUPERTX
+          *best_rate_nocoef = sum_rate_nocoef;
+          assert(*best_rate_nocoef >= 0);
+#endif
+          *best_rdc = sum_rdc;
+          pc_tree->partitioning = partition;
+        }
+      }
+    }
+  }
+}
+#endif // CONFIG_EXT_PARTITION_TYPES
+
+// TODO(jingning,jimbankoski,rbultje): properly skip partition types that are
+// unlikely to be selected depending on previous rate-distortion optimization
+// results, for encoding speed-up.
+static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td,
+ TileDataEnc *tile_data, TOKENEXTRA **tp,
+ int mi_row, int mi_col, BLOCK_SIZE bsize,
+ RD_STATS *rd_cost,
+#if CONFIG_SUPERTX
+ int *rate_nocoef,
+#endif
+ int64_t best_rd, PC_TREE *pc_tree) {
+ const AV1_COMMON *const cm = &cpi->common;
+ TileInfo *const tile_info = &tile_data->tile_info;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const int mi_step = mi_size_wide[bsize] / 2;
+ RD_SEARCH_MACROBLOCK_CONTEXT x_ctx;
+ const TOKENEXTRA *const tp_orig = *tp;
+ PICK_MODE_CONTEXT *ctx_none = &pc_tree->none;
+#if CONFIG_UNPOISON_PARTITION_CTX
+ const int hbs = mi_size_wide[bsize] / 2;
+ const int has_rows = mi_row + hbs < cm->mi_rows;
+ const int has_cols = mi_col + hbs < cm->mi_cols;
+#else
+ int tmp_partition_cost[PARTITION_TYPES];
+#endif
+ BLOCK_SIZE subsize;
+ RD_STATS this_rdc, sum_rdc, best_rdc;
+ const int bsize_at_least_8x8 = (bsize >= BLOCK_8X8);
+ int do_square_split = bsize_at_least_8x8;
+#if CONFIG_CB4X4
+ const int unify_bsize = 1;
+ const int pl = bsize_at_least_8x8
+ ? partition_plane_context(xd, mi_row, mi_col,
+#if CONFIG_UNPOISON_PARTITION_CTX
+ has_rows, has_cols,
+#endif
+ bsize)
+ : 0;
+#else
+ const int unify_bsize = 0;
+ const int pl = partition_plane_context(xd, mi_row, mi_col,
+#if CONFIG_UNPOISON_PARTITION_CTX
+ has_rows, has_cols,
+#endif
+ bsize);
+#endif // CONFIG_CB4X4
+ const int *partition_cost = cpi->partition_cost[pl];
+#if CONFIG_SUPERTX
+ int this_rate_nocoef, sum_rate_nocoef = 0, best_rate_nocoef = INT_MAX;
+ int abort_flag;
+ const int supertx_allowed = !frame_is_intra_only(cm) &&
+ bsize <= MAX_SUPERTX_BLOCK_SIZE &&
+ !xd->lossless[0];
+#endif // CONFIG_SUPERTX
+
+ int do_rectangular_split = 1;
+#if CONFIG_EXT_PARTITION_TYPES
+ BLOCK_SIZE bsize2 = get_subsize(bsize, PARTITION_SPLIT);
+#endif
+
+ // Override skipping rectangular partition operations for edge blocks
+ const int force_horz_split = (mi_row + mi_step >= cm->mi_rows);
+ const int force_vert_split = (mi_col + mi_step >= cm->mi_cols);
+ const int xss = x->e_mbd.plane[1].subsampling_x;
+ const int yss = x->e_mbd.plane[1].subsampling_y;
+
+ BLOCK_SIZE min_size = x->min_partition_size;
+ BLOCK_SIZE max_size = x->max_partition_size;
+
+#if CONFIG_FP_MB_STATS
+ unsigned int src_diff_var = UINT_MAX;
+ int none_complexity = 0;
+#endif
+
+ int partition_none_allowed = !force_horz_split && !force_vert_split;
+ int partition_horz_allowed =
+ !force_vert_split && yss <= xss && bsize_at_least_8x8;
+ int partition_vert_allowed =
+ !force_horz_split && xss <= yss && bsize_at_least_8x8;
+
+#if CONFIG_PVQ
+ od_rollback_buffer pre_rdo_buf;
+#endif
+
+ (void)*tp_orig;
+
+#if !CONFIG_UNPOISON_PARTITION_CTX
+ if (force_horz_split || force_vert_split) {
+ tmp_partition_cost[PARTITION_NONE] = INT_MAX;
+
+ if (!force_vert_split) { // force_horz_split only
+ tmp_partition_cost[PARTITION_VERT] = INT_MAX;
+ tmp_partition_cost[PARTITION_HORZ] =
+ av1_cost_bit(cm->fc->partition_prob[pl][PARTITION_HORZ], 0);
+ tmp_partition_cost[PARTITION_SPLIT] =
+ av1_cost_bit(cm->fc->partition_prob[pl][PARTITION_HORZ], 1);
+ } else if (!force_horz_split) { // force_vert_split only
+ tmp_partition_cost[PARTITION_HORZ] = INT_MAX;
+ tmp_partition_cost[PARTITION_VERT] =
+ av1_cost_bit(cm->fc->partition_prob[pl][PARTITION_VERT], 0);
+ tmp_partition_cost[PARTITION_SPLIT] =
+ av1_cost_bit(cm->fc->partition_prob[pl][PARTITION_VERT], 1);
+ } else { // force_ horz_split && force_vert_split horz_split
+ tmp_partition_cost[PARTITION_HORZ] = INT_MAX;
+ tmp_partition_cost[PARTITION_VERT] = INT_MAX;
+ tmp_partition_cost[PARTITION_SPLIT] = 0;
+ }
+
+ partition_cost = tmp_partition_cost;
+ }
+#endif
+
+#if CONFIG_VAR_TX
+#ifndef NDEBUG
+ // Nothing should rely on the default value of this array (which is just
+ // leftover from encoding the previous block. Setting it to magic number
+ // when debugging.
+ memset(x->blk_skip[0], 234, sizeof(x->blk_skip[0]));
+#endif // NDEBUG
+#endif // CONFIG_VAR_TX
+
+ assert(mi_size_wide[bsize] == mi_size_high[bsize]);
+
+ av1_init_rd_stats(&this_rdc);
+ av1_init_rd_stats(&sum_rdc);
+ av1_invalid_rd_stats(&best_rdc);
+ best_rdc.rdcost = best_rd;
+
+ set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
+
+ if (bsize == BLOCK_16X16 && cpi->vaq_refresh)
+ x->mb_energy = av1_block_energy(cpi, x, bsize);
+
+ if (cpi->sf.cb_partition_search && bsize == BLOCK_16X16) {
+ const int cb_partition_search_ctrl =
+ ((pc_tree->index == 0 || pc_tree->index == 3) +
+ get_chessboard_index(cm->current_video_frame)) &
+ 0x1;
+
+ if (cb_partition_search_ctrl && bsize > min_size && bsize < max_size)
+ set_partition_range(cm, xd, mi_row, mi_col, bsize, &min_size, &max_size);
+ }
+
+ // Determine partition types in search according to the speed features.
+ // The threshold set here has to be of square block size.
+ if (cpi->sf.auto_min_max_partition_size) {
+ const int no_partition_allowed = (bsize <= max_size && bsize >= min_size);
+ // Note: Further partitioning is NOT allowed when bsize == min_size already.
+ const int partition_allowed = (bsize <= max_size && bsize > min_size);
+ partition_none_allowed &= no_partition_allowed;
+ partition_horz_allowed &= partition_allowed || force_horz_split;
+ partition_vert_allowed &= partition_allowed || force_vert_split;
+ do_square_split &= bsize > min_size;
+ }
+ if (cpi->sf.use_square_partition_only) {
+ partition_horz_allowed &= force_horz_split;
+ partition_vert_allowed &= force_vert_split;
+ }
+
+#if CONFIG_VAR_TX
+ xd->above_txfm_context = cm->above_txfm_context + mi_col;
+ xd->left_txfm_context =
+ xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+#endif
+#if !CONFIG_PVQ
+ save_context(x, &x_ctx, mi_row, mi_col, bsize);
+#else
+ save_context(x, &x_ctx, mi_row, mi_col, &pre_rdo_buf, bsize);
+#endif
+
+#if CONFIG_FP_MB_STATS
+ if (cpi->use_fp_mb_stats) {
+ set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
+ src_diff_var = get_sby_perpixel_diff_variance(cpi, &x->plane[0].src, mi_row,
+ mi_col, bsize);
+ }
+#endif
+
+#if CONFIG_FP_MB_STATS
+ // Decide whether we shall split directly and skip searching NONE by using
+ // the first pass block statistics
+ if (cpi->use_fp_mb_stats && bsize >= BLOCK_32X32 && do_square_split &&
+ partition_none_allowed && src_diff_var > 4 &&
+ cm->base_qindex < qindex_split_threshold_lookup[bsize]) {
+ int mb_row = mi_row >> 1;
+ int mb_col = mi_col >> 1;
+ int mb_row_end =
+ AOMMIN(mb_row + num_16x16_blocks_high_lookup[bsize], cm->mb_rows);
+ int mb_col_end =
+ AOMMIN(mb_col + num_16x16_blocks_wide_lookup[bsize], cm->mb_cols);
+ int r, c;
+
+ // compute a complexity measure, basically measure inconsistency of motion
+ // vectors obtained from the first pass in the current block
+ for (r = mb_row; r < mb_row_end; r++) {
+ for (c = mb_col; c < mb_col_end; c++) {
+ const int mb_index = r * cm->mb_cols + c;
+
+ MOTION_DIRECTION this_mv;
+ MOTION_DIRECTION right_mv;
+ MOTION_DIRECTION bottom_mv;
+
+ this_mv =
+ get_motion_direction_fp(cpi->twopass.this_frame_mb_stats[mb_index]);
+
+ // to its right
+ if (c != mb_col_end - 1) {
+ right_mv = get_motion_direction_fp(
+ cpi->twopass.this_frame_mb_stats[mb_index + 1]);
+ none_complexity += get_motion_inconsistency(this_mv, right_mv);
+ }
+
+ // to its bottom
+ if (r != mb_row_end - 1) {
+ bottom_mv = get_motion_direction_fp(
+ cpi->twopass.this_frame_mb_stats[mb_index + cm->mb_cols]);
+ none_complexity += get_motion_inconsistency(this_mv, bottom_mv);
+ }
+
+ // do not count its left and top neighbors to avoid double counting
+ }
+ }
+
+ if (none_complexity > complexity_16x16_blocks_threshold[bsize]) {
+ partition_none_allowed = 0;
+ }
+ }
+#endif
+
+ // PARTITION_NONE
+ if (partition_none_allowed) {
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc,
+#if CONFIG_SUPERTX
+ &this_rate_nocoef,
+#endif
+#if CONFIG_EXT_PARTITION_TYPES
+ PARTITION_NONE,
+#endif
+ bsize, ctx_none, best_rdc.rdcost);
+ if (this_rdc.rate != INT_MAX) {
+ if (bsize_at_least_8x8) {
+ this_rdc.rate += partition_cost[PARTITION_NONE];
+ this_rdc.rdcost =
+ RDCOST(x->rdmult, x->rddiv, this_rdc.rate, this_rdc.dist);
+#if CONFIG_SUPERTX
+ this_rate_nocoef += partition_cost[PARTITION_NONE];
+#endif
+ }
+
+ if (this_rdc.rdcost < best_rdc.rdcost) {
+ // Adjust dist breakout threshold according to the partition size.
+ const int64_t dist_breakout_thr =
+ cpi->sf.partition_search_breakout_dist_thr >>
+ ((2 * (MAX_SB_SIZE_LOG2 - 2)) -
+ (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]));
+ const int rate_breakout_thr =
+ cpi->sf.partition_search_breakout_rate_thr *
+ num_pels_log2_lookup[bsize];
+
+ best_rdc = this_rdc;
+#if CONFIG_SUPERTX
+ best_rate_nocoef = this_rate_nocoef;
+ assert(best_rate_nocoef >= 0);
+#endif
+ if (bsize_at_least_8x8) pc_tree->partitioning = PARTITION_NONE;
+
+ // If all y, u, v transform blocks in this partition are skippable, and
+ // the dist & rate are within the thresholds, the partition search is
+ // terminated for current branch of the partition search tree.
+ // The dist & rate thresholds are set to 0 at speed 0 to disable the
+ // early termination at that speed.
+ if (!x->e_mbd.lossless[xd->mi[0]->mbmi.segment_id] &&
+ (ctx_none->skippable && best_rdc.dist < dist_breakout_thr &&
+ best_rdc.rate < rate_breakout_thr)) {
+ do_square_split = 0;
+ do_rectangular_split = 0;
+ }
+
+#if CONFIG_FP_MB_STATS
+ // Check if every 16x16 first pass block statistics has zero
+ // motion and the corresponding first pass residue is small enough.
+ // If that is the case, check the difference variance between the
+ // current frame and the last frame. If the variance is small enough,
+ // stop further splitting in RD optimization
+ if (cpi->use_fp_mb_stats && do_square_split &&
+ cm->base_qindex > qindex_skip_threshold_lookup[bsize]) {
+ int mb_row = mi_row >> 1;
+ int mb_col = mi_col >> 1;
+ int mb_row_end =
+ AOMMIN(mb_row + num_16x16_blocks_high_lookup[bsize], cm->mb_rows);
+ int mb_col_end =
+ AOMMIN(mb_col + num_16x16_blocks_wide_lookup[bsize], cm->mb_cols);
+ int r, c;
+
+ int skip = 1;
+ for (r = mb_row; r < mb_row_end; r++) {
+ for (c = mb_col; c < mb_col_end; c++) {
+ const int mb_index = r * cm->mb_cols + c;
+ if (!(cpi->twopass.this_frame_mb_stats[mb_index] &
+ FPMB_MOTION_ZERO_MASK) ||
+ !(cpi->twopass.this_frame_mb_stats[mb_index] &
+ FPMB_ERROR_SMALL_MASK)) {
+ skip = 0;
+ break;
+ }
+ }
+ if (skip == 0) {
+ break;
+ }
+ }
+ if (skip) {
+ if (src_diff_var == UINT_MAX) {
+ set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
+ src_diff_var = get_sby_perpixel_diff_variance(
+ cpi, &x->plane[0].src, mi_row, mi_col, bsize);
+ }
+ if (src_diff_var < 8) {
+ do_square_split = 0;
+ do_rectangular_split = 0;
+ }
+ }
+ }
+#endif
+ }
+ }
+#if !CONFIG_PVQ
+ restore_context(x, &x_ctx, mi_row, mi_col, bsize);
+#else
+ restore_context(x, &x_ctx, mi_row, mi_col, &pre_rdo_buf, bsize);
+#endif
+ }
+
+ // store estimated motion vector
+ if (cpi->sf.adaptive_motion_search) store_pred_mv(x, ctx_none);
+
+ // PARTITION_SPLIT
+ // TODO(jingning): use the motion vectors given by the above search as
+ // the starting point of motion search in the following partition type check.
+ if (do_square_split) {
+ int reached_last_index = 0;
+ subsize = get_subsize(bsize, PARTITION_SPLIT);
+ if (bsize == BLOCK_8X8 && !unify_bsize) {
+#if CONFIG_DUAL_FILTER
+ if (cpi->sf.adaptive_pred_interp_filter && partition_none_allowed)
+ pc_tree->leaf_split[0]->pred_interp_filter =
+ ctx_none->mic.mbmi.interp_filter[0];
+#else
+ if (cpi->sf.adaptive_pred_interp_filter && partition_none_allowed)
+ pc_tree->leaf_split[0]->pred_interp_filter =
+ ctx_none->mic.mbmi.interp_filter;
+#endif
+#if CONFIG_SUPERTX
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc,
+ &sum_rate_nocoef,
+#if CONFIG_EXT_PARTITION_TYPES
+ PARTITION_SPLIT,
+#endif
+ subsize, pc_tree->leaf_split[0], INT64_MAX);
+#else
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc,
+#if CONFIG_EXT_PARTITION_TYPES
+ PARTITION_SPLIT,
+#endif
+ subsize, pc_tree->leaf_split[0], best_rdc.rdcost);
+#endif // CONFIG_SUPERTX
+ if (sum_rdc.rate == INT_MAX) {
+ sum_rdc.rdcost = INT64_MAX;
+#if CONFIG_SUPERTX
+ sum_rate_nocoef = INT_MAX;
+#endif
+ }
+#if CONFIG_SUPERTX
+ if (supertx_allowed && sum_rdc.rdcost < INT64_MAX) {
+ TX_SIZE supertx_size = max_txsize_lookup[bsize];
+ const PARTITION_TYPE best_partition = pc_tree->partitioning;
+
+ pc_tree->partitioning = PARTITION_SPLIT;
+
+ sum_rdc.rate += av1_cost_bit(
+ cm->fc->supertx_prob[partition_supertx_context_lookup
+ [PARTITION_SPLIT]][supertx_size],
+ 0);
+ sum_rdc.rdcost =
+ RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist);
+
+ if (is_inter_mode(pc_tree->leaf_split[0]->mic.mbmi.mode)) {
+ TX_TYPE best_tx = DCT_DCT;
+ RD_STATS tmp_rdc;
+ av1_init_rd_stats(&tmp_rdc);
+ tmp_rdc.rate = sum_rate_nocoef;
+
+ restore_context(x, &x_ctx, mi_row, mi_col, bsize);
+
+ rd_supertx_sb(cpi, td, tile_info, mi_row, mi_col, bsize,
+ &tmp_rdc.rate, &tmp_rdc.dist, &best_tx, pc_tree);
+
+ tmp_rdc.rate += av1_cost_bit(
+ cm->fc->supertx_prob[partition_supertx_context_lookup
+ [PARTITION_SPLIT]][supertx_size],
+ 1);
+ tmp_rdc.rdcost =
+ RDCOST(x->rdmult, x->rddiv, tmp_rdc.rate, tmp_rdc.dist);
+ if (tmp_rdc.rdcost < sum_rdc.rdcost) {
+ sum_rdc = tmp_rdc;
+ update_supertx_param_sb(cpi, td, mi_row, mi_col, bsize, best_tx,
+ supertx_size, pc_tree);
+ }
+ }
+
+ pc_tree->partitioning = best_partition;
+ }
+#endif // CONFIG_SUPERTX
+ reached_last_index = 1;
+ } else {
+ int idx;
+#if CONFIG_SUPERTX
+ for (idx = 0; idx < 4 && sum_rdc.rdcost < INT64_MAX; ++idx) {
+#else
+ for (idx = 0; idx < 4 && sum_rdc.rdcost < best_rdc.rdcost; ++idx) {
+#endif // CONFIG_SUPERTX
+ const int x_idx = (idx & 1) * mi_step;
+ const int y_idx = (idx >> 1) * mi_step;
+
+ if (mi_row + y_idx >= cm->mi_rows || mi_col + x_idx >= cm->mi_cols)
+ continue;
+
+ if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx_none);
+
+ pc_tree->split[idx]->index = idx;
+#if CONFIG_SUPERTX
+ rd_pick_partition(cpi, td, tile_data, tp, mi_row + y_idx,
+ mi_col + x_idx, subsize, &this_rdc, &this_rate_nocoef,
+ INT64_MAX - sum_rdc.rdcost, pc_tree->split[idx]);
+#else
+ rd_pick_partition(
+ cpi, td, tile_data, tp, mi_row + y_idx, mi_col + x_idx, subsize,
+ &this_rdc, best_rdc.rdcost - sum_rdc.rdcost, pc_tree->split[idx]);
+#endif // CONFIG_SUPERTX
+
+ if (this_rdc.rate == INT_MAX) {
+ sum_rdc.rdcost = INT64_MAX;
+#if CONFIG_SUPERTX
+ sum_rate_nocoef = INT_MAX;
+#endif // CONFIG_SUPERTX
+ break;
+ } else {
+ sum_rdc.rate += this_rdc.rate;
+ sum_rdc.dist += this_rdc.dist;
+ sum_rdc.rdcost += this_rdc.rdcost;
+#if CONFIG_SUPERTX
+ sum_rate_nocoef += this_rate_nocoef;
+#endif // CONFIG_SUPERTX
+ }
+ }
+ reached_last_index = (idx == 4);
+#if CONFIG_SUPERTX
+ if (supertx_allowed && sum_rdc.rdcost < INT64_MAX && reached_last_index) {
+ TX_SIZE supertx_size = max_txsize_lookup[bsize];
+ const PARTITION_TYPE best_partition = pc_tree->partitioning;
+
+ pc_tree->partitioning = PARTITION_SPLIT;
+
+ sum_rdc.rate += av1_cost_bit(
+ cm->fc->supertx_prob[partition_supertx_context_lookup
+ [PARTITION_SPLIT]][supertx_size],
+ 0);
+ sum_rdc.rdcost =
+ RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist);
+
+ if (!check_intra_sb(cpi, tile_info, mi_row, mi_col, bsize, pc_tree)) {
+ TX_TYPE best_tx = DCT_DCT;
+ RD_STATS tmp_rdc;
+ av1_init_rd_stats(&tmp_rdc);
+ tmp_rdc.rate = sum_rate_nocoef;
+
+ restore_context(x, &x_ctx, mi_row, mi_col, bsize);
+
+ rd_supertx_sb(cpi, td, tile_info, mi_row, mi_col, bsize,
+ &tmp_rdc.rate, &tmp_rdc.dist, &best_tx, pc_tree);
+
+ tmp_rdc.rate += av1_cost_bit(
+ cm->fc->supertx_prob[partition_supertx_context_lookup
+ [PARTITION_SPLIT]][supertx_size],
+ 1);
+ tmp_rdc.rdcost =
+ RDCOST(x->rdmult, x->rddiv, tmp_rdc.rate, tmp_rdc.dist);
+ if (tmp_rdc.rdcost < sum_rdc.rdcost) {
+ sum_rdc = tmp_rdc;
+ update_supertx_param_sb(cpi, td, mi_row, mi_col, bsize, best_tx,
+ supertx_size, pc_tree);
+ }
+ }
+
+ pc_tree->partitioning = best_partition;
+ }
+#endif // CONFIG_SUPERTX
+ }
+
+ if (reached_last_index && sum_rdc.rdcost < best_rdc.rdcost) {
+ sum_rdc.rate += partition_cost[PARTITION_SPLIT];
+ sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist);
+#if CONFIG_SUPERTX
+ sum_rate_nocoef += partition_cost[PARTITION_SPLIT];
+#endif // CONFIG_SUPERTX
+
+ if (sum_rdc.rdcost < best_rdc.rdcost) {
+ best_rdc = sum_rdc;
+#if CONFIG_SUPERTX
+ best_rate_nocoef = sum_rate_nocoef;
+ assert(best_rate_nocoef >= 0);
+#endif // CONFIG_SUPERTX
+ pc_tree->partitioning = PARTITION_SPLIT;
+ }
+ } else if (cpi->sf.less_rectangular_check) {
+ // skip rectangular partition test when larger block size
+ // gives better rd cost
+ do_rectangular_split &= !partition_none_allowed;
+ }
+#if !CONFIG_PVQ
+ restore_context(x, &x_ctx, mi_row, mi_col, bsize);
+#else
+ restore_context(x, &x_ctx, mi_row, mi_col, &pre_rdo_buf, bsize);
+#endif
+ } // if (do_split)
+
+ // PARTITION_HORZ
+ if (partition_horz_allowed &&
+ (do_rectangular_split || av1_active_h_edge(cpi, mi_row, mi_step))) {
+ subsize = get_subsize(bsize, PARTITION_HORZ);
+ if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx_none);
+#if CONFIG_DUAL_FILTER
+ if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 &&
+ partition_none_allowed)
+ pc_tree->horizontal[0].pred_interp_filter =
+ ctx_none->mic.mbmi.interp_filter[0];
+#else
+ if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 &&
+ partition_none_allowed)
+ pc_tree->horizontal[0].pred_interp_filter =
+ ctx_none->mic.mbmi.interp_filter;
+#endif
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc,
+#if CONFIG_SUPERTX
+ &sum_rate_nocoef,
+#endif // CONFIG_SUPERTX
+#if CONFIG_EXT_PARTITION_TYPES
+ PARTITION_HORZ,
+#endif
+ subsize, &pc_tree->horizontal[0], best_rdc.rdcost);
+
+#if CONFIG_SUPERTX
+ abort_flag =
+ (sum_rdc.rdcost >= best_rd && (bsize > BLOCK_8X8 || unify_bsize)) ||
+ (sum_rdc.rate == INT_MAX && bsize == BLOCK_8X8);
+ if (sum_rdc.rdcost < INT64_MAX &&
+#else
+ if (sum_rdc.rdcost < best_rdc.rdcost &&
+#endif // CONFIG_SUPERTX
+ !force_horz_split && (bsize > BLOCK_8X8 || unify_bsize)) {
+ PICK_MODE_CONTEXT *ctx_h = &pc_tree->horizontal[0];
+ update_state(cpi, td, ctx_h, mi_row, mi_col, subsize, 1);
+ encode_superblock(cpi, td, tp, DRY_RUN_NORMAL, mi_row, mi_col, subsize,
+ ctx_h, NULL);
+
+ if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx_h);
+
+#if CONFIG_DUAL_FILTER
+ if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 &&
+ partition_none_allowed)
+ pc_tree->horizontal[1].pred_interp_filter =
+ ctx_h->mic.mbmi.interp_filter[0];
+#else
+ if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 &&
+ partition_none_allowed)
+ pc_tree->horizontal[1].pred_interp_filter =
+ ctx_none->mic.mbmi.interp_filter;
+#endif
+#if CONFIG_SUPERTX
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row + mi_step, mi_col, &this_rdc,
+ &this_rate_nocoef,
+#if CONFIG_EXT_PARTITION_TYPES
+ PARTITION_HORZ,
+#endif
+ subsize, &pc_tree->horizontal[1], INT64_MAX);
+#else
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row + mi_step, mi_col, &this_rdc,
+#if CONFIG_EXT_PARTITION_TYPES
+ PARTITION_HORZ,
+#endif
+ subsize, &pc_tree->horizontal[1],
+ best_rdc.rdcost - sum_rdc.rdcost);
+#endif // CONFIG_SUPERTX
+ if (this_rdc.rate == INT_MAX) {
+ sum_rdc.rdcost = INT64_MAX;
+#if CONFIG_SUPERTX
+ sum_rate_nocoef = INT_MAX;
+#endif // CONFIG_SUPERTX
+ } else {
+ sum_rdc.rate += this_rdc.rate;
+ sum_rdc.dist += this_rdc.dist;
+ sum_rdc.rdcost += this_rdc.rdcost;
+#if CONFIG_SUPERTX
+ sum_rate_nocoef += this_rate_nocoef;
+#endif // CONFIG_SUPERTX
+ }
+ }
+
+#if CONFIG_SUPERTX
+ if (supertx_allowed && sum_rdc.rdcost < INT64_MAX && !abort_flag) {
+ TX_SIZE supertx_size = max_txsize_lookup[bsize];
+ const PARTITION_TYPE best_partition = pc_tree->partitioning;
+
+ pc_tree->partitioning = PARTITION_HORZ;
+
+ sum_rdc.rate += av1_cost_bit(
+ cm->fc->supertx_prob[partition_supertx_context_lookup[PARTITION_HORZ]]
+ [supertx_size],
+ 0);
+ sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist);
+
+ if (!check_intra_sb(cpi, tile_info, mi_row, mi_col, bsize, pc_tree)) {
+ TX_TYPE best_tx = DCT_DCT;
+ RD_STATS tmp_rdc;
+ av1_init_rd_stats(&tmp_rdc);
+ tmp_rdc.rate = sum_rate_nocoef;
+
+ restore_context(x, &x_ctx, mi_row, mi_col, bsize);
+
+ rd_supertx_sb(cpi, td, tile_info, mi_row, mi_col, bsize, &tmp_rdc.rate,
+ &tmp_rdc.dist, &best_tx, pc_tree);
+
+ tmp_rdc.rate += av1_cost_bit(
+ cm->fc
+ ->supertx_prob[partition_supertx_context_lookup[PARTITION_HORZ]]
+ [supertx_size],
+ 1);
+ tmp_rdc.rdcost =
+ RDCOST(x->rdmult, x->rddiv, tmp_rdc.rate, tmp_rdc.dist);
+ if (tmp_rdc.rdcost < sum_rdc.rdcost) {
+ sum_rdc = tmp_rdc;
+ update_supertx_param_sb(cpi, td, mi_row, mi_col, bsize, best_tx,
+ supertx_size, pc_tree);
+ }
+ }
+
+ pc_tree->partitioning = best_partition;
+ }
+#endif // CONFIG_SUPERTX
+
+ if (sum_rdc.rdcost < best_rdc.rdcost) {
+ sum_rdc.rate += partition_cost[PARTITION_HORZ];
+ sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist);
+#if CONFIG_SUPERTX
+ sum_rate_nocoef += partition_cost[PARTITION_HORZ];
+#endif // CONFIG_SUPERTX
+ if (sum_rdc.rdcost < best_rdc.rdcost) {
+ best_rdc = sum_rdc;
+#if CONFIG_SUPERTX
+ best_rate_nocoef = sum_rate_nocoef;
+ assert(best_rate_nocoef >= 0);
+#endif // CONFIG_SUPERTX
+ pc_tree->partitioning = PARTITION_HORZ;
+ }
+ }
+#if !CONFIG_PVQ
+ restore_context(x, &x_ctx, mi_row, mi_col, bsize);
+#else
+ restore_context(x, &x_ctx, mi_row, mi_col, &pre_rdo_buf, bsize);
+#endif
+ }
+
+ // PARTITION_VERT
+ if (partition_vert_allowed &&
+ (do_rectangular_split || av1_active_v_edge(cpi, mi_col, mi_step))) {
+ subsize = get_subsize(bsize, PARTITION_VERT);
+
+ if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx_none);
+
+#if CONFIG_DUAL_FILTER
+ if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 &&
+ partition_none_allowed)
+ pc_tree->vertical[0].pred_interp_filter =
+ ctx_none->mic.mbmi.interp_filter[0];
+#else
+ if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 &&
+ partition_none_allowed)
+ pc_tree->vertical[0].pred_interp_filter =
+ ctx_none->mic.mbmi.interp_filter;
+#endif
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc,
+#if CONFIG_SUPERTX
+ &sum_rate_nocoef,
+#endif // CONFIG_SUPERTX
+#if CONFIG_EXT_PARTITION_TYPES
+ PARTITION_VERT,
+#endif
+ subsize, &pc_tree->vertical[0], best_rdc.rdcost);
+#if CONFIG_SUPERTX
+ abort_flag =
+ (sum_rdc.rdcost >= best_rd && (bsize > BLOCK_8X8 || unify_bsize)) ||
+ (sum_rdc.rate == INT_MAX && bsize == BLOCK_8X8);
+ if (sum_rdc.rdcost < INT64_MAX &&
+#else
+ if (sum_rdc.rdcost < best_rdc.rdcost &&
+#endif // CONFIG_SUPERTX
+ !force_vert_split && (bsize > BLOCK_8X8 || unify_bsize)) {
+ update_state(cpi, td, &pc_tree->vertical[0], mi_row, mi_col, subsize, 1);
+ encode_superblock(cpi, td, tp, DRY_RUN_NORMAL, mi_row, mi_col, subsize,
+ &pc_tree->vertical[0], NULL);
+
+ if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx_none);
+
+#if CONFIG_DUAL_FILTER
+ if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 &&
+ partition_none_allowed)
+ pc_tree->vertical[1].pred_interp_filter =
+ ctx_none->mic.mbmi.interp_filter[0];
+#else
+ if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 &&
+ partition_none_allowed)
+ pc_tree->vertical[1].pred_interp_filter =
+ ctx_none->mic.mbmi.interp_filter;
+#endif
+#if CONFIG_SUPERTX
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + mi_step, &this_rdc,
+ &this_rate_nocoef,
+#if CONFIG_EXT_PARTITION_TYPES
+ PARTITION_VERT,
+#endif
+ subsize, &pc_tree->vertical[1],
+ INT64_MAX - sum_rdc.rdcost);
+#else
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + mi_step, &this_rdc,
+#if CONFIG_EXT_PARTITION_TYPES
+ PARTITION_VERT,
+#endif
+ subsize, &pc_tree->vertical[1],
+ best_rdc.rdcost - sum_rdc.rdcost);
+#endif // CONFIG_SUPERTX
+ if (this_rdc.rate == INT_MAX) {
+ sum_rdc.rdcost = INT64_MAX;
+#if CONFIG_SUPERTX
+ sum_rate_nocoef = INT_MAX;
+#endif // CONFIG_SUPERTX
+ } else {
+ sum_rdc.rate += this_rdc.rate;
+ sum_rdc.dist += this_rdc.dist;
+ sum_rdc.rdcost += this_rdc.rdcost;
+#if CONFIG_SUPERTX
+ sum_rate_nocoef += this_rate_nocoef;
+#endif // CONFIG_SUPERTX
+ }
+ }
+#if CONFIG_SUPERTX
+ if (supertx_allowed && sum_rdc.rdcost < INT64_MAX && !abort_flag) {
+ TX_SIZE supertx_size = max_txsize_lookup[bsize];
+ const PARTITION_TYPE best_partition = pc_tree->partitioning;
+
+ pc_tree->partitioning = PARTITION_VERT;
+
+ sum_rdc.rate += av1_cost_bit(
+ cm->fc->supertx_prob[partition_supertx_context_lookup[PARTITION_VERT]]
+ [supertx_size],
+ 0);
+ sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist);
+
+ if (!check_intra_sb(cpi, tile_info, mi_row, mi_col, bsize, pc_tree)) {
+ TX_TYPE best_tx = DCT_DCT;
+ RD_STATS tmp_rdc;
+ av1_init_rd_stats(&tmp_rdc);
+ tmp_rdc.rate = sum_rate_nocoef;
+
+ restore_context(x, &x_ctx, mi_row, mi_col, bsize);
+
+ rd_supertx_sb(cpi, td, tile_info, mi_row, mi_col, bsize, &tmp_rdc.rate,
+ &tmp_rdc.dist, &best_tx, pc_tree);
+
+ tmp_rdc.rate += av1_cost_bit(
+ cm->fc
+ ->supertx_prob[partition_supertx_context_lookup[PARTITION_VERT]]
+ [supertx_size],
+ 1);
+ tmp_rdc.rdcost =
+ RDCOST(x->rdmult, x->rddiv, tmp_rdc.rate, tmp_rdc.dist);
+ if (tmp_rdc.rdcost < sum_rdc.rdcost) {
+ sum_rdc = tmp_rdc;
+ update_supertx_param_sb(cpi, td, mi_row, mi_col, bsize, best_tx,
+ supertx_size, pc_tree);
+ }
+ }
+
+ pc_tree->partitioning = best_partition;
+ }
+#endif // CONFIG_SUPERTX
+
+ if (sum_rdc.rdcost < best_rdc.rdcost) {
+ sum_rdc.rate += partition_cost[PARTITION_VERT];
+ sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist);
+#if CONFIG_SUPERTX
+ sum_rate_nocoef += partition_cost[PARTITION_VERT];
+#endif // CONFIG_SUPERTX
+ if (sum_rdc.rdcost < best_rdc.rdcost) {
+ best_rdc = sum_rdc;
+#if CONFIG_SUPERTX
+ best_rate_nocoef = sum_rate_nocoef;
+ assert(best_rate_nocoef >= 0);
+#endif // CONFIG_SUPERTX
+ pc_tree->partitioning = PARTITION_VERT;
+ }
+ }
+#if !CONFIG_PVQ
+ restore_context(x, &x_ctx, mi_row, mi_col, bsize);
+#else
+ restore_context(x, &x_ctx, mi_row, mi_col, &pre_rdo_buf, bsize);
+#endif
+ }
+
+#if CONFIG_EXT_PARTITION_TYPES
+ // PARTITION_HORZ_A
+ if (partition_horz_allowed && do_rectangular_split && bsize > BLOCK_8X8 &&
+ partition_none_allowed) {
+ subsize = get_subsize(bsize, PARTITION_HORZ_A);
+ rd_test_partition3(cpi, td, tile_data, tp, pc_tree, &best_rdc,
+ pc_tree->horizontala, ctx_none, mi_row, mi_col, bsize,
+ PARTITION_HORZ_A,
+#if CONFIG_SUPERTX
+ best_rd, &best_rate_nocoef, &x_ctx,
+#endif
+ mi_row, mi_col, bsize2, mi_row, mi_col + mi_step, bsize2,
+ mi_row + mi_step, mi_col, subsize);
+ restore_context(x, &x_ctx, mi_row, mi_col, bsize);
+ }
+ // PARTITION_HORZ_B
+ if (partition_horz_allowed && do_rectangular_split && bsize > BLOCK_8X8 &&
+ partition_none_allowed) {
+ subsize = get_subsize(bsize, PARTITION_HORZ_B);
+ rd_test_partition3(cpi, td, tile_data, tp, pc_tree, &best_rdc,
+ pc_tree->horizontalb, ctx_none, mi_row, mi_col, bsize,
+ PARTITION_HORZ_B,
+#if CONFIG_SUPERTX
+ best_rd, &best_rate_nocoef, &x_ctx,
+#endif
+ mi_row, mi_col, subsize, mi_row + mi_step, mi_col,
+ bsize2, mi_row + mi_step, mi_col + mi_step, bsize2);
+ restore_context(x, &x_ctx, mi_row, mi_col, bsize);
+ }
+ // PARTITION_VERT_A
+ if (partition_vert_allowed && do_rectangular_split && bsize > BLOCK_8X8 &&
+ partition_none_allowed) {
+ subsize = get_subsize(bsize, PARTITION_VERT_A);
+ rd_test_partition3(cpi, td, tile_data, tp, pc_tree, &best_rdc,
+ pc_tree->verticala, ctx_none, mi_row, mi_col, bsize,
+ PARTITION_VERT_A,
+#if CONFIG_SUPERTX
+ best_rd, &best_rate_nocoef, &x_ctx,
+#endif
+ mi_row, mi_col, bsize2, mi_row + mi_step, mi_col, bsize2,
+ mi_row, mi_col + mi_step, subsize);
+ restore_context(x, &x_ctx, mi_row, mi_col, bsize);
+ }
+ // PARTITION_VERT_B
+ if (partition_vert_allowed && do_rectangular_split && bsize > BLOCK_8X8 &&
+ partition_none_allowed) {
+ subsize = get_subsize(bsize, PARTITION_VERT_B);
+ rd_test_partition3(cpi, td, tile_data, tp, pc_tree, &best_rdc,
+ pc_tree->verticalb, ctx_none, mi_row, mi_col, bsize,
+ PARTITION_VERT_B,
+#if CONFIG_SUPERTX
+ best_rd, &best_rate_nocoef, &x_ctx,
+#endif
+ mi_row, mi_col, subsize, mi_row, mi_col + mi_step,
+ bsize2, mi_row + mi_step, mi_col + mi_step, bsize2);
+ restore_context(x, &x_ctx, mi_row, mi_col, bsize);
+ }
+#endif // CONFIG_EXT_PARTITION_TYPES
+
+ // TODO(jbb): This code added so that we avoid static analysis
+ // warning related to the fact that best_rd isn't used after this
+ // point. This code should be refactored so that the duplicate
+ // checks occur in some sub function and thus are used...
+ (void)best_rd;
+ *rd_cost = best_rdc;
+#if CONFIG_SUPERTX
+ *rate_nocoef = best_rate_nocoef;
+#endif // CONFIG_SUPERTX
+
+#if CONFIG_CFL
+ // Store the luma for the best mode
+ x->cfl_store_y = 1;
+#endif
+ if (best_rdc.rate < INT_MAX && best_rdc.dist < INT64_MAX &&
+ pc_tree->index != 3) {
+ if (bsize == cm->sb_size) {
+#if CONFIG_MOTION_VAR && CONFIG_NCOBMC
+ set_mode_info_sb(cpi, td, tile_info, tp, mi_row, mi_col, bsize, pc_tree);
+#endif
+ encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, OUTPUT_ENABLED, bsize,
+ pc_tree, NULL);
+ } else {
+ encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize,
+ pc_tree, NULL);
+ }
+ }
+#if CONFIG_CFL
+ x->cfl_store_y = 0;
+#endif
+
+ if (bsize == cm->sb_size) {
+#if !CONFIG_PVQ && !CONFIG_LV_MAP
+ assert(tp_orig < *tp || (tp_orig == *tp && xd->mi[0]->mbmi.skip));
+#endif
+ assert(best_rdc.rate < INT_MAX);
+ assert(best_rdc.dist < INT64_MAX);
+ } else {
+ assert(tp_orig == *tp);
+ }
+}
+
+ // Runs the partition search for one superblock row of a tile.
+ //
+ // Walks mi_col from tile_info->mi_col_start to tile_info->mi_col_end in steps
+ // of cm->mib_size and, for every superblock, dispatches to one of the
+ // partition routines (rd_use_partition with a fixed / variance-based
+ // partitioning, or the full rd_pick_partition search) depending on the speed
+ // features and on whether the segment skip feature is active.
+ //
+ // cpi       - encoder instance; provides the common state (cpi->common) and
+ //             the speed features (cpi->sf).
+ // td        - per-thread data; provides the macroblock (td->mb) and the
+ //             partition tree storage (td->pc_root, td->pc_tree, td->leaf_tree).
+ // tile_data - tile being encoded; its tile_info gives the column range.
+ // mi_row    - row of this superblock row, used as the row index into
+ //             cm->mi_grid_visible.
+ // tp        - token pointer forwarded unchanged to the partition routines.
+ static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td,
+ TileDataEnc *tile_data, int mi_row,
+ TOKENEXTRA **tp) {
+ AV1_COMMON *const cm = &cpi->common;
+ const TileInfo *const tile_info = &tile_data->tile_info;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ SPEED_FEATURES *const sf = &cpi->sf;
+ int mi_col;
+ // Number of td->leaf_tree / td->pc_tree entries that are reset per
+ // superblock when sf->adaptive_pred_interp_filter is enabled.
+ #if CONFIG_EXT_PARTITION
+ const int leaf_nodes = 256;
+ #else
+ const int leaf_nodes = 64;
+ #endif // CONFIG_EXT_PARTITION
+
+ // Initialize the left context for the new SB row
+ av1_zero_left_context(xd);
+
+ #if CONFIG_DELTA_Q
+ // Reset delta for every tile
+ if (cm->delta_q_present_flag)
+ if (mi_row == tile_info->mi_row_start) xd->prev_qindex = cm->base_qindex;
+ #if CONFIG_EXT_DELTA_Q
+ if (cm->delta_lf_present_flag)
+ if (mi_row == tile_info->mi_row_start) xd->prev_delta_lf_from_base = 0;
+ #endif
+ #endif
+
+ // Code each SB in the row
+ for (mi_col = tile_info->mi_col_start; mi_col < tile_info->mi_col_end;
+ mi_col += cm->mib_size) {
+ const struct segmentation *const seg = &cm->seg;
+ // The dummy_* variables only receive the outputs of the partition
+ // routines; this function never reads them afterwards.
+ int dummy_rate;
+ int64_t dummy_dist;
+ RD_STATS dummy_rdc;
+ #if CONFIG_SUPERTX
+ int dummy_rate_nocoef;
+ #endif // CONFIG_SUPERTX
+ int i;
+ int seg_skip = 0;
+
+ // Position of this superblock's first entry in the visible mode-info grid.
+ const int idx_str = cm->mi_stride * mi_row + mi_col;
+ MODE_INFO **mi = cm->mi_grid_visible + idx_str;
+ // Partition tree root, selected by cm->mib_size_log2.
+ PC_TREE *const pc_root = td->pc_root[cm->mib_size_log2 - MIN_MIB_SIZE_LOG2];
+
+ av1_update_boundary_info(cm, tile_info, mi_row, mi_col);
+
+ // Reset pred_interp_filter to SWITCHABLE in every leaf context and in the
+ // horizontal/vertical contexts of every pc_tree node before searching this
+ // superblock.
+ if (sf->adaptive_pred_interp_filter) {
+ for (i = 0; i < leaf_nodes; ++i)
+ td->leaf_tree[i].pred_interp_filter = SWITCHABLE;
+
+ for (i = 0; i < leaf_nodes; ++i) {
+ td->pc_tree[i].vertical[0].pred_interp_filter = SWITCHABLE;
+ td->pc_tree[i].vertical[1].pred_interp_filter = SWITCHABLE;
+ td->pc_tree[i].horizontal[0].pred_interp_filter = SWITCHABLE;
+ td->pc_tree[i].horizontal[1].pred_interp_filter = SWITCHABLE;
+ }
+ }
+
+ av1_zero(x->pred_mv);
+ pc_root->index = 0;
+
+ // With segmentation enabled, look up this superblock's segment id (from
+ // cpi->segmentation_map when the map is being updated, otherwise from
+ // cm->last_frame_seg_map) and record whether SEG_LVL_SKIP is active for it.
+ if (seg->enabled) {
+ const uint8_t *const map =
+ seg->update_map ? cpi->segmentation_map : cm->last_frame_seg_map;
+ int segment_id = get_segment_id(cm, map, cm->sb_size, mi_row, mi_col);
+ seg_skip = segfeature_active(seg, segment_id, SEG_LVL_SKIP);
+ }
+
+ #if CONFIG_DELTA_Q
+ if (cm->delta_q_present_flag) {
+ // Test mode for delta quantization
+ // NOTE(review): the ">> 3" shifts and the BLOCK_64X64 passed to
+ // set_offsets() below appear to assume a fixed 64x64 superblock, while the
+ // rest of this function uses cm->sb_size / cm->mib_size -- confirm this is
+ // intended when CONFIG_EXT_PARTITION is enabled.
+ int sb_row = mi_row >> 3;
+ int sb_col = mi_col >> 3;
+ int sb_stride = (cm->width + MAX_SB_SIZE - 1) >> MAX_SB_SIZE_LOG2;
+ // index lies in [-16, 15]; it is derived purely from the superblock
+ // position, so the resulting q offset is a deterministic spatial pattern.
+ int index = ((sb_row * sb_stride + sb_col + 8) & 31) - 16;
+
+ // Ensure divisibility of delta_qindex by delta_q_res
+ // offset_qindex folds index into the range [-8, 8].
+ int offset_qindex = (index < 0 ? -index - 8 : index - 8);
+ // Masking with qmask rounds to a multiple of delta_q_res (assumes
+ // delta_q_res is a power of two -- TODO confirm).
+ int qmask = ~(cm->delta_q_res - 1);
+ int current_qindex = clamp(cm->base_qindex + offset_qindex,
+ cm->delta_q_res, 256 - cm->delta_q_res);
+
+ current_qindex =
+ ((current_qindex - cm->base_qindex + cm->delta_q_res / 2) & qmask) +
+ cm->base_qindex;
+ assert(current_qindex > 0);
+
+ xd->delta_qindex = current_qindex - cm->base_qindex;
+ set_offsets(cpi, tile_info, x, mi_row, mi_col, BLOCK_64X64);
+ xd->mi[0]->mbmi.current_q_index = current_qindex;
+ #if !CONFIG_EXT_DELTA_Q
+ xd->mi[0]->mbmi.segment_id = 0;
+ #endif // CONFIG_EXT_DELTA_Q
+ av1_init_plane_quantizers(cpi, x, xd->mi[0]->mbmi.segment_id);
+ #if CONFIG_EXT_DELTA_Q
+ if (cpi->oxcf.deltaq_mode == DELTA_Q_LF) {
+ int j, k;
+ // Same rounding scheme as qmask above, applied with delta_lf_res.
+ int lfmask = ~(cm->delta_lf_res - 1);
+ int current_delta_lf_from_base = offset_qindex / 2;
+ current_delta_lf_from_base =
+ ((current_delta_lf_from_base + cm->delta_lf_res / 2) & lfmask);
+
+ // pre-set the delta lf for loop filter. Note that this value is set
+ // before mi is assigned for each block in current superblock
+ // The AOMMIN() bounds keep the writes inside the frame for superblocks
+ // that extend past the bottom or right edge.
+ for (j = 0; j < AOMMIN(cm->mib_size, cm->mi_rows - mi_row); j++) {
+ for (k = 0; k < AOMMIN(cm->mib_size, cm->mi_cols - mi_col); k++) {
+ cm->mi[(mi_row + j) * cm->mi_stride + (mi_col + k)]
+ .mbmi.current_delta_lf_from_base = current_delta_lf_from_base;
+ }
+ }
+ }
+ #endif // CONFIG_EXT_DELTA_Q
+ }
+ #endif // CONFIG_DELTA_Q
+
+ x->source_variance = UINT_MAX;
+ // Choose how the partitioning of this superblock is decided.
+ if (sf->partition_search_type == FIXED_PARTITION || seg_skip) {
+ // Fixed partitioning: the whole superblock when segment skip is active,
+ // otherwise the block size configured in sf->always_this_block_size.
+ BLOCK_SIZE bsize;
+ set_offsets(cpi, tile_info, x, mi_row, mi_col, cm->sb_size);
+ bsize = seg_skip ? cm->sb_size : sf->always_this_block_size;
+ set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize);
+ rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, cm->sb_size,
+ &dummy_rate, &dummy_dist,
+ #if CONFIG_SUPERTX
+ &dummy_rate_nocoef,
+ #endif // CONFIG_SUPERTX
+ 1, pc_root);
+ } else if (cpi->partition_search_skippable_frame) {
+ // Fixed partitioning with the block size returned by
+ // get_rd_var_based_fixed_partition().
+ BLOCK_SIZE bsize;
+ set_offsets(cpi, tile_info, x, mi_row, mi_col, cm->sb_size);
+ bsize = get_rd_var_based_fixed_partition(cpi, x, mi_row, mi_col);
+ set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize);
+ rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, cm->sb_size,
+ &dummy_rate, &dummy_dist,
+ #if CONFIG_SUPERTX
+ &dummy_rate_nocoef,
+ #endif // CONFIG_SUPERTX
+ 1, pc_root);
+ } else if (sf->partition_search_type == VAR_BASED_PARTITION) {
+ // Partitioning is chosen by choose_partitioning() and then evaluated.
+ choose_partitioning(cpi, td, tile_info, x, mi_row, mi_col);
+ rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, cm->sb_size,
+ &dummy_rate, &dummy_dist,
+ #if CONFIG_SUPERTX
+ &dummy_rate_nocoef,
+ #endif // CONFIG_SUPERTX
+ 1, pc_root);
+ } else {
+ // If required set upper and lower partition size limits
+ if (sf->auto_min_max_partition_size) {
+ set_offsets(cpi, tile_info, x, mi_row, mi_col, cm->sb_size);
+ rd_auto_partition_range(cpi, tile_info, xd, mi_row, mi_col,
+ &x->min_partition_size, &x->max_partition_size);
+ }
+ // Full RD partition search, started with INT64_MAX as the cost bound.
+ rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, cm->sb_size,
+ &dummy_rdc,
+ #if CONFIG_SUPERTX
+ &dummy_rate_nocoef,
+ #endif // CONFIG_SUPERTX
+ INT64_MAX, pc_root);
+ }
+ }
+ #if CONFIG_SUBFRAME_PROB_UPDATE
+ // After the row is done, optionally adapt the coefficient probabilities
+ // part-way through the frame and snapshot the probabilities and counts into
+ // the next cpi->subframe_stats buffer slot.
+ if (cm->do_subframe_update &&
+ cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) {
+ const int mi_rows_per_update =
+ MI_SIZE * AOMMAX(cm->mi_rows / MI_SIZE / COEF_PROBS_BUFS, 1);
+ if ((mi_row + MI_SIZE) % mi_rows_per_update == 0 &&
+ mi_row + MI_SIZE < cm->mi_rows &&
+ cm->coef_probs_update_idx < COEF_PROBS_BUFS - 1) {
+ TX_SIZE t;
+ SUBFRAME_STATS *subframe_stats = &cpi->subframe_stats;
+
+ for (t = 0; t < TX_SIZES; ++t)
+ av1_full_to_model_counts(cpi->td.counts->coef[t],
+ cpi->td.rd_counts.coef_counts[t]);
+ // Note: mi_col holds the value left by the loop above, i.e. the first
+ // column at or beyond tile_info->mi_col_end.
+ av1_partial_adapt_probs(cm, mi_row, mi_col);
+ ++cm->coef_probs_update_idx;
+ av1_copy(subframe_stats->coef_probs_buf[cm->coef_probs_update_idx],
+ cm->fc->coef_probs);
+ av1_copy(subframe_stats->coef_counts_buf[cm->coef_probs_update_idx],
+ cpi->td.rd_counts.coef_counts);
+ av1_copy(subframe_stats->eob_counts_buf[cm->coef_probs_update_idx],
+ cm->counts.eob_branch);
+ av1_fill_token_costs(x->token_costs, cm->fc->coef_probs);
+ }
+ }
+ #endif // CONFIG_SUBFRAME_PROB_UPDATE
+ }
+
+ // Prepares the encoder's main macroblock (cpi->td.mb) for a new frame:
+ // points its source planes at cpi->source (offset 0, 0) and configures the
+ // block planes of its MACROBLOCKD with the frame's chroma subsampling.
+ static void init_encode_frame_mb_context(AV1_COMP *cpi) {
+   MACROBLOCK *const mb = &cpi->td.mb;
+   const AV1_COMMON *const common = &cpi->common;
+
+   // Hook the source frame buffers into the macroblock.
+   av1_setup_src_planes(mb, cpi->source, 0, 0);
+
+   // Configure the per-plane descriptors using the frame's subsampling.
+   av1_setup_block_planes(&mb->e_mbd, common->subsampling_x,
+                          common->subsampling_y);
+ }
+
+ #if !CONFIG_REF_ADAPT
+ // Returns 1 when at least two reference frames are enabled in
+ // cpi->ref_frame_flags, and 0 otherwise. Always returns 0 when the
+ // SEG_LVL_REF_FRAME feature is active for segment 1.
+ static int check_dual_ref_flags(AV1_COMP *cpi) {
+   const int ref_flags = cpi->ref_frame_flags;
+   int enabled_refs = 0;
+
+   if (segfeature_active(&cpi->common.seg, 1, SEG_LVL_REF_FRAME)) return 0;
+
+   // Count every reference whose flag bit is set.
+   if (ref_flags & AOM_GOLD_FLAG) ++enabled_refs;
+   if (ref_flags & AOM_LAST_FLAG) ++enabled_refs;
+ #if CONFIG_EXT_REFS
+   if (ref_flags & AOM_LAST2_FLAG) ++enabled_refs;
+   if (ref_flags & AOM_LAST3_FLAG) ++enabled_refs;
+   if (ref_flags & AOM_BWD_FLAG) ++enabled_refs;
+ #endif  // CONFIG_EXT_REFS
+   if (ref_flags & AOM_ALT_FLAG) ++enabled_refs;
+
+   return enabled_refs >= 2;
+ }
+ #endif  // !CONFIG_REF_ADAPT
+
+ #if !CONFIG_VAR_TX
+ // Clamps the transform size stored in every visible mode-info entry: any
+ // entry whose tx_size maps (via txsize_sqr_up_map) to something larger than
+ // max_tx_size has its tx_size overwritten with max_tx_size.
+ static void reset_skip_tx_size(AV1_COMMON *cm, TX_SIZE max_tx_size) {
+   const int stride = cm->mi_stride;
+   MODE_INFO **row_base = cm->mi_grid_visible;
+   int row;
+
+   for (row = 0; row < cm->mi_rows; ++row) {
+     int col;
+     for (col = 0; col < cm->mi_cols; ++col) {
+       MODE_INFO *const entry = row_base[col];
+       if (txsize_sqr_up_map[entry->mbmi.tx_size] > max_tx_size) {
+         entry->mbmi.tx_size = max_tx_size;
+       }
+     }
+     // Step to the first entry of the next grid row.
+     row_base += stride;
+   }
+ }
+ #endif
+
+ // Classifies the frame currently being encoded as INTRA_FRAME, ALTREF_FRAME,
+ // GOLDEN_FRAME or LAST_FRAME, based on the encoder's refresh flags and
+ // rate-control state. The checks are made in that priority order.
+ static MV_REFERENCE_FRAME get_frame_type(const AV1_COMP *cpi) {
+   int is_arf_source;
+
+   if (frame_is_intra_only(&cpi->common)) return INTRA_FRAME;
+
+   is_arf_source = cpi->rc.is_src_frame_alt_ref && cpi->refresh_golden_frame;
+ #if CONFIG_EXT_REFS
+   // We will not update the golden frame with an internal overlay frame
+   is_arf_source = is_arf_source || cpi->rc.is_src_frame_ext_arf;
+ #endif
+   if (is_arf_source) return ALTREF_FRAME;
+
+   if (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)
+     return GOLDEN_FRAME;
+
+   // TODO(zoeliu): To investigate whether a frame_type other than
+   // INTRA/ALTREF/GOLDEN/LAST needs to be specified separately.
+   return LAST_FRAME;
+ }
+
+static TX_MODE select_tx_mode(const AV1_COMP *cpi, MACROBLOCKD *const xd) {  // Chooses the frame-level TX_MODE from the lossless flags and sf.tx_size_search_method.
+ int i, all_lossless = 1;
+
+ if (cpi->common.seg.enabled) {
+ for (i = 0; i < MAX_SEGMENTS; ++i) {  // With segmentation on, every segment must be lossless.
+ if (!xd->lossless[i]) {
+ all_lossless = 0;
+ break;
+ }
+ }
+ } else {
+ all_lossless = xd->lossless[0];  // Without segmentation only entry 0 is consulted.
+ }
+ if (all_lossless) return ONLY_4X4;
+ if (cpi->sf.tx_size_search_method == USE_LARGESTALL)
+ return ALLOW_32X32 + CONFIG_TX64X64;  // Adds the flag's numeric value; presumably yields ALLOW_64X64 when CONFIG_TX64X64 is 1 -- confirm against the TX_MODE enum.
+ else if (cpi->sf.tx_size_search_method == USE_FULL_RD ||
+ cpi->sf.tx_size_search_method == USE_TX_8X8)
+ return TX_MODE_SELECT;
+ else
+ return cpi->common.tx_mode;  // Any other search method keeps the current mode unchanged.
+}
+
+void av1_init_tile_data(AV1_COMP *cpi) {  // (Re)allocates cpi->tile_data when too small, then initializes per-tile info and token pointers.
+ AV1_COMMON *const cm = &cpi->common;
+ const int tile_cols = cm->tile_cols;
+ const int tile_rows = cm->tile_rows;
+ int tile_col, tile_row;
+ TOKENEXTRA *pre_tok = cpi->tile_tok[0][0];
+ unsigned int tile_tok = 0;
+
+ if (cpi->tile_data == NULL || cpi->allocated_tiles < tile_cols * tile_rows) {  // Grow-only: an array that is already large enough is reused as is.
+ if (cpi->tile_data != NULL) aom_free(cpi->tile_data);  // NOTE(review): under CONFIG_PVQ the old tiles' pvq_q.buf is not freed here -- confirm it is released elsewhere.
+ CHECK_MEM_ERROR(
+ cm, cpi->tile_data,
+ aom_memalign(32, tile_cols * tile_rows * sizeof(*cpi->tile_data)));
+ cpi->allocated_tiles = tile_cols * tile_rows;
+
+ for (tile_row = 0; tile_row < tile_rows; ++tile_row)  // Defaults below are written only for a freshly allocated array.
+ for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
+ TileDataEnc *const tile_data =
+ &cpi->tile_data[tile_row * tile_cols + tile_col];
+ int i, j;
+ for (i = 0; i < BLOCK_SIZES; ++i) {
+ for (j = 0; j < MAX_MODES; ++j) {
+ tile_data->thresh_freq_fact[i][j] = 32;
+ tile_data->mode_map[i][j] = j;  // Identity mapping: mode_map[i][j] starts out equal to j.
+ }
+ }
+#if CONFIG_PVQ
+ // This will be dynamically increased as more pvq block is encoded.
+ tile_data->pvq_q.buf_len = 1000;
+ CHECK_MEM_ERROR(
+ cm, tile_data->pvq_q.buf,
+ aom_malloc(tile_data->pvq_q.buf_len * sizeof(PVQ_INFO)));
+ tile_data->pvq_q.curr_pos = 0;
+#endif
+ }
+ }
+
+ for (tile_row = 0; tile_row < tile_rows; ++tile_row) {  // This pass runs on every call, not only after (re)allocation.
+ for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
+ TileInfo *const tile_info =
+ &cpi->tile_data[tile_row * tile_cols + tile_col].tile_info;
+ av1_tile_init(tile_info, cm, tile_row, tile_col);
+
+ cpi->tile_tok[tile_row][tile_col] = pre_tok + tile_tok;  // Each tile starts where the previous tile's token allotment ends.
+ pre_tok = cpi->tile_tok[tile_row][tile_col];
+ tile_tok = allocated_tokens(*tile_info);
+#if CONFIG_PVQ
+ cpi->tile_data[tile_row * tile_cols + tile_col].pvq_q.curr_pos = 0;
+#endif
+ }
+ }
+}
+
+void av1_encode_tile(AV1_COMP *cpi, ThreadData *td, int tile_row,  // Encodes every superblock row of one tile using the thread data in td.
+ int tile_col) {
+ AV1_COMMON *const cm = &cpi->common;
+ TileDataEnc *const this_tile =
+ &cpi->tile_data[tile_row * cm->tile_cols + tile_col];
+ const TileInfo *const tile_info = &this_tile->tile_info;
+ TOKENEXTRA *tok = cpi->tile_tok[tile_row][tile_col];
+ int mi_row;
+
+#if CONFIG_DEPENDENT_HORZTILES
+#if CONFIG_TILE_GROUPS
+ if ((!cm->dependent_horz_tiles) || (tile_row == 0) ||
+ tile_info->tg_horz_boundary) {
+#else
+ if ((!cm->dependent_horz_tiles) || (tile_row == 0)) {
+#endif
+ av1_zero_above_context(cm, tile_info->mi_col_start, tile_info->mi_col_end);  // Skipped for dependent tiles below the first row (or group boundary).
+ }
+#else
+ av1_zero_above_context(cm, tile_info->mi_col_start, tile_info->mi_col_end);
+#endif
+
+ // Set up pointers to per thread motion search counters.
+ this_tile->m_search_count = 0; // Count of motion search hits.
+ this_tile->ex_search_count = 0; // Exhaustive mesh search hits.
+ td->mb.m_search_count_ptr = &this_tile->m_search_count;
+ td->mb.ex_search_count_ptr = &this_tile->ex_search_count;
+
+#if CONFIG_PVQ
+ td->mb.pvq_q = &this_tile->pvq_q;
+
+ // TODO(yushin) : activity masking info needs be signaled by a bitstream
+ td->mb.daala_enc.use_activity_masking = AV1_PVQ_ENABLE_ACTIVITY_MASKING;
+
+ if (td->mb.daala_enc.use_activity_masking)
+ td->mb.daala_enc.qm = OD_HVS_QM; // Hard coded. Enc/dec required to sync.
+ else
+ td->mb.daala_enc.qm = OD_FLAT_QM; // Hard coded. Enc/dec required to sync.
+
+ {
+ // FIXME: Multiple segments support
+ int segment_id = 0;
+ int rdmult = set_segment_rdmult(cpi, &td->mb, segment_id);
+ int qindex = av1_get_qindex(&cm->seg, segment_id, cm->base_qindex);
+#if CONFIG_HIGHBITDEPTH
+ const int quantizer_shift = td->mb.e_mbd.bd - 8;
+#else
+ const int quantizer_shift = 0;
+#endif // CONFIG_HIGHBITDEPTH
+ int64_t q_ac = OD_MAXI(
+ 1, av1_ac_quant(qindex, 0, cpi->common.bit_depth) >> quantizer_shift);  // Clamped to at least 1 so the divisions below never divide by zero.
+ int64_t q_dc = OD_MAXI(
+ 1, av1_dc_quant(qindex, 0, cpi->common.bit_depth) >> quantizer_shift);
+ /* td->mb.daala_enc.pvq_norm_lambda = OD_PVQ_LAMBDA; */
+ td->mb.daala_enc.pvq_norm_lambda =
+ (double)rdmult * (64 / 16) / (q_ac * q_ac * (1 << RDDIV_BITS));  // (64 / 16) is integer arithmetic and equals 4.
+ td->mb.daala_enc.pvq_norm_lambda_dc =
+ (double)rdmult * (64 / 16) / (q_dc * q_dc * (1 << RDDIV_BITS));
+ // printf("%f\n", td->mb.daala_enc.pvq_norm_lambda);
+ }
+ od_init_qm(td->mb.daala_enc.state.qm, td->mb.daala_enc.state.qm_inv,
+ td->mb.daala_enc.qm == OD_HVS_QM ? OD_QM8_Q4_HVS : OD_QM8_Q4_FLAT);
+
+ if (td->mb.daala_enc.use_activity_masking) {
+ int pli;
+ int use_masking = td->mb.daala_enc.use_activity_masking;
+ int segment_id = 0;
+ int qindex = av1_get_qindex(&cm->seg, segment_id, cm->base_qindex);
+
+ for (pli = 0; pli < MAX_MB_PLANE; pli++) {
+ int i;
+ int q;
+
+ q = qindex;
+ if (q <= OD_DEFAULT_QMS[use_masking][0][pli].interp_q << OD_COEFF_SHIFT) {  // At or below the first table entry: no second interpolation endpoint (NULL).
+ od_interp_qm(&td->mb.daala_enc.state.pvq_qm_q4[pli][0], q,
+ &OD_DEFAULT_QMS[use_masking][0][pli], NULL);
+ } else {
+ i = 0;
+ while (OD_DEFAULT_QMS[use_masking][i + 1][pli].qm_q4 != NULL &&  // Advance while the next entry exists and q is still above its interp_q.
+ q > OD_DEFAULT_QMS[use_masking][i + 1][pli].interp_q
+ << OD_COEFF_SHIFT) {
+ i++;
+ }
+ od_interp_qm(&td->mb.daala_enc.state.pvq_qm_q4[pli][0], q,
+ &OD_DEFAULT_QMS[use_masking][i][pli],
+ &OD_DEFAULT_QMS[use_masking][i + 1][pli]);
+ }
+ }
+ }
+
+#if CONFIG_DAALA_EC
+ od_ec_enc_init(&td->mb.daala_enc.w.ec, 65025);  // 65025: presumably the initial entropy-coder buffer size -- TODO confirm against od_ec_enc_init().
+#else
+#error "CONFIG_PVQ currently requires CONFIG_DAALA_EC."
+#endif
+
+#if CONFIG_DAALA_EC
+ od_ec_enc_reset(&td->mb.daala_enc.w.ec);
+#else
+#error "CONFIG_PVQ currently requires CONFIG_DAALA_EC."
+#endif
+#endif // #if CONFIG_PVQ
+
+#if CONFIG_EC_ADAPT
+ this_tile->tctx = *cm->fc;  // Each tile starts from a copy of the frame context.
+ td->mb.e_mbd.tile_ctx = &this_tile->tctx;
+#endif // #if CONFIG_EC_ADAPT
+
+#if CONFIG_CFL
+ MACROBLOCKD *const xd = &td->mb.e_mbd;
+ xd->cfl = &this_tile->cfl;
+ cfl_init(xd->cfl, cm, xd->plane[AOM_PLANE_U].subsampling_x,
+ xd->plane[AOM_PLANE_U].subsampling_y);
+#endif
+
+#if CONFIG_PVQ
+ td->mb.daala_enc.state.adapt = &this_tile->tctx.pvq_context;  // NOTE(review): tctx is only initialized above under CONFIG_EC_ADAPT -- confirm PVQ builds always enable it.
+#endif // CONFIG_PVQ
+
+ for (mi_row = tile_info->mi_row_start; mi_row < tile_info->mi_row_end;
+ mi_row += cm->mib_size) {  // One iteration per superblock row (mib_size mi units).
+ encode_rd_sb_row(cpi, td, this_tile, mi_row, &tok);
+ }
+
+ cpi->tok_count[tile_row][tile_col] =
+ (unsigned int)(tok - cpi->tile_tok[tile_row][tile_col]);  // Tokens written for this tile = distance tok advanced from the tile's start.
+ assert(cpi->tok_count[tile_row][tile_col] <= allocated_tokens(*tile_info));
+#if CONFIG_PVQ
+#if CONFIG_DAALA_EC
+ od_ec_enc_clear(&td->mb.daala_enc.w.ec);
+#else
+#error "CONFIG_PVQ currently requires CONFIG_DAALA_EC."
+#endif
+
+ td->mb.pvq_q->last_pos = td->mb.pvq_q->curr_pos;
+ // rewind current position so that bitstream can be written
+ // from the 1st pvq block
+ td->mb.pvq_q->curr_pos = 0;
+
+ td->mb.pvq_q = NULL;
+#endif
+}
+
+static void encode_tiles(AV1_COMP *cpi) {  // Single-threaded path: encodes all tiles in row-major order with cpi->td.
+ AV1_COMMON *const cm = &cpi->common;
+ int tile_col, tile_row;
+
+ av1_init_tile_data(cpi);  // Must precede the loop: it sets up cpi->tile_data and cpi->tile_tok.
+
+ for (tile_row = 0; tile_row < cm->tile_rows; ++tile_row)
+ for (tile_col = 0; tile_col < cm->tile_cols; ++tile_col)
+ av1_encode_tile(cpi, &cpi->td, tile_row, tile_col);
+}
+
+#if CONFIG_FP_MB_STATS
+static int input_fpmb_stats(FIRSTPASS_MB_STATS *firstpass_mb_stats,  // Points *this_frame_mb_stats at the current frame's first-pass MB stats; returns 1, or EOF if past the end.
+ AV1_COMMON *cm, uint8_t **this_frame_mb_stats) {
+ uint8_t *mb_stats_in = firstpass_mb_stats->mb_stats_start +
+ cm->current_video_frame * cm->MBs * sizeof(uint8_t);  // Offset = frame index * cm->MBs bytes (one byte per MB).
+
+ if (mb_stats_in > firstpass_mb_stats->mb_stats_end) return EOF;  // On EOF the output pointer is left untouched.
+
+ *this_frame_mb_stats = mb_stats_in;
+
+ return 1;
+}
+#endif
+
+#if CONFIG_GLOBAL_MOTION
+#define GLOBAL_TRANS_TYPES_ENC 3 // highest motion model to search
+static int gm_get_params_cost(WarpedMotionParams *gm,  // Returns the cost of coding gm's parameters relative to ref_gm, scaled by AV1_PROB_COST_SHIFT.
+ WarpedMotionParams *ref_gm, int allow_hp) {
+ assert(gm->wmtype < GLOBAL_TRANS_TYPES);
+ int params_cost = 0;
+ int trans_bits, trans_prec_diff;
+ switch (gm->wmtype) {  // Cases fall through, so a richer model also pays for the simpler models' parameters.
+ case HOMOGRAPHY:
+ case HORTRAPEZOID:
+ case VERTRAPEZOID:
+ if (gm->wmtype != HORTRAPEZOID)
+ params_cost += aom_count_signed_primitive_refsubexpfin(
+ GM_ROW3HOMO_MAX + 1, SUBEXPFIN_K,
+ (ref_gm->wmmat[6] >> GM_ROW3HOMO_PREC_DIFF),
+ (gm->wmmat[6] >> GM_ROW3HOMO_PREC_DIFF));
+ if (gm->wmtype != VERTRAPEZOID)
+ params_cost += aom_count_signed_primitive_refsubexpfin(
+ GM_ROW3HOMO_MAX + 1, SUBEXPFIN_K,
+ (ref_gm->wmmat[7] >> GM_ROW3HOMO_PREC_DIFF),
+ (gm->wmmat[7] >> GM_ROW3HOMO_PREC_DIFF));
+ // Fallthrough intended
+ case AFFINE:
+ case ROTZOOM:
+ params_cost += aom_count_signed_primitive_refsubexpfin(
+ GM_ALPHA_MAX + 1, SUBEXPFIN_K,
+ (ref_gm->wmmat[2] >> GM_ALPHA_PREC_DIFF) - (1 << GM_ALPHA_PREC_BITS),  // wmmat[2] and wmmat[5] are costed after subtracting (1 << GM_ALPHA_PREC_BITS).
+ (gm->wmmat[2] >> GM_ALPHA_PREC_DIFF) - (1 << GM_ALPHA_PREC_BITS));
+ if (gm->wmtype != VERTRAPEZOID)
+ params_cost += aom_count_signed_primitive_refsubexpfin(
+ GM_ALPHA_MAX + 1, SUBEXPFIN_K,
+ (ref_gm->wmmat[3] >> GM_ALPHA_PREC_DIFF),
+ (gm->wmmat[3] >> GM_ALPHA_PREC_DIFF));
+ if (gm->wmtype >= AFFINE) {  // ROTZOOM skips wmmat[4] and wmmat[5].
+ if (gm->wmtype != HORTRAPEZOID)
+ params_cost += aom_count_signed_primitive_refsubexpfin(
+ GM_ALPHA_MAX + 1, SUBEXPFIN_K,
+ (ref_gm->wmmat[4] >> GM_ALPHA_PREC_DIFF),
+ (gm->wmmat[4] >> GM_ALPHA_PREC_DIFF));
+ params_cost += aom_count_signed_primitive_refsubexpfin(
+ GM_ALPHA_MAX + 1, SUBEXPFIN_K,
+ (ref_gm->wmmat[5] >> GM_ALPHA_PREC_DIFF) -
+ (1 << GM_ALPHA_PREC_BITS),
+ (gm->wmmat[5] >> GM_ALPHA_PREC_DIFF) - (1 << GM_ALPHA_PREC_BITS));
+ }
+ // Fallthrough intended
+ case TRANSLATION:
+ trans_bits = (gm->wmtype == TRANSLATION)  // Translation-only models use their own bit width / precision, adjusted by !allow_hp.
+ ? GM_ABS_TRANS_ONLY_BITS - !allow_hp
+ : GM_ABS_TRANS_BITS;
+ trans_prec_diff = (gm->wmtype == TRANSLATION)
+ ? GM_TRANS_ONLY_PREC_DIFF + !allow_hp
+ : GM_TRANS_PREC_DIFF;
+ params_cost += aom_count_signed_primitive_refsubexpfin(
+ (1 << trans_bits) + 1, SUBEXPFIN_K,
+ (ref_gm->wmmat[0] >> trans_prec_diff),
+ (gm->wmmat[0] >> trans_prec_diff));
+ params_cost += aom_count_signed_primitive_refsubexpfin(
+ (1 << trans_bits) + 1, SUBEXPFIN_K,
+ (ref_gm->wmmat[1] >> trans_prec_diff),
+ (gm->wmmat[1] >> trans_prec_diff));
+ // Fallthrough intended
+ case IDENTITY: break;  // IDENTITY alone adds no parameter cost.
+ default: assert(0);
+ }
+ return (params_cost << AV1_PROB_COST_SHIFT);
+}
+#endif // CONFIG_GLOBAL_MOTION
+
+static void encode_frame_internal(AV1_COMP *cpi) {  // One encoding pass over the frame: per-frame setup, optional global-motion search, then tile encoding.
+ ThreadData *const td = &cpi->td;
+ MACROBLOCK *const x = &td->mb;
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ RD_COUNTS *const rdc = &cpi->td.rd_counts;
+ int i;
+#if CONFIG_TEMPMV_SIGNALING || CONFIG_EXT_REFS
+ const int last_fb_buf_idx = get_ref_frame_buf_idx(cpi, LAST_FRAME);
+#endif // CONFIG_TEMPMV_SIGNALING || CONFIG_EXT_REFS
+
+#if CONFIG_ADAPT_SCAN
+ av1_deliver_eob_threshold(cm, xd);
+#endif
+
+ x->min_partition_size = AOMMIN(x->min_partition_size, cm->sb_size);  // Partition size limits never exceed the superblock size.
+ x->max_partition_size = AOMMIN(x->max_partition_size, cm->sb_size);
+#if CONFIG_REF_MV
+ cm->setup_mi(cm);
+#endif
+
+ xd->mi = cm->mi_grid_visible;
+ xd->mi[0] = cm->mi;
+
+ av1_zero(*td->counts);  // Frame-level statistics are reset before every pass.
+ av1_zero(rdc->coef_counts);
+ av1_zero(rdc->comp_pred_diff);
+
+#if CONFIG_GLOBAL_MOTION
+ av1_zero(rdc->global_motion_used);
+ if (cpi->common.frame_type == INTER_FRAME && cpi->source &&
+ !cpi->global_motion_search_done) {  // The search runs only while this flag is clear; it is set at the end of this block.
+ YV12_BUFFER_CONFIG *ref_buf;
+ int frame;
+ double params_by_motion[RANSAC_NUM_MOTIONS * (MAX_PARAMDIM - 1)];
+ const double *params_this_motion;
+ int inliers_by_motion[RANSAC_NUM_MOTIONS];
+ WarpedMotionParams tmp_wm_params;
+ static const double kInfiniteErrAdv = 1e12;
+ static const double kIdentityParams[MAX_PARAMDIM - 1] = {
+ 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0
+ };
+
+ for (frame = LAST_FRAME; frame <= ALTREF_FRAME; ++frame) {
+ ref_buf = get_ref_frame_buffer(cpi, frame);
+ if (ref_buf) {
+ TransformationType model;
+ aom_clear_system_state();
+ for (model = ROTZOOM; model < GLOBAL_TRANS_TYPES_ENC; ++model) {  // Stops at the first model that survives as non-IDENTITY (see the break below).
+ double best_erroradvantage = kInfiniteErrAdv;
+
+ // Initially set all params to identity.
+ for (i = 0; i < RANSAC_NUM_MOTIONS; ++i) {
+ memcpy(params_by_motion + (MAX_PARAMDIM - 1) * i, kIdentityParams,
+ (MAX_PARAMDIM - 1) * sizeof(*params_by_motion));
+ }
+
+ compute_global_motion_feature_based(
+ model, cpi->source, ref_buf,
+#if CONFIG_HIGHBITDEPTH
+ cpi->common.bit_depth,
+#endif // CONFIG_HIGHBITDEPTH
+ inliers_by_motion, params_by_motion, RANSAC_NUM_MOTIONS);
+
+ for (i = 0; i < RANSAC_NUM_MOTIONS; ++i) {
+ if (inliers_by_motion[i] == 0) continue;  // Candidates without inliers are ignored.
+
+ params_this_motion = params_by_motion + (MAX_PARAMDIM - 1) * i;
+ convert_model_to_params(params_this_motion, &tmp_wm_params);
+
+ if (tmp_wm_params.wmtype != IDENTITY) {
+ const double erroradv_this_motion = refine_integerized_param(
+ &tmp_wm_params, tmp_wm_params.wmtype,
+#if CONFIG_HIGHBITDEPTH
+ xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH, xd->bd,
+#endif // CONFIG_HIGHBITDEPTH
+ ref_buf->y_buffer, ref_buf->y_width, ref_buf->y_height,
+ ref_buf->y_stride, cpi->source->y_buffer,
+ cpi->source->y_width, cpi->source->y_height,
+ cpi->source->y_stride, 3);  // 3: presumably the number of refinement steps -- TODO confirm against refine_integerized_param().
+ if (erroradv_this_motion < best_erroradvantage) {  // A smaller error advantage value is treated as better.
+ best_erroradvantage = erroradv_this_motion;
+ // Save the wm_params modified by refine_integerized_param()
+ // rather than motion index to avoid rerunning refine() below.
+ memcpy(&(cm->global_motion[frame]), &tmp_wm_params,
+ sizeof(WarpedMotionParams));
+ }
+ }
+ }
+ if (cm->global_motion[frame].wmtype <= AFFINE)
+ if (!get_shear_params(&cm->global_motion[frame]))  // Models for which get_shear_params() fails are reset to the default.
+ set_default_warp_params(&cm->global_motion[frame]);
+
+ if (cm->global_motion[frame].wmtype == TRANSLATION) {
+ cm->global_motion[frame].wmmat[0] =
+ convert_to_trans_prec(cm->allow_high_precision_mv,
+ cm->global_motion[frame].wmmat[0]) *
+ GM_TRANS_ONLY_DECODE_FACTOR;
+ cm->global_motion[frame].wmmat[1] =
+ convert_to_trans_prec(cm->allow_high_precision_mv,
+ cm->global_motion[frame].wmmat[1]) *
+ GM_TRANS_ONLY_DECODE_FACTOR;
+ }
+
+ // If the best error advantage found doesn't meet the threshold for
+ // this motion type, revert to IDENTITY.
+ if (!is_enough_erroradvantage(
+ best_erroradvantage,
+ gm_get_params_cost(&cm->global_motion[frame],
+ &cm->prev_frame->global_motion[frame],  // NOTE(review): cm->prev_frame is dereferenced without a NULL check -- confirm it is always valid here.
+ cm->allow_high_precision_mv))) {
+ set_default_warp_params(&cm->global_motion[frame]);
+ }
+
+ if (cm->global_motion[frame].wmtype != IDENTITY) break;
+ }
+ aom_clear_system_state();
+ }
+ cpi->gmparams_cost[frame] =  // Stored cost = parameter cost plus the type cost measured relative to IDENTITY.
+ gm_get_params_cost(&cm->global_motion[frame],
+ &cm->prev_frame->global_motion[frame],
+ cm->allow_high_precision_mv) +
+ cpi->gmtype_cost[cm->global_motion[frame].wmtype] -
+ cpi->gmtype_cost[IDENTITY];
+ }
+ cpi->global_motion_search_done = 1;
+ }
+ memcpy(cm->cur_frame->global_motion, cm->global_motion,
+ TOTAL_REFS_PER_FRAME * sizeof(WarpedMotionParams));
+#endif // CONFIG_GLOBAL_MOTION
+
+ for (i = 0; i < MAX_SEGMENTS; ++i) {
+ const int qindex = cm->seg.enabled
+ ? av1_get_qindex(&cm->seg, i, cm->base_qindex)
+ : cm->base_qindex;
+ xd->lossless[i] = qindex == 0 && cm->y_dc_delta_q == 0 &&  // Lossless requires qindex 0 and all three delta-q values equal to 0.
+ cm->uv_dc_delta_q == 0 && cm->uv_ac_delta_q == 0;
+ xd->qindex[i] = qindex;
+ }
+
+ if (!cm->seg.enabled && xd->lossless[0]) x->optimize = 0;  // x->optimize is cleared for a lossless frame without segmentation.
+
+ cm->tx_mode = select_tx_mode(cpi, xd);
+
+#if CONFIG_DELTA_Q
+ // Fix delta q resolution for the moment
+ cm->delta_q_res = DEFAULT_DELTA_Q_RES;
+// Set delta_q_present_flag before it is used for the first time
+#if CONFIG_EXT_DELTA_Q
+ cm->delta_lf_res = DEFAULT_DELTA_LF_RES;
+ // update delta_q_present_flag and delta_lf_present_flag based on base_qindex
+ cm->delta_q_present_flag &= cm->base_qindex > 0;
+ cm->delta_lf_present_flag &= cm->base_qindex > 0;
+#else
+ cm->delta_q_present_flag =
+ cpi->oxcf.aq_mode == DELTA_AQ && cm->base_qindex > 0;
+#endif // CONFIG_EXT_DELTA_Q
+#endif
+
+ av1_frame_init_quantizer(cpi);
+
+ av1_initialize_rd_consts(cpi);
+ av1_initialize_me_consts(cpi, x, cm->base_qindex);
+ init_encode_frame_mb_context(cpi);
+#if CONFIG_TEMPMV_SIGNALING
+ if (last_fb_buf_idx != INVALID_IDX) {
+ cm->prev_frame = &cm->buffer_pool->frame_bufs[last_fb_buf_idx];
+ cm->use_prev_frame_mvs &= !cm->error_resilient_mode &&  // Can only clear the flag here (&=), never set it.
+ cm->width == cm->prev_frame->buf.y_width &&
+ cm->height == cm->prev_frame->buf.y_height &&
+ !cm->intra_only && !cm->prev_frame->intra_only;
+ }
+#else
+ cm->use_prev_frame_mvs =
+ !cm->error_resilient_mode && cm->width == cm->last_width &&
+ cm->height == cm->last_height && !cm->intra_only && cm->last_show_frame;
+#endif
+
+#if CONFIG_EXT_REFS
+ // NOTE(zoeliu): As cm->prev_frame can take neither a frame of
+ // show_existing_frame=1, nor can it take a frame not used as
+ // a reference, it is probable that by the time it is being
+ // referred to, the frame buffer it originally points to may
+ // already get expired and have been reassigned to the current
+ // newly coded frame. Hence, we need to check whether this is
+ // the case, and if yes, we have 2 choices:
+ // (1) Simply disable the use of previous frame mvs; or
+ // (2) Have cm->prev_frame point to one reference frame buffer,
+ // e.g. LAST_FRAME.
+ if (cm->use_prev_frame_mvs && !enc_is_ref_frame_buf(cpi, cm->prev_frame)) {
+ // Reassign the LAST_FRAME buffer to cm->prev_frame.
+ cm->prev_frame = &cm->buffer_pool->frame_bufs[last_fb_buf_idx];  // NOTE(review): last_fb_buf_idx is not compared against INVALID_IDX on this path -- confirm it cannot be invalid here.
+ }
+#endif // CONFIG_EXT_REFS
+
+ // Special case: set prev_mi to NULL when the previous mode info
+ // context cannot be used.
+ cm->prev_mi =
+ cm->use_prev_frame_mvs ? cm->prev_mip + cm->mi_stride + 1 : NULL;
+
+#if CONFIG_VAR_TX
+ x->txb_split_count = 0;
+#if CONFIG_REF_MV
+ av1_zero(x->blk_skip_drl);
+#endif
+#endif
+
+ if (cpi->sf.partition_search_type == VAR_BASED_PARTITION &&
+ cpi->td.var_root[0] == NULL)  // The variance tree is set up only when var_root[0] is still NULL.
+ av1_setup_var_tree(&cpi->common, &cpi->td);
+
+ {
+ struct aom_usec_timer emr_timer;
+ aom_usec_timer_start(&emr_timer);
+
+#if CONFIG_FP_MB_STATS
+ if (cpi->use_fp_mb_stats) {
+ input_fpmb_stats(&cpi->twopass.firstpass_mb_stats, cm,  // The return value (1 or EOF) is not checked here.
+ &cpi->twopass.this_frame_mb_stats);
+ }
+#endif
+
+ // If allowed, encoding tiles in parallel with one thread handling one tile.
+ // TODO(geza.lore): The multi-threaded encoder is not safe with more than
+ // 1 tile rows, as it uses the single above_context et al arrays from
+ // cpi->common
+ if (AOMMIN(cpi->oxcf.max_threads, cm->tile_cols) > 1 && cm->tile_rows == 1)
+ av1_encode_tiles_mt(cpi);
+ else
+ encode_tiles(cpi);
+
+ aom_usec_timer_mark(&emr_timer);
+ cpi->time_encode_sb_row += aom_usec_timer_elapsed(&emr_timer);  // Accumulates the elapsed time of the tile-encoding step.
+ }
+
+#if 0
+ // Keep record of the total distortion this time around for future use
+ cpi->last_frame_distortion = cpi->frame_distortion;
+#endif
+}
+
+void av1_encode_frame(AV1_COMP *cpi) {  // Picks frame-level reference and transform modes, encodes the frame, then tightens those modes from the resulting counts.
+ AV1_COMMON *const cm = &cpi->common;
+#if CONFIG_EXT_TX
+ // Indicates whether or not to use a default reduced set for ext-tx
+ // rather than the potential full set of 16 transforms
+ cm->reduced_tx_set_used = 0;
+#endif // CONFIG_EXT_TX
+
+ // In the longer term the encoder should be generalized to match the
+ // decoder such that we allow compound where one of the 3 buffers has a
+ // different sign bias and that buffer is then the fixed ref. However, this
+ // requires further work in the rd loop. For now the only supported encoder
+ // side behavior is where the ALT ref buffer has opposite sign bias to
+ // the other two.
+ if (!frame_is_intra_only(cm)) {
+#if CONFIG_LOWDELAY_COMPOUND // Normative in encoder
+ cpi->allow_comp_inter_inter = 1;
+#if CONFIG_EXT_REFS
+ cm->comp_fwd_ref[0] = LAST_FRAME;
+ cm->comp_fwd_ref[1] = LAST2_FRAME;
+ cm->comp_fwd_ref[2] = LAST3_FRAME;
+ cm->comp_fwd_ref[3] = GOLDEN_FRAME;
+ cm->comp_bwd_ref[0] = BWDREF_FRAME;
+ cm->comp_bwd_ref[1] = ALTREF_FRAME;
+#else
+ cm->comp_fixed_ref = ALTREF_FRAME;
+ cm->comp_var_ref[0] = LAST_FRAME;
+ cm->comp_var_ref[1] = GOLDEN_FRAME;
+#endif // CONFIG_EXT_REFS
+#else
+ if ((cm->ref_frame_sign_bias[ALTREF_FRAME] ==  // Compound is disabled when ALTREF shares a sign bias with GOLDEN or LAST.
+ cm->ref_frame_sign_bias[GOLDEN_FRAME]) ||
+ (cm->ref_frame_sign_bias[ALTREF_FRAME] ==
+ cm->ref_frame_sign_bias[LAST_FRAME])) {
+ cpi->allow_comp_inter_inter = 0;
+ } else {
+ cpi->allow_comp_inter_inter = 1;
+
+#if CONFIG_EXT_REFS
+ cm->comp_fwd_ref[0] = LAST_FRAME;
+ cm->comp_fwd_ref[1] = LAST2_FRAME;
+ cm->comp_fwd_ref[2] = LAST3_FRAME;
+ cm->comp_fwd_ref[3] = GOLDEN_FRAME;
+ cm->comp_bwd_ref[0] = BWDREF_FRAME;
+ cm->comp_bwd_ref[1] = ALTREF_FRAME;
+#else
+ cm->comp_fixed_ref = ALTREF_FRAME;
+ cm->comp_var_ref[0] = LAST_FRAME;
+ cm->comp_var_ref[1] = GOLDEN_FRAME;
+#endif // CONFIG_EXT_REFS
+ }
+#endif
+ } else {
+ cpi->allow_comp_inter_inter = 0;
+ }
+
+ if (cpi->sf.frame_parameter_update) {
+ int i;
+ RD_OPT *const rd_opt = &cpi->rd;
+ FRAME_COUNTS *counts = cpi->td.counts;
+ RD_COUNTS *const rdc = &cpi->td.rd_counts;
+
+ // This code does a single RD pass over the whole frame assuming
+ // either compound, single or hybrid prediction as per whatever has
+ // worked best for that type of frame in the past.
+ // It also predicts whether another coding mode would have worked
+ // better than this coding mode. If that is the case, it remembers
+ // that for subsequent frames.
+ // It does the same analysis for transform size selection also.
+ //
+ // TODO(zoeliu): To investigate whether a frame_type other than
+ // INTRA/ALTREF/GOLDEN/LAST needs to be specified separately.
+ const MV_REFERENCE_FRAME frame_type = get_frame_type(cpi);
+ int64_t *const mode_thrs = rd_opt->prediction_type_threshes[frame_type];  // Thresholds are kept per frame type.
+ const int is_alt_ref = frame_type == ALTREF_FRAME;
+
+/* prediction (compound, single or hybrid) mode selection */
+#if CONFIG_REF_ADAPT
+ // NOTE(zoeliu): "is_alt_ref" is true only for OVERLAY/INTNL_OVERLAY frames
+ if (is_alt_ref || !cpi->allow_comp_inter_inter)
+ cm->reference_mode = SINGLE_REFERENCE;
+ else
+ cm->reference_mode = REFERENCE_MODE_SELECT;
+#else
+ if (is_alt_ref || !cpi->allow_comp_inter_inter)
+ cm->reference_mode = SINGLE_REFERENCE;
+ else if (mode_thrs[COMPOUND_REFERENCE] > mode_thrs[SINGLE_REFERENCE] &&
+ mode_thrs[COMPOUND_REFERENCE] > mode_thrs[REFERENCE_MODE_SELECT] &&
+ check_dual_ref_flags(cpi) && cpi->static_mb_pct == 100)  // Compound-only additionally needs two usable refs and static_mb_pct == 100.
+ cm->reference_mode = COMPOUND_REFERENCE;
+ else if (mode_thrs[SINGLE_REFERENCE] > mode_thrs[REFERENCE_MODE_SELECT])
+ cm->reference_mode = SINGLE_REFERENCE;
+ else
+ cm->reference_mode = REFERENCE_MODE_SELECT;
+#endif // CONFIG_REF_ADAPT
+
+#if CONFIG_DUAL_FILTER
+ cm->interp_filter = SWITCHABLE;
+#endif
+
+ encode_frame_internal(cpi);
+
+ for (i = 0; i < REFERENCE_MODES; ++i)
+ mode_thrs[i] = (mode_thrs[i] + rdc->comp_pred_diff[i] / cm->MBs) / 2;  // New threshold = mean of the old one and this frame's per-MB difference.
+
+ if (cm->reference_mode == REFERENCE_MODE_SELECT) {
+ int single_count_zero = 0;
+ int comp_count_zero = 0;
+
+ for (i = 0; i < COMP_INTER_CONTEXTS; i++) {
+ single_count_zero += counts->comp_inter[i][0];  // Despite the names, these are totals of bin 0 and bin 1 over all contexts.
+ comp_count_zero += counts->comp_inter[i][1];
+ }
+
+ if (comp_count_zero == 0) {  // Bin 1 was never counted: downgrade to single reference.
+ cm->reference_mode = SINGLE_REFERENCE;
+ av1_zero(counts->comp_inter);
+#if !CONFIG_REF_ADAPT
+ } else if (single_count_zero == 0) {
+ cm->reference_mode = COMPOUND_REFERENCE;
+ av1_zero(counts->comp_inter);
+#endif // !CONFIG_REF_ADAPT
+ }
+ }
+
+#if CONFIG_VAR_TX
+ if (cm->tx_mode == TX_MODE_SELECT && cpi->td.mb.txb_split_count == 0)  // No transform block was split in this frame.
+ cm->tx_mode = ALLOW_32X32 + CONFIG_TX64X64;
+#else
+ if (cm->tx_mode == TX_MODE_SELECT) {  // Replace TX_MODE_SELECT by a fixed mode when the counts show only one pattern was used.
+#if CONFIG_TX64X64
+ int count4x4 = 0;
+ int count8x8_8x8p = 0, count8x8_lp = 0;
+ int count16x16_16x16p = 0, count16x16_lp = 0;
+ int count32x32_32x32p = 0, count32x32_lp = 0;
+ int count64x64_64x64p = 0;
+ for (i = 0; i < TX_SIZE_CONTEXTS; ++i) {
+ // counts->tx_size[max_depth][context_idx][this_depth_level]
+ count4x4 += counts->tx_size[0][i][0];
+ count4x4 += counts->tx_size[1][i][0];
+ count4x4 += counts->tx_size[2][i][0];
+ count4x4 += counts->tx_size[3][i][0];
+
+ count8x8_8x8p += counts->tx_size[0][i][1];  // "_NxNp": presumably counts where NxN was the largest allowed size; "_lp": a larger size was allowed -- TODO confirm.
+ count8x8_lp += counts->tx_size[1][i][1];
+ count8x8_lp += counts->tx_size[2][i][1];
+ count8x8_lp += counts->tx_size[3][i][1];
+
+ count16x16_16x16p += counts->tx_size[1][i][2];
+ count16x16_lp += counts->tx_size[2][i][2];
+ count16x16_lp += counts->tx_size[3][i][2];
+
+ count32x32_32x32p += counts->tx_size[2][i][3];
+ count32x32_lp += counts->tx_size[3][i][3];
+
+ count64x64_64x64p += counts->tx_size[3][i][4];
+ }
+#if CONFIG_EXT_TX && CONFIG_RECT_TX
+ count4x4 += counts->tx_size_implied[0][TX_4X4];
+ count4x4 += counts->tx_size_implied[1][TX_4X4];
+ count4x4 += counts->tx_size_implied[2][TX_4X4];
+ count4x4 += counts->tx_size_implied[3][TX_4X4];
+ count8x8_8x8p += counts->tx_size_implied[1][TX_8X8];
+ count8x8_lp += counts->tx_size_implied[2][TX_8X8];
+ count8x8_lp += counts->tx_size_implied[3][TX_8X8];
+ count8x8_lp += counts->tx_size_implied[4][TX_8X8];
+ count16x16_16x16p += counts->tx_size_implied[2][TX_16X16];
+ count16x16_lp += counts->tx_size_implied[3][TX_16X16];
+ count16x16_lp += counts->tx_size_implied[4][TX_16X16];
+ count32x32_32x32p += counts->tx_size_implied[3][TX_32X32];
+ count32x32_lp += counts->tx_size_implied[4][TX_32X32];
+ count64x64_64x64p += counts->tx_size_implied[4][TX_64X64];
+#endif // CONFIG_EXT_TX && CONFIG_RECT_TX
+ if (count4x4 == 0 && count16x16_lp == 0 && count16x16_16x16p == 0 &&
+ count32x32_lp == 0 && count32x32_32x32p == 0 &&
+#if CONFIG_SUPERTX
+ cm->counts.supertx_size[TX_16X16] == 0 &&
+ cm->counts.supertx_size[TX_32X32] == 0 &&
+ cm->counts.supertx_size[TX_64X64] == 0 &&
+#endif
+ count64x64_64x64p == 0) {
+ cm->tx_mode = ALLOW_8X8;
+ reset_skip_tx_size(cm, TX_8X8);
+ } else if (count8x8_8x8p == 0 && count8x8_lp == 0 &&
+ count16x16_16x16p == 0 && count16x16_lp == 0 &&
+ count32x32_32x32p == 0 && count32x32_lp == 0 &&
+#if CONFIG_SUPERTX
+ cm->counts.supertx_size[TX_8X8] == 0 &&
+ cm->counts.supertx_size[TX_16X16] == 0 &&
+ cm->counts.supertx_size[TX_32X32] == 0 &&
+ cm->counts.supertx_size[TX_64X64] == 0 &&
+#endif
+ count64x64_64x64p == 0) {
+ cm->tx_mode = ONLY_4X4;
+ reset_skip_tx_size(cm, TX_4X4);
+ } else if (count4x4 == 0 && count8x8_lp == 0 && count16x16_lp == 0 &&
+ count32x32_lp == 0) {
+ cm->tx_mode = ALLOW_64X64;  // The largest mode needs no reset_skip_tx_size() call.
+ } else if (count4x4 == 0 && count8x8_lp == 0 && count16x16_lp == 0 &&
+#if CONFIG_SUPERTX
+ cm->counts.supertx_size[TX_64X64] == 0 &&
+#endif
+ count64x64_64x64p == 0) {
+ cm->tx_mode = ALLOW_32X32;
+ reset_skip_tx_size(cm, TX_32X32);
+ } else if (count4x4 == 0 && count8x8_lp == 0 && count32x32_lp == 0 &&
+ count32x32_32x32p == 0 &&
+#if CONFIG_SUPERTX
+ cm->counts.supertx_size[TX_32X32] == 0 &&
+ cm->counts.supertx_size[TX_64X64] == 0 &&
+#endif
+ count64x64_64x64p == 0) {
+ cm->tx_mode = ALLOW_16X16;
+ reset_skip_tx_size(cm, TX_16X16);
+ }
+
+#else // CONFIG_TX64X64
+
+ int count4x4 = 0;
+ int count8x8_lp = 0, count8x8_8x8p = 0;
+ int count16x16_16x16p = 0, count16x16_lp = 0;
+ int count32x32 = 0;
+ for (i = 0; i < TX_SIZE_CONTEXTS; ++i) {
+ // counts->tx_size[max_depth][context_idx][this_depth_level]
+ count4x4 += counts->tx_size[0][i][0];
+ count4x4 += counts->tx_size[1][i][0];
+ count4x4 += counts->tx_size[2][i][0];
+
+ count8x8_8x8p += counts->tx_size[0][i][1];
+ count8x8_lp += counts->tx_size[1][i][1];
+ count8x8_lp += counts->tx_size[2][i][1];
+
+ count16x16_16x16p += counts->tx_size[1][i][2];
+ count16x16_lp += counts->tx_size[2][i][2];
+ count32x32 += counts->tx_size[2][i][3];
+ }
+#if CONFIG_EXT_TX && CONFIG_RECT_TX
+ count4x4 += counts->tx_size_implied[0][TX_4X4];
+ count4x4 += counts->tx_size_implied[1][TX_4X4];
+ count4x4 += counts->tx_size_implied[2][TX_4X4];
+ count4x4 += counts->tx_size_implied[3][TX_4X4];
+ count8x8_8x8p += counts->tx_size_implied[1][TX_8X8];
+ count8x8_lp += counts->tx_size_implied[2][TX_8X8];
+ count8x8_lp += counts->tx_size_implied[3][TX_8X8];
+ count16x16_lp += counts->tx_size_implied[3][TX_16X16];
+ count16x16_16x16p += counts->tx_size_implied[2][TX_16X16];
+ count32x32 += counts->tx_size_implied[3][TX_32X32];
+#endif // CONFIG_EXT_TX && CONFIG_RECT_TX
+ if (count4x4 == 0 && count16x16_lp == 0 && count16x16_16x16p == 0 &&
+#if CONFIG_SUPERTX
+ cm->counts.supertx_size[TX_16X16] == 0 &&
+ cm->counts.supertx_size[TX_32X32] == 0 &&
+#endif // CONFIG_SUPERTX
+ count32x32 == 0) {
+ cm->tx_mode = ALLOW_8X8;
+ reset_skip_tx_size(cm, TX_8X8);
+ } else if (count8x8_8x8p == 0 && count16x16_16x16p == 0 &&
+ count8x8_lp == 0 && count16x16_lp == 0 &&
+#if CONFIG_SUPERTX
+ cm->counts.supertx_size[TX_8X8] == 0 &&
+ cm->counts.supertx_size[TX_16X16] == 0 &&
+ cm->counts.supertx_size[TX_32X32] == 0 &&
+#endif // CONFIG_SUPERTX
+ count32x32 == 0) {
+ cm->tx_mode = ONLY_4X4;
+ reset_skip_tx_size(cm, TX_4X4);
+ } else if (count8x8_lp == 0 && count16x16_lp == 0 && count4x4 == 0) {
+ cm->tx_mode = ALLOW_32X32;  // The largest mode in this configuration needs no reset_skip_tx_size() call.
+ } else if (count32x32 == 0 && count8x8_lp == 0 &&
+#if CONFIG_SUPERTX
+ cm->counts.supertx_size[TX_32X32] == 0 &&
+#endif // CONFIG_SUPERTX
+ count4x4 == 0) {
+ cm->tx_mode = ALLOW_16X16;
+ reset_skip_tx_size(cm, TX_16X16);
+ }
+#endif // CONFIG_TX64X64
+ }
+#endif
+ } else {
+ encode_frame_internal(cpi);  // Without frame_parameter_update the frame is encoded with no mode selection or adaptation in this function.
+ }
+}
+
+static void sum_intra_stats(FRAME_COUNTS *counts, MACROBLOCKD *xd,  // Adds one intra-coded block's mode choices (luma, chroma, optional tools) to counts.
+ const MODE_INFO *mi, const MODE_INFO *above_mi,
+ const MODE_INFO *left_mi, const int intraonly,
+ const int mi_row, const int mi_col) {
+ const MB_MODE_INFO *const mbmi = &mi->mbmi;
+ const PREDICTION_MODE y_mode = mbmi->mode;
+ const PREDICTION_MODE uv_mode = mbmi->uv_mode;
+ const BLOCK_SIZE bsize = mbmi->sb_type;
+ const int unify_bsize = CONFIG_CB4X4;
+
+ if (bsize < BLOCK_8X8 && !unify_bsize) {  // Sub-8x8 path: one luma mode is counted per sub-block.
+ int idx, idy;
+ const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize];
+ const int num_4x4_h = num_4x4_blocks_high_lookup[bsize];
+ for (idy = 0; idy < 2; idy += num_4x4_h)
+ for (idx = 0; idx < 2; idx += num_4x4_w) {
+ const int bidx = idy * 2 + idx;
+ const PREDICTION_MODE bmode = mi->bmi[bidx].as_mode;
+ if (intraonly) {
+ const PREDICTION_MODE a = av1_above_block_mode(mi, above_mi, bidx);
+ const PREDICTION_MODE l = av1_left_block_mode(mi, left_mi, bidx);
+ ++counts->kf_y_mode[a][l][bmode];  // The intra-only table is indexed by the above and left modes.
+ } else {
+ ++counts->y_mode[0][bmode];
+ }
+ }
+ } else {
+ if (intraonly) {
+ const PREDICTION_MODE above = av1_above_block_mode(mi, above_mi, 0);
+ const PREDICTION_MODE left = av1_left_block_mode(mi, left_mi, 0);
+ ++counts->kf_y_mode[above][left][y_mode];
+ } else {
+ ++counts->y_mode[size_group_lookup[bsize]][y_mode];
+ }
+#if CONFIG_FILTER_INTRA
+ if (mbmi->mode == DC_PRED  // The filter-intra flag is counted only for DC_PRED blocks (and, with palette, only when palette is unused).
+#if CONFIG_PALETTE
+ && mbmi->palette_mode_info.palette_size[0] == 0
+#endif // CONFIG_PALETTE
+ ) {
+ const int use_filter_intra_mode =
+ mbmi->filter_intra_mode_info.use_filter_intra_mode[0];
+ ++counts->filter_intra[0][use_filter_intra_mode];
+ }
+ if (mbmi->uv_mode == DC_PRED
+#if CONFIG_PALETTE
+ && mbmi->palette_mode_info.palette_size[1] == 0
+#endif // CONFIG_PALETTE
+ ) {
+ const int use_filter_intra_mode =
+ mbmi->filter_intra_mode_info.use_filter_intra_mode[1];
+ ++counts->filter_intra[1][use_filter_intra_mode];
+ }
+#endif // CONFIG_FILTER_INTRA
+#if CONFIG_EXT_INTRA && CONFIG_INTRA_INTERP
+ if (av1_is_directional_mode(mbmi->mode, bsize)) {
+ const int intra_filter_ctx = av1_get_pred_context_intra_interp(xd);
+ const int p_angle =
+ mode_to_angle_map[mbmi->mode] + mbmi->angle_delta[0] * ANGLE_STEP;
+ if (av1_is_intra_filter_switchable(p_angle))
+ ++counts->intra_filter[intra_filter_ctx][mbmi->intra_filter];
+ }
+#endif // CONFIG_EXT_INTRA && CONFIG_INTRA_INTERP
+ }
+
+#if CONFIG_CB4X4
+ if (!is_chroma_reference(mi_row, mi_col, bsize, xd->plane[1].subsampling_x,
+ xd->plane[1].subsampling_y))
+ return;  // The chroma mode is not counted when is_chroma_reference() reports false for this block.
+#else
+ (void)mi_row;
+ (void)mi_col;
+ (void)xd;
+#endif
+ ++counts->uv_mode[y_mode][uv_mode];
+}
+
+#if CONFIG_VAR_TX
+static void update_txfm_count(MACROBLOCK *x, MACROBLOCKD *xd,  // Recursively counts transform-partition split/no-split decisions for one transform block.
+ FRAME_COUNTS *counts, TX_SIZE tx_size, int depth,
+ int blk_row, int blk_col) {
+ MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+ const int tx_row = blk_row >> 1;
+ const int tx_col = blk_col >> 1;
+ const int max_blocks_high = max_block_high(xd, mbmi->sb_type, 0);
+ const int max_blocks_wide = max_block_wide(xd, mbmi->sb_type, 0);
+ int ctx = txfm_partition_context(xd->above_txfm_context + tx_col,
+ xd->left_txfm_context + tx_row,
+ mbmi->sb_type, tx_size);
+ const TX_SIZE plane_tx_size = mbmi->inter_tx_size[tx_row][tx_col];  // NOTE(review): read before the bounds check below -- confirm tx_row/tx_col are always in range.
+
+ if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;  // Blocks outside max_block_high/max_block_wide are not counted.
+
+ if (tx_size == plane_tx_size) {  // Not split at this level.
+ ++counts->txfm_partition[ctx][0];
+ mbmi->tx_size = tx_size;
+ txfm_partition_update(xd->above_txfm_context + tx_col,
+ xd->left_txfm_context + tx_row, tx_size, tx_size);
+ } else {
+ const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
+ const int bs = tx_size_wide_unit[sub_txs];
+ int i;
+
+ ++counts->txfm_partition[ctx][1];
+ ++x->txb_split_count;  // Checked in av1_encode_frame() to detect frames with no splits.
+
+ if (tx_size == TX_8X8) {  // An 8x8 split ends the recursion at 4x4.
+ mbmi->inter_tx_size[tx_row][tx_col] = TX_4X4;
+ mbmi->tx_size = TX_4X4;
+ txfm_partition_update(xd->above_txfm_context + tx_col,
+ xd->left_txfm_context + tx_row, TX_4X4, tx_size);
+ return;
+ }
+
+ for (i = 0; i < 4; ++i) {  // Visit the four quadrants in raster order.
+ int offsetr = (i >> 1) * bs;
+ int offsetc = (i & 0x01) * bs;
+ update_txfm_count(x, xd, counts, sub_txs, depth + 1, blk_row + offsetr,  // depth is only forwarded; it is not otherwise read in this function.
+ blk_col + offsetc);
+ }
+ }
+}
+
+// Updates the transform partition counters in td_counts for the block of size
+// plane_bsize located at (mi_row, mi_col).
+//
+// The above/left transform context pointers in xd are first re-anchored to
+// this block's position; the block is then tiled with the largest transform
+// size returned by get_vartx_max_txsize() and update_txfm_count() is run on
+// every tile.
+static void tx_partition_count_update(const AV1_COMMON *const cm, MACROBLOCK *x,
+                                      BLOCK_SIZE plane_bsize, int mi_row,
+                                      int mi_col, FRAME_COUNTS *td_counts) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  const int mi_width = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
+  // The height is scaled with the *high* table, matching
+  // tx_partition_set_contexts(); the previous code used the wide table here.
+  // NOTE(review): entry 0 of both tables is presumably identical (square
+  // minimum transform), in which case this is a consistency fix only.
+  const int mi_height = block_size_high[plane_bsize] >> tx_size_high_log2[0];
+  TX_SIZE max_tx_size = get_vartx_max_txsize(&xd->mi[0]->mbmi, plane_bsize);
+  const int bh = tx_size_high_unit[max_tx_size];
+  const int bw = tx_size_wide_unit[max_tx_size];
+  int idx, idy;
+
+  xd->above_txfm_context = cm->above_txfm_context + mi_col;
+  xd->left_txfm_context =
+      xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+
+  // The value passed as the initial depth is 1 for non-square blocks and 0
+  // for square ones.
+  for (idy = 0; idy < mi_height; idy += bh)
+    for (idx = 0; idx < mi_width; idx += bw)
+      update_txfm_count(x, xd, td_counts, max_tx_size, mi_width != mi_height,
+                        idy, idx);
+}
+
+// Recursively walks the transform partition rooted at (blk_row, blk_col) with
+// candidate size tx_size and refreshes the above/left transform contexts at
+// every leaf. Unlike update_txfm_count() it touches no counters.
+static void set_txfm_context(MACROBLOCKD *xd, TX_SIZE tx_size, int blk_row,
+                             int blk_col) {
+  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  const int tx_row = blk_row >> 1;
+  const int tx_col = blk_col >> 1;
+  const int rows_limit = max_block_high(xd, mbmi->sb_type, 0);
+  const int cols_limit = max_block_wide(xd, mbmi->sb_type, 0);
+  const TX_SIZE coded_tx_size = mbmi->inter_tx_size[tx_row][tx_col];
+
+  // Positions at or beyond the block limits are left untouched.
+  if (blk_row >= rows_limit || blk_col >= cols_limit) return;
+
+  if (tx_size == coded_tx_size) {
+    // Leaf: the candidate size matches the stored one.
+    mbmi->tx_size = tx_size;
+    txfm_partition_update(xd->above_txfm_context + tx_col,
+                          xd->left_txfm_context + tx_row, tx_size, tx_size);
+    return;
+  }
+
+  if (tx_size == TX_8X8) {
+    // Splitting an 8x8 ends the recursion: the position is recorded as 4x4.
+    mbmi->inter_tx_size[tx_row][tx_col] = TX_4X4;
+    mbmi->tx_size = TX_4X4;
+    txfm_partition_update(xd->above_txfm_context + tx_col,
+                          xd->left_txfm_context + tx_row, TX_4X4, tx_size);
+    return;
+  }
+
+  {
+    const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
+    const int step = tx_size_wide_unit[sub_txs];
+    int quadrant;
+
+    assert(step > 0);
+    // Children in raster order: bit 1 selects the row, bit 0 the column.
+    for (quadrant = 0; quadrant < 4; ++quadrant) {
+      const int row_off = (quadrant >> 1) * step;
+      const int col_off = (quadrant & 0x01) * step;
+      set_txfm_context(xd, sub_txs, blk_row + row_off, blk_col + col_off);
+    }
+  }
+}
+
+// Re-anchors the above/left transform context pointers in xd to the block at
+// (mi_row, mi_col), then tiles the block with the largest transform size
+// returned by get_vartx_max_txsize() and runs set_txfm_context() on each tile.
+static void tx_partition_set_contexts(const AV1_COMMON *const cm,
+                                      MACROBLOCKD *xd, BLOCK_SIZE plane_bsize,
+                                      int mi_row, int mi_col) {
+  const int units_wide = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
+  const int units_high = block_size_high[plane_bsize] >> tx_size_high_log2[0];
+  const TX_SIZE max_tx_size =
+      get_vartx_max_txsize(&xd->mi[0]->mbmi, plane_bsize);
+  const int row_step = tx_size_high_unit[max_tx_size];
+  const int col_step = tx_size_wide_unit[max_tx_size];
+  int row, col;
+
+  xd->above_txfm_context = cm->above_txfm_context + mi_col;
+  xd->left_txfm_context =
+      xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+
+  for (row = 0; row < units_high; row += row_step) {
+    for (col = 0; col < units_wide; col += col_step) {
+      set_txfm_context(xd, max_tx_size, row, col);
+    }
+  }
+}
+#endif
+
+// Increments the transform-type counter (inter_ext_tx or intra_ext_tx in
+// counts) for the current block. Nothing is counted when the block is
+// skipped, when the segment has SEG_LVL_SKIP active, or when the q index /
+// transform-set checks below fail. With CONFIG_TXK_SEL only plane 0 is
+// counted.
+void av1_update_tx_type_count(const AV1_COMMON *cm, MACROBLOCKD *xd,
+#if CONFIG_TXK_SEL
+                              int block, int plane,
+#endif
+                              BLOCK_SIZE bsize, TX_SIZE tx_size,
+                              FRAME_COUNTS *counts) {
+  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+  int is_inter = is_inter_block(mbmi);
+#if CONFIG_TXK_SEL
+  // Only y plane's tx_type is updated
+  if (plane > 0) return;
+  TX_TYPE tx_type = get_tx_type(PLANE_TYPE_Y, xd, block, tx_size);
+#else
+  TX_TYPE tx_type = mbmi->tx_type;
+#endif
+#if CONFIG_EXT_TX
+  // A counter is only meaningful when more than one transform type is
+  // available for this size / block / prediction kind.
+  if (get_ext_tx_types(tx_size, bsize, is_inter, cm->reduced_tx_set_used) <= 1)
+    return;
+  if (cm->base_qindex <= 0 || mbmi->skip) return;
+  if (segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) return;
+
+  {
+    const int eset =
+        get_ext_tx_set(tx_size, bsize, is_inter, cm->reduced_tx_set_used);
+    if (eset <= 0) return;
+
+    if (is_inter) {
+      ++counts->inter_ext_tx[eset][txsize_sqr_map[tx_size]][tx_type];
+    } else {
+      ++counts->intra_ext_tx[eset][txsize_sqr_map[tx_size]][mbmi->mode]
+                            [tx_type];
+    }
+  }
+#else
+  (void)bsize;
+  if (tx_size >= TX_32X32) return;
+
+  // With segmentation enabled the per-segment q index is checked, otherwise
+  // the frame base q index.
+  if (cm->seg.enabled) {
+    if (xd->qindex[mbmi->segment_id] <= 0) return;
+  } else {
+    if (cm->base_qindex <= 0) return;
+  }
+
+  if (mbmi->skip) return;
+  if (segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) return;
+
+  if (is_inter) {
+    ++counts->inter_ext_tx[tx_size][tx_type];
+  } else {
+    ++counts->intra_ext_tx[tx_size][intra_mode_to_tx_type_context[mbmi->mode]]
+                          [tx_type];
+  }
+#endif  // CONFIG_EXT_TX
+}
+
+// Encodes the block of size bsize at (mi_row, mi_col) through either the
+// intra or the inter path, tokenizes it, and -- only when dry_run is zero --
+// updates the statistics kept in td->counts.
+//
+// cpi/td: encoder instance and per-thread data (td->mb holds the macroblock).
+// t: token output cursor forwarded to the tokenizers.
+// dry_run: non-zero suppresses the counter updates guarded by !dry_run below.
+// ctx: only ctx->pred_pixel_ready is read here.
+// rate: forwarded to the tokenize / txb-context calls.
+static void encode_superblock(const AV1_COMP *const cpi, ThreadData *td,
+ TOKENEXTRA **t, RUN_TYPE dry_run, int mi_row,
+ int mi_col, BLOCK_SIZE bsize,
+ PICK_MODE_CONTEXT *ctx, int *rate) {
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MODE_INFO **mi_8x8 = xd->mi;
+ MODE_INFO *mi = mi_8x8[0];
+ MB_MODE_INFO *mbmi = &mi->mbmi;
+ const int seg_skip =
+ segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP);
+ const int mis = cm->mi_stride;
+ const int mi_width = mi_size_wide[bsize];
+ const int mi_height = mi_size_high[bsize];
+ const int is_inter = is_inter_block(mbmi);
+ // Without CONFIG_CB4X4 the size handed to the encode/tokenize calls is
+ // clamped to at least BLOCK_8X8.
+#if CONFIG_CB4X4
+ const BLOCK_SIZE block_size = bsize;
+#else
+ const BLOCK_SIZE block_size = AOMMAX(bsize, BLOCK_8X8);
+#endif
+
+#if CONFIG_PVQ
+ x->pvq_speed = 0;
+ x->pvq_coded = (dry_run == OUTPUT_ENABLED) ? 1 : 0;
+#endif
+#if CONFIG_CFL
+ x->cfl_store_y = (dry_run == OUTPUT_ENABLED) ? 1 : 0;
+#endif
+
+ // Intra path: encode every plane, then gather intra statistics on real
+ // (non-dry) runs.
+ if (!is_inter) {
+ int plane;
+ // NOTE(review): presumably cleared by the plane encodes below when a plane
+ // produces coefficients -- confirm in av1_encode_intra_block_plane.
+ mbmi->skip = 1;
+ for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+ av1_encode_intra_block_plane((AV1_COMMON *)cm, x, block_size, plane, 1,
+ mi_row, mi_col);
+ }
+ if (!dry_run) {
+ sum_intra_stats(td->counts, xd, mi, xd->above_mi, xd->left_mi,
+ frame_is_intra_only(cm), mi_row, mi_col);
+ }
+#if CONFIG_PALETTE
+ // Palette tokens are emitted for planes 0 and 1 whose palette is non-empty.
+ if (bsize >= BLOCK_8X8 && !dry_run) {
+ for (plane = 0; plane <= 1; ++plane) {
+ if (mbmi->palette_mode_info.palette_size[plane] > 0) {
+ mbmi->palette_mode_info.palette_first_color_idx[plane] =
+ xd->plane[plane].color_index_map[0];
+ // TODO(huisu): this increases the use of token buffer. Needs stretch
+ // test to verify.
+ av1_tokenize_palette_sb(cpi, td, plane, t, dry_run, bsize, rate);
+ }
+ }
+ }
+#endif // CONFIG_PALETTE
+#if CONFIG_VAR_TX
+ mbmi->min_tx_size = get_min_tx_size(mbmi->tx_size);
+#endif
+#if CONFIG_LV_MAP
+ av1_update_txb_context(cpi, td, dry_run, block_size, rate, mi_row, mi_col);
+#else // CONFIG_LV_MAP
+ av1_tokenize_sb(cpi, td, t, dry_run, block_size, rate, mi_row, mi_col);
+#endif // CONFIG_LV_MAP
+ } else {
+ // Inter path: bind the reference frame(s), build the predictors, encode
+ // the residual and tokenize.
+ int ref;
+ const int is_compound = has_second_ref(mbmi);
+
+ set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+ for (ref = 0; ref < 1 + is_compound; ++ref) {
+ YV12_BUFFER_CONFIG *cfg = get_ref_frame_buffer(cpi, mbmi->ref_frame[ref]);
+#if CONFIG_INTRABC
+ assert(IMPLIES(!is_intrabc_block(mbmi), cfg));
+#else
+ assert(cfg != NULL);
+#endif // !CONFIG_INTRABC
+ av1_setup_pre_planes(xd, ref, cfg, mi_row, mi_col,
+ &xd->block_refs[ref]->sf);
+ }
+ // The luma predictor is rebuilt unless the speed feature allows reuse and
+ // ctx reports it ready; an active segment skip always forces the rebuild.
+ if (!(cpi->sf.reuse_inter_pred_sby && ctx->pred_pixel_ready) || seg_skip)
+ av1_build_inter_predictors_sby(xd, mi_row, mi_col, NULL, block_size);
+
+ av1_build_inter_predictors_sbuv(xd, mi_row, mi_col, NULL, block_size);
+#if CONFIG_MOTION_VAR
+ if (mbmi->motion_mode == OBMC_CAUSAL) {
+#if CONFIG_NCOBMC
+ if (dry_run == OUTPUT_ENABLED)
+ av1_build_ncobmc_inter_predictors_sb(cm, xd, mi_row, mi_col);
+ else
+#endif
+ av1_build_obmc_inter_predictors_sb(cm, xd, mi_row, mi_col);
+ }
+#endif // CONFIG_MOTION_VAR
+
+ av1_encode_sb((AV1_COMMON *)cm, x, block_size, mi_row, mi_col);
+#if CONFIG_VAR_TX
+ if (mbmi->skip) mbmi->min_tx_size = get_min_tx_size(mbmi->tx_size);
+ av1_tokenize_sb_vartx(cpi, td, t, dry_run, mi_row, mi_col, block_size,
+ rate);
+#else
+#if CONFIG_LV_MAP
+ av1_update_txb_context(cpi, td, dry_run, block_size, rate, mi_row, mi_col);
+#else // CONFIG_LV_MAP
+ av1_tokenize_sb(cpi, td, t, dry_run, block_size, rate, mi_row, mi_col);
+#endif // CONFIG_LV_MAP
+#endif
+ }
+
+ // Transform size / type statistics are collected on real encodes only.
+ if (!dry_run) {
+#if CONFIG_VAR_TX
+ TX_SIZE tx_size =
+ is_inter && !mbmi->skip ? mbmi->min_tx_size : mbmi->tx_size;
+#else
+ TX_SIZE tx_size = mbmi->tx_size;
+#endif
+ if (cm->tx_mode == TX_MODE_SELECT && !xd->lossless[mbmi->segment_id] &&
+#if CONFIG_CB4X4 && (CONFIG_VAR_TX || CONFIG_EXT_TX) && CONFIG_RECT_TX
+ mbmi->sb_type > BLOCK_4X4 &&
+#else
+ mbmi->sb_type >= BLOCK_8X8 &&
+#endif
+ !(is_inter && (mbmi->skip || seg_skip))) {
+#if CONFIG_VAR_TX
+ if (is_inter) {
+ tx_partition_count_update(cm, x, bsize, mi_row, mi_col, td->counts);
+ } else {
+ const int tx_size_ctx = get_tx_size_context(xd);
+ // NOTE(review): is_inter is always zero in this branch, so the inter
+ // lookup below is never selected.
+ const int tx_size_cat = is_inter ? inter_tx_size_cat_lookup[bsize]
+ : intra_tx_size_cat_lookup[bsize];
+ const TX_SIZE coded_tx_size = txsize_sqr_up_map[tx_size];
+ const int depth = tx_size_to_depth(coded_tx_size);
+ ++td->counts->tx_size[tx_size_cat][tx_size_ctx][depth];
+ if (tx_size != max_txsize_lookup[bsize]) ++x->txb_split_count;
+ }
+#else
+ const int tx_size_ctx = get_tx_size_context(xd);
+ const int tx_size_cat = is_inter ? inter_tx_size_cat_lookup[bsize]
+ : intra_tx_size_cat_lookup[bsize];
+ const TX_SIZE coded_tx_size = txsize_sqr_up_map[tx_size];
+ const int depth = tx_size_to_depth(coded_tx_size);
+
+ ++td->counts->tx_size[tx_size_cat][tx_size_ctx][depth];
+#endif
+#if CONFIG_EXT_TX && CONFIG_RECT_TX
+ assert(IMPLIES(is_rect_tx(tx_size), is_rect_tx_allowed(xd, mbmi)));
+#endif // CONFIG_EXT_TX && CONFIG_RECT_TX
+ } else {
+ // Otherwise derive intra_tx_size and write it into every in-frame
+ // mode-info entry covered by the block.
+ int i, j;
+ TX_SIZE intra_tx_size;
+ // The new intra coding scheme requires no change of transform size
+ if (is_inter) {
+ if (xd->lossless[mbmi->segment_id]) {
+ intra_tx_size = TX_4X4;
+ } else {
+ intra_tx_size = tx_size_from_tx_mode(bsize, cm->tx_mode, 1);
+ }
+ } else {
+#if CONFIG_EXT_TX && CONFIG_RECT_TX
+ intra_tx_size = tx_size;
+#else
+ intra_tx_size = (bsize >= BLOCK_8X8) ? tx_size : TX_4X4;
+#endif // CONFIG_EXT_TX && CONFIG_RECT_TX
+ }
+#if CONFIG_EXT_TX && CONFIG_RECT_TX
+ ++td->counts->tx_size_implied[max_txsize_lookup[bsize]]
+ [txsize_sqr_up_map[tx_size]];
+#endif // CONFIG_EXT_TX && CONFIG_RECT_TX
+
+ for (j = 0; j < mi_height; j++)
+ for (i = 0; i < mi_width; i++)
+ if (mi_col + i < cm->mi_cols && mi_row + j < cm->mi_rows)
+ mi_8x8[mis * j + i]->mbmi.tx_size = intra_tx_size;
+
+#if CONFIG_VAR_TX
+ mbmi->min_tx_size = get_min_tx_size(intra_tx_size);
+ if (intra_tx_size != max_txsize_lookup[bsize]) ++x->txb_split_count;
+#endif
+ }
+
+ // One total for the luma size and one for the size get_uv_tx_size()
+ // returns for plane 1.
+ ++td->counts->tx_size_totals[txsize_sqr_map[tx_size]];
+ ++td->counts
+ ->tx_size_totals[txsize_sqr_map[get_uv_tx_size(mbmi, &xd->plane[1])]];
+#if !CONFIG_TXK_SEL
+ av1_update_tx_type_count(cm, xd, bsize, tx_size, td->counts);
+#endif
+ }
+
+#if CONFIG_VAR_TX
+ // Transform context maintenance. For inter blocks with a selectable
+ // transform partition the contexts are only set here on dry runs; every
+ // other block gets a single transform size written to mbmi and the contexts.
+ if (cm->tx_mode == TX_MODE_SELECT &&
+#if CONFIG_CB4X4
+ mbmi->sb_type > BLOCK_4X4 &&
+#else
+ mbmi->sb_type >= BLOCK_8X8 &&
+#endif
+ is_inter && !(mbmi->skip || seg_skip)) {
+ if (dry_run) tx_partition_set_contexts(cm, xd, bsize, mi_row, mi_col);
+ } else {
+ TX_SIZE tx_size = mbmi->tx_size;
+ // The new intra coding scheme requires no change of transform size
+ if (is_inter)
+ tx_size = tx_size_from_tx_mode(bsize, cm->tx_mode, is_inter);
+ else
+ tx_size = (bsize > BLOCK_4X4) ? tx_size : TX_4X4;
+ mbmi->tx_size = tx_size;
+ set_txfm_ctxs(tx_size, xd->n8_w, xd->n8_h, (mbmi->skip || seg_skip), xd);
+ }
+#endif // CONFIG_VAR_TX
+}
+
+#if CONFIG_SUPERTX
+// Returns 1 when the mode stored in ctx is not an inter mode (or, with
+// CONFIG_EXT_INTER, when its second reference is INTRA_FRAME); 0 otherwise.
+static int check_intra_b(PICK_MODE_CONTEXT *ctx) {
+  int uses_intra = !is_inter_mode(ctx->mic.mbmi.mode);
+#if CONFIG_EXT_INTER
+  uses_intra = uses_intra || (ctx->mic.mbmi.ref_frame[1] == INTRA_FRAME);
+#endif  // CONFIG_EXT_INTER
+  return uses_intra;
+}
+
+// Returns 1 when any coded block inside the partition tree pc_tree, rooted at
+// (mi_row, mi_col) with size bsize, is reported as intra by check_intra_b().
+// A root that lies outside the frame also yields 1. Returns 0 otherwise.
+static int check_intra_sb(const AV1_COMP *const cpi, const TileInfo *const tile,
+                          int mi_row, int mi_col, BLOCK_SIZE bsize,
+                          PC_TREE *pc_tree) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const int hbs = mi_size_wide[bsize] / 2;
+  const PARTITION_TYPE partition = pc_tree->partitioning;
+  const BLOCK_SIZE subsize = get_subsize(bsize, partition);
+#if CONFIG_EXT_PARTITION_TYPES
+  int i;
+#endif
+#if CONFIG_CB4X4
+  const int unify_bsize = 1;
+#else
+  const int unify_bsize = 0;
+#endif
+
+#if !CONFIG_CB4X4
+  assert(bsize >= BLOCK_8X8);
+#endif
+
+  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return 1;
+
+  switch (partition) {
+    case PARTITION_NONE: return check_intra_b(&pc_tree->none);
+    case PARTITION_VERT: {
+      // The right half only exists when it starts inside the frame and the
+      // block is larger than 8x8 (or sizes are unified).
+      const int has_second =
+          mi_col + hbs < cm->mi_cols && (bsize > BLOCK_8X8 || unify_bsize);
+      if (check_intra_b(&pc_tree->vertical[0])) return 1;
+      if (has_second && check_intra_b(&pc_tree->vertical[1])) return 1;
+      break;
+    }
+    case PARTITION_HORZ: {
+      // Same rule for the bottom half.
+      const int has_second =
+          mi_row + hbs < cm->mi_rows && (bsize > BLOCK_8X8 || unify_bsize);
+      if (check_intra_b(&pc_tree->horizontal[0])) return 1;
+      if (has_second && check_intra_b(&pc_tree->horizontal[1])) return 1;
+      break;
+    }
+    case PARTITION_SPLIT:
+      if (bsize == BLOCK_8X8 && !unify_bsize) {
+        if (check_intra_b(pc_tree->leaf_split[0])) return 1;
+      } else {
+        int quadrant;
+        // Quadrants in raster order: bit 1 selects the row, bit 0 the column.
+        for (quadrant = 0; quadrant < 4; ++quadrant) {
+          const int sub_row = mi_row + (quadrant >> 1) * hbs;
+          const int sub_col = mi_col + (quadrant & 1) * hbs;
+          if (check_intra_sb(cpi, tile, sub_row, sub_col, subsize,
+                             pc_tree->split[quadrant]))
+            return 1;
+        }
+      }
+      break;
+#if CONFIG_EXT_PARTITION_TYPES
+    case PARTITION_HORZ_A:
+    case PARTITION_HORZ_B:
+    case PARTITION_VERT_A:
+    case PARTITION_VERT_B: {
+      // Each extended partition owns three contexts; pick the matching array.
+      PICK_MODE_CONTEXT *const ctxs =
+          (partition == PARTITION_HORZ_A)
+              ? pc_tree->horizontala
+              : (partition == PARTITION_HORZ_B)
+                    ? pc_tree->horizontalb
+                    : (partition == PARTITION_VERT_A) ? pc_tree->verticala
+                                                      : pc_tree->verticalb;
+      for (i = 0; i < 3; i++) {
+        if (check_intra_b(&ctxs[i])) return 1;
+      }
+      break;
+    }
+#endif  // CONFIG_EXT_PARTITION_TYPES
+    default: assert(0);
+  }
+  return 0;
+}
+
+// Returns 1 when the transform size stored in ctx equals supertx_size, else 0.
+static int check_supertx_b(TX_SIZE supertx_size, PICK_MODE_CONTEXT *ctx) {
+  const TX_SIZE stored_tx_size = ctx->mic.mbmi.tx_size;
+  return stored_tx_size == supertx_size;
+}
+
+// Follows the first child of each partition level in pc_tree until a coded
+// block is reached and returns check_supertx_b() for that block, i.e. whether
+// its stored transform size equals supertx_size.
+static int check_supertx_sb(BLOCK_SIZE bsize, TX_SIZE supertx_size,
+                            PC_TREE *pc_tree) {
+#if CONFIG_CB4X4
+  const int unify_bsize = 1;
+#else
+  const int unify_bsize = 0;
+#endif
+
+  // Each PARTITION_SPLIT level descends into split[0]; every other partition
+  // type returns directly.
+  for (;;) {
+    const PARTITION_TYPE partition = pc_tree->partitioning;
+    const BLOCK_SIZE subsize = get_subsize(bsize, partition);
+
+    switch (partition) {
+      case PARTITION_NONE:
+        return check_supertx_b(supertx_size, &pc_tree->none);
+      case PARTITION_VERT:
+        return check_supertx_b(supertx_size, &pc_tree->vertical[0]);
+      case PARTITION_HORZ:
+        return check_supertx_b(supertx_size, &pc_tree->horizontal[0]);
+      case PARTITION_SPLIT:
+        if (bsize == BLOCK_8X8 && !unify_bsize)
+          return check_supertx_b(supertx_size, pc_tree->leaf_split[0]);
+        bsize = subsize;
+        pc_tree = pc_tree->split[0];
+        break;  // continue with the first quadrant
+#if CONFIG_EXT_PARTITION_TYPES
+      case PARTITION_HORZ_A:
+        return check_supertx_b(supertx_size, &pc_tree->horizontala[0]);
+      case PARTITION_HORZ_B:
+        return check_supertx_b(supertx_size, &pc_tree->horizontalb[0]);
+      case PARTITION_VERT_A:
+        return check_supertx_b(supertx_size, &pc_tree->verticala[0]);
+      case PARTITION_VERT_B:
+        return check_supertx_b(supertx_size, &pc_tree->verticalb[0]);
+#endif  // CONFIG_EXT_PARTITION_TYPES
+      default: assert(0); return 0;
+    }
+  }
+}
+
+// Used in supertx.
+// Binds the reference frame(s) of the current block and builds the extended
+// inter prediction for the region (mi_row_pred, mi_col_pred, bsize_pred).
+// (mi_row_ori, mi_col_ori), when present, is the location the motion vector
+// is taken from. b_sub8x8 selects the sub8x8 builder, which also receives
+// block.
+static void predict_superblock(const AV1_COMP *const cpi, ThreadData *td,
+#if CONFIG_EXT_INTER
+                               int mi_row_ori, int mi_col_ori,
+#endif  // CONFIG_EXT_INTER
+                               int mi_row_pred, int mi_col_pred,
+                               BLOCK_SIZE bsize_pred, int b_sub8x8, int block) {
+  const AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &td->mb.e_mbd;
+  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  const int num_refs = 1 + has_second_ref(mbmi);
+  int ref_idx;
+
+  set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+
+  for (ref_idx = 0; ref_idx < num_refs; ++ref_idx) {
+    YV12_BUFFER_CONFIG *const ref_buf =
+        get_ref_frame_buffer(cpi, mbmi->ref_frame[ref_idx]);
+    av1_setup_pre_planes(xd, ref_idx, ref_buf, mi_row_pred, mi_col_pred,
+                         &xd->block_refs[ref_idx]->sf);
+  }
+
+  if (b_sub8x8) {
+    av1_build_inter_predictors_sb_sub8x8_extend(xd,
+#if CONFIG_EXT_INTER
+                                                mi_row_ori, mi_col_ori,
+#endif  // CONFIG_EXT_INTER
+                                                mi_row_pred, mi_col_pred,
+                                                bsize_pred, block);
+  } else {
+    av1_build_inter_predictors_sb_extend(xd,
+#if CONFIG_EXT_INTER
+                                         mi_row_ori, mi_col_ori,
+#endif  // CONFIG_EXT_INTER
+                                         mi_row_pred, mi_col_pred, bsize_pred);
+  }
+}
+
+// Used in supertx.
+// Predicts one region into dst_buf and, for non-extension regions on real
+// runs, updates the block statistics.
+//   (mi_row_ori, mi_col_ori): location for mv
+//   (mi_row_pred, mi_col_pred, bsize_pred): region to predict
+//   (mi_row_top, mi_col_top, bsize_top): region of the top partition size
+//   block: sub location of sub8x8 blocks
+//   b_sub8x8: 1: ori is sub8x8; 0: ori is not sub8x8
+//   bextend: 1: region to predict is an extension of ori; 0: not
+// Regions that start outside the top partition or outside the frame are
+// ignored.
+static void predict_b_extend(const AV1_COMP *const cpi, ThreadData *td,
+                             const TileInfo *const tile, int block,
+                             int mi_row_ori, int mi_col_ori, int mi_row_pred,
+                             int mi_col_pred, int mi_row_top, int mi_col_top,
+                             uint8_t *dst_buf[3], int dst_stride[3],
+                             BLOCK_SIZE bsize_top, BLOCK_SIZE bsize_pred,
+                             RUN_TYPE dry_run, int b_sub8x8, int bextend) {
+  const AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &td->mb.e_mbd;
+  // Offset of the predicted region from the top-left of the top partition.
+  const int row_off = (mi_row_pred - mi_row_top) * MI_SIZE;
+  const int col_off = (mi_col_pred - mi_col_top) * MI_SIZE;
+  const int top_row_end = mi_row_top + mi_size_high[bsize_top];
+  const int top_col_end = mi_col_top + mi_size_wide[bsize_top];
+  int plane;
+
+  // Reject regions starting outside the top partition ...
+  if (mi_row_pred < mi_row_top || mi_col_pred < mi_col_top) return;
+  if (mi_row_pred >= top_row_end || mi_col_pred >= top_col_end) return;
+  // ... or outside the frame.
+  if (mi_row_pred >= cm->mi_rows || mi_col_pred >= cm->mi_cols) return;
+
+  set_offsets_extend(cpi, td, tile, mi_row_pred, mi_col_pred, mi_row_ori,
+                     mi_col_ori, bsize_pred);
+
+  // Point every plane's destination at the region inside the caller's buffer,
+  // scaling the offsets by that plane's subsampling.
+  for (plane = 0; plane < 3; ++plane) {
+    struct macroblockd_plane *const pd = &xd->plane[plane];
+    pd->dst.stride = dst_stride[plane];
+    pd->dst.buf = dst_buf[plane] +
+                  (row_off >> pd->subsampling_y) * dst_stride[plane] +
+                  (col_off >> pd->subsampling_x);
+  }
+
+  predict_superblock(cpi, td,
+#if CONFIG_EXT_INTER
+                     mi_row_ori, mi_col_ori,
+#endif  // CONFIG_EXT_INTER
+                     mi_row_pred, mi_col_pred, bsize_pred, b_sub8x8, block);
+
+  if (!dry_run && !bextend) update_stats(cm, td, mi_row_pred, mi_col_pred, 1);
+}
+
+// Predicts the extension of the block (mi_row, mi_col, bsize) in one
+// direction by calling predict_b_extend() (with bextend = 1) on a grid of
+// extend_bsize-sized regions.
+// dir: 0-lower, 1-upper, 2-left, 3-right
+//      4-lowerleft, 5-upperleft, 6-lowerright, 7-upperright
+static void extend_dir(const AV1_COMP *const cpi, ThreadData *td,
+                       const TileInfo *const tile, int block, BLOCK_SIZE bsize,
+                       BLOCK_SIZE top_bsize, int mi_row, int mi_col,
+                       int mi_row_top, int mi_col_top, RUN_TYPE dry_run,
+                       uint8_t *dst_buf[3], int dst_stride[3], int dir) {
+  MACROBLOCKD *xd = &td->mb.e_mbd;
+  const int mi_width = mi_size_wide[bsize];
+  const int mi_height = mi_size_high[bsize];
+  const int xss = xd->plane[1].subsampling_x;
+  const int yss = xd->plane[1].subsampling_y;
+#if CONFIG_CB4X4
+  const int unify_bsize = 1;
+#else
+  const int unify_bsize = 0;
+#endif
+  const int b_sub8x8 = (bsize < BLOCK_8X8) && !unify_bsize;
+  const int is_vertical = (dir == 0 || dir == 1);    // lower and upper
+  const int is_horizontal = (dir == 2 || dir == 3);  // left and right
+  BLOCK_SIZE extend_bsize = BLOCK_8X8;
+  int ext_offset = 0;
+  int row_shift = 0;
+  int col_shift = 0;
+  int wide_unit, high_unit;
+  int mi_row_pred, mi_col_pred;
+  int i, j;
+
+  // Size of each predicted region: straight extensions of a block wider
+  // (resp. taller) than 8x8 use a 16x8 (resp. 8x16) unit unless the chroma
+  // subsampling check fails; corner extensions always use 8x8.
+  if (is_vertical) {
+    if (!(mi_width == mi_size_wide[BLOCK_8X8] || bsize < BLOCK_8X8 ||
+          xss < yss))
+      extend_bsize = BLOCK_16X8;
+  } else if (is_horizontal) {
+    if (!(mi_height == mi_size_high[BLOCK_8X8] || bsize < BLOCK_8X8 ||
+          yss < xss))
+      extend_bsize = BLOCK_8X16;
+  }
+#if CONFIG_CB4X4
+  if (bsize < BLOCK_8X8) {
+    extend_bsize = BLOCK_4X4;
+    ext_offset = mi_size_wide[BLOCK_8X8];
+  }
+#endif
+  wide_unit = mi_size_wide[extend_bsize];
+  high_unit = mi_size_high[extend_bsize];
+
+  // Displacement of the extension relative to the block: pure left/right
+  // extensions keep the row, pure lower/upper extensions keep the column.
+  if (!is_horizontal) {
+    row_shift = (dir == 0 || dir == 4 || dir == 6) ? mi_height
+                                                   : -(mi_height + ext_offset);
+  }
+  if (!is_vertical) {
+    col_shift = (dir == 3 || dir == 6 || dir == 7) ? mi_width
+                                                   : -(mi_width + ext_offset);
+  }
+  mi_row_pred = mi_row + row_shift;
+  mi_col_pred = mi_col + col_shift;
+
+  for (j = 0; j < mi_height + ext_offset; j += high_unit) {
+    for (i = 0; i < mi_width + ext_offset; i += wide_unit) {
+      predict_b_extend(cpi, td, tile, block, mi_row, mi_col, mi_row_pred + j,
+                       mi_col_pred + i, mi_row_top, mi_col_top, dst_buf,
+                       dst_stride, top_bsize, extend_bsize, dry_run, b_sub8x8,
+                       1);
+    }
+  }
+}
+
+// Runs extend_dir() for all eight directions (0 through 7, in that order)
+// around the block (mi_row, mi_col, bsize). block must be in [0, 3].
+static void extend_all(const AV1_COMP *const cpi, ThreadData *td,
+                       const TileInfo *const tile, int block, BLOCK_SIZE bsize,
+                       BLOCK_SIZE top_bsize, int mi_row, int mi_col,
+                       int mi_row_top, int mi_col_top, RUN_TYPE dry_run,
+                       uint8_t *dst_buf[3], int dst_stride[3]) {
+  int dir;
+
+  assert(block >= 0 && block < 4);
+  for (dir = 0; dir < 8; ++dir) {
+    extend_dir(cpi, td, tile, block, bsize, top_bsize, mi_row, mi_col,
+               mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride, dir);
+  }
+}
+
+// This function generates the prediction for multiple blocks, between which
+// the discontinuity around boundaries is reduced by smoothing masks. The basic
+// smoothing mask is a soft step function along the horz/vert direction. In the
+// more complicated case where a block is split into 4 subblocks, the basic
+// mask is first applied to neighboring subblocks (2 pairs) in the horizontal
+// direction and then applied to the 2 masked predictions mentioned above in
+// the vertical direction. If the block is split into more than one level, at
+// every stage the masked prediction is stored in dst_buf[] passed from the
+// higher level.
+static void predict_sb_complex(const AV1_COMP *const cpi, ThreadData *td,
+ const TileInfo *const tile, int mi_row,
+ int mi_col, int mi_row_top, int mi_col_top,
+ RUN_TYPE dry_run, BLOCK_SIZE bsize,
+ BLOCK_SIZE top_bsize, uint8_t *dst_buf[3],
+ int dst_stride[3], PC_TREE *pc_tree) {
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const int hbs = mi_size_wide[bsize] / 2;
+ const int is_partition_root = bsize >= BLOCK_8X8;
+ const int ctx = is_partition_root
+ ? partition_plane_context(xd, mi_row, mi_col,
+#if CONFIG_UNPOISON_PARTITION_CTX
+ mi_row + hbs < cm->mi_rows,
+ mi_col + hbs < cm->mi_cols,
+#endif
+ bsize)
+ : -1;
+ const PARTITION_TYPE partition = pc_tree->partitioning;
+ const BLOCK_SIZE subsize = get_subsize(bsize, partition);
+#if CONFIG_EXT_PARTITION_TYPES
+ const BLOCK_SIZE bsize2 = get_subsize(bsize, PARTITION_SPLIT);
+#endif
+
+ int i;
+ uint8_t *dst_buf1[3], *dst_buf2[3], *dst_buf3[3];
+ DECLARE_ALIGNED(16, uint8_t, tmp_buf1[MAX_MB_PLANE * MAX_TX_SQUARE * 2]);
+ DECLARE_ALIGNED(16, uint8_t, tmp_buf2[MAX_MB_PLANE * MAX_TX_SQUARE * 2]);
+ DECLARE_ALIGNED(16, uint8_t, tmp_buf3[MAX_MB_PLANE * MAX_TX_SQUARE * 2]);
+ int dst_stride1[3] = { MAX_TX_SIZE, MAX_TX_SIZE, MAX_TX_SIZE };
+ int dst_stride2[3] = { MAX_TX_SIZE, MAX_TX_SIZE, MAX_TX_SIZE };
+ int dst_stride3[3] = { MAX_TX_SIZE, MAX_TX_SIZE, MAX_TX_SIZE };
+#if CONFIG_CB4X4
+ const int unify_bsize = 1;
+#else
+ const int unify_bsize = 0;
+ assert(bsize >= BLOCK_8X8);
+#endif
+
+ if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
+
+#if CONFIG_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ int len = sizeof(uint16_t);
+ dst_buf1[0] = CONVERT_TO_BYTEPTR(tmp_buf1);
+ dst_buf1[1] = CONVERT_TO_BYTEPTR(tmp_buf1 + MAX_TX_SQUARE * len);
+ dst_buf1[2] = CONVERT_TO_BYTEPTR(tmp_buf1 + 2 * MAX_TX_SQUARE * len);
+ dst_buf2[0] = CONVERT_TO_BYTEPTR(tmp_buf2);
+ dst_buf2[1] = CONVERT_TO_BYTEPTR(tmp_buf2 + MAX_TX_SQUARE * len);
+ dst_buf2[2] = CONVERT_TO_BYTEPTR(tmp_buf2 + 2 * MAX_TX_SQUARE * len);
+ dst_buf3[0] = CONVERT_TO_BYTEPTR(tmp_buf3);
+ dst_buf3[1] = CONVERT_TO_BYTEPTR(tmp_buf3 + MAX_TX_SQUARE * len);
+ dst_buf3[2] = CONVERT_TO_BYTEPTR(tmp_buf3 + 2 * MAX_TX_SQUARE * len);
+ } else {
+#endif // CONFIG_HIGHBITDEPTH
+ dst_buf1[0] = tmp_buf1;
+ dst_buf1[1] = tmp_buf1 + MAX_TX_SQUARE;
+ dst_buf1[2] = tmp_buf1 + 2 * MAX_TX_SQUARE;
+ dst_buf2[0] = tmp_buf2;
+ dst_buf2[1] = tmp_buf2 + MAX_TX_SQUARE;
+ dst_buf2[2] = tmp_buf2 + 2 * MAX_TX_SQUARE;
+ dst_buf3[0] = tmp_buf3;
+ dst_buf3[1] = tmp_buf3 + MAX_TX_SQUARE;
+ dst_buf3[2] = tmp_buf3 + 2 * MAX_TX_SQUARE;
+#if CONFIG_HIGHBITDEPTH
+ }
+#endif // CONFIG_HIGHBITDEPTH
+
+ if (!dry_run && ctx >= 0 && bsize < top_bsize) {
+ // Explicitly cast away const.
+ FRAME_COUNTS *const frame_counts = (FRAME_COUNTS *)&cm->counts;
+ frame_counts->partition[ctx][partition]++;
+ }
+
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ xd->plane[i].dst.buf = dst_buf[i];
+ xd->plane[i].dst.stride = dst_stride[i];
+ }
+
+ switch (partition) {
+ case PARTITION_NONE:
+ assert(bsize < top_bsize);
+ predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf, dst_stride, top_bsize,
+ bsize, dry_run, 0, 0);
+ extend_all(cpi, td, tile, 0, bsize, top_bsize, mi_row, mi_col, mi_row_top,
+ mi_col_top, dry_run, dst_buf, dst_stride);
+ break;
+ case PARTITION_HORZ:
+ if (bsize == BLOCK_8X8 && !unify_bsize) {
+ // First half
+ predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf, dst_stride, top_bsize,
+ BLOCK_8X8, dry_run, 1, 0);
+ if (bsize < top_bsize)
+ extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
+ mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride);
+
+ // Second half
+ predict_b_extend(cpi, td, tile, 2, mi_row, mi_col, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf1, dst_stride1,
+ top_bsize, BLOCK_8X8, dry_run, 1, 1);
+ if (bsize < top_bsize)
+ extend_all(cpi, td, tile, 2, subsize, top_bsize, mi_row, mi_col,
+ mi_row_top, mi_col_top, dry_run, dst_buf1, dst_stride1);
+
+ // Smooth
+ xd->plane[0].dst.buf = dst_buf[0];
+ xd->plane[0].dst.stride = dst_stride[0];
+ av1_build_masked_inter_predictor_complex(
+ xd, dst_buf[0], dst_stride[0], dst_buf1[0], dst_stride1[0], mi_row,
+ mi_col, mi_row_top, mi_col_top, bsize, top_bsize, PARTITION_HORZ,
+ 0);
+ } else {
+ // First half
+ predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf, dst_stride, top_bsize,
+ subsize, dry_run, 0, 0);
+ if (bsize < top_bsize)
+ extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
+ mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride);
+ else
+ extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
+ mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride, 0);
+
+ if (mi_row + hbs < cm->mi_rows) {
+ // Second half
+ predict_b_extend(cpi, td, tile, 0, mi_row + hbs, mi_col, mi_row + hbs,
+ mi_col, mi_row_top, mi_col_top, dst_buf1,
+ dst_stride1, top_bsize, subsize, dry_run, 0, 0);
+ if (bsize < top_bsize)
+ extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row + hbs,
+ mi_col, mi_row_top, mi_col_top, dry_run, dst_buf1,
+ dst_stride1);
+ else
+ extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row + hbs,
+ mi_col, mi_row_top, mi_col_top, dry_run, dst_buf1,
+ dst_stride1, 1);
+
+ // Smooth
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ xd->plane[i].dst.buf = dst_buf[i];
+ xd->plane[i].dst.stride = dst_stride[i];
+ av1_build_masked_inter_predictor_complex(
+ xd, dst_buf[i], dst_stride[i], dst_buf1[i], dst_stride1[i],
+ mi_row, mi_col, mi_row_top, mi_col_top, bsize, top_bsize,
+ PARTITION_HORZ, i);
+ }
+ }
+ }
+ break;
+ case PARTITION_VERT:
+ if (bsize == BLOCK_8X8 && !unify_bsize) {
+ // First half
+ predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf, dst_stride, top_bsize,
+ BLOCK_8X8, dry_run, 1, 0);
+ if (bsize < top_bsize)
+ extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
+ mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride);
+
+ // Second half
+ predict_b_extend(cpi, td, tile, 1, mi_row, mi_col, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf1, dst_stride1,
+ top_bsize, BLOCK_8X8, dry_run, 1, 1);
+ if (bsize < top_bsize)
+ extend_all(cpi, td, tile, 1, subsize, top_bsize, mi_row, mi_col,
+ mi_row_top, mi_col_top, dry_run, dst_buf1, dst_stride1);
+
+ // Smooth
+ xd->plane[0].dst.buf = dst_buf[0];
+ xd->plane[0].dst.stride = dst_stride[0];
+ av1_build_masked_inter_predictor_complex(
+ xd, dst_buf[0], dst_stride[0], dst_buf1[0], dst_stride1[0], mi_row,
+ mi_col, mi_row_top, mi_col_top, bsize, top_bsize, PARTITION_VERT,
+ 0);
+ } else {
+ // bsize: not important, not useful
+ predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf, dst_stride, top_bsize,
+ subsize, dry_run, 0, 0);
+ if (bsize < top_bsize)
+ extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
+ mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride);
+ else
+ extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
+ mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride, 3);
+
+ if (mi_col + hbs < cm->mi_cols) {
+ predict_b_extend(cpi, td, tile, 0, mi_row, mi_col + hbs, mi_row,
+ mi_col + hbs, mi_row_top, mi_col_top, dst_buf1,
+ dst_stride1, top_bsize, subsize, dry_run, 0, 0);
+ if (bsize < top_bsize)
+ extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row,
+ mi_col + hbs, mi_row_top, mi_col_top, dry_run, dst_buf1,
+ dst_stride1);
+ else
+ extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row,
+ mi_col + hbs, mi_row_top, mi_col_top, dry_run, dst_buf1,
+ dst_stride1, 2);
+
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ xd->plane[i].dst.buf = dst_buf[i];
+ xd->plane[i].dst.stride = dst_stride[i];
+ av1_build_masked_inter_predictor_complex(
+ xd, dst_buf[i], dst_stride[i], dst_buf1[i], dst_stride1[i],
+ mi_row, mi_col, mi_row_top, mi_col_top, bsize, top_bsize,
+ PARTITION_VERT, i);
+ }
+ }
+ }
+ break;
+ case PARTITION_SPLIT:
+ if (bsize == BLOCK_8X8 && !unify_bsize) {
+ predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf, dst_stride, top_bsize,
+ BLOCK_8X8, dry_run, 1, 0);
+ predict_b_extend(cpi, td, tile, 1, mi_row, mi_col, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf1, dst_stride1,
+ top_bsize, BLOCK_8X8, dry_run, 1, 1);
+ predict_b_extend(cpi, td, tile, 2, mi_row, mi_col, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf2, dst_stride2,
+ top_bsize, BLOCK_8X8, dry_run, 1, 1);
+ predict_b_extend(cpi, td, tile, 3, mi_row, mi_col, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf3, dst_stride3,
+ top_bsize, BLOCK_8X8, dry_run, 1, 1);
+
+ if (bsize < top_bsize) {
+ extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
+ mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride);
+ extend_all(cpi, td, tile, 1, subsize, top_bsize, mi_row, mi_col,
+ mi_row_top, mi_col_top, dry_run, dst_buf1, dst_stride1);
+ extend_all(cpi, td, tile, 2, subsize, top_bsize, mi_row, mi_col,
+ mi_row_top, mi_col_top, dry_run, dst_buf2, dst_stride2);
+ extend_all(cpi, td, tile, 3, subsize, top_bsize, mi_row, mi_col,
+ mi_row_top, mi_col_top, dry_run, dst_buf3, dst_stride3);
+ }
+ } else {
+ predict_sb_complex(cpi, td, tile, mi_row, mi_col, mi_row_top,
+ mi_col_top, dry_run, subsize, top_bsize, dst_buf,
+ dst_stride, pc_tree->split[0]);
+ if (mi_row < cm->mi_rows && mi_col + hbs < cm->mi_cols)
+ predict_sb_complex(cpi, td, tile, mi_row, mi_col + hbs, mi_row_top,
+ mi_col_top, dry_run, subsize, top_bsize, dst_buf1,
+ dst_stride1, pc_tree->split[1]);
+ if (mi_row + hbs < cm->mi_rows && mi_col < cm->mi_cols)
+ predict_sb_complex(cpi, td, tile, mi_row + hbs, mi_col, mi_row_top,
+ mi_col_top, dry_run, subsize, top_bsize, dst_buf2,
+ dst_stride2, pc_tree->split[2]);
+ if (mi_row + hbs < cm->mi_rows && mi_col + hbs < cm->mi_cols)
+ predict_sb_complex(cpi, td, tile, mi_row + hbs, mi_col + hbs,
+ mi_row_top, mi_col_top, dry_run, subsize,
+ top_bsize, dst_buf3, dst_stride3,
+ pc_tree->split[3]);
+ }
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+#if !CONFIG_CB4X4
+ if (bsize == BLOCK_8X8 && i != 0)
+ continue; // Skip <4x4 chroma smoothing
+#endif
+ if (mi_row < cm->mi_rows && mi_col + hbs < cm->mi_cols) {
+ av1_build_masked_inter_predictor_complex(
+ xd, dst_buf[i], dst_stride[i], dst_buf1[i], dst_stride1[i],
+ mi_row, mi_col, mi_row_top, mi_col_top, bsize, top_bsize,
+ PARTITION_VERT, i);
+ if (mi_row + hbs < cm->mi_rows) {
+ av1_build_masked_inter_predictor_complex(
+ xd, dst_buf2[i], dst_stride2[i], dst_buf3[i], dst_stride3[i],
+ mi_row, mi_col, mi_row_top, mi_col_top, bsize, top_bsize,
+ PARTITION_VERT, i);
+ av1_build_masked_inter_predictor_complex(
+ xd, dst_buf[i], dst_stride[i], dst_buf2[i], dst_stride2[i],
+ mi_row, mi_col, mi_row_top, mi_col_top, bsize, top_bsize,
+ PARTITION_HORZ, i);
+ }
+ } else if (mi_row + hbs < cm->mi_rows && mi_col < cm->mi_cols) {
+ if (bsize == BLOCK_8X8 && i != 0)
+ continue; // Skip <4x4 chroma smoothing
+
+ av1_build_masked_inter_predictor_complex(
+ xd, dst_buf[i], dst_stride[i], dst_buf2[i], dst_stride2[i],
+ mi_row, mi_col, mi_row_top, mi_col_top, bsize, top_bsize,
+ PARTITION_HORZ, i);
+ }
+ }
+ break;
+#if CONFIG_EXT_PARTITION_TYPES
+ case PARTITION_HORZ_A:
+ predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf, dst_stride, top_bsize,
+ bsize2, dry_run, 0, 0);
+ extend_all(cpi, td, tile, 0, bsize2, top_bsize, mi_row, mi_col,
+ mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride);
+
+ predict_b_extend(cpi, td, tile, 0, mi_row, mi_col + hbs, mi_row,
+ mi_col + hbs, mi_row_top, mi_col_top, dst_buf1,
+ dst_stride1, top_bsize, bsize2, dry_run, 0, 0);
+ extend_all(cpi, td, tile, 0, bsize2, top_bsize, mi_row, mi_col + hbs,
+ mi_row_top, mi_col_top, dry_run, dst_buf1, dst_stride1);
+
+ predict_b_extend(cpi, td, tile, 0, mi_row + hbs, mi_col, mi_row + hbs,
+ mi_col, mi_row_top, mi_col_top, dst_buf2, dst_stride2,
+ top_bsize, subsize, dry_run, 0, 0);
+ if (bsize < top_bsize)
+ extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row + hbs, mi_col,
+ mi_row_top, mi_col_top, dry_run, dst_buf2, dst_stride2);
+ else
+ extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row + hbs, mi_col,
+ mi_row_top, mi_col_top, dry_run, dst_buf2, dst_stride2, 1);
+
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ xd->plane[i].dst.buf = dst_buf[i];
+ xd->plane[i].dst.stride = dst_stride[i];
+ av1_build_masked_inter_predictor_complex(
+ xd, dst_buf[i], dst_stride[i], dst_buf1[i], dst_stride1[i], mi_row,
+ mi_col, mi_row_top, mi_col_top, bsize, top_bsize, PARTITION_VERT,
+ i);
+ }
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ av1_build_masked_inter_predictor_complex(
+ xd, dst_buf[i], dst_stride[i], dst_buf2[i], dst_stride2[i], mi_row,
+ mi_col, mi_row_top, mi_col_top, bsize, top_bsize, PARTITION_HORZ,
+ i);
+ }
+
+ break;
+ case PARTITION_VERT_A:
+
+ predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf, dst_stride, top_bsize,
+ bsize2, dry_run, 0, 0);
+ extend_all(cpi, td, tile, 0, bsize2, top_bsize, mi_row, mi_col,
+ mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride);
+
+ predict_b_extend(cpi, td, tile, 0, mi_row + hbs, mi_col, mi_row + hbs,
+ mi_col, mi_row_top, mi_col_top, dst_buf1, dst_stride1,
+ top_bsize, bsize2, dry_run, 0, 0);
+ extend_all(cpi, td, tile, 0, bsize2, top_bsize, mi_row + hbs, mi_col,
+ mi_row_top, mi_col_top, dry_run, dst_buf1, dst_stride1);
+
+ predict_b_extend(cpi, td, tile, 0, mi_row, mi_col + hbs, mi_row,
+ mi_col + hbs, mi_row_top, mi_col_top, dst_buf2,
+ dst_stride2, top_bsize, subsize, dry_run, 0, 0);
+ if (bsize < top_bsize)
+ extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col + hbs,
+ mi_row_top, mi_col_top, dry_run, dst_buf2, dst_stride2);
+ else
+ extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col + hbs,
+ mi_row_top, mi_col_top, dry_run, dst_buf2, dst_stride2, 2);
+
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ xd->plane[i].dst.buf = dst_buf[i];
+ xd->plane[i].dst.stride = dst_stride[i];
+ av1_build_masked_inter_predictor_complex(
+ xd, dst_buf[i], dst_stride[i], dst_buf1[i], dst_stride1[i], mi_row,
+ mi_col, mi_row_top, mi_col_top, bsize, top_bsize, PARTITION_HORZ,
+ i);
+ }
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ av1_build_masked_inter_predictor_complex(
+ xd, dst_buf[i], dst_stride[i], dst_buf2[i], dst_stride2[i], mi_row,
+ mi_col, mi_row_top, mi_col_top, bsize, top_bsize, PARTITION_VERT,
+ i);
+ }
+ break;
+ case PARTITION_HORZ_B:
+
+ predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf, dst_stride, top_bsize,
+ subsize, dry_run, 0, 0);
+ if (bsize < top_bsize)
+ extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
+ mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride);
+ else
+ extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
+ mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride, 0);
+
+ predict_b_extend(cpi, td, tile, 0, mi_row + hbs, mi_col, mi_row + hbs,
+ mi_col, mi_row_top, mi_col_top, dst_buf1, dst_stride1,
+ top_bsize, bsize2, dry_run, 0, 0);
+ extend_all(cpi, td, tile, 0, bsize2, top_bsize, mi_row + hbs, mi_col,
+ mi_row_top, mi_col_top, dry_run, dst_buf1, dst_stride1);
+
+ predict_b_extend(cpi, td, tile, 0, mi_row + hbs, mi_col + hbs,
+ mi_row + hbs, mi_col + hbs, mi_row_top, mi_col_top,
+ dst_buf2, dst_stride2, top_bsize, bsize2, dry_run, 0, 0);
+ extend_all(cpi, td, tile, 0, bsize2, top_bsize, mi_row + hbs,
+ mi_col + hbs, mi_row_top, mi_col_top, dry_run, dst_buf2,
+ dst_stride2);
+
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ xd->plane[i].dst.buf = dst_buf1[i];
+ xd->plane[i].dst.stride = dst_stride1[i];
+ av1_build_masked_inter_predictor_complex(
+ xd, dst_buf1[i], dst_stride1[i], dst_buf2[i], dst_stride2[i],
+ mi_row, mi_col, mi_row_top, mi_col_top, bsize, top_bsize,
+ PARTITION_VERT, i);
+ }
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ xd->plane[i].dst.buf = dst_buf[i];
+ xd->plane[i].dst.stride = dst_stride[i];
+ av1_build_masked_inter_predictor_complex(
+ xd, dst_buf[i], dst_stride[i], dst_buf1[i], dst_stride1[i], mi_row,
+ mi_col, mi_row_top, mi_col_top, bsize, top_bsize, PARTITION_HORZ,
+ i);
+ }
+ break;
+ case PARTITION_VERT_B:
+
+ predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf, dst_stride, top_bsize,
+ subsize, dry_run, 0, 0);
+ if (bsize < top_bsize)
+ extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
+ mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride);
+ else
+ extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
+ mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride, 3);
+
+ predict_b_extend(cpi, td, tile, 0, mi_row, mi_col + hbs, mi_row,
+ mi_col + hbs, mi_row_top, mi_col_top, dst_buf1,
+ dst_stride1, top_bsize, bsize2, dry_run, 0, 0);
+ extend_all(cpi, td, tile, 0, bsize2, top_bsize, mi_row, mi_col + hbs,
+ mi_row_top, mi_col_top, dry_run, dst_buf1, dst_stride1);
+
+ predict_b_extend(cpi, td, tile, 0, mi_row + hbs, mi_col + hbs,
+ mi_row + hbs, mi_col + hbs, mi_row_top, mi_col_top,
+ dst_buf2, dst_stride2, top_bsize, bsize2, dry_run, 0, 0);
+ extend_all(cpi, td, tile, 0, bsize2, top_bsize, mi_row + hbs,
+ mi_col + hbs, mi_row_top, mi_col_top, dry_run, dst_buf2,
+ dst_stride2);
+
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ xd->plane[i].dst.buf = dst_buf1[i];
+ xd->plane[i].dst.stride = dst_stride1[i];
+ av1_build_masked_inter_predictor_complex(
+ xd, dst_buf1[i], dst_stride1[i], dst_buf2[i], dst_stride2[i],
+ mi_row, mi_col, mi_row_top, mi_col_top, bsize, top_bsize,
+ PARTITION_HORZ, i);
+ }
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ xd->plane[i].dst.buf = dst_buf[i];
+ xd->plane[i].dst.stride = dst_stride[i];
+ av1_build_masked_inter_predictor_complex(
+ xd, dst_buf[i], dst_stride[i], dst_buf1[i], dst_stride1[i], mi_row,
+ mi_col, mi_row_top, mi_col_top, bsize, top_bsize, PARTITION_VERT,
+ i);
+ }
+ break;
+#endif // CONFIG_EXT_PARTITION_TYPES
+ default: assert(0);
+ }
+
+#if CONFIG_EXT_PARTITION_TYPES
+ if (bsize < top_bsize)
+ update_ext_partition_context(xd, mi_row, mi_col, subsize, bsize, partition);
+#else
+ if (bsize < top_bsize && (partition != PARTITION_SPLIT || bsize == BLOCK_8X8))
+ update_partition_context(xd, mi_row, mi_col, subsize, bsize);
+#endif // CONFIG_EXT_PARTITION_TYPES
+}
+
+static void rd_supertx_sb(const AV1_COMP *const cpi, ThreadData *td,
+ const TileInfo *const tile, int mi_row, int mi_col,
+ BLOCK_SIZE bsize, int *tmp_rate, int64_t *tmp_dist,
+ TX_TYPE *best_tx, PC_TREE *pc_tree) { // Picks the luma tx_type for a supertx block; updates *tmp_rate, *tmp_dist and *best_tx.
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ int plane, pnskip, skippable, skippable_uv, rate_uv, this_rate,
+ base_rate = *tmp_rate; // rate on entry; re-added to each candidate's rate below
+ int64_t sse, pnsse, sse_uv, this_dist, dist_uv;
+ uint8_t *dst_buf[3];
+ int dst_stride[3];
+ TX_SIZE tx_size;
+ MB_MODE_INFO *mbmi;
+ TX_TYPE tx_type, best_tx_nostx;
+#if CONFIG_EXT_TX
+ int ext_tx_set;
+#endif // CONFIG_EXT_TX
+ int tmp_rate_tx = 0, skip_tx = 0;
+ int64_t tmp_dist_tx = 0, rd_tx, bestrd_tx = INT64_MAX;
+
+ set_skip_context(xd, mi_row, mi_col);
+ set_mode_info_offsets(cpi, x, xd, mi_row, mi_col);
+ update_state_sb_supertx(cpi, td, tile, mi_row, mi_col, bsize, 1, pc_tree);
+ av1_setup_dst_planes(xd->plane, bsize, get_frame_new_buffer(cm), mi_row,
+ mi_col);
+ for (plane = 0; plane < MAX_MB_PLANE; plane++) {
+ dst_buf[plane] = xd->plane[plane].dst.buf;
+ dst_stride[plane] = xd->plane[plane].dst.stride;
+ }
+ predict_sb_complex(cpi, td, tile, mi_row, mi_col, mi_row, mi_col, 1, bsize,
+ bsize, dst_buf, dst_stride, pc_tree); // top-level call: top position/size equal the block's own, dry_run = 1
+
+ set_offsets_without_segment_id(cpi, tile, x, mi_row, mi_col, bsize);
+ set_segment_id_supertx(cpi, x, mi_row, mi_col, bsize);
+
+ mbmi = &xd->mi[0]->mbmi;
+ best_tx_nostx = mbmi->tx_type; // saved; written back to mbmi on exit
+
+ *best_tx = DCT_DCT;
+
+ // chroma: sum rate, distortion, sse and skip over planes 1..MAX_MB_PLANE-1
+ skippable_uv = 1;
+ rate_uv = 0;
+ dist_uv = 0;
+ sse_uv = 0;
+ for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
+#if CONFIG_VAR_TX
+ ENTROPY_CONTEXT ctxa[2 * MAX_MIB_SIZE];
+ ENTROPY_CONTEXT ctxl[2 * MAX_MIB_SIZE];
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ RD_STATS this_rd_stats;
+ av1_init_rd_stats(&this_rd_stats);
+
+ tx_size = max_txsize_lookup[bsize];
+ tx_size =
+ uv_txsize_lookup[bsize][tx_size][cm->subsampling_x][cm->subsampling_y];
+ av1_get_entropy_contexts(bsize, tx_size, pd, ctxa, ctxl);
+
+ av1_subtract_plane(x, bsize, plane);
+ av1_tx_block_rd_b(cpi, x, tx_size, 0, 0, plane, 0,
+ get_plane_block_size(bsize, pd), &ctxa[0], &ctxl[0],
+ &this_rd_stats);
+
+ this_rate = this_rd_stats.rate;
+ this_dist = this_rd_stats.dist;
+ pnsse = this_rd_stats.sse;
+ pnskip = this_rd_stats.skip;
+#else
+ tx_size = max_txsize_lookup[bsize];
+ tx_size =
+ uv_txsize_lookup[bsize][tx_size][cm->subsampling_x][cm->subsampling_y];
+ av1_subtract_plane(x, bsize, plane);
+ av1_txfm_rd_in_plane_supertx(x, cpi, &this_rate, &this_dist, &pnskip,
+ &pnsse, INT64_MAX, plane, bsize, tx_size, 0);
+#endif // CONFIG_VAR_TX
+
+ rate_uv += this_rate;
+ dist_uv += this_dist;
+ sse_uv += pnsse;
+ skippable_uv &= pnskip;
+ }
+
+ // luma: evaluate each allowed tx_type at tx_size = max_txsize_lookup[bsize]
+ tx_size = max_txsize_lookup[bsize];
+ av1_subtract_plane(x, bsize, 0);
+#if CONFIG_EXT_TX
+ ext_tx_set = get_ext_tx_set(tx_size, bsize, 1, cm->reduced_tx_set_used);
+#endif // CONFIG_EXT_TX
+ for (tx_type = DCT_DCT; tx_type < TX_TYPES; ++tx_type) {
+#if CONFIG_VAR_TX
+ ENTROPY_CONTEXT ctxa[2 * MAX_MIB_SIZE];
+ ENTROPY_CONTEXT ctxl[2 * MAX_MIB_SIZE];
+ const struct macroblockd_plane *const pd = &xd->plane[0];
+ RD_STATS this_rd_stats;
+#endif // CONFIG_VAR_TX
+
+#if CONFIG_EXT_TX
+ if (!ext_tx_used_inter[ext_tx_set][tx_type]) continue;
+#else
+ if (tx_size >= TX_32X32 && tx_type != DCT_DCT) continue;
+#endif // CONFIG_EXT_TX
+ mbmi->tx_type = tx_type;
+
+#if CONFIG_VAR_TX
+ av1_init_rd_stats(&this_rd_stats);
+ av1_get_entropy_contexts(bsize, tx_size, pd, ctxa, ctxl);
+ av1_tx_block_rd_b(cpi, x, tx_size, 0, 0, 0, 0, bsize, &ctxa[0], &ctxl[0],
+ &this_rd_stats);
+
+ this_rate = this_rd_stats.rate;
+ this_dist = this_rd_stats.dist;
+ pnsse = this_rd_stats.sse;
+ pnskip = this_rd_stats.skip;
+#else
+ av1_txfm_rd_in_plane_supertx(x, cpi, &this_rate, &this_dist, &pnskip,
+ &pnsse, INT64_MAX, 0, bsize, tx_size, 0);
+#endif // CONFIG_VAR_TX
+
+#if CONFIG_EXT_TX
+ if (get_ext_tx_types(tx_size, bsize, 1, cm->reduced_tx_set_used) > 1 &&
+ !xd->lossless[xd->mi[0]->mbmi.segment_id] && this_rate != INT_MAX) {
+ if (ext_tx_set > 0)
+ this_rate +=
+ cpi->inter_tx_type_costs[ext_tx_set][mbmi->tx_size][mbmi->tx_type]; // NOTE(review): indexed by mbmi->tx_size, not the local tx_size - confirm they agree here
+ }
+#else
+ if (tx_size < TX_32X32 && !xd->lossless[xd->mi[0]->mbmi.segment_id] &&
+ this_rate != INT_MAX) {
+ this_rate += cpi->inter_tx_type_costs[tx_size][mbmi->tx_type];
+ }
+#endif // CONFIG_EXT_TX
+ *tmp_rate = rate_uv + this_rate;
+ *tmp_dist = dist_uv + this_dist;
+ sse = sse_uv + pnsse;
+ skippable = skippable_uv && pnskip;
+ if (skippable) { // every plane reported skip: rate is only the skip flag
+ *tmp_rate = av1_cost_bit(av1_get_skip_prob(cm, xd), 1);
+ x->skip = 1;
+ } else {
+ if (RDCOST(x->rdmult, x->rddiv, *tmp_rate, *tmp_dist) <
+ RDCOST(x->rdmult, x->rddiv, 0, sse)) { // coding the residual costs less than zero rate with distortion = sse
+ *tmp_rate += av1_cost_bit(av1_get_skip_prob(cm, xd), 0);
+ x->skip = 0;
+ } else { // otherwise force skip: distortion becomes the sse
+ *tmp_dist = sse;
+ *tmp_rate = av1_cost_bit(av1_get_skip_prob(cm, xd), 1);
+ x->skip = 1;
+ }
+ }
+ *tmp_rate += base_rate;
+ rd_tx = RDCOST(x->rdmult, x->rddiv, *tmp_rate, *tmp_dist);
+ if (rd_tx < bestrd_tx * 0.99 || tx_type == DCT_DCT) { // DCT_DCT (tried first) is always accepted; later types must be below 99% of the best cost
+ *best_tx = tx_type;
+ bestrd_tx = rd_tx;
+ tmp_rate_tx = *tmp_rate;
+ tmp_dist_tx = *tmp_dist;
+ skip_tx = x->skip;
+ }
+ }
+ *tmp_rate = tmp_rate_tx;
+ *tmp_dist = tmp_dist_tx;
+ x->skip = skip_tx;
+#if CONFIG_VAR_TX
+ for (plane = 0; plane < 1; ++plane) // plane 0 only
+ memset(x->blk_skip[plane], x->skip,
+ sizeof(uint8_t) * pc_tree->none.num_4x4_blk);
+#endif // CONFIG_VAR_TX
+ xd->mi[0]->mbmi.tx_type = best_tx_nostx; // undo the tx_type writes made by the search loop
+}
+#endif // CONFIG_SUPERTX
diff --git a/third_party/aom/av1/encoder/encodeframe.h b/third_party/aom/av1/encoder/encodeframe.h
new file mode 100644
index 0000000000..08d6d20dee
--- /dev/null
+++ b/third_party/aom/av1/encoder/encodeframe.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_ENCODER_ENCODEFRAME_H_
+#define AV1_ENCODER_ENCODEFRAME_H_
+
+#include "aom/aom_integer.h"
+#include "av1/common/blockd.h"
+#include "av1/common/enums.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct macroblock;
+struct yv12_buffer_config;
+struct AV1_COMP;
+struct ThreadData;
+
+// Constants used in SOURCE_VAR_BASED_PARTITION
+#define VAR_HIST_MAX_BG_VAR 1000
+#define VAR_HIST_FACTOR 10
+#define VAR_HIST_BINS (VAR_HIST_MAX_BG_VAR / VAR_HIST_FACTOR + 1) // evaluates to 101
+#define VAR_HIST_LARGE_CUT_OFF 75
+#define VAR_HIST_SMALL_CUT_OFF 45
+
+void av1_setup_src_planes(struct macroblock *x,
+ const struct yv12_buffer_config *src, int mi_row,
+ int mi_col);
+
+void av1_encode_frame(struct AV1_COMP *cpi);
+
+void av1_init_tile_data(struct AV1_COMP *cpi);
+void av1_encode_tile(struct AV1_COMP *cpi, struct ThreadData *td, int tile_row,
+ int tile_col);
+
+void av1_set_variance_partition_thresholds(struct AV1_COMP *cpi, int q);
+
+void av1_update_tx_type_count(const struct AV1Common *cm, MACROBLOCKD *xd,
+#if CONFIG_TXK_SEL
+ int block, int plane, // these two parameters exist only when CONFIG_TXK_SEL is enabled
+#endif
+ BLOCK_SIZE bsize, TX_SIZE tx_size,
+ FRAME_COUNTS *counts);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AV1_ENCODER_ENCODEFRAME_H_
diff --git a/third_party/aom/av1/encoder/encodemb.c b/third_party/aom/av1/encoder/encodemb.c
new file mode 100644
index 0000000000..c450244b1c
--- /dev/null
+++ b/third_party/aom/av1/encoder/encodemb.c
@@ -0,0 +1,1671 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "./av1_rtcd.h"
+#include "./aom_config.h"
+#include "./aom_dsp_rtcd.h"
+
+#include "aom_dsp/bitwriter.h"
+#include "aom_dsp/quantize.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+
+#include "av1/common/idct.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/reconintra.h"
+#include "av1/common/scan.h"
+
+#include "av1/encoder/av1_quantize.h"
+#include "av1/encoder/encodemb.h"
+#if CONFIG_LV_MAP
+#include "av1/encoder/encodetxb.h"
+#endif
+#include "av1/encoder/hybrid_fwd_txfm.h"
+#include "av1/encoder/rd.h"
+#include "av1/encoder/tokenize.h"
+
+#if CONFIG_PVQ
+#include "av1/encoder/encint.h"
+#include "av1/common/partition.h"
+#include "av1/encoder/pvq_encoder.h"
+#endif
+
+#if CONFIG_CFL
+#include "av1/common/cfl.h"
+#endif
+
+// Returns nonzero when w or h is below 4; subtract_block() then uses the C version.
+static int check_subtract_block_size(int w, int h) { return w < 4 || h < 4; }
+
+static void subtract_block(const MACROBLOCKD *xd, int rows, int cols,
+ int16_t *diff, ptrdiff_t diff_stride,
+ const uint8_t *src8, ptrdiff_t src_stride,
+ const uint8_t *pred8, ptrdiff_t pred_stride) { // Forwards a rows x cols block to one of the aom (highbd_)subtract_block kernels.
+#if !CONFIG_HIGHBITDEPTH
+ (void)xd; // xd is read only in CONFIG_HIGHBITDEPTH builds
+#endif
+
+ if (check_subtract_block_size(rows, cols)) { // either dimension below 4: call the _c kernels explicitly
+#if CONFIG_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ aom_highbd_subtract_block_c(rows, cols, diff, diff_stride, src8,
+ src_stride, pred8, pred_stride, xd->bd);
+ return;
+ }
+#endif // CONFIG_HIGHBITDEPTH
+ aom_subtract_block_c(rows, cols, diff, diff_stride, src8, src_stride, pred8,
+ pred_stride);
+
+ return;
+ }
+
+#if CONFIG_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { // high-bitdepth frame: the kernel also receives xd->bd
+ aom_highbd_subtract_block(rows, cols, diff, diff_stride, src8, src_stride,
+ pred8, pred_stride, xd->bd);
+ return;
+ }
+#endif // CONFIG_HIGHBITDEPTH
+ aom_subtract_block(rows, cols, diff, diff_stride, src8, src_stride, pred8,
+ pred_stride);
+}
+
+void av1_subtract_txb(MACROBLOCK *x, int plane, BLOCK_SIZE plane_bsize,
+ int blk_col, int blk_row, TX_SIZE tx_size) { // Runs subtract_block() on one transform block of `plane`, writing into p->src_diff.
+ MACROBLOCKD *const xd = &x->e_mbd;
+ struct macroblock_plane *const p = &x->plane[plane];
+ const struct macroblockd_plane *const pd = &x->e_mbd.plane[plane];
+ const int diff_stride = block_size_wide[plane_bsize]; // src_diff rows are as wide as the plane block
+ const int src_stride = p->src.stride;
+ const int dst_stride = pd->dst.stride;
+ const int tx1d_width = tx_size_wide[tx_size];
+ const int tx1d_height = tx_size_high[tx_size];
+ uint8_t *dst =
+ &pd->dst.buf[(blk_row * dst_stride + blk_col) << tx_size_wide_log2[0]]; // (blk_row, blk_col) scaled by 1 << tx_size_wide_log2[0]; presumably 4-sample units - confirm
+ uint8_t *src =
+ &p->src.buf[(blk_row * src_stride + blk_col) << tx_size_wide_log2[0]];
+ int16_t *src_diff =
+ &p->src_diff[(blk_row * diff_stride + blk_col) << tx_size_wide_log2[0]];
+ subtract_block(xd, tx1d_height, tx1d_width, src_diff, diff_stride, src,
+ src_stride, dst, dst_stride); // pd->dst is passed as the prediction argument
+}
+
+void av1_subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) { // Runs subtract_block() over the whole plane block derived from bsize.
+ struct macroblock_plane *const p = &x->plane[plane];
+ const struct macroblockd_plane *const pd = &x->e_mbd.plane[plane];
+ const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
+ const int bw = block_size_wide[plane_bsize];
+ const int bh = block_size_high[plane_bsize];
+ const MACROBLOCKD *xd = &x->e_mbd;
+
+ subtract_block(xd, bh, bw, p->src_diff, bw, p->src.buf, p->src.stride,
+ pd->dst.buf, pd->dst.stride); // src_diff stride is the block width bw
+}
+
+// These numbers are empirically obtained. Indexed as [ref][plane_type].
+static const int plane_rd_mult[REF_TYPES][PLANE_TYPES] = {
+#if CONFIG_EC_ADAPT
+ { 10, 7 }, { 8, 5 },
+#else
+ { 10, 6 }, { 8, 5 },
+#endif
+};
+
+#define UPDATE_RD_COST() \
+ { \
+ rd_cost0 = RDCOST(rdmult, rddiv, rate0, error0); \
+ rd_cost1 = RDCOST(rdmult, rddiv, rate1, error1); \
+ } // expects rdmult, rddiv, rate0/1, error0/1 and rd_cost0/1 to be in scope at the expansion site
+
+static INLINE int64_t
+get_token_bit_costs(unsigned int token_costs[2][COEFF_CONTEXTS][ENTROPY_TOKENS],
+ int skip_eob, int ctx, int token) { // Looks up one entry of token_costs; the first index depends on CONFIG_NEW_TOKENSET.
+#if CONFIG_NEW_TOKENSET
+ (void)skip_eob; // unused in this configuration
+ return token_costs[token == ZERO_TOKEN || token == EOB_TOKEN][ctx][token]; // first index is 1 for ZERO_TOKEN or EOB_TOKEN, else 0
+#else
+ return token_costs[skip_eob][ctx][token];
+#endif
+}
+
+#define USE_GREEDY_OPTIMIZE_B 0
+
+#if USE_GREEDY_OPTIMIZE_B
+
+typedef struct av1_token_state {
+ int16_t token; // entropy token for qc (EOB_TOKEN for the sentinel entry)
+ tran_low_t qc; // quantized coefficient value
+ tran_low_t dqc; // matching dequantized coefficient value
+} av1_token_state;
+
+int av1_optimize_b(const AV1_COMMON *cm, MACROBLOCK *mb, int plane, int block,
+ TX_SIZE tx_size, int ctx) { // Greedy variant: may lower each coefficient by one step and move the eob; returns the new eob.
+#if !CONFIG_PVQ
+ MACROBLOCKD *const xd = &mb->e_mbd;
+ struct macroblock_plane *const p = &mb->plane[plane];
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ const int ref = is_inter_block(&xd->mi[0]->mbmi);
+ av1_token_state tokens[MAX_TX_SQUARE + 1][2];
+ uint8_t token_cache[MAX_TX_SQUARE];
+ const tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
+ tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+ tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+ const int eob = p->eobs[block];
+ const PLANE_TYPE plane_type = pd->plane_type;
+ const int16_t *const dequant_ptr = pd->dequant;
+ const uint8_t *const band_translate = get_band_translate(tx_size);
+ TX_TYPE tx_type = get_tx_type(plane_type, xd, block, tx_size);
+ const SCAN_ORDER *const scan_order =
+ get_scan(cm, tx_size, tx_type, is_inter_block(&xd->mi[0]->mbmi));
+ const int16_t *const scan = scan_order->scan;
+ const int16_t *const nb = scan_order->neighbors;
+ int dqv;
+ const int shift = av1_get_tx_scale(tx_size);
+#if CONFIG_AOM_QM
+ int seg_id = xd->mi[0]->mbmi.segment_id;
+ const qm_val_t *iqmatrix = pd->seg_iqmatrix[seg_id][!ref][tx_size];
+#endif
+#if CONFIG_NEW_QUANT
+ int dq = get_dq_profile_from_ctx(mb->qindex, ctx, ref, plane_type);
+ const dequant_val_type_nuq *dequant_val = pd->dequant_val_nuq[dq];
+#elif !CONFIG_AOM_QM
+ const int dq_step[2] = { dequant_ptr[0] >> shift, dequant_ptr[1] >> shift };
+#endif // CONFIG_NEW_QUANT
+ int sz = 0;
+ const int64_t rddiv = mb->rddiv;
+ int64_t rd_cost0, rd_cost1;
+ int16_t t0, t1;
+ int i, final_eob;
+#if CONFIG_HIGHBITDEPTH
+ const int cat6_bits = av1_get_cat6_extrabits_size(tx_size, xd->bd);
+#else
+ const int cat6_bits = av1_get_cat6_extrabits_size(tx_size, 8);
+#endif
+ unsigned int(*token_costs)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] =
+ mb->token_costs[txsize_sqr_map[tx_size]][plane_type][ref];
+ const int default_eob = tx_size_2d[tx_size];
+
+ assert((mb->qindex == 0) ^ (xd->lossless[xd->mi[0]->mbmi.segment_id] == 0));
+
+ assert((!plane_type && !plane) || (plane_type && plane));
+ assert(eob <= default_eob);
+
+ int64_t rdmult = (mb->rdmult * plane_rd_mult[ref][plane_type]) >> 1;
+/* CpuSpeedTest uses "--min-q=0 --max-q=0" and expects 100dB psnr
+* This creates conflict with search for a better EOB position
+* The line below is to make sure EOB search is disabled at this corner case.
+*/
+#if !CONFIG_NEW_QUANT && !CONFIG_AOM_QM
+ if (dq_step[1] <= 4) {
+ rdmult = 1;
+ }
+#endif
+
+ int64_t rate0, rate1;
+ for (i = 0; i < eob; i++) { // record the incoming quantization in tokens[i][0] and seed token_cache
+ const int rc = scan[i];
+ int x = qcoeff[rc];
+ t0 = av1_get_token(x);
+
+ tokens[i][0].qc = x;
+ tokens[i][0].token = t0;
+ tokens[i][0].dqc = dqcoeff[rc];
+
+ token_cache[rc] = av1_pt_energy_class[t0];
+ }
+ tokens[eob][0].token = EOB_TOKEN; // sentinel entry after the last coefficient
+ tokens[eob][0].qc = 0;
+ tokens[eob][0].dqc = 0;
+ tokens[eob][1] = tokens[eob][0];
+
+ unsigned int(*token_costs_ptr)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] =
+ token_costs;
+
+ final_eob = 0;
+
+ int64_t eob_cost0, eob_cost1;
+
+ const int ctx0 = ctx;
+ /* Record the r-d cost */
+ int64_t accu_rate = 0;
+ int64_t accu_error = 0;
+
+ rate0 = get_token_bit_costs(*(token_costs_ptr + band_translate[0]), 0, ctx0,
+ EOB_TOKEN);
+ int64_t best_block_rd_cost = RDCOST(rdmult, rddiv, rate0, accu_error); // starting point: EOB coded at position 0 (final_eob == 0)
+
+ // int64_t best_block_rd_cost_all0 = best_block_rd_cost;
+
+ int x_prev = 1;
+
+ for (i = 0; i < eob; i++) {
+ const int rc = scan[i];
+ int x = qcoeff[rc];
+ sz = -(x < 0); // 0 when x >= 0, -1 when x < 0
+
+ int band_cur = band_translate[i];
+ int ctx_cur = (i == 0) ? ctx : get_coef_context(nb, token_cache, i);
+ int token_tree_sel_cur = (x_prev == 0);
+
+ if (x == 0) {
+ // no need to search when x == 0
+ rate0 =
+ get_token_bit_costs(*(token_costs_ptr + band_cur), token_tree_sel_cur,
+ ctx_cur, tokens[i][0].token);
+ accu_rate += rate0;
+ x_prev = 0;
+ // accu_error does not change when x==0
+ } else {
+ /* Computing distortion
+ */
+ // compute the distortion for the first candidate
+ // and the distortion for quantizing to 0.
+ int dx0 = (-coeff[rc]) * (1 << shift);
+#if CONFIG_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ dx0 >>= xd->bd - 8;
+ }
+#endif
+ int64_t d0 = (int64_t)dx0 * dx0;
+
+ int x_a = x - 2 * sz - 1; // second candidate: x moved one step toward zero
+ int64_t d2, d2_a;
+
+ int dx;
+
+#if CONFIG_AOM_QM
+ int iwt = iqmatrix[rc];
+ dqv = dequant_ptr[rc != 0];
+ dqv = ((iwt * (int)dqv) + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS;
+#else
+ dqv = dequant_ptr[rc != 0];
+#endif
+
+ dx = (dqcoeff[rc] - coeff[rc]) * (1 << shift);
+#if CONFIG_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ dx >>= xd->bd - 8;
+ }
+#endif // CONFIG_HIGHBITDEPTH
+ d2 = (int64_t)dx * dx;
+
+ /* compute the distortion for the second candidate
+ * x_a = x - 2 * sz - 1 (x - 1 for positive x, x + 1 for negative x)
+ */
+ if (x_a != 0) {
+#if CONFIG_NEW_QUANT
+ dx = av1_dequant_coeff_nuq(x, dqv, dequant_val[band_translate[i]]) -
+ (coeff[rc] << shift); // NOTE(review): dequantizes x although this branch evaluates candidate x_a - confirm
+#if CONFIG_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ dx >>= xd->bd - 8;
+ }
+#endif // CONFIG_HIGHBITDEPTH
+#else // CONFIG_NEW_QUANT
+#if CONFIG_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ dx -= ((dqv >> (xd->bd - 8)) + sz) ^ sz;
+ } else {
+ dx -= (dqv + sz) ^ sz;
+ }
+#else
+ dx -= (dqv + sz) ^ sz;
+#endif // CONFIG_HIGHBITDEPTH
+#endif // CONFIG_NEW_QUANT
+ d2_a = (int64_t)dx * dx;
+ } else {
+ d2_a = d0; // x_a == 0: same distortion as quantizing to zero
+ }
+ /* Computing rates and r-d cost
+ */
+
+ int best_x, best_eob_x;
+ int64_t base_bits, next_bits0, next_bits1;
+ int64_t next_eob_bits0, next_eob_bits1;
+
+ // rate cost of x
+ base_bits = av1_get_token_cost(x, &t0, cat6_bits);
+ rate0 = base_bits + get_token_bit_costs(*(token_costs_ptr + band_cur),
+ token_tree_sel_cur, ctx_cur, t0);
+
+ base_bits = av1_get_token_cost(x_a, &t1, cat6_bits); // rate cost of x_a
+ rate1 = base_bits + get_token_bit_costs(*(token_costs_ptr + band_cur),
+ token_tree_sel_cur, ctx_cur, t1);
+
+ next_bits0 = 0;
+ next_bits1 = 0;
+ next_eob_bits0 = 0;
+ next_eob_bits1 = 0;
+
+ if (i < default_eob - 1) { // a next scan position exists: add its token cost under each candidate's context
+ int ctx_next, token_tree_sel_next;
+ int band_next = band_translate[i + 1];
+
+ token_cache[rc] = av1_pt_energy_class[t0];
+ ctx_next = get_coef_context(nb, token_cache, i + 1);
+ token_tree_sel_next = (x == 0);
+
+ next_bits0 = get_token_bit_costs(*(token_costs_ptr + band_next),
+ token_tree_sel_next, ctx_next,
+ tokens[i + 1][0].token);
+ next_eob_bits0 =
+ get_token_bit_costs(*(token_costs_ptr + band_next),
+ token_tree_sel_next, ctx_next, EOB_TOKEN);
+
+ token_cache[rc] = av1_pt_energy_class[t1];
+ ctx_next = get_coef_context(nb, token_cache, i + 1);
+ token_tree_sel_next = (x_a == 0);
+
+ next_bits1 = get_token_bit_costs(*(token_costs_ptr + band_next),
+ token_tree_sel_next, ctx_next,
+ tokens[i + 1][0].token);
+
+ if (x_a != 0) {
+ next_eob_bits1 =
+ get_token_bit_costs(*(token_costs_ptr + band_next),
+ token_tree_sel_next, ctx_next, EOB_TOKEN);
+ }
+ }
+
+ rd_cost0 = RDCOST(rdmult, rddiv, (rate0 + next_bits0), d2);
+ rd_cost1 = RDCOST(rdmult, rddiv, (rate1 + next_bits1), d2_a);
+
+ best_x = (rd_cost1 < rd_cost0); // 1 when x_a has the lower cost
+
+ eob_cost0 = RDCOST(rdmult, rddiv, (accu_rate + rate0 + next_eob_bits0),
+ (accu_error + d2 - d0));
+ eob_cost1 = eob_cost0;
+ if (x_a != 0) {
+ eob_cost1 = RDCOST(rdmult, rddiv, (accu_rate + rate1 + next_eob_bits1),
+ (accu_error + d2_a - d0));
+ best_eob_x = (eob_cost1 < eob_cost0);
+ } else {
+ best_eob_x = 0;
+ }
+
+ int dqc, dqc_a = 0;
+
+ dqc = dqcoeff[rc];
+ if (best_x + best_eob_x) { // x_a was selected by at least one of the two decisions
+ if (x_a != 0) {
+#if CONFIG_NEW_QUANT
+ dqc_a = av1_dequant_abscoeff_nuq(abs(x_a), dqv,
+ dequant_val[band_translate[i]]);
+ dqc_a = shift ? ROUND_POWER_OF_TWO(dqc_a, shift) : dqc_a;
+ if (sz) dqc_a = -dqc_a;
+#else
+// The 32x32 transform coefficient uses half quantization step size.
+// Account for the rounding difference in the dequantized coefficient
+// value when the quantization index is dropped from an even number
+// to an odd number.
+
+#if CONFIG_AOM_QM
+ tran_low_t offset = dqv >> shift;
+#else
+ tran_low_t offset = dq_step[rc != 0];
+#endif
+ if (shift & x_a) offset += (dqv & 0x01);
+
+ if (sz == 0)
+ dqc_a = dqcoeff[rc] - offset;
+ else
+ dqc_a = dqcoeff[rc] + offset;
+#endif // CONFIG_NEW_QUANT
+ } else {
+ dqc_a = 0;
+ } // if (x_a != 0)
+ }
+
+ // record the better quantized value
+ if (best_x) {
+ qcoeff[rc] = x_a;
+ dqcoeff[rc] = dqc_a;
+
+ accu_rate += rate1;
+ accu_error += d2_a - d0;
+ assert(d2_a <= d0);
+
+ token_cache[rc] = av1_pt_energy_class[t1];
+ } else {
+ accu_rate += rate0;
+ accu_error += d2 - d0;
+ assert(d2 <= d0);
+
+ token_cache[rc] = av1_pt_energy_class[t0];
+ }
+
+ x_prev = qcoeff[rc];
+
+ // determine whether to move the eob position to i+1
+ int64_t best_eob_cost_i = eob_cost0;
+
+ tokens[i][1].token = t0;
+ tokens[i][1].qc = x;
+ tokens[i][1].dqc = dqc;
+
+ if ((x_a != 0) && (best_eob_x)) {
+ best_eob_cost_i = eob_cost1;
+
+ tokens[i][1].token = t1;
+ tokens[i][1].qc = x_a;
+ tokens[i][1].dqc = dqc_a;
+ }
+
+ if (best_eob_cost_i < best_block_rd_cost) {
+ best_block_rd_cost = best_eob_cost_i;
+ final_eob = i + 1;
+ }
+ } // if (x==0)
+ } // for (i)
+
+ assert(final_eob <= eob);
+ if (final_eob > 0) { // the last kept coefficient takes the value recorded for the eob decision
+ assert(tokens[final_eob - 1][1].qc != 0);
+ i = final_eob - 1;
+ int rc = scan[i];
+ qcoeff[rc] = tokens[i][1].qc;
+ dqcoeff[rc] = tokens[i][1].dqc;
+ }
+
+ for (i = final_eob; i < eob; i++) { // zero every coefficient from the new eob up to the old one
+ int rc = scan[i];
+ qcoeff[rc] = 0;
+ dqcoeff[rc] = 0;
+ }
+
+ mb->plane[plane].eobs[block] = final_eob;
+ return final_eob;
+
+#else // !CONFIG_PVQ
+ (void)cm;
+ (void)tx_size;
+ (void)ctx;
+ struct macroblock_plane *const p = &mb->plane[plane];
+ return p->eobs[block]; // CONFIG_PVQ build: the stored eob is returned unchanged
+#endif // !CONFIG_PVQ
+}
+
+#else // USE_GREEDY_OPTIMIZE_B
+
+typedef struct av1_token_state { /* One candidate node of the av1_optimize_b() trellis. */
+ int64_t error; /* squared error accumulated from this node to the end of the block */
+ int rate; /* token rate accumulated along the same path */
+ int16_t next; /* scan index of the following trellis node */
+ int16_t token; /* token assigned to this candidate */
+ tran_low_t qc; /* candidate quantized coefficient */
+ tran_low_t dqc; /* dequantized value that goes with qc */
+ uint8_t best_index; /* which successor state (0 or 1) this candidate links to */
+} av1_token_state;
+
+int av1_optimize_b(const AV1_COMMON *cm, MACROBLOCK *mb, int plane, int block,
+ TX_SIZE tx_size, int ctx) { /*
+ * Rate-distortion optimizes the quantized coefficients of one transform
+ * block with a two-state trellis (this is the variant compiled when
+ * USE_GREEDY_OPTIMIZE_B is off). For every non-zero coefficient, state 0
+ * keeps the current level; state 1 is the level moved one step toward zero
+ * when the "shortcut" test on the dequantized magnitude passes, otherwise a
+ * copy of state 0. The cheaper path (per UPDATE_RD_COST, which is defined
+ * outside this view) is written back to qcoeff/dqcoeff.
+ *
+ * ctx is the entropy context used to cost the first token of the block.
+ * Returns the new end-of-block position, which is also stored in
+ * mb->plane[plane].eobs[block]. When CONFIG_PVQ is set nothing is optimized
+ * and the stored eob is returned unchanged.
+ */
+#if !CONFIG_PVQ
+ MACROBLOCKD *const xd = &mb->e_mbd;
+ struct macroblock_plane *const p = &mb->plane[plane];
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ const int ref = is_inter_block(&xd->mi[0]->mbmi);
+ av1_token_state tokens[MAX_TX_SQUARE + 1][2];
+ uint8_t token_cache[MAX_TX_SQUARE];
+ const tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
+ tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+ tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+ const int eob = p->eobs[block];
+ const PLANE_TYPE plane_type = pd->plane_type;
+ const int default_eob = tx_size_2d[tx_size];
+ const int16_t *const dequant_ptr = pd->dequant;
+ const uint8_t *const band_translate = get_band_translate(tx_size);
+ TX_TYPE tx_type = get_tx_type(plane_type, xd, block, tx_size);
+ const SCAN_ORDER *const scan_order =
+ get_scan(cm, tx_size, tx_type, is_inter_block(&xd->mi[0]->mbmi));
+ const int16_t *const scan = scan_order->scan;
+ const int16_t *const nb = scan_order->neighbors;
+ int dqv;
+ const int shift = av1_get_tx_scale(tx_size);
+#if CONFIG_AOM_QM
+ int seg_id = xd->mi[0]->mbmi.segment_id;
+ const qm_val_t *iqmatrix = pd->seg_iqmatrix[seg_id][!ref][tx_size];
+#endif
+#if CONFIG_NEW_QUANT
+ int dq = get_dq_profile_from_ctx(mb->qindex, ctx, ref, plane_type);
+ const dequant_val_type_nuq *dequant_val = pd->dequant_val_nuq[dq];
+#elif !CONFIG_AOM_QM
+ const int dq_step[2] = { dequant_ptr[0] >> shift, dequant_ptr[1] >> shift };
+#endif // CONFIG_NEW_QUANT
+ int next = eob, sz = 0;
+ const int64_t rdmult = (mb->rdmult * plane_rd_mult[ref][plane_type]) >> 1;
+ const int64_t rddiv = mb->rddiv;
+ int64_t rd_cost0, rd_cost1;
+ int rate0, rate1;
+ int64_t error0, error1;
+ int16_t t0, t1;
+ int best, band = (eob < default_eob) ? band_translate[eob]
+ : band_translate[eob - 1];
+ int pt, i, final_eob;
+#if CONFIG_HIGHBITDEPTH
+ const int cat6_bits = av1_get_cat6_extrabits_size(tx_size, xd->bd);
+#else
+ const int cat6_bits = av1_get_cat6_extrabits_size(tx_size, 8);
+#endif
+ unsigned int(*token_costs)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] =
+ mb->token_costs[txsize_sqr_map[tx_size]][plane_type][ref];
+ const uint16_t *band_counts = &band_count_table[tx_size][band];
+ uint16_t band_left = eob - band_cum_count_table[tx_size][band] + 1;
+ int shortcut = 0;
+ int next_shortcut = 0;
+
+#if CONFIG_EXT_DELTA_Q
+ const int qindex = cm->seg.enabled
+ ? av1_get_qindex(&cm->seg, xd->mi[0]->mbmi.segment_id,
+ cm->base_qindex)
+ : cm->base_qindex;
+ if (qindex == 0) {
+ assert((qindex == 0) ^ (xd->lossless[xd->mi[0]->mbmi.segment_id] == 0));
+ }
+#else
+ assert((mb->qindex == 0) ^ (xd->lossless[xd->mi[0]->mbmi.segment_id] == 0));
+#endif
+
+ token_costs += band;
+
+ assert((!plane_type && !plane) || (plane_type && plane));
+ assert(eob <= default_eob);
+
+ /* Now set up a Viterbi trellis to evaluate alternative roundings. */
+ /* Initialize the sentinel node of the trellis. */
+ tokens[eob][0].rate = 0;
+ tokens[eob][0].error = 0;
+ tokens[eob][0].next = default_eob;
+ tokens[eob][0].token = EOB_TOKEN;
+ tokens[eob][0].qc = 0;
+ tokens[eob][1] = tokens[eob][0];
+
+ for (i = 0; i < eob; i++) { /* seed each position with the rate and energy class of its current token */
+ const int rc = scan[i];
+ tokens[i][0].rate = av1_get_token_cost(qcoeff[rc], &t0, cat6_bits);
+ tokens[i][0].token = t0;
+ token_cache[rc] = av1_pt_energy_class[t0];
+ }
+
+ for (i = eob; i-- > 0;) { /* visit coefficients from scan position eob - 1 down to 0 */
+ int base_bits, dx;
+ int64_t d2;
+ const int rc = scan[i];
+ int x = qcoeff[rc];
+#if CONFIG_AOM_QM
+ int iwt = iqmatrix[rc];
+ dqv = dequant_ptr[rc != 0];
+ dqv = ((iwt * (int)dqv) + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS;
+#else
+ dqv = dequant_ptr[rc != 0];
+#endif
+ next_shortcut = shortcut;
+
+ /* Only add a trellis state for non-zero coefficients. */
+ if (UNLIKELY(x)) {
+ error0 = tokens[next][0].error;
+ error1 = tokens[next][1].error;
+ /* Evaluate the first possibility for this state. */
+ rate0 = tokens[next][0].rate;
+ rate1 = tokens[next][1].rate;
+
+ if (next_shortcut) {
+ /* Consider both possible successor states. */
+ if (next < default_eob) {
+ pt = get_coef_context(nb, token_cache, i + 1);
+ rate0 +=
+ get_token_bit_costs(*token_costs, 0, pt, tokens[next][0].token);
+ rate1 +=
+ get_token_bit_costs(*token_costs, 0, pt, tokens[next][1].token);
+ }
+ UPDATE_RD_COST();
+ /* And pick the best. */
+ best = rd_cost1 < rd_cost0;
+ } else {
+ if (next < default_eob) {
+ pt = get_coef_context(nb, token_cache, i + 1);
+ rate0 +=
+ get_token_bit_costs(*token_costs, 0, pt, tokens[next][0].token);
+ }
+ best = 0;
+ }
+
+ dx = (dqcoeff[rc] - coeff[rc]) * (1 << shift);
+#if CONFIG_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ dx >>= xd->bd - 8;
+ }
+#endif // CONFIG_HIGHBITDEPTH
+ d2 = (int64_t)dx * dx;
+ tokens[i][0].rate += (best ? rate1 : rate0);
+ tokens[i][0].error = d2 + (best ? error1 : error0);
+ tokens[i][0].next = next;
+ tokens[i][0].qc = x;
+ tokens[i][0].dqc = dqcoeff[rc];
+ tokens[i][0].best_index = best;
+
+ /* Evaluate the second possibility for this state. */
+ rate0 = tokens[next][0].rate;
+ rate1 = tokens[next][1].rate;
+
+ // The threshold of 3 is empirically obtained.
+ if (UNLIKELY(abs(x) > 3)) {
+ shortcut = 0;
+ } else {
+#if CONFIG_NEW_QUANT
+ shortcut = ((av1_dequant_abscoeff_nuq(abs(x), dqv,
+ dequant_val[band_translate[i]]) >
+ (abs(coeff[rc]) << shift)) &&
+ (av1_dequant_abscoeff_nuq(abs(x) - 1, dqv,
+ dequant_val[band_translate[i]]) <
+ (abs(coeff[rc]) << shift)));
+#else // CONFIG_NEW_QUANT
+#if CONFIG_AOM_QM
+ if ((abs(x) * dequant_ptr[rc != 0] * iwt >
+ ((abs(coeff[rc]) << shift) << AOM_QM_BITS)) &&
+ (abs(x) * dequant_ptr[rc != 0] * iwt <
+ (((abs(coeff[rc]) << shift) + dequant_ptr[rc != 0])
+ << AOM_QM_BITS)))
+#else
+ if ((abs(x) * dequant_ptr[rc != 0] > (abs(coeff[rc]) << shift)) &&
+ (abs(x) * dequant_ptr[rc != 0] <
+ (abs(coeff[rc]) << shift) + dequant_ptr[rc != 0]))
+#endif // CONFIG_AOM_QM
+ shortcut = 1;
+ else
+ shortcut = 0;
+#endif // CONFIG_NEW_QUANT
+ }
+
+ if (shortcut) {
+ sz = -(x < 0); /* 0 for positive x, -1 for negative x */
+ x -= 2 * sz + 1; /* step x one level toward zero */
+ } else {
+ tokens[i][1] = tokens[i][0]; /* no alternative level: both states share the unmodified candidate */
+ next = i;
+
+ if (UNLIKELY(!(--band_left))) {
+ --band_counts;
+ band_left = *band_counts;
+ --token_costs;
+ }
+ continue;
+ }
+
+ /* Consider both possible successor states. */
+ if (!x) {
+ /* If we reduced this coefficient to zero, check to see if
+ * we need to move the EOB back here.
+ */
+ t0 = tokens[next][0].token == EOB_TOKEN ? EOB_TOKEN : ZERO_TOKEN;
+ t1 = tokens[next][1].token == EOB_TOKEN ? EOB_TOKEN : ZERO_TOKEN;
+ base_bits = 0;
+ } else {
+ base_bits = av1_get_token_cost(x, &t0, cat6_bits);
+ t1 = t0;
+ }
+
+ if (next_shortcut) {
+ if (LIKELY(next < default_eob)) {
+ if (t0 != EOB_TOKEN) {
+ token_cache[rc] = av1_pt_energy_class[t0];
+ pt = get_coef_context(nb, token_cache, i + 1);
+ rate0 += get_token_bit_costs(*token_costs, !x, pt,
+ tokens[next][0].token);
+ }
+ if (t1 != EOB_TOKEN) {
+ token_cache[rc] = av1_pt_energy_class[t1];
+ pt = get_coef_context(nb, token_cache, i + 1);
+ rate1 += get_token_bit_costs(*token_costs, !x, pt,
+ tokens[next][1].token);
+ }
+ }
+
+ UPDATE_RD_COST();
+ /* And pick the best. */
+ best = rd_cost1 < rd_cost0;
+ } else {
+ // The two states in next stage are identical.
+ if (next < default_eob && t0 != EOB_TOKEN) {
+ token_cache[rc] = av1_pt_energy_class[t0];
+ pt = get_coef_context(nb, token_cache, i + 1);
+ rate0 +=
+ get_token_bit_costs(*token_costs, !x, pt, tokens[next][0].token);
+ }
+ best = 0;
+ }
+
+#if CONFIG_NEW_QUANT
+ dx = av1_dequant_coeff_nuq(x, dqv, dequant_val[band_translate[i]]) -
+ (coeff[rc] << shift);
+#if CONFIG_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ dx >>= xd->bd - 8;
+ }
+#endif // CONFIG_HIGHBITDEPTH
+#else // CONFIG_NEW_QUANT
+#if CONFIG_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ dx -= ((dqv >> (xd->bd - 8)) + sz) ^ sz;
+ } else {
+ dx -= (dqv + sz) ^ sz;
+ }
+#else
+ dx -= (dqv + sz) ^ sz;
+#endif // CONFIG_HIGHBITDEPTH
+#endif // CONFIG_NEW_QUANT
+ d2 = (int64_t)dx * dx;
+
+ tokens[i][1].rate = base_bits + (best ? rate1 : rate0);
+ tokens[i][1].error = d2 + (best ? error1 : error0);
+ tokens[i][1].next = next;
+ tokens[i][1].token = best ? t1 : t0;
+ tokens[i][1].qc = x;
+
+ if (x) {
+#if CONFIG_NEW_QUANT
+ tokens[i][1].dqc = av1_dequant_abscoeff_nuq(
+ abs(x), dqv, dequant_val[band_translate[i]]);
+ tokens[i][1].dqc = shift ? ROUND_POWER_OF_TWO(tokens[i][1].dqc, shift)
+ : tokens[i][1].dqc;
+ if (sz) tokens[i][1].dqc = -tokens[i][1].dqc;
+#else
+// The 32x32 transform coefficient uses half quantization step size.
+// Account for the rounding difference in the dequantized coefficient
+// value when the quantization index is dropped from an even number
+// to an odd number.
+
+#if CONFIG_AOM_QM
+ tran_low_t offset = dqv >> shift;
+#else
+ tran_low_t offset = dq_step[rc != 0];
+#endif
+ if (shift & x) offset += (dqv & 0x01);
+
+ if (sz == 0)
+ tokens[i][1].dqc = dqcoeff[rc] - offset;
+ else
+ tokens[i][1].dqc = dqcoeff[rc] + offset;
+#endif // CONFIG_NEW_QUANT
+ } else {
+ tokens[i][1].dqc = 0;
+ }
+
+ tokens[i][1].best_index = best;
+ /* Finally, make this the new head of the trellis. */
+ next = i;
+ } else {
+ /* There's no choice to make for a zero coefficient, so we don't
+ * add a new trellis node, but we do need to update the costs.
+ */
+ t0 = tokens[next][0].token;
+ t1 = tokens[next][1].token;
+ pt = get_coef_context(nb, token_cache, i + 1);
+ /* Update the cost of each path if we're past the EOB token. */
+ if (t0 != EOB_TOKEN) {
+ tokens[next][0].rate += get_token_bit_costs(*token_costs, 1, pt, t0);
+ tokens[next][0].token = ZERO_TOKEN;
+ }
+ if (t1 != EOB_TOKEN) {
+ tokens[next][1].rate += get_token_bit_costs(*token_costs, 1, pt, t1);
+ tokens[next][1].token = ZERO_TOKEN;
+ }
+ tokens[i][0].best_index = tokens[i][1].best_index = 0;
+ shortcut = (tokens[next][0].rate != tokens[next][1].rate);
+ /* Don't update next, because we didn't add a new node. */
+ }
+
+ if (UNLIKELY(!(--band_left))) { /* finished this band: step back to the previous band's count and cost table */
+ --band_counts;
+ band_left = *band_counts;
+ --token_costs;
+ }
+ }
+
+ /* Now pick the best path through the whole trellis. */
+ rate0 = tokens[next][0].rate;
+ rate1 = tokens[next][1].rate;
+ error0 = tokens[next][0].error;
+ error1 = tokens[next][1].error;
+ t0 = tokens[next][0].token;
+ t1 = tokens[next][1].token;
+ rate0 += get_token_bit_costs(*token_costs, 0, ctx, t0);
+ rate1 += get_token_bit_costs(*token_costs, 0, ctx, t1);
+ UPDATE_RD_COST();
+ best = rd_cost1 < rd_cost0;
+
+ final_eob = -1;
+
+ for (i = next; i < eob; i = next) { /* follow the selected path and write the chosen levels back */
+ const int x = tokens[i][best].qc;
+ const int rc = scan[i];
+ if (x) final_eob = i; /* remember the last non-zero scan position */
+ qcoeff[rc] = x;
+ dqcoeff[rc] = tokens[i][best].dqc;
+
+ next = tokens[i][best].next;
+ best = tokens[i][best].best_index;
+ }
+ final_eob++;
+
+ mb->plane[plane].eobs[block] = final_eob;
+ assert(final_eob <= default_eob);
+ return final_eob;
+#else // !CONFIG_PVQ
+ (void)cm;
+ (void)tx_size;
+ (void)ctx;
+ struct macroblock_plane *const p = &mb->plane[plane];
+ return p->eobs[block];
+#endif // !CONFIG_PVQ
+}
+
+#endif // USE_GREEDY_OPTIMIZE_B
+
+#if !CONFIG_PVQ
+#if CONFIG_HIGHBITDEPTH
+typedef enum QUANT_FUNC { /* second index into quant_func_list: low vs. high bit depth */
+ QUANT_FUNC_LOWBD = 0,
+ QUANT_FUNC_HIGHBD = 1,
+ QUANT_FUNC_TYPES = 2
+} QUANT_FUNC;
+
+static AV1_QUANT_FACADE /* dispatch table indexed [AV1_XFORM_QUANT value][QUANT_FUNC] */
+ quant_func_list[AV1_XFORM_QUANT_TYPES][QUANT_FUNC_TYPES] = {
+#if !CONFIG_NEW_QUANT
+ { av1_quantize_fp_facade, av1_highbd_quantize_fp_facade },
+ { av1_quantize_b_facade, av1_highbd_quantize_b_facade },
+ { av1_quantize_dc_facade, av1_highbd_quantize_dc_facade },
+#else // !CONFIG_NEW_QUANT
+ { av1_quantize_fp_nuq_facade, av1_highbd_quantize_fp_nuq_facade },
+ { av1_quantize_b_nuq_facade, av1_highbd_quantize_b_nuq_facade },
+ { av1_quantize_dc_nuq_facade, av1_highbd_quantize_dc_nuq_facade },
+#endif // !CONFIG_NEW_QUANT
+ { NULL, NULL } /* presumably the AV1_XFORM_QUANT_SKIP_QUANT row, which av1_xform_quant never calls through -- confirm against the enum */
+ };
+
+#else
+
+typedef enum QUANT_FUNC { /* second index into quant_func_list: only low bit depth in this build */
+ QUANT_FUNC_LOWBD = 0,
+ QUANT_FUNC_TYPES = 1
+} QUANT_FUNC;
+
+static AV1_QUANT_FACADE quant_func_list[AV1_XFORM_QUANT_TYPES] /* dispatch table indexed [AV1_XFORM_QUANT value][QUANT_FUNC] */
+ [QUANT_FUNC_TYPES] = {
+#if !CONFIG_NEW_QUANT
+ { av1_quantize_fp_facade },
+ { av1_quantize_b_facade },
+ { av1_quantize_dc_facade },
+#else // !CONFIG_NEW_QUANT
+ { av1_quantize_fp_nuq_facade },
+ { av1_quantize_b_nuq_facade },
+ { av1_quantize_dc_nuq_facade },
+#endif // !CONFIG_NEW_QUANT
+ { NULL } /* presumably the AV1_XFORM_QUANT_SKIP_QUANT row -- confirm against the enum */
+ };
+#endif // CONFIG_HIGHBITDEPTH
+#endif // !CONFIG_PVQ
+
+void av1_xform_quant(const AV1_COMMON *cm, MACROBLOCK *x, int plane, int block,
+ int blk_row, int blk_col, BLOCK_SIZE plane_bsize,
+ TX_SIZE tx_size, int ctx,
+ AV1_XFORM_QUANT xform_quant_idx) { /*
+ * Forward-transforms one transform block and quantizes the result.
+ *
+ * Without CONFIG_PVQ: the residual at (blk_row, blk_col) in p->src_diff is
+ * transformed into coeff and, unless xform_quant_idx is
+ * AV1_XFORM_QUANT_SKIP_QUANT, quantized through quant_func_list (or
+ * av1_quantize_skip when x->skip_block is set), which fills qcoeff, dqcoeff
+ * and the block's eob.
+ *
+ * With CONFIG_PVQ: the source and the prediction are transformed separately
+ * and handed to av1_pvq_encode_helper; xform_quant_idx is ignored and
+ * x->pvq_skip[plane] records whether the block was skipped.
+ */
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+#if !(CONFIG_PVQ || CONFIG_DAALA_DIST)
+ const struct macroblock_plane *const p = &x->plane[plane];
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+#else
+ struct macroblock_plane *const p = &x->plane[plane];
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+#endif
+ PLANE_TYPE plane_type = get_plane_type(plane);
+ TX_TYPE tx_type = get_tx_type(plane_type, xd, block, tx_size);
+ const int is_inter = is_inter_block(mbmi);
+ const SCAN_ORDER *const scan_order = get_scan(cm, tx_size, tx_type, is_inter);
+ tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
+ tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+ tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+ uint16_t *const eob = &p->eobs[block];
+ const int diff_stride = block_size_wide[plane_bsize];
+#if CONFIG_AOM_QM
+ int seg_id = mbmi->segment_id;
+ const qm_val_t *qmatrix = pd->seg_qmatrix[seg_id][!is_inter][tx_size];
+ const qm_val_t *iqmatrix = pd->seg_iqmatrix[seg_id][!is_inter][tx_size];
+#endif
+
+ FWD_TXFM_PARAM fwd_txfm_param;
+
+#if CONFIG_PVQ || CONFIG_DAALA_DIST
+ uint8_t *dst;
+ int16_t *pred;
+ const int dst_stride = pd->dst.stride;
+ int tx_blk_size;
+ int i, j;
+#endif
+
+#if !CONFIG_PVQ
+ const int tx2d_size = tx_size_2d[tx_size];
+ QUANT_PARAM qparam;
+ const int16_t *src_diff;
+
+ src_diff =
+ &p->src_diff[(blk_row * diff_stride + blk_col) << tx_size_wide_log2[0]];
+ qparam.log_scale = av1_get_tx_scale(tx_size);
+#if CONFIG_NEW_QUANT
+ qparam.tx_size = tx_size;
+ qparam.dq = get_dq_profile_from_ctx(x->qindex, ctx, is_inter, plane_type);
+#endif // CONFIG_NEW_QUANT
+#if CONFIG_AOM_QM
+ qparam.qmatrix = qmatrix;
+ qparam.iqmatrix = iqmatrix;
+#endif // CONFIG_AOM_QM
+#else
+ tran_low_t *ref_coeff = BLOCK_OFFSET(pd->pvq_ref_coeff, block);
+ int skip = 1;
+ PVQ_INFO *pvq_info = NULL;
+ uint8_t *src;
+ int16_t *src_int16;
+ const int src_stride = p->src.stride;
+
+ (void)ctx;
+ (void)scan_order;
+ (void)qcoeff;
+
+ if (x->pvq_coded) {
+ assert(block < MAX_PVQ_BLOCKS_IN_SB);
+ pvq_info = &x->pvq[block][plane];
+ }
+ src = &p->src.buf[(blk_row * src_stride + blk_col) << tx_size_wide_log2[0]];
+ src_int16 =
+ &p->src_int16[(blk_row * diff_stride + blk_col) << tx_size_wide_log2[0]];
+
+ // transform block size in pixels
+ tx_blk_size = tx_size_wide[tx_size];
+#if CONFIG_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ for (j = 0; j < tx_blk_size; j++)
+ for (i = 0; i < tx_blk_size; i++)
+ src_int16[diff_stride * j + i] =
+ CONVERT_TO_SHORTPTR(src)[src_stride * j + i];
+ } else {
+#endif // CONFIG_HIGHBITDEPTH
+ for (j = 0; j < tx_blk_size; j++)
+ for (i = 0; i < tx_blk_size; i++)
+ src_int16[diff_stride * j + i] = src[src_stride * j + i];
+#if CONFIG_HIGHBITDEPTH
+ }
+#endif // CONFIG_HIGHBITDEPTH
+#endif
+
+#if CONFIG_PVQ || CONFIG_DAALA_DIST
+ dst = &pd->dst.buf[(blk_row * dst_stride + blk_col) << tx_size_wide_log2[0]];
+ pred = &pd->pred[(blk_row * diff_stride + blk_col) << tx_size_wide_log2[0]];
+
+ // transform block size in pixels
+ tx_blk_size = tx_size_wide[tx_size];
+
+// copy uint8 orig and predicted block to int16 buffer
+// in order to use existing VP10 transform functions
+#if CONFIG_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ for (j = 0; j < tx_blk_size; j++)
+ for (i = 0; i < tx_blk_size; i++)
+ pred[diff_stride * j + i] =
+ CONVERT_TO_SHORTPTR(dst)[dst_stride * j + i];
+ } else {
+#endif // CONFIG_HIGHBITDEPTH
+ for (j = 0; j < tx_blk_size; j++)
+ for (i = 0; i < tx_blk_size; i++)
+ pred[diff_stride * j + i] = dst[dst_stride * j + i];
+#if CONFIG_HIGHBITDEPTH
+ }
+#endif // CONFIG_HIGHBITDEPTH
+#endif
+
+ (void)ctx;
+
+ fwd_txfm_param.tx_type = tx_type;
+ fwd_txfm_param.tx_size = tx_size;
+ fwd_txfm_param.lossless = xd->lossless[mbmi->segment_id];
+
+#if !CONFIG_PVQ
+#if CONFIG_HIGHBITDEPTH
+ fwd_txfm_param.bd = xd->bd;
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ av1_highbd_fwd_txfm(src_diff, coeff, diff_stride, &fwd_txfm_param);
+ if (xform_quant_idx != AV1_XFORM_QUANT_SKIP_QUANT) {
+ if (LIKELY(!x->skip_block)) {
+ quant_func_list[xform_quant_idx][QUANT_FUNC_HIGHBD](
+ coeff, tx2d_size, p, qcoeff, pd, dqcoeff, eob, scan_order, &qparam);
+ } else {
+ av1_quantize_skip(tx2d_size, qcoeff, dqcoeff, eob);
+ }
+ }
+#if CONFIG_LV_MAP
+ p->txb_entropy_ctx[block] =
+ (uint8_t)av1_get_txb_entropy_context(qcoeff, scan_order, *eob);
+#endif // CONFIG_LV_MAP
+ return; /* high-bitdepth path is complete; skip the low-bitdepth code below */
+ }
+#endif // CONFIG_HIGHBITDEPTH
+ av1_fwd_txfm(src_diff, coeff, diff_stride, &fwd_txfm_param);
+ if (xform_quant_idx != AV1_XFORM_QUANT_SKIP_QUANT) {
+ if (LIKELY(!x->skip_block)) {
+ quant_func_list[xform_quant_idx][QUANT_FUNC_LOWBD](
+ coeff, tx2d_size, p, qcoeff, pd, dqcoeff, eob, scan_order, &qparam);
+ } else {
+ av1_quantize_skip(tx2d_size, qcoeff, dqcoeff, eob);
+ }
+ }
+#if CONFIG_LV_MAP
+ p->txb_entropy_ctx[block] =
+ (uint8_t)av1_get_txb_entropy_context(qcoeff, scan_order, *eob);
+#endif // CONFIG_LV_MAP
+#else // #if !CONFIG_PVQ
+ (void)xform_quant_idx;
+#if CONFIG_HIGHBITDEPTH
+ fwd_txfm_param.bd = xd->bd;
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ av1_highbd_fwd_txfm(src_int16, coeff, diff_stride, &fwd_txfm_param);
+ av1_highbd_fwd_txfm(pred, ref_coeff, diff_stride, &fwd_txfm_param);
+ } else {
+#endif
+ av1_fwd_txfm(src_int16, coeff, diff_stride, &fwd_txfm_param);
+ av1_fwd_txfm(pred, ref_coeff, diff_stride, &fwd_txfm_param);
+#if CONFIG_HIGHBITDEPTH
+ }
+#endif
+
+ // PVQ for inter mode block
+ if (!x->skip_block) {
+ PVQ_SKIP_TYPE ac_dc_coded =
+ av1_pvq_encode_helper(x,
+ coeff, // target original vector
+ ref_coeff, // reference vector
+ dqcoeff, // de-quantized vector
+ eob, // End of Block marker
+ pd->dequant, // aom's quantizers
+ plane, // image plane
+ tx_size, // block size in log_2 - 2
+ tx_type,
+ &x->rate, // rate measured
+ x->pvq_speed,
+ pvq_info); // PVQ info for a block
+ skip = ac_dc_coded == PVQ_SKIP;
+ }
+ x->pvq_skip[plane] = skip;
+
+ if (!skip) mbmi->skip = 0; /* any coded block clears the block-level skip flag */
+#endif // #if !CONFIG_PVQ
+}
+
+static void encode_block(int plane, int block, int blk_row, int blk_col,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg) { /*
+ * Per-transform-block callback used when encoding a block's residual:
+ * transform and quantize with AV1_XFORM_QUANT_FP, run av1_optimize_b on
+ * non-lossless blocks that have coefficients, update the above/left entropy
+ * contexts, and finally call av1_inverse_transform_block on the destination
+ * buffer. arg is a struct encode_b_args; *(args->skip) is cleared as soon as
+ * a block has something coded. Returns early when nothing was coded.
+ */
+ struct encode_b_args *const args = arg;
+ AV1_COMMON *cm = args->cm;
+ MACROBLOCK *const x = args->x;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ int ctx;
+ struct macroblock_plane *const p = &x->plane[plane];
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+ uint8_t *dst;
+#if !CONFIG_PVQ
+ ENTROPY_CONTEXT *a, *l;
+#endif
+#if CONFIG_VAR_TX
+ int bw = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
+#endif
+ dst = &pd->dst
+ .buf[(blk_row * pd->dst.stride + blk_col) << tx_size_wide_log2[0]];
+
+#if !CONFIG_PVQ
+ a = &args->ta[blk_col];
+ l = &args->tl[blk_row];
+#if CONFIG_VAR_TX
+ ctx = get_entropy_context(tx_size, a, l);
+#else
+ ctx = combine_entropy_contexts(*a, *l);
+#endif
+#else
+ ctx = 0;
+#endif // CONFIG_PVQ
+
+#if CONFIG_VAR_TX
+ // Assert not magic number (uninitialized).
+ assert(x->blk_skip[plane][blk_row * bw + blk_col] != 234);
+
+ if (x->blk_skip[plane][blk_row * bw + blk_col] == 0) {
+#else
+ {
+#endif
+ av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
+ ctx, AV1_XFORM_QUANT_FP);
+ }
+#if CONFIG_VAR_TX
+ else {
+ p->eobs[block] = 0; /* block flagged in blk_skip: treat it as having no coefficients */
+ }
+#endif
+
+#if !CONFIG_PVQ
+ if (p->eobs[block] && !xd->lossless[xd->mi[0]->mbmi.segment_id])
+ av1_optimize_b(cm, x, plane, block, tx_size, ctx);
+
+ av1_set_txb_context(x, plane, block, tx_size, a, l);
+
+ if (p->eobs[block]) *(args->skip) = 0;
+
+ if (p->eobs[block] == 0) return;
+#else
+ (void)ctx;
+ if (!x->pvq_skip[plane]) *(args->skip) = 0;
+
+ if (x->pvq_skip[plane]) return;
+#endif
+ TX_TYPE tx_type = get_tx_type(pd->plane_type, xd, block, tx_size);
+ av1_inverse_transform_block(xd, dqcoeff, tx_type, tx_size, dst,
+ pd->dst.stride, p->eobs[block]);
+}
+
+#if CONFIG_VAR_TX
+// Encodes one inter transform block when variable transform sizes are in use.
+//
+// If tx_size equals the transform size recorded for this position, the block
+// is handed to encode_block(). Otherwise it is split into four quadrants of
+// size sub_tx_size_map[tx_size] and each quadrant inside the visible area is
+// encoded recursively; `block` advances by the quadrant's unit count after
+// every quadrant that is visited. Positions outside max_block_high/wide are
+// ignored. `arg` is a struct encode_b_args.
+static void encode_block_inter(int plane, int block, int blk_row, int blk_col,
+                               BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+                               void *arg) {
+  struct encode_b_args *const args = arg;
+  MACROBLOCK *const x = args->x;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  const BLOCK_SIZE bsize = txsize_to_bsize[tx_size];
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+  const int tx_row = blk_row >> (1 - pd->subsampling_y);
+  const int tx_col = blk_col >> (1 - pd->subsampling_x);
+  TX_SIZE plane_tx_size;
+  const int max_blocks_high = max_block_high(xd, plane_bsize, plane);
+  const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
+
+  if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
+
+  plane_tx_size =
+      plane ? uv_txsize_lookup[bsize][mbmi->inter_tx_size[tx_row][tx_col]][0][0]
+            : mbmi->inter_tx_size[tx_row][tx_col];
+
+  if (tx_size == plane_tx_size) {
+    encode_block(plane, block, blk_row, blk_col, plane_bsize, tx_size, arg);
+  } else {
+    // Check the index before sub_tx_size_map is read with it; the original
+    // asserted only after the lookup had already happened.
+    assert(tx_size < TX_SIZES_ALL);
+    const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
+    // This is the square transform block partition entry point.
+    const int bsl = tx_size_wide_unit[sub_txs];
+    // Units covered by one quadrant; the same for all four, so computed once.
+    const int step = bsl * tx_size_high_unit[sub_txs];
+    int i;
+    assert(bsl > 0);
+
+    for (i = 0; i < 4; ++i) {
+      const int offsetr = blk_row + ((i >> 1) * bsl);
+      const int offsetc = blk_col + ((i & 0x01) * bsl);
+
+      if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue;
+
+      encode_block_inter(plane, block, offsetr, offsetc, plane_bsize, sub_txs,
+                         arg);
+      block += step;
+    }
+  }
+}
+#endif
+
+typedef struct encode_block_pass1_args { /* bundle passed to encode_block_pass1() through its void *arg */
+ AV1_COMMON *cm; /* forwarded to av1_xform_quant */
+ MACROBLOCK *x; /* macroblock whose luma plane is being encoded */
+} encode_block_pass1_args;
+
+static void encode_block_pass1(int plane, int block, int blk_row, int blk_col,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+ void *arg) { /*
+ * Per-transform-block callback used by av1_encode_sby_pass1: transform and
+ * quantize with AV1_XFORM_QUANT_B (entropy context fixed at 0) and, when the
+ * block has coefficients (or, with CONFIG_PVQ, is not PVQ-skipped), apply
+ * the 4x4 inverse WHT for lossless segments or the 4x4 inverse DCT
+ * otherwise onto dst. arg is an encode_block_pass1_args.
+ */
+ encode_block_pass1_args *args = (encode_block_pass1_args *)arg;
+ AV1_COMMON *cm = args->cm;
+ MACROBLOCK *const x = args->x;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ struct macroblock_plane *const p = &x->plane[plane];
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+ uint8_t *dst;
+ int ctx = 0;
+ dst = &pd->dst
+ .buf[(blk_row * pd->dst.stride + blk_col) << tx_size_wide_log2[0]];
+
+ av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
+ ctx, AV1_XFORM_QUANT_B);
+#if !CONFIG_PVQ
+ if (p->eobs[block] > 0) {
+#else
+ if (!x->pvq_skip[plane]) {
+ {
+ int tx_blk_size;
+ int i, j;
+ // transform block size in pixels
+ tx_blk_size = tx_size_wide[tx_size];
+
+// Since av1 does not have separate function which does inverse transform
+// but av1_inv_txfm_add_*x*() also does addition of predicted image to
+// inverse transformed image,
+// pass blank dummy image to av1_inv_txfm_add_*x*(), i.e. set dst as zeros
+#if CONFIG_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ for (j = 0; j < tx_blk_size; j++)
+ for (i = 0; i < tx_blk_size; i++)
+ CONVERT_TO_SHORTPTR(dst)[j * pd->dst.stride + i] = 0;
+ } else {
+#endif // CONFIG_HIGHBITDEPTH
+ for (j = 0; j < tx_blk_size; j++)
+ for (i = 0; i < tx_blk_size; i++) dst[j * pd->dst.stride + i] = 0;
+#if CONFIG_HIGHBITDEPTH
+ }
+#endif // CONFIG_HIGHBITDEPTH
+ }
+#endif // !CONFIG_PVQ
+#if CONFIG_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ if (xd->lossless[xd->mi[0]->mbmi.segment_id]) {
+ av1_highbd_iwht4x4_add(dqcoeff, dst, pd->dst.stride, p->eobs[block],
+ xd->bd);
+ } else {
+ av1_highbd_idct4x4_add(dqcoeff, dst, pd->dst.stride, p->eobs[block],
+ xd->bd);
+ }
+ return; /* high-bitdepth reconstruction done; skip the 8-bit path below */
+ }
+#endif // CONFIG_HIGHBITDEPTH
+ if (xd->lossless[xd->mi[0]->mbmi.segment_id]) {
+ av1_iwht4x4_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]);
+ } else {
+ av1_idct4x4_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]);
+ }
+ }
+}
+
+// First-pass luma encode: subtracts the prediction for plane 0 and runs
+// encode_block_pass1 over every transform block of that plane.
+void av1_encode_sby_pass1(AV1_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE bsize) {
+  const int luma_plane = 0;
+  encode_block_pass1_args pass1_args = { .cm = cm, .x = x };
+
+  av1_subtract_plane(x, bsize, luma_plane);
+  av1_foreach_transformed_block_in_plane(&x->e_mbd, bsize, luma_plane,
+                                         encode_block_pass1, &pass1_args);
+}
+
+// Encodes the residual of every plane of the current block.
+//
+// mbmi->skip is set to 1 up front and is cleared through arg.skip by
+// encode_block when a transform block ends up with something coded. Nothing
+// further is done when x->skip is already set.
+//
+// The chroma-scaled block size lives in a per-plane local (bsizec), so the
+// caller-supplied bsize is never overwritten and every plane derives its size
+// from the same original value instead of from the previous plane's result.
+void av1_encode_sb(AV1_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE bsize,
+                   const int mi_row, const int mi_col) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  struct optimize_ctx ctx;
+  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+  struct encode_b_args arg = { cm, x, &ctx, &mbmi->skip, NULL, NULL, 1 };
+  int plane;
+
+  mbmi->skip = 1;
+
+  if (x->skip) return;
+
+  for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+#if CONFIG_CB4X4 && !CONFIG_CHROMA_2X2
+    const int subsampling_x = xd->plane[plane].subsampling_x;
+    const int subsampling_y = xd->plane[plane].subsampling_y;
+
+    if (!is_chroma_reference(mi_row, mi_col, bsize, subsampling_x,
+                             subsampling_y))
+      continue;
+
+    // Block size to use for this plane only.
+    const BLOCK_SIZE bsizec =
+        scale_chroma_bsize(bsize, subsampling_x, subsampling_y);
+#else
+    (void)mi_row;
+    (void)mi_col;
+    const BLOCK_SIZE bsizec = bsize;
+#endif
+
+#if CONFIG_VAR_TX
+    // TODO(jingning): Clean this up.
+    const struct macroblockd_plane *const pd = &xd->plane[plane];
+    const BLOCK_SIZE plane_bsize = get_plane_block_size(bsizec, pd);
+    const int mi_width = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
+    const int mi_height = block_size_high[plane_bsize] >> tx_size_wide_log2[0];
+    const TX_SIZE max_tx_size = get_vartx_max_txsize(mbmi, plane_bsize);
+    const BLOCK_SIZE txb_size = txsize_to_bsize[max_tx_size];
+    const int bw = block_size_wide[txb_size] >> tx_size_wide_log2[0];
+    const int bh = block_size_high[txb_size] >> tx_size_wide_log2[0];
+    int idx, idy;
+    int block = 0;
+    int step = tx_size_wide_unit[max_tx_size] * tx_size_high_unit[max_tx_size];
+    av1_get_entropy_contexts(bsizec, 0, pd, ctx.ta[plane], ctx.tl[plane]);
+#else
+    const struct macroblockd_plane *const pd = &xd->plane[plane];
+    const TX_SIZE tx_size = get_tx_size(plane, xd);
+    av1_get_entropy_contexts(bsizec, tx_size, pd, ctx.ta[plane], ctx.tl[plane]);
+#endif
+
+#if !CONFIG_PVQ
+    av1_subtract_plane(x, bsizec, plane);
+#endif
+    arg.ta = ctx.ta[plane];
+    arg.tl = ctx.tl[plane];
+
+#if CONFIG_VAR_TX
+    // Walk the plane in steps of the largest transform size; encode_block_inter
+    // recurses into smaller transforms where needed.
+    for (idy = 0; idy < mi_height; idy += bh) {
+      for (idx = 0; idx < mi_width; idx += bw) {
+        encode_block_inter(plane, block, idy, idx, plane_bsize, max_tx_size,
+                           &arg);
+        block += step;
+      }
+    }
+#else
+    av1_foreach_transformed_block_in_plane(xd, bsizec, plane, encode_block,
+                                           &arg);
+#endif
+  }
+}
+
+#if CONFIG_SUPERTX
+// Supertx variant of the block encoder: for every plane, subtract the
+// prediction, load the entropy contexts and run encode_block over all
+// transform blocks. mbmi->skip starts at 1; nothing is encoded when x->skip
+// is already set.
+void av1_encode_sb_supertx(AV1_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE bsize) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mode_info = &xd->mi[0]->mbmi;
+  struct optimize_ctx opt_ctx;
+  struct encode_b_args enc_args = {
+    cm, x, &opt_ctx, &mode_info->skip, NULL, NULL, 1
+  };
+  int pl;
+
+  mode_info->skip = 1;
+  if (x->skip) return;
+
+  for (pl = 0; pl < MAX_MB_PLANE; ++pl) {
+    const struct macroblockd_plane *const plane_d = &xd->plane[pl];
+#if CONFIG_VAR_TX
+    const TX_SIZE plane_tx = TX_4X4;
+#else
+    const TX_SIZE plane_tx = get_tx_size(pl, xd);
+#endif
+    ENTROPY_CONTEXT *const above = opt_ctx.ta[pl];
+    ENTROPY_CONTEXT *const left = opt_ctx.tl[pl];
+
+    av1_subtract_plane(x, bsize, pl);
+    av1_get_entropy_contexts(bsize, plane_tx, plane_d, above, left);
+    enc_args.ta = above;
+    enc_args.tl = left;
+    av1_foreach_transformed_block_in_plane(xd, bsize, pl, encode_block,
+                                           &enc_args);
+  }
+}
+#endif // CONFIG_SUPERTX
+
+#if !CONFIG_PVQ
+// Stores the entropy context of a just-coded transform block into the
+// above (a) and left (l) context arrays. Without CONFIG_LV_MAP the context is
+// simply "eob > 0"; with it, the value comes from txb_entropy_ctx. Under
+// CONFIG_VAR_TX or CONFIG_LV_MAP the value is replicated across the block's
+// width and height in transform units.
+void av1_set_txb_context(MACROBLOCK *x, int plane, int block, TX_SIZE tx_size,
+                         ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l) {
+  (void)tx_size;
+  const struct macroblock_plane *const mb_plane = &x->plane[plane];
+
+#if !CONFIG_LV_MAP
+  const ENTROPY_CONTEXT txb_ctx = mb_plane->eobs[block] > 0;
+#else   // !CONFIG_LV_MAP
+  const ENTROPY_CONTEXT txb_ctx = mb_plane->txb_entropy_ctx[block];
+#endif  // !CONFIG_LV_MAP
+  *l = txb_ctx;
+  *a = txb_ctx;
+
+#if CONFIG_VAR_TX || CONFIG_LV_MAP
+  int unit;
+  // Entry 0 already holds the value, so copying starts at entry 1.
+  for (unit = 1; unit < tx_size_wide_unit[tx_size]; ++unit) a[unit] = a[0];
+
+  for (unit = 1; unit < tx_size_high_unit[tx_size]; ++unit) l[unit] = l[0];
+#endif
+}
+#endif
+
+// Callback wrapper: intra-encodes one transform block and then (unless
+// CONFIG_PVQ) records its entropy context in the above/left arrays carried by
+// the struct encode_b_args passed through arg.
+static void encode_block_intra_and_set_context(int plane, int block,
+                                               int blk_row, int blk_col,
+                                               BLOCK_SIZE plane_bsize,
+                                               TX_SIZE tx_size, void *arg) {
+  av1_encode_block_intra(plane, block, blk_row, blk_col, plane_bsize, tx_size,
+                         arg);
+#if !CONFIG_PVQ
+  struct encode_b_args *const enc_args = arg;
+  av1_set_txb_context(enc_args->x, plane, block, tx_size,
+                      enc_args->ta + blk_col, enc_args->tl + blk_row);
+#endif
+}
+
+// Intra-encodes one transform block: predict, subtract, transform/quantize,
+// optionally run av1_optimize_b, then call av1_inverse_transform_block on the
+// destination. arg is a struct encode_b_args. When enable_optimize_b is set
+// the FP quantizer is used and coefficients are optimized if any survive;
+// otherwise the B quantizer is used. *(args->skip) is cleared when the block
+// has something coded.
+void av1_encode_block_intra(int plane, int block, int blk_row, int blk_col,
+                            BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+                            void *arg) {
+  struct encode_b_args *const enc = arg;
+  AV1_COMMON *cm = enc->cm;
+  MACROBLOCK *const x = enc->x;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  struct macroblock_plane *const p = &x->plane[plane];
+  struct macroblockd_plane *const pd = &xd->plane[plane];
+  tran_low_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+  const TX_TYPE tx_type =
+      get_tx_type(get_plane_type(plane), xd, block, tx_size);
+  uint16_t *const eob_ptr = &p->eobs[block];
+  const int stride = pd->dst.stride;
+  uint8_t *const recon =
+      pd->dst.buf + ((blk_row * stride + blk_col) << tx_size_wide_log2[0]);
+
+  av1_predict_intra_block_facade(xd, plane, block, blk_col, blk_row, tx_size);
+  av1_subtract_txb(x, plane, plane_bsize, blk_col, blk_row, tx_size);
+
+  const int ctx = combine_entropy_contexts(enc->ta[blk_col], enc->tl[blk_row]);
+
+  // Trellis optimization needs the FP quantizer; plain encoding uses B.
+  av1_xform_quant(
+      cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, ctx,
+      enc->enable_optimize_b ? AV1_XFORM_QUANT_FP : AV1_XFORM_QUANT_B);
+  if (enc->enable_optimize_b && *eob_ptr) {
+    av1_optimize_b(cm, x, plane, block, tx_size, ctx);
+  }
+
+#if CONFIG_PVQ
+  // *(args->skip) == mbmi->skip
+  if (!x->pvq_skip[plane]) *(enc->skip) = 0;
+
+  if (x->pvq_skip[plane]) return;
+#endif  // CONFIG_PVQ
+  av1_inverse_transform_block(xd, dqcoeff, tx_type, tx_size, recon, stride,
+                              *eob_ptr);
+#if !CONFIG_PVQ
+  if (*eob_ptr) *(enc->skip) = 0;
+#else
+// Note : *(args->skip) == mbmi->skip
+#endif
+#if CONFIG_CFL
+  if (plane == AOM_PLANE_Y && x->cfl_store_y) {
+    cfl_store(xd->cfl, recon, stride, blk_row, blk_col, tx_size);
+  }
+#endif
+}
+
+// Intra-encodes every transform block of one plane. The above/left entropy
+// context arrays start zeroed and are loaded from the plane only when
+// enable_optimize_b is set. With CONFIG_CB4X4 the call is a no-op for planes
+// that are not a chroma reference at (mi_row, mi_col).
+void av1_encode_intra_block_plane(AV1_COMMON *cm, MACROBLOCK *x,
+                                  BLOCK_SIZE bsize, int plane,
+                                  int enable_optimize_b, const int mi_row,
+                                  const int mi_col) {
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  ENTROPY_CONTEXT above_ctx[2 * MAX_MIB_SIZE] = { 0 };
+  ENTROPY_CONTEXT left_ctx[2 * MAX_MIB_SIZE] = { 0 };
+
+  struct encode_b_args enc_args = { cm,
+                                    x,
+                                    NULL,
+                                    &xd->mi[0]->mbmi.skip,
+                                    above_ctx,
+                                    left_ctx,
+                                    enable_optimize_b };
+
+#if CONFIG_CB4X4
+  if (!is_chroma_reference(mi_row, mi_col, bsize,
+                           xd->plane[plane].subsampling_x,
+                           xd->plane[plane].subsampling_y))
+    return;
+#else
+  (void)mi_row;
+  (void)mi_col;
+#endif
+
+  if (enable_optimize_b) {
+    av1_get_entropy_contexts(bsize, get_tx_size(plane, xd), &xd->plane[plane],
+                             above_ctx, left_ctx);
+  }
+  av1_foreach_transformed_block_in_plane(
+      xd, bsize, plane, encode_block_intra_and_set_context, &enc_args);
+}
+
+#if CONFIG_PVQ
+PVQ_SKIP_TYPE av1_pvq_encode_helper(MACROBLOCK *x, tran_low_t *const coeff,  // PVQ-codes one transform block against ref_coeff; fills dqcoeff, *eob and *rate.
+ tran_low_t *ref_coeff,
+ tran_low_t *const dqcoeff, uint16_t *eob,
+ const int16_t *quant, int plane,
+ int tx_size, TX_TYPE tx_type, int *rate,
+ int speed, PVQ_INFO *pvq_info) {
+ const int tx_blk_size = tx_size_wide[tx_size];
+ daala_enc_ctx *daala_enc = &x->daala_enc;
+ PVQ_SKIP_TYPE ac_dc_coded;  // return value, taken from od_pvq_encode()
+ int coeff_shift = 3 - av1_get_tx_scale(tx_size);
+ int hbd_downshift = 0;  // becomes (bd - 8) under CONFIG_HIGHBITDEPTH
+ int rounding_mask;
+ int pvq_dc_quant;
+ int use_activity_masking = daala_enc->use_activity_masking;
+ int tell;  // entropy-coder position sampled before coding, used to derive *rate
+ int has_dc_skip = 1;  // never changed below, so the DC residue is coded only when it is nonzero
+ int i;
+ int off = od_qm_offset(tx_size, plane ? 1 : 0);  // offset applied to state.qm / state.qm_inv below
+
+ DECLARE_ALIGNED(16, tran_low_t, coeff_pvq[OD_TXSIZE_MAX * OD_TXSIZE_MAX]);
+ DECLARE_ALIGNED(16, tran_low_t, ref_coeff_pvq[OD_TXSIZE_MAX * OD_TXSIZE_MAX]);
+ DECLARE_ALIGNED(16, tran_low_t, dqcoeff_pvq[OD_TXSIZE_MAX * OD_TXSIZE_MAX]);
+
+ DECLARE_ALIGNED(16, int32_t, in_int32[OD_TXSIZE_MAX * OD_TXSIZE_MAX]);
+ DECLARE_ALIGNED(16, int32_t, ref_int32[OD_TXSIZE_MAX * OD_TXSIZE_MAX]);
+ DECLARE_ALIGNED(16, int32_t, out_int32[OD_TXSIZE_MAX * OD_TXSIZE_MAX]);
+
+#if CONFIG_HIGHBITDEPTH
+ hbd_downshift = x->e_mbd.bd - 8;
+#endif
+
+ assert(OD_COEFF_SHIFT >= 4);
+ // DC quantizer for PVQ
+ if (use_activity_masking)  // with activity masking the DC step is also scaled by pvq_qm_q4 (then >> 4)
+ pvq_dc_quant =
+ OD_MAXI(1, (quant[0] << (OD_COEFF_SHIFT - 3) >> hbd_downshift) *
+ daala_enc->state
+ .pvq_qm_q4[plane][od_qm_get_index(tx_size, 0)] >>
+ 4);
+ else
+ pvq_dc_quant =
+ OD_MAXI(1, quant[0] << (OD_COEFF_SHIFT - 3) >> hbd_downshift);
+
+ *eob = 0;  // provisional; overwritten with the full coefficient count before returning
+
+#if CONFIG_DAALA_EC
+ tell = od_ec_enc_tell_frac(&daala_enc->w.ec);
+#else
+#error "CONFIG_PVQ currently requires CONFIG_DAALA_EC."
+#endif
+
+ // Change coefficient ordering for pvq encoding.
+ od_raster_to_coding_order(coeff_pvq, tx_blk_size, tx_type, coeff,
+ tx_blk_size);
+ od_raster_to_coding_order(ref_coeff_pvq, tx_blk_size, tx_type, ref_coeff,
+ tx_blk_size);
+
+ // copy int16 inputs to int32
+ for (i = 0; i < tx_blk_size * tx_blk_size; i++) {  // rescale by (OD_COEFF_SHIFT - coeff_shift), then drop hbd_downshift bits
+ ref_int32[i] =
+ AOM_SIGNED_SHL(ref_coeff_pvq[i], OD_COEFF_SHIFT - coeff_shift) >>
+ hbd_downshift;
+ in_int32[i] = AOM_SIGNED_SHL(coeff_pvq[i], OD_COEFF_SHIFT - coeff_shift) >>
+ hbd_downshift;
+ }
+
+ if (abs(in_int32[0] - ref_int32[0]) < pvq_dc_quant * 141 / 256) { /* 0.55 */
+ out_int32[0] = 0;  // DC difference below the threshold is quantized to zero
+ } else {
+ out_int32[0] = OD_DIV_R0(in_int32[0] - ref_int32[0], pvq_dc_quant);
+ }
+
+ ac_dc_coded =
+ od_pvq_encode(daala_enc, ref_int32, in_int32, out_int32,
+ OD_MAXI(1, quant[0] << (OD_COEFF_SHIFT - 3) >>
+ hbd_downshift), // scale/quantizer
+ OD_MAXI(1, quant[1] << (OD_COEFF_SHIFT - 3) >>
+ hbd_downshift), // scale/quantizer
+ plane,
+ tx_size, OD_PVQ_BETA[use_activity_masking][plane][tx_size],
+ 0, // is_keyframe,
+ daala_enc->state.qm + off, daala_enc->state.qm_inv + off,
+ speed, // speed
+ pvq_info);
+
+ // Encode residue of DC coeff, if required.
+ if (!has_dc_skip || out_int32[0]) {
+ generic_encode(&daala_enc->w, &daala_enc->state.adapt->model_dc[plane],
+ abs(out_int32[0]) - has_dc_skip,
+ &daala_enc->state.adapt->ex_dc[plane][tx_size][0], 2);
+ }
+ if (out_int32[0]) {  // the sign bit is written only for a nonzero DC residue
+ aom_write_bit(&daala_enc->w, out_int32[0] < 0);
+ }
+
+ // need to save quantized residue of DC coeff
+ // so that final pvq bitstream writing can know whether DC is coded.
+ if (pvq_info) pvq_info->dq_dc_residue = out_int32[0];
+
+ out_int32[0] = out_int32[0] * pvq_dc_quant;  // dequantize the DC residue ...
+ out_int32[0] += ref_int32[0];  // ... and add the reference DC back
+
+ // copy int32 result back to int16
+ assert(OD_COEFF_SHIFT > coeff_shift);
+ rounding_mask = (1 << (OD_COEFF_SHIFT - coeff_shift - 1)) - 1;
+ for (i = 0; i < tx_blk_size * tx_blk_size; i++) {  // undo the high-bit-depth shift, then round while shifting back down
+ out_int32[i] = AOM_SIGNED_SHL(out_int32[i], hbd_downshift);
+ dqcoeff_pvq[i] = (out_int32[i] + (out_int32[i] < 0) + rounding_mask) >>
+ (OD_COEFF_SHIFT - coeff_shift);
+ }
+
+ // Back to original coefficient order
+ od_coding_order_to_raster(dqcoeff, tx_blk_size, tx_type, dqcoeff_pvq,
+ tx_blk_size);
+
+ *eob = tx_blk_size * tx_blk_size;  // always reported as the full block, regardless of what was coded
+
+#if CONFIG_DAALA_EC
+ *rate = (od_ec_enc_tell_frac(&daala_enc->w.ec) - tell)  // coder position advance since 'tell', shifted by (AV1_PROB_COST_SHIFT - OD_BITRES)
+ << (AV1_PROB_COST_SHIFT - OD_BITRES);
+#else
+#error "CONFIG_PVQ currently requires CONFIG_DAALA_EC."
+#endif
+ assert(*rate >= 0);
+
+ return ac_dc_coded;
+}
+
+void av1_store_pvq_enc_info(PVQ_INFO *pvq_info, int *qg, int *theta, int *k,  // Copies the per-band PVQ parameters and the coefficient vector y into *pvq_info.
+ od_coeff *y, int nb_bands, const int *off,
+ int *size, int skip_rest, int skip_dir,
+ int bs) { // block size in log_2 -2
+ int i;
+ const int tx_blk_size = tx_size_wide[bs];  // bs is used directly as an index into tx_size_wide[]
+
+ for (i = 0; i < nb_bands; i++) {  // one entry of each per-band array is copied per band
+ pvq_info->qg[i] = qg[i];
+ pvq_info->theta[i] = theta[i];
+ pvq_info->k[i] = k[i];
+ pvq_info->off[i] = off[i];
+ pvq_info->size[i] = size[i];
+ }
+
+ memcpy(pvq_info->y, y, tx_blk_size * tx_blk_size * sizeof(od_coeff));  // copies tx_blk_size * tx_blk_size coefficients
+
+ pvq_info->nb_bands = nb_bands;
+ pvq_info->skip_rest = skip_rest;
+ pvq_info->skip_dir = skip_dir;
+ pvq_info->bs = bs;
+}
+#endif
diff --git a/third_party/aom/av1/encoder/encodemb.h b/third_party/aom/av1/encoder/encodemb.h
new file mode 100644
index 0000000000..73fde1d884
--- /dev/null
+++ b/third_party/aom/av1/encoder/encodemb.h
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_ENCODER_ENCODEMB_H_
+#define AV1_ENCODER_ENCODEMB_H_
+
+#include "./aom_config.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/encoder/block.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct optimize_ctx {  // Two ENTROPY_CONTEXT arrays (ta, tl), each with one row per plane.
+ ENTROPY_CONTEXT ta[MAX_MB_PLANE][2 * MAX_MIB_SIZE];
+ ENTROPY_CONTEXT tl[MAX_MB_PLANE][2 * MAX_MIB_SIZE];
+};
+
+struct encode_b_args {  // Argument bundle handed to the per-transform-block encode callbacks through their void *arg.
+ AV1_COMMON *cm;
+ MACROBLOCK *x;
+ struct optimize_ctx *ctx;  // av1_encode_intra_block_plane() passes NULL here
+ int8_t *skip;  // the intra callback stores 0 through this when a block has a nonzero eob (non-PVQ build)
+ ENTROPY_CONTEXT *ta;  // indexed by blk_col in the intra callback
+ ENTROPY_CONTEXT *tl;  // indexed by blk_row in the intra callback
+ int8_t enable_optimize_b;  // nonzero: AV1_XFORM_QUANT_FP then av1_optimize_b(); zero: AV1_XFORM_QUANT_B
+};
+
+typedef enum AV1_XFORM_QUANT {  // Type of av1_xform_quant()'s xform_quant_idx argument.
+ AV1_XFORM_QUANT_FP = 0,
+ AV1_XFORM_QUANT_B = 1,
+ AV1_XFORM_QUANT_DC = 2,
+ AV1_XFORM_QUANT_SKIP_QUANT,  // implicitly 3
+ AV1_XFORM_QUANT_TYPES,  // implicitly 4; presumably the number of real entries above - confirm
+} AV1_XFORM_QUANT;
+
+void av1_encode_sb(AV1_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row,
+ int mi_col);
+#if CONFIG_SUPERTX
+void av1_encode_sb_supertx(AV1_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE bsize);
+#endif // CONFIG_SUPERTX
+void av1_encode_sby_pass1(AV1_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE bsize);
+void av1_xform_quant(const AV1_COMMON *cm, MACROBLOCK *x, int plane, int block,
+ int blk_row, int blk_col, BLOCK_SIZE plane_bsize,
+ TX_SIZE tx_size, int ctx, AV1_XFORM_QUANT xform_quant_idx);
+
+int av1_optimize_b(const AV1_COMMON *cm, MACROBLOCK *mb, int plane, int block,
+ TX_SIZE tx_size, int ctx);
+
+void av1_subtract_txb(MACROBLOCK *x, int plane, BLOCK_SIZE plane_bsize,
+ int blk_col, int blk_row, TX_SIZE tx_size);
+
+void av1_subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane);
+
+void av1_set_txb_context(MACROBLOCK *x, int plane, int block, TX_SIZE tx_size,
+ ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l);
+
+void av1_encode_block_intra(int plane, int block, int blk_row, int blk_col,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg);
+
+void av1_encode_intra_block_plane(AV1_COMMON *cm, MACROBLOCK *x,
+ BLOCK_SIZE bsize, int plane,
+ int enable_optimize_b, int mi_row,
+ int mi_col);
+
+#if CONFIG_PVQ
+PVQ_SKIP_TYPE av1_pvq_encode_helper(MACROBLOCK *x, tran_low_t *const coeff,
+ tran_low_t *ref_coeff,
+ tran_low_t *const dqcoeff, uint16_t *eob,
+ const int16_t *quant, int plane,
+ int tx_size, TX_TYPE tx_type, int *rate,
+ int speed, PVQ_INFO *pvq_info);
+
+void av1_store_pvq_enc_info(PVQ_INFO *pvq_info, int *qg, int *theta, int *k,
+ od_coeff *y, int nb_bands, const int *off,
+ int *size, int skip_rest, int skip_dir, int bs);
+#endif
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AV1_ENCODER_ENCODEMB_H_
diff --git a/third_party/aom/av1/encoder/encodemv.c b/third_party/aom/av1/encoder/encodemv.c
new file mode 100644
index 0000000000..a2a53f8408
--- /dev/null
+++ b/third_party/aom/av1/encoder/encodemv.c
@@ -0,0 +1,497 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+
+#include "av1/common/common.h"
+#include "av1/common/entropymode.h"
+
+#include "av1/encoder/cost.h"
+#include "av1/encoder/encodemv.h"
+#include "av1/encoder/subexp.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+
+static struct av1_token mv_joint_encodings[MV_JOINTS];  // filled by av1_entropy_mv_init()
+static struct av1_token mv_class_encodings[MV_CLASSES];  // filled by av1_entropy_mv_init()
+static struct av1_token mv_fp_encodings[MV_FP_SIZE];  // filled by av1_entropy_mv_init()
+
+void av1_entropy_mv_init(void) {  // Builds the three file-local token tables from their coding trees.
+ av1_tokens_from_tree(mv_joint_encodings, av1_mv_joint_tree);
+ av1_tokens_from_tree(mv_class_encodings, av1_mv_class_tree);
+ av1_tokens_from_tree(mv_fp_encodings, av1_mv_fp_tree);
+}
+
+static void encode_mv_component(aom_writer *w, int comp, nmv_component *mvcomp,  // Writes one MV difference component: sign, class, integer bits, fraction, optional hp bit.
+ int usehp) {
+ int offset;
+ const int sign = comp < 0;
+ const int mag = sign ? -comp : comp;
+ const int mv_class = av1_get_mv_class(mag - 1, &offset);  // class of (mag - 1); offset is returned through the pointer
+ const int d = offset >> 3; // int mv data
+ const int fr = (offset >> 1) & 3; // fractional mv data
+ const int hp = offset & 1; // high precision mv data
+
+ assert(comp != 0);  // callers must not pass a zero component
+
+ // Sign
+ aom_write(w, sign, mvcomp->sign);
+
+// Class
+#if CONFIG_EC_MULTISYMBOL
+ aom_write_symbol(w, mv_class, mvcomp->class_cdf, MV_CLASSES);
+#else
+ av1_write_token(w, av1_mv_class_tree, mvcomp->classes,
+ &mv_class_encodings[mv_class]);
+#endif
+
+ // Integer bits
+ if (mv_class == MV_CLASS_0) {  // class 0 writes d itself as a single binary symbol
+ aom_write(w, d, mvcomp->class0[0]);
+ } else {
+ int i;
+ const int n = mv_class + CLASS0_BITS - 1; // number of bits
+ for (i = 0; i < n; ++i) aom_write(w, (d >> i) & 1, mvcomp->bits[i]);  // least significant bit first
+ }
+
+// Fractional bits
+#if CONFIG_EC_MULTISYMBOL
+ aom_write_symbol(
+ w, fr, mv_class == MV_CLASS_0 ? mvcomp->class0_fp_cdf[d] : mvcomp->fp_cdf,
+ MV_FP_SIZE);
+#else
+ av1_write_token(w, av1_mv_fp_tree,
+ mv_class == MV_CLASS_0 ? mvcomp->class0_fp[d] : mvcomp->fp,
+ &mv_fp_encodings[fr]);
+#endif
+
+ // High precision bit
+ if (usehp)  // the hp bit is omitted entirely when usehp is 0
+ aom_write(w, hp, mv_class == MV_CLASS_0 ? mvcomp->class0_hp : mvcomp->hp);
+}
+
+static void build_nmv_component_cost_table(int *mvcost,  // Fills mvcost[-MV_MAX .. MV_MAX]; mvcost is indexed negatively, so it must point at the zero entry.
+ const nmv_component *const mvcomp,
+ int usehp) {
+ int i, v;
+ int sign_cost[2], class_cost[MV_CLASSES], class0_cost[CLASS0_SIZE];
+ int bits_cost[MV_OFFSET_BITS][2];
+ int class0_fp_cost[CLASS0_SIZE][MV_FP_SIZE], fp_cost[MV_FP_SIZE];
+ int class0_hp_cost[2], hp_cost[2];  // only initialized and only read when usehp is nonzero
+
+ sign_cost[0] = av1_cost_zero(mvcomp->sign);
+ sign_cost[1] = av1_cost_one(mvcomp->sign);
+ av1_cost_tokens(class_cost, mvcomp->classes, av1_mv_class_tree);
+ av1_cost_tokens(class0_cost, mvcomp->class0, av1_mv_class0_tree);
+ for (i = 0; i < MV_OFFSET_BITS; ++i) {
+ bits_cost[i][0] = av1_cost_zero(mvcomp->bits[i]);
+ bits_cost[i][1] = av1_cost_one(mvcomp->bits[i]);
+ }
+
+ for (i = 0; i < CLASS0_SIZE; ++i)
+ av1_cost_tokens(class0_fp_cost[i], mvcomp->class0_fp[i], av1_mv_fp_tree);
+ av1_cost_tokens(fp_cost, mvcomp->fp, av1_mv_fp_tree);
+
+ if (usehp) {
+ class0_hp_cost[0] = av1_cost_zero(mvcomp->class0_hp);
+ class0_hp_cost[1] = av1_cost_one(mvcomp->class0_hp);
+ hp_cost[0] = av1_cost_zero(mvcomp->hp);
+ hp_cost[1] = av1_cost_one(mvcomp->hp);
+ }
+ mvcost[0] = 0;  // a zero component costs nothing in this table
+ for (v = 1; v <= MV_MAX; ++v) {  // the cost of magnitude v is shared by +v and -v apart from the sign cost
+ int z, c, o, d, e, f, cost = 0;
+ z = v - 1;
+ c = av1_get_mv_class(z, &o);
+ cost += class_cost[c];
+ d = (o >> 3); /* int mv data */
+ f = (o >> 1) & 3; /* fractional pel mv data */
+ e = (o & 1); /* high precision mv data */
+ if (c == MV_CLASS_0) {
+ cost += class0_cost[d];
+ } else {
+ const int b = c + CLASS0_BITS - 1; /* number of bits */
+ for (i = 0; i < b; ++i) cost += bits_cost[i][((d >> i) & 1)];
+ }
+ if (c == MV_CLASS_0) {
+ cost += class0_fp_cost[d][f];
+ } else {
+ cost += fp_cost[f];
+ }
+ if (usehp) {
+ if (c == MV_CLASS_0) {
+ cost += class0_hp_cost[e];
+ } else {
+ cost += hp_cost[e];
+ }
+ }
+ mvcost[v] = cost + sign_cost[0];  // positive value: sign symbol 0
+ mvcost[-v] = cost + sign_cost[1];  // negative value: sign symbol 1
+ }
+}
+
+static void update_mv(aom_writer *w, const unsigned int ct[2], aom_prob *cur_p,  // Thin wrapper around av1_cond_prob_diff_update() for one probability.
+ aom_prob upd_p) {
+ (void)upd_p;  // accepted for the callers' benefit but not used here
+#if CONFIG_TILE_GROUPS
+ // Pass the default maximum number of tile groups (DEFAULT_MAX_NUM_TG)
+ // instead of the actual count, so the real number does not have to be
+ // plumbed into this helper.
+ av1_cond_prob_diff_update(w, cur_p, ct, DEFAULT_MAX_NUM_TG);
+#else
+ av1_cond_prob_diff_update(w, cur_p, ct, 1);
+#endif
+}
+
+#if !CONFIG_EC_ADAPT
+static void write_mv_update(const aom_tree_index *tree,  // Derives per-branch counts from tree symbol counts, then runs update_mv() on each of the n - 1 probabilities.
+ aom_prob probs[/*n - 1*/],
+ const unsigned int counts[/*n - 1*/], int n,
+ aom_writer *w) {
+ int i;
+ unsigned int branch_ct[32][2];  // fixed capacity; guarded by the assert below
+
+ // Assuming max number of probabilities <= 32
+ assert(n <= 32);
+
+ av1_tree_probs_from_distribution(tree, branch_ct, counts);
+ for (i = 0; i < n - 1; ++i)
+ update_mv(w, branch_ct[i], &probs[i], MV_UPDATE_PROB);
+}
+#endif
+
+void av1_write_nmv_probs(AV1_COMMON *cm, int usehp, aom_writer *w,  // Writes conditional probability updates for the MV entropy context(s) in cm->fc.
+ nmv_context_counts *const nmv_counts) {
+ int i;
+#if CONFIG_REF_MV
+ int nmv_ctx = 0;
+ for (nmv_ctx = 0; nmv_ctx < NMV_CONTEXTS; ++nmv_ctx) {  // with CONFIG_REF_MV there is one context and one counts entry per nmv_ctx
+ nmv_context *const mvc = &cm->fc->nmvc[nmv_ctx];
+ nmv_context_counts *const counts = &nmv_counts[nmv_ctx];
+#if !CONFIG_EC_ADAPT
+ write_mv_update(av1_mv_joint_tree, mvc->joints, counts->joints, MV_JOINTS,  // joint/sign/class/bits/fp updates are compiled out under CONFIG_EC_ADAPT
+ w);
+
+ for (i = 0; i < 2; ++i) {  // i selects the component (comps[0], comps[1])
+ int j;
+ nmv_component *comp = &mvc->comps[i];
+ nmv_component_counts *comp_counts = &counts->comps[i];
+
+ update_mv(w, comp_counts->sign, &comp->sign, MV_UPDATE_PROB);
+ write_mv_update(av1_mv_class_tree, comp->classes, comp_counts->classes,
+ MV_CLASSES, w);
+ write_mv_update(av1_mv_class0_tree, comp->class0, comp_counts->class0,
+ CLASS0_SIZE, w);
+ for (j = 0; j < MV_OFFSET_BITS; ++j)
+ update_mv(w, comp_counts->bits[j], &comp->bits[j], MV_UPDATE_PROB);
+ }
+
+ for (i = 0; i < 2; ++i) {  // fractional-pel probabilities, written after both components' integer parts
+ int j;
+ for (j = 0; j < CLASS0_SIZE; ++j)
+ write_mv_update(av1_mv_fp_tree, mvc->comps[i].class0_fp[j],
+ counts->comps[i].class0_fp[j], MV_FP_SIZE, w);
+
+ write_mv_update(av1_mv_fp_tree, mvc->comps[i].fp, counts->comps[i].fp,
+ MV_FP_SIZE, w);
+ }
+#endif
+
+ if (usehp) {  // hp probabilities are updated regardless of CONFIG_EC_ADAPT, but only when usehp is set
+ for (i = 0; i < 2; ++i) {
+ update_mv(w, counts->comps[i].class0_hp, &mvc->comps[i].class0_hp,
+ MV_UPDATE_PROB);
+ update_mv(w, counts->comps[i].hp, &mvc->comps[i].hp, MV_UPDATE_PROB);
+ }
+ }
+ }
+#else
+ nmv_context *const mvc = &cm->fc->nmvc;  // without CONFIG_REF_MV there is a single context and a single counts struct
+ nmv_context_counts *const counts = nmv_counts;
+
+#if !CONFIG_EC_ADAPT
+ write_mv_update(av1_mv_joint_tree, mvc->joints, counts->joints, MV_JOINTS, w);
+
+ for (i = 0; i < 2; ++i) {  // i selects the component (comps[0], comps[1])
+ int j;
+ nmv_component *comp = &mvc->comps[i];
+ nmv_component_counts *comp_counts = &counts->comps[i];
+
+ update_mv(w, comp_counts->sign, &comp->sign, MV_UPDATE_PROB);
+ write_mv_update(av1_mv_class_tree, comp->classes, comp_counts->classes,
+ MV_CLASSES, w);
+ write_mv_update(av1_mv_class0_tree, comp->class0, comp_counts->class0,
+ CLASS0_SIZE, w);
+ for (j = 0; j < MV_OFFSET_BITS; ++j)
+ update_mv(w, comp_counts->bits[j], &comp->bits[j], MV_UPDATE_PROB);
+ }
+
+ for (i = 0; i < 2; ++i) {  // fractional-pel probabilities
+ int j;
+ for (j = 0; j < CLASS0_SIZE; ++j) {
+ write_mv_update(av1_mv_fp_tree, mvc->comps[i].class0_fp[j],
+ counts->comps[i].class0_fp[j], MV_FP_SIZE, w);
+ }
+ write_mv_update(av1_mv_fp_tree, mvc->comps[i].fp, counts->comps[i].fp,
+ MV_FP_SIZE, w);
+ }
+#endif // !CONFIG_EC_ADAPT
+
+ if (usehp) {  // same hp handling as the CONFIG_REF_MV branch
+ for (i = 0; i < 2; ++i) {
+ update_mv(w, counts->comps[i].class0_hp, &mvc->comps[i].class0_hp,
+ MV_UPDATE_PROB);
+ update_mv(w, counts->comps[i].hp, &mvc->comps[i].hp, MV_UPDATE_PROB);
+ }
+ }
+#endif
+}
+
+void av1_encode_mv(AV1_COMP *cpi, aom_writer *w, const MV *mv, const MV *ref,  // Codes mv as a difference from ref: joint type first, then the components the joint selects.
+ nmv_context *mvctx, int usehp) {
+ const MV diff = { mv->row - ref->row, mv->col - ref->col };
+ const MV_JOINT_TYPE j = av1_get_mv_joint(&diff);
+#if CONFIG_EC_MULTISYMBOL
+ aom_write_symbol(w, j, mvctx->joint_cdf, MV_JOINTS);
+#else
+ av1_write_token(w, av1_mv_joint_tree, mvctx->joints, &mv_joint_encodings[j]);
+#endif
+ if (mv_joint_vertical(j))  // row difference uses comps[0]
+ encode_mv_component(w, diff.row, &mvctx->comps[0], usehp);
+
+ if (mv_joint_horizontal(j))  // column difference uses comps[1]
+ encode_mv_component(w, diff.col, &mvctx->comps[1], usehp);
+
+ // If auto_mv_step_size is enabled then keep track of the largest
+ // motion vector component used.
+ if (cpi->sf.mv.auto_mv_step_size) {
+ unsigned int maxv = AOMMAX(abs(mv->row), abs(mv->col)) >> 3;  // taken from the full mv (not the diff), shifted right by 3
+ cpi->max_mv_magnitude = AOMMAX(maxv, cpi->max_mv_magnitude);
+ }
+}
+
+#if CONFIG_INTRABC
+void av1_encode_dv(aom_writer *w, const MV *mv, const MV *ref,  // Same joint/component coding as av1_encode_mv(), always with usehp = 0 and no max_mv_magnitude tracking.
+ nmv_context *mvctx) {
+ const MV diff = { mv->row - ref->row, mv->col - ref->col };
+ const MV_JOINT_TYPE j = av1_get_mv_joint(&diff);
+
+#if CONFIG_EC_MULTISYMBOL
+ aom_write_symbol(w, j, mvctx->joint_cdf, MV_JOINTS);
+#else
+ av1_write_token(w, av1_mv_joint_tree, mvctx->joints, &mv_joint_encodings[j]);
+#endif
+ if (mv_joint_vertical(j))  // row difference uses comps[0]
+ encode_mv_component(w, diff.row, &mvctx->comps[0], 0);
+
+ if (mv_joint_horizontal(j))  // column difference uses comps[1]
+ encode_mv_component(w, diff.col, &mvctx->comps[1], 0);
+}
+#endif // CONFIG_INTRABC
+
+void av1_build_nmv_cost_table(int *mvjoint, int *mvcost[2],  // Fills the joint cost array and both per-component cost tables from ctx.
+ const nmv_context *ctx, int usehp) {
+ av1_cost_tokens(mvjoint, ctx->joints, av1_mv_joint_tree);
+ build_nmv_component_cost_table(mvcost[0], &ctx->comps[0], usehp);  // each mvcost[i] is written at negative indices too
+ build_nmv_component_cost_table(mvcost[1], &ctx->comps[1], usehp);
+}
+
+#if CONFIG_EXT_INTER
+static void inc_mvs(const MB_MODE_INFO *mbmi, const MB_MODE_INFO_EXT *mbmi_ext,  // CONFIG_EXT_INTER variant: counts the MV differences of whichever MVs the mode codes as new.
+ const int_mv mvs[2],
+#if CONFIG_REF_MV
+ const int_mv pred_mvs[2],
+#endif
+ nmv_context_counts *nmv_counts) {
+ int i;
+ PREDICTION_MODE mode = mbmi->mode;
+#if !CONFIG_REF_MV
+ nmv_context_counts *counts = nmv_counts;  // single counts struct; under CONFIG_REF_MV it is chosen per MV below
+#endif
+
+ if (mode == NEWMV || mode == NEW_NEWMV) {  // every MV of the block is counted
+ for (i = 0; i < 1 + has_second_ref(mbmi); ++i) {
+ const MV *ref = &mbmi_ext->ref_mvs[mbmi->ref_frame[i]][0].as_mv;
+ const MV diff = { mvs[i].as_mv.row - ref->row,
+ mvs[i].as_mv.col - ref->col };
+#if CONFIG_REF_MV
+ int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame);
+ int nmv_ctx =
+ av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type],
+ mbmi_ext->ref_mv_stack[rf_type], i, mbmi->ref_mv_idx);
+ nmv_context_counts *counts = &nmv_counts[nmv_ctx];
+ (void)pred_mvs;  // pred_mvs is not used by this variant
+#endif
+ av1_inc_mv(&diff, counts, 1);
+ }
+ } else if (mode == NEAREST_NEWMV || mode == NEAR_NEWMV) {  // only the second MV (index 1) is counted
+ const MV *ref = &mbmi_ext->ref_mvs[mbmi->ref_frame[1]][0].as_mv;
+ const MV diff = { mvs[1].as_mv.row - ref->row,
+ mvs[1].as_mv.col - ref->col };
+#if CONFIG_REF_MV
+ int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame);
+ int nmv_ctx =
+ av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type],
+ mbmi_ext->ref_mv_stack[rf_type], 1, mbmi->ref_mv_idx);
+ nmv_context_counts *counts = &nmv_counts[nmv_ctx];
+#endif
+ av1_inc_mv(&diff, counts, 1);
+ } else if (mode == NEW_NEARESTMV || mode == NEW_NEARMV) {  // only the first MV (index 0) is counted
+ const MV *ref = &mbmi_ext->ref_mvs[mbmi->ref_frame[0]][0].as_mv;
+ const MV diff = { mvs[0].as_mv.row - ref->row,
+ mvs[0].as_mv.col - ref->col };
+#if CONFIG_REF_MV
+ int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame);
+ int nmv_ctx =
+ av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type],
+ mbmi_ext->ref_mv_stack[rf_type], 0, mbmi->ref_mv_idx);
+ nmv_context_counts *counts = &nmv_counts[nmv_ctx];
+#endif
+ av1_inc_mv(&diff, counts, 1);
+ }
+}
+
+static void inc_mvs_sub8x8(const MODE_INFO *mi, int block, const int_mv mvs[2],  // Sub-block counterpart of inc_mvs(): mode and reference MVs come from mi->bmi[block].
+#if CONFIG_REF_MV
+ const MB_MODE_INFO_EXT *mbmi_ext,
+#endif
+ nmv_context_counts *nmv_counts) {
+ int i;
+ PREDICTION_MODE mode = mi->bmi[block].as_mode;
+#if CONFIG_REF_MV
+ const MB_MODE_INFO *mbmi = &mi->mbmi;
+#else
+ nmv_context_counts *counts = nmv_counts;  // single counts struct; under CONFIG_REF_MV it is chosen per MV below
+#endif
+
+ if (mode == NEWMV || mode == NEW_NEWMV) {  // every MV of the sub-block is counted
+ for (i = 0; i < 1 + has_second_ref(&mi->mbmi); ++i) {
+ const MV *ref = &mi->bmi[block].ref_mv[i].as_mv;
+ const MV diff = { mvs[i].as_mv.row - ref->row,
+ mvs[i].as_mv.col - ref->col };
+#if CONFIG_REF_MV
+ int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame);
+ int nmv_ctx =
+ av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type],
+ mbmi_ext->ref_mv_stack[rf_type], i, mbmi->ref_mv_idx);
+ nmv_context_counts *counts = &nmv_counts[nmv_ctx];
+#endif
+ av1_inc_mv(&diff, counts, 1);
+ }
+ } else if (mode == NEAREST_NEWMV || mode == NEAR_NEWMV) {  // only the second MV (index 1) is counted
+ const MV *ref = &mi->bmi[block].ref_mv[1].as_mv;
+ const MV diff = { mvs[1].as_mv.row - ref->row,
+ mvs[1].as_mv.col - ref->col };
+#if CONFIG_REF_MV
+ int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame);
+ int nmv_ctx =
+ av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type],
+ mbmi_ext->ref_mv_stack[rf_type], 1, mbmi->ref_mv_idx);
+ nmv_context_counts *counts = &nmv_counts[nmv_ctx];
+#endif
+ av1_inc_mv(&diff, counts, 1);
+ } else if (mode == NEW_NEARESTMV || mode == NEW_NEARMV) {  // only the first MV (index 0) is counted
+ const MV *ref = &mi->bmi[block].ref_mv[0].as_mv;
+ const MV diff = { mvs[0].as_mv.row - ref->row,
+ mvs[0].as_mv.col - ref->col };
+#if CONFIG_REF_MV
+ int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame);
+ int nmv_ctx =
+ av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type],
+ mbmi_ext->ref_mv_stack[rf_type], 0, mbmi->ref_mv_idx);
+ nmv_context_counts *counts = &nmv_counts[nmv_ctx];
+#endif
+ av1_inc_mv(&diff, counts, 1);
+ }
+}
+#else
+static void inc_mvs(const MB_MODE_INFO *mbmi, const MB_MODE_INFO_EXT *mbmi_ext,  // Non-EXT_INTER variant: counts the MV difference of every MV of the block, with no mode check.
+ const int_mv mvs[2],
+#if CONFIG_REF_MV
+ const int_mv pred_mvs[2],
+#endif
+ nmv_context_counts *nmv_counts) {
+ int i;
+#if !CONFIG_REF_MV
+ nmv_context_counts *counts = nmv_counts;  // single counts struct; under CONFIG_REF_MV it is chosen per MV below
+#endif
+
+ for (i = 0; i < 1 + has_second_ref(mbmi); ++i) {  // one or two iterations depending on has_second_ref()
+#if CONFIG_REF_MV
+ int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame);
+ int nmv_ctx =
+ av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type],
+ mbmi_ext->ref_mv_stack[rf_type], i, mbmi->ref_mv_idx);
+ nmv_context_counts *counts = &nmv_counts[nmv_ctx];
+ const MV *ref = &pred_mvs[i].as_mv;  // with CONFIG_REF_MV the difference is taken against pred_mvs
+#else
+ const MV *ref = &mbmi_ext->ref_mvs[mbmi->ref_frame[i]][0].as_mv;  // otherwise against the first candidate in ref_mvs
+#endif
+ const MV diff = { mvs[i].as_mv.row - ref->row,
+ mvs[i].as_mv.col - ref->col };
+ av1_inc_mv(&diff, counts, 1);
+ }
+}
+#endif // CONFIG_EXT_INTER
+
+void av1_update_mv_count(ThreadData *td) {  // Adds the current block's (xd->mi[0]) new-MV differences to td->counts->mv.
+ const MACROBLOCKD *xd = &td->mb.e_mbd;
+ const MODE_INFO *mi = xd->mi[0];
+ const MB_MODE_INFO *const mbmi = &mi->mbmi;
+ const MB_MODE_INFO_EXT *mbmi_ext = td->mb.mbmi_ext;
+#if CONFIG_CB4X4
+ const int unify_bsize = 1;  // with CONFIG_CB4X4 the sub-8x8 path below is never taken
+#else
+ const int unify_bsize = 0;
+#endif
+
+ if (mbmi->sb_type < BLOCK_8X8 && !unify_bsize) {  // sub-8x8 block: handle each sub-block's MVs separately
+ const int num_4x4_w = num_4x4_blocks_wide_lookup[mbmi->sb_type];
+ const int num_4x4_h = num_4x4_blocks_high_lookup[mbmi->sb_type];
+ int idx, idy;
+
+ for (idy = 0; idy < 2; idy += num_4x4_h) {  // the step sizes make each distinct sub-block visited once
+ for (idx = 0; idx < 2; idx += num_4x4_w) {
+ const int i = idy * 2 + idx;  // raster index into the 2x2 mi->bmi[] array
+
+#if CONFIG_EXT_INTER
+ if (have_newmv_in_inter_mode(mi->bmi[i].as_mode))
+ inc_mvs_sub8x8(mi, i, mi->bmi[i].as_mv,
+#if CONFIG_REF_MV
+ mbmi_ext, td->counts->mv);
+#else
+ &td->counts->mv);
+#endif
+#else
+ if (mi->bmi[i].as_mode == NEWMV)
+ inc_mvs(mbmi, mbmi_ext, mi->bmi[i].as_mv,
+#if CONFIG_REF_MV
+ mi->bmi[i].pred_mv, td->counts->mv);
+#else
+ &td->counts->mv);
+#endif
+#endif // CONFIG_EXT_INTER
+ }
+ }
+ } else {  // whole-block path: a single inc_mvs() call when the mode carries a new MV
+#if CONFIG_EXT_INTER
+ if (have_newmv_in_inter_mode(mbmi->mode))
+#else
+ if (mbmi->mode == NEWMV)
+#endif // CONFIG_EXT_INTER
+ inc_mvs(mbmi, mbmi_ext, mbmi->mv,
+#if CONFIG_REF_MV
+ mbmi->pred_mv, td->counts->mv);
+#else
+ &td->counts->mv);
+#endif
+ }
+}
diff --git a/third_party/aom/av1/encoder/encodemv.h b/third_party/aom/av1/encoder/encodemv.h
new file mode 100644
index 0000000000..6d442147fe
--- /dev/null
+++ b/third_party/aom/av1/encoder/encodemv.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_ENCODER_ENCODEMV_H_
+#define AV1_ENCODER_ENCODEMV_H_
+
+#include "av1/encoder/encoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void av1_entropy_mv_init(void);
+
+void av1_write_nmv_probs(AV1_COMMON *cm, int usehp, aom_writer *w,
+ nmv_context_counts *const counts);
+
+void av1_encode_mv(AV1_COMP *cpi, aom_writer *w, const MV *mv, const MV *ref,
+ nmv_context *mvctx, int usehp);
+
+void av1_build_nmv_cost_table(int *mvjoint, int *mvcost[2],
+ const nmv_context *mvctx, int usehp);
+
+void av1_update_mv_count(ThreadData *td);
+
+#if CONFIG_INTRABC
+void av1_encode_dv(aom_writer *w, const MV *mv, const MV *ref,
+ nmv_context *mvctx);
+#endif // CONFIG_INTRABC
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AV1_ENCODER_ENCODEMV_H_
diff --git a/third_party/aom/av1/encoder/encoder.c b/third_party/aom/av1/encoder/encoder.c
new file mode 100644
index 0000000000..0271091519
--- /dev/null
+++ b/third_party/aom/av1/encoder/encoder.c
@@ -0,0 +1,5980 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
+
+#include "./aom_config.h"
+
+#include "av1/common/alloccommon.h"
+#if CONFIG_CDEF
+#include "av1/common/cdef.h"
+#include "av1/common/clpf.h"
+#endif // CONFIG_CDEF
+#include "av1/common/filter.h"
+#include "av1/common/idct.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/reconintra.h"
+#include "av1/common/resize.h"
+#include "av1/common/tile_common.h"
+
+#include "av1/encoder/aq_complexity.h"
+#include "av1/encoder/aq_cyclicrefresh.h"
+#include "av1/encoder/aq_variance.h"
+#include "av1/encoder/bitstream.h"
+#if CONFIG_ANS
+#include "aom_dsp/buf_ans.h"
+#endif
+#include "av1/encoder/context_tree.h"
+#include "av1/encoder/encodeframe.h"
+#include "av1/encoder/encodemv.h"
+#include "av1/encoder/encoder.h"
+#if CONFIG_LV_MAP
+#include "av1/encoder/encodetxb.h"
+#endif
+#include "av1/encoder/ethread.h"
+#include "av1/encoder/firstpass.h"
+#include "av1/encoder/mbgraph.h"
+#include "av1/encoder/picklpf.h"
+#if CONFIG_LOOP_RESTORATION
+#include "av1/encoder/pickrst.h"
+#endif // CONFIG_LOOP_RESTORATION
+#include "av1/encoder/ratectrl.h"
+#include "av1/encoder/rd.h"
+#include "av1/encoder/segmentation.h"
+#include "av1/encoder/speed_features.h"
+#include "av1/encoder/temporal_filter.h"
+
+#include "./av1_rtcd.h"
+#include "./aom_dsp_rtcd.h"
+#include "./aom_scale_rtcd.h"
+#include "aom_dsp/psnr.h"
+#if CONFIG_INTERNAL_STATS
+#include "aom_dsp/ssim.h"
+#endif
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_ports/aom_timer.h"
+#include "aom_ports/mem.h"
+#include "aom_ports/system_state.h"
+#include "aom_scale/aom_scale.h"
+#if CONFIG_BITSTREAM_DEBUG
+#include "aom_util/debug_util.h"
+#endif // CONFIG_BITSTREAM_DEBUG
+
+#if CONFIG_ENTROPY_STATS
+FRAME_COUNTS aggregate_fc;
+#endif // CONFIG_ENTROPY_STATS
+
+#define AM_SEGMENT_ID_INACTIVE 7
+#define AM_SEGMENT_ID_ACTIVE 0
+
+#define SHARP_FILTER_QTHRESH 0 /* Q threshold for 8-tap sharp filter */
+
+#define ALTREF_HIGH_PRECISION_MV 1 // Whether to use high precision mv
+ // for altref computation.
+#define HIGH_PRECISION_MV_QTHRESH 200 // Q threshold for high precision
+ // mv. Choose a very high value for
+ // now so that HIGH_PRECISION is always
+ // chosen.
+// #define OUTPUT_YUV_REC
+#ifdef OUTPUT_YUV_DENOISED
+FILE *yuv_denoised_file = NULL;
+#endif
+#ifdef OUTPUT_YUV_SKINMAP
+FILE *yuv_skinmap_file = NULL;
+#endif
+#ifdef OUTPUT_YUV_REC
+FILE *yuv_rec_file;
+#define FILE_NAME_LEN 100
+#endif
+
+#if 0
+FILE *framepsnr;
+FILE *kf_list;
+FILE *keyfile;
+#endif
+
+#if CONFIG_CFL
+CFL_CTX NULL_CFL;
+#endif
+
+#if CONFIG_INTERNAL_STATS
+typedef enum { Y, U, V, ALL } STAT_TYPE;
+#endif // CONFIG_INTERNAL_STATS
+
+static INLINE void Scale2Ratio(AOM_SCALING mode, int *hr, int *hs) {  // Translates a scaling mode into the pair (*hr, *hs), e.g. FOURFIVE -> 4 and 5.
+ switch (mode) {
+ case NORMAL:
+ *hr = 1;
+ *hs = 1;
+ break;
+ case FOURFIVE:
+ *hr = 4;
+ *hs = 5;
+ break;
+ case THREEFIVE:
+ *hr = 3;
+ *hs = 5;
+ break;
+ case ONETWO:
+ *hr = 1;
+ *hs = 2;
+ break;
+ default:  // unrecognized mode: outputs are set to 1 and 1 before the assert fires
+ *hr = 1;
+ *hs = 1;
+ assert(0);
+ break;
+ }
+}
+
+// Mark all inactive blocks as active. Other segmentation features may be set
+// so memset cannot be used, instead only inactive blocks should be reset.
+static void suppress_active_map(AV1_COMP *cpi) {
+ unsigned char *const seg_map = cpi->segmentation_map;
+ int i;
+ if (cpi->active_map.enabled || cpi->active_map.update)  // nothing to do unless the active map is enabled or pending update
+ for (i = 0; i < cpi->common.mi_rows * cpi->common.mi_cols; ++i)  // one seg_map entry per mi unit
+ if (seg_map[i] == AM_SEGMENT_ID_INACTIVE)
+ seg_map[i] = AM_SEGMENT_ID_ACTIVE;
+}
+
+static void apply_active_map(AV1_COMP *cpi) {  // Applies a pending active-map update to the segmentation map and the inactive segment's features.
+ struct segmentation *const seg = &cpi->common.seg;
+ unsigned char *const seg_map = cpi->segmentation_map;
+ const unsigned char *const active_map = cpi->active_map.map;
+ int i;
+
+ assert(AM_SEGMENT_ID_ACTIVE == CR_SEGMENT_ID_BASE);
+
+ if (frame_is_intra_only(&cpi->common)) {  // on intra-only frames the map is switched off and an update is forced
+ cpi->active_map.enabled = 0;
+ cpi->active_map.update = 1;
+ }
+
+ if (cpi->active_map.update) {
+ if (cpi->active_map.enabled) {
+ for (i = 0; i < cpi->common.mi_rows * cpi->common.mi_cols; ++i)
+ if (seg_map[i] == AM_SEGMENT_ID_ACTIVE) seg_map[i] = active_map[i];  // entries holding any other segment id are left untouched
+ av1_enable_segmentation(seg);
+ av1_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_SKIP);
+ av1_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF);
+ // Setting the data to -MAX_LOOP_FILTER will result in the computed loop
+ // filter level being zero regardless of the value of seg->abs_delta.
+ av1_set_segdata(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF,
+ -MAX_LOOP_FILTER);
+ } else {  // map disabled: remove the inactive segment's features again
+ av1_disable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_SKIP);
+ av1_disable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF);
+ if (seg->enabled) {
+ seg->update_data = 1;
+ seg->update_map = 1;
+ }
+ }
+ cpi->active_map.update = 0;  // the pending update has been consumed
+ }
+}
+
+int av1_set_active_map(AV1_COMP *cpi, unsigned char *new_map_16x16, int rows,  // Installs a caller-supplied map, or disables the map when it is NULL; returns 0, or -1 on a size mismatch.
+ int cols) {
+ if (rows == cpi->common.mb_rows && cols == cpi->common.mb_cols) {  // dimensions must match mb_rows x mb_cols exactly
+ unsigned char *const active_map_8x8 = cpi->active_map.map;
+ const int mi_rows = cpi->common.mi_rows;
+ const int mi_cols = cpi->common.mi_cols;
+ const int row_scale = mi_size_high[BLOCK_16X16] == 2 ? 1 : 2;  // right-shift that maps an mi row to its row in the caller's map
+ const int col_scale = mi_size_wide[BLOCK_16X16] == 2 ? 1 : 2;  // same for columns
+ cpi->active_map.update = 1;
+ if (new_map_16x16) {
+ int r, c;
+ for (r = 0; r < mi_rows; ++r) {
+ for (c = 0; c < mi_cols; ++c) {
+ active_map_8x8[r * mi_cols + c] =  // nonzero input -> ACTIVE segment id, zero -> INACTIVE
+ new_map_16x16[(r >> row_scale) * cols + (c >> col_scale)]
+ ? AM_SEGMENT_ID_ACTIVE
+ : AM_SEGMENT_ID_INACTIVE;
+ }
+ }
+ cpi->active_map.enabled = 1;
+ } else {
+ cpi->active_map.enabled = 0;  // a NULL map disables the feature (update stays flagged)
+ }
+ return 0;
+ } else {
+ return -1;
+ }
+}
+
+// Reports the current active map at 16x16 granularity.
+//
+// Returns -1 when new_map_16x16 is NULL or rows/cols differ from the frame's
+// mb_rows/mb_cols, and 0 otherwise. With the active map disabled every output
+// entry is 1. With it enabled an entry is 1 when at least one of the mi units
+// it covers carries a segment id other than AM_SEGMENT_ID_INACTIVE.
+int av1_get_active_map(AV1_COMP *cpi, unsigned char *new_map_16x16, int rows,
+                       int cols) {
+  if (rows != cpi->common.mb_rows || cols != cpi->common.mb_cols ||
+      !new_map_16x16) {
+    return -1;
+  }
+
+  {
+    unsigned char *const seg_map = cpi->segmentation_map;
+    const int mi_rows = cpi->common.mi_rows;
+    const int mi_cols = cpi->common.mi_cols;
+    const int row_shift = mi_size_high[BLOCK_16X16] == 2 ? 1 : 2;
+    const int col_shift = mi_size_wide[BLOCK_16X16] == 2 ? 1 : 2;
+    int mi_row, mi_col;
+
+    memset(new_map_16x16, !cpi->active_map.enabled, rows * cols);
+    if (!cpi->active_map.enabled) return 0;
+
+    for (mi_row = 0; mi_row < mi_rows; ++mi_row) {
+      for (mi_col = 0; mi_col < mi_cols; ++mi_col) {
+        // Cyclic refresh segments are considered active despite not having
+        // AM_SEGMENT_ID_ACTIVE
+        if (seg_map[mi_row * mi_cols + mi_col] != AM_SEGMENT_ID_INACTIVE) {
+          new_map_16x16[(mi_row >> row_shift) * cols +
+                        (mi_col >> col_shift)] |= 1;
+        }
+      }
+    }
+  }
+  return 0;
+}
+
+// Stores the high precision MV flag in the common state and points the
+// macroblock's motion vector cost tables at the matching set (the _hp tables
+// when the flag is non-zero, the regular ones otherwise).
+void av1_set_high_precision_mv(AV1_COMP *cpi, int allow_high_precision_mv) {
+  MACROBLOCK *const x = &cpi->td.mb;
+  cpi->common.allow_high_precision_mv = allow_high_precision_mv;
+
+#if CONFIG_REF_MV
+  {
+    int ctx;
+    for (ctx = 0; ctx < NMV_CONTEXTS; ++ctx) {
+      if (cpi->common.allow_high_precision_mv) {
+        x->mv_cost_stack[ctx] = x->nmvcost_hp[ctx];
+        x->mvsadcost = x->nmvsadcost_hp;
+      } else {
+        x->mv_cost_stack[ctx] = x->nmvcost[ctx];
+        x->mvsadcost = x->nmvsadcost;
+      }
+    }
+  }
+#else
+  // In this configuration the SAD cost pointer is set to the same table as
+  // mvcost.
+  if (!cpi->common.allow_high_precision_mv) {
+    x->mvcost = x->nmvcost;
+    x->mvsadcost = x->nmvcost;
+  } else {
+    x->mvcost = x->nmvcost_hp;
+    x->mvsadcost = x->nmvcost_hp;
+  }
+#endif
+}
+
+// Returns the superblock size to encode with. Without CONFIG_EXT_PARTITION
+// this is always BLOCK_64X64. With it, the two fixed settings map directly to
+// their block size and the dynamic setting currently yields BLOCK_128X128.
+static BLOCK_SIZE select_sb_size(const AV1_COMP *const cpi) {
+#if CONFIG_EXT_PARTITION
+  switch (cpi->oxcf.superblock_size) {
+    case AOM_SUPERBLOCK_SIZE_64X64: return BLOCK_64X64;
+    case AOM_SUPERBLOCK_SIZE_128X128: return BLOCK_128X128;
+    default: break;
+  }
+
+  // Only the dynamic setting is left at this point.
+  assert(cpi->oxcf.superblock_size == AOM_SUPERBLOCK_SIZE_DYNAMIC);
+
+  assert(IMPLIES(cpi->common.tile_cols > 1,
+                 cpi->common.tile_width % MAX_MIB_SIZE == 0));
+  assert(IMPLIES(cpi->common.tile_rows > 1,
+                 cpi->common.tile_height % MAX_MIB_SIZE == 0));
+
+  // TODO(any): Possibly could improve this with a heuristic.
+  return BLOCK_128X128;
+#else
+  (void)cpi;
+  return BLOCK_64X64;
+#endif  // CONFIG_EXT_PARTITION
+}
+
+static void setup_frame(AV1_COMP *cpi) { /*
+ * Per-frame setup: selects the entropy context for the frame, loads it into
+ * *cm->fc for non-key frames, zeroes interp_filter_selected (all of it on key
+ * frames, entry 0 otherwise), clears vaq_refresh and applies the superblock
+ * size returned by select_sb_size().
+ */
+ AV1_COMMON *const cm = &cpi->common;
+ // Set up entropy context depending on frame type. The decoder mandates
+ // the use of the default context, index 0, for keyframes and inter
+ // frames where the error_resilient_mode or intra_only flag is set. For
+ // other inter-frames the context index follows the frame's role: ARF,
+ // overlay, golden, backward reference (CONFIG_EXT_REFS) or regular.
+ if (frame_is_intra_only(cm) || cm->error_resilient_mode) {
+ av1_setup_past_independence(cm);
+ } else {
+#if CONFIG_EXT_REFS
+ const GF_GROUP *gf_group = &cpi->twopass.gf_group;
+ if (gf_group->rf_level[gf_group->index] == GF_ARF_LOW)
+ cm->frame_context_idx = EXT_ARF_FRAME;
+ else if (cpi->refresh_alt_ref_frame)
+ cm->frame_context_idx = ARF_FRAME;
+#else
+ if (cpi->refresh_alt_ref_frame) cm->frame_context_idx = ARF_FRAME;
+#endif // CONFIG_EXT_REFS
+ else if (cpi->rc.is_src_frame_alt_ref) // continues the if-chain opened in either #if branch above
+ cm->frame_context_idx = OVERLAY_FRAME;
+ else if (cpi->refresh_golden_frame)
+ cm->frame_context_idx = GLD_FRAME;
+#if CONFIG_EXT_REFS
+ else if (cpi->refresh_bwd_ref_frame)
+ cm->frame_context_idx = BRF_FRAME;
+#endif // CONFIG_EXT_REFS
+ else
+ cm->frame_context_idx = REGULAR_FRAME;
+ }
+
+ if (cm->frame_type == KEY_FRAME) {
+ cpi->refresh_golden_frame = 1; // key frames refresh both golden and alt-ref
+ cpi->refresh_alt_ref_frame = 1;
+ av1_zero(cpi->interp_filter_selected);
+ } else {
+ *cm->fc = cm->frame_contexts[cm->frame_context_idx]; // start from the saved context picked above
+ av1_zero(cpi->interp_filter_selected[0]);
+ }
+#if CONFIG_EXT_REFS
+#if CONFIG_LOWDELAY_COMPOUND // No change to bitstream
+ if (cpi->sf.recode_loop == DISALLOW_RECODE) {
+ cpi->refresh_bwd_ref_frame = cpi->refresh_last_frame; // backward ref follows the LAST refresh flag
+ cpi->rc.is_bipred_frame = 1;
+ }
+#endif
+#endif
+
+ cpi->vaq_refresh = 0;
+
+ set_sb_size(cm, select_sb_size(cpi));
+}
+
+// Points the visible mode-info pointers one row and one column into their
+// backing arrays, clears the current frame's mode-info array and grid, and
+// clears the top border row and left border column of the previous frame's
+// mode-info array.
+static void av1_enc_setup_mi(AV1_COMMON *cm) {
+  int row;
+
+  // Current frame: visible pointer plus a full clear of the backing array.
+  cm->mi = cm->mip + cm->mi_stride + 1;
+  memset(cm->mip, 0, cm->mi_stride * (cm->mi_rows + 1) * sizeof(*cm->mip));
+
+  // Previous frame: only the border entries are cleared.
+  cm->prev_mi = cm->prev_mip + cm->mi_stride + 1;
+  // Clear top border row
+  memset(cm->prev_mip, 0, sizeof(*cm->prev_mip) * cm->mi_stride);
+  // Clear left border column
+  for (row = 1; row < cm->mi_rows + 1; ++row) {
+    memset(cm->prev_mip + row * cm->mi_stride, 0, sizeof(*cm->prev_mip));
+  }
+
+  // Pointer grids.
+  cm->mi_grid_visible = cm->mi_grid_base + cm->mi_stride + 1;
+  cm->prev_mi_grid_visible = cm->prev_mi_grid_base + cm->mi_stride + 1;
+
+  memset(cm->mi_grid_base, 0,
+         cm->mi_stride * (cm->mi_rows + 1) * sizeof(*cm->mi_grid_base));
+}
+
+// Allocates the zero-initialised current/previous mode-info arrays and the
+// matching pointer grids, each with mi_size entries. Returns 0 on success and
+// 1 as soon as one allocation fails; buffers allocated before the failure are
+// left in place.
+static int av1_enc_alloc_mi(AV1_COMMON *cm, int mi_size) {
+  cm->mip = aom_calloc(mi_size, sizeof(*cm->mip));
+  if (cm->mip == NULL) return 1;
+
+  cm->prev_mip = aom_calloc(mi_size, sizeof(*cm->prev_mip));
+  if (cm->prev_mip == NULL) return 1;
+
+  // Recorded once both mode-info arrays exist.
+  cm->mi_alloc_size = mi_size;
+
+  cm->mi_grid_base = (MODE_INFO **)aom_calloc(mi_size, sizeof(MODE_INFO *));
+  if (cm->mi_grid_base == NULL) return 1;
+
+  cm->prev_mi_grid_base =
+      (MODE_INFO **)aom_calloc(mi_size, sizeof(MODE_INFO *));
+  if (cm->prev_mi_grid_base == NULL) return 1;
+
+  return 0;
+}
+
+// Releases the mode-info arrays and pointer grids and resets the four owning
+// pointers to NULL.
+static void av1_enc_free_mi(AV1_COMMON *cm) {
+  // Mode-info arrays.
+  aom_free(cm->mip);
+  aom_free(cm->prev_mip);
+  // Pointer grids.
+  aom_free(cm->mi_grid_base);
+  aom_free(cm->prev_mi_grid_base);
+
+  cm->mip = NULL;
+  cm->prev_mip = NULL;
+  cm->mi_grid_base = NULL;
+  cm->prev_mi_grid_base = NULL;
+}
+
+// Exchanges the current and previous mode-info arrays and pointer grids, then
+// recomputes the four visible pointers (one row and one column into each
+// backing array).
+static void av1_swap_mi_and_prev_mi(AV1_COMMON *cm) {
+  // Current mip will be the prev_mip for the next frame.
+  MODE_INFO *const old_prev_mip = cm->prev_mip;
+  MODE_INFO **const old_prev_grid = cm->prev_mi_grid_base;
+
+  cm->prev_mip = cm->mip;
+  cm->prev_mi_grid_base = cm->mi_grid_base;
+  cm->mip = old_prev_mip;
+  cm->mi_grid_base = old_prev_grid;
+
+  // Update the upper left visible macroblock ptrs.
+  cm->mi = cm->mip + cm->mi_stride + 1;
+  cm->prev_mi = cm->prev_mip + cm->mi_stride + 1;
+  cm->mi_grid_visible = cm->mi_grid_base + cm->mi_stride + 1;
+  cm->prev_mi_grid_visible = cm->prev_mi_grid_base + cm->mi_stride + 1;
+}
+
+// Runs the encoder's global initialisation routines. A function-local static
+// flag turns every call after the first completed one into a no-op.
+void av1_initialize_enc(void) {
+  static volatile int init_done = 0;
+
+  if (init_done) return;
+
+  // Run-time dispatch tables.
+  av1_rtcd();
+  aom_dsp_rtcd();
+  aom_scale_rtcd();
+
+  av1_init_intra_predictors();
+  av1_init_me_luts();
+#if !CONFIG_XIPHRC
+  av1_rc_init_minq_luts();
+#endif
+  av1_entropy_mv_init();
+  av1_encode_token_init();
+#if CONFIG_EXT_INTER
+  av1_init_wedge_masks();
+#endif
+
+  init_done = 1;
+}
+
+// Releases every buffer owned by the compressor instance.
+//
+// Each pointer that is freed here is also reset to NULL. That matters for
+// cpi->lookahead and cpi->extra_rstbuf in particular: alloc_raw_frame_buffers()
+// only creates a lookahead when the pointer is NULL, and
+// alloc_util_frame_buffers() frees extra_rstbuf before re-allocating it, so a
+// stale value would lead to use-after-free or a double free.
+static void dealloc_compressor_data(AV1_COMP *cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  int i;
+
+  aom_free(cpi->mbmi_ext_base);
+  cpi->mbmi_ext_base = NULL;
+
+#if CONFIG_PVQ
+  if (cpi->oxcf.pass != 1) {
+    const int tile_cols = cm->tile_cols;
+    const int tile_rows = cm->tile_rows;
+    int tile_col, tile_row;
+
+    // The per-tile PVQ queues live inside tile_data, so they go first.
+    for (tile_row = 0; tile_row < tile_rows; ++tile_row)
+      for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
+        TileDataEnc *tile_data =
+            &cpi->tile_data[tile_row * tile_cols + tile_col];
+        aom_free(tile_data->pvq_q.buf);
+      }
+  }
+#endif
+  aom_free(cpi->tile_data);
+  cpi->tile_data = NULL;
+
+  // Delete segmentation map
+  aom_free(cpi->segmentation_map);
+  cpi->segmentation_map = NULL;
+
+  av1_cyclic_refresh_free(cpi->cyclic_refresh);
+  cpi->cyclic_refresh = NULL;
+
+  aom_free(cpi->active_map.map);
+  cpi->active_map.map = NULL;
+
+  // Free up-sampled reference buffers.
+  for (i = 0; i < (REF_FRAMES + 1); i++)
+    aom_free_frame_buffer(&cpi->upsampled_ref_bufs[i].buf);
+
+  av1_free_ref_frame_buffers(cm->buffer_pool);
+#if CONFIG_LV_MAP
+  av1_free_txb_buf(cpi);
+#endif
+  av1_free_context_buffers(cm);
+
+  aom_free_frame_buffer(&cpi->last_frame_uf);
+#if CONFIG_LOOP_RESTORATION
+  av1_free_restoration_buffers(cm);
+  aom_free_frame_buffer(&cpi->last_frame_db);
+  aom_free_frame_buffer(&cpi->trial_frame_rst);
+  aom_free(cpi->extra_rstbuf);
+  cpi->extra_rstbuf = NULL;
+  for (i = 0; i < MAX_MB_PLANE; ++i)
+    av1_free_restoration_struct(&cpi->rst_search[i]);
+#endif  // CONFIG_LOOP_RESTORATION
+  aom_free_frame_buffer(&cpi->scaled_source);
+  aom_free_frame_buffer(&cpi->scaled_last_source);
+  aom_free_frame_buffer(&cpi->alt_ref_buffer);
+  av1_lookahead_destroy(cpi->lookahead);
+  cpi->lookahead = NULL;
+
+  // Only element [0][0] is freed here.
+  aom_free(cpi->tile_tok[0][0]);
+  cpi->tile_tok[0][0] = NULL;
+
+  av1_free_pc_tree(&cpi->td);
+  av1_free_var_tree(&cpi->td);
+
+#if CONFIG_PALETTE
+  if (cpi->common.allow_screen_content_tools) {
+    aom_free(cpi->td.mb.palette_buffer);
+    cpi->td.mb.palette_buffer = NULL;
+  }
+#endif  // CONFIG_PALETTE
+
+  // No NULL guard: dealloc_compressor_data already calls aom_free()
+  // unconditionally on possibly-NULL pointers (e.g. mbmi_ext_base above).
+  aom_free(cpi->source_diff_var);
+  cpi->source_diff_var = NULL;
+#if CONFIG_ANS
+  aom_buf_ans_free(&cpi->buf_ans);
+#endif  // CONFIG_ANS
+}
+
+// Stores a snapshot of key state variables (MV cost tables, loop filter
+// deltas and the frame context) in cpi->coding_context so that
+// restore_coding_context() can put them back later. The pair is intended for
+// use in a re-code loop where the quantizer value is adjusted between loop
+// iterations.
+static void save_coding_context(AV1_COMP *cpi) {
+  CODING_CONTEXT *const snapshot = &cpi->coding_context;
+  AV1_COMMON *cm = &cpi->common;
+
+#if CONFIG_REF_MV
+  {
+    int ctx;
+    for (ctx = 0; ctx < NMV_CONTEXTS; ++ctx) {
+      av1_copy(snapshot->nmv_vec_cost[ctx], cpi->td.mb.nmv_vec_cost[ctx]);
+      av1_copy(snapshot->nmv_costs, cpi->nmv_costs);
+      av1_copy(snapshot->nmv_costs_hp, cpi->nmv_costs_hp);
+    }
+  }
+#else
+  av1_copy(snapshot->nmvjointcost, cpi->td.mb.nmvjointcost);
+#endif
+
+  // Motion vector cost tables.
+  av1_copy(snapshot->nmvcosts, cpi->nmvcosts);
+  av1_copy(snapshot->nmvcosts_hp, cpi->nmvcosts_hp);
+
+  // Loop filter deltas.
+  av1_copy(snapshot->last_ref_lf_deltas, cm->lf.last_ref_deltas);
+  av1_copy(snapshot->last_mode_lf_deltas, cm->lf.last_mode_deltas);
+
+  // Frame (entropy) context.
+  snapshot->fc = *cm->fc;
+}
+
+// Restores key state variables (MV cost tables, loop filter deltas and the
+// frame context) from the snapshot that save_coding_context() stored in
+// cpi->coding_context.
+static void restore_coding_context(AV1_COMP *cpi) {
+  CODING_CONTEXT *const snapshot = &cpi->coding_context;
+  AV1_COMMON *cm = &cpi->common;
+
+#if CONFIG_REF_MV
+  {
+    int ctx;
+    for (ctx = 0; ctx < NMV_CONTEXTS; ++ctx) {
+      av1_copy(cpi->td.mb.nmv_vec_cost[ctx], snapshot->nmv_vec_cost[ctx]);
+      av1_copy(cpi->nmv_costs, snapshot->nmv_costs);
+      av1_copy(cpi->nmv_costs_hp, snapshot->nmv_costs_hp);
+    }
+  }
+#else
+  av1_copy(cpi->td.mb.nmvjointcost, snapshot->nmvjointcost);
+#endif
+
+  // Motion vector cost tables.
+  av1_copy(cpi->nmvcosts, snapshot->nmvcosts);
+  av1_copy(cpi->nmvcosts_hp, snapshot->nmvcosts_hp);
+
+  // Loop filter deltas.
+  av1_copy(cm->lf.last_ref_deltas, snapshot->last_ref_lf_deltas);
+  av1_copy(cm->lf.last_mode_deltas, snapshot->last_mode_lf_deltas);
+
+  // Frame (entropy) context.
+  *cm->fc = snapshot->fc;
+}
+
+static void configure_static_seg_features(AV1_COMP *cpi) { /*
+ * Configures segmentation for the current frame from the frame type and the
+ * rate control state. Three cases are handled:
+ *  - key frames: segmentation map, flags and features are all cleared;
+ *  - frames that refresh the alt-ref: everything is cleared, then
+ *    av1_update_mbgraph_stats() may re-enable segmentation, in which case
+ *    segment 1 gets ALT_Q and ALT_LF deltas;
+ *  - any other frame while segmentation is enabled: features are set up,
+ *    torn down or left untouched depending on frames_since_golden,
+ *    source_alt_ref_active and is_src_frame_alt_ref.
+ * Nothing happens for other frames when segmentation is disabled.
+ */
+ AV1_COMMON *const cm = &cpi->common;
+ const RATE_CONTROL *const rc = &cpi->rc;
+ struct segmentation *const seg = &cm->seg;
+
+ int high_q = (int)(rc->avg_q > 48.0); // 1 when the average Q is above 48
+ int qi_delta;
+
+ // Disable and clear down for KF
+ if (cm->frame_type == KEY_FRAME) {
+ // Clear down the global segmentation map
+ memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols);
+ seg->update_map = 0;
+ seg->update_data = 0;
+ cpi->static_mb_pct = 0;
+
+ // Disable segmentation
+ av1_disable_segmentation(seg);
+
+ // Clear down the segment features.
+ av1_clearall_segfeatures(seg);
+ } else if (cpi->refresh_alt_ref_frame) {
+ // If this is an alt ref frame
+ // Clear down the global segmentation map
+ memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols);
+ seg->update_map = 0;
+ seg->update_data = 0;
+ cpi->static_mb_pct = 0;
+
+ // Disable segmentation and individual segment features by default
+ av1_disable_segmentation(seg);
+ av1_clearall_segfeatures(seg);
+
+ // Scan frames from current to arf frame.
+ // This function re-enables segmentation if appropriate.
+ av1_update_mbgraph_stats(cpi);
+
+ // If segmentation was enabled set those features needed for the
+ // arf itself.
+ if (seg->enabled) {
+ seg->update_map = 1;
+ seg->update_data = 1;
+
+ qi_delta =
+ av1_compute_qdelta(rc, rc->avg_q, rc->avg_q * 0.875, cm->bit_depth); // delta towards 0.875 * avg_q
+ av1_set_segdata(seg, 1, SEG_LVL_ALT_Q, qi_delta - 2);
+ av1_set_segdata(seg, 1, SEG_LVL_ALT_LF, -2);
+
+ av1_enable_segfeature(seg, 1, SEG_LVL_ALT_Q);
+ av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF);
+
+ // Where relevant assume segment data is delta data
+ seg->abs_delta = SEGMENT_DELTADATA;
+ }
+ } else if (seg->enabled) {
+ // All other frames if segmentation has been enabled
+
+ // First normal frame in a valid gf or alt ref group
+ if (rc->frames_since_golden == 0) {
+ // Set up segment features for normal frames in an arf group
+ if (rc->source_alt_ref_active) {
+ seg->update_map = 0;
+ seg->update_data = 1;
+ seg->abs_delta = SEGMENT_DELTADATA;
+
+ qi_delta =
+ av1_compute_qdelta(rc, rc->avg_q, rc->avg_q * 1.125, cm->bit_depth); // delta towards 1.125 * avg_q
+ av1_set_segdata(seg, 1, SEG_LVL_ALT_Q, qi_delta + 2);
+ av1_enable_segfeature(seg, 1, SEG_LVL_ALT_Q);
+
+ av1_set_segdata(seg, 1, SEG_LVL_ALT_LF, -2);
+ av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF);
+
+ // Segment coding disabled for compred testing
+ if (high_q || (cpi->static_mb_pct == 100)) {
+ av1_set_segdata(seg, 1, SEG_LVL_REF_FRAME, ALTREF_FRAME);
+ av1_enable_segfeature(seg, 1, SEG_LVL_REF_FRAME);
+ av1_enable_segfeature(seg, 1, SEG_LVL_SKIP);
+ }
+ } else {
+ // Disable segmentation and clear down features if alt ref
+ // is not active for this group
+
+ av1_disable_segmentation(seg);
+
+ memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols);
+
+ seg->update_map = 0;
+ seg->update_data = 0;
+
+ av1_clearall_segfeatures(seg);
+ }
+ } else if (rc->is_src_frame_alt_ref) {
+ // Special case where we are coding over the top of a previous
+ // alt ref frame.
+ // Segment coding disabled for compred testing
+
+ // Enable ref frame features for segment 0 as well
+ av1_enable_segfeature(seg, 0, SEG_LVL_REF_FRAME);
+ av1_enable_segfeature(seg, 1, SEG_LVL_REF_FRAME);
+
+ // All mbs should use ALTREF_FRAME
+ av1_clear_segdata(seg, 0, SEG_LVL_REF_FRAME);
+ av1_set_segdata(seg, 0, SEG_LVL_REF_FRAME, ALTREF_FRAME);
+ av1_clear_segdata(seg, 1, SEG_LVL_REF_FRAME);
+ av1_set_segdata(seg, 1, SEG_LVL_REF_FRAME, ALTREF_FRAME);
+
+ // Skip all MBs if high Q (0,0 mv and skip coeffs)
+ if (high_q) {
+ av1_enable_segfeature(seg, 0, SEG_LVL_SKIP);
+ av1_enable_segfeature(seg, 1, SEG_LVL_SKIP);
+ }
+ // Enable data update
+ seg->update_data = 1;
+ } else {
+ // All other frames.
+
+ // No updates.. leave things as they are.
+ seg->update_map = 0;
+ seg->update_data = 0;
+ }
+ }
+}
+
+// Copies the segment id of every visible mode-info entry into
+// cm->last_frame_seg_map. The source grid is walked with a row pitch of
+// mi_stride, the destination with a row pitch of mi_cols.
+static void update_reference_segmentation_map(AV1_COMP *cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  MODE_INFO **const grid = cm->mi_grid_visible;
+  uint8_t *const seg_cache = cm->last_frame_seg_map;
+  int row, col;
+
+  for (row = 0; row < cm->mi_rows; row++) {
+    MODE_INFO **const grid_row = grid + row * cm->mi_stride;
+    uint8_t *const cache_row = seg_cache + row * cm->mi_cols;
+
+    for (col = 0; col < cm->mi_cols; col++) {
+      cache_row[col] = grid_row[col]->mbmi.segment_id;
+    }
+  }
+}
+
+// Creates the lookahead queue (only when none exists yet) and (re)allocates
+// the alt-ref buffer, both sized from the encoder configuration's width and
+// height. Allocation failures are reported through aom_internal_error() with
+// AOM_CODEC_MEM_ERROR.
+static void alloc_raw_frame_buffers(AV1_COMP *cpi) {
+  AV1_COMMON *cm = &cpi->common;
+  const AV1EncoderConfig *oxcf = &cpi->oxcf;
+
+  if (cpi->lookahead == NULL) {
+    cpi->lookahead = av1_lookahead_init(oxcf->width, oxcf->height,
+                                        cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_HIGHBITDEPTH
+                                        cm->use_highbitdepth,
+#endif
+                                        oxcf->lag_in_frames);
+    if (cpi->lookahead == NULL) {
+      aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+                         "Failed to allocate lag buffers");
+    }
+  }
+
+  // TODO(agrange) Check if ARF is enabled and skip allocation if not.
+  if (aom_realloc_frame_buffer(&cpi->alt_ref_buffer, oxcf->width, oxcf->height,
+                               cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_HIGHBITDEPTH
+                               cm->use_highbitdepth,
+#endif
+                               AOM_BORDER_IN_PIXELS, cm->byte_alignment, NULL,
+                               NULL, NULL)) {
+    aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+                       "Failed to allocate altref buffer");
+  }
+}
+
+static void alloc_util_frame_buffers(AV1_COMP *cpi) { /*
+ * (Re)allocates the encoder's utility frame buffers at the current coded
+ * size (cm->width x cm->height): last_frame_uf, scaled_source and
+ * scaled_last_source, plus last_frame_db, trial_frame_rst and the extra
+ * restoration scratch buffer when CONFIG_LOOP_RESTORATION is on. Every failed
+ * allocation is reported through aom_internal_error() with
+ * AOM_CODEC_MEM_ERROR.
+ */
+ AV1_COMMON *const cm = &cpi->common;
+ if (aom_realloc_frame_buffer(&cpi->last_frame_uf, cm->width, cm->height,
+ cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_HIGHBITDEPTH
+ cm->use_highbitdepth,
+#endif
+ AOM_BORDER_IN_PIXELS, cm->byte_alignment, NULL,
+ NULL, NULL))
+ aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate last frame buffer");
+
+#if CONFIG_LOOP_RESTORATION
+ if (aom_realloc_frame_buffer(&cpi->last_frame_db, cm->width, cm->height,
+ cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_HIGHBITDEPTH
+ cm->use_highbitdepth,
+#endif
+ AOM_BORDER_IN_PIXELS, cm->byte_alignment, NULL,
+ NULL, NULL))
+ aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate last frame deblocked buffer");
+ if (aom_realloc_frame_buffer(&cpi->trial_frame_rst, cm->width, cm->height,
+ cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_HIGHBITDEPTH
+ cm->use_highbitdepth,
+#endif
+ AOM_BORDER_IN_PIXELS, cm->byte_alignment, NULL,
+ NULL, NULL))
+ aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate trial restored frame buffer");
+ int extra_rstbuf_sz = RESTORATION_EXTBUF_SIZE;
+ if (extra_rstbuf_sz > 0) {
+ aom_free(cpi->extra_rstbuf); // drop any buffer from an earlier call before re-allocating
+ CHECK_MEM_ERROR(cm, cpi->extra_rstbuf,
+ (uint8_t *)aom_malloc(extra_rstbuf_sz));
+ } else {
+ cpi->extra_rstbuf = NULL; // NOTE(review): overwritten without a free; fine only if it was never allocated - confirm
+ }
+#endif // CONFIG_LOOP_RESTORATION
+
+ if (aom_realloc_frame_buffer(&cpi->scaled_source, cm->width, cm->height,
+ cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_HIGHBITDEPTH
+ cm->use_highbitdepth,
+#endif
+ AOM_BORDER_IN_PIXELS, cm->byte_alignment, NULL,
+ NULL, NULL))
+ aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate scaled source buffer");
+
+ if (aom_realloc_frame_buffer(&cpi->scaled_last_source, cm->width, cm->height,
+ cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_HIGHBITDEPTH
+ cm->use_highbitdepth,
+#endif
+ AOM_BORDER_IN_PIXELS, cm->byte_alignment, NULL,
+ NULL, NULL))
+ aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate scaled last source buffer");
+}
+
+// (Re)allocates cpi->mbmi_ext_base as a zero-initialised array with one entry
+// per mi unit (mi_cols * mi_rows).
+//
+// An array left over from an earlier call is released first so that repeated
+// calls do not leak it. Returns 0 on success and 1 when the allocation fails,
+// in which case cpi->mbmi_ext_base is NULL.
+static int alloc_context_buffers_ext(AV1_COMP *cpi) {
+  AV1_COMMON *cm = &cpi->common;
+  int mi_size = cm->mi_cols * cm->mi_rows;
+
+  // Freeing a NULL pointer is harmless; this file already frees
+  // tile_tok[0][0] unconditionally before re-allocating it.
+  aom_free(cpi->mbmi_ext_base);
+  cpi->mbmi_ext_base = aom_calloc(mi_size, sizeof(*cpi->mbmi_ext_base));
+  if (!cpi->mbmi_ext_base) return 1;
+
+  return 0;
+}
+
+// Allocates the size-dependent encoder state for the current cm->width x
+// cm->height: context buffers, the per-mi extension array, the token buffer
+// and the partition-search tree. Allocation failures of the extension array
+// and the token buffer are reported through aom_internal_error() with
+// AOM_CODEC_MEM_ERROR.
+void av1_alloc_compressor_data(AV1_COMP *cpi) {
+  AV1_COMMON *cm = &cpi->common;
+
+  // NOTE(review): the result of av1_alloc_context_buffers() is not inspected
+  // here; confirm that it reports failures on its own.
+  av1_alloc_context_buffers(cm, cm->width, cm->height);
+
+#if CONFIG_LV_MAP
+  av1_alloc_txb_buf(cpi);
+#endif
+
+  // A failed allocation leaves mbmi_ext_base NULL, and update_frame_size()
+  // later clears that array with memset(), so the failure must be reported
+  // instead of being dropped.
+  if (alloc_context_buffers_ext(cpi))
+    aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+                       "Failed to allocate mbmi_ext_base");
+
+  // Release the token buffer from an earlier call before re-allocating it.
+  aom_free(cpi->tile_tok[0][0]);
+
+  {
+    unsigned int tokens = get_token_alloc(cm->mb_rows, cm->mb_cols);
+    CHECK_MEM_ERROR(cm, cpi->tile_tok[0][0],
+                    aom_calloc(tokens, sizeof(*cpi->tile_tok[0][0])));
+#if CONFIG_ANS && !ANS_MAX_SYMBOLS
+    aom_buf_ans_alloc(&cpi->buf_ans, &cm->error, (int)tokens);
+#endif  // CONFIG_ANS
+  }
+
+  av1_setup_pc_tree(&cpi->common, &cpi->td);
+}
+
+// Stores a new frame rate (values below 0.1 are replaced by 30) and lets the
+// active rate controller pick it up. With CONFIG_XIPHRC nothing further is
+// done while od_rc.cur_frame is zero.
+void av1_new_framerate(AV1_COMP *cpi, double framerate) {
+  if (framerate < 0.1) {
+    cpi->framerate = 30;
+  } else {
+    cpi->framerate = framerate;
+  }
+#if CONFIG_XIPHRC
+  if (!cpi->od_rc.cur_frame) return;
+  cpi->od_rc.framerate = cpi->framerate;
+  od_enc_rc_resize(&cpi->od_rc);
+#else
+  av1_rc_update_framerate(cpi);
+#endif
+}
+
+static void set_tile_info(AV1_COMP *cpi) { /*
+ * Derives the tile layout (tile_width/tile_height in mi units, tile_cols and
+ * tile_rows) from the encoder configuration and the frame size.
+ *  - CONFIG_EXT_TILE: oxcf.tile_columns/tile_rows are clamped, shifted into
+ *    mi units, capped to the frame size, and the tile counts are whatever is
+ *    needed to cover the frame.
+ *  - otherwise: oxcf.tile_columns/tile_rows are log2 tile counts (columns
+ *    clamped to the range from av1_get_tile_n_bits) and the tile size is the
+ *    aligned frame size divided by the count, rounded up to a multiple of the
+ *    maximum superblock size.
+ * Also copies the dependent-horizontal-tiles, tile-group and
+ * loop-filter-across-tiles settings when those features are compiled in.
+ */
+ AV1_COMMON *const cm = &cpi->common;
+#if CONFIG_TILE_GROUPS && CONFIG_DEPENDENT_HORZTILES
+ int tile_row, tile_col, num_tiles_in_tg;
+ int tg_row_start, tg_col_start;
+#endif
+#if CONFIG_EXT_TILE
+#if CONFIG_EXT_PARTITION
+ if (cpi->oxcf.superblock_size != AOM_SUPERBLOCK_SIZE_64X64) {
+ cm->tile_width = clamp(cpi->oxcf.tile_columns, 1, 32);
+ cm->tile_height = clamp(cpi->oxcf.tile_rows, 1, 32);
+ cm->tile_width <<= MAX_MIB_SIZE_LOG2;
+ cm->tile_height <<= MAX_MIB_SIZE_LOG2;
+ } else {
+ cm->tile_width = clamp(cpi->oxcf.tile_columns, 1, 64);
+ cm->tile_height = clamp(cpi->oxcf.tile_rows, 1, 64);
+ cm->tile_width <<= MAX_MIB_SIZE_LOG2 - 1; // one bit less than in the branch above
+ cm->tile_height <<= MAX_MIB_SIZE_LOG2 - 1;
+ }
+#else
+ cm->tile_width = clamp(cpi->oxcf.tile_columns, 1, 64);
+ cm->tile_height = clamp(cpi->oxcf.tile_rows, 1, 64);
+ cm->tile_width <<= MAX_MIB_SIZE_LOG2;
+ cm->tile_height <<= MAX_MIB_SIZE_LOG2;
+#endif // CONFIG_EXT_PARTITION
+
+ cm->tile_width = AOMMIN(cm->tile_width, cm->mi_cols); // a tile is never larger than the frame
+ cm->tile_height = AOMMIN(cm->tile_height, cm->mi_rows);
+
+ assert(cm->tile_width >> MAX_MIB_SIZE <= 32); // NOTE(review): shifts by MAX_MIB_SIZE although the sizes above were built with MAX_MIB_SIZE_LOG2 - confirm the intended bound
+ assert(cm->tile_height >> MAX_MIB_SIZE <= 32); // NOTE(review): same question as the line above
+
+ // Get the number of tiles
+ cm->tile_cols = 1;
+ while (cm->tile_cols * cm->tile_width < cm->mi_cols) ++cm->tile_cols;
+
+ cm->tile_rows = 1;
+ while (cm->tile_rows * cm->tile_height < cm->mi_rows) ++cm->tile_rows;
+#else
+ int min_log2_tile_cols, max_log2_tile_cols;
+ av1_get_tile_n_bits(cm->mi_cols, &min_log2_tile_cols, &max_log2_tile_cols);
+
+ cm->log2_tile_cols =
+ clamp(cpi->oxcf.tile_columns, min_log2_tile_cols, max_log2_tile_cols);
+ cm->log2_tile_rows = cpi->oxcf.tile_rows; // taken as-is, no clamping in this function
+
+ cm->tile_cols = 1 << cm->log2_tile_cols;
+ cm->tile_rows = 1 << cm->log2_tile_rows;
+
+ cm->tile_width = ALIGN_POWER_OF_TWO(cm->mi_cols, MAX_MIB_SIZE_LOG2);
+ cm->tile_width >>= cm->log2_tile_cols;
+ cm->tile_height = ALIGN_POWER_OF_TWO(cm->mi_rows, MAX_MIB_SIZE_LOG2);
+ cm->tile_height >>= cm->log2_tile_rows;
+
+ // round to integer multiples of max superblock size
+ cm->tile_width = ALIGN_POWER_OF_TWO(cm->tile_width, MAX_MIB_SIZE_LOG2);
+ cm->tile_height = ALIGN_POWER_OF_TWO(cm->tile_height, MAX_MIB_SIZE_LOG2);
+#endif // CONFIG_EXT_TILE
+
+#if CONFIG_DEPENDENT_HORZTILES
+ cm->dependent_horz_tiles = cpi->oxcf.dependent_horz_tiles;
+ if (cm->log2_tile_rows == 0) cm->dependent_horz_tiles = 0; // NOTE(review): log2_tile_rows is only assigned in the non-EXT_TILE path of this function
+#if CONFIG_TILE_GROUPS
+ if (cpi->oxcf.mtu == 0) {
+ cm->num_tg = cpi->oxcf.num_tile_groups;
+ } else {
+ // Use a default value for the purposes of weighting costs in probability
+ // updates
+ cm->num_tg = DEFAULT_MAX_NUM_TG;
+ }
+ num_tiles_in_tg =
+ (cm->tile_cols * cm->tile_rows + cm->num_tg - 1) / cm->num_tg; // ceil(tiles / groups); assumes num_tg is non-zero - TODO confirm
+ tg_row_start = 0;
+ tg_col_start = 0;
+ for (tile_row = 0; tile_row < cm->tile_rows; ++tile_row) {
+ for (tile_col = 0; tile_col < cm->tile_cols; ++tile_col) {
+ if ((tile_row * cm->tile_cols + tile_col) % num_tiles_in_tg == 0) { // first tile of a new group in raster order
+ tg_row_start = tile_row;
+ tg_col_start = tile_col;
+ }
+ cm->tile_group_start_row[tile_row][tile_col] = tg_row_start;
+ cm->tile_group_start_col[tile_row][tile_col] = tg_col_start;
+ }
+ }
+#endif
+#endif
+
+#if CONFIG_LOOPFILTERING_ACROSS_TILES
+ cm->loop_filter_across_tiles_enabled =
+ cpi->oxcf.loop_filter_across_tiles_enabled;
+#endif // CONFIG_LOOPFILTERING_ACROSS_TILES
+}
+
+// Re-derives everything that depends on the coded frame size: mb/mi
+// dimensions, context buffers, the macroblock descriptor, the (zeroed) per-mi
+// extension array and the tile layout.
+static void update_frame_size(AV1_COMP *cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+
+  av1_set_mb_mi(cm, cm->width, cm->height);
+  av1_init_context_buffers(cm);
+  av1_init_macroblockd(cm, &cpi->td.mb.e_mbd,
+#if CONFIG_PVQ
+                       NULL,
+#endif
+#if CONFIG_CFL
+                       &NULL_CFL,
+#endif
+                       NULL);
+
+  // One extension entry per mi unit.
+  memset(cpi->mbmi_ext_base, 0,
+         cm->mi_rows * cm->mi_cols * sizeof(*cpi->mbmi_ext_base));
+
+  set_tile_info(cpi);
+}
+
+// Assigns the initial frame buffer index to each reference slot. With
+// CONFIG_EXT_REFS the LAST slots take indices 0..LAST_REF_FRAMES-1, followed
+// by golden, backward and alt-ref, and arf_map starts at the alt-ref index.
+// Without it LAST, golden and alt-ref take 0, 1 and 2.
+static void init_buffer_indices(AV1_COMP *cpi) {
+#if CONFIG_EXT_REFS
+  int slot;
+
+  for (slot = 0; slot < LAST_REF_FRAMES; ++slot) {
+    cpi->lst_fb_idxes[slot] = slot;
+  }
+
+  cpi->gld_fb_idx = LAST_REF_FRAMES;
+  cpi->bwd_fb_idx = LAST_REF_FRAMES + 1;
+  cpi->alt_fb_idx = LAST_REF_FRAMES + 2;
+
+  for (slot = 0; slot < MAX_EXT_ARFS + 1; ++slot) {
+    cpi->arf_map[slot] = LAST_REF_FRAMES + 2 + slot;
+  }
+#else
+  cpi->lst_fb_idx = 0;
+  cpi->gld_fb_idx = 1;
+  cpi->alt_fb_idx = 2;
+#endif  // CONFIG_EXT_REFS
+}
+
+// First-time configuration of a compressor instance: copies the encoder
+// configuration, seeds the common state from it, allocates the size-dependent
+// data and then defers to av1_change_config() for everything that is shared
+// with later reconfiguration.
+static void init_config(struct AV1_COMP *cpi, AV1EncoderConfig *oxcf) {
+  AV1_COMMON *const cm = &cpi->common;
+
+  cpi->oxcf = *oxcf;
+  cpi->framerate = oxcf->init_framerate;
+
+  // Stream format.
+  cm->profile = oxcf->profile;
+  cm->bit_depth = oxcf->bit_depth;
+#if CONFIG_HIGHBITDEPTH
+  cm->use_highbitdepth = oxcf->use_highbitdepth;
+#endif
+  cm->color_space = oxcf->color_space;
+  cm->color_range = oxcf->color_range;
+
+  // The frame size has to be in place before the allocation below.
+  cm->width = oxcf->width;
+  cm->height = oxcf->height;
+  av1_alloc_compressor_data(cpi);
+
+  // Single thread case: use counts in common.
+  cpi->td.counts = &cm->counts;
+
+  // change includes all joint functionality
+  av1_change_config(cpi, oxcf);
+
+  cpi->static_mb_pct = 0;
+  cpi->ref_frame_flags = 0;
+
+  init_buffer_indices(cpi);
+}
+
+// Converts the configured buffer levels from milliseconds into the rate
+// control's units as level_ms * target_bandwidth / 1000. An optimal level or
+// maximum size of 0 selects target_bandwidth / 8 instead.
+static void set_rc_buffer_sizes(RATE_CONTROL *rc,
+                                const AV1EncoderConfig *oxcf) {
+  const int64_t bw = oxcf->target_bandwidth;
+  const int64_t start_ms = oxcf->starting_buffer_level_ms;
+  const int64_t optimal_ms = oxcf->optimal_buffer_level_ms;
+  const int64_t max_ms = oxcf->maximum_buffer_size_ms;
+
+  rc->starting_buffer_level = start_ms * bw / 1000;
+
+  if (optimal_ms == 0) {
+    rc->optimal_buffer_level = bw / 8;
+  } else {
+    rc->optimal_buffer_level = optimal_ms * bw / 1000;
+  }
+
+  if (max_ms == 0) {
+    rc->maximum_buffer_size = bw / 8;
+  } else {
+    rc->maximum_buffer_size = max_ms * bw / 1000;
+  }
+}
+
+#if CONFIG_HIGHBITDEPTH
+#define HIGHBD_BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX3F, SDX8F, SDX4DF) \
+ cpi->fn_ptr[BT].sdf = SDF; \
+ cpi->fn_ptr[BT].sdaf = SDAF; \
+ cpi->fn_ptr[BT].vf = VF; \
+ cpi->fn_ptr[BT].svf = SVF; \
+ cpi->fn_ptr[BT].svaf = SVAF; \
+ cpi->fn_ptr[BT].sdx3f = SDX3F; \
+ cpi->fn_ptr[BT].sdx8f = SDX8F; \
+ cpi->fn_ptr[BT].sdx4df = SDX4DF; /* HIGHBD_BFP: fills the eight listed
+ * members of cpi->fn_ptr[BT]. Requires a variable named cpi in scope and
+ * expands to bare statements (not wrapped in do/while), so it must not be
+ * used as the unbraced body of an if/else. */
+
+#define MAKE_BFP_SAD_WRAPPER(fnname) \
+ static unsigned int fnname##_bits8(const uint8_t *src_ptr, \
+ int source_stride, \
+ const uint8_t *ref_ptr, int ref_stride) { \
+ return fnname(src_ptr, source_stride, ref_ptr, ref_stride); \
+ } \
+ static unsigned int fnname##_bits10( \
+ const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \
+ int ref_stride) { \
+ return fnname(src_ptr, source_stride, ref_ptr, ref_stride) >> 2; \
+ } \
+ static unsigned int fnname##_bits12( \
+ const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \
+ int ref_stride) { \
+ return fnname(src_ptr, source_stride, ref_ptr, ref_stride) >> 4; \
+ } /* MAKE_BFP_SAD_WRAPPER(fnname): defines fnname_bits8, fnname_bits10 and
+ * fnname_bits12. The first forwards to fnname unchanged; the other two
+ * return its result shifted right by 2 and by 4 respectively. */
+
+#define MAKE_BFP_SADAVG_WRAPPER(fnname) \
+ static unsigned int fnname##_bits8( \
+ const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \
+ int ref_stride, const uint8_t *second_pred) { \
+ return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred); \
+ } \
+ static unsigned int fnname##_bits10( \
+ const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \
+ int ref_stride, const uint8_t *second_pred) { \
+ return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred) >> \
+ 2; \
+ } \
+ static unsigned int fnname##_bits12( \
+ const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \
+ int ref_stride, const uint8_t *second_pred) { \
+ return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred) >> \
+ 4; \
+ } /* MAKE_BFP_SADAVG_WRAPPER(fnname): same three wrappers as
+ * MAKE_BFP_SAD_WRAPPER, for functions that take an extra second_pred
+ * argument; results are shifted right by 0, 2 and 4. */
+
+#define MAKE_BFP_SAD3_WRAPPER(fnname) \
+ static void fnname##_bits8(const uint8_t *src_ptr, int source_stride, \
+ const uint8_t *ref_ptr, int ref_stride, \
+ unsigned int *sad_array) { \
+ fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \
+ } \
+ static void fnname##_bits10(const uint8_t *src_ptr, int source_stride, \
+ const uint8_t *ref_ptr, int ref_stride, \
+ unsigned int *sad_array) { \
+ int i; \
+ fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \
+ for (i = 0; i < 3; i++) sad_array[i] >>= 2; \
+ } \
+ static void fnname##_bits12(const uint8_t *src_ptr, int source_stride, \
+ const uint8_t *ref_ptr, int ref_stride, \
+ unsigned int *sad_array) { \
+ int i; \
+ fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \
+ for (i = 0; i < 3; i++) sad_array[i] >>= 4; \
+ } /* MAKE_BFP_SAD3_WRAPPER(fnname): wrappers for functions that write
+ * results into sad_array. The _bits10 and _bits12 variants shift the first 3
+ * entries right by 2 and 4 after the call; _bits8 leaves them untouched. */
+
+#define MAKE_BFP_SAD8_WRAPPER(fnname) \
+ static void fnname##_bits8(const uint8_t *src_ptr, int source_stride, \
+ const uint8_t *ref_ptr, int ref_stride, \
+ unsigned int *sad_array) { \
+ fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \
+ } \
+ static void fnname##_bits10(const uint8_t *src_ptr, int source_stride, \
+ const uint8_t *ref_ptr, int ref_stride, \
+ unsigned int *sad_array) { \
+ int i; \
+ fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \
+ for (i = 0; i < 8; i++) sad_array[i] >>= 2; \
+ } \
+ static void fnname##_bits12(const uint8_t *src_ptr, int source_stride, \
+ const uint8_t *ref_ptr, int ref_stride, \
+ unsigned int *sad_array) { \
+ int i; \
+ fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \
+ for (i = 0; i < 8; i++) sad_array[i] >>= 4; \
+ } /* MAKE_BFP_SAD8_WRAPPER(fnname): as MAKE_BFP_SAD3_WRAPPER, but the
+ * _bits10 and _bits12 variants rescale the first 8 entries of sad_array. */
+#define MAKE_BFP_SAD4D_WRAPPER(fnname) \
+ static void fnname##_bits8(const uint8_t *src_ptr, int source_stride, \
+ const uint8_t *const ref_ptr[], int ref_stride, \
+ unsigned int *sad_array) { \
+ fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \
+ } \
+ static void fnname##_bits10(const uint8_t *src_ptr, int source_stride, \
+ const uint8_t *const ref_ptr[], int ref_stride, \
+ unsigned int *sad_array) { \
+ int i; \
+ fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \
+ for (i = 0; i < 4; i++) sad_array[i] >>= 2; \
+ } \
+ static void fnname##_bits12(const uint8_t *src_ptr, int source_stride, \
+ const uint8_t *const ref_ptr[], int ref_stride, \
+ unsigned int *sad_array) { \
+ int i; \
+ fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \
+ for (i = 0; i < 4; i++) sad_array[i] >>= 4; \
+ } /* MAKE_BFP_SAD4D_WRAPPER(fnname): wrappers for functions that take an
+ * array of reference pointers; the _bits10 and _bits12 variants rescale the
+ * first 4 entries of sad_array by a right shift of 2 and 4. */
+
+#if CONFIG_EXT_PARTITION // 128-wide/high block sizes exist only with extended partitions
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad128x128)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad128x128_avg)
+MAKE_BFP_SAD3_WRAPPER(aom_highbd_sad128x128x3)
+MAKE_BFP_SAD8_WRAPPER(aom_highbd_sad128x128x8)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad128x128x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad128x64)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad128x64_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad128x64x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad64x128)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad64x128_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x128x4d)
+#endif // CONFIG_EXT_PARTITION
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad32x16) // from here on: not gated on CONFIG_EXT_PARTITION
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad32x16_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x16x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad16x32)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad16x32_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x32x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad64x32)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad64x32_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x32x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad32x64)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad32x64_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x64x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad32x32)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad32x32_avg)
+MAKE_BFP_SAD3_WRAPPER(aom_highbd_sad32x32x3)
+MAKE_BFP_SAD8_WRAPPER(aom_highbd_sad32x32x8)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x32x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad64x64)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad64x64_avg)
+MAKE_BFP_SAD3_WRAPPER(aom_highbd_sad64x64x3)
+MAKE_BFP_SAD8_WRAPPER(aom_highbd_sad64x64x8)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x64x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad16x16)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad16x16_avg)
+MAKE_BFP_SAD3_WRAPPER(aom_highbd_sad16x16x3)
+MAKE_BFP_SAD8_WRAPPER(aom_highbd_sad16x16x8)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x16x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad16x8)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad16x8_avg)
+MAKE_BFP_SAD3_WRAPPER(aom_highbd_sad16x8x3)
+MAKE_BFP_SAD8_WRAPPER(aom_highbd_sad16x8x8)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x8x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad8x16)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad8x16_avg)
+MAKE_BFP_SAD3_WRAPPER(aom_highbd_sad8x16x3)
+MAKE_BFP_SAD8_WRAPPER(aom_highbd_sad8x16x8)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x16x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad8x8)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad8x8_avg)
+MAKE_BFP_SAD3_WRAPPER(aom_highbd_sad8x8x3)
+MAKE_BFP_SAD8_WRAPPER(aom_highbd_sad8x8x8)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x8x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad8x4) // 8x4 and 4x8 get x8 and x4d wrappers but no x3 wrapper
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad8x4_avg)
+MAKE_BFP_SAD8_WRAPPER(aom_highbd_sad8x4x8)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x4x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad4x8)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad4x8_avg)
+MAKE_BFP_SAD8_WRAPPER(aom_highbd_sad4x8x8)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad4x8x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad4x4)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad4x4_avg)
+MAKE_BFP_SAD3_WRAPPER(aom_highbd_sad4x4x3)
+MAKE_BFP_SAD8_WRAPPER(aom_highbd_sad4x4x8)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad4x4x4d)
+/* HIGHBD_MBFP(BT, MSDF, MVF, MSVF): stores the three given function pointers
+ * in cpi->fn_ptr[BT].msdf, .mvf and .msvf. Only defined when
+ * CONFIG_EXT_INTER is enabled.
+ *
+ * The expansion is three bare assignment statements that use a variable named
+ * cpi from the calling scope, and it already ends in a semicolon. Invoke it
+ * as a full statement inside a braced block; as the body of an unbraced
+ * if/else only the first assignment would be conditional.
+ */
+#if CONFIG_EXT_INTER
+#define HIGHBD_MBFP(BT, MSDF, MVF, MSVF) \
+ cpi->fn_ptr[BT].msdf = MSDF; \
+ cpi->fn_ptr[BT].mvf = MVF; \
+ cpi->fn_ptr[BT].msvf = MSVF;
+/* MAKE_MBFP_SAD_WRAPPER(fnname): defines three static functions named
+ * fnname_bits8, fnname_bits10 and fnname_bits12. Each forwards its six
+ * arguments (source, source stride, reference, reference stride, mask m and
+ * mask stride) to fnname unchanged.
+ *   _bits8  returns the result of fnname as is;
+ *   _bits10 returns that result shifted right by 2;
+ *   _bits12 returns that result shifted right by 4.
+ * NOTE(review): the shifts presumably bring 10/12-bit SADs back to an 8-bit
+ * scale; confirm against the code that consumes these values.
+ */
+#define MAKE_MBFP_SAD_WRAPPER(fnname) \
+ static unsigned int fnname##_bits8( \
+ const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \
+ int ref_stride, const uint8_t *m, int m_stride) { \
+ return fnname(src_ptr, source_stride, ref_ptr, ref_stride, m, m_stride); \
+ } \
+ static unsigned int fnname##_bits10( \
+ const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \
+ int ref_stride, const uint8_t *m, int m_stride) { \
+ return fnname(src_ptr, source_stride, ref_ptr, ref_stride, m, m_stride) >> \
+ 2; \
+ } \
+ static unsigned int fnname##_bits12( \
+ const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \
+ int ref_stride, const uint8_t *m, int m_stride) { \
+ return fnname(src_ptr, source_stride, ref_ptr, ref_stride, m, m_stride) >> \
+ 4; \
+ }
+/* Instantiate the _bits8, _bits10 and _bits12 wrappers for each masked SAD
+ * block size. The three sizes with a 128 dimension are generated only when
+ * CONFIG_EXT_PARTITION is enabled.
+ */
+#if CONFIG_EXT_PARTITION
+MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad128x128)
+MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad128x64)
+MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad64x128)
+#endif // CONFIG_EXT_PARTITION
+MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad64x64)
+MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad64x32)
+MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad32x64)
+MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad32x32)
+MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad32x16)
+MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad16x32)
+MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad16x16)
+MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad16x8)
+MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad8x16)
+MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad8x8)
+MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad8x4)
+MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad4x8)
+MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad4x4)
+#endif // CONFIG_EXT_INTER
+/* HIGHBD_OBFP(BT, OSDF, OVF, OSVF): stores the three given function pointers
+ * in cpi->fn_ptr[BT].osdf, .ovf and .osvf. Only defined when
+ * CONFIG_MOTION_VAR is enabled.
+ *
+ * The expansion is three bare assignment statements that use a variable named
+ * cpi from the calling scope, and it already ends in a semicolon. Invoke it
+ * as a full statement inside a braced block; as the body of an unbraced
+ * if/else only the first assignment would be conditional.
+ */
+#if CONFIG_MOTION_VAR
+#define HIGHBD_OBFP(BT, OSDF, OVF, OSVF) \
+ cpi->fn_ptr[BT].osdf = OSDF; \
+ cpi->fn_ptr[BT].ovf = OVF; \
+ cpi->fn_ptr[BT].osvf = OSVF;
+/* MAKE_OBFP_SAD_WRAPPER(fnname): defines three static functions named
+ * fnname_bits8, fnname_bits10 and fnname_bits12. Each forwards its four
+ * arguments (ref, ref_stride, wsrc and msk) to fnname unchanged.
+ *   _bits8  returns the result of fnname as is;
+ *   _bits10 returns that result shifted right by 2;
+ *   _bits12 returns that result shifted right by 4.
+ * NOTE(review): the shifts presumably bring 10/12-bit SADs back to an 8-bit
+ * scale; confirm against the code that consumes these values.
+ */
+#define MAKE_OBFP_SAD_WRAPPER(fnname) \
+ static unsigned int fnname##_bits8(const uint8_t *ref, int ref_stride, \
+ const int32_t *wsrc, \
+ const int32_t *msk) { \
+ return fnname(ref, ref_stride, wsrc, msk); \
+ } \
+ static unsigned int fnname##_bits10(const uint8_t *ref, int ref_stride, \
+ const int32_t *wsrc, \
+ const int32_t *msk) { \
+ return fnname(ref, ref_stride, wsrc, msk) >> 2; \
+ } \
+ static unsigned int fnname##_bits12(const uint8_t *ref, int ref_stride, \
+ const int32_t *wsrc, \
+ const int32_t *msk) { \
+ return fnname(ref, ref_stride, wsrc, msk) >> 4; \
+ }
+/* Instantiate the _bits8, _bits10 and _bits12 wrappers for each OBMC SAD
+ * block size. The three sizes with a 128 dimension are generated only when
+ * CONFIG_EXT_PARTITION is enabled.
+ */
+#if CONFIG_EXT_PARTITION
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad128x128)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad128x64)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad64x128)
+#endif // CONFIG_EXT_PARTITION
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad64x64)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad64x32)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad32x64)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad32x32)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad32x16)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad16x32)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad16x16)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad16x8)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad8x16)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad8x8)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad8x4)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad4x8)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad4x4)
+#endif // CONFIG_MOTION_VAR
+/* Selects the high-bitdepth SAD and variance functions for every block size
+ * according to cm->bit_depth (AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12).
+ *
+ * Nothing is done unless cm->use_highbitdepth is set. In each case the SAD
+ * arguments are the _bits8, _bits10 or _bits12 wrappers and the variance
+ * arguments are the aom_highbd_8_, aom_highbd_10_ or aom_highbd_12_
+ * functions; NULL is passed where a block size has no function of that kind.
+ * With CONFIG_EXT_INTER the masked entries are set through HIGHBD_MBFP, and
+ * with CONFIG_MOTION_VAR the OBMC entries through HIGHBD_OBFP, both of which
+ * write into cpi->fn_ptr[]. Any other bit depth only reaches the assert in
+ * the default case.
+ *
+ * NOTE(review): HIGHBD_BFP is defined outside this excerpt; presumably it
+ * stores its arguments in cpi->fn_ptr[BT] the same way -- confirm there.
+ */
+static void highbd_set_var_fns(AV1_COMP *const cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ if (cm->use_highbitdepth) {
+ switch (cm->bit_depth) {
+ case AOM_BITS_8:  // *_bits8 SAD wrappers, aom_highbd_8_* variance
+ HIGHBD_BFP(BLOCK_32X16, aom_highbd_sad32x16_bits8,
+ aom_highbd_sad32x16_avg_bits8, aom_highbd_8_variance32x16,
+ aom_highbd_8_sub_pixel_variance32x16,
+ aom_highbd_8_sub_pixel_avg_variance32x16, NULL, NULL,
+ aom_highbd_sad32x16x4d_bits8)
+
+ HIGHBD_BFP(BLOCK_16X32, aom_highbd_sad16x32_bits8,
+ aom_highbd_sad16x32_avg_bits8, aom_highbd_8_variance16x32,
+ aom_highbd_8_sub_pixel_variance16x32,
+ aom_highbd_8_sub_pixel_avg_variance16x32, NULL, NULL,
+ aom_highbd_sad16x32x4d_bits8)
+
+ HIGHBD_BFP(BLOCK_64X32, aom_highbd_sad64x32_bits8,
+ aom_highbd_sad64x32_avg_bits8, aom_highbd_8_variance64x32,
+ aom_highbd_8_sub_pixel_variance64x32,
+ aom_highbd_8_sub_pixel_avg_variance64x32, NULL, NULL,
+ aom_highbd_sad64x32x4d_bits8)
+
+ HIGHBD_BFP(BLOCK_32X64, aom_highbd_sad32x64_bits8,
+ aom_highbd_sad32x64_avg_bits8, aom_highbd_8_variance32x64,
+ aom_highbd_8_sub_pixel_variance32x64,
+ aom_highbd_8_sub_pixel_avg_variance32x64, NULL, NULL,
+ aom_highbd_sad32x64x4d_bits8)
+
+ HIGHBD_BFP(BLOCK_32X32, aom_highbd_sad32x32_bits8,
+ aom_highbd_sad32x32_avg_bits8, aom_highbd_8_variance32x32,
+ aom_highbd_8_sub_pixel_variance32x32,
+ aom_highbd_8_sub_pixel_avg_variance32x32,
+ aom_highbd_sad32x32x3_bits8, aom_highbd_sad32x32x8_bits8,
+ aom_highbd_sad32x32x4d_bits8)
+
+ HIGHBD_BFP(BLOCK_64X64, aom_highbd_sad64x64_bits8,
+ aom_highbd_sad64x64_avg_bits8, aom_highbd_8_variance64x64,
+ aom_highbd_8_sub_pixel_variance64x64,
+ aom_highbd_8_sub_pixel_avg_variance64x64,
+ aom_highbd_sad64x64x3_bits8, aom_highbd_sad64x64x8_bits8,
+ aom_highbd_sad64x64x4d_bits8)
+
+ HIGHBD_BFP(BLOCK_16X16, aom_highbd_sad16x16_bits8,
+ aom_highbd_sad16x16_avg_bits8, aom_highbd_8_variance16x16,
+ aom_highbd_8_sub_pixel_variance16x16,
+ aom_highbd_8_sub_pixel_avg_variance16x16,
+ aom_highbd_sad16x16x3_bits8, aom_highbd_sad16x16x8_bits8,
+ aom_highbd_sad16x16x4d_bits8)
+
+ HIGHBD_BFP(
+ BLOCK_16X8, aom_highbd_sad16x8_bits8, aom_highbd_sad16x8_avg_bits8,
+ aom_highbd_8_variance16x8, aom_highbd_8_sub_pixel_variance16x8,
+ aom_highbd_8_sub_pixel_avg_variance16x8, aom_highbd_sad16x8x3_bits8,
+ aom_highbd_sad16x8x8_bits8, aom_highbd_sad16x8x4d_bits8)
+
+ HIGHBD_BFP(
+ BLOCK_8X16, aom_highbd_sad8x16_bits8, aom_highbd_sad8x16_avg_bits8,
+ aom_highbd_8_variance8x16, aom_highbd_8_sub_pixel_variance8x16,
+ aom_highbd_8_sub_pixel_avg_variance8x16, aom_highbd_sad8x16x3_bits8,
+ aom_highbd_sad8x16x8_bits8, aom_highbd_sad8x16x4d_bits8)
+
+ HIGHBD_BFP(
+ BLOCK_8X8, aom_highbd_sad8x8_bits8, aom_highbd_sad8x8_avg_bits8,
+ aom_highbd_8_variance8x8, aom_highbd_8_sub_pixel_variance8x8,
+ aom_highbd_8_sub_pixel_avg_variance8x8, aom_highbd_sad8x8x3_bits8,
+ aom_highbd_sad8x8x8_bits8, aom_highbd_sad8x8x4d_bits8)
+
+ HIGHBD_BFP(BLOCK_8X4, aom_highbd_sad8x4_bits8,
+ aom_highbd_sad8x4_avg_bits8, aom_highbd_8_variance8x4,
+ aom_highbd_8_sub_pixel_variance8x4,
+ aom_highbd_8_sub_pixel_avg_variance8x4, NULL,
+ aom_highbd_sad8x4x8_bits8, aom_highbd_sad8x4x4d_bits8)
+
+ HIGHBD_BFP(BLOCK_4X8, aom_highbd_sad4x8_bits8,
+ aom_highbd_sad4x8_avg_bits8, aom_highbd_8_variance4x8,
+ aom_highbd_8_sub_pixel_variance4x8,
+ aom_highbd_8_sub_pixel_avg_variance4x8, NULL,
+ aom_highbd_sad4x8x8_bits8, aom_highbd_sad4x8x4d_bits8)
+
+ HIGHBD_BFP(
+ BLOCK_4X4, aom_highbd_sad4x4_bits8, aom_highbd_sad4x4_avg_bits8,
+ aom_highbd_8_variance4x4, aom_highbd_8_sub_pixel_variance4x4,
+ aom_highbd_8_sub_pixel_avg_variance4x4, aom_highbd_sad4x4x3_bits8,
+ aom_highbd_sad4x4x8_bits8, aom_highbd_sad4x4x4d_bits8)
+ // Sub-4x4 sizes get a variance function only; all other arguments are NULL.
+#if CONFIG_CB4X4
+ HIGHBD_BFP(BLOCK_2X2, NULL, NULL, aom_highbd_8_variance2x2, NULL, NULL,
+ NULL, NULL, NULL)
+ HIGHBD_BFP(BLOCK_4X2, NULL, NULL, aom_highbd_8_variance4x2, NULL, NULL,
+ NULL, NULL, NULL)
+ HIGHBD_BFP(BLOCK_2X4, NULL, NULL, aom_highbd_8_variance2x4, NULL, NULL,
+ NULL, NULL, NULL)
+#endif
+
+#if CONFIG_EXT_PARTITION
+ HIGHBD_BFP(BLOCK_128X128, aom_highbd_sad128x128_bits8,
+ aom_highbd_sad128x128_avg_bits8,
+ aom_highbd_8_variance128x128,
+ aom_highbd_8_sub_pixel_variance128x128,
+ aom_highbd_8_sub_pixel_avg_variance128x128,
+ aom_highbd_sad128x128x3_bits8, aom_highbd_sad128x128x8_bits8,
+ aom_highbd_sad128x128x4d_bits8)
+
+ HIGHBD_BFP(BLOCK_128X64, aom_highbd_sad128x64_bits8,
+ aom_highbd_sad128x64_avg_bits8, aom_highbd_8_variance128x64,
+ aom_highbd_8_sub_pixel_variance128x64,
+ aom_highbd_8_sub_pixel_avg_variance128x64, NULL, NULL,
+ aom_highbd_sad128x64x4d_bits8)
+
+ HIGHBD_BFP(BLOCK_64X128, aom_highbd_sad64x128_bits8,
+ aom_highbd_sad64x128_avg_bits8, aom_highbd_8_variance64x128,
+ aom_highbd_8_sub_pixel_variance64x128,
+ aom_highbd_8_sub_pixel_avg_variance64x128, NULL, NULL,
+ aom_highbd_sad64x128x4d_bits8)
+#endif // CONFIG_EXT_PARTITION
+ // The 8-bit masked and OBMC variance names below carry no bit-depth infix.
+#if CONFIG_EXT_INTER
+#if CONFIG_EXT_PARTITION
+ HIGHBD_MBFP(BLOCK_128X128, aom_highbd_masked_sad128x128_bits8,
+ aom_highbd_masked_variance128x128,
+ aom_highbd_masked_sub_pixel_variance128x128)
+ HIGHBD_MBFP(BLOCK_128X64, aom_highbd_masked_sad128x64_bits8,
+ aom_highbd_masked_variance128x64,
+ aom_highbd_masked_sub_pixel_variance128x64)
+ HIGHBD_MBFP(BLOCK_64X128, aom_highbd_masked_sad64x128_bits8,
+ aom_highbd_masked_variance64x128,
+ aom_highbd_masked_sub_pixel_variance64x128)
+#endif // CONFIG_EXT_PARTITION
+ HIGHBD_MBFP(BLOCK_64X64, aom_highbd_masked_sad64x64_bits8,
+ aom_highbd_masked_variance64x64,
+ aom_highbd_masked_sub_pixel_variance64x64)
+ HIGHBD_MBFP(BLOCK_64X32, aom_highbd_masked_sad64x32_bits8,
+ aom_highbd_masked_variance64x32,
+ aom_highbd_masked_sub_pixel_variance64x32)
+ HIGHBD_MBFP(BLOCK_32X64, aom_highbd_masked_sad32x64_bits8,
+ aom_highbd_masked_variance32x64,
+ aom_highbd_masked_sub_pixel_variance32x64)
+ HIGHBD_MBFP(BLOCK_32X32, aom_highbd_masked_sad32x32_bits8,
+ aom_highbd_masked_variance32x32,
+ aom_highbd_masked_sub_pixel_variance32x32)
+ HIGHBD_MBFP(BLOCK_32X16, aom_highbd_masked_sad32x16_bits8,
+ aom_highbd_masked_variance32x16,
+ aom_highbd_masked_sub_pixel_variance32x16)
+ HIGHBD_MBFP(BLOCK_16X32, aom_highbd_masked_sad16x32_bits8,
+ aom_highbd_masked_variance16x32,
+ aom_highbd_masked_sub_pixel_variance16x32)
+ HIGHBD_MBFP(BLOCK_16X16, aom_highbd_masked_sad16x16_bits8,
+ aom_highbd_masked_variance16x16,
+ aom_highbd_masked_sub_pixel_variance16x16)
+ HIGHBD_MBFP(BLOCK_8X16, aom_highbd_masked_sad8x16_bits8,
+ aom_highbd_masked_variance8x16,
+ aom_highbd_masked_sub_pixel_variance8x16)
+ HIGHBD_MBFP(BLOCK_16X8, aom_highbd_masked_sad16x8_bits8,
+ aom_highbd_masked_variance16x8,
+ aom_highbd_masked_sub_pixel_variance16x8)
+ HIGHBD_MBFP(BLOCK_8X8, aom_highbd_masked_sad8x8_bits8,
+ aom_highbd_masked_variance8x8,
+ aom_highbd_masked_sub_pixel_variance8x8)
+ HIGHBD_MBFP(BLOCK_4X8, aom_highbd_masked_sad4x8_bits8,
+ aom_highbd_masked_variance4x8,
+ aom_highbd_masked_sub_pixel_variance4x8)
+ HIGHBD_MBFP(BLOCK_8X4, aom_highbd_masked_sad8x4_bits8,
+ aom_highbd_masked_variance8x4,
+ aom_highbd_masked_sub_pixel_variance8x4)
+ HIGHBD_MBFP(BLOCK_4X4, aom_highbd_masked_sad4x4_bits8,
+ aom_highbd_masked_variance4x4,
+ aom_highbd_masked_sub_pixel_variance4x4)
+#endif // CONFIG_EXT_INTER
+#if CONFIG_MOTION_VAR
+#if CONFIG_EXT_PARTITION
+ HIGHBD_OBFP(BLOCK_128X128, aom_highbd_obmc_sad128x128_bits8,
+ aom_highbd_obmc_variance128x128,
+ aom_highbd_obmc_sub_pixel_variance128x128)
+ HIGHBD_OBFP(BLOCK_128X64, aom_highbd_obmc_sad128x64_bits8,
+ aom_highbd_obmc_variance128x64,
+ aom_highbd_obmc_sub_pixel_variance128x64)
+ HIGHBD_OBFP(BLOCK_64X128, aom_highbd_obmc_sad64x128_bits8,
+ aom_highbd_obmc_variance64x128,
+ aom_highbd_obmc_sub_pixel_variance64x128)
+#endif // CONFIG_EXT_PARTITION
+ HIGHBD_OBFP(BLOCK_64X64, aom_highbd_obmc_sad64x64_bits8,
+ aom_highbd_obmc_variance64x64,
+ aom_highbd_obmc_sub_pixel_variance64x64)
+ HIGHBD_OBFP(BLOCK_64X32, aom_highbd_obmc_sad64x32_bits8,
+ aom_highbd_obmc_variance64x32,
+ aom_highbd_obmc_sub_pixel_variance64x32)
+ HIGHBD_OBFP(BLOCK_32X64, aom_highbd_obmc_sad32x64_bits8,
+ aom_highbd_obmc_variance32x64,
+ aom_highbd_obmc_sub_pixel_variance32x64)
+ HIGHBD_OBFP(BLOCK_32X32, aom_highbd_obmc_sad32x32_bits8,
+ aom_highbd_obmc_variance32x32,
+ aom_highbd_obmc_sub_pixel_variance32x32)
+ HIGHBD_OBFP(BLOCK_32X16, aom_highbd_obmc_sad32x16_bits8,
+ aom_highbd_obmc_variance32x16,
+ aom_highbd_obmc_sub_pixel_variance32x16)
+ HIGHBD_OBFP(BLOCK_16X32, aom_highbd_obmc_sad16x32_bits8,
+ aom_highbd_obmc_variance16x32,
+ aom_highbd_obmc_sub_pixel_variance16x32)
+ HIGHBD_OBFP(BLOCK_16X16, aom_highbd_obmc_sad16x16_bits8,
+ aom_highbd_obmc_variance16x16,
+ aom_highbd_obmc_sub_pixel_variance16x16)
+ HIGHBD_OBFP(BLOCK_8X16, aom_highbd_obmc_sad8x16_bits8,
+ aom_highbd_obmc_variance8x16,
+ aom_highbd_obmc_sub_pixel_variance8x16)
+ HIGHBD_OBFP(BLOCK_16X8, aom_highbd_obmc_sad16x8_bits8,
+ aom_highbd_obmc_variance16x8,
+ aom_highbd_obmc_sub_pixel_variance16x8)
+ HIGHBD_OBFP(BLOCK_8X8, aom_highbd_obmc_sad8x8_bits8,
+ aom_highbd_obmc_variance8x8,
+ aom_highbd_obmc_sub_pixel_variance8x8)
+ HIGHBD_OBFP(BLOCK_4X8, aom_highbd_obmc_sad4x8_bits8,
+ aom_highbd_obmc_variance4x8,
+ aom_highbd_obmc_sub_pixel_variance4x8)
+ HIGHBD_OBFP(BLOCK_8X4, aom_highbd_obmc_sad8x4_bits8,
+ aom_highbd_obmc_variance8x4,
+ aom_highbd_obmc_sub_pixel_variance8x4)
+ HIGHBD_OBFP(BLOCK_4X4, aom_highbd_obmc_sad4x4_bits8,
+ aom_highbd_obmc_variance4x4,
+ aom_highbd_obmc_sub_pixel_variance4x4)
+#endif // CONFIG_MOTION_VAR
+ break;
+
+ case AOM_BITS_10:  // *_bits10 SAD wrappers, aom_highbd_10_* variance
+ HIGHBD_BFP(BLOCK_32X16, aom_highbd_sad32x16_bits10,
+ aom_highbd_sad32x16_avg_bits10, aom_highbd_10_variance32x16,
+ aom_highbd_10_sub_pixel_variance32x16,
+ aom_highbd_10_sub_pixel_avg_variance32x16, NULL, NULL,
+ aom_highbd_sad32x16x4d_bits10)
+
+ HIGHBD_BFP(BLOCK_16X32, aom_highbd_sad16x32_bits10,
+ aom_highbd_sad16x32_avg_bits10, aom_highbd_10_variance16x32,
+ aom_highbd_10_sub_pixel_variance16x32,
+ aom_highbd_10_sub_pixel_avg_variance16x32, NULL, NULL,
+ aom_highbd_sad16x32x4d_bits10)
+
+ HIGHBD_BFP(BLOCK_64X32, aom_highbd_sad64x32_bits10,
+ aom_highbd_sad64x32_avg_bits10, aom_highbd_10_variance64x32,
+ aom_highbd_10_sub_pixel_variance64x32,
+ aom_highbd_10_sub_pixel_avg_variance64x32, NULL, NULL,
+ aom_highbd_sad64x32x4d_bits10)
+
+ HIGHBD_BFP(BLOCK_32X64, aom_highbd_sad32x64_bits10,
+ aom_highbd_sad32x64_avg_bits10, aom_highbd_10_variance32x64,
+ aom_highbd_10_sub_pixel_variance32x64,
+ aom_highbd_10_sub_pixel_avg_variance32x64, NULL, NULL,
+ aom_highbd_sad32x64x4d_bits10)
+
+ HIGHBD_BFP(BLOCK_32X32, aom_highbd_sad32x32_bits10,
+ aom_highbd_sad32x32_avg_bits10, aom_highbd_10_variance32x32,
+ aom_highbd_10_sub_pixel_variance32x32,
+ aom_highbd_10_sub_pixel_avg_variance32x32,
+ aom_highbd_sad32x32x3_bits10, aom_highbd_sad32x32x8_bits10,
+ aom_highbd_sad32x32x4d_bits10)
+
+ HIGHBD_BFP(BLOCK_64X64, aom_highbd_sad64x64_bits10,
+ aom_highbd_sad64x64_avg_bits10, aom_highbd_10_variance64x64,
+ aom_highbd_10_sub_pixel_variance64x64,
+ aom_highbd_10_sub_pixel_avg_variance64x64,
+ aom_highbd_sad64x64x3_bits10, aom_highbd_sad64x64x8_bits10,
+ aom_highbd_sad64x64x4d_bits10)
+
+ HIGHBD_BFP(BLOCK_16X16, aom_highbd_sad16x16_bits10,
+ aom_highbd_sad16x16_avg_bits10, aom_highbd_10_variance16x16,
+ aom_highbd_10_sub_pixel_variance16x16,
+ aom_highbd_10_sub_pixel_avg_variance16x16,
+ aom_highbd_sad16x16x3_bits10, aom_highbd_sad16x16x8_bits10,
+ aom_highbd_sad16x16x4d_bits10)
+
+ HIGHBD_BFP(BLOCK_16X8, aom_highbd_sad16x8_bits10,
+ aom_highbd_sad16x8_avg_bits10, aom_highbd_10_variance16x8,
+ aom_highbd_10_sub_pixel_variance16x8,
+ aom_highbd_10_sub_pixel_avg_variance16x8,
+ aom_highbd_sad16x8x3_bits10, aom_highbd_sad16x8x8_bits10,
+ aom_highbd_sad16x8x4d_bits10)
+
+ HIGHBD_BFP(BLOCK_8X16, aom_highbd_sad8x16_bits10,
+ aom_highbd_sad8x16_avg_bits10, aom_highbd_10_variance8x16,
+ aom_highbd_10_sub_pixel_variance8x16,
+ aom_highbd_10_sub_pixel_avg_variance8x16,
+ aom_highbd_sad8x16x3_bits10, aom_highbd_sad8x16x8_bits10,
+ aom_highbd_sad8x16x4d_bits10)
+
+ HIGHBD_BFP(
+ BLOCK_8X8, aom_highbd_sad8x8_bits10, aom_highbd_sad8x8_avg_bits10,
+ aom_highbd_10_variance8x8, aom_highbd_10_sub_pixel_variance8x8,
+ aom_highbd_10_sub_pixel_avg_variance8x8, aom_highbd_sad8x8x3_bits10,
+ aom_highbd_sad8x8x8_bits10, aom_highbd_sad8x8x4d_bits10)
+
+ HIGHBD_BFP(BLOCK_8X4, aom_highbd_sad8x4_bits10,
+ aom_highbd_sad8x4_avg_bits10, aom_highbd_10_variance8x4,
+ aom_highbd_10_sub_pixel_variance8x4,
+ aom_highbd_10_sub_pixel_avg_variance8x4, NULL,
+ aom_highbd_sad8x4x8_bits10, aom_highbd_sad8x4x4d_bits10)
+
+ HIGHBD_BFP(BLOCK_4X8, aom_highbd_sad4x8_bits10,
+ aom_highbd_sad4x8_avg_bits10, aom_highbd_10_variance4x8,
+ aom_highbd_10_sub_pixel_variance4x8,
+ aom_highbd_10_sub_pixel_avg_variance4x8, NULL,
+ aom_highbd_sad4x8x8_bits10, aom_highbd_sad4x8x4d_bits10)
+
+ HIGHBD_BFP(
+ BLOCK_4X4, aom_highbd_sad4x4_bits10, aom_highbd_sad4x4_avg_bits10,
+ aom_highbd_10_variance4x4, aom_highbd_10_sub_pixel_variance4x4,
+ aom_highbd_10_sub_pixel_avg_variance4x4, aom_highbd_sad4x4x3_bits10,
+ aom_highbd_sad4x4x8_bits10, aom_highbd_sad4x4x4d_bits10)
+ // Sub-4x4 sizes get a variance function only; all other arguments are NULL.
+#if CONFIG_CB4X4
+ HIGHBD_BFP(BLOCK_2X2, NULL, NULL, aom_highbd_10_variance2x2, NULL, NULL,
+ NULL, NULL, NULL)
+ HIGHBD_BFP(BLOCK_4X2, NULL, NULL, aom_highbd_10_variance4x2, NULL, NULL,
+ NULL, NULL, NULL)
+ HIGHBD_BFP(BLOCK_2X4, NULL, NULL, aom_highbd_10_variance2x4, NULL, NULL,
+ NULL, NULL, NULL)
+#endif
+
+#if CONFIG_EXT_PARTITION
+ HIGHBD_BFP(
+ BLOCK_128X128, aom_highbd_sad128x128_bits10,
+ aom_highbd_sad128x128_avg_bits10, aom_highbd_10_variance128x128,
+ aom_highbd_10_sub_pixel_variance128x128,
+ aom_highbd_10_sub_pixel_avg_variance128x128,
+ aom_highbd_sad128x128x3_bits10, aom_highbd_sad128x128x8_bits10,
+ aom_highbd_sad128x128x4d_bits10)
+
+ HIGHBD_BFP(BLOCK_128X64, aom_highbd_sad128x64_bits10,
+ aom_highbd_sad128x64_avg_bits10,
+ aom_highbd_10_variance128x64,
+ aom_highbd_10_sub_pixel_variance128x64,
+ aom_highbd_10_sub_pixel_avg_variance128x64, NULL, NULL,
+ aom_highbd_sad128x64x4d_bits10)
+
+ HIGHBD_BFP(BLOCK_64X128, aom_highbd_sad64x128_bits10,
+ aom_highbd_sad64x128_avg_bits10,
+ aom_highbd_10_variance64x128,
+ aom_highbd_10_sub_pixel_variance64x128,
+ aom_highbd_10_sub_pixel_avg_variance64x128, NULL, NULL,
+ aom_highbd_sad64x128x4d_bits10)
+#endif // CONFIG_EXT_PARTITION
+
+#if CONFIG_EXT_INTER
+#if CONFIG_EXT_PARTITION
+ HIGHBD_MBFP(BLOCK_128X128, aom_highbd_masked_sad128x128_bits10,
+ aom_highbd_10_masked_variance128x128,
+ aom_highbd_10_masked_sub_pixel_variance128x128)
+ HIGHBD_MBFP(BLOCK_128X64, aom_highbd_masked_sad128x64_bits10,
+ aom_highbd_10_masked_variance128x64,
+ aom_highbd_10_masked_sub_pixel_variance128x64)
+ HIGHBD_MBFP(BLOCK_64X128, aom_highbd_masked_sad64x128_bits10,
+ aom_highbd_10_masked_variance64x128,
+ aom_highbd_10_masked_sub_pixel_variance64x128)
+#endif // CONFIG_EXT_PARTITION
+ HIGHBD_MBFP(BLOCK_64X64, aom_highbd_masked_sad64x64_bits10,
+ aom_highbd_10_masked_variance64x64,
+ aom_highbd_10_masked_sub_pixel_variance64x64)
+ HIGHBD_MBFP(BLOCK_64X32, aom_highbd_masked_sad64x32_bits10,
+ aom_highbd_10_masked_variance64x32,
+ aom_highbd_10_masked_sub_pixel_variance64x32)
+ HIGHBD_MBFP(BLOCK_32X64, aom_highbd_masked_sad32x64_bits10,
+ aom_highbd_10_masked_variance32x64,
+ aom_highbd_10_masked_sub_pixel_variance32x64)
+ HIGHBD_MBFP(BLOCK_32X32, aom_highbd_masked_sad32x32_bits10,
+ aom_highbd_10_masked_variance32x32,
+ aom_highbd_10_masked_sub_pixel_variance32x32)
+ HIGHBD_MBFP(BLOCK_32X16, aom_highbd_masked_sad32x16_bits10,
+ aom_highbd_10_masked_variance32x16,
+ aom_highbd_10_masked_sub_pixel_variance32x16)
+ HIGHBD_MBFP(BLOCK_16X32, aom_highbd_masked_sad16x32_bits10,
+ aom_highbd_10_masked_variance16x32,
+ aom_highbd_10_masked_sub_pixel_variance16x32)
+ HIGHBD_MBFP(BLOCK_16X16, aom_highbd_masked_sad16x16_bits10,
+ aom_highbd_10_masked_variance16x16,
+ aom_highbd_10_masked_sub_pixel_variance16x16)
+ HIGHBD_MBFP(BLOCK_8X16, aom_highbd_masked_sad8x16_bits10,
+ aom_highbd_10_masked_variance8x16,
+ aom_highbd_10_masked_sub_pixel_variance8x16)
+ HIGHBD_MBFP(BLOCK_16X8, aom_highbd_masked_sad16x8_bits10,
+ aom_highbd_10_masked_variance16x8,
+ aom_highbd_10_masked_sub_pixel_variance16x8)
+ HIGHBD_MBFP(BLOCK_8X8, aom_highbd_masked_sad8x8_bits10,
+ aom_highbd_10_masked_variance8x8,
+ aom_highbd_10_masked_sub_pixel_variance8x8)
+ HIGHBD_MBFP(BLOCK_4X8, aom_highbd_masked_sad4x8_bits10,
+ aom_highbd_10_masked_variance4x8,
+ aom_highbd_10_masked_sub_pixel_variance4x8)
+ HIGHBD_MBFP(BLOCK_8X4, aom_highbd_masked_sad8x4_bits10,
+ aom_highbd_10_masked_variance8x4,
+ aom_highbd_10_masked_sub_pixel_variance8x4)
+ HIGHBD_MBFP(BLOCK_4X4, aom_highbd_masked_sad4x4_bits10,
+ aom_highbd_10_masked_variance4x4,
+ aom_highbd_10_masked_sub_pixel_variance4x4)
+#endif // CONFIG_EXT_INTER
+#if CONFIG_MOTION_VAR
+#if CONFIG_EXT_PARTITION
+ HIGHBD_OBFP(BLOCK_128X128, aom_highbd_obmc_sad128x128_bits10,
+ aom_highbd_10_obmc_variance128x128,
+ aom_highbd_10_obmc_sub_pixel_variance128x128)
+ HIGHBD_OBFP(BLOCK_128X64, aom_highbd_obmc_sad128x64_bits10,
+ aom_highbd_10_obmc_variance128x64,
+ aom_highbd_10_obmc_sub_pixel_variance128x64)
+ HIGHBD_OBFP(BLOCK_64X128, aom_highbd_obmc_sad64x128_bits10,
+ aom_highbd_10_obmc_variance64x128,
+ aom_highbd_10_obmc_sub_pixel_variance64x128)
+#endif // CONFIG_EXT_PARTITION
+ HIGHBD_OBFP(BLOCK_64X64, aom_highbd_obmc_sad64x64_bits10,
+ aom_highbd_10_obmc_variance64x64,
+ aom_highbd_10_obmc_sub_pixel_variance64x64)
+ HIGHBD_OBFP(BLOCK_64X32, aom_highbd_obmc_sad64x32_bits10,
+ aom_highbd_10_obmc_variance64x32,
+ aom_highbd_10_obmc_sub_pixel_variance64x32)
+ HIGHBD_OBFP(BLOCK_32X64, aom_highbd_obmc_sad32x64_bits10,
+ aom_highbd_10_obmc_variance32x64,
+ aom_highbd_10_obmc_sub_pixel_variance32x64)
+ HIGHBD_OBFP(BLOCK_32X32, aom_highbd_obmc_sad32x32_bits10,
+ aom_highbd_10_obmc_variance32x32,
+ aom_highbd_10_obmc_sub_pixel_variance32x32)
+ HIGHBD_OBFP(BLOCK_32X16, aom_highbd_obmc_sad32x16_bits10,
+ aom_highbd_10_obmc_variance32x16,
+ aom_highbd_10_obmc_sub_pixel_variance32x16)
+ HIGHBD_OBFP(BLOCK_16X32, aom_highbd_obmc_sad16x32_bits10,
+ aom_highbd_10_obmc_variance16x32,
+ aom_highbd_10_obmc_sub_pixel_variance16x32)
+ HIGHBD_OBFP(BLOCK_16X16, aom_highbd_obmc_sad16x16_bits10,
+ aom_highbd_10_obmc_variance16x16,
+ aom_highbd_10_obmc_sub_pixel_variance16x16)
+ HIGHBD_OBFP(BLOCK_8X16, aom_highbd_obmc_sad8x16_bits10,
+ aom_highbd_10_obmc_variance8x16,
+ aom_highbd_10_obmc_sub_pixel_variance8x16)
+ HIGHBD_OBFP(BLOCK_16X8, aom_highbd_obmc_sad16x8_bits10,
+ aom_highbd_10_obmc_variance16x8,
+ aom_highbd_10_obmc_sub_pixel_variance16x8)
+ HIGHBD_OBFP(BLOCK_8X8, aom_highbd_obmc_sad8x8_bits10,
+ aom_highbd_10_obmc_variance8x8,
+ aom_highbd_10_obmc_sub_pixel_variance8x8)
+ HIGHBD_OBFP(BLOCK_4X8, aom_highbd_obmc_sad4x8_bits10,
+ aom_highbd_10_obmc_variance4x8,
+ aom_highbd_10_obmc_sub_pixel_variance4x8)
+ HIGHBD_OBFP(BLOCK_8X4, aom_highbd_obmc_sad8x4_bits10,
+ aom_highbd_10_obmc_variance8x4,
+ aom_highbd_10_obmc_sub_pixel_variance8x4)
+ HIGHBD_OBFP(BLOCK_4X4, aom_highbd_obmc_sad4x4_bits10,
+ aom_highbd_10_obmc_variance4x4,
+ aom_highbd_10_obmc_sub_pixel_variance4x4)
+#endif // CONFIG_MOTION_VAR
+ break;
+
+ case AOM_BITS_12:  // *_bits12 SAD wrappers, aom_highbd_12_* variance
+ HIGHBD_BFP(BLOCK_32X16, aom_highbd_sad32x16_bits12,
+ aom_highbd_sad32x16_avg_bits12, aom_highbd_12_variance32x16,
+ aom_highbd_12_sub_pixel_variance32x16,
+ aom_highbd_12_sub_pixel_avg_variance32x16, NULL, NULL,
+ aom_highbd_sad32x16x4d_bits12)
+
+ HIGHBD_BFP(BLOCK_16X32, aom_highbd_sad16x32_bits12,
+ aom_highbd_sad16x32_avg_bits12, aom_highbd_12_variance16x32,
+ aom_highbd_12_sub_pixel_variance16x32,
+ aom_highbd_12_sub_pixel_avg_variance16x32, NULL, NULL,
+ aom_highbd_sad16x32x4d_bits12)
+
+ HIGHBD_BFP(BLOCK_64X32, aom_highbd_sad64x32_bits12,
+ aom_highbd_sad64x32_avg_bits12, aom_highbd_12_variance64x32,
+ aom_highbd_12_sub_pixel_variance64x32,
+ aom_highbd_12_sub_pixel_avg_variance64x32, NULL, NULL,
+ aom_highbd_sad64x32x4d_bits12)
+
+ HIGHBD_BFP(BLOCK_32X64, aom_highbd_sad32x64_bits12,
+ aom_highbd_sad32x64_avg_bits12, aom_highbd_12_variance32x64,
+ aom_highbd_12_sub_pixel_variance32x64,
+ aom_highbd_12_sub_pixel_avg_variance32x64, NULL, NULL,
+ aom_highbd_sad32x64x4d_bits12)
+
+ HIGHBD_BFP(BLOCK_32X32, aom_highbd_sad32x32_bits12,
+ aom_highbd_sad32x32_avg_bits12, aom_highbd_12_variance32x32,
+ aom_highbd_12_sub_pixel_variance32x32,
+ aom_highbd_12_sub_pixel_avg_variance32x32,
+ aom_highbd_sad32x32x3_bits12, aom_highbd_sad32x32x8_bits12,
+ aom_highbd_sad32x32x4d_bits12)
+
+ HIGHBD_BFP(BLOCK_64X64, aom_highbd_sad64x64_bits12,
+ aom_highbd_sad64x64_avg_bits12, aom_highbd_12_variance64x64,
+ aom_highbd_12_sub_pixel_variance64x64,
+ aom_highbd_12_sub_pixel_avg_variance64x64,
+ aom_highbd_sad64x64x3_bits12, aom_highbd_sad64x64x8_bits12,
+ aom_highbd_sad64x64x4d_bits12)
+
+ HIGHBD_BFP(BLOCK_16X16, aom_highbd_sad16x16_bits12,
+ aom_highbd_sad16x16_avg_bits12, aom_highbd_12_variance16x16,
+ aom_highbd_12_sub_pixel_variance16x16,
+ aom_highbd_12_sub_pixel_avg_variance16x16,
+ aom_highbd_sad16x16x3_bits12, aom_highbd_sad16x16x8_bits12,
+ aom_highbd_sad16x16x4d_bits12)
+
+ HIGHBD_BFP(BLOCK_16X8, aom_highbd_sad16x8_bits12,
+ aom_highbd_sad16x8_avg_bits12, aom_highbd_12_variance16x8,
+ aom_highbd_12_sub_pixel_variance16x8,
+ aom_highbd_12_sub_pixel_avg_variance16x8,
+ aom_highbd_sad16x8x3_bits12, aom_highbd_sad16x8x8_bits12,
+ aom_highbd_sad16x8x4d_bits12)
+
+ HIGHBD_BFP(BLOCK_8X16, aom_highbd_sad8x16_bits12,
+ aom_highbd_sad8x16_avg_bits12, aom_highbd_12_variance8x16,
+ aom_highbd_12_sub_pixel_variance8x16,
+ aom_highbd_12_sub_pixel_avg_variance8x16,
+ aom_highbd_sad8x16x3_bits12, aom_highbd_sad8x16x8_bits12,
+ aom_highbd_sad8x16x4d_bits12)
+
+ HIGHBD_BFP(
+ BLOCK_8X8, aom_highbd_sad8x8_bits12, aom_highbd_sad8x8_avg_bits12,
+ aom_highbd_12_variance8x8, aom_highbd_12_sub_pixel_variance8x8,
+ aom_highbd_12_sub_pixel_avg_variance8x8, aom_highbd_sad8x8x3_bits12,
+ aom_highbd_sad8x8x8_bits12, aom_highbd_sad8x8x4d_bits12)
+
+ HIGHBD_BFP(BLOCK_8X4, aom_highbd_sad8x4_bits12,
+ aom_highbd_sad8x4_avg_bits12, aom_highbd_12_variance8x4,
+ aom_highbd_12_sub_pixel_variance8x4,
+ aom_highbd_12_sub_pixel_avg_variance8x4, NULL,
+ aom_highbd_sad8x4x8_bits12, aom_highbd_sad8x4x4d_bits12)
+
+ HIGHBD_BFP(BLOCK_4X8, aom_highbd_sad4x8_bits12,
+ aom_highbd_sad4x8_avg_bits12, aom_highbd_12_variance4x8,
+ aom_highbd_12_sub_pixel_variance4x8,
+ aom_highbd_12_sub_pixel_avg_variance4x8, NULL,
+ aom_highbd_sad4x8x8_bits12, aom_highbd_sad4x8x4d_bits12)
+
+ HIGHBD_BFP(
+ BLOCK_4X4, aom_highbd_sad4x4_bits12, aom_highbd_sad4x4_avg_bits12,
+ aom_highbd_12_variance4x4, aom_highbd_12_sub_pixel_variance4x4,
+ aom_highbd_12_sub_pixel_avg_variance4x4, aom_highbd_sad4x4x3_bits12,
+ aom_highbd_sad4x4x8_bits12, aom_highbd_sad4x4x4d_bits12)
+ // Sub-4x4 sizes get a variance function only; all other arguments are NULL.
+#if CONFIG_CB4X4
+ HIGHBD_BFP(BLOCK_2X2, NULL, NULL, aom_highbd_12_variance2x2, NULL, NULL,
+ NULL, NULL, NULL)
+ HIGHBD_BFP(BLOCK_4X2, NULL, NULL, aom_highbd_12_variance4x2, NULL, NULL,
+ NULL, NULL, NULL)
+ HIGHBD_BFP(BLOCK_2X4, NULL, NULL, aom_highbd_12_variance2x4, NULL, NULL,
+ NULL, NULL, NULL)
+#endif
+
+#if CONFIG_EXT_PARTITION
+ HIGHBD_BFP(
+ BLOCK_128X128, aom_highbd_sad128x128_bits12,
+ aom_highbd_sad128x128_avg_bits12, aom_highbd_12_variance128x128,
+ aom_highbd_12_sub_pixel_variance128x128,
+ aom_highbd_12_sub_pixel_avg_variance128x128,
+ aom_highbd_sad128x128x3_bits12, aom_highbd_sad128x128x8_bits12,
+ aom_highbd_sad128x128x4d_bits12)
+
+ HIGHBD_BFP(BLOCK_128X64, aom_highbd_sad128x64_bits12,
+ aom_highbd_sad128x64_avg_bits12,
+ aom_highbd_12_variance128x64,
+ aom_highbd_12_sub_pixel_variance128x64,
+ aom_highbd_12_sub_pixel_avg_variance128x64, NULL, NULL,
+ aom_highbd_sad128x64x4d_bits12)
+
+ HIGHBD_BFP(BLOCK_64X128, aom_highbd_sad64x128_bits12,
+ aom_highbd_sad64x128_avg_bits12,
+ aom_highbd_12_variance64x128,
+ aom_highbd_12_sub_pixel_variance64x128,
+ aom_highbd_12_sub_pixel_avg_variance64x128, NULL, NULL,
+ aom_highbd_sad64x128x4d_bits12)
+#endif // CONFIG_EXT_PARTITION
+
+#if CONFIG_EXT_INTER
+#if CONFIG_EXT_PARTITION
+ HIGHBD_MBFP(BLOCK_128X128, aom_highbd_masked_sad128x128_bits12,
+ aom_highbd_12_masked_variance128x128,
+ aom_highbd_12_masked_sub_pixel_variance128x128)
+ HIGHBD_MBFP(BLOCK_128X64, aom_highbd_masked_sad128x64_bits12,
+ aom_highbd_12_masked_variance128x64,
+ aom_highbd_12_masked_sub_pixel_variance128x64)
+ HIGHBD_MBFP(BLOCK_64X128, aom_highbd_masked_sad64x128_bits12,
+ aom_highbd_12_masked_variance64x128,
+ aom_highbd_12_masked_sub_pixel_variance64x128)
+#endif // CONFIG_EXT_PARTITION
+ HIGHBD_MBFP(BLOCK_64X64, aom_highbd_masked_sad64x64_bits12,
+ aom_highbd_12_masked_variance64x64,
+ aom_highbd_12_masked_sub_pixel_variance64x64)
+ HIGHBD_MBFP(BLOCK_64X32, aom_highbd_masked_sad64x32_bits12,
+ aom_highbd_12_masked_variance64x32,
+ aom_highbd_12_masked_sub_pixel_variance64x32)
+ HIGHBD_MBFP(BLOCK_32X64, aom_highbd_masked_sad32x64_bits12,
+ aom_highbd_12_masked_variance32x64,
+ aom_highbd_12_masked_sub_pixel_variance32x64)
+ HIGHBD_MBFP(BLOCK_32X32, aom_highbd_masked_sad32x32_bits12,
+ aom_highbd_12_masked_variance32x32,
+ aom_highbd_12_masked_sub_pixel_variance32x32)
+ HIGHBD_MBFP(BLOCK_32X16, aom_highbd_masked_sad32x16_bits12,
+ aom_highbd_12_masked_variance32x16,
+ aom_highbd_12_masked_sub_pixel_variance32x16)
+ HIGHBD_MBFP(BLOCK_16X32, aom_highbd_masked_sad16x32_bits12,
+ aom_highbd_12_masked_variance16x32,
+ aom_highbd_12_masked_sub_pixel_variance16x32)
+ HIGHBD_MBFP(BLOCK_16X16, aom_highbd_masked_sad16x16_bits12,
+ aom_highbd_12_masked_variance16x16,
+ aom_highbd_12_masked_sub_pixel_variance16x16)
+ HIGHBD_MBFP(BLOCK_8X16, aom_highbd_masked_sad8x16_bits12,
+ aom_highbd_12_masked_variance8x16,
+ aom_highbd_12_masked_sub_pixel_variance8x16)
+ HIGHBD_MBFP(BLOCK_16X8, aom_highbd_masked_sad16x8_bits12,
+ aom_highbd_12_masked_variance16x8,
+ aom_highbd_12_masked_sub_pixel_variance16x8)
+ HIGHBD_MBFP(BLOCK_8X8, aom_highbd_masked_sad8x8_bits12,
+ aom_highbd_12_masked_variance8x8,
+ aom_highbd_12_masked_sub_pixel_variance8x8)
+ HIGHBD_MBFP(BLOCK_4X8, aom_highbd_masked_sad4x8_bits12,
+ aom_highbd_12_masked_variance4x8,
+ aom_highbd_12_masked_sub_pixel_variance4x8)
+ HIGHBD_MBFP(BLOCK_8X4, aom_highbd_masked_sad8x4_bits12,
+ aom_highbd_12_masked_variance8x4,
+ aom_highbd_12_masked_sub_pixel_variance8x4)
+ HIGHBD_MBFP(BLOCK_4X4, aom_highbd_masked_sad4x4_bits12,
+ aom_highbd_12_masked_variance4x4,
+ aom_highbd_12_masked_sub_pixel_variance4x4)
+#endif // CONFIG_EXT_INTER
+
+#if CONFIG_MOTION_VAR
+#if CONFIG_EXT_PARTITION
+ HIGHBD_OBFP(BLOCK_128X128, aom_highbd_obmc_sad128x128_bits12,
+ aom_highbd_12_obmc_variance128x128,
+ aom_highbd_12_obmc_sub_pixel_variance128x128)
+ HIGHBD_OBFP(BLOCK_128X64, aom_highbd_obmc_sad128x64_bits12,
+ aom_highbd_12_obmc_variance128x64,
+ aom_highbd_12_obmc_sub_pixel_variance128x64)
+ HIGHBD_OBFP(BLOCK_64X128, aom_highbd_obmc_sad64x128_bits12,
+ aom_highbd_12_obmc_variance64x128,
+ aom_highbd_12_obmc_sub_pixel_variance64x128)
+#endif // CONFIG_EXT_PARTITION
+ HIGHBD_OBFP(BLOCK_64X64, aom_highbd_obmc_sad64x64_bits12,
+ aom_highbd_12_obmc_variance64x64,
+ aom_highbd_12_obmc_sub_pixel_variance64x64)
+ HIGHBD_OBFP(BLOCK_64X32, aom_highbd_obmc_sad64x32_bits12,
+ aom_highbd_12_obmc_variance64x32,
+ aom_highbd_12_obmc_sub_pixel_variance64x32)
+ HIGHBD_OBFP(BLOCK_32X64, aom_highbd_obmc_sad32x64_bits12,
+ aom_highbd_12_obmc_variance32x64,
+ aom_highbd_12_obmc_sub_pixel_variance32x64)
+ HIGHBD_OBFP(BLOCK_32X32, aom_highbd_obmc_sad32x32_bits12,
+ aom_highbd_12_obmc_variance32x32,
+ aom_highbd_12_obmc_sub_pixel_variance32x32)
+ HIGHBD_OBFP(BLOCK_32X16, aom_highbd_obmc_sad32x16_bits12,
+ aom_highbd_12_obmc_variance32x16,
+ aom_highbd_12_obmc_sub_pixel_variance32x16)
+ HIGHBD_OBFP(BLOCK_16X32, aom_highbd_obmc_sad16x32_bits12,
+ aom_highbd_12_obmc_variance16x32,
+ aom_highbd_12_obmc_sub_pixel_variance16x32)
+ HIGHBD_OBFP(BLOCK_16X16, aom_highbd_obmc_sad16x16_bits12,
+ aom_highbd_12_obmc_variance16x16,
+ aom_highbd_12_obmc_sub_pixel_variance16x16)
+ HIGHBD_OBFP(BLOCK_8X16, aom_highbd_obmc_sad8x16_bits12,
+ aom_highbd_12_obmc_variance8x16,
+ aom_highbd_12_obmc_sub_pixel_variance8x16)
+ HIGHBD_OBFP(BLOCK_16X8, aom_highbd_obmc_sad16x8_bits12,
+ aom_highbd_12_obmc_variance16x8,
+ aom_highbd_12_obmc_sub_pixel_variance16x8)
+ HIGHBD_OBFP(BLOCK_8X8, aom_highbd_obmc_sad8x8_bits12,
+ aom_highbd_12_obmc_variance8x8,
+ aom_highbd_12_obmc_sub_pixel_variance8x8)
+ HIGHBD_OBFP(BLOCK_4X8, aom_highbd_obmc_sad4x8_bits12,
+ aom_highbd_12_obmc_variance4x8,
+ aom_highbd_12_obmc_sub_pixel_variance4x8)
+ HIGHBD_OBFP(BLOCK_8X4, aom_highbd_obmc_sad8x4_bits12,
+ aom_highbd_12_obmc_variance8x4,
+ aom_highbd_12_obmc_sub_pixel_variance8x4)
+ HIGHBD_OBFP(BLOCK_4X4, aom_highbd_obmc_sad4x4_bits12,
+ aom_highbd_12_obmc_variance4x4,
+ aom_highbd_12_obmc_sub_pixel_variance4x4)
+#endif // CONFIG_MOTION_VAR
+ break;
+
+ default:  // unexpected bit depth: only the assert fires, no entry is written
+ assert(0 &&
+ "cm->bit_depth should be AOM_BITS_8, "
+ "AOM_BITS_10 or AOM_BITS_12");
+ }
+ }
+}
+#endif // CONFIG_HIGHBITDEPTH
+
+// Releases and re-creates the three per-frame maps sized by the mode-info grid
+// (cm->mi_rows * cm->mi_cols): the encoder segmentation map, the cyclic
+// refresh state and the active map. Every allocation is checked with
+// CHECK_MEM_ERROR.
+static void realloc_segmentation_maps(AV1_COMP *cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+
+  // Encoder segmentation map; aom_calloc leaves every entry at 0.
+  aom_free(cpi->segmentation_map);
+  CHECK_MEM_ERROR(cm, cpi->segmentation_map,
+                  aom_calloc(cm->mi_rows * cm->mi_cols, 1));
+
+  // Cyclic background refresh state; only freed when one already exists.
+  if (cpi->cyclic_refresh) {
+    av1_cyclic_refresh_free(cpi->cyclic_refresh);
+  }
+  CHECK_MEM_ERROR(cm, cpi->cyclic_refresh,
+                  av1_cyclic_refresh_alloc(cm->mi_rows, cm->mi_cols));
+
+  // Map marking inactive areas, also zero-filled.
+  aom_free(cpi->active_map.map);
+  CHECK_MEM_ERROR(cm, cpi->active_map.map,
+                  aom_calloc(cm->mi_rows * cm->mi_cols, 1));
+}
+
+// Applies the encoder configuration in oxcf to an existing compressor
+// instance: copies it into cpi->oxcf and refreshes the state derived from it
+// (profile / bit depth / colour info, reference refresh flags, frame-context
+// refresh mode, rate-control buffer limits and quality bounds, render and
+// coded frame size, tile info).
+void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) {
+ AV1_COMMON *const cm = &cpi->common;
+ RATE_CONTROL *const rc = &cpi->rc;
+
+ if (cm->profile != oxcf->profile) cm->profile = oxcf->profile;
+ cm->bit_depth = oxcf->bit_depth;
+ cm->color_space = oxcf->color_space;
+ cm->color_range = oxcf->color_range;
+
+ // Profiles up to PROFILE_1 are expected to be 8-bit, higher profiles to use
+ // more than 8 bits. Checked by assert only.
+ if (cm->profile <= PROFILE_1)
+ assert(cm->bit_depth == AOM_BITS_8);
+ else
+ assert(cm->bit_depth > AOM_BITS_8);
+
+ cpi->oxcf = *oxcf;
+#if CONFIG_HIGHBITDEPTH
+ cpi->td.mb.e_mbd.bd = (int)cm->bit_depth;
+#endif // CONFIG_HIGHBITDEPTH
+#if CONFIG_GLOBAL_MOTION
+ cpi->td.mb.e_mbd.global_motion = cm->global_motion;
+#endif // CONFIG_GLOBAL_MOTION
+
+ // Single-pass constant-Q encodes use the fixed golden-frame interval; every
+ // other mode starts from the midpoint of the allowed interval range.
+ if ((oxcf->pass == 0) && (oxcf->rc_mode == AOM_Q)) {
+ rc->baseline_gf_interval = FIXED_GF_INTERVAL;
+ } else {
+ rc->baseline_gf_interval = (MIN_GF_INTERVAL + MAX_GF_INTERVAL) / 2;
+ }
+
+ cpi->refresh_last_frame = 1;
+ cpi->refresh_golden_frame = 0;
+#if CONFIG_EXT_REFS
+ cpi->refresh_bwd_ref_frame = 0;
+#endif // CONFIG_EXT_REFS
+
+ // Forward refresh is selected when either error-resilient mode or
+ // frame-parallel decoding mode is requested; otherwise backward refresh.
+ cm->refresh_frame_context =
+ (oxcf->error_resilient_mode || oxcf->frame_parallel_decoding_mode)
+ ? REFRESH_FRAME_CONTEXT_FORWARD
+ : REFRESH_FRAME_CONTEXT_BACKWARD;
+ cm->reset_frame_context = RESET_FRAME_CONTEXT_NONE;
+
+#if CONFIG_PALETTE
+ cm->allow_screen_content_tools = (cpi->oxcf.content == AOM_CONTENT_SCREEN);
+ if (cm->allow_screen_content_tools) {
+ MACROBLOCK *x = &cpi->td.mb;
+ // The palette buffer is allocated once, on first use.
+ if (x->palette_buffer == 0) {
+ CHECK_MEM_ERROR(cm, x->palette_buffer,
+ aom_memalign(16, sizeof(*x->palette_buffer)));
+ }
+ // Reallocate the pc_tree, as its contents depend on
+ // the state of cm->allow_screen_content_tools
+ av1_free_pc_tree(&cpi->td);
+ av1_setup_pc_tree(&cpi->common, &cpi->td);
+ }
+#endif // CONFIG_PALETTE
+
+ av1_reset_segment_features(cm);
+ av1_set_high_precision_mv(cpi, 0);
+
+ set_rc_buffer_sizes(rc, &cpi->oxcf);
+
+ // Under a configuration change, where maximum_buffer_size may change,
+ // keep buffer level clipped to the maximum allowed buffer size.
+ rc->bits_off_target = AOMMIN(rc->bits_off_target, rc->maximum_buffer_size);
+ rc->buffer_level = AOMMIN(rc->buffer_level, rc->maximum_buffer_size);
+
+ // Set up frame rate and related parameters rate control values.
+ av1_new_framerate(cpi, cpi->framerate);
+
+ // Set absolute upper and lower quality limits
+ rc->worst_quality = cpi->oxcf.worst_allowed_q;
+ rc->best_quality = cpi->oxcf.best_allowed_q;
+
+ cm->interp_filter = cpi->sf.default_interp_filter;
+
+ // An explicit render size is used only when both dimensions are positive;
+ // otherwise the render size follows the configured frame size.
+ if (cpi->oxcf.render_width > 0 && cpi->oxcf.render_height > 0) {
+ cm->render_width = cpi->oxcf.render_width;
+ cm->render_height = cpi->oxcf.render_height;
+ } else {
+ cm->render_width = cpi->oxcf.width;
+ cm->render_height = cpi->oxcf.height;
+ }
+ cm->width = cpi->oxcf.width;
+ cm->height = cpi->oxcf.height;
+
+ // When a size was recorded earlier and the new frame exceeds it in either
+ // dimension, the context buffers, compressor data and segmentation maps are
+ // reallocated and the recorded size is reset to 0.
+ if (cpi->initial_width) {
+ if (cm->width > cpi->initial_width || cm->height > cpi->initial_height) {
+ av1_free_context_buffers(cm);
+ av1_alloc_compressor_data(cpi);
+ realloc_segmentation_maps(cpi);
+ cpi->initial_width = cpi->initial_height = 0;
+ }
+ }
+ update_frame_size(cpi);
+
+ cpi->alt_ref_source = NULL;
+ rc->is_src_frame_alt_ref = 0;
+
+#if CONFIG_EXT_REFS
+ rc->is_bwd_ref_frame = 0;
+ rc->is_last_bipred_frame = 0;
+ rc->is_bipred_frame = 0;
+#endif // CONFIG_EXT_REFS
+
+#if 0
+ // Experimental RD Code
+ cpi->frame_distortion = 0;
+ cpi->last_frame_distortion = 0;
+#endif
+
+ set_tile_info(cpi);
+
+ // Drop any externally requested refresh overrides that are still pending.
+ cpi->ext_refresh_frame_flags_pending = 0;
+ cpi->ext_refresh_frame_context_pending = 0;
+
+#if CONFIG_HIGHBITDEPTH
+ highbd_set_var_fns(cpi);
+#endif
+
+#if CONFIG_ANS && ANS_MAX_SYMBOLS
+ // The ANS buffer is re-created only when the configured window size differs
+ // from the size of the current buffer.
+ cpi->common.ans_window_size_log2 = cpi->oxcf.ans_window_size_log2;
+ if (cpi->buf_ans.size != (1 << cpi->common.ans_window_size_log2)) {
+ aom_buf_ans_free(&cpi->buf_ans);
+ aom_buf_ans_alloc(&cpi->buf_ans, &cpi->common.error,
+ 1 << cpi->common.ans_window_size_log2);
+ }
+#endif // CONFIG_ANS && ANS_MAX_SYMBOLS
+}
+
+// NOTE: despite its name, the fallback value below is ln(2) (~0.693), not
+// log2(e) (~1.443). Dividing a natural logarithm by it therefore yields a
+// base-2 logarithm, which is what the log2f() macro below relies on.
+#ifndef M_LOG2_E
+#define M_LOG2_E 0.693147180559945309417
+#endif
+// Base-2 logarithm built on log(). Being a function-like macro, it takes
+// precedence over any log2f function declared by the C library for the rest
+// of this file.
+#define log2f(x) (log(x) / (float)M_LOG2_E)
+
+#if !CONFIG_REF_MV
+// Fills the four entries of mvjointsadcost with fixed costs: 600 for index 0
+// and 300 for indices 1 to 3.
+static void cal_nmvjointsadcost(int *mvjointsadcost) {
+  static const int kJointSadCost[4] = { 600, 300, 300, 300 };
+  int j;
+
+  for (j = 0; j < 4; ++j) mvjointsadcost[j] = kJointSadCost[j];
+}
+#endif
+
+// Populates both SAD cost tables for offsets -MV_MAX..MV_MAX. Offset 0 costs
+// nothing; offsets +i and -i share the cost 256 * 2 * (log2f(8 * i) + 0.6),
+// truncated to int. Negative indices are written, so each mvsadcost[k] must
+// point far enough into its table for index -MV_MAX to be valid.
+static void cal_nmvsadcosts(int *mvsadcost[2]) {
+  int mag = 1;
+
+  mvsadcost[0][0] = 0;
+  mvsadcost[1][0] = 0;
+
+  do {
+    const double z = 256 * (2 * (log2f(8 * mag) + .6));
+    const int cost = (int)z;
+    int comp;
+
+    // Same cost for both components and both signs of the offset.
+    for (comp = 0; comp < 2; ++comp) {
+      mvsadcost[comp][mag] = cost;
+      mvsadcost[comp][-mag] = cost;
+    }
+  } while (++mag <= MV_MAX);
+}
+
+// Fills the SAD cost tables used for the "_hp" cost pointers. The values are
+// exactly the ones cal_nmvsadcosts() computes (zero at offset 0, the same
+// log2-based cost for +/-i), so the work is delegated to it.
+static void cal_nmvsadcosts_hp(int *mvsadcost[2]) {
+  cal_nmvsadcosts(mvsadcost);
+}
+
+// Marks every upsampled reference slot as unused: the slot's reference count
+// is cleared and the matching index entry is set to INVALID_IDX. Covers
+// REF_FRAMES + 1 slots.
+static INLINE void init_upsampled_ref_frame_bufs(AV1_COMP *cpi) {
+  int slot = 0;
+
+  while (slot < (REF_FRAMES + 1)) {
+    cpi->upsampled_ref_idx[slot] = INVALID_IDX;
+    cpi->upsampled_ref_bufs[slot].ref_count = 0;
+    ++slot;
+  }
+}
+
+// Allocates and initialises an encoder instance configured from oxcf and
+// attached to the given buffer pool.
+//
+// Returns the new AV1_COMP on success. Returns NULL if the AV1_COMP itself
+// cannot be allocated. If an error is raised through cm->error while the
+// setjmp handler below is armed, the partially built instance is released
+// with av1_remove_compressor() and 0 is returned.
+AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf,
+                                BufferPool *const pool) {
+  unsigned int i;
+  AV1_COMP *volatile const cpi = aom_memalign(32, sizeof(AV1_COMP));
+  AV1_COMMON *volatile const cm = cpi != NULL ? &cpi->common : NULL;
+
+  if (!cm) return NULL;
+
+  av1_zero(*cpi);
+
+  // Error landing pad: reached via cm->error.jmp while error.setjmp is 1.
+  if (setjmp(cm->error.jmp)) {
+    cm->error.setjmp = 0;
+    av1_remove_compressor(cpi);
+    return 0;
+  }
+
+  cm->error.setjmp = 1;
+  cm->alloc_mi = av1_enc_alloc_mi;
+  cm->free_mi = av1_enc_free_mi;
+  cm->setup_mi = av1_enc_setup_mi;
+
+  CHECK_MEM_ERROR(cm, cm->fc,
+                  (FRAME_CONTEXT *)aom_memalign(32, sizeof(*cm->fc)));
+  CHECK_MEM_ERROR(cm, cm->frame_contexts,
+                  (FRAME_CONTEXT *)aom_memalign(
+                      32, FRAME_CONTEXTS * sizeof(*cm->frame_contexts)));
+  memset(cm->fc, 0, sizeof(*cm->fc));
+  memset(cm->frame_contexts, 0, FRAME_CONTEXTS * sizeof(*cm->frame_contexts));
+
+  cpi->resize_state = 0;
+  cpi->resize_avg_qp = 0;
+  cpi->resize_buffer_underflow = 0;
+  cpi->common.buffer_pool = pool;
+
+  init_config(cpi, oxcf);
+#if CONFIG_XIPHRC
+  cpi->od_rc.framerate = cpi->framerate;
+  cpi->od_rc.frame_width = cm->render_width;
+  cpi->od_rc.frame_height = cm->render_height;
+  cpi->od_rc.keyframe_rate = oxcf->key_freq;
+  cpi->od_rc.goldenframe_rate = FIXED_GF_INTERVAL;
+  cpi->od_rc.altref_rate = 25;
+  cpi->od_rc.firstpass_quant = 1;
+  cpi->od_rc.bit_depth = cm->bit_depth;
+  cpi->od_rc.minq = oxcf->best_allowed_q;
+  cpi->od_rc.maxq = oxcf->worst_allowed_q;
+  // NOTE(review): od_rc.quality is read here but only assigned on the next
+  // line - confirm this ordering is intended.
+  if (cpi->oxcf.rc_mode == AOM_CQ) cpi->od_rc.minq = cpi->od_rc.quality;
+  cpi->od_rc.quality = cpi->oxcf.rc_mode == AOM_Q ? oxcf->cq_level : -1;
+  cpi->od_rc.periodic_boosts = oxcf->frame_periodic_boost;
+  od_enc_rc_init(&cpi->od_rc,
+                 cpi->oxcf.rc_mode == AOM_Q ? -1 : oxcf->target_bandwidth,
+                 oxcf->maximum_buffer_size_ms);
+#else
+  av1_rc_init(&cpi->oxcf, oxcf->pass, &cpi->rc);
+#endif
+
+  cm->current_video_frame = 0;
+  cpi->partition_search_skippable_frame = 0;
+  cpi->tile_data = NULL;
+  cpi->last_show_frame_buf_idx = INVALID_IDX;
+
+  realloc_segmentation_maps(cpi);
+
+#if CONFIG_REF_MV
+  // These memsets do not depend on a context index, so they run once instead
+  // of once per NMV context.
+  memset(cpi->nmv_costs, 0, sizeof(cpi->nmv_costs));
+  memset(cpi->nmv_costs_hp, 0, sizeof(cpi->nmv_costs_hp));
+#endif
+
+  memset(cpi->nmvcosts, 0, sizeof(cpi->nmvcosts));
+  memset(cpi->nmvcosts_hp, 0, sizeof(cpi->nmvcosts_hp));
+  memset(cpi->nmvsadcosts, 0, sizeof(cpi->nmvsadcosts));
+  memset(cpi->nmvsadcosts_hp, 0, sizeof(cpi->nmvsadcosts_hp));
+
+  for (i = 0; i < (sizeof(cpi->mbgraph_stats) / sizeof(cpi->mbgraph_stats[0]));
+       i++) {
+    CHECK_MEM_ERROR(
+        cm, cpi->mbgraph_stats[i].mb_stats,
+        aom_calloc(cm->MBs * sizeof(*cpi->mbgraph_stats[i].mb_stats), 1));
+  }
+
+#if CONFIG_FP_MB_STATS
+  cpi->use_fp_mb_stats = 0;
+  if (cpi->use_fp_mb_stats) {
+    // a place holder used to store the first pass mb stats in the first pass
+    CHECK_MEM_ERROR(cm, cpi->twopass.frame_mb_stats_buf,
+                    aom_calloc(cm->MBs * sizeof(uint8_t), 1));
+  } else {
+    cpi->twopass.frame_mb_stats_buf = NULL;
+  }
+#endif
+
+  cpi->refresh_alt_ref_frame = 0;
+  cpi->multi_arf_last_grp_enabled = 0;
+
+  cpi->b_calculate_psnr = CONFIG_INTERNAL_STATS;
+#if CONFIG_INTERNAL_STATS
+  cpi->b_calculate_blockiness = 1;
+  cpi->b_calculate_consistency = 1;
+  cpi->total_inconsistency = 0;
+  cpi->psnr.worst = 100.0;
+  cpi->worst_ssim = 100.0;
+
+  cpi->count = 0;
+  cpi->bytes = 0;
+
+  if (cpi->b_calculate_psnr) {
+    cpi->total_sq_error = 0;
+    cpi->total_samples = 0;
+    cpi->tot_recode_hits = 0;
+    cpi->summed_quality = 0;
+    cpi->summed_weights = 0;
+  }
+
+  cpi->fastssim.worst = 100.0;
+  cpi->psnrhvs.worst = 100.0;
+
+  if (cpi->b_calculate_blockiness) {
+    cpi->total_blockiness = 0;
+    cpi->worst_blockiness = 0.0;
+  }
+
+  if (cpi->b_calculate_consistency) {
+    CHECK_MEM_ERROR(cm, cpi->ssim_vars,
+                    aom_malloc(sizeof(*cpi->ssim_vars) * 4 *
+                               cpi->common.mi_rows * cpi->common.mi_cols));
+    cpi->worst_consistency = 100.0;
+  }
+#endif
+#if CONFIG_ENTROPY_STATS
+  av1_zero(aggregate_fc);
+#endif  // CONFIG_ENTROPY_STATS
+
+  cpi->first_time_stamp_ever = INT64_MAX;
+
+  // The cost pointers are set to index MV_MAX of each table so that they can
+  // be indexed with offsets from -MV_MAX to MV_MAX.
+#if CONFIG_REF_MV
+  for (i = 0; i < NMV_CONTEXTS; ++i) {
+    cpi->td.mb.nmvcost[i][0] = &cpi->nmv_costs[i][0][MV_MAX];
+    cpi->td.mb.nmvcost[i][1] = &cpi->nmv_costs[i][1][MV_MAX];
+    cpi->td.mb.nmvcost_hp[i][0] = &cpi->nmv_costs_hp[i][0][MV_MAX];
+    cpi->td.mb.nmvcost_hp[i][1] = &cpi->nmv_costs_hp[i][1][MV_MAX];
+  }
+#else
+  cal_nmvjointsadcost(cpi->td.mb.nmvjointsadcost);
+  cpi->td.mb.nmvcost[0] = &cpi->nmvcosts[0][MV_MAX];
+  cpi->td.mb.nmvcost[1] = &cpi->nmvcosts[1][MV_MAX];
+  cpi->td.mb.nmvcost_hp[0] = &cpi->nmvcosts_hp[0][MV_MAX];
+  cpi->td.mb.nmvcost_hp[1] = &cpi->nmvcosts_hp[1][MV_MAX];
+#endif
+  cpi->td.mb.nmvsadcost[0] = &cpi->nmvsadcosts[0][MV_MAX];
+  cpi->td.mb.nmvsadcost[1] = &cpi->nmvsadcosts[1][MV_MAX];
+  cal_nmvsadcosts(cpi->td.mb.nmvsadcost);
+
+  cpi->td.mb.nmvsadcost_hp[0] = &cpi->nmvsadcosts_hp[0][MV_MAX];
+  cpi->td.mb.nmvsadcost_hp[1] = &cpi->nmvsadcosts_hp[1][MV_MAX];
+  cal_nmvsadcosts_hp(cpi->td.mb.nmvsadcost_hp);
+
+#ifdef OUTPUT_YUV_SKINMAP
+  yuv_skinmap_file = fopen("skinmap.yuv", "ab");
+#endif
+#ifdef OUTPUT_YUV_REC
+  yuv_rec_file = fopen("rec.yuv", "wb");
+#endif
+
+#if 0
+  framepsnr = fopen("framepsnr.stt", "a");
+  kf_list = fopen("kf_list.stt", "w");
+#endif
+
+#if CONFIG_XIPHRC
+  if (oxcf->pass == 2) {
+    cpi->od_rc.twopass_allframes_buf = oxcf->two_pass_stats_in.buf;
+    cpi->od_rc.twopass_allframes_buf_size = oxcf->two_pass_stats_in.sz;
+  }
+#else
+  if (oxcf->pass == 1) {
+    av1_init_first_pass(cpi);
+  } else if (oxcf->pass == 2) {
+    const size_t packet_sz = sizeof(FIRSTPASS_STATS);
+    const int packets = (int)(oxcf->two_pass_stats_in.sz / packet_sz);
+
+#if CONFIG_FP_MB_STATS
+    if (cpi->use_fp_mb_stats) {
+      const size_t psz = cpi->common.MBs * sizeof(uint8_t);
+      const int ps = (int)(oxcf->firstpass_mb_stats_in.sz / psz);
+
+      cpi->twopass.firstpass_mb_stats.mb_stats_start =
+          oxcf->firstpass_mb_stats_in.buf;
+      cpi->twopass.firstpass_mb_stats.mb_stats_end =
+          cpi->twopass.firstpass_mb_stats.mb_stats_start +
+          (ps - 1) * cpi->common.MBs * sizeof(uint8_t);
+    }
+#endif
+
+    // stats_in_end points at the last first-pass packet in the input buffer.
+    cpi->twopass.stats_in_start = oxcf->two_pass_stats_in.buf;
+    cpi->twopass.stats_in = cpi->twopass.stats_in_start;
+    cpi->twopass.stats_in_end = &cpi->twopass.stats_in[packets - 1];
+
+    av1_init_second_pass(cpi);
+  }
+#endif
+
+  init_upsampled_ref_frame_bufs(cpi);
+
+  av1_set_speed_features_framesize_independent(cpi);
+  av1_set_speed_features_framesize_dependent(cpi);
+
+  // Allocate memory to store variances for a frame.
+  CHECK_MEM_ERROR(cm, cpi->source_diff_var,
+                  aom_calloc(cm->MBs, sizeof(*cpi->source_diff_var)));
+  cpi->source_var_thresh = 0;
+  cpi->frames_till_next_var_check = 0;
+
+  // BFP fills the SAD / variance function table entry for one block size.
+#define BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX3F, SDX8F, SDX4DF) \
+  cpi->fn_ptr[BT].sdf = SDF;                                    \
+  cpi->fn_ptr[BT].sdaf = SDAF;                                  \
+  cpi->fn_ptr[BT].vf = VF;                                      \
+  cpi->fn_ptr[BT].svf = SVF;                                    \
+  cpi->fn_ptr[BT].svaf = SVAF;                                  \
+  cpi->fn_ptr[BT].sdx3f = SDX3F;                                \
+  cpi->fn_ptr[BT].sdx8f = SDX8F;                                \
+  cpi->fn_ptr[BT].sdx4df = SDX4DF;
+
+#if CONFIG_EXT_PARTITION
+  BFP(BLOCK_128X128, aom_sad128x128, aom_sad128x128_avg, aom_variance128x128,
+      aom_sub_pixel_variance128x128, aom_sub_pixel_avg_variance128x128,
+      aom_sad128x128x3, aom_sad128x128x8, aom_sad128x128x4d)
+
+  BFP(BLOCK_128X64, aom_sad128x64, aom_sad128x64_avg, aom_variance128x64,
+      aom_sub_pixel_variance128x64, aom_sub_pixel_avg_variance128x64, NULL,
+      NULL, aom_sad128x64x4d)
+
+  BFP(BLOCK_64X128, aom_sad64x128, aom_sad64x128_avg, aom_variance64x128,
+      aom_sub_pixel_variance64x128, aom_sub_pixel_avg_variance64x128, NULL,
+      NULL, aom_sad64x128x4d)
+#endif  // CONFIG_EXT_PARTITION
+
+  BFP(BLOCK_32X16, aom_sad32x16, aom_sad32x16_avg, aom_variance32x16,
+      aom_sub_pixel_variance32x16, aom_sub_pixel_avg_variance32x16, NULL, NULL,
+      aom_sad32x16x4d)
+
+  BFP(BLOCK_16X32, aom_sad16x32, aom_sad16x32_avg, aom_variance16x32,
+      aom_sub_pixel_variance16x32, aom_sub_pixel_avg_variance16x32, NULL, NULL,
+      aom_sad16x32x4d)
+
+  BFP(BLOCK_64X32, aom_sad64x32, aom_sad64x32_avg, aom_variance64x32,
+      aom_sub_pixel_variance64x32, aom_sub_pixel_avg_variance64x32, NULL, NULL,
+      aom_sad64x32x4d)
+
+  BFP(BLOCK_32X64, aom_sad32x64, aom_sad32x64_avg, aom_variance32x64,
+      aom_sub_pixel_variance32x64, aom_sub_pixel_avg_variance32x64, NULL, NULL,
+      aom_sad32x64x4d)
+
+  BFP(BLOCK_32X32, aom_sad32x32, aom_sad32x32_avg, aom_variance32x32,
+      aom_sub_pixel_variance32x32, aom_sub_pixel_avg_variance32x32,
+      aom_sad32x32x3, aom_sad32x32x8, aom_sad32x32x4d)
+
+  BFP(BLOCK_64X64, aom_sad64x64, aom_sad64x64_avg, aom_variance64x64,
+      aom_sub_pixel_variance64x64, aom_sub_pixel_avg_variance64x64,
+      aom_sad64x64x3, aom_sad64x64x8, aom_sad64x64x4d)
+
+  BFP(BLOCK_16X16, aom_sad16x16, aom_sad16x16_avg, aom_variance16x16,
+      aom_sub_pixel_variance16x16, aom_sub_pixel_avg_variance16x16,
+      aom_sad16x16x3, aom_sad16x16x8, aom_sad16x16x4d)
+
+  BFP(BLOCK_16X8, aom_sad16x8, aom_sad16x8_avg, aom_variance16x8,
+      aom_sub_pixel_variance16x8, aom_sub_pixel_avg_variance16x8, aom_sad16x8x3,
+      aom_sad16x8x8, aom_sad16x8x4d)
+
+  BFP(BLOCK_8X16, aom_sad8x16, aom_sad8x16_avg, aom_variance8x16,
+      aom_sub_pixel_variance8x16, aom_sub_pixel_avg_variance8x16, aom_sad8x16x3,
+      aom_sad8x16x8, aom_sad8x16x4d)
+
+  BFP(BLOCK_8X8, aom_sad8x8, aom_sad8x8_avg, aom_variance8x8,
+      aom_sub_pixel_variance8x8, aom_sub_pixel_avg_variance8x8, aom_sad8x8x3,
+      aom_sad8x8x8, aom_sad8x8x4d)
+
+  BFP(BLOCK_8X4, aom_sad8x4, aom_sad8x4_avg, aom_variance8x4,
+      aom_sub_pixel_variance8x4, aom_sub_pixel_avg_variance8x4, NULL,
+      aom_sad8x4x8, aom_sad8x4x4d)
+
+  BFP(BLOCK_4X8, aom_sad4x8, aom_sad4x8_avg, aom_variance4x8,
+      aom_sub_pixel_variance4x8, aom_sub_pixel_avg_variance4x8, NULL,
+      aom_sad4x8x8, aom_sad4x8x4d)
+
+  BFP(BLOCK_4X4, aom_sad4x4, aom_sad4x4_avg, aom_variance4x4,
+      aom_sub_pixel_variance4x4, aom_sub_pixel_avg_variance4x4, aom_sad4x4x3,
+      aom_sad4x4x8, aom_sad4x4x4d)
+
+#if CONFIG_CB4X4
+  // Only the plain variance function is provided for the sub-4x4 sizes.
+  BFP(BLOCK_2X2, NULL, NULL, aom_variance2x2, NULL, NULL, NULL, NULL, NULL)
+  BFP(BLOCK_2X4, NULL, NULL, aom_variance2x4, NULL, NULL, NULL, NULL, NULL)
+  BFP(BLOCK_4X2, NULL, NULL, aom_variance4x2, NULL, NULL, NULL, NULL, NULL)
+#endif
+
+#if CONFIG_MOTION_VAR
+  // OBFP fills the OBMC SAD / variance entries for one block size.
+#define OBFP(BT, OSDF, OVF, OSVF) \
+  cpi->fn_ptr[BT].osdf = OSDF;    \
+  cpi->fn_ptr[BT].ovf = OVF;      \
+  cpi->fn_ptr[BT].osvf = OSVF;
+
+#if CONFIG_EXT_PARTITION
+  OBFP(BLOCK_128X128, aom_obmc_sad128x128, aom_obmc_variance128x128,
+       aom_obmc_sub_pixel_variance128x128)
+  OBFP(BLOCK_128X64, aom_obmc_sad128x64, aom_obmc_variance128x64,
+       aom_obmc_sub_pixel_variance128x64)
+  OBFP(BLOCK_64X128, aom_obmc_sad64x128, aom_obmc_variance64x128,
+       aom_obmc_sub_pixel_variance64x128)
+#endif  // CONFIG_EXT_PARTITION
+  OBFP(BLOCK_64X64, aom_obmc_sad64x64, aom_obmc_variance64x64,
+       aom_obmc_sub_pixel_variance64x64)
+  OBFP(BLOCK_64X32, aom_obmc_sad64x32, aom_obmc_variance64x32,
+       aom_obmc_sub_pixel_variance64x32)
+  OBFP(BLOCK_32X64, aom_obmc_sad32x64, aom_obmc_variance32x64,
+       aom_obmc_sub_pixel_variance32x64)
+  OBFP(BLOCK_32X32, aom_obmc_sad32x32, aom_obmc_variance32x32,
+       aom_obmc_sub_pixel_variance32x32)
+  OBFP(BLOCK_32X16, aom_obmc_sad32x16, aom_obmc_variance32x16,
+       aom_obmc_sub_pixel_variance32x16)
+  OBFP(BLOCK_16X32, aom_obmc_sad16x32, aom_obmc_variance16x32,
+       aom_obmc_sub_pixel_variance16x32)
+  OBFP(BLOCK_16X16, aom_obmc_sad16x16, aom_obmc_variance16x16,
+       aom_obmc_sub_pixel_variance16x16)
+  OBFP(BLOCK_16X8, aom_obmc_sad16x8, aom_obmc_variance16x8,
+       aom_obmc_sub_pixel_variance16x8)
+  OBFP(BLOCK_8X16, aom_obmc_sad8x16, aom_obmc_variance8x16,
+       aom_obmc_sub_pixel_variance8x16)
+  OBFP(BLOCK_8X8, aom_obmc_sad8x8, aom_obmc_variance8x8,
+       aom_obmc_sub_pixel_variance8x8)
+  OBFP(BLOCK_4X8, aom_obmc_sad4x8, aom_obmc_variance4x8,
+       aom_obmc_sub_pixel_variance4x8)
+  OBFP(BLOCK_8X4, aom_obmc_sad8x4, aom_obmc_variance8x4,
+       aom_obmc_sub_pixel_variance8x4)
+  OBFP(BLOCK_4X4, aom_obmc_sad4x4, aom_obmc_variance4x4,
+       aom_obmc_sub_pixel_variance4x4)
+#endif  // CONFIG_MOTION_VAR
+
+#if CONFIG_EXT_INTER
+  // MBFP fills the masked SAD / variance entries for one block size.
+#define MBFP(BT, MSDF, MVF, MSVF) \
+  cpi->fn_ptr[BT].msdf = MSDF;    \
+  cpi->fn_ptr[BT].mvf = MVF;      \
+  cpi->fn_ptr[BT].msvf = MSVF;
+
+#if CONFIG_EXT_PARTITION
+  MBFP(BLOCK_128X128, aom_masked_sad128x128, aom_masked_variance128x128,
+       aom_masked_sub_pixel_variance128x128)
+  MBFP(BLOCK_128X64, aom_masked_sad128x64, aom_masked_variance128x64,
+       aom_masked_sub_pixel_variance128x64)
+  MBFP(BLOCK_64X128, aom_masked_sad64x128, aom_masked_variance64x128,
+       aom_masked_sub_pixel_variance64x128)
+#endif  // CONFIG_EXT_PARTITION
+  MBFP(BLOCK_64X64, aom_masked_sad64x64, aom_masked_variance64x64,
+       aom_masked_sub_pixel_variance64x64)
+  MBFP(BLOCK_64X32, aom_masked_sad64x32, aom_masked_variance64x32,
+       aom_masked_sub_pixel_variance64x32)
+  MBFP(BLOCK_32X64, aom_masked_sad32x64, aom_masked_variance32x64,
+       aom_masked_sub_pixel_variance32x64)
+  MBFP(BLOCK_32X32, aom_masked_sad32x32, aom_masked_variance32x32,
+       aom_masked_sub_pixel_variance32x32)
+  MBFP(BLOCK_32X16, aom_masked_sad32x16, aom_masked_variance32x16,
+       aom_masked_sub_pixel_variance32x16)
+  MBFP(BLOCK_16X32, aom_masked_sad16x32, aom_masked_variance16x32,
+       aom_masked_sub_pixel_variance16x32)
+  MBFP(BLOCK_16X16, aom_masked_sad16x16, aom_masked_variance16x16,
+       aom_masked_sub_pixel_variance16x16)
+  MBFP(BLOCK_16X8, aom_masked_sad16x8, aom_masked_variance16x8,
+       aom_masked_sub_pixel_variance16x8)
+  MBFP(BLOCK_8X16, aom_masked_sad8x16, aom_masked_variance8x16,
+       aom_masked_sub_pixel_variance8x16)
+  MBFP(BLOCK_8X8, aom_masked_sad8x8, aom_masked_variance8x8,
+       aom_masked_sub_pixel_variance8x8)
+  MBFP(BLOCK_4X8, aom_masked_sad4x8, aom_masked_variance4x8,
+       aom_masked_sub_pixel_variance4x8)
+  MBFP(BLOCK_8X4, aom_masked_sad8x4, aom_masked_variance8x4,
+       aom_masked_sub_pixel_variance8x4)
+  MBFP(BLOCK_4X4, aom_masked_sad4x4, aom_masked_variance4x4,
+       aom_masked_sub_pixel_variance4x4)
+#endif  // CONFIG_EXT_INTER
+
+#if CONFIG_HIGHBITDEPTH
+  highbd_set_var_fns(cpi);
+#endif
+
+  /* av1_init_quantizer() is first called here. Add check in
+   * av1_frame_init_quantizer() so that av1_init_quantizer is only
+   * called later when needed. This will avoid unnecessary calls of
+   * av1_init_quantizer() for every frame.
+   */
+  av1_init_quantizer(cpi);
+#if CONFIG_AOM_QM
+  aom_qm_init(cm);
+#endif
+
+  av1_loop_filter_init(cm);
+#if CONFIG_LOOP_RESTORATION
+  av1_loop_restoration_precal();
+#endif  // CONFIG_LOOP_RESTORATION
+
+  // Construction finished: disarm the error landing pad.
+  cm->error.setjmp = 0;
+
+  return cpi;
+}
+
+// Append helpers for fixed-size char buffers. Both write at the current end of
+// H (found with strlen) and limit the write to the space that sizeof(H) says
+// is left, so H must be an array rather than a pointer. SNPRINT passes T as
+// the format string with no arguments; SNPRINT2 formats a single value V.
+#define SNPRINT(H, T) snprintf((H) + strlen(H), sizeof(H) - strlen(H), (T))
+
+#define SNPRINT2(H, T, V) \
+ snprintf((H) + strlen(H), sizeof(H) - strlen(H), (T), (V))
+
+// Destroys an encoder instance created by av1_create_compressor(): optionally
+// dumps statistics (when at least one frame was processed and the relevant
+// CONFIG_* options are enabled), stops the worker threads and frees their
+// per-thread data, then releases all encoder-owned memory including cpi
+// itself. A NULL cpi is ignored.
+//
+// Statistics files are best-effort: if "counts.stt" or "opsnr.stt" cannot be
+// opened, that output is skipped instead of writing through a NULL FILE.
+void av1_remove_compressor(AV1_COMP *cpi) {
+  AV1_COMMON *cm;
+  unsigned int i;
+  int t;
+
+  if (!cpi) return;
+
+  cm = &cpi->common;
+  if (cm->current_video_frame > 0) {
+#if CONFIG_ENTROPY_STATS
+    if (cpi->oxcf.pass != 1) {
+      FILE *f;
+      fprintf(stderr, "Writing counts.stt\n");
+      f = fopen("counts.stt", "wb");
+      if (f != NULL) {
+        fwrite(&aggregate_fc, sizeof(aggregate_fc), 1, f);
+        fclose(f);
+      }
+    }
+#endif  // CONFIG_ENTROPY_STATS
+#if CONFIG_INTERNAL_STATS
+    aom_clear_system_state();
+
+    if (cpi->oxcf.pass != 1) {
+      char headings[512] = { 0 };
+      char results[512] = { 0 };
+      FILE *f = fopen("opsnr.stt", "a");
+      double time_encoded =
+          (cpi->last_end_time_stamp_seen - cpi->first_time_stamp_ever) /
+          10000000.000;
+      double total_encode_time =
+          (cpi->time_receive_data + cpi->time_compress_data) / 1000.000;
+      const double dr =
+          (double)cpi->bytes * (double)8 / (double)1000 / time_encoded;
+      const double peak = (double)((1 << cpi->oxcf.input_bit_depth) - 1);
+      const double target_rate = (double)cpi->oxcf.target_bandwidth / 1000;
+      const double rate_err = ((100.0 * (dr - target_rate)) / target_rate);
+
+      // Only build and write the report when the file was actually opened.
+      if (f != NULL && cpi->b_calculate_psnr) {
+        const double total_psnr = aom_sse_to_psnr(
+            (double)cpi->total_samples, peak, (double)cpi->total_sq_error);
+        const double total_ssim =
+            100 * pow(cpi->summed_quality / cpi->summed_weights, 8.0);
+        snprintf(headings, sizeof(headings),
+                 "Bitrate\tAVGPsnr\tGLBPsnr\tAVPsnrP\tGLPsnrP\t"
+                 "AOMSSIM\tVPSSIMP\tFASTSIM\tPSNRHVS\t"
+                 "WstPsnr\tWstSsim\tWstFast\tWstHVS");
+        snprintf(results, sizeof(results),
+                 "%7.2f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t"
+                 "%7.3f\t%7.3f\t%7.3f\t%7.3f\t"
+                 "%7.3f\t%7.3f\t%7.3f\t%7.3f",
+                 dr, cpi->psnr.stat[ALL] / cpi->count, total_psnr,
+                 cpi->psnr.stat[ALL] / cpi->count, total_psnr, total_ssim,
+                 total_ssim, cpi->fastssim.stat[ALL] / cpi->count,
+                 cpi->psnrhvs.stat[ALL] / cpi->count, cpi->psnr.worst,
+                 cpi->worst_ssim, cpi->fastssim.worst, cpi->psnrhvs.worst);
+
+        if (cpi->b_calculate_blockiness) {
+          SNPRINT(headings, "\t Block\tWstBlck");
+          SNPRINT2(results, "\t%7.3f", cpi->total_blockiness / cpi->count);
+          SNPRINT2(results, "\t%7.3f", cpi->worst_blockiness);
+        }
+
+        if (cpi->b_calculate_consistency) {
+          double consistency =
+              aom_sse_to_psnr((double)cpi->total_samples, peak,
+                              (double)cpi->total_inconsistency);
+
+          SNPRINT(headings, "\tConsist\tWstCons");
+          SNPRINT2(results, "\t%7.3f", consistency);
+          SNPRINT2(results, "\t%7.3f", cpi->worst_consistency);
+        }
+        fprintf(f, "%s\t Time\tRcErr\tAbsErr\n", headings);
+        fprintf(f, "%s\t%8.0f\t%7.2f\t%7.2f\n", results, total_encode_time,
+                rate_err, fabs(rate_err));
+      }
+
+      if (f != NULL) fclose(f);
+    }
+
+#endif
+
+#if 0
+    {
+      printf("\n_pick_loop_filter_level:%d\n", cpi->time_pick_lpf / 1000);
+      printf("\n_frames recive_data encod_mb_row compress_frame Total\n");
+      printf("%6d %10ld %10ld %10ld %10ld\n", cpi->common.current_video_frame,
+             cpi->time_receive_data / 1000, cpi->time_encode_sb_row / 1000,
+             cpi->time_compress_data / 1000,
+             (cpi->time_receive_data + cpi->time_compress_data) / 1000);
+    }
+#endif
+  }
+
+  for (t = 0; t < cpi->num_workers; ++t) {
+    AVxWorker *const worker = &cpi->workers[t];
+    EncWorkerData *const thread_data = &cpi->tile_thr_data[t];
+
+    // Deallocate allocated threads.
+    aom_get_worker_interface()->end(worker);
+
+    // Deallocate allocated thread data. The last worker's data is not freed
+    // here.
+    if (t < cpi->num_workers - 1) {
+#if CONFIG_PALETTE
+      if (cpi->common.allow_screen_content_tools)
+        aom_free(thread_data->td->mb.palette_buffer);
+#endif  // CONFIG_PALETTE
+      aom_free(thread_data->td->counts);
+      av1_free_pc_tree(thread_data->td);
+      av1_free_var_tree(thread_data->td);
+      aom_free(thread_data->td);
+    }
+  }
+  aom_free(cpi->tile_thr_data);
+  aom_free(cpi->workers);
+
+  if (cpi->num_workers > 1) av1_loop_filter_dealloc(&cpi->lf_row_sync);
+
+  dealloc_compressor_data(cpi);
+
+  for (i = 0; i < sizeof(cpi->mbgraph_stats) / sizeof(cpi->mbgraph_stats[0]);
+       ++i) {
+    aom_free(cpi->mbgraph_stats[i].mb_stats);
+  }
+
+#if CONFIG_FP_MB_STATS
+  if (cpi->use_fp_mb_stats) {
+    aom_free(cpi->twopass.frame_mb_stats_buf);
+    cpi->twopass.frame_mb_stats_buf = NULL;
+  }
+#endif
+#if CONFIG_INTERNAL_STATS
+  aom_free(cpi->ssim_vars);
+  cpi->ssim_vars = NULL;
+#endif  // CONFIG_INTERNAL_STATS
+
+  av1_remove_common(cm);
+  av1_free_ref_frame_buffers(cm->buffer_pool);
+  aom_free(cpi);
+
+  // The debug dump files may never have been opened successfully.
+#ifdef OUTPUT_YUV_SKINMAP
+  if (yuv_skinmap_file) fclose(yuv_skinmap_file);
+#endif
+#ifdef OUTPUT_YUV_REC
+  if (yuv_rec_file) fclose(yuv_rec_file);
+#endif
+
+#if 0
+
+  if (keyfile)
+    fclose(keyfile);
+
+  if (framepsnr)
+    fclose(framepsnr);
+
+  if (kf_list)
+    fclose(kf_list);
+
+#endif
+}
+
+// Computes PSNR statistics between the current source frame and the frame
+// about to be shown, copies the four sample / SSE / PSNR entries into an
+// AOM_CODEC_PSNR_PKT packet and appends it to the encoder's output list.
+static void generate_psnr_packet(AV1_COMP *cpi) {
+  PSNR_STATS psnr;
+  struct aom_codec_cx_pkt pkt;
+  int idx;
+
+#if CONFIG_HIGHBITDEPTH
+  aom_calc_highbd_psnr(cpi->source, cpi->common.frame_to_show, &psnr,
+                       cpi->td.mb.e_mbd.bd, cpi->oxcf.input_bit_depth);
+#else
+  aom_calc_psnr(cpi->source, cpi->common.frame_to_show, &psnr);
+#endif
+
+  pkt.kind = AOM_CODEC_PSNR_PKT;
+  for (idx = 0; idx < 4; ++idx) {
+    pkt.data.psnr.psnr[idx] = psnr.psnr[idx];
+    pkt.data.psnr.sse[idx] = psnr.sse[idx];
+    pkt.data.psnr.samples[idx] = psnr.samples[idx];
+  }
+
+  aom_codec_pkt_list_add(cpi->output_pkt_list, &pkt);
+}
+
+// Stores the set of reference frames the encoder may use, given as a bit mask
+// with one bit per inter reference (INTER_REFS_PER_FRAME bits).
+//
+// Returns 0 on success, or -1 without changing cpi when ref_frame_flags is
+// negative or has bits set above the valid range.
+int av1_use_as_reference(AV1_COMP *cpi, int ref_frame_flags) {
+  const int max_flags = (1 << INTER_REFS_PER_FRAME) - 1;
+
+  // A negative value has its high bits set and is no more valid than a value
+  // above the mask, so both are rejected.
+  if (ref_frame_flags < 0 || ref_frame_flags > max_flags) return -1;
+
+  cpi->ref_frame_flags = ref_frame_flags;
+  return 0;
+}
+
+// Records an external request for which reference buffers to refresh. Each
+// ext_refresh_* flag becomes 1 when its AOM_*_FLAG bit is present in
+// ref_frame_flags and 0 otherwise; the request is then marked as pending.
+void av1_update_reference(AV1_COMP *cpi, int ref_frame_flags) {
+  const int refresh_last = (ref_frame_flags & AOM_LAST_FLAG) != 0;
+  const int refresh_golden = (ref_frame_flags & AOM_GOLD_FLAG) != 0;
+  const int refresh_alt = (ref_frame_flags & AOM_ALT_FLAG) != 0;
+
+  cpi->ext_refresh_last_frame = refresh_last;
+  cpi->ext_refresh_golden_frame = refresh_golden;
+  cpi->ext_refresh_alt_ref_frame = refresh_alt;
+  cpi->ext_refresh_frame_flags_pending = 1;
+}
+
+// Maps a single AOM_*_FLAG value to the corresponding reference frame and
+// returns that frame's buffer. Returns NULL when ref_frame_flag is not
+// exactly one of the recognised flags.
+static YV12_BUFFER_CONFIG *get_av1_ref_frame_buffer(
+    AV1_COMP *cpi, AOM_REFFRAME ref_frame_flag) {
+  MV_REFERENCE_FRAME ref_frame;
+
+  switch (ref_frame_flag) {
+    case AOM_LAST_FLAG: ref_frame = LAST_FRAME; break;
+#if CONFIG_EXT_REFS
+    case AOM_LAST2_FLAG: ref_frame = LAST2_FRAME; break;
+    case AOM_LAST3_FLAG: ref_frame = LAST3_FRAME; break;
+#endif  // CONFIG_EXT_REFS
+    case AOM_GOLD_FLAG: ref_frame = GOLDEN_FRAME; break;
+#if CONFIG_EXT_REFS
+    case AOM_BWD_FLAG: ref_frame = BWDREF_FRAME; break;
+#endif  // CONFIG_EXT_REFS
+    case AOM_ALT_FLAG: ref_frame = ALTREF_FRAME; break;
+    default: ref_frame = NONE_FRAME; break;
+  }
+
+  if (ref_frame == NONE_FRAME) return NULL;
+  return get_ref_frame_buffer(cpi, ref_frame);
+}
+
+// Copies the encoder's reference frame selected by ref_frame_flag into sd.
+// Returns 0 on success, or -1 when the flag does not select a reference
+// buffer.
+int av1_copy_reference_enc(AV1_COMP *cpi, AOM_REFFRAME ref_frame_flag,
+                           YV12_BUFFER_CONFIG *sd) {
+  YV12_BUFFER_CONFIG *const ref_buf =
+      get_av1_ref_frame_buffer(cpi, ref_frame_flag);
+
+  if (!ref_buf) return -1;
+
+  aom_yv12_copy_frame(ref_buf, sd);
+  return 0;
+}
+
+// Overwrites the encoder's reference frame selected by ref_frame_flag with
+// the contents of sd. Returns 0 on success, or -1 when the flag does not
+// select a reference buffer.
+int av1_set_reference_enc(AV1_COMP *cpi, AOM_REFFRAME ref_frame_flag,
+                          YV12_BUFFER_CONFIG *sd) {
+  YV12_BUFFER_CONFIG *const ref_buf =
+      get_av1_ref_frame_buffer(cpi, ref_frame_flag);
+
+  if (!ref_buf) return -1;
+
+  aom_yv12_copy_frame(sd, ref_buf);
+  return 0;
+}
+
+// Records an external request for the frame-context refresh setting: stores
+// `update` and marks the request as pending. Always returns 0.
+int av1_update_entropy(AV1_COMP *cpi, int update) {
+  cpi->ext_refresh_frame_context_pending = 1;
+  cpi->ext_refresh_frame_context = update;
+
+  return 0;
+}
+
+#if defined(OUTPUT_YUV_DENOISED) || defined(OUTPUT_YUV_SKINMAP)
+// Writes `rows` rows of `width` bytes each from `src` to `f`, advancing `src`
+// by `stride` bytes after every row. Writes nothing when rows <= 0.
+static void write_yuv420_plane(const uint8_t *src, int stride, int width,
+                               int rows, FILE *f) {
+  int r;
+
+  for (r = 0; r < rows; ++r) {
+    fwrite(src, width, 1, f);
+    src += stride;
+  }
+}
+
+// The denoiser buffer is allocated as a YUV 440 buffer. This function writes it
+// as YUV 420. We simply use the top-left pixels of the UV buffers, since we do
+// not denoise the UV channels at this time. If ever we implement UV channel
+// denoising we will have to modify this.
+//
+// The Y, U and V planes of `s` are written to `f` in that order, row by row,
+// without the stride padding. A plane with zero height is skipped.
+void aom_write_yuv_frame_420(YV12_BUFFER_CONFIG *s, FILE *f) {
+  write_yuv420_plane(s->y_buffer, s->y_stride, s->y_width, s->y_height, f);
+  write_yuv420_plane(s->u_buffer, s->uv_stride, s->uv_width, s->uv_height, f);
+  write_yuv420_plane(s->v_buffer, s->uv_stride, s->uv_width, s->uv_height, f);
+}
+#endif
+
+#if CONFIG_EXT_REFS && !CONFIG_XIPHRC
+// Updates cm->show_existing_frame for the upcoming frame:
+//  - if the flag is currently set, it is cleared;
+//  - else if the previous frame was the last bi-predictive frame, the buffer
+//    at lst_fb_idxes[0] is scheduled to be shown;
+//  - else if the ARF filter is off for the current ARF and the next update is
+//    an (internal) overlay, the alt-ref buffer is scheduled to be shown.
+// rc.is_src_frame_ext_arf is always reset.
+static void check_show_existing_frame(AV1_COMP *cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+  const FRAME_UPDATE_TYPE update_type = gf_group->update_type[gf_group->index];
+  const int arf_idx = gf_group->arf_update_idx[gf_group->index];
+  const int next_is_overlay =
+      update_type == OVERLAY_UPDATE || update_type == INTNL_OVERLAY_UPDATE;
+
+  if (cm->show_existing_frame == 1) {
+    cm->show_existing_frame = 0;
+  } else if (cpi->rc.is_last_bipred_frame) {
+    // NOTE(zoeliu): If the current frame is a last bi-predictive frame, it is
+    // needed next to show the BWDREF_FRAME, which is pointed by
+    // the last_fb_idxes[0] after reference frame buffer update
+    cpi->rc.is_last_bipred_frame = 0;
+    cm->show_existing_frame = 1;
+    cpi->existing_fb_idx_to_show = cpi->lst_fb_idxes[0];
+  } else if (cpi->is_arf_filter_off[arf_idx] && next_is_overlay) {
+    // Other parameters related to OVERLAY_UPDATE will be taken care of
+    // in av1_rc_get_second_pass_params(cpi)
+    cm->show_existing_frame = 1;
+    cpi->rc.is_src_frame_alt_ref = 1;
+    cpi->existing_fb_idx_to_show = cpi->alt_fb_idx;
+    cpi->is_arf_filter_off[arf_idx] = 0;
+  }
+
+  cpi->rc.is_src_frame_ext_arf = 0;
+}
+#endif // CONFIG_EXT_REFS && !CONFIG_XIPHRC
+
+#ifdef OUTPUT_YUV_REC
+// Writes `rows` rows of `width` samples, each `sample_size` bytes wide, from
+// `buf` to yuv_rec_file. Consecutive rows start `stride` samples apart.
+// Writes nothing when rows <= 0.
+static void write_rec_plane(const void *buf, int stride, int width, int rows,
+                            int sample_size) {
+  const uint8_t *row = (const uint8_t *)buf;
+  int r;
+
+  for (r = 0; r < rows; ++r) {
+    fwrite(row, width, sample_size, yuv_rec_file);
+    row += stride * sample_size;
+  }
+}
+
+// Appends one reconstructed frame to yuv_rec_file as planar Y, U, V without
+// stride padding, then flushes the file. Buffers flagged
+// YV12_FLAG_HIGHBITDEPTH are written as 2-byte samples, all others as 1-byte
+// samples. The luma plane uses cm->height rows; the chroma planes use the
+// buffer's uv_height. Planes with zero height are skipped.
+void aom_write_one_yuv_frame(AV1_COMMON *cm, YV12_BUFFER_CONFIG *s) {
+  const void *y_plane = s->y_buffer;
+  const void *u_plane = s->u_buffer;
+  const void *v_plane = s->v_buffer;
+  int sample_size = 1;
+
+#if CONFIG_HIGHBITDEPTH
+  if (s->flags & YV12_FLAG_HIGHBITDEPTH) {
+    y_plane = CONVERT_TO_SHORTPTR(s->y_buffer);
+    u_plane = CONVERT_TO_SHORTPTR(s->u_buffer);
+    v_plane = CONVERT_TO_SHORTPTR(s->v_buffer);
+    sample_size = 2;
+  }
+#endif  // CONFIG_HIGHBITDEPTH
+
+  write_rec_plane(y_plane, s->y_stride, s->y_width, cm->height, sample_size);
+  write_rec_plane(u_plane, s->uv_stride, s->uv_width, s->uv_height,
+                  sample_size);
+  write_rec_plane(v_plane, s->uv_stride, s->uv_width, s->uv_height,
+                  sample_size);
+
+  fflush(yuv_rec_file);
+}
+#endif  // OUTPUT_YUV_REC
+
+#if CONFIG_HIGHBITDEPTH
+static void scale_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src,
+ YV12_BUFFER_CONFIG *dst,
+ int bd) {
+#else
+static void scale_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src,
+ YV12_BUFFER_CONFIG *dst) {
+#endif // CONFIG_HIGHBITDEPTH
+ // TODO(dkovalev): replace YV12_BUFFER_CONFIG with aom_image_t
+ int i;
+ const uint8_t *const srcs[3] = { src->y_buffer, src->u_buffer,
+ src->v_buffer };
+ const int src_strides[3] = { src->y_stride, src->uv_stride, src->uv_stride };
+ const int src_widths[3] = { src->y_crop_width, src->uv_crop_width,
+ src->uv_crop_width };
+ const int src_heights[3] = { src->y_crop_height, src->uv_crop_height,
+ src->uv_crop_height };
+ uint8_t *const dsts[3] = { dst->y_buffer, dst->u_buffer, dst->v_buffer };
+ const int dst_strides[3] = { dst->y_stride, dst->uv_stride, dst->uv_stride };
+ const int dst_widths[3] = { dst->y_crop_width, dst->uv_crop_width,
+ dst->uv_crop_width };
+ const int dst_heights[3] = { dst->y_crop_height, dst->uv_crop_height,
+ dst->uv_crop_height };
+
+ for (i = 0; i < MAX_MB_PLANE; ++i) {
+#if CONFIG_HIGHBITDEPTH
+ if (src->flags & YV12_FLAG_HIGHBITDEPTH) {
+ av1_highbd_resize_plane(srcs[i], src_heights[i], src_widths[i],
+ src_strides[i], dsts[i], dst_heights[i],
+ dst_widths[i], dst_strides[i], bd);
+ } else {
+ av1_resize_plane(srcs[i], src_heights[i], src_widths[i], src_strides[i],
+ dsts[i], dst_heights[i], dst_widths[i], dst_strides[i]);
+ }
+#else
+ av1_resize_plane(srcs[i], src_heights[i], src_widths[i], src_strides[i],
+ dsts[i], dst_heights[i], dst_widths[i], dst_strides[i]);
+#endif // CONFIG_HIGHBITDEPTH
+ }
+ aom_extend_frame_borders(dst);
+}
+
+#if CONFIG_HIGHBITDEPTH
+static void scale_and_extend_frame(const YV12_BUFFER_CONFIG *src,
+                                   YV12_BUFFER_CONFIG *dst, int planes,
+                                   int bd) {
+#else
+static void scale_and_extend_frame(const YV12_BUFFER_CONFIG *src,
+                                   YV12_BUFFER_CONFIG *dst, int planes) {
+#endif  // CONFIG_HIGHBITDEPTH
+  // Rescales the first `planes` planes of src into dst in 16x16 luma-sized
+  // tiles using the EIGHTTAP_REGULAR kernel, then extends dst's borders
+  // (luma only when planes == 1). `planes` must be <= 3. Chroma planes are
+  // always treated as subsampled by 2 in both directions.
+  const int src_w = src->y_crop_width;
+  const int src_h = src->y_crop_height;
+  const int dst_w = dst->y_crop_width;
+  const int dst_h = dst->y_crop_height;
+  const uint8_t *const srcs[3] = { src->y_buffer, src->u_buffer,
+                                   src->v_buffer };
+  const int src_strides[3] = { src->y_stride, src->uv_stride, src->uv_stride };
+  uint8_t *const dsts[3] = { dst->y_buffer, dst->u_buffer, dst->v_buffer };
+  const int dst_strides[3] = { dst->y_stride, dst->uv_stride, dst->uv_stride };
+  const InterpFilterParams interp_filter_params =
+      av1_get_interp_filter_params(EIGHTTAP_REGULAR);
+  const int16_t *kernel = interp_filter_params.filter_ptr;
+  const int taps = interp_filter_params.taps;
+  // Source step per destination pixel in 1/16 pel; loop invariant.
+  const int x_step_q4 = 16 * src_w / dst_w;
+  const int y_step_q4 = 16 * src_h / dst_h;
+  int x, y, i;
+
+  assert(planes <= 3);
+  for (y = 0; y < dst_h; y += 16) {
+    for (x = 0; x < dst_w; x += 16) {
+      for (i = 0; i < planes; ++i) {
+        // i is always < 3 here, so only plane 0 (luma) is full resolution.
+        const int factor = (i == 0) ? 1 : 2;
+        const int tile = 16 / factor;
+        // Sub-pel phase of this tile's origin. The product x * tile * src_w
+        // exceeds INT_MAX for large 8x up-sampled frames, so it is formed in
+        // 64 bits; only the low four bits are needed.
+        const int x_phase = (int)(((int64_t)x * tile * src_w / dst_w) & 0xf);
+        const int y_phase = (int)(((int64_t)y * tile * src_h / dst_h) & 0xf);
+        const int src_stride = src_strides[i];
+        const int dst_stride = dst_strides[i];
+        const uint8_t *src_ptr = srcs[i] +
+                                 (y / factor) * src_h / dst_h * src_stride +
+                                 (x / factor) * src_w / dst_w;
+        uint8_t *dst_ptr = dsts[i] + (y / factor) * dst_stride + (x / factor);
+
+#if CONFIG_HIGHBITDEPTH
+        if (src->flags & YV12_FLAG_HIGHBITDEPTH) {
+          aom_highbd_convolve8(src_ptr, src_stride, dst_ptr, dst_stride,
+                               &kernel[x_phase * taps], x_step_q4,
+                               &kernel[y_phase * taps], y_step_q4, tile, tile,
+                               bd);
+          continue;
+        }
+#endif  // CONFIG_HIGHBITDEPTH
+        aom_scaled_2d(src_ptr, src_stride, dst_ptr, dst_stride,
+                      &kernel[x_phase * taps], x_step_q4,
+                      &kernel[y_phase * taps], y_step_q4, tile, tile);
+      }
+    }
+  }
+
+  if (planes == 1)
+    aom_extend_frame_borders_y(dst);
+  else
+    aom_extend_frame_borders(dst);
+}
+
+// Returns 1 when the projected size of the current key/golden/alt-ref frame
+// exceeds the SCALE_STEP1 rate threshold while the frame is still unscaled
+// and q has reached the maximum allowed for its rate-factor level; returns 0
+// otherwise.
+static int scale_down(AV1_COMP *cpi, int q) {
+  RATE_CONTROL *const rc = &cpi->rc;
+  GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+  int size_limit;
+
+  assert(frame_is_kf_gf_arf(cpi));
+
+  if (rc->frame_size_selector != UNSCALED) return 0;
+  if (q < rc->rf_level_maxq[gf_group->rf_level[gf_group->index]]) return 0;
+
+  size_limit = (int)(rate_thresh_mult[SCALE_STEP1] *
+                     AOMMAX(rc->this_frame_target, rc->avg_frame_bandwidth));
+  return rc->projected_frame_size > size_limit ? 1 : 0;
+}
+
+#if CONFIG_GLOBAL_MOTION
+#define GM_RECODE_LOOP_NUM4X4_FACTOR 192
+// Resets to the default warp parameters every reference whose non-identity
+// global motion model was used too rarely to justify its signalling cost
+// (uses * GM_RECODE_LOOP_NUM4X4_FACTOR < cost). Returns non-zero when such a
+// reset means the frame should be recoded.
+static int recode_loop_test_global_motion(AV1_COMP *cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  RD_COUNTS *const rdc = &cpi->td.rd_counts;
+  int need_recode = 0;
+  int ref;
+
+  for (ref = LAST_FRAME; ref <= ALTREF_FRAME; ++ref) {
+    if (cm->global_motion[ref].wmtype == IDENTITY) continue;
+    if (rdc->global_motion_used[ref] * GM_RECODE_LOOP_NUM4X4_FACTOR >=
+        cpi->gmparams_cost[ref])
+      continue;
+
+    set_default_warp_params(&cm->global_motion[ref]);
+    cpi->gmparams_cost[ref] = 0;
+#if CONFIG_REF_MV
+    need_recode = 1;
+#else
+    if (rdc->global_motion_used[ref] > 0) need_recode = 1;
+#endif
+  }
+  return need_recode;
+}
+#endif // CONFIG_GLOBAL_MOTION
+
+// Decides whether the frame just coded should be coded again.
+// Returns 1 (and sets cpi->resize_pending) when a dynamic down-scale is
+// called for, 1 when the projected size falls outside [low_limit, high_limit]
+// and q can still move towards maxq / minq, 1 for a CQ-mode undershoot above
+// the configured cq_level, and 0 otherwise.
+static int recode_loop_test(AV1_COMP *cpi, int high_limit, int low_limit, int q,
+                            int maxq, int minq) {
+  const RATE_CONTROL *const rc = &cpi->rc;
+  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+  const int is_kf_gf_arf = frame_is_kf_gf_arf(cpi);
+  const int recode_permitted =
+      (rc->projected_frame_size >= rc->max_frame_bandwidth) ||
+      (cpi->sf.recode_loop == ALLOW_RECODE) ||
+      (is_kf_gf_arf && (cpi->sf.recode_loop == ALLOW_RECODE_KFARFGF));
+
+  if (!recode_permitted) return 0;
+
+  if (is_kf_gf_arf && (oxcf->resize_mode == RESIZE_DYNAMIC) &&
+      scale_down(cpi, q)) {
+    // Code this group at a lower resolution.
+    cpi->resize_pending = 1;
+    return 1;
+  }
+
+  // TODO(agrange) high_limit could be greater than the scale-down threshold.
+  if (rc->projected_frame_size > high_limit && q < maxq) return 1;
+  if (rc->projected_frame_size < low_limit && q > minq) return 1;
+
+  if (cpi->oxcf.rc_mode != AOM_CQ) return 0;
+
+  // Deal with frame undershoot and whether or not we are below the
+  // automatically set cq level.
+  if (q > oxcf->cq_level &&
+      rc->projected_frame_size < ((rc->this_frame_target * 7) >> 3)) {
+    return 1;
+  }
+  return 0;
+}
+
+// Returns the index of the first of the REF_FRAMES + 1 entries in ubufs whose
+// ref_count is zero, or INVALID_IDX when every entry is in use.
+static INLINE int get_free_upsampled_ref_buf(EncRefCntBuffer *ubufs) {
+  int idx = 0;
+
+  while (idx < (REF_FRAMES + 1)) {
+    if (ubufs[idx].ref_count == 0) return idx;
+    ++idx;
+  }
+  return INVALID_IDX;
+}
+
+// Up-samples one reference frame into a free entry of
+// cpi->upsampled_ref_bufs, growing that entry's buffer first when it is
+// smaller than 64x the allocation of `ref`. Returns the entry's index, or
+// INVALID_IDX when no entry is free.
+static INLINE int upsample_ref_frame(AV1_COMP *cpi,
+                                     const YV12_BUFFER_CONFIG *const ref) {
+  AV1_COMMON *const cm = &cpi->common;
+  const int slot = get_free_upsampled_ref_buf(cpi->upsampled_ref_bufs);
+  YV12_BUFFER_CONFIG *up_buf;
+
+  if (slot == INVALID_IDX) return INVALID_IDX;
+
+  up_buf = &cpi->upsampled_ref_bufs[slot].buf;
+
+  // Can allocate buffer for Y plane only.
+  if (up_buf->buffer_alloc_sz < (ref->buffer_alloc_sz << 6) &&
+      aom_realloc_frame_buffer(up_buf, (cm->width << 3), (cm->height << 3),
+                               cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_HIGHBITDEPTH
+                               cm->use_highbitdepth,
+#endif
+                               (AOM_BORDER_IN_PIXELS << 3), cm->byte_alignment,
+                               NULL, NULL, NULL)) {
+    aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+                       "Failed to allocate up-sampled frame buffer");
+  }
+
+// Currently, only Y plane is up-sampled, U, V are not used.
+#if CONFIG_HIGHBITDEPTH
+  scale_and_extend_frame(ref, up_buf, 1, (int)cm->bit_depth);
+#else
+  scale_and_extend_frame(ref, up_buf, 1);
+#endif
+  return slot;
+}
+
+#define DUMP_REF_FRAME_IMAGES 0
+
+#if DUMP_REF_FRAME_IMAGES == 1
+// Writes ref_buf to file_name as raw 8-bit planar data: cm->height rows of
+// cm->width luma bytes, then U and V at half that width and height.
+// Returns AOM_CODEC_OK on success and AOM_CODEC_MEM_ERROR when ref_buf is
+// NULL or the file cannot be opened.
+static int dump_one_image(AV1_COMMON *cm,
+                          const YV12_BUFFER_CONFIG *const ref_buf,
+                          char *file_name) {
+  FILE *out = NULL;
+  int plane;
+
+  if (ref_buf == NULL) {
+    printf("Frame data buffer is NULL.\n");
+    return AOM_CODEC_MEM_ERROR;
+  }
+
+  out = fopen(file_name, "wb");
+  if (out == NULL) {
+    printf("Unable to open file %s to write.\n", file_name);
+    return AOM_CODEC_MEM_ERROR;
+  }
+
+  // Plane order: Y, U, V.
+  for (plane = 0; plane < 3; ++plane) {
+    const int is_chroma = plane > 0;
+    const uint8_t *const base = (plane == 0)   ? ref_buf->y_buffer
+                                : (plane == 1) ? ref_buf->u_buffer
+                                               : ref_buf->v_buffer;
+    const int stride = is_chroma ? ref_buf->uv_stride : ref_buf->y_stride;
+    const int rows = is_chroma ? (cm->height >> 1) : cm->height;
+    const int cols = is_chroma ? (cm->width >> 1) : cm->width;
+    int r;
+
+    for (r = 0; r < rows; ++r) {
+      fwrite(&base[r * stride], 1, cols, out);
+    }
+  }
+
+  fclose(out);
+
+  return AOM_CODEC_OK;
+}
+
+// Dumps every reference from LAST_FRAME to ALTREF_FRAME to
+// /tmp/enc_F<frame>_ref_<ref>.yuv via dump_one_image().
+static void dump_ref_frame_images(AV1_COMP *cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  MV_REFERENCE_FRAME ref_frame = LAST_FRAME;
+  char file_name[256];
+
+  while (ref_frame <= ALTREF_FRAME) {
+    file_name[0] = '\0';
+    snprintf(file_name, sizeof(file_name), "/tmp/enc_F%d_ref_%d.yuv",
+             cm->current_video_frame, ref_frame);
+    dump_one_image(cm, get_ref_frame_buffer(cpi, ref_frame), file_name);
+    ++ref_frame;
+  }
+}
+#endif // DUMP_REF_FRAME_IMAGES == 1
+
+#if CONFIG_EXT_REFS
+// Moves the virtual indices of the LAST reference frames down one position
+// when LAST_FRAME is about to be updated:
+//   LAST_FRAME -> LAST2_FRAME -> LAST3_FRAME
+// lst_fb_idxes[0] is left for the caller to fill in.
+static INLINE void shift_last_ref_frames(AV1_COMP *cpi) {
+  const int copy_filter_stats = !cpi->rc.is_src_frame_alt_ref;
+  int dst;
+
+  for (dst = LAST_REF_FRAMES - 1; dst > 0; --dst) {
+    const int src = dst - 1;
+
+    cpi->lst_fb_idxes[dst] = cpi->lst_fb_idxes[src];
+
+    // Row [0] of interp_filter_selected belongs to the frame being coded;
+    // the rows for the reference frames start at [LAST_FRAME], i.e. [1].
+    if (copy_filter_stats) {
+      memcpy(cpi->interp_filter_selected[dst + LAST_FRAME],
+             cpi->interp_filter_selected[src + LAST_FRAME],
+             sizeof(cpi->interp_filter_selected[src + LAST_FRAME]));
+    }
+  }
+}
+#endif // CONFIG_EXT_REFS
+
+void av1_update_reference_frames(AV1_COMP *cpi) { /*
+ * Post-encode reference bookkeeping for the frame just coded.
+ * - Points the ref_frame_map[] slots selected by the frame type and the
+ *   cpi->refresh_*_frame flags at cm->new_fb_idx (via ref_cnt_fb).
+ * - When cpi->sf.use_upsampled_references is set, mirrors each of those
+ *   updates in cpi->upsampled_ref_idx[] (via uref_cnt_fb) using new_uidx.
+ * - Swaps / shifts the virtual indices (gld_fb_idx, alt_fb_idx, bwd_fb_idx,
+ *   lst_fb_idxes[], arf_map[]) where a branch below calls for it.
+ * - Copies rows of interp_filter_selected[] so the per-reference filter
+ *   statistics follow the references they describe.
+ */
+ AV1_COMMON *const cm = &cpi->common;
+ BufferPool *const pool = cm->buffer_pool;
+ const int use_upsampled_ref = cpi->sf.use_upsampled_references;
+ int new_uidx = 0;  // Index into cpi->upsampled_ref_bufs; only assigned below when use_upsampled_ref is set.
+
+ // NOTE: Save the new show frame buffer index for --test-code=warn, i.e.,
+ // in order to verify that there is no mismatch between encoder and decoder.
+ if (cm->show_frame) cpi->last_show_frame_buf_idx = cm->new_fb_idx;
+
+ if (use_upsampled_ref) {
+#if CONFIG_EXT_REFS
+ if (cm->show_existing_frame) {
+ new_uidx = cpi->upsampled_ref_idx[cpi->existing_fb_idx_to_show];
+ // TODO(zoeliu): Once the following assertion is confirmed to hold, remove it.
+ assert(cpi->upsampled_ref_bufs[new_uidx].ref_count > 0);
+ } else {
+#endif // CONFIG_EXT_REFS
+ // Up-sample the current encoded frame.
+ RefCntBuffer *bufs = pool->frame_bufs;
+ const YV12_BUFFER_CONFIG *const ref = &bufs[cm->new_fb_idx].buf;
+
+ new_uidx = upsample_ref_frame(cpi, ref);  // NOTE(review): may be INVALID_IDX; that is only asserted against under CONFIG_EXT_REFS.
+#if CONFIG_EXT_REFS
+ assert(new_uidx != INVALID_IDX);
+ }
+#endif // CONFIG_EXT_REFS
+ }
+ // At this point the new frame has been encoded.
+ // If any buffer copy / swapping is signaled it should be done here.
+ if (cm->frame_type == KEY_FRAME) {
+ ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->gld_fb_idx],
+ cm->new_fb_idx);
+#if CONFIG_EXT_REFS
+ ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->bwd_fb_idx],
+ cm->new_fb_idx);
+#endif // CONFIG_EXT_REFS
+ ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->alt_fb_idx],
+ cm->new_fb_idx);
+
+ if (use_upsampled_ref) {
+ uref_cnt_fb(cpi->upsampled_ref_bufs,
+ &cpi->upsampled_ref_idx[cpi->gld_fb_idx], new_uidx);
+#if CONFIG_EXT_REFS
+ uref_cnt_fb(cpi->upsampled_ref_bufs,
+ &cpi->upsampled_ref_idx[cpi->bwd_fb_idx], new_uidx);
+#endif // CONFIG_EXT_REFS
+ uref_cnt_fb(cpi->upsampled_ref_bufs,
+ &cpi->upsampled_ref_idx[cpi->alt_fb_idx], new_uidx);
+ }
+ } else if (av1_preserve_existing_gf(cpi)) {
+ // We have decided to preserve the previously existing golden frame as our
+ // new ARF frame. However, in the short term in function
+ // av1_bitstream.c::get_refresh_mask() we left it in the GF slot and, if
+ // we're updating the GF with the current decoded frame, we save it to the
+ // ARF slot instead.
+ // We now have to update the ARF with the current frame and swap gld_fb_idx
+ // and alt_fb_idx so that, overall, we've stored the old GF in the new ARF
+ // slot and, if we're updating the GF, the current frame becomes the new GF.
+ int tmp;
+
+ ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->alt_fb_idx],
+ cm->new_fb_idx);
+ if (use_upsampled_ref)
+ uref_cnt_fb(cpi->upsampled_ref_bufs,
+ &cpi->upsampled_ref_idx[cpi->alt_fb_idx], new_uidx);
+
+ tmp = cpi->alt_fb_idx;  // Swap the golden and alt-ref virtual indices.
+ cpi->alt_fb_idx = cpi->gld_fb_idx;
+ cpi->gld_fb_idx = tmp;
+
+#if CONFIG_EXT_REFS
+ // We need to modify the mapping accordingly
+ cpi->arf_map[0] = cpi->alt_fb_idx;
+#endif
+// TODO(zoeliu): Do we need to copy cpi->interp_filter_selected[0] over to
+// cpi->interp_filter_selected[GOLDEN_FRAME]?
+#if CONFIG_EXT_REFS
+ } else if (cpi->rc.is_last_bipred_frame) {
+ // Refresh the LAST_FRAME with the BWDREF_FRAME and retire the LAST3_FRAME
+ // by updating the virtual indices. Note that the frame BWDREF_FRAME points
+ // to now should be retired, and it should not be used before refreshed.
+ int tmp = cpi->lst_fb_idxes[LAST_REF_FRAMES - 1];
+
+ shift_last_ref_frames(cpi);
+ cpi->lst_fb_idxes[0] = cpi->bwd_fb_idx;
+ cpi->bwd_fb_idx = tmp;
+
+ memcpy(cpi->interp_filter_selected[LAST_FRAME],
+ cpi->interp_filter_selected[BWDREF_FRAME],
+ sizeof(cpi->interp_filter_selected[BWDREF_FRAME]));
+ } else if (cpi->rc.is_src_frame_ext_arf && cm->show_existing_frame) {
+ // Deal with the special case for showing existing internal ALTREF_FRAME
+ // Refresh the LAST_FRAME with the ALTREF_FRAME and retire the LAST3_FRAME
+ // by updating the virtual indices.
+ const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+ int which_arf = gf_group->arf_ref_idx[gf_group->index];
+ int tmp = cpi->lst_fb_idxes[LAST_REF_FRAMES - 1];
+
+ shift_last_ref_frames(cpi);
+ cpi->lst_fb_idxes[0] = cpi->alt_fb_idx;
+ cpi->alt_fb_idx = tmp;
+
+ // We need to modify the mapping accordingly
+ cpi->arf_map[which_arf] = cpi->alt_fb_idx;
+
+ memcpy(cpi->interp_filter_selected[LAST_FRAME],
+ cpi->interp_filter_selected[ALTREF_FRAME + which_arf],
+ sizeof(cpi->interp_filter_selected[ALTREF_FRAME + which_arf]));
+#endif // CONFIG_EXT_REFS
+ } else { /* For non key/golden frames */
+ if (cpi->refresh_alt_ref_frame) {
+ int arf_idx = cpi->alt_fb_idx;
+ int which_arf = 0;  // Stays 0 unless CONFIG_EXT_REFS two-pass code below overrides it.
+#if CONFIG_EXT_REFS
+ if (cpi->oxcf.pass == 2) {
+ const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+ which_arf = gf_group->arf_update_idx[gf_group->index];
+ arf_idx = cpi->arf_map[which_arf];
+ }
+#else
+ if ((cpi->oxcf.pass == 2) && cpi->multi_arf_allowed) {
+ const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+ arf_idx = gf_group->arf_update_idx[gf_group->index];
+ }
+#endif // CONFIG_EXT_REFS
+ ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[arf_idx], cm->new_fb_idx);
+ if (use_upsampled_ref)
+ uref_cnt_fb(cpi->upsampled_ref_bufs, &cpi->upsampled_ref_idx[arf_idx],
+ new_uidx);
+
+ memcpy(cpi->interp_filter_selected[ALTREF_FRAME + which_arf],
+ cpi->interp_filter_selected[0],
+ sizeof(cpi->interp_filter_selected[0]));
+ }
+
+ if (cpi->refresh_golden_frame) {
+ ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->gld_fb_idx],
+ cm->new_fb_idx);
+ if (use_upsampled_ref)
+ uref_cnt_fb(cpi->upsampled_ref_bufs,
+ &cpi->upsampled_ref_idx[cpi->gld_fb_idx], new_uidx);
+
+#if !CONFIG_EXT_REFS
+ if (!cpi->rc.is_src_frame_alt_ref)
+#endif // !CONFIG_EXT_REFS
+ memcpy(cpi->interp_filter_selected[GOLDEN_FRAME],
+ cpi->interp_filter_selected[0],
+ sizeof(cpi->interp_filter_selected[0]));
+ }
+
+#if CONFIG_EXT_REFS
+ if (cpi->refresh_bwd_ref_frame) {
+ if (cpi->rc.is_bwd_ref_frame && cpi->num_extra_arfs) {
+ // We have swapped the virtual indices to allow bwd_ref_frame to use
+ // ALT0 as reference frame. We need to swap them back.
+ // NOTE: The ALT_REFs are indexed in reverse order, and ALT0 refers to the
+ // farthest ALT_REF from the first frame in the gf group.
+ int tmp = cpi->arf_map[0];
+ cpi->arf_map[0] = cpi->alt_fb_idx;
+ cpi->alt_fb_idx = cpi->bwd_fb_idx;
+ cpi->bwd_fb_idx = tmp;
+ }
+
+ ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->bwd_fb_idx],
+ cm->new_fb_idx);
+ if (use_upsampled_ref)
+ uref_cnt_fb(cpi->upsampled_ref_bufs,
+ &cpi->upsampled_ref_idx[cpi->bwd_fb_idx], new_uidx);
+
+ memcpy(cpi->interp_filter_selected[BWDREF_FRAME],
+ cpi->interp_filter_selected[0],
+ sizeof(cpi->interp_filter_selected[0]));
+ }
+#endif // CONFIG_EXT_REFS
+ }
+
+ if (cpi->refresh_last_frame) {
+#if CONFIG_EXT_REFS
+ // NOTE(zoeliu): We have two layers of mapping (1) from the per-frame
+ // reference to the reference frame buffer virtual index; and then (2) from
+ // the virtual index to the reference frame buffer physical index:
+ //
+ // LAST_FRAME, ..., LAST3_FRAME, ..., ALTREF_FRAME
+ // | | |
+ // v v v
+ // lst_fb_idxes[0], ..., lst_fb_idxes[2], ..., alt_fb_idx
+ // | | |
+ // v v v
+ // ref_frame_map[], ..., ref_frame_map[], ..., ref_frame_map[]
+ //
+ // When refresh_last_frame is set, it is intended to retire LAST3_FRAME,
+ // have the other 2 LAST reference frames shifted as follows:
+ // LAST_FRAME -> LAST2_FRAME -> LAST3_FRAME
+ // , and then have LAST_FRAME refreshed by the newly coded frame.
+ //
+ // To fulfill it, the decoder will be notified to execute following 2 steps:
+ //
+ // (a) To change ref_frame_map[] and have the virtual index of LAST3_FRAME
+ // to point to the newly coded frame, i.e.
+ // ref_frame_map[lst_fb_idxes[2]] => new_fb_idx;
+ //
+ // (b) To change the 1st layer mapping to have LAST_FRAME mapped to the
+ // original virtual index of LAST3_FRAME and have the other mappings
+ // shifted as follows:
+ // LAST_FRAME, LAST2_FRAME, LAST3_FRAME
+ // | | |
+ // v v v
+ // lst_fb_idxes[2], lst_fb_idxes[0], lst_fb_idxes[1]
+ int ref_frame;
+
+ if (cpi->rc.is_bwd_ref_frame && cpi->num_extra_arfs) {
+ // We have swapped the virtual indices to use ALT0 as BWD_REF
+ // and we need to swap them back.
+ int tmp = cpi->arf_map[0];
+ cpi->arf_map[0] = cpi->alt_fb_idx;
+ cpi->alt_fb_idx = cpi->bwd_fb_idx;
+ cpi->bwd_fb_idx = tmp;
+ }
+
+ if (cm->frame_type == KEY_FRAME) {
+ for (ref_frame = 0; ref_frame < LAST_REF_FRAMES; ++ref_frame) {  // A key frame refreshes every LAST slot.
+ ref_cnt_fb(pool->frame_bufs,
+ &cm->ref_frame_map[cpi->lst_fb_idxes[ref_frame]],
+ cm->new_fb_idx);
+
+ if (use_upsampled_ref)
+ uref_cnt_fb(cpi->upsampled_ref_bufs,
+ &cpi->upsampled_ref_idx[cpi->lst_fb_idxes[ref_frame]],
+ new_uidx);
+ }
+ } else {
+ int tmp;
+
+ ref_cnt_fb(pool->frame_bufs,
+ &cm->ref_frame_map[cpi->lst_fb_idxes[LAST_REF_FRAMES - 1]],
+ cm->new_fb_idx);
+
+ if (use_upsampled_ref)
+ uref_cnt_fb(
+ cpi->upsampled_ref_bufs,
+ &cpi->upsampled_ref_idx[cpi->lst_fb_idxes[LAST_REF_FRAMES - 1]],
+ new_uidx);
+
+ tmp = cpi->lst_fb_idxes[LAST_REF_FRAMES - 1];  // Slot just refreshed; it becomes LAST_FRAME after the shift.
+
+ shift_last_ref_frames(cpi);
+ cpi->lst_fb_idxes[0] = tmp;
+
+ assert(cm->show_existing_frame == 0);
+ // NOTE: Currently only LF_UPDATE and INTNL_OVERLAY_UPDATE frames are to
+ // refresh the LAST_FRAME.
+ memcpy(cpi->interp_filter_selected[LAST_FRAME],
+ cpi->interp_filter_selected[0],
+ sizeof(cpi->interp_filter_selected[0]));
+ }
+#else
+ ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->lst_fb_idx],
+ cm->new_fb_idx);
+ if (use_upsampled_ref)
+ uref_cnt_fb(cpi->upsampled_ref_bufs,
+ &cpi->upsampled_ref_idx[cpi->lst_fb_idx], new_uidx);
+ if (!cpi->rc.is_src_frame_alt_ref) {
+ memcpy(cpi->interp_filter_selected[LAST_FRAME],
+ cpi->interp_filter_selected[0],
+ sizeof(cpi->interp_filter_selected[0]));
+ }
+#endif // CONFIG_EXT_REFS
+ }
+
+#if DUMP_REF_FRAME_IMAGES == 1
+ // Dump out all reference frame images.
+ dump_ref_frame_images(cpi);
+#endif // DUMP_REF_FRAME_IMAGES
+}
+
+// Runs the in-loop filtering stages on cm->frame_to_show: deblocking (level
+// forced to 0 for lossless coding, otherwise picked and timed here), then
+// CDEF and loop restoration when those tools are compiled in, and finally
+// extends the frame's inner borders.
+static void loopfilter_frame(AV1_COMP *cpi, AV1_COMMON *cm) {
+  MACROBLOCKD *xd = &cpi->td.mb.e_mbd;
+  struct loopfilter *lf = &cm->lf;
+
+  if (!is_lossless_requested(&cpi->oxcf)) {
+    struct aom_usec_timer pick_timer;
+
+    aom_clear_system_state();
+
+    aom_usec_timer_start(&pick_timer);
+    av1_pick_filter_level(cpi->source, cpi, cpi->sf.lpf_pick);
+    aom_usec_timer_mark(&pick_timer);
+
+    cpi->time_pick_lpf += aom_usec_timer_elapsed(&pick_timer);
+  } else {
+    lf->filter_level = 0;
+  }
+
+  if (lf->filter_level > 0) {
+#if CONFIG_VAR_TX || CONFIG_EXT_PARTITION || CONFIG_CB4X4
+    av1_loop_filter_frame(cm->frame_to_show, cm, xd, lf->filter_level, 0, 0);
+#else
+    if (cpi->num_workers <= 1)
+      av1_loop_filter_frame(cm->frame_to_show, cm, xd, lf->filter_level, 0, 0);
+    else
+      av1_loop_filter_frame_mt(cm->frame_to_show, cm, xd->plane,
+                               lf->filter_level, 0, 0, cpi->workers,
+                               cpi->num_workers, &cpi->lf_row_sync);
+#endif
+  }
+#if CONFIG_CDEF
+  if (!is_lossless_requested(&cpi->oxcf)) {
+    // Search for the CDEF parameters, then apply the filter.
+    av1_cdef_search(cm->frame_to_show, cpi->source, cm, xd);
+    av1_cdef_frame(cm->frame_to_show, cm, xd);
+  } else {
+    cm->cdef_bits = 0;
+    cm->cdef_strengths[0] = 0;
+    cm->nb_cdef_strengths = 1;
+  }
+#endif
+#if CONFIG_LOOP_RESTORATION
+  av1_pick_filter_restoration(cpi->source, cpi, cpi->sf.lpf_pick);
+  // Restoration runs when at least one of the three planes asks for it.
+  if (cm->rst_info[0].frame_restoration_type != RESTORE_NONE ||
+      cm->rst_info[1].frame_restoration_type != RESTORE_NONE ||
+      cm->rst_info[2].frame_restoration_type != RESTORE_NONE) {
+    av1_loop_restoration_frame(cm->frame_to_show, cm, cm->rst_info, 7, 0, NULL);
+  }
+#endif  // CONFIG_LOOP_RESTORATION
+  aom_extend_frame_inner_borders(cm->frame_to_show);
+}
+
+// Makes sure frame buffer `buffer_idx` has an mvs array covering at least
+// cm->mi_rows x cm->mi_cols entries. An existing array that is large enough
+// is kept; otherwise it is freed and a zeroed one is allocated.
+static INLINE void alloc_frame_mvs(AV1_COMMON *const cm, int buffer_idx) {
+  RefCntBuffer *const new_fb_ptr = &cm->buffer_pool->frame_bufs[buffer_idx];
+
+  if (new_fb_ptr->mvs != NULL && new_fb_ptr->mi_rows >= cm->mi_rows &&
+      new_fb_ptr->mi_cols >= cm->mi_cols) {
+    return;
+  }
+
+  aom_free(new_fb_ptr->mvs);
+  CHECK_MEM_ERROR(cm, new_fb_ptr->mvs,
+                  (MV_REF *)aom_calloc(cm->mi_rows * cm->mi_cols,
+                                       sizeof(*new_fb_ptr->mvs)));
+  new_fb_ptr->mi_rows = cm->mi_rows;
+  new_fb_ptr->mi_cols = cm->mi_cols;
+}
+
+void av1_scale_references(AV1_COMP *cpi) { /*
+ * For every inter reference (LAST_FRAME..ALTREF_FRAME) records in
+ * cpi->scaled_ref_idx[ref_frame - 1] the frame buffer to use as that
+ * reference at the current coded size (cm->width x cm->height):
+ * - reference disabled in cpi->ref_frame_flags: INVALID_IDX, but only when
+ *   oxcf.pass != 0 (otherwise the entry is left untouched);
+ * - reference buffer missing (NULL): INVALID_IDX;
+ * - reference size differs from the frame size: a pool buffer holding a
+ *   rescaled copy produced by scale_and_extend_frame();
+ * - sizes match: the reference's own buffer index, with its ref_count
+ *   incremented.
+ * Returns early, leaving later references unprocessed, if get_free_fb()
+ * yields INVALID_IDX.
+ */
+ AV1_COMMON *cm = &cpi->common;
+ MV_REFERENCE_FRAME ref_frame;
+ const AOM_REFFRAME ref_mask[INTER_REFS_PER_FRAME] = {
+ AOM_LAST_FLAG,
+#if CONFIG_EXT_REFS
+ AOM_LAST2_FLAG,
+ AOM_LAST3_FLAG,
+#endif // CONFIG_EXT_REFS
+ AOM_GOLD_FLAG,
+#if CONFIG_EXT_REFS
+ AOM_BWD_FLAG,
+#endif // CONFIG_EXT_REFS
+ AOM_ALT_FLAG
+ };
+
+ for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+ // ref_mask[] is indexed from 0 while reference frames start at LAST_FRAME, so subtract 1.
+ if (cpi->ref_frame_flags & ref_mask[ref_frame - 1]) {
+ BufferPool *const pool = cm->buffer_pool;
+ const YV12_BUFFER_CONFIG *const ref =
+ get_ref_frame_buffer(cpi, ref_frame);
+
+ if (ref == NULL) {
+ cpi->scaled_ref_idx[ref_frame - 1] = INVALID_IDX;
+ continue;
+ }
+
+#if CONFIG_HIGHBITDEPTH
+ if (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height) {
+ RefCntBuffer *new_fb_ptr = NULL;
+ int force_scaling = 0;  // Set when a fresh buffer is taken, so its contents must be produced.
+ int new_fb = cpi->scaled_ref_idx[ref_frame - 1];
+ if (new_fb == INVALID_IDX) {
+ new_fb = get_free_fb(cm);
+ force_scaling = 1;
+ }
+ if (new_fb == INVALID_IDX) return;  // No free buffer: gives up silently.
+ new_fb_ptr = &pool->frame_bufs[new_fb];
+ if (force_scaling || new_fb_ptr->buf.y_crop_width != cm->width ||
+ new_fb_ptr->buf.y_crop_height != cm->height) {
+ if (aom_realloc_frame_buffer(
+ &new_fb_ptr->buf, cm->width, cm->height, cm->subsampling_x,
+ cm->subsampling_y, cm->use_highbitdepth, AOM_BORDER_IN_PIXELS,
+ cm->byte_alignment, NULL, NULL, NULL))
+ aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate frame buffer");
+ scale_and_extend_frame(ref, &new_fb_ptr->buf, MAX_MB_PLANE,
+ (int)cm->bit_depth);
+ cpi->scaled_ref_idx[ref_frame - 1] = new_fb;
+ alloc_frame_mvs(cm, new_fb);
+ }
+#else
+ if (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height) {
+ RefCntBuffer *new_fb_ptr = NULL;
+ int force_scaling = 0;  // Set when a fresh buffer is taken, so its contents must be produced.
+ int new_fb = cpi->scaled_ref_idx[ref_frame - 1];
+ if (new_fb == INVALID_IDX) {
+ new_fb = get_free_fb(cm);
+ force_scaling = 1;
+ }
+ if (new_fb == INVALID_IDX) return;  // No free buffer: gives up silently.
+ new_fb_ptr = &pool->frame_bufs[new_fb];
+ if (force_scaling || new_fb_ptr->buf.y_crop_width != cm->width ||
+ new_fb_ptr->buf.y_crop_height != cm->height) {
+ if (aom_realloc_frame_buffer(&new_fb_ptr->buf, cm->width, cm->height,
+ cm->subsampling_x, cm->subsampling_y,
+ AOM_BORDER_IN_PIXELS, cm->byte_alignment,
+ NULL, NULL, NULL))
+ aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate frame buffer");
+ scale_and_extend_frame(ref, &new_fb_ptr->buf, MAX_MB_PLANE);
+ cpi->scaled_ref_idx[ref_frame - 1] = new_fb;
+ alloc_frame_mvs(cm, new_fb);
+ }
+#endif // CONFIG_HIGHBITDEPTH
+
+ if (cpi->sf.use_upsampled_references &&
+ (force_scaling || new_fb_ptr->buf.y_crop_width != cm->width ||
+ new_fb_ptr->buf.y_crop_height != cm->height)) {  // NOTE(review): the size tests presumably can no longer be true after the realloc above, leaving force_scaling as the effective condition; confirm.
+ const int map_idx = get_ref_frame_map_idx(cpi, ref_frame);
+ EncRefCntBuffer *ubuf =
+ &cpi->upsampled_ref_bufs[cpi->upsampled_ref_idx[map_idx]];
+
+ if (aom_realloc_frame_buffer(&ubuf->buf, (cm->width << 3),
+ (cm->height << 3), cm->subsampling_x,
+ cm->subsampling_y,
+#if CONFIG_HIGHBITDEPTH
+ cm->use_highbitdepth,
+#endif
+ (AOM_BORDER_IN_PIXELS << 3),
+ cm->byte_alignment, NULL, NULL, NULL))
+ aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate up-sampled frame buffer");
+#if CONFIG_HIGHBITDEPTH
+ scale_and_extend_frame(&new_fb_ptr->buf, &ubuf->buf, 1,
+ (int)cm->bit_depth);
+#else
+ scale_and_extend_frame(&new_fb_ptr->buf, &ubuf->buf, 1);  // planes == 1: only the Y plane is up-sampled.
+#endif
+ }
+ } else {
+ const int buf_idx = get_ref_frame_buf_idx(cpi, ref_frame);
+ RefCntBuffer *const buf = &pool->frame_bufs[buf_idx];
+ buf->buf.y_crop_width = ref->y_crop_width;
+ buf->buf.y_crop_height = ref->y_crop_height;
+ cpi->scaled_ref_idx[ref_frame - 1] = buf_idx;
+ ++buf->ref_count;  // The unscaled reference is shared, so take an extra reference on it.
+ }
+ } else {
+ if (cpi->oxcf.pass != 0) cpi->scaled_ref_idx[ref_frame - 1] = INVALID_IDX;
+ }
+ }
+}
+
+// Drops the references recorded in cpi->scaled_ref_idx[].
+// When oxcf.pass != 0 every valid entry is released. When oxcf.pass == 0 an
+// entry is released only if its reference is about to be refreshed or the
+// scaled buffer has the same crop size as the reference it stands in for.
+static void release_scaled_references(AV1_COMP *cpi) {
+  AV1_COMMON *cm = &cpi->common;
+
+  if (cpi->oxcf.pass != 0) {
+    int slot;
+    for (slot = 0; slot < TOTAL_REFS_PER_FRAME; ++slot) {
+      const int idx = cpi->scaled_ref_idx[slot];
+      if (idx == INVALID_IDX) continue;
+      --cm->buffer_pool->frame_bufs[idx].ref_count;
+      cpi->scaled_ref_idx[slot] = INVALID_IDX;
+    }
+    return;
+  }
+
+  {
+    // Only release scaled references under certain conditions:
+    // if reference will be updated, or if scaled reference has same
+    // resolution.
+    const int refresh[INTER_REFS_PER_FRAME] = {
+      (cpi->refresh_last_frame) ? 1 : 0,
+#if CONFIG_EXT_REFS
+      0,
+      0,
+      (cpi->refresh_golden_frame) ? 1 : 0,
+      (cpi->refresh_bwd_ref_frame) ? 1 : 0,
+      (cpi->refresh_alt_ref_frame) ? 1 : 0
+#else
+      (cpi->refresh_golden_frame) ? 1 : 0,
+      (cpi->refresh_alt_ref_frame) ? 1 : 0
+#endif  // CONFIG_EXT_REFS
+    };
+    int ref_frame;
+
+    for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+      const int slot = ref_frame - 1;
+      const int idx = cpi->scaled_ref_idx[slot];
+      const YV12_BUFFER_CONFIG *const ref =
+          get_ref_frame_buffer(cpi, ref_frame);
+      RefCntBuffer *buf;
+
+      if (idx == INVALID_IDX) continue;
+      buf = &cm->buffer_pool->frame_bufs[idx];
+
+      if (!refresh[slot] && (buf->buf.y_crop_width != ref->y_crop_width ||
+                             buf->buf.y_crop_height != ref->y_crop_height)) {
+        continue;
+      }
+      --buf->ref_count;
+      cpi->scaled_ref_idx[slot] = INVALID_IDX;
+    }
+  }
+}
+
+// Collapses one full token histogram into the model histogram: ZERO, ONE and
+// EOB are copied across, and every token from TWO_TOKEN up to (but not
+// including) EOB_TOKEN is summed into the TWO_TOKEN bin.
+static void full_to_model_count(unsigned int *model_count,
+                                unsigned int *full_count) {
+  unsigned int two_or_more = 0;
+  int token;
+
+  model_count[ZERO_TOKEN] = full_count[ZERO_TOKEN];
+  model_count[ONE_TOKEN] = full_count[ONE_TOKEN];
+
+  for (token = TWO_TOKEN; token < EOB_TOKEN; ++token) {
+    two_or_more += full_count[token];
+  }
+  model_count[TWO_TOKEN] = two_or_more;
+
+  model_count[EOB_MODEL_TOKEN] = full_count[EOB_TOKEN];
+}
+
+// Applies full_to_model_count() to every (plane type, reference type, band,
+// context) histogram, converting full_count into model_count.
+void av1_full_to_model_counts(av1_coeff_count_model *model_count,
+                              av1_coeff_count *full_count) {
+  int plane, ref, band;
+
+  for (plane = 0; plane < PLANE_TYPES; ++plane) {
+    for (ref = 0; ref < REF_TYPES; ++ref) {
+      for (band = 0; band < COEF_BANDS; ++band) {
+        const int num_ctx = BAND_COEFF_CONTEXTS(band);
+        int ctx;
+        for (ctx = 0; ctx < num_ctx; ++ctx) {
+          full_to_model_count(model_count[plane][ref][band][ctx],
+                              full_count[plane][ref][band][ctx]);
+        }
+      }
+    }
+  }
+}
+
+#if 0 && CONFIG_INTERNAL_STATS
+static void output_frame_level_debug_stats(AV1_COMP *cpi) { /*
+ * Debug helper (compiled out by the enclosing "#if 0"). Appends one line of
+ * per-frame rate-control statistics to "tmp.stt"; the file is truncated
+ * instead of appended to when cm->current_video_frame is 0. The line is
+ * only written when twopass.total_left_stats.coded_error is non-zero.
+ */
+ AV1_COMMON *const cm = &cpi->common;
+ FILE *const f = fopen("tmp.stt", cm->current_video_frame ? "a" : "w");  // NOTE(review): f is not checked for NULL before fprintf/fclose.
+ int64_t recon_err;
+
+ aom_clear_system_state();
+
+ recon_err = aom_get_y_sse(cpi->source, get_frame_new_buffer(cm));  // Luma SSE between the source and the newly coded frame.
+
+ if (cpi->twopass.total_left_stats.coded_error != 0.0)
+ fprintf(f, "%10u %dx%d %d %d %10d %10d %10d %10d"
+ "%10"PRId64" %10"PRId64" %5d %5d %10"PRId64" "
+ "%10"PRId64" %10"PRId64" %10d "
+ "%7.2lf %7.2lf %7.2lf %7.2lf %7.2lf"
+ "%6d %6d %5d %5d %5d "
+ "%10"PRId64" %10.3lf"
+ "%10lf %8u %10"PRId64" %10d %10d %10d\n",
+ cpi->common.current_video_frame,
+ cm->width, cm->height,
+ cpi->rc.source_alt_ref_pending,
+ cpi->rc.source_alt_ref_active,
+ cpi->rc.this_frame_target,
+ cpi->rc.projected_frame_size,
+ cpi->rc.projected_frame_size / cpi->common.MBs,
+ (cpi->rc.projected_frame_size - cpi->rc.this_frame_target),
+ cpi->rc.vbr_bits_off_target,
+ cpi->rc.vbr_bits_off_target_fast,
+ cpi->twopass.extend_minq,
+ cpi->twopass.extend_minq_fast,
+ cpi->rc.total_target_vs_actual,
+ (cpi->rc.starting_buffer_level - cpi->rc.bits_off_target),
+ cpi->rc.total_actual_bits, cm->base_qindex,
+ av1_convert_qindex_to_q(cm->base_qindex, cm->bit_depth),
+ (double)av1_dc_quant(cm->base_qindex, 0, cm->bit_depth) / 4.0,
+ av1_convert_qindex_to_q(cpi->twopass.active_worst_quality,
+ cm->bit_depth),
+ cpi->rc.avg_q,
+ av1_convert_qindex_to_q(cpi->oxcf.cq_level, cm->bit_depth),
+ cpi->refresh_last_frame, cpi->refresh_golden_frame,
+ cpi->refresh_alt_ref_frame, cm->frame_type, cpi->rc.gfu_boost,
+ cpi->twopass.bits_left,
+ cpi->twopass.total_left_stats.coded_error,
+ cpi->twopass.bits_left /
+ (1 + cpi->twopass.total_left_stats.coded_error),
+ cpi->tot_recode_hits, recon_err, cpi->rc.kf_boost,
+ cpi->twopass.kf_zeromotion_pct,
+ cpi->twopass.fr_content_type);
+
+ fclose(f);
+
+ if (0) {  // Disabled: per-frame mode-choice counts appended to "Modes.stt".
+ FILE *const fmodes = fopen("Modes.stt", "a");
+ int i;
+
+ fprintf(fmodes, "%6d:%1d:%1d:%1d ", cpi->common.current_video_frame,
+ cm->frame_type, cpi->refresh_golden_frame,
+ cpi->refresh_alt_ref_frame);
+
+ for (i = 0; i < MAX_MODES; ++i)
+ fprintf(fmodes, "%5d ", cpi->mode_chosen_counts[i]);
+
+ fprintf(fmodes, "\n");
+
+ fclose(fmodes);
+ }
+}
+#endif
+
+// Chooses cpi->mv_step_param for the coming frame. The default comes from
+// the smaller frame dimension. With sf.mv.auto_mv_step_size enabled, shown
+// inter frames instead use twice the largest motion vector magnitude seen in
+// the previous frame, capped by that default, and max_mv_magnitude is re-armed
+// for the next frame.
+static void set_mv_search_params(AV1_COMP *cpi) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const unsigned int max_mv_def = AOMMIN(cm->width, cm->height);
+
+  // Default based on max resolution.
+  cpi->mv_step_param = av1_init_search_range(max_mv_def);
+
+  if (!cpi->sf.mv.auto_mv_step_size) return;
+
+  if (frame_is_intra_only(cm)) {
+    // Initialize max_mv_magnitude for use in the first INTER frame
+    // after a key/intra-only frame.
+    cpi->max_mv_magnitude = max_mv_def;
+    return;
+  }
+
+  if (cm->show_frame) {
+    // Allow mv_steps to correspond to twice the max mv magnitude found
+    // in the previous frame, capped by the default max_mv_magnitude based
+    // on resolution.
+    cpi->mv_step_param = av1_init_search_range(
+        AOMMIN(max_mv_def, 2 * cpi->max_mv_magnitude));
+  }
+  cpi->max_mv_magnitude = 0;
+}
+
+// Per-frame setup that does not depend on the frame dimensions: resets the
+// global motion models (when compiled in), applies the size-independent
+// speed features and RD thresholds, and selects the default interpolation
+// filter.
+static void set_size_independent_vars(AV1_COMP *cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+#if CONFIG_GLOBAL_MOTION
+  int ref;
+  for (ref = LAST_FRAME; ref <= ALTREF_FRAME; ++ref) {
+    set_default_warp_params(&cm->global_motion[ref]);
+  }
+  cpi->global_motion_search_done = 0;
+#endif  // CONFIG_GLOBAL_MOTION
+  av1_set_speed_features_framesize_independent(cpi);
+  av1_set_rd_speed_thresholds(cpi);
+  av1_set_rd_speed_thresholds_sub8x8(cpi);
+  cm->interp_filter = cpi->sf.default_interp_filter;
+}
+
+// Per-frame setup that depends on the frame dimensions.
+//   q:            receives the quantizer picked for the frame.
+//   bottom_index: receives the lower q bound.
+//   top_index:    receives the upper q bound.
+// Also sets the motion vector precision for inter frames and, in the second
+// pass with sf.static_segmentation set, configures static segmentation.
+static void set_size_dependent_vars(AV1_COMP *cpi, int *q, int *bottom_index,
+                                    int *top_index) {
+  AV1_COMMON *const cm = &cpi->common;
+  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+
+  // Setup variables that depend on the dimensions of the frame.
+  av1_set_speed_features_framesize_dependent(cpi);
+
+// Decide q and q bounds.
+#if CONFIG_XIPHRC
+  int frame_type = cm->frame_type == KEY_FRAME ? OD_I_FRAME : OD_P_FRAME;
+  *q = od_enc_rc_select_quantizers_and_lambdas(
+      &cpi->od_rc, cpi->refresh_golden_frame, cpi->refresh_alt_ref_frame,
+      frame_type, bottom_index, top_index);
+#else
+  *q = av1_rc_pick_q_and_bounds(cpi, bottom_index, top_index);
+#endif
+
+  if (!frame_is_intra_only(cm)) {
+    av1_set_high_precision_mv(cpi, (*q) < HIGH_PRECISION_MV_QTHRESH);
+  }
+
+  // Static-region segmentation is experimental. It needs lagged coding, so
+  // it is limited to the second pass of a two pass encode, and it must be
+  // enabled by the speed feature flag.
+  if (oxcf->pass != 2 || !cpi->sf.static_segmentation) return;
+  configure_static_seg_features(cpi);
+}
+
+// Initialises the search-site configuration in cpi->ss_cfg for the selected
+// motion search method, using the luma stride of the scaled source buffer.
+// Search methods other than NSTEP and DIAMOND leave cpi->ss_cfg untouched.
+static void init_motion_estimation(AV1_COMP *cpi) {
+  const int stride = cpi->scaled_source.y_stride;
+
+  switch (cpi->sf.mv.search_method) {
+    case NSTEP: av1_init3smotion_compensation(&cpi->ss_cfg, stride); break;
+    case DIAMOND: av1_init_dsmotion_compensation(&cpi->ss_cfg, stride); break;
+    default: break;
+  }
+}
+
+#if CONFIG_LOOP_RESTORATION
+// Gives the first three entries of |rst| the same restoration tile size:
+// half of RESTORATION_TILESIZE_MAX. The frame dimensions are accepted but
+// not used.
+static void set_restoration_tilesize(int width, int height,
+                                     RestorationInfo *rst) {
+  int plane;
+  (void)width;
+  (void)height;
+  rst[0].restoration_tilesize = (RESTORATION_TILESIZE_MAX >> 1);
+  // Entries 1 and 2 mirror entry 0.
+  for (plane = 1; plane <= 2; ++plane)
+    rst[plane].restoration_tilesize = rst[0].restoration_tilesize;
+}
+#endif  // CONFIG_LOOP_RESTORATION
+
+// Brings the encoder state in line with the size of the frame about to be
+// coded.
+//
+// In order, this:
+//  1. applies a pending frame-size change (two-pass VBR, or one-pass CBR with
+//     dynamic resize) through av1_set_size_literal();
+//  2. in the second pass (non-XIPHRC builds) sets the target rate;
+//  3. (re)allocates the frame MV storage, the new frame buffer, the loop
+//     restoration buffers (when compiled in) and the utility frame buffers;
+//  4. re-initialises the motion search configuration; and
+//  5. for every reference frame, records its buffer and computes the scale
+//     factors between that buffer and the current frame size.
+//
+// A failure to allocate the new frame buffer is reported through
+// aom_internal_error() with AOM_CODEC_MEM_ERROR.
+static void set_frame_size(AV1_COMP *cpi) {
+ int ref_frame;
+ AV1_COMMON *const cm = &cpi->common;
+ AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+
+ // Two-pass VBR: recompute the coded size on the first frame for fixed
+ // resize, or whenever a dynamic resize is pending.
+ if (oxcf->pass == 2 && oxcf->rc_mode == AOM_VBR &&
+ ((oxcf->resize_mode == RESIZE_FIXED && cm->current_video_frame == 0) ||
+ (oxcf->resize_mode == RESIZE_DYNAMIC && cpi->resize_pending))) {
+ av1_calculate_coded_size(cpi, &oxcf->scaled_frame_width,
+ &oxcf->scaled_frame_height);
+
+ // There has been a change in frame size.
+ av1_set_size_literal(cpi, oxcf->scaled_frame_width,
+ oxcf->scaled_frame_height);
+ }
+
+ // One-pass CBR with dynamic resize: resize_pending == 1 scales the current
+ // size by resize_scale_num / resize_scale_den, resize_pending == -1 restores
+ // the configured size.
+ if (oxcf->pass == 0 && oxcf->rc_mode == AOM_CBR &&
+ oxcf->resize_mode == RESIZE_DYNAMIC) {
+ if (cpi->resize_pending == 1) {
+ oxcf->scaled_frame_width =
+ (cm->width * cpi->resize_scale_num) / cpi->resize_scale_den;
+ oxcf->scaled_frame_height =
+ (cm->height * cpi->resize_scale_num) / cpi->resize_scale_den;
+ } else if (cpi->resize_pending == -1) {
+ // Go back up to original size.
+ oxcf->scaled_frame_width = oxcf->width;
+ oxcf->scaled_frame_height = oxcf->height;
+ }
+ if (cpi->resize_pending != 0) {
+ // There has been a change in frame size.
+ av1_set_size_literal(cpi, oxcf->scaled_frame_width,
+ oxcf->scaled_frame_height);
+
+ // TODO(agrange) Scale cpi->max_mv_magnitude if frame-size has changed.
+ set_mv_search_params(cpi);
+ }
+ }
+
+#if !CONFIG_XIPHRC
+ if (oxcf->pass == 2) {
+ av1_set_target_rate(cpi);
+ }
+#endif
+
+ alloc_frame_mvs(cm, cm->new_fb_idx);
+
+ // Reset the frame pointers to the current frame size.
+ if (aom_realloc_frame_buffer(get_frame_new_buffer(cm), cm->width, cm->height,
+ cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_HIGHBITDEPTH
+ cm->use_highbitdepth,
+#endif
+ AOM_BORDER_IN_PIXELS, cm->byte_alignment, NULL,
+ NULL, NULL))
+ aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate frame buffer");
+
+#if CONFIG_LOOP_RESTORATION
+ // Start every plane with restoration disabled, then size the restoration
+ // buffers and the per-plane search structures for the current frame.
+ set_restoration_tilesize(cm->width, cm->height, cm->rst_info);
+ for (int i = 0; i < MAX_MB_PLANE; ++i)
+ cm->rst_info[i].frame_restoration_type = RESTORE_NONE;
+ av1_alloc_restoration_buffers(cm);
+ for (int i = 0; i < MAX_MB_PLANE; ++i) {
+ cpi->rst_search[i].restoration_tilesize =
+ cm->rst_info[i].restoration_tilesize;
+ av1_alloc_restoration_struct(cm, &cpi->rst_search[i], cm->width,
+ cm->height);
+ }
+#endif // CONFIG_LOOP_RESTORATION
+ alloc_util_frame_buffers(cpi);
+ init_motion_estimation(cpi);
+
+ // Bind each reference slot to its frame buffer and compute the scale factors
+ // from the reference's cropped size to the current frame size.
+ for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+ RefBuffer *const ref_buf = &cm->frame_refs[ref_frame - LAST_FRAME];
+ const int buf_idx = get_ref_frame_buf_idx(cpi, ref_frame);
+
+ ref_buf->idx = buf_idx;
+
+ if (buf_idx != INVALID_IDX) {
+ YV12_BUFFER_CONFIG *const buf = &cm->buffer_pool->frame_bufs[buf_idx].buf;
+ ref_buf->buf = buf;
+#if CONFIG_HIGHBITDEPTH
+ av1_setup_scale_factors_for_frame(
+ &ref_buf->sf, buf->y_crop_width, buf->y_crop_height, cm->width,
+ cm->height, (buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0);
+#else
+ av1_setup_scale_factors_for_frame(&ref_buf->sf, buf->y_crop_width,
+ buf->y_crop_height, cm->width,
+ cm->height);
+#endif // CONFIG_HIGHBITDEPTH
+ // A reference of a different size has its borders extended here.
+ if (av1_is_scaled(&ref_buf->sf)) aom_extend_frame_borders(buf);
+ } else {
+ ref_buf->buf = NULL;
+ }
+ }
+
+ set_ref_ptrs(cm, xd, LAST_FRAME, LAST_FRAME);
+}
+
+// Rebuilds the up-sampled reference state from scratch: re-initialises the
+// up-sampled buffer pool, then up-samples every reference frame, records the
+// resulting buffer index against the reference's map slot and bumps that
+// buffer's reference count.
+static void reset_use_upsampled_references(AV1_COMP *cpi) {
+  MV_REFERENCE_FRAME ref_frame = LAST_FRAME;
+
+  init_upsampled_ref_frame_bufs(cpi);
+
+  while (ref_frame <= ALTREF_FRAME) {
+    const YV12_BUFFER_CONFIG *const src = get_ref_frame_buffer(cpi, ref_frame);
+    const int uidx = upsample_ref_frame(cpi, src);
+
+    // Point the reference's map slot at the freshly up-sampled buffer.
+    cpi->upsampled_ref_idx[get_ref_frame_map_idx(cpi, ref_frame)] = uidx;
+    cpi->upsampled_ref_bufs[uidx].ref_count++;
+    ++ref_frame;
+  }
+}
+
+// Encodes the current frame exactly once (no re-encode on size mismatch).
+//
+// Sets up the frame size, scales the source (and last source, if present) and
+// the references as needed, chooses q, configures the selected adaptive
+// quantization mode and then calls av1_encode_frame().
+static void encode_without_recode_loop(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ int q = 0, bottom_index = 0, top_index = 0; // Dummy variables.
+ // Sampled before set_size_independent_vars() runs, so that a 0 -> 1
+ // transition of the speed feature can be detected below.
+ const int use_upsampled_ref = cpi->sf.use_upsampled_references;
+
+ aom_clear_system_state();
+
+ set_frame_size(cpi);
+
+ // For 1 pass CBR under dynamic resize mode: use faster scaling for source.
+ // Only for 2x2 scaling for now.
+ if (cpi->oxcf.pass == 0 && cpi->oxcf.rc_mode == AOM_CBR &&
+ cpi->oxcf.resize_mode == RESIZE_DYNAMIC &&
+ cpi->un_scaled_source->y_width == (cm->width << 1) &&
+ cpi->un_scaled_source->y_height == (cm->height << 1)) {
+ cpi->source = av1_scale_if_required_fast(cm, cpi->un_scaled_source,
+ &cpi->scaled_source);
+ if (cpi->unscaled_last_source != NULL)
+ cpi->last_source = av1_scale_if_required_fast(
+ cm, cpi->unscaled_last_source, &cpi->scaled_last_source);
+ } else {
+ cpi->source =
+ av1_scale_if_required(cm, cpi->un_scaled_source, &cpi->scaled_source);
+ if (cpi->unscaled_last_source != NULL)
+ cpi->last_source = av1_scale_if_required(cm, cpi->unscaled_last_source,
+ &cpi->scaled_last_source);
+ }
+
+ // References are only needed (and scaled) for non-intra frames.
+ if (frame_is_intra_only(cm) == 0) {
+ av1_scale_references(cpi);
+ }
+
+ set_size_independent_vars(cpi);
+ set_size_dependent_vars(cpi, &q, &bottom_index, &top_index);
+
+ // cpi->sf.use_upsampled_references can be different from frame to frame.
+ // Every time when cpi->sf.use_upsampled_references is changed from 0 to 1.
+ // The reference frames for this frame have to be up-sampled before encoding.
+ if (!use_upsampled_ref && cpi->sf.use_upsampled_references &&
+ cm->frame_type != KEY_FRAME)
+ reset_use_upsampled_references(cpi);
+
+ av1_set_quantizer(cm, q);
+ av1_set_variance_partition_thresholds(cpi, q);
+
+ setup_frame(cpi);
+
+#if CONFIG_SUBFRAME_PROB_UPDATE
+ // Sub-frame probability updates are only used with a single tile; snapshot
+ // the starting coefficient probabilities for them.
+ cm->do_subframe_update = cm->tile_cols == 1 && cm->tile_rows == 1;
+ av1_copy(cm->starting_coef_probs, cm->fc->coef_probs);
+ av1_copy(cpi->subframe_stats.enc_starting_coef_probs, cm->fc->coef_probs);
+ cm->coef_probs_update_idx = 0;
+ av1_copy(cpi->subframe_stats.coef_probs_buf[0], cm->fc->coef_probs);
+#endif // CONFIG_SUBFRAME_PROB_UPDATE
+
+ suppress_active_map(cpi);
+ // Variance adaptive and in frame q adjustment experiments are mutually
+ // exclusive.
+ if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
+ av1_vaq_frame_setup(cpi);
+ } else if (cpi->oxcf.aq_mode == COMPLEXITY_AQ) {
+ av1_setup_in_frame_q_adj(cpi);
+ } else if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) {
+ av1_cyclic_refresh_setup(cpi);
+ }
+ apply_active_map(cpi);
+
+ // transform / motion compensation build reconstruction frame
+ av1_encode_frame(cpi);
+
+ // Update some stats from cyclic refresh, and check if we should not update
+ // golden reference, for 1 pass CBR.
+ if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->frame_type != KEY_FRAME &&
+ (cpi->oxcf.pass == 0 && cpi->oxcf.rc_mode == AOM_CBR))
+ av1_cyclic_refresh_check_golden_update(cpi);
+
+ // Update the skip mb flag probabilities based on the distribution
+ // seen in the last encoder iteration.
+ // update_base_skip_probs(cpi);
+ aom_clear_system_state();
+}
+
+// Encodes the current frame, re-encoding with an adjusted quantizer until the
+// projected size is acceptable or q stops changing.
+//
+// Each iteration encodes the frame at the current q, optionally performs a
+// dummy bitstream pack into |dest| (its length is returned through |*size|) to
+// obtain the projected frame size, and then decides whether to loop again.
+// [q_low, q_high] is the search interval that is narrowed on every overshoot
+// or undershoot.
+//
+// bottom_index, top_index and the frame size limits are declared without an
+// initializer; they are assigned on the first iteration (loop_count == 0)
+// before any use.
+static void encode_with_recode_loop(AV1_COMP *cpi, size_t *size,
+ uint8_t *dest) {
+ AV1_COMMON *const cm = &cpi->common;
+ RATE_CONTROL *const rc = &cpi->rc;
+ int bottom_index, top_index;
+ int loop_count = 0;
+ int loop_at_this_size = 0;
+ int loop = 0;
+#if !CONFIG_XIPHRC
+ int overshoot_seen = 0;
+ int undershoot_seen = 0;
+#endif
+ int frame_over_shoot_limit;
+ int frame_under_shoot_limit;
+ int q = 0, q_low = 0, q_high = 0;
+ // Sampled before set_size_independent_vars() runs, so that a 0 -> 1
+ // transition of the speed feature can be detected inside the loop.
+ const int use_upsampled_ref = cpi->sf.use_upsampled_references;
+
+ set_size_independent_vars(cpi);
+
+ do {
+ aom_clear_system_state();
+
+ set_frame_size(cpi);
+
+ // (Re)derive the size-dependent state on the first pass and after every
+ // change of frame size.
+ if (loop_count == 0 || cpi->resize_pending != 0) {
+ set_size_dependent_vars(cpi, &q, &bottom_index, &top_index);
+
+ // cpi->sf.use_upsampled_references can be different from frame to frame.
+ // Every time when cpi->sf.use_upsampled_references is changed from 0 to
+ // 1.
+ // The reference frames for this frame have to be up-sampled before
+ // encoding.
+ if (!use_upsampled_ref && cpi->sf.use_upsampled_references &&
+ cm->frame_type != KEY_FRAME)
+ reset_use_upsampled_references(cpi);
+
+ // TODO(agrange) Scale cpi->max_mv_magnitude if frame-size has changed.
+ set_mv_search_params(cpi);
+
+#if !CONFIG_XIPHRC
+ // Reset the loop state for new frame size.
+ overshoot_seen = 0;
+ undershoot_seen = 0;
+#endif
+
+ // Reconfiguration for change in frame size has concluded.
+ cpi->resize_pending = 0;
+
+ q_low = bottom_index;
+ q_high = top_index;
+
+ loop_at_this_size = 0;
+ }
+
+ // Decide frame size bounds first time through.
+ if (loop_count == 0) {
+ av1_rc_compute_frame_size_bounds(cpi, rc->this_frame_target,
+ &frame_under_shoot_limit,
+ &frame_over_shoot_limit);
+ }
+
+ cpi->source =
+ av1_scale_if_required(cm, cpi->un_scaled_source, &cpi->scaled_source);
+
+ if (cpi->unscaled_last_source != NULL)
+ cpi->last_source = av1_scale_if_required(cm, cpi->unscaled_last_source,
+ &cpi->scaled_last_source);
+
+ // On re-encodes the previously scaled references are released before
+ // scaling again.
+ if (frame_is_intra_only(cm) == 0) {
+ if (loop_count > 0) {
+ release_scaled_references(cpi);
+ }
+ av1_scale_references(cpi);
+ }
+
+ av1_set_quantizer(cm, q);
+
+ if (loop_count == 0) setup_frame(cpi);
+
+#if CONFIG_Q_ADAPT_PROBS
+ // Base q-index may have changed, so we need to assign proper default coef
+ // probs before every iteration.
+ if (frame_is_intra_only(cm) || cm->error_resilient_mode) {
+ int i;
+ av1_default_coef_probs(cm);
+ if (cm->frame_type == KEY_FRAME || cm->error_resilient_mode ||
+ cm->reset_frame_context == RESET_FRAME_CONTEXT_ALL) {
+ for (i = 0; i < FRAME_CONTEXTS; ++i) cm->frame_contexts[i] = *cm->fc;
+ } else if (cm->reset_frame_context == RESET_FRAME_CONTEXT_CURRENT) {
+ cm->frame_contexts[cm->frame_context_idx] = *cm->fc;
+ }
+ }
+#endif // CONFIG_Q_ADAPT_PROBS
+
+#if CONFIG_SUBFRAME_PROB_UPDATE
+ cm->do_subframe_update = cm->tile_cols == 1 && cm->tile_rows == 1;
+ if (loop_count == 0 || frame_is_intra_only(cm) ||
+ cm->error_resilient_mode) {
+ av1_copy(cm->starting_coef_probs, cm->fc->coef_probs);
+ av1_copy(cpi->subframe_stats.enc_starting_coef_probs, cm->fc->coef_probs);
+ } else {
+ // Re-encode: restore the probabilities saved on the first iteration and
+ // clear the counts gathered by the previous attempt.
+ if (cm->do_subframe_update) {
+ av1_copy(cm->fc->coef_probs,
+ cpi->subframe_stats.enc_starting_coef_probs);
+ av1_copy(cm->starting_coef_probs,
+ cpi->subframe_stats.enc_starting_coef_probs);
+ av1_zero(cpi->subframe_stats.coef_counts_buf);
+ av1_zero(cpi->subframe_stats.eob_counts_buf);
+ }
+ }
+ cm->coef_probs_update_idx = 0;
+ av1_copy(cpi->subframe_stats.coef_probs_buf[0], cm->fc->coef_probs);
+#endif // CONFIG_SUBFRAME_PROB_UPDATE
+
+ // Variance adaptive and in frame q adjustment experiments are mutually
+ // exclusive.
+ if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
+ av1_vaq_frame_setup(cpi);
+ } else if (cpi->oxcf.aq_mode == COMPLEXITY_AQ) {
+ av1_setup_in_frame_q_adj(cpi);
+ }
+
+ // transform / motion compensation build reconstruction frame
+ av1_encode_frame(cpi);
+
+ // Update the skip mb flag probabilities based on the distribution
+ // seen in the last encoder iteration.
+ // update_base_skip_probs(cpi);
+
+ aom_clear_system_state();
+
+ // Dummy pack of the bitstream using up to date stats to get an
+ // accurate estimate of output frame size to determine if we need
+ // to recode.
+ if (cpi->sf.recode_loop >= ALLOW_RECODE_KFARFGF) {
+ save_coding_context(cpi);
+
+ av1_pack_bitstream(cpi, dest, size);
+
+ // *size is in bytes; the projected size is kept in bits.
+ rc->projected_frame_size = (int)(*size) << 3;
+ restore_coding_context(cpi);
+
+ if (frame_over_shoot_limit == 0) frame_over_shoot_limit = 1;
+ }
+
+ // Constant-q mode never recodes.
+ if (cpi->oxcf.rc_mode == AOM_Q) {
+ loop = 0;
+ } else {
+ // Forced key frame: steer q by the reconstruction error relative to the
+ // error recorded in cpi->ambient_err, rather than by size alone.
+ if ((cm->frame_type == KEY_FRAME) && rc->this_key_frame_forced &&
+ (rc->projected_frame_size < rc->max_frame_bandwidth)) {
+ int last_q = q;
+ int64_t kf_err;
+
+ int64_t high_err_target = cpi->ambient_err;
+ int64_t low_err_target = cpi->ambient_err >> 1;
+
+#if CONFIG_HIGHBITDEPTH
+ if (cm->use_highbitdepth) {
+ kf_err = aom_highbd_get_y_sse(cpi->source, get_frame_new_buffer(cm));
+ } else {
+ kf_err = aom_get_y_sse(cpi->source, get_frame_new_buffer(cm));
+ }
+#else
+ kf_err = aom_get_y_sse(cpi->source, get_frame_new_buffer(cm));
+#endif // CONFIG_HIGHBITDEPTH
+
+ // Prevent possible divide by zero error below for perfect KF
+ kf_err += !kf_err;
+
+ // The key frame is not good enough or we can afford
+ // to make it better without undue risk of popping.
+ if ((kf_err > high_err_target &&
+ rc->projected_frame_size <= frame_over_shoot_limit) ||
+ (kf_err > low_err_target &&
+ rc->projected_frame_size <= frame_under_shoot_limit)) {
+ // Lower q_high
+ q_high = q > q_low ? q - 1 : q_low;
+
+ // Adjust Q
+ q = (int)((q * high_err_target) / kf_err);
+ q = AOMMIN(q, (q_high + q_low) >> 1);
+ } else if (kf_err < low_err_target &&
+ rc->projected_frame_size >= frame_under_shoot_limit) {
+ // The key frame is much better than the previous frame
+ // Raise q_low
+ q_low = q < q_high ? q + 1 : q_high;
+
+ // Adjust Q
+ q = (int)((q * low_err_target) / kf_err);
+ q = AOMMIN(q, (q_high + q_low + 1) >> 1);
+ }
+
+ // Clamp Q to upper and lower limits:
+ q = clamp(q, q_low, q_high);
+
+ loop = q != last_q;
+ } else if (recode_loop_test(cpi, frame_over_shoot_limit,
+ frame_under_shoot_limit, q,
+ AOMMAX(q_high, top_index), bottom_index)) {
+ // Is the projected frame size out of range and are we allowed
+ // to attempt to recode.
+ int last_q = q;
+#if !CONFIG_XIPHRC
+ int retries = 0;
+#endif
+
+ if (cpi->resize_pending == 1) {
+ // Change in frame size so go back around the recode loop.
+ cpi->rc.frame_size_selector =
+ SCALE_STEP1 - cpi->rc.frame_size_selector;
+ cpi->rc.next_frame_size_selector = cpi->rc.frame_size_selector;
+
+#if CONFIG_INTERNAL_STATS
+ ++cpi->tot_recode_hits;
+#endif
+ ++loop_count;
+ loop = 1;
+ // In a do/while, `continue` jumps to the `while (loop)` test; with
+ // loop == 1 the body is re-entered from the top.
+ continue;
+ }
+
+#if !CONFIG_XIPHRC
+ // Frame size out of permitted range:
+ // Update correction factor & compute new Q to try...
+ // Frame is too large
+ if (rc->projected_frame_size > rc->this_frame_target) {
+ // Special case if the projected size is > the max allowed.
+ if (rc->projected_frame_size >= rc->max_frame_bandwidth)
+ q_high = rc->worst_quality;
+
+ // Raise Qlow as to at least the current value
+ q_low = q < q_high ? q + 1 : q_high;
+
+ if (undershoot_seen || loop_at_this_size > 1) {
+ // Update the rate correction factors, then bisect [q_low, q_high]
+ // (rounding up).
+ av1_rc_update_rate_correction_factors(cpi);
+
+ q = (q_high + q_low + 1) / 2;
+ } else {
+ // Update the rate correction factors, then let rate control
+ // propose the next q.
+ av1_rc_update_rate_correction_factors(cpi);
+
+ q = av1_rc_regulate_q(cpi, rc->this_frame_target, bottom_index,
+ AOMMAX(q_high, top_index));
+
+ // Retry (at most 10 times) while the proposal is below q_low.
+ while (q < q_low && retries < 10) {
+ av1_rc_update_rate_correction_factors(cpi);
+ q = av1_rc_regulate_q(cpi, rc->this_frame_target, bottom_index,
+ AOMMAX(q_high, top_index));
+ retries++;
+ }
+ }
+
+ overshoot_seen = 1;
+ } else {
+ // Frame is too small
+ q_high = q > q_low ? q - 1 : q_low;
+
+ if (overshoot_seen || loop_at_this_size > 1) {
+ av1_rc_update_rate_correction_factors(cpi);
+ q = (q_high + q_low) / 2;
+ } else {
+ av1_rc_update_rate_correction_factors(cpi);
+ q = av1_rc_regulate_q(cpi, rc->this_frame_target, bottom_index,
+ top_index);
+ // Special case reset for qlow for constrained quality.
+ // This should only trigger where there is very substantial
+ // undershoot on a frame and the auto cq level is above
+ // the user passed in value.
+ if (cpi->oxcf.rc_mode == AOM_CQ && q < q_low) {
+ q_low = q;
+ }
+
+ // Retry (at most 10 times) while the proposal is above q_high.
+ while (q > q_high && retries < 10) {
+ av1_rc_update_rate_correction_factors(cpi);
+ q = av1_rc_regulate_q(cpi, rc->this_frame_target, bottom_index,
+ top_index);
+ retries++;
+ }
+ }
+
+ undershoot_seen = 1;
+ }
+#endif
+
+ // Clamp Q to upper and lower limits:
+ q = clamp(q, q_low, q_high);
+
+ loop = (q != last_q);
+ } else {
+ loop = 0;
+ }
+ }
+
+ // Special case for overlay frame.
+ if (rc->is_src_frame_alt_ref &&
+ rc->projected_frame_size < rc->max_frame_bandwidth)
+ loop = 0;
+
+#if CONFIG_GLOBAL_MOTION
+ // This test can force another iteration regardless of the decisions above.
+ if (recode_loop_test_global_motion(cpi)) {
+ loop = 1;
+ }
+#endif // CONFIG_GLOBAL_MOTION
+
+ if (loop) {
+ ++loop_count;
+ ++loop_at_this_size;
+
+#if CONFIG_INTERNAL_STATS
+ ++cpi->tot_recode_hits;
+#endif
+ }
+ } while (loop);
+}
+
+// Returns the mask of AOM_*_FLAG bits for the reference frames that may be
+// used by the current frame.
+//
+// Starts from AOM_REFFRAME_ALL and clears the flag of every reference whose
+// ref_frame_map entry equals that of another reference listed in the checks
+// below, so the same buffer is not searched twice. GOLDEN is also cleared
+// when no golden update is due (frames_till_gf_update_due == INT_MAX) and,
+// with CONFIG_EXT_REFS, BWDREF is cleared for frames that are not
+// bi-predictive.
+static int get_ref_frame_flags(const AV1_COMP *cpi) {
+ const int *const map = cpi->common.ref_frame_map;
+
+ // Each x_is_y flag below is 1 when references x and y map to the same
+ // entry of ref_frame_map.
+#if CONFIG_EXT_REFS
+ const int last2_is_last =
+ map[cpi->lst_fb_idxes[1]] == map[cpi->lst_fb_idxes[0]];
+ const int last3_is_last =
+ map[cpi->lst_fb_idxes[2]] == map[cpi->lst_fb_idxes[0]];
+ const int gld_is_last = map[cpi->gld_fb_idx] == map[cpi->lst_fb_idxes[0]];
+#if CONFIG_LOWDELAY_COMPOUND
+ const int alt_is_last = map[cpi->alt_fb_idx] == map[cpi->lst_fb_idxes[0]];
+ const int last3_is_last2 =
+ map[cpi->lst_fb_idxes[2]] == map[cpi->lst_fb_idxes[1]];
+ const int gld_is_last2 = map[cpi->gld_fb_idx] == map[cpi->lst_fb_idxes[1]];
+ const int gld_is_last3 = map[cpi->gld_fb_idx] == map[cpi->lst_fb_idxes[2]];
+#else
+ const int bwd_is_last = map[cpi->bwd_fb_idx] == map[cpi->lst_fb_idxes[0]];
+ const int alt_is_last = map[cpi->alt_fb_idx] == map[cpi->lst_fb_idxes[0]];
+
+ const int last3_is_last2 =
+ map[cpi->lst_fb_idxes[2]] == map[cpi->lst_fb_idxes[1]];
+ const int gld_is_last2 = map[cpi->gld_fb_idx] == map[cpi->lst_fb_idxes[1]];
+ const int bwd_is_last2 = map[cpi->bwd_fb_idx] == map[cpi->lst_fb_idxes[1]];
+
+ const int gld_is_last3 = map[cpi->gld_fb_idx] == map[cpi->lst_fb_idxes[2]];
+ const int bwd_is_last3 = map[cpi->bwd_fb_idx] == map[cpi->lst_fb_idxes[2]];
+
+ const int bwd_is_gld = map[cpi->bwd_fb_idx] == map[cpi->gld_fb_idx];
+
+#endif
+ const int last2_is_alt = map[cpi->lst_fb_idxes[1]] == map[cpi->alt_fb_idx];
+ const int last3_is_alt = map[cpi->lst_fb_idxes[2]] == map[cpi->alt_fb_idx];
+ const int gld_is_alt = map[cpi->gld_fb_idx] == map[cpi->alt_fb_idx];
+ const int bwd_is_alt = map[cpi->bwd_fb_idx] == map[cpi->alt_fb_idx];
+#else
+ const int gld_is_last = map[cpi->gld_fb_idx] == map[cpi->lst_fb_idx];
+ const int gld_is_alt = map[cpi->gld_fb_idx] == map[cpi->alt_fb_idx];
+ const int alt_is_last = map[cpi->alt_fb_idx] == map[cpi->lst_fb_idx];
+#endif // CONFIG_EXT_REFS
+
+ int flags = AOM_REFFRAME_ALL;
+
+#if CONFIG_EXT_REFS
+ // Disable the use of BWDREF_FRAME for non-bipredictive frames.
+ if (!(cpi->rc.is_bipred_frame || cpi->rc.is_last_bipred_frame ||
+ (cpi->rc.is_bwd_ref_frame && cpi->num_extra_arfs)))
+ flags &= ~AOM_BWD_FLAG;
+#endif // CONFIG_EXT_REFS
+
+ if (gld_is_last || gld_is_alt) flags &= ~AOM_GOLD_FLAG;
+
+ if (cpi->rc.frames_till_gf_update_due == INT_MAX) flags &= ~AOM_GOLD_FLAG;
+
+ if (alt_is_last) flags &= ~AOM_ALT_FLAG;
+
+#if CONFIG_EXT_REFS
+ if (last2_is_last || last2_is_alt) flags &= ~AOM_LAST2_FLAG;
+
+ if (last3_is_last || last3_is_last2 || last3_is_alt) flags &= ~AOM_LAST3_FLAG;
+
+ if (gld_is_last2 || gld_is_last3) flags &= ~AOM_GOLD_FLAG;
+
+#if CONFIG_LOWDELAY_COMPOUND // Changes LL & HL bitstream
+ /* Allow biprediction between two identical frames (e.g. bwd_is_last = 1) */
+ if (bwd_is_alt && (flags & AOM_BWD_FLAG)) flags &= ~AOM_BWD_FLAG;
+#else
+ // Without low-delay compound, BWDREF is dropped when it duplicates any
+ // other reference.
+ if ((bwd_is_last || bwd_is_last2 || bwd_is_last3 || bwd_is_gld ||
+ bwd_is_alt) &&
+ (flags & AOM_BWD_FLAG))
+ flags &= ~AOM_BWD_FLAG;
+#endif
+#endif // CONFIG_EXT_REFS
+
+ return flags;
+}
+
+// Applies the externally requested overrides (set through the
+// av1_update_reference() and av1_update_entropy() calls) in place of the
+// encoder defaults. Each override is one-shot: its pending flag is cleared
+// here, so it only affects the next frame handed to
+// encode_frame_to_data_rate().
+static void set_ext_overrides(AV1_COMP *cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+
+  if (cpi->ext_refresh_frame_context_pending) {
+    cpi->ext_refresh_frame_context_pending = 0;
+    cm->refresh_frame_context = cpi->ext_refresh_frame_context;
+  }
+
+  if (cpi->ext_refresh_frame_flags_pending) {
+    cpi->ext_refresh_frame_flags_pending = 0;
+    cpi->refresh_last_frame = cpi->ext_refresh_last_frame;
+    cpi->refresh_golden_frame = cpi->ext_refresh_golden_frame;
+    cpi->refresh_alt_ref_frame = cpi->ext_refresh_alt_ref_frame;
+  }
+}
+
+// Returns the buffer to encode from: |unscaled| itself when its luma size
+// already equals the coded size (mi_cols x mi_rows, in units of MI_SIZE),
+// otherwise |scaled| after filling it from |unscaled| with aom_scale_frame()
+// and extending its borders.
+YV12_BUFFER_CONFIG *av1_scale_if_required_fast(AV1_COMMON *cm,
+                                               YV12_BUFFER_CONFIG *unscaled,
+                                               YV12_BUFFER_CONFIG *scaled) {
+  // Sizes match: the source can be used as is.
+  if (cm->mi_cols * MI_SIZE == unscaled->y_width &&
+      cm->mi_rows * MI_SIZE == unscaled->y_height)
+    return unscaled;
+
+  // For 2x2 scaling down.
+  aom_scale_frame(unscaled, scaled, unscaled->y_buffer, 9, 2, 1, 2, 1, 0);
+  aom_extend_frame_borders(scaled);
+  return scaled;
+}
+
+// Returns the buffer to encode from: |unscaled| itself when its luma size
+// already equals the coded size (mi_cols x mi_rows, in units of MI_SIZE),
+// otherwise |scaled| after filling it through
+// scale_and_extend_frame_nonnormative().
+YV12_BUFFER_CONFIG *av1_scale_if_required(AV1_COMMON *cm,
+                                          YV12_BUFFER_CONFIG *unscaled,
+                                          YV12_BUFFER_CONFIG *scaled) {
+  // Sizes match: the source can be used as is.
+  if (cm->mi_cols * MI_SIZE == unscaled->y_width &&
+      cm->mi_rows * MI_SIZE == unscaled->y_height)
+    return unscaled;
+
+#if CONFIG_HIGHBITDEPTH
+  scale_and_extend_frame_nonnormative(unscaled, scaled, (int)cm->bit_depth);
+#else
+  scale_and_extend_frame_nonnormative(unscaled, scaled);
+#endif  // CONFIG_HIGHBITDEPTH
+  return scaled;
+}
+
+// Sets the sign bias of ALTREF_FRAME (and, with CONFIG_EXT_REFS, of
+// BWDREF_FRAME, which takes the same value) for the current frame.
+//
+// The bias is 1 only while an alt-ref source is active and either this frame
+// does not refresh the alt-ref buffer, or - where internal ARFs are
+// considered - the current gf-group entry has level GF_ARF_LOW.
+static void set_arf_sign_bias(AV1_COMP *cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+#if CONFIG_EXT_REFS
+  // The gf-group level is always consulted with extended references.
+  const int internal_arf_possible = 1;
+#else
+  const int internal_arf_possible =
+      (cpi->oxcf.pass == 2) && cpi->multi_arf_allowed;
+#endif  // CONFIG_EXT_REFS
+  // Short-circuit evaluation keeps the gf-group lookup from running unless
+  // every earlier condition requires it.
+  const int sign_bias =
+      cpi->rc.source_alt_ref_active &&
+      (!cpi->refresh_alt_ref_frame ||
+       (internal_arf_possible &&
+        gf_group->rf_level[gf_group->index] == GF_ARF_LOW));
+
+  cm->ref_frame_sign_bias[ALTREF_FRAME] = sign_bias;
+#if CONFIG_EXT_REFS
+  cm->ref_frame_sign_bias[BWDREF_FRAME] = sign_bias;
+#endif  // CONFIG_EXT_REFS
+}
+
+// Builds a bit mask (bit i set for interpolation filter i) from the filter
+// selection counts in cpi->interp_filter_selected.
+//
+// A filter's bit is set when LAST_FRAME has selections but none for that
+// filter, and every other reference either has no selections at all or
+// selected that filter for fewer than 1 in 50 of them. Returns 0 right after
+// a key frame or when the current frame refreshes the alt-ref buffer.
+static int setup_interp_filter_search_mask(AV1_COMP *cpi) {
+  InterpFilter filt;
+  MV_REFERENCE_FRAME ref_frame;
+  int totals[TOTAL_REFS_PER_FRAME] = { 0 };
+  int skip_mask = 0;
+  // Row of interp_filter_selected that holds the ALTREF_FRAME statistics.
+  int arf_slot = ALTREF_FRAME;
+
+#if CONFIG_EXT_REFS
+  // Get which arf used as ALTREF_FRAME
+  if (cpi->oxcf.pass == 2) {
+    const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+    arf_slot += gf_group->arf_ref_idx[gf_group->index];
+  }
+#endif  // CONFIG_EXT_REFS
+
+  if (cpi->common.last_frame_type == KEY_FRAME || cpi->refresh_alt_ref_frame)
+    return skip_mask;
+
+  // Total selections per reference; ALTREF_FRAME reads from |arf_slot|.
+  for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+    const int slot = (ref_frame == ALTREF_FRAME) ? arf_slot : ref_frame;
+    for (filt = EIGHTTAP_REGULAR; filt < SWITCHABLE_FILTERS; ++filt)
+      totals[ref_frame] += cpi->interp_filter_selected[slot][filt];
+  }
+
+  for (filt = EIGHTTAP_REGULAR; filt < SWITCHABLE_FILTERS; ++filt) {
+    // LAST_FRAME must have been used, and never with this filter.
+    if (!totals[LAST_FRAME]) continue;
+    if (cpi->interp_filter_selected[LAST_FRAME][filt] != 0) continue;
+#if CONFIG_EXT_REFS
+    if (totals[LAST2_FRAME] != 0 &&
+        cpi->interp_filter_selected[LAST2_FRAME][filt] * 50 >=
+            totals[LAST2_FRAME])
+      continue;
+    if (totals[LAST3_FRAME] != 0 &&
+        cpi->interp_filter_selected[LAST3_FRAME][filt] * 50 >=
+            totals[LAST3_FRAME])
+      continue;
+#endif  // CONFIG_EXT_REFS
+    if (totals[GOLDEN_FRAME] != 0 &&
+        cpi->interp_filter_selected[GOLDEN_FRAME][filt] * 50 >=
+            totals[GOLDEN_FRAME])
+      continue;
+#if CONFIG_EXT_REFS
+    if (totals[BWDREF_FRAME] != 0 &&
+        cpi->interp_filter_selected[BWDREF_FRAME][filt] * 50 >=
+            totals[BWDREF_FRAME])
+      continue;
+#endif  // CONFIG_EXT_REFS
+    if (totals[ALTREF_FRAME] != 0 &&
+        cpi->interp_filter_selected[arf_slot][filt] * 50 >=
+            totals[ALTREF_FRAME])
+      continue;
+
+    skip_mask |= 1 << filt;
+  }
+  return skip_mask;
+}
+
+#define DUMP_RECON_FRAMES 0
+
+#if DUMP_RECON_FRAMES == 1
+// NOTE(zoeliu): For debug - Output the filtered reconstructed video.
+//
+// Writes the Y, U and V planes of cm->frame_to_show to
+// /tmp/enc_filtered_recon.yuv. The file is created/truncated on frame 0 and
+// appended to on every later frame. Frames that are not shown, or for which
+// no buffer is available, are skipped with a message.
+//
+// The chroma plane size is derived from cm->subsampling_x / subsampling_y and
+// rounded up, so non-4:2:0 input and odd frame dimensions produce complete
+// planes. Write and close failures are reported instead of being ignored.
+//
+// NOTE(review): rows are written as one byte per sample straight from the
+// plane pointers; presumably this is only meaningful for 8-bit buffers -
+// confirm before using it on high bit depth streams.
+static void dump_filtered_recon_frames(AV1_COMP *cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  const YV12_BUFFER_CONFIG *recon_buf = cm->frame_to_show;
+  const int first_frame = cm->current_video_frame == 0;
+  const int uv_width = (cm->width + cm->subsampling_x) >> cm->subsampling_x;
+  const int uv_height = (cm->height + cm->subsampling_y) >> cm->subsampling_y;
+  char file_name[256] = "/tmp/enc_filtered_recon.yuv";
+  FILE *f_recon = NULL;
+  int write_ok = 1;
+  int h;
+
+  if (recon_buf == NULL || !cm->show_frame) {
+    printf("Frame %d is not ready or no show to dump.\n",
+           cm->current_video_frame);
+    return;
+  }
+
+  // Start a fresh file on the first frame, append afterwards.
+  f_recon = fopen(file_name, first_frame ? "wb" : "ab");
+  if (f_recon == NULL) {
+    printf("Unable to open file %s to %s.\n", file_name,
+           first_frame ? "write" : "append");
+    return;
+  }
+
+  printf(
+      "\nFrame=%5d, encode_update_type[%5d]=%1d, show_existing_frame=%d, "
+      "y_stride=%4d, uv_stride=%4d, width=%4d, height=%4d\n",
+      cm->current_video_frame, cpi->twopass.gf_group.index,
+      cpi->twopass.gf_group.update_type[cpi->twopass.gf_group.index],
+      cm->show_existing_frame, recon_buf->y_stride, recon_buf->uv_stride,
+      cm->width, cm->height);
+
+  // --- Y ---
+  for (h = 0; write_ok && h < cm->height; ++h) {
+    write_ok = fwrite(&recon_buf->y_buffer[h * recon_buf->y_stride], 1,
+                      cm->width, f_recon) == (size_t)cm->width;
+  }
+  // --- U ---
+  for (h = 0; write_ok && h < uv_height; ++h) {
+    write_ok = fwrite(&recon_buf->u_buffer[h * recon_buf->uv_stride], 1,
+                      uv_width, f_recon) == (size_t)uv_width;
+  }
+  // --- V ---
+  for (h = 0; write_ok && h < uv_height; ++h) {
+    write_ok = fwrite(&recon_buf->v_buffer[h * recon_buf->uv_stride], 1,
+                      uv_width, f_recon) == (size_t)uv_width;
+  }
+
+  // fclose() flushes buffered data, so its result matters for a write stream.
+  if (fclose(f_recon) != 0) write_ok = 0;
+
+  if (!write_ok) {
+    printf("Failed to write frame %d to file %s.\n", cm->current_video_frame,
+           file_name);
+  }
+}
+#endif // DUMP_RECON_FRAMES
+
+#if CONFIG_EC_ADAPT
+
+// Fills |ec_ctxs| with one pointer per tile (tile_rows * tile_cols entries),
+// each pointing at the |tctx| member of the matching cpi->tile_data entry.
+static void make_update_tile_list_enc(AV1_COMP *cpi, const int tile_rows,
+                                      const int tile_cols,
+                                      FRAME_CONTEXT *ec_ctxs[]) {
+  const int num_tiles = tile_rows * tile_cols;
+  int tile = 0;
+  while (tile < num_tiles) {
+    ec_ctxs[tile] = &cpi->tile_data[tile].tctx;
+    ++tile;
+  }
+}
+
+#endif
+static void encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size,
+ uint8_t *dest, int skip_adapt,
+ unsigned int *frame_flags) {
+ AV1_COMMON *const cm = &cpi->common;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ struct segmentation *const seg = &cm->seg;
+ TX_SIZE t;
+#if CONFIG_EC_ADAPT
+ FRAME_CONTEXT **tile_ctxs = aom_malloc(cm->tile_rows * cm->tile_cols *
+ sizeof(&cpi->tile_data[0].tctx));
+ aom_cdf_prob **cdf_ptrs =
+ aom_malloc(cm->tile_rows * cm->tile_cols *
+ sizeof(&cpi->tile_data[0].tctx.partition_cdf[0][0]));
+#endif
+#if CONFIG_XIPHRC
+ int frame_type;
+ int drop_this_frame = 0;
+#endif // CONFIG_XIPHRC
+ set_ext_overrides(cpi);
+ aom_clear_system_state();
+
+ // Set the arf sign bias for this frame.
+ set_arf_sign_bias(cpi);
+#if CONFIG_TEMPMV_SIGNALING
+ // frame type has been decided outside of this function call
+ cm->cur_frame->intra_only = cm->frame_type == KEY_FRAME || cm->intra_only;
+ cm->use_prev_frame_mvs =
+ !cpi->oxcf.disable_tempmv && !cm->cur_frame->intra_only;
+#endif
+
+#if CONFIG_EXT_REFS
+ // NOTE:
+ // (1) Move the setup of the ref_frame_flags upfront as it would be
+ // determined by the current frame properties;
+ // (2) The setup of the ref_frame_flags applies to both show_existing_frame's
+ // and the other cases.
+ if (cm->current_video_frame > 0)
+ cpi->ref_frame_flags = get_ref_frame_flags(cpi);
+
+ if (cm->show_existing_frame) {
+ // NOTE(zoeliu): In BIDIR_PRED, the existing frame to show is the current
+ // BWDREF_FRAME in the reference frame buffer.
+ cm->frame_type = INTER_FRAME;
+ cm->show_frame = 1;
+ cpi->frame_flags = *frame_flags;
+
+ // In the case of show_existing frame, we will not send fresh flag
+ // to decoder. Any change in the reference frame buffer can be done by
+ // switching the virtual indices.
+
+ cpi->refresh_last_frame = 0;
+ cpi->refresh_golden_frame = 0;
+ cpi->refresh_bwd_ref_frame = 0;
+ cpi->refresh_alt_ref_frame = 0;
+
+ cpi->rc.is_bwd_ref_frame = 0;
+ cpi->rc.is_last_bipred_frame = 0;
+ cpi->rc.is_bipred_frame = 0;
+
+ // Build the bitstream
+ av1_pack_bitstream(cpi, dest, size);
+
+ // Set up frame to show to get ready for stats collection.
+ cm->frame_to_show = get_frame_new_buffer(cm);
+
+#if DUMP_RECON_FRAMES == 1
+ // NOTE(zoeliu): For debug - Output the filtered reconstructed video.
+ dump_filtered_recon_frames(cpi);
+#endif // DUMP_RECON_FRAMES
+
+ // Update the LAST_FRAME in the reference frame buffer.
+ av1_update_reference_frames(cpi);
+
+ // Update frame flags
+ cpi->frame_flags &= ~FRAMEFLAGS_GOLDEN;
+ cpi->frame_flags &= ~FRAMEFLAGS_BWDREF;
+ cpi->frame_flags &= ~FRAMEFLAGS_ALTREF;
+
+ *frame_flags = cpi->frame_flags & ~FRAMEFLAGS_KEY;
+
+ // Update the frame type
+ cm->last_frame_type = cm->frame_type;
+
+ // Since we allocate a spot for the OVERLAY frame in the gf group, we need
+ // to do post-encoding update accordingly.
+ if (cpi->rc.is_src_frame_alt_ref) {
+ av1_set_target_rate(cpi);
+#if CONFIG_XIPHRC
+ frame_type = cm->frame_type == INTER_FRAME ? OD_P_FRAME : OD_I_FRAME;
+ drop_this_frame = od_enc_rc_update_state(
+ &cpi->od_rc, *size << 3, cpi->refresh_golden_frame,
+ cpi->refresh_alt_ref_frame, frame_type, cpi->droppable);
+#else
+ av1_rc_postencode_update(cpi, *size);
+#endif
+ }
+
+ cm->last_width = cm->width;
+ cm->last_height = cm->height;
+
+ ++cm->current_video_frame;
+
+#if CONFIG_EC_ADAPT
+ aom_free(tile_ctxs);
+ aom_free(cdf_ptrs);
+#endif
+ return;
+ }
+#endif // CONFIG_EXT_REFS
+
+ // Set default state for segment based loop filter update flags.
+ cm->lf.mode_ref_delta_update = 0;
+
+ if (cpi->oxcf.pass == 2 && cpi->sf.adaptive_interp_filter_search)
+ cpi->sf.interp_filter_search_mask = setup_interp_filter_search_mask(cpi);
+
+ // Set various flags etc to special state if it is a key frame.
+ if (frame_is_intra_only(cm)) {
+ // Reset the loop filter deltas and segmentation map.
+ av1_reset_segment_features(cm);
+
+ // If segmentation is enabled force a map update for key frames.
+ if (seg->enabled) {
+ seg->update_map = 1;
+ seg->update_data = 1;
+ }
+
+ // The alternate reference frame cannot be active for a key frame.
+ cpi->rc.source_alt_ref_active = 0;
+
+ cm->error_resilient_mode = oxcf->error_resilient_mode;
+
+ // By default, encoder assumes decoder can use prev_mi.
+ if (cm->error_resilient_mode) {
+ cm->reset_frame_context = RESET_FRAME_CONTEXT_NONE;
+ cm->refresh_frame_context = REFRESH_FRAME_CONTEXT_FORWARD;
+ } else if (cm->intra_only) {
+ // Only reset the current context.
+ cm->reset_frame_context = RESET_FRAME_CONTEXT_CURRENT;
+ }
+ }
+#if CONFIG_TILE_GROUPS
+ if (cpi->oxcf.mtu == 0) {
+ cm->num_tg = cpi->oxcf.num_tile_groups;
+ } else {
+ // Use a default value for the purposes of weighting costs in probability
+ // updates
+ cm->num_tg = DEFAULT_MAX_NUM_TG;
+ }
+#endif
+
+#if CONFIG_EXT_TILE
+ cm->tile_encoding_mode = cpi->oxcf.tile_encoding_mode;
+#endif // CONFIG_EXT_TILE
+
+#if CONFIG_XIPHRC
+ if (drop_this_frame) {
+ av1_rc_postencode_update_drop_frame(cpi);
+ ++cm->current_video_frame;
+#if CONFIG_EC_ADAPT
+ aom_free(tile_ctxs);
+ aom_free(cdf_ptrs);
+#endif
+ return;
+ }
+#else
+ // For 1 pass CBR, check if we are dropping this frame.
+ // Never drop on key frame.
+ if (oxcf->pass == 0 && oxcf->rc_mode == AOM_CBR &&
+ cm->frame_type != KEY_FRAME) {
+ if (av1_rc_drop_frame(cpi)) {
+ av1_rc_postencode_update_drop_frame(cpi);
+ ++cm->current_video_frame;
+#if CONFIG_EC_ADAPT
+ aom_free(tile_ctxs);
+ aom_free(cdf_ptrs);
+#endif
+ return;
+ }
+ }
+#endif
+
+ aom_clear_system_state();
+
+#if CONFIG_INTERNAL_STATS
+ memset(cpi->mode_chosen_counts, 0,
+ MAX_MODES * sizeof(*cpi->mode_chosen_counts));
+#endif
+
+#if CONFIG_REFERENCE_BUFFER
+ {
+ /* Non-normative definition of current_frame_id ("frame counter" with
+ * wraparound) */
+ const int frame_id_length = FRAME_ID_LENGTH_MINUS7 + 7;
+ if (cm->current_frame_id == -1) {
+ int lsb, msb;
+/* quasi-random initialization of current_frame_id for a key frame */
+#if CONFIG_HIGHBITDEPTH
+ if (cpi->source->flags & YV12_FLAG_HIGHBITDEPTH) {
+ lsb = CONVERT_TO_SHORTPTR(cpi->source->y_buffer)[0] & 0xff;
+ msb = CONVERT_TO_SHORTPTR(cpi->source->y_buffer)[1] & 0xff;
+ } else {
+#endif
+ lsb = cpi->source->y_buffer[0] & 0xff;
+ msb = cpi->source->y_buffer[1] & 0xff;
+#if CONFIG_HIGHBITDEPTH
+ }
+#endif
+ cm->current_frame_id = ((msb << 8) + lsb) % (1 << frame_id_length);
+ } else {
+ cm->current_frame_id =
+ (cm->current_frame_id + 1 + (1 << frame_id_length)) %
+ (1 << frame_id_length);
+ }
+ }
+#endif
+
+#if CONFIG_EXT_DELTA_Q
+ cm->delta_q_present_flag = cpi->oxcf.deltaq_mode != NO_DELTA_Q;
+ cm->delta_lf_present_flag = cpi->oxcf.deltaq_mode == DELTA_Q_LF;
+#endif
+
+ if (cpi->sf.recode_loop == DISALLOW_RECODE) {
+ encode_without_recode_loop(cpi);
+ } else {
+ encode_with_recode_loop(cpi, size, dest);
+ }
+
+#ifdef OUTPUT_YUV_SKINMAP
+ if (cpi->common.current_video_frame > 1) {
+ av1_compute_skin_map(cpi, yuv_skinmap_file);
+ }
+#endif // OUTPUT_YUV_SKINMAP
+
+ // Special case code to reduce pulsing when key frames are forced at a
+ // fixed interval. Note the reconstruction error if it is the frame before
+ // the force key frame
+ if (cpi->rc.next_key_frame_forced && cpi->rc.frames_to_key == 1) {
+#if CONFIG_HIGHBITDEPTH
+ if (cm->use_highbitdepth) {
+ cpi->ambient_err =
+ aom_highbd_get_y_sse(cpi->source, get_frame_new_buffer(cm));
+ } else {
+ cpi->ambient_err = aom_get_y_sse(cpi->source, get_frame_new_buffer(cm));
+ }
+#else
+ cpi->ambient_err = aom_get_y_sse(cpi->source, get_frame_new_buffer(cm));
+#endif // CONFIG_HIGHBITDEPTH
+ }
+
+ // If the encoder forced a KEY_FRAME decision
+ if (cm->frame_type == KEY_FRAME) {
+ cpi->refresh_last_frame = 1;
+ }
+
+ cm->frame_to_show = get_frame_new_buffer(cm);
+ cm->frame_to_show->color_space = cm->color_space;
+ cm->frame_to_show->color_range = cm->color_range;
+ cm->frame_to_show->render_width = cm->render_width;
+ cm->frame_to_show->render_height = cm->render_height;
+
+#if CONFIG_EXT_REFS
+// TODO(zoeliu): For non-ref frames, loop filtering may need to be turned
+// off.
+#endif // CONFIG_EXT_REFS
+
+ // Pick the loop filter level for the frame.
+ loopfilter_frame(cpi, cm);
+
+ // Build the bitstream
+ av1_pack_bitstream(cpi, dest, size);
+
+ if (skip_adapt) {
+#if CONFIG_EC_ADAPT
+ aom_free(tile_ctxs);
+ aom_free(cdf_ptrs);
+#endif
+ return;
+ }
+
+#if CONFIG_REFERENCE_BUFFER
+ {
+ int i;
+ /* Update reference frame id values based on the value of refresh_mask */
+ for (i = 0; i < REF_FRAMES; i++) {
+ if ((cm->refresh_mask >> i) & 1) {
+ cm->ref_frame_id[i] = cm->current_frame_id;
+ }
+ }
+ }
+#endif
+
+#if DUMP_RECON_FRAMES == 1
+ // NOTE(zoeliu): For debug - Output the filtered reconstructed video.
+ if (cm->show_frame) dump_filtered_recon_frames(cpi);
+#endif // DUMP_RECON_FRAMES
+
+ if (cm->seg.update_map) update_reference_segmentation_map(cpi);
+
+ if (frame_is_intra_only(cm) == 0) {
+ release_scaled_references(cpi);
+ }
+
+ av1_update_reference_frames(cpi);
+
+ for (t = 0; t < TX_SIZES; t++)
+ av1_full_to_model_counts(cpi->td.counts->coef[t],
+ cpi->td.rd_counts.coef_counts[t]);
+#if CONFIG_ENTROPY_STATS
+ av1_accumulate_frame_counts(&aggregate_fc, &cm->counts);
+#endif // CONFIG_ENTROPY_STATS
+ if (cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) {
+#if CONFIG_SUBFRAME_PROB_UPDATE
+ cm->partial_prob_update = 0;
+#endif // CONFIG_SUBFRAME_PROB_UPDATE
+ av1_adapt_coef_probs(cm);
+ av1_adapt_intra_frame_probs(cm);
+#if CONFIG_EC_ADAPT
+ make_update_tile_list_enc(cpi, cm->tile_rows, cm->tile_cols, tile_ctxs);
+ av1_average_tile_coef_cdfs(cpi->common.fc, tile_ctxs, cdf_ptrs,
+ cm->tile_rows * cm->tile_cols);
+ av1_average_tile_intra_cdfs(cpi->common.fc, tile_ctxs, cdf_ptrs,
+ cm->tile_rows * cm->tile_cols);
+#if CONFIG_PVQ
+ av1_average_tile_pvq_cdfs(cpi->common.fc, tile_ctxs,
+ cm->tile_rows * cm->tile_cols);
+#endif // CONFIG_PVQ
+#endif // CONFIG_EC_ADAPT
+#if CONFIG_ADAPT_SCAN
+ av1_adapt_scan_order(cm);
+#endif // CONFIG_ADAPT_SCAN
+ }
+
+ if (!frame_is_intra_only(cm)) {
+ if (cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) {
+ av1_adapt_inter_frame_probs(cm);
+ av1_adapt_mv_probs(cm, cm->allow_high_precision_mv);
+#if CONFIG_EC_ADAPT
+ av1_average_tile_inter_cdfs(&cpi->common, cpi->common.fc, tile_ctxs,
+ cdf_ptrs, cm->tile_rows * cm->tile_cols);
+ av1_average_tile_mv_cdfs(cpi->common.fc, tile_ctxs, cdf_ptrs,
+ cm->tile_rows * cm->tile_cols);
+#endif
+ }
+ }
+
+ if (cpi->refresh_golden_frame == 1)
+ cpi->frame_flags |= FRAMEFLAGS_GOLDEN;
+ else
+ cpi->frame_flags &= ~FRAMEFLAGS_GOLDEN;
+
+ if (cpi->refresh_alt_ref_frame == 1)
+ cpi->frame_flags |= FRAMEFLAGS_ALTREF;
+ else
+ cpi->frame_flags &= ~FRAMEFLAGS_ALTREF;
+
+#if CONFIG_EXT_REFS
+ if (cpi->refresh_bwd_ref_frame == 1)
+ cpi->frame_flags |= FRAMEFLAGS_BWDREF;
+ else
+ cpi->frame_flags &= ~FRAMEFLAGS_BWDREF;
+#endif // CONFIG_EXT_REFS
+
+#if !CONFIG_EXT_REFS
+ cpi->ref_frame_flags = get_ref_frame_flags(cpi);
+#endif // !CONFIG_EXT_REFS
+
+ cm->last_frame_type = cm->frame_type;
+
+#if CONFIG_XIPHRC
+ frame_type = cm->frame_type == KEY_FRAME ? OD_I_FRAME : OD_P_FRAME;
+
+ drop_this_frame =
+ od_enc_rc_update_state(&cpi->od_rc, *size << 3, cpi->refresh_golden_frame,
+ cpi->refresh_alt_ref_frame, frame_type, 0);
+ if (drop_this_frame) {
+ av1_rc_postencode_update_drop_frame(cpi);
+ ++cm->current_video_frame;
+#if CONFIG_EC_ADAPT
+ aom_free(tile_ctxs);
+ aom_free(cdf_ptrs);
+#endif
+ return;
+ }
+#else // !CONFIG_XIPHRC
+ av1_rc_postencode_update(cpi, *size);
+#endif // CONFIG_XIPHRC
+
+#if 0
+ output_frame_level_debug_stats(cpi);
+#endif
+
+ if (cm->frame_type == KEY_FRAME) {
+ // Tell the caller that the frame was coded as a key frame
+ *frame_flags = cpi->frame_flags | FRAMEFLAGS_KEY;
+ } else {
+ *frame_flags = cpi->frame_flags & ~FRAMEFLAGS_KEY;
+ }
+
+ // Clear the one shot update flags for segmentation map and mode/ref loop
+ // filter deltas.
+ cm->seg.update_map = 0;
+ cm->seg.update_data = 0;
+ cm->lf.mode_ref_delta_update = 0;
+
+ // keep track of the last coded dimensions
+ cm->last_width = cm->width;
+ cm->last_height = cm->height;
+
+ // reset to normal state now that we are done.
+ if (!cm->show_existing_frame) cm->last_show_frame = cm->show_frame;
+
+ if (cm->show_frame) {
+#if CONFIG_EXT_REFS
+// TODO(zoeliu): We may only swap mi and prev_mi for those frames that are
+// being used as reference.
+#endif // CONFIG_EXT_REFS
+ av1_swap_mi_and_prev_mi(cm);
+ // Don't increment frame counters if this was an altref buffer
+ // update not a real frame
+ ++cm->current_video_frame;
+ }
+
+#if CONFIG_EXT_REFS
+ // NOTE: Shall not refer to any frame not used as reference.
+ if (cm->is_reference_frame)
+#endif // CONFIG_EXT_REFS
+ cm->prev_frame = cm->cur_frame;
+#if CONFIG_EC_ADAPT
+ aom_free(tile_ctxs);
+ aom_free(cdf_ptrs);
+#endif
+}
+
+ // Encodes one frame through the single-pass entry point.
+ //
+ // With CONFIG_XIPHRC the frame type and the golden / alt-ref refresh
+ // decisions are taken from od_frame_type(); otherwise the one-pass CBR or
+ // VBR rate-control parameters are fetched. In both cases the frame is then
+ // handed to encode_frame_to_data_rate().
+ //
+ //   cpi          encoder instance
+ //   size, dest   forwarded unchanged to encode_frame_to_data_rate()
+ //   skip_adapt   forwarded unchanged to encode_frame_to_data_rate()
+ //   frame_flags  forwarded unchanged to encode_frame_to_data_rate()
+ static void Pass0Encode(AV1_COMP *cpi, size_t *size, uint8_t *dest,
+                         int skip_adapt, unsigned int *frame_flags) {
+#if CONFIG_XIPHRC
+   int64_t ip_count;
+   int frame_type, is_golden, is_altref;
+
+   /* Not updated during init so update it here */
+   if (cpi->oxcf.rc_mode == AOM_Q) cpi->od_rc.quality = cpi->oxcf.cq_level;
+
+   frame_type = od_frame_type(&cpi->od_rc, cpi->od_rc.cur_frame, &is_golden,
+                              &is_altref, &ip_count);
+
+   // Map the rate controller's frame type onto the codec's frame type.
+   // Flag bits are raised with |=: an &= with a single-bit mask can only
+   // clear bits, so it never marked the frame as key / golden.
+   if (frame_type == OD_I_FRAME) {
+     frame_type = KEY_FRAME;
+     cpi->frame_flags |= FRAMEFLAGS_KEY;
+   } else if (frame_type == OD_P_FRAME) {
+     frame_type = INTER_FRAME;
+   }
+
+   if (is_altref) {
+     cpi->refresh_alt_ref_frame = 1;
+     cpi->rc.source_alt_ref_active = 1;
+   }
+
+   cpi->refresh_golden_frame = is_golden;
+   cpi->common.frame_type = frame_type;
+   if (is_golden) cpi->frame_flags |= FRAMEFLAGS_GOLDEN;
+#else
+   if (cpi->oxcf.rc_mode == AOM_CBR) {
+     av1_rc_get_one_pass_cbr_params(cpi);
+   } else {
+     av1_rc_get_one_pass_vbr_params(cpi);
+   }
+#endif
+   encode_frame_to_data_rate(cpi, size, dest, skip_adapt, frame_flags);
+}
+
+#if !CONFIG_XIPHRC
+ // Second-pass encode of one frame followed by the two-pass bookkeeping.
+ // The frame encoder is always invoked with skip_adapt == 0.
+ static void Pass2Encode(AV1_COMP *cpi, size_t *size, uint8_t *dest,
+                         unsigned int *frame_flags) {
+   encode_frame_to_data_rate(cpi, size, dest, 0, frame_flags);
+
+#if !CONFIG_EXT_REFS
+   av1_twopass_postencode_update(cpi);
+#else
+   // Frames without a slot in the gf group skip the post-encode update. An
+   // OVERLAY frame always owns a slot, even when it is emitted through
+   // show_existing_frame, so it is still updated.
+   {
+     const int shown_existing = cpi->common.show_existing_frame;
+     const int is_overlay = cpi->rc.is_src_frame_alt_ref;
+     if (is_overlay || !shown_existing) av1_twopass_postencode_update(cpi);
+   }
+   check_show_existing_frame(cpi);
+#endif  // !CONFIG_EXT_REFS
+}
+#endif
+
+ // Marks the new-frame index and every reference map slot as invalid, and
+ // zeroes the reference count of the first REF_FRAMES pool buffers.
+ static void init_ref_frame_bufs(AV1_COMMON *cm) {
+   BufferPool *const frame_pool = cm->buffer_pool;
+   int slot = 0;
+
+   cm->new_fb_idx = INVALID_IDX;
+   while (slot < REF_FRAMES) {
+     cm->ref_frame_map[slot] = INVALID_IDX;
+     frame_pool->frame_bufs[slot].ref_count = 0;
+     ++slot;
+   }
+}
+
+ // (Re)allocates the raw, reference and utility frame buffers the first time
+ // it is called (initial_width still zero) or whenever the chroma subsampling
+ // (or, with CONFIG_HIGHBITDEPTH, the bit-depth flag) differs from what the
+ // common state currently records. Otherwise it does nothing.
+ static void check_initial_width(AV1_COMP *cpi,
+#if CONFIG_HIGHBITDEPTH
+                                 int use_highbitdepth,
+#endif
+                                 int subsampling_x, int subsampling_y) {
+   AV1_COMMON *const cm = &cpi->common;
+   int format_changed = cm->subsampling_x != subsampling_x ||
+                        cm->subsampling_y != subsampling_y;
+#if CONFIG_HIGHBITDEPTH
+   format_changed = format_changed || cm->use_highbitdepth != use_highbitdepth;
+#endif
+
+   // Already initialised with a matching format: nothing to do.
+   if (cpi->initial_width && !format_changed) return;
+
+#if CONFIG_HIGHBITDEPTH
+   cm->use_highbitdepth = use_highbitdepth;
+#endif
+   cm->subsampling_x = subsampling_x;
+   cm->subsampling_y = subsampling_y;
+
+   alloc_raw_frame_buffers(cpi);
+   init_ref_frame_bufs(cm);
+   alloc_util_frame_buffers(cpi);
+
+   init_motion_estimation(cpi);  // TODO(agrange) This can be removed.
+
+   // Remember the dimensions the buffers were sized for.
+   cpi->initial_width = cm->width;
+   cpi->initial_height = cm->height;
+   cpi->initial_mbs = cm->MBs;
+}
+
+ // Queues a raw input frame in the lookahead buffer and checks that its
+ // chroma format agrees with the configured profile.
+ // Returns 0 on success, -1 if the push failed or the format is rejected
+ // (the rejection is also reported through aom_internal_error()).
+ int av1_receive_raw_frame(AV1_COMP *cpi, aom_enc_frame_flags_t frame_flags,
+                           YV12_BUFFER_CONFIG *sd, int64_t time_stamp,
+                           int64_t end_time) {
+   AV1_COMMON *const cm = &cpi->common;
+   const int subsampling_x = sd->subsampling_x;
+   const int subsampling_y = sd->subsampling_y;
+   const int is_420 = (subsampling_x == 1 && subsampling_y == 1);
+#if CONFIG_HIGHBITDEPTH
+   const int use_highbitdepth = (sd->flags & YV12_FLAG_HIGHBITDEPTH) != 0;
+#endif
+   struct aom_usec_timer push_timer;
+   int status = 0;
+
+#if CONFIG_HIGHBITDEPTH
+   check_initial_width(cpi, use_highbitdepth, subsampling_x, subsampling_y);
+#else
+   check_initial_width(cpi, subsampling_x, subsampling_y);
+#endif  // CONFIG_HIGHBITDEPTH
+
+   // Time the lookahead push and add it to the receive-data total.
+   aom_usec_timer_start(&push_timer);
+   if (av1_lookahead_push(cpi->lookahead, sd, time_stamp, end_time,
+#if CONFIG_HIGHBITDEPTH
+                          use_highbitdepth,
+#endif  // CONFIG_HIGHBITDEPTH
+                          frame_flags) != 0) {
+     status = -1;
+   }
+   aom_usec_timer_mark(&push_timer);
+   cpi->time_receive_data += aom_usec_timer_elapsed(&push_timer);
+
+   // Profiles 0 and 2 accept only 4:2:0 input.
+   if (!is_420 && (cm->profile == PROFILE_0 || cm->profile == PROFILE_2)) {
+     aom_internal_error(&cm->error, AOM_CODEC_INVALID_PARAM,
+                        "Non-4:2:0 color format requires profile 1 or 3");
+     status = -1;
+   }
+   // Profiles 1 and 3 accept anything but 4:2:0 input.
+   if (is_420 && (cm->profile == PROFILE_1 || cm->profile == PROFILE_3)) {
+     aom_internal_error(&cm->error, AOM_CODEC_INVALID_PARAM,
+                        "4:2:0 color format requires profile 0 or 2");
+     status = -1;
+   }
+
+   return status;
+}
+
+ // Returns 1 when the current frame must be kept by a decoder: it is a key
+ // frame, refreshes any reference buffer, error-resilient mode is off, or it
+ // carries loop-filter delta / segmentation updates. Returns 0 otherwise.
+ static int frame_is_reference(const AV1_COMP *cpi) {
+   const AV1_COMMON *cm = &cpi->common;
+
+   if (cm->frame_type == KEY_FRAME) return 1;
+   if (cpi->refresh_last_frame || cpi->refresh_golden_frame) return 1;
+#if CONFIG_EXT_REFS
+   if (cpi->refresh_bwd_ref_frame) return 1;
+#endif  // CONFIG_EXT_REFS
+   if (cpi->refresh_alt_ref_frame) return 1;
+   if (!cm->error_resilient_mode) return 1;
+   if (cm->lf.mode_ref_delta_update) return 1;
+   return cm->seg.update_map || cm->seg.update_data;
+}
+
+ // Updates the encoder's frame rate estimate from the timestamps of `source`
+ // and records those timestamps as the last ones seen.
+ static void adjust_frame_rate(AV1_COMP *cpi,
+                               const struct lookahead_entry *source) {
+   const int is_first_frame = (source->ts_start == cpi->first_time_stamp_ever);
+   int64_t this_duration;
+   int step = 0;
+
+   if (!is_first_frame) {
+     const int64_t prev_duration =
+         cpi->last_end_time_stamp_seen - cpi->last_time_stamp_seen;
+
+     this_duration = source->ts_end - cpi->last_end_time_stamp_seen;
+
+     // do a step update if the duration changes by 10%
+     if (prev_duration != 0)
+       step = (int)((this_duration - prev_duration) * 10 / prev_duration);
+   } else {
+     // Very first frame: take its own duration and force a step update.
+     this_duration = source->ts_end - source->ts_start;
+     step = 1;
+   }
+
+   if (this_duration != 0) {
+     if (!step) {
+       // Average this frame's rate into the last second's average
+       // frame rate. If we haven't seen 1 second yet, then average
+       // over the whole interval seen.
+       const double interval = AOMMIN(
+           (double)(source->ts_end - cpi->first_time_stamp_ever), 10000000.0);
+       double avg_duration = 10000000.0 / cpi->framerate;
+       avg_duration *= (interval - avg_duration + this_duration);
+       avg_duration /= interval;
+
+       av1_new_framerate(cpi, 10000000.0 / avg_duration);
+     } else {
+       av1_new_framerate(cpi, 10000000.0 / this_duration);
+     }
+   }
+
+   cpi->last_time_stamp_seen = source->ts_start;
+   cpi->last_end_time_stamp_seen = source->ts_end;
+}
+
+// Returns the offset of the source frame used as the arf midpoint, or 0 when
+// the current frame is not an alt ref (or alt refs are disabled).
+ static int get_arf_src_index(AV1_COMP *cpi) {
+   RATE_CONTROL *const rc = &cpi->rc;
+
+   if (!is_altref_enabled(cpi)) return 0;
+
+   if (cpi->oxcf.pass == 2) {
+     // Two-pass: the gf group says whether this slot is an ARF update.
+     const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+     if (gf_group->update_type[gf_group->index] != ARF_UPDATE) return 0;
+     return gf_group->arf_src_offset[gf_group->index];
+   }
+
+   // Otherwise an ARF is due only when one is pending.
+   if (!rc->source_alt_ref_pending) return 0;
+   return rc->frames_till_gf_update_due;
+}
+
+#if CONFIG_EXT_REFS
+ // Returns the lookahead offset of the source frame for a backward reference
+ // frame, or 0 when no backward reference frame is to be coded now.
+ static int get_brf_src_index(AV1_COMP *cpi) {
+   const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+
+   // TODO(zoeliu): We need to add the check on the -bwd_ref command line setup
+   //               flag.
+   if (!gf_group->bidir_pred_enabled[gf_group->index]) return 0;
+
+   if (cpi->oxcf.pass != 2) {
+     // TODO(zoeliu): To re-visit the setup for this scenario
+     return cpi->rc.bipred_group_interval - 1;
+   }
+
+   if (gf_group->update_type[gf_group->index] == BRF_UPDATE)
+     return gf_group->brf_src_offset[gf_group->index];
+
+   return 0;
+}
+#endif // CONFIG_EXT_REFS
+
+ // Decides whether `source` is the overlay of a previously coded ARF and, if
+ // so, clears the pending alt-ref source and disables the LAST refresh.
+ static void check_src_altref(AV1_COMP *cpi,
+                              const struct lookahead_entry *source) {
+   RATE_CONTROL *const rc = &cpi->rc;
+
+   // If pass == 2, the parameters set here will be reset in
+   // av1_rc_get_second_pass_params()
+
+   if (cpi->oxcf.pass != 2) {
+     rc->is_src_frame_alt_ref =
+         cpi->alt_ref_source && (source == cpi->alt_ref_source);
+   } else {
+     const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+     rc->is_src_frame_alt_ref =
+#if CONFIG_EXT_REFS
+         (gf_group->update_type[gf_group->index] == INTNL_OVERLAY_UPDATE) ||
+#endif  // CONFIG_EXT_REFS
+         (gf_group->update_type[gf_group->index] == OVERLAY_UPDATE);
+   }
+
+   if (!rc->is_src_frame_alt_ref) return;
+
+   // Current frame is an ARF overlay frame.
+   cpi->alt_ref_source = NULL;
+
+   // Don't refresh the last buffer for an ARF overlay frame. It will
+   // become the GF so preserve last as an alternative prediction option.
+   cpi->refresh_last_frame = 0;
+}
+
+#if CONFIG_INTERNAL_STATS
+extern double av1_get_blockiness(const unsigned char *img1, int img1_pitch,
+ const unsigned char *img2, int img2_pitch,
+ int width, int height);
+
+ // Adds one frame's per-plane and combined metric values to the running
+ // totals in `s`, and lowers s->worst if `all` is below it.
+ static void adjust_image_stat(double y, double u, double v, double all,
+                               ImageStat *s) {
+   s->worst = AOMMIN(s->worst, all);
+
+   s->stat[ALL] += all;
+   s->stat[V] += v;
+   s->stat[U] += u;
+   s->stat[Y] += y;
+}
+ /* Accumulates per-frame quality statistics into cpi. Does nothing unless
+  * cm->show_frame is set. For a shown frame it compares cpi->source with
+  * cm->frame_to_show and, depending on the cpi->b_calculate_* switches,
+  * accumulates PSNR, SSIM, blockiness and consistency; fast-SSIM and PSNR-HVS
+  * are always accumulated.
+  * NOTE(review): the consistency block sits inside the b_calculate_blockiness
+  * block, and `samples` is only assigned in the PSNR block - confirm both are
+  * intended.
+  */
+ static void compute_internal_stats(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ double samples = 0.0;  // stays 0.0 unless the PSNR block below runs
+ uint32_t in_bit_depth = 8;  // defaults used when high bit depth is off
+ uint32_t bit_depth = 8;
+
+#if CONFIG_HIGHBITDEPTH
+ if (cm->use_highbitdepth) {
+ in_bit_depth = cpi->oxcf.input_bit_depth;
+ bit_depth = cm->bit_depth;
+ }
+#endif
+ if (cm->show_frame) {
+ const YV12_BUFFER_CONFIG *orig = cpi->source;
+ const YV12_BUFFER_CONFIG *recon = cpi->common.frame_to_show;
+ double y, u, v, frame_all;
+
+ cpi->count++;  // number of shown frames measured
+ if (cpi->b_calculate_psnr) {
+ PSNR_STATS psnr;
+ double frame_ssim2 = 0.0, weight = 0.0;
+ aom_clear_system_state();
+// TODO(yaowu): unify these two versions into one.
+#if CONFIG_HIGHBITDEPTH
+ aom_calc_highbd_psnr(orig, recon, &psnr, bit_depth, in_bit_depth);
+#else
+ aom_calc_psnr(orig, recon, &psnr);
+#endif // CONFIG_HIGHBITDEPTH
+
+ adjust_image_stat(psnr.psnr[1], psnr.psnr[2], psnr.psnr[3], psnr.psnr[0],
+ &cpi->psnr);  // passed as y, u, v, all: indices 1..3 then 0
+ cpi->total_sq_error += psnr.sse[0];
+ cpi->total_samples += psnr.samples[0];
+ samples = psnr.samples[0];  // reused by the consistency block below
+// TODO(yaowu): unify these two versions into one.
+#if CONFIG_HIGHBITDEPTH
+ if (cm->use_highbitdepth)
+ frame_ssim2 =
+ aom_highbd_calc_ssim(orig, recon, &weight, bit_depth, in_bit_depth);
+ else
+ frame_ssim2 = aom_calc_ssim(orig, recon, &weight);
+#else
+ frame_ssim2 = aom_calc_ssim(orig, recon, &weight);
+#endif // CONFIG_HIGHBITDEPTH
+
+ cpi->worst_ssim = AOMMIN(cpi->worst_ssim, frame_ssim2);
+ cpi->summed_quality += frame_ssim2 * weight;  // weighted SSIM sum
+ cpi->summed_weights += weight;
+
+#if 0
+ {
+ FILE *f = fopen("q_used.stt", "a");
+ fprintf(f, "%5d : Y%f7.3:U%f7.3:V%f7.3:F%f7.3:S%7.3f\n",
+ cpi->common.current_video_frame, y2, u2, v2,
+ frame_psnr2, frame_ssim2);
+ fclose(f);
+ }
+#endif
+ }
+ if (cpi->b_calculate_blockiness) {
+#if CONFIG_HIGHBITDEPTH
+ if (!cm->use_highbitdepth)  // blockiness is skipped for high bit depth
+#endif
+ {
+ const double frame_blockiness =
+ av1_get_blockiness(orig->y_buffer, orig->y_stride, recon->y_buffer,
+ recon->y_stride, orig->y_width, orig->y_height);
+ cpi->worst_blockiness = AOMMAX(cpi->worst_blockiness, frame_blockiness);
+ cpi->total_blockiness += frame_blockiness;
+ }
+
+ if (cpi->b_calculate_consistency) {
+#if CONFIG_HIGHBITDEPTH
+ if (!cm->use_highbitdepth)  // consistency is skipped for high bit depth
+#endif
+ {
+ const double this_inconsistency = aom_get_ssim_metrics(
+ orig->y_buffer, orig->y_stride, recon->y_buffer, recon->y_stride,
+ orig->y_width, orig->y_height, cpi->ssim_vars, &cpi->metrics, 1);
+
+ const double peak = (double)((1 << in_bit_depth) - 1);
+ const double consistency =
+ aom_sse_to_psnr(samples, peak, cpi->total_inconsistency);  // NOTE(review): uses the total before this frame is added
+ if (consistency > 0.0)
+ cpi->worst_consistency =
+ AOMMIN(cpi->worst_consistency, consistency);
+ cpi->total_inconsistency += this_inconsistency;
+ }
+ }
+ }
+
+ frame_all =
+ aom_calc_fastssim(orig, recon, &y, &u, &v, bit_depth, in_bit_depth);
+ adjust_image_stat(y, u, v, frame_all, &cpi->fastssim);
+ frame_all = aom_psnrhvs(orig, recon, &y, &u, &v, bit_depth, in_bit_depth);
+ adjust_image_stat(y, u, v, frame_all, &cpi->psnrhvs);
+ }
+}
+#endif // CONFIG_INTERNAL_STATS
+ /* Top-level per-frame encode call: picks the next source frame (an ARF or
+  * backward-reference source peeked from the lookahead, or the next frame
+  * popped from it), sets up refresh flags and a free frame buffer, then runs
+  * the encode for the configured pass.
+  *
+  *   frame_flags  out: FRAMEFLAGS_KEY or 0 from the source flags, then
+  *                passed on to the pass encoder
+  *   size         out: set to 0 before encoding, then passed to the encoder
+  *   dest         output buffer handed to the pass encoder
+  *   time_stamp,
+  *   time_end     out: ts_start / ts_end of the chosen source
+  *   flush        passed to av1_lookahead_pop(); forced to 1 when a forced
+  *                key frame is found inside the ARF window
+  *
+  * Returns 0 after an encode call; -1 when no source frame is available, the
+  * previous source cannot be peeked, no free frame buffer exists, or (with
+  * CONFIG_XIPHRC, pass 2) od_enc_rc_2pass_in() fails.
+  */
+ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags,
+ size_t *size, uint8_t *dest, int64_t *time_stamp,
+ int64_t *time_end, int flush) {
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ AV1_COMMON *const cm = &cpi->common;
+ BufferPool *const pool = cm->buffer_pool;
+ RATE_CONTROL *const rc = &cpi->rc;
+ struct aom_usec_timer cmptimer;
+ YV12_BUFFER_CONFIG *force_src_buffer = NULL;  // set to the filtered ARF buffer when one is produced
+ struct lookahead_entry *last_source = NULL;
+ struct lookahead_entry *source = NULL;
+ int arf_src_index;
+#if CONFIG_EXT_REFS
+ int brf_src_index;
+#endif // CONFIG_EXT_REFS
+ int i;
+
+#if CONFIG_XIPHRC
+ cpi->od_rc.end_of_input = flush;
+#endif
+
+#if CONFIG_BITSTREAM_DEBUG
+ assert(cpi->oxcf.max_threads == 0 &&
+ "bitstream debug tool does not support multithreading");
+ bitstream_queue_record_write();
+ bitstream_queue_set_frame_write(cm->current_video_frame * 2 + cm->show_frame);
+#endif
+
+ aom_usec_timer_start(&cmptimer);  // elapsed time is added to time_compress_data below
+
+ av1_set_high_precision_mv(cpi, ALTREF_HIGH_PRECISION_MV);
+
+ // Is multi-arf enabled.
+ // Note that at the moment multi_arf is only configured for 2 pass VBR
+ if ((oxcf->pass == 2) && (cpi->oxcf.enable_auto_arf > 1))
+ cpi->multi_arf_allowed = 1;
+ else
+ cpi->multi_arf_allowed = 0;
+
+ // Normal defaults
+ cm->reset_frame_context = RESET_FRAME_CONTEXT_NONE;
+ cm->refresh_frame_context =
+ (oxcf->error_resilient_mode || oxcf->frame_parallel_decoding_mode)
+ ? REFRESH_FRAME_CONTEXT_FORWARD
+ : REFRESH_FRAME_CONTEXT_BACKWARD;
+
+ cpi->refresh_last_frame = 1;  // by default only LAST is refreshed
+ cpi->refresh_golden_frame = 0;
+#if CONFIG_EXT_REFS
+ cpi->refresh_bwd_ref_frame = 0;
+#endif // CONFIG_EXT_REFS
+ cpi->refresh_alt_ref_frame = 0;
+
+#if CONFIG_EXT_REFS && !CONFIG_XIPHRC
+ if (oxcf->pass == 2 && cm->show_existing_frame) {
+ // Manage the source buffer and flush out the source frame that has been
+ // coded already; Also get prepared for PSNR calculation if needed.
+ if ((source = av1_lookahead_pop(cpi->lookahead, flush)) == NULL) {
+ *size = 0;
+ return -1;
+ }
+ cpi->source = &source->img;
+ // TODO(zoeliu): To track down to determine whether it's needed to adjust
+ // the frame rate.
+ *time_stamp = source->ts_start;
+ *time_end = source->ts_end;
+
+ // We need to adjust frame rate for an overlay frame
+ if (cpi->rc.is_src_frame_alt_ref) adjust_frame_rate(cpi, source);
+
+ // Find a free buffer for the new frame, releasing the reference previously
+ // held.
+ if (cm->new_fb_idx != INVALID_IDX) {
+ --pool->frame_bufs[cm->new_fb_idx].ref_count;
+ }
+ cm->new_fb_idx = get_free_fb(cm);
+
+ if (cm->new_fb_idx == INVALID_IDX) return -1;  // no free frame buffer
+
+ // Clear down mmx registers
+ aom_clear_system_state();
+
+ // Start with a 0 size frame.
+ *size = 0;
+
+ // We need to update the gf_group for show_existing overlay frame
+ if (cpi->rc.is_src_frame_alt_ref) av1_rc_get_second_pass_params(cpi);
+
+ Pass2Encode(cpi, size, dest, frame_flags);
+
+ if (cpi->b_calculate_psnr) generate_psnr_packet(cpi);
+
+#if CONFIG_INTERNAL_STATS
+ compute_internal_stats(cpi);
+ cpi->bytes += (int)(*size);
+#endif // CONFIG_INTERNAL_STATS
+
+ // Clear down mmx registers
+ aom_clear_system_state();
+
+ cm->show_existing_frame = 0;  // one-shot: cleared once the frame is handled
+ return 0;
+ }
+#endif // CONFIG_EXT_REFS && !CONFIG_XIPHRC
+
+ // Should we encode an arf frame.
+ arf_src_index = get_arf_src_index(cpi);
+ if (arf_src_index) {
+ for (i = 0; i <= arf_src_index; ++i) {
+ struct lookahead_entry *e = av1_lookahead_peek(cpi->lookahead, i);
+ // Avoid creating an alt-ref if there's a forced keyframe pending.
+ if (e == NULL) {
+ break;
+ } else if (e->flags == AOM_EFLAG_FORCE_KF) {  // NOTE(review): exact equality here, but a bit test (&) further down - confirm
+ arf_src_index = 0;
+ flush = 1;
+ break;
+ }
+ }
+ }
+
+ if (arf_src_index) {
+ assert(arf_src_index <= rc->frames_to_key);
+
+ if ((source = av1_lookahead_peek(cpi->lookahead, arf_src_index)) != NULL) {
+ cpi->alt_ref_source = source;
+
+ if (oxcf->arnr_max_frames > 0) {
+ // Produce the filtered ARF frame.
+ av1_temporal_filter(cpi, arf_src_index);
+ aom_extend_frame_borders(&cpi->alt_ref_buffer);
+ force_src_buffer = &cpi->alt_ref_buffer;
+ }
+
+ cm->show_frame = 0;  // the ARF is coded as a non-shown frame
+ cm->intra_only = 0;
+ cpi->refresh_alt_ref_frame = 1;
+ cpi->refresh_golden_frame = 0;
+ cpi->refresh_last_frame = 0;
+ rc->is_src_frame_alt_ref = 0;
+ }
+ rc->source_alt_ref_pending = 0;
+ }
+
+#if CONFIG_EXT_REFS
+ rc->is_bwd_ref_frame = 0;
+ brf_src_index = get_brf_src_index(cpi);
+ if (brf_src_index) {
+ assert(brf_src_index <= rc->frames_to_key);
+ if ((source = av1_lookahead_peek(cpi->lookahead, brf_src_index)) != NULL) {
+ cm->show_frame = 0;  // the backward reference is coded as a non-shown frame
+ cm->intra_only = 0;
+
+ cpi->refresh_bwd_ref_frame = 1;
+ cpi->refresh_last_frame = 0;
+ cpi->refresh_golden_frame = 0;
+ cpi->refresh_alt_ref_frame = 0;
+
+ rc->is_bwd_ref_frame = 1;
+ }
+ }
+#endif // CONFIG_EXT_REFS
+
+ if (!source) {  // no ARF / backward-reference source was selected above
+ // Get last frame source.
+ if (cm->current_video_frame > 0) {
+ if ((last_source = av1_lookahead_peek(cpi->lookahead, -1)) == NULL)
+ return -1;
+ }
+
+ // Read in the source frame.
+ source = av1_lookahead_pop(cpi->lookahead, flush);
+
+ if (source != NULL) {
+ cm->show_frame = 1;
+ cm->intra_only = 0;
+
+ // Check to see if the frame should be encoded as an arf overlay.
+ check_src_altref(cpi, source);
+ }
+ }
+
+ if (source) {
+ cpi->un_scaled_source = cpi->source =
+ force_src_buffer ? force_src_buffer : &source->img;
+
+ cpi->unscaled_last_source = last_source != NULL ? &last_source->img : NULL;
+
+ *time_stamp = source->ts_start;
+ *time_end = source->ts_end;
+ *frame_flags = (source->flags & AOM_EFLAG_FORCE_KF) ? FRAMEFLAGS_KEY : 0;
+
+ } else {
+ *size = 0;
+ if (flush && oxcf->pass == 1 && !cpi->twopass.first_pass_done) {
+#if CONFIG_XIPHRC
+ od_enc_rc_2pass_out(&cpi->od_rc, cpi->output_pkt_list, 1);
+#else
+ av1_end_first_pass(cpi); /* get last stats packet */
+#endif
+ cpi->twopass.first_pass_done = 1;
+ }
+ return -1;  // nothing to encode
+ }
+
+ if (source->ts_start < cpi->first_time_stamp_ever) {
+ cpi->first_time_stamp_ever = source->ts_start;
+ cpi->last_end_time_stamp_seen = source->ts_start;
+ }
+
+ // Clear down mmx registers
+ aom_clear_system_state();
+
+ // adjust frame rates based on timestamps given
+ if (cm->show_frame) adjust_frame_rate(cpi, source);
+
+ // Find a free buffer for the new frame, releasing the reference previously
+ // held.
+ if (cm->new_fb_idx != INVALID_IDX) {
+ --pool->frame_bufs[cm->new_fb_idx].ref_count;
+ }
+ cm->new_fb_idx = get_free_fb(cm);
+
+ if (cm->new_fb_idx == INVALID_IDX) return -1;  // no free frame buffer
+
+ cm->cur_frame = &pool->frame_bufs[cm->new_fb_idx];
+
+#if CONFIG_EXT_REFS
+ if (oxcf->pass == 2) {
+ const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+ cpi->alt_fb_idx = cpi->arf_map[gf_group->arf_ref_idx[gf_group->index]];
+ }
+#else
+ if (cpi->multi_arf_allowed) {
+ if (cm->frame_type == KEY_FRAME) {
+ init_buffer_indices(cpi);
+ } else if (oxcf->pass == 2) {
+ const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+ cpi->alt_fb_idx = gf_group->arf_ref_idx[gf_group->index];
+ }
+ }
+#endif // CONFIG_EXT_REFS
+
+ // Start with a 0 size frame.
+ *size = 0;
+
+ cpi->frame_flags = *frame_flags;
+
+ if (oxcf->pass == 2) {  // the if/else braces are completed inside the #if branches below
+#if CONFIG_XIPHRC
+ if (od_enc_rc_2pass_in(&cpi->od_rc) < 0) return -1;
+ }
+#else
+ av1_rc_get_second_pass_params(cpi);
+ } else if (oxcf->pass == 1) {
+ set_frame_size(cpi);
+ }
+#endif
+
+ if (cpi->oxcf.pass != 0 || frame_is_intra_only(cm) == 1) {
+ for (i = 0; i < TOTAL_REFS_PER_FRAME; ++i)
+ cpi->scaled_ref_idx[i] = INVALID_IDX;
+ }
+
+#if CONFIG_AOM_QM
+ cm->using_qmatrix = cpi->oxcf.using_qm;
+ cm->min_qmlevel = cpi->oxcf.qm_minlevel;
+ cm->max_qmlevel = cpi->oxcf.qm_maxlevel;
+#endif
+
+#if CONFIG_REFERENCE_BUFFER
+ if (*time_stamp == 0) {
+ cpi->common.current_frame_id = -1;  // -1 makes the frame encoder re-seed the frame id
+ }
+#endif
+
+#if CONFIG_XIPHRC
+ if (oxcf->pass == 1) {
+ size_t tmp;
+ if (cpi->od_rc.cur_frame == 0) Pass0Encode(cpi, &tmp, dest, 1, frame_flags);  // first frame: extra encode with skip_adapt = 1
+ cpi->od_rc.firstpass_quant = cpi->od_rc.target_quantizer;
+ Pass0Encode(cpi, &tmp, dest, 0, frame_flags);
+ od_enc_rc_2pass_out(&cpi->od_rc, cpi->output_pkt_list, 0);
+ } else if (oxcf->pass == 2) {
+ Pass0Encode(cpi, size, dest, 0, frame_flags);
+ } else {
+ if (cpi->od_rc.cur_frame == 0) {
+ size_t tmp;
+ Pass0Encode(cpi, &tmp, dest, 1, frame_flags);  // first frame: extra encode with skip_adapt = 1
+ }
+ Pass0Encode(cpi, size, dest, 0, frame_flags);
+ }
+#else
+ if (oxcf->pass == 1) {
+ cpi->td.mb.e_mbd.lossless[0] = is_lossless_requested(oxcf);
+ av1_first_pass(cpi, source);
+ } else if (oxcf->pass == 2) {
+ Pass2Encode(cpi, size, dest, frame_flags);
+ } else {
+ // One pass encode
+ Pass0Encode(cpi, size, dest, 0, frame_flags);
+ }
+#endif
+
+ if (!cm->error_resilient_mode)
+ cm->frame_contexts[cm->frame_context_idx] = *cm->fc;  // store the current entropy context in its slot
+
+ // No frame encoded, or frame was dropped, release scaled references.
+ if ((*size == 0) && (frame_is_intra_only(cm) == 0)) {
+ release_scaled_references(cpi);
+ }
+
+ if (*size > 0) {
+ cpi->droppable = !frame_is_reference(cpi);
+ }
+
+ aom_usec_timer_mark(&cmptimer);
+ cpi->time_compress_data += aom_usec_timer_elapsed(&cmptimer);
+
+ if (cpi->b_calculate_psnr && oxcf->pass != 1 && cm->show_frame)
+ generate_psnr_packet(cpi);
+
+#if CONFIG_INTERNAL_STATS
+ if (oxcf->pass != 1) {
+ compute_internal_stats(cpi);
+ cpi->bytes += (int)(*size);
+ }
+#endif // CONFIG_INTERNAL_STATS
+
+#if CONFIG_XIPHRC
+ cpi->od_rc.cur_frame++;
+#endif
+
+ aom_clear_system_state();
+
+ return 0;
+}
+
+ // Copies the descriptor of the frame about to be shown into *dest, with the
+ // luma / chroma dimensions overridden from the current coded size.
+ // Returns 0 on success, -1 if the current frame is not shown or there is no
+ // frame_to_show.
+ int av1_get_preview_raw_frame(AV1_COMP *cpi, YV12_BUFFER_CONFIG *dest) {
+   AV1_COMMON *cm = &cpi->common;
+   int status = -1;
+
+   if (!cm->show_frame) return -1;
+
+   if (cm->frame_to_show) {
+     *dest = *cm->frame_to_show;
+     dest->y_width = cm->width;
+     dest->y_height = cm->height;
+     dest->uv_width = cm->width >> cm->subsampling_x;
+     dest->uv_height = cm->height >> cm->subsampling_y;
+     status = 0;
+   }
+
+   aom_clear_system_state();
+   return status;
+}
+
+ // Copies the buffer descriptor of the last shown frame into *frame.
+ // Returns 0 on success, -1 when no last-shown buffer index is recorded.
+ int av1_get_last_show_frame(AV1_COMP *cpi, YV12_BUFFER_CONFIG *frame) {
+   if (cpi->last_show_frame_buf_idx != INVALID_IDX) {
+     BufferPool *const frame_pool = cpi->common.buffer_pool;
+     *frame = frame_pool->frame_bufs[cpi->last_show_frame_buf_idx].buf;
+     return 0;
+   }
+   return -1;
+}
+
+ // Sets the internal coded size from the configured size scaled by the given
+ // horizontal / vertical scaling modes (rounded up to a whole pixel).
+ // Returns 0 on success, -1 if either mode is beyond ONETWO.
+ int av1_set_internal_size(AV1_COMP *cpi, AOM_SCALING horiz_mode,
+                           AOM_SCALING vert_mode) {
+   AV1_COMMON *cm = &cpi->common;
+   int h_num = 0, h_den = 0;
+   int v_num = 0, v_den = 0;
+
+   if (!(horiz_mode <= ONETWO && vert_mode <= ONETWO)) return -1;
+
+   Scale2Ratio(horiz_mode, &h_num, &h_den);
+   Scale2Ratio(vert_mode, &v_num, &v_den);
+
+   // always go to the next whole number
+   cm->width = (h_den - 1 + cpi->oxcf.width * h_num) / h_den;
+   cm->height = (v_den - 1 + cpi->oxcf.height * v_num) / v_den;
+   assert(cm->width <= cpi->initial_width);
+   assert(cm->height <= cpi->initial_height);
+
+   update_frame_size(cpi);
+
+   return 0;
+}
+
+ // Sets the coded width / height to explicit values. A zero argument leaves
+ // that dimension untouched; a value above the initial dimension is clamped
+ // to it and a warning is printed. Always returns 0.
+ int av1_set_size_literal(AV1_COMP *cpi, unsigned int width,
+                          unsigned int height) {
+   AV1_COMMON *cm = &cpi->common;
+#if CONFIG_HIGHBITDEPTH
+   check_initial_width(cpi, cm->use_highbitdepth, 1, 1);
+#else
+   check_initial_width(cpi, 1, 1);
+#endif  // CONFIG_HIGHBITDEPTH
+
+   if (width != 0) {
+     cm->width = width;
+     if (cpi->initial_width < cm->width) {
+       cm->width = cpi->initial_width;
+       printf("Warning: Desired width too large, changed to %d\n", cm->width);
+     }
+   }
+
+   if (height != 0) {
+     cm->height = height;
+     if (cpi->initial_height < cm->height) {
+       cm->height = cpi->initial_height;
+       printf("Warning: Desired height too large, changed to %d\n", cm->height);
+     }
+   }
+
+   assert(cm->width <= cpi->initial_width);
+   assert(cm->height <= cpi->initial_height);
+
+   update_frame_size(cpi);
+
+   return 0;
+}
+
+ // Returns the base quantizer index stored in the common state.
+ int av1_get_quantizer(AV1_COMP *cpi) {
+   const AV1_COMMON *const cm = &cpi->common;
+   return cm->base_qindex;
+}
+
+ // Translates per-frame AOM_EFLAG_* bits into encoder settings: which
+ // references may be used, which references are updated, and whether the
+ // entropy update is disabled. Each setting is touched only when one of its
+ // controlling flags is present.
+ void av1_apply_encoding_flags(AV1_COMP *cpi, aom_enc_frame_flags_t flags) {
+   const aom_enc_frame_flags_t ref_ctrl_bits =
+       AOM_EFLAG_NO_REF_LAST | AOM_EFLAG_NO_REF_GF | AOM_EFLAG_NO_REF_ARF;
+   const aom_enc_frame_flags_t upd_ctrl_bits =
+       AOM_EFLAG_NO_UPD_LAST | AOM_EFLAG_NO_UPD_GF | AOM_EFLAG_NO_UPD_ARF |
+       AOM_EFLAG_FORCE_GF | AOM_EFLAG_FORCE_ARF;
+
+   if (flags & ref_ctrl_bits) {
+     // Start from "all references" and toggle off each excluded one.
+     int usable_refs = AOM_REFFRAME_ALL;
+
+     if (flags & AOM_EFLAG_NO_REF_LAST) {
+       usable_refs ^= AOM_LAST_FLAG;
+#if CONFIG_EXT_REFS
+       usable_refs ^= AOM_LAST2_FLAG;
+       usable_refs ^= AOM_LAST3_FLAG;
+#endif  // CONFIG_EXT_REFS
+     }
+     if (flags & AOM_EFLAG_NO_REF_GF) usable_refs ^= AOM_GOLD_FLAG;
+     if (flags & AOM_EFLAG_NO_REF_ARF) usable_refs ^= AOM_ALT_FLAG;
+
+     av1_use_as_reference(cpi, usable_refs);
+   }
+
+   if (flags & upd_ctrl_bits) {
+     // Same scheme for the set of references this frame refreshes.
+     int updated_refs = AOM_REFFRAME_ALL;
+
+     if (flags & AOM_EFLAG_NO_UPD_LAST) {
+       updated_refs ^= AOM_LAST_FLAG;
+#if CONFIG_EXT_REFS
+       updated_refs ^= AOM_LAST2_FLAG;
+       updated_refs ^= AOM_LAST3_FLAG;
+#endif  // CONFIG_EXT_REFS
+     }
+     if (flags & AOM_EFLAG_NO_UPD_GF) updated_refs ^= AOM_GOLD_FLAG;
+     if (flags & AOM_EFLAG_NO_UPD_ARF) updated_refs ^= AOM_ALT_FLAG;
+
+     av1_update_reference(cpi, updated_refs);
+   }
+
+   if (flags & AOM_EFLAG_NO_UPD_ENTROPY) av1_update_entropy(cpi, 0);
+}
diff --git a/third_party/aom/av1/encoder/encoder.h b/third_party/aom/av1/encoder/encoder.h
new file mode 100644
index 0000000000..4e7aef8fcd
--- /dev/null
+++ b/third_party/aom/av1/encoder/encoder.h
@@ -0,0 +1,883 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_ENCODER_ENCODER_H_
+#define AV1_ENCODER_ENCODER_H_
+
+#include <stdio.h>
+
+#include "./aom_config.h"
+#include "aom/aomcx.h"
+
+#include "av1/common/alloccommon.h"
+#include "av1/common/entropymode.h"
+#include "av1/common/thread_common.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/encoder/aq_cyclicrefresh.h"
+#if CONFIG_ANS
+#include "aom_dsp/ans.h"
+#include "aom_dsp/buf_ans.h"
+#endif
+#include "av1/encoder/av1_quantize.h"
+#include "av1/encoder/context_tree.h"
+#include "av1/encoder/encodemb.h"
+#include "av1/encoder/firstpass.h"
+#include "av1/encoder/lookahead.h"
+#include "av1/encoder/mbgraph.h"
+#include "av1/encoder/mcomp.h"
+#include "av1/encoder/ratectrl.h"
+#include "av1/encoder/rd.h"
+#include "av1/encoder/speed_features.h"
+#include "av1/encoder/tokenize.h"
+#include "av1/encoder/variance_tree.h"
+#if CONFIG_XIPHRC
+#include "av1/encoder/ratectrl_xiph.h"
+#endif
+
+#if CONFIG_INTERNAL_STATS
+#include "aom_dsp/ssim.h"
+#endif
+#include "aom_dsp/variance.h"
+#include "aom/internal/aom_codec_internal.h"
+#include "aom_util/aom_thread.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Bundle of motion-vector cost tables, loop-filter delta arrays and a
+// FRAME_CONTEXT. AV1_COMP embeds one instance as `coding_context`.
+// NOTE(review): presumably used to save and restore encoder state around a
+// re-encode; confirm against the users in encoder.c.
+typedef struct {
+ int nmvjointcost[MV_JOINTS];
+ int nmvcosts[2][MV_VALS];
+ int nmvcosts_hp[2][MV_VALS];
+
+#if CONFIG_REF_MV
+ int nmv_vec_cost[NMV_CONTEXTS][MV_JOINTS];
+ int nmv_costs[NMV_CONTEXTS][2][MV_VALS];
+ int nmv_costs_hp[NMV_CONTEXTS][2][MV_VALS];
+#endif
+
+ // 0 = Intra, Last, GF, ARF
+ signed char last_ref_lf_deltas[TOTAL_REFS_PER_FRAME];
+ // 0 = ZERO_MV, MV
+ signed char last_mode_lf_deltas[MAX_MODE_LF_DELTAS];
+
+ FRAME_CONTEXT fc;
+} CODING_CONTEXT;
+
+// Frame categories. BRF_FRAME and EXT_ARF_FRAME exist only when
+// CONFIG_EXT_REFS is enabled.
+// NOTE(review): the type name suggests these values index per-category
+// frame contexts; confirm at the use sites.
+typedef enum {
+ // regular inter frame
+ REGULAR_FRAME = 0,
+ // alternate reference frame
+ ARF_FRAME = 1,
+ // overlay frame
+ OVERLAY_FRAME = 2,
+ // golden frame
+ GLD_FRAME = 3,
+#if CONFIG_EXT_REFS
+ // backward reference frame
+ BRF_FRAME = 4,
+ // extra alternate reference frame
+ EXT_ARF_FRAME = 5
+#endif
+} FRAME_CONTEXT_INDEX;
+
+// Scaling modes taken by av1_set_internal_size() for its horiz_mode and
+// vert_mode arguments.
+// NOTE(review): the names suggest ratios of 4/5, 3/5 and 1/2; confirm
+// against the implementation.
+typedef enum {
+ NORMAL = 0,
+ FOURFIVE = 1,
+ THREEFIVE = 2,
+ ONETWO = 3
+} AOM_SCALING;
+
+// Encoding mode stored in AV1EncoderConfig::mode. GOOD is the only value
+// defined here.
+typedef enum {
+ // Good Quality Fast Encoding. The encoder balances quality with the amount of
+ // time it takes to encode the output. Speed setting controls how fast.
+ GOOD
+} MODE;
+
+// Single-bit frame-type flags. The bit used by FRAMEFLAGS_ALTREF depends on
+// CONFIG_EXT_REFS: bit 3 when FRAMEFLAGS_BWDREF occupies bit 2, otherwise
+// bit 2.
+typedef enum {
+ FRAMEFLAGS_KEY = 1 << 0,
+ FRAMEFLAGS_GOLDEN = 1 << 1,
+#if CONFIG_EXT_REFS
+ FRAMEFLAGS_BWDREF = 1 << 2,
+ FRAMEFLAGS_ALTREF = 1 << 3,
+#else
+ FRAMEFLAGS_ALTREF = 1 << 2,
+#endif // CONFIG_EXT_REFS
+} FRAMETYPE_FLAGS;
+
+// Adaptive quantization modes (see AV1EncoderConfig::aq_mode). DELTA_AQ is
+// only defined when CONFIG_DELTA_Q is on and CONFIG_EXT_DELTA_Q is off, so
+// AQ_MODE_COUNT evaluates to 5 in that configuration and 4 otherwise.
+typedef enum {
+ NO_AQ = 0,
+ VARIANCE_AQ = 1,
+ COMPLEXITY_AQ = 2,
+ CYCLIC_REFRESH_AQ = 3,
+#if CONFIG_DELTA_Q && !CONFIG_EXT_DELTA_Q
+ DELTA_AQ = 4,
+#endif
+ AQ_MODE_COUNT // This should always be the last member of the enum
+} AQ_MODE;
+#if CONFIG_EXT_DELTA_Q
+// Delta-Q modes, available only with CONFIG_EXT_DELTA_Q; stored in
+// AV1EncoderConfig::deltaq_mode.
+typedef enum {
+ NO_DELTA_Q = 0,
+ DELTA_Q_ONLY = 1,
+ DELTA_Q_LF = 2,
+ DELTAQ_MODE_COUNT // This should always be the last member of the enum
+} DELTAQ_MODE;
+#endif
+// Frame resizing policy, stored in AV1EncoderConfig::resize_mode.
+typedef enum {
+ RESIZE_NONE = 0, // No frame resizing allowed.
+ RESIZE_FIXED = 1, // All frames are coded at the specified dimension.
+ RESIZE_DYNAMIC = 2 // Coded size of each frame is determined by the codec.
+} RESIZE_TYPE;
+
+// Encoder configuration. AV1_COMP keeps one instance in its `oxcf` member.
+// Many members are present only when the corresponding CONFIG_* build option
+// is enabled, so the layout depends on the build configuration.
+typedef struct AV1EncoderConfig {
+ BITSTREAM_PROFILE profile;
+ aom_bit_depth_t bit_depth; // Codec bit-depth.
+ int width; // width of data passed to the compressor
+ int height; // height of data passed to the compressor
+ unsigned int input_bit_depth; // Input bit depth.
+ double init_framerate; // set to passed in framerate
+ int64_t target_bandwidth; // bandwidth to be used in bits per second
+
+ int noise_sensitivity; // pre processing blur: recommendation 0
+ int sharpness; // sharpening output: recommendation 0
+ int speed;
+ // maximum allowed bitrate for any intra frame in % of bitrate target.
+ unsigned int rc_max_intra_bitrate_pct;
+ // maximum allowed bitrate for any inter frame in % of bitrate target.
+ unsigned int rc_max_inter_bitrate_pct;
+ // percent of rate boost for golden frame in CBR mode.
+ unsigned int gf_cbr_boost_pct;
+
+ MODE mode;
+ int pass;
+
+ // Key Framing Operations
+ int auto_key; // autodetect cut scenes and set the keyframes
+ int key_freq; // maximum distance to key frame.
+
+ int lag_in_frames; // how many frames lag before we start encoding
+
+ // ----------------------------------------------------------------
+ // DATARATE CONTROL OPTIONS
+
+ // vbr, cbr, constrained quality or constant quality
+ enum aom_rc_mode rc_mode;
+
+ // buffer targeting aggressiveness
+ int under_shoot_pct;
+ int over_shoot_pct;
+
+ // buffering parameters
+ int64_t starting_buffer_level_ms;
+ int64_t optimal_buffer_level_ms;
+ int64_t maximum_buffer_size_ms;
+
+ // Frame drop threshold.
+ int drop_frames_water_mark;
+
+ // controlling quality
+ int fixed_q;
+ // is_lossless_requested() reports lossless when both worst_allowed_q and
+ // best_allowed_q are 0.
+ int worst_allowed_q;
+ int best_allowed_q;
+ int cq_level;
+ AQ_MODE aq_mode; // Adaptive Quantization mode
+#if CONFIG_EXT_DELTA_Q
+ DELTAQ_MODE deltaq_mode;
+#endif
+#if CONFIG_AOM_QM
+ int using_qm;
+ int qm_minlevel;
+ int qm_maxlevel;
+#endif
+#if CONFIG_TILE_GROUPS
+ unsigned int num_tile_groups;
+ unsigned int mtu;
+#endif
+
+#if CONFIG_TEMPMV_SIGNALING
+ unsigned int disable_tempmv;
+#endif
+ // Internal frame size scaling.
+ RESIZE_TYPE resize_mode;
+ int scaled_frame_width;
+ int scaled_frame_height;
+
+ // Enable feature to reduce the frame quantization every x frames.
+ int frame_periodic_boost;
+
+ // two pass datarate control
+ int two_pass_vbrbias; // two pass datarate control tweaks
+ int two_pass_vbrmin_section;
+ int two_pass_vbrmax_section;
+ // END DATARATE CONTROL OPTIONS
+ // ----------------------------------------------------------------
+
+ int enable_auto_arf;
+#if CONFIG_EXT_REFS
+ int enable_auto_brf; // (b)ackward (r)ef (f)rame
+#endif // CONFIG_EXT_REFS
+
+ /* Bitfield defining the error resiliency features to enable.
+ * Can provide decodable frames after losses in previous
+ * frames and decodable partitions after losses in the same frame.
+ */
+ unsigned int error_resilient_mode;
+
+ /* Bitfield defining the parallel decoding mode where the
+ * decoding in successive frames may be conducted in parallel
+ * just by decoding the frame headers.
+ */
+ unsigned int frame_parallel_decoding_mode;
+
+ int arnr_max_frames;
+ int arnr_strength;
+
+ int min_gf_interval;
+ int max_gf_interval;
+
+ int tile_columns;
+ int tile_rows;
+#if CONFIG_DEPENDENT_HORZTILES
+ int dependent_horz_tiles;
+#endif
+#if CONFIG_LOOPFILTERING_ACROSS_TILES
+ int loop_filter_across_tiles_enabled;
+#endif // CONFIG_LOOPFILTERING_ACROSS_TILES
+
+ int max_threads;
+
+ aom_fixed_buf_t two_pass_stats_in;
+ struct aom_codec_pkt_list *output_pkt_list;
+
+#if CONFIG_FP_MB_STATS
+ aom_fixed_buf_t firstpass_mb_stats_in;
+#endif
+
+ aom_tune_metric tuning;
+ aom_tune_content content;
+#if CONFIG_HIGHBITDEPTH
+ int use_highbitdepth;
+#endif
+ aom_color_space_t color_space;
+ int color_range;
+ int render_width;
+ int render_height;
+
+#if CONFIG_EXT_PARTITION
+ aom_superblock_size_t superblock_size;
+#endif // CONFIG_EXT_PARTITION
+#if CONFIG_ANS && ANS_MAX_SYMBOLS
+ int ans_window_size_log2;
+#endif // CONFIG_ANS && ANS_MAX_SYMBOLS
+#if CONFIG_EXT_TILE
+ unsigned int tile_encoding_mode;
+#endif // CONFIG_EXT_TILE
+
+ unsigned int motion_vector_unit_test;
+} AV1EncoderConfig;
+
+// Returns 1 when the configuration pins both quantizer bounds
+// (best_allowed_q and worst_allowed_q) to 0, and 0 otherwise.
+static INLINE int is_lossless_requested(const AV1EncoderConfig *cfg) {
+  if (cfg->best_allowed_q != 0) return 0;
+  return cfg->worst_allowed_q == 0;
+}
+
+// Per-tile encoder data: the tile's TileInfo plus per-block-size/per-mode
+// tables and search counters. AV1_COMP points at these through `tile_data`.
+// Optional members are compiled in by CONFIG_PVQ, CONFIG_CFL and
+// CONFIG_EC_ADAPT.
+// TODO(jingning) All spatially adaptive variables should go to TileDataEnc.
+typedef struct TileDataEnc {
+ TileInfo tile_info;
+ int thresh_freq_fact[BLOCK_SIZES][MAX_MODES];
+ int mode_map[BLOCK_SIZES][MAX_MODES];
+ int m_search_count;
+ int ex_search_count;
+#if CONFIG_PVQ
+ PVQ_QUEUE pvq_q;
+#endif
+#if CONFIG_CFL
+ CFL_CTX cfl;
+#endif
+#if CONFIG_EC_ADAPT
+ DECLARE_ALIGNED(16, FRAME_CONTEXT, tctx);
+#endif
+} TileDataEnc;
+
+// Counters gathered during encoding; ThreadData embeds one as `rd_counts`.
+typedef struct RD_COUNTS {
+ av1_coeff_count coef_counts[TX_SIZES][PLANE_TYPES];
+ int64_t comp_pred_diff[REFERENCE_MODES];
+#if CONFIG_GLOBAL_MOTION
+ // Stores number of 4x4 blocks using global motion per reference frame.
+ int global_motion_used[TOTAL_REFS_PER_FRAME];
+#endif // CONFIG_GLOBAL_MOTION
+} RD_COUNTS;
+
+// Per-thread encoding state: the MACROBLOCK, RD counters, a FRAME_COUNTS
+// pointer and the partition/variance trees. AV1_COMP embeds one as `td`.
+// pc_root and var_root hold one entry for each value from MIN_MIB_SIZE_LOG2
+// to MAX_MIB_SIZE_LOG2 inclusive.
+typedef struct ThreadData {
+ MACROBLOCK mb;
+ RD_COUNTS rd_counts;
+ FRAME_COUNTS *counts;
+
+ PICK_MODE_CONTEXT *leaf_tree;
+ PC_TREE *pc_tree;
+ PC_TREE *pc_root[MAX_MIB_SIZE_LOG2 - MIN_MIB_SIZE_LOG2 + 1];
+
+ VAR_TREE *var_tree;
+ VAR_TREE *var_root[MAX_MIB_SIZE_LOG2 - MIN_MIB_SIZE_LOG2 + 1];
+} ThreadData;
+
+struct EncWorkerData;
+
+// Active-map state held in AV1_COMP::active_map; see av1_set_active_map()
+// and av1_get_active_map().
+// NOTE(review): `enabled` and `update` look like boolean flags and `map`
+// like a per-block byte array; confirm layout and ownership in encoder.c.
+typedef struct ActiveMap {
+ int enabled;
+ int update;
+ unsigned char *map;
+} ActiveMap;
+
+// Length of ImageStat::stat. The macro is local to this definition and is
+// #undef'd right after it.
+#define NUM_STAT_TYPES 4 // types of stats: Y, U, V and ALL
+
+// Per-metric statistic: one value per stat type plus a `worst` value.
+// Used by AV1_COMP for psnr, fastssim and psnrhvs under
+// CONFIG_INTERNAL_STATS.
+typedef struct IMAGE_STAT {
+ double stat[NUM_STAT_TYPES];
+ double worst;
+} ImageStat;
+
+#undef NUM_STAT_TYPES
+
+// Frame buffer paired with a reference count. AV1_COMP stores these in
+// upsampled_ref_bufs; uref_cnt_fb() adjusts ref_count.
+typedef struct {
+ int ref_count;
+ YV12_BUFFER_CONFIG buf;
+} EncRefCntBuffer;
+
+#if CONFIG_SUBFRAME_PROB_UPDATE
+// Coefficient probability/count buffers, COEF_PROBS_BUFS deep, available
+// only with CONFIG_SUBFRAME_PROB_UPDATE. AV1_COMP holds two instances:
+// subframe_stats and wholeframe_stats.
+typedef struct SUBFRAME_STATS {
+ av1_coeff_probs_model coef_probs_buf[COEF_PROBS_BUFS][TX_SIZES][PLANE_TYPES];
+ av1_coeff_count coef_counts_buf[COEF_PROBS_BUFS][TX_SIZES][PLANE_TYPES];
+ unsigned int eob_counts_buf[COEF_PROBS_BUFS][TX_SIZES][PLANE_TYPES][REF_TYPES]
+ [COEF_BANDS][COEFF_CONTEXTS];
+ av1_coeff_probs_model enc_starting_coef_probs[TX_SIZES][PLANE_TYPES];
+} SUBFRAME_STATS;
+#endif // CONFIG_SUBFRAME_PROB_UPDATE
+
+// Byte buffer descriptor (pointer plus size). AV1_COMP keeps a
+// [MAX_TILE_ROWS][MAX_TILE_COLS] array of these as `tile_buffers`.
+typedef struct TileBufferEnc {
+ uint8_t *data;
+ size_t size;
+} TileBufferEnc;
+
+// Top-level encoder instance. Groups the quantizer tables, the shared
+// AV1_COMMON state, the encoder configuration (oxcf), source and reference
+// buffer bookkeeping, rate control, mode/MV cost tables, per-tile data and
+// the worker-thread handles. Many members exist only when the matching
+// CONFIG_* build option is enabled, so the layout depends on the build
+// configuration.
+typedef struct AV1_COMP {
+ QUANTS quants;
+ ThreadData td;
+ MB_MODE_INFO_EXT *mbmi_ext_base;
+ DECLARE_ALIGNED(16, int16_t, y_dequant[QINDEX_RANGE][8]); // 8: SIMD width
+ DECLARE_ALIGNED(16, int16_t, uv_dequant[QINDEX_RANGE][8]); // 8: SIMD width
+#if CONFIG_NEW_QUANT
+ DECLARE_ALIGNED(16, dequant_val_type_nuq,
+ y_dequant_val_nuq[QUANT_PROFILES][QINDEX_RANGE][COEF_BANDS]);
+ DECLARE_ALIGNED(16, dequant_val_type_nuq,
+ uv_dequant_val_nuq[QUANT_PROFILES][QINDEX_RANGE][COEF_BANDS]);
+#endif // CONFIG_NEW_QUANT
+ AV1_COMMON common;
+ AV1EncoderConfig oxcf;
+ struct lookahead_ctx *lookahead;
+ struct lookahead_entry *alt_ref_source;
+
+ YV12_BUFFER_CONFIG *source;
+ YV12_BUFFER_CONFIG *last_source; // NULL for first frame and alt_ref frames
+ YV12_BUFFER_CONFIG *un_scaled_source;
+ YV12_BUFFER_CONFIG scaled_source;
+ YV12_BUFFER_CONFIG *unscaled_last_source;
+ YV12_BUFFER_CONFIG scaled_last_source;
+
+ // Up-sampled reference buffers
+ // NOTE(zoeliu): It is needed to allocate sufficient space to the up-sampled
+ // reference buffers, which should include the up-sampled version of all the
+ // possibly stored references plus the currently coded frame itself.
+ EncRefCntBuffer upsampled_ref_bufs[REF_FRAMES + 1];
+ int upsampled_ref_idx[REF_FRAMES + 1];
+
+ // For a still frame, this flag is set to 1 to skip partition search.
+ int partition_search_skippable_frame;
+
+ int scaled_ref_idx[TOTAL_REFS_PER_FRAME];
+#if CONFIG_EXT_REFS
+ int lst_fb_idxes[LAST_REF_FRAMES];
+#else
+ int lst_fb_idx;
+#endif // CONFIG_EXT_REFS
+ int gld_fb_idx;
+#if CONFIG_EXT_REFS
+ int bwd_fb_idx; // BWD_REF_FRAME
+#endif // CONFIG_EXT_REFS
+ int alt_fb_idx;
+
+ int last_show_frame_buf_idx; // last show frame buffer index
+
+ int refresh_last_frame;
+ int refresh_golden_frame;
+#if CONFIG_EXT_REFS
+ int refresh_bwd_ref_frame;
+#endif // CONFIG_EXT_REFS
+ int refresh_alt_ref_frame;
+
+ int ext_refresh_frame_flags_pending;
+ int ext_refresh_last_frame;
+ int ext_refresh_golden_frame;
+ int ext_refresh_alt_ref_frame;
+
+ int ext_refresh_frame_context_pending;
+ int ext_refresh_frame_context;
+
+ YV12_BUFFER_CONFIG last_frame_uf;
+#if CONFIG_LOOP_RESTORATION
+ YV12_BUFFER_CONFIG last_frame_db;
+ YV12_BUFFER_CONFIG trial_frame_rst;
+ uint8_t *extra_rstbuf; // Extra buffers used in restoration search
+ RestorationInfo rst_search[MAX_MB_PLANE]; // Used for encoder side search
+#endif // CONFIG_LOOP_RESTORATION
+
+ // Ambient reconstruction err target for force key frames
+ int64_t ambient_err;
+
+ RD_OPT rd;
+
+ CODING_CONTEXT coding_context;
+
+#if CONFIG_REF_MV
+ int nmv_costs[NMV_CONTEXTS][2][MV_VALS];
+ int nmv_costs_hp[NMV_CONTEXTS][2][MV_VALS];
+#endif
+
+ int nmvcosts[2][MV_VALS];
+ int nmvcosts_hp[2][MV_VALS];
+ int nmvsadcosts[2][MV_VALS];
+ int nmvsadcosts_hp[2][MV_VALS];
+
+ int64_t last_time_stamp_seen;
+ int64_t last_end_time_stamp_seen;
+ int64_t first_time_stamp_ever;
+
+ RATE_CONTROL rc;
+#if CONFIG_XIPHRC
+ od_rc_state od_rc;
+#endif
+ double framerate;
+
+ // NOTE(zoeliu): Any inter frame allows maximum of REF_FRAMES inter
+ // references; Plus the currently coded frame itself, it is needed to allocate
+ // sufficient space to the size of the maximum possible number of frames.
+ int interp_filter_selected[REF_FRAMES + 1][SWITCHABLE];
+
+ struct aom_codec_pkt_list *output_pkt_list;
+
+ MBGRAPH_FRAME_STATS mbgraph_stats[MAX_LAG_BUFFERS];
+ int mbgraph_n_frames; // number of frames filled in the above
+ int static_mb_pct; // % forced skip mbs by segmentation
+ int ref_frame_flags;
+
+ SPEED_FEATURES sf;
+
+ unsigned int max_mv_magnitude;
+ int mv_step_param;
+
+ int allow_comp_inter_inter;
+
+ uint8_t *segmentation_map;
+
+ CYCLIC_REFRESH *cyclic_refresh;
+ ActiveMap active_map;
+
+ fractional_mv_step_fp *find_fractional_mv_step;
+ av1_full_search_fn_t full_search_sad; // It is currently unused.
+ av1_diamond_search_fn_t diamond_search_sad;
+ aom_variance_fn_ptr_t fn_ptr[BLOCK_SIZES];
+ uint64_t time_receive_data;
+ uint64_t time_compress_data;
+ uint64_t time_pick_lpf;
+ uint64_t time_encode_sb_row;
+
+#if CONFIG_FP_MB_STATS
+ int use_fp_mb_stats;
+#endif
+
+ TWO_PASS twopass;
+
+ YV12_BUFFER_CONFIG alt_ref_buffer;
+
+#if CONFIG_INTERNAL_STATS
+ unsigned int mode_chosen_counts[MAX_MODES];
+
+ int count;
+ uint64_t total_sq_error;
+ uint64_t total_samples;
+ ImageStat psnr;
+
+ double total_blockiness;
+ double worst_blockiness;
+
+ int bytes;
+ double summed_quality;
+ double summed_weights;
+ unsigned int tot_recode_hits;
+ double worst_ssim;
+
+ ImageStat fastssim;
+ ImageStat psnrhvs;
+
+ int b_calculate_blockiness;
+ int b_calculate_consistency;
+
+ double total_inconsistency;
+ double worst_consistency;
+ Ssimv *ssim_vars;
+ Metrics metrics;
+#endif
+ int b_calculate_psnr;
+
+ int droppable;
+
+ int initial_width;
+ int initial_height;
+ int initial_mbs; // Number of MBs in the full-size frame; to be used to
+ // normalize the firstpass stats. This will differ from the
+ // number of MBs in the current frame when the frame is
+ // scaled.
+
+ // Store frame variance info in SOURCE_VAR_BASED_PARTITION search type.
+ DIFF *source_diff_var;
+ // The threshold used in SOURCE_VAR_BASED_PARTITION search type.
+ unsigned int source_var_thresh;
+ int frames_till_next_var_check;
+
+ int frame_flags;
+
+ search_site_config ss_cfg;
+
+ int mbmode_cost[BLOCK_SIZE_GROUPS][INTRA_MODES];
+#if CONFIG_REF_MV
+ int newmv_mode_cost[NEWMV_MODE_CONTEXTS][2];
+ int zeromv_mode_cost[ZEROMV_MODE_CONTEXTS][2];
+ int refmv_mode_cost[REFMV_MODE_CONTEXTS][2];
+ int drl_mode_cost0[DRL_MODE_CONTEXTS][2];
+#endif
+
+ unsigned int inter_mode_cost[INTER_MODE_CONTEXTS][INTER_MODES];
+#if CONFIG_EXT_INTER
+ unsigned int inter_compound_mode_cost[INTER_MODE_CONTEXTS]
+ [INTER_COMPOUND_MODES];
+ unsigned int interintra_mode_cost[BLOCK_SIZE_GROUPS][INTERINTRA_MODES];
+#endif // CONFIG_EXT_INTER
+#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+ int motion_mode_cost[BLOCK_SIZES][MOTION_MODES];
+#if CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION
+ int motion_mode_cost1[BLOCK_SIZES][2];
+#endif // CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION
+#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+ int intra_uv_mode_cost[INTRA_MODES][INTRA_MODES];
+ int y_mode_costs[INTRA_MODES][INTRA_MODES][INTRA_MODES];
+ int switchable_interp_costs[SWITCHABLE_FILTER_CONTEXTS][SWITCHABLE_FILTERS];
+#if CONFIG_EXT_PARTITION_TYPES
+ int partition_cost[PARTITION_CONTEXTS + CONFIG_UNPOISON_PARTITION_CTX]
+ [EXT_PARTITION_TYPES];
+#else
+ int partition_cost[PARTITION_CONTEXTS + CONFIG_UNPOISON_PARTITION_CTX]
+ [PARTITION_TYPES];
+#endif
+#if CONFIG_PALETTE
+ int palette_y_size_cost[PALETTE_BLOCK_SIZES][PALETTE_SIZES];
+ int palette_uv_size_cost[PALETTE_BLOCK_SIZES][PALETTE_SIZES];
+ int palette_y_color_cost[PALETTE_SIZES][PALETTE_COLOR_INDEX_CONTEXTS]
+ [PALETTE_COLORS];
+ int palette_uv_color_cost[PALETTE_SIZES][PALETTE_COLOR_INDEX_CONTEXTS]
+ [PALETTE_COLORS];
+#endif // CONFIG_PALETTE
+ int tx_size_cost[TX_SIZES - 1][TX_SIZE_CONTEXTS][TX_SIZES];
+#if CONFIG_EXT_TX
+ int inter_tx_type_costs[EXT_TX_SETS_INTER][EXT_TX_SIZES][TX_TYPES];
+ int intra_tx_type_costs[EXT_TX_SETS_INTRA][EXT_TX_SIZES][INTRA_MODES]
+ [TX_TYPES];
+#else
+ int intra_tx_type_costs[EXT_TX_SIZES][TX_TYPES][TX_TYPES];
+ int inter_tx_type_costs[EXT_TX_SIZES][TX_TYPES];
+#endif // CONFIG_EXT_TX
+#if CONFIG_EXT_INTRA
+#if CONFIG_INTRA_INTERP
+ int intra_filter_cost[INTRA_FILTERS + 1][INTRA_FILTERS];
+#endif // CONFIG_INTRA_INTERP
+#endif // CONFIG_EXT_INTRA
+#if CONFIG_LOOP_RESTORATION
+ int switchable_restore_cost[RESTORE_SWITCHABLE_TYPES];
+#endif // CONFIG_LOOP_RESTORATION
+#if CONFIG_GLOBAL_MOTION
+ int gmtype_cost[TRANS_TYPES];
+ int gmparams_cost[TOTAL_REFS_PER_FRAME];
+#endif // CONFIG_GLOBAL_MOTION
+
+ int multi_arf_allowed;
+ int multi_arf_enabled;
+ int multi_arf_last_grp_enabled;
+
+ TileDataEnc *tile_data;
+ int allocated_tiles; // Keep track of memory allocated for tiles.
+
+ TOKENEXTRA *tile_tok[MAX_TILE_ROWS][MAX_TILE_COLS];
+ unsigned int tok_count[MAX_TILE_ROWS][MAX_TILE_COLS];
+
+ TileBufferEnc tile_buffers[MAX_TILE_ROWS][MAX_TILE_COLS];
+
+ int resize_pending;
+ int resize_state;
+ int resize_scale_num;
+ int resize_scale_den;
+ int resize_avg_qp;
+ int resize_buffer_underflow;
+ int resize_count;
+
+ // VAR_BASED_PARTITION thresholds
+ // 0 - threshold_128x128;
+ // 1 - threshold_64x64;
+ // 2 - threshold_32x32;
+ // 3 - threshold_16x16;
+ // 4 - threshold_8x8;
+ int64_t vbp_thresholds[5];
+ int64_t vbp_threshold_minmax;
+ int64_t vbp_threshold_sad;
+ BLOCK_SIZE vbp_bsize_min;
+
+ // VARIANCE_AQ segment map refresh
+ int vaq_refresh;
+
+ // Multi-threading
+ int num_workers;
+ AVxWorker *workers;
+ struct EncWorkerData *tile_thr_data;
+ AV1LfSync lf_row_sync;
+#if CONFIG_SUBFRAME_PROB_UPDATE
+ SUBFRAME_STATS subframe_stats;
+ // TODO(yaowu): minimize the size of count buffers
+ SUBFRAME_STATS wholeframe_stats;
+ av1_coeff_stats branch_ct_buf[COEF_PROBS_BUFS][TX_SIZES][PLANE_TYPES];
+#endif // CONFIG_SUBFRAME_PROB_UPDATE
+#if CONFIG_ANS
+ struct BufAnsCoder buf_ans;
+#endif
+#if CONFIG_EXT_REFS
+ int refresh_frame_mask;
+ int existing_fb_idx_to_show;
+ int is_arf_filter_off[MAX_EXT_ARFS + 1];
+ int num_extra_arfs;
+ int arf_map[MAX_EXT_ARFS + 1];
+#endif // CONFIG_EXT_REFS
+#if CONFIG_GLOBAL_MOTION
+ int global_motion_search_done;
+#endif
+#if CONFIG_REFERENCE_BUFFER
+ SequenceHeader seq_params;
+#endif
+#if CONFIG_LV_MAP
+ tran_low_t *tcoeff_buf[MAX_MB_PLANE];
+#endif
+} AV1_COMP;
+
+void av1_initialize_enc(void);
+
+struct AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf,
+ BufferPool *const pool);
+void av1_remove_compressor(AV1_COMP *cpi);
+
+void av1_change_config(AV1_COMP *cpi, const AV1EncoderConfig *oxcf);
+
+// Receive a frame's worth of data. The caller can assume that a copy of this
+// frame is made, not just a copy of the pointer.
+int av1_receive_raw_frame(AV1_COMP *cpi, aom_enc_frame_flags_t frame_flags,
+ YV12_BUFFER_CONFIG *sd, int64_t time_stamp,
+ int64_t end_time_stamp);
+
+int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags,
+ size_t *size, uint8_t *dest, int64_t *time_stamp,
+ int64_t *time_end, int flush);
+
+int av1_get_preview_raw_frame(AV1_COMP *cpi, YV12_BUFFER_CONFIG *dest);
+
+int av1_get_last_show_frame(AV1_COMP *cpi, YV12_BUFFER_CONFIG *frame);
+
+int av1_use_as_reference(AV1_COMP *cpi, int ref_frame_flags);
+
+void av1_update_reference(AV1_COMP *cpi, int ref_frame_flags);
+
+int av1_copy_reference_enc(AV1_COMP *cpi, AOM_REFFRAME ref_frame_flag,
+ YV12_BUFFER_CONFIG *sd);
+
+int av1_set_reference_enc(AV1_COMP *cpi, AOM_REFFRAME ref_frame_flag,
+ YV12_BUFFER_CONFIG *sd);
+
+int av1_update_entropy(AV1_COMP *cpi, int update);
+
+int av1_set_active_map(AV1_COMP *cpi, unsigned char *map, int rows, int cols);
+
+int av1_get_active_map(AV1_COMP *cpi, unsigned char *map, int rows, int cols);
+
+int av1_set_internal_size(AV1_COMP *cpi, AOM_SCALING horiz_mode,
+ AOM_SCALING vert_mode);
+
+int av1_set_size_literal(AV1_COMP *cpi, unsigned int width,
+ unsigned int height);
+
+int av1_get_quantizer(struct AV1_COMP *cpi);
+
+void av1_full_to_model_counts(av1_coeff_count_model *model_count,
+ av1_coeff_count *full_count);
+
+// Returns 1 when the current frame is intra-only, refreshes the alt-ref
+// buffer, or refreshes the golden buffer while rc.is_src_frame_alt_ref is
+// not set; returns 0 otherwise.
+static INLINE int frame_is_kf_gf_arf(const AV1_COMP *cpi) {
+  if (frame_is_intra_only(&cpi->common)) return 1;
+  if (cpi->refresh_alt_ref_frame) return 1;
+  if (!cpi->refresh_golden_frame) return 0;
+  return !cpi->rc.is_src_frame_alt_ref;
+}
+
+// Maps a reference-frame type to the encoder's stored frame-buffer index
+// for it (lst_fb_idx / lst_fb_idxes[], gld_fb_idx, bwd_fb_idx, alt_fb_idx).
+// Any ref_frame not matched by an explicit test yields alt_fb_idx.
+static INLINE int get_ref_frame_map_idx(const AV1_COMP *cpi,
+                                        MV_REFERENCE_FRAME ref_frame) {
+#if CONFIG_EXT_REFS
+  // LAST_FRAME..LAST3_FRAME each have their own slot, indexed from 0.
+  if (ref_frame >= LAST_FRAME && ref_frame <= LAST3_FRAME)
+    return cpi->lst_fb_idxes[ref_frame - 1];
+#else
+  if (ref_frame == LAST_FRAME) return cpi->lst_fb_idx;
+#endif  // CONFIG_EXT_REFS
+
+  if (ref_frame == GOLDEN_FRAME) return cpi->gld_fb_idx;
+
+#if CONFIG_EXT_REFS
+  if (ref_frame == BWDREF_FRAME) return cpi->bwd_fb_idx;
+#endif  // CONFIG_EXT_REFS
+
+  return cpi->alt_fb_idx;
+}
+
+// Looks up ref_frame's slot via get_ref_frame_map_idx() and returns the
+// corresponding entry of cm->ref_frame_map, or INVALID_IDX when the slot
+// itself is INVALID_IDX.
+static INLINE int get_ref_frame_buf_idx(const AV1_COMP *cpi,
+                                        MV_REFERENCE_FRAME ref_frame) {
+  const int map_slot = get_ref_frame_map_idx(cpi, ref_frame);
+  if (map_slot == INVALID_IDX) return INVALID_IDX;
+  return cpi->common.ref_frame_map[map_slot];
+}
+
+// Returns the frame buffer in the shared buffer pool that backs ref_frame,
+// or NULL when get_ref_frame_buf_idx() reports INVALID_IDX.
+static INLINE YV12_BUFFER_CONFIG *get_ref_frame_buffer(
+    const AV1_COMP *cpi, MV_REFERENCE_FRAME ref_frame) {
+  const int pool_idx = get_ref_frame_buf_idx(cpi, ref_frame);
+  if (pool_idx == INVALID_IDX) return NULL;
+  return &cpi->common.buffer_pool->frame_bufs[pool_idx].buf;
+}
+
+// Returns the up-sampled reference buffer associated with ref_frame.
+// NOTE(review): unlike get_ref_frame_buf_idx(), the map index is used
+// without an INVALID_IDX check; confirm callers only pass valid references.
+static INLINE const YV12_BUFFER_CONFIG *get_upsampled_ref(
+    const AV1_COMP *cpi, const MV_REFERENCE_FRAME ref_frame) {
+  const int map_slot = get_ref_frame_map_idx(cpi, ref_frame);
+  const int up_idx = cpi->upsampled_ref_idx[map_slot];
+  const EncRefCntBuffer *const up_buf = &cpi->upsampled_ref_bufs[up_idx];
+  return &up_buf->buf;
+}
+
+#if CONFIG_EXT_REFS
+// Returns 1 if frame_buf is the pool entry currently backing any reference
+// from LAST_FRAME through ALTREF_FRAME, 0 otherwise. References whose
+// buffer index is INVALID_IDX are skipped.
+static INLINE int enc_is_ref_frame_buf(AV1_COMP *cpi, RefCntBuffer *frame_buf) {
+  AV1_COMMON *const cm = &cpi->common;
+  MV_REFERENCE_FRAME ref;
+
+  for (ref = LAST_FRAME; ref <= ALTREF_FRAME; ++ref) {
+    const int pool_idx = get_ref_frame_buf_idx(cpi, ref);
+    if (pool_idx != INVALID_IDX &&
+        frame_buf == &cm->buffer_pool->frame_bufs[pool_idx])
+      return 1;
+  }
+  return 0;
+}
+#endif  // CONFIG_EXT_REFS
+
+// Returns the number of tokens to allocate for an area of mb_rows x mb_cols
+// macroblocks.
+//
+// We assume 3 planes all at full resolution. We assume up to 1 token per
+// pixel, and then allow a head room of 1 EOSB token per 4x4 block per plane,
+// plus EOSB_TOKEN per plane.
+//
+// The product is evaluated in unsigned arithmetic. The previous form
+// multiplied plain ints and only converted the result, so a very large frame
+// could overflow a signed int (undefined behavior). Results are unchanged
+// for every input that did not overflow.
+static INLINE unsigned int get_token_alloc(int mb_rows, int mb_cols) {
+  // 16x16 pixels per macroblock, plus 16 4x4 blocks and one extra EOSB.
+  const unsigned int tokens_per_mb_plane = 16 * 16 + 17;
+  const unsigned int num_planes = 3;
+  return (unsigned int)mb_rows * (unsigned int)mb_cols * tokens_per_mb_plane *
+         num_planes;
+}
+
+// Token allocation for a single tile: converts the tile's mi_row/mi_col
+// extent into rows and columns with a rounding right shift (by 2 with
+// CONFIG_CB4X4, by 1 otherwise) and applies the same formula as the frame
+// token allocation, get_token_alloc().
+static INLINE unsigned int allocated_tokens(TileInfo tile) {
+#if CONFIG_CB4X4
+  const int shift = 2;
+#else
+  const int shift = 1;
+#endif
+  const int rounding = 1 << (shift - 1);
+  const int tile_mb_rows =
+      (tile.mi_row_end - tile.mi_row_start + rounding) >> shift;
+  const int tile_mb_cols =
+      (tile.mi_col_end - tile.mi_col_start + rounding) >> shift;
+
+  return get_token_alloc(tile_mb_rows, tile_mb_cols);
+}
+
+void av1_alloc_compressor_data(AV1_COMP *cpi);
+
+void av1_scale_references(AV1_COMP *cpi);
+
+void av1_update_reference_frames(AV1_COMP *cpi);
+
+void av1_set_high_precision_mv(AV1_COMP *cpi, int allow_high_precision_mv);
+#if CONFIG_TEMPMV_SIGNALING
+void av1_set_temporal_mv_prediction(AV1_COMP *cpi, int allow_tempmv_prediction);
+#endif
+
+YV12_BUFFER_CONFIG *av1_scale_if_required_fast(AV1_COMMON *cm,
+ YV12_BUFFER_CONFIG *unscaled,
+ YV12_BUFFER_CONFIG *scaled);
+
+YV12_BUFFER_CONFIG *av1_scale_if_required(AV1_COMMON *cm,
+ YV12_BUFFER_CONFIG *unscaled,
+ YV12_BUFFER_CONFIG *scaled);
+
+void av1_apply_encoding_flags(AV1_COMP *cpi, aom_enc_frame_flags_t flags);
+
+// Returns 1 when the configuration has a positive lag_in_frames and a
+// non-zero enable_auto_arf, 0 otherwise.
+static INLINE int is_altref_enabled(const AV1_COMP *const cpi) {
+  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+  if (oxcf->lag_in_frames <= 0) return 0;
+  return oxcf->enable_auto_arf != 0;
+}
+
+// TODO(zoeliu): To set up cpi->oxcf.enable_auto_brf
+// The leading `0 &&` keeps this helper compiled out regardless of
+// CONFIG_EXT_REFS.
+#if 0 && CONFIG_EXT_REFS
+// Returns non-zero when alt-ref is enabled and oxcf.enable_auto_brf is set.
+static INLINE int is_bwdref_enabled(const AV1_COMP *const cpi) {
+ // NOTE(zoeliu): The enabling of bi-predictive frames depends on the use of
+ // alt_ref, and now will be off when the alt_ref interval is
+ // not sufficiently large.
+ return is_altref_enabled(cpi) && cpi->oxcf.enable_auto_brf;
+}
+#endif // 0 && CONFIG_EXT_REFS
+
+// Points xd->block_refs[0] and [1] at the cm->frame_refs entries for ref0
+// and ref1. A reference below LAST_FRAME falls back to entry 0; otherwise
+// the entry is (ref - LAST_FRAME).
+static INLINE void set_ref_ptrs(const AV1_COMMON *cm, MACROBLOCKD *xd,
+                                MV_REFERENCE_FRAME ref0,
+                                MV_REFERENCE_FRAME ref1) {
+  int slot0 = 0;
+  int slot1 = 0;
+
+  if (ref0 >= LAST_FRAME) slot0 = ref0 - LAST_FRAME;
+  if (ref1 >= LAST_FRAME) slot1 = ref1 - LAST_FRAME;
+
+  xd->block_refs[0] = &cm->frame_refs[slot0];
+  xd->block_refs[1] = &cm->frame_refs[slot1];
+}
+
+// Returns the lowest bit of frame_index.
+static INLINE int get_chessboard_index(int frame_index) {
+  const int low_bit_mask = 0x1;
+  return frame_index & low_bit_mask;
+}
+
+// Returns NULL when the sub-pel search method is SUBPEL_TREE; otherwise
+// hands back cost_list unchanged.
+static INLINE int *cond_cost_list(const struct AV1_COMP *cpi, int *cost_list) {
+  if (cpi->sf.mv.subpel_search_method == SUBPEL_TREE) return NULL;
+  return cost_list;
+}
+
+void av1_new_framerate(AV1_COMP *cpi, double framerate);
+
+// Expands to (sl) * (num_tl) + (tl).
+// NOTE(review): presumably flattens a (spatial layer, temporal layer) pair
+// into a single index; confirm at the call sites.
+#define LAYER_IDS_TO_IDX(sl, tl, num_tl) ((sl) * (num_tl) + (tl))
+
+// Re-targets an up-sampled reference index: releases one count on the
+// buffer *uidx currently names (only if that index is non-negative and the
+// count is positive), stores new_uidx in *uidx, then takes one count on the
+// new buffer.
+static INLINE void uref_cnt_fb(EncRefCntBuffer *ubufs, int *uidx,
+                               int new_uidx) {
+  const int old_uidx = *uidx;
+
+  if (old_uidx >= 0) {
+    EncRefCntBuffer *const old_buf = &ubufs[old_uidx];
+    if (old_buf->ref_count > 0) --old_buf->ref_count;
+  }
+
+  *uidx = new_uidx;
+  ++ubufs[new_uidx].ref_count;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AV1_ENCODER_ENCODER_H_
diff --git a/third_party/aom/av1/encoder/encodetxb.c b/third_party/aom/av1/encoder/encodetxb.c
new file mode 100644
index 0000000000..3f71a4472e
--- /dev/null
+++ b/third_party/aom/av1/encoder/encodetxb.c
@@ -0,0 +1,784 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/common/scan.h"
+#include "av1/common/blockd.h"
+#include "av1/common/idct.h"
+#include "av1/common/pred_common.h"
+#include "av1/encoder/bitstream.h"
+#include "av1/encoder/encodeframe.h"
+#include "av1/encoder/cost.h"
+#include "av1/encoder/encodetxb.h"
+#include "av1/encoder/rdopt.h"
+#include "av1/encoder/subexp.h"
+#include "av1/encoder/tokenize.h"
+
+void av1_alloc_txb_buf(AV1_COMP *cpi) {
+#if 0
+ AV1_COMMON *cm = &cpi->common;
+ int mi_block_size = 1 << MI_SIZE_LOG2;
+ // TODO(angiebird): Make sure cm->subsampling_x/y is set correctly, and then
+ // use precise buffer size according to cm->subsampling_x/y
+ int pixel_stride = mi_block_size * cm->mi_cols;
+ int pixel_height = mi_block_size * cm->mi_rows;
+ int i;
+ for (i = 0; i < MAX_MB_PLANE; ++i) {
+ CHECK_MEM_ERROR(
+ cm, cpi->tcoeff_buf[i],
+ aom_malloc(sizeof(*cpi->tcoeff_buf[i]) * pixel_stride * pixel_height));
+ }
+#else
+ (void)cpi;
+#endif
+}
+
+void av1_free_txb_buf(AV1_COMP *cpi) {
+#if 0
+ int i;
+ for (i = 0; i < MAX_MB_PLANE; ++i) {
+ aom_free(cpi->tcoeff_buf[i]);
+ }
+#else
+ (void)cpi;
+#endif
+}
+
+static void write_golomb(aom_writer *w, int level) {
+ int x = level + 1;
+ int i = x;
+ int length = 0;
+
+ while (i) {
+ i >>= 1;
+ ++length;
+ }
+ assert(length > 0);
+
+ for (i = 0; i < length - 1; ++i) aom_write_bit(w, 0);
+
+ for (i = length - 1; i >= 0; --i) aom_write_bit(w, (x >> i) & 0x01);
+}
+
+void av1_write_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCKD *xd,
+ aom_writer *w, int block, int plane,
+ const tran_low_t *tcoeff, uint16_t eob,
+ TXB_CTX *txb_ctx) {
+ aom_prob *nz_map;
+ aom_prob *eob_flag;
+ MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+ const PLANE_TYPE plane_type = get_plane_type(plane);
+ const TX_SIZE tx_size = get_tx_size(plane, xd);
+ const TX_TYPE tx_type = get_tx_type(plane_type, xd, block, tx_size);
+ const SCAN_ORDER *const scan_order =
+ get_scan(cm, tx_size, tx_type, is_inter_block(mbmi));
+ const int16_t *scan = scan_order->scan;
+ int c;
+ int is_nz;
+ const int bwl = b_width_log2_lookup[txsize_to_bsize[tx_size]] + 2;
+ const int seg_eob = tx_size_2d[tx_size];
+ uint8_t txb_mask[32 * 32] = { 0 };
+ uint16_t update_eob = 0;
+
+ aom_write(w, eob == 0, cm->fc->txb_skip[tx_size][txb_ctx->txb_skip_ctx]);
+
+ if (eob == 0) return;
+#if CONFIG_TXK_SEL
+ av1_write_tx_type(cm, xd, block, plane, w);
+#endif
+
+ nz_map = cm->fc->nz_map[tx_size][plane_type];
+ eob_flag = cm->fc->eob_flag[tx_size][plane_type];
+
+ for (c = 0; c < eob; ++c) {
+ int coeff_ctx = get_nz_map_ctx(tcoeff, txb_mask, scan[c], bwl);
+ int eob_ctx = get_eob_ctx(tcoeff, scan[c], bwl);
+
+ tran_low_t v = tcoeff[scan[c]];
+ is_nz = (v != 0);
+
+ if (c == seg_eob - 1) break;
+
+ aom_write(w, is_nz, nz_map[coeff_ctx]);
+
+ if (is_nz) {
+ aom_write(w, c == (eob - 1), eob_flag[eob_ctx]);
+ }
+ txb_mask[scan[c]] = 1;
+ }
+
+ int i;
+ for (i = 0; i < NUM_BASE_LEVELS; ++i) {
+ aom_prob *coeff_base = cm->fc->coeff_base[tx_size][plane_type][i];
+
+ update_eob = 0;
+ for (c = eob - 1; c >= 0; --c) {
+ tran_low_t v = tcoeff[scan[c]];
+ tran_low_t level = abs(v);
+ int sign = (v < 0) ? 1 : 0;
+ int ctx;
+
+ if (level <= i) continue;
+
+ ctx = get_base_ctx(tcoeff, scan[c], bwl, i + 1);
+
+ if (level == i + 1) {
+ aom_write(w, 1, coeff_base[ctx]);
+ if (c == 0) {
+ aom_write(w, sign, cm->fc->dc_sign[plane_type][txb_ctx->dc_sign_ctx]);
+ } else {
+ aom_write_bit(w, sign);
+ }
+ continue;
+ }
+ aom_write(w, 0, coeff_base[ctx]);
+ update_eob = AOMMAX(update_eob, c);
+ }
+ }
+
+ for (c = update_eob; c >= 0; --c) {
+ tran_low_t v = tcoeff[scan[c]];
+ tran_low_t level = abs(v);
+ int sign = (v < 0) ? 1 : 0;
+ int idx;
+ int ctx;
+
+ if (level <= NUM_BASE_LEVELS) continue;
+
+ if (c == 0) {
+ aom_write(w, sign, cm->fc->dc_sign[plane_type][txb_ctx->dc_sign_ctx]);
+ } else {
+ aom_write_bit(w, sign);
+ }
+
+ // level is above NUM_BASE_LEVELS.
+ ctx = get_level_ctx(tcoeff, scan[c], bwl);
+ for (idx = 0; idx < COEFF_BASE_RANGE; ++idx) {
+ if (level == (idx + 1 + NUM_BASE_LEVELS)) {
+ aom_write(w, 1, cm->fc->coeff_lps[tx_size][plane_type][ctx]);
+ break;
+ }
+ aom_write(w, 0, cm->fc->coeff_lps[tx_size][plane_type][ctx]);
+ }
+ if (idx < COEFF_BASE_RANGE) continue;
+
+ // use 0-th order Golomb code to handle the residual level.
+ write_golomb(w, level - COEFF_BASE_RANGE - 1 - NUM_BASE_LEVELS);
+ }
+}
+
+void av1_write_coeffs_mb(const AV1_COMMON *const cm, MACROBLOCK *x,
+ aom_writer *w, int plane) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+ BLOCK_SIZE bsize = mbmi->sb_type;
+ struct macroblockd_plane *pd = &xd->plane[plane];
+
+#if CONFIG_CB4X4
+ const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
+#else
+ const BLOCK_SIZE plane_bsize =
+ get_plane_block_size(AOMMAX(bsize, BLOCK_8X8), pd);
+#endif
+ const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
+ const int max_blocks_high = max_block_high(xd, plane_bsize, plane);
+ TX_SIZE tx_size = get_tx_size(plane, xd);
+ const int bkw = tx_size_wide_unit[tx_size];
+ const int bkh = tx_size_high_unit[tx_size];
+ const int step = tx_size_wide_unit[tx_size] * tx_size_high_unit[tx_size];
+ int row, col;
+ int block = 0;
+ for (row = 0; row < max_blocks_high; row += bkh) {
+ for (col = 0; col < max_blocks_wide; col += bkw) {
+ tran_low_t *tcoeff = BLOCK_OFFSET(x->mbmi_ext->tcoeff[plane], block);
+ uint16_t eob = x->mbmi_ext->eobs[plane][block];
+ TXB_CTX txb_ctx = { x->mbmi_ext->txb_skip_ctx[plane][block],
+ x->mbmi_ext->dc_sign_ctx[plane][block] };
+ av1_write_coeffs_txb(cm, xd, w, block, plane, tcoeff, eob, &txb_ctx);
+ block += step;
+ }
+ }
+}
+
+static INLINE void get_base_ctx_set(const tran_low_t *tcoeffs,
+ int c, // raster order
+ const int bwl,
+ int ctx_set[NUM_BASE_LEVELS]) {
+ const int row = c >> bwl;
+ const int col = c - (row << bwl);
+ const int stride = 1 << bwl;
+ int mag[NUM_BASE_LEVELS] = { 0 };
+ int idx;
+ tran_low_t abs_coeff;
+ int i;
+
+ for (idx = 0; idx < BASE_CONTEXT_POSITION_NUM; ++idx) {
+ int ref_row = row + base_ref_offset[idx][0];
+ int ref_col = col + base_ref_offset[idx][1];
+ int pos = (ref_row << bwl) + ref_col;
+
+ if (ref_row < 0 || ref_col < 0 || ref_row >= stride || ref_col >= stride)
+ continue;
+
+ abs_coeff = abs(tcoeffs[pos]);
+
+ for (i = 0; i < NUM_BASE_LEVELS; ++i) {
+ ctx_set[i] += abs_coeff > i;
+ if (base_ref_offset[idx][0] >= 0 && base_ref_offset[idx][1] >= 0)
+ mag[i] |= abs_coeff > (i + 1);
+ }
+ }
+
+ for (i = 0; i < NUM_BASE_LEVELS; ++i) {
+ ctx_set[i] = (ctx_set[i] + 1) >> 1;
+
+ if (row == 0 && col == 0)
+ ctx_set[i] = (ctx_set[i] << 1) + mag[i];
+ else if (row == 0)
+ ctx_set[i] = 8 + (ctx_set[i] << 1) + mag[i];
+ else if (col == 0)
+ ctx_set[i] = 18 + (ctx_set[i] << 1) + mag[i];
+ else
+ ctx_set[i] = 28 + (ctx_set[i] << 1) + mag[i];
+ }
+ return;
+}
+
+int av1_cost_coeffs_txb(const AV1_COMP *const cpi, MACROBLOCK *x, int plane,
+ int block, TXB_CTX *txb_ctx) {
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const TX_SIZE tx_size = get_tx_size(plane, xd);
+ const PLANE_TYPE plane_type = get_plane_type(plane);
+ const TX_TYPE tx_type = get_tx_type(plane_type, xd, block, tx_size);
+ MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+ const struct macroblock_plane *p = &x->plane[plane];
+ const int eob = p->eobs[block];
+ const tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+ int c, cost;
+ const int seg_eob = AOMMIN(eob, tx_size_2d[tx_size] - 1);
+ int txb_skip_ctx = txb_ctx->txb_skip_ctx;
+ aom_prob *nz_map = xd->fc->nz_map[tx_size][plane_type];
+
+ const int bwl = b_width_log2_lookup[txsize_to_bsize[tx_size]] + 2;
+ // txb_mask is only initialized for once here. After that, it will be set when
+ // coding zero map and then reset when coding level 1 info.
+ uint8_t txb_mask[32 * 32] = { 0 };
+ aom_prob(*coeff_base)[COEFF_BASE_CONTEXTS] =
+ xd->fc->coeff_base[tx_size][plane_type];
+
+ const SCAN_ORDER *const scan_order =
+ get_scan(cm, tx_size, tx_type, is_inter_block(mbmi));
+ const int16_t *scan = scan_order->scan;
+
+ cost = 0;
+
+ if (eob == 0) {
+ cost = av1_cost_bit(xd->fc->txb_skip[tx_size][txb_skip_ctx], 1);
+ return cost;
+ }
+
+ cost = av1_cost_bit(xd->fc->txb_skip[tx_size][txb_skip_ctx], 0);
+
+#if CONFIG_TXK_SEL
+ cost += av1_tx_type_cost(cpi, xd, mbmi->sb_type, plane, tx_size, tx_type);
+#endif
+
+ for (c = 0; c < eob; ++c) {
+ tran_low_t v = qcoeff[scan[c]];
+ int is_nz = (v != 0);
+ int level = abs(v);
+
+ if (c < seg_eob) {
+ int coeff_ctx = get_nz_map_ctx(qcoeff, txb_mask, scan[c], bwl);
+ cost += av1_cost_bit(nz_map[coeff_ctx], is_nz);
+ }
+
+ if (is_nz) {
+ int ctx_ls[NUM_BASE_LEVELS] = { 0 };
+ int sign = (v < 0) ? 1 : 0;
+
+ // sign bit cost
+ if (c == 0) {
+ int dc_sign_ctx = txb_ctx->dc_sign_ctx;
+
+ cost += av1_cost_bit(xd->fc->dc_sign[plane_type][dc_sign_ctx], sign);
+ } else {
+ cost += av1_cost_bit(128, sign);
+ }
+
+ get_base_ctx_set(qcoeff, scan[c], bwl, ctx_ls);
+
+ int i;
+ for (i = 0; i < NUM_BASE_LEVELS; ++i) {
+ if (level <= i) continue;
+
+ if (level == i + 1) {
+ cost += av1_cost_bit(coeff_base[i][ctx_ls[i]], 1);
+ continue;
+ }
+ cost += av1_cost_bit(coeff_base[i][ctx_ls[i]], 0);
+ }
+
+ if (level > NUM_BASE_LEVELS) {
+ int idx;
+ int ctx;
+
+ ctx = get_level_ctx(qcoeff, scan[c], bwl);
+
+ for (idx = 0; idx < COEFF_BASE_RANGE; ++idx) {
+ if (level == (idx + 1 + NUM_BASE_LEVELS)) {
+ cost +=
+ av1_cost_bit(xd->fc->coeff_lps[tx_size][plane_type][ctx], 1);
+ break;
+ }
+ cost += av1_cost_bit(xd->fc->coeff_lps[tx_size][plane_type][ctx], 0);
+ }
+
+ if (idx >= COEFF_BASE_RANGE) {
+ // residual cost
+ int r = level - COEFF_BASE_RANGE - NUM_BASE_LEVELS;
+ int ri = r;
+ int length = 0;
+
+ while (ri) {
+ ri >>= 1;
+ ++length;
+ }
+
+ for (ri = 0; ri < length - 1; ++ri) cost += av1_cost_bit(128, 0);
+
+ for (ri = length - 1; ri >= 0; --ri)
+ cost += av1_cost_bit(128, (r >> ri) & 0x01);
+ }
+ }
+
+ if (c < seg_eob) {
+ int eob_ctx = get_eob_ctx(qcoeff, scan[c], bwl);
+ cost += av1_cost_bit(xd->fc->eob_flag[tx_size][plane_type][eob_ctx],
+ c == (eob - 1));
+ }
+ }
+
+ txb_mask[scan[c]] = 1;
+ }
+
+ return cost;
+}
+
+typedef struct TxbParams {
+ const AV1_COMP *cpi;
+ ThreadData *td;
+ int rate;
+} TxbParams;
+
+int av1_get_txb_entropy_context(const tran_low_t *qcoeff,
+ const SCAN_ORDER *scan_order, int eob) {
+ const int16_t *scan = scan_order->scan;
+ int cul_level = 0;
+ int c;
+ for (c = 0; c < eob; ++c) {
+ cul_level += abs(qcoeff[scan[c]]);
+ }
+
+ cul_level = AOMMIN(COEFF_CONTEXT_MASK, cul_level);
+ set_dc_sign(&cul_level, qcoeff[0]);
+
+ return cul_level;
+}
+
+static void update_txb_context(int plane, int block, int blk_row, int blk_col,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+ void *arg) {
+ TxbParams *const args = arg;
+ const AV1_COMP *cpi = args->cpi;
+ const AV1_COMMON *cm = &cpi->common;
+ ThreadData *const td = args->td;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+ struct macroblock_plane *p = &x->plane[plane];
+ struct macroblockd_plane *pd = &xd->plane[plane];
+ const uint16_t eob = p->eobs[block];
+ const tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+ const PLANE_TYPE plane_type = pd->plane_type;
+ const TX_TYPE tx_type = get_tx_type(plane_type, xd, block, tx_size);
+ const SCAN_ORDER *const scan_order =
+ get_scan(cm, tx_size, tx_type, is_inter_block(mbmi));
+ (void)plane_bsize;
+
+ int cul_level = av1_get_txb_entropy_context(qcoeff, scan_order, eob);
+ av1_set_contexts(xd, pd, plane, tx_size, cul_level, blk_col, blk_row);
+}
+
+static void update_and_record_txb_context(int plane, int block, int blk_row,
+ int blk_col, BLOCK_SIZE plane_bsize,
+ TX_SIZE tx_size, void *arg) {
+ TxbParams *const args = arg;
+ const AV1_COMP *cpi = args->cpi;
+ const AV1_COMMON *cm = &cpi->common;
+ ThreadData *const td = args->td;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ struct macroblock_plane *p = &x->plane[plane];
+ struct macroblockd_plane *pd = &xd->plane[plane];
+ MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+ int eob = p->eobs[block], update_eob = 0;
+ const PLANE_TYPE plane_type = pd->plane_type;
+ const tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+ tran_low_t *tcoeff = BLOCK_OFFSET(x->mbmi_ext->tcoeff[plane], block);
+ const int segment_id = mbmi->segment_id;
+ const TX_TYPE tx_type = get_tx_type(plane_type, xd, block, tx_size);
+ const SCAN_ORDER *const scan_order =
+ get_scan(cm, tx_size, tx_type, is_inter_block(mbmi));
+ const int16_t *scan = scan_order->scan;
+ const int seg_eob = get_tx_eob(&cpi->common.seg, segment_id, tx_size);
+ int c, i;
+ TXB_CTX txb_ctx;
+ get_txb_ctx(plane_bsize, tx_size, plane, pd->above_context + blk_col,
+ pd->left_context + blk_row, &txb_ctx);
+ const int bwl = b_width_log2_lookup[txsize_to_bsize[tx_size]] + 2;
+ int cul_level = 0;
+ unsigned int(*nz_map_count)[SIG_COEF_CONTEXTS][2];
+ uint8_t txb_mask[32 * 32] = { 0 };
+
+ nz_map_count = &td->counts->nz_map[tx_size][plane_type];
+
+ memcpy(tcoeff, qcoeff, sizeof(*tcoeff) * seg_eob);
+
+ ++td->counts->txb_skip[tx_size][txb_ctx.txb_skip_ctx][eob == 0];
+ x->mbmi_ext->txb_skip_ctx[plane][block] = txb_ctx.txb_skip_ctx;
+
+ x->mbmi_ext->eobs[plane][block] = eob;
+
+ if (eob == 0) {
+ av1_set_contexts(xd, pd, plane, tx_size, 0, blk_col, blk_row);
+ return;
+ }
+
+#if CONFIG_TXK_SEL
+ av1_update_tx_type_count(cm, xd, block, plane, mbmi->sb_type, tx_size,
+ td->counts);
+#endif
+
+ for (c = 0; c < eob; ++c) {
+ tran_low_t v = qcoeff[scan[c]];
+ int is_nz = (v != 0);
+ int coeff_ctx = get_nz_map_ctx(tcoeff, txb_mask, scan[c], bwl);
+ int eob_ctx = get_eob_ctx(tcoeff, scan[c], bwl);
+
+ if (c == seg_eob - 1) break;
+
+ ++(*nz_map_count)[coeff_ctx][is_nz];
+
+ if (is_nz) {
+ ++td->counts->eob_flag[tx_size][plane_type][eob_ctx][c == (eob - 1)];
+ }
+ txb_mask[scan[c]] = 1;
+ }
+
+ // Reverse process order to handle coefficient level and sign.
+ for (i = 0; i < NUM_BASE_LEVELS; ++i) {
+ update_eob = 0;
+ for (c = eob - 1; c >= 0; --c) {
+ tran_low_t v = qcoeff[scan[c]];
+ tran_low_t level = abs(v);
+ int ctx;
+
+ if (level <= i) continue;
+
+ ctx = get_base_ctx(tcoeff, scan[c], bwl, i + 1);
+
+ if (level == i + 1) {
+ ++td->counts->coeff_base[tx_size][plane_type][i][ctx][1];
+ if (c == 0) {
+ int dc_sign_ctx = txb_ctx.dc_sign_ctx;
+
+ ++td->counts->dc_sign[plane_type][dc_sign_ctx][v < 0];
+ x->mbmi_ext->dc_sign_ctx[plane][block] = dc_sign_ctx;
+ }
+ cul_level += level;
+ continue;
+ }
+ ++td->counts->coeff_base[tx_size][plane_type][i][ctx][0];
+ update_eob = AOMMAX(update_eob, c);
+ }
+ }
+
+ for (c = update_eob; c >= 0; --c) {
+ tran_low_t v = qcoeff[scan[c]];
+ tran_low_t level = abs(v);
+ int idx;
+ int ctx;
+
+ if (level <= NUM_BASE_LEVELS) continue;
+
+ cul_level += level;
+ if (c == 0) {
+ int dc_sign_ctx = txb_ctx.dc_sign_ctx;
+
+ ++td->counts->dc_sign[plane_type][dc_sign_ctx][v < 0];
+ x->mbmi_ext->dc_sign_ctx[plane][block] = dc_sign_ctx;
+ }
+
+ // level is above NUM_BASE_LEVELS.
+ ctx = get_level_ctx(tcoeff, scan[c], bwl);
+ for (idx = 0; idx < COEFF_BASE_RANGE; ++idx) {
+ if (level == (idx + 1 + NUM_BASE_LEVELS)) {
+ ++td->counts->coeff_lps[tx_size][plane_type][ctx][1];
+ break;
+ }
+ ++td->counts->coeff_lps[tx_size][plane_type][ctx][0];
+ }
+ if (idx < COEFF_BASE_RANGE) continue;
+
+ // The residual level is Golomb-coded with raw bits, so nothing is counted.
+ }
+
+ cul_level = AOMMIN(COEFF_CONTEXT_MASK, cul_level);
+
+ // DC value
+ set_dc_sign(&cul_level, tcoeff[0]);
+ av1_set_contexts(xd, pd, plane, tx_size, cul_level, blk_col, blk_row);
+
+#if CONFIG_ADAPT_SCAN
+ // Since dqcoeff is not available here, we pass qcoeff into
+ // av1_update_scan_count_facade(). The update behavior should be the same
+ // because av1_update_scan_count_facade() only cares if coefficients are zero
+ // or not.
+ av1_update_scan_count_facade((AV1_COMMON *)cm, td->counts, tx_size, tx_type,
+ qcoeff, eob);
+#endif
+}
+
+void av1_update_txb_context(const AV1_COMP *cpi, ThreadData *td,
+ RUN_TYPE dry_run, BLOCK_SIZE bsize, int *rate,
+ int mi_row, int mi_col) {
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+ const int ctx = av1_get_skip_context(xd);
+ const int skip_inc =
+ !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP);
+ struct TxbParams arg = { cpi, td, 0 };
+ (void)rate;
+ (void)mi_row;
+ (void)mi_col;
+ if (mbmi->skip) {
+ if (!dry_run) td->counts->skip[ctx][1] += skip_inc;
+ reset_skip_context(xd, bsize);
+ return;
+ }
+
+ if (!dry_run) {
+ td->counts->skip[ctx][0] += skip_inc;
+ av1_foreach_transformed_block(xd, bsize, mi_row, mi_col,
+ update_and_record_txb_context, &arg);
+ } else if (dry_run == DRY_RUN_NORMAL) {
+ av1_foreach_transformed_block(xd, bsize, mi_row, mi_col, update_txb_context,
+ &arg);
+ } else {
+ printf("DRY_RUN_COSTCOEFFS is not supported yet\n");
+ assert(0);
+ }
+}
+
+static void find_new_prob(unsigned int *branch_cnt, aom_prob *oldp,
+ int *savings, int *update, aom_writer *const bc) {
+ const aom_prob upd = DIFF_UPDATE_PROB;
+ int u = 0;
+ aom_prob newp = get_binary_prob(branch_cnt[0], branch_cnt[1]);
+ int s = av1_prob_diff_update_savings_search(branch_cnt, *oldp, &newp, upd, 1);
+
+ if (s > 0 && newp != *oldp) u = 1;
+
+ if (u)
+ *savings += s - (int)(av1_cost_zero(upd)); // TODO(jingning): 1?
+ else
+ *savings -= (int)(av1_cost_zero(upd));
+
+ if (update) {
+ ++update[u];
+ return;
+ }
+
+ aom_write(bc, u, upd);
+ if (u) {
+ /* send/use new probability */
+ av1_write_prob_diff_update(bc, newp, *oldp);
+ *oldp = newp;
+ }
+}
+
+static void write_txb_probs(aom_writer *const bc, AV1_COMP *cpi,
+ TX_SIZE tx_size) {
+ FRAME_CONTEXT *fc = cpi->common.fc;
+ FRAME_COUNTS *counts = cpi->td.counts;
+ int savings = 0;
+ int update[2] = { 0, 0 };
+ int plane, ctx, level;
+
+ for (ctx = 0; ctx < TXB_SKIP_CONTEXTS; ++ctx) {
+ find_new_prob(counts->txb_skip[tx_size][ctx], &fc->txb_skip[tx_size][ctx],
+ &savings, update, bc);
+ }
+
+ for (plane = 0; plane < PLANE_TYPES; ++plane) {
+ for (ctx = 0; ctx < SIG_COEF_CONTEXTS; ++ctx) {
+ find_new_prob(counts->nz_map[tx_size][plane][ctx],
+ &fc->nz_map[tx_size][plane][ctx], &savings, update, bc);
+ }
+ }
+
+ for (plane = 0; plane < PLANE_TYPES; ++plane) {
+ for (ctx = 0; ctx < EOB_COEF_CONTEXTS; ++ctx) {
+ find_new_prob(counts->eob_flag[tx_size][plane][ctx],
+ &fc->eob_flag[tx_size][plane][ctx], &savings, update, bc);
+ }
+ }
+
+ for (level = 0; level < NUM_BASE_LEVELS; ++level) {
+ for (plane = 0; plane < PLANE_TYPES; ++plane) {
+ for (ctx = 0; ctx < COEFF_BASE_CONTEXTS; ++ctx) {
+ find_new_prob(counts->coeff_base[tx_size][plane][level][ctx],
+ &fc->coeff_base[tx_size][plane][level][ctx], &savings,
+ update, bc);
+ }
+ }
+ }
+
+ for (plane = 0; plane < PLANE_TYPES; ++plane) {
+ for (ctx = 0; ctx < LEVEL_CONTEXTS; ++ctx) {
+ find_new_prob(counts->coeff_lps[tx_size][plane][ctx],
+ &fc->coeff_lps[tx_size][plane][ctx], &savings, update, bc);
+ }
+ }
+
+ // Decide if to update the model for this tx_size
+ if (update[1] == 0 || savings < 0) {
+ aom_write_bit(bc, 0);
+ return;
+ }
+ aom_write_bit(bc, 1);
+
+ for (ctx = 0; ctx < TXB_SKIP_CONTEXTS; ++ctx) {
+ find_new_prob(counts->txb_skip[tx_size][ctx], &fc->txb_skip[tx_size][ctx],
+ &savings, NULL, bc);
+ }
+
+ for (plane = 0; plane < PLANE_TYPES; ++plane) {
+ for (ctx = 0; ctx < SIG_COEF_CONTEXTS; ++ctx) {
+ find_new_prob(counts->nz_map[tx_size][plane][ctx],
+ &fc->nz_map[tx_size][plane][ctx], &savings, NULL, bc);
+ }
+ }
+
+ for (plane = 0; plane < PLANE_TYPES; ++plane) {
+ for (ctx = 0; ctx < EOB_COEF_CONTEXTS; ++ctx) {
+ find_new_prob(counts->eob_flag[tx_size][plane][ctx],
+ &fc->eob_flag[tx_size][plane][ctx], &savings, NULL, bc);
+ }
+ }
+
+ for (level = 0; level < NUM_BASE_LEVELS; ++level) {
+ for (plane = 0; plane < PLANE_TYPES; ++plane) {
+ for (ctx = 0; ctx < COEFF_BASE_CONTEXTS; ++ctx) {
+ find_new_prob(counts->coeff_base[tx_size][plane][level][ctx],
+ &fc->coeff_base[tx_size][plane][level][ctx], &savings,
+ NULL, bc);
+ }
+ }
+ }
+
+ for (plane = 0; plane < PLANE_TYPES; ++plane) {
+ for (ctx = 0; ctx < LEVEL_CONTEXTS; ++ctx) {
+ find_new_prob(counts->coeff_lps[tx_size][plane][ctx],
+ &fc->coeff_lps[tx_size][plane][ctx], &savings, NULL, bc);
+ }
+ }
+}
+
+void av1_write_txb_probs(AV1_COMP *cpi, aom_writer *w) {
+ const TX_MODE tx_mode = cpi->common.tx_mode;
+ const TX_SIZE max_tx_size = tx_mode_to_biggest_tx_size[tx_mode];
+ TX_SIZE tx_size;
+ int ctx, plane;
+
+ for (plane = 0; plane < PLANE_TYPES; ++plane)
+ for (ctx = 0; ctx < DC_SIGN_CONTEXTS; ++ctx)
+ av1_cond_prob_diff_update(w, &cpi->common.fc->dc_sign[plane][ctx],
+ cpi->td.counts->dc_sign[plane][ctx], 1);
+
+ for (tx_size = TX_4X4; tx_size <= max_tx_size; ++tx_size)
+ write_txb_probs(w, cpi, tx_size);
+}
+
+#if CONFIG_TXK_SEL
+int64_t av1_search_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
+ int block, int blk_row, int blk_col,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+ const ENTROPY_CONTEXT *a, const ENTROPY_CONTEXT *l,
+ int use_fast_coef_costing, RD_STATS *rd_stats) {
+ const AV1_COMMON *cm = &cpi->common;
+ MACROBLOCKD *xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+ TX_TYPE txk_start = DCT_DCT;
+ TX_TYPE txk_end = TX_TYPES - 1;
+ TX_TYPE best_tx_type = txk_start;
+ int64_t best_rd = INT64_MAX;
+ const int coeff_ctx = combine_entropy_contexts(*a, *l);
+ TX_TYPE tx_type;
+ for (tx_type = txk_start; tx_type <= txk_end; ++tx_type) {
+ if (plane == 0) mbmi->txk_type[block] = tx_type;
+ TX_TYPE ref_tx_type =
+ get_tx_type(get_plane_type(plane), xd, block, tx_size);
+ if (tx_type != ref_tx_type) {
+ // use get_tx_type() to check if the tx_type is valid for the current mode
+ // if it's not, we skip it here.
+ continue;
+ }
+ RD_STATS this_rd_stats;
+ av1_invalid_rd_stats(&this_rd_stats);
+ av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
+ coeff_ctx, AV1_XFORM_QUANT_FP);
+ if (x->plane[plane].eobs[block] && !xd->lossless[mbmi->segment_id])
+ av1_optimize_b(cm, x, plane, block, tx_size, coeff_ctx);
+ av1_dist_block(cpi, x, plane, plane_bsize, block, blk_row, blk_col, tx_size,
+ &this_rd_stats.dist, &this_rd_stats.sse,
+ OUTPUT_HAS_PREDICTED_PIXELS);
+ const SCAN_ORDER *scan_order =
+ get_scan(cm, tx_size, tx_type, is_inter_block(mbmi));
+ this_rd_stats.rate = av1_cost_coeffs(
+ cpi, x, plane, block, tx_size, scan_order, a, l, use_fast_coef_costing);
+ int64_t rd =
+ RDCOST(x->rdmult, x->rddiv, this_rd_stats.rate, this_rd_stats.dist);
+ if (rd < best_rd) {
+ best_rd = rd;
+ *rd_stats = this_rd_stats;
+ best_tx_type = tx_type;
+ }
+ }
+ if (plane == 0) mbmi->txk_type[block] = best_tx_type;
+ // TODO(angiebird): Instead of re-call av1_xform_quant and av1_optimize_b,
+ // copy the best result in the above tx_type search for loop
+ av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
+ coeff_ctx, AV1_XFORM_QUANT_FP);
+ if (x->plane[plane].eobs[block] && !xd->lossless[mbmi->segment_id])
+ av1_optimize_b(cm, x, plane, block, tx_size, coeff_ctx);
+ if (!is_inter_block(mbmi)) {
+ // intra mode needs decoded result such that the next transform block
+ // can use it for prediction.
+ av1_inverse_transform_block_facade(xd, plane, block, blk_row, blk_col,
+ x->plane[plane].eobs[block]);
+ }
+ return best_rd;
+}
+#endif // CONFIG_TXK_SEL
diff --git a/third_party/aom/av1/encoder/encodetxb.h b/third_party/aom/av1/encoder/encodetxb.h
new file mode 100644
index 0000000000..552d47b542
--- /dev/null
+++ b/third_party/aom/av1/encoder/encodetxb.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef ENCODETXB_H_
+#define ENCODETXB_H_
+
+#include "./aom_config.h"
+#include "av1/common/blockd.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/common/txb_common.h"
+#include "av1/encoder/block.h"
+#include "av1/encoder/encoder.h"
+#include "aom_dsp/bitwriter.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+void av1_alloc_txb_buf(AV1_COMP *cpi);
+void av1_free_txb_buf(AV1_COMP *cpi);
+int av1_cost_coeffs_txb(const AV1_COMP *const cpi, MACROBLOCK *x, int plane,
+ int block, TXB_CTX *txb_ctx);
+void av1_write_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCKD *xd,
+ aom_writer *w, int block, int plane,
+ const tran_low_t *tcoeff, uint16_t eob,
+ TXB_CTX *txb_ctx);
+void av1_write_coeffs_mb(const AV1_COMMON *const cm, MACROBLOCK *x,
+ aom_writer *w, int plane);
+int av1_get_txb_entropy_context(const tran_low_t *qcoeff,
+ const SCAN_ORDER *scan_order, int eob);
+void av1_update_txb_context(const AV1_COMP *cpi, ThreadData *td,
+ RUN_TYPE dry_run, BLOCK_SIZE bsize, int *rate,
+ int mi_row, int mi_col);
+void av1_write_txb_probs(AV1_COMP *cpi, aom_writer *w);
+
+#if CONFIG_TXK_SEL
+int64_t av1_search_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
+ int block, int blk_row, int blk_col,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+ const ENTROPY_CONTEXT *a, const ENTROPY_CONTEXT *l,
+ int use_fast_coef_costing, RD_STATS *rd_stats);
+#endif
+#ifdef __cplusplus
+}
+#endif
+
+#endif // ENCODETXB_H_
diff --git a/third_party/aom/av1/encoder/ethread.c b/third_party/aom/av1/encoder/ethread.c
new file mode 100644
index 0000000000..34f0b95665
--- /dev/null
+++ b/third_party/aom/av1/encoder/ethread.c
@@ -0,0 +1,176 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/encoder/encodeframe.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/ethread.h"
+#include "aom_dsp/aom_dsp_common.h"
+
+static void accumulate_rd_opt(ThreadData *td, ThreadData *td_t) {
+ int i, j, k, l, m, n;
+
+ for (i = 0; i < REFERENCE_MODES; i++)
+ td->rd_counts.comp_pred_diff[i] += td_t->rd_counts.comp_pred_diff[i];
+
+#if CONFIG_GLOBAL_MOTION
+ for (i = 0; i < TOTAL_REFS_PER_FRAME; i++)
+ td->rd_counts.global_motion_used[i] +=
+ td_t->rd_counts.global_motion_used[i];
+#endif // CONFIG_GLOBAL_MOTION
+
+ for (i = 0; i < TX_SIZES; i++)
+ for (j = 0; j < PLANE_TYPES; j++)
+ for (k = 0; k < REF_TYPES; k++)
+ for (l = 0; l < COEF_BANDS; l++)
+ for (m = 0; m < COEFF_CONTEXTS; m++)
+ for (n = 0; n < ENTROPY_TOKENS; n++)
+ td->rd_counts.coef_counts[i][j][k][l][m][n] +=
+ td_t->rd_counts.coef_counts[i][j][k][l][m][n];
+}
+
+static int enc_worker_hook(EncWorkerData *const thread_data, void *unused) {
+ AV1_COMP *const cpi = thread_data->cpi;
+ const AV1_COMMON *const cm = &cpi->common;
+ const int tile_cols = cm->tile_cols;
+ const int tile_rows = cm->tile_rows;
+ int t;
+
+ (void)unused;
+
+ for (t = thread_data->start; t < tile_rows * tile_cols;
+ t += cpi->num_workers) {
+ int tile_row = t / tile_cols;
+ int tile_col = t % tile_cols;
+
+ av1_encode_tile(cpi, thread_data->td, tile_row, tile_col);
+ }
+
+ return 0;
+}
+
+void av1_encode_tiles_mt(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ const int tile_cols = cm->tile_cols;
+ const AVxWorkerInterface *const winterface = aom_get_worker_interface();
+ const int num_workers = AOMMIN(cpi->oxcf.max_threads, tile_cols);
+ int i;
+
+ av1_init_tile_data(cpi);
+
+ // Only run once to create threads and allocate thread data.
+ if (cpi->num_workers == 0) {
+ CHECK_MEM_ERROR(cm, cpi->workers,
+ aom_malloc(num_workers * sizeof(*cpi->workers)));
+
+ CHECK_MEM_ERROR(cm, cpi->tile_thr_data,
+ aom_calloc(num_workers, sizeof(*cpi->tile_thr_data)));
+
+ for (i = 0; i < num_workers; i++) {
+ AVxWorker *const worker = &cpi->workers[i];
+ EncWorkerData *const thread_data = &cpi->tile_thr_data[i];
+
+ ++cpi->num_workers;
+ winterface->init(worker);
+
+ thread_data->cpi = cpi;
+
+ if (i < num_workers - 1) {
+ // Allocate thread data.
+ CHECK_MEM_ERROR(cm, thread_data->td,
+ aom_memalign(32, sizeof(*thread_data->td)));
+ av1_zero(*thread_data->td);
+
+ // Set up pc_tree.
+ thread_data->td->leaf_tree = NULL;
+ thread_data->td->pc_tree = NULL;
+ av1_setup_pc_tree(cm, thread_data->td);
+
+ // Set up variance tree if needed.
+ if (cpi->sf.partition_search_type == VAR_BASED_PARTITION)
+ av1_setup_var_tree(cm, thread_data->td);
+
+ // Allocate frame counters in thread data.
+ CHECK_MEM_ERROR(cm, thread_data->td->counts,
+ aom_calloc(1, sizeof(*thread_data->td->counts)));
+
+ // Create threads
+ if (!winterface->reset(worker))
+ aom_internal_error(&cm->error, AOM_CODEC_ERROR,
+ "Tile encoder thread creation failed");
+ } else {
+ // Main thread acts as a worker and uses the thread data in cpi.
+ thread_data->td = &cpi->td;
+ }
+
+ winterface->sync(worker);
+ }
+ }
+
+ for (i = 0; i < num_workers; i++) {
+ AVxWorker *const worker = &cpi->workers[i];
+ EncWorkerData *thread_data;
+
+ worker->hook = (AVxWorkerHook)enc_worker_hook;
+ worker->data1 = &cpi->tile_thr_data[i];
+ worker->data2 = NULL;
+ thread_data = (EncWorkerData *)worker->data1;
+
+ // Before encoding a frame, copy the thread data from cpi.
+ if (thread_data->td != &cpi->td) {
+ thread_data->td->mb = cpi->td.mb;
+ thread_data->td->rd_counts = cpi->td.rd_counts;
+ }
+ if (thread_data->td->counts != &cpi->common.counts) {
+ memcpy(thread_data->td->counts, &cpi->common.counts,
+ sizeof(cpi->common.counts));
+ }
+
+#if CONFIG_PALETTE
+ // Allocate buffers used by palette coding mode.
+ if (cpi->common.allow_screen_content_tools && i < num_workers - 1) {
+ MACROBLOCK *x = &thread_data->td->mb;
+ CHECK_MEM_ERROR(cm, x->palette_buffer,
+ aom_memalign(16, sizeof(*x->palette_buffer)));
+ }
+#endif // CONFIG_PALETTE
+ }
+
+ // Encode a frame
+ for (i = 0; i < num_workers; i++) {
+ AVxWorker *const worker = &cpi->workers[i];
+ EncWorkerData *const thread_data = (EncWorkerData *)worker->data1;
+
+ // Set the starting tile for each thread.
+ thread_data->start = i;
+
+ if (i == cpi->num_workers - 1)
+ winterface->execute(worker);
+ else
+ winterface->launch(worker);
+ }
+
+ // Encoding ends.
+ for (i = 0; i < num_workers; i++) {
+ AVxWorker *const worker = &cpi->workers[i];
+ winterface->sync(worker);
+ }
+
+ for (i = 0; i < num_workers; i++) {
+ AVxWorker *const worker = &cpi->workers[i];
+ EncWorkerData *const thread_data = (EncWorkerData *)worker->data1;
+
+ // Accumulate counters.
+ if (i < cpi->num_workers - 1) {
+ av1_accumulate_frame_counts(&cm->counts, thread_data->td->counts);
+ accumulate_rd_opt(&cpi->td, thread_data->td);
+ }
+ }
+}
diff --git a/third_party/aom/av1/encoder/ethread.h b/third_party/aom/av1/encoder/ethread.h
new file mode 100644
index 0000000000..6c30a3e5cf
--- /dev/null
+++ b/third_party/aom/av1/encoder/ethread.h
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_ENCODER_ETHREAD_H_
+#define AV1_ENCODER_ETHREAD_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct AV1_COMP;
+struct ThreadData;
+
+// Per-worker argument block. av1_encode_tiles_mt() installs one of these as
+// each worker's data1 pointer before the worker hook runs.
+typedef struct EncWorkerData {
+  struct AV1_COMP *cpi;   // Encoder instance (presumably the owner - confirm).
+  struct ThreadData *td;  // Thread data; refreshed from cpi->td each frame.
+  int start;              // Starting tile; set to the worker's index.
+} EncWorkerData;
+
+void av1_encode_tiles_mt(struct AV1_COMP *cpi);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AV1_ENCODER_ETHREAD_H_
diff --git a/third_party/aom/av1/encoder/extend.c b/third_party/aom/av1/encoder/extend.c
new file mode 100644
index 0000000000..007694a38c
--- /dev/null
+++ b/third_party/aom/av1/encoder/extend.c
@@ -0,0 +1,192 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+
+#include "av1/common/common.h"
+#include "av1/encoder/extend.h"
+
+// Copies a w x h block of 8-bit samples from src to dst and replicates its
+// edge samples outwards to fill a border of the requested size on each side.
+// dst points at the top-left sample of the copied block; the border is
+// written around it, so the caller must own that surrounding memory.
+static void copy_and_extend_plane(const uint8_t *src, int src_pitch,
+                                  uint8_t *dst, int dst_pitch, int w, int h,
+                                  int extend_top, int extend_left,
+                                  int extend_bottom, int extend_right) {
+  // Width of one fully extended output row.
+  const int linesize = extend_left + w + extend_right;
+  const uint8_t *in_row = src;
+  const uint8_t *edge_row;
+  uint8_t *out_row = dst - extend_left;  // Start of the left border.
+  int i;
+
+  // Pass 1: copy every source row and pad it sideways with its first and
+  // last sample.
+  for (i = 0; i < h; ++i, in_row += src_pitch, out_row += dst_pitch) {
+    memset(out_row, in_row[0], extend_left);
+    memcpy(out_row + extend_left, in_row, w);
+    memset(out_row + extend_left + w, in_row[w - 1], extend_right);
+  }
+
+  // Pass 2: replicate the first extended row upwards. The rows written in
+  // pass 1 already include the side borders, so the corners come for free.
+  edge_row = dst - extend_left;
+  out_row = dst + dst_pitch * (-extend_top) - extend_left;
+  for (i = 0; i < extend_top; ++i, out_row += dst_pitch) {
+    memcpy(out_row, edge_row, linesize);
+  }
+
+  // Pass 3: replicate the last extended row downwards.
+  edge_row = dst + dst_pitch * (h - 1) - extend_left;
+  out_row = dst + dst_pitch * h - extend_left;
+  for (i = 0; i < extend_bottom; ++i, out_row += dst_pitch) {
+    memcpy(out_row, edge_row, linesize);
+  }
+}
+
+#if CONFIG_HIGHBITDEPTH
+// 16-bit counterpart of copy_and_extend_plane(). src8/dst8 are converted with
+// CONVERT_TO_SHORTPTR and then addressed as uint16_t samples; pitches, sizes
+// and border widths are all applied in units of samples.
+static void highbd_copy_and_extend_plane(const uint8_t *src8, int src_pitch,
+                                         uint8_t *dst8, int dst_pitch, int w,
+                                         int h, int extend_top, int extend_left,
+                                         int extend_bottom, int extend_right) {
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+  // Samples in one fully extended output row.
+  const int linesize = extend_left + w + extend_right;
+  const uint16_t *in_row = src;
+  const uint16_t *edge_row;
+  uint16_t *out_row = dst - extend_left;  // Start of the left border.
+  int i;
+
+  // Pass 1: copy every source row and pad it sideways with its first and
+  // last sample.
+  for (i = 0; i < h; ++i, in_row += src_pitch, out_row += dst_pitch) {
+    aom_memset16(out_row, in_row[0], extend_left);
+    memcpy(out_row + extend_left, in_row, w * sizeof(in_row[0]));
+    aom_memset16(out_row + extend_left + w, in_row[w - 1], extend_right);
+  }
+
+  // Pass 2: replicate the first extended row upwards.
+  edge_row = dst - extend_left;
+  out_row = dst + dst_pitch * (-extend_top) - extend_left;
+  for (i = 0; i < extend_top; ++i, out_row += dst_pitch) {
+    memcpy(out_row, edge_row, linesize * sizeof(edge_row[0]));
+  }
+
+  // Pass 3: replicate the last extended row downwards.
+  edge_row = dst + dst_pitch * (h - 1) - extend_left;
+  out_row = dst + dst_pitch * h - extend_left;
+  for (i = 0; i < extend_bottom; ++i, out_row += dst_pitch) {
+    memcpy(out_row, edge_row, linesize * sizeof(edge_row[0]));
+  }
+}
+#endif // CONFIG_HIGHBITDEPTH
+
+// Copies the cropped Y, U and V planes of src into dst and extends each plane
+// past its edges. Buffers flagged YV12_FLAG_HIGHBITDEPTH go through the
+// 16-bit plane routine (when CONFIG_HIGHBITDEPTH is enabled).
+void av1_copy_and_extend_frame(const YV12_BUFFER_CONFIG *src,
+                               YV12_BUFFER_CONFIG *dst) {
+  // Both plane routines share one signature, so pick the routine once and
+  // reuse it for all three planes.
+  void (*extend_plane)(const uint8_t *, int, uint8_t *, int, int, int, int, int,
+                       int, int) = copy_and_extend_plane;
+
+  // Altref filtering assumes a 16 pixel extension on the top and left.
+  const int top_y = 16;
+  const int left_y = 16;
+  // Motion estimation may use source block variance with block sizes up to
+  // 64x64, so the right and bottom reach the next multiple of 64, or 16
+  // samples past the frame, whichever is greater.
+  const int padded_w =
+      AOMMAX(src->y_width + 16, ALIGN_POWER_OF_TWO(src->y_width, 6));
+  const int padded_h =
+      AOMMAX(src->y_height + 16, ALIGN_POWER_OF_TWO(src->y_height, 6));
+  const int right_y = padded_w - src->y_crop_width;
+  const int bottom_y = padded_h - src->y_crop_height;
+
+  // A chroma axis is treated as subsampled (shift of 1) whenever its size
+  // differs from the luma size.
+  const int ss_x = (src->uv_width != src->y_width);
+  const int ss_y = (src->uv_height != src->y_height);
+  const int top_uv = top_y >> ss_y;
+  const int left_uv = left_y >> ss_x;
+  const int bottom_uv = bottom_y >> ss_y;
+  const int right_uv = right_y >> ss_x;
+
+#if CONFIG_HIGHBITDEPTH
+  if (src->flags & YV12_FLAG_HIGHBITDEPTH) {
+    extend_plane = highbd_copy_and_extend_plane;
+  }
+#endif  // CONFIG_HIGHBITDEPTH
+
+  extend_plane(src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride,
+               src->y_crop_width, src->y_crop_height, top_y, left_y, bottom_y,
+               right_y);
+
+  extend_plane(src->u_buffer, src->uv_stride, dst->u_buffer, dst->uv_stride,
+               src->uv_crop_width, src->uv_crop_height, top_uv, left_uv,
+               bottom_uv, right_uv);
+
+  extend_plane(src->v_buffer, src->uv_stride, dst->v_buffer, dst->uv_stride,
+               src->uv_crop_width, src->uv_crop_height, top_uv, left_uv,
+               bottom_uv, right_uv);
+}
+
+// Copies the srcw x srch rectangle at (srcx, srcy) from src to the same
+// position in dst, extending only on the sides where the rectangle touches
+// the frame edge. Chroma offsets and sizes are derived by halving the luma
+// values, and only the 8-bit plane routine is used.
+void av1_copy_and_extend_frame_with_rect(const YV12_BUFFER_CONFIG *src,
+                                         YV12_BUFFER_CONFIG *dst, int srcy,
+                                         int srcx, int srch, int srcw) {
+  // A side gets a border only when the rectangle reaches that frame boundary.
+  // The bottom/right borders additionally absorb any size difference between
+  // the destination and source frames.
+  const int top_y = (srcy == 0) ? dst->border : 0;
+  const int left_y = (srcx == 0) ? dst->border : 0;
+  const int bottom_y = (srcy + srch == src->y_height)
+                           ? dst->border + dst->y_height - src->y_height
+                           : 0;
+  const int right_y = (srcx + srcw == src->y_width)
+                          ? dst->border + dst->y_width - src->y_width
+                          : 0;
+  const int y_off_src = srcy * src->y_stride + srcx;
+  const int y_off_dst = srcy * dst->y_stride + srcx;
+
+  // Chroma geometry: half the luma values, rounded.
+  const int top_uv = ROUND_POWER_OF_TWO(top_y, 1);
+  const int left_uv = ROUND_POWER_OF_TWO(left_y, 1);
+  const int bottom_uv = ROUND_POWER_OF_TWO(bottom_y, 1);
+  const int right_uv = ROUND_POWER_OF_TWO(right_y, 1);
+  const int uv_off_src = ((srcy * src->uv_stride) >> 1) + (srcx >> 1);
+  const int uv_off_dst = ((srcy * dst->uv_stride) >> 1) + (srcx >> 1);
+  const int uv_h = ROUND_POWER_OF_TWO(srch, 1);
+  const int uv_w = ROUND_POWER_OF_TWO(srcw, 1);
+
+  copy_and_extend_plane(src->y_buffer + y_off_src, src->y_stride,
+                        dst->y_buffer + y_off_dst, dst->y_stride, srcw, srch,
+                        top_y, left_y, bottom_y, right_y);
+
+  copy_and_extend_plane(src->u_buffer + uv_off_src, src->uv_stride,
+                        dst->u_buffer + uv_off_dst, dst->uv_stride, uv_w, uv_h,
+                        top_uv, left_uv, bottom_uv, right_uv);
+
+  copy_and_extend_plane(src->v_buffer + uv_off_src, src->uv_stride,
+                        dst->v_buffer + uv_off_dst, dst->uv_stride, uv_w, uv_h,
+                        top_uv, left_uv, bottom_uv, right_uv);
+}
diff --git a/third_party/aom/av1/encoder/extend.h b/third_party/aom/av1/encoder/extend.h
new file mode 100644
index 0000000000..48178b9647
--- /dev/null
+++ b/third_party/aom/av1/encoder/extend.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_ENCODER_EXTEND_H_
+#define AV1_ENCODER_EXTEND_H_
+
+#include "aom_scale/yv12config.h"
+#include "aom/aom_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void av1_copy_and_extend_frame(const YV12_BUFFER_CONFIG *src,
+ YV12_BUFFER_CONFIG *dst);
+
+void av1_copy_and_extend_frame_with_rect(const YV12_BUFFER_CONFIG *src,
+ YV12_BUFFER_CONFIG *dst, int srcy,
+ int srcx, int srch, int srcw);
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AV1_ENCODER_EXTEND_H_
diff --git a/third_party/aom/av1/encoder/firstpass.c b/third_party/aom/av1/encoder/firstpass.c
new file mode 100644
index 0000000000..e35a54ef2f
--- /dev/null
+++ b/third_party/aom/av1/encoder/firstpass.c
@@ -0,0 +1,3026 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
+
+#include "./aom_dsp_rtcd.h"
+#include "./aom_scale_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+#include "aom_ports/system_state.h"
+#include "aom_scale/aom_scale.h"
+#include "aom_scale/yv12config.h"
+
+#include "aom_dsp/variance.h"
+#include "av1/common/entropymv.h"
+#include "av1/common/quant_common.h"
+#include "av1/common/reconinter.h" // av1_setup_dst_planes()
+#include "av1/encoder/av1_quantize.h"
+#include "av1/encoder/aq_variance.h"
+#include "av1/encoder/block.h"
+#include "av1/encoder/encodeframe.h"
+#include "av1/encoder/encodemb.h"
+#include "av1/encoder/encodemv.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/extend.h"
+#include "av1/encoder/firstpass.h"
+#include "av1/encoder/mcomp.h"
+#include "av1/encoder/rd.h"
+
+#define OUTPUT_FPF 0
+#define ARF_STATS_OUTPUT 0
+
+#define GROUP_ADAPTIVE_MAXQ 1
+
+#define BOOST_BREAKOUT 12.5
+#define BOOST_FACTOR 12.5
+#define FACTOR_PT_LOW 0.70
+#define FACTOR_PT_HIGH 0.90
+#define FIRST_PASS_Q 10.0
+#define GF_MAX_BOOST 96.0
+#define INTRA_MODE_PENALTY 1024
+#define KF_MAX_BOOST 128.0
+#define MIN_ARF_GF_BOOST 240
+#define MIN_DECAY_FACTOR 0.01
+#define MIN_KF_BOOST 300
+#define NEW_MV_MODE_PENALTY 32
+#define DARK_THRESH 64
+#define DEFAULT_GRP_WEIGHT 1.0
+#define RC_FACTOR_MIN 0.75
+#define RC_FACTOR_MAX 1.75
+
+#define NCOUNT_INTRA_THRESH 8192
+#define NCOUNT_INTRA_FACTOR 3
+#define NCOUNT_FRAME_II_THRESH 5.0
+
+#define DOUBLE_DIVIDE_CHECK(x) ((x) < 0 ? (x)-0.000001 : (x) + 0.000001)
+
+#if ARF_STATS_OUTPUT
+unsigned int arf_count = 0;
+#endif
+
+// Moves the first pass stats read cursor (p->stats_in) to the given entry.
+// This is a plain pointer assignment; no bounds check is performed here.
+static void reset_fpf_position(TWO_PASS *p, const FIRSTPASS_STATS *position) {
+  p->stats_in = position;
+}
+
+// Read frame stats at an offset from the current position.
+// Returns the entry `offset` frames away from p->stats_in, or NULL when that
+// entry would lie before p->stats_in_start or at/after p->stats_in_end.
+static const FIRSTPASS_STATS *read_frame_stats(const TWO_PASS *p, int offset) {
+  // Compare element distances instead of forming p->stats_in + offset first:
+  // computing a pointer outside the stats buffer is undefined behaviour even
+  // if it is only compared. This relies on stats_in_start, stats_in and
+  // stats_in_end all pointing into the same buffer, as the range check
+  // itself already assumes.
+  if (offset >= 0) {
+    if (offset >= p->stats_in_end - p->stats_in) return NULL;
+  } else if (offset < p->stats_in_start - p->stats_in) {
+    return NULL;
+  }
+
+  return &p->stats_in[offset];
+}
+
+// Copies the entry under the read cursor into *fps and advances the cursor.
+// Returns 1 on success, or EOF (leaving *fps untouched) once the cursor has
+// reached p->stats_in_end.
+static int input_stats(TWO_PASS *p, FIRSTPASS_STATS *fps) {
+  if (!(p->stats_in < p->stats_in_end)) {
+    return EOF;
+  }
+
+  *fps = *p->stats_in++;
+  return 1;
+}
+
+// Queues `stats` on pktlist as an AOM_CODEC_STATS_PKT. The packet stores the
+// pointer itself together with sizeof(FIRSTPASS_STATS). When OUTPUT_FPF is
+// enabled, the same record is also appended as one text line to
+// "firstpass.stt".
+static void output_stats(FIRSTPASS_STATS *stats,
+                         struct aom_codec_pkt_list *pktlist) {
+  struct aom_codec_cx_pkt pkt;
+  pkt.kind = AOM_CODEC_STATS_PKT;
+  pkt.data.twopass_stats.buf = stats;
+  pkt.data.twopass_stats.sz = sizeof(FIRSTPASS_STATS);
+  aom_codec_pkt_list_add(pktlist, &pkt);
+
+// TEMP debug code
+#if OUTPUT_FPF
+  {
+    FILE *fpfile = fopen("firstpass.stt", "a");
+
+    // The dump is a best-effort debugging aid: skip it when the file cannot
+    // be opened rather than passing a NULL stream to fprintf().
+    if (fpfile != NULL) {
+      // One conversion per argument (22 in total). The trailing specifier
+      // prints `duration`, which previously had no matching conversion and
+      // was silently dropped.
+      fprintf(fpfile,
+              "%12.0lf %12.4lf %12.0lf %12.0lf %12.0lf %12.4lf %12.4lf"
+              "%12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.4lf"
+              "%12.4lf %12.4lf %12.0lf %12.0lf %12.0lf %12.4lf %12.4lf\n",
+              stats->frame, stats->weight, stats->intra_error,
+              stats->coded_error, stats->sr_coded_error, stats->pcnt_inter,
+              stats->pcnt_motion, stats->pcnt_second_ref, stats->pcnt_neutral,
+              stats->intra_skip_pct, stats->inactive_zone_rows,
+              stats->inactive_zone_cols, stats->MVr, stats->mvr_abs,
+              stats->MVc, stats->mvc_abs, stats->MVrv, stats->MVcv,
+              stats->mv_in_out_count, stats->new_mv_count, stats->count,
+              stats->duration);
+      fclose(fpfile);
+    }
+  }
+#endif
+}
+
+#if CONFIG_FP_MB_STATS
+// Queues the per-macroblock first pass stats buffer on pktlist as an
+// AOM_CODEC_FPMB_STATS_PKT. The packet stores the pointer and the byte size
+// (stats_size entries), not a copy of the data.
+static void output_fpmb_stats(uint8_t *this_frame_mb_stats, int stats_size,
+                              struct aom_codec_pkt_list *pktlist) {
+  struct aom_codec_cx_pkt mb_pkt;
+
+  mb_pkt.kind = AOM_CODEC_FPMB_STATS_PKT;
+  mb_pkt.data.firstpass_mb_stats.sz = stats_size * sizeof(*this_frame_mb_stats);
+  mb_pkt.data.firstpass_mb_stats.buf = this_frame_mb_stats;
+
+  aom_codec_pkt_list_add(pktlist, &mb_pkt);
+}
+#endif
+
+// Resets a stats record to its starting state: every field listed here is
+// set to 0.0 except `duration`, which starts at 1.0.
+static void zero_stats(FIRSTPASS_STATS *section) {
+  // Frame bookkeeping.
+  section->frame = 0.0;
+  section->weight = 0.0;
+  section->count = 0.0;
+  section->duration = 1.0;
+
+  // Error scores.
+  section->intra_error = 0.0;
+  section->coded_error = 0.0;
+  section->sr_coded_error = 0.0;
+
+  // Block classification percentages and inactive zones.
+  section->pcnt_inter = 0.0;
+  section->pcnt_motion = 0.0;
+  section->pcnt_second_ref = 0.0;
+  section->pcnt_neutral = 0.0;
+  section->intra_skip_pct = 0.0;
+  section->inactive_zone_rows = 0.0;
+  section->inactive_zone_cols = 0.0;
+
+  // Motion vector statistics.
+  section->MVr = 0.0;
+  section->mvr_abs = 0.0;
+  section->MVrv = 0.0;
+  section->MVc = 0.0;
+  section->mvc_abs = 0.0;
+  section->MVcv = 0.0;
+  section->mv_in_out_count = 0.0;
+  section->new_mv_count = 0.0;
+}
+
+// Adds every field of `frame` to the matching field of `section`.
+static void accumulate_stats(FIRSTPASS_STATS *section,
+                             const FIRSTPASS_STATS *frame) {
+  // Frame bookkeeping.
+  section->frame += frame->frame;
+  section->weight += frame->weight;
+  section->count += frame->count;
+  section->duration += frame->duration;
+
+  // Error scores.
+  section->intra_error += frame->intra_error;
+  section->coded_error += frame->coded_error;
+  section->sr_coded_error += frame->sr_coded_error;
+
+  // Block classification percentages and inactive zones.
+  section->pcnt_inter += frame->pcnt_inter;
+  section->pcnt_motion += frame->pcnt_motion;
+  section->pcnt_second_ref += frame->pcnt_second_ref;
+  section->pcnt_neutral += frame->pcnt_neutral;
+  section->intra_skip_pct += frame->intra_skip_pct;
+  section->inactive_zone_rows += frame->inactive_zone_rows;
+  section->inactive_zone_cols += frame->inactive_zone_cols;
+
+  // Motion vector statistics.
+  section->MVr += frame->MVr;
+  section->mvr_abs += frame->mvr_abs;
+  section->MVrv += frame->MVrv;
+  section->MVc += frame->MVc;
+  section->mvc_abs += frame->mvc_abs;
+  section->MVcv += frame->MVcv;
+  section->mv_in_out_count += frame->mv_in_out_count;
+  section->new_mv_count += frame->new_mv_count;
+}
+
+// Subtracts every field of `frame` from the matching field of `section`;
+// the field-by-field inverse of accumulate_stats().
+static void subtract_stats(FIRSTPASS_STATS *section,
+                           const FIRSTPASS_STATS *frame) {
+  // Frame bookkeeping.
+  section->frame -= frame->frame;
+  section->weight -= frame->weight;
+  section->count -= frame->count;
+  section->duration -= frame->duration;
+
+  // Error scores.
+  section->intra_error -= frame->intra_error;
+  section->coded_error -= frame->coded_error;
+  section->sr_coded_error -= frame->sr_coded_error;
+
+  // Block classification percentages and inactive zones.
+  section->pcnt_inter -= frame->pcnt_inter;
+  section->pcnt_motion -= frame->pcnt_motion;
+  section->pcnt_second_ref -= frame->pcnt_second_ref;
+  section->pcnt_neutral -= frame->pcnt_neutral;
+  section->intra_skip_pct -= frame->intra_skip_pct;
+  section->inactive_zone_rows -= frame->inactive_zone_rows;
+  section->inactive_zone_cols -= frame->inactive_zone_cols;
+
+  // Motion vector statistics.
+  section->MVr -= frame->MVr;
+  section->mvr_abs -= frame->mvr_abs;
+  section->MVrv -= frame->MVrv;
+  section->MVc -= frame->MVc;
+  section->mvc_abs -= frame->mvc_abs;
+  section->MVcv -= frame->MVcv;
+  section->mv_in_out_count -= frame->mv_in_out_count;
+  section->new_mv_count -= frame->new_mv_count;
+}
+
+// Calculate the linear size relative to a baseline of 1080P
+// Returns sqrt(frame_area / BASE_SIZE), i.e. 1.0 for a 1920x1080 frame.
+#define BASE_SIZE 2073600.0  // 1920x1080
+static double get_linear_size_factor(const AV1_COMP *cpi) {
+  // Widen before multiplying: the product of two int dimensions can exceed
+  // INT_MAX for very large frames, and signed overflow is undefined.
+  const double this_area = (double)cpi->initial_width * cpi->initial_height;
+  return pow(this_area / BASE_SIZE, 0.5);
+}
+
+// Estimates the fraction of the picture that carries real content. Half of
+// the intra-skip percentage and twice the inactive-row share are discounted,
+// and the result is clamped to [MIN_ACTIVE_AREA, MAX_ACTIVE_AREA].
+#define MIN_ACTIVE_AREA 0.5
+#define MAX_ACTIVE_AREA 1.0
+static double calculate_active_area(const AV1_COMP *cpi,
+                                    const FIRSTPASS_STATS *this_frame) {
+  // Flat (near zero energy) blocks only count half against the active area.
+  const double flat_share = this_frame->intra_skip_pct / 2;
+  // Inactive rows are doubled before being related to the MB row count.
+  const double bar_share =
+      (this_frame->inactive_zone_rows * 2) / (double)cpi->common.mb_rows;
+  const double active_pct = 1.0 - (flat_share + bar_share);
+
+  return fclamp(active_pct, MIN_ACTIVE_AREA, MAX_ACTIVE_AREA);
+}
+
+// Computes the modified error used to distribute bits between easier and
+// harder frames. The frame's weighted coded error is expressed relative to
+// the clip average, raised to two_pass_vbrbias / 100, scaled back by the
+// average, corrected for active area, and finally clamped to
+// [modified_error_min, modified_error_max].
+#define ACT_AREA_CORRECTION 0.5
+static double calculate_modified_err(const AV1_COMP *cpi,
+                                     const TWO_PASS *twopass,
+                                     const AV1EncoderConfig *oxcf,
+                                     const FIRSTPASS_STATS *this_frame) {
+  const FIRSTPASS_STATS *const totals = &twopass->total_stats;
+  const double mean_weight = totals->weight / totals->count;
+  const double mean_err = (totals->coded_error * mean_weight) / totals->count;
+
+  const double frame_err = this_frame->coded_error * this_frame->weight;
+  const double bias_exponent = oxcf->two_pass_vbrbias / 100.0;
+  const double err_term =
+      pow(frame_err / DOUBLE_DIVIDE_CHECK(mean_err), bias_exponent);
+
+  // Frames with a reduced active area (e.g. due to formatting bars) have a
+  // higher error per MB for the remaining active MBs. The correction assumes
+  // that coding 0.5N blocks of complexity 2X is a little easier than coding
+  // N blocks of complexity X.
+  const double area_term =
+      pow(calculate_active_area(cpi, this_frame), ACT_AREA_CORRECTION);
+
+  return fclamp(mean_err * err_term * area_term, twopass->modified_error_min,
+                twopass->modified_error_max);
+}
+
+// Returns the maximum target rate per frame: two_pass_vbrmax_section percent
+// of the average frame bandwidth, floored at 0 and capped at
+// rc->max_frame_bandwidth.
+static int frame_max_bits(const RATE_CONTROL *rc,
+                          const AV1EncoderConfig *oxcf) {
+  // The product is formed in 64 bits before the percentage division.
+  const int64_t scaled = (int64_t)rc->avg_frame_bandwidth *
+                         (int64_t)oxcf->two_pass_vbrmax_section;
+  const int64_t max_bits = scaled / 100;
+
+  if (max_bits < 0) return 0;
+  if (max_bits > rc->max_frame_bandwidth) return (int)rc->max_frame_bandwidth;
+  return (int)max_bits;
+}
+
+// Prepares first pass state by resetting the running totals in
+// cpi->twopass.total_stats (see zero_stats()).
+void av1_init_first_pass(AV1_COMP *cpi) {
+  zero_stats(&cpi->twopass.total_stats);
+}
+
+// Finishes the first pass by emitting the accumulated totals in
+// cpi->twopass.total_stats as a stats packet on cpi->output_pkt_list.
+void av1_end_first_pass(AV1_COMP *cpi) {
+  output_stats(&cpi->twopass.total_stats, cpi->output_pkt_list);
+}
+
+// Maps a block size to the matching 8-bit MSE kernel. Any size other than
+// 8x8, 16x8 or 8x16 gets the 16x16 kernel.
+static aom_variance_fn_t get_block_variance_fn(BLOCK_SIZE bsize) {
+  if (bsize == BLOCK_8X8) return aom_mse8x8;
+  if (bsize == BLOCK_16X8) return aom_mse16x8;
+  if (bsize == BLOCK_8X16) return aom_mse8x16;
+  return aom_mse16x16;
+}
+
+// Runs the MSE kernel selected for bsize on the src and ref buffers and
+// returns the SSE it reports; the kernel's own return value is ignored.
+static unsigned int get_prediction_error(BLOCK_SIZE bsize,
+                                         const struct buf_2d *src,
+                                         const struct buf_2d *ref) {
+  unsigned int sse;
+  get_block_variance_fn(bsize)(src->buf, src->stride, ref->buf, ref->stride,
+                               &sse);
+  return sse;
+}
+
+#if CONFIG_HIGHBITDEPTH
+// Maps a block size and bit depth to the matching high-bitdepth MSE kernel.
+// Depths other than 10 and 12 use the 8-bit kernels; block sizes other than
+// 8x8, 16x8 and 8x16 use the 16x16 kernel.
+static aom_variance_fn_t highbd_get_block_variance_fn(BLOCK_SIZE bsize,
+                                                      int bd) {
+  if (bd == 10) {
+    if (bsize == BLOCK_8X8) return aom_highbd_10_mse8x8;
+    if (bsize == BLOCK_16X8) return aom_highbd_10_mse16x8;
+    if (bsize == BLOCK_8X16) return aom_highbd_10_mse8x16;
+    return aom_highbd_10_mse16x16;
+  }
+
+  if (bd == 12) {
+    if (bsize == BLOCK_8X8) return aom_highbd_12_mse8x8;
+    if (bsize == BLOCK_16X8) return aom_highbd_12_mse16x8;
+    if (bsize == BLOCK_8X16) return aom_highbd_12_mse8x16;
+    return aom_highbd_12_mse16x16;
+  }
+
+  // Every remaining depth falls back to the 8-bit variants.
+  if (bsize == BLOCK_8X8) return aom_highbd_8_mse8x8;
+  if (bsize == BLOCK_16X8) return aom_highbd_8_mse16x8;
+  if (bsize == BLOCK_8X16) return aom_highbd_8_mse8x16;
+  return aom_highbd_8_mse16x16;
+}
+
+// High-bitdepth variant of get_prediction_error(): runs the MSE kernel chosen
+// for bsize and bd on the src and ref buffers and returns the reported SSE.
+static unsigned int highbd_get_prediction_error(BLOCK_SIZE bsize,
+                                                const struct buf_2d *src,
+                                                const struct buf_2d *ref,
+                                                int bd) {
+  unsigned int sse;
+  highbd_get_block_variance_fn(bsize, bd)(src->buf, src->stride, ref->buf,
+                                          ref->stride, &sse);
+  return sse;
+}
+#endif // CONFIG_HIGHBITDEPTH
+
+// Derives a search range adjustment from the frame size for the first pass:
+// the smallest shift sr for which the shorter initial frame dimension,
+// shifted left by sr, is no longer below MAX_FULL_PEL_VAL.
+static int get_search_range(const AV1_COMP *cpi) {
+  const int dim = AOMMIN(cpi->initial_width, cpi->initial_height);
+  int sr;
+
+  for (sr = 0; (dim << sr) < MAX_FULL_PEL_VAL; ++sr) {
+  }
+  return sr;
+}
+
+// Runs the first pass motion search for the current block around ref_mv.
+// A series of diamond searches is performed; whenever a candidate scores
+// below *best_motion_err, both *best_motion_err and *best_mv are updated.
+// Candidates are scored with an MSE based variance function plus
+// NEW_MV_MODE_PENALTY.
+static void first_pass_motion_search(AV1_COMP *cpi, MACROBLOCK *x,
+                                     const MV *ref_mv, MV *best_mv,
+                                     int *best_motion_err) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MV tmp_mv = { 0, 0 };
+  // Reference MV with the low 3 bits dropped (presumably 1/8 pel units
+  // reduced to full pel - confirm against the MV definition).
+  MV ref_mv_full = { ref_mv->row >> 3, ref_mv->col >> 3 };
+  int num00, tmp_err, n;
+  const BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
+  // Local copy so the variance function can be overridden without touching
+  // the shared cpi->fn_ptr table.
+  aom_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[bsize];
+  const int new_mv_mode_penalty = NEW_MV_MODE_PENALTY;
+
+  // The starting step grows, and the number of follow-up searches shrinks,
+  // by the frame-size dependent amount from get_search_range().
+  int step_param = 3;
+  int further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param;
+  const int sr = get_search_range(cpi);
+  step_param += sr;
+  further_steps -= sr;
+
+  // Override the default variance function to use MSE.
+  v_fn_ptr.vf = get_block_variance_fn(bsize);
+#if CONFIG_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    v_fn_ptr.vf = highbd_get_block_variance_fn(bsize, xd->bd);
+  }
+#endif  // CONFIG_HIGHBITDEPTH
+
+  // Center the initial step/diamond search on best mv.
+  tmp_err = cpi->diamond_search_sad(x, &cpi->ss_cfg, &ref_mv_full, &tmp_mv,
+                                    step_param, x->sadperbit16, &num00,
+                                    &v_fn_ptr, ref_mv);
+  // INT_MAX is passed through untouched; any other result is re-scored with
+  // the variance function.
+  if (tmp_err < INT_MAX)
+    tmp_err = av1_get_mvpred_var(x, &tmp_mv, ref_mv, &v_fn_ptr, 1);
+  // Add the new-MV penalty only when the sum cannot overflow INT_MAX.
+  if (tmp_err < INT_MAX - new_mv_mode_penalty) tmp_err += new_mv_mode_penalty;
+
+  if (tmp_err < *best_motion_err) {
+    *best_motion_err = tmp_err;
+    *best_mv = tmp_mv;
+  }
+
+  // Carry out further step/diamond searches as necessary.
+  // The first search's num00 is used as the starting step count.
+  n = num00;
+  num00 = 0;
+
+  while (n < further_steps) {
+    ++n;
+
+    // A non-zero num00 from the previous search causes that many steps to
+    // be skipped (presumably steps that would repeat the same result -
+    // confirm against the diamond search implementation).
+    if (num00) {
+      --num00;
+    } else {
+      tmp_err = cpi->diamond_search_sad(x, &cpi->ss_cfg, &ref_mv_full, &tmp_mv,
+                                        step_param + n, x->sadperbit16, &num00,
+                                        &v_fn_ptr, ref_mv);
+      if (tmp_err < INT_MAX)
+        tmp_err = av1_get_mvpred_var(x, &tmp_mv, ref_mv, &v_fn_ptr, 1);
+      if (tmp_err < INT_MAX - new_mv_mode_penalty)
+        tmp_err += new_mv_mode_penalty;
+
+      if (tmp_err < *best_motion_err) {
+        *best_motion_err = tmp_err;
+        *best_mv = tmp_mv;
+      }
+    }
+  }
+}
+
+// Picks the first pass block size for the macroblock at (mb_row, mb_col).
+// 16x16 is used unless the macroblock's second 8-sample half starts at or
+// beyond cm->mi_cols / cm->mi_rows, in which case that dimension drops to 8.
+static BLOCK_SIZE get_bsize(const AV1_COMMON *cm, int mb_row, int mb_col) {
+  const int mb_step = mi_size_wide[BLOCK_16X16];
+  const int half_step = mi_size_wide[BLOCK_8X8];
+  const int full_width = mb_step * mb_col + half_step < cm->mi_cols;
+  const int full_height = mb_step * mb_row + half_step < cm->mi_rows;
+
+  if (full_width) return full_height ? BLOCK_16X16 : BLOCK_16X8;
+  return full_height ? BLOCK_8X16 : BLOCK_8X8;
+}
+
+// Returns the lowest qindex whose quantizer value at bit_depth reaches
+// FIRST_PASS_Q, or the last valid index (QINDEX_RANGE - 1) if none does.
+static int find_fp_qindex(aom_bit_depth_t bit_depth) {
+  int qindex;
+
+  for (qindex = 0; qindex < QINDEX_RANGE; ++qindex) {
+    if (av1_convert_qindex_to_q(qindex, bit_depth) >= FIRST_PASS_Q) {
+      return qindex;
+    }
+  }
+
+  return QINDEX_RANGE - 1;
+}
+
+// Selects the frame type for the first pass and disables periodic key
+// frames. A frame is a key frame when it is not an alt-ref refresh and is
+// either the very first frame or flagged with FRAMEFLAGS_KEY.
+static void set_first_pass_params(AV1_COMP *cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  const int key_requested =
+      cm->current_video_frame == 0 || (cpi->frame_flags & FRAMEFLAGS_KEY);
+
+  cm->frame_type =
+      (!cpi->refresh_alt_ref_frame && key_requested) ? KEY_FRAME : INTER_FRAME;
+
+  // Do not use periodic key frames.
+  cpi->rc.frames_to_key = INT_MAX;
+}
+
+#define UL_INTRA_THRESH 50
+#define INVALID_ROW -1
+void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) {
+ int mb_row, mb_col;
+ MACROBLOCK *const x = &cpi->td.mb;
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ TileInfo tile;
+ struct macroblock_plane *const p = x->plane;
+ struct macroblockd_plane *const pd = xd->plane;
+ const PICK_MODE_CONTEXT *ctx =
+ &cpi->td.pc_root[MAX_MIB_SIZE_LOG2 - MIN_MIB_SIZE_LOG2]->none;
+ int i;
+
+ int recon_yoffset, recon_uvoffset;
+ int64_t intra_error = 0;
+ int64_t coded_error = 0;
+ int64_t sr_coded_error = 0;
+
+ int sum_mvr = 0, sum_mvc = 0;
+ int sum_mvr_abs = 0, sum_mvc_abs = 0;
+ int64_t sum_mvrs = 0, sum_mvcs = 0;
+ int mvcount = 0;
+ int intercount = 0;
+ int second_ref_count = 0;
+ const int intrapenalty = INTRA_MODE_PENALTY;
+ double neutral_count;
+ int intra_skip_count = 0;
+ int image_data_start_row = INVALID_ROW;
+ int new_mv_count = 0;
+ int sum_in_vectors = 0;
+ MV lastmv = { 0, 0 };
+ TWO_PASS *twopass = &cpi->twopass;
+ const MV zero_mv = { 0, 0 };
+ int recon_y_stride, recon_uv_stride, uv_mb_height;
+
+ YV12_BUFFER_CONFIG *const lst_yv12 = get_ref_frame_buffer(cpi, LAST_FRAME);
+ YV12_BUFFER_CONFIG *gld_yv12 = get_ref_frame_buffer(cpi, GOLDEN_FRAME);
+ YV12_BUFFER_CONFIG *const new_yv12 = get_frame_new_buffer(cm);
+ const YV12_BUFFER_CONFIG *first_ref_buf = lst_yv12;
+ double intra_factor;
+ double brightness_factor;
+ BufferPool *const pool = cm->buffer_pool;
+ const int qindex = find_fp_qindex(cm->bit_depth);
+ const int mb_scale = mi_size_wide[BLOCK_16X16];
+#if CONFIG_PVQ
+ PVQ_QUEUE pvq_q;
+ od_adapt_ctx pvq_context;
+#endif
+
+ // First pass code requires valid last and new frame buffers.
+ assert(new_yv12 != NULL);
+ assert(frame_is_intra_only(cm) || (lst_yv12 != NULL));
+
+#if CONFIG_FP_MB_STATS
+ if (cpi->use_fp_mb_stats) {
+ av1_zero_array(cpi->twopass.frame_mb_stats_buf, cpi->initial_mbs);
+ }
+#endif
+
+ aom_clear_system_state();
+
+ xd->mi = cm->mi_grid_visible;
+ xd->mi[0] = cm->mi;
+ x->e_mbd.mi[0]->mbmi.sb_type = BLOCK_16X16;
+
+ intra_factor = 0.0;
+ brightness_factor = 0.0;
+ neutral_count = 0.0;
+
+ set_first_pass_params(cpi);
+ av1_set_quantizer(cm, qindex);
+
+ av1_setup_block_planes(&x->e_mbd, cm->subsampling_x, cm->subsampling_y);
+
+ av1_setup_src_planes(x, cpi->source, 0, 0);
+ av1_setup_dst_planes(xd->plane, cm->sb_size, new_yv12, 0, 0);
+
+ if (!frame_is_intra_only(cm)) {
+ av1_setup_pre_planes(xd, 0, first_ref_buf, 0, 0, NULL);
+ }
+
+ xd->mi = cm->mi_grid_visible;
+ xd->mi[0] = cm->mi;
+
+#if CONFIG_CFL
+ // Don't store luma on the first pass since chroma is not computed
+ x->cfl_store_y = 0;
+#endif
+ av1_frame_init_quantizer(cpi);
+
+#if CONFIG_PVQ
+ // For pass 1 of 2-pass encoding, init here for PVQ for now.
+ {
+ pvq_q.buf_len = 5000;
+ CHECK_MEM_ERROR(cm, pvq_q.buf,
+ aom_malloc(pvq_q.buf_len * sizeof(PVQ_INFO)));
+ pvq_q.curr_pos = 0;
+ x->pvq_coded = 0;
+
+ x->pvq_q = &pvq_q;
+
+ // TODO(yushin): Since this init step is also called in 2nd pass,
+ // or 1-pass encoding, consider factoring out it as a function.
+ // TODO(yushin)
+ // If activity masking is enabled, change below to OD_HVS_QM
+ x->daala_enc.qm = OD_FLAT_QM; // Hard coded. Enc/dec required to sync.
+ x->daala_enc.pvq_norm_lambda = OD_PVQ_LAMBDA;
+ x->daala_enc.pvq_norm_lambda_dc = OD_PVQ_LAMBDA;
+
+ od_init_qm(x->daala_enc.state.qm, x->daala_enc.state.qm_inv,
+ x->daala_enc.qm == OD_HVS_QM ? OD_QM8_Q4_HVS : OD_QM8_Q4_FLAT);
+#if CONFIG_DAALA_EC
+ od_ec_enc_init(&x->daala_enc.w.ec, 65025);
+#else
+#error "CONFIG_PVQ currently requires CONFIG_DAALA_EC."
+#endif
+
+#if CONFIG_DAALA_EC
+ od_ec_enc_reset(&x->daala_enc.w.ec);
+#else
+#error "CONFIG_PVQ currently requires CONFIG_DAALA_EC."
+#endif
+ }
+#endif
+
+ for (i = 0; i < MAX_MB_PLANE; ++i) {
+ p[i].coeff = ctx->coeff[i];
+ p[i].qcoeff = ctx->qcoeff[i];
+ pd[i].dqcoeff = ctx->dqcoeff[i];
+#if CONFIG_PVQ
+ pd[i].pvq_ref_coeff = ctx->pvq_ref_coeff[i];
+#endif
+ p[i].eobs = ctx->eobs[i];
+#if CONFIG_LV_MAP
+ p[i].txb_entropy_ctx = ctx->txb_entropy_ctx[i];
+#endif
+ }
+
+ av1_init_mv_probs(cm);
+#if CONFIG_ADAPT_SCAN
+ av1_init_scan_order(cm);
+#endif
+ av1_convolve_init(cm);
+#if CONFIG_PVQ
+ od_adapt_ctx_reset(&pvq_context, 0);
+ x->daala_enc.state.adapt = &pvq_context;
+#endif // CONFIG_PVQ
+ av1_initialize_rd_consts(cpi);
+
+ // Tiling is ignored in the first pass.
+ av1_tile_init(&tile, cm, 0, 0);
+
+ recon_y_stride = new_yv12->y_stride;
+ recon_uv_stride = new_yv12->uv_stride;
+ uv_mb_height = 16 >> (new_yv12->y_height > new_yv12->uv_height);
+
+ for (mb_row = 0; mb_row < cm->mb_rows; ++mb_row) {
+ MV best_ref_mv = { 0, 0 };
+
+ // Reset above block coeffs.
+ xd->up_available = (mb_row != 0);
+ recon_yoffset = (mb_row * recon_y_stride * 16);
+ recon_uvoffset = (mb_row * recon_uv_stride * uv_mb_height);
+
+ // Set up limit values for motion vectors to prevent them extending
+ // outside the UMV borders.
+ x->mv_limits.row_min = -((mb_row * 16) + BORDER_MV_PIXELS_B16);
+ x->mv_limits.row_max =
+ ((cm->mb_rows - 1 - mb_row) * 16) + BORDER_MV_PIXELS_B16;
+
+ for (mb_col = 0; mb_col < cm->mb_cols; ++mb_col) {
+ int this_error;
+ const int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row);
+ const BLOCK_SIZE bsize = get_bsize(cm, mb_row, mb_col);
+ double log_intra;
+ int level_sample;
+
+#if CONFIG_FP_MB_STATS
+ const int mb_index = mb_row * cm->mb_cols + mb_col;
+#endif
+
+ aom_clear_system_state();
+
+ xd->plane[0].dst.buf = new_yv12->y_buffer + recon_yoffset;
+ xd->plane[1].dst.buf = new_yv12->u_buffer + recon_uvoffset;
+ xd->plane[2].dst.buf = new_yv12->v_buffer + recon_uvoffset;
+ xd->left_available = (mb_col != 0);
+ xd->mi[0]->mbmi.sb_type = bsize;
+ xd->mi[0]->mbmi.ref_frame[0] = INTRA_FRAME;
+ set_mi_row_col(xd, &tile, mb_row * mb_scale, mi_size_high[bsize],
+ mb_col * mb_scale, mi_size_wide[bsize],
+#if CONFIG_DEPENDENT_HORZTILES
+ cm->dependent_horz_tiles,
+#endif // CONFIG_DEPENDENT_HORZTILES
+ cm->mi_rows, cm->mi_cols);
+
+ set_plane_n4(xd, mi_size_wide[bsize], mi_size_high[bsize]);
+
+ // Do intra 16x16 prediction.
+ xd->mi[0]->mbmi.segment_id = 0;
+#if CONFIG_SUPERTX
+ xd->mi[0]->mbmi.segment_id_supertx = 0;
+#endif // CONFIG_SUPERTX
+ xd->lossless[xd->mi[0]->mbmi.segment_id] = (qindex == 0);
+ xd->mi[0]->mbmi.mode = DC_PRED;
+ xd->mi[0]->mbmi.tx_size =
+ use_dc_pred ? (bsize >= BLOCK_16X16 ? TX_16X16 : TX_8X8) : TX_4X4;
+ av1_encode_intra_block_plane(cm, x, bsize, 0, 0, mb_row * 2, mb_col * 2);
+ this_error = aom_get_mb_ss(x->plane[0].src_diff);
+
+ // Keep a record of blocks that have almost no intra error residual
+ // (i.e. are in effect completely flat and untextured in the intra
+ // domain). In natural videos this is uncommon, but it is much more
+ // common in animations, graphics and screen content, so may be used
+ // as a signal to detect these types of content.
+ if (this_error < UL_INTRA_THRESH) {
+ ++intra_skip_count;
+ } else if ((mb_col > 0) && (image_data_start_row == INVALID_ROW)) {
+ image_data_start_row = mb_row;
+ }
+
+#if CONFIG_HIGHBITDEPTH
+ if (cm->use_highbitdepth) {
+ switch (cm->bit_depth) {
+ case AOM_BITS_8: break;
+ case AOM_BITS_10: this_error >>= 4; break;
+ case AOM_BITS_12: this_error >>= 8; break;
+ default:
+ assert(0 &&
+ "cm->bit_depth should be AOM_BITS_8, "
+ "AOM_BITS_10 or AOM_BITS_12");
+ return;
+ }
+ }
+#endif // CONFIG_HIGHBITDEPTH
+
+ aom_clear_system_state();
+ log_intra = log(this_error + 1.0);
+ if (log_intra < 10.0)
+ intra_factor += 1.0 + ((10.0 - log_intra) * 0.05);
+ else
+ intra_factor += 1.0;
+
+#if CONFIG_HIGHBITDEPTH
+ if (cm->use_highbitdepth)
+ level_sample = CONVERT_TO_SHORTPTR(x->plane[0].src.buf)[0];
+ else
+ level_sample = x->plane[0].src.buf[0];
+#else
+ level_sample = x->plane[0].src.buf[0];
+#endif
+ if ((level_sample < DARK_THRESH) && (log_intra < 9.0))
+ brightness_factor += 1.0 + (0.01 * (DARK_THRESH - level_sample));
+ else
+ brightness_factor += 1.0;
+
+ // Intrapenalty below deals with situations where the intra and inter
+ // error scores are very low (e.g. a plain black frame).
+ // We do not have special cases in first pass for 0,0 and nearest etc so
+ // all inter modes carry an overhead cost estimate for the mv.
+ // When the error score is very low this causes us to pick all or lots of
+ // INTRA modes and throw lots of key frames.
+ // This penalty adds a cost matching that of a 0,0 mv to the intra case.
+ this_error += intrapenalty;
+
+ // Accumulate the intra error.
+ intra_error += (int64_t)this_error;
+
+#if CONFIG_FP_MB_STATS
+ if (cpi->use_fp_mb_stats) {
+ // initialization
+ cpi->twopass.frame_mb_stats_buf[mb_index] = 0;
+ }
+#endif
+
+ // Set up limit values for motion vectors to prevent them extending
+ // outside the UMV borders.
+ x->mv_limits.col_min = -((mb_col * 16) + BORDER_MV_PIXELS_B16);
+ x->mv_limits.col_max =
+ ((cm->mb_cols - 1 - mb_col) * 16) + BORDER_MV_PIXELS_B16;
+
+ if (!frame_is_intra_only(cm)) { // Do a motion search
+ int tmp_err, motion_error, raw_motion_error;
+ // Assume 0,0 motion with no mv overhead.
+ MV mv = { 0, 0 }, tmp_mv = { 0, 0 };
+ struct buf_2d unscaled_last_source_buf_2d;
+
+ xd->plane[0].pre[0].buf = first_ref_buf->y_buffer + recon_yoffset;
+#if CONFIG_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ motion_error = highbd_get_prediction_error(
+ bsize, &x->plane[0].src, &xd->plane[0].pre[0], xd->bd);
+ } else {
+ motion_error = get_prediction_error(bsize, &x->plane[0].src,
+ &xd->plane[0].pre[0]);
+ }
+#else
+ motion_error =
+ get_prediction_error(bsize, &x->plane[0].src, &xd->plane[0].pre[0]);
+#endif // CONFIG_HIGHBITDEPTH
+
+ // Compute the motion error of the 0,0 motion using the last source
+ // frame as the reference. Skip the further motion search on
+ // reconstructed frame if this error is small.
+ unscaled_last_source_buf_2d.buf =
+ cpi->unscaled_last_source->y_buffer + recon_yoffset;
+ unscaled_last_source_buf_2d.stride =
+ cpi->unscaled_last_source->y_stride;
+#if CONFIG_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ raw_motion_error = highbd_get_prediction_error(
+ bsize, &x->plane[0].src, &unscaled_last_source_buf_2d, xd->bd);
+ } else {
+ raw_motion_error = get_prediction_error(bsize, &x->plane[0].src,
+ &unscaled_last_source_buf_2d);
+ }
+#else
+ raw_motion_error = get_prediction_error(bsize, &x->plane[0].src,
+ &unscaled_last_source_buf_2d);
+#endif // CONFIG_HIGHBITDEPTH
+
+ // TODO(pengchong): Replace the hard-coded threshold
+ if (raw_motion_error > 25) {
+ // Test last reference frame using the previous best mv as the
+ // starting point (best reference) for the search.
+ first_pass_motion_search(cpi, x, &best_ref_mv, &mv, &motion_error);
+
+ // If the current best reference mv is not centered on 0,0 then do a
+ // 0,0 based search as well.
+ if (!is_zero_mv(&best_ref_mv)) {
+ tmp_err = INT_MAX;
+ first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv, &tmp_err);
+
+ if (tmp_err < motion_error) {
+ motion_error = tmp_err;
+ mv = tmp_mv;
+ }
+ }
+
+ // Search in an older reference frame.
+ if ((cm->current_video_frame > 1) && gld_yv12 != NULL) {
+ // Assume 0,0 motion with no mv overhead.
+ int gf_motion_error;
+
+ xd->plane[0].pre[0].buf = gld_yv12->y_buffer + recon_yoffset;
+#if CONFIG_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ gf_motion_error = highbd_get_prediction_error(
+ bsize, &x->plane[0].src, &xd->plane[0].pre[0], xd->bd);
+ } else {
+ gf_motion_error = get_prediction_error(bsize, &x->plane[0].src,
+ &xd->plane[0].pre[0]);
+ }
+#else
+ gf_motion_error = get_prediction_error(bsize, &x->plane[0].src,
+ &xd->plane[0].pre[0]);
+#endif // CONFIG_HIGHBITDEPTH
+
+ first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv,
+ &gf_motion_error);
+
+ if (gf_motion_error < motion_error && gf_motion_error < this_error)
+ ++second_ref_count;
+
+ // Reset to last frame as reference buffer.
+ xd->plane[0].pre[0].buf = first_ref_buf->y_buffer + recon_yoffset;
+ xd->plane[1].pre[0].buf = first_ref_buf->u_buffer + recon_uvoffset;
+ xd->plane[2].pre[0].buf = first_ref_buf->v_buffer + recon_uvoffset;
+
+ // In accumulating a score for the older reference frame take the
+ // best of the motion predicted score and the intra coded error
+ // (just as will be done for) accumulation of "coded_error" for
+ // the last frame.
+ if (gf_motion_error < this_error)
+ sr_coded_error += gf_motion_error;
+ else
+ sr_coded_error += this_error;
+ } else {
+ sr_coded_error += motion_error;
+ }
+ } else {
+ sr_coded_error += motion_error;
+ }
+
+ // Start by assuming that intra mode is best.
+ best_ref_mv.row = 0;
+ best_ref_mv.col = 0;
+
+#if CONFIG_FP_MB_STATS
+ if (cpi->use_fp_mb_stats) {
+ // intra predication statistics
+ cpi->twopass.frame_mb_stats_buf[mb_index] = 0;
+ cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_DCINTRA_MASK;
+ cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_MOTION_ZERO_MASK;
+ if (this_error > FPMB_ERROR_LARGE_TH) {
+ cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_ERROR_LARGE_MASK;
+ } else if (this_error < FPMB_ERROR_SMALL_TH) {
+ cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_ERROR_SMALL_MASK;
+ }
+ }
+#endif
+
+ if (motion_error <= this_error) {
+ aom_clear_system_state();
+
+ // Keep a count of cases where the inter and intra were very close
+ // and very low. This helps with scene cut detection for example in
+ // cropped clips with black bars at the sides or top and bottom.
+ if (((this_error - intrapenalty) * 9 <= motion_error * 10) &&
+ (this_error < (2 * intrapenalty))) {
+ neutral_count += 1.0;
+ // Also track cases where the intra is not much worse than the inter
+ // and use this in limiting the GF/arf group length.
+ } else if ((this_error > NCOUNT_INTRA_THRESH) &&
+ (this_error < (NCOUNT_INTRA_FACTOR * motion_error))) {
+ neutral_count +=
+ (double)motion_error / DOUBLE_DIVIDE_CHECK((double)this_error);
+ }
+
+ mv.row *= 8;
+ mv.col *= 8;
+ this_error = motion_error;
+ xd->mi[0]->mbmi.mode = NEWMV;
+ xd->mi[0]->mbmi.mv[0].as_mv = mv;
+ xd->mi[0]->mbmi.tx_size = TX_4X4;
+ xd->mi[0]->mbmi.ref_frame[0] = LAST_FRAME;
+ xd->mi[0]->mbmi.ref_frame[1] = NONE_FRAME;
+ av1_build_inter_predictors_sby(xd, mb_row * mb_scale,
+ mb_col * mb_scale, NULL, bsize);
+ av1_encode_sby_pass1(cm, x, bsize);
+ sum_mvr += mv.row;
+ sum_mvr_abs += abs(mv.row);
+ sum_mvc += mv.col;
+ sum_mvc_abs += abs(mv.col);
+ sum_mvrs += mv.row * mv.row;
+ sum_mvcs += mv.col * mv.col;
+ ++intercount;
+
+ best_ref_mv = mv;
+
+#if CONFIG_FP_MB_STATS
+ if (cpi->use_fp_mb_stats) {
+ // inter predication statistics
+ cpi->twopass.frame_mb_stats_buf[mb_index] = 0;
+ cpi->twopass.frame_mb_stats_buf[mb_index] &= ~FPMB_DCINTRA_MASK;
+ cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_MOTION_ZERO_MASK;
+ if (this_error > FPMB_ERROR_LARGE_TH) {
+ cpi->twopass.frame_mb_stats_buf[mb_index] |=
+ FPMB_ERROR_LARGE_MASK;
+ } else if (this_error < FPMB_ERROR_SMALL_TH) {
+ cpi->twopass.frame_mb_stats_buf[mb_index] |=
+ FPMB_ERROR_SMALL_MASK;
+ }
+ }
+#endif
+
+ if (!is_zero_mv(&mv)) {
+ ++mvcount;
+
+#if CONFIG_FP_MB_STATS
+ if (cpi->use_fp_mb_stats) {
+ cpi->twopass.frame_mb_stats_buf[mb_index] &=
+ ~FPMB_MOTION_ZERO_MASK;
+ // check estimated motion direction
+ if (mv.col > 0 && mv.col >= abs(mv.row)) {
+ // right direction
+ cpi->twopass.frame_mb_stats_buf[mb_index] |=
+ FPMB_MOTION_RIGHT_MASK;
+ } else if (mv.row < 0 && abs(mv.row) >= abs(mv.col)) {
+ // up direction
+ cpi->twopass.frame_mb_stats_buf[mb_index] |=
+ FPMB_MOTION_UP_MASK;
+ } else if (mv.col < 0 && abs(mv.col) >= abs(mv.row)) {
+ // left direction
+ cpi->twopass.frame_mb_stats_buf[mb_index] |=
+ FPMB_MOTION_LEFT_MASK;
+ } else {
+ // down direction
+ cpi->twopass.frame_mb_stats_buf[mb_index] |=
+ FPMB_MOTION_DOWN_MASK;
+ }
+ }
+#endif
+
+ // Non-zero vector, was it different from the last non zero vector?
+ if (!is_equal_mv(&mv, &lastmv)) ++new_mv_count;
+ lastmv = mv;
+
+ // Does the row vector point inwards or outwards?
+ if (mb_row < cm->mb_rows / 2) {
+ if (mv.row > 0)
+ --sum_in_vectors;
+ else if (mv.row < 0)
+ ++sum_in_vectors;
+ } else if (mb_row > cm->mb_rows / 2) {
+ if (mv.row > 0)
+ ++sum_in_vectors;
+ else if (mv.row < 0)
+ --sum_in_vectors;
+ }
+
+ // Does the col vector point inwards or outwards?
+ if (mb_col < cm->mb_cols / 2) {
+ if (mv.col > 0)
+ --sum_in_vectors;
+ else if (mv.col < 0)
+ ++sum_in_vectors;
+ } else if (mb_col > cm->mb_cols / 2) {
+ if (mv.col > 0)
+ ++sum_in_vectors;
+ else if (mv.col < 0)
+ --sum_in_vectors;
+ }
+ }
+ }
+ } else {
+ sr_coded_error += (int64_t)this_error;
+ }
+ coded_error += (int64_t)this_error;
+
+ // Adjust to the next column of MBs.
+ x->plane[0].src.buf += 16;
+ x->plane[1].src.buf += uv_mb_height;
+ x->plane[2].src.buf += uv_mb_height;
+
+ recon_yoffset += 16;
+ recon_uvoffset += uv_mb_height;
+ }
+
+ // Adjust to the next row of MBs.
+ x->plane[0].src.buf += 16 * x->plane[0].src.stride - 16 * cm->mb_cols;
+ x->plane[1].src.buf +=
+ uv_mb_height * x->plane[1].src.stride - uv_mb_height * cm->mb_cols;
+ x->plane[2].src.buf +=
+ uv_mb_height * x->plane[1].src.stride - uv_mb_height * cm->mb_cols;
+
+ aom_clear_system_state();
+ }
+
+#if CONFIG_PVQ
+#if CONFIG_DAALA_EC
+ od_ec_enc_clear(&x->daala_enc.w.ec);
+#else
+#error "CONFIG_PVQ currently requires CONFIG_DAALA_EC."
+#endif
+
+ x->pvq_q->last_pos = x->pvq_q->curr_pos;
+ x->pvq_q->curr_pos = 0;
+ x->pvq_q = NULL;
+
+ aom_free(pvq_q.buf);
+#endif
+
+ // Clamp the image start to rows/2. This number of rows is discarded top
+ // and bottom as dead data so rows / 2 means the frame is blank.
+ if ((image_data_start_row > cm->mb_rows / 2) ||
+ (image_data_start_row == INVALID_ROW)) {
+ image_data_start_row = cm->mb_rows / 2;
+ }
+ // Exclude any image dead zone
+ if (image_data_start_row > 0) {
+ intra_skip_count =
+ AOMMAX(0, intra_skip_count - (image_data_start_row * cm->mb_cols * 2));
+ }
+
+ {
+ FIRSTPASS_STATS fps;
+ // The minimum error here ensures some bit allocation to frames even
+ // in static regions. The allocation per MB declines for larger formats
+ // where the typical "real" energy per MB also falls.
+ // Initial estimate here uses sqrt(mbs) to define the min_err, where the
+ // number of mbs is proportional to the image area.
+ const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE)
+ ? cpi->initial_mbs
+ : cpi->common.MBs;
+ const double min_err = 200 * sqrt(num_mbs);
+
+ intra_factor = intra_factor / (double)num_mbs;
+ brightness_factor = brightness_factor / (double)num_mbs;
+ fps.weight = intra_factor * brightness_factor;
+
+ fps.frame = cm->current_video_frame;
+ fps.coded_error = (double)(coded_error >> 8) + min_err;
+ fps.sr_coded_error = (double)(sr_coded_error >> 8) + min_err;
+ fps.intra_error = (double)(intra_error >> 8) + min_err;
+ fps.count = 1.0;
+ fps.pcnt_inter = (double)intercount / num_mbs;
+ fps.pcnt_second_ref = (double)second_ref_count / num_mbs;
+ fps.pcnt_neutral = (double)neutral_count / num_mbs;
+ fps.intra_skip_pct = (double)intra_skip_count / num_mbs;
+ fps.inactive_zone_rows = (double)image_data_start_row;
+ fps.inactive_zone_cols = (double)0; // TODO(paulwilkins): fix
+
+ if (mvcount > 0) {
+ fps.MVr = (double)sum_mvr / mvcount;
+ fps.mvr_abs = (double)sum_mvr_abs / mvcount;
+ fps.MVc = (double)sum_mvc / mvcount;
+ fps.mvc_abs = (double)sum_mvc_abs / mvcount;
+ fps.MVrv =
+ ((double)sum_mvrs - ((double)sum_mvr * sum_mvr / mvcount)) / mvcount;
+ fps.MVcv =
+ ((double)sum_mvcs - ((double)sum_mvc * sum_mvc / mvcount)) / mvcount;
+ fps.mv_in_out_count = (double)sum_in_vectors / (mvcount * 2);
+ fps.new_mv_count = new_mv_count;
+ fps.pcnt_motion = (double)mvcount / num_mbs;
+ } else {
+ fps.MVr = 0.0;
+ fps.mvr_abs = 0.0;
+ fps.MVc = 0.0;
+ fps.mvc_abs = 0.0;
+ fps.MVrv = 0.0;
+ fps.MVcv = 0.0;
+ fps.mv_in_out_count = 0.0;
+ fps.new_mv_count = 0.0;
+ fps.pcnt_motion = 0.0;
+ }
+
+ // TODO(paulwilkins): Handle the case when duration is set to 0, or
+ // something less than the full time between subsequent values of
+ // cpi->source_time_stamp.
+ fps.duration = (double)(source->ts_end - source->ts_start);
+
+ // Don't want to do output stats with a stack variable!
+ twopass->this_frame_stats = fps;
+ output_stats(&twopass->this_frame_stats, cpi->output_pkt_list);
+ accumulate_stats(&twopass->total_stats, &fps);
+
+#if CONFIG_FP_MB_STATS
+ if (cpi->use_fp_mb_stats) {
+ output_fpmb_stats(twopass->frame_mb_stats_buf, cpi->initial_mbs,
+ cpi->output_pkt_list);
+ }
+#endif
+ }
+
+ // Copy the previous Last Frame back into gf and arf buffers if
+ // the prediction is good enough... but also don't allow it to lag too far.
+ if ((twopass->sr_update_lag > 3) ||
+ ((cm->current_video_frame > 0) &&
+ (twopass->this_frame_stats.pcnt_inter > 0.20) &&
+ ((twopass->this_frame_stats.intra_error /
+ DOUBLE_DIVIDE_CHECK(twopass->this_frame_stats.coded_error)) > 2.0))) {
+ if (gld_yv12 != NULL) {
+#if CONFIG_EXT_REFS
+ ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->gld_fb_idx],
+ cm->ref_frame_map[cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME]]);
+#else
+ ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->gld_fb_idx],
+ cm->ref_frame_map[cpi->lst_fb_idx]);
+#endif // CONFIG_EXT_REFS
+ }
+ twopass->sr_update_lag = 1;
+ } else {
+ ++twopass->sr_update_lag;
+ }
+
+ aom_extend_frame_borders(new_yv12);
+
+// The frame we just compressed now becomes the last frame.
+#if CONFIG_EXT_REFS
+ ref_cnt_fb(pool->frame_bufs,
+ &cm->ref_frame_map[cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME]],
+ cm->new_fb_idx);
+#else
+ ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->lst_fb_idx],
+ cm->new_fb_idx);
+#endif // CONFIG_EXT_REFS
+
+ // Special case for the first frame. Copy into the GF buffer as a second
+ // reference.
+ if (cm->current_video_frame == 0 && cpi->gld_fb_idx != INVALID_IDX) {
+#if CONFIG_EXT_REFS
+ ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->gld_fb_idx],
+ cm->ref_frame_map[cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME]]);
+#else
+ ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->gld_fb_idx],
+ cm->ref_frame_map[cpi->lst_fb_idx]);
+#endif // CONFIG_EXT_REFS
+ }
+
+ // Use this to see what the first pass reconstruction looks like.
+ if (0) {
+ char filename[512];
+ FILE *recon_file;
+ snprintf(filename, sizeof(filename), "enc%04d.yuv",
+ (int)cm->current_video_frame);
+
+ if (cm->current_video_frame == 0)
+ recon_file = fopen(filename, "wb");
+ else
+ recon_file = fopen(filename, "ab");
+
+ (void)fwrite(lst_yv12->buffer_alloc, lst_yv12->frame_size, 1, recon_file);
+ fclose(recon_file);
+ }
+
+ ++cm->current_video_frame;
+}
+
+// Returns pow(err_per_mb / err_divisor, power) passed through
+// fclamp(..., 0.05, 5.0). The exponent is the quantizer value for qindex q
+// scaled by 0.01 and offset by pt_low, capped at pt_high.
+static double calc_correction_factor(double err_per_mb, double err_divisor,
+                                     double pt_low, double pt_high, int q,
+                                     aom_bit_depth_t bit_depth) {
+  const double error_term = err_per_mb / err_divisor;
+
+  // Quantizer-dependent exponent before the pt_high cap is applied.
+  const double uncapped_power =
+      av1_convert_qindex_to_q(q, bit_depth) * 0.01 + pt_low;
+  const double power_term = AOMMIN(uncapped_power, pt_high);
+
+  // A negative base is only tolerated when the exponent is not below 1.0.
+  assert(!(power_term < 1.0) || error_term >= 0.0);
+
+  return fclamp(pow(error_term, power_term), 0.05, 5.0);
+}
+
+#define ERR_DIVISOR 100.0
+// Estimates the maximum q index needed to code a section at the given rate.
+//
+// section_err               error score for the section; it is divided by
+//                           the number of active macroblocks.
+// inactive_zone             fraction of the frame treated as inactive; it is
+//                           clamped to [0.0, 1.0] before use.
+// section_target_bandwidth  bit budget; values <= 0 return
+//                           rc->worst_quality immediately.
+// group_weight_factor       extra multiplier applied to the correction
+//                           factor passed to av1_rc_bits_per_mb().
+//
+// Returns the first q in [rc->best_quality, rc->worst_quality) whose
+// estimated bits per MB do not exceed the normalized target, or the loop's
+// end value when none qualifies. In AOM_CQ mode the result is raised to at
+// least oxcf->cq_level.
+static int get_twopass_worst_quality(const AV1_COMP *cpi,
+                                     const double section_err,
+                                     double inactive_zone,
+                                     int section_target_bandwidth,
+                                     double group_weight_factor) {
+  const RATE_CONTROL *const rc = &cpi->rc;
+  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+
+  inactive_zone = fclamp(inactive_zone, 0.0, 1.0);
+
+  if (section_target_bandwidth <= 0) {
+    return rc->worst_quality;  // Highest value allowed
+  } else {
+    const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE)
+                            ? cpi->initial_mbs
+                            : cpi->common.MBs;
+    // Never let the active MB count reach zero: it is used as a divisor.
+    const int active_mbs = AOMMAX(1, num_mbs - (int)(num_mbs * inactive_zone));
+    const double av_err_per_mb = section_err / active_mbs;
+    const double speed_term = 1.0 + 0.04 * oxcf->speed;
+    double ediv_size_correction;
+    // Do the shift and the division in 64 bits and narrow only the quotient.
+    // Casting the shifted value to int before dividing (as the expression
+    // previously did, because a cast binds tighter than '/') truncates large
+    // targets before they are scaled back down by active_mbs.
+    const int target_norm_bits_per_mb =
+        (int)(((uint64_t)section_target_bandwidth << BPER_MB_NORMBITS) /
+              active_mbs);
+    int q;
+
+    // Larger image formats are expected to be a little harder to code
+    // relatively given the same prediction error score. This in part at
+    // least relates to the increased size and hence coding overheads of
+    // motion vectors. Some account of this is made through adjustment of
+    // the error divisor.
+    ediv_size_correction =
+        AOMMAX(0.2, AOMMIN(5.0, get_linear_size_factor(cpi)));
+    if (ediv_size_correction < 1.0)
+      ediv_size_correction = -(1.0 / ediv_size_correction);
+    ediv_size_correction *= 4.0;
+
+    // Try and pick a max Q that will be high enough to encode the
+    // content at the given rate.
+    for (q = rc->best_quality; q < rc->worst_quality; ++q) {
+      const double factor = calc_correction_factor(
+          av_err_per_mb, ERR_DIVISOR - ediv_size_correction, FACTOR_PT_LOW,
+          FACTOR_PT_HIGH, q, cpi->common.bit_depth);
+      const int bits_per_mb = av1_rc_bits_per_mb(
+          INTER_FRAME, q, factor * speed_term * group_weight_factor,
+          cpi->common.bit_depth);
+      if (bits_per_mb <= target_norm_bits_per_mb) break;
+    }
+
+    // Restriction on active max q for constrained quality mode.
+    if (cpi->oxcf.rc_mode == AOM_CQ) q = AOMMAX(q, oxcf->cq_level);
+    return q;
+  }
+}
+
+// For every rate factor level from INTER_NORMAL upwards, stores in
+// rc->rf_level_maxq[] the worst quality adjusted by that level's q delta,
+// never going below rc->best_quality.
+static void setup_rf_level_maxq(AV1_COMP *cpi) {
+  RATE_CONTROL *const rc = &cpi->rc;
+  int level = INTER_NORMAL;
+
+  while (level < RATE_FACTOR_LEVELS) {
+    const int qdelta = av1_frame_type_qdelta(cpi, level, rc->worst_quality);
+    const int level_maxq = rc->worst_quality + qdelta;
+    rc->rf_level_maxq[level] = AOMMAX(level_maxq, rc->best_quality);
+    ++level;
+  }
+}
+
+// Fills rc->frame_width[] and rc->frame_height[] for every frame scale step
+// from the current frame size, then calls setup_rf_level_maxq().
+void av1_init_subsampling(AV1_COMP *cpi) {
+  const AV1_COMMON *const cm = &cpi->common;
+  RATE_CONTROL *const rc = &cpi->rc;
+  const int width_x16 = cm->width * 16;
+  const int height_x16 = cm->height * 16;
+  int step = 0;
+
+  // Note: Frames with odd-sized dimensions may result from this scaling.
+  while (step < FRAME_SCALE_STEPS) {
+    rc->frame_width[step] = width_x16 / frame_scale_factor[step];
+    rc->frame_height[step] = height_x16 / frame_scale_factor[step];
+    ++step;
+  }
+
+  setup_rf_level_maxq(cpi);
+}
+
+// Writes the frame dimensions selected by rc->frame_size_selector to
+// *scaled_frame_width and *scaled_frame_height.
+void av1_calculate_coded_size(AV1_COMP *cpi, int *scaled_frame_width,
+                              int *scaled_frame_height) {
+  const RATE_CONTROL *const rate_ctrl = &cpi->rc;
+
+  *scaled_frame_width = rate_ctrl->frame_width[rate_ctrl->frame_size_selector];
+  *scaled_frame_height =
+      rate_ctrl->frame_height[rate_ctrl->frame_size_selector];
+}
+
+// Prepares the two-pass state for the second pass: loads the total stats
+// from twopass->stats_in_end, derives the frame rate and bit budget from
+// them, and accumulates the modified error over all first pass entries.
+// Only zeroes the totals and returns when stats_in_end is not set.
+void av1_init_second_pass(AV1_COMP *cpi) {
+  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+  TWO_PASS *const twopass = &cpi->twopass;
+  FIRSTPASS_STATS *const totals = &twopass->total_stats;
+
+  zero_stats(&twopass->total_stats);
+  zero_stats(&twopass->total_left_stats);
+
+  if (!twopass->stats_in_end) return;
+
+  *totals = *twopass->stats_in_end;
+  twopass->total_left_stats = *totals;
+
+  // Each frame can have a different duration, as the frame rate in the source
+  // isn't guaranteed to be constant. The frame rate prior to the first frame
+  // encoded in the second pass is a guess. However, the sum duration is not.
+  // It is calculated based on the actual durations of all frames from the
+  // first pass.
+  // NOTE(review): the 10000000.0 scale suggests duration is in 1/10,000,000
+  // second ticks -- confirm against the timestamp source.
+  av1_new_framerate(cpi, 10000000.0 * totals->count / totals->duration);
+  twopass->bits_left =
+      (int64_t)(totals->duration * oxcf->target_bandwidth / 10000000.0);
+
+  // This variable monitors how far behind the second ref update is lagging.
+  twopass->sr_update_lag = 1;
+
+  // Scan the first pass file and calculate a modified total error based upon
+  // the bias/power function used to allocate bits.
+  {
+    const double avg_error =
+        totals->coded_error / DOUBLE_DIVIDE_CHECK(totals->count);
+    const FIRSTPASS_STATS *entry;
+    double modified_error_total = 0.0;
+
+    // The min/max bounds are stored before the scan because twopass is
+    // handed to calculate_modified_err() below.
+    twopass->modified_error_min =
+        (avg_error * oxcf->two_pass_vbrmin_section) / 100;
+    twopass->modified_error_max =
+        (avg_error * oxcf->two_pass_vbrmax_section) / 100;
+
+    for (entry = twopass->stats_in; entry < twopass->stats_in_end; ++entry) {
+      modified_error_total +=
+          calculate_modified_err(cpi, twopass, oxcf, entry);
+    }
+    twopass->modified_error_left = modified_error_total;
+  }
+
+  // Reset the vbr bits off target counters
+  cpi->rc.vbr_bits_off_target = 0;
+  cpi->rc.vbr_bits_off_target_fast = 0;
+
+  cpi->rc.rate_error_estimate = 0;
+
+  // Static sequence monitor variables.
+  twopass->kf_zeromotion_pct = 100;
+  twopass->last_kfgroup_zeromotion_pct = 100;
+
+  if (oxcf->resize_mode != RESIZE_NONE) {
+    av1_init_subsampling(cpi);
+  }
+}
+
+#define SR_DIFF_PART 0.0015
+#define MOTION_AMP_PART 0.003
+#define INTRA_PART 0.005
+#define DEFAULT_DECAY_LIMIT 0.75
+#define LOW_SR_DIFF_TRHESH 0.1
+#define SR_DIFF_MAX 128.0
+
+// Computes a decay rate from the per-MB gap between the second reference
+// coded error and the coded error, the motion amplitude and the share of
+// intra coded blocks. The result is never lower than
+// min(DEFAULT_DECAY_LIMIT, modified inter percentage).
+static double get_sr_decay_rate(const AV1_COMP *cpi,
+                                const FIRSTPASS_STATS *frame) {
+  int mb_count;
+  double sr_diff;
+  double sr_decay = 1.0;
+  double inter_pct = frame->pcnt_inter;
+  double intra_pct;
+  double decay_floor;
+  const double intra_inter_ratio =
+      frame->intra_error / DOUBLE_DIVIDE_CHECK(frame->coded_error);
+  const double motion_amplitude_factor =
+      frame->pcnt_motion * ((frame->mvc_abs + frame->mvr_abs) / 2);
+
+  if (cpi->oxcf.resize_mode != RESIZE_NONE)
+    mb_count = cpi->initial_mbs;
+  else
+    mb_count = cpi->common.MBs;
+
+  sr_diff = (frame->sr_coded_error - frame->coded_error) / mb_count;
+
+  // When intra is nearly as good as inter, discount the neutral blocks.
+  if (intra_inter_ratio < (double)NCOUNT_FRAME_II_THRESH) {
+    inter_pct = frame->pcnt_inter - frame->pcnt_neutral;
+  }
+  intra_pct = 100 * (1.0 - inter_pct);
+
+  if (sr_diff > LOW_SR_DIFF_TRHESH) {
+    sr_diff = AOMMIN(sr_diff, SR_DIFF_MAX);
+    sr_decay = 1.0 - (SR_DIFF_PART * sr_diff) -
+               (MOTION_AMP_PART * motion_amplitude_factor) -
+               (INTRA_PART * intra_pct);
+  }
+
+  decay_floor = AOMMIN(DEFAULT_DECAY_LIMIT, inter_pct);
+  return AOMMAX(sr_decay, decay_floor);
+}
+
+// Returns the smaller of the second reference decay rate and the fraction
+// of inter coded blocks that have no motion (pcnt_inter - pcnt_motion).
+static double get_zero_motion_factor(const AV1_COMP *cpi,
+                                     const FIRSTPASS_STATS *frame) {
+  const double static_inter_pct = frame->pcnt_inter - frame->pcnt_motion;
+  const double decay_rate = get_sr_decay_rate(cpi, frame);
+
+  return AOMMIN(decay_rate, static_inter_pct);
+}
+
+#define ZM_POWER_FACTOR 0.75
+
+// Combines the second reference decay rate with a zero-motion factor
+// (0.95 * (pcnt_inter - pcnt_motion) ^ ZM_POWER_FACTOR) and returns the
+// larger of the zero-motion factor and the blended value.
+static double get_prediction_decay_rate(const AV1_COMP *cpi,
+                                        const FIRSTPASS_STATS *next_frame) {
+  const double sr_decay_rate = get_sr_decay_rate(cpi, next_frame);
+  const double static_inter_pct =
+      next_frame->pcnt_inter - next_frame->pcnt_motion;
+  const double zero_motion_factor =
+      0.95 * pow(static_inter_pct, ZM_POWER_FACTOR);
+  // Move sr_decay_rate towards 1.0 in proportion to the zero-motion factor.
+  const double blended_rate =
+      sr_decay_rate + ((1.0 - sr_decay_rate) * zero_motion_factor);
+
+  return AOMMAX(zero_motion_factor, blended_rate);
+}
+
+// Tests for a complex transition that is followed by a static section, for
+// example a fade between slides in a slide show. This helps with kf and gf
+// positioning.
+//
+// Returns 1 only when frame_interval exceeds rc->min_gf_interval, the
+// current decay rate is at least 0.999, the previous one was below 0.9, and
+// each of the next still_interval stats entries exists and has
+// pcnt_inter - pcnt_motion of at least 0.999. Returns 0 otherwise.
+static int detect_transition_to_still(AV1_COMP *cpi, int frame_interval,
+                                      int still_interval,
+                                      double loop_decay_rate,
+                                      double last_decay_rate) {
+  const TWO_PASS *const twopass = &cpi->twopass;
+  const RATE_CONTROL *const rc = &cpi->rc;
+  int still_frames = 0;
+
+  // Only a very still section right after motion qualifies, e.g. a static
+  // image after a fade or other transition instead of a clean scene cut.
+  if (frame_interval <= rc->min_gf_interval) return 0;
+  if (!(loop_decay_rate >= 0.999)) return 0;
+  if (!(last_decay_rate < 0.9)) return 0;
+
+  // Look ahead a few frames to see if static condition persists...
+  while (still_frames < still_interval) {
+    const FIRSTPASS_STATS *const ahead = &twopass->stats_in[still_frames];
+    if (ahead >= twopass->stats_in_end) break;
+    if (ahead->pcnt_inter - ahead->pcnt_motion < 0.999) break;
+    ++still_frames;
+  }
+
+  // Only if it does do we signal a transition to still.
+  return still_frames == still_interval;
+}
+
+// Detects a flash through the high relative pcnt_second_ref score of the
+// frame that follows a flash frame; the offset passed in should reflect
+// this. Returns 0 when no stats are available at that offset.
+static int detect_flash(const TWO_PASS *twopass, int offset) {
+  const FIRSTPASS_STATS *const candidate = read_frame_stats(twopass, offset);
+
+  if (candidate == NULL) return 0;
+
+  // We look for a brief break in prediction (such as a flash) where later
+  // frames are still reasonably well predicted by an earlier (pre flash)
+  // frame. Recovery shows up as pcnt_second_ref exceeding pcnt_inter.
+  if (!(candidate->pcnt_second_ref > candidate->pcnt_inter)) return 0;
+
+  return candidate->pcnt_second_ref >= 0.5;
+}
+
+// Updates the motion related inputs of the GF/ARF boost calculation from
+// one frame's stats.
+//
+// *mv_in_out is overwritten with this frame's in/out value; the three
+// accumulators are added to.
+static void accumulate_frame_motion_stats(const FIRSTPASS_STATS *stats,
+                                          double *mv_in_out,
+                                          double *mv_in_out_accumulator,
+                                          double *abs_mv_in_out_accumulator,
+                                          double *mv_ratio_accumulator) {
+  const double motion_pct = stats->pcnt_motion;
+  double row_term;
+  double col_term;
+
+  // Accumulate Motion In/Out of frame stats.
+  *mv_in_out = stats->mv_in_out_count * motion_pct;
+  *mv_in_out_accumulator += *mv_in_out;
+  *abs_mv_in_out_accumulator += fabs(*mv_in_out);
+
+  // The uniformity measure below is only gathered for frames with enough
+  // motion.
+  if (!(motion_pct > 0.05)) return;
+
+  // Accumulate a measure of how uniform (or conversely how random) the motion
+  // field is (a ratio of abs(mv) / mv), limited to the abs(mv) value itself.
+  row_term = fabs(stats->mvr_abs) / DOUBLE_DIVIDE_CHECK(fabs(stats->MVr));
+  if (!(row_term < stats->mvr_abs)) row_term = stats->mvr_abs;
+
+  col_term = fabs(stats->mvc_abs) / DOUBLE_DIVIDE_CHECK(fabs(stats->MVc));
+  if (!(col_term < stats->mvc_abs)) col_term = stats->mvc_abs;
+
+  *mv_ratio_accumulator += motion_pct * row_term;
+  *mv_ratio_accumulator += motion_pct * col_term;
+}
+
+#define BASELINE_ERR_PER_MB 1000.0
+// Computes the boost contribution of one frame from the ratio of a baseline
+// per-MB error to the frame's coded error, corrected for the average inter
+// frame q and for motion into or out of the frame. The result is capped at
+// max_boost scaled by the same q correction.
+static double calc_frame_boost(AV1_COMP *cpi, const FIRSTPASS_STATS *this_frame,
+                               double this_frame_mv_in_out, double max_boost) {
+  const double avg_inter_q = av1_convert_qindex_to_q(
+      cpi->rc.avg_frame_qindex[INTER_FRAME], cpi->common.bit_depth);
+  const double boost_q_correction = AOMMIN((0.5 + (avg_inter_q * 0.015)), 1.5);
+  const double boost_cap = max_boost * boost_q_correction;
+  double in_out_scale;
+  double frame_boost;
+  int active_mbs;
+
+  if (cpi->oxcf.resize_mode != RESIZE_NONE)
+    active_mbs = cpi->initial_mbs;
+  else
+    active_mbs = cpi->common.MBs;
+
+  // Correct for any inactive region in the image
+  active_mbs =
+      (int)AOMMAX(1, active_mbs * calculate_active_area(cpi, this_frame));
+
+  // Underlying boost factor is based on inter error ratio.
+  frame_boost = (BASELINE_ERR_PER_MB * active_mbs) /
+                DOUBLE_DIVIDE_CHECK(this_frame->coded_error);
+  frame_boost = frame_boost * BOOST_FACTOR * boost_q_correction;
+
+  // Increase boost for frames where new data coming into frame (e.g. zoom out).
+  // Slightly reduce boost if there is a net balance of motion out of the frame
+  // (zoom in). The range for this_frame_mv_in_out is -1.0 to +1.0, so in the
+  // extreme case the boost is halved.
+  if (this_frame_mv_in_out > 0.0)
+    in_out_scale = this_frame_mv_in_out * 2.0;
+  else
+    in_out_scale = this_frame_mv_in_out / 2.0;
+  frame_boost += frame_boost * in_out_scale;
+
+  return AOMMIN(frame_boost, boost_cap);
+}
+
+// Sums the decay-weighted frame boost over up to `count` frames, starting at
+// stats index `first` and moving by `step` per frame (+1 forward, -1
+// backward). Stops early at the first index without stats.
+static double accumulate_arf_boost_score(AV1_COMP *cpi, int first, int count,
+                                         int step) {
+  const TWO_PASS *const twopass = &cpi->twopass;
+  double boost_score = 0.0;
+  double mv_ratio_accumulator = 0.0;
+  double decay_accumulator = 1.0;
+  double this_frame_mv_in_out = 0.0;
+  double mv_in_out_accumulator = 0.0;
+  double abs_mv_in_out_accumulator = 0.0;
+  int n;
+
+  for (n = 0; n < count; ++n) {
+    const int index = first + n * step;
+    const FIRSTPASS_STATS *const this_frame =
+        read_frame_stats(twopass, index);
+    int flash_detected;
+
+    if (this_frame == NULL) break;
+
+    // Update the motion related elements to the boost calculation.
+    accumulate_frame_motion_stats(
+        this_frame, &this_frame_mv_in_out, &mv_in_out_accumulator,
+        &abs_mv_in_out_accumulator, &mv_ratio_accumulator);
+
+    // We want to discount the flash frame itself and the recovery
+    // frame that follows as both will have poor scores.
+    flash_detected =
+        detect_flash(twopass, index) || detect_flash(twopass, index + 1);
+
+    // Accumulate the effect of prediction quality decay.
+    if (!flash_detected) {
+      decay_accumulator *= get_prediction_decay_rate(cpi, this_frame);
+      if (decay_accumulator < MIN_DECAY_FACTOR)
+        decay_accumulator = MIN_DECAY_FACTOR;
+    }
+
+    boost_score +=
+        decay_accumulator *
+        calc_frame_boost(cpi, this_frame, this_frame_mv_in_out, GF_MAX_BOOST);
+  }
+
+  return boost_score;
+}
+
+// Computes the boost for an arf/gf at stats position `offset`.
+//
+// Up to f_frames frames from `offset` onwards and up to b_frames frames
+// before it are scored; the truncated scores are stored in *f_boost and
+// *b_boost. Returns their sum, raised to at least
+// (b_frames + f_frames) * 20 and to at least MIN_ARF_GF_BOOST.
+static int calc_arf_boost(AV1_COMP *cpi, int offset, int f_frames, int b_frames,
+                          int *f_boost, int *b_boost) {
+  const int per_frame_floor = (b_frames + f_frames) * 20;
+  int arf_boost;
+
+  // Search forward from the proposed arf/next gf position.
+  *f_boost = (int)accumulate_arf_boost_score(cpi, offset, f_frames, 1);
+
+  // Search backward towards last gf position.
+  *b_boost = (int)accumulate_arf_boost_score(cpi, offset - 1, b_frames, -1);
+
+  arf_boost = *f_boost + *b_boost;
+  if (arf_boost < per_frame_floor) arf_boost = per_frame_floor;
+
+  return AOMMAX(arf_boost, MIN_ARF_GF_BOOST);
+}
+
+// Calculates a section intra ratio used in setting max loop filter: the
+// summed intra error over the summed coded error, truncated to int, for at
+// most section_length entries in [begin, end).
+static int calculate_section_intra_ratio(const FIRSTPASS_STATS *begin,
+                                         const FIRSTPASS_STATS *end,
+                                         int section_length) {
+  const FIRSTPASS_STATS *entry;
+  double intra_sum = 0.0;
+  double coded_sum = 0.0;
+  int visited;
+
+  for (entry = begin, visited = 0; entry < end && visited < section_length;
+       ++entry, ++visited) {
+    intra_sum += entry->intra_error;
+    coded_sum += entry->coded_error;
+  }
+
+  return (int)(intra_sum / DOUBLE_DIVIDE_CHECK(coded_sum));
+}
+
+// Calculates the total bits to allocate in this GF/ARF group: the group's
+// share of twopass->kf_group_bits in proportion to its error, limited to
+// kf_group_bits and to the per-frame maximum times the gf interval.
+static int64_t calculate_total_gf_group_bits(AV1_COMP *cpi,
+                                             double gf_group_err) {
+  const RATE_CONTROL *const rc = &cpi->rc;
+  const TWO_PASS *const twopass = &cpi->twopass;
+  const int max_bits = frame_max_bits(rc, &cpi->oxcf);
+  int64_t group_bits = 0;
+  int64_t variability_cap;
+
+  // Calculate the bits to be allocated to the group as a whole.
+  if ((twopass->kf_group_bits > 0) && (twopass->kf_group_error_left > 0)) {
+    group_bits = (int64_t)(twopass->kf_group_bits *
+                           (gf_group_err / twopass->kf_group_error_left));
+  }
+
+  // Clamp odd edge cases.
+  if (group_bits < 0) {
+    group_bits = 0;
+  } else if (group_bits > twopass->kf_group_bits) {
+    group_bits = twopass->kf_group_bits;
+  }
+
+  // Clip based on user supplied data rate variability limit.
+  variability_cap = (int64_t)max_bits * rc->baseline_gf_interval;
+  if (group_bits > variability_cap) group_bits = variability_cap;
+
+  return group_bits;
+}
+
+// Calculates the number of extra bits to assign to boosted frames in a
+// group: total_group_bits * boost / (frame_count * 100 + boost), never
+// negative. Returns 0 for a zero boost or a non-positive bit or frame count.
+static int calculate_boost_bits(int frame_count, int boost,
+                                int64_t total_group_bits) {
+  int allocation_chunks;
+  int extra_bits;
+
+  // Invalid inputs could arise e.g. through rounding errors.
+  if (!boost) return 0;
+  if (total_group_bits <= 0) return 0;
+  if (frame_count <= 0) return 0;
+
+  allocation_chunks = (frame_count * 100) + boost;
+
+  // Prevent overflow by scaling both terms down by the same divisor.
+  if (boost > 1023) {
+    const int divisor = boost >> 10;
+    boost /= divisor;
+    allocation_chunks /= divisor;
+  }
+
+  // Calculate the number of extra bits for use in the boosted frame or frames.
+  extra_bits = (int)(((int64_t)boost * total_group_bits) / allocation_chunks);
+  return AOMMAX(extra_bits, 0);
+}
+
+#if !CONFIG_EXT_REFS
+// Current limit on maximum number of active arfs in a GF/ARF group.
+#define MAX_ACTIVE_ARFS 2
+#define ARF_SLOT1 2
+#define ARF_SLOT2 3
+// Writes the buffer slots used for arfs into the first two entries of
+// arf_buffer_indices. The values are fixed for now; the indirection exists
+// so they can change when other codec features swap buffers around.
+static void get_arf_buffer_indices(unsigned char *arf_buffer_indices) {
+  unsigned char *slot = arf_buffer_indices;
+
+  *slot++ = ARF_SLOT1;
+  *slot = ARF_SLOT2;
+}
+#endif
+// Fill in twopass->gf_group for each frame slot of this GF/ARF group: update type, rf level, ARF indices/offsets and bit allocation.
+static void allocate_gf_group_bits(AV1_COMP *cpi, int64_t gf_group_bits,
+ double group_error, int gf_arf_bits) {  // gf_arf_bits: boost bits for the GF/ARF frame.
+ RATE_CONTROL *const rc = &cpi->rc;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ TWO_PASS *const twopass = &cpi->twopass;
+ GF_GROUP *const gf_group = &twopass->gf_group;
+ FIRSTPASS_STATS frame_stats;
+ int i;
+ int frame_index = 0;  // Next slot to write in the gf_group arrays.
+ int target_frame_size;
+ int key_frame;
+ const int max_bits = frame_max_bits(&cpi->rc, &cpi->oxcf);
+ int64_t total_group_bits = gf_group_bits;  // Reduced by gf_arf_bits below when a boost is deducted.
+ double modified_err = 0.0;
+ double err_fraction;
+ int mid_boost_bits = 0;  // Accumulates 1/16 of each frame target when multi-arf is in use.
+#if CONFIG_EXT_REFS
+ // The use of bi-predictive frames is only enabled when the following 3
+ // conditions are met:
+ // (1) Alt-ref is enabled;
+ // (2) The bi-predictive group interval is at least 2; and
+ // (3) The bi-predictive group interval is strictly smaller than the
+ // golden group interval.
+ const int is_bipred_enabled =
+ rc->source_alt_ref_pending && rc->bipred_group_interval &&
+ rc->bipred_group_interval <=
+ (rc->baseline_gf_interval - rc->source_alt_ref_pending);
+ int bipred_group_end = 0;
+ int bipred_frame_index = 0;
+
+ int arf_pos[MAX_EXT_ARFS + 1];  // Only written when rc->source_alt_ref_pending is set.
+ const unsigned char ext_arf_interval =
+ (unsigned char)(rc->baseline_gf_interval / (cpi->num_extra_arfs + 1) - 1);
+ int which_arf = cpi->num_extra_arfs;
+ int subgroup_interval[MAX_EXT_ARFS + 1];  // Only written when rc->source_alt_ref_pending is set.
+ int ext_arf_boost[MAX_EXT_ARFS];
+ int is_sg_bipred_enabled = is_bipred_enabled;
+ int accumulative_subgroup_interval = 0;
+#else
+ int mid_frame_idx;
+ unsigned char arf_buffer_indices[MAX_ACTIVE_ARFS];
+#endif // CONFIG_EXT_REFS
+
+#if CONFIG_EXT_REFS
+ av1_zero_array(ext_arf_boost, MAX_EXT_ARFS);
+#endif // CONFIG_EXT_REFS
+
+ key_frame = cpi->common.frame_type == KEY_FRAME;
+
+#if !CONFIG_EXT_REFS
+ get_arf_buffer_indices(arf_buffer_indices);
+#endif // !CONFIG_EXT_REFS
+
+ // For key frames the frame target rate is already set and it
+ // is also the golden frame.
+ if (!key_frame) {
+ if (rc->source_alt_ref_active) {
+ gf_group->update_type[frame_index] = OVERLAY_UPDATE;
+ gf_group->rf_level[frame_index] = INTER_NORMAL;
+ gf_group->bit_allocation[frame_index] = 0;
+ } else {
+ gf_group->update_type[frame_index] = GF_UPDATE;
+ gf_group->rf_level[frame_index] = GF_ARF_STD;
+ gf_group->bit_allocation[frame_index] = gf_arf_bits;
+ }
+#if CONFIG_EXT_REFS
+ gf_group->arf_update_idx[frame_index] = 0;
+ gf_group->arf_ref_idx[frame_index] = 0;
+#else
+ gf_group->arf_update_idx[frame_index] = arf_buffer_indices[0];
+ gf_group->arf_ref_idx[frame_index] = arf_buffer_indices[0];
+#endif // CONFIG_EXT_REFS
+ // Step over the golden frame / overlay frame
+ if (EOF == input_stats(twopass, &frame_stats)) return;  // No more first pass stats: leave the rest untouched.
+ }
+
+#if CONFIG_EXT_REFS
+ gf_group->bidir_pred_enabled[frame_index] = 0;
+ gf_group->brf_src_offset[frame_index] = 0;
+#endif // CONFIG_EXT_REFS
+
+ // Deduct the boost bits for arf (or gf if it is not a key frame)
+ // from the group total.
+ if (rc->source_alt_ref_pending || !key_frame) total_group_bits -= gf_arf_bits;
+
+ frame_index++;
+
+#if CONFIG_EXT_REFS
+ bipred_frame_index++;
+#endif // CONFIG_EXT_REFS
+
+ // Store the bits to spend on the ARF if there is one.
+ if (rc->source_alt_ref_pending) {
+ gf_group->update_type[frame_index] = ARF_UPDATE;
+ gf_group->rf_level[frame_index] = GF_ARF_STD;
+ gf_group->bit_allocation[frame_index] = gf_arf_bits;
+
+ gf_group->arf_src_offset[frame_index] =
+ (unsigned char)(rc->baseline_gf_interval - 1);
+
+#if CONFIG_EXT_REFS
+ gf_group->arf_update_idx[frame_index] = 0;
+ gf_group->arf_ref_idx[frame_index] = 0;
+
+ gf_group->bidir_pred_enabled[frame_index] = 0;
+ gf_group->brf_src_offset[frame_index] = 0;
+// NOTE: "bipred_frame_index" stays unchanged for ARF_UPDATE frames.
+#else
+ gf_group->arf_update_idx[frame_index] = arf_buffer_indices[0];
+ gf_group->arf_ref_idx[frame_index] =
+ arf_buffer_indices[cpi->multi_arf_last_grp_enabled &&
+ rc->source_alt_ref_active];
+#endif // CONFIG_EXT_REFS
+
+#if CONFIG_EXT_REFS
+ // Work out the ARFs' positions in this gf group
+ // NOTE(weitinglin): ALT_REFs are indexed inversely, but coded in display
+ // order (except for the original ARF). In the example of three ALT_REFs,
+ // we index the ALT_REFs as: KEY ----- ALT2 ----- ALT1 ----- ALT0
+ // but code them in the following order:
+ // KEY-ALT0-ALT2 ----- OVERLAY2-ALT1 ----- OVERLAY1 ----- OVERLAY0
+ arf_pos[0] =
+ frame_index + cpi->num_extra_arfs + gf_group->arf_src_offset[1] + 1;
+ for (i = 0; i < cpi->num_extra_arfs; ++i) {
+ arf_pos[i + 1] =
+ frame_index + (cpi->num_extra_arfs - i) * (ext_arf_interval + 2);
+ subgroup_interval[i] = arf_pos[i] - arf_pos[i + 1] - (i == 0 ? 1 : 2);
+ }
+ subgroup_interval[cpi->num_extra_arfs] = arf_pos[cpi->num_extra_arfs] -
+ frame_index -
+ (cpi->num_extra_arfs == 0 ? 1 : 2);
+#endif // CONFIG_EXT_REFS
+
+ ++frame_index;
+
+#if CONFIG_EXT_REFS
+ // Insert an extra ARF
+ if (cpi->num_extra_arfs) {
+ gf_group->update_type[frame_index] = ARF_UPDATE;
+ // Note (weitinglin): GF_ARF_LOW is also used as an identifier
+ // for internal ALT_REFs:
+ gf_group->rf_level[frame_index] = GF_ARF_LOW;
+ gf_group->arf_src_offset[frame_index] = ext_arf_interval;
+ gf_group->arf_update_idx[frame_index] = which_arf;
+ gf_group->arf_ref_idx[frame_index] = 0;
+ ++frame_index;
+ }
+ accumulative_subgroup_interval += subgroup_interval[cpi->num_extra_arfs];
+#else
+ if (cpi->multi_arf_enabled) {
+ // Set aside a slot for a level 1 arf.
+ gf_group->update_type[frame_index] = ARF_UPDATE;
+ gf_group->rf_level[frame_index] = GF_ARF_LOW;
+ gf_group->arf_src_offset[frame_index] =
+ (unsigned char)((rc->baseline_gf_interval >> 1) - 1);
+ gf_group->arf_update_idx[frame_index] = arf_buffer_indices[1];
+ gf_group->arf_ref_idx[frame_index] = arf_buffer_indices[0];
+ ++frame_index;
+ }
+#endif // CONFIG_EXT_REFS
+ }
+
+#if !CONFIG_EXT_REFS
+ // Define middle frame
+ mid_frame_idx = frame_index + (rc->baseline_gf_interval >> 1) - 1;
+#endif // !CONFIG_EXT_REFS
+
+ // Allocate bits to the other frames in the group.
+ for (i = 0; i < rc->baseline_gf_interval - rc->source_alt_ref_pending; ++i) {
+#if !CONFIG_EXT_REFS
+ int arf_idx = 0;
+#endif // !CONFIG_EXT_REFS
+
+ if (EOF == input_stats(twopass, &frame_stats)) break;
+
+ modified_err = calculate_modified_err(cpi, twopass, oxcf, &frame_stats);
+
+ if (group_error > 0)
+ err_fraction = modified_err / DOUBLE_DIVIDE_CHECK(group_error);
+ else
+ err_fraction = 0.0;
+
+ target_frame_size = (int)((double)total_group_bits * err_fraction);  // Share proportional to this frame's error.
+
+ if (rc->source_alt_ref_pending && cpi->multi_arf_enabled) {
+ mid_boost_bits += (target_frame_size >> 4);  // Set 1/16 of the target aside ...
+ target_frame_size -= (target_frame_size >> 4);  // ... and take it away from this frame.
+#if !CONFIG_EXT_REFS
+ if (frame_index <= mid_frame_idx) arf_idx = 1;
+#endif // !CONFIG_EXT_REFS
+ }
+
+#if CONFIG_EXT_REFS
+ gf_group->arf_update_idx[frame_index] = which_arf;
+ gf_group->arf_ref_idx[frame_index] = which_arf;
+#else
+ gf_group->arf_update_idx[frame_index] = arf_buffer_indices[arf_idx];
+ gf_group->arf_ref_idx[frame_index] = arf_buffer_indices[arf_idx];
+#endif // CONFIG_EXT_REFS
+
+ target_frame_size =
+ clamp(target_frame_size, 0, AOMMIN(max_bits, (int)total_group_bits));
+
+#if CONFIG_EXT_REFS
+ // If we are going to have ARFs, check if we can have BWDREF in this
+ // subgroup.
+ if (rc->source_alt_ref_pending) {
+ is_sg_bipred_enabled =
+ is_bipred_enabled &&
+ (subgroup_interval[which_arf] > rc->bipred_group_interval);
+ }
+
+ // NOTE: BIDIR_PRED is only enabled when the length of the bi-predictive
+ // frame group interval is strictly smaller than that of the GOLDEN
+ // FRAME group interval.
+ // TODO(zoeliu): Currently BIDIR_PRED is only enabled when alt-ref is on.
+ if (is_sg_bipred_enabled && !bipred_group_end) {
+ const int cur_brf_src_offset = rc->bipred_group_interval - 1;
+
+ // --- BRF_UPDATE ---
+ if (bipred_frame_index == 1) {
+ gf_group->update_type[frame_index] = BRF_UPDATE;
+ gf_group->bidir_pred_enabled[frame_index] = 1;
+ gf_group->brf_src_offset[frame_index] = cur_brf_src_offset;
+ // --- LAST_BIPRED_UPDATE ---
+ } else if (bipred_frame_index == rc->bipred_group_interval) {
+ gf_group->update_type[frame_index] = LAST_BIPRED_UPDATE;
+ gf_group->bidir_pred_enabled[frame_index] = 1;
+ gf_group->brf_src_offset[frame_index] = 0;
+ // Reset the bi-predictive frame index.
+ bipred_frame_index = 0;
+ // --- BIPRED_UPDATE ---
+ } else {
+ gf_group->update_type[frame_index] = BIPRED_UPDATE;
+ gf_group->bidir_pred_enabled[frame_index] = 1;
+ gf_group->brf_src_offset[frame_index] = 0;
+ }
+
+ bipred_frame_index++;
+ // Check whether the next bi-predictive frame group would entirely be
+ // included within the current golden frame group.
+ // In addition, we need to avoid coding a BRF right before an ARF.
+ if (bipred_frame_index == 1 &&
+ (i + 2 + cur_brf_src_offset) >= accumulative_subgroup_interval) {
+ bipred_group_end = 1;
+ }
+ } else {
+#endif // CONFIG_EXT_REFS
+ gf_group->update_type[frame_index] = LF_UPDATE;
+#if CONFIG_EXT_REFS
+ gf_group->bidir_pred_enabled[frame_index] = 0;
+ gf_group->brf_src_offset[frame_index] = 0;
+ }
+#endif // CONFIG_EXT_REFS
+
+#if CONFIG_EXT_REFS
+ if (gf_group->update_type[frame_index] == BRF_UPDATE) {
+ // Boost up the allocated bits on BWDREF_FRAME (target plus a quarter).
+ gf_group->rf_level[frame_index] = GF_ARF_LOW;
+ gf_group->bit_allocation[frame_index] =
+ target_frame_size + (target_frame_size >> 2);
+ } else if (gf_group->update_type[frame_index] == LAST_BIPRED_UPDATE) {
+ // Press down the allocated bits on LAST_BIPRED_UPDATE frames (target minus a half).
+ gf_group->rf_level[frame_index] = INTER_NORMAL;
+ gf_group->bit_allocation[frame_index] =
+ target_frame_size - (target_frame_size >> 1);
+ } else if (gf_group->update_type[frame_index] == BIPRED_UPDATE) {
+ // TODO(zoeliu): To investigate whether the allocated bits on
+ // BIPRED_UPDATE frames need to be further adjusted.
+ gf_group->rf_level[frame_index] = INTER_NORMAL;
+ gf_group->bit_allocation[frame_index] = target_frame_size;
+ } else {
+#endif // CONFIG_EXT_REFS
+ gf_group->rf_level[frame_index] = INTER_NORMAL;
+ gf_group->bit_allocation[frame_index] = target_frame_size;
+#if CONFIG_EXT_REFS
+ }
+#endif // CONFIG_EXT_REFS
+
+ ++frame_index;
+
+#if CONFIG_EXT_REFS
+ // Check if we need to update the ARF
+ if (is_sg_bipred_enabled && cpi->num_extra_arfs && which_arf > 0 &&
+ frame_index > arf_pos[which_arf]) {
+ --which_arf;
+ accumulative_subgroup_interval += subgroup_interval[which_arf] + 1;
+ // Entering a new subgroup: reset the bipred_group_end flag.
+ bipred_group_end = 0;
+ // Insert another extra ARF after the overlay frame
+ if (which_arf) {
+ gf_group->update_type[frame_index] = ARF_UPDATE;
+ gf_group->rf_level[frame_index] = GF_ARF_LOW;
+ gf_group->arf_src_offset[frame_index] = ext_arf_interval;
+ gf_group->arf_update_idx[frame_index] = which_arf;
+ gf_group->arf_ref_idx[frame_index] = 0;
+ ++frame_index;
+ }
+ }
+#endif // CONFIG_EXT_REFS
+ }
+
+// Note:
+// We need to configure the frame at the end of the sequence + 1 that will be
+// the start frame for the next group. Otherwise prior to the call to
+// av1_rc_get_second_pass_params() the data will be undefined.
+#if CONFIG_EXT_REFS
+ gf_group->arf_update_idx[frame_index] = 0;
+ gf_group->arf_ref_idx[frame_index] = 0;
+#else
+ gf_group->arf_update_idx[frame_index] = arf_buffer_indices[0];
+ gf_group->arf_ref_idx[frame_index] = arf_buffer_indices[0];
+#endif // CONFIG_EXT_REFS
+
+ if (rc->source_alt_ref_pending) {
+ gf_group->update_type[frame_index] = OVERLAY_UPDATE;
+ gf_group->rf_level[frame_index] = INTER_NORMAL;
+
+#if CONFIG_EXT_REFS
+ if (cpi->num_extra_arfs) {
+ for (i = cpi->num_extra_arfs; i > 0; --i) {
+ int arf_pos_in_gf = (i == cpi->num_extra_arfs ? 2 : arf_pos[i + 1] + 1);
+ gf_group->bit_allocation[arf_pos_in_gf] =
+ gf_group->bit_allocation[arf_pos[i]];  // Move the slot's bits to the extra ARF itself.
+ gf_group->update_type[arf_pos[i]] = INTNL_OVERLAY_UPDATE;
+ gf_group->bit_allocation[arf_pos[i]] = 0;
+ gf_group->rf_level[arf_pos[i]] = INTER_NORMAL;
+ }
+ }
+#else
+ // Final setup for second arf and its overlay.
+ if (cpi->multi_arf_enabled) {
+ gf_group->bit_allocation[2] =
+ gf_group->bit_allocation[mid_frame_idx] + mid_boost_bits;
+ gf_group->update_type[mid_frame_idx] = OVERLAY_UPDATE;
+ gf_group->bit_allocation[mid_frame_idx] = 0;
+ }
+#endif // CONFIG_EXT_REFS
+ } else {
+ gf_group->update_type[frame_index] = GF_UPDATE;
+ gf_group->rf_level[frame_index] = GF_ARF_STD;
+ }
+
+#if CONFIG_EXT_REFS
+ gf_group->bidir_pred_enabled[frame_index] = 0;
+ gf_group->brf_src_offset[frame_index] = 0;
+#endif // CONFIG_EXT_REFS
+
+ // Note whether multi-arf was enabled this group for next time.
+ cpi->multi_arf_last_grp_enabled = cpi->multi_arf_enabled;
+}
+
+// Analyse and define a gf/arf group: scan the first pass stats to pick the group length, decide on alt-ref use, then budget and allocate its bits.
+static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
+ AV1_COMMON *const cm = &cpi->common;
+ RATE_CONTROL *const rc = &cpi->rc;
+ AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ TWO_PASS *const twopass = &cpi->twopass;
+ FIRSTPASS_STATS next_frame;
+ const FIRSTPASS_STATS *const start_pos = twopass->stats_in;  // Saved so the stats position can be reset after scanning.
+ int i;
+
+ double boost_score = 0.0;
+ double old_boost_score = 0.0;
+ double gf_group_err = 0.0;
+#if GROUP_ADAPTIVE_MAXQ
+ double gf_group_raw_error = 0.0;
+#endif
+ double gf_group_skip_pct = 0.0;
+ double gf_group_inactive_zone_rows = 0.0;
+ double gf_first_frame_err = 0.0;
+ double mod_frame_err = 0.0;
+
+ double mv_ratio_accumulator = 0.0;
+ double decay_accumulator = 1.0;
+ double zero_motion_accumulator = 1.0;
+
+ double loop_decay_rate = 1.00;
+ double last_loop_decay_rate = 1.00;
+
+ double this_frame_mv_in_out = 0.0;
+ double mv_in_out_accumulator = 0.0;
+ double abs_mv_in_out_accumulator = 0.0;
+ double mv_ratio_accumulator_thresh;
+ unsigned int allow_alt_ref = is_altref_enabled(cpi);
+
+ int f_boost = 0;
+ int b_boost = 0;
+ int flash_detected;
+ int active_max_gf_interval;
+ int active_min_gf_interval;
+ int64_t gf_group_bits;
+ double gf_group_error_left;
+ int gf_arf_bits;
+ const int is_key_frame = frame_is_intra_only(cm);
+ const int arf_active_or_kf = is_key_frame || rc->source_alt_ref_active;
+
+ // Reset the GF group data structures unless this is a key
+ // frame in which case it will already have been done.
+ if (is_key_frame == 0) {
+ av1_zero(twopass->gf_group);
+ }
+
+ aom_clear_system_state();
+ av1_zero(next_frame);
+
+ // Load stats for the current frame.
+ mod_frame_err = calculate_modified_err(cpi, twopass, oxcf, this_frame);
+
+ // Note the error of the frame at the start of the group. This will be
+ // the GF frame error if we code a normal gf.
+ gf_first_frame_err = mod_frame_err;
+
+ // If this is a key frame or the overlay from a previous arf then
+ // the error score / cost of this frame has already been accounted for.
+ if (arf_active_or_kf) {
+ gf_group_err -= gf_first_frame_err;
+#if GROUP_ADAPTIVE_MAXQ
+ gf_group_raw_error -= this_frame->coded_error;
+#endif
+ gf_group_skip_pct -= this_frame->intra_skip_pct;
+ gf_group_inactive_zone_rows -= this_frame->inactive_zone_rows;
+ }
+
+ // Motion breakout threshold for loop below depends on image size.
+ mv_ratio_accumulator_thresh =
+ (cpi->initial_height + cpi->initial_width) / 4.0;
+
+ // Set a maximum and minimum interval for the GF group.
+ // If the image appears almost completely static we can extend beyond this.
+ {
+ int int_max_q = (int)(av1_convert_qindex_to_q(twopass->active_worst_quality,
+ cpi->common.bit_depth));
+ int int_lbq = (int)(av1_convert_qindex_to_q(rc->last_boosted_qindex,
+ cpi->common.bit_depth));
+
+ active_min_gf_interval = rc->min_gf_interval + AOMMIN(2, int_max_q / 200);
+ if (active_min_gf_interval > rc->max_gf_interval)
+ active_min_gf_interval = rc->max_gf_interval;
+
+ if (cpi->multi_arf_allowed) {
+ active_max_gf_interval = rc->max_gf_interval;
+ } else {
+ // The value chosen depends on the active Q range. At low Q we have
+ // bits to spare and are better with a smaller interval and smaller boost.
+ // At high Q when there are few bits to spare we are better with a longer
+ // interval to spread the cost of the GF.
+ active_max_gf_interval = 12 + AOMMIN(4, (int_lbq / 6));
+
+ // We have: active_min_gf_interval <= rc->max_gf_interval
+ if (active_max_gf_interval < active_min_gf_interval)
+ active_max_gf_interval = active_min_gf_interval;
+ else if (active_max_gf_interval > rc->max_gf_interval)
+ active_max_gf_interval = rc->max_gf_interval;
+ }
+ }
+
+ i = 0;
+ while (i < rc->static_scene_max_gf_interval && i < rc->frames_to_key) {
+ ++i;
+
+ // Accumulate error score of frames in this gf group.
+ mod_frame_err = calculate_modified_err(cpi, twopass, oxcf, this_frame);
+ gf_group_err += mod_frame_err;
+#if GROUP_ADAPTIVE_MAXQ
+ gf_group_raw_error += this_frame->coded_error;
+#endif
+ gf_group_skip_pct += this_frame->intra_skip_pct;
+ gf_group_inactive_zone_rows += this_frame->inactive_zone_rows;
+
+ if (EOF == input_stats(twopass, &next_frame)) break;
+
+ // Test for the case where there is a brief flash but the prediction
+ // quality back to an earlier frame is then restored.
+ flash_detected = detect_flash(twopass, 0);
+
+ // Update the motion related elements to the boost calculation.
+ accumulate_frame_motion_stats(
+ &next_frame, &this_frame_mv_in_out, &mv_in_out_accumulator,
+ &abs_mv_in_out_accumulator, &mv_ratio_accumulator);
+
+ // Accumulate the effect of prediction quality decay.
+ if (!flash_detected) {
+ last_loop_decay_rate = loop_decay_rate;
+ loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame);
+
+ decay_accumulator = decay_accumulator * loop_decay_rate;
+
+ // Monitor for static sections.
+ zero_motion_accumulator = AOMMIN(
+ zero_motion_accumulator, get_zero_motion_factor(cpi, &next_frame));
+
+ // Break clause to detect very still sections after motion. For example,
+ // a static image after a fade or other transition.
+ if (detect_transition_to_still(cpi, i, 5, loop_decay_rate,
+ last_loop_decay_rate)) {
+ allow_alt_ref = 0;
+ break;
+ }
+ }
+
+ // Calculate a boost number for this frame.
+ boost_score +=
+ decay_accumulator *
+ calc_frame_boost(cpi, &next_frame, this_frame_mv_in_out, GF_MAX_BOOST);
+
+ // Break out conditions.
+ if (
+ // Break at active_max_gf_interval unless almost totally static.
+ (i >= (active_max_gf_interval + arf_active_or_kf) &&
+ zero_motion_accumulator < 0.995) ||
+ (
+ // Don't break out with a very short interval.
+ (i >= active_min_gf_interval + arf_active_or_kf) &&
+ (!flash_detected) &&
+ ((mv_ratio_accumulator > mv_ratio_accumulator_thresh) ||
+ (abs_mv_in_out_accumulator > 3.0) ||
+ (mv_in_out_accumulator < -2.0) ||
+ ((boost_score - old_boost_score) < BOOST_BREAKOUT)))) {
+ boost_score = old_boost_score;  // Discard the contribution of the frame that triggered the break.
+ break;
+ }
+
+ *this_frame = next_frame;
+ old_boost_score = boost_score;
+ }
+
+ twopass->gf_zeromotion_pct = (int)(zero_motion_accumulator * 1000.0);
+
+ // Was the group length constrained by the requirement for a new KF?
+ rc->constrained_gf_group = (i >= rc->frames_to_key) ? 1 : 0;
+
+ // Should we use the alternate reference frame?
+ if (allow_alt_ref && (i < cpi->oxcf.lag_in_frames) &&
+ (i >= rc->min_gf_interval)) {
+ // Calculate the boost for alt ref.
+ rc->gfu_boost =
+ calc_arf_boost(cpi, 0, (i - 1), (i - 1), &f_boost, &b_boost);
+ rc->source_alt_ref_pending = 1;
+
+ // Test to see if multi arf is appropriate.
+ // NOTE(review): rc->baseline_gf_interval is read here before it is updated for this group further down - confirm this is intended.
+ cpi->multi_arf_enabled =
+ (cpi->multi_arf_allowed && (rc->baseline_gf_interval >= 6) &&
+ (zero_motion_accumulator < 0.995))
+ ? 1
+ : 0;
+ } else {
+ rc->gfu_boost = AOMMAX((int)boost_score, MIN_ARF_GF_BOOST);
+ rc->source_alt_ref_pending = 0;
+ }
+
+ // Set the interval until the next gf.
+ rc->baseline_gf_interval = i - (is_key_frame || rc->source_alt_ref_pending);
+
+#if CONFIG_EXT_REFS
+ // Compute how many extra alt_refs we can have
+ cpi->num_extra_arfs = get_number_of_extra_arfs(rc->baseline_gf_interval,
+ rc->source_alt_ref_pending);
+ // At most MAX_EXT_ARFS extra ARFs are allowed.
+ assert(cpi->num_extra_arfs <= MAX_EXT_ARFS);
+#endif // CONFIG_EXT_REFS
+
+ rc->frames_till_gf_update_due = rc->baseline_gf_interval;
+
+#if CONFIG_EXT_REFS
+ rc->bipred_group_interval = BFG_INTERVAL;
+ // The minimum bi-predictive frame group interval is 2; anything smaller is stored as 0.
+ if (rc->bipred_group_interval < 2) rc->bipred_group_interval = 0;
+#endif // CONFIG_EXT_REFS
+
+ // Reset the file position.
+ reset_fpf_position(twopass, start_pos);
+
+ // Calculate the bits to be allocated to the gf/arf group as a whole
+ gf_group_bits = calculate_total_gf_group_bits(cpi, gf_group_err);
+
+#if GROUP_ADAPTIVE_MAXQ
+ // Calculate an estimate of the maxq needed for the group.
+ // We are more aggressive about correcting for sections
+ // where there could be significant overshoot than for easier
+ // sections where we do not wish to risk creating an overshoot
+ // of the allocated bit budget.
+ if ((cpi->oxcf.rc_mode != AOM_Q) && (rc->baseline_gf_interval > 1)) {
+ const int vbr_group_bits_per_frame =
+ (int)(gf_group_bits / rc->baseline_gf_interval);
+ const double group_av_err = gf_group_raw_error / rc->baseline_gf_interval;
+ const double group_av_skip_pct =
+ gf_group_skip_pct / rc->baseline_gf_interval;
+ const double group_av_inactive_zone =
+ ((gf_group_inactive_zone_rows * 2) /
+ (rc->baseline_gf_interval * (double)cm->mb_rows));
+
+ int tmp_q;
+ // rc factor is a weight factor that corrects for local rate control drift.
+ double rc_factor = 1.0;
+ if (rc->rate_error_estimate > 0) {
+ rc_factor = AOMMAX(RC_FACTOR_MIN,
+ (double)(100 - rc->rate_error_estimate) / 100.0);
+ } else {
+ rc_factor = AOMMIN(RC_FACTOR_MAX,
+ (double)(100 - rc->rate_error_estimate) / 100.0);
+ }
+ tmp_q = get_twopass_worst_quality(
+ cpi, group_av_err, (group_av_skip_pct + group_av_inactive_zone),
+ vbr_group_bits_per_frame, twopass->kfgroup_inter_fraction * rc_factor);
+ twopass->active_worst_quality =
+ AOMMAX(tmp_q, twopass->active_worst_quality >> 1);  // Never drop below half the previous value.
+ }
+#endif
+
+ // Calculate the extra bits to be used for boosted frame(s)
+ gf_arf_bits = calculate_boost_bits(rc->baseline_gf_interval, rc->gfu_boost,
+ gf_group_bits);
+
+ // Adjust KF group bits and error remaining.
+ twopass->kf_group_error_left -= (int64_t)gf_group_err;
+
+ // If this is an arf update we want to remove the score for the overlay
+ // frame at the end which will usually be very cheap to code.
+ // The overlay frame has already, in effect, been coded so we want to spread
+ // the remaining bits among the other frames.
+ // For normal GFs remove the score for the GF itself unless this is
+ // also a key frame in which case it has already been accounted for.
+ if (rc->source_alt_ref_pending) {
+ gf_group_error_left = gf_group_err - mod_frame_err;
+ } else if (is_key_frame == 0) {
+ gf_group_error_left = gf_group_err - gf_first_frame_err;
+ } else {
+ gf_group_error_left = gf_group_err;
+ }
+
+ // Allocate bits to each of the frames in the GF group.
+ allocate_gf_group_bits(cpi, gf_group_bits, gf_group_error_left, gf_arf_bits);
+
+ // Reset the file position.
+ reset_fpf_position(twopass, start_pos);
+
+ // Calculate a section intra ratio used in setting max loop filter.
+ if (cpi->common.frame_type != KEY_FRAME) {
+ twopass->section_intra_rating = calculate_section_intra_ratio(
+ start_pos, twopass->stats_in_end, rc->baseline_gf_interval);
+ }
+
+ if (oxcf->resize_mode == RESIZE_DYNAMIC) {
+ // Default to starting GF groups at normal frame size.
+ cpi->rc.next_frame_size_selector = UNSCALED;
+ }
+}
+
+// Threshold for use of the lagging second reference frame. High second ref
+// usage may point to a transient event like a flash or occlusion rather than
+// a real scene cut.
+#define SECOND_REF_USEAGE_THRESH 0.1
+// Minimum % intra coding observed in first pass (1.0 = 100%)
+#define MIN_INTRA_LEVEL 0.25
+// Minimum ratio between the % of intra coding and inter coding in the first
+// pass after discounting neutral blocks (discounting neutral blocks in this
+// way helps catch scene cuts in clips with very flat areas or letter box
+// format clips with image padding).
+#define INTRA_VS_INTER_THRESH 2.0
+// Hard threshold where the first pass chooses intra for almost all blocks.
+// In such a case even if the frame is not a scene cut coding a key frame
+// may be a good option.
+#define VERY_LOW_INTER_THRESH 0.05
+// Maximum threshold for the relative ratio of intra error score vs best
+// inter error score.
+#define KF_II_ERR_THRESHOLD 2.5
+// In real scene cuts there is almost always a sharp change in the intra
+// or inter error score.
+#define ERR_CHANGE_THRESHOLD 0.4
+// For real scene cuts we expect an improvement in the intra inter error
+// ratio in the next frame.
+#define II_IMPROVEMENT_THRESHOLD 3.5
+#define KF_II_MAX 128.0  // Upper limit applied to the intra/inter ratio in test_candidate_kf().
+// Returns 1 if this_frame passes the key frame criteria below and predicts the following frames well enough, otherwise 0.
+static int test_candidate_kf(TWO_PASS *twopass,
+ const FIRSTPASS_STATS *last_frame,
+ const FIRSTPASS_STATS *this_frame,
+ const FIRSTPASS_STATS *next_frame) {
+ int is_viable_kf = 0;
+ double pcnt_intra = 1.0 - this_frame->pcnt_inter;
+ double modified_pcnt_inter =
+ this_frame->pcnt_inter - this_frame->pcnt_neutral;  // Inter % after discounting neutral blocks.
+
+ // Does the frame satisfy the primary criteria of a key frame?
+ // See above for an explanation of the test criteria.
+ // If so, then examine how well it predicts subsequent frames.
+ if ((this_frame->pcnt_second_ref < SECOND_REF_USEAGE_THRESH) &&
+ (next_frame->pcnt_second_ref < SECOND_REF_USEAGE_THRESH) &&
+ ((this_frame->pcnt_inter < VERY_LOW_INTER_THRESH) ||
+ ((pcnt_intra > MIN_INTRA_LEVEL) &&
+ (pcnt_intra > (INTRA_VS_INTER_THRESH * modified_pcnt_inter)) &&
+ ((this_frame->intra_error /
+ DOUBLE_DIVIDE_CHECK(this_frame->coded_error)) <
+ KF_II_ERR_THRESHOLD) &&
+ ((fabs(last_frame->coded_error - this_frame->coded_error) /
+ DOUBLE_DIVIDE_CHECK(this_frame->coded_error) >
+ ERR_CHANGE_THRESHOLD) ||
+ (fabs(last_frame->intra_error - this_frame->intra_error) /
+ DOUBLE_DIVIDE_CHECK(this_frame->intra_error) >
+ ERR_CHANGE_THRESHOLD) ||
+ ((next_frame->intra_error /
+ DOUBLE_DIVIDE_CHECK(next_frame->coded_error)) >
+ II_IMPROVEMENT_THRESHOLD))))) {
+ int i;
+ const FIRSTPASS_STATS *start_pos = twopass->stats_in;  // Restored below if the candidate is rejected.
+ FIRSTPASS_STATS local_next_frame = *next_frame;
+ double boost_score = 0.0;
+ double old_boost_score = 0.0;
+ double decay_accumulator = 1.0;
+
+ // Examine how well the key frame predicts subsequent frames (up to 16 of them).
+ for (i = 0; i < 16; ++i) {
+ double next_iiratio = (BOOST_FACTOR * local_next_frame.intra_error /
+ DOUBLE_DIVIDE_CHECK(local_next_frame.coded_error));
+
+ if (next_iiratio > KF_II_MAX) next_iiratio = KF_II_MAX;
+
+ // Cumulative effect of decay in prediction quality.
+ if (local_next_frame.pcnt_inter > 0.85)
+ decay_accumulator *= local_next_frame.pcnt_inter;
+ else
+ decay_accumulator *= (0.85 + local_next_frame.pcnt_inter) / 2.0;
+
+ // Keep a running total.
+ boost_score += (decay_accumulator * next_iiratio);
+
+ // Test various breakout clauses.
+ if ((local_next_frame.pcnt_inter < 0.05) || (next_iiratio < 1.5) ||
+ (((local_next_frame.pcnt_inter - local_next_frame.pcnt_neutral) <
+ 0.20) &&
+ (next_iiratio < 3.0)) ||
+ ((boost_score - old_boost_score) < 3.0) ||
+ (local_next_frame.intra_error < 200)) {
+ break;
+ }
+
+ old_boost_score = boost_score;
+
+ // Get the next frame details
+ if (EOF == input_stats(twopass, &local_next_frame)) break;
+ }
+
+ // Accept the candidate only if the accumulated boost exceeds 30 and the
+ // scan above reached i > 3 before stopping; else discard it and move on.
+ if (boost_score > 30.0 && (i > 3)) {
+ is_viable_kf = 1;
+ } else {
+ // Reset the file position
+ reset_fpf_position(twopass, start_pos);
+
+ is_viable_kf = 0;
+ }
+ }
+
+ return is_viable_kf;
+}
+
+#define FRAMES_TO_CHECK_DECAY 8  // Size of the recent_loop_decay[] window in find_next_key_frame().
+
+static void find_next_key_frame(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
+ int i, j;
+ RATE_CONTROL *const rc = &cpi->rc;
+ TWO_PASS *const twopass = &cpi->twopass;
+ GF_GROUP *const gf_group = &twopass->gf_group;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ const FIRSTPASS_STATS first_frame = *this_frame;
+ const FIRSTPASS_STATS *const start_position = twopass->stats_in;
+ FIRSTPASS_STATS next_frame;
+ FIRSTPASS_STATS last_frame;
+ int kf_bits = 0;
+ int loop_decay_counter = 0;
+ double decay_accumulator = 1.0;
+ double av_decay_accumulator = 0.0;
+ double zero_motion_accumulator = 1.0;
+ double boost_score = 0.0;
+ double kf_mod_err = 0.0;
+ double kf_group_err = 0.0;
+ double recent_loop_decay[FRAMES_TO_CHECK_DECAY];
+
+ av1_zero(next_frame);
+
+ cpi->common.frame_type = KEY_FRAME;
+
+ // Reset the GF group data structures.
+ av1_zero(*gf_group);
+
+ // Is this a forced key frame by interval.
+ rc->this_key_frame_forced = rc->next_key_frame_forced;
+
+ // Clear the alt ref active flag and last group multi arf flags as they
+ // can never be set for a key frame.
+ rc->source_alt_ref_active = 0;
+ cpi->multi_arf_last_grp_enabled = 0;
+
+ // KF is always a GF so clear frames till next gf counter.
+ rc->frames_till_gf_update_due = 0;
+
+ rc->frames_to_key = 1;
+
+ twopass->kf_group_bits = 0; // Total bits available to kf group
+ twopass->kf_group_error_left = 0; // Group modified error score.
+
+ kf_mod_err = calculate_modified_err(cpi, twopass, oxcf, this_frame);
+
+ // Initialize the decay rates for the recent frames to check
+ for (j = 0; j < FRAMES_TO_CHECK_DECAY; ++j) recent_loop_decay[j] = 1.0;
+
+ // Find the next keyframe.
+ i = 0;
+ while (twopass->stats_in < twopass->stats_in_end &&
+ rc->frames_to_key < cpi->oxcf.key_freq) {
+ // Accumulate kf group error.
+ kf_group_err += calculate_modified_err(cpi, twopass, oxcf, this_frame);
+
+ // Load the next frame's stats.
+ last_frame = *this_frame;
+ input_stats(twopass, this_frame);
+
+ // Provided that we are not at the end of the file...
+ if (cpi->oxcf.auto_key && twopass->stats_in < twopass->stats_in_end) {
+ double loop_decay_rate;
+
+ // Check for a scene cut.
+ if (test_candidate_kf(twopass, &last_frame, this_frame,
+ twopass->stats_in))
+ break;
+
+ // How fast is the prediction quality decaying?
+ loop_decay_rate = get_prediction_decay_rate(cpi, twopass->stats_in);
+
+ // We want to know something about the recent past... rather than
+ // as used elsewhere where we are concerned with decay in prediction
+ // quality since the last GF or KF.
+ recent_loop_decay[i % FRAMES_TO_CHECK_DECAY] = loop_decay_rate;
+ decay_accumulator = 1.0;
+ for (j = 0; j < FRAMES_TO_CHECK_DECAY; ++j)
+ decay_accumulator *= recent_loop_decay[j];
+
+ // Special check for transition or high motion followed by a
+ // static scene.
+ if (detect_transition_to_still(cpi, i, cpi->oxcf.key_freq - i,
+ loop_decay_rate, decay_accumulator))
+ break;
+
+ // Step on to the next frame.
+ ++rc->frames_to_key;
+
+ // If we don't have a real key frame within the next two
+ // key_freq intervals then break out of the loop.
+ if (rc->frames_to_key >= 2 * cpi->oxcf.key_freq) break;
+ } else {
+ ++rc->frames_to_key;
+ }
+ ++i;
+ }
+
+ // If there is a max kf interval set by the user we must obey it.
+ // We already breakout of the loop above at 2x max.
+ // This code centers the extra kf if the actual natural interval
+ // is between 1x and 2x.
+ if (cpi->oxcf.auto_key && rc->frames_to_key > cpi->oxcf.key_freq) {
+ FIRSTPASS_STATS tmp_frame = first_frame;
+
+ rc->frames_to_key /= 2;
+
+ // Reset to the start of the group.
+ reset_fpf_position(twopass, start_position);
+
+ kf_group_err = 0.0;
+
+ // Rescan to get the correct error data for the forced kf group.
+ for (i = 0; i < rc->frames_to_key; ++i) {
+ kf_group_err += calculate_modified_err(cpi, twopass, oxcf, &tmp_frame);
+ input_stats(twopass, &tmp_frame);
+ }
+ rc->next_key_frame_forced = 1;
+ } else if (twopass->stats_in == twopass->stats_in_end ||
+ rc->frames_to_key >= cpi->oxcf.key_freq) {
+ rc->next_key_frame_forced = 1;
+ } else {
+ rc->next_key_frame_forced = 0;
+ }
+
+ // Special case for the last key frame of the file.
+ if (twopass->stats_in >= twopass->stats_in_end) {
+ // Accumulate kf group error.
+ kf_group_err += calculate_modified_err(cpi, twopass, oxcf, this_frame);
+ }
+
+ // Calculate the number of bits that should be assigned to the kf group.
+ if (twopass->bits_left > 0 && twopass->modified_error_left > 0.0) {
+ // Maximum number of bits for a single normal frame (not key frame).
+ const int max_bits = frame_max_bits(rc, &cpi->oxcf);
+
+ // Maximum number of bits allocated to the key frame group.
+ int64_t max_grp_bits;
+
+ // Default allocation based on bits left and relative
+ // complexity of the section.
+ twopass->kf_group_bits = (int64_t)(
+ twopass->bits_left * (kf_group_err / twopass->modified_error_left));
+
+ // Clip based on maximum per frame rate defined by the user.
+ max_grp_bits = (int64_t)max_bits * (int64_t)rc->frames_to_key;
+ if (twopass->kf_group_bits > max_grp_bits)
+ twopass->kf_group_bits = max_grp_bits;
+ } else {
+ twopass->kf_group_bits = 0;
+ }
+ twopass->kf_group_bits = AOMMAX(0, twopass->kf_group_bits);
+
+ // Reset the first pass file position.
+ reset_fpf_position(twopass, start_position);
+
+ // Scan through the kf group collating various stats used to determine
+ // how many bits to spend on it.
+ decay_accumulator = 1.0;
+ boost_score = 0.0;
+ for (i = 0; i < (rc->frames_to_key - 1); ++i) {
+ if (EOF == input_stats(twopass, &next_frame)) break;
+
+ // Monitor for static sections.
+ zero_motion_accumulator = AOMMIN(zero_motion_accumulator,
+ get_zero_motion_factor(cpi, &next_frame));
+
+ // Not all frames in the group are necessarily used in calculating boost.
+ if ((i <= rc->max_gf_interval) ||
+ ((i <= (rc->max_gf_interval * 4)) && (decay_accumulator > 0.5))) {
+ const double frame_boost =
+ calc_frame_boost(cpi, this_frame, 0, KF_MAX_BOOST);
+
+ // How fast is prediction quality decaying.
+ if (!detect_flash(twopass, 0)) {
+ const double loop_decay_rate =
+ get_prediction_decay_rate(cpi, &next_frame);
+ decay_accumulator *= loop_decay_rate;
+ decay_accumulator = AOMMAX(decay_accumulator, MIN_DECAY_FACTOR);
+ av_decay_accumulator += decay_accumulator;
+ ++loop_decay_counter;
+ }
+ boost_score += (decay_accumulator * frame_boost);
+ }
+ }
+ av_decay_accumulator /= (double)loop_decay_counter;
+
+ reset_fpf_position(twopass, start_position);
+
+ // Store the zero motion percentage
+ twopass->kf_zeromotion_pct = (int)(zero_motion_accumulator * 100.0);
+
+ // Calculate a section intra ratio used in setting max loop filter.
+ twopass->section_intra_rating = calculate_section_intra_ratio(
+ start_position, twopass->stats_in_end, rc->frames_to_key);
+
+ // Apply various clamps for min and max boost
+ rc->kf_boost = (int)(av_decay_accumulator * boost_score);
+ rc->kf_boost = AOMMAX(rc->kf_boost, (rc->frames_to_key * 3));
+ rc->kf_boost = AOMMAX(rc->kf_boost, MIN_KF_BOOST);
+
+ // Work out how many bits to allocate for the key frame itself.
+ kf_bits = calculate_boost_bits((rc->frames_to_key - 1), rc->kf_boost,
+ twopass->kf_group_bits);
+
+ // Work out the fraction of the kf group bits reserved for the inter frames
+ // within the group after discounting the bits for the kf itself.
+ if (twopass->kf_group_bits) {
+ twopass->kfgroup_inter_fraction =
+ (double)(twopass->kf_group_bits - kf_bits) /
+ (double)twopass->kf_group_bits;
+ } else {
+ twopass->kfgroup_inter_fraction = 1.0;
+ }
+
+ twopass->kf_group_bits -= kf_bits;
+
+ // Save the bits to spend on the key frame.
+ gf_group->bit_allocation[0] = kf_bits;
+ gf_group->update_type[0] = KF_UPDATE;
+ gf_group->rf_level[0] = KF_STD;
+
+ // Note the total error score of the kf group minus the key frame itself.
+ twopass->kf_group_error_left = (int)(kf_group_err - kf_mod_err);
+
+ // Adjust the count of total modified error left.
+ // The count of bits left is adjusted elsewhere based on real coded frame
+ // sizes.
+ twopass->modified_error_left -= kf_group_err;
+
+ if (oxcf->resize_mode == RESIZE_DYNAMIC) {
+ // Default to normal-sized frame on keyframes.
+ cpi->rc.next_frame_size_selector = UNSCALED;
+ }
+}
+
+ // Define the reference buffers that will be updated post encode.
+ //
+ // Reads the update type of the current GF group entry
+ // (gf_group.update_type[gf_group.index]) and sets the cpi->refresh_*_frame
+ // flags for it. The cpi->rc.is_* source-type flags are cleared up front and
+ // then set again only by the cases that need them. An update type that is
+ // not listed in the switch trips assert(0).
+ static void configure_buffer_updates(AV1_COMP *cpi) {
+ TWO_PASS *const twopass = &cpi->twopass;
+
+ // Wei-Ting: Should we define another function to take care of
+ // cpi->rc.is_$Source_Type to make this function as it is in the comment?
+
+ // Start from "no special source type"; individual cases override below.
+ cpi->rc.is_src_frame_alt_ref = 0;
+#if CONFIG_EXT_REFS
+ cpi->rc.is_bwd_ref_frame = 0;
+ cpi->rc.is_last_bipred_frame = 0;
+ cpi->rc.is_bipred_frame = 0;
+ cpi->rc.is_src_frame_ext_arf = 0;
+#endif // CONFIG_EXT_REFS
+
+ switch (twopass->gf_group.update_type[twopass->gf_group.index]) {
+ case KF_UPDATE:
+ // Key frame: every reference buffer handled here is refreshed.
+#if CONFIG_EXT_REFS
+ cpi->refresh_bwd_ref_frame = 1;
+#endif // CONFIG_EXT_REFS
+ cpi->refresh_last_frame = 1;
+ cpi->refresh_golden_frame = 1;
+ cpi->refresh_alt_ref_frame = 1;
+ break;
+
+ case LF_UPDATE:
+#if CONFIG_EXT_REFS
+ // NOTE(review): refresh_bwd_ref_frame is not assigned in this case, so it
+ // keeps whatever value it had before the call - confirm this is intended.
+ // If we have extra ALT_REFs, we can use the farthest ALT (ALT0) as
+ // the BWD_REF.
+ if (cpi->num_extra_arfs) {
+ // Rotate the virtual indices: bwd <- alt <- arf_map[0] <- old bwd.
+ int tmp = cpi->bwd_fb_idx;
+
+ cpi->bwd_fb_idx = cpi->alt_fb_idx;
+ cpi->alt_fb_idx = cpi->arf_map[0];
+ cpi->arf_map[0] = tmp;
+
+ cpi->rc.is_bwd_ref_frame = 1;
+ } else {
+ cpi->rc.is_bwd_ref_frame = 0;
+ }
+#endif // CONFIG_EXT_REFS
+ cpi->refresh_last_frame = 1;
+ cpi->refresh_golden_frame = 0;
+ cpi->refresh_alt_ref_frame = 0;
+ break;
+
+ case GF_UPDATE:
+ // Golden frame update: refresh LAST and GOLDEN only.
+#if CONFIG_EXT_REFS
+ cpi->refresh_bwd_ref_frame = 0;
+#endif // CONFIG_EXT_REFS
+ cpi->refresh_last_frame = 1;
+ cpi->refresh_golden_frame = 1;
+ cpi->refresh_alt_ref_frame = 0;
+ break;
+
+ case OVERLAY_UPDATE:
+ // Overlay: only GOLDEN is refreshed and the frame is flagged as having an
+ // alt-ref as its source.
+ cpi->refresh_last_frame = 0;
+ cpi->refresh_golden_frame = 1;
+#if CONFIG_EXT_REFS
+ cpi->refresh_bwd_ref_frame = 0;
+#endif // CONFIG_EXT_REFS
+ cpi->refresh_alt_ref_frame = 0;
+ cpi->rc.is_src_frame_alt_ref = 1;
+ break;
+
+ case ARF_UPDATE:
+ // Alt-ref frame: refresh ALT (and BWD when CONFIG_EXT_REFS is on).
+#if CONFIG_EXT_REFS
+ cpi->refresh_bwd_ref_frame = 1;
+#endif // CONFIG_EXT_REFS
+ cpi->refresh_last_frame = 0;
+ cpi->refresh_golden_frame = 0;
+ cpi->refresh_alt_ref_frame = 1;
+ break;
+
+#if CONFIG_EXT_REFS
+ case BRF_UPDATE:
+ // Backward reference frame: only BWD is refreshed.
+ cpi->refresh_last_frame = 0;
+ cpi->refresh_golden_frame = 0;
+ cpi->refresh_bwd_ref_frame = 1;
+ cpi->refresh_alt_ref_frame = 0;
+ cpi->rc.is_bwd_ref_frame = 1;
+ if (cpi->num_extra_arfs) {
+ // Allow BRF use the farthest ALT_REF (ALT0) as BWD_REF by swapping
+ // the virtual indices.
+ // NOTE: The indices will be swapped back after this frame is encoded
+ // (in av1_update_reference_frames()).
+ int tmp = cpi->bwd_fb_idx;
+
+ cpi->bwd_fb_idx = cpi->alt_fb_idx;
+ cpi->alt_fb_idx = cpi->arf_map[0];
+ cpi->arf_map[0] = tmp;
+ }
+ break;
+
+ case LAST_BIPRED_UPDATE:
+ // Last bi-predictive frame of its group: no reference buffer is refreshed.
+ cpi->refresh_last_frame = 0;
+ cpi->refresh_golden_frame = 0;
+ cpi->refresh_bwd_ref_frame = 0;
+ cpi->refresh_alt_ref_frame = 0;
+ cpi->rc.is_last_bipred_frame = 1;
+ break;
+
+ case BIPRED_UPDATE:
+ // Bi-predictive frame that is not the last one: refresh LAST only.
+ cpi->refresh_last_frame = 1;
+ cpi->refresh_golden_frame = 0;
+ cpi->refresh_bwd_ref_frame = 0;
+ cpi->refresh_alt_ref_frame = 0;
+ cpi->rc.is_bipred_frame = 1;
+ break;
+
+ case INTNL_OVERLAY_UPDATE:
+ // Internal overlay: refresh LAST and flag the source as both an alt-ref
+ // and an extra alt-ref.
+ cpi->refresh_last_frame = 1;
+ cpi->refresh_golden_frame = 0;
+ cpi->refresh_bwd_ref_frame = 0;
+ cpi->refresh_alt_ref_frame = 0;
+ cpi->rc.is_src_frame_alt_ref = 1;
+ cpi->rc.is_src_frame_ext_arf = 1;
+ break;
+#endif // CONFIG_EXT_REFS
+
+ default: assert(0); break;
+ }
+}
+
+ // Returns non-zero when the first pass stats allow the partition search to
+ // be skipped for the current frame.
+ //
+ // If the current frame does not have non-zero motion vector detected in the
+ // first pass, and so do its previous and forward frames, then this frame
+ // can be skipped for partition check, and the partition size is assigned
+ // according to the variance.
+ //
+ // The test looks at the record at twopass->stats_in and the two records
+ // before it; each must satisfy pcnt_inter - pcnt_motion == 1. Intra-only
+ // frames never qualify.
+ static int is_skippable_frame(const AV1_COMP *cpi) {
+   const TWO_PASS *const twopass = &cpi->twopass;
+   const FIRSTPASS_STATS *const cur = twopass->stats_in;
+
+   if (frame_is_intra_only(&cpi->common)) return 0;
+
+   // Compare the distance from the start of the buffer instead of computing
+   // (stats_in - 2) first: the latter forms a pointer in front of the buffer
+   // when fewer than two records have been consumed, which is undefined.
+   if (cur - twopass->stats_in_start <= 2) return 0;
+   if (cur >= twopass->stats_in_end) return 0;
+
+   // NOTE(review): these are exact comparisons on doubles, kept as in the
+   // original; they only match when the difference is exactly 1.
+   return (cur - 1)->pcnt_inter - (cur - 1)->pcnt_motion == 1 &&
+          (cur - 2)->pcnt_inter - (cur - 2)->pcnt_motion == 1 &&
+          cur->pcnt_inter - cur->pcnt_motion == 1;
+ }
+
+ // Prepares the second pass rate control state for the frame about to be
+ // encoded.
+ //
+ // - Returns immediately when no first pass stats are attached
+ //   (twopass->stats_in is NULL).
+ // - For an ARF_UPDATE entry no stats record is consumed: the buffer refresh
+ //   flags and the clamped target size are set and the function returns.
+ // - Otherwise one stats record is read, a new key frame group and/or GF/ARF
+ //   group is defined when due, the buffer refresh flags are configured, the
+ //   per-frame bit target is clamped into rc->base_frame_target, and the
+ //   record is subtracted from twopass->total_left_stats.
+ void av1_rc_get_second_pass_params(AV1_COMP *cpi) {
+   AV1_COMMON *const cm = &cpi->common;
+   RATE_CONTROL *const rc = &cpi->rc;
+   TWO_PASS *const twopass = &cpi->twopass;
+   GF_GROUP *const gf_group = &twopass->gf_group;
+   int frames_left;
+   FIRSTPASS_STATS this_frame;
+
+   int target_rate;
+
+   frames_left = (int)(twopass->total_stats.count - cm->current_video_frame);
+
+   if (!twopass->stats_in) return;
+
+   // If this is an arf frame then we dont want to read the stats file or
+   // advance the input pointer as we already have what we need.
+   if (gf_group->update_type[gf_group->index] == ARF_UPDATE) {
+     configure_buffer_updates(cpi);
+     target_rate = gf_group->bit_allocation[gf_group->index];
+     target_rate = av1_rc_clamp_pframe_target_size(cpi, target_rate);
+     rc->base_frame_target = target_rate;
+
+     cm->frame_type = INTER_FRAME;
+
+     // Do the firstpass stats indicate that this frame is skippable for the
+     // partition search?
+     if (cpi->sf.allow_partition_search_skip && cpi->oxcf.pass == 2) {
+       cpi->partition_search_skippable_frame = is_skippable_frame(cpi);
+     }
+
+     return;
+   }
+
+   aom_clear_system_state();
+
+   if (cpi->oxcf.rc_mode == AOM_Q) {
+     twopass->active_worst_quality = cpi->oxcf.cq_level;
+   } else if (cm->current_video_frame == 0) {
+     // Special case code for first frame: derive an initial worst-quality
+     // estimate from the whole-clip averages and seed the rate control
+     // history with it.
+     // NOTE(review): frames_left and section_length are used as divisors
+     // without a zero check - presumably the stats are never empty here.
+     const int section_target_bandwidth =
+         (int)(twopass->bits_left / frames_left);
+     const double section_length = twopass->total_left_stats.count;
+     const double section_error =
+         twopass->total_left_stats.coded_error / section_length;
+     const double section_intra_skip =
+         twopass->total_left_stats.intra_skip_pct / section_length;
+     const double section_inactive_zone =
+         (twopass->total_left_stats.inactive_zone_rows * 2) /
+         ((double)cm->mb_rows * section_length);
+     const int tmp_q = get_twopass_worst_quality(
+         cpi, section_error, section_intra_skip + section_inactive_zone,
+         section_target_bandwidth, DEFAULT_GRP_WEIGHT);
+
+     twopass->active_worst_quality = tmp_q;
+     twopass->baseline_active_worst_quality = tmp_q;
+     rc->ni_av_qi = tmp_q;
+     rc->last_q[INTER_FRAME] = tmp_q;
+     rc->avg_q = av1_convert_qindex_to_q(tmp_q, cm->bit_depth);
+     rc->avg_frame_qindex[INTER_FRAME] = tmp_q;
+     rc->last_q[KEY_FRAME] = (tmp_q + cpi->oxcf.best_allowed_q) / 2;
+     rc->avg_frame_qindex[KEY_FRAME] = rc->last_q[KEY_FRAME];
+   }
+
+   av1_zero(this_frame);
+   if (EOF == input_stats(twopass, &this_frame)) return;
+
+   // Set the frame content type flag.
+   if (this_frame.intra_skip_pct >= FC_ANIMATION_THRESH)
+     twopass->fr_content_type = FC_GRAPHICS_ANIMATION;
+   else
+     twopass->fr_content_type = FC_NORMAL;
+
+   // Keyframe and section processing.
+   if (rc->frames_to_key == 0 || (cpi->frame_flags & FRAMEFLAGS_KEY)) {
+     // find_next_key_frame() receives a pointer to this_frame, so keep a copy
+     // and restore it afterwards.
+     FIRSTPASS_STATS this_frame_copy;
+     this_frame_copy = this_frame;
+     // Define next KF group and assign bits to it.
+     find_next_key_frame(cpi, &this_frame);
+     this_frame = this_frame_copy;
+   } else {
+     cm->frame_type = INTER_FRAME;
+   }
+
+   // Define a new GF/ARF group. (Should always enter here for key frames).
+   if (rc->frames_till_gf_update_due == 0) {
+     define_gf_group(cpi, &this_frame);
+
+     rc->frames_till_gf_update_due = rc->baseline_gf_interval;
+
+#if ARF_STATS_OUTPUT
+     {
+       FILE *fpfile;
+       fpfile = fopen("arf.stt", "a");
+       ++arf_count;
+       // Debug output only: skip the record instead of passing a NULL stream
+       // to fprintf()/fclose() when the file cannot be opened.
+       if (fpfile != NULL) {
+         fprintf(fpfile, "%10d %10ld %10d %10d %10ld\n",
+                 cm->current_video_frame, rc->frames_till_gf_update_due,
+                 rc->kf_boost, arf_count, rc->gfu_boost);
+
+         fclose(fpfile);
+       }
+     }
+#endif
+   }
+
+   configure_buffer_updates(cpi);
+
+   // Do the firstpass stats indicate that this frame is skippable for the
+   // partition search?
+   if (cpi->sf.allow_partition_search_skip && cpi->oxcf.pass == 2) {
+     cpi->partition_search_skippable_frame = is_skippable_frame(cpi);
+   }
+
+   target_rate = gf_group->bit_allocation[gf_group->index];
+
+   if (cpi->common.frame_type == KEY_FRAME)
+     target_rate = av1_rc_clamp_iframe_target_size(cpi, target_rate);
+   else
+     target_rate = av1_rc_clamp_pframe_target_size(cpi, target_rate);
+
+   rc->base_frame_target = target_rate;
+
+   {
+     const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE)
+                             ? cpi->initial_mbs
+                             : cpi->common.MBs;
+     // The multiplication by 256 reverses a scaling factor of (>> 8)
+     // applied when combining MB error values for the frame.
+     twopass->mb_av_energy =
+         log(((this_frame.intra_error * 256.0) / num_mbs) + 1.0);
+   }
+
+   // Update the total stats remaining structure.
+   subtract_stats(&twopass->total_left_stats, &this_frame);
+ }
+
+ // Upper clamp for twopass->extend_minq (MINQ_ADJ_LIMIT_CQ is used instead
+ // when rc_mode is AOM_CQ).
+ #define MINQ_ADJ_LIMIT 48
+ #define MINQ_ADJ_LIMIT_CQ 20
+ // A frame counts as a "big undershoot" when its projected size is below
+ // base_frame_target / HIGH_UNDERSHOOT_RATIO.
+ #define HIGH_UNDERSHOOT_RATIO 2
+ // Post encode update of the two pass rate control state.
+ //
+ // Accounts for the bits of the frame just coded (bits_left, kf_group_bits,
+ // vbr_bits_off_target), recomputes the percentage rate error, advances the
+ // GF group index and, outside AOM_Q mode, nudges the extend_minq /
+ // extend_maxq / extend_minq_fast adjustments.
+ void av1_twopass_postencode_update(AV1_COMP *cpi) {
+ TWO_PASS *const twopass = &cpi->twopass;
+ RATE_CONTROL *const rc = &cpi->rc;
+ // The frame's target, not its actual size, is charged against the budgets.
+ const int bits_used = rc->base_frame_target;
+
+ // VBR correction is done through rc->vbr_bits_off_target. Based on the
+ // sign of this value, a limited % adjustment is made to the target rate
+ // of subsequent frames, to try and push it back towards 0. This method
+ // is designed to prevent extreme behaviour at the end of a clip
+ // or group of frames.
+ rc->vbr_bits_off_target += rc->base_frame_target - rc->projected_frame_size;
+ twopass->bits_left = AOMMAX(twopass->bits_left - bits_used, 0);
+
+ // Calculate the pct rc error.
+ // Clamped to [-100, 100]; reported as 0 while no bits have been counted.
+ if (rc->total_actual_bits) {
+ rc->rate_error_estimate =
+ (int)((rc->vbr_bits_off_target * 100) / rc->total_actual_bits);
+ rc->rate_error_estimate = clamp(rc->rate_error_estimate, -100, 100);
+ } else {
+ rc->rate_error_estimate = 0;
+ }
+
+ // Key frames do not reduce kf_group_bits here.
+ if (cpi->common.frame_type != KEY_FRAME) {
+ twopass->kf_group_bits -= bits_used;
+ twopass->last_kfgroup_zeromotion_pct = twopass->kf_zeromotion_pct;
+ }
+ twopass->kf_group_bits = AOMMAX(twopass->kf_group_bits, 0);
+
+ // Increment the gf group index ready for the next frame.
+ ++twopass->gf_group.index;
+
+ // If the rate control is drifting consider adjustment to min or maxq.
+ if ((cpi->oxcf.rc_mode != AOM_Q) &&
+ (cpi->twopass.gf_zeromotion_pct < VLOW_MOTION_THRESHOLD) &&
+ !cpi->rc.is_src_frame_alt_ref) {
+ const int maxq_adj_limit =
+ rc->worst_quality - twopass->active_worst_quality;
+ const int minq_adj_limit =
+ (cpi->oxcf.rc_mode == AOM_CQ ? MINQ_ADJ_LIMIT_CQ : MINQ_ADJ_LIMIT);
+
+ // Undershoot.
+ if (rc->rate_error_estimate > cpi->oxcf.under_shoot_pct) {
+ --twopass->extend_maxq;
+ if (rc->rolling_target_bits >= rc->rolling_actual_bits)
+ ++twopass->extend_minq;
+ // Overshoot.
+ } else if (rc->rate_error_estimate < -cpi->oxcf.over_shoot_pct) {
+ --twopass->extend_minq;
+ if (rc->rolling_target_bits < rc->rolling_actual_bits)
+ ++twopass->extend_maxq;
+ } else {
+ // Adjustment for extreme local overshoot.
+ if (rc->projected_frame_size > (2 * rc->base_frame_target) &&
+ rc->projected_frame_size > (2 * rc->avg_frame_bandwidth))
+ ++twopass->extend_maxq;
+
+ // Unwind undershoot or overshoot adjustment.
+ if (rc->rolling_target_bits < rc->rolling_actual_bits)
+ --twopass->extend_minq;
+ else if (rc->rolling_target_bits > rc->rolling_actual_bits)
+ --twopass->extend_maxq;
+ }
+
+ // Keep both adjustments non-negative and within their limits.
+ twopass->extend_minq = clamp(twopass->extend_minq, 0, minq_adj_limit);
+ twopass->extend_maxq = clamp(twopass->extend_maxq, 0, maxq_adj_limit);
+
+ // If there is a big and unexpected undershoot then feed the extra
+ // bits back in quickly. One situation where this may happen is if a
+ // frame is unexpectedly almost perfectly predicted by the ARF or GF
+ // but not very well predicted by the previous frame.
+ if (!frame_is_kf_gf_arf(cpi) && !cpi->rc.is_src_frame_alt_ref) {
+ int fast_extra_thresh = rc->base_frame_target / HIGH_UNDERSHOOT_RATIO;
+ if (rc->projected_frame_size < fast_extra_thresh) {
+ // Bank the shortfall, capped at four average frames worth of bits.
+ rc->vbr_bits_off_target_fast +=
+ fast_extra_thresh - rc->projected_frame_size;
+ rc->vbr_bits_off_target_fast =
+ AOMMIN(rc->vbr_bits_off_target_fast, (4 * rc->avg_frame_bandwidth));
+
+ // Fast adaptation of minQ if necessary to use up the extra bits.
+ if (rc->avg_frame_bandwidth) {
+ twopass->extend_minq_fast =
+ (int)(rc->vbr_bits_off_target_fast * 8 / rc->avg_frame_bandwidth);
+ }
+ // extend_minq + extend_minq_fast must not exceed minq_adj_limit.
+ twopass->extend_minq_fast = AOMMIN(
+ twopass->extend_minq_fast, minq_adj_limit - twopass->extend_minq);
+ } else if (rc->vbr_bits_off_target_fast) {
+ twopass->extend_minq_fast = AOMMIN(
+ twopass->extend_minq_fast, minq_adj_limit - twopass->extend_minq);
+ } else {
+ twopass->extend_minq_fast = 0;
+ }
+ }
+ }
+}
diff --git a/third_party/aom/av1/encoder/firstpass.h b/third_party/aom/av1/encoder/firstpass.h
new file mode 100644
index 0000000000..db459cc229
--- /dev/null
+++ b/third_party/aom/av1/encoder/firstpass.h
@@ -0,0 +1,202 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_ENCODER_FIRSTPASS_H_
+#define AV1_ENCODER_FIRSTPASS_H_
+
+#include "av1/encoder/lookahead.h"
+#include "av1/encoder/ratectrl.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+ // Per-macroblock first pass stats; only compiled with CONFIG_FP_MB_STATS.
+ // NOTE(review): the FPMB_*_MASK values are distinct single bits that together
+ // fill one byte, so they are presumably flags packed into the uint8_t entries
+ // referenced below - confirm against the code that writes them.
+ #if CONFIG_FP_MB_STATS
+
+ #define FPMB_DCINTRA_MASK 0x01
+
+ #define FPMB_MOTION_ZERO_MASK 0x02
+ #define FPMB_MOTION_LEFT_MASK 0x04
+ #define FPMB_MOTION_RIGHT_MASK 0x08
+ #define FPMB_MOTION_UP_MASK 0x10
+ #define FPMB_MOTION_DOWN_MASK 0x20
+
+ #define FPMB_ERROR_SMALL_MASK 0x40
+ #define FPMB_ERROR_LARGE_MASK 0x80
+ // NOTE(review): presumably the error thresholds that select the SMALL / LARGE
+ // error flags above - confirm.
+ #define FPMB_ERROR_SMALL_TH 2000
+ #define FPMB_ERROR_LARGE_TH 48000
+
+ // Pair of byte pointers into the per-macroblock stats.
+ // NOTE(review): presumably the start and end of the stats buffer - confirm.
+ typedef struct {
+ uint8_t *mb_stats_start;
+ uint8_t *mb_stats_end;
+ } FIRSTPASS_MB_STATS;
+ #endif
+
+ #if CONFIG_EXT_REFS
+ // Length of the bi-predictive frame group (BFG)
+ // NOTE: Currently each BFG contains one backward ref (BWF) frame plus a certain
+ // number of bi-predictive frames.
+ #define BFG_INTERVAL 2
+ // The maximum number of extra ALT_REF's
+ // NOTE: This number cannot be greater than 2 or the reference frame buffer will
+ // overflow.
+ #define MAX_EXT_ARFS 2
+ // Interval unit used by get_number_of_extra_arfs(): an interval needs at least
+ // MIN_EXT_ARF_INTERVAL * MAX_EXT_ARFS frames before any extra ARF is granted.
+ #define MIN_EXT_ARF_INTERVAL 4
+ #endif // CONFIG_EXT_REFS
+
+ // av1_twopass_postencode_update() only adjusts min/max Q while
+ // twopass.gf_zeromotion_pct is below this value.
+ #define VLOW_MOTION_THRESHOLD 950
+
+ // One record of first pass statistics. Every field is a double. TWO_PASS
+ // keeps both per-frame records (reached through stats_in) and accumulated
+ // records (total_stats, total_left_stats) of this type.
+ typedef struct {
+ double frame;
+ double weight;
+ double intra_error; // Input to the mb_av_energy estimate in the second pass.
+ double coded_error;
+ double sr_coded_error;
+ // is_skippable_frame() requires pcnt_inter - pcnt_motion == 1.
+ double pcnt_inter;
+ double pcnt_motion;
+ double pcnt_second_ref;
+ double pcnt_neutral;
+ // Compared against FC_ANIMATION_THRESH to pick the frame content type.
+ double intra_skip_pct;
+ double inactive_zone_rows; // Image mask rows top and bottom.
+ double inactive_zone_cols; // Image mask columns at left and right edges.
+ double MVr;
+ double mvr_abs;
+ double MVc;
+ double mvc_abs;
+ double MVrv;
+ double MVcv;
+ double mv_in_out_count;
+ double new_mv_count;
+ double duration;
+ // In accumulated records this is used as the number of frames covered.
+ double count;
+ } FIRSTPASS_STATS;
+
+ // Update type of a frame within a GF group (stored in GF_GROUP::update_type).
+ // configure_buffer_updates() switches on this value to decide which reference
+ // buffers are refreshed. FRAME_UPDATE_TYPES is the number of enumerators:
+ // 9 with CONFIG_EXT_REFS, 5 without.
+ typedef enum {
+ KF_UPDATE = 0,
+ LF_UPDATE = 1,
+ GF_UPDATE = 2,
+ ARF_UPDATE = 3,
+ OVERLAY_UPDATE = 4,
+ #if CONFIG_EXT_REFS
+ BRF_UPDATE = 5, // Backward Reference Frame
+ LAST_BIPRED_UPDATE = 6, // Last Bi-predictive Frame
+ BIPRED_UPDATE = 7, // Bi-predictive Frame, but not the last one
+ INTNL_OVERLAY_UPDATE = 8, // Internal Overlay Frame
+ FRAME_UPDATE_TYPES = 9
+ #else
+ FRAME_UPDATE_TYPES = 5
+ #endif // CONFIG_EXT_REFS
+ } FRAME_UPDATE_TYPE;
+
+ // A frame whose first pass intra_skip_pct is >= this threshold is classed as
+ // FC_GRAPHICS_ANIMATION; otherwise it is FC_NORMAL.
+ #define FC_ANIMATION_THRESH 0.15
+ // Content classification stored in TWO_PASS::fr_content_type.
+ // FRAME_CONTENT_TYPES is the number of content types.
+ typedef enum {
+ FC_NORMAL = 0,
+ FC_GRAPHICS_ANIMATION = 1,
+ FRAME_CONTENT_TYPES = 2
+ } FRAME_CONTENT_TYPE;
+
+ // Per-frame plan for a golden frame / ARF group. All arrays hold
+ // (MAX_LAG_BUFFERS * 2) + 1 entries; the second pass reads update_type and
+ // bit_allocation at position `index`.
+ typedef struct {
+ // Position of the current frame in the arrays below; incremented once per
+ // coded frame by av1_twopass_postencode_update().
+ unsigned char index;
+ RATE_FACTOR_LEVEL rf_level[(MAX_LAG_BUFFERS * 2) + 1];
+ // Selects which reference buffers are refreshed for each frame.
+ FRAME_UPDATE_TYPE update_type[(MAX_LAG_BUFFERS * 2) + 1];
+ unsigned char arf_src_offset[(MAX_LAG_BUFFERS * 2) + 1];
+ unsigned char arf_update_idx[(MAX_LAG_BUFFERS * 2) + 1];
+ unsigned char arf_ref_idx[(MAX_LAG_BUFFERS * 2) + 1];
+ #if CONFIG_EXT_REFS
+ unsigned char brf_src_offset[(MAX_LAG_BUFFERS * 2) + 1];
+ unsigned char bidir_pred_enabled[(MAX_LAG_BUFFERS * 2) + 1];
+ #endif // CONFIG_EXT_REFS
+ // Target bits per frame before the rate control clamps are applied.
+ int bit_allocation[(MAX_LAG_BUFFERS * 2) + 1];
+ } GF_GROUP;
+
+ // State of the two pass rate control: the first pass statistics being read
+ // back, the remaining bit / error budgets, and the current GF group plan.
+ typedef struct {
+ // Set from calculate_section_intra_ratio() when a key frame group is defined.
+ unsigned int section_intra_rating;
+ FIRSTPASS_STATS total_stats;
+ FIRSTPASS_STATS this_frame_stats;
+ // Current read position in the first pass stats, with the bounds of the
+ // stats buffer. A NULL stats_in makes av1_rc_get_second_pass_params() return
+ // without doing anything.
+ const FIRSTPASS_STATS *stats_in;
+ const FIRSTPASS_STATS *stats_in_start;
+ const FIRSTPASS_STATS *stats_in_end;
+ // Each frame's record is subtracted from this as the frame is processed.
+ FIRSTPASS_STATS total_left_stats;
+ int first_pass_done;
+ // Reduced by each frame's base target after encoding; never below 0.
+ int64_t bits_left;
+ double modified_error_min;
+ double modified_error_max;
+ // Reduced by the group error each time a key frame group is defined.
+ double modified_error_left;
+ // log(intra_error * 256 / num_mbs + 1) for the current frame.
+ double mb_av_energy;
+
+ #if CONFIG_FP_MB_STATS
+ uint8_t *frame_mb_stats_buf;
+ uint8_t *this_frame_mb_stats;
+ FIRSTPASS_MB_STATS firstpass_mb_stats;
+ #endif
+ // An indication of the content type of the current frame
+ FRAME_CONTENT_TYPE fr_content_type;
+
+ // Projected total bits available for a key frame group of frames
+ int64_t kf_group_bits;
+
+ // Error score of frames still to be coded in kf group
+ int64_t kf_group_error_left;
+
+ // The fraction for a kf groups total bits allocated to the inter frames
+ double kfgroup_inter_fraction;
+
+ int sr_update_lag;
+
+ // Zero motion accumulator of the key frame group scaled by 100.
+ int kf_zeromotion_pct;
+ int last_kfgroup_zeromotion_pct;
+ int gf_zeromotion_pct;
+ int active_worst_quality;
+ int baseline_active_worst_quality;
+ // Min / max Q adjustments maintained by av1_twopass_postencode_update();
+ // extend_minq and extend_maxq are clamped to be non-negative there.
+ int extend_minq;
+ int extend_maxq;
+ int extend_minq_fast;
+
+ GF_GROUP gf_group;
+ } TWO_PASS;
+
+ struct AV1_COMP;
+
+ // First pass entry points.
+ void av1_init_first_pass(struct AV1_COMP *cpi);
+ void av1_rc_get_first_pass_params(struct AV1_COMP *cpi);
+ void av1_first_pass(struct AV1_COMP *cpi, const struct lookahead_entry *source);
+ void av1_end_first_pass(struct AV1_COMP *cpi);
+
+ // Second pass entry points.
+ void av1_init_second_pass(struct AV1_COMP *cpi);
+ // Sets up the rate control parameters for the next frame of the second pass.
+ void av1_rc_get_second_pass_params(struct AV1_COMP *cpi);
+
+ // Post encode update of the rate control parameters for 2-pass
+ void av1_twopass_postencode_update(struct AV1_COMP *cpi);
+
+ void av1_init_subsampling(struct AV1_COMP *cpi);
+
+ void av1_calculate_coded_size(struct AV1_COMP *cpi, int *scaled_frame_width,
+                               int *scaled_frame_height);
+
+#if CONFIG_EXT_REFS
+ // Number of extra ALT_REFs to use for a group of `interval` frames.
+ // Returns 0 unless an ARF is pending and extra ARFs are enabled at compile
+ // time; otherwise MAX_EXT_ARFS when the interval spans at least
+ // MIN_EXT_ARF_INTERVAL * (MAX_EXT_ARFS + 1) frames, MAX_EXT_ARFS - 1 when it
+ // spans at least MIN_EXT_ARF_INTERVAL * MAX_EXT_ARFS frames, and 0 below that.
+ static INLINE int get_number_of_extra_arfs(int interval, int arf_pending) {
+   if (!(arf_pending && MAX_EXT_ARFS > 0)) return 0;
+
+   if (interval >= MIN_EXT_ARF_INTERVAL * (MAX_EXT_ARFS + 1)) {
+     return MAX_EXT_ARFS;
+   }
+   if (interval >= MIN_EXT_ARF_INTERVAL * MAX_EXT_ARFS) {
+     return MAX_EXT_ARFS - 1;
+   }
+   return 0;
+ }
+#endif // CONFIG_EXT_REFS
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AV1_ENCODER_FIRSTPASS_H_
diff --git a/third_party/aom/av1/encoder/generic_encoder.c b/third_party/aom/av1/encoder/generic_encoder.c
new file mode 100644
index 0000000000..a31bb9ef69
--- /dev/null
+++ b/third_party/aom/av1/encoder/generic_encoder.c
@@ -0,0 +1,157 @@
+/*
+ * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/* clang-format off */
+
+#ifdef HAVE_CONFIG_H
+# include "config.h"
+#endif
+
+#include <stdio.h>
+
+#include "aom_dsp/bitwriter.h"
+#include "av1/common/generic_code.h"
+#include "av1/common/odintrin.h"
+#include "pvq_encoder.h"
+
+ /** Writes one symbol with an adaptive Q15 cdf and then adapts that cdf.
+  *
+  * When *count is zero (first use of this cdf) every entry is first rescaled
+  * by 32768 / cdf[n - 1] and passed through AOM_ICDF() before the symbol is
+  * written.
+  *
+  * @param [in,out] w     multi-symbol entropy encoder
+  * @param [in]     val   symbol to encode, from 0 to n-1
+  * @param [in,out] cdf   CDF of the variable (Q15)
+  * @param [in]     n     number of possible values
+  * @param [in,out] count number of symbols encoded with that cdf so far
+  * @param [in]     rate  adaptation rate shift (smaller is faster)
+  */
+ void aom_encode_cdf_adapt_q15(aom_writer *w, int val, uint16_t *cdf, int n,
+                               int *count, int rate) {
+   if (*count == 0) {
+     /* The scale is taken once, before any entry (including the last one)
+        is overwritten. */
+     const int total = cdf[n - 1];
+     int idx = 0;
+     while (idx < n) {
+       cdf[idx] = AOM_ICDF(cdf[idx]*32768/total);
+       ++idx;
+     }
+   }
+   aom_write_cdf(w, val, cdf, n);
+   aom_cdf_adapt_q15(val, cdf, n, count, rate);
+ }
+
+ /** Encodes a random variable using a "generic" model, assuming that the
+  * distribution is one-sided (zero and up), has a single mode, and decays
+  * exponentially past the model.
+  *
+  * The value is split in up to three parts: a 16-ary symbol for
+  * min(15, x >> shift) (with rounding), a Laplace-coded tail when that
+  * symbol saturates at 15, and `shift` (or shift - 1) raw low-order bits.
+  *
+  * @param [in,out] w           multi-symbol entropy encoder
+  * @param [in,out] model       generic probability model
+  * @param [in]     x           variable being encoded
+  * @param [in,out] ex_q16      expectation of x in Q16 (adapted)
+  * @param [in]     integration integration period of ex_q16 (leaky average
+  *                             over 1<<integration samples)
+  */
+ void generic_encode(aom_writer *w, generic_encoder *model, int x,
+                     int *ex_q16, int integration) {
+   int lg_q1;
+   int shift;
+   int id;
+   uint16_t *cdf;
+   int xs;
+   lg_q1 = log_ex(*ex_q16);
+   OD_LOG((OD_LOG_ENTROPY_CODER, OD_LOG_DEBUG,
+           "%d %d", *ex_q16, lg_q1));
+   /* If expectation is too large, shift x to ensure that
+      all we have past xs=15 is the exponentially decaying tail
+      of the distribution */
+   shift = OD_MAXI(0, (lg_q1 - 5) >> 1);
+   /* Choose the cdf to use: we have two per "octave" of ExQ16 */
+   id = OD_MINI(GENERIC_TABLES - 1, lg_q1);
+   cdf = model->cdf[id];
+   /* x >> shift with rounding to nearest. */
+   xs = (x + (1 << shift >> 1)) >> shift;
+   aom_write_symbol_pvq(w, OD_MINI(15, xs), cdf, 16);
+   if (xs >= 15) {
+     int e;
+     unsigned decay;
+     /* Estimate decay based on the assumption that the distribution is close
+        to Laplacian for large values. We should probably have an adaptive
+        estimate instead. Note: The 2* is a kludge that's not fully understood
+        yet. */
+     OD_ASSERT(*ex_q16 < INT_MAX >> 1);
+     e = (((2 * (*ex_q16)) >> 8) + (1 << shift >> 1)) >> shift;
+     decay = OD_MAXI(2, OD_MINI(254, 256*e/(e + 256)));
+     /* Encode the tail of the distribution assuming exponential decay. */
+     aom_laplace_encode_special(w, xs - 15, decay);
+   }
+   if (shift != 0) {
+     int special;
+     /* Because of the rounding, there's only half the number of possibilities
+        for xs=0. */
+     special = xs == 0;
+     if (shift - special > 0) {
+       aom_write_literal(w, x - (xs << shift) + (!special << (shift - 1)),
+                         shift - special);
+     }
+   }
+   generic_model_update(ex_q16, x, integration);
+   /* The coder range is no longer logged: the old `enc->rng` argument named
+      a variable that does not exist in this function and broke the build
+      whenever OD_LOG was enabled. */
+   OD_LOG((OD_LOG_ENTROPY_CODER, OD_LOG_DEBUG,
+           "enc: %d %d %d %d %d", *ex_q16, x, shift, id, xs));
+ }
+
+ /** Approximates the number of bits generic_encode() would spend on x.
+  *
+  * The estimate is the cost of the 16-ary symbol under the selected cdf,
+  * plus the raw low-order bits, plus a flat 2 bits when the symbol saturates
+  * and the Laplace tail would be coded. Nothing is written and neither the
+  * model nor *ex_q16 is modified.
+  *
+  * @param [in] model  generic probability model
+  * @param [in] x      variable being encoded
+  * @param [in] ex_q16 expectation of x in Q16
+  * @return number of bits (approximation)
+  */
+ double generic_encode_cost(generic_encoder *model, int x, int *ex_q16) {
+   const int lg_q1 = log_ex(*ex_q16);
+   /* If expectation is too large, shift x to ensure that
+      all we have past xs=15 is the exponentially decaying tail
+      of the distribution */
+   const int shift = OD_MAXI(0, (lg_q1 - 5) >> 1);
+   /* Choose the cdf to use: we have two per "octave" of ExQ16 */
+   const int id = OD_MINI(GENERIC_TABLES - 1, lg_q1);
+   uint16_t *const cdf = model->cdf[id];
+   int xs = (x + (1 << shift >> 1)) >> shift;
+   int extra = shift ? shift - (xs == 0) : 0;
+   int below;
+   double prob;
+
+   xs = OD_MINI(15, xs);
+   /* Shortcut: assume it's going to cost 2 bits for the Laplace coder. */
+   if (xs == 15) {
+     extra += 2;
+   }
+   below = (xs == 0) ? 0 : cdf[xs - 1];
+   prob = (double)(cdf[xs] - below)/cdf[15];
+   return extra - OD_LOG2(prob);
+ }
+
+ /*Estimates the cost, in bits, of encoding val with the given cumulative
+   distribution: -log2((cdf[val] - cdf[val - 1]) / cdf[n - 1]), where the
+   entry below val is taken as 0 when val is 0.*/
+ double od_encode_cdf_cost(int val, uint16_t *cdf, int n) {
+   int total;
+   int below;
+   double prob;
+   OD_ASSERT(n > 0);
+   total = cdf[n - 1];
+   below = (val == 0) ? 0 : cdf[val - 1];
+   prob = (cdf[val] - below) / (double)total;
+   return -OD_LOG2(prob);
+ }
diff --git a/third_party/aom/av1/encoder/global_motion.c b/third_party/aom/av1/encoder/global_motion.c
new file mode 100644
index 0000000000..2a62049391
--- /dev/null
+++ b/third_party/aom/av1/encoder/global_motion.c
@@ -0,0 +1,319 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+#include <math.h>
+#include <assert.h>
+
+#include "av1/encoder/global_motion.h"
+
+#include "av1/common/warped_motion.h"
+
+#include "av1/encoder/segmentation.h"
+#include "av1/encoder/corner_detect.h"
+#include "av1/encoder/corner_match.h"
+#include "av1/encoder/ransac.h"
+
+#define MAX_CORNERS 4096
+#define MIN_INLIER_PROB 0.1
+
+#define MIN_TRANS_THRESH (1 * GM_TRANS_DECODE_FACTOR)
+
+// Border over which to compute the global motion
+#define ERRORADV_BORDER 0
+
+#define ERRORADV_MAX_THRESH 0.995
+#define ERRORADV_COST_PRODUCT_THRESH 26000
+
+// Returns nonzero when best_erroradvantage is below ERRORADV_MAX_THRESH and
+// its product with params_cost is below ERRORADV_COST_PRODUCT_THRESH.
+int is_enough_erroradvantage(double best_erroradvantage, int params_cost) {
+  if (!(best_erroradvantage < ERRORADV_MAX_THRESH)) return 0;
+  return best_erroradvantage * params_cost < ERRORADV_COST_PRODUCT_THRESH;
+}
+
+// Quantizes the 8 floating-point motion parameters in `params` into the
+// integer model `model`: each value is rounded at its coding precision,
+// clamped to its coded range and scaled by its decode factor. If no
+// non-translational term survives and both translations are below
+// MIN_TRANS_THRESH in magnitude, the translations are zeroed as well.
+static void convert_to_params(const double *params, int32_t *model) {
+  int idx;
+  int has_non_translation = 0;
+
+  // Translation terms.
+  for (idx = 0; idx < 2; ++idx) {
+    const int32_t q =
+        (int32_t)floor(params[idx] * (1 << GM_TRANS_PREC_BITS) + 0.5);
+    model[idx] = (int32_t)clamp(q, GM_TRANS_MIN, GM_TRANS_MAX) *
+                 GM_TRANS_DECODE_FACTOR;
+  }
+
+  // Affine terms; the diagonal entries (2 and 5) are coded relative to 1.
+  for (; idx < 6; ++idx) {
+    const int diag_value =
+        (idx == 2 || idx == 5) ? (1 << GM_ALPHA_PREC_BITS) : 0;
+    const int32_t q =
+        (int32_t)floor(params[idx] * (1 << GM_ALPHA_PREC_BITS) + 0.5);
+    const int32_t centered =
+        (int32_t)clamp(q - diag_value, GM_ALPHA_MIN, GM_ALPHA_MAX);
+    if (centered != 0) has_non_translation = 1;
+    model[idx] = (centered + diag_value) * GM_ALPHA_DECODE_FACTOR;
+  }
+
+  // Third-row (homography) terms.
+  for (; idx < 8; ++idx) {
+    const int32_t q =
+        (int32_t)floor(params[idx] * (1 << GM_ROW3HOMO_PREC_BITS) + 0.5);
+    model[idx] = (int32_t)clamp(q, GM_ROW3HOMO_MIN, GM_ROW3HOMO_MAX) *
+                 GM_ROW3HOMO_DECODE_FACTOR;
+    if (model[idx] != 0) has_non_translation = 1;
+  }
+
+  if (has_non_translation) return;
+  if (abs(model[0]) < MIN_TRANS_THRESH && abs(model[1]) < MIN_TRANS_THRESH) {
+    model[0] = 0;
+    model[1] = 0;
+  }
+}
+
+// Fills model->wmmat from the floating-point `params` and then sets
+// model->wmtype to whatever get_gmtype() reports for the resulting matrix.
+void convert_model_to_params(const double *params, WarpedMotionParams *model) {
+  int32_t *const mat = model->wmmat;
+  convert_to_params(params, mat);
+  model->wmtype = get_gmtype(model);
+}
+
+// Returns global motion parameter `param_value` (stored at warped-model
+// precision) moved by `offset` steps of its bitstream precision. Handles the
+// precision shifts, the clamping to the coded range and the zero-centering
+// of the diagonal terms (indices 2 and 5).
+static int32_t add_param_offset(int param_index, int32_t param_value,
+                                int32_t offset) {
+  // Tables indexed by parameter class:
+  // 0 - translation, 1 - affine, 2 - homography
+  const int scale_vals[3] = { GM_TRANS_PREC_DIFF, GM_ALPHA_PREC_DIFF,
+                              GM_ROW3HOMO_PREC_DIFF };
+  const int clamp_vals[3] = { GM_TRANS_MAX, GM_ALPHA_MAX, GM_ROW3HOMO_MAX };
+  const int kind = param_index < 2 ? 0 : (param_index < 6 ? 1 : 2);
+  const int prec_shift = scale_vals[kind];
+  const int limit = clamp_vals[kind];
+  // Diagonal terms are stored centered on 1.0; everything else on 0.
+  const int32_t center = (param_index == 2 || param_index == 5)
+                             ? (1 << WARPEDMODEL_PREC_BITS)
+                             : 0;
+  int32_t v;
+
+  // Zero-center and drop down to bitstream precision.
+  v = (param_value - center) >> prec_shift;
+  // Apply the requested offset at that precision.
+  v += offset;
+  // Keep the value within the number of bits allotted in the bitstream.
+  v = (int32_t)clamp(v, -limit, limit);
+  // Back to WARPEDMODEL_PREC_BITS so the warped motion library can use it.
+  v *= (1 << prec_shift);
+
+  // Undo the zero-centering.
+  return v + center;
+}
+
+// Overwrites the entries of wm->wmmat that model type `wmtype` does not
+// allow to vary, then records `wmtype` in wm->wmtype. The first four cases
+// deliberately cascade: each simpler model also applies every constraint of
+// the next more general one, down to AFFINE.
+static void force_wmtype(WarpedMotionParams *wm, TransformationType wmtype) {
+  switch (wmtype) {
+    case IDENTITY:
+      wm->wmmat[0] = 0;
+      wm->wmmat[1] = 0;
+      /* fall through */
+    case TRANSLATION:
+      wm->wmmat[2] = 1 << WARPEDMODEL_PREC_BITS;
+      wm->wmmat[3] = 0;
+      /* fall through */
+    case ROTZOOM:
+      wm->wmmat[4] = -wm->wmmat[3];
+      wm->wmmat[5] = wm->wmmat[2];
+      /* fall through */
+    case AFFINE: wm->wmmat[6] = wm->wmmat[7] = 0; break;
+    case HORTRAPEZOID: wm->wmmat[6] = wm->wmmat[4] = 0; break;
+    case VERTRAPEZOID: wm->wmmat[7] = wm->wmmat[3] = 0; break;
+    case HOMOGRAPHY: break;
+    default: assert(0); break;
+  }
+  wm->wmtype = wmtype;
+}
+
+// Refines the integer parameters of `wm` by coordinate descent on the value
+// returned by av1_warp_erroradv(). The step starts at
+// 1 << (n_refinements + 1) and is halved after each of the n_refinements
+// rounds. In every round each free parameter is probed one step to the left
+// and then to the right; if either probe lowers the error, the search keeps
+// stepping in the winning direction until the error stops decreasing.
+// Returns the best error found; wm->wmtype is re-derived with get_gmtype().
+double refine_integerized_param(WarpedMotionParams *wm,
+                                TransformationType wmtype,
+#if CONFIG_HIGHBITDEPTH
+                                int use_hbd, int bd,
+#endif  // CONFIG_HIGHBITDEPTH
+                                uint8_t *ref, int r_width, int r_height,
+                                int r_stride, uint8_t *dst, int d_width,
+                                int d_height, int d_stride, int n_refinements) {
+  static const int max_trans_model_params[TRANS_TYPES] = {
+    0, 2, 4, 6, 8, 8, 8
+  };
+  const int border = ERRORADV_BORDER;
+  const int n_params = max_trans_model_params[wmtype];
+  // Region of `dst` the error is measured over.
+  uint8_t *const dst_inner = dst + border * d_stride + border;
+  const int inner_w = d_width - 2 * border;
+  const int inner_h = d_height - 2 * border;
+  int32_t step = 1 << (n_refinements + 1);
+  double best_error;
+  int round;
+
+  force_wmtype(wm, wmtype);
+  best_error = av1_warp_erroradv(wm,
+#if CONFIG_HIGHBITDEPTH
+                                 use_hbd, bd,
+#endif  // CONFIG_HIGHBITDEPTH
+                                 ref, r_width, r_height, r_stride, dst_inner,
+                                 border, border, inner_w, inner_h, d_stride, 0,
+                                 0, 16, 16);
+
+  for (round = 0; round < n_refinements; ++round, step >>= 1) {
+    int p;
+    for (p = 0; p < n_params; ++p) {
+      int32_t *const param = &wm->wmmat[p];
+      const int32_t start_value = *param;
+      int32_t best_param = start_value;
+      int step_dir = 0;
+      int dir;
+      double step_error;
+
+      // Skip searches for parameters that are forced to be 0
+      if (wmtype == HORTRAPEZOID && (p == 4 || p == 6)) continue;
+      if (wmtype == VERTRAPEZOID && (p == 3 || p == 7)) continue;
+
+      // Probe one step to the left (-1), then one step to the right (+1).
+      for (dir = -1; dir <= 1; dir += 2) {
+        *param = add_param_offset(p, start_value, dir * step);
+        step_error = av1_warp_erroradv(wm,
+#if CONFIG_HIGHBITDEPTH
+                                       use_hbd, bd,
+#endif  // CONFIG_HIGHBITDEPTH
+                                       ref, r_width, r_height, r_stride,
+                                       dst_inner, border, border, inner_w,
+                                       inner_h, d_stride, 0, 0, 16, 16);
+        if (step_error < best_error) {
+          best_error = step_error;
+          best_param = *param;
+          step_dir = dir;
+        }
+      }
+      *param = best_param;
+
+      // Keep moving in the winning direction until the error stops
+      // decreasing.
+      while (step_dir) {
+        *param = add_param_offset(p, best_param, step * step_dir);
+        step_error = av1_warp_erroradv(wm,
+#if CONFIG_HIGHBITDEPTH
+                                       use_hbd, bd,
+#endif  // CONFIG_HIGHBITDEPTH
+                                       ref, r_width, r_height, r_stride,
+                                       dst_inner, border, border, inner_w,
+                                       inner_h, d_stride, 0, 0, 16, 16);
+        if (step_error < best_error) {
+          best_error = step_error;
+          best_param = *param;
+        } else {
+          *param = best_param;
+          step_dir = 0;
+        }
+      }
+    }
+  }
+
+  force_wmtype(wm, wmtype);
+  wm->wmtype = get_gmtype(wm);
+  return best_error;
+}
+
+// Maps a transformation type to its RANSAC estimator. Types without an
+// estimator trip the assert and yield NULL.
+static INLINE RansacFunc get_ransac_type(TransformationType type) {
+  if (type == HOMOGRAPHY) return ransac_homography;
+  if (type == HORTRAPEZOID) return ransac_hortrapezoid;
+  if (type == VERTRAPEZOID) return ransac_vertrapezoid;
+  if (type == AFFINE) return ransac_affine;
+  if (type == ROTZOOM) return ransac_rotzoom;
+  if (type == TRANSLATION) return ransac_translation;
+  assert(0);
+  return NULL;
+}
+
+#if CONFIG_HIGHBITDEPTH
+// Builds an 8-bit copy of the 16-bit luma plane of `frm` by shifting every
+// sample right by (bit_depth - 8). The copy uses the same stride as the
+// source; only the first y_width samples of each row are written.
+// Returns a malloc()ed buffer, or NULL if the allocation fails.
+static unsigned char *downconvert_frame(YV12_BUFFER_CONFIG *frm,
+                                        int bit_depth) {
+  int i, j;
+  const uint16_t *orig_buf = CONVERT_TO_SHORTPTR(frm->y_buffer);
+  // Size computed in size_t so large frames cannot overflow int arithmetic.
+  const size_t stride = (size_t)frm->y_stride;
+  uint8_t *buf = malloc((size_t)frm->y_height * stride * sizeof(*buf));
+
+  if (buf == NULL) return NULL;
+
+  for (i = 0; i < frm->y_height; ++i)
+    for (j = 0; j < frm->y_width; ++j)
+      buf[i * stride + j] = orig_buf[i * stride + j] >> (bit_depth - 8);
+
+  return buf;
+}
+#endif
+
+// Estimates up to `num_motions` global motion models of kind `type` between
+// `frm` and `ref`: corners are detected in both luma planes, matched, and
+// the matches are handed to the RANSAC estimator for `type`.
+// num_inliers_by_motion[i] receives the inlier count of motion i, forced to
+// 0 when it is below MIN_INLIER_PROB of the correspondences.
+// Returns 1 if any motion has inliers, else 0. If a working buffer cannot be
+// allocated, every inlier count is set to 0 and 0 is returned.
+int compute_global_motion_feature_based(
+    TransformationType type, YV12_BUFFER_CONFIG *frm, YV12_BUFFER_CONFIG *ref,
+#if CONFIG_HIGHBITDEPTH
+    int bit_depth,
+#endif
+    int *num_inliers_by_motion, double *params_by_motion, int num_motions) {
+  int i;
+  int num_frm_corners, num_ref_corners;
+  int num_correspondences;
+  int *correspondences;
+  int frm_corners[2 * MAX_CORNERS], ref_corners[2 * MAX_CORNERS];
+  unsigned char *frm_buffer = frm->y_buffer;
+  unsigned char *ref_buffer = ref->y_buffer;
+  RansacFunc ransac = get_ransac_type(type);
+
+#if CONFIG_HIGHBITDEPTH
+  if (frm->flags & YV12_FLAG_HIGHBITDEPTH) {
+    // The frame buffer is 16-bit, so we need to convert to 8 bits for the
+    // following code. We cache the result until the frame is released.
+    if (!frm->y_buffer_8bit)
+      frm->y_buffer_8bit = downconvert_frame(frm, bit_depth);
+    frm_buffer = frm->y_buffer_8bit;
+  }
+  if (ref->flags & YV12_FLAG_HIGHBITDEPTH) {
+    if (!ref->y_buffer_8bit)
+      ref->y_buffer_8bit = downconvert_frame(ref, bit_depth);
+    ref_buffer = ref->y_buffer_8bit;
+  }
+  // A failed down-conversion leaves nothing to run the detector on.
+  if (!frm_buffer || !ref_buffer) goto no_motion;
+#endif
+
+  // compute interest points in images using FAST features
+  num_frm_corners = fast_corner_detect(frm_buffer, frm->y_width, frm->y_height,
+                                       frm->y_stride, frm_corners, MAX_CORNERS);
+  num_ref_corners = fast_corner_detect(ref_buffer, ref->y_width, ref->y_height,
+                                       ref->y_stride, ref_corners, MAX_CORNERS);
+
+  // find correspondences between the two images
+  correspondences = malloc(num_frm_corners * 4 * sizeof(*correspondences));
+  if (!correspondences) goto no_motion;
+  num_correspondences = determine_correspondence(
+      frm_buffer, frm_corners, num_frm_corners, ref_buffer, ref_corners,
+      num_ref_corners, frm->y_width, frm->y_height, frm->y_stride,
+      ref->y_stride, correspondences);
+
+  ransac(correspondences, num_correspondences, num_inliers_by_motion,
+         params_by_motion, num_motions);
+
+  free(correspondences);
+
+  // Set num_inliers = 0 for motions with too few inliers so they are ignored.
+  for (i = 0; i < num_motions; ++i) {
+    if (num_inliers_by_motion[i] < MIN_INLIER_PROB * num_correspondences) {
+      num_inliers_by_motion[i] = 0;
+    }
+  }
+
+  // Return true if any one of the motions has inliers.
+  for (i = 0; i < num_motions; ++i) {
+    if (num_inliers_by_motion[i] > 0) return 1;
+  }
+  return 0;
+
+no_motion:
+  // Nothing could be estimated: mark every motion as having no inliers so
+  // the caller ignores the corresponding parameters.
+  for (i = 0; i < num_motions; ++i) num_inliers_by_motion[i] = 0;
+  return 0;
+}
diff --git a/third_party/aom/av1/encoder/global_motion.h b/third_party/aom/av1/encoder/global_motion.h
new file mode 100644
index 0000000000..8fc757f387
--- /dev/null
+++ b/third_party/aom/av1/encoder/global_motion.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_ENCODER_GLOBAL_MOTION_H_
+#define AV1_ENCODER_GLOBAL_MOTION_H_
+
+#include "aom/aom_integer.h"
+#include "aom_scale/yv12config.h"
+#include "av1/common/mv.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define RANSAC_NUM_MOTIONS 1
+
+void convert_model_to_params(const double *params, WarpedMotionParams *model);
+
+int is_enough_erroradvantage(double erroradv, int params_cost);
+
+double refine_integerized_param(WarpedMotionParams *wm,
+ TransformationType wmtype,
+#if CONFIG_HIGHBITDEPTH
+ int use_hbd, int bd,
+#endif // CONFIG_HIGHBITDEPTH
+ uint8_t *ref, int r_width, int r_height,
+ int r_stride, uint8_t *dst, int d_width,
+ int d_height, int d_stride, int n_refinements);
+
+/*
+ Computes "num_motions" candidate global motion parameters between two frames.
+ The array "params_by_motion" should be length 8 * "num_motions". The ordering
+ of each set of parameters is best described by the homography:
+
+ [x' (m2 m3 m0 [x
+ z . y' = m4 m5 m1 * y
+ 1] m6 m7 1) 1]
+
+ where m{i} represents the ith value in any given set of parameters.
+
+ "num_inliers_by_motion" should be length "num_motions", and will be
+ populated with the number of inlier feature points for each motion. Params
+ for which the num_inliers_by_motion entry is 0 should be ignored by the caller.
+*/
+int compute_global_motion_feature_based(
+ TransformationType type, YV12_BUFFER_CONFIG *frm, YV12_BUFFER_CONFIG *ref,
+#if CONFIG_HIGHBITDEPTH
+ int bit_depth,
+#endif
+ int *num_inliers_by_motion, double *params_by_motion, int num_motions);
+#ifdef __cplusplus
+} // extern "C"
+#endif
+#endif // AV1_ENCODER_GLOBAL_MOTION_H_
diff --git a/third_party/aom/av1/encoder/hybrid_fwd_txfm.c b/third_party/aom/av1/encoder/hybrid_fwd_txfm.c
new file mode 100644
index 0000000000..4fd5631632
--- /dev/null
+++ b/third_party/aom/av1/encoder/hybrid_fwd_txfm.c
@@ -0,0 +1,499 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "./av1_rtcd.h"
+#include "./aom_config.h"
+#include "./aom_dsp_rtcd.h"
+
+#include "av1/common/idct.h"
+#include "av1/encoder/hybrid_fwd_txfm.h"
+
+#if CONFIG_CB4X4
+// 2x2 forward transform: sum/difference butterfly down the two columns,
+// then across the two rows, with every output scaled by 4.
+// tx_type and lossless are unused.
+static void fwd_txfm_2x2(const int16_t *src_diff, tran_low_t *coeff,
+                         int diff_stride, TX_TYPE tx_type, int lossless) {
+  const tran_high_t top_left = src_diff[0];
+  const tran_high_t top_right = src_diff[1];
+  const tran_high_t bottom_left = src_diff[diff_stride];
+  const tran_high_t bottom_right = src_diff[1 + diff_stride];
+
+  // Vertical stage (per column).
+  const tran_high_t col0_sum = top_left + bottom_left;
+  const tran_high_t col1_sum = top_right + bottom_right;
+  const tran_high_t col0_diff = top_left - bottom_left;
+  const tran_high_t col1_diff = top_right - bottom_right;
+
+  (void)tx_type;
+  (void)lossless;
+
+  // Horizontal stage and scaling.
+  coeff[0] = (tran_low_t)(4 * (col0_sum + col1_sum));
+  coeff[1] = (tran_low_t)(4 * (col0_sum - col1_sum));
+  coeff[2] = (tran_low_t)(4 * (col0_diff + col1_diff));
+  coeff[3] = (tran_low_t)(4 * (col0_diff - col1_diff));
+}
+#endif
+
+// 4x4 forward transform: av1_fwht4x4() in lossless mode (tx_type must then
+// be DCT_DCT), av1_fht4x4() otherwise.
+static void fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff,
+                         int diff_stride, TX_TYPE tx_type, int lossless) {
+  if (!lossless) {
+    av1_fht4x4(src_diff, coeff, diff_stride, tx_type);
+    return;
+  }
+
+  assert(tx_type == DCT_DCT);
+  av1_fwht4x4(src_diff, coeff, diff_stride);
+}
+
+// 4x8 forward transform; forwards to av1_fht4x8(). fwd_txfm_opt is unused.
+static void fwd_txfm_4x8(const int16_t *src_diff, tran_low_t *coeff,
+                         int diff_stride, TX_TYPE tx_type,
+                         FWD_TXFM_OPT fwd_txfm_opt) {
+  av1_fht4x8(src_diff, coeff, diff_stride, tx_type);
+  (void)fwd_txfm_opt;
+}
+
+// 8x4 forward transform; forwards to av1_fht8x4(). fwd_txfm_opt is unused.
+static void fwd_txfm_8x4(const int16_t *src_diff, tran_low_t *coeff,
+                         int diff_stride, TX_TYPE tx_type,
+                         FWD_TXFM_OPT fwd_txfm_opt) {
+  av1_fht8x4(src_diff, coeff, diff_stride, tx_type);
+  (void)fwd_txfm_opt;
+}
+
+// 8x16 forward transform; forwards to av1_fht8x16(). fwd_txfm_opt is unused.
+static void fwd_txfm_8x16(const int16_t *src_diff, tran_low_t *coeff,
+                          int diff_stride, TX_TYPE tx_type,
+                          FWD_TXFM_OPT fwd_txfm_opt) {
+  av1_fht8x16(src_diff, coeff, diff_stride, tx_type);
+  (void)fwd_txfm_opt;
+}
+
+// 16x8 forward transform; forwards to av1_fht16x8(). fwd_txfm_opt is unused.
+static void fwd_txfm_16x8(const int16_t *src_diff, tran_low_t *coeff,
+                          int diff_stride, TX_TYPE tx_type,
+                          FWD_TXFM_OPT fwd_txfm_opt) {
+  av1_fht16x8(src_diff, coeff, diff_stride, tx_type);
+  (void)fwd_txfm_opt;
+}
+
+// 16x32 forward transform; forwards to av1_fht16x32(). fwd_txfm_opt is
+// unused.
+static void fwd_txfm_16x32(const int16_t *src_diff, tran_low_t *coeff,
+                           int diff_stride, TX_TYPE tx_type,
+                           FWD_TXFM_OPT fwd_txfm_opt) {
+  av1_fht16x32(src_diff, coeff, diff_stride, tx_type);
+  (void)fwd_txfm_opt;
+}
+
+// 32x16 forward transform; forwards to av1_fht32x16(). fwd_txfm_opt is
+// unused.
+static void fwd_txfm_32x16(const int16_t *src_diff, tran_low_t *coeff,
+                           int diff_stride, TX_TYPE tx_type,
+                           FWD_TXFM_OPT fwd_txfm_opt) {
+  av1_fht32x16(src_diff, coeff, diff_stride, tx_type);
+  (void)fwd_txfm_opt;
+}
+
+// 8x8 forward transform; forwards to av1_fht8x8(). fwd_txfm_opt is unused.
+static void fwd_txfm_8x8(const int16_t *src_diff, tran_low_t *coeff,
+                         int diff_stride, TX_TYPE tx_type,
+                         FWD_TXFM_OPT fwd_txfm_opt) {
+  av1_fht8x8(src_diff, coeff, diff_stride, tx_type);
+  (void)fwd_txfm_opt;
+}
+
+// 16x16 forward transform; forwards to av1_fht16x16(). fwd_txfm_opt is
+// unused.
+static void fwd_txfm_16x16(const int16_t *src_diff, tran_low_t *coeff,
+                           int diff_stride, TX_TYPE tx_type,
+                           FWD_TXFM_OPT fwd_txfm_opt) {
+  av1_fht16x16(src_diff, coeff, diff_stride, tx_type);
+  (void)fwd_txfm_opt;
+}
+
+// 32x32 forward transform; forwards to av1_fht32x32(). fwd_txfm_opt is
+// unused.
+static void fwd_txfm_32x32(const int16_t *src_diff, tran_low_t *coeff,
+                           int diff_stride, TX_TYPE tx_type,
+                           FWD_TXFM_OPT fwd_txfm_opt) {
+  av1_fht32x32(src_diff, coeff, diff_stride, tx_type);
+  (void)fwd_txfm_opt;
+}
+
+#if CONFIG_TX64X64
+// 64x64 forward transform. With CONFIG_EXT_TX, IDTX goes to
+// av1_fwd_idtx_c(); every other type goes to av1_fht64x64().
+// fwd_txfm_opt is unused.
+static void fwd_txfm_64x64(const int16_t *src_diff, tran_low_t *coeff,
+                           int diff_stride, TX_TYPE tx_type,
+                           FWD_TXFM_OPT fwd_txfm_opt) {
+  (void)fwd_txfm_opt;
+#if CONFIG_EXT_TX
+  if (tx_type == IDTX) {
+    av1_fwd_idtx_c(src_diff, coeff, diff_stride, 64, tx_type);
+    return;
+  }
+#endif
+  av1_fht64x64(src_diff, coeff, diff_stride, tx_type);
+}
+#endif // CONFIG_TX64X64
+
+#if CONFIG_HIGHBITDEPTH
+#if CONFIG_CB4X4
+// High-bitdepth 2x2 forward transform: sum/difference butterfly down the
+// two columns, then across the two rows, with every output scaled by 4.
+// tx_type, lossless and bd are unused.
+static void highbd_fwd_txfm_2x2(const int16_t *src_diff, tran_low_t *coeff,
+                                int diff_stride, TX_TYPE tx_type, int lossless,
+                                const int bd) {
+  const tran_high_t top_left = src_diff[0];
+  const tran_high_t top_right = src_diff[1];
+  const tran_high_t bottom_left = src_diff[diff_stride];
+  const tran_high_t bottom_right = src_diff[1 + diff_stride];
+
+  // Vertical stage (per column).
+  const tran_high_t col0_sum = top_left + bottom_left;
+  const tran_high_t col1_sum = top_right + bottom_right;
+  const tran_high_t col0_diff = top_left - bottom_left;
+  const tran_high_t col1_diff = top_right - bottom_right;
+
+  (void)tx_type;
+  (void)lossless;
+  (void)bd;
+
+  // Horizontal stage and scaling.
+  coeff[0] = (tran_low_t)(4 * (col0_sum + col1_sum));
+  coeff[1] = (tran_low_t)(4 * (col0_sum - col1_sum));
+  coeff[2] = (tran_low_t)(4 * (col0_diff + col1_diff));
+  coeff[3] = (tran_low_t)(4 * (col0_diff - col1_diff));
+}
+#endif
+
+// High-bitdepth 4x4 forward transform. Lossless mode uses
+// av1_highbd_fwht4x4() (tx_type must be DCT_DCT). Otherwise the DCT/ADST
+// (and, with CONFIG_EXT_TX, FLIPADST) combinations use av1_fwd_txfm2d_4x4(),
+// the 1-D types use av1_highbd_fht4x4_c() and IDTX uses av1_fwd_idtx_c().
+static void highbd_fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff,
+                                int diff_stride, TX_TYPE tx_type, int lossless,
+                                const int bd) {
+  if (lossless) {
+    assert(tx_type == DCT_DCT);
+    av1_highbd_fwht4x4(src_diff, coeff, diff_stride);
+    return;
+  }
+
+  switch (tx_type) {
+    case DCT_DCT:
+    case ADST_DCT:
+    case DCT_ADST:
+    case ADST_ADST:
+#if CONFIG_EXT_TX
+    case FLIPADST_DCT:
+    case DCT_FLIPADST:
+    case FLIPADST_FLIPADST:
+    case ADST_FLIPADST:
+    case FLIPADST_ADST:
+#endif  // CONFIG_EXT_TX
+      av1_fwd_txfm2d_4x4(src_diff, coeff, diff_stride, tx_type, bd);
+      break;
+#if CONFIG_EXT_TX
+    case V_DCT:
+    case H_DCT:
+    case V_ADST:
+    case H_ADST:
+    case V_FLIPADST:
+    case H_FLIPADST:
+      av1_highbd_fht4x4_c(src_diff, coeff, diff_stride, tx_type);
+      break;
+    case IDTX: av1_fwd_idtx_c(src_diff, coeff, diff_stride, 4, tx_type); break;
+#endif  // CONFIG_EXT_TX
+    default: assert(0);
+  }
+}
+
+// High-bitdepth 4x8 forward transform; forwards to av1_highbd_fht4x8().
+// fwd_txfm_opt and bd are unused.
+static void highbd_fwd_txfm_4x8(const int16_t *src_diff, tran_low_t *coeff,
+                                int diff_stride, TX_TYPE tx_type,
+                                FWD_TXFM_OPT fwd_txfm_opt, const int bd) {
+  av1_highbd_fht4x8(src_diff, coeff, diff_stride, tx_type);
+  (void)fwd_txfm_opt;
+  (void)bd;
+}
+
+// High-bitdepth 8x4 forward transform; forwards to av1_highbd_fht8x4().
+// fwd_txfm_opt and bd are unused.
+static void highbd_fwd_txfm_8x4(const int16_t *src_diff, tran_low_t *coeff,
+                                int diff_stride, TX_TYPE tx_type,
+                                FWD_TXFM_OPT fwd_txfm_opt, const int bd) {
+  av1_highbd_fht8x4(src_diff, coeff, diff_stride, tx_type);
+  (void)fwd_txfm_opt;
+  (void)bd;
+}
+
+// High-bitdepth 8x16 forward transform; forwards to av1_highbd_fht8x16().
+// fwd_txfm_opt and bd are unused.
+static void highbd_fwd_txfm_8x16(const int16_t *src_diff, tran_low_t *coeff,
+                                 int diff_stride, TX_TYPE tx_type,
+                                 FWD_TXFM_OPT fwd_txfm_opt, const int bd) {
+  av1_highbd_fht8x16(src_diff, coeff, diff_stride, tx_type);
+  (void)fwd_txfm_opt;
+  (void)bd;
+}
+
+// High-bitdepth 16x8 forward transform; forwards to av1_highbd_fht16x8().
+// fwd_txfm_opt and bd are unused.
+static void highbd_fwd_txfm_16x8(const int16_t *src_diff, tran_low_t *coeff,
+                                 int diff_stride, TX_TYPE tx_type,
+                                 FWD_TXFM_OPT fwd_txfm_opt, const int bd) {
+  av1_highbd_fht16x8(src_diff, coeff, diff_stride, tx_type);
+  (void)fwd_txfm_opt;
+  (void)bd;
+}
+
+// High-bitdepth 16x32 forward transform; forwards to av1_highbd_fht16x32().
+// fwd_txfm_opt and bd are unused.
+static void highbd_fwd_txfm_16x32(const int16_t *src_diff, tran_low_t *coeff,
+                                  int diff_stride, TX_TYPE tx_type,
+                                  FWD_TXFM_OPT fwd_txfm_opt, const int bd) {
+  av1_highbd_fht16x32(src_diff, coeff, diff_stride, tx_type);
+  (void)fwd_txfm_opt;
+  (void)bd;
+}
+
+// High-bitdepth 32x16 forward transform; forwards to av1_highbd_fht32x16().
+// fwd_txfm_opt and bd are unused.
+static void highbd_fwd_txfm_32x16(const int16_t *src_diff, tran_low_t *coeff,
+                                  int diff_stride, TX_TYPE tx_type,
+                                  FWD_TXFM_OPT fwd_txfm_opt, const int bd) {
+  av1_highbd_fht32x16(src_diff, coeff, diff_stride, tx_type);
+  (void)fwd_txfm_opt;
+  (void)bd;
+}
+
+// High-bitdepth 8x8 forward transform. The DCT/ADST (and, with
+// CONFIG_EXT_TX, FLIPADST) combinations use av1_fwd_txfm2d_8x8(), the 1-D
+// types use av1_highbd_fht8x8_c() and IDTX uses av1_fwd_idtx_c().
+// fwd_txfm_opt is unused.
+static void highbd_fwd_txfm_8x8(const int16_t *src_diff, tran_low_t *coeff,
+                                int diff_stride, TX_TYPE tx_type,
+                                FWD_TXFM_OPT fwd_txfm_opt, const int bd) {
+  (void)fwd_txfm_opt;
+  switch (tx_type) {
+    case DCT_DCT:
+    case ADST_DCT:
+    case DCT_ADST:
+    case ADST_ADST:
+#if CONFIG_EXT_TX
+    case FLIPADST_DCT:
+    case DCT_FLIPADST:
+    case FLIPADST_FLIPADST:
+    case ADST_FLIPADST:
+    case FLIPADST_ADST:
+#endif  // CONFIG_EXT_TX
+      av1_fwd_txfm2d_8x8(src_diff, coeff, diff_stride, tx_type, bd);
+      break;
+#if CONFIG_EXT_TX
+    case V_DCT:
+    case H_DCT:
+    case V_ADST:
+    case H_ADST:
+    case V_FLIPADST:
+    case H_FLIPADST:
+      // Use C version since DST exists only in C
+      av1_highbd_fht8x8_c(src_diff, coeff, diff_stride, tx_type);
+      break;
+    case IDTX: av1_fwd_idtx_c(src_diff, coeff, diff_stride, 8, tx_type); break;
+#endif  // CONFIG_EXT_TX
+    default: assert(0);
+  }
+}
+
+// High-bitdepth 16x16 forward transform. The DCT/ADST (and, with
+// CONFIG_EXT_TX, FLIPADST) combinations use av1_fwd_txfm2d_16x16(), the 1-D
+// types use av1_highbd_fht16x16_c() and IDTX uses av1_fwd_idtx_c().
+// fwd_txfm_opt is unused.
+static void highbd_fwd_txfm_16x16(const int16_t *src_diff, tran_low_t *coeff,
+                                  int diff_stride, TX_TYPE tx_type,
+                                  FWD_TXFM_OPT fwd_txfm_opt, const int bd) {
+  (void)fwd_txfm_opt;
+  switch (tx_type) {
+    case DCT_DCT:
+    case ADST_DCT:
+    case DCT_ADST:
+    case ADST_ADST:
+#if CONFIG_EXT_TX
+    case FLIPADST_DCT:
+    case DCT_FLIPADST:
+    case FLIPADST_FLIPADST:
+    case ADST_FLIPADST:
+    case FLIPADST_ADST:
+#endif  // CONFIG_EXT_TX
+      av1_fwd_txfm2d_16x16(src_diff, coeff, diff_stride, tx_type, bd);
+      break;
+#if CONFIG_EXT_TX
+    case V_DCT:
+    case H_DCT:
+    case V_ADST:
+    case H_ADST:
+    case V_FLIPADST:
+    case H_FLIPADST:
+      // Use C version since DST exists only in C
+      av1_highbd_fht16x16_c(src_diff, coeff, diff_stride, tx_type);
+      break;
+    case IDTX: av1_fwd_idtx_c(src_diff, coeff, diff_stride, 16, tx_type); break;
+#endif  // CONFIG_EXT_TX
+    default: assert(0);
+  }
+}
+
+// High-bitdepth 32x32 forward transform. DCT_DCT uses
+// av1_fwd_txfm2d_32x32(); with CONFIG_EXT_TX, IDTX uses av1_fwd_idtx_c() and
+// every other listed type uses av1_highbd_fht32x32_c().
+// fwd_txfm_opt is unused.
+static void highbd_fwd_txfm_32x32(const int16_t *src_diff, tran_low_t *coeff,
+                                  int diff_stride, TX_TYPE tx_type,
+                                  FWD_TXFM_OPT fwd_txfm_opt, const int bd) {
+  (void)fwd_txfm_opt;
+  switch (tx_type) {
+#if CONFIG_EXT_TX
+    case IDTX: av1_fwd_idtx_c(src_diff, coeff, diff_stride, 32, tx_type); break;
+    case V_DCT:
+    case H_DCT:
+    case V_ADST:
+    case H_ADST:
+    case V_FLIPADST:
+    case H_FLIPADST:
+    case FLIPADST_DCT:
+    case DCT_FLIPADST:
+    case FLIPADST_FLIPADST:
+    case ADST_FLIPADST:
+    case FLIPADST_ADST:
+    case ADST_DCT:
+    case DCT_ADST:
+    case ADST_ADST:
+      av1_highbd_fht32x32_c(src_diff, coeff, diff_stride, tx_type);
+      break;
+#endif  // CONFIG_EXT_TX
+    case DCT_DCT:
+      av1_fwd_txfm2d_32x32(src_diff, coeff, diff_stride, tx_type, bd);
+      break;
+    default: assert(0); break;
+  }
+}
+
+#if CONFIG_TX64X64
+// High-bitdepth 64x64 forward transform. Every listed type except IDTX uses
+// av1_highbd_fht64x64(); IDTX (CONFIG_EXT_TX only) uses av1_fwd_idtx_c().
+// fwd_txfm_opt and bd are unused.
+static void highbd_fwd_txfm_64x64(const int16_t *src_diff, tran_low_t *coeff,
+                                  int diff_stride, TX_TYPE tx_type,
+                                  FWD_TXFM_OPT fwd_txfm_opt, const int bd) {
+  (void)fwd_txfm_opt;
+  (void)bd;
+  switch (tx_type) {
+    case DCT_DCT:
+#if CONFIG_EXT_TX
+    case ADST_DCT:
+    case DCT_ADST:
+    case ADST_ADST:
+    case FLIPADST_DCT:
+    case DCT_FLIPADST:
+    case FLIPADST_FLIPADST:
+    case ADST_FLIPADST:
+    case FLIPADST_ADST:
+    case V_DCT:
+    case H_DCT:
+    case V_ADST:
+    case H_ADST:
+    case V_FLIPADST:
+    case H_FLIPADST:
+#endif  // CONFIG_EXT_TX
+      av1_highbd_fht64x64(src_diff, coeff, diff_stride, tx_type);
+      break;
+#if CONFIG_EXT_TX
+    case IDTX: av1_fwd_idtx_c(src_diff, coeff, diff_stride, 64, tx_type); break;
+#endif  // CONFIG_EXT_TX
+    default: assert(0); break;
+  }
+}
+#endif // CONFIG_TX64X64
+#endif // CONFIG_HIGHBITDEPTH
+
+// Runs the forward transform selected by fwd_txfm_param->tx_size on the
+// residual `src_diff` (row pitch `diff_stride`), writing to `coeff`.
+// fwd_txfm_param->lossless is forwarded only to the 4x4 (and, with
+// CONFIG_CB4X4, 2x2) transforms. Unsupported sizes trip an assert.
+void av1_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff, int diff_stride,
+                  FWD_TXFM_PARAM *fwd_txfm_param) {
+  const int opt = FWD_TXFM_OPT_NORMAL;
+  const TX_TYPE type = fwd_txfm_param->tx_type;
+  const int is_lossless = fwd_txfm_param->lossless;
+
+  switch (fwd_txfm_param->tx_size) {
+#if CONFIG_CB4X4
+    case TX_2X2:
+      fwd_txfm_2x2(src_diff, coeff, diff_stride, type, is_lossless);
+      break;
+#endif
+    case TX_4X4:
+      fwd_txfm_4x4(src_diff, coeff, diff_stride, type, is_lossless);
+      break;
+    case TX_4X8: fwd_txfm_4x8(src_diff, coeff, diff_stride, type, opt); break;
+    case TX_8X4: fwd_txfm_8x4(src_diff, coeff, diff_stride, type, opt); break;
+    case TX_8X8: fwd_txfm_8x8(src_diff, coeff, diff_stride, type, opt); break;
+    case TX_8X16: fwd_txfm_8x16(src_diff, coeff, diff_stride, type, opt); break;
+    case TX_16X8: fwd_txfm_16x8(src_diff, coeff, diff_stride, type, opt); break;
+    case TX_16X16:
+      fwd_txfm_16x16(src_diff, coeff, diff_stride, type, opt);
+      break;
+    case TX_16X32:
+      fwd_txfm_16x32(src_diff, coeff, diff_stride, type, opt);
+      break;
+    case TX_32X16:
+      fwd_txfm_32x16(src_diff, coeff, diff_stride, type, opt);
+      break;
+    case TX_32X32:
+      fwd_txfm_32x32(src_diff, coeff, diff_stride, type, opt);
+      break;
+#if CONFIG_TX64X64
+    case TX_64X64:
+      fwd_txfm_64x64(src_diff, coeff, diff_stride, type, opt);
+      break;
+#endif  // CONFIG_TX64X64
+    default: assert(0); break;
+  }
+}
+
+#if CONFIG_HIGHBITDEPTH
+// High-bitdepth counterpart of av1_fwd_txfm(): runs the forward transform
+// selected by fwd_txfm_param->tx_size, additionally passing
+// fwd_txfm_param->bd to every transform. fwd_txfm_param->lossless is
+// forwarded only to the 4x4 (and, with CONFIG_CB4X4, 2x2) transforms.
+// Unsupported sizes trip an assert.
+void av1_highbd_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff,
+                         int diff_stride, FWD_TXFM_PARAM *fwd_txfm_param) {
+  const int opt = FWD_TXFM_OPT_NORMAL;
+  const TX_TYPE type = fwd_txfm_param->tx_type;
+  const int is_lossless = fwd_txfm_param->lossless;
+  const int bd = fwd_txfm_param->bd;
+
+  switch (fwd_txfm_param->tx_size) {
+#if CONFIG_CB4X4
+    case TX_2X2:
+      highbd_fwd_txfm_2x2(src_diff, coeff, diff_stride, type, is_lossless, bd);
+      break;
+#endif
+    case TX_4X4:
+      highbd_fwd_txfm_4x4(src_diff, coeff, diff_stride, type, is_lossless, bd);
+      break;
+    case TX_4X8:
+      highbd_fwd_txfm_4x8(src_diff, coeff, diff_stride, type, opt, bd);
+      break;
+    case TX_8X4:
+      highbd_fwd_txfm_8x4(src_diff, coeff, diff_stride, type, opt, bd);
+      break;
+    case TX_8X8:
+      highbd_fwd_txfm_8x8(src_diff, coeff, diff_stride, type, opt, bd);
+      break;
+    case TX_8X16:
+      highbd_fwd_txfm_8x16(src_diff, coeff, diff_stride, type, opt, bd);
+      break;
+    case TX_16X8:
+      highbd_fwd_txfm_16x8(src_diff, coeff, diff_stride, type, opt, bd);
+      break;
+    case TX_16X16:
+      highbd_fwd_txfm_16x16(src_diff, coeff, diff_stride, type, opt, bd);
+      break;
+    case TX_16X32:
+      highbd_fwd_txfm_16x32(src_diff, coeff, diff_stride, type, opt, bd);
+      break;
+    case TX_32X16:
+      highbd_fwd_txfm_32x16(src_diff, coeff, diff_stride, type, opt, bd);
+      break;
+    case TX_32X32:
+      highbd_fwd_txfm_32x32(src_diff, coeff, diff_stride, type, opt, bd);
+      break;
+#if CONFIG_TX64X64
+    case TX_64X64:
+      highbd_fwd_txfm_64x64(src_diff, coeff, diff_stride, type, opt, bd);
+      break;
+#endif  // CONFIG_TX64X64
+    default: assert(0); break;
+  }
+}
+#endif // CONFIG_HIGHBITDEPTH
diff --git a/third_party/aom/av1/encoder/hybrid_fwd_txfm.h b/third_party/aom/av1/encoder/hybrid_fwd_txfm.h
new file mode 100644
index 0000000000..e6fd17275e
--- /dev/null
+++ b/third_party/aom/av1/encoder/hybrid_fwd_txfm.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_ENCODER_HYBRID_FWD_TXFM_H_
+#define AV1_ENCODER_HYBRID_FWD_TXFM_H_
+
+#include "./aom_config.h"
+
+typedef enum FWD_TXFM_OPT { FWD_TXFM_OPT_NORMAL } FWD_TXFM_OPT; // forward-transform option selector; FWD_TXFM_OPT_NORMAL is the only value defined
+
+typedef struct FWD_TXFM_PARAM { // inputs read by av1_fwd_txfm() / av1_highbd_fwd_txfm()
+ TX_TYPE tx_type; // transform type passed on to the selected transform routine
+ TX_SIZE tx_size; // transform size; selects which transform routine is run
+ int lossless; // passed on to the 4x4 (and 2x2) transforms only
+#if CONFIG_HIGHBITDEPTH
+ int bd; // bit depth; read only by av1_highbd_fwd_txfm()
+#endif // CONFIG_HIGHBITDEPTH
+} FWD_TXFM_PARAM;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void av1_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff, int diff_stride,
+ FWD_TXFM_PARAM *fwd_txfm_param);
+
+#if CONFIG_HIGHBITDEPTH
+void av1_highbd_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, FWD_TXFM_PARAM *fwd_txfm_param);
+#endif // CONFIG_HIGHBITDEPTH
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AV1_ENCODER_HYBRID_FWD_TXFM_H_
diff --git a/third_party/aom/av1/encoder/laplace_encoder.c b/third_party/aom/av1/encoder/laplace_encoder.c
new file mode 100644
index 0000000000..54ffc88fb5
--- /dev/null
+++ b/third_party/aom/av1/encoder/laplace_encoder.c
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/* clang-format off */
+
+#ifdef HAVE_CONFIG_H
+# include "config.h"
+#endif
+
+#include <stdio.h>
+
+#include "aom_dsp/bitwriter.h"
+#include "av1/common/odintrin.h"
+#include "av1/common/pvq.h"
+#include "pvq_encoder.h"
+
+/* Codes count, a value associated with a total of sum, using the adaptive
+   pvq_split_cdf selected by ctx and sum. When sum needs more than 3 bits,
+   both values are shifted down before the symbol is coded and the dropped
+   low bits of count are written as a raw literal. Nothing is coded when
+   sum is 0. */
+static void aom_encode_pvq_split(aom_writer *w, od_pvq_codeword_ctx *adapt,
+ int count, int sum, int ctx) {
+  int shift;
+  int low_bits;
+  if (sum == 0) return;
+  shift = OD_MAXI(0, OD_ILOG(sum) - 3);
+  /* With shift == 0 the mask is empty and the shifts are no-ops. */
+  low_bits = count & ((1 << shift) - 1);
+  count >>= shift;
+  sum >>= shift;
+  aom_write_symbol_pvq(w, count, adapt->pvq_split_cdf[7*ctx + sum - 1],
+   sum + 1);
+  if (shift) aom_write_literal(w, low_bits, shift);
+}
+
+/* Recursively codes how the k pulses of the n-element vector y are
+   distributed. A single pulse in a vector of at most 16 elements is coded
+   directly as its position; otherwise the number of pulses falling in the
+   right half is coded and both halves are processed one level deeper.
+   Nothing is coded when n <= 1 or k == 0. */
+void aom_encode_band_pvq_splits(aom_writer *w, od_pvq_codeword_ctx *adapt,
+ const int *y, int n, int k, int level) {
+  if (n <= 1 || k == 0) return;
+  if (k == 1 && n <= 16) {
+    const int cdf_id = od_pvq_k1_ctx(n, level == 0);
+    int pos = 0;
+    /* Locate the first nonzero entry. */
+    while (!y[pos]) pos++;
+    OD_ASSERT(pos < n);
+    aom_write_symbol_pvq(w, pos, adapt->pvq_k1_cdf[cdf_id], n);
+    return;
+  }
+  {
+    const int mid = n >> 1;
+    int count_right = k;
+    int i;
+    /* Whatever is not in the left half must be in the right half. */
+    for (i = 0; i < mid; i++) count_right -= abs(y[i]);
+    aom_encode_pvq_split(w, adapt, count_right, k, od_pvq_size_ctx(n));
+    aom_encode_band_pvq_splits(w, adapt, y, mid, k - count_right, level + 1);
+    aom_encode_band_pvq_splits(w, adapt, y + mid, n - mid, count_right,
+     level + 1);
+  }
+}
+
+/** Encodes the tail of a Laplace-distributed variable, i.e. it doesn't
+ * do anything special for the zero case.
+ *
+ * The value is coded as a sequence of 16-ary symbols (symbol 15 meaning
+ * "15 or more, continue"), followed by `shift` raw low-order bits when the
+ * decay had to be reduced.
+ *
+ * @param [in,out] w     bit writer
+ * @param [in]     x     variable to encode (has to be positive)
+ * @param [in]     decay decay factor of the distribution in Q8 format,
+ * i.e. pdf ~= decay^x
+ */
+void aom_laplace_encode_special(aom_writer *w, int x, unsigned decay) {
+  int shift;
+  int xs;
+  int sym;
+  const uint16_t *cdf;
+  shift = 0;
+  /* We don't want a large decay value because that would require too many
+     symbols. */
+  while (decay > 235) {
+    decay = (decay*decay + 128) >> 8;
+    shift++;
+  }
+  decay = OD_MINI(decay, 254);
+  decay = OD_MAXI(decay, 2);
+  xs = x >> shift;
+  cdf = EXP_CDF_TABLE[(decay + 1) >> 1];
+  OD_LOG((OD_LOG_PVQ, OD_LOG_DEBUG, "decay = %d", decay));
+  do {
+    sym = OD_MINI(xs, 15);
+    {
+      int i;
+      /* This message used to print a fifth value, `max`, which is not
+         declared in this function and broke builds with logging enabled. */
+      OD_LOG((OD_LOG_PVQ, OD_LOG_DEBUG, "%d %d %d %d\n", x, xs, shift,
+       sym));
+      for (i = 0; i < 16; i++) {
+        OD_LOG_PARTIAL((OD_LOG_PVQ, OD_LOG_DEBUG, "%d ", cdf[i]));
+      }
+      OD_LOG_PARTIAL((OD_LOG_PVQ, OD_LOG_DEBUG, "\n"));
+    }
+    aom_write_cdf(w, sym, cdf, 16);
+    xs -= 15;
+  } while (sym >= 15);
+  if (shift) aom_write_literal(w, x & ((1 << shift) - 1), shift);
+}
diff --git a/third_party/aom/av1/encoder/lookahead.c b/third_party/aom/av1/encoder/lookahead.c
new file mode 100644
index 0000000000..591ca61521
--- /dev/null
+++ b/third_party/aom/av1/encoder/lookahead.c
@@ -0,0 +1,225 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <assert.h>
+#include <stdlib.h>
+
+#include "./aom_config.h"
+
+#include "av1/common/common.h"
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/extend.h"
+#include "av1/encoder/lookahead.h"
+
+/* Hands out the queue slot at absolute position *idx, then advances *idx by
+ * one, wrapping back to the start once it reaches ctx->max_sz. */
+static struct lookahead_entry *pop(struct lookahead_ctx *ctx, int *idx) {
+  const int cur = *idx;
+  int next = cur + 1;
+
+  assert(cur < ctx->max_sz);
+  if (next >= ctx->max_sz) next -= ctx->max_sz;
+  *idx = next;
+  return &ctx->buf[cur];
+}
+
+/* Releases every frame buffer owned by the queue, the slot array and the
+ * context itself. A NULL ctx is accepted and ignored. */
+void av1_lookahead_destroy(struct lookahead_ctx *ctx) {
+  if (!ctx) return;
+
+  if (ctx->buf) {
+    int slot = 0;
+
+    while (slot < ctx->max_sz) {
+      aom_free_frame_buffer(&ctx->buf[slot].img);
+      ++slot;
+    }
+    free(ctx->buf);
+  }
+  free(ctx);
+}
+
+/* Creates a lookahead queue whose slots are pre-allocated frame buffers of
+ * the given geometry. Returns NULL when any allocation fails. */
+struct lookahead_ctx *av1_lookahead_init(unsigned int width,
+                                         unsigned int height,
+                                         unsigned int subsampling_x,
+                                         unsigned int subsampling_y,
+#if CONFIG_HIGHBITDEPTH
+                                         int use_highbitdepth,
+#endif
+                                         unsigned int depth) {
+  struct lookahead_ctx *ctx;
+  unsigned int slot;
+
+  // Clamp the requested lag, then add room for the past source frames that
+  // stay available after they have been popped.
+  depth = clamp(depth, 1, MAX_LAG_BUFFERS);
+  depth += MAX_PRE_FRAMES;
+
+  ctx = calloc(1, sizeof(*ctx));
+  if (!ctx) return NULL;
+
+  ctx->max_sz = depth;
+  ctx->buf = calloc(depth, sizeof(*ctx->buf));
+  if (!ctx->buf) goto fail;
+
+  for (slot = 0; slot < depth; ++slot) {
+    // The last argument (legacy byte alignment) is always 0 here.
+    if (aom_alloc_frame_buffer(&ctx->buf[slot].img, width, height,
+                               subsampling_x, subsampling_y,
+#if CONFIG_HIGHBITDEPTH
+                               use_highbitdepth,
+#endif
+                               AOM_BORDER_IN_PIXELS, 0)) {
+      goto fail;
+    }
+  }
+  return ctx;
+
+fail:
+  // Destroy copes with a partially populated slot array.
+  av1_lookahead_destroy(ctx);
+  return NULL;
+}
+
+#define USE_PARTIAL_COPY 0
+
+/* Copies src into the next free queue slot and records its timestamps and
+ * flags. The slot's buffer is reallocated when src exceeds its allocated
+ * size, or merely re-described when only the crop size/subsampling changed.
+ *
+ * Returns 0 on success and 1 when the queue is full or the reallocation
+ * fails; in both failure cases the queue is left exactly as it was.
+ */
+int av1_lookahead_push(struct lookahead_ctx *ctx, YV12_BUFFER_CONFIG *src,
+                       int64_t ts_start, int64_t ts_end,
+#if CONFIG_HIGHBITDEPTH
+                       int use_highbitdepth,
+#endif
+                       aom_enc_frame_flags_t flags) {
+  struct lookahead_entry *buf;
+#if USE_PARTIAL_COPY
+  int row, col, active_end;
+  int mb_rows = (src->y_height + 15) >> 4;
+  int mb_cols = (src->y_width + 15) >> 4;
+#endif
+  int width = src->y_crop_width;
+  int height = src->y_crop_height;
+  int uv_width = src->uv_crop_width;
+  int uv_height = src->uv_crop_height;
+  int subsampling_x = src->subsampling_x;
+  int subsampling_y = src->subsampling_y;
+  int larger_dimensions, new_dimensions;
+  // Remembered so that a failed reallocation can hand the slot back.
+  const int prev_write_idx = ctx->write_idx;
+
+  if (ctx->sz + 1 + MAX_PRE_FRAMES > ctx->max_sz) return 1;
+  ctx->sz++;
+  buf = pop(ctx, &ctx->write_idx);
+
+  new_dimensions = width != buf->img.y_crop_width ||
+                   height != buf->img.y_crop_height ||
+                   uv_width != buf->img.uv_crop_width ||
+                   uv_height != buf->img.uv_crop_height;
+  larger_dimensions = width > buf->img.y_width || height > buf->img.y_height ||
+                      uv_width > buf->img.uv_width ||
+                      uv_height > buf->img.uv_height;
+  assert(!larger_dimensions || new_dimensions);
+
+#if USE_PARTIAL_COPY
+  // TODO(jkoleszar): This is disabled for now, as
+  // av1_copy_and_extend_frame_with_rect is not subsampling/alpha aware.
+  // NOTE(review): this branch also refers to an `active_map` that this
+  // function does not declare, so it cannot be enabled as written.
+
+  // Only do this partial copy if the following conditions are all met:
+  // 1. Lookahead queue has a size of 1.
+  // 2. Active map is provided.
+  // 3. This is not a key frame, golden nor altref frame.
+  if (!new_dimensions && ctx->max_sz == 1 && active_map && !flags) {
+    for (row = 0; row < mb_rows; ++row) {
+      col = 0;
+
+      while (1) {
+        // Find the first active macroblock in this row.
+        for (; col < mb_cols; ++col) {
+          if (active_map[col]) break;
+        }
+
+        // No more active macroblock in this row.
+        if (col == mb_cols) break;
+
+        // Find the end of active region in this row.
+        active_end = col;
+
+        for (; active_end < mb_cols; ++active_end) {
+          if (!active_map[active_end]) break;
+        }
+
+        // Only copy this active region.
+        av1_copy_and_extend_frame_with_rect(src, &buf->img, row << 4, col << 4,
+                                            16, (active_end - col) << 4);
+
+        // Start again from the end of this active region.
+        col = active_end;
+      }
+
+      active_map += mb_cols;
+    }
+  } else {
+#endif
+    if (larger_dimensions) {
+      YV12_BUFFER_CONFIG new_img;
+      memset(&new_img, 0, sizeof(new_img));
+      if (aom_alloc_frame_buffer(&new_img, width, height, subsampling_x,
+                                 subsampling_y,
+#if CONFIG_HIGHBITDEPTH
+                                 use_highbitdepth,
+#endif
+                                 AOM_BORDER_IN_PIXELS, 0)) {
+        // Nothing was stored: undo the enqueue so the slot is not counted
+        // as holding a frame and the next push reuses it.
+        ctx->write_idx = prev_write_idx;
+        ctx->sz--;
+        return 1;
+      }
+      aom_free_frame_buffer(&buf->img);
+      buf->img = new_img;
+    } else if (new_dimensions) {
+      // The existing allocation is big enough; only update its description.
+      buf->img.y_crop_width = src->y_crop_width;
+      buf->img.y_crop_height = src->y_crop_height;
+      buf->img.uv_crop_width = src->uv_crop_width;
+      buf->img.uv_crop_height = src->uv_crop_height;
+      buf->img.subsampling_x = src->subsampling_x;
+      buf->img.subsampling_y = src->subsampling_y;
+    }
+    // Partial copy not implemented yet
+    av1_copy_and_extend_frame(src, &buf->img);
+#if USE_PARTIAL_COPY
+  }
+#endif
+
+  buf->ts_start = ts_start;
+  buf->ts_end = ts_end;
+  buf->flags = flags;
+  return 0;
+}
+
+/* Dequeues the oldest frame. Without drain a frame is only released while
+ * the queue holds exactly max_sz - MAX_PRE_FRAMES entries; otherwise, or when
+ * the queue is empty or ctx is NULL, NULL is returned. */
+struct lookahead_entry *av1_lookahead_pop(struct lookahead_ctx *ctx,
+                                          int drain) {
+  if (!ctx || !ctx->sz) return NULL;
+  if (!drain && ctx->sz != ctx->max_sz - MAX_PRE_FRAMES) return NULL;
+
+  ctx->sz--;
+  return pop(ctx, &ctx->read_idx);
+}
+
+/* Returns the entry `index` positions away from the read position without
+ * removing it. Non-negative indices look ahead and must be below ctx->sz;
+ * negative indices look back by at most MAX_PRE_FRAMES slots. NULL is
+ * returned for anything outside those ranges. */
+struct lookahead_entry *av1_lookahead_peek(struct lookahead_ctx *ctx,
+                                           int index) {
+  int slot;
+
+  if (index >= 0) {
+    // Forward peek
+    if (index >= ctx->sz) return NULL;
+    slot = index + ctx->read_idx;
+    if (slot >= ctx->max_sz) slot -= ctx->max_sz;
+    return &ctx->buf[slot];
+  }
+
+  // Backward peek
+  if (-index > MAX_PRE_FRAMES) return NULL;
+  slot = index + (int)(ctx->read_idx);
+  if (slot < 0) slot += (int)(ctx->max_sz);
+  return &ctx->buf[slot];
+}
+
+/* Reports how many frames are currently queued (ctx->sz). */
+unsigned int av1_lookahead_depth(struct lookahead_ctx *ctx) {
+  const unsigned int queued = ctx->sz;
+  return queued;
+}
diff --git a/third_party/aom/av1/encoder/lookahead.h b/third_party/aom/av1/encoder/lookahead.h
new file mode 100644
index 0000000000..19f75d7e45
--- /dev/null
+++ b/third_party/aom/av1/encoder/lookahead.h
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_ENCODER_LOOKAHEAD_H_
+#define AV1_ENCODER_LOOKAHEAD_H_
+
+#include "aom_scale/yv12config.h"
+#include "aom/aom_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MAX_LAG_BUFFERS 25 /* Upper bound av1_lookahead_init() clamps depth to */
+
+struct lookahead_entry { /* One slot of the lookahead queue */
+ YV12_BUFFER_CONFIG img; /* Copy of the pushed source frame */
+ int64_t ts_start; /* Timestamp for the start of this frame */
+ int64_t ts_end; /* Timestamp for the end of this frame */
+ aom_enc_frame_flags_t flags; /* Flags passed to av1_lookahead_push() */
+};
+
+// The max number of past frames kept in the queue; av1_lookahead_peek() reaches them with negative indices.
+#define MAX_PRE_FRAMES 1
+
+struct lookahead_ctx { /* Ring buffer of lookahead_entry slots */
+ int max_sz; /* Absolute size of the queue (clamped depth + MAX_PRE_FRAMES) */
+ int sz; /* Number of buffers currently in the queue */
+ int read_idx; /* Read index (slot the next pop returns) */
+ int write_idx; /* Write index (slot the next push fills) */
+ struct lookahead_entry *buf; /* Buffer list, max_sz entries */
+};
+
+/**\brief Initializes the lookahead stage
+ *
+ * The lookahead stage is a queue of frame buffers on which some analysis
+ * may be done when buffers are enqueued.
+ *
+ * \param[in] width             Width used to allocate every queue buffer
+ * \param[in] height            Height used to allocate every queue buffer
+ * \param[in] subsampling_x     Horizontal chroma subsampling of the buffers
+ * \param[in] subsampling_y     Vertical chroma subsampling of the buffers
+ * \param[in] use_highbitdepth  Only with CONFIG_HIGHBITDEPTH: forwarded to
+ *                              the frame buffer allocator
+ * \param[in] depth             Requested queue depth; clamped to
+ *                              [1, MAX_LAG_BUFFERS], then MAX_PRE_FRAMES
+ *                              extra slots are added
+ *
+ * \retval NULL, if any allocation fails
+ */
+struct lookahead_ctx *av1_lookahead_init(unsigned int width,
+ unsigned int height,
+ unsigned int subsampling_x,
+ unsigned int subsampling_y,
+#if CONFIG_HIGHBITDEPTH
+ int use_highbitdepth,
+#endif
+ unsigned int depth);
+
+/**\brief Destroys the lookahead stage
+ *
+ * Frees every queued frame buffer, the buffer list and the context itself.
+ *
+ * \param[in] ctx  Pointer to the lookahead context; NULL is ignored
+ */
+void av1_lookahead_destroy(struct lookahead_ctx *ctx);
+
+/**\brief Enqueue a source buffer
+ *
+ * This function will copy the source image into a new framebuffer with
+ * the expected stride/border.
+ *
+ * The whole frame is always copied: the active-map based partial copy in
+ * lookahead.c is compiled out (USE_PARTIAL_COPY is 0) and this function
+ * takes no active map argument.
+ *
+ * \param[in] ctx               Pointer to the lookahead context
+ * \param[in] src               Pointer to the image to enqueue
+ * \param[in] ts_start          Timestamp for the start of this frame
+ * \param[in] ts_end            Timestamp for the end of this frame
+ * \param[in] use_highbitdepth  Only with CONFIG_HIGHBITDEPTH: used when the
+ *                              slot's buffer has to be reallocated
+ * \param[in] flags             Flags set on this frame
+ *
+ * \retval 0, if the frame was enqueued
+ * \retval 1, if the queue is full or a larger buffer could not be allocated
+ */
+int av1_lookahead_push(struct lookahead_ctx *ctx, YV12_BUFFER_CONFIG *src,
+ int64_t ts_start, int64_t ts_end,
+#if CONFIG_HIGHBITDEPTH
+ int use_highbitdepth,
+#endif
+ aom_enc_frame_flags_t flags);
+
+/**\brief Get the next source buffer to encode
+ *
+ * On success the entry is removed from the queue (the queue size drops by
+ * one and the read index advances).
+ *
+ * \param[in] ctx    Pointer to the lookahead context (NULL yields NULL)
+ * \param[in] drain  Flag indicating the buffer should be drained
+ *                   (return a buffer regardless of the current queue depth)
+ *
+ * \retval NULL, if drain set and queue is empty
+ * \retval NULL, if drain not set and queue not of the configured depth
+ */
+struct lookahead_entry *av1_lookahead_pop(struct lookahead_ctx *ctx, int drain);
+
+/**\brief Get a future (or recently consumed) source buffer
+ *
+ * The queue is not modified.
+ *
+ * \param[in] ctx    Pointer to the lookahead context
+ * \param[in] index  Index of the frame to be returned, 0 == next frame.
+ *                   Negative values look backwards from the read position,
+ *                   by at most MAX_PRE_FRAMES.
+ *
+ * \retval NULL, if no buffer exists at the specified index
+ */
+struct lookahead_entry *av1_lookahead_peek(struct lookahead_ctx *ctx,
+ int index);
+
+/**\brief Get the number of frames currently in the lookahead queue
+ *
+ * \param[in] ctx  Pointer to the lookahead context (must not be NULL)
+ *
+ * \return The context's current sz value
+ */
+unsigned int av1_lookahead_depth(struct lookahead_ctx *ctx);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AV1_ENCODER_LOOKAHEAD_H_
diff --git a/third_party/aom/av1/encoder/mbgraph.c b/third_party/aom/av1/encoder/mbgraph.c
new file mode 100644
index 0000000000..1296027dcb
--- /dev/null
+++ b/third_party/aom/av1/encoder/mbgraph.c
@@ -0,0 +1,398 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <limits.h>
+
+#include "./av1_rtcd.h"
+#include "./aom_dsp_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/system_state.h"
+#include "av1/encoder/segmentation.h"
+#include "av1/encoder/mcomp.h"
+#include "av1/common/blockd.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/reconintra.h"
+
+// Runs one hex search plus sub-pixel refinement around ref_mv for the current
+// 16x16 block, builds the resulting luma predictor and returns its SAD against
+// the source. The best vector is left in x->best_mv; x->mv_limits is restored
+// before returning.
+static unsigned int do_16x16_motion_iteration(AV1_COMP *cpi, const MV *ref_mv,
+                                              int mb_row, int mb_col) {
+  MACROBLOCK *const x = &cpi->td.mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv;
+  const aom_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[BLOCK_16X16];
+
+  // Snapshot of the caller's MV window, put back at the end.
+  const MvLimits saved_limits = x->mv_limits;
+  // Further step/diamond searches as necessary
+  const int first_step =
+      AOMMIN(mv_sf->reduce_first_step_size, MAX_MVSEARCH_STEPS - 2);
+  int cost_list[5];
+  int subpel_distortion;
+  unsigned int subpel_sse;
+  MV start_mv;
+
+  av1_set_mv_search_range(&x->mv_limits, ref_mv);
+
+  // Full-pel starting point: drop the three fractional bits.
+  start_mv.row = ref_mv->row >> 3;
+  start_mv.col = ref_mv->col >> 3;
+
+  /*cpi->sf.search_method == HEX*/
+  av1_hex_search(x, &start_mv, first_step, x->errorperbit, 0,
+                 cond_cost_list(cpi, cost_list), &v_fn_ptr, 0, ref_mv);
+
+  // Try sub-pixel MC
+  // if (bestsme > error_thresh && bestsme < INT_MAX)
+  cpi->find_fractional_mv_step(x, ref_mv, cpi->common.allow_high_precision_mv,
+                               x->errorperbit, &v_fn_ptr, 0,
+                               mv_sf->subpel_iters_per_step,
+                               cond_cost_list(cpi, cost_list), NULL, NULL,
+                               &subpel_distortion, &subpel_sse, NULL, 0, 0, 0);
+
+#if CONFIG_EXT_INTER
+  xd->mi[0]->mbmi.mode =
+      has_second_ref(&xd->mi[0]->mbmi) ? NEW_NEWMV : NEWMV;
+#else
+  xd->mi[0]->mbmi.mode = NEWMV;
+#endif  // CONFIG_EXT_INTER
+
+  xd->mi[0]->mbmi.mv[0] = x->best_mv;
+#if CONFIG_EXT_INTER
+  xd->mi[0]->mbmi.ref_frame[1] = NONE_FRAME;
+#endif  // CONFIG_EXT_INTER
+
+  av1_build_inter_predictors_sby(xd, mb_row, mb_col, NULL, BLOCK_16X16);
+
+  /* restore UMV window */
+  x->mv_limits = saved_limits;
+
+  return aom_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride,
+                      xd->plane[0].dst.buf, xd->plane[0].dst.stride);
+}
+
+// Picks the lowest-SAD candidate among the zero vector, a search seeded with
+// ref_mv and (when ref_mv is non-zero) a search seeded with (0, 0). The winner
+// is stored in x->best_mv and its SAD is returned.
+static int do_16x16_motion_search(AV1_COMP *cpi, const MV *ref_mv, int mb_row,
+                                  int mb_col) {
+  MACROBLOCK *const x = &cpi->td.mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MV winner = { 0, 0 };
+  unsigned int cand_sad;
+
+  // Try zero MV first
+  // FIXME should really use something like near/nearest MV and/or MV prediction
+  unsigned int best_sad =
+      aom_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride,
+                   xd->plane[0].pre[0].buf, xd->plane[0].pre[0].stride);
+
+  // Test last reference frame using the previous best mv as the
+  // starting point (best reference) for the search
+  cand_sad = do_16x16_motion_iteration(cpi, ref_mv, mb_row, mb_col);
+  if (cand_sad < best_sad) {
+    best_sad = cand_sad;
+    winner = x->best_mv.as_mv;
+  }
+
+  // If the current best reference mv is not centered on 0,0 then do a 0,0
+  // based search as well.
+  if (ref_mv->row != 0 || ref_mv->col != 0) {
+    MV origin = { 0, 0 };
+
+    cand_sad = do_16x16_motion_iteration(cpi, &origin, mb_row, mb_col);
+    if (cand_sad < best_sad) {
+      best_sad = cand_sad;
+      winner = x->best_mv.as_mv;
+    }
+  }
+
+  x->best_mv.as_mv = winner;
+  return best_sad;
+}
+
+// Scores the zero motion vector only: returns the 16x16 SAD between the source
+// block and the co-located block of the current prediction reference, and
+// writes a zero vector to *dst_mv.
+static int do_16x16_zerozero_search(AV1_COMP *cpi, int_mv *dst_mv) {
+  MACROBLOCK *const mb = &cpi->td.mb;
+  MACROBLOCKD *const mbd = &mb->e_mbd;
+
+  // Try zero MV first
+  // FIXME should really use something like near/nearest MV and/or MV prediction
+  const unsigned int zero_mv_sad =
+      aom_sad16x16(mb->plane[0].src.buf, mb->plane[0].src.stride,
+                   mbd->plane[0].pre[0].buf, mbd->plane[0].pre[0].stride);
+
+  dst_mv->as_int = 0;
+  return zero_mv_sad;
+}
+// Tries every intra mode from DC_PRED to TM_PRED on the current 16x16 block
+// and returns the lowest SAD found. The winning mode is written to *pbest_mode
+// when that pointer is non-NULL.
+static int find_best_16x16_intra(AV1_COMP *cpi, PREDICTION_MODE *pbest_mode) {
+  MACROBLOCK *const x = &cpi->td.mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  PREDICTION_MODE winner = -1, cand;
+  unsigned int lowest = INT_MAX;
+
+  // Score each intra prediction mode with a 16x16 SAD;
+  // we're intentionally not doing 4x4, we just want a rough estimate
+  for (cand = DC_PRED; cand <= TM_PRED; cand++) {
+    unsigned int sad;
+
+    xd->mi[0]->mbmi.mode = cand;
+    av1_predict_intra_block(xd, 16, 16, BLOCK_16X16, cand, x->plane[0].src.buf,
+                            x->plane[0].src.stride, xd->plane[0].dst.buf,
+                            xd->plane[0].dst.stride, 0, 0, 0);
+    sad = aom_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride,
+                       xd->plane[0].dst.buf, xd->plane[0].dst.stride);
+
+    // Only a strictly smaller error replaces the current best.
+    if (sad >= lowest) continue;
+    lowest = sad;
+    winner = cand;
+  }
+
+  if (pbest_mode) *pbest_mode = winner;
+
+  return lowest;
+}
+
+// Fills *stats for one macroblock: the intra estimate, the golden-frame motion
+// search result and the zero-MV alt-ref error. A missing reference gets an
+// error of INT_MAX and a zero vector.
+static void update_mbgraph_mb_stats(AV1_COMP *cpi, MBGRAPH_MB_STATS *stats,
+                                    YV12_BUFFER_CONFIG *buf, int mb_y_offset,
+                                    YV12_BUFFER_CONFIG *golden_ref,
+                                    const MV *prev_golden_ref_mv,
+                                    YV12_BUFFER_CONFIG *alt_ref, int mb_row,
+                                    int mb_col) {
+  MACROBLOCK *const x = &cpi->td.mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  AV1_COMMON *cm = &cpi->common;
+  int intra_sad;
+
+  // FIXME in practice we're completely ignoring chroma here
+  x->plane[0].src.buf = buf->y_buffer + mb_y_offset;
+  x->plane[0].src.stride = buf->y_stride;
+
+  xd->plane[0].dst.buf = get_frame_new_buffer(cm)->y_buffer + mb_y_offset;
+  xd->plane[0].dst.stride = get_frame_new_buffer(cm)->y_stride;
+
+  // do intra 16x16 prediction; the stored error is never below 1
+  intra_sad = find_best_16x16_intra(cpi, &stats->ref[INTRA_FRAME].m.mode);
+  stats->ref[INTRA_FRAME].err = (intra_sad <= 0) ? 1 : intra_sad;
+
+  // Golden frame MV search, if it exists and is different than last frame
+  if (!golden_ref) {
+    stats->ref[GOLDEN_FRAME].err = INT_MAX;
+    stats->ref[GOLDEN_FRAME].m.mv.as_int = 0;
+  } else {
+    int golden_sad;
+
+    xd->plane[0].pre[0].buf = golden_ref->y_buffer + mb_y_offset;
+    xd->plane[0].pre[0].stride = golden_ref->y_stride;
+    golden_sad =
+        do_16x16_motion_search(cpi, prev_golden_ref_mv, mb_row, mb_col);
+    stats->ref[GOLDEN_FRAME].m.mv = x->best_mv;
+    stats->ref[GOLDEN_FRAME].err = golden_sad;
+  }
+
+  // Do an Alt-ref frame MV search, if it exists and is different than
+  // last/golden frame.
+  if (!alt_ref) {
+    stats->ref[ALTREF_FRAME].err = INT_MAX;
+    stats->ref[ALTREF_FRAME].m.mv.as_int = 0;
+  } else {
+    int altref_sad;
+
+    xd->plane[0].pre[0].buf = alt_ref->y_buffer + mb_y_offset;
+    xd->plane[0].pre[0].stride = alt_ref->y_stride;
+    altref_sad =
+        do_16x16_zerozero_search(cpi, &stats->ref[ALTREF_FRAME].m.mv);
+
+    stats->ref[ALTREF_FRAME].err = altref_sad;
+  }
+}
+
+// Walks every macroblock of `buf` in raster order and fills stats->mb_stats
+// with the per-block intra / golden / alt-ref errors.
+//
+// golden_ref and alt_ref may each be NULL; the per-block helper then records
+// INT_MAX for that reference. The golden MV found for the block to the left
+// (or, at the start of a row, for the first block of the row above) seeds the
+// next golden search.
+//
+// NOTE(review): xd->mi[0] is left pointing at a local MODE_INFO when this
+// function returns; callers presumably reset it before use -- confirm.
+static void update_mbgraph_frame_stats(AV1_COMP *cpi,
+                                       MBGRAPH_FRAME_STATS *stats,
+                                       YV12_BUFFER_CONFIG *buf,
+                                       YV12_BUFFER_CONFIG *golden_ref,
+                                       YV12_BUFFER_CONFIG *alt_ref) {
+  MACROBLOCK *const x = &cpi->td.mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  AV1_COMMON *const cm = &cpi->common;
+
+  int mb_col, mb_row, offset = 0;
+  // Only mb_y_offset is handed to the per-block helper; the arf/gld offsets
+  // are tracked alongside it but not consumed in this function.
+  int mb_y_offset = 0, arf_y_offset = 0, gld_y_offset = 0;
+  MV gld_top_mv = { 0, 0 };
+  MODE_INFO mi_local;
+
+  av1_zero(mi_local);
+  // Set up limit values for motion vectors to prevent them extending outside
+  // the UMV borders.
+  x->mv_limits.row_min = -BORDER_MV_PIXELS_B16;
+  x->mv_limits.row_max = (cm->mb_rows - 1) * 8 + BORDER_MV_PIXELS_B16;
+  xd->up_available = 0;
+  xd->plane[0].dst.stride = buf->y_stride;
+  xd->plane[0].pre[0].stride = buf->y_stride;
+  xd->plane[1].dst.stride = buf->uv_stride;
+  xd->mi[0] = &mi_local;
+  mi_local.mbmi.sb_type = BLOCK_16X16;
+  mi_local.mbmi.ref_frame[0] = LAST_FRAME;
+  mi_local.mbmi.ref_frame[1] = NONE_FRAME;
+
+  for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) {
+    MV gld_left_mv = gld_top_mv;
+    int mb_y_in_offset = mb_y_offset;
+    int arf_y_in_offset = arf_y_offset;
+    int gld_y_in_offset = gld_y_offset;
+
+    // Set up limit values for motion vectors to prevent them extending outside
+    // the UMV borders.
+    x->mv_limits.col_min = -BORDER_MV_PIXELS_B16;
+    x->mv_limits.col_max = (cm->mb_cols - 1) * 8 + BORDER_MV_PIXELS_B16;
+    xd->left_available = 0;
+
+    for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
+      MBGRAPH_MB_STATS *mb_stats = &stats->mb_stats[offset + mb_col];
+
+      update_mbgraph_mb_stats(cpi, mb_stats, buf, mb_y_in_offset, golden_ref,
+                              &gld_left_mv, alt_ref, mb_row, mb_col);
+      gld_left_mv = mb_stats->ref[GOLDEN_FRAME].m.mv.as_mv;
+      if (mb_col == 0) {
+        gld_top_mv = gld_left_mv;
+      }
+      xd->left_available = 1;
+      mb_y_in_offset += 16;
+      gld_y_in_offset += 16;
+      arf_y_in_offset += 16;
+      x->mv_limits.col_min -= 16;
+      x->mv_limits.col_max -= 16;
+    }
+    xd->up_available = 1;
+    mb_y_offset += buf->y_stride * 16;
+    // Guarded like alt_ref below: the per-block helper accepts a missing
+    // golden reference, so this bookkeeping must not dereference it either.
+    if (golden_ref) gld_y_offset += golden_ref->y_stride * 16;
+    if (alt_ref) arf_y_offset += alt_ref->y_stride * 16;
+    x->mv_limits.row_min -= 16;
+    x->mv_limits.row_max -= 16;
+    offset += cm->mb_cols;
+  }
+}
+
+// Builds cpi->segmentation_map from the mbgraph stats (1 = static, 0 = not) and sets cpi->static_mb_pct.
+static void separate_arf_mbs(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ int mb_col, mb_row, offset, i;
+ int mi_row, mi_col;
+ int ncnt[4] = { 0 }; /* only ncnt[0] and ncnt[1] are used below */
+ int n_frames = cpi->mbgraph_n_frames;
+
+ int *arf_not_zz; /* per-MB count of frames that failed the alt-ref test */
+
+ CHECK_MEM_ERROR(
+ cm, arf_not_zz,
+ aom_calloc(cm->mb_rows * cm->mb_cols * sizeof(*arf_not_zz), 1));
+
+ // We are not interested in results beyond the alt ref itself.
+ if (n_frames > cpi->rc.frames_till_gf_update_due)
+ n_frames = cpi->rc.frames_till_gf_update_due;
+
+ // defer cost to reference frames
+ for (i = n_frames - 1; i >= 0; i--) {
+ MBGRAPH_FRAME_STATS *frame_stats = &cpi->mbgraph_stats[i];
+
+ for (offset = 0, mb_row = 0; mb_row < cm->mb_rows;
+ offset += cm->mb_cols, mb_row++) { /* offset = index of the row's first MB */
+ for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
+ MBGRAPH_MB_STATS *mb_stats = &frame_stats->mb_stats[offset + mb_col];
+
+ int altref_err = mb_stats->ref[ALTREF_FRAME].err;
+ int intra_err = mb_stats->ref[INTRA_FRAME].err;
+ int golden_err = mb_stats->ref[GOLDEN_FRAME].err;
+
+ // Test for altref vs intra and gf and that its mv was 0,0.
+ if (altref_err > 1000 || altref_err > intra_err ||
+ altref_err > golden_err) {
+ arf_not_zz[offset + mb_col]++;
+ }
+ }
+ }
+ }
+
+ // arf_not_zz is indexed by MB, but this loop is indexed by MI to avoid out
+ // of bound access in segmentation_map
+ for (mi_row = 0; mi_row < cm->mi_rows; mi_row++) {
+ for (mi_col = 0; mi_col < cm->mi_cols; mi_col++) {
+ // If any of the blocks in the sequence failed then the MB
+ // goes in segment 0
+ if (arf_not_zz[mi_row / 2 * cm->mb_cols + mi_col / 2]) { /* MB index = MI index / 2 per axis */
+ ncnt[0]++;
+ cpi->segmentation_map[mi_row * cm->mi_cols + mi_col] = 0;
+ } else {
+ cpi->segmentation_map[mi_row * cm->mi_cols + mi_col] = 1;
+ ncnt[1]++;
+ }
+ }
+ }
+
+ // Only bother with segmentation if over 10% of the MBs in static segment
+ // if ( ncnt[1] && (ncnt[0] / ncnt[1] < 10) )
+ if (1) { /* the threshold above is disabled, so the else branch is never taken */
+ // Note % of blocks that are marked as static
+ if (cm->MBs)
+ cpi->static_mb_pct = (ncnt[1] * 100) / (cm->mi_rows * cm->mi_cols); /* ncnt[] counts MI units */
+
+ // This error case should not be reachable as this function should
+ // never be called with the common data structure uninitialized.
+ else
+ cpi->static_mb_pct = 0;
+
+ av1_enable_segmentation(&cm->seg);
+ } else {
+ cpi->static_mb_pct = 0;
+ av1_disable_segmentation(&cm->seg);
+ }
+
+ // Free locally allocated storage
+ aom_free(arf_not_zz);
+}
+
+// Recomputes the per-macroblock reference statistics for the frames currently
+// held in the lookahead queue (at most MAX_LAG_BUFFERS of them) and derives
+// the static/non-static segmentation map from them. Does nothing when the
+// queue does not reach beyond the next golden-frame update.
+void av1_update_mbgraph_stats(AV1_COMP *cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  int frame_idx, n_frames = av1_lookahead_depth(cpi->lookahead);
+  YV12_BUFFER_CONFIG *golden_ref = get_ref_frame_buffer(cpi, GOLDEN_FRAME);
+
+  assert(golden_ref != NULL);
+
+  // we need to look ahead beyond where the ARF transitions into
+  // being a GF - so exit if we don't look ahead beyond that
+  if (n_frames <= cpi->rc.frames_till_gf_update_due) return;
+
+  if (n_frames > MAX_LAG_BUFFERS) n_frames = MAX_LAG_BUFFERS;
+  cpi->mbgraph_n_frames = n_frames;
+
+  // Start every frame's statistics from zero.
+  for (frame_idx = 0; frame_idx < n_frames; frame_idx++) {
+    memset(cpi->mbgraph_stats[frame_idx].mb_stats, 0,
+           cm->mb_rows * cm->mb_cols *
+               sizeof(*cpi->mbgraph_stats[frame_idx].mb_stats));
+  }
+
+  // do motion search to find contribution of each reference to data
+  // later on in this GF group
+  // FIXME really, the GF/last MC search should be done forward, and
+  // the ARF MC search backwards, to get optimal results for MV caching
+  for (frame_idx = 0; frame_idx < n_frames; frame_idx++) {
+    struct lookahead_entry *queued =
+        av1_lookahead_peek(cpi->lookahead, frame_idx);
+
+    assert(queued != NULL);
+
+    update_mbgraph_frame_stats(cpi, &cpi->mbgraph_stats[frame_idx],
+                               &queued->img, golden_ref, cpi->source);
+  }
+
+  aom_clear_system_state();
+
+  separate_arf_mbs(cpi);
+}
diff --git a/third_party/aom/av1/encoder/mbgraph.h b/third_party/aom/av1/encoder/mbgraph.h
new file mode 100644
index 0000000000..758e2ad152
--- /dev/null
+++ b/third_party/aom/av1/encoder/mbgraph.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_ENCODER_MBGRAPH_H_
+#define AV1_ENCODER_MBGRAPH_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct { /* Per-macroblock statistics, one record per reference */
+ struct {
+ int err; /* prediction error (SAD) for this reference; INT_MAX if it is missing */
+ union {
+ int_mv mv; /* motion vector recorded for GOLDEN_FRAME / ALTREF_FRAME */
+ PREDICTION_MODE mode; /* best intra mode, recorded for INTRA_FRAME */
+ } m;
+ } ref[TOTAL_REFS_PER_FRAME];
+} MBGRAPH_MB_STATS;
+
+typedef struct { MBGRAPH_MB_STATS *mb_stats; } MBGRAPH_FRAME_STATS; /* mb_stats: one record per MB, row-major (mb_rows * mb_cols) */
+
+struct AV1_COMP; /* forward declaration; only a pointer is needed here */
+
+void av1_update_mbgraph_stats(struct AV1_COMP *cpi); /* refreshes cpi->mbgraph_stats from the lookahead queue and rebuilds the segmentation map */
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AV1_ENCODER_MBGRAPH_H_
diff --git a/third_party/aom/av1/encoder/mcomp.c b/third_party/aom/av1/encoder/mcomp.c
new file mode 100644
index 0000000000..d069eefb0d
--- /dev/null
+++ b/third_party/aom/av1/encoder/mcomp.c
@@ -0,0 +1,3493 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
+
+#include "./aom_config.h"
+#include "./aom_dsp_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+
+#include "av1/common/common.h"
+#include "av1/common/mvref_common.h"
+#include "av1/common/reconinter.h"
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/mcomp.h"
+#include "av1/encoder/rdopt.h"
+
+// #define NEW_DIAMOND_SEARCH
+
+// Returns the address inside buf that lies mv->row rows and mv->col columns
+// away from buf->buf.
+static INLINE const uint8_t *get_buf_from_mv(const struct buf_2d *buf,
+                                             const MV *mv) {
+  const int displacement = mv->row * buf->stride + mv->col;
+  return buf->buf + displacement;
+}
+
+// Shrinks *mv_limits to its intersection with the full-pel window of
+// +/- MAX_FULL_PEL_VAL around mv, itself kept strictly inside
+// (MV_LOW >> 3, MV_UPP >> 3). mv_limits is never widened.
+void av1_set_mv_search_range(MvLimits *mv_limits, const MV *mv) {
+  // The lower bounds move up by one when mv has a fractional part.
+  const int col_has_frac = (mv->col & 7) ? 1 : 0;
+  const int row_has_frac = (mv->row & 7) ? 1 : 0;
+  const int lowest = (MV_LOW >> 3) + 1;
+  const int highest = (MV_UPP >> 3) - 1;
+
+  int col_lo = (mv->col >> 3) - MAX_FULL_PEL_VAL + col_has_frac;
+  int row_lo = (mv->row >> 3) - MAX_FULL_PEL_VAL + row_has_frac;
+  int col_hi = (mv->col >> 3) + MAX_FULL_PEL_VAL;
+  int row_hi = (mv->row >> 3) + MAX_FULL_PEL_VAL;
+
+  if (col_lo < lowest) col_lo = lowest;
+  if (row_lo < lowest) row_lo = lowest;
+  if (col_hi > highest) col_hi = highest;
+  if (row_hi > highest) row_hi = highest;
+
+  // Get intersection of UMV window and valid MV window to reduce # of checks
+  // in diamond search.
+  mv_limits->col_min = AOMMAX(mv_limits->col_min, col_lo);
+  mv_limits->col_max = AOMMIN(mv_limits->col_max, col_hi);
+  mv_limits->row_min = AOMMAX(mv_limits->row_min, row_lo);
+  mv_limits->row_max = AOMMIN(mv_limits->row_max, row_hi);
+}
+
+// Computes the sub-pel search window: the full-pel limits scaled by 8,
+// intersected with +/- MAX_FULL_PEL_VAL * 8 around ref_mv and kept strictly
+// inside (MV_LOW, MV_UPP).
+static void av1_set_subpel_mv_search_range(const MvLimits *mv_limits,
+                                           int *col_min, int *col_max,
+                                           int *row_min, int *row_max,
+                                           const MV *ref_mv) {
+  const int reach = MAX_FULL_PEL_VAL * 8;
+  const int col_lo = AOMMAX(mv_limits->col_min * 8, ref_mv->col - reach);
+  const int col_hi = AOMMIN(mv_limits->col_max * 8, ref_mv->col + reach);
+  const int row_lo = AOMMAX(mv_limits->row_min * 8, ref_mv->row - reach);
+  const int row_hi = AOMMIN(mv_limits->row_max * 8, ref_mv->row + reach);
+
+  *col_min = AOMMAX(MV_LOW + 1, col_lo);
+  *col_max = AOMMIN(MV_UPP - 1, col_hi);
+  *row_min = AOMMAX(MV_LOW + 1, row_lo);
+  *row_max = AOMMIN(MV_UPP - 1, row_hi);
+}
+
+// Returns the smallest shift sr such that (size << sr) reaches
+// MAX_FULL_PEL_VAL, with size raised to at least 16 and the result capped at
+// MAX_MVSEARCH_STEPS - 2.
+int av1_init_search_range(int size) {
+  // Minimum search size no matter what the passed in value.
+  const int span = AOMMAX(16, size);
+  int steps;
+
+  for (steps = 0; (span << steps) < MAX_FULL_PEL_VAL; ++steps) {
+  }
+
+  return AOMMIN(steps, MAX_MVSEARCH_STEPS - 2);
+}
+
+// Sums the joint cost of mv with the table costs of its row and column
+// components (comp_cost[0] is indexed by row, comp_cost[1] by col).
+static INLINE int mv_cost(const MV *mv, const int *joint_cost,
+                          int *const comp_cost[2]) {
+  const int joint_bits = joint_cost[av1_get_mv_joint(mv)];
+  const int row_bits = comp_cost[0][mv->row];
+  const int col_bits = comp_cost[1][mv->col];
+  return joint_bits + row_bits + col_bits;
+}
+
+// Cost of coding mv relative to ref, multiplied by weight and rounded down
+// by 7 bits.
+int av1_mv_bit_cost(const MV *mv, const MV *ref, const int *mvjcost,
+                    int *mvcost[2], int weight) {
+  const MV delta = { mv->row - ref->row, mv->col - ref->col };
+  const int raw_cost = mv_cost(&delta, mvjcost, mvcost);
+  return ROUND_POWER_OF_TWO(raw_cost * weight, 7);
+}
+
+#define PIXEL_TRANSFORM_ERROR_SCALE 4
+// Rate term added to a sub-pel candidate's error: the cost of (mv - ref)
+// scaled by error_per_bit. Returns 0 when no component cost tables are given.
+static int mv_err_cost(const MV *mv, const MV *ref, const int *mvjcost,
+                       int *mvcost[2], int error_per_bit) {
+  if (!mvcost) return 0;
+
+  {
+    const MV delta = { mv->row - ref->row, mv->col - ref->col };
+    // Widened before the multiply, as the product is rounded in 64 bits.
+    const int64_t weighted =
+        (int64_t)mv_cost(&delta, mvjcost, mvcost) * error_per_bit;
+    return (int)ROUND_POWER_OF_TWO_64(
+        weighted, RDDIV_BITS + AV1_PROB_COST_SHIFT - RD_EPB_SHIFT +
+                      PIXEL_TRANSFORM_ERROR_SCALE);
+  }
+}
+
+// Rate term for full-pel SAD searches: the full-pel difference (mv - ref) is
+// scaled by 8 before the cost lookup, then weighted by sad_per_bit.
+static int mvsad_err_cost(const MACROBLOCK *x, const MV *mv, const MV *ref,
+                          int sad_per_bit) {
+  const MV delta = { (mv->row - ref->row) * 8, (mv->col - ref->col) * 8 };
+  const unsigned int bits =
+      (unsigned)mv_cost(&delta, x->nmvjointsadcost, x->mvsadcost);
+  return ROUND_POWER_OF_TWO(bits * sad_per_bit, AV1_PROB_COST_SHIFT);
+}
+
+// Fills cfg with the diamond-search pattern: site 0 is the centre, followed
+// by four axis-aligned sites per step, the step halving from MAX_FIRST_STEP
+// down to 1. Each site stores its buffer offset for the given stride.
+void av1_init_dsmotion_compensation(search_site_config *cfg, int stride) {
+  int step, n_sites = 0;
+
+  cfg->ss[n_sites].mv.col = cfg->ss[n_sites].mv.row = 0;
+  cfg->ss[n_sites].offset = 0;
+  ++n_sites;
+
+  for (step = MAX_FIRST_STEP; step > 0; step /= 2) {
+    // Generate offsets for 4 search sites per step.
+    const MV candidates[] = {
+      { -step, 0 }, { step, 0 }, { 0, -step }, { 0, step }
+    };
+    int k;
+
+    for (k = 0; k < 4; ++k, ++n_sites) {
+      search_site *const site = &cfg->ss[n_sites];
+      site->mv = candidates[k];
+      site->offset = site->mv.row * stride + site->mv.col;
+    }
+  }
+
+  cfg->ss_count = n_sites;
+  cfg->searches_per_step = 4;
+}
+
+// Fills cfg with the 8-point pattern: site 0 is the centre, followed by four
+// axis-aligned and four diagonal sites per step, the step halving from
+// MAX_FIRST_STEP down to 1. Each site stores its buffer offset for the given
+// stride.
+void av1_init3smotion_compensation(search_site_config *cfg, int stride) {
+  int step, n_sites = 0;
+
+  cfg->ss[n_sites].mv.col = cfg->ss[n_sites].mv.row = 0;
+  cfg->ss[n_sites].offset = 0;
+  ++n_sites;
+
+  for (step = MAX_FIRST_STEP; step > 0; step /= 2) {
+    // Generate offsets for 8 search sites per step.
+    const MV candidates[8] = { { -step, 0 },     { step, 0 },
+                               { 0, -step },     { 0, step },
+                               { -step, -step }, { -step, step },
+                               { step, -step },  { step, step } };
+    int k;
+
+    for (k = 0; k < 8; ++k, ++n_sites) {
+      search_site *const site = &cfg->ss[n_sites];
+      site->mv = candidates[k];
+      site->offset = site->mv.row * stride + site->mv.col;
+    }
+  }
+
+  cfg->ss_count = n_sites;
+  cfg->searches_per_step = 8;
+}
+
+/*
+ * To avoid the penalty for crossing cache-line read, preload the reference
+ * area in a small buffer, which is aligned to make sure there won't be crossing
+ * cache-line read while reading from this buffer. This reduced the cpu
+ * cycles spent on reading ref data in sub-pixel filter functions.
+ * TODO: Currently, since sub-pixel search range here is -3 ~ 3, copy 22 rows x
+ * 32 cols area that is enough for 16x16 macroblock. Later, for SPLITMV, we
+ * could reduce the area.
+ */
+
+// Keeps the low three bits of a motion vector component, i.e. the part that
+// pre() shifts away; used as the offset argument of the sv[a]f calls.
+static INLINE int sp(int x) {
+  const int frac_mask = 7;
+  return x & frac_mask;
+}
+
+// Address of the sample at (r >> 3, c >> 3) in a buffer with the given
+// stride; the low three bits of r and c are ignored here.
+static INLINE const uint8_t *pre(const uint8_t *buf, int stride, int r, int c) {
+  const int whole_row = r >> 3;
+  const int whole_col = c >> 3;
+  return buf + (whole_row * stride + whole_col);
+}
+
+/* checks if (r, c) has better score than previous best
+ *
+ * v receives the candidate's score (mv cost + prediction error), or INT_MAX
+ * when (r, c) lies outside [minr, maxr] x [minc, maxc]. On improvement
+ * besterr, br, bc, *distortion and *sse1 are updated.
+ *
+ * Relies on these names in the expanding scope: minc, maxc, minr, maxr,
+ * ref_mv, mvjcost, mvcost, error_per_bit, vfp, y, y_stride, src_address,
+ * src_stride, second_pred, sse, thismse, besterr, br, bc, distortion, sse1.
+ * Note that r and c are evaluated several times.
+ */
+#define CHECK_BETTER(v, r, c) \
+ if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \
+ MV this_mv = { r, c }; \
+ v = mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit); \
+ if (second_pred == NULL) /* single prediction */ \
+ thismse = vfp->svf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), \
+ src_address, src_stride, &sse); \
+ else /* averaged with second_pred */ \
+ thismse = vfp->svaf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), \
+ src_address, src_stride, &sse, second_pred); \
+ v += thismse; \
+ if (v < besterr) { \
+ besterr = v; \
+ br = r; \
+ bc = c; \
+ *distortion = thismse; \
+ *sse1 = sse; \
+ } \
+ } else { \
+ v = INT_MAX; \
+ }
+
+#define CHECK_BETTER0(v, r, c) CHECK_BETTER(v, r, c) /* plain alias of CHECK_BETTER */
+
+// Address of the sample at row r, column c of a buffer with the given stride;
+// unlike pre(), r and c are used as they are, without any shift.
+static INLINE const uint8_t *upre(const uint8_t *buf, int stride, int r,
+                                  int c) {
+  const int displacement = (r)*stride + (c);
+  return buf + displacement;
+}
+
+/* checks if (r, c) has better score than previous best
+ *
+ * Same contract as CHECK_BETTER, except that the prediction error comes from
+ * upsampled_pref_error() on the address returned by upre(), so r and c index
+ * the reference buffer without the >> 3 applied by pre(). In addition to the
+ * names CHECK_BETTER needs, the expanding scope must provide xd, w and h.
+ */
+#define CHECK_BETTER1(v, r, c) \
+ if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \
+ MV this_mv = { r, c }; \
+ thismse = upsampled_pref_error(xd, vfp, src_address, src_stride, \
+ upre(y, y_stride, r, c), y_stride, \
+ second_pred, w, h, &sse); \
+ v = mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit); \
+ v += thismse; \
+ if (v < besterr) { \
+ besterr = v; \
+ br = r; \
+ bc = c; \
+ *distortion = thismse; \
+ *sse1 = sse; \
+ } \
+ } else { \
+ v = INT_MAX; \
+ }
+
+ // Evaluates the four axis neighbours of (tr, tc) at distance hstep with
+ // CHECK_BETTER, then one diagonal neighbour. whichdir encodes the cheaper
+ // side per axis: bit 0 is set when right is not worse than left, bit 1 is set
+ // when down is not worse than up. Uses tr, tc, hstep and whichdir from the
+ // enclosing scope, plus everything CHECK_BETTER needs.
+ // NOTE(review): the diagonal rows look swapped relative to whichdir (case 0
+ // means "up < down" is false for bit 1 cleared... i.e. up is cheaper, and it
+ // probes tr - hstep, which is consistent; cases 2/3 probe tr + hstep).
+ #define FIRST_LEVEL_CHECKS \
+ { \
+ unsigned int left, right, up, down, diag; \
+ CHECK_BETTER(left, tr, tc - hstep); \
+ CHECK_BETTER(right, tr, tc + hstep); \
+ CHECK_BETTER(up, tr - hstep, tc); \
+ CHECK_BETTER(down, tr + hstep, tc); \
+ whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2); \
+ switch (whichdir) { \
+ case 0: CHECK_BETTER(diag, tr - hstep, tc - hstep); break; \
+ case 1: CHECK_BETTER(diag, tr - hstep, tc + hstep); break; \
+ case 2: CHECK_BETTER(diag, tr + hstep, tc - hstep); break; \
+ case 3: CHECK_BETTER(diag, tr + hstep, tc + hstep); break; \
+ } \
+ }
+
+ // Follow-up probes after the best point (br, bc) has moved away from the
+ // search centre (tr, tc). Three cases are handled:
+ //  - both row and column changed: two points, each extending the move by
+ //    one more step along a single axis;
+ //  - only the column changed: two points two column steps away at rows
+ //    tr +/- hstep, then one point one column step away whose row side is
+ //    chosen from whichdir;
+ //  - only the row changed: the mirror image of the previous case.
+ // If (br, bc) == (tr, tc) nothing is evaluated. Uses tr, tc, br, bc, hstep
+ // and whichdir from the enclosing scope, plus everything CHECK_BETTER needs.
+ #define SECOND_LEVEL_CHECKS \
+ { \
+ int kr, kc; \
+ unsigned int second; \
+ if (tr != br && tc != bc) { \
+ kr = br - tr; \
+ kc = bc - tc; \
+ CHECK_BETTER(second, tr + kr, tc + 2 * kc); \
+ CHECK_BETTER(second, tr + 2 * kr, tc + kc); \
+ } else if (tr == br && tc != bc) { \
+ kc = bc - tc; \
+ CHECK_BETTER(second, tr + hstep, tc + 2 * kc); \
+ CHECK_BETTER(second, tr - hstep, tc + 2 * kc); \
+ switch (whichdir) { \
+ case 0: \
+ case 1: CHECK_BETTER(second, tr + hstep, tc + kc); break; \
+ case 2: \
+ case 3: CHECK_BETTER(second, tr - hstep, tc + kc); break; \
+ } \
+ } else if (tr != br && tc == bc) { \
+ kr = br - tr; \
+ CHECK_BETTER(second, tr + 2 * kr, tc + hstep); \
+ CHECK_BETTER(second, tr + 2 * kr, tc - hstep); \
+ switch (whichdir) { \
+ case 0: \
+ case 2: CHECK_BETTER(second, tr + kr, tc + hstep); break; \
+ case 1: \
+ case 3: CHECK_BETTER(second, tr + kr, tc - hstep); break; \
+ } \
+ } \
+ }
+
+ // TODO(yunqingwang): SECOND_LEVEL_CHECKS_BEST is a rewrite of
+ // SECOND_LEVEL_CHECKS, and SECOND_LEVEL_CHECKS should be rewritten
+ // later in the same way.
+ //
+ // k selects the evaluator by token pasting: CHECK_BETTER0 or CHECK_BETTER1.
+ // kr and kc are NOT declared here; they must already exist in the enclosing
+ // scope. At most one of them is reassigned (the one along the axis where the
+ // best point differs from (tr, tc)); the other keeps the caller's value.
+ // The macro remembers the incoming best point as (br0, bc0), probes
+ // (br0 + kr, bc0) and (br0, bc0 + kc), and only if one of those became the
+ // new best does it also probe (br0 + kr, bc0 + kc).
+ #define SECOND_LEVEL_CHECKS_BEST(k) \
+ { \
+ unsigned int second; \
+ int br0 = br; \
+ int bc0 = bc; \
+ assert(tr == br || tc == bc); \
+ if (tr == br && tc != bc) { \
+ kc = bc - tc; \
+ } else if (tr != br && tc == bc) { \
+ kr = br - tr; \
+ } \
+ CHECK_BETTER##k(second, br0 + kr, bc0); \
+ CHECK_BETTER##k(second, br0, bc0 + kc); \
+ if (br0 != br || bc0 != bc) { \
+ CHECK_BETTER##k(second, br0 + kr, bc0 + kc); \
+ } \
+ }
+
+ // Common prologue of the pruned sub-pixel search functions. It declares the
+ // locals that CHECK_BETTER, FIRST_LEVEL_CHECKS and SECOND_LEVEL_CHECKS rely
+ // on, and expects x, ref_mv and iters_per_step to be in scope.
+ //  - offset is computed from the incoming bestmv, before it is scaled.
+ //  - br/bc and tr/tc start at the incoming bestmv multiplied by 8.
+ //  - minc/maxc/minr/maxr are filled by av1_set_subpel_mv_search_range().
+ //  - Side effect: x->best_mv.as_mv itself is multiplied by 8 in place.
+ // Because it ends with statements, it must follow all other declarations of
+ // the enclosing block when strict C89 declaration order is required.
+ #define SETUP_SUBPEL_SEARCH \
+ const uint8_t *const src_address = x->plane[0].src.buf; \
+ const int src_stride = x->plane[0].src.stride; \
+ const MACROBLOCKD *xd = &x->e_mbd; \
+ unsigned int besterr = INT_MAX; \
+ unsigned int sse; \
+ unsigned int whichdir; \
+ int thismse; \
+ MV *bestmv = &x->best_mv.as_mv; \
+ const unsigned int halfiters = iters_per_step; \
+ const unsigned int quarteriters = iters_per_step; \
+ const unsigned int eighthiters = iters_per_step; \
+ const int y_stride = xd->plane[0].pre[0].stride; \
+ const int offset = bestmv->row * y_stride + bestmv->col; \
+ const uint8_t *const y = xd->plane[0].pre[0].buf; \
+ \
+ int br = bestmv->row * 8; \
+ int bc = bestmv->col * 8; \
+ int hstep = 4; \
+ int minc, maxc, minr, maxr; \
+ int tr = br; \
+ int tc = bc; \
+ \
+ av1_set_subpel_mv_search_range(&x->mv_limits, &minc, &maxc, &minr, &maxr, \
+ ref_mv); \
+ \
+ bestmv->row *= 8; \
+ bestmv->col *= 8;
+
+ // Scores the search centre located at y + offset.
+ // When second_pred is NULL the variance is taken directly between the
+ // reference block and src. Otherwise the reference is first averaged with
+ // second_pred into a temporary w x h block (a 16-bit buffer when the current
+ // frame carries YV12_FLAG_HIGHBITDEPTH and CONFIG_HIGHBITDEPTH is enabled).
+ // *distortion receives the variance, *sse1 is written by vfp->vf, and the
+ // return value is the variance plus the motion vector cost of bestmv.
+ static unsigned int setup_center_error(
+     const MACROBLOCKD *xd, const MV *bestmv, const MV *ref_mv,
+     int error_per_bit, const aom_variance_fn_ptr_t *vfp,
+     const uint8_t *const src, const int src_stride, const uint8_t *const y,
+     int y_stride, const uint8_t *second_pred, int w, int h, int offset,
+     int *mvjcost, int *mvcost[2], unsigned int *sse1, int *distortion) {
+   const uint8_t *const center = y + offset;
+   unsigned int center_err;
+
+ #if !CONFIG_HIGHBITDEPTH
+   (void)xd;
+ #endif  // !CONFIG_HIGHBITDEPTH
+
+   if (second_pred == NULL) {
+     center_err = vfp->vf(center, y_stride, src, src_stride, sse1);
+   } else {
+ #if CONFIG_HIGHBITDEPTH
+     if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+       DECLARE_ALIGNED(16, uint16_t, comp_pred16[MAX_SB_SQUARE]);
+       aom_highbd_comp_avg_pred(comp_pred16, second_pred, w, h, center,
+                                y_stride);
+       center_err =
+           vfp->vf(CONVERT_TO_BYTEPTR(comp_pred16), w, src, src_stride, sse1);
+     } else
+ #endif  // CONFIG_HIGHBITDEPTH
+     {
+       DECLARE_ALIGNED(16, uint8_t, comp_pred[MAX_SB_SQUARE]);
+       aom_comp_avg_pred(comp_pred, second_pred, w, h, center, y_stride);
+       center_err = vfp->vf(comp_pred, w, src, src_stride, sse1);
+     }
+   }
+
+   *distortion = center_err;
+   center_err += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
+   return center_err;
+ }
+
+ // Integer division n / d rounded to the nearest integer. Half of the divisor
+ // is added to n when the operands share a sign and subtracted when they
+ // differ, before the truncating division. d must be non-zero.
+ static INLINE int divide_and_round(int n, int d) {
+   const int half = d / 2;
+   const int signs_differ = (n < 0) != (d < 0);
+   if (signs_differ) return (n - half) / d;
+   return (n + half) / d;
+ }
+
+ // Returns 1 when the centre cost cost_list[0] is strictly smaller than each
+ // of the four neighbour costs cost_list[1..4], and 0 otherwise.
+ static INLINE int is_cost_list_wellbehaved(int *cost_list) {
+   int i;
+   for (i = 1; i <= 4; ++i) {
+     if (cost_list[0] >= cost_list[i]) return 0;
+   }
+   return 1;
+ }
+
+ // Estimates the position of the cost-surface minimum, in units of 1/2^bits.
+ // The surface is modelled as S = A(x - x0)^2 + B(y - y0)^2 + C. Given the
+ // costs S0..S4 sampled at (y, x) = (0, 0), (0, -1), (1, 0), (0, 1), (-1, 0),
+ // the minimum lies at
+ //   x0 = 1/2 (S1 - S3) / (S1 + S3 - 2*S0),
+ //   y0 = 1/2 (S4 - S2) / (S4 + S2 - 2*S0).
+ // *ic receives the integerized x0 and *ir the integerized y0. Both
+ // denominators must be non-zero.
+ static void get_cost_surf_min(int *cost_list, int *ir, int *ic, int bits) {
+   const int center = cost_list[0];
+   const int left = cost_list[1];
+   const int below = cost_list[2];
+   const int right = cost_list[3];
+   const int above = cost_list[4];
+   const int half_scale = 1 << (bits - 1);
+
+   *ic = divide_and_round((left - right) * half_scale,
+                          (left - 2 * center + right));
+   *ir = divide_and_round((above - below) * half_scale,
+                          (above - 2 * center + below));
+ }
+
+ // Sub-pixel refinement of x->best_mv with the most aggressive pruning of the
+ // three "pruned" variants. On entry x->best_mv.as_mv is read by
+ // SETUP_SUBPEL_SEARCH and scaled by 8; on exit it holds the best (row, col)
+ // found. *distortion and *sse1 track the best candidate. Returns besterr,
+ // the best distortion plus motion vector cost.
+ //
+ // If cost_list holds five valid, well-behaved costs, a single point derived
+ // from get_cost_surf_min() is tested instead of the half- and quarter-step
+ // pattern searches. use_upsampled_ref is ignored.
+ int av1_find_best_sub_pixel_tree_pruned_evenmore(
+ MACROBLOCK *x, const MV *ref_mv, int allow_hp, int error_per_bit,
+ const aom_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step,
+ int *cost_list, int *mvjcost, int *mvcost[2], int *distortion,
+ unsigned int *sse1, const uint8_t *second_pred, int w, int h,
+ int use_upsampled_ref) {
+ SETUP_SUBPEL_SEARCH;
+ besterr = setup_center_error(
+ xd, bestmv, ref_mv, error_per_bit, vfp, src_address, src_stride, y,
+ y_stride, second_pred, w, h, offset, mvjcost, mvcost, sse1, distortion);
+ // NOTE(review): most of the names cast to void here are in fact used further
+ // down; the casts are harmless.
+ (void)halfiters;
+ (void)quarteriters;
+ (void)eighthiters;
+ (void)whichdir;
+ (void)allow_hp;
+ (void)forced_stop;
+ (void)hstep;
+ (void)use_upsampled_ref;
+
+ if (cost_list && cost_list[0] != INT_MAX && cost_list[1] != INT_MAX &&
+ cost_list[2] != INT_MAX && cost_list[3] != INT_MAX &&
+ cost_list[4] != INT_MAX && is_cost_list_wellbehaved(cost_list)) {
+ int ir, ic;
+ unsigned int minpt;
+ // ir/ic come back in units of 1/4; doubling them gives the 1/8 units used
+ // by tr/tc. hstep is left untouched (still 4) on this path.
+ get_cost_surf_min(cost_list, &ir, &ic, 2);
+ if (ir != 0 || ic != 0) {
+ CHECK_BETTER(minpt, tr + 2 * ir, tc + 2 * ic);
+ }
+ } else {
+ FIRST_LEVEL_CHECKS;
+ if (halfiters > 1) {
+ SECOND_LEVEL_CHECKS;
+ }
+
+ tr = br;
+ tc = bc;
+
+ // Each subsequent iteration checks at least one point in common with
+ // the last iteration could be 2 ( if diag selected) 1/4 pel
+ // Note forced_stop: 0 - full, 1 - qtr only, 2 - half only
+ if (forced_stop != 2) {
+ hstep >>= 1;
+ FIRST_LEVEL_CHECKS;
+ if (quarteriters > 1) {
+ SECOND_LEVEL_CHECKS;
+ }
+ }
+ }
+
+ tr = br;
+ tc = bc;
+
+ // NOTE(review): when the cost_list shortcut above was taken, hstep is 4 here
+ // and becomes 2 below, so this stage steps by 2 rather than 1 - confirm that
+ // this is intended.
+ if (allow_hp && forced_stop == 0) {
+ hstep >>= 1;
+ FIRST_LEVEL_CHECKS;
+ if (eighthiters > 1) {
+ SECOND_LEVEL_CHECKS;
+ }
+ }
+
+ bestmv->row = br;
+ bestmv->col = bc;
+
+ return besterr;
+ }
+
+ // Sub-pixel refinement of x->best_mv. On entry x->best_mv.as_mv is read by
+ // SETUP_SUBPEL_SEARCH and scaled by 8; on exit it holds the best (row, col)
+ // found. *distortion and *sse1 track the best candidate. Returns besterr,
+ // the best distortion plus motion vector cost.
+ //
+ // If cost_list holds five valid, well-behaved costs, the first (hstep == 4)
+ // stage is replaced by a single point derived from get_cost_surf_min(). The
+ // hstep == 2 stage then runs unless forced_stop == 2, and the hstep == 1
+ // stage runs only when allow_hp is set and forced_stop == 0.
+ // use_upsampled_ref is ignored.
+ int av1_find_best_sub_pixel_tree_pruned_more(
+ MACROBLOCK *x, const MV *ref_mv, int allow_hp, int error_per_bit,
+ const aom_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step,
+ int *cost_list, int *mvjcost, int *mvcost[2], int *distortion,
+ unsigned int *sse1, const uint8_t *second_pred, int w, int h,
+ int use_upsampled_ref) {
+ SETUP_SUBPEL_SEARCH;
+ (void)use_upsampled_ref;
+
+ besterr = setup_center_error(
+ xd, bestmv, ref_mv, error_per_bit, vfp, src_address, src_stride, y,
+ y_stride, second_pred, w, h, offset, mvjcost, mvcost, sse1, distortion);
+ if (cost_list && cost_list[0] != INT_MAX && cost_list[1] != INT_MAX &&
+ cost_list[2] != INT_MAX && cost_list[3] != INT_MAX &&
+ cost_list[4] != INT_MAX && is_cost_list_wellbehaved(cost_list)) {
+ unsigned int minpt;
+ int ir, ic;
+ // ir/ic come back in units of 1/2; multiplying by hstep (4 here) converts
+ // them to the 1/8 units used by tr/tc.
+ get_cost_surf_min(cost_list, &ir, &ic, 1);
+ if (ir != 0 || ic != 0) {
+ CHECK_BETTER(minpt, tr + ir * hstep, tc + ic * hstep);
+ }
+ } else {
+ FIRST_LEVEL_CHECKS;
+ if (halfiters > 1) {
+ SECOND_LEVEL_CHECKS;
+ }
+ }
+
+ // Each subsequent iteration checks at least one point in common with
+ // the last iteration could be 2 ( if diag selected) 1/4 pel
+
+ // Note forced_stop: 0 - full, 1 - qtr only, 2 - half only
+ if (forced_stop != 2) {
+ tr = br;
+ tc = bc;
+ hstep >>= 1;
+ FIRST_LEVEL_CHECKS;
+ if (quarteriters > 1) {
+ SECOND_LEVEL_CHECKS;
+ }
+ }
+
+ if (allow_hp && forced_stop == 0) {
+ tr = br;
+ tc = bc;
+ hstep >>= 1;
+ FIRST_LEVEL_CHECKS;
+ if (eighthiters > 1) {
+ SECOND_LEVEL_CHECKS;
+ }
+ }
+ // These lines ensure static analysis doesn't warn that
+ // tr and tc aren't used after the above point.
+ (void)tr;
+ (void)tc;
+
+ bestmv->row = br;
+ bestmv->col = bc;
+
+ return besterr;
+ }
+
+ // Sub-pixel refinement of x->best_mv. On entry x->best_mv.as_mv is read by
+ // SETUP_SUBPEL_SEARCH and scaled by 8; on exit it holds the best (row, col)
+ // found. *distortion and *sse1 track the best candidate. Returns besterr,
+ // the best distortion plus motion vector cost.
+ //
+ // If cost_list holds five valid costs, the first (hstep == 4) stage tests
+ // only three points: the cheaper horizontal neighbour, the cheaper vertical
+ // neighbour and the diagonal between them, as ranked by cost_list. Otherwise
+ // the full FIRST_LEVEL_CHECKS / SECOND_LEVEL_CHECKS pattern is used.
+ // use_upsampled_ref is ignored.
+ int av1_find_best_sub_pixel_tree_pruned(
+ MACROBLOCK *x, const MV *ref_mv, int allow_hp, int error_per_bit,
+ const aom_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step,
+ int *cost_list, int *mvjcost, int *mvcost[2], int *distortion,
+ unsigned int *sse1, const uint8_t *second_pred, int w, int h,
+ int use_upsampled_ref) {
+ SETUP_SUBPEL_SEARCH;
+ (void)use_upsampled_ref;
+
+ besterr = setup_center_error(
+ xd, bestmv, ref_mv, error_per_bit, vfp, src_address, src_stride, y,
+ y_stride, second_pred, w, h, offset, mvjcost, mvcost, sse1, distortion);
+ if (cost_list && cost_list[0] != INT_MAX && cost_list[1] != INT_MAX &&
+ cost_list[2] != INT_MAX && cost_list[3] != INT_MAX &&
+ cost_list[4] != INT_MAX) {
+ unsigned int left, right, up, down, diag;
+ // Bit 0: right (cost_list[3]) is not worse than left (cost_list[1]).
+ // Bit 1: cost_list[4] is not worse than cost_list[2].
+ whichdir = (cost_list[1] < cost_list[3] ? 0 : 1) +
+ (cost_list[2] < cost_list[4] ? 0 : 2);
+ switch (whichdir) {
+ case 0:
+ CHECK_BETTER(left, tr, tc - hstep);
+ CHECK_BETTER(down, tr + hstep, tc);
+ CHECK_BETTER(diag, tr + hstep, tc - hstep);
+ break;
+ case 1:
+ CHECK_BETTER(right, tr, tc + hstep);
+ CHECK_BETTER(down, tr + hstep, tc);
+ CHECK_BETTER(diag, tr + hstep, tc + hstep);
+ break;
+ case 2:
+ CHECK_BETTER(left, tr, tc - hstep);
+ CHECK_BETTER(up, tr - hstep, tc);
+ CHECK_BETTER(diag, tr - hstep, tc - hstep);
+ break;
+ case 3:
+ CHECK_BETTER(right, tr, tc + hstep);
+ CHECK_BETTER(up, tr - hstep, tc);
+ CHECK_BETTER(diag, tr - hstep, tc + hstep);
+ break;
+ }
+ } else {
+ FIRST_LEVEL_CHECKS;
+ if (halfiters > 1) {
+ SECOND_LEVEL_CHECKS;
+ }
+ }
+
+ tr = br;
+ tc = bc;
+
+ // Each subsequent iteration checks at least one point in common with
+ // the last iteration could be 2 ( if diag selected) 1/4 pel
+
+ // Note forced_stop: 0 - full, 1 - qtr only, 2 - half only
+ if (forced_stop != 2) {
+ hstep >>= 1;
+ FIRST_LEVEL_CHECKS;
+ if (quarteriters > 1) {
+ SECOND_LEVEL_CHECKS;
+ }
+ tr = br;
+ tc = bc;
+ }
+
+ if (allow_hp && forced_stop == 0) {
+ hstep >>= 1;
+ FIRST_LEVEL_CHECKS;
+ if (eighthiters > 1) {
+ SECOND_LEVEL_CHECKS;
+ }
+ tr = br;
+ tc = bc;
+ }
+ // These lines ensure static analysis doesn't warn that
+ // tr and tc aren't used after the above point.
+ (void)tr;
+ (void)tc;
+
+ bestmv->row = br;
+ bestmv->col = bc;
+
+ return besterr;
+ }
+
+ // { row, col } offsets of the four axis neighbours, one group of four per
+ // search round: step 4 first, then 2, then 1. The sub-pixel tree search
+ // advances through the table four entries at a time.
+ /* clang-format off */
+ static const MV search_step_table[12] = {
+ // left, right, up, down
+ { 0, -4 }, { 0, 4 }, { -4, 0 }, { 4, 0 },
+ { 0, -2 }, { 0, 2 }, { -2, 0 }, { 2, 0 },
+ { 0, -1 }, { 0, 1 }, { -1, 0 }, { 1, 0 }
+ };
+ /* clang-format on */
+
+ // Builds a w x h prediction from the reference at y (averaged with
+ // second_pred when that is non-NULL) and returns its variance against src as
+ // computed by vfp->vf, which also writes *sse. A 16-bit temporary is used
+ // when CONFIG_HIGHBITDEPTH is enabled and the current frame carries
+ // YV12_FLAG_HIGHBITDEPTH; otherwise an 8-bit temporary is used.
+ static int upsampled_pref_error(const MACROBLOCKD *xd,
+                                 const aom_variance_fn_ptr_t *vfp,
+                                 const uint8_t *const src, const int src_stride,
+                                 const uint8_t *const y, int y_stride,
+                                 const uint8_t *second_pred, int w, int h,
+                                 unsigned int *sse) {
+ #if CONFIG_HIGHBITDEPTH
+   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+     DECLARE_ALIGNED(16, uint16_t, pred16[MAX_SB_SQUARE]);
+     unsigned int highbd_err;
+     if (second_pred == NULL)
+       aom_highbd_upsampled_pred(pred16, w, h, y, y_stride);
+     else
+       aom_highbd_comp_avg_upsampled_pred(pred16, second_pred, w, h, y,
+                                          y_stride);
+
+     highbd_err = vfp->vf(CONVERT_TO_BYTEPTR(pred16), w, src, src_stride, sse);
+     return highbd_err;
+   }
+ #else
+   (void)xd;
+ #endif  // CONFIG_HIGHBITDEPTH
+   {
+     DECLARE_ALIGNED(16, uint8_t, pred[MAX_SB_SQUARE]);
+     unsigned int lowbd_err;
+     if (second_pred == NULL)
+       aom_upsampled_pred(pred, w, h, y, y_stride);
+     else
+       aom_comp_avg_upsampled_pred(pred, second_pred, w, h, y, y_stride);
+
+     lowbd_err = vfp->vf(pred, w, src, src_stride, sse);
+     return lowbd_err;
+   }
+ }
+
+ // Scores the search centre at y + offset via upsampled_pref_error().
+ // *distortion receives the prediction error alone; the return value adds the
+ // motion vector cost of bestmv relative to ref_mv.
+ static unsigned int upsampled_setup_center_error(
+     const MACROBLOCKD *xd, const MV *bestmv, const MV *ref_mv,
+     int error_per_bit, const aom_variance_fn_ptr_t *vfp,
+     const uint8_t *const src, const int src_stride, const uint8_t *const y,
+     int y_stride, const uint8_t *second_pred, int w, int h, int offset,
+     int *mvjcost, int *mvcost[2], unsigned int *sse1, int *distortion) {
+   const uint8_t *const center = y + offset;
+   const unsigned int center_err = upsampled_pref_error(
+       xd, vfp, src, src_stride, center, y_stride, second_pred, w, h, sse1);
+
+   *distortion = center_err;
+   return center_err +
+          mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
+ }
+
+ // Sub-pixel refinement of x->best_mv using a tree search.
+ //
+ // On entry x->best_mv.as_mv is multiplied by 8 in place; on exit it holds the
+ // best (row, col) found. Up to (3 - forced_stop) rounds are run, capped at 2
+ // when allow_hp is 0, with step sizes 4, 2 and 1 taken from
+ // search_step_table. Each round evaluates the four axis neighbours of the
+ // current best, then the diagonal between the cheaper horizontal and cheaper
+ // vertical neighbour, and, when iters_per_step > 1 and the round found an
+ // improvement, the extra points of SECOND_LEVEL_CHECKS_BEST.
+ //
+ // When use_upsampled_ref is non-zero, candidates are scored with
+ // upsampled_pref_error() and (tr, tc) index the reference buffer directly;
+ // otherwise vfp->svf / vfp->svaf are used on the position (tr >> 3, tc >> 3).
+ //
+ // *distortion and *sse1 are updated whenever a better candidate is found.
+ // cost_list is unused. Returns the best distortion plus motion vector cost.
+ int av1_find_best_sub_pixel_tree(MACROBLOCK *x, const MV *ref_mv, int allow_hp,
+                                  int error_per_bit,
+                                  const aom_variance_fn_ptr_t *vfp,
+                                  int forced_stop, int iters_per_step,
+                                  int *cost_list, int *mvjcost, int *mvcost[2],
+                                  int *distortion, unsigned int *sse1,
+                                  const uint8_t *second_pred, int w, int h,
+                                  int use_upsampled_ref) {
+   const uint8_t *const src_address = x->plane[0].src.buf;
+   const int src_stride = x->plane[0].src.stride;
+   const MACROBLOCKD *xd = &x->e_mbd;
+   unsigned int besterr = INT_MAX;
+   unsigned int sse;
+   unsigned int thismse;
+   const int y_stride = xd->plane[0].pre[0].stride;
+   MV *bestmv = &x->best_mv.as_mv;
+   const int offset = bestmv->row * y_stride + bestmv->col;
+   const uint8_t *const y = xd->plane[0].pre[0].buf;
+
+   int br = bestmv->row * 8;
+   int bc = bestmv->col * 8;
+   int hstep = 4;
+   int iter, round = 3 - forced_stop;
+   int tr = br;
+   int tc = bc;
+   const MV *search_step = search_step_table;
+   int idx, best_idx = -1;
+   unsigned int cost_array[5];
+   int kr, kc;
+   int minc, maxc, minr, maxr;
+
+   av1_set_subpel_mv_search_range(&x->mv_limits, &minc, &maxc, &minr, &maxr,
+                                  ref_mv);
+
+   // Without high-precision MVs the final (step 1) round is skipped.
+   if (!allow_hp && round == 3) round = 2;
+
+   bestmv->row *= 8;
+   bestmv->col *= 8;
+
+   // use_upsampled_ref can be 0 or 1
+   if (use_upsampled_ref)
+     besterr = upsampled_setup_center_error(
+         xd, bestmv, ref_mv, error_per_bit, vfp, src_address, src_stride, y,
+         y_stride, second_pred, w, h, (offset * 8), mvjcost, mvcost, sse1,
+         distortion);
+   else
+     besterr = setup_center_error(
+         xd, bestmv, ref_mv, error_per_bit, vfp, src_address, src_stride, y,
+         y_stride, second_pred, w, h, offset, mvjcost, mvcost, sse1, distortion);
+
+   (void)cost_list;  // to silence compiler warning
+
+   for (iter = 0; iter < round; ++iter) {
+     // Check vertical and horizontal sub-pixel positions.
+     for (idx = 0; idx < 4; ++idx) {
+       tr = br + search_step[idx].row;
+       tc = bc + search_step[idx].col;
+       if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) {
+         MV this_mv = { tr, tc };
+
+         if (use_upsampled_ref) {
+           const uint8_t *const pre_address = y + tr * y_stride + tc;
+
+           thismse = upsampled_pref_error(xd, vfp, src_address, src_stride,
+                                          pre_address, y_stride, second_pred, w,
+                                          h, &sse);
+         } else {
+           const uint8_t *const pre_address =
+               y + (tr >> 3) * y_stride + (tc >> 3);
+           if (second_pred == NULL)
+             thismse = vfp->svf(pre_address, y_stride, sp(tc), sp(tr),
+                                src_address, src_stride, &sse);
+           else
+             thismse = vfp->svaf(pre_address, y_stride, sp(tc), sp(tr),
+                                 src_address, src_stride, &sse, second_pred);
+         }
+
+         cost_array[idx] = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost,
+                                                 mvcost, error_per_bit);
+
+         if (cost_array[idx] < besterr) {
+           best_idx = idx;
+           besterr = cost_array[idx];
+           *distortion = thismse;
+           *sse1 = sse;
+         }
+       } else {
+         cost_array[idx] = INT_MAX;
+       }
+     }
+
+     // Check diagonal sub-pixel position
+     kc = (cost_array[0] <= cost_array[1] ? -hstep : hstep);
+     kr = (cost_array[2] <= cost_array[3] ? -hstep : hstep);
+
+     tc = bc + kc;
+     tr = br + kr;
+     if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) {
+       MV this_mv = { tr, tc };
+
+       if (use_upsampled_ref) {
+         const uint8_t *const pre_address = y + tr * y_stride + tc;
+
+         thismse =
+             upsampled_pref_error(xd, vfp, src_address, src_stride, pre_address,
+                                  y_stride, second_pred, w, h, &sse);
+       } else {
+         const uint8_t *const pre_address = y + (tr >> 3) * y_stride + (tc >> 3);
+
+         if (second_pred == NULL)
+           thismse = vfp->svf(pre_address, y_stride, sp(tc), sp(tr), src_address,
+                              src_stride, &sse);
+         else
+           thismse = vfp->svaf(pre_address, y_stride, sp(tc), sp(tr),
+                               src_address, src_stride, &sse, second_pred);
+       }
+
+       cost_array[4] = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost,
+                                             error_per_bit);
+
+       if (cost_array[4] < besterr) {
+         best_idx = 4;
+         besterr = cost_array[4];
+         *distortion = thismse;
+         *sse1 = sse;
+       }
+     } else {
+       // Slot 4 is the diagonal. Index it explicitly instead of relying on
+       // idx still being 4 after the loop above.
+       cost_array[4] = INT_MAX;
+     }
+
+     if (best_idx < 4 && best_idx >= 0) {
+       br += search_step[best_idx].row;
+       bc += search_step[best_idx].col;
+     } else if (best_idx == 4) {
+       br = tr;
+       bc = tc;
+     }
+
+     if (iters_per_step > 1 && best_idx != -1) {
+       if (use_upsampled_ref) {
+         SECOND_LEVEL_CHECKS_BEST(1);
+       } else {
+         SECOND_LEVEL_CHECKS_BEST(0);
+       }
+     }
+
+     // Move on to the next, finer group of four steps.
+     search_step += 4;
+     hstep >>= 1;
+     best_idx = -1;
+   }
+
+   // These lines ensure static analysis doesn't warn that
+   // tr and tc aren't used after the above point.
+   (void)tr;
+   (void)tc;
+
+   bestmv->row = br;
+   bestmv->col = bc;
+
+   return besterr;
+ }
+
+ // The sub-pixel CHECK_BETTER(v, r, c) macro is retired here; a different,
+ // argument-less CHECK_BETTER is defined further down for the integer search.
+ // NOTE(review): no PRE macro is defined in the visible part of this file
+ // (pre() is a function) - confirm whether this #undef is still needed.
+ #undef PRE
+ #undef CHECK_BETTER
+
+ // Returns 1 when the square of half-width `range` centred on (row, col) lies
+ // entirely within mv_limits, and 0 otherwise.
+ static INLINE int check_bounds(const MvLimits *mv_limits, int row, int col,
+                                int range) {
+   const int rows_inside = (row - range) >= mv_limits->row_min &&
+                           (row + range) <= mv_limits->row_max;
+   const int cols_inside = (col - range) >= mv_limits->col_min &&
+                           (col + range) <= mv_limits->col_max;
+   return rows_inside && cols_inside;
+ }
+
+ // Returns 1 when mv lies within mv_limits (bounds inclusive), 0 otherwise.
+ static INLINE int is_mv_in(const MvLimits *mv_limits, const MV *mv) {
+   if (mv->col < mv_limits->col_min) return 0;
+   if (mv->col > mv_limits->col_max) return 0;
+   if (mv->row < mv_limits->row_min) return 0;
+   if (mv->row > mv_limits->row_max) return 0;
+   return 1;
+ }
+
+ // Integer-search variant of CHECK_BETTER (takes no arguments).
+ // If thissad already beats bestsad, the motion vector cost of this_mv is
+ // added (only when use_mvcost is set) and the comparison is repeated; on
+ // success bestsad takes the new value and best_site records the loop index i.
+ // Note that thissad itself is modified when the cost is added.
+ // Uses thissad, bestsad, use_mvcost, x, this_mv, fcenter_mv, sad_per_bit,
+ // best_site and i from the enclosing scope.
+ #define CHECK_BETTER \
+ { \
+ if (thissad < bestsad) { \
+ if (use_mvcost) \
+ thissad += mvsad_err_cost(x, &this_mv, &fcenter_mv, sad_per_bit); \
+ if (thissad < bestsad) { \
+ bestsad = thissad; \
+ best_site = i; \
+ } \
+ } \
+ }
+
+ #define MAX_PATTERN_SCALES 11 // number of scales in a search pattern
+ #define MAX_PATTERN_CANDIDATES 8 // max number of candidates per scale
+ #define PATTERN_CANDIDATES_REF 3 // number of refinement candidates
+
+ // Fills cost_list[0..4] with variance-based costs around the integer-pel
+ // position best_mv: entry 0 is the centre, entries 1..4 are the neighbours at
+ // { 0, -1 }, { 1, 0 }, { 0, 1 } and { -1, 0 }. The centre adds
+ // mvsad_err_cost(); each neighbour adds mv_err_cost(). A neighbour outside
+ // x->mv_limits is reported as INT_MAX.
+ static INLINE void calc_int_cost_list(const MACROBLOCK *x,
+                                       const MV *const ref_mv, int sadpb,
+                                       const aom_variance_fn_ptr_t *fn_ptr,
+                                       const MV *best_mv, int *cost_list) {
+   static const MV neighbors[4] = { { 0, -1 }, { 1, 0 }, { 0, 1 }, { -1, 0 } };
+   const struct buf_2d *const src = &x->plane[0].src;
+   const struct buf_2d *const ref = &x->e_mbd.plane[0].pre[0];
+   const MV fcenter_mv = { ref_mv->row >> 3, ref_mv->col >> 3 };
+   const int br = best_mv->row;
+   const int bc = best_mv->col;
+   const MV this_mv = { br, bc };
+   unsigned int sse;
+   int all_in_range;
+   int i;
+
+   cost_list[0] = fn_ptr->vf(src->buf, src->stride,
+                             get_buf_from_mv(ref, &this_mv), ref->stride, &sse) +
+                  mvsad_err_cost(x, &this_mv, &fcenter_mv, sadpb);
+
+   // When the whole 3x3 neighbourhood is in range, the per-neighbour bounds
+   // test is skipped.
+   all_in_range = check_bounds(&x->mv_limits, br, bc, 1);
+   for (i = 0; i < 4; i++) {
+     const MV neighbor_mv = { br + neighbors[i].row, bc + neighbors[i].col };
+     if (!all_in_range && !is_mv_in(&x->mv_limits, &neighbor_mv)) {
+       cost_list[i + 1] = INT_MAX;
+       continue;
+     }
+     cost_list[i + 1] =
+         fn_ptr->vf(src->buf, src->stride, get_buf_from_mv(ref, &neighbor_mv),
+                    ref->stride, &sse) +
+         mv_err_cost(&neighbor_mv, &fcenter_mv, x->nmvjointcost, x->mvcost,
+                     x->errorperbit);
+   }
+ }
+
+ // Completes cost_list around the integer-pel position best_mv.
+ //  - If cost_list[0] is still INT_MAX, the list is rebuilt: entry 0 becomes
+ //    bestsad and entries 1..4 the plain SAD of the neighbours at { 0, -1 },
+ //    { 1, 0 }, { 0, 1 }, { -1, 0 } (INT_MAX for a neighbour outside
+ //    x->mv_limits). No motion vector cost is added on this path.
+ //  - Otherwise the list already holds SADs, and when use_mvcost is set
+ //    mvsad_err_cost() is added to every neighbour entry that is not INT_MAX.
+ static INLINE void calc_int_sad_list(const MACROBLOCK *x,
+                                      const MV *const ref_mv, int sadpb,
+                                      const aom_variance_fn_ptr_t *fn_ptr,
+                                      const MV *best_mv, int *cost_list,
+                                      const int use_mvcost, const int bestsad) {
+   static const MV neighbors[4] = { { 0, -1 }, { 1, 0 }, { 0, 1 }, { -1, 0 } };
+   const struct buf_2d *const src = &x->plane[0].src;
+   const struct buf_2d *const ref = &x->e_mbd.plane[0].pre[0];
+   const MV fcenter_mv = { ref_mv->row >> 3, ref_mv->col >> 3 };
+   const int br = best_mv->row;
+   const int bc = best_mv->col;
+   int all_in_range;
+   int i;
+
+   if (cost_list[0] != INT_MAX) {
+     if (!use_mvcost) return;
+     for (i = 0; i < 4; i++) {
+       const MV this_mv = { br + neighbors[i].row, bc + neighbors[i].col };
+       if (cost_list[i + 1] == INT_MAX) continue;
+       cost_list[i + 1] += mvsad_err_cost(x, &this_mv, &fcenter_mv, sadpb);
+     }
+     return;
+   }
+
+   cost_list[0] = bestsad;
+   all_in_range = check_bounds(&x->mv_limits, br, bc, 1);
+   for (i = 0; i < 4; i++) {
+     const MV this_mv = { br + neighbors[i].row, bc + neighbors[i].col };
+     if (all_in_range || is_mv_in(&x->mv_limits, &this_mv)) {
+       cost_list[i + 1] =
+           fn_ptr->sdf(src->buf, src->stride, get_buf_from_mv(ref, &this_mv),
+                       ref->stride);
+     } else {
+       cost_list[i + 1] = INT_MAX;
+     }
+   }
+ }
+
+ // Generic pattern search function that searches over multiple scales.
+ // Each scale can have a different number of candidates and shape of
+ // candidates as indicated in the num_candidates and candidates arrays
+ // passed into this function
+ //
+ // start_mv is clamped to x->mv_limits in place and used as the initial
+ // centre. search_param selects the starting scale through
+ // search_param_to_steps. With do_init_search set, every scale up to that
+ // starting scale is first probed around the centre and the search continues
+ // from the scale that produced the best point. If cost_list is non-NULL it is
+ // filled with the costs around the final position (see the comment near the
+ // end of the function). The final position is stored in x->best_mv.as_mv and
+ // the best SAD-based cost (bestsad) is returned.
+ static int pattern_search(
+ MACROBLOCK *x, MV *start_mv, int search_param, int sad_per_bit,
+ int do_init_search, int *cost_list, const aom_variance_fn_ptr_t *vfp,
+ int use_mvcost, const MV *center_mv,
+ const int num_candidates[MAX_PATTERN_SCALES],
+ const MV candidates[MAX_PATTERN_SCALES][MAX_PATTERN_CANDIDATES]) {
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ static const int search_param_to_steps[MAX_MVSEARCH_STEPS] = {
+ 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
+ };
+ int i, s, t;
+ const struct buf_2d *const what = &x->plane[0].src;
+ const struct buf_2d *const in_what = &xd->plane[0].pre[0];
+ const int last_is_4 = num_candidates[0] == 4;
+ int br, bc;
+ int bestsad = INT_MAX;
+ int thissad;
+ int k = -1;
+ const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 };
+ int best_init_s = search_param_to_steps[search_param];
+ // adjust ref_mv to make sure it is within MV range
+ clamp_mv(start_mv, x->mv_limits.col_min, x->mv_limits.col_max,
+ x->mv_limits.row_min, x->mv_limits.row_max);
+ br = start_mv->row;
+ bc = start_mv->col;
+ if (cost_list != NULL) {
+ cost_list[0] = cost_list[1] = cost_list[2] = cost_list[3] = cost_list[4] =
+ INT_MAX;
+ }
+
+ // Work out the start point for the search
+ bestsad = vfp->sdf(what->buf, what->stride,
+ get_buf_from_mv(in_what, start_mv), in_what->stride) +
+ mvsad_err_cost(x, start_mv, &fcenter_mv, sad_per_bit);
+
+ // Search all possible scales up to the search param around the center point
+ // pick the scale of the point that is best as the starting scale of
+ // further steps around it.
+ if (do_init_search) {
+ s = best_init_s;
+ best_init_s = -1;
+ for (t = 0; t <= s; ++t) {
+ int best_site = -1;
+ // When the whole scale-t neighbourhood is in range, the per-candidate
+ // bounds test is skipped.
+ if (check_bounds(&x->mv_limits, br, bc, 1 << t)) {
+ for (i = 0; i < num_candidates[t]; i++) {
+ const MV this_mv = { br + candidates[t][i].row,
+ bc + candidates[t][i].col };
+ thissad =
+ vfp->sdf(what->buf, what->stride,
+ get_buf_from_mv(in_what, &this_mv), in_what->stride);
+ CHECK_BETTER
+ }
+ } else {
+ for (i = 0; i < num_candidates[t]; i++) {
+ const MV this_mv = { br + candidates[t][i].row,
+ bc + candidates[t][i].col };
+ if (!is_mv_in(&x->mv_limits, &this_mv)) continue;
+ thissad =
+ vfp->sdf(what->buf, what->stride,
+ get_buf_from_mv(in_what, &this_mv), in_what->stride);
+ CHECK_BETTER
+ }
+ }
+ if (best_site == -1) {
+ continue;
+ } else {
+ best_init_s = t;
+ k = best_site;
+ }
+ }
+ if (best_init_s != -1) {
+ br += candidates[best_init_s][k].row;
+ bc += candidates[best_init_s][k].col;
+ }
+ }
+
+ // If the center point is still the best, just skip this and move to
+ // the refinement step.
+ if (best_init_s != -1) {
+ // With a 4-point scale 0 and a cost list requested, scale 0 is handled by
+ // the dedicated block after this loop, so the loop stops at scale 1.
+ const int last_s = (last_is_4 && cost_list != NULL);
+ int best_site = -1;
+ s = best_init_s;
+
+ for (; s >= last_s; s--) {
+ // No need to search all points the 1st time if initial search was used
+ if (!do_init_search || s != best_init_s) {
+ if (check_bounds(&x->mv_limits, br, bc, 1 << s)) {
+ for (i = 0; i < num_candidates[s]; i++) {
+ const MV this_mv = { br + candidates[s][i].row,
+ bc + candidates[s][i].col };
+ thissad =
+ vfp->sdf(what->buf, what->stride,
+ get_buf_from_mv(in_what, &this_mv), in_what->stride);
+ CHECK_BETTER
+ }
+ } else {
+ for (i = 0; i < num_candidates[s]; i++) {
+ const MV this_mv = { br + candidates[s][i].row,
+ bc + candidates[s][i].col };
+ if (!is_mv_in(&x->mv_limits, &this_mv)) continue;
+ thissad =
+ vfp->sdf(what->buf, what->stride,
+ get_buf_from_mv(in_what, &this_mv), in_what->stride);
+ CHECK_BETTER
+ }
+ }
+
+ if (best_site == -1) {
+ continue;
+ } else {
+ br += candidates[s][best_site].row;
+ bc += candidates[s][best_site].col;
+ k = best_site;
+ }
+ }
+
+ // Keep stepping at this scale, each time testing only the last winning
+ // direction k and its two neighbours in the candidate ring.
+ do {
+ int next_chkpts_indices[PATTERN_CANDIDATES_REF];
+ best_site = -1;
+ next_chkpts_indices[0] = (k == 0) ? num_candidates[s] - 1 : k - 1;
+ next_chkpts_indices[1] = k;
+ next_chkpts_indices[2] = (k == num_candidates[s] - 1) ? 0 : k + 1;
+
+ if (check_bounds(&x->mv_limits, br, bc, 1 << s)) {
+ for (i = 0; i < PATTERN_CANDIDATES_REF; i++) {
+ const MV this_mv = {
+ br + candidates[s][next_chkpts_indices[i]].row,
+ bc + candidates[s][next_chkpts_indices[i]].col
+ };
+ thissad =
+ vfp->sdf(what->buf, what->stride,
+ get_buf_from_mv(in_what, &this_mv), in_what->stride);
+ CHECK_BETTER
+ }
+ } else {
+ for (i = 0; i < PATTERN_CANDIDATES_REF; i++) {
+ const MV this_mv = {
+ br + candidates[s][next_chkpts_indices[i]].row,
+ bc + candidates[s][next_chkpts_indices[i]].col
+ };
+ if (!is_mv_in(&x->mv_limits, &this_mv)) continue;
+ thissad =
+ vfp->sdf(what->buf, what->stride,
+ get_buf_from_mv(in_what, &this_mv), in_what->stride);
+ CHECK_BETTER
+ }
+ }
+
+ if (best_site != -1) {
+ k = next_chkpts_indices[best_site];
+ br += candidates[s][k].row;
+ bc += candidates[s][k].col;
+ }
+ } while (best_site != -1);
+ }
+
+ // Note: If we enter the if below, then cost_list must be non-NULL.
+ // (s == 0 after the loop only when last_s == 1.) This block repeats the
+ // scale-0 search while recording each evaluated SAD in cost_list.
+ if (s == 0) {
+ cost_list[0] = bestsad;
+ if (!do_init_search || s != best_init_s) {
+ if (check_bounds(&x->mv_limits, br, bc, 1 << s)) {
+ for (i = 0; i < num_candidates[s]; i++) {
+ const MV this_mv = { br + candidates[s][i].row,
+ bc + candidates[s][i].col };
+ cost_list[i + 1] = thissad =
+ vfp->sdf(what->buf, what->stride,
+ get_buf_from_mv(in_what, &this_mv), in_what->stride);
+ CHECK_BETTER
+ }
+ } else {
+ for (i = 0; i < num_candidates[s]; i++) {
+ const MV this_mv = { br + candidates[s][i].row,
+ bc + candidates[s][i].col };
+ if (!is_mv_in(&x->mv_limits, &this_mv)) continue;
+ cost_list[i + 1] = thissad =
+ vfp->sdf(what->buf, what->stride,
+ get_buf_from_mv(in_what, &this_mv), in_what->stride);
+ CHECK_BETTER
+ }
+ }
+
+ if (best_site != -1) {
+ br += candidates[s][best_site].row;
+ bc += candidates[s][best_site].col;
+ k = best_site;
+ }
+ }
+ while (best_site != -1) {
+ int next_chkpts_indices[PATTERN_CANDIDATES_REF];
+ best_site = -1;
+ next_chkpts_indices[0] = (k == 0) ? num_candidates[s] - 1 : k - 1;
+ next_chkpts_indices[1] = k;
+ next_chkpts_indices[2] = (k == num_candidates[s] - 1) ? 0 : k + 1;
+ // The centre has moved in direction k, so the previous centre is now the
+ // neighbour on the opposite side: carry its cost over and reset the
+ // remaining entries.
+ cost_list[1] = cost_list[2] = cost_list[3] = cost_list[4] = INT_MAX;
+ cost_list[((k + 2) % 4) + 1] = cost_list[0];
+ cost_list[0] = bestsad;
+
+ if (check_bounds(&x->mv_limits, br, bc, 1 << s)) {
+ for (i = 0; i < PATTERN_CANDIDATES_REF; i++) {
+ const MV this_mv = {
+ br + candidates[s][next_chkpts_indices[i]].row,
+ bc + candidates[s][next_chkpts_indices[i]].col
+ };
+ cost_list[next_chkpts_indices[i] + 1] = thissad =
+ vfp->sdf(what->buf, what->stride,
+ get_buf_from_mv(in_what, &this_mv), in_what->stride);
+ CHECK_BETTER
+ }
+ } else {
+ for (i = 0; i < PATTERN_CANDIDATES_REF; i++) {
+ const MV this_mv = {
+ br + candidates[s][next_chkpts_indices[i]].row,
+ bc + candidates[s][next_chkpts_indices[i]].col
+ };
+ if (!is_mv_in(&x->mv_limits, &this_mv)) {
+ cost_list[next_chkpts_indices[i] + 1] = INT_MAX;
+ continue;
+ }
+ cost_list[next_chkpts_indices[i] + 1] = thissad =
+ vfp->sdf(what->buf, what->stride,
+ get_buf_from_mv(in_what, &this_mv), in_what->stride);
+ CHECK_BETTER
+ }
+ }
+
+ if (best_site != -1) {
+ k = next_chkpts_indices[best_site];
+ br += candidates[s][k].row;
+ bc += candidates[s][k].col;
+ }
+ }
+ }
+ }
+
+ // Returns the one-away integer pel cost/sad around the best as follows:
+ // cost_list[0]: cost/sad at the best integer pel
+ // cost_list[1]: cost/sad at delta {0, -1} (left) from the best integer pel
+ // cost_list[2]: cost/sad at delta { 1, 0} (bottom) from the best integer pel
+ // cost_list[3]: cost/sad at delta { 0, 1} (right) from the best integer pel
+ // cost_list[4]: cost/sad at delta {-1, 0} (top) from the best integer pel
+ if (cost_list) {
+ const MV best_int_mv = { br, bc };
+ if (last_is_4) {
+ calc_int_sad_list(x, center_mv, sad_per_bit, vfp, &best_int_mv, cost_list,
+ use_mvcost, bestsad);
+ } else {
+ calc_int_cost_list(x, center_mv, sad_per_bit, vfp, &best_int_mv,
+ cost_list);
+ }
+ }
+ x->best_mv.as_mv.row = br;
+ x->best_mv.as_mv.col = bc;
+ return bestsad;
+ }
+
+ // Returns the variance between the source block and the reference block
+ // addressed by the integer-pel vector best_mv. When use_mvcost is non-zero,
+ // the cost of coding best_mv (scaled by 8) relative to center_mv is added.
+ int av1_get_mvpred_var(const MACROBLOCK *x, const MV *best_mv,
+                        const MV *center_mv, const aom_variance_fn_ptr_t *vfp,
+                        int use_mvcost) {
+   const struct buf_2d *const src = &x->plane[0].src;
+   const struct buf_2d *const ref = &x->e_mbd.plane[0].pre[0];
+   const MV scaled_mv = { best_mv->row * 8, best_mv->col * 8 };
+   unsigned int sse_unused;
+   int rate_cost = 0;
+
+   if (use_mvcost) {
+     rate_cost = mv_err_cost(&scaled_mv, center_mv, x->nmvjointcost, x->mvcost,
+                             x->errorperbit);
+   }
+   return vfp->vf(src->buf, src->stride, get_buf_from_mv(ref, best_mv),
+                  ref->stride, &sse_unused) +
+          rate_cost;
+ }
+
+ // Compound-prediction counterpart of av1_get_mvpred_var(): the reference
+ // block addressed by best_mv is passed together with second_pred to
+ // vfp->svaf with zero sub-pel offsets. When use_mvcost is non-zero, the cost
+ // of coding best_mv (scaled by 8) relative to center_mv is added.
+ int av1_get_mvpred_av_var(const MACROBLOCK *x, const MV *best_mv,
+                           const MV *center_mv, const uint8_t *second_pred,
+                           const aom_variance_fn_ptr_t *vfp, int use_mvcost) {
+   const struct buf_2d *const src = &x->plane[0].src;
+   const struct buf_2d *const ref = &x->e_mbd.plane[0].pre[0];
+   const MV scaled_mv = { best_mv->row * 8, best_mv->col * 8 };
+   unsigned int sse_unused;
+   int rate_cost = 0;
+
+   if (use_mvcost) {
+     rate_cost = mv_err_cost(&scaled_mv, center_mv, x->nmvjointcost, x->mvcost,
+                             x->errorperbit);
+   }
+   return vfp->svaf(get_buf_from_mv(ref, best_mv), ref->stride, 0, 0, src->buf,
+                    src->stride, &sse_unused, second_pred) +
+          rate_cost;
+ }
+
+ // Hexagon-pattern integer motion search. Supplies the hexagon candidate
+ // tables below to pattern_search() and forwards every other argument
+ // unchanged; the return value is whatever pattern_search() returns.
+ int av1_hex_search(MACROBLOCK *x, MV *start_mv, int search_param,
+ int sad_per_bit, int do_init_search, int *cost_list,
+ const aom_variance_fn_ptr_t *vfp, int use_mvcost,
+ const MV *center_mv) {
+ // First scale has 8-closest points, the rest have 6 points in hex shape
+ // at increasing scales
+ static const int hex_num_candidates[MAX_PATTERN_SCALES] = { 8, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6 };
+ // Note that the largest candidate step at each scale is 2^scale
+ /* clang-format off */
+ static const MV hex_candidates[MAX_PATTERN_SCALES][MAX_PATTERN_CANDIDATES] = {
+ { { -1, -1 }, { 0, -1 }, { 1, -1 }, { 1, 0 }, { 1, 1 }, { 0, 1 }, { -1, 1 },
+ { -1, 0 } },
+ { { -1, -2 }, { 1, -2 }, { 2, 0 }, { 1, 2 }, { -1, 2 }, { -2, 0 } },
+ { { -2, -4 }, { 2, -4 }, { 4, 0 }, { 2, 4 }, { -2, 4 }, { -4, 0 } },
+ { { -4, -8 }, { 4, -8 }, { 8, 0 }, { 4, 8 }, { -4, 8 }, { -8, 0 } },
+ { { -8, -16 }, { 8, -16 }, { 16, 0 }, { 8, 16 }, { -8, 16 }, { -16, 0 } },
+ { { -16, -32 }, { 16, -32 }, { 32, 0 }, { 16, 32 }, { -16, 32 },
+ { -32, 0 } },
+ { { -32, -64 }, { 32, -64 }, { 64, 0 }, { 32, 64 }, { -32, 64 },
+ { -64, 0 } },
+ { { -64, -128 }, { 64, -128 }, { 128, 0 }, { 64, 128 }, { -64, 128 },
+ { -128, 0 } },
+ { { -128, -256 }, { 128, -256 }, { 256, 0 }, { 128, 256 }, { -128, 256 },
+ { -256, 0 } },
+ { { -256, -512 }, { 256, -512 }, { 512, 0 }, { 256, 512 }, { -256, 512 },
+ { -512, 0 } },
+ { { -512, -1024 }, { 512, -1024 }, { 1024, 0 }, { 512, 1024 },
+ { -512, 1024 }, { -1024, 0 } },
+ };
+ /* clang-format on */
+ return pattern_search(x, start_mv, search_param, sad_per_bit, do_init_search,
+ cost_list, vfp, use_mvcost, center_mv,
+ hex_num_candidates, hex_candidates);
+ }
+
+ /* Big-diamond pattern search. Defines the per-scale candidate counts and
+  * candidate offsets for the diamond pattern and hands them, together with
+  * all of its own arguments unchanged, to pattern_search(). Returns whatever
+  * pattern_search() returns.
+  */ static int bigdia_search(MACROBLOCK *x, MV *start_mv, int search_param,
+ int sad_per_bit, int do_init_search, int *cost_list,
+ const aom_variance_fn_ptr_t *vfp, int use_mvcost,
+ const MV *center_mv) {
+ // First scale has 4-closest points, the rest have 8 points in diamond
+ // shape at increasing scales
+ static const int bigdia_num_candidates[MAX_PATTERN_SCALES] = {
+ 4, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ };
+ // Note that the largest candidate step at each scale is 2^scale
+ /* clang-format off */
+ static const MV
+ bigdia_candidates[MAX_PATTERN_SCALES][MAX_PATTERN_CANDIDATES] = {
+ { { 0, -1 }, { 1, 0 }, { 0, 1 }, { -1, 0 } },
+ { { -1, -1 }, { 0, -2 }, { 1, -1 }, { 2, 0 }, { 1, 1 }, { 0, 2 },
+ { -1, 1 }, { -2, 0 } },
+ { { -2, -2 }, { 0, -4 }, { 2, -2 }, { 4, 0 }, { 2, 2 }, { 0, 4 },
+ { -2, 2 }, { -4, 0 } },
+ { { -4, -4 }, { 0, -8 }, { 4, -4 }, { 8, 0 }, { 4, 4 }, { 0, 8 },
+ { -4, 4 }, { -8, 0 } },
+ { { -8, -8 }, { 0, -16 }, { 8, -8 }, { 16, 0 }, { 8, 8 }, { 0, 16 },
+ { -8, 8 }, { -16, 0 } },
+ { { -16, -16 }, { 0, -32 }, { 16, -16 }, { 32, 0 }, { 16, 16 },
+ { 0, 32 }, { -16, 16 }, { -32, 0 } },
+ { { -32, -32 }, { 0, -64 }, { 32, -32 }, { 64, 0 }, { 32, 32 },
+ { 0, 64 }, { -32, 32 }, { -64, 0 } },
+ { { -64, -64 }, { 0, -128 }, { 64, -64 }, { 128, 0 }, { 64, 64 },
+ { 0, 128 }, { -64, 64 }, { -128, 0 } },
+ { { -128, -128 }, { 0, -256 }, { 128, -128 }, { 256, 0 }, { 128, 128 },
+ { 0, 256 }, { -128, 128 }, { -256, 0 } },
+ { { -256, -256 }, { 0, -512 }, { 256, -256 }, { 512, 0 }, { 256, 256 },
+ { 0, 512 }, { -256, 256 }, { -512, 0 } },
+ { { -512, -512 }, { 0, -1024 }, { 512, -512 }, { 1024, 0 },
+ { 512, 512 }, { 0, 1024 }, { -512, 512 }, { -1024, 0 } },
+ };
+ /* clang-format on */
+ return pattern_search(x, start_mv, search_param, sad_per_bit, do_init_search,
+ cost_list, vfp, use_mvcost, center_mv,
+ bigdia_num_candidates, bigdia_candidates);
+ }
+
+ /* Square pattern search. Defines the per-scale candidate counts and
+  * candidate offsets for the square pattern and hands them, together with
+  * all of its own arguments unchanged, to pattern_search(). Returns whatever
+  * pattern_search() returns.
+  */ static int square_search(MACROBLOCK *x, MV *start_mv, int search_param,
+ int sad_per_bit, int do_init_search, int *cost_list,
+ const aom_variance_fn_ptr_t *vfp, int use_mvcost,
+ const MV *center_mv) {
+ // All scales have 8 closest points in square shape
+ static const int square_num_candidates[MAX_PATTERN_SCALES] = {
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ };
+ // Note that the largest candidate step at each scale is 2^scale
+ /* clang-format off */
+ static const MV
+ square_candidates[MAX_PATTERN_SCALES][MAX_PATTERN_CANDIDATES] = {
+ { { -1, -1 }, { 0, -1 }, { 1, -1 }, { 1, 0 }, { 1, 1 }, { 0, 1 },
+ { -1, 1 }, { -1, 0 } },
+ { { -2, -2 }, { 0, -2 }, { 2, -2 }, { 2, 0 }, { 2, 2 }, { 0, 2 },
+ { -2, 2 }, { -2, 0 } },
+ { { -4, -4 }, { 0, -4 }, { 4, -4 }, { 4, 0 }, { 4, 4 }, { 0, 4 },
+ { -4, 4 }, { -4, 0 } },
+ { { -8, -8 }, { 0, -8 }, { 8, -8 }, { 8, 0 }, { 8, 8 }, { 0, 8 },
+ { -8, 8 }, { -8, 0 } },
+ { { -16, -16 }, { 0, -16 }, { 16, -16 }, { 16, 0 }, { 16, 16 },
+ { 0, 16 }, { -16, 16 }, { -16, 0 } },
+ { { -32, -32 }, { 0, -32 }, { 32, -32 }, { 32, 0 }, { 32, 32 },
+ { 0, 32 }, { -32, 32 }, { -32, 0 } },
+ { { -64, -64 }, { 0, -64 }, { 64, -64 }, { 64, 0 }, { 64, 64 },
+ { 0, 64 }, { -64, 64 }, { -64, 0 } },
+ { { -128, -128 }, { 0, -128 }, { 128, -128 }, { 128, 0 }, { 128, 128 },
+ { 0, 128 }, { -128, 128 }, { -128, 0 } },
+ { { -256, -256 }, { 0, -256 }, { 256, -256 }, { 256, 0 }, { 256, 256 },
+ { 0, 256 }, { -256, 256 }, { -256, 0 } },
+ { { -512, -512 }, { 0, -512 }, { 512, -512 }, { 512, 0 }, { 512, 512 },
+ { 0, 512 }, { -512, 512 }, { -512, 0 } },
+ { { -1024, -1024 }, { 0, -1024 }, { 1024, -1024 }, { 1024, 0 },
+ { 1024, 1024 }, { 0, 1024 }, { -1024, 1024 }, { -1024, 0 } },
+ };
+ /* clang-format on */
+ return pattern_search(x, start_mv, search_param, sad_per_bit, do_init_search,
+ cost_list, vfp, use_mvcost, center_mv,
+ square_num_candidates, square_candidates);
+ }
+
+ /* Thin wrapper around av1_hex_search(): forwards every argument unchanged
+  * except search_param, which is raised to at least MAX_MVSEARCH_STEPS - 2.
+  * Returns av1_hex_search()'s result.
+  */ static int fast_hex_search(MACROBLOCK *x, MV *ref_mv, int search_param,
+ int sad_per_bit,
+ int do_init_search, // must be zero for fast_hex
+ int *cost_list, const aom_variance_fn_ptr_t *vfp,
+ int use_mvcost, const MV *center_mv) {
+ return av1_hex_search(x, ref_mv, AOMMAX(MAX_MVSEARCH_STEPS - 2, search_param),
+ sad_per_bit, do_init_search, cost_list, vfp, use_mvcost,
+ center_mv);
+ }
+
+ /* Thin wrapper around bigdia_search(): forwards every argument unchanged
+  * except search_param, which is raised to at least MAX_MVSEARCH_STEPS - 2.
+  * Returns bigdia_search()'s result.
+  */ static int fast_dia_search(MACROBLOCK *x, MV *ref_mv, int search_param,
+ int sad_per_bit, int do_init_search, int *cost_list,
+ const aom_variance_fn_ptr_t *vfp, int use_mvcost,
+ const MV *center_mv) {
+ return bigdia_search(x, ref_mv, AOMMAX(MAX_MVSEARCH_STEPS - 2, search_param),
+ sad_per_bit, do_init_search, cost_list, vfp, use_mvcost,
+ center_mv);
+ }
+
+#undef CHECK_BETTER
+
+ // Exhaustive motion search around a given centre position with a given
+ // step size.
+ //
+ // center_mv is clamped to x->mv_limits and becomes the initial best
+ // candidate. Row/column offsets in [-range, range], clipped so that every
+ // candidate stays within x->mv_limits, are then visited `step` apart. A
+ // candidate's score is fn_ptr's SAD plus mvsad_err_cost() relative to ref_mv.
+ // With step == 1 the columns are scored four at a time through sdx4df(); the
+ // columns left over at the end of a row are scored one by one.
+ //
+ // Outputs: *best_mv receives the best candidate and x->second_best_mv the
+ // candidate it displaced. Returns the best score.
+ static int exhuastive_mesh_search(MACROBLOCK *x, MV *ref_mv, MV *best_mv,
+ int range, int step, int sad_per_bit,
+ const aom_variance_fn_ptr_t *fn_ptr,
+ const MV *center_mv) {
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const struct buf_2d *const what = &x->plane[0].src;
+ const struct buf_2d *const in_what = &xd->plane[0].pre[0];
+ MV fcenter_mv = { center_mv->row, center_mv->col };
+ unsigned int best_sad = INT_MAX;
+ int r, c, i;
+ int start_col, end_col, start_row, end_row;
+ // With step == 1 each inner iteration consumes four columns at once.
+ int col_step = (step > 1) ? step : 4;
+
+ assert(step >= 1);
+
+ clamp_mv(&fcenter_mv, x->mv_limits.col_min, x->mv_limits.col_max,
+ x->mv_limits.row_min, x->mv_limits.row_max);
+ *best_mv = fcenter_mv;
+ best_sad =
+ fn_ptr->sdf(what->buf, what->stride,
+ get_buf_from_mv(in_what, &fcenter_mv), in_what->stride) +
+ mvsad_err_cost(x, &fcenter_mv, ref_mv, sad_per_bit);
+ // Offsets relative to the clamped centre; all four bounds are inclusive.
+ start_row = AOMMAX(-range, x->mv_limits.row_min - fcenter_mv.row);
+ start_col = AOMMAX(-range, x->mv_limits.col_min - fcenter_mv.col);
+ end_row = AOMMIN(range, x->mv_limits.row_max - fcenter_mv.row);
+ end_col = AOMMIN(range, x->mv_limits.col_max - fcenter_mv.col);
+
+ for (r = start_row; r <= end_row; r += step) {
+ for (c = start_col; c <= end_col; c += col_step) {
+ // Step > 1 means we are not checking every location in this pass.
+ if (step > 1) {
+ const MV mv = { fcenter_mv.row + r, fcenter_mv.col + c };
+ unsigned int sad =
+ fn_ptr->sdf(what->buf, what->stride, get_buf_from_mv(in_what, &mv),
+ in_what->stride);
+ // Only pay for the MV cost when the raw SAD alone could still win.
+ if (sad < best_sad) {
+ sad += mvsad_err_cost(x, &mv, ref_mv, sad_per_bit);
+ if (sad < best_sad) {
+ best_sad = sad;
+ x->second_best_mv.as_mv = *best_mv;
+ *best_mv = mv;
+ }
+ }
+ } else {
+ // 4 sads in a single call if we are checking every location
+ if (c + 3 <= end_col) {
+ unsigned int sads[4];
+ const uint8_t *addrs[4];
+ for (i = 0; i < 4; ++i) {
+ const MV mv = { fcenter_mv.row + r, fcenter_mv.col + c + i };
+ addrs[i] = get_buf_from_mv(in_what, &mv);
+ }
+ fn_ptr->sdx4df(what->buf, what->stride, addrs, in_what->stride, sads);
+
+ for (i = 0; i < 4; ++i) {
+ if (sads[i] < best_sad) {
+ const MV mv = { fcenter_mv.row + r, fcenter_mv.col + c + i };
+ const unsigned int sad =
+ sads[i] + mvsad_err_cost(x, &mv, ref_mv, sad_per_bit);
+ if (sad < best_sad) {
+ best_sad = sad;
+ x->second_best_mv.as_mv = *best_mv;
+ *best_mv = mv;
+ }
+ }
+ }
+ } else {
+ // Fewer than four columns remain in this row. end_col is an
+ // inclusive bound (see the loop conditions above), so the tail must
+ // cover columns c..end_col, i.e. end_col - c + 1 candidates.
+ for (i = 0; i <= end_col - c; ++i) {
+ const MV mv = { fcenter_mv.row + r, fcenter_mv.col + c + i };
+ unsigned int sad =
+ fn_ptr->sdf(what->buf, what->stride,
+ get_buf_from_mv(in_what, &mv), in_what->stride);
+ if (sad < best_sad) {
+ sad += mvsad_err_cost(x, &mv, ref_mv, sad_per_bit);
+ if (sad < best_sad) {
+ best_sad = sad;
+ x->second_best_mv.as_mv = *best_mv;
+ *best_mv = mv;
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+
+ return best_sad;
+ }
+
+ /* C implementation of the diamond search.
+  *
+  * ref_mv is clamped in place to x->mv_limits and used as the starting
+  * point. Each step scores cfg->searches_per_step sites from cfg->ss around
+  * the current best position (SAD from fn_ptr plus mvsad_err_cost() relative
+  * to center_mv >> 3) and moves to the best site if it improves the score.
+  *
+  * Outputs: *best_mv is the final position; *num00 counts the steps in which
+  * no site improved the score while the best position was still the starting
+  * address; x->second_best_mv is set to the previous best whenever the best
+  * position moves. Returns the best score.
+  */ int av1_diamond_search_sad_c(MACROBLOCK *x, const search_site_config *cfg,
+ MV *ref_mv, MV *best_mv, int search_param,
+ int sad_per_bit, int *num00,
+ const aom_variance_fn_ptr_t *fn_ptr,
+ const MV *center_mv) {
+ int i, j, step;
+
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ uint8_t *what = x->plane[0].src.buf;
+ const int what_stride = x->plane[0].src.stride;
+ const uint8_t *in_what;
+ const int in_what_stride = xd->plane[0].pre[0].stride;
+ const uint8_t *best_address;
+
+ unsigned int bestsad = INT_MAX;
+ int best_site = 0;
+ int last_site = 0;
+
+ int ref_row;
+ int ref_col;
+
+ // search_param determines the length of the initial step and hence the number
+ // of iterations.
+ // 0 = initial step (MAX_FIRST_STEP) pel
+ // 1 = (MAX_FIRST_STEP/2) pel,
+ // 2 = (MAX_FIRST_STEP/4) pel...
+ const search_site *ss = &cfg->ss[search_param * cfg->searches_per_step];
+ const int tot_steps = (cfg->ss_count / cfg->searches_per_step) - search_param;
+
+ const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 };  // presumably 1/8-pel -> full-pel; confirm
+ clamp_mv(ref_mv, x->mv_limits.col_min, x->mv_limits.col_max,
+ x->mv_limits.row_min, x->mv_limits.row_max);
+ ref_row = ref_mv->row;
+ ref_col = ref_mv->col;
+ *num00 = 0;
+ best_mv->row = ref_row;
+ best_mv->col = ref_col;
+
+ // Work out the start point for the search
+ in_what = xd->plane[0].pre[0].buf + ref_row * in_what_stride + ref_col;
+ best_address = in_what;
+
+ // Check the starting position
+ bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride) +
+ mvsad_err_cost(x, best_mv, &fcenter_mv, sad_per_bit);
+
+ i = 1;  // site index runs on across steps; NOTE(review): ss[0] is presumably the centre site
+
+ for (step = 0; step < tot_steps; step++) {
+ int all_in = 1, t;
+
+ // All_in is true if every one of the points we are checking are within
+ // the bounds of the image.
+ all_in &= ((best_mv->row + ss[i].mv.row) > x->mv_limits.row_min);  // assumes sites i..i+3 are this step's row/col extremes - TODO confirm
+ all_in &= ((best_mv->row + ss[i + 1].mv.row) < x->mv_limits.row_max);
+ all_in &= ((best_mv->col + ss[i + 2].mv.col) > x->mv_limits.col_min);
+ all_in &= ((best_mv->col + ss[i + 3].mv.col) < x->mv_limits.col_max);
+
+ // If all the pixels are within the bounds we don't check whether the
+ // search point is valid in this loop, otherwise we check each point
+ // for validity..
+ if (all_in) {
+ unsigned int sad_array[4];
+
+ for (j = 0; j < cfg->searches_per_step; j += 4) {
+ unsigned char const *block_offset[4];
+
+ for (t = 0; t < 4; t++)
+ block_offset[t] = ss[i + t].offset + best_address;
+
+ fn_ptr->sdx4df(what, what_stride, block_offset, in_what_stride,
+ sad_array);
+
+ for (t = 0; t < 4; t++, i++) {
+ if (sad_array[t] < bestsad) {  // add the MV cost only if the raw SAD could still win
+ const MV this_mv = { best_mv->row + ss[i].mv.row,
+ best_mv->col + ss[i].mv.col };
+ sad_array[t] +=
+ mvsad_err_cost(x, &this_mv, &fcenter_mv, sad_per_bit);
+ if (sad_array[t] < bestsad) {
+ bestsad = sad_array[t];
+ best_site = i;
+ }
+ }
+ }
+ }
+ } else {
+ for (j = 0; j < cfg->searches_per_step; j++) {
+ // Trap illegal vectors
+ const MV this_mv = { best_mv->row + ss[i].mv.row,
+ best_mv->col + ss[i].mv.col };
+
+ if (is_mv_in(&x->mv_limits, &this_mv)) {
+ const uint8_t *const check_here = ss[i].offset + best_address;
+ unsigned int thissad =
+ fn_ptr->sdf(what, what_stride, check_here, in_what_stride);
+
+ if (thissad < bestsad) {
+ thissad += mvsad_err_cost(x, &this_mv, &fcenter_mv, sad_per_bit);
+ if (thissad < bestsad) {
+ bestsad = thissad;
+ best_site = i;
+ }
+ }
+ }
+ i++;
+ }
+ }
+ if (best_site != last_site) {  // some site of this step improved the score
+ x->second_best_mv.as_mv = *best_mv;
+ best_mv->row += ss[best_site].mv.row;
+ best_mv->col += ss[best_site].mv.col;
+ best_address += ss[best_site].offset;
+ last_site = best_site;
+ #if defined(NEW_DIAMOND_SEARCH)
+ while (1) {  // keep stepping in the winning direction while the score improves
+ const MV this_mv = { best_mv->row + ss[best_site].mv.row,
+ best_mv->col + ss[best_site].mv.col };
+ if (is_mv_in(&x->mv_limits, &this_mv)) {
+ const uint8_t *const check_here = ss[best_site].offset + best_address;
+ unsigned int thissad =
+ fn_ptr->sdf(what, what_stride, check_here, in_what_stride);
+ if (thissad < bestsad) {
+ thissad += mvsad_err_cost(x, &this_mv, &fcenter_mv, sad_per_bit);
+ if (thissad < bestsad) {
+ bestsad = thissad;
+ best_mv->row += ss[best_site].mv.row;
+ best_mv->col += ss[best_site].mv.col;
+ best_address += ss[best_site].offset;
+ continue;
+ }
+ }
+ }
+ break;
+ }
+ #endif
+ } else if (best_address == in_what) {  // no move this step and still at the start address
+ (*num00)++;
+ }
+ }
+ return bestsad;
+ }
+
+ /* Coarse-to-fine 1-D search for the offset into ref at which
+  * aom_vector_var(&ref[offset], src, bwl) is smallest. Offsets are limited to
+  * [0, bw] with bw = 4 << bwl. The first pass steps by 16; four refinement
+  * passes then try the running best +/-8, +/-4, +/-2 and +/-1.
+  * Returns the best offset minus bw / 2.
+  */ static int vector_match(int16_t *ref, int16_t *src, int bwl) {
+ int best_sad = INT_MAX;
+ int this_sad;
+ int d;
+ int center, offset = 0;
+ int bw = 4 << bwl; // redundant variable, to be changed in the experiments.
+ for (d = 0; d <= bw; d += 16) {  // coarse pass
+ this_sad = aom_vector_var(&ref[d], src, bwl);
+ if (this_sad < best_sad) {
+ best_sad = this_sad;
+ offset = d;
+ }
+ }
+ center = offset;
+
+ for (d = -8; d <= 8; d += 16) {  // tries exactly d = -8 and d = +8
+ int this_pos = offset + d;
+ // check limit
+ if (this_pos < 0 || this_pos > bw) continue;
+ this_sad = aom_vector_var(&ref[this_pos], src, bwl);
+ if (this_sad < best_sad) {
+ best_sad = this_sad;
+ center = this_pos;
+ }
+ }
+ offset = center;
+
+ for (d = -4; d <= 4; d += 8) {  // tries exactly d = -4 and d = +4
+ int this_pos = offset + d;
+ // check limit
+ if (this_pos < 0 || this_pos > bw) continue;
+ this_sad = aom_vector_var(&ref[this_pos], src, bwl);
+ if (this_sad < best_sad) {
+ best_sad = this_sad;
+ center = this_pos;
+ }
+ }
+ offset = center;
+
+ for (d = -2; d <= 2; d += 4) {  // tries exactly d = -2 and d = +2
+ int this_pos = offset + d;
+ // check limit
+ if (this_pos < 0 || this_pos > bw) continue;
+ this_sad = aom_vector_var(&ref[this_pos], src, bwl);
+ if (this_sad < best_sad) {
+ best_sad = this_sad;
+ center = this_pos;
+ }
+ }
+ offset = center;
+
+ for (d = -1; d <= 1; d += 2) {  // tries exactly d = -1 and d = +1
+ int this_pos = offset + d;
+ // check limit
+ if (this_pos < 0 || this_pos > bw) continue;
+ this_sad = aom_vector_var(&ref[this_pos], src, bwl);
+ if (this_sad < best_sad) {
+ best_sad = this_sad;
+ center = this_pos;
+ }
+ }
+
+ return (center - (bw >> 1));  // re-centre: offset bw/2 maps to 0
+ }
+
+ /* Row/col offsets of the four neighbours evaluated by
+  * av1_int_pro_motion_estimation(), in the same order as the addresses it
+  * passes to sdx4df(): -stride, -1, +1, +stride.
+  */ static const MV search_pos[4] = {
+ { -1, 0 }, { 0, -1 }, { 0, 1 }, { 1, 0 },
+ };
+
+ /* Projection-based motion estimate for the block at (mi_row, mi_col).
+  *
+  * Builds 1-D row and column projections of the reference area and of the
+  * source block (aom_int_pro_row / aom_int_pro_col), matches them with
+  * vector_match() to get an initial column and row displacement, then checks
+  * the four neighbours in search_pos[] and one diagonal position with the
+  * block SAD function. The chosen vector is written, multiplied by 8, to
+  * xd->mi[0]->mbmi.mv[0].as_mv and its SAD is returned.
+  *
+  * If av1_get_scaled_ref_frame() returns a frame, the plane "pre" buffers are
+  * swapped to it for the duration of the call and restored before returning.
+  * When CONFIG_HIGHBITDEPTH is non-zero the function instead returns the SAD
+  * at the zero vector right away, so the code after that block is not reached.
+  */ unsigned int av1_int_pro_motion_estimation(const AV1_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, int mi_row,
+ int mi_col) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+ struct buf_2d backup_yv12[MAX_MB_PLANE] = { { 0, 0, 0, 0, 0 } };
+ DECLARE_ALIGNED(16, int16_t, hbuf[2 * MAX_SB_SIZE]);
+ DECLARE_ALIGNED(16, int16_t, vbuf[2 * MAX_SB_SIZE]);
+ DECLARE_ALIGNED(16, int16_t, src_hbuf[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(16, int16_t, src_vbuf[MAX_SB_SQUARE]);
+ int idx;
+ const int bw = 4 << b_width_log2_lookup[bsize];
+ const int bh = 4 << b_height_log2_lookup[bsize];
+ const int search_width = bw << 1;  // the reference projections span twice the block size
+ const int search_height = bh << 1;
+ const int src_stride = x->plane[0].src.stride;
+ const int ref_stride = xd->plane[0].pre[0].stride;
+ uint8_t const *ref_buf, *src_buf;
+ MV *tmp_mv = &xd->mi[0]->mbmi.mv[0].as_mv;  // result is written straight into the mode info
+ unsigned int best_sad, tmp_sad, sad_arr[4];
+ MV this_mv;
+ const int norm_factor = 3 + (bw >> 5);
+ const YV12_BUFFER_CONFIG *scaled_ref_frame =
+ av1_get_scaled_ref_frame(cpi, mbmi->ref_frame[0]);
+
+ if (scaled_ref_frame) {
+ int i;
+ // Swap out the reference frame for a version that's been scaled to
+ // match the resolution of the current frame, allowing the existing
+ // motion search code to be used without additional modifications.
+ for (i = 0; i < MAX_MB_PLANE; i++) backup_yv12[i] = xd->plane[i].pre[0];
+ av1_setup_pre_planes(xd, 0, scaled_ref_frame, mi_row, mi_col, NULL);
+ }
+
+ #if CONFIG_HIGHBITDEPTH
+ {  // early exit: zero vector and its SAD; everything below is skipped
+ unsigned int this_sad;
+ tmp_mv->row = 0;
+ tmp_mv->col = 0;
+ this_sad = cpi->fn_ptr[bsize].sdf(x->plane[0].src.buf, src_stride,
+ xd->plane[0].pre[0].buf, ref_stride);
+
+ if (scaled_ref_frame) {
+ int i;
+ for (i = 0; i < MAX_MB_PLANE; i++) xd->plane[i].pre[0] = backup_yv12[i];
+ }
+ return this_sad;
+ }
+ #endif
+
+ // Set up prediction 1-D reference set
+ ref_buf = xd->plane[0].pre[0].buf - (bw >> 1);  // start half a block to the left
+ for (idx = 0; idx < search_width; idx += 16) {
+ aom_int_pro_row(&hbuf[idx], ref_buf, ref_stride, bh);
+ ref_buf += 16;
+ }
+
+ ref_buf = xd->plane[0].pre[0].buf - (bh >> 1) * ref_stride;  // start half a block above
+ for (idx = 0; idx < search_height; ++idx) {
+ vbuf[idx] = aom_int_pro_col(ref_buf, bw) >> norm_factor;
+ ref_buf += ref_stride;
+ }
+
+ // Set up src 1-D reference set
+ for (idx = 0; idx < bw; idx += 16) {
+ src_buf = x->plane[0].src.buf + idx;
+ aom_int_pro_row(&src_hbuf[idx], src_buf, src_stride, bh);
+ }
+
+ src_buf = x->plane[0].src.buf;
+ for (idx = 0; idx < bh; ++idx) {
+ src_vbuf[idx] = aom_int_pro_col(src_buf, bw) >> norm_factor;
+ src_buf += src_stride;
+ }
+
+ // Find the best match per 1-D search
+ tmp_mv->col = vector_match(hbuf, src_hbuf, b_width_log2_lookup[bsize]);
+ tmp_mv->row = vector_match(vbuf, src_vbuf, b_height_log2_lookup[bsize]);
+
+ this_mv = *tmp_mv;
+ src_buf = x->plane[0].src.buf;
+ ref_buf = xd->plane[0].pre[0].buf + this_mv.row * ref_stride + this_mv.col;
+ best_sad = cpi->fn_ptr[bsize].sdf(src_buf, src_stride, ref_buf, ref_stride);
+
+ {
+ const uint8_t *const pos[4] = {
+ ref_buf - ref_stride, ref_buf - 1, ref_buf + 1, ref_buf + ref_stride,
+ };
+
+ cpi->fn_ptr[bsize].sdx4df(src_buf, src_stride, pos, ref_stride, sad_arr);
+ }
+
+ for (idx = 0; idx < 4; ++idx) {  // pick the best of the four neighbours
+ if (sad_arr[idx] < best_sad) {
+ best_sad = sad_arr[idx];
+ tmp_mv->row = search_pos[idx].row + this_mv.row;
+ tmp_mv->col = search_pos[idx].col + this_mv.col;
+ }
+ }
+
+ if (sad_arr[0] < sad_arr[3])  // move one row towards the cheaper vertical neighbour
+ this_mv.row -= 1;
+ else
+ this_mv.row += 1;
+
+ if (sad_arr[1] < sad_arr[2])  // move one column towards the cheaper horizontal neighbour
+ this_mv.col -= 1;
+ else
+ this_mv.col += 1;
+
+ ref_buf = xd->plane[0].pre[0].buf + this_mv.row * ref_stride + this_mv.col;
+
+ tmp_sad = cpi->fn_ptr[bsize].sdf(src_buf, src_stride, ref_buf, ref_stride);  // SAD at the resulting diagonal position
+ if (best_sad > tmp_sad) {
+ *tmp_mv = this_mv;
+ best_sad = tmp_sad;
+ }
+
+ tmp_mv->row *= 8;  // presumably full-pel -> 1/8-pel units; confirm
+ tmp_mv->col *= 8;
+
+ if (scaled_ref_frame) {
+ int i;
+ for (i = 0; i < MAX_MB_PLANE; i++) xd->plane[i].pre[0] = backup_yv12[i];
+ }
+
+ return best_sad;
+ }
+
+ /* Runs cpi->diamond_search_sad() from mvp_full at step_param and then, up to
+  * further_steps times, at increasingly larger step parameters. Every result
+  * is re-scored with av1_get_mvpred_var() and the best one is kept in
+  * x->best_mv. When cost_list is non-NULL it is filled by
+  * calc_int_cost_list() for the final vector. Returns the best score.
+  *
+  * do_refine: If last step (1-away) of n-step search doesn't pick the center
+  * point as the best match, we will do a final 1-away diamond
+  * refining search
+  */
+ static int full_pixel_diamond(const AV1_COMP *const cpi, MACROBLOCK *x,
+ MV *mvp_full, int step_param, int sadpb,
+ int further_steps, int do_refine, int *cost_list,
+ const aom_variance_fn_ptr_t *fn_ptr,
+ const MV *ref_mv) {
+ MV temp_mv;
+ int thissme, n, num00 = 0;
+ int bestsme = cpi->diamond_search_sad(x, &cpi->ss_cfg, mvp_full, &temp_mv,
+ step_param, sadpb, &n, fn_ptr, ref_mv);  // n is written by this call (num00 slot of av1_diamond_search_sad_c, assuming the pointer targets it)
+ if (bestsme < INT_MAX)
+ bestsme = av1_get_mvpred_var(x, &temp_mv, ref_mv, fn_ptr, 1);
+ x->best_mv.as_mv = temp_mv;
+
+ // If there won't be more n-step search, check to see if refining search is
+ // needed.
+ if (n > further_steps) do_refine = 0;
+
+ while (n < further_steps) {
+ ++n;
+
+ if (num00) {  // skip as many searches as the previous one reported in num00
+ num00--;
+ } else {
+ thissme = cpi->diamond_search_sad(x, &cpi->ss_cfg, mvp_full, &temp_mv,
+ step_param + n, sadpb, &num00, fn_ptr,
+ ref_mv);
+ if (thissme < INT_MAX)
+ thissme = av1_get_mvpred_var(x, &temp_mv, ref_mv, fn_ptr, 1);
+
+ // check to see if refining search is needed.
+ if (num00 > further_steps - n) do_refine = 0;
+
+ if (thissme < bestsme) {
+ bestsme = thissme;
+ x->best_mv.as_mv = temp_mv;
+ }
+ }
+ }
+
+ // final 1-away diamond refining search
+ if (do_refine) {
+ const int search_range = 8;
+ MV best_mv = x->best_mv.as_mv;  // local copy; av1_refining_search_sad() updates it in place
+ thissme = av1_refining_search_sad(x, &best_mv, sadpb, search_range, fn_ptr,
+ ref_mv);
+ if (thissme < INT_MAX)
+ thissme = av1_get_mvpred_var(x, &best_mv, ref_mv, fn_ptr, 1);
+ if (thissme < bestsme) {
+ bestsme = thissme;
+ x->best_mv.as_mv = best_mv;
+ }
+ }
+
+ // Return cost list.
+ if (cost_list) {
+ calc_int_cost_list(x, ref_mv, sadpb, fn_ptr, &x->best_mv.as_mv, cost_list);
+ }
+ return bestsme;
+ }
+
+ #define MIN_RANGE 7  // smallest accepted first-pattern range
+ #define MAX_RANGE 256  // largest accepted range; also caps the enlarged range
+ #define MIN_INTERVAL 1  // smallest accepted first-pattern interval
+ // Runs a limited-range exhaustive mesh search using a pattern set
+ /* according to the encode speed profile.
+  *
+  * The first pattern (sf->mesh_patterns[0]) is validated against MIN_RANGE,
+  * MAX_RANGE and MIN_INTERVAL; INT_MAX is returned if it is out of bounds.
+  * Its range is enlarged to 5/4 of the larger component magnitude of
+  * centre_mv_full (capped at MAX_RANGE), keeping the range/interval ratio.
+  * Further patterns are run, each centred on the previous best, until one
+  * with interval 1 has been executed. The final vector is re-scored with
+  * av1_get_mvpred_var(), stored in *dst_mv, and the score is returned.
+  * Also increments *x->ex_search_count_ptr on every call.
+  */
+ static int full_pixel_exhaustive(const AV1_COMP *const cpi, MACROBLOCK *x,
+ const MV *centre_mv_full, int sadpb,
+ int *cost_list,
+ const aom_variance_fn_ptr_t *fn_ptr,
+ const MV *ref_mv, MV *dst_mv) {
+ const SPEED_FEATURES *const sf = &cpi->sf;
+ MV temp_mv = { centre_mv_full->row, centre_mv_full->col };
+ MV f_ref_mv = { ref_mv->row >> 3, ref_mv->col >> 3 };  // presumably 1/8-pel -> full-pel; confirm
+ int bestsme;
+ int i;
+ int interval = sf->mesh_patterns[0].interval;
+ int range = sf->mesh_patterns[0].range;
+ int baseline_interval_divisor;
+
+ // Keep track of number of exhaustive calls (this frame in this thread).
+ ++(*x->ex_search_count_ptr);
+
+ // Trap illegal values for interval and range for this function.
+ if ((range < MIN_RANGE) || (range > MAX_RANGE) || (interval < MIN_INTERVAL) ||
+ (interval > range))
+ return INT_MAX;
+
+ baseline_interval_divisor = range / interval;  // >= 1 because 1 <= interval <= range was checked above
+
+ // Check size of proposed first range against magnitude of the centre
+ // value used as a starting point.
+ range = AOMMAX(range, (5 * AOMMAX(abs(temp_mv.row), abs(temp_mv.col))) / 4);
+ range = AOMMIN(range, MAX_RANGE);
+ interval = AOMMAX(interval, range / baseline_interval_divisor);
+
+ // initial search
+ bestsme = exhuastive_mesh_search(x, &f_ref_mv, &temp_mv, range, interval,
+ sadpb, fn_ptr, &temp_mv);  // temp_mv is both the centre passed in and the result written back
+
+ if ((interval > MIN_INTERVAL) && (range > MIN_RANGE)) {
+ // Progressive searches with range and step size decreasing each time
+ // till we reach a step size of 1. Then break out.
+ for (i = 1; i < MAX_MESH_STEP; ++i) {
+ // First pass with coarser step and longer range
+ bestsme = exhuastive_mesh_search(
+ x, &f_ref_mv, &temp_mv, sf->mesh_patterns[i].range,
+ sf->mesh_patterns[i].interval, sadpb, fn_ptr, &temp_mv);
+
+ if (sf->mesh_patterns[i].interval == 1) break;
+ }
+ }
+
+ if (bestsme < INT_MAX)
+ bestsme = av1_get_mvpred_var(x, &temp_mv, ref_mv, fn_ptr, 1);
+ *dst_mv = temp_mv;
+
+ // Return cost list.
+ if (cost_list) {
+ calc_int_cost_list(x, ref_mv, sadpb, fn_ptr, dst_mv, cost_list);
+ }
+ return bestsme;
+ }
+
+ /* Brute-force search of a window of +/-distance around ref_mv, clipped to
+  * x->mv_limits. Each position is scored as fn_ptr->sdf() plus
+  * mvsad_err_cost() relative to center_mv >> 3. ref_mv itself is the initial
+  * best. Note that the loops stop before row_max and col_max (upper bounds
+  * are exclusive). Writes the winner to *best_mv and returns its score.
+  */ int av1_full_search_sad_c(const MACROBLOCK *x, const MV *ref_mv,
+ int sad_per_bit, int distance,
+ const aom_variance_fn_ptr_t *fn_ptr,
+ const MV *center_mv, MV *best_mv) {
+ int r, c;
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const struct buf_2d *const what = &x->plane[0].src;
+ const struct buf_2d *const in_what = &xd->plane[0].pre[0];
+ const int row_min = AOMMAX(ref_mv->row - distance, x->mv_limits.row_min);
+ const int row_max = AOMMIN(ref_mv->row + distance, x->mv_limits.row_max);
+ const int col_min = AOMMAX(ref_mv->col - distance, x->mv_limits.col_min);
+ const int col_max = AOMMIN(ref_mv->col + distance, x->mv_limits.col_max);
+ const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 };  // presumably 1/8-pel -> full-pel; confirm
+ int best_sad =
+ fn_ptr->sdf(what->buf, what->stride, get_buf_from_mv(in_what, ref_mv),
+ in_what->stride) +
+ mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit);
+ *best_mv = *ref_mv;
+
+ for (r = row_min; r < row_max; ++r) {
+ for (c = col_min; c < col_max; ++c) {
+ const MV mv = { r, c };
+ const int sad =
+ fn_ptr->sdf(what->buf, what->stride, get_buf_from_mv(in_what, &mv),
+ in_what->stride) +
+ mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit);
+ if (sad < best_sad) {
+ best_sad = sad;
+ *best_mv = mv;
+ }
+ }
+ }
+ return best_sad;
+ }
+
+ /* Brute-force search of a window of +/-distance around ref_mv (clipped to
+  * x->mv_limits, upper bounds exclusive) that scores three consecutive
+  * columns per fn_ptr->sdx3f() call when that pointer is non-NULL, and falls
+  * back to fn_ptr->sdf() for the remaining columns of each row. Scores are
+  * SAD plus mvsad_err_cost() relative to center_mv >> 3. Writes the winner to
+  * *best_mv and returns its score.
+  */ int av1_full_search_sadx3(const MACROBLOCK *x, const MV *ref_mv,
+ int sad_per_bit, int distance,
+ const aom_variance_fn_ptr_t *fn_ptr,
+ const MV *center_mv, MV *best_mv) {
+ int r;
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const struct buf_2d *const what = &x->plane[0].src;
+ const struct buf_2d *const in_what = &xd->plane[0].pre[0];
+ const int row_min = AOMMAX(ref_mv->row - distance, x->mv_limits.row_min);
+ const int row_max = AOMMIN(ref_mv->row + distance, x->mv_limits.row_max);
+ const int col_min = AOMMAX(ref_mv->col - distance, x->mv_limits.col_min);
+ const int col_max = AOMMIN(ref_mv->col + distance, x->mv_limits.col_max);
+ const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 };  // presumably 1/8-pel -> full-pel; confirm
+ unsigned int best_sad =
+ fn_ptr->sdf(what->buf, what->stride, get_buf_from_mv(in_what, ref_mv),
+ in_what->stride) +
+ mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit);
+ *best_mv = *ref_mv;
+
+ for (r = row_min; r < row_max; ++r) {
+ int c = col_min;
+ const uint8_t *check_here = &in_what->buf[r * in_what->stride + c];  // advanced in lock-step with c
+
+ if (fn_ptr->sdx3f != NULL) {
+ while ((c + 2) < col_max) {
+ int i;
+ DECLARE_ALIGNED(16, uint32_t, sads[3]);
+
+ fn_ptr->sdx3f(what->buf, what->stride, check_here, in_what->stride,
+ sads);
+
+ for (i = 0; i < 3; ++i) {  // sads[i] is paired with column c as c advances
+ unsigned int sad = sads[i];
+ if (sad < best_sad) {
+ const MV mv = { r, c };
+ sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit);
+ if (sad < best_sad) {
+ best_sad = sad;
+ *best_mv = mv;
+ }
+ }
+ ++check_here;
+ ++c;
+ }
+ }
+ }
+
+ while (c < col_max) {  // remaining columns, one at a time
+ unsigned int sad =
+ fn_ptr->sdf(what->buf, what->stride, check_here, in_what->stride);
+ if (sad < best_sad) {
+ const MV mv = { r, c };
+ sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit);
+ if (sad < best_sad) {
+ best_sad = sad;
+ *best_mv = mv;
+ }
+ }
+ ++check_here;
+ ++c;
+ }
+ }
+
+ return best_sad;
+ }
+
+ /* Brute-force search of a window of +/-distance around ref_mv (clipped to
+  * x->mv_limits, upper bounds exclusive). Per row it scores eight columns at
+  * a time with fn_ptr->sdx8f() while possible, then three at a time with
+  * fn_ptr->sdx3f(), then single columns with fn_ptr->sdf(); the batched
+  * stages are skipped when the corresponding pointer is NULL. Scores are SAD
+  * plus mvsad_err_cost() relative to center_mv >> 3. Writes the winner to
+  * *best_mv and returns its score.
+  */ int av1_full_search_sadx8(const MACROBLOCK *x, const MV *ref_mv,
+ int sad_per_bit, int distance,
+ const aom_variance_fn_ptr_t *fn_ptr,
+ const MV *center_mv, MV *best_mv) {
+ int r;
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const struct buf_2d *const what = &x->plane[0].src;
+ const struct buf_2d *const in_what = &xd->plane[0].pre[0];
+ const int row_min = AOMMAX(ref_mv->row - distance, x->mv_limits.row_min);
+ const int row_max = AOMMIN(ref_mv->row + distance, x->mv_limits.row_max);
+ const int col_min = AOMMAX(ref_mv->col - distance, x->mv_limits.col_min);
+ const int col_max = AOMMIN(ref_mv->col + distance, x->mv_limits.col_max);
+ const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 };  // presumably 1/8-pel -> full-pel; confirm
+ unsigned int best_sad =
+ fn_ptr->sdf(what->buf, what->stride, get_buf_from_mv(in_what, ref_mv),
+ in_what->stride) +
+ mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit);
+ *best_mv = *ref_mv;
+
+ for (r = row_min; r < row_max; ++r) {
+ int c = col_min;
+ const uint8_t *check_here = &in_what->buf[r * in_what->stride + c];  // advanced in lock-step with c
+
+ if (fn_ptr->sdx8f != NULL) {
+ while ((c + 7) < col_max) {
+ int i;
+ DECLARE_ALIGNED(16, uint32_t, sads[8]);
+
+ fn_ptr->sdx8f(what->buf, what->stride, check_here, in_what->stride,
+ sads);
+
+ for (i = 0; i < 8; ++i) {  // sads[i] is paired with column c as c advances
+ unsigned int sad = sads[i];
+ if (sad < best_sad) {
+ const MV mv = { r, c };
+ sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit);
+ if (sad < best_sad) {
+ best_sad = sad;
+ *best_mv = mv;
+ }
+ }
+ ++check_here;
+ ++c;
+ }
+ }
+ }
+
+ if (fn_ptr->sdx3f != NULL) {
+ while ((c + 2) < col_max) {
+ int i;
+ DECLARE_ALIGNED(16, uint32_t, sads[3]);
+
+ fn_ptr->sdx3f(what->buf, what->stride, check_here, in_what->stride,
+ sads);
+
+ for (i = 0; i < 3; ++i) {
+ unsigned int sad = sads[i];
+ if (sad < best_sad) {
+ const MV mv = { r, c };
+ sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit);
+ if (sad < best_sad) {
+ best_sad = sad;
+ *best_mv = mv;
+ }
+ }
+ ++check_here;
+ ++c;
+ }
+ }
+ }
+
+ while (c < col_max) {  // remaining columns, one at a time
+ unsigned int sad =
+ fn_ptr->sdf(what->buf, what->stride, check_here, in_what->stride);
+ if (sad < best_sad) {
+ const MV mv = { r, c };
+ sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit);
+ if (sad < best_sad) {
+ best_sad = sad;
+ *best_mv = mv;
+ }
+ }
+ ++check_here;
+ ++c;
+ }
+ }
+
+ return best_sad;
+ }
+
+ /* Iterative 4-neighbour refinement of ref_mv (updated in place). For at most
+  * search_range iterations the up/left/right/down neighbours are scored as
+  * SAD plus mvsad_err_cost() relative to center_mv >> 3, and ref_mv moves to
+  * the best neighbour if it beats the current score; the loop ends early
+  * when no neighbour improves. x->second_best_mv is set to the previous
+  * position on every move. Returns the best score.
+  */ int av1_refining_search_sad(MACROBLOCK *x, MV *ref_mv, int error_per_bit,
+ int search_range,
+ const aom_variance_fn_ptr_t *fn_ptr,
+ const MV *center_mv) {
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const MV neighbors[4] = { { -1, 0 }, { 0, -1 }, { 0, 1 }, { 1, 0 } };  // same order as positions[] below
+ const struct buf_2d *const what = &x->plane[0].src;
+ const struct buf_2d *const in_what = &xd->plane[0].pre[0];
+ const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 };  // presumably 1/8-pel -> full-pel; confirm
+ const uint8_t *best_address = get_buf_from_mv(in_what, ref_mv);
+ unsigned int best_sad =
+ fn_ptr->sdf(what->buf, what->stride, best_address, in_what->stride) +
+ mvsad_err_cost(x, ref_mv, &fcenter_mv, error_per_bit);
+ int i, j;
+
+ for (i = 0; i < search_range; i++) {
+ int best_site = -1;  // -1 means no neighbour improved in this iteration
+ const int all_in = ((ref_mv->row - 1) > x->mv_limits.row_min) &
+ ((ref_mv->row + 1) < x->mv_limits.row_max) &
+ ((ref_mv->col - 1) > x->mv_limits.col_min) &
+ ((ref_mv->col + 1) < x->mv_limits.col_max);
+
+ if (all_in) {  // all four neighbours are strictly inside the limits: batch them
+ unsigned int sads[4];
+ const uint8_t *const positions[4] = { best_address - in_what->stride,
+ best_address - 1, best_address + 1,
+ best_address + in_what->stride };
+
+ fn_ptr->sdx4df(what->buf, what->stride, positions, in_what->stride, sads);
+
+ for (j = 0; j < 4; ++j) {
+ if (sads[j] < best_sad) {
+ const MV mv = { ref_mv->row + neighbors[j].row,
+ ref_mv->col + neighbors[j].col };
+ sads[j] += mvsad_err_cost(x, &mv, &fcenter_mv, error_per_bit);
+ if (sads[j] < best_sad) {
+ best_sad = sads[j];
+ best_site = j;
+ }
+ }
+ }
+ } else {  // near a limit: validate and score each neighbour individually
+ for (j = 0; j < 4; ++j) {
+ const MV mv = { ref_mv->row + neighbors[j].row,
+ ref_mv->col + neighbors[j].col };
+
+ if (is_mv_in(&x->mv_limits, &mv)) {
+ unsigned int sad =
+ fn_ptr->sdf(what->buf, what->stride,
+ get_buf_from_mv(in_what, &mv), in_what->stride);
+ if (sad < best_sad) {
+ sad += mvsad_err_cost(x, &mv, &fcenter_mv, error_per_bit);
+ if (sad < best_sad) {
+ best_sad = sad;
+ best_site = j;
+ }
+ }
+ }
+ }
+ }
+
+ if (best_site == -1) {
+ break;
+ } else {
+ x->second_best_mv.as_mv = *ref_mv;
+ ref_mv->row += neighbors[best_site].row;
+ ref_mv->col += neighbors[best_site].col;
+ best_address = get_buf_from_mv(in_what, ref_mv);
+ }
+ }
+
+ return best_sad;
+ }
+
+ // This function is called when we do joint motion search in comp_inter_inter
+ /* mode.
+  *
+  * Iterative 8-neighbour refinement of x->best_mv (updated in place, after
+  * being clamped to x->mv_limits). For at most search_range iterations each
+  * in-range neighbour is scored as fn_ptr->sdaf() against second_pred plus
+  * mvsad_err_cost() relative to center_mv >> 3; the vector moves to the best
+  * improving neighbour and the loop ends early when none improves.
+  * Returns the best score.
+  */
+ int av1_refining_search_8p_c(MACROBLOCK *x, int error_per_bit, int search_range,
+ const aom_variance_fn_ptr_t *fn_ptr,
+ const MV *center_mv, const uint8_t *second_pred) {
+ const MV neighbors[8] = { { -1, 0 }, { 0, -1 }, { 0, 1 }, { 1, 0 },
+ { -1, -1 }, { 1, -1 }, { -1, 1 }, { 1, 1 } };
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const struct buf_2d *const what = &x->plane[0].src;
+ const struct buf_2d *const in_what = &xd->plane[0].pre[0];
+ const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 };  // presumably 1/8-pel -> full-pel; confirm
+ MV *best_mv = &x->best_mv.as_mv;  // the search starts from, and writes back to, x->best_mv
+ unsigned int best_sad = INT_MAX;
+ int i, j;
+
+ clamp_mv(best_mv, x->mv_limits.col_min, x->mv_limits.col_max,
+ x->mv_limits.row_min, x->mv_limits.row_max);
+ best_sad =
+ fn_ptr->sdaf(what->buf, what->stride, get_buf_from_mv(in_what, best_mv),
+ in_what->stride, second_pred) +
+ mvsad_err_cost(x, best_mv, &fcenter_mv, error_per_bit);
+
+ for (i = 0; i < search_range; ++i) {
+ int best_site = -1;  // -1 means no neighbour improved in this iteration
+
+ for (j = 0; j < 8; ++j) {
+ const MV mv = { best_mv->row + neighbors[j].row,
+ best_mv->col + neighbors[j].col };
+
+ if (is_mv_in(&x->mv_limits, &mv)) {
+ unsigned int sad =
+ fn_ptr->sdaf(what->buf, what->stride, get_buf_from_mv(in_what, &mv),
+ in_what->stride, second_pred);
+ if (sad < best_sad) {
+ sad += mvsad_err_cost(x, &mv, &fcenter_mv, error_per_bit);
+ if (sad < best_sad) {
+ best_sad = sad;
+ best_site = j;
+ }
+ }
+ }
+ }
+
+ if (best_site == -1) {
+ break;
+ } else {
+ best_mv->row += neighbors[best_site].row;
+ best_mv->col += neighbors[best_site].col;
+ }
+ }
+ return best_sad;
+ }
+
+ #define MIN_EX_SEARCH_LIMIT 128  // floor for the exhaustive-search budget computed below
+ /* Returns non-zero when a follow-on exhaustive search may run: the speed
+  * features allow it, sf->exhaustive_searches_thresh is below INT_MAX, the
+  * exhaustive-search counter has not exceeded its budget (the larger of
+  * MIN_EX_SEARCH_LIMIT and sf->max_exaustive_pct percent of the motion-search
+  * counter), and cpi->rc.is_src_frame_alt_ref is zero.
+  */ static int is_exhaustive_allowed(const AV1_COMP *const cpi, MACROBLOCK *x) {
+ const SPEED_FEATURES *const sf = &cpi->sf;
+ const int max_ex =
+ AOMMAX(MIN_EX_SEARCH_LIMIT,
+ (*x->m_search_count_ptr * sf->max_exaustive_pct) / 100);
+
+ return sf->allow_exhaustive_searches &&
+ (sf->exhaustive_searches_thresh < INT_MAX) &&
+ (*x->ex_search_count_ptr <= max_ex) && !cpi->rc.is_src_frame_alt_ref;
+ }
+
+ // Runs the full-pixel motion search selected by cpi->sf.mv.search_method.
+ //
+ //   mvp_full      - starting full-pel MV handed to the selected search.
+ //   step_param    - initial step parameter for the search pattern.
+ //   error_per_bit - weight applied to the MV rate.
+ //   cost_list     - optional 5-entry array; reset to INT_MAX here and passed
+ //                   on to the search (may be NULL).
+ //   ref_mv        - reference MV used for MV costing.
+ //   var_max, rd   - for every method except NSTEP, when rd is non-zero and the
+ //                   search result is below var_max, the result is replaced by
+ //                   av1_get_mvpred_var() evaluated at x->best_mv.
+ //
+ // For NSTEP, a follow-on exhaustive search is attempted when
+ // is_exhaustive_allowed() says so and the diamond result exceeds the
+ // block-size-scaled threshold; its MV is adopted only if it scores lower.
+ // Increments *x->m_search_count_ptr once per call. Returns the resulting
+ // error value; the selected MV is left in x->best_mv.
+ int av1_full_pixel_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
+                           MV *mvp_full, int step_param, int error_per_bit,
+                           int *cost_list, const MV *ref_mv, int var_max,
+                           int rd) {
+   const SPEED_FEATURES *const sf = &cpi->sf;
+   const SEARCH_METHODS method = sf->mv.search_method;
+   const aom_variance_fn_ptr_t *fn_ptr = &cpi->fn_ptr[bsize];
+   int var = 0;
+
+   if (cost_list) {
+     cost_list[0] = INT_MAX;
+     cost_list[1] = INT_MAX;
+     cost_list[2] = INT_MAX;
+     cost_list[3] = INT_MAX;
+     cost_list[4] = INT_MAX;
+   }
+
+   // Keep track of number of searches (this frame in this thread).
+   ++(*x->m_search_count_ptr);
+
+   switch (method) {
+     case FAST_DIAMOND:
+       var = fast_dia_search(x, mvp_full, step_param, error_per_bit, 0,
+                             cost_list, fn_ptr, 1, ref_mv);
+       break;
+     case FAST_HEX:
+       var = fast_hex_search(x, mvp_full, step_param, error_per_bit, 0,
+                             cost_list, fn_ptr, 1, ref_mv);
+       break;
+     case HEX:
+       var = av1_hex_search(x, mvp_full, step_param, error_per_bit, 1, cost_list,
+                            fn_ptr, 1, ref_mv);
+       break;
+     case SQUARE:
+       var = square_search(x, mvp_full, step_param, error_per_bit, 1, cost_list,
+                           fn_ptr, 1, ref_mv);
+       break;
+     case BIGDIA:
+       var = bigdia_search(x, mvp_full, step_param, error_per_bit, 1, cost_list,
+                           fn_ptr, 1, ref_mv);
+       break;
+     case NSTEP:
+       var = full_pixel_diamond(cpi, x, mvp_full, step_param, error_per_bit,
+                                MAX_MVSEARCH_STEPS - 1 - step_param, 1,
+                                cost_list, fn_ptr, ref_mv);
+
+       // Should we allow a follow on exhaustive search?
+       if (is_exhaustive_allowed(cpi, x)) {
+         // Scale the speed-feature threshold down by block area.
+         int exhaustive_thr = sf->exhaustive_searches_thresh;
+         exhaustive_thr >>=
+             10 - (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]);
+
+         // Threshold variance for an exhaustive full search.
+         if (var > exhaustive_thr) {
+           int var_ex;
+           MV tmp_mv_ex;
+           var_ex =
+               full_pixel_exhaustive(cpi, x, &x->best_mv.as_mv, error_per_bit,
+                                     cost_list, fn_ptr, ref_mv, &tmp_mv_ex);
+
+           // Adopt the exhaustive result only when it is strictly better.
+           if (var_ex < var) {
+             var = var_ex;
+             x->best_mv.as_mv = tmp_mv_ex;
+           }
+         }
+       }
+       break;
+     default: assert(0 && "Invalid search method.");
+   }
+
+   if (method != NSTEP && rd && var < var_max)
+     var = av1_get_mvpred_var(x, &x->best_mv.as_mv, ref_mv, fn_ptr, 1);
+
+   return var;
+ }
+
+#if CONFIG_EXT_INTER
+ /* Masked sub-pixel variance (vfp->msvf) of the prediction at (r, c) against
+ * the source z. Writes the SSE into the local `sse`. Relies on the locals
+ * y, y_stride, z, src_stride, mask, mask_stride and vfp being in scope. */
+ #define DIST(r, c) \
+ vfp->msvf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z, src_stride, \
+ mask, mask_stride, &sse)
+
+ /* Rate cost of (r, c) relative to the reference (rr, rc): joint cost plus
+ * per-component costs, weighted by error_per_bit, rounded (+4096) and
+ * shifted right by 13. Evaluates to 0 when mvcost is NULL. */
+
+ #define MVC(r, c) \
+ (mvcost \
+ ? ((mvjcost[((r) != rr) * 2 + ((c) != rc)] + mvcost[0][((r)-rr)] + \
+ mvcost[1][((c)-rc)]) * \
+ error_per_bit + \
+ 4096) >> \
+ 13 \
+ : 0)
+
+ /* checks if (r, c) has better score than previous best; when (r, c) lies
+ * outside [minr, maxr] x [minc, maxc] it sets v to INT_MAX instead.
+ * NOTE: expands to a bare if/else statement and evaluates r and c more than
+ * once, so arguments must be side-effect free. */
+ #define CHECK_BETTER(v, r, c) \
+ if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \
+ thismse = (DIST(r, c)); \
+ if ((v = MVC(r, c) + thismse) < besterr) { \
+ besterr = v; \
+ br = r; \
+ bc = c; \
+ *distortion = thismse; \
+ *sse1 = sse; \
+ } \
+ } else { \
+ v = INT_MAX; \
+ }
+
+ /* Variant 0: identical to CHECK_BETTER (non-upsampled reference). */
+ #undef CHECK_BETTER0
+ #define CHECK_BETTER0(v, r, c) CHECK_BETTER(v, r, c)
+
+ /* Variant 1: same bookkeeping as CHECK_BETTER, but the error is measured with
+ * upsampled_masked_pref_error() on the address given by upre(). */
+ #undef CHECK_BETTER1
+ #define CHECK_BETTER1(v, r, c) \
+ if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \
+ thismse = upsampled_masked_pref_error(xd, mask, mask_stride, vfp, z, \
+ src_stride, upre(y, y_stride, r, c), \
+ y_stride, w, h, &sse); \
+ if ((v = MVC(r, c) + thismse) < besterr) { \
+ besterr = v; \
+ br = r; \
+ bc = c; \
+ *distortion = thismse; \
+ *sse1 = sse; \
+ } \
+ } else { \
+ v = INT_MAX; \
+ }
+
+ // Masked sub-pixel motion refinement around a full-pel MV.
+ //
+ // On entry *bestmv holds a full-pel MV; it is scaled by 8 and refined in
+ // half-, quarter- and (optionally) eighth-pel steps. On return *bestmv holds
+ // the refined MV in the scaled units, *distortion and *sse1 hold the error
+ // and SSE of the winning position, and the return value is that error plus
+ // the MV cost.
+ //
+ // forced_stop: 0 - full, 1 - qtr only, 2 - half only. The eighth-pel step
+ // additionally requires allow_hp. iters_per_step > 1 enables the
+ // SECOND_LEVEL_CHECKS pass at each precision. is_second selects which entry
+ // of xd->plane[0].pre[] is searched.
+ //
+ // NOTE(review): FIRST_LEVEL_CHECKS / SECOND_LEVEL_CHECKS are defined outside
+ // this view and operate on the locals declared here by name (br, bc, tr, tc,
+ // hstep, whichdir, besterr, ...); do not rename them.
+ int av1_find_best_masked_sub_pixel_tree(
+ const MACROBLOCK *x, const uint8_t *mask, int mask_stride, MV *bestmv,
+ const MV *ref_mv, int allow_hp, int error_per_bit,
+ const aom_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step,
+ int *mvjcost, int *mvcost[2], int *distortion, unsigned int *sse1,
+ int is_second) {
+ const uint8_t *const z = x->plane[0].src.buf;
+ const int src_stride = x->plane[0].src.stride;
+ const MACROBLOCKD *xd = &x->e_mbd;
+ unsigned int besterr = INT_MAX;
+ unsigned int sse;
+ int thismse;
+ unsigned int whichdir;
+ unsigned int halfiters = iters_per_step;
+ unsigned int quarteriters = iters_per_step;
+ unsigned int eighthiters = iters_per_step;
+
+ const int y_stride = xd->plane[0].pre[is_second].stride;
+ // Offset of the full-pel starting position inside the reference buffer.
+ const int offset = bestmv->row * y_stride + bestmv->col;
+ const uint8_t *const y = xd->plane[0].pre[is_second].buf;
+
+ int rr = ref_mv->row;
+ int rc = ref_mv->col;
+ // Best position so far, in the scaled (x8) units.
+ int br = bestmv->row * 8;
+ int bc = bestmv->col * 8;
+ // Step size in scaled units; 4 corresponds to the half-pel pass below.
+ int hstep = 4;
+ int tr = br;
+ int tc = bc;
+ int minc, maxc, minr, maxr;
+
+ av1_set_subpel_mv_search_range(&x->mv_limits, &minc, &maxc, &minr, &maxr,
+ ref_mv);
+
+ // central mv
+ bestmv->row *= 8;
+ bestmv->col *= 8;
+
+ // calculate central point error
+ besterr =
+ vfp->mvf(y + offset, y_stride, z, src_stride, mask, mask_stride, sse1);
+ *distortion = besterr;
+ besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
+
+ // 1/2 pel
+ FIRST_LEVEL_CHECKS;
+ if (halfiters > 1) {
+ SECOND_LEVEL_CHECKS;
+ }
+ tr = br;
+ tc = bc;
+
+ // Note forced_stop: 0 - full, 1 - qtr only, 2 - half only
+ if (forced_stop != 2) {
+ // 1/4 pel
+ hstep >>= 1;
+ FIRST_LEVEL_CHECKS;
+ if (quarteriters > 1) {
+ SECOND_LEVEL_CHECKS;
+ }
+ tr = br;
+ tc = bc;
+ }
+
+ if (allow_hp && forced_stop == 0) {
+ // 1/8 pel
+ hstep >>= 1;
+ FIRST_LEVEL_CHECKS;
+ if (eighthiters > 1) {
+ SECOND_LEVEL_CHECKS;
+ }
+ tr = br;
+ tc = bc;
+ }
+ // These lines ensure static analysis doesn't warn that
+ // tr and tc aren't used after the above point.
+ (void)tr;
+ (void)tc;
+
+ bestmv->row = br;
+ bestmv->col = bc;
+
+ return besterr;
+ }
+
+static unsigned int setup_masked_center_error(
+ const uint8_t *mask, int mask_stride, const MV *bestmv, const MV *ref_mv,
+ int error_per_bit, const aom_variance_fn_ptr_t *vfp,
+ const uint8_t *const src, const int src_stride, const uint8_t *const y,
+ int y_stride, int offset, int *mvjcost, int *mvcost[2], unsigned int *sse1,
+ int *distortion) {
+ unsigned int besterr;
+ besterr =
+ vfp->mvf(y + offset, y_stride, src, src_stride, mask, mask_stride, sse1);
+ *distortion = besterr;
+ besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
+ return besterr;
+}
+
+ // Builds a w x h prediction from the upsampled reference at y (via
+ // aom_upsampled_pred, or aom_highbd_upsampled_pred when the current buffer is
+ // flagged high bit depth) and returns its masked error (vfp->mvf) against
+ // src. The SSE is written to *sse.
+ static int upsampled_masked_pref_error(const MACROBLOCKD *xd,
+                                        const uint8_t *mask, int mask_stride,
+                                        const aom_variance_fn_ptr_t *vfp,
+                                        const uint8_t *const src,
+                                        const int src_stride,
+                                        const uint8_t *const y, int y_stride,
+                                        int w, int h, unsigned int *sse) {
+ #if CONFIG_HIGHBITDEPTH
+   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+     DECLARE_ALIGNED(16, uint16_t, pred16[MAX_SB_SQUARE]);
+     aom_highbd_upsampled_pred(pred16, w, h, y, y_stride);
+
+     return vfp->mvf(CONVERT_TO_BYTEPTR(pred16), w, src, src_stride, mask,
+                     mask_stride, sse);
+   }
+ #else
+   (void)xd;
+ #endif  // CONFIG_HIGHBITDEPTH
+   {
+     DECLARE_ALIGNED(16, uint8_t, pred[MAX_SB_SQUARE]);
+     aom_upsampled_pred(pred, w, h, y, y_stride);
+
+     return vfp->mvf(pred, w, src, src_stride, mask, mask_stride, sse);
+   }
+ }
+
+ // Upsampled-reference counterpart of setup_masked_center_error(): measures
+ // the centre position with upsampled_masked_pref_error(), stores the raw
+ // error in *distortion and the SSE in *sse1, and returns error plus MV cost.
+ static unsigned int upsampled_setup_masked_center_error(
+     const MACROBLOCKD *xd, const uint8_t *mask, int mask_stride,
+     const MV *bestmv, const MV *ref_mv, int error_per_bit,
+     const aom_variance_fn_ptr_t *vfp, const uint8_t *const src,
+     const int src_stride, const uint8_t *const y, int y_stride, int w, int h,
+     int offset, int *mvjcost, int *mvcost[2], unsigned int *sse1,
+     int *distortion) {
+   const unsigned int center_err =
+       upsampled_masked_pref_error(xd, mask, mask_stride, vfp, src, src_stride,
+                                   y + offset, y_stride, w, h, sse1);
+
+   *distortion = center_err;
+   return center_err +
+          mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
+ }
+
+ // Masked sub-pixel motion refinement with optional use of an upsampled
+ // reference frame.
+ //
+ // On entry *bestmv is a full-pel MV; it is scaled by 8 and refined for
+ // `3 - forced_stop` precision rounds (capped at 2 when allow_hp is 0). Each
+ // round tests the four axial neighbours given by search_step_table, then the
+ // one diagonal lying between the cheaper axial neighbour of each pair, and
+ // optionally (iters_per_step > 1) a SECOND_LEVEL_CHECKS_BEST pass. The step
+ // size halves after every round.
+ //
+ // When use_upsampled_ref is set, pd->pre[is_second] is temporarily pointed at
+ // the upsampled reference and restored before returning; positions are then
+ // addressed directly in the scaled units.
+ //
+ // Outputs: *bestmv (refined MV, scaled units), *distortion and *sse1 for the
+ // winning position. Returns the winning error plus MV cost.
+ //
+ // NOTE(review): SECOND_LEVEL_CHECKS_BEST is defined outside this view and
+ // refers to the locals below by name; keep the names unchanged.
+ int av1_find_best_masked_sub_pixel_tree_up(
+     const AV1_COMP *cpi, MACROBLOCK *x, const uint8_t *mask, int mask_stride,
+     int mi_row, int mi_col, MV *bestmv, const MV *ref_mv, int allow_hp,
+     int error_per_bit, const aom_variance_fn_ptr_t *vfp, int forced_stop,
+     int iters_per_step, int *mvjcost, int *mvcost[2], int *distortion,
+     unsigned int *sse1, int is_second, int use_upsampled_ref) {
+   const uint8_t *const z = x->plane[0].src.buf;
+   const uint8_t *const src_address = z;
+   const int src_stride = x->plane[0].src.stride;
+   MACROBLOCKD *xd = &x->e_mbd;
+   struct macroblockd_plane *const pd = &xd->plane[0];
+   MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+   unsigned int besterr = INT_MAX;
+   unsigned int sse;
+   unsigned int thismse;
+
+   int rr = ref_mv->row;
+   int rc = ref_mv->col;
+   int br = bestmv->row * 8;
+   int bc = bestmv->col * 8;
+   int hstep = 4;
+   int iter;
+   int round = 3 - forced_stop;
+   int tr = br;
+   int tc = bc;
+   const MV *search_step = search_step_table;
+   int idx, best_idx = -1;
+   // [0..3]: axial neighbours in search_step order, [4]: diagonal.
+   unsigned int cost_array[5];
+   int kr, kc;
+   const int w = block_size_wide[mbmi->sb_type];
+   const int h = block_size_high[mbmi->sb_type];
+   int offset;
+   int y_stride;
+   const uint8_t *y;
+
+   // Saved so the prediction plane can be restored after the upsampled search.
+   const struct buf_2d backup_pred = pd->pre[is_second];
+   int minc, maxc, minr, maxr;
+
+   av1_set_subpel_mv_search_range(&x->mv_limits, &minc, &maxc, &minr, &maxr,
+                                  ref_mv);
+
+   if (use_upsampled_ref) {
+     int ref = xd->mi[0]->mbmi.ref_frame[is_second];
+     const YV12_BUFFER_CONFIG *upsampled_ref = get_upsampled_ref(cpi, ref);
+     setup_pred_plane(&pd->pre[is_second], mbmi->sb_type,
+                      upsampled_ref->y_buffer, upsampled_ref->y_crop_width,
+                      upsampled_ref->y_crop_height, upsampled_ref->y_stride,
+                      (mi_row << 3), (mi_col << 3), NULL, pd->subsampling_x,
+                      pd->subsampling_y);
+   }
+   y = pd->pre[is_second].buf;
+   y_stride = pd->pre[is_second].stride;
+   offset = bestmv->row * y_stride + bestmv->col;
+
+   // Without high-precision MVs the last (finest) round is dropped.
+   if (!allow_hp)
+     if (round == 3) round = 2;
+
+   bestmv->row *= 8;
+   bestmv->col *= 8;
+
+   // use_upsampled_ref can be 0 or 1
+   if (use_upsampled_ref)
+     besterr = upsampled_setup_masked_center_error(
+         xd, mask, mask_stride, bestmv, ref_mv, error_per_bit, vfp, z,
+         src_stride, y, y_stride, w, h, (offset * 8), mvjcost, mvcost, sse1,
+         distortion);
+   else
+     besterr = setup_masked_center_error(
+         mask, mask_stride, bestmv, ref_mv, error_per_bit, vfp, z, src_stride, y,
+         y_stride, offset, mvjcost, mvcost, sse1, distortion);
+
+   for (iter = 0; iter < round; ++iter) {
+     // Check vertical and horizontal sub-pixel positions.
+     for (idx = 0; idx < 4; ++idx) {
+       tr = br + search_step[idx].row;
+       tc = bc + search_step[idx].col;
+       if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) {
+         MV this_mv = { tr, tc };
+
+         if (use_upsampled_ref) {
+           const uint8_t *const pre_address = y + tr * y_stride + tc;
+
+           thismse = upsampled_masked_pref_error(
+               xd, mask, mask_stride, vfp, src_address, src_stride, pre_address,
+               y_stride, w, h, &sse);
+         } else {
+           const uint8_t *const pre_address =
+               y + (tr >> 3) * y_stride + (tc >> 3);
+           thismse = vfp->msvf(pre_address, y_stride, sp(tc), sp(tr),
+                               src_address, src_stride, mask, mask_stride, &sse);
+         }
+
+         cost_array[idx] = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost,
+                                                 mvcost, error_per_bit);
+
+         if (cost_array[idx] < besterr) {
+           best_idx = idx;
+           besterr = cost_array[idx];
+           *distortion = thismse;
+           *sse1 = sse;
+         }
+       } else {
+         cost_array[idx] = INT_MAX;
+       }
+     }
+
+     // Check diagonal sub-pixel position
+     kc = (cost_array[0] <= cost_array[1] ? -hstep : hstep);
+     kr = (cost_array[2] <= cost_array[3] ? -hstep : hstep);
+
+     tc = bc + kc;
+     tr = br + kr;
+     if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) {
+       MV this_mv = { tr, tc };
+
+       if (use_upsampled_ref) {
+         const uint8_t *const pre_address = y + tr * y_stride + tc;
+
+         thismse = upsampled_masked_pref_error(
+             xd, mask, mask_stride, vfp, src_address, src_stride, pre_address,
+             y_stride, w, h, &sse);
+       } else {
+         const uint8_t *const pre_address = y + (tr >> 3) * y_stride + (tc >> 3);
+
+         thismse = vfp->msvf(pre_address, y_stride, sp(tc), sp(tr), src_address,
+                             src_stride, mask, mask_stride, &sse);
+       }
+
+       cost_array[4] = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost,
+                                             error_per_bit);
+
+       if (cost_array[4] < besterr) {
+         best_idx = 4;
+         besterr = cost_array[4];
+         *distortion = thismse;
+         *sse1 = sse;
+       }
+     } else {
+       // Diagonal slot; written explicitly instead of relying on idx == 4
+       // left over from the loop above.
+       cost_array[4] = INT_MAX;
+     }
+
+     if (best_idx < 4 && best_idx >= 0) {
+       br += search_step[best_idx].row;
+       bc += search_step[best_idx].col;
+     } else if (best_idx == 4) {
+       br = tr;
+       bc = tc;
+     }
+
+     if (iters_per_step > 1 && best_idx != -1) {
+       if (use_upsampled_ref) {
+         SECOND_LEVEL_CHECKS_BEST(1);
+       } else {
+         SECOND_LEVEL_CHECKS_BEST(0);
+       }
+     }
+
+     tr = br;
+     tc = bc;
+
+     // Move on to the next, finer set of four axial steps.
+     search_step += 4;
+     hstep >>= 1;
+     best_idx = -1;
+   }
+
+   // These lines ensure static analysis doesn't warn that
+   // tr and tc aren't used after the above point.
+   (void)tr;
+   (void)tc;
+
+   bestmv->row = br;
+   bestmv->col = bc;
+
+   if (use_upsampled_ref) {
+     pd->pre[is_second] = backup_pred;
+   }
+
+   return besterr;
+ }
+
+#undef DIST
+#undef MVC
+#undef CHECK_BETTER
+
+static int get_masked_mvpred_var(const MACROBLOCK *x, const uint8_t *mask,
+ int mask_stride, const MV *best_mv,
+ const MV *center_mv,
+ const aom_variance_fn_ptr_t *vfp,
+ int use_mvcost, int is_second) {
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const struct buf_2d *const what = &x->plane[0].src;
+ const struct buf_2d *const in_what = &xd->plane[0].pre[is_second];
+ const MV mv = { best_mv->row * 8, best_mv->col * 8 };
+ unsigned int unused;
+
+ return vfp->mvf(what->buf, what->stride, get_buf_from_mv(in_what, best_mv),
+ in_what->stride, mask, mask_stride, &unused) +
+ (use_mvcost ? mv_err_cost(&mv, center_mv, x->nmvjointcost, x->mvcost,
+ x->errorperbit)
+ : 0);
+}
+
+ // Masked 4-neighbour (up/left/right/down) full-pel refinement. Starting at
+ // *ref_mv, each round moves to the neighbour with the lowest masked SAD
+ // (fn_ptr->msdf) plus MV cost, for at most search_range rounds or until no
+ // neighbour improves. *ref_mv is updated in place; the best cost is returned.
+ int masked_refining_search_sad(const MACROBLOCK *x, const uint8_t *mask,
+                                int mask_stride, MV *ref_mv, int error_per_bit,
+                                int search_range,
+                                const aom_variance_fn_ptr_t *fn_ptr,
+                                const MV *center_mv, int is_second) {
+   const MV offsets[4] = { { -1, 0 }, { 0, -1 }, { 0, 1 }, { 1, 0 } };
+   const MACROBLOCKD *const xd = &x->e_mbd;
+   const struct buf_2d *const src = &x->plane[0].src;
+   const struct buf_2d *const ref = &xd->plane[0].pre[is_second];
+   const MV center_fullpel = { center_mv->row >> 3, center_mv->col >> 3 };
+   // Cost of the starting position.
+   unsigned int best_sad =
+       fn_ptr->msdf(src->buf, src->stride, get_buf_from_mv(ref, ref_mv),
+                    ref->stride, mask, mask_stride) +
+       mvsad_err_cost(x, ref_mv, &center_fullpel, error_per_bit);
+   int round, k;
+
+   for (round = 0; round < search_range; round++) {
+     int winner = -1;
+
+     for (k = 0; k < 4; k++) {
+       const MV cand = { ref_mv->row + offsets[k].row,
+                         ref_mv->col + offsets[k].col };
+       unsigned int cand_sad;
+
+       if (!is_mv_in(&x->mv_limits, &cand)) continue;
+
+       cand_sad =
+           fn_ptr->msdf(src->buf, src->stride, get_buf_from_mv(ref, &cand),
+                        ref->stride, mask, mask_stride);
+       // Add the MV cost only if the raw SAD is already competitive.
+       if (cand_sad >= best_sad) continue;
+
+       cand_sad += mvsad_err_cost(x, &cand, &center_fullpel, error_per_bit);
+       if (cand_sad >= best_sad) continue;
+
+       best_sad = cand_sad;
+       winner = k;
+     }
+
+     if (winner < 0) break;
+
+     ref_mv->row += offsets[winner].row;
+     ref_mv->col += offsets[winner].col;
+   }
+   return best_sad;
+ }
+
+ // Masked diamond-pattern full-pel search using the site table in cfg.
+ //
+ // *ref_mv is clamped to x->mv_limits and used as the start. For each step the
+ // cfg->searches_per_step sites of that step are scored with fn_ptr->msdf plus
+ // MV cost, and *best_mv moves to the best site if one improved. *num00 counts
+ // the steps in which nothing changed while the best position was still the
+ // start. Returns the best cost; the winning MV is left in *best_mv.
+ int masked_diamond_search_sad(const MACROBLOCK *x,
+                               const search_site_config *cfg,
+                               const uint8_t *mask, int mask_stride, MV *ref_mv,
+                               MV *best_mv, int search_param, int sad_per_bit,
+                               int *num00, const aom_variance_fn_ptr_t *fn_ptr,
+                               const MV *center_mv, int is_second) {
+   const MACROBLOCKD *const xd = &x->e_mbd;
+   const struct buf_2d *const src = &x->plane[0].src;
+   const struct buf_2d *const ref = &xd->plane[0].pre[is_second];
+   // search_param determines the length of the initial step and hence the number
+   // of iterations
+   // 0 = initial step (MAX_FIRST_STEP) pel : 1 = (MAX_FIRST_STEP/2) pel, 2 =
+   // (MAX_FIRST_STEP/4) pel... etc.
+   const search_site *const ss = &cfg->ss[search_param * cfg->searches_per_step];
+   const int tot_steps = (cfg->ss_count / cfg->searches_per_step) - search_param;
+   const MV center_fullpel = { center_mv->row >> 3, center_mv->col >> 3 };
+   const uint8_t *start_address;
+   const uint8_t *best_address;
+   int best_sad;
+   int best_site = 0;
+   int last_site = 0;
+   int site = 1;  // running index into ss[]; entry 0 is never scored here
+   int step, j;
+
+   clamp_mv(ref_mv, x->mv_limits.col_min, x->mv_limits.col_max,
+            x->mv_limits.row_min, x->mv_limits.row_max);
+   start_address = get_buf_from_mv(ref, ref_mv);
+   best_address = start_address;
+   *num00 = 0;
+   *best_mv = *ref_mv;
+
+   // Check the starting position
+   best_sad = fn_ptr->msdf(src->buf, src->stride, best_address, ref->stride,
+                           mask, mask_stride) +
+              mvsad_err_cost(x, best_mv, &center_fullpel, sad_per_bit);
+
+   for (step = 0; step < tot_steps; step++) {
+     for (j = 0; j < cfg->searches_per_step; j++, site++) {
+       const MV cand = { best_mv->row + ss[site].mv.row,
+                         best_mv->col + ss[site].mv.col };
+       int sad;
+
+       if (!is_mv_in(&x->mv_limits, &cand)) continue;
+
+       sad = fn_ptr->msdf(src->buf, src->stride, best_address + ss[site].offset,
+                          ref->stride, mask, mask_stride);
+       if (sad >= best_sad) continue;
+
+       sad += mvsad_err_cost(x, &cand, &center_fullpel, sad_per_bit);
+       if (sad >= best_sad) continue;
+
+       best_sad = sad;
+       best_site = site;
+     }
+
+     if (best_site != last_site) {
+       best_mv->row += ss[best_site].mv.row;
+       best_mv->col += ss[best_site].mv.col;
+       best_address += ss[best_site].offset;
+       last_site = best_site;
+ #if defined(NEW_DIAMOND_SEARCH)
+       // Keep stepping in the winning direction while it keeps improving.
+       for (;;) {
+         const MV next = { best_mv->row + ss[best_site].mv.row,
+                           best_mv->col + ss[best_site].mv.col };
+         int sad;
+
+         if (!is_mv_in(&x->mv_limits, &next)) break;
+
+         sad = fn_ptr->msdf(src->buf, src->stride,
+                            best_address + ss[best_site].offset, ref->stride,
+                            mask, mask_stride);
+         if (sad >= best_sad) break;
+
+         sad += mvsad_err_cost(x, &next, &center_fullpel, sad_per_bit);
+         if (sad >= best_sad) break;
+
+         best_sad = sad;
+         best_mv->row += ss[best_site].mv.row;
+         best_mv->col += ss[best_site].mv.col;
+         best_address += ss[best_site].offset;
+       }
+ #endif
+     } else if (best_address == start_address) {
+       (*num00)++;
+     }
+   }
+   return best_sad;
+ }
+
+ // Multi-stage masked full-pel search. Runs masked_diamond_search_sad() at
+ // step_param, then again at progressively larger step parameters up to
+ // further_steps times (skipping stages the previous search reported as
+ // redundant), and finally, if do_refine is still set, a 1-away refining
+ // search with range 8. Every candidate is re-scored with
+ // get_masked_mvpred_var(). The best MV is written to *dst_mv and its score
+ // returned.
+ int av1_masked_full_pixel_diamond(const AV1_COMP *cpi, MACROBLOCK *x,
+                                   const uint8_t *mask, int mask_stride,
+                                   MV *mvp_full, int step_param, int sadpb,
+                                   int further_steps, int do_refine,
+                                   const aom_variance_fn_ptr_t *fn_ptr,
+                                   const MV *ref_mv, MV *dst_mv, int is_second) {
+   MV cand_mv;
+   int cand_err;
+   int stage;          // set by the first search through its num00 output
+   int skip_count = 0;
+   int best_err = masked_diamond_search_sad(
+       x, &cpi->ss_cfg, mask, mask_stride, mvp_full, &cand_mv, step_param, sadpb,
+       &stage, fn_ptr, ref_mv, is_second);
+
+   if (best_err < INT_MAX)
+     best_err = get_masked_mvpred_var(x, mask, mask_stride, &cand_mv, ref_mv,
+                                      fn_ptr, 1, is_second);
+   *dst_mv = cand_mv;
+
+   // If there won't be more n-step search, check to see if refining search is
+   // needed.
+   if (stage > further_steps) do_refine = 0;
+
+   while (stage < further_steps) {
+     ++stage;
+
+     if (skip_count) {
+       skip_count--;
+       continue;
+     }
+
+     cand_err = masked_diamond_search_sad(
+         x, &cpi->ss_cfg, mask, mask_stride, mvp_full, &cand_mv,
+         step_param + stage, sadpb, &skip_count, fn_ptr, ref_mv, is_second);
+     if (cand_err < INT_MAX)
+       cand_err = get_masked_mvpred_var(x, mask, mask_stride, &cand_mv, ref_mv,
+                                        fn_ptr, 1, is_second);
+
+     // check to see if refining search is needed.
+     if (skip_count > further_steps - stage) do_refine = 0;
+
+     if (cand_err < best_err) {
+       best_err = cand_err;
+       *dst_mv = cand_mv;
+     }
+   }
+
+   // final 1-away diamond refining search
+   if (do_refine) {
+     const int search_range = 8;
+     MV refined_mv = *dst_mv;
+
+     cand_err =
+         masked_refining_search_sad(x, mask, mask_stride, &refined_mv, sadpb,
+                                    search_range, fn_ptr, ref_mv, is_second);
+     if (cand_err < INT_MAX)
+       cand_err = get_masked_mvpred_var(x, mask, mask_stride, &refined_mv,
+                                        ref_mv, fn_ptr, 1, is_second);
+     if (cand_err < best_err) {
+       best_err = cand_err;
+       *dst_mv = refined_mv;
+     }
+   }
+   return best_err;
+ }
+#endif // CONFIG_EXT_INTER
+
+#if CONFIG_MOTION_VAR
+ /* OBMC sub-pixel variance (vfp->osvf) of the prediction at (r, c) against
+ * the weighted source z with weights `mask`. Writes the SSE into the local
+ * `sse`. Relies on the locals y, y_stride, z, mask and vfp being in scope. */
+ #define DIST(r, c) \
+ vfp->osvf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z, mask, &sse)
+
+ /* Rate cost of (r, c) relative to the reference (rr, rc): joint cost plus
+ * per-component costs, weighted by error_per_bit, rounded (+4096) and
+ * shifted right by 13. Evaluates to 0 when mvcost is NULL. */
+ #define MVC(r, c) \
+ (mvcost \
+ ? ((mvjcost[((r) != rr) * 2 + ((c) != rc)] + mvcost[0][((r)-rr)] + \
+ mvcost[1][((c)-rc)]) * \
+ error_per_bit + \
+ 4096) >> \
+ 13 \
+ : 0)
+
+ /* checks if (r, c) has better score than previous best; when (r, c) lies
+ * outside [minr, maxr] x [minc, maxc] it sets v to INT_MAX instead.
+ * NOTE: expands to a bare if/else statement and evaluates r and c more than
+ * once, so arguments must be side-effect free. */
+ #define CHECK_BETTER(v, r, c) \
+ if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \
+ thismse = (DIST(r, c)); \
+ if ((v = MVC(r, c) + thismse) < besterr) { \
+ besterr = v; \
+ br = r; \
+ bc = c; \
+ *distortion = thismse; \
+ *sse1 = sse; \
+ } \
+ } else { \
+ v = INT_MAX; \
+ }
+
+ /* Variant 0: identical to CHECK_BETTER (non-upsampled reference). */
+ #undef CHECK_BETTER0
+ #define CHECK_BETTER0(v, r, c) CHECK_BETTER(v, r, c)
+
+ /* Variant 1: same bookkeeping as CHECK_BETTER, but the error is measured with
+ * upsampled_obmc_pref_error() on the address given by upre(). */
+ #undef CHECK_BETTER1
+ #define CHECK_BETTER1(v, r, c) \
+ if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \
+ thismse = upsampled_obmc_pref_error( \
+ xd, mask, vfp, z, upre(y, y_stride, r, c), y_stride, w, h, &sse); \
+ if ((v = MVC(r, c) + thismse) < besterr) { \
+ besterr = v; \
+ br = r; \
+ bc = c; \
+ *distortion = thismse; \
+ *sse1 = sse; \
+ } \
+ } else { \
+ v = INT_MAX; \
+ }
+
+static unsigned int setup_obmc_center_error(
+ const int32_t *mask, const MV *bestmv, const MV *ref_mv, int error_per_bit,
+ const aom_variance_fn_ptr_t *vfp, const int32_t *const wsrc,
+ const uint8_t *const y, int y_stride, int offset, int *mvjcost,
+ int *mvcost[2], unsigned int *sse1, int *distortion) {
+ unsigned int besterr;
+ besterr = vfp->ovf(y + offset, y_stride, wsrc, mask, sse1);
+ *distortion = besterr;
+ besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
+ return besterr;
+}
+
+ // Builds a w x h prediction from the upsampled reference at y (via
+ // aom_upsampled_pred, or aom_highbd_upsampled_pred when the current buffer is
+ // flagged high bit depth) and returns its OBMC error (vfp->ovf) against
+ // wsrc/mask. The SSE is written to *sse.
+ static int upsampled_obmc_pref_error(const MACROBLOCKD *xd, const int32_t *mask,
+                                      const aom_variance_fn_ptr_t *vfp,
+                                      const int32_t *const wsrc,
+                                      const uint8_t *const y, int y_stride,
+                                      int w, int h, unsigned int *sse) {
+ #if CONFIG_HIGHBITDEPTH
+   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+     DECLARE_ALIGNED(16, uint16_t, pred16[MAX_SB_SQUARE]);
+     aom_highbd_upsampled_pred(pred16, w, h, y, y_stride);
+
+     return vfp->ovf(CONVERT_TO_BYTEPTR(pred16), w, wsrc, mask, sse);
+   }
+ #else
+   (void)xd;
+ #endif  // CONFIG_HIGHBITDEPTH
+   {
+     DECLARE_ALIGNED(16, uint8_t, pred[MAX_SB_SQUARE]);
+     aom_upsampled_pred(pred, w, h, y, y_stride);
+
+     return vfp->ovf(pred, w, wsrc, mask, sse);
+   }
+ }
+
+ // Upsampled-reference counterpart of setup_obmc_center_error(): measures the
+ // centre position with upsampled_obmc_pref_error(), stores the raw error in
+ // *distortion and the SSE in *sse1, and returns error plus MV cost.
+ static unsigned int upsampled_setup_obmc_center_error(
+     const MACROBLOCKD *xd, const int32_t *mask, const MV *bestmv,
+     const MV *ref_mv, int error_per_bit, const aom_variance_fn_ptr_t *vfp,
+     const int32_t *const wsrc, const uint8_t *const y, int y_stride, int w,
+     int h, int offset, int *mvjcost, int *mvcost[2], unsigned int *sse1,
+     int *distortion) {
+   const unsigned int center_err = upsampled_obmc_pref_error(
+       xd, mask, vfp, wsrc, y + offset, y_stride, w, h, sse1);
+
+   *distortion = center_err;
+   return center_err +
+          mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
+ }
+
+ // OBMC sub-pixel motion refinement with optional use of an upsampled
+ // reference frame. The weighted source and weights are taken from
+ // x->wsrc_buf and x->mask_buf.
+ //
+ // On entry *bestmv is a full-pel MV; it is scaled by 8 and refined for
+ // `3 - forced_stop` precision rounds (capped at 2 when allow_hp is 0). Each
+ // round tests the four axial neighbours given by search_step_table, then the
+ // one diagonal lying between the cheaper axial neighbour of each pair, and
+ // optionally (iters_per_step > 1) a SECOND_LEVEL_CHECKS_BEST pass. The step
+ // size halves after every round.
+ //
+ // When use_upsampled_ref is set, pd->pre[is_second] is temporarily pointed at
+ // the upsampled reference and restored before returning; positions are then
+ // addressed directly in the scaled units.
+ //
+ // Outputs: *bestmv (refined MV, scaled units), *distortion and *sse1 for the
+ // winning position. Returns the winning error plus MV cost.
+ //
+ // NOTE(review): SECOND_LEVEL_CHECKS_BEST is defined outside this view and
+ // refers to the locals below by name; keep the names unchanged.
+ int av1_find_best_obmc_sub_pixel_tree_up(
+     const AV1_COMP *cpi, MACROBLOCK *x, int mi_row, int mi_col, MV *bestmv,
+     const MV *ref_mv, int allow_hp, int error_per_bit,
+     const aom_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step,
+     int *mvjcost, int *mvcost[2], int *distortion, unsigned int *sse1,
+     int is_second, int use_upsampled_ref) {
+   const int32_t *wsrc = x->wsrc_buf;
+   const int32_t *mask = x->mask_buf;
+   const int *const z = wsrc;
+   const int *const src_address = z;
+   MACROBLOCKD *xd = &x->e_mbd;
+   struct macroblockd_plane *const pd = &xd->plane[0];
+   MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+   unsigned int besterr = INT_MAX;
+   unsigned int sse;
+   unsigned int thismse;
+
+   int rr = ref_mv->row;
+   int rc = ref_mv->col;
+   int br = bestmv->row * 8;
+   int bc = bestmv->col * 8;
+   int hstep = 4;
+   int iter;
+   int round = 3 - forced_stop;
+   int tr = br;
+   int tc = bc;
+   const MV *search_step = search_step_table;
+   int idx, best_idx = -1;
+   // [0..3]: axial neighbours in search_step order, [4]: diagonal.
+   unsigned int cost_array[5];
+   int kr, kc;
+   const int w = block_size_wide[mbmi->sb_type];
+   const int h = block_size_high[mbmi->sb_type];
+   int offset;
+   int y_stride;
+   const uint8_t *y;
+
+   // Saved so the prediction plane can be restored after the upsampled search.
+   const struct buf_2d backup_pred = pd->pre[is_second];
+   int minc, maxc, minr, maxr;
+
+   av1_set_subpel_mv_search_range(&x->mv_limits, &minc, &maxc, &minr, &maxr,
+                                  ref_mv);
+
+   if (use_upsampled_ref) {
+     int ref = xd->mi[0]->mbmi.ref_frame[is_second];
+     const YV12_BUFFER_CONFIG *upsampled_ref = get_upsampled_ref(cpi, ref);
+     setup_pred_plane(&pd->pre[is_second], mbmi->sb_type,
+                      upsampled_ref->y_buffer, upsampled_ref->y_crop_width,
+                      upsampled_ref->y_crop_height, upsampled_ref->y_stride,
+                      (mi_row << 3), (mi_col << 3), NULL, pd->subsampling_x,
+                      pd->subsampling_y);
+   }
+   y = pd->pre[is_second].buf;
+   y_stride = pd->pre[is_second].stride;
+   offset = bestmv->row * y_stride + bestmv->col;
+
+   // Without high-precision MVs the last (finest) round is dropped.
+   if (!allow_hp)
+     if (round == 3) round = 2;
+
+   bestmv->row *= 8;
+   bestmv->col *= 8;
+   // use_upsampled_ref can be 0 or 1
+   if (use_upsampled_ref)
+     besterr = upsampled_setup_obmc_center_error(
+         xd, mask, bestmv, ref_mv, error_per_bit, vfp, z, y, y_stride, w, h,
+         (offset * 8), mvjcost, mvcost, sse1, distortion);
+   else
+     besterr = setup_obmc_center_error(mask, bestmv, ref_mv, error_per_bit, vfp,
+                                       z, y, y_stride, offset, mvjcost, mvcost,
+                                       sse1, distortion);
+
+   for (iter = 0; iter < round; ++iter) {
+     // Check vertical and horizontal sub-pixel positions.
+     for (idx = 0; idx < 4; ++idx) {
+       tr = br + search_step[idx].row;
+       tc = bc + search_step[idx].col;
+       if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) {
+         MV this_mv = { tr, tc };
+
+         if (use_upsampled_ref) {
+           const uint8_t *const pre_address = y + tr * y_stride + tc;
+
+           thismse = upsampled_obmc_pref_error(
+               xd, mask, vfp, src_address, pre_address, y_stride, w, h, &sse);
+         } else {
+           const uint8_t *const pre_address =
+               y + (tr >> 3) * y_stride + (tc >> 3);
+           thismse = vfp->osvf(pre_address, y_stride, sp(tc), sp(tr),
+                               src_address, mask, &sse);
+         }
+
+         cost_array[idx] = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost,
+                                                 mvcost, error_per_bit);
+         if (cost_array[idx] < besterr) {
+           best_idx = idx;
+           besterr = cost_array[idx];
+           *distortion = thismse;
+           *sse1 = sse;
+         }
+       } else {
+         cost_array[idx] = INT_MAX;
+       }
+     }
+
+     // Check diagonal sub-pixel position
+     kc = (cost_array[0] <= cost_array[1] ? -hstep : hstep);
+     kr = (cost_array[2] <= cost_array[3] ? -hstep : hstep);
+
+     tc = bc + kc;
+     tr = br + kr;
+     if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) {
+       MV this_mv = { tr, tc };
+
+       if (use_upsampled_ref) {
+         const uint8_t *const pre_address = y + tr * y_stride + tc;
+
+         thismse = upsampled_obmc_pref_error(xd, mask, vfp, src_address,
+                                             pre_address, y_stride, w, h, &sse);
+       } else {
+         const uint8_t *const pre_address = y + (tr >> 3) * y_stride + (tc >> 3);
+
+         thismse = vfp->osvf(pre_address, y_stride, sp(tc), sp(tr), src_address,
+                             mask, &sse);
+       }
+
+       cost_array[4] = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost,
+                                             error_per_bit);
+
+       if (cost_array[4] < besterr) {
+         best_idx = 4;
+         besterr = cost_array[4];
+         *distortion = thismse;
+         *sse1 = sse;
+       }
+     } else {
+       // Diagonal slot; written explicitly instead of relying on idx == 4
+       // left over from the loop above.
+       cost_array[4] = INT_MAX;
+     }
+
+     if (best_idx < 4 && best_idx >= 0) {
+       br += search_step[best_idx].row;
+       bc += search_step[best_idx].col;
+     } else if (best_idx == 4) {
+       br = tr;
+       bc = tc;
+     }
+
+     if (iters_per_step > 1 && best_idx != -1) {
+       if (use_upsampled_ref) {
+         SECOND_LEVEL_CHECKS_BEST(1);
+       } else {
+         SECOND_LEVEL_CHECKS_BEST(0);
+       }
+     }
+
+     tr = br;
+     tc = bc;
+
+     // Move on to the next, finer set of four axial steps.
+     search_step += 4;
+     hstep >>= 1;
+     best_idx = -1;
+   }
+
+   // These lines ensure static analysis doesn't warn that
+   // tr and tc aren't used after the above point.
+   (void)tr;
+   (void)tc;
+
+   bestmv->row = br;
+   bestmv->col = bc;
+
+   if (use_upsampled_ref) {
+     pd->pre[is_second] = backup_pred;
+   }
+
+   return besterr;
+ }
+
+#undef DIST
+#undef MVC
+#undef CHECK_BETTER
+
+static int get_obmc_mvpred_var(const MACROBLOCK *x, const int32_t *wsrc,
+ const int32_t *mask, const MV *best_mv,
+ const MV *center_mv,
+ const aom_variance_fn_ptr_t *vfp, int use_mvcost,
+ int is_second) {
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const struct buf_2d *const in_what = &xd->plane[0].pre[is_second];
+ const MV mv = { best_mv->row * 8, best_mv->col * 8 };
+ unsigned int unused;
+
+ return vfp->ovf(get_buf_from_mv(in_what, best_mv), in_what->stride, wsrc,
+ mask, &unused) +
+ (use_mvcost ? mv_err_cost(&mv, center_mv, x->nmvjointcost, x->mvcost,
+ x->errorperbit)
+ : 0);
+}
+
+ // OBMC 4-neighbour (up/left/right/down) full-pel refinement. Starting at
+ // *ref_mv, each round moves to the neighbour with the lowest OBMC SAD
+ // (fn_ptr->osdf) plus MV cost, for at most search_range rounds or until no
+ // neighbour improves. *ref_mv is updated in place; the best cost is returned.
+ int obmc_refining_search_sad(const MACROBLOCK *x, const int32_t *wsrc,
+                              const int32_t *mask, MV *ref_mv, int error_per_bit,
+                              int search_range,
+                              const aom_variance_fn_ptr_t *fn_ptr,
+                              const MV *center_mv, int is_second) {
+   const MV offsets[4] = { { -1, 0 }, { 0, -1 }, { 0, 1 }, { 1, 0 } };
+   const MACROBLOCKD *const xd = &x->e_mbd;
+   const struct buf_2d *const ref = &xd->plane[0].pre[is_second];
+   const MV center_fullpel = { center_mv->row >> 3, center_mv->col >> 3 };
+   // Cost of the starting position.
+   unsigned int best_sad =
+       fn_ptr->osdf(get_buf_from_mv(ref, ref_mv), ref->stride, wsrc, mask) +
+       mvsad_err_cost(x, ref_mv, &center_fullpel, error_per_bit);
+   int round, k;
+
+   for (round = 0; round < search_range; round++) {
+     int winner = -1;
+
+     for (k = 0; k < 4; k++) {
+       const MV cand = { ref_mv->row + offsets[k].row,
+                         ref_mv->col + offsets[k].col };
+       unsigned int cand_sad;
+
+       if (!is_mv_in(&x->mv_limits, &cand)) continue;
+
+       cand_sad =
+           fn_ptr->osdf(get_buf_from_mv(ref, &cand), ref->stride, wsrc, mask);
+       // Add the MV cost only if the raw SAD is already competitive.
+       if (cand_sad >= best_sad) continue;
+
+       cand_sad += mvsad_err_cost(x, &cand, &center_fullpel, error_per_bit);
+       if (cand_sad >= best_sad) continue;
+
+       best_sad = cand_sad;
+       winner = k;
+     }
+
+     if (winner < 0) break;
+
+     ref_mv->row += offsets[winner].row;
+     ref_mv->col += offsets[winner].col;
+   }
+   return best_sad;
+ }
+
+ // OBMC diamond-pattern full-pel search using the site table in cfg.
+ //
+ // *ref_mv is clamped to x->mv_limits and used as the start. For each step the
+ // cfg->searches_per_step sites of that step are scored with fn_ptr->osdf plus
+ // MV cost, and *best_mv moves to the best site if one improved. *num00 counts
+ // the steps in which nothing changed while the best position was still the
+ // start. Returns the best cost; the winning MV is left in *best_mv.
+ int obmc_diamond_search_sad(const MACROBLOCK *x, const search_site_config *cfg,
+                             const int32_t *wsrc, const int32_t *mask,
+                             MV *ref_mv, MV *best_mv, int search_param,
+                             int sad_per_bit, int *num00,
+                             const aom_variance_fn_ptr_t *fn_ptr,
+                             const MV *center_mv, int is_second) {
+   const MACROBLOCKD *const xd = &x->e_mbd;
+   const struct buf_2d *const ref = &xd->plane[0].pre[is_second];
+   // search_param determines the length of the initial step and hence the number
+   // of iterations
+   // 0 = initial step (MAX_FIRST_STEP) pel : 1 = (MAX_FIRST_STEP/2) pel, 2 =
+   // (MAX_FIRST_STEP/4) pel... etc.
+   const search_site *const ss = &cfg->ss[search_param * cfg->searches_per_step];
+   const int tot_steps = (cfg->ss_count / cfg->searches_per_step) - search_param;
+   const MV center_fullpel = { center_mv->row >> 3, center_mv->col >> 3 };
+   const uint8_t *start_address;
+   const uint8_t *best_address;
+   int best_sad;
+   int best_site = 0;
+   int last_site = 0;
+   int site = 1;  // running index into ss[]; entry 0 is never scored here
+   int step, j;
+
+   clamp_mv(ref_mv, x->mv_limits.col_min, x->mv_limits.col_max,
+            x->mv_limits.row_min, x->mv_limits.row_max);
+   start_address = ref->buf + ref_mv->row * ref->stride + ref_mv->col;
+   best_address = start_address;
+   *num00 = 0;
+   *best_mv = *ref_mv;
+
+   // Check the starting position
+   best_sad = fn_ptr->osdf(best_address, ref->stride, wsrc, mask) +
+              mvsad_err_cost(x, best_mv, &center_fullpel, sad_per_bit);
+
+   for (step = 0; step < tot_steps; step++) {
+     for (j = 0; j < cfg->searches_per_step; j++, site++) {
+       const MV cand = { best_mv->row + ss[site].mv.row,
+                         best_mv->col + ss[site].mv.col };
+       int sad;
+
+       if (!is_mv_in(&x->mv_limits, &cand)) continue;
+
+       sad = fn_ptr->osdf(best_address + ss[site].offset, ref->stride, wsrc,
+                          mask);
+       if (sad >= best_sad) continue;
+
+       sad += mvsad_err_cost(x, &cand, &center_fullpel, sad_per_bit);
+       if (sad >= best_sad) continue;
+
+       best_sad = sad;
+       best_site = site;
+     }
+
+     if (best_site != last_site) {
+       best_mv->row += ss[best_site].mv.row;
+       best_mv->col += ss[best_site].mv.col;
+       best_address += ss[best_site].offset;
+       last_site = best_site;
+ #if defined(NEW_DIAMOND_SEARCH)
+       // Keep stepping in the winning direction while it keeps improving.
+       for (;;) {
+         const MV next = { best_mv->row + ss[best_site].mv.row,
+                           best_mv->col + ss[best_site].mv.col };
+         int sad;
+
+         if (!is_mv_in(&x->mv_limits, &next)) break;
+
+         sad = fn_ptr->osdf(best_address + ss[best_site].offset, ref->stride,
+                            wsrc, mask);
+         if (sad >= best_sad) break;
+
+         sad += mvsad_err_cost(x, &next, &center_fullpel, sad_per_bit);
+         if (sad >= best_sad) break;
+
+         best_sad = sad;
+         best_mv->row += ss[best_site].mv.row;
+         best_mv->col += ss[best_site].mv.col;
+         best_address += ss[best_site].offset;
+       }
+ #endif
+     } else if (best_address == start_address) {
+       (*num00)++;
+     }
+   }
+   return best_sad;
+ }
+
+int av1_obmc_full_pixel_diamond(const AV1_COMP *cpi, MACROBLOCK *x,  // Full-pel OBMC search driver: runs
+ MV *mvp_full, int step_param, int sadpb,  // obmc_diamond_search_sad() from mvp_full at
+ int further_steps, int do_refine,  // step_param and then at larger step values,
+ const aom_variance_fn_ptr_t *fn_ptr,  // followed by an optional refining pass.
+ const MV *ref_mv, MV *dst_mv, int is_second) {  // Returns the best score; its MV is stored in *dst_mv.
+ const int32_t *wsrc = x->wsrc_buf;
+ const int32_t *mask = x->mask_buf;
+ MV temp_mv;
+ int thissme, n, num00 = 0;
+ int bestsme =
+ obmc_diamond_search_sad(x, &cpi->ss_cfg, wsrc, mask, mvp_full, &temp_mv,
+ step_param, sadpb, &n, fn_ptr, ref_mv, is_second);  // &n sits in the slot where the later call passes &num00, so n is seeded by this search
+ if (bestsme < INT_MAX)  // re-score the result with get_obmc_mvpred_var() unless the search returned INT_MAX
+ bestsme = get_obmc_mvpred_var(x, wsrc, mask, &temp_mv, ref_mv, fn_ptr, 1,
+ is_second);
+ *dst_mv = temp_mv;
+
+ // If there won't be any further n-step searches, the refining search is
+ // disabled here.
+ if (n > further_steps) do_refine = 0;
+
+ while (n < further_steps) {
+ ++n;
+
+ if (num00) {  // skip this step while the previous search's num00 count is non-zero
+ num00--;
+ } else {
+ thissme = obmc_diamond_search_sad(x, &cpi->ss_cfg, wsrc, mask, mvp_full,
+ &temp_mv, step_param + n, sadpb, &num00,
+ fn_ptr, ref_mv, is_second);
+ if (thissme < INT_MAX)
+ thissme = get_obmc_mvpred_var(x, wsrc, mask, &temp_mv, ref_mv, fn_ptr,
+ 1, is_second);
+
+ // Disable the refining search when num00 exceeds the steps still remaining.
+ if (num00 > further_steps - n) do_refine = 0;
+
+ if (thissme < bestsme) {  // keep the lowest score seen so far and its MV
+ bestsme = thissme;
+ *dst_mv = temp_mv;
+ }
+ }
+ }
+
+ // final 1-away diamond refining search
+ if (do_refine) {
+ const int search_range = 8;
+ MV best_mv = *dst_mv;  // refine a copy so *dst_mv changes only if the score improves
+ thissme = obmc_refining_search_sad(x, wsrc, mask, &best_mv, sadpb,
+ search_range, fn_ptr, ref_mv, is_second);
+ if (thissme < INT_MAX)
+ thissme = get_obmc_mvpred_var(x, wsrc, mask, &best_mv, ref_mv, fn_ptr, 1,
+ is_second);
+ if (thissme < bestsme) {
+ bestsme = thissme;
+ *dst_mv = best_mv;
+ }
+ }
+ return bestsme;
+}
+#endif // CONFIG_MOTION_VAR
+
+// Note(yunqingwang): The following 2 functions are used only in the motion
+// vector unit test; they return the extreme motion vectors allowed by the MV
+// limits.
+#define COMMON_MV_TEST /* expands SETUP_SUBPEL_SEARCH, then void-casts the names below (presumably to silence unused warnings -- confirm) */ \
+ SETUP_SUBPEL_SEARCH; \
+ \
+ (void)error_per_bit; \
+ (void)vfp; \
+ (void)src_address; \
+ (void)src_stride; \
+ (void)y; \
+ (void)y_stride; \
+ (void)second_pred; \
+ (void)w; \
+ (void)h; \
+ (void)use_upsampled_ref; \
+ (void)offset; \
+ (void)mvjcost; \
+ (void)mvcost; \
+ (void)sse1; \
+ (void)distortion; \
+ \
+ (void)halfiters; \
+ (void)quarteriters; \
+ (void)eighthiters; \
+ (void)whichdir; \
+ (void)forced_stop; \
+ (void)hstep; \
+ \
+ (void)tr; \
+ (void)tc; \
+ (void)sse; \
+ (void)thismse; \
+ (void)cost_list;
+// Return the maximum MV: sets bestmv to (maxr, maxc), lowers its precision if needed, and returns 0.
+int av1_return_max_sub_pixel_mv(MACROBLOCK *x, const MV *ref_mv, int allow_hp,
+ int error_per_bit,
+ const aom_variance_fn_ptr_t *vfp,
+ int forced_stop, int iters_per_step,
+ int *cost_list, int *mvjcost, int *mvcost[2],
+ int *distortion, unsigned int *sse1,
+ const uint8_t *second_pred, int w, int h,
+ int use_upsampled_ref) {
+ COMMON_MV_TEST;  // bestmv, besterr, minr/minc/maxr/maxc are not declared here; presumably they come from this macro -- confirm
+ (void)minr;
+ (void)minc;
+ bestmv->row = maxr;
+ bestmv->col = maxc;
+ besterr = 0;  // no error is measured; the function always returns 0
+ // In the sub-pel motion search, if hp is not used, then the last bit of mv
+ // has to be 0.
+ lower_mv_precision(bestmv, allow_hp);
+ return besterr;
+}
+// Return the minimum MV: sets bestmv to (minr, minc), lowers its precision if needed, and returns 0.
+int av1_return_min_sub_pixel_mv(MACROBLOCK *x, const MV *ref_mv, int allow_hp,
+ int error_per_bit,
+ const aom_variance_fn_ptr_t *vfp,
+ int forced_stop, int iters_per_step,
+ int *cost_list, int *mvjcost, int *mvcost[2],
+ int *distortion, unsigned int *sse1,
+ const uint8_t *second_pred, int w, int h,
+ int use_upsampled_ref) {
+ COMMON_MV_TEST;  // bestmv, besterr, minr/minc/maxr/maxc are not declared here; presumably they come from this macro -- confirm
+ (void)maxr;
+ (void)maxc;
+ bestmv->row = minr;
+ bestmv->col = minc;
+ besterr = 0;  // no error is measured; the function always returns 0
+ // In the sub-pel motion search, if hp is not used, then the last bit of mv
+ // has to be 0.
+ lower_mv_precision(bestmv, allow_hp);
+ return besterr;
+}
diff --git a/third_party/aom/av1/encoder/mcomp.h b/third_party/aom/av1/encoder/mcomp.h
new file mode 100644
index 0000000000..8465860adf
--- /dev/null
+++ b/third_party/aom/av1/encoder/mcomp.h
@@ -0,0 +1,163 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_ENCODER_MCOMP_H_
+#define AV1_ENCODER_MCOMP_H_
+
+#include "av1/encoder/block.h"
+#include "aom_dsp/variance.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// The maximum number of steps in a step search given the largest
+// allowed initial step
+#define MAX_MVSEARCH_STEPS 11
+// Max full pel mv specified in the unit of full pixel
+// Enable the use of motion vector in range [-1023, 1023].
+#define MAX_FULL_PEL_VAL ((1 << (MAX_MVSEARCH_STEPS - 1)) - 1)
+// Maximum size of the first step in full pel units
+#define MAX_FIRST_STEP (1 << (MAX_MVSEARCH_STEPS - 1))
+// Allowed motion vector pixel distance outside image border
+// for Block_16x16
+#define BORDER_MV_PIXELS_B16 (16 + AOM_INTERP_EXTEND)
+
+// motion search site: one candidate position used by the step searches
+typedef struct search_site {
+ MV mv;  // displacement added to the current best MV when this site is tried
+ int offset;  // value added to the reference-buffer address for the same site
+} search_site;
+
+typedef struct search_site_config {
+ search_site ss[8 * MAX_MVSEARCH_STEPS + 1];  // site table; a search starts at index search_param * searches_per_step
+ int ss_count;  // ss_count / searches_per_step is used as the number of available steps
+ int searches_per_step;  // number of candidate sites examined in each step
+} search_site_config;
+
+void av1_init_dsmotion_compensation(search_site_config *cfg, int stride);
+void av1_init3smotion_compensation(search_site_config *cfg, int stride);
+
+void av1_set_mv_search_range(MvLimits *mv_limits, const MV *mv);
+
+int av1_mv_bit_cost(const MV *mv, const MV *ref, const int *mvjcost,
+ int *mvcost[2], int weight);
+
+// Utility to compute variance + MV rate cost for a given MV
+int av1_get_mvpred_var(const MACROBLOCK *x, const MV *best_mv,
+ const MV *center_mv, const aom_variance_fn_ptr_t *vfp,
+ int use_mvcost);
+int av1_get_mvpred_av_var(const MACROBLOCK *x, const MV *best_mv,
+ const MV *center_mv, const uint8_t *second_pred,
+ const aom_variance_fn_ptr_t *vfp, int use_mvcost);
+
+struct AV1_COMP;
+struct SPEED_FEATURES;
+
+int av1_init_search_range(int size);
+
+int av1_refining_search_sad(struct macroblock *x, struct mv *ref_mv,
+ int sad_per_bit, int distance,
+ const aom_variance_fn_ptr_t *fn_ptr,
+ const struct mv *center_mv);
+
+// Runs sequence of diamond searches in smaller steps for RD.
+int av1_full_pixel_diamond(const struct AV1_COMP *cpi, MACROBLOCK *x,
+ MV *mvp_full, int step_param, int sadpb,
+ int further_steps, int do_refine, int *cost_list,
+ const aom_variance_fn_ptr_t *fn_ptr,
+ const MV *ref_mv, MV *dst_mv);
+
+// Perform integral projection based motion estimation.
+unsigned int av1_int_pro_motion_estimation(const struct AV1_COMP *cpi,
+ MACROBLOCK *x, BLOCK_SIZE bsize,
+ int mi_row, int mi_col);
+
+int av1_hex_search(MACROBLOCK *x, MV *start_mv, int search_param,
+ int sad_per_bit, int do_init_search, int *cost_list,
+ const aom_variance_fn_ptr_t *vfp, int use_mvcost,
+ const MV *center_mv);
+
+typedef int(fractional_mv_step_fp)(
+ MACROBLOCK *x, const MV *ref_mv, int allow_hp, int error_per_bit,
+ const aom_variance_fn_ptr_t *vfp,
+ int forced_stop, // 0 - full, 1 - qtr only, 2 - half only
+ int iters_per_step, int *cost_list, int *mvjcost, int *mvcost[2],
+ int *distortion, unsigned int *sse1, const uint8_t *second_pred, int w,
+ int h, int use_upsampled_ref);
+
+extern fractional_mv_step_fp av1_find_best_sub_pixel_tree;
+extern fractional_mv_step_fp av1_find_best_sub_pixel_tree_pruned;
+extern fractional_mv_step_fp av1_find_best_sub_pixel_tree_pruned_more;
+extern fractional_mv_step_fp av1_find_best_sub_pixel_tree_pruned_evenmore;
+extern fractional_mv_step_fp av1_return_max_sub_pixel_mv;
+extern fractional_mv_step_fp av1_return_min_sub_pixel_mv;
+
+typedef int (*av1_full_search_fn_t)(const MACROBLOCK *x, const MV *ref_mv,
+ int sad_per_bit, int distance,
+ const aom_variance_fn_ptr_t *fn_ptr,
+ const MV *center_mv, MV *best_mv);
+
+typedef int (*av1_diamond_search_fn_t)(
+ MACROBLOCK *x, const search_site_config *cfg, MV *ref_mv, MV *best_mv,
+ int search_param, int sad_per_bit, int *num00,
+ const aom_variance_fn_ptr_t *fn_ptr, const MV *center_mv);
+
+int av1_refining_search_8p_c(MACROBLOCK *x, int error_per_bit, int search_range,
+ const aom_variance_fn_ptr_t *fn_ptr,
+ const MV *center_mv, const uint8_t *second_pred);
+
+struct AV1_COMP;
+
+int av1_full_pixel_search(const struct AV1_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, MV *mvp_full, int step_param,
+ int error_per_bit, int *cost_list, const MV *ref_mv,
+ int var_max, int rd);
+
+#if CONFIG_EXT_INTER
+int av1_find_best_masked_sub_pixel_tree(
+ const MACROBLOCK *x, const uint8_t *mask, int mask_stride, MV *bestmv,
+ const MV *ref_mv, int allow_hp, int error_per_bit,
+ const aom_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step,
+ int *mvjcost, int *mvcost[2], int *distortion, unsigned int *sse1,
+ int is_second);
+int av1_find_best_masked_sub_pixel_tree_up(
+ const struct AV1_COMP *cpi, MACROBLOCK *x, const uint8_t *mask,
+ int mask_stride, int mi_row, int mi_col, MV *bestmv, const MV *ref_mv,
+ int allow_hp, int error_per_bit, const aom_variance_fn_ptr_t *vfp,
+ int forced_stop, int iters_per_step, int *mvjcost, int *mvcost[2],
+ int *distortion, unsigned int *sse1, int is_second, int use_upsampled_ref);
+int av1_masked_full_pixel_diamond(const struct AV1_COMP *cpi, MACROBLOCK *x,
+ const uint8_t *mask, int mask_stride,
+ MV *mvp_full, int step_param, int sadpb,
+ int further_steps, int do_refine,
+ const aom_variance_fn_ptr_t *fn_ptr,
+ const MV *ref_mv, MV *dst_mv, int is_second);
+#endif // CONFIG_EXT_INTER
+
+#if CONFIG_MOTION_VAR
+int av1_obmc_full_pixel_diamond(const struct AV1_COMP *cpi, MACROBLOCK *x,
+ MV *mvp_full, int step_param, int sadpb,
+ int further_steps, int do_refine,
+ const aom_variance_fn_ptr_t *fn_ptr,
+ const MV *ref_mv, MV *dst_mv, int is_second);
+int av1_find_best_obmc_sub_pixel_tree_up(
+ const struct AV1_COMP *cpi, MACROBLOCK *x, int mi_row, int mi_col,
+ MV *bestmv, const MV *ref_mv, int allow_hp, int error_per_bit,
+ const aom_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step,
+ int *mvjcost, int *mvcost[2], int *distortion, unsigned int *sse1,
+ int is_second, int use_upsampled_ref);
+#endif // CONFIG_MOTION_VAR
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AV1_ENCODER_MCOMP_H_
diff --git a/third_party/aom/av1/encoder/mips/msa/error_msa.c b/third_party/aom/av1/encoder/mips/msa/error_msa.c
new file mode 100644
index 0000000000..8d13af7ad9
--- /dev/null
+++ b/third_party/aom/av1/encoder/mips/msa/error_msa.c
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "./av1_rtcd.h"
+#include "aom_dsp/mips/macros_msa.h"
+
+#define BLOCK_ERROR_BLOCKSIZE_MSA(BSize) /* defines block_error_<BSize>size_msa(): returns err and stores a second total in *ssz */ \
+ static int64_t block_error_##BSize##size_msa( \
+ const int16_t *coeff_ptr, const int16_t *dq_coeff_ptr, int64_t *ssz) { \
+ int64_t err = 0; \
+ uint32_t loop_cnt; \
+ v8i16 coeff, dq_coeff, coeff_r_h, coeff_l_h; \
+ v4i32 diff_r, diff_l, coeff_r_w, coeff_l_w; \
+ v2i64 sq_coeff_r, sq_coeff_l; \
+ v2i64 err0, err_dup0, err1, err_dup1; \
+ \
+ coeff = LD_SH(coeff_ptr); /* first 8 of the initial 16 values: DOTP_* initializes the accumulators */ \
+ dq_coeff = LD_SH(dq_coeff_ptr); \
+ UNPCK_SH_SW(coeff, coeff_r_w, coeff_l_w); \
+ ILVRL_H2_SH(coeff, dq_coeff, coeff_r_h, coeff_l_h); \
+ HSUB_UH2_SW(coeff_r_h, coeff_l_h, diff_r, diff_l); /* presumably coeff - dq_coeff per element -- confirm against macros_msa.h */ \
+ DOTP_SW2_SD(coeff_r_w, coeff_l_w, coeff_r_w, coeff_l_w, sq_coeff_r, \
+ sq_coeff_l); \
+ DOTP_SW2_SD(diff_r, diff_l, diff_r, diff_l, err0, err1); \
+ \
+ coeff = LD_SH(coeff_ptr + 8); /* second 8 values: DPADD_* adds into the same accumulators */ \
+ dq_coeff = LD_SH(dq_coeff_ptr + 8); \
+ UNPCK_SH_SW(coeff, coeff_r_w, coeff_l_w); \
+ ILVRL_H2_SH(coeff, dq_coeff, coeff_r_h, coeff_l_h); \
+ HSUB_UH2_SW(coeff_r_h, coeff_l_h, diff_r, diff_l); \
+ DPADD_SD2_SD(coeff_r_w, coeff_l_w, sq_coeff_r, sq_coeff_l); \
+ DPADD_SD2_SD(diff_r, diff_l, err0, err1); \
+ \
+ coeff_ptr += 16; \
+ dq_coeff_ptr += 16; \
+ \
+ for (loop_cnt = ((BSize >> 4) - 1); loop_cnt--;) { /* remaining (BSize / 16) - 1 groups of 16 values; zero iterations when BSize is 16 */ \
+ coeff = LD_SH(coeff_ptr); \
+ dq_coeff = LD_SH(dq_coeff_ptr); \
+ UNPCK_SH_SW(coeff, coeff_r_w, coeff_l_w); \
+ ILVRL_H2_SH(coeff, dq_coeff, coeff_r_h, coeff_l_h); \
+ HSUB_UH2_SW(coeff_r_h, coeff_l_h, diff_r, diff_l); \
+ DPADD_SD2_SD(coeff_r_w, coeff_l_w, sq_coeff_r, sq_coeff_l); \
+ DPADD_SD2_SD(diff_r, diff_l, err0, err1); \
+ \
+ coeff = LD_SH(coeff_ptr + 8); \
+ dq_coeff = LD_SH(dq_coeff_ptr + 8); \
+ UNPCK_SH_SW(coeff, coeff_r_w, coeff_l_w); \
+ ILVRL_H2_SH(coeff, dq_coeff, coeff_r_h, coeff_l_h); \
+ HSUB_UH2_SW(coeff_r_h, coeff_l_h, diff_r, diff_l); \
+ DPADD_SD2_SD(coeff_r_w, coeff_l_w, sq_coeff_r, sq_coeff_l); \
+ DPADD_SD2_SD(diff_r, diff_l, err0, err1); \
+ \
+ coeff_ptr += 16; \
+ dq_coeff_ptr += 16; \
+ } \
+ \
+ err_dup0 = __msa_splati_d(sq_coeff_r, 1); /* fold element 1 into element 0 of each coeff accumulator, then sum both into *ssz */ \
+ err_dup1 = __msa_splati_d(sq_coeff_l, 1); \
+ sq_coeff_r += err_dup0; \
+ sq_coeff_l += err_dup1; \
+ *ssz = __msa_copy_s_d(sq_coeff_r, 0); \
+ *ssz += __msa_copy_s_d(sq_coeff_l, 0); \
+ \
+ err_dup0 = __msa_splati_d(err0, 1); /* same reduction for the difference accumulators, giving the return value */ \
+ err_dup1 = __msa_splati_d(err1, 1); \
+ err0 += err_dup0; \
+ err1 += err_dup1; \
+ err = __msa_copy_s_d(err0, 0); \
+ err += __msa_copy_s_d(err1, 0); \
+ \
+ return err; \
+ }
+
+/* clang-format off */
+BLOCK_ERROR_BLOCKSIZE_MSA(16)
+BLOCK_ERROR_BLOCKSIZE_MSA(64)
+BLOCK_ERROR_BLOCKSIZE_MSA(256)
+BLOCK_ERROR_BLOCKSIZE_MSA(1024)
+/* clang-format on */
+
+int64_t av1_block_error_msa(const tran_low_t *coeff_ptr,  // Dispatches on blk_size to a size-specific MSA routine;
+ const tran_low_t *dq_coeff_ptr, intptr_t blk_size,  // any other size falls back to av1_block_error_c().
+ int64_t *ssz) {  // Returns the routine's result; *ssz is written by the routine called.
+ int64_t err;
+ const int16_t *coeff = (const int16_t *)coeff_ptr;  // NOTE(review): reinterprets tran_low_t data as int16_t; only valid if tran_low_t is a 16-bit type in this build -- confirm
+ const int16_t *dq_coeff = (const int16_t *)dq_coeff_ptr;
+
+ switch (blk_size) {
+ case 16: err = block_error_16size_msa(coeff, dq_coeff, ssz); break;
+ case 64: err = block_error_64size_msa(coeff, dq_coeff, ssz); break;
+ case 256: err = block_error_256size_msa(coeff, dq_coeff, ssz); break;
+ case 1024: err = block_error_1024size_msa(coeff, dq_coeff, ssz); break;
+ default:  // sizes without an MSA specialization use the C version with the original pointers
+ err = av1_block_error_c(coeff_ptr, dq_coeff_ptr, blk_size, ssz);
+ break;
+ }
+
+ return err;
+}
diff --git a/third_party/aom/av1/encoder/mips/msa/fdct16x16_msa.c b/third_party/aom/av1/encoder/mips/msa/fdct16x16_msa.c
new file mode 100644
index 0000000000..4b0364d6c8
--- /dev/null
+++ b/third_party/aom/av1/encoder/mips/msa/fdct16x16_msa.c
@@ -0,0 +1,436 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "av1/common/enums.h"
+#include "av1/encoder/mips/msa/fdct_msa.h"
+#include "aom_dsp/mips/fwd_txfm_msa.h"
+
+static void fadst16_cols_step1_msa(const int16_t *input, int32_t stride,  // First part of the 16-point ADST column pass for one 8-column strip:
+ const int32_t *const0, int16_t *int_buf) {  // reads 16 rows of input at `stride`, writes intermediates to int_buf.
+ v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15;
+ v8i16 tp0, tp1, tp2, tp3, g0, g1, g2, g3, g8, g9, g10, g11, h0, h1, h2, h3;
+ v4i32 k0, k1, k2, k3;
+
+ /* load input data: rows 0, 15, 7 and 8 */
+ r0 = LD_SH(input);
+ r15 = LD_SH(input + 15 * stride);
+ r7 = LD_SH(input + 7 * stride);
+ r8 = LD_SH(input + 8 * stride);
+ SLLI_4V(r0, r15, r7, r8, 2);  // presumably shifts each vector left by 2 bits -- confirm against the MSA macro headers
+
+ /* stage 1 */
+ LD_SW2(const0, 4, k0, k1);  // constants are read from const0 in groups of 4 int32 values
+ LD_SW2(const0 + 8, 4, k2, k3);
+ MADD_BF(r15, r0, r7, r8, k0, k1, k2, k3, g0, g1, g2, g3);
+
+ r3 = LD_SH(input + 3 * stride);  // rows 3, 4, 11 and 12
+ r4 = LD_SH(input + 4 * stride);
+ r11 = LD_SH(input + 11 * stride);
+ r12 = LD_SH(input + 12 * stride);
+ SLLI_4V(r3, r4, r11, r12, 2);
+
+ LD_SW2(const0 + 4 * 4, 4, k0, k1);
+ LD_SW2(const0 + 4 * 6, 4, k2, k3);
+ MADD_BF(r11, r4, r3, r12, k0, k1, k2, k3, g8, g9, g10, g11);
+
+ /* stage 2 */
+ BUTTERFLY_4(g0, g2, g10, g8, tp0, tp2, tp3, tp1);
+ ST_SH2(tp0, tp2, int_buf, 8);  // int_buf is addressed in slots of 8 int16 values
+ ST_SH2(tp1, tp3, int_buf + 4 * 8, 8);
+
+ LD_SW2(const0 + 4 * 8, 4, k0, k1);
+ k2 = LD_SW(const0 + 4 * 10);
+ MADD_BF(g1, g3, g9, g11, k0, k1, k2, k0, h0, h1, h2, h3);  // k0 is passed twice here, as in the original
+
+ ST_SH2(h0, h1, int_buf + 8 * 8, 8);
+ ST_SH2(h3, h2, int_buf + 12 * 8, 8);
+
+ r9 = LD_SH(input + 9 * stride);  // rows 9, 6, 1 and 14
+ r6 = LD_SH(input + 6 * stride);
+ r1 = LD_SH(input + stride);
+ r14 = LD_SH(input + 14 * stride);
+ SLLI_4V(r9, r6, r1, r14, 2);
+
+ LD_SW2(const0 + 4 * 11, 4, k0, k1);
+ LD_SW2(const0 + 4 * 13, 4, k2, k3);
+ MADD_BF(r9, r6, r1, r14, k0, k1, k2, k3, g0, g1, g2, g3);
+
+ ST_SH2(g1, g3, int_buf + 3 * 8, 4 * 8);
+
+ r13 = LD_SH(input + 13 * stride);  // rows 13, 2, 5 and 10
+ r2 = LD_SH(input + 2 * stride);
+ r5 = LD_SH(input + 5 * stride);
+ r10 = LD_SH(input + 10 * stride);
+ SLLI_4V(r13, r2, r5, r10, 2);
+
+ LD_SW2(const0 + 4 * 15, 4, k0, k1);
+ LD_SW2(const0 + 4 * 17, 4, k2, k3);
+ MADD_BF(r13, r2, r5, r10, k0, k1, k2, k3, h0, h1, h2, h3);
+
+ ST_SH2(h1, h3, int_buf + 11 * 8, 4 * 8);
+
+ BUTTERFLY_4(h0, h2, g2, g0, tp0, tp1, tp2, tp3);
+ ST_SH4(tp0, tp1, tp2, tp3, int_buf + 2 * 8, 4 * 8);
+}
+
+static void fadst16_step2_msa_helper(int16_t *int_buf, const int32_t *const0,  // Second part of the 16-point ADST: reads the step-1 intermediates
+ int16_t *out, int16_t *out_ptr) {  // from int_buf and stores results through both out and out_ptr.
+ v8i16 tp0, tp1, tp2, tp3, g5, g7, g13, g15;
+ v8i16 h0, h1, h2, h3, h4, h5, h6, h7, h10, h11;
+ v8i16 out0, out1, out2, out3, out4, out5, out6, out7;
+ v8i16 out8, out9, out10, out11, out12, out13, out14, out15;
+ v4i32 k0, k1, k2, k3;
+
+ LD_SH2(int_buf + 3 * 8, 4 * 8, g13, g15);
+ LD_SH2(int_buf + 11 * 8, 4 * 8, g5, g7);
+ LD_SW2(const0 + 4 * 19, 4, k0, k1);  // this helper uses the constant groups from index 19 onward
+ k2 = LD_SW(const0 + 4 * 21);
+ MADD_BF(g7, g5, g15, g13, k0, k1, k2, k0, h4, h5, h6, h7);
+
+ tp0 = LD_SH(int_buf + 4 * 8);
+ tp1 = LD_SH(int_buf + 5 * 8);
+ tp3 = LD_SH(int_buf + 10 * 8);
+ tp2 = LD_SH(int_buf + 14 * 8);
+ LD_SW2(const0 + 4 * 22, 4, k0, k1);
+ k2 = LD_SW(const0 + 4 * 24);
+ MADD_BF(tp0, tp1, tp2, tp3, k0, k1, k2, k0, out4, out6, out5, out7);
+ out4 = -out4;
+ ST_SH(out4, (out + 3 * 16));  // both destinations are addressed with a pitch of 16 int16 values
+ ST_SH(out5, (out_ptr + 4 * 16));
+
+ h1 = LD_SH(int_buf + 9 * 8);
+ h3 = LD_SH(int_buf + 12 * 8);
+ MADD_BF(h1, h3, h5, h7, k0, k1, k2, k0, out12, out14, out13, out15);  // reuses the k0..k2 loaded just above
+ out13 = -out13;
+ ST_SH(out12, (out + 2 * 16));
+ ST_SH(out13, (out_ptr + 5 * 16));
+
+ tp0 = LD_SH(int_buf);
+ tp1 = LD_SH(int_buf + 8);
+ tp2 = LD_SH(int_buf + 2 * 8);
+ tp3 = LD_SH(int_buf + 6 * 8);
+
+ BUTTERFLY_4(tp0, tp1, tp3, tp2, out0, out1, h11, h10);
+ out1 = -out1;
+ ST_SH(out0, (out));
+ ST_SH(out1, (out_ptr + 7 * 16));
+
+ h0 = LD_SH(int_buf + 8 * 8);
+ h2 = LD_SH(int_buf + 13 * 8);
+
+ BUTTERFLY_4(h0, h2, h6, h4, out8, out9, out11, out10);
+ out8 = -out8;
+ ST_SH(out8, (out + 16));
+ ST_SH(out9, (out_ptr + 6 * 16));
+
+ /* stage 4: the remaining eight outputs go through MADD_SHORT before being stored */
+ LD_SW2(const0 + 4 * 25, 4, k0, k1);
+ LD_SW2(const0 + 4 * 27, 4, k2, k3);
+ MADD_SHORT(h10, h11, k1, k2, out2, out3);
+ ST_SH(out2, (out + 7 * 16));
+ ST_SH(out3, (out_ptr));
+
+ MADD_SHORT(out6, out7, k0, k3, out6, out7);
+ ST_SH(out6, (out + 4 * 16));
+ ST_SH(out7, (out_ptr + 3 * 16));
+
+ MADD_SHORT(out10, out11, k0, k3, out10, out11);
+ ST_SH(out10, (out + 6 * 16));
+ ST_SH(out11, (out_ptr + 16));
+
+ MADD_SHORT(out14, out15, k1, k2, out14, out15);
+ ST_SH(out14, (out + 5 * 16));
+ ST_SH(out15, (out_ptr + 2 * 16));
+}
+
+static void fadst16_cols_step2_msa(int16_t *int_buf, const int32_t *const0,  // Column-pass wrapper around fadst16_step2_msa_helper().
+ int16_t *out) {
+ fadst16_step2_msa_helper(int_buf, const0, out, out + 128);  // second destination starts 128 int16 values after out
+}
+
+static void fadst16_transpose_postproc_msa(int16_t *input, int16_t *out) {  // Processes four 8x8 tiles: load, transpose, FDCT_POSTPROC_2V_NEG_H, store.
+ v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15;
+ v8i16 l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, l10, l11, l12, l13, l14, l15;
+
+ /* load input data: 8 vectors with a pitch of 16 int16 values */
+ LD_SH8(input, 16, l0, l1, l2, l3, l4, l5, l6, l7);
+ TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7, r0, r1, r2, r3, r4, r5, r6,
+ r7);
+ FDCT_POSTPROC_2V_NEG_H(r0, r1);
+ FDCT_POSTPROC_2V_NEG_H(r2, r3);
+ FDCT_POSTPROC_2V_NEG_H(r4, r5);
+ FDCT_POSTPROC_2V_NEG_H(r6, r7);
+ ST_SH8(r0, r1, r2, r3, r4, r5, r6, r7, out, 8);  // each tile is stored as 8 vectors with a pitch of 8
+ out += 64;
+
+ LD_SH8(input + 8, 16, l8, l9, l10, l11, l12, l13, l14, l15);  // second tile starts 8 values into the same 8 input rows
+ TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15, r8, r9, r10, r11,
+ r12, r13, r14, r15);
+ FDCT_POSTPROC_2V_NEG_H(r8, r9);
+ FDCT_POSTPROC_2V_NEG_H(r10, r11);
+ FDCT_POSTPROC_2V_NEG_H(r12, r13);
+ FDCT_POSTPROC_2V_NEG_H(r14, r15);
+ ST_SH8(r8, r9, r10, r11, r12, r13, r14, r15, out, 8);
+ out += 64;
+
+ /* load input data: same two tiles again, 128 values further into input */
+ input += 128;
+ LD_SH8(input, 16, l0, l1, l2, l3, l4, l5, l6, l7);
+ TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7, r0, r1, r2, r3, r4, r5, r6,
+ r7);
+ FDCT_POSTPROC_2V_NEG_H(r0, r1);
+ FDCT_POSTPROC_2V_NEG_H(r2, r3);
+ FDCT_POSTPROC_2V_NEG_H(r4, r5);
+ FDCT_POSTPROC_2V_NEG_H(r6, r7);
+ ST_SH8(r0, r1, r2, r3, r4, r5, r6, r7, out, 8);
+ out += 64;
+
+ LD_SH8(input + 8, 16, l8, l9, l10, l11, l12, l13, l14, l15);
+ TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15, r8, r9, r10, r11,
+ r12, r13, r14, r15);
+ FDCT_POSTPROC_2V_NEG_H(r8, r9);
+ FDCT_POSTPROC_2V_NEG_H(r10, r11);
+ FDCT_POSTPROC_2V_NEG_H(r12, r13);
+ FDCT_POSTPROC_2V_NEG_H(r14, r15);
+ ST_SH8(r8, r9, r10, r11, r12, r13, r14, r15, out, 8);
+}
+
+static void fadst16_rows_step1_msa(int16_t *input, const int32_t *const0,  // Row-pass counterpart of fadst16_cols_step1_msa(): input is read
+ int16_t *int_buf) {  // at a fixed pitch of 8 and no SLLI_4V scaling is applied.
+ v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15;
+ v8i16 tp0, tp1, tp2, tp3, g0, g1, g2, g3, g8, g9, g10, g11, h0, h1, h2, h3;
+ v4i32 k0, k1, k2, k3;
+
+ /* load input data: vectors 0, 7, 8 and 15 */
+ r0 = LD_SH(input);
+ r7 = LD_SH(input + 7 * 8);
+ r8 = LD_SH(input + 8 * 8);
+ r15 = LD_SH(input + 15 * 8);
+
+ /* stage 1 */
+ LD_SW2(const0, 4, k0, k1);
+ LD_SW2(const0 + 4 * 2, 4, k2, k3);
+ MADD_BF(r15, r0, r7, r8, k0, k1, k2, k3, g0, g1, g2, g3);
+
+ r3 = LD_SH(input + 3 * 8);  // vectors 3, 4, 11 and 12
+ r4 = LD_SH(input + 4 * 8);
+ r11 = LD_SH(input + 11 * 8);
+ r12 = LD_SH(input + 12 * 8);
+
+ LD_SW2(const0 + 4 * 4, 4, k0, k1);
+ LD_SW2(const0 + 4 * 6, 4, k2, k3);
+ MADD_BF(r11, r4, r3, r12, k0, k1, k2, k3, g8, g9, g10, g11);
+
+ /* stage 2 */
+ BUTTERFLY_4(g0, g2, g10, g8, tp0, tp2, tp3, tp1);
+ ST_SH2(tp0, tp1, int_buf, 4 * 8);
+ ST_SH2(tp2, tp3, int_buf + 8, 4 * 8);
+
+ LD_SW2(const0 + 4 * 8, 4, k0, k1);
+ k2 = LD_SW(const0 + 4 * 10);
+ MADD_BF(g1, g3, g9, g11, k0, k1, k2, k0, h0, h1, h2, h3);  // k0 is passed twice here, as in the original
+ ST_SH2(h0, h3, int_buf + 8 * 8, 4 * 8);
+ ST_SH2(h1, h2, int_buf + 9 * 8, 4 * 8);
+
+ r1 = LD_SH(input + 8);  // vectors 1, 6, 9 and 14
+ r6 = LD_SH(input + 6 * 8);
+ r9 = LD_SH(input + 9 * 8);
+ r14 = LD_SH(input + 14 * 8);
+
+ LD_SW2(const0 + 4 * 11, 4, k0, k1);
+ LD_SW2(const0 + 4 * 13, 4, k2, k3);
+ MADD_BF(r9, r6, r1, r14, k0, k1, k2, k3, g0, g1, g2, g3);
+ ST_SH2(g1, g3, int_buf + 3 * 8, 4 * 8);
+
+ r2 = LD_SH(input + 2 * 8);  // vectors 2, 5, 10 and 13
+ r5 = LD_SH(input + 5 * 8);
+ r10 = LD_SH(input + 10 * 8);
+ r13 = LD_SH(input + 13 * 8);
+
+ LD_SW2(const0 + 4 * 15, 4, k0, k1);
+ LD_SW2(const0 + 4 * 17, 4, k2, k3);
+ MADD_BF(r13, r2, r5, r10, k0, k1, k2, k3, h0, h1, h2, h3);
+ ST_SH2(h1, h3, int_buf + 11 * 8, 4 * 8);
+ BUTTERFLY_4(h0, h2, g2, g0, tp0, tp1, tp2, tp3);
+ ST_SH4(tp0, tp1, tp2, tp3, int_buf + 2 * 8, 4 * 8);
+}
+
+static void fadst16_rows_step2_msa(int16_t *int_buf, const int32_t *const0,  // Row-pass wrapper around fadst16_step2_msa_helper().
+ int16_t *out) {
+ fadst16_step2_msa_helper(int_buf, const0, out, out + 8);  // second destination starts 8 int16 values after out
+}
+
+static void fadst16_transpose_msa(int16_t *input, int16_t *out) {  // Transposes 8x8 tiles from input into out in two passes of 128 values each.
+ v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15;
+ v8i16 l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, l10, l11, l12, l13, l14, l15;
+
+ /* load input data: 16 vectors at a pitch of 8, alternating between the l0..l7 and l8..l15 groups */
+ LD_SH16(input, 8, l0, l8, l1, l9, l2, l10, l3, l11, l4, l12, l5, l13, l6, l14,
+ l7, l15);
+ TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7, r0, r1, r2, r3, r4, r5, r6,
+ r7);
+ TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15, r8, r9, r10, r11,
+ r12, r13, r14, r15);
+ ST_SH8(r0, r8, r1, r9, r2, r10, r3, r11, out, 8);  // results from the two transposes are interleaved on store
+ ST_SH8(r4, r12, r5, r13, r6, r14, r7, r15, (out + 64), 8);
+ out += 16 * 8;
+
+ /* load input data: second pass, 128 values further into input */
+ input += 128;
+ LD_SH16(input, 8, l0, l8, l1, l9, l2, l10, l3, l11, l4, l12, l5, l13, l6, l14,
+ l7, l15);
+ TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7, r0, r1, r2, r3, r4, r5, r6,
+ r7);
+ TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15, r8, r9, r10, r11,
+ r12, r13, r14, r15);
+ ST_SH8(r0, r8, r1, r9, r2, r10, r3, r11, out, 8);
+ ST_SH8(r4, r12, r5, r13, r6, r14, r7, r15, (out + 64), 8);
+}
+
+static void postproc_fdct16x8_1d_row(int16_t *intermediate, int16_t *output) {  // Row pass for ADST_DCT: post-processes the column output, then applies the FDCT8x16 even/odd macros.
+ int16_t *temp = intermediate;
+ int16_t *out = output;
+ v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ v8i16 in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11;
+ v8i16 in12, in13, in14, in15;
+
+ LD_SH8(temp, 16, in0, in1, in2, in3, in4, in5, in6, in7);  // two 8-wide halves of 8 rows, row pitch 16
+ temp = intermediate + 8;
+ LD_SH8(temp, 16, in8, in9, in10, in11, in12, in13, in14, in15);
+ TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+ in4, in5, in6, in7);
+ TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
+ in10, in11, in12, in13, in14, in15);
+ FDCT_POSTPROC_2V_NEG_H(in0, in1);
+ FDCT_POSTPROC_2V_NEG_H(in2, in3);
+ FDCT_POSTPROC_2V_NEG_H(in4, in5);
+ FDCT_POSTPROC_2V_NEG_H(in6, in7);
+ FDCT_POSTPROC_2V_NEG_H(in8, in9);
+ FDCT_POSTPROC_2V_NEG_H(in10, in11);
+ FDCT_POSTPROC_2V_NEG_H(in12, in13);
+ FDCT_POSTPROC_2V_NEG_H(in14, in15);
+ BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11,
+ in12, in13, in14, in15, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6,
+ tmp7, in8, in9, in10, in11, in12, in13, in14, in15);
+ temp = intermediate;
+ ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, temp, 16);  // park in8..in15 in `intermediate` while the even half is computed
+ FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp0, tmp1,
+ tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
+ temp = intermediate;
+ LD_SH8(temp, 16, in8, in9, in10, in11, in12, in13, in14, in15);  // reload the parked vectors for the odd half
+ FDCT8x16_ODD(in8, in9, in10, in11, in12, in13, in14, in15, in0, in1, in2, in3,
+ in4, in5, in6, in7);
+ TRANSPOSE8x8_SH_SH(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3, tmp0, in0,
+ tmp1, in1, tmp2, in2, tmp3, in3);
+ ST_SH8(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3, out, 16);  // even and odd results are interleaved, row pitch 16
+ TRANSPOSE8x8_SH_SH(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7, tmp4, in4,
+ tmp5, in5, tmp6, in6, tmp7, in7);
+ out = output + 8;
+ ST_SH8(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7, out, 16);
+}
+
+void av1_fht16x16_msa(const int16_t *input, int16_t *output, int32_t stride,  // 16x16 forward hybrid transform: picks the column and row
+ int32_t tx_type) {  // kernels (DCT or ADST) from tx_type and writes 256 values to output.
+ DECLARE_ALIGNED(32, int16_t, tmp[256]);
+ DECLARE_ALIGNED(32, int16_t, trans_buf[256]);
+ DECLARE_ALIGNED(32, int16_t, tmp_buf[128]);
+ int32_t i;
+ int16_t *ptmpbuf = &tmp_buf[0];
+ int16_t *trans = &trans_buf[0];
+ const int32_t const_arr[29 * 4] = {  // 29 constants, each stored 4 times in a row; consumed by the fadst16_* helpers
+ 52707308, 52707308, 52707308, 52707308, -1072430300,
+ -1072430300, -1072430300, -1072430300, 795618043, 795618043,
+ 795618043, 795618043, -721080468, -721080468, -721080468,
+ -721080468, 459094491, 459094491, 459094491, 459094491,
+ -970646691, -970646691, -970646691, -970646691, 1010963856,
+ 1010963856, 1010963856, 1010963856, -361743294, -361743294,
+ -361743294, -361743294, 209469125, 209469125, 209469125,
+ 209469125, -1053094788, -1053094788, -1053094788, -1053094788,
+ 1053160324, 1053160324, 1053160324, 1053160324, 639644520,
+ 639644520, 639644520, 639644520, -862444000, -862444000,
+ -862444000, -862444000, 1062144356, 1062144356, 1062144356,
+ 1062144356, -157532337, -157532337, -157532337, -157532337,
+ 260914709, 260914709, 260914709, 260914709, -1041559667,
+ -1041559667, -1041559667, -1041559667, 920985831, 920985831,
+ 920985831, 920985831, -551995675, -551995675, -551995675,
+ -551995675, 596522295, 596522295, 596522295, 596522295,
+ 892853362, 892853362, 892853362, 892853362, -892787826,
+ -892787826, -892787826, -892787826, 410925857, 410925857,
+ 410925857, 410925857, -992012162, -992012162, -992012162,
+ -992012162, 992077698, 992077698, 992077698, 992077698,
+ 759246145, 759246145, 759246145, 759246145, -759180609,
+ -759180609, -759180609, -759180609, -759222975, -759222975,
+ -759222975, -759222975, 759288511, 759288511, 759288511,
+ 759288511
+ };
+
+ switch (tx_type) {
+ case DCT_DCT:
+ /* column transform: two strips of 8 columns each */
+ for (i = 0; i < 2; ++i) {
+ fdct8x16_1d_column(input + 8 * i, tmp + 8 * i, stride);
+ }
+
+ /* row transform: two groups of 128 values each */
+ for (i = 0; i < 2; ++i) {
+ fdct16x8_1d_row(tmp + (128 * i), output + (128 * i));
+ }
+ break;
+ case ADST_DCT:
+ /* column transform: ADST kernels, with tmp_buf as scratch between step 1 and step 2 */
+ for (i = 0; i < 2; ++i) {
+ fadst16_cols_step1_msa(input + (i << 3), stride, const_arr, ptmpbuf);
+ fadst16_cols_step2_msa(ptmpbuf, const_arr, tmp + (i << 3));
+ }
+
+ /* row transform */
+ for (i = 0; i < 2; ++i) {
+ postproc_fdct16x8_1d_row(tmp + (128 * i), output + (128 * i));
+ }
+ break;
+ case DCT_ADST:
+ /* column transform */
+ for (i = 0; i < 2; ++i) {
+ fdct8x16_1d_column(input + 8 * i, tmp + 8 * i, stride);
+ }
+
+ fadst16_transpose_postproc_msa(tmp, trans);
+
+ /* row transform: tmp is reused as the destination now that trans holds the column result */
+ for (i = 0; i < 2; ++i) {
+ fadst16_rows_step1_msa(trans + (i << 7), const_arr, ptmpbuf);
+ fadst16_rows_step2_msa(ptmpbuf, const_arr, tmp + (i << 7));
+ }
+
+ fadst16_transpose_msa(tmp, output);
+ break;
+ case ADST_ADST:
+ /* column transform */
+ for (i = 0; i < 2; ++i) {
+ fadst16_cols_step1_msa(input + (i << 3), stride, const_arr, ptmpbuf);
+ fadst16_cols_step2_msa(ptmpbuf, const_arr, tmp + (i << 3));
+ }
+
+ fadst16_transpose_postproc_msa(tmp, trans);
+
+ /* row transform: same sequence as the DCT_ADST case */
+ for (i = 0; i < 2; ++i) {
+ fadst16_rows_step1_msa(trans + (i << 7), const_arr, ptmpbuf);
+ fadst16_rows_step2_msa(ptmpbuf, const_arr, tmp + (i << 7));
+ }
+
+ fadst16_transpose_msa(tmp, output);
+ break;
+ default: assert(0); break;  // any other tx_type is treated as a programming error
+ }
+}
diff --git a/third_party/aom/av1/encoder/mips/msa/fdct4x4_msa.c b/third_party/aom/av1/encoder/mips/msa/fdct4x4_msa.c
new file mode 100644
index 0000000000..da1ac74f06
--- /dev/null
+++ b/third_party/aom/av1/encoder/mips/msa/fdct4x4_msa.c
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "av1/common/enums.h"
+#include "av1/encoder/mips/msa/fdct_msa.h"
+
+void av1_fwht4x4_msa(const int16_t *input, int16_t *output, /* presumably the 4x4 forward Walsh-Hadamard transform (per the name) - confirm */
+ int32_t src_stride) { /* src_stride is handed to LD_SH4 together with input to load in0..in3 */
+ v8i16 in0, in1, in2, in3, in4; /* in4 is scratch for the halved difference computed in each pass */
+
+ LD_SH4(input, src_stride, in0, in1, in2, in3);
+
+ in0 += in1; /* first pass: add / subtract / halve ladder over the four loaded vectors */
+ in3 -= in2;
+ in4 = (in0 - in3) >> 1;
+ SUB2(in4, in1, in4, in2, in1, in2); /* assumes SUB2(a, b, c, d, o0, o1) gives o0 = a - b, o1 = c - d - TODO confirm */
+ in0 -= in2;
+ in3 += in1;
+
+ TRANSPOSE4x4_SH_SH(in0, in2, in3, in1, in0, in2, in3, in1); /* registers go in (and come out) in in0, in2, in3, in1 order */
+
+ in0 += in2; /* second pass: same ladder, with the registers in their post-transpose roles */
+ in1 -= in3;
+ in4 = (in0 - in1) >> 1;
+ SUB2(in4, in2, in4, in3, in2, in3);
+ in0 -= in3;
+ in1 += in2;
+
+ SLLI_4V(in0, in1, in2, in3, 2); /* presumably a left shift by 2 of all four vectors (scaling) - confirm against the macro */
+
+ TRANSPOSE4x4_SH_SH(in0, in3, in1, in2, in0, in3, in1, in2);
+
+ ST4x2_UB(in0, output, 4); /* write-out order is in0, in3, in1, in2, each 4 int16_t further into output */
+ ST4x2_UB(in3, output + 4, 4);
+ ST4x2_UB(in1, output + 8, 4);
+ ST4x2_UB(in2, output + 12, 4);
+}
+
+void av1_fht4x4_msa(const int16_t *input, int16_t *output, int32_t stride, /* 4x4 forward hybrid transform: one 1-D pass, transpose, second 1-D pass */
+ int32_t tx_type) { /* tx_type selects the DCT/ADST pairing; any other value hits assert(0) */
+ v8i16 in0, in1, in2, in3;
+
+ LD_SH4(input, stride, in0, in1, in2, in3);
+
+ /* fdct4 pre-process */
+ {
+ v8i16 temp, mask;
+ v16i8 zero = { 0 };
+ v16i8 one = __msa_ldi_b(1);
+
+ mask = (v8i16)__msa_sldi_b(zero, one, 15); /* presumably leaves a single low-order 1 in the vector - confirm sldi_b semantics */
+ SLLI_4V(in0, in1, in2, in3, 4); /* pre-scale all four input vectors (shift amount 4) */
+ temp = __msa_ceqi_h(in0, 0); /* per-lane test of in0 against 0 */
+ temp = (v8i16)__msa_xori_b((v16u8)temp, 255); /* XOR with 0xFF inverts every bit, turning "== 0" into "!= 0" */
+ temp = mask & temp; /* keep only the lanes/bits selected by mask */
+ in0 += temp; /* bump in0 by the masked "non-zero" flag */
+ }
+
+ switch (tx_type) {
+ case DCT_DCT:
+ AOM_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3);
+ TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
+ AOM_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3);
+ break;
+ case ADST_DCT: /* AOM_FADST4 before the transpose, AOM_FDCT4 after */
+ AOM_FADST4(in0, in1, in2, in3, in0, in1, in2, in3);
+ TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
+ AOM_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3);
+ break;
+ case DCT_ADST: /* AOM_FDCT4 before the transpose, AOM_FADST4 after */
+ AOM_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3);
+ TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
+ AOM_FADST4(in0, in1, in2, in3, in0, in1, in2, in3);
+ break;
+ case ADST_ADST:
+ AOM_FADST4(in0, in1, in2, in3, in0, in1, in2, in3);
+ TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
+ AOM_FADST4(in0, in1, in2, in3, in0, in1, in2, in3);
+ break;
+ default: assert(0); break; /* unsupported tx_type: asserts in debug builds, falls through to the post-process otherwise */
+ }
+
+ TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
+ ADD4(in0, 1, in1, 1, in2, 1, in3, 1, in0, in1, in2, in3); /* add 1 ... */
+ SRA_4V(in0, in1, in2, in3, 2); /* ... then shift right by 2, i.e. (x + 1) >> 2 per lane */
+ PCKEV_D2_SH(in1, in0, in3, in2, in0, in2); /* combine the four vectors into two (in0, in2) for the store */
+ ST_SH2(in0, in2, output, 8); /* two vector stores, 8 int16_t apart */
+}
diff --git a/third_party/aom/av1/encoder/mips/msa/fdct8x8_msa.c b/third_party/aom/av1/encoder/mips/msa/fdct8x8_msa.c
new file mode 100644
index 0000000000..4cbf60a11d
--- /dev/null
+++ b/third_party/aom/av1/encoder/mips/msa/fdct8x8_msa.c
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "av1/common/enums.h"
+#include "av1/encoder/mips/msa/fdct_msa.h"
+
+void av1_fht8x8_msa(const int16_t *input, int16_t *output, int32_t stride, /* 8x8 forward hybrid transform: one 1-D pass, transpose, second 1-D pass */
+ int32_t tx_type) { /* tx_type selects the DCT/ADST pairing; any other value hits assert(0) */
+ v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+
+ LD_SH8(input, stride, in0, in1, in2, in3, in4, in5, in6, in7);
+ SLLI_4V(in0, in1, in2, in3, 2); /* pre-scale all eight input vectors (shift amount 2), four at a time */
+ SLLI_4V(in4, in5, in6, in7, 2);
+
+ switch (tx_type) {
+ case DCT_DCT:
+ AOM_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4,
+ in5, in6, in7);
+ TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2,
+ in3, in4, in5, in6, in7);
+ AOM_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4,
+ in5, in6, in7);
+ break;
+ case ADST_DCT: /* AOM_ADST8 before the transpose, AOM_FDCT8 after */
+ AOM_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4,
+ in5, in6, in7);
+ TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2,
+ in3, in4, in5, in6, in7);
+ AOM_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4,
+ in5, in6, in7);
+ break;
+ case DCT_ADST: /* AOM_FDCT8 before the transpose, AOM_ADST8 after */
+ AOM_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4,
+ in5, in6, in7);
+ TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2,
+ in3, in4, in5, in6, in7);
+ AOM_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4,
+ in5, in6, in7);
+ break;
+ case ADST_ADST:
+ AOM_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4,
+ in5, in6, in7);
+ TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2,
+ in3, in4, in5, in6, in7);
+ AOM_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4,
+ in5, in6, in7);
+ break;
+ default: assert(0); break; /* unsupported tx_type: asserts in debug builds, falls through to the post-process otherwise */
+ }
+
+ TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+ in4, in5, in6, in7);
+ SRLI_AVE_S_4V_H(in0, in1, in2, in3, in4, in5, in6, in7); /* NOTE(review): final down-scaling step; exact rounding is defined by the macro elsewhere */
+ ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, output, 8); /* eight vector stores, 8 int16_t apart */
+}
diff --git a/third_party/aom/av1/encoder/mips/msa/fdct_msa.h b/third_party/aom/av1/encoder/mips/msa/fdct_msa.h
new file mode 100644
index 0000000000..52bcf790c9
--- /dev/null
+++ b/third_party/aom/av1/encoder/mips/msa/fdct_msa.h
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_ENCODER_MIPS_MSA_AV1_FDCT_MSA_H_
+#define AV1_ENCODER_MIPS_MSA_AV1_FDCT_MSA_H_
+
+#include "aom_dsp/mips/fwd_txfm_msa.h"
+#include "aom_dsp/mips/txfm_macros_msa.h"
+#include "aom_ports/mem.h"
+
+#define AOM_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \
+ out3, out4, out5, out6, out7) \
+ { /* One 8-point ADST pass over eight v8i16 vectors; used as a statement block by av1_fht8x8_msa. */ \
+ v8i16 cnst0_m, cnst1_m, cnst2_m, cnst3_m, cnst4_m; \
+ v8i16 vec0_m, vec1_m, vec2_m, vec3_m, s0_m, s1_m; \
+ v8i16 coeff0_m = { cospi_2_64, cospi_6_64, cospi_10_64, cospi_14_64, /* first-stage cosine constants */ \
+ cospi_18_64, cospi_22_64, cospi_26_64, cospi_30_64 }; \
+ v8i16 coeff1_m = { cospi_8_64, -cospi_8_64, cospi_16_64, -cospi_16_64, /* later-stage constants, stored as +/- pairs */ \
+ cospi_24_64, -cospi_24_64, 0, 0 }; \
+ \
+ SPLATI_H2_SH(coeff0_m, 0, 7, cnst0_m, cnst1_m); /* coeff0_m lanes 0 and 7 (cospi_2_64, cospi_30_64) */ \
+ cnst2_m = -cnst0_m; \
+ ILVEV_H2_SH(cnst0_m, cnst1_m, cnst1_m, cnst2_m, cnst0_m, cnst1_m); \
+ SPLATI_H2_SH(coeff0_m, 4, 3, cnst2_m, cnst3_m); /* lanes 4 and 3 (cospi_18_64, cospi_14_64) */ \
+ cnst4_m = -cnst2_m; \
+ ILVEV_H2_SH(cnst2_m, cnst3_m, cnst3_m, cnst4_m, cnst2_m, cnst3_m); \
+ \
+ ILVRL_H2_SH(in0, in7, vec1_m, vec0_m); \
+ ILVRL_H2_SH(in4, in3, vec3_m, vec2_m); \
+ DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, cnst1_m, /* NOTE(review): in7/in0/in4/in3 sit in the trailing (presumably output) slots, so the caller's in* arguments look clobbered */ \
+ cnst2_m, cnst3_m, in7, in0, in4, in3); \
+ \
+ SPLATI_H2_SH(coeff0_m, 2, 5, cnst0_m, cnst1_m); /* lanes 2 and 5 (cospi_10_64, cospi_22_64) */ \
+ cnst2_m = -cnst0_m; \
+ ILVEV_H2_SH(cnst0_m, cnst1_m, cnst1_m, cnst2_m, cnst0_m, cnst1_m); \
+ SPLATI_H2_SH(coeff0_m, 6, 1, cnst2_m, cnst3_m); /* lanes 6 and 1 (cospi_26_64, cospi_6_64) */ \
+ cnst4_m = -cnst2_m; \
+ ILVEV_H2_SH(cnst2_m, cnst3_m, cnst3_m, cnst4_m, cnst2_m, cnst3_m); \
+ \
+ ILVRL_H2_SH(in2, in5, vec1_m, vec0_m); \
+ ILVRL_H2_SH(in6, in1, vec3_m, vec2_m); \
+ \
+ DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, cnst1_m, \
+ cnst2_m, cnst3_m, in5, in2, in6, in1); \
+ BUTTERFLY_4(in7, in0, in2, in5, s1_m, s0_m, in2, in5); \
+ out7 = -s0_m; /* first two finished outputs come straight from the butterfly */ \
+ out0 = s1_m; \
+ \
+ SPLATI_H4_SH(coeff1_m, 0, 4, 1, 5, cnst0_m, cnst1_m, cnst2_m, cnst3_m); /* +/-cospi_8_64 and +/-cospi_24_64 */ \
+ \
+ ILVEV_H2_SH(cnst3_m, cnst0_m, cnst1_m, cnst2_m, cnst3_m, cnst2_m); \
+ cnst0_m = __msa_ilvev_h(cnst1_m, cnst0_m); \
+ cnst1_m = cnst0_m; \
+ \
+ ILVRL_H2_SH(in4, in3, vec1_m, vec0_m); \
+ ILVRL_H2_SH(in6, in1, vec3_m, vec2_m); \
+ DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, cnst2_m, \
+ cnst3_m, cnst1_m, out1, out6, s0_m, s1_m); \
+ \
+ SPLATI_H2_SH(coeff1_m, 2, 3, cnst0_m, cnst1_m); /* cospi_16_64 and -cospi_16_64 */ \
+ cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \
+ \
+ ILVRL_H2_SH(in2, in5, vec1_m, vec0_m); \
+ ILVRL_H2_SH(s0_m, s1_m, vec3_m, vec2_m); \
+ out3 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \
+ out4 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m); \
+ out2 = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst0_m); \
+ out5 = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst1_m); \
+ \
+ out1 = -out1; /* final sign flips on the odd-numbered outputs 1, 3 and 5 */ \
+ out3 = -out3; \
+ out5 = -out5; \
+ }
+
+#define AOM_FADST4(in0, in1, in2, in3, out0, out1, out2, out3) \
+ { /* One 4-point forward ADST pass done in 32-bit lanes; used as a statement block by av1_fht4x4_msa. */ \
+ v4i32 s0_m, s1_m, s2_m, s3_m, constant_m; \
+ v4i32 in0_r_m, in1_r_m, in2_r_m, in3_r_m; \
+ \
+ UNPCK_R_SH_SW(in0, in0_r_m); /* presumably widens the right (low) half of each v8i16 input to 32 bits - confirm */ \
+ UNPCK_R_SH_SW(in1, in1_r_m); \
+ UNPCK_R_SH_SW(in2, in2_r_m); \
+ UNPCK_R_SH_SW(in3, in3_r_m); \
+ \
+ constant_m = __msa_fill_w(sinpi_4_9); \
+ MUL2(in0_r_m, constant_m, in3_r_m, constant_m, s1_m, s0_m); /* s1 = in0 * sinpi_4_9, s0 = in3 * sinpi_4_9 (assuming MUL2(a, b, c, d, o0, o1)) */ \
+ \
+ constant_m = __msa_fill_w(sinpi_1_9); \
+ s0_m += in0_r_m * constant_m; \
+ s1_m -= in1_r_m * constant_m; \
+ \
+ constant_m = __msa_fill_w(sinpi_2_9); \
+ s0_m += in1_r_m * constant_m; \
+ s1_m += in3_r_m * constant_m; \
+ \
+ s2_m = in0_r_m + in1_r_m - in3_r_m; \
+ \
+ constant_m = __msa_fill_w(sinpi_3_9); \
+ MUL2(in2_r_m, constant_m, s2_m, constant_m, s3_m, in1_r_m); /* in1_r_m is reused here to hold s2 * sinpi_3_9 */ \
+ \
+ in0_r_m = s0_m + s3_m; /* in0_r_m, in1_r_m, s2_m, s3_m now hold the four unrounded results */ \
+ s2_m = s1_m - s3_m; \
+ s3_m = s1_m - s0_m + s3_m; \
+ \
+ SRARI_W4_SW(in0_r_m, in1_r_m, s2_m, s3_m, DCT_CONST_BITS); /* scale the results back down by DCT_CONST_BITS */ \
+ PCKEV_H4_SH(in0_r_m, in0_r_m, in1_r_m, in1_r_m, s2_m, s2_m, s3_m, s3_m, /* narrow back to 16-bit lanes into out0..out3 */ \
+ out0, out1, out2, out3); \
+ }
+#endif // AV1_ENCODER_MIPS_MSA_AV1_FDCT_MSA_H_
diff --git a/third_party/aom/av1/encoder/mips/msa/temporal_filter_msa.c b/third_party/aom/av1/encoder/mips/msa/temporal_filter_msa.c
new file mode 100644
index 0000000000..4ec679642a
--- /dev/null
+++ b/third_party/aom/av1/encoder/mips/msa/temporal_filter_msa.c
@@ -0,0 +1,284 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "./av1_rtcd.h"
+#include "aom_dsp/mips/macros_msa.h"
+
+static void temporal_filter_apply_8size_msa(uint8_t *frm1_ptr, uint32_t stride, /* MSA temporal-filter kernel: 2 iterations x 32 pixels, i.e. 64 frm2 bytes / acc entries / cnt entries in total */
+ uint8_t *frm2_ptr, int32_t filt_sth, /* frm2 is read as a packed buffer (32 bytes per iteration); filt_sth is the shift amount */
+ int32_t filt_wgt, uint32_t *acc, /* filt_wgt multiplies every per-pixel modifier */
+ uint16_t *cnt) { /* acc and cnt are read-modify-written in place */
+ uint32_t row;
+ uint64_t f0, f1, f2, f3;
+ v16i8 frm2, frm1 = { 0 };
+ v16i8 frm4, frm3 = { 0 };
+ v16u8 frm_r, frm_l;
+ v8i16 frm2_r, frm2_l;
+ v8i16 diff0, diff1, mod0_h, mod1_h;
+ v4i32 cnst3, cnst16, filt_wt, strength;
+ v4i32 mod0_w, mod1_w, mod2_w, mod3_w;
+ v4i32 diff0_r, diff0_l, diff1_r, diff1_l;
+ v4i32 frm2_rr, frm2_rl, frm2_lr, frm2_ll;
+ v4i32 acc0, acc1, acc2, acc3;
+ v8i16 cnt0, cnt1;
+
+ filt_wt = __msa_fill_w(filt_wgt);
+ strength = __msa_fill_w(filt_sth);
+ cnst3 = __msa_ldi_w(3);
+ cnst16 = __msa_ldi_w(16);
+
+ for (row = 2; row--;) { /* each iteration consumes four frm1 rows (f0..f3) */
+ LD4(frm1_ptr, stride, f0, f1, f2, f3); /* four 64-bit loads from frm1, using stride */
+ frm1_ptr += (4 * stride);
+
+ LD_SB2(frm2_ptr, 16, frm2, frm4);
+ frm2_ptr += 32;
+
+ LD_SW2(acc, 4, acc0, acc1); /* current accumulators for the first 16 pixels: acc[0..7] ... */
+ LD_SW2(acc + 8, 4, acc2, acc3); /* ... and acc[8..15] */
+ LD_SH2(cnt, 8, cnt0, cnt1);
+
+ INSERT_D2_SB(f0, f1, frm1); /* pack two 8-byte rows into one 16-byte vector */
+ INSERT_D2_SB(f2, f3, frm3);
+ ILVRL_B2_UB(frm1, frm2, frm_r, frm_l);
+ HSUB_UB2_SH(frm_r, frm_l, diff0, diff1); /* per-pixel difference between the two frames, widened to 16 bits */
+ UNPCK_SH_SW(diff0, diff0_r, diff0_l);
+ UNPCK_SH_SW(diff1, diff1_r, diff1_l);
+ MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l, /* modifier = diff * diff ... */
+ mod0_w, mod1_w, mod2_w, mod3_w);
+ MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3, mod0_w, /* ... * 3 ... */
+ mod1_w, mod2_w, mod3_w);
+ SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength); /* ... shifted right by the strength (presumably with rounding, per SRAR) */
+
+ diff0_r = (mod0_w < cnst16); /* diff*_* are reused as lane masks: modifier < 16 */
+ diff0_l = (mod1_w < cnst16);
+ diff1_r = (mod2_w < cnst16);
+ diff1_l = (mod3_w < cnst16);
+
+ SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w, mod0_w, /* modifier = 16 - modifier */
+ mod1_w, mod2_w, mod3_w);
+
+ mod0_w = diff0_r & mod0_w; /* lanes whose modifier was >= 16 are forced to 0 */
+ mod1_w = diff0_l & mod1_w;
+ mod2_w = diff1_r & mod2_w;
+ mod3_w = diff1_l & mod3_w;
+
+ MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt, /* scale by the filter weight */
+ mod0_w, mod1_w, mod2_w, mod3_w);
+ PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
+ ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h); /* cnt += modifier */
+ ST_SH2(mod0_h, mod1_h, cnt, 8);
+ cnt += 16;
+
+ UNPCK_UB_SH(frm2, frm2_r, frm2_l);
+ UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl);
+ UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll);
+ MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll, /* modifier * frm2 pixel */
+ mod0_w, mod1_w, mod2_w, mod3_w);
+ ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w, /* acc += modifier * pixel */
+ mod2_w, mod3_w);
+
+ ST_SW2(mod0_w, mod1_w, acc, 4);
+ acc += 8;
+ ST_SW2(mod2_w, mod3_w, acc, 4);
+ acc += 8;
+
+ LD_SW2(acc, 4, acc0, acc1); /* second half of the iteration: same steps for frm3 / frm4 */
+ LD_SW2(acc + 8, 4, acc2, acc3);
+ LD_SH2(cnt, 8, cnt0, cnt1);
+
+ ILVRL_B2_UB(frm3, frm4, frm_r, frm_l);
+ HSUB_UB2_SH(frm_r, frm_l, diff0, diff1);
+ UNPCK_SH_SW(diff0, diff0_r, diff0_l);
+ UNPCK_SH_SW(diff1, diff1_r, diff1_l);
+ MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l,
+ mod0_w, mod1_w, mod2_w, mod3_w);
+ MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3, mod0_w,
+ mod1_w, mod2_w, mod3_w);
+ SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
+
+ diff0_r = (mod0_w < cnst16);
+ diff0_l = (mod1_w < cnst16);
+ diff1_r = (mod2_w < cnst16);
+ diff1_l = (mod3_w < cnst16);
+
+ SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w, mod0_w,
+ mod1_w, mod2_w, mod3_w);
+
+ mod0_w = diff0_r & mod0_w;
+ mod1_w = diff0_l & mod1_w;
+ mod2_w = diff1_r & mod2_w;
+ mod3_w = diff1_l & mod3_w;
+
+ MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt,
+ mod0_w, mod1_w, mod2_w, mod3_w);
+ PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
+ ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
+ ST_SH2(mod0_h, mod1_h, cnt, 8);
+ cnt += 16;
+ UNPCK_UB_SH(frm4, frm2_r, frm2_l);
+ UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl);
+ UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll);
+ MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll,
+ mod0_w, mod1_w, mod2_w, mod3_w);
+ ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w,
+ mod2_w, mod3_w);
+
+ ST_SW2(mod0_w, mod1_w, acc, 4);
+ acc += 8;
+ ST_SW2(mod2_w, mod3_w, acc, 4);
+ acc += 8;
+ }
+}
+
+static void temporal_filter_apply_16size_msa(uint8_t *frm1_ptr, uint32_t stride, /* MSA temporal-filter kernel: 8 iterations x 2 rows of 16 pixels, i.e. 256 frm2 bytes / acc entries / cnt entries in total */
+ uint8_t *frm2_ptr, /* frm2 is read as a packed buffer, 16 bytes per row */
+ int32_t filt_sth, int32_t filt_wgt, /* filt_sth is the shift amount, filt_wgt multiplies every modifier */
+ uint32_t *acc, uint16_t *cnt) { /* acc and cnt are read-modify-written in place */
+ uint32_t row;
+ v16i8 frm1, frm2, frm3, frm4;
+ v16u8 frm_r, frm_l;
+ v16i8 zero = { 0 };
+ v8u16 frm2_r, frm2_l;
+ v8i16 diff0, diff1, mod0_h, mod1_h;
+ v4i32 cnst3, cnst16, filt_wt, strength;
+ v4i32 mod0_w, mod1_w, mod2_w, mod3_w;
+ v4i32 diff0_r, diff0_l, diff1_r, diff1_l;
+ v4i32 frm2_rr, frm2_rl, frm2_lr, frm2_ll;
+ v4i32 acc0, acc1, acc2, acc3;
+ v8i16 cnt0, cnt1;
+
+ filt_wt = __msa_fill_w(filt_wgt);
+ strength = __msa_fill_w(filt_sth);
+ cnst3 = __msa_ldi_w(3);
+ cnst16 = __msa_ldi_w(16);
+
+ for (row = 8; row--;) { /* two rows per iteration: frm1/frm2 first, then frm3/frm4 */
+ LD_SB2(frm1_ptr, stride, frm1, frm3);
+ frm1_ptr += stride; /* advanced once here and once at the bottom of the loop */
+
+ LD_SB2(frm2_ptr, 16, frm2, frm4);
+ frm2_ptr += 16;
+
+ LD_SW2(acc, 4, acc0, acc1); /* current accumulators acc[0..7] ... */
+ LD_SW2(acc + 8, 4, acc2, acc3); /* ... and acc[8..15]; loading from acc here would add acc[0..7] into the entries stored at acc[8..15] below */
+ LD_SH2(cnt, 8, cnt0, cnt1);
+
+ ILVRL_B2_UB(frm1, frm2, frm_r, frm_l);
+ HSUB_UB2_SH(frm_r, frm_l, diff0, diff1); /* per-pixel difference between the two frames, widened to 16 bits */
+ UNPCK_SH_SW(diff0, diff0_r, diff0_l);
+ UNPCK_SH_SW(diff1, diff1_r, diff1_l);
+ MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l, /* modifier = diff * diff ... */
+ mod0_w, mod1_w, mod2_w, mod3_w);
+ MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3, mod0_w, /* ... * 3 ... */
+ mod1_w, mod2_w, mod3_w);
+ SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength); /* ... shifted right by the strength (presumably with rounding, per SRAR) */
+
+ diff0_r = (mod0_w < cnst16); /* diff*_* are reused as lane masks: modifier < 16 */
+ diff0_l = (mod1_w < cnst16);
+ diff1_r = (mod2_w < cnst16);
+ diff1_l = (mod3_w < cnst16);
+
+ SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w, mod0_w, /* modifier = 16 - modifier */
+ mod1_w, mod2_w, mod3_w);
+
+ mod0_w = diff0_r & mod0_w; /* lanes whose modifier was >= 16 are forced to 0 */
+ mod1_w = diff0_l & mod1_w;
+ mod2_w = diff1_r & mod2_w;
+ mod3_w = diff1_l & mod3_w;
+
+ MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt, /* scale by the filter weight */
+ mod0_w, mod1_w, mod2_w, mod3_w);
+ PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
+ ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h); /* cnt += modifier */
+ ST_SH2(mod0_h, mod1_h, cnt, 8);
+ cnt += 16;
+
+ ILVRL_B2_UH(zero, frm2, frm2_r, frm2_l); /* interleave with zero to widen the frm2 bytes to 16 bits */
+ UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl);
+ UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll);
+ MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll, /* modifier * frm2 pixel */
+ mod0_w, mod1_w, mod2_w, mod3_w);
+ ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w, /* acc += modifier * pixel */
+ mod2_w, mod3_w);
+
+ ST_SW2(mod0_w, mod1_w, acc, 4);
+ acc += 8;
+ ST_SW2(mod2_w, mod3_w, acc, 4);
+ acc += 8;
+
+ LD_SW2(acc, 4, acc0, acc1); /* second row of the iteration: same steps for frm3 / frm4 */
+ LD_SW2(acc + 8, 4, acc2, acc3);
+ LD_SH2(cnt, 8, cnt0, cnt1);
+
+ ILVRL_B2_UB(frm3, frm4, frm_r, frm_l);
+ HSUB_UB2_SH(frm_r, frm_l, diff0, diff1);
+ UNPCK_SH_SW(diff0, diff0_r, diff0_l);
+ UNPCK_SH_SW(diff1, diff1_r, diff1_l);
+ MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l,
+ mod0_w, mod1_w, mod2_w, mod3_w);
+ MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3, mod0_w,
+ mod1_w, mod2_w, mod3_w);
+ SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
+
+ diff0_r = (mod0_w < cnst16);
+ diff0_l = (mod1_w < cnst16);
+ diff1_r = (mod2_w < cnst16);
+ diff1_l = (mod3_w < cnst16);
+
+ SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w, mod0_w,
+ mod1_w, mod2_w, mod3_w);
+
+ mod0_w = diff0_r & mod0_w;
+ mod1_w = diff0_l & mod1_w;
+ mod2_w = diff1_r & mod2_w;
+ mod3_w = diff1_l & mod3_w;
+
+ MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt,
+ mod0_w, mod1_w, mod2_w, mod3_w);
+ PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
+ ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
+ ST_SH2(mod0_h, mod1_h, cnt, 8);
+ cnt += 16;
+
+ ILVRL_B2_UH(zero, frm4, frm2_r, frm2_l);
+ UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl);
+ UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll);
+ MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll,
+ mod0_w, mod1_w, mod2_w, mod3_w);
+ ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w,
+ mod2_w, mod3_w);
+ ST_SW2(mod0_w, mod1_w, acc, 4);
+ acc += 8;
+ ST_SW2(mod2_w, mod3_w, acc, 4);
+ acc += 8;
+
+ frm1_ptr += stride; /* second advance: skip the row already consumed as frm3 / frm4 */
+ frm2_ptr += 16;
+ }
+}
+
+/* Entry point for the MSA temporal filter: picks a vector kernel based on the
+ * block area and otherwise defers to the C implementation.
+ * NOTE(review): the selector is the product blk_w * blk_h compared against 8
+ * and 16, while the two kernels consume 64 and 256 pixels respectively;
+ * confirm whether a width/height comparison was intended. */
+void av1_temporal_filter_apply_msa(uint8_t *frame1_ptr, uint32_t stride,
+                                   uint8_t *frame2_ptr, uint32_t blk_w,
+                                   uint32_t blk_h, int32_t strength,
+                                   int32_t filt_wgt, uint32_t *accu,
+                                   uint16_t *cnt) {
+  switch (blk_w * blk_h) {
+    case 8:
+      temporal_filter_apply_8size_msa(frame1_ptr, stride, frame2_ptr, strength,
+                                      filt_wgt, accu, cnt);
+      break;
+    case 16:
+      temporal_filter_apply_16size_msa(frame1_ptr, stride, frame2_ptr,
+                                       strength, filt_wgt, accu, cnt);
+      break;
+    default:
+      av1_temporal_filter_apply_c(frame1_ptr, stride, frame2_ptr, blk_w, blk_h,
+                                  strength, filt_wgt, accu, cnt);
+      break;
+  }
+}
diff --git a/third_party/aom/av1/encoder/palette.c b/third_party/aom/av1/encoder/palette.c
new file mode 100644
index 0000000000..355141de55
--- /dev/null
+++ b/third_party/aom/av1/encoder/palette.c
@@ -0,0 +1,277 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+#include <stdlib.h>
+
+#include "av1/encoder/cost.h"
+#include "av1/encoder/palette.h"
+
+// Squared Euclidean distance between the 'dim'-dimensional points 'p1' and
+// 'p2' (no square root is taken).
+static float calc_dist(const float *p1, const float *p2, int dim) {
+  float sum_sq = 0;
+  int d = 0;
+  while (d < dim) {
+    const float delta = p1[d] - p2[d];
+    sum_sq += delta * delta;
+    ++d;
+  }
+  return sum_sq;
+}
+
+// For each of the 'n' points in 'data', stores in 'indices' the index of the
+// closest of the 'k' 'centroids' (squared distance via calc_dist()). Ties keep
+// the lowest centroid index because only a strictly smaller distance replaces
+// the current best.
+void av1_calc_indices(const float *data, const float *centroids,
+                      uint8_t *indices, int n, int k, int dim) {
+  int pt, c;
+  for (pt = 0; pt < n; ++pt) {
+    const float *const point = data + pt * dim;
+    float best_dist = calc_dist(point, centroids, dim);
+    int best = 0;
+    for (c = 1; c < k; ++c) {
+      const float cand_dist = calc_dist(point, centroids + c * dim, dim);
+      if (cand_dist < best_dist) {
+        best_dist = cand_dist;
+        best = c;
+      }
+    }
+    indices[pt] = best;
+  }
+}
+
+// Linear congruential generator: advances '*state' and returns a
+// pseudo-random value in the range [0, 32768) taken from bits 16..30 of the
+// new state.
+static unsigned int lcg_rand16(unsigned int *state) {
+  const unsigned long long next = *state * 1103515245ULL + 12345;
+  *state = (unsigned int)next;
+  return (*state >> 16) & 0x7fff;
+}
+
+// Recomputes the 'k' 'centroids' as the mean of the points currently assigned
+// to them by 'indices'. A centroid with no assigned points is re-seeded with a
+// pseudo-randomly chosen data point (the generator is seeded from data[0]).
+// All centroid components are rounded to the nearest integer at the end.
+static void calc_centroids(const float *data, float *centroids,
+                           const uint8_t *indices, int n, int k, int dim) {
+  int members[PALETTE_MAX_SIZE];
+  unsigned int seed = (unsigned int)data[0];
+  int pt, c, d;
+
+  assert(n <= 32768);
+
+  memset(members, 0, sizeof(members[0]) * k);
+  memset(centroids, 0, sizeof(centroids[0]) * k * dim);
+
+  // Accumulate per-cluster component sums and membership counts.
+  for (pt = 0; pt < n; ++pt) {
+    const int cluster = indices[pt];
+    assert(cluster < k);
+    ++members[cluster];
+    for (d = 0; d < dim; ++d) {
+      centroids[cluster * dim + d] += data[pt * dim + d];
+    }
+  }
+
+  // Turn sums into means, or re-seed clusters that ended up empty.
+  for (c = 0; c < k; ++c) {
+    float *const centroid = centroids + c * dim;
+    if (members[c] != 0) {
+      const float norm = 1.0f / members[c];
+      for (d = 0; d < dim; ++d) centroid[d] *= norm;
+    } else {
+      memcpy(centroid, data + (lcg_rand16(&seed) % n) * dim,
+             sizeof(centroids[0]) * dim);
+    }
+  }
+
+  // Round to nearest integers.
+  for (c = 0; c < k * dim; ++c) centroids[c] = roundf(centroids[c]);
+}
+
+// Sum, over all 'n' points, of the squared distance between each point and
+// the centroid it is assigned to in 'indices'. 'k' is accepted for signature
+// symmetry with the other helpers but is not used.
+static float calc_total_dist(const float *data, const float *centroids,
+                             const uint8_t *indices, int n, int k, int dim) {
+  float total = 0;
+  int pt = 0;
+  (void)k;
+
+  while (pt < n) {
+    const float *const point = data + pt * dim;
+    const float *const assigned = centroids + indices[pt] * dim;
+    total += calc_dist(point, assigned, dim);
+    ++pt;
+  }
+
+  return total;
+}
+
+// Runs at most 'max_itr' k-means refinement rounds on 'data', starting from
+// the caller-supplied 'centroids'. Each round recomputes the centroids and the
+// assignments; a round that increases the total distance is rolled back and
+// ends the search, and a round that leaves the centroids unchanged ends it as
+// converged.
+void av1_k_means(const float *data, float *centroids, uint8_t *indices, int n,
+                 int k, int dim, int max_itr) {
+  float saved_centroids[2 * PALETTE_MAX_SIZE];
+  uint8_t saved_indices[MAX_SB_SQUARE];
+  float cur_dist;
+  int itr;
+
+  av1_calc_indices(data, centroids, indices, n, k, dim);
+  cur_dist = calc_total_dist(data, centroids, indices, n, k, dim);
+
+  for (itr = 0; itr < max_itr; ++itr) {
+    const float prev_dist = cur_dist;
+
+    // Snapshot the current solution so a worse round can be undone.
+    memcpy(saved_centroids, centroids, sizeof(saved_centroids[0]) * k * dim);
+    memcpy(saved_indices, indices, sizeof(saved_indices[0]) * n);
+
+    calc_centroids(data, centroids, indices, n, k, dim);
+    av1_calc_indices(data, centroids, indices, n, k, dim);
+    cur_dist = calc_total_dist(data, centroids, indices, n, k, dim);
+
+    if (cur_dist > prev_dist) {
+      // The round made things worse: restore the snapshot and stop.
+      memcpy(centroids, saved_centroids, sizeof(saved_centroids[0]) * k * dim);
+      memcpy(indices, saved_indices, sizeof(saved_indices[0]) * n);
+      break;
+    }
+    if (memcmp(centroids, saved_centroids,
+               sizeof(saved_centroids[0]) * k * dim) == 0) {
+      break;  // Centroids did not move: converged.
+    }
+  }
+}
+
+// qsort() comparator that orders floats ascending; returns -1, 0 or 1.
+static int float_comparer(const void *a, const void *b) {
+  const float fa = *(const float *)a;
+  const float fb = *(const float *)b;
+  return (fa > fb) - (fa < fb);
+}
+
+// Sorts 'centroids' in ascending order and compacts it in place so that the
+// distinct values occupy the leading entries. Returns the number of distinct
+// values; a list with no entries (num_centroids <= 0) yields 0 and the array
+// is left untouched.
+int av1_remove_duplicates(float *centroids, int num_centroids) {
+  int num_unique;  // number of unique centroids
+  int i;
+  // Without this guard an empty list would be reported as holding one unique
+  // centroid, because the count below starts at 1.
+  if (num_centroids <= 0) return 0;
+  qsort(centroids, num_centroids, sizeof(*centroids), float_comparer);
+  // Remove duplicates.
+  num_unique = 1;
+  for (i = 1; i < num_centroids; ++i) {
+    if (centroids[i] != centroids[i - 1]) {  // found a new unique centroid
+      centroids[num_unique++] = centroids[i];
+    }
+  }
+  return num_unique;
+}
+
+// Counts how many distinct 8-bit values occur in the 'rows' x 'cols' region of
+// 'src', whose consecutive rows are 'stride' bytes apart.
+int av1_count_colors(const uint8_t *src, int stride, int rows, int cols) {
+  int histogram[256] = { 0 };
+  int distinct = 0;
+  int r, c, v;
+
+  for (r = 0; r < rows; ++r) {
+    const int row_start = r * stride;
+    for (c = 0; c < cols; ++c) ++histogram[src[row_start + c]];
+  }
+
+  for (v = 0; v < 256; ++v) {
+    if (histogram[v] != 0) ++distinct;
+  }
+
+  return distinct;
+}
+
+#if CONFIG_PALETTE_DELTA_ENCODING
+// Luma palette: sets '*min_bits' to bit_depth - 3 and returns the larger of
+// that and av1_ceil_log2() of the biggest gap between consecutive palette
+// colors. Consecutive luma colors are asserted to be strictly increasing.
+int av1_get_palette_delta_bits_y(const PALETTE_MODE_INFO *const pmi,
+                                 int bit_depth, int *min_bits) {
+  const int num_colors = pmi->palette_size[0];
+  int largest_gap = 0;
+  int idx;
+  *min_bits = bit_depth - 3;
+  for (idx = 1; idx < num_colors; ++idx) {
+    const int gap = pmi->palette_colors[idx] - pmi->palette_colors[idx - 1];
+    assert(gap > 0);
+    largest_gap = (gap > largest_gap) ? gap : largest_gap;
+  }
+  return AOMMAX(av1_ceil_log2(largest_gap), *min_bits);
+}
+
+// U palette: sets '*min_bits' to bit_depth - 3 and returns the larger of that
+// and av1_ceil_log2(largest gap + 1). The U colors start at offset
+// PALETTE_MAX_SIZE in 'palette_colors' and are asserted to be non-decreasing.
+int av1_get_palette_delta_bits_u(const PALETTE_MODE_INFO *const pmi,
+                                 int bit_depth, int *min_bits) {
+  const int num_colors = pmi->palette_size[1];
+  const int base = PALETTE_MAX_SIZE;
+  int largest_gap = 0;
+  int idx;
+  *min_bits = bit_depth - 3;
+  for (idx = 1; idx < num_colors; ++idx) {
+    const int gap =
+        pmi->palette_colors[base + idx] - pmi->palette_colors[base + idx - 1];
+    assert(gap >= 0);
+    largest_gap = (gap > largest_gap) ? gap : largest_gap;
+  }
+  return AOMMAX(av1_ceil_log2(largest_gap + 1), *min_bits);
+}
+
+// V palette: sets '*min_bits' to bit_depth - 4, counts in '*zero_count' the
+// consecutive-color deltas that are 0, and returns the larger of '*min_bits'
+// and av1_ceil_log2(largest delta + 1). Each delta is the smaller of |diff|
+// and (1 << bit_depth) - |diff|, i.e. it is measured with wrap-around. The V
+// colors start at offset 2 * PALETTE_MAX_SIZE in 'palette_colors'.
+int av1_get_palette_delta_bits_v(const PALETTE_MODE_INFO *const pmi,
+                                 int bit_depth, int *zero_count,
+                                 int *min_bits) {
+  const int num_colors = pmi->palette_size[1];
+  const int wrap = 1 << bit_depth;
+  const int base = 2 * PALETTE_MAX_SIZE;
+  int largest = 0;
+  int idx;
+  *min_bits = bit_depth - 4;
+  *zero_count = 0;
+  for (idx = 1; idx < num_colors; ++idx) {
+    const int diff =
+        pmi->palette_colors[base + idx] - pmi->palette_colors[base + idx - 1];
+    const int magnitude = abs(diff);
+    const int wrapped = AOMMIN(magnitude, wrap - magnitude);
+    if (wrapped > largest) largest = wrapped;
+    if (wrapped == 0) ++(*zero_count);
+  }
+  return AOMMAX(av1_ceil_log2(largest + 1), *min_bits);
+}
+#endif // CONFIG_PALETTE_DELTA_ENCODING
+
+// Rate cost of signalling the luma palette colors: the number of bits needed
+// times av1_cost_bit(128, 0). With delta encoding the bit count is
+// 2 + bit_depth for the header/first color plus 'bits' per remaining color;
+// otherwise every color costs 'bit_depth' bits.
+int av1_palette_color_cost_y(const PALETTE_MODE_INFO *const pmi,
+                             int bit_depth) {
+  const int n = pmi->palette_size[0];
+#if CONFIG_PALETTE_DELTA_ENCODING
+  int min_bits = 0;
+  const int delta_bits =
+      av1_get_palette_delta_bits_y(pmi, bit_depth, &min_bits);
+  const int total_bits = 2 + bit_depth + delta_bits * (n - 1);
+#else
+  const int total_bits = bit_depth * n;
+#endif  // CONFIG_PALETTE_DELTA_ENCODING
+  return av1_cost_bit(128, 0) * total_bits;
+}
+
+// Rate cost of signalling the chroma palette colors, in units of
+// av1_cost_bit(128, 0) per bit. With delta encoding, U always uses deltas and
+// V uses whichever of delta or raw coding needs fewer bits, plus one bit;
+// without it, both channels cost 'bit_depth' bits per color.
+int av1_palette_color_cost_uv(const PALETTE_MODE_INFO *const pmi,
+                              int bit_depth) {
+  const int n = pmi->palette_size[1];
+#if CONFIG_PALETTE_DELTA_ENCODING
+  int min_bits_u = 0, min_bits_v = 0, zero_count = 0;
+  // U channel palette color cost.
+  const int bits_u = av1_get_palette_delta_bits_u(pmi, bit_depth, &min_bits_u);
+  const int total_bits_u = 2 + bit_depth + bits_u * (n - 1);
+  // V channel palette color cost.
+  const int bits_v =
+      av1_get_palette_delta_bits_v(pmi, bit_depth, &zero_count, &min_bits_v);
+  const int v_bits_delta = 2 + bit_depth + (bits_v + 1) * (n - 1) - zero_count;
+  const int v_bits_raw = bit_depth * n;
+  const int total_bits_v = 1 + AOMMIN(v_bits_delta, v_bits_raw);
+  return av1_cost_bit(128, 0) * total_bits_u +
+         av1_cost_bit(128, 0) * total_bits_v;
+#else
+  return 2 * bit_depth * n * av1_cost_bit(128, 0);
+#endif  // CONFIG_PALETTE_DELTA_ENCODING
+}
+
+#if CONFIG_HIGHBITDEPTH
+// High-bitdepth counterpart of av1_count_colors(): 'src8' is converted with
+// CONVERT_TO_SHORTPTR() and read as 16-bit samples. Returns how many distinct
+// values below (1 << bit_depth) occur in the 'rows' x 'cols' region;
+// 'bit_depth' is asserted to be at most 12.
+int av1_count_colors_highbd(const uint8_t *src8, int stride, int rows, int cols,
+                            int bit_depth) {
+  const uint16_t *const samples = CONVERT_TO_SHORTPTR(src8);
+  int histogram[1 << 12];
+  int distinct = 0;
+  int r, c, v;
+
+  assert(bit_depth <= 12);
+  memset(histogram, 0, sizeof(histogram));
+
+  for (r = 0; r < rows; ++r) {
+    const int row_start = r * stride;
+    for (c = 0; c < cols; ++c) ++histogram[samples[row_start + c]];
+  }
+
+  for (v = 0; v < (1 << bit_depth); ++v) {
+    if (histogram[v] != 0) ++distinct;
+  }
+
+  return distinct;
+}
+#endif // CONFIG_HIGHBITDEPTH
diff --git a/third_party/aom/av1/encoder/palette.h b/third_party/aom/av1/encoder/palette.h
new file mode 100644
index 0000000000..5403ac5e60
--- /dev/null
+++ b/third_party/aom/av1/encoder/palette.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_ENCODER_PALETTE_H_
+#define AV1_ENCODER_PALETTE_H_
+
+#include "av1/common/blockd.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Given 'n' 'data' points and 'k' 'centroids' each of dimension 'dim',
+// calculate the centroid 'indices' for the data points.
+void av1_calc_indices(const float *data, const float *centroids,
+ uint8_t *indices, int n, int k, int dim);
+
+// Given 'n' 'data' points and an initial guess of 'k' 'centroids' each of
+// dimension 'dim', runs up to 'max_itr' iterations of k-means algorithm to get
+// updated 'centroids' and the centroid 'indices' for elements in 'data'.
+// Note: the output centroids are rounded off to nearest integers.
+void av1_k_means(const float *data, float *centroids, uint8_t *indices, int n,
+ int k, int dim, int max_itr);
+
+// Given a list of centroids, returns the number of unique centroids 'k', and
+// puts these unique centroids in the first 'k' entries of the 'centroids'
+// array. Ideally, the centroids should be rounded to integers before calling
+// this method.
+int av1_remove_duplicates(float *centroids, int num_centroids);
+
+// Returns the number of colors in 'src'.
+int av1_count_colors(const uint8_t *src, int stride, int rows, int cols);
+#if CONFIG_HIGHBITDEPTH
+// Same as av1_count_colors(), but for high-bitdepth mode.
+int av1_count_colors_highbd(const uint8_t *src8, int stride, int rows, int cols,
+ int bit_depth);
+#endif // CONFIG_HIGHBITDEPTH
+
+#if CONFIG_PALETTE_DELTA_ENCODING
+// Return the number of bits used to transmit each luma palette color delta.
+int av1_get_palette_delta_bits_y(const PALETTE_MODE_INFO *const pmi,
+ int bit_depth, int *min_bits);
+
+// Return the number of bits used to transmit each U palette color delta.
+int av1_get_palette_delta_bits_u(const PALETTE_MODE_INFO *const pmi,
+ int bit_depth, int *min_bits);
+
+// Return the number of bits used to transmit each V palette color delta;
+// sets 'zero_count' to the number of deltas that are equal to 0.
+int av1_get_palette_delta_bits_v(const PALETTE_MODE_INFO *const pmi,
+ int bit_depth, int *zero_count, int *min_bits);
+#endif // CONFIG_PALETTE_DELTA_ENCODING
+
+// Return the rate cost for transmitting luma palette color values.
+int av1_palette_color_cost_y(const PALETTE_MODE_INFO *const pmi, int bit_depth);
+
+// Return the rate cost for transmitting chroma palette color values.
+int av1_palette_color_cost_uv(const PALETTE_MODE_INFO *const pmi,
+ int bit_depth);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif /* AV1_ENCODER_PALETTE_H_ */
diff --git a/third_party/aom/av1/encoder/pickcdef.c b/third_party/aom/av1/encoder/pickcdef.c
new file mode 100644
index 0000000000..da64fb48d6
--- /dev/null
+++ b/third_party/aom/av1/encoder/pickcdef.c
@@ -0,0 +1,490 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+#include <string.h>
+
+#include "./aom_scale_rtcd.h"
+#include "aom/aom_integer.h"
+#include "av1/common/cdef.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/common/reconinter.h"
+#include "av1/encoder/encoder.h"
+
+#define TOTAL_STRENGTHS (DERING_STRENGTHS * CLPF_STRENGTHS)
+
+/* Picks the one strength that, added to the nb_strengths entries already
+   stored in lev[], yields the lowest total error over all superblocks.
+   mse[sb][s] is the error of superblock sb under strength index s. The
+   winning index is written to lev[nb_strengths] and the total is returned. */
+static uint64_t search_one(int *lev, int nb_strengths,
+                           uint64_t mse[][TOTAL_STRENGTHS], int sb_count) {
+  uint64_t total[TOTAL_STRENGTHS];
+  uint64_t lowest_total = (uint64_t)1 << 63;
+  int winner = 0;
+  int sb, s;
+  memset(total, 0, sizeof(total));
+  for (sb = 0; sb < sb_count; sb++) {
+    const uint64_t *const sb_mse = mse[sb];
+    uint64_t base_mse = (uint64_t)1 << 63;
+    /* Error this superblock already reaches with the chosen strengths. */
+    for (s = 0; s < nb_strengths; s++) {
+      const uint64_t chosen = sb_mse[lev[s]];
+      if (chosen < base_mse) base_mse = chosen;
+    }
+    /* A new option can only help: add min(existing best, candidate). */
+    for (s = 0; s < TOTAL_STRENGTHS; s++) {
+      total[s] += (sb_mse[s] < base_mse) ? sb_mse[s] : base_mse;
+    }
+  }
+  /* Strict '<' keeps the lowest index when totals tie. */
+  for (s = 0; s < TOTAL_STRENGTHS; s++) {
+    if (total[s] < lowest_total) {
+      lowest_total = total[s];
+      winner = s;
+    }
+  }
+  lev[nb_strengths] = winner;
+  return lowest_total;
+}
+
+/* Luma+chroma variant of search_one(): picks the (luma, chroma) strength
+   pair that, added to the nb_strengths pairs already stored in
+   lev0[]/lev1[], yields the lowest summed error over all superblocks.
+   mse[0] holds the first error table and mse[1] the second; a pair's error
+   is the sum of its two entries. The winner is written to
+   lev0[nb_strengths] / lev1[nb_strengths] and the total is returned. */
+static uint64_t search_one_dual(int *lev0, int *lev1, int nb_strengths,
+                                uint64_t (**mse)[TOTAL_STRENGTHS],
+                                int sb_count) {
+  uint64_t total[TOTAL_STRENGTHS][TOTAL_STRENGTHS];
+  uint64_t lowest_total = (uint64_t)1 << 63;
+  int winner0 = 0;
+  int winner1 = 0;
+  int sb, s0, s1;
+  memset(total, 0, sizeof(total));
+  for (sb = 0; sb < sb_count; sb++) {
+    const uint64_t *const mse0 = mse[0][sb];
+    const uint64_t *const mse1 = mse[1][sb];
+    uint64_t base_mse = (uint64_t)1 << 63;
+    int g;
+    /* Error this superblock already reaches with the chosen pairs. */
+    for (g = 0; g < nb_strengths; g++) {
+      const uint64_t chosen = mse0[lev0[g]] + mse1[lev1[g]];
+      if (chosen < base_mse) base_mse = chosen;
+    }
+    /* Add min(existing best, candidate pair) for every possible pair. */
+    for (s0 = 0; s0 < TOTAL_STRENGTHS; s0++) {
+      for (s1 = 0; s1 < TOTAL_STRENGTHS; s1++) {
+        const uint64_t pair = mse0[s0] + mse1[s1];
+        total[s0][s1] += (pair < base_mse) ? pair : base_mse;
+      }
+    }
+  }
+  /* Strict '<' keeps the first pair in scan order when totals tie. */
+  for (s0 = 0; s0 < TOTAL_STRENGTHS; s0++) {
+    for (s1 = 0; s1 < TOTAL_STRENGTHS; s1++) {
+      if (total[s0][s1] < lowest_total) {
+        lowest_total = total[s0][s1];
+        winner0 = s0;
+        winner1 = s1;
+      }
+    }
+  }
+  lev0[nb_strengths] = winner0;
+  lev1[nb_strengths] = winner1;
+  return lowest_total;
+}
+
+/* Chooses nb_strengths strength indices into best_lev[] that minimize the
+   total error, and returns that total (1 << 63 if nb_strengths <= 0). */
+static uint64_t joint_strength_search(int *best_lev, int nb_strengths,
+                                      uint64_t mse[][TOTAL_STRENGTHS],
+                                      int sb_count) {
+  uint64_t total = (uint64_t)1 << 63;
+  int n, pass, k;
+  /* Greedy phase: grow the set one strength at a time. */
+  for (n = 0; n < nb_strengths; n++) {
+    total = search_one(best_lev, n, mse, sb_count);
+  }
+  /* Refinement phase: drop the oldest entry, shift the rest down and
+     re-pick the last slot; repeated 4 * nb_strengths times. */
+  for (pass = 4 * nb_strengths; pass > 0; pass--) {
+    for (k = 1; k < nb_strengths; k++) best_lev[k - 1] = best_lev[k];
+    total = search_one(best_lev, nb_strengths - 1, mse, sb_count);
+  }
+  return total;
+}
+
+/* Chooses nb_strengths (luma, chroma) strength pairs into
+   best_lev0[]/best_lev1[] that minimize the summed error, and returns that
+   total (1 << 63 if nb_strengths <= 0). */
+static uint64_t joint_strength_search_dual(int *best_lev0, int *best_lev1,
+                                           int nb_strengths,
+                                           uint64_t (**mse)[TOTAL_STRENGTHS],
+                                           int sb_count) {
+  uint64_t total = (uint64_t)1 << 63;
+  int n, pass, k;
+  /* Greedy phase: grow the set one pair at a time. */
+  for (n = 0; n < nb_strengths; n++) {
+    total = search_one_dual(best_lev0, best_lev1, n, mse, sb_count);
+  }
+  /* Refinement phase: drop the oldest pair, shift the rest down and
+     re-pick the last slot; repeated 4 * nb_strengths times. */
+  for (pass = 4 * nb_strengths; pass > 0; pass--) {
+    for (k = 1; k < nb_strengths; k++) {
+      best_lev0[k - 1] = best_lev0[k];
+      best_lev1[k - 1] = best_lev1[k];
+    }
+    total =
+        search_one_dual(best_lev0, best_lev1, nb_strengths - 1, mse, sb_count);
+  }
+  return total;
+}
+
+/* FIXME: SSE-optimize this. */
+/* Copies a vsize x hsize rectangle of 16-bit samples. The source rectangle
+   starts at row src_voffset, column src_hoffset of `src` (row pitch
+   sstride); it is written to the top-left of `dst` (row pitch dstride). */
+static void copy_sb16_16(uint16_t *dst, int dstride, const uint16_t *src,
+                         int src_voffset, int src_hoffset, int sstride,
+                         int vsize, int hsize) {
+  const uint16_t *const origin = src + (src_voffset * sstride + src_hoffset);
+  int row;
+  for (row = 0; row < vsize; row++) {
+    uint16_t *const out = dst + row * dstride;
+    const uint16_t *const in = origin + row * sstride;
+    int col;
+    for (col = 0; col < hsize; col++) out[col] = in[col];
+  }
+}
+
+/* Variance-weighted distortion between two 8x8 blocks of 16-bit samples.
+ *
+ * Accumulates the sums, sums of squares and cross product of both blocks,
+ * derives svar and dvar (each 64 times the block variance), and returns
+ *   SSE * 0.5 * (svar + dvar + (400 << 2 * coeff_shift)) /
+ *       sqrt((20000 << 4 * coeff_shift) + svar * dvar)
+ * rounded to the nearest integer, where SSE = sum_d2 + sum_s2 - 2 * sum_sd.
+ *
+ * Samples are widened to 64 bits before multiplying: uint16_t operands are
+ * promoted to int, and an int product overflows (undefined behaviour) once
+ * both factors reach 46341. */
+static INLINE uint64_t dist_8x8_16bit(uint16_t *dst, int dstride, uint16_t *src,
+                                      int sstride, int coeff_shift) {
+  uint64_t svar = 0;
+  uint64_t dvar = 0;
+  uint64_t sum_s = 0;
+  uint64_t sum_d = 0;
+  uint64_t sum_s2 = 0;
+  uint64_t sum_d2 = 0;
+  uint64_t sum_sd = 0;
+  int i, j;
+  for (i = 0; i < 8; i++) {
+    for (j = 0; j < 8; j++) {
+      const uint64_t s = src[i * sstride + j];
+      const uint64_t d = dst[i * dstride + j];
+      sum_s += s;
+      sum_d += d;
+      sum_s2 += s * s;
+      sum_d2 += d * d;
+      sum_sd += s * d;
+    }
+  }
+  /* Compute the variance -- the calculation cannot go negative. */
+  svar = sum_s2 - ((sum_s * sum_s + 32) >> 6);
+  dvar = sum_d2 - ((sum_d * sum_d + 32) >> 6);
+  return (uint64_t)floor(
+      .5 +
+      (sum_d2 + sum_s2 - 2 * sum_sd) * .5 *
+          (svar + dvar + (400 << 2 * coeff_shift)) /
+          (sqrt((20000 << 4 * coeff_shift) + svar * (double)dvar)));
+}
+
+/* Returns the sum (not the mean) of squared differences between two 8x8
+   blocks of 16-bit samples. The difference is squared in 64-bit arithmetic:
+   with full-range uint16_t inputs |e| can reach 65535 and e * e would
+   overflow int. */
+static INLINE uint64_t mse_8x8_16bit(uint16_t *dst, int dstride, uint16_t *src,
+                                     int sstride) {
+  uint64_t sum = 0;
+  int i, j;
+  for (i = 0; i < 8; i++) {
+    for (j = 0; j < 8; j++) {
+      const int64_t e =
+          (int64_t)dst[i * dstride + j] - (int64_t)src[i * sstride + j];
+      sum += (uint64_t)(e * e);
+    }
+  }
+  return sum;
+}
+
+/* Returns the sum (not the mean) of squared differences between two 4x4
+   blocks of 16-bit samples. The difference is squared in 64-bit arithmetic:
+   with full-range uint16_t inputs |e| can reach 65535 and e * e would
+   overflow int. */
+static INLINE uint64_t mse_4x4_16bit(uint16_t *dst, int dstride, uint16_t *src,
+                                     int sstride) {
+  uint64_t sum = 0;
+  int i, j;
+  for (i = 0; i < 4; i++) {
+    for (j = 0; j < 4; j++) {
+      const int64_t e =
+          (int64_t)dst[i * dstride + j] - (int64_t)src[i * sstride + j];
+      sum += (uint64_t)(e * e);
+    }
+  }
+  return sum;
+}
+
+/* Sums the distortion over the dering_count blocks listed in dlist[].
+   `dst` is a frame-layout buffer (row pitch dstride) indexed by each entry's
+   (by, bx); `src` holds the same blocks packed back to back in list order.
+   Plane 0 with 8x8 blocks uses dist_8x8_16bit(); everything else uses plain
+   squared error. The total is scaled down by 2 * coeff_shift bits. */
+uint64_t compute_dering_dist(uint16_t *dst, int dstride, uint16_t *src,
+                             dering_list *dlist, int dering_count,
+                             BLOCK_SIZE bsize, int coeff_shift, int pli) {
+  uint64_t total = 0;
+  int n;
+  switch (bsize) {
+    case BLOCK_8X8:
+      for (n = 0; n < dering_count; n++) {
+        const int by = dlist[n].by;
+        const int bx = dlist[n].bx;
+        uint16_t *const d = &dst[(by << 3) * dstride + (bx << 3)];
+        uint16_t *const s = &src[n << (3 + 3)];
+        if (pli == 0) {
+          total += dist_8x8_16bit(d, dstride, s, 8, coeff_shift);
+        } else {
+          total += mse_8x8_16bit(d, dstride, s, 8);
+        }
+      }
+      break;
+    case BLOCK_4X8:
+      /* 4 wide, 8 tall: two stacked 4x4 halves. */
+      for (n = 0; n < dering_count; n++) {
+        const int by = dlist[n].by;
+        const int bx = dlist[n].bx;
+        total += mse_4x4_16bit(&dst[(by << 3) * dstride + (bx << 2)], dstride,
+                               &src[n << (3 + 2)], 4);
+        total += mse_4x4_16bit(&dst[((by << 3) + 4) * dstride + (bx << 2)],
+                               dstride, &src[(n << (3 + 2)) + 4 * 4], 4);
+      }
+      break;
+    case BLOCK_8X4:
+      /* 8 wide, 4 tall: two side-by-side 4x4 halves. */
+      for (n = 0; n < dering_count; n++) {
+        const int by = dlist[n].by;
+        const int bx = dlist[n].bx;
+        total += mse_4x4_16bit(&dst[(by << 2) * dstride + (bx << 3)], dstride,
+                               &src[n << (2 + 3)], 8);
+        total += mse_4x4_16bit(&dst[(by << 2) * dstride + (bx << 3) + 4],
+                               dstride, &src[(n << (2 + 3)) + 4], 8);
+      }
+      break;
+    default:
+      assert(bsize == BLOCK_4X4);
+      for (n = 0; n < dering_count; n++) {
+        const int by = dlist[n].by;
+        const int bx = dlist[n].bx;
+        total += mse_4x4_16bit(&dst[(by << 2) * dstride + (bx << 2)], dstride,
+                               &src[n << (2 + 2)], 4);
+      }
+      break;
+  }
+  return total >> 2 * coeff_shift;
+}
+/* Encoder-side CDEF strength search.
+ *
+ * The pixels of `frame` and `ref` are first copied into 16-bit per-plane
+ * buffers. For every superblock that sb_all_skip() does not reject, each of
+ * the TOTAL_STRENGTHS strength indices is applied with od_dering() and the
+ * result is compared against `ref` with compute_dering_dist(). Plane 0
+ * errors go to mse[0]; plane 1 and plane 2 errors are summed into mse[1].
+ * The function then tries 1, 2, 4 and 8 signalled strength pairs, keeps the
+ * count with the lowest (error + lambda-weighted signalling cost), and
+ * writes cm->cdef_strengths[], cm->cdef_uv_strengths[], cm->cdef_bits,
+ * cm->nb_cdef_strengths, cm->cdef_dering_damping, cm->cdef_clpf_damping and
+ * the per-superblock mbmi.cdef_strength index.
+ *
+ * NOTE(review): none of the aom_malloc()/aom_memalign() results are checked
+ * before use.
+ * NOTE(review): the switch on pli has no default; pli only takes 0..2 here,
+ * but a compiler may still warn that ref_buffer/ref_stride can be unset.
+ */
+void av1_cdef_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref,
+ AV1_COMMON *cm, MACROBLOCKD *xd) {
+ int r, c;
+ int sbr, sbc;
+ uint16_t *src[3]; /* Per-plane 16-bit copy of the pixels of `frame`. */
+ uint16_t *ref_coeff[3]; /* Per-plane 16-bit copy of the pixels of `ref`. */
+ dering_list dlist[MAX_MIB_SIZE * MAX_MIB_SIZE];
+ int dir[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS] = { { 0 } };
+ int var[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS] = { { 0 } };
+ int stride[3];
+ int bsize[3];
+ int mi_wide_l2[3];
+ int mi_high_l2[3];
+ int xdec[3];
+ int ydec[3];
+ int pli;
+ int dering_count;
+ int coeff_shift = AOMMAX(cm->bit_depth - 8, 0);
+ uint64_t best_tot_mse = (uint64_t)1 << 63;
+ uint64_t tot_mse;
+ int sb_count;
+ int nvsb = (cm->mi_rows + MAX_MIB_SIZE - 1) / MAX_MIB_SIZE; /* Superblock rows, rounded up. */
+ int nhsb = (cm->mi_cols + MAX_MIB_SIZE - 1) / MAX_MIB_SIZE; /* Superblock columns, rounded up. */
+ int *sb_index = aom_malloc(nvsb * nhsb * sizeof(*sb_index)); /* mi_grid_visible offset of each counted superblock. */
+ int *selected_strength = aom_malloc(nvsb * nhsb * sizeof(*sb_index)); /* NOTE(review): written below but never read in this
+ function; also sized with sizeof(*sb_index), which happens to be the same
+ element type. */
+ uint64_t(*mse[2])[TOTAL_STRENGTHS]; /* mse[0]: plane 0 errors; mse[1]: plane 1 + plane 2 errors. */
+ int clpf_damping = 3 + (cm->base_qindex >> 6);
+ int dering_damping = 6;
+ int i;
+ int nb_strengths;
+ int nb_strength_bits;
+ int quantizer;
+ double lambda;
+ int nplanes = 3;
+ DECLARE_ALIGNED(32, uint16_t, inbuf[OD_DERING_INBUF_SIZE]);
+ uint16_t *in;
+ DECLARE_ALIGNED(32, uint16_t, tmp_dst[MAX_SB_SQUARE]);
+ int chroma_dering =
+ xd->plane[1].subsampling_x == xd->plane[1].subsampling_y &&
+ xd->plane[2].subsampling_x == xd->plane[2].subsampling_y; /* True only when each chroma plane is subsampled equally in
+ x and y; otherwise the chroma threshold is forced to 0 below. */
+ quantizer =
+ av1_ac_quant(cm->base_qindex, 0, cm->bit_depth) >> (cm->bit_depth - 8);
+ lambda = .12 * quantizer * quantizer / 256.; /* Weight applied to the signalling-bit counts below. */
+
+ av1_setup_dst_planes(xd->plane, cm->sb_size, frame, 0, 0);
+ mse[0] = aom_malloc(sizeof(**mse) * nvsb * nhsb);
+ mse[1] = aom_malloc(sizeof(**mse) * nvsb * nhsb);
+ for (pli = 0; pli < nplanes; pli++) {
+ uint8_t *ref_buffer;
+ int ref_stride;
+ switch (pli) {
+ case 0:
+ ref_buffer = ref->y_buffer;
+ ref_stride = ref->y_stride;
+ break;
+ case 1:
+ ref_buffer = ref->u_buffer;
+ ref_stride = ref->uv_stride;
+ break;
+ case 2:
+ ref_buffer = ref->v_buffer;
+ ref_stride = ref->uv_stride;
+ break;
+ }
+ src[pli] = aom_memalign(
+ 32, sizeof(*src) * cm->mi_rows * cm->mi_cols * MI_SIZE * MI_SIZE); /* NOTE(review): *src is a uint16_t pointer, so
+ sizeof(*src) is a pointer size while the elements stored are uint16_t;
+ this looks like an over-allocation -- confirm before changing. */
+ ref_coeff[pli] = aom_memalign(
+ 32, sizeof(*ref_coeff) * cm->mi_rows * cm->mi_cols * MI_SIZE * MI_SIZE); /* NOTE(review): same pointer-size sizeof as above. */
+ xdec[pli] = xd->plane[pli].subsampling_x;
+ ydec[pli] = xd->plane[pli].subsampling_y;
+ bsize[pli] = ydec[pli] ? (xdec[pli] ? BLOCK_4X4 : BLOCK_8X4)
+ : (xdec[pli] ? BLOCK_4X8 : BLOCK_8X8);
+ stride[pli] = cm->mi_cols << MI_SIZE_LOG2; /* Every plane uses the same, unsubsampled row pitch. */
+ mi_wide_l2[pli] = MI_SIZE_LOG2 - xd->plane[pli].subsampling_x;
+ mi_high_l2[pli] = MI_SIZE_LOG2 - xd->plane[pli].subsampling_y;
+
+ const int frame_height =
+ (cm->mi_rows * MI_SIZE) >> xd->plane[pli].subsampling_y;
+ const int frame_width =
+ (cm->mi_cols * MI_SIZE) >> xd->plane[pli].subsampling_x;
+
+ for (r = 0; r < frame_height; ++r) {
+ for (c = 0; c < frame_width; ++c) {
+#if CONFIG_HIGHBITDEPTH
+ if (cm->use_highbitdepth) {
+ src[pli][r * stride[pli] + c] = CONVERT_TO_SHORTPTR(
+ xd->plane[pli].dst.buf)[r * xd->plane[pli].dst.stride + c];
+ ref_coeff[pli][r * stride[pli] + c] =
+ CONVERT_TO_SHORTPTR(ref_buffer)[r * ref_stride + c];
+ } else {
+#endif
+ src[pli][r * stride[pli] + c] =
+ xd->plane[pli].dst.buf[r * xd->plane[pli].dst.stride + c];
+ ref_coeff[pli][r * stride[pli] + c] = ref_buffer[r * ref_stride + c];
+#if CONFIG_HIGHBITDEPTH
+ }
+#endif
+ }
+ }
+ }
+ in = inbuf + OD_FILT_VBORDER * OD_FILT_BSTRIDE + OD_FILT_HBORDER; /* First pixel inside the border of inbuf. */
+ sb_count = 0; /* Counts non-skipped superblocks; also the row index into mse[][] and sb_index[]. */
+ for (sbr = 0; sbr < nvsb; ++sbr) {
+ for (sbc = 0; sbc < nhsb; ++sbc) {
+ int nvb, nhb;
+ int gi;
+ int dirinit = 0;
+ nhb = AOMMIN(MAX_MIB_SIZE, cm->mi_cols - MAX_MIB_SIZE * sbc);
+ nvb = AOMMIN(MAX_MIB_SIZE, cm->mi_rows - MAX_MIB_SIZE * sbr);
+ cm->mi_grid_visible[MAX_MIB_SIZE * sbr * cm->mi_stride +
+ MAX_MIB_SIZE * sbc]
+ ->mbmi.cdef_strength = -1; /* Skipped superblocks keep this -1; the others are overwritten at the end. */
+ if (sb_all_skip(cm, sbr * MAX_MIB_SIZE, sbc * MAX_MIB_SIZE)) continue;
+ dering_count = sb_compute_dering_list(cm, sbr * MAX_MIB_SIZE,
+ sbc * MAX_MIB_SIZE, dlist, 1);
+ for (pli = 0; pli < nplanes; pli++) {
+ for (i = 0; i < OD_DERING_INBUF_SIZE; i++)
+ inbuf[i] = OD_DERING_VERY_LARGE;
+ for (gi = 0; gi < TOTAL_STRENGTHS; gi++) {
+ int threshold;
+ uint64_t curr_mse;
+ int clpf_strength;
+ threshold = gi / CLPF_STRENGTHS; /* gi encodes threshold * CLPF_STRENGTHS + clpf_strength. */
+ if (pli > 0 && !chroma_dering) threshold = 0;
+ /* We avoid filtering the pixels for which some of the pixels to
+ average
+ are outside the frame. We could change the filter instead, but it
+ would add special cases for any future vectorization. */
+ int yoff = OD_FILT_VBORDER * (sbr != 0);
+ int xoff = OD_FILT_HBORDER * (sbc != 0);
+ int ysize = (nvb << mi_high_l2[pli]) +
+ OD_FILT_VBORDER * (sbr != nvsb - 1) + yoff;
+ int xsize = (nhb << mi_wide_l2[pli]) +
+ OD_FILT_HBORDER * (sbc != nhsb - 1) + xoff;
+ clpf_strength = gi % CLPF_STRENGTHS;
+ if (clpf_strength == 0) /* The input is reloaded from src[] only at the start of each threshold group. */
+ copy_sb16_16(&in[(-yoff * OD_FILT_BSTRIDE - xoff)], OD_FILT_BSTRIDE,
+ src[pli],
+ (sbr * MAX_MIB_SIZE << mi_high_l2[pli]) - yoff,
+ (sbc * MAX_MIB_SIZE << mi_wide_l2[pli]) - xoff,
+ stride[pli], ysize, xsize);
+ od_dering(clpf_strength ? NULL : (uint8_t *)in, OD_FILT_BSTRIDE,
+ tmp_dst, in, xdec[pli], ydec[pli], dir, &dirinit, var, pli,
+ dlist, dering_count, threshold,
+ clpf_strength + (clpf_strength == 3), clpf_damping,
+ dering_damping, coeff_shift, clpf_strength != 0, 1); /* clpf_strength 3 is passed on as 4. */
+ curr_mse = compute_dering_dist(
+ ref_coeff[pli] +
+ (sbr * MAX_MIB_SIZE << mi_high_l2[pli]) * stride[pli] +
+ (sbc * MAX_MIB_SIZE << mi_wide_l2[pli]),
+ stride[pli], tmp_dst, dlist, dering_count, bsize[pli],
+ coeff_shift, pli);
+ if (pli < 2)
+ mse[pli][sb_count][gi] = curr_mse; /* Planes 0 and 1 initialise their entry... */
+ else
+ mse[1][sb_count][gi] += curr_mse; /* ...and plane 2 is added onto the plane 1 entry. */
+ sb_index[sb_count] =
+ MAX_MIB_SIZE * sbr * cm->mi_stride + MAX_MIB_SIZE * sbc;
+ }
+ }
+ sb_count++;
+ }
+ }
+ nb_strength_bits = 0;
+ /* Search for different number of signalling bits. */
+ for (i = 0; i <= 3; i++) {
+ int j;
+ int best_lev0[CDEF_MAX_STRENGTHS];
+ int best_lev1[CDEF_MAX_STRENGTHS] = { 0 };
+ nb_strengths = 1 << i; /* i is the per-superblock index width in bits. */
+ if (nplanes >= 3)
+ tot_mse = joint_strength_search_dual(best_lev0, best_lev1, nb_strengths,
+ mse, sb_count);
+ else
+ tot_mse =
+ joint_strength_search(best_lev0, nb_strengths, mse[0], sb_count);
+ /* Count superblock signalling cost. */
+ tot_mse += (uint64_t)(sb_count * lambda * i);
+ /* Count header signalling cost. */
+ tot_mse += (uint64_t)(nb_strengths * lambda * CDEF_STRENGTH_BITS);
+ if (tot_mse < best_tot_mse) {
+ best_tot_mse = tot_mse;
+ nb_strength_bits = i;
+ for (j = 0; j < 1 << nb_strength_bits; j++) {
+ cm->cdef_strengths[j] = best_lev0[j];
+ cm->cdef_uv_strengths[j] = best_lev1[j];
+ }
+ }
+ }
+ nb_strengths = 1 << nb_strength_bits;
+
+ cm->cdef_bits = nb_strength_bits;
+ cm->nb_cdef_strengths = nb_strengths;
+ for (i = 0; i < sb_count; i++) { /* Pick, per counted superblock, the signalled pair with the lowest error. */
+ int gi;
+ int best_gi;
+ uint64_t best_mse = (uint64_t)1 << 63;
+ best_gi = 0;
+ for (gi = 0; gi < cm->nb_cdef_strengths; gi++) {
+ uint64_t curr = mse[0][i][cm->cdef_strengths[gi]];
+ if (nplanes >= 3) curr += mse[1][i][cm->cdef_uv_strengths[gi]];
+ if (curr < best_mse) {
+ best_gi = gi;
+ best_mse = curr;
+ }
+ }
+ selected_strength[i] = best_gi;
+ cm->mi_grid_visible[sb_index[i]]->mbmi.cdef_strength = best_gi;
+ }
+ cm->cdef_dering_damping = dering_damping;
+ cm->cdef_clpf_damping = clpf_damping;
+ aom_free(mse[0]);
+ aom_free(mse[1]);
+ for (pli = 0; pli < nplanes; pli++) {
+ aom_free(src[pli]);
+ aom_free(ref_coeff[pli]);
+ }
+ aom_free(sb_index);
+ aom_free(selected_strength);
+}
diff --git a/third_party/aom/av1/encoder/picklpf.c b/third_party/aom/av1/encoder/picklpf.c
new file mode 100644
index 0000000000..fc0ea485d8
--- /dev/null
+++ b/third_party/aom/av1/encoder/picklpf.c
@@ -0,0 +1,211 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <limits.h>
+
+#include "./aom_scale_rtcd.h"
+
+#include "aom_dsp/psnr.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+
+#include "av1/common/av1_loopfilter.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/common/quant_common.h"
+
+#include "av1/encoder/av1_quantize.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/picklpf.h"
+
+// Returns the upper bound for the loop filter level: three quarters of
+// MAX_LOOP_FILTER in pass 2 when section_intra_rating exceeds 8, otherwise
+// MAX_LOOP_FILTER.
+int av1_get_max_filter_level(const AV1_COMP *cpi) {
+  const int reduce_cap =
+      cpi->oxcf.pass == 2 && cpi->twopass.section_intra_rating > 8;
+  return reduce_cap ? MAX_LOOP_FILTER * 3 / 4 : MAX_LOOP_FILTER;
+}
+
+// Loop-filters cm->frame_to_show at filt_level, measures the luma SSE against
+// sd, then restores the unfiltered luma from cpi->last_frame_uf. Returns the
+// measured SSE.
+static int64_t try_filter_frame(const YV12_BUFFER_CONFIG *sd,
+                                AV1_COMP *const cpi, int filt_level,
+                                int partial_frame) {
+  AV1_COMMON *const cm = &cpi->common;
+  int64_t sse;
+
+#if CONFIG_VAR_TX || CONFIG_EXT_PARTITION || CONFIG_CB4X4
+  av1_loop_filter_frame(cm->frame_to_show, cm, &cpi->td.mb.e_mbd, filt_level, 1,
+                        partial_frame);
+#else
+  // The multi-threaded filter is used only when more than one worker exists.
+  if (cpi->num_workers <= 1) {
+    av1_loop_filter_frame(cm->frame_to_show, cm, &cpi->td.mb.e_mbd, filt_level,
+                          1, partial_frame);
+  } else {
+    av1_loop_filter_frame_mt(cm->frame_to_show, cm, cpi->td.mb.e_mbd.plane,
+                             filt_level, 1, partial_frame, cpi->workers,
+                             cpi->num_workers, &cpi->lf_row_sync);
+  }
+#endif
+
+#if CONFIG_HIGHBITDEPTH
+  if (cm->use_highbitdepth)
+    sse = aom_highbd_get_y_sse(sd, cm->frame_to_show);
+  else
+#endif  // CONFIG_HIGHBITDEPTH
+    sse = aom_get_y_sse(sd, cm->frame_to_show);
+
+  // Put the unfiltered luma back so the next trial starts from the same frame.
+  aom_yv12_copy_y(&cpi->last_frame_uf, cm->frame_to_show);
+
+  return sse;
+}
+/* Searches for the loop filter level with the lowest luma SSE against sd.
+ *
+ * Starts at the previous frame's level (clamped to the allowed range) and
+ * probes filt_mid - filter_step and filt_mid + filter_step. The centre moves
+ * to whichever probe wins; when neither does, the step is halved. The loop
+ * ends when the step reaches 0. Each level is filtered at most once: results
+ * are cached in ss_err[], where a negative entry means "not tried yet".
+ *
+ * Returns the chosen level. If best_cost_ret is non-NULL it receives
+ * RDCOST_DBL(x->rdmult, x->rddiv, 0, error at the chosen level).
+ *
+ * NOTE(review): memset() is called below, but <string.h> is not among the
+ * headers this file includes directly -- presumably it arrives through
+ * another header; confirm.
+ */
+int av1_search_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi,
+ int partial_frame, double *best_cost_ret) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const struct loopfilter *const lf = &cm->lf;
+ const int min_filter_level = 0;
+ const int max_filter_level = av1_get_max_filter_level(cpi);
+ int filt_direction = 0; // -1: last move was down, 1: up, 0: no move (probe both sides).
+ int64_t best_err;
+ int filt_best;
+ MACROBLOCK *x = &cpi->td.mb;
+
+ // Start the search at the previous frame filter level unless it is now out of
+ // range.
+ int filt_mid = clamp(lf->filter_level, min_filter_level, max_filter_level);
+ int filter_step = filt_mid < 16 ? 4 : filt_mid / 4;
+ // Sum squared error at each filter level
+ int64_t ss_err[MAX_LOOP_FILTER + 1];
+
+ // Set each entry to -1
+ memset(ss_err, 0xFF, sizeof(ss_err));
+
+ // Make a copy of the unfiltered / processed recon buffer
+ aom_yv12_copy_y(cm->frame_to_show, &cpi->last_frame_uf);
+
+ best_err = try_filter_frame(sd, cpi, filt_mid, partial_frame);
+ filt_best = filt_mid;
+ ss_err[filt_mid] = best_err;
+
+ while (filter_step > 0) {
+ const int filt_high = AOMMIN(filt_mid + filter_step, max_filter_level);
+ const int filt_low = AOMMAX(filt_mid - filter_step, min_filter_level);
+
+ // Bias against raising loop filter in favor of lowering it.
+ int64_t bias = (best_err >> (15 - (filt_mid / 8))) * filter_step;
+
+ if ((cpi->oxcf.pass == 2) && (cpi->twopass.section_intra_rating < 20))
+ bias = (bias * cpi->twopass.section_intra_rating) / 20;
+
+ // yx, bias less for large block size
+ if (cm->tx_mode != ONLY_4X4) bias >>= 1;
+
+ if (filt_direction <= 0 && filt_low != filt_mid) {
+ // Get Low filter error score
+ if (ss_err[filt_low] < 0) {
+ ss_err[filt_low] = try_filter_frame(sd, cpi, filt_low, partial_frame);
+ }
+ // If value is close to the best so far then bias towards a lower loop
+ // filter value.
+ if (ss_err[filt_low] < (best_err + bias)) {
+ // Was it actually better than the previous best?
+ if (ss_err[filt_low] < best_err) {
+ best_err = ss_err[filt_low];
+ }
+ filt_best = filt_low; // Taken even when only within the bias, so best_err may differ from ss_err[filt_best].
+ }
+ }
+
+ // Now look at filt_high
+ if (filt_direction >= 0 && filt_high != filt_mid) {
+ if (ss_err[filt_high] < 0) {
+ ss_err[filt_high] = try_filter_frame(sd, cpi, filt_high, partial_frame);
+ }
+ // If value is significantly better than previous best, bias added against
+ // raising filter value
+ if (ss_err[filt_high] < (best_err - bias)) {
+ best_err = ss_err[filt_high];
+ filt_best = filt_high;
+ }
+ }
+
+ // Half the step distance if the best filter value was the same as last time
+ if (filt_best == filt_mid) {
+ filter_step /= 2;
+ filt_direction = 0;
+ } else {
+ filt_direction = (filt_best < filt_mid) ? -1 : 1;
+ filt_mid = filt_best;
+ }
+ }
+
+ // Update best error
+ best_err = ss_err[filt_best]; // Re-sync: the biased low-side rule can pick a level whose error is above best_err.
+
+ if (best_cost_ret)
+ *best_cost_ret = RDCOST_DBL(x->rdmult, x->rddiv, 0, best_err);
+ return filt_best;
+}
+/* Sets cm->lf.sharpness_level and cm->lf.filter_level for the current frame.
+ *
+ * Sharpness is 0 on key frames and cpi->oxcf.sharpness otherwise. The filter
+ * level is chosen by one of three paths, tested in this order:
+ *   1. method == LPF_PICK_MINIMAL_LPF with a non-zero current level: 0.
+ *   2. method >= LPF_PICK_FROM_Q: a fixed-point linear function of the AC
+ *      quantizer, reduced by 4 on key frames and clamped to
+ *      [0, av1_get_max_filter_level()].
+ *   3. otherwise: av1_search_filter_level(), on a partial frame when method
+ *      is LPF_PICK_FROM_SUBIMAGE.
+ * With CONFIG_EXT_TILE the level is finally forced to 0 when
+ * cm->tile_encoding_mode is set.
+ *
+ * NOTE(review): when method == LPF_PICK_MINIMAL_LPF and the current level is
+ * already 0, path 1 is skipped and the result depends on how
+ * LPF_PICK_MINIMAL_LPF compares with LPF_PICK_FROM_Q -- the enum is not
+ * visible here; confirm this is intended.
+ */
+void av1_pick_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi,
+ LPF_PICK_METHOD method) {
+ AV1_COMMON *const cm = &cpi->common;
+ struct loopfilter *const lf = &cm->lf;
+
+ lf->sharpness_level = cm->frame_type == KEY_FRAME ? 0 : cpi->oxcf.sharpness;
+
+ if (method == LPF_PICK_MINIMAL_LPF && lf->filter_level) {
+ lf->filter_level = 0;
+ } else if (method >= LPF_PICK_FROM_Q) {
+ const int min_filter_level = 0;
+ const int max_filter_level = av1_get_max_filter_level(cpi);
+ const int q = av1_ac_quant(cm->base_qindex, 0, cm->bit_depth);
+// These values were determined by linear fitting the result of the
+// searched level, filt_guess = q * 0.316206 + 3.87252
+#if CONFIG_HIGHBITDEPTH
+ int filt_guess;
+ switch (cm->bit_depth) {
+ case AOM_BITS_8:
+ filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 1015158, 18);
+ break;
+ case AOM_BITS_10:
+ filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 4060632, 20);
+ break;
+ case AOM_BITS_12:
+ filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 16242526, 22);
+ break;
+ default:
+ assert(0 &&
+ "bit_depth should be AOM_BITS_8, AOM_BITS_10 "
+ "or AOM_BITS_12");
+ return; // Leaves filter_level untouched and skips the CONFIG_EXT_TILE override below.
+ }
+#else
+ int filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 1015158, 18);
+#endif // CONFIG_HIGHBITDEPTH
+ if (cm->frame_type == KEY_FRAME) filt_guess -= 4;
+ lf->filter_level = clamp(filt_guess, min_filter_level, max_filter_level);
+ } else {
+ lf->filter_level = av1_search_filter_level(
+ sd, cpi, method == LPF_PICK_FROM_SUBIMAGE, NULL);
+ }
+
+#if CONFIG_EXT_TILE
+ // TODO(any): 0 loopfilter level is only necessary if individual tile
+ // decoding is required. We need to communicate this requirement to this
+ // code and force loop filter level 0 only if required.
+ if (cm->tile_encoding_mode) lf->filter_level = 0;
+#endif // CONFIG_EXT_TILE
+}
diff --git a/third_party/aom/av1/encoder/picklpf.h b/third_party/aom/av1/encoder/picklpf.h
new file mode 100644
index 0000000000..3c0a83462b
--- /dev/null
+++ b/third_party/aom/av1/encoder/picklpf.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_ENCODER_PICKLPF_H_
+#define AV1_ENCODER_PICKLPF_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "av1/encoder/encoder.h"
+
+struct yv12_buffer_config;
+struct AV1_COMP;
+
+// Returns the highest loop filter level the encoder may select.
+int av1_get_max_filter_level(const AV1_COMP *cpi);
+
+// Searches for the loop filter level with the lowest luma error against sd
+// and returns it. If best_cost_ret is non-NULL it receives the RD cost
+// (not the raw error) at the returned level.
+int av1_search_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi,
+                            int partial_frame, double *best_cost_ret);
+
+// Sets the frame's loop filter sharpness and level using the given method.
+void av1_pick_filter_level(const struct yv12_buffer_config *sd,
+                           struct AV1_COMP *cpi, LPF_PICK_METHOD method);
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AV1_ENCODER_PICKLPF_H_
diff --git a/third_party/aom/av1/encoder/pickrst.c b/third_party/aom/av1/encoder/pickrst.c
new file mode 100644
index 0000000000..21410e0afa
--- /dev/null
+++ b/third_party/aom/av1/encoder/pickrst.c
@@ -0,0 +1,1269 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <float.h>
+#include <limits.h>
+#include <math.h>
+
+#include "./aom_scale_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/binary_codes_writer.h"
+#include "aom_dsp/psnr.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+#include "aom_ports/system_state.h"
+
+#include "av1/common/onyxc_int.h"
+#include "av1/common/quant_common.h"
+#include "av1/common/restoration.h"
+
+#include "av1/encoder/av1_quantize.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/picklpf.h"
+#include "av1/encoder/pickrst.h"
+
+// When set to RESTORE_WIENER or RESTORE_SGRPROJ only those are allowed.
+// When set to RESTORE_NONE (0) we allow switchable.
+const RestorationType force_restore_type = RESTORE_NONE;
+
+// Number of Wiener iterations
+#define NUM_WIENER_ITERS 10
+
+typedef double (*search_restore_type)(const YV12_BUFFER_CONFIG *src,
+ AV1_COMP *cpi, int partial_frame,
+ RestorationInfo *info,
+ RestorationType *rest_level,
+ double *best_tile_cost,
+ YV12_BUFFER_CONFIG *dst_frame); /* Function-pointer type returning a double. NOTE(review): presumably the
+ common signature of the per-restoration-type search routines; none of
+ them is visible in this part of the file -- confirm. */
+
+const int frame_level_restore_bits[RESTORE_TYPES] = { 2, 2, 2, 2 }; /* One entry per restoration type, all currently 2.
+ NOTE(review): presumably the frame-level signalling cost in bits --
+ confirm against the code that reads it. */
+
+// Returns the SSE between src and dst over the rectangle starting at
+// (h_start, v_start) with the given width and height, summed over the planes
+// selected by components_pattern (bit AOM_PLANE_Y / _U / _V).
+static int64_t sse_restoration_tile(const YV12_BUFFER_CONFIG *src,
+                                    const YV12_BUFFER_CONFIG *dst,
+                                    const AV1_COMMON *cm, int h_start,
+                                    int width, int v_start, int height,
+                                    int components_pattern) {
+  const int use_y = (components_pattern >> AOM_PLANE_Y) & 1;
+  const int use_u = (components_pattern >> AOM_PLANE_U) & 1;
+  const int use_v = (components_pattern >> AOM_PLANE_V) & 1;
+  int64_t sse = 0;
+  (void)cm;
+  // Y and UV components cannot be mixed
+  assert(components_pattern == 1 || components_pattern == 2 ||
+         components_pattern == 4 || components_pattern == 6);
+#if CONFIG_HIGHBITDEPTH
+  if (cm->use_highbitdepth) {
+    if (use_y)
+      sse +=
+          aom_highbd_get_y_sse_part(src, dst, h_start, width, v_start, height);
+    if (use_u)
+      sse +=
+          aom_highbd_get_u_sse_part(src, dst, h_start, width, v_start, height);
+    if (use_v)
+      sse +=
+          aom_highbd_get_v_sse_part(src, dst, h_start, width, v_start, height);
+    return sse;
+  }
+#endif  // CONFIG_HIGHBITDEPTH
+  if (use_y) sse += aom_get_y_sse_part(src, dst, h_start, width, v_start, height);
+  if (use_u) sse += aom_get_u_sse_part(src, dst, h_start, width, v_start, height);
+  if (use_v) sse += aom_get_v_sse_part(src, dst, h_start, width, v_start, height);
+  return sse;
+}
+
+// Returns the whole-frame SSE between src and dst, summed over the planes
+// selected by components_pattern (bit AOM_PLANE_Y / _U / _V).
+static int64_t sse_restoration_frame(AV1_COMMON *const cm,
+                                     const YV12_BUFFER_CONFIG *src,
+                                     const YV12_BUFFER_CONFIG *dst,
+                                     int components_pattern) {
+  const int use_y = (components_pattern >> AOM_PLANE_Y) & 1;
+  const int use_u = (components_pattern >> AOM_PLANE_U) & 1;
+  const int use_v = (components_pattern >> AOM_PLANE_V) & 1;
+  int64_t sse = 0;
+#if CONFIG_HIGHBITDEPTH
+  if (cm->use_highbitdepth) {
+    if (use_y) sse += aom_highbd_get_y_sse(src, dst);
+    if (use_u) sse += aom_highbd_get_u_sse(src, dst);
+    if (use_v) sse += aom_highbd_get_v_sse(src, dst);
+    return sse;
+  }
+#else
+  (void)cm;
+#endif  // CONFIG_HIGHBITDEPTH
+  if (use_y) sse += aom_get_y_sse(src, dst);
+  if (use_u) sse += aom_get_u_sse(src, dst);
+  if (use_v) sse += aom_get_v_sse(src, dst);
+  return sse;
+}
+
+// Runs loop restoration with rsi into dst_frame, then returns the SSE against
+// src restricted to the tile (or sub-tile) identified by tile_idx,
+// subtile_idx and subtile_bits.
+static int64_t try_restoration_tile(const YV12_BUFFER_CONFIG *src,
+                                    AV1_COMP *const cpi, RestorationInfo *rsi,
+                                    int components_pattern, int partial_frame,
+                                    int tile_idx, int subtile_idx,
+                                    int subtile_bits,
+                                    YV12_BUFFER_CONFIG *dst_frame) {
+  AV1_COMMON *const cm = &cpi->common;
+  // Pattern 1 is luma; every other allowed pattern uses chroma dimensions.
+  const int is_luma = (components_pattern == 1);
+  const int width = is_luma ? src->y_crop_width : src->uv_crop_width;
+  const int height = is_luma ? src->y_crop_height : src->uv_crop_height;
+  int tile_width, tile_height, nhtiles, nvtiles;
+  int h_start, h_end, v_start, v_end;
+
+  // Y and UV components cannot be mixed
+  assert(components_pattern == 1 || components_pattern == 2 ||
+         components_pattern == 4 || components_pattern == 6);
+
+  // Only the tile geometry outputs are needed; the tile count is discarded.
+  (void)av1_get_rest_ntiles(
+      width, height, cm->rst_info[components_pattern > 1].restoration_tilesize,
+      &tile_width, &tile_height, &nhtiles, &nvtiles);
+
+  av1_loop_restoration_frame(cm->frame_to_show, cm, rsi, components_pattern,
+                             partial_frame, dst_frame);
+  av1_get_rest_tile_limits(tile_idx, subtile_idx, subtile_bits, nhtiles,
+                           nvtiles, tile_width, tile_height, width, height, 0,
+                           0, &h_start, &h_end, &v_start, &v_end);
+  return sse_restoration_tile(src, dst_frame, cm, h_start, h_end - h_start,
+                              v_start, v_end - v_start, components_pattern);
+}
+
+// Runs loop restoration with rsi into dst_frame and returns the whole-frame
+// SSE against src for the selected planes.
+static int64_t try_restoration_frame(const YV12_BUFFER_CONFIG *src,
+                                     AV1_COMP *const cpi, RestorationInfo *rsi,
+                                     int components_pattern, int partial_frame,
+                                     YV12_BUFFER_CONFIG *dst_frame) {
+  AV1_COMMON *const cm = &cpi->common;
+  av1_loop_restoration_frame(cm->frame_to_show, cm, rsi, components_pattern,
+                             partial_frame, dst_frame);
+  return sse_restoration_frame(cm, src, dst_frame, components_pattern);
+}
+
+// Returns the SSE between src8 and the projection of dat8 onto the two
+// filtered signals flt1/flt2 using the coefficients decoded from xqd.
+// bit_depth == 8 reads the buffers as bytes; any other value reads them
+// through CONVERT_TO_SHORTPTR as 16-bit samples.
+static int64_t get_pixel_proj_error(uint8_t *src8, int width, int height,
+                                    int src_stride, uint8_t *dat8,
+                                    int dat_stride, int bit_depth,
+                                    int32_t *flt1, int flt1_stride,
+                                    int32_t *flt2, int flt2_stride, int *xqd) {
+  int64_t sse = 0;
+  int xq[2];
+  int row, col;
+  decode_xq(xqd, xq);
+  if (bit_depth == 8) {
+    for (row = 0; row < height; ++row) {
+      const uint8_t *const src_row = src8 + row * src_stride;
+      const uint8_t *const dat_row = dat8 + row * dat_stride;
+      const int32_t *const f1_row = flt1 + row * flt1_stride;
+      const int32_t *const f2_row = flt2 + row * flt2_stride;
+      for (col = 0; col < width; ++col) {
+        const int32_t u = (int32_t)(dat_row[col] << SGRPROJ_RST_BITS);
+        const int32_t d1 = (int32_t)f1_row[col] - u;
+        const int32_t d2 = (int32_t)f2_row[col] - u;
+        const int32_t v = xq[0] * d1 + xq[1] * d2 + (u << SGRPROJ_PRJ_BITS);
+        const int32_t e =
+            ROUND_POWER_OF_TWO(v, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS) -
+            src_row[col];
+        sse += e * e;
+      }
+    }
+  } else {
+    const uint16_t *const src16 = CONVERT_TO_SHORTPTR(src8);
+    const uint16_t *const dat16 = CONVERT_TO_SHORTPTR(dat8);
+    for (row = 0; row < height; ++row) {
+      const uint16_t *const src_row = src16 + row * src_stride;
+      const uint16_t *const dat_row = dat16 + row * dat_stride;
+      const int32_t *const f1_row = flt1 + row * flt1_stride;
+      const int32_t *const f2_row = flt2 + row * flt2_stride;
+      for (col = 0; col < width; ++col) {
+        const int32_t u = (int32_t)(dat_row[col] << SGRPROJ_RST_BITS);
+        const int32_t d1 = (int32_t)f1_row[col] - u;
+        const int32_t d2 = (int32_t)f2_row[col] - u;
+        const int32_t v = xq[0] * d1 + xq[1] * d2 + (u << SGRPROJ_PRJ_BITS);
+        const int32_t e =
+            ROUND_POWER_OF_TWO(v, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS) -
+            src_row[col];
+        sse += e * e;
+      }
+    }
+  }
+  return sse;
+}
+/* Least-squares fit of the two projection coefficients xq[0], xq[1].
+ *
+ * With u = dat << SGRPROJ_RST_BITS, s = (src << SGRPROJ_RST_BITS) - u,
+ * f1 = flt1 - u and f2 = flt2 - u, the loops accumulate the 2x2 matrix
+ * H = [[sum f1*f1, sum f1*f2], [sum f1*f2, sum f2*f2]] and the vector
+ * C = [sum f1*s, sum f2*s], each divided by the pixel count. H * x = C is
+ * then solved in closed form and x is stored in xq[] scaled by
+ * 1 << SGRPROJ_PRJ_BITS and rounded with rint().
+ *
+ * xq[] is left at {0, 0} when the determinant of H is below 1e-8.
+ * bit_depth == 8 reads src8/dat8 as bytes; any other value reads them
+ * through CONVERT_TO_SHORTPTR as 16-bit samples.
+ */
+static void get_proj_subspace(uint8_t *src8, int width, int height,
+ int src_stride, uint8_t *dat8, int dat_stride,
+ int bit_depth, int32_t *flt1, int flt1_stride,
+ int32_t *flt2, int flt2_stride, int *xq) {
+ int i, j;
+ double H[2][2] = { { 0, 0 }, { 0, 0 } };
+ double C[2] = { 0, 0 };
+ double Det;
+ double x[2];
+ const int size = width * height;
+
+ aom_clear_system_state(); // NOTE(review): presumably resets SIMD/FPU state before the double math below -- confirm.
+
+ // Default
+ xq[0] = 0;
+ xq[1] = 0;
+ if (bit_depth == 8) {
+ const uint8_t *src = src8;
+ const uint8_t *dat = dat8;
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j < width; ++j) {
+ const double u = (double)(dat[i * dat_stride + j] << SGRPROJ_RST_BITS);
+ const double s =
+ (double)(src[i * src_stride + j] << SGRPROJ_RST_BITS) - u;
+ const double f1 = (double)flt1[i * flt1_stride + j] - u;
+ const double f2 = (double)flt2[i * flt2_stride + j] - u;
+ H[0][0] += f1 * f1;
+ H[1][1] += f2 * f2;
+ H[0][1] += f1 * f2;
+ C[0] += f1 * s;
+ C[1] += f2 * s;
+ }
+ }
+ } else {
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j < width; ++j) {
+ const double u = (double)(dat[i * dat_stride + j] << SGRPROJ_RST_BITS);
+ const double s =
+ (double)(src[i * src_stride + j] << SGRPROJ_RST_BITS) - u;
+ const double f1 = (double)flt1[i * flt1_stride + j] - u;
+ const double f2 = (double)flt2[i * flt2_stride + j] - u;
+ H[0][0] += f1 * f1;
+ H[1][1] += f2 * f2;
+ H[0][1] += f1 * f2;
+ C[0] += f1 * s;
+ C[1] += f2 * s;
+ }
+ }
+ }
+ H[0][0] /= size;
+ H[0][1] /= size;
+ H[1][1] /= size;
+ H[1][0] = H[0][1]; // H is symmetric; only the upper triangle was accumulated.
+ C[0] /= size;
+ C[1] /= size;
+ Det = (H[0][0] * H[1][1] - H[0][1] * H[1][0]);
+ if (Det < 1e-8) return; // ill-posed, return default values
+ x[0] = (H[1][1] * C[0] - H[0][1] * C[1]) / Det; // Closed-form 2x2 solve (Cramer's rule).
+ x[1] = (H[0][0] * C[1] - H[1][0] * C[0]) / Det;
+ xq[0] = (int)rint(x[0] * (1 << SGRPROJ_PRJ_BITS));
+ xq[1] = (int)rint(x[1] * (1 << SGRPROJ_PRJ_BITS));
+}
+
+ // Converts the projection weights xq[] into their coded form xqd[]:
+ //   xqd[0] = xq[0], clamped to [SGRPROJ_PRJ_MIN0, SGRPROJ_PRJ_MAX0]
+ //   xqd[1] = (1 << SGRPROJ_PRJ_BITS) - xqd[0] - xq[1],
+ //            clamped to [SGRPROJ_PRJ_MIN1, SGRPROJ_PRJ_MAX1]
+ void encode_xq(int *xq, int *xqd) {
+   xqd[0] = clamp(xq[0], SGRPROJ_PRJ_MIN0, SGRPROJ_PRJ_MAX0);
+   xqd[1] = clamp((1 << SGRPROJ_PRJ_BITS) - xqd[0] - xq[1], SGRPROJ_PRJ_MIN1,
+                  SGRPROJ_PRJ_MAX1);
+ }
+
+ // Tries every self-guided parameter set (ep in [0, SGRPROJ_PARAMS)) on the
+ // width x height region of dat8 and keeps the one whose projection onto src8
+ // has the smallest error as measured by get_pixel_proj_error().
+ // Outputs: *eps receives the index of the best parameter set and xqd[0..1]
+ // the matching coded projection weights.
+ // rstbuf is scratch memory, carved below into flt1, flt2
+ // (RESTORATION_TILEPELS_MAX entries each) and tmpbuf2 after them.
+ static void search_selfguided_restoration(uint8_t *dat8, int width, int height,
+ int dat_stride, uint8_t *src8,
+ int src_stride, int bit_depth,
+ int *eps, int *xqd, int32_t *rstbuf) {
+ int32_t *flt1 = rstbuf;
+ int32_t *flt2 = flt1 + RESTORATION_TILEPELS_MAX;
+ int32_t *tmpbuf2 = flt2 + RESTORATION_TILEPELS_MAX;
+ int ep, bestep = 0;
+ // besterr == -1 means "no candidate evaluated yet"
+ int64_t err, besterr = -1;
+ int exqd[2], bestxqd[2] = { 0, 0 };
+
+ for (ep = 0; ep < SGRPROJ_PARAMS; ep++) {
+ int exq[2];
+ // Produce the two filtered planes flt1 / flt2 (both stored with stride
+ // `width`) for this parameter set.
+#if CONFIG_HIGHBITDEPTH
+ if (bit_depth > 8) {
+ uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);
+#if USE_HIGHPASS_IN_SGRPROJ
+ av1_highpass_filter_highbd(dat, width, height, dat_stride, flt1, width,
+ sgr_params[ep].corner, sgr_params[ep].edge);
+#else
+ av1_selfguided_restoration_highbd(dat, width, height, dat_stride, flt1,
+ width, bit_depth, sgr_params[ep].r1,
+ sgr_params[ep].e1, tmpbuf2);
+#endif // USE_HIGHPASS_IN_SGRPROJ
+ av1_selfguided_restoration_highbd(dat, width, height, dat_stride, flt2,
+ width, bit_depth, sgr_params[ep].r2,
+ sgr_params[ep].e2, tmpbuf2);
+ } else {
+#endif
+#if USE_HIGHPASS_IN_SGRPROJ
+ av1_highpass_filter(dat8, width, height, dat_stride, flt1, width,
+ sgr_params[ep].corner, sgr_params[ep].edge);
+#else
+ av1_selfguided_restoration(dat8, width, height, dat_stride, flt1, width,
+ sgr_params[ep].r1, sgr_params[ep].e1, tmpbuf2);
+#endif // USE_HIGHPASS_IN_SGRPROJ
+ av1_selfguided_restoration(dat8, width, height, dat_stride, flt2, width,
+ sgr_params[ep].r2, sgr_params[ep].e2, tmpbuf2);
+#if CONFIG_HIGHBITDEPTH
+ }
+#endif
+ // Fit the projection weights, convert them to coded form, and score the
+ // coded weights.
+ get_proj_subspace(src8, width, height, src_stride, dat8, dat_stride,
+ bit_depth, flt1, width, flt2, width, exq);
+ encode_xq(exq, exqd);
+ err =
+ get_pixel_proj_error(src8, width, height, src_stride, dat8, dat_stride,
+ bit_depth, flt1, width, flt2, width, exqd);
+ // Strict '<' keeps the earliest parameter set on ties
+ if (besterr == -1 || err < besterr) {
+ bestep = ep;
+ besterr = err;
+ bestxqd[0] = exqd[0];
+ bestxqd[1] = exqd[1];
+ }
+ }
+ *eps = bestep;
+ xqd[0] = bestxqd[0];
+ xqd[1] = bestxqd[1];
+ }
+
+ // Number of bits counted for one tile's self-guided parameters:
+ // SGRPROJ_PARAMS_BITS plus, for each of xqd[0] and xqd[1], the count returned
+ // by aom_count_primitive_refsubexpfin() for the value relative to the
+ // corresponding entry of ref_sgrproj_info.
+ static int count_sgrproj_bits(SgrprojInfo *sgrproj_info,
+                               SgrprojInfo *ref_sgrproj_info) {
+   const int xqd0_bits = aom_count_primitive_refsubexpfin(
+       SGRPROJ_PRJ_MAX0 - SGRPROJ_PRJ_MIN0 + 1, SGRPROJ_PRJ_SUBEXP_K,
+       ref_sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0,
+       sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0);
+   const int xqd1_bits = aom_count_primitive_refsubexpfin(
+       SGRPROJ_PRJ_MAX1 - SGRPROJ_PRJ_MIN1 + 1, SGRPROJ_PRJ_SUBEXP_K,
+       ref_sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1,
+       sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1);
+   return SGRPROJ_PARAMS_BITS + xqd0_bits + xqd1_bits;
+ }
+
+ // Per-tile search for self-guided (sgrproj) restoration on the luma plane.
+ // For every tile it compares the RD cost of leaving the tile unrestored with
+ // the RD cost of the best self-guided parameters found by
+ // search_selfguided_restoration(), and records the decision in type[] and the
+ // chosen parameters in info->sgrproj_info[].
+ // best_tile_cost[tile] is set to the restored-tile error when sgrproj is
+ // chosen and stays DBL_MAX otherwise.
+ // Returns the frame-level RD cost of applying the per-tile decisions.
+ // cpi->rst_search[0] (rsi) is used as the working restoration description.
+ static double search_sgrproj(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi,
+ int partial_frame, RestorationInfo *info,
+ RestorationType *type, double *best_tile_cost,
+ YV12_BUFFER_CONFIG *dst_frame) {
+ SgrprojInfo *sgrproj_info = info->sgrproj_info;
+ double err, cost_norestore, cost_sgrproj;
+ int bits;
+ MACROBLOCK *x = &cpi->td.mb;
+ AV1_COMMON *const cm = &cpi->common;
+ const YV12_BUFFER_CONFIG *dgd = cm->frame_to_show;
+ RestorationInfo *rsi = &cpi->rst_search[0];
+ int tile_idx, tile_width, tile_height, nhtiles, nvtiles;
+ int h_start, h_end, v_start, v_end;
+ // Allocate for the src buffer at high precision
+ const int ntiles = av1_get_rest_ntiles(
+ cm->width, cm->height, cm->rst_info[0].restoration_tilesize, &tile_width,
+ &tile_height, &nhtiles, &nvtiles);
+ // Reference parameters against which each tile's parameters are costed
+ SgrprojInfo ref_sgrproj_info;
+ set_default_sgrproj(&ref_sgrproj_info);
+
+ rsi->frame_restoration_type = RESTORE_SGRPROJ;
+
+ // Start with every tile disabled so that try_restoration_tile() below is
+ // called with only the tile under test enabled.
+ for (tile_idx = 0; tile_idx < ntiles; ++tile_idx) {
+ rsi->restoration_type[tile_idx] = RESTORE_NONE;
+ }
+ // Compute best Sgrproj filters for each tile
+ for (tile_idx = 0; tile_idx < ntiles; ++tile_idx) {
+ av1_get_rest_tile_limits(tile_idx, 0, 0, nhtiles, nvtiles, tile_width,
+ tile_height, cm->width, cm->height, 0, 0, &h_start,
+ &h_end, &v_start, &v_end);
+ err = sse_restoration_tile(src, cm->frame_to_show, cm, h_start,
+ h_end - h_start, v_start, v_end - v_start, 1);
+ // #bits when a tile is not restored
+ bits = av1_cost_bit(RESTORE_NONE_SGRPROJ_PROB, 0);
+ cost_norestore = RDCOST_DBL(x->rdmult, x->rddiv, (bits >> 4), err);
+ best_tile_cost[tile_idx] = DBL_MAX;
+ // Writes the best parameter-set index and weights for this tile directly
+ // into rsi->sgrproj_info[tile_idx].
+ search_selfguided_restoration(
+ dgd->y_buffer + v_start * dgd->y_stride + h_start, h_end - h_start,
+ v_end - v_start, dgd->y_stride,
+ src->y_buffer + v_start * src->y_stride + h_start, src->y_stride,
+#if CONFIG_HIGHBITDEPTH
+ cm->bit_depth,
+#else
+ 8,
+#endif // CONFIG_HIGHBITDEPTH
+ &rsi->sgrproj_info[tile_idx].ep, rsi->sgrproj_info[tile_idx].xqd,
+ cm->rst_internal.tmpbuf);
+ rsi->restoration_type[tile_idx] = RESTORE_SGRPROJ;
+ err = try_restoration_tile(src, cpi, rsi, 1, partial_frame, tile_idx, 0, 0,
+ dst_frame);
+ bits = count_sgrproj_bits(&rsi->sgrproj_info[tile_idx], &ref_sgrproj_info)
+ << AV1_PROB_COST_SHIFT;
+ bits += av1_cost_bit(RESTORE_NONE_SGRPROJ_PROB, 1);
+ cost_sgrproj = RDCOST_DBL(x->rdmult, x->rddiv, (bits >> 4), err);
+ if (cost_sgrproj >= cost_norestore) {
+ type[tile_idx] = RESTORE_NONE;
+ } else {
+ type[tile_idx] = RESTORE_SGRPROJ;
+ memcpy(&sgrproj_info[tile_idx], &rsi->sgrproj_info[tile_idx],
+ sizeof(sgrproj_info[tile_idx]));
+ // NOTE(review): this value of `bits` is overwritten (next iteration or
+ // the frame-level pass below) before it is ever read.
+ bits = count_sgrproj_bits(&rsi->sgrproj_info[tile_idx], &ref_sgrproj_info)
+ << AV1_PROB_COST_SHIFT;
+ // Later tiles are costed relative to the most recently accepted tile
+ memcpy(&ref_sgrproj_info, &sgrproj_info[tile_idx],
+ sizeof(ref_sgrproj_info));
+ best_tile_cost[tile_idx] = err;
+ }
+ // Disable the tile again before testing the next one
+ rsi->restoration_type[tile_idx] = RESTORE_NONE;
+ }
+ // Cost for Sgrproj filtering
+ set_default_sgrproj(&ref_sgrproj_info);
+ bits = frame_level_restore_bits[rsi->frame_restoration_type]
+ << AV1_PROB_COST_SHIFT;
+ // Re-accumulate the bit cost over all tiles with the final decisions and
+ // load those decisions into rsi for the whole-frame trial.
+ for (tile_idx = 0; tile_idx < ntiles; ++tile_idx) {
+ bits +=
+ av1_cost_bit(RESTORE_NONE_SGRPROJ_PROB, type[tile_idx] != RESTORE_NONE);
+ memcpy(&rsi->sgrproj_info[tile_idx], &sgrproj_info[tile_idx],
+ sizeof(sgrproj_info[tile_idx]));
+ if (type[tile_idx] == RESTORE_SGRPROJ) {
+ bits +=
+ count_sgrproj_bits(&rsi->sgrproj_info[tile_idx], &ref_sgrproj_info)
+ << AV1_PROB_COST_SHIFT;
+ memcpy(&ref_sgrproj_info, &rsi->sgrproj_info[tile_idx],
+ sizeof(ref_sgrproj_info));
+ }
+ rsi->restoration_type[tile_idx] = type[tile_idx];
+ }
+ err = try_restoration_frame(src, cpi, rsi, 1, partial_frame, dst_frame);
+ cost_sgrproj = RDCOST_DBL(x->rdmult, x->rddiv, (bits >> 4), err);
+
+ return cost_sgrproj;
+ }
+
+ // Mean of the 8-bit samples src[i * stride + j] over rows [v_start, v_end)
+ // and columns [h_start, h_end).
+ static double find_average(uint8_t *src, int h_start, int h_end, int v_start,
+                            int v_end, int stride) {
+   uint64_t total = 0;
+   int row, col;
+   aom_clear_system_state();
+   for (row = v_start; row < v_end; row++) {
+     const int base = row * stride;
+     for (col = h_start; col < h_end; col++) {
+       total += src[base + col];
+     }
+   }
+   return (double)total / ((v_end - v_start) * (h_end - h_start));
+ }
+
+ // Accumulates the statistics used to fit a Wiener filter over rows
+ // [v_start, v_end) and columns [h_start, h_end):
+ //   M[k]                   = sum of Y[k] * X
+ //   H[k * WIENER_WIN2 + l] = sum of Y[k] * Y[l]
+ // where X is the source sample minus the region average of dgd, and Y[] holds
+ // the WIENER_WIN2 degraded samples of the window centred on the same pixel
+ // (also minus that average). The window is scanned with the horizontal offset
+ // in the outer loop and the vertical offset in the inner loop.
+ // M must hold WIENER_WIN2 doubles and H WIENER_WIN2 * WIENER_WIN2 doubles;
+ // both are cleared first.
+ static void compute_stats(uint8_t *dgd, uint8_t *src, int h_start, int h_end,
+                           int v_start, int v_end, int dgd_stride,
+                           int src_stride, double *M, double *H) {
+   int row, col, p, q;
+   double window[WIENER_WIN2];
+   const double mean =
+       find_average(dgd, h_start, h_end, v_start, v_end, dgd_stride);
+
+   memset(M, 0, sizeof(*M) * WIENER_WIN2);
+   memset(H, 0, sizeof(*H) * WIENER_WIN2 * WIENER_WIN2);
+   for (row = v_start; row < v_end; row++) {
+     for (col = h_start; col < h_end; col++) {
+       const double target = (double)src[row * src_stride + col] - mean;
+       int dx, dy;
+       int n = 0;
+       for (dx = -WIENER_HALFWIN; dx <= WIENER_HALFWIN; dx++) {
+         for (dy = -WIENER_HALFWIN; dy <= WIENER_HALFWIN; dy++) {
+           window[n++] =
+               (double)dgd[(row + dy) * dgd_stride + (col + dx)] - mean;
+         }
+       }
+       for (p = 0; p < WIENER_WIN2; ++p) {
+         M[p] += window[p] * target;
+         // H is symmetric: only the diagonal and the upper triangle are
+         // accumulated here; the lower triangle is mirrored after the pixel
+         // loops.
+         for (q = p; q < WIENER_WIN2; ++q) {
+           H[p * WIENER_WIN2 + q] += window[p] * window[q];
+         }
+       }
+     }
+   }
+   // Copy the upper triangle down to the lower triangle
+   for (p = 0; p < WIENER_WIN2; ++p) {
+     for (q = p + 1; q < WIENER_WIN2; ++q) {
+       H[q * WIENER_WIN2 + p] = H[p * WIENER_WIN2 + q];
+     }
+   }
+ }
+
+#if CONFIG_HIGHBITDEPTH
+ // Mean of the 16-bit samples src[i * stride + j] over rows [v_start, v_end)
+ // and columns [h_start, h_end).
+ static double find_average_highbd(uint16_t *src, int h_start, int h_end,
+                                   int v_start, int v_end, int stride) {
+   uint64_t total = 0;
+   int row, col;
+   aom_clear_system_state();
+   for (row = v_start; row < v_end; row++) {
+     const int base = row * stride;
+     for (col = h_start; col < h_end; col++) {
+       total += src[base + col];
+     }
+   }
+   return (double)total / ((v_end - v_start) * (h_end - h_start));
+ }
+
+ // High-bitdepth counterpart of compute_stats(): dgd8 and src8 are converted
+ // with CONVERT_TO_SHORTPTR and read as 16-bit samples. Accumulates, over rows
+ // [v_start, v_end) and columns [h_start, h_end):
+ //   M[k]                   = sum of Y[k] * X
+ //   H[k * WIENER_WIN2 + l] = sum of Y[k] * Y[l]
+ // where X is the source sample minus the region average of dgd, and Y[] holds
+ // the WIENER_WIN2 degraded samples of the window centred on the same pixel
+ // (also minus that average), scanned with the horizontal offset in the outer
+ // loop. M and H are cleared first.
+ static void compute_stats_highbd(uint8_t *dgd8, uint8_t *src8, int h_start,
+                                  int h_end, int v_start, int v_end,
+                                  int dgd_stride, int src_stride, double *M,
+                                  double *H) {
+   int row, col, p, q;
+   double window[WIENER_WIN2];
+   uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+   uint16_t *dgd = CONVERT_TO_SHORTPTR(dgd8);
+   const double mean =
+       find_average_highbd(dgd, h_start, h_end, v_start, v_end, dgd_stride);
+
+   memset(M, 0, sizeof(*M) * WIENER_WIN2);
+   memset(H, 0, sizeof(*H) * WIENER_WIN2 * WIENER_WIN2);
+   for (row = v_start; row < v_end; row++) {
+     for (col = h_start; col < h_end; col++) {
+       const double target = (double)src[row * src_stride + col] - mean;
+       int dx, dy;
+       int n = 0;
+       for (dx = -WIENER_HALFWIN; dx <= WIENER_HALFWIN; dx++) {
+         for (dy = -WIENER_HALFWIN; dy <= WIENER_HALFWIN; dy++) {
+           window[n++] =
+               (double)dgd[(row + dy) * dgd_stride + (col + dx)] - mean;
+         }
+       }
+       for (p = 0; p < WIENER_WIN2; ++p) {
+         M[p] += window[p] * target;
+         // H is symmetric: only the diagonal and the upper triangle are
+         // accumulated here; the lower triangle is mirrored after the pixel
+         // loops.
+         for (q = p; q < WIENER_WIN2; ++q) {
+           H[p * WIENER_WIN2 + q] += window[p] * window[q];
+         }
+       }
+     }
+   }
+   // Copy the upper triangle down to the lower triangle
+   for (p = 0; p < WIENER_WIN2; ++p) {
+     for (q = p + 1; q < WIENER_WIN2; ++q) {
+       H[q * WIENER_WIN2 + p] = H[p * WIENER_WIN2 + q];
+     }
+   }
+ }
+#endif // CONFIG_HIGHBITDEPTH
+
+ // Solves the n x n system A * x = b by Gaussian elimination with row
+ // exchanges followed by back substitution. Row r of A starts at
+ // A[r * stride]. A and b are modified in place; the solution is written to
+ // x[0..n-1].
+ // Returns 1 on success, or 0 as soon as a pivot with magnitude below 1e-10 is
+ // met (x is then not fully written).
+ static int linsolve(int n, double *A, int stride, double *b, double *x) {
+   int piv, row, col;
+   double t;
+
+   aom_clear_system_state();
+
+   // Forward elimination
+   for (piv = 0; piv < n - 1; piv++) {
+     // Bring the largest magnitude to the diagonal position by exchanging
+     // adjacent rows from the bottom up.
+     for (row = n - 1; row > piv; row--) {
+       double *const lower = A + row * stride;
+       double *const upper = A + (row - 1) * stride;
+       if (!(fabs(upper[piv]) < fabs(lower[piv]))) continue;
+       for (col = 0; col < n; col++) {
+         t = lower[col];
+         lower[col] = upper[col];
+         upper[col] = t;
+       }
+       t = b[row];
+       b[row] = b[row - 1];
+       b[row - 1] = t;
+     }
+     // Eliminate column piv from every row below the pivot row
+     for (row = piv; row < n - 1; row++) {
+       const double *const pivot_row = A + piv * stride;
+       double *const target = A + (row + 1) * stride;
+       if (fabs(pivot_row[piv]) < 1e-10) return 0;
+       t = target[piv] / pivot_row[piv];
+       for (col = 0; col < n; col++) target[col] -= t * pivot_row[col];
+       b[row + 1] -= t * b[piv];
+     }
+   }
+   // Backward substitution
+   for (row = n - 1; row >= 0; row--) {
+     const double *const cur = A + row * stride;
+     if (fabs(cur[row]) < 1e-10) return 0;
+     t = 0;
+     for (col = row + 1; col <= n - 1; col++) t += cur[col] * x[col];
+     x[row] = (b[row] - t) / cur[row];
+   }
+
+   return 1;
+ }
+
+ // Folds a tap index onto the first half of the window: indices below
+ // WIENER_HALFWIN1 are returned unchanged, the others map to
+ // WIENER_WIN - 1 - i.
+ static INLINE int wrap_index(int i) {
+   if (i < WIENER_HALFWIN1) return i;
+   return WIENER_WIN - 1 - i;
+ }
+
+ // Fix vector b, update vector a
+ // Builds a reduced linear system from the statistics Mc / Hc with b held
+ // fixed: tap indices are folded onto the first WIENER_HALFWIN1 entries with
+ // wrap_index(), the last folded entry is eliminated in the "normalization"
+ // step, and the remaining (w2 - 1) unknowns are solved with linsolve().
+ // On success the full WIENER_WIN-tap result is written to a[]; if linsolve()
+ // fails, a[] is left untouched.
+ static void update_a_sep_sym(double **Mc, double **Hc, double *a, double *b) {
+ int i, j;
+ double S[WIENER_WIN];
+ double A[WIENER_HALFWIN1], B[WIENER_HALFWIN1 * WIENER_HALFWIN1];
+ int w, w2;
+ memset(A, 0, sizeof(A));
+ memset(B, 0, sizeof(B));
+ // Right-hand side: Mc weighted by b, folded by wrap_index(j)
+ for (i = 0; i < WIENER_WIN; i++) {
+ for (j = 0; j < WIENER_WIN; ++j) {
+ const int jj = wrap_index(j);
+ A[jj] += Mc[i][j] * b[i];
+ }
+ }
+ // System matrix: Hc weighted by b[i] * b[j], folded by wrap_index on both
+ // k and l
+ for (i = 0; i < WIENER_WIN; i++) {
+ for (j = 0; j < WIENER_WIN; j++) {
+ int k, l;
+ for (k = 0; k < WIENER_WIN; ++k)
+ for (l = 0; l < WIENER_WIN; ++l) {
+ const int kk = wrap_index(k);
+ const int ll = wrap_index(l);
+ B[ll * WIENER_HALFWIN1 + kk] +=
+ Hc[j * WIENER_WIN + i][k * WIENER_WIN2 + l] * b[i] * b[j];
+ }
+ }
+ }
+ // Normalization enforcement in the system of equations itself
+ w = WIENER_WIN;
+ w2 = (w >> 1) + 1;
+ for (i = 0; i < w2 - 1; ++i)
+ A[i] -=
+ A[w2 - 1] * 2 + B[i * w2 + w2 - 1] - 2 * B[(w2 - 1) * w2 + (w2 - 1)];
+ for (i = 0; i < w2 - 1; ++i)
+ for (j = 0; j < w2 - 1; ++j)
+ B[i * w2 + j] -= 2 * (B[i * w2 + (w2 - 1)] + B[(w2 - 1) * w2 + j] -
+ 2 * B[(w2 - 1) * w2 + (w2 - 1)]);
+ // Solve for the first (w2 - 1) taps; B is read with row stride w2
+ if (linsolve(w2 - 1, B, w2, A, S)) {
+ // Mirror the solved taps onto the upper half and set the centre tap so
+ // that all w taps sum to 1.0
+ S[w2 - 1] = 1.0;
+ for (i = w2; i < w; ++i) {
+ S[i] = S[w - 1 - i];
+ S[w2 - 1] -= 2 * S[i];
+ }
+ memcpy(a, S, w * sizeof(*a));
+ }
+ }
+
+ // Fix vector a, update vector b
+ // Counterpart of update_a_sep_sym() with a held fixed: builds the reduced
+ // system from Mc / Hc (indices folded with wrap_index()), eliminates the last
+ // folded entry in the "normalization" step, and solves the remaining
+ // (w2 - 1) unknowns with linsolve().
+ // On success the full WIENER_WIN-tap result is written to b[]; if linsolve()
+ // fails, b[] is left untouched.
+ static void update_b_sep_sym(double **Mc, double **Hc, double *a, double *b) {
+ int i, j;
+ double S[WIENER_WIN];
+ double A[WIENER_HALFWIN1], B[WIENER_HALFWIN1 * WIENER_HALFWIN1];
+ int w, w2;
+ memset(A, 0, sizeof(A));
+ memset(B, 0, sizeof(B));
+ // Right-hand side: Mc weighted by a, folded by wrap_index(i)
+ for (i = 0; i < WIENER_WIN; i++) {
+ const int ii = wrap_index(i);
+ for (j = 0; j < WIENER_WIN; j++) A[ii] += Mc[i][j] * a[j];
+ }
+
+ // System matrix: Hc weighted by a[k] * a[l], folded by wrap_index on both
+ // i and j
+ for (i = 0; i < WIENER_WIN; i++) {
+ for (j = 0; j < WIENER_WIN; j++) {
+ const int ii = wrap_index(i);
+ const int jj = wrap_index(j);
+ int k, l;
+ for (k = 0; k < WIENER_WIN; ++k)
+ for (l = 0; l < WIENER_WIN; ++l)
+ B[jj * WIENER_HALFWIN1 + ii] +=
+ Hc[i * WIENER_WIN + j][k * WIENER_WIN2 + l] * a[k] * a[l];
+ }
+ }
+ // Normalization enforcement in the system of equations itself
+ w = WIENER_WIN;
+ w2 = WIENER_HALFWIN1;
+ for (i = 0; i < w2 - 1; ++i)
+ A[i] -=
+ A[w2 - 1] * 2 + B[i * w2 + w2 - 1] - 2 * B[(w2 - 1) * w2 + (w2 - 1)];
+ for (i = 0; i < w2 - 1; ++i)
+ for (j = 0; j < w2 - 1; ++j)
+ B[i * w2 + j] -= 2 * (B[i * w2 + (w2 - 1)] + B[(w2 - 1) * w2 + j] -
+ 2 * B[(w2 - 1) * w2 + (w2 - 1)]);
+ // Solve for the first (w2 - 1) taps; B is read with row stride w2
+ if (linsolve(w2 - 1, B, w2, A, S)) {
+ // Mirror the solved taps onto the upper half and set the centre tap so
+ // that all w taps sum to 1.0
+ S[w2 - 1] = 1.0;
+ for (i = w2; i < w; ++i) {
+ S[i] = S[w - 1 - i];
+ S[w2 - 1] -= 2 * S[i];
+ }
+ memcpy(b, S, w * sizeof(*b));
+ }
+ }
+
+ // Derives the two WIENER_WIN-tap filters a[] and b[] from the statistics M
+ // and H by alternating update_a_sep_sym() and update_b_sep_sym(), starting
+ // both filters from a fixed initial kernel. The pair of updates runs
+ // NUM_WIENER_ITERS - 1 times. Always returns 1.
+ static int wiener_decompose_sep_sym(double *M, double *H, double *a,
+                                     double *b) {
+   static const double init_filt[WIENER_WIN] = {
+     0.035623, -0.127154, 0.211436, 0.760190, 0.211436, -0.127154, 0.035623,
+   };
+   double *Hc[WIENER_WIN2];
+   double *Mc[WIENER_WIN];
+   int r, c, pass;
+
+   // Row / block pointers into M and H for the update helpers
+   for (r = 0; r < WIENER_WIN; r++) {
+     Mc[r] = M + r * WIENER_WIN;
+     for (c = 0; c < WIENER_WIN; c++) {
+       Hc[r * WIENER_WIN + c] =
+           H + r * WIENER_WIN * WIENER_WIN2 + c * WIENER_WIN;
+     }
+   }
+   memcpy(a, init_filt, sizeof(*a) * WIENER_WIN);
+   memcpy(b, init_filt, sizeof(*b) * WIENER_WIN);
+
+   for (pass = 1; pass < NUM_WIENER_ITERS; ++pass) {
+     update_a_sep_sym(Mc, Hc, a, b);
+     update_b_sep_sym(Mc, Hc, a, b);
+   }
+   return 1;
+ }
+
+ // Evaluates the objective x'*H*x - 2*x'*M for the separable 2D filter x built
+ // from the quantized taps vfilt / hfilt, and for the identity filter (centre
+ // tap only). Returns the learned filter's value minus the identity filter's
+ // value.
+ // Only the first WIENER_HALFWIN taps of each kernel are read: each is divided
+ // by WIENER_FILT_STEP and mirrored, and the centre tap is 1.0 minus twice the
+ // sum of those taps.
+ static double compute_score(double *M, double *H, InterpKernel vfilt,
+                             InterpKernel hfilt) {
+   double kernel2d[WIENER_WIN * WIENER_WIN];
+   double vert[WIENER_WIN], horz[WIENER_WIN];
+   double lin = 0, quad = 0;
+   double id_lin, id_quad;
+   double filt_score, id_score;
+   const int centre = WIENER_WIN2 >> 1;
+   int t, p, q;
+
+   aom_clear_system_state();
+
+   // Rebuild the symmetric floating-point taps from the quantized half-kernels
+   vert[WIENER_HALFWIN] = horz[WIENER_HALFWIN] = 1.0;
+   for (t = 0; t < WIENER_HALFWIN; ++t) {
+     vert[t] = vert[WIENER_WIN - t - 1] = (double)vfilt[t] / WIENER_FILT_STEP;
+     horz[t] = horz[WIENER_WIN - t - 1] = (double)hfilt[t] / WIENER_FILT_STEP;
+     vert[WIENER_HALFWIN] -= 2 * vert[t];
+     horz[WIENER_HALFWIN] -= 2 * horz[t];
+   }
+   // Outer product of the two 1D filters
+   for (p = 0; p < WIENER_WIN; ++p) {
+     for (q = 0; q < WIENER_WIN; ++q) {
+       kernel2d[p * WIENER_WIN + q] = vert[q] * horz[p];
+     }
+   }
+   for (p = 0; p < WIENER_WIN2; ++p) {
+     lin += kernel2d[p] * M[p];
+   }
+   for (p = 0; p < WIENER_WIN2; ++p) {
+     for (q = 0; q < WIENER_WIN2; ++q) {
+       quad += kernel2d[p] * H[p * WIENER_WIN2 + q] * kernel2d[q];
+     }
+   }
+   filt_score = quad - 2 * lin;
+
+   // Identity filter: only the centre coefficient is non-zero
+   id_lin = M[centre];
+   id_quad = H[centre * WIENER_WIN2 + centre];
+   id_score = id_quad - 2 * id_lin;
+
+   return filt_score - id_score;
+ }
+
+ // Quantizes the symmetric floating-point filter f into the integer kernel fi.
+ // The first WIENER_HALFWIN taps are scaled by WIENER_FILT_STEP and rounded
+ // with RINT, clipped to their per-tap ranges, and mirrored onto the far end
+ // of the kernel; the centre tap fi[3] is set to minus twice their sum.
+ static void quantize_sym_filter(double *f, InterpKernel fi) {
+   int tap;
+   int side_sum;
+   for (tap = 0; tap < WIENER_HALFWIN; ++tap) {
+     fi[tap] = RINT(f[tap] * WIENER_FILT_STEP);
+   }
+   // Specialize for 7-tap filter
+   fi[0] = CLIP(fi[0], WIENER_FILT_TAP0_MINV, WIENER_FILT_TAP0_MAXV);
+   fi[1] = CLIP(fi[1], WIENER_FILT_TAP1_MINV, WIENER_FILT_TAP1_MAXV);
+   fi[2] = CLIP(fi[2], WIENER_FILT_TAP2_MINV, WIENER_FILT_TAP2_MAXV);
+   // Satisfy filter constraints: mirror the outer taps
+   fi[WIENER_WIN - 1] = fi[0];
+   fi[WIENER_WIN - 2] = fi[1];
+   fi[WIENER_WIN - 3] = fi[2];
+   // The central element has an implicit +WIENER_FILT_STEP
+   side_sum = fi[0] + fi[1] + fi[2];
+   fi[3] = -2 * side_sum;
+ }
+
+ // Number of bits counted for one tile's Wiener filter: the sum of
+ // aom_count_primitive_refsubexpfin() over taps 0..2 of the vertical filter
+ // and taps 0..2 of the horizontal filter, each measured relative to the
+ // matching tap of ref_wiener_info.
+ static int count_wiener_bits(WienerInfo *wiener_info,
+                              WienerInfo *ref_wiener_info) {
+   const int v0_bits = aom_count_primitive_refsubexpfin(
+       WIENER_FILT_TAP0_MAXV - WIENER_FILT_TAP0_MINV + 1,
+       WIENER_FILT_TAP0_SUBEXP_K,
+       ref_wiener_info->vfilter[0] - WIENER_FILT_TAP0_MINV,
+       wiener_info->vfilter[0] - WIENER_FILT_TAP0_MINV);
+   const int v1_bits = aom_count_primitive_refsubexpfin(
+       WIENER_FILT_TAP1_MAXV - WIENER_FILT_TAP1_MINV + 1,
+       WIENER_FILT_TAP1_SUBEXP_K,
+       ref_wiener_info->vfilter[1] - WIENER_FILT_TAP1_MINV,
+       wiener_info->vfilter[1] - WIENER_FILT_TAP1_MINV);
+   const int v2_bits = aom_count_primitive_refsubexpfin(
+       WIENER_FILT_TAP2_MAXV - WIENER_FILT_TAP2_MINV + 1,
+       WIENER_FILT_TAP2_SUBEXP_K,
+       ref_wiener_info->vfilter[2] - WIENER_FILT_TAP2_MINV,
+       wiener_info->vfilter[2] - WIENER_FILT_TAP2_MINV);
+   const int h0_bits = aom_count_primitive_refsubexpfin(
+       WIENER_FILT_TAP0_MAXV - WIENER_FILT_TAP0_MINV + 1,
+       WIENER_FILT_TAP0_SUBEXP_K,
+       ref_wiener_info->hfilter[0] - WIENER_FILT_TAP0_MINV,
+       wiener_info->hfilter[0] - WIENER_FILT_TAP0_MINV);
+   const int h1_bits = aom_count_primitive_refsubexpfin(
+       WIENER_FILT_TAP1_MAXV - WIENER_FILT_TAP1_MINV + 1,
+       WIENER_FILT_TAP1_SUBEXP_K,
+       ref_wiener_info->hfilter[1] - WIENER_FILT_TAP1_MINV,
+       wiener_info->hfilter[1] - WIENER_FILT_TAP1_MINV);
+   const int h2_bits = aom_count_primitive_refsubexpfin(
+       WIENER_FILT_TAP2_MAXV - WIENER_FILT_TAP2_MINV + 1,
+       WIENER_FILT_TAP2_SUBEXP_K,
+       ref_wiener_info->hfilter[2] - WIENER_FILT_TAP2_MINV,
+       wiener_info->hfilter[2] - WIENER_FILT_TAP2_MINV);
+   return v0_bits + v1_bits + v2_bits + h0_bits + h1_bits + h2_bits;
+ }
+
+ // Per-tile Wiener filter search for one chroma plane (`plane` must be
+ // AOM_PLANE_U or AOM_PLANE_V; anything else hits assert(0)).
+ // For every tile it fits a separable symmetric Wiener filter, and keeps it
+ // only if it scores no worse than the identity filter in compute_score() and
+ // its RD cost is lower than leaving the tile unrestored. Decisions go to
+ // type[], accepted filters to info->wiener_info[].
+ // Finally the whole-frame Wiener cost is compared with the whole-frame
+ // no-restoration cost; info->frame_restoration_type is set to the winner and
+ // the winning cost is returned.
+ // cpi->rst_search[plane] is used as the working restoration description.
+ static double search_wiener_uv(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi,
+ int partial_frame, int plane,
+ RestorationInfo *info, RestorationType *type,
+ YV12_BUFFER_CONFIG *dst_frame) {
+ WienerInfo *wiener_info = info->wiener_info;
+ AV1_COMMON *const cm = &cpi->common;
+ RestorationInfo *rsi = cpi->rst_search;
+ int64_t err;
+ int bits;
+ double cost_wiener, cost_norestore, cost_wiener_frame, cost_norestore_frame;
+ MACROBLOCK *x = &cpi->td.mb;
+ double M[WIENER_WIN2];
+ double H[WIENER_WIN2 * WIENER_WIN2];
+ double vfilterd[WIENER_WIN], hfilterd[WIENER_WIN];
+ const YV12_BUFFER_CONFIG *dgd = cm->frame_to_show;
+ const int width = src->uv_crop_width;
+ const int height = src->uv_crop_height;
+ const int src_stride = src->uv_stride;
+ const int dgd_stride = dgd->uv_stride;
+ double score;
+ int tile_idx, tile_width, tile_height, nhtiles, nvtiles;
+ int h_start, h_end, v_start, v_end;
+ const int ntiles =
+ av1_get_rest_ntiles(width, height, cm->rst_info[1].restoration_tilesize,
+ &tile_width, &tile_height, &nhtiles, &nvtiles);
+ // Reference filter against which each tile's filter is costed
+ WienerInfo ref_wiener_info;
+ set_default_wiener(&ref_wiener_info);
+ assert(width == dgd->uv_crop_width);
+ assert(height == dgd->uv_crop_height);
+
+ // Frame-level cost of not restoring this plane (zero signalling bits)
+ rsi[plane].frame_restoration_type = RESTORE_NONE;
+ err = sse_restoration_frame(cm, src, cm->frame_to_show, (1 << plane));
+ bits = 0;
+ cost_norestore_frame = RDCOST_DBL(x->rdmult, x->rddiv, (bits >> 4), err);
+
+ rsi[plane].frame_restoration_type = RESTORE_WIENER;
+
+ // Start with every tile disabled so that try_restoration_tile() below is
+ // called with only the tile under test enabled.
+ for (tile_idx = 0; tile_idx < ntiles; ++tile_idx) {
+ rsi[plane].restoration_type[tile_idx] = RESTORE_NONE;
+ }
+
+ // Compute best Wiener filters for each tile
+ for (tile_idx = 0; tile_idx < ntiles; ++tile_idx) {
+ av1_get_rest_tile_limits(tile_idx, 0, 0, nhtiles, nvtiles, tile_width,
+ tile_height, width, height, 0, 0, &h_start, &h_end,
+ &v_start, &v_end);
+ err = sse_restoration_tile(src, cm->frame_to_show, cm, h_start,
+ h_end - h_start, v_start, v_end - v_start,
+ 1 << plane);
+ // #bits when a tile is not restored
+ bits = av1_cost_bit(RESTORE_NONE_WIENER_PROB, 0);
+ cost_norestore = RDCOST_DBL(x->rdmult, x->rddiv, (bits >> 4), err);
+ // best_tile_cost[tile_idx] = DBL_MAX;
+
+ // Recompute the tile limits, this time passing WIENER_HALFWIN for the two
+ // arguments that were 0 above, before gathering statistics (compute_stats
+ // reads WIENER_HALFWIN samples around every pixel).
+ av1_get_rest_tile_limits(tile_idx, 0, 0, nhtiles, nvtiles, tile_width,
+ tile_height, width, height, WIENER_HALFWIN,
+ WIENER_HALFWIN, &h_start, &h_end, &v_start,
+ &v_end);
+ if (plane == AOM_PLANE_U) {
+#if CONFIG_HIGHBITDEPTH
+ if (cm->use_highbitdepth)
+ compute_stats_highbd(dgd->u_buffer, src->u_buffer, h_start, h_end,
+ v_start, v_end, dgd_stride, src_stride, M, H);
+ else
+#endif // CONFIG_HIGHBITDEPTH
+ compute_stats(dgd->u_buffer, src->u_buffer, h_start, h_end, v_start,
+ v_end, dgd_stride, src_stride, M, H);
+ } else if (plane == AOM_PLANE_V) {
+#if CONFIG_HIGHBITDEPTH
+ if (cm->use_highbitdepth)
+ compute_stats_highbd(dgd->v_buffer, src->v_buffer, h_start, h_end,
+ v_start, v_end, dgd_stride, src_stride, M, H);
+ else
+#endif // CONFIG_HIGHBITDEPTH
+ compute_stats(dgd->v_buffer, src->v_buffer, h_start, h_end, v_start,
+ v_end, dgd_stride, src_stride, M, H);
+ } else {
+ assert(0);
+ }
+
+ type[tile_idx] = RESTORE_WIENER;
+
+ if (!wiener_decompose_sep_sym(M, H, vfilterd, hfilterd)) {
+ type[tile_idx] = RESTORE_NONE;
+ continue;
+ }
+ quantize_sym_filter(vfilterd, rsi[plane].wiener_info[tile_idx].vfilter);
+ quantize_sym_filter(hfilterd, rsi[plane].wiener_info[tile_idx].hfilter);
+
+ // Filter score computes the value of the function x'*A*x - x'*b for the
+ // learned filter and compares it against identity filter. If there is no
+ // reduction in the function, the filter is reverted back to identity
+ score = compute_score(M, H, rsi[plane].wiener_info[tile_idx].vfilter,
+ rsi[plane].wiener_info[tile_idx].hfilter);
+ if (score > 0.0) {
+ type[tile_idx] = RESTORE_NONE;
+ continue;
+ }
+
+ rsi[plane].restoration_type[tile_idx] = RESTORE_WIENER;
+ err = try_restoration_tile(src, cpi, rsi, 1 << plane, partial_frame,
+ tile_idx, 0, 0, dst_frame);
+ bits =
+ count_wiener_bits(&rsi[plane].wiener_info[tile_idx], &ref_wiener_info)
+ << AV1_PROB_COST_SHIFT;
+ // bits = WIENER_FILT_BITS << AV1_PROB_COST_SHIFT;
+ bits += av1_cost_bit(RESTORE_NONE_WIENER_PROB, 1);
+ cost_wiener = RDCOST_DBL(x->rdmult, x->rddiv, (bits >> 4), err);
+ if (cost_wiener >= cost_norestore) {
+ type[tile_idx] = RESTORE_NONE;
+ } else {
+ type[tile_idx] = RESTORE_WIENER;
+ memcpy(&wiener_info[tile_idx], &rsi[plane].wiener_info[tile_idx],
+ sizeof(wiener_info[tile_idx]));
+ // Later tiles are costed relative to the most recently accepted filter
+ memcpy(&ref_wiener_info, &rsi[plane].wiener_info[tile_idx],
+ sizeof(ref_wiener_info));
+ }
+ // Disable the tile again before testing the next one
+ rsi[plane].restoration_type[tile_idx] = RESTORE_NONE;
+ }
+ // Cost for Wiener filtering
+ set_default_wiener(&ref_wiener_info);
+ bits = 0;
+ // Re-accumulate the bit cost over all tiles with the final decisions and
+ // load those decisions into rsi[plane] for the whole-frame trial.
+ for (tile_idx = 0; tile_idx < ntiles; ++tile_idx) {
+ bits +=
+ av1_cost_bit(RESTORE_NONE_WIENER_PROB, type[tile_idx] != RESTORE_NONE);
+ memcpy(&rsi[plane].wiener_info[tile_idx], &wiener_info[tile_idx],
+ sizeof(wiener_info[tile_idx]));
+ if (type[tile_idx] == RESTORE_WIENER) {
+ bits +=
+ count_wiener_bits(&rsi[plane].wiener_info[tile_idx], &ref_wiener_info)
+ << AV1_PROB_COST_SHIFT;
+ memcpy(&ref_wiener_info, &rsi[plane].wiener_info[tile_idx],
+ sizeof(ref_wiener_info));
+ }
+ rsi[plane].restoration_type[tile_idx] = type[tile_idx];
+ }
+ err = try_restoration_frame(src, cpi, rsi, 1 << plane, partial_frame,
+ dst_frame);
+ cost_wiener_frame = RDCOST_DBL(x->rdmult, x->rddiv, (bits >> 4), err);
+
+ // Pick the cheaper frame-level option; ties go to RESTORE_NONE
+ if (cost_wiener_frame < cost_norestore_frame) {
+ info->frame_restoration_type = RESTORE_WIENER;
+ } else {
+ info->frame_restoration_type = RESTORE_NONE;
+ }
+
+ return info->frame_restoration_type == RESTORE_WIENER ? cost_wiener_frame
+ : cost_norestore_frame;
+ }
+
+ // Per-tile Wiener filter search on the luma plane.
+ // For every tile it fits a separable symmetric Wiener filter, and keeps it
+ // only if it scores no worse than the identity filter in compute_score() and
+ // its RD cost is lower than leaving the tile unrestored. Decisions go to
+ // type[], accepted filters to info->wiener_info[].
+ // best_tile_cost[tile] is set to the restored-tile error when Wiener is
+ // chosen and stays DBL_MAX otherwise.
+ // Returns the frame-level RD cost of applying the per-tile decisions.
+ // cpi->rst_search[0] (rsi) is used as the working restoration description.
+ static double search_wiener(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi,
+ int partial_frame, RestorationInfo *info,
+ RestorationType *type, double *best_tile_cost,
+ YV12_BUFFER_CONFIG *dst_frame) {
+ WienerInfo *wiener_info = info->wiener_info;
+ AV1_COMMON *const cm = &cpi->common;
+ RestorationInfo *rsi = cpi->rst_search;
+ int64_t err;
+ int bits;
+ double cost_wiener, cost_norestore;
+ MACROBLOCK *x = &cpi->td.mb;
+ double M[WIENER_WIN2];
+ double H[WIENER_WIN2 * WIENER_WIN2];
+ double vfilterd[WIENER_WIN], hfilterd[WIENER_WIN];
+ const YV12_BUFFER_CONFIG *dgd = cm->frame_to_show;
+ const int width = cm->width;
+ const int height = cm->height;
+ const int src_stride = src->y_stride;
+ const int dgd_stride = dgd->y_stride;
+ double score;
+ int tile_idx, tile_width, tile_height, nhtiles, nvtiles;
+ int h_start, h_end, v_start, v_end;
+ const int ntiles =
+ av1_get_rest_ntiles(width, height, cm->rst_info[0].restoration_tilesize,
+ &tile_width, &tile_height, &nhtiles, &nvtiles);
+ // Reference filter against which each tile's filter is costed
+ WienerInfo ref_wiener_info;
+ set_default_wiener(&ref_wiener_info);
+
+ assert(width == dgd->y_crop_width);
+ assert(height == dgd->y_crop_height);
+ assert(width == src->y_crop_width);
+ assert(height == src->y_crop_height);
+
+ rsi->frame_restoration_type = RESTORE_WIENER;
+
+ // Start with every tile disabled so that try_restoration_tile() below is
+ // called with only the tile under test enabled.
+ for (tile_idx = 0; tile_idx < ntiles; ++tile_idx) {
+ rsi->restoration_type[tile_idx] = RESTORE_NONE;
+ }
+
+// Construct a (WIENER_HALFWIN)-pixel border around the frame
+#if CONFIG_HIGHBITDEPTH
+ if (cm->use_highbitdepth)
+ extend_frame_highbd(CONVERT_TO_SHORTPTR(dgd->y_buffer), width, height,
+ dgd_stride);
+ else
+#endif
+ extend_frame(dgd->y_buffer, width, height, dgd_stride);
+
+ // Compute best Wiener filters for each tile
+ for (tile_idx = 0; tile_idx < ntiles; ++tile_idx) {
+ av1_get_rest_tile_limits(tile_idx, 0, 0, nhtiles, nvtiles, tile_width,
+ tile_height, width, height, 0, 0, &h_start, &h_end,
+ &v_start, &v_end);
+ err = sse_restoration_tile(src, cm->frame_to_show, cm, h_start,
+ h_end - h_start, v_start, v_end - v_start, 1);
+ // #bits when a tile is not restored
+ bits = av1_cost_bit(RESTORE_NONE_WIENER_PROB, 0);
+ cost_norestore = RDCOST_DBL(x->rdmult, x->rddiv, (bits >> 4), err);
+ best_tile_cost[tile_idx] = DBL_MAX;
+
+ // NOTE(review): this call passes exactly the same arguments as the one at
+ // the top of the loop body, so it looks redundant -- confirm.
+ av1_get_rest_tile_limits(tile_idx, 0, 0, nhtiles, nvtiles, tile_width,
+ tile_height, width, height, 0, 0, &h_start, &h_end,
+ &v_start, &v_end);
+#if CONFIG_HIGHBITDEPTH
+ if (cm->use_highbitdepth)
+ compute_stats_highbd(dgd->y_buffer, src->y_buffer, h_start, h_end,
+ v_start, v_end, dgd_stride, src_stride, M, H);
+ else
+#endif // CONFIG_HIGHBITDEPTH
+ compute_stats(dgd->y_buffer, src->y_buffer, h_start, h_end, v_start,
+ v_end, dgd_stride, src_stride, M, H);
+
+ type[tile_idx] = RESTORE_WIENER;
+
+ if (!wiener_decompose_sep_sym(M, H, vfilterd, hfilterd)) {
+ type[tile_idx] = RESTORE_NONE;
+ continue;
+ }
+ quantize_sym_filter(vfilterd, rsi->wiener_info[tile_idx].vfilter);
+ quantize_sym_filter(hfilterd, rsi->wiener_info[tile_idx].hfilter);
+
+ // Filter score computes the value of the function x'*A*x - x'*b for the
+ // learned filter and compares it against identity filter. If there is no
+ // reduction in the function, the filter is reverted back to identity
+ score = compute_score(M, H, rsi->wiener_info[tile_idx].vfilter,
+ rsi->wiener_info[tile_idx].hfilter);
+ if (score > 0.0) {
+ type[tile_idx] = RESTORE_NONE;
+ continue;
+ }
+
+ rsi->restoration_type[tile_idx] = RESTORE_WIENER;
+ err = try_restoration_tile(src, cpi, rsi, 1, partial_frame, tile_idx, 0, 0,
+ dst_frame);
+ bits = count_wiener_bits(&rsi->wiener_info[tile_idx], &ref_wiener_info)
+ << AV1_PROB_COST_SHIFT;
+ bits += av1_cost_bit(RESTORE_NONE_WIENER_PROB, 1);
+ cost_wiener = RDCOST_DBL(x->rdmult, x->rddiv, (bits >> 4), err);
+ if (cost_wiener >= cost_norestore) {
+ type[tile_idx] = RESTORE_NONE;
+ } else {
+ type[tile_idx] = RESTORE_WIENER;
+ memcpy(&wiener_info[tile_idx], &rsi->wiener_info[tile_idx],
+ sizeof(wiener_info[tile_idx]));
+ // Later tiles are costed relative to the most recently accepted filter
+ memcpy(&ref_wiener_info, &rsi->wiener_info[tile_idx],
+ sizeof(ref_wiener_info));
+ // NOTE(review): this value of `bits` is overwritten (next iteration or
+ // the frame-level pass below) before it is ever read.
+ bits = count_wiener_bits(&wiener_info[tile_idx], &ref_wiener_info)
+ << AV1_PROB_COST_SHIFT;
+ best_tile_cost[tile_idx] = err;
+ }
+ // Disable the tile again before testing the next one
+ rsi->restoration_type[tile_idx] = RESTORE_NONE;
+ }
+ // Cost for Wiener filtering
+ set_default_wiener(&ref_wiener_info);
+ bits = frame_level_restore_bits[rsi->frame_restoration_type]
+ << AV1_PROB_COST_SHIFT;
+ // Re-accumulate the bit cost over all tiles with the final decisions and
+ // load those decisions into rsi for the whole-frame trial.
+ for (tile_idx = 0; tile_idx < ntiles; ++tile_idx) {
+ bits +=
+ av1_cost_bit(RESTORE_NONE_WIENER_PROB, type[tile_idx] != RESTORE_NONE);
+ memcpy(&rsi->wiener_info[tile_idx], &wiener_info[tile_idx],
+ sizeof(wiener_info[tile_idx]));
+ if (type[tile_idx] == RESTORE_WIENER) {
+ bits += count_wiener_bits(&rsi->wiener_info[tile_idx], &ref_wiener_info)
+ << AV1_PROB_COST_SHIFT;
+ memcpy(&ref_wiener_info, &rsi->wiener_info[tile_idx],
+ sizeof(ref_wiener_info));
+ }
+ rsi->restoration_type[tile_idx] = type[tile_idx];
+ }
+ err = try_restoration_frame(src, cpi, rsi, 1, partial_frame, dst_frame);
+ cost_wiener = RDCOST_DBL(x->rdmult, x->rddiv, (bits >> 4), err);
+
+ return cost_wiener;
+ }
+
+ // "No restoration" entry of the restoration search: marks every luma tile as
+ // RESTORE_NONE, stores each tile's unrestored error (from
+ // sse_restoration_tile()) in best_tile_cost[], and returns the frame-level RD
+ // cost of signalling RESTORE_NONE. info, dst_frame and partial_frame are
+ // unused.
+ static double search_norestore(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi,
+                                int partial_frame, RestorationInfo *info,
+                                RestorationType *type, double *best_tile_cost,
+                                YV12_BUFFER_CONFIG *dst_frame) {
+   AV1_COMMON *const cm = &cpi->common;
+   MACROBLOCK *x = &cpi->td.mb;
+   int tile_width, tile_height, nhtiles, nvtiles;
+   int h_start, h_end, v_start, v_end;
+   int t;
+   int frame_bits;
+   double tile_sse, frame_sse;
+   const int ntiles = av1_get_rest_ntiles(
+       cm->width, cm->height, cm->rst_info[0].restoration_tilesize, &tile_width,
+       &tile_height, &nhtiles, &nvtiles);
+   (void)info;
+   (void)dst_frame;
+   (void)partial_frame;
+
+   for (t = 0; t < ntiles; ++t) {
+     av1_get_rest_tile_limits(t, 0, 0, nhtiles, nvtiles, tile_width,
+                              tile_height, cm->width, cm->height, 0, 0,
+                              &h_start, &h_end, &v_start, &v_end);
+     tile_sse = sse_restoration_tile(src, cm->frame_to_show, cm, h_start,
+                                     h_end - h_start, v_start,
+                                     v_end - v_start, 1);
+     type[t] = RESTORE_NONE;
+     best_tile_cost[t] = tile_sse;
+   }
+   // RD cost associated with no restoration, measured over the whole frame
+   frame_sse = sse_restoration_tile(src, cm->frame_to_show, cm, 0, cm->width, 0,
+                                    cm->height, 1);
+   frame_bits = frame_level_restore_bits[RESTORE_NONE] << AV1_PROB_COST_SHIFT;
+   return RDCOST_DBL(x->rdmult, x->rddiv, (frame_bits >> 4), frame_sse);
+ }
+
+ // Chooses, tile by tile, the restoration type with the lowest RD cost among
+ // RESTORE_NONE and the types 1 .. RESTORE_SWITCHABLE_TYPES - 1, using the
+ // per-tile errors in tile_cost[type][tile] as distortion.
+ // Writes the choice to rsi->restoration_type[], sets
+ // rsi->frame_restoration_type to RESTORE_SWITCHABLE, and returns the sum of
+ // the frame-level signalling cost and the best per-tile costs.
+ // When force_restore_type is non-zero only that type competes with
+ // RESTORE_NONE. partial_frame is unused.
+ static double search_switchable_restoration(
+ AV1_COMP *cpi, int partial_frame, RestorationInfo *rsi,
+ double *tile_cost[RESTORE_SWITCHABLE_TYPES]) {
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCK *x = &cpi->td.mb;
+ double cost_switchable = 0;
+ int bits, tile_idx;
+ RestorationType r;
+ const int ntiles = av1_get_rest_ntiles(cm->width, cm->height,
+ cm->rst_info[0].restoration_tilesize,
+ NULL, NULL, NULL, NULL);
+ // Reference parameters against which each tile's parameters are costed
+ SgrprojInfo ref_sgrproj_info;
+ set_default_sgrproj(&ref_sgrproj_info);
+ WienerInfo ref_wiener_info;
+ set_default_wiener(&ref_wiener_info);
+ (void)partial_frame;
+
+ rsi->frame_restoration_type = RESTORE_SWITCHABLE;
+ bits = frame_level_restore_bits[rsi->frame_restoration_type]
+ << AV1_PROB_COST_SHIFT;
+ // Frame-level signalling cost, evaluated with zero distortion
+ cost_switchable = RDCOST_DBL(x->rdmult, x->rddiv, bits >> 4, 0);
+ for (tile_idx = 0; tile_idx < ntiles; ++tile_idx) {
+ // Baseline candidate: leave the tile unrestored
+ double best_cost = RDCOST_DBL(
+ x->rdmult, x->rddiv, (cpi->switchable_restore_cost[RESTORE_NONE] >> 4),
+ tile_cost[RESTORE_NONE][tile_idx]);
+ rsi->restoration_type[tile_idx] = RESTORE_NONE;
+ for (r = 1; r < RESTORE_SWITCHABLE_TYPES; r++) {
+ if (force_restore_type != 0)
+ if (r != force_restore_type) continue;
+ // Parameter bits for this type, plus the cost of signalling the type
+ int tilebits = 0;
+ if (r == RESTORE_WIENER)
+ tilebits +=
+ count_wiener_bits(&rsi->wiener_info[tile_idx], &ref_wiener_info);
+ else if (r == RESTORE_SGRPROJ)
+ tilebits +=
+ count_sgrproj_bits(&rsi->sgrproj_info[tile_idx], &ref_sgrproj_info);
+ tilebits <<= AV1_PROB_COST_SHIFT;
+ tilebits += cpi->switchable_restore_cost[r];
+ double cost = RDCOST_DBL(x->rdmult, x->rddiv, tilebits >> 4,
+ tile_cost[r][tile_idx]);
+
+ // Strict '<' keeps the earlier candidate on ties
+ if (cost < best_cost) {
+ rsi->restoration_type[tile_idx] = r;
+ best_cost = cost;
+ }
+ }
+ // The chosen tile's parameters become the reference for the next tiles
+ if (rsi->restoration_type[tile_idx] == RESTORE_WIENER)
+ memcpy(&ref_wiener_info, &rsi->wiener_info[tile_idx],
+ sizeof(ref_wiener_info));
+ else if (rsi->restoration_type[tile_idx] == RESTORE_SGRPROJ)
+ memcpy(&ref_sgrproj_info, &rsi->sgrproj_info[tile_idx],
+ sizeof(ref_sgrproj_info));
+ if (force_restore_type != 0)
+ assert(rsi->restoration_type[tile_idx] == force_restore_type ||
+ rsi->restoration_type[tile_idx] == RESTORE_NONE);
+ cost_switchable += best_cost;
+ }
+ return cost_switchable;
+ }
+
+/* Chooses the loop-restoration configuration for the current frame.
+ *
+ * Plane 0: each switchable restoration type is searched through its entry in
+ * search_restore_fun[], the switchable mode is costed from the per-tile
+ * results, and the cheapest frame-level type is stored in
+ * cm->rst_info[0].frame_restoration_type. Unless the switchable mode wins,
+ * the winner's per-tile types are copied into cm->rst_info[0].
+ * Planes U and V: handled afterwards by search_wiener_uv().
+ *
+ * A non-zero force_restore_type limits the candidates to RESTORE_NONE and
+ * the forced type. `method` is only compared against LPF_PICK_FROM_SUBIMAGE
+ * to derive the partial-frame flag handed to the search functions.
+ */
+void av1_pick_filter_restoration(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi,
+                                 LPF_PICK_METHOD method) {
+  static search_restore_type search_restore_fun[RESTORE_SWITCHABLE_TYPES] = {
+    search_norestore, search_wiener, search_sgrproj,
+  };
+  AV1_COMMON *const cm = &cpi->common;
+  const int partial_frame = (method == LPF_PICK_FROM_SUBIMAGE);
+  const int ntiles = av1_get_rest_ntiles(cm->width, cm->height,
+                                         cm->rst_info[0].restoration_tilesize,
+                                         NULL, NULL, NULL, NULL);
+  double cost_restore[RESTORE_TYPES];
+  double *tile_cost[RESTORE_SWITCHABLE_TYPES];
+  RestorationType *restore_types[RESTORE_SWITCHABLE_TYPES];
+  double best_cost_restore = DBL_MAX;
+  RestorationType best_restore = 0;
+  RestorationType rt;
+
+  /* Per-type scratch: tile distortions and tile decisions.
+     NOTE(review): the aom_malloc() results are used without a NULL check. */
+  for (rt = 0; rt < RESTORE_SWITCHABLE_TYPES; rt++) {
+    tile_cost[rt] = (double *)aom_malloc(sizeof(*tile_cost[0]) * ntiles);
+    restore_types[rt] =
+        (RestorationType *)aom_malloc(sizeof(*restore_types[0]) * ntiles);
+  }
+
+  /* Cost every candidate type on its own, then the switchable mode. */
+  for (rt = 0; rt < RESTORE_SWITCHABLE_TYPES; ++rt) {
+    if (force_restore_type != 0 && rt != RESTORE_NONE &&
+        rt != force_restore_type) {
+      continue;
+    }
+    cost_restore[rt] = search_restore_fun[rt](
+        src, cpi, partial_frame, &cm->rst_info[0], restore_types[rt],
+        tile_cost[rt], &cpi->trial_frame_rst);
+  }
+  cost_restore[RESTORE_SWITCHABLE] = search_switchable_restoration(
+      cpi, partial_frame, &cm->rst_info[0], tile_cost);
+
+  /* Keep the cheapest frame-level type; entries skipped above are skipped
+     here by the same condition, so they are never read. */
+  for (rt = 0; rt < RESTORE_TYPES; ++rt) {
+    if (force_restore_type != 0 && rt != RESTORE_NONE &&
+        rt != force_restore_type) {
+      continue;
+    }
+    if (cost_restore[rt] < best_cost_restore) {
+      best_cost_restore = cost_restore[rt];
+      best_restore = rt;
+    }
+  }
+  cm->rst_info[0].frame_restoration_type = best_restore;
+  assert(force_restore_type == 0 || best_restore == force_restore_type ||
+         best_restore == RESTORE_NONE);
+  if (best_restore != RESTORE_SWITCHABLE) {
+    memcpy(cm->rst_info[0].restoration_type, restore_types[best_restore],
+           ntiles * sizeof(restore_types[best_restore][0]));
+  }
+
+  // Color components
+  search_wiener_uv(src, cpi, partial_frame, AOM_PLANE_U,
+                   &cm->rst_info[AOM_PLANE_U],
+                   cm->rst_info[AOM_PLANE_U].restoration_type,
+                   &cpi->trial_frame_rst);
+  search_wiener_uv(src, cpi, partial_frame, AOM_PLANE_V,
+                   &cm->rst_info[AOM_PLANE_V],
+                   cm->rst_info[AOM_PLANE_V].restoration_type,
+                   &cpi->trial_frame_rst);
+
+  for (rt = 0; rt < RESTORE_SWITCHABLE_TYPES; rt++) {
+    aom_free(tile_cost[rt]);
+    aom_free(restore_types[rt]);
+  }
+}
diff --git a/third_party/aom/av1/encoder/pickrst.h b/third_party/aom/av1/encoder/pickrst.h
new file mode 100644
index 0000000000..f6096ed1d1
--- /dev/null
+++ b/third_party/aom/av1/encoder/pickrst.h
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AV1_ENCODER_PICKRST_H_
+#define AV1_ENCODER_PICKRST_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "av1/encoder/encoder.h"
+
+struct yv12_buffer_config;
+struct AV1_COMP;
+/* Picks the loop-restoration settings for the current frame and stores them
+ * in cpi->common.rst_info[]: plane 0 is searched over the restoration types,
+ * planes U and V go through the Wiener search. `sd` is the source frame (the
+ * definition names it `src`); `method` is only compared against
+ * LPF_PICK_FROM_SUBIMAGE.
+ */
+void av1_pick_filter_restoration(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi,
+ LPF_PICK_METHOD method);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AV1_ENCODER_PICKRST_H_
diff --git a/third_party/aom/av1/encoder/pvq_encoder.c b/third_party/aom/av1/encoder/pvq_encoder.c
new file mode 100644
index 0000000000..ab63f1b7dd
--- /dev/null
+++ b/third_party/aom/av1/encoder/pvq_encoder.c
@@ -0,0 +1,988 @@
+/*
+ * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/* clang-format off */
+
+#ifdef HAVE_CONFIG_H
+# include "config.h"
+#endif
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include "aom_dsp/entcode.h"
+#include "aom_dsp/entenc.h"
+#include "av1/common/blockd.h"
+#include "av1/common/odintrin.h"
+#include "av1/common/partition.h"
+#include "av1/common/pvq_state.h"
+#include "av1/encoder/encodemb.h"
+#include "av1/encoder/pvq_encoder.h"
+#include "aom_ports/system_state.h"
+
+/*Shift to ensure that the upper bound (i.e. for the max blocksize) of the
+ dot-product of the 1st band of chroma with the luma ref doesn't overflow.*/
+#define OD_CFL_FLIP_SHIFT (OD_LIMIT_BSIZE_MAX + 0)
+
+/* Writes symbol `symb` from an alphabet of `nsymbs` symbols with the adaptive
+   CDF `cdf`. A CDF whose first entry is 0 is first set up with
+   aom_cdf_init_q15_1D() (presumably 0 marks "not initialized yet"). */
+void aom_write_symbol_pvq(aom_writer *w, int symb, aom_cdf_prob *cdf,
+                          int nsymbs) {
+  const int needs_init = (cdf[0] == 0);
+  if (needs_init) {
+    aom_cdf_init_q15_1D(cdf, nsymbs, CDF_SIZE(nsymbs));
+  }
+  aom_write_symbol(w, symb, cdf, nsymbs);
+}
+
+/* Codes an n-dimensional pulse vector `in` holding k pulses: the pulse
+   split goes through aom_encode_band_pvq_splits(), then one raw sign bit
+   (1 = negative) is written for every non-zero coefficient, in index order. */
+static void aom_encode_pvq_codeword(aom_writer *w, od_pvq_codeword_ctx *adapt,
+                                    const od_coeff *in, int n, int k) {
+  int idx;
+  aom_encode_band_pvq_splits(w, adapt, in, n, k, 0);
+  for (idx = 0; idx < n; idx++) {
+    if (!in[idx]) continue;
+    aom_write_bit(w, in[idx] < 0);
+  }
+}
+
+/* Computes 1/sqrt(i), using a lookup table for 1 <= i <= 16 and sqrt() for
+   every other value. Arguments below 1 are kept away from the table, so they
+   can never index in front of it; they take the sqrt() path instead. */
+static double od_rsqrt_table(int i) {
+  /* table[j] holds 1/sqrt(j + 1); it is never written, hence const. */
+  static const double table[16] = {
+    1.000000, 0.707107, 0.577350, 0.500000,
+    0.447214, 0.408248, 0.377964, 0.353553,
+    0.333333, 0.316228, 0.301511, 0.288675,
+    0.277350, 0.267261, 0.258199, 0.250000};
+  if (i >= 1 && i <= 16) return table[i-1];
+  else return 1./sqrt(i);
+}
+
+/*Returns 1/sqrt(start+2*i+1). Entries with i < table_size are read from
+   `table` (as filled by od_fill_dynamic_rsqrt_table for the same start);
+   anything beyond the table is computed through od_rsqrt_table.*/
+static double od_custom_rsqrt_dynamic_table(const double* table,
+    const int table_size, const double start, const int i) {
+  const int in_table = i < table_size;
+  return in_table ? table[i] : od_rsqrt_table((int)(start + 2*i + 1));
+}
+
+/*Precomputes the lookup table read by od_custom_rsqrt_dynamic_table:
+   table[j] = 1/sqrt(start + 2*j + 1) for 0 <= j < table_size.*/
+static void od_fill_dynamic_rsqrt_table(double *table, const int table_size,
+    const double start) {
+  int slot = 0;
+  while (slot < table_size) {
+    table[slot] = od_rsqrt_table((int)(start + 2*slot + 1));
+    slot++;
+  }
+}
+
+/** Find the codepoint on the given PSphere closest to the desired
+ * vector. Double-precision PVQ search just to make sure our tests
+ * aren't limited by numerical accuracy.
+ *
+ * @param [in] xcoeff input vector to quantize (x in the math doc)
+ * @param [in] n number of dimensions
+ * @param [in] k number of pulses
+ * @param [out] ypulse optimal codevector found (y in the math doc)
+ * @param [in] g2 multiplier for the distortion (typically squared
+ * gain units)
+ * @param [in] pvq_norm_lambda enc->pvq_norm_lambda for quantized RDO
+ * @param [in] prev_k number of pulses already in ypulse that we should
+ * reuse for the search (or 0 for a new search)
+ * @return cosine distance between x and y (between 0 and 1)
+ */
+double pvq_search_rdo_double_c(const od_val16 *xcoeff, int n, int k,
+ od_coeff *ypulse, double g2, double pvq_norm_lambda, int prev_k) {
+ int i, j; /* i counts the pulses placed so far, j indexes dimensions */
+ double xy; /* running dot product of |x| and the pulse vector */
+ double yy; /* running squared norm of the pulse vector */
+ /* TODO - This blows our 8kB stack space budget and should be fixed when
+ converting PVQ to fixed point. */
+ double x[MAXN];
+ double xx;
+ double lambda;
+ double norm_1;
+ int rdo_pulses;
+ double delta_rate;
+ xx = xy = yy = 0;
+ for (j = 0; j < n; j++) {
+ x[j] = fabs((float)xcoeff[j]); /* search on magnitudes; signs are restored at the end */
+ xx += x[j]*x[j];
+ }
+ norm_1 = 1./sqrt(1e-30 + xx);
+ lambda = pvq_norm_lambda/(1e-30 + g2);
+ i = 0;
+ if (prev_k > 0 && prev_k <= k) {
+ /* We reuse pulses from a previous search so we don't have to search them
+ again. */
+ for (j = 0; j < n; j++) {
+ ypulse[j] = abs(ypulse[j]);
+ xy += x[j]*ypulse[j];
+ yy += ypulse[j]*ypulse[j];
+ i += ypulse[j];
+ }
+ }
+ else if (k > 2) {
+ /* Initial guess: scale |x| so its L1 norm is k and round each entry down. */
+ double l1_norm;
+ double l1_inv;
+ l1_norm = 0;
+ for (j = 0; j < n; j++) l1_norm += x[j];
+ l1_inv = 1./OD_MAXF(l1_norm, 1e-100);
+ for (j = 0; j < n; j++) {
+ double tmp;
+ tmp = k*x[j]*l1_inv;
+ ypulse[j] = OD_MAXI(0, (int)floor(tmp));
+ xy += x[j]*ypulse[j];
+ yy += ypulse[j]*ypulse[j];
+ i += ypulse[j];
+ }
+ }
+ else OD_CLEAR(ypulse, n); /* few pulses and nothing to reuse: start from zero */
+
+ /* Only use RDO on the last few pulses. This not only saves CPU, but using
+ RDO on all pulses actually makes the results worse for reasons I don't
+ fully understand. */
+ rdo_pulses = 1 + k/4;
+ /* Rough assumption for now, the last position costs about 3 bits more than
+ the first. */
+ delta_rate = 3./n;
+ /* Search one pulse at a time */
+ for (; i < k - rdo_pulses; i++) {
+ int pos;
+ double best_xy;
+ double best_yy;
+ pos = 0;
+ best_xy = -10;
+ best_yy = 1;
+ for (j = 0; j < n; j++) {
+ double tmp_xy;
+ double tmp_yy;
+ tmp_xy = xy + x[j];
+ tmp_yy = yy + 2*ypulse[j] + 1;
+ tmp_xy *= tmp_xy;
+ if (j == 0 || tmp_xy*best_yy > best_xy*tmp_yy) { /* compares xy^2/yy ratios by cross-multiplying */
+ best_xy = tmp_xy;
+ best_yy = tmp_yy;
+ pos = j;
+ }
+ }
+ xy = xy + x[pos];
+ yy = yy + 2*ypulse[pos] + 1;
+ ypulse[pos]++;
+ }
+ /* Search last pulses with RDO. Distortion is D = (x-y)^2 = x^2 - 2*x*y + y^2
+ and since x^2 and y^2 are constant, we just maximize x*y, plus a
+ lambda*rate term. Note that since x and y aren't normalized here,
+ we need to divide by sqrt(x^2)*sqrt(y^2). */
+ for (; i < k; i++) {
+ double rsqrt_table[4];
+ int rsqrt_table_size = 4;
+ int pos;
+ double best_cost;
+ pos = 0;
+ best_cost = -1e5;
+ /*Fill the small rsqrt lookup table with inputs relative to yy.
+ Specifically, the table of rsqrt_table_size values is filled with
+ rsqrt(yy + 1), rsqrt(yy + 2 + 1) .. rsqrt(yy + 2*(rsqrt_table_size-1) + 1).*/
+ od_fill_dynamic_rsqrt_table(rsqrt_table, rsqrt_table_size, yy);
+ for (j = 0; j < n; j++) {
+ double tmp_xy;
+ double tmp_yy;
+ tmp_xy = xy + x[j];
+ /*Calculate rsqrt(yy + 2*ypulse[j] + 1) using an optimized method.*/
+ tmp_yy = od_custom_rsqrt_dynamic_table(rsqrt_table, rsqrt_table_size,
+ yy, ypulse[j]);
+ tmp_xy = 2*tmp_xy*norm_1*tmp_yy - lambda*j*delta_rate; /* rate penalty grows linearly with position j */
+ if (j == 0 || tmp_xy > best_cost) {
+ best_cost = tmp_xy;
+ pos = j;
+ }
+ }
+ xy = xy + x[pos];
+ yy = yy + 2*ypulse[pos] + 1;
+ ypulse[pos]++;
+ }
+ for (i = 0; i < n; i++) {
+ if (xcoeff[i] < 0) ypulse[i] = -ypulse[i]; /* give each pulse the sign of its input */
+ }
+ return xy/(1e-100 + sqrt(xx*yy));
+}
+
+/** Maps a quantized gain to a code that grows with |x - ref|, so that
+ * x == ref is coded as zero. Gains below ref take the odd codes, gains in
+ * [ref, 2*ref) the even codes, and gains from 2*ref upwards map to x - 1.
+ * The value x=0 is not covered because it is only allowed in the noref
+ * case.
+ *
+ * @param [in] x quantized gain to encode
+ * @param [in] ref quantized gain of the reference
+ * @return interleave-encoded quantized gain value
+ */
+static int neg_interleave(int x, int ref) {
+  if (x >= ref) {
+    return x < 2*ref ? 2*(x - ref) : x-1;
+  }
+  return -2*(x - ref) - 1;
+}
+
+/* Returns 1 when the first len entries of x are all zero (also when len is
+   not positive), 0 as soon as a non-zero entry is found. */
+int od_vector_is_null(const od_coeff *x, int len) {
+  int pos = 0;
+  while (pos < len) {
+    if (x[pos]) return 0;
+    pos++;
+  }
+  return 1;
+}
+/* Rate estimate for one PVQ search candidate (pulse vector y0 with k pulses).
+   k == 0 contributes no pulse rate. With speed > 0 a closed-form fit is
+   used; otherwise the codeword is actually encoded into a scratch entropy
+   coder and the od_ec_enc_tell_frac() difference is divided by 8. When
+   qg > 0 and theta >= 0 an approximate cost for theta is added on top.
+   The pulse vector has n - 1 entries when theta != -1 and n entries when
+   theta == -1. */
+static double od_pvq_rate(int qg, int icgr, int theta, int ts,
+ const od_adapt_ctx *adapt, const od_coeff *y0, int k, int n, int speed) {
+ double rate;
+ if (k == 0) rate = 0;
+ else if (speed > 0) {
+ int i;
+ int sum;
+ double f;
+ /* Compute "center of mass" of the pulse vector. */
+ sum = 0;
+ for (i = 0; i < n - (theta != -1); i++) sum += i*abs(y0[i]);
+ f = sum/(double)(k*n);
+ /* Estimates the number of bits it will cost to encode K pulses in
+ N dimensions based on hand-tuned fit for bitrate vs K, N and
+ "center of mass". */
+ rate = (1 + .4*f)*n*OD_LOG2(1 + OD_MAXF(0, log(n*2*(1*f + .025))*k/n)) + 3;
+ }
+ else {
+ aom_writer w;
+ od_pvq_codeword_ctx cd;
+ int tell;
+#if CONFIG_DAALA_EC
+ od_ec_enc_init(&w.ec, 1000);
+#else
+# error "CONFIG_PVQ currently requires CONFIG_DAALA_EC."
+#endif
+ OD_COPY(&cd, &adapt->pvq.pvq_codeword_ctx, 1); /* encode against a local copy of the codeword context */
+#if CONFIG_DAALA_EC
+ tell = od_ec_enc_tell_frac(&w.ec);
+#else
+# error "CONFIG_PVQ currently requires CONFIG_DAALA_EC."
+#endif
+ aom_encode_pvq_codeword(&w, &cd, y0, n - (theta != -1), k);
+#if CONFIG_DAALA_EC
+ rate = (od_ec_enc_tell_frac(&w.ec)-tell)/8.; /* presumably tell_frac counts 1/8 bits - TODO confirm */
+ od_ec_enc_clear(&w.ec);
+#else
+# error "CONFIG_PVQ currently requires CONFIG_DAALA_EC."
+#endif
+ }
+ if (qg > 0 && theta >= 0) {
+ /* Approximate cost of entropy-coding theta */
+ rate += .9*OD_LOG2(ts);
+ if (qg == icgr) rate -= .5; /* candidates whose gain equals the reference gain get a .5 discount */
+ }
+ return rate;
+}
+
+#define MAX_PVQ_ITEMS (20) /* capacity of the candidate array built in pvq_theta() */
+/* This stores the information about a PVQ search candidate, so we can sort
+ based on K. One item is recorded per (gain, theta) pair that pvq_theta()
+ considers. */
+typedef struct {
+ int gain; /* quantized gain index of the candidate */
+ int k; /* number of pulses, from od_pvq_compute_k() */
+ od_val32 qtheta; /* quantized angle, from od_pvq_compute_theta(theta, ts) */
+ int theta; /* angle index, 0 <= theta < ts */
+ int ts; /* angular resolution, from od_pvq_compute_max_theta() */
+ od_val32 qcg; /* quantized companded gain */
+} pvq_search_item;
+
+/* qsort() comparator for pvq_search_item: ascending k, ties broken by
+   ascending gain so the order does not depend on the qsort implementation.
+   The parameters are const void * so the function has exactly the type
+   qsort() calls it through; calling a function through a pointer to an
+   incompatible function type is undefined behavior. Callers that pass
+   pvq_search_item pointers directly keep working through the implicit
+   conversion to const void *. */
+int items_compare(const void *a, const void *b) {
+  const pvq_search_item *lhs = (const pvq_search_item *)a;
+  const pvq_search_item *rhs = (const pvq_search_item *)b;
+  /* Break ties in K with gain to ensure a stable sort.
+     Otherwise, the order depends on qsort implementation. */
+  return lhs->k == rhs->k ? lhs->gain - rhs->gain : lhs->k - rhs->k;
+}
+
+/** Perform PVQ quantization with prediction, trying several
+ * possible gains and angles. See draft-valin-videocodec-pvq and
+ * http://jmvalin.ca/slides/pvq.pdf for more details.
+ *
+ * @param [out] out coefficients after quantization
+ * @param [in] x0 coefficients before quantization
+ * @param [in] r0 reference, aka predicted coefficients
+ * @param [in] n number of dimensions
+ * @param [in] q0 quantization step size
+ * @param [out] y pulse vector (i.e. selected PVQ codevector)
+ * @param [out] itheta angle between input and reference (-1 if noref)
+ * @param [out] vk total number of pulses
+ * @param [in] beta per-band activity masking beta param
+ * @param [out] skip_diff distortion cost of skipping this block
+ * (accumulated)
+ * @param [in] is_keyframe whether we're encoding a keyframe
+ * @param [in] pli plane index
+ * @param [in] adapt probability adaptation context
+ * @param [in] qm QM with magnitude compensation
+ * @param [in] qm_inv Inverse of QM with magnitude compensation
+ * @param [in] pvq_norm_lambda enc->pvq_norm_lambda for quantized RDO
+ * @param [in] speed Make search faster by making approximations
+ * @return gain index of the quantized gain
+*/
+static int pvq_theta(od_coeff *out, const od_coeff *x0, const od_coeff *r0,
+ int n, int q0, od_coeff *y, int *itheta, int *vk,
+ od_val16 beta, double *skip_diff, int is_keyframe, int pli,
+ const od_adapt_ctx *adapt, const int16_t *qm, const int16_t *qm_inv,
+ double pvq_norm_lambda, int speed) {
+ od_val32 g;
+ od_val32 gr;
+ od_coeff y_tmp[MAXN + 3];
+ int i;
+ /* Number of pulses. */
+ int k;
+ /* Companded gain of x and reference, normalized to q. */
+ od_val32 cg;
+ od_val32 cgr;
+ int icgr;
+ int qg;
+ /* Best RDO cost (D + lambda*R) so far. */
+ double best_cost;
+ double dist0;
+ /* Distortion (D) that corresponds to the best RDO cost. */
+ double best_dist;
+ double dist;
+ /* Sign of Householder reflection. */
+ int s;
+ /* Dimension on which Householder reflects. */
+ int m;
+ od_val32 theta;
+ double corr;
+ int best_k;
+ od_val32 best_qtheta;
+ od_val32 gain_offset;
+ int noref;
+ double skip_dist;
+ int cfl_enabled;
+ int skip;
+ double gain_weight;
+ od_val16 x16[MAXN];
+ od_val16 r16[MAXN];
+ int xshift;
+ int rshift;
+ /* Give more weight to gain error when calculating the total distortion. */
+ gain_weight = 1.0;
+ OD_ASSERT(n > 1);
+ corr = 0; /* accumulates the x16/r16 dot product; normalized and clamped to [-1, 1] further down */
+#if !defined(OD_FLOAT_PVQ)
+ /* Shift needed to make x fit in 16 bits even after rotation.
+ This shift value is not normative (it can be changed without breaking
+ the bitstream) */
+ xshift = OD_MAXI(0, od_vector_log_mag(x0, n) - 15);
+ /* Shift needed to make the reference fit in 15 bits, so that the Householder
+ vector can fit in 16 bits.
+ This shift value *is* normative, and has to match the decoder. */
+ rshift = OD_MAXI(0, od_vector_log_mag(r0, n) - 14);
+#else
+ xshift = 0;
+ rshift = 0;
+#endif
+ for (i = 0; i < n; i++) {
+#if defined(OD_FLOAT_PVQ)
+ /*This is slightly different from the original float PVQ code,
+ where the qm was applied in the accumulation in od_pvq_compute_gain and
+ the vectors were od_coeffs, not od_val16 (i.e. double).*/
+ x16[i] = x0[i]*(double)qm[i]*OD_QM_SCALE_1;
+ r16[i] = r0[i]*(double)qm[i]*OD_QM_SCALE_1;
+#else
+ x16[i] = OD_SHR_ROUND(x0[i]*qm[i], OD_QM_SHIFT + xshift);
+ r16[i] = OD_SHR_ROUND(r0[i]*qm[i], OD_QM_SHIFT + rshift);
+#endif
+ corr += OD_MULT16_16(x16[i], r16[i]);
+ }
+ cfl_enabled = is_keyframe && pli != 0 && !OD_DISABLE_CFL;
+ cg = od_pvq_compute_gain(x16, n, q0, &g, beta, xshift);
+ cgr = od_pvq_compute_gain(r16, n, q0, &gr, beta, rshift);
+ if (cfl_enabled) cgr = OD_CGAIN_SCALE;
+ /* gain_offset is meant to make sure one of the quantized gains has
+ exactly the same gain as the reference. */
+#if defined(OD_FLOAT_PVQ)
+ icgr = (int)floor(.5 + cgr);
+#else
+ icgr = OD_SHR_ROUND(cgr, OD_CGAIN_SHIFT);
+#endif
+ gain_offset = cgr - OD_SHL(icgr, OD_CGAIN_SHIFT);
+ /* Start search with null case: gain=0, no pulse. */
+ qg = 0;
+ dist = gain_weight*cg*cg*OD_CGAIN_SCALE_2;
+ best_dist = dist;
+ best_cost = dist + pvq_norm_lambda*od_pvq_rate(0, 0, -1, 0, adapt, NULL, 0,
+ n, speed);
+ noref = 1;
+ best_k = 0;
+ *itheta = -1;
+ OD_CLEAR(y, n);
+ best_qtheta = 0;
+ m = 0;
+ s = 1;
+ corr = corr/(1e-100 + g*(double)gr/OD_SHL(1, xshift + rshift)); /* normalize by the two gains, undoing the shifts */
+ corr = OD_MAXF(OD_MINF(corr, 1.), -1.);
+ if (is_keyframe) skip_dist = gain_weight*cg*cg*OD_CGAIN_SCALE_2;
+ else {
+ skip_dist = gain_weight*(cg - cgr)*(cg - cgr)
+ + cgr*(double)cg*(2 - 2*corr);
+ skip_dist *= OD_CGAIN_SCALE_2;
+ }
+ if (!is_keyframe) {
+ /* noref, gain=0 isn't allowed, but skip is allowed. */
+ od_val32 scgr;
+ scgr = OD_MAXF(0,gain_offset);
+ if (icgr == 0) {
+ best_dist = gain_weight*(cg - scgr)*(cg - scgr)
+ + scgr*(double)cg*(2 - 2*corr);
+ best_dist *= OD_CGAIN_SCALE_2;
+ }
+ best_cost = best_dist + pvq_norm_lambda*od_pvq_rate(0, icgr, 0, 0, adapt,
+ NULL, 0, n, speed);
+ best_qtheta = 0;
+ *itheta = 0;
+ noref = 0;
+ }
+ dist0 = best_dist; /* distortion of the starting candidate; used below to prune hopeless candidates */
+ if (n <= OD_MAX_PVQ_SIZE && !od_vector_is_null(r0, n) && corr > 0) {
+ od_val16 xr[MAXN];
+ int gain_bound;
+ int prev_k;
+ pvq_search_item items[MAX_PVQ_ITEMS];
+ int idx;
+ int nitems;
+ double cos_dist;
+ idx = 0;
+ gain_bound = OD_SHR(cg - gain_offset, OD_CGAIN_SHIFT);
+ /* Perform theta search only if prediction is useful. */
+ theta = OD_ROUND32(OD_THETA_SCALE*acos(corr));
+ m = od_compute_householder(r16, n, gr, &s, rshift);
+ od_apply_householder(xr, x16, r16, n);
+ prev_k = 0;
+ for (i = m; i < n - 1; i++) xr[i] = xr[i + 1]; /* drop component m, leaving n - 1 entries in xr */
+ /* Compute all candidate PVQ searches within a reasonable range of gain
+ and theta. */
+ for (i = OD_MAXI(1, gain_bound - 1); i <= gain_bound + 1; i++) {
+ int j;
+ od_val32 qcg;
+ int ts;
+ int theta_lower;
+ int theta_upper;
+ /* Quantized companded gain */
+ qcg = OD_SHL(i, OD_CGAIN_SHIFT) + gain_offset;
+ /* Set angular resolution (in ra) to match the encoded gain */
+ ts = od_pvq_compute_max_theta(qcg, beta);
+ theta_lower = OD_MAXI(0, (int)floor(.5 +
+ theta*OD_THETA_SCALE_1*2/M_PI*ts) - 2);
+ theta_upper = OD_MINI(ts - 1, (int)ceil(theta*OD_THETA_SCALE_1*2/M_PI*ts));
+ /* Include the angles within a reasonable range. */
+ for (j = theta_lower; j <= theta_upper; j++) {
+ od_val32 qtheta;
+ qtheta = od_pvq_compute_theta(j, ts);
+ k = od_pvq_compute_k(qcg, j, 0, n, beta);
+ items[idx].gain = i;
+ items[idx].theta = j;
+ items[idx].k = k;
+ items[idx].qcg = qcg;
+ items[idx].qtheta = qtheta;
+ items[idx].ts = ts;
+ idx++;
+ OD_ASSERT(idx < MAX_PVQ_ITEMS); /* checked after the write, so it trips once the last slot has been filled */
+ }
+ }
+ nitems = idx;
+ cos_dist = 0;
+ /* Sort PVQ search candidates in ascending order of pulses K so that
+ we can reuse all the previously searched pulses across searches. */
+ qsort(items, nitems, sizeof(items[0]),
+ (int (*)(const void *, const void *))items_compare);
+ /* Search for the best gain/theta in order. */
+ for (idx = 0; idx < nitems; idx++) {
+ int j;
+ od_val32 qcg;
+ int ts;
+ double cost;
+ double dist_theta;
+ double sin_prod;
+ od_val32 qtheta;
+ /* Quantized companded gain */
+ qcg = items[idx].qcg;
+ i = items[idx].gain;
+ j = items[idx].theta;
+ /* Set angular resolution (in ra) to match the encoded gain */
+ ts = items[idx].ts;
+ /* Search for the best angle within a reasonable range. */
+ qtheta = items[idx].qtheta;
+ k = items[idx].k;
+ /* Compute the minimal possible distortion by not taking the PVQ
+ cos_dist into account. */
+ dist_theta = 2 - 2.*od_pvq_cos(theta - qtheta)*OD_TRIG_SCALE_1;
+ dist = gain_weight*(qcg - cg)*(qcg - cg) + qcg*(double)cg*dist_theta;
+ dist *= OD_CGAIN_SCALE_2;
+ /* If we have no hope of beating skip (including a 1-bit worst-case
+ penalty), stop now. */
+ if (dist > dist0 + 1.0*pvq_norm_lambda && k != 0) continue;
+ sin_prod = od_pvq_sin(theta)*OD_TRIG_SCALE_1*od_pvq_sin(qtheta)*
+ OD_TRIG_SCALE_1;
+ /* PVQ search, using a gain of qcg*cg*sin(theta)*sin(qtheta) since
+ that's the factor by which cos_dist is multiplied to get the
+ distortion metric. */
+ if (k == 0) {
+ cos_dist = 0;
+ OD_CLEAR(y_tmp, n-1);
+ }
+ else if (k != prev_k) { /* when k == prev_k, y_tmp and cos_dist from the previous candidate are reused */
+ cos_dist = pvq_search_rdo_double(xr, n - 1, k, y_tmp,
+ qcg*(double)cg*sin_prod*OD_CGAIN_SCALE_2, pvq_norm_lambda, prev_k);
+ }
+ prev_k = k;
+ /* See Jmspeex' Journal of Dubious Theoretical Results. */
+ dist_theta = 2 - 2.*od_pvq_cos(theta - qtheta)*OD_TRIG_SCALE_1
+ + sin_prod*(2 - 2*cos_dist);
+ dist = gain_weight*(qcg - cg)*(qcg - cg) + qcg*(double)cg*dist_theta;
+ dist *= OD_CGAIN_SCALE_2;
+ /* Do approximate RDO. */
+ cost = dist + pvq_norm_lambda*od_pvq_rate(i, icgr, j, ts, adapt, y_tmp,
+ k, n, speed);
+ if (cost < best_cost) {
+ best_cost = cost;
+ best_dist = dist;
+ qg = i;
+ best_k = k;
+ best_qtheta = qtheta;
+ *itheta = j;
+ noref = 0;
+ OD_COPY(y, y_tmp, n - 1);
+ }
+ }
+ }
+ /* Don't bother with no-reference version if there's a reasonable
+ correlation. */
+ if (n <= OD_MAX_PVQ_SIZE && (corr < .5
+ || cg < (od_val32)(OD_SHL(2, OD_CGAIN_SHIFT)))) {
+ int gain_bound;
+ int prev_k;
+ gain_bound = OD_SHR(cg, OD_CGAIN_SHIFT);
+ prev_k = 0;
+ /* Search for the best gain (haven't determined reasonable range yet). */
+ for (i = OD_MAXI(1, gain_bound); i <= gain_bound + 1; i++) {
+ double cos_dist;
+ double cost;
+ od_val32 qcg;
+ qcg = OD_SHL(i, OD_CGAIN_SHIFT);
+ k = od_pvq_compute_k(qcg, -1, 1, n, beta);
+ /* Compute the minimal possible distortion by not taking the PVQ
+ cos_dist into account. */
+ dist = gain_weight*(qcg - cg)*(qcg - cg);
+ dist *= OD_CGAIN_SCALE_2;
+ if (dist > dist0 && k != 0) continue;
+ cos_dist = pvq_search_rdo_double(x16, n, k, y_tmp,
+ qcg*(double)cg*OD_CGAIN_SCALE_2, pvq_norm_lambda, prev_k);
+ prev_k = k;
+ /* See Jmspeex' Journal of Dubious Theoretical Results. */
+ dist = gain_weight*(qcg - cg)*(qcg - cg)
+ + qcg*(double)cg*(2 - 2*cos_dist);
+ dist *= OD_CGAIN_SCALE_2;
+ /* Do approximate RDO. */
+ cost = dist + pvq_norm_lambda*od_pvq_rate(i, 0, -1, 0, adapt, y_tmp, k,
+ n, speed);
+ if (cost <= best_cost) { /* <= rather than <: on a tie the no-reference candidate wins */
+ best_cost = cost;
+ best_dist = dist;
+ qg = i;
+ noref = 1;
+ best_k = k;
+ *itheta = -1;
+ OD_COPY(y, y_tmp, n);
+ }
+ }
+ }
+ k = best_k;
+ theta = best_qtheta;
+ skip = 0;
+ if (noref) {
+ if (qg == 0) skip = OD_PVQ_SKIP_ZERO;
+ }
+ else {
+ if (!is_keyframe && qg == 0) {
+ skip = (icgr ? OD_PVQ_SKIP_ZERO : OD_PVQ_SKIP_COPY);
+ }
+ if (qg == icgr && *itheta == 0 && !cfl_enabled) skip = OD_PVQ_SKIP_COPY;
+ }
+ /* Synthesize like the decoder would. */
+ if (skip) {
+ if (skip == OD_PVQ_SKIP_COPY) OD_COPY(out, r0, n);
+ else OD_CLEAR(out, n);
+ }
+ else {
+ if (noref) gain_offset = 0;
+ g = od_gain_expand(OD_SHL(qg, OD_CGAIN_SHIFT) + gain_offset, q0, beta);
+ od_pvq_synthesis_partial(out, y, r16, n, noref, g, theta, m, s,
+ qm_inv);
+ }
+ *vk = k;
+ *skip_diff += skip_dist - best_dist; /* accumulated across calls; the caller owns the running total */
+ /* Encode gain differently depending on whether we use prediction or not.
+ Special encoding on inter frames where qg=0 is allowed for noref=0
+ but not noref=1.*/
+ if (is_keyframe) return noref ? qg : neg_interleave(qg, icgr);
+ else return noref ? qg - 1 : neg_interleave(qg + 1, icgr + 1);
+}
+
+/** Encodes a single vector of integers (eg, a partition within a
+ * coefficient block) using PVQ
+ *
+ * @param [in,out] w multi-symbol entropy encoder
+ * @param [in] qg quantized gain
+ * @param [in] theta quantized post-prediction theta
+ * @param [in] in coefficient vector to code
+ * @param [in] n number of coefficients in partition
+ * @param [in] k number of pulses in partition
+ * @param [in,out] model entropy encoder state
+ * @param [in,out] adapt adaptation context
+ * @param [in,out] exg ExQ16 expectation of gain value
+ * @param [in,out] ext ExQ16 expectation of theta value
+ * @param [in] cdf_ctx selects which cdf context to use
+ * @param [in] is_keyframe whether we're encoding a keyframe
+ * @param [in] code_skip whether the "skip rest" flag is allowed
+ * @param [in] skip_rest when set, we skip all higher bands
+ * @param [in] encode_flip whether we need to encode the CfL flip flag now
+ * @param [in] flip value of the CfL flip flag
+ */
+void pvq_encode_partition(aom_writer *w,
+ int qg,
+ int theta,
+ const od_coeff *in,
+ int n,
+ int k,
+ generic_encoder model[3],
+ od_adapt_ctx *adapt,
+ int *exg,
+ int *ext,
+ int cdf_ctx,
+ int is_keyframe,
+ int code_skip,
+ int skip_rest,
+ int encode_flip,
+ int flip) {
+ int noref;
+ int id;
+ noref = (theta == -1);
+ id = (qg > 0) + 2*OD_MINI(theta + 1,3) + 8*code_skip*skip_rest; /* joint symbol: gain flag + 2*min(theta + 1, 3) + 8*skip-rest */
+ if (is_keyframe) {
+ OD_ASSERT(id != 8); /* id 8 is asserted never to occur here; larger ids shift down by one */
+ if (id >= 8) id--;
+ }
+ else {
+ OD_ASSERT(id != 10); /* same scheme with id 10 as the unused value */
+ if (id >= 10) id--;
+ }
+ /* Jointly code gain, theta and noref for small values. Then we handle
+ larger gain and theta values. For noref, theta = -1. */
+ aom_write_symbol_pvq(w, id, &adapt->pvq.pvq_gaintheta_cdf[cdf_ctx][0],
+ 8 + 7*code_skip);
+ if (encode_flip) {
+ /* We could eventually do some smarter entropy coding here, but it would
+ have to be good enough to overcome the overhead of the entropy coder.
+ An early attempt using a "toggle" flag with simple adaptation wasn't
+ worth the trouble. */
+ aom_write_bit(w, flip);
+ }
+ if (qg > 0) {
+ int tmp;
+ tmp = *exg; /* generic_encode() gets a copy; *exg itself is only updated by OD_IIR_DIADIC below */
+ generic_encode(w, &model[!noref], qg - 1, &tmp, 2);
+ OD_IIR_DIADIC(*exg, qg << 16, 2);
+ }
+ if (theta > 1) {
+ int tmp;
+ tmp = *ext; /* same copy-then-update pattern as for the gain */
+ generic_encode(w, &model[2], theta - 2, &tmp, 2);
+ OD_IIR_DIADIC(*ext, theta << 16, 2);
+ }
+ aom_encode_pvq_codeword(w, &adapt->pvq.pvq_codeword_ctx, in,
+ n - (theta != -1), k); /* the pulse vector has one entry fewer when a reference is used */
+}
+
+/** Rate-distortion optimized scalar quantizer: returns 0 when |x|/q falls
+ * below an RDO threshold, and x divided by q (through OD_DIV_R0) otherwise.
+ * @param [in] x unquantized value
+ * @param [in] q quantization step size
+ * @param [in] delta0 rate increase for encoding a 1 instead of a 0
+ * @param [in] pvq_norm_lambda enc->pvq_norm_lambda for quantized RDO
+ * @retval quantized value
+ */
+int od_rdo_quant(od_coeff x, int q, double delta0, double pvq_norm_lambda) {
+  /* Optimal quantization threshold is 1/2 + lambda*delta_rate/2. See
+     Jmspeex' Journal of Dubious Theoretical Results for details. */
+  const int n = OD_DIV_R0(abs(x), q);
+  const int below_threshold =
+      (double)abs(x)/q < (double)n/2 + pvq_norm_lambda*delta0/(2*n);
+  if (!below_threshold) {
+    return OD_DIV_R0(x, q);
+  }
+  return 0;
+}
+
+/** Encode a coefficient block (excepting DC) using PVQ
+ *
+ * @param [in,out] enc daala encoder context
+ * @param [in] ref 'reference' (prediction) vector
+ * @param [in] in coefficient block to quantize and encode
+ * @param [out] out quantized coefficient block
+ * @param [in] q_dc scale/quantizer for the DC coefficient
+ * @param [in] q_ac scale/quantizer for the AC bands
+ * @param [in] pli plane index
+ * @param [in] bs log of the block size minus two
+ * @param [in] beta per-band activity masking beta param
+ * @param [in] is_keyframe whether we're encoding a keyframe
+ * @param [in] qm QM with magnitude compensation
+ * @param [in] qm_inv Inverse of QM with magnitude compensation
+ * @param [in] speed Make search faster by making approximations
+ * @param [in] pvq_info If null, considered as RDO search mode
+ * @return Returns block skip info indicating whether DC/AC are coded.
+ * bit0: DC is coded, bit1: AC is coded (1 means coded)
+ *
+ */
+PVQ_SKIP_TYPE od_pvq_encode(daala_enc_ctx *enc,
+ od_coeff *ref,
+ const od_coeff *in,
+ od_coeff *out,
+ int q_dc,
+ int q_ac,
+ int pli,
+ int bs,
+ const od_val16 *beta,
+ int is_keyframe,
+ const int16_t *qm,
+ const int16_t *qm_inv,
+ int speed,
+ PVQ_INFO *pvq_info){
+ int theta[PVQ_MAX_PARTITIONS];
+ int qg[PVQ_MAX_PARTITIONS];
+ int k[PVQ_MAX_PARTITIONS];
+ od_coeff y[OD_TXSIZE_MAX*OD_TXSIZE_MAX];
+ int *exg;
+ int *ext;
+ int nb_bands;
+ int i;
+ const int *off;
+ int size[PVQ_MAX_PARTITIONS];
+ generic_encoder *model;
+ double skip_diff;
+ int tell;
+ uint16_t *skip_cdf;
+ od_rollback_buffer buf;
+ int dc_quant;
+ int flip;
+ int cfl_encoded;
+ int skip_rest;
+ int skip_dir;
+ int skip_theta_value;
+ const unsigned char *pvq_qm;
+ double dc_rate;
+ int use_masking;
+ PVQ_SKIP_TYPE ac_dc_coded;
+
+ aom_clear_system_state();
+
+ use_masking = enc->use_activity_masking;
+
+ if (use_masking)
+ pvq_qm = &enc->state.pvq_qm_q4[pli][0];
+ else
+ pvq_qm = 0;
+
+ exg = &enc->state.adapt->pvq.pvq_exg[pli][bs][0];
+ ext = enc->state.adapt->pvq.pvq_ext + bs*PVQ_MAX_PARTITIONS;
+ skip_cdf = enc->state.adapt->skip_cdf[2*bs + (pli != 0)];
+ model = enc->state.adapt->pvq.pvq_param_model;
+ nb_bands = OD_BAND_OFFSETS[bs][0];
+ off = &OD_BAND_OFFSETS[bs][1];
+
+ if (use_masking)
+ dc_quant = OD_MAXI(1, q_dc * pvq_qm[od_qm_get_index(bs, 0)] >> 4);
+ else
+ dc_quant = OD_MAXI(1, q_dc);
+
+ tell = 0;
+ for (i = 0; i < nb_bands; i++) size[i] = off[i+1] - off[i];
+ skip_diff = 0;
+ flip = 0;
+ /*If we are coding a chroma block of a keyframe, we are doing CfL.*/
+ if (pli != 0 && is_keyframe) {
+ od_val32 xy;
+ xy = 0;
+ /*Compute the dot-product of the first band of chroma with the luma ref.*/
+ for (i = off[0]; i < off[1]; i++) {
+#if defined(OD_FLOAT_PVQ)
+ xy += ref[i]*(double)qm[i]*OD_QM_SCALE_1*
+ (double)in[i]*(double)qm[i]*OD_QM_SCALE_1;
+#else
+ od_val32 rq;
+ od_val32 inq;
+ rq = ref[i]*qm[i];
+ inq = in[i]*qm[i];
+ xy += OD_SHR(rq*(int64_t)inq, OD_SHL(OD_QM_SHIFT + OD_CFL_FLIP_SHIFT,
+ 1));
+#endif
+ }
+ /*If cos(theta) < 0, then |theta| > pi/2 and we should negate the ref.*/
+ if (xy < 0) {
+ flip = 1;
+ for(i = off[0]; i < off[nb_bands]; i++) ref[i] = -ref[i];
+ }
+ }
+ for (i = 0; i < nb_bands; i++) {
+ int q;
+
+ if (use_masking)
+ q = OD_MAXI(1, q_ac * pvq_qm[od_qm_get_index(bs, i + 1)] >> 4);
+ else
+ q = OD_MAXI(1, q_ac);
+
+ qg[i] = pvq_theta(out + off[i], in + off[i], ref + off[i], size[i],
+ q, y + off[i], &theta[i], &k[i], beta[i], &skip_diff, is_keyframe,
+ pli, enc->state.adapt, qm + off[i], qm_inv + off[i],
+ enc->pvq_norm_lambda, speed);
+ }
+ od_encode_checkpoint(enc, &buf);
+ if (is_keyframe) out[0] = 0;
+ else {
+ int n;
+ n = OD_DIV_R0(abs(in[0] - ref[0]), dc_quant);
+ if (n == 0) {
+ out[0] = 0;
+ } else {
+ int tell2;
+ od_rollback_buffer dc_buf;
+
+ dc_rate = -OD_LOG2((double)(skip_cdf[3] - skip_cdf[2])/
+ (double)(skip_cdf[2] - skip_cdf[1]));
+ dc_rate += 1;
+
+#if CONFIG_DAALA_EC
+ tell2 = od_ec_enc_tell_frac(&enc->w.ec);
+#else
+#error "CONFIG_PVQ currently requires CONFIG_DAALA_EC."
+#endif
+ od_encode_checkpoint(enc, &dc_buf);
+ generic_encode(&enc->w, &enc->state.adapt->model_dc[pli],
+ n - 1, &enc->state.adapt->ex_dc[pli][bs][0], 2);
+#if CONFIG_DAALA_EC
+ tell2 = od_ec_enc_tell_frac(&enc->w.ec) - tell2;
+#else
+#error "CONFIG_PVQ currently requires CONFIG_DAALA_EC."
+#endif
+ dc_rate += tell2/8.0;
+ od_encode_rollback(enc, &dc_buf);
+
+ out[0] = od_rdo_quant(in[0] - ref[0], dc_quant, dc_rate,
+ enc->pvq_norm_lambda);
+ }
+ }
+#if CONFIG_DAALA_EC
+ tell = od_ec_enc_tell_frac(&enc->w.ec);
+#else
+#error "CONFIG_PVQ currently requires CONFIG_DAALA_EC."
+#endif
+ /* Code as if we're not skipping. */
+ aom_write_symbol(&enc->w, 2 + (out[0] != 0), skip_cdf, 4);
+ ac_dc_coded = AC_CODED + (out[0] != 0);
+ cfl_encoded = 0;
+ skip_rest = 1;
+ skip_theta_value = is_keyframe ? -1 : 0;
+ for (i = 1; i < nb_bands; i++) {
+ if (theta[i] != skip_theta_value || qg[i]) skip_rest = 0;
+ }
+ skip_dir = 0;
+ if (nb_bands > 1) {
+ for (i = 0; i < 3; i++) {
+ int j;
+ int tmp;
+ tmp = 1;
+ // ToDo(yaowu): figure out better stop condition without gcc warning.
+ for (j = i + 1; j < nb_bands && j < PVQ_MAX_PARTITIONS; j += 3) {
+ if (theta[j] != skip_theta_value || qg[j]) tmp = 0;
+ }
+ skip_dir |= tmp << i;
+ }
+ }
+ if (theta[0] == skip_theta_value && qg[0] == 0 && skip_rest) nb_bands = 0;
+
+ /* NOTE: There was no other better place to put this function. */
+ if (pvq_info)
+ av1_store_pvq_enc_info(pvq_info, qg, theta, k, y, nb_bands, off, size,
+ skip_rest, skip_dir, bs);
+
+ for (i = 0; i < nb_bands; i++) {
+ int encode_flip;
+ /* Encode CFL flip bit just after the first time it's used. */
+ encode_flip = pli != 0 && is_keyframe && theta[i] != -1 && !cfl_encoded;
+ if (i == 0 || (!skip_rest && !(skip_dir & (1 << ((i - 1)%3))))) {
+ pvq_encode_partition(&enc->w, qg[i], theta[i], y + off[i],
+ size[i], k[i], model, enc->state.adapt, exg + i, ext + i,
+ (pli != 0)*OD_TXSIZES*PVQ_MAX_PARTITIONS + bs*PVQ_MAX_PARTITIONS + i,
+ is_keyframe, i == 0 && (i < nb_bands - 1), skip_rest, encode_flip, flip);
+ }
+ if (i == 0 && !skip_rest && bs > 0) {
+ aom_write_symbol(&enc->w, skip_dir,
+ &enc->state.adapt->pvq.pvq_skip_dir_cdf[(pli != 0) + 2*(bs - 1)][0], 7);
+ }
+ if (encode_flip) cfl_encoded = 1;
+ }
+#if CONFIG_DAALA_EC
+ tell = od_ec_enc_tell_frac(&enc->w.ec) - tell;
+#else
+#error "CONFIG_PVQ currently requires CONFIG_DAALA_EC."
+#endif
+ /* Account for the rate of skipping the AC, based on the same DC decision
+ we made when trying to not skip AC. */
+ {
+ double skip_rate;
+ if (out[0] != 0) {
+ skip_rate = -OD_LOG2((skip_cdf[1] - skip_cdf[0])/
+ (double)skip_cdf[3]);
+ }
+ else {
+ skip_rate = -OD_LOG2(skip_cdf[0]/
+ (double)skip_cdf[3]);
+ }
+ tell -= (int)floor(.5+8*skip_rate);
+ }
+ if (nb_bands == 0 || skip_diff <= enc->pvq_norm_lambda/8*tell) {
+ if (is_keyframe) out[0] = 0;
+ else {
+ int n;
+ n = OD_DIV_R0(abs(in[0] - ref[0]), dc_quant);
+ if (n == 0) {
+ out[0] = 0;
+ } else {
+ int tell2;
+ od_rollback_buffer dc_buf;
+
+ dc_rate = -OD_LOG2((double)(skip_cdf[1] - skip_cdf[0])/
+ (double)skip_cdf[0]);
+ dc_rate += 1;
+
+#if CONFIG_DAALA_EC
+ tell2 = od_ec_enc_tell_frac(&enc->w.ec);
+#else
+#error "CONFIG_PVQ currently requires CONFIG_DAALA_EC."
+#endif
+ od_encode_checkpoint(enc, &dc_buf);
+ generic_encode(&enc->w, &enc->state.adapt->model_dc[pli],
+ n - 1, &enc->state.adapt->ex_dc[pli][bs][0], 2);
+#if CONFIG_DAALA_EC
+ tell2 = od_ec_enc_tell_frac(&enc->w.ec) - tell2;
+#else
+#error "CONFIG_PVQ currently requires CONFIG_DAALA_EC."
+#endif
+ dc_rate += tell2/8.0;
+ od_encode_rollback(enc, &dc_buf);
+
+ out[0] = od_rdo_quant(in[0] - ref[0], dc_quant, dc_rate,
+ enc->pvq_norm_lambda);
+ }
+ }
+ /* We decide to skip, roll back everything as it was before. */
+ od_encode_rollback(enc, &buf);
+ aom_write_symbol(&enc->w, out[0] != 0, skip_cdf, 4);
+ ac_dc_coded = (out[0] != 0);
+ if (is_keyframe) for (i = 1; i < 1 << (2*bs + 4); i++) out[i] = 0;
+ else for (i = 1; i < 1 << (2*bs + 4); i++) out[i] = ref[i];
+ }
+ if (pvq_info)
+ pvq_info->ac_dc_coded = ac_dc_coded;
+ return ac_dc_coded;
+}
diff --git a/third_party/aom/av1/encoder/pvq_encoder.h b/third_party/aom/av1/encoder/pvq_encoder.h
new file mode 100644
index 0000000000..b84c8961b7
--- /dev/null
+++ b/third_party/aom/av1/encoder/pvq_encoder.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/* clang-format off */
+
+#if !defined(_pvq_encoder_H)
+# define _pvq_encoder_H (1)
+# include "aom_dsp/bitwriter.h"
+# include "aom_dsp/entenc.h"
+# include "av1/common/blockd.h"
+# include "av1/common/pvq.h"
+# include "av1/encoder/encint.h"
+
+void aom_write_symbol_pvq(aom_writer *w, int symb, aom_cdf_prob *cdf,
+ int nsymbs); /* NOTE(review): body not visible here; presumably codes `symb` out of `nsymbs` symbols with `cdf` - confirm. */
+
+void aom_encode_band_pvq_splits(aom_writer *w, od_pvq_codeword_ctx *adapt,
+ const int *y, int n, int k, int level); /* NOTE(review): body not visible here; presumably codes the n-dimensional pulse vector `y` with `k` pulses - confirm. */
+
+void aom_laplace_encode_special(aom_writer *w, int x, unsigned decay); /* NOTE(review): body not visible here; name suggests a Laplace-distributed value coder - confirm. */
+
+void pvq_encode_partition(aom_writer *w, /* Codes one band; od_pvq_encode() calls it at most once per band. */
+ int qg, /* od_pvq_encode() passes the per-band value returned by pvq_theta(). */
+ int theta, /* od_pvq_encode() passes the per-band theta filled in by pvq_theta(). */
+ const od_coeff *in, /* od_pvq_encode() passes the band's slice of its quantized vector y. */
+ int n, /* od_pvq_encode() passes the band size (number of coefficients). */
+ int k, /* od_pvq_encode() passes the per-band k filled in by pvq_theta(). */
+ generic_encoder model[3],
+ od_adapt_ctx *adapt,
+ int *exg, /* od_pvq_encode() passes a pointer into the adaptation state's pvq_exg for this band. */
+ int *ext, /* od_pvq_encode() passes a pointer into the adaptation state's pvq_ext for this band. */
+ int cdf_ctx, /* od_pvq_encode() derives this from plane (luma/chroma), block size and band index. */
+ int is_keyframe,
+ int code_skip, /* od_pvq_encode() passes nonzero only for band 0 when further bands follow. */
+ int skip_rest, /* od_pvq_encode() passes 1 when every band after the first is skipped. */
+ int encode_flip, /* od_pvq_encode() sets this for the first non-skipped chroma band of a keyframe. */
+ int flip); /* od_pvq_encode() passes 1 when it negated the reference (CfL flip). */
+
+PVQ_SKIP_TYPE od_pvq_encode(daala_enc_ctx *enc, od_coeff *ref,
+ const od_coeff *in, od_coeff *out, int q_dc, int q_ac, int pli, int bs,
+ const od_val16 *beta, int is_keyframe,
+ const int16_t *qm, const int16_t *qm_inv, int speed,
+ PVQ_INFO *pvq_info); /* Returns the AC/DC coded decision; the same value is stored in
+ * pvq_info->ac_dc_coded when pvq_info is non-NULL. `ref` is writable because
+ * the encoder may negate it in place for chroma blocks of keyframes. */
+
+#endif
diff --git a/third_party/aom/av1/encoder/ransac.c b/third_party/aom/av1/encoder/ransac.c
new file mode 100644
index 0000000000..5d5dd75721
--- /dev/null
+++ b/third_party/aom/av1/encoder/ransac.c
@@ -0,0 +1,1210 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#define _POSIX_C_SOURCE 200112L // rand_r()
+#include <memory.h>
+#include <math.h>
+#include <time.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+
+#include "av1/encoder/ransac.h"
+
+#define MAX_MINPTS 4 // Capacity of the sample index array used by ransac().
+#define MAX_DEGENERATE_ITER 10 // ransac() gives up after more than this many degenerate samples in one trial.
+#define MINPTS_MULTIPLIER 5 // ransac() refuses to run with fewer than minpts * this many points.
+
+#define INLIER_THRESHOLD 1.0 // A point is an inlier when its projection error is below this distance.
+#define MIN_TRIALS 20 // NOTE(review): use not visible in this part of the file - confirm.
+
+////////////////////////////////////////////////////////////////////////////////
+// ransac
+typedef int (*IsDegenerateFunc)(double *p); // Nonzero result makes ransac() draw a new sample.
+typedef void (*NormalizeFunc)(double *p, int np, double *T); // Same shape as normalize_homography().
+typedef void (*DenormalizeFunc)(double *params, double *T1, double *T2); // Same shape as the denormalize_* helpers.
+typedef int (*FindTransformationFunc)(int points, double *points1,
+ double *points2, double *params); // Fits a model to the sample; nonzero return means failure.
+typedef void (*ProjectPointsDoubleFunc)(double *mat, double *points,
+ double *proj, const int n,
+ const int stride_points,
+ const int stride_proj); // Maps n (x, y) points through the model `mat` into `proj`.
+
+// Applies a pure translation to n (x, y) points.
+//   mat[0], mat[1]  - x and y offsets.
+//   points / proj   - input and output point arrays; consecutive points are
+//                     stride_points / stride_proj doubles apart.
+static void project_points_double_translation(double *mat, double *points,
+                                              double *proj, const int n,
+                                              const int stride_points,
+                                              const int stride_proj) {
+  const double *src = points;
+  double *dst = proj;
+  for (int idx = 0; idx < n; ++idx) {
+    // Read both coordinates before writing so in-place use stays safe.
+    const double x = src[0];
+    const double y = src[1];
+    dst[0] = x + mat[0];
+    dst[1] = y + mat[1];
+    src += stride_points;
+    dst += stride_proj;
+  }
+}
+
+// Applies a rotation + zoom + translation model to n (x, y) points.
+//   mat[0], mat[1]  - x and y offsets.
+//   mat[2], mat[3]  - the two free entries of the 2x2 part
+//                     [ mat[2]  mat[3] ; -mat[3]  mat[2] ].
+//   points / proj   - input and output point arrays; consecutive points are
+//                     stride_points / stride_proj doubles apart.
+static void project_points_double_rotzoom(double *mat, double *points,
+                                          double *proj, const int n,
+                                          const int stride_points,
+                                          const int stride_proj) {
+  const double *src = points;
+  double *dst = proj;
+  for (int idx = 0; idx < n; ++idx) {
+    // Read both coordinates before writing so in-place use stays safe.
+    const double x = src[0];
+    const double y = src[1];
+    dst[0] = mat[2] * x + mat[3] * y + mat[0];
+    dst[1] = -mat[3] * x + mat[2] * y + mat[1];
+    src += stride_points;
+    dst += stride_proj;
+  }
+}
+
+// Applies a 6-parameter affine model to n (x, y) points.
+//   mat[0], mat[1]  - x and y offsets.
+//   mat[2..5]       - 2x2 linear part, row-major.
+//   points / proj   - input and output point arrays; consecutive points are
+//                     stride_points / stride_proj doubles apart.
+static void project_points_double_affine(double *mat, double *points,
+                                         double *proj, const int n,
+                                         const int stride_points,
+                                         const int stride_proj) {
+  const double *src = points;
+  double *dst = proj;
+  for (int idx = 0; idx < n; ++idx) {
+    // Read both coordinates before writing so in-place use stays safe.
+    const double x = src[0];
+    const double y = src[1];
+    dst[0] = mat[2] * x + mat[3] * y + mat[0];
+    dst[1] = mat[4] * x + mat[5] * y + mat[1];
+    src += stride_points;
+    dst += stride_proj;
+  }
+}
+
+// Applies the "horizontal trapezoid" projective model to n (x, y) points.
+// The perspective divisor depends on y only (mat[7] * y + 1), and the output
+// y does not depend on the input x. Asserts that the divisor is not
+// (almost) zero.
+//   points / proj   - input and output point arrays; consecutive points are
+//                     stride_points / stride_proj doubles apart.
+static void project_points_double_hortrapezoid(double *mat, double *points,
+                                               double *proj, const int n,
+                                               const int stride_points,
+                                               const int stride_proj) {
+  const double *src = points;
+  double *dst = proj;
+  for (int idx = 0; idx < n; ++idx) {
+    const double x = src[0];
+    const double y = src[1];
+    const double denom = mat[7] * y + 1;
+    double inv_denom;
+    assert(fabs(denom) > 0.000001);
+    inv_denom = 1. / denom;
+    dst[0] = (mat[2] * x + mat[3] * y + mat[0]) * inv_denom;
+    dst[1] = (mat[5] * y + mat[1]) * inv_denom;
+    src += stride_points;
+    dst += stride_proj;
+  }
+}
+
+// Applies the "vertical trapezoid" projective model to n (x, y) points.
+// The perspective divisor depends on x only (mat[6] * x + 1), and the output
+// x does not depend on the input y. Asserts that the divisor is not
+// (almost) zero.
+//   points / proj   - input and output point arrays; consecutive points are
+//                     stride_points / stride_proj doubles apart.
+static void project_points_double_vertrapezoid(double *mat, double *points,
+                                               double *proj, const int n,
+                                               const int stride_points,
+                                               const int stride_proj) {
+  const double *src = points;
+  double *dst = proj;
+  for (int idx = 0; idx < n; ++idx) {
+    const double x = src[0];
+    const double y = src[1];
+    const double denom = mat[6] * x + 1;
+    double inv_denom;
+    assert(fabs(denom) > 0.000001);
+    inv_denom = 1. / denom;
+    dst[0] = (mat[2] * x + mat[0]) * inv_denom;
+    dst[1] = (mat[4] * x + mat[5] * y + mat[1]) * inv_denom;
+    src += stride_points;
+    dst += stride_proj;
+  }
+}
+
+// Applies a full 8-parameter homography to n (x, y) points.
+//   mat[0], mat[1]  - x and y offsets.
+//   mat[2..5]       - 2x2 linear part, row-major.
+//   mat[6], mat[7]  - perspective terms; the divisor is
+//                     mat[6] * x + mat[7] * y + 1 and is asserted to be
+//                     not (almost) zero.
+//   points / proj   - input and output point arrays; consecutive points are
+//                     stride_points / stride_proj doubles apart.
+static void project_points_double_homography(double *mat, double *points,
+                                             double *proj, const int n,
+                                             const int stride_points,
+                                             const int stride_proj) {
+  const double *src = points;
+  double *dst = proj;
+  for (int idx = 0; idx < n; ++idx) {
+    const double x = src[0];
+    const double y = src[1];
+    const double denom = mat[6] * x + mat[7] * y + 1;
+    double inv_denom;
+    assert(fabs(denom) > 0.000001);
+    inv_denom = 1. / denom;
+    dst[0] = (mat[2] * x + mat[3] * y + mat[0]) * inv_denom;
+    dst[1] = (mat[4] * x + mat[5] * y + mat[1]) * inv_denom;
+    src += stride_points;
+    dst += stride_proj;
+  }
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// svdcmp
+// Adopted from Numerical Recipes in C
+
+static const double TINY_NEAR_ZERO = 1.0E-12;
+
+// Returns |a| carrying the sign of b: +|a| when b >= 0, -|a| otherwise.
+static INLINE double sign(double a, double b) {
+  const double magnitude = fabs(a);
+  return (b >= 0) ? magnitude : -magnitude;
+}
+
+// Computes sqrt(a*a + b*b) by factoring out the larger magnitude, so the
+// squares themselves are never formed. Returns 0 when both inputs are 0.
+static INLINE double pythag(double a, double b) {
+  const double mag_a = fabs(a);
+  const double mag_b = fabs(b);
+  double ratio;
+
+  if (mag_a > mag_b) {
+    ratio = mag_b / mag_a;
+    return mag_a * sqrt(1.0 + ratio * ratio);
+  }
+  ratio = mag_a / mag_b;
+  if (mag_b == 0) return 0;
+  return mag_b * sqrt(1.0 + ratio * ratio);
+}
+
+// Dense row-major matrix product: res = m1 * m2.
+//   m1  - m1_rows x inner_dim
+//   m2  - inner_dim x m2_cols
+//   res - m1_rows x m2_cols; written in row-major order. The inputs are read
+//         while res is being written, so res must not alias m1 or m2.
+static void multiply_mat(const double *m1, const double *m2, double *res,
+                         const int m1_rows, const int inner_dim,
+                         const int m2_cols) {
+  double *out = res;
+  for (int r = 0; r < m1_rows; ++r) {
+    const double *lhs_row = m1 + r * inner_dim;
+    for (int c = 0; c < m2_cols; ++c) {
+      double acc = 0;
+      for (int t = 0; t < inner_dim; ++t) {
+        acc += lhs_row[t] * m2[t * m2_cols + c];
+      }
+      *out++ = acc;
+    }
+  }
+}
+
+// Singular value decomposition A = U * diag(w) * V^T of an m x n matrix,
+// adapted from Numerical Recipes in C.
+//   u - in: the matrix A as an array of m row pointers (n columns each);
+//       out: overwritten with U.
+//   w - out: the n singular values.
+//   v - out: V (not its transpose) as an array of n row pointers.
+// Returns 0 on success, 1 if the scratch buffer cannot be allocated or the
+// iteration fails to converge within max_its sweeps for some singular value.
+static int svdcmp(double **u, int m, int n, double w[], double **v) {
+  const int max_its = 30;
+  int flag, i, its, j, jj, k, l, nm;
+  double anorm, c, f, g, h, s, scale, x, y, z;
+  double *rv1 = (double *)aom_malloc(sizeof(*rv1) * (n + 1));
+  // Allocation failure is reported like any other failure instead of being
+  // dereferenced below.
+  if (rv1 == NULL) return 1;
+  g = scale = anorm = 0.0;
+  // Phase 1 (per Numerical Recipes): Householder reduction to bidiagonal
+  // form; rv1 receives the off-diagonal elements.
+  for (i = 0; i < n; i++) {
+    l = i + 1;
+    rv1[i] = scale * g;
+    g = s = scale = 0.0;
+    if (i < m) {
+      for (k = i; k < m; k++) scale += fabs(u[k][i]);
+      if (scale != 0.) {
+        for (k = i; k < m; k++) {
+          u[k][i] /= scale;
+          s += u[k][i] * u[k][i];
+        }
+        f = u[i][i];
+        g = -sign(sqrt(s), f);
+        h = f * g - s;
+        u[i][i] = f - g;
+        for (j = l; j < n; j++) {
+          for (s = 0.0, k = i; k < m; k++) s += u[k][i] * u[k][j];
+          f = s / h;
+          for (k = i; k < m; k++) u[k][j] += f * u[k][i];
+        }
+        for (k = i; k < m; k++) u[k][i] *= scale;
+      }
+    }
+    w[i] = scale * g;
+    g = s = scale = 0.0;
+    if (i < m && i != n - 1) {
+      for (k = l; k < n; k++) scale += fabs(u[i][k]);
+      if (scale != 0.) {
+        for (k = l; k < n; k++) {
+          u[i][k] /= scale;
+          s += u[i][k] * u[i][k];
+        }
+        f = u[i][l];
+        g = -sign(sqrt(s), f);
+        h = f * g - s;
+        u[i][l] = f - g;
+        for (k = l; k < n; k++) rv1[k] = u[i][k] / h;
+        for (j = l; j < m; j++) {
+          for (s = 0.0, k = l; k < n; k++) s += u[j][k] * u[i][k];
+          for (k = l; k < n; k++) u[j][k] += s * rv1[k];
+        }
+        for (k = l; k < n; k++) u[i][k] *= scale;
+      }
+    }
+    // anorm is the magnitude against which "negligible" is judged later.
+    anorm = fmax(anorm, (fabs(w[i]) + fabs(rv1[i])));
+  }
+
+  // Phase 2: accumulate the right-hand transformations into v.
+  for (i = n - 1; i >= 0; i--) {
+    if (i < n - 1) {
+      if (g != 0.) {
+        // Double division avoids a possible underflow of u[i][l] * g.
+        for (j = l; j < n; j++) v[j][i] = (u[i][j] / u[i][l]) / g;
+        for (j = l; j < n; j++) {
+          for (s = 0.0, k = l; k < n; k++) s += u[i][k] * v[k][j];
+          for (k = l; k < n; k++) v[k][j] += s * v[k][i];
+        }
+      }
+      for (j = l; j < n; j++) v[i][j] = v[j][i] = 0.0;
+    }
+    v[i][i] = 1.0;
+    g = rv1[i];
+    l = i;
+  }
+  // Phase 3: accumulate the left-hand transformations into u.
+  for (i = AOMMIN(m, n) - 1; i >= 0; i--) {
+    l = i + 1;
+    g = w[i];
+    for (j = l; j < n; j++) u[i][j] = 0.0;
+    if (g != 0.) {
+      g = 1.0 / g;
+      for (j = l; j < n; j++) {
+        for (s = 0.0, k = l; k < m; k++) s += u[k][i] * u[k][j];
+        f = (s / u[i][i]) * g;
+        for (k = i; k < m; k++) u[k][j] += f * u[k][i];
+      }
+      for (j = i; j < m; j++) u[j][i] *= g;
+    } else {
+      for (j = i; j < m; j++) u[j][i] = 0.0;
+    }
+    ++u[i][i];
+  }
+  // Phase 4: diagonalize the bidiagonal form, one singular value at a time,
+  // with at most max_its sweeps each.
+  for (k = n - 1; k >= 0; k--) {
+    for (its = 0; its < max_its; its++) {
+      flag = 1;
+      // Look for a negligible off-diagonal element to split the problem.
+      for (l = k; l >= 0; l--) {
+        nm = l - 1;
+        if ((double)(fabs(rv1[l]) + anorm) == anorm || nm < 0) {
+          flag = 0;
+          break;
+        }
+        if ((double)(fabs(w[nm]) + anorm) == anorm) break;
+      }
+      if (flag) {
+        // Cancel rv1[l] when l > 0.
+        c = 0.0;
+        s = 1.0;
+        for (i = l; i <= k; i++) {
+          f = s * rv1[i];
+          rv1[i] = c * rv1[i];
+          if ((double)(fabs(f) + anorm) == anorm) break;
+          g = w[i];
+          h = pythag(f, g);
+          w[i] = h;
+          h = 1.0 / h;
+          c = g * h;
+          s = -f * h;
+          for (j = 0; j < m; j++) {
+            y = u[j][nm];
+            z = u[j][i];
+            u[j][nm] = y * c + z * s;
+            u[j][i] = z * c - y * s;
+          }
+        }
+      }
+      z = w[k];
+      if (l == k) {
+        // Converged: make the singular value non-negative.
+        if (z < 0.0) {
+          w[k] = -z;
+          for (j = 0; j < n; j++) v[j][k] = -v[j][k];
+        }
+        break;
+      }
+      if (its == max_its - 1) {
+        aom_free(rv1);
+        return 1;
+      }
+      assert(k > 0);
+      // Shift from the bottom 2x2 minor.
+      x = w[l];
+      nm = k - 1;
+      y = w[nm];
+      g = rv1[nm];
+      h = rv1[k];
+      f = ((y - z) * (y + z) + (g - h) * (g + h)) / (2.0 * h * y);
+      g = pythag(f, 1.0);
+      f = ((x - z) * (x + z) + h * ((y / (f + sign(g, f))) - h)) / x;
+      // Next QR transformation.
+      c = s = 1.0;
+      for (j = l; j <= nm; j++) {
+        i = j + 1;
+        g = rv1[i];
+        y = w[i];
+        h = s * g;
+        g = c * g;
+        z = pythag(f, h);
+        rv1[j] = z;
+        c = f / z;
+        s = h / z;
+        f = x * c + g * s;
+        g = g * c - x * s;
+        h = y * s;
+        y *= c;
+        for (jj = 0; jj < n; jj++) {
+          x = v[jj][j];
+          z = v[jj][i];
+          v[jj][j] = x * c + z * s;
+          v[jj][i] = z * c - x * s;
+        }
+        z = pythag(f, h);
+        w[j] = z;
+        // The rotation can be arbitrary when z is zero.
+        if (z != 0.) {
+          z = 1.0 / z;
+          c = f * z;
+          s = h * z;
+        }
+        f = c * g + s * y;
+        x = c * y - s * g;
+        for (jj = 0; jj < m; jj++) {
+          y = u[jj][j];
+          z = u[jj][i];
+          u[jj][j] = y * c + z * s;
+          u[jj][i] = z * c - y * s;
+        }
+      }
+      rv1[l] = 0.0;
+      rv1[k] = f;
+      w[k] = x;
+    }
+  }
+  aom_free(rv1);
+  return 0;
+}
+
+// Flat-array front end for svdcmp().
+//   U    - out: M x N row-major buffer; receives a copy of matx and is then
+//          decomposed in place.
+//   W    - out: N singular values.
+//   V    - out: N x N row-major buffer.
+//   matx - in: M x N row-major matrix (not modified).
+// Returns 0 on success, 1 on allocation failure or if svdcmp() fails.
+static int SVD(double *U, double *W, double *V, double *matx, int M, int N) {
+  // Row-pointer views expected by the Numerical Recipes style routine.
+  double **u_rows = (double **)aom_malloc((M) * sizeof(*u_rows));
+  double **v_rows = (double **)aom_malloc((N) * sizeof(*v_rows));
+  int failed = 1;
+  int r;
+
+  if (u_rows && v_rows) {
+    for (r = 0; r < M; r++) {
+      u_rows[r] = &U[r * N];
+      // Seed each row of U with the matching row of the input matrix.
+      memcpy(&(u_rows[r][0]), matx + N * r, N * sizeof(*matx));
+    }
+    for (r = 0; r < N; r++) {
+      v_rows[r] = &V[r * N];
+    }
+    failed = svdcmp(u_rows, M, N, W, v_rows) ? 1 : 0;
+  }
+
+  if (u_rows) aom_free(u_rows);
+  if (v_rows) aom_free(v_rows);
+  return failed;
+}
+
+// Computes the pseudo-inverse of an M x N row-major matrix from its SVD:
+// inv = V * diag(1 / W) * U^T.
+//   inv  - out: N x M row-major result.
+//   matx - in: M x N row-major matrix.
+// Returns 0 on success. Returns 1 if memory cannot be allocated, if the SVD
+// fails, or if any singular value is below TINY_NEAR_ZERO in magnitude
+// (the matrix is treated as rank deficient); inv is not written in that case.
+int pseudo_inverse(double *inv, double *matx, const int M, const int N) {
+  double ans;
+  int i, j, k;
+  int failed = 1;
+  double *const U = (double *)aom_malloc(M * N * sizeof(*matx));
+  double *const W = (double *)aom_malloc(N * sizeof(*matx));
+  double *const V = (double *)aom_malloc(N * N * sizeof(*matx));
+
+  // All exits go through `cleanup` so a partial allocation failure no longer
+  // leaks the buffers that did get allocated.
+  if (!(U && W && V)) goto cleanup;
+  if (SVD(U, W, V, matx, M, N)) goto cleanup;
+
+  for (i = 0; i < N; i++) {
+    if (fabs(W[i]) < TINY_NEAR_ZERO) goto cleanup;
+  }
+
+  for (i = 0; i < N; i++) {
+    for (j = 0; j < M; j++) {
+      ans = 0;
+      for (k = 0; k < N; k++) {
+        ans += V[k + N * i] * U[k + N * j] / W[k];
+      }
+      inv[j + M * i] = ans;
+    }
+  }
+  failed = 0;
+
+cleanup:
+  if (U) aom_free(U);
+  if (W) aom_free(W);
+  if (V) aom_free(V);
+  return failed;
+}
+
+// Normalizes n interleaved (x, y) points in place: the centroid is moved to
+// the origin and the points are scaled so that their mean distance from the
+// origin becomes sqrt(2) (scale 1 is used when that mean distance is 0).
+// The same transform is written to T as a row-major 3x3 matrix
+//   [ s 0 -s*mean_x ; 0 s -s*mean_y ; 0 0 1 ].
+static void normalize_homography(double *pts, int n, double *T) {
+  double centroid_x = 0;
+  double centroid_y = 0;
+  double mean_dist = 0;
+  double scale;
+  int k;
+
+  for (k = 0; k < n; ++k) {
+    centroid_x += pts[2 * k];
+    centroid_y += pts[2 * k + 1];
+  }
+  centroid_x /= n;
+  centroid_y /= n;
+
+  // Center the points and accumulate their distances from the origin.
+  for (k = 0; k < n; ++k) {
+    double *const pt = pts + 2 * k;
+    pt[0] -= centroid_x;
+    pt[1] -= centroid_y;
+    mean_dist += sqrt(pt[0] * pt[0] + pt[1] * pt[1]);
+  }
+  mean_dist /= n;
+
+  if (mean_dist == 0) {
+    scale = 1.0;
+  } else {
+    scale = sqrt(2) / mean_dist;
+  }
+
+  T[0] = scale;
+  T[1] = 0;
+  T[2] = -scale * centroid_x;
+  T[3] = 0;
+  T[4] = scale;
+  T[5] = -scale * centroid_y;
+  T[6] = 0;
+  T[7] = 0;
+  T[8] = 1;
+
+  for (k = 0; k < n; ++k) {
+    pts[2 * k] *= scale;
+    pts[2 * k + 1] *= scale;
+  }
+}
+
+// Inverts a normalization matrix of the form produced by
+// normalize_homography() (uniform scale T[0] plus translation T[2], T[5]).
+// T and iT are row-major 3x3 matrices.
+static void invnormalize_mat(double *T, double *iT) {
+  // Read everything needed from T before writing iT.
+  const double inv_scale = 1.0 / T[0];
+  const double shift_x = -T[2] * inv_scale;
+  const double shift_y = -T[5] * inv_scale;
+
+  iT[0] = inv_scale;
+  iT[1] = 0;
+  iT[2] = shift_x;
+
+  iT[3] = 0;
+  iT[4] = inv_scale;
+  iT[5] = shift_y;
+
+  iT[6] = 0;
+  iT[7] = 0;
+  iT[8] = 1;
+}
+
+// Undoes point normalization on a row-major 3x3 model, in place:
+//   params <- inverse(T2) * params * T1
+// where T1 / T2 are the normalization matrices of the source / destination
+// point sets.
+static void denormalize_homography(double *params, double *T1, double *T2) {
+  double inv_T2[9];
+  double params_times_T1[9];
+
+  invnormalize_mat(T2, inv_T2);
+  multiply_mat(params, T1, params_times_T1, 3, 3, 3);
+  multiply_mat(inv_T2, params_times_T1, params, 3, 3, 3);
+}
+
+// Denormalizes a homography and converts it to the parameter order used by
+// the project_points_double_* functions.
+// Only params[0..7] are read (the ninth matrix entry is taken to be 1) and
+// only params[0..7] are written:
+//   params[0], params[1] - translation
+//   params[2..5]         - 2x2 linear part, row-major
+//   params[6], params[7] - perspective terms
+static void denormalize_homography_reorder(double *params, double *T1,
+                                           double *T2) {
+  double full[MAX_PARAMDIM];
+  int k;
+
+  for (k = 0; k < 8; ++k) full[k] = params[k];
+  full[8] = 1.0;
+  denormalize_homography(full, T1, T2);
+
+  // Row-major 3x3 -> (tx, ty, a, b, c, d, p, q).
+  params[0] = full[2];
+  params[1] = full[5];
+  params[2] = full[0];
+  params[3] = full[1];
+  params[4] = full[3];
+  params[5] = full[4];
+  params[6] = full[6];
+  params[7] = full[7];
+}
+
+// Denormalizes an affine least-squares solution and converts it to the
+// parameter order used by the project_points_double_* functions.
+// On entry params[0..5] hold the solution in the column order built by
+// find_affine(); on return params[0], params[1] are the translation,
+// params[2..5] the 2x2 linear part and params[6], params[7] are 0.
+static void denormalize_affine_reorder(double *params, double *T1, double *T2) {
+  // Expand to a row-major 3x3 matrix with last row (0, 0, 1).
+  double full[MAX_PARAMDIM] = { params[0], params[1], params[4],
+                                params[2], params[3], params[5],
+                                0,         0,         1 };
+
+  denormalize_homography(full, T1, T2);
+
+  params[0] = full[2];
+  params[1] = full[5];
+  params[2] = full[0];
+  params[3] = full[1];
+  params[4] = full[3];
+  params[5] = full[4];
+  params[6] = 0;
+  params[7] = 0;
+}
+
+// Denormalizes a rotation+zoom least-squares solution and converts it to the
+// parameter order used by the project_points_double_* functions.
+// On entry params[0..3] hold the solution in the column order built by
+// find_rotzoom(); on return params[0], params[1] are the translation,
+// params[2..5] the 2x2 part [ a b ; -b a ] and params[6], params[7] are 0.
+static void denormalize_rotzoom_reorder(double *params, double *T1,
+                                        double *T2) {
+  // Expand to a row-major 3x3 matrix with last row (0, 0, 1).
+  double full[MAX_PARAMDIM] = { params[0],  params[1], params[2],
+                                -params[1], params[0], params[3],
+                                0,          0,         1 };
+
+  denormalize_homography(full, T1, T2);
+
+  params[0] = full[2];
+  params[1] = full[5];
+  params[2] = full[0];
+  params[3] = full[1];
+  // The second row is rebuilt from the first so the rotzoom structure holds.
+  params[4] = -full[1];
+  params[5] = full[0];
+  params[6] = 0;
+  params[7] = 0;
+}
+
+// Denormalizes a pure translation and converts it to the parameter order
+// used by the project_points_double_* functions.
+// On entry params[0], params[1] hold the normalized offsets; on return they
+// hold the denormalized offsets, params[2..5] are the identity 2x2 matrix
+// and params[6], params[7] are 0.
+static void denormalize_translation_reorder(double *params, double *T1,
+                                            double *T2) {
+  // Identity linear part plus the translation, as a row-major 3x3 matrix.
+  double full[MAX_PARAMDIM] = { 1, 0, params[0],
+                                0, 1, params[1],
+                                0, 0, 1 };
+
+  denormalize_homography(full, T1, T2);
+
+  params[0] = full[2];
+  params[1] = full[5];
+  params[2] = 1;
+  params[3] = 0;
+  params[4] = 0;
+  params[5] = 1;
+  params[6] = 0;
+  params[7] = 0;
+}
+
+// Fits a pure translation to np point correspondences: the mean of
+// (pts2 - pts1) after normalizing both point sets, then denormalized.
+//   pts1 / pts2 - interleaved (x, y) arrays; both are normalized in place.
+//   mat         - out: 8 model parameters (see
+//                 denormalize_translation_reorder()).
+// Always returns 0.
+static int find_translation(int np, double *pts1, double *pts2, double *mat) {
+  double T1[9], T2[9];
+  double total_dx = 0;
+  double total_dy = 0;
+  int k;
+
+  normalize_homography(pts1, np, T1);
+  normalize_homography(pts2, np, T2);
+
+  for (k = 0; k < np; ++k) {
+    total_dx += pts2[2 * k] - pts1[2 * k];
+    total_dy += pts2[2 * k + 1] - pts1[2 * k + 1];
+  }
+
+  mat[0] = total_dx / np;
+  mat[1] = total_dy / np;
+  denormalize_translation_reorder(mat, T1, T2);
+  return 0;
+}
+
+// Least-squares fit of a rotation+zoom+translation model to np point
+// correspondences, solved through the pseudo-inverse of the design matrix.
+//   pts1 / pts2 - interleaved (x, y) arrays; both are normalized in place.
+//   mat         - out: 8 model parameters (see denormalize_rotzoom_reorder()).
+// Returns 0 on success, 1 if memory cannot be allocated or the system is
+// rank deficient.
+static int find_rotzoom(int np, double *pts1, double *pts2, double *mat) {
+  const int np2 = np * 2;
+  // One allocation holds the design matrix (np2 x 4), the right-hand side
+  // (np2) and the pseudo-inverse (4 x np2).
+  double *a = (double *)aom_malloc(sizeof(*a) * np2 * 9);
+  double *b;
+  double *temp;
+  int i;
+  double sx, sy, dx, dy;
+  double T1[9], T2[9];
+
+  // Report allocation failure instead of writing through a NULL pointer.
+  if (a == NULL) return 1;
+  b = a + np2 * 4;
+  temp = b + np2;
+
+  normalize_homography(pts1, np, T1);
+  normalize_homography(pts2, np, T2);
+
+  for (i = 0; i < np; ++i) {
+    double *const row_x = a + (i * 2) * 4;
+    double *const row_y = row_x + 4;
+    dx = *(pts2++);
+    dy = *(pts2++);
+    sx = *(pts1++);
+    sy = *(pts1++);
+
+    row_x[0] = sx;
+    row_x[1] = sy;
+    row_x[2] = 1;
+    row_x[3] = 0;
+    row_y[0] = sy;
+    row_y[1] = -sx;
+    row_y[2] = 0;
+    row_y[3] = 1;
+
+    b[2 * i] = dx;
+    b[2 * i + 1] = dy;
+  }
+  if (pseudo_inverse(temp, a, np2, 4)) {
+    aom_free(a);
+    return 1;
+  }
+  multiply_mat(temp, b, mat, 4, np2, 1);
+  denormalize_rotzoom_reorder(mat, T1, T2);
+  aom_free(a);
+  return 0;
+}
+
+// Least-squares fit of a 6-parameter affine model to np point
+// correspondences, solved through the pseudo-inverse of the design matrix.
+//   pts1 / pts2 - interleaved (x, y) arrays; both are normalized in place.
+//   mat         - out: 8 model parameters (see denormalize_affine_reorder()).
+// Returns 0 on success, 1 if memory cannot be allocated or the system is
+// rank deficient.
+static int find_affine(int np, double *pts1, double *pts2, double *mat) {
+  const int np2 = np * 2;
+  // One allocation holds the design matrix (np2 x 6), the right-hand side
+  // (np2) and the pseudo-inverse (6 x np2).
+  double *a = (double *)aom_malloc(sizeof(*a) * np2 * 13);
+  double *b;
+  double *temp;
+  int i;
+  double sx, sy, dx, dy;
+  double T1[9], T2[9];
+
+  // Report allocation failure instead of writing through a NULL pointer.
+  if (a == NULL) return 1;
+  b = a + np2 * 6;
+  temp = b + np2;
+
+  normalize_homography(pts1, np, T1);
+  normalize_homography(pts2, np, T2);
+
+  for (i = 0; i < np; ++i) {
+    double *const row_x = a + (i * 2) * 6;
+    double *const row_y = row_x + 6;
+    dx = *(pts2++);
+    dy = *(pts2++);
+    sx = *(pts1++);
+    sy = *(pts1++);
+
+    row_x[0] = sx;
+    row_x[1] = sy;
+    row_x[2] = 0;
+    row_x[3] = 0;
+    row_x[4] = 1;
+    row_x[5] = 0;
+    row_y[0] = 0;
+    row_y[1] = 0;
+    row_y[2] = sx;
+    row_y[3] = sy;
+    row_y[4] = 0;
+    row_y[5] = 1;
+
+    b[2 * i] = dx;
+    b[2 * i + 1] = dy;
+  }
+  if (pseudo_inverse(temp, a, np2, 6)) {
+    aom_free(a);
+    return 1;
+  }
+  multiply_mat(temp, b, mat, 6, np2, 1);
+  denormalize_affine_reorder(mat, T1, T2);
+  aom_free(a);
+  return 0;
+}
+
+// Fits the "vertical trapezoid" projective model (7 free matrix entries) to
+// np point correspondences. The model is the right singular vector that
+// belongs to the smallest singular value of the constraint matrix.
+//   pts1 / pts2 - interleaved (x, y) arrays; both are normalized in place.
+//   mat         - out: 8 model parameters in project_points_double_* order.
+// Returns 0 on success, 1 if memory cannot be allocated, the SVD fails, no
+// usable singular value is found, or the scale entry H[8] is zero.
+static int find_vertrapezoid(int np, double *pts1, double *pts2, double *mat) {
+  const int np3 = np * 3;
+  // First half: constraint matrix (np3 x 7); second half: U from the SVD.
+  double *a = (double *)aom_malloc(sizeof(*a) * np3 * 14);
+  double *U;
+  double S[7], V[7 * 7], H[9];
+  double minS = 1e12;
+  int i, mini = -1;
+  double sx, sy, dx, dy;
+  double T1[9], T2[9];
+
+  // Report allocation failure instead of writing through a NULL pointer.
+  if (a == NULL) return 1;
+  U = a + np3 * 7;
+
+  normalize_homography(pts1, np, T1);
+  normalize_homography(pts2, np, T2);
+
+  for (i = 0; i < np; ++i) {
+    double *const r0 = a + (i * 3) * 7;
+    double *const r1 = r0 + 7;
+    double *const r2 = r1 + 7;
+    dx = *(pts2++);
+    dy = *(pts2++);
+    sx = *(pts1++);
+    sy = *(pts1++);
+
+    r0[0] = r0[1] = 0;
+    r0[2] = -sx;
+    r0[3] = -sy;
+    r0[4] = -1;
+    r0[5] = dy * sx;
+    r0[6] = dy;
+
+    r1[0] = sx;
+    r1[1] = 1;
+    r1[2] = r1[3] = r1[4] = 0;
+    r1[5] = -dx * sx;
+    r1[6] = -dx;
+
+    r2[0] = -dy * sx;
+    r2[1] = -dy;
+    r2[2] = dx * sx;
+    r2[3] = dx * sy;
+    r2[4] = dx;
+    r2[5] = r2[6] = 0;
+  }
+  if (SVD(U, S, V, a, np3, 7)) {
+    aom_free(a);
+    return 1;
+  }
+  for (i = 0; i < 7; ++i) {
+    if (S[i] < minS) {
+      minS = S[i];
+      mini = i;
+    }
+  }
+  // No singular value compared below the 1e12 sentinel (e.g. NaN results):
+  // fail rather than index V with -1.
+  if (mini < 0) {
+    aom_free(a);
+    return 1;
+  }
+  // Scatter the 7 solved entries into the 3x3 matrix; H[1] and H[7] are the
+  // entries this model fixes at zero.
+  H[1] = H[7] = 0;
+  for (i = 0; i < 1; i++) H[i] = V[i * 7 + mini];
+  for (; i < 6; i++) H[i + 1] = V[i * 7 + mini];
+  for (; i < 7; i++) H[i + 2] = V[i * 7 + mini];
+
+  // NOTE(review): denormalize_homography_reorder() reads and writes only
+  // H[0..7] and substitutes 1 for the ninth entry, so H[8] below is still the
+  // raw SVD component - confirm this scaling is intended.
+  denormalize_homography_reorder(H, T1, T2);
+  aom_free(a);
+  if (H[8] == 0.0) {
+    return 1;
+  } else {
+    // normalize
+    double f = 1.0 / H[8];
+    for (i = 0; i < 8; i++) mat[i] = f * H[i];
+  }
+  return 0;
+}
+
+// Fits the "horizontal trapezoid" projective model (7 free matrix entries)
+// to np point correspondences. The model is the right singular vector that
+// belongs to the smallest singular value of the constraint matrix.
+//   pts1 / pts2 - interleaved (x, y) arrays; both are normalized in place.
+//   mat         - out: 8 model parameters in project_points_double_* order.
+// Returns 0 on success, 1 if memory cannot be allocated, the SVD fails, no
+// usable singular value is found, or the scale entry H[8] is zero.
+static int find_hortrapezoid(int np, double *pts1, double *pts2, double *mat) {
+  const int np3 = np * 3;
+  // First half: constraint matrix (np3 x 7); second half: U from the SVD.
+  double *a = (double *)aom_malloc(sizeof(*a) * np3 * 14);
+  double *U;
+  double S[7], V[7 * 7], H[9];
+  double minS = 1e12;
+  int i, mini = -1;
+  double sx, sy, dx, dy;
+  double T1[9], T2[9];
+
+  // Report allocation failure instead of writing through a NULL pointer.
+  if (a == NULL) return 1;
+  U = a + np3 * 7;
+
+  normalize_homography(pts1, np, T1);
+  normalize_homography(pts2, np, T2);
+
+  for (i = 0; i < np; ++i) {
+    double *const r0 = a + (i * 3) * 7;
+    double *const r1 = r0 + 7;
+    double *const r2 = r1 + 7;
+    dx = *(pts2++);
+    dy = *(pts2++);
+    sx = *(pts1++);
+    sy = *(pts1++);
+
+    r0[0] = r0[1] = r0[2] = 0;
+    r0[3] = -sy;
+    r0[4] = -1;
+    r0[5] = dy * sy;
+    r0[6] = dy;
+
+    r1[0] = sx;
+    r1[1] = sy;
+    r1[2] = 1;
+    r1[3] = r1[4] = 0;
+    r1[5] = -dx * sy;
+    r1[6] = -dx;
+
+    r2[0] = -dy * sx;
+    r2[1] = -dy * sy;
+    r2[2] = -dy;
+    r2[3] = dx * sy;
+    r2[4] = dx;
+    r2[5] = r2[6] = 0;
+  }
+
+  if (SVD(U, S, V, a, np3, 7)) {
+    aom_free(a);
+    return 1;
+  }
+  for (i = 0; i < 7; ++i) {
+    if (S[i] < minS) {
+      minS = S[i];
+      mini = i;
+    }
+  }
+  // No singular value compared below the 1e12 sentinel (e.g. NaN results):
+  // fail rather than index V with -1.
+  if (mini < 0) {
+    aom_free(a);
+    return 1;
+  }
+  // Scatter the 7 solved entries into the 3x3 matrix; H[3] and H[6] are the
+  // entries this model fixes at zero.
+  H[3] = H[6] = 0;
+  for (i = 0; i < 3; i++) H[i] = V[i * 7 + mini];
+  for (; i < 5; i++) H[i + 1] = V[i * 7 + mini];
+  for (; i < 7; i++) H[i + 2] = V[i * 7 + mini];
+
+  // NOTE(review): denormalize_homography_reorder() reads and writes only
+  // H[0..7] and substitutes 1 for the ninth entry, so H[8] below is still the
+  // raw SVD component - confirm this scaling is intended.
+  denormalize_homography_reorder(H, T1, T2);
+  aom_free(a);
+  if (H[8] == 0.0) {
+    return 1;
+  } else {
+    // normalize
+    double f = 1.0 / H[8];
+    for (i = 0; i < 8; i++) mat[i] = f * H[i];
+  }
+  return 0;
+}
+
+// Fits a full homography to np point correspondences. The model is the
+// right singular vector that belongs to the smallest singular value of the
+// constraint matrix built from the normalized points.
+//   pts1 / pts2 - interleaved (x, y) arrays; both are normalized in place.
+//   mat         - out: 8 model parameters in project_points_double_* order.
+// Returns 0 on success, 1 if memory cannot be allocated, the SVD fails, no
+// usable singular value is found, or the scale entry H[8] is zero.
+static int find_homography(int np, double *pts1, double *pts2, double *mat) {
+  // Implemented from Peter Kovesi's normalized implementation
+  const int np3 = np * 3;
+  // First half: constraint matrix (np3 x 9); second half: U from the SVD.
+  double *a = (double *)aom_malloc(sizeof(*a) * np3 * 18);
+  double *U;
+  double S[9], V[9 * 9], H[9];
+  double minS = 1e12;
+  int i, mini = -1;
+  double sx, sy, dx, dy;
+  double T1[9], T2[9];
+
+  // Report allocation failure instead of writing through a NULL pointer.
+  if (a == NULL) return 1;
+  U = a + np3 * 9;
+
+  normalize_homography(pts1, np, T1);
+  normalize_homography(pts2, np, T2);
+
+  for (i = 0; i < np; ++i) {
+    double *const r0 = a + (i * 3) * 9;
+    double *const r1 = r0 + 9;
+    double *const r2 = r1 + 9;
+    dx = *(pts2++);
+    dy = *(pts2++);
+    sx = *(pts1++);
+    sy = *(pts1++);
+
+    r0[0] = r0[1] = r0[2] = 0;
+    r0[3] = -sx;
+    r0[4] = -sy;
+    r0[5] = -1;
+    r0[6] = dy * sx;
+    r0[7] = dy * sy;
+    r0[8] = dy;
+
+    r1[0] = sx;
+    r1[1] = sy;
+    r1[2] = 1;
+    r1[3] = r1[4] = r1[5] = 0;
+    r1[6] = -dx * sx;
+    r1[7] = -dx * sy;
+    r1[8] = -dx;
+
+    r2[0] = -dy * sx;
+    r2[1] = -dy * sy;
+    r2[2] = -dy;
+    r2[3] = dx * sx;
+    r2[4] = dx * sy;
+    r2[5] = dx;
+    r2[6] = r2[7] = r2[8] = 0;
+  }
+
+  if (SVD(U, S, V, a, np3, 9)) {
+    aom_free(a);
+    return 1;
+  }
+  for (i = 0; i < 9; ++i) {
+    if (S[i] < minS) {
+      minS = S[i];
+      mini = i;
+    }
+  }
+  // No singular value compared below the 1e12 sentinel (e.g. NaN results):
+  // fail rather than index V with -1.
+  if (mini < 0) {
+    aom_free(a);
+    return 1;
+  }
+
+  for (i = 0; i < 9; i++) H[i] = V[i * 9 + mini];
+  // NOTE(review): denormalize_homography_reorder() reads and writes only
+  // H[0..7] and substitutes 1 for the ninth entry, so H[8] below is still the
+  // raw SVD component - confirm this scaling is intended.
+  denormalize_homography_reorder(H, T1, T2);
+  aom_free(a);
+  if (H[8] == 0.0) {
+    return 1;
+  } else {
+    // normalize
+    double f = 1.0 / H[8];
+    for (i = 0; i < 8; i++) mat[i] = f * H[i];
+  }
+  return 0;
+}
+
+// Draws minpts distinct indices in [0, npoints) using rand_r(seed).
+//   indices - out: receives the minpts chosen indices.
+//   seed    - in/out: rand_r() state.
+// Returns 1 on success, 0 when npoints is not positive or fewer than minpts
+// points are available (nothing is drawn and the seed is left untouched).
+static int get_rand_indices(int npoints, int minpts, int *indices,
+                            unsigned int *seed) {
+  int i, j;
+  int ptr;
+  // Validate before the first draw: `% npoints` must never see zero.
+  if (npoints <= 0 || minpts > npoints) return 0;
+  ptr = rand_r(seed) % npoints;
+  indices[0] = ptr;
+  ptr = (ptr == npoints - 1 ? 0 : ptr + 1);
+  i = 1;
+  while (i < minpts) {
+    // Walk forward (with wrap-around) past `index` not-yet-chosen points.
+    int index = rand_r(seed) % npoints;
+    while (index) {
+      ptr = (ptr == npoints - 1 ? 0 : ptr + 1);
+      for (j = 0; j < i; ++j) {
+        if (indices[j] == ptr) break;
+      }
+      if (j == i) index--;
+    }
+    // When the draw was 0 the walk above did not move, so ptr may still be a
+    // point chosen earlier; step to the next free one so no index repeats.
+    // i < minpts <= npoints guarantees a free point exists.
+    for (;;) {
+      for (j = 0; j < i; ++j) {
+        if (indices[j] == ptr) break;
+      }
+      if (j == i) break;
+      ptr = (ptr == npoints - 1 ? 0 : ptr + 1);
+    }
+    indices[i++] = ptr;
+  }
+  return 1;
+}
+
+typedef struct { // One candidate motion model and how well it fits the points.
+ int num_inliers; // Number of valid entries in inlier_indices.
+ double variance; // Tie-breaker in compare_motions(): lower is better; starts at kInfiniteVariance.
+ int *inlier_indices; // Indices of the points counted as inliers; buffer is allocated by ransac().
+} RANSAC_MOTION;
+
+// Ordering for RANSAC_MOTION: more inliers is better, and among equal
+// inlier counts the lower variance is better.
+// Returns -1 if 'a' is the better motion, 1 if 'b' is better, 0 on a tie.
+static int compare_motions(const void *arg_a, const void *arg_b) {
+  const RANSAC_MOTION *const lhs = (const RANSAC_MOTION *)arg_a;
+  const RANSAC_MOTION *const rhs = (const RANSAC_MOTION *)arg_b;
+
+  if (lhs->num_inliers != rhs->num_inliers) {
+    return lhs->num_inliers > rhs->num_inliers ? -1 : 1;
+  }
+  if (lhs->variance < rhs->variance) return -1;
+  if (lhs->variance > rhs->variance) return 1;
+  return 0;
+}
+
+// Returns nonzero when motion_a ranks strictly ahead of motion_b under
+// compare_motions().
+static int is_better_motion(const RANSAC_MOTION *motion_a,
+                            const RANSAC_MOTION *motion_b) {
+  const int order = compare_motions(motion_a, motion_b);
+  return order < 0;
+}
+
+// Gathers num_points (x, y) pairs from src into dest:
+// dest point k is src point indices[k]. Both arrays are interleaved x, y.
+static void copy_points_at_indices(double *dest, const double *src,
+                                   const int *indices, int num_points) {
+  double *out = dest;
+  for (int k = 0; k < num_points; ++k, out += 2) {
+    const double *const picked = src + indices[k] * 2;
+    out[0] = picked[0];
+    out[1] = picked[1];
+  }
+}
+
+// Variance assigned to a motion that has not been scored yet.
+static const double kInfiniteVariance = 1e12;
+
+// Resets a motion to "no inliers, worst possible variance" and zeroes its
+// first num_points inlier slots.
+// A NULL inlier buffer is tolerated because ransac() calls this right after
+// allocating the buffer and only checks its allocations afterwards.
+static void clear_motion(RANSAC_MOTION *motion, int num_points) {
+  motion->num_inliers = 0;
+  motion->variance = kInfiniteVariance;
+  if (motion->inlier_indices != NULL && num_points > 0) {
+    // The element size is multiplied by the count outside sizeof; with the
+    // product inside sizeof only a single int would be cleared.
+    memset(motion->inlier_indices, 0,
+           sizeof(*motion->inlier_indices) * num_points);
+  }
+}
+
+ // Generic RANSAC driver shared by all motion models.
+ //
+ // matched_points        4 ints per correspondence: the first two are copied
+ //                       to the source-corner list, the next two to the
+ //                       destination-corner list.
+ // npoints               number of correspondences.
+ // num_inliers_by_motion out: inlier count of each kept motion, best first.
+ // params_by_motion      out: refitted parameters of each kept motion, written
+ //                       at a stride of (MAX_PARAMDIM - 1) doubles.
+ // num_desired_motions   number of motions to keep.
+ // minpts                sample size drawn per trial.
+ // is_degenerate, find_transformation, projectpoints
+ //                       model-specific callbacks.
+ //
+ // Returns 0 on success and 1 on failure (too few points, allocation failure,
+ // failure to draw random indices, or too many degenerate samples). On failure
+ // the output arrays are not written.
+ static int ransac(const int *matched_points, int npoints,
+                   int *num_inliers_by_motion, double *params_by_motion,
+                   int num_desired_motions, const int minpts,
+                   IsDegenerateFunc is_degenerate,
+                   FindTransformationFunc find_transformation,
+                   ProjectPointsDoubleFunc projectpoints) {
+   static const double PROBABILITY_REQUIRED = 0.9;
+   static const double EPS = 1e-12;
+
+   int N = 10000, trial_count = 0;
+   int i = 0;
+   int ret_val = 0;
+
+   unsigned int seed = (unsigned int)npoints;
+
+   int indices[MAX_MINPTS] = { 0 };
+
+   double *points1, *points2;
+   double *corners1, *corners2;
+   double *image1_coord;
+
+   // Store information for the num_desired_motions best transformations found
+   // and the worst motion among them, as well as the motion currently under
+   // consideration.
+   RANSAC_MOTION *motions, *worst_kept_motion = NULL;
+   RANSAC_MOTION current_motion;
+
+   // Store the parameters and the indices of the inlier points for the motion
+   // currently under consideration.
+   double params_this_motion[MAX_PARAMDIM];
+
+   double *cnp1, *cnp2;
+
+   if (npoints < minpts * MINPTS_MULTIPLIER || npoints == 0) {
+     return 1;
+   }
+
+   points1 = (double *)aom_malloc(sizeof(*points1) * npoints * 2);
+   points2 = (double *)aom_malloc(sizeof(*points2) * npoints * 2);
+   corners1 = (double *)aom_malloc(sizeof(*corners1) * npoints * 2);
+   corners2 = (double *)aom_malloc(sizeof(*corners2) * npoints * 2);
+   image1_coord = (double *)aom_malloc(sizeof(*image1_coord) * npoints * 2);
+
+   motions =
+       (RANSAC_MOTION *)aom_malloc(sizeof(RANSAC_MOTION) * num_desired_motions);
+   current_motion.inlier_indices =
+       (int *)aom_malloc(sizeof(*current_motion.inlier_indices) * npoints);
+
+   // Null every per-motion buffer pointer before anything can fail, so the
+   // cleanup code below can free them unconditionally.
+   if (motions) {
+     for (i = 0; i < num_desired_motions; ++i) {
+       motions[i].inlier_indices = NULL;
+     }
+   }
+
+   // Check the allocations before any of the buffers is touched.
+   if (!(points1 && points2 && corners1 && corners2 && image1_coord && motions &&
+         current_motion.inlier_indices)) {
+     ret_val = 1;
+     goto finish_ransac;
+   }
+
+   for (i = 0; i < num_desired_motions; ++i) {
+     motions[i].inlier_indices =
+         (int *)aom_malloc(sizeof(*motions->inlier_indices) * npoints);
+     if (!motions[i].inlier_indices) {
+       ret_val = 1;
+       goto finish_ransac;
+     }
+     clear_motion(motions + i, npoints);
+   }
+   clear_motion(&current_motion, npoints);
+
+   worst_kept_motion = motions;
+
+   // De-interleave the correspondences into source and destination lists.
+   cnp1 = corners1;
+   cnp2 = corners2;
+   for (i = 0; i < npoints; ++i) {
+     *(cnp1++) = *(matched_points++);
+     *(cnp1++) = *(matched_points++);
+     *(cnp2++) = *(matched_points++);
+     *(cnp2++) = *(matched_points++);
+   }
+
+   while (N > trial_count) {
+     double sum_distance = 0.0;
+     double sum_distance_squared = 0.0;
+
+     clear_motion(&current_motion, npoints);
+
+     int degenerate = 1;
+     int num_degenerate_iter = 0;
+
+     // Draw random samples until a non-degenerate one is found.
+     while (degenerate) {
+       num_degenerate_iter++;
+       if (!get_rand_indices(npoints, minpts, indices, &seed)) {
+         ret_val = 1;
+         goto finish_ransac;
+       }
+
+       copy_points_at_indices(points1, corners1, indices, minpts);
+       copy_points_at_indices(points2, corners2, indices, minpts);
+
+       degenerate = is_degenerate(points1);
+       if (num_degenerate_iter > MAX_DEGENERATE_ITER) {
+         ret_val = 1;
+         goto finish_ransac;
+       }
+     }
+
+     // A nonzero return means no model could be fitted to this sample.
+     if (find_transformation(minpts, points1, points2, params_this_motion)) {
+       trial_count++;
+       continue;
+     }
+
+     projectpoints(params_this_motion, corners1, image1_coord, npoints, 2, 2);
+
+     // Collect the inliers of the candidate model and their error statistics.
+     for (i = 0; i < npoints; ++i) {
+       double dx = image1_coord[i * 2] - corners2[i * 2];
+       double dy = image1_coord[i * 2 + 1] - corners2[i * 2 + 1];
+       double distance = sqrt(dx * dx + dy * dy);
+
+       if (distance < INLIER_THRESHOLD) {
+         current_motion.inlier_indices[current_motion.num_inliers++] = i;
+         sum_distance += distance;
+         sum_distance_squared += distance * distance;
+       }
+     }
+
+     if (current_motion.num_inliers >= worst_kept_motion->num_inliers &&
+         current_motion.num_inliers > 1) {
+       int temp;
+       double fracinliers, pNoOutliers, mean_distance;
+       mean_distance = sum_distance / ((double)current_motion.num_inliers);
+       // Sample variance of the inlier distances.
+       current_motion.variance =
+           sum_distance_squared / ((double)current_motion.num_inliers - 1.0) -
+           mean_distance * mean_distance * ((double)current_motion.num_inliers) /
+               ((double)current_motion.num_inliers - 1.0);
+       if (is_better_motion(&current_motion, worst_kept_motion)) {
+         // This motion is better than the worst currently kept motion. Remember
+         // the inlier points and variance. The parameters for each kept motion
+         // will be recomputed later using only the inliers.
+         worst_kept_motion->num_inliers = current_motion.num_inliers;
+         worst_kept_motion->variance = current_motion.variance;
+         memcpy(worst_kept_motion->inlier_indices, current_motion.inlier_indices,
+                sizeof(*current_motion.inlier_indices) * npoints);
+
+         // Re-estimate the number of trials needed from the inlier fraction.
+         assert(npoints > 0);
+         fracinliers = (double)current_motion.num_inliers / (double)npoints;
+         pNoOutliers = 1 - pow(fracinliers, minpts);
+         pNoOutliers = fmax(EPS, pNoOutliers);
+         pNoOutliers = fmin(1 - EPS, pNoOutliers);
+         temp = (int)(log(1.0 - PROBABILITY_REQUIRED) / log(pNoOutliers));
+
+         if (temp > 0 && temp < N) {
+           N = AOMMAX(temp, MIN_TRIALS);
+         }
+
+         // Determine the new worst kept motion and its num_inliers and variance.
+         for (i = 0; i < num_desired_motions; ++i) {
+           if (is_better_motion(worst_kept_motion, &motions[i])) {
+             worst_kept_motion = &motions[i];
+           }
+         }
+       }
+     }
+     trial_count++;
+   }
+
+   // Sort the motions, best first.
+   qsort(motions, num_desired_motions, sizeof(RANSAC_MOTION), compare_motions);
+
+   // Recompute the motions using only the inliers.
+   for (i = 0; i < num_desired_motions; ++i) {
+     copy_points_at_indices(points1, corners1, motions[i].inlier_indices,
+                            motions[i].num_inliers);
+     copy_points_at_indices(points2, corners2, motions[i].inlier_indices,
+                            motions[i].num_inliers);
+
+     find_transformation(motions[i].num_inliers, points1, points2,
+                         params_by_motion + (MAX_PARAMDIM - 1) * i);
+     num_inliers_by_motion[i] = motions[i].num_inliers;
+   }
+
+ finish_ransac:
+   aom_free(points1);
+   aom_free(points2);
+   aom_free(corners1);
+   aom_free(corners2);
+   aom_free(image1_coord);
+   aom_free(current_motion.inlier_indices);
+   // `motions` itself may be NULL when its allocation failed.
+   if (motions) {
+     for (i = 0; i < num_desired_motions; ++i) {
+       aom_free(motions[i].inlier_indices);
+     }
+   }
+   aom_free(motions);
+
+   return ret_val;
+ }
+
+ // Returns nonzero when the three 2D points p1, p2, p3 (each an {x, y} pair)
+ // are collinear within tolerance, i.e. the cross product of (p2 - p1) and
+ // (p3 - p1) has magnitude below 1e-3.
+ static int is_collinear3(double *p1, double *p2, double *p3) {
+   static const double collinear_eps = 1e-3;
+   const double ax = p2[0] - p1[0];
+   const double ay = p2[1] - p1[1];
+   const double bx = p3[0] - p1[0];
+   const double by = p3[1] - p1[1];
+   const double cross = ax * by - ay * bx;
+   return fabs(cross) < collinear_eps;
+ }
+
+ // A translation sample is degenerate when its first two points, (p[0], p[1])
+ // and (p[2], p[3]), lie within a squared distance of 2 of each other.
+ static int is_degenerate_translation(double *p) {
+   const double dx = p[0] - p[2];
+   const double dy = p[1] - p[3];
+   return dx * dx + dy * dy <= 2;
+ }
+
+ // An affine sample is degenerate when its first three points (packed as
+ // consecutive {x, y} pairs in `p`) are collinear.
+ static int is_degenerate_affine(double *p) {
+   double *const first = p;
+   double *const second = p + 2;
+   double *const third = p + 4;
+   return is_collinear3(first, second, third);
+ }
+
+ // A homography sample is degenerate when any three of its first four points
+ // (packed as consecutive {x, y} pairs in `p`) are collinear.
+ static int is_degenerate_homography(double *p) {
+   double *const a = p;
+   double *const b = p + 2;
+   double *const c = p + 4;
+   double *const d = p + 6;
+
+   // Triples are tested in the same order as a short-circuit OR chain.
+   if (is_collinear3(a, b, c)) return 1;
+   if (is_collinear3(a, b, d)) return 1;
+   if (is_collinear3(a, c, d)) return 1;
+   return is_collinear3(b, c, d) != 0;
+ }
+
+ // Fits translation models to the correspondences with RANSAC, drawing
+ // 3 points per trial. Returns the status reported by ransac().
+ int ransac_translation(int *matched_points, int npoints,
+                        int *num_inliers_by_motion, double *params_by_motion,
+                        int num_desired_motions) {
+   const int sample_size = 3;
+   const int status =
+       ransac(matched_points, npoints, num_inliers_by_motion, params_by_motion,
+              num_desired_motions, sample_size, is_degenerate_translation,
+              find_translation, project_points_double_translation);
+   return status;
+ }
+
+ // Fits rotation-zoom models to the correspondences with RANSAC, drawing
+ // 3 points per trial and rejecting collinear samples. Returns the status
+ // reported by ransac().
+ int ransac_rotzoom(int *matched_points, int npoints, int *num_inliers_by_motion,
+                    double *params_by_motion, int num_desired_motions) {
+   const int sample_size = 3;
+   const int status =
+       ransac(matched_points, npoints, num_inliers_by_motion, params_by_motion,
+              num_desired_motions, sample_size, is_degenerate_affine,
+              find_rotzoom, project_points_double_rotzoom);
+   return status;
+ }
+
+ // Fits affine models to the correspondences with RANSAC, drawing 3 points
+ // per trial and rejecting collinear samples. Returns the status reported by
+ // ransac().
+ int ransac_affine(int *matched_points, int npoints, int *num_inliers_by_motion,
+                   double *params_by_motion, int num_desired_motions) {
+   const int sample_size = 3;
+   const int status =
+       ransac(matched_points, npoints, num_inliers_by_motion, params_by_motion,
+              num_desired_motions, sample_size, is_degenerate_affine,
+              find_affine, project_points_double_affine);
+   return status;
+ }
+
+ // Fits homography models to the correspondences with RANSAC, drawing
+ // 4 points per trial and rejecting samples with three collinear points.
+ // Returns the status reported by ransac().
+ int ransac_homography(int *matched_points, int npoints,
+                       int *num_inliers_by_motion, double *params_by_motion,
+                       int num_desired_motions) {
+   const int sample_size = 4;
+   const int status =
+       ransac(matched_points, npoints, num_inliers_by_motion, params_by_motion,
+              num_desired_motions, sample_size, is_degenerate_homography,
+              find_homography, project_points_double_homography);
+   return status;
+ }
+
+ // Fits horizontal-trapezoid models to the correspondences with RANSAC,
+ // drawing 4 points per trial and rejecting samples with three collinear
+ // points. Returns the status reported by ransac().
+ int ransac_hortrapezoid(int *matched_points, int npoints,
+                         int *num_inliers_by_motion, double *params_by_motion,
+                         int num_desired_motions) {
+   const int sample_size = 4;
+   const int status =
+       ransac(matched_points, npoints, num_inliers_by_motion, params_by_motion,
+              num_desired_motions, sample_size, is_degenerate_homography,
+              find_hortrapezoid, project_points_double_hortrapezoid);
+   return status;
+ }
+
+ // Fits vertical-trapezoid models to the correspondences with RANSAC,
+ // drawing 4 points per trial and rejecting samples with three collinear
+ // points. Returns the status reported by ransac().
+ int ransac_vertrapezoid(int *matched_points, int npoints,
+                         int *num_inliers_by_motion, double *params_by_motion,
+                         int num_desired_motions) {
+   const int sample_size = 4;
+   const int status =
+       ransac(matched_points, npoints, num_inliers_by_motion, params_by_motion,
+              num_desired_motions, sample_size, is_degenerate_homography,
+              find_vertrapezoid, project_points_double_vertrapezoid);
+   return status;
+ }
diff --git a/third_party/aom/av1/encoder/ransac.h b/third_party/aom/av1/encoder/ransac.h
new file mode 100644
index 0000000000..f611add369
--- /dev/null
+++ b/third_party/aom/av1/encoder/ransac.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_ENCODER_RANSAC_H_
+#define AV1_ENCODER_RANSAC_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <memory.h>
+
+#include "av1/common/warped_motion.h"
+
+/* Common signature of the model-fitting entry points declared below. */
+ typedef int (*RansacFunc)(int *matched_points, int npoints,
+ int *num_inliers_by_motion, double *params_by_motion,
+ int num_motions);
+
+/* Each of these functions fits a motion model from a set of
+ corresponding points in 2 frames using RANSAC.
+
+ matched_points holds 4 ints per correspondence: the first two are taken
+ as a point in one frame and the next two as its match in the other frame.
+ npoints is the number of correspondences. num_motions is the number of
+ motions to keep; for each kept motion i, num_inliers_by_motion[i] receives
+ its inlier count and the fitted parameters are written into
+ params_by_motion. Each function returns 0 on success and 1 on failure. */
+ int ransac_homography(int *matched_points, int npoints,
+ int *num_inliers_by_motion, double *params_by_motion,
+ int num_motions);
+ int ransac_affine(int *matched_points, int npoints, int *num_inliers_by_motion,
+ double *params_by_motion, int num_motions);
+ int ransac_hortrapezoid(int *matched_points, int npoints,
+ int *num_inliers_by_motion, double *params_by_motion,
+ int num_motions);
+ int ransac_vertrapezoid(int *matched_points, int npoints,
+ int *num_inliers_by_motion, double *params_by_motion,
+ int num_motions);
+ int ransac_rotzoom(int *matched_points, int npoints, int *num_inliers_by_motion,
+ double *params_by_motion, int num_motions);
+ int ransac_translation(int *matched_points, int npoints,
+ int *num_inliers_by_motion, double *params_by_motion,
+ int num_motions);
+#endif // AV1_ENCODER_RANSAC_H_
diff --git a/third_party/aom/av1/encoder/ratectrl.c b/third_party/aom/av1/encoder/ratectrl.c
new file mode 100644
index 0000000000..1f2ea3606f
--- /dev/null
+++ b/third_party/aom/av1/encoder/ratectrl.c
@@ -0,0 +1,1759 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+#include "aom_ports/system_state.h"
+
+#include "av1/common/alloccommon.h"
+#include "av1/encoder/aq_cyclicrefresh.h"
+#include "av1/common/common.h"
+#include "av1/common/entropymode.h"
+#include "av1/common/quant_common.h"
+#include "av1/common/seg_common.h"
+
+#include "av1/encoder/encodemv.h"
+#include "av1/encoder/ratectrl.h"
+
+// Max rate target for 1080P and below encodes under normal circumstances
+// (1920 * 1080 / (16 * 16)) * MAX_MB_RATE bits per MB
+// i.e. 8100 macroblocks * 250 = 2025000.
+#define MAX_MB_RATE 250
+#define MAXRATE_1080P 2025000
+
+// NOTE(review): presumably the default key-frame / golden-frame boost values;
+// their users are outside this part of the file -- confirm.
+#define DEFAULT_KF_BOOST 2000
+#define DEFAULT_GF_BOOST 2000
+
+// Lower and upper bounds that the rate correction factors are clamped to.
+#define MIN_BPB_FACTOR 0.005
+#define MAX_BPB_FACTOR 50
+
+// Floor applied to the frame size estimated by av1_estimate_bits_at_q().
+#define FRAME_OVERHEAD_BITS 200
+// ASSIGN_MINQ_TABLE(bit_depth, name) points the variable `name` at the
+// file-scope table `name_8`, `name_10` or `name_12` matching `bit_depth`.
+// The table identifier is built by token pasting, so `name` must be spelled
+// exactly like the table prefix (e.g. `rtc_minq`). Without
+// CONFIG_HIGHBITDEPTH the 8-bit table is always selected and `bit_depth` is
+// ignored.
+#if CONFIG_HIGHBITDEPTH
+#define ASSIGN_MINQ_TABLE(bit_depth, name) \
+ do { \
+ switch (bit_depth) { \
+ case AOM_BITS_8: name = name##_8; break; \
+ case AOM_BITS_10: name = name##_10; break; \
+ case AOM_BITS_12: name = name##_12; break; \
+ default: \
+ assert(0 && \
+ "bit_depth should be AOM_BITS_8, AOM_BITS_10" \
+ " or AOM_BITS_12"); \
+ name = NULL; \
+ } \
+ } while (0)
+#else
+#define ASSIGN_MINQ_TABLE(bit_depth, name) \
+ do { \
+ (void)bit_depth; \
+ name = name##_8; \
+ } while (0)
+#endif
+
+// Tables relating active max Q to active min Q
+// Indexed by q index; filled in by av1_rc_init_minq_luts() and selected per
+// bit depth through ASSIGN_MINQ_TABLE.
+static int kf_low_motion_minq_8[QINDEX_RANGE];
+static int kf_high_motion_minq_8[QINDEX_RANGE];
+static int arfgf_low_motion_minq_8[QINDEX_RANGE];
+static int arfgf_high_motion_minq_8[QINDEX_RANGE];
+static int inter_minq_8[QINDEX_RANGE];
+static int rtc_minq_8[QINDEX_RANGE];
+
+#if CONFIG_HIGHBITDEPTH
+// 10-bit and 12-bit counterparts of the tables above.
+static int kf_low_motion_minq_10[QINDEX_RANGE];
+static int kf_high_motion_minq_10[QINDEX_RANGE];
+static int arfgf_low_motion_minq_10[QINDEX_RANGE];
+static int arfgf_high_motion_minq_10[QINDEX_RANGE];
+static int inter_minq_10[QINDEX_RANGE];
+static int rtc_minq_10[QINDEX_RANGE];
+static int kf_low_motion_minq_12[QINDEX_RANGE];
+static int kf_high_motion_minq_12[QINDEX_RANGE];
+static int arfgf_low_motion_minq_12[QINDEX_RANGE];
+static int arfgf_high_motion_minq_12[QINDEX_RANGE];
+static int inter_minq_12[QINDEX_RANGE];
+static int rtc_minq_12[QINDEX_RANGE];
+#endif
+
+// Boost thresholds passed to get_active_quality() as its `high` / `low`
+// arguments for golden/alt-ref frames (gf_*) and key frames (kf_*).
+static int gf_high = 2000;
+static int gf_low = 400;
+static int kf_high = 5000;
+static int kf_low = 400;
+
+// Functions to compute the active minq lookup table entries based on a
+// formulaic approach to facilitate easier adjustment of the Q tables.
+// The formulae were derived from computing a 3rd order polynomial best
+// fit to the original data (after plotting real maxq vs minq (not q index))
+// Returns the smallest q index whose real Q value reaches the min-Q target
+// derived from `maxq` by the cubic x3*maxq^3 + x2*maxq^2 + x1*maxq (capped at
+// `maxq` itself). Targets of 2.0 or less map to index 0, and a target that no
+// index reaches maps to QINDEX_RANGE - 1.
+static int get_minq_index(double maxq, double x3, double x2, double x1,
+                          aom_bit_depth_t bit_depth) {
+  const double cubic = ((x3 * maxq + x2) * maxq + x1) * maxq;
+  const double minqtarget = AOMMIN(cubic, maxq);
+  int qindex = 0;
+
+  // Special case handling to deal with the step from q2.0
+  // down to lossless mode represented by q 1.0.
+  if (minqtarget <= 2.0) return 0;
+
+  while (qindex < QINDEX_RANGE) {
+    if (minqtarget <= av1_convert_qindex_to_q(qindex, bit_depth)) {
+      return qindex;
+    }
+    ++qindex;
+  }
+
+  return QINDEX_RANGE - 1;
+}
+
+// Fills the six min-Q lookup tables for one bit depth. Entry `qi` of each
+// table is the min q index computed by get_minq_index() from the real Q value
+// of q index `qi`, using that table's polynomial coefficients.
+static void init_minq_luts(int *kf_low_m, int *kf_high_m, int *arfgf_low,
+                           int *arfgf_high, int *inter, int *rtc,
+                           aom_bit_depth_t bit_depth) {
+  for (int qi = 0; qi < QINDEX_RANGE; ++qi) {
+    const double maxq = av1_convert_qindex_to_q(qi, bit_depth);
+
+    kf_low_m[qi] = get_minq_index(maxq, 0.000001, -0.0004, 0.150, bit_depth);
+    kf_high_m[qi] = get_minq_index(maxq, 0.0000021, -0.00125, 0.55, bit_depth);
+    arfgf_low[qi] = get_minq_index(maxq, 0.0000015, -0.0009, 0.30, bit_depth);
+    arfgf_high[qi] =
+        get_minq_index(maxq, 0.0000021, -0.00125, 0.55, bit_depth);
+    inter[qi] = get_minq_index(maxq, 0.00000271, -0.00113, 0.90, bit_depth);
+    rtc[qi] = get_minq_index(maxq, 0.00000271, -0.00113, 0.70, bit_depth);
+  }
+}
+
+// Populates every file-scope min-Q lookup table: the 8-bit set always, and
+// the 10-bit and 12-bit sets when CONFIG_HIGHBITDEPTH is enabled.
+void av1_rc_init_minq_luts(void) {
+  init_minq_luts(kf_low_motion_minq_8,      //
+                 kf_high_motion_minq_8,     //
+                 arfgf_low_motion_minq_8,   //
+                 arfgf_high_motion_minq_8,  //
+                 inter_minq_8,              //
+                 rtc_minq_8,                //
+                 AOM_BITS_8);
+#if CONFIG_HIGHBITDEPTH
+  init_minq_luts(kf_low_motion_minq_10,      //
+                 kf_high_motion_minq_10,     //
+                 arfgf_low_motion_minq_10,   //
+                 arfgf_high_motion_minq_10,  //
+                 inter_minq_10,              //
+                 rtc_minq_10,                //
+                 AOM_BITS_10);
+  init_minq_luts(kf_low_motion_minq_12,      //
+                 kf_high_motion_minq_12,     //
+                 arfgf_low_motion_minq_12,   //
+                 arfgf_high_motion_minq_12,  //
+                 inter_minq_12,              //
+                 rtc_minq_12,                //
+                 AOM_BITS_12);
+#endif
+}
+
+// These functions use formulaic calculations to make playing with the
+// quantizer tables easier. If necessary they can be replaced by lookup
+// tables if and when things settle down in the experimental bitstream
+// Converts a q index to a real Q value: the AC quantizer for that index,
+// scaled down by 4, 16 or 64 for 8-, 10- or 12-bit depth respectively so the
+// result matches old Q values. Without CONFIG_HIGHBITDEPTH the divisor is
+// always 4. An unknown bit depth asserts and returns -1.0.
+double av1_convert_qindex_to_q(int qindex, aom_bit_depth_t bit_depth) {
+#if CONFIG_HIGHBITDEPTH
+  double divisor;
+
+  switch (bit_depth) {
+    case AOM_BITS_8: divisor = 4.0; break;
+    case AOM_BITS_10: divisor = 16.0; break;
+    case AOM_BITS_12: divisor = 64.0; break;
+    default:
+      assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12");
+      return -1.0;
+  }
+  return av1_ac_quant(qindex, 0, bit_depth) / divisor;
+#else
+  return av1_ac_quant(qindex, 0, bit_depth) / 4.0;
+#endif
+}
+
+// Estimates the bits per macroblock at `qindex` for the given frame type and
+// bit depth, scaled by `correction_factor` (which must lie within
+// [MIN_BPB_FACTOR, MAX_BPB_FACTOR]).
+int av1_rc_bits_per_mb(FRAME_TYPE frame_type, int qindex,
+                       double correction_factor, aom_bit_depth_t bit_depth) {
+  const double q = av1_convert_qindex_to_q(qindex, bit_depth);
+  int enumerator;
+
+  assert(correction_factor <= MAX_BPB_FACTOR &&
+         correction_factor >= MIN_BPB_FACTOR);
+
+  // Key frames start from a larger baseline than other frames.
+  if (frame_type == KEY_FRAME) {
+    enumerator = 2700000;
+  } else {
+    enumerator = 1800000;
+  }
+
+  // q based adjustment to baseline enumerator
+  enumerator += ((int)(enumerator * q)) >> 12;
+
+  return (int)(enumerator * correction_factor / q);
+}
+
+// Estimates the size in bits of a frame of `mbs` macroblocks coded at q index
+// `q`. The per-macroblock estimate from av1_rc_bits_per_mb() is multiplied by
+// `mbs` and normalized by BPER_MB_NORMBITS. The result is never smaller than
+// FRAME_OVERHEAD_BITS and saturates at INT_MAX.
+int av1_estimate_bits_at_q(FRAME_TYPE frame_type, int q, int mbs,
+                           double correction_factor,
+                           aom_bit_depth_t bit_depth) {
+  const int bpm =
+      (int)(av1_rc_bits_per_mb(frame_type, q, correction_factor, bit_depth));
+  // Do the multiply and the shift in 64 bits and narrow only afterwards.
+  // A cast binds tighter than >>, so casting the product first would truncate
+  // it to int before the shift and lose the widening.
+  const uint64_t frame_bits = ((uint64_t)bpm * mbs) >> BPER_MB_NORMBITS;
+  const int estimate =
+      frame_bits > (uint64_t)INT_MAX ? INT_MAX : (int)frame_bits;
+  return AOMMAX(FRAME_OVERHEAD_BITS, estimate);
+}
+
+// Clamps an inter-frame bit target. The floor is the larger of the minimum
+// frame bandwidth and 1/32 of the average frame bandwidth. The ceiling is the
+// maximum frame bandwidth and, when rc_max_inter_bitrate_pct is set, that
+// percentage of the average frame bandwidth.
+int av1_rc_clamp_pframe_target_size(const AV1_COMP *const cpi, int target) {
+  const RATE_CONTROL *rc = &cpi->rc;
+  const AV1EncoderConfig *oxcf = &cpi->oxcf;
+  const int floor_bits =
+      AOMMAX(rc->min_frame_bandwidth, rc->avg_frame_bandwidth >> 5);
+
+  // If there is an active ARF at this location use the minimum
+  // bits on this frame even if it is a constructed arf.
+  // The active maximum quantizer insures that an appropriate
+  // number of bits will be spent if needed for constructed ARFs.
+#if CONFIG_EXT_REFS
+  const int use_floor = cpi->rc.is_src_frame_alt_ref != 0;
+#else
+  const int use_floor =
+      cpi->refresh_golden_frame && rc->is_src_frame_alt_ref;
+#endif  // CONFIG_EXT_REFS
+
+  // Clip the frame target to the minimum setup value.
+  if (use_floor || target < floor_bits) {
+    target = floor_bits;
+  }
+
+  // Clip the frame target to the maximum allowed value.
+  if (target > rc->max_frame_bandwidth) {
+    target = rc->max_frame_bandwidth;
+  }
+  if (oxcf->rc_max_inter_bitrate_pct) {
+    const int ceiling_bits =
+        rc->avg_frame_bandwidth * oxcf->rc_max_inter_bitrate_pct / 100;
+    if (ceiling_bits < target) target = ceiling_bits;
+  }
+
+  return target;
+}
+
+// Clamps an intra-frame bit target: first to rc_max_intra_bitrate_pct percent
+// of the average frame bandwidth (when that option is nonzero), then to the
+// maximum frame bandwidth.
+int av1_rc_clamp_iframe_target_size(const AV1_COMP *const cpi, int target) {
+  const RATE_CONTROL *rc = &cpi->rc;
+  const AV1EncoderConfig *oxcf = &cpi->oxcf;
+
+  if (oxcf->rc_max_intra_bitrate_pct) {
+    const int ceiling_bits =
+        rc->avg_frame_bandwidth * oxcf->rc_max_intra_bitrate_pct / 100;
+    if (ceiling_bits < target) target = ceiling_bits;
+  }
+
+  return (target > rc->max_frame_bandwidth) ? rc->max_frame_bandwidth : target;
+}
+
+// Update the buffer level: leaky bucket model.
+// Applies one encoded frame to the leaky-bucket model: shown frames add the
+// average frame bandwidth and remove the encoded size, non-shown frames only
+// remove the encoded size. The result is capped at maximum_buffer_size and
+// mirrored into buffer_level.
+static void update_buffer_level(AV1_COMP *cpi, int encoded_frame_size) {
+  const AV1_COMMON *const cm = &cpi->common;
+  RATE_CONTROL *const rc = &cpi->rc;
+
+  // Non-viewable frames are a special case and are treated as pure overhead.
+#if CONFIG_EXT_REFS
+  // TODO(zoeliu): To further explore whether we should treat BWDREF_FRAME
+  //               differently, since it is a no-show frame.
+  const int pure_overhead = !cm->show_frame && !rc->is_bwd_ref_frame;
+#else
+  const int pure_overhead = !cm->show_frame;
+#endif  // CONFIG_EXT_REFS
+
+  if (pure_overhead) {
+    rc->bits_off_target -= encoded_frame_size;
+  } else {
+    rc->bits_off_target += rc->avg_frame_bandwidth - encoded_frame_size;
+  }
+
+  // Clip the buffer level to the maximum specified buffer size.
+  rc->bits_off_target = AOMMIN(rc->bits_off_target, rc->maximum_buffer_size);
+  rc->buffer_level = rc->bits_off_target;
+}
+
+// Returns the default minimum golden-frame interval for the given frame size
+// and frame rate. The base value is framerate / 8 clamped to
+// [MIN_GF_INTERVAL, MAX_GF_INTERVAL]. Above a pixel rate of 4K at 20 fps the
+// result is raised in proportion to the pixel rate.
+int av1_rc_get_default_min_gf_interval(int width, int height,
+                                       double framerate) {
+  // Assume we do not need any constraint lower than 4K 20 fps
+  static const double factor_safe = 3840 * 2160 * 20.0;
+  // Promote to double before multiplying: width * height evaluated in int
+  // can overflow for very large frame dimensions.
+  const double factor = (double)width * height * framerate;
+  const int default_interval =
+      clamp((int)(framerate * 0.125), MIN_GF_INTERVAL, MAX_GF_INTERVAL);
+
+  if (factor <= factor_safe)
+    return default_interval;
+  else
+    return AOMMAX(default_interval,
+                  (int)(MIN_GF_INTERVAL * factor / factor_safe + 0.5));
+  // Note this logic makes:
+  // 4K24: 5
+  // 4K30: 6
+  // 4K60: 12
+}
+
+// Returns the default maximum golden-frame interval: three quarters of the
+// frame rate capped at MAX_GF_INTERVAL, rounded up to an even value, and
+// never below `min_gf_interval`.
+int av1_rc_get_default_max_gf_interval(double framerate, int min_gf_interval) {
+  const int rate_based = (int)(framerate * 0.75);
+  int interval = AOMMIN(MAX_GF_INTERVAL, rate_based);
+
+  // Round to even value
+  if (interval & 0x01) ++interval;
+
+  return AOMMAX(interval, min_gf_interval);
+}
+
+// Initializes the rate-control state `rc` from the encoder configuration.
+// One-pass CBR (pass == 0) starts the running average q indices at the worst
+// allowed q; every other mode starts them midway between the best and worst
+// allowed q. GF interval limits of 0 in the configuration are replaced by
+// the computed defaults.
+void av1_rc_init(const AV1EncoderConfig *oxcf, int pass, RATE_CONTROL *rc) {
+  const int worst_q = oxcf->worst_allowed_q;
+  const int best_q = oxcf->best_allowed_q;
+  const int starting_avg_q = (pass == 0 && oxcf->rc_mode == AOM_CBR)
+                                 ? worst_q
+                                 : (worst_q + best_q) / 2;
+
+  rc->avg_frame_qindex[KEY_FRAME] = starting_avg_q;
+  rc->avg_frame_qindex[INTER_FRAME] = starting_avg_q;
+
+  rc->last_q[KEY_FRAME] = best_q;
+  rc->last_q[INTER_FRAME] = worst_q;
+
+  // The buffer starts at its configured starting level.
+  rc->buffer_level = rc->starting_buffer_level;
+  rc->bits_off_target = rc->starting_buffer_level;
+
+  // Rolling monitors start at the average frame bandwidth.
+  rc->rolling_target_bits = rc->avg_frame_bandwidth;
+  rc->rolling_actual_bits = rc->avg_frame_bandwidth;
+  rc->long_rolling_target_bits = rc->avg_frame_bandwidth;
+  rc->long_rolling_actual_bits = rc->avg_frame_bandwidth;
+
+  rc->total_actual_bits = 0;
+  rc->total_target_bits = 0;
+  rc->total_target_vs_actual = 0;
+
+  rc->frames_since_key = 8;  // Sensible default for first frame.
+  rc->this_key_frame_forced = 0;
+  rc->next_key_frame_forced = 0;
+  rc->source_alt_ref_pending = 0;
+  rc->source_alt_ref_active = 0;
+
+  rc->frames_till_gf_update_due = 0;
+  rc->ni_av_qi = worst_q;
+  rc->ni_tot_qi = 0;
+  rc->ni_frames = 0;
+
+  rc->tot_q = 0.0;
+  rc->avg_q = av1_convert_qindex_to_q(worst_q, oxcf->bit_depth);
+
+  for (int level = 0; level < RATE_FACTOR_LEVELS; ++level) {
+    rc->rate_correction_factors[level] = 1.0;
+  }
+
+  // A configured interval of 0 means "use the default".
+  rc->min_gf_interval = oxcf->min_gf_interval;
+  rc->max_gf_interval = oxcf->max_gf_interval;
+  if (rc->min_gf_interval == 0) {
+    rc->min_gf_interval = av1_rc_get_default_min_gf_interval(
+        oxcf->width, oxcf->height, oxcf->init_framerate);
+  }
+  if (rc->max_gf_interval == 0) {
+    rc->max_gf_interval = av1_rc_get_default_max_gf_interval(
+        oxcf->init_framerate, rc->min_gf_interval);
+  }
+  rc->baseline_gf_interval = (rc->min_gf_interval + rc->max_gf_interval) / 2;
+}
+
+// Decides whether the current frame should be dropped. Returns 1 to drop and
+// 0 to encode, and updates the decimation factor and count in cpi->rc.
+// Frame dropping is disabled when drop_frames_water_mark is 0.
+int av1_rc_drop_frame(AV1_COMP *cpi) {
+  const AV1EncoderConfig *oxcf = &cpi->oxcf;
+  RATE_CONTROL *const rc = &cpi->rc;
+
+  if (!oxcf->drop_frames_water_mark) return 0;
+
+  // Always drop if buffer is below 0.
+  if (rc->buffer_level < 0) return 1;
+
+  // If buffer is below drop_mark, for now just drop every other frame
+  // (starting with the next frame) until it increases back over drop_mark.
+  const int drop_mark =
+      (int)(oxcf->drop_frames_water_mark * rc->optimal_buffer_level / 100);
+  if (rc->buffer_level > drop_mark && rc->decimation_factor > 0) {
+    --rc->decimation_factor;
+  } else if (rc->buffer_level <= drop_mark && rc->decimation_factor == 0) {
+    rc->decimation_factor = 1;
+  }
+
+  // Not decimating: reset the counter and keep the frame.
+  if (rc->decimation_factor <= 0) {
+    rc->decimation_count = 0;
+    return 0;
+  }
+
+  // Decimating: drop while the counter runs down, then keep one frame and
+  // reload the counter.
+  if (rc->decimation_count > 0) {
+    --rc->decimation_count;
+    return 1;
+  }
+  rc->decimation_count = rc->decimation_factor;
+  return 0;
+}
+
+// Returns the rate correction factor that applies to the current frame,
+// scaled by the frame-size multiplier and clamped to
+// [MIN_BPB_FACTOR, MAX_BPB_FACTOR]. Key frames use the KF_STD slot, two-pass
+// encodes use the slot of the current GF-group entry, and one-pass encodes
+// use GF_ARF_STD for boosted golden/alt-ref refreshes and INTER_NORMAL
+// otherwise.
+static double get_rate_correction_factor(const AV1_COMP *cpi) {
+  const RATE_CONTROL *const rc = &cpi->rc;
+  int slot;
+
+  if (cpi->common.frame_type == KEY_FRAME) {
+    slot = KF_STD;
+  } else if (cpi->oxcf.pass == 2) {
+    slot = cpi->twopass.gf_group.rf_level[cpi->twopass.gf_group.index];
+  } else if ((cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame) &&
+             !rc->is_src_frame_alt_ref &&
+             (cpi->oxcf.rc_mode != AOM_CBR ||
+              cpi->oxcf.gf_cbr_boost_pct > 20)) {
+    slot = GF_ARF_STD;
+  } else {
+    slot = INTER_NORMAL;
+  }
+
+  return fclamp(
+      rc->rate_correction_factors[slot] * rcf_mult[rc->frame_size_selector],
+      MIN_BPB_FACTOR, MAX_BPB_FACTOR);
+}
+
+// Stores `factor` as the rate correction factor for the current frame's
+// category (same slot selection as get_rate_correction_factor()). The value
+// is first divided by the frame-size multiplier and clamped to
+// [MIN_BPB_FACTOR, MAX_BPB_FACTOR].
+static void set_rate_correction_factor(AV1_COMP *cpi, double factor) {
+  RATE_CONTROL *const rc = &cpi->rc;
+  int slot;
+
+  // Normalize RCF to account for the size-dependent scaling factor.
+  factor /= rcf_mult[cpi->rc.frame_size_selector];
+  factor = fclamp(factor, MIN_BPB_FACTOR, MAX_BPB_FACTOR);
+
+  if (cpi->common.frame_type == KEY_FRAME) {
+    slot = KF_STD;
+  } else if (cpi->oxcf.pass == 2) {
+    slot = cpi->twopass.gf_group.rf_level[cpi->twopass.gf_group.index];
+  } else if ((cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame) &&
+             !rc->is_src_frame_alt_ref &&
+             (cpi->oxcf.rc_mode != AOM_CBR ||
+              cpi->oxcf.gf_cbr_boost_pct > 20)) {
+    slot = GF_ARF_STD;
+  } else {
+    slot = INTER_NORMAL;
+  }
+
+  rc->rate_correction_factors[slot] = factor;
+}
+
+// Updates the rate correction factor for the current frame's category by
+// comparing the actual projected frame size with the size predicted at the
+// current base q index. Also records the last two q indices and over/under
+// shoot directions (q_1_frame/q_2_frame, rc_1_frame/rc_2_frame) that
+// av1_rc_regulate_q() uses to damp oscillation. Does nothing for ARF overlay
+// frames.
+void av1_rc_update_rate_correction_factors(AV1_COMP *cpi) {
+ const AV1_COMMON *const cm = &cpi->common;
+ // Actual size as a percentage of the predicted size; 100 means "on target".
+ int correction_factor = 100;
+ double rate_correction_factor = get_rate_correction_factor(cpi);
+ double adjustment_limit;
+
+ int projected_size_based_on_q = 0;
+
+ // Do not update the rate factors for arf overlay frames.
+ if (cpi->rc.is_src_frame_alt_ref) return;
+
+ // Clear down mmx registers to allow floating point in what follows
+ aom_clear_system_state();
+
+ // Work out how big we would have expected the frame to be at this Q given
+ // the current correction factor.
+ // Stay in double to avoid int overflow when values are large
+ // NOTE(review): the estimate is stored in an int here, so the remark about
+ // staying in double does not describe this code -- confirm.
+ if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cpi->common.seg.enabled) {
+ projected_size_based_on_q =
+ av1_cyclic_refresh_estimate_bits_at_q(cpi, rate_correction_factor);
+ } else {
+ projected_size_based_on_q =
+ av1_estimate_bits_at_q(cpi->common.frame_type, cm->base_qindex, cm->MBs,
+ rate_correction_factor, cm->bit_depth);
+ }
+ // Work out a size correction factor.
+ // Estimates at or below the overhead floor leave the factor at 100.
+ if (projected_size_based_on_q > FRAME_OVERHEAD_BITS)
+ correction_factor = (int)((100 * (int64_t)cpi->rc.projected_frame_size) /
+ projected_size_based_on_q);
+
+ // More heavily damped adjustment used if we have been oscillating either side
+ // of target.
+ // The limit grows from 0.25 to 0.75 with the log10 distance from target.
+ if (correction_factor > 0) {
+ adjustment_limit =
+ 0.25 + 0.5 * AOMMIN(1, fabs(log10(0.01 * correction_factor)));
+ } else {
+ adjustment_limit = 0.75;
+ }
+
+ // Shift the one-frame history into the two-frame slots before recording
+ // this frame.
+ cpi->rc.q_2_frame = cpi->rc.q_1_frame;
+ cpi->rc.q_1_frame = cm->base_qindex;
+ cpi->rc.rc_2_frame = cpi->rc.rc_1_frame;
+ // -1: more than 10% over the prediction, 1: more than 10% under, 0: close.
+ if (correction_factor > 110)
+ cpi->rc.rc_1_frame = -1;
+ else if (correction_factor < 90)
+ cpi->rc.rc_1_frame = 1;
+ else
+ cpi->rc.rc_1_frame = 0;
+
+ // Factors from 99 to 102 inclusive leave the rate correction factor as is.
+ if (correction_factor > 102) {
+ // We are not already at the worst allowable quality
+ correction_factor =
+ (int)(100 + ((correction_factor - 100) * adjustment_limit));
+ rate_correction_factor = (rate_correction_factor * correction_factor) / 100;
+ // Keep rate_correction_factor within limits
+ if (rate_correction_factor > MAX_BPB_FACTOR)
+ rate_correction_factor = MAX_BPB_FACTOR;
+ } else if (correction_factor < 99) {
+ // We are not already at the best allowable quality
+ correction_factor =
+ (int)(100 - ((100 - correction_factor) * adjustment_limit));
+ rate_correction_factor = (rate_correction_factor * correction_factor) / 100;
+
+ // Keep rate_correction_factor within limits
+ if (rate_correction_factor < MIN_BPB_FACTOR)
+ rate_correction_factor = MIN_BPB_FACTOR;
+ }
+
+ set_rate_correction_factor(cpi, rate_correction_factor);
+}
+
+// Picks the q index in [active_best_quality, active_worst_quality] whose
+// estimated bits per macroblock is closest to the per-macroblock share of
+// `target_bits_per_frame`. If no index in the range gets at or below the
+// target, active_worst_quality is returned. In CBR mode, when the last two
+// frames over- and under-shot in opposite directions, the result is also
+// clamped between the q indices of those two frames.
+int av1_rc_regulate_q(const AV1_COMP *cpi, int target_bits_per_frame,
+                      int active_best_quality, int active_worst_quality) {
+  const AV1_COMMON *const cm = &cpi->common;
+  int q = active_worst_quality;
+  int last_error = INT_MAX;
+  int i, target_bits_per_mb, bits_per_mb_at_this_q;
+  const double correction_factor = get_rate_correction_factor(cpi);
+
+  // Calculate required scaling factor based on target frame size and size of
+  // frame produced using previous Q.
+  // Shift and divide in 64 bits and narrow only the quotient. A cast binds
+  // tighter than /, so casting the shifted value first would truncate it to
+  // int before the division.
+  target_bits_per_mb =
+      (int)(((uint64_t)target_bits_per_frame << BPER_MB_NORMBITS) / cm->MBs);
+
+  i = active_best_quality;
+
+  // Walk from best to worst quality until the estimate drops to the target.
+  do {
+    if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled) {
+      bits_per_mb_at_this_q =
+          (int)av1_cyclic_refresh_rc_bits_per_mb(cpi, i, correction_factor);
+    } else {
+      bits_per_mb_at_this_q = (int)av1_rc_bits_per_mb(
+          cm->frame_type, i, correction_factor, cm->bit_depth);
+    }
+
+    if (bits_per_mb_at_this_q <= target_bits_per_mb) {
+      // Choose between this index and the previous one, whichever estimate
+      // is closer to the target.
+      if ((target_bits_per_mb - bits_per_mb_at_this_q) <= last_error)
+        q = i;
+      else
+        q = i - 1;
+
+      break;
+    } else {
+      last_error = bits_per_mb_at_this_q - target_bits_per_mb;
+    }
+  } while (++i <= active_worst_quality);
+
+  // In CBR mode, this makes sure q is between oscillating Qs to prevent
+  // resonance.
+  if (cpi->oxcf.rc_mode == AOM_CBR &&
+      (cpi->rc.rc_1_frame * cpi->rc.rc_2_frame == -1) &&
+      cpi->rc.q_1_frame != cpi->rc.q_2_frame) {
+    q = clamp(q, AOMMIN(cpi->rc.q_1_frame, cpi->rc.q_2_frame),
+              AOMMAX(cpi->rc.q_1_frame, cpi->rc.q_2_frame));
+  }
+  return q;
+}
+
+// Returns the active min-Q entry for q index `q`. A boost above `high` uses
+// the low-motion table, a boost below `low` uses the high-motion table, and a
+// boost in between interpolates linearly (rounded) between the two entries.
+static int get_active_quality(int q, int gfu_boost, int low, int high,
+                              int *low_motion_minq, int *high_motion_minq) {
+  if (gfu_boost > high) return low_motion_minq[q];
+  if (gfu_boost < low) return high_motion_minq[q];
+
+  {
+    const int base = low_motion_minq[q];
+    const int span = high - low;
+    const int below_high = high - gfu_boost;
+    const int table_delta = high_motion_minq[q] - base;
+    // Adding half the span before dividing rounds the quotient.
+    return base + (below_high * table_delta + (span >> 1)) / span;
+  }
+}
+
+// Returns the key-frame active min-Q for q index `q`, chosen or interpolated
+// between the key-frame low- and high-motion tables for `bit_depth`
+// according to rc->kf_boost and the kf_low / kf_high thresholds.
+static int get_kf_active_quality(const RATE_CONTROL *const rc, int q,
+ aom_bit_depth_t bit_depth) {
+ // These locals must keep these exact names: ASSIGN_MINQ_TABLE builds the
+ // table identifier by pasting a bit-depth suffix onto the variable name.
+ int *kf_low_motion_minq;
+ int *kf_high_motion_minq;
+ ASSIGN_MINQ_TABLE(bit_depth, kf_low_motion_minq);
+ ASSIGN_MINQ_TABLE(bit_depth, kf_high_motion_minq);
+ return get_active_quality(q, rc->kf_boost, kf_low, kf_high,
+ kf_low_motion_minq, kf_high_motion_minq);
+}
+
+// Returns the golden/alt-ref active min-Q for q index `q`, chosen or
+// interpolated between the ARF/GF low- and high-motion tables for `bit_depth`
+// according to rc->gfu_boost and the gf_low / gf_high thresholds.
+static int get_gf_active_quality(const RATE_CONTROL *const rc, int q,
+ aom_bit_depth_t bit_depth) {
+ // These locals must keep these exact names: ASSIGN_MINQ_TABLE builds the
+ // table identifier by pasting a bit-depth suffix onto the variable name.
+ int *arfgf_low_motion_minq;
+ int *arfgf_high_motion_minq;
+ ASSIGN_MINQ_TABLE(bit_depth, arfgf_low_motion_minq);
+ ASSIGN_MINQ_TABLE(bit_depth, arfgf_high_motion_minq);
+ return get_active_quality(q, rc->gfu_boost, gf_low, gf_high,
+ arfgf_low_motion_minq, arfgf_high_motion_minq);
+}
+
+// Computes the active worst quality for one-pass VBR from the q of recently
+// coded frames, capped at rc->worst_quality. Key frames use twice the last
+// key-frame q (or worst_quality for frame 0). Golden/alt-ref refreshes use
+// the last inter q, other frames twice that. Frame 1 is based on the last
+// key-frame q instead.
+static int calc_active_worst_quality_one_pass_vbr(const AV1_COMP *cpi) {
+  const RATE_CONTROL *const rc = &cpi->rc;
+  const unsigned int frame_number = cpi->common.current_video_frame;
+  const int last_key_q = rc->last_q[KEY_FRAME];
+  const int last_inter_q = rc->last_q[INTER_FRAME];
+  int worst;
+
+  if (cpi->common.frame_type == KEY_FRAME) {
+    worst = (frame_number == 0) ? rc->worst_quality : last_key_q * 2;
+  } else if (!rc->is_src_frame_alt_ref &&
+             (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
+    worst = (frame_number == 1) ? last_key_q * 5 / 4 : last_inter_q;
+  } else {
+    worst = (frame_number == 1) ? last_key_q * 2 : last_inter_q * 2;
+  }
+
+  return AOMMIN(worst, rc->worst_quality);
+}
+
+// Adjust active_worst_quality level based on buffer level.
+// Computes the active worst quality for one-pass CBR from the buffer level.
+//  - Key frames: worst_quality.
+//  - Buffer above the optimal level: start from min(worst_quality,
+//    1.25 * ambient q) and lower it by up to about a third as the buffer
+//    fills.
+//  - Buffer between the critical level (optimal / 8) and the optimal level:
+//    rise linearly from the ambient q towards worst_quality.
+//  - Buffer at or below the critical level: worst_quality.
+static int calc_active_worst_quality_one_pass_cbr(const AV1_COMP *cpi) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const RATE_CONTROL *rc = &cpi->rc;
+  // Buffer level below which we push active_worst to worst_quality.
+  const int64_t critical_level = rc->optimal_buffer_level >> 3;
+  int active_worst_quality;
+  int ambient_qp;
+
+  if (cm->frame_type == KEY_FRAME) return rc->worst_quality;
+
+  // For ambient_qp we use minimum of avg_frame_qindex[KEY_FRAME/INTER_FRAME]
+  // for the first few frames following key frame. These are both initialized
+  // to worst_quality and updated with (3/4, 1/4) average in postencode_update.
+  // So for first few frames following key, the qp of that key frame is weighted
+  // into the active_worst_quality setting.
+  if (cm->current_video_frame < 5) {
+    ambient_qp = AOMMIN(rc->avg_frame_qindex[INTER_FRAME],
+                        rc->avg_frame_qindex[KEY_FRAME]);
+  } else {
+    ambient_qp = rc->avg_frame_qindex[INTER_FRAME];
+  }
+  active_worst_quality = AOMMIN(rc->worst_quality, ambient_qp * 5 / 4);
+
+  if (rc->buffer_level > rc->optimal_buffer_level) {
+    // Adjust down.
+    // Maximum limit for down adjustment, ~30%.
+    const int max_adjustment_down = active_worst_quality / 3;
+    if (max_adjustment_down) {
+      const int64_t step =
+          ((rc->maximum_buffer_size - rc->optimal_buffer_level) /
+           max_adjustment_down);
+      if (step) {
+        active_worst_quality -=
+            (int)((rc->buffer_level - rc->optimal_buffer_level) / step);
+      }
+    }
+    return active_worst_quality;
+  }
+
+  if (rc->buffer_level > critical_level) {
+    // Adjust up from ambient Q.
+    if (critical_level) {
+      const int64_t step = (rc->optimal_buffer_level - critical_level);
+      int adjustment = 0;
+      if (step) {
+        adjustment = (int)((rc->worst_quality - ambient_qp) *
+                           (rc->optimal_buffer_level - rc->buffer_level) /
+                           step);
+      }
+      active_worst_quality = ambient_qp + adjustment;
+    }
+    return active_worst_quality;
+  }
+
+  // Set to worst_quality if buffer is below critical level.
+  return rc->worst_quality;
+}
+
+ static int rc_pick_q_and_bounds_one_pass_cbr(const AV1_COMP *cpi,
+ int *bottom_index,
+ int *top_index) {  // Picks the frame q for one-pass CBR, writes the allowed q range to *bottom_index (lowest) / *top_index (highest) and returns q.
+ const AV1_COMMON *const cm = &cpi->common;
+ const RATE_CONTROL *const rc = &cpi->rc;
+ int active_best_quality;
+ int active_worst_quality = calc_active_worst_quality_one_pass_cbr(cpi);  // Buffer-level driven upper q limit.
+ int q;
+ int *rtc_minq;
+ ASSIGN_MINQ_TABLE(cm->bit_depth, rtc_minq);  // NOTE(review): macro is defined outside this chunk; presumably points rtc_minq at the minq table for this bit depth -- confirm.
+
+ if (frame_is_intra_only(cm)) {
+ active_best_quality = rc->best_quality;  // Default; kept as-is for the very first frame.
+ // Handle the special case for key frames forced when we have reached
+ // the maximum key frame interval. Here force the Q to a range
+ // based on the ambient Q to reduce the risk of popping.
+ if (rc->this_key_frame_forced) {
+ int qindex = rc->last_boosted_qindex;
+ double last_boosted_q = av1_convert_qindex_to_q(qindex, cm->bit_depth);
+ int delta_qindex = av1_compute_qdelta(
+ rc, last_boosted_q, (last_boosted_q * 0.75), cm->bit_depth);  // qindex offset corresponding to 75% of the last boosted q.
+ active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality);
+ } else if (cm->current_video_frame > 0) {
+ // not first frame of one pass and kf_boost is set
+ double q_adj_factor = 1.0;
+ double q_val;
+
+ active_best_quality = get_kf_active_quality(
+ rc, rc->avg_frame_qindex[KEY_FRAME], cm->bit_depth);
+
+ // Allow somewhat lower kf minq with small image formats.
+ if ((cm->width * cm->height) <= (352 * 288)) {
+ q_adj_factor -= 0.25;
+ }
+
+ // Convert the adjustment factor to a qindex delta
+ // on active_best_quality.
+ q_val = av1_convert_qindex_to_q(active_best_quality, cm->bit_depth);
+ active_best_quality +=
+ av1_compute_qdelta(rc, q_val, q_val * q_adj_factor, cm->bit_depth);
+ }
+ } else if (!rc->is_src_frame_alt_ref &&
+ (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
+ // Use the lower of active_worst_quality and recent
+ // average Q as basis for GF/ARF best Q limit unless last frame was
+ // a key frame.
+ if (rc->frames_since_key > 1 &&
+ rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality) {
+ q = rc->avg_frame_qindex[INTER_FRAME];
+ } else {
+ q = active_worst_quality;
+ }
+ active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth);
+ } else {
+ // Use the lower of active_worst_quality and recent/average Q.
+ if (cm->current_video_frame > 1) {  // Past the first two frames: base the lookup on the inter-frame average.
+ if (rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality)
+ active_best_quality = rtc_minq[rc->avg_frame_qindex[INTER_FRAME]];
+ else
+ active_best_quality = rtc_minq[active_worst_quality];
+ } else {  // Otherwise base the lookup on the key-frame average.
+ if (rc->avg_frame_qindex[KEY_FRAME] < active_worst_quality)
+ active_best_quality = rtc_minq[rc->avg_frame_qindex[KEY_FRAME]];
+ else
+ active_best_quality = rtc_minq[active_worst_quality];
+ }
+ }
+
+ // Clip the active best and worst quality values to limits
+ active_best_quality =
+ clamp(active_best_quality, rc->best_quality, rc->worst_quality);
+ active_worst_quality =
+ clamp(active_worst_quality, active_best_quality, rc->worst_quality);  // Worst is also kept at or above the clipped best.
+
+ *top_index = active_worst_quality;
+ *bottom_index = active_best_quality;
+
+ // Limit Q range for the adaptive loop.
+ if (cm->frame_type == KEY_FRAME && !rc->this_key_frame_forced &&
+ !(cm->current_video_frame == 0)) {  // Only for non-forced key frames after the first frame.
+ int qdelta = 0;
+ aom_clear_system_state();
+ qdelta = av1_compute_qdelta_by_rate(
+ &cpi->rc, cm->frame_type, active_worst_quality, 2.0, cm->bit_depth);  // qindex offset for a rate target ratio of 2.0.
+ *top_index = active_worst_quality + qdelta;
+ *top_index = AOMMAX(*top_index, *bottom_index);  // Never let the top fall below the bottom.
+ }
+
+ // Special case code to try and match quality with forced key frames
+ if (cm->frame_type == KEY_FRAME && rc->this_key_frame_forced) {
+ q = rc->last_boosted_qindex;
+ } else {
+ q = av1_rc_regulate_q(cpi, rc->this_frame_target, active_best_quality,
+ active_worst_quality);
+ if (q > *top_index) {
+ // Special case when we are targeting the max allowed rate
+ if (rc->this_frame_target >= rc->max_frame_bandwidth)
+ *top_index = q;
+ else
+ q = *top_index;
+ }
+ }
+
+ assert(*top_index <= rc->worst_quality && *top_index >= rc->best_quality);  // Sanity: every output must lie within [best_quality, worst_quality].
+ assert(*bottom_index <= rc->worst_quality &&
+ *bottom_index >= rc->best_quality);
+ assert(q <= rc->worst_quality && q >= rc->best_quality);
+ return q;
+ }
+
+ // Returns the constrained-quality level to use for the current frame.
+ // Normally this is oxcf->cq_level. In AOM_CQ mode, once a positive bit
+ // target has accumulated, the level is scaled down linearly while the ratio
+ // of actual to target bits stays below cq_adjust_threshold.
+ static int get_active_cq_level(const RATE_CONTROL *rc,
+                                const AV1EncoderConfig *const oxcf) {
+   static const double cq_adjust_threshold = 0.1;
+   const int configured_level = oxcf->cq_level;
+   double spend_ratio;
+
+   // Nothing to adapt outside CQ mode or before any target bits exist.
+   if (oxcf->rc_mode != AOM_CQ || !(rc->total_target_bits > 0))
+     return configured_level;
+
+   spend_ratio = (double)rc->total_actual_bits / rc->total_target_bits;
+   if (spend_ratio < cq_adjust_threshold)
+     return (int)(configured_level * spend_ratio / cq_adjust_threshold);
+
+   return configured_level;
+ }
+
+ static int rc_pick_q_and_bounds_one_pass_vbr(const AV1_COMP *cpi,
+ int *bottom_index,
+ int *top_index) {  // Picks the frame q for one-pass VBR/CQ/Q modes, writes the allowed q range to *bottom_index / *top_index and returns q.
+ const AV1_COMMON *const cm = &cpi->common;
+ const RATE_CONTROL *const rc = &cpi->rc;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ const int cq_level = get_active_cq_level(rc, oxcf);
+ int active_best_quality;
+ int active_worst_quality = calc_active_worst_quality_one_pass_vbr(cpi);
+ int q;
+ int *inter_minq;
+ ASSIGN_MINQ_TABLE(cm->bit_depth, inter_minq);  // NOTE(review): macro is defined outside this chunk; presumably points inter_minq at the minq table for this bit depth -- confirm.
+
+ if (frame_is_intra_only(cm)) {
+ if (oxcf->rc_mode == AOM_Q) {  // Fixed-q mode: derive best q from cq_level scaled to 25% in q units.
+ const int qindex = cq_level;
+ const double q_val = av1_convert_qindex_to_q(qindex, cm->bit_depth);
+ const int delta_qindex =
+ av1_compute_qdelta(rc, q_val, q_val * 0.25, cm->bit_depth);
+ active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality);
+ } else if (rc->this_key_frame_forced) {  // Forced key frame: stay near the last boosted q (75% in q units).
+ const int qindex = rc->last_boosted_qindex;
+ const double last_boosted_q =
+ av1_convert_qindex_to_q(qindex, cm->bit_depth);
+ const int delta_qindex = av1_compute_qdelta(
+ rc, last_boosted_q, last_boosted_q * 0.75, cm->bit_depth);
+ active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality);
+ } else { // not first frame of one pass and kf_boost is set
+ double q_adj_factor = 1.0;
+
+ active_best_quality = get_kf_active_quality(
+ rc, rc->avg_frame_qindex[KEY_FRAME], cm->bit_depth);
+
+ // Allow somewhat lower kf minq with small image formats.
+ if ((cm->width * cm->height) <= (352 * 288)) {
+ q_adj_factor -= 0.25;
+ }
+
+ // Convert the adjustment factor to a qindex delta on active_best_quality.
+ {
+ const double q_val =
+ av1_convert_qindex_to_q(active_best_quality, cm->bit_depth);
+ active_best_quality +=
+ av1_compute_qdelta(rc, q_val, q_val * q_adj_factor, cm->bit_depth);
+ }
+ }
+ } else if (!rc->is_src_frame_alt_ref &&
+ (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
+ // Use the lower of active_worst_quality and recent
+ // average Q as basis for GF/ARF best Q limit unless last frame was
+ // a key frame.
+ q = (rc->frames_since_key > 1 &&
+ rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality)
+ ? rc->avg_frame_qindex[INTER_FRAME]
+ : rc->avg_frame_qindex[KEY_FRAME];  // NOTE(review): the fallback here is the key-frame average, not active_worst_quality as the comment above suggests -- confirm intent.
+ // For constrained quality don't allow Q less than the cq level
+ if (oxcf->rc_mode == AOM_CQ) {
+ if (q < cq_level) q = cq_level;
+ active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth);
+ // Constrained quality use slightly lower active best.
+ active_best_quality = active_best_quality * 15 / 16;
+ } else if (oxcf->rc_mode == AOM_Q) {
+ const int qindex = cq_level;
+ const double q_val = av1_convert_qindex_to_q(qindex, cm->bit_depth);
+ const int delta_qindex =
+ (cpi->refresh_alt_ref_frame)
+ ? av1_compute_qdelta(rc, q_val, q_val * 0.40, cm->bit_depth)
+ : av1_compute_qdelta(rc, q_val, q_val * 0.50, cm->bit_depth);  // Alt-ref refreshes use factor 0.40, other boosted frames 0.50.
+ active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality);
+ } else {
+ active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth);
+ }
+ } else {
+ if (oxcf->rc_mode == AOM_Q) {
+ const int qindex = cq_level;
+ const double q_val = av1_convert_qindex_to_q(qindex, cm->bit_depth);
+ const double delta_rate[FIXED_GF_INTERVAL] = { 0.50, 1.0, 0.85, 1.0,
+ 0.70, 1.0, 0.85, 1.0 };  // Per-frame q scale factors, indexed by frame number modulo FIXED_GF_INTERVAL.
+ const int delta_qindex = av1_compute_qdelta(
+ rc, q_val,
+ q_val * delta_rate[cm->current_video_frame % FIXED_GF_INTERVAL],
+ cm->bit_depth);
+ active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality);
+ } else {
+ // Look up the best q from the recent average Q (inter-frame average after the first two frames, key-frame average before).
+ active_best_quality = (cm->current_video_frame > 1)
+ ? inter_minq[rc->avg_frame_qindex[INTER_FRAME]]
+ : inter_minq[rc->avg_frame_qindex[KEY_FRAME]];
+ // For the constrained quality mode we don't want
+ // q to fall below the cq level.
+ if ((oxcf->rc_mode == AOM_CQ) && (active_best_quality < cq_level)) {
+ active_best_quality = cq_level;
+ }
+ }
+ }
+
+ // Clip the active best and worst quality values to limits
+ active_best_quality =
+ clamp(active_best_quality, rc->best_quality, rc->worst_quality);
+ active_worst_quality =
+ clamp(active_worst_quality, active_best_quality, rc->worst_quality);  // Worst is also kept at or above the clipped best.
+
+ *top_index = active_worst_quality;
+ *bottom_index = active_best_quality;
+
+ // Limit Q range for the adaptive loop.
+ {
+ int qdelta = 0;
+ aom_clear_system_state();
+ if (cm->frame_type == KEY_FRAME && !rc->this_key_frame_forced &&
+ !(cm->current_video_frame == 0)) {  // Non-forced key frame after the first frame: rate target ratio 2.0.
+ qdelta = av1_compute_qdelta_by_rate(
+ &cpi->rc, cm->frame_type, active_worst_quality, 2.0, cm->bit_depth);
+ } else if (!rc->is_src_frame_alt_ref &&
+ (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {  // Golden/alt-ref refresh: rate target ratio 1.75.
+ qdelta = av1_compute_qdelta_by_rate(
+ &cpi->rc, cm->frame_type, active_worst_quality, 1.75, cm->bit_depth);
+ }
+ *top_index = active_worst_quality + qdelta;
+ *top_index = AOMMAX(*top_index, *bottom_index);  // Never let the top fall below the bottom.
+ }
+
+ if (oxcf->rc_mode == AOM_Q) {
+ q = active_best_quality;
+ // Special case code to try and match quality with forced key frames
+ } else if ((cm->frame_type == KEY_FRAME) && rc->this_key_frame_forced) {
+ q = rc->last_boosted_qindex;
+ } else {
+ q = av1_rc_regulate_q(cpi, rc->this_frame_target, active_best_quality,
+ active_worst_quality);
+ if (q > *top_index) {
+ // Special case when we are targeting the max allowed rate
+ if (rc->this_frame_target >= rc->max_frame_bandwidth)
+ *top_index = q;
+ else
+ q = *top_index;
+ }
+ }
+
+ assert(*top_index <= rc->worst_quality && *top_index >= rc->best_quality);  // Sanity: every output must lie within [best_quality, worst_quality].
+ assert(*bottom_index <= rc->worst_quality &&
+ *bottom_index >= rc->best_quality);
+ assert(q <= rc->worst_quality && q >= rc->best_quality);
+ return q;
+ }
+
+ // Returns the qindex delta that av1_compute_qdelta_by_rate() yields for base
+ // index q when using the frame type and rate ratio associated with the given
+ // rate factor level (rf_level indexes both tables below).
+ int av1_frame_type_qdelta(const AV1_COMP *cpi, int rf_level, int q) {
+   // Rate target ratio per rate factor level.
+   static const double level_rate_ratio[RATE_FACTOR_LEVELS] = {
+     1.00,  // INTER_NORMAL
+ #if CONFIG_EXT_REFS
+     0.80,  // INTER_LOW
+     1.50,  // INTER_HIGH
+     1.25,  // GF_ARF_LOW
+ #else
+     1.00,  // INTER_HIGH
+     1.50,  // GF_ARF_LOW
+ #endif  // CONFIG_EXT_REFS
+     2.00,  // GF_ARF_STD
+     2.00,  // KF_STD
+   };
+   // Frame type used for the bits-per-mb lookup at each rate factor level;
+   // only the last level (KF_STD) maps to KEY_FRAME.
+   static const FRAME_TYPE level_frame_type[RATE_FACTOR_LEVELS] =
+ #if CONFIG_EXT_REFS
+       { INTER_FRAME, INTER_FRAME, INTER_FRAME,
+         INTER_FRAME, INTER_FRAME, KEY_FRAME };
+ #else
+       { INTER_FRAME, INTER_FRAME, INTER_FRAME, INTER_FRAME, KEY_FRAME };
+ #endif  // CONFIG_EXT_REFS
+
+   return av1_compute_qdelta_by_rate(&cpi->rc, level_frame_type[rf_level], q,
+                                     level_rate_ratio[rf_level],
+                                     cpi->common.bit_depth);
+ }
+
+#define STATIC_MOTION_THRESH 95
+ // Picks the frame q for two-pass encoding.
+ //
+ // Writes the lowest allowed q to *bottom_index and the highest allowed q to
+ // *top_index, and returns the chosen q. The returned q is clamped to
+ // [*bottom_index, *top_index]; previously the clamp() result was discarded,
+ // so a forced key frame could return rc->last_boosted_qindex even when it
+ // fell outside the bounds reported to the caller.
+ static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int *bottom_index,
+                                          int *top_index) {
+   const AV1_COMMON *const cm = &cpi->common;
+   const RATE_CONTROL *const rc = &cpi->rc;
+   const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+   const GF_GROUP *gf_group = &cpi->twopass.gf_group;
+   const int cq_level = get_active_cq_level(rc, oxcf);
+   int active_best_quality;
+   int active_worst_quality = cpi->twopass.active_worst_quality;
+   int q;
+   int *inter_minq;
+   ASSIGN_MINQ_TABLE(cm->bit_depth, inter_minq);
+
+   if (frame_is_intra_only(cm)) {
+     // Handle the special case for key frames forced when we have reached
+     // the maximum key frame interval. Here force the Q to a range
+     // based on the ambient Q to reduce the risk of popping.
+     if (rc->this_key_frame_forced) {
+       double last_boosted_q;
+       int delta_qindex;
+       int qindex;
+
+       if (cpi->twopass.last_kfgroup_zeromotion_pct >= STATIC_MOTION_THRESH) {
+         // Static content since the last key frame: pin best q to the better
+         // of the last key-frame / last boosted q and cap worst q at 125%.
+         qindex = AOMMIN(rc->last_kf_qindex, rc->last_boosted_qindex);
+         active_best_quality = qindex;
+         last_boosted_q = av1_convert_qindex_to_q(qindex, cm->bit_depth);
+         delta_qindex = av1_compute_qdelta(rc, last_boosted_q,
+                                           last_boosted_q * 1.25, cm->bit_depth);
+         active_worst_quality =
+             AOMMIN(qindex + delta_qindex, active_worst_quality);
+       } else {
+         qindex = rc->last_boosted_qindex;
+         last_boosted_q = av1_convert_qindex_to_q(qindex, cm->bit_depth);
+         delta_qindex = av1_compute_qdelta(rc, last_boosted_q,
+                                           last_boosted_q * 0.75, cm->bit_depth);
+         active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality);
+       }
+     } else {
+       // Not forced keyframe.
+       double q_adj_factor = 1.0;
+       double q_val;
+
+       // Baseline value derived from cpi->active_worst_quality and kf boost.
+       active_best_quality =
+           get_kf_active_quality(rc, active_worst_quality, cm->bit_depth);
+
+       // Allow somewhat lower kf minq with small image formats.
+       if ((cm->width * cm->height) <= (352 * 288)) {
+         q_adj_factor -= 0.25;
+       }
+
+       // Make a further adjustment based on the kf zero motion measure.
+       q_adj_factor += 0.05 - (0.001 * (double)cpi->twopass.kf_zeromotion_pct);
+
+       // Convert the adjustment factor to a qindex delta
+       // on active_best_quality.
+       q_val = av1_convert_qindex_to_q(active_best_quality, cm->bit_depth);
+       active_best_quality +=
+           av1_compute_qdelta(rc, q_val, q_val * q_adj_factor, cm->bit_depth);
+     }
+   } else if (!rc->is_src_frame_alt_ref &&
+              (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
+     // Use the lower of active_worst_quality and recent
+     // average Q as basis for GF/ARF best Q limit unless last frame was
+     // a key frame.
+     if (rc->frames_since_key > 1 &&
+         rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality) {
+       q = rc->avg_frame_qindex[INTER_FRAME];
+     } else {
+       q = active_worst_quality;
+     }
+     // For constrained quality don't allow Q less than the cq level
+     if (oxcf->rc_mode == AOM_CQ) {
+       if (q < cq_level) q = cq_level;
+
+       active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth);
+
+       // Constrained quality use slightly lower active best.
+       active_best_quality = active_best_quality * 15 / 16;
+
+     } else if (oxcf->rc_mode == AOM_Q) {
+       if (!cpi->refresh_alt_ref_frame) {
+         active_best_quality = cq_level;
+       } else {
+         active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth);
+
+         // Modify best quality for second level arfs. For mode AOM_Q this
+         // becomes the baseline frame q.
+         if (gf_group->rf_level[gf_group->index] == GF_ARF_LOW)
+           active_best_quality = (active_best_quality + cq_level + 1) / 2;
+       }
+     } else {
+       active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth);
+     }
+   } else {
+     if (oxcf->rc_mode == AOM_Q) {
+       active_best_quality = cq_level;
+     } else {
+       active_best_quality = inter_minq[active_worst_quality];
+
+       // For the constrained quality mode we don't want
+       // q to fall below the cq level.
+       if ((oxcf->rc_mode == AOM_CQ) && (active_best_quality < cq_level)) {
+         active_best_quality = cq_level;
+       }
+     }
+   }
+
+   // Extension to max or min Q if undershoot or overshoot is outside
+   // the permitted range.
+   if ((cpi->oxcf.rc_mode != AOM_Q) &&
+       (cpi->twopass.gf_zeromotion_pct < VLOW_MOTION_THRESHOLD)) {
+     if (frame_is_intra_only(cm) ||
+         (!rc->is_src_frame_alt_ref &&
+          (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame))) {
+       // Boosted frames: full min-q extension, half max-q extension.
+       active_best_quality -=
+           (cpi->twopass.extend_minq + cpi->twopass.extend_minq_fast);
+       active_worst_quality += (cpi->twopass.extend_maxq / 2);
+     } else {
+       // Normal frames: half min-q extension, full max-q extension.
+       active_best_quality -=
+           (cpi->twopass.extend_minq + cpi->twopass.extend_minq_fast) / 2;
+       active_worst_quality += cpi->twopass.extend_maxq;
+     }
+   }
+
+   aom_clear_system_state();
+   // Static forced key frames Q restrictions dealt with elsewhere.
+   if (!(frame_is_intra_only(cm)) || !rc->this_key_frame_forced ||
+       (cpi->twopass.last_kfgroup_zeromotion_pct < STATIC_MOTION_THRESH)) {
+     int qdelta = av1_frame_type_qdelta(cpi, gf_group->rf_level[gf_group->index],
+                                        active_worst_quality);
+     active_worst_quality =
+         AOMMAX(active_worst_quality + qdelta, active_best_quality);
+   }
+
+   // Modify active_best_quality for downscaled normal frames.
+   if (rc->frame_size_selector != UNSCALED && !frame_is_kf_gf_arf(cpi)) {
+     int qdelta = av1_compute_qdelta_by_rate(
+         rc, cm->frame_type, active_best_quality, 2.0, cm->bit_depth);
+     active_best_quality =
+         AOMMAX(active_best_quality + qdelta, rc->best_quality);
+   }
+
+   // Clip both limits to the configured range, keeping worst >= best.
+   active_best_quality =
+       clamp(active_best_quality, rc->best_quality, rc->worst_quality);
+   active_worst_quality =
+       clamp(active_worst_quality, active_best_quality, rc->worst_quality);
+
+   if (oxcf->rc_mode == AOM_Q) {
+     q = active_best_quality;
+     // Special case code to try and match quality with forced key frames.
+   } else if (frame_is_intra_only(cm) && rc->this_key_frame_forced) {
+     // If static since last kf use better of last boosted and last kf q.
+     if (cpi->twopass.last_kfgroup_zeromotion_pct >= STATIC_MOTION_THRESH) {
+       q = AOMMIN(rc->last_kf_qindex, rc->last_boosted_qindex);
+     } else {
+       q = rc->last_boosted_qindex;
+     }
+   } else {
+     q = av1_rc_regulate_q(cpi, rc->this_frame_target, active_best_quality,
+                           active_worst_quality);
+     if (q > active_worst_quality) {
+       // Special case when we are targeting the max allowed rate.
+       if (rc->this_frame_target >= rc->max_frame_bandwidth)
+         active_worst_quality = q;
+       else
+         q = active_worst_quality;
+     }
+   }
+   // clamp() returns the clamped value rather than modifying its argument,
+   // so the result must be assigned for the clamp to take effect.
+   q = clamp(q, active_best_quality, active_worst_quality);
+
+   *top_index = active_worst_quality;
+   *bottom_index = active_best_quality;
+
+   assert(*top_index <= rc->worst_quality && *top_index >= rc->best_quality);
+   assert(*bottom_index <= rc->worst_quality &&
+          *bottom_index >= rc->best_quality);
+   assert(q <= rc->worst_quality && q >= rc->best_quality);
+   return q;
+ }
+
+ // Dispatches q selection to the routine matching the encoder pass and rate
+ // control mode. Returns the chosen q; the allowed q range is written to
+ // *bottom_index and *top_index by the selected routine.
+ int av1_rc_pick_q_and_bounds(const AV1_COMP *cpi, int *bottom_index,
+                              int *top_index) {
+   // Any non-zero pass value uses the two-pass logic.
+   if (cpi->oxcf.pass != 0)
+     return rc_pick_q_and_bounds_two_pass(cpi, bottom_index, top_index);
+
+   // One-pass: CBR has its own routine, every other mode shares the VBR one.
+   return (cpi->oxcf.rc_mode == AOM_CBR)
+              ? rc_pick_q_and_bounds_one_pass_cbr(cpi, bottom_index, top_index)
+              : rc_pick_q_and_bounds_one_pass_vbr(cpi, bottom_index, top_index);
+ }
+
+ // Computes the frame size window around frame_target.
+ // In AOM_Q mode the window is unbounded: [0, INT_MAX]. Otherwise it is
+ // frame_target -/+ (recode_tolerance percent of the target + 200), with the
+ // lower limit floored at 0 and the upper limit capped at
+ // rc.max_frame_bandwidth.
+ void av1_rc_compute_frame_size_bounds(const AV1_COMP *cpi, int frame_target,
+                                       int *frame_under_shoot_limit,
+                                       int *frame_over_shoot_limit) {
+   int tolerance;
+   int lower;
+   int upper;
+
+   if (cpi->oxcf.rc_mode == AOM_Q) {
+     *frame_under_shoot_limit = 0;
+     *frame_over_shoot_limit = INT_MAX;
+     return;
+   }
+
+   // The fixed 200 keeps a minimum range for very small rate targets, where
+   // the fractional tolerance alone may be tiny.
+   tolerance = (cpi->sf.recode_tolerance * frame_target) / 100;
+   lower = frame_target - tolerance - 200;
+   upper = frame_target + tolerance + 200;
+
+   if (lower < 0) lower = 0;
+   *frame_under_shoot_limit = lower;
+   *frame_over_shoot_limit = AOMMIN(upper, cpi->rc.max_frame_bandwidth);
+ }
+
+ // Stores the bit target for the current frame in rc->this_frame_target and
+ // derives rc->sb64_target_rate (target bits per 64x64 block) from it.
+ // With dynamic resizing and a scaled frame, the target is first multiplied by
+ // rate_thresh_mult[] for the current frame size selector.
+ void av1_rc_set_frame_target(AV1_COMP *cpi, int target) {
+   const AV1_COMMON *const cm = &cpi->common;
+   RATE_CONTROL *const rc = &cpi->rc;
+
+   rc->this_frame_target = target;
+
+   // Modify frame size target when down-scaling.
+   if (cpi->oxcf.resize_mode == RESIZE_DYNAMIC &&
+       rc->frame_size_selector != UNSCALED)
+     rc->this_frame_target = (int)(rc->this_frame_target *
+                                   rate_thresh_mult[rc->frame_size_selector]);
+
+   // Target rate per SB64 (including partial SB64s).
+   // The whole quotient is evaluated in 64 bits and only the final result is
+   // narrowed to int. Casting the product to int before dividing truncated
+   // it once this_frame_target * 4096 exceeded INT_MAX.
+   rc->sb64_target_rate =
+       (int)(((int64_t)rc->this_frame_target * 64 * 64) /
+             ((int64_t)cm->width * cm->height));
+ }
+
+ // Updates rate control state after an alt-ref frame has been coded.
+ static void update_alt_ref_frame_stats(AV1_COMP *cpi) {
+   RATE_CONTROL *const rate_ctrl = &cpi->rc;
+
+   // The alt ref is now active and no further alt ref is pending.
+   rate_ctrl->source_alt_ref_active = 1;
+   rate_ctrl->source_alt_ref_pending = 0;
+
+   // This frame refreshes, so following frames don't unless the user says so.
+   rate_ctrl->frames_since_golden = 0;
+ }
+
+ static void update_golden_frame_stats(AV1_COMP *cpi) {  // Updates golden-frame counters and the alt-ref active flag after a frame that did not take the alt-ref update path.
+ RATE_CONTROL *const rc = &cpi->rc;
+
+ #if CONFIG_EXT_REFS
+ // Update the Golden frame usage counts.
+ // NOTE(weitinglin): If we use show_existing_frame for an OVERLAY frame,
+ // only the virtual indices for the reference frame will be
+ // updated and cpi->refresh_golden_frame will still be zero.
+ if (cpi->refresh_golden_frame || rc->is_src_frame_alt_ref) {
+ #else
+ // Update the Golden frame usage counts.
+ if (cpi->refresh_golden_frame) {
+ #endif // CONFIG_EXT_REFS
+
+ #if CONFIG_EXT_REFS
+ // We will not use internal overlay frames to replace the golden frame
+ if (!rc->is_src_frame_ext_arf)  // Guards only the single assignment that follows; without CONFIG_EXT_REFS that assignment is unconditional.
+ #endif // CONFIG_EXT_REFS
+ // this frame refreshes means next frames don't unless specified by user
+ rc->frames_since_golden = 0;
+
+ // If we are not using alt ref in the up and coming group clear the arf
+ // active flag. In multi arf group case, if the index is not 0 then
+ // we are overlaying a mid group arf so should not reset the flag.
+ if (cpi->oxcf.pass == 2) {
+ if (!rc->source_alt_ref_pending && (cpi->twopass.gf_group.index == 0))
+ rc->source_alt_ref_active = 0;
+ } else if (!rc->source_alt_ref_pending) {
+ rc->source_alt_ref_active = 0;
+ }
+
+ // Decrement count down till next gf
+ if (rc->frames_till_gf_update_due > 0) rc->frames_till_gf_update_due--;
+
+ } else if (!cpi->refresh_alt_ref_frame) {  // Ordinary frame: neither golden nor alt-ref was refreshed.
+ // Decrement count down till next gf
+ if (rc->frames_till_gf_update_due > 0) rc->frames_till_gf_update_due--;
+
+ rc->frames_since_golden++;
+ }
+ }
+
+ void av1_rc_postencode_update(AV1_COMP *cpi, uint64_t bytes_used) {  // Updates rate control state after a frame has been encoded; bytes_used is the coded frame size in bytes.
+ const AV1_COMMON *const cm = &cpi->common;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ RATE_CONTROL *const rc = &cpi->rc;
+ const int qindex = cm->base_qindex;
+
+ if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled) {
+ av1_cyclic_refresh_postencode(cpi);
+ }
+
+ // Update rate control heuristics
+ rc->projected_frame_size = (int)(bytes_used << 3);  // Bytes to bits.
+
+ // Post encode loop adjustment of Q prediction.
+ av1_rc_update_rate_correction_factors(cpi);
+
+ // Keep a record of last Q and ambient average Q.
+ if (cm->frame_type == KEY_FRAME) {
+ rc->last_q[KEY_FRAME] = qindex;
+ rc->avg_frame_qindex[KEY_FRAME] =
+ ROUND_POWER_OF_TWO(3 * rc->avg_frame_qindex[KEY_FRAME] + qindex, 2);  // 3:1 blend of old average and new qindex, assuming ROUND_POWER_OF_TWO(x, 2) is a rounded divide by 4 -- macro not visible here.
+ } else {
+ if (!rc->is_src_frame_alt_ref &&
+ !(cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {  // Only normal inter frames feed the inter-frame averages.
+ rc->last_q[INTER_FRAME] = qindex;
+ rc->avg_frame_qindex[INTER_FRAME] =
+ ROUND_POWER_OF_TWO(3 * rc->avg_frame_qindex[INTER_FRAME] + qindex, 2);
+ rc->ni_frames++;
+ rc->tot_q += av1_convert_qindex_to_q(qindex, cm->bit_depth);
+ rc->avg_q = rc->tot_q / rc->ni_frames;
+ // Calculate the average Q for normal inter frames (not key or GFU
+ // frames).
+ rc->ni_tot_qi += qindex;
+ rc->ni_av_qi = rc->ni_tot_qi / rc->ni_frames;
+ }
+ }
+
+ // Keep record of last boosted (KF/GF/ARF) Q value.
+ // If the current frame is coded at a lower Q then we also update it.
+ // If all mbs in this group are skipped only update if the Q value is
+ // better than that already stored.
+ // This is used to help set quality in forced key frames to reduce popping
+ if ((qindex < rc->last_boosted_qindex) || (cm->frame_type == KEY_FRAME) ||
+ (!rc->constrained_gf_group &&
+ (cpi->refresh_alt_ref_frame ||
+ (cpi->refresh_golden_frame && !rc->is_src_frame_alt_ref)))) {
+ rc->last_boosted_qindex = qindex;
+ }
+ if (cm->frame_type == KEY_FRAME) rc->last_kf_qindex = qindex;
+
+ update_buffer_level(cpi, rc->projected_frame_size);
+
+ // Rolling monitors of whether we are over or underspending used to help
+ // regulate min and Max Q in two pass.
+ if (cm->frame_type != KEY_FRAME) {
+ rc->rolling_target_bits = ROUND_POWER_OF_TWO(
+ rc->rolling_target_bits * 3 + rc->this_frame_target, 2);  // Short window: 3:1 old-to-new weighting.
+ rc->rolling_actual_bits = ROUND_POWER_OF_TWO(
+ rc->rolling_actual_bits * 3 + rc->projected_frame_size, 2);
+ rc->long_rolling_target_bits = ROUND_POWER_OF_TWO(
+ rc->long_rolling_target_bits * 31 + rc->this_frame_target, 5);  // Long window: 31:1 old-to-new weighting.
+ rc->long_rolling_actual_bits = ROUND_POWER_OF_TWO(
+ rc->long_rolling_actual_bits * 31 + rc->projected_frame_size, 5);
+ }
+
+ // Actual bits spent
+ rc->total_actual_bits += rc->projected_frame_size;
+ #if CONFIG_EXT_REFS
+ rc->total_target_bits +=
+ (cm->show_frame || rc->is_bwd_ref_frame) ? rc->avg_frame_bandwidth : 0;  // Target budget grows only for shown frames (or backward reference frames).
+ #else
+ rc->total_target_bits += cm->show_frame ? rc->avg_frame_bandwidth : 0;  // Target budget grows only for shown frames.
+ #endif // CONFIG_EXT_REFS
+
+ rc->total_target_vs_actual = rc->total_actual_bits - rc->total_target_bits;
+
+ if (is_altref_enabled(cpi) && cpi->refresh_alt_ref_frame &&
+ (cm->frame_type != KEY_FRAME))
+ // Update the alternate reference frame stats as appropriate.
+ update_alt_ref_frame_stats(cpi);
+ else
+ // Update the Golden frame stats as appropriate.
+ update_golden_frame_stats(cpi);
+
+ if (cm->frame_type == KEY_FRAME) rc->frames_since_key = 0;
+
+ #if CONFIG_EXT_REFS
+ if (cm->show_frame || rc->is_bwd_ref_frame) {
+ #else
+ if (cm->show_frame) {
+ #endif // CONFIG_EXT_REFS
+ rc->frames_since_key++;
+ rc->frames_to_key--;
+ }
+
+ // Trigger the resizing of the next frame if it is scaled.
+ if (oxcf->pass != 0) {  // Only when pass is non-zero; one-pass CBR sets resize_pending in av1_rc_get_one_pass_cbr_params().
+ cpi->resize_pending =
+ rc->next_frame_size_selector != rc->frame_size_selector;
+ rc->frame_size_selector = rc->next_frame_size_selector;
+ }
+ }
+
+ // Rate control bookkeeping for a dropped frame: the buffer level is updated
+ // with a zero-sized frame, the key-frame counters advance by one frame, and
+ // the rc_1_frame / rc_2_frame fields are reset to 0.
+ void av1_rc_postencode_update_drop_frame(AV1_COMP *cpi) {
+   RATE_CONTROL *const rc = &cpi->rc;
+
+   update_buffer_level(cpi, 0);
+
+   ++rc->frames_since_key;
+   --rc->frames_to_key;
+
+   rc->rc_1_frame = 0;
+   rc->rc_2_frame = 0;
+ }
+
+// Use this macro to turn on/off use of alt-refs in one-pass mode.
+#define USE_ALTREF_FOR_ONE_PASS 1
+
+ // Returns the bit target for an inter frame in one-pass VBR mode.
+ // With USE_ALTREF_FOR_ONE_PASS enabled, a golden/alt-ref refresh (that is not
+ // an alt-ref source frame) gets af_ratio times the share of a normal frame
+ // within the golden-frame group; otherwise the target is the average frame
+ // bandwidth. The result is passed through av1_rc_clamp_pframe_target_size().
+ //
+ // The products are evaluated in 64 bits: avg_frame_bandwidth *
+ // baseline_gf_interval * af_ratio can exceed INT_MAX at high bitrates, and
+ // signed int overflow is undefined behaviour.
+ static int calc_pframe_target_size_one_pass_vbr(const AV1_COMP *const cpi) {
+   static const int af_ratio = 10;
+   const RATE_CONTROL *const rc = &cpi->rc;
+ #if USE_ALTREF_FOR_ONE_PASS
+   const int is_boosted =
+       !rc->is_src_frame_alt_ref &&
+       (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame);
+   const int64_t group_bits =
+       (int64_t)rc->avg_frame_bandwidth * rc->baseline_gf_interval;
+   const int divisor = rc->baseline_gf_interval + af_ratio - 1;
+   const int64_t target =
+       (is_boosted ? group_bits * af_ratio : group_bits) / divisor;
+ #else
+   const int64_t target = rc->avg_frame_bandwidth;
+ #endif
+   // Saturate before narrowing back to the int the clamp helper takes.
+   return av1_rc_clamp_pframe_target_size(cpi, (int)AOMMIN(target, INT_MAX));
+ }
+
+ // Returns the bit target for a key frame in one-pass VBR mode: kf_ratio
+ // times the average frame bandwidth, passed through
+ // av1_rc_clamp_iframe_target_size().
+ // The product is formed in 64 bits and saturated at INT_MAX so that very
+ // large per-frame bandwidths cannot overflow int (undefined behaviour).
+ static int calc_iframe_target_size_one_pass_vbr(const AV1_COMP *const cpi) {
+   static const int kf_ratio = 25;
+   const RATE_CONTROL *rc = &cpi->rc;
+   const int64_t target = (int64_t)rc->avg_frame_bandwidth * kf_ratio;
+   return av1_rc_clamp_iframe_target_size(cpi, (int)AOMMIN(target, INT_MAX));
+ }
+
+ // Prepares rate control for the next frame in one-pass VBR mode: decides
+ // whether it is a key frame, starts a new golden-frame group when one is due,
+ // and sets the frame's bit target.
+ void av1_rc_get_one_pass_vbr_params(AV1_COMP *cpi) {
+   AV1_COMMON *const cm = &cpi->common;
+   RATE_CONTROL *const rc = &cpi->rc;
+   const int is_first_frame = (cm->current_video_frame == 0);
+   const int key_interval_elapsed = (rc->frames_to_key == 0);
+   // TODO(yaowu): replace the "auto_key && 0" below with proper decision logic.
+   const int key_frame_wanted =
+       is_first_frame || (cpi->frame_flags & FRAMEFLAGS_KEY) ||
+       key_interval_elapsed || (cpi->oxcf.auto_key && 0);
+   int target;
+
+   if (key_frame_wanted && !cpi->refresh_alt_ref_frame) {
+     cm->frame_type = KEY_FRAME;
+     // "Forced" means the key frame interval ran out on a non-first frame.
+     rc->this_key_frame_forced = !is_first_frame && key_interval_elapsed;
+     rc->frames_to_key = cpi->oxcf.key_freq;
+     rc->kf_boost = DEFAULT_KF_BOOST;
+     rc->source_alt_ref_active = 0;
+   } else {
+     cm->frame_type = INTER_FRAME;
+   }
+
+   if (rc->frames_till_gf_update_due == 0) {
+     // Start a new golden-frame group at the midpoint of the allowed range.
+     rc->baseline_gf_interval = (rc->min_gf_interval + rc->max_gf_interval) / 2;
+     rc->frames_till_gf_update_due = rc->baseline_gf_interval;
+     // NOTE: frames_till_gf_update_due must be <= frames_to_key.
+     rc->constrained_gf_group =
+         (rc->frames_till_gf_update_due > rc->frames_to_key);
+     if (rc->constrained_gf_group)
+       rc->frames_till_gf_update_due = rc->frames_to_key;
+     cpi->refresh_golden_frame = 1;
+     rc->source_alt_ref_pending = USE_ALTREF_FOR_ONE_PASS;
+     rc->gfu_boost = DEFAULT_GF_BOOST;
+   }
+
+   target = (cm->frame_type == KEY_FRAME)
+                ? calc_iframe_target_size_one_pass_vbr(cpi)
+                : calc_pframe_target_size_one_pass_vbr(cpi);
+   av1_rc_set_frame_target(cpi, target);
+ }
+
+ // Returns the bit target for an inter frame in one-pass CBR mode.
+ //
+ // Starts from the average frame bandwidth (optionally redistributed so golden
+ // frames get gf_cbr_boost_pct percent extra), then lowers or raises it by
+ // half of the buffer deviation percentage (capped by under_shoot_pct /
+ // over_shoot_pct), applies the optional rc_max_inter_bitrate_pct cap, and
+ // finally enforces a minimum of max(avg_frame_bandwidth / 16,
+ // FRAME_OVERHEAD_BITS).
+ //
+ // All intermediate products are evaluated in 64 bits. In int arithmetic,
+ // avg_frame_bandwidth * baseline_gf_interval * af_ratio_pct overflows already
+ // at moderately high bitrates (signed overflow is undefined behaviour).
+ // Results that fit in int are unchanged.
+ static int calc_pframe_target_size_one_pass_cbr(const AV1_COMP *cpi) {
+   const AV1EncoderConfig *oxcf = &cpi->oxcf;
+   const RATE_CONTROL *rc = &cpi->rc;
+   const int64_t diff = rc->optimal_buffer_level - rc->buffer_level;
+   const int64_t one_pct_bits = 1 + rc->optimal_buffer_level / 100;
+   int min_frame_target =
+       AOMMAX(rc->avg_frame_bandwidth >> 4, FRAME_OVERHEAD_BITS);
+   int64_t target;
+
+   if (oxcf->gf_cbr_boost_pct) {
+     const int af_ratio_pct = oxcf->gf_cbr_boost_pct + 100;
+     const int64_t group_bits =
+         (int64_t)rc->avg_frame_bandwidth * rc->baseline_gf_interval;
+     const int64_t divisor =
+         (int64_t)rc->baseline_gf_interval * 100 + af_ratio_pct - 100;
+     // Golden frames weigh af_ratio_pct, every other frame weighs 100.
+     target = (group_bits * (cpi->refresh_golden_frame ? af_ratio_pct : 100)) /
+              divisor;
+   } else {
+     target = rc->avg_frame_bandwidth;
+   }
+
+   if (diff > 0) {
+     // Lower the target bandwidth for this frame.
+     const int pct_low = (int)AOMMIN(diff / one_pct_bits, oxcf->under_shoot_pct);
+     target -= (target * pct_low) / 200;
+   } else if (diff < 0) {
+     // Increase the target bandwidth for this frame.
+     const int pct_high =
+         (int)AOMMIN(-diff / one_pct_bits, oxcf->over_shoot_pct);
+     target += (target * pct_high) / 200;
+   }
+   if (oxcf->rc_max_inter_bitrate_pct) {
+     const int64_t max_rate =
+         (int64_t)rc->avg_frame_bandwidth * oxcf->rc_max_inter_bitrate_pct / 100;
+     target = AOMMIN(target, max_rate);
+   }
+   target = AOMMAX(min_frame_target, target);
+   // Saturate before narrowing back to the int return type.
+   return (int)AOMMIN(target, INT_MAX);
+ }
+
+ // Returns the bit target for a key frame in one-pass CBR mode.
+ // The first frame gets half of the starting buffer level (saturated at
+ // INT_MAX). Later key frames get (16 + kf_boost) / 16 times the average
+ // frame bandwidth, where kf_boost is at least 32, grows with the frame rate,
+ // and is scaled down when fewer than framerate / 2 frames have passed since
+ // the previous key frame. The result is passed through
+ // av1_rc_clamp_iframe_target_size().
+ static int calc_iframe_target_size_one_pass_cbr(const AV1_COMP *cpi) {
+   const RATE_CONTROL *rc = &cpi->rc;
+   int target;
+   if (cpi->common.current_video_frame == 0) {
+     target = ((rc->starting_buffer_level / 2) > INT_MAX)
+                  ? INT_MAX
+                  : (int)(rc->starting_buffer_level / 2);
+   } else {
+     int kf_boost = 32;
+     double framerate = cpi->framerate;
+     int64_t boosted_bits;
+
+     kf_boost = AOMMAX(kf_boost, (int)(2 * framerate - 16));
+     if (rc->frames_since_key < framerate / 2) {
+       kf_boost = (int)(kf_boost * rc->frames_since_key / (framerate / 2));
+     }
+     // Form the product in 64 bits and saturate: (16 + kf_boost) *
+     // avg_frame_bandwidth can exceed INT_MAX, and signed int overflow is
+     // undefined behaviour.
+     boosted_bits = ((int64_t)(16 + kf_boost) * rc->avg_frame_bandwidth) >> 4;
+     target = (int)AOMMIN(boosted_bits, INT_MAX);
+   }
+   return av1_rc_clamp_iframe_target_size(cpi, target);
+ }
+
+ // Prepares rate control for the next frame in one-pass CBR mode: decides
+ // whether it is a key frame, starts a new golden-frame group when one is due,
+ // refreshes cyclic-refresh parameters, sets the frame's bit target and
+ // evaluates dynamic resizing.
+ void av1_rc_get_one_pass_cbr_params(AV1_COMP *cpi) {
+   AV1_COMMON *const cm = &cpi->common;
+   RATE_CONTROL *const rc = &cpi->rc;
+   const int is_first_frame = (cm->current_video_frame == 0);
+   const int key_interval_elapsed = (rc->frames_to_key == 0);
+   int target;
+
+   // TODO(yaowu): replace the "auto_key && 0" below with proper decision logic.
+   if (is_first_frame || (cpi->frame_flags & FRAMEFLAGS_KEY) ||
+       key_interval_elapsed || (cpi->oxcf.auto_key && 0)) {
+     cm->frame_type = KEY_FRAME;
+     // "Forced" means the key frame interval ran out on a non-first frame.
+     rc->this_key_frame_forced = !is_first_frame && key_interval_elapsed;
+     rc->frames_to_key = cpi->oxcf.key_freq;
+     rc->kf_boost = DEFAULT_KF_BOOST;
+     rc->source_alt_ref_active = 0;
+   } else {
+     cm->frame_type = INTER_FRAME;
+   }
+
+   if (rc->frames_till_gf_update_due == 0) {
+     if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) {
+       av1_cyclic_refresh_set_golden_update(cpi);
+     } else {
+       rc->baseline_gf_interval =
+           (rc->min_gf_interval + rc->max_gf_interval) / 2;
+     }
+     rc->frames_till_gf_update_due = rc->baseline_gf_interval;
+     // NOTE: frames_till_gf_update_due must be <= frames_to_key.
+     if (rc->frames_till_gf_update_due > rc->frames_to_key) {
+       rc->frames_till_gf_update_due = rc->frames_to_key;
+     }
+     cpi->refresh_golden_frame = 1;
+     rc->gfu_boost = DEFAULT_GF_BOOST;
+   }
+
+   // Any update/change of global cyclic refresh parameters (amount/delta-qp)
+   // should be done here, before the frame qp is selected.
+   if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) {
+     av1_cyclic_refresh_update_parameters(cpi);
+   }
+
+   target = (cm->frame_type == KEY_FRAME)
+                ? calc_iframe_target_size_one_pass_cbr(cpi)
+                : calc_pframe_target_size_one_pass_cbr(cpi);
+   av1_rc_set_frame_target(cpi, target);
+
+   // The resize decision is only evaluated when dynamic resizing is enabled.
+   cpi->resize_pending = (cpi->oxcf.resize_mode == RESIZE_DYNAMIC)
+                             ? av1_resize_one_pass_cbr(cpi)
+                             : 0;
+ }
+
+ // Returns the qindex difference (target index - start index) between the q
+ // values qstart and qtarget. Each q value is mapped to the first qindex in
+ // [best_quality, worst_quality) whose converted q is >= that value; if none
+ // qualifies, the last index visited is used (or worst_quality when the range
+ // is empty).
+ int av1_compute_qdelta(const RATE_CONTROL *rc, double qstart, double qtarget,
+                        aom_bit_depth_t bit_depth) {
+   const int first = rc->best_quality;
+   const int limit = rc->worst_quality;
+   int from_index = limit;
+   int to_index = limit;
+   int idx;
+
+   // Locate the index for the starting q.
+   idx = first;
+   while (idx < limit) {
+     from_index = idx;
+     if (av1_convert_qindex_to_q(idx, bit_depth) >= qstart) break;
+     ++idx;
+   }
+
+   // Locate the index for the target q.
+   idx = first;
+   while (idx < limit) {
+     to_index = idx;
+     if (av1_convert_qindex_to_q(idx, bit_depth) >= qtarget) break;
+     ++idx;
+   }
+
+   return to_index - from_index;
+ }
+
+// Returns the q-index delta (relative to qindex) that should change the
+// projected bits per MB by the factor rate_target_ratio. The result is based
+// on the first index in [best_quality, worst_quality) whose projected bits
+// per MB do not exceed the scaled target, or on worst_quality if none does.
+int av1_compute_qdelta_by_rate(const RATE_CONTROL *rc, FRAME_TYPE frame_type,
+                               int qindex, double rate_target_ratio,
+                               aom_bit_depth_t bit_depth) {
+  // Find the target bits per mb from the projection at the base index and
+  // the given ratio.
+  const int target_bits_per_mb =
+      (int)(rate_target_ratio *
+            av1_rc_bits_per_mb(frame_type, qindex, 1.0, bit_depth));
+  int target_index = rc->worst_quality;
+  int q = rc->best_quality;
+
+  // Convert the q target to an index
+  while (q < rc->worst_quality) {
+    if (av1_rc_bits_per_mb(frame_type, q, 1.0, bit_depth) <=
+        target_bits_per_mb) {
+      target_index = q;
+      break;
+    }
+    ++q;
+  }
+  return target_index - qindex;
+}
+
+// Derives the golden-frame interval limits (min, max and the static-scene
+// maximum) stored in *rc from the encoder configuration.
+void av1_rc_set_gf_interval_range(const AV1_COMP *const cpi,
+                                  RATE_CONTROL *const rc) {
+  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+
+  // Special case code for 1 pass fixed Q mode tests
+  if (oxcf->pass == 0 && oxcf->rc_mode == AOM_Q) {
+    rc->max_gf_interval = FIXED_GF_INTERVAL;
+    rc->min_gf_interval = FIXED_GF_INTERVAL;
+    rc->static_scene_max_gf_interval = FIXED_GF_INTERVAL;
+    return;
+  }
+
+  // Start from the configured limits; a configured value of 0 selects the
+  // computed default.
+  rc->max_gf_interval = oxcf->max_gf_interval;
+  rc->min_gf_interval = oxcf->min_gf_interval;
+  if (!rc->min_gf_interval) {
+    rc->min_gf_interval = av1_rc_get_default_min_gf_interval(
+        oxcf->width, oxcf->height, cpi->framerate);
+  }
+  if (!rc->max_gf_interval) {
+    rc->max_gf_interval = av1_rc_get_default_max_gf_interval(
+        cpi->framerate, rc->min_gf_interval);
+  }
+
+  // Extended interval for genuinely static scenes
+  rc->static_scene_max_gf_interval = MAX_LAG_BUFFERS * 2;
+  if (is_altref_enabled(cpi)) {
+    rc->static_scene_max_gf_interval =
+        AOMMIN(rc->static_scene_max_gf_interval, oxcf->lag_in_frames - 1);
+  }
+
+  rc->max_gf_interval =
+      AOMMIN(rc->max_gf_interval, rc->static_scene_max_gf_interval);
+
+  // Clamp min to max
+  rc->min_gf_interval = AOMMIN(rc->min_gf_interval, rc->max_gf_interval);
+}
+
+// Recomputes the per-frame bandwidth figures (average, minimum, maximum) in
+// cpi->rc from the configured target bandwidth and the current frame rate,
+// then refreshes the golden-frame interval limits.
+void av1_rc_update_framerate(AV1_COMP *cpi) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+  RATE_CONTROL *const rc = &cpi->rc;
+  int vbr_max_bits;
+
+  rc->avg_frame_bandwidth = (int)(oxcf->target_bandwidth / cpi->framerate);
+  // Widen to 64 bits before multiplying, exactly as done for vbr_max_bits
+  // below: avg_frame_bandwidth * percentage can overflow a 32-bit int at
+  // high bitrates.
+  rc->min_frame_bandwidth =
+      (int)(((int64_t)rc->avg_frame_bandwidth * oxcf->two_pass_vbrmin_section) /
+            100);
+
+  rc->min_frame_bandwidth =
+      AOMMAX(rc->min_frame_bandwidth, FRAME_OVERHEAD_BITS);
+
+  // A maximum bitrate for a frame is defined.
+  // The baseline for this aligns with HW implementations that
+  // can support decode of 1080P content up to a bitrate of MAX_MB_RATE bits
+  // per 16x16 MB (averaged over a frame). However this limit is extended if
+  // a very high rate is given on the command line or the rate cannot
+  // be achieved because of a user specified max q (e.g. when the user
+  // specifies lossless encode).
+  vbr_max_bits =
+      (int)(((int64_t)rc->avg_frame_bandwidth * oxcf->two_pass_vbrmax_section) /
+            100);
+  rc->max_frame_bandwidth =
+      AOMMAX(AOMMAX((cm->MBs * MAX_MB_RATE), MAXRATE_1080P), vbr_max_bits);
+
+  av1_rc_set_gf_interval_range(cpi, rc);
+}
+
+#define VBR_PCT_ADJUSTMENT_LIMIT 50
+// For VBR: adjusts *this_frame_target based on the accumulated rate error
+// from previous frames. The per-frame correction is limited to
+// VBR_PCT_ADJUSTMENT_LIMIT percent of the target, damped by how far through
+// the clip we are.
+static void vbr_rate_correction(AV1_COMP *cpi, int *this_frame_target) {
+  RATE_CONTROL *const rc = &cpi->rc;
+  const int64_t bits_off_target = rc->vbr_bits_off_target;
+  double clip_position = 1.0;
+  int max_delta;
+  int one_frame_bits;
+  int fast_extra_bits;
+
+  // How far through the clip are we.
+  // This number is used to damp the per frame rate correction.
+  // Range 0 - 1.0
+  if (cpi->twopass.total_stats.count != 0.) {
+    clip_position = sqrt((double)cpi->common.current_video_frame /
+                         cpi->twopass.total_stats.count);
+  }
+  max_delta = (int)(clip_position *
+                    ((*this_frame_target * VBR_PCT_ADJUSTMENT_LIMIT) / 100));
+
+  if (bits_off_target > 0) {
+    // A positive error means we have extra bits to spend.
+    if (bits_off_target > max_delta) {
+      *this_frame_target += max_delta;
+    } else {
+      *this_frame_target += (int)bits_off_target;
+    }
+  } else {
+    if (bits_off_target < -max_delta) {
+      *this_frame_target -= max_delta;
+    } else {
+      *this_frame_target -= (int)-bits_off_target;
+    }
+  }
+
+  // Fast redistribution of bits arising from massive local undershoot.
+  // Don't do it for kf, arf, gf or overlay frames.
+  if (frame_is_kf_gf_arf(cpi) || rc->is_src_frame_alt_ref ||
+      !rc->vbr_bits_off_target_fast) {
+    return;
+  }
+
+  one_frame_bits = AOMMAX(rc->avg_frame_bandwidth, *this_frame_target);
+  fast_extra_bits = (int)AOMMIN(rc->vbr_bits_off_target_fast, one_frame_bits);
+  fast_extra_bits = (int)AOMMIN(
+      fast_extra_bits,
+      AOMMAX(one_frame_bits / 8, rc->vbr_bits_off_target_fast / 8));
+  *this_frame_target += (int)fast_extra_bits;
+  rc->vbr_bits_off_target_fast -= fast_extra_bits;
+}
+
+// Sets the frame target from rc->base_frame_target, first applying the VBR
+// rate correction when the rate control mode is AOM_VBR or AOM_CQ.
+void av1_set_target_rate(AV1_COMP *cpi) {
+  int target_rate = cpi->rc.base_frame_target;
+
+  switch (cpi->oxcf.rc_mode) {
+    case AOM_VBR:
+    case AOM_CQ:
+      // Correction to rate target based on prior over or under shoot.
+      vbr_rate_correction(cpi, &target_rate);
+      break;
+    default: break;
+  }
+  av1_rc_set_frame_target(cpi, target_rate);
+}
+
+/* Check if we should resize, based on buffer underflow and average QP over a
+   window of past frames. */
+/* Only allow for resize at most one scale down for now, scaling factor is 2.
+   Returns 1 when the decision is to scale down, -1 when it is to scale back
+   up, and 0 when no resize is triggered (always 0 on a key frame).
+   Also resets cpi->resize_scale_num/den to 1/1 on entry and sets them to 1/2
+   whenever a resize (in either direction) is triggered. */
+int av1_resize_one_pass_cbr(AV1_COMP *cpi) {
+ const AV1_COMMON *const cm = &cpi->common;
+ RATE_CONTROL *const rc = &cpi->rc;
+ int resize_now = 0;
+ cpi->resize_scale_num = 1;
+ cpi->resize_scale_den = 1;
+ // Don't resize on key frame; reset the QP sum and sample count on key frame.
+ if (cm->frame_type == KEY_FRAME) {
+ cpi->resize_avg_qp = 0;
+ cpi->resize_count = 0;
+ return 0;
+ }
+ // Resize based on average buffer underflow and QP over some window.
+ // Ignore samples close to key frame, since QP is usually high after key.
+ if (cpi->rc.frames_since_key > 2 * cpi->framerate) {
+ const int window = (int)(5 * cpi->framerate);
+ cpi->resize_avg_qp += cm->base_qindex;
+ if (cpi->rc.buffer_level < (int)(30 * rc->optimal_buffer_level / 100))
+ ++cpi->resize_buffer_underflow;
+ ++cpi->resize_count;
+ // Check for resize action every "window" frames.
+ if (cpi->resize_count >= window) {
+ int avg_qp = cpi->resize_avg_qp / cpi->resize_count;
+ // Resize down if the buffer level was below 30% of optimal in more than a
+ // quarter of the window's frames, and we are at original resolution.
+ // Resize back up if average QP is below 40% of worst_quality, and we are
+ // currently in a resized down state.
+ if (cpi->resize_state == 0 &&
+ cpi->resize_buffer_underflow > (cpi->resize_count >> 2)) {
+ resize_now = 1;
+ cpi->resize_state = 1;
+ } else if (cpi->resize_state == 1 &&
+ avg_qp < 40 * cpi->rc.worst_quality / 100) {
+ resize_now = -1;
+ cpi->resize_state = 0;
+ }
+ // Reset for next window measurement.
+ cpi->resize_avg_qp = 0;
+ cpi->resize_count = 0;
+ cpi->resize_buffer_underflow = 0;
+ }
+ }
+ // If decision is to resize, reset some quantities, and check if we should
+ // reduce rate correction factor.
+ if (resize_now != 0) {
+ int target_bits_per_frame;
+ int active_worst_quality;
+ int qindex;
+ int tot_scale_change;
+ // For now, resize is by 1/2 x 1/2, so tot_scale_change evaluates to 4.
+ cpi->resize_scale_num = 1;
+ cpi->resize_scale_den = 2;
+ tot_scale_change = (cpi->resize_scale_den * cpi->resize_scale_den) /
+ (cpi->resize_scale_num * cpi->resize_scale_num);
+ // Reset buffer level to optimal, update target size.
+ rc->buffer_level = rc->optimal_buffer_level;
+ rc->bits_off_target = rc->optimal_buffer_level;
+ rc->this_frame_target = calc_pframe_target_size_one_pass_cbr(cpi);
+ // Reset cyclic refresh parameters.
+ if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled)
+ av1_cyclic_refresh_reset_resize(cpi);
+ // Get the projected qindex, based on the scaled target frame size (scaled
+ // so target_bits_per_mb in av1_rc_regulate_q will be correct target).
+ target_bits_per_frame = (resize_now == 1)
+ ? rc->this_frame_target * tot_scale_change
+ : rc->this_frame_target / tot_scale_change;
+ active_worst_quality = calc_active_worst_quality_one_pass_cbr(cpi);
+ qindex = av1_rc_regulate_q(cpi, target_bits_per_frame, rc->best_quality,
+ active_worst_quality);
+ // If resize is down, check if projected q index is close to worst_quality,
+ // and if so, reduce the rate correction factor (since likely can afford
+ // lower q for resized frame).
+ if (resize_now == 1 && qindex > 90 * cpi->rc.worst_quality / 100) {
+ rc->rate_correction_factors[INTER_NORMAL] *= 0.85;
+ }
+ // If resize is back up, check if projected q index is too much above the
+ // current base_qindex, and if so, reduce the rate correction factor
+ // (since prefer to keep q for resized frame at least close to previous q).
+ if (resize_now == -1 && qindex > 130 * cm->base_qindex / 100) {
+ rc->rate_correction_factors[INTER_NORMAL] *= 0.9;
+ }
+ }
+ return resize_now;
+}
diff --git a/third_party/aom/av1/encoder/ratectrl.h b/third_party/aom/av1/encoder/ratectrl.h
new file mode 100644
index 0000000000..93a9b49397
--- /dev/null
+++ b/third_party/aom/av1/encoder/ratectrl.h
@@ -0,0 +1,284 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_ENCODER_RATECTRL_H_
+#define AV1_ENCODER_RATECTRL_H_
+
+#include "aom/aom_codec.h"
+#include "aom/aom_integer.h"
+
+#include "av1/common/blockd.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Bits Per MB at different Q (Multiplied by 512)
+#define BPER_MB_NORMBITS 9
+
+#define MIN_GF_INTERVAL 4
+#define MAX_GF_INTERVAL 16
+#define FIXED_GF_INTERVAL 8 // Used in some testing modes only
+
+#if CONFIG_EXT_REFS  // This variant has one extra level, INTER_LOW.
+typedef enum {
+ INTER_NORMAL = 0,
+ INTER_LOW = 1,
+ INTER_HIGH = 2,
+ GF_ARF_LOW = 3,
+ GF_ARF_STD = 4,
+ KF_STD = 5,
+ RATE_FACTOR_LEVELS = 6  // Count of levels; used as an array dimension.
+} RATE_FACTOR_LEVEL;
+#else
+typedef enum {
+ INTER_NORMAL = 0,
+ INTER_HIGH = 1,
+ GF_ARF_LOW = 2,
+ GF_ARF_STD = 3,
+ KF_STD = 4,
+ RATE_FACTOR_LEVELS = 5  // Count of levels; used as an array dimension.
+} RATE_FACTOR_LEVEL;
+#endif // CONFIG_EXT_REFS
+
+// Internal frame scaling level.
+typedef enum {
+ UNSCALED = 0, // Frame is unscaled.
+ SCALE_STEP1 = 1, // First-level down-scaling.
+ FRAME_SCALE_STEPS // Number of scaling levels; used as an array dimension.
+} FRAME_SCALE_LEVEL;
+
+// Frame dimensions multiplier wrt the native frame size, in 1/16ths,
+// specified for the scale-up case (one entry per FRAME_SCALE_LEVEL).
+// e.g. 24 => 16/24 = 2/3 of native size. The restriction to 1/16th is
+// intended to match the capabilities of the normative scaling filters,
+// giving precedence to the up-scaling accuracy.
+static const int frame_scale_factor[FRAME_SCALE_STEPS] = { 16, 24 };
+
+// Multiplier of the target rate to be used as threshold for triggering scaling.
+static const double rate_thresh_mult[FRAME_SCALE_STEPS] = { 1.0, 2.0 };
+
+// Scale dependent Rate Correction Factor multipliers. Compensates for the
+// greater number of bits per pixel generated in down-scaled frames.
+static const double rcf_mult[FRAME_SCALE_STEPS] = { 1.0, 2.0 };
+
+typedef struct {
+ // Rate targeting variables
+ int base_frame_target; // A baseline frame target before adjustment
+ // for previous under or over shoot.
+ int this_frame_target; // Actual frame target after rc adjustment.
+ int projected_frame_size;
+ int sb64_target_rate;
+ int last_q[FRAME_TYPES]; // Separate values for Intra/Inter
+ int last_boosted_qindex; // Last boosted GF/KF/ARF q
+ int last_kf_qindex; // Q index of the last key frame coded.
+
+ int gfu_boost;
+ int last_boost;
+ int kf_boost;
+
+ double rate_correction_factors[RATE_FACTOR_LEVELS]; // One per RATE_FACTOR_LEVEL.
+
+ int frames_since_golden;
+ int frames_till_gf_update_due; // 0 triggers a golden update in 1-pass CBR; kept <= frames_to_key.
+ int min_gf_interval; // Set by av1_rc_set_gf_interval_range().
+ int max_gf_interval; // Set by av1_rc_set_gf_interval_range().
+ int static_scene_max_gf_interval; // Upper bound applied to max_gf_interval.
+ int baseline_gf_interval;
+ int constrained_gf_group;
+ int frames_to_key; // 0 makes the next 1-pass CBR frame a key frame.
+ int frames_since_key;
+ int this_key_frame_forced; // Key frame caused by frames_to_key reaching 0 (not the first frame).
+ int next_key_frame_forced;
+ int source_alt_ref_pending;
+ int source_alt_ref_active;
+ int is_src_frame_alt_ref;
+
+#if CONFIG_EXT_REFS
+ // Length of the bi-predictive frame group interval
+ int bipred_group_interval;
+
+ // NOTE: Different types of frames may have different bits allocated
+ // accordingly, aiming to achieve the overall optimal RD performance.
+ int is_bwd_ref_frame;
+ int is_last_bipred_frame;
+ int is_bipred_frame;
+ int is_src_frame_ext_arf;
+#endif // CONFIG_EXT_REFS
+
+ int avg_frame_bandwidth; // Average frame size target for clip
+ int min_frame_bandwidth; // Minimum allocation used for any frame
+ int max_frame_bandwidth; // Maximum burst rate allowed for a frame.
+
+ int ni_av_qi;
+ int ni_tot_qi;
+ int ni_frames;
+ int avg_frame_qindex[FRAME_TYPES];
+ double tot_q;
+ double avg_q;
+
+ int64_t buffer_level;
+ int64_t bits_off_target;
+ int64_t vbr_bits_off_target; // > 0 means there are extra bits to spend.
+ int64_t vbr_bits_off_target_fast; // Drained by the fast redistribution in the VBR correction.
+
+ int decimation_factor;
+ int decimation_count;
+
+ int rolling_target_bits;
+ int rolling_actual_bits;
+
+ int long_rolling_target_bits;
+ int long_rolling_actual_bits;
+
+ int rate_error_estimate;
+
+ int64_t total_actual_bits;
+ int64_t total_target_bits;
+ int64_t total_target_vs_actual;
+
+ int worst_quality;
+ int best_quality;
+
+ int64_t starting_buffer_level;
+ int64_t optimal_buffer_level;
+ int64_t maximum_buffer_size;
+
+ // rate control history for last frame(1) and the frame before(2).
+ // -1: undershot
+ // 1: overshoot
+ // 0: not initialized.
+ int rc_1_frame;
+ int rc_2_frame;
+ int q_1_frame;
+ int q_2_frame;
+
+ // Auto frame-scaling variables.
+ FRAME_SCALE_LEVEL frame_size_selector;
+ FRAME_SCALE_LEVEL next_frame_size_selector;
+ int frame_width[FRAME_SCALE_STEPS];
+ int frame_height[FRAME_SCALE_STEPS];
+ int rf_level_maxq[RATE_FACTOR_LEVELS];
+} RATE_CONTROL;
+
+struct AV1_COMP;
+struct AV1EncoderConfig;
+
+void av1_rc_init(const struct AV1EncoderConfig *oxcf, int pass,
+ RATE_CONTROL *rc);
+
+int av1_estimate_bits_at_q(FRAME_TYPE frame_kind, int q, int mbs,
+ double correction_factor, aom_bit_depth_t bit_depth);
+
+double av1_convert_qindex_to_q(int qindex, aom_bit_depth_t bit_depth);
+
+void av1_rc_init_minq_luts(void);
+
+int av1_rc_get_default_min_gf_interval(int width, int height, double framerate);
+// Note av1_rc_get_default_max_gf_interval() requires the min_gf_interval to
+// be passed in to ensure that the max_gf_interval returned is at least as big
+// as that.
+int av1_rc_get_default_max_gf_interval(double framerate, int min_frame_rate);
+
+// Generally at the high level, the following flow is expected
+// to be enforced for rate control:
+// First call per frame, one of:
+// av1_rc_get_one_pass_vbr_params()
+// av1_rc_get_one_pass_cbr_params()
+// av1_rc_get_first_pass_params()
+// av1_rc_get_second_pass_params()
+// depending on the usage to set the rate control encode parameters desired.
+//
+// Then, call encode_frame_to_data_rate() to perform the
+// actual encode. This function will in turn call encode_frame()
+// one or more times, followed by one of:
+// av1_rc_postencode_update()
+// av1_rc_postencode_update_drop_frame()
+//
+// The majority of rate control parameters are only expected
+// to be set in the av1_rc_get_..._params() functions and
+// updated during the av1_rc_postencode_update...() functions.
+// The only exceptions are av1_rc_drop_frame() and
+// av1_rc_update_rate_correction_factors() functions.
+
+// Functions to set parameters for encoding before the actual
+// encode_frame_to_data_rate() function.
+void av1_rc_get_one_pass_vbr_params(struct AV1_COMP *cpi);
+void av1_rc_get_one_pass_cbr_params(struct AV1_COMP *cpi);
+
+// Post encode update of the rate control parameters based
+// on bytes used
+void av1_rc_postencode_update(struct AV1_COMP *cpi, uint64_t bytes_used);
+// Post encode update of the rate control parameters for dropped frames
+void av1_rc_postencode_update_drop_frame(struct AV1_COMP *cpi);
+
+// Updates rate correction factors
+// Changes only the rate correction factors in the rate control structure.
+void av1_rc_update_rate_correction_factors(struct AV1_COMP *cpi);
+
+// Decide if we should drop this frame: For 1-pass CBR.
+// Changes only the decimation count in the rate control structure
+int av1_rc_drop_frame(struct AV1_COMP *cpi);
+
+// Computes frame size bounds.
+void av1_rc_compute_frame_size_bounds(const struct AV1_COMP *cpi,
+ int this_frame_target,
+ int *frame_under_shoot_limit,
+ int *frame_over_shoot_limit);
+
+// Picks q and q bounds given the target for bits
+int av1_rc_pick_q_and_bounds(const struct AV1_COMP *cpi, int *bottom_index,
+ int *top_index);
+
+// Estimates q to achieve a target bits per frame
+int av1_rc_regulate_q(const struct AV1_COMP *cpi, int target_bits_per_frame,
+ int active_best_quality, int active_worst_quality);
+
+// Estimates bits per mb for a given qindex and correction factor.
+int av1_rc_bits_per_mb(FRAME_TYPE frame_type, int qindex,
+ double correction_factor, aom_bit_depth_t bit_depth);
+
+// Clamping utilities for bitrate targets for iframes and pframes.
+int av1_rc_clamp_iframe_target_size(const struct AV1_COMP *const cpi,
+ int target);
+int av1_rc_clamp_pframe_target_size(const struct AV1_COMP *const cpi,
+ int target);
+// Utility to set frame_target into the RATE_CONTROL structure
+// This function is called only from the av1_rc_get_..._params() functions.
+void av1_rc_set_frame_target(struct AV1_COMP *cpi, int target);
+
+// Computes a q delta (in "q index" terms) to get from a starting q value
+// to a target q value
+int av1_compute_qdelta(const RATE_CONTROL *rc, double qstart, double qtarget,
+ aom_bit_depth_t bit_depth);
+
+// Computes a q delta (in "q index" terms) to get from a starting q value
+// to a value that should equate to the given rate ratio.
+int av1_compute_qdelta_by_rate(const RATE_CONTROL *rc, FRAME_TYPE frame_type,
+ int qindex, double rate_target_ratio,
+ aom_bit_depth_t bit_depth);
+
+int av1_frame_type_qdelta(const struct AV1_COMP *cpi, int rf_level, int q);
+
+void av1_rc_update_framerate(struct AV1_COMP *cpi);
+
+void av1_rc_set_gf_interval_range(const struct AV1_COMP *const cpi,
+ RATE_CONTROL *const rc);
+
+void av1_set_target_rate(struct AV1_COMP *cpi);
+
+int av1_resize_one_pass_cbr(struct AV1_COMP *cpi);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AV1_ENCODER_RATECTRL_H_
diff --git a/third_party/aom/av1/encoder/ratectrl_xiph.c b/third_party/aom/av1/encoder/ratectrl_xiph.c
new file mode 100644
index 0000000000..b9f827528f
--- /dev/null
+++ b/third_party/aom/av1/encoder/ratectrl_xiph.c
@@ -0,0 +1,1244 @@
+/*
+ * Copyright (c) 2001-2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+#include "av1/common/odintrin.h"
+#include "av1/encoder/ratectrl_xiph.h"
+
+#define OD_Q57(v) ((int64_t)((uint64_t)(v) << 57))
+#define OD_F_Q45(v) ((int64_t)(((v) * ((int64_t)1 << 45))))
+#define OD_F_Q12(v) ((int32_t)(((v) * ((int32_t)1 << 12))))
+
+/*A rough lookup table for tan(x), 0 <= x < pi/2.
+ The values are Q12 fixed-point and spaced at 5 degree intervals, so entry i
+  holds tan(5*i degrees); e.g. entry 9 is 4096, i.e. tan(45 degrees) = 1.0.
+ These decisions are somewhat arbitrary, but sufficient for the 2nd order
+ Bessel follower below.
+ Values of x larger than 85 degrees are extrapolated from the last interval,
+ which is way off, but "good enough".*/
+static uint16_t OD_ROUGH_TAN_LOOKUP[18] = { 0, 358, 722, 1098, 1491,
+ 1910, 2365, 2868, 3437, 4096,
+ 4881, 5850, 7094, 8784, 11254,
+ 15286, 23230, 46817 };
+
+/*Linearly interpolates OD_ROUGH_TAN_LOOKUP at position alpha*36.
+  alpha is Q24 in the range [0,0.5).
+  The return value is 5.12.*/
+static int od_warp_alpha(int alpha) {
+  /*Table position in Q24: the integer part selects the interval, the
+     fraction is the offset inside it.*/
+  const int pos = alpha * 36;
+  int seg;
+  int lo;
+  int hi;
+  int frac;
+  int slope;
+  seg = pos >> 24;
+  /*Positions at or beyond the last table entry reuse the final interval.*/
+  if (seg >= 17) seg = 16;
+  lo = OD_ROUGH_TAN_LOOKUP[seg];
+  hi = OD_ROUGH_TAN_LOOKUP[seg + 1];
+  frac = pos - (seg << 24);
+  slope = (hi - lo) << 8;
+  return (int)((((int64_t)lo << 32) + slope * (int64_t)frac) >> 32);
+}
+
+static const int64_t OD_ATANH_LOG2[32] = { /*Per-iteration constants for the
+ CORDIC loops in od_bexp64() and od_blog64(); entries 30 and 31 are equal,
+ and iterations past 31 reuse entry 31.
+ NOTE(review): the values look like 2**(i+1)*atanh(2**-(i+1))/log(2) in Q61
+ (entry 0 is about log2(3)) - confirm against the original derivation.*/
+ 0x32B803473F7AD0F4LL, 0x2F2A71BD4E25E916LL, 0x2E68B244BB93BA06LL,
+ 0x2E39FB9198CE62E4LL, 0x2E2E683F68565C8FLL, 0x2E2B850BE2077FC1LL,
+ 0x2E2ACC58FE7B78DBLL, 0x2E2A9E2DE52FD5F2LL, 0x2E2A92A338D53EECLL,
+ 0x2E2A8FC08F5E19B6LL, 0x2E2A8F07E51A485ELL, 0x2E2A8ED9BA8AF388LL,
+ 0x2E2A8ECE2FE7384ALL, 0x2E2A8ECB4D3E4B1ALL, 0x2E2A8ECA94940FE8LL,
+ 0x2E2A8ECA6669811DLL, 0x2E2A8ECA5ADEDD6ALL, 0x2E2A8ECA57FC347ELL,
+ 0x2E2A8ECA57438A43LL, 0x2E2A8ECA57155FB4LL, 0x2E2A8ECA5709D510LL,
+ 0x2E2A8ECA5706F267LL, 0x2E2A8ECA570639BDLL, 0x2E2A8ECA57060B92LL,
+ 0x2E2A8ECA57060008LL, 0x2E2A8ECA5705FD25LL, 0x2E2A8ECA5705FC6CLL,
+ 0x2E2A8ECA5705FC3ELL, 0x2E2A8ECA5705FC33LL, 0x2E2A8ECA5705FC30LL,
+ 0x2E2A8ECA5705FC2FLL, 0x2E2A8ECA5705FC2FLL
+};
+
+static int od_ilog64(int64_t v) { /*Integer log: returns the 1-based position
+ of the highest set bit of v, and 0 for v == 0 (e.g. 1 -> 1, 2 -> 2, 4 -> 3).
+ The only caller visible here, od_blog64(), passes v > 0.*/
+ static const unsigned char OD_DEBRUIJN_IDX64[64] = {
+ 0, 1, 2, 7, 3, 13, 8, 19, 4, 25, 14, 28, 9, 34, 20, 40,
+ 5, 17, 26, 38, 15, 46, 29, 48, 10, 31, 35, 54, 21, 50, 41, 57,
+ 63, 6, 12, 18, 24, 27, 33, 39, 16, 37, 45, 47, 30, 53, 49, 56,
+ 62, 11, 23, 32, 36, 44, 52, 55, 61, 22, 43, 51, 60, 42, 59, 58
+ };
+ int ret;
+ v |= v >> 1; /*Smear the highest set bit into every lower position.*/
+ v |= v >> 2;
+ v |= v >> 4;
+ v |= v >> 8;
+ v |= v >> 16;
+ v |= v >> 32;
+ ret = (int)v & 1; /*1 for any non-zero v, 0 for v == 0.*/
+ v = (v >> 1) + 1; /*Now a single power of two (or 1 when v was 0 or 1).*/
+ ret += OD_DEBRUIJN_IDX64[v * UINT64_C(0x218A392CD3D5DBF) >> 58 & 0x3F]; /*Top 6 bits of the product index the table.*/
+ return ret;
+}
+
+/*Computes the binary exponential of logq57.
+ input: a log base 2 in Q57 format
+ output: a 64 bit integer in Q0 (no fraction)
+ Inputs whose integer part is negative yield 0, and inputs whose integer part
+  is 63 or more saturate to 0x7FFFFFFFFFFFFFFF.
+ For integer parts below 62 the final shift rounds the result to nearest.*/
+static int64_t od_bexp64(int64_t logq57) {
+ int64_t w;
+ int64_t z;
+ int ipart;
+ ipart = (int)(logq57 >> 57);
+ if (ipart < 0) return 0;
+ if (ipart >= 63) return 0x7FFFFFFFFFFFFFFFLL;
+ z = logq57 - OD_Q57(ipart);
+ if (z) {
+ int64_t mask;
+ int64_t wlo;
+ int i;
+ /*C doesn't give us 64x64->128 muls, so we use CORDIC.
+ This is not particularly fast, but it's not being used in time-critical
+ code; it is very accurate.*/
+ /*z is the fractional part of the log in Q62 format.
+ We need 1 bit of headroom since the magnitude can get larger than 1
+ during the iteration, and a sign bit.*/
+ z <<= 5;
+ /*w is the exponential in Q61 format (since it also needs headroom and can
+ get as large as 2.0); we could get another bit if we dropped the sign,
+ but we'll recover that bit later anyway.
+ Ideally this should start out as
+ \lim_{n->\infty} 2^{61}/\product_{i=1}^n \sqrt{1-2^{-2i}}
+ but in order to guarantee convergence we have to repeat iterations 4,
+ 13 (=3*4+1), and 40 (=3*13+1, etc.), so it winds up somewhat larger.*/
+ w = 0x26A3D0E401DD846DLL;
+ for (i = 0;; i++) {
+ mask = -(z < 0); /*All ones when z is negative, else 0; (v + mask) ^ mask then negates v.*/
+ w += ((w >> (i + 1)) + mask) ^ mask;
+ z -= (OD_ATANH_LOG2[i] + mask) ^ mask;
+ /*Repeat iteration 4.*/
+ if (i >= 3) break;
+ z *= 2;
+ }
+ for (;; i++) {
+ mask = -(z < 0);
+ w += ((w >> (i + 1)) + mask) ^ mask;
+ z -= (OD_ATANH_LOG2[i] + mask) ^ mask;
+ /*Repeat iteration 13.*/
+ if (i >= 12) break;
+ z *= 2;
+ }
+ for (; i < 32; i++) {
+ mask = -(z < 0);
+ w += ((w >> (i + 1)) + mask) ^ mask;
+ z = (z - ((OD_ATANH_LOG2[i] + mask) ^ mask)) * 2;
+ }
+ wlo = 0;
+ /*Skip the remaining iterations unless we really require that much
+ precision.
+ We could have bailed out earlier for smaller iparts, but that would
+ require initializing w from a table, as the limit doesn't converge to
+ 61-bit precision until n=30.*/
+ if (ipart > 30) {
+ /*For these iterations, we just update the low bits, as the high bits
+ can't possibly be affected.
+ OD_ATANH_LOG2 has also converged (it actually did so one iteration
+ earlier, but that's no reason for an extra special case).*/
+ for (;; i++) {
+ mask = -(z < 0);
+ wlo += ((w >> i) + mask) ^ mask;
+ z -= (OD_ATANH_LOG2[31] + mask) ^ mask;
+ /*Repeat iteration 40.*/
+ if (i >= 39) break;
+ z <<= 1;
+ }
+ for (; i < 61; i++) {
+ mask = -(z < 0);
+ wlo += ((w >> i) + mask) ^ mask;
+ z = (z - ((OD_ATANH_LOG2[31] + mask) ^ mask)) << 1;
+ }
+ }
+ w = (w << 1) + wlo;
+ } else {
+ w = (int64_t)1 << 62; /*Zero fractional part: the result is an exact power of two.*/
+ }
+ if (ipart < 62) {
+ w = ((w >> (61 - ipart)) + 1) >> 1;
+ }
+ return w;
+}
+
+/*Computes the binary log of w
+ input: a 64-bit integer in Q0 (no fraction)
+ output: a 64-bit log in Q57
+ Returns -1 when w <= 0.
+ When w is an exact power of two the CORDIC loop is skipped and the result is
+  just the integer part.*/
+static int64_t od_blog64(int64_t w) {
+ int64_t z;
+ int ipart;
+ if (w <= 0) return -1;
+ ipart = od_ilog64(w) - 1;
+ if (ipart > 61) { /*Normalize w so that its top set bit is bit 61.*/
+ w >>= ipart - 61;
+ } else {
+ w <<= 61 - ipart;
+ }
+ z = 0;
+ if (w & (w - 1)) {
+ int64_t x;
+ int64_t y;
+ int64_t u;
+ int64_t mask;
+ int i;
+ /*C doesn't give us 64x64->128 muls, so we use CORDIC.
+ This is not particularly fast, but it's not being used in time-critical
+ code; it is very accurate.*/
+ /*z is the fractional part of the log in Q61 format.*/
+ /*x and y are the cosh() and sinh(), respectively, in Q61 format.
+ We are computing z = 2*atanh(y/x) = 2*atanh((w - 1)/(w + 1)).*/
+ x = w + ((int64_t)1 << 61);
+ y = w - ((int64_t)1 << 61);
+ for (i = 0; i < 4; i++) {
+ mask = -(y < 0); /*All ones when y is negative, else 0; (v + mask) ^ mask then negates v.*/
+ z += ((OD_ATANH_LOG2[i] >> i) + mask) ^ mask;
+ u = x >> (i + 1);
+ x -= ((y >> (i + 1)) + mask) ^ mask;
+ y -= (u + mask) ^ mask;
+ }
+ /*Repeat iteration 4.*/
+ for (i--; i < 13; i++) {
+ mask = -(y < 0);
+ z += ((OD_ATANH_LOG2[i] >> i) + mask) ^ mask;
+ u = x >> (i + 1);
+ x -= ((y >> (i + 1)) + mask) ^ mask;
+ y -= (u + mask) ^ mask;
+ }
+ /*Repeat iteration 13.*/
+ for (i--; i < 32; i++) {
+ mask = -(y < 0);
+ z += ((OD_ATANH_LOG2[i] >> i) + mask) ^ mask;
+ u = x >> (i + 1);
+ x -= ((y >> (i + 1)) + mask) ^ mask;
+ y -= (u + mask) ^ mask;
+ }
+ /*OD_ATANH_LOG2 has converged.*/
+ for (; i < 40; i++) {
+ mask = -(y < 0);
+ z += ((OD_ATANH_LOG2[31] >> i) + mask) ^ mask;
+ u = x >> (i + 1);
+ x -= ((y >> (i + 1)) + mask) ^ mask;
+ y -= (u + mask) ^ mask;
+ }
+ /*Repeat iteration 40.*/
+ for (i--; i < 62; i++) {
+ mask = -(y < 0);
+ z += ((OD_ATANH_LOG2[31] >> i) + mask) ^ mask;
+ u = x >> (i + 1);
+ x -= ((y >> (i + 1)) + mask) ^ mask;
+ y -= (u + mask) ^ mask;
+ }
+ z = (z + 8) >> 4; /*Round while converting z from Q61 to Q57.*/
+ }
+ return OD_Q57(ipart) + z;
+}
+
+/*Converts a Q57 value to Q24, rounding to nearest and clamping the result to
+   the 32-bit signed range.
+  in: input in Q57 format.
+  Return: same number in Q24 */
+static int32_t od_q57_to_q24(int64_t in) {
+  /*Adding 2**32 before dropping 33 bits rounds to nearest.*/
+  const int64_t q24 = (in + ((int64_t)1 << 32)) >> 33;
+  /*0x80000000 is automatically converted to unsigned on 32-bit systems.
+    -0x7FFFFFFF-1 is needed to avoid "promoting" the whole expression to
+     unsigned.*/
+  const int64_t clamped = OD_CLAMPI(-0x7FFFFFFF - 1, q24, 0x7FFFFFFF);
+  return (int32_t)clamped;
+}
+
+/*Binary exponential of log_scale with 24-bit fractional precision and
+   saturation.
+  log_scale: A binary logarithm in Q57 format.
+  Return: The binary exponential in Q24 format, saturated to 2**31-1 if
+   log_scale was too large.*/
+static int32_t od_bexp64_q24(int64_t log_scale) {
+  int64_t scaled;
+  /*Anything at or above 2**8 cannot fit in Q24 within 31 bits.*/
+  if (log_scale >= OD_Q57(8)) return 0x7FFFFFFF;
+  /*Adding 24 to the log multiplies the result by 2**24, giving Q24.*/
+  scaled = od_bexp64(log_scale + OD_Q57(24));
+  if (scaled < 0x7FFFFFFF) return (int32_t)scaled;
+  return 0x7FFFFFFF;
+}
+
+/*Re-initialize Bessel filter coefficients with the specified delay.
+ This does not alter the x/y state, but changes the reaction time of the
+ filter.
+ Altering the time constant of a reactive filter without altering internal
+ state is something that has to be done carefully, but our design operates at
+ high enough delays and with small enough time constant changes to make it
+ safe.
+ delay must be non-zero: it is used as a divisor when computing alpha.
+ Only f->c[0], f->c[1] and f->g are written.*/
+static void od_iir_bessel2_reinit(od_iir_bessel2 *f, int delay) {
+ int alpha;
+ int64_t one48;
+ int64_t warp;
+ int64_t k1;
+ int64_t k2;
+ int64_t d;
+ int64_t a;
+ int64_t ik2;
+ int64_t b1;
+ int64_t b2;
+ /*This borrows some code from an unreleased version of Postfish.
+ See the recipe at http://unicorn.us.com/alex/2polefilters.html for details
+ on deriving the filter coefficients.*/
+ /*alpha is Q24*/
+ alpha = (1 << 24) / delay;
+ one48 = (int64_t)1 << 48;
+ /*warp is 7.12; it is kept at 1 or more so that k2 below is never zero.*/
+ warp = OD_MAXI(od_warp_alpha(alpha), 1);
+ /*k1 is 9.12*/
+ k1 = 3 * warp;
+ /*k2 is 16.24.*/
+ k2 = k1 * warp;
+ /*d is 16.15.*/
+ d = ((((1 << 12) + k1) << 12) + k2 + 256) >> 9;
+ /*a is 0.32, since d is larger than both 1.0 and k2.*/
+ a = (k2 << 23) / d;
+ /*ik2 is 25.24.*/
+ ik2 = one48 / k2;
+ /*b1 is Q56; in practice, the integer ranges between -2 and 2.*/
+ b1 = 2 * a * (ik2 - (1 << 24));
+ /*b2 is Q56; in practice, the integer ranges between -2 and 2.*/
+ b2 = (one48 << 8) - ((4 * a) << 24) - b1;
+ /*All of the filter parameters are Q24.*/
+ f->c[0] = (int32_t)((b1 + ((int64_t)1 << 31)) >> 32);
+ f->c[1] = (int32_t)((b2 + ((int64_t)1 << 31)) >> 32);
+ f->g = (int32_t)((a + 128) >> 8);
+}
+
+/*Initialize a 2nd order low-pass Bessel filter with the corresponding delay
+   and initial value: the coefficients are derived from delay, and both input
+   and both output history slots are preloaded with value.
+  value is Q24.*/
+static void od_iir_bessel2_init(od_iir_bessel2 *f, int delay, int32_t value) {
+  od_iir_bessel2_reinit(f, delay);
+  f->x[0] = value;
+  f->x[1] = value;
+  f->y[0] = value;
+  f->y[1] = value;
+}
+
+/*Feeds one new input sample x through the filter, shifts the stored input
+   and output history, and returns the new output (before it is truncated to
+   32 bits for storage in the history).*/
+static int64_t od_iir_bessel2_update(od_iir_bessel2 *f, int32_t x) {
+  const int64_t gain = f->g;
+  const int64_t coef0 = f->c[0];
+  const int64_t coef1 = f->c[1];
+  const int64_t in_prev0 = f->x[0];
+  const int64_t in_prev1 = f->x[1];
+  const int64_t out_prev0 = f->y[0];
+  const int64_t out_prev1 = f->y[1];
+  int64_t acc;
+  /*Input term, then the two feedback terms, then the rounding offset for
+     the shift by 24.*/
+  acc = (x + in_prev0 * 2 + in_prev1) * gain;
+  acc += out_prev0 * coef0;
+  acc += out_prev1 * coef1;
+  acc += (1 << 23);
+  acc >>= 24;
+  f->x[1] = (int32_t)in_prev0;
+  f->x[0] = x;
+  f->y[1] = (int32_t)out_prev0;
+  f->y[0] = (int32_t)acc;
+  return acc;
+}
+
+/*Puts the rate controller back into its initial state for the currently
+   configured bitrate, frame rate and frame size.
+  The bit reservoir, the rate model (exp and log_scale), the frame and drop
+   counters and the scale/drop followers are all re-seeded.*/
+static void od_enc_rc_reset(od_rc_state *rc) {
+  int64_t pixel_count;
+  int64_t inv_bpp;
+  int intra_exp;
+  int intra_scale;
+  int inter_exp;
+  int inter_scale;
+  int subtype;
+  /*Per-frame bit budget, kept within [32, 0x400000000000]: insane framerates
+     or frame sizes mean insane bitrates, so let's not get carried away.*/
+  rc->bits_per_frame = (int64_t)(rc->target_bitrate / rc->framerate);
+  if (rc->bits_per_frame > 0x400000000000LL) {
+    rc->bits_per_frame = (int64_t)0x400000000000LL;
+  } else if (rc->bits_per_frame < 32) {
+    rc->bits_per_frame = 32;
+  }
+  /*The reservoir never spans fewer than 12 frames.*/
+  rc->reservoir_frame_delay = OD_MAXI(rc->reservoir_frame_delay, 12);
+  rc->reservoir_max = rc->bits_per_frame * rc->reservoir_frame_delay;
+  /*Buffer fullness and the fullness target both start at 50%.*/
+  rc->reservoir_target = (rc->reservoir_max + 1) >> 1;
+  rc->reservoir_fullness = rc->reservoir_target;
+  /*Pick exponents and initial scales for quantizer selection from the number
+     of pixels per budgeted bit.
+    All of these initial scale/exp values are from Theora, and have not yet
+     been adapted to Daala, so they're certainly wrong.
+    The B-frame values especially are simply copies of the P-frame values.*/
+  pixel_count = rc->frame_width * (int64_t)rc->frame_height;
+  rc->log_npixels = od_blog64(pixel_count);
+  inv_bpp = pixel_count / rc->bits_per_frame;
+  if (inv_bpp < 1) {
+    intra_exp = 59;
+    intra_scale = 1997;
+  } else if (inv_bpp < 2) {
+    intra_exp = 55;
+    intra_scale = 1604;
+  } else {
+    intra_exp = 48;
+    intra_scale = 834;
+  }
+  if (inv_bpp < 4) {
+    inter_exp = 100;
+    inter_scale = 2249;
+  } else if (inv_bpp < 8) {
+    inter_exp = 95;
+    inter_scale = 1751;
+  } else {
+    inter_exp = 73;
+    inter_scale = 1260;
+  }
+  rc->exp[OD_I_FRAME] = intra_exp;
+  rc->log_scale[OD_I_FRAME] = od_blog64(intra_scale) - OD_Q57(OD_COEFF_SHIFT);
+  rc->exp[OD_P_FRAME] = inter_exp;
+  rc->log_scale[OD_P_FRAME] = od_blog64(inter_scale) - OD_Q57(OD_COEFF_SHIFT);
+  /*Golden and altref P-frames share the regular P-frame model and scale
+     follower; copies of the scale and exp values are kept under their own
+     indices purely for convenience in the rate calculation code.*/
+  rc->exp[OD_GOLDEN_P_FRAME] = rc->exp[OD_P_FRAME];
+  rc->log_scale[OD_GOLDEN_P_FRAME] = rc->log_scale[OD_P_FRAME];
+  rc->exp[OD_ALTREF_P_FRAME] = rc->exp[OD_P_FRAME];
+  rc->log_scale[OD_ALTREF_P_FRAME] = rc->log_scale[OD_P_FRAME];
+  /*The inter filter delay starts at 10 so that later incrementing the delay
+     works as designed.
+    10 is not an exact choice, but rather a good working trade-off.*/
+  rc->inter_p_delay = 10;
+  rc->inter_delay_target = rc->reservoir_frame_delay >> 1;
+  memset(rc->frame_count, 0, sizeof(rc->frame_count));
+  /*Drop-frame tracking covers every frame subtype (I, P, golden P and
+     altref P), not just the basic frame types.*/
+  for (subtype = 0; subtype < OD_FRAME_NSUBTYPES; subtype++) {
+    rc->prev_drop_count[subtype] = 0;
+    rc->log_drop_scale[subtype] = OD_Q57(0);
+  }
+  /*Set up second order followers, initialized according to corresponding
+     time constants.*/
+  od_iir_bessel2_init(&rc->scalefilter[OD_I_FRAME], 4,
+                      od_q57_to_q24(rc->log_scale[OD_I_FRAME]));
+  od_iir_bessel2_init(&rc->scalefilter[OD_P_FRAME], rc->inter_p_delay,
+                      od_q57_to_q24(rc->log_scale[OD_P_FRAME]));
+  for (subtype = 0; subtype < OD_FRAME_NSUBTYPES; subtype++) {
+    od_iir_bessel2_init(&rc->vfrfilter[subtype], 4,
+                        od_bexp64_q24(rc->log_drop_scale[subtype]));
+  }
+}
+
+/*Adapts the buffering to a changed bitrate, frame rate or buffer delay.
+  Before the first frame this is a full reset; afterwards only the buffer
+   bounds and the inter filter delay are updated, while the current
+   fullness is left alone.
+  Return: Always 0.*/
+int od_enc_rc_resize(od_rc_state *rc) {
+  int new_delay;
+  /*If encoding has not yet begun, reset the buffer state.*/
+  if (rc->cur_frame == 0) {
+    od_enc_rc_reset(rc);
+    return 0;
+  }
+  /*Recompute the per-frame budget within [32, 0x400000000000]: insane
+     framerates or frame sizes mean insane bitrates.*/
+  rc->bits_per_frame = (int64_t)(rc->target_bitrate / rc->framerate);
+  if (rc->bits_per_frame > 0x400000000000LL) {
+    rc->bits_per_frame = (int64_t)0x400000000000LL;
+  } else if (rc->bits_per_frame < 32) {
+    rc->bits_per_frame = 32;
+  }
+  rc->reservoir_frame_delay = OD_MAXI(rc->reservoir_frame_delay, 12);
+  rc->reservoir_max = rc->bits_per_frame * rc->reservoir_frame_delay;
+  /*Target half the buffer plus a quarter frame's budget for each frame of
+     the keyframe interval (capped at the buffer delay).*/
+  rc->reservoir_target =
+      ((rc->reservoir_max + 1) >> 1) +
+      ((rc->bits_per_frame + 2) >> 2) *
+          OD_MINI(rc->keyframe_rate, rc->reservoir_frame_delay);
+  /*Update the INTER-frame scale filter delay.
+    We jump to it immediately if we've already seen enough frames; otherwise
+     it is simply set as the new target.*/
+  new_delay = OD_MAXI(rc->reservoir_frame_delay >> 1, 10);
+  rc->inter_delay_target = new_delay;
+  if (new_delay < OD_MINI(rc->inter_p_delay, rc->frame_count[OD_P_FRAME])) {
+    od_iir_bessel2_init(&rc->scalefilter[OD_P_FRAME], new_delay,
+                        rc->scalefilter[OD_P_FRAME].y[0]);
+    rc->inter_p_delay = new_delay;
+  }
+  return 0;
+}
+
+/*Initializes rate control, or retargets it if it is already running.
+  rc: Rate control state; rc->framerate must already be set.
+  bitrate: Target bitrate; a value <= 0 leaves rate control inactive.
+  delay_ms: Requested buffer delay in milliseconds.
+  Return: 1 if rc->framerate is not positive, otherwise 0 (or the result of
+   od_enc_rc_resize() when the state had already been initialized).*/
+int od_enc_rc_init(od_rc_state *rc, int64_t bitrate, int delay_ms) {
+  if (rc->framerate <= 0) return 1;
+  if (rc->target_bitrate > 0) {
+    /*State has already been initialized; rather than reinitialize,
+       adjust the buffering for the new target rate.*/
+    rc->target_bitrate = bitrate;
+    return od_enc_rc_resize(rc);
+  }
+  rc->target_quantizer = 0;
+  rc->target_bitrate = bitrate;
+  rc->rate_bias = 0;
+  if (bitrate > 0) {
+    /*The buffer size is clamped to [12, 256] frames: short enough to allow
+       reaction, but long enough to allow looking into the next GOP (avoiding
+       the case where the last frames before an I-frame get starved).
+      The 12 frame minimum (applied by od_enc_rc_reset()) gives us some
+       chance to distribute bit estimation errors in the worst case.
+      The 256 frame maximum means we'll require 8-10 seconds of
+       pre-buffering at 24-30 fps, which is not unreasonable.
+      Multiply before dividing by 1000 so that the sub-second part of
+       delay_ms is not truncated away before it is converted to frames.*/
+    rc->reservoir_frame_delay =
+        (int)OD_MINI((delay_ms * rc->framerate) / 1000, 256);
+    rc->drop_frames = 1;
+    rc->cap_overflow = 1;
+    rc->cap_underflow = 0;
+    rc->twopass_state = 0;
+    od_enc_rc_reset(rc);
+  }
+  return 0;
+}
+
+/*Scale the number of frames by the number of expected drops/duplicates.
+  frame_type: Frame subtype whose drop statistics are consulted.
+  nframes: Planned number of frames of that subtype.
+  Return: nframes unchanged when no drops are recorded or expected for the
+   subtype, otherwise the reduced count.*/
+static int od_rc_scale_drop(od_rc_state *rc, int frame_type, int nframes) {
+  int64_t dup_scale_q8;
+  int dup_scale_int;
+  if (rc->prev_drop_count[frame_type] <= 0 &&
+      rc->log_drop_scale[frame_type] <= OD_Q57(0)) {
+    return nframes;
+  }
+  /*Average (in the log domain) the filtered drop scale with the current
+     drop run; adding OD_Q57(8) yields a scale with 8 fractional bits.*/
+  dup_scale_q8 = od_bexp64(((rc->log_drop_scale[frame_type] +
+                             od_blog64(rc->prev_drop_count[frame_type] + 1)) >>
+                            1) +
+                           OD_Q57(8));
+  if (dup_scale_q8 >= nframes << 8) {
+    /*The scale swallows the whole count: keep one frame if there were any.*/
+    return nframes != 0;
+  }
+  dup_scale_int = (int)dup_scale_q8;
+  if (dup_scale_int > 0) {
+    /*Divide by the scale, rounding up.*/
+    nframes = ((nframes << 8) + dup_scale_int - 1) / dup_scale_int;
+  }
+  return nframes;
+}
+
+/*Closed form version of frame determination code.
+  Used by rate control to predict frame types and subtypes into the future.
+  No side effects, may be called any number of times.
+  Note that it ignores end-of-file conditions; one-pass planning *should*
+   ignore end-of-file.
+  coding_frame_count: Index of the frame in coding order.
+  is_golden: Set non-zero for I-frames and whenever *ip_count is a multiple
+   of rc->goldenframe_rate.
+  is_altref: Set non-zero for I-frames and whenever *ip_count is a multiple
+   of rc->altref_rate.
+  ip_count: Receives the running frame index computed for this frame.
+  Return: OD_I_FRAME or OD_P_FRAME.*/
+int od_frame_type(od_rc_state *rc, int64_t coding_frame_count, int *is_golden,
+                  int *is_altref, int64_t *ip_count) {
+  int type;
+  int is_intra;
+  if (coding_frame_count == 0) {
+    /*The very first frame is always a keyframe.*/
+    *ip_count = 0;
+    type = OD_I_FRAME;
+  } else {
+    int key_interval;
+    int frames_per_gop;
+    int gop_index;
+    int pos_in_gop;
+    key_interval = rc->keyframe_rate;
+    if (rc->closed_gop) {
+      frames_per_gop = (key_interval - 1) / 2;
+      gop_index = coding_frame_count / key_interval;
+      pos_in_gop = coding_frame_count - gop_index * key_interval;
+      *ip_count =
+          gop_index * frames_per_gop + (pos_in_gop > 0) + (pos_in_gop - 1);
+      if (pos_in_gop == 0) {
+        type = OD_I_FRAME;
+      } else {
+        type = OD_P_FRAME;
+      }
+    } else {
+      frames_per_gop = key_interval;
+      gop_index = (coding_frame_count - 1) / key_interval;
+      pos_in_gop = coding_frame_count - gop_index * key_interval - 1;
+      *ip_count =
+          (coding_frame_count > 0) + gop_index * frames_per_gop + pos_in_gop;
+      /*The last slot of each open GOP holds the next keyframe.*/
+      if (pos_in_gop < frames_per_gop - 1) {
+        type = OD_P_FRAME;
+      } else {
+        type = OD_I_FRAME;
+      }
+    }
+  }
+  is_intra = type == OD_I_FRAME;
+  *is_golden = (*ip_count % rc->goldenframe_rate) == 0 || is_intra;
+  *is_altref = (*ip_count % rc->altref_rate) == 0 || is_intra;
+  return type;
+}
+
+/*Count frame types forward from the current frame up to but not including
+   the last I-frame in reservoir_frame_delay.
+  If reservoir_frame_delay contains no I-frames (or the current frame is the
+   only I-frame), count all reservoir_frame_delay frames.
+  nframes: Receives the per-subtype counts.
+  Return: The number of frames counted.
+  Right now, this implementation is simple, brute-force, and expensive.
+  It is also easy to understand and debug.
+  TODO: replace with a virtual FIFO that keeps running totals as
+   repeating the counting over-and-over will have a performance impact on
+   whole-file 2pass usage.*/
+static int frame_type_count(od_rc_state *rc, int nframes[OD_FRAME_NSUBTYPES]) {
+  int pending[OD_FRAME_NSUBTYPES];
+  int pending_total;
+  int flushed_total;
+  int horizon;
+  int offset;
+  int k;
+  memset(nframes, 0, OD_FRAME_NSUBTYPES * sizeof(*nframes));
+  memset(pending, 0, sizeof(pending));
+  pending_total = 0;
+  flushed_total = 0;
+#if 1
+  /*Go ahead and count past end-of-stream.
+    We won't nail the exact bitrate on short files that end with a partial
+     GOP, but we also won't [potentially] destroy the quality of the last few
+     frames in that same case when we suddenly find out the stream is ending
+     before the original planning horizon.*/
+  horizon = rc->reservoir_frame_delay;
+#else
+  /*Don't count past the end of the stream (once we know where end-of-stream
+     is).*/
+  horizon = rc->end_of_input ? rc->input_size + 1 : rc->reservoir_frame_delay;
+#endif
+  for (offset = 0; offset < horizon; offset++) {
+    int type;
+    int golden;
+    int altref;
+    int64_t unused_ip_count;
+    type = od_frame_type(rc, rc->cur_frame + offset, &golden, &altref,
+                         &unused_ip_count);
+    if (type == OD_I_FRAME) {
+      /*A keyframe closes the run before it: commit what was pending and
+         start a new run with this keyframe.*/
+      for (k = 0; k < OD_FRAME_NSUBTYPES; k++) nframes[k] += pending[k];
+      flushed_total += pending_total;
+      memset(pending, 0, sizeof(pending));
+      pending[OD_I_FRAME] = 1;
+      pending_total = 1;
+    } else if (type == OD_P_FRAME) {
+      int subtype;
+      /*Golden takes precedence over altref when both flags are set.*/
+      if (golden) {
+        subtype = OD_GOLDEN_P_FRAME;
+      } else if (altref) {
+        subtype = OD_ALTREF_P_FRAME;
+      } else {
+        subtype = OD_P_FRAME;
+      }
+      ++pending[subtype];
+      ++pending_total;
+    }
+  }
+  /*If there were no I-frames at all, or only the first frame was an I-frame,
+     nothing was ever committed and the pending counts describe the entire
+     buffer, so they are returned.
+    Otherwise what remains pending is discarded, as it holds the counts from
+     and past the last I-frame.*/
+  if (flushed_total == 0) {
+    for (k = 0; k < OD_FRAME_NSUBTYPES; k++) nframes[k] = pending[k];
+    flushed_total = pending_total;
+  }
+  return flushed_total;
+}
+
+/*Converts quantizer index q at the given bit depth into the value reported
+   by av1_convert_qindex_to_q(), rounded to an integer with lrint().*/
+static int convert_to_ac_quant(int q, int bit_depth) {
+  const double ac_quant = av1_convert_qindex_to_q(q, bit_depth);
+  return lrint(ac_quant);
+}
+
+/*Applies the per-frame-type quantizer modulation to a base quantizer.
+  As originally written, qp modulation is applied to the coded quantizer.
+  Because a more precise target quantizer is used for various calculations,
+   that needs to be modulated as well, so this works on what is,
+   effectively, a fractional coded quantizer.
+  log_base_quantizer: log2 of the base quantizer, in Q57.
+  mqp_q12: Multiplicative modulation for the frame (sub)type, in Q12.
+  dqp_q45: Additive boost/cut for the frame (sub)type, in Q45.
+  Return: The unclamped log2 of the modulated quantizer, in Q57.*/
+static int64_t od_rc_modulate_log_quantizer(int64_t log_base_quantizer,
+                                            int32_t mqp_q12,
+                                            int64_t dqp_q45) {
+  int64_t log_quantizer;
+  /*Get the log2 quantizer in Q57 (normalized for coefficient shift).*/
+  log_quantizer = log_base_quantizer - OD_Q57(OD_COEFF_SHIFT);
+  /*log_quantizer to Q21.*/
+  log_quantizer >>= 36;
+  /*Scale log quantizer, result is Q33.*/
+  log_quantizer *= OD_LOG_QUANTIZER_BASE_Q12;
+  /*Add Q33 offset to Q33 log_quantizer.*/
+  log_quantizer += OD_LOG_QUANTIZER_OFFSET_Q45 >> 12;
+  /*Modulate quantizer according to frame type; result is Q45.*/
+  log_quantizer *= mqp_q12;
+  /*Add Q45 boost/cut to Q45 fractional coded quantizer.*/
+  log_quantizer += dqp_q45;
+  /*Back to log2 quantizer in Q57.*/
+  return (log_quantizer - OD_LOG_QUANTIZER_OFFSET_Q45) *
+             OD_LOG_QUANTIZER_EXP_Q12 +
+         OD_Q57(OD_COEFF_SHIFT);
+}
+
+/*Selects the quantizer for the frame that is about to be coded.
+  rc: Rate control state; base_quantizer, target_quantizer, rate_bias and
+   (in pass 2) log_scale are updated.
+  is_golden_frame: Non-zero if the frame is a golden frame.
+  is_altref_frame: Non-zero if the frame is an altref frame (only checked
+   against the closed-form prediction).
+  frame_type: Frame type of the frame; indexes the per-type model state.
+  bottom_idx: Receives the lossy quantizer lower bound derived from
+   rc->minq.
+  top_idx: Receives the lossy quantizer upper bound derived from rc->maxq.
+  Return: rc->firstpass_quant in pass 1; otherwise the selected target
+   quantizer after conversion by av1_qindex_from_ac().*/
+int od_enc_rc_select_quantizers_and_lambdas(od_rc_state *rc,
+                                            int is_golden_frame,
+                                            int is_altref_frame, int frame_type,
+                                            int *bottom_idx, int *top_idx) {
+  int frame_subtype;
+  int64_t log_cur_scale;
+  int lossy_quantizer_min;
+  int lossy_quantizer_max;
+  double mqp_i = OD_MQP_I;
+  double mqp_p = OD_MQP_P;
+  double mqp_gp = OD_MQP_GP;
+  double mqp_ap = OD_MQP_AP;
+  int reservoir_frames;
+  int nframes[OD_FRAME_NSUBTYPES];
+  int32_t mqp_Q12[OD_FRAME_NSUBTYPES];
+  int64_t dqp_Q45[OD_FRAME_NSUBTYPES];
+  int i;
+  /*Verify the closed-form frame type determination code matches what the
+     input queue set.*/
+  /*One pseudo-non-closed-form caveat:
+    Once we've seen end-of-input, the batched frame determination code
+     suppresses the last open-GOP's I-frame (since it would only be
+     useful for the next GOP, which doesn't exist).
+    Thus, don't check once the input queue is drained.*/
+  if (!rc->end_of_input) {
+    int closed_form_type;
+    int closed_form_golden;
+    int closed_form_altref;
+    int64_t closed_form_cur_frame;
+    closed_form_type =
+        od_frame_type(rc, rc->cur_frame, &closed_form_golden,
+                      &closed_form_altref, &closed_form_cur_frame);
+    OD_UNUSED(closed_form_type);
+    OD_UNUSED(is_altref_frame);
+    assert(closed_form_type == frame_type);
+    assert(closed_form_cur_frame == rc->cur_frame);
+    assert(closed_form_altref == is_altref_frame);
+    assert(closed_form_golden == is_golden_frame);
+  }
+
+  log_cur_scale = (int64_t)rc->scalefilter[frame_type].y[0] << 33;
+
+  /*Count the various types and classes of frames, then scale each count by
+     the expected drops for that subtype.*/
+  reservoir_frames = frame_type_count(rc, nframes);
+  for (i = 0; i < OD_FRAME_NSUBTYPES; i++) {
+    nframes[i] = od_rc_scale_drop(rc, i, nframes[i]);
+  }
+
+  /*Quantizer selection sticks to the codable, lossy portion of the quantizer
+     range.
+    This is computed before the two-pass dispatch so that every return path,
+     including the pass-1 one, can report the bounds.*/
+  lossy_quantizer_min = convert_to_ac_quant(rc->minq, rc->bit_depth);
+  lossy_quantizer_max = convert_to_ac_quant(rc->maxq, rc->bit_depth);
+
+  switch (rc->twopass_state) {
+    default: break;
+    case 1: {
+      /*Pass 1 mode: use a fixed qi value.
+        The bounds are still reported so that the caller never reads
+         unset outputs.*/
+      *bottom_idx = lossy_quantizer_min;
+      *top_idx = lossy_quantizer_max;
+      return rc->firstpass_quant;
+    }
+    case 2: {
+      int64_t scale_sum[OD_FRAME_NSUBTYPES];
+      int qti;
+      /*Pass 2 mode: we know exactly how much of each frame type there is in
+         the current buffer window, and have estimates for the scales.*/
+      for (i = 0; i < OD_FRAME_NSUBTYPES; i++) {
+        nframes[i] = rc->nframes[i];
+        scale_sum[i] = rc->scale_sum[i];
+      }
+      /*If we're not using the same frame type as in pass 1 (because someone
+         changed the keyframe interval), remove that scale estimate.
+        We'll add in a replacement for the correct frame type below.*/
+      qti = rc->cur_metrics.frame_type;
+      if (qti != frame_type) {
+        nframes[qti]--;
+        scale_sum[qti] -= od_bexp64_q24(rc->cur_metrics.log_scale);
+      }
+      /*Compute log_scale estimates for each frame type from the pass-1 scales
+         we measured in the current window.*/
+      for (qti = 0; qti < OD_FRAME_NSUBTYPES; qti++) {
+        rc->log_scale[qti] = nframes[qti] > 0
+                                 ? od_blog64(scale_sum[qti]) -
+                                       od_blog64(nframes[qti]) - OD_Q57(24)
+                                 : -rc->log_npixels;
+      }
+      /*If we're not using the same frame type as in pass 1, add a scale
+         estimate for the corresponding frame using the current low-pass
+         filter value.
+        This is mostly to ensure we have a valid estimate even when pass 1 had
+         no frames of this type in the buffer window.
+        TODO: We could also plan ahead and figure out how many keyframes we'll
+         be forced to add in the current buffer window.*/
+      if (rc->cur_metrics.frame_type != frame_type) {
+        int64_t scale;
+        scale = rc->log_scale[frame_type] < OD_Q57(23)
+                    ? od_bexp64(rc->log_scale[frame_type] + OD_Q57(24))
+                    : 0x7FFFFFFFFFFFLL;
+        scale *= nframes[frame_type];
+        nframes[frame_type]++;
+        scale += od_bexp64_q24(log_cur_scale >> 33);
+        /*Average over the frames of the type actually being coded (the
+           count that was just incremented), not over the pass-1 type.*/
+        rc->log_scale[frame_type] =
+            od_blog64(scale) - od_blog64(nframes[frame_type]) - OD_Q57(24);
+      } else {
+        log_cur_scale = (int64_t)rc->cur_metrics.log_scale << 33;
+      }
+    } break;
+  }
+
+  frame_subtype = frame_type;
+  /*Stash quantizer modulation by frame type.*/
+  mqp_Q12[OD_I_FRAME] = OD_F_Q12(mqp_i);
+  mqp_Q12[OD_P_FRAME] = OD_F_Q12(mqp_p);
+  mqp_Q12[OD_GOLDEN_P_FRAME] = OD_F_Q12(mqp_gp);
+  mqp_Q12[OD_ALTREF_P_FRAME] = OD_F_Q12(mqp_ap);
+  dqp_Q45[OD_I_FRAME] = OD_F_Q45(OD_DQP_I);
+  dqp_Q45[OD_P_FRAME] = OD_F_Q45(OD_DQP_P);
+  dqp_Q45[OD_GOLDEN_P_FRAME] = OD_F_Q45(OD_DQP_GP);
+  dqp_Q45[OD_ALTREF_P_FRAME] = OD_F_Q45(OD_DQP_AP);
+  /*Is rate control active?*/
+  if (rc->target_bitrate <= 0) {
+    /*Rate control is not active; derive quantizer directly from
+       quality parameter and frame type.*/
+    /*Can't use the OD_LOSSLESS macro, as it uses state.quantizer to intuit,
+       and we've not set it yet.*/
+    if (rc->quality == 0) {
+      /*Lossless coding requested.*/
+      rc->base_quantizer = 0;
+      rc->target_quantizer = 0;
+    } else {
+      int64_t log_quantizer;
+
+      /*Adjust the modulation constants using the last frame's quantizer.*/
+      double mqp_delta = (255 - rc->target_quantizer) / 2000.0f;
+      mqp_i -= mqp_delta;
+      mqp_p += mqp_delta;
+      mqp_gp -= mqp_delta;
+      mqp_Q12[OD_I_FRAME] = OD_F_Q12(mqp_i);
+      mqp_Q12[OD_P_FRAME] = OD_F_Q12(mqp_p);
+      mqp_Q12[OD_GOLDEN_P_FRAME] = OD_F_Q12(mqp_gp);
+      mqp_Q12[OD_ALTREF_P_FRAME] = OD_F_Q12(mqp_ap);
+
+      if (rc->quality == -1) {
+        /*A quality of -1 means quality was unset; use a default.*/
+        rc->base_quantizer = convert_to_ac_quant(10, rc->bit_depth);
+      } else {
+        rc->base_quantizer = convert_to_ac_quant(rc->quality, rc->bit_depth);
+      }
+
+      if (rc->periodic_boosts && !is_golden_frame) {
+        int pattern_rate = (rc->goldenframe_rate >> 1);
+        int dist_to_golden = rc->cur_frame % pattern_rate;
+        int dist_away_golden = pattern_rate - dist_to_golden;
+        int boost = dist_to_golden;
+        if (dist_away_golden > dist_to_golden) boost = dist_away_golden;
+        boost -= pattern_rate;
+        boost *= (rc->base_quantizer) / OD_PERIODIC_BOOST_DIV;
+        rc->base_quantizer = rc->base_quantizer + boost;
+      }
+
+      /*Modulate the base quantizer for this frame type.*/
+      log_quantizer = od_rc_modulate_log_quantizer(
+          od_blog64(rc->base_quantizer), mqp_Q12[frame_subtype],
+          dqp_Q45[frame_subtype]);
+      /*Convert Q57 log2 quantizer to unclamped linear target quantizer
+         value.*/
+      rc->target_quantizer = od_bexp64(log_quantizer);
+    }
+  } else {
+    int clamp;
+    int64_t rate_bias;
+    int64_t rate_total;
+    int base_quantizer;
+    int64_t log_quantizer;
+    int64_t log_quantizer_min;
+    int64_t log_quantizer_max;
+    int qlo;
+    int qhi;
+    /*The log2 quantizer bounds do not change during selection.*/
+    log_quantizer_min = od_blog64(lossy_quantizer_min);
+    log_quantizer_max = od_blog64(lossy_quantizer_max);
+    /*We clamp the allowed amount of qi change (after initialization).*/
+    clamp = rc->cur_frame > 0;
+    /*Figure out how to re-distribute bits so that we hit our fullness target
+       before the last keyframe in our current buffer window (after the
+       current frame), or the end of the buffer window, whichever comes
+       first.*/
+    /*Single pass only right now.*/
+    /*If we've been missing our target, add a penalty term.*/
+    rate_bias = (rc->rate_bias / (rc->cur_frame + 1000)) * reservoir_frames;
+    /*rate_total is the total bits available over the next
+       reservoir_frames frames.*/
+    rate_total = rc->reservoir_fullness - rc->reservoir_target + rate_bias +
+                 reservoir_frames * rc->bits_per_frame;
+    /*Find a target quantizer that meets our rate target for the specific mix
+       of frame types we'll have over the next frame_delay frames.
+      We model the rate<->quantizer relationship as:
+       rate = scale*(quantizer**-exp)
+      In this case, we have our desired rate, an exponent selected in setup,
+       and a scale that's been measured over our frame history, so we're
+       solving for the quantizer.
+      Exponentiation with arbitrary exponents is expensive, so we work in
+       the binary log domain (binary exp and log aren't too bad):
+       rate = e2(log2_scale - log2_quantizer * exp)
+      There's no easy closed form solution, so we bisection search for it.*/
+    /*We do not currently allow rate control to select lossless encoding.*/
+    qlo = 1;
+    /*If there's a quality specified, it's used to select the
+       coarsest base quantizer we can select.
+      Otherwise we can use up to and including the coarsest codable
+       quantizer.*/
+    if (rc->quality > 0)
+      qhi = convert_to_ac_quant(rc->quality, rc->bit_depth);
+    else
+      qhi = lossy_quantizer_max;
+    base_quantizer = (qlo + qhi) >> 1;
+    while (qlo < qhi) {
+      /*NOTE(review): the volatile qualifier is kept from the original code;
+         presumably it works around a compiler issue -- confirm before
+         removing.*/
+      volatile int64_t log_base_quantizer;
+      int64_t diff;
+      int64_t bits;
+      /*Count bits contributed by each frame type using the model.*/
+      bits = 0;
+      log_base_quantizer = od_blog64(base_quantizer);
+      for (i = 0; i < OD_FRAME_NSUBTYPES; i++) {
+        /*Modulate base quantizer by frame type.*/
+        log_quantizer = od_rc_modulate_log_quantizer(log_base_quantizer,
+                                                     mqp_Q12[i], dqp_Q45[i]);
+        /*Clamp modulated quantizer values.*/
+        log_quantizer =
+            OD_CLAMPI(log_quantizer_min, log_quantizer, log_quantizer_max);
+        /*All the fields here are Q57 except for the exponent which is Q6.*/
+        bits += nframes[i] * od_bexp64(rc->log_scale[i] + rc->log_npixels -
+                                       (log_quantizer >> 6) * rc->exp[i]);
+      }
+      diff = bits - rate_total;
+      if (diff > 0) {
+        qlo = base_quantizer + 1;
+      } else if (diff < 0) {
+        qhi = base_quantizer - 1;
+      } else {
+        break;
+      }
+      base_quantizer = (qlo + qhi) >> 1;
+    }
+    /*If this was not one of the initial frames, limit the change in base
+       quantizer to within [0.8*Q,1.2*Q], where Q is the previous frame's
+       base quantizer.*/
+    if (clamp) {
+      base_quantizer = OD_CLAMPI((rc->base_quantizer * 0x0CCCD + 0x8000) >> 16,
+                                 base_quantizer,
+                                 (rc->base_quantizer * 0x13333 + 0x8000) >> 16);
+    }
+    /*Modulate chosen base quantizer to produce target quantizer.*/
+    log_quantizer = od_rc_modulate_log_quantizer(od_blog64(base_quantizer),
+                                                 mqp_Q12[frame_subtype],
+                                                 dqp_Q45[frame_subtype]);
+    /*Clamp modulated quantizer values.*/
+    log_quantizer =
+        OD_CLAMPI(log_quantizer_min, log_quantizer, log_quantizer_max);
+    /*The above allocation looks only at the total rate we'll accumulate in
+       the next reservoir_frame_delay frames.
+      However we could overflow the bit reservoir on the very next frame, so
+       check for that here if we're not using a soft target.*/
+    if (rc->cap_overflow) {
+      int64_t margin;
+      int64_t soft_limit;
+      int64_t log_soft_limit;
+      int64_t log_scale_pixels;
+      int64_t rate_exp;
+      int64_t log_qexp;
+      /*Allow 3% of the buffer for prediction error.
+        This should be plenty, and we don't mind if we go a bit over; we only
+         want to keep these bits from being completely wasted.*/
+      margin = (rc->reservoir_max + 31) >> 5;
+      /*We want to use at least this many bits next frame.*/
+      soft_limit = rc->reservoir_fullness + rc->bits_per_frame -
+                   (rc->reservoir_max - margin);
+      log_soft_limit = od_blog64(soft_limit);
+      /*If we're predicting we won't use that many bits...*/
+      log_scale_pixels = rc->log_scale[frame_subtype] + rc->log_npixels;
+      rate_exp = rc->exp[frame_subtype];
+      log_qexp = (log_quantizer >> 6) * rate_exp;
+      if (log_scale_pixels - log_qexp < log_soft_limit) {
+        /*Scale the adjustment based on how far into the margin we are.*/
+        log_qexp += ((log_scale_pixels - log_soft_limit - log_qexp) >> 32) *
+                    (OD_MINI(margin, soft_limit) << 32) / margin;
+        log_quantizer = (((log_qexp + (rate_exp >> 1)) / rate_exp) << 6);
+      }
+    }
+    /*We just checked we don't overflow the reservoir next frame, now check
+       we don't underflow and bust the budget (when not using a soft target).
+      Disabled when a quality bound is set; if we saturate quantizer to the
+       maximum possible size when we have a limiting max quality, the
+       resulting lambda can cause strange behavior.*/
+    if (rc->quality == -1) {
+      int64_t rate_exp;
+      int64_t log_qexp;
+      int64_t log_scale_pixels;
+      int64_t log_hard_limit;
+      /*Compute the maximum number of bits we can use in the next frame.
+        Allow 50% of the rate for a single frame for prediction error.
+        This may not be enough for keyframes or sudden changes in
+         complexity.*/
+      log_hard_limit =
+          od_blog64(rc->reservoir_fullness + (rc->bits_per_frame >> 1));
+      /*If we're predicting we'll use more than this...*/
+      log_scale_pixels = rc->log_scale[frame_subtype] + rc->log_npixels;
+      rate_exp = rc->exp[frame_subtype];
+      log_qexp = (log_quantizer >> 6) * rate_exp;
+      if (log_scale_pixels - log_qexp > log_hard_limit) {
+        /*Force the target to hit our limit exactly.*/
+        log_qexp = log_scale_pixels - log_hard_limit;
+        log_quantizer = (log_qexp + (rate_exp >> 1)) / rate_exp << 6;
+        /*If that target is unreasonable, oh well; we'll have to drop.
+          Cap at the coarsest codable quantizer; taking the maximum here
+           would always saturate and discard the limit computed above.*/
+        log_quantizer = OD_MINI(log_quantizer, log_quantizer_max);
+      }
+    }
+    /*Compute a final estimate of the number of bits we plan to use, update
+       the running rate bias measurement.*/
+    {
+      int64_t log_qexp;
+      int64_t log_scale_pixels;
+      log_scale_pixels = rc->log_scale[frame_subtype] + rc->log_npixels;
+      log_qexp = (log_quantizer >> 6) * rc->exp[frame_subtype];
+      rc->rate_bias += od_bexp64(log_scale_pixels - log_qexp);
+    }
+    rc->target_quantizer = od_bexp64(log_quantizer);
+    /*The various cappings and adjustments may have altered the log_quantizer
+       target significantly.
+      We can either update the base quantizer to be consistent with the
+       target or let it track separately.
+      Theora behavior effectively keeps them consistent, as it regenerates
+       the effective base quantizer from the target each frame rather than
+       saving both.
+      For Daala, it's easier to allow them to track separately.
+      For now, allow them to track separately and see how it behaves.*/
+    rc->base_quantizer = base_quantizer;
+  }
+  *bottom_idx = lossy_quantizer_min;
+  *top_idx = lossy_quantizer_max;
+  rc->target_quantizer = av1_qindex_from_ac(
+      OD_CLAMPI(lossy_quantizer_min, rc->target_quantizer, lossy_quantizer_max),
+      rc->bit_depth);
+  return rc->target_quantizer;
+}
+
+/*Folds the result of coding one frame back into the rate control state.
+  bits: Number of bits the frame used; values <= 0 mean no blocks were coded.
+  is_golden_frame / is_altref_frame: Select the golden or altref P-frame
+   subtype for drop tracking when frame_type is OD_P_FRAME.
+  frame_type: Frame type of the coded frame.
+  droppable: Non-zero if the frame may be dropped when it busts the budget.
+  Return: 1 if the frame must be dropped, otherwise 0 (always 0 while rate
+   control is inactive).*/
+int od_enc_rc_update_state(od_rc_state *rc, int64_t bits, int is_golden_frame,
+                           int is_altref_frame, int frame_type, int droppable) {
+  int was_dropped;
+  int subtype;
+  int64_t frame_log_scale;
+  /*Update rate control only if rate control is active.*/
+  if (rc->target_bitrate <= 0) return 0;
+  was_dropped = 0;
+  /*Track non-golden, golden and altref P frame drops separately.*/
+  subtype = frame_type;
+  if (frame_type == OD_P_FRAME) {
+    if (is_golden_frame) {
+      subtype = OD_GOLDEN_P_FRAME;
+    } else if (is_altref_frame) {
+      subtype = OD_ALTREF_P_FRAME;
+    }
+  }
+  if (bits > 0) {
+    int64_t log_frame_bits;
+    int64_t log_q_term;
+    /*Compute the estimated scale factor for this frame type.*/
+    log_frame_bits = od_blog64(bits);
+    log_q_term = od_blog64(rc->target_quantizer);
+    log_q_term = (log_q_term >> 6) * (rc->exp[frame_type]);
+    frame_log_scale =
+        OD_MINI(log_frame_bits - rc->log_npixels + log_q_term, OD_Q57(16));
+  } else {
+    /*We didn't code any blocks in this frame.*/
+    frame_log_scale = OD_Q57(-64);
+    bits = 0;
+    ++rc->prev_drop_count[subtype];
+  }
+
+  if (rc->twopass_state == 1) {
+    int golden;
+    int altref;
+    int64_t ip_count;
+    /*Pass 1 mode: save the metrics for this frame.*/
+    rc->cur_metrics.frame_type =
+        od_frame_type(rc, rc->cur_frame, &golden, &altref, &ip_count);
+    rc->cur_metrics.log_scale = od_q57_to_q24(frame_log_scale);
+  } else if (rc->twopass_state == 2) {
+    /*Pass 2 mode: take this frame's pass-1 metrics out of the window
+       totals.*/
+    int pass1_type = rc->cur_metrics.frame_type;
+    rc->nframes[pass1_type]--;
+    rc->scale_sum[pass1_type] -= od_bexp64_q24(rc->cur_metrics.log_scale);
+  }
+
+  if (bits > 0) {
+    od_iir_bessel2 *scale_filter;
+    scale_filter = rc->scalefilter + frame_type;
+    if (rc->frame_count[frame_type] == 0) {
+      /*If this is the first example of the given frame type we've
+         seen, we immediately replace the default scale factor guess
+         with the estimate we just computed using the first frame.*/
+      scale_filter->y[1] = scale_filter->y[0] = scale_filter->x[1] =
+          scale_filter->x[0] = od_q57_to_q24(frame_log_scale);
+      rc->log_scale[frame_type] = frame_log_scale;
+    } else {
+      /*Lengthen the time constant for the inter filters as we collect more
+         frame statistics, until we reach our target.*/
+      if (frame_type != OD_I_FRAME &&
+          rc->inter_p_delay < rc->inter_delay_target &&
+          rc->frame_count[frame_type] >= rc->inter_p_delay) {
+        od_iir_bessel2_reinit(scale_filter, ++rc->inter_p_delay);
+      }
+      /*Update the low-pass scale filter for this frame type
+         regardless of whether or not we drop this frame.*/
+      rc->log_scale[frame_type] =
+          od_iir_bessel2_update(scale_filter, od_q57_to_q24(frame_log_scale))
+          << 33;
+    }
+    if (droppable && rc->reservoir_fullness + rc->bits_per_frame < bits) {
+      /*If this frame busts our budget, it must be dropped.*/
+      ++rc->prev_drop_count[subtype];
+      bits = 0;
+      was_dropped = 1;
+    } else {
+      uint32_t drop_count_q24;
+      /*Update a low-pass filter to estimate the "real" frame rate taking
+         drops into account.
+        This is only done if the frame is coded, as it needs the final
+         count of dropped frames.*/
+      drop_count_q24 = rc->prev_drop_count[subtype] + 1;
+      if (drop_count_q24 > 0x7F) {
+        drop_count_q24 = 0x7FFFFFFF;
+      } else {
+        drop_count_q24 <<= 24;
+      }
+      rc->log_drop_scale[subtype] =
+          od_blog64(
+              od_iir_bessel2_update(rc->vfrfilter + subtype, drop_count_q24)) -
+          OD_Q57(24);
+      /*Zero the drop count for this frame.
+        It will be increased if we drop frames.*/
+      rc->prev_drop_count[subtype] = 0;
+    }
+    /*Increment the frame count for filter adaptation purposes.*/
+    if (!rc->twopass_state) rc->frame_count[frame_type]++;
+  }
+  rc->reservoir_fullness += rc->bits_per_frame - bits;
+  /*If we're too quick filling the buffer and overflow is capped,
+     that rate is lost forever.*/
+  if (rc->cap_overflow && rc->reservoir_fullness > rc->reservoir_max) {
+    rc->reservoir_fullness = rc->reservoir_max;
+  }
+  /*If we're too quick draining the buffer and underflow is capped,
+     don't try to make up that rate later.*/
+  if (rc->cap_underflow && rc->reservoir_fullness < 0) {
+    rc->reservoir_fullness = 0;
+  }
+  /*Adjust the bias for the real bits we've used.*/
+  rc->rate_bias -= bits;
+  return was_dropped;
+}
+
+/*Appends the low `bytes` bytes of val to rc->twopass_buffer, least
+   significant byte first, advancing rc->twopass_buffer_bytes.*/
+static INLINE void od_rc_buffer_val(od_rc_state *rc, int64_t val, int bytes) {
+  int k;
+  for (k = 0; k < bytes; k++) {
+    rc->twopass_buffer[rc->twopass_buffer_bytes++] = (uint8_t)(val & 0xFF);
+    val >>= 8;
+  }
+}
+
+/*Reads a `bytes`-byte value from rc->twopass_buffer, least significant byte
+   first, advancing rc->twopass_buffer_bytes.
+  The bytes are assembled in an unsigned accumulator: shifting a byte >= 0x80
+   into bit 63 of a signed 64-bit value would be undefined behavior.
+  Return: The assembled value (zero-extended when bytes < 8).*/
+static INLINE int64_t od_rc_unbuffer_val(od_rc_state *rc, int bytes) {
+  uint64_t ret = 0;
+  int shift = 0;
+  while (bytes-- > 0) {
+    ret |= ((uint64_t)rc->twopass_buffer[rc->twopass_buffer_bytes++]) << shift;
+    shift += 8;
+  }
+  return (int64_t)ret;
+}
+
+/*Emits first-pass statistics as an AOM_CODEC_STATS_PKT packet.
+  The first call switches the state into pass-1 mode and clears the
+   per-subtype frame counts, exponents and scale sums.
+  pkt_list: Packet list the stats packet is appended to.
+  summary: Non-zero to write the summary record (magic, version, then frame
+   count, exp and scale sum per subtype); zero to write the current frame's
+   metrics and add them to the running totals.
+  Return: Always 0.*/
+int od_enc_rc_2pass_out(od_rc_state *rc, struct aom_codec_pkt_list *pkt_list,
+                        int summary) {
+  int subtype;
+  struct aom_codec_cx_pkt pkt;
+  /*Every call serializes into the start of the first-pass buffer.*/
+  rc->twopass_buffer = rc->firstpass_buffer;
+  rc->twopass_buffer_bytes = 0;
+  if (rc->twopass_state == 0) {
+    rc->twopass_state = 1;
+    subtype = 0;
+    while (subtype < OD_FRAME_NSUBTYPES) {
+      rc->frame_count[subtype] = 0;
+      rc->exp[subtype] = 0;
+      rc->scale_sum[subtype] = 0;
+      subtype++;
+    }
+  }
+  if (!summary) {
+    /*Per-frame record: accumulate the metrics, then write them out.*/
+    int metrics_type = rc->cur_metrics.frame_type;
+    rc->scale_sum[metrics_type] += od_bexp64_q24(rc->cur_metrics.log_scale);
+    rc->frame_count[metrics_type]++;
+    od_rc_buffer_val(rc, rc->cur_metrics.frame_type, 1);
+    od_rc_buffer_val(rc, rc->cur_metrics.log_scale, 4);
+  } else {
+    /*Summary record: header followed by the totals for each subtype.*/
+    od_rc_buffer_val(rc, OD_RC_2PASS_MAGIC, 4);
+    od_rc_buffer_val(rc, OD_RC_2PASS_VERSION, 1);
+    for (subtype = 0; subtype < OD_FRAME_NSUBTYPES; subtype++) {
+      od_rc_buffer_val(rc, rc->frame_count[subtype], 4);
+      od_rc_buffer_val(rc, rc->exp[subtype], 4);
+      od_rc_buffer_val(rc, rc->scale_sum[subtype], 8);
+    }
+  }
+  pkt.kind = AOM_CODEC_STATS_PKT;
+  pkt.data.twopass_stats.buf = rc->firstpass_buffer;
+  pkt.data.twopass_stats.sz = rc->twopass_buffer_bytes;
+  aom_codec_pkt_list_add(pkt_list, &pkt);
+  return 0;
+}
+
+/*Loads the first-pass metrics for the next frame from the stats buffer.
+  The first call also validates the summary record at the end of the buffer,
+   loads the per-subtype totals, resets the rate controller and enters
+   pass-2 mode.
+  The stats buffer is external input, so the sizes and the frame type read
+   from it are validated before they are used.
+  Return: 0 on success, -1 if the buffer is missing, too small, exhausted or
+   malformed.*/
+int od_enc_rc_2pass_in(od_rc_state *rc) {
+  /* Enable pass 2 mode if this is the first call. */
+  if (rc->twopass_state == 0) {
+    uint32_t i, total_frames = 0;
+
+    if (!rc->twopass_allframes_buf ||
+        rc->twopass_allframes_buf_size < OD_RC_2PASS_MIN)
+      return -1;
+
+    /* Find summary packet at the end */
+    rc->twopass_buffer = rc->twopass_allframes_buf;
+    rc->twopass_buffer +=
+        rc->twopass_allframes_buf_size - OD_RC_2PASS_SUMMARY_SZ;
+    rc->twopass_buffer_bytes = 0;
+
+    if (od_rc_unbuffer_val(rc, 4) != OD_RC_2PASS_MAGIC) return -1;
+    if (od_rc_unbuffer_val(rc, 1) != OD_RC_2PASS_VERSION) return -1;
+
+    for (i = 0; i < OD_FRAME_NSUBTYPES; i++) {
+      rc->frame_count[i] = od_rc_unbuffer_val(rc, 4);
+      rc->exp[i] = od_rc_unbuffer_val(rc, 4);
+      rc->scale_sum[i] = od_rc_unbuffer_val(rc, 8);
+      rc->nframes[i] = rc->frame_count[i];
+      total_frames += rc->frame_count[i];
+    }
+
+    if (total_frames < 1) return -1;
+
+    /* Compare in 64 bits so that a huge frame count cannot wrap the product
+       and slip past the size check. */
+    if ((uint64_t)total_frames * OD_RC_2PASS_PACKET_SZ >
+        (uint64_t)rc->twopass_allframes_buf_size)
+      return -1;
+
+    od_enc_rc_reset(rc);
+
+    /* Everything looks ok */
+    rc->twopass_buffer = rc->twopass_allframes_buf;
+    rc->twopass_state = 2;
+    rc->twopass_buffer_bytes = 0;
+  }
+
+  if (rc->twopass_state == 2) {
+    /* Never read a per-frame packet past the end of the stats buffer. */
+    if ((uint64_t)rc->twopass_buffer_bytes + OD_RC_2PASS_PACKET_SZ >
+        (uint64_t)rc->twopass_allframes_buf_size)
+      return -1;
+    /* The frame type later indexes OD_FRAME_NSUBTYPES-sized arrays, so
+       reject out-of-range values before consuming the packet. */
+    if (rc->twopass_buffer[rc->twopass_buffer_bytes] >= OD_FRAME_NSUBTYPES)
+      return -1;
+  }
+
+  rc->cur_metrics.frame_type = od_rc_unbuffer_val(rc, 1);
+  rc->cur_metrics.log_scale = od_rc_unbuffer_val(rc, 4);
+
+  return 0;
+}
diff --git a/third_party/aom/av1/encoder/ratectrl_xiph.h b/third_party/aom/av1/encoder/ratectrl_xiph.h
new file mode 100644
index 0000000000..a4a9052faa
--- /dev/null
+++ b/third_party/aom/av1/encoder/ratectrl_xiph.h
@@ -0,0 +1,200 @@
+/*
+ * Copyright (c) 2001-2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#if !defined(_ratectrl_xiph_H)
+#define _ratectrl_xiph_H (1)
+
+#include "av1/encoder/ratectrl.h"
+#include "aom/internal/aom_codec_internal.h"
+
+/*Frame types.*/
+#define OD_I_FRAME (0)
+#define OD_P_FRAME (1)
+#define OD_GOLDEN_P_FRAME (2)
+#define OD_ALTREF_P_FRAME (3)
+
+#define OD_FRAME_NSUBTYPES (OD_ALTREF_P_FRAME + 1)
+
+/* Periodic boost (in between golden frames) strength - lower is more */
+#define OD_PERIODIC_BOOST_DIV (10)
+
+/* Constants for frame QP modulation <- tweak these
+ * Adjusts how the rate control system decides the quantizers per frame
+ * (sub)type */
+#define OD_MQP_I (0.98)
+#define OD_MQP_P (1.06)
+#define OD_MQP_GP (0.99)
+#define OD_MQP_AP (0.92)
+#define OD_DQP_I (-2)
+#define OD_DQP_P (0)
+#define OD_DQP_GP (-2)
+#define OD_DQP_AP (-2)
+
+/*Fractional_coded_quantizer ~=
+ log2(quantizer / (1 << OD_COEFF_SHIFT))*6.307 + 6.235*/
+/*Base/scale factor for linear quantizer to fractional coded quantizer
+ conversion (6.307 * 2^12) */
+#define OD_LOG_QUANTIZER_BASE_Q12 (0x0064EB)
+/*Inverse of above scale factor.*/
+#define OD_LOG_QUANTIZER_EXP_Q12 (0x000289)
+/*Offset for linear quantizer to fractional coded quantizer
+ conversion (6.235 * 2^45) */
+#define OD_LOG_QUANTIZER_OFFSET_Q45 (0x0000C7851EB851ECLL)
+
+#define OD_RC_2PASS_MAGIC (0x53015641) /* [A, V, 1, S] in little endian */
+#define OD_RC_2PASS_SUMMARY_SZ (4 + 1 + (4 + 4 + 8) * OD_FRAME_NSUBTYPES)
+#define OD_RC_2PASS_PACKET_SZ (1 + 4)
+#define OD_RC_2PASS_MIN (OD_RC_2PASS_PACKET_SZ + OD_RC_2PASS_SUMMARY_SZ)
+#define OD_RC_2PASS_VERSION (1)
+
+/*A 2nd order low-pass Bessel follower.
+ We use this for rate control because it has fast reaction time, but is
+ critically damped.*/
+typedef struct od_iir_bessel2 {
+ int32_t c[2]; /* NOTE(review): presumably the two filter coefficients - confirm in ratectrl_xiph.c */
+ int64_t g; /* NOTE(review): presumably the filter gain - confirm */
+ int32_t x[2]; /* NOTE(review): presumably the two most recent inputs - confirm */
+ int32_t y[2]; /* NOTE(review): presumably the two most recent outputs - confirm */
+} od_iir_bessel2;
+
+/* The 2-pass metrics associated with a single frame. */
+typedef struct od_frame_metrics {
+ /*The log base 2 of the scale factor for this frame in Q24 format.*/
+ int64_t log_scale;
+ /*The frame type from pass 1; 2 bits hold all OD_FRAME_NSUBTYPES values.*/
+ unsigned frame_type : 2;
+} od_frame_metrics;
+
+/*Rate control setup and working state information.*/
+typedef struct od_rc_state {
+ /* Image format */
+ int frame_width;
+ int frame_height;
+ int bit_depth;
+
+ /* Framerate */
+ double framerate;
+ /* Keyframe rate */
+ int keyframe_rate;
+ /* Golden frame period */
+ int goldenframe_rate;
+ /* Altref frame period */
+ int altref_rate;
+ /*The target bit-rate in bits per second.*/
+ int64_t target_bitrate;
+ /* Quality level for non-bitrate-targeting */
+ int quality;
+ /* Copied from oxcf->frame_periodic_boost */
+ int periodic_boosts;
+ /* Max Q */
+ int maxq;
+ /* Min Q */
+ int minq;
+ /* Quantizer to use for the first pass */
+ int firstpass_quant;
+
+ /* 2-pass metrics */
+ od_frame_metrics cur_metrics;
+
+ /* 2-pass state */
+ int64_t scale_sum[OD_FRAME_NSUBTYPES];
+ int nframes[OD_FRAME_NSUBTYPES];
+
+ /* 2-pass bytestream reader/writer context */
+ uint8_t *twopass_buffer;
+ int twopass_buffer_bytes;
+
+ /* Pass 1 stats packet storage */
+ uint8_t firstpass_buffer[OD_RC_2PASS_SUMMARY_SZ];
+
+ /* Every state packet from the first pass in a single buffer */
+ uint8_t *twopass_allframes_buf;
+ size_t twopass_allframes_buf_size;
+
+ /* Actual returned quantizer */
+ int target_quantizer;
+ /*The full-precision, unmodulated quantizer upon which
+ our modulated quantizers are based.*/
+ int base_quantizer;
+
+ /* Increments by 1 for each frame. */
+ int64_t cur_frame;
+
+ /* End of input flag */
+ int end_of_input;
+ /* Closed GOP flag */
+ int closed_gop;
+ /*The number of frames over which to distribute the reservoir usage.*/
+ int reservoir_frame_delay;
+ /*Will we drop frames to meet bitrate target?*/
+ unsigned char drop_frames;
+ /*Do we respect the maximum reservoir fullness?*/
+ unsigned char cap_overflow;
+ /*Can the reservoir go negative?*/
+ unsigned char cap_underflow;
+ /*Two-pass mode state.
+ 0 => 1-pass encoding.
+ 1 => 1st pass of 2-pass encoding.
+ 2 => 2nd pass of 2-pass encoding.*/
+ int twopass_state;
+ /*The log of the number of pixels in a frame in Q57 format.*/
+ int64_t log_npixels;
+ /*The target average bits per frame.*/
+ int64_t bits_per_frame;
+ /*The current bit reservoir fullness (bits available to be used).*/
+ int64_t reservoir_fullness;
+ /*The target buffer fullness.
+ This is where we'd like to be by the last keyframe that appears in the next
+ buf_delay frames.*/
+ int64_t reservoir_target;
+ /*The maximum buffer fullness (total size of the buffer).*/
+ int64_t reservoir_max;
+ /*The log of estimated scale factor for the rate model in Q57 format.*/
+ int64_t log_scale[OD_FRAME_NSUBTYPES];
+ /*The exponent used in the rate model in Q8 format.*/
+ unsigned exp[OD_FRAME_NSUBTYPES];
+ /*The log of an estimated scale factor used to obtain the real framerate, for
+ VFR sources or, e.g., 12 fps content doubled to 24 fps, etc.*/
+ int64_t log_drop_scale[OD_FRAME_NSUBTYPES];
+ /*The total drop count from the previous frame.*/
+ uint32_t prev_drop_count[OD_FRAME_NSUBTYPES];
+ /*Second-order lowpass filters to track scale and VFR/drops.*/
+ od_iir_bessel2 scalefilter[OD_FRAME_NSUBTYPES];
+ od_iir_bessel2 vfrfilter[OD_FRAME_NSUBTYPES];
+ int frame_count[OD_FRAME_NSUBTYPES];
+ int inter_p_delay;
+ int inter_delay_target;
+ /*The total accumulated estimation bias.*/
+ int64_t rate_bias;
+} od_rc_state;
+
+int od_enc_rc_init(od_rc_state *rc, int64_t bitrate, int delay_ms);
+
+int od_enc_rc_select_quantizers_and_lambdas(od_rc_state *rc,
+ int is_golden_frame,
+ int is_altref_frame, int frame_type,
+ int *bottom_idx, int *top_idx);
+
+/* Returns 1 if the frame should be dropped */
+int od_enc_rc_update_state(od_rc_state *rc, int64_t bits, int is_golden_frame,
+ int is_altref_frame, int frame_type, int droppable);
+
+int od_frame_type(od_rc_state *rc, int64_t coding_frame_count, int *is_golden,
+ int *is_altref, int64_t *ip_count);
+
+int od_enc_rc_resize(od_rc_state *rc);
+
+int od_enc_rc_2pass_out(od_rc_state *rc, struct aom_codec_pkt_list *pkt_list,
+ int summary);
+
+int od_enc_rc_2pass_in(od_rc_state *rc);
+
+#endif
diff --git a/third_party/aom/av1/encoder/rd.c b/third_party/aom/av1/encoder/rd.c
new file mode 100644
index 0000000000..f06e569e7f
--- /dev/null
+++ b/third_party/aom/av1/encoder/rd.c
@@ -0,0 +1,1204 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <math.h>
+#include <stdio.h>
+
+#include "./av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/bitops.h"
+#include "aom_ports/mem.h"
+#include "aom_ports/system_state.h"
+
+#include "av1/common/common.h"
+#include "av1/common/entropy.h"
+#include "av1/common/entropymode.h"
+#include "av1/common/mvref_common.h"
+#include "av1/common/pred_common.h"
+#include "av1/common/quant_common.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/reconintra.h"
+#include "av1/common/seg_common.h"
+
+#include "av1/encoder/av1_quantize.h"
+#include "av1/encoder/cost.h"
+#include "av1/encoder/encodemb.h"
+#include "av1/encoder/encodemv.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/mcomp.h"
+#include "av1/encoder/ratectrl.h"
+#include "av1/encoder/rd.h"
+#include "av1/encoder/tokenize.h"
+
+#define RD_THRESH_POW 1.25
+
+// Factor to weigh the rate for switchable interp filters.
+#define SWITCHABLE_INTERP_RATE_FACTOR 1
+
+// The baseline rd thresholds for breaking out of the rd loop for
+// certain modes are assumed to be based on 8x8 blocks.
+// This table is used to correct for block size.
+// The factors here are << 2 (2 = x0.5, 32 = x8 etc).
+static const uint8_t rd_thresh_block_size_factor[BLOCK_SIZES] = {
+#if CONFIG_CB4X4
+ 2, 2, 2,
+#endif
+ 2, 3, 3, 4, 6, 6, 8, 12, 12, 16, 24, 24, 32,
+#if CONFIG_EXT_PARTITION
+ 48, 48, 64
+#endif // CONFIG_EXT_PARTITION
+};
+
+static void fill_mode_costs(AV1_COMP *cpi) { // Fills cpi's mode cost tables via av1_cost_tokens().
+ const FRAME_CONTEXT *const fc = cpi->common.fc;
+ int i, j;
+
+ for (i = 0; i < INTRA_MODES; ++i)
+ for (j = 0; j < INTRA_MODES; ++j)
+ av1_cost_tokens(cpi->y_mode_costs[i][j], av1_kf_y_mode_prob[i][j],
+ av1_intra_mode_tree); // uses the av1_kf_y_mode_prob table, not fc
+
+ for (i = 0; i < BLOCK_SIZE_GROUPS; ++i)
+ av1_cost_tokens(cpi->mbmode_cost[i], fc->y_mode_prob[i],
+ av1_intra_mode_tree);
+
+ for (i = 0; i < INTRA_MODES; ++i)
+ av1_cost_tokens(cpi->intra_uv_mode_cost[i], fc->uv_mode_prob[i],
+ av1_intra_mode_tree);
+
+ for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
+ av1_cost_tokens(cpi->switchable_interp_costs[i],
+ fc->switchable_interp_prob[i], av1_switchable_interp_tree);
+
+#if CONFIG_PALETTE
+ for (i = 0; i < PALETTE_BLOCK_SIZES; ++i) {
+ av1_cost_tokens(cpi->palette_y_size_cost[i],
+ av1_default_palette_y_size_prob[i], av1_palette_size_tree); // default probs, not fc
+ av1_cost_tokens(cpi->palette_uv_size_cost[i],
+ av1_default_palette_uv_size_prob[i], av1_palette_size_tree);
+ }
+
+ for (i = 0; i < PALETTE_SIZES; ++i) {
+ for (j = 0; j < PALETTE_COLOR_INDEX_CONTEXTS; ++j) {
+ av1_cost_tokens(cpi->palette_y_color_cost[i][j],
+ av1_default_palette_y_color_index_prob[i][j],
+ av1_palette_color_index_tree[i]);
+ av1_cost_tokens(cpi->palette_uv_color_cost[i][j],
+ av1_default_palette_uv_color_index_prob[i][j],
+ av1_palette_color_index_tree[i]);
+ }
+ }
+#endif // CONFIG_PALETTE
+
+ for (i = 0; i < MAX_TX_DEPTH; ++i)
+ for (j = 0; j < TX_SIZE_CONTEXTS; ++j)
+ av1_cost_tokens(cpi->tx_size_cost[i][j], fc->tx_size_probs[i][j],
+ av1_tx_size_tree[i]);
+
+#if CONFIG_EXT_TX
+ for (i = TX_4X4; i < EXT_TX_SIZES; ++i) {
+ int s;
+ for (s = 1; s < EXT_TX_SETS_INTER; ++s) { // set 0 is skipped: loop starts at 1
+ if (use_inter_ext_tx_for_txsize[s][i]) {
+ av1_cost_tokens(cpi->inter_tx_type_costs[s][i],
+ fc->inter_ext_tx_prob[s][i], av1_ext_tx_inter_tree[s]);
+ }
+ }
+ for (s = 1; s < EXT_TX_SETS_INTRA; ++s) {
+ if (use_intra_ext_tx_for_txsize[s][i]) {
+ for (j = 0; j < INTRA_MODES; ++j)
+ av1_cost_tokens(cpi->intra_tx_type_costs[s][i][j],
+ fc->intra_ext_tx_prob[s][i][j],
+ av1_ext_tx_intra_tree[s]);
+ }
+ }
+ }
+#else
+ for (i = TX_4X4; i < EXT_TX_SIZES; ++i) {
+ for (j = 0; j < TX_TYPES; ++j)
+ av1_cost_tokens(cpi->intra_tx_type_costs[i][j],
+ fc->intra_ext_tx_prob[i][j], av1_ext_tx_tree);
+ }
+ for (i = TX_4X4; i < EXT_TX_SIZES; ++i) {
+ av1_cost_tokens(cpi->inter_tx_type_costs[i], fc->inter_ext_tx_prob[i],
+ av1_ext_tx_tree);
+ }
+#endif // CONFIG_EXT_TX
+#if CONFIG_EXT_INTRA
+#if CONFIG_INTRA_INTERP
+ for (i = 0; i < INTRA_FILTERS + 1; ++i)
+ av1_cost_tokens(cpi->intra_filter_cost[i], fc->intra_filter_probs[i],
+ av1_intra_filter_tree);
+#endif // CONFIG_INTRA_INTERP
+#endif // CONFIG_EXT_INTRA
+#if CONFIG_LOOP_RESTORATION
+ av1_cost_tokens(cpi->switchable_restore_cost, fc->switchable_restore_prob,
+ av1_switchable_restore_tree);
+#endif // CONFIG_LOOP_RESTORATION
+#if CONFIG_GLOBAL_MOTION
+ av1_cost_tokens(cpi->gmtype_cost, fc->global_motion_types_prob,
+ av1_global_motion_types_tree);
+#endif // CONFIG_GLOBAL_MOTION
+}
+
+void av1_fill_token_costs(av1_coeff_cost *c,
+ av1_coeff_probs_model (*p)[PLANE_TYPES]) { // Fills token costs |c| from model probabilities |p|.
+ int i, j, k, l;
+ TX_SIZE t;
+ for (t = 0; t < TX_SIZES; ++t)
+ for (i = 0; i < PLANE_TYPES; ++i)
+ for (j = 0; j < REF_TYPES; ++j)
+ for (k = 0; k < COEF_BANDS; ++k)
+ for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) {
+ aom_prob probs[ENTROPY_NODES];
+ av1_model_to_full_probs(p[t][i][j][k][l], probs); // expand the model to a full probability set
+ av1_cost_tokens((int *)c[t][i][j][k][0][l], probs, av1_coef_tree); // slot [0]: av1_cost_tokens()
+ av1_cost_tokens_skip((int *)c[t][i][j][k][1][l], probs,
+ av1_coef_tree); // slot [1]: av1_cost_tokens_skip()
+ assert(c[t][i][j][k][0][l][EOB_TOKEN] ==
+ c[t][i][j][k][1][l][EOB_TOKEN]); // both variants must price EOB identically
+ }
+}
+
+// Values are now correlated to quantizer.
+static int sad_per_bit16lut_8[QINDEX_RANGE];
+static int sad_per_bit4lut_8[QINDEX_RANGE];
+
+#if CONFIG_HIGHBITDEPTH
+static int sad_per_bit16lut_10[QINDEX_RANGE];
+static int sad_per_bit4lut_10[QINDEX_RANGE];
+static int sad_per_bit16lut_12[QINDEX_RANGE];
+static int sad_per_bit4lut_12[QINDEX_RANGE];
+#endif
+
+static void init_me_luts_bd(int *bit16lut, int *bit4lut, int range,
+ aom_bit_depth_t bit_depth) { // Fills both tables with |range| entries for |bit_depth|.
+ int i;
+ // Initialize the sad lut tables using a formulaic calculation for now.
+ // This is to make it easier to resolve the impact of experimental changes
+ // to the quantizer tables.
+ for (i = 0; i < range; i++) {
+ const double q = av1_convert_qindex_to_q(i, bit_depth);
+ bit16lut[i] = (int)(0.0418 * q + 2.4107); // linear in q, truncated to int
+ bit4lut[i] = (int)(0.063 * q + 2.742);
+ }
+}
+
+void av1_init_me_luts(void) { // Initializes the file-scope sad_per_bit* lookup tables.
+ init_me_luts_bd(sad_per_bit16lut_8, sad_per_bit4lut_8, QINDEX_RANGE,
+ AOM_BITS_8);
+#if CONFIG_HIGHBITDEPTH
+ init_me_luts_bd(sad_per_bit16lut_10, sad_per_bit4lut_10, QINDEX_RANGE,
+ AOM_BITS_10); // 10- and 12-bit tables exist only in high-bit-depth builds
+ init_me_luts_bd(sad_per_bit16lut_12, sad_per_bit4lut_12, QINDEX_RANGE,
+ AOM_BITS_12);
+#endif
+}
+
+static const int rd_boost_factor[16] = { 64, 32, 32, 32, 24, 16, 12, 12,
+ 8, 8, 4, 4, 2, 2, 1, 0 };
+static const int rd_frame_type_factor[FRAME_UPDATE_TYPES] = {
+ 128, 144, 128, 128, 144,
+#if CONFIG_EXT_REFS
+ // TODO(zoeliu): To adjust further following factor values.
+ 128, 128, 128
+ // TODO(weitinglin): We should investigate if the values should be the same
+ // as the value used by OVERLAY frame
+ ,
+ 144
+#endif // CONFIG_EXT_REFS
+};
+
+int av1_compute_rd_mult(const AV1_COMP *cpi, int qindex) { // Returns the RD multiplier for |qindex|.
+ const int64_t q = av1_dc_quant(qindex, 0, cpi->common.bit_depth);
+#if CONFIG_HIGHBITDEPTH
+ int64_t rdmult = 0;
+ switch (cpi->common.bit_depth) {
+ case AOM_BITS_8: rdmult = 88 * q * q / 24; break;
+ case AOM_BITS_10: rdmult = ROUND_POWER_OF_TWO(88 * q * q / 24, 4); break; // same formula, scaled down by 2^4
+ case AOM_BITS_12: rdmult = ROUND_POWER_OF_TWO(88 * q * q / 24, 8); break; // same formula, scaled down by 2^8
+ default:
+ assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12");
+ return -1; // unsupported bit depth (reached only when asserts are compiled out)
+ }
+#else
+ int64_t rdmult = 88 * q * q / 24;
+#endif // CONFIG_HIGHBITDEPTH
+ if (cpi->oxcf.pass == 2 && (cpi->common.frame_type != KEY_FRAME)) { // pass-2 non-key frames get extra scaling
+ const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+ const FRAME_UPDATE_TYPE frame_type = gf_group->update_type[gf_group->index];
+ const int boost_index = AOMMIN(15, (cpi->rc.gfu_boost / 100)); // capped at the last rd_boost_factor entry
+
+ rdmult = (rdmult * rd_frame_type_factor[frame_type]) >> 7; // factor is in units of 1/128
+ rdmult += ((rdmult * rd_boost_factor[boost_index]) >> 7); // add the boost, also in units of 1/128
+ }
+ if (rdmult < 1) rdmult = 1; // never return less than 1 on this path
+ return (int)rdmult;
+}
+
+static int compute_rd_thresh_factor(int qindex, aom_bit_depth_t bit_depth) { // Returns max((int)(q^RD_THRESH_POW * 5.12), 8).
+ double q;
+#if CONFIG_HIGHBITDEPTH
+ switch (bit_depth) {
+ case AOM_BITS_8: q = av1_dc_quant(qindex, 0, AOM_BITS_8) / 4.0; break; // divisor grows 4x per 2 extra bits
+ case AOM_BITS_10: q = av1_dc_quant(qindex, 0, AOM_BITS_10) / 16.0; break;
+ case AOM_BITS_12: q = av1_dc_quant(qindex, 0, AOM_BITS_12) / 64.0; break;
+ default:
+ assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12");
+ return -1; // unsupported bit depth (reached only when asserts are compiled out)
+ }
+#else
+ (void)bit_depth; // the 8-bit formula is always used in this configuration
+ q = av1_dc_quant(qindex, 0, AOM_BITS_8) / 4.0;
+#endif // CONFIG_HIGHBITDEPTH
+ // TODO(debargha): Adjust the function below.
+ return AOMMAX((int)(pow(q, RD_THRESH_POW) * 5.12), 8);
+}
+
+void av1_initialize_me_consts(const AV1_COMP *cpi, MACROBLOCK *x, int qindex) { // Sets x->sadperbit16/sadperbit4 from the LUTs.
+#if CONFIG_HIGHBITDEPTH
+ switch (cpi->common.bit_depth) { // pick the table pair matching the stream bit depth
+ case AOM_BITS_8:
+ x->sadperbit16 = sad_per_bit16lut_8[qindex];
+ x->sadperbit4 = sad_per_bit4lut_8[qindex];
+ break;
+ case AOM_BITS_10:
+ x->sadperbit16 = sad_per_bit16lut_10[qindex];
+ x->sadperbit4 = sad_per_bit4lut_10[qindex];
+ break;
+ case AOM_BITS_12:
+ x->sadperbit16 = sad_per_bit16lut_12[qindex];
+ x->sadperbit4 = sad_per_bit4lut_12[qindex];
+ break;
+ default:
+ assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12"); // x is left unmodified here
+ }
+#else
+ (void)cpi; // only the 8-bit tables exist in this configuration
+ x->sadperbit16 = sad_per_bit16lut_8[qindex];
+ x->sadperbit4 = sad_per_bit4lut_8[qindex];
+#endif // CONFIG_HIGHBITDEPTH
+}
+
+static void set_block_thresholds(const AV1_COMMON *cm, RD_OPT *rd) { // Fills rd->threshes[segment][bsize][mode].
+ int i, bsize, segment_id;
+
+ for (segment_id = 0; segment_id < MAX_SEGMENTS; ++segment_id) {
+ const int qindex =
+ clamp(av1_get_qindex(&cm->seg, segment_id, cm->base_qindex) +
+ cm->y_dc_delta_q,
+ 0, MAXQ); // per-segment qindex plus the luma DC delta, clamped to [0, MAXQ]
+ const int q = compute_rd_thresh_factor(qindex, cm->bit_depth);
+
+ for (bsize = 0; bsize < BLOCK_SIZES; ++bsize) {
+ // Threshold here seems unnecessarily harsh but fine given actual
+ // range of values used for cpi->sf.thresh_mult[].
+ const int t = q * rd_thresh_block_size_factor[bsize];
+ const int thresh_max = INT_MAX / t; // multipliers at or above this saturate to INT_MAX below
+
+#if CONFIG_CB4X4
+ for (i = 0; i < MAX_MODES; ++i)
+ rd->threshes[segment_id][bsize][i] = rd->thresh_mult[i] < thresh_max
+ ? rd->thresh_mult[i] * t / 4
+ : INT_MAX;
+#else
+ if (bsize >= BLOCK_8X8) {
+ for (i = 0; i < MAX_MODES; ++i)
+ rd->threshes[segment_id][bsize][i] = rd->thresh_mult[i] < thresh_max
+ ? rd->thresh_mult[i] * t / 4
+ : INT_MAX;
+ } else { // sub-8x8 blocks use thresh_mult_sub8x8[] and only MAX_REFS entries
+ for (i = 0; i < MAX_REFS; ++i)
+ rd->threshes[segment_id][bsize][i] =
+ rd->thresh_mult_sub8x8[i] < thresh_max
+ ? rd->thresh_mult_sub8x8[i] * t / 4
+ : INT_MAX;
+ }
+#endif
+ }
+ }
+}
+
+#if CONFIG_REF_MV
+void av1_set_mvcost(MACROBLOCK *x, MV_REFERENCE_FRAME ref_frame, int ref,
+ int ref_mv_idx) { // Points x's MV cost tables at the entries for the derived nmv context.
+ MB_MODE_INFO_EXT *mbmi_ext = x->mbmi_ext;
+ int8_t rf_type = av1_ref_frame_type(x->e_mbd.mi[0]->mbmi.ref_frame); // taken from the block's mode info, not |ref_frame|
+ int nmv_ctx = av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type],
+ mbmi_ext->ref_mv_stack[rf_type], ref, ref_mv_idx);
+ (void)ref_frame; // parameter is unused
+ x->mvcost = x->mv_cost_stack[nmv_ctx];
+ x->nmvjointcost = x->nmv_vec_cost[nmv_ctx];
+ x->mvsadcost = x->mvcost; // the SAD cost pointers alias the regular cost tables
+ x->nmvjointsadcost = x->nmvjointcost;
+}
+#endif
+
+void av1_initialize_rd_consts(AV1_COMP *cpi) { // Sets up RD constants, thresholds and cost tables for the frame.
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCK *const x = &cpi->td.mb;
+ RD_OPT *const rd = &cpi->rd;
+ int i;
+#if CONFIG_REF_MV
+ int nmv_ctx;
+#endif
+
+ aom_clear_system_state();
+
+ rd->RDDIV = RDDIV_BITS; // In bits (to multiply D by 128).
+ rd->RDMULT = av1_compute_rd_mult(cpi, cm->base_qindex + cm->y_dc_delta_q);
+
+ set_error_per_bit(x, rd->RDMULT);
+
+ set_block_thresholds(cm, rd);
+
+#if CONFIG_REF_MV
+ for (nmv_ctx = 0; nmv_ctx < NMV_CONTEXTS; ++nmv_ctx) { // one MV cost table per nmv context
+ av1_build_nmv_cost_table(
+ x->nmv_vec_cost[nmv_ctx],
+ cm->allow_high_precision_mv ? x->nmvcost_hp[nmv_ctx]
+ : x->nmvcost[nmv_ctx],
+ &cm->fc->nmvc[nmv_ctx], cm->allow_high_precision_mv);
+ }
+ x->mvcost = x->mv_cost_stack[0]; // start out pointing at context 0
+ x->nmvjointcost = x->nmv_vec_cost[0];
+ x->mvsadcost = x->mvcost;
+ x->nmvjointsadcost = x->nmvjointcost;
+#else
+ av1_build_nmv_cost_table(
+ x->nmvjointcost, cm->allow_high_precision_mv ? x->nmvcost_hp : x->nmvcost,
+ &cm->fc->nmvc, cm->allow_high_precision_mv);
+#endif
+
+ if (cpi->oxcf.pass != 1) { // the remaining cost tables are not built during the first pass
+ av1_fill_token_costs(x->token_costs, cm->fc->coef_probs);
+
+ if (cpi->sf.partition_search_type != VAR_BASED_PARTITION ||
+ cm->frame_type == KEY_FRAME) { // partition costs are skipped for variance-based partitioning on non-key frames
+#if CONFIG_EXT_PARTITION_TYPES
+ for (i = 0; i < PARTITION_PLOFFSET; ++i)
+ av1_cost_tokens(cpi->partition_cost[i], cm->fc->partition_prob[i],
+ av1_partition_tree);
+ for (; i < PARTITION_CONTEXTS_PRIMARY; ++i) // continues from i = PARTITION_PLOFFSET with the extended tree
+ av1_cost_tokens(cpi->partition_cost[i], cm->fc->partition_prob[i],
+ av1_ext_partition_tree);
+#else
+ for (i = 0; i < PARTITION_CONTEXTS_PRIMARY; ++i)
+ av1_cost_tokens(cpi->partition_cost[i], cm->fc->partition_prob[i],
+ av1_partition_tree);
+#endif // CONFIG_EXT_PARTITION_TYPES
+#if CONFIG_UNPOISON_PARTITION_CTX
+ for (; i < PARTITION_CONTEXTS_PRIMARY + PARTITION_BLOCK_SIZES; ++i) { // contexts where only VERT or SPLIT has a finite cost
+ aom_prob p = cm->fc->partition_prob[i][PARTITION_VERT];
+ assert(p > 0);
+ cpi->partition_cost[i][PARTITION_NONE] = INT_MAX;
+ cpi->partition_cost[i][PARTITION_HORZ] = INT_MAX;
+ cpi->partition_cost[i][PARTITION_VERT] = av1_cost_bit(p, 0);
+ cpi->partition_cost[i][PARTITION_SPLIT] = av1_cost_bit(p, 1);
+ }
+ for (; i < PARTITION_CONTEXTS_PRIMARY + 2 * PARTITION_BLOCK_SIZES; ++i) { // contexts where only HORZ or SPLIT has a finite cost
+ aom_prob p = cm->fc->partition_prob[i][PARTITION_HORZ];
+ assert(p > 0);
+ cpi->partition_cost[i][PARTITION_NONE] = INT_MAX;
+ cpi->partition_cost[i][PARTITION_HORZ] = av1_cost_bit(p, 0);
+ cpi->partition_cost[i][PARTITION_VERT] = INT_MAX;
+ cpi->partition_cost[i][PARTITION_SPLIT] = av1_cost_bit(p, 1);
+ }
+ cpi->partition_cost[PARTITION_CONTEXTS][PARTITION_NONE] = INT_MAX; // final context: SPLIT costs 0, everything else INT_MAX
+ cpi->partition_cost[PARTITION_CONTEXTS][PARTITION_HORZ] = INT_MAX;
+ cpi->partition_cost[PARTITION_CONTEXTS][PARTITION_VERT] = INT_MAX;
+ cpi->partition_cost[PARTITION_CONTEXTS][PARTITION_SPLIT] = 0;
+#endif // CONFIG_UNPOISON_PARTITION_CTX
+ }
+
+ fill_mode_costs(cpi);
+
+ if (!frame_is_intra_only(cm)) { // inter-mode costs are only built for frames that are not intra-only
+#if CONFIG_REF_MV
+ for (i = 0; i < NEWMV_MODE_CONTEXTS; ++i) {
+ cpi->newmv_mode_cost[i][0] = av1_cost_bit(cm->fc->newmv_prob[i], 0);
+ cpi->newmv_mode_cost[i][1] = av1_cost_bit(cm->fc->newmv_prob[i], 1);
+ }
+
+ for (i = 0; i < ZEROMV_MODE_CONTEXTS; ++i) {
+ cpi->zeromv_mode_cost[i][0] = av1_cost_bit(cm->fc->zeromv_prob[i], 0);
+ cpi->zeromv_mode_cost[i][1] = av1_cost_bit(cm->fc->zeromv_prob[i], 1);
+ }
+
+ for (i = 0; i < REFMV_MODE_CONTEXTS; ++i) {
+ cpi->refmv_mode_cost[i][0] = av1_cost_bit(cm->fc->refmv_prob[i], 0);
+ cpi->refmv_mode_cost[i][1] = av1_cost_bit(cm->fc->refmv_prob[i], 1);
+ }
+
+ for (i = 0; i < DRL_MODE_CONTEXTS; ++i) {
+ cpi->drl_mode_cost0[i][0] = av1_cost_bit(cm->fc->drl_prob[i], 0);
+ cpi->drl_mode_cost0[i][1] = av1_cost_bit(cm->fc->drl_prob[i], 1);
+ }
+#else
+ for (i = 0; i < INTER_MODE_CONTEXTS; ++i)
+ av1_cost_tokens((int *)cpi->inter_mode_cost[i],
+ cm->fc->inter_mode_probs[i], av1_inter_mode_tree);
+#endif // CONFIG_REF_MV
+#if CONFIG_EXT_INTER
+ for (i = 0; i < INTER_MODE_CONTEXTS; ++i)
+ av1_cost_tokens((int *)cpi->inter_compound_mode_cost[i],
+ cm->fc->inter_compound_mode_probs[i],
+ av1_inter_compound_mode_tree);
+ for (i = 0; i < BLOCK_SIZE_GROUPS; ++i)
+ av1_cost_tokens((int *)cpi->interintra_mode_cost[i],
+ cm->fc->interintra_mode_prob[i],
+ av1_interintra_mode_tree);
+#endif // CONFIG_EXT_INTER
+#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+ for (i = BLOCK_8X8; i < BLOCK_SIZES; i++) { // entries below BLOCK_8X8 are not written
+ av1_cost_tokens((int *)cpi->motion_mode_cost[i],
+ cm->fc->motion_mode_prob[i], av1_motion_mode_tree);
+ }
+#if CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION
+ for (i = BLOCK_8X8; i < BLOCK_SIZES; i++) {
+ cpi->motion_mode_cost1[i][0] = av1_cost_bit(cm->fc->obmc_prob[i], 0);
+ cpi->motion_mode_cost1[i][1] = av1_cost_bit(cm->fc->obmc_prob[i], 1);
+ }
+#endif // CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION
+#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+ }
+ }
+}
+
+static void model_rd_norm(int xsq_q10, int *r_q10, int *d_q10) { // Interpolates normalized rate/distortion (Q10) for xsq_q10.
+ // NOTE: The tables below must be of the same size.
+
+ // The functions described below are sampled at the four most significant
+ // bits of x^2 + 8 / 256.
+
+ // Normalized rate:
+ // This table models the rate for a Laplacian source with given variance
+ // when quantized with a uniform quantizer with given stepsize. The
+ // closed form expression is:
+ // Rn(x) = H(sqrt(r)) + sqrt(r)*[1 + H(r)/(1 - r)],
+ // where r = exp(-sqrt(2) * x) and x = qpstep / sqrt(variance),
+ // and H(x) is the binary entropy function.
+ static const int rate_tab_q10[] = {
+ 65536, 6086, 5574, 5275, 5063, 4899, 4764, 4651, 4553, 4389, 4255, 4142,
+ 4044, 3958, 3881, 3811, 3748, 3635, 3538, 3453, 3376, 3307, 3244, 3186,
+ 3133, 3037, 2952, 2877, 2809, 2747, 2690, 2638, 2589, 2501, 2423, 2353,
+ 2290, 2232, 2179, 2130, 2084, 2001, 1928, 1862, 1802, 1748, 1698, 1651,
+ 1608, 1530, 1460, 1398, 1342, 1290, 1243, 1199, 1159, 1086, 1021, 963,
+ 911, 864, 821, 781, 745, 680, 623, 574, 530, 490, 455, 424,
+ 395, 345, 304, 269, 239, 213, 190, 171, 154, 126, 104, 87,
+ 73, 61, 52, 44, 38, 28, 21, 16, 12, 10, 8, 6,
+ 5, 3, 2, 1, 1, 1, 0, 0,
+ };
+ // Normalized distortion:
+ // This table models the normalized distortion for a Laplacian source
+ // with given variance when quantized with a uniform quantizer
+ // with given stepsize. The closed form expression is:
+ // Dn(x) = 1 - 1/sqrt(2) * x / sinh(x/sqrt(2))
+ // where x = qpstep / sqrt(variance).
+ // Note the actual distortion is Dn * variance.
+ static const int dist_tab_q10[] = {
+ 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 4, 5,
+ 5, 6, 7, 7, 8, 9, 11, 12, 13, 15, 16, 17,
+ 18, 21, 24, 26, 29, 31, 34, 36, 39, 44, 49, 54,
+ 59, 64, 69, 73, 78, 88, 97, 106, 115, 124, 133, 142,
+ 151, 167, 184, 200, 215, 231, 245, 260, 274, 301, 327, 351,
+ 375, 397, 418, 439, 458, 495, 528, 559, 587, 613, 637, 659,
+ 680, 717, 749, 777, 801, 823, 842, 859, 874, 899, 919, 936,
+ 949, 960, 969, 977, 983, 994, 1001, 1006, 1010, 1013, 1015, 1017,
+ 1018, 1020, 1022, 1022, 1023, 1023, 1023, 1024,
+ };
+ static const int xsq_iq_q10[] = { // the xsq_q10 value at which each table entry was sampled
+ 0, 4, 8, 12, 16, 20, 24, 28, 32,
+ 40, 48, 56, 64, 72, 80, 88, 96, 112,
+ 128, 144, 160, 176, 192, 208, 224, 256, 288,
+ 320, 352, 384, 416, 448, 480, 544, 608, 672,
+ 736, 800, 864, 928, 992, 1120, 1248, 1376, 1504,
+ 1632, 1760, 1888, 2016, 2272, 2528, 2784, 3040, 3296,
+ 3552, 3808, 4064, 4576, 5088, 5600, 6112, 6624, 7136,
+ 7648, 8160, 9184, 10208, 11232, 12256, 13280, 14304, 15328,
+ 16352, 18400, 20448, 22496, 24544, 26592, 28640, 30688, 32736,
+ 36832, 40928, 45024, 49120, 53216, 57312, 61408, 65504, 73696,
+ 81888, 90080, 98272, 106464, 114656, 122848, 131040, 147424, 163808,
+ 180192, 196576, 212960, 229344, 245728,
+ };
+ const int tmp = (xsq_q10 >> 2) + 8;
+ const int k = get_msb(tmp) - 3; // tmp >= 8 for non-negative input, so k >= 0
+ const int xq = (k << 3) + ((tmp >> k) & 0x7); // table index: 8 entries per power of two of tmp
+ const int one_q10 = 1 << 10;
+ const int a_q10 = ((xsq_q10 - xsq_iq_q10[xq]) << 10) >> (2 + k); // Q10 weight of entry xq + 1
+ const int b_q10 = one_q10 - a_q10; // Q10 weight of entry xq
+ *r_q10 = (rate_tab_q10[xq] * b_q10 + rate_tab_q10[xq + 1] * a_q10) >> 10; // caller must keep xq + 1 inside the tables
+ *d_q10 = (dist_tab_q10[xq] * b_q10 + dist_tab_q10[xq + 1] * a_q10) >> 10;
+}
+
+void av1_model_rd_from_var_lapndz(int64_t var, unsigned int n_log2,
+ unsigned int qstep, int *rate,
+ int64_t *dist) { // Writes the modelled rate to *rate and distortion to *dist.
+ // This function models the rate and distortion for a Laplacian
+ // source with given variance when quantized with a uniform quantizer
+ // with given stepsize. The closed form expressions are in:
+ // Hang and Chen, "Source Model for transform video coder and its
+ // application - Part I: Fundamental Theory", IEEE Trans. Circ.
+ // Sys. for Video Tech., April 1997.
+ if (var == 0) { // zero variance: both outputs are 0 (also avoids dividing by zero below)
+ *rate = 0;
+ *dist = 0;
+ } else {
+ int d_q10, r_q10;
+ static const uint32_t MAX_XSQ_Q10 = 245727; // largest input for which model_rd_norm()'s xq + 1 stays in its tables
+ const uint64_t xsq_q10_64 =
+ (((uint64_t)qstep * qstep << (n_log2 + 10)) + (var >> 1)) / var; // qstep^2 * 2^n_log2 / var in Q10, rounded
+ const int xsq_q10 = (int)AOMMIN(xsq_q10_64, MAX_XSQ_Q10);
+ model_rd_norm(xsq_q10, &r_q10, &d_q10);
+ *rate = ROUND_POWER_OF_TWO(r_q10 << n_log2, 10 - AV1_PROB_COST_SHIFT);
+ *dist = (var * (int64_t)d_q10 + 512) >> 10; // Dn * variance, rounded out of Q10
+ }
+}
+
+static void get_entropy_contexts_plane(
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size, const struct macroblockd_plane *pd,
+ ENTROPY_CONTEXT t_above[2 * MAX_MIB_SIZE],
+ ENTROPY_CONTEXT t_left[2 * MAX_MIB_SIZE]) { // Builds t_above/t_left from pd's above/left contexts for tx_size.
+ const int num_4x4_w = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
+ const int num_4x4_h = block_size_high[plane_bsize] >> tx_size_high_log2[0];
+ const ENTROPY_CONTEXT *const above = pd->above_context;
+ const ENTROPY_CONTEXT *const left = pd->left_context;
+
+ int i;
+
+#if CONFIG_CB4X4
+ switch (tx_size) {
+ case TX_2X2: // smallest size here: contexts are copied unchanged
+ memcpy(t_above, above, sizeof(ENTROPY_CONTEXT) * num_4x4_w);
+ memcpy(t_left, left, sizeof(ENTROPY_CONTEXT) * num_4x4_h);
+ break;
+ case TX_4X4:
+ for (i = 0; i < num_4x4_w; i += 2)
+ t_above[i] = !!*(const uint16_t *)&above[i]; // 1 if either of the 2 entries read here is non-zero
+ for (i = 0; i < num_4x4_h; i += 2)
+ t_left[i] = !!*(const uint16_t *)&left[i];
+ break;
+ case TX_8X8:
+ for (i = 0; i < num_4x4_w; i += 4)
+ t_above[i] = !!*(const uint32_t *)&above[i];
+ for (i = 0; i < num_4x4_h; i += 4)
+ t_left[i] = !!*(const uint32_t *)&left[i];
+ break;
+ case TX_16X16:
+ for (i = 0; i < num_4x4_w; i += 8)
+ t_above[i] = !!*(const uint64_t *)&above[i];
+ for (i = 0; i < num_4x4_h; i += 8)
+ t_left[i] = !!*(const uint64_t *)&left[i];
+ break;
+ case TX_32X32:
+ for (i = 0; i < num_4x4_w; i += 16)
+ t_above[i] =
+ !!(*(const uint64_t *)&above[i] | *(const uint64_t *)&above[i + 8]); // 16 entries: two 64-bit reads OR-ed together
+ for (i = 0; i < num_4x4_h; i += 16)
+ t_left[i] =
+ !!(*(const uint64_t *)&left[i] | *(const uint64_t *)&left[i + 8]);
+ break;
+ case TX_4X8:
+ for (i = 0; i < num_4x4_w; i += 2)
+ t_above[i] = !!*(const uint16_t *)&above[i];
+ for (i = 0; i < num_4x4_h; i += 4)
+ t_left[i] = !!*(const uint32_t *)&left[i];
+ break;
+ case TX_8X4:
+ for (i = 0; i < num_4x4_w; i += 4)
+ t_above[i] = !!*(const uint32_t *)&above[i];
+ for (i = 0; i < num_4x4_h; i += 2)
+ t_left[i] = !!*(const uint16_t *)&left[i];
+ break;
+ case TX_8X16:
+ for (i = 0; i < num_4x4_w; i += 4)
+ t_above[i] = !!*(const uint32_t *)&above[i];
+ for (i = 0; i < num_4x4_h; i += 8)
+ t_left[i] = !!*(const uint64_t *)&left[i];
+ break;
+ case TX_16X8:
+ for (i = 0; i < num_4x4_w; i += 8)
+ t_above[i] = !!*(const uint64_t *)&above[i];
+ for (i = 0; i < num_4x4_h; i += 4)
+ t_left[i] = !!*(const uint32_t *)&left[i];
+ break;
+ case TX_16X32:
+ for (i = 0; i < num_4x4_w; i += 8)
+ t_above[i] = !!*(const uint64_t *)&above[i];
+ for (i = 0; i < num_4x4_h; i += 16)
+ t_left[i] =
+ !!(*(const uint64_t *)&left[i] | *(const uint64_t *)&left[i + 8]);
+ break;
+ case TX_32X16:
+ for (i = 0; i < num_4x4_w; i += 16)
+ t_above[i] =
+ !!(*(const uint64_t *)&above[i] | *(const uint64_t *)&above[i + 8]);
+ for (i = 0; i < num_4x4_h; i += 8)
+ t_left[i] = !!*(const uint64_t *)&left[i];
+ break;
+
+ default: assert(0 && "Invalid transform size."); break;
+ }
+ return; // CONFIG_CB4X4 builds never reach the switch below
+#endif
+
+ switch (tx_size) {
+ case TX_4X4: // smallest size here: contexts are copied unchanged
+ memcpy(t_above, above, sizeof(ENTROPY_CONTEXT) * num_4x4_w);
+ memcpy(t_left, left, sizeof(ENTROPY_CONTEXT) * num_4x4_h);
+ break;
+ case TX_8X8:
+ for (i = 0; i < num_4x4_w; i += 2)
+ t_above[i] = !!*(const uint16_t *)&above[i];
+ for (i = 0; i < num_4x4_h; i += 2)
+ t_left[i] = !!*(const uint16_t *)&left[i];
+ break;
+ case TX_16X16:
+ for (i = 0; i < num_4x4_w; i += 4)
+ t_above[i] = !!*(const uint32_t *)&above[i];
+ for (i = 0; i < num_4x4_h; i += 4)
+ t_left[i] = !!*(const uint32_t *)&left[i];
+ break;
+ case TX_32X32:
+ for (i = 0; i < num_4x4_w; i += 8)
+ t_above[i] = !!*(const uint64_t *)&above[i];
+ for (i = 0; i < num_4x4_h; i += 8)
+ t_left[i] = !!*(const uint64_t *)&left[i];
+ break;
+#if CONFIG_TX64X64
+ case TX_64X64:
+ for (i = 0; i < num_4x4_w; i += 16)
+ t_above[i] =
+ !!(*(const uint64_t *)&above[i] | *(const uint64_t *)&above[i + 8]);
+ for (i = 0; i < num_4x4_h; i += 16)
+ t_left[i] =
+ !!(*(const uint64_t *)&left[i] | *(const uint64_t *)&left[i + 8]);
+ break;
+#endif // CONFIG_TX64X64
+ case TX_4X8: // rectangular sizes treat width (above) and height (left) independently
+ memcpy(t_above, above, sizeof(ENTROPY_CONTEXT) * num_4x4_w);
+ for (i = 0; i < num_4x4_h; i += 2)
+ t_left[i] = !!*(const uint16_t *)&left[i];
+ break;
+ case TX_8X4:
+ for (i = 0; i < num_4x4_w; i += 2)
+ t_above[i] = !!*(const uint16_t *)&above[i];
+ memcpy(t_left, left, sizeof(ENTROPY_CONTEXT) * num_4x4_h);
+ break;
+ case TX_8X16:
+ for (i = 0; i < num_4x4_w; i += 2)
+ t_above[i] = !!*(const uint16_t *)&above[i];
+ for (i = 0; i < num_4x4_h; i += 4)
+ t_left[i] = !!*(const uint32_t *)&left[i];
+ break;
+ case TX_16X8:
+ for (i = 0; i < num_4x4_w; i += 4)
+ t_above[i] = !!*(const uint32_t *)&above[i];
+ for (i = 0; i < num_4x4_h; i += 2)
+ t_left[i] = !!*(const uint16_t *)&left[i];
+ break;
+ case TX_16X32:
+ for (i = 0; i < num_4x4_w; i += 4)
+ t_above[i] = !!*(const uint32_t *)&above[i];
+ for (i = 0; i < num_4x4_h; i += 8)
+ t_left[i] = !!*(const uint64_t *)&left[i];
+ break;
+ case TX_32X16:
+ for (i = 0; i < num_4x4_w; i += 8)
+ t_above[i] = !!*(const uint64_t *)&above[i];
+ for (i = 0; i < num_4x4_h; i += 4)
+ t_left[i] = !!*(const uint32_t *)&left[i];
+ break;
+ default: assert(0 && "Invalid transform size."); break;
+ }
+}
+
+// Fills |t_above| and |t_left| with the entropy contexts for one plane.
+// |bsize| is first mapped to the plane's own block size with
+// get_plane_block_size(); the actual work is then delegated to
+// get_entropy_contexts_plane().
+void av1_get_entropy_contexts(BLOCK_SIZE bsize, TX_SIZE tx_size,
+                              const struct macroblockd_plane *pd,
+                              ENTROPY_CONTEXT t_above[2 * MAX_MIB_SIZE],
+                              ENTROPY_CONTEXT t_left[2 * MAX_MIB_SIZE]) {
+  get_entropy_contexts_plane(get_plane_block_size(bsize, pd), tx_size, pd,
+                             t_above, t_left);
+}
+
+// Scores the candidate reference motion vectors for |ref_frame| by SAD
+// against the source luma block and stores the outcome in |x|:
+//   x->mv_best_ref_index[ref_frame]  index of the lowest-SAD candidate
+//   x->max_mv_context[ref_frame]     largest |row| or |col| seen, >> 3
+//   x->pred_mv_sad[ref_frame]        the lowest SAD found
+// Candidates are the two entries of mbmi_ext->ref_mvs[ref_frame] plus, when
+// adaptive motion search is on and the block is smaller than the maximum
+// partition size, x->pred_mv[ref_frame].
+void av1_mv_pred(const AV1_COMP *cpi, MACROBLOCK *x, uint8_t *ref_y_buffer,
+                 int ref_y_stride, int ref_frame, BLOCK_SIZE block_size) {
+  uint8_t *const src_y = x->plane[0].src.buf;
+  const int candidate_count =
+      MAX_MV_REF_CANDIDATES +
+      (cpi->sf.adaptive_motion_search && block_size < x->max_partition_size);
+  const int first_two_equal = x->mbmi_ext->ref_mvs[ref_frame][0].as_int ==
+                              x->mbmi_ext->ref_mvs[ref_frame][1].as_int;
+  MV candidates[3];
+  int lowest_sad = INT_MAX;
+  int lowest_idx = 0;
+  int largest_mv = 0;
+  int have_zero = 0;
+  int idx;
+
+  candidates[0] = x->mbmi_ext->ref_mvs[ref_frame][0].as_mv;
+  candidates[1] = x->mbmi_ext->ref_mvs[ref_frame][1].as_mv;
+  candidates[2] = x->pred_mv[ref_frame];
+  assert(candidate_count <=
+         (int)(sizeof(candidates) / sizeof(candidates[0])));
+
+  for (idx = 0; idx < candidate_count; ++idx) {
+    const MV *const mv = &candidates[idx];
+    int row_fp, col_fp, is_zero, sad;
+
+    // The second candidate is redundant when it equals the first.
+    if (idx == 1 && first_two_equal) continue;
+
+    // Round the 1/8-unit vector to whole units.
+    row_fp = (mv->row + 3 + (mv->row >= 0)) >> 3;
+    col_fp = (mv->col + 3 + (mv->col >= 0)) >> 3;
+    largest_mv = AOMMAX(largest_mv, AOMMAX(abs(mv->row), abs(mv->col)) >> 3);
+
+    // Only the first candidate that rounds to (0, 0) is evaluated.
+    is_zero = (row_fp == 0 && col_fp == 0);
+    if (is_zero) {
+      if (have_zero) continue;
+      have_zero = 1;
+    }
+
+    sad = cpi->fn_ptr[block_size].sdf(
+        src_y, x->plane[0].src.stride,
+        &ref_y_buffer[ref_y_stride * row_fp + col_fp], ref_y_stride);
+    if (sad < lowest_sad) {
+      lowest_sad = sad;
+      lowest_idx = idx;
+    }
+  }
+
+  x->mv_best_ref_index[ref_frame] = lowest_idx;
+  x->max_mv_context[ref_frame] = largest_mv;
+  x->pred_mv_sad[ref_frame] = lowest_sad;
+}
+
+// Points the per-plane buffers in |dst| at the Y, U and V planes of |src| and
+// then calls setup_pred_plane() on each one for the block at
+// (mi_row, mi_col). Plane 0 uses the luma crop size and |scale|; planes 1 and
+// 2 use the chroma crop size and |scale_uv|.
+void av1_setup_pred_block(const MACROBLOCKD *xd,
+                          struct buf_2d dst[MAX_MB_PLANE],
+                          const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col,
+                          const struct scale_factors *scale,
+                          const struct scale_factors *scale_uv) {
+  int plane;
+
+  dst[0].buf = src->y_buffer;
+  dst[1].buf = src->u_buffer;
+  dst[2].buf = src->v_buffer;
+
+  dst[0].stride = src->y_stride;
+  dst[2].stride = src->uv_stride;
+  dst[1].stride = dst[2].stride;
+
+  for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+    const int is_chroma = plane != 0;
+    setup_pred_plane(&dst[plane], xd->mi[0]->mbmi.sb_type, dst[plane].buf,
+                     is_chroma ? src->uv_crop_width : src->y_crop_width,
+                     is_chroma ? src->uv_crop_height : src->y_crop_height,
+                     dst[plane].stride, mi_row, mi_col,
+                     is_chroma ? scale_uv : scale,
+                     xd->plane[plane].subsampling_x,
+                     xd->plane[plane].subsampling_y);
+  }
+}
+
+// Converts a raster-order block index into a buffer offset. The index is
+// split into a row (index >> log2 width) and a column (index masked to the
+// low log2-width bits) using b_width_log2_lookup[plane_bsize]; each is scaled
+// by 4 and combined as row * stride + column.
+int av1_raster_block_offset(BLOCK_SIZE plane_bsize, int raster_block,
+                            int stride) {
+  const int log2_w = b_width_log2_lookup[plane_bsize];
+  const int col = 4 * (raster_block & ((1 << log2_w) - 1));
+  const int row = 4 * (raster_block >> log2_w);
+  return row * stride + col;
+}
+
+// Returns |base| advanced by av1_raster_block_offset() for |raster_block|,
+// using block_size_wide[plane_bsize] as the stride.
+int16_t *av1_raster_block_offset_int16(BLOCK_SIZE plane_bsize, int raster_block,
+                                       int16_t *base) {
+  const int row_stride = block_size_wide[plane_bsize];
+  const int offset =
+      av1_raster_block_offset(plane_bsize, raster_block, row_stride);
+  return &base[offset];
+}
+
+// Returns the scaled copy of reference |ref_frame|, or NULL when there is
+// none: either the slot in cpi->scaled_ref_idx[] is INVALID_IDX or it refers
+// to the same buffer index that get_ref_frame_buf_idx() reports.
+YV12_BUFFER_CONFIG *av1_get_scaled_ref_frame(const AV1_COMP *cpi,
+                                             int ref_frame) {
+  const int ref_idx = get_ref_frame_buf_idx(cpi, ref_frame);
+  const int scaled_idx = cpi->scaled_ref_idx[ref_frame - 1];
+
+  if (scaled_idx == ref_idx) return NULL;
+  if (scaled_idx == INVALID_IDX) return NULL;
+  return &cpi->common.buffer_pool->frame_bufs[scaled_idx].buf;
+}
+
+// Returns the rate cost of signalling the block's interpolation filter(s),
+// scaled by SWITCHABLE_INTERP_RATE_FACTOR, or 0 when the frame-level filter
+// is not SWITCHABLE.
+#if CONFIG_DUAL_FILTER
+// Dual-filter build: a direction (0 or 1) contributes only if the first
+// reference has a sub-pel component in it, or if a second reference exists
+// (ref_frame[1] > INTRA_FRAME) and has one (queried as dir + 2).
+int av1_get_switchable_rate(const AV1_COMP *cpi, const MACROBLOCKD *xd) {
+  int filter_cost = 0;
+  int dir;
+
+  if (cpi->common.interp_filter != SWITCHABLE) return 0;
+
+  for (dir = 0; dir < 2; ++dir) {
+    const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+    const int needs_filter =
+        has_subpel_mv_component(xd->mi[0], xd, dir) ||
+        (mbmi->ref_frame[1] > INTRA_FRAME &&
+         has_subpel_mv_component(xd->mi[0], xd, dir + 2));
+    int ctx;
+
+    if (!needs_filter) continue;
+    ctx = av1_get_pred_context_switchable_interp(xd, dir);
+    filter_cost += cpi->switchable_interp_costs[ctx][mbmi->interp_filter[dir]];
+  }
+  return SWITCHABLE_INTERP_RATE_FACTOR * filter_cost;
+}
+#else
+// Single-filter build: one context lookup and one table entry.
+int av1_get_switchable_rate(const AV1_COMP *cpi, const MACROBLOCKD *xd) {
+  int ctx;
+
+  if (cpi->common.interp_filter != SWITCHABLE) return 0;
+
+  ctx = av1_get_pred_context_switchable_interp(xd);
+  return SWITCHABLE_INTERP_RATE_FACTOR *
+         cpi->switchable_interp_costs[ctx][xd->mi[0]->mbmi.interp_filter];
+}
+#endif
+
+// Fills cpi->rd.thresh_mult[] with the per-mode RD threshold multipliers.
+//
+// Every entry starts from a baseline of (cpi->oxcf.mode == 0), i.e. 1 or 0.
+// The single-reference NEAREST* entries are then overwritten with 300 when
+// sf.adaptive_rd_thresh is set and with 0 otherwise. Every other mode listed
+// below has a fixed offset added to the baseline. All updates touch distinct
+// entries, so their order is irrelevant.
+//
+// NOTE(review): THR_SMOOTH (declared in THR_MODES under CONFIG_ALT_INTRA) is
+// not adjusted here and keeps the baseline value - confirm this is intended.
+void av1_set_rd_speed_thresholds(AV1_COMP *cpi) {
+  RD_OPT *const rd = &cpi->rd;
+  const int nearest_thresh = cpi->sf.adaptive_rd_thresh ? 300 : 0;
+  int mode;
+
+  // Baseline for every mode.
+  for (mode = 0; mode < MAX_MODES; ++mode)
+    rd->thresh_mult[mode] = cpi->oxcf.mode == 0;
+
+  // Single-reference NEAREST modes: assigned, not accumulated.
+  rd->thresh_mult[THR_NEARESTMV] = nearest_thresh;
+#if CONFIG_EXT_REFS
+  rd->thresh_mult[THR_NEARESTL2] = nearest_thresh;
+  rd->thresh_mult[THR_NEARESTL3] = nearest_thresh;
+  rd->thresh_mult[THR_NEARESTB] = nearest_thresh;
+#endif  // CONFIG_EXT_REFS
+  rd->thresh_mult[THR_NEARESTA] = nearest_thresh;
+  rd->thresh_mult[THR_NEARESTG] = nearest_thresh;
+
+  // Intra modes.
+  rd->thresh_mult[THR_DC] += 1000;
+  rd->thresh_mult[THR_TM] += 1000;
+  rd->thresh_mult[THR_H_PRED] += 2000;
+  rd->thresh_mult[THR_V_PRED] += 2000;
+  rd->thresh_mult[THR_D135_PRED] += 2500;
+  rd->thresh_mult[THR_D207_PRED] += 2500;
+  rd->thresh_mult[THR_D153_PRED] += 2500;
+  rd->thresh_mult[THR_D63_PRED] += 2500;
+  rd->thresh_mult[THR_D117_PRED] += 2500;
+  rd->thresh_mult[THR_D45_PRED] += 2500;
+
+  // Single-reference NEW modes.
+  rd->thresh_mult[THR_NEWMV] += 1000;
+#if CONFIG_EXT_REFS
+  rd->thresh_mult[THR_NEWL2] += 1000;
+  rd->thresh_mult[THR_NEWL3] += 1000;
+  rd->thresh_mult[THR_NEWB] += 1000;
+#endif  // CONFIG_EXT_REFS
+  rd->thresh_mult[THR_NEWA] += 1000;
+  rd->thresh_mult[THR_NEWG] += 1000;
+
+  // Single-reference NEAR modes.
+  rd->thresh_mult[THR_NEARMV] += 1000;
+#if CONFIG_EXT_REFS
+  rd->thresh_mult[THR_NEARL2] += 1000;
+  rd->thresh_mult[THR_NEARL3] += 1000;
+  rd->thresh_mult[THR_NEARB] += 1000;
+#endif  // CONFIG_EXT_REFS
+  rd->thresh_mult[THR_NEARA] += 1000;
+  rd->thresh_mult[THR_NEARG] += 1000;
+
+  // Single-reference ZERO modes.
+  rd->thresh_mult[THR_ZEROMV] += 2000;
+#if CONFIG_EXT_REFS
+  rd->thresh_mult[THR_ZEROL2] += 2000;
+  rd->thresh_mult[THR_ZEROL3] += 2000;
+  rd->thresh_mult[THR_ZEROB] += 2000;
+#endif  // CONFIG_EXT_REFS
+  rd->thresh_mult[THR_ZEROG] += 2000;
+  rd->thresh_mult[THR_ZEROA] += 2000;
+
+#if CONFIG_EXT_INTER
+
+  // Compound modes, grouped by reference pair.
+  rd->thresh_mult[THR_COMP_NEAREST_NEARESTLA] += 1000;
+  rd->thresh_mult[THR_COMP_NEAREST_NEARLA] += 1200;
+  rd->thresh_mult[THR_COMP_NEAR_NEARESTLA] += 1200;
+  rd->thresh_mult[THR_COMP_NEAR_NEARLA] += 1200;
+  rd->thresh_mult[THR_COMP_NEAREST_NEWLA] += 1500;
+  rd->thresh_mult[THR_COMP_NEW_NEARESTLA] += 1500;
+  rd->thresh_mult[THR_COMP_NEAR_NEWLA] += 1700;
+  rd->thresh_mult[THR_COMP_NEW_NEARLA] += 1700;
+  rd->thresh_mult[THR_COMP_NEW_NEWLA] += 2000;
+  rd->thresh_mult[THR_COMP_ZERO_ZEROLA] += 2500;
+
+#if CONFIG_EXT_REFS
+  rd->thresh_mult[THR_COMP_NEAREST_NEARESTL2A] += 1000;
+  rd->thresh_mult[THR_COMP_NEAREST_NEARL2A] += 1200;
+  rd->thresh_mult[THR_COMP_NEAR_NEARESTL2A] += 1200;
+  rd->thresh_mult[THR_COMP_NEAR_NEARL2A] += 1200;
+  rd->thresh_mult[THR_COMP_NEAREST_NEWL2A] += 1500;
+  rd->thresh_mult[THR_COMP_NEW_NEARESTL2A] += 1500;
+  rd->thresh_mult[THR_COMP_NEAR_NEWL2A] += 1700;
+  rd->thresh_mult[THR_COMP_NEW_NEARL2A] += 1700;
+  rd->thresh_mult[THR_COMP_NEW_NEWL2A] += 2000;
+  rd->thresh_mult[THR_COMP_ZERO_ZEROL2A] += 2500;
+
+  rd->thresh_mult[THR_COMP_NEAREST_NEARESTL3A] += 1000;
+  rd->thresh_mult[THR_COMP_NEAREST_NEARL3A] += 1200;
+  rd->thresh_mult[THR_COMP_NEAR_NEARESTL3A] += 1200;
+  rd->thresh_mult[THR_COMP_NEAR_NEARL3A] += 1200;
+  rd->thresh_mult[THR_COMP_NEAREST_NEWL3A] += 1500;
+  rd->thresh_mult[THR_COMP_NEW_NEARESTL3A] += 1500;
+  rd->thresh_mult[THR_COMP_NEAR_NEWL3A] += 1700;
+  rd->thresh_mult[THR_COMP_NEW_NEARL3A] += 1700;
+  rd->thresh_mult[THR_COMP_NEW_NEWL3A] += 2000;
+  rd->thresh_mult[THR_COMP_ZERO_ZEROL3A] += 2500;
+#endif  // CONFIG_EXT_REFS
+
+  rd->thresh_mult[THR_COMP_NEAREST_NEARESTGA] += 1000;
+  rd->thresh_mult[THR_COMP_NEAREST_NEARGA] += 1200;
+  rd->thresh_mult[THR_COMP_NEAR_NEARESTGA] += 1200;
+  rd->thresh_mult[THR_COMP_NEAR_NEARGA] += 1200;
+  rd->thresh_mult[THR_COMP_NEAREST_NEWGA] += 1500;
+  rd->thresh_mult[THR_COMP_NEW_NEARESTGA] += 1500;
+  rd->thresh_mult[THR_COMP_NEAR_NEWGA] += 1700;
+  rd->thresh_mult[THR_COMP_NEW_NEARGA] += 1700;
+  rd->thresh_mult[THR_COMP_NEW_NEWGA] += 2000;
+  rd->thresh_mult[THR_COMP_ZERO_ZEROGA] += 2500;
+
+#if CONFIG_EXT_REFS
+  rd->thresh_mult[THR_COMP_NEAREST_NEARESTLB] += 1000;
+  rd->thresh_mult[THR_COMP_NEAREST_NEARLB] += 1200;
+  rd->thresh_mult[THR_COMP_NEAR_NEARESTLB] += 1200;
+  rd->thresh_mult[THR_COMP_NEAR_NEARLB] += 1200;
+  rd->thresh_mult[THR_COMP_NEAREST_NEWLB] += 1500;
+  rd->thresh_mult[THR_COMP_NEW_NEARESTLB] += 1500;
+  rd->thresh_mult[THR_COMP_NEAR_NEWLB] += 1700;
+  rd->thresh_mult[THR_COMP_NEW_NEARLB] += 1700;
+  rd->thresh_mult[THR_COMP_NEW_NEWLB] += 2000;
+  rd->thresh_mult[THR_COMP_ZERO_ZEROLB] += 2500;
+
+  rd->thresh_mult[THR_COMP_NEAREST_NEARESTL2B] += 1000;
+  rd->thresh_mult[THR_COMP_NEAREST_NEARL2B] += 1200;
+  rd->thresh_mult[THR_COMP_NEAR_NEARESTL2B] += 1200;
+  rd->thresh_mult[THR_COMP_NEAR_NEARL2B] += 1200;
+  rd->thresh_mult[THR_COMP_NEAREST_NEWL2B] += 1500;
+  rd->thresh_mult[THR_COMP_NEW_NEARESTL2B] += 1500;
+  rd->thresh_mult[THR_COMP_NEAR_NEWL2B] += 1700;
+  rd->thresh_mult[THR_COMP_NEW_NEARL2B] += 1700;
+  rd->thresh_mult[THR_COMP_NEW_NEWL2B] += 2000;
+  rd->thresh_mult[THR_COMP_ZERO_ZEROL2B] += 2500;
+
+  rd->thresh_mult[THR_COMP_NEAREST_NEARESTL3B] += 1000;
+  rd->thresh_mult[THR_COMP_NEAREST_NEARL3B] += 1200;
+  rd->thresh_mult[THR_COMP_NEAR_NEARESTL3B] += 1200;
+  rd->thresh_mult[THR_COMP_NEAR_NEARL3B] += 1200;
+  rd->thresh_mult[THR_COMP_NEAREST_NEWL3B] += 1500;
+  rd->thresh_mult[THR_COMP_NEW_NEARESTL3B] += 1500;
+  rd->thresh_mult[THR_COMP_NEAR_NEWL3B] += 1700;
+  rd->thresh_mult[THR_COMP_NEW_NEARL3B] += 1700;
+  rd->thresh_mult[THR_COMP_NEW_NEWL3B] += 2000;
+  rd->thresh_mult[THR_COMP_ZERO_ZEROL3B] += 2500;
+
+  rd->thresh_mult[THR_COMP_NEAREST_NEARESTGB] += 1000;
+  rd->thresh_mult[THR_COMP_NEAREST_NEARGB] += 1200;
+  rd->thresh_mult[THR_COMP_NEAR_NEARESTGB] += 1200;
+  rd->thresh_mult[THR_COMP_NEAR_NEARGB] += 1200;
+  rd->thresh_mult[THR_COMP_NEAREST_NEWGB] += 1500;
+  rd->thresh_mult[THR_COMP_NEW_NEARESTGB] += 1500;
+  rd->thresh_mult[THR_COMP_NEAR_NEWGB] += 1700;
+  rd->thresh_mult[THR_COMP_NEW_NEARGB] += 1700;
+  rd->thresh_mult[THR_COMP_NEW_NEWGB] += 2000;
+  rd->thresh_mult[THR_COMP_ZERO_ZEROGB] += 2500;
+#endif  // CONFIG_EXT_REFS
+
+  // Inter-intra modes, grouped by reference.
+  rd->thresh_mult[THR_COMP_INTERINTRA_ZEROL] += 1500;
+  rd->thresh_mult[THR_COMP_INTERINTRA_NEARESTL] += 1500;
+  rd->thresh_mult[THR_COMP_INTERINTRA_NEARL] += 1500;
+  rd->thresh_mult[THR_COMP_INTERINTRA_NEWL] += 2000;
+
+#if CONFIG_EXT_REFS
+  rd->thresh_mult[THR_COMP_INTERINTRA_ZEROL2] += 1500;
+  rd->thresh_mult[THR_COMP_INTERINTRA_NEARESTL2] += 1500;
+  rd->thresh_mult[THR_COMP_INTERINTRA_NEARL2] += 1500;
+  rd->thresh_mult[THR_COMP_INTERINTRA_NEWL2] += 2000;
+
+  rd->thresh_mult[THR_COMP_INTERINTRA_ZEROL3] += 1500;
+  rd->thresh_mult[THR_COMP_INTERINTRA_NEARESTL3] += 1500;
+  rd->thresh_mult[THR_COMP_INTERINTRA_NEARL3] += 1500;
+  rd->thresh_mult[THR_COMP_INTERINTRA_NEWL3] += 2000;
+#endif  // CONFIG_EXT_REFS
+
+  rd->thresh_mult[THR_COMP_INTERINTRA_ZEROG] += 1500;
+  rd->thresh_mult[THR_COMP_INTERINTRA_NEARESTG] += 1500;
+  rd->thresh_mult[THR_COMP_INTERINTRA_NEARG] += 1500;
+  rd->thresh_mult[THR_COMP_INTERINTRA_NEWG] += 2000;
+
+#if CONFIG_EXT_REFS
+  rd->thresh_mult[THR_COMP_INTERINTRA_ZEROB] += 1500;
+  rd->thresh_mult[THR_COMP_INTERINTRA_NEARESTB] += 1500;
+  rd->thresh_mult[THR_COMP_INTERINTRA_NEARB] += 1500;
+  rd->thresh_mult[THR_COMP_INTERINTRA_NEWB] += 2000;
+#endif  // CONFIG_EXT_REFS
+
+  rd->thresh_mult[THR_COMP_INTERINTRA_ZEROA] += 1500;
+  rd->thresh_mult[THR_COMP_INTERINTRA_NEARESTA] += 1500;
+  rd->thresh_mult[THR_COMP_INTERINTRA_NEARA] += 1500;
+  rd->thresh_mult[THR_COMP_INTERINTRA_NEWA] += 2000;
+
+#else  // CONFIG_EXT_INTER
+
+  // Compound modes, grouped by reference pair.
+  rd->thresh_mult[THR_COMP_NEARESTLA] += 1000;
+  rd->thresh_mult[THR_COMP_NEARLA] += 1500;
+  rd->thresh_mult[THR_COMP_NEWLA] += 2000;
+  rd->thresh_mult[THR_COMP_ZEROLA] += 2500;
+
+#if CONFIG_EXT_REFS
+  rd->thresh_mult[THR_COMP_NEARESTL2A] += 1000;
+  rd->thresh_mult[THR_COMP_NEARL2A] += 1500;
+  rd->thresh_mult[THR_COMP_NEWL2A] += 2000;
+  rd->thresh_mult[THR_COMP_ZEROL2A] += 2500;
+
+  rd->thresh_mult[THR_COMP_NEARESTL3A] += 1000;
+  rd->thresh_mult[THR_COMP_NEARL3A] += 1500;
+  rd->thresh_mult[THR_COMP_NEWL3A] += 2000;
+  rd->thresh_mult[THR_COMP_ZEROL3A] += 2500;
+#endif  // CONFIG_EXT_REFS
+
+  rd->thresh_mult[THR_COMP_NEARESTGA] += 1000;
+  rd->thresh_mult[THR_COMP_NEARGA] += 1500;
+  rd->thresh_mult[THR_COMP_NEWGA] += 2000;
+  rd->thresh_mult[THR_COMP_ZEROGA] += 2500;
+
+#if CONFIG_EXT_REFS
+  rd->thresh_mult[THR_COMP_NEARESTLB] += 1000;
+  rd->thresh_mult[THR_COMP_NEARLB] += 1500;
+  rd->thresh_mult[THR_COMP_NEWLB] += 2000;
+  rd->thresh_mult[THR_COMP_ZEROLB] += 2500;
+
+  rd->thresh_mult[THR_COMP_NEARESTL2B] += 1000;
+  rd->thresh_mult[THR_COMP_NEARL2B] += 1500;
+  rd->thresh_mult[THR_COMP_NEWL2B] += 2000;
+  rd->thresh_mult[THR_COMP_ZEROL2B] += 2500;
+
+  rd->thresh_mult[THR_COMP_NEARESTL3B] += 1000;
+  rd->thresh_mult[THR_COMP_NEARL3B] += 1500;
+  rd->thresh_mult[THR_COMP_NEWL3B] += 2000;
+  rd->thresh_mult[THR_COMP_ZEROL3B] += 2500;
+
+  rd->thresh_mult[THR_COMP_NEARESTGB] += 1000;
+  rd->thresh_mult[THR_COMP_NEARGB] += 1500;
+  rd->thresh_mult[THR_COMP_NEWGB] += 2000;
+  rd->thresh_mult[THR_COMP_ZEROGB] += 2500;
+#endif  // CONFIG_EXT_REFS
+
+#endif  // CONFIG_EXT_INTER
+}
+
+// Copies a fixed table of MAX_REFS multipliers into
+// cpi->rd.thresh_mult_sub8x8. The row comments follow the declaration order
+// of THR_MODES_SUB8X8 (presumably the index space of this array - confirm
+// against the callers).
+void av1_set_rd_speed_thresholds_sub8x8(AV1_COMP *cpi) {
+  static const int sub8x8_thresh_mult[MAX_REFS] = {
+#if CONFIG_EXT_REFS
+    2500, 2500, 2500, 2500, 2500, 2500,              // LAST .. ALTR
+    4500, 4500, 4500, 4500, 4500, 4500, 4500, 4500,  // COMP_LA .. COMP_GB
+    2500                                             // INTRA
+#else
+    2500, 2500, 2500,  // LAST, GOLD, ALTR
+    4500, 4500,        // COMP_LA, COMP_GA
+    2500               // INTRA
+#endif  // CONFIG_EXT_REFS
+  };
+
+  memcpy(cpi->rd.thresh_mult_sub8x8, sub8x8_thresh_mult,
+         sizeof(sub8x8_thresh_mult));
+}
+
+// Adapts the per-block-size, per-mode threshold factors in |factor_buf| after
+// a mode decision. Does nothing unless |rd_thresh| is positive.
+//
+// Block sizes from max(bsize - 1, BLOCK_4X4) up to min(bsize + 2, sb_size)
+// are updated. For each of them the factor of |best_mode_index| decays by
+// 1/16 of its value, while every other mode's factor grows by RD_THRESH_INC,
+// capped at rd_thresh * RD_THRESH_MAX_FACT. Without CONFIG_CB4X4, blocks
+// below BLOCK_8X8 only touch the first MAX_REFS modes.
+void av1_update_rd_thresh_fact(const AV1_COMMON *const cm,
+                               int (*factor_buf)[MAX_MODES], int rd_thresh,
+                               int bsize, int best_mode_index) {
+  if (rd_thresh > 0) {
+#if CONFIG_CB4X4
+    const int mode_count = MAX_MODES;
+#else
+    const int mode_count = bsize < BLOCK_8X8 ? MAX_REFS : MAX_MODES;
+#endif
+    // The block-size window and the cap do not depend on the mode.
+    const BLOCK_SIZE first_bs = AOMMAX(bsize - 1, BLOCK_4X4);
+    const BLOCK_SIZE last_bs = AOMMIN(bsize + 2, (int)cm->sb_size);
+    const int fact_cap = rd_thresh * RD_THRESH_MAX_FACT;
+    BLOCK_SIZE bs;
+
+    for (bs = first_bs; bs <= last_bs; ++bs) {
+      int *const row = factor_buf[bs];
+      int mode;
+      for (mode = 0; mode < mode_count; ++mode) {
+        if (mode == best_mode_index)
+          row[mode] -= (row[mode] >> 4);
+        else
+          row[mode] = AOMMIN(row[mode] + RD_THRESH_INC, fact_cap);
+      }
+    }
+  }
+}
+
+// Returns an intra cost penalty proportional to the DC quantizer obtained
+// from av1_dc_quant(qindex, qdelta, bit_depth). The factor is 20 at 8 bits;
+// high-bit-depth builds use 5 at 10 bits and 5/4 (via ROUND_POWER_OF_TWO) at
+// 12 bits, and return -1 for any other depth.
+int av1_get_intra_cost_penalty(int qindex, int qdelta,
+                               aom_bit_depth_t bit_depth) {
+  const int dc_q = av1_dc_quant(qindex, qdelta, bit_depth);
+#if CONFIG_HIGHBITDEPTH
+  if (bit_depth == AOM_BITS_8) return 20 * dc_q;
+  if (bit_depth == AOM_BITS_10) return 5 * dc_q;
+  if (bit_depth == AOM_BITS_12) return ROUND_POWER_OF_TWO(5 * dc_q, 2);
+
+  assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12");
+  return -1;
+#else
+  return 20 * dc_q;
+#endif  // CONFIG_HIGHBITDEPTH
+}
diff --git a/third_party/aom/av1/encoder/rd.h b/third_party/aom/av1/encoder/rd.h
new file mode 100644
index 0000000000..c0ac1f7e75
--- /dev/null
+++ b/third_party/aom/av1/encoder/rd.h
@@ -0,0 +1,505 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_ENCODER_RD_H_
+#define AV1_ENCODER_RD_H_
+
+#include <limits.h>
+
+#if CONFIG_ANS
+#include "aom_dsp/ans.h"
+#endif // CONFIG_ANS
+#include "av1/common/blockd.h"
+
+#include "av1/encoder/block.h"
+#include "av1/encoder/context_tree.h"
+#include "av1/encoder/cost.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define RDDIV_BITS 7
+#define RD_EPB_SHIFT 6
+
+// Rate-distortion cost: the rate R is widened to int64_t, multiplied by RM
+// and reduced through ROUND_POWER_OF_TWO by AV1_PROB_COST_SHIFT bits, then
+// the distortion D shifted left by DM bits is added.
+// Every argument is parenthesized in the expansion so that a compound
+// argument (a conditional, bitwise or comparison expression) is evaluated as
+// a unit, matching RDCOST_DBL.
+#define RDCOST(RM, DM, R, D)                                          \
+  (ROUND_POWER_OF_TWO(((int64_t)(R)) * (RM), AV1_PROB_COST_SHIFT) + \
+   ((D) << (DM)))
+
+#define RDCOST_DBL(RM, DM, R, D) \
+ (((((double)(R)) * (RM)) / (double)(1 << AV1_PROB_COST_SHIFT)) + \
+ ((double)(D) * (1 << (DM))))
+
+#define QIDX_SKIP_THRESH 115
+
+#define MV_COST_WEIGHT 108
+#define MV_COST_WEIGHT_SUB 120
+
+#define INVALID_MV 0x80008000
+
+#if CONFIG_EXT_REFS
+#define MAX_REFS 15
+#else
+#define MAX_REFS 6
+#endif // CONFIG_EXT_REFS
+
+#define RD_THRESH_MAX_FACT 64
+#define RD_THRESH_INC 1
+
+// This enumerator type needs to be kept aligned with the mode order in
+// const MODE_DEFINITION av1_mode_order[MAX_MODES] used in the rd code.
+//
+// Enumerators are numbered from 0 in declaration order, so the trailing
+// MAX_MODES equals the number of modes in the current build configuration; it
+// sizes per-mode arrays such as RD_OPT.thresh_mult. Which enumerators exist
+// depends on CONFIG_EXT_REFS, CONFIG_EXT_INTER and CONFIG_ALT_INTRA.
+typedef enum {
+ // NEAREST modes, one per reference.
+ THR_NEARESTMV,
+#if CONFIG_EXT_REFS
+ THR_NEARESTL2,
+ THR_NEARESTL3,
+ THR_NEARESTB,
+#endif // CONFIG_EXT_REFS
+ THR_NEARESTA,
+ THR_NEARESTG,
+
+ THR_DC,
+
+ // NEW modes, one per reference.
+ THR_NEWMV,
+#if CONFIG_EXT_REFS
+ THR_NEWL2,
+ THR_NEWL3,
+ THR_NEWB,
+#endif // CONFIG_EXT_REFS
+ THR_NEWA,
+ THR_NEWG,
+
+ // NEAR modes, one per reference.
+ THR_NEARMV,
+#if CONFIG_EXT_REFS
+ THR_NEARL2,
+ THR_NEARL3,
+ THR_NEARB,
+#endif // CONFIG_EXT_REFS
+ THR_NEARA,
+ THR_NEARG,
+
+ // ZERO modes, one per reference. Note that G precedes A here, unlike the
+ // groups above.
+ THR_ZEROMV,
+#if CONFIG_EXT_REFS
+ THR_ZEROL2,
+ THR_ZEROL3,
+ THR_ZEROB,
+#endif // CONFIG_EXT_REFS
+ THR_ZEROG,
+ THR_ZEROA,
+
+ // Compound NEAREST modes, one per reference pair. The enumerator names
+ // differ between the CONFIG_EXT_INTER and non-EXT_INTER builds.
+#if CONFIG_EXT_INTER
+
+ THR_COMP_NEAREST_NEARESTLA,
+#if CONFIG_EXT_REFS
+ THR_COMP_NEAREST_NEARESTL2A,
+ THR_COMP_NEAREST_NEARESTL3A,
+#endif // CONFIG_EXT_REFS
+ THR_COMP_NEAREST_NEARESTGA,
+#if CONFIG_EXT_REFS
+ THR_COMP_NEAREST_NEARESTLB,
+ THR_COMP_NEAREST_NEARESTL2B,
+ THR_COMP_NEAREST_NEARESTL3B,
+ THR_COMP_NEAREST_NEARESTGB,
+#endif // CONFIG_EXT_REFS
+
+#else // CONFIG_EXT_INTER
+
+ THR_COMP_NEARESTLA,
+#if CONFIG_EXT_REFS
+ THR_COMP_NEARESTL2A,
+ THR_COMP_NEARESTL3A,
+#endif // CONFIG_EXT_REFS
+ THR_COMP_NEARESTGA,
+#if CONFIG_EXT_REFS
+ THR_COMP_NEARESTLB,
+ THR_COMP_NEARESTL2B,
+ THR_COMP_NEARESTL3B,
+ THR_COMP_NEARESTGB,
+#endif // CONFIG_EXT_REFS
+
+#endif // CONFIG_EXT_INTER
+
+ THR_TM,
+
+#if CONFIG_ALT_INTRA
+ THR_SMOOTH,
+#endif // CONFIG_ALT_INTRA
+
+ // Remaining compound modes. With CONFIG_EXT_INTER each reference pair has
+ // nine enumerators; otherwise there are NEAR/NEW pairs followed by the ZERO
+ // modes.
+#if CONFIG_EXT_INTER
+
+ THR_COMP_NEAR_NEARESTLA,
+ THR_COMP_NEAREST_NEARLA,
+ THR_COMP_NEAR_NEARLA,
+ THR_COMP_NEW_NEARESTLA,
+ THR_COMP_NEAREST_NEWLA,
+ THR_COMP_NEW_NEARLA,
+ THR_COMP_NEAR_NEWLA,
+ THR_COMP_NEW_NEWLA,
+ THR_COMP_ZERO_ZEROLA,
+
+#if CONFIG_EXT_REFS
+ THR_COMP_NEAR_NEARESTL2A,
+ THR_COMP_NEAREST_NEARL2A,
+ THR_COMP_NEAR_NEARL2A,
+ THR_COMP_NEW_NEARESTL2A,
+ THR_COMP_NEAREST_NEWL2A,
+ THR_COMP_NEW_NEARL2A,
+ THR_COMP_NEAR_NEWL2A,
+ THR_COMP_NEW_NEWL2A,
+ THR_COMP_ZERO_ZEROL2A,
+
+ THR_COMP_NEAR_NEARESTL3A,
+ THR_COMP_NEAREST_NEARL3A,
+ THR_COMP_NEAR_NEARL3A,
+ THR_COMP_NEW_NEARESTL3A,
+ THR_COMP_NEAREST_NEWL3A,
+ THR_COMP_NEW_NEARL3A,
+ THR_COMP_NEAR_NEWL3A,
+ THR_COMP_NEW_NEWL3A,
+ THR_COMP_ZERO_ZEROL3A,
+#endif // CONFIG_EXT_REFS
+
+ THR_COMP_NEAR_NEARESTGA,
+ THR_COMP_NEAREST_NEARGA,
+ THR_COMP_NEAR_NEARGA,
+ THR_COMP_NEW_NEARESTGA,
+ THR_COMP_NEAREST_NEWGA,
+ THR_COMP_NEW_NEARGA,
+ THR_COMP_NEAR_NEWGA,
+ THR_COMP_NEW_NEWGA,
+ THR_COMP_ZERO_ZEROGA,
+
+#if CONFIG_EXT_REFS
+ THR_COMP_NEAR_NEARESTLB,
+ THR_COMP_NEAREST_NEARLB,
+ THR_COMP_NEAR_NEARLB,
+ THR_COMP_NEW_NEARESTLB,
+ THR_COMP_NEAREST_NEWLB,
+ THR_COMP_NEW_NEARLB,
+ THR_COMP_NEAR_NEWLB,
+ THR_COMP_NEW_NEWLB,
+ THR_COMP_ZERO_ZEROLB,
+
+ THR_COMP_NEAR_NEARESTL2B,
+ THR_COMP_NEAREST_NEARL2B,
+ THR_COMP_NEAR_NEARL2B,
+ THR_COMP_NEW_NEARESTL2B,
+ THR_COMP_NEAREST_NEWL2B,
+ THR_COMP_NEW_NEARL2B,
+ THR_COMP_NEAR_NEWL2B,
+ THR_COMP_NEW_NEWL2B,
+ THR_COMP_ZERO_ZEROL2B,
+
+ THR_COMP_NEAR_NEARESTL3B,
+ THR_COMP_NEAREST_NEARL3B,
+ THR_COMP_NEAR_NEARL3B,
+ THR_COMP_NEW_NEARESTL3B,
+ THR_COMP_NEAREST_NEWL3B,
+ THR_COMP_NEW_NEARL3B,
+ THR_COMP_NEAR_NEWL3B,
+ THR_COMP_NEW_NEWL3B,
+ THR_COMP_ZERO_ZEROL3B,
+
+ THR_COMP_NEAR_NEARESTGB,
+ THR_COMP_NEAREST_NEARGB,
+ THR_COMP_NEAR_NEARGB,
+ THR_COMP_NEW_NEARESTGB,
+ THR_COMP_NEAREST_NEWGB,
+ THR_COMP_NEW_NEARGB,
+ THR_COMP_NEAR_NEWGB,
+ THR_COMP_NEW_NEWGB,
+ THR_COMP_ZERO_ZEROGB,
+#endif // CONFIG_EXT_REFS
+
+#else // CONFIG_EXT_INTER
+
+ THR_COMP_NEARLA,
+ THR_COMP_NEWLA,
+#if CONFIG_EXT_REFS
+ THR_COMP_NEARL2A,
+ THR_COMP_NEWL2A,
+ THR_COMP_NEARL3A,
+ THR_COMP_NEWL3A,
+#endif // CONFIG_EXT_REFS
+ THR_COMP_NEARGA,
+ THR_COMP_NEWGA,
+
+#if CONFIG_EXT_REFS
+ THR_COMP_NEARLB,
+ THR_COMP_NEWLB,
+ THR_COMP_NEARL2B,
+ THR_COMP_NEWL2B,
+ THR_COMP_NEARL3B,
+ THR_COMP_NEWL3B,
+ THR_COMP_NEARGB,
+ THR_COMP_NEWGB,
+#endif // CONFIG_EXT_REFS
+
+ THR_COMP_ZEROLA,
+#if CONFIG_EXT_REFS
+ THR_COMP_ZEROL2A,
+ THR_COMP_ZEROL3A,
+#endif // CONFIG_EXT_REFS
+ THR_COMP_ZEROGA,
+
+#if CONFIG_EXT_REFS
+ THR_COMP_ZEROLB,
+ THR_COMP_ZEROL2B,
+ THR_COMP_ZEROL3B,
+ THR_COMP_ZEROGB,
+#endif // CONFIG_EXT_REFS
+
+#endif // CONFIG_EXT_INTER
+
+ // Directional intra prediction modes.
+ THR_H_PRED,
+ THR_V_PRED,
+ THR_D135_PRED,
+ THR_D207_PRED,
+ THR_D153_PRED,
+ THR_D63_PRED,
+ THR_D117_PRED,
+ THR_D45_PRED,
+
+ // Inter-intra modes, four per reference; only with CONFIG_EXT_INTER.
+#if CONFIG_EXT_INTER
+ THR_COMP_INTERINTRA_ZEROL,
+ THR_COMP_INTERINTRA_NEARESTL,
+ THR_COMP_INTERINTRA_NEARL,
+ THR_COMP_INTERINTRA_NEWL,
+
+#if CONFIG_EXT_REFS
+ THR_COMP_INTERINTRA_ZEROL2,
+ THR_COMP_INTERINTRA_NEARESTL2,
+ THR_COMP_INTERINTRA_NEARL2,
+ THR_COMP_INTERINTRA_NEWL2,
+
+ THR_COMP_INTERINTRA_ZEROL3,
+ THR_COMP_INTERINTRA_NEARESTL3,
+ THR_COMP_INTERINTRA_NEARL3,
+ THR_COMP_INTERINTRA_NEWL3,
+#endif // CONFIG_EXT_REFS
+
+ THR_COMP_INTERINTRA_ZEROG,
+ THR_COMP_INTERINTRA_NEARESTG,
+ THR_COMP_INTERINTRA_NEARG,
+ THR_COMP_INTERINTRA_NEWG,
+
+#if CONFIG_EXT_REFS
+ THR_COMP_INTERINTRA_ZEROB,
+ THR_COMP_INTERINTRA_NEARESTB,
+ THR_COMP_INTERINTRA_NEARB,
+ THR_COMP_INTERINTRA_NEWB,
+#endif // CONFIG_EXT_REFS
+
+ THR_COMP_INTERINTRA_ZEROA,
+ THR_COMP_INTERINTRA_NEARESTA,
+ THR_COMP_INTERINTRA_NEARA,
+ THR_COMP_INTERINTRA_NEWA,
+#endif // CONFIG_EXT_INTER
+ // Number of enumerators above; must remain last.
+ MAX_MODES
+} THR_MODES;
+
+// Reference / reference-pair categories for the sub-8x8 thresholds. The
+// number of enumerators equals MAX_REFS in both configurations (15 with
+// CONFIG_EXT_REFS, 6 without), which is the length of
+// RD_OPT.thresh_mult_sub8x8.
+typedef enum {
+ // Single references.
+ THR_LAST,
+#if CONFIG_EXT_REFS
+ THR_LAST2,
+ THR_LAST3,
+ THR_BWDR,
+#endif // CONFIG_EXT_REFS
+ THR_GOLD,
+ THR_ALTR,
+
+ // Compound reference pairs.
+ THR_COMP_LA,
+#if CONFIG_EXT_REFS
+ THR_COMP_L2A,
+ THR_COMP_L3A,
+#endif // CONFIG_EXT_REFS
+ THR_COMP_GA,
+
+#if CONFIG_EXT_REFS
+ THR_COMP_LB,
+ THR_COMP_L2B,
+ THR_COMP_L3B,
+ THR_COMP_GB,
+#endif // CONFIG_EXT_REFS
+
+ THR_INTRA,
+} THR_MODES_SUB8X8;
+
+// Rate-distortion optimisation state kept per encoder instance.
+typedef struct RD_OPT {
+ // Thresh_mult is used to set a threshold for the rd score. A higher value
+ // means that we will accept the best mode so far more often. This number
+ // is used in combination with the current block size, and thresh_freq_fact
+ // to pick a threshold.
+ // Indexed by THR_MODES.
+ int thresh_mult[MAX_MODES];
+ // Multipliers for sub-8x8 blocks; MAX_REFS entries, presumably indexed by
+ // THR_MODES_SUB8X8 - confirm against the users of this array.
+ int thresh_mult_sub8x8[MAX_REFS];
+
+ // One threshold per (segment, block size, mode) combination.
+ int threshes[MAX_SEGMENTS][BLOCK_SIZES][MAX_MODES];
+
+ // One value per (reference frame, reference mode) combination.
+ int64_t prediction_type_threshes[TOTAL_REFS_PER_FRAME][REFERENCE_MODES];
+
+ // NOTE(review): presumably the rate multiplier and distortion shift passed
+ // to RDCOST as RM and DM - confirm where these are set and consumed.
+ int RDMULT;
+ int RDDIV;
+} RD_OPT;
+
+// Resets |rd_stats| to an empty accumulator: rate, dist, rdcost and sse are
+// zeroed and skip is set to 1. With CONFIG_RD_DEBUG the per-plane coefficient
+// costs (and, with CONFIG_VAR_TX, the cost map) are zeroed as well.
+static INLINE void av1_init_rd_stats(RD_STATS *rd_stats) {
+  rd_stats->skip = 1;
+  rd_stats->sse = 0;
+  rd_stats->rdcost = 0;
+  rd_stats->dist = 0;
+  rd_stats->rate = 0;
+#if CONFIG_RD_DEBUG
+  {
+    int plane;
+    for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+#if CONFIG_VAR_TX
+      int row, col;
+      for (row = 0; row < TXB_COEFF_COST_MAP_SIZE; ++row) {
+        for (col = 0; col < TXB_COEFF_COST_MAP_SIZE; ++col)
+          rd_stats->txb_coeff_cost_map[plane][row][col] = 0;
+      }
+#endif
+      rd_stats->txb_coeff_cost[plane] = 0;
+    }
+  }
+#endif
+}
+
+// Marks |rd_stats| as invalid: rate is set to INT_MAX, dist, rdcost and sse
+// to INT64_MAX, and skip to 0. With CONFIG_RD_DEBUG the per-plane coefficient
+// costs (and, with CONFIG_VAR_TX, the cost map) are set to INT_MAX as well.
+static INLINE void av1_invalid_rd_stats(RD_STATS *rd_stats) {
+  rd_stats->skip = 0;
+  rd_stats->sse = INT64_MAX;
+  rd_stats->rdcost = INT64_MAX;
+  rd_stats->dist = INT64_MAX;
+  rd_stats->rate = INT_MAX;
+#if CONFIG_RD_DEBUG
+  {
+    int plane;
+    for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+#if CONFIG_VAR_TX
+      int row, col;
+      for (row = 0; row < TXB_COEFF_COST_MAP_SIZE; ++row) {
+        for (col = 0; col < TXB_COEFF_COST_MAP_SIZE; ++col)
+          rd_stats->txb_coeff_cost_map[plane][row][col] = INT_MAX;
+      }
+#endif
+      rd_stats->txb_coeff_cost[plane] = INT_MAX;
+    }
+  }
+#endif
+}
+
+// Accumulates |rd_stats_src| into |rd_stats_dst|: rate, dist and sse are
+// added and skip is AND-ed. rdcost is not touched. With CONFIG_RD_DEBUG the
+// per-plane coefficient costs are added too; with CONFIG_VAR_TX the cost maps
+// are added entry by entry and their sum is asserted to equal the merged
+// per-plane cost.
+static INLINE void av1_merge_rd_stats(RD_STATS *rd_stats_dst,
+                                      const RD_STATS *rd_stats_src) {
+  rd_stats_dst->skip &= rd_stats_src->skip;
+  rd_stats_dst->sse += rd_stats_src->sse;
+  rd_stats_dst->dist += rd_stats_src->dist;
+  rd_stats_dst->rate += rd_stats_src->rate;
+#if CONFIG_RD_DEBUG
+  {
+    int plane;
+    for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+      rd_stats_dst->txb_coeff_cost[plane] +=
+          rd_stats_src->txb_coeff_cost[plane];
+#if CONFIG_VAR_TX
+      {
+        // TODO(angiebird): optimize this part
+        int row, col;
+        int map_total = 0;
+        for (row = 0; row < TXB_COEFF_COST_MAP_SIZE; ++row) {
+          for (col = 0; col < TXB_COEFF_COST_MAP_SIZE; ++col) {
+            rd_stats_dst->txb_coeff_cost_map[plane][row][col] +=
+                rd_stats_src->txb_coeff_cost_map[plane][row][col];
+            map_total += rd_stats_dst->txb_coeff_cost_map[plane][row][col];
+          }
+        }
+        assert(map_total == rd_stats_dst->txb_coeff_cost[plane]);
+      }
+#endif
+    }
+  }
+#endif
+}
+
+struct TileInfo;
+struct TileDataEnc;
+struct AV1_COMP;
+struct macroblock;
+
+int av1_compute_rd_mult(const struct AV1_COMP *cpi, int qindex);
+
+void av1_initialize_rd_consts(struct AV1_COMP *cpi);
+
+void av1_initialize_me_consts(const struct AV1_COMP *cpi, MACROBLOCK *x,
+ int qindex);
+
+void av1_model_rd_from_var_lapndz(int64_t var, unsigned int n,
+ unsigned int qstep, int *rate, int64_t *dist);
+
+int av1_get_switchable_rate(const struct AV1_COMP *cpi, const MACROBLOCKD *xd);
+
+int av1_raster_block_offset(BLOCK_SIZE plane_bsize, int raster_block,
+ int stride);
+
+int16_t *av1_raster_block_offset_int16(BLOCK_SIZE plane_bsize, int raster_block,
+ int16_t *base);
+
+YV12_BUFFER_CONFIG *av1_get_scaled_ref_frame(const struct AV1_COMP *cpi,
+ int ref_frame);
+
+void av1_init_me_luts(void);
+
+#if CONFIG_REF_MV
+void av1_set_mvcost(MACROBLOCK *x, MV_REFERENCE_FRAME ref_frame, int ref,
+ int ref_mv_idx);
+#endif
+
+void av1_get_entropy_contexts(BLOCK_SIZE bsize, TX_SIZE tx_size,
+ const struct macroblockd_plane *pd,
+ ENTROPY_CONTEXT t_above[2 * MAX_MIB_SIZE],
+ ENTROPY_CONTEXT t_left[2 * MAX_MIB_SIZE]);
+
+void av1_set_rd_speed_thresholds(struct AV1_COMP *cpi);
+
+void av1_set_rd_speed_thresholds_sub8x8(struct AV1_COMP *cpi);
+
+void av1_update_rd_thresh_fact(const AV1_COMMON *const cm,
+ int (*fact)[MAX_MODES], int rd_thresh, int bsize,
+ int best_mode_index);
+
+void av1_fill_token_costs(av1_coeff_cost *c,
+ av1_coeff_probs_model (*p)[PLANE_TYPES]);
+
+// Returns 1 when |best_rd| is below (thresh * thresh_fact) >> 5, computed in
+// 64 bits, or when |thresh| is INT_MAX; returns 0 otherwise.
+static INLINE int rd_less_than_thresh(int64_t best_rd, int thresh,
+                                      int thresh_fact) {
+  const int64_t scaled_thresh = ((int64_t)thresh * thresh_fact) >> 5;
+  if (thresh == INT_MAX) return 1;
+  return best_rd < scaled_thresh;
+}
+
+void av1_mv_pred(const struct AV1_COMP *cpi, MACROBLOCK *x,
+ uint8_t *ref_y_buffer, int ref_y_stride, int ref_frame,
+ BLOCK_SIZE block_size);
+
+// Sets x->errorperbit to rdmult >> RD_EPB_SHIFT, bumping a result of 0 up
+// to 1 so the stored value is never zero.
+static INLINE void set_error_per_bit(MACROBLOCK *x, int rdmult) {
+  x->errorperbit = rdmult >> RD_EPB_SHIFT;
+  if (x->errorperbit == 0) x->errorperbit = 1;
+}
+
+void av1_setup_pred_block(const MACROBLOCKD *xd,
+ struct buf_2d dst[MAX_MB_PLANE],
+ const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col,
+ const struct scale_factors *scale,
+ const struct scale_factors *scale_uv);
+
+int av1_get_intra_cost_penalty(int qindex, int qdelta,
+ aom_bit_depth_t bit_depth);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AV1_ENCODER_RD_H_
diff --git a/third_party/aom/av1/encoder/rdopt.c b/third_party/aom/av1/encoder/rdopt.c
new file mode 100644
index 0000000000..a1096f782d
--- /dev/null
+++ b/third_party/aom/av1/encoder/rdopt.c
@@ -0,0 +1,12713 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <math.h>
+
+#include "./aom_dsp_rtcd.h"
+#include "./av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/blend.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+#include "aom_ports/system_state.h"
+
+#include "av1/common/common.h"
+#include "av1/common/common_data.h"
+#include "av1/common/entropy.h"
+#include "av1/common/entropymode.h"
+#include "av1/common/idct.h"
+#include "av1/common/mvref_common.h"
+#include "av1/common/pred_common.h"
+#include "av1/common/quant_common.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/reconintra.h"
+#include "av1/common/scan.h"
+#include "av1/common/seg_common.h"
+#if CONFIG_LV_MAP
+#include "av1/common/txb_common.h"
+#endif
+#if CONFIG_WARPED_MOTION
+#include "av1/common/warped_motion.h"
+#endif // CONFIG_WARPED_MOTION
+
+#include "av1/encoder/aq_variance.h"
+#include "av1/encoder/av1_quantize.h"
+#include "av1/encoder/cost.h"
+#include "av1/encoder/encodemb.h"
+#include "av1/encoder/encodemv.h"
+#include "av1/encoder/encoder.h"
+#if CONFIG_LV_MAP
+#include "av1/encoder/encodetxb.h"
+#endif
+#include "av1/encoder/hybrid_fwd_txfm.h"
+#include "av1/encoder/mcomp.h"
+#if CONFIG_PALETTE
+#include "av1/encoder/palette.h"
+#endif // CONFIG_PALETTE
+#include "av1/encoder/ratectrl.h"
+#include "av1/encoder/rd.h"
+#include "av1/encoder/rdopt.h"
+#include "av1/encoder/tokenize.h"
+#if CONFIG_PVQ
+#include "av1/encoder/pvq_encoder.h"
+#endif // CONFIG_PVQ
+#if CONFIG_PVQ || CONFIG_DAALA_DIST
+#include "av1/common/pvq.h"
+#endif // CONFIG_PVQ || CONFIG_DAALA_DIST
+#if CONFIG_DUAL_FILTER
+// Number of entries in filter_sets[]: one per ordered pair of switchable
+// filters.
+#define DUAL_FILTER_SET_SIZE (SWITCHABLE_FILTERS * SWITCHABLE_FILTERS)
+// Every ordered pair { a, b } with a and b in 0..3, listed with the first
+// element varying slowest.
+// NOTE(review): the 16 initializers presume SWITCHABLE_FILTERS == 4 -- confirm.
+static const int filter_sets[DUAL_FILTER_SET_SIZE][2] = {
+ { 0, 0 }, { 0, 1 }, { 0, 2 }, { 0, 3 }, { 1, 0 }, { 1, 1 },
+ { 1, 2 }, { 1, 3 }, { 2, 0 }, { 2, 1 }, { 2, 2 }, { 2, 3 },
+ { 3, 0 }, { 3, 1 }, { 3, 2 }, { 3, 3 },
+};
+#endif // CONFIG_DUAL_FILTER
+
+// <X>_FRAME_MODE_MASK: one bit per reference frame (bit position is the
+// frame's enum value), with every frame set EXCEPT <X> itself. INTRA_FRAME
+// is included in every mask.
+#if CONFIG_EXT_REFS
+
+#define LAST_FRAME_MODE_MASK \
+ ((1 << INTRA_FRAME) | (1 << LAST2_FRAME) | (1 << LAST3_FRAME) | \
+ (1 << GOLDEN_FRAME) | (1 << BWDREF_FRAME) | (1 << ALTREF_FRAME))
+#define LAST2_FRAME_MODE_MASK \
+ ((1 << INTRA_FRAME) | (1 << LAST_FRAME) | (1 << LAST3_FRAME) | \
+ (1 << GOLDEN_FRAME) | (1 << BWDREF_FRAME) | (1 << ALTREF_FRAME))
+#define LAST3_FRAME_MODE_MASK \
+ ((1 << INTRA_FRAME) | (1 << LAST_FRAME) | (1 << LAST2_FRAME) | \
+ (1 << GOLDEN_FRAME) | (1 << BWDREF_FRAME) | (1 << ALTREF_FRAME))
+#define GOLDEN_FRAME_MODE_MASK \
+ ((1 << INTRA_FRAME) | (1 << LAST_FRAME) | (1 << LAST2_FRAME) | \
+ (1 << LAST3_FRAME) | (1 << BWDREF_FRAME) | (1 << ALTREF_FRAME))
+#define BWDREF_FRAME_MODE_MASK \
+ ((1 << INTRA_FRAME) | (1 << LAST_FRAME) | (1 << LAST2_FRAME) | \
+ (1 << LAST3_FRAME) | (1 << GOLDEN_FRAME) | (1 << ALTREF_FRAME))
+#define ALTREF_FRAME_MODE_MASK \
+ ((1 << INTRA_FRAME) | (1 << LAST_FRAME) | (1 << LAST2_FRAME) | \
+ (1 << LAST3_FRAME) | (1 << GOLDEN_FRAME) | (1 << BWDREF_FRAME))
+
+#else
+
+// Without CONFIG_EXT_REFS only LAST, GOLDEN and ALTREF (plus INTRA) exist.
+#define LAST_FRAME_MODE_MASK \
+ ((1 << GOLDEN_FRAME) | (1 << ALTREF_FRAME) | (1 << INTRA_FRAME))
+#define GOLDEN_FRAME_MODE_MASK \
+ ((1 << LAST_FRAME) | (1 << ALTREF_FRAME) | (1 << INTRA_FRAME))
+#define ALTREF_FRAME_MODE_MASK \
+ ((1 << LAST_FRAME) | (1 << GOLDEN_FRAME) | (1 << INTRA_FRAME))
+
+#endif // CONFIG_EXT_REFS
+
+// Bits for ALTREF_FRAME (and BWDREF_FRAME under CONFIG_EXT_REFS) plus bit 0.
+// NOTE(review): bit 0 is presumably (1 << INTRA_FRAME); the enum value is not
+// visible here -- confirm.
+#if CONFIG_EXT_REFS
+#define SECOND_REF_FRAME_MASK ((1 << ALTREF_FRAME) | (1 << BWDREF_FRAME) | 0x01)
+#else
+#define SECOND_REF_FRAME_MASK ((1 << ALTREF_FRAME) | 0x01)
+#endif // CONFIG_EXT_REFS
+
+#define MIN_EARLY_TERM_INDEX 3
+#define NEW_MV_DISCOUNT_FACTOR 8
+
+#if CONFIG_EXT_INTRA
+#define ANGLE_SKIP_THRESH 10
+#define FILTER_FAST_SEARCH 1
+#endif // CONFIG_EXT_INTRA
+
+// Linear-classifier coefficients consumed by adst_vs_flipadst(): entries 0..2
+// weight the three vertical energy fractions and entry 3 is the additive
+// offset; entries 4..7 are the same for the horizontal direction.
+const double ADST_FLIP_SVM[8] = { -6.6623, -2.8062, -3.2531, 3.1671, // vert
+ -7.7051, -3.2234, -3.6193, 3.4533 }; // horz
+
+// One row of av1_mode_order[]: a prediction mode together with the pair of
+// reference frames it is evaluated against.
+typedef struct {
+ PREDICTION_MODE mode;
+ MV_REFERENCE_FRAME ref_frame[2];
+} MODE_DEFINITION;
+
+// One row of av1_ref_order[]: a pair of reference frames.
+typedef struct { MV_REFERENCE_FRAME ref_frame[2]; } REF_DEFINITION;
+
+// Bundle of encoder state, entropy contexts and running RD figures.
+// NOTE(review): presumably the argument block threaded through the per-block
+// RD cost callbacks; its users are outside this excerpt -- confirm.
+struct rdcost_block_args {
+ const AV1_COMP *cpi;
+ MACROBLOCK *x;
+ // Above/left entropy contexts, 2 * MAX_MIB_SIZE entries each.
+ ENTROPY_CONTEXT t_above[2 * MAX_MIB_SIZE];
+ ENTROPY_CONTEXT t_left[2 * MAX_MIB_SIZE];
+ RD_STATS rd_stats;
+ int64_t this_rd;
+ int64_t best_rd;
+ int exit_early;
+ int use_fast_coef_costing;
+};
+
+#define LAST_NEW_MV_INDEX 6
+// Table of MAX_MODES { prediction mode, { first ref, second ref } } entries.
+// Rows whose second reference is NONE_FRAME name a single frame; rows with
+// ALTREF_FRAME/BWDREF_FRAME as second reference name two inter frames; the
+// trailing CONFIG_EXT_INTER section pairs an inter frame with INTRA_FRAME.
+// The set of rows depends on CONFIG_EXT_REFS, CONFIG_EXT_INTER and
+// CONFIG_ALT_INTRA.
+// NOTE(review): presumably this is the order in which the RD mode search
+// visits candidates; the consumer is outside this excerpt -- confirm.
+static const MODE_DEFINITION av1_mode_order[MAX_MODES] = {
+ { NEARESTMV, { LAST_FRAME, NONE_FRAME } },
+#if CONFIG_EXT_REFS
+ { NEARESTMV, { LAST2_FRAME, NONE_FRAME } },
+ { NEARESTMV, { LAST3_FRAME, NONE_FRAME } },
+ { NEARESTMV, { BWDREF_FRAME, NONE_FRAME } },
+#endif // CONFIG_EXT_REFS
+ { NEARESTMV, { ALTREF_FRAME, NONE_FRAME } },
+ { NEARESTMV, { GOLDEN_FRAME, NONE_FRAME } },
+
+ { DC_PRED, { INTRA_FRAME, NONE_FRAME } },
+
+ { NEWMV, { LAST_FRAME, NONE_FRAME } },
+#if CONFIG_EXT_REFS
+ { NEWMV, { LAST2_FRAME, NONE_FRAME } },
+ { NEWMV, { LAST3_FRAME, NONE_FRAME } },
+ { NEWMV, { BWDREF_FRAME, NONE_FRAME } },
+#endif // CONFIG_EXT_REFS
+ { NEWMV, { ALTREF_FRAME, NONE_FRAME } },
+ { NEWMV, { GOLDEN_FRAME, NONE_FRAME } },
+
+ { NEARMV, { LAST_FRAME, NONE_FRAME } },
+#if CONFIG_EXT_REFS
+ { NEARMV, { LAST2_FRAME, NONE_FRAME } },
+ { NEARMV, { LAST3_FRAME, NONE_FRAME } },
+ { NEARMV, { BWDREF_FRAME, NONE_FRAME } },
+#endif // CONFIG_EXT_REFS
+ { NEARMV, { ALTREF_FRAME, NONE_FRAME } },
+ { NEARMV, { GOLDEN_FRAME, NONE_FRAME } },
+
+ { ZEROMV, { LAST_FRAME, NONE_FRAME } },
+#if CONFIG_EXT_REFS
+ { ZEROMV, { LAST2_FRAME, NONE_FRAME } },
+ { ZEROMV, { LAST3_FRAME, NONE_FRAME } },
+ { ZEROMV, { BWDREF_FRAME, NONE_FRAME } },
+#endif // CONFIG_EXT_REFS
+ { ZEROMV, { GOLDEN_FRAME, NONE_FRAME } },
+ { ZEROMV, { ALTREF_FRAME, NONE_FRAME } },
+
+// TODO(zoeliu): May need to reconsider the order on the modes to check
+
+#if CONFIG_EXT_INTER
+ { NEAREST_NEARESTMV, { LAST_FRAME, ALTREF_FRAME } },
+#if CONFIG_EXT_REFS
+ { NEAREST_NEARESTMV, { LAST2_FRAME, ALTREF_FRAME } },
+ { NEAREST_NEARESTMV, { LAST3_FRAME, ALTREF_FRAME } },
+#endif // CONFIG_EXT_REFS
+ { NEAREST_NEARESTMV, { GOLDEN_FRAME, ALTREF_FRAME } },
+#if CONFIG_EXT_REFS
+ { NEAREST_NEARESTMV, { LAST_FRAME, BWDREF_FRAME } },
+ { NEAREST_NEARESTMV, { LAST2_FRAME, BWDREF_FRAME } },
+ { NEAREST_NEARESTMV, { LAST3_FRAME, BWDREF_FRAME } },
+ { NEAREST_NEARESTMV, { GOLDEN_FRAME, BWDREF_FRAME } },
+#endif // CONFIG_EXT_REFS
+
+#else // CONFIG_EXT_INTER
+
+ { NEARESTMV, { LAST_FRAME, ALTREF_FRAME } },
+#if CONFIG_EXT_REFS
+ { NEARESTMV, { LAST2_FRAME, ALTREF_FRAME } },
+ { NEARESTMV, { LAST3_FRAME, ALTREF_FRAME } },
+#endif // CONFIG_EXT_REFS
+ { NEARESTMV, { GOLDEN_FRAME, ALTREF_FRAME } },
+#if CONFIG_EXT_REFS
+ { NEARESTMV, { LAST_FRAME, BWDREF_FRAME } },
+ { NEARESTMV, { LAST2_FRAME, BWDREF_FRAME } },
+ { NEARESTMV, { LAST3_FRAME, BWDREF_FRAME } },
+ { NEARESTMV, { GOLDEN_FRAME, BWDREF_FRAME } },
+#endif // CONFIG_EXT_REFS
+#endif // CONFIG_EXT_INTER
+
+ { TM_PRED, { INTRA_FRAME, NONE_FRAME } },
+
+#if CONFIG_ALT_INTRA
+ { SMOOTH_PRED, { INTRA_FRAME, NONE_FRAME } },
+#endif // CONFIG_ALT_INTRA
+
+#if CONFIG_EXT_INTER
+ { NEAR_NEARESTMV, { LAST_FRAME, ALTREF_FRAME } },
+ { NEAREST_NEARMV, { LAST_FRAME, ALTREF_FRAME } },
+ { NEAR_NEARMV, { LAST_FRAME, ALTREF_FRAME } },
+ { NEW_NEARESTMV, { LAST_FRAME, ALTREF_FRAME } },
+ { NEAREST_NEWMV, { LAST_FRAME, ALTREF_FRAME } },
+ { NEW_NEARMV, { LAST_FRAME, ALTREF_FRAME } },
+ { NEAR_NEWMV, { LAST_FRAME, ALTREF_FRAME } },
+ { NEW_NEWMV, { LAST_FRAME, ALTREF_FRAME } },
+ { ZERO_ZEROMV, { LAST_FRAME, ALTREF_FRAME } },
+
+#if CONFIG_EXT_REFS
+ { NEAR_NEARESTMV, { LAST2_FRAME, ALTREF_FRAME } },
+ { NEAREST_NEARMV, { LAST2_FRAME, ALTREF_FRAME } },
+ { NEAR_NEARMV, { LAST2_FRAME, ALTREF_FRAME } },
+ { NEW_NEARESTMV, { LAST2_FRAME, ALTREF_FRAME } },
+ { NEAREST_NEWMV, { LAST2_FRAME, ALTREF_FRAME } },
+ { NEW_NEARMV, { LAST2_FRAME, ALTREF_FRAME } },
+ { NEAR_NEWMV, { LAST2_FRAME, ALTREF_FRAME } },
+ { NEW_NEWMV, { LAST2_FRAME, ALTREF_FRAME } },
+ { ZERO_ZEROMV, { LAST2_FRAME, ALTREF_FRAME } },
+
+ { NEAR_NEARESTMV, { LAST3_FRAME, ALTREF_FRAME } },
+ { NEAREST_NEARMV, { LAST3_FRAME, ALTREF_FRAME } },
+ { NEAR_NEARMV, { LAST3_FRAME, ALTREF_FRAME } },
+ { NEW_NEARESTMV, { LAST3_FRAME, ALTREF_FRAME } },
+ { NEAREST_NEWMV, { LAST3_FRAME, ALTREF_FRAME } },
+ { NEW_NEARMV, { LAST3_FRAME, ALTREF_FRAME } },
+ { NEAR_NEWMV, { LAST3_FRAME, ALTREF_FRAME } },
+ { NEW_NEWMV, { LAST3_FRAME, ALTREF_FRAME } },
+ { ZERO_ZEROMV, { LAST3_FRAME, ALTREF_FRAME } },
+#endif // CONFIG_EXT_REFS
+
+ { NEAR_NEARESTMV, { GOLDEN_FRAME, ALTREF_FRAME } },
+ { NEAREST_NEARMV, { GOLDEN_FRAME, ALTREF_FRAME } },
+ { NEAR_NEARMV, { GOLDEN_FRAME, ALTREF_FRAME } },
+ { NEW_NEARESTMV, { GOLDEN_FRAME, ALTREF_FRAME } },
+ { NEAREST_NEWMV, { GOLDEN_FRAME, ALTREF_FRAME } },
+ { NEW_NEARMV, { GOLDEN_FRAME, ALTREF_FRAME } },
+ { NEAR_NEWMV, { GOLDEN_FRAME, ALTREF_FRAME } },
+ { NEW_NEWMV, { GOLDEN_FRAME, ALTREF_FRAME } },
+ { ZERO_ZEROMV, { GOLDEN_FRAME, ALTREF_FRAME } },
+
+#if CONFIG_EXT_REFS
+ { NEAR_NEARESTMV, { LAST_FRAME, BWDREF_FRAME } },
+ { NEAREST_NEARMV, { LAST_FRAME, BWDREF_FRAME } },
+ { NEAR_NEARMV, { LAST_FRAME, BWDREF_FRAME } },
+ { NEW_NEARESTMV, { LAST_FRAME, BWDREF_FRAME } },
+ { NEAREST_NEWMV, { LAST_FRAME, BWDREF_FRAME } },
+ { NEW_NEARMV, { LAST_FRAME, BWDREF_FRAME } },
+ { NEAR_NEWMV, { LAST_FRAME, BWDREF_FRAME } },
+ { NEW_NEWMV, { LAST_FRAME, BWDREF_FRAME } },
+ { ZERO_ZEROMV, { LAST_FRAME, BWDREF_FRAME } },
+
+ { NEAR_NEARESTMV, { LAST2_FRAME, BWDREF_FRAME } },
+ { NEAREST_NEARMV, { LAST2_FRAME, BWDREF_FRAME } },
+ { NEAR_NEARMV, { LAST2_FRAME, BWDREF_FRAME } },
+ { NEW_NEARESTMV, { LAST2_FRAME, BWDREF_FRAME } },
+ { NEAREST_NEWMV, { LAST2_FRAME, BWDREF_FRAME } },
+ { NEW_NEARMV, { LAST2_FRAME, BWDREF_FRAME } },
+ { NEAR_NEWMV, { LAST2_FRAME, BWDREF_FRAME } },
+ { NEW_NEWMV, { LAST2_FRAME, BWDREF_FRAME } },
+ { ZERO_ZEROMV, { LAST2_FRAME, BWDREF_FRAME } },
+
+ { NEAR_NEARESTMV, { LAST3_FRAME, BWDREF_FRAME } },
+ { NEAREST_NEARMV, { LAST3_FRAME, BWDREF_FRAME } },
+ { NEAR_NEARMV, { LAST3_FRAME, BWDREF_FRAME } },
+ { NEW_NEARESTMV, { LAST3_FRAME, BWDREF_FRAME } },
+ { NEAREST_NEWMV, { LAST3_FRAME, BWDREF_FRAME } },
+ { NEW_NEARMV, { LAST3_FRAME, BWDREF_FRAME } },
+ { NEAR_NEWMV, { LAST3_FRAME, BWDREF_FRAME } },
+ { NEW_NEWMV, { LAST3_FRAME, BWDREF_FRAME } },
+ { ZERO_ZEROMV, { LAST3_FRAME, BWDREF_FRAME } },
+
+ { NEAR_NEARESTMV, { GOLDEN_FRAME, BWDREF_FRAME } },
+ { NEAREST_NEARMV, { GOLDEN_FRAME, BWDREF_FRAME } },
+ { NEAR_NEARMV, { GOLDEN_FRAME, BWDREF_FRAME } },
+ { NEW_NEARESTMV, { GOLDEN_FRAME, BWDREF_FRAME } },
+ { NEAREST_NEWMV, { GOLDEN_FRAME, BWDREF_FRAME } },
+ { NEW_NEARMV, { GOLDEN_FRAME, BWDREF_FRAME } },
+ { NEAR_NEWMV, { GOLDEN_FRAME, BWDREF_FRAME } },
+ { NEW_NEWMV, { GOLDEN_FRAME, BWDREF_FRAME } },
+ { ZERO_ZEROMV, { GOLDEN_FRAME, BWDREF_FRAME } },
+#endif // CONFIG_EXT_REFS
+
+#else // CONFIG_EXT_INTER
+
+ { NEARMV, { LAST_FRAME, ALTREF_FRAME } },
+ { NEWMV, { LAST_FRAME, ALTREF_FRAME } },
+#if CONFIG_EXT_REFS
+ { NEARMV, { LAST2_FRAME, ALTREF_FRAME } },
+ { NEWMV, { LAST2_FRAME, ALTREF_FRAME } },
+ { NEARMV, { LAST3_FRAME, ALTREF_FRAME } },
+ { NEWMV, { LAST3_FRAME, ALTREF_FRAME } },
+#endif // CONFIG_EXT_REFS
+ { NEARMV, { GOLDEN_FRAME, ALTREF_FRAME } },
+ { NEWMV, { GOLDEN_FRAME, ALTREF_FRAME } },
+
+#if CONFIG_EXT_REFS
+ { NEARMV, { LAST_FRAME, BWDREF_FRAME } },
+ { NEWMV, { LAST_FRAME, BWDREF_FRAME } },
+ { NEARMV, { LAST2_FRAME, BWDREF_FRAME } },
+ { NEWMV, { LAST2_FRAME, BWDREF_FRAME } },
+ { NEARMV, { LAST3_FRAME, BWDREF_FRAME } },
+ { NEWMV, { LAST3_FRAME, BWDREF_FRAME } },
+ { NEARMV, { GOLDEN_FRAME, BWDREF_FRAME } },
+ { NEWMV, { GOLDEN_FRAME, BWDREF_FRAME } },
+#endif // CONFIG_EXT_REFS
+
+ { ZEROMV, { LAST_FRAME, ALTREF_FRAME } },
+#if CONFIG_EXT_REFS
+ { ZEROMV, { LAST2_FRAME, ALTREF_FRAME } },
+ { ZEROMV, { LAST3_FRAME, ALTREF_FRAME } },
+#endif // CONFIG_EXT_REFS
+ { ZEROMV, { GOLDEN_FRAME, ALTREF_FRAME } },
+
+#if CONFIG_EXT_REFS
+ { ZEROMV, { LAST_FRAME, BWDREF_FRAME } },
+ { ZEROMV, { LAST2_FRAME, BWDREF_FRAME } },
+ { ZEROMV, { LAST3_FRAME, BWDREF_FRAME } },
+ { ZEROMV, { GOLDEN_FRAME, BWDREF_FRAME } },
+#endif // CONFIG_EXT_REFS
+
+#endif // CONFIG_EXT_INTER
+
+ { H_PRED, { INTRA_FRAME, NONE_FRAME } },
+ { V_PRED, { INTRA_FRAME, NONE_FRAME } },
+ { D135_PRED, { INTRA_FRAME, NONE_FRAME } },
+ { D207_PRED, { INTRA_FRAME, NONE_FRAME } },
+ { D153_PRED, { INTRA_FRAME, NONE_FRAME } },
+ { D63_PRED, { INTRA_FRAME, NONE_FRAME } },
+ { D117_PRED, { INTRA_FRAME, NONE_FRAME } },
+ { D45_PRED, { INTRA_FRAME, NONE_FRAME } },
+
+#if CONFIG_EXT_INTER
+ { ZEROMV, { LAST_FRAME, INTRA_FRAME } },
+ { NEARESTMV, { LAST_FRAME, INTRA_FRAME } },
+ { NEARMV, { LAST_FRAME, INTRA_FRAME } },
+ { NEWMV, { LAST_FRAME, INTRA_FRAME } },
+
+#if CONFIG_EXT_REFS
+ { ZEROMV, { LAST2_FRAME, INTRA_FRAME } },
+ { NEARESTMV, { LAST2_FRAME, INTRA_FRAME } },
+ { NEARMV, { LAST2_FRAME, INTRA_FRAME } },
+ { NEWMV, { LAST2_FRAME, INTRA_FRAME } },
+
+ { ZEROMV, { LAST3_FRAME, INTRA_FRAME } },
+ { NEARESTMV, { LAST3_FRAME, INTRA_FRAME } },
+ { NEARMV, { LAST3_FRAME, INTRA_FRAME } },
+ { NEWMV, { LAST3_FRAME, INTRA_FRAME } },
+#endif // CONFIG_EXT_REFS
+
+ { ZEROMV, { GOLDEN_FRAME, INTRA_FRAME } },
+ { NEARESTMV, { GOLDEN_FRAME, INTRA_FRAME } },
+ { NEARMV, { GOLDEN_FRAME, INTRA_FRAME } },
+ { NEWMV, { GOLDEN_FRAME, INTRA_FRAME } },
+
+#if CONFIG_EXT_REFS
+ { ZEROMV, { BWDREF_FRAME, INTRA_FRAME } },
+ { NEARESTMV, { BWDREF_FRAME, INTRA_FRAME } },
+ { NEARMV, { BWDREF_FRAME, INTRA_FRAME } },
+ { NEWMV, { BWDREF_FRAME, INTRA_FRAME } },
+#endif // CONFIG_EXT_REFS
+
+ { ZEROMV, { ALTREF_FRAME, INTRA_FRAME } },
+ { NEARESTMV, { ALTREF_FRAME, INTRA_FRAME } },
+ { NEARMV, { ALTREF_FRAME, INTRA_FRAME } },
+ { NEWMV, { ALTREF_FRAME, INTRA_FRAME } },
+#endif // CONFIG_EXT_INTER
+};
+
+// Table of MAX_REFS reference-frame pairs: first the single-frame entries
+// (second reference NONE_FRAME), then pairs with ALTREF_FRAME, then (under
+// CONFIG_EXT_REFS) pairs with BWDREF_FRAME, and finally INTRA_FRAME alone.
+static const REF_DEFINITION av1_ref_order[MAX_REFS] = {
+ { { LAST_FRAME, NONE_FRAME } },
+#if CONFIG_EXT_REFS
+ { { LAST2_FRAME, NONE_FRAME } }, { { LAST3_FRAME, NONE_FRAME } },
+ { { BWDREF_FRAME, NONE_FRAME } },
+#endif // CONFIG_EXT_REFS
+ { { GOLDEN_FRAME, NONE_FRAME } }, { { ALTREF_FRAME, NONE_FRAME } },
+
+ { { LAST_FRAME, ALTREF_FRAME } },
+#if CONFIG_EXT_REFS
+ { { LAST2_FRAME, ALTREF_FRAME } }, { { LAST3_FRAME, ALTREF_FRAME } },
+#endif // CONFIG_EXT_REFS
+ { { GOLDEN_FRAME, ALTREF_FRAME } },
+
+#if CONFIG_EXT_REFS
+ { { LAST_FRAME, BWDREF_FRAME } }, { { LAST2_FRAME, BWDREF_FRAME } },
+ { { LAST3_FRAME, BWDREF_FRAME } }, { { GOLDEN_FRAME, BWDREF_FRAME } },
+#endif // CONFIG_EXT_REFS
+
+ { { INTRA_FRAME, NONE_FRAME } },
+};
+
+#if CONFIG_EXT_INTRA || CONFIG_FILTER_INTRA || CONFIG_PALETTE
+// Cost of coding symbol v out of n: with l = get_unsigned_bits(n), the first
+// (1 << l) - n symbols cost l - 1 bits and the rest cost l bits, each bit
+// priced at av1_cost_bit(128, 0). Returns 0 when l is 0.
+static INLINE int write_uniform_cost(int n, int v) {
+  const int num_bits = get_unsigned_bits(n);
+  if (num_bits == 0) return 0;
+
+  // Symbols below this bound get the shorter, (num_bits - 1)-bit code.
+  const int short_code_count = (1 << num_bits) - n;
+  const int bits_used = (v < short_code_count) ? num_bits - 1 : num_bits;
+  return bits_used * av1_cost_bit(128, 0);
+}
+#endif // CONFIG_EXT_INTRA || CONFIG_FILTER_INTRA || CONFIG_PALETTE
+
+// constants for prune 1 and prune 2 decision boundaries
+// A score above MID + MARGIN prunes one 1-D transform type, a score below
+// MID - MARGIN prunes the other, and anything in between prunes neither.
+// CORR_* are applied to the correlations in dct_vs_idtx(); EDST_* are applied
+// to the classifier projections in adst_vs_flipadst().
+#define FAST_EXT_TX_CORR_MID 0.0
+#define FAST_EXT_TX_EDST_MID 0.1
+#define FAST_EXT_TX_CORR_MARGIN 0.5
+#define FAST_EXT_TX_EDST_MARGIN 0.3
+
+// Maps a 2-D TX_TYPE to the 1-D type tested against the low bits of the
+// prune mask in do_tx_type_search() (the bits adst_vs_flipadst() derives from
+// its "vert" coefficients).
+static const TX_TYPE_1D vtx_tab[TX_TYPES] = {
+ DCT_1D, ADST_1D, DCT_1D, ADST_1D,
+#if CONFIG_EXT_TX
+ FLIPADST_1D, DCT_1D, FLIPADST_1D, ADST_1D, FLIPADST_1D, IDTX_1D,
+ DCT_1D, IDTX_1D, ADST_1D, IDTX_1D, FLIPADST_1D, IDTX_1D,
+#endif // CONFIG_EXT_TX
+};
+
+// Maps a 2-D TX_TYPE to the 1-D type tested against bits 8 and up of the
+// prune mask in do_tx_type_search() (the bits adst_vs_flipadst() derives from
+// its "horz" coefficients).
+static const TX_TYPE_1D htx_tab[TX_TYPES] = {
+ DCT_1D, DCT_1D, ADST_1D, ADST_1D,
+#if CONFIG_EXT_TX
+ DCT_1D, FLIPADST_1D, FLIPADST_1D, FLIPADST_1D, ADST_1D, IDTX_1D,
+ IDTX_1D, DCT_1D, IDTX_1D, ADST_1D, IDTX_1D, FLIPADST_1D,
+#endif // CONFIG_EXT_TX
+};
+
+#if CONFIG_DAALA_DIST
+// Integer variance estimate of the 4x4 window starting at x (row pitch
+// `stride`): (sum of squares - (sum^2 >> 4)) >> 4.
+static int od_compute_var_4x4(od_coeff *x, int stride) {
+  int total = 0;
+  int sq_total = 0;
+  for (int row = 0; row < 4; ++row) {
+    const od_coeff *const line = x + row * stride;
+    for (int col = 0; col < 4; ++col) {
+      const int px = line[col];
+      total += px;
+      sq_total += px * px;
+    }
+  }
+  // TODO(yushin) : Check whether any changes are required for high bit depth.
+  const int mean_sq = (total * total) >> 4;
+  return (sq_total - mean_sq) >> 4;
+}
+
+/* OD_DIST_LP_MID controls the frequency weighting filter used for computing
+ the distortion. For a value X, the filter is [1 X 1]/(X + 2) and
+ is applied both horizontally and vertically. For X=5, the filter is
+ a good approximation for the OD_QM8_Q4_HVS quantization matrix. */
+#define OD_DIST_LP_MID (5)
+#define OD_DIST_LP_NORM (OD_DIST_LP_MID + 2)
+
+// Distortion of one 8x8 block, weighted by an activity term.
+//   qm                   - quantization-matrix id; asserted not to be
+//                          OD_FLAT_QM and otherwise unused here.
+//   use_activity_masking - selects the variance statistic and calibration.
+//   x, y                 - the two 8x8 blocks being compared (row pitch
+//                          `stride`).
+//   e_lp                 - 8x8 block of filtered error values with the same
+//                          pitch (built by od_compute_dist()).
+// Returns activity^2 * (normalized sum of e_lp^2 + variance-difference term).
+static double od_compute_dist_8x8(int qm, int use_activity_masking, od_coeff *x,
+ od_coeff *y, od_coeff *e_lp, int stride) {
+ double sum;
+ int min_var;
+ double mean_var;
+ double var_stat;
+ double activity;
+ double calibration;
+ int i;
+ int j;
+ double vardist;
+
+ vardist = 0;
+ OD_ASSERT(qm != OD_FLAT_QM);
+ (void)qm;
+#if 1
+ min_var = INT_MAX;
+ mean_var = 0;
+ // Nine overlapping 4x4 windows, stepped by 2 in each direction, cover the
+ // 8x8 block.
+ for (i = 0; i < 3; i++) {
+ for (j = 0; j < 3; j++) {
+ int varx;
+ int vary;
+ varx = od_compute_var_4x4(x + 2 * i * stride + 2 * j, stride);
+ vary = od_compute_var_4x4(y + 2 * i * stride + 2 * j, stride);
+ min_var = OD_MINI(min_var, varx);
+ // Accumulates reciprocals; 9 / mean_var below turns this into a
+ // harmonic-style mean of (1 + varx).
+ mean_var += 1. / (1 + varx);
+ /* The cast to (double) is to avoid an overflow before the sqrt.*/
+ vardist += varx - 2 * sqrt(varx * (double)vary) + vary;
+ }
+ }
+ /* We use a different variance statistic depending on whether activity
+ masking is used, since the harmonic mean appeared slightly worse with
+ masking off. The calibration constant just ensures that we preserve the
+ rate compared to activity=1. */
+ if (use_activity_masking) {
+ calibration = 1.95;
+ var_stat = 9. / mean_var;
+ } else {
+ calibration = 1.62;
+ var_stat = min_var;
+ }
+ /* `calibration` is the constant chosen above, 0.25 is a noise floor and
+ 1/6 is the activity masking constant. */
+ activity = calibration * pow(.25 + var_stat, -1. / 6);
+#else
+ activity = 1;
+#endif // 1
+ sum = 0;
+ for (i = 0; i < 8; i++) {
+ for (j = 0; j < 8; j++)
+ sum += e_lp[i * stride + j] * (double)e_lp[i * stride + j];
+ }
+ /* Normalize the filter to unit DC response. */
+ sum *= 1. / (OD_DIST_LP_NORM * OD_DIST_LP_NORM * OD_DIST_LP_NORM *
+ OD_DIST_LP_NORM);
+ return activity * activity * (sum + vardist);
+}
+
+// Note : Inputs x and y are in a pixel domain
+// Distortion between two bsize_w x bsize_h blocks stored contiguously (row
+// pitch bsize_w); both dimensions must be at least 8 (asserted).
+// With qm == OD_FLAT_QM this is the plain sum of squared differences.
+// Otherwise the error is filtered horizontally and then vertically with
+// [1 OD_DIST_LP_MID 1], the block is scored in 8x8 tiles by
+// od_compute_dist_8x8(), and the total is scaled by a qindex-dependent factor.
+static double od_compute_dist(int qm, int activity_masking, od_coeff *x,
+ od_coeff *y, int bsize_w, int bsize_h,
+ int qindex) {
+ int i;
+ double sum;
+ sum = 0;
+
+ assert(bsize_w >= 8 && bsize_h >= 8);
+
+ if (qm == OD_FLAT_QM) {
+ for (i = 0; i < bsize_w * bsize_h; i++) {
+ double tmp;
+ tmp = x[i] - y[i];
+ sum += tmp * tmp;
+ }
+ } else {
+ int j;
+ // NOTE(review): the scratch buffers hold MAX_TX_SQUARE entries; this
+ // assumes bsize_w * bsize_h never exceeds that -- confirm against callers.
+ DECLARE_ALIGNED(16, od_coeff, e[MAX_TX_SQUARE]);
+ DECLARE_ALIGNED(16, od_coeff, tmp[MAX_TX_SQUARE]);
+ DECLARE_ALIGNED(16, od_coeff, e_lp[MAX_TX_SQUARE]);
+ int mid = OD_DIST_LP_MID;
+ // e = x - y, the raw error.
+ for (i = 0; i < bsize_h; i++) {
+ for (j = 0; j < bsize_w; j++) {
+ e[i * bsize_w + j] = x[i * bsize_w + j] - y[i * bsize_w + j];
+ }
+ }
+ // Horizontal pass into tmp. The first and last columns have only one
+ // neighbour, which is counted twice.
+ for (i = 0; i < bsize_h; i++) {
+ tmp[i * bsize_w] = mid * e[i * bsize_w] + 2 * e[i * bsize_w + 1];
+ tmp[i * bsize_w + bsize_w - 1] =
+ mid * e[i * bsize_w + bsize_w - 1] + 2 * e[i * bsize_w + bsize_w - 2];
+ for (j = 1; j < bsize_w - 1; j++) {
+ tmp[i * bsize_w + j] = mid * e[i * bsize_w + j] +
+ e[i * bsize_w + j - 1] + e[i * bsize_w + j + 1];
+ }
+ }
+ // Vertical pass into e_lp: first and last rows (single neighbour counted
+ // twice), then the interior rows.
+ for (j = 0; j < bsize_w; j++) {
+ e_lp[j] = mid * tmp[j] + 2 * tmp[bsize_w + j];
+ e_lp[(bsize_h - 1) * bsize_w + j] =
+ mid * tmp[(bsize_h - 1) * bsize_w + j] +
+ 2 * tmp[(bsize_h - 2) * bsize_w + j];
+ }
+ for (i = 1; i < bsize_h - 1; i++) {
+ for (j = 0; j < bsize_w; j++) {
+ e_lp[i * bsize_w + j] = mid * tmp[i * bsize_w + j] +
+ tmp[(i - 1) * bsize_w + j] +
+ tmp[(i + 1) * bsize_w + j];
+ }
+ }
+ // Score each 8x8 tile; the tiles share the full-block pitch bsize_w.
+ for (i = 0; i < bsize_h; i += 8) {
+ for (j = 0; j < bsize_w; j += 8) {
+ sum += od_compute_dist_8x8(qm, activity_masking, &x[i * bsize_w + j],
+ &y[i * bsize_w + j], &e_lp[i * bsize_w + j],
+ bsize_w);
+ }
+ }
+ /* Scale according to linear regression against SSE, for 8x8 blocks. */
+ if (activity_masking) {
+ // Linear in qindex, plus a quadratic term when qindex < 99.
+ sum *= 2.2 + (1.7 - 2.2) * (qindex - 99) / (210 - 99) +
+ (qindex < 99 ? 2.5 * (qindex - 99) / 99 * (qindex - 99) / 99 : 0);
+ } else {
+ // Piecewise linear in qindex with breakpoints at 43 and 128.
+ sum *= qindex >= 128
+ ? 1.4 + (0.9 - 1.4) * (qindex - 128) / (209 - 128)
+ : qindex <= 43
+ ? 1.5 + (2.0 - 1.5) * (qindex - 43) / (16 - 43)
+ : 1.5 + (1.4 - 1.5) * (qindex - 43) / (128 - 43);
+ }
+ }
+ return sum;
+}
+
+// Copies the bsw x bsh source and reconstruction pixels into contiguous
+// od_coeff buffers (row pitch bsw) and returns od_compute_dist() on them,
+// truncated to int64_t. qm must be OD_HVS_QM (asserted).
+static int64_t av1_daala_dist(const uint8_t *src, int src_stride,
+                              const uint8_t *dst, int dst_stride, int bsw,
+                              int bsh, int qm, int use_activity_masking,
+                              int qindex) {
+  DECLARE_ALIGNED(16, od_coeff, orig[MAX_TX_SQUARE]);
+  DECLARE_ALIGNED(16, od_coeff, rec[MAX_TX_SQUARE]);
+
+  assert(qm == OD_HVS_QM);
+
+  // Both copies are independent, so they share one pass over the block.
+  for (int row = 0; row < bsh; ++row) {
+    for (int col = 0; col < bsw; ++col) {
+      const int packed = row * bsw + col;
+      orig[packed] = src[row * src_stride + col];
+      rec[packed] = dst[row * dst_stride + col];
+    }
+  }
+
+  return (int64_t)od_compute_dist(qm, use_activity_masking, orig, rec, bsw,
+                                  bsh, qindex);
+}
+#endif // CONFIG_DAALA_DIST
+
+// Splits the squared error between src and dst into a 4x4 grid of cells
+// (esq[], row-major) and reports what fraction of the total falls into each
+// of the first three grid columns (hordist[0..2]) and the first three grid
+// rows (verdist[0..2]). When the total error is zero every output is 0.25.
+//
+// Blocks below BLOCK_16X16 are accumulated pixel by pixel; larger blocks use
+// the variance function table, reading the SSE it writes for each cell.
+static void get_energy_distribution_fine(const AV1_COMP *cpi, BLOCK_SIZE bsize,
+                                         const uint8_t *src, int src_stride,
+                                         const uint8_t *dst, int dst_stride,
+                                         double *hordist, double *verdist) {
+  const int bw = block_size_wide[bsize];
+  const int bh = block_size_high[bsize];
+  unsigned int esq[16] = { 0 };
+
+  const int f_index = bsize - BLOCK_16X16;
+  if (f_index < 0) {
+    // (4 * pos) / dim maps a pixel position to its quarter for any dimension.
+    // It equals the former shift (pos >> 1 for 8, pos >> 2 for 16), and also
+    // spreads a 4-pixel dimension over all four cells instead of collapsing
+    // it into the first one.
+#if CONFIG_HIGHBITDEPTH
+    if (cpi->common.use_highbitdepth) {
+      const uint16_t *src16 = CONVERT_TO_SHORTPTR(src);
+      const uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst);
+      for (int i = 0; i < bh; ++i) {
+        for (int j = 0; j < bw; ++j) {
+          const int index = ((4 * j) / bw) + (((4 * i) / bh) << 2);
+          const int diff = src16[j + i * src_stride] - dst16[j + i * dst_stride];
+          esq[index] += diff * diff;
+        }
+      }
+    } else {
+#endif  // CONFIG_HIGHBITDEPTH
+
+      for (int i = 0; i < bh; ++i) {
+        for (int j = 0; j < bw; ++j) {
+          const int index = ((4 * j) / bw) + (((4 * i) / bh) << 2);
+          const int diff = src[j + i * src_stride] - dst[j + i * dst_stride];
+          esq[index] += diff * diff;
+        }
+      }
+#if CONFIG_HIGHBITDEPTH
+    }
+#endif  // CONFIG_HIGHBITDEPTH
+  } else {
+    // One variance call per grid cell; cell (row, col) starts row * bh / 4
+    // lines down and col * bw / 4 pixels across.
+    for (int row = 0; row < 4; ++row) {
+      const uint8_t *const src_row = src + row * (bh / 4) * src_stride;
+      const uint8_t *const dst_row = dst + row * (bh / 4) * dst_stride;
+      for (int col = 0; col < 4; ++col) {
+        const int x_off = col * bw / 4;
+        cpi->fn_ptr[f_index].vf(src_row + x_off, src_stride, dst_row + x_off,
+                                dst_stride, &esq[4 * row + col]);
+      }
+    }
+  }
+
+  // Sums of 32-bit integers are exact in double, so summation order is
+  // irrelevant here.
+  double total = 0;
+  for (int k = 0; k < 16; ++k) total += esq[k];
+
+  if (total > 0) {
+    const double e_recip = 1.0 / total;
+    hordist[0] = ((double)esq[0] + esq[4] + esq[8] + esq[12]) * e_recip;
+    hordist[1] = ((double)esq[1] + esq[5] + esq[9] + esq[13]) * e_recip;
+    hordist[2] = ((double)esq[2] + esq[6] + esq[10] + esq[14]) * e_recip;
+    verdist[0] = ((double)esq[0] + esq[1] + esq[2] + esq[3]) * e_recip;
+    verdist[1] = ((double)esq[4] + esq[5] + esq[6] + esq[7]) * e_recip;
+    verdist[2] = ((double)esq[8] + esq[9] + esq[10] + esq[11]) * e_recip;
+  } else {
+    hordist[0] = verdist[0] = 0.25;
+    hordist[1] = verdist[1] = 0.25;
+    hordist[2] = verdist[2] = 0.25;
+  }
+}
+
+// Decides, per direction, whether ADST_1D or FLIPADST_1D can be pruned.
+// The energy fractions from get_energy_distribution_fine() are projected
+// through ADST_FLIP_SVM. A projection above MID + MARGIN prunes FLIPADST_1D,
+// one below MID - MARGIN prunes ADST_1D, anything else prunes neither.
+// Vertical decisions occupy bit (1 << type), horizontal ones
+// bit (1 << (type + 8)).
+static int adst_vs_flipadst(const AV1_COMP *cpi, BLOCK_SIZE bsize,
+                            const uint8_t *src, int src_stride,
+                            const uint8_t *dst, int dst_stride) {
+  double hdist[3] = { 0, 0, 0 }, vdist[3] = { 0, 0, 0 };
+  get_energy_distribution_fine(cpi, bsize, src, src_stride, dst, dst_stride,
+                               hdist, vdist);
+
+  const double svm_proj_v =
+      vdist[0] * ADST_FLIP_SVM[0] + vdist[1] * ADST_FLIP_SVM[1] +
+      vdist[2] * ADST_FLIP_SVM[2] + ADST_FLIP_SVM[3];
+  const double svm_proj_h =
+      hdist[0] * ADST_FLIP_SVM[4] + hdist[1] * ADST_FLIP_SVM[5] +
+      hdist[2] * ADST_FLIP_SVM[6] + ADST_FLIP_SVM[7];
+
+  // -1 means "keep both types in this direction".
+  const int v_pruned =
+      svm_proj_v > FAST_EXT_TX_EDST_MID + FAST_EXT_TX_EDST_MARGIN
+          ? FLIPADST_1D
+          : svm_proj_v < FAST_EXT_TX_EDST_MID - FAST_EXT_TX_EDST_MARGIN
+                ? ADST_1D
+                : -1;
+  const int h_pruned =
+      svm_proj_h > FAST_EXT_TX_EDST_MID + FAST_EXT_TX_EDST_MARGIN
+          ? FLIPADST_1D
+          : svm_proj_h < FAST_EXT_TX_EDST_MID - FAST_EXT_TX_EDST_MARGIN
+                ? ADST_1D
+                : -1;
+
+  int prune_bitmask = 0;
+  if (v_pruned >= 0) prune_bitmask |= 1 << v_pruned;
+  if (h_pruned >= 0) prune_bitmask |= 1 << (h_pruned + 8);
+  return prune_bitmask;
+}
+
+#if CONFIG_EXT_TX
+// Writes to *hcorr / *vcorr the correlation coefficient between each sample
+// of the w x h residual (row pitch `stride`) and its left / upper neighbour,
+// taken over rows 1..h-1 and columns 1..w-1. Negative coefficients are
+// clamped to 0; a coefficient is left at 1 when either variance involved is
+// not positive. Requires (h - 1) * (w - 1) > 0 (asserted).
+static void get_horver_correlation(const int16_t *diff, int stride, int w,
+                                   int h, double *hcorr, double *vcorr) {
+  const int num = (h - 1) * (w - 1);
+  int64_t x_sum = 0, y_sum = 0, z_sum = 0;
+  int64_t x2_sum = 0, y2_sum = 0, z2_sum = 0;
+  int64_t xy_sum = 0, xz_sum = 0;
+
+  *hcorr = 1;
+  *vcorr = 1;
+
+  assert(num > 0);
+  const double num_r = 1.0 / num;
+
+  for (int row = 1; row < h; ++row) {
+    const int16_t *const cur_row = diff + row * stride;
+    const int16_t *const prev_row = diff + (row - 1) * stride;
+    for (int col = 1; col < w; ++col) {
+      const int16_t x = cur_row[col];       // current sample
+      const int16_t y = cur_row[col - 1];   // left neighbour
+      const int16_t z = prev_row[col];      // upper neighbour
+      x_sum += x;
+      y_sum += y;
+      z_sum += z;
+      x2_sum += x * x;
+      y2_sum += y * y;
+      z2_sum += z * z;
+      xy_sum += x * y;
+      xz_sum += x * z;
+    }
+  }
+
+  // Variances and covariances, each scaled by the sample count.
+  const double x_var_n = x2_sum - (x_sum * x_sum) * num_r;
+  const double y_var_n = y2_sum - (y_sum * y_sum) * num_r;
+  const double z_var_n = z2_sum - (z_sum * z_sum) * num_r;
+  const double xy_var_n = xy_sum - (x_sum * y_sum) * num_r;
+  const double xz_var_n = xz_sum - (x_sum * z_sum) * num_r;
+
+  if (x_var_n > 0 && y_var_n > 0) {
+    const double corr = xy_var_n / sqrt(x_var_n * y_var_n);
+    *hcorr = corr < 0 ? 0 : corr;
+  }
+  if (x_var_n > 0 && z_var_n > 0) {
+    const double corr = xz_var_n / sqrt(x_var_n * z_var_n);
+    *vcorr = corr < 0 ? 0 : corr;
+  }
+}
+
+// Decides, per direction, whether DCT_1D or IDTX_1D can be pruned from the
+// neighbour correlations of the residual. A correlation above MID + MARGIN
+// prunes IDTX_1D, one below MID - MARGIN prunes DCT_1D, anything else prunes
+// neither. Vertical decisions occupy bit (1 << type), horizontal ones
+// bit (1 << (type + 8)).
+int dct_vs_idtx(const int16_t *diff, int stride, int w, int h) {
+  double hcorr, vcorr;
+  get_horver_correlation(diff, stride, w, h, &hcorr, &vcorr);
+
+  int prune_bitmask = 0;
+
+  if (vcorr > FAST_EXT_TX_CORR_MID + FAST_EXT_TX_CORR_MARGIN) {
+    prune_bitmask |= 1 << IDTX_1D;
+  } else if (vcorr < FAST_EXT_TX_CORR_MID - FAST_EXT_TX_CORR_MARGIN) {
+    prune_bitmask |= 1 << DCT_1D;
+  }
+
+  if (hcorr > FAST_EXT_TX_CORR_MID + FAST_EXT_TX_CORR_MARGIN) {
+    prune_bitmask |= 1 << (IDTX_1D + 8);
+  } else if (hcorr < FAST_EXT_TX_CORR_MID - FAST_EXT_TX_CORR_MARGIN) {
+    prune_bitmask |= 1 << (DCT_1D + 8);
+  }
+
+  return prune_bitmask;
+}
+
+// Performance drop: 0.5%, Speed improvement: 24%
+// Builds the luma-plane prune mask from up to two tests: ADST vs FLIPADST
+// (when adst_flipadst is set) on source vs prediction, and DCT vs IDTX (when
+// dct_idtx is set) on the residual, which is recomputed first.
+static int prune_two_for_sby(const AV1_COMP *cpi, BLOCK_SIZE bsize,
+                             MACROBLOCK *x, const MACROBLOCKD *xd,
+                             int adst_flipadst, int dct_idtx) {
+  const struct macroblock_plane *const luma = &x->plane[0];
+  int prune = 0;
+
+  if (adst_flipadst) {
+    const struct macroblockd_plane *const luma_d = &xd->plane[0];
+    prune |= adst_vs_flipadst(cpi, bsize, luma->src.buf, luma->src.stride,
+                              luma_d->dst.buf, luma_d->dst.stride);
+  }
+
+  if (dct_idtx) {
+    // Refresh src_diff before measuring its correlations.
+    av1_subtract_plane(x, bsize, 0);
+    const int bw = 4 << (b_width_log2_lookup[bsize]);
+    const int bh = 4 << (b_height_log2_lookup[bsize]);
+    prune |= dct_vs_idtx(luma->src_diff, bw, bw, bh);
+  }
+
+  return prune;
+}
+#endif // CONFIG_EXT_TX
+
+// Performance drop: 0.3%, Speed improvement: 5%
+// Runs only the ADST-vs-FLIPADST test on the luma plane (source vs current
+// prediction) and returns its prune mask.
+static int prune_one_for_sby(const AV1_COMP *cpi, BLOCK_SIZE bsize,
+                             const MACROBLOCK *x, const MACROBLOCKD *xd) {
+  const struct buf_2d *const src = &x->plane[0].src;
+  const struct buf_2d *const pred = &xd->plane[0].dst;
+  return adst_vs_flipadst(cpi, bsize, src->buf, src->stride, pred->buf,
+                          pred->stride);
+}
+
+// Returns the 1-D transform-type prune mask for the block, according to
+// cpi->sf.tx_type_search.prune_mode.
+//   tx_set - row of ext_tx_used_inter_1D[] to consult (CONFIG_EXT_TX). When
+//            negative, the set-membership shortcuts are skipped and the
+//            full test for the mode is run.
+// A test is skipped when the set does not use both of the 1-D types it would
+// choose between. Asserts on a prune mode that is not handled.
+static int prune_tx_types(const AV1_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x,
+                          const MACROBLOCKD *const xd, int tx_set) {
+#if CONFIG_EXT_TX
+  // Clamp the row index so that no out-of-range pointer is formed for a
+  // negative tx_set; the row is only read below when tx_set >= 0.
+  const int *tx_set_1D = ext_tx_used_inter_1D[tx_set >= 0 ? tx_set : 0];
+#else
+  const int tx_set_1D[TX_TYPES_1D] = { 0 };
+#endif  // CONFIG_EXT_TX
+
+  switch (cpi->sf.tx_type_search.prune_mode) {
+    case NO_PRUNE: return 0;
+    case PRUNE_ONE:
+      if ((tx_set >= 0) && !(tx_set_1D[FLIPADST_1D] & tx_set_1D[ADST_1D]))
+        return 0;
+      return prune_one_for_sby(cpi, bsize, x, xd);
+#if CONFIG_EXT_TX
+    case PRUNE_TWO: {
+      // Each flag is set when the set is known and lacks one of the pair.
+      const int no_adst_choice =
+          (tx_set >= 0) && !(tx_set_1D[FLIPADST_1D] & tx_set_1D[ADST_1D]);
+      const int no_idtx_choice =
+          (tx_set >= 0) && !(tx_set_1D[DCT_1D] & tx_set_1D[IDTX_1D]);
+      if (no_adst_choice && no_idtx_choice) return 0;
+      return prune_two_for_sby(cpi, bsize, x, xd, !no_adst_choice,
+                               !no_idtx_choice);
+    }
+#endif  // CONFIG_EXT_TX
+  }
+  assert(0);
+  return 0;
+}
+
+// Returns 1 when tx_type survives the prune mask, i.e. neither its vertical
+// 1-D type (bit vtx_tab[tx_type]) nor its horizontal 1-D type
+// (bit htx_tab[tx_type] + 8) is flagged in `prune`. Without CONFIG_EXT_TX
+// every type is searched.
+static int do_tx_type_search(TX_TYPE tx_type, int prune) {
+// TODO(sarahparker) implement for non ext tx
+#if CONFIG_EXT_TX
+  const int v_pruned = (prune >> vtx_tab[tx_type]) & 1;
+  const int h_pruned = (prune >> (htx_tab[tx_type] + 8)) & 1;
+  return !v_pruned && !h_pruned;
+#else
+  // temporary to avoid compiler warnings
+  (void)vtx_tab;
+  (void)htx_tab;
+  (void)tx_type;
+  (void)prune;
+  return 1;
+#endif  // CONFIG_EXT_TX
+}
+
+// Estimates rate and distortion for one plane from its SSE.
+// With cpi->sf.simple_model_rd_from_var set, a closed-form approximation
+// based on the scaled pd->dequant[1] is used; otherwise the estimate comes
+// from av1_model_rd_from_var_lapndz(). *dist is left-shifted by 4 on return.
+static void model_rd_from_sse(const AV1_COMP *const cpi,
+                              const MACROBLOCKD *const xd, BLOCK_SIZE bsize,
+                              int plane, int64_t sse, int *rate,
+                              int64_t *dist) {
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+#if CONFIG_HIGHBITDEPTH
+  const int dequant_shift =
+      (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd - 5 : 3;
+#else
+  const int dequant_shift = 3;
+#endif  // CONFIG_HIGHBITDEPTH
+
+  if (!cpi->sf.simple_model_rd_from_var) {
+    av1_model_rd_from_var_lapndz(sse, num_pels_log2_lookup[bsize],
+                                 pd->dequant[1] >> dequant_shift, rate, dist);
+  } else {
+    // Fast approximate the modelling function.
+    const int64_t square_error = sse;
+    const int quantizer = (pd->dequant[1] >> dequant_shift);
+
+    *rate = (quantizer < 120)
+                ? (int)((square_error * (280 - quantizer)) >>
+                        (16 - AV1_PROB_COST_SHIFT))
+                : 0;
+    *dist = (square_error * quantizer) >> 8;
+  }
+
+  *dist <<= 4;
+}
+
+// Accumulates the modelled rate and distortion over planes
+// plane_from..plane_to of the current block.
+// Outputs: *out_rate_sum and *out_dist_sum (totals), *skip_txfm_sb (1 when
+// the total SSE is zero) and *skip_sse_sb (total SSE << 4). Also stores the
+// luma SSE in x->pred_sse[] for the block's first reference frame.
+static void model_rd_for_sb(const AV1_COMP *const cpi, BLOCK_SIZE bsize,
+                            MACROBLOCK *x, MACROBLOCKD *xd, int plane_from,
+                            int plane_to, int *out_rate_sum,
+                            int64_t *out_dist_sum, int *skip_txfm_sb,
+                            int64_t *skip_sse_sb) {
+  // Note our transform coeffs are 8 times an orthogonal transform.
+  // Hence quantizer step is also 8 times. To get effective quantizer
+  // we need to divide by 8 before sending to modeling function.
+  const int ref = xd->mi[0]->mbmi.ref_frame[0];
+  int64_t rate_acc = 0;
+  int64_t dist_acc = 0;
+  int64_t sse_acc = 0;
+
+  x->pred_sse[ref] = 0;
+
+  for (int plane = plane_from; plane <= plane_to; ++plane) {
+    struct macroblock_plane *const p = &x->plane[plane];
+    struct macroblockd_plane *const pd = &xd->plane[plane];
+#if CONFIG_CB4X4 && !CONFIG_CHROMA_2X2
+    const BLOCK_SIZE bs = AOMMAX(BLOCK_4X4, get_plane_block_size(bsize, pd));
+#else
+    const BLOCK_SIZE bs = get_plane_block_size(bsize, pd);
+#endif  // CONFIG_CB4X4 && !CONFIG_CHROMA_2X2
+
+#if CONFIG_CB4X4
+    if (x->skip_chroma_rd && plane) continue;
+#endif  // CONFIG_CB4X4
+
+    // TODO(geza): Write direct sse functions that do not compute
+    // variance as well.
+    unsigned int plane_sse;
+    cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride,
+                       &plane_sse);
+
+    if (plane == 0) x->pred_sse[ref] = plane_sse;
+    sse_acc += plane_sse;
+
+    int plane_rate;
+    int64_t plane_dist;
+    model_rd_from_sse(cpi, xd, bs, plane, plane_sse, &plane_rate, &plane_dist);
+    rate_acc += plane_rate;
+    dist_acc += plane_dist;
+  }
+
+  *skip_txfm_sb = sse_acc == 0;
+  *skip_sse_sb = sse_acc << 4;
+  *out_rate_sum = (int)rate_acc;
+  *out_dist_sum = dist_acc;
+}
+
+// Returns the sum of squared differences between `coeff` and `dqcoeff` over
+// `block_size` entries, and stores the sum of squared `coeff` values in
+// `*ssz`.
+// Every difference and product is formed in 64 bits: multiplying two `int`
+// values before widening overflows (undefined behaviour) once a magnitude
+// exceeds 46340, which av1_highbd_block_error_c() already avoids for its
+// squared-coefficient term. Results are unchanged for inputs that did not
+// overflow before.
+int64_t av1_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff,
+                          intptr_t block_size, int64_t *ssz) {
+  int64_t error = 0, sqcoeff = 0;
+  intptr_t i;
+
+  for (i = 0; i < block_size; i++) {
+    const int64_t diff = (int64_t)coeff[i] - (int64_t)dqcoeff[i];
+    error += diff * diff;
+    sqcoeff += (int64_t)coeff[i] * (int64_t)coeff[i];
+  }
+
+  *ssz = sqcoeff;
+  return error;
+}
+
+// Returns the sum of squared differences between the 16-bit arrays `coeff`
+// and `dqcoeff` over `block_size` entries.
+// The difference of two int16_t values can reach +/-65535, whose square does
+// not fit in a 32-bit `int`, so the product is widened to 64 bits before it
+// is accumulated. Results are unchanged for inputs that did not overflow.
+int64_t av1_block_error_fp_c(const int16_t *coeff, const int16_t *dqcoeff,
+                             int block_size) {
+  int64_t error = 0;
+  int i;
+
+  for (i = 0; i < block_size; i++) {
+    const int64_t diff = (int64_t)coeff[i] - (int64_t)dqcoeff[i];
+    error += diff * diff;
+  }
+
+  return error;
+}
+
+#if CONFIG_HIGHBITDEPTH
+int64_t av1_highbd_block_error_c(const tran_low_t *coeff,
+ const tran_low_t *dqcoeff, intptr_t block_size,
+ int64_t *ssz, int bd) {
+ int i;
+ int64_t error = 0, sqcoeff = 0;
+ int shift = 2 * (bd - 8);
+ int rounding = shift > 0 ? 1 << (shift - 1) : 0;
+
+ for (i = 0; i < block_size; i++) {
+ const int64_t diff = coeff[i] - dqcoeff[i];
+ error += diff * diff;
+ sqcoeff += (int64_t)coeff[i] * (int64_t)coeff[i];
+ }
+ assert(error >= 0 && sqcoeff >= 0);
+ error = (error + rounding) >> shift;
+ sqcoeff = (sqcoeff + rounding) >> shift;
+
+ *ssz = sqcoeff;
+ return error;
+}
+#endif // CONFIG_HIGHBITDEPTH
+
+#if CONFIG_PVQ
+// Without PVQ, av1_block_error_c() return two kind of errors,
+// 1) reconstruction (i.e. decoded) error and
+// 2) Squared sum of transformed residue (i.e. 'coeff')
+// However, if PVQ is enabled, coeff does not keep the transformed residue
+// but instead a transformed original is kept.
+// Hence, new parameter ref vector (i.e. transformed predicted signal)
+// is required to derive the residue signal,
+// i.e. coeff - ref = residue (all transformed).
+
+#if CONFIG_HIGHBITDEPTH
+static int64_t av1_highbd_block_error2_c(const tran_low_t *coeff,
+ const tran_low_t *dqcoeff,
+ const tran_low_t *ref,
+ intptr_t block_size, int64_t *ssz,
+ int bd) {
+ int64_t error;
+ int64_t sqcoeff;
+ int shift = 2 * (bd - 8);
+ int rounding = shift > 0 ? 1 << (shift - 1) : 0;
+ // Use the existing sse codes for calculating distortion of decoded signal:
+ // i.e. (orig - decoded)^2
+ // For high bit depth, throw away ssz until a 32-bit version of
+ // av1_block_error_fp is written.
+ int64_t ssz_trash;
+ error = av1_block_error(coeff, dqcoeff, block_size, &ssz_trash);
+ // prediction residue^2 = (orig - ref)^2
+ sqcoeff = av1_block_error(coeff, ref, block_size, &ssz_trash);
+ error = (error + rounding) >> shift;
+ sqcoeff = (sqcoeff + rounding) >> shift;
+ *ssz = sqcoeff;
+ return error;
+}
+#else
+// TODO(yushin) : Since 4x4 case does not need ssz, better to refactor into
+// a separate function that does not do the extra computations for ssz.
+static int64_t av1_block_error2_c(const tran_low_t *coeff,
+ const tran_low_t *dqcoeff,
+ const tran_low_t *ref, intptr_t block_size,
+ int64_t *ssz) {
+ int64_t error;
+ // Use the existing sse codes for calculating distortion of decoded signal:
+ // i.e. (orig - decoded)^2
+ error = av1_block_error_fp(coeff, dqcoeff, block_size);
+ // prediction residue^2 = (orig - ref)^2
+ *ssz = av1_block_error_fp(coeff, ref, block_size);
+ return error;
+}
+#endif // CONFIG_HIGHBITDEPTH
+#endif // CONFIG_PVQ
+
+#if !CONFIG_PVQ || CONFIG_VAR_TX
+/* The trailing '0' is a terminator which is used inside av1_cost_coeffs() to
+ * decide whether to include cost of a trailing EOB node or not (i.e. we
+ * can skip this if the last coefficient in this transform block, e.g. the
+ * 16th coefficient in a 4x4 block or the 64th coefficient in a 8x8 block,
+ * were non-zero).
+ * NOTE(review): this presumably describes the last entry of each
+ * band_count_table row, which is defined outside this view -- confirm.
+ *
+ * cost_coeffs() returns the cost of coding the quantized coefficients of
+ * transform block `block` in `plane`, using the per-band tables in
+ * x->token_costs.
+ *  - eob == 0: only the cost of signalling an empty block is returned.
+ *  - use_fast_coef_costing != 0: the context of each AC token is derived
+ *    solely from whether the previous token was zero (!prev_t).
+ *  - otherwise: the context comes from get_coef_context() over the
+ *    neighbouring entries of token_cache.
+ * `a` and `l` supply the above/left entropy contexts for the first token.
+ * An EOB token is added when band_left is non-zero after the last
+ * coefficient, or always under CONFIG_NEW_TOKENSET. */
+#if !CONFIG_LV_MAP
+static int cost_coeffs(const AV1_COMMON *const cm, MACROBLOCK *x, int plane,
+ int block, TX_SIZE tx_size, const SCAN_ORDER *scan_order,
+ const ENTROPY_CONTEXT *a, const ENTROPY_CONTEXT *l,
+ int use_fast_coef_costing) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+ const struct macroblock_plane *p = &x->plane[plane];
+ const struct macroblockd_plane *pd = &xd->plane[plane];
+ const PLANE_TYPE type = pd->plane_type;
+ const uint16_t *band_count = &band_count_table[tx_size][1];  // starts at entry [1] of the row
+ const int eob = p->eobs[block];
+ const tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+ const int tx_size_ctx = txsize_sqr_map[tx_size];
+ unsigned int(*token_costs)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] =
+ x->token_costs[tx_size_ctx][type][is_inter_block(mbmi)];  // advanced once per band below
+ uint8_t token_cache[MAX_TX_SQUARE];
+ int pt = combine_entropy_contexts(*a, *l);
+ int c, cost;
+ const int16_t *scan = scan_order->scan;
+ const int16_t *nb = scan_order->neighbors;
+#if CONFIG_NEW_TOKENSET
+ const int ref = is_inter_block(mbmi);
+ aom_prob *blockz_probs =
+ cm->fc->blockzero_probs[txsize_sqr_map[tx_size]][type][ref];
+
+#endif // CONFIG_NEW_TOKENSET
+
+#if CONFIG_HIGHBITDEPTH
+ const int cat6_bits = av1_get_cat6_extrabits_size(tx_size, xd->bd);
+#else
+ const int cat6_bits = av1_get_cat6_extrabits_size(tx_size, 8);
+#endif // CONFIG_HIGHBITDEPTH
+
+#if !CONFIG_VAR_TX && !CONFIG_SUPERTX
+ // Check for consistency of tx_size with mode info
+ assert(tx_size == get_tx_size(plane, xd));
+#endif // !CONFIG_VAR_TX && !CONFIG_SUPERTX
+ (void)cm;  // cm is only read when CONFIG_NEW_TOKENSET is enabled
+
+ if (eob == 0) {
+#if CONFIG_NEW_TOKENSET
+ // single eob token
+ cost = av1_cost_bit(blockz_probs[pt], 0);
+#else
+ cost = token_costs[0][0][pt][EOB_TOKEN];
+#endif // CONFIG_NEW_TOKENSET
+ } else {
+ if (use_fast_coef_costing) {
+ int band_left = *band_count++;
+
+ // dc token
+ int v = qcoeff[0];
+ int16_t prev_t;
+ cost = av1_get_token_cost(v, &prev_t, cat6_bits);
+#if CONFIG_NEW_TOKENSET
+ cost += (*token_costs)[!prev_t][pt][prev_t];
+#else
+ cost += (*token_costs)[0][pt][prev_t];
+#endif
+
+ token_cache[0] = av1_pt_energy_class[prev_t];
+ ++token_costs;
+
+ // ac tokens
+ for (c = 1; c < eob; c++) {
+ const int rc = scan[c];
+ int16_t t;
+
+ v = qcoeff[rc];
+ cost += av1_get_token_cost(v, &t, cat6_bits);
+#if CONFIG_NEW_TOKENSET
+ cost += (*token_costs)[!t][!prev_t][t];
+#else
+ cost += (*token_costs)[!prev_t][!prev_t][t];
+#endif
+ prev_t = t;
+ if (!--band_left) {  // band exhausted: step to the next band's costs
+ band_left = *band_count++;
+ ++token_costs;
+ }
+ }
+
+ // eob token
+ if (band_left || CONFIG_NEW_TOKENSET)
+ cost += (*token_costs)[0][!prev_t][EOB_TOKEN];
+
+ } else { // !use_fast_coef_costing
+ int band_left = *band_count++;
+
+ // dc token
+ int v = qcoeff[0];
+ int16_t tok;
+#if !CONFIG_NEW_TOKENSET
+ unsigned int(*tok_cost_ptr)[COEFF_CONTEXTS][ENTROPY_TOKENS];
+#endif
+ cost = av1_get_token_cost(v, &tok, cat6_bits);
+#if CONFIG_NEW_TOKENSET
+ cost += (*token_costs)[!tok][pt][tok];
+#else
+ cost += (*token_costs)[0][pt][tok];
+#endif
+
+ token_cache[0] = av1_pt_energy_class[tok];
+ ++token_costs;
+
+#if !CONFIG_NEW_TOKENSET
+ tok_cost_ptr = &((*token_costs)[!tok]);
+#endif
+
+ // ac tokens
+ for (c = 1; c < eob; c++) {
+ const int rc = scan[c];
+
+ v = qcoeff[rc];
+ cost += av1_get_token_cost(v, &tok, cat6_bits);
+ pt = get_coef_context(nb, token_cache, c);  // context from neighbours in token_cache
+#if CONFIG_NEW_TOKENSET
+ cost += (*token_costs)[!tok][pt][tok];
+#else
+ cost += (*tok_cost_ptr)[pt][tok];
+#endif
+ token_cache[rc] = av1_pt_energy_class[tok];
+ if (!--band_left) {  // band exhausted: step to the next band's costs
+ band_left = *band_count++;
+ ++token_costs;
+ }
+#if !CONFIG_NEW_TOKENSET
+ tok_cost_ptr = &((*token_costs)[!tok]);
+#endif
+ }
+
+ // eob token
+ if (band_left || CONFIG_NEW_TOKENSET) {
+ pt = get_coef_context(nb, token_cache, c);
+ cost += (*token_costs)[0][pt][EOB_TOKEN];
+ }
+ }
+ }
+
+ return cost;
+}
+#endif // !CONFIG_LV_MAP
+
+// Returns the cost of coding the coefficients of transform block `block` in
+// `plane`.
+// With CONFIG_LV_MAP the cost comes from av1_cost_coeffs_txb() using a
+// TXB_CTX derived from the above/left contexts `a` and `l`; `scan_order` and
+// `use_fast_coef_costing` are then unused. Otherwise the call is forwarded
+// to cost_coeffs().
+int av1_cost_coeffs(const AV1_COMP *const cpi, MACROBLOCK *x, int plane,
+                    int block, TX_SIZE tx_size, const SCAN_ORDER *scan_order,
+                    const ENTROPY_CONTEXT *a, const ENTROPY_CONTEXT *l,
+                    int use_fast_coef_costing) {
+#if CONFIG_LV_MAP
+  (void)scan_order;
+  (void)use_fast_coef_costing;
+  const MACROBLOCKD *xd = &x->e_mbd;
+  const struct macroblockd_plane *pd = &xd->plane[plane];
+  const BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
+#if CONFIG_CB4X4 && CONFIG_CHROMA_2X2
+  const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
+#elif CONFIG_CB4X4
+  const BLOCK_SIZE plane_bsize =
+      AOMMAX(BLOCK_4X4, get_plane_block_size(bsize, pd));
+#else
+  const BLOCK_SIZE plane_bsize =
+      get_plane_block_size(AOMMAX(BLOCK_8X8, bsize), pd);
+#endif  // CONFIG_CB4X4
+  TXB_CTX txb_ctx;
+
+  get_txb_ctx(plane_bsize, tx_size, plane, a, l, &txb_ctx);
+  return av1_cost_coeffs_txb(cpi, x, plane, block, &txb_ctx);
+#else   // CONFIG_LV_MAP
+  const AV1_COMMON *const cm = &cpi->common;
+
+  return cost_coeffs(cm, x, plane, block, tx_size, scan_order, a, l,
+                     use_fast_coef_costing);
+#endif  // CONFIG_LV_MAP
+}
+#endif // !CONFIG_PVQ || CONFIG_VAR_TX
+
+// Get transform block visible dimensions cropped to the MI units.
+// `*width` / `*height` (both optional, may be NULL) receive the full size of
+// `tx_bsize`; `*visible_width` / `*visible_height` receive the part of the
+// transform block at (blk_row, blk_col) that lies inside the plane block
+// after it has been cropped by xd->mb_to_right_edge / xd->mb_to_bottom_edge,
+// clamped to [0, full size].
+static void get_txb_dimensions(const MACROBLOCKD *xd, int plane,
+                               BLOCK_SIZE plane_bsize, int blk_row, int blk_col,
+                               BLOCK_SIZE tx_bsize, int *width, int *height,
+                               int *visible_width, int *visible_height) {
+  assert(tx_bsize <= plane_bsize);
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+  const int full_h = block_size_high[tx_bsize];
+  const int full_w = block_size_wide[tx_bsize];
+  // TODO(aconverse@google.com): Investigate using crop_width/height here rather
+  // than the MI size
+  int plane_rows = block_size_high[plane_bsize];
+  int plane_cols = block_size_wide[plane_bsize];
+  const int unit_log2 = tx_size_wide_log2[0];
+
+  // A negative edge distance means the block sticks out of the frame; shrink
+  // the usable area by that (subsampled) amount.
+  if (xd->mb_to_bottom_edge < 0)
+    plane_rows += xd->mb_to_bottom_edge >> (3 + pd->subsampling_y);
+  if (xd->mb_to_right_edge < 0)
+    plane_cols += xd->mb_to_right_edge >> (3 + pd->subsampling_x);
+
+  if (width) *width = full_w;
+  if (height) *height = full_h;
+  *visible_width = clamp(plane_cols - (blk_col << unit_log2), 0, full_w);
+  *visible_height = clamp(plane_rows - (blk_row << unit_log2), 0, full_h);
+}
+
+// Compute the pixel domain sum square error on all visible 4x4s in the
+// transform block.
+// When the whole transform block is visible the size-specific variance
+// function in cpi->fn_ptr is used; otherwise the odd-size SSE helpers are
+// applied to the visible area only (the high-bitdepth result is rounded and
+// shifted right by (xd->bd - 8) * 2).
+static unsigned pixel_sse(const AV1_COMP *const cpi, const MACROBLOCKD *xd,
+                          int plane, const uint8_t *src, const int src_stride,
+                          const uint8_t *dst, const int dst_stride, int blk_row,
+                          int blk_col, const BLOCK_SIZE plane_bsize,
+                          const BLOCK_SIZE tx_bsize) {
+  int full_rows, full_cols, vis_rows, vis_cols;
+  unsigned full_sse;
+
+  get_txb_dimensions(xd, plane, plane_bsize, blk_row, blk_col, tx_bsize,
+                     &full_cols, &full_rows, &vis_cols, &vis_rows);
+  assert(vis_rows > 0);
+  assert(vis_cols > 0);
+
+  if (full_rows == vis_rows && full_cols == vis_cols) {
+    cpi->fn_ptr[tx_bsize].vf(src, src_stride, dst, dst_stride, &full_sse);
+    return full_sse;
+  }
+
+#if CONFIG_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    const uint64_t hbd_sse = aom_highbd_sse_odd_size(
+        src, src_stride, dst, dst_stride, vis_cols, vis_rows);
+    return (unsigned int)ROUND_POWER_OF_TWO(hbd_sse, (xd->bd - 8) * 2);
+  }
+#endif  // CONFIG_HIGHBITDEPTH
+
+  return aom_sse_odd_size(src, src_stride, dst, dst_stride, vis_cols,
+                          vis_rows);
+}
+
+// Compute the sum of squares of `diff` over all visible 4x4s in the
+// transform block, i.e. over the visible width and height reported by
+// get_txb_dimensions() for the block at (blk_row, blk_col).
+static int64_t sum_squares_visible(const MACROBLOCKD *xd, int plane,
+                                   const int16_t *diff, const int diff_stride,
+                                   int blk_row, int blk_col,
+                                   const BLOCK_SIZE plane_bsize,
+                                   const BLOCK_SIZE tx_bsize) {
+  int vis_rows, vis_cols;
+
+  // The full block dimensions are not needed here, hence the NULLs.
+  get_txb_dimensions(xd, plane, plane_bsize, blk_row, blk_col, tx_bsize, NULL,
+                     NULL, &vis_cols, &vis_rows);
+  return aom_sum_squares_2d_i16(diff, diff_stride, vis_cols, vis_rows);
+}
+/* Computes the distortion (*out_dist) and the prediction error energy
+ * (*out_sse) of one transform block of `plane`.
+ *
+ * Two paths exist:
+ *  - Transform domain (cpi->sf.use_transform_domain_distortion set and
+ *    CONFIG_DAALA_DIST off): both values are derived from the coeff/dqcoeff
+ *    buffers through the block-error functions and scaled down by `shift`.
+ *  - Pixel domain: *out_sse is the energy of the residual in p->src_diff (or
+ *    av1_daala_dist() for luma blocks of at least 8x8 under
+ *    CONFIG_DAALA_DIST), multiplied by 16. When the block has coefficients
+ *    (eob != 0), *out_dist is measured between `src` and either `dst`
+ *    (output_status == OUTPUT_HAS_DECODED_PIXELS) or a local reconstruction
+ *    obtained by copying `dst` into `recon` and running
+ *    av1_inverse_transform_block() on it, again multiplied by 16. Without
+ *    coefficients *out_dist is set to *out_sse.
+ *
+ * blk_row / blk_col are shifted by tx_size_wide_log2[0] to address pixels. */
+void av1_dist_block(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
+ BLOCK_SIZE plane_bsize, int block, int blk_row, int blk_col,
+ TX_SIZE tx_size, int64_t *out_dist, int64_t *out_sse,
+ OUTPUT_STATUS output_status) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const struct macroblock_plane *const p = &x->plane[plane];
+#if CONFIG_DAALA_DIST
+ int qm = OD_HVS_QM;
+ int use_activity_masking = 0;
+#if CONFIG_PVQ
+ use_activity_masking = x->daala_enc.use_activity_masking;
+#endif // CONFIG_PVQ
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+#else // CONFIG_DAALA_DIST
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+#endif // CONFIG_DAALA_DIST
+
+ if (cpi->sf.use_transform_domain_distortion && !CONFIG_DAALA_DIST) {
+ // Transform domain distortion computation is more efficient as it does
+ // not involve an inverse transform, but it is less accurate.
+ const int buffer_length = tx_size_2d[tx_size];
+ int64_t this_sse;
+ int shift = (MAX_TX_SCALE - av1_get_tx_scale(tx_size)) * 2;
+ tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
+ tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+#if CONFIG_PVQ
+ tran_low_t *ref_coeff = BLOCK_OFFSET(pd->pvq_ref_coeff, block);
+
+#if CONFIG_HIGHBITDEPTH
+ const int bd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd : 8;
+ *out_dist = av1_highbd_block_error2_c(coeff, dqcoeff, ref_coeff,
+ buffer_length, &this_sse, bd) >>
+ shift;
+#else
+ *out_dist = av1_block_error2_c(coeff, dqcoeff, ref_coeff, buffer_length,
+ &this_sse) >>
+ shift;
+#endif // CONFIG_HIGHBITDEPTH
+#elif CONFIG_HIGHBITDEPTH
+ const int bd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd : 8;
+ *out_dist =
+ av1_highbd_block_error(coeff, dqcoeff, buffer_length, &this_sse, bd) >>
+ shift;
+#else
+ *out_dist =
+ av1_block_error(coeff, dqcoeff, buffer_length, &this_sse) >> shift;
+#endif // CONFIG_PVQ
+ *out_sse = this_sse >> shift;
+ } else {
+ const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size];
+#if !CONFIG_PVQ || CONFIG_DAALA_DIST
+ const int bsw = block_size_wide[tx_bsize];
+ const int bsh = block_size_high[tx_bsize];
+#endif
+ const int src_stride = x->plane[plane].src.stride;
+ const int dst_stride = xd->plane[plane].dst.stride;
+ // Scale the transform block index to pixel unit.
+ const int src_idx = (blk_row * src_stride + blk_col)
+ << tx_size_wide_log2[0];
+ const int dst_idx = (blk_row * dst_stride + blk_col)
+ << tx_size_wide_log2[0];
+ const uint8_t *src = &x->plane[plane].src.buf[src_idx];
+ const uint8_t *dst = &xd->plane[plane].dst.buf[dst_idx];
+ const tran_low_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+ const uint16_t eob = p->eobs[block];
+
+ assert(cpi != NULL);
+ assert(tx_size_wide_log2[0] == tx_size_high_log2[0]);
+
+#if CONFIG_DAALA_DIST
+ if (plane == 0 && bsw >= 8 && bsh >= 8) {
+ if (output_status == OUTPUT_HAS_DECODED_PIXELS) {
+ const int pred_stride = block_size_wide[plane_bsize];
+ const int pred_idx = (blk_row * pred_stride + blk_col)
+ << tx_size_wide_log2[0];
+ const int16_t *pred = &pd->pred[pred_idx];
+ int i, j;
+ DECLARE_ALIGNED(16, uint8_t, pred8[MAX_TX_SQUARE]);
+
+ for (j = 0; j < bsh; j++)
+ for (i = 0; i < bsw; i++)
+ pred8[j * bsw + i] = pred[j * pred_stride + i];  // narrow the 16-bit pd->pred samples to 8 bits
+ *out_sse = av1_daala_dist(src, src_stride, pred8, bsw, bsw, bsh, qm,
+ use_activity_masking, x->qindex);
+ } else {
+ *out_sse = av1_daala_dist(src, src_stride, dst, dst_stride, bsw, bsh,
+ qm, use_activity_masking, x->qindex);
+ }
+ } else
+#endif // CONFIG_DAALA_DIST
+ {
+ const int diff_stride = block_size_wide[plane_bsize];
+ const int diff_idx = (blk_row * diff_stride + blk_col)
+ << tx_size_wide_log2[0];
+ const int16_t *diff = &p->src_diff[diff_idx];
+ *out_sse = sum_squares_visible(xd, plane, diff, diff_stride, blk_row,
+ blk_col, plane_bsize, tx_bsize);
+#if CONFIG_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+ *out_sse = ROUND_POWER_OF_TWO(*out_sse, (xd->bd - 8) * 2);
+#endif // CONFIG_HIGHBITDEPTH
+ }
+ *out_sse *= 16;
+
+ if (eob) {
+ if (output_status == OUTPUT_HAS_DECODED_PIXELS) {
+#if CONFIG_DAALA_DIST
+ if (plane == 0 && bsw >= 8 && bsh >= 8)
+ *out_dist = av1_daala_dist(src, src_stride, dst, dst_stride, bsw, bsh,
+ qm, use_activity_masking, x->qindex);
+ else
+#endif // CONFIG_DAALA_DIST
+ *out_dist =
+ pixel_sse(cpi, xd, plane, src, src_stride, dst, dst_stride,
+ blk_row, blk_col, plane_bsize, tx_bsize);
+ } else {
+#if CONFIG_HIGHBITDEPTH
+ uint8_t *recon;
+ DECLARE_ALIGNED(16, uint16_t, recon16[MAX_TX_SQUARE]);
+
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+ recon = CONVERT_TO_BYTEPTR(recon16);
+ else
+ recon = (uint8_t *)recon16;  // same storage viewed as 8-bit samples
+#else
+ DECLARE_ALIGNED(16, uint8_t, recon[MAX_TX_SQUARE]);
+#endif // CONFIG_HIGHBITDEPTH
+
+#if !CONFIG_PVQ
+#if CONFIG_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ aom_highbd_convolve_copy(dst, dst_stride, recon, MAX_TX_SIZE, NULL, 0,
+ NULL, 0, bsw, bsh, xd->bd);
+ } else {
+#endif // CONFIG_HIGHBITDEPTH
+ aom_convolve_copy(dst, dst_stride, recon, MAX_TX_SIZE, NULL, 0, NULL,
+ 0, bsw, bsh);
+#if CONFIG_HIGHBITDEPTH
+ }
+#endif // CONFIG_HIGHBITDEPTH
+#else
+ (void)dst;
+#endif // !CONFIG_PVQ
+
+ const PLANE_TYPE plane_type = get_plane_type(plane);
+ TX_TYPE tx_type = get_tx_type(plane_type, xd, block, tx_size);
+
+ av1_inverse_transform_block(xd, dqcoeff, tx_type, tx_size, recon,
+ MAX_TX_SIZE, eob);
+
+#if CONFIG_DAALA_DIST
+ if (plane == 0 && bsw >= 8 && bsh >= 8) {
+ *out_dist = av1_daala_dist(src, src_stride, recon, MAX_TX_SIZE, bsw,
+ bsh, qm, use_activity_masking, x->qindex);
+ } else {
+ if (plane == 0) {
+ // Save decoded pixels for inter block in pd->pred to avoid
+ // block_8x8_rd_txfm_daala_dist() need to produce them
+ // by calling av1_inverse_transform_block() again.
+ const int pred_stride = block_size_wide[plane_bsize];
+ const int pred_idx = (blk_row * pred_stride + blk_col)
+ << tx_size_wide_log2[0];
+ int16_t *pred = &pd->pred[pred_idx];
+ int i, j;
+
+ for (j = 0; j < bsh; j++)
+ for (i = 0; i < bsw; i++)
+ pred[j * pred_stride + i] = recon[j * MAX_TX_SIZE + i];
+ }
+#endif // CONFIG_DAALA_DIST
+ *out_dist =
+ pixel_sse(cpi, xd, plane, src, src_stride, recon, MAX_TX_SIZE,
+ blk_row, blk_col, plane_bsize, tx_bsize);
+#if CONFIG_DAALA_DIST
+ }
+#endif // CONFIG_DAALA_DIST
+ }
+ *out_dist *= 16;
+ } else {
+ *out_dist = *out_sse;  // no coefficients: distortion equals the prediction error
+ }
+ }
+}
+/* Per-transform-block callback: evaluates one transform block and folds its
+ * rate, distortion and SSE into the struct rdcost_block_args passed via `arg`.
+ *
+ * Steps visible here:
+ *  - returns at once when args->exit_early is already set;
+ *  - for intra blocks, predicts the block and subtracts it from the source;
+ *  - without CONFIG_TXK_SEL: forward transform + quantization
+ *    (AV1_XFORM_QUANT_FP), av1_optimize_b() when the block has coefficients
+ *    and is not lossless, distortion via av1_dist_block(), an early exit
+ *    when the distortion-only cost already exceeds args->best_rd, then the
+ *    rate from av1_cost_coeffs() (or x->rate under CONFIG_PVQ);
+ *  - with CONFIG_TXK_SEL: av1_search_txk_type() fills this_rd_stats;
+ *  - finally the smaller of the coded cost (rd1) and the cost computed from
+ *    sse at zero rate (rd2) is added to args->this_rd, and exit_early is set
+ *    if that total exceeds args->best_rd. */
+static void block_rd_txfm(int plane, int block, int blk_row, int blk_col,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg) {
+ struct rdcost_block_args *args = arg;
+ MACROBLOCK *const x = args->x;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+ const AV1_COMP *cpi = args->cpi;
+ ENTROPY_CONTEXT *a = args->t_above + blk_col;
+ ENTROPY_CONTEXT *l = args->t_left + blk_row;
+#if !CONFIG_TXK_SEL
+ const AV1_COMMON *cm = &cpi->common;
+#endif
+ int64_t rd1, rd2, rd;
+ RD_STATS this_rd_stats;
+
+ assert(tx_size == get_tx_size(plane, xd));
+
+ av1_init_rd_stats(&this_rd_stats);
+
+ if (args->exit_early) return;
+
+ if (!is_inter_block(mbmi)) {
+ av1_predict_intra_block_facade(xd, plane, block, blk_col, blk_row, tx_size);
+ av1_subtract_txb(x, plane, plane_bsize, blk_col, blk_row, tx_size);
+ }
+
+#if !CONFIG_TXK_SEL
+ // full forward transform and quantization
+ const int coeff_ctx = combine_entropy_contexts(*a, *l);
+ av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
+ coeff_ctx, AV1_XFORM_QUANT_FP);
+ if (x->plane[plane].eobs[block] && !xd->lossless[mbmi->segment_id])
+ av1_optimize_b(cm, x, plane, block, tx_size, coeff_ctx);
+
+ if (!is_inter_block(mbmi)) {
+ struct macroblock_plane *const p = &x->plane[plane];
+ av1_inverse_transform_block_facade(xd, plane, block, blk_row, blk_col,
+ p->eobs[block]);
+ av1_dist_block(args->cpi, x, plane, plane_bsize, block, blk_row, blk_col,
+ tx_size, &this_rd_stats.dist, &this_rd_stats.sse,
+ OUTPUT_HAS_DECODED_PIXELS);
+ } else {
+ av1_dist_block(args->cpi, x, plane, plane_bsize, block, blk_row, blk_col,
+ tx_size, &this_rd_stats.dist, &this_rd_stats.sse,
+ OUTPUT_HAS_PREDICTED_PIXELS);
+ }
+#if CONFIG_CFL
+ if (plane == AOM_PLANE_Y && x->cfl_store_y) {
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ const int dst_stride = pd->dst.stride;
+ uint8_t *dst =
+ &pd->dst.buf[(blk_row * dst_stride + blk_col) << tx_size_wide_log2[0]];
+ cfl_store(xd->cfl, dst, dst_stride, blk_row, blk_col, tx_size);
+ }
+#endif
+ rd = RDCOST(x->rdmult, x->rddiv, 0, this_rd_stats.dist);  // cost of the distortion alone (rate 0)
+ if (args->this_rd + rd > args->best_rd) {
+ args->exit_early = 1;
+ return;
+ }
+#if !CONFIG_PVQ
+ const PLANE_TYPE plane_type = get_plane_type(plane);
+ const TX_TYPE tx_type = get_tx_type(plane_type, xd, block, tx_size);
+ const SCAN_ORDER *scan_order =
+ get_scan(cm, tx_size, tx_type, is_inter_block(mbmi));
+ this_rd_stats.rate =
+ av1_cost_coeffs(cpi, x, plane, block, tx_size, scan_order, a, l,
+ args->use_fast_coef_costing);
+#else // !CONFIG_PVQ
+ this_rd_stats.rate = x->rate;
+#endif // !CONFIG_PVQ
+#else // !CONFIG_TXK_SEL
+ av1_search_txk_type(cpi, x, plane, block, blk_row, blk_col, plane_bsize,
+ tx_size, a, l, args->use_fast_coef_costing,
+ &this_rd_stats);
+#endif // !CONFIG_TXK_SEL
+
+#if !CONFIG_PVQ
+#if CONFIG_RD_DEBUG
+ av1_update_txb_coeff_cost(&this_rd_stats, plane, tx_size, blk_row, blk_col,
+ this_rd_stats.rate);
+#endif // CONFIG_RD_DEBUG
+ av1_set_txb_context(x, plane, block, tx_size, a, l);
+#endif // !CONFIG_PVQ
+
+ rd1 = RDCOST(x->rdmult, x->rddiv, this_rd_stats.rate, this_rd_stats.dist);  // cost when the block is coded
+ rd2 = RDCOST(x->rdmult, x->rddiv, 0, this_rd_stats.sse);  // cost from sse at zero rate
+
+ // TODO(jingning): temporarily enabled only for luma component
+ rd = AOMMIN(rd1, rd2);
+
+#if CONFIG_DAALA_DIST
+ if (plane == 0 &&
+ (tx_size == TX_4X4 || tx_size == TX_4X8 || tx_size == TX_8X4)) {
+ this_rd_stats.dist = 0;
+ this_rd_stats.sse = 0;
+ rd = 0;
+ x->rate_4x4[block] = this_rd_stats.rate;  // read back by block_8x8_rd_txfm_daala_dist()
+ }
+#endif // CONFIG_DAALA_DIST
+
+#if !CONFIG_PVQ
+ this_rd_stats.skip &= !x->plane[plane].eobs[block];
+#else
+ this_rd_stats.skip &= x->pvq_skip[plane];
+#endif // !CONFIG_PVQ
+ av1_merge_rd_stats(&args->rd_stats, &this_rd_stats);
+
+ args->this_rd += rd;
+
+ if (args->this_rd > args->best_rd) {
+ args->exit_early = 1;
+ return;
+ }
+}
+/* Callback for one 8x8 luma area under CONFIG_DAALA_DIST.
+ *
+ * Computes two av1_daala_dist() values for the 8x8 area at
+ * (blk_row, blk_col): source vs. the samples stored in pd->pred (tmp1) and
+ * source vs. pd->dst (tmp2). For intra blocks tmp1 becomes the sse and tmp2
+ * the distortion; for inter blocks the roles are swapped. Both are
+ * multiplied by 16.
+ * The rate used for the RD cost is the sum of four x->rate_4x4 entries;
+ * only dist and sse (not the rate) are added to args->rd_stats here.
+ * args->exit_early is set as soon as the accumulated cost exceeds
+ * args->best_rd. `tx_size` is unused. */
+#if CONFIG_DAALA_DIST
+static void block_8x8_rd_txfm_daala_dist(int plane, int block, int blk_row,
+ int blk_col, BLOCK_SIZE plane_bsize,
+ TX_SIZE tx_size, void *arg) {
+ struct rdcost_block_args *args = arg;
+ MACROBLOCK *const x = args->x;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+ int64_t rd, rd1, rd2;
+ RD_STATS this_rd_stats;
+ int qm = OD_HVS_QM;
+ int use_activity_masking = 0;
+
+ (void)tx_size;
+#if CONFIG_PVQ
+ use_activity_masking = x->daala_enc.use_activity_masking;
+#endif // CONFIG_PVQ
+ av1_init_rd_stats(&this_rd_stats);
+
+ if (args->exit_early) return;
+
+ {
+ const struct macroblock_plane *const p = &x->plane[plane];
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+
+ const int src_stride = p->src.stride;
+ const int dst_stride = pd->dst.stride;
+ const int diff_stride = block_size_wide[plane_bsize];
+
+ const uint8_t *src =
+ &p->src.buf[(blk_row * src_stride + blk_col) << tx_size_wide_log2[0]];
+ const uint8_t *dst =
+ &pd->dst.buf[(blk_row * dst_stride + blk_col) << tx_size_wide_log2[0]];
+
+ unsigned int tmp1, tmp2;
+ int qindex = x->qindex;
+ const int pred_stride = block_size_wide[plane_bsize];
+ const int pred_idx = (blk_row * pred_stride + blk_col)
+ << tx_size_wide_log2[0];
+ int16_t *pred = &pd->pred[pred_idx];
+ int i, j;
+ const int tx_blk_size = 8;
+
+ DECLARE_ALIGNED(16, uint8_t, pred8[8 * 8]);
+
+ for (j = 0; j < tx_blk_size; j++)
+ for (i = 0; i < tx_blk_size; i++)
+ pred8[j * tx_blk_size + i] = pred[j * diff_stride + i];  // diff_stride and pred_stride hold the same value
+
+ tmp1 = av1_daala_dist(src, src_stride, pred8, tx_blk_size, 8, 8, qm,
+ use_activity_masking, qindex);
+ tmp2 = av1_daala_dist(src, src_stride, dst, dst_stride, 8, 8, qm,
+ use_activity_masking, qindex);
+
+ if (!is_inter_block(mbmi)) {
+ this_rd_stats.sse = (int64_t)tmp1 * 16;
+ this_rd_stats.dist = (int64_t)tmp2 * 16;
+ } else {
+ // For inter mode, the decoded pixels are provided in pd->pred,
+ // while the predicted pixels are in dst.
+ this_rd_stats.sse = (int64_t)tmp2 * 16;
+ this_rd_stats.dist = (int64_t)tmp1 * 16;
+ }
+ }
+
+ rd = RDCOST(x->rdmult, x->rddiv, 0, this_rd_stats.dist);  // cost of the distortion alone (rate 0)
+ if (args->this_rd + rd > args->best_rd) {
+ args->exit_early = 1;
+ return;
+ }
+
+ {
+ const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
+ // The rate of the current 8x8 block is the sum of four 4x4 blocks in it.
+ this_rd_stats.rate = x->rate_4x4[block - max_blocks_wide - 1] +
+ x->rate_4x4[block - max_blocks_wide] +
+ x->rate_4x4[block - 1] + x->rate_4x4[block];
+ }
+ rd1 = RDCOST(x->rdmult, x->rddiv, this_rd_stats.rate, this_rd_stats.dist);
+ rd2 = RDCOST(x->rdmult, x->rddiv, 0, this_rd_stats.sse);
+ rd = AOMMIN(rd1, rd2);
+
+ args->rd_stats.dist += this_rd_stats.dist;
+ args->rd_stats.sse += this_rd_stats.sse;
+
+ args->this_rd += rd;
+
+ if (args->this_rd > args->best_rd) {
+ args->exit_early = 1;
+ return;
+ }
+}
+#endif // CONFIG_DAALA_DIST
+
+// Runs block_rd_txfm() over every transform block of `plane` for a block of
+// size `bsize` coded with `tx_size`, and writes the accumulated statistics
+// to `*rd_stats`. If the walk aborts because the running cost exceeded
+// `ref_best_rd`, `*rd_stats` is marked invalid instead.
+// For plane 0 the chosen `tx_size` is also stored in the block's mode info.
+static void txfm_rd_in_plane(MACROBLOCK *x, const AV1_COMP *cpi,
+                             RD_STATS *rd_stats, int64_t ref_best_rd, int plane,
+                             BLOCK_SIZE bsize, TX_SIZE tx_size,
+                             int use_fast_coef_casting) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+  struct rdcost_block_args args;
+
+  av1_zero(args);
+  args.x = x;
+  args.cpi = cpi;
+  args.best_rd = ref_best_rd;
+  args.use_fast_coef_costing = use_fast_coef_casting;
+  av1_init_rd_stats(&args.rd_stats);
+
+  if (plane == 0) xd->mi[0]->mbmi.tx_size = tx_size;
+
+  av1_get_entropy_contexts(bsize, tx_size, pd, args.t_above, args.t_left);
+
+#if CONFIG_DAALA_DIST
+  // Luma blocks with a 4-sample transform dimension are also visited per 8x8.
+  if (plane == 0 &&
+      (tx_size == TX_4X4 || tx_size == TX_4X8 || tx_size == TX_8X4)) {
+    av1_foreach_8x8_transformed_block_in_plane(
+        xd, bsize, plane, block_rd_txfm, block_8x8_rd_txfm_daala_dist, &args);
+  } else {
+    av1_foreach_transformed_block_in_plane(xd, bsize, plane, block_rd_txfm,
+                                           &args);
+  }
+#else
+  av1_foreach_transformed_block_in_plane(xd, bsize, plane, block_rd_txfm,
+                                         &args);
+#endif  // CONFIG_DAALA_DIST
+
+  if (!args.exit_early) {
+    *rd_stats = args.rd_stats;
+  } else {
+    av1_invalid_rd_stats(rd_stats);
+  }
+}
+
+#if CONFIG_SUPERTX
+// Supertx variant of the per-plane transform RD evaluation: block_rd_txfm()
+// is run once, for block 0 at position (0, 0) of `plane`.
+// On success *rate, *distortion and *sse receive the accumulated statistics
+// and *skippable is 1 when block 0 has no coefficients. If the evaluation
+// aborts early the outputs are set to INT_MAX / INT64_MAX and *skippable
+// to 0.
+void av1_txfm_rd_in_plane_supertx(MACROBLOCK *x, const AV1_COMP *cpi, int *rate,
+                                  int64_t *distortion, int *skippable,
+                                  int64_t *sse, int64_t ref_best_rd, int plane,
+                                  BLOCK_SIZE bsize, TX_SIZE tx_size,
+                                  int use_fast_coef_casting) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+  struct rdcost_block_args args;
+
+  av1_zero(args);
+  args.cpi = cpi;
+  args.x = x;
+  args.best_rd = ref_best_rd;
+  args.use_fast_coef_costing = use_fast_coef_casting;
+
+#if CONFIG_EXT_TX
+  assert(tx_size < TX_SIZES);
+#endif  // CONFIG_EXT_TX
+
+  if (plane == 0) xd->mi[0]->mbmi.tx_size = tx_size;
+
+  av1_get_entropy_contexts(bsize, tx_size, pd, args.t_above, args.t_left);
+
+  block_rd_txfm(plane, 0, 0, 0, get_plane_block_size(bsize, pd), tx_size,
+                &args);
+
+  if (!args.exit_early) {
+    *distortion = args.rd_stats.dist;
+    *rate = args.rd_stats.rate;
+    *sse = args.rd_stats.sse;
+    *skippable = !x->plane[plane].eobs[0];
+    return;
+  }
+
+  // Aborted: report the worst possible statistics.
+  *rate = INT_MAX;
+  *distortion = INT64_MAX;
+  *sse = INT64_MAX;
+  *skippable = 0;
+}
+#endif // CONFIG_SUPERTX
+
+// Returns the cost of signalling `tx_size` for a block of size `bsize`.
+// The cost is 0 unless the frame uses TX_MODE_SELECT and the block is at
+// least BLOCK_8X8; otherwise it is looked up in cpi->tx_size_cost by
+// size category, context and depth.
+static int tx_size_cost(const AV1_COMP *const cpi, const MACROBLOCK *const x,
+                        BLOCK_SIZE bsize, TX_SIZE tx_size) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+
+  // The transform size is not signalled in this case, so it costs nothing.
+  if (cm->tx_mode != TX_MODE_SELECT || mbmi->sb_type < BLOCK_8X8) return 0;
+
+  const int tx_size_cat = is_inter_block(mbmi)
+                              ? inter_tx_size_cat_lookup[bsize]
+                              : intra_tx_size_cat_lookup[bsize];
+  const int depth = tx_size_to_depth(txsize_sqr_up_map[tx_size]);
+  const int tx_size_ctx = get_tx_size_context(xd);
+
+  return cpi->tx_size_cost[tx_size_cat][tx_size_ctx][depth];
+}
+
+// Returns the cost of signalling `tx_type` for a luma transform block; chroma
+// planes (plane > 0) always cost 0.
+// With CONFIG_EXT_TX the cost is taken from the inter/intra tables of the
+// block's ext-tx set when more than one type is available, the segment is
+// not lossless and the set index is positive (intra additionally requires
+// ALLOW_INTRA_EXT_TX). Without CONFIG_EXT_TX it is taken from the plain
+// inter/intra tables for sizes below TX_32X32 when the segment is not
+// lossless and FIXED_TX_TYPE is off. In every other case the cost is 0.
+// #TODO(angiebird): use this function whenever it's possible
+int av1_tx_type_cost(const AV1_COMP *cpi, const MACROBLOCKD *xd,
+                     BLOCK_SIZE bsize, int plane, TX_SIZE tx_size,
+                     TX_TYPE tx_type) {
+  if (plane > 0) return 0;
+
+  const MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+  const int is_inter = is_inter_block(mbmi);
+#if CONFIG_EXT_TX
+  const AV1_COMMON *cm = &cpi->common;
+  if (get_ext_tx_types(tx_size, bsize, is_inter, cm->reduced_tx_set_used) > 1 &&
+      !xd->lossless[mbmi->segment_id]) {
+    const int ext_tx_set =
+        get_ext_tx_set(tx_size, bsize, is_inter, cm->reduced_tx_set_used);
+    if (ext_tx_set > 0) {
+      const int sqr_tx_size = txsize_sqr_map[tx_size];
+      if (is_inter)
+        return cpi->inter_tx_type_costs[ext_tx_set][sqr_tx_size][tx_type];
+      if (ALLOW_INTRA_EXT_TX)
+        return cpi->intra_tx_type_costs[ext_tx_set][sqr_tx_size][mbmi->mode]
+                                       [tx_type];
+    }
+  }
+#else
+  (void)bsize;
+  if (tx_size < TX_32X32 && !xd->lossless[mbmi->segment_id] &&
+      !FIXED_TX_TYPE) {
+    if (is_inter) return cpi->inter_tx_type_costs[tx_size][tx_type];
+    return cpi->intra_tx_type_costs[tx_size]
+                                   [intra_mode_to_tx_type_context[mbmi->mode]]
+                                   [tx_type];
+  }
+#endif  // CONFIG_EXT_TX
+  return 0;
+}
+// Evaluates the luma RD cost of coding block `bs` with the given `tx_type`
+// and `tx_size`, filling `*rd_stats` along the way.
+// The block's mode info is updated with `tx_type` / `tx_size` before the
+// evaluation. Returns INT64_MAX when the per-plane evaluation was abandoned
+// (rd_stats->rate == INT_MAX); otherwise returns the RD cost including the
+// skip-flag cost and, when the transform size is signalled, its cost, which
+// is also added to rd_stats->rate.
+static int64_t txfm_yrd(const AV1_COMP *const cpi, MACROBLOCK *x,
+                        RD_STATS *rd_stats, int64_t ref_best_rd, BLOCK_SIZE bs,
+                        TX_TYPE tx_type, int tx_size) {
+  const AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  const aom_prob skip_prob = av1_get_skip_prob(cm, xd);
+  const int is_inter = is_inter_block(mbmi);
+  const int tx_select =
+      cm->tx_mode == TX_MODE_SELECT && mbmi->sb_type >= BLOCK_8X8;
+  const int r_tx_size = tx_size_cost(cpi, x, bs, tx_size);
+  // Transform-size bits that enter the RD cost (zero when not signalled).
+  const int tx_size_bits = r_tx_size * tx_select;
+  int64_t rd = INT64_MAX;
+
+  assert(skip_prob > 0);
+#if CONFIG_EXT_TX && CONFIG_RECT_TX
+  assert(IMPLIES(is_rect_tx(tx_size), is_rect_tx_allowed_bsize(bs)));
+#endif  // CONFIG_EXT_TX && CONFIG_RECT_TX
+
+  const int no_skip_cost = av1_cost_bit(skip_prob, 0);
+  const int skip_cost = av1_cost_bit(skip_prob, 1);
+
+  mbmi->tx_type = tx_type;
+  mbmi->tx_size = tx_size;
+  txfm_rd_in_plane(x, cpi, rd_stats, ref_best_rd, 0, bs, tx_size,
+                   cpi->sf.use_fast_coef_costing);
+  if (rd_stats->rate == INT_MAX) return INT64_MAX;
+#if !CONFIG_TXK_SEL
+  rd_stats->rate += av1_tx_type_cost(cpi, xd, bs, 0, tx_size, tx_type);
+#endif
+
+  if (!rd_stats->skip) {
+    rd = RDCOST(x->rdmult, x->rddiv,
+                rd_stats->rate + no_skip_cost + tx_size_bits, rd_stats->dist);
+  } else if (is_inter) {
+    rd = RDCOST(x->rdmult, x->rddiv, skip_cost, rd_stats->sse);
+  } else {
+    rd = RDCOST(x->rdmult, x->rddiv, skip_cost + tx_size_bits, rd_stats->sse);
+  }
+
+  if (tx_select) rd_stats->rate += r_tx_size;
+
+  // A coded inter block may still be cheaper when signalled as skipped.
+  if (is_inter && !xd->lossless[xd->mi[0]->mbmi.segment_id] &&
+      !(rd_stats->skip)) {
+    rd = AOMMIN(rd, RDCOST(x->rdmult, x->rddiv, skip_cost, rd_stats->sse));
+  }
+
+  return rd;
+}
+
+// Decides whether the (tx_type, tx_size) combination can be left out of the
+// transform search for block `bs`. Returns 1 to skip it, 0 to evaluate it.
+// The checks cover: non-DCT types for ref_mv_idx > 0 (CONFIG_REF_MV),
+// restriction to the default type (FIXED_TX_TYPE or the per-block
+// use_default_*_tx_type flags), TX_4X4 on blocks whose largest transform is
+// at least TX_32X32, types not in the block's ext-tx set, and types removed
+// by the pruning mask for inter blocks.
+static int skip_txfm_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs,
+                            TX_TYPE tx_type, TX_SIZE tx_size) {
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  const TX_SIZE max_tx_size = max_txsize_lookup[bs];
+  const int is_inter = is_inter_block(mbmi);
+  int prune_mask = 0;
+
+  if (is_inter && cpi->sf.tx_type_search.prune_mode > NO_PRUNE) {
+    // passing -1 in for tx_type indicates that all 1D
+    // transforms should be considered for pruning
+    prune_mask = prune_tx_types(cpi, bs, x, xd, -1);
+  }
+
+#if CONFIG_REF_MV
+  if (mbmi->ref_mv_idx > 0 && tx_type != DCT_DCT) return 1;
+#endif  // CONFIG_REF_MV
+  if (FIXED_TX_TYPE && tx_type != get_default_tx_type(0, xd, 0, tx_size))
+    return 1;
+  if (!is_inter && x->use_default_intra_tx_type &&
+      tx_type != get_default_tx_type(0, xd, 0, tx_size))
+    return 1;
+  if (is_inter && x->use_default_inter_tx_type &&
+      tx_type != get_default_tx_type(0, xd, 0, tx_size))
+    return 1;
+  if (max_tx_size >= TX_32X32 && tx_size == TX_4X4) return 1;
+#if CONFIG_EXT_TX
+  const AV1_COMMON *const cm = &cpi->common;
+  const int ext_tx_set =
+      get_ext_tx_set(tx_size, bs, is_inter, cm->reduced_tx_set_used);
+  if (is_inter) {
+    if (!ext_tx_used_inter[ext_tx_set][tx_type]) return 1;
+    if (cpi->sf.tx_type_search.prune_mode > NO_PRUNE &&
+        !do_tx_type_search(tx_type, prune_mask))
+      return 1;
+  } else {
+    if (!ALLOW_INTRA_EXT_TX && bs >= BLOCK_8X8 &&
+        tx_type != intra_mode_to_tx_type_context[mbmi->mode])
+      return 1;
+    if (!ext_tx_used_intra[ext_tx_set][tx_type]) return 1;
+  }
+#else   // CONFIG_EXT_TX
+  if (tx_size >= TX_32X32 && tx_type != DCT_DCT) return 1;
+  if (is_inter && cpi->sf.tx_type_search.prune_mode > NO_PRUNE &&
+      !do_tx_type_search(tx_type, prune_mask))
+    return 1;
+#endif  // CONFIG_EXT_TX
+  return 0;
+}
+
+#if CONFIG_EXT_INTER
+// Runs txfm_yrd() on block `bs` with DCT_DCT at max_txsize_lookup[bs] and
+// copies the resulting rate, distortion, skip flag and SSE into the
+// caller-provided outputs. Returns the RD cost reported by txfm_yrd().
+static int64_t estimate_yrd_for_sb(const AV1_COMP *const cpi, BLOCK_SIZE bs,
+                                   MACROBLOCK *x, int *r, int64_t *d, int *s,
+                                   int64_t *sse, int64_t ref_best_rd) {
+  RD_STATS stats;
+  const TX_SIZE largest_tx_size = max_txsize_lookup[bs];
+  const int64_t estimated_rd =
+      txfm_yrd(cpi, x, &stats, ref_best_rd, bs, DCT_DCT, largest_tx_size);
+
+  *r = stats.rate;
+  *d = stats.dist;
+  *s = stats.skip;
+  *sse = stats.sse;
+  return estimated_rd;
+}
+#endif  // CONFIG_EXT_INTER
+
+// Picks the luma transform type for block `bs` at the transform size given by
+// tx_size_from_tx_mode(); no transform-size search is performed.
+//
+// Every transform type that survives the speed-feature filters is evaluated
+// with txfm_rd_in_plane(). The stats of the cheapest candidate are written to
+// `rd_stats` and its type is stored in mbmi->tx_type. When only one type is
+// available (or the segment is lossless) DCT_DCT is evaluated directly. If no
+// candidate succeeds in the search loop, `rd_stats` stays as left by
+// av1_invalid_rd_stats().
+//
+// The transform-type signalling cost is added to each candidate's rate before
+// the RD comparison (unless CONFIG_TXK_SEL), matching txfm_yrd(). The value
+// returned by av1_tx_type_cost() used to be discarded here, so the type cost
+// was missing from both the comparison and the reported rate.
+static void choose_largest_tx_size(const AV1_COMP *const cpi, MACROBLOCK *x,
+                                   RD_STATS *rd_stats, int64_t ref_best_rd,
+                                   BLOCK_SIZE bs) {
+  const AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  TX_TYPE tx_type, best_tx_type = DCT_DCT;
+  int64_t this_rd, best_rd = INT64_MAX;
+  aom_prob skip_prob = av1_get_skip_prob(cm, xd);
+  int s0 = av1_cost_bit(skip_prob, 0);  // cost of signalling "not skipped"
+  int s1 = av1_cost_bit(skip_prob, 1);  // cost of signalling "skipped"
+  const int is_inter = is_inter_block(mbmi);
+  int prune = 0;
+#if CONFIG_EXT_TX
+  int ext_tx_set;
+#endif  // CONFIG_EXT_TX
+  av1_invalid_rd_stats(rd_stats);
+
+  mbmi->tx_size = tx_size_from_tx_mode(bs, cm->tx_mode, is_inter);
+#if CONFIG_VAR_TX
+  mbmi->min_tx_size = get_min_tx_size(mbmi->tx_size);
+#endif  // CONFIG_VAR_TX
+#if CONFIG_EXT_TX
+  ext_tx_set =
+      get_ext_tx_set(mbmi->tx_size, bs, is_inter, cm->reduced_tx_set_used);
+#endif  // CONFIG_EXT_TX
+
+  if (is_inter && cpi->sf.tx_type_search.prune_mode > NO_PRUNE) {
+#if CONFIG_EXT_TX
+    prune = prune_tx_types(cpi, bs, x, xd, ext_tx_set);
+#else
+    prune = prune_tx_types(cpi, bs, x, xd, 0);
+#endif  // CONFIG_EXT_TX
+  }
+#if CONFIG_EXT_TX
+  if (get_ext_tx_types(mbmi->tx_size, bs, is_inter, cm->reduced_tx_set_used) >
+          1 &&
+      !xd->lossless[mbmi->segment_id]) {
+#if CONFIG_PVQ
+    od_rollback_buffer pre_buf, post_buf;
+
+    od_encode_checkpoint(&x->daala_enc, &pre_buf);
+    od_encode_checkpoint(&x->daala_enc, &post_buf);
+#endif  // CONFIG_PVQ
+
+    for (tx_type = DCT_DCT; tx_type < TX_TYPES; ++tx_type) {
+      RD_STATS this_rd_stats;
+      if (is_inter) {
+        if (x->use_default_inter_tx_type &&
+            tx_type != get_default_tx_type(0, xd, 0, mbmi->tx_size))
+          continue;
+        if (!ext_tx_used_inter[ext_tx_set][tx_type]) continue;
+        if (cpi->sf.tx_type_search.prune_mode > NO_PRUNE) {
+          if (!do_tx_type_search(tx_type, prune)) continue;
+        }
+      } else {
+        if (x->use_default_intra_tx_type &&
+            tx_type != get_default_tx_type(0, xd, 0, mbmi->tx_size))
+          continue;
+        if (!ALLOW_INTRA_EXT_TX && bs >= BLOCK_8X8) {
+          if (tx_type != intra_mode_to_tx_type_context[mbmi->mode]) continue;
+        }
+        if (!ext_tx_used_intra[ext_tx_set][tx_type]) continue;
+      }
+
+      mbmi->tx_type = tx_type;
+
+      txfm_rd_in_plane(x, cpi, &this_rd_stats, ref_best_rd, 0, bs,
+                       mbmi->tx_size, cpi->sf.use_fast_coef_costing);
+#if CONFIG_PVQ
+      od_encode_rollback(&x->daala_enc, &pre_buf);
+#endif  // CONFIG_PVQ
+      if (this_rd_stats.rate == INT_MAX) continue;
+#if !CONFIG_TXK_SEL
+      // Account for signalling the transform type (plane 0 = luma).
+      this_rd_stats.rate +=
+          av1_tx_type_cost(cpi, xd, bs, 0, mbmi->tx_size, tx_type);
+#endif
+
+      if (this_rd_stats.skip)
+        this_rd = RDCOST(x->rdmult, x->rddiv, s1, this_rd_stats.sse);
+      else
+        this_rd = RDCOST(x->rdmult, x->rddiv, this_rd_stats.rate + s0,
+                         this_rd_stats.dist);
+      // A non-lossless inter block may always fall back to being skipped.
+      if (is_inter && !xd->lossless[mbmi->segment_id] && !this_rd_stats.skip)
+        this_rd =
+            AOMMIN(this_rd, RDCOST(x->rdmult, x->rddiv, s1, this_rd_stats.sse));
+
+      if (this_rd < best_rd) {
+        best_rd = this_rd;
+        best_tx_type = mbmi->tx_type;
+        *rd_stats = this_rd_stats;
+#if CONFIG_PVQ
+        od_encode_checkpoint(&x->daala_enc, &post_buf);
+#endif  // CONFIG_PVQ
+      }
+    }
+#if CONFIG_PVQ
+    od_encode_rollback(&x->daala_enc, &post_buf);
+#endif  // CONFIG_PVQ
+  } else {
+    mbmi->tx_type = DCT_DCT;
+    txfm_rd_in_plane(x, cpi, rd_stats, ref_best_rd, 0, bs, mbmi->tx_size,
+                     cpi->sf.use_fast_coef_costing);
+  }
+#else   // CONFIG_EXT_TX
+  if (mbmi->tx_size < TX_32X32 && !xd->lossless[mbmi->segment_id]) {
+    for (tx_type = 0; tx_type < TX_TYPES; ++tx_type) {
+      RD_STATS this_rd_stats;
+      if (!is_inter && x->use_default_intra_tx_type &&
+          tx_type != get_default_tx_type(0, xd, 0, mbmi->tx_size))
+        continue;
+      if (is_inter && x->use_default_inter_tx_type &&
+          tx_type != get_default_tx_type(0, xd, 0, mbmi->tx_size))
+        continue;
+      mbmi->tx_type = tx_type;
+      txfm_rd_in_plane(x, cpi, &this_rd_stats, ref_best_rd, 0, bs,
+                       mbmi->tx_size, cpi->sf.use_fast_coef_costing);
+      if (this_rd_stats.rate == INT_MAX) continue;
+
+#if !CONFIG_TXK_SEL
+      // Account for signalling the transform type (plane 0 = luma).
+      this_rd_stats.rate +=
+          av1_tx_type_cost(cpi, xd, bs, 0, mbmi->tx_size, tx_type);
+#endif
+      if (is_inter) {
+        if (cpi->sf.tx_type_search.prune_mode > NO_PRUNE &&
+            !do_tx_type_search(tx_type, prune))
+          continue;
+      }
+      if (this_rd_stats.skip)
+        this_rd = RDCOST(x->rdmult, x->rddiv, s1, this_rd_stats.sse);
+      else
+        this_rd = RDCOST(x->rdmult, x->rddiv, this_rd_stats.rate + s0,
+                         this_rd_stats.dist);
+      // A non-lossless inter block may always fall back to being skipped.
+      if (is_inter && !xd->lossless[mbmi->segment_id] && !this_rd_stats.skip)
+        this_rd =
+            AOMMIN(this_rd, RDCOST(x->rdmult, x->rddiv, s1, this_rd_stats.sse));
+
+      if (this_rd < best_rd) {
+        best_rd = this_rd;
+        best_tx_type = mbmi->tx_type;
+        *rd_stats = this_rd_stats;
+      }
+    }
+  } else {
+    mbmi->tx_type = DCT_DCT;
+    txfm_rd_in_plane(x, cpi, rd_stats, ref_best_rd, 0, bs, mbmi->tx_size,
+                     cpi->sf.use_fast_coef_costing);
+  }
+#endif  // CONFIG_EXT_TX
+  mbmi->tx_type = best_tx_type;
+}
+
+// Forces the block to TX_4X4 with DCT_DCT and evaluates luma with
+// txfm_rd_in_plane(), writing the result to `rd_stats`. super_block_yrd()
+// calls this for lossless segments.
+static void choose_smallest_tx_size(const AV1_COMP *const cpi, MACROBLOCK *x,
+                                    RD_STATS *rd_stats, int64_t ref_best_rd,
+                                    BLOCK_SIZE bs) {
+  MB_MODE_INFO *const mbmi = &x->e_mbd.mi[0]->mbmi;
+
+  mbmi->tx_type = DCT_DCT;
+  mbmi->tx_size = TX_4X4;
+#if CONFIG_VAR_TX
+  mbmi->min_tx_size = get_min_tx_size(TX_4X4);
+#endif  // CONFIG_VAR_TX
+
+  txfm_rd_in_plane(x, cpi, rd_stats, ref_best_rd, 0, bs, TX_4X4,
+                   cpi->sf.use_fast_coef_costing);
+}
+
+#if CONFIG_TXK_SEL || CONFIG_VAR_TX
+// Returns the pixel count of `bsize` divided by the area of the transform
+// size at index 0 of tx_size_wide_log2 (presumably the smallest square
+// transform -- confirm against the table definition).
+static INLINE int bsize_to_num_blk(BLOCK_SIZE bsize) {
+  const int unit_area_log2 = 2 * tx_size_wide_log2[0];
+  return 1 << (num_pels_log2_lookup[bsize] - unit_area_log2);
+}
+#endif  // CONFIG_TXK_SEL || CONFIG_VAR_TX
+
+ // Searches the luma transform size and type for block `bs` by evaluating
+ // candidates with txfm_yrd() and keeping the one with the lowest RD cost.
+ //
+ // When cm->tx_mode is TX_MODE_SELECT, square sizes are tried from
+ // max_txsize_lookup[bs] downwards to TX_4X4 (TX_8X8 when the maximum is at
+ // least TX_32X32); otherwise only the size returned by tx_size_from_tx_mode()
+ // is tried. With CONFIG_EXT_TX && CONFIG_RECT_TX the rectangular size
+ // max_txsize_rect_lookup[bs] may be evaluated first.
+ //
+ // On return mbmi->tx_size / mbmi->tx_type hold the winner and `rd_stats` its
+ // statistics. If no candidate produced a finite cost, `rd_stats` stays as set
+ // by av1_invalid_rd_stats() and the size/type default to max_tx_size/DCT_DCT.
+static void choose_tx_size_type_from_rd(const AV1_COMP *const cpi,
+ MACROBLOCK *x, RD_STATS *rd_stats,
+ int64_t ref_best_rd, BLOCK_SIZE bs) {
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+ int64_t rd = INT64_MAX;
+ int n;
+ int start_tx, end_tx;
+ int64_t best_rd = INT64_MAX, last_rd = INT64_MAX;
+ const TX_SIZE max_tx_size = max_txsize_lookup[bs];
+ TX_SIZE best_tx_size = max_tx_size;
+ TX_TYPE best_tx_type = DCT_DCT;
+#if CONFIG_TXK_SEL
+ // NOTE(review): best_txk_type is only written when a candidate improves
+ // best_rd, but it is copied back into mbmi->txk_type unconditionally at the
+ // end of the function -- confirm a winner always exists in this configuration.
+ TX_TYPE best_txk_type[MAX_SB_SQUARE / (TX_SIZE_W_MIN * TX_SIZE_H_MIN)];
+ const int num_blk = bsize_to_num_blk(bs);
+#endif // CONFIG_TXK_SEL
+ const int tx_select = cm->tx_mode == TX_MODE_SELECT;
+ const int is_inter = is_inter_block(mbmi);
+#if CONFIG_PVQ
+ // Snapshot of the PVQ encoder state; restored after every trial below.
+ od_rollback_buffer buf;
+ od_encode_checkpoint(&x->daala_enc, &buf);
+#endif // CONFIG_PVQ
+
+ av1_invalid_rd_stats(rd_stats);
+
+#if CONFIG_EXT_TX && CONFIG_RECT_TX
+ // Stage 1: optionally evaluate the rectangular transform size for this block.
+ int evaluate_rect_tx = 0;
+ if (tx_select) {
+ evaluate_rect_tx = is_rect_tx_allowed(xd, mbmi);
+ } else {
+ const TX_SIZE chosen_tx_size =
+ tx_size_from_tx_mode(bs, cm->tx_mode, is_inter);
+ evaluate_rect_tx = is_rect_tx(chosen_tx_size);
+ assert(IMPLIES(evaluate_rect_tx, is_rect_tx_allowed(xd, mbmi)));
+ }
+ if (evaluate_rect_tx) {
+ TX_TYPE tx_start = DCT_DCT;
+ TX_TYPE tx_end = TX_TYPES;
+#if CONFIG_TXK_SEL
+ // The tx_type becomes dummy when lv_map is on. The tx_type search will be
+ // performed in av1_search_txk_type()
+ tx_end = DCT_DCT + 1;
+#endif
+ TX_TYPE tx_type;
+ for (tx_type = tx_start; tx_type < tx_end; ++tx_type) {
+#if CONFIG_REF_MV
+ if (mbmi->ref_mv_idx > 0 && tx_type != DCT_DCT) continue;
+#endif // CONFIG_REF_MV
+ const TX_SIZE rect_tx_size = max_txsize_rect_lookup[bs];
+ RD_STATS this_rd_stats;
+ int ext_tx_set =
+ get_ext_tx_set(rect_tx_size, bs, is_inter, cm->reduced_tx_set_used);
+ // Only evaluate types that belong to the transform set for this size.
+ if ((is_inter && ext_tx_used_inter[ext_tx_set][tx_type]) ||
+ (!is_inter && ext_tx_used_intra[ext_tx_set][tx_type])) {
+ rd = txfm_yrd(cpi, x, &this_rd_stats, ref_best_rd, bs, tx_type,
+ rect_tx_size);
+ if (rd < best_rd) {
+#if CONFIG_TXK_SEL
+ memcpy(best_txk_type, mbmi->txk_type,
+ sizeof(best_txk_type[0]) * num_blk);
+#endif
+ best_tx_type = tx_type;
+ best_tx_size = rect_tx_size;
+ best_rd = rd;
+ *rd_stats = this_rd_stats;
+ }
+ }
+#if CONFIG_CB4X4 && !USE_TXTYPE_SEARCH_FOR_SUB8X8_IN_CB4X4
+ // This local shadows the function-scope is_inter with the same value.
+ // Sub-8x8 inter blocks stop after the first transform type.
+ const int is_inter = is_inter_block(mbmi);
+ if (mbmi->sb_type < BLOCK_8X8 && is_inter) break;
+#endif // CONFIG_CB4X4 && !USE_TXTYPE_SEARCH_FOR_SUB8X8_IN_CB4X4
+ }
+ }
+#endif // CONFIG_EXT_TX && CONFIG_RECT_TX
+
+ // Stage 2: choose the range of square transform sizes to search.
+ if (tx_select) {
+ start_tx = max_tx_size;
+ end_tx = (max_tx_size >= TX_32X32) ? TX_8X8 : TX_4X4;
+ } else {
+ const TX_SIZE chosen_tx_size =
+ tx_size_from_tx_mode(bs, cm->tx_mode, is_inter);
+ start_tx = chosen_tx_size;
+ end_tx = chosen_tx_size;
+ }
+
+ last_rd = INT64_MAX;
+ // Sizes are visited from largest to smallest.
+ for (n = start_tx; n >= end_tx; --n) {
+#if CONFIG_EXT_TX && CONFIG_RECT_TX
+ if (is_rect_tx(n)) break;
+#endif // CONFIG_EXT_TX && CONFIG_RECT_TX
+ TX_TYPE tx_start = DCT_DCT;
+ TX_TYPE tx_end = TX_TYPES;
+#if CONFIG_TXK_SEL
+ // The tx_type becomes dummy when lv_map is on. The tx_type search will be
+ // performed in av1_search_txk_type()
+ tx_end = DCT_DCT + 1;
+#endif
+ TX_TYPE tx_type;
+ for (tx_type = tx_start; tx_type < tx_end; ++tx_type) {
+ RD_STATS this_rd_stats;
+ if (skip_txfm_search(cpi, x, bs, tx_type, n)) continue;
+ rd = txfm_yrd(cpi, x, &this_rd_stats, ref_best_rd, bs, tx_type, n);
+#if CONFIG_PVQ
+ od_encode_rollback(&x->daala_enc, &buf);
+#endif // CONFIG_PVQ
+ // Early termination in transform size search.
+ // The break only leaves the tx_type loop for the current size; the outer
+ // size loop continues, and last_rd carries over between sizes.
+ if (cpi->sf.tx_size_search_breakout &&
+ (rd == INT64_MAX ||
+ (this_rd_stats.skip == 1 && tx_type != DCT_DCT && n < start_tx) ||
+ (n < (int)max_tx_size && rd > last_rd)))
+ break;
+
+ last_rd = rd;
+ if (rd < best_rd) {
+#if CONFIG_TXK_SEL
+ memcpy(best_txk_type, mbmi->txk_type,
+ sizeof(best_txk_type[0]) * num_blk);
+#endif
+ best_tx_type = tx_type;
+ best_tx_size = n;
+ best_rd = rd;
+ *rd_stats = this_rd_stats;
+ }
+#if CONFIG_CB4X4 && !USE_TXTYPE_SEARCH_FOR_SUB8X8_IN_CB4X4
+ // This local shadows the function-scope is_inter with the same value.
+ // Sub-8x8 inter blocks stop after the first transform type.
+ const int is_inter = is_inter_block(mbmi);
+ if (mbmi->sb_type < BLOCK_8X8 && is_inter) break;
+#endif // CONFIG_CB4X4 && !USE_TXTYPE_SEARCH_FOR_SUB8X8_IN_CB4X4
+ }
+ }
+ // Commit the winning configuration; txfm_yrd() overwrote these per trial.
+ mbmi->tx_size = best_tx_size;
+ mbmi->tx_type = best_tx_type;
+#if CONFIG_TXK_SEL
+ memcpy(mbmi->txk_type, best_txk_type, sizeof(best_txk_type[0]) * num_blk);
+#endif
+
+#if CONFIG_VAR_TX
+ mbmi->min_tx_size = get_min_tx_size(mbmi->tx_size);
+#endif // CONFIG_VAR_TX
+
+#if !CONFIG_EXT_TX
+ if (mbmi->tx_size >= TX_32X32) assert(mbmi->tx_type == DCT_DCT);
+#endif // !CONFIG_EXT_TX
+#if CONFIG_PVQ
+ // Every trial was rolled back above, so re-run the winner to leave the PVQ
+ // encoder state matching the chosen configuration.
+ if (best_rd != INT64_MAX) {
+ txfm_yrd(cpi, x, rd_stats, ref_best_rd, bs, best_tx_type, best_tx_size);
+ }
+#endif // CONFIG_PVQ
+}
+
+// Computes luma RD statistics for block `bs` into `rd_stats`, dispatching to
+// one of three transform-selection strategies:
+//   - lossless segment:               choose_smallest_tx_size()
+//   - USE_LARGESTALL speed feature:   choose_largest_tx_size()
+//   - otherwise:                      choose_tx_size_type_from_rd()
+// `bs` must equal the current block's sb_type.
+static void super_block_yrd(const AV1_COMP *const cpi, MACROBLOCK *x,
+                            RD_STATS *rd_stats, BLOCK_SIZE bs,
+                            int64_t ref_best_rd) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  av1_init_rd_stats(rd_stats);
+
+  assert(bs == xd->mi[0]->mbmi.sb_type);
+
+  const int lossless = xd->lossless[xd->mi[0]->mbmi.segment_id];
+  if (lossless) {
+    choose_smallest_tx_size(cpi, x, rd_stats, ref_best_rd, bs);
+    return;
+  }
+  if (cpi->sf.tx_size_search_method == USE_LARGESTALL) {
+    choose_largest_tx_size(cpi, x, rd_stats, ref_best_rd, bs);
+    return;
+  }
+  choose_tx_size_type_from_rd(cpi, x, rd_stats, ref_best_rd, bs);
+}
+
+// Returns 1 when the oblique intra `mode` should be skipped because the best
+// intra mode found so far is not one of the two modes paired with it below;
+// returns 0 for every other mode.
+static int conditional_skipintra(PREDICTION_MODE mode,
+                                 PREDICTION_MODE best_intra_mode) {
+  switch (mode) {
+    case D117_PRED:
+      return best_intra_mode != V_PRED && best_intra_mode != D135_PRED;
+    case D63_PRED:
+      return best_intra_mode != V_PRED && best_intra_mode != D45_PRED;
+    case D207_PRED:
+      return best_intra_mode != H_PRED && best_intra_mode != D45_PRED;
+    case D153_PRED:
+      return best_intra_mode != H_PRED && best_intra_mode != D135_PRED;
+    default: return 0;
+  }
+}
+
+// Model based RD estimation for luma intra blocks.
+//
+// Builds the intra prediction for every transform block of `bsize` at the
+// transform size given by tx_size_from_tx_mode(), then asks model_rd_for_sb()
+// for a rate/distortion estimate instead of running the real transform path.
+// `mode_cost` is the caller's mode signalling rate; angle-delta and
+// filter-intra signalling costs are added to it where those tools are
+// compiled in. Stores the transform size into the block's mode info and
+// returns the modelled RD cost.
+static int64_t intra_model_yrd(const AV1_COMP *const cpi, MACROBLOCK *const x,
+                               BLOCK_SIZE bsize, int mode_cost) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  RD_STATS model_stats;
+  int64_t model_sse;
+  int blk_row, blk_col;
+  const TX_SIZE tx_size = tx_size_from_tx_mode(bsize, cpi->common.tx_mode, 0);
+  const int row_step = tx_size_high_unit[tx_size];
+  const int col_step = tx_size_wide_unit[tx_size];
+  const int blocks_wide = max_block_wide(xd, bsize, 0);
+  const int blocks_high = max_block_high(xd, bsize, 0);
+  mbmi->tx_size = tx_size;
+
+  // Prediction: visit transform blocks in raster order.
+  const int units_per_tx = row_step * col_step;
+  int block = 0;
+  for (blk_row = 0; blk_row < blocks_high; blk_row += row_step) {
+    for (blk_col = 0; blk_col < blocks_wide; blk_col += col_step) {
+      av1_predict_intra_block_facade(xd, 0, block, blk_col, blk_row, tx_size);
+      block += units_per_tx;
+    }
+  }
+
+  // RD estimation.
+  model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &model_stats.rate,
+                  &model_stats.dist, &model_stats.skip, &model_sse);
+#if CONFIG_EXT_INTRA
+  if (av1_is_directional_mode(mbmi->mode, bsize)) {
+    const int delta_symbol = MAX_ANGLE_DELTA + mbmi->angle_delta[0];
+    mode_cost += write_uniform_cost(2 * MAX_ANGLE_DELTA + 1, delta_symbol);
+  }
+#endif  // CONFIG_EXT_INTRA
+#if CONFIG_FILTER_INTRA
+  if (mbmi->mode == DC_PRED) {
+    const aom_prob prob = cpi->common.fc->filter_intra_probs[0];
+    if (!mbmi->filter_intra_mode_info.use_filter_intra_mode[0]) {
+      mode_cost += av1_cost_bit(prob, 0);
+    } else {
+      const int mode = mbmi->filter_intra_mode_info.filter_intra_mode[0];
+      mode_cost += (av1_cost_bit(prob, 1) +
+                    write_uniform_cost(FILTER_INTRA_MODES, mode));
+    }
+  }
+#endif  // CONFIG_FILTER_INTRA
+  return RDCOST(x->rdmult, x->rddiv, model_stats.rate + mode_cost,
+                model_stats.dist);
+}
+
+#if CONFIG_PALETTE
+// Grows 'color_map' in place from 'orig_width x orig_height' to
+// 'new_width x new_height'. The new columns of each row repeat that row's
+// last valid entry, and the new rows repeat the last valid (already widened)
+// row. The buffer must have room for new_width * new_height entries.
+static void extend_palette_color_map(uint8_t *const color_map, int orig_width,
+                                     int orig_height, int new_width,
+                                     int new_height) {
+  assert(new_width >= orig_width);
+  assert(new_height >= orig_height);
+  if (new_width == orig_width && new_height == orig_height) return;
+
+  const int extra_cols = new_width - orig_width;
+  int row = orig_height;
+  // Re-stride from the bottom row upwards so that a widened row never
+  // overwrites a source row that has not been moved yet.
+  while (row-- > 0) {
+    uint8_t *const wide_row = color_map + row * new_width;
+    memmove(wide_row, color_map + row * orig_width, orig_width);
+    // Copy last column to extra columns.
+    memset(wide_row + orig_width, wide_row[orig_width - 1], extra_cols);
+  }
+
+  // Copy last row to extra rows.
+  for (row = orig_height; row < new_height; ++row) {
+    const uint8_t *const last_valid_row =
+        color_map + (orig_height - 1) * new_width;
+    memcpy(color_map + row * new_width, last_valid_row, new_width);
+  }
+}
+
+// Luma palette mode search.
+//
+// Counts the distinct source colors of the block and, when there are between
+// 2 and 64 of them, tries palettes of decreasing size (starting at
+// min(colors, PALETTE_MAX_SIZE), down to 2) built with k-means. Each
+// candidate is first screened with the model-based estimate from
+// intra_model_yrd() and then evaluated with super_block_yrd().
+//
+// Whenever a candidate beats *best_rd, the function updates *best_rd,
+// *best_mbmi and best_palette_color_map, and writes the optional outputs
+// rate, rate_tokenonly, distortion and skippable (each may be NULL).
+// *best_model_rd tracks the lowest model estimate seen.
+//
+// Before the normal return, *mbmi is restored from *best_mbmi and, if the
+// best mode uses a palette, the color index map is restored from
+// best_palette_color_map. Note that the early return for blocks larger than
+// PALETTE_MAX_BLOCK_SIZE happens after mbmi->mode has been set to DC_PRED and
+// skips this restoration.
+//
+// Returns the rate overhead of the best palette candidate found by this call
+// (its total rate minus its token-only rate), or 0 if none improved *best_rd.
+static int rd_pick_palette_intra_sby(const AV1_COMP *const cpi, MACROBLOCK *x,
+                                     BLOCK_SIZE bsize, int palette_ctx,
+                                     int dc_mode_cost, MB_MODE_INFO *best_mbmi,
+                                     uint8_t *best_palette_color_map,
+                                     int64_t *best_rd, int64_t *best_model_rd,
+                                     int *rate, int *rate_tokenonly,
+                                     int64_t *distortion, int *skippable) {
+  int rate_overhead = 0;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MODE_INFO *const mic = xd->mi[0];
+  MB_MODE_INFO *const mbmi = &mic->mbmi;
+  int this_rate, colors, n;
+  const int src_stride = x->plane[0].src.stride;
+  const uint8_t *const src = x->plane[0].src.buf;
+  uint8_t *const color_map = xd->plane[0].color_index_map;
+  // rows/cols are the dimensions passed to the color counting and k-means
+  // steps; block_width/block_height are the dimensions the color map is
+  // extended to and stored with.
+  int block_width, block_height, rows, cols;
+  av1_get_block_dimensions(bsize, 0, xd, &block_width, &block_height, &rows,
+                           &cols);
+
+  assert(cpi->common.allow_screen_content_tools);
+
+#if CONFIG_HIGHBITDEPTH
+  if (cpi->common.use_highbitdepth)
+    colors = av1_count_colors_highbd(src, src_stride, rows, cols,
+                                     cpi->common.bit_depth);
+  else
+#endif  // CONFIG_HIGHBITDEPTH
+    colors = av1_count_colors(src, src_stride, rows, cols);
+#if CONFIG_FILTER_INTRA
+  mbmi->filter_intra_mode_info.use_filter_intra_mode[0] = 0;
+#endif  // CONFIG_FILTER_INTRA
+
+  if (colors > 1 && colors <= 64) {
+    int r, c, i, j, k, palette_mode_cost;
+    const int max_itr = 50;
+    uint8_t color_order[PALETTE_MAX_SIZE];
+    float *const data = x->palette_buffer->kmeans_data_buf;
+    float centroids[PALETTE_MAX_SIZE];
+    float lb, ub, val;
+    RD_STATS tokenonly_rd_stats;
+    int64_t this_rd, this_model_rd;
+    PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+#if CONFIG_HIGHBITDEPTH
+    uint16_t *src16 = CONVERT_TO_SHORTPTR(src);
+    if (cpi->common.use_highbitdepth)
+      lb = ub = src16[0];
+    else
+#endif  // CONFIG_HIGHBITDEPTH
+      lb = ub = src[0];
+
+    // Copy the source samples into the k-means buffer and track the value
+    // range [lb, ub] used to seed the centroids.
+#if CONFIG_HIGHBITDEPTH
+    if (cpi->common.use_highbitdepth) {
+      for (r = 0; r < rows; ++r) {
+        for (c = 0; c < cols; ++c) {
+          val = src16[r * src_stride + c];
+          data[r * cols + c] = val;
+          if (val < lb)
+            lb = val;
+          else if (val > ub)
+            ub = val;
+        }
+      }
+    } else {
+#endif  // CONFIG_HIGHBITDEPTH
+      for (r = 0; r < rows; ++r) {
+        for (c = 0; c < cols; ++c) {
+          val = src[r * src_stride + c];
+          data[r * cols + c] = val;
+          if (val < lb)
+            lb = val;
+          else if (val > ub)
+            ub = val;
+        }
+      }
+#if CONFIG_HIGHBITDEPTH
+    }
+#endif  // CONFIG_HIGHBITDEPTH
+
+    mbmi->mode = DC_PRED;
+#if CONFIG_FILTER_INTRA
+    mbmi->filter_intra_mode_info.use_filter_intra_mode[0] = 0;
+#endif  // CONFIG_FILTER_INTRA
+
+    if (rows * cols > PALETTE_MAX_BLOCK_SIZE) return 0;
+
+    for (n = colors > PALETTE_MAX_SIZE ? PALETTE_MAX_SIZE : colors; n >= 2;
+         --n) {
+      // Seed n centroids evenly across [lb, ub].
+      for (i = 0; i < n; ++i)
+        centroids[i] = lb + (2 * i + 1) * (ub - lb) / n / 2;
+      av1_k_means(data, centroids, color_map, rows * cols, n, 1, max_itr);
+      // k is the palette size left after duplicate centroids are removed.
+      k = av1_remove_duplicates(centroids, n);
+
+#if CONFIG_HIGHBITDEPTH
+      if (cpi->common.use_highbitdepth)
+        for (i = 0; i < k; ++i)
+          pmi->palette_colors[i] =
+              clip_pixel_highbd((int)centroids[i], cpi->common.bit_depth);
+      else
+#endif  // CONFIG_HIGHBITDEPTH
+        for (i = 0; i < k; ++i)
+          pmi->palette_colors[i] = clip_pixel((int)centroids[i]);
+      pmi->palette_size[0] = k;
+
+      av1_calc_indices(data, centroids, color_map, rows * cols, k, 1);
+      // From here on the map is block_width x block_height with stride
+      // block_width.
+      extend_palette_color_map(color_map, cols, rows, block_width,
+                               block_height);
+      palette_mode_cost =
+          dc_mode_cost +
+          cpi->palette_y_size_cost[bsize - BLOCK_8X8][k - PALETTE_MIN_SIZE] +
+          write_uniform_cost(k, color_map[0]) +
+          av1_cost_bit(
+              av1_default_palette_y_mode_prob[bsize - BLOCK_8X8][palette_ctx],
+              1);
+      palette_mode_cost += av1_palette_color_cost_y(pmi, cpi->common.bit_depth);
+      // Cost of every color index except the first, which was costed above.
+      for (i = 0; i < rows; ++i) {
+        for (j = (i == 0 ? 1 : 0); j < cols; ++j) {
+          int color_idx;
+          const int color_ctx = av1_get_palette_color_index_context(
+              color_map, block_width, i, j, k, color_order, &color_idx);
+          assert(color_idx >= 0 && color_idx < k);
+          palette_mode_cost += cpi->palette_y_color_cost[k - PALETTE_MIN_SIZE]
+                                                        [color_ctx][color_idx];
+        }
+      }
+      // Skip the full evaluation when the model estimate is more than 1.5x
+      // the best model estimate seen so far.
+      this_model_rd = intra_model_yrd(cpi, x, bsize, palette_mode_cost);
+      if (*best_model_rd != INT64_MAX &&
+          this_model_rd > *best_model_rd + (*best_model_rd >> 1))
+        continue;
+      if (this_model_rd < *best_model_rd) *best_model_rd = this_model_rd;
+      super_block_yrd(cpi, x, &tokenonly_rd_stats, bsize, *best_rd);
+      if (tokenonly_rd_stats.rate == INT_MAX) continue;
+      this_rate = tokenonly_rd_stats.rate + palette_mode_cost;
+      this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, tokenonly_rd_stats.dist);
+      // The token-only rate is reported without the transform-size cost.
+      if (!xd->lossless[mbmi->segment_id] && mbmi->sb_type >= BLOCK_8X8) {
+        tokenonly_rd_stats.rate -= tx_size_cost(cpi, x, bsize, mbmi->tx_size);
+      }
+      if (this_rd < *best_rd) {
+        *best_rd = this_rd;
+        memcpy(best_palette_color_map, color_map,
+               block_width * block_height * sizeof(color_map[0]));
+        *best_mbmi = *mbmi;
+        rate_overhead = this_rate - tokenonly_rd_stats.rate;
+        if (rate) *rate = this_rate;
+        if (rate_tokenonly) *rate_tokenonly = tokenonly_rd_stats.rate;
+        if (distortion) *distortion = tokenonly_rd_stats.dist;
+        if (skippable) *skippable = tokenonly_rd_stats.skip;
+      }
+    }
+  }
+
+  if (best_mbmi->palette_mode_info.palette_size[0] > 0) {
+    // Restore the full extended map. It was saved with
+    // block_width * block_height entries, so copying only rows * cols would
+    // leave part of the map stale whenever rows/cols are smaller than the
+    // block dimensions.
+    memcpy(color_map, best_palette_color_map,
+           block_width * block_height * sizeof(best_palette_color_map[0]));
+  }
+  *mbmi = *best_mbmi;
+  return rate_overhead;
+}
+#endif // CONFIG_PALETTE
+
+static int64_t rd_pick_intra_sub_8x8_y_subblock_mode(
+ const AV1_COMP *const cpi, MACROBLOCK *x, int row, int col,
+ PREDICTION_MODE *best_mode, const int *bmode_costs, ENTROPY_CONTEXT *a,
+ ENTROPY_CONTEXT *l, int *bestrate, int *bestratey, int64_t *bestdistortion,
+ BLOCK_SIZE bsize, TX_SIZE tx_size, int *y_skip, int64_t rd_thresh) {
+ const AV1_COMMON *const cm = &cpi->common;
+ PREDICTION_MODE mode;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ int64_t best_rd = rd_thresh;
+ struct macroblock_plane *p = &x->plane[0];
+ struct macroblockd_plane *pd = &xd->plane[0];
+ const int src_stride = p->src.stride;
+ const int dst_stride = pd->dst.stride;
+ const uint8_t *src_init = &p->src.buf[row * 4 * src_stride + col * 4];
+ uint8_t *dst_init = &pd->dst.buf[row * 4 * dst_stride + col * 4];
+#if CONFIG_CB4X4
+ // TODO(jingning): This is a temporal change. The whole function should be
+ // out when cb4x4 is enabled.
+ ENTROPY_CONTEXT ta[4], tempa[4];
+ ENTROPY_CONTEXT tl[4], templ[4];
+#else
+ ENTROPY_CONTEXT ta[2], tempa[2];
+ ENTROPY_CONTEXT tl[2], templ[2];
+#endif // CONFIG_CB4X4
+
+ const int pred_width_in_4x4_blocks = num_4x4_blocks_wide_lookup[bsize];
+ const int pred_height_in_4x4_blocks = num_4x4_blocks_high_lookup[bsize];
+ const int tx_width_unit = tx_size_wide_unit[tx_size];
+ const int tx_height_unit = tx_size_high_unit[tx_size];
+ const int pred_block_width = block_size_wide[bsize];
+ const int pred_block_height = block_size_high[bsize];
+ const int tx_width = tx_size_wide[tx_size];
+ const int tx_height = tx_size_high[tx_size];
+ const int pred_width_in_transform_blocks = pred_block_width / tx_width;
+ const int pred_height_in_transform_blocks = pred_block_height / tx_height;
+ int idx, idy;
+ int best_can_skip = 0;
+ uint8_t best_dst[8 * 8];
+#if CONFIG_HIGHBITDEPTH
+ uint16_t best_dst16[8 * 8];
+#endif // CONFIG_HIGHBITDEPTH
+ const int is_lossless = xd->lossless[xd->mi[0]->mbmi.segment_id];
+#if CONFIG_EXT_TX && CONFIG_RECT_TX
+ const int sub_bsize = bsize;
+#else
+ const int sub_bsize = BLOCK_4X4;
+#endif // CONFIG_EXT_TX && CONFIG_RECT_TX
+
+#if CONFIG_PVQ
+ od_rollback_buffer pre_buf, post_buf;
+ od_encode_checkpoint(&x->daala_enc, &pre_buf);
+ od_encode_checkpoint(&x->daala_enc, &post_buf);
+#endif // CONFIG_PVQ
+
+ assert(bsize < BLOCK_8X8);
+ assert(tx_width < 8 || tx_height < 8);
+#if CONFIG_EXT_TX && CONFIG_RECT_TX
+ if (is_lossless)
+ assert(tx_width == 4 && tx_height == 4);
+ else
+ assert(tx_width == pred_block_width && tx_height == pred_block_height);
+#else
+ assert(tx_width == 4 && tx_height == 4);
+#endif // CONFIG_EXT_TX && CONFIG_RECT_TX
+
+ memcpy(ta, a, pred_width_in_transform_blocks * sizeof(a[0]));
+ memcpy(tl, l, pred_height_in_transform_blocks * sizeof(l[0]));
+
+ xd->mi[0]->mbmi.tx_size = tx_size;
+
+#if CONFIG_PALETTE
+ xd->mi[0]->mbmi.palette_mode_info.palette_size[0] = 0;
+#endif // CONFIG_PALETTE
+
+#if CONFIG_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+#if CONFIG_PVQ
+ od_encode_checkpoint(&x->daala_enc, &pre_buf);
+#endif
+ for (mode = DC_PRED; mode <= TM_PRED; ++mode) {
+ int64_t this_rd;
+ int ratey = 0;
+ int64_t distortion = 0;
+ int rate = bmode_costs[mode];
+ int can_skip = 1;
+
+ if (!(cpi->sf.intra_y_mode_mask[txsize_sqr_up_map[tx_size]] &
+ (1 << mode)))
+ continue;
+
+ // Only do the oblique modes if the best so far is
+ // one of the neighboring directional modes
+ if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) {
+ if (conditional_skipintra(mode, *best_mode)) continue;
+ }
+
+ memcpy(tempa, ta, pred_width_in_transform_blocks * sizeof(ta[0]));
+ memcpy(templ, tl, pred_height_in_transform_blocks * sizeof(tl[0]));
+
+ for (idy = 0; idy < pred_height_in_transform_blocks; ++idy) {
+ for (idx = 0; idx < pred_width_in_transform_blocks; ++idx) {
+ const int block_raster_idx = (row + idy) * 2 + (col + idx);
+ const int block =
+ av1_raster_order_to_block_index(tx_size, block_raster_idx);
+ const uint8_t *const src = &src_init[idx * 4 + idy * 4 * src_stride];
+ uint8_t *const dst = &dst_init[idx * 4 + idy * 4 * dst_stride];
+#if !CONFIG_PVQ
+ int16_t *const src_diff = av1_raster_block_offset_int16(
+ BLOCK_8X8, block_raster_idx, p->src_diff);
+#endif
+ int skip;
+ assert(block < 4);
+ assert(IMPLIES(tx_size == TX_4X8 || tx_size == TX_8X4,
+ idx == 0 && idy == 0));
+ assert(IMPLIES(tx_size == TX_4X8 || tx_size == TX_8X4,
+ block == 0 || block == 2));
+ xd->mi[0]->bmi[block_raster_idx].as_mode = mode;
+ av1_predict_intra_block(
+ xd, pd->width, pd->height, txsize_to_bsize[tx_size], mode, dst,
+ dst_stride, dst, dst_stride, col + idx, row + idy, 0);
+#if !CONFIG_PVQ
+ aom_highbd_subtract_block(tx_height, tx_width, src_diff, 8, src,
+ src_stride, dst, dst_stride, xd->bd);
+#endif
+ if (is_lossless) {
+ TX_TYPE tx_type = get_tx_type(PLANE_TYPE_Y, xd, block, tx_size);
+ const SCAN_ORDER *scan_order = get_scan(cm, tx_size, tx_type, 0);
+ const int coeff_ctx =
+ combine_entropy_contexts(tempa[idx], templ[idy]);
+#if !CONFIG_PVQ
+ av1_xform_quant(cm, x, 0, block, row + idy, col + idx, BLOCK_8X8,
+ tx_size, coeff_ctx, AV1_XFORM_QUANT_FP);
+ ratey += av1_cost_coeffs(cpi, x, 0, block, tx_size, scan_order,
+ tempa + idx, templ + idy,
+ cpi->sf.use_fast_coef_costing);
+ skip = (p->eobs[block] == 0);
+ can_skip &= skip;
+ tempa[idx] = !skip;
+ templ[idy] = !skip;
+#if CONFIG_EXT_TX
+ if (tx_size == TX_8X4) {
+ tempa[idx + 1] = tempa[idx];
+ } else if (tx_size == TX_4X8) {
+ templ[idy + 1] = templ[idy];
+ }
+#endif // CONFIG_EXT_TX
+#else
+ (void)scan_order;
+
+ av1_xform_quant(cm, x, 0, block, row + idy, col + idx, BLOCK_8X8,
+ tx_size, coeff_ctx, AV1_XFORM_QUANT_B);
+
+ ratey += x->rate;
+ skip = x->pvq_skip[0];
+ tempa[idx] = !skip;
+ templ[idy] = !skip;
+ can_skip &= skip;
+#endif
+ if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)
+ goto next_highbd;
+#if CONFIG_PVQ
+ if (!skip)
+#endif
+ av1_inverse_transform_block(xd, BLOCK_OFFSET(pd->dqcoeff, block),
+ DCT_DCT, tx_size, dst, dst_stride,
+ p->eobs[block]);
+ } else {
+ int64_t dist;
+ unsigned int tmp;
+ TX_TYPE tx_type = get_tx_type(PLANE_TYPE_Y, xd, block, tx_size);
+ const SCAN_ORDER *scan_order = get_scan(cm, tx_size, tx_type, 0);
+ const int coeff_ctx =
+ combine_entropy_contexts(tempa[idx], templ[idy]);
+#if !CONFIG_PVQ
+ av1_xform_quant(cm, x, 0, block, row + idy, col + idx, BLOCK_8X8,
+ tx_size, coeff_ctx, AV1_XFORM_QUANT_FP);
+ av1_optimize_b(cm, x, 0, block, tx_size, coeff_ctx);
+ ratey += av1_cost_coeffs(cpi, x, 0, block, tx_size, scan_order,
+ tempa + idx, templ + idy,
+ cpi->sf.use_fast_coef_costing);
+ skip = (p->eobs[block] == 0);
+ can_skip &= skip;
+ tempa[idx] = !skip;
+ templ[idy] = !skip;
+#if CONFIG_EXT_TX
+ if (tx_size == TX_8X4) {
+ tempa[idx + 1] = tempa[idx];
+ } else if (tx_size == TX_4X8) {
+ templ[idy + 1] = templ[idy];
+ }
+#endif // CONFIG_EXT_TX
+#else
+ (void)scan_order;
+
+ av1_xform_quant(cm, x, 0, block, row + idy, col + idx, BLOCK_8X8,
+ tx_size, coeff_ctx, AV1_XFORM_QUANT_FP);
+ ratey += x->rate;
+ skip = x->pvq_skip[0];
+ tempa[idx] = !skip;
+ templ[idy] = !skip;
+ can_skip &= skip;
+#endif
+#if CONFIG_PVQ
+ if (!skip)
+#endif
+ av1_inverse_transform_block(xd, BLOCK_OFFSET(pd->dqcoeff, block),
+ tx_type, tx_size, dst, dst_stride,
+ p->eobs[block]);
+ cpi->fn_ptr[sub_bsize].vf(src, src_stride, dst, dst_stride, &tmp);
+ dist = (int64_t)tmp << 4;
+ distortion += dist;
+ if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)
+ goto next_highbd;
+ }
+ }
+ }
+
+ rate += ratey;
+ this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
+
+ if (this_rd < best_rd) {
+ *bestrate = rate;
+ *bestratey = ratey;
+ *bestdistortion = distortion;
+ best_rd = this_rd;
+ best_can_skip = can_skip;
+ *best_mode = mode;
+ memcpy(a, tempa, pred_width_in_transform_blocks * sizeof(tempa[0]));
+ memcpy(l, templ, pred_height_in_transform_blocks * sizeof(templ[0]));
+#if CONFIG_PVQ
+ od_encode_checkpoint(&x->daala_enc, &post_buf);
+#endif
+ for (idy = 0; idy < pred_height_in_transform_blocks * 4; ++idy) {
+ memcpy(best_dst16 + idy * 8,
+ CONVERT_TO_SHORTPTR(dst_init + idy * dst_stride),
+ pred_width_in_transform_blocks * 4 * sizeof(uint16_t));
+ }
+ }
+ next_highbd : {}
+#if CONFIG_PVQ
+ od_encode_rollback(&x->daala_enc, &pre_buf);
+#endif
+ }
+
+ if (best_rd >= rd_thresh) return best_rd;
+
+#if CONFIG_PVQ
+ od_encode_rollback(&x->daala_enc, &post_buf);
+#endif
+
+ if (y_skip) *y_skip &= best_can_skip;
+
+ for (idy = 0; idy < pred_height_in_transform_blocks * 4; ++idy) {
+ memcpy(CONVERT_TO_SHORTPTR(dst_init + idy * dst_stride),
+ best_dst16 + idy * 8,
+ pred_width_in_transform_blocks * 4 * sizeof(uint16_t));
+ }
+
+ return best_rd;
+ }
+#endif // CONFIG_HIGHBITDEPTH
+
+#if CONFIG_PVQ
+ od_encode_checkpoint(&x->daala_enc, &pre_buf);
+#endif // CONFIG_PVQ
+
+ for (mode = DC_PRED; mode <= TM_PRED; ++mode) {
+ int64_t this_rd;
+ int ratey = 0;
+ int64_t distortion = 0;
+ int rate = bmode_costs[mode];
+ int can_skip = 1;
+
+ if (!(cpi->sf.intra_y_mode_mask[txsize_sqr_up_map[tx_size]] &
+ (1 << mode))) {
+ continue;
+ }
+
+ // Only do the oblique modes if the best so far is
+ // one of the neighboring directional modes
+ if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) {
+ if (conditional_skipintra(mode, *best_mode)) continue;
+ }
+
+ memcpy(tempa, ta, pred_width_in_transform_blocks * sizeof(ta[0]));
+ memcpy(templ, tl, pred_height_in_transform_blocks * sizeof(tl[0]));
+
+ for (idy = 0; idy < pred_height_in_4x4_blocks; idy += tx_height_unit) {
+ for (idx = 0; idx < pred_width_in_4x4_blocks; idx += tx_width_unit) {
+ const int block_raster_idx = (row + idy) * 2 + (col + idx);
+ int block = av1_raster_order_to_block_index(tx_size, block_raster_idx);
+ const uint8_t *const src = &src_init[idx * 4 + idy * 4 * src_stride];
+ uint8_t *const dst = &dst_init[idx * 4 + idy * 4 * dst_stride];
+#if !CONFIG_PVQ
+ int16_t *const src_diff = av1_raster_block_offset_int16(
+ BLOCK_8X8, block_raster_idx, p->src_diff);
+#endif // !CONFIG_PVQ
+ int skip;
+ assert(block < 4);
+ assert(IMPLIES(tx_size == TX_4X8 || tx_size == TX_8X4,
+ idx == 0 && idy == 0));
+ assert(IMPLIES(tx_size == TX_4X8 || tx_size == TX_8X4,
+ block == 0 || block == 2));
+ xd->mi[0]->bmi[block_raster_idx].as_mode = mode;
+ av1_predict_intra_block(xd, pd->width, pd->height,
+ txsize_to_bsize[tx_size], mode, dst, dst_stride,
+ dst, dst_stride,
+#if CONFIG_CB4X4
+ 2 * (col + idx), 2 * (row + idy),
+#else
+ col + idx, row + idy,
+#endif // CONFIG_CB4X4
+ 0);
+#if !CONFIG_PVQ
+ aom_subtract_block(tx_height, tx_width, src_diff, 8, src, src_stride,
+ dst, dst_stride);
+#endif // !CONFIG_PVQ
+
+ TX_TYPE tx_type = get_tx_type(PLANE_TYPE_Y, xd, block, tx_size);
+ const SCAN_ORDER *scan_order = get_scan(cm, tx_size, tx_type, 0);
+ const int coeff_ctx = combine_entropy_contexts(tempa[idx], templ[idy]);
+#if CONFIG_CB4X4
+ block = 4 * block;
+#endif // CONFIG_CB4X4
+#if !CONFIG_PVQ
+ const AV1_XFORM_QUANT xform_quant =
+ is_lossless ? AV1_XFORM_QUANT_B : AV1_XFORM_QUANT_FP;
+ av1_xform_quant(cm, x, 0, block,
+#if CONFIG_CB4X4
+ 2 * (row + idy), 2 * (col + idx),
+#else
+ row + idy, col + idx,
+#endif // CONFIG_CB4X4
+ BLOCK_8X8, tx_size, coeff_ctx, xform_quant);
+
+ if (!is_lossless) {
+ av1_optimize_b(cm, x, 0, block, tx_size, coeff_ctx);
+ }
+
+ ratey +=
+ av1_cost_coeffs(cpi, x, 0, block, tx_size, scan_order, tempa + idx,
+ templ + idy, cpi->sf.use_fast_coef_costing);
+ skip = (p->eobs[block] == 0);
+ can_skip &= skip;
+ tempa[idx] = !skip;
+ templ[idy] = !skip;
+#if CONFIG_EXT_TX
+ if (tx_size == TX_8X4) {
+ tempa[idx + 1] = tempa[idx];
+ } else if (tx_size == TX_4X8) {
+ templ[idy + 1] = templ[idy];
+ }
+#endif // CONFIG_EXT_TX
+#else
+ (void)scan_order;
+
+ av1_xform_quant(cm, x, 0, block,
+#if CONFIG_CB4X4
+ 2 * (row + idy), 2 * (col + idx),
+#else
+ row + idy, col + idx,
+#endif // CONFIG_CB4X4
+ BLOCK_8X8, tx_size, coeff_ctx, AV1_XFORM_QUANT_FP);
+
+ ratey += x->rate;
+ skip = x->pvq_skip[0];
+ tempa[idx] = !skip;
+ templ[idy] = !skip;
+ can_skip &= skip;
+#endif // !CONFIG_PVQ
+
+ if (!is_lossless) { // To use the pixel domain distortion, we need to
+ // calculate inverse txfm *before* calculating RD
+ // cost. Compared to calculating the distortion in
+ // the frequency domain, the overhead of encoding
+ // effort is low.
+#if CONFIG_PVQ
+ if (!skip)
+#endif // CONFIG_PVQ
+ av1_inverse_transform_block(xd, BLOCK_OFFSET(pd->dqcoeff, block),
+ tx_type, tx_size, dst, dst_stride,
+ p->eobs[block]);
+ unsigned int tmp;
+ cpi->fn_ptr[sub_bsize].vf(src, src_stride, dst, dst_stride, &tmp);
+ const int64_t dist = (int64_t)tmp << 4;
+ distortion += dist;
+ }
+
+ if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)
+ goto next;
+
+ if (is_lossless) { // Calculate inverse txfm *after* RD cost.
+#if CONFIG_PVQ
+ if (!skip)
+#endif // CONFIG_PVQ
+ av1_inverse_transform_block(xd, BLOCK_OFFSET(pd->dqcoeff, block),
+ DCT_DCT, tx_size, dst, dst_stride,
+ p->eobs[block]);
+ }
+ }
+ }
+
+ rate += ratey;
+ this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
+
+ if (this_rd < best_rd) {
+ *bestrate = rate;
+ *bestratey = ratey;
+ *bestdistortion = distortion;
+ best_rd = this_rd;
+ best_can_skip = can_skip;
+ *best_mode = mode;
+ memcpy(a, tempa, pred_width_in_transform_blocks * sizeof(tempa[0]));
+ memcpy(l, templ, pred_height_in_transform_blocks * sizeof(templ[0]));
+#if CONFIG_PVQ
+ od_encode_checkpoint(&x->daala_enc, &post_buf);
+#endif // CONFIG_PVQ
+ for (idy = 0; idy < pred_height_in_transform_blocks * 4; ++idy)
+ memcpy(best_dst + idy * 8, dst_init + idy * dst_stride,
+ pred_width_in_transform_blocks * 4);
+ }
+ next : {}
+#if CONFIG_PVQ
+ od_encode_rollback(&x->daala_enc, &pre_buf);
+#endif // CONFIG_PVQ
+ } // mode decision loop
+
+ if (best_rd >= rd_thresh) return best_rd;
+
+#if CONFIG_PVQ
+ od_encode_rollback(&x->daala_enc, &post_buf);
+#endif // CONFIG_PVQ
+
+ if (y_skip) *y_skip &= best_can_skip;
+
+ for (idy = 0; idy < pred_height_in_transform_blocks * 4; ++idy)
+ memcpy(dst_init + idy * dst_stride, best_dst + idy * 8,
+ pred_width_in_transform_blocks * 4);
+
+ return best_rd;
+}
+
+// Picks an intra prediction mode for every prediction sub-block of an 8x8 luma
+// coding block by calling rd_pick_intra_sub_8x8_y_subblock_mode() on each one.
+// Outputs:
+//   *rate       - accumulated rate of all sub-blocks (plus the transform-type
+//                 cost when the segment is not lossless).
+//   *rate_y     - accumulated token-only rate (plus the same tx-type cost).
+//   *distortion - accumulated distortion (recomputed over the whole 8x8 block
+//                 when CONFIG_DAALA_DIST is enabled).
+//   *y_skip     - if non-NULL, set to 1 here and handed to the per-sub-block
+//                 search, which may clear it.
+// Returns the RD cost of the accumulated rate/distortion, or INT64_MAX as soon
+// as the running cost reaches best_rd.
+static int64_t rd_pick_intra_sub_8x8_y_mode(const AV1_COMP *const cpi,
+ MACROBLOCK *mb, int *rate,
+ int *rate_y, int64_t *distortion,
+ int *y_skip, int64_t best_rd) {
+ const MACROBLOCKD *const xd = &mb->e_mbd;
+ MODE_INFO *const mic = xd->mi[0];
+ const MODE_INFO *above_mi = xd->above_mi;
+ const MODE_INFO *left_mi = xd->left_mi;
+ MB_MODE_INFO *const mbmi = &mic->mbmi;
+ const BLOCK_SIZE bsize = mbmi->sb_type;
+ const int pred_width_in_4x4_blocks = num_4x4_blocks_wide_lookup[bsize];
+ const int pred_height_in_4x4_blocks = num_4x4_blocks_high_lookup[bsize];
+ int idx, idy;
+ int cost = 0;
+ int64_t total_distortion = 0;
+ int tot_rate_y = 0;
+ int64_t total_rd = 0;
+ const int *bmode_costs = cpi->mbmode_cost[0];
+ const int is_lossless = xd->lossless[mbmi->segment_id];
+#if CONFIG_EXT_TX && CONFIG_RECT_TX
+ const TX_SIZE tx_size = is_lossless ? TX_4X4 : max_txsize_rect_lookup[bsize];
+#else
+ const TX_SIZE tx_size = TX_4X4;
+#endif // CONFIG_EXT_TX && CONFIG_RECT_TX
+
+#if CONFIG_EXT_INTRA
+#if CONFIG_INTRA_INTERP
+ mbmi->intra_filter = INTRA_FILTER_LINEAR;
+#endif // CONFIG_INTRA_INTERP
+#endif // CONFIG_EXT_INTRA
+#if CONFIG_FILTER_INTRA
+ mbmi->filter_intra_mode_info.use_filter_intra_mode[0] = 0;
+#endif // CONFIG_FILTER_INTRA
+
+ // TODO(any): Add search of the tx_type to improve rd performance at the
+ // expense of speed.
+ mbmi->tx_type = DCT_DCT;
+ mbmi->tx_size = tx_size;
+
+ if (y_skip) *y_skip = 1;
+
+ // Pick modes for each prediction sub-block (of size 4x4, 4x8, or 8x4) in this
+ // 8x8 coding block.
+ for (idy = 0; idy < 2; idy += pred_height_in_4x4_blocks) {
+ for (idx = 0; idx < 2; idx += pred_width_in_4x4_blocks) {
+ PREDICTION_MODE best_mode = DC_PRED;
+ int r = INT_MAX, ry = INT_MAX;
+ int64_t d = INT64_MAX, this_rd = INT64_MAX;
+ int j;
+ // Raster index of this sub-block's top-left 4x4 unit within the 2x2 grid.
+ const int pred_block_idx = idy * 2 + idx;
+ if (cpi->common.frame_type == KEY_FRAME) {
+ // On key frames the mode cost table is selected by the above and left
+ // block modes instead of the default cpi->mbmode_cost[0].
+ const PREDICTION_MODE A =
+ av1_above_block_mode(mic, above_mi, pred_block_idx);
+ const PREDICTION_MODE L =
+ av1_left_block_mode(mic, left_mi, pred_block_idx);
+
+ bmode_costs = cpi->y_mode_costs[A][L];
+ }
+ // The remaining RD budget (best_rd - total_rd) is passed as the threshold.
+ this_rd = rd_pick_intra_sub_8x8_y_subblock_mode(
+ cpi, mb, idy, idx, &best_mode, bmode_costs,
+ xd->plane[0].above_context + idx, xd->plane[0].left_context + idy, &r,
+ &ry, &d, bsize, tx_size, y_skip, best_rd - total_rd);
+#if !CONFIG_DAALA_DIST
+ if (this_rd >= best_rd - total_rd) return INT64_MAX;
+#endif // !CONFIG_DAALA_DIST
+ total_rd += this_rd;
+ cost += r;
+ total_distortion += d;
+ tot_rate_y += ry;
+
+ // Record the chosen mode, and replicate it to the other 4x4 units below
+ // (stride 2) and to the right (stride 1) of the sub-block's first unit.
+ mic->bmi[pred_block_idx].as_mode = best_mode;
+ for (j = 1; j < pred_height_in_4x4_blocks; ++j)
+ mic->bmi[pred_block_idx + j * 2].as_mode = best_mode;
+ for (j = 1; j < pred_width_in_4x4_blocks; ++j)
+ mic->bmi[pred_block_idx + j].as_mode = best_mode;
+
+ if (total_rd >= best_rd) return INT64_MAX;
+ }
+ }
+ // The block-level mode is taken from the last 4x4 unit (raster index 3).
+ mbmi->mode = mic->bmi[3].as_mode;
+
+#if CONFIG_DAALA_DIST
+ {
+ const struct macroblock_plane *p = &mb->plane[0];
+ const struct macroblockd_plane *pd = &xd->plane[0];
+ const int src_stride = p->src.stride;
+ const int dst_stride = pd->dst.stride;
+ uint8_t *src = p->src.buf;
+ uint8_t *dst = pd->dst.buf;
+ int use_activity_masking = 0;
+ int qm = OD_HVS_QM;
+
+#if CONFIG_PVQ
+ use_activity_masking = mb->daala_enc.use_activity_masking;
+#endif // CONFIG_PVQ
+ // Daala-defined distortion computed for the block of 8x8 pixels
+ // (this replaces the per-sub-block sum accumulated above).
+ total_distortion = av1_daala_dist(src, src_stride, dst, dst_stride, 8, 8,
+ qm, use_activity_masking, mb->qindex)
+ << 4;
+ }
+#endif // CONFIG_DAALA_DIST
+ // Add in the cost of the transform type
+ if (!is_lossless) {
+ int rate_tx_type = 0;
+#if CONFIG_EXT_TX
+ // Only charged when more than one transform type is available.
+ if (get_ext_tx_types(tx_size, bsize, 0, cpi->common.reduced_tx_set_used) >
+ 1) {
+ const int eset =
+ get_ext_tx_set(tx_size, bsize, 0, cpi->common.reduced_tx_set_used);
+ rate_tx_type = cpi->intra_tx_type_costs[eset][txsize_sqr_map[tx_size]]
+ [mbmi->mode][mbmi->tx_type];
+ }
+#else
+ rate_tx_type =
+ cpi->intra_tx_type_costs[txsize_sqr_map[tx_size]]
+ [intra_mode_to_tx_type_context[mbmi->mode]]
+ [mbmi->tx_type];
+#endif // CONFIG_EXT_TX
+ assert(mbmi->tx_size == tx_size);
+ // The tx-type cost is added to both the full and the token-only rate.
+ cost += rate_tx_type;
+ tot_rate_y += rate_tx_type;
+ }
+
+ *rate = cost;
+ *rate_y = tot_rate_y;
+ *distortion = total_distortion;
+
+ return RDCOST(mb->rdmult, mb->rddiv, cost, total_distortion);
+}
+
+#if CONFIG_FILTER_INTRA
+// Searches the luma filter-intra modes that are not excluded by skip_mask and
+// keeps the best one if it beats *best_rd.
+//
+// For each candidate the function first runs the model-based estimate
+// (intra_model_yrd) and skips the candidate when that estimate exceeds
+// 1.5x *best_model_rd; otherwise it runs super_block_yrd and compares the
+// resulting RD cost against *best_rd.
+//
+// On success *best_rd, *rate, *rate_tokenonly, *distortion and *skippable are
+// updated and mbmi is left holding the winning tx_size, tx_type and
+// filter-intra mode info. *best_model_rd may be lowered in either case.
+//
+// Return 1 if a filter intra mode is selected; return 0 otherwise. When 0 is
+// returned mbmi is left as modified by the search (mode == DC_PRED,
+// use_filter_intra_mode[0] == 1), so the caller must restore it.
+static int rd_pick_filter_intra_sby(const AV1_COMP *const cpi, MACROBLOCK *x,
+                                    int *rate, int *rate_tokenonly,
+                                    int64_t *distortion, int *skippable,
+                                    BLOCK_SIZE bsize, int mode_cost,
+                                    int64_t *best_rd, int64_t *best_model_rd,
+                                    uint16_t skip_mask) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MODE_INFO *const mic = xd->mi[0];
+  MB_MODE_INFO *mbmi = &mic->mbmi;
+  int filter_intra_selected_flag = 0;
+  FILTER_INTRA_MODE mode;
+  TX_SIZE best_tx_size = TX_4X4;
+  FILTER_INTRA_MODE_INFO filter_intra_mode_info;
+  // Given a defined value up front so it is never indeterminate; it is only
+  // consumed when filter_intra_selected_flag is set, i.e. after being
+  // overwritten by a winning candidate.
+  TX_TYPE best_tx_type = DCT_DCT;
+
+  av1_zero(filter_intra_mode_info);
+  mbmi->filter_intra_mode_info.use_filter_intra_mode[0] = 1;
+  mbmi->mode = DC_PRED;
+#if CONFIG_PALETTE
+  mbmi->palette_mode_info.palette_size[0] = 0;
+#endif  // CONFIG_PALETTE
+
+  for (mode = 0; mode < FILTER_INTRA_MODES; ++mode) {
+    int this_rate;
+    int64_t this_rd, this_model_rd;
+    RD_STATS tokenonly_rd_stats;
+    // A set bit in skip_mask excludes the corresponding filter-intra mode.
+    if (skip_mask & (1 << mode)) continue;
+    mbmi->filter_intra_mode_info.filter_intra_mode[0] = mode;
+    // Model-based pruning: drop candidates whose estimate is more than 1.5x
+    // the best estimate seen so far.
+    this_model_rd = intra_model_yrd(cpi, x, bsize, mode_cost);
+    if (*best_model_rd != INT64_MAX &&
+        this_model_rd > *best_model_rd + (*best_model_rd >> 1))
+      continue;
+    if (this_model_rd < *best_model_rd) *best_model_rd = this_model_rd;
+    super_block_yrd(cpi, x, &tokenonly_rd_stats, bsize, *best_rd);
+    // rate == INT_MAX signals that no valid RD stats were produced.
+    if (tokenonly_rd_stats.rate == INT_MAX) continue;
+    // Full rate = tokens + "filter intra used" flag + mode index + mode cost.
+    this_rate = tokenonly_rd_stats.rate +
+                av1_cost_bit(cpi->common.fc->filter_intra_probs[0], 1) +
+                write_uniform_cost(FILTER_INTRA_MODES, mode) + mode_cost;
+    this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, tokenonly_rd_stats.dist);
+
+    if (this_rd < *best_rd) {
+      *best_rd = this_rd;
+      best_tx_size = mic->mbmi.tx_size;
+      filter_intra_mode_info = mbmi->filter_intra_mode_info;
+      best_tx_type = mic->mbmi.tx_type;
+      *rate = this_rate;
+      *rate_tokenonly = tokenonly_rd_stats.rate;
+      *distortion = tokenonly_rd_stats.dist;
+      *skippable = tokenonly_rd_stats.skip;
+      filter_intra_selected_flag = 1;
+    }
+  }
+
+  if (filter_intra_selected_flag) {
+    // Restore the winning configuration; later candidates may have
+    // overwritten mbmi.
+    mbmi->mode = DC_PRED;
+    mbmi->tx_size = best_tx_size;
+    mbmi->filter_intra_mode_info.use_filter_intra_mode[0] =
+        filter_intra_mode_info.use_filter_intra_mode[0];
+    mbmi->filter_intra_mode_info.filter_intra_mode[0] =
+        filter_intra_mode_info.filter_intra_mode[0];
+    mbmi->tx_type = best_tx_type;
+    return 1;
+  } else {
+    return 0;
+  }
+}
+#endif // CONFIG_FILTER_INTRA
+
+#if CONFIG_EXT_INTRA
+// Evaluates a single luma angle delta for the current directional intra mode.
+// The candidate is first screened with the model-based estimate
+// (intra_model_yrd): if that estimate exceeds 1.5x *best_model_rd the function
+// returns INT64_MAX without running the full search. It also returns
+// INT64_MAX when super_block_yrd reports an invalid rate (INT_MAX).
+// Otherwise the RD cost of the candidate is returned, and if it is lower than
+// *best_rd the best-so-far outputs (*best_rd, *best_angle_delta,
+// *best_tx_size, *best_tx_type, *best_filter when CONFIG_INTRA_INTERP, *rate
+// and the rate/dist/skip fields of rd_stats) are updated.
+static int64_t calc_rd_given_intra_angle(
+    const AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int mode_cost,
+    int64_t best_rd_in, int8_t angle_delta, int max_angle_delta, int *rate,
+    RD_STATS *rd_stats, int *best_angle_delta, TX_SIZE *best_tx_size,
+    TX_TYPE *best_tx_type,
+#if CONFIG_INTRA_INTERP
+    INTRA_FILTER *best_filter,
+#endif  // CONFIG_INTRA_INTERP
+    int64_t *best_rd, int64_t *best_model_rd) {
+  MB_MODE_INFO *const mode_info = &x->e_mbd.mi[0]->mbmi;
+  RD_STATS token_stats;
+  int64_t model_rd, rd;
+  int delta_cost, total_rate;
+
+  mode_info->angle_delta[0] = angle_delta;
+
+  // Cheap screening pass against the best model estimate seen so far.
+  model_rd = intra_model_yrd(cpi, x, bsize, mode_cost);
+  if (*best_model_rd != INT64_MAX) {
+    if (model_rd > *best_model_rd + (*best_model_rd >> 1)) return INT64_MAX;
+  }
+  if (model_rd < *best_model_rd) *best_model_rd = model_rd;
+
+  // Full transform search, bounded by best_rd_in.
+  super_block_yrd(cpi, x, &token_stats, bsize, best_rd_in);
+  if (token_stats.rate == INT_MAX) return INT64_MAX;
+
+  // The angle delta is costed as a uniform symbol over
+  // [-max_angle_delta, max_angle_delta].
+  delta_cost = write_uniform_cost(2 * max_angle_delta + 1,
+                                  mode_info->angle_delta[0] + max_angle_delta);
+  total_rate = token_stats.rate + mode_cost + delta_cost;
+  rd = RDCOST(x->rdmult, x->rddiv, total_rate, token_stats.dist);
+
+  if (rd >= *best_rd) return rd;
+
+  // New best candidate: publish its parameters and statistics.
+  *best_rd = rd;
+  *best_angle_delta = mode_info->angle_delta[0];
+  *best_tx_size = mode_info->tx_size;
+#if CONFIG_INTRA_INTERP
+  *best_filter = mode_info->intra_filter;
+#endif  // CONFIG_INTRA_INTERP
+  *best_tx_type = mode_info->tx_type;
+  *rate = total_rate;
+  rd_stats->rate = token_stats.rate;
+  rd_stats->dist = token_stats.dist;
+  rd_stats->skip = token_stats.skip;
+  return rd;
+}
+
+// With given luma directional intra prediction mode, pick the best angle delta
+// Return the RD cost corresponding to the best angle delta.
+// The search runs in two passes: even deltas (0, +/-2, ...) first, then odd
+// deltas, where an odd delta is only evaluated if at least one of its even
+// neighbours came within best_rd + best_rd/32. On return mbmi holds the best
+// tx_size, tx_type, angle_delta[0] (and intra_filter with
+// CONFIG_INTRA_INTERP); *rate and *rd_stats are written by
+// calc_rd_given_intra_angle() whenever a candidate improves on best_rd.
+// If the very first candidate is rejected the function returns best_rd
+// unchanged without touching those outputs.
+static int64_t rd_pick_intra_angle_sby(const AV1_COMP *const cpi, MACROBLOCK *x,
+ int *rate, RD_STATS *rd_stats,
+ BLOCK_SIZE bsize, int mode_cost,
+ int64_t best_rd,
+ int64_t *best_model_rd) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MODE_INFO *const mic = xd->mi[0];
+ MB_MODE_INFO *mbmi = &mic->mbmi;
+ int i, angle_delta, best_angle_delta = 0;
+ int first_try = 1;
+#if CONFIG_INTRA_INTERP
+ int p_angle;
+ const int intra_filter_ctx = av1_get_pred_context_intra_interp(xd);
+ INTRA_FILTER filter, best_filter = INTRA_FILTER_LINEAR;
+#endif // CONFIG_INTRA_INTERP
+ // rd_cost[2 * delta + i] holds the RD cost for |delta| with sign index i
+ // (i == 0: positive, i == 1: negative). Two extra slots beyond
+ // MAX_ANGLE_DELTA let the odd pass read the "delta + 1" neighbour safely.
+ int64_t this_rd, best_rd_in, rd_cost[2 * (MAX_ANGLE_DELTA + 2)];
+ TX_SIZE best_tx_size = mic->mbmi.tx_size;
+ TX_TYPE best_tx_type = mbmi->tx_type;
+
+ for (i = 0; i < 2 * (MAX_ANGLE_DELTA + 2); ++i) rd_cost[i] = INT64_MAX;
+
+ // Pass 1: even angle deltas.
+ for (angle_delta = 0; angle_delta <= MAX_ANGLE_DELTA; angle_delta += 2) {
+#if CONFIG_INTRA_INTERP
+ for (filter = INTRA_FILTER_LINEAR; filter < INTRA_FILTERS; ++filter) {
+ if (FILTER_FAST_SEARCH && filter != INTRA_FILTER_LINEAR) continue;
+ mic->mbmi.intra_filter = filter;
+#endif // CONFIG_INTRA_INTERP
+ for (i = 0; i < 2; ++i) {
+ // Search budget: best_rd plus 1/8 on the first try, plus 1/32 afterwards.
+ best_rd_in = (best_rd == INT64_MAX)
+ ? INT64_MAX
+ : (best_rd + (best_rd >> (first_try ? 3 : 5)));
+ // (1 - 2 * i) yields +1 for i == 0 and -1 for i == 1.
+ this_rd = calc_rd_given_intra_angle(
+ cpi, x, bsize,
+#if CONFIG_INTRA_INTERP
+ mode_cost + cpi->intra_filter_cost[intra_filter_ctx][filter],
+#else
+ mode_cost,
+#endif // CONFIG_INTRA_INTERP
+ best_rd_in, (1 - 2 * i) * angle_delta, MAX_ANGLE_DELTA, rate,
+ rd_stats, &best_angle_delta, &best_tx_size, &best_tx_type,
+#if CONFIG_INTRA_INTERP
+ &best_filter,
+#endif // CONFIG_INTRA_INTERP
+ &best_rd, best_model_rd);
+ rd_cost[2 * angle_delta + i] = this_rd;
+ // Give up on this mode entirely if the first candidate was rejected.
+ if (first_try && this_rd == INT64_MAX) return best_rd;
+ first_try = 0;
+ if (angle_delta == 0) {
+ // Delta 0 has no sign: record it in both slots and skip i == 1.
+ rd_cost[1] = this_rd;
+ break;
+ }
+ }
+#if CONFIG_INTRA_INTERP
+ }
+#endif // CONFIG_INTRA_INTERP
+ }
+
+ assert(best_rd != INT64_MAX);
+ // Pass 2: odd angle deltas, gated by the results of their even neighbours.
+ for (angle_delta = 1; angle_delta <= MAX_ANGLE_DELTA; angle_delta += 2) {
+ int64_t rd_thresh;
+#if CONFIG_INTRA_INTERP
+ for (filter = INTRA_FILTER_LINEAR; filter < INTRA_FILTERS; ++filter) {
+ if (FILTER_FAST_SEARCH && filter != INTRA_FILTER_LINEAR) continue;
+ mic->mbmi.intra_filter = filter;
+#endif // CONFIG_INTRA_INTERP
+ for (i = 0; i < 2; ++i) {
+ int skip_search = 0;
+ rd_thresh = best_rd + (best_rd >> 5);
+ // Skip when both same-sign even neighbours exceeded the threshold.
+ if (rd_cost[2 * (angle_delta + 1) + i] > rd_thresh &&
+ rd_cost[2 * (angle_delta - 1) + i] > rd_thresh)
+ skip_search = 1;
+ if (!skip_search) {
+ calc_rd_given_intra_angle(
+ cpi, x, bsize,
+#if CONFIG_INTRA_INTERP
+ mode_cost + cpi->intra_filter_cost[intra_filter_ctx][filter],
+#else
+ mode_cost,
+#endif // CONFIG_INTRA_INTERP
+ best_rd, (1 - 2 * i) * angle_delta, MAX_ANGLE_DELTA, rate,
+ rd_stats, &best_angle_delta, &best_tx_size, &best_tx_type,
+#if CONFIG_INTRA_INTERP
+ &best_filter,
+#endif // CONFIG_INTRA_INTERP
+ &best_rd, best_model_rd);
+ }
+ }
+#if CONFIG_INTRA_INTERP
+ }
+#endif // CONFIG_INTRA_INTERP
+ }
+
+#if CONFIG_INTRA_INTERP
+ // In fast-search mode only the linear filter was tried above; now try the
+ // remaining filters at the best angle, provided a valid result exists and
+ // the resulting prediction angle allows a switchable filter.
+ if (FILTER_FAST_SEARCH && rd_stats->rate < INT_MAX) {
+ p_angle = mode_to_angle_map[mbmi->mode] + best_angle_delta * ANGLE_STEP;
+ if (av1_is_intra_filter_switchable(p_angle)) {
+ for (filter = INTRA_FILTER_LINEAR + 1; filter < INTRA_FILTERS; ++filter) {
+ mic->mbmi.intra_filter = filter;
+ // NOTE(review): the value stored in this_rd is not read afterwards.
+ this_rd = calc_rd_given_intra_angle(
+ cpi, x, bsize,
+ mode_cost + cpi->intra_filter_cost[intra_filter_ctx][filter],
+ best_rd, best_angle_delta, MAX_ANGLE_DELTA, rate, rd_stats,
+ &best_angle_delta, &best_tx_size, &best_tx_type, &best_filter,
+ &best_rd, best_model_rd);
+ }
+ }
+ }
+#endif // CONFIG_INTRA_INTERP
+
+ // Restore the winning configuration into mbmi (the search overwrote it).
+ mbmi->tx_size = best_tx_size;
+ mbmi->angle_delta[0] = best_angle_delta;
+#if CONFIG_INTRA_INTERP
+ mic->mbmi.intra_filter = best_filter;
+#endif // CONFIG_INTRA_INTERP
+ mbmi->tx_type = best_tx_type;
+ return best_rd;
+}
+
+// Indices are sign, integer, and fractional part of the gradient value
+// Lookup from a pixel gradient to a direction-histogram bin. It is read by
+// angle_estimation() / highbd_angle_estimation() as
+//   gradient_to_angle_bin[sn][AOMMIN(quot, 6)][AOMMIN(remd, 15)]
+// where sn = (dx > 0) ^ (dy > 0), quot = |dx| / |dy| and
+// remd = (|dx| % |dy|) * 16 / |dy|.
+static const uint8_t gradient_to_angle_bin[2][7][16] = {
+ {
+ { 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1 },
+ { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
+ { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
+ { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
+ { 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 },
+ { 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 },
+ },
+ {
+ { 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4 },
+ { 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3 },
+ { 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 },
+ { 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 },
+ { 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 },
+ { 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2 },
+ { 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 },
+ },
+};
+
+// Direction-histogram bin associated with each intra prediction mode (indexed
+// by mode). The angle-estimation functions skip DC_PRED and TM_PRED, so the
+// entries at those two indices are never read there.
+static const uint8_t mode_to_angle_bin[INTRA_MODES] = {
+ 0, 2, 6, 0, 4, 3, 5, 7, 1, 0,
+};
+
+// Maps the gradient (dx, dy) at one pixel to a bin of the direction histogram.
+// A zero vertical gradient always lands in bin 2. Otherwise the bin is looked
+// up in gradient_to_angle_bin using whether dx and dy differ in "is positive"
+// (sn), the integer part of |dx| / |dy| (clamped to 6) and its fractional part
+// in 1/16 steps (clamped to 15).
+static int get_gradient_angle_bin(int dx, int dy) {
+  int sn, quot, remd;
+
+  if (dy == 0) return 2;
+  // The sign relation must be taken before the magnitudes replace dx/dy.
+  sn = (dx > 0) ^ (dy > 0);
+  dx = abs(dx);
+  dy = abs(dy);
+  quot = dx / dy;
+  remd = (dx % dy) * 16 / dy;
+  return gradient_to_angle_bin[sn][AOMMIN(quot, 6)][AOMMIN(remd, 15)];
+}
+
+// Sets directional_mode_skip_mask[mode] = 1 for every intra mode other than
+// DC_PRED and TM_PRED whose histogram support is weak. The score of a mode is
+// twice its own bin plus each existing neighbouring bin (weight 2, 3 or 4);
+// the mode is flagged when score * ANGLE_SKIP_THRESH < hist_sum * weight.
+// Entries that are not flagged are left untouched.
+static void set_directional_mode_skip_mask(
+    const uint64_t *hist, uint8_t *directional_mode_skip_mask) {
+  uint64_t hist_sum = 0;
+  int i;
+
+  for (i = 0; i < DIRECTIONAL_MODES; ++i) hist_sum += hist[i];
+  for (i = 0; i < INTRA_MODES; ++i) {
+    if (i != DC_PRED && i != TM_PRED) {
+      const uint8_t angle_bin = mode_to_angle_bin[i];
+      uint64_t score = 2 * hist[angle_bin];
+      int weight = 2;
+      if (angle_bin > 0) {
+        score += hist[angle_bin - 1];
+        ++weight;
+      }
+      if (angle_bin < DIRECTIONAL_MODES - 1) {
+        score += hist[angle_bin + 1];
+        ++weight;
+      }
+      if (score * ANGLE_SKIP_THRESH < hist_sum * weight)
+        directional_mode_skip_mask[i] = 1;
+    }
+  }
+}
+
+// Builds a gradient-direction histogram over an 8-bit source block, weighting
+// each pixel by its squared gradient magnitude, and flags the directional
+// intra modes that the histogram does not support in
+// directional_mode_skip_mask (indexed by mode; only ever set to 1 here).
+// The first row and first column are skipped because each gradient reads the
+// left and the above neighbour.
+static void angle_estimation(const uint8_t *src, int src_stride, int rows,
+                             int cols, uint8_t *directional_mode_skip_mask) {
+  uint64_t hist[DIRECTIONAL_MODES];
+  int r, c;
+
+  memset(hist, 0, DIRECTIONAL_MODES * sizeof(hist[0]));
+  src += src_stride;
+  for (r = 1; r < rows; ++r) {
+    for (c = 1; c < cols; ++c) {
+      const int dx = src[c] - src[c - 1];
+      const int dy = src[c] - src[c - src_stride];
+      hist[get_gradient_angle_bin(dx, dy)] += dx * dx + dy * dy;
+    }
+    src += src_stride;
+  }
+
+  set_directional_mode_skip_mask(hist, directional_mode_skip_mask);
+}
+
+#if CONFIG_HIGHBITDEPTH
+// High-bit-depth counterpart of angle_estimation(): src8 is converted with
+// CONVERT_TO_SHORTPTR and read as 16-bit samples; everything else is
+// identical.
+static void highbd_angle_estimation(const uint8_t *src8, int src_stride,
+                                    int rows, int cols,
+                                    uint8_t *directional_mode_skip_mask) {
+  uint64_t hist[DIRECTIONAL_MODES];
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  int r, c;
+
+  memset(hist, 0, DIRECTIONAL_MODES * sizeof(hist[0]));
+  src += src_stride;
+  for (r = 1; r < rows; ++r) {
+    for (c = 1; c < cols; ++c) {
+      const int dx = src[c] - src[c - 1];
+      const int dy = src[c] - src[c - src_stride];
+      hist[get_gradient_angle_bin(dx, dy)] += dx * dx + dy * dy;
+    }
+    src += src_stride;
+  }
+
+  set_directional_mode_skip_mask(hist, directional_mode_skip_mask);
+}
+#endif  // CONFIG_HIGHBITDEPTH
+#endif // CONFIG_EXT_INTRA
+
+// This function is used only for intra_only frames
+//
+// Searches the luma intra prediction modes DC_PRED..TM_PRED for the block and
+// leaves the best configuration in xd->mi[0]->mbmi.
+//
+// When cpi->sf.tx_type_search.fast_intra_tx_type_search is set, the main loop
+// runs with x->use_default_intra_tx_type = 1 and one extra iteration
+// (FINAL_MODE_SEARCH) re-evaluates the best mode found so far with
+// x->use_default_intra_tx_type = 0.
+//
+// Depending on the build configuration, palette (CONFIG_PALETTE) and filter
+// intra (CONFIG_FILTER_INTRA) candidates are tried after the regular modes.
+//
+// Outputs *rate, *rate_tokenonly, *distortion and *skippable are written only
+// when some candidate beats the incoming best_rd. Returns the best RD cost
+// found (the incoming best_rd if nothing improved on it).
+static int64_t rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
+                                      int *rate, int *rate_tokenonly,
+                                      int64_t *distortion, int *skippable,
+                                      BLOCK_SIZE bsize, int64_t best_rd) {
+  uint8_t mode_idx;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MODE_INFO *const mic = xd->mi[0];
+  MB_MODE_INFO *const mbmi = &mic->mbmi;
+  MB_MODE_INFO best_mbmi = *mbmi;
+  int64_t best_model_rd = INT64_MAX;
+#if CONFIG_EXT_INTRA
+  const int rows = block_size_high[bsize];
+  const int cols = block_size_wide[bsize];
+#if CONFIG_INTRA_INTERP
+  const int intra_filter_ctx = av1_get_pred_context_intra_interp(xd);
+#endif  // CONFIG_INTRA_INTERP
+  int is_directional_mode;
+  uint8_t directional_mode_skip_mask[INTRA_MODES];
+  const int src_stride = x->plane[0].src.stride;
+  const uint8_t *src = x->plane[0].src.buf;
+#endif  // CONFIG_EXT_INTRA
+#if CONFIG_FILTER_INTRA
+  int beat_best_rd = 0;
+  // One bit per filter-intra mode; a set bit means "skip". All modes start
+  // out skipped and are enabled below for regular modes that come close to
+  // the best RD cost.
+  uint16_t filter_intra_mode_skip_mask = (1 << FILTER_INTRA_MODES) - 1;
+#endif  // CONFIG_FILTER_INTRA
+  const int *bmode_costs;
+#if CONFIG_PALETTE
+  PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+  uint8_t *best_palette_color_map =
+      cpi->common.allow_screen_content_tools
+          ? x->palette_buffer->best_palette_color_map
+          : NULL;
+  int palette_y_mode_ctx = 0;
+  const int try_palette =
+      cpi->common.allow_screen_content_tools && bsize >= BLOCK_8X8;
+#endif  // CONFIG_PALETTE
+  const MODE_INFO *above_mi = xd->above_mi;
+  const MODE_INFO *left_mi = xd->left_mi;
+  const PREDICTION_MODE A = av1_above_block_mode(mic, above_mi, 0);
+  const PREDICTION_MODE L = av1_left_block_mode(mic, left_mi, 0);
+  const PREDICTION_MODE FINAL_MODE_SEARCH = TM_PRED + 1;
+#if CONFIG_PVQ
+  od_rollback_buffer pre_buf, post_buf;
+
+  od_encode_checkpoint(&x->daala_enc, &pre_buf);
+  od_encode_checkpoint(&x->daala_enc, &post_buf);
+#endif  // CONFIG_PVQ
+  // Mode costs are conditioned on the above and left block modes.
+  bmode_costs = cpi->y_mode_costs[A][L];
+
+#if CONFIG_EXT_INTRA
+  mbmi->angle_delta[0] = 0;
+  memset(directional_mode_skip_mask, 0,
+         sizeof(directional_mode_skip_mask[0]) * INTRA_MODES);
+  // Flag directional modes that the source gradients do not support.
+#if CONFIG_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+    highbd_angle_estimation(src, src_stride, rows, cols,
+                            directional_mode_skip_mask);
+  else
+#endif  // CONFIG_HIGHBITDEPTH
+    angle_estimation(src, src_stride, rows, cols, directional_mode_skip_mask);
+#endif  // CONFIG_EXT_INTRA
+#if CONFIG_FILTER_INTRA
+  mbmi->filter_intra_mode_info.use_filter_intra_mode[0] = 0;
+#endif  // CONFIG_FILTER_INTRA
+#if CONFIG_PALETTE
+  pmi->palette_size[0] = 0;
+  // Context = number of available neighbours (above, left) using a palette.
+  if (above_mi)
+    palette_y_mode_ctx +=
+        (above_mi->mbmi.palette_mode_info.palette_size[0] > 0);
+  if (left_mi)
+    palette_y_mode_ctx += (left_mi->mbmi.palette_mode_info.palette_size[0] > 0);
+#endif  // CONFIG_PALETTE
+
+  if (cpi->sf.tx_type_search.fast_intra_tx_type_search)
+    x->use_default_intra_tx_type = 1;
+  else
+    x->use_default_intra_tx_type = 0;
+
+  /* Y Search for intra prediction mode */
+  for (mode_idx = DC_PRED; mode_idx <= FINAL_MODE_SEARCH; ++mode_idx) {
+    RD_STATS this_rd_stats;
+    int this_rate, this_rate_tokenonly, s;
+    int64_t this_distortion, this_rd, this_model_rd;
+    if (mode_idx == FINAL_MODE_SEARCH) {
+      // Extra pass: only needed when the main loop used the default tx type.
+      if (x->use_default_intra_tx_type == 0) break;
+      mbmi->mode = best_mbmi.mode;
+      x->use_default_intra_tx_type = 0;
+    } else {
+      mbmi->mode = mode_idx;
+    }
+#if CONFIG_PVQ
+    od_encode_rollback(&x->daala_enc, &pre_buf);
+#endif  // CONFIG_PVQ
+#if CONFIG_EXT_INTRA
+    mbmi->angle_delta[0] = 0;
+#endif  // CONFIG_EXT_INTRA
+    // Model-based pruning: skip modes whose estimate exceeds 1.5x the best
+    // estimate so far.
+    this_model_rd = intra_model_yrd(cpi, x, bsize, bmode_costs[mbmi->mode]);
+    if (best_model_rd != INT64_MAX &&
+        this_model_rd > best_model_rd + (best_model_rd >> 1))
+      continue;
+    if (this_model_rd < best_model_rd) best_model_rd = this_model_rd;
+#if CONFIG_EXT_INTRA
+    is_directional_mode = av1_is_directional_mode(mbmi->mode, bsize);
+    if (is_directional_mode && directional_mode_skip_mask[mbmi->mode]) continue;
+    if (is_directional_mode) {
+      // rate stays INT_MAX unless the angle search finds a valid candidate.
+      this_rd_stats.rate = INT_MAX;
+      rd_pick_intra_angle_sby(cpi, x, &this_rate, &this_rd_stats, bsize,
+                              bmode_costs[mbmi->mode], best_rd, &best_model_rd);
+    } else {
+      super_block_yrd(cpi, x, &this_rd_stats, bsize, best_rd);
+    }
+#else
+    super_block_yrd(cpi, x, &this_rd_stats, bsize, best_rd);
+#endif  // CONFIG_EXT_INTRA
+    this_rate_tokenonly = this_rd_stats.rate;
+    this_distortion = this_rd_stats.dist;
+    s = this_rd_stats.skip;
+
+    if (this_rate_tokenonly == INT_MAX) continue;
+
+    this_rate = this_rate_tokenonly + bmode_costs[mbmi->mode];
+
+    if (!xd->lossless[mbmi->segment_id] && mbmi->sb_type >= BLOCK_8X8) {
+      // super_block_yrd above includes the cost of the tx_size in the
+      // tokenonly rate, but for intra blocks, tx_size is always coded
+      // (prediction granularity), so we account for it in the full rate,
+      // not the tokenonly rate.
+      this_rate_tokenonly -= tx_size_cost(cpi, x, bsize, mbmi->tx_size);
+    }
+#if CONFIG_PALETTE
+    // Cost of signalling "no palette" for DC_PRED.
+    if (try_palette && mbmi->mode == DC_PRED) {
+      this_rate +=
+          av1_cost_bit(av1_default_palette_y_mode_prob[bsize - BLOCK_8X8]
+                                                      [palette_y_mode_ctx],
+                       0);
+    }
+#endif  // CONFIG_PALETTE
+#if CONFIG_FILTER_INTRA
+    // Cost of signalling "no filter intra" for DC_PRED.
+    if (mbmi->mode == DC_PRED)
+      this_rate += av1_cost_bit(cpi->common.fc->filter_intra_probs[0], 0);
+#endif  // CONFIG_FILTER_INTRA
+#if CONFIG_EXT_INTRA
+    if (is_directional_mode) {
+#if CONFIG_INTRA_INTERP
+      const int p_angle =
+          mode_to_angle_map[mbmi->mode] + mbmi->angle_delta[0] * ANGLE_STEP;
+      if (av1_is_intra_filter_switchable(p_angle))
+        this_rate +=
+            cpi->intra_filter_cost[intra_filter_ctx][mbmi->intra_filter];
+#endif  // CONFIG_INTRA_INTERP
+      this_rate += write_uniform_cost(2 * MAX_ANGLE_DELTA + 1,
+                                      MAX_ANGLE_DELTA + mbmi->angle_delta[0]);
+    }
+#endif  // CONFIG_EXT_INTRA
+    this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
+#if CONFIG_FILTER_INTRA
+    if (best_rd == INT64_MAX || this_rd - best_rd < (best_rd >> 4)) {
+      // Enable the matching filter-intra mode by clearing its skip bit. The
+      // bit is cleared rather than toggled because the FINAL_MODE_SEARCH pass
+      // visits the best mode a second time; toggling would set the bit again
+      // and wrongly disable that mode.
+      filter_intra_mode_skip_mask &= ~(1 << mbmi->mode);
+    }
+#endif  // CONFIG_FILTER_INTRA
+
+    if (this_rd < best_rd) {
+      best_mbmi = *mbmi;
+      best_rd = this_rd;
+#if CONFIG_FILTER_INTRA
+      beat_best_rd = 1;
+#endif  // CONFIG_FILTER_INTRA
+      *rate = this_rate;
+      *rate_tokenonly = this_rate_tokenonly;
+      *distortion = this_distortion;
+      *skippable = s;
+#if CONFIG_PVQ
+      od_encode_checkpoint(&x->daala_enc, &post_buf);
+#endif  // CONFIG_PVQ
+    }
+  }
+
+#if CONFIG_PVQ
+  od_encode_rollback(&x->daala_enc, &post_buf);
+#endif  // CONFIG_PVQ
+
+#if CONFIG_CFL
+  // Perform one extra txfm_rd_in_plane() call, this time with the best value so
+  // we can store reconstructed luma values
+  // NOTE(review): at this point *mbmi still holds the last candidate tried in
+  // the loop above; best_mbmi is only copied back at the end of this function.
+  // Confirm that mic->mbmi.tx_size (and mode) are the intended "best" values.
+  RD_STATS this_rd_stats;
+  x->cfl_store_y = 1;
+  txfm_rd_in_plane(x, cpi, &this_rd_stats, INT64_MAX, 0, bsize,
+                   mic->mbmi.tx_size, cpi->sf.use_fast_coef_costing);
+  x->cfl_store_y = 0;
+#endif
+
+#if CONFIG_PALETTE
+  if (try_palette) {
+    rd_pick_palette_intra_sby(cpi, x, bsize, palette_y_mode_ctx,
+                              bmode_costs[DC_PRED], &best_mbmi,
+                              best_palette_color_map, &best_rd, &best_model_rd,
+                              rate, rate_tokenonly, distortion, skippable);
+  }
+#endif  // CONFIG_PALETTE
+
+#if CONFIG_FILTER_INTRA
+  // Filter intra is only tried when a regular mode improved on the incoming
+  // best_rd.
+  if (beat_best_rd) {
+    if (rd_pick_filter_intra_sby(cpi, x, rate, rate_tokenonly, distortion,
+                                 skippable, bsize, bmode_costs[DC_PRED],
+                                 &best_rd, &best_model_rd,
+                                 filter_intra_mode_skip_mask)) {
+      best_mbmi = *mbmi;
+    }
+  }
+#endif  // CONFIG_FILTER_INTRA
+
+  // Restore the winning mode info; the searches above overwrote *mbmi.
+  *mbmi = best_mbmi;
+  return best_rd;
+}
+
+// Accumulates the RD statistics of the chroma planes (1 .. MAX_MB_PLANE - 1)
+// for the current block into rd_stats, using the chroma transform size
+// derived from the block's mode info.
+// The search stops early when a plane yields an invalid rate (INT_MAX) or
+// when both the coded cost and the all-skip cost (rate 0, distortion = sse)
+// exceed ref_best_rd; rd_stats is then reset to the invalid state.
+// Return value 0: early termination triggered, no valid rd cost available;
+//              1: rd cost values are valid.
+static int super_block_uvrd(const AV1_COMP *const cpi, MACROBLOCK *x,
+                            RD_STATS *rd_stats, BLOCK_SIZE bsize,
+                            int64_t ref_best_rd) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  const TX_SIZE uv_tx_size = get_uv_tx_size(mbmi, &xd->plane[1]);
+  int plane;
+  int is_cost_valid;
+
+  av1_init_rd_stats(rd_stats);
+
+  // A negative budget can never be met.
+  is_cost_valid = (ref_best_rd < 0) ? 0 : 1;
+
+#if CONFIG_CB4X4 && !CONFIG_CHROMA_2X2
+  if (x->skip_chroma_rd) return is_cost_valid;
+
+  bsize = scale_chroma_bsize(bsize, xd->plane[1].subsampling_x,
+                             xd->plane[1].subsampling_y);
+#endif  // CONFIG_CB4X4 && !CONFIG_CHROMA_2X2
+
+#if !CONFIG_PVQ
+  // Inter blocks need the chroma residual computed before the transform
+  // search.
+  if (is_inter_block(mbmi) && is_cost_valid) {
+    for (plane = 1; plane < MAX_MB_PLANE; ++plane)
+      av1_subtract_plane(x, bsize, plane);
+  }
+#endif  // !CONFIG_PVQ
+
+  // The loop condition doubles as the early-out: once the cost is marked
+  // invalid no further plane is processed.
+  for (plane = 1; is_cost_valid && plane < MAX_MB_PLANE; ++plane) {
+    RD_STATS plane_stats;
+    txfm_rd_in_plane(x, cpi, &plane_stats, ref_best_rd, plane, bsize,
+                     uv_tx_size, cpi->sf.use_fast_coef_costing);
+    if (plane_stats.rate == INT_MAX) {
+      is_cost_valid = 0;
+      continue;
+    }
+    av1_merge_rd_stats(rd_stats, &plane_stats);
+    if (RDCOST(x->rdmult, x->rddiv, rd_stats->rate, rd_stats->dist) >
+            ref_best_rd &&
+        RDCOST(x->rdmult, x->rddiv, 0, rd_stats->sse) > ref_best_rd)
+      is_cost_valid = 0;
+  }
+
+  // reset cost value
+  if (!is_cost_valid) av1_invalid_rd_stats(rd_stats);
+
+  return is_cost_valid;
+}
+
+#if CONFIG_VAR_TX
+// FIXME crop these calls
+// Thin wrapper: forwards diff/diff_stride to aom_sum_squares_2d_i16() with the
+// width and height looked up for tx_size.
+static uint64_t sum_squares_2d(const int16_t *diff, int diff_stride,
+                               TX_SIZE tx_size) {
+  const int width = tx_size_wide[tx_size];
+  const int height = tx_size_high[tx_size];
+  return aom_sum_squares_2d_i16(diff, diff_stride, width, height);
+}
+
+// Computes the RD statistics of one transform block and accumulates them into
+// rd_stats:
+//   sse  += 16 * sum of squared residuals read from p->src_diff,
+//   dist += 16 * distortion between the source and a reconstruction built in
+//           a local buffer (or 16 * the sse value when eob == 0),
+//   rate += av1_cost_coeffs() for the quantized coefficients,
+//   skip &= (eob == 0).
+// The block is transformed and quantized with AV1_XFORM_QUANT_FP and then run
+// through av1_optimize_b(). blk_row / blk_col locate the block inside the
+// plane; they are shifted by tx_size_wide_log2[0] to obtain sample offsets.
+// a / l are the above / left entropy contexts used for the coefficient
+// context and cost.
+void av1_tx_block_rd_b(const AV1_COMP *cpi, MACROBLOCK *x, TX_SIZE tx_size,
+ int blk_row, int blk_col, int plane, int block,
+ int plane_bsize, const ENTROPY_CONTEXT *a,
+ const ENTROPY_CONTEXT *l, RD_STATS *rd_stats) {
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *xd = &x->e_mbd;
+ const struct macroblock_plane *const p = &x->plane[plane];
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ // Holds the SSE first and is then reused for the distortion; see below.
+ int64_t tmp;
+ tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+ PLANE_TYPE plane_type = get_plane_type(plane);
+ TX_TYPE tx_type = get_tx_type(plane_type, xd, block, tx_size);
+ const SCAN_ORDER *const scan_order =
+ get_scan(cm, tx_size, tx_type, is_inter_block(&xd->mi[0]->mbmi));
+ BLOCK_SIZE txm_bsize = txsize_to_bsize[tx_size];
+ int bh = block_size_high[txm_bsize];
+ int bw = block_size_wide[txm_bsize];
+ int txb_h = tx_size_high_unit[tx_size];
+ int txb_w = tx_size_wide_unit[tx_size];
+
+ int src_stride = p->src.stride;
+ uint8_t *src =
+ &p->src.buf[(blk_row * src_stride + blk_col) << tx_size_wide_log2[0]];
+ uint8_t *dst =
+ &pd->dst
+ .buf[(blk_row * pd->dst.stride + blk_col) << tx_size_wide_log2[0]];
+ // Scratch reconstruction buffer with stride MAX_TX_SIZE. With
+ // CONFIG_HIGHBITDEPTH the same 16-bit storage serves both bit depths.
+#if CONFIG_HIGHBITDEPTH
+ DECLARE_ALIGNED(16, uint16_t, rec_buffer16[MAX_TX_SQUARE]);
+ uint8_t *rec_buffer;
+#else
+ DECLARE_ALIGNED(16, uint8_t, rec_buffer[MAX_TX_SQUARE]);
+#endif // CONFIG_HIGHBITDEPTH
+ // Initially in samples; converted to block units further down.
+ int max_blocks_high = block_size_high[plane_bsize];
+ int max_blocks_wide = block_size_wide[plane_bsize];
+ const int diff_stride = max_blocks_wide;
+ const int16_t *diff =
+ &p->src_diff[(blk_row * diff_stride + blk_col) << tx_size_wide_log2[0]];
+ int txb_coeff_cost;
+
+ assert(tx_size < TX_SIZES_ALL);
+
+ // Shrink the usable area when the block extends past the bottom / right
+ // edge (mb_to_*_edge is negative in that case).
+ if (xd->mb_to_bottom_edge < 0)
+ max_blocks_high += xd->mb_to_bottom_edge >> (3 + pd->subsampling_y);
+ if (xd->mb_to_right_edge < 0)
+ max_blocks_wide += xd->mb_to_right_edge >> (3 + pd->subsampling_x);
+
+ max_blocks_high >>= tx_size_wide_log2[0];
+ max_blocks_wide >>= tx_size_wide_log2[0];
+
+ int coeff_ctx = get_entropy_context(tx_size, a, l);
+
+ av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
+ coeff_ctx, AV1_XFORM_QUANT_FP);
+
+ av1_optimize_b(cm, x, plane, block, tx_size, coeff_ctx);
+
+// TODO(any): Use av1_dist_block to compute distortion
+ // Copy the current contents of dst into the scratch buffer; the inverse
+ // transform below is applied on top of this copy.
+#if CONFIG_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ rec_buffer = CONVERT_TO_BYTEPTR(rec_buffer16);
+ aom_highbd_convolve_copy(dst, pd->dst.stride, rec_buffer, MAX_TX_SIZE, NULL,
+ 0, NULL, 0, bw, bh, xd->bd);
+ } else {
+ rec_buffer = (uint8_t *)rec_buffer16;
+ aom_convolve_copy(dst, pd->dst.stride, rec_buffer, MAX_TX_SIZE, NULL, 0,
+ NULL, 0, bw, bh);
+ }
+#else
+ aom_convolve_copy(dst, pd->dst.stride, rec_buffer, MAX_TX_SIZE, NULL, 0, NULL,
+ 0, bw, bh);
+#endif // CONFIG_HIGHBITDEPTH
+
+ // SSE of the residual. If the transform block crosses the usable area, sum
+ // only the units that lie inside it, one unit at a time.
+ if (blk_row + txb_h > max_blocks_high || blk_col + txb_w > max_blocks_wide) {
+ int idx, idy;
+ int blocks_height = AOMMIN(txb_h, max_blocks_high - blk_row);
+ int blocks_width = AOMMIN(txb_w, max_blocks_wide - blk_col);
+ tmp = 0;
+ for (idy = 0; idy < blocks_height; ++idy) {
+ for (idx = 0; idx < blocks_width; ++idx) {
+ const int16_t *d =
+ diff + ((idy * diff_stride + idx) << tx_size_wide_log2[0]);
+ // NOTE(review): tx_size 0 is presumably the smallest transform size,
+ // matching one unit - confirm against the TX_SIZE enum.
+ tmp += sum_squares_2d(d, diff_stride, 0);
+ }
+ }
+ } else {
+ tmp = sum_squares_2d(diff, diff_stride, tx_size);
+ }
+
+#if CONFIG_HIGHBITDEPTH
+ // Scale the high-bit-depth SSE down by 2 * (bd - 8) bits.
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+ tmp = ROUND_POWER_OF_TWO(tmp, (xd->bd - 8) * 2);
+#endif // CONFIG_HIGHBITDEPTH
+ rd_stats->sse += tmp * 16;
+ const int eob = p->eobs[block];
+
+ av1_inverse_transform_block(xd, dqcoeff, tx_type, tx_size, rec_buffer,
+ MAX_TX_SIZE, eob);
+ // With eob == 0 tmp is left untouched, so the distortion added below equals
+ // the SSE computed above.
+ if (eob > 0) {
+ if (txb_w + blk_col > max_blocks_wide ||
+ txb_h + blk_row > max_blocks_high) {
+ int idx, idy;
+ unsigned int this_dist;
+ int blocks_height = AOMMIN(txb_h, max_blocks_high - blk_row);
+ int blocks_width = AOMMIN(txb_w, max_blocks_wide - blk_col);
+ tmp = 0;
+ for (idy = 0; idy < blocks_height; ++idy) {
+ for (idx = 0; idx < blocks_width; ++idx) {
+ uint8_t *const s =
+ src + ((idy * src_stride + idx) << tx_size_wide_log2[0]);
+ uint8_t *const r =
+ rec_buffer + ((idy * MAX_TX_SIZE + idx) << tx_size_wide_log2[0]);
+ // NOTE(review): fn_ptr[0] is presumably the entry for the smallest
+ // block size, matching one unit - confirm against BLOCK_SIZE.
+ cpi->fn_ptr[0].vf(s, src_stride, r, MAX_TX_SIZE, &this_dist);
+ tmp += this_dist;
+ }
+ }
+ } else {
+ uint32_t this_dist;
+ cpi->fn_ptr[txm_bsize].vf(src, src_stride, rec_buffer, MAX_TX_SIZE,
+ &this_dist);
+ tmp = this_dist;
+ }
+ }
+ rd_stats->dist += tmp * 16;
+ txb_coeff_cost =
+ av1_cost_coeffs(cpi, x, plane, block, tx_size, scan_order, a, l, 0);
+ rd_stats->rate += txb_coeff_cost;
+ rd_stats->skip &= (eob == 0);
+
+#if CONFIG_RD_DEBUG
+ av1_update_txb_coeff_cost(rd_stats, plane, tx_size, blk_row, blk_col,
+ txb_coeff_cost);
+#endif // CONFIG_RD_DEBUG
+}
+
+/* Recursively chooses the transform partitioning of one transform block.
+ * The block at (blk_row, blk_col) is costed as a single tx_size transform
+ * (via av1_tx_block_rd_b, or a cached rd_stats_stack[block32] entry for
+ * TX_32X32 with a non-DCT_DCT type) and, when tx_size > TX_4X4 and
+ * depth < MAX_VARTX_DEPTH, as four sub_tx_size_map[tx_size] sub-blocks
+ * evaluated by recursion. The option with the lower RDCOST wins and its
+ * stats are returned in *rd_stats.
+ * *is_cost_valid is cleared when ref_best_rd is negative or when the winning
+ * option's cost is INT64_MAX. When the unsplit option wins, ta/tl,
+ * tx_above/tx_left, mbmi->inter_tx_size, mbmi->tx_size and x->blk_skip are
+ * updated for this block.
+ */ static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row,
+ int blk_col, int plane, int block, int block32,
+ TX_SIZE tx_size, int depth, BLOCK_SIZE plane_bsize,
+ ENTROPY_CONTEXT *ta, ENTROPY_CONTEXT *tl,
+ TXFM_CONTEXT *tx_above, TXFM_CONTEXT *tx_left,
+ RD_STATS *rd_stats, int64_t ref_best_rd,
+ int *is_cost_valid, RD_STATS *rd_stats_stack) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+ struct macroblock_plane *const p = &x->plane[plane];
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ const int tx_row = blk_row >> (1 - pd->subsampling_y); // row index into mbmi->inter_tx_size (blk_row halved unless the plane is vertically subsampled)
+ const int tx_col = blk_col >> (1 - pd->subsampling_x); // column index, same rule horizontally
+ TX_SIZE(*const inter_tx_size)
+ [MAX_MIB_SIZE] =
+ (TX_SIZE(*)[MAX_MIB_SIZE]) & mbmi->inter_tx_size[tx_row][tx_col]; // 2-D view of mbmi->inter_tx_size whose [0][0] is this block's entry
+ const int max_blocks_high = max_block_high(xd, plane_bsize, plane);
+ const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
+ const int bw = block_size_wide[plane_bsize] >> tx_size_wide_log2[0]; // row stride of x->blk_skip[plane]
+ int64_t this_rd = INT64_MAX; // cost of the unsplit option; stays INT64_MAX if it is not evaluated
+ ENTROPY_CONTEXT *pta = ta + blk_col;
+ ENTROPY_CONTEXT *ptl = tl + blk_row;
+ int coeff_ctx, i;
+ int ctx =
+ txfm_partition_context(tx_above + (blk_col >> 1),
+ tx_left + (blk_row >> 1), mbmi->sb_type, tx_size);
+ int64_t sum_rd = INT64_MAX; // cost of the split option; stays INT64_MAX if it is not (validly) evaluated
+ int tmp_eob = 0;
+ int zero_blk_rate;
+ RD_STATS sum_rd_stats;
+ const int tx_size_ctx = txsize_sqr_map[tx_size];
+
+ av1_init_rd_stats(&sum_rd_stats);
+
+ assert(tx_size < TX_SIZES_ALL);
+
+ if (ref_best_rd < 0) { // the cost bound handed in by the caller is already negative: report invalid
+ *is_cost_valid = 0;
+ return;
+ }
+
+ coeff_ctx = get_entropy_context(tx_size, pta, ptl);
+
+ av1_init_rd_stats(rd_stats);
+
+ if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return; // outside the limits given by max_block_high/max_block_wide: nothing to cost
+
+ zero_blk_rate = x->token_costs[tx_size_ctx][pd->plane_type][1][0][0]
+ [coeff_ctx][EOB_TOKEN]; // EOB_TOKEN cost in this context; used below as the rate of an all-zero block
+
+ if (cpi->common.tx_mode == TX_MODE_SELECT || tx_size == TX_4X4) { // unsplit candidate
+ inter_tx_size[0][0] = tx_size;
+
+ if (tx_size == TX_32X32 && mbmi->tx_type != DCT_DCT &&
+ rd_stats_stack[block32].rate != INT_MAX) { // reuse the stats cached for this 32x32 block
+ *rd_stats = rd_stats_stack[block32];
+ p->eobs[block] = !rd_stats->skip;
+ x->blk_skip[plane][blk_row * bw + blk_col] = rd_stats->skip;
+ } else {
+ av1_tx_block_rd_b(cpi, x, tx_size, blk_row, blk_col, plane, block,
+ plane_bsize, pta, ptl, rd_stats);
+ if (tx_size == TX_32X32) { // cache the result for later calls with the same block32
+ rd_stats_stack[block32] = *rd_stats;
+ }
+ }
+
+ if ((RDCOST(x->rdmult, x->rddiv, rd_stats->rate, rd_stats->dist) >=
+ RDCOST(x->rdmult, x->rddiv, zero_blk_rate, rd_stats->sse) ||
+ rd_stats->skip == 1) &&
+ !xd->lossless[mbmi->segment_id]) { // zeroing the block is no worse (or it is already empty) and the segment is not lossless
+#if CONFIG_RD_DEBUG
+ av1_update_txb_coeff_cost(rd_stats, plane, tx_size, blk_row, blk_col,
+ zero_blk_rate - rd_stats->rate);
+#endif // CONFIG_RD_DEBUG
+ rd_stats->rate = zero_blk_rate;
+ rd_stats->dist = rd_stats->sse;
+ rd_stats->skip = 1;
+ x->blk_skip[plane][blk_row * bw + blk_col] = 1;
+ p->eobs[block] = 0;
+ } else {
+ x->blk_skip[plane][blk_row * bw + blk_col] = 0;
+ rd_stats->skip = 0;
+ }
+
+ if (tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH) // a split was possible, so add the cost of the partition bit coded as 0
+ rd_stats->rate +=
+ av1_cost_bit(cpi->common.fc->txfm_partition_prob[ctx], 0);
+ this_rd = RDCOST(x->rdmult, x->rddiv, rd_stats->rate, rd_stats->dist);
+ tmp_eob = p->eobs[block];
+ }
+
+ if (tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH) { // split candidate
+ const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
+ const int bsl = tx_size_wide_unit[sub_txs];
+ int sub_step = tx_size_wide_unit[sub_txs] * tx_size_high_unit[sub_txs];
+ RD_STATS this_rd_stats;
+ int this_cost_valid = 1;
+ int64_t tmp_rd = 0;
+
+ sum_rd_stats.rate =
+ av1_cost_bit(cpi->common.fc->txfm_partition_prob[ctx], 1); // partition bit coded as 1
+
+ assert(tx_size < TX_SIZES_ALL);
+
+ for (i = 0; i < 4 && this_cost_valid; ++i) { // the four sub-blocks: (i >> 1) selects the row, (i & 1) the column
+ int offsetr = blk_row + (i >> 1) * bsl;
+ int offsetc = blk_col + (i & 0x01) * bsl;
+
+ if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue; // note: block is not advanced for skipped sub-blocks
+
+ select_tx_block(cpi, x, offsetr, offsetc, plane, block, block32, sub_txs,
+ depth + 1, plane_bsize, ta, tl, tx_above, tx_left,
+ &this_rd_stats, ref_best_rd - tmp_rd, &this_cost_valid,
+ rd_stats_stack);
+
+ av1_merge_rd_stats(&sum_rd_stats, &this_rd_stats);
+
+ tmp_rd =
+ RDCOST(x->rdmult, x->rddiv, sum_rd_stats.rate, sum_rd_stats.dist);
+ if (this_rd < tmp_rd) break; // the partial split cost already exceeds the unsplit cost
+ block += sub_step;
+ }
+ if (this_cost_valid) sum_rd = tmp_rd;
+ }
+
+ if (this_rd < sum_rd) { // unsplit option wins: commit contexts and the transform size for this block
+ int idx, idy;
+ for (i = 0; i < tx_size_wide_unit[tx_size]; ++i) pta[i] = !(tmp_eob == 0);
+ for (i = 0; i < tx_size_high_unit[tx_size]; ++i) ptl[i] = !(tmp_eob == 0);
+ txfm_partition_update(tx_above + (blk_col >> 1), tx_left + (blk_row >> 1),
+ tx_size, tx_size);
+ inter_tx_size[0][0] = tx_size;
+ for (idy = 0; idy < tx_size_high_unit[tx_size] / 2; ++idy)
+ for (idx = 0; idx < tx_size_wide_unit[tx_size] / 2; ++idx)
+ inter_tx_size[idy][idx] = tx_size;
+ mbmi->tx_size = tx_size;
+ if (this_rd == INT64_MAX) *is_cost_valid = 0;
+ x->blk_skip[plane][blk_row * bw + blk_col] = rd_stats->skip;
+ } else { // split option wins (or neither option was costed): return the accumulated sub-block stats
+ *rd_stats = sum_rd_stats;
+ if (sum_rd == INT64_MAX) *is_cost_valid = 0;
+ }
+}
+
+// Luma transform-partition search for an inter block.
+// The luma plane block is tiled with the largest transform listed in
+// max_txsize_rect_lookup for its size and select_tx_block() is run on each
+// tile; the per-tile stats are merged into *rd_stats. *rd_stats is reset with
+// av1_invalid_rd_stats() when ref_best_rd is negative, when any tile reports
+// an invalid cost, or when the merged cost ends up above ref_best_rd.
+static void inter_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
+                            RD_STATS *rd_stats, BLOCK_SIZE bsize,
+                            int64_t ref_best_rd, RD_STATS *rd_stats_stack) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  int cost_ok = !(ref_best_rd < 0);
+  int64_t accum_rd = 0;
+  int64_t final_rd;
+
+  av1_init_rd_stats(rd_stats);
+
+  if (cost_ok) {
+    const struct macroblockd_plane *const pd = &xd->plane[0];
+    const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
+    const int units_wide = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
+    const int units_high = block_size_high[plane_bsize] >> tx_size_high_log2[0];
+    const TX_SIZE max_tx_size = max_txsize_rect_lookup[plane_bsize];
+    const int tile_high = tx_size_high_unit[max_tx_size];
+    const int tile_wide = tx_size_wide_unit[max_tx_size];
+    const int tile_step =
+        tx_size_wide_unit[max_tx_size] * tx_size_high_unit[max_tx_size];
+    // Non-square plane blocks start the recursion one level deep.
+    const int start_depth = (units_high != units_wide);
+    int row, col;
+    int block = 0;
+    int block32 = 0;
+    ENTROPY_CONTEXT ctxa[2 * MAX_MIB_SIZE];
+    ENTROPY_CONTEXT ctxl[2 * MAX_MIB_SIZE];
+    TXFM_CONTEXT tx_above[MAX_MIB_SIZE];
+    TXFM_CONTEXT tx_left[MAX_MIB_SIZE];
+    RD_STATS tile_stats;
+
+    av1_init_rd_stats(&tile_stats);
+
+    // Work on local copies of the contexts so the search can update them.
+    av1_get_entropy_contexts(bsize, 0, pd, ctxa, ctxl);
+    memcpy(tx_above, xd->above_txfm_context,
+           sizeof(TXFM_CONTEXT) * (units_wide >> 1));
+    memcpy(tx_left, xd->left_txfm_context,
+           sizeof(TXFM_CONTEXT) * (units_high >> 1));
+
+    for (row = 0; row < units_high; row += tile_high) {
+      for (col = 0; col < units_wide; col += tile_wide) {
+        select_tx_block(cpi, x, row, col, 0, block, block32, max_tx_size,
+                        start_depth, plane_bsize, ctxa, ctxl, tx_above,
+                        tx_left, &tile_stats, ref_best_rd - accum_rd, &cost_ok,
+                        rd_stats_stack);
+        av1_merge_rd_stats(rd_stats, &tile_stats);
+        // Each tile contributes the cheaper of "coded" and "all residual
+        // dropped" to the running total handed to the next tile.
+        accum_rd += AOMMIN(
+            RDCOST(x->rdmult, x->rddiv, tile_stats.rate, tile_stats.dist),
+            RDCOST(x->rdmult, x->rddiv, 0, tile_stats.sse));
+        block += tile_step;
+        ++block32;
+      }
+    }
+  }
+
+  final_rd = AOMMIN(RDCOST(x->rdmult, x->rddiv, rd_stats->rate, rd_stats->dist),
+                    RDCOST(x->rdmult, x->rddiv, 0, rd_stats->sse));
+  if (final_rd > ref_best_rd) cost_ok = 0;
+
+  if (!cost_ok) {
+    // Signal "no valid cost" to the caller.
+    av1_invalid_rd_stats(rd_stats);
+  }
+}
+
+// Costs the luma plane of the current block for one fixed transform type.
+// Sets mbmi->tx_type, runs inter_block_yrd() to pick the transform
+// partitioning, derives mbmi->min_tx_size from the chosen sizes, adds the
+// transform-type signalling cost to rd_stats->rate and returns the RD cost
+// including the skip flag. Returns INT64_MAX when inter_block_yrd() reports
+// an invalid cost.
+static int64_t select_tx_size_fix_type(const AV1_COMP *cpi, MACROBLOCK *x,
+                                       RD_STATS *rd_stats, BLOCK_SIZE bsize,
+                                       int64_t ref_best_rd, TX_TYPE tx_type,
+                                       RD_STATS *rd_stats_stack) {
+  const AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  const int is_inter = is_inter_block(mbmi);
+  const aom_prob skip_prob = av1_get_skip_prob(cm, xd);
+  const int no_skip_cost = av1_cost_bit(skip_prob, 0);
+  const int skip_cost = av1_cost_bit(skip_prob, 1);
+  const int max_blocks_high = max_block_high(xd, bsize, 0);
+  const int max_blocks_wide = max_block_wide(xd, bsize, 0);
+  int64_t rd;
+  int row, col;
+
+  mbmi->tx_type = tx_type;
+  mbmi->min_tx_size = TX_SIZES_ALL;
+  inter_block_yrd(cpi, x, rd_stats, bsize, ref_best_rd, rd_stats_stack);
+
+  if (rd_stats->rate == INT_MAX) return INT64_MAX;
+
+  // Smallest transform size selected anywhere inside the block.
+  for (row = 0; row < max_blocks_high / 2; ++row) {
+    for (col = 0; col < max_blocks_wide / 2; ++col) {
+      mbmi->min_tx_size = AOMMIN(
+          mbmi->min_tx_size, get_min_tx_size(mbmi->inter_tx_size[row][col]));
+    }
+  }
+
+#if CONFIG_EXT_TX
+  if (get_ext_tx_types(mbmi->min_tx_size, bsize, is_inter,
+                       cm->reduced_tx_set_used) > 1 &&
+      !xd->lossless[mbmi->segment_id]) {
+    const int ext_tx_set = get_ext_tx_set(mbmi->min_tx_size, bsize, is_inter,
+                                          cm->reduced_tx_set_used);
+    if (ext_tx_set > 0) {
+      if (is_inter) {
+        rd_stats->rate +=
+            cpi->inter_tx_type_costs[ext_tx_set]
+                                    [txsize_sqr_map[mbmi->min_tx_size]]
+                                    [mbmi->tx_type];
+      } else if (ALLOW_INTRA_EXT_TX) {
+        rd_stats->rate +=
+            cpi->intra_tx_type_costs[ext_tx_set][mbmi->min_tx_size][mbmi->mode]
+                                    [mbmi->tx_type];
+      }
+    }
+  }
+#else   // CONFIG_EXT_TX
+  if (mbmi->min_tx_size < TX_32X32 && !xd->lossless[mbmi->segment_id])
+    rd_stats->rate +=
+        cpi->inter_tx_type_costs[mbmi->min_tx_size][mbmi->tx_type];
+#endif  // CONFIG_EXT_TX
+
+  // A fully skipped block only pays the skip flag and keeps the source error.
+  if (rd_stats->skip)
+    return RDCOST(x->rdmult, x->rddiv, skip_cost, rd_stats->sse);
+
+  rd = RDCOST(x->rdmult, x->rddiv, rd_stats->rate + no_skip_cost,
+              rd_stats->dist);
+
+  // Non-lossless inter blocks may still turn out cheaper when skipped.
+  if (is_inter && !xd->lossless[mbmi->segment_id]) {
+    const int64_t skipped_rd =
+        RDCOST(x->rdmult, x->rddiv, skip_cost, rd_stats->sse);
+    if (!(rd < skipped_rd)) rd = skipped_rd;
+  }
+
+  return rd;
+}
+
+// Searches transform types for the luma plane of the current block.
+// Every admissible tx_type (after set membership, pruning, the default-type
+// shortcut and the lossless restriction) is costed with
+// select_tx_size_fix_type(); the best candidate's stats are returned in
+// *rd_stats and its transform type, transform sizes and x->blk_skip[0] map
+// are written back to mbmi / x. If no candidate produces a valid cost,
+// *rd_stats is left invalid (av1_invalid_rd_stats) and the per-block
+// transform-size and skip maps are left as the last trial wrote them.
+static void select_tx_type_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
+                               RD_STATS *rd_stats, BLOCK_SIZE bsize,
+                               int64_t ref_best_rd) {
+  const AV1_COMMON *cm = &cpi->common;
+  const TX_SIZE max_tx_size = max_txsize_lookup[bsize];
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  int64_t rd = INT64_MAX;
+  int64_t best_rd = INT64_MAX;
+  TX_TYPE tx_type, best_tx_type = DCT_DCT;
+  const int is_inter = is_inter_block(mbmi);
+  TX_SIZE best_tx_size[MAX_MIB_SIZE][MAX_MIB_SIZE];
+  TX_SIZE best_tx = max_txsize_lookup[bsize];
+  TX_SIZE best_min_tx_size = TX_SIZES_ALL;
+  uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE * 8];
+  const int n4 = bsize_to_num_blk(bsize);
+  int idx, idy;
+  int prune = 0;
+  // Number of 32x32 blocks in a superblock: one cache slot per block.
+  const int count32 =
+      1 << (2 * (cm->mib_size_log2 - mi_width_log2_lookup[BLOCK_32X32]));
+#if CONFIG_EXT_PARTITION
+  RD_STATS rd_stats_stack[16];
+#else
+  RD_STATS rd_stats_stack[4];
+#endif  // CONFIG_EXT_PARTITION
+#if CONFIG_EXT_TX
+  const int ext_tx_set =
+      get_ext_tx_set(max_tx_size, bsize, is_inter, cm->reduced_tx_set_used);
+#endif  // CONFIG_EXT_TX
+
+  if (is_inter && cpi->sf.tx_type_search.prune_mode > NO_PRUNE)
+#if CONFIG_EXT_TX
+    prune = prune_tx_types(cpi, bsize, x, xd, ext_tx_set);
+#else
+    prune = prune_tx_types(cpi, bsize, x, xd, 0);
+#endif  // CONFIG_EXT_TX
+
+  av1_invalid_rd_stats(rd_stats);
+
+  // Mark every 32x32 cache slot as empty.
+  for (idx = 0; idx < count32; ++idx)
+    av1_invalid_rd_stats(&rd_stats_stack[idx]);
+
+  for (tx_type = DCT_DCT; tx_type < TX_TYPES; ++tx_type) {
+    RD_STATS this_rd_stats;
+    av1_init_rd_stats(&this_rd_stats);
+#if CONFIG_EXT_TX
+    if (is_inter) {
+      if (!ext_tx_used_inter[ext_tx_set][tx_type]) continue;
+      if (cpi->sf.tx_type_search.prune_mode > NO_PRUNE) {
+        if (!do_tx_type_search(tx_type, prune)) continue;
+      }
+    } else {
+      if (!ALLOW_INTRA_EXT_TX && bsize >= BLOCK_8X8) {
+        if (tx_type != intra_mode_to_tx_type_context[mbmi->mode]) continue;
+      }
+      if (!ext_tx_used_intra[ext_tx_set][tx_type]) continue;
+    }
+#else   // CONFIG_EXT_TX
+    if (is_inter && cpi->sf.tx_type_search.prune_mode > NO_PRUNE &&
+        !do_tx_type_search(tx_type, prune))
+      continue;
+#endif  // CONFIG_EXT_TX
+    if (is_inter && x->use_default_inter_tx_type &&
+        tx_type != get_default_tx_type(0, xd, 0, max_tx_size))
+      continue;
+
+    // Lossless segments only try DCT_DCT.
+    if (xd->lossless[mbmi->segment_id])
+      if (tx_type != DCT_DCT) continue;
+
+    rd = select_tx_size_fix_type(cpi, x, &this_rd_stats, bsize, ref_best_rd,
+                                 tx_type, rd_stats_stack);
+
+    if (rd < best_rd) {
+      // Snapshot everything the next trial will overwrite.
+      best_rd = rd;
+      *rd_stats = this_rd_stats;
+      best_tx_type = mbmi->tx_type;
+      best_tx = mbmi->tx_size;
+      best_min_tx_size = mbmi->min_tx_size;
+      memcpy(best_blk_skip, x->blk_skip[0], sizeof(best_blk_skip[0]) * n4);
+      for (idy = 0; idy < xd->n8_h; ++idy)
+        for (idx = 0; idx < xd->n8_w; ++idx)
+          best_tx_size[idy][idx] = mbmi->inter_tx_size[idy][idx];
+    }
+  }
+
+  mbmi->tx_type = best_tx_type;
+  mbmi->tx_size = best_tx;
+  mbmi->min_tx_size = best_min_tx_size;
+
+  // best_tx_size / best_blk_skip are only written when some candidate beat
+  // INT64_MAX. If none did, they hold indeterminate stack contents, so do
+  // not copy them into the mode info; *rd_stats is already marked invalid.
+  if (best_rd == INT64_MAX) return;
+
+  for (idy = 0; idy < xd->n8_h; ++idy)
+    for (idx = 0; idx < xd->n8_w; ++idx)
+      mbmi->inter_tx_size[idy][idx] = best_tx_size[idy][idx];
+  memcpy(x->blk_skip[0], best_blk_skip, sizeof(best_blk_skip[0]) * n4);
+}
+
+// Accumulates the RD stats of one transform block of a plane, following the
+// transform partitioning already stored in mbmi->inter_tx_size.
+// If tx_size matches the size recorded for this position the block is costed
+// with av1_tx_block_rd_b() and the above/left entropy contexts are updated;
+// otherwise the block is split into four sub_tx_size_map[tx_size] sub-blocks
+// that are handled recursively. Positions at or beyond the limits returned by
+// max_block_high()/max_block_wide() are ignored.
+static void tx_block_rd(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row,
+                        int blk_col, int plane, int block, TX_SIZE tx_size,
+                        BLOCK_SIZE plane_bsize, ENTROPY_CONTEXT *above_ctx,
+                        ENTROPY_CONTEXT *left_ctx, RD_STATS *rd_stats) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  struct macroblock_plane *const p = &x->plane[plane];
+  struct macroblockd_plane *const pd = &xd->plane[plane];
+  const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size];
+  const int tx_row = blk_row >> (1 - pd->subsampling_y);
+  const int tx_col = blk_col >> (1 - pd->subsampling_x);
+  const int max_blocks_high = max_block_high(xd, plane_bsize, plane);
+  const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
+  TX_SIZE plane_tx_size;
+  int i;
+
+  assert(tx_size < TX_SIZES_ALL);
+
+  if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
+
+  // Chroma planes map the recorded luma transform size through
+  // uv_txsize_lookup; luma uses it directly.
+  if (plane)
+    plane_tx_size =
+        uv_txsize_lookup[tx_bsize][mbmi->inter_tx_size[tx_row][tx_col]][0][0];
+  else
+    plane_tx_size = mbmi->inter_tx_size[tx_row][tx_col];
+
+  if (tx_size != plane_tx_size) {
+    // Recorded size differs: descend into the four quadrants.
+    const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
+    const int sub_units = tx_size_wide_unit[sub_txs];
+    const int sub_step =
+        tx_size_wide_unit[sub_txs] * tx_size_high_unit[sub_txs];
+
+    assert(sub_units > 0);
+
+    for (i = 0; i < 4; ++i) {
+      const int sub_row = blk_row + (i >> 1) * sub_units;
+      const int sub_col = blk_col + (i & 0x01) * sub_units;
+
+      // The block counter only advances for quadrants that are visited.
+      if (sub_row < max_blocks_high && sub_col < max_blocks_wide) {
+        tx_block_rd(cpi, x, sub_row, sub_col, plane, block, sub_txs,
+                    plane_bsize, above_ctx, left_ctx, rd_stats);
+        block += sub_step;
+      }
+    }
+    return;
+  }
+
+  {
+    ENTROPY_CONTEXT *const ta = above_ctx + blk_col;
+    ENTROPY_CONTEXT *const tl = left_ctx + blk_row;
+    int has_coeffs;
+
+    av1_tx_block_rd_b(cpi, x, tx_size, blk_row, blk_col, plane, block,
+                      plane_bsize, ta, tl, rd_stats);
+
+    // Propagate "block has coefficients" into both context arrays.
+    has_coeffs = !(p->eobs[block] == 0);
+    for (i = 0; i < tx_size_wide_unit[tx_size]; ++i) ta[i] = has_coeffs;
+    for (i = 0; i < tx_size_high_unit[tx_size]; ++i) tl[i] = has_coeffs;
+  }
+}
+
+// Chroma RD cost of an inter block using the transform partitioning already
+// stored for luma.
+// Returns 1 when *rd_stats holds valid values, 0 when the search terminated
+// early (in which case *rd_stats has been reset with av1_invalid_rd_stats).
+static int inter_block_uvrd(const AV1_COMP *cpi, MACROBLOCK *x,
+                            RD_STATS *rd_stats, BLOCK_SIZE bsize,
+                            int64_t ref_best_rd) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  int is_cost_valid = !(ref_best_rd < 0);
+  int plane;
+
+  av1_init_rd_stats(rd_stats);
+
+#if CONFIG_CB4X4 && !CONFIG_CHROMA_2X2
+  if (x->skip_chroma_rd) return is_cost_valid;
+  bsize = AOMMAX(BLOCK_8X8, bsize);
+#endif  // CONFIG_CB4X4 && !CONFIG_CHROMA_2X2
+
+#if CONFIG_EXT_TX && CONFIG_RECT_TX
+  // Rectangular transforms are handled by the fixed-size chroma path.
+  if (is_rect_tx(mbmi->tx_size)) {
+    return super_block_uvrd(cpi, x, rd_stats, bsize, ref_best_rd);
+  }
+#endif  // CONFIG_EXT_TX && CONFIG_RECT_TX
+
+  // Compute the chroma residuals up front.
+  if (is_inter_block(mbmi) && is_cost_valid) {
+    for (plane = 1; plane < MAX_MB_PLANE; ++plane)
+      av1_subtract_plane(x, bsize, plane);
+  }
+
+  for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
+    const struct macroblockd_plane *const pd = &xd->plane[plane];
+    const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
+    const int units_wide = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
+    const int units_high = block_size_high[plane_bsize] >> tx_size_high_log2[0];
+    const TX_SIZE max_tx_size = max_txsize_rect_lookup[plane_bsize];
+    const int tile_high = tx_size_high_unit[max_tx_size];
+    const int tile_wide = tx_size_wide_unit[max_tx_size];
+    const int tile_step = tile_high * tile_wide;
+    int row, col;
+    int block = 0;
+    int64_t plane_rd;
+    ENTROPY_CONTEXT ta[2 * MAX_MIB_SIZE];
+    ENTROPY_CONTEXT tl[2 * MAX_MIB_SIZE];
+    RD_STATS plane_stats;
+
+    av1_init_rd_stats(&plane_stats);
+    av1_get_entropy_contexts(bsize, 0, pd, ta, tl);
+
+    for (row = 0; row < units_high; row += tile_high) {
+      for (col = 0; col < units_wide; col += tile_wide) {
+        tx_block_rd(cpi, x, row, col, plane, block, max_tx_size, plane_bsize,
+                    ta, tl, &plane_stats);
+        block += tile_step;
+      }
+    }
+
+    if (plane_stats.rate == INT_MAX) {
+      is_cost_valid = 0;
+      break;
+    }
+
+    av1_merge_rd_stats(rd_stats, &plane_stats);
+
+    // Stop as soon as the running total can no longer beat the reference.
+    plane_rd =
+        AOMMIN(RDCOST(x->rdmult, x->rddiv, rd_stats->rate, rd_stats->dist),
+               RDCOST(x->rdmult, x->rddiv, 0, rd_stats->sse));
+    if (plane_rd > ref_best_rd) {
+      is_cost_valid = 0;
+      break;
+    }
+  }
+
+  if (!is_cost_valid) {
+    // Signal "no valid cost" to the caller.
+    av1_invalid_rd_stats(rd_stats);
+  }
+
+  return is_cost_valid;
+}
+#endif // CONFIG_VAR_TX
+
+#if CONFIG_PALETTE
+// Palette search for the chroma (U, V) planes of the current block.
+// Blocks with more than one and at most 64 distinct chroma colours are
+// clustered with k-means for every palette size n from
+// min(colours, PALETTE_MAX_SIZE) down to 2. Each palette is costed with
+// super_block_uvrd() plus the palette signalling rate. Whenever a palette
+// beats *best_rd, the function updates *best_rd, *best_mbmi, *rate,
+// *rate_tokenonly, *distortion and *skippable, and saves the colour index
+// map into best_palette_color_map.
+// On exit, if best_mbmi describes a chroma palette, the saved map is copied
+// back into the plane's colour index map.
+// dc_mode_cost is the rate of signalling the DC_PRED chroma mode that the
+// palette is coded under.
+static void rd_pick_palette_intra_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x,
+                                       int dc_mode_cost,
+                                       uint8_t *best_palette_color_map,
+                                       MB_MODE_INFO *const best_mbmi,
+                                       int64_t *best_rd, int *rate,
+                                       int *rate_tokenonly, int64_t *distortion,
+                                       int *skippable) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+  const BLOCK_SIZE bsize = mbmi->sb_type;
+  int this_rate;
+  int64_t this_rd;
+  int colors_u, colors_v, colors;
+  const int src_stride = x->plane[1].src.stride;
+  const uint8_t *const src_u = x->plane[1].src.buf;
+  const uint8_t *const src_v = x->plane[2].src.buf;
+  uint8_t *const color_map = xd->plane[1].color_index_map;
+  RD_STATS tokenonly_rd_stats;
+  int plane_block_width, plane_block_height, rows, cols;
+  av1_get_block_dimensions(bsize, 1, xd, &plane_block_width,
+                           &plane_block_height, &rows, &cols);
+  if (rows * cols > PALETTE_MAX_BLOCK_SIZE) return;
+
+  mbmi->uv_mode = DC_PRED;
+#if CONFIG_FILTER_INTRA
+  mbmi->filter_intra_mode_info.use_filter_intra_mode[1] = 0;
+#endif  // CONFIG_FILTER_INTRA
+
+#if CONFIG_HIGHBITDEPTH
+  if (cpi->common.use_highbitdepth) {
+    colors_u = av1_count_colors_highbd(src_u, src_stride, rows, cols,
+                                       cpi->common.bit_depth);
+    colors_v = av1_count_colors_highbd(src_v, src_stride, rows, cols,
+                                       cpi->common.bit_depth);
+  } else {
+#endif  // CONFIG_HIGHBITDEPTH
+    colors_u = av1_count_colors(src_u, src_stride, rows, cols);
+    colors_v = av1_count_colors(src_v, src_stride, rows, cols);
+#if CONFIG_HIGHBITDEPTH
+  }
+#endif  // CONFIG_HIGHBITDEPTH
+
+  colors = colors_u > colors_v ? colors_u : colors_v;
+  if (colors > 1 && colors <= 64) {
+    int r, c, n, i, j;
+    const int max_itr = 50;
+    uint8_t color_order[PALETTE_MAX_SIZE];
+    float lb_u, ub_u, val_u;
+    float lb_v, ub_v, val_v;
+    float *const data = x->palette_buffer->kmeans_data_buf;
+    float centroids[2 * PALETTE_MAX_SIZE];
+
+#if CONFIG_HIGHBITDEPTH
+    uint16_t *src_u16 = CONVERT_TO_SHORTPTR(src_u);
+    uint16_t *src_v16 = CONVERT_TO_SHORTPTR(src_v);
+    if (cpi->common.use_highbitdepth) {
+      lb_u = src_u16[0];
+      ub_u = src_u16[0];
+      lb_v = src_v16[0];
+      ub_v = src_v16[0];
+    } else {
+#endif  // CONFIG_HIGHBITDEPTH
+      lb_u = src_u[0];
+      ub_u = src_u[0];
+      lb_v = src_v[0];
+      ub_v = src_v[0];
+#if CONFIG_HIGHBITDEPTH
+    }
+#endif  // CONFIG_HIGHBITDEPTH
+
+    // Gather interleaved (u, v) samples for k-means and track the value
+    // range of each channel.
+    for (r = 0; r < rows; ++r) {
+      for (c = 0; c < cols; ++c) {
+#if CONFIG_HIGHBITDEPTH
+        if (cpi->common.use_highbitdepth) {
+          val_u = src_u16[r * src_stride + c];
+          val_v = src_v16[r * src_stride + c];
+          data[(r * cols + c) * 2] = val_u;
+          data[(r * cols + c) * 2 + 1] = val_v;
+        } else {
+#endif  // CONFIG_HIGHBITDEPTH
+          val_u = src_u[r * src_stride + c];
+          val_v = src_v[r * src_stride + c];
+          data[(r * cols + c) * 2] = val_u;
+          data[(r * cols + c) * 2 + 1] = val_v;
+#if CONFIG_HIGHBITDEPTH
+        }
+#endif  // CONFIG_HIGHBITDEPTH
+        if (val_u < lb_u)
+          lb_u = val_u;
+        else if (val_u > ub_u)
+          ub_u = val_u;
+        if (val_v < lb_v)
+          lb_v = val_v;
+        else if (val_v > ub_v)
+          ub_v = val_v;
+      }
+    }
+
+    for (n = colors > PALETTE_MAX_SIZE ? PALETTE_MAX_SIZE : colors; n >= 2;
+         --n) {
+      // Seed the centroids evenly across each channel's value range.
+      for (i = 0; i < n; ++i) {
+        centroids[i * 2] = lb_u + (2 * i + 1) * (ub_u - lb_u) / n / 2;
+        centroids[i * 2 + 1] = lb_v + (2 * i + 1) * (ub_v - lb_v) / n / 2;
+      }
+      av1_k_means(data, centroids, color_map, rows * cols, n, 2, max_itr);
+#if CONFIG_PALETTE_DELTA_ENCODING
+      // Sort the U channel colors in ascending order.
+      for (i = 0; i < 2 * (n - 1); i += 2) {
+        int min_idx = i;
+        float min_val = centroids[i];
+        for (j = i + 2; j < 2 * n; j += 2)
+          if (centroids[j] < min_val) min_val = centroids[j], min_idx = j;
+        if (min_idx != i) {
+          float temp_u = centroids[i], temp_v = centroids[i + 1];
+          centroids[i] = centroids[min_idx];
+          centroids[i + 1] = centroids[min_idx + 1];
+          centroids[min_idx] = temp_u, centroids[min_idx + 1] = temp_v;
+        }
+      }
+      av1_calc_indices(data, centroids, color_map, rows * cols, n, 2);
+#endif  // CONFIG_PALETTE_DELTA_ENCODING
+      // From here on color_map is addressed with stride plane_block_width
+      // and spans plane_block_width x plane_block_height entries.
+      extend_palette_color_map(color_map, cols, rows, plane_block_width,
+                               plane_block_height);
+      pmi->palette_size[1] = n;
+      // i = 1 fills the U palette, i = 2 the V palette.
+      for (i = 1; i < 3; ++i) {
+        for (j = 0; j < n; ++j) {
+#if CONFIG_HIGHBITDEPTH
+          if (cpi->common.use_highbitdepth)
+            pmi->palette_colors[i * PALETTE_MAX_SIZE + j] = clip_pixel_highbd(
+                (int)centroids[j * 2 + i - 1], cpi->common.bit_depth);
+          else
+#endif  // CONFIG_HIGHBITDEPTH
+            pmi->palette_colors[i * PALETTE_MAX_SIZE + j] =
+                clip_pixel((int)centroids[j * 2 + i - 1]);
+        }
+      }
+
+      super_block_uvrd(cpi, x, &tokenonly_rd_stats, bsize, *best_rd);
+      if (tokenonly_rd_stats.rate == INT_MAX) continue;
+      this_rate =
+          tokenonly_rd_stats.rate + dc_mode_cost +
+          cpi->palette_uv_size_cost[bsize - BLOCK_8X8][n - PALETTE_MIN_SIZE] +
+          write_uniform_cost(n, color_map[0]) +
+          av1_cost_bit(
+              av1_default_palette_uv_mode_prob[pmi->palette_size[0] > 0], 1);
+      this_rate += av1_palette_color_cost_uv(pmi, cpi->common.bit_depth);
+      // Cost of every colour index except the first, which was charged
+      // above through write_uniform_cost().
+      for (i = 0; i < rows; ++i) {
+        for (j = (i == 0 ? 1 : 0); j < cols; ++j) {
+          int color_idx;
+          const int color_ctx = av1_get_palette_color_index_context(
+              color_map, plane_block_width, i, j, n, color_order, &color_idx);
+          assert(color_idx >= 0 && color_idx < n);
+          this_rate += cpi->palette_uv_color_cost[n - PALETTE_MIN_SIZE]
+                                                 [color_ctx][color_idx];
+        }
+      }
+
+      this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, tokenonly_rd_stats.dist);
+      if (this_rd < *best_rd) {
+        *best_rd = this_rd;
+        *best_mbmi = *mbmi;
+        memcpy(best_palette_color_map, color_map,
+               plane_block_width * plane_block_height *
+                   sizeof(best_palette_color_map[0]));
+        *rate = this_rate;
+        *distortion = tokenonly_rd_stats.dist;
+        *rate_tokenonly = tokenonly_rd_stats.rate;
+        *skippable = tokenonly_rd_stats.skip;
+      }
+    }
+  }
+  if (best_mbmi->palette_mode_info.palette_size[1] > 0) {
+    // Restore the whole saved map. It was stored with
+    // plane_block_width * plane_block_height entries, so copying only
+    // rows * cols would leave the tail of the map from a later trial
+    // whenever the visible area is smaller than the plane block.
+    memcpy(color_map, best_palette_color_map,
+           plane_block_width * plane_block_height *
+               sizeof(best_palette_color_map[0]));
+  }
+}
+#endif // CONFIG_PALETTE
+
+#if CONFIG_FILTER_INTRA
+// Tries every chroma filter-intra mode under DC_PRED.
+// Returns 1 if a filter intra mode beat *best_rd; *best_rd, *rate,
+// *rate_tokenonly, *distortion and *skippable then describe it and mbmi is
+// left configured for it. Returns 0 otherwise.
+static int rd_pick_filter_intra_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x,
+                                     int *rate, int *rate_tokenonly,
+                                     int64_t *distortion, int *skippable,
+                                     BLOCK_SIZE bsize, int64_t *best_rd) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+  FILTER_INTRA_MODE_INFO best_info;
+  FILTER_INTRA_MODE mode;
+  RD_STATS uv_stats;
+  int found = 0;
+
+  av1_zero(best_info);
+  mbmi->filter_intra_mode_info.use_filter_intra_mode[1] = 1;
+  mbmi->uv_mode = DC_PRED;
+#if CONFIG_PALETTE
+  mbmi->palette_mode_info.palette_size[1] = 0;
+#endif  // CONFIG_PALETTE
+
+  for (mode = 0; mode < FILTER_INTRA_MODES; ++mode) {
+    mbmi->filter_intra_mode_info.filter_intra_mode[1] = mode;
+    if (super_block_uvrd(cpi, x, &uv_stats, bsize, *best_rd)) {
+      const int cand_rate =
+          uv_stats.rate +
+          av1_cost_bit(cpi->common.fc->filter_intra_probs[1], 1) +
+          cpi->intra_uv_mode_cost[mbmi->mode][mbmi->uv_mode] +
+          write_uniform_cost(FILTER_INTRA_MODES, mode);
+      const int64_t cand_rd =
+          RDCOST(x->rdmult, x->rddiv, cand_rate, uv_stats.dist);
+
+      if (cand_rd < *best_rd) {
+        *best_rd = cand_rd;
+        *rate = cand_rate;
+        *rate_tokenonly = uv_stats.rate;
+        *distortion = uv_stats.dist;
+        *skippable = uv_stats.skip;
+        best_info = mbmi->filter_intra_mode_info;
+        found = 1;
+      }
+    }
+  }
+
+  if (!found) return 0;
+
+  // Re-apply the winning chroma filter-intra settings.
+  mbmi->uv_mode = DC_PRED;
+  mbmi->filter_intra_mode_info.use_filter_intra_mode[1] =
+      best_info.use_filter_intra_mode[1];
+  mbmi->filter_intra_mode_info.filter_intra_mode[1] =
+      best_info.filter_intra_mode[1];
+  return 1;
+}
+#endif // CONFIG_FILTER_INTRA
+
+#if CONFIG_EXT_INTRA
+// Costs the chroma planes with the angle delta currently set in
+// mbmi->angle_delta[1] and returns the resulting RD cost, or INT64_MAX when
+// super_block_uvrd() gives up against best_rd_in. If the cost beats *best_rd,
+// *best_rd, *best_angle_delta, *rate and the rate/dist/skip fields of
+// *rd_stats are updated.
+static int64_t pick_intra_angle_routine_sbuv(
+    const AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
+    int rate_overhead, int64_t best_rd_in, int *rate, RD_STATS *rd_stats,
+    int *best_angle_delta, int64_t *best_rd) {
+  MB_MODE_INFO *mbmi = &x->e_mbd.mi[0]->mbmi;
+  RD_STATS uv_stats;
+  int total_rate;
+  int64_t cand_rd;
+
+  if (!super_block_uvrd(cpi, x, &uv_stats, bsize, best_rd_in))
+    return INT64_MAX;
+
+  total_rate = uv_stats.rate + rate_overhead;
+  cand_rd = RDCOST(x->rdmult, x->rddiv, total_rate, uv_stats.dist);
+  if (!(cand_rd < *best_rd)) return cand_rd;
+
+  // New best: record the angle and its stats.
+  *best_rd = cand_rd;
+  *best_angle_delta = mbmi->angle_delta[1];
+  *rate = total_rate;
+  rd_stats->rate = uv_stats.rate;
+  rd_stats->dist = uv_stats.dist;
+  rd_stats->skip = uv_stats.skip;
+  return cand_rd;
+}
+
+// For the current chroma directional intra prediction mode, picks the best angle delta and leaves it in mbmi->angle_delta[1].
+// Returns nonzero if some angle delta gave an RD cost below the incoming best_rd (*rate and *rd_stats then describe it); returns 0 early if angle delta 0 cannot be costed.
+static int rd_pick_intra_angle_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, int rate_overhead,
+ int64_t best_rd, int *rate,
+ RD_STATS *rd_stats) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+ int i, angle_delta, best_angle_delta = 0;
+ int64_t this_rd, best_rd_in, rd_cost[2 * (MAX_ANGLE_DELTA + 2)]; // rd_cost[2 * delta + i]: cost of +delta (i = 0) or -delta (i = 1)
+
+ rd_stats->rate = INT_MAX; // stays INT_MAX unless some candidate beats best_rd
+ rd_stats->skip = 0;
+ rd_stats->dist = INT64_MAX;
+ for (i = 0; i < 2 * (MAX_ANGLE_DELTA + 2); ++i) rd_cost[i] = INT64_MAX;
+
+ for (angle_delta = 0; angle_delta <= MAX_ANGLE_DELTA; angle_delta += 2) { // first pass: even deltas only
+ for (i = 0; i < 2; ++i) {
+ best_rd_in = (best_rd == INT64_MAX)
+ ? INT64_MAX
+ : (best_rd + (best_rd >> ((angle_delta == 0) ? 3 : 5))); // search bound slightly above best_rd: +1/8 for delta 0, +1/32 otherwise
+ mbmi->angle_delta[1] = (1 - 2 * i) * angle_delta; // i = 0 -> +delta, i = 1 -> -delta
+ this_rd = pick_intra_angle_routine_sbuv(cpi, x, bsize, rate_overhead,
+ best_rd_in, rate, rd_stats,
+ &best_angle_delta, &best_rd);
+ rd_cost[2 * angle_delta + i] = this_rd;
+ if (angle_delta == 0) { // delta 0 has no sign: evaluate once and mirror the cost into slot 1
+ if (this_rd == INT64_MAX) return 0;
+ rd_cost[1] = this_rd;
+ break;
+ }
+ }
+ }
+
+ assert(best_rd != INT64_MAX);
+ for (angle_delta = 1; angle_delta <= MAX_ANGLE_DELTA; angle_delta += 2) { // second pass: odd deltas
+ int64_t rd_thresh;
+ for (i = 0; i < 2; ++i) {
+ int skip_search = 0;
+ rd_thresh = best_rd + (best_rd >> 5);
+ if (rd_cost[2 * (angle_delta + 1) + i] > rd_thresh &&
+ rd_cost[2 * (angle_delta - 1) + i] > rd_thresh) // both even neighbours of the same sign were clearly worse than the best
+ skip_search = 1;
+ if (!skip_search) {
+ mbmi->angle_delta[1] = (1 - 2 * i) * angle_delta;
+ pick_intra_angle_routine_sbuv(cpi, x, bsize, rate_overhead, best_rd,
+ rate, rd_stats, &best_angle_delta,
+ &best_rd);
+ }
+ }
+ }
+
+ mbmi->angle_delta[1] = best_angle_delta;
+ return rd_stats->rate != INT_MAX;
+}
+#endif // CONFIG_EXT_INTRA
+
+/* Chooses the chroma intra prediction mode for the current block.
+ * Every mode from DC_PRED to TM_PRED whose bit is set in
+ * cpi->sf.intra_uv_mode_mask (indexed by the square-up size of max_tx_size)
+ * is costed with super_block_uvrd(), or with rd_pick_intra_angle_sbuv() for
+ * directional modes when CONFIG_EXT_INTRA is enabled. When the corresponding
+ * features are compiled in, chroma palette and chroma filter-intra searches
+ * follow. The best mode info is written back to *mbmi, its figures to *rate,
+ * *rate_tokenonly, *distortion and *skippable, and its RD cost is returned.
+ */ static int64_t rd_pick_intra_sbuv_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
+ int *rate, int *rate_tokenonly,
+ int64_t *distortion, int *skippable,
+ BLOCK_SIZE bsize, TX_SIZE max_tx_size) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+ MB_MODE_INFO best_mbmi = *mbmi;
+ PREDICTION_MODE mode;
+ int64_t best_rd = INT64_MAX, this_rd;
+ int this_rate;
+ RD_STATS tokenonly_rd_stats;
+#if CONFIG_PVQ
+ od_rollback_buffer buf;
+ od_encode_checkpoint(&x->daala_enc, &buf); // checkpoint the encoder state so each trial can be rolled back
+#endif // CONFIG_PVQ
+#if CONFIG_PALETTE
+ PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+ uint8_t *best_palette_color_map = NULL;
+#endif // CONFIG_PALETTE
+
+#if CONFIG_FILTER_INTRA
+ mbmi->filter_intra_mode_info.use_filter_intra_mode[1] = 0; // regular modes are tried with chroma filter intra off
+#endif // CONFIG_FILTER_INTRA
+#if CONFIG_PALETTE
+ pmi->palette_size[1] = 0; // and with the chroma palette off
+#endif // CONFIG_PALETTE
+ for (mode = DC_PRED; mode <= TM_PRED; ++mode) {
+#if CONFIG_EXT_INTRA
+ const int is_directional_mode =
+ av1_is_directional_mode(mode, mbmi->sb_type);
+#endif // CONFIG_EXT_INTRA
+ if (!(cpi->sf.intra_uv_mode_mask[txsize_sqr_up_map[max_tx_size]] &
+ (1 << mode))) // mode is masked out by the speed features
+ continue;
+
+ mbmi->uv_mode = mode;
+#if CONFIG_EXT_INTRA
+ mbmi->angle_delta[1] = 0;
+ if (is_directional_mode) {
+ const int rate_overhead = cpi->intra_uv_mode_cost[mbmi->mode][mode] +
+ write_uniform_cost(2 * MAX_ANGLE_DELTA + 1, 0);
+ if (!rd_pick_intra_angle_sbuv(cpi, x, bsize, rate_overhead, best_rd,
+ &this_rate, &tokenonly_rd_stats)) // no angle delta improved on best_rd
+ continue;
+ } else {
+#endif // CONFIG_EXT_INTRA
+ if (!super_block_uvrd(cpi, x, &tokenonly_rd_stats, bsize, best_rd)) {
+#if CONFIG_PVQ
+ od_encode_rollback(&x->daala_enc, &buf);
+#endif // CONFIG_PVQ
+ continue;
+ }
+#if CONFIG_EXT_INTRA
+ }
+#endif // CONFIG_EXT_INTRA
+ this_rate =
+ tokenonly_rd_stats.rate + cpi->intra_uv_mode_cost[mbmi->mode][mode];
+
+#if CONFIG_EXT_INTRA
+ if (is_directional_mode) {
+ this_rate += write_uniform_cost(2 * MAX_ANGLE_DELTA + 1,
+ MAX_ANGLE_DELTA + mbmi->angle_delta[1]); // cost of signalling the chosen angle delta
+ }
+#endif // CONFIG_EXT_INTRA
+#if CONFIG_FILTER_INTRA
+ if (mbmi->sb_type >= BLOCK_8X8 && mode == DC_PRED)
+ this_rate += av1_cost_bit(cpi->common.fc->filter_intra_probs[1], 0); // filter-intra flag coded as 0
+#endif // CONFIG_FILTER_INTRA
+#if CONFIG_PALETTE
+ if (cpi->common.allow_screen_content_tools && mbmi->sb_type >= BLOCK_8X8 &&
+ mode == DC_PRED)
+ this_rate += av1_cost_bit(
+ av1_default_palette_uv_mode_prob[pmi->palette_size[0] > 0], 0); // chroma palette flag coded as 0
+#endif // CONFIG_PALETTE
+
+#if CONFIG_PVQ
+ od_encode_rollback(&x->daala_enc, &buf);
+#endif // CONFIG_PVQ
+ this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, tokenonly_rd_stats.dist);
+
+ if (this_rd < best_rd) {
+ best_mbmi = *mbmi;
+ best_rd = this_rd;
+ *rate = this_rate;
+ *rate_tokenonly = tokenonly_rd_stats.rate;
+ *distortion = tokenonly_rd_stats.dist;
+ *skippable = tokenonly_rd_stats.skip;
+ }
+ }
+
+#if CONFIG_PALETTE
+ if (cpi->common.allow_screen_content_tools && mbmi->sb_type >= BLOCK_8X8) { // chroma palette search; updates best_mbmi / best_rd through the pointers
+ best_palette_color_map = x->palette_buffer->best_palette_color_map;
+ rd_pick_palette_intra_sbuv(cpi, x,
+ cpi->intra_uv_mode_cost[mbmi->mode][DC_PRED],
+ best_palette_color_map, &best_mbmi, &best_rd,
+ rate, rate_tokenonly, distortion, skippable);
+ }
+#endif // CONFIG_PALETTE
+
+#if CONFIG_FILTER_INTRA
+ if (mbmi->sb_type >= BLOCK_8X8) {
+ if (rd_pick_filter_intra_sbuv(cpi, x, rate, rate_tokenonly, distortion,
+ skippable, bsize, &best_rd)) // a filter-intra mode won: mbmi now describes it
+ best_mbmi = *mbmi;
+ }
+#endif // CONFIG_FILTER_INTRA
+
+ *mbmi = best_mbmi; // commit the winning mode info
+ // Make sure we actually chose a mode
+ assert(best_rd < INT64_MAX);
+ return best_rd;
+}
+
+static void choose_intra_uv_mode(const AV1_COMP *const cpi, MACROBLOCK *const x,
+                                 PICK_MODE_CONTEXT *ctx, BLOCK_SIZE bsize,
+                                 TX_SIZE max_tx_size, int *rate_uv,
+                                 int *rate_uv_tokenonly, int64_t *dist_uv,
+                                 int *skip_uv, PREDICTION_MODE *mode_uv) {
+  // Runs the chroma intra mode search (rd_pick_intra_sbuv_mode) for the block
+  // and reports the uv mode left in the mode info through *mode_uv.
+  // `ctx` is accepted for the caller's convenience but is not read here.
+  MACROBLOCKD *const xd = &x->e_mbd;
+  (void)ctx;
+#if CONFIG_CB4X4 && CONFIG_CHROMA_2X2
+  rd_pick_intra_sbuv_mode(cpi, x, rate_uv, rate_uv_tokenonly, dist_uv, skip_uv,
+                          bsize, max_tx_size);
+#elif CONFIG_CB4X4
+  max_tx_size = AOMMAX(max_tx_size, TX_4X4);
+  if (x->skip_chroma_rd) {
+    // Chroma RD is flagged as skipped: report zero cost, skippable, DC_PRED.
+    *rate_uv = 0;
+    *rate_uv_tokenonly = 0;
+    *dist_uv = 0;
+    *skip_uv = 1;
+    *mode_uv = DC_PRED;
+    return;
+  }
+  {
+    // Search at the block size scaled by the chroma subsampling of plane 1.
+    const struct macroblockd_plane *const pd_u = &xd->plane[1];
+    const BLOCK_SIZE uv_bsize =
+        scale_chroma_bsize(bsize, pd_u->subsampling_x, pd_u->subsampling_y);
+    rd_pick_intra_sbuv_mode(cpi, x, rate_uv, rate_uv_tokenonly, dist_uv,
+                            skip_uv, uv_bsize, max_tx_size);
+  }
+#else
+  // Without CB4X4 the search never runs below BLOCK_8X8.
+  rd_pick_intra_sbuv_mode(cpi, x, rate_uv, rate_uv_tokenonly, dist_uv, skip_uv,
+                          AOMMAX(bsize, BLOCK_8X8), max_tx_size);
+#endif  // CONFIG_CB4X4 && CONFIG_CHROMA_2X2
+  *mode_uv = xd->mi[0]->mbmi.uv_mode;
+}
+
+// Returns the rate of signalling inter prediction `mode` under `mode_context`.
+// Compound modes (CONFIG_EXT_INTER) are looked up directly. With
+// CONFIG_REF_MV the cost is accumulated from the newmv / zeromv / refmv cost
+// tables, each indexed by a context extracted from `mode_context` and a 0/1
+// branch; otherwise a single inter_mode_cost lookup is used.
+static int cost_mv_ref(const AV1_COMP *const cpi, PREDICTION_MODE mode,
+                       int16_t mode_context) {
+#if CONFIG_EXT_INTER
+  if (is_inter_compound_mode(mode)) {
+    const int compound_idx = INTER_COMPOUND_OFFSET(mode);
+    return cpi->inter_compound_mode_cost[mode_context][compound_idx];
+  }
+#endif
+
+#if CONFIG_REF_MV
+  const int16_t newmv_ctx = mode_context & NEWMV_CTX_MASK;
+  const int16_t all_zero = mode_context & (1 << ALL_ZERO_FLAG_OFFSET);
+  int16_t ctx;
+  int cost;
+
+  assert(is_inter_mode(mode));
+
+  // First decision: NEWMV (branch 0) versus everything else (branch 1).
+  if (mode == NEWMV) return cpi->newmv_mode_cost[newmv_ctx][0];
+  cost = cpi->newmv_mode_cost[newmv_ctx][1];
+
+  // With the all-zero flag set nothing further is costed.
+  if (all_zero) return cost;
+
+  // Second decision: ZEROMV (branch 0) versus NEARESTMV/NEARMV (branch 1).
+  ctx = (mode_context >> ZEROMV_OFFSET) & ZEROMV_CTX_MASK;
+  if (mode == ZEROMV) return cost + cpi->zeromv_mode_cost[ctx][0];
+  cost += cpi->zeromv_mode_cost[ctx][1];
+
+  // Third decision: NEARESTMV (branch 0) versus NEARMV (branch 1). The skip
+  // flags force fixed contexts; a later flag overrides an earlier one.
+  ctx = (mode_context >> REFMV_OFFSET) & REFMV_CTX_MASK;
+  if (mode_context & (1 << SKIP_NEARESTMV_OFFSET)) ctx = 6;
+  if (mode_context & (1 << SKIP_NEARMV_OFFSET)) ctx = 7;
+  if (mode_context & (1 << SKIP_NEARESTMV_SUB8X8_OFFSET)) ctx = 8;
+
+  return cost + cpi->refmv_mode_cost[ctx][mode != NEARESTMV];
+#else
+  assert(is_inter_mode(mode));
+  return cpi->inter_mode_cost[mode_context][INTER_OFFSET(mode)];
+#endif  // CONFIG_REF_MV
+}
+
+#if CONFIG_EXT_INTER
+// Returns the bit count associated with compound type `comp_type`:
+// 0 for COMPOUND_AVERAGE, get_interinter_wedge_bits(bsize) for COMPOUND_WEDGE
+// (CONFIG_WEDGE), 1 for COMPOUND_SEG (CONFIG_COMPOUND_SEGMENT). Any other
+// value trips an assert and yields 0.
+static int get_interinter_compound_type_bits(BLOCK_SIZE bsize,
+                                             COMPOUND_TYPE comp_type) {
+  (void)bsize;  // only read when CONFIG_WEDGE is enabled
+  if (comp_type == COMPOUND_AVERAGE) return 0;
+#if CONFIG_WEDGE
+  if (comp_type == COMPOUND_WEDGE) return get_interinter_wedge_bits(bsize);
+#endif  // CONFIG_WEDGE
+#if CONFIG_COMPOUND_SEGMENT
+  if (comp_type == COMPOUND_SEG) return 1;
+#endif  // CONFIG_COMPOUND_SEGMENT
+  assert(0);
+  return 0;
+}
+#endif  // CONFIG_EXT_INTER
+/*
+ * Derives the motion vector(s) of sub8x8 block `i` for prediction `mode`,
+ * records them in xd->mi[0]->bmi[] and returns the rate of signalling them.
+ *
+ *   this_mv             out: the chosen MV for each reference (index 1 is
+ *                       only meaningful for compound prediction).
+ *   frame_mv            per-mode, per-reference MVs used for the NEAR/NEAREST
+ *                       style modes.
+ *   seg_mvs             per-reference MVs used for the NEW style modes.
+ *   compound_seg_newmvs (CONFIG_EXT_INTER) preferred over seg_mvs for
+ *                       NEW_NEWMV when neither entry is INVALID_MV.
+ *   best_ref_mv         reference MVs the new MVs are costed against.
+ *   mvjcost, mvcost     MV cost tables; ignored by the CONFIG_REF_MV NEWMV
+ *                       path, which costs with x->nmvjointcost / x->mvcost.
+ *   mi_row, mi_col      only read when CONFIG_GLOBAL_MOTION is enabled.
+ *
+ * Side effects: writes as_mv, as_mode (and pred_mv under CONFIG_REF_MV) of
+ * bmi[i] and copies that entry to every 4x4 unit the block covers.
+ * Returns cost_mv_ref(mode, context) plus the accumulated MV bit cost.
+ */
+static int set_and_cost_bmi_mvs(
+ const AV1_COMP *const cpi, MACROBLOCK *x, MACROBLOCKD *xd, int i,
+ PREDICTION_MODE mode, int_mv this_mv[2],
+ int_mv frame_mv[MB_MODE_COUNT][TOTAL_REFS_PER_FRAME],
+ int_mv seg_mvs[TOTAL_REFS_PER_FRAME],
+#if CONFIG_EXT_INTER
+ int_mv compound_seg_newmvs[2],
+#endif // CONFIG_EXT_INTER
+ int_mv *best_ref_mv[2], const int *mvjcost, int *mvcost[2], int mi_row,
+ int mi_col) {
+ MODE_INFO *const mic = xd->mi[0];
+ const MB_MODE_INFO *const mbmi = &mic->mbmi;
+ const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
+ int thismvcost = 0;
+ int idx, idy;
+ const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[mbmi->sb_type];
+ const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[mbmi->sb_type];
+ const int is_compound = has_second_ref(mbmi);
+ int mode_ctx;
+ (void)mi_row; // only read when CONFIG_GLOBAL_MOTION is enabled
+ (void)mi_col;
+
+ switch (mode) {
+ case NEWMV: this_mv[0].as_int = seg_mvs[mbmi->ref_frame[0]].as_int;
+#if CONFIG_EXT_INTER
+ if (!cpi->common.allow_high_precision_mv)
+ lower_mv_precision(&this_mv[0].as_mv, 0);
+#endif // CONFIG_EXT_INTER
+
+#if CONFIG_REF_MV
+ for (idx = 0; idx < 1 + is_compound; ++idx) {
+ this_mv[idx] = seg_mvs[mbmi->ref_frame[idx]]; // NOTE(review): for idx 0 this replaces the (possibly precision-lowered) value set above - confirm intended
+ av1_set_mvcost(x, mbmi->ref_frame[idx], idx, mbmi->ref_mv_idx);
+ thismvcost +=
+ av1_mv_bit_cost(&this_mv[idx].as_mv, &best_ref_mv[idx]->as_mv,
+ x->nmvjointcost, x->mvcost, MV_COST_WEIGHT_SUB);
+ }
+ (void)mvjcost; // this path costs with x->nmvjointcost / x->mvcost instead
+ (void)mvcost;
+#else
+ thismvcost += av1_mv_bit_cost(&this_mv[0].as_mv, &best_ref_mv[0]->as_mv,
+ mvjcost, mvcost, MV_COST_WEIGHT_SUB);
+#if !CONFIG_EXT_INTER
+ if (is_compound) {
+ this_mv[1].as_int = seg_mvs[mbmi->ref_frame[1]].as_int;
+ thismvcost += av1_mv_bit_cost(&this_mv[1].as_mv, &best_ref_mv[1]->as_mv,
+ mvjcost, mvcost, MV_COST_WEIGHT_SUB);
+ }
+#endif // !CONFIG_EXT_INTER
+#endif // CONFIG_REF_MV
+ break;
+ case NEARMV:
+ case NEARESTMV:
+ this_mv[0].as_int = frame_mv[mode][mbmi->ref_frame[0]].as_int;
+ if (is_compound)
+ this_mv[1].as_int = frame_mv[mode][mbmi->ref_frame[1]].as_int;
+ break;
+ case ZEROMV: {
+ int ref;
+ for (ref = 0; ref < 1 + is_compound; ++ref) {
+#if CONFIG_GLOBAL_MOTION
+ this_mv[ref].as_int =
+ gm_get_motion_vector(
+ &cpi->common.global_motion[mbmi->ref_frame[ref]],
+ cpi->common.allow_high_precision_mv, mbmi->sb_type, mi_col,
+ mi_row, i)
+ .as_int;
+#else
+ this_mv[ref].as_int = 0;
+#endif // CONFIG_GLOBAL_MOTION
+ }
+ break;
+ }
+#if CONFIG_EXT_INTER
+ case NEW_NEWMV:
+ if (compound_seg_newmvs[0].as_int == INVALID_MV ||
+ compound_seg_newmvs[1].as_int == INVALID_MV) {
+ this_mv[0].as_int = seg_mvs[mbmi->ref_frame[0]].as_int;
+ this_mv[1].as_int = seg_mvs[mbmi->ref_frame[1]].as_int;
+ } else {
+ this_mv[0].as_int = compound_seg_newmvs[0].as_int;
+ this_mv[1].as_int = compound_seg_newmvs[1].as_int;
+ }
+ if (!cpi->common.allow_high_precision_mv)
+ lower_mv_precision(&this_mv[0].as_mv, 0);
+ if (!cpi->common.allow_high_precision_mv)
+ lower_mv_precision(&this_mv[1].as_mv, 0);
+#if CONFIG_REF_MV
+ av1_set_mvcost(x, mbmi->ref_frame[0], 0, mbmi->ref_mv_idx);
+#endif
+ thismvcost += av1_mv_bit_cost(&this_mv[0].as_mv, &best_ref_mv[0]->as_mv,
+ mvjcost, mvcost, MV_COST_WEIGHT_SUB);
+#if CONFIG_REF_MV
+ av1_set_mvcost(x, mbmi->ref_frame[1], 1, mbmi->ref_mv_idx);
+#endif
+ thismvcost += av1_mv_bit_cost(&this_mv[1].as_mv, &best_ref_mv[1]->as_mv,
+ mvjcost, mvcost, MV_COST_WEIGHT_SUB);
+ break;
+ case NEW_NEARMV:
+ case NEW_NEARESTMV:
+ this_mv[0].as_int = seg_mvs[mbmi->ref_frame[0]].as_int;
+ if (!cpi->common.allow_high_precision_mv)
+ lower_mv_precision(&this_mv[0].as_mv, 0);
+#if CONFIG_REF_MV
+ av1_set_mvcost(x, mbmi->ref_frame[0], 0, mbmi->ref_mv_idx);
+#endif
+ thismvcost += av1_mv_bit_cost(&this_mv[0].as_mv, &best_ref_mv[0]->as_mv,
+ mvjcost, mvcost, MV_COST_WEIGHT_SUB);
+ this_mv[1].as_int = frame_mv[mode][mbmi->ref_frame[1]].as_int;
+ break;
+ case NEAR_NEWMV:
+ case NEAREST_NEWMV:
+ this_mv[0].as_int = frame_mv[mode][mbmi->ref_frame[0]].as_int;
+ this_mv[1].as_int = seg_mvs[mbmi->ref_frame[1]].as_int;
+ if (!cpi->common.allow_high_precision_mv)
+ lower_mv_precision(&this_mv[1].as_mv, 0);
+#if CONFIG_REF_MV
+ av1_set_mvcost(x, mbmi->ref_frame[1], 1, mbmi->ref_mv_idx);
+#endif
+ thismvcost += av1_mv_bit_cost(&this_mv[1].as_mv, &best_ref_mv[1]->as_mv,
+ mvjcost, mvcost, MV_COST_WEIGHT_SUB);
+ break;
+ case NEAREST_NEARMV:
+ case NEAR_NEARESTMV:
+ case NEAREST_NEARESTMV:
+ case NEAR_NEARMV:
+ this_mv[0].as_int = frame_mv[mode][mbmi->ref_frame[0]].as_int;
+ this_mv[1].as_int = frame_mv[mode][mbmi->ref_frame[1]].as_int;
+ break;
+ case ZERO_ZEROMV:
+#if CONFIG_GLOBAL_MOTION
+ this_mv[0].as_int =
+ gm_get_motion_vector(&cpi->common.global_motion[mbmi->ref_frame[0]],
+ cpi->common.allow_high_precision_mv,
+ mbmi->sb_type, mi_col, mi_row, i)
+ .as_int;
+ this_mv[1].as_int =
+ gm_get_motion_vector(&cpi->common.global_motion[mbmi->ref_frame[1]],
+ cpi->common.allow_high_precision_mv,
+ mbmi->sb_type, mi_col, mi_row, i)
+ .as_int;
+#else
+ this_mv[0].as_int = 0;
+ this_mv[1].as_int = 0;
+#endif // CONFIG_GLOBAL_MOTION
+ break;
+#endif // CONFIG_EXT_INTER
+ default: break;
+ }
+
+ mic->bmi[i].as_mv[0].as_int = this_mv[0].as_int;
+ if (is_compound) mic->bmi[i].as_mv[1].as_int = this_mv[1].as_int;
+
+ mic->bmi[i].as_mode = mode;
+
+#if CONFIG_REF_MV
+ if (mode == NEWMV) {
+ mic->bmi[i].pred_mv[0].as_int =
+ mbmi_ext->ref_mvs[mbmi->ref_frame[0]][0].as_int;
+ if (is_compound)
+ mic->bmi[i].pred_mv[1].as_int =
+ mbmi_ext->ref_mvs[mbmi->ref_frame[1]][0].as_int;
+ } else {
+ mic->bmi[i].pred_mv[0].as_int = this_mv[0].as_int;
+ if (is_compound) mic->bmi[i].pred_mv[1].as_int = this_mv[1].as_int;
+ }
+#endif // CONFIG_REF_MV
+
+ for (idy = 0; idy < num_4x4_blocks_high; ++idy) // replicate bmi[i] over every 4x4 unit the block covers
+ for (idx = 0; idx < num_4x4_blocks_wide; ++idx)
+ memmove(&mic->bmi[i + idy * 2 + idx], &mic->bmi[i], sizeof(mic->bmi[i]));
+
+#if CONFIG_REF_MV
+#if CONFIG_EXT_INTER
+ if (is_compound)
+ mode_ctx = mbmi_ext->compound_mode_context[mbmi->ref_frame[0]];
+ else
+#endif // CONFIG_EXT_INTER
+ mode_ctx = av1_mode_context_analyzer(mbmi_ext->mode_context,
+ mbmi->ref_frame, mbmi->sb_type, i);
+#else // CONFIG_REF_MV
+ mode_ctx = mbmi_ext->mode_context[mbmi->ref_frame[0]];
+#endif // CONFIG_REF_MV
+ return cost_mv_ref(cpi, mode, mode_ctx) + thismvcost;
+}
+/*
+ * Computes the luma rate and distortion of sub8x8 block `i` for the MVs
+ * currently stored in the mode info.
+ *
+ * Builds the inter prediction for the block, subtracts it from the source
+ * into p->src_diff, then walks the transform blocks: each is transformed and
+ * quantized, optimized unless the segment is lossless, and its distortion,
+ * SSE and coefficient rate are accumulated.
+ *
+ *   best_yrd       RD budget; the walk stops as soon as it is reached.
+ *   labelyrate     out: accumulated coefficient rate.
+ *   distortion     out: accumulated distortion.
+ *   sse            out: accumulated SSE.
+ *   ta, tl         above / left entropy contexts; updated in place in the
+ *                  non-PVQ build, unused with CONFIG_PVQ.
+ *   ir, ic         passed through to av1_build_inter_predictor_sub8x8.
+ *
+ * Returns RDCOST(rate, distortion), or INT64_MAX on early exit, in which
+ * case the three outputs are left unwritten.
+ */
+static int64_t encode_inter_mb_segment_sub8x8(
+ const AV1_COMP *const cpi, MACROBLOCK *x, int64_t best_yrd, int i,
+ int *labelyrate, int64_t *distortion, int64_t *sse, ENTROPY_CONTEXT *ta,
+ ENTROPY_CONTEXT *tl, int ir, int ic, int mi_row, int mi_col) {
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *xd = &x->e_mbd;
+ struct macroblockd_plane *const pd = &xd->plane[0];
+ struct macroblock_plane *const p = &x->plane[0];
+ MODE_INFO *const mi = xd->mi[0];
+ const BLOCK_SIZE plane_bsize = get_plane_block_size(mi->mbmi.sb_type, pd);
+ const int txb_width = max_block_wide(xd, plane_bsize, 0);
+ const int txb_height = max_block_high(xd, plane_bsize, 0);
+ const int width = block_size_wide[plane_bsize];
+ const int height = block_size_high[plane_bsize];
+ int idx, idy;
+ const uint8_t *const src =
+ &p->src.buf[av1_raster_block_offset(BLOCK_8X8, i, p->src.stride)];
+ uint8_t *const dst =
+ &pd->dst.buf[av1_raster_block_offset(BLOCK_8X8, i, pd->dst.stride)];
+ int64_t thisdistortion = 0, thissse = 0;
+ int thisrate = 0;
+ TX_SIZE tx_size = mi->mbmi.tx_size;
+ TX_TYPE tx_type = get_tx_type(PLANE_TYPE_Y, xd, i, tx_size);
+ const int num_4x4_w = tx_size_wide_unit[tx_size];
+ const int num_4x4_h = tx_size_high_unit[tx_size];
+#if !CONFIG_PVQ
+ const SCAN_ORDER *scan_order = get_scan(cm, tx_size, tx_type, 1);
+#else
+ (void)cpi;
+ (void)ta;
+ (void)tl;
+ (void)tx_type;
+#endif // !CONFIG_PVQ
+
+#if CONFIG_EXT_TX && CONFIG_RECT_TX
+ assert(IMPLIES(xd->lossless[mi->mbmi.segment_id], tx_size == TX_4X4));
+ assert(IMPLIES(!xd->lossless[mi->mbmi.segment_id],
+ tx_size == max_txsize_rect_lookup[mi->mbmi.sb_type]));
+#else
+ assert(tx_size == TX_4X4);
+#endif // CONFIG_EXT_TX && CONFIG_RECT_TX
+
+ assert(tx_type == DCT_DCT);
+
+ av1_build_inter_predictor_sub8x8(xd, 0, i, ir, ic, mi_row, mi_col);
+
+#if CONFIG_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ aom_highbd_subtract_block(
+ height, width, av1_raster_block_offset_int16(BLOCK_8X8, i, p->src_diff),
+ 8, src, p->src.stride, dst, pd->dst.stride, xd->bd);
+ } else {
+ aom_subtract_block(height, width,
+ av1_raster_block_offset_int16(BLOCK_8X8, i, p->src_diff),
+ 8, src, p->src.stride, dst, pd->dst.stride);
+ }
+#else
+ aom_subtract_block(height, width,
+ av1_raster_block_offset_int16(BLOCK_8X8, i, p->src_diff),
+ 8, src, p->src.stride, dst, pd->dst.stride);
+#endif // CONFIG_HIGHBITDEPTH
+
+ for (idy = 0; idy < txb_height; idy += num_4x4_h) {
+ for (idx = 0; idx < txb_width; idx += num_4x4_w) {
+ int64_t dist, ssz, rd, rd1, rd2;
+ int coeff_ctx;
+ const int k = i + (idy * 2 + idx); // raster index of this transform block within the 2x2 grid
+ const int block = av1_raster_order_to_block_index(tx_size, k);
+ assert(IMPLIES(tx_size == TX_4X8 || tx_size == TX_8X4,
+ idx == 0 && idy == 0));
+ coeff_ctx = combine_entropy_contexts(*(ta + (k & 1)), *(tl + (k >> 1)));
+ av1_xform_quant(cm, x, 0, block, idy + (i >> 1), idx + (i & 0x01),
+ BLOCK_8X8, tx_size, coeff_ctx, AV1_XFORM_QUANT_FP);
+ if (xd->lossless[xd->mi[0]->mbmi.segment_id] == 0)
+ av1_optimize_b(cm, x, 0, block, tx_size, coeff_ctx);
+ av1_dist_block(cpi, x, 0, BLOCK_8X8, block, idy + (i >> 1),
+ idx + (i & 0x1), tx_size, &dist, &ssz,
+ OUTPUT_HAS_PREDICTED_PIXELS);
+ thisdistortion += dist;
+ thissse += ssz;
+#if !CONFIG_PVQ
+ thisrate +=
+ av1_cost_coeffs(cpi, x, 0, block, tx_size, scan_order, (ta + (k & 1)),
+ (tl + (k >> 1)), cpi->sf.use_fast_coef_costing);
+ *(ta + (k & 1)) = !(p->eobs[block] == 0); // context is 1 when eobs[block] is nonzero, else 0
+ *(tl + (k >> 1)) = !(p->eobs[block] == 0);
+#else
+ thisrate += x->rate;
+#endif // !CONFIG_PVQ
+#if CONFIG_EXT_TX
+ if (tx_size == TX_8X4) {
+ *(ta + (k & 1) + 1) = *(ta + (k & 1)); // copy the context to the neighbouring above entry
+ }
+ if (tx_size == TX_4X8) {
+ *(tl + (k >> 1) + 1) = *(tl + (k >> 1)); // copy the context to the neighbouring left entry
+ }
+#endif // CONFIG_EXT_TX
+ rd1 = RDCOST(x->rdmult, x->rddiv, thisrate, thisdistortion);
+ rd2 = RDCOST(x->rdmult, x->rddiv, 0, thissse); // RD cost with zero rate and the SSE as distortion
+ rd = AOMMIN(rd1, rd2);
+ if (rd >= best_yrd) return INT64_MAX; // early out: the running cost already reaches the budget
+ }
+ }
+
+ *distortion = thisdistortion;
+ *labelyrate = thisrate;
+ *sse = thissse;
+
+ return RDCOST(x->rdmult, x->rddiv, *labelyrate, *distortion);
+}
+/*
+ * Rate-distortion record for one sub-block under one candidate mode; an
+ * array of these is held in BEST_SEG_INFO as rdstat[label][mode].
+ * NOTE(review): the code that fills most fields is outside this excerpt, so
+ * the per-field notes marked "presumably" are inferred from names - confirm.
+ */
+typedef struct {
+ int eobs; // presumably the end-of-block position of the coded coefficients
+ int brate; // presumably the total rate of the sub-block
+ int byrate; // presumably the luma coefficient rate of the sub-block
+ int64_t bdist; // presumably the distortion of the sub-block
+ int64_t bsse; // presumably the SSE of the sub-block
+ int64_t brdcost; // presumably the combined RD cost of the sub-block
+ int_mv mvs[2]; // MVs per reference; INVALID_MV marks an unset entry
+#if CONFIG_REF_MV
+ int_mv pred_mv[2]; // compared with the current reference MVs in update_mv_search_and_seg_mvs
+#endif // CONFIG_REF_MV
+#if CONFIG_EXT_INTER
+ int_mv ref_mv[2];
+#endif // CONFIG_EXT_INTER
+
+#if CONFIG_CB4X4
+ ENTROPY_CONTEXT ta[4]; // above entropy contexts (4 entries with CB4X4, else 2)
+ ENTROPY_CONTEXT tl[4]; // left entropy contexts
+#else
+ ENTROPY_CONTEXT ta[2];
+ ENTROPY_CONTEXT tl[2];
+#endif // CONFIG_CB4X4
+} SEG_RDSTAT;
+/*
+ * Working state of the sub8x8 inter mode search. It is zeroed by
+ * rd_pick_inter_best_sub8x8_mode, which then seeds segment_rd, ref_mv, mvp,
+ * mvthresh and modes[] before searching.
+ */
+typedef struct {
+ int_mv *ref_mv[2]; // reference MVs for the first / second reference frame
+ int_mv mvp; // seeded from the first reference MV
+
+ int64_t segment_rd; // seeded with the caller's best RD so far
+ int r; // presumably the total rate of the best segmentation - confirm
+ int64_t d; // presumably its distortion - confirm
+ int64_t sse; // presumably its SSE - confirm
+ int segment_yrate; // presumably its luma rate - confirm
+ PREDICTION_MODE modes[4]; // one mode per sub-block, initialised to ZEROMV
+#if CONFIG_EXT_INTER
+ SEG_RDSTAT rdstat[4][INTER_MODES + INTER_COMPOUND_MODES]; // indexed [sub-block][mode]
+#else
+ SEG_RDSTAT rdstat[4][INTER_MODES]; // indexed [sub-block][mode]
+#endif // CONFIG_EXT_INTER
+ int mvthresh; // threshold copied from the caller
+} BEST_SEG_INFO;
+
+// Returns nonzero when `mv`, with each component shifted right by 3, lies
+// outside the inclusive [min, max] ranges in `mv_limits`; 0 when it is inside.
+// (The shift presumably converts 1/8-pel units to full-pel - confirm.)
+static INLINE int mv_check_bounds(const MvLimits *mv_limits, const MV *mv) {
+  const int row = mv->row >> 3;
+  const int col = mv->col >> 3;
+  const int row_inside = row >= mv_limits->row_min && row <= mv_limits->row_max;
+  const int col_inside = col >= mv_limits->col_min && col <= mv_limits->col_max;
+  return !(row_inside && col_inside);
+}
+
+// Advances the luma source pointer and the luma prediction pointer(s) of `x`
+// to sub8x8 block `i` (the second prediction pointer only for compound
+// prediction). mi_buf_restore() puts the saved buffers back.
+static INLINE void mi_buf_shift(MACROBLOCK *x, int i) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  struct buf_2d *const src = &x->plane[0].src;
+  struct buf_2d *const pre = xd->plane[0].pre;
+
+  src->buf += av1_raster_block_offset(BLOCK_8X8, i, src->stride);
+  // The first prediction buffer must be 8-byte aligned before it is shifted.
+  assert(((intptr_t)pre[0].buf & 0x7) == 0);
+  pre[0].buf += av1_raster_block_offset(BLOCK_8X8, i, pre[0].stride);
+  if (has_second_ref(&xd->mi[0]->mbmi))
+    pre[1].buf += av1_raster_block_offset(BLOCK_8X8, i, pre[1].stride);
+}
+
+// Puts back the luma source buffer and prediction buffer(s) saved by the
+// caller; the second prediction buffer is only restored for compound
+// prediction.
+static INLINE void mi_buf_restore(MACROBLOCK *x, struct buf_2d orig_src,
+                                  struct buf_2d orig_pre[2]) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  struct macroblockd_plane *const pd = &xd->plane[0];
+  const int is_compound = has_second_ref(&xd->mi[0]->mbmi);
+
+  x->plane[0].src = orig_src;
+  pd->pre[0] = orig_pre[0];
+  if (is_compound) pd->pre[1] = orig_pre[1];
+}
+
+// Check if NEARESTMV/NEARMV/ZEROMV is the cheapest way to encode zero motion.
+/* TODO(aconverse): Find out if this is still productive then clean up or
+ * remove.
+ *
+ * Applies only when the MV(s) in frame_mv[this_mode] equal the "zero" vector
+ * for every reference in use: 0, or for ZEROMV / ZERO_ZEROMV under
+ * CONFIG_GLOBAL_MOTION the global-motion vector. In that case the
+ * cost_mv_ref() signalling costs of the competing modes are compared:
+ *   - a NEAR/NEAREST style mode returns 0 if it costs more than ZEROMV
+ *     (ZERO_ZEROMV for compound modes);
+ *   - ZEROMV (ZERO_ZEROMV) returns 0 if a NEAR/NEAREST style mode whose MVs
+ *     are exactly 0 costs no more.
+ * Returns 1 in every other case. Presumably 0 tells the caller that
+ * this_mode is redundant and can be skipped - confirm against the callers.
+ */
+static int check_best_zero_mv(
+ const AV1_COMP *const cpi, const int16_t mode_context[TOTAL_REFS_PER_FRAME],
+#if CONFIG_REF_MV && CONFIG_EXT_INTER
+ const int16_t compound_mode_context[TOTAL_REFS_PER_FRAME],
+#endif // CONFIG_REF_MV && CONFIG_EXT_INTER
+ int_mv frame_mv[MB_MODE_COUNT][TOTAL_REFS_PER_FRAME], int this_mode,
+ const MV_REFERENCE_FRAME ref_frames[2], const BLOCK_SIZE bsize, int block,
+ int mi_row, int mi_col) {
+ int_mv zeromv[2];
+ int comp_pred_mode = ref_frames[1] > INTRA_FRAME;
+ int cur_frm;
+ (void)mi_row; // only read when CONFIG_GLOBAL_MOTION is enabled
+ (void)mi_col;
+ for (cur_frm = 0; cur_frm < 1 + comp_pred_mode; cur_frm++) {
+#if CONFIG_GLOBAL_MOTION
+ if (this_mode == ZEROMV
+#if CONFIG_EXT_INTER
+ || this_mode == ZERO_ZEROMV
+#endif // CONFIG_EXT_INTER
+ )
+ zeromv[cur_frm].as_int =
+ gm_get_motion_vector(&cpi->common.global_motion[ref_frames[cur_frm]],
+ cpi->common.allow_high_precision_mv, bsize,
+ mi_col, mi_row, block)
+ .as_int;
+ else
+#endif // CONFIG_GLOBAL_MOTION
+ zeromv[cur_frm].as_int = 0;
+ }
+#if !CONFIG_EXT_INTER
+ assert(ref_frames[1] != INTRA_FRAME); // Just sanity check
+#endif // !CONFIG_EXT_INTER
+ if ((this_mode == NEARMV || this_mode == NEARESTMV || this_mode == ZEROMV) &&
+ frame_mv[this_mode][ref_frames[0]].as_int == zeromv[0].as_int &&
+ (ref_frames[1] <= INTRA_FRAME ||
+ frame_mv[this_mode][ref_frames[1]].as_int == zeromv[1].as_int)) {
+#if CONFIG_REF_MV
+ int16_t rfc =
+ av1_mode_context_analyzer(mode_context, ref_frames, bsize, block);
+#else
+ int16_t rfc = mode_context[ref_frames[0]];
+#endif // CONFIG_REF_MV
+ int c1 = cost_mv_ref(cpi, NEARMV, rfc);
+ int c2 = cost_mv_ref(cpi, NEARESTMV, rfc);
+ int c3 = cost_mv_ref(cpi, ZEROMV, rfc);
+
+#if !CONFIG_REF_MV
+ (void)bsize;
+ (void)block;
+#endif // !CONFIG_REF_MV
+
+ if (this_mode == NEARMV) {
+ if (c1 > c3) return 0;
+ } else if (this_mode == NEARESTMV) {
+ if (c2 > c3) return 0;
+ } else {
+ assert(this_mode == ZEROMV);
+ if (ref_frames[1] <= INTRA_FRAME) {
+ if ((c3 >= c2 && frame_mv[NEARESTMV][ref_frames[0]].as_int == 0) ||
+ (c3 >= c1 && frame_mv[NEARMV][ref_frames[0]].as_int == 0))
+ return 0;
+ } else {
+ if ((c3 >= c2 && frame_mv[NEARESTMV][ref_frames[0]].as_int == 0 &&
+ frame_mv[NEARESTMV][ref_frames[1]].as_int == 0) ||
+ (c3 >= c1 && frame_mv[NEARMV][ref_frames[0]].as_int == 0 &&
+ frame_mv[NEARMV][ref_frames[1]].as_int == 0))
+ return 0;
+ }
+ }
+ }
+#if CONFIG_EXT_INTER
+ else if ((this_mode == NEAREST_NEARESTMV || this_mode == NEAREST_NEARMV ||
+ this_mode == NEAR_NEARESTMV || this_mode == NEAR_NEARMV ||
+ this_mode == ZERO_ZEROMV) &&
+ frame_mv[this_mode][ref_frames[0]].as_int == zeromv[0].as_int &&
+ frame_mv[this_mode][ref_frames[1]].as_int == zeromv[1].as_int) {
+#if CONFIG_REF_MV
+ int16_t rfc = compound_mode_context[ref_frames[0]];
+#else
+ int16_t rfc = mode_context[ref_frames[0]];
+#endif // CONFIG_REF_MV
+ int c1 = cost_mv_ref(cpi, NEAREST_NEARMV, rfc);
+ int c2 = cost_mv_ref(cpi, NEAREST_NEARESTMV, rfc);
+ int c3 = cost_mv_ref(cpi, ZERO_ZEROMV, rfc);
+ int c4 = cost_mv_ref(cpi, NEAR_NEARESTMV, rfc);
+ int c5 = cost_mv_ref(cpi, NEAR_NEARMV, rfc);
+
+ if (this_mode == NEAREST_NEARMV) {
+ if (c1 > c3) return 0;
+ } else if (this_mode == NEAREST_NEARESTMV) {
+ if (c2 > c3) return 0;
+ } else if (this_mode == NEAR_NEARESTMV) {
+ if (c4 > c3) return 0;
+ } else if (this_mode == NEAR_NEARMV) {
+ if (c5 > c3) return 0;
+ } else {
+ assert(this_mode == ZERO_ZEROMV);
+ if ((c3 >= c2 && frame_mv[NEAREST_NEARESTMV][ref_frames[0]].as_int == 0 &&
+ frame_mv[NEAREST_NEARESTMV][ref_frames[1]].as_int == 0) ||
+ (c3 >= c1 && frame_mv[NEAREST_NEARMV][ref_frames[0]].as_int == 0 &&
+ frame_mv[NEAREST_NEARMV][ref_frames[1]].as_int == 0) ||
+ (c3 >= c5 && frame_mv[NEAR_NEARMV][ref_frames[0]].as_int == 0 &&
+ frame_mv[NEAR_NEARMV][ref_frames[1]].as_int == 0) ||
+ (c3 >= c4 && frame_mv[NEAR_NEARESTMV][ref_frames[0]].as_int == 0 &&
+ frame_mv[NEAR_NEARESTMV][ref_frames[1]].as_int == 0))
+ return 0;
+ }
+ }
+#endif // CONFIG_EXT_INTER
+ return 1;
+}
+/*
+ * Jointly refines the two motion vectors of a compound-predicted block.
+ *
+ * Runs up to four iterations that alternate between the two reference
+ * frames. Each iteration builds the prediction from the other reference into
+ * second_pred, then searches the current reference against it: a small-range
+ * full-pixel refinement followed by a fractional-pel search. A result is
+ * kept only if it lowers the error recorded for that reference; otherwise
+ * the loop stops.
+ *
+ *   frame_mv        in/out: MVs indexed by reference frame; the entries for
+ *                   the block's two references are the search start points
+ *                   and receive the refined vectors.
+ *   ref_mv_sub8x8   (CONFIG_EXT_INTER) reference MVs used instead of
+ *                   x->mbmi_ext->ref_mvs for blocks below BLOCK_8X8 when
+ *                   CONFIG_CB4X4 is off.
+ *   rate_mv         out: summed bit cost of the two final MVs.
+ *   block           sub8x8 block index within the 8x8 area.
+ *
+ * If a scaled copy of a reference frame exists, the prediction planes are
+ * switched to it for the duration of the search and restored before return.
+ * x->mv_limits is narrowed per iteration and restored afterwards.
+ */
+static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, int_mv *frame_mv, int mi_row,
+ int mi_col,
+#if CONFIG_EXT_INTER
+ int_mv *ref_mv_sub8x8[2],
+#endif // CONFIG_EXT_INTER
+ int *rate_mv, const int block) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int pw = block_size_wide[bsize];
+ const int ph = block_size_high[bsize];
+ MACROBLOCKD *xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+ // This function should only ever be called for compound modes
+ assert(has_second_ref(mbmi));
+ const int refs[2] = { mbmi->ref_frame[0], mbmi->ref_frame[1] };
+ int_mv ref_mv[2];
+ int ite, ref;
+#if CONFIG_DUAL_FILTER
+ InterpFilter interp_filter[4] = {
+ mbmi->interp_filter[0], mbmi->interp_filter[1], mbmi->interp_filter[2],
+ mbmi->interp_filter[3],
+ };
+#else
+ const InterpFilter interp_filter = mbmi->interp_filter;
+#endif // CONFIG_DUAL_FILTER
+ struct scale_factors sf;
+ struct macroblockd_plane *const pd = &xd->plane[0];
+#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+ // ic and ir are the 4x4 coordinates of the sub8x8 at index "block"
+ const int ic = block & 1;
+ const int ir = (block - ic) >> 1;
+ const int p_col = ((mi_col * MI_SIZE) >> pd->subsampling_x) + 4 * ic;
+ const int p_row = ((mi_row * MI_SIZE) >> pd->subsampling_y) + 4 * ir;
+#if CONFIG_GLOBAL_MOTION
+ int is_global[2];
+ for (ref = 0; ref < 2; ++ref) {
+ WarpedMotionParams *const wm =
+ &xd->global_motion[xd->mi[0]->mbmi.ref_frame[ref]];
+ is_global[ref] = is_global_mv_block(xd->mi[0], block, wm->wmtype);
+ }
+#endif // CONFIG_GLOBAL_MOTION
+#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+
+ // Do joint motion search in compound mode to get more accurate mv.
+ struct buf_2d backup_yv12[2][MAX_MB_PLANE];
+ int last_besterr[2] = { INT_MAX, INT_MAX };
+ const YV12_BUFFER_CONFIG *const scaled_ref_frame[2] = {
+ av1_get_scaled_ref_frame(cpi, refs[0]),
+ av1_get_scaled_ref_frame(cpi, refs[1])
+ };
+
+// Prediction buffer from second frame.
+#if CONFIG_HIGHBITDEPTH
+ DECLARE_ALIGNED(16, uint16_t, second_pred_alloc_16[MAX_SB_SQUARE]);
+ uint8_t *second_pred;
+#else
+ DECLARE_ALIGNED(16, uint8_t, second_pred[MAX_SB_SQUARE]);
+#endif // CONFIG_HIGHBITDEPTH
+
+#if CONFIG_EXT_INTER && CONFIG_CB4X4
+ (void)ref_mv_sub8x8;
+#endif // CONFIG_EXT_INTER && CONFIG_CB4X4
+
+ for (ref = 0; ref < 2; ++ref) {
+#if CONFIG_EXT_INTER && !CONFIG_CB4X4
+ if (bsize < BLOCK_8X8 && ref_mv_sub8x8 != NULL)
+ ref_mv[ref].as_int = ref_mv_sub8x8[ref]->as_int;
+ else
+#endif // CONFIG_EXT_INTER && !CONFIG_CB4X4
+ ref_mv[ref] = x->mbmi_ext->ref_mvs[refs[ref]][0];
+
+ if (scaled_ref_frame[ref]) {
+ int i;
+ // Swap out the reference frame for a version that's been scaled to
+ // match the resolution of the current frame, allowing the existing
+ // motion search code to be used without additional modifications.
+ for (i = 0; i < MAX_MB_PLANE; i++)
+ backup_yv12[ref][i] = xd->plane[i].pre[ref];
+ av1_setup_pre_planes(xd, ref, scaled_ref_frame[ref], mi_row, mi_col,
+ NULL);
+ }
+ }
+
+// Since we have scaled the reference frames to match the size of the current
+// frame we must use a unit scaling factor during mode selection.
+#if CONFIG_HIGHBITDEPTH
+ av1_setup_scale_factors_for_frame(&sf, cm->width, cm->height, cm->width,
+ cm->height, cm->use_highbitdepth);
+#else
+ av1_setup_scale_factors_for_frame(&sf, cm->width, cm->height, cm->width,
+ cm->height);
+#endif // CONFIG_HIGHBITDEPTH
+
+ // Allow joint search multiple times iteratively for each reference frame
+ // and break out of the search loop if it couldn't find a better mv.
+ for (ite = 0; ite < 4; ite++) {
+ struct buf_2d ref_yv12[2];
+ int bestsme = INT_MAX;
+ int sadpb = x->sadperbit16;
+ MV *const best_mv = &x->best_mv.as_mv;
+ int search_range = 3;
+
+ MvLimits tmp_mv_limits = x->mv_limits; // saved so the narrowed limits can be undone below
+ int id = ite % 2; // Even iterations search in the first reference frame,
+ // odd iterations search in the second. The predictor
+ // found for the 'other' reference frame is factored in.
+ const int plane = 0;
+ ConvolveParams conv_params = get_conv_params(0, plane);
+#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+ WarpTypesAllowed warp_types;
+#if CONFIG_GLOBAL_MOTION
+ warp_types.global_warp_allowed = is_global[!id];
+#endif // CONFIG_GLOBAL_MOTION
+#if CONFIG_WARPED_MOTION
+ warp_types.local_warp_allowed = mbmi->motion_mode == WARPED_CAUSAL;
+#endif // CONFIG_WARPED_MOTION
+#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+
+ // Initialized here because of compiler problem in Visual Studio.
+ ref_yv12[0] = xd->plane[plane].pre[0];
+ ref_yv12[1] = xd->plane[plane].pre[1];
+
+#if CONFIG_DUAL_FILTER
+ // reload the filter types
+ interp_filter[0] =
+ (id == 0) ? mbmi->interp_filter[2] : mbmi->interp_filter[0];
+ interp_filter[1] =
+ (id == 0) ? mbmi->interp_filter[3] : mbmi->interp_filter[1];
+#endif // CONFIG_DUAL_FILTER
+
+// Get the prediction block from the 'other' reference frame.
+#if CONFIG_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ second_pred = CONVERT_TO_BYTEPTR(second_pred_alloc_16);
+ av1_highbd_build_inter_predictor(
+ ref_yv12[!id].buf, ref_yv12[!id].stride, second_pred, pw,
+ &frame_mv[refs[!id]].as_mv, &sf, pw, ph, 0, interp_filter,
+#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+ &warp_types, p_col, p_row,
+#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+ plane, MV_PRECISION_Q3, mi_col * MI_SIZE, mi_row * MI_SIZE, xd);
+ } else {
+ second_pred = (uint8_t *)second_pred_alloc_16; // 8-bit path reuses the same storage as a byte buffer
+#endif // CONFIG_HIGHBITDEPTH
+ av1_build_inter_predictor(
+ ref_yv12[!id].buf, ref_yv12[!id].stride, second_pred, pw,
+ &frame_mv[refs[!id]].as_mv, &sf, pw, ph, &conv_params, interp_filter,
+#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+ &warp_types, p_col, p_row, plane, !id,
+#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+ MV_PRECISION_Q3, mi_col * MI_SIZE, mi_row * MI_SIZE, xd);
+#if CONFIG_HIGHBITDEPTH
+ }
+#endif // CONFIG_HIGHBITDEPTH
+
+ // Do compound motion search on the current reference frame.
+ if (id) xd->plane[plane].pre[0] = ref_yv12[id];
+ av1_set_mv_search_range(&x->mv_limits, &ref_mv[id].as_mv);
+
+ // Use the mv result from the single mode as mv predictor.
+ *best_mv = frame_mv[refs[id]].as_mv;
+
+ best_mv->col >>= 3;
+ best_mv->row >>= 3;
+
+#if CONFIG_REF_MV
+ av1_set_mvcost(x, refs[id], id, mbmi->ref_mv_idx);
+#endif // CONFIG_REF_MV
+
+ // Small-range full-pixel motion search.
+ bestsme =
+ av1_refining_search_8p_c(x, sadpb, search_range, &cpi->fn_ptr[bsize],
+ &ref_mv[id].as_mv, second_pred);
+ if (bestsme < INT_MAX)
+ bestsme = av1_get_mvpred_av_var(x, best_mv, &ref_mv[id].as_mv,
+ second_pred, &cpi->fn_ptr[bsize], 1);
+
+ x->mv_limits = tmp_mv_limits;
+
+ if (bestsme < INT_MAX) {
+ int dis; /* TODO: use dis in distortion calculation later. */
+ unsigned int sse;
+ if (cpi->sf.use_upsampled_references) {
+ // Use up-sampled reference frames.
+ struct buf_2d backup_pred = pd->pre[0];
+ const YV12_BUFFER_CONFIG *upsampled_ref =
+ get_upsampled_ref(cpi, refs[id]);
+
+ // Set pred for Y plane
+ setup_pred_plane(&pd->pre[0], bsize, upsampled_ref->y_buffer,
+ upsampled_ref->y_crop_width,
+ upsampled_ref->y_crop_height, upsampled_ref->y_stride,
+ (mi_row << 3), (mi_col << 3), NULL, pd->subsampling_x,
+ pd->subsampling_y);
+
+// If bsize < BLOCK_8X8, adjust pred pointer for this block
+#if !CONFIG_CB4X4
+ if (bsize < BLOCK_8X8)
+ pd->pre[0].buf =
+ &pd->pre[0].buf[(av1_raster_block_offset(BLOCK_8X8, block,
+ pd->pre[0].stride))
+ << 3];
+#endif // !CONFIG_CB4X4
+
+ bestsme = cpi->find_fractional_mv_step(
+ x, &ref_mv[id].as_mv, cpi->common.allow_high_precision_mv,
+ x->errorperbit, &cpi->fn_ptr[bsize], 0,
+ cpi->sf.mv.subpel_iters_per_step, NULL, x->nmvjointcost, x->mvcost,
+ &dis, &sse, second_pred, pw, ph, 1);
+
+ // Restore the reference frames.
+ pd->pre[0] = backup_pred;
+ } else {
+ (void)block;
+ bestsme = cpi->find_fractional_mv_step(
+ x, &ref_mv[id].as_mv, cpi->common.allow_high_precision_mv,
+ x->errorperbit, &cpi->fn_ptr[bsize], 0,
+ cpi->sf.mv.subpel_iters_per_step, NULL, x->nmvjointcost, x->mvcost,
+ &dis, &sse, second_pred, pw, ph, 0);
+ }
+ }
+
+ // Restore the pointer to the first (possibly scaled) prediction buffer.
+ if (id) xd->plane[plane].pre[0] = ref_yv12[0];
+
+ if (bestsme < last_besterr[id]) { // keep the new MV only if it improves on this reference's last error
+ frame_mv[refs[id]].as_mv = *best_mv;
+ last_besterr[id] = bestsme;
+ } else {
+ break;
+ }
+ }
+
+ *rate_mv = 0;
+
+ for (ref = 0; ref < 2; ++ref) {
+ if (scaled_ref_frame[ref]) {
+ // Restore the prediction frame pointers to their unscaled versions.
+ int i;
+ for (i = 0; i < MAX_MB_PLANE; i++)
+ xd->plane[i].pre[ref] = backup_yv12[ref][i];
+ }
+#if CONFIG_REF_MV
+ av1_set_mvcost(x, refs[ref], ref, mbmi->ref_mv_idx);
+#endif // CONFIG_REF_MV
+#if CONFIG_EXT_INTER && !CONFIG_CB4X4
+ if (bsize >= BLOCK_8X8)
+#endif // CONFIG_EXT_INTER && !CONFIG_CB4X4
+ *rate_mv += av1_mv_bit_cost(&frame_mv[refs[ref]].as_mv,
+ &x->mbmi_ext->ref_mvs[refs[ref]][0].as_mv,
+ x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
+#if CONFIG_EXT_INTER && !CONFIG_CB4X4
+ else
+ *rate_mv += av1_mv_bit_cost(&frame_mv[refs[ref]].as_mv,
+ &ref_mv_sub8x8[ref]->as_mv, x->nmvjointcost,
+ x->mvcost, MV_COST_WEIGHT);
+#endif // CONFIG_EXT_INTER && !CONFIG_CB4X4
+ }
+}
+
+#if CONFIG_REF_MV && !CONFIG_EXT_INTER
+// Decides whether a motion search can be skipped by reusing the result kept
+// in `ref_rdstat`.
+// Single reference: if the recorded predictor equals the current reference MV
+// and a valid MV was recorded, *run_mv_search is cleared and the recorded MV
+// is copied into seg_mvs.
+// Compound: *run_mv_search is decremented once per reference whose seg_mvs
+// entry equals the recorded (valid) MV and whose recorded predictor equals the
+// current reference MV.
+static void update_mv_search_and_seg_mvs(
+    int *const run_mv_search, int_mv *const seg_mvs, int has_second_rf,
+    const MV_REFERENCE_FRAME *const ref_frame,
+    const SEG_RDSTAT *const ref_rdstat, int_mv *const bsi_ref_mv[2]) {
+  int ref;
+
+  if (!has_second_rf) {
+    if (bsi_ref_mv[0]->as_int == ref_rdstat->pred_mv[0].as_int &&
+        ref_rdstat->mvs[0].as_int != INVALID_MV) {
+      *run_mv_search = 0;
+      seg_mvs[ref_frame[0]].as_int = ref_rdstat->mvs[0].as_int;
+    }
+    return;
+  }
+
+  for (ref = 0; ref < 2; ++ref) {
+    if (seg_mvs[ref_frame[ref]].as_int == ref_rdstat->mvs[ref].as_int &&
+        ref_rdstat->mvs[ref].as_int != INVALID_MV &&
+        bsi_ref_mv[ref]->as_int == ref_rdstat->pred_mv[ref].as_int)
+      --*run_mv_search;
+  }
+}
+#endif  // CONFIG_REF_MV && !CONFIG_EXT_INTER
+
+static int64_t rd_pick_inter_best_sub8x8_mode(
+ const AV1_COMP *const cpi, MACROBLOCK *x, int_mv *best_ref_mv,
+ int_mv *second_best_ref_mv, int64_t best_rd, int *returntotrate,
+ int *returnyrate, int64_t *returndistortion, int *skippable, int64_t *psse,
+ int mvthresh, int_mv seg_mvs[4][TOTAL_REFS_PER_FRAME],
+#if CONFIG_EXT_INTER
+ int_mv compound_seg_newmvs[4][2],
+#endif // CONFIG_EXT_INTER
+ BEST_SEG_INFO *bsi_buf, int filter_idx, int mi_row, int mi_col) {
+ BEST_SEG_INFO *bsi = bsi_buf + filter_idx;
+#if CONFIG_REF_MV
+ int_mv tmp_ref_mv[2];
+#endif // CONFIG_REF_MV
+ MACROBLOCKD *xd = &x->e_mbd;
+ MODE_INFO *mi = xd->mi[0];
+ MB_MODE_INFO *mbmi = &mi->mbmi;
+ int mode_idx;
+ int k, br = 0, idx, idy;
+ int64_t bd = 0, block_sse = 0;
+ PREDICTION_MODE this_mode;
+ const AV1_COMMON *cm = &cpi->common;
+ struct macroblock_plane *const p = &x->plane[0];
+ struct macroblockd_plane *const pd = &xd->plane[0];
+ const int label_count = 4;
+ int64_t this_segment_rd = 0;
+ int label_mv_thresh;
+ int segmentyrate = 0;
+ const BLOCK_SIZE bsize = mbmi->sb_type;
+ const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
+ const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
+#if CONFIG_CB4X4
+ ENTROPY_CONTEXT t_above[4], t_left[4];
+#else
+ ENTROPY_CONTEXT t_above[2], t_left[2];
+#endif // CONFIG_CB4X4
+ int subpelmv = 1, have_ref = 0;
+ const int has_second_rf = has_second_ref(mbmi);
+ const int inter_mode_mask = cpi->sf.inter_mode_mask[bsize];
+ MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
+#if CONFIG_PVQ
+ od_rollback_buffer pre_buf;
+
+ od_encode_checkpoint(&x->daala_enc, &pre_buf);
+#endif // CONFIG_PVQ
+#if CONFIG_EXT_TX && CONFIG_RECT_TX
+ mbmi->tx_size =
+ xd->lossless[mbmi->segment_id] ? TX_4X4 : max_txsize_rect_lookup[bsize];
+#else
+ mbmi->tx_size = TX_4X4;
+#endif // CONFIG_EXT_TX && CONFIG_RECT_TX
+
+ av1_zero(*bsi);
+
+ bsi->segment_rd = best_rd;
+ bsi->ref_mv[0] = best_ref_mv;
+ bsi->ref_mv[1] = second_best_ref_mv;
+ bsi->mvp.as_int = best_ref_mv->as_int;
+ bsi->mvthresh = mvthresh;
+
+ for (idx = 0; idx < 4; ++idx) bsi->modes[idx] = ZEROMV;
+
+#if CONFIG_REF_MV
+ for (idx = 0; idx < 4; ++idx) {
+ for (k = NEARESTMV; k <= NEWMV; ++k) {
+ bsi->rdstat[idx][INTER_OFFSET(k)].pred_mv[0].as_int = INVALID_MV;
+ bsi->rdstat[idx][INTER_OFFSET(k)].pred_mv[1].as_int = INVALID_MV;
+
+ bsi->rdstat[idx][INTER_OFFSET(k)].mvs[0].as_int = INVALID_MV;
+ bsi->rdstat[idx][INTER_OFFSET(k)].mvs[1].as_int = INVALID_MV;
+ }
+ }
+#endif // CONFIG_REF_MV
+
+ memcpy(t_above, pd->above_context, sizeof(t_above));
+ memcpy(t_left, pd->left_context, sizeof(t_left));
+
+ // 64 makes this threshold really big effectively
+ // making it so that we very rarely check mvs on
+ // segments. setting this to 1 would make mv thresh
+ // roughly equal to what it is for macroblocks
+ label_mv_thresh = 1 * bsi->mvthresh / label_count;
+
+ // Segmentation method overheads
+ for (idy = 0; idy < 2; idy += num_4x4_blocks_high) {
+ for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) {
+ // TODO(jingning,rbultje): rewrite the rate-distortion optimization
+ // loop for 4x4/4x8/8x4 block coding. to be replaced with new rd loop
+ int_mv mode_mv[MB_MODE_COUNT][2];
+ int_mv frame_mv[MB_MODE_COUNT][TOTAL_REFS_PER_FRAME];
+ PREDICTION_MODE mode_selected = ZEROMV;
+ int64_t new_best_rd = INT64_MAX;
+ const int index = idy * 2 + idx;
+ int ref;
+#if CONFIG_REF_MV
+ CANDIDATE_MV ref_mv_stack[2][MAX_REF_MV_STACK_SIZE];
+ uint8_t ref_mv_count[2];
+#endif // CONFIG_REF_MV
+#if CONFIG_EXT_INTER
+ int_mv ref_mvs_sub8x8[2][2];
+#endif // CONFIG_EXT_INTER
+#if CONFIG_PVQ
+ od_rollback_buffer idx_buf, post_buf;
+ od_encode_checkpoint(&x->daala_enc, &idx_buf);
+ od_encode_checkpoint(&x->daala_enc, &post_buf);
+#endif // CONFIG_PVQ
+
+ for (ref = 0; ref < 1 + has_second_rf; ++ref) {
+ const MV_REFERENCE_FRAME frame = mbmi->ref_frame[ref];
+#if CONFIG_EXT_INTER
+ int_mv mv_ref_list[MAX_MV_REF_CANDIDATES];
+ av1_update_mv_context(cm, xd, mi, frame, mv_ref_list, index, mi_row,
+ mi_col, NULL);
+#endif // CONFIG_EXT_INTER
+#if CONFIG_GLOBAL_MOTION
+ frame_mv[ZEROMV][frame].as_int =
+ gm_get_motion_vector(&cm->global_motion[frame],
+ cm->allow_high_precision_mv, mbmi->sb_type,
+ mi_col, mi_row, index)
+ .as_int;
+#else // CONFIG_GLOBAL_MOTION
+ frame_mv[ZEROMV][frame].as_int = 0;
+#endif // CONFIG_GLOBAL_MOTION
+ av1_append_sub8x8_mvs_for_idx(cm, xd, index, ref, mi_row, mi_col,
+#if CONFIG_REF_MV
+ ref_mv_stack[ref], &ref_mv_count[ref],
+#endif // CONFIG_REF_MV
+#if CONFIG_EXT_INTER
+ mv_ref_list,
+#endif // CONFIG_EXT_INTER
+ &frame_mv[NEARESTMV][frame],
+ &frame_mv[NEARMV][frame]);
+
+#if CONFIG_REF_MV
+ tmp_ref_mv[ref] = frame_mv[NEARESTMV][mbmi->ref_frame[ref]];
+ lower_mv_precision(&tmp_ref_mv[ref].as_mv, cm->allow_high_precision_mv);
+ bsi->ref_mv[ref] = &tmp_ref_mv[ref];
+ mbmi_ext->ref_mvs[frame][0] = tmp_ref_mv[ref];
+#endif // CONFIG_REF_MV
+
+#if CONFIG_EXT_INTER
+ mv_ref_list[0].as_int = frame_mv[NEARESTMV][frame].as_int;
+ mv_ref_list[1].as_int = frame_mv[NEARMV][frame].as_int;
+ av1_find_best_ref_mvs(cm->allow_high_precision_mv, mv_ref_list,
+ &ref_mvs_sub8x8[0][ref], &ref_mvs_sub8x8[1][ref]);
+
+ if (has_second_rf) {
+#if CONFIG_GLOBAL_MOTION
+ frame_mv[ZERO_ZEROMV][frame].as_int =
+ gm_get_motion_vector(&cm->global_motion[frame],
+ cm->allow_high_precision_mv, mbmi->sb_type,
+ mi_col, mi_row, index)
+ .as_int;
+#else
+ frame_mv[ZERO_ZEROMV][frame].as_int = 0;
+#endif // CONFIG_GLOBAL_MOTION
+ frame_mv[NEAREST_NEARESTMV][frame].as_int =
+ frame_mv[NEARESTMV][frame].as_int;
+
+ if (ref == 0) {
+ frame_mv[NEAREST_NEARMV][frame].as_int =
+ frame_mv[NEARESTMV][frame].as_int;
+ frame_mv[NEAR_NEARESTMV][frame].as_int =
+ frame_mv[NEARMV][frame].as_int;
+ frame_mv[NEAREST_NEWMV][frame].as_int =
+ frame_mv[NEARESTMV][frame].as_int;
+ frame_mv[NEAR_NEWMV][frame].as_int = frame_mv[NEARMV][frame].as_int;
+ frame_mv[NEAR_NEARMV][frame].as_int =
+ frame_mv[NEARMV][frame].as_int;
+ } else if (ref == 1) {
+ frame_mv[NEAREST_NEARMV][frame].as_int =
+ frame_mv[NEARMV][frame].as_int;
+ frame_mv[NEAR_NEARESTMV][frame].as_int =
+ frame_mv[NEARESTMV][frame].as_int;
+ frame_mv[NEW_NEARESTMV][frame].as_int =
+ frame_mv[NEARESTMV][frame].as_int;
+ frame_mv[NEW_NEARMV][frame].as_int = frame_mv[NEARMV][frame].as_int;
+ frame_mv[NEAR_NEARMV][frame].as_int =
+ frame_mv[NEARMV][frame].as_int;
+ }
+ }
+#endif // CONFIG_EXT_INTER
+ }
+
+// search for the best motion vector on this segment
+#if CONFIG_EXT_INTER
+ for (this_mode = (has_second_rf ? NEAREST_NEARESTMV : NEARESTMV);
+ this_mode <= (has_second_rf ? NEW_NEWMV : NEWMV); ++this_mode)
+#else
+ for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode)
+#endif // CONFIG_EXT_INTER
+ {
+ const struct buf_2d orig_src = x->plane[0].src;
+ struct buf_2d orig_pre[2];
+ // This flag controls if the motion estimation will kick off. When it
+ // is set to a non-zero value, the encoder will force motion estimation.
+ int run_mv_search = 0;
+
+ mode_idx = INTER_OFFSET(this_mode);
+#if CONFIG_EXT_INTER
+ for (ref = 0; ref < 1 + has_second_rf; ++ref)
+ bsi->ref_mv[ref]->as_int = ref_mvs_sub8x8[0][ref].as_int;
+#endif // CONFIG_EXT_INTER
+ bsi->rdstat[index][mode_idx].brdcost = INT64_MAX;
+ if (!(inter_mode_mask & (1 << this_mode))) continue;
+
+#if CONFIG_REF_MV
+ run_mv_search = 2;
+#if !CONFIG_EXT_INTER
+ if (filter_idx > 0 && this_mode == NEWMV) {
+ const BEST_SEG_INFO *ref_bsi = bsi_buf;
+ const SEG_RDSTAT *ref_rdstat = &ref_bsi->rdstat[index][mode_idx];
+
+ update_mv_search_and_seg_mvs(&run_mv_search, seg_mvs[index],
+ has_second_rf, mbmi->ref_frame,
+ ref_rdstat, bsi->ref_mv);
+
+ if (run_mv_search != 0 && filter_idx > 1) {
+ ref_bsi = bsi_buf + 1;
+ ref_rdstat = &ref_bsi->rdstat[index][mode_idx];
+ run_mv_search = 2;
+ update_mv_search_and_seg_mvs(&run_mv_search, seg_mvs[index],
+ has_second_rf, mbmi->ref_frame,
+ ref_rdstat, bsi->ref_mv);
+ }
+ }
+#endif // !CONFIG_EXT_INTER
+#endif // CONFIG_REF_MV
+
+#if CONFIG_GLOBAL_MOTION
+ if (cm->global_motion[mbmi->ref_frame[0]].wmtype == IDENTITY &&
+ (!has_second_rf ||
+ cm->global_motion[mbmi->ref_frame[1]].wmtype == IDENTITY))
+#endif // CONFIG_GLOBAL_MOTION
+
+ if (!check_best_zero_mv(cpi, mbmi_ext->mode_context,
+#if CONFIG_REF_MV && CONFIG_EXT_INTER
+ mbmi_ext->compound_mode_context,
+#endif // CONFIG_REF_MV && CONFIG_EXT_INTER
+ frame_mv, this_mode, mbmi->ref_frame, bsize,
+ index, mi_row, mi_col))
+ continue;
+
+ memcpy(orig_pre, pd->pre, sizeof(orig_pre));
+ memcpy(bsi->rdstat[index][mode_idx].ta, t_above,
+ sizeof(bsi->rdstat[index][mode_idx].ta));
+ memcpy(bsi->rdstat[index][mode_idx].tl, t_left,
+ sizeof(bsi->rdstat[index][mode_idx].tl));
+#if CONFIG_PVQ
+ od_encode_rollback(&x->daala_enc, &idx_buf);
+#endif // CONFIG_PVQ
+
+ // motion search for newmv (single predictor case only)
+ if (!has_second_rf &&
+#if CONFIG_EXT_INTER
+ have_newmv_in_inter_mode(this_mode) &&
+ (seg_mvs[index][mbmi->ref_frame[0]].as_int == INVALID_MV)
+#else
+ this_mode == NEWMV &&
+ (seg_mvs[index][mbmi->ref_frame[0]].as_int == INVALID_MV ||
+ run_mv_search)
+#endif // CONFIG_EXT_INTER
+ ) {
+ int step_param = 0;
+ int bestsme = INT_MAX;
+ int sadpb = x->sadperbit4;
+ MV mvp_full;
+ int max_mv;
+ int cost_list[5];
+ MvLimits tmp_mv_limits = x->mv_limits;
+
+ /* Is the best so far sufficiently good that we cant justify doing
+ * and new motion search. */
+ if (new_best_rd < label_mv_thresh) break;
+
+#if CONFIG_EXT_INTER
+ bsi->mvp.as_int = bsi->ref_mv[0]->as_int;
+#else
+// use previous block's result as next block's MV predictor.
+#if !CONFIG_REF_MV
+ if (index > 0) {
+ bsi->mvp.as_int = mi->bmi[index - 1].as_mv[0].as_int;
+ if (index == 2)
+ bsi->mvp.as_int = mi->bmi[index - 2].as_mv[0].as_int;
+ }
+#endif // !CONFIG_REF_MV
+#endif // CONFIG_EXT_INTER
+ max_mv = (index == 0) ? (int)x->max_mv_context[mbmi->ref_frame[0]]
+ : AOMMAX(abs(bsi->mvp.as_mv.row),
+ abs(bsi->mvp.as_mv.col)) >>
+ 3;
+
+ if (cpi->sf.mv.auto_mv_step_size && cm->show_frame) {
+ // Take wtd average of the step_params based on the last frame's
+ // max mv magnitude and the best ref mvs of the current block for
+ // the given reference.
+ step_param =
+ (av1_init_search_range(max_mv) + cpi->mv_step_param) / 2;
+ } else {
+ step_param = cpi->mv_step_param;
+ }
+
+#if CONFIG_REF_MV
+ mvp_full.row = bsi->ref_mv[0]->as_mv.row >> 3;
+ mvp_full.col = bsi->ref_mv[0]->as_mv.col >> 3;
+#else
+ mvp_full.row = bsi->mvp.as_mv.row >> 3;
+ mvp_full.col = bsi->mvp.as_mv.col >> 3;
+#endif // CONFIG_REF_MV
+
+ if (cpi->sf.adaptive_motion_search) {
+ mvp_full.row = x->pred_mv[mbmi->ref_frame[0]].row >> 3;
+ mvp_full.col = x->pred_mv[mbmi->ref_frame[0]].col >> 3;
+ step_param = AOMMAX(step_param, 8);
+ }
+
+ // adjust src pointer for this block
+ mi_buf_shift(x, index);
+
+ av1_set_mv_search_range(&x->mv_limits, &bsi->ref_mv[0]->as_mv);
+
+ x->best_mv.as_int = x->second_best_mv.as_int = INVALID_MV;
+
+#if CONFIG_REF_MV
+ av1_set_mvcost(x, mbmi->ref_frame[0], 0, mbmi->ref_mv_idx);
+#endif // CONFIG_REF_MV
+ bestsme = av1_full_pixel_search(
+ cpi, x, bsize, &mvp_full, step_param, sadpb,
+ cpi->sf.mv.subpel_search_method != SUBPEL_TREE ? cost_list : NULL,
+ &bsi->ref_mv[0]->as_mv, INT_MAX, 1);
+
+ x->mv_limits = tmp_mv_limits;
+
+ if (bestsme < INT_MAX) {
+ int distortion;
+ if (cpi->sf.use_upsampled_references) {
+ int best_mv_var;
+ const int try_second =
+ x->second_best_mv.as_int != INVALID_MV &&
+ x->second_best_mv.as_int != x->best_mv.as_int;
+ const int pw = block_size_wide[bsize];
+ const int ph = block_size_high[bsize];
+ // Use up-sampled reference frames.
+ struct buf_2d backup_pred = pd->pre[0];
+ const YV12_BUFFER_CONFIG *upsampled_ref =
+ get_upsampled_ref(cpi, mbmi->ref_frame[0]);
+
+ // Set pred for Y plane
+ setup_pred_plane(
+ &pd->pre[0], bsize, upsampled_ref->y_buffer,
+ upsampled_ref->y_crop_width, upsampled_ref->y_crop_height,
+ upsampled_ref->y_stride, (mi_row << 3), (mi_col << 3), NULL,
+ pd->subsampling_x, pd->subsampling_y);
+
+ // adjust pred pointer for this block
+ pd->pre[0].buf =
+ &pd->pre[0].buf[(av1_raster_block_offset(BLOCK_8X8, index,
+ pd->pre[0].stride))
+ << 3];
+
+ best_mv_var = cpi->find_fractional_mv_step(
+ x, &bsi->ref_mv[0]->as_mv, cm->allow_high_precision_mv,
+ x->errorperbit, &cpi->fn_ptr[bsize],
+ cpi->sf.mv.subpel_force_stop,
+ cpi->sf.mv.subpel_iters_per_step,
+ cond_cost_list(cpi, cost_list), x->nmvjointcost, x->mvcost,
+ &distortion, &x->pred_sse[mbmi->ref_frame[0]], NULL, pw, ph,
+ 1);
+
+ if (try_second) {
+ int this_var;
+ MV best_mv = x->best_mv.as_mv;
+ const MV ref_mv = bsi->ref_mv[0]->as_mv;
+ const int minc =
+ AOMMAX(x->mv_limits.col_min * 8, ref_mv.col - MV_MAX);
+ const int maxc =
+ AOMMIN(x->mv_limits.col_max * 8, ref_mv.col + MV_MAX);
+ const int minr =
+ AOMMAX(x->mv_limits.row_min * 8, ref_mv.row - MV_MAX);
+ const int maxr =
+ AOMMIN(x->mv_limits.row_max * 8, ref_mv.row + MV_MAX);
+
+ x->best_mv = x->second_best_mv;
+ if (x->best_mv.as_mv.row * 8 <= maxr &&
+ x->best_mv.as_mv.row * 8 >= minr &&
+ x->best_mv.as_mv.col * 8 <= maxc &&
+ x->best_mv.as_mv.col * 8 >= minc) {
+ this_var = cpi->find_fractional_mv_step(
+ x, &bsi->ref_mv[0]->as_mv, cm->allow_high_precision_mv,
+ x->errorperbit, &cpi->fn_ptr[bsize],
+ cpi->sf.mv.subpel_force_stop,
+ cpi->sf.mv.subpel_iters_per_step,
+ cond_cost_list(cpi, cost_list), x->nmvjointcost,
+ x->mvcost, &distortion, &x->pred_sse[mbmi->ref_frame[0]],
+ NULL, pw, ph, 1);
+ if (this_var < best_mv_var) best_mv = x->best_mv.as_mv;
+ x->best_mv.as_mv = best_mv;
+ }
+ }
+
+ // Restore the reference frames.
+ pd->pre[0] = backup_pred;
+ } else {
+ cpi->find_fractional_mv_step(
+ x, &bsi->ref_mv[0]->as_mv, cm->allow_high_precision_mv,
+ x->errorperbit, &cpi->fn_ptr[bsize],
+ cpi->sf.mv.subpel_force_stop,
+ cpi->sf.mv.subpel_iters_per_step,
+ cond_cost_list(cpi, cost_list), x->nmvjointcost, x->mvcost,
+ &distortion, &x->pred_sse[mbmi->ref_frame[0]], NULL, 0, 0, 0);
+ }
+
+// save motion search result for use in compound prediction
+#if CONFIG_EXT_INTER
+ seg_mvs[index][mbmi->ref_frame[0]].as_mv = x->best_mv.as_mv;
+#else
+ seg_mvs[index][mbmi->ref_frame[0]].as_mv = x->best_mv.as_mv;
+#endif // CONFIG_EXT_INTER
+ }
+
+ if (cpi->sf.adaptive_motion_search)
+ x->pred_mv[mbmi->ref_frame[0]] = x->best_mv.as_mv;
+
+#if CONFIG_EXT_INTER
+ mode_mv[this_mode][0] = x->best_mv;
+#else
+ mode_mv[NEWMV][0] = x->best_mv;
+#endif // CONFIG_EXT_INTER
+
+ // restore src pointers
+ mi_buf_restore(x, orig_src, orig_pre);
+ }
+
+ if (has_second_rf) {
+#if CONFIG_EXT_INTER
+ if (seg_mvs[index][mbmi->ref_frame[1]].as_int == INVALID_MV ||
+ seg_mvs[index][mbmi->ref_frame[0]].as_int == INVALID_MV)
+#else
+ if (seg_mvs[index][mbmi->ref_frame[1]].as_int == INVALID_MV ||
+ seg_mvs[index][mbmi->ref_frame[0]].as_int == INVALID_MV)
+#endif // CONFIG_EXT_INTER
+ continue;
+ }
+
+#if CONFIG_DUAL_FILTER
+ (void)run_mv_search;
+#endif // CONFIG_DUAL_FILTER
+
+ if (has_second_rf &&
+#if CONFIG_EXT_INTER
+ this_mode == NEW_NEWMV &&
+#else
+ this_mode == NEWMV &&
+#endif // CONFIG_EXT_INTER
+#if CONFIG_DUAL_FILTER
+ (mbmi->interp_filter[0] == EIGHTTAP_REGULAR || run_mv_search))
+#else
+ (mbmi->interp_filter == EIGHTTAP_REGULAR || run_mv_search))
+#endif // CONFIG_DUAL_FILTER
+ {
+ // adjust src pointers
+ mi_buf_shift(x, index);
+ if (cpi->sf.comp_inter_joint_search_thresh <= bsize) {
+ int rate_mv;
+ frame_mv[this_mode][mbmi->ref_frame[0]].as_int =
+ seg_mvs[index][mbmi->ref_frame[0]].as_int;
+ frame_mv[this_mode][mbmi->ref_frame[1]].as_int =
+ seg_mvs[index][mbmi->ref_frame[1]].as_int;
+ joint_motion_search(cpi, x, bsize, frame_mv[this_mode], mi_row,
+ mi_col,
+#if CONFIG_EXT_INTER
+ bsi->ref_mv,
+#endif // CONFIG_EXT_INTER
+ &rate_mv, index);
+#if CONFIG_EXT_INTER
+ compound_seg_newmvs[index][0].as_int =
+ frame_mv[this_mode][mbmi->ref_frame[0]].as_int;
+ compound_seg_newmvs[index][1].as_int =
+ frame_mv[this_mode][mbmi->ref_frame[1]].as_int;
+#else
+ seg_mvs[index][mbmi->ref_frame[0]].as_int =
+ frame_mv[this_mode][mbmi->ref_frame[0]].as_int;
+ seg_mvs[index][mbmi->ref_frame[1]].as_int =
+ frame_mv[this_mode][mbmi->ref_frame[1]].as_int;
+#endif // CONFIG_EXT_INTER
+ }
+ // restore src pointers
+ mi_buf_restore(x, orig_src, orig_pre);
+ }
+
+ bsi->rdstat[index][mode_idx].brate = set_and_cost_bmi_mvs(
+ cpi, x, xd, index, this_mode, mode_mv[this_mode], frame_mv,
+ seg_mvs[index],
+#if CONFIG_EXT_INTER
+ compound_seg_newmvs[index],
+#endif // CONFIG_EXT_INTER
+ bsi->ref_mv, x->nmvjointcost, x->mvcost, mi_row, mi_col);
+
+ for (ref = 0; ref < 1 + has_second_rf; ++ref) {
+ bsi->rdstat[index][mode_idx].mvs[ref].as_int =
+ mode_mv[this_mode][ref].as_int;
+ if (num_4x4_blocks_wide > 1)
+ bsi->rdstat[index + 1][mode_idx].mvs[ref].as_int =
+ mode_mv[this_mode][ref].as_int;
+ if (num_4x4_blocks_high > 1)
+ bsi->rdstat[index + 2][mode_idx].mvs[ref].as_int =
+ mode_mv[this_mode][ref].as_int;
+#if CONFIG_REF_MV
+ bsi->rdstat[index][mode_idx].pred_mv[ref].as_int =
+ mi->bmi[index].pred_mv[ref].as_int;
+ if (num_4x4_blocks_wide > 1)
+ bsi->rdstat[index + 1][mode_idx].pred_mv[ref].as_int =
+ mi->bmi[index].pred_mv[ref].as_int;
+ if (num_4x4_blocks_high > 1)
+ bsi->rdstat[index + 2][mode_idx].pred_mv[ref].as_int =
+ mi->bmi[index].pred_mv[ref].as_int;
+#endif // CONFIG_REF_MV
+#if CONFIG_EXT_INTER
+ bsi->rdstat[index][mode_idx].ref_mv[ref].as_int =
+ bsi->ref_mv[ref]->as_int;
+ if (num_4x4_blocks_wide > 1)
+ bsi->rdstat[index + 1][mode_idx].ref_mv[ref].as_int =
+ bsi->ref_mv[ref]->as_int;
+ if (num_4x4_blocks_high > 1)
+ bsi->rdstat[index + 2][mode_idx].ref_mv[ref].as_int =
+ bsi->ref_mv[ref]->as_int;
+#endif // CONFIG_EXT_INTER
+ }
+
+ // Trap vectors that reach beyond the UMV borders
+ if (mv_check_bounds(&x->mv_limits, &mode_mv[this_mode][0].as_mv) ||
+ (has_second_rf &&
+ mv_check_bounds(&x->mv_limits, &mode_mv[this_mode][1].as_mv)))
+ continue;
+
+ if (filter_idx > 0) {
+ BEST_SEG_INFO *ref_bsi = bsi_buf;
+ subpelmv = 0;
+ have_ref = 1;
+
+ for (ref = 0; ref < 1 + has_second_rf; ++ref) {
+ subpelmv |= mv_has_subpel(&mode_mv[this_mode][ref].as_mv);
+#if CONFIG_EXT_INTER
+ if (have_newmv_in_inter_mode(this_mode))
+ have_ref &=
+ ((mode_mv[this_mode][ref].as_int ==
+ ref_bsi->rdstat[index][mode_idx].mvs[ref].as_int) &&
+ (bsi->ref_mv[ref]->as_int ==
+ ref_bsi->rdstat[index][mode_idx].ref_mv[ref].as_int));
+ else
+#endif // CONFIG_EXT_INTER
+ have_ref &= mode_mv[this_mode][ref].as_int ==
+ ref_bsi->rdstat[index][mode_idx].mvs[ref].as_int;
+ }
+
+ have_ref &= ref_bsi->rdstat[index][mode_idx].brate > 0;
+
+ if (filter_idx > 1 && !subpelmv && !have_ref) {
+ ref_bsi = bsi_buf + 1;
+ have_ref = 1;
+ for (ref = 0; ref < 1 + has_second_rf; ++ref)
+#if CONFIG_EXT_INTER
+ if (have_newmv_in_inter_mode(this_mode))
+ have_ref &=
+ ((mode_mv[this_mode][ref].as_int ==
+ ref_bsi->rdstat[index][mode_idx].mvs[ref].as_int) &&
+ (bsi->ref_mv[ref]->as_int ==
+ ref_bsi->rdstat[index][mode_idx].ref_mv[ref].as_int));
+ else
+#endif // CONFIG_EXT_INTER
+ have_ref &= mode_mv[this_mode][ref].as_int ==
+ ref_bsi->rdstat[index][mode_idx].mvs[ref].as_int;
+
+ have_ref &= ref_bsi->rdstat[index][mode_idx].brate > 0;
+ }
+
+ if (!subpelmv && have_ref &&
+ ref_bsi->rdstat[index][mode_idx].brdcost < INT64_MAX) {
+#if CONFIG_REF_MV
+ bsi->rdstat[index][mode_idx].byrate =
+ ref_bsi->rdstat[index][mode_idx].byrate;
+ bsi->rdstat[index][mode_idx].bdist =
+ ref_bsi->rdstat[index][mode_idx].bdist;
+ bsi->rdstat[index][mode_idx].bsse =
+ ref_bsi->rdstat[index][mode_idx].bsse;
+ bsi->rdstat[index][mode_idx].brate +=
+ ref_bsi->rdstat[index][mode_idx].byrate;
+ bsi->rdstat[index][mode_idx].eobs =
+ ref_bsi->rdstat[index][mode_idx].eobs;
+
+ bsi->rdstat[index][mode_idx].brdcost =
+ RDCOST(x->rdmult, x->rddiv, bsi->rdstat[index][mode_idx].brate,
+ bsi->rdstat[index][mode_idx].bdist);
+
+ memcpy(bsi->rdstat[index][mode_idx].ta,
+ ref_bsi->rdstat[index][mode_idx].ta,
+ sizeof(bsi->rdstat[index][mode_idx].ta));
+ memcpy(bsi->rdstat[index][mode_idx].tl,
+ ref_bsi->rdstat[index][mode_idx].tl,
+ sizeof(bsi->rdstat[index][mode_idx].tl));
+#else
+ memcpy(&bsi->rdstat[index][mode_idx],
+ &ref_bsi->rdstat[index][mode_idx], sizeof(SEG_RDSTAT));
+#endif // CONFIG_REF_MV
+ if (num_4x4_blocks_wide > 1)
+ bsi->rdstat[index + 1][mode_idx].eobs =
+ ref_bsi->rdstat[index + 1][mode_idx].eobs;
+ if (num_4x4_blocks_high > 1)
+ bsi->rdstat[index + 2][mode_idx].eobs =
+ ref_bsi->rdstat[index + 2][mode_idx].eobs;
+
+ if (bsi->rdstat[index][mode_idx].brdcost < new_best_rd) {
+#if CONFIG_REF_MV
+ // If the NEWMV mode is using the same motion vector as the
+ // NEARESTMV mode, skip the rest rate-distortion calculations
+ // and use the inferred motion vector modes.
+ if (this_mode == NEWMV) {
+ if (has_second_rf) {
+ if (bsi->rdstat[index][mode_idx].mvs[0].as_int ==
+ bsi->ref_mv[0]->as_int &&
+ bsi->rdstat[index][mode_idx].mvs[1].as_int ==
+ bsi->ref_mv[1]->as_int)
+ continue;
+ } else {
+ if (bsi->rdstat[index][mode_idx].mvs[0].as_int ==
+ bsi->ref_mv[0]->as_int)
+ continue;
+ }
+ }
+#endif // CONFIG_REF_MV
+ mode_selected = this_mode;
+ new_best_rd = bsi->rdstat[index][mode_idx].brdcost;
+#if CONFIG_PVQ
+ od_encode_checkpoint(&x->daala_enc, &post_buf);
+#endif // CONFIG_PVQ
+ }
+ continue;
+ }
+ }
+
+ bsi->rdstat[index][mode_idx].brdcost = encode_inter_mb_segment_sub8x8(
+ cpi, x, bsi->segment_rd - this_segment_rd, index,
+ &bsi->rdstat[index][mode_idx].byrate,
+ &bsi->rdstat[index][mode_idx].bdist,
+ &bsi->rdstat[index][mode_idx].bsse, bsi->rdstat[index][mode_idx].ta,
+ bsi->rdstat[index][mode_idx].tl, idy, idx, mi_row, mi_col);
+
+ if (bsi->rdstat[index][mode_idx].brdcost < INT64_MAX) {
+ bsi->rdstat[index][mode_idx].brdcost += RDCOST(
+ x->rdmult, x->rddiv, bsi->rdstat[index][mode_idx].brate, 0);
+ bsi->rdstat[index][mode_idx].brate +=
+ bsi->rdstat[index][mode_idx].byrate;
+ bsi->rdstat[index][mode_idx].eobs = p->eobs[index];
+ if (num_4x4_blocks_wide > 1)
+ bsi->rdstat[index + 1][mode_idx].eobs = p->eobs[index + 1];
+ if (num_4x4_blocks_high > 1)
+ bsi->rdstat[index + 2][mode_idx].eobs = p->eobs[index + 2];
+ }
+
+ if (bsi->rdstat[index][mode_idx].brdcost < new_best_rd) {
+#if CONFIG_REF_MV
+ // If the NEWMV mode is using the same motion vector as the
+ // NEARESTMV mode, skip the rest rate-distortion calculations
+ // and use the inferred motion vector modes.
+ if (this_mode == NEWMV) {
+ if (has_second_rf) {
+ if (bsi->rdstat[index][mode_idx].mvs[0].as_int ==
+ bsi->ref_mv[0]->as_int &&
+ bsi->rdstat[index][mode_idx].mvs[1].as_int ==
+ bsi->ref_mv[1]->as_int)
+ continue;
+ } else {
+ if (bsi->rdstat[index][mode_idx].mvs[0].as_int ==
+ bsi->ref_mv[0]->as_int)
+ continue;
+ }
+ }
+#endif // CONFIG_REF_MV
+ mode_selected = this_mode;
+ new_best_rd = bsi->rdstat[index][mode_idx].brdcost;
+
+#if CONFIG_PVQ
+ od_encode_checkpoint(&x->daala_enc, &post_buf);
+#endif // CONFIG_PVQ
+ }
+ } /*for each 4x4 mode*/
+
+ if (new_best_rd == INT64_MAX) {
+ int iy, midx;
+ for (iy = index + 1; iy < 4; ++iy)
+#if CONFIG_EXT_INTER
+ for (midx = 0; midx < INTER_MODES + INTER_COMPOUND_MODES; ++midx)
+#else
+ for (midx = 0; midx < INTER_MODES; ++midx)
+#endif // CONFIG_EXT_INTER
+ bsi->rdstat[iy][midx].brdcost = INT64_MAX;
+ bsi->segment_rd = INT64_MAX;
+#if CONFIG_PVQ
+ od_encode_rollback(&x->daala_enc, &pre_buf);
+#endif // CONFIG_PVQ
+ return INT64_MAX;
+ }
+
+ mode_idx = INTER_OFFSET(mode_selected);
+ memcpy(t_above, bsi->rdstat[index][mode_idx].ta, sizeof(t_above));
+ memcpy(t_left, bsi->rdstat[index][mode_idx].tl, sizeof(t_left));
+#if CONFIG_PVQ
+ od_encode_rollback(&x->daala_enc, &post_buf);
+#endif // CONFIG_PVQ
+
+#if CONFIG_EXT_INTER
+ bsi->ref_mv[0]->as_int = bsi->rdstat[index][mode_idx].ref_mv[0].as_int;
+ if (has_second_rf)
+ bsi->ref_mv[1]->as_int = bsi->rdstat[index][mode_idx].ref_mv[1].as_int;
+#endif // CONFIG_EXT_INTER
+ set_and_cost_bmi_mvs(cpi, x, xd, index, mode_selected,
+ mode_mv[mode_selected], frame_mv, seg_mvs[index],
+#if CONFIG_EXT_INTER
+ compound_seg_newmvs[index],
+#endif // CONFIG_EXT_INTER
+ bsi->ref_mv, x->nmvjointcost, x->mvcost, mi_row,
+ mi_col);
+
+ br += bsi->rdstat[index][mode_idx].brate;
+ bd += bsi->rdstat[index][mode_idx].bdist;
+ block_sse += bsi->rdstat[index][mode_idx].bsse;
+ segmentyrate += bsi->rdstat[index][mode_idx].byrate;
+ this_segment_rd += bsi->rdstat[index][mode_idx].brdcost;
+
+ if (this_segment_rd > bsi->segment_rd) {
+ int iy, midx;
+ for (iy = index + 1; iy < 4; ++iy)
+#if CONFIG_EXT_INTER
+ for (midx = 0; midx < INTER_MODES + INTER_COMPOUND_MODES; ++midx)
+#else
+ for (midx = 0; midx < INTER_MODES; ++midx)
+#endif // CONFIG_EXT_INTER
+ bsi->rdstat[iy][midx].brdcost = INT64_MAX;
+ bsi->segment_rd = INT64_MAX;
+#if CONFIG_PVQ
+ od_encode_rollback(&x->daala_enc, &pre_buf);
+#endif // CONFIG_PVQ
+ return INT64_MAX;
+ }
+ }
+ } /* for each label */
+#if CONFIG_PVQ
+ od_encode_rollback(&x->daala_enc, &pre_buf);
+#endif // CONFIG_PVQ
+
+ bsi->r = br;
+ bsi->d = bd;
+ bsi->segment_yrate = segmentyrate;
+ bsi->segment_rd = this_segment_rd;
+ bsi->sse = block_sse;
+
+ // update the coding decisions
+ for (k = 0; k < 4; ++k) bsi->modes[k] = mi->bmi[k].as_mode;
+
+#if CONFIG_DAALA_DIST
+ // Compute prediction (i.e. skip) and decoded distortion by daala-distortion.
+ {
+ const int src_stride = p->src.stride;
+ const int dst_stride = pd->dst.stride;
+ uint8_t *src = p->src.buf;
+ uint8_t *dst = pd->dst.buf;
+ const BLOCK_SIZE plane_bsize = get_plane_block_size(mi->mbmi.sb_type, pd);
+ const int use_activity_masking = 0;
+ const int qm = OD_HVS_QM;
+ const int bsw = block_size_wide[plane_bsize];
+ const int bsh = block_size_high[plane_bsize];
+ int64_t rd1, rd2;
+ int64_t daala_sse, daala_dist;
+ TX_SIZE tx_size = mbmi->tx_size;
+
+#if CONFIG_HIGHBITDEPTH
+ uint8_t *recon_8x8;
+ DECLARE_ALIGNED(16, uint16_t, recon16[8 * 8]);
+
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+ recon_8x8 = CONVERT_TO_BYTEPTR(recon16);
+ else
+ recon_8x8 = (uint8_t *)recon16;
+#else
+ DECLARE_ALIGNED(16, uint8_t, recon_8x8[8 * 8]);
+#endif // CONFIG_HIGHBITDEPTH
+
+#if CONFIG_PVQ
+ use_activity_masking = x->daala_enc.use_activity_masking;
+#endif // CONFIG_PVQ
+
+ // For each of sub8x8 prediction block in a 8x8 block
+ for (idy = 0; idy < 2; idy += num_4x4_blocks_high) {
+ for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) {
+ int i = idy * 2 + idx;
+ const uint8_t *const src_sub8x8 =
+ src + av1_raster_block_offset(BLOCK_8X8, i, p->src.stride);
+ uint8_t *const dst_sub8x8 =
+ dst + av1_raster_block_offset(BLOCK_8X8, i, pd->dst.stride);
+ uint8_t *recon_sub8x8 = recon_8x8 + (idy * 8 + idx) * 4;
+ const int txb_width = max_block_wide(xd, plane_bsize, 0);
+ const int txb_height = max_block_high(xd, plane_bsize, 0);
+ int idx_, idy_;
+
+ av1_build_inter_predictor_sub8x8(xd, 0, i, idy, idx, mi_row, mi_col);
+#if CONFIG_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ aom_highbd_subtract_block(
+ height, width,
+ av1_raster_block_offset_int16(BLOCK_8X8, i, p->src_diff), 8,
+ src_sub8x8, p->src.stride, dst_sub8x8, pd->dst.stride, xd->bd);
+ } else {
+ aom_subtract_block(
+ height, width,
+ av1_raster_block_offset_int16(BLOCK_8X8, i, p->src_diff), 8,
+ src_sub8x8, p->src.stride, dst_sub8x8, pd->dst.stride);
+ }
+#else
+ aom_subtract_block(
+ bsh, bsw, av1_raster_block_offset_int16(BLOCK_8X8, i, p->src_diff),
+ 8, src_sub8x8, p->src.stride, dst_sub8x8, pd->dst.stride);
+#endif // CONFIG_HIGHBITDEPTH
+
+#if CONFIG_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ aom_highbd_convolve_copy(dst_sub8x8, dst_stride, recon_sub8x8, 8,
+ NULL, 0, NULL, 0, bsw, bsh, xd->bd);
+ } else {
+#endif // CONFIG_HIGHBITDEPTH
+ aom_convolve_copy(dst_sub8x8, dst_stride, recon_sub8x8, 8, NULL, 0,
+ NULL, 0, bsw, bsh);
+#if CONFIG_HIGHBITDEPTH
+ }
+#endif // CONFIG_HIGHBITDEPTH
+
+ // To get decoded pixels, do 4x4 xform and quant for each 4x4 block
+ // in a sub8x8 prediction block. In case remaining parts of
+ // sub8x8 inter mode rdo assume pd->dst stores predicted pixels,
+ // use local buffer to store decoded pixels.
+ for (idy_ = 0; idy_ < txb_height; idy_++) {
+ for (idx_ = 0; idx_ < txb_width; idx_++) {
+ int coeff_ctx = 0;
+ const tran_low_t *dqcoeff;
+ uint16_t eob;
+ const PLANE_TYPE plane_type = PLANE_TYPE_Y;
+ uint8_t *recon_4x4 = recon_sub8x8 + (idy_ * 8 + idx_) * 4;
+ const int block_raster_idx = (idy + idy_) * 2 + (idx + idx_);
+ const int block =
+ av1_raster_order_to_block_index(tx_size, block_raster_idx);
+ TX_TYPE tx_type = get_tx_type(plane_type, xd, block, tx_size);
+
+ dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+ av1_xform_quant(cm, x, 0, block, idy + idy_, idx + idx_, BLOCK_8X8,
+ tx_size, coeff_ctx, AV1_XFORM_QUANT_FP);
+ if (xd->lossless[xd->mi[0]->mbmi.segment_id] == 0)
+ av1_optimize_b(cm, x, 0, block, tx_size, coeff_ctx);
+
+ eob = p->eobs[block];
+ av1_inverse_transform_block(xd, dqcoeff, tx_type, tx_size,
+ recon_4x4, 8, eob);
+ }
+ }
+ }
+ }
+ // Compute daala-distortion for a 8x8 block
+ daala_sse = av1_daala_dist(src, src_stride, pd->dst.buf, dst_stride, 8, 8,
+ qm, use_activity_masking, x->qindex)
+ << 4;
+
+ daala_dist = av1_daala_dist(src, src_stride, recon_8x8, 8, 8, 8, qm,
+ use_activity_masking, x->qindex)
+ << 4;
+
+ bsi->sse = daala_sse;
+ bsi->d = daala_dist;
+
+ rd1 = RDCOST(x->rdmult, x->rddiv, bsi->r, bsi->d);
+ rd2 = RDCOST(x->rdmult, x->rddiv, 0, bsi->sse);
+ bsi->segment_rd = AOMMIN(rd1, rd2);
+ }
+#endif // CONFIG_DAALA_DIST
+
+ if (bsi->segment_rd > best_rd) return INT64_MAX;
+ /* set it to the best */
+ for (idx = 0; idx < 4; idx++) {
+ mode_idx = INTER_OFFSET(bsi->modes[idx]);
+ mi->bmi[idx].as_mv[0].as_int = bsi->rdstat[idx][mode_idx].mvs[0].as_int;
+ if (has_second_ref(mbmi))
+ mi->bmi[idx].as_mv[1].as_int = bsi->rdstat[idx][mode_idx].mvs[1].as_int;
+#if CONFIG_REF_MV
+ mi->bmi[idx].pred_mv[0] = bsi->rdstat[idx][mode_idx].pred_mv[0];
+ if (has_second_ref(mbmi))
+ mi->bmi[idx].pred_mv[1] = bsi->rdstat[idx][mode_idx].pred_mv[1];
+#endif // CONFIG_REF_MV
+#if CONFIG_EXT_INTER
+ mi->bmi[idx].ref_mv[0].as_int = bsi->rdstat[idx][mode_idx].ref_mv[0].as_int;
+ if (has_second_rf)
+ mi->bmi[idx].ref_mv[1].as_int =
+ bsi->rdstat[idx][mode_idx].ref_mv[1].as_int;
+#endif // CONFIG_EXT_INTER
+ x->plane[0].eobs[idx] = bsi->rdstat[idx][mode_idx].eobs;
+ mi->bmi[idx].as_mode = bsi->modes[idx];
+ }
+
+ /*
+ * used to set mbmi->mv.as_int
+ */
+ *returntotrate = bsi->r;
+ *returndistortion = bsi->d;
+ *returnyrate = bsi->segment_yrate;
+ *skippable = av1_is_skippable_in_plane(x, BLOCK_8X8, 0);
+ *psse = bsi->sse;
+ mbmi->mode = bsi->modes[3];
+
+ return bsi->segment_rd;
+}
+
+// Fills ref_costs_single[] / ref_costs_comp[] with the bit cost of signalling
+// each reference frame for single and compound prediction, and stores in
+// *comp_mode_p the probability used for the single/compound mode flag.
+//
+// When the segment carries the SEG_LVL_REF_FRAME feature every cost is zeroed
+// and *comp_mode_p is 128. Otherwise each cost is the intra/inter flag cost
+// plus the cost of the bits selecting the frame; for a prediction type that
+// cm->reference_mode rules out, the entries are set to the constant 512.
+static void estimate_ref_frame_costs(const AV1_COMMON *cm,
+                                     const MACROBLOCKD *xd, int segment_id,
+                                     unsigned int *ref_costs_single,
+                                     unsigned int *ref_costs_comp,
+                                     aom_prob *comp_mode_p) {
+  aom_prob inter_prob;
+
+  if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) {
+    memset(ref_costs_single, 0,
+           TOTAL_REFS_PER_FRAME * sizeof(*ref_costs_single));
+    memset(ref_costs_comp, 0, TOTAL_REFS_PER_FRAME * sizeof(*ref_costs_comp));
+    *comp_mode_p = 128;
+    return;
+  }
+
+  inter_prob = av1_get_intra_inter_prob(cm, xd);
+
+  if (cm->reference_mode == REFERENCE_MODE_SELECT)
+    *comp_mode_p = av1_get_reference_mode_prob(cm, xd);
+  else
+    *comp_mode_p = 128;
+
+  ref_costs_single[INTRA_FRAME] = av1_cost_bit(inter_prob, 0);
+
+  // Single-reference costs.
+  if (cm->reference_mode == COMPOUND_REFERENCE) {
+    ref_costs_single[LAST_FRAME] = 512;
+#if CONFIG_EXT_REFS
+    ref_costs_single[LAST2_FRAME] = 512;
+    ref_costs_single[LAST3_FRAME] = 512;
+    ref_costs_single[BWDREF_FRAME] = 512;
+#endif  // CONFIG_EXT_REFS
+    ref_costs_single[GOLDEN_FRAME] = 512;
+    ref_costs_single[ALTREF_FRAME] = 512;
+  } else {
+    const aom_prob p1 = av1_get_pred_prob_single_ref_p1(cm, xd);
+    const aom_prob p2 = av1_get_pred_prob_single_ref_p2(cm, xd);
+#if CONFIG_EXT_REFS
+    const aom_prob p3 = av1_get_pred_prob_single_ref_p3(cm, xd);
+    const aom_prob p4 = av1_get_pred_prob_single_ref_p4(cm, xd);
+    const aom_prob p5 = av1_get_pred_prob_single_ref_p5(cm, xd);
+#endif  // CONFIG_EXT_REFS
+    const unsigned int inter_cost = av1_cost_bit(inter_prob, 1);
+
+#if CONFIG_EXT_REFS
+    ref_costs_single[LAST_FRAME] = inter_cost + av1_cost_bit(p1, 0) +
+                                   av1_cost_bit(p3, 0) + av1_cost_bit(p4, 0);
+    ref_costs_single[LAST2_FRAME] = inter_cost + av1_cost_bit(p1, 0) +
+                                    av1_cost_bit(p3, 0) + av1_cost_bit(p4, 1);
+    ref_costs_single[LAST3_FRAME] = inter_cost + av1_cost_bit(p1, 0) +
+                                    av1_cost_bit(p3, 1) + av1_cost_bit(p5, 0);
+    ref_costs_single[GOLDEN_FRAME] = inter_cost + av1_cost_bit(p1, 0) +
+                                     av1_cost_bit(p3, 1) + av1_cost_bit(p5, 1);
+    ref_costs_single[BWDREF_FRAME] =
+        inter_cost + av1_cost_bit(p1, 1) + av1_cost_bit(p2, 0);
+    ref_costs_single[ALTREF_FRAME] =
+        inter_cost + av1_cost_bit(p1, 1) + av1_cost_bit(p2, 1);
+#else
+    ref_costs_single[LAST_FRAME] = inter_cost + av1_cost_bit(p1, 0);
+    ref_costs_single[GOLDEN_FRAME] =
+        inter_cost + av1_cost_bit(p1, 1) + av1_cost_bit(p2, 0);
+    ref_costs_single[ALTREF_FRAME] =
+        inter_cost + av1_cost_bit(p1, 1) + av1_cost_bit(p2, 1);
+#endif  // CONFIG_EXT_REFS
+  }
+
+  // Compound-reference costs.
+  if (cm->reference_mode == SINGLE_REFERENCE) {
+    ref_costs_comp[LAST_FRAME] = 512;
+#if CONFIG_EXT_REFS
+    ref_costs_comp[LAST2_FRAME] = 512;
+    ref_costs_comp[LAST3_FRAME] = 512;
+    ref_costs_comp[BWDREF_FRAME] = 512;
+    ref_costs_comp[ALTREF_FRAME] = 512;
+#endif  // CONFIG_EXT_REFS
+    ref_costs_comp[GOLDEN_FRAME] = 512;
+  } else {
+    const aom_prob cp = av1_get_pred_prob_comp_ref_p(cm, xd);
+#if CONFIG_EXT_REFS
+    const aom_prob cp1 = av1_get_pred_prob_comp_ref_p1(cm, xd);
+    const aom_prob cp2 = av1_get_pred_prob_comp_ref_p2(cm, xd);
+    const aom_prob bwd_p = av1_get_pred_prob_comp_bwdref_p(cm, xd);
+#endif  // CONFIG_EXT_REFS
+    const unsigned int inter_cost = av1_cost_bit(inter_prob, 1);
+
+#if CONFIG_EXT_REFS
+    ref_costs_comp[LAST_FRAME] =
+        inter_cost + av1_cost_bit(cp, 0) + av1_cost_bit(cp1, 1);
+    ref_costs_comp[LAST2_FRAME] =
+        inter_cost + av1_cost_bit(cp, 0) + av1_cost_bit(cp1, 0);
+    ref_costs_comp[LAST3_FRAME] =
+        inter_cost + av1_cost_bit(cp, 1) + av1_cost_bit(cp2, 0);
+    ref_costs_comp[GOLDEN_FRAME] =
+        inter_cost + av1_cost_bit(cp, 1) + av1_cost_bit(cp2, 1);
+
+    // NOTE(zoeliu): BWDREF and ALTREF each add an extra cost by coding 1
+    // more bit. They do not include the intra/inter flag cost.
+    ref_costs_comp[BWDREF_FRAME] = av1_cost_bit(bwd_p, 0);
+    ref_costs_comp[ALTREF_FRAME] = av1_cost_bit(bwd_p, 1);
+#else
+    ref_costs_comp[LAST_FRAME] = inter_cost + av1_cost_bit(cp, 0);
+    ref_costs_comp[GOLDEN_FRAME] = inter_cost + av1_cost_bit(cp, 1);
+#endif  // CONFIG_EXT_REFS
+  }
+}
+
+// Saves the state of the mode decision just evaluated into |ctx| so that it
+// can be reinstated if this candidate ends up being the one that is encoded.
+//   x              - macroblock whose current mode info / skip flag is saved.
+//   ctx            - destination pick-mode context.
+//   mode_index     - index of the winning mode, stored as best_mode_index.
+//   comp_pred_diff - per-reference-mode RD differences; each is narrowed to
+//                    int when stored.
+//   skippable      - stored verbatim in ctx->skippable.
+static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
+                                 int mode_index,
+                                 int64_t comp_pred_diff[REFERENCE_MODES],
+                                 int skippable) {
+  const MACROBLOCKD *const mbd = &x->e_mbd;
+  const MODE_INFO *const cur_mi = mbd->mi[0];
+
+  // Copy the mode info structures by value.
+  ctx->mic = *cur_mi;
+  ctx->mbmi_ext = *x->mbmi_ext;
+
+  // Scalar decision state.
+  ctx->best_mode_index = mode_index;
+  ctx->skippable = skippable;
+  ctx->skip = x->skip;
+
+  // RD differences between the reference-mode alternatives.
+  ctx->hybrid_pred_diff = (int)comp_pred_diff[REFERENCE_MODE_SELECT];
+  ctx->comp_pred_diff = (int)comp_pred_diff[COMPOUND_REFERENCE];
+  ctx->single_pred_diff = (int)comp_pred_diff[SINGLE_REFERENCE];
+}
+
+// Prepares everything the inter mode search needs for one reference frame:
+//  - sets up the prediction buffers in yv12_mb[ref_frame],
+//  - collects candidate motion vectors into x->mbmi_ext->ref_mvs[ref_frame],
+//  - writes the refined nearest/near vectors to frame_nearest_mv[ref_frame]
+//    and frame_near_mv[ref_frame],
+//  - runs av1_mv_pred() on the luma buffer (unconditionally with CONFIG_CB4X4,
+//    otherwise only for unscaled references and blocks >= BLOCK_8X8).
+// NOTE(review): cm->frame_refs is indexed with ref_frame - 1, so ref_frame is
+// presumably always an inter reference (>= LAST_FRAME) - confirm with callers.
+static void setup_buffer_inter(
+ const AV1_COMP *const cpi, MACROBLOCK *x, MV_REFERENCE_FRAME ref_frame,
+ BLOCK_SIZE block_size, int mi_row, int mi_col,
+ int_mv frame_nearest_mv[TOTAL_REFS_PER_FRAME],
+ int_mv frame_near_mv[TOTAL_REFS_PER_FRAME],
+ struct buf_2d yv12_mb[TOTAL_REFS_PER_FRAME][MAX_MB_PLANE]) {
+ const AV1_COMMON *cm = &cpi->common;
+ const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame);
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MODE_INFO *const mi = xd->mi[0];
+ int_mv *const candidates = x->mbmi_ext->ref_mvs[ref_frame];
+ const struct scale_factors *const sf = &cm->frame_refs[ref_frame - 1].sf;
+ MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
+
+ assert(yv12 != NULL);
+
+ // TODO(jkoleszar): Is the UV buffer ever used here? If so, need to make this
+ // use the UV scaling factors.
+ av1_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col, sf, sf);
+
+ // Gets an initial list of candidate vectors from neighbours and orders them
+ av1_find_mv_refs(
+ cm, xd, mi, ref_frame,
+#if CONFIG_REF_MV
+ &mbmi_ext->ref_mv_count[ref_frame], mbmi_ext->ref_mv_stack[ref_frame],
+#if CONFIG_EXT_INTER
+ mbmi_ext->compound_mode_context,
+#endif // CONFIG_EXT_INTER
+#endif // CONFIG_REF_MV
+ candidates, mi_row, mi_col, NULL, NULL, mbmi_ext->mode_context);
+
+ // Candidate refinement carried out at encoder and decoder
+ av1_find_best_ref_mvs(cm->allow_high_precision_mv, candidates,
+ &frame_nearest_mv[ref_frame],
+ &frame_near_mv[ref_frame]);
+
+// Further refinement that is encode side only to test the top few candidates
+// in full and choose the best as the centre point for subsequent searches.
+// The current implementation doesn't support scaling.
+#if CONFIG_CB4X4
+ av1_mv_pred(cpi, x, yv12_mb[ref_frame][0].buf, yv12->y_stride, ref_frame,
+ block_size);
+#else
+ if (!av1_is_scaled(sf) && block_size >= BLOCK_8X8)
+ av1_mv_pred(cpi, x, yv12_mb[ref_frame][0].buf, yv12->y_stride, ref_frame,
+ block_size);
+#endif // CONFIG_CB4X4
+}
+
+// Runs a motion search against one reference frame for the current block.
+//
+// On return x->best_mv holds the selected vector (or INVALID_MV when the
+// adaptive early-out triggers) and, unless the early-out triggered, *rate_mv
+// holds the cost of coding it relative to the first reference MV candidate.
+//
+//   cpi            - encoder instance (speed features, function pointers).
+//   x              - macroblock; best_mv, second_best_mv, pred_mv[] and
+//                    pred_sse[] are updated.
+//   bsize          - block size being searched.
+//   mi_row, mi_col - block position in mode-info units.
+//   ref_idx        - (CONFIG_EXT_INTER only) which of the block's references
+//                    to search; otherwise reference 0 is used.
+//   rate_mv        - output: motion vector rate.
+//
+// x->mv_limits is narrowed for the duration of the full-pixel search and is
+// restored on every exit path, including the adaptive early return.
+static void single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x,
+                                 BLOCK_SIZE bsize, int mi_row, int mi_col,
+#if CONFIG_EXT_INTER
+                                 int ref_idx,
+#endif // CONFIG_EXT_INTER
+                                 int *rate_mv) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  const AV1_COMMON *cm = &cpi->common;
+  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+  struct buf_2d backup_yv12[MAX_MB_PLANE] = { { 0, 0, 0, 0, 0 } };
+  int bestsme = INT_MAX;
+  int step_param;
+  int sadpb = x->sadperbit16;
+  MV mvp_full;
+#if CONFIG_EXT_INTER
+  int ref = mbmi->ref_frame[ref_idx];
+#else
+  int ref = mbmi->ref_frame[0];
+  int ref_idx = 0;
+#endif // CONFIG_EXT_INTER
+  MV ref_mv = x->mbmi_ext->ref_mvs[ref][0].as_mv;
+
+  // Saved so the caller's limits can be put back after the search.
+  MvLimits tmp_mv_limits = x->mv_limits;
+  int cost_list[5];
+
+  const YV12_BUFFER_CONFIG *scaled_ref_frame =
+      av1_get_scaled_ref_frame(cpi, ref);
+
+  MV pred_mv[3];
+  pred_mv[0] = x->mbmi_ext->ref_mvs[ref][0].as_mv;
+  pred_mv[1] = x->mbmi_ext->ref_mvs[ref][1].as_mv;
+  pred_mv[2] = x->pred_mv[ref];
+
+  if (scaled_ref_frame) {
+    int i;
+    // Swap out the reference frame for a version that's been scaled to
+    // match the resolution of the current frame, allowing the existing
+    // motion search code to be used without additional modifications.
+    for (i = 0; i < MAX_MB_PLANE; i++)
+      backup_yv12[i] = xd->plane[i].pre[ref_idx];
+
+    av1_setup_pre_planes(xd, ref_idx, scaled_ref_frame, mi_row, mi_col, NULL);
+  }
+
+  av1_set_mv_search_range(&x->mv_limits, &ref_mv);
+
+#if CONFIG_REF_MV
+  av1_set_mvcost(x, ref, ref_idx, mbmi->ref_mv_idx);
+#endif // CONFIG_REF_MV
+
+  // Work out the size of the first step in the mv step search.
+  // 0 here is maximum length first step. 1 is AOMMAX >> 1 etc.
+  if (cpi->sf.mv.auto_mv_step_size && cm->show_frame) {
+    // Take wtd average of the step_params based on the last frame's
+    // max mv magnitude and that based on the best ref mvs of the current
+    // block for the given reference.
+    step_param =
+        (av1_init_search_range(x->max_mv_context[ref]) + cpi->mv_step_param) /
+        2;
+  } else {
+    step_param = cpi->mv_step_param;
+  }
+
+  if (cpi->sf.adaptive_motion_search && bsize < cm->sb_size) {
+    int boffset =
+        2 * (b_width_log2_lookup[cm->sb_size] -
+             AOMMIN(b_height_log2_lookup[bsize], b_width_log2_lookup[bsize]));
+    step_param = AOMMAX(step_param, boffset);
+  }
+
+  if (cpi->sf.adaptive_motion_search) {
+    int bwl = b_width_log2_lookup[bsize];
+    int bhl = b_height_log2_lookup[bsize];
+    int tlevel = x->pred_mv_sad[ref] >> (bwl + bhl + 4);
+
+    if (tlevel < 5) step_param += 2;
+
+    // prev_mv_sad is not setup for dynamically scaled frames.
+    if (cpi->oxcf.resize_mode != RESIZE_DYNAMIC) {
+      int i;
+      for (i = LAST_FRAME; i <= ALTREF_FRAME && cm->show_frame; ++i) {
+        if ((x->pred_mv_sad[ref] >> 3) > x->pred_mv_sad[i]) {
+          // Another reference predicts this block far better: give up on
+          // this reference without searching.
+          x->pred_mv[ref].row = 0;
+          x->pred_mv[ref].col = 0;
+          x->best_mv.as_int = INVALID_MV;
+
+          // The search range was narrowed above; undo that so the caller
+          // does not inherit limits that were specific to this reference.
+          x->mv_limits = tmp_mv_limits;
+
+          if (scaled_ref_frame) {
+            int j;
+            for (j = 0; j < MAX_MB_PLANE; ++j)
+              xd->plane[j].pre[ref_idx] = backup_yv12[j];
+          }
+          return;
+        }
+      }
+    }
+  }
+
+  av1_set_mv_search_range(&x->mv_limits, &ref_mv);
+
+#if CONFIG_MOTION_VAR
+  if (mbmi->motion_mode != SIMPLE_TRANSLATION)
+    mvp_full = mbmi->mv[0].as_mv;
+  else
+#endif // CONFIG_MOTION_VAR
+    mvp_full = pred_mv[x->mv_best_ref_index[ref]];
+
+  // Drop the fractional bits of the starting point.
+  mvp_full.col >>= 3;
+  mvp_full.row >>= 3;
+
+  x->best_mv.as_int = x->second_best_mv.as_int = INVALID_MV;
+
+#if CONFIG_MOTION_VAR
+  switch (mbmi->motion_mode) {
+    case SIMPLE_TRANSLATION:
+#endif // CONFIG_MOTION_VAR
+      bestsme = av1_full_pixel_search(cpi, x, bsize, &mvp_full, step_param,
+                                      sadpb, cond_cost_list(cpi, cost_list),
+                                      &ref_mv, INT_MAX, 1);
+#if CONFIG_MOTION_VAR
+      break;
+    case OBMC_CAUSAL:
+      bestsme = av1_obmc_full_pixel_diamond(
+          cpi, x, &mvp_full, step_param, sadpb,
+          MAX_MVSEARCH_STEPS - 1 - step_param, 1, &cpi->fn_ptr[bsize], &ref_mv,
+          &(x->best_mv.as_mv), 0);
+      break;
+    // A bare string literal is always true; "0 &&" makes the assert fire.
+    default: assert(0 && "Invalid motion mode!\n");
+  }
+#endif // CONFIG_MOTION_VAR
+
+  x->mv_limits = tmp_mv_limits;
+
+  if (bestsme < INT_MAX) {
+    int dis; /* TODO: use dis in distortion calculation later. */
+#if CONFIG_MOTION_VAR
+    switch (mbmi->motion_mode) {
+      case SIMPLE_TRANSLATION:
+#endif // CONFIG_MOTION_VAR
+        if (cpi->sf.use_upsampled_references) {
+          int best_mv_var;
+          const int try_second = x->second_best_mv.as_int != INVALID_MV &&
+                                 x->second_best_mv.as_int != x->best_mv.as_int;
+          const int pw = block_size_wide[bsize];
+          const int ph = block_size_high[bsize];
+          // Use up-sampled reference frames.
+          struct macroblockd_plane *const pd = &xd->plane[0];
+          struct buf_2d backup_pred = pd->pre[ref_idx];
+          const YV12_BUFFER_CONFIG *upsampled_ref = get_upsampled_ref(cpi, ref);
+
+          // Set pred for Y plane
+          setup_pred_plane(
+              &pd->pre[ref_idx], bsize, upsampled_ref->y_buffer,
+              upsampled_ref->y_crop_width, upsampled_ref->y_crop_height,
+              upsampled_ref->y_stride, (mi_row << 3), (mi_col << 3), NULL,
+              pd->subsampling_x, pd->subsampling_y);
+
+          best_mv_var = cpi->find_fractional_mv_step(
+              x, &ref_mv, cm->allow_high_precision_mv, x->errorperbit,
+              &cpi->fn_ptr[bsize], cpi->sf.mv.subpel_force_stop,
+              cpi->sf.mv.subpel_iters_per_step, cond_cost_list(cpi, cost_list),
+              x->nmvjointcost, x->mvcost, &dis, &x->pred_sse[ref], NULL, pw, ph,
+              1);
+
+          if (try_second) {
+            const int minc =
+                AOMMAX(x->mv_limits.col_min * 8, ref_mv.col - MV_MAX);
+            const int maxc =
+                AOMMIN(x->mv_limits.col_max * 8, ref_mv.col + MV_MAX);
+            const int minr =
+                AOMMAX(x->mv_limits.row_min * 8, ref_mv.row - MV_MAX);
+            const int maxr =
+                AOMMIN(x->mv_limits.row_max * 8, ref_mv.row + MV_MAX);
+            int this_var;
+            // Refined result of the primary candidate.
+            MV best_mv = x->best_mv.as_mv;
+
+            x->best_mv = x->second_best_mv;
+            if (x->best_mv.as_mv.row * 8 <= maxr &&
+                x->best_mv.as_mv.row * 8 >= minr &&
+                x->best_mv.as_mv.col * 8 <= maxc &&
+                x->best_mv.as_mv.col * 8 >= minc) {
+              this_var = cpi->find_fractional_mv_step(
+                  x, &ref_mv, cm->allow_high_precision_mv, x->errorperbit,
+                  &cpi->fn_ptr[bsize], cpi->sf.mv.subpel_force_stop,
+                  cpi->sf.mv.subpel_iters_per_step,
+                  cond_cost_list(cpi, cost_list), x->nmvjointcost, x->mvcost,
+                  &dis, &x->pred_sse[ref], NULL, pw, ph, 1);
+              if (this_var < best_mv_var) best_mv = x->best_mv.as_mv;
+            }
+            // Restore the winner even when the second candidate was out of
+            // range; otherwise x->best_mv would be left holding the
+            // unrefined second-best full-pixel vector.
+            x->best_mv.as_mv = best_mv;
+          }
+
+          // Restore the reference frames.
+          pd->pre[ref_idx] = backup_pred;
+        } else {
+          cpi->find_fractional_mv_step(
+              x, &ref_mv, cm->allow_high_precision_mv, x->errorperbit,
+              &cpi->fn_ptr[bsize], cpi->sf.mv.subpel_force_stop,
+              cpi->sf.mv.subpel_iters_per_step, cond_cost_list(cpi, cost_list),
+              x->nmvjointcost, x->mvcost, &dis, &x->pred_sse[ref], NULL, 0, 0,
+              0);
+        }
+#if CONFIG_MOTION_VAR
+        break;
+      case OBMC_CAUSAL:
+        av1_find_best_obmc_sub_pixel_tree_up(
+            cpi, x, mi_row, mi_col, &x->best_mv.as_mv, &ref_mv,
+            cm->allow_high_precision_mv, x->errorperbit, &cpi->fn_ptr[bsize],
+            cpi->sf.mv.subpel_force_stop, cpi->sf.mv.subpel_iters_per_step,
+            x->nmvjointcost, x->mvcost, &dis, &x->pred_sse[ref], 0,
+            cpi->sf.use_upsampled_references);
+        break;
+      default: assert(0 && "Invalid motion mode!\n");
+    }
+#endif // CONFIG_MOTION_VAR
+  }
+  *rate_mv = av1_mv_bit_cost(&x->best_mv.as_mv, &ref_mv, x->nmvjointcost,
+                             x->mvcost, MV_COST_WEIGHT);
+
+#if CONFIG_MOTION_VAR
+  if (cpi->sf.adaptive_motion_search && mbmi->motion_mode == SIMPLE_TRANSLATION)
+#else
+  if (cpi->sf.adaptive_motion_search)
+#endif // CONFIG_MOTION_VAR
+    x->pred_mv[ref] = x->best_mv.as_mv;
+
+  if (scaled_ref_frame) {
+    int i;
+    for (i = 0; i < MAX_MB_PLANE; i++)
+      xd->plane[i].pre[ref_idx] = backup_yv12[i];
+  }
+}
+
+// Points the destination buffer of every plane in |xd| back at the buffer
+// pointer and stride recorded in |dst|.
+static INLINE void restore_dst_buf(MACROBLOCKD *xd, BUFFER_SET dst) {
+  int plane = 0;
+  while (plane < MAX_MB_PLANE) {
+    struct macroblockd_plane *const pd = &xd->plane[plane];
+    pd->dst.buf = dst.plane[plane];
+    pd->dst.stride = dst.stride[plane];
+    ++plane;
+  }
+}
+
+#if CONFIG_EXT_INTER
+#if CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE
+// Motion search for one reference of a masked compound prediction.
+//
+//   cpi              - encoder instance.
+//   x                - macroblock; pred_mv[] and pred_sse[] are updated.
+//   mask/mask_stride - blending mask weighting the search error.
+//   bsize            - block size being searched.
+//   mi_row, mi_col   - block position in mode-info units.
+//   tmp_mv           - output: selected vector (INVALID_MV when the adaptive
+//                      early-out triggers).
+//   rate_mv          - output: cost of coding *tmp_mv relative to the first
+//                      reference MV candidate (not written on early-out).
+//   ref_idx          - which of the block's two references to search.
+//
+// x->mv_limits is narrowed for the full-pixel search and restored on every
+// exit path, including the adaptive early return.
+static void do_masked_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x,
+                                    const uint8_t *mask, int mask_stride,
+                                    BLOCK_SIZE bsize, int mi_row, int mi_col,
+                                    int_mv *tmp_mv, int *rate_mv, int ref_idx) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  const AV1_COMMON *cm = &cpi->common;
+  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+  struct buf_2d backup_yv12[MAX_MB_PLANE] = { { 0, 0, 0, 0, 0 } };
+  int bestsme = INT_MAX;
+  int step_param;
+  int sadpb = x->sadperbit16;
+  MV mvp_full;
+  int ref = mbmi->ref_frame[ref_idx];
+  MV ref_mv = x->mbmi_ext->ref_mvs[ref][0].as_mv;
+
+  // Saved so the caller's limits can be put back after the search.
+  MvLimits tmp_mv_limits = x->mv_limits;
+
+  const YV12_BUFFER_CONFIG *scaled_ref_frame =
+      av1_get_scaled_ref_frame(cpi, ref);
+  int i;
+
+  MV pred_mv[3];
+  pred_mv[0] = x->mbmi_ext->ref_mvs[ref][0].as_mv;
+  pred_mv[1] = x->mbmi_ext->ref_mvs[ref][1].as_mv;
+  pred_mv[2] = x->pred_mv[ref];
+
+#if CONFIG_REF_MV
+  av1_set_mvcost(x, ref, ref_idx, mbmi->ref_mv_idx);
+#endif // CONFIG_REF_MV
+
+  if (scaled_ref_frame) {
+    // Swap out the reference frame for a version that's been scaled to
+    // match the resolution of the current frame, allowing the existing
+    // motion search code to be used without additional modifications.
+    for (i = 0; i < MAX_MB_PLANE; i++)
+      backup_yv12[i] = xd->plane[i].pre[ref_idx];
+
+    av1_setup_pre_planes(xd, ref_idx, scaled_ref_frame, mi_row, mi_col, NULL);
+  }
+
+  av1_set_mv_search_range(&x->mv_limits, &ref_mv);
+
+  // Work out the size of the first step in the mv step search.
+  // 0 here is maximum length first step. 1 is MAX >> 1 etc.
+  if (cpi->sf.mv.auto_mv_step_size && cm->show_frame) {
+    // Take wtd average of the step_params based on the last frame's
+    // max mv magnitude and that based on the best ref mvs of the current
+    // block for the given reference.
+    step_param =
+        (av1_init_search_range(x->max_mv_context[ref]) + cpi->mv_step_param) /
+        2;
+  } else {
+    step_param = cpi->mv_step_param;
+  }
+
+  // TODO(debargha): is show_frame needed here?
+  if (cpi->sf.adaptive_motion_search && bsize < cm->sb_size && cm->show_frame) {
+    int boffset =
+        2 * (b_width_log2_lookup[cm->sb_size] -
+             AOMMIN(b_height_log2_lookup[bsize], b_width_log2_lookup[bsize]));
+    step_param = AOMMAX(step_param, boffset);
+  }
+
+  if (cpi->sf.adaptive_motion_search) {
+    int bwl = b_width_log2_lookup[bsize];
+    int bhl = b_height_log2_lookup[bsize];
+    int tlevel = x->pred_mv_sad[ref] >> (bwl + bhl + 4);
+
+    if (tlevel < 5) step_param += 2;
+
+    // prev_mv_sad is not setup for dynamically scaled frames.
+    if (cpi->oxcf.resize_mode != RESIZE_DYNAMIC) {
+      for (i = LAST_FRAME; i <= ALTREF_FRAME && cm->show_frame; ++i) {
+        if ((x->pred_mv_sad[ref] >> 3) > x->pred_mv_sad[i]) {
+          // Another reference predicts this block far better: give up on
+          // this reference without searching.
+          x->pred_mv[ref].row = 0;
+          x->pred_mv[ref].col = 0;
+          tmp_mv->as_int = INVALID_MV;
+
+          // The search range was narrowed above; undo that so the caller
+          // does not inherit limits that were specific to this reference.
+          x->mv_limits = tmp_mv_limits;
+
+          if (scaled_ref_frame) {
+            int j;
+            for (j = 0; j < MAX_MB_PLANE; ++j)
+              xd->plane[j].pre[ref_idx] = backup_yv12[j];
+          }
+          return;
+        }
+      }
+    }
+  }
+
+  mvp_full = pred_mv[x->mv_best_ref_index[ref]];
+
+  // Drop the fractional bits of the starting point.
+  mvp_full.col >>= 3;
+  mvp_full.row >>= 3;
+
+  bestsme = av1_masked_full_pixel_diamond(
+      cpi, x, mask, mask_stride, &mvp_full, step_param, sadpb,
+      MAX_MVSEARCH_STEPS - 1 - step_param, 1, &cpi->fn_ptr[bsize], &ref_mv,
+      &tmp_mv->as_mv, ref_idx);
+
+  x->mv_limits = tmp_mv_limits;
+
+  if (bestsme < INT_MAX) {
+    int dis; /* TODO: use dis in distortion calculation later. */
+    av1_find_best_masked_sub_pixel_tree_up(
+        cpi, x, mask, mask_stride, mi_row, mi_col, &tmp_mv->as_mv, &ref_mv,
+        cm->allow_high_precision_mv, x->errorperbit, &cpi->fn_ptr[bsize],
+        cpi->sf.mv.subpel_force_stop, cpi->sf.mv.subpel_iters_per_step,
+        x->nmvjointcost, x->mvcost, &dis, &x->pred_sse[ref], ref_idx,
+        cpi->sf.use_upsampled_references);
+  }
+  *rate_mv = av1_mv_bit_cost(&tmp_mv->as_mv, &ref_mv, x->nmvjointcost,
+                             x->mvcost, MV_COST_WEIGHT);
+
+  if (cpi->sf.adaptive_motion_search && cm->show_frame)
+    x->pred_mv[ref] = tmp_mv->as_mv;
+
+  if (scaled_ref_frame) {
+    for (i = 0; i < MAX_MB_PLANE; i++)
+      xd->plane[i].pre[ref_idx] = backup_yv12[i];
+  }
+}
+
+// Runs do_masked_motion_search() for one or both references of a masked
+// compound block, using the mask described by |comp_data|.
+//   tmp_mv, rate_mv - two-element output arrays; entry 0 is written when
+//                     reference 0 is searched, entry 1 when reference 1 is.
+//   which           - 0: search reference 0 only, 1: reference 1 only,
+//                     2: both.
+// Reference 0 is searched with the mask itself and reference 1 with the
+// inverse mask.
+static void do_masked_motion_search_indexed(
+ const AV1_COMP *const cpi, MACROBLOCK *x,
+ const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE bsize,
+ int mi_row, int mi_col, int_mv *tmp_mv, int *rate_mv, int which) {
+ // NOTE: which values: 0 - 0 only, 1 - 1 only, 2 - both
+ MACROBLOCKD *xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+ // The mask lookup uses the block's stored sb_type rather than |bsize|.
+ BLOCK_SIZE sb_type = mbmi->sb_type;
+ const uint8_t *mask;
+ const int mask_stride = block_size_wide[bsize];
+
+ mask = av1_get_compound_type_mask(comp_data, sb_type);
+
+ if (which == 0 || which == 2)
+ do_masked_motion_search(cpi, x, mask, mask_stride, bsize, mi_row, mi_col,
+ &tmp_mv[0], &rate_mv[0], 0);
+
+ if (which == 1 || which == 2) {
+// get the negative mask
+#if CONFIG_COMPOUND_SEGMENT
+ // Scratch storage handed to the inverse-mask helper; it lives on the stack
+ // for the duration of the second search.
+ uint8_t inv_mask_buf[2 * MAX_SB_SQUARE];
+ const int h = block_size_high[bsize];
+ mask = av1_get_compound_type_mask_inverse(
+ comp_data, inv_mask_buf, h, mask_stride, mask_stride, sb_type);
+#else
+ mask = av1_get_compound_type_mask_inverse(comp_data, sb_type);
+#endif // CONFIG_COMPOUND_SEGMENT
+ do_masked_motion_search(cpi, x, mask, mask_stride, bsize, mi_row, mi_col,
+ &tmp_mv[1], &rate_mv[1], 1);
+ }
+}
+#endif // CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE
+#endif // CONFIG_EXT_INTER
+
+// In some situations we want to discount the apparent cost of a new motion
+// vector. Where there is a subtle motion field and especially where there is
+// low spatial complexity then it can be hard to cover the cost of a new motion
+// vector in a single block, even if that motion vector reduces distortion.
+// However, once established that vector may be usable through the nearest and
+// near mv modes to reduce distortion in subsequent blocks and also improve
+// visual quality.
+// Returns nonzero when all of the following hold: the source frame is not an
+// alt-ref, this_mode is NEWMV, this_mv is non-zero, and both the NEARESTMV and
+// NEARMV vectors recorded in mode_mv for ref_frame are either zero or
+// INVALID_MV.
+static int discount_newmv_test(const AV1_COMP *const cpi, int this_mode,
+ int_mv this_mv,
+ int_mv (*mode_mv)[TOTAL_REFS_PER_FRAME],
+ int ref_frame) {
+ return (!cpi->rc.is_src_frame_alt_ref && (this_mode == NEWMV) &&
+ (this_mv.as_int != 0) &&
+ ((mode_mv[NEARESTMV][ref_frame].as_int == 0) ||
+ (mode_mv[NEARESTMV][ref_frame].as_int == INVALID_MV)) &&
+ ((mode_mv[NEARMV][ref_frame].as_int == 0) ||
+ (mode_mv[NEARMV][ref_frame].as_int == INVALID_MV)));
+}
+
+// Distance a motion vector may reach beyond the frame edge: the frame border
+// minus the interpolation filter extension, shifted left by 3 (presumably to
+// convert pixels to 1/8-pel MV units - TODO confirm). Both margins currently
+// have the same value.
+#define LEFT_TOP_MARGIN ((AOM_BORDER_IN_PIXELS - AOM_INTERP_EXTEND) << 3)
+#define RIGHT_BOTTOM_MARGIN ((AOM_BORDER_IN_PIXELS - AOM_INTERP_EXTEND) << 3)
+
+// Clamps |mv| in place to the block's distance-to-edge values in |xd|,
+// widened by the margins above on each side.
+// TODO(jingning): this mv clamping function should be block size dependent.
+static INLINE void clamp_mv2(MV *mv, const MACROBLOCKD *xd) {
+ clamp_mv(mv, xd->mb_to_left_edge - LEFT_TOP_MARGIN,
+ xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN,
+ xd->mb_to_top_edge - LEFT_TOP_MARGIN,
+ xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN);
+}
+
+#if CONFIG_EXT_INTER
+#if CONFIG_WEDGE
+// Cheap estimate of the wedge sign for a compound prediction.
+//
+// The SSE of the source against each predictor is measured at four positions
+// (top-left, top-right, bottom-left, bottom-right of the block) using the
+// variance function selected by (bsize - BLOCK_8X8), presumably the block
+// size with half the width and height of |bsize| - TODO confirm against the
+// BLOCK_SIZE enum ordering.
+//
+//   pred0/stride0 - first predictor and its stride.
+//   pred1/stride1 - second predictor and its stride.
+//
+// Returns 1 when (tl + br) > 0, otherwise 0, where tl and br are the SSE
+// differences computed at the end of the function.
+static int estimate_wedge_sign(const AV1_COMP *cpi, const MACROBLOCK *x,
+                               const BLOCK_SIZE bsize, const uint8_t *pred0,
+                               int stride0, const uint8_t *pred1, int stride1) {
+  const struct macroblock_plane *const p = &x->plane[0];
+  const uint8_t *src = p->src.buf;
+  int src_stride = p->src.stride;
+  const int f_index = bsize - BLOCK_8X8;
+  const int bw = block_size_wide[bsize];
+  const int bh = block_size_high[bsize];
+  uint32_t esq[2][4], var;
+  int64_t tl, br;
+
+#if CONFIG_HIGHBITDEPTH
+  if (x->e_mbd.cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    pred0 = CONVERT_TO_BYTEPTR(pred0);
+    pred1 = CONVERT_TO_BYTEPTR(pred1);
+  }
+#endif // CONFIG_HIGHBITDEPTH
+
+  // Predictor 0, four positions.
+  var = cpi->fn_ptr[f_index].vf(src, src_stride, pred0, stride0, &esq[0][0]);
+  var = cpi->fn_ptr[f_index].vf(src + bw / 2, src_stride, pred0 + bw / 2,
+                                stride0, &esq[0][1]);
+  var = cpi->fn_ptr[f_index].vf(src + bh / 2 * src_stride, src_stride,
+                                pred0 + bh / 2 * stride0, stride0, &esq[0][2]);
+  var = cpi->fn_ptr[f_index].vf(src + bh / 2 * src_stride + bw / 2, src_stride,
+                                pred0 + bh / 2 * stride0 + bw / 2, stride0,
+                                &esq[0][3]);
+
+  // Predictor 1, four positions. Every access to pred1 must use stride1;
+  // passing stride0 for the lower two positions only worked when both
+  // predictors happened to share a stride.
+  var = cpi->fn_ptr[f_index].vf(src, src_stride, pred1, stride1, &esq[1][0]);
+  var = cpi->fn_ptr[f_index].vf(src + bw / 2, src_stride, pred1 + bw / 2,
+                                stride1, &esq[1][1]);
+  var = cpi->fn_ptr[f_index].vf(src + bh / 2 * src_stride, src_stride,
+                                pred1 + bh / 2 * stride1, stride1, &esq[1][2]);
+  var = cpi->fn_ptr[f_index].vf(src + bh / 2 * src_stride + bw / 2, src_stride,
+                                pred1 + bh / 2 * stride1 + bw / 2, stride1,
+                                &esq[1][3]);
+  // Only the SSE outputs are used; the returned variance is discarded.
+  (void)var;
+
+  tl = (int64_t)(esq[0][0] + esq[0][1] + esq[0][2]) -
+       (int64_t)(esq[1][0] + esq[1][1] + esq[1][2]);
+  br = (int64_t)(esq[1][3] + esq[1][1] + esq[1][2]) -
+       (int64_t)(esq[0][3] + esq[0][1] + esq[0][2]);
+  return (tl + br > 0);
+}
+#endif // CONFIG_WEDGE
+#endif // CONFIG_EXT_INTER
+
+#if !CONFIG_DUAL_FILTER
+// Predicts an interpolation filter for the current block without running a
+// filter search. Returns SWITCHABLE when no prediction is made.
+//   single_filter - filters previously recorded per [mode][reference frame];
+//                   read only for compound prediction.
+// The prediction is built up in three steps, each able to override the
+// previous one:
+//  1. chessboard-pattern prediction from the above/left neighbours,
+//  2. for compound blocks, agreement between the filters recorded for the two
+//     component single-reference modes,
+//  3. forcing EIGHTTAP_REGULAR when the source variance is below the speed
+//     feature threshold.
+static InterpFilter predict_interp_filter(
+ const AV1_COMP *cpi, const MACROBLOCK *x, const BLOCK_SIZE bsize,
+ const int mi_row, const int mi_col,
+ InterpFilter (*single_filter)[TOTAL_REFS_PER_FRAME]) {
+ InterpFilter best_filter = SWITCHABLE;
+ const AV1_COMMON *cm = &cpi->common;
+ const MACROBLOCKD *xd = &x->e_mbd;
+ int bsl = mi_width_log2_lookup[bsize];
+ // 0 or 1: alternates with block position and with the frame's chessboard
+ // index when cb_pred_filter_search is enabled, otherwise always 0.
+ int pred_filter_search =
+ cpi->sf.cb_pred_filter_search
+ ? (((mi_row + mi_col) >> bsl) +
+ get_chessboard_index(cm->current_video_frame)) &
+ 0x1
+ : 0;
+ MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+ const int is_comp_pred = has_second_ref(mbmi);
+ const int this_mode = mbmi->mode;
+ // A negative second reference is mapped to index 0 so it can be used to
+ // index single_filter.
+ int refs[2] = { mbmi->ref_frame[0],
+ (mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]) };
+ if (pred_filter_search) {
+ InterpFilter af = SWITCHABLE, lf = SWITCHABLE;
+ if (xd->up_available) af = xd->mi[-xd->mi_stride]->mbmi.interp_filter;
+ if (xd->left_available) lf = xd->mi[-1]->mbmi.interp_filter;
+
+ // Use the above neighbour's filter unless this is a new-MV mode and the
+ // two neighbours disagree.
+#if CONFIG_EXT_INTER
+ if ((this_mode != NEWMV && this_mode != NEW_NEWMV) || (af == lf))
+#else
+ if ((this_mode != NEWMV) || (af == lf))
+#endif // CONFIG_EXT_INTER
+ best_filter = af;
+ }
+ if (is_comp_pred) {
+ if (cpi->sf.adaptive_mode_search) {
+#if CONFIG_EXT_INTER
+ // Each compound mode maps to a pair of single-reference modes; the filter
+ // is reused only when both component lookups agree.
+ switch (this_mode) {
+ case NEAREST_NEARESTMV:
+ if (single_filter[NEARESTMV][refs[0]] ==
+ single_filter[NEARESTMV][refs[1]])
+ best_filter = single_filter[NEARESTMV][refs[0]];
+ break;
+ case NEAREST_NEARMV:
+ if (single_filter[NEARESTMV][refs[0]] ==
+ single_filter[NEARMV][refs[1]])
+ best_filter = single_filter[NEARESTMV][refs[0]];
+ break;
+ case NEAR_NEARESTMV:
+ if (single_filter[NEARMV][refs[0]] ==
+ single_filter[NEARESTMV][refs[1]])
+ best_filter = single_filter[NEARMV][refs[0]];
+ break;
+ case NEAR_NEARMV:
+ if (single_filter[NEARMV][refs[0]] == single_filter[NEARMV][refs[1]])
+ best_filter = single_filter[NEARMV][refs[0]];
+ break;
+ case ZERO_ZEROMV:
+ if (single_filter[ZEROMV][refs[0]] == single_filter[ZEROMV][refs[1]])
+ best_filter = single_filter[ZEROMV][refs[0]];
+ break;
+ case NEW_NEWMV:
+ if (single_filter[NEWMV][refs[0]] == single_filter[NEWMV][refs[1]])
+ best_filter = single_filter[NEWMV][refs[0]];
+ break;
+ case NEAREST_NEWMV:
+ if (single_filter[NEARESTMV][refs[0]] ==
+ single_filter[NEWMV][refs[1]])
+ best_filter = single_filter[NEARESTMV][refs[0]];
+ break;
+ case NEAR_NEWMV:
+ if (single_filter[NEARMV][refs[0]] == single_filter[NEWMV][refs[1]])
+ best_filter = single_filter[NEARMV][refs[0]];
+ break;
+ case NEW_NEARESTMV:
+ if (single_filter[NEWMV][refs[0]] ==
+ single_filter[NEARESTMV][refs[1]])
+ best_filter = single_filter[NEWMV][refs[0]];
+ break;
+ case NEW_NEARMV:
+ if (single_filter[NEWMV][refs[0]] == single_filter[NEARMV][refs[1]])
+ best_filter = single_filter[NEWMV][refs[0]];
+ break;
+ default:
+ if (single_filter[this_mode][refs[0]] ==
+ single_filter[this_mode][refs[1]])
+ best_filter = single_filter[this_mode][refs[0]];
+ break;
+ }
+#else
+ if (single_filter[this_mode][refs[0]] ==
+ single_filter[this_mode][refs[1]])
+ best_filter = single_filter[this_mode][refs[0]];
+#endif // CONFIG_EXT_INTER
+ }
+ }
+ // Low-variance source: skip any filter choice and use the regular filter.
+ if (x->source_variance < cpi->sf.disable_filter_search_var_thresh) {
+ best_filter = EIGHTTAP_REGULAR;
+ }
+ return best_filter;
+}
+#endif // !CONFIG_DUAL_FILTER
+
+#if CONFIG_EXT_INTER
+// Choose the best wedge index and sign
+#if CONFIG_WEDGE
+// Searches all wedge shapes available for |bsize|, choosing the sign for each
+// shape from the residuals, and keeps the index/sign pair with the lowest
+// modelled RD cost.
+//   p0, p1           - the two predictors, both laid out with a stride equal
+//                      to the block width.
+//   best_wedge_sign  - output: sign of the best wedge.
+//   best_wedge_index - output: index of the best wedge.
+// Returns the best RD cost found (INT64_MAX if no candidate improved on it).
+static int64_t pick_wedge(const AV1_COMP *const cpi, const MACROBLOCK *const x,
+ const BLOCK_SIZE bsize, const uint8_t *const p0,
+ const uint8_t *const p1, int *const best_wedge_sign,
+ int *const best_wedge_index) {
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const struct buf_2d *const src = &x->plane[0].src;
+ const int bw = block_size_wide[bsize];
+ const int bh = block_size_high[bsize];
+ const int N = bw * bh;
+ int rate;
+ int64_t dist;
+ int64_t rd, best_rd = INT64_MAX;
+ int wedge_index;
+ int wedge_sign;
+ int wedge_types = (1 << get_wedge_bits_lookup(bsize));
+ const uint8_t *mask;
+ uint64_t sse;
+#if CONFIG_HIGHBITDEPTH
+ const int hbd = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH;
+ // Rounding shift applied to the SSE for bit depths above 8.
+ const int bd_round = hbd ? (xd->bd - 8) * 2 : 0;
+#else
+ const int bd_round = 0;
+#endif // CONFIG_HIGHBITDEPTH
+
+ // r0/r1: residuals of the source against p0/p1; d10: difference between
+ // the two predictors; ds: output of av1_wedge_compute_delta_squares().
+ DECLARE_ALIGNED(32, int16_t, r0[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, int16_t, r1[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, int16_t, d10[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, int16_t, ds[MAX_SB_SQUARE]);
+
+ int64_t sign_limit;
+
+#if CONFIG_HIGHBITDEPTH
+ if (hbd) {
+ aom_highbd_subtract_block(bh, bw, r0, bw, src->buf, src->stride,
+ CONVERT_TO_BYTEPTR(p0), bw, xd->bd);
+ aom_highbd_subtract_block(bh, bw, r1, bw, src->buf, src->stride,
+ CONVERT_TO_BYTEPTR(p1), bw, xd->bd);
+ aom_highbd_subtract_block(bh, bw, d10, bw, CONVERT_TO_BYTEPTR(p1), bw,
+ CONVERT_TO_BYTEPTR(p0), bw, xd->bd);
+ } else // NOLINT
+#endif // CONFIG_HIGHBITDEPTH
+ {
+ aom_subtract_block(bh, bw, r0, bw, src->buf, src->stride, p0, bw);
+ aom_subtract_block(bh, bw, r1, bw, src->buf, src->stride, p1, bw);
+ aom_subtract_block(bh, bw, d10, bw, p1, bw, p0, bw);
+ }
+
+ // Threshold passed to the sign decision: half the difference of the two
+ // residual energies, scaled by the wedge weight precision.
+ sign_limit = ((int64_t)aom_sum_squares_i16(r0, N) -
+ (int64_t)aom_sum_squares_i16(r1, N)) *
+ (1 << WEDGE_WEIGHT_BITS) / 2;
+
+ // Blocks with fewer than 64 samples use the C implementations directly.
+ if (N < 64)
+ av1_wedge_compute_delta_squares_c(ds, r0, r1, N);
+ else
+ av1_wedge_compute_delta_squares(ds, r0, r1, N);
+
+ for (wedge_index = 0; wedge_index < wedge_types; ++wedge_index) {
+ // The sign is decided using the sign-0 mask, then the mask for the chosen
+ // sign is fetched for the SSE computation.
+ mask = av1_get_contiguous_soft_mask(wedge_index, 0, bsize);
+
+ // TODO(jingning): Make sse2 functions support N = 16 case
+ if (N < 64)
+ wedge_sign = av1_wedge_sign_from_residuals_c(ds, mask, N, sign_limit);
+ else
+ wedge_sign = av1_wedge_sign_from_residuals(ds, mask, N, sign_limit);
+
+ mask = av1_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize);
+ if (N < 64)
+ sse = av1_wedge_sse_from_residuals_c(r1, d10, mask, N);
+ else
+ sse = av1_wedge_sse_from_residuals(r1, d10, mask, N);
+ sse = ROUND_POWER_OF_TWO(sse, bd_round);
+
+ model_rd_from_sse(cpi, xd, bsize, 0, sse, &rate, &dist);
+ rd = RDCOST(x->rdmult, x->rddiv, rate, dist);
+
+ if (rd < best_rd) {
+ *best_wedge_index = wedge_index;
+ *best_wedge_sign = wedge_sign;
+ best_rd = rd;
+ }
+ }
+
+ return best_rd;
+}
+
+// Choose the best wedge index for the specified sign.
+//   p0, p1           - the two predictors, both laid out with a stride equal
+//                      to the block width.
+//   wedge_sign       - sign used for every candidate mask.
+//   best_wedge_index - output: index of the wedge with the lowest modelled
+//                      RD cost.
+// Returns the best RD cost found (INT64_MAX if no candidate improved on it).
+static int64_t pick_wedge_fixed_sign(
+ const AV1_COMP *const cpi, const MACROBLOCK *const x,
+ const BLOCK_SIZE bsize, const uint8_t *const p0, const uint8_t *const p1,
+ const int wedge_sign, int *const best_wedge_index) {
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const struct buf_2d *const src = &x->plane[0].src;
+ const int bw = block_size_wide[bsize];
+ const int bh = block_size_high[bsize];
+ const int N = bw * bh;
+ int rate;
+ int64_t dist;
+ int64_t rd, best_rd = INT64_MAX;
+ int wedge_index;
+ int wedge_types = (1 << get_wedge_bits_lookup(bsize));
+ const uint8_t *mask;
+ uint64_t sse;
+#if CONFIG_HIGHBITDEPTH
+ const int hbd = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH;
+ // Rounding shift applied to the SSE for bit depths above 8.
+ const int bd_round = hbd ? (xd->bd - 8) * 2 : 0;
+#else
+ const int bd_round = 0;
+#endif // CONFIG_HIGHBITDEPTH
+
+ // r1: residual of the source against p1; d10: difference between the two
+ // predictors.
+ DECLARE_ALIGNED(32, int16_t, r1[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, int16_t, d10[MAX_SB_SQUARE]);
+
+#if CONFIG_HIGHBITDEPTH
+ if (hbd) {
+ aom_highbd_subtract_block(bh, bw, r1, bw, src->buf, src->stride,
+ CONVERT_TO_BYTEPTR(p1), bw, xd->bd);
+ aom_highbd_subtract_block(bh, bw, d10, bw, CONVERT_TO_BYTEPTR(p1), bw,
+ CONVERT_TO_BYTEPTR(p0), bw, xd->bd);
+ } else // NOLINT
+#endif // CONFIG_HIGHBITDEPTH
+ {
+ aom_subtract_block(bh, bw, r1, bw, src->buf, src->stride, p1, bw);
+ aom_subtract_block(bh, bw, d10, bw, p1, bw, p0, bw);
+ }
+
+ for (wedge_index = 0; wedge_index < wedge_types; ++wedge_index) {
+ mask = av1_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize);
+ // Blocks with fewer than 64 samples use the C implementation directly.
+ if (N < 64)
+ sse = av1_wedge_sse_from_residuals_c(r1, d10, mask, N);
+ else
+ sse = av1_wedge_sse_from_residuals(r1, d10, mask, N);
+ sse = ROUND_POWER_OF_TWO(sse, bd_round);
+
+ model_rd_from_sse(cpi, xd, bsize, 0, sse, &rate, &dist);
+ rd = RDCOST(x->rdmult, x->rddiv, rate, dist);
+
+ if (rd < best_rd) {
+ *best_wedge_index = wedge_index;
+ best_rd = rd;
+ }
+ }
+
+ return best_rd;
+}
+
+// Selects the wedge index and sign for an inter-inter wedge compound and
+// stores them in the current block's mode info.
+//   p0, p1 - the two predictors, with a stride equal to the block width.
+// With the fast_wedge_sign_estimate speed feature the sign is estimated up
+// front and only the index is searched; otherwise both are searched.
+// Returns the RD cost reported by the wedge search.
+static int64_t pick_interinter_wedge(const AV1_COMP *const cpi,
+                                     MACROBLOCK *const x,
+                                     const BLOCK_SIZE bsize,
+                                     const uint8_t *const p0,
+                                     const uint8_t *const p1) {
+  MB_MODE_INFO *const mbmi = &x->e_mbd.mi[0]->mbmi;
+  const int pred_stride = block_size_wide[bsize];
+  int chosen_index = -1;
+  int chosen_sign = 0;
+  int64_t best_rd;
+
+  assert(is_interinter_compound_used(COMPOUND_WEDGE, bsize));
+
+  if (!cpi->sf.fast_wedge_sign_estimate) {
+    // Exhaustive: search index and sign together.
+    best_rd = pick_wedge(cpi, x, bsize, p0, p1, &chosen_sign, &chosen_index);
+  } else {
+    // Fast: fix the sign first, then search the index only.
+    chosen_sign =
+        estimate_wedge_sign(cpi, x, bsize, p0, pred_stride, p1, pred_stride);
+    best_rd = pick_wedge_fixed_sign(cpi, x, bsize, p0, p1, chosen_sign,
+                                    &chosen_index);
+  }
+
+  mbmi->wedge_sign = chosen_sign;
+  mbmi->wedge_index = chosen_index;
+  return best_rd;
+}
+#endif // CONFIG_WEDGE
+
+#if CONFIG_COMPOUND_SEGMENT
+// Tries every segmentation mask type for a COMPOUND_SEG prediction, keeps the
+// one with the lowest modelled RD cost, stores it in mbmi->mask_type and
+// leaves the corresponding mask built in xd->seg_mask.
+//   p0, p1 - the two predictors, with a stride equal to the block width.
+// Returns the best RD cost found.
+static int64_t pick_interinter_seg(const AV1_COMP *const cpi,
+ MACROBLOCK *const x, const BLOCK_SIZE bsize,
+ const uint8_t *const p0,
+ const uint8_t *const p1) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+ const struct buf_2d *const src = &x->plane[0].src;
+ const int bw = block_size_wide[bsize];
+ const int bh = block_size_high[bsize];
+ const int N = bw * bh;
+ int rate;
+ uint64_t sse;
+ int64_t dist;
+ int64_t rd0;
+ SEG_MASK_TYPE cur_mask_type;
+ int64_t best_rd = INT64_MAX;
+ SEG_MASK_TYPE best_mask_type = 0;
+#if CONFIG_HIGHBITDEPTH
+ const int hbd = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH;
+ // Rounding shift applied to the SSE for bit depths above 8.
+ const int bd_round = hbd ? (xd->bd - 8) * 2 : 0;
+#else
+ const int bd_round = 0;
+#endif // CONFIG_HIGHBITDEPTH
+ // NOTE(review): r0 is filled below but is not read by this function.
+ DECLARE_ALIGNED(32, int16_t, r0[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, int16_t, r1[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, int16_t, d10[MAX_SB_SQUARE]);
+
+#if CONFIG_HIGHBITDEPTH
+ if (hbd) {
+ aom_highbd_subtract_block(bh, bw, r0, bw, src->buf, src->stride,
+ CONVERT_TO_BYTEPTR(p0), bw, xd->bd);
+ aom_highbd_subtract_block(bh, bw, r1, bw, src->buf, src->stride,
+ CONVERT_TO_BYTEPTR(p1), bw, xd->bd);
+ aom_highbd_subtract_block(bh, bw, d10, bw, CONVERT_TO_BYTEPTR(p1), bw,
+ CONVERT_TO_BYTEPTR(p0), bw, xd->bd);
+ } else // NOLINT
+#endif // CONFIG_HIGHBITDEPTH
+ {
+ aom_subtract_block(bh, bw, r0, bw, src->buf, src->stride, p0, bw);
+ aom_subtract_block(bh, bw, r1, bw, src->buf, src->stride, p1, bw);
+ aom_subtract_block(bh, bw, d10, bw, p1, bw, p0, bw);
+ }
+
+ // try each mask type and its inverse
+ for (cur_mask_type = 0; cur_mask_type < SEG_MASK_TYPES; cur_mask_type++) {
+// build mask and inverse
+#if CONFIG_HIGHBITDEPTH
+ if (hbd)
+ build_compound_seg_mask_highbd(
+ xd->seg_mask, cur_mask_type, CONVERT_TO_BYTEPTR(p0), bw,
+ CONVERT_TO_BYTEPTR(p1), bw, bsize, bh, bw, xd->bd);
+ else
+#endif // CONFIG_HIGHBITDEPTH
+ build_compound_seg_mask(xd->seg_mask, cur_mask_type, p0, bw, p1, bw,
+ bsize, bh, bw);
+
+ // compute rd for mask
+ // NOTE(review): unlike the wedge pickers in this file there is no N < 64
+ // fallback to the _c implementation here; presumably COMPOUND_SEG is never
+ // used for blocks that small - confirm against is_interinter_compound_used.
+ sse = av1_wedge_sse_from_residuals(r1, d10, xd->seg_mask, N);
+ sse = ROUND_POWER_OF_TWO(sse, bd_round);
+
+ model_rd_from_sse(cpi, xd, bsize, 0, sse, &rate, &dist);
+ rd0 = RDCOST(x->rdmult, x->rddiv, rate, dist);
+
+ if (rd0 < best_rd) {
+ best_mask_type = cur_mask_type;
+ best_rd = rd0;
+ }
+ }
+
+ // make final mask
+ // xd->seg_mask currently holds the last mask tried, so rebuild it for the
+ // winning type.
+ mbmi->mask_type = best_mask_type;
+#if CONFIG_HIGHBITDEPTH
+ if (hbd)
+ build_compound_seg_mask_highbd(
+ xd->seg_mask, mbmi->mask_type, CONVERT_TO_BYTEPTR(p0), bw,
+ CONVERT_TO_BYTEPTR(p1), bw, bsize, bh, bw, xd->bd);
+ else
+#endif // CONFIG_HIGHBITDEPTH
+ build_compound_seg_mask(xd->seg_mask, mbmi->mask_type, p0, bw, p1, bw,
+ bsize, bh, bw);
+
+ return best_rd;
+}
+#endif // CONFIG_COMPOUND_SEGMENT
+
+#if CONFIG_WEDGE && CONFIG_INTERINTRA
+// Chooses the wedge for an inter-intra block with the wedge sign fixed at 0.
+// The selected index (and sign 0) are written into the current block's mode
+// info; the value reported by pick_wedge_fixed_sign() is returned unchanged.
+static int64_t pick_interintra_wedge(const AV1_COMP *const cpi,
+                                     const MACROBLOCK *const x,
+                                     const BLOCK_SIZE bsize,
+                                     const uint8_t *const p0,
+                                     const uint8_t *const p1) {
+  MB_MODE_INFO *const cur_mbmi = &x->e_mbd.mi[0]->mbmi;
+  int best_wedge = -1;
+
+  assert(is_interintra_wedge_used(bsize));
+
+  const int64_t wedge_rd =
+      pick_wedge_fixed_sign(cpi, x, bsize, p0, p1, 0, &best_wedge);
+
+  cur_mbmi->interintra_wedge_sign = 0;
+  cur_mbmi->interintra_wedge_index = best_wedge;
+  return wedge_rd;
+}
+#endif // CONFIG_WEDGE && CONFIG_INTERINTRA
+
+#if CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE
+// Dispatches to the mask search that matches the block's inter-inter compound
+// type and returns that search's result. Any other compound type trips the
+// assert and yields 0.
+static int64_t pick_interinter_mask(const AV1_COMP *const cpi, MACROBLOCK *x,
+                                    const BLOCK_SIZE bsize,
+                                    const uint8_t *const p0,
+                                    const uint8_t *const p1) {
+  const COMPOUND_TYPE type = x->e_mbd.mi[0]->mbmi.interinter_compound_type;
+#if CONFIG_WEDGE
+  if (type == COMPOUND_WEDGE)
+    return pick_interinter_wedge(cpi, x, bsize, p0, p1);
+#endif  // CONFIG_WEDGE
+#if CONFIG_COMPOUND_SEGMENT
+  if (type == COMPOUND_SEG) return pick_interinter_seg(cpi, x, bsize, p0, p1);
+#endif  // CONFIG_COMPOUND_SEGMENT
+  // No handler is compiled in for this compound type.
+  assert(0);
+  return 0;
+}
+
+// Runs the masked motion search for the reference(s) that carry a new motion
+// vector in this_mode, stores the searched vector(s) in mbmi->mv[] and
+// returns the summed rate of the searched vector(s). Modes that match none of
+// the cases below leave mbmi->mv[] untouched and return 0.
+static int interinter_compound_motion_search(const AV1_COMP *const cpi,
+                                             MACROBLOCK *x,
+                                             const BLOCK_SIZE bsize,
+                                             const int this_mode, int mi_row,
+                                             int mi_col) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  int_mv searched_mv[2];
+  int searched_rate[2];
+  // Mask parameters of the current block, handed to the search by pointer.
+  const INTERINTER_COMPOUND_DATA mask_params = {
+#if CONFIG_WEDGE
+    mbmi->wedge_index,
+    mbmi->wedge_sign,
+#endif  // CONFIG_WEDGE
+#if CONFIG_COMPOUND_SEGMENT
+    mbmi->mask_type,
+    xd->seg_mask,
+#endif  // CONFIG_COMPOUND_SEGMENT
+    mbmi->interinter_compound_type
+  };
+
+  switch (this_mode) {
+    case NEW_NEWMV:
+      // Selector 2: both entries of the result arrays are consumed.
+      do_masked_motion_search_indexed(cpi, x, &mask_params, bsize, mi_row,
+                                      mi_col, searched_mv, searched_rate, 2);
+      mbmi->mv[0].as_int = searched_mv[0].as_int;
+      mbmi->mv[1].as_int = searched_mv[1].as_int;
+      return searched_rate[0] + searched_rate[1];
+    case NEW_NEARESTMV:
+    case NEW_NEARMV:
+      // Selector 0: only the first reference gets a new vector.
+      do_masked_motion_search_indexed(cpi, x, &mask_params, bsize, mi_row,
+                                      mi_col, searched_mv, searched_rate, 0);
+      mbmi->mv[0].as_int = searched_mv[0].as_int;
+      return searched_rate[0];
+    case NEAREST_NEWMV:
+    case NEAR_NEWMV:
+      // Selector 1: only the second reference gets a new vector.
+      do_masked_motion_search_indexed(cpi, x, &mask_params, bsize, mi_row,
+                                      mi_col, searched_mv, searched_rate, 1);
+      mbmi->mv[1].as_int = searched_mv[1].as_int;
+      return searched_rate[1];
+    default: return 0;
+  }
+}
+
+// Picks the compound mask for the current block, optionally refines the
+// motion vectors with a masked motion search, builds the luma predictor and
+// returns the estimated luma RD cost (INT64_MAX when estimate_yrd_for_sb()
+// reports INT64_MAX).
+//
+// *out_rate_mv is written only when the masked motion search runs: it holds
+// the searched MV rate if the modelled RD of the refined vectors beats the
+// mask-search RD, otherwise rate_mv (and mbmi->mv[] is reset to cur_mv).
+static int64_t build_and_cost_compound_type(
+    const AV1_COMP *const cpi, MACROBLOCK *x, const int_mv *const cur_mv,
+    const BLOCK_SIZE bsize, const int this_mode, int rs2, int rate_mv,
+    BUFFER_SET *ctx, int *out_rate_mv, uint8_t **preds0, uint8_t **preds1,
+    int *strides, int mi_row, int mi_col) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  const COMPOUND_TYPE compound_type = mbmi->interinter_compound_type;
+  int est_rate;
+  int64_t est_dist;
+  int unused_skip_txfm;
+  int64_t unused_skip_sse;
+  // MV rate charged in the final cost; read only after the estimate below.
+  const int *final_rate_mv = &rate_mv;
+  // Whether the predictor is blended from the pre-computed single-reference
+  // buffers (preds0/preds1) instead of the refined-MV prediction.
+  int blend_from_bufs = 1;
+
+  int64_t mask_rd = pick_interinter_mask(cpi, x, bsize, *preds0, *preds1);
+  mask_rd += RDCOST(x->rdmult, x->rddiv, rs2 + rate_mv, 0);
+
+  if (have_newmv_in_inter_mode(this_mode) &&
+      use_masked_motion_search(compound_type)) {
+    *out_rate_mv = interinter_compound_motion_search(cpi, x, bsize, this_mode,
+                                                     mi_row, mi_col);
+    av1_build_inter_predictors_sby(xd, mi_row, mi_col, ctx, bsize);
+    model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &est_rate, &est_dist,
+                    &unused_skip_txfm, &unused_skip_sse);
+    const int64_t refined_rd = RDCOST(x->rdmult, x->rddiv,
+                                      rs2 + *out_rate_mv + est_rate, est_dist);
+    if (refined_rd < mask_rd) {
+      // Keep the refined vectors and the prediction just built from them.
+      blend_from_bufs = 0;
+    } else {
+      // Refinement did not help: fall back to the incoming vectors.
+      mbmi->mv[0].as_int = cur_mv[0].as_int;
+      mbmi->mv[1].as_int = cur_mv[1].as_int;
+      *out_rate_mv = rate_mv;
+    }
+    final_rate_mv = out_rate_mv;
+  }
+
+  if (blend_from_bufs) {
+    av1_build_wedge_inter_predictor_from_buf(xd, bsize, 0, 0,
+#if CONFIG_SUPERTX
+                                             0, 0,
+#endif  // CONFIG_SUPERTX
+                                             preds0, strides, preds1, strides);
+  }
+
+  av1_subtract_plane(x, bsize, 0);
+  int64_t luma_rd =
+      estimate_yrd_for_sb(cpi, bsize, x, &est_rate, &est_dist,
+                          &unused_skip_txfm, &unused_skip_sse, INT64_MAX);
+  if (luma_rd != INT64_MAX) {
+    luma_rd = RDCOST(x->rdmult, x->rddiv, rs2 + *final_rate_mv + est_rate,
+                     est_dist);
+  }
+  return luma_rd;
+}
+#endif // CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE
+#endif // CONFIG_EXT_INTER
+
+// Arguments bundled for handle_inter_mode() and the helpers it passes them to
+// (handle_newmv(), interpolation_filter_search(), motion_mode_rd()).
+typedef struct {
+#if CONFIG_MOTION_VAR
+ // Inter prediction buffers and respective strides
+ // (passed to av1_build_obmc_inter_prediction() by motion_mode_rd()).
+ uint8_t *above_pred_buf[MAX_MB_PLANE];
+ int above_pred_stride[MAX_MB_PLANE];
+ uint8_t *left_pred_buf[MAX_MB_PLANE];
+ int left_pred_stride[MAX_MB_PLANE];
+#endif // CONFIG_MOTION_VAR
+ // Single-reference NEWMV search results, indexed by reference frame.
+ int_mv *single_newmv;
+#if CONFIG_EXT_INTER
+ // Pointer to array of motion vectors to use for each ref and their rates
+ // Should point to first of 2 arrays in 2D array
+ // (handle_newmv() stores the rate of each single_newmv entry here).
+ int *single_newmv_rate;
+ // Pointers to the costs of compound inter-intra and inter-inter predictions
+ // (both are reset to 0 at the start of handle_inter_mode()).
+ int *compmode_interintra_cost;
+ int *compmode_interinter_cost;
+ // Pointer to array of predicted rate-distortion
+ // Should point to first of 2 arrays in 2D array
+ int64_t (*modelled_rd)[TOTAL_REFS_PER_FRAME];
+#endif // CONFIG_EXT_INTER
+ // Per-mode, per-reference filter table handed to
+ // interpolation_filter_search().
+ InterpFilter single_filter[MB_MODE_COUNT][TOTAL_REFS_PER_FRAME];
+} HandleInterModeArgs;
+
+// Determines the motion vector(s) and the MV rate for a mode that contains a
+// new motion vector.
+//
+// Compound prediction: frame_mv[] (= mode_mv[this_mode]) is seeded from the
+// single-reference results; depending on the mode and on
+// cpi->sf.comp_inter_joint_search_thresh the vectors are refined with
+// joint_motion_search() or simply costed with av1_mv_bit_cost().
+// Single prediction: runs single_motion_search() (or, for inter-intra under
+// CONFIG_EXT_INTER, reuses the result cached in args), then stores the
+// vector in frame_mv[] and in the block's bmi[0].
+//
+// *rate_mv receives the MV rate. Returns 0 on success and INT64_MAX when the
+// single-reference search leaves x->best_mv equal to INVALID_MV.
+static int64_t handle_newmv(const AV1_COMP *const cpi, MACROBLOCK *const x,
+ const BLOCK_SIZE bsize,
+ int_mv (*const mode_mv)[TOTAL_REFS_PER_FRAME],
+ const int mi_row, const int mi_col,
+ int *const rate_mv, int_mv *const single_newmv,
+ HandleInterModeArgs *const args) {
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+ const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
+ const int is_comp_pred = has_second_ref(mbmi);
+ const PREDICTION_MODE this_mode = mbmi->mode;
+#if CONFIG_EXT_INTER
+ const int is_comp_interintra_pred = (mbmi->ref_frame[1] == INTRA_FRAME);
+#endif // CONFIG_EXT_INTER
+ int_mv *const frame_mv = mode_mv[this_mode];
+ // A negative second reference is mapped to index 0 so refs[1] is always a
+ // valid array index.
+ const int refs[2] = { mbmi->ref_frame[0],
+ mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1] };
+ int i;
+
+ // args is only dereferenced when CONFIG_EXT_INTER is enabled.
+ (void)args;
+
+ if (is_comp_pred) {
+#if CONFIG_EXT_INTER
+ // Copy the cached single-reference results into the caller's array.
+ for (i = 0; i < 2; ++i) {
+ single_newmv[refs[i]].as_int = args->single_newmv[refs[i]].as_int;
+ }
+
+ if (this_mode == NEW_NEWMV) {
+ frame_mv[refs[0]].as_int = single_newmv[refs[0]].as_int;
+ frame_mv[refs[1]].as_int = single_newmv[refs[1]].as_int;
+
+ if (cpi->sf.comp_inter_joint_search_thresh <= bsize) {
+ joint_motion_search(cpi, x, bsize, frame_mv, mi_row, mi_col, NULL,
+ rate_mv, 0);
+ } else {
+ // No joint search: cost both seeded vectors against their ref MVs.
+ *rate_mv = 0;
+ for (i = 0; i < 2; ++i) {
+#if CONFIG_REF_MV
+ av1_set_mvcost(x, refs[i], i, mbmi->ref_mv_idx);
+#endif // CONFIG_REF_MV
+ *rate_mv += av1_mv_bit_cost(
+ &frame_mv[refs[i]].as_mv, &mbmi_ext->ref_mvs[refs[i]][0].as_mv,
+ x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
+ }
+ }
+ } else if (this_mode == NEAREST_NEWMV || this_mode == NEAR_NEWMV) {
+ // Only the second reference carries a new vector.
+ frame_mv[refs[1]].as_int = single_newmv[refs[1]].as_int;
+#if CONFIG_REF_MV
+ av1_set_mvcost(x, refs[1], 1, mbmi->ref_mv_idx);
+#endif // CONFIG_REF_MV
+ *rate_mv = av1_mv_bit_cost(&frame_mv[refs[1]].as_mv,
+ &mbmi_ext->ref_mvs[refs[1]][0].as_mv,
+ x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
+ } else {
+ // Only the first reference carries a new vector.
+ assert(this_mode == NEW_NEARESTMV || this_mode == NEW_NEARMV);
+ frame_mv[refs[0]].as_int = single_newmv[refs[0]].as_int;
+#if CONFIG_REF_MV
+ av1_set_mvcost(x, refs[0], 0, mbmi->ref_mv_idx);
+#endif // CONFIG_REF_MV
+ *rate_mv = av1_mv_bit_cost(&frame_mv[refs[0]].as_mv,
+ &mbmi_ext->ref_mvs[refs[0]][0].as_mv,
+ x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
+ }
+#else
+ // Initialize mv using single prediction mode result.
+ frame_mv[refs[0]].as_int = single_newmv[refs[0]].as_int;
+ frame_mv[refs[1]].as_int = single_newmv[refs[1]].as_int;
+
+ if (cpi->sf.comp_inter_joint_search_thresh <= bsize) {
+ joint_motion_search(cpi, x, bsize, frame_mv, mi_row, mi_col, rate_mv, 0);
+ } else {
+ *rate_mv = 0;
+ for (i = 0; i < 2; ++i) {
+#if CONFIG_REF_MV
+ av1_set_mvcost(x, refs[i], i, mbmi->ref_mv_idx);
+#endif // CONFIG_REF_MV
+ *rate_mv += av1_mv_bit_cost(&frame_mv[refs[i]].as_mv,
+ &mbmi_ext->ref_mvs[refs[i]][0].as_mv,
+ x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
+ }
+ }
+#endif // CONFIG_EXT_INTER
+ } else {
+#if CONFIG_EXT_INTER
+ if (is_comp_interintra_pred) {
+ // Inter-intra reuses the vector and rate cached by an earlier
+ // single-reference search instead of searching again.
+ x->best_mv = args->single_newmv[refs[0]];
+ *rate_mv = args->single_newmv_rate[refs[0]];
+ } else {
+ single_motion_search(cpi, x, bsize, mi_row, mi_col, 0, rate_mv);
+ args->single_newmv[refs[0]] = x->best_mv;
+ args->single_newmv_rate[refs[0]] = *rate_mv;
+ }
+#else
+ single_motion_search(cpi, x, bsize, mi_row, mi_col, rate_mv);
+ single_newmv[refs[0]] = x->best_mv;
+#endif // CONFIG_EXT_INTER
+
+ if (x->best_mv.as_int == INVALID_MV) return INT64_MAX;
+
+ frame_mv[refs[0]] = x->best_mv;
+ xd->mi[0]->bmi[0].as_mv[0] = x->best_mv;
+
+ // Estimate the rate implications of a new mv but discount this
+ // under certain circumstances where we want to help initiate a weak
+ // motion field, where the distortion gain for a single block may not
+ // be enough to overcome the cost of a new mv.
+ if (discount_newmv_test(cpi, this_mode, x->best_mv, mode_mv, refs[0])) {
+ // The discounted rate is never allowed to drop below 1.
+ *rate_mv = AOMMAX(*rate_mv / NEW_MV_DISCOUNT_FACTOR, 1);
+ }
+ }
+
+ return 0;
+}
+
+// Selects the interpolation filter(s) for the current block and reports the
+// modelled RD of the selection.
+//
+// The block is first predicted and modelled with the default filter
+// assignment. When that assignment is SWITCHABLE and both
+// av1_is_interp_needed() and av1_is_interp_search_needed() hold, the
+// remaining filter candidates are tried as well and the one with the lowest
+// modelled RD is kept in mbmi->interp_filter.
+//
+// Outputs: *rd (best modelled RD), *switchable_rate (filter signalling rate
+// of the winner), *skip_txfm_sb / *skip_sse_sb (skip information reported by
+// model_rd_for_sb() for the winner). Always returns 0.
+//
+// tmp_dst and orig_dst are used as a ping-pong pair: trial predictions go to
+// whichever buffer does not hold the current best, and on exit the buffer
+// holding the best prediction is the one selected via restore_dst_buf().
+// NOTE(review): restore_dst_buf() presumably repoints xd's destination
+// planes at the given buffer set - confirm against its definition.
+int64_t interpolation_filter_search(
+ MACROBLOCK *const x, const AV1_COMP *const cpi, BLOCK_SIZE bsize,
+ int mi_row, int mi_col, const BUFFER_SET *const tmp_dst,
+ BUFFER_SET *const orig_dst,
+ InterpFilter (*const single_filter)[TOTAL_REFS_PER_FRAME],
+ int64_t *const rd, int *const switchable_rate, int *const skip_txfm_sb,
+ int64_t *const skip_sse_sb) {
+ const AV1_COMMON *cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+ int i;
+ int tmp_rate;
+ int64_t tmp_dist;
+
+ // single_filter is only read when CONFIG_DUAL_FILTER is disabled.
+ (void)single_filter;
+
+ InterpFilter assign_filter = SWITCHABLE;
+
+ if (cm->interp_filter == SWITCHABLE) {
+#if !CONFIG_DUAL_FILTER
+ assign_filter = av1_is_interp_needed(xd)
+ ? predict_interp_filter(cpi, x, bsize, mi_row, mi_col,
+ single_filter)
+ : cm->interp_filter;
+#endif // !CONFIG_DUAL_FILTER
+ } else {
+ // The frame-level filter is fixed; use it directly.
+ assign_filter = cm->interp_filter;
+ }
+
+ set_default_interp_filters(mbmi, assign_filter);
+
+ // Baseline: predict and model with the default filter assignment.
+ *switchable_rate = av1_get_switchable_rate(cpi, xd);
+ av1_build_inter_predictors_sb(xd, mi_row, mi_col, orig_dst, bsize);
+ model_rd_for_sb(cpi, bsize, x, xd, 0, MAX_MB_PLANE - 1, &tmp_rate, &tmp_dist,
+ skip_txfm_sb, skip_sse_sb);
+ *rd = RDCOST(x->rdmult, x->rddiv, *switchable_rate + tmp_rate, tmp_dist);
+
+ if (assign_filter == SWITCHABLE) {
+ // do interp_filter search
+ if (av1_is_interp_needed(xd) && av1_is_interp_search_needed(xd)) {
+#if CONFIG_DUAL_FILTER
+ const int filter_set_size = DUAL_FILTER_SET_SIZE;
+#else
+ const int filter_set_size = SWITCHABLE_FILTERS;
+#endif // CONFIG_DUAL_FILTER
+ // 1 when the best prediction so far lives in tmp_dst, 0 when it lives
+ // in orig_dst.
+ int best_in_temp = 0;
+#if CONFIG_DUAL_FILTER
+ InterpFilter best_filter[4];
+ av1_copy(best_filter, mbmi->interp_filter);
+#else
+ InterpFilter best_filter = mbmi->interp_filter;
+#endif // CONFIG_DUAL_FILTER
+ // The baseline sits in orig_dst, so the first trial goes to tmp_dst.
+ restore_dst_buf(xd, *tmp_dst);
+ // EIGHTTAP_REGULAR mode is calculated beforehand
+ for (i = 1; i < filter_set_size; ++i) {
+ int tmp_skip_sb = 0;
+ int64_t tmp_skip_sse = INT64_MAX;
+ int tmp_rs;
+ int64_t tmp_rd;
+#if CONFIG_DUAL_FILTER
+ mbmi->interp_filter[0] = filter_sets[i][0];
+ mbmi->interp_filter[1] = filter_sets[i][1];
+ mbmi->interp_filter[2] = filter_sets[i][0];
+ mbmi->interp_filter[3] = filter_sets[i][1];
+#else
+ mbmi->interp_filter = (InterpFilter)i;
+#endif // CONFIG_DUAL_FILTER
+ tmp_rs = av1_get_switchable_rate(cpi, xd);
+ av1_build_inter_predictors_sb(xd, mi_row, mi_col, orig_dst, bsize);
+ model_rd_for_sb(cpi, bsize, x, xd, 0, MAX_MB_PLANE - 1, &tmp_rate,
+ &tmp_dist, &tmp_skip_sb, &tmp_skip_sse);
+ tmp_rd = RDCOST(x->rdmult, x->rddiv, tmp_rs + tmp_rate, tmp_dist);
+
+ if (tmp_rd < *rd) {
+ *rd = tmp_rd;
+ // NOTE(review): this recomputes the rate already held in tmp_rs;
+ // presumably the two are equal - confirm before simplifying.
+ *switchable_rate = av1_get_switchable_rate(cpi, xd);
+#if CONFIG_DUAL_FILTER
+ av1_copy(best_filter, mbmi->interp_filter);
+#else
+ best_filter = mbmi->interp_filter;
+#endif // CONFIG_DUAL_FILTER
+ *skip_txfm_sb = tmp_skip_sb;
+ *skip_sse_sb = tmp_skip_sse;
+ // The buffer just written now holds the best prediction; direct
+ // the next trial to the other buffer.
+ best_in_temp = !best_in_temp;
+ if (best_in_temp) {
+ restore_dst_buf(xd, *orig_dst);
+ } else {
+ restore_dst_buf(xd, *tmp_dst);
+ }
+ }
+ }
+ // Leave the buffer that holds the best prediction selected.
+ if (best_in_temp) {
+ restore_dst_buf(xd, *tmp_dst);
+ } else {
+ restore_dst_buf(xd, *orig_dst);
+ }
+#if CONFIG_DUAL_FILTER
+ av1_copy(mbmi->interp_filter, best_filter);
+#else
+ mbmi->interp_filter = best_filter;
+#endif // CONFIG_DUAL_FILTER
+ } else {
+ // No search was run: the default assignment must still be in place.
+#if CONFIG_DUAL_FILTER
+ for (i = 0; i < 4; ++i)
+ assert(mbmi->interp_filter[i] == EIGHTTAP_REGULAR);
+#else
+ assert(mbmi->interp_filter == EIGHTTAP_REGULAR);
+#endif // CONFIG_DUAL_FILTER
+ }
+ }
+
+ return 0;
+}
+
+// Computes the rate-distortion statistics of the current inter block and,
+// when CONFIG_MOTION_VAR or CONFIG_WARPED_MOTION is enabled, picks the best
+// motion mode among SIMPLE_TRANSLATION .. last_motion_mode_allowed.
+//
+// For every candidate motion mode the function (re)builds the prediction
+// where needed, evaluates luma and chroma transform RD unless *skip_txfm_sb
+// is set, applies the skip decision and keeps the candidate with the lowest
+// RD cost. SIMPLE_TRANSLATION, when it completes, is always recorded as the
+// initial best.
+//
+// Outputs: rd_stats / rd_stats_y / rd_stats_uv, *disable_skip, x->skip and
+// *mbmi reflect the selected candidate. The destination buffers are reset to
+// *orig_dst before every return except the `continue` paths.
+//
+// Returns 0 on success and INT64_MAX when no candidate produced a valid RD.
+//
+// TODO(afergs): Refactor the MBMI references in here - there are four
+// TODO(afergs): Refactor optional args - add them to a struct or remove
+static int64_t motion_mode_rd(
+ const AV1_COMP *const cpi, MACROBLOCK *const x, BLOCK_SIZE bsize,
+ RD_STATS *rd_stats, RD_STATS *rd_stats_y, RD_STATS *rd_stats_uv,
+ int *disable_skip, int_mv (*mode_mv)[TOTAL_REFS_PER_FRAME], int mi_row,
+ int mi_col, HandleInterModeArgs *const args, const int64_t ref_best_rd,
+ const int *refs, int rate_mv,
+#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+#if CONFIG_EXT_INTER
+ int rate2_bmc_nocoeff, MB_MODE_INFO *best_bmc_mbmi,
+#if CONFIG_MOTION_VAR
+ int rate_mv_bmc,
+#endif // CONFIG_MOTION_VAR
+#endif // CONFIG_EXT_INTER
+#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+ int rs, int *skip_txfm_sb, int64_t *skip_sse_sb, BUFFER_SET *orig_dst) {
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *xd = &x->e_mbd;
+ MODE_INFO *mi = xd->mi[0];
+ MB_MODE_INFO *mbmi = &mi->mbmi;
+ const int is_comp_pred = has_second_ref(mbmi);
+ const PREDICTION_MODE this_mode = mbmi->mode;
+
+ // Several parameters and locals are only used under some build
+ // configurations; the casts below silence unused warnings in the others.
+ (void)mode_mv;
+ (void)mi_row;
+ (void)mi_col;
+ (void)args;
+ (void)refs;
+ (void)rate_mv;
+ (void)is_comp_pred;
+ (void)this_mode;
+
+#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+ MOTION_MODE motion_mode, last_motion_mode_allowed;
+ int rate2_nocoeff = 0, best_xskip, best_disable_skip = 0;
+ RD_STATS best_rd_stats, best_rd_stats_y, best_rd_stats_uv;
+ MB_MODE_INFO base_mbmi, best_mbmi;
+#if CONFIG_VAR_TX
+ uint8_t best_blk_skip[MAX_MB_PLANE][MAX_MIB_SIZE * MAX_MIB_SIZE * 4];
+#endif // CONFIG_VAR_TX
+#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+
+#if CONFIG_WARPED_MOTION
+ int pts[SAMPLES_ARRAY_SIZE], pts_inref[SAMPLES_ARRAY_SIZE];
+#endif // CONFIG_WARPED_MOTION
+
+#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+ av1_invalid_rd_stats(&best_rd_stats);
+#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+
+ // rs is added to the rate only when the frame-level filter is SWITCHABLE.
+ if (cm->interp_filter == SWITCHABLE) rd_stats->rate += rs;
+#if CONFIG_WARPED_MOTION
+ aom_clear_system_state();
+ mbmi->num_proj_ref[0] = findSamples(cm, xd, mi_row, mi_col, pts, pts_inref);
+#if CONFIG_EXT_INTER
+ best_bmc_mbmi->num_proj_ref[0] = mbmi->num_proj_ref[0];
+#endif // CONFIG_EXT_INTER
+#endif // CONFIG_WARPED_MOTION
+#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+ // Rate accumulated so far, before any coefficient cost is added.
+ rate2_nocoeff = rd_stats->rate;
+ last_motion_mode_allowed = motion_mode_allowed(
+#if CONFIG_GLOBAL_MOTION && SEPARATE_GLOBAL_MOTION
+ 0, xd->global_motion,
+#endif // CONFIG_GLOBAL_MOTION && SEPARATE_GLOBAL_MOTION
+ mi);
+ // Snapshot restored at the top of every motion mode iteration.
+ base_mbmi = *mbmi;
+#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+
+#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+ int64_t best_rd = INT64_MAX;
+ for (motion_mode = SIMPLE_TRANSLATION;
+ motion_mode <= last_motion_mode_allowed; motion_mode++) {
+ int64_t tmp_rd = INT64_MAX;
+ int tmp_rate;
+ int64_t tmp_dist;
+#if CONFIG_EXT_INTER
+ int tmp_rate2 =
+ motion_mode != SIMPLE_TRANSLATION ? rate2_bmc_nocoeff : rate2_nocoeff;
+#else
+ int tmp_rate2 = rate2_nocoeff;
+#endif // CONFIG_EXT_INTER
+
+ *mbmi = base_mbmi;
+ mbmi->motion_mode = motion_mode;
+#if CONFIG_MOTION_VAR
+ if (mbmi->motion_mode == OBMC_CAUSAL) {
+#if CONFIG_EXT_INTER
+ *mbmi = *best_bmc_mbmi;
+ mbmi->motion_mode = OBMC_CAUSAL;
+#endif // CONFIG_EXT_INTER
+ if (!is_comp_pred && have_newmv_in_inter_mode(this_mode)) {
+ int tmp_rate_mv = 0;
+
+ // Search the motion vector again for this motion mode.
+ single_motion_search(cpi, x, bsize, mi_row, mi_col,
+#if CONFIG_EXT_INTER
+ 0,
+#endif // CONFIG_EXT_INTER
+ &tmp_rate_mv);
+ mbmi->mv[0].as_int = x->best_mv.as_int;
+ if (discount_newmv_test(cpi, this_mode, mbmi->mv[0], mode_mv,
+ refs[0])) {
+ tmp_rate_mv = AOMMAX((tmp_rate_mv / NEW_MV_DISCOUNT_FACTOR), 1);
+ }
+ // Swap the earlier MV rate for the one of the new search.
+#if CONFIG_EXT_INTER
+ tmp_rate2 = rate2_bmc_nocoeff - rate_mv_bmc + tmp_rate_mv;
+#else
+ tmp_rate2 = rate2_nocoeff - rate_mv + tmp_rate_mv;
+#endif // CONFIG_EXT_INTER
+#if CONFIG_DUAL_FILTER
+ if (!has_subpel_mv_component(xd->mi[0], xd, 0))
+ mbmi->interp_filter[0] = EIGHTTAP_REGULAR;
+ if (!has_subpel_mv_component(xd->mi[0], xd, 1))
+ mbmi->interp_filter[1] = EIGHTTAP_REGULAR;
+#endif // CONFIG_DUAL_FILTER
+ av1_build_inter_predictors_sb(xd, mi_row, mi_col, orig_dst, bsize);
+#if CONFIG_EXT_INTER
+ } else {
+ av1_build_inter_predictors_sb(xd, mi_row, mi_col, orig_dst, bsize);
+#endif // CONFIG_EXT_INTER
+ }
+ av1_build_obmc_inter_prediction(
+ cm, xd, mi_row, mi_col, args->above_pred_buf, args->above_pred_stride,
+ args->left_pred_buf, args->left_pred_stride);
+ model_rd_for_sb(cpi, bsize, x, xd, 0, MAX_MB_PLANE - 1, &tmp_rate,
+ &tmp_dist, skip_txfm_sb, skip_sse_sb);
+ }
+#endif // CONFIG_MOTION_VAR
+
+#if CONFIG_WARPED_MOTION
+ if (mbmi->motion_mode == WARPED_CAUSAL) {
+#if CONFIG_EXT_INTER
+ *mbmi = *best_bmc_mbmi;
+ mbmi->motion_mode = WARPED_CAUSAL;
+#endif // CONFIG_EXT_INTER
+ mbmi->wm_params[0].wmtype = DEFAULT_WMTYPE;
+#if CONFIG_DUAL_FILTER
+ for (int dir = 0; dir < 4; ++dir)
+ mbmi->interp_filter[dir] = cm->interp_filter == SWITCHABLE
+ ? EIGHTTAP_REGULAR
+ : cm->interp_filter;
+#else
+ mbmi->interp_filter = cm->interp_filter == SWITCHABLE ? EIGHTTAP_REGULAR
+ : cm->interp_filter;
+#endif // CONFIG_DUAL_FILTER
+
+ // A non-zero return from find_projection() drops this motion mode.
+ if (find_projection(mbmi->num_proj_ref[0], pts, pts_inref, bsize,
+ mbmi->mv[0].as_mv.row, mbmi->mv[0].as_mv.col,
+ &mbmi->wm_params[0], mi_row, mi_col) == 0) {
+ av1_build_inter_predictors_sb(xd, mi_row, mi_col, NULL, bsize);
+ model_rd_for_sb(cpi, bsize, x, xd, 0, MAX_MB_PLANE - 1, &tmp_rate,
+ &tmp_dist, skip_txfm_sb, skip_sse_sb);
+ } else {
+ continue;
+ }
+ }
+#endif // CONFIG_WARPED_MOTION
+ x->skip = 0;
+
+ // Start this candidate's statistics from the coefficient-free rate.
+ rd_stats->dist = 0;
+ rd_stats->sse = 0;
+ rd_stats->skip = 1;
+ rd_stats->rate = tmp_rate2;
+ if (last_motion_mode_allowed > SIMPLE_TRANSLATION) {
+#if CONFIG_WARPED_MOTION && CONFIG_MOTION_VAR
+ if (last_motion_mode_allowed == WARPED_CAUSAL)
+#endif // CONFIG_WARPED_MOTION && CONFIG_MOTION_VAR
+ rd_stats->rate += cpi->motion_mode_cost[bsize][mbmi->motion_mode];
+#if CONFIG_WARPED_MOTION && CONFIG_MOTION_VAR
+ else
+ rd_stats->rate += cpi->motion_mode_cost1[bsize][mbmi->motion_mode];
+#endif // CONFIG_WARPED_MOTION && CONFIG_MOTION_VAR
+ }
+#if CONFIG_WARPED_MOTION
+ if (mbmi->motion_mode == WARPED_CAUSAL) {
+ // The filter is forced above for warped motion, so rs is taken back out.
+ rd_stats->rate -= rs;
+ }
+#endif // CONFIG_WARPED_MOTION
+#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+ if (!*skip_txfm_sb) {
+ int64_t rdcosty = INT64_MAX;
+ int is_cost_valid_uv = 0;
+
+ // cost and distortion
+ av1_subtract_plane(x, bsize, 0);
+#if CONFIG_VAR_TX
+ if (cm->tx_mode == TX_MODE_SELECT && !xd->lossless[mbmi->segment_id]) {
+ select_tx_type_yrd(cpi, x, rd_stats_y, bsize, ref_best_rd);
+ } else {
+ int idx, idy;
+ super_block_yrd(cpi, x, rd_stats_y, bsize, ref_best_rd);
+ // Propagate the single chosen transform size and skip flag to the
+ // per-unit arrays.
+ for (idy = 0; idy < xd->n8_h; ++idy)
+ for (idx = 0; idx < xd->n8_w; ++idx)
+ mbmi->inter_tx_size[idy][idx] = mbmi->tx_size;
+ memset(x->blk_skip[0], rd_stats_y->skip,
+ sizeof(uint8_t) * xd->n8_h * xd->n8_w * 4);
+ }
+#else
+ /* clang-format off */
+ super_block_yrd(cpi, x, rd_stats_y, bsize, ref_best_rd);
+/* clang-format on */
+#endif // CONFIG_VAR_TX
+
+ // A luma rate of INT_MAX marks the luma result as invalid.
+ if (rd_stats_y->rate == INT_MAX) {
+ av1_invalid_rd_stats(rd_stats);
+#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+ if (mbmi->motion_mode != SIMPLE_TRANSLATION) {
+ continue;
+ } else {
+#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+ restore_dst_buf(xd, *orig_dst);
+ return INT64_MAX;
+#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+ }
+#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+ }
+
+ av1_merge_rd_stats(rd_stats, rd_stats_y);
+
+ // Luma cost so far, capped by the cost of coding nothing; it reduces
+ // the RD budget passed to the chroma search below.
+ rdcosty = RDCOST(x->rdmult, x->rddiv, rd_stats->rate, rd_stats->dist);
+ rdcosty = AOMMIN(rdcosty, RDCOST(x->rdmult, x->rddiv, 0, rd_stats->sse));
+/* clang-format off */
+#if CONFIG_VAR_TX
+ is_cost_valid_uv =
+ inter_block_uvrd(cpi, x, rd_stats_uv, bsize, ref_best_rd - rdcosty);
+#else
+ is_cost_valid_uv =
+ super_block_uvrd(cpi, x, rd_stats_uv, bsize, ref_best_rd - rdcosty);
+#endif // CONFIG_VAR_TX
+ if (!is_cost_valid_uv) {
+#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+ continue;
+#else
+ restore_dst_buf(xd, *orig_dst);
+ return INT64_MAX;
+#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+ }
+ /* clang-format on */
+ av1_merge_rd_stats(rd_stats, rd_stats_uv);
+#if CONFIG_RD_DEBUG
+ // record transform block coefficient cost
+ // TODO(angiebird): So far rd_debug tool only detects discrepancy of
+ // coefficient cost. Therefore, it is fine to copy rd_stats into mbmi
+ // here because we already collect the coefficient cost. Move this part to
+ // other place when we need to compare non-coefficient cost.
+ mbmi->rd_stats = *rd_stats;
+#endif // CONFIG_RD_DEBUG
+#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+ if (rd_stats->skip) {
+ // Everything was skipped: replace the coefficient rates with the cost
+ // of signalling skip = 1.
+ rd_stats->rate -= rd_stats_uv->rate + rd_stats_y->rate;
+ rd_stats_y->rate = 0;
+ rd_stats_uv->rate = 0;
+ rd_stats->rate += av1_cost_bit(av1_get_skip_prob(cm, xd), 1);
+ mbmi->skip = 0;
+ // here mbmi->skip temporarily plays a role as what this_skip2 does
+ } else if (!xd->lossless[mbmi->segment_id] &&
+ (RDCOST(x->rdmult, x->rddiv,
+ rd_stats_y->rate + rd_stats_uv->rate +
+ av1_cost_bit(av1_get_skip_prob(cm, xd), 0),
+ rd_stats->dist) >=
+ RDCOST(x->rdmult, x->rddiv,
+ av1_cost_bit(av1_get_skip_prob(cm, xd), 1),
+ rd_stats->sse))) {
+ // Coding the residual costs at least as much as skipping it: force
+ // skip and take the SSE as distortion.
+ rd_stats->rate -= rd_stats_uv->rate + rd_stats_y->rate;
+ rd_stats->rate += av1_cost_bit(av1_get_skip_prob(cm, xd), 1);
+ rd_stats->dist = rd_stats->sse;
+ rd_stats_y->rate = 0;
+ rd_stats_uv->rate = 0;
+ mbmi->skip = 1;
+ } else {
+ rd_stats->rate += av1_cost_bit(av1_get_skip_prob(cm, xd), 0);
+ mbmi->skip = 0;
+ }
+ *disable_skip = 0;
+#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+ } else {
+ // *skip_txfm_sb is set: no transform search, the block is treated as
+ // skipped.
+ x->skip = 1;
+ *disable_skip = 1;
+ mbmi->tx_size = tx_size_from_tx_mode(bsize, cm->tx_mode, 1);
+
+// The cost of skip bit needs to be added.
+#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+ mbmi->skip = 0;
+#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+ rd_stats->rate += av1_cost_bit(av1_get_skip_prob(cm, xd), 1);
+
+ rd_stats->dist = *skip_sse_sb;
+ rd_stats->sse = *skip_sse_sb;
+ rd_stats_y->rate = 0;
+ rd_stats_uv->rate = 0;
+ rd_stats->skip = 1;
+ }
+
+#if CONFIG_GLOBAL_MOTION
+ if (this_mode == ZEROMV
+#if CONFIG_EXT_INTER
+ || this_mode == ZERO_ZEROMV
+#endif // CONFIG_EXT_INTER
+ ) {
+ if (is_nontrans_global_motion(xd)) {
+ // The filter is forced below in this case, so rs is taken back out.
+ rd_stats->rate -= rs;
+#if CONFIG_DUAL_FILTER
+ mbmi->interp_filter[0] = cm->interp_filter == SWITCHABLE
+ ? EIGHTTAP_REGULAR
+ : cm->interp_filter;
+ mbmi->interp_filter[1] = cm->interp_filter == SWITCHABLE
+ ? EIGHTTAP_REGULAR
+ : cm->interp_filter;
+#else
+ mbmi->interp_filter = cm->interp_filter == SWITCHABLE
+ ? EIGHTTAP_REGULAR
+ : cm->interp_filter;
+#endif // CONFIG_DUAL_FILTER
+ }
+ }
+#endif // CONFIG_GLOBAL_MOTION
+
+#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+ tmp_rd = RDCOST(x->rdmult, x->rddiv, rd_stats->rate, rd_stats->dist);
+ // SIMPLE_TRANSLATION is always recorded; later modes must beat best_rd.
+ if (mbmi->motion_mode == SIMPLE_TRANSLATION || (tmp_rd < best_rd)) {
+ best_mbmi = *mbmi;
+ best_rd = tmp_rd;
+ best_rd_stats = *rd_stats;
+ best_rd_stats_y = *rd_stats_y;
+ best_rd_stats_uv = *rd_stats_uv;
+#if CONFIG_VAR_TX
+ for (int i = 0; i < MAX_MB_PLANE; ++i)
+ memcpy(best_blk_skip[i], x->blk_skip[i],
+ sizeof(uint8_t) * xd->n8_h * xd->n8_w * 4);
+#endif // CONFIG_VAR_TX
+ best_xskip = x->skip;
+ best_disable_skip = *disable_skip;
+ }
+ }
+
+ // No candidate was recorded with a finite RD cost.
+ if (best_rd == INT64_MAX) {
+ av1_invalid_rd_stats(rd_stats);
+ restore_dst_buf(xd, *orig_dst);
+ return INT64_MAX;
+ }
+ // Restore the state of the winning candidate.
+ *mbmi = best_mbmi;
+ *rd_stats = best_rd_stats;
+ *rd_stats_y = best_rd_stats_y;
+ *rd_stats_uv = best_rd_stats_uv;
+#if CONFIG_VAR_TX
+ for (int i = 0; i < MAX_MB_PLANE; ++i)
+ memcpy(x->blk_skip[i], best_blk_skip[i],
+ sizeof(uint8_t) * xd->n8_h * xd->n8_w * 4);
+#endif // CONFIG_VAR_TX
+ x->skip = best_xskip;
+ *disable_skip = best_disable_skip;
+#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+
+ restore_dst_buf(xd, *orig_dst);
+ return 0;
+}
+
+static int64_t handle_inter_mode(
+ const AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
+ RD_STATS *rd_stats, RD_STATS *rd_stats_y, RD_STATS *rd_stats_uv,
+ int *disable_skip, int_mv (*mode_mv)[TOTAL_REFS_PER_FRAME], int mi_row,
+ int mi_col, HandleInterModeArgs *args, const int64_t ref_best_rd) {
+ const AV1_COMMON *cm = &cpi->common;
+ (void)cm;
+ MACROBLOCKD *xd = &x->e_mbd;
+ MODE_INFO *mi = xd->mi[0];
+ MB_MODE_INFO *mbmi = &mi->mbmi;
+ MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
+ const int is_comp_pred = has_second_ref(mbmi);
+ const int this_mode = mbmi->mode;
+ int_mv *frame_mv = mode_mv[this_mode];
+ int i;
+ int refs[2] = { mbmi->ref_frame[0],
+ (mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]) };
+ int_mv cur_mv[2];
+ int rate_mv = 0;
+#if CONFIG_EXT_INTER
+ int pred_exists = 1;
+ const int bw = block_size_wide[bsize];
+ int_mv single_newmv[TOTAL_REFS_PER_FRAME];
+#if CONFIG_INTERINTRA
+ const unsigned int *const interintra_mode_cost =
+ cpi->interintra_mode_cost[size_group_lookup[bsize]];
+#endif // CONFIG_INTERINTRA
+ const int is_comp_interintra_pred = (mbmi->ref_frame[1] == INTRA_FRAME);
+#if CONFIG_REF_MV
+ uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
+#endif // CONFIG_REF_MV
+#else
+ int_mv *const single_newmv = args->single_newmv;
+#endif // CONFIG_EXT_INTER
+#if CONFIG_HIGHBITDEPTH
+ DECLARE_ALIGNED(16, uint8_t, tmp_buf_[2 * MAX_MB_PLANE * MAX_SB_SQUARE]);
+#else
+ DECLARE_ALIGNED(16, uint8_t, tmp_buf_[MAX_MB_PLANE * MAX_SB_SQUARE]);
+#endif // CONFIG_HIGHBITDEPTH
+ uint8_t *tmp_buf;
+
+#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+#if CONFIG_EXT_INTER
+ int rate2_bmc_nocoeff;
+ MB_MODE_INFO best_bmc_mbmi;
+#if CONFIG_MOTION_VAR
+ int rate_mv_bmc;
+#endif // CONFIG_MOTION_VAR
+#endif // CONFIG_EXT_INTER
+#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+ int64_t rd = INT64_MAX;
+ BUFFER_SET orig_dst, tmp_dst;
+ int rs = 0;
+
+ int skip_txfm_sb = 0;
+ int64_t skip_sse_sb = INT64_MAX;
+ int16_t mode_ctx;
+
+#if CONFIG_EXT_INTER
+ *args->compmode_interintra_cost = 0;
+ mbmi->use_wedge_interintra = 0;
+ *args->compmode_interinter_cost = 0;
+ mbmi->interinter_compound_type = COMPOUND_AVERAGE;
+
+ // is_comp_interintra_pred implies !is_comp_pred
+ assert(!is_comp_interintra_pred || (!is_comp_pred));
+ // is_comp_interintra_pred implies is_interintra_allowed(mbmi->sb_type)
+ assert(!is_comp_interintra_pred || is_interintra_allowed(mbmi));
+#endif // CONFIG_EXT_INTER
+
+#if CONFIG_REF_MV
+#if CONFIG_EXT_INTER
+ if (is_comp_pred)
+ mode_ctx = mbmi_ext->compound_mode_context[refs[0]];
+ else
+#endif // CONFIG_EXT_INTER
+ mode_ctx = av1_mode_context_analyzer(mbmi_ext->mode_context,
+ mbmi->ref_frame, bsize, -1);
+#else // CONFIG_REF_MV
+ mode_ctx = mbmi_ext->mode_context[refs[0]];
+#endif // CONFIG_REF_MV
+
+#if CONFIG_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+ tmp_buf = CONVERT_TO_BYTEPTR(tmp_buf_);
+ else
+#endif // CONFIG_HIGHBITDEPTH
+ tmp_buf = tmp_buf_;
+ // Make sure that we didn't leave the plane destination buffers set
+ // to tmp_buf at the end of the last iteration
+ assert(xd->plane[0].dst.buf != tmp_buf);
+
+#if CONFIG_WARPED_MOTION
+ mbmi->num_proj_ref[0] = 0;
+ mbmi->num_proj_ref[1] = 0;
+#endif // CONFIG_WARPED_MOTION
+
+ if (is_comp_pred) {
+ if (frame_mv[refs[0]].as_int == INVALID_MV ||
+ frame_mv[refs[1]].as_int == INVALID_MV)
+ return INT64_MAX;
+ }
+
+ mbmi->motion_mode = SIMPLE_TRANSLATION;
+ if (have_newmv_in_inter_mode(this_mode)) {
+ const int64_t ret_val = handle_newmv(cpi, x, bsize, mode_mv, mi_row, mi_col,
+ &rate_mv, single_newmv, args);
+ if (ret_val != 0)
+ return ret_val;
+ else
+ rd_stats->rate += rate_mv;
+ }
+ for (i = 0; i < is_comp_pred + 1; ++i) {
+ cur_mv[i] = frame_mv[refs[i]];
+ // Clip "next_nearest" so that it does not extend to far out of image
+ if (this_mode != NEWMV) clamp_mv2(&cur_mv[i].as_mv, xd);
+ if (mv_check_bounds(&x->mv_limits, &cur_mv[i].as_mv)) return INT64_MAX;
+ mbmi->mv[i].as_int = cur_mv[i].as_int;
+ }
+
+#if CONFIG_REF_MV
+#if CONFIG_EXT_INTER
+ if (this_mode == NEAREST_NEARESTMV)
+#else
+ if (this_mode == NEARESTMV && is_comp_pred)
+#endif // CONFIG_EXT_INTER
+ {
+#if !CONFIG_EXT_INTER
+ uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
+#endif // !CONFIG_EXT_INTER
+ if (mbmi_ext->ref_mv_count[ref_frame_type] > 0) {
+ cur_mv[0] = mbmi_ext->ref_mv_stack[ref_frame_type][0].this_mv;
+ cur_mv[1] = mbmi_ext->ref_mv_stack[ref_frame_type][0].comp_mv;
+
+ for (i = 0; i < 2; ++i) {
+ clamp_mv2(&cur_mv[i].as_mv, xd);
+ if (mv_check_bounds(&x->mv_limits, &cur_mv[i].as_mv)) return INT64_MAX;
+ mbmi->mv[i].as_int = cur_mv[i].as_int;
+ }
+ }
+ }
+
+#if CONFIG_EXT_INTER
+ if (mbmi_ext->ref_mv_count[ref_frame_type] > 0) {
+ if (this_mode == NEAREST_NEWMV || this_mode == NEAREST_NEARMV) {
+ cur_mv[0] = mbmi_ext->ref_mv_stack[ref_frame_type][0].this_mv;
+
+ lower_mv_precision(&cur_mv[0].as_mv, cm->allow_high_precision_mv);
+ clamp_mv2(&cur_mv[0].as_mv, xd);
+ if (mv_check_bounds(&x->mv_limits, &cur_mv[0].as_mv)) return INT64_MAX;
+ mbmi->mv[0].as_int = cur_mv[0].as_int;
+ }
+
+ if (this_mode == NEW_NEARESTMV || this_mode == NEAR_NEARESTMV) {
+ cur_mv[1] = mbmi_ext->ref_mv_stack[ref_frame_type][0].comp_mv;
+
+ lower_mv_precision(&cur_mv[1].as_mv, cm->allow_high_precision_mv);
+ clamp_mv2(&cur_mv[1].as_mv, xd);
+ if (mv_check_bounds(&x->mv_limits, &cur_mv[1].as_mv)) return INT64_MAX;
+ mbmi->mv[1].as_int = cur_mv[1].as_int;
+ }
+ }
+
+ if (mbmi_ext->ref_mv_count[ref_frame_type] > 1) {
+ int ref_mv_idx = mbmi->ref_mv_idx + 1;
+ if (this_mode == NEAR_NEWMV || this_mode == NEAR_NEARESTMV ||
+ this_mode == NEAR_NEARMV) {
+ cur_mv[0] = mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].this_mv;
+
+ lower_mv_precision(&cur_mv[0].as_mv, cm->allow_high_precision_mv);
+ clamp_mv2(&cur_mv[0].as_mv, xd);
+ if (mv_check_bounds(&x->mv_limits, &cur_mv[0].as_mv)) return INT64_MAX;
+ mbmi->mv[0].as_int = cur_mv[0].as_int;
+ }
+
+ if (this_mode == NEW_NEARMV || this_mode == NEAREST_NEARMV ||
+ this_mode == NEAR_NEARMV) {
+ cur_mv[1] = mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].comp_mv;
+
+ lower_mv_precision(&cur_mv[1].as_mv, cm->allow_high_precision_mv);
+ clamp_mv2(&cur_mv[1].as_mv, xd);
+ if (mv_check_bounds(&x->mv_limits, &cur_mv[1].as_mv)) return INT64_MAX;
+ mbmi->mv[1].as_int = cur_mv[1].as_int;
+ }
+ }
+#else
+ if (this_mode == NEARMV && is_comp_pred) {
+ uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
+ if (mbmi_ext->ref_mv_count[ref_frame_type] > 1) {
+ int ref_mv_idx = mbmi->ref_mv_idx + 1;
+ cur_mv[0] = mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].this_mv;
+ cur_mv[1] = mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].comp_mv;
+
+ for (i = 0; i < 2; ++i) {
+ clamp_mv2(&cur_mv[i].as_mv, xd);
+ if (mv_check_bounds(&x->mv_limits, &cur_mv[i].as_mv)) return INT64_MAX;
+ mbmi->mv[i].as_int = cur_mv[i].as_int;
+ }
+ }
+ }
+#endif // CONFIG_EXT_INTER
+#endif // CONFIG_REF_MV
+
+ // do first prediction into the destination buffer. Do the next
+ // prediction into a temporary buffer. Then keep track of which one
+ // of these currently holds the best predictor, and use the other
+ // one for future predictions. In the end, copy from tmp_buf to
+ // dst if necessary.
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ tmp_dst.plane[i] = tmp_buf + i * MAX_SB_SQUARE;
+ tmp_dst.stride[i] = MAX_SB_SIZE;
+ }
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ orig_dst.plane[i] = xd->plane[i].dst.buf;
+ orig_dst.stride[i] = xd->plane[i].dst.stride;
+ }
+
+ // We don't include the cost of the second reference here, because there
+ // are only three options: Last/Golden, ARF/Last or Golden/ARF, or in other
+ // words if you present them in that order, the second one is always known
+ // if the first is known.
+ //
+ // Under some circumstances we discount the cost of new mv mode to encourage
+ // initiation of a motion field.
+ if (discount_newmv_test(cpi, this_mode, frame_mv[refs[0]], mode_mv,
+ refs[0])) {
+#if CONFIG_EXT_INTER
+ rd_stats->rate +=
+ AOMMIN(cost_mv_ref(cpi, this_mode, mode_ctx),
+ cost_mv_ref(cpi, is_comp_pred ? NEAREST_NEARESTMV : NEARESTMV,
+ mode_ctx));
+#else
+ rd_stats->rate += AOMMIN(cost_mv_ref(cpi, this_mode, mode_ctx),
+ cost_mv_ref(cpi, NEARESTMV, mode_ctx));
+#endif // CONFIG_REF_MV && CONFIG_EXT_INTER
+ } else {
+ rd_stats->rate += cost_mv_ref(cpi, this_mode, mode_ctx);
+ }
+
+ if (RDCOST(x->rdmult, x->rddiv, rd_stats->rate, 0) > ref_best_rd &&
+#if CONFIG_EXT_INTER
+ mbmi->mode != NEARESTMV && mbmi->mode != NEAREST_NEARESTMV
+#else
+ mbmi->mode != NEARESTMV
+#endif // CONFIG_EXT_INTER
+ )
+ return INT64_MAX;
+
+ int64_t ret_val = interpolation_filter_search(
+ x, cpi, bsize, mi_row, mi_col, &tmp_dst, &orig_dst, args->single_filter,
+ &rd, &rs, &skip_txfm_sb, &skip_sse_sb);
+ if (ret_val != 0) return ret_val;
+
+#if CONFIG_EXT_INTER
+#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+ best_bmc_mbmi = *mbmi;
+ rate2_bmc_nocoeff = rd_stats->rate;
+ if (cm->interp_filter == SWITCHABLE) rate2_bmc_nocoeff += rs;
+#if CONFIG_MOTION_VAR
+ rate_mv_bmc = rate_mv;
+#endif // CONFIG_MOTION_VAR
+#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+
+ if (is_comp_pred) {
+ int rate_sum, rs2;
+ int64_t dist_sum;
+ int64_t best_rd_compound = INT64_MAX, best_rd_cur = INT64_MAX;
+ INTERINTER_COMPOUND_DATA best_compound_data;
+ int_mv best_mv[2];
+ int best_tmp_rate_mv = rate_mv;
+ int tmp_skip_txfm_sb;
+ int64_t tmp_skip_sse_sb;
+ int compound_type_cost[COMPOUND_TYPES];
+ uint8_t pred0[2 * MAX_SB_SQUARE];
+ uint8_t pred1[2 * MAX_SB_SQUARE];
+ uint8_t *preds0[1] = { pred0 };
+ uint8_t *preds1[1] = { pred1 };
+ int strides[1] = { bw };
+ int tmp_rate_mv;
+ int masked_compound_used = is_any_masked_compound_used(bsize);
+ COMPOUND_TYPE cur_type;
+
+ best_mv[0].as_int = cur_mv[0].as_int;
+ best_mv[1].as_int = cur_mv[1].as_int;
+ memset(&best_compound_data, 0, sizeof(best_compound_data));
+#if CONFIG_COMPOUND_SEGMENT
+ uint8_t tmp_mask_buf[2 * MAX_SB_SQUARE];
+ best_compound_data.seg_mask = tmp_mask_buf;
+#endif // CONFIG_COMPOUND_SEGMENT
+ av1_cost_tokens(compound_type_cost, cm->fc->compound_type_prob[bsize],
+ av1_compound_type_tree);
+
+ if (masked_compound_used) {
+ av1_cost_tokens(compound_type_cost, cm->fc->compound_type_prob[bsize],
+ av1_compound_type_tree);
+ // get inter predictors to use for masked compound modes
+ av1_build_inter_predictors_for_planes_single_buf(
+ xd, bsize, 0, 0, mi_row, mi_col, 0, preds0, strides);
+ av1_build_inter_predictors_for_planes_single_buf(
+ xd, bsize, 0, 0, mi_row, mi_col, 1, preds1, strides);
+ }
+
+ for (cur_type = COMPOUND_AVERAGE; cur_type < COMPOUND_TYPES; cur_type++) {
+ if (!is_interinter_compound_used(cur_type, bsize)) break;
+ tmp_rate_mv = rate_mv;
+ best_rd_cur = INT64_MAX;
+ mbmi->interinter_compound_type = cur_type;
+ rs2 = av1_cost_literal(get_interinter_compound_type_bits(
+ bsize, mbmi->interinter_compound_type)) +
+ (masked_compound_used
+ ? compound_type_cost[mbmi->interinter_compound_type]
+ : 0);
+
+ switch (cur_type) {
+ case COMPOUND_AVERAGE:
+ av1_build_inter_predictors_sby(xd, mi_row, mi_col, &orig_dst, bsize);
+ av1_subtract_plane(x, bsize, 0);
+ rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum,
+ &tmp_skip_txfm_sb, &tmp_skip_sse_sb,
+ INT64_MAX);
+ if (rd != INT64_MAX)
+ best_rd_cur =
+ RDCOST(x->rdmult, x->rddiv, rs2 + rate_mv + rate_sum, dist_sum);
+ best_rd_compound = best_rd_cur;
+ break;
+#if CONFIG_WEDGE
+ case COMPOUND_WEDGE:
+ if (x->source_variance > cpi->sf.disable_wedge_search_var_thresh &&
+ best_rd_compound / 3 < ref_best_rd) {
+ best_rd_cur = build_and_cost_compound_type(
+ cpi, x, cur_mv, bsize, this_mode, rs2, rate_mv, &orig_dst,
+ &tmp_rate_mv, preds0, preds1, strides, mi_row, mi_col);
+ }
+ break;
+#endif // CONFIG_WEDGE
+#if CONFIG_COMPOUND_SEGMENT
+ case COMPOUND_SEG:
+ if (x->source_variance > cpi->sf.disable_wedge_search_var_thresh &&
+ best_rd_compound / 3 < ref_best_rd) {
+ best_rd_cur = build_and_cost_compound_type(
+ cpi, x, cur_mv, bsize, this_mode, rs2, rate_mv, &orig_dst,
+ &tmp_rate_mv, preds0, preds1, strides, mi_row, mi_col);
+ }
+ break;
+#endif // CONFIG_COMPOUND_SEGMENT
+ default: assert(0); return 0;
+ }
+
+ if (best_rd_cur < best_rd_compound) {
+ best_rd_compound = best_rd_cur;
+#if CONFIG_WEDGE
+ best_compound_data.wedge_index = mbmi->wedge_index;
+ best_compound_data.wedge_sign = mbmi->wedge_sign;
+#endif // CONFIG_WEDGE
+#if CONFIG_COMPOUND_SEGMENT
+ best_compound_data.mask_type = mbmi->mask_type;
+ memcpy(best_compound_data.seg_mask, xd->seg_mask,
+ 2 * MAX_SB_SQUARE * sizeof(uint8_t));
+#endif // CONFIG_COMPOUND_SEGMENT
+ best_compound_data.interinter_compound_type =
+ mbmi->interinter_compound_type;
+ if (have_newmv_in_inter_mode(this_mode)) {
+ if (use_masked_motion_search(cur_type)) {
+ best_tmp_rate_mv = tmp_rate_mv;
+ best_mv[0].as_int = mbmi->mv[0].as_int;
+ best_mv[1].as_int = mbmi->mv[1].as_int;
+ } else {
+ best_mv[0].as_int = cur_mv[0].as_int;
+ best_mv[1].as_int = cur_mv[1].as_int;
+ }
+ }
+ }
+ // reset to original mvs for next iteration
+ mbmi->mv[0].as_int = cur_mv[0].as_int;
+ mbmi->mv[1].as_int = cur_mv[1].as_int;
+ }
+#if CONFIG_WEDGE
+ mbmi->wedge_index = best_compound_data.wedge_index;
+ mbmi->wedge_sign = best_compound_data.wedge_sign;
+#endif // CONFIG_WEDGE
+#if CONFIG_COMPOUND_SEGMENT
+ mbmi->mask_type = best_compound_data.mask_type;
+ memcpy(xd->seg_mask, best_compound_data.seg_mask,
+ 2 * MAX_SB_SQUARE * sizeof(uint8_t));
+#endif // CONFIG_COMPOUND_SEGMENT
+ mbmi->interinter_compound_type =
+ best_compound_data.interinter_compound_type;
+ if (have_newmv_in_inter_mode(this_mode)) {
+ mbmi->mv[0].as_int = best_mv[0].as_int;
+ mbmi->mv[1].as_int = best_mv[1].as_int;
+ xd->mi[0]->bmi[0].as_mv[0].as_int = mbmi->mv[0].as_int;
+ xd->mi[0]->bmi[0].as_mv[1].as_int = mbmi->mv[1].as_int;
+ if (use_masked_motion_search(mbmi->interinter_compound_type)) {
+ rd_stats->rate += best_tmp_rate_mv - rate_mv;
+ rate_mv = best_tmp_rate_mv;
+ }
+ }
+
+ if (ref_best_rd < INT64_MAX && best_rd_compound / 3 > ref_best_rd) {
+ restore_dst_buf(xd, orig_dst);
+ return INT64_MAX;
+ }
+
+ pred_exists = 0;
+
+ *args->compmode_interinter_cost =
+ av1_cost_literal(get_interinter_compound_type_bits(
+ bsize, mbmi->interinter_compound_type)) +
+ (masked_compound_used
+ ? compound_type_cost[mbmi->interinter_compound_type]
+ : 0);
+ }
+
+#if CONFIG_INTERINTRA
+ if (is_comp_interintra_pred) {
+ INTERINTRA_MODE best_interintra_mode = II_DC_PRED;
+ int64_t best_interintra_rd = INT64_MAX;
+ int rmode, rate_sum;
+ int64_t dist_sum;
+ int j;
+ int tmp_rate_mv = 0;
+ int tmp_skip_txfm_sb;
+ int64_t tmp_skip_sse_sb;
+ DECLARE_ALIGNED(16, uint8_t, intrapred_[2 * MAX_SB_SQUARE]);
+ uint8_t *intrapred;
+
+#if CONFIG_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+ intrapred = CONVERT_TO_BYTEPTR(intrapred_);
+ else
+#endif // CONFIG_HIGHBITDEPTH
+ intrapred = intrapred_;
+
+ mbmi->ref_frame[1] = NONE_FRAME;
+ for (j = 0; j < MAX_MB_PLANE; j++) {
+ xd->plane[j].dst.buf = tmp_buf + j * MAX_SB_SQUARE;
+ xd->plane[j].dst.stride = bw;
+ }
+ av1_build_inter_predictors_sby(xd, mi_row, mi_col, &orig_dst, bsize);
+ restore_dst_buf(xd, orig_dst);
+ mbmi->ref_frame[1] = INTRA_FRAME;
+ mbmi->use_wedge_interintra = 0;
+
+ for (j = 0; j < INTERINTRA_MODES; ++j) {
+ mbmi->interintra_mode = (INTERINTRA_MODE)j;
+ rmode = interintra_mode_cost[mbmi->interintra_mode];
+ av1_build_intra_predictors_for_interintra(xd, bsize, 0, &orig_dst,
+ intrapred, bw);
+ av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw);
+ model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &rate_sum, &dist_sum,
+ &tmp_skip_txfm_sb, &tmp_skip_sse_sb);
+ rd = RDCOST(x->rdmult, x->rddiv, rs + tmp_rate_mv + rate_sum, dist_sum);
+ if (rd < best_interintra_rd) {
+ best_interintra_rd = rd;
+ best_interintra_mode = mbmi->interintra_mode;
+ }
+ }
+ mbmi->interintra_mode = best_interintra_mode;
+ rmode = interintra_mode_cost[mbmi->interintra_mode];
+ av1_build_intra_predictors_for_interintra(xd, bsize, 0, &orig_dst,
+ intrapred, bw);
+ av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw);
+ av1_subtract_plane(x, bsize, 0);
+ rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum,
+ &tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX);
+ if (rd != INT64_MAX)
+ rd = RDCOST(x->rdmult, x->rddiv, rate_mv + rmode + rate_sum, dist_sum);
+ best_interintra_rd = rd;
+
+ if (ref_best_rd < INT64_MAX && best_interintra_rd > 2 * ref_best_rd) {
+ // Don't need to call restore_dst_buf here
+ return INT64_MAX;
+ }
+#if CONFIG_WEDGE
+ if (is_interintra_wedge_used(bsize)) {
+ int64_t best_interintra_rd_nowedge = INT64_MAX;
+ int64_t best_interintra_rd_wedge = INT64_MAX;
+ int_mv tmp_mv;
+ int rwedge = av1_cost_bit(cm->fc->wedge_interintra_prob[bsize], 0);
+ if (rd != INT64_MAX)
+ rd = RDCOST(x->rdmult, x->rddiv, rmode + rate_mv + rwedge + rate_sum,
+ dist_sum);
+ best_interintra_rd_nowedge = rd;
+
+ // Disable wedge search if source variance is small
+ if (x->source_variance > cpi->sf.disable_wedge_search_var_thresh) {
+ mbmi->use_wedge_interintra = 1;
+
+ rwedge = av1_cost_literal(get_interintra_wedge_bits(bsize)) +
+ av1_cost_bit(cm->fc->wedge_interintra_prob[bsize], 1);
+
+ best_interintra_rd_wedge =
+ pick_interintra_wedge(cpi, x, bsize, intrapred_, tmp_buf_);
+
+ best_interintra_rd_wedge +=
+ RDCOST(x->rdmult, x->rddiv, rmode + rate_mv + rwedge, 0);
+ // Refine motion vector.
+ if (have_newmv_in_inter_mode(this_mode)) {
+ // get negative of mask
+ const uint8_t *mask = av1_get_contiguous_soft_mask(
+ mbmi->interintra_wedge_index, 1, bsize);
+ do_masked_motion_search(cpi, x, mask, bw, bsize, mi_row, mi_col,
+ &tmp_mv, &tmp_rate_mv, 0);
+ mbmi->mv[0].as_int = tmp_mv.as_int;
+ av1_build_inter_predictors_sby(xd, mi_row, mi_col, &orig_dst, bsize);
+ model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &rate_sum, &dist_sum,
+ &tmp_skip_txfm_sb, &tmp_skip_sse_sb);
+ rd = RDCOST(x->rdmult, x->rddiv,
+ rmode + tmp_rate_mv + rwedge + rate_sum, dist_sum);
+ if (rd < best_interintra_rd_wedge) {
+ best_interintra_rd_wedge = rd;
+ } else {
+ tmp_mv.as_int = cur_mv[0].as_int;
+ tmp_rate_mv = rate_mv;
+ }
+ } else {
+ tmp_mv.as_int = cur_mv[0].as_int;
+ tmp_rate_mv = rate_mv;
+ av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw);
+ }
+ // Evaluate closer to true rd
+ av1_subtract_plane(x, bsize, 0);
+ rd =
+ estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum,
+ &tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX);
+ if (rd != INT64_MAX)
+ rd = RDCOST(x->rdmult, x->rddiv,
+ rmode + tmp_rate_mv + rwedge + rate_sum, dist_sum);
+ best_interintra_rd_wedge = rd;
+ if (best_interintra_rd_wedge < best_interintra_rd_nowedge) {
+ mbmi->use_wedge_interintra = 1;
+ best_interintra_rd = best_interintra_rd_wedge;
+ mbmi->mv[0].as_int = tmp_mv.as_int;
+ rd_stats->rate += tmp_rate_mv - rate_mv;
+ rate_mv = tmp_rate_mv;
+ } else {
+ mbmi->use_wedge_interintra = 0;
+ best_interintra_rd = best_interintra_rd_nowedge;
+ mbmi->mv[0].as_int = cur_mv[0].as_int;
+ }
+ } else {
+ mbmi->use_wedge_interintra = 0;
+ best_interintra_rd = best_interintra_rd_nowedge;
+ }
+ }
+#endif // CONFIG_WEDGE
+
+ pred_exists = 0;
+ *args->compmode_interintra_cost =
+ av1_cost_bit(cm->fc->interintra_prob[size_group_lookup[bsize]], 1);
+ *args->compmode_interintra_cost +=
+ interintra_mode_cost[mbmi->interintra_mode];
+ if (is_interintra_wedge_used(bsize)) {
+ *args->compmode_interintra_cost += av1_cost_bit(
+ cm->fc->wedge_interintra_prob[bsize], mbmi->use_wedge_interintra);
+ if (mbmi->use_wedge_interintra) {
+ *args->compmode_interintra_cost +=
+ av1_cost_literal(get_interintra_wedge_bits(bsize));
+ }
+ }
+ } else if (is_interintra_allowed(mbmi)) {
+ *args->compmode_interintra_cost =
+ av1_cost_bit(cm->fc->interintra_prob[size_group_lookup[bsize]], 0);
+ }
+#endif // CONFIG_INTERINTRA
+
+ if (pred_exists == 0) {
+ int tmp_rate;
+ int64_t tmp_dist;
+ av1_build_inter_predictors_sb(xd, mi_row, mi_col, &orig_dst, bsize);
+ model_rd_for_sb(cpi, bsize, x, xd, 0, MAX_MB_PLANE - 1, &tmp_rate,
+ &tmp_dist, &skip_txfm_sb, &skip_sse_sb);
+ rd = RDCOST(x->rdmult, x->rddiv, rs + tmp_rate, tmp_dist);
+ }
+#endif // CONFIG_EXT_INTER
+
+ if (!is_comp_pred)
+#if CONFIG_DUAL_FILTER
+ args->single_filter[this_mode][refs[0]] = mbmi->interp_filter[0];
+#else
+ args->single_filter[this_mode][refs[0]] = mbmi->interp_filter;
+#endif // CONFIG_DUAL_FILTER
+
+#if CONFIG_EXT_INTER
+ if (args->modelled_rd != NULL) {
+ if (is_comp_pred) {
+ const int mode0 = compound_ref0_mode(this_mode);
+ const int mode1 = compound_ref1_mode(this_mode);
+ const int64_t mrd = AOMMIN(args->modelled_rd[mode0][refs[0]],
+ args->modelled_rd[mode1][refs[1]]);
+ if (rd / 4 * 3 > mrd && ref_best_rd < INT64_MAX) {
+ restore_dst_buf(xd, orig_dst);
+ return INT64_MAX;
+ }
+ } else if (!is_comp_interintra_pred) {
+ args->modelled_rd[this_mode][refs[0]] = rd;
+ }
+ }
+#endif // CONFIG_EXT_INTER
+
+ if (cpi->sf.use_rd_breakout && ref_best_rd < INT64_MAX) {
+ // if current pred_error modeled rd is substantially more than the best
+ // so far, do not bother doing full rd
+ if (rd / 2 > ref_best_rd) {
+ restore_dst_buf(xd, orig_dst);
+ return INT64_MAX;
+ }
+ }
+
+ ret_val = motion_mode_rd(cpi, x, bsize, rd_stats, rd_stats_y, rd_stats_uv,
+ disable_skip, mode_mv, mi_row, mi_col, args,
+ ref_best_rd, refs, rate_mv,
+#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+#if CONFIG_EXT_INTER
+ rate2_bmc_nocoeff, &best_bmc_mbmi,
+#if CONFIG_MOTION_VAR
+ rate_mv_bmc,
+#endif // CONFIG_MOTION_VAR
+#endif // CONFIG_EXT_INTER
+#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+ rs, &skip_txfm_sb, &skip_sse_sb, &orig_dst);
+ if (ret_val != 0) return ret_val;
+
+ return 0; // The rate-distortion cost will be re-calculated by caller.
+}
+
+#if CONFIG_INTRABC
+ // Tries to code the block with intra block copy (a displacement vector that
+ // points into the current frame) and keeps the result only if it beats
+ // best_rd.
+ //
+ // Returns INT64_MAX when intrabc is not attempted or no valid displacement is
+ // found; otherwise returns the best RD cost seen, which is the incoming
+ // best_rd unchanged when neither intrabc candidate improves on it. On
+ // improvement, *rd_cost, the block's MB_MODE_INFO and x->skip describe the
+ // winning intrabc candidate; otherwise they are left at their entry values.
+ static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x,
+ RD_STATS *rd_cost, BLOCK_SIZE bsize,
+ int64_t best_rd) {
+ const AV1_COMMON *const cm = &cpi->common;
+ if (bsize < BLOCK_8X8 || !cm->allow_screen_content_tools) return INT64_MAX;
+
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const TileInfo *tile = &xd->tile;
+ MODE_INFO *const mi = xd->mi[0];
+ const int mi_row = -xd->mb_to_top_edge / (8 * MI_SIZE);
+ const int mi_col = -xd->mb_to_left_edge / (8 * MI_SIZE);
+ const int w = block_size_wide[bsize];
+ const int h = block_size_high[bsize];
+ const int sb_row = mi_row / MAX_MIB_SIZE;
+
+ int_mv dv_ref;
+ av1_find_ref_dv(&dv_ref, mi_row, mi_col);
+
+ // Save the caller's MV limits; they are restored before every return below.
+ const MvLimits tmp_mv_limits = x->mv_limits;
+
+ // TODO(aconverse@google.com): Handle same row DV.
+ x->mv_limits.col_min = (tile->mi_col_start - mi_col) * MI_SIZE;
+ x->mv_limits.col_max = (tile->mi_col_end - mi_col) * MI_SIZE - w;
+ x->mv_limits.row_min = (tile->mi_row_start - mi_row) * MI_SIZE;
+ x->mv_limits.row_max = (sb_row * MAX_MIB_SIZE - mi_row) * MI_SIZE - h;
+ assert(x->mv_limits.col_min >= tmp_mv_limits.col_min);
+ assert(x->mv_limits.col_max <= tmp_mv_limits.col_max);
+ assert(x->mv_limits.row_min >= tmp_mv_limits.row_min);
+ assert(x->mv_limits.row_max <= tmp_mv_limits.row_max);
+ av1_set_mv_search_range(&x->mv_limits, &dv_ref.as_mv);
+
+ if (x->mv_limits.col_max < x->mv_limits.col_min ||
+ x->mv_limits.row_max < x->mv_limits.row_min) {
+ x->mv_limits = tmp_mv_limits;
+ return INT64_MAX;
+ }
+
+ // Point the prediction source of every plane at the current frame
+ // (xd->cur_buf), so the motion search below looks inside this frame.
+ struct buf_2d yv12_mb[MAX_MB_PLANE];
+ av1_setup_pred_block(xd, yv12_mb, xd->cur_buf, mi_row, mi_col, NULL, NULL);
+ for (int i = 0; i < MAX_MB_PLANE; ++i) {
+ xd->plane[i].pre[0] = yv12_mb[i];
+ }
+
+ int step_param = cpi->mv_step_param;
+ MV mvp_full = dv_ref.as_mv;
+ mvp_full.col >>= 3;
+ mvp_full.row >>= 3;
+ int sadpb = x->sadperbit16;
+ int cost_list[5];
+ int bestsme = av1_full_pixel_search(cpi, x, bsize, &mvp_full, step_param,
+ sadpb, cond_cost_list(cpi, cost_list),
+ &dv_ref.as_mv, INT_MAX, 1);
+
+ x->mv_limits = tmp_mv_limits;
+ if (bestsme == INT_MAX) return INT64_MAX;
+ mvp_full = x->best_mv.as_mv;
+ // Scale the search result by 8, mirroring the >> 3 applied to dv_ref above.
+ MV dv = {.row = mvp_full.row * 8, .col = mvp_full.col * 8 };
+ if (mv_check_bounds(&x->mv_limits, &dv)) return INT64_MAX;
+ if (!is_dv_valid(dv, tile, mi_row, mi_col, bsize)) return INT64_MAX;
+ // Snapshot the entry state; it is what gets written back at the end if
+ // neither intrabc candidate beats best_rd.
+ MB_MODE_INFO *mbmi = &mi->mbmi;
+ MB_MODE_INFO best_mbmi = *mbmi;
+ RD_STATS best_rdcost = *rd_cost;
+ int best_skip = x->skip;
+#if CONFIG_PALETTE
+ memset(&mbmi->palette_mode_info, 0, sizeof(mbmi->palette_mode_info));
+#endif
+ mbmi->use_intrabc = 1;
+ mbmi->mode = DC_PRED;
+ mbmi->uv_mode = DC_PRED;
+ mbmi->mv[0].as_mv = dv;
+#if CONFIG_DUAL_FILTER
+ for (int idx = 0; idx < 4; ++idx) mbmi->interp_filter[idx] = BILINEAR;
+#else
+ mbmi->interp_filter = BILINEAR;
+#endif
+ mbmi->skip = 0;
+ x->skip = 0;
+ av1_build_inter_predictors_sb(xd, mi_row, mi_col, NULL, bsize);
+
+ int rate_mv = av1_mv_bit_cost(&dv, &dv_ref.as_mv, x->nmvjointcost, x->mvcost,
+ MV_COST_WEIGHT);
+ const PREDICTION_MODE A = av1_above_block_mode(mi, xd->above_mi, 0);
+ const PREDICTION_MODE L = av1_left_block_mode(mi, xd->left_mi, 0);
+ const int rate_mode =
+ cpi->y_mode_costs[A][L][DC_PRED] + av1_cost_bit(INTRABC_PROB, 1);
+
+ RD_STATS rd_stats, rd_stats_uv;
+ av1_subtract_plane(x, bsize, 0);
+ super_block_yrd(cpi, x, &rd_stats, bsize, INT64_MAX);
+ super_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX);
+ av1_merge_rd_stats(&rd_stats, &rd_stats_uv);
+#if CONFIG_RD_DEBUG
+ mbmi->rd_stats = rd_stats;
+#endif
+
+ const aom_prob skip_prob = av1_get_skip_prob(cm, xd);
+
+ // Candidate 1: code the residual (skip flag = 0).
+ RD_STATS rdc_noskip;
+ av1_init_rd_stats(&rdc_noskip);
+ rdc_noskip.rate =
+ rate_mode + rate_mv + rd_stats.rate + av1_cost_bit(skip_prob, 0);
+ rdc_noskip.dist = rd_stats.dist;
+ rdc_noskip.rdcost =
+ RDCOST(x->rdmult, x->rddiv, rdc_noskip.rate, rdc_noskip.dist);
+ if (rdc_noskip.rdcost < best_rd) {
+ best_rd = rdc_noskip.rdcost;
+ best_mbmi = *mbmi;
+ best_skip = x->skip;
+ best_rdcost = rdc_noskip;
+ }
+
+ // Candidate 2: skip the residual (skip flag = 1); the distortion is then
+ // taken from rd_stats.sse and no coefficient rate is charged.
+ x->skip = 1;
+ mbmi->skip = 1;
+ RD_STATS rdc_skip;
+ av1_init_rd_stats(&rdc_skip);
+ rdc_skip.rate = rate_mode + rate_mv + av1_cost_bit(skip_prob, 1);
+ rdc_skip.dist = rd_stats.sse;
+ rdc_skip.rdcost = RDCOST(x->rdmult, x->rddiv, rdc_skip.rate, rdc_skip.dist);
+ if (rdc_skip.rdcost < best_rd) {
+ best_rd = rdc_skip.rdcost;
+ best_mbmi = *mbmi;
+ best_skip = x->skip;
+ best_rdcost = rdc_skip;
+ }
+ // Commit the winner (or the entry snapshot if neither candidate won).
+ *mbmi = best_mbmi;
+ *rd_cost = best_rdcost;
+ x->skip = best_skip;
+ return best_rd;
+}
+#endif // CONFIG_INTRABC
+
+ // Picks the intra coding mode for a block: luma first, then (only if the luma
+ // RD cost is below best_rd) chroma, and fills *rd_cost with the combined
+ // rate, distortion and RD cost.
+ //
+ // rd_cost->rate == INT_MAX signals that no intra mode with luma cost below
+ // best_rd was found; in that case the function returns without updating ctx
+ // (unless, under CONFIG_INTRABC, the intrabc search replaces *rd_cost).
+ // On success the chosen mode info and x->mbmi_ext are copied into ctx.
+ void av1_rd_pick_intra_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x,
+ RD_STATS *rd_cost, BLOCK_SIZE bsize,
+ PICK_MODE_CONTEXT *ctx, int64_t best_rd) {
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ struct macroblockd_plane *const pd = xd->plane;
+ int rate_y = 0, rate_uv = 0, rate_y_tokenonly = 0, rate_uv_tokenonly = 0;
+ int y_skip = 0, uv_skip = 0;
+ int64_t dist_y = 0, dist_uv = 0;
+ TX_SIZE max_uv_tx_size;
+ const int unify_bsize = CONFIG_CB4X4;
+
+ ctx->skip = 0;
+ xd->mi[0]->mbmi.ref_frame[0] = INTRA_FRAME;
+ xd->mi[0]->mbmi.ref_frame[1] = NONE_FRAME;
+#if CONFIG_INTRABC
+ xd->mi[0]->mbmi.use_intrabc = 0;
+#endif // CONFIG_INTRABC
+
+ // Blocks below 8x8 use the dedicated sub-8x8 luma search unless
+ // CONFIG_CB4X4 is enabled (unify_bsize).
+ const int64_t intra_yrd =
+ (bsize >= BLOCK_8X8 || unify_bsize)
+ ? rd_pick_intra_sby_mode(cpi, x, &rate_y, &rate_y_tokenonly, &dist_y,
+ &y_skip, bsize, best_rd)
+ : rd_pick_intra_sub_8x8_y_mode(cpi, x, &rate_y, &rate_y_tokenonly,
+ &dist_y, &y_skip, best_rd);
+
+ if (intra_yrd < best_rd) {
+ max_uv_tx_size = uv_txsize_lookup[bsize][xd->mi[0]->mbmi.tx_size]
+ [pd[1].subsampling_x][pd[1].subsampling_y];
+
+#if CONFIG_CB4X4
+#if !CONFIG_CHROMA_2X2
+ max_uv_tx_size = AOMMAX(max_uv_tx_size, TX_4X4);
+#endif // !CONFIG_CHROMA_2X2
+ if (!x->skip_chroma_rd)
+ rd_pick_intra_sbuv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly, &dist_uv,
+ &uv_skip, bsize, max_uv_tx_size);
+#else
+ rd_pick_intra_sbuv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly, &dist_uv,
+ &uv_skip, AOMMAX(BLOCK_8X8, bsize), max_uv_tx_size);
+#endif // CONFIG_CB4X4
+
+ // If both luma and chroma are skippable, remove the token-only rates and
+ // charge the skip flag as 1; otherwise charge the skip flag as 0.
+ if (y_skip && uv_skip) {
+ rd_cost->rate = rate_y + rate_uv - rate_y_tokenonly - rate_uv_tokenonly +
+ av1_cost_bit(av1_get_skip_prob(cm, xd), 1);
+ rd_cost->dist = dist_y + dist_uv;
+ } else {
+ rd_cost->rate =
+ rate_y + rate_uv + av1_cost_bit(av1_get_skip_prob(cm, xd), 0);
+ rd_cost->dist = dist_y + dist_uv;
+ }
+ rd_cost->rdcost = RDCOST(x->rdmult, x->rddiv, rd_cost->rate, rd_cost->dist);
+ } else {
+ rd_cost->rate = INT_MAX;
+ }
+
+#if CONFIG_INTRABC
+ // Tighten best_rd to the intra result (if any) before trying intrabc, so
+ // intrabc is accepted only when it is strictly better.
+ if (rd_cost->rate != INT_MAX && rd_cost->rdcost < best_rd)
+ best_rd = rd_cost->rdcost;
+ if (rd_pick_intrabc_mode_sb(cpi, x, rd_cost, bsize, best_rd) < best_rd) {
+ ctx->skip = x->skip; // FIXME where is the proper place to set this?!
+ assert(rd_cost->rate != INT_MAX);
+ rd_cost->rdcost = RDCOST(x->rdmult, x->rddiv, rd_cost->rate, rd_cost->dist);
+ }
+#endif
+ if (rd_cost->rate == INT_MAX) return;
+
+ ctx->mic = *xd->mi[0];
+ ctx->mbmi_ext = *x->mbmi_ext;
+}
+
+ // Reports whether the current frame has an internal image edge (e.g.
+ // formatting bars): returns 1 only in pass 2 when the frame stats record a
+ // positive inactive zone in rows or in columns, 0 otherwise.
+ int av1_internal_image_edge(const AV1_COMP *cpi) {
+   // Inactive-zone information is only consulted in the second pass.
+   if (cpi->oxcf.pass != 2) return 0;
+
+   if (cpi->twopass.this_frame_stats.inactive_zone_rows > 0) return 1;
+   return cpi->twopass.this_frame_stats.inactive_zone_cols > 0;
+ }
+
+ // Returns 1 when the mi-row span [mi_row, mi_row + mi_step) contains the top
+ // or bottom edge of the active image, 0 otherwise. The edges are the frame
+ // boundaries (row 0 and cpi->common.mi_rows) unless, in pass 2, formatting
+ // bars were detected, in which case both edges move inwards.
+ int av1_active_h_edge(const AV1_COMP *cpi, int mi_row, int mi_step) {
+   const int span_end = mi_row + mi_step;
+   int first_active_row = 0;
+   int last_active_row = cpi->common.mi_rows;
+
+   if (cpi->oxcf.pass == 2) {
+     // The inactive zone is counted in MBs, not mi units, hence the factor
+     // of 2. The image edge lies in the MB row that follows the zone.
+     const int inactive_mi_rows =
+         (int)(cpi->twopass.this_frame_stats.inactive_zone_rows * 2);
+
+     first_active_row = inactive_mi_rows;
+     // Never let the bottom edge cross above the top edge.
+     last_active_row =
+         AOMMAX(first_active_row, last_active_row - inactive_mi_rows);
+   }
+
+   if (first_active_row >= mi_row && first_active_row < span_end) return 1;
+   return last_active_row >= mi_row && last_active_row < span_end;
+ }
+
+ // Returns 1 when the mi-column span [mi_col, mi_col + mi_step) contains the
+ // left or right edge of the active image, 0 otherwise. The edges are the
+ // frame boundaries (column 0 and cpi->common.mi_cols) unless, in pass 2,
+ // formatting bars were detected, in which case both edges move inwards.
+ int av1_active_v_edge(const AV1_COMP *cpi, int mi_col, int mi_step) {
+   const int span_end = mi_col + mi_step;
+   int first_active_col = 0;
+   int last_active_col = cpi->common.mi_cols;
+
+   if (cpi->oxcf.pass == 2) {
+     // The inactive zone is counted in MBs, not mi units, hence the factor
+     // of 2. The image edge lies in the MB column that follows the zone.
+     const int inactive_mi_cols =
+         (int)(cpi->twopass.this_frame_stats.inactive_zone_cols * 2);
+
+     first_active_col = inactive_mi_cols;
+     // Never let the right edge cross to the left of the left edge.
+     last_active_col =
+         AOMMAX(first_active_col, last_active_col - inactive_mi_cols);
+   }
+
+   if (first_active_col >= mi_col && first_active_col < span_end) return 1;
+   return last_active_col >= mi_col && last_active_col < span_end;
+ }
+
+ // Returns 1 when the super block starting at (mi_row, mi_col) touches an
+ // edge of the active image in either direction, 0 otherwise. Both checks
+ // use cpi->common.mib_size as the span length.
+ int av1_active_edge_sb(const AV1_COMP *cpi, int mi_row, int mi_col) {
+   const int sb_span = cpi->common.mib_size;
+
+   // Horizontal edge first; the vertical check is skipped if it already hit.
+   if (av1_active_h_edge(cpi, mi_row, sb_span)) return 1;
+   return av1_active_v_edge(cpi, mi_col, sb_span) != 0;
+ }
+
+#if CONFIG_PALETTE
+ // Rebuilds the chroma palette color index map (xd->plane[1].color_index_map)
+ // for the current block from the source U/V samples and the chroma palette
+ // colors already stored in the block's palette_mode_info.
+ //
+ // Each source pixel becomes a 2-D (u, v) point; av1_calc_indices assigns it
+ // to one of the pmi->palette_size[1] palette entries, and the map is then
+ // extended from rows x cols to the full plane block dimensions.
+ static void restore_uv_color_map(const AV1_COMP *const cpi, MACROBLOCK *x) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+ PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+ const BLOCK_SIZE bsize = mbmi->sb_type;
+ // NOTE(review): the U-plane stride is used to index both U and V below;
+ // this presumably relies on both chroma planes sharing a stride - confirm.
+ int src_stride = x->plane[1].src.stride;
+ const uint8_t *const src_u = x->plane[1].src.buf;
+ const uint8_t *const src_v = x->plane[2].src.buf;
+ float *const data = x->palette_buffer->kmeans_data_buf;
+ float centroids[2 * PALETTE_MAX_SIZE];
+ uint8_t *const color_map = xd->plane[1].color_index_map;
+ int r, c;
+#if CONFIG_HIGHBITDEPTH
+ const uint16_t *const src_u16 = CONVERT_TO_SHORTPTR(src_u);
+ const uint16_t *const src_v16 = CONVERT_TO_SHORTPTR(src_v);
+#endif // CONFIG_HIGHBITDEPTH
+ int plane_block_width, plane_block_height, rows, cols;
+ av1_get_block_dimensions(bsize, 1, xd, &plane_block_width,
+ &plane_block_height, &rows, &cols);
+ // cpi is only read inside the CONFIG_HIGHBITDEPTH branch below; this keeps
+ // it from being reported as unused when that option is off.
+ (void)cpi;
+
+ // Gather interleaved (u, v) sample pairs, one pair per pixel.
+ for (r = 0; r < rows; ++r) {
+ for (c = 0; c < cols; ++c) {
+#if CONFIG_HIGHBITDEPTH
+ if (cpi->common.use_highbitdepth) {
+ data[(r * cols + c) * 2] = src_u16[r * src_stride + c];
+ data[(r * cols + c) * 2 + 1] = src_v16[r * src_stride + c];
+ } else {
+#endif // CONFIG_HIGHBITDEPTH
+ data[(r * cols + c) * 2] = src_u[r * src_stride + c];
+ data[(r * cols + c) * 2 + 1] = src_v[r * src_stride + c];
+#if CONFIG_HIGHBITDEPTH
+ }
+#endif // CONFIG_HIGHBITDEPTH
+ }
+ }
+
+ // Build interleaved (u, v) centroids from palette_colors: r == 1 and r == 2
+ // select the second and third PALETTE_MAX_SIZE-sized segments of the array.
+ for (r = 1; r < 3; ++r) {
+ for (c = 0; c < pmi->palette_size[1]; ++c) {
+ centroids[c * 2 + r - 1] = pmi->palette_colors[r * PALETTE_MAX_SIZE + c];
+ }
+ }
+
+ av1_calc_indices(data, centroids, color_map, rows * cols,
+ pmi->palette_size[1], 2);
+ extend_palette_color_map(color_map, cols, rows, plane_block_width,
+ plane_block_height);
+}
+#endif // CONFIG_PALETTE
+
+#if CONFIG_FILTER_INTRA
+ // Evaluates the DC_PRED + filter-intra candidate for a block and, when its RD
+ // cost beats *best_rd, records it as the new best mode.
+ //
+ // Results are written through the pointer arguments: best_intra_rd and
+ // best_intra_mode (best intra candidate), best_pred_rd[] (per reference
+ // mode minimum), and on overall improvement best_rd, best_mode_index,
+ // best_mbmode, best_skip2, best_mode_skippable and rd_cost (plus
+ // returnrate_nocoef under CONFIG_SUPERTX).
+ //
+ // The arrays indexed by uv_tx (rate_uv_intra, rate_uv_tokenonly, dist_uv,
+ // skip_uv, mode_uv, filter_intra_mode_info_uv, and the optional pmi_uv /
+ // uv_angle_delta) act as a cache of the chroma decision: an entry is
+ // computed here only when rate_uv_intra[uv_tx] is still INT_MAX.
+ static void pick_filter_intra_interframe(
+ const AV1_COMP *cpi, MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
+ BLOCK_SIZE bsize, int mi_row, int mi_col, int *rate_uv_intra,
+ int *rate_uv_tokenonly, int64_t *dist_uv, int *skip_uv,
+ PREDICTION_MODE *mode_uv, FILTER_INTRA_MODE_INFO *filter_intra_mode_info_uv,
+#if CONFIG_EXT_INTRA
+ int8_t *uv_angle_delta,
+#endif // CONFIG_EXT_INTRA
+#if CONFIG_PALETTE
+ PALETTE_MODE_INFO *pmi_uv, int palette_ctx,
+#endif // CONFIG_PALETTE
+ int skip_mask, unsigned int *ref_costs_single, int64_t *best_rd,
+ int64_t *best_intra_rd, PREDICTION_MODE *best_intra_mode,
+ int *best_mode_index, int *best_skip2, int *best_mode_skippable,
+#if CONFIG_SUPERTX
+ int *returnrate_nocoef,
+#endif // CONFIG_SUPERTX
+ int64_t *best_pred_rd, MB_MODE_INFO *best_mbmode, RD_STATS *rd_cost) {
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+#if CONFIG_PALETTE
+ PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+#endif // CONFIG_PALETTE
+ int rate2 = 0, rate_y = INT_MAX, skippable = 0, rate_uv, rate_dummy, i;
+ int dc_mode_index;
+ const int *const intra_mode_cost = cpi->mbmode_cost[size_group_lookup[bsize]];
+ int64_t distortion2 = 0, distortion_y = 0, this_rd = *best_rd;
+ int64_t distortion_uv, model_rd = INT64_MAX;
+ TX_SIZE uv_tx;
+
+ // Locate the av1_mode_order entry for intra DC_PRED; its index is what gets
+ // reported through best_mode_index if this candidate wins.
+ for (i = 0; i < MAX_MODES; ++i)
+ if (av1_mode_order[i].mode == DC_PRED &&
+ av1_mode_order[i].ref_frame[0] == INTRA_FRAME)
+ break;
+ dc_mode_index = i;
+ assert(i < MAX_MODES);
+
+ // TODO(huisu): use skip_mask for further speedup.
+ (void)skip_mask;
+ mbmi->mode = DC_PRED;
+ mbmi->uv_mode = DC_PRED;
+ mbmi->ref_frame[0] = INTRA_FRAME;
+ mbmi->ref_frame[1] = NONE_FRAME;
+ // Luma filter-intra search; give up if it returns zero or leaves rate_y at
+ // its INT_MAX initial value.
+ if (!rd_pick_filter_intra_sby(cpi, x, &rate_dummy, &rate_y, &distortion_y,
+ &skippable, bsize, intra_mode_cost[mbmi->mode],
+ &this_rd, &model_rd, 0)) {
+ return;
+ }
+ if (rate_y == INT_MAX) return;
+
+ uv_tx = uv_txsize_lookup[bsize][mbmi->tx_size][xd->plane[1].subsampling_x]
+ [xd->plane[1].subsampling_y];
+ if (rate_uv_intra[uv_tx] == INT_MAX) {
+ choose_intra_uv_mode(cpi, x, ctx, bsize, uv_tx, &rate_uv_intra[uv_tx],
+ &rate_uv_tokenonly[uv_tx], &dist_uv[uv_tx],
+ &skip_uv[uv_tx], &mode_uv[uv_tx]);
+#if CONFIG_PALETTE
+ if (cm->allow_screen_content_tools) pmi_uv[uv_tx] = *pmi;
+#endif // CONFIG_PALETTE
+ filter_intra_mode_info_uv[uv_tx] = mbmi->filter_intra_mode_info;
+#if CONFIG_EXT_INTRA
+ uv_angle_delta[uv_tx] = mbmi->angle_delta[1];
+#endif // CONFIG_EXT_INTRA
+ }
+
+ // Load the (possibly cached) chroma decision for this uv_tx into mbmi.
+ rate_uv = rate_uv_tokenonly[uv_tx];
+ distortion_uv = dist_uv[uv_tx];
+ skippable = skippable && skip_uv[uv_tx];
+ mbmi->uv_mode = mode_uv[uv_tx];
+#if CONFIG_PALETTE
+ if (cm->allow_screen_content_tools) {
+ pmi->palette_size[1] = pmi_uv[uv_tx].palette_size[1];
+ memcpy(pmi->palette_colors + PALETTE_MAX_SIZE,
+ pmi_uv[uv_tx].palette_colors + PALETTE_MAX_SIZE,
+ 2 * PALETTE_MAX_SIZE * sizeof(pmi->palette_colors[0]));
+ }
+#endif // CONFIG_PALETTE
+#if CONFIG_EXT_INTRA
+ mbmi->angle_delta[1] = uv_angle_delta[uv_tx];
+#endif // CONFIG_EXT_INTRA
+ mbmi->filter_intra_mode_info.use_filter_intra_mode[1] =
+ filter_intra_mode_info_uv[uv_tx].use_filter_intra_mode[1];
+ if (filter_intra_mode_info_uv[uv_tx].use_filter_intra_mode[1]) {
+ mbmi->filter_intra_mode_info.filter_intra_mode[1] =
+ filter_intra_mode_info_uv[uv_tx].filter_intra_mode[1];
+ }
+
+ rate2 = rate_y + intra_mode_cost[mbmi->mode] + rate_uv +
+ cpi->intra_uv_mode_cost[mbmi->mode][mbmi->uv_mode];
+#if CONFIG_PALETTE
+ if (cpi->common.allow_screen_content_tools && mbmi->mode == DC_PRED &&
+ bsize >= BLOCK_8X8)
+ rate2 += av1_cost_bit(
+ av1_default_palette_y_mode_prob[bsize - BLOCK_8X8][palette_ctx], 0);
+#endif // CONFIG_PALETTE
+
+ if (!xd->lossless[mbmi->segment_id]) {
+ // super_block_yrd above includes the cost of the tx_size in the
+ // tokenonly rate, but for intra blocks, tx_size is always coded
+ // (prediction granularity), so we account for it in the full rate,
+ // not the tokenonly rate.
+ rate_y -= tx_size_cost(cpi, x, bsize, mbmi->tx_size);
+ }
+
+ rate2 += av1_cost_bit(cm->fc->filter_intra_probs[0],
+ mbmi->filter_intra_mode_info.use_filter_intra_mode[0]);
+ rate2 += write_uniform_cost(
+ FILTER_INTRA_MODES, mbmi->filter_intra_mode_info.filter_intra_mode[0]);
+#if CONFIG_EXT_INTRA
+ if (av1_is_directional_mode(mbmi->uv_mode, bsize)) {
+ rate2 += write_uniform_cost(2 * MAX_ANGLE_DELTA + 1,
+ MAX_ANGLE_DELTA + mbmi->angle_delta[1]);
+ }
+#endif // CONFIG_EXT_INTRA
+ if (mbmi->mode == DC_PRED) {
+ rate2 +=
+ av1_cost_bit(cpi->common.fc->filter_intra_probs[1],
+ mbmi->filter_intra_mode_info.use_filter_intra_mode[1]);
+ if (mbmi->filter_intra_mode_info.use_filter_intra_mode[1])
+ rate2 +=
+ write_uniform_cost(FILTER_INTRA_MODES,
+ mbmi->filter_intra_mode_info.filter_intra_mode[1]);
+ }
+ distortion2 = distortion_y + distortion_uv;
+ av1_encode_intra_block_plane((AV1_COMMON *)cm, x, bsize, 0, 0, mi_row,
+ mi_col);
+
+ rate2 += ref_costs_single[INTRA_FRAME];
+
+ // When both luma and chroma are skippable, drop the coefficient rates and
+ // charge the skip flag as 1; otherwise charge the skip flag as 0.
+ if (skippable) {
+ rate2 -= (rate_y + rate_uv);
+ rate_y = 0;
+ rate_uv = 0;
+ rate2 += av1_cost_bit(av1_get_skip_prob(cm, xd), 1);
+ } else {
+ rate2 += av1_cost_bit(av1_get_skip_prob(cm, xd), 0);
+ }
+ this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
+
+ if (this_rd < *best_intra_rd) {
+ *best_intra_rd = this_rd;
+ *best_intra_mode = mbmi->mode;
+ }
+ for (i = 0; i < REFERENCE_MODES; ++i)
+ best_pred_rd[i] = AOMMIN(best_pred_rd[i], this_rd);
+
+ // Overall winner: publish this candidate through the best_* outputs.
+ if (this_rd < *best_rd) {
+ *best_mode_index = dc_mode_index;
+ mbmi->mv[0].as_int = 0;
+ rd_cost->rate = rate2;
+#if CONFIG_SUPERTX
+ if (x->skip)
+ *returnrate_nocoef = rate2;
+ else
+ *returnrate_nocoef = rate2 - rate_y - rate_uv;
+ *returnrate_nocoef -= av1_cost_bit(av1_get_skip_prob(cm, xd), skippable);
+ *returnrate_nocoef -= av1_cost_bit(av1_get_intra_inter_prob(cm, xd),
+ mbmi->ref_frame[0] != INTRA_FRAME);
+#endif // CONFIG_SUPERTX
+ rd_cost->dist = distortion2;
+ rd_cost->rdcost = this_rd;
+ *best_rd = this_rd;
+ *best_mbmode = *mbmi;
+ *best_skip2 = 0;
+ *best_mode_skippable = skippable;
+ }
+}
+#endif // CONFIG_FILTER_INTRA
+
+#if CONFIG_MOTION_VAR // Prototype only: av1_rd_pick_inter_mode_sb() below calls this.
+static void calc_target_weighted_pred(const AV1_COMMON *cm, const MACROBLOCK *x,
+ const MACROBLOCKD *xd, int mi_row,
+ int mi_col, const uint8_t *above,
+ int above_stride, const uint8_t *left,
+ int left_stride); // static: body is presumably defined later in this file
+#endif // CONFIG_MOTION_VAR
+
+void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
+ MACROBLOCK *x, int mi_row, int mi_col,
+ RD_STATS *rd_cost,
+#if CONFIG_SUPERTX
+ int *returnrate_nocoef,
+#endif // CONFIG_SUPERTX
+ BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
+ int64_t best_rd_so_far) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const RD_OPT *const rd_opt = &cpi->rd;
+ const SPEED_FEATURES *const sf = &cpi->sf;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+#if CONFIG_PALETTE
+ const int try_palette =
+ cpi->common.allow_screen_content_tools && bsize >= BLOCK_8X8;
+ PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+#endif // CONFIG_PALETTE
+ MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
+ const struct segmentation *const seg = &cm->seg;
+ PREDICTION_MODE this_mode;
+ MV_REFERENCE_FRAME ref_frame, second_ref_frame;
+ unsigned char segment_id = mbmi->segment_id;
+ int comp_pred, i, k;
+ int_mv frame_mv[MB_MODE_COUNT][TOTAL_REFS_PER_FRAME];
+ struct buf_2d yv12_mb[TOTAL_REFS_PER_FRAME][MAX_MB_PLANE];
+ int_mv single_newmv[TOTAL_REFS_PER_FRAME] = { { 0 } };
+#if CONFIG_EXT_INTER
+ int single_newmv_rate[TOTAL_REFS_PER_FRAME] = { 0 };
+ int64_t modelled_rd[MB_MODE_COUNT][TOTAL_REFS_PER_FRAME];
+#endif // CONFIG_EXT_INTER
+ static const int flag_list[TOTAL_REFS_PER_FRAME] = {
+ 0,
+ AOM_LAST_FLAG,
+#if CONFIG_EXT_REFS
+ AOM_LAST2_FLAG,
+ AOM_LAST3_FLAG,
+#endif // CONFIG_EXT_REFS
+ AOM_GOLD_FLAG,
+#if CONFIG_EXT_REFS
+ AOM_BWD_FLAG,
+#endif // CONFIG_EXT_REFS
+ AOM_ALT_FLAG
+ };
+ int64_t best_rd = best_rd_so_far;
+ int best_rate_y = INT_MAX, best_rate_uv = INT_MAX;
+ int64_t best_pred_diff[REFERENCE_MODES];
+ int64_t best_pred_rd[REFERENCE_MODES];
+ MB_MODE_INFO best_mbmode;
+#if CONFIG_REF_MV
+ int rate_skip0 = av1_cost_bit(av1_get_skip_prob(cm, xd), 0);
+ int rate_skip1 = av1_cost_bit(av1_get_skip_prob(cm, xd), 1);
+#endif // CONFIG_REF_MV
+ int best_mode_skippable = 0;
+ int midx, best_mode_index = -1;
+ unsigned int ref_costs_single[TOTAL_REFS_PER_FRAME];
+ unsigned int ref_costs_comp[TOTAL_REFS_PER_FRAME];
+ aom_prob comp_mode_p;
+ int64_t best_intra_rd = INT64_MAX;
+ unsigned int best_pred_sse = UINT_MAX;
+ PREDICTION_MODE best_intra_mode = DC_PRED;
+ int rate_uv_intra[TX_SIZES_ALL], rate_uv_tokenonly[TX_SIZES_ALL];
+ int64_t dist_uvs[TX_SIZES_ALL];
+ int skip_uvs[TX_SIZES_ALL];
+ PREDICTION_MODE mode_uv[TX_SIZES_ALL];
+#if CONFIG_PALETTE
+ PALETTE_MODE_INFO pmi_uv[TX_SIZES_ALL];
+#endif // CONFIG_PALETTE
+#if CONFIG_EXT_INTRA
+ int8_t uv_angle_delta[TX_SIZES_ALL];
+ int is_directional_mode, angle_stats_ready = 0;
+ uint8_t directional_mode_skip_mask[INTRA_MODES];
+#endif // CONFIG_EXT_INTRA
+#if CONFIG_FILTER_INTRA
+ int8_t dc_skipped = 1;
+ FILTER_INTRA_MODE_INFO filter_intra_mode_info_uv[TX_SIZES_ALL];
+#endif // CONFIG_FILTER_INTRA
+ const int intra_cost_penalty = av1_get_intra_cost_penalty(
+ cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth);
+ const int *const intra_mode_cost = cpi->mbmode_cost[size_group_lookup[bsize]];
+ int best_skip2 = 0;
+ uint8_t ref_frame_skip_mask[2] = { 0 };
+#if CONFIG_EXT_INTER
+ uint32_t mode_skip_mask[TOTAL_REFS_PER_FRAME] = { 0 };
+ MV_REFERENCE_FRAME best_single_inter_ref = LAST_FRAME;
+ int64_t best_single_inter_rd = INT64_MAX;
+#else
+ uint16_t mode_skip_mask[TOTAL_REFS_PER_FRAME] = { 0 };
+#endif // CONFIG_EXT_INTER
+ int mode_skip_start = sf->mode_skip_start + 1;
+ const int *const rd_threshes = rd_opt->threshes[segment_id][bsize];
+ const int *const rd_thresh_freq_fact = tile_data->thresh_freq_fact[bsize];
+ int64_t mode_threshold[MAX_MODES];
+ int *mode_map = tile_data->mode_map[bsize];
+ const int mode_search_skip_flags = sf->mode_search_skip_flags;
+#if CONFIG_PVQ
+ od_rollback_buffer pre_buf;
+#endif // CONFIG_PVQ
+
+ HandleInterModeArgs args = {
+#if CONFIG_MOTION_VAR
+ { NULL },
+ { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE },
+ { NULL },
+ { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE },
+#endif // CONFIG_MOTION_VAR
+#if CONFIG_EXT_INTER
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+#else // CONFIG_EXT_INTER
+ NULL,
+#endif // CONFIG_EXT_INTER
+ { { 0 } },
+ };
+
+#if CONFIG_PALETTE || CONFIG_EXT_INTRA
+ const int rows = block_size_high[bsize];
+ const int cols = block_size_wide[bsize];
+#endif // CONFIG_PALETTE || CONFIG_EXT_INTRA
+#if CONFIG_PALETTE
+ int palette_ctx = 0;
+ const MODE_INFO *above_mi = xd->above_mi;
+ const MODE_INFO *left_mi = xd->left_mi;
+#endif // CONFIG_PALETTE
+#if CONFIG_MOTION_VAR
+#if CONFIG_HIGHBITDEPTH
+ DECLARE_ALIGNED(16, uint8_t, tmp_buf1[2 * MAX_MB_PLANE * MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(16, uint8_t, tmp_buf2[2 * MAX_MB_PLANE * MAX_SB_SQUARE]);
+#else
+ DECLARE_ALIGNED(16, uint8_t, tmp_buf1[MAX_MB_PLANE * MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(16, uint8_t, tmp_buf2[MAX_MB_PLANE * MAX_SB_SQUARE]);
+#endif // CONFIG_HIGHBITDEPTH
+ DECLARE_ALIGNED(16, int32_t, weighted_src_buf[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(16, int32_t, mask2d_buf[MAX_SB_SQUARE]);
+ int dst_width1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+ int dst_width2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+ int dst_height1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+ int dst_height2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+
+#if CONFIG_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ int len = sizeof(uint16_t);
+ args.above_pred_buf[0] = CONVERT_TO_BYTEPTR(tmp_buf1);
+ args.above_pred_buf[1] = CONVERT_TO_BYTEPTR(tmp_buf1 + MAX_SB_SQUARE * len);
+ args.above_pred_buf[2] =
+ CONVERT_TO_BYTEPTR(tmp_buf1 + 2 * MAX_SB_SQUARE * len);
+ args.left_pred_buf[0] = CONVERT_TO_BYTEPTR(tmp_buf2);
+ args.left_pred_buf[1] = CONVERT_TO_BYTEPTR(tmp_buf2 + MAX_SB_SQUARE * len);
+ args.left_pred_buf[2] =
+ CONVERT_TO_BYTEPTR(tmp_buf2 + 2 * MAX_SB_SQUARE * len);
+ } else {
+#endif // CONFIG_HIGHBITDEPTH
+ args.above_pred_buf[0] = tmp_buf1;
+ args.above_pred_buf[1] = tmp_buf1 + MAX_SB_SQUARE;
+ args.above_pred_buf[2] = tmp_buf1 + 2 * MAX_SB_SQUARE;
+ args.left_pred_buf[0] = tmp_buf2;
+ args.left_pred_buf[1] = tmp_buf2 + MAX_SB_SQUARE;
+ args.left_pred_buf[2] = tmp_buf2 + 2 * MAX_SB_SQUARE;
+#if CONFIG_HIGHBITDEPTH
+ }
+#endif // CONFIG_HIGHBITDEPTH
+#endif // CONFIG_MOTION_VAR
+
+ av1_zero(best_mbmode);
+
+#if CONFIG_PALETTE
+ av1_zero(pmi_uv);
+ if (try_palette) {
+ if (above_mi)
+ palette_ctx += (above_mi->mbmi.palette_mode_info.palette_size[0] > 0);
+ if (left_mi)
+ palette_ctx += (left_mi->mbmi.palette_mode_info.palette_size[0] > 0);
+ }
+#endif // CONFIG_PALETTE
+
+#if CONFIG_EXT_INTRA
+ memset(directional_mode_skip_mask, 0,
+ sizeof(directional_mode_skip_mask[0]) * INTRA_MODES);
+#endif // CONFIG_EXT_INTRA
+
+ estimate_ref_frame_costs(cm, xd, segment_id, ref_costs_single, ref_costs_comp,
+ &comp_mode_p);
+
+ for (i = 0; i < REFERENCE_MODES; ++i) best_pred_rd[i] = INT64_MAX;
+ for (i = 0; i < TX_SIZES_ALL; i++) rate_uv_intra[i] = INT_MAX;
+ for (i = 0; i < TOTAL_REFS_PER_FRAME; ++i) x->pred_sse[i] = INT_MAX;
+ for (i = 0; i < MB_MODE_COUNT; ++i) {
+ for (k = 0; k < TOTAL_REFS_PER_FRAME; ++k) {
+ args.single_filter[i][k] = SWITCHABLE;
+ }
+ }
+
+ rd_cost->rate = INT_MAX;
+#if CONFIG_SUPERTX
+ *returnrate_nocoef = INT_MAX;
+#endif // CONFIG_SUPERTX
+
+ for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+ x->pred_mv_sad[ref_frame] = INT_MAX;
+ x->mbmi_ext->mode_context[ref_frame] = 0;
+#if CONFIG_REF_MV && CONFIG_EXT_INTER
+ x->mbmi_ext->compound_mode_context[ref_frame] = 0;
+#endif // CONFIG_REF_MV && CONFIG_EXT_INTER
+ if (cpi->ref_frame_flags & flag_list[ref_frame]) {
+ assert(get_ref_frame_buffer(cpi, ref_frame) != NULL);
+ setup_buffer_inter(cpi, x, ref_frame, bsize, mi_row, mi_col,
+ frame_mv[NEARESTMV], frame_mv[NEARMV], yv12_mb);
+ }
+ frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
+#if CONFIG_GLOBAL_MOTION
+ frame_mv[ZEROMV][ref_frame].as_int =
+ gm_get_motion_vector(&cm->global_motion[ref_frame],
+ cm->allow_high_precision_mv, bsize, mi_col, mi_row,
+ 0)
+ .as_int;
+#else // CONFIG_GLOBAL_MOTION
+ frame_mv[ZEROMV][ref_frame].as_int = 0;
+#endif // CONFIG_GLOBAL_MOTION
+#if CONFIG_EXT_INTER
+ frame_mv[NEW_NEWMV][ref_frame].as_int = INVALID_MV;
+#if CONFIG_GLOBAL_MOTION
+ frame_mv[ZERO_ZEROMV][ref_frame].as_int =
+ gm_get_motion_vector(&cm->global_motion[ref_frame],
+ cm->allow_high_precision_mv, bsize, mi_col, mi_row,
+ 0)
+ .as_int;
+#else // CONFIG_GLOBAL_MOTION
+ frame_mv[ZERO_ZEROMV][ref_frame].as_int = 0;
+#endif // CONFIG_GLOBAL_MOTION
+#endif // CONFIG_EXT_INTER
+ }
+
+#if CONFIG_REF_MV
+ for (; ref_frame < MODE_CTX_REF_FRAMES; ++ref_frame) {
+ MODE_INFO *const mi = xd->mi[0];
+ int_mv *const candidates = x->mbmi_ext->ref_mvs[ref_frame];
+ x->mbmi_ext->mode_context[ref_frame] = 0;
+ av1_find_mv_refs(cm, xd, mi, ref_frame, &mbmi_ext->ref_mv_count[ref_frame],
+ mbmi_ext->ref_mv_stack[ref_frame],
+#if CONFIG_EXT_INTER
+ mbmi_ext->compound_mode_context,
+#endif // CONFIG_EXT_INTER
+ candidates, mi_row, mi_col, NULL, NULL,
+ mbmi_ext->mode_context);
+ if (mbmi_ext->ref_mv_count[ref_frame] < 2) {
+ MV_REFERENCE_FRAME rf[2];
+ av1_set_ref_frame(rf, ref_frame);
+ if (mbmi_ext->ref_mvs[rf[0]][0].as_int !=
+ frame_mv[ZEROMV][rf[0]].as_int ||
+ mbmi_ext->ref_mvs[rf[0]][1].as_int !=
+ frame_mv[ZEROMV][rf[0]].as_int ||
+ mbmi_ext->ref_mvs[rf[1]][0].as_int !=
+ frame_mv[ZEROMV][rf[1]].as_int ||
+ mbmi_ext->ref_mvs[rf[1]][1].as_int != frame_mv[ZEROMV][rf[1]].as_int)
+ mbmi_ext->mode_context[ref_frame] &= ~(1 << ALL_ZERO_FLAG_OFFSET);
+ }
+ }
+#endif // CONFIG_REF_MV
+
+#if CONFIG_MOTION_VAR
+ av1_count_overlappable_neighbors(cm, xd, mi_row, mi_col);
+ if (check_num_overlappable_neighbors(mbmi) &&
+ is_motion_variation_allowed_bsize(bsize)) {
+ av1_build_prediction_by_above_preds(cm, xd, mi_row, mi_col,
+ args.above_pred_buf, dst_width1,
+ dst_height1, args.above_pred_stride);
+ av1_build_prediction_by_left_preds(cm, xd, mi_row, mi_col,
+ args.left_pred_buf, dst_width2,
+ dst_height2, args.left_pred_stride);
+ av1_setup_dst_planes(xd->plane, bsize, get_frame_new_buffer(cm), mi_row,
+ mi_col);
+ x->mask_buf = mask2d_buf;
+ x->wsrc_buf = weighted_src_buf;
+ calc_target_weighted_pred(cm, x, xd, mi_row, mi_col, args.above_pred_buf[0],
+ args.above_pred_stride[0], args.left_pred_buf[0],
+ args.left_pred_stride[0]);
+ }
+#endif // CONFIG_MOTION_VAR
+
+ for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+ if (!(cpi->ref_frame_flags & flag_list[ref_frame])) {
+// Skip checking missing references in both single and compound reference
+// modes. Note that a mode will be skipped iff both reference frames
+// are masked out.
+#if CONFIG_EXT_REFS
+ if (ref_frame == BWDREF_FRAME || ref_frame == ALTREF_FRAME) {
+ ref_frame_skip_mask[0] |= (1 << ref_frame);
+ ref_frame_skip_mask[1] |= ((1 << ref_frame) | 0x01);
+ } else {
+#endif // CONFIG_EXT_REFS
+ ref_frame_skip_mask[0] |= (1 << ref_frame);
+ ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
+#if CONFIG_EXT_REFS
+ }
+#endif // CONFIG_EXT_REFS
+ } else {
+ for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
+ // Skip fixed mv modes for poor references
+ if ((x->pred_mv_sad[ref_frame] >> 2) > x->pred_mv_sad[i]) {
+ mode_skip_mask[ref_frame] |= INTER_NEAREST_NEAR_ZERO;
+ break;
+ }
+ }
+ }
+ // If the segment reference frame feature is enabled, mask out the
+ // current ref frame when it is not the one the segment allows.
+ if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) &&
+ get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame) {
+ ref_frame_skip_mask[0] |= (1 << ref_frame);
+ ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
+ }
+ }
+
+ // Disable this drop out case if the ref frame
+ // segment level feature is enabled for this segment. This is to
+ // prevent the possibility that we end up unable to pick any mode.
+ if (!segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) {
+ // Only consider ZEROMV/ALTREF_FRAME for alt ref frame,
+ // unless ARNR filtering is enabled in which case we want
+ // an unfiltered alternative. We allow near/nearest as well
+ // because they may result in zero-zero MVs but be cheaper.
+ if (cpi->rc.is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0)) {
+ int_mv zeromv;
+ ref_frame_skip_mask[0] = (1 << LAST_FRAME) |
+#if CONFIG_EXT_REFS
+ (1 << LAST2_FRAME) | (1 << LAST3_FRAME) |
+ (1 << BWDREF_FRAME) |
+#endif // CONFIG_EXT_REFS
+ (1 << GOLDEN_FRAME);
+ ref_frame_skip_mask[1] = SECOND_REF_FRAME_MASK;
+ // TODO(zoeliu): To further explore whether following needs to be done for
+ // BWDREF_FRAME as well.
+ mode_skip_mask[ALTREF_FRAME] = ~INTER_NEAREST_NEAR_ZERO;
+#if CONFIG_GLOBAL_MOTION
+ zeromv.as_int = gm_get_motion_vector(&cm->global_motion[ALTREF_FRAME],
+ cm->allow_high_precision_mv, bsize,
+ mi_col, mi_row, 0)
+ .as_int;
+#else
+ zeromv.as_int = 0;
+#endif // CONFIG_GLOBAL_MOTION
+ if (frame_mv[NEARMV][ALTREF_FRAME].as_int != zeromv.as_int)
+ mode_skip_mask[ALTREF_FRAME] |= (1 << NEARMV);
+ if (frame_mv[NEARESTMV][ALTREF_FRAME].as_int != zeromv.as_int)
+ mode_skip_mask[ALTREF_FRAME] |= (1 << NEARESTMV);
+#if CONFIG_EXT_INTER
+ if (frame_mv[NEAREST_NEARESTMV][ALTREF_FRAME].as_int != zeromv.as_int)
+ mode_skip_mask[ALTREF_FRAME] |= (1 << NEAREST_NEARESTMV);
+ if (frame_mv[NEAREST_NEARMV][ALTREF_FRAME].as_int != zeromv.as_int)
+ mode_skip_mask[ALTREF_FRAME] |= (1 << NEAREST_NEARMV);
+ if (frame_mv[NEAR_NEARESTMV][ALTREF_FRAME].as_int != zeromv.as_int)
+ mode_skip_mask[ALTREF_FRAME] |= (1 << NEAR_NEARESTMV);
+ if (frame_mv[NEAR_NEARMV][ALTREF_FRAME].as_int != zeromv.as_int)
+ mode_skip_mask[ALTREF_FRAME] |= (1 << NEAR_NEARMV);
+#endif // CONFIG_EXT_INTER
+ }
+ }
+
+ if (cpi->rc.is_src_frame_alt_ref) {
+ if (sf->alt_ref_search_fp) {
+ assert(cpi->ref_frame_flags & flag_list[ALTREF_FRAME]);
+ mode_skip_mask[ALTREF_FRAME] = 0;
+ ref_frame_skip_mask[0] = ~(1 << ALTREF_FRAME);
+ ref_frame_skip_mask[1] = SECOND_REF_FRAME_MASK;
+ }
+ }
+
+ if (sf->alt_ref_search_fp)
+ if (!cm->show_frame && x->pred_mv_sad[GOLDEN_FRAME] < INT_MAX)
+ if (x->pred_mv_sad[ALTREF_FRAME] > (x->pred_mv_sad[GOLDEN_FRAME] << 1))
+ mode_skip_mask[ALTREF_FRAME] |= INTER_ALL;
+
+ if (sf->adaptive_mode_search) {
+ if (cm->show_frame && !cpi->rc.is_src_frame_alt_ref &&
+ cpi->rc.frames_since_golden >= 3)
+ if (x->pred_mv_sad[GOLDEN_FRAME] > (x->pred_mv_sad[LAST_FRAME] << 1))
+ mode_skip_mask[GOLDEN_FRAME] |= INTER_ALL;
+ }
+
+ if (bsize > sf->max_intra_bsize) {
+ ref_frame_skip_mask[0] |= (1 << INTRA_FRAME);
+ ref_frame_skip_mask[1] |= (1 << INTRA_FRAME);
+ }
+
+ mode_skip_mask[INTRA_FRAME] |=
+ ~(sf->intra_y_mode_mask[max_txsize_lookup[bsize]]);
+
+ for (i = 0; i <= LAST_NEW_MV_INDEX; ++i) mode_threshold[i] = 0;
+ for (i = LAST_NEW_MV_INDEX + 1; i < MAX_MODES; ++i)
+ mode_threshold[i] = ((int64_t)rd_threshes[i] * rd_thresh_freq_fact[i]) >> 5;
+
+ midx = sf->schedule_mode_search ? mode_skip_start : 0;
+ while (midx > 4) {
+ uint8_t end_pos = 0;
+ for (i = 5; i < midx; ++i) {
+ if (mode_threshold[mode_map[i - 1]] > mode_threshold[mode_map[i]]) {
+ uint8_t tmp = mode_map[i];
+ mode_map[i] = mode_map[i - 1];
+ mode_map[i - 1] = tmp;
+ end_pos = i;
+ }
+ }
+ midx = end_pos;
+ }
+
+ if (cpi->sf.tx_type_search.fast_intra_tx_type_search)
+ x->use_default_intra_tx_type = 1;
+ else
+ x->use_default_intra_tx_type = 0;
+
+ if (cpi->sf.tx_type_search.fast_inter_tx_type_search)
+ x->use_default_inter_tx_type = 1;
+ else
+ x->use_default_inter_tx_type = 0;
+#if CONFIG_PVQ
+ od_encode_checkpoint(&x->daala_enc, &pre_buf);
+#endif // CONFIG_PVQ
+#if CONFIG_EXT_INTER
+ for (i = 0; i < MB_MODE_COUNT; ++i)
+ for (ref_frame = 0; ref_frame < TOTAL_REFS_PER_FRAME; ++ref_frame)
+ modelled_rd[i][ref_frame] = INT64_MAX;
+#endif // CONFIG_EXT_INTER
+
+ for (midx = 0; midx < MAX_MODES; ++midx) {
+ int mode_index;
+ int mode_excluded = 0;
+ int64_t this_rd = INT64_MAX;
+ int disable_skip = 0;
+ int compmode_cost = 0;
+#if CONFIG_EXT_INTER
+ int compmode_interintra_cost = 0;
+ int compmode_interinter_cost = 0;
+#endif // CONFIG_EXT_INTER
+ int rate2 = 0, rate_y = 0, rate_uv = 0;
+ int64_t distortion2 = 0, distortion_y = 0, distortion_uv = 0;
+ int skippable = 0;
+ int this_skip2 = 0;
+ int64_t total_sse = INT64_MAX;
+#if CONFIG_REF_MV
+ uint8_t ref_frame_type;
+#endif // CONFIG_REF_MV
+#if CONFIG_PVQ
+ od_encode_rollback(&x->daala_enc, &pre_buf);
+#endif // CONFIG_PVQ
+ mode_index = mode_map[midx];
+ this_mode = av1_mode_order[mode_index].mode;
+ ref_frame = av1_mode_order[mode_index].ref_frame[0];
+ second_ref_frame = av1_mode_order[mode_index].ref_frame[1];
+#if CONFIG_REF_MV
+ mbmi->ref_mv_idx = 0;
+#endif // CONFIG_REF_MV
+
+#if CONFIG_EXT_INTER
+ if (ref_frame > INTRA_FRAME && second_ref_frame == INTRA_FRAME) {
+ // Mode must be compatible
+ if (!is_interintra_allowed_mode(this_mode)) continue;
+ if (!is_interintra_allowed_bsize(bsize)) continue;
+ }
+
+ if (is_inter_compound_mode(this_mode)) {
+ frame_mv[this_mode][ref_frame].as_int =
+ frame_mv[compound_ref0_mode(this_mode)][ref_frame].as_int;
+ frame_mv[this_mode][second_ref_frame].as_int =
+ frame_mv[compound_ref1_mode(this_mode)][second_ref_frame].as_int;
+ }
+#endif // CONFIG_EXT_INTER
+
+ // Look at the reference frame of the best mode so far and set the
+ // skip mask to look at a subset of the remaining modes.
+ if (midx == mode_skip_start && best_mode_index >= 0) {
+ switch (best_mbmode.ref_frame[0]) {
+ case INTRA_FRAME: break;
+ case LAST_FRAME:
+ ref_frame_skip_mask[0] |= LAST_FRAME_MODE_MASK;
+ ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
+ break;
+#if CONFIG_EXT_REFS
+ case LAST2_FRAME:
+ ref_frame_skip_mask[0] |= LAST2_FRAME_MODE_MASK;
+ ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
+ break;
+ case LAST3_FRAME:
+ ref_frame_skip_mask[0] |= LAST3_FRAME_MODE_MASK;
+ ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
+ break;
+#endif // CONFIG_EXT_REFS
+ case GOLDEN_FRAME:
+ ref_frame_skip_mask[0] |= GOLDEN_FRAME_MODE_MASK;
+ ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
+ break;
+#if CONFIG_EXT_REFS
+ case BWDREF_FRAME:
+ ref_frame_skip_mask[0] |= BWDREF_FRAME_MODE_MASK;
+ ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
+ break;
+#endif // CONFIG_EXT_REFS
+ case ALTREF_FRAME: ref_frame_skip_mask[0] |= ALTREF_FRAME_MODE_MASK;
+#if CONFIG_EXT_REFS
+ ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
+#endif // CONFIG_EXT_REFS
+ break;
+ case NONE_FRAME:
+ case TOTAL_REFS_PER_FRAME:
+ assert(0 && "Invalid Reference frame");
+ break;
+ }
+ }
+
+ if ((ref_frame_skip_mask[0] & (1 << ref_frame)) &&
+ (ref_frame_skip_mask[1] & (1 << AOMMAX(0, second_ref_frame))))
+ continue;
+
+ if (mode_skip_mask[ref_frame] & (1 << this_mode)) continue;
+
+ // Test best rd so far against threshold for trying this mode.
+ if (best_mode_skippable && sf->schedule_mode_search)
+ mode_threshold[mode_index] <<= 1;
+
+ if (best_rd < mode_threshold[mode_index]) continue;
+
+ // This is only used in motion vector unit test.
+ if (cpi->oxcf.motion_vector_unit_test && ref_frame == INTRA_FRAME) continue;
+
+#if CONFIG_LOWDELAY_COMPOUND // Changes LL bitstream
+#if CONFIG_EXT_REFS
+ if (cpi->oxcf.pass == 0) {
+ // Complexity-compression trade-offs
+ // if (ref_frame == ALTREF_FRAME) continue;
+ // if (ref_frame == BWDREF_FRAME) continue;
+ if (second_ref_frame == ALTREF_FRAME) continue;
+ // if (second_ref_frame == BWDREF_FRAME) continue;
+ }
+#endif
+#endif
+ comp_pred = second_ref_frame > INTRA_FRAME;
+ if (comp_pred) {
+ if (!cpi->allow_comp_inter_inter) continue;
+
+ // Skip compound inter modes if the second reference frame is not available.
+ if (!(cpi->ref_frame_flags & flag_list[second_ref_frame])) continue;
+
+ // Do not allow compound prediction if the segment level reference frame
+ // feature is in use as in this case there can only be one reference.
+ if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) continue;
+
+ if ((mode_search_skip_flags & FLAG_SKIP_COMP_BESTINTRA) &&
+ best_mode_index >= 0 && best_mbmode.ref_frame[0] == INTRA_FRAME)
+ continue;
+
+ mode_excluded = cm->reference_mode == SINGLE_REFERENCE;
+ } else {
+ if (ref_frame != INTRA_FRAME)
+ mode_excluded = cm->reference_mode == COMPOUND_REFERENCE;
+ }
+
+ if (ref_frame == INTRA_FRAME) {
+ if (sf->adaptive_mode_search)
+ if ((x->source_variance << num_pels_log2_lookup[bsize]) > best_pred_sse)
+ continue;
+
+ if (this_mode != DC_PRED) {
+ // Disable intra modes other than DC_PRED for blocks with low variance
+ // Threshold for intra skipping based on source variance
+ // TODO(debargha): Specialize the threshold for super block sizes
+ const unsigned int skip_intra_var_thresh = 64;
+ if ((mode_search_skip_flags & FLAG_SKIP_INTRA_LOWVAR) &&
+ x->source_variance < skip_intra_var_thresh)
+ continue;
+ // Only search the oblique modes if the best so far is
+ // one of the neighboring directional modes
+ if ((mode_search_skip_flags & FLAG_SKIP_INTRA_BESTINTER) &&
+ (this_mode >= D45_PRED && this_mode <= TM_PRED)) {
+ if (best_mode_index >= 0 && best_mbmode.ref_frame[0] > INTRA_FRAME)
+ continue;
+ }
+ if (mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) {
+ if (conditional_skipintra(this_mode, best_intra_mode)) continue;
+ }
+ }
+#if CONFIG_GLOBAL_MOTION
+ } else if (cm->global_motion[ref_frame].wmtype == IDENTITY &&
+ (!comp_pred ||
+ cm->global_motion[second_ref_frame].wmtype == IDENTITY)) {
+#else // CONFIG_GLOBAL_MOTION
+ } else {
+#endif // CONFIG_GLOBAL_MOTION
+ const MV_REFERENCE_FRAME ref_frames[2] = { ref_frame, second_ref_frame };
+ if (!check_best_zero_mv(cpi, mbmi_ext->mode_context,
+#if CONFIG_REF_MV && CONFIG_EXT_INTER
+ mbmi_ext->compound_mode_context,
+#endif // CONFIG_REF_MV && CONFIG_EXT_INTER
+ frame_mv, this_mode, ref_frames, bsize, -1,
+ mi_row, mi_col))
+ continue;
+ }
+
+ mbmi->mode = this_mode;
+ mbmi->uv_mode = DC_PRED;
+ mbmi->ref_frame[0] = ref_frame;
+ mbmi->ref_frame[1] = second_ref_frame;
+#if CONFIG_PALETTE
+ pmi->palette_size[0] = 0;
+ pmi->palette_size[1] = 0;
+#endif // CONFIG_PALETTE
+#if CONFIG_FILTER_INTRA
+ mbmi->filter_intra_mode_info.use_filter_intra_mode[0] = 0;
+ mbmi->filter_intra_mode_info.use_filter_intra_mode[1] = 0;
+#endif // CONFIG_FILTER_INTRA
+ // Evaluate all sub-pel filters irrespective of whether we can use
+ // them for this frame.
+
+ set_default_interp_filters(mbmi, cm->interp_filter);
+
+ mbmi->mv[0].as_int = mbmi->mv[1].as_int = 0;
+ mbmi->motion_mode = SIMPLE_TRANSLATION;
+
+ x->skip = 0;
+ set_ref_ptrs(cm, xd, ref_frame, second_ref_frame);
+
+ // Select prediction reference frames.
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ xd->plane[i].pre[0] = yv12_mb[ref_frame][i];
+ if (comp_pred) xd->plane[i].pre[1] = yv12_mb[second_ref_frame][i];
+ }
+
+#if CONFIG_EXT_INTER
+ mbmi->interintra_mode = (INTERINTRA_MODE)(II_DC_PRED - 1);
+#endif // CONFIG_EXT_INTER
+
+ if (ref_frame == INTRA_FRAME) {
+ RD_STATS rd_stats_y;
+ TX_SIZE uv_tx;
+ struct macroblockd_plane *const pd = &xd->plane[1];
+#if CONFIG_EXT_INTRA
+ is_directional_mode = av1_is_directional_mode(mbmi->mode, bsize);
+ if (is_directional_mode) {
+ int rate_dummy;
+ int64_t model_rd = INT64_MAX;
+ if (!angle_stats_ready) {
+ const int src_stride = x->plane[0].src.stride;
+ const uint8_t *src = x->plane[0].src.buf;
+#if CONFIG_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+ highbd_angle_estimation(src, src_stride, rows, cols,
+ directional_mode_skip_mask);
+ else
+#endif // CONFIG_HIGHBITDEPTH
+ angle_estimation(src, src_stride, rows, cols,
+ directional_mode_skip_mask);
+ angle_stats_ready = 1;
+ }
+ if (directional_mode_skip_mask[mbmi->mode]) continue;
+ rd_stats_y.rate = INT_MAX;
+ rd_pick_intra_angle_sby(cpi, x, &rate_dummy, &rd_stats_y, bsize,
+ intra_mode_cost[mbmi->mode], best_rd,
+ &model_rd);
+ } else {
+ mbmi->angle_delta[0] = 0;
+ super_block_yrd(cpi, x, &rd_stats_y, bsize, best_rd);
+ }
+#else
+ super_block_yrd(cpi, x, &rd_stats_y, bsize, best_rd);
+#endif // CONFIG_EXT_INTRA
+ rate_y = rd_stats_y.rate;
+ distortion_y = rd_stats_y.dist;
+ skippable = rd_stats_y.skip;
+
+ if (rate_y == INT_MAX) continue;
+
+#if CONFIG_FILTER_INTRA
+ if (mbmi->mode == DC_PRED) dc_skipped = 0;
+#endif // CONFIG_FILTER_INTRA
+
+ uv_tx = uv_txsize_lookup[bsize][mbmi->tx_size][pd->subsampling_x]
+ [pd->subsampling_y];
+ if (rate_uv_intra[uv_tx] == INT_MAX) {
+ choose_intra_uv_mode(cpi, x, ctx, bsize, uv_tx, &rate_uv_intra[uv_tx],
+ &rate_uv_tokenonly[uv_tx], &dist_uvs[uv_tx],
+ &skip_uvs[uv_tx], &mode_uv[uv_tx]);
+#if CONFIG_PALETTE
+ if (try_palette) pmi_uv[uv_tx] = *pmi;
+#endif // CONFIG_PALETTE
+
+#if CONFIG_EXT_INTRA
+ uv_angle_delta[uv_tx] = mbmi->angle_delta[1];
+#endif // CONFIG_EXT_INTRA
+#if CONFIG_FILTER_INTRA
+ filter_intra_mode_info_uv[uv_tx] = mbmi->filter_intra_mode_info;
+#endif // CONFIG_FILTER_INTRA
+ }
+
+ rate_uv = rate_uv_tokenonly[uv_tx];
+ distortion_uv = dist_uvs[uv_tx];
+ skippable = skippable && skip_uvs[uv_tx];
+ mbmi->uv_mode = mode_uv[uv_tx];
+#if CONFIG_PALETTE
+ if (try_palette) {
+ pmi->palette_size[1] = pmi_uv[uv_tx].palette_size[1];
+ memcpy(pmi->palette_colors + PALETTE_MAX_SIZE,
+ pmi_uv[uv_tx].palette_colors + PALETTE_MAX_SIZE,
+ 2 * PALETTE_MAX_SIZE * sizeof(pmi->palette_colors[0]));
+ }
+#endif // CONFIG_PALETTE
+
+#if CONFIG_EXT_INTRA
+ mbmi->angle_delta[1] = uv_angle_delta[uv_tx];
+#endif // CONFIG_EXT_INTRA
+#if CONFIG_FILTER_INTRA
+ mbmi->filter_intra_mode_info.use_filter_intra_mode[1] =
+ filter_intra_mode_info_uv[uv_tx].use_filter_intra_mode[1];
+ if (filter_intra_mode_info_uv[uv_tx].use_filter_intra_mode[1]) {
+ mbmi->filter_intra_mode_info.filter_intra_mode[1] =
+ filter_intra_mode_info_uv[uv_tx].filter_intra_mode[1];
+ }
+#endif // CONFIG_FILTER_INTRA
+
+#if CONFIG_CB4X4
+ rate2 = rate_y + intra_mode_cost[mbmi->mode];
+ if (!x->skip_chroma_rd)
+ rate2 += rate_uv + cpi->intra_uv_mode_cost[mbmi->mode][mbmi->uv_mode];
+#else
+ rate2 = rate_y + intra_mode_cost[mbmi->mode] + rate_uv +
+ cpi->intra_uv_mode_cost[mbmi->mode][mbmi->uv_mode];
+#endif // CONFIG_CB4X4
+
+#if CONFIG_PALETTE
+ if (try_palette && mbmi->mode == DC_PRED) {
+ rate2 += av1_cost_bit(
+ av1_default_palette_y_mode_prob[bsize - BLOCK_8X8][palette_ctx], 0);
+ }
+#endif // CONFIG_PALETTE
+
+ if (!xd->lossless[mbmi->segment_id] && bsize >= BLOCK_8X8) {
+ // super_block_yrd above includes the cost of the tx_size in the
+ // tokenonly rate, but for intra blocks, tx_size is always coded
+ // (prediction granularity), so we account for it in the full rate,
+ // not the tokenonly rate.
+ rate_y -= tx_size_cost(cpi, x, bsize, mbmi->tx_size);
+ }
+#if CONFIG_EXT_INTRA
+ if (is_directional_mode) {
+#if CONFIG_INTRA_INTERP
+ const int intra_filter_ctx = av1_get_pred_context_intra_interp(xd);
+ const int p_angle =
+ mode_to_angle_map[mbmi->mode] + mbmi->angle_delta[0] * ANGLE_STEP;
+ if (av1_is_intra_filter_switchable(p_angle))
+ rate2 += cpi->intra_filter_cost[intra_filter_ctx][mbmi->intra_filter];
+#endif // CONFIG_INTRA_INTERP
+ rate2 += write_uniform_cost(2 * MAX_ANGLE_DELTA + 1,
+ MAX_ANGLE_DELTA + mbmi->angle_delta[0]);
+ }
+ if (mbmi->uv_mode != DC_PRED && mbmi->uv_mode != TM_PRED) {
+ rate2 += write_uniform_cost(2 * MAX_ANGLE_DELTA + 1,
+ MAX_ANGLE_DELTA + mbmi->angle_delta[1]);
+ }
+#endif // CONFIG_EXT_INTRA
+#if CONFIG_FILTER_INTRA
+ if (mbmi->mode == DC_PRED) {
+ rate2 +=
+ av1_cost_bit(cm->fc->filter_intra_probs[0],
+ mbmi->filter_intra_mode_info.use_filter_intra_mode[0]);
+ if (mbmi->filter_intra_mode_info.use_filter_intra_mode[0]) {
+ rate2 += write_uniform_cost(
+ FILTER_INTRA_MODES,
+ mbmi->filter_intra_mode_info.filter_intra_mode[0]);
+ }
+ }
+ if (mbmi->uv_mode == DC_PRED) {
+ rate2 +=
+ av1_cost_bit(cpi->common.fc->filter_intra_probs[1],
+ mbmi->filter_intra_mode_info.use_filter_intra_mode[1]);
+ if (mbmi->filter_intra_mode_info.use_filter_intra_mode[1])
+ rate2 += write_uniform_cost(
+ FILTER_INTRA_MODES,
+ mbmi->filter_intra_mode_info.filter_intra_mode[1]);
+ }
+#endif // CONFIG_FILTER_INTRA
+ if (mbmi->mode != DC_PRED && mbmi->mode != TM_PRED)
+ rate2 += intra_cost_penalty;
+ distortion2 = distortion_y + distortion_uv;
+ } else {
+#if CONFIG_REF_MV
+ int_mv backup_ref_mv[2];
+
+#if !SUB8X8_COMP_REF
+ if (bsize < BLOCK_8X8 && mbmi->ref_frame[1] > INTRA_FRAME) continue;
+#endif // !SUB8X8_COMP_REF
+
+ backup_ref_mv[0] = mbmi_ext->ref_mvs[ref_frame][0];
+ if (comp_pred) backup_ref_mv[1] = mbmi_ext->ref_mvs[second_ref_frame][0];
+#endif // CONFIG_REF_MV
+#if CONFIG_EXT_INTER
+ if (second_ref_frame == INTRA_FRAME) {
+ if (best_single_inter_ref != ref_frame) continue;
+ mbmi->interintra_mode = intra_to_interintra_mode[best_intra_mode];
+// TODO(debargha|geza.lore):
+// Should we use ext_intra modes for interintra?
+#if CONFIG_EXT_INTRA
+ mbmi->angle_delta[0] = 0;
+ mbmi->angle_delta[1] = 0;
+#if CONFIG_INTRA_INTERP
+ mbmi->intra_filter = INTRA_FILTER_LINEAR;
+#endif // CONFIG_INTRA_INTERP
+#endif // CONFIG_EXT_INTRA
+#if CONFIG_FILTER_INTRA
+ mbmi->filter_intra_mode_info.use_filter_intra_mode[0] = 0;
+ mbmi->filter_intra_mode_info.use_filter_intra_mode[1] = 0;
+#endif // CONFIG_FILTER_INTRA
+ }
+#endif // CONFIG_EXT_INTER
+#if CONFIG_REF_MV
+ mbmi->ref_mv_idx = 0;
+ ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
+
+#if CONFIG_EXT_INTER
+ if (comp_pred) {
+ if (mbmi_ext->ref_mv_count[ref_frame_type] > 1) {
+ int ref_mv_idx = 0;
+ // Special case: NEAR_NEWMV and NEW_NEARMV modes use
+ // 1 + mbmi->ref_mv_idx (like NEARMV) instead of
+ // mbmi->ref_mv_idx (like NEWMV)
+ if (mbmi->mode == NEAR_NEWMV || mbmi->mode == NEW_NEARMV)
+ ref_mv_idx = 1;
+
+ if (compound_ref0_mode(mbmi->mode) == NEWMV) {
+ int_mv this_mv =
+ mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].this_mv;
+ clamp_mv_ref(&this_mv.as_mv, xd->n8_w << MI_SIZE_LOG2,
+ xd->n8_h << MI_SIZE_LOG2, xd);
+ mbmi_ext->ref_mvs[mbmi->ref_frame[0]][0] = this_mv;
+ }
+ if (compound_ref1_mode(mbmi->mode) == NEWMV) {
+ int_mv this_mv =
+ mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].comp_mv;
+ clamp_mv_ref(&this_mv.as_mv, xd->n8_w << MI_SIZE_LOG2,
+ xd->n8_h << MI_SIZE_LOG2, xd);
+ mbmi_ext->ref_mvs[mbmi->ref_frame[1]][0] = this_mv;
+ }
+ }
+ } else {
+#endif // CONFIG_EXT_INTER
+ if (mbmi->mode == NEWMV && mbmi_ext->ref_mv_count[ref_frame_type] > 1) {
+ int ref;
+ for (ref = 0; ref < 1 + comp_pred; ++ref) {
+ int_mv this_mv =
+ (ref == 0) ? mbmi_ext->ref_mv_stack[ref_frame_type][0].this_mv
+ : mbmi_ext->ref_mv_stack[ref_frame_type][0].comp_mv;
+ clamp_mv_ref(&this_mv.as_mv, xd->n8_w << MI_SIZE_LOG2,
+ xd->n8_h << MI_SIZE_LOG2, xd);
+ mbmi_ext->ref_mvs[mbmi->ref_frame[ref]][0] = this_mv;
+ }
+ }
+#if CONFIG_EXT_INTER
+ }
+#endif // CONFIG_EXT_INTER
+#endif // CONFIG_REF_MV
+ {
+ RD_STATS rd_stats, rd_stats_y, rd_stats_uv;
+ av1_init_rd_stats(&rd_stats);
+ rd_stats.rate = rate2;
+
+ // Point to variables that are maintained between loop iterations
+ args.single_newmv = single_newmv;
+#if CONFIG_EXT_INTER
+ args.single_newmv_rate = single_newmv_rate;
+ args.compmode_interintra_cost = &compmode_interintra_cost;
+ args.compmode_interinter_cost = &compmode_interinter_cost;
+ args.modelled_rd = modelled_rd;
+#endif // CONFIG_EXT_INTER
+ this_rd = handle_inter_mode(cpi, x, bsize, &rd_stats, &rd_stats_y,
+ &rd_stats_uv, &disable_skip, frame_mv,
+ mi_row, mi_col, &args, best_rd);
+// Prevent pointers from escaping local scope
+#if CONFIG_EXT_INTER
+ args.compmode_interintra_cost = NULL;
+ args.compmode_interinter_cost = NULL;
+#endif // CONFIG_EXT_INTER
+
+ rate2 = rd_stats.rate;
+ skippable = rd_stats.skip;
+ distortion2 = rd_stats.dist;
+ total_sse = rd_stats.sse;
+ rate_y = rd_stats_y.rate;
+ rate_uv = rd_stats_uv.rate;
+ }
+
+#if CONFIG_REF_MV
+// TODO(jingning): This needs some refactoring to improve code quality
+// and reduce redundant steps.
+#if CONFIG_EXT_INTER
+ if ((have_nearmv_in_inter_mode(mbmi->mode) &&
+ mbmi_ext->ref_mv_count[ref_frame_type] > 2) ||
+ ((mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV) &&
+ mbmi_ext->ref_mv_count[ref_frame_type] > 1)) {
+#else
+ if ((mbmi->mode == NEARMV &&
+ mbmi_ext->ref_mv_count[ref_frame_type] > 2) ||
+ (mbmi->mode == NEWMV && mbmi_ext->ref_mv_count[ref_frame_type] > 1)) {
+#endif
+ int_mv backup_mv = frame_mv[NEARMV][ref_frame];
+ MB_MODE_INFO backup_mbmi = *mbmi;
+ int backup_skip = x->skip;
+ int64_t tmp_ref_rd = this_rd;
+ int ref_idx;
+
+// TODO(jingning): This should be deprecated shortly.
+#if CONFIG_EXT_INTER
+ int idx_offset = have_nearmv_in_inter_mode(mbmi->mode) ? 1 : 0;
+#else
+ int idx_offset = (mbmi->mode == NEARMV) ? 1 : 0;
+#endif // CONFIG_EXT_INTER
+ int ref_set =
+ AOMMIN(2, mbmi_ext->ref_mv_count[ref_frame_type] - 1 - idx_offset);
+
+ uint8_t drl_ctx =
+ av1_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type], idx_offset);
+ // Dummy
+ int_mv backup_fmv[2];
+ backup_fmv[0] = frame_mv[NEWMV][ref_frame];
+ if (comp_pred) backup_fmv[1] = frame_mv[NEWMV][second_ref_frame];
+
+ rate2 += (rate2 < INT_MAX ? cpi->drl_mode_cost0[drl_ctx][0] : 0);
+
+ if (this_rd < INT64_MAX) {
+ if (RDCOST(x->rdmult, x->rddiv, rate_y + rate_uv, distortion2) <
+ RDCOST(x->rdmult, x->rddiv, 0, total_sse))
+ tmp_ref_rd =
+ RDCOST(x->rdmult, x->rddiv,
+ rate2 + av1_cost_bit(av1_get_skip_prob(cm, xd), 0),
+ distortion2);
+ else
+ tmp_ref_rd =
+ RDCOST(x->rdmult, x->rddiv,
+ rate2 + av1_cost_bit(av1_get_skip_prob(cm, xd), 1) -
+ rate_y - rate_uv,
+ total_sse);
+ }
+#if CONFIG_VAR_TX
+ for (i = 0; i < MAX_MB_PLANE; ++i)
+ memcpy(x->blk_skip_drl[i], x->blk_skip[i],
+ sizeof(uint8_t) * ctx->num_4x4_blk);
+#endif // CONFIG_VAR_TX
+
+ for (ref_idx = 0; ref_idx < ref_set; ++ref_idx) {
+ int64_t tmp_alt_rd = INT64_MAX;
+ int dummy_disable_skip = 0;
+ int ref;
+ int_mv cur_mv;
+ RD_STATS tmp_rd_stats, tmp_rd_stats_y, tmp_rd_stats_uv;
+#if CONFIG_EXT_INTER
+ int tmp_compmode_interintra_cost = 0;
+ int tmp_compmode_interinter_cost = 0;
+#endif // CONFIG_EXT_INTER
+
+ av1_invalid_rd_stats(&tmp_rd_stats);
+ x->skip = 0;
+
+ mbmi->ref_mv_idx = 1 + ref_idx;
+
+#if CONFIG_EXT_INTER
+ if (comp_pred) {
+ int ref_mv_idx = mbmi->ref_mv_idx;
+ // Special case: NEAR_NEWMV and NEW_NEARMV modes use
+ // 1 + mbmi->ref_mv_idx (like NEARMV) instead of
+ // mbmi->ref_mv_idx (like NEWMV)
+ if (mbmi->mode == NEAR_NEWMV || mbmi->mode == NEW_NEARMV)
+ ref_mv_idx = 1 + mbmi->ref_mv_idx;
+
+ if (compound_ref0_mode(mbmi->mode) == NEWMV) {
+ int_mv this_mv =
+ mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].this_mv;
+ clamp_mv_ref(&this_mv.as_mv, xd->n8_w << MI_SIZE_LOG2,
+ xd->n8_h << MI_SIZE_LOG2, xd);
+ mbmi_ext->ref_mvs[mbmi->ref_frame[0]][0] = this_mv;
+ } else if (compound_ref0_mode(mbmi->mode) == NEARESTMV) {
+ int_mv this_mv =
+ mbmi_ext->ref_mv_stack[ref_frame_type][0].this_mv;
+ clamp_mv_ref(&this_mv.as_mv, xd->n8_w << MI_SIZE_LOG2,
+ xd->n8_h << MI_SIZE_LOG2, xd);
+ mbmi_ext->ref_mvs[mbmi->ref_frame[0]][0] = this_mv;
+ }
+
+ if (compound_ref1_mode(mbmi->mode) == NEWMV) {
+ int_mv this_mv =
+ mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].comp_mv;
+ clamp_mv_ref(&this_mv.as_mv, xd->n8_w << MI_SIZE_LOG2,
+ xd->n8_h << MI_SIZE_LOG2, xd);
+ mbmi_ext->ref_mvs[mbmi->ref_frame[1]][0] = this_mv;
+ } else if (compound_ref1_mode(mbmi->mode) == NEARESTMV) {
+ int_mv this_mv =
+ mbmi_ext->ref_mv_stack[ref_frame_type][0].comp_mv;
+ clamp_mv_ref(&this_mv.as_mv, xd->n8_w << MI_SIZE_LOG2,
+ xd->n8_h << MI_SIZE_LOG2, xd);
+ mbmi_ext->ref_mvs[mbmi->ref_frame[1]][0] = this_mv;
+ }
+ } else {
+#endif // CONFIG_EXT_INTER
+ for (ref = 0; ref < 1 + comp_pred; ++ref) {
+ int_mv this_mv =
+ (ref == 0)
+ ? mbmi_ext->ref_mv_stack[ref_frame_type][mbmi->ref_mv_idx]
+ .this_mv
+ : mbmi_ext->ref_mv_stack[ref_frame_type][mbmi->ref_mv_idx]
+ .comp_mv;
+ clamp_mv_ref(&this_mv.as_mv, xd->n8_w << MI_SIZE_LOG2,
+ xd->n8_h << MI_SIZE_LOG2, xd);
+ mbmi_ext->ref_mvs[mbmi->ref_frame[ref]][0] = this_mv;
+ }
+#if CONFIG_EXT_INTER
+ }
+#endif
+
+ cur_mv =
+ mbmi_ext->ref_mv_stack[ref_frame][mbmi->ref_mv_idx + idx_offset]
+ .this_mv;
+ clamp_mv2(&cur_mv.as_mv, xd);
+
+ if (!mv_check_bounds(&x->mv_limits, &cur_mv.as_mv)) {
+ int_mv dummy_single_newmv[TOTAL_REFS_PER_FRAME] = { { 0 } };
+#if CONFIG_EXT_INTER
+ int dummy_single_newmv_rate[TOTAL_REFS_PER_FRAME] = { 0 };
+#endif // CONFIG_EXT_INTER
+
+ frame_mv[NEARMV][ref_frame] = cur_mv;
+ av1_init_rd_stats(&tmp_rd_stats);
+
+ // Point to variables that are not maintained between iterations
+ args.single_newmv = dummy_single_newmv;
+#if CONFIG_EXT_INTER
+ args.single_newmv_rate = dummy_single_newmv_rate;
+ args.compmode_interintra_cost = &tmp_compmode_interintra_cost;
+ args.compmode_interinter_cost = &tmp_compmode_interinter_cost;
+ args.modelled_rd = NULL;
+#endif // CONFIG_EXT_INTER
+ tmp_alt_rd = handle_inter_mode(
+ cpi, x, bsize, &tmp_rd_stats, &tmp_rd_stats_y, &tmp_rd_stats_uv,
+ &dummy_disable_skip, frame_mv, mi_row, mi_col, &args, best_rd);
+ // Prevent pointers from escaping local scope
+ args.single_newmv = NULL;
+#if CONFIG_EXT_INTER
+ args.single_newmv_rate = NULL;
+ args.compmode_interintra_cost = NULL;
+ args.compmode_interinter_cost = NULL;
+#endif // CONFIG_EXT_INTER
+ }
+
+ for (i = 0; i < mbmi->ref_mv_idx; ++i) {
+ uint8_t drl1_ctx = 0;
+ drl1_ctx = av1_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type],
+ i + idx_offset);
+ tmp_rd_stats.rate +=
+ (tmp_rd_stats.rate < INT_MAX ? cpi->drl_mode_cost0[drl1_ctx][1]
+ : 0);
+ }
+
+ if (mbmi_ext->ref_mv_count[ref_frame_type] >
+ mbmi->ref_mv_idx + idx_offset + 1 &&
+ ref_idx < ref_set - 1) {
+ uint8_t drl1_ctx =
+ av1_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type],
+ mbmi->ref_mv_idx + idx_offset);
+ tmp_rd_stats.rate +=
+ (tmp_rd_stats.rate < INT_MAX ? cpi->drl_mode_cost0[drl1_ctx][0]
+ : 0);
+ }
+
+ if (tmp_alt_rd < INT64_MAX) {
+#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+ tmp_alt_rd = RDCOST(x->rdmult, x->rddiv, tmp_rd_stats.rate,
+ tmp_rd_stats.dist);
+#else
+ if (RDCOST(x->rdmult, x->rddiv,
+ tmp_rd_stats_y.rate + tmp_rd_stats_uv.rate,
+ tmp_rd_stats.dist) <
+ RDCOST(x->rdmult, x->rddiv, 0, tmp_rd_stats.sse))
+ tmp_alt_rd =
+ RDCOST(x->rdmult, x->rddiv,
+ tmp_rd_stats.rate +
+ av1_cost_bit(av1_get_skip_prob(cm, xd), 0),
+ tmp_rd_stats.dist);
+ else
+ tmp_alt_rd =
+ RDCOST(x->rdmult, x->rddiv,
+ tmp_rd_stats.rate +
+ av1_cost_bit(av1_get_skip_prob(cm, xd), 1) -
+ tmp_rd_stats_y.rate - tmp_rd_stats_uv.rate,
+ tmp_rd_stats.sse);
+#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+ }
+
+ if (tmp_ref_rd > tmp_alt_rd) {
+ rate2 = tmp_rd_stats.rate;
+ disable_skip = dummy_disable_skip;
+ distortion2 = tmp_rd_stats.dist;
+ skippable = tmp_rd_stats.skip;
+ rate_y = tmp_rd_stats_y.rate;
+ rate_uv = tmp_rd_stats_uv.rate;
+ total_sse = tmp_rd_stats.sse;
+ this_rd = tmp_alt_rd;
+ tmp_ref_rd = tmp_alt_rd;
+ backup_mbmi = *mbmi;
+ backup_skip = x->skip;
+#if CONFIG_VAR_TX
+ for (i = 0; i < MAX_MB_PLANE; ++i)
+ memcpy(x->blk_skip_drl[i], x->blk_skip[i],
+ sizeof(uint8_t) * ctx->num_4x4_blk);
+#endif // CONFIG_VAR_TX
+#if CONFIG_EXT_INTER
+ compmode_interintra_cost = tmp_compmode_interintra_cost;
+ compmode_interinter_cost = tmp_compmode_interinter_cost;
+#endif // CONFIG_EXT_INTER
+ } else {
+ *mbmi = backup_mbmi;
+ x->skip = backup_skip;
+ }
+ }
+
+ frame_mv[NEARMV][ref_frame] = backup_mv;
+ frame_mv[NEWMV][ref_frame] = backup_fmv[0];
+ if (comp_pred) frame_mv[NEWMV][second_ref_frame] = backup_fmv[1];
+#if CONFIG_VAR_TX
+ for (i = 0; i < MAX_MB_PLANE; ++i)
+ memcpy(x->blk_skip[i], x->blk_skip_drl[i],
+ sizeof(uint8_t) * ctx->num_4x4_blk);
+#endif // CONFIG_VAR_TX
+ }
+ mbmi_ext->ref_mvs[ref_frame][0] = backup_ref_mv[0];
+ if (comp_pred) mbmi_ext->ref_mvs[second_ref_frame][0] = backup_ref_mv[1];
+#endif // CONFIG_REF_MV
+
+ if (this_rd == INT64_MAX) continue;
+
+#if SUB8X8_COMP_REF
+ compmode_cost = av1_cost_bit(comp_mode_p, comp_pred);
+#else
+ if (mbmi->sb_type >= BLOCK_8X8)
+ compmode_cost = av1_cost_bit(comp_mode_p, comp_pred);
+#endif // SUB8X8_COMP_REF
+
+ if (cm->reference_mode == REFERENCE_MODE_SELECT) rate2 += compmode_cost;
+ }
+
+#if CONFIG_EXT_INTER
+ rate2 += compmode_interintra_cost;
+ if (cm->reference_mode != SINGLE_REFERENCE && comp_pred)
+#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+ if (mbmi->motion_mode == SIMPLE_TRANSLATION)
+#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+ rate2 += compmode_interinter_cost;
+#endif // CONFIG_EXT_INTER
+
+ // Estimate the reference frame signaling cost and add it
+ // to the rolling cost variable.
+ if (comp_pred) {
+ rate2 += ref_costs_comp[ref_frame];
+#if CONFIG_EXT_REFS
+ rate2 += ref_costs_comp[second_ref_frame];
+#endif // CONFIG_EXT_REFS
+ } else {
+ rate2 += ref_costs_single[ref_frame];
+ }
+
+#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+ if (ref_frame == INTRA_FRAME) {
+#else
+ if (!disable_skip) {
+#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+ if (skippable) {
+ // Back out the coefficient coding costs
+ rate2 -= (rate_y + rate_uv);
+ rate_y = 0;
+ rate_uv = 0;
+ // Cost the skip mb case
+ rate2 += av1_cost_bit(av1_get_skip_prob(cm, xd), 1);
+ } else if (ref_frame != INTRA_FRAME && !xd->lossless[mbmi->segment_id]) {
+#if CONFIG_REF_MV
+ if (RDCOST(x->rdmult, x->rddiv, rate_y + rate_uv + rate_skip0,
+ distortion2) <
+ RDCOST(x->rdmult, x->rddiv, rate_skip1, total_sse)) {
+#else
+ if (RDCOST(x->rdmult, x->rddiv, rate_y + rate_uv, distortion2) <
+ RDCOST(x->rdmult, x->rddiv, 0, total_sse)) {
+#endif // CONFIG_REF_MV
+ // Add in the cost of the no skip flag.
+ rate2 += av1_cost_bit(av1_get_skip_prob(cm, xd), 0);
+ } else {
+ // FIXME(rbultje) make this work for splitmv also
+ rate2 += av1_cost_bit(av1_get_skip_prob(cm, xd), 1);
+ distortion2 = total_sse;
+ assert(total_sse >= 0);
+ rate2 -= (rate_y + rate_uv);
+ this_skip2 = 1;
+ rate_y = 0;
+ rate_uv = 0;
+ }
+ } else {
+ // Add in the cost of the no skip flag.
+ rate2 += av1_cost_bit(av1_get_skip_prob(cm, xd), 0);
+ }
+
+ // Calculate the final RD estimate for this mode.
+ this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
+#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+ } else {
+ this_skip2 = mbmi->skip;
+ this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
+ if (this_skip2) {
+ rate_y = 0;
+ rate_uv = 0;
+ }
+#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+ }
+
+ if (ref_frame == INTRA_FRAME) {
+ // Keep record of best intra rd
+ if (this_rd < best_intra_rd) {
+ best_intra_rd = this_rd;
+ best_intra_mode = mbmi->mode;
+ }
+#if CONFIG_EXT_INTER
+ } else if (second_ref_frame == NONE_FRAME) {
+ if (this_rd < best_single_inter_rd) {
+ best_single_inter_rd = this_rd;
+ best_single_inter_ref = mbmi->ref_frame[0];
+ }
+#endif // CONFIG_EXT_INTER
+ }
+
+ if (!disable_skip && ref_frame == INTRA_FRAME) {
+ for (i = 0; i < REFERENCE_MODES; ++i)
+ best_pred_rd[i] = AOMMIN(best_pred_rd[i], this_rd);
+ }
+
+ // Did this mode help.. i.e. is it the new best mode
+ if (this_rd < best_rd || x->skip) {
+ if (!mode_excluded) {
+ // Note index of best mode so far
+ best_mode_index = mode_index;
+
+ if (ref_frame == INTRA_FRAME) {
+ /* required for left and above block mv */
+ mbmi->mv[0].as_int = 0;
+ } else {
+ best_pred_sse = x->pred_sse[ref_frame];
+ }
+
+ rd_cost->rate = rate2;
+#if CONFIG_SUPERTX
+ if (x->skip)
+ *returnrate_nocoef = rate2;
+ else
+ *returnrate_nocoef = rate2 - rate_y - rate_uv;
+ *returnrate_nocoef -= av1_cost_bit(
+ av1_get_skip_prob(cm, xd), disable_skip || skippable || this_skip2);
+ *returnrate_nocoef -= av1_cost_bit(av1_get_intra_inter_prob(cm, xd),
+ mbmi->ref_frame[0] != INTRA_FRAME);
+#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+#if CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION
+ MODE_INFO *const mi = xd->mi[0];
+ const MOTION_MODE motion_allowed = motion_mode_allowed(
+#if CONFIG_GLOBAL_MOTION && SEPARATE_GLOBAL_MOTION
+ 0, xd->global_motion,
+#endif // CONFIG_GLOBAL_MOTION && SEPARATE_GLOBAL_MOTION
+ mi);
+ if (motion_allowed == WARPED_CAUSAL)
+ *returnrate_nocoef -= cpi->motion_mode_cost[bsize][mbmi->motion_mode];
+ else if (motion_allowed == OBMC_CAUSAL)
+ *returnrate_nocoef -=
+ cpi->motion_mode_cost1[bsize][mbmi->motion_mode];
+#else
+ *returnrate_nocoef -= cpi->motion_mode_cost[bsize][mbmi->motion_mode];
+#endif // CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION
+#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+#endif // CONFIG_SUPERTX
+ rd_cost->dist = distortion2;
+ rd_cost->rdcost = this_rd;
+ best_rd = this_rd;
+ best_mbmode = *mbmi;
+ best_skip2 = this_skip2;
+ best_mode_skippable = skippable;
+ best_rate_y = rate_y + av1_cost_bit(av1_get_skip_prob(cm, xd),
+ this_skip2 || skippable);
+ best_rate_uv = rate_uv;
+
+#if CONFIG_VAR_TX
+ for (i = 0; i < MAX_MB_PLANE; ++i)
+ memcpy(ctx->blk_skip[i], x->blk_skip[i],
+ sizeof(uint8_t) * ctx->num_4x4_blk);
+#endif // CONFIG_VAR_TX
+ }
+ }
+
+ /* keep record of best compound/single-only prediction */
+ if (!disable_skip && ref_frame != INTRA_FRAME) {
+ int64_t single_rd, hybrid_rd, single_rate, hybrid_rate;
+
+ if (cm->reference_mode == REFERENCE_MODE_SELECT) {
+ single_rate = rate2 - compmode_cost;
+ hybrid_rate = rate2;
+ } else {
+ single_rate = rate2;
+ hybrid_rate = rate2 + compmode_cost;
+ }
+
+ single_rd = RDCOST(x->rdmult, x->rddiv, single_rate, distortion2);
+ hybrid_rd = RDCOST(x->rdmult, x->rddiv, hybrid_rate, distortion2);
+
+ if (!comp_pred) {
+ if (single_rd < best_pred_rd[SINGLE_REFERENCE])
+ best_pred_rd[SINGLE_REFERENCE] = single_rd;
+ } else {
+ if (single_rd < best_pred_rd[COMPOUND_REFERENCE])
+ best_pred_rd[COMPOUND_REFERENCE] = single_rd;
+ }
+ if (hybrid_rd < best_pred_rd[REFERENCE_MODE_SELECT])
+ best_pred_rd[REFERENCE_MODE_SELECT] = hybrid_rd;
+ }
+
+ if (x->skip && !comp_pred) break;
+ }
+
+ if (xd->lossless[mbmi->segment_id] == 0 && best_mode_index >= 0 &&
+ ((sf->tx_type_search.fast_inter_tx_type_search == 1 &&
+ is_inter_mode(best_mbmode.mode)) ||
+ (sf->tx_type_search.fast_intra_tx_type_search == 1 &&
+ !is_inter_mode(best_mbmode.mode)))) {
+ int skip_blk = 0;
+ RD_STATS rd_stats_y, rd_stats_uv;
+
+ x->use_default_inter_tx_type = 0;
+ x->use_default_intra_tx_type = 0;
+
+ *mbmi = best_mbmode;
+
+ set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+
+ // Select prediction reference frames.
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ xd->plane[i].pre[0] = yv12_mb[mbmi->ref_frame[0]][i];
+ if (has_second_ref(mbmi))
+ xd->plane[i].pre[1] = yv12_mb[mbmi->ref_frame[1]][i];
+ }
+
+ if (is_inter_mode(mbmi->mode)) {
+ av1_build_inter_predictors_sb(xd, mi_row, mi_col, NULL, bsize);
+#if CONFIG_MOTION_VAR
+ if (mbmi->motion_mode == OBMC_CAUSAL) {
+ av1_build_obmc_inter_prediction(
+ cm, xd, mi_row, mi_col, args.above_pred_buf, args.above_pred_stride,
+ args.left_pred_buf, args.left_pred_stride);
+ }
+#endif // CONFIG_MOTION_VAR
+ av1_subtract_plane(x, bsize, 0);
+#if CONFIG_VAR_TX
+ if (cm->tx_mode == TX_MODE_SELECT || xd->lossless[mbmi->segment_id]) {
+ select_tx_type_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX);
+ } else {
+ int idx, idy;
+ super_block_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX);
+ for (idy = 0; idy < xd->n8_h; ++idy)
+ for (idx = 0; idx < xd->n8_w; ++idx)
+ mbmi->inter_tx_size[idy][idx] = mbmi->tx_size;
+ memset(x->blk_skip[0], rd_stats_y.skip,
+ sizeof(uint8_t) * xd->n8_h * xd->n8_w * 4);
+ }
+
+ inter_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX);
+#else
+ super_block_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX);
+ super_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX);
+#endif // CONFIG_VAR_TX
+ } else {
+ super_block_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX);
+ super_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX);
+ }
+
+ if (RDCOST(x->rdmult, x->rddiv, rd_stats_y.rate + rd_stats_uv.rate,
+ (rd_stats_y.dist + rd_stats_uv.dist)) >
+ RDCOST(x->rdmult, x->rddiv, 0, (rd_stats_y.sse + rd_stats_uv.sse))) {
+ skip_blk = 1;
+ rd_stats_y.rate = av1_cost_bit(av1_get_skip_prob(cm, xd), 1);
+ rd_stats_uv.rate = 0;
+ rd_stats_y.dist = rd_stats_y.sse;
+ rd_stats_uv.dist = rd_stats_uv.sse;
+ } else {
+ skip_blk = 0;
+ rd_stats_y.rate += av1_cost_bit(av1_get_skip_prob(cm, xd), 0);
+ }
+
+ if (RDCOST(x->rdmult, x->rddiv, best_rate_y + best_rate_uv, rd_cost->dist) >
+ RDCOST(x->rdmult, x->rddiv, rd_stats_y.rate + rd_stats_uv.rate,
+ (rd_stats_y.dist + rd_stats_uv.dist))) {
+#if CONFIG_VAR_TX
+ int idx, idy;
+#endif // CONFIG_VAR_TX
+ best_mbmode.tx_type = mbmi->tx_type;
+ best_mbmode.tx_size = mbmi->tx_size;
+#if CONFIG_VAR_TX
+ for (idy = 0; idy < xd->n8_h; ++idy)
+ for (idx = 0; idx < xd->n8_w; ++idx)
+ best_mbmode.inter_tx_size[idy][idx] = mbmi->inter_tx_size[idy][idx];
+
+ for (i = 0; i < MAX_MB_PLANE; ++i)
+ memcpy(ctx->blk_skip[i], x->blk_skip[i],
+ sizeof(uint8_t) * ctx->num_4x4_blk);
+
+ best_mbmode.min_tx_size = mbmi->min_tx_size;
+#endif // CONFIG_VAR_TX
+ rd_cost->rate +=
+ (rd_stats_y.rate + rd_stats_uv.rate - best_rate_y - best_rate_uv);
+ rd_cost->dist = rd_stats_y.dist + rd_stats_uv.dist;
+ rd_cost->rdcost =
+ RDCOST(x->rdmult, x->rddiv, rd_cost->rate, rd_cost->dist);
+ best_skip2 = skip_blk;
+ }
+ }
+
+#if CONFIG_PALETTE
+ // Only try palette mode when the best mode so far is an intra mode.
+ if (try_palette && !is_inter_mode(best_mbmode.mode)) {
+ int rate2 = 0;
+#if CONFIG_SUPERTX
+ int best_rate_nocoef;
+#endif // CONFIG_SUPERTX
+ int64_t distortion2 = 0, best_rd_palette = best_rd, this_rd,
+ best_model_rd_palette = INT64_MAX;
+ int skippable = 0, rate_overhead_palette = 0;
+ RD_STATS rd_stats_y;
+ TX_SIZE uv_tx;
+ uint8_t *const best_palette_color_map =
+ x->palette_buffer->best_palette_color_map;
+ uint8_t *const color_map = xd->plane[0].color_index_map;
+ MB_MODE_INFO best_mbmi_palette = best_mbmode;
+
+ mbmi->mode = DC_PRED;
+ mbmi->uv_mode = DC_PRED;
+ mbmi->ref_frame[0] = INTRA_FRAME;
+ mbmi->ref_frame[1] = NONE_FRAME;
+ rate_overhead_palette = rd_pick_palette_intra_sby(
+ cpi, x, bsize, palette_ctx, intra_mode_cost[DC_PRED],
+ &best_mbmi_palette, best_palette_color_map, &best_rd_palette,
+ &best_model_rd_palette, NULL, NULL, NULL, NULL);
+ if (pmi->palette_size[0] == 0) goto PALETTE_EXIT;
+ memcpy(color_map, best_palette_color_map,
+ rows * cols * sizeof(best_palette_color_map[0]));
+ super_block_yrd(cpi, x, &rd_stats_y, bsize, best_rd);
+ if (rd_stats_y.rate == INT_MAX) goto PALETTE_EXIT;
+ uv_tx = uv_txsize_lookup[bsize][mbmi->tx_size][xd->plane[1].subsampling_x]
+ [xd->plane[1].subsampling_y];
+ if (rate_uv_intra[uv_tx] == INT_MAX) {
+ choose_intra_uv_mode(cpi, x, ctx, bsize, uv_tx, &rate_uv_intra[uv_tx],
+ &rate_uv_tokenonly[uv_tx], &dist_uvs[uv_tx],
+ &skip_uvs[uv_tx], &mode_uv[uv_tx]);
+ pmi_uv[uv_tx] = *pmi;
+#if CONFIG_EXT_INTRA
+ uv_angle_delta[uv_tx] = mbmi->angle_delta[1];
+#endif // CONFIG_EXT_INTRA
+#if CONFIG_FILTER_INTRA
+ filter_intra_mode_info_uv[uv_tx] = mbmi->filter_intra_mode_info;
+#endif // CONFIG_FILTER_INTRA
+ }
+ mbmi->uv_mode = mode_uv[uv_tx];
+ pmi->palette_size[1] = pmi_uv[uv_tx].palette_size[1];
+ if (pmi->palette_size[1] > 0) {
+ memcpy(pmi->palette_colors + PALETTE_MAX_SIZE,
+ pmi_uv[uv_tx].palette_colors + PALETTE_MAX_SIZE,
+ 2 * PALETTE_MAX_SIZE * sizeof(pmi->palette_colors[0]));
+ }
+#if CONFIG_EXT_INTRA
+ mbmi->angle_delta[1] = uv_angle_delta[uv_tx];
+#endif // CONFIG_EXT_INTRA
+#if CONFIG_FILTER_INTRA
+ mbmi->filter_intra_mode_info.use_filter_intra_mode[1] =
+ filter_intra_mode_info_uv[uv_tx].use_filter_intra_mode[1];
+ if (filter_intra_mode_info_uv[uv_tx].use_filter_intra_mode[1]) {
+ mbmi->filter_intra_mode_info.filter_intra_mode[1] =
+ filter_intra_mode_info_uv[uv_tx].filter_intra_mode[1];
+ }
+#endif // CONFIG_FILTER_INTRA
+ skippable = rd_stats_y.skip && skip_uvs[uv_tx];
+ distortion2 = rd_stats_y.dist + dist_uvs[uv_tx];
+ rate2 = rd_stats_y.rate + rate_overhead_palette + rate_uv_intra[uv_tx];
+ rate2 += ref_costs_single[INTRA_FRAME];
+
+ if (skippable) {
+ rate2 -= (rd_stats_y.rate + rate_uv_tokenonly[uv_tx]);
+#if CONFIG_SUPERTX
+ best_rate_nocoef = rate2;
+#endif // CONFIG_SUPERTX
+ rate2 += av1_cost_bit(av1_get_skip_prob(cm, xd), 1);
+ } else {
+#if CONFIG_SUPERTX
+ best_rate_nocoef = rate2 - (rd_stats_y.rate + rate_uv_tokenonly[uv_tx]);
+#endif // CONFIG_SUPERTX
+ rate2 += av1_cost_bit(av1_get_skip_prob(cm, xd), 0);
+ }
+ this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
+ if (this_rd < best_rd) {
+ best_mode_index = 3;
+ mbmi->mv[0].as_int = 0;
+ rd_cost->rate = rate2;
+#if CONFIG_SUPERTX
+ *returnrate_nocoef = best_rate_nocoef;
+#endif // CONFIG_SUPERTX
+ rd_cost->dist = distortion2;
+ rd_cost->rdcost = this_rd;
+ best_rd = this_rd;
+ best_mbmode = *mbmi;
+ best_skip2 = 0;
+ best_mode_skippable = skippable;
+ }
+ }
+PALETTE_EXIT:
+#endif // CONFIG_PALETTE
+
+#if CONFIG_FILTER_INTRA
+ // TODO(huisu): filter-intra is turned off in lossless mode for now to
+ // avoid a unit test failure
+ if (!xd->lossless[mbmi->segment_id] &&
+#if CONFIG_PALETTE
+ pmi->palette_size[0] == 0 &&
+#endif // CONFIG_PALETTE
+ !dc_skipped && best_mode_index >= 0 &&
+ best_intra_rd < (best_rd + (best_rd >> 3))) {
+ pick_filter_intra_interframe(
+ cpi, x, ctx, bsize, mi_row, mi_col, rate_uv_intra, rate_uv_tokenonly,
+ dist_uvs, skip_uvs, mode_uv, filter_intra_mode_info_uv,
+#if CONFIG_EXT_INTRA
+ uv_angle_delta,
+#endif // CONFIG_EXT_INTRA
+#if CONFIG_PALETTE
+ pmi_uv, palette_ctx,
+#endif // CONFIG_PALETTE
+ 0, ref_costs_single, &best_rd, &best_intra_rd, &best_intra_mode,
+ &best_mode_index, &best_skip2, &best_mode_skippable,
+#if CONFIG_SUPERTX
+ returnrate_nocoef,
+#endif // CONFIG_SUPERTX
+ best_pred_rd, &best_mbmode, rd_cost);
+ }
+#endif // CONFIG_FILTER_INTRA
+
+ // The inter modes' rate costs are not calculated precisely in some cases.
+ // Therefore, sometimes, NEWMV is chosen instead of NEARESTMV, NEARMV, and
+ // ZEROMV. Here, checks are added for those cases, and the mode decisions
+ // are corrected.
+ if (best_mbmode.mode == NEWMV
+#if CONFIG_EXT_INTER
+ || best_mbmode.mode == NEW_NEWMV
+#endif // CONFIG_EXT_INTER
+ ) {
+ const MV_REFERENCE_FRAME refs[2] = { best_mbmode.ref_frame[0],
+ best_mbmode.ref_frame[1] };
+ int comp_pred_mode = refs[1] > INTRA_FRAME;
+ int_mv zeromv[2];
+#if CONFIG_REF_MV
+ const uint8_t rf_type = av1_ref_frame_type(best_mbmode.ref_frame);
+#endif // CONFIG_REF_MV
+#if CONFIG_GLOBAL_MOTION
+ zeromv[0].as_int = gm_get_motion_vector(&cm->global_motion[refs[0]],
+ cm->allow_high_precision_mv, bsize,
+ mi_col, mi_row, 0)
+ .as_int;
+ zeromv[1].as_int = comp_pred_mode
+ ? gm_get_motion_vector(&cm->global_motion[refs[1]],
+ cm->allow_high_precision_mv,
+ bsize, mi_col, mi_row, 0)
+ .as_int
+ : 0;
+#else
+ zeromv[0].as_int = 0;
+ zeromv[1].as_int = 0;
+#endif // CONFIG_GLOBAL_MOTION
+#if CONFIG_REF_MV
+ if (!comp_pred_mode) {
+ int ref_set = (mbmi_ext->ref_mv_count[rf_type] >= 2)
+ ? AOMMIN(2, mbmi_ext->ref_mv_count[rf_type] - 2)
+ : INT_MAX;
+
+ for (i = 0; i <= ref_set && ref_set != INT_MAX; ++i) {
+ int_mv cur_mv = mbmi_ext->ref_mv_stack[rf_type][i + 1].this_mv;
+ if (cur_mv.as_int == best_mbmode.mv[0].as_int) {
+ best_mbmode.mode = NEARMV;
+ best_mbmode.ref_mv_idx = i;
+ }
+ }
+
+ if (frame_mv[NEARESTMV][refs[0]].as_int == best_mbmode.mv[0].as_int)
+ best_mbmode.mode = NEARESTMV;
+ else if (best_mbmode.mv[0].as_int == zeromv[0].as_int)
+ best_mbmode.mode = ZEROMV;
+ } else {
+ int_mv nearestmv[2];
+ int_mv nearmv[2];
+
+#if CONFIG_EXT_INTER
+ if (mbmi_ext->ref_mv_count[rf_type] > 1) {
+ nearmv[0] = mbmi_ext->ref_mv_stack[rf_type][1].this_mv;
+ nearmv[1] = mbmi_ext->ref_mv_stack[rf_type][1].comp_mv;
+ } else {
+ nearmv[0] = frame_mv[NEARMV][refs[0]];
+ nearmv[1] = frame_mv[NEARMV][refs[1]];
+ }
+#else
+ int ref_set = (mbmi_ext->ref_mv_count[rf_type] >= 2)
+ ? AOMMIN(2, mbmi_ext->ref_mv_count[rf_type] - 2)
+ : INT_MAX;
+
+ for (i = 0; i <= ref_set && ref_set != INT_MAX; ++i) {
+ nearmv[0] = mbmi_ext->ref_mv_stack[rf_type][i + 1].this_mv;
+ nearmv[1] = mbmi_ext->ref_mv_stack[rf_type][i + 1].comp_mv;
+
+ if (nearmv[0].as_int == best_mbmode.mv[0].as_int &&
+ nearmv[1].as_int == best_mbmode.mv[1].as_int) {
+ best_mbmode.mode = NEARMV;
+ best_mbmode.ref_mv_idx = i;
+ }
+ }
+#endif // CONFIG_EXT_INTER
+ if (mbmi_ext->ref_mv_count[rf_type] >= 1) {
+ nearestmv[0] = mbmi_ext->ref_mv_stack[rf_type][0].this_mv;
+ nearestmv[1] = mbmi_ext->ref_mv_stack[rf_type][0].comp_mv;
+ } else {
+ nearestmv[0] = frame_mv[NEARESTMV][refs[0]];
+ nearestmv[1] = frame_mv[NEARESTMV][refs[1]];
+ }
+
+ if (nearestmv[0].as_int == best_mbmode.mv[0].as_int &&
+ nearestmv[1].as_int == best_mbmode.mv[1].as_int) {
+#if CONFIG_EXT_INTER
+ best_mbmode.mode = NEAREST_NEARESTMV;
+ } else {
+ int ref_set = (mbmi_ext->ref_mv_count[rf_type] >= 2)
+ ? AOMMIN(2, mbmi_ext->ref_mv_count[rf_type] - 2)
+ : INT_MAX;
+
+ for (i = 0; i <= ref_set && ref_set != INT_MAX; ++i) {
+ nearmv[0] = mbmi_ext->ref_mv_stack[rf_type][i + 1].this_mv;
+ nearmv[1] = mbmi_ext->ref_mv_stack[rf_type][i + 1].comp_mv;
+
+ // Try switching to the NEAR_NEAREST type modes first
+ if (nearestmv[0].as_int == best_mbmode.mv[0].as_int &&
+ nearmv[1].as_int == best_mbmode.mv[1].as_int) {
+ best_mbmode.mode = NEAREST_NEARMV;
+ best_mbmode.ref_mv_idx = i;
+ } else if (nearmv[0].as_int == best_mbmode.mv[0].as_int &&
+ nearestmv[1].as_int == best_mbmode.mv[1].as_int) {
+ best_mbmode.mode = NEAR_NEARESTMV;
+ best_mbmode.ref_mv_idx = i;
+ } else if (nearmv[0].as_int == best_mbmode.mv[0].as_int &&
+ nearmv[1].as_int == best_mbmode.mv[1].as_int) {
+ best_mbmode.mode = NEAR_NEARMV;
+ best_mbmode.ref_mv_idx = i;
+ }
+ }
+
+ if (best_mbmode.mode == NEW_NEWMV &&
+ best_mbmode.mv[0].as_int == zeromv[0].as_int &&
+ best_mbmode.mv[1].as_int == zeromv[1].as_int)
+ best_mbmode.mode = ZERO_ZEROMV;
+ }
+#else
+ best_mbmode.mode = NEARESTMV;
+ } else if (best_mbmode.mv[0].as_int == zeromv[0].as_int &&
+ best_mbmode.mv[1].as_int == zeromv[1].as_int) {
+ best_mbmode.mode = ZEROMV;
+ }
+#endif // CONFIG_EXT_INTER
+ }
+#else
+#if CONFIG_EXT_INTER
+ if (!comp_pred_mode) {
+#endif // CONFIG_EXT_INTER
+ if (frame_mv[NEARESTMV][refs[0]].as_int == best_mbmode.mv[0].as_int &&
+ ((comp_pred_mode &&
+ frame_mv[NEARESTMV][refs[1]].as_int == best_mbmode.mv[1].as_int) ||
+ !comp_pred_mode))
+ best_mbmode.mode = NEARESTMV;
+ else if (frame_mv[NEARMV][refs[0]].as_int == best_mbmode.mv[0].as_int &&
+ ((comp_pred_mode &&
+ frame_mv[NEARMV][refs[1]].as_int ==
+ best_mbmode.mv[1].as_int) ||
+ !comp_pred_mode))
+ best_mbmode.mode = NEARMV;
+ else if (best_mbmode.mv[0].as_int == zeromv[0].as_int &&
+ ((comp_pred_mode &&
+ best_mbmode.mv[1].as_int == zeromv[1].as_int) ||
+ !comp_pred_mode))
+ best_mbmode.mode = ZEROMV;
+#if CONFIG_EXT_INTER
+ } else {
+#if CONFIG_GLOBAL_MOTION
+ zeromv[0].as_int = gm_get_motion_vector(&cm->global_motion[refs[0]],
+ cm->allow_high_precision_mv,
+ bsize, mi_col, mi_row, 0)
+ .as_int;
+ zeromv[1].as_int = comp_pred_mode
+ ? gm_get_motion_vector(&cm->global_motion[refs[1]],
+ cm->allow_high_precision_mv,
+ bsize, mi_col, mi_row, 0)
+ .as_int
+ : 0;
+#else
+ zeromv[0].as_int = 0;
+ zeromv[1].as_int = 0;
+#endif // CONFIG_GLOBAL_MOTION
+ if (frame_mv[NEAREST_NEARESTMV][refs[0]].as_int ==
+ best_mbmode.mv[0].as_int &&
+ frame_mv[NEAREST_NEARESTMV][refs[1]].as_int ==
+ best_mbmode.mv[1].as_int)
+ best_mbmode.mode = NEAREST_NEARESTMV;
+ else if (frame_mv[NEAREST_NEARMV][refs[0]].as_int ==
+ best_mbmode.mv[0].as_int &&
+ frame_mv[NEAREST_NEARMV][refs[1]].as_int ==
+ best_mbmode.mv[1].as_int)
+ best_mbmode.mode = NEAREST_NEARMV;
+ else if (frame_mv[NEAR_NEARESTMV][refs[0]].as_int ==
+ best_mbmode.mv[0].as_int &&
+ frame_mv[NEAR_NEARESTMV][refs[1]].as_int ==
+ best_mbmode.mv[1].as_int)
+ best_mbmode.mode = NEAR_NEARESTMV;
+ else if (frame_mv[NEAR_NEARMV][refs[0]].as_int ==
+ best_mbmode.mv[0].as_int &&
+ frame_mv[NEAR_NEARMV][refs[1]].as_int ==
+ best_mbmode.mv[1].as_int)
+ best_mbmode.mode = NEAR_NEARMV;
+ else if (best_mbmode.mv[0].as_int == zeromv[0].as_int &&
+ best_mbmode.mv[1].as_int == zeromv[1].as_int)
+ best_mbmode.mode = ZERO_ZEROMV;
+ }
+#endif // CONFIG_EXT_INTER
+#endif // CONFIG_REF_MV
+ }
+
+#if CONFIG_REF_MV
+ // Make sure that the ref_mv_idx is only nonzero when we're
+ // using a mode which can support ref_mv_idx
+ if (best_mbmode.ref_mv_idx != 0 &&
+#if CONFIG_EXT_INTER
+ !(best_mbmode.mode == NEWMV || best_mbmode.mode == NEW_NEWMV ||
+ have_nearmv_in_inter_mode(best_mbmode.mode))) {
+#else
+ !(best_mbmode.mode == NEARMV || best_mbmode.mode == NEWMV)) {
+#endif
+ best_mbmode.ref_mv_idx = 0;
+ }
+
+ {
+ int8_t ref_frame_type = av1_ref_frame_type(best_mbmode.ref_frame);
+ int16_t mode_ctx = mbmi_ext->mode_context[ref_frame_type];
+ if (mode_ctx & (1 << ALL_ZERO_FLAG_OFFSET)) {
+ int_mv zeromv[2];
+#if CONFIG_GLOBAL_MOTION
+ const MV_REFERENCE_FRAME refs[2] = { best_mbmode.ref_frame[0],
+ best_mbmode.ref_frame[1] };
+ zeromv[0].as_int = gm_get_motion_vector(&cm->global_motion[refs[0]],
+ cm->allow_high_precision_mv,
+ bsize, mi_col, mi_row, 0)
+ .as_int;
+ zeromv[1].as_int = (refs[1] != NONE_FRAME)
+ ? gm_get_motion_vector(&cm->global_motion[refs[1]],
+ cm->allow_high_precision_mv,
+ bsize, mi_col, mi_row, 0)
+ .as_int
+ : 0;
+ lower_mv_precision(&zeromv[0].as_mv, cm->allow_high_precision_mv);
+ lower_mv_precision(&zeromv[1].as_mv, cm->allow_high_precision_mv);
+#else
+ zeromv[0].as_int = zeromv[1].as_int = 0;
+#endif // CONFIG_GLOBAL_MOTION
+ if (best_mbmode.ref_frame[0] > INTRA_FRAME &&
+ best_mbmode.mv[0].as_int == zeromv[0].as_int &&
+#if CONFIG_EXT_INTER
+ (best_mbmode.ref_frame[1] <= INTRA_FRAME)
+#else
+ (best_mbmode.ref_frame[1] == NONE_FRAME ||
+ best_mbmode.mv[1].as_int == zeromv[1].as_int)
+#endif // CONFIG_EXT_INTER
+ ) {
+ best_mbmode.mode = ZEROMV;
+ }
+ }
+ }
+#endif // CONFIG_REF_MV
+
+ if (best_mode_index < 0 || best_rd >= best_rd_so_far) {
+ rd_cost->rate = INT_MAX;
+ rd_cost->rdcost = INT64_MAX;
+ return;
+ }
+
+#if CONFIG_DUAL_FILTER
+ assert((cm->interp_filter == SWITCHABLE) ||
+ (cm->interp_filter == best_mbmode.interp_filter[0]) ||
+ !is_inter_block(&best_mbmode));
+ assert((cm->interp_filter == SWITCHABLE) ||
+ (cm->interp_filter == best_mbmode.interp_filter[1]) ||
+ !is_inter_block(&best_mbmode));
+ if (best_mbmode.ref_frame[1] > INTRA_FRAME) {
+ assert((cm->interp_filter == SWITCHABLE) ||
+ (cm->interp_filter == best_mbmode.interp_filter[2]) ||
+ !is_inter_block(&best_mbmode));
+ assert((cm->interp_filter == SWITCHABLE) ||
+ (cm->interp_filter == best_mbmode.interp_filter[3]) ||
+ !is_inter_block(&best_mbmode));
+ }
+#else
+ assert((cm->interp_filter == SWITCHABLE) ||
+ (cm->interp_filter == best_mbmode.interp_filter) ||
+ !is_inter_block(&best_mbmode));
+#endif // CONFIG_DUAL_FILTER
+
+ if (!cpi->rc.is_src_frame_alt_ref)
+ av1_update_rd_thresh_fact(cm, tile_data->thresh_freq_fact,
+ sf->adaptive_rd_thresh, bsize, best_mode_index);
+
+ // macroblock modes
+ *mbmi = best_mbmode;
+ x->skip |= best_skip2;
+
+// Note: this section is needed since the mode may have been forced to
+// ZEROMV by the all-zero mode handling of ref-mv.
+#if CONFIG_GLOBAL_MOTION
+ if (mbmi->mode == ZEROMV
+#if CONFIG_EXT_INTER
+ || mbmi->mode == ZERO_ZEROMV
+#endif // CONFIG_EXT_INTER
+ ) {
+#if CONFIG_WARPED_MOTION || CONFIG_MOTION_VAR
+ // Correct the motion mode for ZEROMV
+ const MOTION_MODE last_motion_mode_allowed = motion_mode_allowed(
+#if SEPARATE_GLOBAL_MOTION
+ 0, xd->global_motion,
+#endif // SEPARATE_GLOBAL_MOTION
+ xd->mi[0]);
+ if (mbmi->motion_mode > last_motion_mode_allowed)
+ mbmi->motion_mode = last_motion_mode_allowed;
+#endif // CONFIG_WARPED_MOTION || CONFIG_MOTION_VAR
+
+ // Correct the interpolation filter for ZEROMV
+ if (is_nontrans_global_motion(xd)) {
+#if CONFIG_DUAL_FILTER
+ mbmi->interp_filter[0] = cm->interp_filter == SWITCHABLE
+ ? EIGHTTAP_REGULAR
+ : cm->interp_filter;
+ mbmi->interp_filter[1] = cm->interp_filter == SWITCHABLE
+ ? EIGHTTAP_REGULAR
+ : cm->interp_filter;
+#else
+ mbmi->interp_filter = cm->interp_filter == SWITCHABLE ? EIGHTTAP_REGULAR
+ : cm->interp_filter;
+#endif // CONFIG_DUAL_FILTER
+ }
+ }
+#endif // CONFIG_GLOBAL_MOTION
+
+#if CONFIG_REF_MV
+ for (i = 0; i < 1 + has_second_ref(mbmi); ++i) {
+ if (mbmi->mode != NEWMV)
+ mbmi->pred_mv[i].as_int = mbmi->mv[i].as_int;
+ else
+ mbmi->pred_mv[i].as_int = mbmi_ext->ref_mvs[mbmi->ref_frame[i]][0].as_int;
+ }
+#endif // CONFIG_REF_MV
+
+ for (i = 0; i < REFERENCE_MODES; ++i) {
+ if (best_pred_rd[i] == INT64_MAX)
+ best_pred_diff[i] = INT_MIN;
+ else
+ best_pred_diff[i] = best_rd - best_pred_rd[i];
+ }
+
+ x->skip |= best_mode_skippable;
+
+ assert(best_mode_index >= 0);
+
+ store_coding_context(x, ctx, best_mode_index, best_pred_diff,
+ best_mode_skippable);
+
+#if CONFIG_PALETTE
+ if (cm->allow_screen_content_tools && pmi->palette_size[1] > 0) {
+ restore_uv_color_map(cpi, x);
+ }
+#endif // CONFIG_PALETTE
+}
+ /*
+  * Mode decision for a block whose segment has SEG_LVL_SKIP active (this is
+  * asserted below). No mode search is run: the block is forced to ZEROMV
+  * from LAST_FRAME with SIMPLE_TRANSLATION, DC_PRED chroma,
+  * max_txsize_lookup[bsize] as transform size and x->skip = 1. Only the
+  * interpolation filter is chosen, by comparing av1_get_switchable_rate().
+  *
+  * The reported rate is: switchable filter rate + the compound-mode bit
+  * (only when cm->reference_mode == REFERENCE_MODE_SELECT) + the single
+  * reference cost of LAST_FRAME. Distortion is fixed at 0.
+  *
+  * Outputs:
+  *   rd_cost  - rate / dist / rdcost of the forced mode. If that rdcost is
+  *              not below best_rd_so_far, rate is set to INT_MAX and rdcost
+  *              to INT64_MAX and the function returns early.
+  *   ctx      - filled through store_coding_context() with THR_ZEROMV
+  *              (not reached on the early return).
+  *   tile_data->thresh_freq_fact is passed to av1_update_rd_thresh_fact()
+  *              (not reached on the early return).
+  */
+void av1_rd_pick_inter_mode_sb_seg_skip(const AV1_COMP *cpi,
+ TileDataEnc *tile_data, MACROBLOCK *x,
+ int mi_row, int mi_col,
+ RD_STATS *rd_cost, BLOCK_SIZE bsize,
+ PICK_MODE_CONTEXT *ctx,
+ int64_t best_rd_so_far) {
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+ unsigned char segment_id = mbmi->segment_id;
+ const int comp_pred = 0;
+ int i;
+ int64_t best_pred_diff[REFERENCE_MODES];
+ unsigned int ref_costs_single[TOTAL_REFS_PER_FRAME];
+ unsigned int ref_costs_comp[TOTAL_REFS_PER_FRAME];
+ aom_prob comp_mode_p;
+ InterpFilter best_filter = SWITCHABLE;
+ int64_t this_rd = INT64_MAX;
+ int rate2 = 0;
+ const int64_t distortion2 = 0;
+ (void)mi_row;
+ (void)mi_col; // mi_row/mi_col are only read inside CONFIG_* sections below
+
+ estimate_ref_frame_costs(cm, xd, segment_id, ref_costs_single, ref_costs_comp,
+ &comp_mode_p);
+
+ for (i = 0; i < TOTAL_REFS_PER_FRAME; ++i) x->pred_sse[i] = INT_MAX;
+ for (i = LAST_FRAME; i < TOTAL_REFS_PER_FRAME; ++i)
+ x->pred_mv_sad[i] = INT_MAX;
+
+ rd_cost->rate = INT_MAX;
+
+ assert(segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP));
+
+#if CONFIG_PALETTE
+ mbmi->palette_mode_info.palette_size[0] = 0;
+ mbmi->palette_mode_info.palette_size[1] = 0;
+#endif // CONFIG_PALETTE
+
+#if CONFIG_FILTER_INTRA
+ mbmi->filter_intra_mode_info.use_filter_intra_mode[0] = 0;
+ mbmi->filter_intra_mode_info.use_filter_intra_mode[1] = 0;
+#endif // CONFIG_FILTER_INTRA
+ mbmi->mode = ZEROMV;
+ mbmi->motion_mode = SIMPLE_TRANSLATION;
+ mbmi->uv_mode = DC_PRED;
+ mbmi->ref_frame[0] = LAST_FRAME;
+ mbmi->ref_frame[1] = NONE_FRAME;
+#if CONFIG_GLOBAL_MOTION
+ mbmi->mv[0].as_int =
+ gm_get_motion_vector(&cm->global_motion[mbmi->ref_frame[0]],
+ cm->allow_high_precision_mv, bsize, mi_col, mi_row,
+ 0)
+ .as_int;
+#else // CONFIG_GLOBAL_MOTION
+ mbmi->mv[0].as_int = 0;
+#endif // CONFIG_GLOBAL_MOTION
+ mbmi->tx_size = max_txsize_lookup[bsize];
+ x->skip = 1;
+
+#if CONFIG_REF_MV
+ mbmi->ref_mv_idx = 0;
+ mbmi->pred_mv[0].as_int = 0;
+#endif // CONFIG_REF_MV
+
+ mbmi->motion_mode = SIMPLE_TRANSLATION; // NOTE(review): already set above; looks redundant
+#if CONFIG_MOTION_VAR
+ av1_count_overlappable_neighbors(cm, xd, mi_row, mi_col);
+#endif
+#if CONFIG_WARPED_MOTION
+ if (is_motion_variation_allowed_bsize(bsize) && !has_second_ref(mbmi)) {
+ int pts[SAMPLES_ARRAY_SIZE], pts_inref[SAMPLES_ARRAY_SIZE];
+ mbmi->num_proj_ref[0] = findSamples(cm, xd, mi_row, mi_col, pts, pts_inref);
+ }
+#endif
+
+ set_default_interp_filters(mbmi, cm->interp_filter);
+
+ if (cm->interp_filter != SWITCHABLE) {
+ best_filter = cm->interp_filter;
+ } else {
+ best_filter = EIGHTTAP_REGULAR; // default when the search below is not run
+ if (av1_is_interp_needed(xd) && av1_is_interp_search_needed(xd) &&
+ x->source_variance >= cpi->sf.disable_filter_search_var_thresh) {
+ int rs;
+ int best_rs = INT_MAX;
+ for (i = 0; i < SWITCHABLE_FILTERS; ++i) { // keep the filter minimising av1_get_switchable_rate()
+#if CONFIG_DUAL_FILTER
+ int k;
+ for (k = 0; k < 4; ++k) mbmi->interp_filter[k] = i;
+#else
+ mbmi->interp_filter = i;
+#endif // CONFIG_DUAL_FILTER
+ rs = av1_get_switchable_rate(cpi, xd);
+ if (rs < best_rs) {
+ best_rs = rs;
+#if CONFIG_DUAL_FILTER
+ best_filter = mbmi->interp_filter[0];
+#else
+ best_filter = mbmi->interp_filter;
+#endif // CONFIG_DUAL_FILTER
+ }
+ }
+ }
+ }
+// Set the appropriate filter
+#if CONFIG_DUAL_FILTER
+ for (i = 0; i < 4; ++i) mbmi->interp_filter[i] = best_filter;
+#else
+ mbmi->interp_filter = best_filter;
+#endif // CONFIG_DUAL_FILTER
+ rate2 += av1_get_switchable_rate(cpi, xd);
+
+ if (cm->reference_mode == REFERENCE_MODE_SELECT)
+ rate2 += av1_cost_bit(comp_mode_p, comp_pred);
+
+ // Estimate the reference frame signaling cost and add it
+ // to the rolling cost variable.
+ rate2 += ref_costs_single[LAST_FRAME];
+ this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
+
+ rd_cost->rate = rate2;
+ rd_cost->dist = distortion2;
+ rd_cost->rdcost = this_rd;
+
+ if (this_rd >= best_rd_so_far) { // no gain over the caller's bound: report an invalid cost
+ rd_cost->rate = INT_MAX;
+ rd_cost->rdcost = INT64_MAX;
+ return;
+ }
+
+#if CONFIG_DUAL_FILTER
+ assert((cm->interp_filter == SWITCHABLE) ||
+ (cm->interp_filter == mbmi->interp_filter[0]));
+#else
+ assert((cm->interp_filter == SWITCHABLE) ||
+ (cm->interp_filter == mbmi->interp_filter));
+#endif // CONFIG_DUAL_FILTER
+
+ av1_update_rd_thresh_fact(cm, tile_data->thresh_freq_fact,
+ cpi->sf.adaptive_rd_thresh, bsize, THR_ZEROMV);
+
+ av1_zero(best_pred_diff);
+
+ store_coding_context(x, ctx, THR_ZEROMV, best_pred_diff, 0);
+}
+ /*
+  * Rate-distortion mode selection for a block coded with sub-8x8 partitions.
+  *
+  * Walks the MAX_REFS reference-frame combinations listed in av1_ref_order.
+  * For each candidate that survives the skip masks, RD thresholds and
+  * segment / reference-mode restrictions:
+  *   - INTRA_FRAME: luma via rd_pick_intra_sub_8x8_y_mode(), chroma via
+  *     choose_intra_uv_mode() (computed once and cached in rate_uv_intra).
+  *   - inter: luma via rd_pick_inter_best_sub8x8_mode(), either once per
+  *     candidate interpolation filter or once with a pre-selected filter,
+  *     followed by a chroma rd pass on BLOCK_8X8.
+  * The candidate with the lowest rd cost (and not mode_excluded) is kept in
+  * best_mbmode / best_bmodes.
+  *
+  * Outputs:
+  *   rd_cost  - rate / dist / rdcost of the winner; rate = INT_MAX and
+  *              rdcost = INT64_MAX when nothing beat best_rd_so_far.
+  *   *mbmi and xd->mi[0]->bmi[] - overwritten with the winning mode info.
+  *   ctx      - filled through store_coding_context() with the winning
+  *              reference index and the per-reference-mode rd differences.
+  *   returnrate_nocoef (CONFIG_SUPERTX only) - winner's rate with the
+  *              rate_y / rate_uv and skip / intra-inter flag costs removed.
+  * Both early returns at the end skip the mbmi write-back and
+  * store_coding_context().
+  */
+void av1_rd_pick_inter_mode_sub8x8(const struct AV1_COMP *cpi,
+ TileDataEnc *tile_data, struct macroblock *x,
+ int mi_row, int mi_col,
+ struct RD_STATS *rd_cost,
+#if CONFIG_SUPERTX
+ int *returnrate_nocoef,
+#endif // CONFIG_SUPERTX
+ BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
+ int64_t best_rd_so_far) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const RD_OPT *const rd_opt = &cpi->rd;
+ const SPEED_FEATURES *const sf = &cpi->sf;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+ const struct segmentation *const seg = &cm->seg;
+ MV_REFERENCE_FRAME ref_frame, second_ref_frame;
+ unsigned char segment_id = mbmi->segment_id;
+ int comp_pred, i;
+ int_mv frame_mv[MB_MODE_COUNT][TOTAL_REFS_PER_FRAME];
+ struct buf_2d yv12_mb[TOTAL_REFS_PER_FRAME][MAX_MB_PLANE];
+ static const int flag_list[TOTAL_REFS_PER_FRAME] = { // indexed by reference frame, tested against cpi->ref_frame_flags
+ 0,
+ AOM_LAST_FLAG,
+#if CONFIG_EXT_REFS
+ AOM_LAST2_FLAG,
+ AOM_LAST3_FLAG,
+#endif // CONFIG_EXT_REFS
+ AOM_GOLD_FLAG,
+#if CONFIG_EXT_REFS
+ AOM_BWD_FLAG,
+#endif // CONFIG_EXT_REFS
+ AOM_ALT_FLAG
+ };
+ int64_t best_rd = best_rd_so_far;
+ int64_t best_yrd = best_rd_so_far; // FIXME(rbultje) more precise
+ int64_t best_pred_diff[REFERENCE_MODES];
+ int64_t best_pred_rd[REFERENCE_MODES];
+ MB_MODE_INFO best_mbmode;
+ int ref_index, best_ref_index = 0;
+ unsigned int ref_costs_single[TOTAL_REFS_PER_FRAME];
+ unsigned int ref_costs_comp[TOTAL_REFS_PER_FRAME];
+ aom_prob comp_mode_p;
+#if CONFIG_DUAL_FILTER
+ InterpFilter tmp_best_filter[4] = { 0 };
+#else
+ InterpFilter tmp_best_filter = SWITCHABLE;
+#endif // CONFIG_DUAL_FILTER
+ int rate_uv_intra, rate_uv_tokenonly = INT_MAX;
+ int64_t dist_uv = INT64_MAX;
+ int skip_uv;
+ PREDICTION_MODE mode_uv = DC_PRED;
+ const int intra_cost_penalty = av1_get_intra_cost_penalty(
+ cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth);
+ int_mv seg_mvs[4][TOTAL_REFS_PER_FRAME];
+ b_mode_info best_bmodes[4];
+ int best_skip2 = 0;
+ int ref_frame_skip_mask[2] = { 0 };
+ int internal_active_edge =
+ av1_active_edge_sb(cpi, mi_row, mi_col) && av1_internal_image_edge(cpi);
+#if CONFIG_PVQ
+ od_rollback_buffer pre_buf;
+
+ od_encode_checkpoint(&x->daala_enc, &pre_buf);
+#endif // CONFIG_PVQ
+
+#if CONFIG_SUPERTX
+ best_rd_so_far = INT64_MAX; // SUPERTX builds discard the caller's RD bound
+ best_rd = best_rd_so_far;
+ best_yrd = best_rd_so_far;
+#endif // CONFIG_SUPERTX
+ av1_zero(best_mbmode);
+
+#if CONFIG_FILTER_INTRA
+ mbmi->filter_intra_mode_info.use_filter_intra_mode[0] = 0;
+ mbmi->filter_intra_mode_info.use_filter_intra_mode[1] = 0;
+#endif // CONFIG_FILTER_INTRA
+ mbmi->motion_mode = SIMPLE_TRANSLATION;
+#if CONFIG_EXT_INTER
+ mbmi->interinter_compound_type = COMPOUND_AVERAGE;
+ mbmi->use_wedge_interintra = 0;
+#endif // CONFIG_EXT_INTER
+#if CONFIG_WARPED_MOTION
+ mbmi->num_proj_ref[0] = 0;
+ mbmi->num_proj_ref[1] = 0;
+#endif // CONFIG_WARPED_MOTION
+
+ for (i = 0; i < 4; i++) {
+ int j;
+ for (j = 0; j < TOTAL_REFS_PER_FRAME; j++)
+ seg_mvs[i][j].as_int = INVALID_MV;
+ }
+
+ estimate_ref_frame_costs(cm, xd, segment_id, ref_costs_single, ref_costs_comp,
+ &comp_mode_p);
+
+ for (i = 0; i < REFERENCE_MODES; ++i) best_pred_rd[i] = INT64_MAX;
+ rate_uv_intra = INT_MAX;
+
+ rd_cost->rate = INT_MAX;
+#if CONFIG_SUPERTX
+ *returnrate_nocoef = INT_MAX;
+#endif // CONFIG_SUPERTX
+
+ for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) { // set up buffers for enabled refs, mask out the rest
+ x->mbmi_ext->mode_context[ref_frame] = 0;
+#if CONFIG_REF_MV && CONFIG_EXT_INTER
+ x->mbmi_ext->compound_mode_context[ref_frame] = 0;
+#endif // CONFIG_REF_MV && CONFIG_EXT_INTER
+ if (cpi->ref_frame_flags & flag_list[ref_frame]) {
+ setup_buffer_inter(cpi, x, ref_frame, bsize, mi_row, mi_col,
+ frame_mv[NEARESTMV], frame_mv[NEARMV], yv12_mb);
+ } else {
+ ref_frame_skip_mask[0] |= (1 << ref_frame);
+ ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
+ }
+ frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
+#if CONFIG_EXT_INTER // NOTE(review): empty conditional section
+#endif // CONFIG_EXT_INTER
+ frame_mv[ZEROMV][ref_frame].as_int = 0;
+ }
+
+#if CONFIG_PALETTE
+ mbmi->palette_mode_info.palette_size[0] = 0;
+ mbmi->palette_mode_info.palette_size[1] = 0;
+#endif // CONFIG_PALETTE
+
+ for (ref_index = 0; ref_index < MAX_REFS; ++ref_index) { // one pass per av1_ref_order entry
+ int mode_excluded = 0;
+ int64_t this_rd = INT64_MAX;
+ int disable_skip = 0;
+ int compmode_cost = 0;
+ int rate2 = 0, rate_y = 0, rate_uv = 0;
+ int64_t distortion2 = 0, distortion_y = 0, distortion_uv = 0;
+ int skippable = 0;
+ int this_skip2 = 0;
+ int64_t total_sse = INT_MAX;
+
+#if CONFIG_PVQ
+ od_encode_rollback(&x->daala_enc, &pre_buf);
+#endif // CONFIG_PVQ
+
+ ref_frame = av1_ref_order[ref_index].ref_frame[0];
+ second_ref_frame = av1_ref_order[ref_index].ref_frame[1];
+
+#if CONFIG_REF_MV
+ mbmi->ref_mv_idx = 0;
+#endif // CONFIG_REF_MV
+
+ // Look at the reference frame of the best mode so far and set the
+ // skip mask to look at a subset of the remaining modes.
+ if (ref_index > 2 && sf->mode_skip_start < MAX_MODES) {
+ if (ref_index == 3) {
+ switch (best_mbmode.ref_frame[0]) {
+ case INTRA_FRAME: break;
+ case LAST_FRAME:
+ ref_frame_skip_mask[0] |= (1 << GOLDEN_FRAME) |
+#if CONFIG_EXT_REFS
+ (1 << LAST2_FRAME) | (1 << LAST3_FRAME) |
+ (1 << BWDREF_FRAME) |
+#endif // CONFIG_EXT_REFS
+ (1 << ALTREF_FRAME);
+ ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
+ break;
+#if CONFIG_EXT_REFS
+ case LAST2_FRAME:
+ ref_frame_skip_mask[0] |= (1 << LAST_FRAME) | (1 << LAST3_FRAME) |
+ (1 << GOLDEN_FRAME) |
+ (1 << BWDREF_FRAME) | (1 << ALTREF_FRAME);
+ ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
+ break;
+ case LAST3_FRAME:
+ ref_frame_skip_mask[0] |= (1 << LAST_FRAME) | (1 << LAST2_FRAME) |
+ (1 << GOLDEN_FRAME) |
+ (1 << BWDREF_FRAME) | (1 << ALTREF_FRAME);
+ ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
+ break;
+#endif // CONFIG_EXT_REFS
+ case GOLDEN_FRAME:
+ ref_frame_skip_mask[0] |= (1 << LAST_FRAME) |
+#if CONFIG_EXT_REFS
+ (1 << LAST2_FRAME) | (1 << LAST3_FRAME) |
+ (1 << BWDREF_FRAME) |
+#endif // CONFIG_EXT_REFS
+ (1 << ALTREF_FRAME);
+ ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
+ break;
+#if CONFIG_EXT_REFS
+ case BWDREF_FRAME:
+ ref_frame_skip_mask[0] |= (1 << LAST_FRAME) | (1 << LAST2_FRAME) |
+ (1 << LAST3_FRAME) | (1 << GOLDEN_FRAME) |
+ (1 << ALTREF_FRAME);
+ ref_frame_skip_mask[1] |= (1 << ALTREF_FRAME) | 0x01;
+ break;
+#endif // CONFIG_EXT_REFS
+ case ALTREF_FRAME:
+ ref_frame_skip_mask[0] |= (1 << LAST_FRAME) |
+#if CONFIG_EXT_REFS
+ (1 << LAST2_FRAME) | (1 << LAST3_FRAME) |
+ (1 << BWDREF_FRAME) |
+#endif // CONFIG_EXT_REFS
+ (1 << GOLDEN_FRAME);
+#if CONFIG_EXT_REFS
+ ref_frame_skip_mask[1] |= (1 << BWDREF_FRAME) | 0x01;
+#endif // CONFIG_EXT_REFS
+ break;
+ case NONE_FRAME:
+ case TOTAL_REFS_PER_FRAME:
+ assert(0 && "Invalid Reference frame");
+ break;
+ }
+ }
+ }
+
+ if ((ref_frame_skip_mask[0] & (1 << ref_frame)) &&
+ (ref_frame_skip_mask[1] & (1 << AOMMAX(0, second_ref_frame))))
+ continue;
+
+ // Test best rd so far against threshold for trying this mode.
+ if (!internal_active_edge &&
+ rd_less_than_thresh(best_rd,
+ rd_opt->threshes[segment_id][bsize][ref_index],
+ tile_data->thresh_freq_fact[bsize][ref_index]))
+ continue;
+
+ // This is only used in motion vector unit test.
+ if (cpi->oxcf.motion_vector_unit_test && ref_frame == INTRA_FRAME) continue;
+
+#if CONFIG_LOWDELAY_COMPOUND // Changes LL bitstream
+#if CONFIG_EXT_REFS
+ if (cpi->oxcf.pass == 0) {
+ // Complexity-compression trade-offs
+ // if (ref_frame == ALTREF_FRAME) continue;
+ // if (ref_frame == BWDREF_FRAME) continue;
+ if (second_ref_frame == ALTREF_FRAME) continue;
+ // if (second_ref_frame == BWDREF_FRAME) continue;
+ }
+#endif
+#endif
+ comp_pred = second_ref_frame > INTRA_FRAME;
+ if (comp_pred) {
+ if (!cpi->allow_comp_inter_inter) continue;
+ if (!(cpi->ref_frame_flags & flag_list[second_ref_frame])) continue;
+ // Do not allow compound prediction if the segment level reference frame
+ // feature is in use as in this case there can only be one reference.
+ if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) continue;
+
+ if ((sf->mode_search_skip_flags & FLAG_SKIP_COMP_BESTINTRA) &&
+ best_mbmode.ref_frame[0] == INTRA_FRAME)
+ continue;
+ }
+
+ // TODO(jingning, jkoleszar): scaling reference frame not supported for
+ // sub8x8 blocks.
+ if (ref_frame > INTRA_FRAME &&
+ av1_is_scaled(&cm->frame_refs[ref_frame - 1].sf))
+ continue;
+
+ if (second_ref_frame > INTRA_FRAME &&
+ av1_is_scaled(&cm->frame_refs[second_ref_frame - 1].sf))
+ continue;
+
+ if (comp_pred)
+ mode_excluded = cm->reference_mode == SINGLE_REFERENCE;
+ else if (ref_frame != INTRA_FRAME)
+ mode_excluded = cm->reference_mode == COMPOUND_REFERENCE;
+
+ // If the segment reference frame feature is enabled....
+ // then do nothing if the current ref frame is not allowed..
+ if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) &&
+ get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame) {
+ continue;
+ // Disable this drop out case if the ref frame
+ // segment level feature is enabled for this segment. This is to
+ // prevent the possibility that we end up unable to pick any mode.
+ } else if (!segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) {
+ // Only consider ZEROMV/ALTREF_FRAME for alt ref frame,
+ // unless ARNR filtering is enabled in which case we want
+ // an unfiltered alternative. We allow near/nearest as well
+ // because they may result in zero-zero MVs but be cheaper.
+ if (cpi->rc.is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0))
+ continue;
+ }
+
+ mbmi->tx_size = TX_4X4;
+ mbmi->uv_mode = DC_PRED;
+ mbmi->ref_frame[0] = ref_frame;
+ mbmi->ref_frame[1] = second_ref_frame;
+// Evaluate all sub-pel filters irrespective of whether we can use
+// them for this frame.
+#if CONFIG_DUAL_FILTER
+ for (i = 0; i < 4; ++i)
+ mbmi->interp_filter[i] = cm->interp_filter == SWITCHABLE
+ ? EIGHTTAP_REGULAR
+ : cm->interp_filter;
+#else
+ mbmi->interp_filter =
+ cm->interp_filter == SWITCHABLE ? EIGHTTAP_REGULAR : cm->interp_filter;
+#endif // CONFIG_DUAL_FILTER
+ x->skip = 0;
+ set_ref_ptrs(cm, xd, ref_frame, second_ref_frame);
+
+ // Select prediction reference frames.
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ xd->plane[i].pre[0] = yv12_mb[ref_frame][i];
+ if (comp_pred) xd->plane[i].pre[1] = yv12_mb[second_ref_frame][i];
+ }
+
+#if CONFIG_VAR_TX
+ mbmi->inter_tx_size[0][0] = mbmi->tx_size;
+ mbmi->min_tx_size = get_min_tx_size(mbmi->tx_size);
+#endif // CONFIG_VAR_TX
+
+ if (ref_frame == INTRA_FRAME) {
+ int rate;
+ if (rd_pick_intra_sub_8x8_y_mode(cpi, x, &rate, &rate_y, &distortion_y,
+ NULL, best_rd) >= best_rd)
+ continue;
+ rate2 += rate;
+ rate2 += intra_cost_penalty;
+ distortion2 += distortion_y;
+
+ if (rate_uv_intra == INT_MAX) { // chroma intra search runs once; later passes reuse the result
+ choose_intra_uv_mode(cpi, x, ctx, bsize, TX_4X4, &rate_uv_intra,
+ &rate_uv_tokenonly, &dist_uv, &skip_uv, &mode_uv);
+ }
+ rate2 += rate_uv_intra;
+ rate_uv = rate_uv_tokenonly;
+ distortion2 += dist_uv;
+ distortion_uv = dist_uv;
+ mbmi->uv_mode = mode_uv;
+ } else {
+ int rate;
+ int64_t distortion;
+ int64_t this_rd_thresh;
+ int64_t tmp_rd, tmp_best_rd = INT64_MAX, tmp_best_rdu = INT64_MAX;
+ int tmp_best_rate = INT_MAX, tmp_best_ratey = INT_MAX;
+ int64_t tmp_best_distortion = INT_MAX, tmp_best_sse, uv_sse;
+ int tmp_best_skippable = 0;
+ int switchable_filter_index;
+ int_mv *second_ref =
+ comp_pred ? &x->mbmi_ext->ref_mvs[second_ref_frame][0] : NULL;
+ b_mode_info tmp_best_bmodes[16]; // Should this be 4 ?
+ MB_MODE_INFO tmp_best_mbmode;
+#if CONFIG_DUAL_FILTER
+ BEST_SEG_INFO bsi[DUAL_FILTER_SET_SIZE];
+#else
+ BEST_SEG_INFO bsi[SWITCHABLE_FILTERS];
+#endif // CONFIG_DUAL_FILTER
+ int pred_exists = 0;
+ int uv_skippable;
+#if CONFIG_EXT_INTER
+ int_mv compound_seg_newmvs[4][2];
+ for (i = 0; i < 4; i++) {
+ compound_seg_newmvs[i][0].as_int = INVALID_MV;
+ compound_seg_newmvs[i][1].as_int = INVALID_MV;
+ }
+#endif // CONFIG_EXT_INTER
+
+ this_rd_thresh = (ref_frame == LAST_FRAME)
+ ? rd_opt->threshes[segment_id][bsize][THR_LAST]
+ : rd_opt->threshes[segment_id][bsize][THR_ALTR];
+#if CONFIG_EXT_REFS
+ this_rd_thresh = (ref_frame == LAST2_FRAME)
+ ? rd_opt->threshes[segment_id][bsize][THR_LAST2]
+ : this_rd_thresh;
+ this_rd_thresh = (ref_frame == LAST3_FRAME)
+ ? rd_opt->threshes[segment_id][bsize][THR_LAST3]
+ : this_rd_thresh;
+ this_rd_thresh = (ref_frame == BWDREF_FRAME)
+ ? rd_opt->threshes[segment_id][bsize][THR_BWDR]
+ : this_rd_thresh;
+#endif // CONFIG_EXT_REFS
+ this_rd_thresh = (ref_frame == GOLDEN_FRAME)
+ ? rd_opt->threshes[segment_id][bsize][THR_GOLD]
+ : this_rd_thresh;
+
+ // TODO(any): Add search of the tx_type to improve rd performance at the
+ // expense of speed.
+ mbmi->tx_type = DCT_DCT;
+
+ if (cm->interp_filter != BILINEAR) {
+#if CONFIG_DUAL_FILTER
+ tmp_best_filter[0] = EIGHTTAP_REGULAR;
+ tmp_best_filter[1] = EIGHTTAP_REGULAR;
+ tmp_best_filter[2] = EIGHTTAP_REGULAR;
+ tmp_best_filter[3] = EIGHTTAP_REGULAR;
+#else
+ tmp_best_filter = EIGHTTAP_REGULAR;
+#endif // CONFIG_DUAL_FILTER
+ if (x->source_variance < sf->disable_filter_search_var_thresh) { // NOTE(review): body re-stores the default assigned just above
+#if CONFIG_DUAL_FILTER
+ tmp_best_filter[0] = EIGHTTAP_REGULAR;
+#else
+ tmp_best_filter = EIGHTTAP_REGULAR;
+#endif // CONFIG_DUAL_FILTER
+ } else if (sf->adaptive_pred_interp_filter == 1 &&
+ ctx->pred_interp_filter < SWITCHABLE) {
+#if CONFIG_DUAL_FILTER
+ tmp_best_filter[0] = ctx->pred_interp_filter;
+#else
+ tmp_best_filter = ctx->pred_interp_filter;
+#endif // CONFIG_DUAL_FILTER
+ } else if (sf->adaptive_pred_interp_filter == 2) {
+#if CONFIG_DUAL_FILTER
+ tmp_best_filter[0] = ctx->pred_interp_filter < SWITCHABLE
+ ? ctx->pred_interp_filter
+ : 0;
+#else
+ tmp_best_filter = ctx->pred_interp_filter < SWITCHABLE
+ ? ctx->pred_interp_filter
+ : 0;
+#endif // CONFIG_DUAL_FILTER
+ } else {
+#if CONFIG_DUAL_FILTER
+ const int filter_set_size = DUAL_FILTER_SET_SIZE;
+#else
+ const int filter_set_size = SWITCHABLE_FILTERS;
+#endif // CONFIG_DUAL_FILTER
+ for (switchable_filter_index = 0;
+ switchable_filter_index < filter_set_size;
+ ++switchable_filter_index) {
+ int newbest, rs;
+ int64_t rs_rd;
+ MB_MODE_INFO_EXT *mbmi_ext = x->mbmi_ext;
+#if CONFIG_DUAL_FILTER
+ mbmi->interp_filter[0] = filter_sets[switchable_filter_index][0];
+ mbmi->interp_filter[1] = filter_sets[switchable_filter_index][1];
+ mbmi->interp_filter[2] = filter_sets[switchable_filter_index][0];
+ mbmi->interp_filter[3] = filter_sets[switchable_filter_index][1];
+#else
+ mbmi->interp_filter = switchable_filter_index;
+#endif // CONFIG_DUAL_FILTER
+ tmp_rd = rd_pick_inter_best_sub8x8_mode(
+ cpi, x, &mbmi_ext->ref_mvs[ref_frame][0], second_ref, best_yrd,
+ &rate, &rate_y, &distortion, &skippable, &total_sse,
+ (int)this_rd_thresh, seg_mvs,
+#if CONFIG_EXT_INTER
+ compound_seg_newmvs,
+#endif // CONFIG_EXT_INTER
+ bsi, switchable_filter_index, mi_row, mi_col);
+ if (tmp_rd == INT64_MAX) continue;
+ rs = av1_get_switchable_rate(cpi, xd);
+ rs_rd = RDCOST(x->rdmult, x->rddiv, rs, 0);
+ if (cm->interp_filter == SWITCHABLE) tmp_rd += rs_rd;
+
+ newbest = (tmp_rd < tmp_best_rd);
+ if (newbest) {
+#if CONFIG_DUAL_FILTER
+ tmp_best_filter[0] = mbmi->interp_filter[0];
+ tmp_best_filter[1] = mbmi->interp_filter[1];
+ tmp_best_filter[2] = mbmi->interp_filter[2];
+ tmp_best_filter[3] = mbmi->interp_filter[3];
+#else
+ tmp_best_filter = mbmi->interp_filter;
+#endif // CONFIG_DUAL_FILTER
+ tmp_best_rd = tmp_rd;
+ }
+ if ((newbest && cm->interp_filter == SWITCHABLE) ||
+ (
+#if CONFIG_DUAL_FILTER
+ mbmi->interp_filter[0] == cm->interp_filter
+#else
+ mbmi->interp_filter == cm->interp_filter
+#endif // CONFIG_DUAL_FILTER
+ && cm->interp_filter != SWITCHABLE)) {
+ tmp_best_rdu = tmp_rd;
+ tmp_best_rate = rate;
+ tmp_best_ratey = rate_y;
+ tmp_best_distortion = distortion;
+ tmp_best_sse = total_sse;
+ tmp_best_skippable = skippable;
+ tmp_best_mbmode = *mbmi;
+ for (i = 0; i < 4; i++) {
+ tmp_best_bmodes[i] = xd->mi[0]->bmi[i];
+ }
+ pred_exists = 1; // tmp_best_* now hold a usable result for this reference
+ }
+ } // switchable_filter_index loop
+ }
+ }
+
+ if (tmp_best_rdu == INT64_MAX && pred_exists) continue;
+
+#if CONFIG_DUAL_FILTER
+ mbmi->interp_filter[0] =
+ (cm->interp_filter == SWITCHABLE ? tmp_best_filter[0]
+ : cm->interp_filter);
+ mbmi->interp_filter[1] =
+ (cm->interp_filter == SWITCHABLE ? tmp_best_filter[1]
+ : cm->interp_filter);
+ mbmi->interp_filter[2] =
+ (cm->interp_filter == SWITCHABLE ? tmp_best_filter[2]
+ : cm->interp_filter);
+ mbmi->interp_filter[3] =
+ (cm->interp_filter == SWITCHABLE ? tmp_best_filter[3]
+ : cm->interp_filter);
+#else
+ mbmi->interp_filter =
+ (cm->interp_filter == SWITCHABLE ? tmp_best_filter
+ : cm->interp_filter);
+#endif // CONFIG_DUAL_FILTER
+
+ if (!pred_exists) {
+ // Handles the special case when a filter that is not in the
+ // switchable list (bilinear) is indicated at the frame level
+ tmp_rd = rd_pick_inter_best_sub8x8_mode(
+ cpi, x, &x->mbmi_ext->ref_mvs[ref_frame][0], second_ref, best_yrd,
+ &rate, &rate_y, &distortion, &skippable, &total_sse,
+ (int)this_rd_thresh, seg_mvs,
+#if CONFIG_EXT_INTER
+ compound_seg_newmvs,
+#endif // CONFIG_EXT_INTER
+ bsi, 0, mi_row, mi_col);
+ if (tmp_rd == INT64_MAX) continue;
+ } else {
+ total_sse = tmp_best_sse;
+ rate = tmp_best_rate;
+ rate_y = tmp_best_ratey;
+ distortion = tmp_best_distortion;
+ skippable = tmp_best_skippable;
+ *mbmi = tmp_best_mbmode;
+ for (i = 0; i < 4; i++) xd->mi[0]->bmi[i] = tmp_best_bmodes[i];
+ }
+ // Add in the cost of the transform type
+ if (!xd->lossless[mbmi->segment_id]) {
+ int rate_tx_type = 0;
+#if CONFIG_EXT_TX
+ if (get_ext_tx_types(mbmi->tx_size, bsize, 1, cm->reduced_tx_set_used) >
+ 1) {
+ const int eset =
+ get_ext_tx_set(mbmi->tx_size, bsize, 1, cm->reduced_tx_set_used);
+ rate_tx_type =
+ cpi->inter_tx_type_costs[eset][mbmi->tx_size][mbmi->tx_type];
+ }
+#else
+ if (mbmi->tx_size < TX_32X32) {
+ rate_tx_type = cpi->inter_tx_type_costs[mbmi->tx_size][mbmi->tx_type];
+ }
+#endif // CONFIG_EXT_TX
+ rate += rate_tx_type;
+ rate_y += rate_tx_type;
+ }
+
+ rate2 += rate;
+ distortion2 += distortion;
+
+ if (cm->interp_filter == SWITCHABLE)
+ rate2 += av1_get_switchable_rate(cpi, xd);
+
+ if (!mode_excluded)
+ mode_excluded = comp_pred ? cm->reference_mode == SINGLE_REFERENCE
+ : cm->reference_mode == COMPOUND_REFERENCE;
+
+ compmode_cost = av1_cost_bit(comp_mode_p, comp_pred);
+
+ tmp_best_rdu = /* reused: passed to the UV rd call below; must be > 0 to go on */
+ best_rd - AOMMIN(RDCOST(x->rdmult, x->rddiv, rate2, distortion2),
+ RDCOST(x->rdmult, x->rddiv, 0, total_sse));
+
+ if (tmp_best_rdu > 0) {
+ // If even the 'Y' rd value of split is higher than best so far
+ // then dont bother looking at UV
+ int is_cost_valid_uv;
+ RD_STATS rd_stats_uv;
+ av1_build_inter_predictors_sbuv(&x->e_mbd, mi_row, mi_col, NULL,
+ BLOCK_8X8);
+#if CONFIG_VAR_TX
+ is_cost_valid_uv =
+ inter_block_uvrd(cpi, x, &rd_stats_uv, BLOCK_8X8, tmp_best_rdu);
+#else
+ is_cost_valid_uv =
+ super_block_uvrd(cpi, x, &rd_stats_uv, BLOCK_8X8, tmp_best_rdu);
+#endif // CONFIG_VAR_TX
+ rate_uv = rd_stats_uv.rate;
+ distortion_uv = rd_stats_uv.dist;
+ uv_skippable = rd_stats_uv.skip;
+ uv_sse = rd_stats_uv.sse;
+
+ if (!is_cost_valid_uv) continue;
+ rate2 += rate_uv;
+ distortion2 += distortion_uv;
+ skippable = skippable && uv_skippable;
+ total_sse += uv_sse;
+ } else {
+ continue;
+ }
+ }
+
+ if (cm->reference_mode == REFERENCE_MODE_SELECT) rate2 += compmode_cost;
+
+ // Estimate the reference frame signaling cost and add it
+ // to the rolling cost variable.
+ if (second_ref_frame > INTRA_FRAME) {
+ rate2 += ref_costs_comp[ref_frame];
+#if CONFIG_EXT_REFS
+ rate2 += ref_costs_comp[second_ref_frame];
+#endif // CONFIG_EXT_REFS
+ } else {
+ rate2 += ref_costs_single[ref_frame];
+ }
+
+ if (!disable_skip) {
+ // Skip is never coded at the segment level for sub8x8 blocks and instead
+ // always coded in the bitstream at the mode info level.
+
+ if (ref_frame != INTRA_FRAME && !xd->lossless[mbmi->segment_id]) {
+ if (RDCOST(x->rdmult, x->rddiv, rate_y + rate_uv, distortion2) <
+ RDCOST(x->rdmult, x->rddiv, 0, total_sse)) {
+ // Add in the cost of the no skip flag.
+ rate2 += av1_cost_bit(av1_get_skip_prob(cm, xd), 0);
+ } else {
+ // FIXME(rbultje) make this work for splitmv also
+ rate2 += av1_cost_bit(av1_get_skip_prob(cm, xd), 1);
+ distortion2 = total_sse;
+ assert(total_sse >= 0);
+ rate2 -= (rate_y + rate_uv);
+ rate_y = 0;
+ rate_uv = 0;
+ this_skip2 = 1;
+ }
+ } else {
+ // Add in the cost of the no skip flag.
+ rate2 += av1_cost_bit(av1_get_skip_prob(cm, xd), 0);
+ }
+
+ // Calculate the final RD estimate for this mode.
+ this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
+ }
+
+ if (!disable_skip && ref_frame == INTRA_FRAME) {
+ for (i = 0; i < REFERENCE_MODES; ++i)
+ best_pred_rd[i] = AOMMIN(best_pred_rd[i], this_rd);
+ }
+
+ // Did this mode help.. i.e. is it the new best mode
+ if (this_rd < best_rd || x->skip) {
+ if (!mode_excluded) {
+ // Note index of best mode so far
+ best_ref_index = ref_index;
+
+ if (ref_frame == INTRA_FRAME) {
+ /* required for left and above block mv */
+ mbmi->mv[0].as_int = 0;
+ }
+
+ rd_cost->rate = rate2;
+#if CONFIG_SUPERTX
+ *returnrate_nocoef = rate2 - rate_y - rate_uv;
+ if (!disable_skip)
+ *returnrate_nocoef -=
+ av1_cost_bit(av1_get_skip_prob(cm, xd), this_skip2);
+ *returnrate_nocoef -= av1_cost_bit(av1_get_intra_inter_prob(cm, xd),
+ mbmi->ref_frame[0] != INTRA_FRAME);
+ assert(*returnrate_nocoef > 0);
+#endif // CONFIG_SUPERTX
+ rd_cost->dist = distortion2;
+ rd_cost->rdcost = this_rd;
+ best_rd = this_rd;
+ best_yrd =
+ best_rd - RDCOST(x->rdmult, x->rddiv, rate_uv, distortion_uv);
+ best_mbmode = *mbmi;
+ best_skip2 = this_skip2;
+
+#if CONFIG_VAR_TX
+ for (i = 0; i < MAX_MB_PLANE; ++i)
+ memset(ctx->blk_skip[i], 0, sizeof(uint8_t) * ctx->num_4x4_blk);
+#endif // CONFIG_VAR_TX
+
+ for (i = 0; i < 4; i++) best_bmodes[i] = xd->mi[0]->bmi[i];
+ }
+ }
+
+ /* keep record of best compound/single-only prediction */
+ if (!disable_skip && ref_frame != INTRA_FRAME) {
+ int64_t single_rd, hybrid_rd, single_rate, hybrid_rate;
+
+ if (cm->reference_mode == REFERENCE_MODE_SELECT) {
+ single_rate = rate2 - compmode_cost;
+ hybrid_rate = rate2;
+ } else {
+ single_rate = rate2;
+ hybrid_rate = rate2 + compmode_cost;
+ }
+
+ single_rd = RDCOST(x->rdmult, x->rddiv, single_rate, distortion2);
+ hybrid_rd = RDCOST(x->rdmult, x->rddiv, hybrid_rate, distortion2);
+
+ if (!comp_pred && single_rd < best_pred_rd[SINGLE_REFERENCE])
+ best_pred_rd[SINGLE_REFERENCE] = single_rd;
+ else if (comp_pred && single_rd < best_pred_rd[COMPOUND_REFERENCE])
+ best_pred_rd[COMPOUND_REFERENCE] = single_rd;
+
+ if (hybrid_rd < best_pred_rd[REFERENCE_MODE_SELECT])
+ best_pred_rd[REFERENCE_MODE_SELECT] = hybrid_rd;
+ }
+
+ if (x->skip && !comp_pred) break;
+ }
+
+ if (best_rd >= best_rd_so_far) { // nothing beat the caller's bound: report an invalid cost
+ rd_cost->rate = INT_MAX;
+ rd_cost->rdcost = INT64_MAX;
+#if CONFIG_SUPERTX
+ *returnrate_nocoef = INT_MAX;
+#endif // CONFIG_SUPERTX
+ return;
+ }
+
+ if (best_rd == INT64_MAX) {
+ rd_cost->rate = INT_MAX;
+ rd_cost->dist = INT64_MAX;
+ rd_cost->rdcost = INT64_MAX;
+#if CONFIG_SUPERTX
+ *returnrate_nocoef = INT_MAX;
+#endif // CONFIG_SUPERTX
+ return;
+ }
+
+#if CONFIG_DUAL_FILTER
+ assert((cm->interp_filter == SWITCHABLE) ||
+ (cm->interp_filter == best_mbmode.interp_filter[0]) ||
+ !is_inter_block(&best_mbmode));
+#else
+ assert((cm->interp_filter == SWITCHABLE) ||
+ (cm->interp_filter == best_mbmode.interp_filter) ||
+ !is_inter_block(&best_mbmode));
+#endif // CONFIG_DUAL_FILTER
+
+ av1_update_rd_thresh_fact(cm, tile_data->thresh_freq_fact,
+ sf->adaptive_rd_thresh, bsize, best_ref_index);
+
+ // macroblock modes
+ *mbmi = best_mbmode;
+#if CONFIG_VAR_TX
+ mbmi->inter_tx_size[0][0] = mbmi->tx_size;
+#endif // CONFIG_VAR_TX
+
+ x->skip |= best_skip2;
+ if (!is_inter_block(&best_mbmode)) {
+ for (i = 0; i < 4; i++) xd->mi[0]->bmi[i].as_mode = best_bmodes[i].as_mode;
+ } else {
+ for (i = 0; i < 4; ++i)
+ memcpy(&xd->mi[0]->bmi[i], &best_bmodes[i], sizeof(b_mode_info));
+
+#if CONFIG_REF_MV
+ mbmi->pred_mv[0].as_int = xd->mi[0]->bmi[3].pred_mv[0].as_int;
+ mbmi->pred_mv[1].as_int = xd->mi[0]->bmi[3].pred_mv[1].as_int;
+#endif // CONFIG_REF_MV
+ mbmi->mv[0].as_int = xd->mi[0]->bmi[3].as_mv[0].as_int; // block-level mv copies sub-block 3
+ mbmi->mv[1].as_int = xd->mi[0]->bmi[3].as_mv[1].as_int;
+ }
+
+// Note: this section is needed since the mode may have been forced to ZEROMV
+#if CONFIG_GLOBAL_MOTION
+ if (mbmi->mode == ZEROMV
+#if CONFIG_EXT_INTER
+ || mbmi->mode == ZERO_ZEROMV
+#endif // CONFIG_EXT_INTER
+ ) {
+ if (is_nontrans_global_motion(xd)) {
+#if CONFIG_DUAL_FILTER
+ mbmi->interp_filter[0] = cm->interp_filter == SWITCHABLE
+ ? EIGHTTAP_REGULAR
+ : cm->interp_filter;
+ mbmi->interp_filter[1] = cm->interp_filter == SWITCHABLE
+ ? EIGHTTAP_REGULAR
+ : cm->interp_filter;
+#else
+ mbmi->interp_filter = cm->interp_filter == SWITCHABLE ? EIGHTTAP_REGULAR
+ : cm->interp_filter;
+#endif // CONFIG_DUAL_FILTER
+ }
+ }
+#endif // CONFIG_GLOBAL_MOTION
+
+ for (i = 0; i < REFERENCE_MODES; ++i) {
+ if (best_pred_rd[i] == INT64_MAX)
+ best_pred_diff[i] = INT_MIN;
+ else
+ best_pred_diff[i] = best_rd - best_pred_rd[i];
+ }
+
+ store_coding_context(x, ctx, best_ref_index, best_pred_diff, 0);
+}
+
+#if CONFIG_MOTION_VAR
+// This function has a structure similar to av1_build_obmc_inter_prediction
+//
+// The OBMC predictor is computed as:
+//
+// PObmc(x,y) =
+// AOM_BLEND_A64(Mh(x),
+// AOM_BLEND_A64(Mv(y), P(x,y), PAbove(x,y)),
+// PLeft(x, y))
+//
+// Scaling up by AOM_BLEND_A64_MAX_ALPHA ** 2 and omitting the intermediate
+// rounding, this can be written as:
+//
+// AOM_BLEND_A64_MAX_ALPHA * AOM_BLEND_A64_MAX_ALPHA * Pobmc(x,y) =
+// Mh(x) * Mv(y) * P(x,y) +
+// Mh(x) * Cv(y) * Pabove(x,y) +
+// AOM_BLEND_A64_MAX_ALPHA * Ch(x) * PLeft(x, y)
+//
+// Where :
+//
+// Cv(y) = AOM_BLEND_A64_MAX_ALPHA - Mv(y)
+// Ch(x) = AOM_BLEND_A64_MAX_ALPHA - Mh(x)
+//
+// This function computes 'wsrc' and 'mask' as:
+//
+// wsrc(x, y) =
+// AOM_BLEND_A64_MAX_ALPHA * AOM_BLEND_A64_MAX_ALPHA * src(x, y) -
+// (Mh(x) * Cv(y) * Pabove(x,y) +
+// AOM_BLEND_A64_MAX_ALPHA * Ch(x) * PLeft(x, y))
+//
+// mask(x, y) = Mh(x) * Mv(y)
+//
+// These can then be used to efficiently approximate the error for any
+// predictor P in the context of the provided neighbouring predictors by
+// computing:
+//
+// error(x, y) =
+// (wsrc(x, y) - mask(x, y) * P(x, y)) / (AOM_BLEND_A64_MAX_ALPHA ** 2)
+//
+// Fills x->wsrc_buf and x->mask_buf (both treated as bw x bh int32_t arrays
+// with a row stride of bw) for plane 0 of the current block.
+// above / above_stride: predictor read for the rows overlapping the above
+// neighbours.
+// left / left_stride: predictor read for the columns overlapping the left
+// neighbours.
+// mi_row / mi_col are only used to clip the neighbour scan to the frame.
+// In the high bit-depth case 'above', 'left' and the source buffer are read
+// through CONVERT_TO_SHORTPTR.
+static void calc_target_weighted_pred(const AV1_COMMON *cm, const MACROBLOCK *x,
+ const MACROBLOCKD *xd, int mi_row,
+ int mi_col, const uint8_t *above,
+ int above_stride, const uint8_t *left,
+ int left_stride) {
+ const BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
+ int row, col, i;
+ const int bw = xd->n8_w << MI_SIZE_LOG2;
+ const int bh = xd->n8_h << MI_SIZE_LOG2;
+ int32_t *mask_buf = x->mask_buf;
+ int32_t *wsrc_buf = x->wsrc_buf;
+ const int wsrc_stride = bw;
+ const int mask_stride = bw;
+ const int src_scale = AOM_BLEND_A64_MAX_ALPHA * AOM_BLEND_A64_MAX_ALPHA;
+#if CONFIG_HIGHBITDEPTH
+ const int is_hbd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0;
+#else
+ const int is_hbd = 0;
+#endif // CONFIG_HIGHBITDEPTH
+
+ // plane 0 should not be subsampled
+ assert(xd->plane[0].subsampling_x == 0);
+ assert(xd->plane[0].subsampling_y == 0);
+
+ // Start from wsrc = 0 and mask = AOM_BLEND_A64_MAX_ALPHA everywhere, i.e.
+ // the state of a block with no neighbour contribution.
+ av1_zero_array(wsrc_buf, bw * bh);
+ for (i = 0; i < bw * bh; ++i) mask_buf[i] = AOM_BLEND_A64_MAX_ALPHA;
+
+ // handle above row
+ if (xd->up_available) {
+ const int overlap = num_4x4_blocks_high_lookup[bsize] * 2;
+ const int miw = AOMMIN(xd->n8_w, cm->mi_cols - mi_col);
+ const int mi_row_offset = -1;
+ const uint8_t *const mask1d = av1_get_obmc_mask(overlap);
+ const int neighbor_limit = max_neighbor_obmc[b_width_log2_lookup[bsize]];
+ int neighbor_count = 0;
+
+ assert(miw > 0);
+
+ i = 0;
+ do { // for each mi in the above row
+ const int mi_col_offset = i;
+ const MB_MODE_INFO *const above_mbmi =
+ &xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride]->mbmi;
+ const BLOCK_SIZE a_bsize = above_mbmi->sb_type;
+ const int mi_step = AOMMIN(xd->n8_w, mi_size_wide[a_bsize]);
+ const int neighbor_bw = mi_step * MI_SIZE;
+
+ if (is_neighbor_overlappable(above_mbmi)) {
+ // Without CONFIG_CB4X4, 4x4 and 4x8 neighbours count twice towards the
+ // limit; the scan stops as soon as the limit is exceeded.
+ if (!CONFIG_CB4X4 && (a_bsize == BLOCK_4X4 || a_bsize == BLOCK_4X8))
+ neighbor_count += 2;
+ else
+ neighbor_count++;
+ if (neighbor_count > neighbor_limit) break;
+
+ const int tmp_stride = above_stride;
+ int32_t *wsrc = wsrc_buf + (i * MI_SIZE);
+ int32_t *mask = mask_buf + (i * MI_SIZE);
+
+ if (!is_hbd) {
+ const uint8_t *tmp = above;
+
+ // For the first 'overlap' rows: wsrc = Cv(y) * PAbove, mask = Mv(y),
+ // with Mv(y) = mask1d[row] and Cv(y) = AOM_BLEND_A64_MAX_ALPHA - Mv(y).
+ for (row = 0; row < overlap; ++row) {
+ const uint8_t m0 = mask1d[row];
+ const uint8_t m1 = AOM_BLEND_A64_MAX_ALPHA - m0;
+ for (col = 0; col < neighbor_bw; ++col) {
+ wsrc[col] = m1 * tmp[col];
+ mask[col] = m0;
+ }
+ wsrc += wsrc_stride;
+ mask += mask_stride;
+ tmp += tmp_stride;
+ }
+#if CONFIG_HIGHBITDEPTH
+ } else {
+ // Same computation as above on 16-bit samples.
+ const uint16_t *tmp = CONVERT_TO_SHORTPTR(above);
+
+ for (row = 0; row < overlap; ++row) {
+ const uint8_t m0 = mask1d[row];
+ const uint8_t m1 = AOM_BLEND_A64_MAX_ALPHA - m0;
+ for (col = 0; col < neighbor_bw; ++col) {
+ wsrc[col] = m1 * tmp[col];
+ mask[col] = m0;
+ }
+ wsrc += wsrc_stride;
+ mask += mask_stride;
+ tmp += tmp_stride;
+ }
+#endif // CONFIG_HIGHBITDEPTH
+ }
+ }
+
+ // Advance to the next neighbour whether or not this one was used.
+ above += neighbor_bw;
+ i += mi_step;
+ } while (i < miw);
+ }
+
+ // Multiply both buffers by AOM_BLEND_A64_MAX_ALPHA so that positions the
+ // left pass does not touch are already at their final scale.
+ for (i = 0; i < bw * bh; ++i) {
+ wsrc_buf[i] *= AOM_BLEND_A64_MAX_ALPHA;
+ mask_buf[i] *= AOM_BLEND_A64_MAX_ALPHA;
+ }
+
+ // handle left column
+ if (xd->left_available) {
+ const int overlap = num_4x4_blocks_wide_lookup[bsize] * 2;
+ const int mih = AOMMIN(xd->n8_h, cm->mi_rows - mi_row);
+ const int mi_col_offset = -1;
+ const uint8_t *const mask1d = av1_get_obmc_mask(overlap);
+ const int neighbor_limit = max_neighbor_obmc[b_height_log2_lookup[bsize]];
+ int neighbor_count = 0;
+
+ assert(mih > 0);
+
+ i = 0;
+ do { // for each mi in the left column
+ const int mi_row_offset = i;
+ const MB_MODE_INFO *const left_mbmi =
+ &xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride]->mbmi;
+ const BLOCK_SIZE l_bsize = left_mbmi->sb_type;
+ const int mi_step = AOMMIN(xd->n8_h, mi_size_high[l_bsize]);
+ const int neighbor_bh = mi_step * MI_SIZE;
+
+ if (is_neighbor_overlappable(left_mbmi)) {
+ // Mirror of the above-row limit: 4x4 and 8x4 count twice without
+ // CONFIG_CB4X4.
+ if (!CONFIG_CB4X4 && (l_bsize == BLOCK_4X4 || l_bsize == BLOCK_8X4))
+ neighbor_count += 2;
+ else
+ neighbor_count++;
+ if (neighbor_count > neighbor_limit) break;
+
+ const int tmp_stride = left_stride;
+ int32_t *wsrc = wsrc_buf + (i * MI_SIZE * wsrc_stride);
+ int32_t *mask = mask_buf + (i * MI_SIZE * mask_stride);
+
+ if (!is_hbd) {
+ const uint8_t *tmp = left;
+
+ // For the first 'overlap' columns, with Mh(x) = mask1d[col]:
+ // wsrc = Mh(x) * (previous wsrc, unscaled) + Ch(x) * (PLeft, scaled)
+ // mask = Mh(x) * (previous mask, unscaled)
+ // The >> AOM_BLEND_A64_ROUND_BITS removes the scaling applied by the
+ // loop before this pass; this assumes AOM_BLEND_A64_MAX_ALPHA ==
+ // 1 << AOM_BLEND_A64_ROUND_BITS -- TODO confirm.
+ for (row = 0; row < neighbor_bh; ++row) {
+ for (col = 0; col < overlap; ++col) {
+ const uint8_t m0 = mask1d[col];
+ const uint8_t m1 = AOM_BLEND_A64_MAX_ALPHA - m0;
+ wsrc[col] = (wsrc[col] >> AOM_BLEND_A64_ROUND_BITS) * m0 +
+ (tmp[col] << AOM_BLEND_A64_ROUND_BITS) * m1;
+ mask[col] = (mask[col] >> AOM_BLEND_A64_ROUND_BITS) * m0;
+ }
+ wsrc += wsrc_stride;
+ mask += mask_stride;
+ tmp += tmp_stride;
+ }
+#if CONFIG_HIGHBITDEPTH
+ } else {
+ // Same computation as above on 16-bit samples.
+ const uint16_t *tmp = CONVERT_TO_SHORTPTR(left);
+
+ for (row = 0; row < neighbor_bh; ++row) {
+ for (col = 0; col < overlap; ++col) {
+ const uint8_t m0 = mask1d[col];
+ const uint8_t m1 = AOM_BLEND_A64_MAX_ALPHA - m0;
+ wsrc[col] = (wsrc[col] >> AOM_BLEND_A64_ROUND_BITS) * m0 +
+ (tmp[col] << AOM_BLEND_A64_ROUND_BITS) * m1;
+ mask[col] = (mask[col] >> AOM_BLEND_A64_ROUND_BITS) * m0;
+ }
+ wsrc += wsrc_stride;
+ mask += mask_stride;
+ tmp += tmp_stride;
+ }
+#endif // CONFIG_HIGHBITDEPTH
+ }
+ }
+
+ // Advance to the next neighbour whether or not this one was used.
+ left += neighbor_bh * left_stride;
+ i += mi_step;
+ } while (i < mih);
+ }
+
+ // Final step: wsrc = src * AOM_BLEND_A64_MAX_ALPHA ** 2 minus the neighbour
+ // terms accumulated so far. Only the local wsrc_buf pointer is advanced
+ // here; x->wsrc_buf itself is left unchanged.
+ if (!is_hbd) {
+ const uint8_t *src = x->plane[0].src.buf;
+
+ for (row = 0; row < bh; ++row) {
+ for (col = 0; col < bw; ++col) {
+ wsrc_buf[col] = src[col] * src_scale - wsrc_buf[col];
+ }
+ wsrc_buf += wsrc_stride;
+ src += x->plane[0].src.stride;
+ }
+#if CONFIG_HIGHBITDEPTH
+ } else {
+ const uint16_t *src = CONVERT_TO_SHORTPTR(x->plane[0].src.buf);
+
+ for (row = 0; row < bh; ++row) {
+ for (col = 0; col < bw; ++col) {
+ wsrc_buf[col] = src[col] * src_scale - wsrc_buf[col];
+ }
+ wsrc_buf += wsrc_stride;
+ src += x->plane[0].src.stride;
+ }
+#endif // CONFIG_HIGHBITDEPTH
+ }
+}
+
+#if CONFIG_NCOBMC
+// Compares the RD cost of the current block coded with SIMPLE_TRANSLATION
+// against its cost with the predictor built by
+// av1_build_ncobmc_inter_predictors_sb, and leaves *mbmi and x->skip set for
+// whichever of the two is cheaper.
+// The function rebuilds the predictor and residual for both candidates, so
+// the destination planes and rd state of x are overwritten.
+void av1_check_ncobmc_rd(const struct AV1_COMP *cpi, struct macroblock *x,
+ int mi_row, int mi_col) {
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+ MB_MODE_INFO backup_mbmi;
+ BLOCK_SIZE bsize = mbmi->sb_type;
+ // NOTE(review): the initial value of backup_skip is always overwritten
+ // before it is read.
+ int ref, skip_blk, backup_skip = x->skip;
+ int64_t rd_causal;
+ RD_STATS rd_stats_y, rd_stats_uv;
+ int rate_skip0 = av1_cost_bit(av1_get_skip_prob(cm, xd), 0);
+ int rate_skip1 = av1_cost_bit(av1_get_skip_prob(cm, xd), 1);
+
+ // Recompute the best causal predictor and rd
+ mbmi->motion_mode = SIMPLE_TRANSLATION;
+ set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+ for (ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) {
+ YV12_BUFFER_CONFIG *cfg = get_ref_frame_buffer(cpi, mbmi->ref_frame[ref]);
+ assert(cfg != NULL);
+ av1_setup_pre_planes(xd, ref, cfg, mi_row, mi_col,
+ &xd->block_refs[ref]->sf);
+ }
+ av1_setup_dst_planes(x->e_mbd.plane, bsize,
+ get_frame_new_buffer(&cpi->common), mi_row, mi_col);
+
+ av1_build_inter_predictors_sb(xd, mi_row, mi_col, NULL, bsize);
+
+ // Only plane 0 is subtracted explicitly here; presumably super_block_uvrd
+ // produces the chroma residual itself -- verify.
+ av1_subtract_plane(x, bsize, 0);
+ super_block_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX);
+ super_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX);
+ assert(rd_stats_y.rate != INT_MAX && rd_stats_uv.rate != INT_MAX);
+ if (rd_stats_y.skip && rd_stats_uv.skip) {
+ rd_stats_y.rate = rate_skip1;
+ rd_stats_uv.rate = 0;
+ rd_stats_y.dist = rd_stats_y.sse;
+ rd_stats_uv.dist = rd_stats_uv.sse;
+ // NOTE(review): both planes report skip and the rate/dist are those of a
+ // skipped block, yet skip_blk is 0 here while the next branch uses 1 for
+ // the same rate/dist -- confirm this is intended.
+ skip_blk = 0;
+ } else if (RDCOST(x->rdmult, x->rddiv,
+ (rd_stats_y.rate + rd_stats_uv.rate + rate_skip0),
+ (rd_stats_y.dist + rd_stats_uv.dist)) >
+ RDCOST(x->rdmult, x->rddiv, rate_skip1,
+ (rd_stats_y.sse + rd_stats_uv.sse))) {
+ // Coding the coefficients costs more than signalling skip: force skip.
+ rd_stats_y.rate = rate_skip1;
+ rd_stats_uv.rate = 0;
+ rd_stats_y.dist = rd_stats_y.sse;
+ rd_stats_uv.dist = rd_stats_uv.sse;
+ skip_blk = 1;
+ } else {
+ rd_stats_y.rate += rate_skip0;
+ skip_blk = 0;
+ }
+ // Snapshot the causal result; the mbmi copy is taken after motion_mode was
+ // set to SIMPLE_TRANSLATION above, so restoring it keeps that mode.
+ backup_skip = skip_blk;
+ backup_mbmi = *mbmi;
+ rd_causal = RDCOST(x->rdmult, x->rddiv, (rd_stats_y.rate + rd_stats_uv.rate),
+ (rd_stats_y.dist + rd_stats_uv.dist));
+ // Add the cost of signalling bit 0 of the motion mode flag.
+ rd_causal += RDCOST(x->rdmult, x->rddiv,
+ av1_cost_bit(cm->fc->motion_mode_prob[bsize][0], 0), 0);
+
+ // Check non-causal mode
+ // NOTE(review): the non-causal candidate is signalled with motion_mode
+ // OBMC_CAUSAL -- confirm this naming is intended.
+ mbmi->motion_mode = OBMC_CAUSAL;
+ av1_build_ncobmc_inter_predictors_sb(cm, xd, mi_row, mi_col);
+
+ av1_subtract_plane(x, bsize, 0);
+ super_block_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX);
+ super_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX);
+ assert(rd_stats_y.rate != INT_MAX && rd_stats_uv.rate != INT_MAX);
+ // Same skip decision as for the causal candidate above.
+ if (rd_stats_y.skip && rd_stats_uv.skip) {
+ rd_stats_y.rate = rate_skip1;
+ rd_stats_uv.rate = 0;
+ rd_stats_y.dist = rd_stats_y.sse;
+ rd_stats_uv.dist = rd_stats_uv.sse;
+ skip_blk = 0;
+ } else if (RDCOST(x->rdmult, x->rddiv,
+ (rd_stats_y.rate + rd_stats_uv.rate + rate_skip0),
+ (rd_stats_y.dist + rd_stats_uv.dist)) >
+ RDCOST(x->rdmult, x->rddiv, rate_skip1,
+ (rd_stats_y.sse + rd_stats_uv.sse))) {
+ rd_stats_y.rate = rate_skip1;
+ rd_stats_uv.rate = 0;
+ rd_stats_y.dist = rd_stats_y.sse;
+ rd_stats_uv.dist = rd_stats_uv.sse;
+ skip_blk = 1;
+ } else {
+ rd_stats_y.rate += rate_skip0;
+ skip_blk = 0;
+ }
+
+ // Keep the second candidate only if it is strictly cheaper (including the
+ // cost of signalling bit 1 of the motion mode flag); otherwise restore the
+ // causal snapshot.
+ if (rd_causal >
+ RDCOST(x->rdmult, x->rddiv,
+ rd_stats_y.rate + rd_stats_uv.rate +
+ av1_cost_bit(cm->fc->motion_mode_prob[bsize][0], 1),
+ (rd_stats_y.dist + rd_stats_uv.dist))) {
+ x->skip = skip_blk;
+ } else {
+ *mbmi = backup_mbmi;
+ x->skip = backup_skip;
+ }
+}
+#endif // CONFIG_NCOBMC
+#endif // CONFIG_MOTION_VAR
diff --git a/third_party/aom/av1/encoder/rdopt.h b/third_party/aom/av1/encoder/rdopt.h
new file mode 100644
index 0000000000..a7053b2897
--- /dev/null
+++ b/third_party/aom/av1/encoder/rdopt.h
@@ -0,0 +1,142 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_ENCODER_RDOPT_H_
+#define AV1_ENCODER_RDOPT_H_
+
+#include "av1/common/blockd.h"
+
+#include "av1/encoder/block.h"
+#include "av1/encoder/context_tree.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct TileInfo;
+struct AV1_COMP;
+struct macroblock;
+struct RD_STATS;
+
+#if CONFIG_RD_DEBUG
+// Debug bookkeeping (CONFIG_RD_DEBUG only): adds txb_coeff_cost to the
+// per-plane total in rd_stats->txb_coeff_cost[plane].
+//
+// With CONFIG_VAR_TX the cost is also recorded in
+// rd_stats->txb_coeff_cost_map[plane]: every cell covered by the transform
+// block (tx_size_high_unit x tx_size_wide_unit cells starting at
+// blk_row/blk_col) is cleared and the cost is stored in the top-left cell.
+static INLINE void av1_update_txb_coeff_cost(RD_STATS *rd_stats, int plane,
+                                             TX_SIZE tx_size, int blk_row,
+                                             int blk_col, int txb_coeff_cost) {
+  // The casts silence unused-parameter warnings when CONFIG_VAR_TX is off.
+  (void)blk_row;
+  (void)blk_col;
+  (void)tx_size;
+  rd_stats->txb_coeff_cost[plane] += txb_coeff_cost;
+
+#if CONFIG_VAR_TX
+  {
+    const int txb_h = tx_size_high_unit[tx_size];
+    const int txb_w = tx_size_wide_unit[tx_size];
+    int idx, idy;
+
+    // Validate the whole written region BEFORE touching the map, so that a
+    // bad position trips the assert instead of corrupting memory first.
+    assert(blk_row >= 0 && blk_row < TXB_COEFF_COST_MAP_SIZE);
+    assert(blk_col >= 0 && blk_col < TXB_COEFF_COST_MAP_SIZE);
+    assert(blk_row + txb_h <= TXB_COEFF_COST_MAP_SIZE);
+    assert(blk_col + txb_w <= TXB_COEFF_COST_MAP_SIZE);
+
+    for (idy = 0; idy < txb_h; ++idy)
+      for (idx = 0; idx < txb_w; ++idx)
+        rd_stats->txb_coeff_cost_map[plane][blk_row + idy][blk_col + idx] = 0;
+
+    rd_stats->txb_coeff_cost_map[plane][blk_row][blk_col] = txb_coeff_cost;
+  }
+#endif
+}
+#endif
+
+// Selector passed to av1_dist_block() as its output_status argument.
+// NOTE(review): presumably tells the callee whether the output buffer holds
+// the prediction or the reconstructed (decoded) pixels -- confirm against the
+// implementation in rdopt.c.
+typedef enum OUTPUT_STATUS {
+ OUTPUT_HAS_PREDICTED_PIXELS,
+ OUTPUT_HAS_DECODED_PIXELS
+} OUTPUT_STATUS;
+
+// NOTE: the prototypes below mix the AV1_COMP / MACROBLOCK typedef names with
+// the 'struct AV1_COMP' / 'struct macroblock' spellings. Only the struct tags
+// are forward-declared in this header, so the typedefs have to come from the
+// headers included above.
+
+// Writes its results through out_dist and out_sse.
+void av1_dist_block(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
+ BLOCK_SIZE plane_bsize, int block, int blk_row, int blk_col,
+ TX_SIZE tx_size, int64_t *out_dist, int64_t *out_sse,
+ OUTPUT_STATUS output_status);
+
+// Only declared when CONFIG_PVQ is off or CONFIG_VAR_TX is on.
+#if !CONFIG_PVQ || CONFIG_VAR_TX
+int av1_cost_coeffs(const AV1_COMP *const cpi, MACROBLOCK *x, int plane,
+ int block, TX_SIZE tx_size, const SCAN_ORDER *scan_order,
+ const ENTROPY_CONTEXT *a, const ENTROPY_CONTEXT *l,
+ int use_fast_coef_costing);
+#endif
+void av1_rd_pick_intra_mode_sb(const struct AV1_COMP *cpi, struct macroblock *x,
+ struct RD_STATS *rd_cost, BLOCK_SIZE bsize,
+ PICK_MODE_CONTEXT *ctx, int64_t best_rd);
+
+unsigned int av1_get_sby_perpixel_variance(const AV1_COMP *cpi,
+ const struct buf_2d *ref,
+ BLOCK_SIZE bs);
+// High bit-depth variant; takes the bit depth as the extra 'bd' argument.
+#if CONFIG_HIGHBITDEPTH
+unsigned int av1_high_get_sby_perpixel_variance(const AV1_COMP *cpi,
+ const struct buf_2d *ref,
+ BLOCK_SIZE bs, int bd);
+#endif
+
+// The parameter list depends on CONFIG_SUPERTX (extra returnrate_nocoef
+// out-parameter), so callers must be built with the same setting.
+void av1_rd_pick_inter_mode_sb(const struct AV1_COMP *cpi,
+ struct TileDataEnc *tile_data,
+ struct macroblock *x, int mi_row, int mi_col,
+ struct RD_STATS *rd_cost,
+#if CONFIG_SUPERTX
+ int *returnrate_nocoef,
+#endif // CONFIG_SUPERTX
+ BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
+ int64_t best_rd_so_far);
+
+void av1_rd_pick_inter_mode_sb_seg_skip(
+ const struct AV1_COMP *cpi, struct TileDataEnc *tile_data,
+ struct macroblock *x, int mi_row, int mi_col, struct RD_STATS *rd_cost,
+ BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, int64_t best_rd_so_far);
+
+int av1_internal_image_edge(const struct AV1_COMP *cpi);
+int av1_active_h_edge(const struct AV1_COMP *cpi, int mi_row, int mi_step);
+int av1_active_v_edge(const struct AV1_COMP *cpi, int mi_col, int mi_step);
+int av1_active_edge_sb(const struct AV1_COMP *cpi, int mi_row, int mi_col);
+
+// Same CONFIG_SUPERTX-dependent parameter list as av1_rd_pick_inter_mode_sb.
+void av1_rd_pick_inter_mode_sub8x8(const struct AV1_COMP *cpi,
+ struct TileDataEnc *tile_data,
+ struct macroblock *x, int mi_row, int mi_col,
+ struct RD_STATS *rd_cost,
+#if CONFIG_SUPERTX
+ int *returnrate_nocoef,
+#endif // CONFIG_SUPERTX
+ BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
+ int64_t best_rd_so_far);
+
+#if CONFIG_MOTION_VAR && CONFIG_NCOBMC
+void av1_check_ncobmc_rd(const struct AV1_COMP *cpi, struct macroblock *x,
+ int mi_row, int mi_col);
+#endif // CONFIG_MOTION_VAR && CONFIG_NCOBMC
+
+// Entry points that are only available in CONFIG_SUPERTX builds.
+#if CONFIG_SUPERTX
+#if CONFIG_VAR_TX
+// Results are returned through rd_stats.
+void av1_tx_block_rd_b(const AV1_COMP *cpi, MACROBLOCK *x, TX_SIZE tx_size,
+                       int blk_row, int blk_col, int plane, int block,
+                       int plane_bsize, const ENTROPY_CONTEXT *a,
+                       const ENTROPY_CONTEXT *l, RD_STATS *rd_stats);
+#endif
+
+// Results are returned through rate, distortion, skippable and sse.
+// The last parameter is spelled use_fast_coef_costing to match
+// av1_cost_coeffs(); parameter names in a prototype do not affect callers.
+void av1_txfm_rd_in_plane_supertx(MACROBLOCK *x, const AV1_COMP *cpi, int *rate,
+                                  int64_t *distortion, int *skippable,
+                                  int64_t *sse, int64_t ref_best_rd, int plane,
+                                  BLOCK_SIZE bsize, TX_SIZE tx_size,
+                                  int use_fast_coef_costing);
+#endif  // CONFIG_SUPERTX
+
+// Declared before the extern "C" block is closed so that C++ translation
+// units see this function with C linkage, like every other prototype in this
+// header.
+int av1_tx_type_cost(const AV1_COMP *cpi, const MACROBLOCKD *xd,
+                     BLOCK_SIZE bsize, int plane, TX_SIZE tx_size,
+                     TX_TYPE tx_type);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif // AV1_ENCODER_RDOPT_H_
diff --git a/third_party/aom/av1/encoder/segmentation.c b/third_party/aom/av1/encoder/segmentation.c
new file mode 100644
index 0000000000..b581a61d0a
--- /dev/null
+++ b/third_party/aom/av1/encoder/segmentation.c
@@ -0,0 +1,394 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <limits.h>
+
+#include "aom_mem/aom_mem.h"
+
+#include "av1/common/pred_common.h"
+#include "av1/common/tile_common.h"
+
+#include "av1/encoder/cost.h"
+#include "av1/encoder/segmentation.h"
+#include "av1/encoder/subexp.h"
+
+// Sets the enabled, update_map and update_data flags of *seg to 1.
+void av1_enable_segmentation(struct segmentation *seg) {
+  seg->update_data = 1;
+  seg->update_map = 1;
+  seg->enabled = 1;
+}
+
+// Clears the enabled, update_map and update_data flags of *seg.
+void av1_disable_segmentation(struct segmentation *seg) {
+  seg->update_data = 0;
+  seg->update_map = 0;
+  seg->enabled = 0;
+}
+
+// Stores abs_delta in seg->abs_delta and overwrites the whole
+// seg->feature_data table with the bytes found at feature_data.
+//
+// Exactly sizeof(seg->feature_data) bytes are read from feature_data, so the
+// caller's buffer must be at least that large.
+// NOTE(review): the source is typed 'signed char *'; verify that its layout
+// matches the element type of seg->feature_data.
+void av1_set_segment_data(struct segmentation *seg, signed char *feature_data,
+                          unsigned char abs_delta) {
+  const size_t table_bytes = sizeof(seg->feature_data);
+
+  seg->abs_delta = abs_delta;
+  memcpy(seg->feature_data, feature_data, table_bytes);
+}
+// Clears bit number feature_id in seg->feature_mask[segment_id]; all other
+// bits of the mask are left unchanged.
+void av1_disable_segfeature(struct segmentation *seg, int segment_id,
+                            SEG_LVL_FEATURES feature_id) {
+  const int feature_bit = 1 << feature_id;
+
+  seg->feature_mask[segment_id] &= ~feature_bit;
+}
+
+// Resets the stored value of feature feature_id for segment segment_id to 0.
+// Only seg->feature_data is written; the feature mask is not touched.
+void av1_clear_segdata(struct segmentation *seg, int segment_id,
+ SEG_LVL_FEATURES feature_id) {
+ seg->feature_data[segment_id][feature_id] = 0;
+}
+
+// Based on set of segment counts calculate a probability tree
+//
+// segcounts: read at indices 0..7 (one count per segment id).
+// segment_tree_probs: written at indices 0..6 (one probability per tree node).
+// cur_tree_probs: read at indices 0..6, passed to the savings search as the
+// current probability of each node.
+// probwt: forwarded unchanged to av1_prob_diff_update_savings_search.
+static void calc_segtree_probs(unsigned *segcounts,
+ aom_prob *segment_tree_probs,
+ const aom_prob *cur_tree_probs,
+ const int probwt) {
+ // Work out probabilities of each segment
+ // cc[k] is the total of segments 2k and 2k+1; ccc[k] is the total of
+ // segments 4k .. 4k+3.
+ const unsigned cc[4] = { segcounts[0] + segcounts[1],
+ segcounts[2] + segcounts[3],
+ segcounts[4] + segcounts[5],
+ segcounts[6] + segcounts[7] };
+ const unsigned ccc[2] = { cc[0] + cc[1], cc[2] + cc[3] };
+ int i;
+
+ // Node 0 splits segments 0-3 from 4-7, nodes 1-2 split the pairs inside
+ // each half, nodes 3-6 split the two segments of each pair.
+ segment_tree_probs[0] = get_binary_prob(ccc[0], ccc[1]);
+ segment_tree_probs[1] = get_binary_prob(cc[0], cc[1]);
+ segment_tree_probs[2] = get_binary_prob(cc[2], cc[3]);
+ segment_tree_probs[3] = get_binary_prob(segcounts[0], segcounts[1]);
+ segment_tree_probs[4] = get_binary_prob(segcounts[2], segcounts[3]);
+ segment_tree_probs[5] = get_binary_prob(segcounts[4], segcounts[5]);
+ segment_tree_probs[6] = get_binary_prob(segcounts[6], segcounts[7]);
+
+ for (i = 0; i < 7; i++) {
+ // ct points at the pair of counts that belongs to node i:
+ // i == 0 -> ccc[0..1]
+ // i == 1 -> cc[0..1] (i & 2 == 0)
+ // i == 2 -> cc[2..3] (i & 2 == 2)
+ // i >= 3 -> segcounts[2 * (i - 3) .. 2 * (i - 3) + 1]
+ const unsigned *ct =
+ i == 0 ? ccc : i < 3 ? cc + (i & 2) : segcounts + (i - 3) * 2;
+ av1_prob_diff_update_savings_search(ct, cur_tree_probs[i],
+ &segment_tree_probs[i],
+ DIFF_UPDATE_PROB, probwt);
+ }
+}
+
+// Estimates the cost of coding a segment map from per-segment hit counts
+// (segcounts[0..7]) and the seven node probabilities of the segment tree
+// (probs[0..6]). A subtree only contributes when at least one segment below
+// it was counted.
+static int cost_segmap(unsigned *segcounts, aom_prob *probs) {
+  // Totals for each pair of segments and for each half of the tree.
+  const int n01 = segcounts[0] + segcounts[1];
+  const int n23 = segcounts[2] + segcounts[3];
+  const int n45 = segcounts[4] + segcounts[5];
+  const int n67 = segcounts[6] + segcounts[7];
+  const int n_low = n01 + n23;
+  const int n_high = n45 + n67;
+
+  // Root node: low half (segments 0-3) versus high half (segments 4-7).
+  int total =
+      n_low * av1_cost_zero(probs[0]) + n_high * av1_cost_one(probs[0]);
+
+  // Low half: node 1, then leaves under nodes 3 and 4.
+  if (n_low > 0) {
+    total += n01 * av1_cost_zero(probs[1]) + n23 * av1_cost_one(probs[1]);
+
+    if (n01 > 0) {
+      total += segcounts[0] * av1_cost_zero(probs[3]) +
+               segcounts[1] * av1_cost_one(probs[3]);
+    }
+    if (n23 > 0) {
+      total += segcounts[2] * av1_cost_zero(probs[4]) +
+               segcounts[3] * av1_cost_one(probs[4]);
+    }
+  }
+
+  // High half: node 2, then leaves under nodes 5 and 6.
+  if (n_high > 0) {
+    total += n45 * av1_cost_zero(probs[2]) + n67 * av1_cost_one(probs[2]);
+
+    if (n45 > 0) {
+      total += segcounts[4] * av1_cost_zero(probs[5]) +
+               segcounts[5] * av1_cost_one(probs[5]);
+    }
+    if (n67 > 0) {
+      total += segcounts[6] * av1_cost_zero(probs[6]) +
+               segcounts[7] * av1_cost_one(probs[6]);
+    }
+  }
+
+  return total;
+}
+
+// Accounts for one coded block in the segment statistics.
+//
+// mi points at the block's mode info; bw/bh are its size in mi units and
+// (mi_row, mi_col) its position. Blocks that start outside the frame are
+// ignored.
+// no_pred_segcounts[id] is always incremented. On non-key frames the block's
+// seg_id_predicted flag is also written, temporal_predictor_count is updated
+// for the block's prediction context, and t_unpred_seg_counts[id] is
+// incremented when the temporal prediction missed.
+// Side effect: xd->mi and the position state set by set_mi_row_col are
+// overwritten.
+static void count_segs(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ const TileInfo *tile, MODE_INFO **mi,
+ unsigned *no_pred_segcounts,
+ unsigned (*temporal_predictor_count)[2],
+ unsigned *t_unpred_seg_counts, int bw, int bh,
+ int mi_row, int mi_col) {
+ int segment_id;
+
+ if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
+
+ xd->mi = mi;
+ segment_id = xd->mi[0]->mbmi.segment_id;
+
+ // xd must describe this block before av1_get_pred_context_seg_id(xd) is
+ // called below.
+ set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw,
+#if CONFIG_DEPENDENT_HORZTILES
+ cm->dependent_horz_tiles,
+#endif // CONFIG_DEPENDENT_HORZTILES
+ cm->mi_rows, cm->mi_cols);
+
+ // Count the number of hits on each segment with no prediction
+ no_pred_segcounts[segment_id]++;
+
+ // Temporal prediction not allowed on key frames
+ if (cm->frame_type != KEY_FRAME) {
+ const BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
+ // Test to see if the segment id matches the predicted value.
+ const int pred_segment_id =
+ get_segment_id(cm, cm->last_frame_seg_map, bsize, mi_row, mi_col);
+ const int pred_flag = pred_segment_id == segment_id;
+ const int pred_context = av1_get_pred_context_seg_id(xd);
+
+ // Store the prediction status for this mb and update counts
+ // as appropriate
+ xd->mi[0]->mbmi.seg_id_predicted = pred_flag;
+ temporal_predictor_count[pred_context][pred_flag]++;
+
+ // Update the "unpredicted" segment count
+ if (!pred_flag) t_unpred_seg_counts[segment_id]++;
+ }
+}
+
+// Walks the partitioning of the bsize block at (mi_row, mi_col) and calls
+// count_segs() once for every coded block inside it, recursing into split
+// partitions.
+//
+// mi points at the mode info of the block's top-left position. The three
+// count arrays are passed through to count_segs() unchanged.
+// bs (taken from mi_size_wide[bsize]) is used as both the width and the
+// height of the block, so bsize is treated as square; hbs is half of it.
+// With CONFIG_EXT_PARTITION_TYPES the partition comes from get_partition();
+// otherwise it is derived from the sb_type stored at mi[0].
+static void count_segs_sb(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ const TileInfo *tile, MODE_INFO **mi,
+ unsigned *no_pred_segcounts,
+ unsigned (*temporal_predictor_count)[2],
+ unsigned *t_unpred_seg_counts, int mi_row, int mi_col,
+ BLOCK_SIZE bsize) {
+ const int mis = cm->mi_stride;
+ const int bs = mi_size_wide[bsize], hbs = bs / 2;
+#if CONFIG_EXT_PARTITION_TYPES
+ PARTITION_TYPE partition;
+#else
+ int bw, bh;
+#endif // CONFIG_EXT_PARTITION_TYPES
+
+ // Nothing to count for blocks that start outside the frame.
+ if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
+
+#if CONFIG_EXT_PARTITION_TYPES
+ // 8x8 blocks are always counted as a single unit.
+ if (bsize == BLOCK_8X8)
+ partition = PARTITION_NONE;
+ else
+ partition = get_partition(cm, mi_row, mi_col, bsize);
+ // In every case below, 'mi + hbs' addresses the right half and
+ // 'mi + hbs * mis' the bottom half of the block.
+ switch (partition) {
+ case PARTITION_NONE:
+ count_segs(cm, xd, tile, mi, no_pred_segcounts, temporal_predictor_count,
+ t_unpred_seg_counts, bs, bs, mi_row, mi_col);
+ break;
+ case PARTITION_HORZ:
+ count_segs(cm, xd, tile, mi, no_pred_segcounts, temporal_predictor_count,
+ t_unpred_seg_counts, bs, hbs, mi_row, mi_col);
+ count_segs(cm, xd, tile, mi + hbs * mis, no_pred_segcounts,
+ temporal_predictor_count, t_unpred_seg_counts, bs, hbs,
+ mi_row + hbs, mi_col);
+ break;
+ case PARTITION_VERT:
+ count_segs(cm, xd, tile, mi, no_pred_segcounts, temporal_predictor_count,
+ t_unpred_seg_counts, hbs, bs, mi_row, mi_col);
+ count_segs(cm, xd, tile, mi + hbs, no_pred_segcounts,
+ temporal_predictor_count, t_unpred_seg_counts, hbs, bs, mi_row,
+ mi_col + hbs);
+ break;
+ case PARTITION_HORZ_A:
+ // Two quarter blocks on top, one full-width half block below.
+ count_segs(cm, xd, tile, mi, no_pred_segcounts, temporal_predictor_count,
+ t_unpred_seg_counts, hbs, hbs, mi_row, mi_col);
+ count_segs(cm, xd, tile, mi + hbs, no_pred_segcounts,
+ temporal_predictor_count, t_unpred_seg_counts, hbs, hbs,
+ mi_row, mi_col + hbs);
+ count_segs(cm, xd, tile, mi + hbs * mis, no_pred_segcounts,
+ temporal_predictor_count, t_unpred_seg_counts, bs, hbs,
+ mi_row + hbs, mi_col);
+ break;
+ case PARTITION_HORZ_B:
+ // One full-width half block on top, two quarter blocks below.
+ count_segs(cm, xd, tile, mi, no_pred_segcounts, temporal_predictor_count,
+ t_unpred_seg_counts, bs, hbs, mi_row, mi_col);
+ count_segs(cm, xd, tile, mi + hbs * mis, no_pred_segcounts,
+ temporal_predictor_count, t_unpred_seg_counts, hbs, hbs,
+ mi_row + hbs, mi_col);
+ count_segs(cm, xd, tile, mi + hbs + hbs * mis, no_pred_segcounts,
+ temporal_predictor_count, t_unpred_seg_counts, hbs, hbs,
+ mi_row + hbs, mi_col + hbs);
+ break;
+ case PARTITION_VERT_A:
+ // Two quarter blocks on the left, one full-height half block on the right.
+ count_segs(cm, xd, tile, mi, no_pred_segcounts, temporal_predictor_count,
+ t_unpred_seg_counts, hbs, hbs, mi_row, mi_col);
+ count_segs(cm, xd, tile, mi + hbs * mis, no_pred_segcounts,
+ temporal_predictor_count, t_unpred_seg_counts, hbs, hbs,
+ mi_row + hbs, mi_col);
+ count_segs(cm, xd, tile, mi + hbs, no_pred_segcounts,
+ temporal_predictor_count, t_unpred_seg_counts, hbs, bs, mi_row,
+ mi_col + hbs);
+ break;
+ case PARTITION_VERT_B:
+ // One full-height half block on the left, two quarter blocks on the right.
+ count_segs(cm, xd, tile, mi, no_pred_segcounts, temporal_predictor_count,
+ t_unpred_seg_counts, hbs, bs, mi_row, mi_col);
+ count_segs(cm, xd, tile, mi + hbs, no_pred_segcounts,
+ temporal_predictor_count, t_unpred_seg_counts, hbs, hbs,
+ mi_row, mi_col + hbs);
+ count_segs(cm, xd, tile, mi + hbs + hbs * mis, no_pred_segcounts,
+ temporal_predictor_count, t_unpred_seg_counts, hbs, hbs,
+ mi_row + hbs, mi_col + hbs);
+ break;
+ case PARTITION_SPLIT: {
+ const BLOCK_SIZE subsize = subsize_lookup[PARTITION_SPLIT][bsize];
+ int n;
+
+ assert(num_8x8_blocks_wide_lookup[mi[0]->mbmi.sb_type] < bs &&
+ num_8x8_blocks_high_lookup[mi[0]->mbmi.sb_type] < bs);
+
+ // Recurse into the four quadrants: n & 1 selects the column, n >> 1 the
+ // row.
+ for (n = 0; n < 4; n++) {
+ const int mi_dc = hbs * (n & 1);
+ const int mi_dr = hbs * (n >> 1);
+
+ count_segs_sb(cm, xd, tile, &mi[mi_dr * mis + mi_dc], no_pred_segcounts,
+ temporal_predictor_count, t_unpred_seg_counts,
+ mi_row + mi_dr, mi_col + mi_dc, subsize);
+ }
+ } break;
+ default: assert(0);
+ }
+#else
+ // Size, in mi units, of the block actually coded at the top-left position.
+ bw = mi_size_wide[mi[0]->mbmi.sb_type];
+ bh = mi_size_high[mi[0]->mbmi.sb_type];
+
+ if (bw == bs && bh == bs) {
+ // Not partitioned.
+ count_segs(cm, xd, tile, mi, no_pred_segcounts, temporal_predictor_count,
+ t_unpred_seg_counts, bs, bs, mi_row, mi_col);
+ } else if (bw == bs && bh < bs) {
+ // Top and bottom halves.
+ count_segs(cm, xd, tile, mi, no_pred_segcounts, temporal_predictor_count,
+ t_unpred_seg_counts, bs, hbs, mi_row, mi_col);
+ count_segs(cm, xd, tile, mi + hbs * mis, no_pred_segcounts,
+ temporal_predictor_count, t_unpred_seg_counts, bs, hbs,
+ mi_row + hbs, mi_col);
+ } else if (bw < bs && bh == bs) {
+ // Left and right halves.
+ count_segs(cm, xd, tile, mi, no_pred_segcounts, temporal_predictor_count,
+ t_unpred_seg_counts, hbs, bs, mi_row, mi_col);
+ count_segs(cm, xd, tile, mi + hbs, no_pred_segcounts,
+ temporal_predictor_count, t_unpred_seg_counts, hbs, bs, mi_row,
+ mi_col + hbs);
+ } else {
+ const BLOCK_SIZE subsize = subsize_lookup[PARTITION_SPLIT][bsize];
+ int n;
+
+ assert(bw < bs && bh < bs);
+
+ // Recurse into the four quadrants: n & 1 selects the column, n >> 1 the
+ // row.
+ for (n = 0; n < 4; n++) {
+ const int mi_dc = hbs * (n & 1);
+ const int mi_dr = hbs * (n >> 1);
+
+ count_segs_sb(cm, xd, tile, &mi[mi_dr * mis + mi_dc], no_pred_segcounts,
+ temporal_predictor_count, t_unpred_seg_counts,
+ mi_row + mi_dr, mi_col + mi_dc, subsize);
+ }
+ }
+#endif // CONFIG_EXT_PARTITION_TYPES
+}
+
+// Decides whether the segment map of the current frame is cheaper to code
+// directly or with temporal prediction, and records the decision in
+// cm->seg.temporal_update (1 = temporal prediction, 0 = direct coding).
+//
+// cm->counts.seg is zeroed and refilled with fresh statistics as a side
+// effect. xd is handed to count_segs_sb(), which overwrites its per-block
+// state while scanning the frame.
+void av1_choose_segmap_coding_method(AV1_COMMON *cm, MACROBLOCKD *xd) {
+  struct segmentation *seg = &cm->seg;
+  struct segmentation_probs *segp = &cm->fc->seg;
+
+  int no_pred_cost;
+  // Stays at INT_MAX (never selected) unless temporal prediction is costed.
+  int t_pred_cost = INT_MAX;
+
+  int i, tile_col, tile_row, mi_row, mi_col;
+#if CONFIG_TILE_GROUPS
+  const int probwt = cm->num_tg;
+#else
+  const int probwt = 1;
+#endif
+
+  unsigned(*temporal_predictor_count)[2] = cm->counts.seg.pred;
+  unsigned *no_pred_segcounts = cm->counts.seg.tree_total;
+  unsigned *t_unpred_seg_counts = cm->counts.seg.tree_mispred;
+
+  aom_prob no_pred_tree[SEG_TREE_PROBS];
+  aom_prob t_pred_tree[SEG_TREE_PROBS];
+  aom_prob t_nopred_prob[PREDICTION_PROBS];
+
+  // We are about to recompute all the segment counts, so zero the accumulators.
+  av1_zero(cm->counts.seg);
+
+  // First of all generate stats regarding how well the last segment map
+  // predicts this one
+  for (tile_row = 0; tile_row < cm->tile_rows; tile_row++) {
+    TileInfo tile_info;
+    av1_tile_set_row(&tile_info, cm, tile_row);
+    for (tile_col = 0; tile_col < cm->tile_cols; tile_col++) {
+      MODE_INFO **mi_ptr;
+      av1_tile_set_col(&tile_info, cm, tile_col);
+#if CONFIG_TILE_GROUPS && CONFIG_DEPENDENT_HORZTILES
+      av1_tile_set_tg_boundary(&tile_info, cm, tile_row, tile_col);
+#endif
+      // Visit the tile in steps of cm->mib_size mi units in both directions,
+      // counting one cm->sb_size block per step.
+      mi_ptr = cm->mi_grid_visible + tile_info.mi_row_start * cm->mi_stride +
+               tile_info.mi_col_start;
+      for (mi_row = tile_info.mi_row_start; mi_row < tile_info.mi_row_end;
+           mi_row += cm->mib_size, mi_ptr += cm->mib_size * cm->mi_stride) {
+        MODE_INFO **mi = mi_ptr;
+        for (mi_col = tile_info.mi_col_start; mi_col < tile_info.mi_col_end;
+             mi_col += cm->mib_size, mi += cm->mib_size) {
+          count_segs_sb(cm, xd, &tile_info, mi, no_pred_segcounts,
+                        temporal_predictor_count, t_unpred_seg_counts, mi_row,
+                        mi_col, cm->sb_size);
+        }
+      }
+    }
+  }
+
+  // Work out probability tree for coding segments without prediction
+  // and the cost.
+  calc_segtree_probs(no_pred_segcounts, no_pred_tree, segp->tree_probs, probwt);
+  no_pred_cost = cost_segmap(no_pred_segcounts, no_pred_tree);
+
+  // Temporal prediction is only costed for frames that are not intra-only and
+  // when error resilient mode is off.
+  if (!frame_is_intra_only(cm) && !cm->error_resilient_mode) {
+    // Work out probability tree for coding those segments not
+    // predicted using the temporal method and the cost.
+    calc_segtree_probs(t_unpred_seg_counts, t_pred_tree, segp->tree_probs,
+                       probwt);
+    t_pred_cost = cost_segmap(t_unpred_seg_counts, t_pred_tree);
+
+    // Add in the cost of the signaling for each prediction context.
+    for (i = 0; i < PREDICTION_PROBS; i++) {
+      const int count0 = temporal_predictor_count[i][0];
+      const int count1 = temporal_predictor_count[i][1];
+
+      t_nopred_prob[i] = get_binary_prob(count0, count1);
+      av1_prob_diff_update_savings_search(
+          temporal_predictor_count[i], segp->pred_probs[i], &t_nopred_prob[i],
+          DIFF_UPDATE_PROB, probwt);
+
+      // Add in the predictor signaling cost
+      t_pred_cost += count0 * av1_cost_zero(t_nopred_prob[i]) +
+                     count1 * av1_cost_one(t_nopred_prob[i]);
+    }
+  }
+
+  // Now choose which coding method to use.
+  if (t_pred_cost < no_pred_cost) {
+    assert(!cm->error_resilient_mode);
+    seg->temporal_update = 1;
+  } else {
+    seg->temporal_update = 0;
+  }
+}
+
+// Puts cm->seg back into its default state: the enabled, update_map and
+// update_data flags are cleared, then av1_clearall_segfeatures() is applied.
+void av1_reset_segment_features(AV1_COMMON *cm) {
+  struct segmentation *const seg_state = &cm->seg;
+
+  // Clears enabled, update_map and update_data.
+  av1_disable_segmentation(seg_state);
+  av1_clearall_segfeatures(seg_state);
+}
diff --git a/third_party/aom/av1/encoder/segmentation.h b/third_party/aom/av1/encoder/segmentation.h
new file mode 100644
index 0000000000..c1491ca2af
--- /dev/null
+++ b/third_party/aom/av1/encoder/segmentation.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_ENCODER_SEGMENTATION_H_
+#define AV1_ENCODER_SEGMENTATION_H_
+
+#include "av1/common/blockd.h"
+#include "av1/encoder/encoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Turn segmentation on or off for the given segmentation state.
+void av1_enable_segmentation(struct segmentation *seg);
+void av1_disable_segmentation(struct segmentation *seg);
+
+// Per-segment, per-feature helpers: the first operates on the feature mask of
+// segment_id, the second on the stored feature data.
+void av1_disable_segfeature(struct segmentation *seg, int segment_id,
+ SEG_LVL_FEATURES feature_id);
+void av1_clear_segdata(struct segmentation *seg, int segment_id,
+ SEG_LVL_FEATURES feature_id);
+
+// The values given for each segment can be either deltas (from the default
+// value chosen for the frame) or absolute values.
+//
+// Valid range for abs values is (0-127 for MB_LVL_ALT_Q), (0-63 for
+// SEGMENT_ALT_LF)
+// Valid range for delta values are (+/-127 for MB_LVL_ALT_Q), (+/-63 for
+// SEGMENT_ALT_LF)
+//
+// abs_delta selects the interpretation:
+// SEGMENT_DELTADATA - the values are deltas
+// SEGMENT_ABSDATA - the values are used as absolute values
+void av1_set_segment_data(struct segmentation *seg, signed char *feature_data,
+ unsigned char abs_delta);
+
+// Chooses how the segment map of the current frame is coded.
+void av1_choose_segmap_coding_method(AV1_COMMON *cm, MACROBLOCKD *xd);
+
+// Resets the segmentation state in cm to its defaults.
+void av1_reset_segment_features(AV1_COMMON *cm);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AV1_ENCODER_SEGMENTATION_H_
diff --git a/third_party/aom/av1/encoder/speed_features.c b/third_party/aom/av1/encoder/speed_features.c
new file mode 100644
index 0000000000..20c96761b7
--- /dev/null
+++ b/third_party/aom/av1/encoder/speed_features.c
@@ -0,0 +1,506 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <limits.h>
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/speed_features.h"
+#include "av1/encoder/rdopt.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+
+#define MAX_MESH_SPEED 5 // Max speed setting for mesh motion method
+static MESH_PATTERN
+ good_quality_mesh_patterns[MAX_MESH_SPEED + 1][MAX_MESH_STEP] = {
+ { { 64, 8 }, { 28, 4 }, { 15, 1 }, { 7, 1 } },
+ { { 64, 8 }, { 28, 4 }, { 15, 1 }, { 7, 1 } },
+ { { 64, 8 }, { 14, 2 }, { 7, 1 }, { 7, 1 } },
+ { { 64, 16 }, { 24, 8 }, { 12, 4 }, { 7, 1 } },
+ { { 64, 16 }, { 24, 8 }, { 12, 4 }, { 7, 1 } },
+ { { 64, 16 }, { 24, 8 }, { 12, 4 }, { 7, 1 } },
+ };
+static unsigned char good_quality_max_mesh_pct[MAX_MESH_SPEED + 1] = {
+ 50, 25, 15, 5, 1, 1
+};
+
+#if CONFIG_INTRABC
+// TODO(aconverse@google.com): These settings are pretty relaxed, tune them for
+// each speed setting
+static MESH_PATTERN intrabc_mesh_patterns[MAX_MESH_SPEED + 1][MAX_MESH_STEP] = {
+ { { 64, 1 }, { 64, 1 }, { 0, 0 }, { 0, 0 } },
+ { { 64, 1 }, { 64, 1 }, { 0, 0 }, { 0, 0 } },
+ { { 64, 1 }, { 64, 1 }, { 0, 0 }, { 0, 0 } },
+ { { 64, 4 }, { 16, 1 }, { 0, 0 }, { 0, 0 } },
+ { { 64, 4 }, { 16, 1 }, { 0, 0 }, { 0, 0 } },
+ { { 64, 4 }, { 16, 1 }, { 0, 0 }, { 0, 0 } },
+};
+static uint8_t intrabc_max_mesh_pct[MAX_MESH_SPEED + 1] = { 100, 100, 100,
+ 25, 25, 10 };
+#endif
+
+// Intra only frames, golden frames (except alt ref overlays) and
+// alt ref frames tend to be coded at a higher than ambient quality
+static int frame_is_boosted(const AV1_COMP *cpi) {
+ return frame_is_kf_gf_arf(cpi);
+}
+
+// Sets a partition size down to which the auto partition code will always
+// search (can go lower), based on the image dimensions. The logic here
+// is that the extent to which ringing artefacts are offensive depends
+// partly on the screen area over which they propagate. Propagation is
+// limited by transform block size but the screen area taken up by a given block
+// size will be larger for a small image format stretched to full screen.
+static BLOCK_SIZE set_partition_min_limit(AV1_COMMON *const cm) {
+ unsigned int screen_area = (cm->width * cm->height);
+
+ // Select block size based on image format size.
+ if (screen_area < 1280 * 720) {
+ // Formats smaller in area than 720P
+ return BLOCK_4X4;
+ } else if (screen_area < 1920 * 1080) {
+ // Formats >= 720P and < 1080P in area
+ return BLOCK_8X8;
+ } else {
+ // Formats 1080P and up
+ return BLOCK_16X16;
+ }
+}
+
+static void set_good_speed_feature_framesize_dependent(AV1_COMP *cpi,
+ SPEED_FEATURES *sf,
+ int speed) {
+ AV1_COMMON *const cm = &cpi->common;
+
+ if (speed >= 1) {
+ if (AOMMIN(cm->width, cm->height) >= 720) {
+ sf->disable_split_mask =
+ cm->show_frame ? DISABLE_ALL_SPLIT : DISABLE_ALL_INTER_SPLIT;
+ sf->partition_search_breakout_dist_thr = (1 << 23);
+ } else {
+ sf->disable_split_mask = DISABLE_COMPOUND_SPLIT;
+ sf->partition_search_breakout_dist_thr = (1 << 21);
+ }
+ }
+
+ if (speed >= 2) {
+ if (AOMMIN(cm->width, cm->height) >= 720) {
+ sf->disable_split_mask =
+ cm->show_frame ? DISABLE_ALL_SPLIT : DISABLE_ALL_INTER_SPLIT;
+ sf->adaptive_pred_interp_filter = 0;
+ sf->partition_search_breakout_dist_thr = (1 << 24);
+ sf->partition_search_breakout_rate_thr = 120;
+ } else {
+ sf->disable_split_mask = LAST_AND_INTRA_SPLIT_ONLY;
+ sf->partition_search_breakout_dist_thr = (1 << 22);
+ sf->partition_search_breakout_rate_thr = 100;
+ }
+ sf->rd_auto_partition_min_limit = set_partition_min_limit(cm);
+ }
+
+ if (speed >= 3) {
+ if (AOMMIN(cm->width, cm->height) >= 720) {
+ sf->disable_split_mask = DISABLE_ALL_SPLIT;
+ sf->schedule_mode_search = cm->base_qindex < 220 ? 1 : 0;
+ sf->partition_search_breakout_dist_thr = (1 << 25);
+ sf->partition_search_breakout_rate_thr = 200;
+ } else {
+ sf->max_intra_bsize = BLOCK_32X32;
+ sf->disable_split_mask = DISABLE_ALL_INTER_SPLIT;
+ sf->schedule_mode_search = cm->base_qindex < 175 ? 1 : 0;
+ sf->partition_search_breakout_dist_thr = (1 << 23);
+ sf->partition_search_breakout_rate_thr = 120;
+ }
+ }
+
+ // If this is a two pass clip that fits the criteria for animated or
+ // graphics content then reset disable_split_mask (speed >= 4 overrides it
+ // below). Also if the image edge is internal to the coded area.
+ if ((speed >= 1) && (cpi->oxcf.pass == 2) &&
+ ((cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) ||
+ (av1_internal_image_edge(cpi)))) {
+ sf->disable_split_mask = DISABLE_COMPOUND_SPLIT;
+ }
+
+ if (speed >= 4) {
+ if (AOMMIN(cm->width, cm->height) >= 720) {
+ sf->partition_search_breakout_dist_thr = (1 << 26);
+ } else {
+ sf->partition_search_breakout_dist_thr = (1 << 24);
+ }
+ sf->disable_split_mask = DISABLE_ALL_SPLIT;
+ }
+}
+
+static void set_good_speed_feature(AV1_COMP *cpi, AV1_COMMON *cm,
+ SPEED_FEATURES *sf, int speed) {
+ const int boosted = frame_is_boosted(cpi);
+
+ if (speed >= 1) {
+ sf->tx_type_search.fast_intra_tx_type_search = 1;
+ sf->tx_type_search.fast_inter_tx_type_search = 1;
+ }
+
+ if (speed >= 2) {
+ if ((cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) ||
+ av1_internal_image_edge(cpi)) {
+ sf->use_square_partition_only = !frame_is_boosted(cpi);
+ } else {
+ sf->use_square_partition_only = !frame_is_intra_only(cm);
+ }
+
+ sf->less_rectangular_check = 1;
+
+ sf->use_rd_breakout = 1;
+ sf->adaptive_motion_search = 1;
+ sf->mv.auto_mv_step_size = 1;
+ sf->adaptive_rd_thresh = 1;
+ sf->mv.subpel_iters_per_step = 1;
+ sf->mode_skip_start = 10;
+ sf->adaptive_pred_interp_filter = 1;
+
+ sf->recode_loop = ALLOW_RECODE_KFARFGF;
+#if CONFIG_TX64X64
+ sf->intra_y_mode_mask[TX_64X64] = INTRA_DC_H_V;
+ sf->intra_uv_mode_mask[TX_64X64] = INTRA_DC_H_V;
+#endif // CONFIG_TX64X64
+ sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V;
+ sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC_H_V;
+ sf->intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V;
+ sf->intra_uv_mode_mask[TX_16X16] = INTRA_DC_H_V;
+
+ sf->tx_size_search_breakout = 1;
+ sf->partition_search_breakout_rate_thr = 80;
+ sf->tx_type_search.prune_mode = PRUNE_ONE;
+ // Use transform domain distortion.
+ // Note: the var-tx experiment always uses pixel domain distortion.
+ sf->use_transform_domain_distortion = 1;
+#if CONFIG_EXT_INTER
+ sf->disable_wedge_search_var_thresh = 100;
+ sf->fast_wedge_sign_estimate = 1;
+#endif // CONFIG_EXT_INTER
+ }
+
+ if (speed >= 3) {
+ sf->tx_size_search_method =
+ frame_is_boosted(cpi) ? USE_FULL_RD : USE_LARGESTALL;
+ sf->mode_search_skip_flags =
+ (cm->frame_type == KEY_FRAME)
+ ? 0
+ : FLAG_SKIP_INTRA_DIRMISMATCH | FLAG_SKIP_INTRA_BESTINTER |
+ FLAG_SKIP_COMP_BESTINTRA | FLAG_SKIP_INTRA_LOWVAR;
+ sf->disable_filter_search_var_thresh = 100;
+ sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
+ sf->auto_min_max_partition_size = RELAXED_NEIGHBORING_MIN_MAX;
+ sf->allow_partition_search_skip = 1;
+ sf->use_upsampled_references = 0;
+ sf->adaptive_rd_thresh = 2;
+#if CONFIG_EXT_TX
+ sf->tx_type_search.prune_mode = PRUNE_TWO;
+#endif
+ }
+
+ if (speed >= 4) {
+ sf->use_square_partition_only = !frame_is_intra_only(cm);
+ sf->tx_size_search_method =
+ frame_is_intra_only(cm) ? USE_FULL_RD : USE_LARGESTALL;
+ sf->mv.subpel_search_method = SUBPEL_TREE_PRUNED;
+ sf->adaptive_pred_interp_filter = 0;
+ sf->adaptive_mode_search = 1;
+ sf->cb_partition_search = !boosted;
+ sf->cb_pred_filter_search = 1;
+ sf->alt_ref_search_fp = 1;
+ sf->recode_loop = ALLOW_RECODE_KFMAXBW;
+ sf->adaptive_rd_thresh = 3;
+ sf->mode_skip_start = 6;
+#if CONFIG_TX64X64
+ sf->intra_y_mode_mask[TX_64X64] = INTRA_DC;
+ sf->intra_uv_mode_mask[TX_64X64] = INTRA_DC;
+#endif // CONFIG_TX64X64
+ sf->intra_y_mode_mask[TX_32X32] = INTRA_DC;
+ sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC;
+ sf->adaptive_interp_filter_search = 1;
+ }
+
+ if (speed >= 5) {
+ sf->use_square_partition_only = 1;
+ sf->tx_size_search_method = USE_LARGESTALL;
+ sf->mv.search_method = BIGDIA;
+ sf->mv.subpel_search_method = SUBPEL_TREE_PRUNED_MORE;
+ sf->adaptive_rd_thresh = 4;
+ if (cm->frame_type != KEY_FRAME)
+ sf->mode_search_skip_flags |= FLAG_EARLY_TERMINATE;
+ sf->disable_filter_search_var_thresh = 200;
+ sf->use_fast_coef_updates = ONE_LOOP_REDUCED;
+ sf->use_fast_coef_costing = 1;
+ sf->partition_search_breakout_rate_thr = 300;
+ }
+
+ if (speed >= 6) {
+ int i;
+ sf->optimize_coefficients = 0;
+ sf->mv.search_method = HEX;
+ sf->disable_filter_search_var_thresh = 500;
+ for (i = 0; i < TX_SIZES; ++i) {
+ sf->intra_y_mode_mask[i] = INTRA_DC;
+ sf->intra_uv_mode_mask[i] = INTRA_DC;
+ }
+ sf->partition_search_breakout_rate_thr = 500;
+ sf->mv.reduce_first_step_size = 1;
+ sf->simple_model_rd_from_var = 1;
+ }
+ if (speed >= 7) {
+ const int is_keyframe = cm->frame_type == KEY_FRAME;
+ const int frames_since_key = is_keyframe ? 0 : cpi->rc.frames_since_key;
+ sf->default_max_partition_size = BLOCK_32X32;
+ sf->default_min_partition_size = BLOCK_8X8;
+#if CONFIG_TX64X64
+ sf->intra_y_mode_mask[TX_64X64] = INTRA_DC;
+#endif // CONFIG_TX64X64
+ sf->intra_y_mode_mask[TX_32X32] = INTRA_DC;
+ sf->frame_parameter_update = 0;
+ sf->mv.search_method = FAST_HEX;
+ sf->inter_mode_mask[BLOCK_32X32] = INTER_NEAREST_NEAR_NEW;
+ sf->inter_mode_mask[BLOCK_32X64] = INTER_NEAREST;
+ sf->inter_mode_mask[BLOCK_64X32] = INTER_NEAREST;
+ sf->inter_mode_mask[BLOCK_64X64] = INTER_NEAREST;
+#if CONFIG_EXT_PARTITION
+ sf->inter_mode_mask[BLOCK_64X128] = INTER_NEAREST;
+ sf->inter_mode_mask[BLOCK_128X64] = INTER_NEAREST;
+ sf->inter_mode_mask[BLOCK_128X128] = INTER_NEAREST;
+#endif // CONFIG_EXT_PARTITION
+ sf->partition_search_type = REFERENCE_PARTITION;
+ sf->default_min_partition_size = BLOCK_8X8; // Redundant: already set above.
+ sf->reuse_inter_pred_sby = 1;
+ sf->force_frame_boost =
+ is_keyframe ||
+ (frames_since_key % (sf->last_partitioning_redo_frequency << 1) == 1);
+ sf->max_delta_qindex = is_keyframe ? 20 : 15;
+ sf->coeff_prob_appx_step = 4;
+ sf->mode_search_skip_flags |= FLAG_SKIP_INTRA_DIRMISMATCH;
+ }
+}
+
+void av1_set_speed_features_framesize_dependent(AV1_COMP *cpi) {
+ SPEED_FEATURES *const sf = &cpi->sf;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ AV1_COMMON *const cm = &cpi->common;
+ RD_OPT *const rd = &cpi->rd;
+ int i;
+
+// Limit memory usage for high resolutions
+#if CONFIG_EXT_REFS
+ // TODO(zoeliu): Temporary solution to resolve the insufficient RAM issue for
+ // ext-refs. Need to work with @yunqingwang to have a more
+ // effective solution.
+ if (AOMMIN(cm->width, cm->height) > 720) {
+ // Turn off the use of upsampled references for HD resolution
+ sf->use_upsampled_references = 0;
+ } else if ((AOMMIN(cm->width, cm->height) > 540) &&
+ (oxcf->profile != PROFILE_0)) {
+ sf->use_upsampled_references = 0;
+ }
+#else
+ if (AOMMIN(cm->width, cm->height) > 1080) {
+ sf->use_upsampled_references = 0;
+ } else if ((AOMMIN(cm->width, cm->height) > 720) &&
+ (oxcf->profile != PROFILE_0)) {
+ sf->use_upsampled_references = 0;
+ }
+#endif // CONFIG_EXT_REFS
+
+ if (oxcf->mode == GOOD) {
+ set_good_speed_feature_framesize_dependent(cpi, sf, oxcf->speed);
+ }
+
+ if (sf->disable_split_mask == DISABLE_ALL_SPLIT) {
+ sf->adaptive_pred_interp_filter = 0;
+ }
+
+ // For each bit set in disable_split_mask, set thresh_mult_sub8x8 to INT_MAX.
+ for (i = 0; i < MAX_REFS; ++i) {
+ if (sf->disable_split_mask & (1 << i)) {
+ rd->thresh_mult_sub8x8[i] = INT_MAX;
+ }
+ }
+
+ // This is only used in motion vector unit test.
+ if (cpi->oxcf.motion_vector_unit_test == 1)
+ cpi->find_fractional_mv_step = av1_return_max_sub_pixel_mv;
+ else if (cpi->oxcf.motion_vector_unit_test == 2)
+ cpi->find_fractional_mv_step = av1_return_min_sub_pixel_mv;
+}
+
+void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) {
+ SPEED_FEATURES *const sf = &cpi->sf;
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCK *const x = &cpi->td.mb;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ int i;
+
+ // Best quality defaults; set_good_speed_feature() may override them below.
+ sf->frame_parameter_update = 1;
+ sf->mv.search_method = NSTEP;
+ sf->recode_loop = ALLOW_RECODE;
+ sf->mv.subpel_search_method = SUBPEL_TREE;
+ sf->mv.subpel_iters_per_step = 2;
+ sf->mv.subpel_force_stop = 0;
+ sf->optimize_coefficients = !is_lossless_requested(&cpi->oxcf);
+ sf->mv.reduce_first_step_size = 0;
+ sf->coeff_prob_appx_step = 1;
+ sf->mv.auto_mv_step_size = 0;
+ sf->mv.fullpel_search_step_param = 6;
+ sf->comp_inter_joint_search_thresh = BLOCK_4X4;
+ sf->adaptive_rd_thresh = 0;
+ sf->tx_size_search_method = USE_FULL_RD;
+ sf->adaptive_motion_search = 0;
+ sf->adaptive_pred_interp_filter = 0;
+ sf->adaptive_mode_search = 0;
+ sf->cb_pred_filter_search = 0;
+ sf->cb_partition_search = 0;
+ sf->alt_ref_search_fp = 0;
+ sf->partition_search_type = SEARCH_PARTITION;
+ sf->tx_type_search.prune_mode = NO_PRUNE;
+ sf->tx_type_search.fast_intra_tx_type_search = 0;
+ sf->tx_type_search.fast_inter_tx_type_search = 0;
+ sf->less_rectangular_check = 0;
+ sf->use_square_partition_only = 0;
+ sf->auto_min_max_partition_size = NOT_IN_USE;
+ sf->rd_auto_partition_min_limit = BLOCK_4X4;
+ sf->default_max_partition_size = BLOCK_LARGEST;
+ sf->default_min_partition_size = BLOCK_4X4;
+ sf->adjust_partitioning_from_last_frame = 0;
+ sf->last_partitioning_redo_frequency = 4;
+ sf->disable_split_mask = 0;
+ sf->mode_search_skip_flags = 0;
+ sf->force_frame_boost = 0;
+ sf->max_delta_qindex = 0;
+ sf->disable_filter_search_var_thresh = 0;
+ sf->adaptive_interp_filter_search = 0;
+ sf->allow_partition_search_skip = 0;
+ sf->use_upsampled_references = 1;
+#if CONFIG_EXT_INTER
+ sf->disable_wedge_search_var_thresh = 0;
+ sf->fast_wedge_sign_estimate = 0;
+#endif // CONFIG_EXT_INTER
+
+ for (i = 0; i < TX_SIZES; i++) {
+ sf->intra_y_mode_mask[i] = INTRA_ALL;
+ sf->intra_uv_mode_mask[i] = INTRA_ALL;
+ }
+ sf->use_rd_breakout = 0;
+ sf->lpf_pick = LPF_PICK_FROM_FULL_IMAGE;
+ sf->use_fast_coef_updates = TWO_LOOP;
+ sf->use_fast_coef_costing = 0;
+ sf->mode_skip_start = MAX_MODES; // Mode index at which mode skip mask set
+ sf->schedule_mode_search = 0;
+ for (i = 0; i < BLOCK_SIZES; ++i) sf->inter_mode_mask[i] = INTER_ALL;
+ sf->max_intra_bsize = BLOCK_LARGEST;
+ sf->reuse_inter_pred_sby = 0;
+ // This setting only takes effect when partition_search_type is set
+ // to FIXED_PARTITION.
+ sf->always_this_block_size = BLOCK_16X16;
+ sf->search_type_check_frequency = 50;
+ // Recode loop tolerance %.
+ sf->recode_tolerance = 25;
+ sf->default_interp_filter = SWITCHABLE;
+ sf->tx_size_search_breakout = 0;
+ sf->partition_search_breakout_dist_thr = 0;
+ sf->partition_search_breakout_rate_thr = 0;
+ sf->simple_model_rd_from_var = 0;
+
+ // Set this at the appropriate speed levels
+ sf->use_transform_domain_distortion = 0;
+
+ if (oxcf->mode == GOOD
+#if CONFIG_XIPHRC
+ || oxcf->pass == 1
+#endif
+ )
+ set_good_speed_feature(cpi, cm, sf, oxcf->speed);
+
+ // sf->partition_search_breakout_dist_thr is set assuming max 64x64
+ // blocks. Normalise this if the blocks are bigger.
+ if (MAX_SB_SIZE_LOG2 > 6) {
+ sf->partition_search_breakout_dist_thr <<= 2 * (MAX_SB_SIZE_LOG2 - 6);
+ }
+
+ cpi->full_search_sad = av1_full_search_sad;
+ cpi->diamond_search_sad = av1_diamond_search_sad;
+
+ sf->allow_exhaustive_searches = 1;
+ int speed = (oxcf->speed > MAX_MESH_SPEED) ? MAX_MESH_SPEED : oxcf->speed;
+ if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION)
+ sf->exhaustive_searches_thresh = (1 << 24);
+ else
+ sf->exhaustive_searches_thresh = (1 << 25);
+ sf->max_exaustive_pct = good_quality_max_mesh_pct[speed];
+ if (speed > 0)
+ sf->exhaustive_searches_thresh = sf->exhaustive_searches_thresh << 1;
+
+ for (i = 0; i < MAX_MESH_STEP; ++i) {
+ sf->mesh_patterns[i].range = good_quality_mesh_patterns[speed][i].range;
+ sf->mesh_patterns[i].interval =
+ good_quality_mesh_patterns[speed][i].interval;
+ }
+#if CONFIG_INTRABC
+ if ((frame_is_intra_only(cm) && cm->allow_screen_content_tools) &&
+ (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION ||
+ cpi->oxcf.content == AOM_CONTENT_SCREEN)) {
+ for (i = 0; i < MAX_MESH_STEP; ++i) {
+ sf->mesh_patterns[i].range = intrabc_mesh_patterns[speed][i].range;
+ sf->mesh_patterns[i].interval = intrabc_mesh_patterns[speed][i].interval;
+ }
+ sf->max_exaustive_pct = intrabc_max_mesh_pct[speed];
+ }
+#endif // CONFIG_INTRABC
+
+#if !CONFIG_XIPHRC
+ // Slow quant, dct and trellis not worthwhile for first pass
+ // so make sure they are always turned off.
+ if (oxcf->pass == 1) sf->optimize_coefficients = 0;
+#endif
+
+ // No recode or coefficient optimization for 1 pass (pass == 0).
+ if (oxcf->pass == 0) {
+ sf->recode_loop = DISALLOW_RECODE;
+ sf->optimize_coefficients = 0;
+ }
+
+ if (sf->mv.subpel_search_method == SUBPEL_TREE) {
+ cpi->find_fractional_mv_step = av1_find_best_sub_pixel_tree;
+ } else if (sf->mv.subpel_search_method == SUBPEL_TREE_PRUNED) {
+ cpi->find_fractional_mv_step = av1_find_best_sub_pixel_tree_pruned;
+ } else if (sf->mv.subpel_search_method == SUBPEL_TREE_PRUNED_MORE) {
+ cpi->find_fractional_mv_step = av1_find_best_sub_pixel_tree_pruned_more;
+ } else if (sf->mv.subpel_search_method == SUBPEL_TREE_PRUNED_EVENMORE) {
+ cpi->find_fractional_mv_step = av1_find_best_sub_pixel_tree_pruned_evenmore;
+ }
+
+#if !CONFIG_AOM_QM
+ x->optimize = sf->optimize_coefficients == 1 && oxcf->pass != 1;
+#else
+ // FIXME: trellis not very efficient for quantisation matrices
+ x->optimize = 0;
+#endif
+
+ x->min_partition_size = sf->default_min_partition_size;
+ x->max_partition_size = sf->default_max_partition_size;
+
+ if (!cpi->oxcf.frame_periodic_boost) {
+ sf->max_delta_qindex = 0;
+ }
+
+ // This is only used in motion vector unit test.
+ if (cpi->oxcf.motion_vector_unit_test == 1)
+ cpi->find_fractional_mv_step = av1_return_max_sub_pixel_mv;
+ else if (cpi->oxcf.motion_vector_unit_test == 2)
+ cpi->find_fractional_mv_step = av1_return_min_sub_pixel_mv;
+}
diff --git a/third_party/aom/av1/encoder/speed_features.h b/third_party/aom/av1/encoder/speed_features.h
new file mode 100644
index 0000000000..af54a1a9ae
--- /dev/null
+++ b/third_party/aom/av1/encoder/speed_features.h
@@ -0,0 +1,484 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_ENCODER_SPEED_FEATURES_H_
+#define AV1_ENCODER_SPEED_FEATURES_H_
+
+#include "av1/common/enums.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+enum {
+ INTRA_ALL = (1 << DC_PRED) | (1 << V_PRED) | (1 << H_PRED) | (1 << D45_PRED) |
+ (1 << D135_PRED) | (1 << D117_PRED) | (1 << D153_PRED) |
+ (1 << D207_PRED) | (1 << D63_PRED) |
+#if CONFIG_ALT_INTRA
+ (1 << SMOOTH_PRED) |
+#endif // CONFIG_ALT_INTRA
+ (1 << TM_PRED),
+ INTRA_DC = (1 << DC_PRED),
+ INTRA_DC_TM = (1 << DC_PRED) | (1 << TM_PRED),
+ INTRA_DC_H_V = (1 << DC_PRED) | (1 << V_PRED) | (1 << H_PRED),
+ INTRA_DC_TM_H_V =
+ (1 << DC_PRED) | (1 << TM_PRED) | (1 << V_PRED) | (1 << H_PRED)
+};
+
+#if CONFIG_EXT_INTER
+enum {
+ INTER_ALL = (1 << NEARESTMV) | (1 << NEARMV) | (1 << ZEROMV) | (1 << NEWMV) |
+ (1 << NEAREST_NEARESTMV) | (1 << NEAR_NEARMV) |
+ (1 << NEAREST_NEARMV) | (1 << NEAR_NEARESTMV) | (1 << NEW_NEWMV) |
+ (1 << NEAREST_NEWMV) | (1 << NEAR_NEWMV) | (1 << NEW_NEARMV) |
+ (1 << NEW_NEARESTMV) | (1 << ZERO_ZEROMV),
+ INTER_NEAREST = (1 << NEARESTMV) | (1 << NEAREST_NEARESTMV) |
+ (1 << NEAREST_NEARMV) | (1 << NEAR_NEARESTMV) |
+ (1 << NEW_NEARESTMV) | (1 << NEAREST_NEWMV),
+ INTER_NEAREST_NEW = (1 << NEARESTMV) | (1 << NEWMV) |
+ (1 << NEAREST_NEARESTMV) | (1 << NEW_NEWMV) |
+ (1 << NEAR_NEARESTMV) | (1 << NEAREST_NEARMV) |
+ (1 << NEW_NEARESTMV) | (1 << NEAREST_NEWMV) |
+ (1 << NEW_NEARMV) | (1 << NEAR_NEWMV),
+ INTER_NEAREST_ZERO = (1 << NEARESTMV) | (1 << ZEROMV) |
+ (1 << NEAREST_NEARESTMV) | (1 << ZERO_ZEROMV) |
+ (1 << NEAREST_NEARMV) | (1 << NEAR_NEARESTMV) |
+ (1 << NEAREST_NEWMV) | (1 << NEW_NEARESTMV),
+ INTER_NEAREST_NEW_ZERO =
+ (1 << NEARESTMV) | (1 << ZEROMV) | (1 << NEWMV) |
+ (1 << NEAREST_NEARESTMV) | (1 << ZERO_ZEROMV) | (1 << NEW_NEWMV) |
+ (1 << NEAREST_NEARMV) | (1 << NEAR_NEARESTMV) | (1 << NEW_NEARESTMV) |
+ (1 << NEAREST_NEWMV) | (1 << NEW_NEARMV) | (1 << NEAR_NEWMV),
+ INTER_NEAREST_NEAR_NEW =
+ (1 << NEARESTMV) | (1 << NEARMV) | (1 << NEWMV) |
+ (1 << NEAREST_NEARESTMV) | (1 << NEW_NEWMV) | (1 << NEAREST_NEARMV) |
+ (1 << NEAR_NEARESTMV) | (1 << NEW_NEARESTMV) | (1 << NEAREST_NEWMV) |
+ (1 << NEW_NEARMV) | (1 << NEAR_NEWMV) | (1 << NEAR_NEARMV),
+ INTER_NEAREST_NEAR_ZERO =
+ (1 << NEARESTMV) | (1 << NEARMV) | (1 << ZEROMV) |
+ (1 << NEAREST_NEARESTMV) | (1 << ZERO_ZEROMV) | (1 << NEAREST_NEARMV) |
+ (1 << NEAR_NEARESTMV) | (1 << NEAREST_NEWMV) | (1 << NEW_NEARESTMV) |
+ (1 << NEW_NEARMV) | (1 << NEAR_NEWMV) | (1 << NEAR_NEARMV),
+};
+#else
+enum {
+ INTER_ALL = (1 << NEARESTMV) | (1 << NEARMV) | (1 << ZEROMV) | (1 << NEWMV),
+ INTER_NEAREST = (1 << NEARESTMV),
+ INTER_NEAREST_NEW = (1 << NEARESTMV) | (1 << NEWMV),
+ INTER_NEAREST_ZERO = (1 << NEARESTMV) | (1 << ZEROMV),
+ INTER_NEAREST_NEW_ZERO = (1 << NEARESTMV) | (1 << ZEROMV) | (1 << NEWMV),
+ INTER_NEAREST_NEAR_NEW = (1 << NEARESTMV) | (1 << NEARMV) | (1 << NEWMV),
+ INTER_NEAREST_NEAR_ZERO = (1 << NEARESTMV) | (1 << NEARMV) | (1 << ZEROMV),
+};
+#endif // CONFIG_EXT_INTER
+
+enum {
+ DISABLE_ALL_INTER_SPLIT = (1 << THR_COMP_GA) | (1 << THR_COMP_LA) |
+ (1 << THR_ALTR) | (1 << THR_GOLD) | (1 << THR_LAST),
+
+ DISABLE_ALL_SPLIT = (1 << THR_INTRA) | DISABLE_ALL_INTER_SPLIT,
+
+ DISABLE_COMPOUND_SPLIT = (1 << THR_COMP_GA) | (1 << THR_COMP_LA),
+
+ LAST_AND_INTRA_SPLIT_ONLY = (1 << THR_COMP_GA) | (1 << THR_COMP_LA) |
+ (1 << THR_ALTR) | (1 << THR_GOLD)
+};
+
+typedef enum {
+ DIAMOND = 0,
+ NSTEP = 1,
+ HEX = 2,
+ BIGDIA = 3,
+ SQUARE = 4,
+ FAST_HEX = 5,
+ FAST_DIAMOND = 6
+} SEARCH_METHODS;
+
+typedef enum {
+ // No recode.
+ DISALLOW_RECODE = 0,
+ // Allow recode for KF and exceeding maximum frame bandwidth.
+ ALLOW_RECODE_KFMAXBW = 1,
+ // Allow recode only for KF/ARF/GF frames.
+ ALLOW_RECODE_KFARFGF = 2,
+ // Allow recode for all frames based on bitrate constraints.
+ ALLOW_RECODE = 3,
+} RECODE_LOOP_TYPE;
+
+typedef enum {
+ SUBPEL_TREE = 0,
+ SUBPEL_TREE_PRUNED = 1, // Prunes 1/2-pel searches
+ SUBPEL_TREE_PRUNED_MORE = 2, // Prunes 1/2-pel searches more aggressively
+ SUBPEL_TREE_PRUNED_EVENMORE = 3, // Prunes 1/2- and 1/4-pel searches
+ // Other methods to come
+} SUBPEL_SEARCH_METHODS;
+
+typedef enum {
+ NO_MOTION_THRESHOLD = 0,
+ LOW_MOTION_THRESHOLD = 7
+} MOTION_THRESHOLD;
+
+typedef enum {
+ USE_FULL_RD = 0,
+ USE_LARGESTALL,
+ USE_TX_8X8
+} TX_SIZE_SEARCH_METHOD;
+
+typedef enum {
+ NOT_IN_USE = 0,
+ RELAXED_NEIGHBORING_MIN_MAX = 1
+} AUTO_MIN_MAX_MODE;
+
+typedef enum {
+ // Try the full image with different values.
+ LPF_PICK_FROM_FULL_IMAGE,
+ // Try a small portion of the image with different values.
+ LPF_PICK_FROM_SUBIMAGE,
+ // Estimate the level based on quantizer and frame type
+ LPF_PICK_FROM_Q,
+ // Pick 0 to disable LPF if LPF was enabled last frame
+ LPF_PICK_MINIMAL_LPF
+} LPF_PICK_METHOD;
+
+typedef enum {
+ // Terminate search early based on distortion so far compared to
+ // qp step, distortion in the neighborhood of the frame, etc.
+ FLAG_EARLY_TERMINATE = 1 << 0,
+
+ // Skips comp inter modes if the best so far is an intra mode.
+ FLAG_SKIP_COMP_BESTINTRA = 1 << 1,
+
+ // Skips oblique intra modes if the best so far is an inter mode.
+ FLAG_SKIP_INTRA_BESTINTER = 1 << 3,
+
+ // Skips oblique intra modes at angles 27, 63, 117, 153 if the best
+ // intra so far is not one of the neighboring directions.
+ FLAG_SKIP_INTRA_DIRMISMATCH = 1 << 4,
+
+ // Skips intra modes other than DC_PRED if the source variance is small
+ FLAG_SKIP_INTRA_LOWVAR = 1 << 5,
+} MODE_SEARCH_SKIP_LOGIC;
+
+typedef enum {
+ FLAG_SKIP_EIGHTTAP_REGULAR = 1 << EIGHTTAP_REGULAR,
+ FLAG_SKIP_EIGHTTAP_SMOOTH = 1 << EIGHTTAP_SMOOTH,
+ FLAG_SKIP_MULTITAP_SHARP = 1 << MULTITAP_SHARP,
+} INTERP_FILTER_MASK;
+
+typedef enum {
+ NO_PRUNE = 0,
+ // eliminates one tx type in vertical and horizontal direction
+ PRUNE_ONE = 1,
+#if CONFIG_EXT_TX
+ // eliminates two tx types in each direction
+ PRUNE_TWO = 2,
+#endif
+} TX_TYPE_PRUNE_MODE;
+
+typedef struct {
+ TX_TYPE_PRUNE_MODE prune_mode;
+ int fast_intra_tx_type_search;
+ int fast_inter_tx_type_search;
+} TX_TYPE_SEARCH;
+
+typedef enum {
+ // Search partitions using RD criterion
+ SEARCH_PARTITION,
+
+ // Always use a fixed size partition
+ FIXED_PARTITION,
+
+ REFERENCE_PARTITION,
+
+ // Use an arbitrary partitioning scheme based on source variance within
+ // a 64X64 SB
+ VAR_BASED_PARTITION,
+
+ // Use non-fixed partitions based on source variance
+ SOURCE_VAR_BASED_PARTITION
+} PARTITION_SEARCH_TYPE;
+
+typedef enum {
+ // Does a dry run to see if any of the contexts need to be updated or not,
+ // before the final run.
+ TWO_LOOP = 0,
+
+ // No dry run, also only half the coef contexts and bands are updated.
+ // The rest are not updated at all.
+ ONE_LOOP_REDUCED = 1
+} FAST_COEFF_UPDATE;
+
+typedef struct MV_SPEED_FEATURES {
+ // Motion search method (Diamond, NSTEP, Hex, Big Diamond, Square, etc).
+ SEARCH_METHODS search_method;
+
+ // This parameter controls which step in the n-step process we start at.
+ // It's changed adaptively based on circumstances.
+ int reduce_first_step_size;
+
+ // If this is set to 1, we limit the motion search range to 2 times the
+ // largest motion vector found in the last frame.
+ int auto_mv_step_size;
+
+ // Sub-pixel search method (see SUBPEL_SEARCH_METHODS). SUBPEL_TREE does a
+ // logarithmic search that keeps stepping at 1/2 pixel units until
+ // you stop getting a gain, and then goes on to 1/4 and repeats
+ // the same process. Along the way it skips many diagonals.
+ SUBPEL_SEARCH_METHODS subpel_search_method;
+
+ // Maximum number of steps in logarithmic subpel search before giving up.
+ int subpel_iters_per_step;
+
+ // Control when to stop subpel search
+ int subpel_force_stop;
+
+ // This variable sets the step_param used in full pel motion search.
+ int fullpel_search_step_param;
+} MV_SPEED_FEATURES;
+
+#define MAX_MESH_STEP 4
+
+typedef struct MESH_PATTERN {
+ int range;
+ int interval;
+} MESH_PATTERN;
+
+typedef struct SPEED_FEATURES {
+ MV_SPEED_FEATURES mv;
+
+ // Frame level coding parameter update
+ int frame_parameter_update;
+
+ RECODE_LOOP_TYPE recode_loop;
+
+ // Trellis (dynamic programming) optimization of quantized values (+1, 0).
+ int optimize_coefficients;
+
+ // Always set to 0. If on it enables 0 cost background transmission
+ // (except for the initial transmission of the segmentation). The feature is
+ // disabled because the addition of very large block sizes makes the
+ // backgrounds very cheap to encode, and the segmentation we have
+ // adds overhead.
+ int static_segmentation;
+
+ // If 1 we iterate finding a best reference for 2 ref frames together - via
+ // a log search that iterates 4 times (check around mv for last for best
+ // error of combined predictor then check around mv for alt). If 0 we
+ // just use the best motion vector found for each frame by itself.
+ BLOCK_SIZE comp_inter_joint_search_thresh;
+
+ // This variable is used to cap the maximum number of times we skip testing a
+ // mode to be evaluated. A high value means we will be faster.
+ int adaptive_rd_thresh;
+
+ // Coefficient probability model approximation step size
+ int coeff_prob_appx_step;
+
+ // The threshold is to determine how slow the motion is, it is used when
+ // use_lastframe_partitioning is set to LAST_FRAME_PARTITION_LOW_MOTION
+ MOTION_THRESHOLD lf_motion_threshold;
+
+ // Determine which method we use to determine transform size. We can choose
+ // between options like full rd, largest for prediction size, largest
+ // for intra and model coefs for the rest.
+ TX_SIZE_SEARCH_METHOD tx_size_search_method;
+
+ // After looking at the first set of modes (set by index here), skip
+ // checking modes for reference frames that don't match the reference frame
+ // of the best so far.
+ int mode_skip_start;
+
+ PARTITION_SEARCH_TYPE partition_search_type;
+
+ TX_TYPE_SEARCH tx_type_search;
+
+ // Used if partition_search_type = FIXED_PARTITION
+ BLOCK_SIZE always_this_block_size;
+
+ // Skip rectangular partition test when partition type none gives better
+ // rd than partition type split.
+ int less_rectangular_check;
+
+ // Disable testing non square partitions. (eg 16x32)
+ int use_square_partition_only;
+
+ // Sets min and max partition sizes for this superblock based on the
+ // same superblock in last encoded frame, and the left and above neighbor.
+ AUTO_MIN_MAX_MODE auto_min_max_partition_size;
+ // Ensures the rd based auto partition search will always
+ // go down at least to the specified level.
+ BLOCK_SIZE rd_auto_partition_min_limit;
+
+ // Min and max partition size we enable (block_size) as per auto
+ // min max, but also used by adjust partitioning, and pick_partitioning.
+ BLOCK_SIZE default_min_partition_size;
+ BLOCK_SIZE default_max_partition_size;
+
+ // Whether or not we allow partitions one smaller or one greater than the last
+ // frame's partitioning. Only used if use_lastframe_partitioning is set.
+ int adjust_partitioning_from_last_frame;
+
+ // How frequently we redo the partitioning from scratch. Only used if
+ // use_lastframe_partitioning is set.
+ int last_partitioning_redo_frequency;
+
+ // Disables sub 8x8 blocksizes in different scenarios: Choices are to disable
+ // it always, to allow it for only Last frame and Intra, disable it for all
+ // inter modes or to enable it always.
+ int disable_split_mask;
+
+ // TODO(jingning): combine the related motion search speed features
+ // This allows us to use motion search at other sizes as a starting
+ // point for this motion search and limits the search range around it.
+ int adaptive_motion_search;
+
+ // Flag for allowing some use of exhaustive searches.
+ int allow_exhaustive_searches;
+
+ // Threshold for allowing exhaustive motion search.
+ int exhaustive_searches_thresh;
+
+ // Maximum number of exhaustive searches for a frame.
+ int max_exaustive_pct;
+
+ // Pattern to be used for any exhaustive mesh searches.
+ MESH_PATTERN mesh_patterns[MAX_MESH_STEP];
+
+ int schedule_mode_search;
+
+ // Allows sub 8x8 modes to use the prediction filter that was determined
+ // best for 8x8 mode. If set to 0 we always re check all the filters for
+ // sizes less than 8x8, 1 means we check all filter modes if no 8x8 filter
+ // was selected, and 2 means we use 8 tap if no 8x8 filter mode was selected.
+ int adaptive_pred_interp_filter;
+
+ // Adaptive prediction mode search
+ int adaptive_mode_search;
+
+ // Chessboard pattern prediction filter type search
+ int cb_pred_filter_search;
+
+ int cb_partition_search;
+
+ int alt_ref_search_fp;
+
+ // Use finer quantizer in every other few frames that run variable block
+ // partition type search.
+ int force_frame_boost;
+
+ // Maximally allowed base quantization index fluctuation.
+ int max_delta_qindex;
+
+ // Implements various heuristics to skip searching modes
+ // The heuristics selected are based on flags
+ // defined in the MODE_SEARCH_SKIP_LOGIC enum
+ unsigned int mode_search_skip_flags;
+
+ // A source variance threshold below which filter search is disabled
+ // Choose a very large value (UINT_MAX) to use 8-tap always
+ unsigned int disable_filter_search_var_thresh;
+
+#if CONFIG_EXT_INTER
+ // A source variance threshold below which wedge search is disabled
+ unsigned int disable_wedge_search_var_thresh;
+
+ // Whether fast wedge sign estimate is used
+ int fast_wedge_sign_estimate;
+#endif // CONFIG_EXT_INTER
+
+ // These bit masks allow you to enable or disable intra modes for each
+ // transform size separately.
+ int intra_y_mode_mask[TX_SIZES];
+ int intra_uv_mode_mask[TX_SIZES];
+
+ // These bit masks allow you to enable or disable intra modes for each
+ // prediction block size separately.
+ int intra_y_mode_bsize_mask[BLOCK_SIZES];
+
+ // This variable enables an early break out of mode testing if the model for
+ // rd built from the prediction signal indicates a value that's much
+ // higher than the best rd we've seen so far.
+ int use_rd_breakout;
+
+ // This feature controls how the loop filter level is determined.
+ LPF_PICK_METHOD lpf_pick;
+
+ // This feature limits the number of coefficients updates we actually do
+ // by only looking at counts from 1/2 the bands.
+ FAST_COEFF_UPDATE use_fast_coef_updates;
+
+ // A binary mask indicating if NEARESTMV, NEARMV, ZEROMV, NEWMV
+ // modes are used in order from LSB to MSB for each BLOCK_SIZE.
+ int inter_mode_mask[BLOCK_SIZES];
+
+ // This feature controls whether we do the expensive context update and
+ // calculation in the rd coefficient costing loop.
+ int use_fast_coef_costing;
+
+ // This feature controls the tolerance vs target used in deciding whether to
+ // recode a frame. It has no meaning if recode is disabled.
+ int recode_tolerance;
+
+ // This variable controls the maximum block size where intra blocks can be
+ // used in inter frames.
+ // TODO(aconverse): Fold this into one of the other many mode skips
+ BLOCK_SIZE max_intra_bsize;
+
+ // The frequency that we check if SOURCE_VAR_BASED_PARTITION or
+ // FIXED_PARTITION search type should be used.
+ int search_type_check_frequency;
+
+ // When partition is pre-set, the inter prediction result from pick_inter_mode
+ // can be reused in final block encoding process. It is enabled only for real-
+ // time mode speed 6.
+ int reuse_inter_pred_sby;
+
+ // default interp filter choice
+ InterpFilter default_interp_filter;
+
+ // Early termination in transform size search, which only applies while
+ // tx_size_search_method is USE_FULL_RD.
+ int tx_size_search_breakout;
+
+ // adaptive interp_filter search to allow skip of certain filter types.
+ int adaptive_interp_filter_search;
+
+ // mask for skip evaluation of certain interp_filter type.
+ INTERP_FILTER_MASK interp_filter_search_mask;
+
+ // Partition search early breakout thresholds.
+ int64_t partition_search_breakout_dist_thr;
+ int partition_search_breakout_rate_thr;
+
+ // Allow skipping partition search for still image frame
+ int allow_partition_search_skip;
+
+ // Fast approximation of av1_model_rd_from_var_lapndz
+ int simple_model_rd_from_var;
+
+ // Do sub-pixel search in up-sampled reference frames
+ int use_upsampled_references;
+
+ // Whether to compute distortion in the image domain (slower but
+ // more accurate), or in the transform domain (faster but less accurate).
+ int use_transform_domain_distortion;
+} SPEED_FEATURES;
+
+struct AV1_COMP;
+
+void av1_set_speed_features_framesize_independent(struct AV1_COMP *cpi);
+void av1_set_speed_features_framesize_dependent(struct AV1_COMP *cpi);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AV1_ENCODER_SPEED_FEATURES_H_
diff --git a/third_party/aom/av1/encoder/subexp.c b/third_party/aom/av1/encoder/subexp.c
new file mode 100644
index 0000000000..8960d33414
--- /dev/null
+++ b/third_party/aom/av1/encoder/subexp.c
@@ -0,0 +1,282 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include "aom_dsp/bitwriter.h"
+
+#include "av1/common/common.h"
+#include "av1/common/entropy.h"
+#include "av1/encoder/cost.h"
+#include "av1/encoder/subexp.h"
+
+// Number of bits needed to signal each remapped probability delta, indexed by
+// the value returned from remap_prob(). The entries mirror the code lengths
+// produced by encode_term_subexp(): 5 bits for deltas 0-15, 6 bits for 16-31,
+// 8 bits for 32-63, and 10 or 11 bits beyond that (3 prefix bits plus the 7 or
+// 8 bits written by encode_uniform()).
+static const uint8_t update_bits[255] = {
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+ 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+ 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+ 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 0,
+};
+// Smallest non-zero entry of update_bits[]: the cheapest possible delta code.
+// The savings searches use it to skip work when an update cannot pay off.
+#define MIN_DELP_BITS 5
+
+// Folds v, taken relative to the centre m, onto the non-negative integers.
+// Values more than m above the centre are returned unchanged; the others
+// alternate around the centre, giving even results for v >= m and odd results
+// for v < m.
+static int recenter_nonneg(int v, int m) {
+  const int offset = v - m;
+  if (v > (m << 1)) return v;
+  return (offset >= 0) ? (offset << 1) : ((-offset) << 1) - 1;
+}
+
+// Maps the change from the old probability m to the new probability v onto
+// the index that is written to the bitstream. Both inputs are shifted to a
+// zero-based range, the difference is recentred around the old value (from
+// whichever end of the range keeps the result small) and then passed through
+// map_table[].
+//
+// Precondition: v != m. Identical probabilities recentre to 0, which would
+// index map_table[-1]; every caller in this file only reaches here for a
+// candidate that differs from the old probability.
+//
+// Returns the table entry, which is a valid index into update_bits[].
+static int remap_prob(int v, int m) {
+  int i;
+  static const uint8_t map_table[MAX_PROB - 1] = {
+    // generated by:
+    //   map_table[j] = split_index(j, MAX_PROB - 1, MODULUS_PARAM);
+    20, 21, 22, 23, 24, 25, 0, 26, 27, 28, 29, 30, 31, 32, 33,
+    34, 35, 36, 37, 1, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+    48, 49, 2, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61,
+    3, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 4, 74,
+    75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 5, 86, 87, 88,
+    89, 90, 91, 92, 93, 94, 95, 96, 97, 6, 98, 99, 100, 101, 102,
+    103, 104, 105, 106, 107, 108, 109, 7, 110, 111, 112, 113, 114, 115, 116,
+    117, 118, 119, 120, 121, 8, 122, 123, 124, 125, 126, 127, 128, 129, 130,
+    131, 132, 133, 9, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144,
+    145, 10, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 11,
+    158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 12, 170, 171,
+    172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 13, 182, 183, 184, 185,
+    186, 187, 188, 189, 190, 191, 192, 193, 14, 194, 195, 196, 197, 198, 199,
+    200, 201, 202, 203, 204, 205, 15, 206, 207, 208, 209, 210, 211, 212, 213,
+    214, 215, 216, 217, 16, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227,
+    228, 229, 17, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241,
+    18, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 19,
+  };
+  v--;
+  m--;
+  if ((m << 1) <= MAX_PROB)
+    i = recenter_nonneg(v, m) - 1;
+  else
+    i = recenter_nonneg(MAX_PROB - 1 - v, MAX_PROB - 1 - m) - 1;
+
+  // Catch a violated precondition (v == m gives i == -1) or an out-of-range
+  // probability before it turns into an out-of-bounds table read.
+  assert(i >= 0 && i < (int)sizeof(map_table));
+  return map_table[i];
+}
+
+// Cost of signalling the change from oldp to newp: the bit count looked up in
+// update_bits[] for the remapped delta, scaled by AV1_PROB_COST_SHIFT.
+static int prob_diff_update_cost(aom_prob newp, aom_prob oldp) {
+  const int delta_index = remap_prob(newp, oldp);
+  const int bits = update_bits[delta_index];
+  return bits << AV1_PROB_COST_SHIFT;
+}
+
+// Writes v with a two-length code: values below the short-code threshold
+// ((1 << 8) - 190) take 7 bits; the rest take a 7-bit literal followed by one
+// extra bit carrying the low bit of the excess over the threshold.
+static void encode_uniform(aom_writer *w, int v) {
+  enum { kBits = 8, kShortCodes = (1 << kBits) - 190 };
+  int excess;
+
+  if (v < kShortCodes) {
+    aom_write_literal(w, v, kBits - 1);
+    return;
+  }
+
+  excess = v - kShortCodes;
+  aom_write_literal(w, kShortCodes + (excess >> 1), kBits - 1);
+  aom_write_literal(w, excess & 1, 1);
+}
+
+// Writes a single bit telling whether word >= test and returns that same
+// flag so the caller can branch on it.
+static INLINE int write_bit_gte(aom_writer *w, int word, int test) {
+  const int is_gte = word >= test;
+  aom_write_literal(w, is_gte, 1);
+  return is_gte;
+}
+
+// Writes word with a terminated sub-exponential code. Up to three escape bits
+// select the range: [0, 16) and [16, 32) are followed by a 4-bit literal,
+// [32, 64) by a 5-bit literal, and anything larger by encode_uniform().
+static void encode_term_subexp(aom_writer *w, int word) {
+  if (!write_bit_gte(w, word, 16)) {
+    aom_write_literal(w, word, 4);
+    return;
+  }
+  if (!write_bit_gte(w, word, 32)) {
+    aom_write_literal(w, word - 16, 4);
+    return;
+  }
+  if (!write_bit_gte(w, word, 64)) {
+    aom_write_literal(w, word - 32, 5);
+    return;
+  }
+  encode_uniform(w, word - 64);
+}
+
+// Writes the probability update oldp -> newp: the delta is remapped with
+// remap_prob() and emitted with the sub-exponential code.
+void av1_write_prob_diff_update(aom_writer *w, aom_prob newp, aom_prob oldp) {
+  encode_term_subexp(w, remap_prob(newp, oldp));
+}
+
+// Searches for the probability that saves the most bits when replacing oldp.
+//
+// Candidates start at *bestp and move one step at a time towards oldp. For
+// each one the saving is the cost of coding the counts ct with oldp, minus the
+// cost with the candidate, minus the cost of signalling the update (delta cost
+// plus the update-flag cost derived from upd) multiplied by probwt. The search
+// is skipped entirely when the current cost cannot exceed the cheapest update.
+//
+// On return *bestp holds the winning probability (oldp when nothing saved
+// bits) and the return value is the corresponding saving, never negative.
+int av1_prob_diff_update_savings_search(const unsigned int *ct, aom_prob oldp,
+                                        aom_prob *bestp, aom_prob upd,
+                                        int probwt) {
+  const uint32_t cost_keep = cost_branch256(ct, oldp);
+  const int direction = *bestp > oldp ? -1 : 1;
+  const int flag_cost = av1_cost_one(upd) - av1_cost_zero(upd);
+  aom_prob winner = oldp;
+  int max_savings = 0;
+
+  if (cost_keep >
+      (uint32_t)flag_cost + (MIN_DELP_BITS << AV1_PROB_COST_SHIFT)) {
+    aom_prob cand = *bestp;
+    while (cand != oldp) {
+      const int cost_new = cost_branch256(ct, cand);
+      const int cost_signal = prob_diff_update_cost(cand, oldp) + flag_cost;
+      const int gain =
+          (int)((int64_t)cost_keep - cost_new - cost_signal * probwt);
+      if (gain > max_savings) {
+        max_savings = gain;
+        winner = cand;
+      }
+      cand += direction;
+    }
+  }
+
+  *bestp = winner;
+  return max_savings;
+}
+
+// Model-based counterpart of av1_prob_diff_update_savings_search(): searches
+// for a replacement of the pivot-node probability oldp.
+//
+// Each candidate is costed against the PIVOT_NODE counts plus the counts of
+// nodes UNCONSTRAINED_NODES..ENTROPY_NODES-1, whose probabilities are read
+// from the av1_pareto8_full[] row selected by the candidate. Candidates start
+// at *bestp and advance towards oldp in increments of stepsize.
+//
+// On return *bestp holds the chosen probability (oldp when no candidate gave a
+// positive saving); the return value is the best saving found, never negative.
+int av1_prob_diff_update_savings_search_model(const unsigned int *ct,
+ const aom_prob oldp,
+ aom_prob *bestp, aom_prob upd,
+ int stepsize, int probwt) {
+ int i, old_b, new_b, update_b, savings, bestsavings;
+ int newp;
+ const int step_sign = *bestp > oldp ? -1 : 1;
+ const int step = stepsize * step_sign;
+ const int upd_cost = av1_cost_one(upd) - av1_cost_zero(upd);
+ const aom_prob *newplist, *oldplist;
+ aom_prob bestnewp;
+ // Cost of keeping the current probability: pivot node plus dependent nodes.
+ oldplist = av1_pareto8_full[oldp - 1];
+ old_b = cost_branch256(ct + 2 * PIVOT_NODE, oldp);
+ for (i = UNCONSTRAINED_NODES; i < ENTROPY_NODES; ++i)
+ old_b += cost_branch256(ct + 2 * i, oldplist[i - UNCONSTRAINED_NODES]);
+
+ bestsavings = 0;
+ bestnewp = oldp;
+
+ assert(stepsize > 0);
+
+ // Only search when the current cost exceeds the cheapest possible update.
+ if (old_b > upd_cost + (MIN_DELP_BITS << AV1_PROB_COST_SHIFT)) {
+ for (newp = *bestp; (newp - oldp) * step_sign < 0; newp += step) {
+ // Stepping can leave the range 1..255; such candidates are skipped.
+ if (newp < 1 || newp > 255) continue;
+ newplist = av1_pareto8_full[newp - 1];
+ new_b = cost_branch256(ct + 2 * PIVOT_NODE, newp);
+ for (i = UNCONSTRAINED_NODES; i < ENTROPY_NODES; ++i)
+ new_b += cost_branch256(ct + 2 * i, newplist[i - UNCONSTRAINED_NODES]);
+ update_b = prob_diff_update_cost(newp, oldp) + upd_cost;
+ savings = old_b - new_b - update_b * probwt;
+ if (savings > bestsavings) {
+ bestsavings = savings;
+ bestnewp = newp;
+ }
+ }
+ }
+
+ *bestp = bestnewp;
+ return bestsavings;
+}
+
+#if CONFIG_SUBFRAME_PROB_UPDATE
+// Total cost of coding the count pairs ct[0..n]. The first pair is costed
+// with p; before each later pair the probability is re-derived by merging the
+// original p with the counts accumulated so far.
+static int get_cost(unsigned int ct[][2], aom_prob p, int n) {
+  const int initial_p = p;
+  unsigned int running_ct[2] = { 0, 0 };
+  int total = 0;
+  int buf;
+
+  for (buf = 0; buf <= n; ++buf) {
+    total += cost_branch256(ct[buf], p);
+    running_ct[0] += ct[buf][0];
+    running_ct[1] += ct[buf][1];
+    if (buf == n) break;  // no further pair needs an adapted probability
+    p = av1_merge_probs(initial_p, running_ct, COEF_COUNT_SAT,
+                        COEF_MAX_UPDATE_FACTOR);
+  }
+  return total;
+}
+
+// Sub-frame variant of the savings search: costs are taken from get_cost()
+// over the count pairs ct[0..n]. Candidates run from *bestp towards oldp one
+// step at a time. On return *bestp holds the best candidate (oldp when none
+// saved bits) and the return value is the saving it achieves.
+int av1_prob_update_search_subframe(unsigned int ct[][2], aom_prob oldp,
+                                    aom_prob *bestp, aom_prob upd, int n) {
+  const int cost_keep = get_cost(ct, oldp, n);
+  const int flag_cost = av1_cost_one(upd) - av1_cost_zero(upd);
+  const int direction = *bestp > oldp ? -1 : 1;
+  aom_prob winner = oldp;
+  aom_prob cand = *bestp;
+  int max_savings = 0;
+
+  while (cand != oldp) {
+    const int cost_new = get_cost(ct, cand, n);
+    const int cost_signal = prob_diff_update_cost(cand, oldp) + flag_cost;
+    const int gain = cost_keep - cost_new - cost_signal;
+    if (gain > max_savings) {
+      max_savings = gain;
+      winner = cand;
+    }
+    cand += direction;
+  }
+
+  *bestp = winner;
+  return max_savings;
+}
+
+// Sub-frame, model-based savings search for the pivot-node probability.
+//
+// oldp points at the current model probabilities; they are expanded with
+// av1_model_to_full_probs() and costed with get_cost() over the PIVOT_NODE
+// counts and nodes UNCONSTRAINED_NODES..ENTROPY_NODES-1. Candidates for the
+// pivot start at *bestp and advance towards oldp[PIVOT_NODE] in increments of
+// stepsize.
+//
+// On return *bestp holds the chosen pivot probability (the old one when no
+// candidate saved bits); the return value is the best saving, never negative.
+int av1_prob_update_search_model_subframe(
+ unsigned int ct[ENTROPY_NODES][COEF_PROBS_BUFS][2], const aom_prob *oldp,
+ aom_prob *bestp, aom_prob upd, int stepsize, int n) {
+ int i, old_b, new_b, update_b, savings, bestsavings;
+ int newp;
+ const int step_sign = *bestp > oldp[PIVOT_NODE] ? -1 : 1;
+ const int step = stepsize * step_sign;
+ const int upd_cost = av1_cost_one(upd) - av1_cost_zero(upd);
+ aom_prob bestnewp, newplist[ENTROPY_NODES], oldplist[ENTROPY_NODES];
+ av1_model_to_full_probs(oldp, oldplist);
+ // Seed the candidate list with the model nodes; only the pivot entry is
+ // replaced per candidate below.
+ memcpy(newplist, oldp, sizeof(aom_prob) * UNCONSTRAINED_NODES);
+ for (i = UNCONSTRAINED_NODES, old_b = 0; i < ENTROPY_NODES; ++i)
+ old_b += get_cost(ct[i], oldplist[i], n);
+ old_b += get_cost(ct[PIVOT_NODE], oldplist[PIVOT_NODE], n);
+
+ bestsavings = 0;
+ bestnewp = oldp[PIVOT_NODE];
+
+ assert(stepsize > 0);
+
+ for (newp = *bestp; (newp - oldp[PIVOT_NODE]) * step_sign < 0; newp += step) {
+ // Stepping can leave the range 1..255; such candidates are skipped.
+ if (newp < 1 || newp > 255) continue;
+ newplist[PIVOT_NODE] = newp;
+ // Expanded in place: source and destination are the same array.
+ av1_model_to_full_probs(newplist, newplist);
+ for (i = UNCONSTRAINED_NODES, new_b = 0; i < ENTROPY_NODES; ++i)
+ new_b += get_cost(ct[i], newplist[i], n);
+ new_b += get_cost(ct[PIVOT_NODE], newplist[PIVOT_NODE], n);
+ update_b = prob_diff_update_cost(newp, oldp[PIVOT_NODE]) + upd_cost;
+ savings = old_b - new_b - update_b;
+ if (savings > bestsavings) {
+ bestsavings = savings;
+ bestnewp = newp;
+ }
+ }
+
+ *bestp = bestnewp;
+ return bestsavings;
+}
+#endif // CONFIG_SUBFRAME_PROB_UPDATE
+
+// Conditionally updates *oldp from the binary counts ct. An update flag is
+// always written (coded with DIFF_UPDATE_PROB); when the savings search
+// reports a positive saving the flag is 1, the delta follows it and *oldp is
+// replaced by the new probability, otherwise the flag is 0 and *oldp is left
+// alone.
+void av1_cond_prob_diff_update(aom_writer *w, aom_prob *oldp,
+                               const unsigned int ct[2], int probwt) {
+  const aom_prob upd = DIFF_UPDATE_PROB;
+  aom_prob candidate = get_binary_prob(ct[0], ct[1]);
+  const int gain =
+      av1_prob_diff_update_savings_search(ct, *oldp, &candidate, upd, probwt);
+  const int do_update = gain > 0;
+
+  assert(candidate >= 1);
+  aom_write(w, do_update, upd);
+  if (do_update) {
+    av1_write_prob_diff_update(w, candidate, *oldp);
+    *oldp = candidate;
+  }
+}
+
+// Returns the saving av1_cond_prob_diff_update() would obtain for the counts
+// ct, without writing anything and without modifying *oldp.
+int av1_cond_prob_diff_update_savings(aom_prob *oldp, const unsigned int ct[2],
+                                      int probwt) {
+  aom_prob candidate = get_binary_prob(ct[0], ct[1]);
+  return av1_prob_diff_update_savings_search(ct, *oldp, &candidate,
+                                             DIFF_UPDATE_PROB, probwt);
+}
diff --git a/third_party/aom/av1/encoder/subexp.h b/third_party/aom/av1/encoder/subexp.h
new file mode 100644
index 0000000000..049265cb88
--- /dev/null
+++ b/third_party/aom/av1/encoder/subexp.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_ENCODER_SUBEXP_H_
+#define AV1_ENCODER_SUBEXP_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "aom_dsp/bitwriter.h"
+#include "aom_dsp/prob.h"
+
+// Writes the coded delta that changes probability oldp into newp.
+void av1_write_prob_diff_update(aom_writer *w, aom_prob newp, aom_prob oldp);
+
+void av1_cond_prob_diff_update(aom_writer *w, aom_prob *oldp,
+ const unsigned int ct[2], int probwt);
+
+int av1_prob_diff_update_savings_search(const unsigned int *ct, aom_prob oldp,
+ aom_prob *bestp, aom_prob upd,
+ int probwt);
+
+int av1_prob_diff_update_savings_search_model(const unsigned int *ct,
+ const aom_prob oldp,
+ aom_prob *bestp, aom_prob upd,
+ int stepsize, int probwt);
+
+int av1_cond_prob_diff_update_savings(aom_prob *oldp, const unsigned int ct[2],
+ int probwt);
+#if CONFIG_SUBFRAME_PROB_UPDATE
+int av1_prob_update_search_subframe(unsigned int ct[][2], aom_prob oldp,
+ aom_prob *bestp, aom_prob upd, int n);
+int av1_prob_update_search_model_subframe(
+ unsigned int ct[ENTROPY_NODES][COEF_PROBS_BUFS][2], const aom_prob *oldp,
+ aom_prob *bestp, aom_prob upd, int stepsize, int n);
+#endif // CONFIG_SUBFRAME_PROB_UPDATE
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AV1_ENCODER_SUBEXP_H_
diff --git a/third_party/aom/av1/encoder/temporal_filter.c b/third_party/aom/av1/encoder/temporal_filter.c
new file mode 100644
index 0000000000..de962fe84d
--- /dev/null
+++ b/third_party/aom/av1/encoder/temporal_filter.c
@@ -0,0 +1,719 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+#include <limits.h>
+
+#include "./aom_config.h"
+#include "av1/common/alloccommon.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/common/quant_common.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/odintrin.h"
+#include "av1/encoder/av1_quantize.h"
+#include "av1/encoder/extend.h"
+#include "av1/encoder/firstpass.h"
+#include "av1/encoder/mcomp.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/ratectrl.h"
+#include "av1/encoder/segmentation.h"
+#include "av1/encoder/temporal_filter.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+#include "aom_ports/aom_timer.h"
+#include "aom_scale/aom_scale.h"
+
+// Builds the motion-compensated prediction of one macroblock for all three
+// planes using the motion vector (mv_row, mv_col).
+//
+// The luma prediction is 16x16 and is written to pred[0] with a stride of 16;
+// the U and V predictions are uv_block_width x uv_block_height and are written
+// to pred[256] and pred[512] with a stride of uv_block_width. stride is the
+// luma stride of the reference planes; x and y are forwarded unchanged to the
+// predictor builders. When high bit depth is compiled in and the current
+// buffer is flagged as such, the highbd builders are used instead.
+static void temporal_filter_predictors_mb_c(
+ MACROBLOCKD *xd, uint8_t *y_mb_ptr, uint8_t *u_mb_ptr, uint8_t *v_mb_ptr,
+ int stride, int uv_block_width, int uv_block_height, int mv_row, int mv_col,
+ uint8_t *pred, struct scale_factors *scale, int x, int y) {
+ const int which_mv = 0;
+ const MV mv = { mv_row, mv_col };
+ enum mv_precision mv_precision_uv;
+ int uv_stride;
+ // TODO(angiebird): change plane setting accordingly
+ ConvolveParams conv_params = get_conv_params(which_mv, 0);
+
+#if USE_TEMPORALFILTER_12TAP
+#if CONFIG_DUAL_FILTER
+ const InterpFilter interp_filter[4] = { TEMPORALFILTER_12TAP,
+ TEMPORALFILTER_12TAP,
+ TEMPORALFILTER_12TAP,
+ TEMPORALFILTER_12TAP };
+#else
+ const InterpFilter interp_filter = TEMPORALFILTER_12TAP;
+#endif
+ (void)xd;
+#else
+ const InterpFilter interp_filter = xd->mi[0]->mbmi.interp_filter;
+#endif // USE_TEMPORALFILTER_12TAP
+#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+ WarpTypesAllowed warp_types;
+ memset(&warp_types, 0, sizeof(WarpTypesAllowed));
+#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+
+ // An 8-wide chroma block gets half the luma stride (rounded up) and Q4
+ // vector precision; otherwise chroma uses the luma stride and Q3 precision.
+ if (uv_block_width == 8) {
+ uv_stride = (stride + 1) >> 1;
+ mv_precision_uv = MV_PRECISION_Q4;
+ } else {
+ uv_stride = stride;
+ mv_precision_uv = MV_PRECISION_Q3;
+ }
+
+#if CONFIG_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ av1_highbd_build_inter_predictor(y_mb_ptr, stride, &pred[0], 16, &mv, scale,
+ 16, 16, which_mv, interp_filter,
+#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+ &warp_types, x, y,
+#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+ 0, MV_PRECISION_Q3, x, y, xd);
+
+ av1_highbd_build_inter_predictor(u_mb_ptr, uv_stride, &pred[256],
+ uv_block_width, &mv, scale, uv_block_width,
+ uv_block_height, which_mv, interp_filter,
+#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+ &warp_types, x, y,
+#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+ 1, mv_precision_uv, x, y, xd);
+
+ av1_highbd_build_inter_predictor(v_mb_ptr, uv_stride, &pred[512],
+ uv_block_width, &mv, scale, uv_block_width,
+ uv_block_height, which_mv, interp_filter,
+#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+ &warp_types, x, y,
+#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+ 2, mv_precision_uv, x, y, xd);
+ return;
+ }
+#endif // CONFIG_HIGHBITDEPTH
+ // Low bit depth path: Y, then U, then V.
+ av1_build_inter_predictor(y_mb_ptr, stride, &pred[0], 16, &mv, scale, 16, 16,
+ &conv_params, interp_filter,
+#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+ &warp_types, x, y, 0, 0,
+#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+ MV_PRECISION_Q3, x, y, xd);
+
+ av1_build_inter_predictor(u_mb_ptr, uv_stride, &pred[256], uv_block_width,
+ &mv, scale, uv_block_width, uv_block_height,
+ &conv_params, interp_filter,
+#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+ &warp_types, x, y, 1, 0,
+#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+ mv_precision_uv, x, y, xd);
+
+ av1_build_inter_predictor(v_mb_ptr, uv_stride, &pred[512], uv_block_width,
+ &mv, scale, uv_block_width, uv_block_height,
+ &conv_params, interp_filter,
+#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+ &warp_types, x, y, 2, 0,
+#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+ mv_precision_uv, x, y, xd);
+}
+
+// Accumulates one predicted block into the temporal filter sums.
+//
+// frame1 is the source block (rows `stride` apart) and frame2 the prediction,
+// stored contiguously with block_width samples per row. For every pixel the
+// squared differences over the 3x3 neighbourhood that lies inside the block
+// are summed, scaled by 3 and averaged over the number of contributing
+// neighbours. That value is rounded and shifted right by `strength`, clamped
+// to 16 and turned into a weight (16 - value) * filter_weight, so closer
+// matches weigh more. The weight is added to count[k] and weight * predicted
+// pixel to accumulator[k], where k is the pixel's raster index in the block.
+void av1_temporal_filter_apply_c(uint8_t *frame1, unsigned int stride,
+                                 uint8_t *frame2, unsigned int block_width,
+                                 unsigned int block_height, int strength,
+                                 int filter_weight, unsigned int *accumulator,
+                                 uint16_t *count) {
+  const int rounding = strength > 0 ? 1 << (strength - 1) : 0;
+  const int rows = (int)block_height;
+  const int cols = (int)block_width;
+  int src_pos = 0;  // offset of the current pixel inside frame1
+  unsigned int k = 0;
+  unsigned int r, c;
+
+  for (r = 0; r < block_height; r++) {
+    for (c = 0; c < block_width; c++, k++, src_pos++) {
+      const uint8_t *const pred_px = frame2 + k;
+      const int pixel_value = *pred_px;
+      int sse_sum = 0;
+      int taps = 0;
+      int dy, dx, weight;
+
+      // non-local mean approach
+      for (dy = -1; dy <= 1; ++dy) {
+        const int y = (int)r + dy;
+        if (y < 0 || y >= rows) continue;
+        for (dx = -1; dx <= 1; ++dx) {
+          const int x = (int)c + dx;
+          int diff;
+          if (x < 0 || x >= cols) continue;
+          diff = frame1[src_pos + dy * (int)stride + dx] -
+                 pred_px[dy * (int)block_width + dx];
+          sse_sum += diff * diff;
+          ++taps;
+        }
+      }
+
+      assert(taps > 0);
+
+      weight = (sse_sum * 3) / taps;
+      weight = (weight + rounding) >> strength;
+      if (weight > 16) weight = 16;
+      weight = (16 - weight) * filter_weight;
+
+      count[k] += weight;
+      accumulator[k] += weight * pixel_value;
+    }
+
+    // Skip the padding between the end of this row and the next one.
+    src_pos += stride - block_width;
+  }
+}
+
+#if CONFIG_HIGHBITDEPTH
+// High bit depth version of av1_temporal_filter_apply_c(): both frame
+// pointers are converted with CONVERT_TO_SHORTPTR and read as 16-bit samples.
+//
+// For every pixel the squared differences over the in-block part of the 3x3
+// neighbourhood are summed, scaled by 3 and averaged over the contributing
+// neighbours; the result is rounded, shifted right by `strength`, clamped to
+// 16 and converted into the weight (16 - value) * filter_weight. The weight is
+// added to count[k] and weight * predicted pixel to accumulator[k].
+void av1_highbd_temporal_filter_apply_c(
+    uint8_t *frame1_8, unsigned int stride, uint8_t *frame2_8,
+    unsigned int block_width, unsigned int block_height, int strength,
+    int filter_weight, unsigned int *accumulator, uint16_t *count) {
+  uint16_t *const frame1 = CONVERT_TO_SHORTPTR(frame1_8);
+  uint16_t *const frame2 = CONVERT_TO_SHORTPTR(frame2_8);
+  const int rounding = strength > 0 ? 1 << (strength - 1) : 0;
+  const int rows = (int)block_height;
+  const int cols = (int)block_width;
+  int src_pos = 0;  // offset of the current pixel inside frame1
+  unsigned int k = 0;
+  unsigned int r, c;
+
+  for (r = 0; r < block_height; r++) {
+    for (c = 0; c < block_width; c++, k++, src_pos++) {
+      const uint16_t *const pred_px = frame2 + k;
+      const int pixel_value = *pred_px;
+      int sse_sum = 0;
+      int taps = 0;
+      int dy, dx, weight;
+
+      // non-local mean approach
+      for (dy = -1; dy <= 1; ++dy) {
+        const int y = (int)r + dy;
+        if (y < 0 || y >= rows) continue;
+        for (dx = -1; dx <= 1; ++dx) {
+          const int x = (int)c + dx;
+          int diff;
+          if (x < 0 || x >= cols) continue;
+          diff = frame1[src_pos + dy * (int)stride + dx] -
+                 pred_px[dy * (int)block_width + dx];
+          sse_sum += diff * diff;
+          ++taps;
+        }
+      }
+
+      assert(taps > 0);
+
+      weight = (sse_sum * 3) / taps;
+      weight = (weight + rounding) >> strength;
+      if (weight > 16) weight = 16;
+      weight = (16 - weight) * filter_weight;
+
+      count[k] += weight;
+      accumulator[k] += weight * pixel_value;
+    }
+
+    // Skip the padding between the end of this row and the next one.
+    src_pos += stride - block_width;
+  }
+}
+#endif // CONFIG_HIGHBITDEPTH
+
+// Motion-searches one 16x16 block of frame_ptr_buf against the block at
+// arf_frame_buf (both using the given stride).
+//
+// Plane 0 of the encoder's source and prediction buffers is temporarily
+// redirected to the two blocks, a hex search around the zero vector is run,
+// and the result is refined by cpi->find_fractional_mv_step. The resulting
+// x->best_mv is stored in mi[0]->bmi[0].as_mv[0], the redirected buffers and
+// the mv limits are restored, and the value returned by the fractional search
+// is returned to the caller.
+static int temporal_filter_find_matching_mb_c(AV1_COMP *cpi,
+ uint8_t *arf_frame_buf,
+ uint8_t *frame_ptr_buf,
+ int stride) {
+ MACROBLOCK *const x = &cpi->td.mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv;
+ int step_param;
+ int sadpb = x->sadperbit16;
+ int bestsme = INT_MAX;
+ int distortion;
+ unsigned int sse;
+ int cost_list[5];
+ MvLimits tmp_mv_limits = x->mv_limits;
+
+ MV best_ref_mv1 = { 0, 0 };
+ MV best_ref_mv1_full; /* full-pixel value of best_ref_mv1 */
+
+ // Save input state
+ struct buf_2d src = x->plane[0].src;
+ struct buf_2d pre = xd->plane[0].pre[0];
+
+ best_ref_mv1_full.col = best_ref_mv1.col >> 3;
+ best_ref_mv1_full.row = best_ref_mv1.row >> 3;
+
+ // Setup frame pointers
+ x->plane[0].src.buf = arf_frame_buf;
+ x->plane[0].src.stride = stride;
+ xd->plane[0].pre[0].buf = frame_ptr_buf;
+ xd->plane[0].pre[0].stride = stride;
+
+ step_param = mv_sf->reduce_first_step_size;
+ step_param = AOMMIN(step_param, MAX_MVSEARCH_STEPS - 2);
+
+ // Narrows x->mv_limits for the search; the saved copy is restored below.
+ av1_set_mv_search_range(&x->mv_limits, &best_ref_mv1);
+
+#if CONFIG_REF_MV
+ x->mvcost = x->mv_cost_stack[0];
+ x->nmvjointcost = x->nmv_vec_cost[0];
+ x->mvsadcost = x->mvcost;
+ x->nmvjointsadcost = x->nmvjointcost;
+#endif
+
+ // Ignore mv costing by sending NULL pointer instead of cost arrays
+ av1_hex_search(x, &best_ref_mv1_full, step_param, sadpb, 1,
+ cond_cost_list(cpi, cost_list), &cpi->fn_ptr[BLOCK_16X16], 0,
+ &best_ref_mv1);
+
+ x->mv_limits = tmp_mv_limits;
+
+ // Ignore mv costing by sending NULL pointer instead of cost array
+ bestsme = cpi->find_fractional_mv_step(
+ x, &best_ref_mv1, cpi->common.allow_high_precision_mv, x->errorperbit,
+ &cpi->fn_ptr[BLOCK_16X16], 0, mv_sf->subpel_iters_per_step,
+ cond_cost_list(cpi, cost_list), NULL, NULL, &distortion, &sse, NULL, 0, 0,
+ 0);
+
+ // Hand the chosen vector to the caller through the mode info.
+ x->e_mbd.mi[0]->bmi[0].as_mv[0] = x->best_mv;
+
+ // Restore input state
+ x->plane[0].src = src;
+ xd->plane[0].pre[0] = pre;
+
+ return bestsme;
+}
+
+// Runs the temporal filter over a whole frame, one 16x16 macroblock at a
+// time, and writes the result into cpi->alt_ref_buffer.
+//
+// frames[] holds frame_count candidate frames (NULL entries are skipped) and
+// frames[alt_ref_index] is the frame being filtered. For every macroblock and
+// every frame a filter weight is chosen: 2 for the alt-ref frame itself,
+// otherwise 2, 1 or 0 depending on the motion-search error against the two
+// thresholds below. Frames with a non-zero weight are motion compensated into
+// `predictor` and accumulated; the accumulated sums are then divided by the
+// accumulated weights (with rounding) to produce the output pixels. The
+// prediction buffer pointers of all planes are saved on entry and restored on
+// exit.
+static void temporal_filter_iterate_c(AV1_COMP *cpi,
+ YV12_BUFFER_CONFIG **frames,
+ int frame_count, int alt_ref_index,
+ int strength,
+ struct scale_factors *scale) {
+ int byte;
+ int frame;
+ int mb_col, mb_row;
+ unsigned int filter_weight;
+ int mb_cols = (frames[alt_ref_index]->y_crop_width + 15) >> 4;
+ int mb_rows = (frames[alt_ref_index]->y_crop_height + 15) >> 4;
+ int mb_y_offset = 0;
+ int mb_uv_offset = 0;
+ // Per-macroblock sums, laid out as Y at [0], U at [256] and V at [512].
+ DECLARE_ALIGNED(16, unsigned int, accumulator[16 * 16 * 3]);
+ DECLARE_ALIGNED(16, uint16_t, count[16 * 16 * 3]);
+ MACROBLOCKD *mbd = &cpi->td.mb.e_mbd;
+ YV12_BUFFER_CONFIG *f = frames[alt_ref_index];
+ uint8_t *dst1, *dst2;
+#if CONFIG_HIGHBITDEPTH
+ DECLARE_ALIGNED(16, uint16_t, predictor16[16 * 16 * 3]);
+ DECLARE_ALIGNED(16, uint8_t, predictor8[16 * 16 * 3]);
+ uint8_t *predictor;
+#else
+ DECLARE_ALIGNED(16, uint8_t, predictor[16 * 16 * 3]);
+#endif
+ const int mb_uv_height = 16 >> mbd->plane[1].subsampling_y;
+ const int mb_uv_width = 16 >> mbd->plane[1].subsampling_x;
+
+ // Save input state
+ uint8_t *input_buffer[MAX_MB_PLANE];
+ int i;
+#if CONFIG_HIGHBITDEPTH
+ if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ predictor = CONVERT_TO_BYTEPTR(predictor16);
+ } else {
+ predictor = predictor8;
+ }
+#endif
+
+ for (i = 0; i < MAX_MB_PLANE; i++) input_buffer[i] = mbd->plane[i].pre[0].buf;
+
+ for (mb_row = 0; mb_row < mb_rows; mb_row++) {
+ // Source frames are extended to 16 pixels. This is different than
+ // L/A/G reference frames that have a border of 32 (AV1ENCBORDERINPIXELS)
+ // A 6/8 tap filter is used for motion search. This requires 2 pixels
+ // before and 3 pixels after. So the largest Y mv on a border would
+ // then be 16 - AOM_INTERP_EXTEND. The UV blocks are half the size of the
+ // Y and therefore only extended by 8. The largest mv that a UV block
+ // can support is 8 - AOM_INTERP_EXTEND. A UV mv is half of a Y mv.
+ // (16 - AOM_INTERP_EXTEND) >> 1 which is greater than
+ // 8 - AOM_INTERP_EXTEND.
+ // To keep the mv in play for both Y and UV planes the max that it
+ // can be on a border is therefore 16 - (2*AOM_INTERP_EXTEND+1).
+ cpi->td.mb.mv_limits.row_min =
+ -((mb_row * 16) + (17 - 2 * AOM_INTERP_EXTEND));
+ cpi->td.mb.mv_limits.row_max =
+ ((mb_rows - 1 - mb_row) * 16) + (17 - 2 * AOM_INTERP_EXTEND);
+
+ for (mb_col = 0; mb_col < mb_cols; mb_col++) {
+ int j, k;
+ int stride;
+
+ memset(accumulator, 0, 16 * 16 * 3 * sizeof(accumulator[0]));
+ memset(count, 0, 16 * 16 * 3 * sizeof(count[0]));
+
+ cpi->td.mb.mv_limits.col_min =
+ -((mb_col * 16) + (17 - 2 * AOM_INTERP_EXTEND));
+ cpi->td.mb.mv_limits.col_max =
+ ((mb_cols - 1 - mb_col) * 16) + (17 - 2 * AOM_INTERP_EXTEND);
+
+ for (frame = 0; frame < frame_count; frame++) {
+ const int thresh_low = 10000;
+ const int thresh_high = 20000;
+
+ if (frames[frame] == NULL) continue;
+
+ mbd->mi[0]->bmi[0].as_mv[0].as_mv.row = 0;
+ mbd->mi[0]->bmi[0].as_mv[0].as_mv.col = 0;
+
+ if (frame == alt_ref_index) {
+ filter_weight = 2;
+ } else {
+ // Find best match in this frame by MC
+ int err = temporal_filter_find_matching_mb_c(
+ cpi, frames[alt_ref_index]->y_buffer + mb_y_offset,
+ frames[frame]->y_buffer + mb_y_offset, frames[frame]->y_stride);
+
+ // Assign higher weight to matching MB if its error
+ // score is lower. If not applying MC default behavior
+ // is to weight all MBs equal.
+ filter_weight = err < thresh_low ? 2 : err < thresh_high ? 1 : 0;
+ }
+
+ if (filter_weight != 0) {
+ // Construct the predictors
+ temporal_filter_predictors_mb_c(
+ mbd, frames[frame]->y_buffer + mb_y_offset,
+ frames[frame]->u_buffer + mb_uv_offset,
+ frames[frame]->v_buffer + mb_uv_offset, frames[frame]->y_stride,
+ mb_uv_width, mb_uv_height, mbd->mi[0]->bmi[0].as_mv[0].as_mv.row,
+ mbd->mi[0]->bmi[0].as_mv[0].as_mv.col, predictor, scale,
+ mb_col * 16, mb_row * 16);
+
+#if CONFIG_HIGHBITDEPTH
+ if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ // The filter strength is raised by 2 per bit of depth above 8.
+ int adj_strength = strength + 2 * (mbd->bd - 8);
+ // Apply the filter (YUV)
+ av1_highbd_temporal_filter_apply(
+ f->y_buffer + mb_y_offset, f->y_stride, predictor, 16, 16,
+ adj_strength, filter_weight, accumulator, count);
+ av1_highbd_temporal_filter_apply(
+ f->u_buffer + mb_uv_offset, f->uv_stride, predictor + 256,
+ mb_uv_width, mb_uv_height, adj_strength, filter_weight,
+ accumulator + 256, count + 256);
+ av1_highbd_temporal_filter_apply(
+ f->v_buffer + mb_uv_offset, f->uv_stride, predictor + 512,
+ mb_uv_width, mb_uv_height, adj_strength, filter_weight,
+ accumulator + 512, count + 512);
+ } else {
+ // Apply the filter (YUV)
+ av1_temporal_filter_apply_c(f->y_buffer + mb_y_offset, f->y_stride,
+ predictor, 16, 16, strength,
+ filter_weight, accumulator, count);
+ av1_temporal_filter_apply_c(
+ f->u_buffer + mb_uv_offset, f->uv_stride, predictor + 256,
+ mb_uv_width, mb_uv_height, strength, filter_weight,
+ accumulator + 256, count + 256);
+ av1_temporal_filter_apply_c(
+ f->v_buffer + mb_uv_offset, f->uv_stride, predictor + 512,
+ mb_uv_width, mb_uv_height, strength, filter_weight,
+ accumulator + 512, count + 512);
+ }
+#else
+ // Apply the filter (YUV)
+ av1_temporal_filter_apply_c(f->y_buffer + mb_y_offset, f->y_stride,
+ predictor, 16, 16, strength,
+ filter_weight, accumulator, count);
+ av1_temporal_filter_apply_c(f->u_buffer + mb_uv_offset, f->uv_stride,
+ predictor + 256, mb_uv_width,
+ mb_uv_height, strength, filter_weight,
+ accumulator + 256, count + 256);
+ av1_temporal_filter_apply_c(f->v_buffer + mb_uv_offset, f->uv_stride,
+ predictor + 512, mb_uv_width,
+ mb_uv_height, strength, filter_weight,
+ accumulator + 512, count + 512);
+#endif // CONFIG_HIGHBITDEPTH
+ }
+ }
+
+#if CONFIG_HIGHBITDEPTH
+ if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ uint16_t *dst1_16;
+ uint16_t *dst2_16;
+ // Normalize filter output to produce AltRef frame
+ dst1 = cpi->alt_ref_buffer.y_buffer;
+ dst1_16 = CONVERT_TO_SHORTPTR(dst1);
+ stride = cpi->alt_ref_buffer.y_stride;
+ byte = mb_y_offset;
+ for (i = 0, k = 0; i < 16; i++) {
+ for (j = 0; j < 16; j++, k++) {
+ dst1_16[byte] =
+ (uint16_t)OD_DIVU(accumulator[k] + (count[k] >> 1), count[k]);
+
+ // move to next pixel
+ byte++;
+ }
+
+ byte += stride - 16;
+ }
+
+ dst1 = cpi->alt_ref_buffer.u_buffer;
+ dst2 = cpi->alt_ref_buffer.v_buffer;
+ dst1_16 = CONVERT_TO_SHORTPTR(dst1);
+ dst2_16 = CONVERT_TO_SHORTPTR(dst2);
+ stride = cpi->alt_ref_buffer.uv_stride;
+ byte = mb_uv_offset;
+ for (i = 0, k = 256; i < mb_uv_height; i++) {
+ for (j = 0; j < mb_uv_width; j++, k++) {
+ int m = k + 256;
+
+ // U
+ dst1_16[byte] =
+ (uint16_t)OD_DIVU(accumulator[k] + (count[k] >> 1), count[k]);
+
+ // V
+ dst2_16[byte] =
+ (uint16_t)OD_DIVU(accumulator[m] + (count[m] >> 1), count[m]);
+
+ // move to next pixel
+ byte++;
+ }
+
+ byte += stride - mb_uv_width;
+ }
+ } else {
+ // Normalize filter output to produce AltRef frame
+ dst1 = cpi->alt_ref_buffer.y_buffer;
+ stride = cpi->alt_ref_buffer.y_stride;
+ byte = mb_y_offset;
+ for (i = 0, k = 0; i < 16; i++) {
+ for (j = 0; j < 16; j++, k++) {
+ dst1[byte] =
+ (uint8_t)OD_DIVU(accumulator[k] + (count[k] >> 1), count[k]);
+
+ // move to next pixel
+ byte++;
+ }
+ byte += stride - 16;
+ }
+
+ dst1 = cpi->alt_ref_buffer.u_buffer;
+ dst2 = cpi->alt_ref_buffer.v_buffer;
+ stride = cpi->alt_ref_buffer.uv_stride;
+ byte = mb_uv_offset;
+ for (i = 0, k = 256; i < mb_uv_height; i++) {
+ for (j = 0; j < mb_uv_width; j++, k++) {
+ int m = k + 256;
+
+ // U
+ dst1[byte] =
+ (uint8_t)OD_DIVU(accumulator[k] + (count[k] >> 1), count[k]);
+
+ // V
+ dst2[byte] =
+ (uint8_t)OD_DIVU(accumulator[m] + (count[m] >> 1), count[m]);
+
+ // move to next pixel
+ byte++;
+ }
+ byte += stride - mb_uv_width;
+ }
+ }
+#else
+ // Normalize filter output to produce AltRef frame
+ dst1 = cpi->alt_ref_buffer.y_buffer;
+ stride = cpi->alt_ref_buffer.y_stride;
+ byte = mb_y_offset;
+ for (i = 0, k = 0; i < 16; i++) {
+ for (j = 0; j < 16; j++, k++) {
+ dst1[byte] =
+ (uint8_t)OD_DIVU(accumulator[k] + (count[k] >> 1), count[k]);
+
+ // move to next pixel
+ byte++;
+ }
+ byte += stride - 16;
+ }
+
+ dst1 = cpi->alt_ref_buffer.u_buffer;
+ dst2 = cpi->alt_ref_buffer.v_buffer;
+ stride = cpi->alt_ref_buffer.uv_stride;
+ byte = mb_uv_offset;
+ for (i = 0, k = 256; i < mb_uv_height; i++) {
+ for (j = 0; j < mb_uv_width; j++, k++) {
+ int m = k + 256;
+
+ // U
+ dst1[byte] =
+ (uint8_t)OD_DIVU(accumulator[k] + (count[k] >> 1), count[k]);
+
+ // V
+ dst2[byte] =
+ (uint8_t)OD_DIVU(accumulator[m] + (count[m] >> 1), count[m]);
+
+ // move to next pixel
+ byte++;
+ }
+ byte += stride - mb_uv_width;
+ }
+#endif // CONFIG_HIGHBITDEPTH
+ // Advance to the next macroblock in this row.
+ mb_y_offset += 16;
+ mb_uv_offset += mb_uv_width;
+ }
+ // Advance to the first macroblock of the next row.
+ mb_y_offset += 16 * (f->y_stride - mb_cols);
+ mb_uv_offset += mb_uv_height * f->uv_stride - mb_uv_width * mb_cols;
+ }
+
+ // Restore input state
+ for (i = 0; i < MAX_MB_PLANE; i++) mbd->plane[i].pre[0].buf = input_buffer[i];
+}
+
+// Apply buffer limits and context specific adjustments to arnr filter.
+// Derives the number of frames and the strength used by the ARNR filter.
+//
+// The frame count starts from the configured arnr_max_frames, limited by the
+// frames available after the ARF in the lookahead and by `distance`, and is
+// then capped by group_boost / 150 (the capped value is forced to be odd). The
+// strength starts from the configured arnr_strength, is reduced when the
+// average q is 16 or less, capped by group_boost / 300, and halved for ARFs
+// below GF_ARF_STD in two-pass multi-ARF encoding. Results are returned
+// through *arnr_frames and *arnr_strength.
+static void adjust_arnr_filter(AV1_COMP *cpi, int distance, int group_boost,
+                               int *arnr_frames, int *arnr_strength) {
+  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+  const int frames_after_arf =
+      av1_lookahead_depth(cpi->lookahead) - distance - 1;
+  const int boost_frame_cap = group_boost / 150;
+  const int boost_strength_cap = group_boost / 300;
+  const int q_frame_type =
+      (cpi->common.current_video_frame > 1) ? INTER_FRAME : KEY_FRAME;
+  int frames_fwd = (oxcf->arnr_max_frames - 1) >> 1;
+  int frames_bwd;
+  int q, frames, strength;
+
+  // Define the forward and backwards filter limits for this arnr group.
+  frames_fwd = AOMMIN(frames_fwd, frames_after_arf);
+  frames_fwd = AOMMIN(frames_fwd, distance);
+
+  // For even length filter there is one more frame backward
+  // than forward: e.g. len=6 ==> bbbAff, len=7 ==> bbbAfff.
+  frames_bwd = frames_fwd;
+  if (frames_bwd < distance) frames_bwd += (oxcf->arnr_max_frames + 1) & 0x1;
+
+  // Set the baseline active filter size.
+  frames = frames_bwd + 1 + frames_fwd;
+
+  // Adjust the strength based on active max q.
+  q = (int)av1_convert_qindex_to_q(cpi->rc.avg_frame_qindex[q_frame_type],
+                                   cpi->common.bit_depth);
+  strength = oxcf->arnr_strength;
+  if (q <= 16) {
+    strength -= (16 - q) / 2;
+    if (strength < 0) strength = 0;
+  }
+
+  // Adjust number of frames in filter and strength based on gf boost level.
+  if (frames > boost_frame_cap) {
+    frames = boost_frame_cap;
+    frames += !(frames & 1);
+  }
+  if (strength > boost_strength_cap) strength = boost_strength_cap;
+
+  // Adjustments for second level arf in multi arf case.
+  if (oxcf->pass == 2 && cpi->multi_arf_allowed) {
+    const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+    if (gf_group->rf_level[gf_group->index] != GF_ARF_STD) strength >>= 1;
+  }
+
+  *arnr_frames = frames;
+  *arnr_strength = strength;
+}
+
+// Produces the temporally filtered alt-ref frame.
+//
+// `distance` is the lookahead position of the frame to filter. The number of
+// frames to blend and the filter strength come from adjust_arnr_filter(); the
+// selected lookahead frames are gathered into frames[] (centred on the
+// alt-ref frame) and handed to temporal_filter_iterate_c(), which writes the
+// result into cpi->alt_ref_buffer.
+void av1_temporal_filter(AV1_COMP *cpi, int distance) {
+  RATE_CONTROL *const rc = &cpi->rc;
+  int frame;
+  int frames_to_blur;
+  int start_frame;
+  int strength;
+  int frames_to_blur_backward;
+  int frames_to_blur_forward;
+  struct scale_factors sf;
+  YV12_BUFFER_CONFIG *frames[MAX_LAG_BUFFERS] = { NULL };
+#if CONFIG_EXT_REFS
+  const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+#endif
+
+  // Apply context specific adjustments to the arnr filter parameters.
+  adjust_arnr_filter(cpi, distance, rc->gfu_boost, &frames_to_blur, &strength);
+// TODO(weitinglin): Currently, we enforce the filtering strength on
+//                   extra ARFs' to be zeros. We should investigate in which
+//                   case it is more beneficial to use non-zero strength
+//                   filtering.
+#if CONFIG_EXT_REFS
+  if (gf_group->rf_level[gf_group->index] == GF_ARF_LOW) {
+    strength = 0;
+    frames_to_blur = 1;
+  }
+#endif
+
+#if CONFIG_EXT_REFS
+  // Record whether filtering is effectively disabled for this ARF.
+  if (strength == 0 && frames_to_blur == 1) {
+    cpi->is_arf_filter_off[gf_group->arf_update_idx[gf_group->index]] = 1;
+  } else {
+    cpi->is_arf_filter_off[gf_group->arf_update_idx[gf_group->index]] = 0;
+  }
+#endif
+
+  frames_to_blur_backward = (frames_to_blur / 2);
+  frames_to_blur_forward = ((frames_to_blur - 1) / 2);
+  start_frame = distance + frames_to_blur_forward;
+
+  // Setup frame pointers, NULL indicates frame not included in filter.
+  for (frame = 0; frame < frames_to_blur; ++frame) {
+    const int which_buffer = start_frame - frame;
+    struct lookahead_entry *buf =
+        av1_lookahead_peek(cpi->lookahead, which_buffer);
+    // Do not form &buf->img from a NULL entry; store NULL explicitly so the
+    // per-frame loop can skip it.
+    frames[frames_to_blur - 1 - frame] = (buf != NULL) ? &buf->img : NULL;
+  }
+
+  if (frames_to_blur > 0) {
+// Setup scaling factors. Scaling on each of the arnr frames is not
+// supported.
+// ARF is produced at the native frame size and resized when coded.
+#if CONFIG_HIGHBITDEPTH
+    av1_setup_scale_factors_for_frame(
+        &sf, frames[0]->y_crop_width, frames[0]->y_crop_height,
+        frames[0]->y_crop_width, frames[0]->y_crop_height,
+        cpi->common.use_highbitdepth);
+#else
+    av1_setup_scale_factors_for_frame(
+        &sf, frames[0]->y_crop_width, frames[0]->y_crop_height,
+        frames[0]->y_crop_width, frames[0]->y_crop_height);
+#endif  // CONFIG_HIGHBITDEPTH
+  }
+
+  temporal_filter_iterate_c(cpi, frames, frames_to_blur,
+                            frames_to_blur_backward, strength, &sf);
+}
diff --git a/third_party/aom/av1/encoder/temporal_filter.h b/third_party/aom/av1/encoder/temporal_filter.h
new file mode 100644
index 0000000000..bc0863a638
--- /dev/null
+++ b/third_party/aom/av1/encoder/temporal_filter.h
@@ -0,0 +1,25 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_ENCODER_TEMPORAL_FILTER_H_
+#define AV1_ENCODER_TEMPORAL_FILTER_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void av1_temporal_filter(AV1_COMP *cpi, int distance);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AV1_ENCODER_TEMPORAL_FILTER_H_
diff --git a/third_party/aom/av1/encoder/tokenize.c b/third_party/aom/av1/encoder/tokenize.c
new file mode 100644
index 0000000000..f48493bf89
--- /dev/null
+++ b/third_party/aom/av1/encoder/tokenize.c
@@ -0,0 +1,887 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <math.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "aom_mem/aom_mem.h"
+
+#include "av1/common/entropy.h"
+#include "av1/common/pred_common.h"
+#include "av1/common/scan.h"
+#include "av1/common/seg_common.h"
+
+#include "av1/encoder/cost.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/rdopt.h"
+#include "av1/encoder/tokenize.h"
+
+static const TOKENVALUE dct_cat_lt_10_value_tokens[] = {
+ { 9, 63 }, { 9, 61 }, { 9, 59 }, { 9, 57 }, { 9, 55 }, { 9, 53 }, { 9, 51 },
+ { 9, 49 }, { 9, 47 }, { 9, 45 }, { 9, 43 }, { 9, 41 }, { 9, 39 }, { 9, 37 },
+ { 9, 35 }, { 9, 33 }, { 9, 31 }, { 9, 29 }, { 9, 27 }, { 9, 25 }, { 9, 23 },
+ { 9, 21 }, { 9, 19 }, { 9, 17 }, { 9, 15 }, { 9, 13 }, { 9, 11 }, { 9, 9 },
+ { 9, 7 }, { 9, 5 }, { 9, 3 }, { 9, 1 }, { 8, 31 }, { 8, 29 }, { 8, 27 },
+ { 8, 25 }, { 8, 23 }, { 8, 21 }, { 8, 19 }, { 8, 17 }, { 8, 15 }, { 8, 13 },
+ { 8, 11 }, { 8, 9 }, { 8, 7 }, { 8, 5 }, { 8, 3 }, { 8, 1 }, { 7, 15 },
+ { 7, 13 }, { 7, 11 }, { 7, 9 }, { 7, 7 }, { 7, 5 }, { 7, 3 }, { 7, 1 },
+ { 6, 7 }, { 6, 5 }, { 6, 3 }, { 6, 1 }, { 5, 3 }, { 5, 1 }, { 4, 1 },
+ { 3, 1 }, { 2, 1 }, { 1, 1 }, { 0, 0 }, { 1, 0 }, { 2, 0 }, { 3, 0 },
+ { 4, 0 }, { 5, 0 }, { 5, 2 }, { 6, 0 }, { 6, 2 }, { 6, 4 }, { 6, 6 },
+ { 7, 0 }, { 7, 2 }, { 7, 4 }, { 7, 6 }, { 7, 8 }, { 7, 10 }, { 7, 12 },
+ { 7, 14 }, { 8, 0 }, { 8, 2 }, { 8, 4 }, { 8, 6 }, { 8, 8 }, { 8, 10 },
+ { 8, 12 }, { 8, 14 }, { 8, 16 }, { 8, 18 }, { 8, 20 }, { 8, 22 }, { 8, 24 },
+ { 8, 26 }, { 8, 28 }, { 8, 30 }, { 9, 0 }, { 9, 2 }, { 9, 4 }, { 9, 6 },
+ { 9, 8 }, { 9, 10 }, { 9, 12 }, { 9, 14 }, { 9, 16 }, { 9, 18 }, { 9, 20 },
+ { 9, 22 }, { 9, 24 }, { 9, 26 }, { 9, 28 }, { 9, 30 }, { 9, 32 }, { 9, 34 },
+ { 9, 36 }, { 9, 38 }, { 9, 40 }, { 9, 42 }, { 9, 44 }, { 9, 46 }, { 9, 48 },
+ { 9, 50 }, { 9, 52 }, { 9, 54 }, { 9, 56 }, { 9, 58 }, { 9, 60 }, { 9, 62 }
+};
+const TOKENVALUE *av1_dct_cat_lt_10_value_tokens =
+ dct_cat_lt_10_value_tokens +
+ (sizeof(dct_cat_lt_10_value_tokens) / sizeof(*dct_cat_lt_10_value_tokens)) /
+ 2;
+// The corresponding costs of the extrabits for the tokens in the above table
+// are stored in the table below. The values are obtained from looking up the
+// entry for the specified extrabits in the table corresponding to the token
+// (as defined in cost element av1_extra_bits)
+// e.g. {9, 63} maps to cat5_cost[63 >> 1], {1, 1} maps to sign_cost[1 >> 1]
+static const int dct_cat_lt_10_value_cost[] = {
+ 3773, 3750, 3704, 3681, 3623, 3600, 3554, 3531, 3432, 3409, 3363, 3340, 3282,
+ 3259, 3213, 3190, 3136, 3113, 3067, 3044, 2986, 2963, 2917, 2894, 2795, 2772,
+ 2726, 2703, 2645, 2622, 2576, 2553, 3197, 3116, 3058, 2977, 2881, 2800, 2742,
+ 2661, 2615, 2534, 2476, 2395, 2299, 2218, 2160, 2079, 2566, 2427, 2334, 2195,
+ 2023, 1884, 1791, 1652, 1893, 1696, 1453, 1256, 1229, 864, 512, 512, 512,
+ 512, 0, 512, 512, 512, 512, 864, 1229, 1256, 1453, 1696, 1893, 1652,
+ 1791, 1884, 2023, 2195, 2334, 2427, 2566, 2079, 2160, 2218, 2299, 2395, 2476,
+ 2534, 2615, 2661, 2742, 2800, 2881, 2977, 3058, 3116, 3197, 2553, 2576, 2622,
+ 2645, 2703, 2726, 2772, 2795, 2894, 2917, 2963, 2986, 3044, 3067, 3113, 3136,
+ 3190, 3213, 3259, 3282, 3340, 3363, 3409, 3432, 3531, 3554, 3600, 3623, 3681,
+ 3704, 3750, 3773,
+};
+const int *av1_dct_cat_lt_10_value_cost =
+ dct_cat_lt_10_value_cost +
+ (sizeof(dct_cat_lt_10_value_cost) / sizeof(*dct_cat_lt_10_value_cost)) / 2;
+
+// Array indices are identical to previously-existing CONTEXT_NODE indices
+/* clang-format off */
+const aom_tree_index av1_coef_tree[TREE_SIZE(ENTROPY_TOKENS)] = {
+ -EOB_TOKEN, 2, // 0 = EOB
+ -ZERO_TOKEN, 4, // 1 = ZERO
+ -ONE_TOKEN, 6, // 2 = ONE
+ 8, 12, // 3 = LOW_VAL
+ -TWO_TOKEN, 10, // 4 = TWO
+ -THREE_TOKEN, -FOUR_TOKEN, // 5 = THREE
+ 14, 16, // 6 = HIGH_LOW
+ -CATEGORY1_TOKEN, -CATEGORY2_TOKEN, // 7 = CAT_ONE
+ 18, 20, // 8 = CAT_THREEFOUR
+ -CATEGORY3_TOKEN, -CATEGORY4_TOKEN, // 9 = CAT_THREE
+ -CATEGORY5_TOKEN, -CATEGORY6_TOKEN // 10 = CAT_FIVE
+};
+/* clang-format on */
+
+static const int16_t zero_cost[] = { 0 };
+static const int16_t sign_cost[1] = { 512 };
+static const int16_t cat1_cost[1 << 1] = { 864, 1229 };
+static const int16_t cat2_cost[1 << 2] = { 1256, 1453, 1696, 1893 };
+static const int16_t cat3_cost[1 << 3] = { 1652, 1791, 1884, 2023,
+ 2195, 2334, 2427, 2566 };
+static const int16_t cat4_cost[1 << 4] = { 2079, 2160, 2218, 2299, 2395, 2476,
+ 2534, 2615, 2661, 2742, 2800, 2881,
+ 2977, 3058, 3116, 3197 };
+static const int16_t cat5_cost[1 << 5] = {
+ 2553, 2576, 2622, 2645, 2703, 2726, 2772, 2795, 2894, 2917, 2963,
+ 2986, 3044, 3067, 3113, 3136, 3190, 3213, 3259, 3282, 3340, 3363,
+ 3409, 3432, 3531, 3554, 3600, 3623, 3681, 3704, 3750, 3773
+};
+const int16_t av1_cat6_low_cost[256] = {
+ 3378, 3390, 3401, 3413, 3435, 3447, 3458, 3470, 3517, 3529, 3540, 3552, 3574,
+ 3586, 3597, 3609, 3671, 3683, 3694, 3706, 3728, 3740, 3751, 3763, 3810, 3822,
+ 3833, 3845, 3867, 3879, 3890, 3902, 3973, 3985, 3996, 4008, 4030, 4042, 4053,
+ 4065, 4112, 4124, 4135, 4147, 4169, 4181, 4192, 4204, 4266, 4278, 4289, 4301,
+ 4323, 4335, 4346, 4358, 4405, 4417, 4428, 4440, 4462, 4474, 4485, 4497, 4253,
+ 4265, 4276, 4288, 4310, 4322, 4333, 4345, 4392, 4404, 4415, 4427, 4449, 4461,
+ 4472, 4484, 4546, 4558, 4569, 4581, 4603, 4615, 4626, 4638, 4685, 4697, 4708,
+ 4720, 4742, 4754, 4765, 4777, 4848, 4860, 4871, 4883, 4905, 4917, 4928, 4940,
+ 4987, 4999, 5010, 5022, 5044, 5056, 5067, 5079, 5141, 5153, 5164, 5176, 5198,
+ 5210, 5221, 5233, 5280, 5292, 5303, 5315, 5337, 5349, 5360, 5372, 4988, 5000,
+ 5011, 5023, 5045, 5057, 5068, 5080, 5127, 5139, 5150, 5162, 5184, 5196, 5207,
+ 5219, 5281, 5293, 5304, 5316, 5338, 5350, 5361, 5373, 5420, 5432, 5443, 5455,
+ 5477, 5489, 5500, 5512, 5583, 5595, 5606, 5618, 5640, 5652, 5663, 5675, 5722,
+ 5734, 5745, 5757, 5779, 5791, 5802, 5814, 5876, 5888, 5899, 5911, 5933, 5945,
+ 5956, 5968, 6015, 6027, 6038, 6050, 6072, 6084, 6095, 6107, 5863, 5875, 5886,
+ 5898, 5920, 5932, 5943, 5955, 6002, 6014, 6025, 6037, 6059, 6071, 6082, 6094,
+ 6156, 6168, 6179, 6191, 6213, 6225, 6236, 6248, 6295, 6307, 6318, 6330, 6352,
+ 6364, 6375, 6387, 6458, 6470, 6481, 6493, 6515, 6527, 6538, 6550, 6597, 6609,
+ 6620, 6632, 6654, 6666, 6677, 6689, 6751, 6763, 6774, 6786, 6808, 6820, 6831,
+ 6843, 6890, 6902, 6913, 6925, 6947, 6959, 6970, 6982
+};
+const int av1_cat6_high_cost[CAT6_HIGH_COST_ENTRIES] = {
+ 100, 2263, 2739, 4902, 3160, 5323, 5799, 7962, 3678, 5841, 6317,
+ 8480, 6738, 8901, 9377, 11540, 3678, 5841, 6317, 8480, 6738, 8901,
+ 9377, 11540, 7256, 9419, 9895, 12058, 10316, 12479, 12955, 15118, 3678,
+ 5841, 6317, 8480, 6738, 8901, 9377, 11540, 7256, 9419, 9895, 12058,
+ 10316, 12479, 12955, 15118, 7256, 9419, 9895, 12058, 10316, 12479, 12955,
+ 15118, 10834, 12997, 13473, 15636, 13894, 16057, 16533, 18696,
+#if CONFIG_HIGHBITDEPTH
+ 4193, 6356, 6832, 8995, 7253, 9416, 9892, 12055, 7771, 9934, 10410,
+ 12573, 10831, 12994, 13470, 15633, 7771, 9934, 10410, 12573, 10831, 12994,
+ 13470, 15633, 11349, 13512, 13988, 16151, 14409, 16572, 17048, 19211, 7771,
+ 9934, 10410, 12573, 10831, 12994, 13470, 15633, 11349, 13512, 13988, 16151,
+ 14409, 16572, 17048, 19211, 11349, 13512, 13988, 16151, 14409, 16572, 17048,
+ 19211, 14927, 17090, 17566, 19729, 17987, 20150, 20626, 22789, 4193, 6356,
+ 6832, 8995, 7253, 9416, 9892, 12055, 7771, 9934, 10410, 12573, 10831,
+ 12994, 13470, 15633, 7771, 9934, 10410, 12573, 10831, 12994, 13470, 15633,
+ 11349, 13512, 13988, 16151, 14409, 16572, 17048, 19211, 7771, 9934, 10410,
+ 12573, 10831, 12994, 13470, 15633, 11349, 13512, 13988, 16151, 14409, 16572,
+ 17048, 19211, 11349, 13512, 13988, 16151, 14409, 16572, 17048, 19211, 14927,
+ 17090, 17566, 19729, 17987, 20150, 20626, 22789, 8286, 10449, 10925, 13088,
+ 11346, 13509, 13985, 16148, 11864, 14027, 14503, 16666, 14924, 17087, 17563,
+ 19726, 11864, 14027, 14503, 16666, 14924, 17087, 17563, 19726, 15442, 17605,
+ 18081, 20244, 18502, 20665, 21141, 23304, 11864, 14027, 14503, 16666, 14924,
+ 17087, 17563, 19726, 15442, 17605, 18081, 20244, 18502, 20665, 21141, 23304,
+ 15442, 17605, 18081, 20244, 18502, 20665, 21141, 23304, 19020, 21183, 21659,
+ 23822, 22080, 24243, 24719, 26882, 4193, 6356, 6832, 8995, 7253, 9416,
+ 9892, 12055, 7771, 9934, 10410, 12573, 10831, 12994, 13470, 15633, 7771,
+ 9934, 10410, 12573, 10831, 12994, 13470, 15633, 11349, 13512, 13988, 16151,
+ 14409, 16572, 17048, 19211, 7771, 9934, 10410, 12573, 10831, 12994, 13470,
+ 15633, 11349, 13512, 13988, 16151, 14409, 16572, 17048, 19211, 11349, 13512,
+ 13988, 16151, 14409, 16572, 17048, 19211, 14927, 17090, 17566, 19729, 17987,
+ 20150, 20626, 22789, 8286, 10449, 10925, 13088, 11346, 13509, 13985, 16148,
+ 11864, 14027, 14503, 16666, 14924, 17087, 17563, 19726, 11864, 14027, 14503,
+ 16666, 14924, 17087, 17563, 19726, 15442, 17605, 18081, 20244, 18502, 20665,
+ 21141, 23304, 11864, 14027, 14503, 16666, 14924, 17087, 17563, 19726, 15442,
+ 17605, 18081, 20244, 18502, 20665, 21141, 23304, 15442, 17605, 18081, 20244,
+ 18502, 20665, 21141, 23304, 19020, 21183, 21659, 23822, 22080, 24243, 24719,
+ 26882, 8286, 10449, 10925, 13088, 11346, 13509, 13985, 16148, 11864, 14027,
+ 14503, 16666, 14924, 17087, 17563, 19726, 11864, 14027, 14503, 16666, 14924,
+ 17087, 17563, 19726, 15442, 17605, 18081, 20244, 18502, 20665, 21141, 23304,
+ 11864, 14027, 14503, 16666, 14924, 17087, 17563, 19726, 15442, 17605, 18081,
+ 20244, 18502, 20665, 21141, 23304, 15442, 17605, 18081, 20244, 18502, 20665,
+ 21141, 23304, 19020, 21183, 21659, 23822, 22080, 24243, 24719, 26882, 12379,
+ 14542, 15018, 17181, 15439, 17602, 18078, 20241, 15957, 18120, 18596, 20759,
+ 19017, 21180, 21656, 23819, 15957, 18120, 18596, 20759, 19017, 21180, 21656,
+ 23819, 19535, 21698, 22174, 24337, 22595, 24758, 25234, 27397, 15957, 18120,
+ 18596, 20759, 19017, 21180, 21656, 23819, 19535, 21698, 22174, 24337, 22595,
+ 24758, 25234, 27397, 19535, 21698, 22174, 24337, 22595, 24758, 25234, 27397,
+ 23113, 25276, 25752, 27915, 26173, 28336, 28812, 30975, 4193, 6356, 6832,
+ 8995, 7253, 9416, 9892, 12055, 7771, 9934, 10410, 12573, 10831, 12994,
+ 13470, 15633, 7771, 9934, 10410, 12573, 10831, 12994, 13470, 15633, 11349,
+ 13512, 13988, 16151, 14409, 16572, 17048, 19211, 7771, 9934, 10410, 12573,
+ 10831, 12994, 13470, 15633, 11349, 13512, 13988, 16151, 14409, 16572, 17048,
+ 19211, 11349, 13512, 13988, 16151, 14409, 16572, 17048, 19211, 14927, 17090,
+ 17566, 19729, 17987, 20150, 20626, 22789, 8286, 10449, 10925, 13088, 11346,
+ 13509, 13985, 16148, 11864, 14027, 14503, 16666, 14924, 17087, 17563, 19726,
+ 11864, 14027, 14503, 16666, 14924, 17087, 17563, 19726, 15442, 17605, 18081,
+ 20244, 18502, 20665, 21141, 23304, 11864, 14027, 14503, 16666, 14924, 17087,
+ 17563, 19726, 15442, 17605, 18081, 20244, 18502, 20665, 21141, 23304, 15442,
+ 17605, 18081, 20244, 18502, 20665, 21141, 23304, 19020, 21183, 21659, 23822,
+ 22080, 24243, 24719, 26882, 8286, 10449, 10925, 13088, 11346, 13509, 13985,
+ 16148, 11864, 14027, 14503, 16666, 14924, 17087, 17563, 19726, 11864, 14027,
+ 14503, 16666, 14924, 17087, 17563, 19726, 15442, 17605, 18081, 20244, 18502,
+ 20665, 21141, 23304, 11864, 14027, 14503, 16666, 14924, 17087, 17563, 19726,
+ 15442, 17605, 18081, 20244, 18502, 20665, 21141, 23304, 15442, 17605, 18081,
+ 20244, 18502, 20665, 21141, 23304, 19020, 21183, 21659, 23822, 22080, 24243,
+ 24719, 26882, 12379, 14542, 15018, 17181, 15439, 17602, 18078, 20241, 15957,
+ 18120, 18596, 20759, 19017, 21180, 21656, 23819, 15957, 18120, 18596, 20759,
+ 19017, 21180, 21656, 23819, 19535, 21698, 22174, 24337, 22595, 24758, 25234,
+ 27397, 15957, 18120, 18596, 20759, 19017, 21180, 21656, 23819, 19535, 21698,
+ 22174, 24337, 22595, 24758, 25234, 27397, 19535, 21698, 22174, 24337, 22595,
+ 24758, 25234, 27397, 23113, 25276, 25752, 27915, 26173, 28336, 28812, 30975,
+ 8286, 10449, 10925, 13088, 11346, 13509, 13985, 16148, 11864, 14027, 14503,
+ 16666, 14924, 17087, 17563, 19726, 11864, 14027, 14503, 16666, 14924, 17087,
+ 17563, 19726, 15442, 17605, 18081, 20244, 18502, 20665, 21141, 23304, 11864,
+ 14027, 14503, 16666, 14924, 17087, 17563, 19726, 15442, 17605, 18081, 20244,
+ 18502, 20665, 21141, 23304, 15442, 17605, 18081, 20244, 18502, 20665, 21141,
+ 23304, 19020, 21183, 21659, 23822, 22080, 24243, 24719, 26882, 12379, 14542,
+ 15018, 17181, 15439, 17602, 18078, 20241, 15957, 18120, 18596, 20759, 19017,
+ 21180, 21656, 23819, 15957, 18120, 18596, 20759, 19017, 21180, 21656, 23819,
+ 19535, 21698, 22174, 24337, 22595, 24758, 25234, 27397, 15957, 18120, 18596,
+ 20759, 19017, 21180, 21656, 23819, 19535, 21698, 22174, 24337, 22595, 24758,
+ 25234, 27397, 19535, 21698, 22174, 24337, 22595, 24758, 25234, 27397, 23113,
+ 25276, 25752, 27915, 26173, 28336, 28812, 30975, 12379, 14542, 15018, 17181,
+ 15439, 17602, 18078, 20241, 15957, 18120, 18596, 20759, 19017, 21180, 21656,
+ 23819, 15957, 18120, 18596, 20759, 19017, 21180, 21656, 23819, 19535, 21698,
+ 22174, 24337, 22595, 24758, 25234, 27397, 15957, 18120, 18596, 20759, 19017,
+ 21180, 21656, 23819, 19535, 21698, 22174, 24337, 22595, 24758, 25234, 27397,
+ 19535, 21698, 22174, 24337, 22595, 24758, 25234, 27397, 23113, 25276, 25752,
+ 27915, 26173, 28336, 28812, 30975, 16472, 18635, 19111, 21274, 19532, 21695,
+ 22171, 24334, 20050, 22213, 22689, 24852, 23110, 25273, 25749, 27912, 20050,
+ 22213, 22689, 24852, 23110, 25273, 25749, 27912, 23628, 25791, 26267, 28430,
+ 26688, 28851, 29327, 31490, 20050, 22213, 22689, 24852, 23110, 25273, 25749,
+ 27912, 23628, 25791, 26267, 28430, 26688, 28851, 29327, 31490, 23628, 25791,
+ 26267, 28430, 26688, 28851, 29327, 31490, 27206, 29369, 29845, 32008, 30266,
+ 32429, 32905, 35068
+#endif
+};
+
+const uint8_t av1_cat6_skipped_bits_discount[8] = {
+ 0, 3, 6, 9, 12, 18, 24, 30
+};
+
+#if CONFIG_NEW_MULTISYMBOL
+const av1_extra_bit av1_extra_bits[ENTROPY_TOKENS] = {
+ { 0, 0, 0, zero_cost }, // ZERO_TOKEN
+ { 0, 0, 1, sign_cost }, // ONE_TOKEN
+ { 0, 0, 2, sign_cost }, // TWO_TOKEN
+ { 0, 0, 3, sign_cost }, // THREE_TOKEN
+ { 0, 0, 4, sign_cost }, // FOUR_TOKEN
+ { av1_cat1_cdf, 1, CAT1_MIN_VAL, cat1_cost }, // CATEGORY1_TOKEN
+ { av1_cat2_cdf, 2, CAT2_MIN_VAL, cat2_cost }, // CATEGORY2_TOKEN
+ { av1_cat3_cdf, 3, CAT3_MIN_VAL, cat3_cost }, // CATEGORY3_TOKEN
+ { av1_cat4_cdf, 4, CAT4_MIN_VAL, cat4_cost }, // CATEGORY4_TOKEN
+ { av1_cat5_cdf, 5, CAT5_MIN_VAL, cat5_cost }, // CATEGORY5_TOKEN
+ { av1_cat6_cdf, 18, CAT6_MIN_VAL, 0 }, // CATEGORY6_TOKEN
+ { 0, 0, 0, zero_cost } // EOB_TOKEN
+};
+#else
+const av1_extra_bit av1_extra_bits[ENTROPY_TOKENS] = {
+ { 0, 0, 0, zero_cost }, // ZERO_TOKEN
+ { 0, 0, 1, sign_cost }, // ONE_TOKEN
+ { 0, 0, 2, sign_cost }, // TWO_TOKEN
+ { 0, 0, 3, sign_cost }, // THREE_TOKEN
+ { 0, 0, 4, sign_cost }, // FOUR_TOKEN
+ { av1_cat1_prob, 1, CAT1_MIN_VAL, cat1_cost }, // CATEGORY1_TOKEN
+ { av1_cat2_prob, 2, CAT2_MIN_VAL, cat2_cost }, // CATEGORY2_TOKEN
+ { av1_cat3_prob, 3, CAT3_MIN_VAL, cat3_cost }, // CATEGORY3_TOKEN
+ { av1_cat4_prob, 4, CAT4_MIN_VAL, cat4_cost }, // CATEGORY4_TOKEN
+ { av1_cat5_prob, 5, CAT5_MIN_VAL, cat5_cost }, // CATEGORY5_TOKEN
+ { av1_cat6_prob, 18, CAT6_MIN_VAL, 0 }, // CATEGORY6_TOKEN
+ { 0, 0, 0, zero_cost } // EOB_TOKEN
+};
+#endif
+
+#if !CONFIG_EC_MULTISYMBOL
+const struct av1_token av1_coef_encodings[ENTROPY_TOKENS] = {
+ { 2, 2 }, { 6, 3 }, { 28, 5 }, { 58, 6 }, { 59, 6 }, { 60, 6 },
+ { 61, 6 }, { 124, 7 }, { 125, 7 }, { 126, 7 }, { 127, 7 }, { 0, 1 }
+};
+#endif // !CONFIG_EC_MULTISYMBOL
+
+struct tokenize_b_args {
+ const AV1_COMP *cpi;
+ ThreadData *td;
+ TOKENEXTRA **tp;
+ int this_rate;
+};
+
+#if !CONFIG_PVQ || CONFIG_VAR_TX
+static void cost_coeffs_b(int plane, int block, int blk_row, int blk_col,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg) {
+ struct tokenize_b_args *const args = arg;
+ const AV1_COMP *const cpi = args->cpi;
+ const AV1_COMMON *cm = &cpi->common;
+ ThreadData *const td = args->td;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+ struct macroblock_plane *p = &x->plane[plane];
+ struct macroblockd_plane *pd = &xd->plane[plane];
+ const PLANE_TYPE type = pd->plane_type;
+ const int ref = is_inter_block(mbmi);
+ const TX_TYPE tx_type = get_tx_type(type, xd, block, tx_size);
+ const SCAN_ORDER *const scan_order = get_scan(cm, tx_size, tx_type, ref);
+ const int rate = av1_cost_coeffs(cpi, x, plane, block, tx_size, scan_order,
+ pd->above_context + blk_col,
+ pd->left_context + blk_row, 0);
+ args->this_rate += rate;
+ (void)plane_bsize;
+ av1_set_contexts(xd, pd, plane, tx_size, p->eobs[block] > 0, blk_col,
+ blk_row);
+}
+
+static void set_entropy_context_b(int plane, int block, int blk_row,
+ int blk_col, BLOCK_SIZE plane_bsize,
+ TX_SIZE tx_size, void *arg) {
+ struct tokenize_b_args *const args = arg;
+ ThreadData *const td = args->td;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ struct macroblock_plane *p = &x->plane[plane];
+ struct macroblockd_plane *pd = &xd->plane[plane];
+ (void)plane_bsize;
+ av1_set_contexts(xd, pd, plane, tx_size, p->eobs[block] > 0, blk_col,
+ blk_row);
+}
+
+#if CONFIG_NEW_TOKENSET
+static INLINE void add_token(TOKENEXTRA **t,
+ aom_cdf_prob (*tail_cdf)[CDF_SIZE(ENTROPY_TOKENS)],
+ aom_cdf_prob (*head_cdf)[CDF_SIZE(ENTROPY_TOKENS)],
+ int eob_val, int first_val, int32_t extra,
+ uint8_t token) {
+ (*t)->token = token;
+ (*t)->extra = extra;
+ (*t)->tail_cdf = tail_cdf;
+ (*t)->head_cdf = head_cdf;
+ (*t)->eob_val = eob_val;
+ (*t)->first_val = first_val;
+ (*t)++;
+}
+
+#else // CONFIG_NEW_TOKENSET
+static INLINE void add_token(
+ TOKENEXTRA **t, const aom_prob *context_tree,
+#if CONFIG_EC_MULTISYMBOL
+ aom_cdf_prob (*token_cdf)[CDF_SIZE(ENTROPY_TOKENS)],
+#endif // CONFIG_EC_MULTISYMBOL
+ int32_t extra, uint8_t token, uint8_t skip_eob_node, unsigned int *counts) {
+ (*t)->token = token;
+ (*t)->extra = extra;
+ (*t)->context_tree = context_tree;
+#if CONFIG_EC_MULTISYMBOL
+ (*t)->token_cdf = token_cdf;
+#endif // CONFIG_EC_MULTISYMBOL
+ (*t)->skip_eob_node = skip_eob_node;
+ (*t)++;
+ ++counts[token];
+}
+#endif // CONFIG_NEW_TOKENSET
+#endif // !CONFIG_PVQ || CONFIG_VAR_TX
+
+#if CONFIG_PALETTE
+void av1_tokenize_palette_sb(const AV1_COMP *cpi,
+ const struct ThreadData *const td, int plane,
+ TOKENEXTRA **t, RUN_TYPE dry_run, BLOCK_SIZE bsize,
+ int *rate) {
+ const MACROBLOCK *const x = &td->mb;
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+ const uint8_t *const color_map = xd->plane[plane].color_index_map;
+ const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+ const int n = pmi->palette_size[plane];
+ int i, j;
+ int this_rate = 0;
+ uint8_t color_order[PALETTE_MAX_SIZE];
+ const aom_prob(
+ *const probs)[PALETTE_COLOR_INDEX_CONTEXTS][PALETTE_COLORS - 1] =
+ plane == 0 ? av1_default_palette_y_color_index_prob
+ : av1_default_palette_uv_color_index_prob;
+ int plane_block_width, rows, cols;
+ av1_get_block_dimensions(bsize, plane, xd, &plane_block_width, NULL, &rows,
+ &cols);
+ assert(plane == 0 || plane == 1);
+
+#if CONFIG_PALETTE_THROUGHPUT
+ int k;
+ for (k = 1; k < rows + cols - 1; ++k) {
+ for (j = AOMMIN(k, cols - 1); j >= AOMMAX(0, k - rows + 1); --j) {
+ i = k - j;
+#else
+ for (i = 0; i < rows; ++i) {
+ for (j = (i == 0 ? 1 : 0); j < cols; ++j) {
+#endif // CONFIG_PALETTE_THROUGHPUT
+ int color_new_idx;
+ const int color_ctx = av1_get_palette_color_index_context(
+ color_map, plane_block_width, i, j, n, color_order, &color_new_idx);
+ assert(color_new_idx >= 0 && color_new_idx < n);
+ if (dry_run == DRY_RUN_COSTCOEFFS)
+ this_rate += cpi->palette_y_color_cost[n - PALETTE_MIN_SIZE][color_ctx]
+ [color_new_idx];
+ (*t)->token = color_new_idx;
+ (*t)->context_tree = probs[n - PALETTE_MIN_SIZE][color_ctx];
+ (*t)->skip_eob_node = 0;
+ ++(*t);
+ }
+ }
+ if (rate) *rate += this_rate;
+}
+#endif // CONFIG_PALETTE
+
+#if CONFIG_PVQ
+static void add_pvq_block(AV1_COMMON *const cm, MACROBLOCK *const x,
+ PVQ_INFO *pvq) {
+ PVQ_QUEUE *q = x->pvq_q;
+ if (q->curr_pos >= q->buf_len) {
+ int new_buf_len = 2 * q->buf_len + 1;
+ PVQ_INFO *new_buf;
+ CHECK_MEM_ERROR(cm, new_buf, aom_malloc(new_buf_len * sizeof(PVQ_INFO)));
+ memcpy(new_buf, q->buf, q->buf_len * sizeof(PVQ_INFO));
+ aom_free(q->buf);
+ q->buf = new_buf;
+ q->buf_len = new_buf_len;
+ }
+ OD_COPY(q->buf + q->curr_pos, pvq, 1);
+ ++q->curr_pos;
+}
+
+// NOTE: This does not actually generate tokens; instead we store the encoding
+// decisions made for PVQ in a queue that we will read from when
+// actually writing the bitstream in write_modes_b.
+static void tokenize_pvq(int plane, int block, int blk_row, int blk_col,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg) {
+ struct tokenize_b_args *const args = arg;
+ const AV1_COMP *cpi = args->cpi;
+ const AV1_COMMON *const cm = &cpi->common;
+ ThreadData *const td = args->td;
+ MACROBLOCK *const x = &td->mb;
+ PVQ_INFO *pvq_info;
+
+ (void)block;
+ (void)blk_row;
+ (void)blk_col;
+ (void)plane_bsize;
+ (void)tx_size;
+
+ assert(block < MAX_PVQ_BLOCKS_IN_SB);
+ pvq_info = &x->pvq[block][plane];
+ add_pvq_block((AV1_COMMON * const)cm, x, pvq_info);
+}
+#endif // CONFIG_PVQ
+
+static void tokenize_b(int plane, int block, int blk_row, int blk_col,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg) {
+#if !CONFIG_PVQ
+ struct tokenize_b_args *const args = arg;
+ const AV1_COMP *cpi = args->cpi;
+ const AV1_COMMON *const cm = &cpi->common;
+ ThreadData *const td = args->td;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ TOKENEXTRA **tp = args->tp;
+ uint8_t token_cache[MAX_TX_SQUARE];
+ struct macroblock_plane *p = &x->plane[plane];
+ struct macroblockd_plane *pd = &xd->plane[plane];
+ MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+ int pt; /* near block/prev token context index */
+ int c;
+ TOKENEXTRA *t = *tp; /* store tokens starting here */
+ const int eob = p->eobs[block];
+ const PLANE_TYPE type = pd->plane_type;
+ const tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+#if CONFIG_SUPERTX
+ const int segment_id = AOMMIN(mbmi->segment_id, mbmi->segment_id_supertx);
+#else
+ const int segment_id = mbmi->segment_id;
+#endif // CONFIG_SUPERTX
+ const int16_t *scan, *nb;
+ const TX_TYPE tx_type = get_tx_type(type, xd, block, tx_size);
+ const SCAN_ORDER *const scan_order =
+ get_scan(cm, tx_size, tx_type, is_inter_block(mbmi));
+ const int ref = is_inter_block(mbmi);
+ unsigned int(*const counts)[COEFF_CONTEXTS][ENTROPY_TOKENS] =
+ td->rd_counts.coef_counts[txsize_sqr_map[tx_size]][type][ref];
+#if !CONFIG_NEW_TOKENSET
+#if CONFIG_SUBFRAME_PROB_UPDATE
+ const aom_prob(*coef_probs)[COEFF_CONTEXTS][UNCONSTRAINED_NODES] =
+ cpi->subframe_stats.coef_probs_buf[cpi->common.coef_probs_update_idx]
+ [txsize_sqr_map[tx_size]][type][ref];
+#else
+ aom_prob(*const coef_probs)[COEFF_CONTEXTS][UNCONSTRAINED_NODES] =
+ cpi->common.fc->coef_probs[txsize_sqr_map[tx_size]][type][ref];
+#endif // CONFIG_SUBFRAME_PROB_UPDATE
+#endif // !CONFIG_NEW_TOKENSET
+#if CONFIG_EC_ADAPT
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+#elif CONFIG_EC_MULTISYMBOL
+ FRAME_CONTEXT *ec_ctx = cpi->common.fc;
+#endif
+#if CONFIG_NEW_TOKENSET
+ aom_cdf_prob(
+ *const coef_head_cdfs)[COEFF_CONTEXTS][CDF_SIZE(ENTROPY_TOKENS)] =
+ ec_ctx->coef_head_cdfs[txsize_sqr_map[tx_size]][type][ref];
+ aom_cdf_prob(
+ *const coef_tail_cdfs)[COEFF_CONTEXTS][CDF_SIZE(ENTROPY_TOKENS)] =
+ ec_ctx->coef_tail_cdfs[txsize_sqr_map[tx_size]][type][ref];
+ unsigned int(*const blockz_count)[2] =
+ td->counts->blockz_count[txsize_sqr_map[tx_size]][type][ref];
+ int eob_val;
+ int first_val = 1;
+#else
+#if CONFIG_EC_MULTISYMBOL
+ aom_cdf_prob(*const coef_cdfs)[COEFF_CONTEXTS][CDF_SIZE(ENTROPY_TOKENS)] =
+ ec_ctx->coef_cdfs[txsize_sqr_map[tx_size]][type][ref];
+#endif
+ int skip_eob = 0;
+#endif
+ const int seg_eob = get_tx_eob(&cpi->common.seg, segment_id, tx_size);
+ unsigned int(*const eob_branch)[COEFF_CONTEXTS] =
+ td->counts->eob_branch[txsize_sqr_map[tx_size]][type][ref];
+ const uint8_t *const band = get_band_translate(tx_size);
+ int16_t token;
+ EXTRABIT extra;
+ (void)plane_bsize;
+ pt = get_entropy_context(tx_size, pd->above_context + blk_col,
+ pd->left_context + blk_row);
+ scan = scan_order->scan;
+ nb = scan_order->neighbors;
+ c = 0;
+
+#if CONFIG_NEW_TOKENSET
+ if (eob == 0)
+ add_token(&t, &coef_tail_cdfs[band[c]][pt], &coef_head_cdfs[band[c]][pt], 1,
+ 1, 0, BLOCK_Z_TOKEN);
+
+ ++blockz_count[pt][eob != 0];
+
+ while (c < eob) {
+ int v = qcoeff[scan[c]];
+ first_val = (c == 0);
+
+ if (!v) {
+ add_token(&t, &coef_tail_cdfs[band[c]][pt], &coef_head_cdfs[band[c]][pt],
+ 0, first_val, 0, ZERO_TOKEN);
+ ++counts[band[c]][pt][ZERO_TOKEN];
+ token_cache[scan[c]] = 0;
+ } else {
+ eob_val =
+ (c + 1 == eob) ? (c + 1 == seg_eob ? LAST_EOB : EARLY_EOB) : NO_EOB;
+
+ av1_get_token_extra(v, &token, &extra);
+
+ add_token(&t, &coef_tail_cdfs[band[c]][pt], &coef_head_cdfs[band[c]][pt],
+ eob_val, first_val, extra, (uint8_t)token);
+
+ if (eob_val != LAST_EOB) {
+ ++counts[band[c]][pt][token];
+ ++eob_branch[band[c]][pt];
+ counts[band[c]][pt][EOB_TOKEN] += eob_val != NO_EOB;
+ }
+
+ token_cache[scan[c]] = av1_pt_energy_class[token];
+ }
+ ++c;
+ pt = get_coef_context(nb, token_cache, AOMMIN(c, eob - 1));
+ }
+#else
+ while (c < eob) {
+ const int v = qcoeff[scan[c]];
+ eob_branch[band[c]][pt] += !skip_eob;
+
+ av1_get_token_extra(v, &token, &extra);
+
+ add_token(&t, coef_probs[band[c]][pt],
+#if CONFIG_EC_MULTISYMBOL
+ &coef_cdfs[band[c]][pt],
+#endif
+ extra, (uint8_t)token, (uint8_t)skip_eob, counts[band[c]][pt]);
+
+ token_cache[scan[c]] = av1_pt_energy_class[token];
+ ++c;
+ pt = get_coef_context(nb, token_cache, c);
+ skip_eob = (token == ZERO_TOKEN);
+ }
+ if (c < seg_eob) {
+ add_token(&t, coef_probs[band[c]][pt],
+#if CONFIG_EC_MULTISYMBOL
+ NULL,
+#endif
+ 0, EOB_TOKEN, 0, counts[band[c]][pt]);
+ ++eob_branch[band[c]][pt];
+ }
+#endif // CONFIG_NEW_TOKENSET
+
+#if CONFIG_COEF_INTERLEAVE
+ t->token = EOSB_TOKEN;
+ t++;
+#endif
+
+ *tp = t;
+
+#if CONFIG_ADAPT_SCAN
+ // Since dqcoeff is not available here, we pass qcoeff into
+ // av1_update_scan_count_facade(). The update behavior should be the same
+ // because av1_update_scan_count_facade() only cares if coefficients are zero
+ // or not.
+ av1_update_scan_count_facade((AV1_COMMON *)cm, td->counts, tx_size, tx_type,
+ qcoeff, c);
+#endif
+
+ av1_set_contexts(xd, pd, plane, tx_size, c > 0, blk_col, blk_row);
+#else // !CONFIG_PVQ
+ tokenize_pvq(plane, block, blk_row, blk_col, plane_bsize, tx_size, arg);
+#endif // !CONFIG_PVQ
+}
+
+struct is_skippable_args {
+ uint16_t *eobs;
+ int *skippable;
+};
+static void is_skippable(int plane, int block, int blk_row, int blk_col,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *argv) {
+ struct is_skippable_args *args = argv;
+ (void)plane;
+ (void)plane_bsize;
+ (void)tx_size;
+ (void)blk_row;
+ (void)blk_col;
+ args->skippable[0] &= (!args->eobs[block]);
+}
+
+// TODO(yaowu): rewrite and optimize this function to remove the usage of
+// av1_foreach_transformed_block_in_plane() and simplify is_skippable().
+int av1_is_skippable_in_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) {
+ int result = 1;
+ struct is_skippable_args args = { x->plane[plane].eobs, &result };
+ av1_foreach_transformed_block_in_plane(&x->e_mbd, bsize, plane, is_skippable,
+ &args);
+ return result;
+}
+
+#if CONFIG_VAR_TX
+void tokenize_vartx(ThreadData *td, TOKENEXTRA **t, RUN_TYPE dry_run,
+ TX_SIZE tx_size, BLOCK_SIZE plane_bsize, int blk_row,
+ int blk_col, int block, int plane, void *arg) {
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const BLOCK_SIZE bsize = txsize_to_bsize[tx_size];
+ const int tx_row = blk_row >> (1 - pd->subsampling_y);
+ const int tx_col = blk_col >> (1 - pd->subsampling_x);
+ const int max_blocks_high = max_block_high(xd, plane_bsize, plane);
+ const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
+ TX_SIZE plane_tx_size;
+
+ if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
+
+ plane_tx_size =
+ plane ? uv_txsize_lookup[bsize][mbmi->inter_tx_size[tx_row][tx_col]][0][0]
+ : mbmi->inter_tx_size[tx_row][tx_col];
+
+ if (tx_size == plane_tx_size) {
+ plane_bsize = get_plane_block_size(mbmi->sb_type, pd);
+ if (!dry_run)
+ tokenize_b(plane, block, blk_row, blk_col, plane_bsize, tx_size, arg);
+ else if (dry_run == DRY_RUN_NORMAL)
+ set_entropy_context_b(plane, block, blk_row, blk_col, plane_bsize,
+ tx_size, arg);
+ else if (dry_run == DRY_RUN_COSTCOEFFS)
+ cost_coeffs_b(plane, block, blk_row, blk_col, plane_bsize, tx_size, arg);
+ } else {
+ // Half the block size in transform block units.
+ const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
+ const int bsl = tx_size_wide_unit[sub_txs];
+ int i;
+
+ assert(bsl > 0);
+
+ for (i = 0; i < 4; ++i) {
+ const int offsetr = blk_row + ((i >> 1) * bsl);
+ const int offsetc = blk_col + ((i & 0x01) * bsl);
+
+ int step = tx_size_wide_unit[sub_txs] * tx_size_high_unit[sub_txs];
+
+ if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue;
+
+ tokenize_vartx(td, t, dry_run, sub_txs, plane_bsize, offsetr, offsetc,
+ block, plane, arg);
+ block += step;
+ }
+ }
+}
+/*
+ * Tokenizes (or dry-runs) the coefficients of one block when variable
+ * transform sizes are in use. Each plane is walked in steps of its largest
+ * transform size and every step is handed to tokenize_vartx().
+ *
+ * When dry_run is OUTPUT_ENABLED (0) the skip counts are updated and an
+ * EOSB_TOKEN is appended through *t after every plane; for any other dry_run
+ * value *t is reset to its value on entry before the planes are walked.
+ * If rate is non-NULL, arg.this_rate is added to *rate at the end; the
+ * early-return paths leave *rate untouched.
+ *
+ * NOTE(review): mi_height and bh are derived with tx_size_wide_log2[0] rather
+ * than a height table; presumably harmless because entry 0 is a square
+ * transform size -- confirm.
+ */
+void av1_tokenize_sb_vartx(const AV1_COMP *cpi, ThreadData *td, TOKENEXTRA **t,
+ RUN_TYPE dry_run, int mi_row, int mi_col,
+ BLOCK_SIZE bsize, int *rate) {
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+ TOKENEXTRA *t_backup = *t;
+ const int ctx = av1_get_skip_context(xd);
+ const int skip_inc =
+ !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP);
+ struct tokenize_b_args arg = { cpi, td, t, 0 };
+ int plane;
+ if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
+ // Skipped block: count it on real runs, reset the skip context and stop.
+ if (mbmi->skip) {
+ if (!dry_run) td->counts->skip[ctx][1] += skip_inc;
+ reset_skip_context(xd, bsize);
+ if (dry_run) *t = t_backup;
+ return;
+ }
+
+ if (!dry_run)
+ td->counts->skip[ctx][0] += skip_inc;
+ else
+ *t = t_backup;
+ // Walk every plane in steps of its largest transform size.
+ for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+#if CONFIG_CB4X4
+ if (!is_chroma_reference(mi_row, mi_col, bsize,
+ xd->plane[plane].subsampling_x,
+ xd->plane[plane].subsampling_y)) {
+#if !CONFIG_PVQ
+ if (!dry_run) {
+ (*t)->token = EOSB_TOKEN;
+ (*t)++;
+ }
+#endif
+ continue;
+ }
+#endif
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+#if CONFIG_CB4X4 && !CONFIG_CHROMA_2X2
+ const BLOCK_SIZE plane_bsize =
+ AOMMAX(BLOCK_4X4, get_plane_block_size(bsize, pd));
+#else
+ const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
+#endif
+ const int mi_width = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
+ const int mi_height = block_size_high[plane_bsize] >> tx_size_wide_log2[0];
+ const TX_SIZE max_tx_size = get_vartx_max_txsize(mbmi, plane_bsize);
+ const BLOCK_SIZE txb_size = txsize_to_bsize[max_tx_size];
+ int bw = block_size_wide[txb_size] >> tx_size_wide_log2[0];
+ int bh = block_size_high[txb_size] >> tx_size_wide_log2[0];
+ int idx, idy;
+ int block = 0;
+ int step = tx_size_wide_unit[max_tx_size] * tx_size_high_unit[max_tx_size];
+ for (idy = 0; idy < mi_height; idy += bh) {
+ for (idx = 0; idx < mi_width; idx += bw) {
+ tokenize_vartx(td, t, dry_run, max_tx_size, plane_bsize, idy, idx,
+ block, plane, &arg);
+ block += step;
+ }
+ }
+ // Terminate this plane's tokens (real runs only).
+ if (!dry_run) {
+ (*t)->token = EOSB_TOKEN;
+ (*t)++;
+ }
+ }
+ if (rate) *rate += arg.this_rate;
+}
+#endif // CONFIG_VAR_TX
+/*
+ * Tokenizes (or dry-runs) the coefficients of one block.
+ *
+ * dry_run selects the per-transform-block visitor: OUTPUT_ENABLED runs
+ * tokenize_b and updates the skip counts, DRY_RUN_NORMAL runs
+ * set_entropy_context_b, and DRY_RUN_COSTCOEFFS runs cost_coeffs_b. The two
+ * dry-run branches are compiled out when CONFIG_PVQ is set. If rate is
+ * non-NULL, arg.this_rate is added to *rate on the non-skip path.
+ */
+void av1_tokenize_sb(const AV1_COMP *cpi, ThreadData *td, TOKENEXTRA **t,
+ RUN_TYPE dry_run, BLOCK_SIZE bsize, int *rate,
+ const int mi_row, const int mi_col) {
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+ const int ctx = av1_get_skip_context(xd);
+ const int skip_inc =
+ !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP);
+ struct tokenize_b_args arg = { cpi, td, t, 0 };
+ if (mbmi->skip) {
+ if (!dry_run) td->counts->skip[ctx][1] += skip_inc;
+ reset_skip_context(xd, bsize);
+ return;
+ }
+ // Real run: emit tokens and close every plane with an EOSB_TOKEN.
+ if (!dry_run) {
+#if CONFIG_COEF_INTERLEAVE
+ td->counts->skip[ctx][0] += skip_inc;
+ av1_foreach_transformed_block_interleave(xd, bsize, tokenize_b, &arg);
+#else
+ int plane;
+
+ td->counts->skip[ctx][0] += skip_inc;
+ for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+#if CONFIG_CB4X4
+ if (!is_chroma_reference(mi_row, mi_col, bsize,
+ xd->plane[plane].subsampling_x,
+ xd->plane[plane].subsampling_y)) {
+#if !CONFIG_PVQ
+ (*t)->token = EOSB_TOKEN;
+ (*t)++;
+#endif
+ continue;
+ }
+#else
+ (void)mi_row;
+ (void)mi_col;
+#endif
+ av1_foreach_transformed_block_in_plane(xd, bsize, plane, tokenize_b,
+ &arg);
+#if !CONFIG_PVQ
+ (*t)->token = EOSB_TOKEN;
+ (*t)++;
+#endif // !CONFIG_PVQ
+ }
+#endif
+ }
+#if !CONFIG_PVQ
+ else if (dry_run == DRY_RUN_NORMAL) {
+ int plane;
+ for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+#if CONFIG_CB4X4
+ if (!is_chroma_reference(mi_row, mi_col, bsize,
+ xd->plane[plane].subsampling_x,
+ xd->plane[plane].subsampling_y))
+ continue;
+#else
+ (void)mi_row;
+ (void)mi_col;
+#endif
+ av1_foreach_transformed_block_in_plane(xd, bsize, plane,
+ set_entropy_context_b, &arg);
+ }
+ } else if (dry_run == DRY_RUN_COSTCOEFFS) {
+ int plane;
+ for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+#if CONFIG_CB4X4
+ if (!is_chroma_reference(mi_row, mi_col, bsize,
+ xd->plane[plane].subsampling_x,
+ xd->plane[plane].subsampling_y))
+ continue;
+#else
+ (void)mi_row;
+ (void)mi_col;
+#endif
+ av1_foreach_transformed_block_in_plane(xd, bsize, plane, cost_coeffs_b,
+ &arg);
+ }
+ }
+#endif // !CONFIG_PVQ
+
+ if (rate) *rate += arg.this_rate;
+}
+
+#if CONFIG_SUPERTX
+// Supertx flavour of the block tokenizer. The run type picks the visitor that
+// is applied to every transform block of every plane; if rate is non-NULL the
+// accumulated arg.this_rate is added to *rate on the non-skip path.
+void av1_tokenize_sb_supertx(const AV1_COMP *cpi, ThreadData *td,
+                             TOKENEXTRA **t, RUN_TYPE dry_run, BLOCK_SIZE bsize,
+                             int *rate) {
+  const AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &td->mb.e_mbd;
+  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  TOKENEXTRA *const t_on_entry = *t;
+  const int skip_ctx = av1_get_skip_context(xd);
+  const int count_step =
+      !segfeature_active(&cm->seg, mbmi->segment_id_supertx, SEG_LVL_SKIP);
+  struct tokenize_b_args arg = { cpi, td, t, 0 };
+  int plane;
+
+  // Skipped block: only bookkeeping, no coefficients to visit.
+  if (mbmi->skip) {
+    if (dry_run == OUTPUT_ENABLED) td->counts->skip[skip_ctx][1] += count_step;
+    reset_skip_context(xd, bsize);
+    if (dry_run != OUTPUT_ENABLED) *t = t_on_entry;
+    return;
+  }
+
+  switch (dry_run) {
+    case OUTPUT_ENABLED:
+      td->counts->skip[skip_ctx][0] += count_step;
+      for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+        av1_foreach_transformed_block_in_plane(xd, bsize, plane, tokenize_b,
+                                               &arg);
+        // Every plane's token run is closed with the end-of-superblock marker.
+        (*t)->token = EOSB_TOKEN;
+        (*t)++;
+      }
+      break;
+    case DRY_RUN_NORMAL:
+      for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+        av1_foreach_transformed_block_in_plane(xd, bsize, plane,
+                                               set_entropy_context_b, &arg);
+      }
+      *t = t_on_entry;
+      break;
+    case DRY_RUN_COSTCOEFFS:
+      for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+        av1_foreach_transformed_block_in_plane(xd, bsize, plane, cost_coeffs_b,
+                                               &arg);
+      }
+      break;
+    default: break;
+  }
+
+  if (rate) *rate += arg.this_rate;
+}
+#endif // CONFIG_SUPERTX
diff --git a/third_party/aom/av1/encoder/tokenize.h b/third_party/aom/av1/encoder/tokenize.h
new file mode 100644
index 0000000000..3928111d6f
--- /dev/null
+++ b/third_party/aom/av1/encoder/tokenize.h
@@ -0,0 +1,151 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_ENCODER_TOKENIZE_H_
+#define AV1_ENCODER_TOKENIZE_H_
+
+#include "av1/common/entropy.h"
+
+#include "av1/encoder/block.h"
+#include "av1/encoder/treewriter.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define EOSB_TOKEN 127 // Not signalled, encoder only
+
+#if CONFIG_HIGHBITDEPTH
+typedef int32_t EXTRABIT;
+#else
+typedef int16_t EXTRABIT;
+#endif
+/* Token id plus its extra bits; element type of the av1_dct_*value_tokens
+ * lookup tables declared further down in this header.
+ */
+typedef struct {
+ int16_t token;
+ EXTRABIT extra;
+} TOKENVALUE;
+/* One entry of the token stream that the tokenize functions append through
+ * their TOKENEXTRA **t argument. token may also hold the encoder-only
+ * EOSB_TOKEN marker. Which CDF pointers exist depends on the build config.
+ */
+typedef struct {
+#if CONFIG_NEW_TOKENSET
+ aom_cdf_prob (*tail_cdf)[CDF_SIZE(ENTROPY_TOKENS)];
+ aom_cdf_prob (*head_cdf)[CDF_SIZE(ENTROPY_TOKENS)];
+ int eob_val;
+ int first_val;
+#elif CONFIG_EC_MULTISYMBOL
+ aom_cdf_prob (*token_cdf)[CDF_SIZE(ENTROPY_TOKENS)];
+#endif
+ const aom_prob *context_tree;
+ EXTRABIT extra;
+ uint8_t token;
+ uint8_t skip_eob_node;
+} TOKENEXTRA;
+
+extern const aom_tree_index av1_coef_tree[];
+extern const aom_tree_index av1_coef_con_tree[];
+#if !CONFIG_EC_MULTISYMBOL
+extern const struct av1_token av1_coef_encodings[];
+#endif // !CONFIG_EC_MULTISYMBOL
+
+int av1_is_skippable_in_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane);
+
+struct AV1_COMP;
+struct ThreadData;
+/* Selects what the av1_tokenize_sb*() functions do with the coefficients. */
+typedef enum {
+ OUTPUT_ENABLED = 0, // tokens are emitted and the skip counts are updated
+ DRY_RUN_NORMAL, // set_entropy_context_b is run on each transform block
+ DRY_RUN_COSTCOEFFS, // cost_coeffs_b is run on each transform block
+} RUN_TYPE;
+
+// Note: in all the tokenize functions, rate (if non-NULL) is incremented
+// with the coefficient token cost only if dry_run == DRY_RUN_COSTCOEFFS;
+// otherwise rate is not incremented.
+#if CONFIG_VAR_TX
+void av1_tokenize_sb_vartx(const struct AV1_COMP *cpi, struct ThreadData *td,
+ TOKENEXTRA **t, RUN_TYPE dry_run, int mi_row,
+ int mi_col, BLOCK_SIZE bsize, int *rate);
+#endif
+#if CONFIG_PALETTE
+void av1_tokenize_palette_sb(const struct AV1_COMP *cpi,
+ const struct ThreadData *const td, int plane,
+ TOKENEXTRA **t, RUN_TYPE dry_run, BLOCK_SIZE bsize,
+ int *rate);
+#endif // CONFIG_PALETTE
+void av1_tokenize_sb(const struct AV1_COMP *cpi, struct ThreadData *td,
+ TOKENEXTRA **t, RUN_TYPE dry_run, BLOCK_SIZE bsize,
+ int *rate, const int mi_row, const int mi_col);
+#if CONFIG_SUPERTX
+void av1_tokenize_sb_supertx(const struct AV1_COMP *cpi, struct ThreadData *td,
+ TOKENEXTRA **t, RUN_TYPE dry_run, BLOCK_SIZE bsize,
+ int *rate);
+#endif
+
+extern const int16_t *av1_dct_value_cost_ptr;
+/* TODO: The Token field should be broken out into a separate char array to
+ * improve cache locality, since it's needed for costing when the rest of the
+ * fields are not.
+ */
+extern const TOKENVALUE *av1_dct_value_tokens_ptr;
+extern const TOKENVALUE *av1_dct_cat_lt_10_value_tokens;
+extern const int *av1_dct_cat_lt_10_value_cost;
+extern const int16_t av1_cat6_low_cost[256];
+#if CONFIG_HIGHBITDEPTH
+#define CAT6_HIGH_COST_ENTRIES 1024
+#else
+#define CAT6_HIGH_COST_ENTRIES 64
+#endif
+extern const int av1_cat6_high_cost[CAT6_HIGH_COST_ENTRIES];
+extern const uint8_t av1_cat6_skipped_bits_discount[8];
+
+// Looks up the token and extra bits for coefficient value v. Values with
+// |v| >= CAT6_MIN_VAL become CATEGORY6_TOKEN with the sign in the low bit of
+// *extra; everything else comes from the small-value table.
+static INLINE void av1_get_token_extra(int v, int16_t *token, EXTRABIT *extra) {
+  const int is_cat6 = (v >= CAT6_MIN_VAL) || (v <= -CAT6_MIN_VAL);
+
+  if (!is_cat6) {
+    *token = av1_dct_cat_lt_10_value_tokens[v].token;
+    *extra = av1_dct_cat_lt_10_value_tokens[v].extra;
+    return;
+  }
+
+  *token = CATEGORY6_TOKEN;
+  // Twice the distance from CAT6_MIN_VAL, plus one for negative values.
+  *extra = (v >= CAT6_MIN_VAL) ? 2 * v - 2 * CAT6_MIN_VAL
+                               : -2 * v - 2 * CAT6_MIN_VAL + 1;
+}
+// Returns only the token for coefficient value v: CATEGORY6_TOKEN when
+// |v| >= CAT6_MIN_VAL, otherwise the entry from the small-value table.
+// Uses the named constant (not a literal) so it stays in step with
+// av1_get_token_extra() and av1_get_token_cost().
+static INLINE int16_t av1_get_token(int v) {
+  if (v >= CAT6_MIN_VAL || v <= -CAT6_MIN_VAL) return CATEGORY6_TOKEN;
+  return av1_dct_cat_lt_10_value_tokens[v].token;
+}
+
+// Stores the token for v in *token and returns the cost of v. Small values
+// are read from the lookup table; category-6 values combine the low-byte and
+// high-part cost tables and subtract the discount indexed by 18 - cat6_bits.
+static INLINE int av1_get_token_cost(int v, int16_t *token, int cat6_bits) {
+  EXTRABIT offset_from_min;
+
+  if (v < CAT6_MIN_VAL && v > -CAT6_MIN_VAL) {
+    *token = av1_dct_cat_lt_10_value_tokens[v].token;
+    return av1_dct_cat_lt_10_value_cost[v];
+  }
+
+  *token = CATEGORY6_TOKEN;
+  offset_from_min = abs(v) - CAT6_MIN_VAL;
+  return av1_cat6_low_cost[offset_from_min & 0xff] +
+         av1_cat6_high_cost[offset_from_min >> 8] -
+         av1_cat6_skipped_bits_discount[18 - cat6_bits];
+}
+
+#if !CONFIG_PVQ || CONFIG_VAR_TX
+// Maximum end-of-block position for a transform size: 0 when the segment has
+// the skip feature active, otherwise tx_size_2d[tx_size].
+static INLINE int get_tx_eob(const struct segmentation *seg, int segment_id,
+                             TX_SIZE tx_size) {
+  if (segfeature_active(seg, segment_id, SEG_LVL_SKIP)) return 0;
+  return tx_size_2d[tx_size];
+}
+#endif
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AV1_ENCODER_TOKENIZE_H_
diff --git a/third_party/aom/av1/encoder/treewriter.c b/third_party/aom/av1/encoder/treewriter.c
new file mode 100644
index 0000000000..50be72413b
--- /dev/null
+++ b/third_party/aom/av1/encoder/treewriter.c
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/encoder/treewriter.h"
+
+// Recursively assigns a code to every leaf below the node pair at tree[i].
+// v is the code accumulated so far and l its length; each of the two
+// branches appends one bit (0 for tree[i], 1 for tree[i + 1]). Entries <= 0
+// are leaves whose token index is the negated entry.
+static void tree2tok(struct av1_token *tokens, const aom_tree_index *tree,
+                     int i, int v, int l) {
+  const int child_len = l + 1;
+  int branch;
+
+  for (branch = 0; branch < 2; ++branch) {
+    const aom_tree_index node = tree[i + branch];
+    const int code = 2 * v + branch;
+
+    if (node > 0) {
+      tree2tok(tokens, tree, node, code, child_len);
+    } else {
+      tokens[-node].value = code;
+      tokens[-node].len = child_len;
+    }
+  }
+}
+/* Fills tokens[] with the code value and length of every leaf of tree,
+ * starting the walk at the root (index 0) with an empty code.
+ */
+void av1_tokens_from_tree(struct av1_token *tokens,
+ const aom_tree_index *tree) {
+ tree2tok(tokens, tree, 0, 0, 0);
+}
+
+// Sums the leaf counts below the node pair at tree[i], records the left and
+// right totals in branch_ct[i >> 1], and returns their sum. Entries <= 0 are
+// leaves whose count is num_events[-entry].
+static unsigned int convert_distribution(unsigned int i, aom_tree tree,
+                                         unsigned int branch_ct[][2],
+                                         const unsigned int num_events[]) {
+  const int left_node = tree[i];
+  const int right_node = tree[i + 1];
+  const unsigned int left =
+      (left_node > 0)
+          ? convert_distribution(left_node, tree, branch_ct, num_events)
+          : num_events[-left_node];
+  const unsigned int right =
+      (right_node > 0)
+          ? convert_distribution(right_node, tree, branch_ct, num_events)
+          : num_events[-right_node];
+
+  branch_ct[i >> 1][0] = left;
+  branch_ct[i >> 1][1] = right;
+  return left + right;
+}
+/* Fills branch_ct[] with the left/right event totals of every internal node
+ * of tree, given the per-leaf counts in num_events[]. Despite the name, only
+ * counts are produced here, not probabilities.
+ */
+void av1_tree_probs_from_distribution(aom_tree tree,
+ unsigned int branch_ct[/* n-1 */][2],
+ const unsigned int num_events[/* n */]) {
+ convert_distribution(0, tree, branch_ct, num_events);
+}
diff --git a/third_party/aom/av1/encoder/treewriter.h b/third_party/aom/av1/encoder/treewriter.h
new file mode 100644
index 0000000000..9a4cb86cb2
--- /dev/null
+++ b/third_party/aom/av1/encoder/treewriter.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_ENCODER_TREEWRITER_H_
+#define AV1_ENCODER_TREEWRITER_H_
+
+#include "aom_dsp/bitwriter.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void av1_tree_probs_from_distribution(aom_tree tree,
+ unsigned int branch_ct[/* n - 1 */][2],
+ const unsigned int num_events[/* n */]);
+/* Code word of one tree leaf, as filled in by av1_tokens_from_tree(). */
+struct av1_token {
+ int value; // branch bits from the root down, first branch in the top bit
+ int len; // number of branches taken, i.e. bits used in value
+};
+
+void av1_tokens_from_tree(struct av1_token *, const aom_tree_index *);
+/* Forwards token->value and token->len, with a last argument of 0, to
+ * aom_write_tree() together with the tree and its probabilities.
+ */
+static INLINE void av1_write_token(aom_writer *w, const aom_tree_index *tree,
+ const aom_prob *probs,
+ const struct av1_token *token) {
+ aom_write_tree(w, tree, probs, token->value, token->len, 0);
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AV1_ENCODER_TREEWRITER_H_
diff --git a/third_party/aom/av1/encoder/variance_tree.c b/third_party/aom/av1/encoder/variance_tree.c
new file mode 100644
index 0000000000..9384cd78ef
--- /dev/null
+++ b/third_party/aom/av1/encoder/variance_tree.c
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/encoder/variance_tree.h"
+#include "av1/encoder/encoder.h"
+
+// (Re)allocates td->var_tree as one flat array holding a complete 4-ary tree
+// (leaves first, root last), links every parent to its four children, and
+// points td->var_root[] at the root of each supported superblock size.
+void av1_setup_var_tree(struct AV1Common *cm, ThreadData *td) {
+#if CONFIG_EXT_PARTITION
+  const int leaf_nodes = 1024;
+  const int tree_nodes = 1024 + 256 + 64 + 16 + 4 + 1;
+#else
+  const int leaf_nodes = 256;
+  const int tree_nodes = 256 + 64 + 16 + 4 + 1;
+#endif  // CONFIG_EXT_PARTITION
+  VAR_TREE *next_child;
+  int node_idx;
+  int level_nodes;
+  int root_idx;
+
+  aom_free(td->var_tree);
+  CHECK_MEM_ERROR(cm, td->var_tree,
+                  aom_calloc(tree_nodes, sizeof(*td->var_tree)));
+
+  // Leaves occupy the front of the array and have no children.
+  for (node_idx = 0; node_idx < leaf_nodes; ++node_idx)
+    td->var_tree[node_idx].split[0] = NULL;
+
+  // Parents follow level by level; children are handed out in array order,
+  // so node_idx keeps running on from where the leaf loop stopped.
+  next_child = td->var_tree;
+  for (level_nodes = leaf_nodes / 4; level_nodes > 0; level_nodes /= 4) {
+    const int level_end = node_idx + level_nodes;
+    for (; node_idx < level_end; ++node_idx) {
+      VAR_TREE *const parent = &td->var_tree[node_idx];
+      int child;
+      for (child = 0; child < 4; ++child) parent->split[child] = next_child++;
+    }
+  }
+
+  // The last node is the root for the largest superblock size; each smaller
+  // size uses the first child of the next larger root.
+  root_idx = MAX_MIB_SIZE_LOG2 - MIN_MIB_SIZE_LOG2;
+  td->var_root[root_idx] = &td->var_tree[tree_nodes - 1];
+  for (--root_idx; root_idx >= 0; --root_idx)
+    td->var_root[root_idx] = td->var_root[root_idx + 1]->split[0];
+}
+/* Releases td->var_tree and clears the pointer; td->var_root[] is not reset. */
+void av1_free_var_tree(ThreadData *td) {
+ aom_free(td->var_tree);
+ td->var_tree = NULL;
+}
diff --git a/third_party/aom/av1/encoder/variance_tree.h b/third_party/aom/av1/encoder/variance_tree.h
new file mode 100644
index 0000000000..a9f27302e9
--- /dev/null
+++ b/third_party/aom/av1/encoder/variance_tree.h
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_ENCODER_VARIANCE_TREE_H_
+#define AV1_ENCODER_VARIANCE_TREE_H_
+
+#include <assert.h>
+
+#include "./aom_config.h"
+
+#include "aom/aom_integer.h"
+
+#include "av1/common/enums.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct AV1Common;
+struct ThreadData;
+/* Error statistics of one region; see fill_variance() for the derivation. */
+typedef struct {
+ int64_t sum_square_error;
+ int64_t sum_error;
+ int log2_count; // used as a shift amount, so a log2 of the sample count
+ int variance;
+} VAR;
+/* Variances of one tree node: none covers the whole node, horz[] and vert[]
+ * each hold the two pairwise child sums built by fill_variance_node().
+ */
+typedef struct {
+ VAR none;
+ VAR horz[2];
+ VAR vert[2];
+} partition_variance;
+/* Node of the variance tree laid out by av1_setup_var_tree(). split[] points
+ * at the four children; leaves have split[0] set to NULL.
+ */
+typedef struct VAR_TREE {
+ int force_split;
+ partition_variance variances;
+ struct VAR_TREE *split[4];
+ BLOCK_SIZE bsize;
+ const uint8_t *src;
+ const uint8_t *ref;
+ int src_stride;
+ int ref_stride;
+ int width;
+ int height;
+#if CONFIG_HIGHBITDEPTH
+ int highbd;
+#endif // CONFIG_HIGHBITDEPTH
+} VAR_TREE;
+
+void av1_setup_var_tree(struct AV1Common *cm, struct ThreadData *td);
+void av1_free_var_tree(struct ThreadData *td);
+
+// Stores the raw sums in *v and derives the variance from them:
+//   256 * (s2 - (s * s >> c)) >> c
+// where c is the log2 of the sample count.
+static INLINE void fill_variance(int64_t s2, int64_t s, int c, VAR *v) {
+  const int64_t mean_term = (s * s) >> c;
+
+  v->sum_square_error = s2;
+  v->sum_error = s;
+  v->log2_count = c;
+  v->variance = (int)(256 * (s2 - mean_term) >> c);
+}
+
+// Combines two equally sized regions: the sums are added and the log2 count
+// grows by one, then the variance of the merged region is recomputed into *r.
+static INLINE void sum_2_variances(const VAR *a, const VAR *b, VAR *r) {
+  const int64_t merged_sse = a->sum_square_error + b->sum_square_error;
+  const int64_t merged_sum = a->sum_error + b->sum_error;
+
+  assert(a->log2_count == b->log2_count);
+  fill_variance(merged_sse, merged_sum, a->log2_count + 1, r);
+}
+
+// Builds the horz/vert/none variances of a node from the "none" variances of
+// its four children: horz pairs children (0,1) and (2,3), vert pairs (0,2)
+// and (1,3), and none is the sum of the two vert halves.
+static INLINE void fill_variance_node(VAR_TREE *vt) {
+  const VAR *const c0 = &vt->split[0]->variances.none;
+  const VAR *const c1 = &vt->split[1]->variances.none;
+  const VAR *const c2 = &vt->split[2]->variances.none;
+  const VAR *const c3 = &vt->split[3]->variances.none;
+  partition_variance *const out = &vt->variances;
+
+  sum_2_variances(c0, c1, &out->horz[0]);
+  sum_2_variances(c2, c3, &out->horz[1]);
+  sum_2_variances(c0, c2, &out->vert[0]);
+  sum_2_variances(c1, c3, &out->vert[1]);
+  sum_2_variances(&out->vert[0], &out->vert[1], &out->none);
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif /* AV1_ENCODER_VARIANCE_TREE_H_ */
diff --git a/third_party/aom/av1/encoder/wedge_utils.c b/third_party/aom/av1/encoder/wedge_utils.c
new file mode 100644
index 0000000000..e6edbb6af0
--- /dev/null
+++ b/third_party/aom/av1/encoder/wedge_utils.c
@@ -0,0 +1,125 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "aom/aom_integer.h"
+
+#include "aom_ports/mem.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+
+#include "av1/common/reconinter.h"
+
+#define MAX_MASK_VALUE (1 << WEDGE_WEIGHT_BITS)
+
+/**
+ * Computes SSE of a compound predictor constructed from 2 fundamental
+ * predictors p0 and p1 using blending with mask.
+ *
+ * r1: Residuals of p1.
+ * (source - p1)
+ * d: Difference of p1 and p0.
+ * (p1 - p0)
+ * m: The blending mask
+ * N: Number of pixels
+ *
+ * 'r1', 'd', and 'm' are contiguous.
+ *
+ * Computes:
+ * Sum((MAX_MASK_VALUE*r1 + mask*d)**2), which is equivalent to:
+ * Sum((mask*r0 + (MAX_MASK_VALUE-mask)*r1)**2),
+ * where r0 is (source - p0), and r1 is (source - p1), which in turn
+ * is equivalent to:
+ * Sum((source*MAX_MASK_VALUE - (mask*p0 + (MAX_MASK_VALUE-mask)*p1))**2),
+ * which is the SSE of the residuals of the compound predictor scaled up by
+ * MAX_MASK_VALUE**2.
+ *
+ * Note that we clamp the partial term in the loop to 16 bits signed. This is
+ * to facilitate equivalent SIMD implementation. It should have no effect if
+ * residuals are within 16 - WEDGE_WEIGHT_BITS (=10) signed, which always
+ * holds for 8 bit input, and on real input, it should hold practically always,
+ * as residuals are expected to be small.
+ */
+uint64_t av1_wedge_sse_from_residuals_c(const int16_t *r1, const int16_t *d,
+                                        const uint8_t *m, int N) {
+  uint64_t sum_sq = 0;
+  int px = 0;
+
+  while (px < N) {
+    const int32_t scaled_r1 = MAX_MASK_VALUE * r1[px];
+    const int32_t masked_d = m[px] * d[px];
+    // Saturate to 16 bits before squaring, as the SIMD versions do.
+    const int32_t term = clamp(scaled_r1 + masked_d, INT16_MIN, INT16_MAX);
+    sum_sq += term * term;
+    ++px;
+  }
+  // Undo the MAX_MASK_VALUE**2 scaling, with rounding.
+  return ROUND_POWER_OF_TWO(sum_sq, 2 * WEDGE_WEIGHT_BITS);
+}
+
+/**
+ * Choose the mask sign for a compound predictor.
+ *
+ * ds: Difference of the squares of the residuals.
+ * r0**2 - r1**2
+ * m: The blending mask
+ * N: Number of pixels
+ * limit: Pre-computed threshold value.
+ * MAX_MASK_VALUE/2 * (sum(r0**2) - sum(r1**2))
+ *
+ * 'ds' and 'm' are contiguous.
+ *
+ * Returns true if the negated mask has lower SSE compared to the positive
+ * mask. Computation is based on:
+ * Sum((mask*r0 + (MAX_MASK_VALUE-mask)*r1)**2)
+ * >
+ * Sum(((MAX_MASK_VALUE-mask)*r0 + mask*r1)**2)
+ *
+ * which can be simplified to:
+ *
+ * Sum(mask*(r0**2 - r1**2)) > MAX_MASK_VALUE/2 * (sum(r0**2) - sum(r1**2))
+ *
+ * The right hand side does not depend on the mask, and needs to be passed as
+ * the 'limit' parameter.
+ *
+ * After pre-computing (r0**2 - r1**2), which is passed in as 'ds', the left
+ * hand side is simply a scalar product between an int16_t and uint8_t vector.
+ *
+ * Note that for efficiency, ds is stored on 16 bits. Real input residuals
+ * being small, this should not cause a noticeable issue.
+ */
+int av1_wedge_sign_from_residuals_c(const int16_t *ds, const uint8_t *m, int N,
+                                    int64_t limit) {
+  int64_t acc = 0;
+  int i;
+
+  // Scalar product of ds and m. A bounded loop (rather than do/while (--N))
+  // means N <= 0 yields an empty sum instead of reading past the arrays;
+  // the result is unchanged for every N > 0.
+  for (i = 0; i < N; i++) acc += ds[i] * m[i];
+
+  return acc > limit;
+}
+
+/**
+ * Compute the element-wise difference of the squares of 2 arrays.
+ *
+ * d: Difference of the squares of the inputs: a**2 - b**2
+ * a: First input array
+ * b: Second input array
+ * N: Number of elements
+ *
+ * 'd', 'a', and 'b' are contiguous.
+ *
+ * The result is saturated to signed 16 bits.
+ */
+void av1_wedge_compute_delta_squares_c(int16_t *d, const int16_t *a,
+                                       const int16_t *b, int N) {
+  int k = 0;
+
+  for (; k < N; ++k) {
+    const int a_sq = a[k] * a[k];
+    const int b_sq = b[k] * b[k];
+    // Saturated to the signed 16-bit range before being stored.
+    d[k] = clamp(a_sq - b_sq, INT16_MIN, INT16_MAX);
+  }
+}
diff --git a/third_party/aom/av1/encoder/x86/av1_highbd_quantize_sse4.c b/third_party/aom/av1/encoder/x86/av1_highbd_quantize_sse4.c
new file mode 100644
index 0000000000..fa5626002f
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/av1_highbd_quantize_sse4.c
@@ -0,0 +1,193 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <smmintrin.h>
+#include <stdint.h>
+
+#include "./av1_rtcd.h"
+#include "aom_dsp/aom_dsp_common.h"
+
+// Coefficient quantization phase 1: sign/abs, rounding, and lanes 0-1.
+// param[0-2] : rounding/quan/dequan constants (param[1-2] as 64-bit lanes)
+static INLINE void quantize_coeff_phase1(__m128i *coeff, const __m128i *param,
+ const int shift, const int scale,
+ __m128i *qcoeff, __m128i *dquan,
+ __m128i *sign) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i one = _mm_set1_epi32(1);
+ // *sign becomes -1 for negative lanes and +1 otherwise; *coeff becomes |coeff|.
+ *sign = _mm_cmplt_epi32(*coeff, zero);
+ *sign = _mm_or_si128(*sign, one);
+ *coeff = _mm_abs_epi32(*coeff);
+ // Add rounding, then zero-extend lanes 0-1 (qcoeff[0]) and 2-3 (qcoeff[1]).
+ qcoeff[0] = _mm_add_epi32(*coeff, param[0]);
+ qcoeff[1] = _mm_unpackhi_epi32(qcoeff[0], zero);
+ qcoeff[0] = _mm_unpacklo_epi32(qcoeff[0], zero);
+ // Only lanes 0-1 are multiplied here; qcoeff[1] is left for phase 2.
+ qcoeff[0] = _mm_mul_epi32(qcoeff[0], param[1]);
+ qcoeff[0] = _mm_srli_epi64(qcoeff[0], shift);
+ dquan[0] = _mm_mul_epi32(qcoeff[0], param[2]);
+ dquan[0] = _mm_srli_epi64(dquan[0], scale);
+}
+
+/* Coefficient quantization phase 2: quantizes/dequantizes qcoeff[1] (the
+ * upper pair left unmultiplied by phase 1), merges both pairs back into four
+ * 32-bit lanes, applies *sign and stores the results to qAddr and dqAddr.
+ */
+static INLINE void quantize_coeff_phase2(__m128i *qcoeff, __m128i *dquan,
+ const __m128i *sign,
+ const __m128i *param, const int shift,
+ const int scale, tran_low_t *qAddr,
+ tran_low_t *dqAddr) {
+ __m128i mask0L = _mm_set_epi32(-1, -1, 0, 0);
+ __m128i mask0H = _mm_set_epi32(0, 0, -1, -1);
+ // Same multiply/shift steps as phase 1, applied to the upper pair.
+ qcoeff[1] = _mm_mul_epi32(qcoeff[1], param[1]);
+ qcoeff[1] = _mm_srli_epi64(qcoeff[1], shift);
+ dquan[1] = _mm_mul_epi32(qcoeff[1], param[2]);
+ dquan[1] = _mm_srli_epi64(dquan[1], scale);
+
+ // combine L&H: pack the low dword of each 64-bit result into adjacent lanes
+ qcoeff[0] = _mm_shuffle_epi32(qcoeff[0], 0xd8);
+ qcoeff[1] = _mm_shuffle_epi32(qcoeff[1], 0x8d);
+ // mask0H keeps dwords 0-1 (zeroes the high half); mask0L keeps dwords 2-3.
+ qcoeff[0] = _mm_and_si128(qcoeff[0], mask0H);
+ qcoeff[1] = _mm_and_si128(qcoeff[1], mask0L);
+
+ dquan[0] = _mm_shuffle_epi32(dquan[0], 0xd8);
+ dquan[1] = _mm_shuffle_epi32(dquan[1], 0x8d);
+
+ dquan[0] = _mm_and_si128(dquan[0], mask0H);
+ dquan[1] = _mm_and_si128(dquan[1], mask0L);
+
+ qcoeff[0] = _mm_or_si128(qcoeff[0], qcoeff[1]);
+ dquan[0] = _mm_or_si128(dquan[0], dquan[1]);
+ // Negates the lanes whose *sign entry is negative.
+ qcoeff[0] = _mm_sign_epi32(qcoeff[0], *sign);
+ dquan[0] = _mm_sign_epi32(dquan[0], *sign);
+
+ _mm_storeu_si128((__m128i *)qAddr, qcoeff[0]);
+ _mm_storeu_si128((__m128i *)dqAddr, dquan[0]);
+}
+/* Updates the running per-lane maximum *eob with (iscan[i] + 1) for each of
+ * the eight coefficients at qcoeff_ptr that is non-zero.
+ */
+static INLINE void find_eob(tran_low_t *qcoeff_ptr, const int16_t *iscan,
+ __m128i *eob) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i mask, iscanIdx;
+ const __m128i q0 = _mm_loadu_si128((__m128i const *)qcoeff_ptr);
+ const __m128i q1 = _mm_loadu_si128((__m128i const *)(qcoeff_ptr + 4));
+ __m128i nz_flag0 = _mm_cmpeq_epi32(q0, zero);
+ __m128i nz_flag1 = _mm_cmpeq_epi32(q1, zero);
+ // Invert the compare: all-ones where the coefficient is non-zero.
+ nz_flag0 = _mm_cmpeq_epi32(nz_flag0, zero);
+ nz_flag1 = _mm_cmpeq_epi32(nz_flag1, zero);
+ // mask is -1 per non-zero lane, so (iscan - mask) & mask is iscan + 1 there.
+ mask = _mm_packs_epi32(nz_flag0, nz_flag1);
+ iscanIdx = _mm_loadu_si128((__m128i const *)iscan);
+ iscanIdx = _mm_sub_epi16(iscanIdx, mask);
+ iscanIdx = _mm_and_si128(iscanIdx, mask);
+ *eob = _mm_max_epi16(*eob, iscanIdx);
+}
+/* Horizontal maximum: folds the eight signed 16-bit lanes of *eob into lane 0
+ * (halving the span on each step) and returns that lane. *eob is clobbered.
+ */
+static INLINE uint16_t get_accumulated_eob(__m128i *eob) {
+ __m128i eob_shuffled;
+ uint16_t eobValue;
+ eob_shuffled = _mm_shuffle_epi32(*eob, 0xe);
+ *eob = _mm_max_epi16(*eob, eob_shuffled);
+ eob_shuffled = _mm_shufflelo_epi16(*eob, 0xe);
+ *eob = _mm_max_epi16(*eob, eob_shuffled);
+ eob_shuffled = _mm_shufflelo_epi16(*eob, 0x1);
+ *eob = _mm_max_epi16(*eob, eob_shuffled);
+ eobValue = _mm_extract_epi16(*eob, 0);
+ return eobValue;
+}
+/*
+ * SSE4.1 high-bitdepth "fp" quantizer.
+ *
+ * Zeroes count entries of qcoeff_ptr and dqcoeff_ptr. Unless skip_block is
+ * set, it then quantizes the coefficients eight at a time with round_ptr,
+ * quant_ptr and dequant_ptr (entry 0 for the first coefficient, entry 1 for
+ * all others) and stores the largest iscan[i] + 1 over the non-zero results
+ * in *eob_ptr. With skip_block set, *eob_ptr is 0. zbin_ptr, quant_shift_ptr
+ * and scan are unused.
+ *
+ * NOTE(review): a first group of 8 is always processed and the loop then
+ * steps by 8, so count is presumably a positive multiple of 8 -- confirm.
+ * NOTE(review): skip_block is cast to void below although it is read.
+ */
+void av1_highbd_quantize_fp_sse4_1(
+ const tran_low_t *coeff_ptr, intptr_t count, int skip_block,
+ const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan, int log_scale) {
+ __m128i coeff[2], qcoeff[2], dequant[2], qparam[3], coeff_sign;
+ __m128i eob = _mm_setzero_si128();
+ const tran_low_t *src = coeff_ptr;
+ tran_low_t *quanAddr = qcoeff_ptr;
+ tran_low_t *dquanAddr = dqcoeff_ptr;
+ const int shift = 16 - log_scale;
+ const int coeff_stride = 4;
+ const int quan_stride = coeff_stride;
+ (void)skip_block;
+ (void)zbin_ptr;
+ (void)quant_shift_ptr;
+ (void)scan;
+ // Outputs are cleared up front, so the skip_block path leaves them all zero.
+ memset(quanAddr, 0, count * sizeof(quanAddr[0]));
+ memset(dquanAddr, 0, count * sizeof(dquanAddr[0]));
+
+ if (!skip_block) {
+ coeff[0] = _mm_loadu_si128((__m128i const *)src);
+ // Lane 0 gets the entry-0 (DC) constants, every other lane the entry-1 ones.
+ qparam[0] =
+ _mm_set_epi32(round_ptr[1], round_ptr[1], round_ptr[1], round_ptr[0]);
+ qparam[1] = _mm_set_epi64x(quant_ptr[1], quant_ptr[0]);
+ qparam[2] = _mm_set_epi64x(dequant_ptr[1], dequant_ptr[0]);
+
+ // DC and first 3 AC
+ quantize_coeff_phase1(&coeff[0], qparam, shift, log_scale, qcoeff, dequant,
+ &coeff_sign);
+
+ // update round/quan/dquan for AC
+ qparam[0] = _mm_unpackhi_epi64(qparam[0], qparam[0]);
+ qparam[1] = _mm_set_epi64x(quant_ptr[1], quant_ptr[1]);
+ qparam[2] = _mm_set_epi64x(dequant_ptr[1], dequant_ptr[1]);
+
+ quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift,
+ log_scale, quanAddr, dquanAddr);
+
+ // next 4 AC
+ coeff[1] = _mm_loadu_si128((__m128i const *)(src + coeff_stride));
+ quantize_coeff_phase1(&coeff[1], qparam, shift, log_scale, qcoeff, dequant,
+ &coeff_sign);
+ quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift,
+ log_scale, quanAddr + quan_stride,
+ dquanAddr + quan_stride);
+
+ find_eob(quanAddr, iscan, &eob);
+
+ count -= 8;
+
+ // loop for the rest of AC
+ while (count > 0) {
+ src += coeff_stride << 1;
+ quanAddr += quan_stride << 1;
+ dquanAddr += quan_stride << 1;
+ iscan += quan_stride << 1;
+
+ coeff[0] = _mm_loadu_si128((__m128i const *)src);
+ coeff[1] = _mm_loadu_si128((__m128i const *)(src + coeff_stride));
+
+ quantize_coeff_phase1(&coeff[0], qparam, shift, log_scale, qcoeff,
+ dequant, &coeff_sign);
+ quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift,
+ log_scale, quanAddr, dquanAddr);
+
+ quantize_coeff_phase1(&coeff[1], qparam, shift, log_scale, qcoeff,
+ dequant, &coeff_sign);
+ quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift,
+ log_scale, quanAddr + quan_stride,
+ dquanAddr + quan_stride);
+
+ find_eob(quanAddr, iscan, &eob);
+
+ count -= 8;
+ }
+ *eob_ptr = get_accumulated_eob(&eob);
+ } else {
+ *eob_ptr = 0;
+ }
+}
diff --git a/third_party/aom/av1/encoder/x86/av1_quantize_sse2.c b/third_party/aom/av1/encoder/x86/av1_quantize_sse2.c
new file mode 100644
index 0000000000..f9c95b6cb2
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/av1_quantize_sse2.c
@@ -0,0 +1,211 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>
+#include <xmmintrin.h>
+
+#include "./av1_rtcd.h"
+#include "aom/aom_integer.h"
+
+void av1_quantize_fp_sse2(const int16_t *coeff_ptr, intptr_t n_coeffs,
+ int skip_block, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr,
+ int16_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+ uint16_t *eob_ptr, const int16_t *scan_ptr,
+ const int16_t *iscan_ptr) {  // SSE2 quantizer: q = sign(c) * (((|c| + round) * quant) >> 16), dq = q * dequant, *eob_ptr = max(iscan + 1) over nonzero dq
+ __m128i zero;  // all-zero vector: compare operand and fill value
+ __m128i thr;  // AC skip threshold, set to dequant >> 1 once the first 16 coefficients are done
+ int16_t nzflag;  // nonzero when any |coeff| of the current group of 16 exceeds thr
+ (void)scan_ptr;  // the next three parameters are not used by this implementation
+ (void)zbin_ptr;
+ (void)quant_shift_ptr;
+
+ coeff_ptr += n_coeffs;  // move every buffer to its end ...
+ iscan_ptr += n_coeffs;
+ qcoeff_ptr += n_coeffs;
+ dqcoeff_ptr += n_coeffs;
+ n_coeffs = -n_coeffs;  // ... and index with a negative offset that counts up to 0
+ zero = _mm_setzero_si128();
+
+ if (!skip_block) {
+ __m128i eob;  // per-lane running maximum of (iscan + 1) for nonzero outputs
+ __m128i round, quant, dequant;
+ {
+ __m128i coeff0, coeff1;
+
+ // Setup global values (aligned loads: the three tables must be 16-byte aligned)
+ {
+ round = _mm_load_si128((const __m128i *)round_ptr);
+ quant = _mm_load_si128((const __m128i *)quant_ptr);
+ dequant = _mm_load_si128((const __m128i *)dequant_ptr);
+ }
+
+ {
+ __m128i coeff0_sign, coeff1_sign;
+ __m128i qcoeff0, qcoeff1;
+ __m128i qtmp0, qtmp1;
+ // Do DC and first 15 AC (this group is processed unconditionally; assumes n_coeffs >= 16 - TODO confirm with callers)
+ coeff0 = _mm_load_si128((const __m128i *)(coeff_ptr + n_coeffs));
+ coeff1 = _mm_load_si128((const __m128i *)(coeff_ptr + n_coeffs) + 1);
+
+ // Poor man's sign extract: sign is 0 or -1 per lane, (x ^ sign) - sign == |x|
+ coeff0_sign = _mm_srai_epi16(coeff0, 15);
+ coeff1_sign = _mm_srai_epi16(coeff1, 15);
+ qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
+ qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
+ qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+ qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+ qcoeff0 = _mm_adds_epi16(qcoeff0, round);  // saturating add; uses the table as loaded (lane 0 included)
+ round = _mm_unpackhi_epi64(round, round);  // from here on: upper four lanes replicated across the vector
+ qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+ qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);  // high 16 bits of the signed 16x16 product
+ quant = _mm_unpackhi_epi64(quant, quant);  // same replication for quant
+ qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
+
+ // Reinsert signs
+ qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
+ qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
+ qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+ qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+ _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), qcoeff0);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
+
+ coeff0 = _mm_mullo_epi16(qcoeff0, dequant);  // coeff0/coeff1 are reused for the dequantized values (low 16 bits of the product)
+ dequant = _mm_unpackhi_epi64(dequant, dequant);  // same replication for dequant
+ coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), coeff0);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
+ }
+
+ {
+ // Scan for eob (tests the dequantized values held in coeff0/coeff1)
+ __m128i zero_coeff0, zero_coeff1;
+ __m128i nzero_coeff0, nzero_coeff1;
+ __m128i iscan0, iscan1;
+ __m128i eob1;
+ zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
+ zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
+ nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);  // all ones (-1) where the value is nonzero
+ nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
+ iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
+ iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
+ // Add one to convert from indices to counts (subtracting the -1 mask adds 1 in nonzero lanes only)
+ iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
+ iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
+ eob = _mm_and_si128(iscan0, nzero_coeff0);  // zero lanes contribute 0
+ eob1 = _mm_and_si128(iscan1, nzero_coeff1);
+ eob = _mm_max_epi16(eob, eob1);
+ }
+ n_coeffs += 8 * 2;  // two vectors of eight coefficients per step
+ }
+
+ thr = _mm_srai_epi16(dequant, 1);  // dequant now holds the replicated upper-lane values
+
+ // AC only loop
+ while (n_coeffs < 0) {
+ __m128i coeff0, coeff1;
+ {
+ __m128i coeff0_sign, coeff1_sign;
+ __m128i qcoeff0, qcoeff1;
+ __m128i qtmp0, qtmp1;
+
+ coeff0 = _mm_load_si128((const __m128i *)(coeff_ptr + n_coeffs));
+ coeff1 = _mm_load_si128((const __m128i *)(coeff_ptr + n_coeffs) + 1);
+
+ // Poor man's sign extract
+ coeff0_sign = _mm_srai_epi16(coeff0, 15);
+ coeff1_sign = _mm_srai_epi16(coeff1, 15);
+ qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
+ qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
+ qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+ qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+ nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) |  // any |coeff| > thr in either vector?
+ _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr));
+
+ if (nzflag) {
+ qcoeff0 = _mm_adds_epi16(qcoeff0, round);
+ qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+ qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
+ qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
+
+ // Reinsert signs
+ qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
+ qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
+ qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+ qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+ _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), qcoeff0);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
+
+ coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
+ coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), coeff0);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
+ } else {  // nothing above the threshold: write zeros for the whole group of 16
+ _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, zero);
+
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, zero);
+ }
+ }
+
+ if (nzflag) {  // a zeroed group cannot raise eob, so the scan is skipped for it
+ // Scan for eob
+ __m128i zero_coeff0, zero_coeff1;
+ __m128i nzero_coeff0, nzero_coeff1;
+ __m128i iscan0, iscan1;
+ __m128i eob0, eob1;
+ zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
+ zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
+ nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
+ nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
+ iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
+ iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
+ // Add one to convert from indices to counts (nonzero lanes only, see the first scan)
+ iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
+ iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
+ eob0 = _mm_and_si128(iscan0, nzero_coeff0);
+ eob1 = _mm_and_si128(iscan1, nzero_coeff1);
+ eob0 = _mm_max_epi16(eob0, eob1);
+ eob = _mm_max_epi16(eob, eob0);
+ }
+ n_coeffs += 8 * 2;
+ }
+
+ // Accumulate EOB: horizontal max of the eight lanes
+ {
+ __m128i eob_shuffled;
+ eob_shuffled = _mm_shuffle_epi32(eob, 0xe);  // bring the upper 64 bits down
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);  // bring words 2,3 down to words 0,1
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);  // swap words 0 and 1
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ *eob_ptr = _mm_extract_epi16(eob, 1);  // words 0 and 1 both hold the overall maximum at this point
+ }
+ } else {  // skip_block: zero both outputs and report an empty block
+ do {  // note: executes at least once, i.e. always writes 16 coefficients
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, zero);
+ n_coeffs += 8 * 2;
+ } while (n_coeffs < 0);
+ *eob_ptr = 0;
+ }
+}
diff --git a/third_party/aom/av1/encoder/x86/av1_quantize_ssse3_x86_64.asm b/third_party/aom/av1/encoder/x86/av1_quantize_ssse3_x86_64.asm
new file mode 100644
index 0000000000..ad4ae274e2
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/av1_quantize_ssse3_x86_64.asm
@@ -0,0 +1,204 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%define private_prefix av1
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION_RODATA
+pw_1: times 8 dw 1
+
+SECTION .text
+
+%macro QUANTIZE_FP 2 ; %1 = variant (fp or fp_32x32), %2 = number of GPRs requested from cglobal
+cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
+ shift, qcoeff, dqcoeff, dequant, \
+ eob, scan, iscan
+ cmp dword skipm, 0
+ jne .blank
+
+ ; actual quantize loop - setup pointers, rounders, etc.
+ movifnidn coeffq, coeffmp
+ movifnidn ncoeffq, ncoeffmp
+ mov r2, dequantmp
+ movifnidn zbinq, zbinmp
+ movifnidn roundq, roundmp
+ movifnidn quantq, quantmp
+ mova m1, [roundq] ; m1 = round
+ mova m2, [quantq] ; m2 = quant
+%ifidn %1, fp_32x32
+ pcmpeqw m5, m5
+ psrlw m5, 15 ; m5 = 1 in every word
+ paddw m1, m5
+ psrlw m1, 1 ; m1 = (m1 + 1) / 2
+%endif
+ mova m3, [r2q] ; m3 = dequant
+ mov r3, qcoeffmp
+ mov r4, dqcoeffmp
+ mov r5, iscanmp
+%ifidn %1, fp_32x32
+ psllw m2, 1 ; quant *= 2 for the 32x32 variant
+%endif
+ pxor m5, m5 ; m5 = dedicated zero
+
+ lea coeffq, [ coeffq+ncoeffq*2] ; point all buffers at their ends ...
+ lea r5q, [ r5q+ncoeffq*2]
+ lea r3q, [ r3q+ncoeffq*2]
+ lea r4q, [r4q+ncoeffq*2]
+ neg ncoeffq ; ... and index with a negative count that runs up to 0
+
+ ; get DC and first 15 AC coeffs
+ mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i]
+ mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i]
+ pabsw m6, m9 ; m6 = abs(m9)
+ pabsw m11, m10 ; m11 = abs(m10)
+ pcmpeqw m7, m7 ; m7 = all ones (-1 per word)
+
+ paddsw m6, m1 ; m6 += round
+ punpckhqdq m1, m1 ; replicate the upper four words of round
+ paddsw m11, m1 ; m11 += round
+ pmulhw m8, m6, m2 ; m8 = m6*q>>16
+ punpckhqdq m2, m2 ; replicate the upper four words of quant
+ pmulhw m13, m11, m2 ; m13 = m11*q>>16
+ psignw m8, m9 ; m8 = reinsert sign
+ psignw m13, m10 ; m13 = reinsert sign
+ mova [r3q+ncoeffq*2+ 0], m8
+ mova [r3q+ncoeffq*2+16], m13
+%ifidn %1, fp_32x32
+ pabsw m8, m8
+ pabsw m13, m13
+%endif
+ pmullw m8, m3 ; r4[i] = r3[i] * q
+ punpckhqdq m3, m3 ; replicate the upper four words of dequant
+ pmullw m13, m3 ; r4[i] = r3[i] * q
+%ifidn %1, fp_32x32
+ psrlw m8, 1
+ psrlw m13, 1
+ psignw m8, m9
+ psignw m13, m10
+ psrlw m0, m3, 2 ; m0 = skip threshold (dequant >> 2)
+%else
+ psrlw m0, m3, 1 ; m0 = skip threshold (dequant >> 1)
+%endif
+ mova [r4q+ncoeffq*2+ 0], m8
+ mova [r4q+ncoeffq*2+16], m13
+ pcmpeqw m8, m5 ; m8 = dqcoeff[i] == 0
+ pcmpeqw m13, m5 ; m13 = dqcoeff[i] == 0
+ mova m6, [ r5q+ncoeffq*2+ 0] ; m6 = iscan[i]
+ mova m11, [ r5q+ncoeffq*2+16] ; m11 = iscan[i]
+ psubw m6, m7 ; m6 = iscan[i] + 1
+ psubw m11, m7 ; m11 = iscan[i] + 1
+ pandn m8, m6 ; m8 = max(eob)
+ pandn m13, m11 ; m13 = max(eob)
+ pmaxsw m8, m13
+ add ncoeffq, mmsize
+ jz .accumulate_eob
+
+.ac_only_loop:
+ mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i]
+ mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i]
+ pabsw m6, m9 ; m6 = abs(m9)
+ pabsw m11, m10 ; m11 = abs(m10)
+
+ pcmpgtw m7, m6, m0 ; any |c[i]| above the threshold?
+ pcmpgtw m12, m11, m0
+ pmovmskb r6d, m7
+ pmovmskb r2d, m12
+
+ or r6, r2
+ jz .skip_iter ; no: store zeros for these 16 coefficients
+
+ pcmpeqw m7, m7 ; m7 was clobbered by the compare, restore all ones
+
+ paddsw m6, m1 ; m6 += round
+ paddsw m11, m1 ; m11 += round
+ pmulhw m14, m6, m2 ; m14 = m6*q>>16
+ pmulhw m13, m11, m2 ; m13 = m11*q>>16
+ psignw m14, m9 ; m14 = reinsert sign
+ psignw m13, m10 ; m13 = reinsert sign
+ mova [r3q+ncoeffq*2+ 0], m14
+ mova [r3q+ncoeffq*2+16], m13
+%ifidn %1, fp_32x32
+ pabsw m14, m14
+ pabsw m13, m13
+%endif
+ pmullw m14, m3 ; r4[i] = r3[i] * q
+ pmullw m13, m3 ; r4[i] = r3[i] * q
+%ifidn %1, fp_32x32
+ psrlw m14, 1
+ psrlw m13, 1
+ psignw m14, m9
+ psignw m13, m10
+%endif
+ mova [r4q+ncoeffq*2+ 0], m14
+ mova [r4q+ncoeffq*2+16], m13
+ pcmpeqw m14, m5 ; m14 = dqcoeff[i] == 0
+ pcmpeqw m13, m5 ; m13 = dqcoeff[i] == 0
+ mova m6, [ r5q+ncoeffq*2+ 0] ; m6 = iscan[i]
+ mova m11, [ r5q+ncoeffq*2+16] ; m11 = iscan[i]
+ psubw m6, m7 ; m6 = iscan[i] + 1
+ psubw m11, m7 ; m11 = iscan[i] + 1
+ pandn m14, m6 ; m14 = max(eob)
+ pandn m13, m11 ; m13 = max(eob)
+ pmaxsw m8, m14
+ pmaxsw m8, m13
+ add ncoeffq, mmsize
+ jl .ac_only_loop
+
+ jmp .accumulate_eob
+.skip_iter:
+ mova [r3q+ncoeffq*2+ 0], m5
+ mova [r3q+ncoeffq*2+16], m5
+ mova [r4q+ncoeffq*2+ 0], m5
+ mova [r4q+ncoeffq*2+16], m5
+ add ncoeffq, mmsize
+ jl .ac_only_loop
+
+.accumulate_eob:
+ ; horizontally accumulate/max eobs and write into [eob] memory pointer
+ mov r2, eobmp
+ pshufd m7, m8, 0xe
+ pmaxsw m8, m7
+ pshuflw m7, m8, 0xe
+ pmaxsw m8, m7
+ pshuflw m7, m8, 0x1
+ pmaxsw m8, m7
+ pextrw r6, m8, 0 ; r6 = final eob, zero-extended
+ mov [r2], r6w ; 16-bit store, like .blank: a full-register store would write past the word at [eob]
+ RET
+
+ ; skip-block, i.e. just write all zeroes
+.blank:
+ mov r0, dqcoeffmp
+ movifnidn ncoeffq, ncoeffmp
+ mov r2, qcoeffmp
+ mov r3, eobmp
+
+ lea r0q, [r0q+ncoeffq*2]
+ lea r2q, [r2q+ncoeffq*2]
+ neg ncoeffq
+ pxor m7, m7
+.blank_loop:
+ mova [r0q+ncoeffq*2+ 0], m7
+ mova [r0q+ncoeffq*2+16], m7
+ mova [r2q+ncoeffq*2+ 0], m7
+ mova [r2q+ncoeffq*2+16], m7
+ add ncoeffq, mmsize
+ jl .blank_loop
+ mov word [r3q], 0
+ RET
+%endmacro
+
+INIT_XMM ssse3
+QUANTIZE_FP fp, 7 ; instantiate quantize_fp
+QUANTIZE_FP fp_32x32, 7 ; instantiate quantize_fp_32x32
diff --git a/third_party/aom/av1/encoder/x86/av1_ssim_opt_x86_64.asm b/third_party/aom/av1/encoder/x86/av1_ssim_opt_x86_64.asm
new file mode 100644
index 0000000000..dcc697ba30
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/av1_ssim_opt_x86_64.asm
@@ -0,0 +1,219 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%include "aom_ports/x86_abi_support.asm"
+
+; TABULATE_SSIM - accumulate sum_s, sum_r, sum_sq_s, sum_sq_r and sum_sxr from xmm3 (s words) and xmm4 (r words); clobbers xmm1-xmm3
+%macro TABULATE_SSIM 0
+ paddusw xmm15, xmm3 ; sum_s (saturating word add)
+ paddusw xmm14, xmm4 ; sum_r (saturating word add)
+ movdqa xmm1, xmm3
+ pmaddwd xmm1, xmm1 ; s*s, adjacent pairs summed into dwords
+ paddd xmm13, xmm1 ; sum_sq_s
+ movdqa xmm2, xmm4
+ pmaddwd xmm2, xmm2 ; r*r, adjacent pairs summed into dwords
+ paddd xmm12, xmm2 ; sum_sq_r
+ pmaddwd xmm3, xmm4 ; s*r, adjacent pairs summed into dwords (overwrites xmm3)
+ paddd xmm11, xmm3 ; sum_sxr
+%endmacro
+
+; Sum the four dword lanes of %1 (widened to qwords); the total ends up in the low qword. Requires xmm0 == 0, clobbers xmm2
+%macro SUM_ACROSS_Q 1
+ movdqa xmm2,%1
+ punpckldq %1,xmm0 ; low two dwords zero-extended to qwords
+ punpckhdq xmm2,xmm0 ; high two dwords zero-extended to qwords
+ paddq %1,xmm2 ; two qword partial sums
+ movdqa xmm2,%1
+ punpcklqdq %1,xmm0 ; keep the low partial sum
+ punpckhqdq xmm2,xmm0 ; move the high partial sum to the low qword
+ paddq %1,xmm2 ; low qword = total
+%endmacro
+
+; Sum the eight word lanes of %1: widen to dwords, then finish with SUM_ACROSS_Q. Requires xmm0 == 0, clobbers xmm1 and xmm2
+%macro SUM_ACROSS_W 1
+ movdqa xmm1, %1
+ punpcklwd %1,xmm0 ; low four words zero-extended to dwords
+ punpckhwd xmm1,xmm0 ; high four words zero-extended to dwords
+ paddd %1, xmm1 ; four dword partial sums
+ SUM_ACROSS_Q %1
+%endmacro
+;void av1_ssim_parms_16x16_sse2(
+; unsigned char *s,
+; int sp,
+; unsigned char *r,
+; int rp,
+; unsigned long *sum_s,
+; unsigned long *sum_r,
+; unsigned long *sum_sq_s,
+; unsigned long *sum_sq_r,
+; unsigned long *sum_sxr);
+; Note: each of the five results is written with movd, i.e. as a 32-bit store.
+; TODO: Use parm passing through structure, probably don't need the pxors
+; ( calling app will initialize to 0 ) could easily fit everything in sse2
+; without too much hassle, and can probably do better estimates with psadw
+; or pavgb. At this point this is just meant to be a first pass for calculating
+; all the parms needed for 16x16 ssim so we can play with dssim as distortion
+; in mode selection code.
+global sym(av1_ssim_parms_16x16_sse2) PRIVATE
+sym(av1_ssim_parms_16x16_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 9
+ SAVE_XMM 15
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;s
+ mov rcx, arg(1) ;sp
+ mov rdi, arg(2) ;r
+ mov rax, arg(3) ;rp
+
+ pxor xmm0, xmm0 ; zero, used for all byte/word/dword widening below
+ pxor xmm15,xmm15 ;sum_s
+ pxor xmm14,xmm14 ;sum_r
+ pxor xmm13,xmm13 ;sum_sq_s
+ pxor xmm12,xmm12 ;sum_sq_r
+ pxor xmm11,xmm11 ;sum_sxr
+
+ mov rdx, 16 ;row counter
+.NextRow:
+
+ ;grab source and reference pixels (16 bytes of each per row, unaligned loads)
+ movdqu xmm5, [rsi]
+ movdqu xmm6, [rdi]
+ movdqa xmm3, xmm5
+ movdqa xmm4, xmm6
+ punpckhbw xmm3, xmm0 ; high_s
+ punpckhbw xmm4, xmm0 ; high_r
+
+ TABULATE_SSIM
+
+ movdqa xmm3, xmm5
+ movdqa xmm4, xmm6
+ punpcklbw xmm3, xmm0 ; low_s
+ punpcklbw xmm4, xmm0 ; low_r
+
+ TABULATE_SSIM
+
+ add rsi, rcx ; next s row
+ add rdi, rax ; next r row
+
+ dec rdx ; counter
+ jnz .NextRow
+
+ SUM_ACROSS_W xmm15 ; word accumulators
+ SUM_ACROSS_W xmm14
+ SUM_ACROSS_Q xmm13 ; dword accumulators
+ SUM_ACROSS_Q xmm12
+ SUM_ACROSS_Q xmm11
+
+ mov rdi,arg(4)
+ movd [rdi], xmm15; *sum_s
+ mov rdi,arg(5)
+ movd [rdi], xmm14; *sum_r
+ mov rdi,arg(6)
+ movd [rdi], xmm13; *sum_sq_s
+ mov rdi,arg(7)
+ movd [rdi], xmm12; *sum_sq_r
+ mov rdi,arg(8)
+ movd [rdi], xmm11; *sum_sxr
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void av1_ssim_parms_8x8_sse2(
+; unsigned char *s,
+; int sp,
+; unsigned char *r,
+; int rp,
+; unsigned long *sum_s,
+; unsigned long *sum_r,
+; unsigned long *sum_sq_s,
+; unsigned long *sum_sq_r,
+; unsigned long *sum_sxr);
+; Note: each of the five results is written with movd, i.e. as a 32-bit store.
+; TODO: Use parm passing through structure, probably don't need the pxors
+; ( calling app will initialize to 0 ) could easily fit everything in sse2
+; without too much hassle, and can probably do better estimates with psadw
+; or pavgb. At this point this is just meant to be a first pass for calculating
+; all the parms needed for 8x8 ssim so we can play with dssim as distortion
+; in mode selection code.
+global sym(av1_ssim_parms_8x8_sse2) PRIVATE
+sym(av1_ssim_parms_8x8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 9
+ SAVE_XMM 15
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;s
+ mov rcx, arg(1) ;sp
+ mov rdi, arg(2) ;r
+ mov rax, arg(3) ;rp
+
+ pxor xmm0, xmm0 ; zero, used for all byte/word/dword widening below
+ pxor xmm15,xmm15 ;sum_s
+ pxor xmm14,xmm14 ;sum_r
+ pxor xmm13,xmm13 ;sum_sq_s
+ pxor xmm12,xmm12 ;sum_sq_r
+ pxor xmm11,xmm11 ;sum_sxr
+
+ mov rdx, 8 ;row counter
+.NextRow:
+
+ ;grab source and reference pixels (8 bytes of each per row)
+ movq xmm3, [rsi]
+ movq xmm4, [rdi]
+ punpcklbw xmm3, xmm0 ; low_s
+ punpcklbw xmm4, xmm0 ; low_r
+
+ TABULATE_SSIM
+
+ add rsi, rcx ; next s row
+ add rdi, rax ; next r row
+
+ dec rdx ; counter
+ jnz .NextRow
+
+ SUM_ACROSS_W xmm15 ; word accumulators
+ SUM_ACROSS_W xmm14
+ SUM_ACROSS_Q xmm13 ; dword accumulators
+ SUM_ACROSS_Q xmm12
+ SUM_ACROSS_Q xmm11
+
+ mov rdi,arg(4)
+ movd [rdi], xmm15; *sum_s
+ mov rdi,arg(5)
+ movd [rdi], xmm14; *sum_r
+ mov rdi,arg(6)
+ movd [rdi], xmm13; *sum_sq_s
+ mov rdi,arg(7)
+ movd [rdi], xmm12; *sum_sq_r
+ mov rdi,arg(8)
+ movd [rdi], xmm11; *sum_sxr
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
diff --git a/third_party/aom/av1/encoder/x86/dct_intrin_sse2.c b/third_party/aom/av1/encoder/x86/dct_intrin_sse2.c
new file mode 100644
index 0000000000..37c4b0d888
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/dct_intrin_sse2.c
@@ -0,0 +1,3884 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h> // SSE2
+
+#include "./aom_dsp_rtcd.h"
+#include "./av1_rtcd.h"
+#include "aom_dsp/txfm_common.h"
+#include "aom_dsp/x86/fwd_txfm_sse2.h"
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/txfm_common_sse2.h"
+#include "aom_ports/mem.h"
+
+static INLINE void load_buffer_4x4(const int16_t *input, __m128i *in,
+ int stride, int flipud, int fliplr) {
+ // Loads a 4x4 block of 16-bit samples into in[0..3] (one row per vector,
+ // low four lanes), optionally flipped vertically and/or horizontally,
+ // scales it by 16 and applies a small bias to the first sample.
+ const __m128i bias_rest = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
+ const __m128i bias_first = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
+ __m128i eq;
+ int r;
+
+ // Row r comes from source row r, or from row 3 - r when flipud is set.
+ for (r = 0; r < 4; ++r) {
+ const int src_row = flipud ? 3 - r : r;
+ in[r] = _mm_loadl_epi64((const __m128i *)(input + src_row * stride));
+ }
+
+ // Horizontal flip: reverse the order of the four low words of each row.
+ if (fliplr) {
+ for (r = 0; r < 4; ++r) in[r] = _mm_shufflelo_epi16(in[r], 0x1b);
+ }
+
+ // Scale every sample by 16.
+ for (r = 0; r < 4; ++r) in[r] = _mm_slli_epi16(in[r], 4);
+
+ // Row 0, lane 0: add 1 when the scaled sample is nonzero. A zero sample
+ // matches lane 0 of bias_rest, so the all-ones compare mask (-1) cancels
+ // the +1 carried by bias_first. The other lanes hold multiples of 16 and
+ // therefore never compare equal to 1, so they are left unchanged.
+ eq = _mm_cmpeq_epi16(in[0], bias_rest);
+ in[0] = _mm_add_epi16(in[0], eq);
+ in[0] = _mm_add_epi16(in[0], bias_first);
+ // in[1..3] are returned as loaded, flipped and scaled, without any bias.
+ // The upper four lanes of every vector are zero (64-bit loads).
+}
+
+static INLINE void write_buffer_4x4(tran_low_t *output, __m128i *res) {
+ // Final rounding of the 4x4 result, (x + 1) >> 2 per 16-bit lane, then store.
+ const __m128i round = _mm_set1_epi16(1);
+ // Join the low halves: res[0]|res[1] in one vector, res[2]|res[3] in the other.
+ __m128i rows01 = _mm_unpacklo_epi64(res[0], res[1]);
+ __m128i rows23 = _mm_unpacklo_epi64(res[2], res[3]);
+ rows01 = _mm_srai_epi16(_mm_add_epi16(rows01, round), 2);
+ rows23 = _mm_srai_epi16(_mm_add_epi16(rows23, round), 2);
+ store_output(&rows01, (output + 0 * 8));
+ store_output(&rows23, (output + 1 * 8));
+}
+
+static INLINE void transpose_4x4(__m128i *res) {
+ // Entry layout, two rows per vector ("rc" = row r, column c):
+ // res[0] = 00 01 02 03 20 21 22 23, res[1] = 10 11 12 13 30 31 32 33
+ const __m128i mix_lo = _mm_unpacklo_epi16(res[0], res[1]);
+ const __m128i mix_hi = _mm_unpackhi_epi16(res[0], res[1]);
+
+ // mix_lo = 00 10 01 11 02 12 03 13
+ // mix_hi = 20 30 21 31 22 32 23 33
+ const __m128i cols01 = _mm_unpacklo_epi32(mix_lo, mix_hi);
+ const __m128i cols23 = _mm_unpackhi_epi32(mix_lo, mix_hi);
+
+ // cols01 = 00 10 20 30 01 11 21 31, cols23 = 02 12 22 32 03 13 23 33.
+ // Each output carries one transposed row in its low four lanes.
+ res[0] = cols01;
+ res[1] = _mm_unpackhi_epi64(cols01, cols01);
+ res[2] = cols23;
+ res[3] = _mm_unpackhi_epi64(cols23, cols23);
+}
+
+static void fdct4_sse2(__m128i *in) {  // 4-point forward DCT across in[0..3] (low four lanes of each), followed by transpose_4x4()
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
+ const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+
+ __m128i u[4], v[4];  // scratch, reused for every stage below
+ u[0] = _mm_unpacklo_epi16(in[0], in[1]);  // lanes interleaved as (in0, in1) pairs
+ u[1] = _mm_unpacklo_epi16(in[3], in[2]);  // lanes interleaved as (in3, in2) pairs
+
+ v[0] = _mm_add_epi16(u[0], u[1]);  // pairs (in0 + in3, in1 + in2)
+ v[1] = _mm_sub_epi16(u[0], u[1]);  // pairs (in0 - in3, in1 - in2)
+
+ u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16); // 0 (32-bit result per pair)
+ u[1] = _mm_madd_epi16(v[0], k__cospi_p16_m16); // 2
+ u[2] = _mm_madd_epi16(v[1], k__cospi_p08_p24); // 1
+ u[3] = _mm_madd_epi16(v[1], k__cospi_p24_m08); // 3
+
+ v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);  // round, then shift out DCT_CONST_BITS
+ v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+ u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+ u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+ u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+
+ in[0] = _mm_packs_epi32(u[0], u[1]);  // outputs 0 and 2 share a vector (saturating pack to 16 bits)
+ in[1] = _mm_packs_epi32(u[2], u[3]);  // outputs 1 and 3 share a vector
+ transpose_4x4(in);  // consumes in[0], in[1] and rewrites in[0..3]
+}
+
+static void fadst4_sse2(__m128i *in) {  // 4-point forward ADST across in[0..3] (low four lanes of each), followed by transpose_4x4()
+ const __m128i k__sinpi_p01_p02 = pair_set_epi16(sinpi_1_9, sinpi_2_9);
+ const __m128i k__sinpi_p04_m01 = pair_set_epi16(sinpi_4_9, -sinpi_1_9);
+ const __m128i k__sinpi_p03_p04 = pair_set_epi16(sinpi_3_9, sinpi_4_9);
+ const __m128i k__sinpi_m03_p02 = pair_set_epi16(-sinpi_3_9, sinpi_2_9);
+ const __m128i k__sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi_3_9);
+ const __m128i kZero = _mm_set1_epi16(0);
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ __m128i u[8], v[8];  // scratch; entries are overwritten in place, so statement order matters
+ __m128i in7 = _mm_add_epi16(in[0], in[1]);  // in[0] + in[1]
+
+ u[0] = _mm_unpacklo_epi16(in[0], in[1]);  // (in0, in1) pairs
+ u[1] = _mm_unpacklo_epi16(in[2], in[3]);  // (in2, in3) pairs
+ u[2] = _mm_unpacklo_epi16(in7, kZero);  // (in0 + in1, 0) pairs
+ u[3] = _mm_unpacklo_epi16(in[2], kZero);  // (in2, 0) pairs
+ u[4] = _mm_unpacklo_epi16(in[3], kZero);  // (in3, 0) pairs
+
+ v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p02); // s0 + s2
+ v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p04); // s4 + s5
+ v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03); // x1
+ v[3] = _mm_madd_epi16(u[0], k__sinpi_p04_m01); // s1 - s3
+ v[4] = _mm_madd_epi16(u[1], k__sinpi_m03_p02); // -s4 + s6
+ v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03); // s4
+ v[6] = _mm_madd_epi16(u[4], k__sinpi_p03_p03);  // in3 * sinpi_3_9
+
+ u[0] = _mm_add_epi32(v[0], v[1]);
+ u[1] = _mm_sub_epi32(v[2], v[6]);  // (in0 + in1 - in3) * sinpi_3_9
+ u[2] = _mm_add_epi32(v[3], v[4]);
+ u[3] = _mm_sub_epi32(u[2], u[0]);
+ u[4] = _mm_slli_epi32(v[5], 2);  // 4 * v[5]
+ u[5] = _mm_sub_epi32(u[4], v[5]);  // 3 * v[5]
+ u[6] = _mm_add_epi32(u[3], u[5]);
+
+ v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);  // round, then shift out DCT_CONST_BITS
+ v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);  // note: u[6], not u[3]
+
+ u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+ u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+ u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+ u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+
+ in[0] = _mm_packs_epi32(u[0], u[2]);  // saturating pack to 16 bits, two results per vector
+ in[1] = _mm_packs_epi32(u[1], u[3]);
+ transpose_4x4(in);  // consumes in[0], in[1] and rewrites in[0..3]
+}
+
+#if CONFIG_EXT_TX
+// 4-point "identity transform" stage. Every 16-bit sample in the low four
+// lanes of in[0..3] is multiplied by Sqrt2, rounded with DCT_CONST_ROUNDING
+// and shifted right by DCT_CONST_BITS. The result is then packed to 16 bits
+// and transposed, like the other 4-point stages in this file.
+static void fidtx4_sse2(__m128i *in) {
+ const __m128i zeros = _mm_set1_epi16((int16_t)0);
+ const __m128i scale = _mm_set1_epi16((int16_t)Sqrt2);
+ const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ __m128i scaled[4];
+ int r;
+
+ for (r = 0; r < 4; ++r) {
+ // Pair each sample with a zero word so that madd yields the 32-bit
+ // product sample * Sqrt2 (the zero partner contributes nothing).
+ const __m128i widened = _mm_unpacklo_epi16(in[r], zeros);
+ const __m128i product = _mm_madd_epi16(widened, scale);
+
+ // Add the rounding constant and drop DCT_CONST_BITS fraction bits
+ // with an arithmetic shift.
+ const __m128i biased = _mm_add_epi32(product, rounding);
+ scaled[r] = _mm_srai_epi32(biased, DCT_CONST_BITS);
+ }
+
+ // Saturating pack back to 16 bits. Rows 0|2 and rows 1|3 share a vector,
+ // which is the input layout transpose_4x4() works on.
+ in[0] = _mm_packs_epi32(scaled[0], scaled[2]);
+ in[1] = _mm_packs_epi32(scaled[1], scaled[3]);
+
+ // transpose_4x4() reads in[0] and in[1] only and rewrites all of in[0..3],
+ // leaving one transposed row in the low four lanes of each vector.
+ transpose_4x4(in);
+}
+#endif // CONFIG_EXT_TX
+
+void av1_fht4x4_sse2(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {  // 4x4 forward hybrid transform: load, two 1-D passes selected by tx_type, round and store
+ __m128i rows[4];  // working block, one row per vector
+
+ switch (tx_type) {  // every 1-D pass ends with a transpose, so the 2nd pass works on the other dimension
+ case DCT_DCT: aom_fdct4x4_sse2(input, output, stride); break;
+ case ADST_DCT:
+ load_buffer_4x4(input, rows, stride, 0, 0);
+ fadst4_sse2(rows);
+ fdct4_sse2(rows);
+ write_buffer_4x4(output, rows);
+ break;
+ case DCT_ADST:
+ load_buffer_4x4(input, rows, stride, 0, 0);
+ fdct4_sse2(rows);
+ fadst4_sse2(rows);
+ write_buffer_4x4(output, rows);
+ break;
+ case ADST_ADST:
+ load_buffer_4x4(input, rows, stride, 0, 0);
+ fadst4_sse2(rows);
+ fadst4_sse2(rows);
+ write_buffer_4x4(output, rows);
+ break;
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ load_buffer_4x4(input, rows, stride, 1, 0);  // flipud
+ fadst4_sse2(rows);
+ fdct4_sse2(rows);
+ write_buffer_4x4(output, rows);
+ break;
+ case DCT_FLIPADST:
+ load_buffer_4x4(input, rows, stride, 0, 1);  // fliplr
+ fdct4_sse2(rows);
+ fadst4_sse2(rows);
+ write_buffer_4x4(output, rows);
+ break;
+ case FLIPADST_FLIPADST:
+ load_buffer_4x4(input, rows, stride, 1, 1);  // flipud and fliplr
+ fadst4_sse2(rows);
+ fadst4_sse2(rows);
+ write_buffer_4x4(output, rows);
+ break;
+ case ADST_FLIPADST:
+ load_buffer_4x4(input, rows, stride, 0, 1);  // fliplr
+ fadst4_sse2(rows);
+ fadst4_sse2(rows);
+ write_buffer_4x4(output, rows);
+ break;
+ case FLIPADST_ADST:
+ load_buffer_4x4(input, rows, stride, 1, 0);  // flipud
+ fadst4_sse2(rows);
+ fadst4_sse2(rows);
+ write_buffer_4x4(output, rows);
+ break;
+ case IDTX:
+ load_buffer_4x4(input, rows, stride, 0, 0);
+ fidtx4_sse2(rows);
+ fidtx4_sse2(rows);
+ write_buffer_4x4(output, rows);
+ break;
+ case V_DCT:
+ load_buffer_4x4(input, rows, stride, 0, 0);
+ fdct4_sse2(rows);
+ fidtx4_sse2(rows);
+ write_buffer_4x4(output, rows);
+ break;
+ case H_DCT:
+ load_buffer_4x4(input, rows, stride, 0, 0);
+ fidtx4_sse2(rows);
+ fdct4_sse2(rows);
+ write_buffer_4x4(output, rows);
+ break;
+ case V_ADST:
+ load_buffer_4x4(input, rows, stride, 0, 0);
+ fadst4_sse2(rows);
+ fidtx4_sse2(rows);
+ write_buffer_4x4(output, rows);
+ break;
+ case H_ADST:
+ load_buffer_4x4(input, rows, stride, 0, 0);
+ fidtx4_sse2(rows);
+ fadst4_sse2(rows);
+ write_buffer_4x4(output, rows);
+ break;
+ case V_FLIPADST:
+ load_buffer_4x4(input, rows, stride, 1, 0);  // flipud
+ fadst4_sse2(rows);
+ fidtx4_sse2(rows);
+ write_buffer_4x4(output, rows);
+ break;
+ case H_FLIPADST:
+ load_buffer_4x4(input, rows, stride, 0, 1);  // fliplr
+ fidtx4_sse2(rows);
+ fadst4_sse2(rows);
+ write_buffer_4x4(output, rows);
+ break;
+#endif // CONFIG_EXT_TX
+ default: assert(0);  // unknown tx_type: nothing is written
+ }
+}
+
+void av1_fdct8x8_quant_sse2(const int16_t *input, int stride,
+ int16_t *coeff_ptr, intptr_t n_coeffs,
+ int skip_block, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr,
+ int16_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+ uint16_t *eob_ptr, const int16_t *scan_ptr,
+ const int16_t *iscan_ptr) {
+ __m128i zero;
+ int pass;
+ // Constants
+ // When we use them, in one case, they are all the same. In all others
+ // it's a pair of them that we need to repeat four times. This is done
+ // by constructing the 32 bit constant corresponding to that pair.
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
+ const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
+ const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
+ const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+ const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
+ const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ // Load input
+ __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
+ __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
+ __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
+ __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride));
+ __m128i in4 = _mm_load_si128((const __m128i *)(input + 4 * stride));
+ __m128i in5 = _mm_load_si128((const __m128i *)(input + 5 * stride));
+ __m128i in6 = _mm_load_si128((const __m128i *)(input + 6 * stride));
+ __m128i in7 = _mm_load_si128((const __m128i *)(input + 7 * stride));
+ __m128i *in[8];
+ int index = 0;
+
+ (void)scan_ptr;
+ (void)zbin_ptr;
+ (void)quant_shift_ptr;
+ (void)coeff_ptr;
+
+ // Pre-condition input (shift by two)
+ in0 = _mm_slli_epi16(in0, 2);
+ in1 = _mm_slli_epi16(in1, 2);
+ in2 = _mm_slli_epi16(in2, 2);
+ in3 = _mm_slli_epi16(in3, 2);
+ in4 = _mm_slli_epi16(in4, 2);
+ in5 = _mm_slli_epi16(in5, 2);
+ in6 = _mm_slli_epi16(in6, 2);
+ in7 = _mm_slli_epi16(in7, 2);
+
+ in[0] = &in0;
+ in[1] = &in1;
+ in[2] = &in2;
+ in[3] = &in3;
+ in[4] = &in4;
+ in[5] = &in5;
+ in[6] = &in6;
+ in[7] = &in7;
+
+ // We do two passes, first the columns, then the rows. The results of the
+ // first pass are transposed so that the same column code can be reused. The
+ // results of the second pass are also transposed so that the rows (processed
+ // as columns) are put back in row positions.
+ for (pass = 0; pass < 2; pass++) {
+ // To store results of each pass before the transpose.
+ __m128i res0, res1, res2, res3, res4, res5, res6, res7;
+ // Add/subtract
+ const __m128i q0 = _mm_add_epi16(in0, in7);
+ const __m128i q1 = _mm_add_epi16(in1, in6);
+ const __m128i q2 = _mm_add_epi16(in2, in5);
+ const __m128i q3 = _mm_add_epi16(in3, in4);
+ const __m128i q4 = _mm_sub_epi16(in3, in4);
+ const __m128i q5 = _mm_sub_epi16(in2, in5);
+ const __m128i q6 = _mm_sub_epi16(in1, in6);
+ const __m128i q7 = _mm_sub_epi16(in0, in7);
+ // Work on first four results
+ {
+ // Add/subtract
+ const __m128i r0 = _mm_add_epi16(q0, q3);
+ const __m128i r1 = _mm_add_epi16(q1, q2);
+ const __m128i r2 = _mm_sub_epi16(q1, q2);
+ const __m128i r3 = _mm_sub_epi16(q0, q3);
+ // Interleave to do the multiply by constants which gets us into 32bits
+ const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
+ const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
+ const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
+ const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
+ const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
+ const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
+ const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
+ const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16);
+ const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
+ const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08);
+ const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
+ const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24);
+ // dct_const_round_shift
+ const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+ const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+ const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+ const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+ const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
+ const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
+ const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
+ const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
+ const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+ const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+ const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+ const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+ // Combine
+ res0 = _mm_packs_epi32(w0, w1);
+ res4 = _mm_packs_epi32(w2, w3);
+ res2 = _mm_packs_epi32(w4, w5);
+ res6 = _mm_packs_epi32(w6, w7);
+ }
+ // Work on next four results
+ {
+ // Interleave to do the multiply by constants which gets us into 32bits
+ const __m128i d0 = _mm_unpacklo_epi16(q6, q5);
+ const __m128i d1 = _mm_unpackhi_epi16(q6, q5);
+ const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16);
+ const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16);
+ const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16);
+ const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16);
+ // dct_const_round_shift
+ const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING);
+ const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING);
+ const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING);
+ const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING);
+ const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS);
+ const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS);
+ const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS);
+ const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS);
+ // Combine
+ const __m128i r0 = _mm_packs_epi32(s0, s1);
+ const __m128i r1 = _mm_packs_epi32(s2, s3);
+ // Add/subtract
+ const __m128i x0 = _mm_add_epi16(q4, r0);
+ const __m128i x1 = _mm_sub_epi16(q4, r0);
+ const __m128i x2 = _mm_sub_epi16(q7, r1);
+ const __m128i x3 = _mm_add_epi16(q7, r1);
+ // Interleave to do the multiply by constants which gets us into 32bits
+ const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
+ const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
+ const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
+ const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
+ const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04);
+ const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04);
+ const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28);
+ const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28);
+ const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20);
+ const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20);
+ const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12);
+ const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12);
+ // dct_const_round_shift
+ const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+ const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+ const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+ const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+ const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
+ const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
+ const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
+ const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
+ const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+ const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+ const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+ const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+ // Combine
+ res1 = _mm_packs_epi32(w0, w1);
+ res7 = _mm_packs_epi32(w2, w3);
+ res5 = _mm_packs_epi32(w4, w5);
+ res3 = _mm_packs_epi32(w6, w7);
+ }
+ // Transpose the 8x8.
+ {
+ // 00 01 02 03 04 05 06 07
+ // 10 11 12 13 14 15 16 17
+ // 20 21 22 23 24 25 26 27
+ // 30 31 32 33 34 35 36 37
+ // 40 41 42 43 44 45 46 47
+ // 50 51 52 53 54 55 56 57
+ // 60 61 62 63 64 65 66 67
+ // 70 71 72 73 74 75 76 77
+ const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1);
+ const __m128i tr0_1 = _mm_unpacklo_epi16(res2, res3);
+ const __m128i tr0_2 = _mm_unpackhi_epi16(res0, res1);
+ const __m128i tr0_3 = _mm_unpackhi_epi16(res2, res3);
+ const __m128i tr0_4 = _mm_unpacklo_epi16(res4, res5);
+ const __m128i tr0_5 = _mm_unpacklo_epi16(res6, res7);
+ const __m128i tr0_6 = _mm_unpackhi_epi16(res4, res5);
+ const __m128i tr0_7 = _mm_unpackhi_epi16(res6, res7);
+ // 00 10 01 11 02 12 03 13
+ // 20 30 21 31 22 32 23 33
+ // 04 14 05 15 06 16 07 17
+ // 24 34 25 35 26 36 27 37
+ // 40 50 41 51 42 52 43 53
+ // 60 70 61 71 62 72 63 73
+ // 44 54 45 55 46 56 47 57
+ // 64 74 65 75 66 76 67 77
+ const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+ const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
+ const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+ const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
+ const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
+ const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
+ const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
+ const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
+ // 00 10 20 30 01 11 21 31
+ // 40 50 60 70 41 51 61 71
+ // 02 12 22 32 03 13 23 33
+ // 42 52 62 72 43 53 63 73
+ // 04 14 24 34 05 15 25 35
+ // 44 54 64 74 45 55 65 75
+ // 06 16 26 36 07 17 27 37
+ // 46 56 66 76 47 57 67 77
+ in0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
+ in1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
+ in2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
+ in3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
+ in4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
+ in5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
+ in6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
+ in7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
+ // 00 10 20 30 40 50 60 70
+ // 01 11 21 31 41 51 61 71
+ // 02 12 22 32 42 52 62 72
+ // 03 13 23 33 43 53 63 73
+ // 04 14 24 34 44 54 64 74
+ // 05 15 25 35 45 55 65 75
+ // 06 16 26 36 46 56 66 76
+ // 07 17 27 37 47 57 67 77
+ }
+ }
+ // Post-condition output and store it
+ {
+ // Post-condition (division by two)
+ // division of two 16 bits signed numbers using shifts
+ // n / 2 = (n - (n >> 15)) >> 1
+ const __m128i sign_in0 = _mm_srai_epi16(in0, 15);
+ const __m128i sign_in1 = _mm_srai_epi16(in1, 15);
+ const __m128i sign_in2 = _mm_srai_epi16(in2, 15);
+ const __m128i sign_in3 = _mm_srai_epi16(in3, 15);
+ const __m128i sign_in4 = _mm_srai_epi16(in4, 15);
+ const __m128i sign_in5 = _mm_srai_epi16(in5, 15);
+ const __m128i sign_in6 = _mm_srai_epi16(in6, 15);
+ const __m128i sign_in7 = _mm_srai_epi16(in7, 15);
+ in0 = _mm_sub_epi16(in0, sign_in0);
+ in1 = _mm_sub_epi16(in1, sign_in1);
+ in2 = _mm_sub_epi16(in2, sign_in2);
+ in3 = _mm_sub_epi16(in3, sign_in3);
+ in4 = _mm_sub_epi16(in4, sign_in4);
+ in5 = _mm_sub_epi16(in5, sign_in5);
+ in6 = _mm_sub_epi16(in6, sign_in6);
+ in7 = _mm_sub_epi16(in7, sign_in7);
+ in0 = _mm_srai_epi16(in0, 1);
+ in1 = _mm_srai_epi16(in1, 1);
+ in2 = _mm_srai_epi16(in2, 1);
+ in3 = _mm_srai_epi16(in3, 1);
+ in4 = _mm_srai_epi16(in4, 1);
+ in5 = _mm_srai_epi16(in5, 1);
+ in6 = _mm_srai_epi16(in6, 1);
+ in7 = _mm_srai_epi16(in7, 1);
+ }
+
+ iscan_ptr += n_coeffs;
+ qcoeff_ptr += n_coeffs;
+ dqcoeff_ptr += n_coeffs;
+ n_coeffs = -n_coeffs;
+ zero = _mm_setzero_si128();
+
+ if (!skip_block) {
+ __m128i eob;
+ __m128i round, quant, dequant;
+ {
+ __m128i coeff0, coeff1;
+
+ // Setup global values
+ {
+ round = _mm_load_si128((const __m128i *)round_ptr);
+ quant = _mm_load_si128((const __m128i *)quant_ptr);
+ dequant = _mm_load_si128((const __m128i *)dequant_ptr);
+ }
+
+ {
+ __m128i coeff0_sign, coeff1_sign;
+ __m128i qcoeff0, qcoeff1;
+ __m128i qtmp0, qtmp1;
+ // Do DC and first 15 AC
+ coeff0 = *in[0];
+ coeff1 = *in[1];
+
+ // Poor man's sign extract
+ coeff0_sign = _mm_srai_epi16(coeff0, 15);
+ coeff1_sign = _mm_srai_epi16(coeff1, 15);
+ qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
+ qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
+ qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+ qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+ qcoeff0 = _mm_adds_epi16(qcoeff0, round);
+ round = _mm_unpackhi_epi64(round, round);
+ qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+ qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
+ quant = _mm_unpackhi_epi64(quant, quant);
+ qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
+
+ // Reinsert signs
+ qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
+ qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
+ qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+ qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+ _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), qcoeff0);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
+
+ coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);
+ coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), coeff0);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
+ }
+
+ {
+ // Scan for eob
+ __m128i zero_coeff0, zero_coeff1;
+ __m128i nzero_coeff0, nzero_coeff1;
+ __m128i iscan0, iscan1;
+ __m128i eob1;
+ zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
+ zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
+ nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
+ nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
+ iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
+ iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
+ // Add one to convert from indices to counts
+ iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
+ iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
+ eob = _mm_and_si128(iscan0, nzero_coeff0);
+ eob1 = _mm_and_si128(iscan1, nzero_coeff1);
+ eob = _mm_max_epi16(eob, eob1);
+ }
+ n_coeffs += 8 * 2;
+ }
+
+ // AC only loop
+ index = 2;
+ while (n_coeffs < 0) {
+ __m128i coeff0, coeff1;
+ {
+ __m128i coeff0_sign, coeff1_sign;
+ __m128i qcoeff0, qcoeff1;
+ __m128i qtmp0, qtmp1;
+
+ assert(index < (int)(sizeof(in) / sizeof(in[0])) - 1);
+ coeff0 = *in[index];
+ coeff1 = *in[index + 1];
+
+ // Poor man's sign extract
+ coeff0_sign = _mm_srai_epi16(coeff0, 15);
+ coeff1_sign = _mm_srai_epi16(coeff1, 15);
+ qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
+ qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
+ qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+ qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+ qcoeff0 = _mm_adds_epi16(qcoeff0, round);
+ qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+ qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
+ qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
+
+ // Reinsert signs
+ qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
+ qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
+ qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+ qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+ _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), qcoeff0);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
+
+ coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
+ coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), coeff0);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
+ }
+
+ {
+ // Scan for eob
+ __m128i zero_coeff0, zero_coeff1;
+ __m128i nzero_coeff0, nzero_coeff1;
+ __m128i iscan0, iscan1;
+ __m128i eob0, eob1;
+ zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
+ zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
+ nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
+ nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
+ iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
+ iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
+ // Add one to convert from indices to counts
+ iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
+ iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
+ eob0 = _mm_and_si128(iscan0, nzero_coeff0);
+ eob1 = _mm_and_si128(iscan1, nzero_coeff1);
+ eob0 = _mm_max_epi16(eob0, eob1);
+ eob = _mm_max_epi16(eob, eob0);
+ }
+ n_coeffs += 8 * 2;
+ index += 2;
+ }
+
+ // Accumulate EOB
+ {
+ __m128i eob_shuffled;
+ eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ *eob_ptr = _mm_extract_epi16(eob, 1);
+ }
+ } else {
+ do {
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, zero);
+ n_coeffs += 8 * 2;
+ } while (n_coeffs < 0);
+ *eob_ptr = 0;
+ }
+}
+
+/* Load an 8x8 block of int16_t samples into eight SSE2 registers.
+ *
+ * input  - first sample of the block. Rows are fetched with
+ *          _mm_load_si128 (an aligned load), so every row address
+ *          input + k * stride has to be 16-byte aligned.
+ * in     - receives the eight rows, one __m128i (8 x int16) per row.
+ * stride - distance between consecutive rows, in int16_t elements.
+ * flipud - non-zero: rows are loaded bottom-to-top (in[0] = row 7).
+ * fliplr - non-zero: every row is passed through mm_reverse_epi16(),
+ *          which is defined elsewhere (presumably it reverses the
+ *          eight lanes of a register - confirm).
+ *
+ * After the optional flips every lane is shifted left by 2 bits.
+ */
+static INLINE void load_buffer_8x8(const int16_t *input, __m128i *in,
+ int stride, int flipud, int fliplr) {
+ if (!flipud) {  // natural order: in[k] = row k
+ in[0] = _mm_load_si128((const __m128i *)(input + 0 * stride));
+ in[1] = _mm_load_si128((const __m128i *)(input + 1 * stride));
+ in[2] = _mm_load_si128((const __m128i *)(input + 2 * stride));
+ in[3] = _mm_load_si128((const __m128i *)(input + 3 * stride));
+ in[4] = _mm_load_si128((const __m128i *)(input + 4 * stride));
+ in[5] = _mm_load_si128((const __m128i *)(input + 5 * stride));
+ in[6] = _mm_load_si128((const __m128i *)(input + 6 * stride));
+ in[7] = _mm_load_si128((const __m128i *)(input + 7 * stride));
+ } else {  // vertical flip: in[k] = row 7 - k
+ in[0] = _mm_load_si128((const __m128i *)(input + 7 * stride));
+ in[1] = _mm_load_si128((const __m128i *)(input + 6 * stride));
+ in[2] = _mm_load_si128((const __m128i *)(input + 5 * stride));
+ in[3] = _mm_load_si128((const __m128i *)(input + 4 * stride));
+ in[4] = _mm_load_si128((const __m128i *)(input + 3 * stride));
+ in[5] = _mm_load_si128((const __m128i *)(input + 2 * stride));
+ in[6] = _mm_load_si128((const __m128i *)(input + 1 * stride));
+ in[7] = _mm_load_si128((const __m128i *)(input + 0 * stride));
+ }
+
+ if (fliplr) {  // horizontal flip, applied row by row
+ in[0] = mm_reverse_epi16(in[0]);
+ in[1] = mm_reverse_epi16(in[1]);
+ in[2] = mm_reverse_epi16(in[2]);
+ in[3] = mm_reverse_epi16(in[3]);
+ in[4] = mm_reverse_epi16(in[4]);
+ in[5] = mm_reverse_epi16(in[5]);
+ in[6] = mm_reverse_epi16(in[6]);
+ in[7] = mm_reverse_epi16(in[7]);
+ }
+
+ in[0] = _mm_slli_epi16(in[0], 2);  // scale: each 16-bit lane << 2
+ in[1] = _mm_slli_epi16(in[1], 2);
+ in[2] = _mm_slli_epi16(in[2], 2);
+ in[3] = _mm_slli_epi16(in[3], 2);
+ in[4] = _mm_slli_epi16(in[4], 2);
+ in[5] = _mm_slli_epi16(in[5], 2);
+ in[6] = _mm_slli_epi16(in[6], 2);
+ in[7] = _mm_slli_epi16(in[7], 2);
+}
+
+/* Rounding right shift of all 64 int16 lanes held in res[0..7], in place.
+ *
+ * bit == 1: every lane x becomes (x + (x < 0)) >> 1, i.e. x / 2 rounded
+ *           toward zero.
+ * bit == 2: every lane x becomes (x + 1 + (x < 0)) >> 2; the "+ 1" is a
+ *           saturating add (_mm_adds_epi16).
+ * Any other value of bit takes the shift-by-2 branch without the "+ 1"
+ * rounding term.
+ */
+static INLINE void right_shift_8x8(__m128i *res, const int bit) {
+ __m128i sign0 = _mm_srai_epi16(res[0], 15);  // -1 in negative lanes, else 0
+ __m128i sign1 = _mm_srai_epi16(res[1], 15);
+ __m128i sign2 = _mm_srai_epi16(res[2], 15);
+ __m128i sign3 = _mm_srai_epi16(res[3], 15);
+ __m128i sign4 = _mm_srai_epi16(res[4], 15);
+ __m128i sign5 = _mm_srai_epi16(res[5], 15);
+ __m128i sign6 = _mm_srai_epi16(res[6], 15);
+ __m128i sign7 = _mm_srai_epi16(res[7], 15);
+
+ if (bit == 2) {  // rounding offset, only for the 2-bit shift
+ const __m128i const_rounding = _mm_set1_epi16(1);
+ res[0] = _mm_adds_epi16(res[0], const_rounding);
+ res[1] = _mm_adds_epi16(res[1], const_rounding);
+ res[2] = _mm_adds_epi16(res[2], const_rounding);
+ res[3] = _mm_adds_epi16(res[3], const_rounding);
+ res[4] = _mm_adds_epi16(res[4], const_rounding);
+ res[5] = _mm_adds_epi16(res[5], const_rounding);
+ res[6] = _mm_adds_epi16(res[6], const_rounding);
+ res[7] = _mm_adds_epi16(res[7], const_rounding);
+ }
+
+ res[0] = _mm_sub_epi16(res[0], sign0);  // adds 1 to negative lanes only
+ res[1] = _mm_sub_epi16(res[1], sign1);
+ res[2] = _mm_sub_epi16(res[2], sign2);
+ res[3] = _mm_sub_epi16(res[3], sign3);
+ res[4] = _mm_sub_epi16(res[4], sign4);
+ res[5] = _mm_sub_epi16(res[5], sign5);
+ res[6] = _mm_sub_epi16(res[6], sign6);
+ res[7] = _mm_sub_epi16(res[7], sign7);
+
+ if (bit == 1) {
+ res[0] = _mm_srai_epi16(res[0], 1);
+ res[1] = _mm_srai_epi16(res[1], 1);
+ res[2] = _mm_srai_epi16(res[2], 1);
+ res[3] = _mm_srai_epi16(res[3], 1);
+ res[4] = _mm_srai_epi16(res[4], 1);
+ res[5] = _mm_srai_epi16(res[5], 1);
+ res[6] = _mm_srai_epi16(res[6], 1);
+ res[7] = _mm_srai_epi16(res[7], 1);
+ } else {  // bit == 2 and every other value
+ res[0] = _mm_srai_epi16(res[0], 2);
+ res[1] = _mm_srai_epi16(res[1], 2);
+ res[2] = _mm_srai_epi16(res[2], 2);
+ res[3] = _mm_srai_epi16(res[3], 2);
+ res[4] = _mm_srai_epi16(res[4], 2);
+ res[5] = _mm_srai_epi16(res[5], 2);
+ res[6] = _mm_srai_epi16(res[6], 2);
+ res[7] = _mm_srai_epi16(res[7], 2);
+ }
+}
+
+/* Write the eight row registers res[0..7] to output.
+ *
+ * output - destination of row 0; row k is written at output + k * stride.
+ * res    - the eight rows to store.
+ * stride - distance between consecutive output rows, in tran_low_t
+ *          elements.
+ *
+ * The store itself is delegated to store_output(), defined elsewhere.
+ * NOTE(review): presumably it converts the int16 lanes to tran_low_t when
+ * the two types differ - confirm against its definition.
+ */
+static INLINE void write_buffer_8x8(tran_low_t *output, __m128i *res,
+ int stride) {
+ store_output(&res[0], (output + 0 * stride));
+ store_output(&res[1], (output + 1 * stride));
+ store_output(&res[2], (output + 2 * stride));
+ store_output(&res[3], (output + 3 * stride));
+ store_output(&res[4], (output + 4 * stride));
+ store_output(&res[5], (output + 5 * stride));
+ store_output(&res[6], (output + 6 * stride));
+ store_output(&res[7], (output + 7 * stride));
+}
+
+/* Transpose an 8x8 matrix of 16-bit lanes stored as eight row registers.
+ *
+ * in  - the eight source rows.
+ * res - the eight destination rows. res may be the same array as in
+ *       (in-place use): all of in[0..7] is read into the tr0_* temporaries
+ *       before the first write to res[].
+ *
+ * The transpose is done with three rounds of interleaving: 16-bit unpacks,
+ * then 32-bit unpacks, then 64-bit unpacks. In the layout comments below
+ * "rc" names the element from source row r, column c.
+ */
+static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) {
+ const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
+ const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
+ const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]);
+ const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]);
+ const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
+ const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
+ const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]);
+ const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]);
+ // 00 10 01 11 02 12 03 13
+ // 20 30 21 31 22 32 23 33
+ // 04 14 05 15 06 16 07 17
+ // 24 34 25 35 26 36 27 37
+ // 40 50 41 51 42 52 43 53
+ // 60 70 61 71 62 72 63 73
+ // 44 54 45 55 46 56 47 57
+ // 64 74 65 75 66 76 67 77
+ const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+ const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5);
+ const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+ const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5);
+ const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3);
+ const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
+ const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3);
+ const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
+ // 00 10 20 30 01 11 21 31
+ // 40 50 60 70 41 51 61 71
+ // 02 12 22 32 03 13 23 33
+ // 42 52 62 72 43 53 63 73
+ // 04 14 24 34 05 15 25 35
+ // 44 54 64 74 45 55 65 75
+ // 06 16 26 36 07 17 27 37
+ // 46 56 66 76 47 57 67 77
+ res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1);
+ res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1);
+ res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3);
+ res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3);
+ res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5);
+ res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5);
+ res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7);
+ res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7);
+ // 00 10 20 30 40 50 60 70
+ // 01 11 21 31 41 51 61 71
+ // 02 12 22 32 42 52 62 72
+ // 03 13 23 33 43 53 63 73
+ // 04 14 24 34 44 54 64 74
+ // 05 15 25 35 45 55 65 75
+ // 06 16 26 36 46 56 66 76
+ // 07 17 27 37 47 57 67 77
+}
+/* In-place 8-point forward DCT (going by the function name) on an 8x8
+ * block of 16-bit lanes, followed by a transpose.
+ *
+ * in - eight registers, one per row. The butterflies combine whole
+ *      registers, so each of the eight lane positions is transformed
+ *      independently across in[0..7]. On return in[] holds the
+ *      transposed result.
+ *
+ * Products are formed in 32 bits with _mm_madd_epi16, rounded by adding
+ * DCT_CONST_ROUNDING and shifting right by DCT_CONST_BITS, then narrowed
+ * back to 16 bits with the saturating _mm_packs_epi32. pair_set_epi16()
+ * and the cospi_* constants are defined elsewhere.
+ */
+static void fdct8_sse2(__m128i *in) {
+ // constants
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
+ const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
+ const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
+ const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+ const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
+ const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ __m128i u0, u1, u2, u3, u4, u5, u6, u7;
+ __m128i v0, v1, v2, v3, v4, v5, v6, v7;
+ __m128i s0, s1, s2, s3, s4, s5, s6, s7;
+
+ /* stage 1: fold the rows into four sums (s0..s3) and four
+  * differences (s4..s7) */
+ s0 = _mm_add_epi16(in[0], in[7]);
+ s1 = _mm_add_epi16(in[1], in[6]);
+ s2 = _mm_add_epi16(in[2], in[5]);
+ s3 = _mm_add_epi16(in[3], in[4]);
+ s4 = _mm_sub_epi16(in[3], in[4]);
+ s5 = _mm_sub_epi16(in[2], in[5]);
+ s6 = _mm_sub_epi16(in[1], in[6]);
+ s7 = _mm_sub_epi16(in[0], in[7]);
+
+ u0 = _mm_add_epi16(s0, s3);  // second butterfly on the sums only
+ u1 = _mm_add_epi16(s1, s2);
+ u2 = _mm_sub_epi16(s1, s2);
+ u3 = _mm_sub_epi16(s0, s3);
+ // interleave and perform butterfly multiplication/addition
+ v0 = _mm_unpacklo_epi16(u0, u1);
+ v1 = _mm_unpackhi_epi16(u0, u1);
+ v2 = _mm_unpacklo_epi16(u2, u3);
+ v3 = _mm_unpackhi_epi16(u2, u3);
+
+ u0 = _mm_madd_epi16(v0, k__cospi_p16_p16);
+ u1 = _mm_madd_epi16(v1, k__cospi_p16_p16);
+ u2 = _mm_madd_epi16(v0, k__cospi_p16_m16);
+ u3 = _mm_madd_epi16(v1, k__cospi_p16_m16);
+ u4 = _mm_madd_epi16(v2, k__cospi_p24_p08);
+ u5 = _mm_madd_epi16(v3, k__cospi_p24_p08);
+ u6 = _mm_madd_epi16(v2, k__cospi_m08_p24);
+ u7 = _mm_madd_epi16(v3, k__cospi_m08_p24);
+
+ // shift and rounding
+ v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+ v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+ v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+ v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+ v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
+ v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
+ v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
+ v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
+
+ u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+ u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+ u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+ u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+
+ in[0] = _mm_packs_epi32(u0, u1);  // even-indexed outputs 0, 2, 4, 6
+ in[2] = _mm_packs_epi32(u4, u5);
+ in[4] = _mm_packs_epi32(u2, u3);
+ in[6] = _mm_packs_epi32(u6, u7);
+
+ // stage 2: combine the differences s6 and s5 using the cospi_16_64 pairs
+ // interleave and perform butterfly multiplication/addition
+ u0 = _mm_unpacklo_epi16(s6, s5);
+ u1 = _mm_unpackhi_epi16(s6, s5);
+ v0 = _mm_madd_epi16(u0, k__cospi_p16_m16);
+ v1 = _mm_madd_epi16(u1, k__cospi_p16_m16);
+ v2 = _mm_madd_epi16(u0, k__cospi_p16_p16);
+ v3 = _mm_madd_epi16(u1, k__cospi_p16_p16);
+
+ // shift and rounding
+ u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
+ u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
+ u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
+ u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
+
+ v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
+ v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
+ v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
+ v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
+
+ u0 = _mm_packs_epi32(v0, v1);
+ u1 = _mm_packs_epi32(v2, v3);
+
+ // stage 3: add/subtract the stage-2 results with s4 and s7
+ s0 = _mm_add_epi16(s4, u0);
+ s1 = _mm_sub_epi16(s4, u0);
+ s2 = _mm_sub_epi16(s7, u1);
+ s3 = _mm_add_epi16(s7, u1);
+
+ // stage 4: final multiplies producing the odd-indexed outputs
+ u0 = _mm_unpacklo_epi16(s0, s3);
+ u1 = _mm_unpackhi_epi16(s0, s3);
+ u2 = _mm_unpacklo_epi16(s1, s2);
+ u3 = _mm_unpackhi_epi16(s1, s2);
+
+ v0 = _mm_madd_epi16(u0, k__cospi_p28_p04);
+ v1 = _mm_madd_epi16(u1, k__cospi_p28_p04);
+ v2 = _mm_madd_epi16(u2, k__cospi_p12_p20);
+ v3 = _mm_madd_epi16(u3, k__cospi_p12_p20);
+ v4 = _mm_madd_epi16(u2, k__cospi_m20_p12);
+ v5 = _mm_madd_epi16(u3, k__cospi_m20_p12);
+ v6 = _mm_madd_epi16(u0, k__cospi_m04_p28);
+ v7 = _mm_madd_epi16(u1, k__cospi_m04_p28);
+
+ // shift and rounding
+ u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
+ u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
+ u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
+ u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
+ u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
+ u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
+ u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
+ u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
+
+ v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
+ v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
+ v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
+ v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
+ v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
+ v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
+ v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
+ v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
+
+ in[1] = _mm_packs_epi32(v0, v1);  // odd-indexed outputs 1, 3, 5, 7
+ in[3] = _mm_packs_epi32(v4, v5);
+ in[5] = _mm_packs_epi32(v2, v3);
+ in[7] = _mm_packs_epi32(v6, v7);
+
+ // transpose (source and destination are the same array)
+ array_transpose_8x8(in, in);
+}
+/* In-place 8-point forward ADST (going by the function name) on an 8x8
+ * block of 16-bit lanes, followed by a transpose.
+ *
+ * in - eight registers, one per row. Each of the eight lane positions is
+ *      transformed independently across in[0..7]. in[4..7] are also used
+ *      as 16-bit scratch during stage 1. On return in[] holds the
+ *      transposed result.
+ *
+ * Products are formed in 32 bits with _mm_madd_epi16, rounded by adding
+ * DCT_CONST_ROUNDING and shifting right by DCT_CONST_BITS, then narrowed
+ * back to 16 bits with the saturating _mm_packs_epi32. pair_set_epi16()
+ * and the cospi_* constants are defined elsewhere.
+ */
+static void fadst8_sse2(__m128i *in) {
+ // Constants
+ const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
+ const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
+ const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
+ const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
+ const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
+ const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
+ const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
+ const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
+ const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+ const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
+ const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
+ const __m128i k__const_0 = _mm_set1_epi16(0);
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+
+ __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15;
+ __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
+ __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
+ __m128i s0, s1, s2, s3, s4, s5, s6, s7;
+ __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+
+ /* properly aligned for butterfly input: rows are reordered so that each
+  * pair interleaved below (in0/in1, in2/in3, ...) feeds one multiply pair */
+ in0 = in[7];
+ in1 = in[0];
+ in2 = in[5];
+ in3 = in[2];
+ in4 = in[3];
+ in5 = in[4];
+ in6 = in[1];
+ in7 = in[6];
+
+ // column transformation
+ // stage 1
+ // interleave and multiply/add into 32-bit integer
+ s0 = _mm_unpacklo_epi16(in0, in1);
+ s1 = _mm_unpackhi_epi16(in0, in1);
+ s2 = _mm_unpacklo_epi16(in2, in3);
+ s3 = _mm_unpackhi_epi16(in2, in3);
+ s4 = _mm_unpacklo_epi16(in4, in5);
+ s5 = _mm_unpackhi_epi16(in4, in5);
+ s6 = _mm_unpacklo_epi16(in6, in7);
+ s7 = _mm_unpackhi_epi16(in6, in7);
+
+ u0 = _mm_madd_epi16(s0, k__cospi_p02_p30);
+ u1 = _mm_madd_epi16(s1, k__cospi_p02_p30);
+ u2 = _mm_madd_epi16(s0, k__cospi_p30_m02);
+ u3 = _mm_madd_epi16(s1, k__cospi_p30_m02);
+ u4 = _mm_madd_epi16(s2, k__cospi_p10_p22);
+ u5 = _mm_madd_epi16(s3, k__cospi_p10_p22);
+ u6 = _mm_madd_epi16(s2, k__cospi_p22_m10);
+ u7 = _mm_madd_epi16(s3, k__cospi_p22_m10);
+ u8 = _mm_madd_epi16(s4, k__cospi_p18_p14);
+ u9 = _mm_madd_epi16(s5, k__cospi_p18_p14);
+ u10 = _mm_madd_epi16(s4, k__cospi_p14_m18);
+ u11 = _mm_madd_epi16(s5, k__cospi_p14_m18);
+ u12 = _mm_madd_epi16(s6, k__cospi_p26_p06);
+ u13 = _mm_madd_epi16(s7, k__cospi_p26_p06);
+ u14 = _mm_madd_epi16(s6, k__cospi_p06_m26);
+ u15 = _mm_madd_epi16(s7, k__cospi_p06_m26);
+
+ // addition and subtraction, still at 32-bit precision
+ w0 = _mm_add_epi32(u0, u8);
+ w1 = _mm_add_epi32(u1, u9);
+ w2 = _mm_add_epi32(u2, u10);
+ w3 = _mm_add_epi32(u3, u11);
+ w4 = _mm_add_epi32(u4, u12);
+ w5 = _mm_add_epi32(u5, u13);
+ w6 = _mm_add_epi32(u6, u14);
+ w7 = _mm_add_epi32(u7, u15);
+ w8 = _mm_sub_epi32(u0, u8);
+ w9 = _mm_sub_epi32(u1, u9);
+ w10 = _mm_sub_epi32(u2, u10);
+ w11 = _mm_sub_epi32(u3, u11);
+ w12 = _mm_sub_epi32(u4, u12);
+ w13 = _mm_sub_epi32(u5, u13);
+ w14 = _mm_sub_epi32(u6, u14);
+ w15 = _mm_sub_epi32(u7, u15);
+
+ // shift and rounding of the difference terms w8..w15
+ v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING);
+ v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING);
+ v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING);
+ v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING);
+ v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING);
+ v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING);
+ v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING);
+ v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING);
+
+ u8 = _mm_srai_epi32(v8, DCT_CONST_BITS);
+ u9 = _mm_srai_epi32(v9, DCT_CONST_BITS);
+ u10 = _mm_srai_epi32(v10, DCT_CONST_BITS);
+ u11 = _mm_srai_epi32(v11, DCT_CONST_BITS);
+ u12 = _mm_srai_epi32(v12, DCT_CONST_BITS);
+ u13 = _mm_srai_epi32(v13, DCT_CONST_BITS);
+ u14 = _mm_srai_epi32(v14, DCT_CONST_BITS);
+ u15 = _mm_srai_epi32(v15, DCT_CONST_BITS);
+
+ /* sum terms w0..w7: one more 32-bit add/subtract first, then rounded
+  * back to 16-bit and packed 8 integers per __m128i (s0..s3 below) */
+ v0 = _mm_add_epi32(w0, w4);
+ v1 = _mm_add_epi32(w1, w5);
+ v2 = _mm_add_epi32(w2, w6);
+ v3 = _mm_add_epi32(w3, w7);
+ v4 = _mm_sub_epi32(w0, w4);
+ v5 = _mm_sub_epi32(w1, w5);
+ v6 = _mm_sub_epi32(w2, w6);
+ v7 = _mm_sub_epi32(w3, w7);
+
+ w0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
+ w1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
+ w2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
+ w3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
+ w4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
+ w5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
+ w6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
+ w7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
+
+ v0 = _mm_srai_epi32(w0, DCT_CONST_BITS);
+ v1 = _mm_srai_epi32(w1, DCT_CONST_BITS);
+ v2 = _mm_srai_epi32(w2, DCT_CONST_BITS);
+ v3 = _mm_srai_epi32(w3, DCT_CONST_BITS);
+ v4 = _mm_srai_epi32(w4, DCT_CONST_BITS);
+ v5 = _mm_srai_epi32(w5, DCT_CONST_BITS);
+ v6 = _mm_srai_epi32(w6, DCT_CONST_BITS);
+ v7 = _mm_srai_epi32(w7, DCT_CONST_BITS);
+
+ in[4] = _mm_packs_epi32(u8, u9);  // in[4..7] reused as 16-bit scratch
+ in[5] = _mm_packs_epi32(u10, u11);
+ in[6] = _mm_packs_epi32(u12, u13);
+ in[7] = _mm_packs_epi32(u14, u15);
+
+ // stage 2
+ s0 = _mm_packs_epi32(v0, v1);
+ s1 = _mm_packs_epi32(v2, v3);
+ s2 = _mm_packs_epi32(v4, v5);
+ s3 = _mm_packs_epi32(v6, v7);
+
+ u0 = _mm_unpacklo_epi16(in[4], in[5]);
+ u1 = _mm_unpackhi_epi16(in[4], in[5]);
+ u2 = _mm_unpacklo_epi16(in[6], in[7]);
+ u3 = _mm_unpackhi_epi16(in[6], in[7]);
+
+ v0 = _mm_madd_epi16(u0, k__cospi_p08_p24);
+ v1 = _mm_madd_epi16(u1, k__cospi_p08_p24);
+ v2 = _mm_madd_epi16(u0, k__cospi_p24_m08);
+ v3 = _mm_madd_epi16(u1, k__cospi_p24_m08);
+ v4 = _mm_madd_epi16(u2, k__cospi_m24_p08);
+ v5 = _mm_madd_epi16(u3, k__cospi_m24_p08);
+ v6 = _mm_madd_epi16(u2, k__cospi_p08_p24);
+ v7 = _mm_madd_epi16(u3, k__cospi_p08_p24);
+
+ w0 = _mm_add_epi32(v0, v4);
+ w1 = _mm_add_epi32(v1, v5);
+ w2 = _mm_add_epi32(v2, v6);
+ w3 = _mm_add_epi32(v3, v7);
+ w4 = _mm_sub_epi32(v0, v4);
+ w5 = _mm_sub_epi32(v1, v5);
+ w6 = _mm_sub_epi32(v2, v6);
+ w7 = _mm_sub_epi32(v3, v7);
+
+ v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
+ v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
+ v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
+ v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
+ v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
+ v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
+ v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
+ v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
+
+ u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+ u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+ u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+ u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+
+ // back to 16-bit integers
+ s4 = _mm_packs_epi32(u0, u1);
+ s5 = _mm_packs_epi32(u2, u3);
+ s6 = _mm_packs_epi32(u4, u5);
+ s7 = _mm_packs_epi32(u6, u7);
+
+ // stage 3: cospi_16_64 multiplies on the pairs (s2, s3) and (s6, s7)
+ u0 = _mm_unpacklo_epi16(s2, s3);
+ u1 = _mm_unpackhi_epi16(s2, s3);
+ u2 = _mm_unpacklo_epi16(s6, s7);
+ u3 = _mm_unpackhi_epi16(s6, s7);
+
+ v0 = _mm_madd_epi16(u0, k__cospi_p16_p16);
+ v1 = _mm_madd_epi16(u1, k__cospi_p16_p16);
+ v2 = _mm_madd_epi16(u0, k__cospi_p16_m16);
+ v3 = _mm_madd_epi16(u1, k__cospi_p16_m16);
+ v4 = _mm_madd_epi16(u2, k__cospi_p16_p16);
+ v5 = _mm_madd_epi16(u3, k__cospi_p16_p16);
+ v6 = _mm_madd_epi16(u2, k__cospi_p16_m16);
+ v7 = _mm_madd_epi16(u3, k__cospi_p16_m16);
+
+ u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
+ u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
+ u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
+ u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
+ u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
+ u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
+ u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
+ u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
+
+ v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
+ v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
+ v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
+ v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
+ v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
+ v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
+ v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
+ v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
+
+ s2 = _mm_packs_epi32(v0, v1);
+ s3 = _mm_packs_epi32(v2, v3);
+ s6 = _mm_packs_epi32(v4, v5);
+ s7 = _mm_packs_epi32(v6, v7);
+
+ // FIXME(jingning): do subtract using bit inversion?
+ in[0] = s0;
+ in[1] = _mm_sub_epi16(k__const_0, s4);  // odd-indexed outputs are negated (0 - x)
+ in[2] = s6;
+ in[3] = _mm_sub_epi16(k__const_0, s2);
+ in[4] = s3;
+ in[5] = _mm_sub_epi16(k__const_0, s7);
+ in[6] = s5;
+ in[7] = _mm_sub_epi16(k__const_0, s1);
+
+ // transpose (source and destination are the same array)
+ array_transpose_8x8(in, in);
+}
+
+#if CONFIG_EXT_TX
+// One pass of the 8-point identity transform: every 16-bit lane of
+// in[0..7] is shifted left by one bit (i.e. doubled), after which the
+// eight registers are passed through array_transpose_8x8() in place.
+static void fidtx8_sse2(__m128i *in) {
+  int row;
+
+  for (row = 0; row < 8; ++row) {
+    in[row] = _mm_slli_epi16(in[row], 1);
+  }
+
+  array_transpose_8x8(in, in);
+}
+#endif  // CONFIG_EXT_TX
+
+// Forward 8x8 hybrid transform.
+//
+// input   - source samples, read with row pitch `stride`.
+// output  - destination coefficients, written with a row pitch of 8.
+// tx_type - selects the pair of 1-D kernels (and the load-time flips).
+//
+// DCT_DCT is handed off to aom_fdct8x8_sse2() unchanged. Every other
+// supported type loads the block (the last two load_buffer_8x8() arguments
+// are the flipud / fliplr flags), applies two 1-D kernels in sequence, and
+// then shares the same tail: right_shift_8x8(in, 1) followed by
+// write_buffer_8x8(). An unsupported tx_type trips the assert and nothing is
+// written.
+void av1_fht8x8_sse2(const int16_t *input, tran_low_t *output, int stride,
+                     int tx_type) {
+  __m128i in[8];
+
+  switch (tx_type) {
+    case DCT_DCT:
+      // Dedicated 2-D DCT; it does its own output handling.
+      aom_fdct8x8_sse2(input, output, stride);
+      return;
+    case ADST_DCT:
+      load_buffer_8x8(input, in, stride, 0, 0);
+      fadst8_sse2(in);
+      fdct8_sse2(in);
+      break;
+    case DCT_ADST:
+      load_buffer_8x8(input, in, stride, 0, 0);
+      fdct8_sse2(in);
+      fadst8_sse2(in);
+      break;
+    case ADST_ADST:
+      load_buffer_8x8(input, in, stride, 0, 0);
+      fadst8_sse2(in);
+      fadst8_sse2(in);
+      break;
+#if CONFIG_EXT_TX
+    case FLIPADST_DCT:
+      load_buffer_8x8(input, in, stride, 1, 0);
+      fadst8_sse2(in);
+      fdct8_sse2(in);
+      break;
+    case DCT_FLIPADST:
+      load_buffer_8x8(input, in, stride, 0, 1);
+      fdct8_sse2(in);
+      fadst8_sse2(in);
+      break;
+    case FLIPADST_FLIPADST:
+      load_buffer_8x8(input, in, stride, 1, 1);
+      fadst8_sse2(in);
+      fadst8_sse2(in);
+      break;
+    case ADST_FLIPADST:
+      load_buffer_8x8(input, in, stride, 0, 1);
+      fadst8_sse2(in);
+      fadst8_sse2(in);
+      break;
+    case FLIPADST_ADST:
+      load_buffer_8x8(input, in, stride, 1, 0);
+      fadst8_sse2(in);
+      fadst8_sse2(in);
+      break;
+    case IDTX:
+      load_buffer_8x8(input, in, stride, 0, 0);
+      fidtx8_sse2(in);
+      fidtx8_sse2(in);
+      break;
+    case V_DCT:
+      load_buffer_8x8(input, in, stride, 0, 0);
+      fdct8_sse2(in);
+      fidtx8_sse2(in);
+      break;
+    case H_DCT:
+      load_buffer_8x8(input, in, stride, 0, 0);
+      fidtx8_sse2(in);
+      fdct8_sse2(in);
+      break;
+    case V_ADST:
+      load_buffer_8x8(input, in, stride, 0, 0);
+      fadst8_sse2(in);
+      fidtx8_sse2(in);
+      break;
+    case H_ADST:
+      load_buffer_8x8(input, in, stride, 0, 0);
+      fidtx8_sse2(in);
+      fadst8_sse2(in);
+      break;
+    case V_FLIPADST:
+      load_buffer_8x8(input, in, stride, 1, 0);
+      fadst8_sse2(in);
+      fidtx8_sse2(in);
+      break;
+    case H_FLIPADST:
+      load_buffer_8x8(input, in, stride, 0, 1);
+      fidtx8_sse2(in);
+      fadst8_sse2(in);
+      break;
+#endif  // CONFIG_EXT_TX
+    default:
+      // Unknown transform type: nothing is loaded, so nothing is written.
+      assert(0);
+      return;
+  }
+
+  // Common tail for every two-pass transform above.
+  right_shift_8x8(in, 1);
+  write_buffer_8x8(output, in, 8);
+}
+
+// Loads a 16x16 block as four 8x8 quadrants via load_buffer_8x8().
+//
+// in0 receives the quadrants for the first 8 columns (in0[0..7] and
+// in0[8..15]); in1 receives those for the second 8 columns. When flipud is
+// set the upper and lower source quadrants trade places; when fliplr is set
+// the left and right source quadrants trade places. The same flags are also
+// forwarded to each 8x8 load.
+static INLINE void load_buffer_16x16(const int16_t *input, __m128i *in0,
+                                     __m128i *in1, int stride, int flipud,
+                                     int fliplr) {
+  // Source rows feeding the first / second group of eight registers.
+  const int16_t *const upper = flipud ? input + 8 * stride : input;
+  const int16_t *const lower = flipud ? input : input + 8 * stride;
+  // Source column offsets feeding in0 / in1.
+  const int first_col = fliplr ? 8 : 0;
+  const int second_col = fliplr ? 0 : 8;
+
+  // first 8 columns
+  load_buffer_8x8(upper + first_col, in0, stride, flipud, fliplr);
+  load_buffer_8x8(lower + first_col, in0 + 8, stride, flipud, fliplr);
+
+  // second 8 columns
+  load_buffer_8x8(upper + second_col, in1, stride, flipud, fliplr);
+  load_buffer_8x8(lower + second_col, in1 + 8, stride, flipud, fliplr);
+}
+
+// Stores a 16x16 block as four 8x8 quadrants via write_buffer_8x8().
+// in0 supplies the first 8 columns (upper quadrant in in0[0..7], lower in
+// in0[8..15]); in1 supplies the second 8 columns in the same layout.
+// `stride` is the row pitch of `output`.
+static INLINE void write_buffer_16x16(tran_low_t *output, __m128i *in0,
+                                      __m128i *in1, int stride) {
+  tran_low_t *const left = output;
+  tran_low_t *const right = output + 8;
+  const int lower_half = 8 * stride;
+
+  write_buffer_8x8(left, in0, stride);
+  write_buffer_8x8(left + lower_half, in0 + 8, stride);
+
+  write_buffer_8x8(right, in1, stride);
+  write_buffer_8x8(right + lower_half, in1 + 8, stride);
+}
+
+// Transposes a 16x16 block held as two arrays of sixteen registers by
+// transposing each 8x8 quadrant and exchanging the two off-diagonal ones
+// (res1[0..7] and res0[8..15]). A scratch buffer holds one of them while the
+// other is written into its place.
+static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) {
+  __m128i saved[8];
+  int k;
+
+  array_transpose_8x8(res0, res0);
+  // res1[0..7] is about to be overwritten, so its transpose goes to scratch.
+  array_transpose_8x8(res1, saved);
+  array_transpose_8x8(res0 + 8, res1);
+  array_transpose_8x8(res1 + 8, res1 + 8);
+
+  for (k = 0; k < 8; ++k) {
+    res0[8 + k] = saved[k];
+  }
+}
+
+// Applies right_shift_8x8(..., 2) to each of the four groups of eight
+// registers that make up the 16x16 block (res0[0..7], res0[8..15],
+// res1[0..7], res1[8..15]), in that order.
+static INLINE void right_shift_16x16(__m128i *res0, __m128i *res1) {
+  __m128i *const quadrant[4] = { res0, res0 + 8, res1, res1 + 8 };
+  int q;
+
+  for (q = 0; q < 4; ++q) {
+    right_shift_8x8(quadrant[q], 2);
+  }
+}
+
+static void fdct16_8col(__m128i *in) {
+ // perform 16x16 1-D DCT for 8 columns; in[0..15] is read and overwritten in place
+ __m128i i[8], s[8], p[8], t[8], u[16], v[16];
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
+ const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+ const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
+ const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+ const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
+ const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+ const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
+ const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+ const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64);
+ const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64);
+ const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
+ const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
+ const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64);
+ const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64);
+ const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
+ const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ // every madd result below gets +DCT_CONST_ROUNDING, >> DCT_CONST_BITS, then a saturating pack to 16 bits
+ // stage 1: i[] = sums of mirrored inputs (feed the even-index outputs), s[] = differences (feed the odd-index outputs)
+ i[0] = _mm_add_epi16(in[0], in[15]);
+ i[1] = _mm_add_epi16(in[1], in[14]);
+ i[2] = _mm_add_epi16(in[2], in[13]);
+ i[3] = _mm_add_epi16(in[3], in[12]);
+ i[4] = _mm_add_epi16(in[4], in[11]);
+ i[5] = _mm_add_epi16(in[5], in[10]);
+ i[6] = _mm_add_epi16(in[6], in[9]);
+ i[7] = _mm_add_epi16(in[7], in[8]);
+
+ s[0] = _mm_sub_epi16(in[7], in[8]);
+ s[1] = _mm_sub_epi16(in[6], in[9]);
+ s[2] = _mm_sub_epi16(in[5], in[10]);
+ s[3] = _mm_sub_epi16(in[4], in[11]);
+ s[4] = _mm_sub_epi16(in[3], in[12]);
+ s[5] = _mm_sub_epi16(in[2], in[13]);
+ s[6] = _mm_sub_epi16(in[1], in[14]);
+ s[7] = _mm_sub_epi16(in[0], in[15]);
+ // second butterfly on the sums: p[0..3] lead to in[0], in[4], in[8], in[12]; p[4..7] lead to in[2], in[6], in[10], in[14]
+ p[0] = _mm_add_epi16(i[0], i[7]);
+ p[1] = _mm_add_epi16(i[1], i[6]);
+ p[2] = _mm_add_epi16(i[2], i[5]);
+ p[3] = _mm_add_epi16(i[3], i[4]);
+ p[4] = _mm_sub_epi16(i[3], i[4]);
+ p[5] = _mm_sub_epi16(i[2], i[5]);
+ p[6] = _mm_sub_epi16(i[1], i[6]);
+ p[7] = _mm_sub_epi16(i[0], i[7]);
+
+ u[0] = _mm_add_epi16(p[0], p[3]);
+ u[1] = _mm_add_epi16(p[1], p[2]);
+ u[2] = _mm_sub_epi16(p[1], p[2]);
+ u[3] = _mm_sub_epi16(p[0], p[3]);
+ // interleave two vectors so _mm_madd_epi16 forms a*c0 + b*c1 per 32-bit lane (assumes pair_set_epi16(c0, c1) alternates c0, c1 - TODO confirm)
+ v[0] = _mm_unpacklo_epi16(u[0], u[1]);
+ v[1] = _mm_unpackhi_epi16(u[0], u[1]);
+ v[2] = _mm_unpacklo_epi16(u[2], u[3]);
+ v[3] = _mm_unpackhi_epi16(u[2], u[3]);
+
+ u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16);
+ u[1] = _mm_madd_epi16(v[1], k__cospi_p16_p16);
+ u[2] = _mm_madd_epi16(v[0], k__cospi_p16_m16);
+ u[3] = _mm_madd_epi16(v[1], k__cospi_p16_m16);
+ u[4] = _mm_madd_epi16(v[2], k__cospi_p24_p08);
+ u[5] = _mm_madd_epi16(v[3], k__cospi_p24_p08);
+ u[6] = _mm_madd_epi16(v[2], k__cospi_m08_p24);
+ u[7] = _mm_madd_epi16(v[3], k__cospi_m08_p24);
+
+ v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+ v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+ v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+ v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+
+ u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+ u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+ u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+ u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+ u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
+ u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
+ u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
+ u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
+ // outputs 0, 4, 8, 12 are final; in[] may be overwritten here because all inputs were consumed in stage 1
+ in[0] = _mm_packs_epi32(u[0], u[1]);
+ in[4] = _mm_packs_epi32(u[4], u[5]);
+ in[8] = _mm_packs_epi32(u[2], u[3]);
+ in[12] = _mm_packs_epi32(u[6], u[7]);
+ // mix p[5] and p[6] with +/-cospi_16_64 before combining them with p[4] and p[7]
+ u[0] = _mm_unpacklo_epi16(p[5], p[6]);
+ u[1] = _mm_unpackhi_epi16(p[5], p[6]);
+ v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
+ v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
+ v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
+ v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
+
+ u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+ u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+ u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+ u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+
+ v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+ v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+ v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+ v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+
+ u[0] = _mm_packs_epi32(v[0], v[1]);
+ u[1] = _mm_packs_epi32(v[2], v[3]);
+
+ t[0] = _mm_add_epi16(p[4], u[0]);
+ t[1] = _mm_sub_epi16(p[4], u[0]);
+ t[2] = _mm_sub_epi16(p[7], u[1]);
+ t[3] = _mm_add_epi16(p[7], u[1]);
+
+ u[0] = _mm_unpacklo_epi16(t[0], t[3]);
+ u[1] = _mm_unpackhi_epi16(t[0], t[3]);
+ u[2] = _mm_unpacklo_epi16(t[1], t[2]);
+ u[3] = _mm_unpackhi_epi16(t[1], t[2]);
+
+ v[0] = _mm_madd_epi16(u[0], k__cospi_p28_p04);
+ v[1] = _mm_madd_epi16(u[1], k__cospi_p28_p04);
+ v[2] = _mm_madd_epi16(u[2], k__cospi_p12_p20);
+ v[3] = _mm_madd_epi16(u[3], k__cospi_p12_p20);
+ v[4] = _mm_madd_epi16(u[2], k__cospi_m20_p12);
+ v[5] = _mm_madd_epi16(u[3], k__cospi_m20_p12);
+ v[6] = _mm_madd_epi16(u[0], k__cospi_m04_p28);
+ v[7] = _mm_madd_epi16(u[1], k__cospi_m04_p28);
+
+ u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+ u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+ u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+ u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+ u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+ u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+ u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+ u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+
+ v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+ v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+ v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+ v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+ v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+ v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+ v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+ v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+ // outputs 2, 6, 10, 14 are final
+ in[2] = _mm_packs_epi32(v[0], v[1]);
+ in[6] = _mm_packs_epi32(v[4], v[5]);
+ in[10] = _mm_packs_epi32(v[2], v[3]);
+ in[14] = _mm_packs_epi32(v[6], v[7]);
+
+ // stage 2: odd half - mix s[2]..s[5] with +/-cospi_16_64 into t[2]..t[5]
+ u[0] = _mm_unpacklo_epi16(s[2], s[5]);
+ u[1] = _mm_unpackhi_epi16(s[2], s[5]);
+ u[2] = _mm_unpacklo_epi16(s[3], s[4]);
+ u[3] = _mm_unpackhi_epi16(s[3], s[4]);
+
+ v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
+ v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
+ v[2] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
+ v[3] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
+ v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
+ v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
+ v[6] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
+ v[7] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
+
+ u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+ u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+ u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+ u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+ u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+ u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+ u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+ u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+
+ v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+ v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+ v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+ v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+ v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+ v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+ v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+ v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+
+ t[2] = _mm_packs_epi32(v[0], v[1]);
+ t[3] = _mm_packs_epi32(v[2], v[3]);
+ t[4] = _mm_packs_epi32(v[4], v[5]);
+ t[5] = _mm_packs_epi32(v[6], v[7]);
+
+ // stage 3: butterflies of the untouched s[0], s[1], s[6], s[7] with t[2]..t[5]; p[] is reused as scratch
+ p[0] = _mm_add_epi16(s[0], t[3]);
+ p[1] = _mm_add_epi16(s[1], t[2]);
+ p[2] = _mm_sub_epi16(s[1], t[2]);
+ p[3] = _mm_sub_epi16(s[0], t[3]);
+ p[4] = _mm_sub_epi16(s[7], t[4]);
+ p[5] = _mm_sub_epi16(s[6], t[5]);
+ p[6] = _mm_add_epi16(s[6], t[5]);
+ p[7] = _mm_add_epi16(s[7], t[4]);
+
+ // stage 4: mix p[1], p[6] and p[2], p[5] using the cospi_8_64 / cospi_24_64 constant pairs
+ u[0] = _mm_unpacklo_epi16(p[1], p[6]);
+ u[1] = _mm_unpackhi_epi16(p[1], p[6]);
+ u[2] = _mm_unpacklo_epi16(p[2], p[5]);
+ u[3] = _mm_unpackhi_epi16(p[2], p[5]);
+
+ v[0] = _mm_madd_epi16(u[0], k__cospi_m08_p24);
+ v[1] = _mm_madd_epi16(u[1], k__cospi_m08_p24);
+ v[2] = _mm_madd_epi16(u[2], k__cospi_m24_m08);
+ v[3] = _mm_madd_epi16(u[3], k__cospi_m24_m08);
+ v[4] = _mm_madd_epi16(u[2], k__cospi_m08_p24);
+ v[5] = _mm_madd_epi16(u[3], k__cospi_m08_p24);
+ v[6] = _mm_madd_epi16(u[0], k__cospi_p24_p08);
+ v[7] = _mm_madd_epi16(u[1], k__cospi_p24_p08);
+
+ u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+ u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+ u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+ u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+ u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+ u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+ u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+ u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+
+ v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+ v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+ v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+ v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+ v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+ v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+ v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+ v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+
+ t[1] = _mm_packs_epi32(v[0], v[1]);
+ t[2] = _mm_packs_epi32(v[2], v[3]);
+ t[5] = _mm_packs_epi32(v[4], v[5]);
+ t[6] = _mm_packs_epi32(v[6], v[7]);
+
+ // stage 5: butterflies of p[0], p[3], p[4], p[7] with t[1], t[2], t[5], t[6]; s[] is reused as scratch
+ s[0] = _mm_add_epi16(p[0], t[1]);
+ s[1] = _mm_sub_epi16(p[0], t[1]);
+ s[2] = _mm_sub_epi16(p[3], t[2]);
+ s[3] = _mm_add_epi16(p[3], t[2]);
+ s[4] = _mm_add_epi16(p[4], t[5]);
+ s[5] = _mm_sub_epi16(p[4], t[5]);
+ s[6] = _mm_sub_epi16(p[7], t[6]);
+ s[7] = _mm_add_epi16(p[7], t[6]);
+
+ // stage 6: final multiplies; each pair (s[k], s[7-k]) yields two of the odd-index outputs
+ u[0] = _mm_unpacklo_epi16(s[0], s[7]);
+ u[1] = _mm_unpackhi_epi16(s[0], s[7]);
+ u[2] = _mm_unpacklo_epi16(s[1], s[6]);
+ u[3] = _mm_unpackhi_epi16(s[1], s[6]);
+ u[4] = _mm_unpacklo_epi16(s[2], s[5]);
+ u[5] = _mm_unpackhi_epi16(s[2], s[5]);
+ u[6] = _mm_unpacklo_epi16(s[3], s[4]);
+ u[7] = _mm_unpackhi_epi16(s[3], s[4]);
+
+ v[0] = _mm_madd_epi16(u[0], k__cospi_p30_p02);
+ v[1] = _mm_madd_epi16(u[1], k__cospi_p30_p02);
+ v[2] = _mm_madd_epi16(u[2], k__cospi_p14_p18);
+ v[3] = _mm_madd_epi16(u[3], k__cospi_p14_p18);
+ v[4] = _mm_madd_epi16(u[4], k__cospi_p22_p10);
+ v[5] = _mm_madd_epi16(u[5], k__cospi_p22_p10);
+ v[6] = _mm_madd_epi16(u[6], k__cospi_p06_p26);
+ v[7] = _mm_madd_epi16(u[7], k__cospi_p06_p26);
+ v[8] = _mm_madd_epi16(u[6], k__cospi_m26_p06);
+ v[9] = _mm_madd_epi16(u[7], k__cospi_m26_p06);
+ v[10] = _mm_madd_epi16(u[4], k__cospi_m10_p22);
+ v[11] = _mm_madd_epi16(u[5], k__cospi_m10_p22);
+ v[12] = _mm_madd_epi16(u[2], k__cospi_m18_p14);
+ v[13] = _mm_madd_epi16(u[3], k__cospi_m18_p14);
+ v[14] = _mm_madd_epi16(u[0], k__cospi_m02_p30);
+ v[15] = _mm_madd_epi16(u[1], k__cospi_m02_p30);
+
+ u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+ u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+ u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+ u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+ u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+ u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+ u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+ u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+ u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
+ u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
+ u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
+ u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
+ u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
+ u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
+ u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
+ u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
+
+ v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+ v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+ v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+ v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+ v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+ v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+ v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+ v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+ v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
+ v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
+ v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
+ v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
+ v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
+ v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
+ v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
+ v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
+ // store the eight odd-index outputs (note the non-sequential destination order)
+ in[1] = _mm_packs_epi32(v[0], v[1]);
+ in[9] = _mm_packs_epi32(v[2], v[3]);
+ in[5] = _mm_packs_epi32(v[4], v[5]);
+ in[13] = _mm_packs_epi32(v[6], v[7]);
+ in[3] = _mm_packs_epi32(v[8], v[9]);
+ in[11] = _mm_packs_epi32(v[10], v[11]);
+ in[7] = _mm_packs_epi32(v[12], v[13]);
+ in[15] = _mm_packs_epi32(v[14], v[15]);
+}
+
+static void fadst16_8col(__m128i *in) {
+ // perform 16x16 1-D ADST for 8 columns
+ __m128i s[16], x[16], u[32], v[32];
+ const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
+ const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64);
+ const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64);
+ const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64);
+ const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64);
+ const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64);
+ const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64);
+ const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64);
+ const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64);
+ const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64);
+ const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64);
+ const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64);
+ const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64);
+ const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64);
+ const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64);
+ const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64);
+ const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
+ const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+ const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
+ const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
+ const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64);
+ const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64);
+ const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+ const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
+ const __m128i k__cospi_m16_m16 = _mm_set1_epi16((int16_t)-cospi_16_64);
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
+ const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ const __m128i kZero = _mm_set1_epi16(0);
+
+ u[0] = _mm_unpacklo_epi16(in[15], in[0]);
+ u[1] = _mm_unpackhi_epi16(in[15], in[0]);
+ u[2] = _mm_unpacklo_epi16(in[13], in[2]);
+ u[3] = _mm_unpackhi_epi16(in[13], in[2]);
+ u[4] = _mm_unpacklo_epi16(in[11], in[4]);
+ u[5] = _mm_unpackhi_epi16(in[11], in[4]);
+ u[6] = _mm_unpacklo_epi16(in[9], in[6]);
+ u[7] = _mm_unpackhi_epi16(in[9], in[6]);
+ u[8] = _mm_unpacklo_epi16(in[7], in[8]);
+ u[9] = _mm_unpackhi_epi16(in[7], in[8]);
+ u[10] = _mm_unpacklo_epi16(in[5], in[10]);
+ u[11] = _mm_unpackhi_epi16(in[5], in[10]);
+ u[12] = _mm_unpacklo_epi16(in[3], in[12]);
+ u[13] = _mm_unpackhi_epi16(in[3], in[12]);
+ u[14] = _mm_unpacklo_epi16(in[1], in[14]);
+ u[15] = _mm_unpackhi_epi16(in[1], in[14]);
+
+ v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31);
+ v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31);
+ v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01);
+ v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01);
+ v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27);
+ v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27);
+ v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05);
+ v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05);
+ v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23);
+ v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23);
+ v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09);
+ v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09);
+ v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19);
+ v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19);
+ v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13);
+ v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13);
+ v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15);
+ v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15);
+ v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17);
+ v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17);
+ v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11);
+ v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11);
+ v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21);
+ v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21);
+ v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07);
+ v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07);
+ v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25);
+ v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25);
+ v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03);
+ v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03);
+ v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29);
+ v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29);
+
+ u[0] = _mm_add_epi32(v[0], v[16]);
+ u[1] = _mm_add_epi32(v[1], v[17]);
+ u[2] = _mm_add_epi32(v[2], v[18]);
+ u[3] = _mm_add_epi32(v[3], v[19]);
+ u[4] = _mm_add_epi32(v[4], v[20]);
+ u[5] = _mm_add_epi32(v[5], v[21]);
+ u[6] = _mm_add_epi32(v[6], v[22]);
+ u[7] = _mm_add_epi32(v[7], v[23]);
+ u[8] = _mm_add_epi32(v[8], v[24]);
+ u[9] = _mm_add_epi32(v[9], v[25]);
+ u[10] = _mm_add_epi32(v[10], v[26]);
+ u[11] = _mm_add_epi32(v[11], v[27]);
+ u[12] = _mm_add_epi32(v[12], v[28]);
+ u[13] = _mm_add_epi32(v[13], v[29]);
+ u[14] = _mm_add_epi32(v[14], v[30]);
+ u[15] = _mm_add_epi32(v[15], v[31]);
+ u[16] = _mm_sub_epi32(v[0], v[16]);
+ u[17] = _mm_sub_epi32(v[1], v[17]);
+ u[18] = _mm_sub_epi32(v[2], v[18]);
+ u[19] = _mm_sub_epi32(v[3], v[19]);
+ u[20] = _mm_sub_epi32(v[4], v[20]);
+ u[21] = _mm_sub_epi32(v[5], v[21]);
+ u[22] = _mm_sub_epi32(v[6], v[22]);
+ u[23] = _mm_sub_epi32(v[7], v[23]);
+ u[24] = _mm_sub_epi32(v[8], v[24]);
+ u[25] = _mm_sub_epi32(v[9], v[25]);
+ u[26] = _mm_sub_epi32(v[10], v[26]);
+ u[27] = _mm_sub_epi32(v[11], v[27]);
+ u[28] = _mm_sub_epi32(v[12], v[28]);
+ u[29] = _mm_sub_epi32(v[13], v[29]);
+ u[30] = _mm_sub_epi32(v[14], v[30]);
+ u[31] = _mm_sub_epi32(v[15], v[31]);
+
+ v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING);
+ v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING);
+ v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING);
+ v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING);
+ v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING);
+ v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING);
+ v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING);
+ v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING);
+ v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING);
+ v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING);
+ v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING);
+ v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING);
+ v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING);
+ v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING);
+ v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING);
+ v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING);
+
+ u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS);
+ u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS);
+ u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS);
+ u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS);
+ u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS);
+ u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS);
+ u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS);
+ u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS);
+ u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS);
+ u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS);
+ u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS);
+ u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS);
+ u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS);
+ u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS);
+ u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS);
+ u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS);
+
+ v[0] = _mm_add_epi32(u[0], u[8]);
+ v[1] = _mm_add_epi32(u[1], u[9]);
+ v[2] = _mm_add_epi32(u[2], u[10]);
+ v[3] = _mm_add_epi32(u[3], u[11]);
+ v[4] = _mm_add_epi32(u[4], u[12]);
+ v[5] = _mm_add_epi32(u[5], u[13]);
+ v[6] = _mm_add_epi32(u[6], u[14]);
+ v[7] = _mm_add_epi32(u[7], u[15]);
+
+ v[16] = _mm_add_epi32(v[0], v[4]);
+ v[17] = _mm_add_epi32(v[1], v[5]);
+ v[18] = _mm_add_epi32(v[2], v[6]);
+ v[19] = _mm_add_epi32(v[3], v[7]);
+ v[20] = _mm_sub_epi32(v[0], v[4]);
+ v[21] = _mm_sub_epi32(v[1], v[5]);
+ v[22] = _mm_sub_epi32(v[2], v[6]);
+ v[23] = _mm_sub_epi32(v[3], v[7]);
+ v[16] = _mm_add_epi32(v[16], k__DCT_CONST_ROUNDING);
+ v[17] = _mm_add_epi32(v[17], k__DCT_CONST_ROUNDING);
+ v[18] = _mm_add_epi32(v[18], k__DCT_CONST_ROUNDING);
+ v[19] = _mm_add_epi32(v[19], k__DCT_CONST_ROUNDING);
+ v[20] = _mm_add_epi32(v[20], k__DCT_CONST_ROUNDING);
+ v[21] = _mm_add_epi32(v[21], k__DCT_CONST_ROUNDING);
+ v[22] = _mm_add_epi32(v[22], k__DCT_CONST_ROUNDING);
+ v[23] = _mm_add_epi32(v[23], k__DCT_CONST_ROUNDING);
+ v[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS);
+ v[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS);
+ v[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS);
+ v[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS);
+ v[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS);
+ v[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS);
+ v[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS);
+ v[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS);
+ s[0] = _mm_packs_epi32(v[16], v[17]);
+ s[1] = _mm_packs_epi32(v[18], v[19]);
+ s[2] = _mm_packs_epi32(v[20], v[21]);
+ s[3] = _mm_packs_epi32(v[22], v[23]);
+
+ v[8] = _mm_sub_epi32(u[0], u[8]);
+ v[9] = _mm_sub_epi32(u[1], u[9]);
+ v[10] = _mm_sub_epi32(u[2], u[10]);
+ v[11] = _mm_sub_epi32(u[3], u[11]);
+ v[12] = _mm_sub_epi32(u[4], u[12]);
+ v[13] = _mm_sub_epi32(u[5], u[13]);
+ v[14] = _mm_sub_epi32(u[6], u[14]);
+ v[15] = _mm_sub_epi32(u[7], u[15]);
+
+ v[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
+ v[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
+ v[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
+ v[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
+ v[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
+ v[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
+ v[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
+ v[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
+
+ v[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
+ v[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
+ v[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
+ v[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
+ v[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
+ v[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
+ v[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
+ v[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
+
+ s[4] = _mm_packs_epi32(v[8], v[9]);
+ s[5] = _mm_packs_epi32(v[10], v[11]);
+ s[6] = _mm_packs_epi32(v[12], v[13]);
+ s[7] = _mm_packs_epi32(v[14], v[15]);
+ //
+
+ s[8] = _mm_packs_epi32(u[16], u[17]);
+ s[9] = _mm_packs_epi32(u[18], u[19]);
+ s[10] = _mm_packs_epi32(u[20], u[21]);
+ s[11] = _mm_packs_epi32(u[22], u[23]);
+ s[12] = _mm_packs_epi32(u[24], u[25]);
+ s[13] = _mm_packs_epi32(u[26], u[27]);
+ s[14] = _mm_packs_epi32(u[28], u[29]);
+ s[15] = _mm_packs_epi32(u[30], u[31]);
+
+ // stage 2
+ u[0] = _mm_unpacklo_epi16(s[8], s[9]);
+ u[1] = _mm_unpackhi_epi16(s[8], s[9]);
+ u[2] = _mm_unpacklo_epi16(s[10], s[11]);
+ u[3] = _mm_unpackhi_epi16(s[10], s[11]);
+ u[4] = _mm_unpacklo_epi16(s[12], s[13]);
+ u[5] = _mm_unpackhi_epi16(s[12], s[13]);
+ u[6] = _mm_unpacklo_epi16(s[14], s[15]);
+ u[7] = _mm_unpackhi_epi16(s[14], s[15]);
+
+ v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
+ v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
+ v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
+ v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
+ v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
+ v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
+ v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
+ v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
+ v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04);
+ v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04);
+ v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28);
+ v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28);
+ v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20);
+ v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20);
+ v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12);
+ v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12);
+
+ u[0] = _mm_add_epi32(v[0], v[8]);
+ u[1] = _mm_add_epi32(v[1], v[9]);
+ u[2] = _mm_add_epi32(v[2], v[10]);
+ u[3] = _mm_add_epi32(v[3], v[11]);
+ u[4] = _mm_add_epi32(v[4], v[12]);
+ u[5] = _mm_add_epi32(v[5], v[13]);
+ u[6] = _mm_add_epi32(v[6], v[14]);
+ u[7] = _mm_add_epi32(v[7], v[15]);
+ u[8] = _mm_sub_epi32(v[0], v[8]);
+ u[9] = _mm_sub_epi32(v[1], v[9]);
+ u[10] = _mm_sub_epi32(v[2], v[10]);
+ u[11] = _mm_sub_epi32(v[3], v[11]);
+ u[12] = _mm_sub_epi32(v[4], v[12]);
+ u[13] = _mm_sub_epi32(v[5], v[13]);
+ u[14] = _mm_sub_epi32(v[6], v[14]);
+ u[15] = _mm_sub_epi32(v[7], v[15]);
+
+ v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
+ v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
+ v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+ v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+ v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+ v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+ v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+ v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+ u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
+ u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
+ u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
+ u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
+ u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
+ u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
+ u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
+ u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
+
+ v[8] = _mm_add_epi32(u[0], u[4]);
+ v[9] = _mm_add_epi32(u[1], u[5]);
+ v[10] = _mm_add_epi32(u[2], u[6]);
+ v[11] = _mm_add_epi32(u[3], u[7]);
+ v[12] = _mm_sub_epi32(u[0], u[4]);
+ v[13] = _mm_sub_epi32(u[1], u[5]);
+ v[14] = _mm_sub_epi32(u[2], u[6]);
+ v[15] = _mm_sub_epi32(u[3], u[7]);
+
+ v[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
+ v[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
+ v[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
+ v[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
+ v[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
+ v[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
+ v[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
+ v[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
+ v[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
+ v[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
+ v[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
+ v[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
+ v[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
+ v[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
+ v[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
+ v[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
+ s[8] = _mm_packs_epi32(v[8], v[9]);
+ s[9] = _mm_packs_epi32(v[10], v[11]);
+ s[10] = _mm_packs_epi32(v[12], v[13]);
+ s[11] = _mm_packs_epi32(v[14], v[15]);
+
+ x[12] = _mm_packs_epi32(u[8], u[9]);
+ x[13] = _mm_packs_epi32(u[10], u[11]);
+ x[14] = _mm_packs_epi32(u[12], u[13]);
+ x[15] = _mm_packs_epi32(u[14], u[15]);
+
+ // stage 3
+ u[0] = _mm_unpacklo_epi16(s[4], s[5]);
+ u[1] = _mm_unpackhi_epi16(s[4], s[5]);
+ u[2] = _mm_unpacklo_epi16(s[6], s[7]);
+ u[3] = _mm_unpackhi_epi16(s[6], s[7]);
+ u[4] = _mm_unpacklo_epi16(x[12], x[13]);
+ u[5] = _mm_unpackhi_epi16(x[12], x[13]);
+ u[6] = _mm_unpacklo_epi16(x[14], x[15]);
+ u[7] = _mm_unpackhi_epi16(x[14], x[15]);
+
+ v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24);
+ v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
+ v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08);
+ v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
+ v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08);
+ v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08);
+ v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
+ v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
+ v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24);
+ v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24);
+ v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08);
+ v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08);
+ v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08);
+ v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08);
+ v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24);
+ v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24);
+
+ u[0] = _mm_add_epi32(v[0], v[4]);
+ u[1] = _mm_add_epi32(v[1], v[5]);
+ u[2] = _mm_add_epi32(v[2], v[6]);
+ u[3] = _mm_add_epi32(v[3], v[7]);
+ u[4] = _mm_sub_epi32(v[0], v[4]);
+ u[5] = _mm_sub_epi32(v[1], v[5]);
+ u[6] = _mm_sub_epi32(v[2], v[6]);
+ u[7] = _mm_sub_epi32(v[3], v[7]);
+ u[8] = _mm_add_epi32(v[8], v[12]);
+ u[9] = _mm_add_epi32(v[9], v[13]);
+ u[10] = _mm_add_epi32(v[10], v[14]);
+ u[11] = _mm_add_epi32(v[11], v[15]);
+ u[12] = _mm_sub_epi32(v[8], v[12]);
+ u[13] = _mm_sub_epi32(v[9], v[13]);
+ u[14] = _mm_sub_epi32(v[10], v[14]);
+ u[15] = _mm_sub_epi32(v[11], v[15]);
+
+ u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+ u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+ u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+ u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+ u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
+ u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
+ u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+ u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+ u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+ u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+ u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+ u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+ v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+ v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+ v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+ v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+ v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+ v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+ v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+ v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+ v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
+ v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
+ v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
+ v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
+ v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
+ v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
+ v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
+ v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
+
+ s[4] = _mm_packs_epi32(v[0], v[1]);
+ s[5] = _mm_packs_epi32(v[2], v[3]);
+ s[6] = _mm_packs_epi32(v[4], v[5]);
+ s[7] = _mm_packs_epi32(v[6], v[7]);
+
+ s[12] = _mm_packs_epi32(v[8], v[9]);
+ s[13] = _mm_packs_epi32(v[10], v[11]);
+ s[14] = _mm_packs_epi32(v[12], v[13]);
+ s[15] = _mm_packs_epi32(v[14], v[15]);
+
+ // stage 4
+ u[0] = _mm_unpacklo_epi16(s[2], s[3]);
+ u[1] = _mm_unpackhi_epi16(s[2], s[3]);
+ u[2] = _mm_unpacklo_epi16(s[6], s[7]);
+ u[3] = _mm_unpackhi_epi16(s[6], s[7]);
+ u[4] = _mm_unpacklo_epi16(s[10], s[11]);
+ u[5] = _mm_unpackhi_epi16(s[10], s[11]);
+ u[6] = _mm_unpacklo_epi16(s[14], s[15]);
+ u[7] = _mm_unpackhi_epi16(s[14], s[15]);
+
+ v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16);
+ v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16);
+ v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
+ v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
+ v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
+ v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
+ v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
+ v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
+ v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16);
+ v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16);
+ v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16);
+ v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16);
+ v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16);
+ v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16);
+ v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16);
+ v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16);
+
+ u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+ u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+ u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+ u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+ u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+ u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+ u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+ u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+ u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
+ u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
+ u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
+ u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
+ u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
+ u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
+ u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
+ u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
+
+ v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+ v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+ v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+ v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+ v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+ v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+ v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+ v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+ v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
+ v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
+ v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
+ v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
+ v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
+ v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
+ v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
+ v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
+
+ in[0] = s[0];
+ in[1] = _mm_sub_epi16(kZero, s[8]);
+ in[2] = s[12];
+ in[3] = _mm_sub_epi16(kZero, s[4]);
+ in[4] = _mm_packs_epi32(v[4], v[5]);
+ in[5] = _mm_packs_epi32(v[12], v[13]);
+ in[6] = _mm_packs_epi32(v[8], v[9]);
+ in[7] = _mm_packs_epi32(v[0], v[1]);
+ in[8] = _mm_packs_epi32(v[2], v[3]);
+ in[9] = _mm_packs_epi32(v[10], v[11]);
+ in[10] = _mm_packs_epi32(v[14], v[15]);
+ in[11] = _mm_packs_epi32(v[6], v[7]);
+ in[12] = s[5];
+ in[13] = _mm_sub_epi16(kZero, s[13]);
+ in[14] = s[9];
+ in[15] = _mm_sub_epi16(kZero, s[1]);
+}
+
+static void fdct16_sse2(__m128i *in0, __m128i *in1) { /*
+ * One 1-D forward DCT pass over a block held as two 16-vector halves:
+ * fdct16_8col is applied to each half in turn, then the pair is handed to
+ * array_transpose_16x16 (presumably an in-place 16x16 transpose, so that a
+ * following pass works along the other dimension - confirm in its
+ * definition).
+ */
+ fdct16_8col(in0);
+ fdct16_8col(in1);
+ array_transpose_16x16(in0, in1);
+}
+
+static void fadst16_sse2(__m128i *in0, __m128i *in1) { /*
+ * One 1-D forward ADST pass over a block held as two 16-vector halves:
+ * fadst16_8col is applied to each half in turn, then the pair is handed to
+ * array_transpose_16x16 (presumably an in-place 16x16 transpose - confirm
+ * in its definition).
+ */
+ fadst16_8col(in0);
+ fadst16_8col(in1);
+ array_transpose_16x16(in0, in1);
+}
+
+#if CONFIG_EXT_TX
+static void fidtx16_sse2(__m128i *in0, __m128i *in1) { /*
+ * Identity-transform counterpart of fdct16_sse2 / fadst16_sse2 (only built
+ * when CONFIG_EXT_TX is set): idtx16_8col is applied to each 16-vector
+ * half, then the pair is handed to array_transpose_16x16.
+ */
+ idtx16_8col(in0);
+ idtx16_8col(in1);
+ array_transpose_16x16(in0, in1);
+}
+#endif // CONFIG_EXT_TX
+
+void av1_fht16x16_sse2(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) { /*
+ * Forward 16x16 hybrid transform, dispatched on tx_type. Every supported
+ * case runs the same pipeline:
+ *   load_buffer_16x16 -> first 1-D pass -> right_shift_16x16 ->
+ *   second 1-D pass -> write_buffer_16x16.
+ * Each f*16_sse2 pass transforms both 16-vector halves (in0, in1) and then
+ * calls array_transpose_16x16 on them, so the two passes are expected to
+ * act along different dimensions of the block.
+ *
+ * input, stride: source samples and stride, forwarded to
+ *                load_buffer_16x16.
+ * output:        destination for the coefficients; write_buffer_16x16 is
+ *                always passed 16 as its last argument (presumably the
+ *                output pitch - confirm in its definition).
+ * tx_type:       selects the pair of 1-D transforms and the two trailing
+ *                load_buffer_16x16 arguments (presumably flipud, fliplr,
+ *                as in the other load_buffer_* helpers of this file).
+ *
+ * An unsupported tx_type trips assert(0); when asserts are compiled out
+ * the function returns without writing anything to output.
+ */
+ __m128i in0[16], in1[16];
+
+ switch (tx_type) {
+ case DCT_DCT:
+ load_buffer_16x16(input, in0, in1, stride, 0, 0);
+ fdct16_sse2(in0, in1);
+ right_shift_16x16(in0, in1);
+ fdct16_sse2(in0, in1);
+ write_buffer_16x16(output, in0, in1, 16);
+ break;
+ case ADST_DCT:
+ load_buffer_16x16(input, in0, in1, stride, 0, 0);
+ fadst16_sse2(in0, in1);
+ right_shift_16x16(in0, in1);
+ fdct16_sse2(in0, in1);
+ write_buffer_16x16(output, in0, in1, 16);
+ break;
+ case DCT_ADST:
+ load_buffer_16x16(input, in0, in1, stride, 0, 0);
+ fdct16_sse2(in0, in1);
+ right_shift_16x16(in0, in1);
+ fadst16_sse2(in0, in1);
+ write_buffer_16x16(output, in0, in1, 16);
+ break;
+ case ADST_ADST:
+ load_buffer_16x16(input, in0, in1, stride, 0, 0);
+ fadst16_sse2(in0, in1);
+ right_shift_16x16(in0, in1);
+ fadst16_sse2(in0, in1);
+ write_buffer_16x16(output, in0, in1, 16);
+ break;
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ load_buffer_16x16(input, in0, in1, stride, 1, 0);
+ fadst16_sse2(in0, in1);
+ right_shift_16x16(in0, in1);
+ fdct16_sse2(in0, in1);
+ write_buffer_16x16(output, in0, in1, 16);
+ break;
+ case DCT_FLIPADST:
+ load_buffer_16x16(input, in0, in1, stride, 0, 1);
+ fdct16_sse2(in0, in1);
+ right_shift_16x16(in0, in1);
+ fadst16_sse2(in0, in1);
+ write_buffer_16x16(output, in0, in1, 16);
+ break;
+ case FLIPADST_FLIPADST:
+ load_buffer_16x16(input, in0, in1, stride, 1, 1);
+ fadst16_sse2(in0, in1);
+ right_shift_16x16(in0, in1);
+ fadst16_sse2(in0, in1);
+ write_buffer_16x16(output, in0, in1, 16);
+ break;
+ case ADST_FLIPADST:
+ load_buffer_16x16(input, in0, in1, stride, 0, 1);
+ fadst16_sse2(in0, in1);
+ right_shift_16x16(in0, in1);
+ fadst16_sse2(in0, in1);
+ write_buffer_16x16(output, in0, in1, 16);
+ break;
+ case FLIPADST_ADST:
+ load_buffer_16x16(input, in0, in1, stride, 1, 0);
+ fadst16_sse2(in0, in1);
+ right_shift_16x16(in0, in1);
+ fadst16_sse2(in0, in1);
+ write_buffer_16x16(output, in0, in1, 16);
+ break;
+ case IDTX:
+ load_buffer_16x16(input, in0, in1, stride, 0, 0);
+ fidtx16_sse2(in0, in1);
+ right_shift_16x16(in0, in1);
+ fidtx16_sse2(in0, in1);
+ write_buffer_16x16(output, in0, in1, 16);
+ break;
+ case V_DCT:
+ load_buffer_16x16(input, in0, in1, stride, 0, 0);
+ fdct16_sse2(in0, in1);
+ right_shift_16x16(in0, in1);
+ fidtx16_sse2(in0, in1);
+ write_buffer_16x16(output, in0, in1, 16);
+ break;
+ case H_DCT:
+ load_buffer_16x16(input, in0, in1, stride, 0, 0);
+ fidtx16_sse2(in0, in1);
+ right_shift_16x16(in0, in1);
+ fdct16_sse2(in0, in1);
+ write_buffer_16x16(output, in0, in1, 16);
+ break;
+ case V_ADST:
+ load_buffer_16x16(input, in0, in1, stride, 0, 0);
+ fadst16_sse2(in0, in1);
+ right_shift_16x16(in0, in1);
+ fidtx16_sse2(in0, in1);
+ write_buffer_16x16(output, in0, in1, 16);
+ break;
+ case H_ADST:
+ load_buffer_16x16(input, in0, in1, stride, 0, 0);
+ fidtx16_sse2(in0, in1);
+ right_shift_16x16(in0, in1);
+ fadst16_sse2(in0, in1);
+ write_buffer_16x16(output, in0, in1, 16);
+ break;
+ case V_FLIPADST:
+ load_buffer_16x16(input, in0, in1, stride, 1, 0);
+ fadst16_sse2(in0, in1);
+ right_shift_16x16(in0, in1);
+ fidtx16_sse2(in0, in1);
+ write_buffer_16x16(output, in0, in1, 16);
+ break;
+ case H_FLIPADST:
+ load_buffer_16x16(input, in0, in1, stride, 0, 1);
+ fidtx16_sse2(in0, in1);
+ right_shift_16x16(in0, in1);
+ fadst16_sse2(in0, in1);
+ write_buffer_16x16(output, in0, in1, 16);
+ break;
+#endif // CONFIG_EXT_TX
+ default: assert(0); break;
+ }
+}
+
+// Repack the eight 4-sample vectors produced by load_buffer_4x8 so that the
+// 4-point transform can be run first. For each group of four vectors
+// (in[0..3], then in[4..7]) the low 64 bits of vectors 0/2 and 1/3 are
+// merged into vectors 0 and 1, and the group is passed to transpose_4x4.
+static INLINE void prepare_4x8_row_first(__m128i *in) {
+  int half;
+  for (half = 0; half < 2; ++half) {
+    __m128i *const quad = in + 4 * half;
+    quad[0] = _mm_unpacklo_epi64(quad[0], quad[2]);
+    quad[1] = _mm_unpacklo_epi64(quad[1], quad[3]);
+    transpose_4x4(quad);
+  }
+}
+
+// Load a 4-wide, 8-row block of int16 samples. Each row is read into the
+// low four lanes of one element of 'in'; the upper lanes (4..7) carry
+// "don't care" values.
+//   flipud: read the rows bottom-to-top (row 7 first).
+//   fliplr: reverse the four samples within every row.
+// Every sample is then shifted left by 2, both 4-vector groups go through
+// scale_sqrt2_8x4, and prepare_4x8_row_first repacks the result.
+static INLINE void load_buffer_4x8(const int16_t *input, __m128i *in,
+                                   int stride, int flipud, int fliplr) {
+  const int shift = 2;
+  int row;
+
+  for (row = 0; row < 8; ++row) {
+    const int src_row = flipud ? 7 - row : row;
+    __m128i px = _mm_loadl_epi64((const __m128i *)(input + src_row * stride));
+    // 0x1b selects words 3,2,1,0: reverses the low four lanes.
+    if (fliplr) px = _mm_shufflelo_epi16(px, 0x1b);
+    in[row] = _mm_slli_epi16(px, shift);
+  }
+
+  scale_sqrt2_8x4(in);
+  scale_sqrt2_8x4(in + 4);
+  prepare_4x8_row_first(in);
+}
+
+// Store the 4x8 transform result. The transpose applied by the 8-point
+// transform is undone first; then each pair of vectors is merged (low 64
+// bits of each), halved with rounding toward zero, and written as 8
+// coefficients via store_output.
+static INLINE void write_buffer_4x8(tran_low_t *output, __m128i *res) {
+  const int shift = 1;
+  int pair;
+
+  // revert the 8x8 txfm's transpose
+  array_transpose_8x8(res, res);
+
+  for (pair = 0; pair < 4; ++pair) {
+    __m128i packed = _mm_unpacklo_epi64(res[2 * pair], res[2 * pair + 1]);
+    // All-ones (-1) in negative lanes; subtracting it adds 1 before the
+    // arithmetic shift so negative values are not rounded downwards.
+    const __m128i neg_mask = _mm_srai_epi16(packed, 15);
+    packed = _mm_sub_epi16(packed, neg_mask);
+    packed = _mm_srai_epi16(packed, shift);
+    store_output(&packed, (output + pair * 8));
+  }
+}
+
+void av1_fht4x8_sse2(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) { /*
+ * Forward hybrid transform for a block 4 samples wide and 8 rows tall
+ * (load_buffer_4x8 reads 8 rows of 4 samples each). For every supported
+ * tx_type the block is loaded, a 4-point pass is applied separately to
+ * in[0..3] and in[4..7], and then one 8-point pass is applied to all of
+ * in. write_buffer_4x8 stores the result after the switch.
+ *
+ * input, stride: source samples and the distance between rows.
+ * output:        destination for the coefficients.
+ * tx_type:       selects the 4-point / 8-point transform pair and the two
+ *                trailing load_buffer_4x8 arguments (flipud, fliplr).
+ *
+ * NOTE(review): the default branch only asserts, so with asserts compiled
+ * out an unsupported tx_type still reaches write_buffer_4x8 while `in` has
+ * never been initialized.
+ */
+ __m128i in[8];
+
+ switch (tx_type) {
+ case DCT_DCT:
+ load_buffer_4x8(input, in, stride, 0, 0);
+ fdct4_sse2(in);
+ fdct4_sse2(in + 4);
+ fdct8_sse2(in);
+ break;
+ case ADST_DCT:
+ load_buffer_4x8(input, in, stride, 0, 0);
+ fdct4_sse2(in);
+ fdct4_sse2(in + 4);
+ fadst8_sse2(in);
+ break;
+ case DCT_ADST:
+ load_buffer_4x8(input, in, stride, 0, 0);
+ fadst4_sse2(in);
+ fadst4_sse2(in + 4);
+ fdct8_sse2(in);
+ break;
+ case ADST_ADST:
+ load_buffer_4x8(input, in, stride, 0, 0);
+ fadst4_sse2(in);
+ fadst4_sse2(in + 4);
+ fadst8_sse2(in);
+ break;
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ load_buffer_4x8(input, in, stride, 1, 0);
+ fdct4_sse2(in);
+ fdct4_sse2(in + 4);
+ fadst8_sse2(in);
+ break;
+ case DCT_FLIPADST:
+ load_buffer_4x8(input, in, stride, 0, 1);
+ fadst4_sse2(in);
+ fadst4_sse2(in + 4);
+ fdct8_sse2(in);
+ break;
+ case FLIPADST_FLIPADST:
+ load_buffer_4x8(input, in, stride, 1, 1);
+ fadst4_sse2(in);
+ fadst4_sse2(in + 4);
+ fadst8_sse2(in);
+ break;
+ case ADST_FLIPADST:
+ load_buffer_4x8(input, in, stride, 0, 1);
+ fadst4_sse2(in);
+ fadst4_sse2(in + 4);
+ fadst8_sse2(in);
+ break;
+ case FLIPADST_ADST:
+ load_buffer_4x8(input, in, stride, 1, 0);
+ fadst4_sse2(in);
+ fadst4_sse2(in + 4);
+ fadst8_sse2(in);
+ break;
+ case IDTX:
+ load_buffer_4x8(input, in, stride, 0, 0);
+ fidtx4_sse2(in);
+ fidtx4_sse2(in + 4);
+ fidtx8_sse2(in);
+ break;
+ case V_DCT:
+ load_buffer_4x8(input, in, stride, 0, 0);
+ fidtx4_sse2(in);
+ fidtx4_sse2(in + 4);
+ fdct8_sse2(in);
+ break;
+ case H_DCT:
+ load_buffer_4x8(input, in, stride, 0, 0);
+ fdct4_sse2(in);
+ fdct4_sse2(in + 4);
+ fidtx8_sse2(in);
+ break;
+ case V_ADST:
+ load_buffer_4x8(input, in, stride, 0, 0);
+ fidtx4_sse2(in);
+ fidtx4_sse2(in + 4);
+ fadst8_sse2(in);
+ break;
+ case H_ADST:
+ load_buffer_4x8(input, in, stride, 0, 0);
+ fadst4_sse2(in);
+ fadst4_sse2(in + 4);
+ fidtx8_sse2(in);
+ break;
+ case V_FLIPADST:
+ load_buffer_4x8(input, in, stride, 1, 0);
+ fidtx4_sse2(in);
+ fidtx4_sse2(in + 4);
+ fadst8_sse2(in);
+ break;
+ case H_FLIPADST:
+ load_buffer_4x8(input, in, stride, 0, 1);
+ fadst4_sse2(in);
+ fadst4_sse2(in + 4);
+ fidtx8_sse2(in);
+ break;
+#endif
+ default: assert(0); break;
+ }
+ write_buffer_4x8(output, in);
+}
+
+// Load an 8-wide, 4-row block of int16 samples and arrange it as two 4x4
+// chunks so that the 4x4 transforms can be reused. The block is split
+// into a left chunk 'l' (columns 0..3) and a right chunk 'r' (columns
+// 4..7): 'l' ends up in lanes 0..3 of in[0..3] and 'r' in lanes 0..3 of
+// in[4..7]. Lanes 4..7 should be treated as "don't care" values.
+//   flipud: read the rows bottom-to-top (row 3 first).
+//   fliplr: reverse the eight samples within every row.
+// Samples are shifted left by 2 and passed through scale_sqrt2_8x4 before
+// the right chunk is split off.
+static INLINE void load_buffer_8x4(const int16_t *input, __m128i *in,
+                                   int stride, int flipud, int fliplr) {
+  const int shift = 2;
+  int row;
+
+  for (row = 0; row < 4; ++row) {
+    const int src_row = flipud ? 3 - row : row;
+    __m128i px = _mm_loadu_si128((const __m128i *)(input + src_row * stride));
+    if (fliplr) px = mm_reverse_epi16(px);
+    in[row] = _mm_slli_epi16(px, shift);
+  }
+
+  scale_sqrt2_8x4(in);
+
+  // 0xe moves the upper 64 bits (the right chunk) into the low lanes.
+  for (row = 0; row < 4; ++row) {
+    in[4 + row] = _mm_shuffle_epi32(in[row], 0xe);
+  }
+}
+
+// Store the 8x4 transform result: each of res[0..3] is halved with
+// rounding toward zero and written as one row of 8 coefficients via
+// store_output.
+static INLINE void write_buffer_8x4(tran_low_t *output, __m128i *res) {
+  const int shift = 1;
+  int row;
+
+  for (row = 0; row < 4; ++row) {
+    // All-ones (-1) in negative lanes; subtracting it adds 1 before the
+    // arithmetic shift so negative values are not rounded downwards.
+    const __m128i neg_mask = _mm_srai_epi16(res[row], 15);
+    __m128i coeffs = _mm_sub_epi16(res[row], neg_mask);
+    coeffs = _mm_srai_epi16(coeffs, shift);
+    store_output(&coeffs, (output + row * 8));
+  }
+}
+
+void av1_fht8x4_sse2(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) { /*
+ * Forward hybrid transform for a block 8 samples wide and 4 rows tall
+ * (load_buffer_8x4 reads 4 rows of 8 samples each and splits them into a
+ * left 4x4 chunk in in[0..3] and a right 4x4 chunk in in[4..7]). For every
+ * supported tx_type a 4-point pass is applied to each chunk, followed by
+ * one 8-point pass over all of in. write_buffer_8x4 then stores in[0..3]
+ * as four rows of 8 coefficients.
+ *
+ * input, stride: source samples and the distance between rows.
+ * output:        destination for the coefficients.
+ * tx_type:       selects the 4-point / 8-point transform pair and the two
+ *                trailing load_buffer_8x4 arguments (flipud, fliplr).
+ *
+ * NOTE(review): the default branch only asserts, so with asserts compiled
+ * out an unsupported tx_type still reaches write_buffer_8x4 while `in` has
+ * never been initialized.
+ */
+ __m128i in[8];
+
+ switch (tx_type) {
+ case DCT_DCT:
+ load_buffer_8x4(input, in, stride, 0, 0);
+ fdct4_sse2(in);
+ fdct4_sse2(in + 4);
+ fdct8_sse2(in);
+ break;
+ case ADST_DCT:
+ load_buffer_8x4(input, in, stride, 0, 0);
+ fadst4_sse2(in);
+ fadst4_sse2(in + 4);
+ fdct8_sse2(in);
+ break;
+ case DCT_ADST:
+ load_buffer_8x4(input, in, stride, 0, 0);
+ fdct4_sse2(in);
+ fdct4_sse2(in + 4);
+ fadst8_sse2(in);
+ break;
+ case ADST_ADST:
+ load_buffer_8x4(input, in, stride, 0, 0);
+ fadst4_sse2(in);
+ fadst4_sse2(in + 4);
+ fadst8_sse2(in);
+ break;
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ load_buffer_8x4(input, in, stride, 1, 0);
+ fadst4_sse2(in);
+ fadst4_sse2(in + 4);
+ fdct8_sse2(in);
+ break;
+ case DCT_FLIPADST:
+ load_buffer_8x4(input, in, stride, 0, 1);
+ fdct4_sse2(in);
+ fdct4_sse2(in + 4);
+ fadst8_sse2(in);
+ break;
+ case FLIPADST_FLIPADST:
+ load_buffer_8x4(input, in, stride, 1, 1);
+ fadst4_sse2(in);
+ fadst4_sse2(in + 4);
+ fadst8_sse2(in);
+ break;
+ case ADST_FLIPADST:
+ load_buffer_8x4(input, in, stride, 0, 1);
+ fadst4_sse2(in);
+ fadst4_sse2(in + 4);
+ fadst8_sse2(in);
+ break;
+ case FLIPADST_ADST:
+ load_buffer_8x4(input, in, stride, 1, 0);
+ fadst4_sse2(in);
+ fadst4_sse2(in + 4);
+ fadst8_sse2(in);
+ break;
+ case IDTX:
+ load_buffer_8x4(input, in, stride, 0, 0);
+ fidtx4_sse2(in);
+ fidtx4_sse2(in + 4);
+ fidtx8_sse2(in);
+ break;
+ case V_DCT:
+ load_buffer_8x4(input, in, stride, 0, 0);
+ fdct4_sse2(in);
+ fdct4_sse2(in + 4);
+ fidtx8_sse2(in);
+ break;
+ case H_DCT:
+ load_buffer_8x4(input, in, stride, 0, 0);
+ fidtx4_sse2(in);
+ fidtx4_sse2(in + 4);
+ fdct8_sse2(in);
+ break;
+ case V_ADST:
+ load_buffer_8x4(input, in, stride, 0, 0);
+ fadst4_sse2(in);
+ fadst4_sse2(in + 4);
+ fidtx8_sse2(in);
+ break;
+ case H_ADST:
+ load_buffer_8x4(input, in, stride, 0, 0);
+ fidtx4_sse2(in);
+ fidtx4_sse2(in + 4);
+ fadst8_sse2(in);
+ break;
+ case V_FLIPADST:
+ load_buffer_8x4(input, in, stride, 1, 0);
+ fadst4_sse2(in);
+ fadst4_sse2(in + 4);
+ fidtx8_sse2(in);
+ break;
+ case H_FLIPADST:
+ load_buffer_8x4(input, in, stride, 0, 1);
+ fidtx4_sse2(in);
+ fidtx4_sse2(in + 4);
+ fadst8_sse2(in);
+ break;
+#endif
+ default: assert(0); break;
+ }
+ write_buffer_8x4(output, in);
+}
+
+// Load an 8-wide, 16-row block as two 8x8 blocks: in[0..7] receives the
+// first block and in[8..15] the second, each followed by scale_sqrt2_8x8.
+// With flipud set the lower 8 rows are loaded first, and flipud/fliplr
+// are also forwarded to load_buffer_8x8 for each block.
+static INLINE void load_buffer_8x16(const int16_t *input, __m128i *in,
+                                    int stride, int flipud, int fliplr) {
+  const int16_t *const upper = input;
+  const int16_t *const lower = input + 8 * stride;
+  // A vertical flip exchanges which 8x8 block lands in the first half.
+  const int16_t *const first = flipud ? lower : upper;
+  const int16_t *const second = flipud ? upper : lower;
+
+  load_buffer_8x8(first, in, stride, flipud, fliplr);
+  scale_sqrt2_8x8(in);
+  load_buffer_8x8(second, in + 8, stride, flipud, fliplr);
+  scale_sqrt2_8x8(in + 8);
+}
+
+// Divide each signed 16-bit lane of *x by 2^n in place, rounding to the
+// nearest integer with ties away from zero: half of the divisor is added,
+// negative lanes are additionally decremented by one, and the sum is
+// shifted right arithmetically by n. All arithmetic wraps at 16 bits.
+static INLINE void round_power_of_two_signed(__m128i *x, int n) {
+  const __m128i val = *x;
+  const __m128i half = _mm_set1_epi16((1 << n) >> 1);
+  // -1 in negative lanes, 0 elsewhere.
+  const __m128i neg_mask = _mm_srai_epi16(val, 15);
+  __m128i biased = _mm_add_epi16(val, half);
+  biased = _mm_add_epi16(biased, neg_mask);
+  *x = _mm_srai_epi16(biased, n);
+}
+
+// Apply round_power_of_two_signed with the given bit count to each of the
+// 16 vectors in[0..15].
+static void row_8x16_rounding(__m128i *in, int bits) {
+  __m128i *const end = in + 16;
+  __m128i *vec;
+  for (vec = in; vec != end; ++vec) {
+    round_power_of_two_signed(vec, bits);
+  }
+}
+
+void av1_fht8x16_sse2(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) { /*
+ * Forward hybrid transform for a block 8 samples wide and 16 rows tall.
+ * Every supported tx_type follows the same steps:
+ *   1. load_buffer_8x16 fills `in` with two 8x8 blocks (t = in[0..7],
+ *      b = in[8..15]);
+ *   2. each 8x8 block is transposed in place with array_transpose_8x8;
+ *   3. an 8-point pass (fdct8 / fadst8 / fidtx8) runs on t and on b;
+ *   4. row_8x16_rounding(in, 2) rounds all 16 vectors down by 2 bits;
+ *   5. a 16-point pass (fdct16_8col / fadst16_8col / idtx16_8col) runs
+ *      over the whole of `in`.
+ * After the switch the two halves are written with write_buffer_8x8 at
+ * output and output + 64, both with 8 as the last argument.
+ *
+ * input, stride: source samples and stride, forwarded to load_buffer_8x16.
+ * output:        destination for the coefficients.
+ * tx_type:       selects the 8-point / 16-point transform pair and the two
+ *                trailing load_buffer_8x16 arguments (flipud, fliplr).
+ *
+ * NOTE(review): the default branch only asserts, so with asserts compiled
+ * out an unsupported tx_type still reaches write_buffer_8x8 while `in` has
+ * never been initialized.
+ */
+ __m128i in[16];
+
+ __m128i *const t = in; // Alias to top 8x8 sub block
+ __m128i *const b = in + 8; // Alias to bottom 8x8 sub block
+
+ switch (tx_type) {
+ case DCT_DCT:
+ load_buffer_8x16(input, in, stride, 0, 0);
+ array_transpose_8x8(t, t);
+ array_transpose_8x8(b, b);
+ fdct8_sse2(t);
+ fdct8_sse2(b);
+ row_8x16_rounding(in, 2);
+ fdct16_8col(in);
+ break;
+ case ADST_DCT:
+ load_buffer_8x16(input, in, stride, 0, 0);
+ array_transpose_8x8(t, t);
+ array_transpose_8x8(b, b);
+ fdct8_sse2(t);
+ fdct8_sse2(b);
+ row_8x16_rounding(in, 2);
+ fadst16_8col(in);
+ break;
+ case DCT_ADST:
+ load_buffer_8x16(input, in, stride, 0, 0);
+ array_transpose_8x8(t, t);
+ array_transpose_8x8(b, b);
+ fadst8_sse2(t);
+ fadst8_sse2(b);
+ row_8x16_rounding(in, 2);
+ fdct16_8col(in);
+ break;
+ case ADST_ADST:
+ load_buffer_8x16(input, in, stride, 0, 0);
+ array_transpose_8x8(t, t);
+ array_transpose_8x8(b, b);
+ fadst8_sse2(t);
+ fadst8_sse2(b);
+ row_8x16_rounding(in, 2);
+ fadst16_8col(in);
+ break;
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ load_buffer_8x16(input, in, stride, 1, 0);
+ array_transpose_8x8(t, t);
+ array_transpose_8x8(b, b);
+ fdct8_sse2(t);
+ fdct8_sse2(b);
+ row_8x16_rounding(in, 2);
+ fadst16_8col(in);
+ break;
+ case DCT_FLIPADST:
+ load_buffer_8x16(input, in, stride, 0, 1);
+ array_transpose_8x8(t, t);
+ array_transpose_8x8(b, b);
+ fadst8_sse2(t);
+ fadst8_sse2(b);
+ row_8x16_rounding(in, 2);
+ fdct16_8col(in);
+ break;
+ case FLIPADST_FLIPADST:
+ load_buffer_8x16(input, in, stride, 1, 1);
+ array_transpose_8x8(t, t);
+ array_transpose_8x8(b, b);
+ fadst8_sse2(t);
+ fadst8_sse2(b);
+ row_8x16_rounding(in, 2);
+ fadst16_8col(in);
+ break;
+ case ADST_FLIPADST:
+ load_buffer_8x16(input, in, stride, 0, 1);
+ array_transpose_8x8(t, t);
+ array_transpose_8x8(b, b);
+ fadst8_sse2(t);
+ fadst8_sse2(b);
+ row_8x16_rounding(in, 2);
+ fadst16_8col(in);
+ break;
+ case FLIPADST_ADST:
+ load_buffer_8x16(input, in, stride, 1, 0);
+ array_transpose_8x8(t, t);
+ array_transpose_8x8(b, b);
+ fadst8_sse2(t);
+ fadst8_sse2(b);
+ row_8x16_rounding(in, 2);
+ fadst16_8col(in);
+ break;
+ case IDTX:
+ load_buffer_8x16(input, in, stride, 0, 0);
+ array_transpose_8x8(t, t);
+ array_transpose_8x8(b, b);
+ fidtx8_sse2(t);
+ fidtx8_sse2(b);
+ row_8x16_rounding(in, 2);
+ idtx16_8col(in);
+ break;
+ case V_DCT:
+ load_buffer_8x16(input, in, stride, 0, 0);
+ array_transpose_8x8(t, t);
+ array_transpose_8x8(b, b);
+ fidtx8_sse2(t);
+ fidtx8_sse2(b);
+ row_8x16_rounding(in, 2);
+ fdct16_8col(in);
+ break;
+ case H_DCT:
+ load_buffer_8x16(input, in, stride, 0, 0);
+ array_transpose_8x8(t, t);
+ array_transpose_8x8(b, b);
+ fdct8_sse2(t);
+ fdct8_sse2(b);
+ row_8x16_rounding(in, 2);
+ idtx16_8col(in);
+ break;
+ case V_ADST:
+ load_buffer_8x16(input, in, stride, 0, 0);
+ array_transpose_8x8(t, t);
+ array_transpose_8x8(b, b);
+ fidtx8_sse2(t);
+ fidtx8_sse2(b);
+ row_8x16_rounding(in, 2);
+ fadst16_8col(in);
+ break;
+ case H_ADST:
+ load_buffer_8x16(input, in, stride, 0, 0);
+ array_transpose_8x8(t, t);
+ array_transpose_8x8(b, b);
+ fadst8_sse2(t);
+ fadst8_sse2(b);
+ row_8x16_rounding(in, 2);
+ idtx16_8col(in);
+ break;
+ case V_FLIPADST:
+ load_buffer_8x16(input, in, stride, 1, 0);
+ array_transpose_8x8(t, t);
+ array_transpose_8x8(b, b);
+ fidtx8_sse2(t);
+ fidtx8_sse2(b);
+ row_8x16_rounding(in, 2);
+ fadst16_8col(in);
+ break;
+ case H_FLIPADST:
+ load_buffer_8x16(input, in, stride, 0, 1);
+ array_transpose_8x8(t, t);
+ array_transpose_8x8(b, b);
+ fadst8_sse2(t);
+ fadst8_sse2(b);
+ row_8x16_rounding(in, 2);
+ idtx16_8col(in);
+ break;
+#endif
+ default: assert(0); break;
+ }
+ write_buffer_8x8(output, t, 8);
+ write_buffer_8x8(output + 64, b, 8);
+}
+
+// Load a 16-wide, 8-row block as two 8x8 blocks: in[0..7] receives the
+// first block and in[8..15] the second, each followed by scale_sqrt2_8x8.
+// With fliplr set the right-hand 8 columns are loaded first, and
+// flipud/fliplr are also forwarded to load_buffer_8x8 for each block.
+static INLINE void load_buffer_16x8(const int16_t *input, __m128i *in,
+                                    int stride, int flipud, int fliplr) {
+  const int16_t *const left = input;
+  const int16_t *const right = input + 8;
+  // A horizontal flip exchanges which 8x8 block lands in the first half.
+  const int16_t *const first = fliplr ? right : left;
+  const int16_t *const second = fliplr ? left : right;
+
+  // First half of 'in', then second half.
+  load_buffer_8x8(first, in, stride, flipud, fliplr);
+  scale_sqrt2_8x8(in);
+  load_buffer_8x8(second, in + 8, stride, flipud, fliplr);
+  scale_sqrt2_8x8(in + 8);
+}
+
+#define col_16x8_rounding row_8x16_rounding /* alias: the 16x8 path rounds its 16 vectors with the same helper as the 8x16 path */
+
+void av1_fht16x8_sse2(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) { /*
+ * Forward hybrid transform for a block 16 samples wide and 8 rows tall.
+ * Every supported tx_type follows the same steps:
+ *   1. load_buffer_16x8 fills `in` with two 8x8 blocks (l = in[0..7],
+ *      r = in[8..15]);
+ *   2. an 8-point pass (fdct8 / fadst8 / fidtx8) runs on l and on r;
+ *   3. col_16x8_rounding(in, 2) rounds all 16 vectors down by 2 bits;
+ *   4. a 16-point pass (fdct16_8col / fadst16_8col / idtx16_8col) runs
+ *      over the whole of `in`.
+ * After the switch both halves are transposed in place and written with
+ * write_buffer_8x8 at output and output + 8, both with 16 as the last
+ * argument.
+ *
+ * input, stride: source samples and stride, forwarded to load_buffer_16x8.
+ * output:        destination for the coefficients.
+ * tx_type:       selects the 8-point / 16-point transform pair and the two
+ *                trailing load_buffer_16x8 arguments (flipud, fliplr).
+ *
+ * NOTE(review): the default branch only asserts, so with asserts compiled
+ * out an unsupported tx_type still reaches the transpose and write steps
+ * while `in` has never been initialized.
+ */
+ __m128i in[16];
+
+ __m128i *const l = in; // Alias to left 8x8 sub block
+ __m128i *const r = in + 8; // Alias to right 8x8 sub block, which we store
+ // in the second half of the array
+
+ switch (tx_type) {
+ case DCT_DCT:
+ load_buffer_16x8(input, in, stride, 0, 0);
+ fdct8_sse2(l);
+ fdct8_sse2(r);
+ col_16x8_rounding(in, 2);
+ fdct16_8col(in);
+ break;
+ case ADST_DCT:
+ load_buffer_16x8(input, in, stride, 0, 0);
+ fadst8_sse2(l);
+ fadst8_sse2(r);
+ col_16x8_rounding(in, 2);
+ fdct16_8col(in);
+ break;
+ case DCT_ADST:
+ load_buffer_16x8(input, in, stride, 0, 0);
+ fdct8_sse2(l);
+ fdct8_sse2(r);
+ col_16x8_rounding(in, 2);
+ fadst16_8col(in);
+ break;
+ case ADST_ADST:
+ load_buffer_16x8(input, in, stride, 0, 0);
+ fadst8_sse2(l);
+ fadst8_sse2(r);
+ col_16x8_rounding(in, 2);
+ fadst16_8col(in);
+ break;
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ load_buffer_16x8(input, in, stride, 1, 0);
+ fadst8_sse2(l);
+ fadst8_sse2(r);
+ col_16x8_rounding(in, 2);
+ fdct16_8col(in);
+ break;
+ case DCT_FLIPADST:
+ load_buffer_16x8(input, in, stride, 0, 1);
+ fdct8_sse2(l);
+ fdct8_sse2(r);
+ col_16x8_rounding(in, 2);
+ fadst16_8col(in);
+ break;
+ case FLIPADST_FLIPADST:
+ load_buffer_16x8(input, in, stride, 1, 1);
+ fadst8_sse2(l);
+ fadst8_sse2(r);
+ col_16x8_rounding(in, 2);
+ fadst16_8col(in);
+ break;
+ case ADST_FLIPADST:
+ load_buffer_16x8(input, in, stride, 0, 1);
+ fadst8_sse2(l);
+ fadst8_sse2(r);
+ col_16x8_rounding(in, 2);
+ fadst16_8col(in);
+ break;
+ case FLIPADST_ADST:
+ load_buffer_16x8(input, in, stride, 1, 0);
+ fadst8_sse2(l);
+ fadst8_sse2(r);
+ col_16x8_rounding(in, 2);
+ fadst16_8col(in);
+ break;
+ case IDTX:
+ load_buffer_16x8(input, in, stride, 0, 0);
+ fidtx8_sse2(l);
+ fidtx8_sse2(r);
+ col_16x8_rounding(in, 2);
+ idtx16_8col(in);
+ break;
+ case V_DCT:
+ load_buffer_16x8(input, in, stride, 0, 0);
+ fdct8_sse2(l);
+ fdct8_sse2(r);
+ col_16x8_rounding(in, 2);
+ idtx16_8col(in);
+ break;
+ case H_DCT:
+ load_buffer_16x8(input, in, stride, 0, 0);
+ fidtx8_sse2(l);
+ fidtx8_sse2(r);
+ col_16x8_rounding(in, 2);
+ fdct16_8col(in);
+ break;
+ case V_ADST:
+ load_buffer_16x8(input, in, stride, 0, 0);
+ fadst8_sse2(l);
+ fadst8_sse2(r);
+ col_16x8_rounding(in, 2);
+ idtx16_8col(in);
+ break;
+ case H_ADST:
+ load_buffer_16x8(input, in, stride, 0, 0);
+ fidtx8_sse2(l);
+ fidtx8_sse2(r);
+ col_16x8_rounding(in, 2);
+ fadst16_8col(in);
+ break;
+ case V_FLIPADST:
+ load_buffer_16x8(input, in, stride, 1, 0);
+ fadst8_sse2(l);
+ fadst8_sse2(r);
+ col_16x8_rounding(in, 2);
+ idtx16_8col(in);
+ break;
+ case H_FLIPADST:
+ load_buffer_16x8(input, in, stride, 0, 1);
+ fidtx8_sse2(l);
+ fidtx8_sse2(r);
+ col_16x8_rounding(in, 2);
+ fadst16_8col(in);
+ break;
+#endif
+ default: assert(0); break;
+ }
+ array_transpose_8x8(l, l);
+ array_transpose_8x8(r, r);
+ write_buffer_8x8(output, l, 16);
+ write_buffer_8x8(output + 8, r, 16);
+}
+
+// Note: The 16-column 32-element transforms expect their input to be
+// split up into a 2x2 grid of 8x16 blocks
+// 32-point DCT over 16 columns, followed by a transpose of each 16x16 half.
+//   tl/bl: top and bottom 8x16 blocks of the left 8 columns.
+//   tr/br: top and bottom 8x16 blocks of the right 8 columns.
+// All four buffers are transformed in place.
+static INLINE void fdct32_16col(__m128i *tl, __m128i *tr, __m128i *bl,
+                                __m128i *br) {
+  // Transform pass: one fdct32_8col call per 8-column strip.
+  fdct32_8col(tl, bl);
+  fdct32_8col(tr, br);
+
+  // Transpose pass: top 16x16 first, then the bottom 16x16.
+  array_transpose_16x16(tl, tr);
+  array_transpose_16x16(bl, br);
+}
+
+#if CONFIG_EXT_TX
+// Identity "transform" for the 32-point direction: multiplies every 16-bit
+// lane of the four 16-vector buffers by 4 (left shift by 2), then transposes
+// the top (tl, tr) and bottom (bl, br) 16x16 halves in place.
+static INLINE void fidtx32_16col(__m128i *tl, __m128i *tr, __m128i *bl,
+                                 __m128i *br) {
+  __m128i *const quadrant[4] = { tl, tr, bl, br };
+  int q;
+
+  for (q = 0; q < 4; ++q) {
+    __m128i *const vec = quadrant[q];
+    int k;
+    for (k = 0; k < 16; ++k) vec[k] = _mm_slli_epi16(vec[k], 2);
+  }
+
+  array_transpose_16x16(tl, tr);
+  array_transpose_16x16(bl, br);
+}
+#endif
+
+// Loads a 16-wide, 32-tall block of int16 samples into four 16-vector
+// buffers: intl/intr receive the left/right 8 columns of rows 0..15 and
+// inbl/inbr those of rows 16..31.  Every sample is multiplied by 4 on load.
+//   flipud: non-zero reads the rows bottom-up (vertical mirror).
+//   fliplr: non-zero mirrors every row horizontally.
+// scale_sqrt2_8x16 is applied to each buffer before returning.
+// Rows are fetched with _mm_load_si128, so every row start must be 16-byte
+// aligned.
+static INLINE void load_buffer_16x32(const int16_t *input, __m128i *intl,
+                                     __m128i *intr, __m128i *inbl,
+                                     __m128i *inbr, int stride, int flipud,
+                                     int fliplr) {
+  int row;
+
+  // Start at the last row and walk backwards for a vertical flip.
+  if (flipud) {
+    input += 31 * stride;
+    stride = -stride;
+  }
+
+  for (row = 0; row < 16; ++row) {
+    // One __m128i covers 8 samples, so [0] / [1] are columns 0..7 / 8..15.
+    const __m128i *const top = (const __m128i *)(input + row * stride);
+    const __m128i *const bottom =
+        (const __m128i *)(input + (row + 16) * stride);
+
+    intl[row] = _mm_slli_epi16(_mm_load_si128(top + 0), 2);
+    intr[row] = _mm_slli_epi16(_mm_load_si128(top + 1), 2);
+    inbl[row] = _mm_slli_epi16(_mm_load_si128(bottom + 0), 2);
+    inbr[row] = _mm_slli_epi16(_mm_load_si128(bottom + 1), 2);
+  }
+
+  if (fliplr) {
+    // Exchange the left and right halves and reverse the lanes within each.
+    for (row = 0; row < 16; ++row) {
+      const __m128i top_left = intl[row];
+      const __m128i bottom_left = inbl[row];
+      intl[row] = mm_reverse_epi16(intr[row]);
+      intr[row] = mm_reverse_epi16(top_left);
+      inbl[row] = mm_reverse_epi16(inbr[row]);
+      inbr[row] = mm_reverse_epi16(bottom_left);
+    }
+  }
+
+  scale_sqrt2_8x16(intl);
+  scale_sqrt2_8x16(intr);
+  scale_sqrt2_8x16(inbl);
+  scale_sqrt2_8x16(inbr);
+}
+
+// Stores the four 16-vector result buffers as a 16-wide, 32-tall coefficient
+// block (row pitch 16): restl/restr fill rows 0..15, resbl/resbr rows 16..31.
+static INLINE void write_buffer_16x32(tran_low_t *output, __m128i *restl,
+                                      __m128i *restr, __m128i *resbl,
+                                      __m128i *resbr) {
+  tran_low_t *top = output;
+  tran_low_t *bottom = output + 16 * 16;
+  int row;
+
+  for (row = 0; row < 16; ++row, top += 16, bottom += 16) {
+    store_output(&restl[row], top);
+    store_output(&restr[row], top + 8);
+    store_output(&resbl[row], bottom);
+    store_output(&resbr[row], bottom + 8);
+  }
+}
+
+// Rounding right shift of eight vectors of signed 16-bit lanes, in place.
+// Each lane becomes (x + (1 << bit) / 2 + (x < 0 ? -1 : 0)) >> bit.
+static INLINE void round_signed_8x8(__m128i *in, const int bit) {
+  const __m128i half = _mm_set1_epi16((1 << bit) >> 1);
+  int k;
+
+  for (k = 0; k < 8; ++k) {
+    // Arithmetic shift by 15 yields -1 in negative lanes and 0 elsewhere.
+    const __m128i neg = _mm_srai_epi16(in[k], 15);
+    const __m128i biased = _mm_add_epi16(_mm_add_epi16(in[k], half), neg);
+    in[k] = _mm_srai_epi16(biased, bit);
+  }
+}
+
+// Applies round_signed_8x8 with a 4-bit shift to both 16-vector buffers,
+// eight vectors at a time.
+static INLINE void round_signed_16x16(__m128i *in0, __m128i *in1) {
+  enum { kShift = 4 };
+  __m128i *const half[2] = { in0, in1 };
+  int h;
+
+  for (h = 0; h < 2; ++h) {
+    round_signed_8x8(half[h], kShift);
+    round_signed_8x8(half[h] + 8, kShift);
+  }
+}
+
+// Note:
+// suffix "t" indicates the transpose operation comes first
+// Transpose-first 16-point DCT: transposes the 16x16 block held in in0/in1,
+// then runs fdct16_8col on each 8-column half.  Operates in place.
+static void fdct16t_sse2(__m128i *in0, __m128i *in1) {
+  array_transpose_16x16(in0, in1);
+
+  fdct16_8col(in0);
+  fdct16_8col(in1);
+}
+
+// Transpose-first 16-point ADST: transposes the 16x16 block held in in0/in1,
+// then runs fadst16_8col on each 8-column half.  Operates in place.
+static void fadst16t_sse2(__m128i *in0, __m128i *in1) {
+  array_transpose_16x16(in0, in1);
+
+  fadst16_8col(in0);
+  fadst16_8col(in1);
+}
+
+// Transpose-first counterpart of fdct32_16col: transposes the top (tl, tr)
+// and bottom (bl, br) 16x16 halves, then runs fdct32_8col on the left
+// (tl, bl) and right (tr, br) 8-column strips.  Operates in place.
+static INLINE void fdct32t_16col(__m128i *tl, __m128i *tr, __m128i *bl,
+                                 __m128i *br) {
+  // Transpose pass.
+  array_transpose_16x16(tl, tr);
+  array_transpose_16x16(bl, br);
+
+  // Transform pass.
+  fdct32_8col(tl, bl);
+  fdct32_8col(tr, br);
+}
+
+// Selects which 16-point DCT helper fhalfright32_16col uses for the top half
+// of its output:
+//   transpose    -> fdct16t_sse2 (the variant that transposes first)
+//   no_transpose -> fdct16_sse2
+typedef enum transpose_indicator_ {
+ transpose,
+ no_transpose,
+} transpose_indicator;
+
+// "Half-right" 32-point transform over 16 columns, in place.
+//   Bottom half of the output: the top half of the input multiplied by 4
+//   (left shift by 2) and transposed.
+//   Top half of the output: the bottom half of the input, passed through
+//   scale_sqrt2_8x16 and then a 16-point DCT (fdct16t_sse2 when
+//   t == transpose, fdct16_sse2 otherwise).
+static INLINE void fhalfright32_16col(__m128i *tl, __m128i *tr, __m128i *bl,
+                                      __m128i *br, transpose_indicator t) {
+  int row;
+
+  // Exchange the halves row by row: the old bottom moves up unchanged and
+  // the old top moves down, scaled by 4.
+  for (row = 0; row < 16; ++row) {
+    const __m128i old_bl = bl[row];
+    const __m128i old_br = br[row];
+    bl[row] = _mm_slli_epi16(tl[row], 2);
+    br[row] = _mm_slli_epi16(tr[row], 2);
+    tl[row] = old_bl;
+    tr[row] = old_br;
+  }
+
+  // Finish the bottom half of the output.
+  array_transpose_16x16(bl, br);
+
+  // Finish the top half of the output.
+  scale_sqrt2_8x16(tl);
+  scale_sqrt2_8x16(tr);
+  if (t == transpose) {
+    fdct16t_sse2(tl, tr);
+  } else {
+    fdct16_sse2(tl, tr);
+  }
+}
+
+// Note on data layout, for both this and the 32x16 transforms:
+// So that we can reuse the 16-element transforms easily,
+// we want to split the input into 8x16 blocks.
+// For 16x32, this means the input is a 2x2 grid of such blocks.
+// For 32x16, it means the input is a 4x1 grid.
+// Forward hybrid transform of a 16-wide, 32-tall residual block.
+//
+// tx_type is decoded into load-time flip flags plus one kernel per pass:
+//   pass 1 (each 16x16 half): fdct16t_sse2 / fadst16t_sse2 / fidtx16_sse2,
+//                             followed by round_signed_16x16;
+//   pass 2 (all four buffers): fdct32t_16col /
+//                              fhalfright32_16col(..., transpose) /
+//                              fidtx32_16col.
+// An unsupported tx_type trips assert(0).  NOTE: with asserts compiled out
+// nothing is loaded or transformed in that case, yet the (uninitialised)
+// buffers are still written to `output`.
+void av1_fht16x32_sse2(const int16_t *input, tran_low_t *output, int stride,
+                       int tx_type) {
+  enum { STAGE_NONE, STAGE_DCT, STAGE_ADST, STAGE_IDTX };
+  __m128i intl[16], intr[16], inbl[16], inbr[16];
+  int pass1 = STAGE_NONE;  // 16-point kernel
+  int pass2 = STAGE_NONE;  // 32-point kernel (ADST means half-right)
+  int flipud = 0;
+  int fliplr = 0;
+
+  switch (tx_type) {
+    case DCT_DCT: pass1 = STAGE_DCT; pass2 = STAGE_DCT; break;
+    case ADST_DCT: pass1 = STAGE_DCT; pass2 = STAGE_ADST; break;
+    case DCT_ADST: pass1 = STAGE_ADST; pass2 = STAGE_DCT; break;
+    case ADST_ADST: pass1 = STAGE_ADST; pass2 = STAGE_ADST; break;
+#if CONFIG_EXT_TX
+    case FLIPADST_DCT:
+      flipud = 1;
+      pass1 = STAGE_DCT;
+      pass2 = STAGE_ADST;
+      break;
+    case DCT_FLIPADST:
+      fliplr = 1;
+      pass1 = STAGE_ADST;
+      pass2 = STAGE_DCT;
+      break;
+    case FLIPADST_FLIPADST:
+      flipud = 1;
+      fliplr = 1;
+      pass1 = STAGE_ADST;
+      pass2 = STAGE_ADST;
+      break;
+    case ADST_FLIPADST:
+      fliplr = 1;
+      pass1 = STAGE_ADST;
+      pass2 = STAGE_ADST;
+      break;
+    case FLIPADST_ADST:
+      flipud = 1;
+      pass1 = STAGE_ADST;
+      pass2 = STAGE_ADST;
+      break;
+    case IDTX: pass1 = STAGE_IDTX; pass2 = STAGE_IDTX; break;
+    case V_DCT: pass1 = STAGE_IDTX; pass2 = STAGE_DCT; break;
+    case H_DCT: pass1 = STAGE_DCT; pass2 = STAGE_IDTX; break;
+    case V_ADST: pass1 = STAGE_IDTX; pass2 = STAGE_ADST; break;
+    case H_ADST: pass1 = STAGE_ADST; pass2 = STAGE_IDTX; break;
+    case V_FLIPADST:
+      flipud = 1;
+      pass1 = STAGE_IDTX;
+      pass2 = STAGE_ADST;
+      break;
+    case H_FLIPADST:
+      fliplr = 1;
+      pass1 = STAGE_ADST;
+      pass2 = STAGE_IDTX;
+      break;
+#endif
+    default: assert(0); break;
+  }
+
+  if (pass1 != STAGE_NONE) {
+    load_buffer_16x32(input, intl, intr, inbl, inbr, stride, flipud, fliplr);
+
+    switch (pass1) {
+      case STAGE_DCT:
+        fdct16t_sse2(intl, intr);
+        fdct16t_sse2(inbl, inbr);
+        break;
+      case STAGE_ADST:
+        fadst16t_sse2(intl, intr);
+        fadst16t_sse2(inbl, inbr);
+        break;
+#if CONFIG_EXT_TX
+      case STAGE_IDTX:
+        fidtx16_sse2(intl, intr);
+        fidtx16_sse2(inbl, inbr);
+        break;
+#endif
+      default: break;
+    }
+
+    round_signed_16x16(intl, intr);
+    round_signed_16x16(inbl, inbr);
+
+    switch (pass2) {
+      case STAGE_DCT: fdct32t_16col(intl, intr, inbl, inbr); break;
+      case STAGE_ADST:
+        fhalfright32_16col(intl, intr, inbl, inbr, transpose);
+        break;
+#if CONFIG_EXT_TX
+      case STAGE_IDTX: fidtx32_16col(intl, intr, inbl, inbr); break;
+#endif
+      default: break;
+    }
+  }
+
+  write_buffer_16x32(output, intl, intr, inbl, inbr);
+}
+
+// Loads a 32-wide, 16-tall block of int16 samples into four 16-vector
+// buffers; in0..in3 receive columns 0..7, 8..15, 16..23 and 24..31 of each
+// row.  Every sample is multiplied by 4 on load.
+//   flipud: non-zero reads the rows bottom-up (vertical mirror).
+//   fliplr: non-zero mirrors every row horizontally.
+// scale_sqrt2_8x16 is applied to each buffer before returning.
+// Rows are fetched with _mm_load_si128, so every row start must be 16-byte
+// aligned.
+static INLINE void load_buffer_32x16(const int16_t *input, __m128i *in0,
+                                     __m128i *in1, __m128i *in2, __m128i *in3,
+                                     int stride, int flipud, int fliplr) {
+  int row;
+
+  // Start at the last row and walk backwards for a vertical flip.
+  if (flipud) {
+    input += 15 * stride;
+    stride = -stride;
+  }
+
+  for (row = 0; row < 16; ++row) {
+    // One __m128i covers 8 samples, so src[k] is columns 8k .. 8k+7.
+    const __m128i *const src = (const __m128i *)(input + row * stride);
+    in0[row] = _mm_slli_epi16(_mm_load_si128(src + 0), 2);
+    in1[row] = _mm_slli_epi16(_mm_load_si128(src + 1), 2);
+    in2[row] = _mm_slli_epi16(_mm_load_si128(src + 2), 2);
+    in3[row] = _mm_slli_epi16(_mm_load_si128(src + 3), 2);
+  }
+
+  if (fliplr) {
+    // Reverse the order of the four strips and of the lanes inside each.
+    for (row = 0; row < 16; ++row) {
+      const __m128i outer = in0[row];
+      const __m128i inner = in1[row];
+      in0[row] = mm_reverse_epi16(in3[row]);
+      in1[row] = mm_reverse_epi16(in2[row]);
+      in2[row] = mm_reverse_epi16(inner);
+      in3[row] = mm_reverse_epi16(outer);
+    }
+  }
+
+  scale_sqrt2_8x16(in0);
+  scale_sqrt2_8x16(in1);
+  scale_sqrt2_8x16(in2);
+  scale_sqrt2_8x16(in3);
+}
+
+// Stores the four 16-vector result buffers as a 32-wide, 16-tall coefficient
+// block (row pitch 32); res0..res3 supply columns 0..7, 8..15, 16..23 and
+// 24..31 of each row.
+static INLINE void write_buffer_32x16(tran_low_t *output, __m128i *res0,
+                                      __m128i *res1, __m128i *res2,
+                                      __m128i *res3) {
+  tran_low_t *dst = output;
+  int row;
+
+  for (row = 0; row < 16; ++row, dst += 32) {
+    store_output(&res0[row], dst);
+    store_output(&res1[row], dst + 8);
+    store_output(&res2[row], dst + 16);
+    store_output(&res3[row], dst + 24);
+  }
+}
+
+// Forward hybrid transform of a 32-wide, 16-tall residual block.
+//
+// tx_type is decoded into load-time flip flags plus one kernel per pass:
+//   pass 1 (each 16x16 half): fdct16_sse2 / fadst16_sse2 / fidtx16_sse2,
+//                             followed by round_signed_16x16;
+//   pass 2 (all four buffers): fdct32_16col /
+//                              fhalfright32_16col(..., no_transpose) /
+//                              fidtx32_16col.
+// The block is loaded (and sqrt2-scaled) exactly once, with the flip flags
+// already applied.
+// An unsupported tx_type trips assert(0); with asserts compiled out the
+// unflipped, untransformed input is written to `output`.
+void av1_fht32x16_sse2(const int16_t *input, tran_low_t *output, int stride,
+                       int tx_type) {
+  enum { STAGE_NONE, STAGE_DCT, STAGE_ADST, STAGE_IDTX };
+  __m128i in0[16], in1[16], in2[16], in3[16];
+  int pass1 = STAGE_NONE;  // 16-point kernel
+  int pass2 = STAGE_NONE;  // 32-point kernel (ADST means half-right)
+  int flipud = 0;
+  int fliplr = 0;
+
+  switch (tx_type) {
+    case DCT_DCT: pass1 = STAGE_DCT; pass2 = STAGE_DCT; break;
+    case ADST_DCT: pass1 = STAGE_ADST; pass2 = STAGE_DCT; break;
+    case DCT_ADST: pass1 = STAGE_DCT; pass2 = STAGE_ADST; break;
+    case ADST_ADST: pass1 = STAGE_ADST; pass2 = STAGE_ADST; break;
+#if CONFIG_EXT_TX
+    case FLIPADST_DCT:
+      flipud = 1;
+      pass1 = STAGE_ADST;
+      pass2 = STAGE_DCT;
+      break;
+    case DCT_FLIPADST:
+      fliplr = 1;
+      pass1 = STAGE_DCT;
+      pass2 = STAGE_ADST;
+      break;
+    case FLIPADST_FLIPADST:
+      flipud = 1;
+      fliplr = 1;
+      pass1 = STAGE_ADST;
+      pass2 = STAGE_ADST;
+      break;
+    case ADST_FLIPADST:
+      fliplr = 1;
+      pass1 = STAGE_ADST;
+      pass2 = STAGE_ADST;
+      break;
+    case FLIPADST_ADST:
+      flipud = 1;
+      pass1 = STAGE_ADST;
+      pass2 = STAGE_ADST;
+      break;
+    case IDTX: pass1 = STAGE_IDTX; pass2 = STAGE_IDTX; break;
+    case V_DCT: pass1 = STAGE_DCT; pass2 = STAGE_IDTX; break;
+    case H_DCT: pass1 = STAGE_IDTX; pass2 = STAGE_DCT; break;
+    case V_ADST: pass1 = STAGE_ADST; pass2 = STAGE_IDTX; break;
+    case H_ADST: pass1 = STAGE_IDTX; pass2 = STAGE_ADST; break;
+    case V_FLIPADST:
+      flipud = 1;
+      pass1 = STAGE_ADST;
+      pass2 = STAGE_IDTX;
+      break;
+    case H_FLIPADST:
+      fliplr = 1;
+      pass1 = STAGE_IDTX;
+      pass2 = STAGE_ADST;
+      break;
+#endif
+    default: assert(0); break;
+  }
+
+  // Single load; both flags stay 0 for an unsupported tx_type.
+  load_buffer_32x16(input, in0, in1, in2, in3, stride, flipud, fliplr);
+
+  if (pass1 != STAGE_NONE) {
+    switch (pass1) {
+      case STAGE_DCT:
+        fdct16_sse2(in0, in1);
+        fdct16_sse2(in2, in3);
+        break;
+      case STAGE_ADST:
+        fadst16_sse2(in0, in1);
+        fadst16_sse2(in2, in3);
+        break;
+#if CONFIG_EXT_TX
+      case STAGE_IDTX:
+        fidtx16_sse2(in0, in1);
+        fidtx16_sse2(in2, in3);
+        break;
+#endif
+      default: break;
+    }
+
+    round_signed_16x16(in0, in1);
+    round_signed_16x16(in2, in3);
+
+    switch (pass2) {
+      case STAGE_DCT: fdct32_16col(in0, in1, in2, in3); break;
+      case STAGE_ADST:
+        fhalfright32_16col(in0, in1, in2, in3, no_transpose);
+        break;
+#if CONFIG_EXT_TX
+      case STAGE_IDTX: fidtx32_16col(in0, in1, in2, in3); break;
+#endif
+      default: break;
+    }
+  }
+
+  write_buffer_32x16(output, in0, in1, in2, in3);
+}
+
+// Note:
+// 32x32 hybrid forward transform.
+// The input is held as a 4x2 grid of 8x16 blocks; each block is represented
+// by an __m128i in[16] array.
+// Loads a 32x32 block of int16 samples into four 32-vector buffers; in0..in3
+// receive columns 0..7, 8..15, 16..23 and 24..31 of each row.  Every sample
+// is multiplied by 4 on load.
+//   flipud: non-zero reads the rows bottom-up (vertical mirror).
+//   fliplr: non-zero mirrors every row horizontally.
+// Rows are fetched with _mm_load_si128, so every row start must be 16-byte
+// aligned.
+static INLINE void load_buffer_32x32(const int16_t *input,
+                                     __m128i *in0 /*in0[32]*/,
+                                     __m128i *in1 /*in1[32]*/,
+                                     __m128i *in2 /*in2[32]*/,
+                                     __m128i *in3 /*in3[32]*/, int stride,
+                                     int flipud, int fliplr) {
+  int row;
+
+  // Start at the last row and walk backwards for a vertical flip.
+  if (flipud) {
+    input += 31 * stride;
+    stride = -stride;
+  }
+
+  for (row = 0; row < 32; ++row) {
+    // One __m128i covers 8 samples, so src[k] is columns 8k .. 8k+7.
+    const __m128i *const src = (const __m128i *)(input + row * stride);
+    in0[row] = _mm_slli_epi16(_mm_load_si128(src + 0), 2);
+    in1[row] = _mm_slli_epi16(_mm_load_si128(src + 1), 2);
+    in2[row] = _mm_slli_epi16(_mm_load_si128(src + 2), 2);
+    in3[row] = _mm_slli_epi16(_mm_load_si128(src + 3), 2);
+  }
+
+  if (fliplr) {
+    // Reverse the order of the four strips and of the lanes inside each.
+    for (row = 0; row < 32; ++row) {
+      const __m128i outer = in0[row];
+      const __m128i inner = in1[row];
+      in0[row] = mm_reverse_epi16(in3[row]);
+      in1[row] = mm_reverse_epi16(in2[row]);
+      in2[row] = mm_reverse_epi16(inner);
+      in3[row] = mm_reverse_epi16(outer);
+    }
+  }
+}
+
+// Exchanges two 16x16 blocks, each stored as a left/right pair of 16-vector
+// buffers: (b0l, b0r) <-> (b1l, b1r).
+static INLINE void swap_16x16(__m128i *b0l /*b0l[16]*/,
+                              __m128i *b0r /*b0r[16]*/,
+                              __m128i *b1l /*b1l[16]*/,
+                              __m128i *b1r /*b1r[16]*/) {
+  int row;
+
+  for (row = 0; row < 16; ++row) {
+    const __m128i left = b0l[row];
+    const __m128i right = b0r[row];
+    b0l[row] = b1l[row];
+    b0r[row] = b1r[row];
+    b1l[row] = left;
+    b1r[row] = right;
+  }
+}
+
+// One 32-point DCT pass over a 32x32 block held in four 32-vector buffers.
+// Runs fdct32_8col on each buffer (first 16 vectors / last 16 vectors), then
+// transposes the four 16x16 quadrants and exchanges the two off-diagonal
+// ones.
+static INLINE void fdct32(__m128i *in0, __m128i *in1, __m128i *in2,
+                          __m128i *in3) {
+  __m128i *const strip[4] = { in0, in1, in2, in3 };
+  int s;
+
+  for (s = 0; s < 4; ++s) fdct32_8col(strip[s], strip[s] + 16);
+
+  // Transpose each 16x16 quadrant in place.
+  array_transpose_16x16(in0, in1);
+  array_transpose_16x16(in0 + 16, in1 + 16);
+  array_transpose_16x16(in2, in3);
+  array_transpose_16x16(in2 + 16, in3 + 16);
+
+  // Exchange the second half of in0/in1 with the first half of in2/in3.
+  swap_16x16(in0 + 16, in1 + 16, in2, in3);
+}
+
+// One "half-right" pass over a 32x32 block held in four 32-vector buffers:
+// applies fhalfright32_16col (no_transpose) to the in0/in1 pair and to the
+// in2/in3 pair, then exchanges the second half of in0/in1 with the first
+// half of in2/in3.
+static INLINE void fhalfright32(__m128i *in0, __m128i *in1, __m128i *in2,
+                                __m128i *in3) {
+  fhalfright32_16col(in0, in1, in0 + 16, in1 + 16, no_transpose);
+  fhalfright32_16col(in2, in3, in2 + 16, in3 + 16, no_transpose);
+
+  swap_16x16(in0 + 16, in1 + 16, in2, in3);
+}
+
+#if CONFIG_EXT_TX
+// One identity pass over a 32x32 block held in four 32-vector buffers:
+// applies fidtx32_16col to the in0/in1 pair and to the in2/in3 pair, then
+// exchanges the second half of in0/in1 with the first half of in2/in3.
+static INLINE void fidtx32(__m128i *in0, __m128i *in1, __m128i *in2,
+                           __m128i *in3) {
+  fidtx32_16col(in0, in1, in0 + 16, in1 + 16);
+  fidtx32_16col(in2, in3, in2 + 16, in3 + 16);
+
+  swap_16x16(in0 + 16, in1 + 16, in2, in3);
+}
+#endif
+
+// Applies round_signed_16x16 to each of the four 16x16 quadrants of a 32x32
+// block held in four 32-vector buffers.
+static INLINE void round_signed_32x32(__m128i *in0, __m128i *in1, __m128i *in2,
+                                      __m128i *in3) {
+  // in0/in1 pair: first 16 vectors, then the last 16.
+  round_signed_16x16(in0, in1);
+  round_signed_16x16(in0 + 16, in1 + 16);
+
+  // in2/in3 pair: first 16 vectors, then the last 16.
+  round_signed_16x16(in2, in3);
+  round_signed_16x16(in2 + 16, in3 + 16);
+}
+
+// Stores four 32-vector result buffers as a 32x32 coefficient block (row
+// pitch 32); in0..in3 supply columns 0..7, 8..15, 16..23 and 24..31 of each
+// row.
+static INLINE void write_buffer_32x32(__m128i *in0, __m128i *in1, __m128i *in2,
+                                      __m128i *in3, tran_low_t *output) {
+  tran_low_t *dst = output;
+  int row;
+
+  for (row = 0; row < 32; ++row, dst += 32) {
+    store_output(&in0[row], dst);
+    store_output(&in1[row], dst + 8);
+    store_output(&in2[row], dst + 16);
+    store_output(&in3[row], dst + 24);
+  }
+}
+
+// Forward hybrid transform of a 32x32 residual block.
+//
+// tx_type is decoded into load-time flip flags plus one kernel per pass; both
+// passes choose among fdct32, fhalfright32 and fidtx32, with
+// round_signed_32x32 applied between them.
+// The block is loaded exactly once, with the flip flags already applied.
+// An unsupported tx_type trips assert(0); with asserts compiled out the
+// unflipped, untransformed input is written to `output`.
+void av1_fht32x32_sse2(const int16_t *input, tran_low_t *output, int stride,
+                       int tx_type) {
+  enum { STAGE_NONE, STAGE_DCT, STAGE_ADST, STAGE_IDTX };
+  __m128i in0[32], in1[32], in2[32], in3[32];
+  int pass1 = STAGE_NONE;  // ADST means half-right
+  int pass2 = STAGE_NONE;
+  int flipud = 0;
+  int fliplr = 0;
+
+  switch (tx_type) {
+    case DCT_DCT: pass1 = STAGE_DCT; pass2 = STAGE_DCT; break;
+    case ADST_DCT: pass1 = STAGE_ADST; pass2 = STAGE_DCT; break;
+    case DCT_ADST: pass1 = STAGE_DCT; pass2 = STAGE_ADST; break;
+    case ADST_ADST: pass1 = STAGE_ADST; pass2 = STAGE_ADST; break;
+#if CONFIG_EXT_TX
+    case FLIPADST_DCT:
+      flipud = 1;
+      pass1 = STAGE_ADST;
+      pass2 = STAGE_DCT;
+      break;
+    case DCT_FLIPADST:
+      fliplr = 1;
+      pass1 = STAGE_DCT;
+      pass2 = STAGE_ADST;
+      break;
+    case FLIPADST_FLIPADST:
+      flipud = 1;
+      fliplr = 1;
+      pass1 = STAGE_ADST;
+      pass2 = STAGE_ADST;
+      break;
+    case ADST_FLIPADST:
+      fliplr = 1;
+      pass1 = STAGE_ADST;
+      pass2 = STAGE_ADST;
+      break;
+    case FLIPADST_ADST:
+      flipud = 1;
+      pass1 = STAGE_ADST;
+      pass2 = STAGE_ADST;
+      break;
+    case IDTX: pass1 = STAGE_IDTX; pass2 = STAGE_IDTX; break;
+    case V_DCT: pass1 = STAGE_DCT; pass2 = STAGE_IDTX; break;
+    case H_DCT: pass1 = STAGE_IDTX; pass2 = STAGE_DCT; break;
+    case V_ADST: pass1 = STAGE_ADST; pass2 = STAGE_IDTX; break;
+    case H_ADST: pass1 = STAGE_IDTX; pass2 = STAGE_ADST; break;
+    case V_FLIPADST:
+      flipud = 1;
+      pass1 = STAGE_ADST;
+      pass2 = STAGE_IDTX;
+      break;
+    case H_FLIPADST:
+      fliplr = 1;
+      pass1 = STAGE_IDTX;
+      pass2 = STAGE_ADST;
+      break;
+#endif
+    default: assert(0); break;
+  }
+
+  // Single load; both flags stay 0 for an unsupported tx_type.
+  load_buffer_32x32(input, in0, in1, in2, in3, stride, flipud, fliplr);
+
+  if (pass1 != STAGE_NONE) {
+    switch (pass1) {
+      case STAGE_DCT: fdct32(in0, in1, in2, in3); break;
+      case STAGE_ADST: fhalfright32(in0, in1, in2, in3); break;
+#if CONFIG_EXT_TX
+      case STAGE_IDTX: fidtx32(in0, in1, in2, in3); break;
+#endif
+      default: break;
+    }
+
+    round_signed_32x32(in0, in1, in2, in3);
+
+    switch (pass2) {
+      case STAGE_DCT: fdct32(in0, in1, in2, in3); break;
+      case STAGE_ADST: fhalfright32(in0, in1, in2, in3); break;
+#if CONFIG_EXT_TX
+      case STAGE_IDTX: fidtx32(in0, in1, in2, in3); break;
+#endif
+      default: break;
+    }
+  }
+
+  write_buffer_32x32(in0, in1, in2, in3, output);
+}
diff --git a/third_party/aom/av1/encoder/x86/dct_sse2.asm b/third_party/aom/av1/encoder/x86/dct_sse2.asm
new file mode 100644
index 0000000000..a99db3d6ed
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/dct_sse2.asm
@@ -0,0 +1,87 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+%define private_prefix av1
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+; One 4-point transform pass over four vectors of 16-bit words.
+; In:  m0 = a1, m1 = b1, m2 = c1, m3 = d1.
+; Out: m0 = a0, m1 = c1, m2 = d1, m3 = b1 (after the register renames below).
+; Uses m4 and m5 as scratch.  The copies use movq, so only the low 64 bits
+; (four words) of each register are carried through.
+%macro TRANSFORM_COLS 0
+ ; m0 = a1 + b1
+ paddw m0, m1
+ movq m4, m0
+ ; m3 = d1 - c1
+ psubw m3, m2
+ ; m4 = ((a1 + b1) - (d1 - c1)) >> 1, arithmetic shift
+ psubw m4, m3
+ psraw m4, 1
+ movq m5, m4
+ ; m5 = m4 - b1 (new b1), m4 = m4 - c1 (new c1)
+ psubw m5, m1 ;b1
+ psubw m4, m2 ;c1
+ ; m0 = (a1 + b1) - new c1, m3 = (d1 - c1) + new b1
+ psubw m0, m4
+ paddw m3, m5
+ ; m0 a0
+ ; Rename registers so the results sit in m0..m3 in the order noted below.
+ SWAP 1, 4 ; m1 c1
+ SWAP 2, 3 ; m2 d1
+ SWAP 3, 5 ; m3 b1
+%endmacro
+
+; Transposes the 4x4 block of words held in the low halves of m0..m3 (one
+; row per register).  The result is packed into two registers: m0 holds
+; columns 0 and 1, m1 holds columns 2 and 3.  m2 is overwritten with an
+; intermediate value; m3 is left unchanged.
+%macro TRANSPOSE_4X4 0
+ ; 00 01 02 03
+ ; 10 11 12 13
+ ; 20 21 22 23
+ ; 30 31 32 33
+ punpcklwd m0, m1 ; 00 10 01 11 02 12 03 13
+ punpcklwd m2, m3 ; 20 30 21 31 22 32 23 33
+ mova m1, m0
+ punpckldq m0, m2 ; 00 10 20 30 01 11 21 31
+ punpckhdq m1, m2 ; 02 12 22 32 03 13 23 33
+%endmacro
+
+; fwht4x4(input, output, stride): 4x4 forward transform built from two
+; TRANSFORM_COLS passes with a transpose after each, followed by a multiply
+; by 4.
+; x86inc cglobal arguments: 3 parameters, 4 general-purpose registers, 8
+; vector registers.  With private_prefix "av1" and INIT_XMM sse2 the exported
+; symbol is presumably av1_fwht4x4_sse2 -- confirm against the rtcd header.
+INIT_XMM sse2
+cglobal fwht4x4, 3, 4, 8, input, output, stride
+ ; Rows are read 2*stride bytes apart (presumably stride counts 16-bit
+ ; samples -- confirm against the C prototype); r3 points at the third row.
+ lea r3q, [inputq + strideq*4]
+ movq m0, [inputq] ;a1
+ movq m1, [inputq + strideq*2] ;b1
+ movq m2, [r3q] ;c1
+ movq m3, [r3q + strideq*2] ;d1
+
+ ; First pass, then transpose: m0 = columns 0/1, m1 = columns 2/3.
+ TRANSFORM_COLS
+ TRANSPOSE_4X4
+ ; Unpack to one column per register: move columns 2/3 to m2, then split the
+ ; high halves of m0 and m2 into m1 and m3.
+ SWAP 1, 2
+ psrldq m1, m0, 8
+ psrldq m3, m2, 8
+ ; Second pass over the transposed data, then transpose back.
+ TRANSFORM_COLS
+ TRANSPOSE_4X4
+
+ ; Scale all 16 results by 4.
+ psllw m0, 2
+ psllw m1, 2
+
+%if CONFIG_HIGHBITDEPTH
+ ; sign extension
+ ; Widen each 16-bit result to 32 bits: duplicate every word into both
+ ; halves of a dword, then shift right arithmetically by 16.
+ mova m2, m0
+ mova m3, m1
+ punpcklwd m0, m0
+ punpcklwd m1, m1
+ punpckhwd m2, m2
+ punpckhwd m3, m3
+ psrad m0, 16
+ psrad m1, 16
+ psrad m2, 16
+ psrad m3, 16
+ ; 64 bytes of 32-bit output; mova requires outputq to be 16-byte aligned.
+ mova [outputq], m0
+ mova [outputq + 16], m2
+ mova [outputq + 32], m1
+ mova [outputq + 48], m3
+%else
+ ; 32 bytes of 16-bit output; mova requires outputq to be 16-byte aligned.
+ mova [outputq], m0
+ mova [outputq + 16], m1
+%endif
+
+ RET
diff --git a/third_party/aom/av1/encoder/x86/dct_ssse3.c b/third_party/aom/av1/encoder/x86/dct_ssse3.c
new file mode 100644
index 0000000000..717a99af8f
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/dct_ssse3.c
@@ -0,0 +1,469 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#if defined(_MSC_VER) && _MSC_VER <= 1500
+// Need to include math.h before calling tmmintrin.h/intrin.h
+// in certain versions of MSVS.
+#include <math.h>
+#endif
+#include <tmmintrin.h> // SSSE3
+
+#include "./av1_rtcd.h"
+#include "aom_dsp/x86/inv_txfm_sse2.h"
+#include "aom_dsp/x86/txfm_common_sse2.h"
+// 8x8 forward DCT fused with quantization; coeff_ptr is never written.
+void av1_fdct8x8_quant_ssse3(
+ const int16_t *input, int stride, int16_t *coeff_ptr, intptr_t n_coeffs,
+ int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr, const int16_t *quant_shift_ptr,
+ int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+ uint16_t *eob_ptr, const int16_t *scan_ptr, const int16_t *iscan_ptr) {
+ __m128i zero;
+ int pass;
+ // Constants
+ // When we use them, in one case, they are all the same. In all others
+ // it's a pair of them that we need to repeat four times. This is done
+ // by constructing the 32 bit constant corresponding to that pair.
+ const __m128i k__dual_p16_p16 = dual_set_epi16(23170, 23170);
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
+ const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
+ const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
+ const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+ const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
+ const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ // Load input (eight rows of eight int16_t, using 16-byte aligned loads)
+ __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
+ __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
+ __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
+ __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride));
+ __m128i in4 = _mm_load_si128((const __m128i *)(input + 4 * stride));
+ __m128i in5 = _mm_load_si128((const __m128i *)(input + 5 * stride));
+ __m128i in6 = _mm_load_si128((const __m128i *)(input + 6 * stride));
+ __m128i in7 = _mm_load_si128((const __m128i *)(input + 7 * stride));
+ __m128i *in[8];
+ int index = 0;
+
+ (void)scan_ptr;  // these four parameters are not used by this implementation
+ (void)zbin_ptr;
+ (void)quant_shift_ptr;
+ (void)coeff_ptr;
+
+ // Pre-condition input (shift by two)
+ in0 = _mm_slli_epi16(in0, 2);
+ in1 = _mm_slli_epi16(in1, 2);
+ in2 = _mm_slli_epi16(in2, 2);
+ in3 = _mm_slli_epi16(in3, 2);
+ in4 = _mm_slli_epi16(in4, 2);
+ in5 = _mm_slli_epi16(in5, 2);
+ in6 = _mm_slli_epi16(in6, 2);
+ in7 = _mm_slli_epi16(in7, 2);
+
+ in[0] = &in0;
+ in[1] = &in1;
+ in[2] = &in2;
+ in[3] = &in3;
+ in[4] = &in4;
+ in[5] = &in5;
+ in[6] = &in6;
+ in[7] = &in7;
+
+ // We do two passes, first the columns, then the rows. The results of the
+ // first pass are transposed so that the same column code can be reused. The
+ // results of the second pass are also transposed so that the rows (processed
+ // as columns) are put back in row positions.
+ for (pass = 0; pass < 2; pass++) {
+ // To store results of each pass before the transpose.
+ __m128i res0, res1, res2, res3, res4, res5, res6, res7;
+ // Add/subtract
+ const __m128i q0 = _mm_add_epi16(in0, in7);
+ const __m128i q1 = _mm_add_epi16(in1, in6);
+ const __m128i q2 = _mm_add_epi16(in2, in5);
+ const __m128i q3 = _mm_add_epi16(in3, in4);
+ const __m128i q4 = _mm_sub_epi16(in3, in4);
+ const __m128i q5 = _mm_sub_epi16(in2, in5);
+ const __m128i q6 = _mm_sub_epi16(in1, in6);
+ const __m128i q7 = _mm_sub_epi16(in0, in7);
+ // Work on first four results
+ {
+ // Add/subtract
+ const __m128i r0 = _mm_add_epi16(q0, q3);
+ const __m128i r1 = _mm_add_epi16(q1, q2);
+ const __m128i r2 = _mm_sub_epi16(q1, q2);
+ const __m128i r3 = _mm_sub_epi16(q0, q3);
+ // Interleave to do the multiply by constants which gets us into 32bits
+ const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
+ const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
+ const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
+ const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
+
+ const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
+ const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
+ const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
+ const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16);
+
+ const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
+ const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08);
+ const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
+ const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24);
+ // dct_const_round_shift
+
+ const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+ const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+ const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+ const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+
+ const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
+ const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
+ const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
+ const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
+
+ const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+
+ const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+ const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+ const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+ const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+ // Combine
+
+ res0 = _mm_packs_epi32(w0, w1);
+ res4 = _mm_packs_epi32(w2, w3);
+ res2 = _mm_packs_epi32(w4, w5);
+ res6 = _mm_packs_epi32(w6, w7);
+ }
+ // Work on next four results
+ {
+ // Interleave to do the multiply by constants which gets us into 32bits
+ const __m128i d0 = _mm_sub_epi16(q6, q5);
+ const __m128i d1 = _mm_add_epi16(q6, q5);
+ const __m128i r0 = _mm_mulhrs_epi16(d0, k__dual_p16_p16);
+ const __m128i r1 = _mm_mulhrs_epi16(d1, k__dual_p16_p16);
+
+ // Add/subtract
+ const __m128i x0 = _mm_add_epi16(q4, r0);
+ const __m128i x1 = _mm_sub_epi16(q4, r0);
+ const __m128i x2 = _mm_sub_epi16(q7, r1);
+ const __m128i x3 = _mm_add_epi16(q7, r1);
+ // Interleave to do the multiply by constants which gets us into 32bits
+ const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
+ const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
+ const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
+ const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
+ const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04);
+ const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04);
+ const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28);
+ const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28);
+ const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20);
+ const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20);
+ const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12);
+ const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12);
+ // dct_const_round_shift
+ const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+ const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+ const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+ const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+ const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
+ const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
+ const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
+ const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
+ const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+ const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+ const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+ const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+ // Combine
+ res1 = _mm_packs_epi32(w0, w1);
+ res7 = _mm_packs_epi32(w2, w3);
+ res5 = _mm_packs_epi32(w4, w5);
+ res3 = _mm_packs_epi32(w6, w7);
+ }
+ // Transpose the 8x8.
+ {
+ // 00 01 02 03 04 05 06 07
+ // 10 11 12 13 14 15 16 17
+ // 20 21 22 23 24 25 26 27
+ // 30 31 32 33 34 35 36 37
+ // 40 41 42 43 44 45 46 47
+ // 50 51 52 53 54 55 56 57
+ // 60 61 62 63 64 65 66 67
+ // 70 71 72 73 74 75 76 77
+ const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1);
+ const __m128i tr0_1 = _mm_unpacklo_epi16(res2, res3);
+ const __m128i tr0_2 = _mm_unpackhi_epi16(res0, res1);
+ const __m128i tr0_3 = _mm_unpackhi_epi16(res2, res3);
+ const __m128i tr0_4 = _mm_unpacklo_epi16(res4, res5);
+ const __m128i tr0_5 = _mm_unpacklo_epi16(res6, res7);
+ const __m128i tr0_6 = _mm_unpackhi_epi16(res4, res5);
+ const __m128i tr0_7 = _mm_unpackhi_epi16(res6, res7);
+ // 00 10 01 11 02 12 03 13
+ // 20 30 21 31 22 32 23 33
+ // 04 14 05 15 06 16 07 17
+ // 24 34 25 35 26 36 27 37
+ // 40 50 41 51 42 52 43 53
+ // 60 70 61 71 62 72 63 73
+ // 44 54 45 55 46 56 47 57
+ // 64 74 65 75 66 76 67 77
+ const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+ const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
+ const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+ const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
+ const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
+ const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
+ const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
+ const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
+ // 00 10 20 30 01 11 21 31
+ // 40 50 60 70 41 51 61 71
+ // 02 12 22 32 03 13 23 33
+ // 42 52 62 72 43 53 63 73
+ // 04 14 24 34 05 15 25 35
+ // 44 54 64 74 45 55 65 75
+ // 06 16 26 36 07 17 27 37
+ // 46 56 66 76 47 57 67 77
+ in0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
+ in1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
+ in2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
+ in3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
+ in4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
+ in5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
+ in6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
+ in7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
+ // 00 10 20 30 40 50 60 70
+ // 01 11 21 31 41 51 61 71
+ // 02 12 22 32 42 52 62 72
+ // 03 13 23 33 43 53 63 73
+ // 04 14 24 34 44 54 64 74
+ // 05 15 25 35 45 55 65 75
+ // 06 16 26 36 46 56 66 76
+ // 07 17 27 37 47 57 67 77
+ }
+ }
+ // Post-condition output and store it
+ {
+ // Post-condition (division by two)
+ // division by two of 16 bits signed numbers using shifts (truncating)
+ // n / 2 = (n - (n >> 15)) >> 1
+ const __m128i sign_in0 = _mm_srai_epi16(in0, 15);
+ const __m128i sign_in1 = _mm_srai_epi16(in1, 15);
+ const __m128i sign_in2 = _mm_srai_epi16(in2, 15);
+ const __m128i sign_in3 = _mm_srai_epi16(in3, 15);
+ const __m128i sign_in4 = _mm_srai_epi16(in4, 15);
+ const __m128i sign_in5 = _mm_srai_epi16(in5, 15);
+ const __m128i sign_in6 = _mm_srai_epi16(in6, 15);
+ const __m128i sign_in7 = _mm_srai_epi16(in7, 15);
+ in0 = _mm_sub_epi16(in0, sign_in0);
+ in1 = _mm_sub_epi16(in1, sign_in1);
+ in2 = _mm_sub_epi16(in2, sign_in2);
+ in3 = _mm_sub_epi16(in3, sign_in3);
+ in4 = _mm_sub_epi16(in4, sign_in4);
+ in5 = _mm_sub_epi16(in5, sign_in5);
+ in6 = _mm_sub_epi16(in6, sign_in6);
+ in7 = _mm_sub_epi16(in7, sign_in7);
+ in0 = _mm_srai_epi16(in0, 1);
+ in1 = _mm_srai_epi16(in1, 1);
+ in2 = _mm_srai_epi16(in2, 1);
+ in3 = _mm_srai_epi16(in3, 1);
+ in4 = _mm_srai_epi16(in4, 1);
+ in5 = _mm_srai_epi16(in5, 1);
+ in6 = _mm_srai_epi16(in6, 1);
+ in7 = _mm_srai_epi16(in7, 1);
+ }
+ // Point past the end of each buffer; a negative index then counts up to zero.
+ iscan_ptr += n_coeffs;
+ qcoeff_ptr += n_coeffs;
+ dqcoeff_ptr += n_coeffs;
+ n_coeffs = -n_coeffs;
+ zero = _mm_setzero_si128();
+
+ if (!skip_block) {
+ __m128i eob;
+ __m128i round, quant, dequant, thr;
+ int16_t nzflag;
+ {
+ __m128i coeff0, coeff1;
+
+ // Setup global values
+ {
+ round = _mm_load_si128((const __m128i *)round_ptr);
+ quant = _mm_load_si128((const __m128i *)quant_ptr);
+ dequant = _mm_load_si128((const __m128i *)dequant_ptr);
+ }
+
+ {
+ __m128i coeff0_sign, coeff1_sign;
+ __m128i qcoeff0, qcoeff1;
+ __m128i qtmp0, qtmp1;
+ // Do DC and first 15 AC
+ coeff0 = *in[0];
+ coeff1 = *in[1];
+
+ // Poor man's sign extract
+ coeff0_sign = _mm_srai_epi16(coeff0, 15);
+ coeff1_sign = _mm_srai_epi16(coeff1, 15);
+ qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
+ qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
+ qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+ qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+ qcoeff0 = _mm_adds_epi16(qcoeff0, round);
+ round = _mm_unpackhi_epi64(round, round);  // upper four lanes in both halves
+ qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+ qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
+ quant = _mm_unpackhi_epi64(quant, quant);  // upper four lanes in both halves
+ qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
+
+ // Reinsert signs
+ qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
+ qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
+ qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+ qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+ _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), qcoeff0);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
+
+ coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);  // upper four lanes in both halves
+ coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), coeff0);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
+ }
+
+ {
+ // Scan for eob
+ __m128i zero_coeff0, zero_coeff1;
+ __m128i nzero_coeff0, nzero_coeff1;
+ __m128i iscan0, iscan1;
+ __m128i eob1;
+ zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
+ zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
+ nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
+ nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
+ iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
+ iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
+ // Add one to convert from indices to counts
+ iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
+ iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
+ eob = _mm_and_si128(iscan0, nzero_coeff0);
+ eob1 = _mm_and_si128(iscan1, nzero_coeff1);
+ eob = _mm_max_epi16(eob, eob1);
+ }
+ n_coeffs += 8 * 2;
+ }
+
+ // AC only loop
+ index = 2;
+ thr = _mm_srai_epi16(dequant, 1);  // half of the dequant value in every lane
+ while (n_coeffs < 0) {
+ __m128i coeff0, coeff1;
+ {
+ __m128i coeff0_sign, coeff1_sign;
+ __m128i qcoeff0, qcoeff1;
+ __m128i qtmp0, qtmp1;
+
+ assert(index < (int)(sizeof(in) / sizeof(in[0])) - 1);
+ coeff0 = *in[index];
+ coeff1 = *in[index + 1];
+
+ // Poor man's sign extract
+ coeff0_sign = _mm_srai_epi16(coeff0, 15);
+ coeff1_sign = _mm_srai_epi16(coeff1, 15);
+ qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
+ qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
+ qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+ qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+ nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) |
+ _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr));
+
+ if (nzflag) {  // at least one magnitude in this group exceeds thr
+ qcoeff0 = _mm_adds_epi16(qcoeff0, round);
+ qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+ qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
+ qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
+
+ // Reinsert signs
+ qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
+ qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
+ qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+ qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+ _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), qcoeff0);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
+
+ coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
+ coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), coeff0);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
+ } else {  // the whole group is at or below thr: store zeros
+ _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, zero);
+
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, zero);
+ }
+ }
+
+ if (nzflag) {
+ // Scan for eob
+ __m128i zero_coeff0, zero_coeff1;
+ __m128i nzero_coeff0, nzero_coeff1;
+ __m128i iscan0, iscan1;
+ __m128i eob0, eob1;
+ zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
+ zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
+ nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
+ nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
+ iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
+ iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
+ // Add one to convert from indices to counts
+ iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
+ iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
+ eob0 = _mm_and_si128(iscan0, nzero_coeff0);
+ eob1 = _mm_and_si128(iscan1, nzero_coeff1);
+ eob0 = _mm_max_epi16(eob0, eob1);
+ eob = _mm_max_epi16(eob, eob0);
+ }
+ n_coeffs += 8 * 2;
+ index += 2;
+ }
+
+ // Accumulate EOB (horizontal max over the eight 16-bit lanes)
+ {
+ __m128i eob_shuffled;
+ eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ *eob_ptr = _mm_extract_epi16(eob, 1);
+ }
+ } else {
+ do {
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, zero);
+ n_coeffs += 8 * 2;
+ } while (n_coeffs < 0);
+ *eob_ptr = 0;
+ }
+}
diff --git a/third_party/aom/av1/encoder/x86/error_intrin_avx2.c b/third_party/aom/av1/encoder/x86/error_intrin_avx2.c
new file mode 100644
index 0000000000..ae733a1ce8
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/error_intrin_avx2.c
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h> // AVX2
+
+#include "./av1_rtcd.h"
+#include "aom/aom_integer.h"
+
+int64_t av1_block_error_avx2(const int16_t *coeff, const int16_t *dqcoeff,
+ intptr_t block_size, int64_t *ssz) {
+ __m256i sse_reg, ssz_reg, coeff_reg, dqcoeff_reg;
+ __m256i exp_dqcoeff_lo, exp_dqcoeff_hi, exp_coeff_lo, exp_coeff_hi;
+ __m256i sse_reg_64hi, ssz_reg_64hi;
+ __m128i sse_reg128, ssz_reg128;
+ int64_t sse;
+ int i;
+ const __m256i zero_reg = _mm256_set1_epi16(0);
+
+ // init sse and ssz registerd to zero
+ sse_reg = _mm256_set1_epi16(0);
+ ssz_reg = _mm256_set1_epi16(0);
+
+ for (i = 0; i < block_size; i += 16) {
+ // load 32 bytes from coeff and dqcoeff
+ coeff_reg = _mm256_loadu_si256((const __m256i *)(coeff + i));
+ dqcoeff_reg = _mm256_loadu_si256((const __m256i *)(dqcoeff + i));
+ // dqcoeff - coeff
+ dqcoeff_reg = _mm256_sub_epi16(dqcoeff_reg, coeff_reg);
+ // madd (dqcoeff - coeff)
+ dqcoeff_reg = _mm256_madd_epi16(dqcoeff_reg, dqcoeff_reg);
+ // madd coeff
+ coeff_reg = _mm256_madd_epi16(coeff_reg, coeff_reg);
+ // expand each double word of madd (dqcoeff - coeff) to quad word
+ exp_dqcoeff_lo = _mm256_unpacklo_epi32(dqcoeff_reg, zero_reg);
+ exp_dqcoeff_hi = _mm256_unpackhi_epi32(dqcoeff_reg, zero_reg);
+ // expand each double word of madd (coeff) to quad word
+ exp_coeff_lo = _mm256_unpacklo_epi32(coeff_reg, zero_reg);
+ exp_coeff_hi = _mm256_unpackhi_epi32(coeff_reg, zero_reg);
+ // add each quad word of madd (dqcoeff - coeff) and madd (coeff)
+ sse_reg = _mm256_add_epi64(sse_reg, exp_dqcoeff_lo);
+ ssz_reg = _mm256_add_epi64(ssz_reg, exp_coeff_lo);
+ sse_reg = _mm256_add_epi64(sse_reg, exp_dqcoeff_hi);
+ ssz_reg = _mm256_add_epi64(ssz_reg, exp_coeff_hi);
+ }
+ // save the higher 64 bit of each 128 bit lane
+ sse_reg_64hi = _mm256_srli_si256(sse_reg, 8);
+ ssz_reg_64hi = _mm256_srli_si256(ssz_reg, 8);
+ // add the higher 64 bit to the low 64 bit
+ sse_reg = _mm256_add_epi64(sse_reg, sse_reg_64hi);
+ ssz_reg = _mm256_add_epi64(ssz_reg, ssz_reg_64hi);
+
+ // add each 64 bit from each of the 128 bit lane of the 256 bit
+ sse_reg128 = _mm_add_epi64(_mm256_castsi256_si128(sse_reg),
+ _mm256_extractf128_si256(sse_reg, 1));
+
+ ssz_reg128 = _mm_add_epi64(_mm256_castsi256_si128(ssz_reg),
+ _mm256_extractf128_si256(ssz_reg, 1));
+
+ // store the results
+ _mm_storel_epi64((__m128i *)(&sse), sse_reg128);
+
+ _mm_storel_epi64((__m128i *)(ssz), ssz_reg128);
+ _mm256_zeroupper();
+ return sse;
+}
diff --git a/third_party/aom/av1/encoder/x86/error_sse2.asm b/third_party/aom/av1/encoder/x86/error_sse2.asm
new file mode 100644
index 0000000000..4680f1fabd
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/error_sse2.asm
@@ -0,0 +1,125 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%define private_prefix av1
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+; int64_t av1_block_error(int16_t *coeff, int16_t *dqcoeff, intptr_t block_size,
+; int64_t *ssz)
+
+INIT_XMM sse2
+cglobal block_error, 3, 3, 8, uqc, dqc, size, ssz
+ pxor m4, m4 ; sse accumulator
+ pxor m6, m6 ; ssz accumulator
+ pxor m5, m5 ; dedicated zero register
+ lea uqcq, [uqcq+sizeq*2] ; point both arrays just past their last 16-bit entry
+ lea dqcq, [dqcq+sizeq*2]
+ neg sizeq ; the index now runs from -size up to 0
+.loop:
+ mova m2, [uqcq+sizeq*2] ; first register of coeff
+ mova m0, [dqcq+sizeq*2] ; first register of dqcoeff
+ mova m3, [uqcq+sizeq*2+mmsize] ; second register of coeff
+ mova m1, [dqcq+sizeq*2+mmsize] ; second register of dqcoeff
+ psubw m0, m2 ; dqcoeff - coeff
+ psubw m1, m3
+ ; individual errors are max. 15bit+sign, so squares are 30bit, and
+ ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit)
+ pmaddwd m0, m0 ; squared differences, adjacent pairs summed
+ pmaddwd m1, m1
+ pmaddwd m2, m2 ; squared coeffs, adjacent pairs summed
+ pmaddwd m3, m3
+ ; accumulate in 64bit
+ punpckldq m7, m0, m5 ; interleave with zero: dwords become qwords
+ punpckhdq m0, m5
+ paddq m4, m7
+ punpckldq m7, m1, m5
+ paddq m4, m0
+ punpckhdq m1, m5
+ paddq m4, m7
+ punpckldq m7, m2, m5
+ paddq m4, m1
+ punpckhdq m2, m5
+ paddq m6, m7
+ punpckldq m7, m3, m5
+ paddq m6, m2
+ punpckhdq m3, m5
+ paddq m6, m7
+ paddq m6, m3
+ add sizeq, mmsize ; two registers of 16-bit entries = mmsize entries per pass
+ jl .loop
+
+ ; accumulate horizontally and store in return value
+ movhlps m5, m4 ; upper qword of each accumulator into a scratch register
+ movhlps m7, m6
+ paddq m4, m5
+ paddq m6, m7
+%if ARCH_X86_64
+ movq rax, m4 ; return value = sse
+ movq [sszq], m6 ; *ssz
+%else
+ mov eax, sszm ; ssz pointer is read from its stack slot
+ pshufd m5, m4, 0x1 ; second dword of sse into lane 0
+ movq [eax], m6 ; *ssz
+ movd eax, m4 ; low 32 bits of sse
+ movd edx, m5 ; high 32 bits of sse
+%endif
+ RET
+
+; Compute the sum of squared difference between two int16_t vectors.
+; int64_t av1_block_error_fp(int16_t *coeff, int16_t *dqcoeff,
+; intptr_t block_size)
+
+INIT_XMM sse2
+cglobal block_error_fp, 3, 3, 6, uqc, dqc, size
+ pxor m4, m4 ; sse accumulator
+ pxor m5, m5 ; dedicated zero register
+ lea uqcq, [uqcq+sizeq*2] ; point both arrays just past their last 16-bit entry
+ lea dqcq, [dqcq+sizeq*2]
+ neg sizeq ; the index now runs from -size up to 0
+.loop:
+ mova m2, [uqcq+sizeq*2] ; first register of coeff
+ mova m0, [dqcq+sizeq*2] ; first register of dqcoeff
+ mova m3, [uqcq+sizeq*2+mmsize] ; second register of coeff
+ mova m1, [dqcq+sizeq*2+mmsize] ; second register of dqcoeff
+ psubw m0, m2 ; dqcoeff - coeff
+ psubw m1, m3
+ ; individual errors are max. 15bit+sign, so squares are 30bit, and
+ ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit)
+ pmaddwd m0, m0 ; squared differences, adjacent pairs summed
+ pmaddwd m1, m1
+ ; accumulate in 64bit
+ punpckldq m3, m0, m5 ; interleave with zero: dwords become qwords
+ punpckhdq m0, m5
+ paddq m4, m3
+ punpckldq m3, m1, m5
+ paddq m4, m0
+ punpckhdq m1, m5
+ paddq m4, m3
+ paddq m4, m1
+ add sizeq, mmsize ; two registers of 16-bit entries = mmsize entries per pass
+ jl .loop
+
+ ; accumulate horizontally and store in return value
+ movhlps m5, m4 ; upper qword of the accumulator into a scratch register
+ paddq m4, m5
+%if ARCH_X86_64
+ movq rax, m4 ; return value = sse
+%else
+ pshufd m5, m4, 0x1 ; second dword of sse into lane 0
+ movd eax, m4 ; low 32 bits of sse
+ movd edx, m5 ; high 32 bits of sse
+%endif
+ RET
diff --git a/third_party/aom/av1/encoder/x86/highbd_block_error_intrin_sse2.c b/third_party/aom/av1/encoder/x86/highbd_block_error_intrin_sse2.c
new file mode 100644
index 0000000000..777304ace7
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/highbd_block_error_intrin_sse2.c
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>
+#include <stdio.h>
+
+#include "av1/common/common.h"
+
+// Returns the sum of squared differences between coeff[] and dqcoeff[] and
+// stores the sum of squared coeff[] values in *ssz; both sums are rounded and
+// shifted right by 2 * (bps - 8). Each iteration reads eight 32-bit entries.
+int64_t av1_highbd_block_error_sse2(tran_low_t *coeff, tran_low_t *dqcoeff,
+                                    intptr_t block_size, int64_t *ssz,
+                                    int bps) {
+  int i, j;
+  uint32_t temp[4];
+  int64_t error = 0, sqcoeff = 0;
+  const int shift = 2 * (bps - 8);
+  const int rounding = shift > 0 ? 1 << (shift - 1) : 0;
+  const __m128i max = _mm_set1_epi32(0x3fff);   // largest signed 15-bit value
+  const __m128i min = _mm_set1_epi32(-0x4000);  // smallest signed 15-bit value
+
+  for (i = 0; i < block_size; i += 8) {
+    __m128i mm_coeff = _mm_load_si128((const __m128i *)(coeff + i));
+    __m128i mm_coeff2 = _mm_load_si128((const __m128i *)(coeff + i + 4));
+    __m128i mm_dqcoeff = _mm_load_si128((const __m128i *)(dqcoeff + i));
+    __m128i mm_dqcoeff2 = _mm_load_si128((const __m128i *)(dqcoeff + i + 4));
+    // Flag every 32-bit lane that lies outside the signed 15-bit range
+    const __m128i cmp0 = _mm_or_si128(_mm_cmpgt_epi32(mm_coeff, max),
+                                      _mm_cmplt_epi32(mm_coeff, min));
+    const __m128i cmp1 = _mm_or_si128(_mm_cmpgt_epi32(mm_coeff2, max),
+                                      _mm_cmplt_epi32(mm_coeff2, min));
+    const __m128i cmp2 = _mm_or_si128(_mm_cmpgt_epi32(mm_dqcoeff, max),
+                                      _mm_cmplt_epi32(mm_dqcoeff, min));
+    const __m128i cmp3 = _mm_or_si128(_mm_cmpgt_epi32(mm_dqcoeff2, max),
+                                      _mm_cmplt_epi32(mm_dqcoeff2, min));
+    const int test = _mm_movemask_epi8(
+        _mm_or_si128(_mm_or_si128(cmp0, cmp1), _mm_or_si128(cmp2, cmp3)));
+
+    if (!test) {
+      // Fast path: every value fits in int16_t, so pack and use pmaddwd.
+      const __m128i c16 = _mm_packs_epi32(mm_coeff, mm_coeff2);
+      const __m128i dq16 = _mm_packs_epi32(mm_dqcoeff, mm_dqcoeff2);
+      const __m128i mm_diff = _mm_sub_epi16(c16, dq16);
+      _mm_storeu_si128((__m128i *)temp, _mm_madd_epi16(mm_diff, mm_diff));
+      error = error + temp[0] + temp[1] + temp[2] + temp[3];
+      _mm_storeu_si128((__m128i *)temp, _mm_madd_epi16(c16, c16));
+      sqcoeff += temp[0] + temp[1] + temp[2] + temp[3];
+    } else {
+      // Slow path: widen to 64 bits first so the subtraction cannot overflow.
+      for (j = 0; j < 8; j++) {
+        const int64_t diff = (int64_t)coeff[i + j] - dqcoeff[i + j];
+        error += diff * diff;
+        sqcoeff += (int64_t)coeff[i + j] * (int64_t)coeff[i + j];
+      }
+    }
+  }
+  assert(error >= 0 && sqcoeff >= 0);
+  error = (error + rounding) >> shift;
+  sqcoeff = (sqcoeff + rounding) >> shift;
+
+  *ssz = sqcoeff;
+  return error;
+}
diff --git a/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c b/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c
new file mode 100644
index 0000000000..f201a29aaa
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c
@@ -0,0 +1,1895 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <assert.h>
+#include <smmintrin.h> /* SSE4.1 */
+
+#include "./av1_rtcd.h"
+#include "./aom_config.h"
+#include "av1/common/av1_fwd_txfm2d_cfg.h"
+#include "av1/common/av1_txfm.h"
+#include "av1/common/x86/highbd_txfm_utility_sse4.h"
+#include "aom_dsp/txfm_common.h"
+#include "aom_dsp/x86/txfm_common_sse2.h"
+#include "aom_ports/mem.h"
+
+// Loads a 4x4 block of 16-bit samples into four registers of 32-bit lanes.
+//   input, stride: first sample and the distance (in samples) between rows.
+//   flipud: when non-zero, rows are fetched bottom-to-top.
+//   fliplr: when non-zero, the four samples of every row are reversed.
+//   shift:  left shift applied to every sign-extended sample.
+// in[r] receives output row r.
+static INLINE void load_buffer_4x4(const int16_t *input, __m128i *in,
+                                   int stride, int flipud, int fliplr,
+                                   int shift) {
+  int r;
+
+  for (r = 0; r < 4; ++r) {
+    const int src_row = flipud ? 3 - r : r;
+    // 64-bit load: exactly the four int16_t samples of this row.
+    __m128i px = _mm_loadl_epi64((const __m128i *)(input + src_row * stride));
+
+    // 0x1b picks lanes 3,2,1,0, i.e. it reverses the low four 16-bit lanes.
+    if (fliplr) px = _mm_shufflelo_epi16(px, 0x1b);
+
+    in[r] = _mm_slli_epi32(_mm_cvtepi16_epi32(px), shift);
+  }
+}
+
+// One 4-point butterfly pass over a 4x4 block held in in[0..3] (four 32-bit
+// lanes per register), followed by an in-register 4x4 transpose. The result
+// overwrites in[].
+// Only the stage-2 cos bit is used: `bit` selects the cospi_arr row and is the
+// rounding right-shift applied after each multiply-accumulate.
+// About the config shifts:
+//   shift[0] is used in load_buffer_4x4()
+//   shift[1] is used in txfm_func_col()
+//   shift[2] is used in txfm_func_row()
+static void fdct4x4_sse4_1(__m128i *in, int bit) {
+  const int32_t *cospi = cospi_arr[bit - cos_bit_min];
+  const __m128i c16 = _mm_set1_epi32(cospi[16]);
+  const __m128i c32 = _mm_set1_epi32(cospi[32]);
+  const __m128i c48 = _mm_set1_epi32(cospi[48]);
+  const __m128i half = _mm_set1_epi32(1 << (bit - 1));
+
+  // Mirrored sums and differences of the four input rows.
+  const __m128i sum03 = _mm_add_epi32(in[0], in[3]);
+  const __m128i sum12 = _mm_add_epi32(in[1], in[2]);
+  const __m128i dif12 = _mm_sub_epi32(in[1], in[2]);
+  const __m128i dif03 = _mm_sub_epi32(in[0], in[3]);
+
+  // btf_32_sse4_1_type0(cospi32, cospi32, s[01], u[02], bit);
+  const __m128i e0 = _mm_mullo_epi32(sum03, c32);
+  const __m128i e1 = _mm_mullo_epi32(sum12, c32);
+  const __m128i y0 =
+      _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(e0, e1), half), bit);
+  const __m128i y2 =
+      _mm_srai_epi32(_mm_add_epi32(_mm_sub_epi32(e0, e1), half), bit);
+
+  // btf_32_sse4_1_type1(cospi48, cospi16, s[23], u[13], bit);
+  const __m128i o1 = _mm_add_epi32(_mm_mullo_epi32(dif12, c48),
+                                   _mm_mullo_epi32(dif03, c16));
+  const __m128i o3 = _mm_sub_epi32(_mm_mullo_epi32(dif03, c48),
+                                   _mm_mullo_epi32(dif12, c16));
+  const __m128i y1 = _mm_srai_epi32(_mm_add_epi32(o1, half), bit);
+  const __m128i y3 = _mm_srai_epi32(_mm_add_epi32(o3, half), bit);
+
+  // Note: shift[1] and shift[2] are zeros
+
+  // Transpose 4x4 32-bit
+  const __m128i lo01 = _mm_unpacklo_epi32(y0, y1);
+  const __m128i hi01 = _mm_unpackhi_epi32(y0, y1);
+  const __m128i lo23 = _mm_unpacklo_epi32(y2, y3);
+  const __m128i hi23 = _mm_unpackhi_epi32(y2, y3);
+
+  in[0] = _mm_unpacklo_epi64(lo01, lo23);
+  in[1] = _mm_unpackhi_epi64(lo01, lo23);
+  in[2] = _mm_unpacklo_epi64(hi01, hi23);
+  in[3] = _mm_unpackhi_epi64(hi01, hi23);
+}
+
+// Stores the four registers of a 4x4 block to output, four coefficients per
+// register, using aligned 128-bit stores.
+static INLINE void write_buffer_4x4(__m128i *res, tran_low_t *output) {
+  int row;
+
+  for (row = 0; row < 4; ++row) {
+    _mm_store_si128((__m128i *)(output + row * 4), res[row]);
+  }
+}
+
+// Note:
+//  Placeholder only. We implement av1_fwd_txfm2d_4x4(); this function is kept
+//  here since av1_highbd_fht4x4_c() is not removed yet. It ignores all of its
+//  arguments and fails assert(0) if it is ever called.
+void av1_highbd_fht4x4_sse4_1(const int16_t *input, tran_low_t *output,
+                              int stride, int tx_type) {
+  (void)tx_type;
+  (void)stride;
+  (void)output;
+  (void)input;
+  assert(0);
+}
+
+// One 4-point ADST-style butterfly pass over a 4x4 block held in in[0..3]
+// (four 32-bit lanes per register), followed by an in-register 4x4 transpose.
+// The result overwrites in[]. `bit` selects the cospi_arr row and is the
+// rounding right-shift applied after each multiply-accumulate.
+static void fadst4x4_sse4_1(__m128i *in, int bit) {
+  const int32_t *cospi = cospi_arr[bit - cos_bit_min];
+  const __m128i c8 = _mm_set1_epi32(cospi[8]);
+  const __m128i c24 = _mm_set1_epi32(cospi[24]);
+  const __m128i c32 = _mm_set1_epi32(cospi[32]);
+  const __m128i c40 = _mm_set1_epi32(cospi[40]);
+  const __m128i c56 = _mm_set1_epi32(cospi[56]);
+  const __m128i half = _mm_set1_epi32(1 << (bit - 1));
+  const __m128i zero = _mm_setzero_si128();
+  __m128i a0, a1, a2, a3;  // stage 2 outputs
+  __m128i r0, r1, r2, r3;  // final rows, in output order
+  __m128i p, q;            // products of the current rotation
+  __m128i lo01, hi01, lo23, hi23;
+
+  // stage 0
+  // stage 1
+  // stage 2: two rotations, (in[3], in[0]) and (in[1], in[2])
+  p = _mm_mullo_epi32(in[3], c8);
+  q = _mm_mullo_epi32(in[0], c56);
+  a0 = _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(p, q), half), bit);
+
+  p = _mm_mullo_epi32(in[3], c56);
+  q = _mm_mullo_epi32(in[0], c8);
+  a1 = _mm_srai_epi32(_mm_add_epi32(_mm_sub_epi32(p, q), half), bit);
+
+  p = _mm_mullo_epi32(in[1], c40);
+  q = _mm_mullo_epi32(in[2], c24);
+  a2 = _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(p, q), half), bit);
+
+  p = _mm_mullo_epi32(in[1], c24);
+  q = _mm_mullo_epi32(in[2], c40);
+  a3 = _mm_srai_epi32(_mm_add_epi32(_mm_sub_epi32(p, q), half), bit);
+
+  // stage 3: the sums are final up to sign; the differences feed stage 4
+  r0 = _mm_add_epi32(a0, a2);
+  r3 = _mm_sub_epi32(zero, _mm_add_epi32(a1, a3));
+
+  // stage 4: cospi[32] rotation of the two differences
+  p = _mm_mullo_epi32(_mm_sub_epi32(a0, a2), c32);
+  q = _mm_mullo_epi32(_mm_sub_epi32(a1, a3), c32);
+  r1 = _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(p, q), half), bit);
+  r1 = _mm_sub_epi32(zero, r1);
+  r2 = _mm_srai_epi32(_mm_add_epi32(_mm_sub_epi32(p, q), half), bit);
+
+  // Transpose 4x4 32-bit
+  lo01 = _mm_unpacklo_epi32(r0, r1);
+  hi01 = _mm_unpackhi_epi32(r0, r1);
+  lo23 = _mm_unpacklo_epi32(r2, r3);
+  hi23 = _mm_unpackhi_epi32(r2, r3);
+
+  in[0] = _mm_unpacklo_epi64(lo01, lo23);
+  in[1] = _mm_unpackhi_epi64(lo01, lo23);
+  in[2] = _mm_unpacklo_epi64(hi01, hi23);
+  in[3] = _mm_unpackhi_epi64(hi01, hi23);
+}
+
+// 2-D forward transform of a 4x4 residual block.
+//   input, input_stride: 16-bit source samples and their row pitch.
+//   coeff:   receives the 16 32-bit coefficients (aligned 128-bit stores).
+//   tx_type: selects the config, the column/row kernels and the load flips.
+//   bd:      unused.
+// An unsupported tx_type trips assert(0) and leaves coeff untouched.
+void av1_fwd_txfm2d_4x4_sse4_1(const int16_t *input, int32_t *coeff,
+                               int input_stride, int tx_type, int bd) {
+  __m128i in[4];
+  const TXFM_2D_CFG *cfg = NULL;
+  int col_is_adst = 0;  // first pass kernel: 0 = fdct, 1 = fadst
+  int row_is_adst = 0;  // second pass kernel: 0 = fdct, 1 = fadst
+  int flip_ud = 0;
+  int flip_lr = 0;
+
+  (void)bd;
+
+  // The switch only picks parameters; the work is done once below.
+  switch (tx_type) {
+    case DCT_DCT: cfg = &fwd_txfm_2d_cfg_dct_dct_4; break;
+    case ADST_DCT:
+      cfg = &fwd_txfm_2d_cfg_adst_dct_4;
+      col_is_adst = 1;
+      break;
+    case DCT_ADST:
+      cfg = &fwd_txfm_2d_cfg_dct_adst_4;
+      row_is_adst = 1;
+      break;
+    case ADST_ADST:
+      cfg = &fwd_txfm_2d_cfg_adst_adst_4;
+      col_is_adst = 1;
+      row_is_adst = 1;
+      break;
+#if CONFIG_EXT_TX
+    case FLIPADST_DCT:
+      cfg = &fwd_txfm_2d_cfg_adst_dct_4;
+      col_is_adst = 1;
+      flip_ud = 1;
+      break;
+    case DCT_FLIPADST:
+      cfg = &fwd_txfm_2d_cfg_dct_adst_4;
+      row_is_adst = 1;
+      flip_lr = 1;
+      break;
+    case FLIPADST_FLIPADST:
+      cfg = &fwd_txfm_2d_cfg_adst_adst_4;
+      col_is_adst = 1;
+      row_is_adst = 1;
+      flip_ud = 1;
+      flip_lr = 1;
+      break;
+    case ADST_FLIPADST:
+      cfg = &fwd_txfm_2d_cfg_adst_adst_4;
+      col_is_adst = 1;
+      row_is_adst = 1;
+      flip_lr = 1;
+      break;
+    case FLIPADST_ADST:
+      cfg = &fwd_txfm_2d_cfg_adst_adst_4;
+      col_is_adst = 1;
+      row_is_adst = 1;
+      flip_ud = 1;
+      break;
+#endif
+    default: assert(0); return;
+  }
+
+  load_buffer_4x4(input, in, input_stride, flip_ud, flip_lr, cfg->shift[0]);
+
+  // Each 4x4 kernel transposes its own output, so the second call works on
+  // the other dimension.
+  if (col_is_adst) {
+    fadst4x4_sse4_1(in, cfg->cos_bit_col[2]);
+  } else {
+    fdct4x4_sse4_1(in, cfg->cos_bit_col[2]);
+  }
+
+  if (row_is_adst) {
+    fadst4x4_sse4_1(in, cfg->cos_bit_row[2]);
+  } else {
+    fdct4x4_sse4_1(in, cfg->cos_bit_row[2]);
+  }
+
+  write_buffer_4x4(in, coeff);
+}
+
+// Loads an 8x8 block of 16-bit samples into sixteen registers of 32-bit lanes.
+//   input, stride: first sample and the distance (in samples) between rows;
+//                  rows are read with aligned 128-bit loads.
+//   flipud: when non-zero, rows are fetched bottom-to-top.
+//   fliplr: when non-zero, each row is passed through mm_reverse_epi16().
+//   shift:  left shift applied to every sign-extended sample.
+// Output row r lands in in[2 * r] (samples 0-3) and in[2 * r + 1] (samples
+// 4-7).
+static INLINE void load_buffer_8x8(const int16_t *input, __m128i *in,
+                                   int stride, int flipud, int fliplr,
+                                   int shift) {
+  __m128i rows[8];
+  int r;
+
+  // Fetch (and optionally mirror) the eight 16-bit rows.
+  for (r = 0; r < 8; ++r) {
+    const int src_row = flipud ? 7 - r : r;
+    rows[r] = _mm_load_si128((const __m128i *)(input + src_row * stride));
+    if (fliplr) rows[r] = mm_reverse_epi16(rows[r]);
+  }
+
+  // Widen each row to two registers of four 32-bit lanes and scale.
+  for (r = 0; r < 8; ++r) {
+    const __m128i upper = _mm_unpackhi_epi64(rows[r], rows[r]);
+    in[2 * r] = _mm_slli_epi32(_mm_cvtepi16_epi32(rows[r]), shift);
+    in[2 * r + 1] = _mm_slli_epi32(_mm_cvtepi16_epi32(upper), shift);
+  }
+}
+
+// Rounds all 16 registers of an 8x8 block in place: every 32-bit lane becomes
+// (lane + (1 << (shift - 1))) >> shift, using an arithmetic shift.
+// shift must be >= 1 because the rounding constant is 1 << (shift - 1).
+static INLINE void col_txfm_8x8_rounding(__m128i *in, int shift) {
+  const __m128i half = _mm_set1_epi32(1 << (shift - 1));
+  int k;
+
+  for (k = 0; k < 16; ++k) {
+    in[k] = _mm_srai_epi32(_mm_add_epi32(in[k], half), shift);
+  }
+}
+
+// Stores the sixteen registers of an 8x8 block to output, four coefficients
+// per register in register order, using aligned 128-bit stores.
+static INLINE void write_buffer_8x8(const __m128i *res, tran_low_t *output) {
+  int k;
+
+  for (k = 0; k < 16; ++k) {
+    _mm_store_si128((__m128i *)(output + k * 4), res[k]);
+  }
+}
+
+static void fdct8x8_sse4_1(__m128i *in, __m128i *out, int bit) { /*
+ * 8-input butterfly network (named fdct; presumably the forward DCT) applied
+ * to an 8x8 block stored as sixteen registers of four 32-bit lanes.
+ *
+ * The same network is run twice: first on the even-indexed registers
+ * in[0], in[2], ..., in[14], then on the odd-indexed registers
+ * in[1], in[3], ..., in[15]. in[] is only read; all sixteen entries of out[]
+ * are written.
+ *
+ *   bit: selects the cosine table row, cospi_arr[bit - cos_bit_min], and is
+ *        the arithmetic right shift applied after every multiply-accumulate,
+ *        with 1 << (bit - 1) added first for rounding.
+ *
+ * The trailing buf0[k] comments record which butterfly output each out[]
+ * entry receives.
+ */
+ const int32_t *cospi = cospi_arr[bit - cos_bit_min];
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+ const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+ const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+ const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ __m128i u[8], v[8];
+
+ // Even 8 points 0, 2, ..., 14: registers in[0], in[2], ..., in[14]
+ // stage 0
+ // stage 1: mirrored sums and differences, in[k] +/- in[14 - k]
+ u[0] = _mm_add_epi32(in[0], in[14]);
+ v[7] = _mm_sub_epi32(in[0], in[14]); // v[7]
+ u[1] = _mm_add_epi32(in[2], in[12]);
+ u[6] = _mm_sub_epi32(in[2], in[12]);
+ u[2] = _mm_add_epi32(in[4], in[10]);
+ u[5] = _mm_sub_epi32(in[4], in[10]);
+ u[3] = _mm_add_epi32(in[6], in[8]);
+ v[4] = _mm_sub_epi32(in[6], in[8]); // v[4]
+
+ // stage 2: butterflies on the sums, +/-cospi[32] rotation of u[5], u[6]
+ v[0] = _mm_add_epi32(u[0], u[3]);
+ v[3] = _mm_sub_epi32(u[0], u[3]);
+ v[1] = _mm_add_epi32(u[1], u[2]);
+ v[2] = _mm_sub_epi32(u[1], u[2]);
+
+ v[5] = _mm_mullo_epi32(u[5], cospim32);
+ v[6] = _mm_mullo_epi32(u[6], cospi32);
+ v[5] = _mm_add_epi32(v[5], v[6]);
+ v[5] = _mm_add_epi32(v[5], rnding);
+ v[5] = _mm_srai_epi32(v[5], bit);
+
+ u[0] = _mm_mullo_epi32(u[5], cospi32); // u[0] is scratch: v[0], v[3] already used it
+ v[6] = _mm_mullo_epi32(u[6], cospim32);
+ v[6] = _mm_sub_epi32(u[0], v[6]); // subtracting the negated product adds it
+ v[6] = _mm_add_epi32(v[6], rnding);
+ v[6] = _mm_srai_epi32(v[6], bit);
+
+ // stage 3
+ // type 0
+ v[0] = _mm_mullo_epi32(v[0], cospi32);
+ v[1] = _mm_mullo_epi32(v[1], cospi32);
+ u[0] = _mm_add_epi32(v[0], v[1]);
+ u[0] = _mm_add_epi32(u[0], rnding);
+ u[0] = _mm_srai_epi32(u[0], bit);
+
+ u[1] = _mm_sub_epi32(v[0], v[1]);
+ u[1] = _mm_add_epi32(u[1], rnding);
+ u[1] = _mm_srai_epi32(u[1], bit);
+
+ // type 1
+ v[0] = _mm_mullo_epi32(v[2], cospi48);
+ v[1] = _mm_mullo_epi32(v[3], cospi16);
+ u[2] = _mm_add_epi32(v[0], v[1]);
+ u[2] = _mm_add_epi32(u[2], rnding);
+ u[2] = _mm_srai_epi32(u[2], bit);
+
+ v[0] = _mm_mullo_epi32(v[2], cospi16);
+ v[1] = _mm_mullo_epi32(v[3], cospi48);
+ u[3] = _mm_sub_epi32(v[1], v[0]);
+ u[3] = _mm_add_epi32(u[3], rnding);
+ u[3] = _mm_srai_epi32(u[3], bit);
+
+ u[4] = _mm_add_epi32(v[4], v[5]);
+ u[5] = _mm_sub_epi32(v[4], v[5]);
+ u[6] = _mm_sub_epi32(v[7], v[6]);
+ u[7] = _mm_add_epi32(v[7], v[6]);
+
+ // stage 4
+ // stage 5: rounded rotations are written straight to their out[] slots
+ v[0] = _mm_mullo_epi32(u[4], cospi56);
+ v[1] = _mm_mullo_epi32(u[7], cospi8);
+ v[0] = _mm_add_epi32(v[0], v[1]);
+ v[0] = _mm_add_epi32(v[0], rnding);
+ out[2] = _mm_srai_epi32(v[0], bit); // buf0[4]
+
+ v[0] = _mm_mullo_epi32(u[4], cospi8);
+ v[1] = _mm_mullo_epi32(u[7], cospi56);
+ v[0] = _mm_sub_epi32(v[1], v[0]);
+ v[0] = _mm_add_epi32(v[0], rnding);
+ out[14] = _mm_srai_epi32(v[0], bit); // buf0[7]
+
+ v[0] = _mm_mullo_epi32(u[5], cospi24);
+ v[1] = _mm_mullo_epi32(u[6], cospi40);
+ v[0] = _mm_add_epi32(v[0], v[1]);
+ v[0] = _mm_add_epi32(v[0], rnding);
+ out[10] = _mm_srai_epi32(v[0], bit); // buf0[5]
+
+ v[0] = _mm_mullo_epi32(u[5], cospi40);
+ v[1] = _mm_mullo_epi32(u[6], cospi24);
+ v[0] = _mm_sub_epi32(v[1], v[0]);
+ v[0] = _mm_add_epi32(v[0], rnding);
+ out[6] = _mm_srai_epi32(v[0], bit); // buf0[6]
+
+ out[0] = u[0]; // buf0[0]
+ out[8] = u[1]; // buf0[1]
+ out[4] = u[2]; // buf0[2]
+ out[12] = u[3]; // buf0[3]
+
+ // Odd 8 points: 1, 3, ..., 15 (same network, on in[1], in[3], ..., in[15])
+ // stage 0
+ // stage 1: mirrored sums and differences, in[k] +/- in[16 - k]
+ u[0] = _mm_add_epi32(in[1], in[15]);
+ v[7] = _mm_sub_epi32(in[1], in[15]); // v[7]
+ u[1] = _mm_add_epi32(in[3], in[13]);
+ u[6] = _mm_sub_epi32(in[3], in[13]);
+ u[2] = _mm_add_epi32(in[5], in[11]);
+ u[5] = _mm_sub_epi32(in[5], in[11]);
+ u[3] = _mm_add_epi32(in[7], in[9]);
+ v[4] = _mm_sub_epi32(in[7], in[9]); // v[4]
+
+ // stage 2
+ v[0] = _mm_add_epi32(u[0], u[3]);
+ v[3] = _mm_sub_epi32(u[0], u[3]);
+ v[1] = _mm_add_epi32(u[1], u[2]);
+ v[2] = _mm_sub_epi32(u[1], u[2]);
+
+ v[5] = _mm_mullo_epi32(u[5], cospim32);
+ v[6] = _mm_mullo_epi32(u[6], cospi32);
+ v[5] = _mm_add_epi32(v[5], v[6]);
+ v[5] = _mm_add_epi32(v[5], rnding);
+ v[5] = _mm_srai_epi32(v[5], bit);
+
+ u[0] = _mm_mullo_epi32(u[5], cospi32); // u[0] reused as scratch, as above
+ v[6] = _mm_mullo_epi32(u[6], cospim32);
+ v[6] = _mm_sub_epi32(u[0], v[6]);
+ v[6] = _mm_add_epi32(v[6], rnding);
+ v[6] = _mm_srai_epi32(v[6], bit);
+
+ // stage 3
+ // type 0
+ v[0] = _mm_mullo_epi32(v[0], cospi32);
+ v[1] = _mm_mullo_epi32(v[1], cospi32);
+ u[0] = _mm_add_epi32(v[0], v[1]);
+ u[0] = _mm_add_epi32(u[0], rnding);
+ u[0] = _mm_srai_epi32(u[0], bit);
+
+ u[1] = _mm_sub_epi32(v[0], v[1]);
+ u[1] = _mm_add_epi32(u[1], rnding);
+ u[1] = _mm_srai_epi32(u[1], bit);
+
+ // type 1
+ v[0] = _mm_mullo_epi32(v[2], cospi48);
+ v[1] = _mm_mullo_epi32(v[3], cospi16);
+ u[2] = _mm_add_epi32(v[0], v[1]);
+ u[2] = _mm_add_epi32(u[2], rnding);
+ u[2] = _mm_srai_epi32(u[2], bit);
+
+ v[0] = _mm_mullo_epi32(v[2], cospi16);
+ v[1] = _mm_mullo_epi32(v[3], cospi48);
+ u[3] = _mm_sub_epi32(v[1], v[0]);
+ u[3] = _mm_add_epi32(u[3], rnding);
+ u[3] = _mm_srai_epi32(u[3], bit);
+
+ u[4] = _mm_add_epi32(v[4], v[5]);
+ u[5] = _mm_sub_epi32(v[4], v[5]);
+ u[6] = _mm_sub_epi32(v[7], v[6]);
+ u[7] = _mm_add_epi32(v[7], v[6]);
+
+ // stage 4
+ // stage 5
+ v[0] = _mm_mullo_epi32(u[4], cospi56);
+ v[1] = _mm_mullo_epi32(u[7], cospi8);
+ v[0] = _mm_add_epi32(v[0], v[1]);
+ v[0] = _mm_add_epi32(v[0], rnding);
+ out[3] = _mm_srai_epi32(v[0], bit); // buf0[4]
+
+ v[0] = _mm_mullo_epi32(u[4], cospi8);
+ v[1] = _mm_mullo_epi32(u[7], cospi56);
+ v[0] = _mm_sub_epi32(v[1], v[0]);
+ v[0] = _mm_add_epi32(v[0], rnding);
+ out[15] = _mm_srai_epi32(v[0], bit); // buf0[7]
+
+ v[0] = _mm_mullo_epi32(u[5], cospi24);
+ v[1] = _mm_mullo_epi32(u[6], cospi40);
+ v[0] = _mm_add_epi32(v[0], v[1]);
+ v[0] = _mm_add_epi32(v[0], rnding);
+ out[11] = _mm_srai_epi32(v[0], bit); // buf0[5]
+
+ v[0] = _mm_mullo_epi32(u[5], cospi40);
+ v[1] = _mm_mullo_epi32(u[6], cospi24);
+ v[0] = _mm_sub_epi32(v[1], v[0]);
+ v[0] = _mm_add_epi32(v[0], rnding);
+ out[7] = _mm_srai_epi32(v[0], bit); // buf0[6]
+
+ out[1] = u[0]; // buf0[0]
+ out[9] = u[1]; // buf0[1]
+ out[5] = u[2]; // buf0[2]
+ out[13] = u[3]; // buf0[3]
+}
+
+static void fadst8x8_sse4_1(__m128i *in, __m128i *out, int bit) { /*
+ * 8-input butterfly network (named fadst; presumably the forward ADST)
+ * applied to an 8x8 block stored as sixteen registers of four 32-bit lanes.
+ *
+ * The same network is run twice: first on the even-indexed registers
+ * in[0], in[2], ..., in[14], then on the odd-indexed registers
+ * in[1], in[3], ..., in[15]. in[] is only read; all sixteen entries of out[]
+ * are written.
+ *
+ *   bit: selects the cosine table row, cospi_arr[bit - cos_bit_min], and is
+ *        the arithmetic right shift applied after every multiply-accumulate,
+ *        with 1 << (bit - 1) added first for rounding.
+ *
+ * x is a scratch register for the second product of each rotation.
+ */
+ const int32_t *cospi = cospi_arr[bit - cos_bit_min];
+ const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+ const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+ const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
+ const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
+ const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
+ const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
+ const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
+ const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ const __m128i kZero = _mm_setzero_si128();
+ __m128i u[8], v[8], x;
+
+ // Even 8 points: 0, 2, ..., 14 (registers in[0], in[2], ..., in[14])
+ // stage 0
+ // stage 1
+ // stage 2: four rotations, each producing a rounded (sum, difference) pair
+ // (1)
+ u[0] = _mm_mullo_epi32(in[14], cospi4);
+ x = _mm_mullo_epi32(in[0], cospi60);
+ u[0] = _mm_add_epi32(u[0], x);
+ u[0] = _mm_add_epi32(u[0], rnding);
+ u[0] = _mm_srai_epi32(u[0], bit);
+
+ u[1] = _mm_mullo_epi32(in[14], cospi60);
+ x = _mm_mullo_epi32(in[0], cospi4);
+ u[1] = _mm_sub_epi32(u[1], x);
+ u[1] = _mm_add_epi32(u[1], rnding);
+ u[1] = _mm_srai_epi32(u[1], bit);
+
+ // (2)
+ u[2] = _mm_mullo_epi32(in[10], cospi20);
+ x = _mm_mullo_epi32(in[4], cospi44);
+ u[2] = _mm_add_epi32(u[2], x);
+ u[2] = _mm_add_epi32(u[2], rnding);
+ u[2] = _mm_srai_epi32(u[2], bit);
+
+ u[3] = _mm_mullo_epi32(in[10], cospi44);
+ x = _mm_mullo_epi32(in[4], cospi20);
+ u[3] = _mm_sub_epi32(u[3], x);
+ u[3] = _mm_add_epi32(u[3], rnding);
+ u[3] = _mm_srai_epi32(u[3], bit);
+
+ // (3)
+ u[4] = _mm_mullo_epi32(in[6], cospi36);
+ x = _mm_mullo_epi32(in[8], cospi28);
+ u[4] = _mm_add_epi32(u[4], x);
+ u[4] = _mm_add_epi32(u[4], rnding);
+ u[4] = _mm_srai_epi32(u[4], bit);
+
+ u[5] = _mm_mullo_epi32(in[6], cospi28);
+ x = _mm_mullo_epi32(in[8], cospi36);
+ u[5] = _mm_sub_epi32(u[5], x);
+ u[5] = _mm_add_epi32(u[5], rnding);
+ u[5] = _mm_srai_epi32(u[5], bit);
+
+ // (4)
+ u[6] = _mm_mullo_epi32(in[2], cospi52);
+ x = _mm_mullo_epi32(in[12], cospi12);
+ u[6] = _mm_add_epi32(u[6], x);
+ u[6] = _mm_add_epi32(u[6], rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ u[7] = _mm_mullo_epi32(in[2], cospi12);
+ x = _mm_mullo_epi32(in[12], cospi52);
+ u[7] = _mm_sub_epi32(u[7], x);
+ u[7] = _mm_add_epi32(u[7], rnding);
+ u[7] = _mm_srai_epi32(u[7], bit);
+
+ // stage 3: add/subtract entries four apart, u[k] +/- u[k + 4]
+ v[0] = _mm_add_epi32(u[0], u[4]);
+ v[4] = _mm_sub_epi32(u[0], u[4]);
+ v[1] = _mm_add_epi32(u[1], u[5]);
+ v[5] = _mm_sub_epi32(u[1], u[5]);
+ v[2] = _mm_add_epi32(u[2], u[6]);
+ v[6] = _mm_sub_epi32(u[2], u[6]);
+ v[3] = _mm_add_epi32(u[3], u[7]);
+ v[7] = _mm_sub_epi32(u[3], u[7]);
+
+ // stage 4: v[0..3] pass through, v[4..7] are rotated by cospi[16]/cospi[48]
+ u[0] = v[0];
+ u[1] = v[1];
+ u[2] = v[2];
+ u[3] = v[3];
+
+ u[4] = _mm_mullo_epi32(v[4], cospi16);
+ x = _mm_mullo_epi32(v[5], cospi48);
+ u[4] = _mm_add_epi32(u[4], x);
+ u[4] = _mm_add_epi32(u[4], rnding);
+ u[4] = _mm_srai_epi32(u[4], bit);
+
+ u[5] = _mm_mullo_epi32(v[4], cospi48);
+ x = _mm_mullo_epi32(v[5], cospi16);
+ u[5] = _mm_sub_epi32(u[5], x);
+ u[5] = _mm_add_epi32(u[5], rnding);
+ u[5] = _mm_srai_epi32(u[5], bit);
+
+ u[6] = _mm_mullo_epi32(v[6], cospim48);
+ x = _mm_mullo_epi32(v[7], cospi16);
+ u[6] = _mm_add_epi32(u[6], x);
+ u[6] = _mm_add_epi32(u[6], rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ u[7] = _mm_mullo_epi32(v[6], cospi16);
+ x = _mm_mullo_epi32(v[7], cospim48);
+ u[7] = _mm_sub_epi32(u[7], x); // subtracting the negated product adds it
+ u[7] = _mm_add_epi32(u[7], rnding);
+ u[7] = _mm_srai_epi32(u[7], bit);
+
+ // stage 5: add/subtract entries two apart within each half
+ v[0] = _mm_add_epi32(u[0], u[2]);
+ v[2] = _mm_sub_epi32(u[0], u[2]);
+ v[1] = _mm_add_epi32(u[1], u[3]);
+ v[3] = _mm_sub_epi32(u[1], u[3]);
+ v[4] = _mm_add_epi32(u[4], u[6]);
+ v[6] = _mm_sub_epi32(u[4], u[6]);
+ v[5] = _mm_add_epi32(u[5], u[7]);
+ v[7] = _mm_sub_epi32(u[5], u[7]);
+
+ // stage 6: sums pass through, the differences get a cospi[32] rotation
+ u[0] = v[0];
+ u[1] = v[1];
+ u[4] = v[4];
+ u[5] = v[5];
+
+ v[0] = _mm_mullo_epi32(v[2], cospi32); // v[0] is scratch: u[0] already holds its value
+ x = _mm_mullo_epi32(v[3], cospi32);
+ u[2] = _mm_add_epi32(v[0], x);
+ u[2] = _mm_add_epi32(u[2], rnding);
+ u[2] = _mm_srai_epi32(u[2], bit);
+
+ u[3] = _mm_sub_epi32(v[0], x);
+ u[3] = _mm_add_epi32(u[3], rnding);
+ u[3] = _mm_srai_epi32(u[3], bit);
+
+ v[0] = _mm_mullo_epi32(v[6], cospi32);
+ x = _mm_mullo_epi32(v[7], cospi32);
+ u[6] = _mm_add_epi32(v[0], x);
+ u[6] = _mm_add_epi32(u[6], rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ u[7] = _mm_sub_epi32(v[0], x);
+ u[7] = _mm_add_epi32(u[7], rnding);
+ u[7] = _mm_srai_epi32(u[7], bit);
+
+ // stage 7: permute into out[]; every second output is negated (kZero - x)
+ out[0] = u[0];
+ out[2] = _mm_sub_epi32(kZero, u[4]);
+ out[4] = u[6];
+ out[6] = _mm_sub_epi32(kZero, u[2]);
+ out[8] = u[3];
+ out[10] = _mm_sub_epi32(kZero, u[7]);
+ out[12] = u[5];
+ out[14] = _mm_sub_epi32(kZero, u[1]);
+
+ // Odd 8 points: 1, 3, ..., 15 (same network, on in[1], in[3], ..., in[15])
+ // stage 0
+ // stage 1
+ // stage 2
+ // (1)
+ u[0] = _mm_mullo_epi32(in[15], cospi4);
+ x = _mm_mullo_epi32(in[1], cospi60);
+ u[0] = _mm_add_epi32(u[0], x);
+ u[0] = _mm_add_epi32(u[0], rnding);
+ u[0] = _mm_srai_epi32(u[0], bit);
+
+ u[1] = _mm_mullo_epi32(in[15], cospi60);
+ x = _mm_mullo_epi32(in[1], cospi4);
+ u[1] = _mm_sub_epi32(u[1], x);
+ u[1] = _mm_add_epi32(u[1], rnding);
+ u[1] = _mm_srai_epi32(u[1], bit);
+
+ // (2)
+ u[2] = _mm_mullo_epi32(in[11], cospi20);
+ x = _mm_mullo_epi32(in[5], cospi44);
+ u[2] = _mm_add_epi32(u[2], x);
+ u[2] = _mm_add_epi32(u[2], rnding);
+ u[2] = _mm_srai_epi32(u[2], bit);
+
+ u[3] = _mm_mullo_epi32(in[11], cospi44);
+ x = _mm_mullo_epi32(in[5], cospi20);
+ u[3] = _mm_sub_epi32(u[3], x);
+ u[3] = _mm_add_epi32(u[3], rnding);
+ u[3] = _mm_srai_epi32(u[3], bit);
+
+ // (3)
+ u[4] = _mm_mullo_epi32(in[7], cospi36);
+ x = _mm_mullo_epi32(in[9], cospi28);
+ u[4] = _mm_add_epi32(u[4], x);
+ u[4] = _mm_add_epi32(u[4], rnding);
+ u[4] = _mm_srai_epi32(u[4], bit);
+
+ u[5] = _mm_mullo_epi32(in[7], cospi28);
+ x = _mm_mullo_epi32(in[9], cospi36);
+ u[5] = _mm_sub_epi32(u[5], x);
+ u[5] = _mm_add_epi32(u[5], rnding);
+ u[5] = _mm_srai_epi32(u[5], bit);
+
+ // (4)
+ u[6] = _mm_mullo_epi32(in[3], cospi52);
+ x = _mm_mullo_epi32(in[13], cospi12);
+ u[6] = _mm_add_epi32(u[6], x);
+ u[6] = _mm_add_epi32(u[6], rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ u[7] = _mm_mullo_epi32(in[3], cospi12);
+ x = _mm_mullo_epi32(in[13], cospi52);
+ u[7] = _mm_sub_epi32(u[7], x);
+ u[7] = _mm_add_epi32(u[7], rnding);
+ u[7] = _mm_srai_epi32(u[7], bit);
+
+ // stage 3
+ v[0] = _mm_add_epi32(u[0], u[4]);
+ v[4] = _mm_sub_epi32(u[0], u[4]);
+ v[1] = _mm_add_epi32(u[1], u[5]);
+ v[5] = _mm_sub_epi32(u[1], u[5]);
+ v[2] = _mm_add_epi32(u[2], u[6]);
+ v[6] = _mm_sub_epi32(u[2], u[6]);
+ v[3] = _mm_add_epi32(u[3], u[7]);
+ v[7] = _mm_sub_epi32(u[3], u[7]);
+
+ // stage 4
+ u[0] = v[0];
+ u[1] = v[1];
+ u[2] = v[2];
+ u[3] = v[3];
+
+ u[4] = _mm_mullo_epi32(v[4], cospi16);
+ x = _mm_mullo_epi32(v[5], cospi48);
+ u[4] = _mm_add_epi32(u[4], x);
+ u[4] = _mm_add_epi32(u[4], rnding);
+ u[4] = _mm_srai_epi32(u[4], bit);
+
+ u[5] = _mm_mullo_epi32(v[4], cospi48);
+ x = _mm_mullo_epi32(v[5], cospi16);
+ u[5] = _mm_sub_epi32(u[5], x);
+ u[5] = _mm_add_epi32(u[5], rnding);
+ u[5] = _mm_srai_epi32(u[5], bit);
+
+ u[6] = _mm_mullo_epi32(v[6], cospim48);
+ x = _mm_mullo_epi32(v[7], cospi16);
+ u[6] = _mm_add_epi32(u[6], x);
+ u[6] = _mm_add_epi32(u[6], rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ u[7] = _mm_mullo_epi32(v[6], cospi16);
+ x = _mm_mullo_epi32(v[7], cospim48);
+ u[7] = _mm_sub_epi32(u[7], x);
+ u[7] = _mm_add_epi32(u[7], rnding);
+ u[7] = _mm_srai_epi32(u[7], bit);
+
+ // stage 5
+ v[0] = _mm_add_epi32(u[0], u[2]);
+ v[2] = _mm_sub_epi32(u[0], u[2]);
+ v[1] = _mm_add_epi32(u[1], u[3]);
+ v[3] = _mm_sub_epi32(u[1], u[3]);
+ v[4] = _mm_add_epi32(u[4], u[6]);
+ v[6] = _mm_sub_epi32(u[4], u[6]);
+ v[5] = _mm_add_epi32(u[5], u[7]);
+ v[7] = _mm_sub_epi32(u[5], u[7]);
+
+ // stage 6
+ u[0] = v[0];
+ u[1] = v[1];
+ u[4] = v[4];
+ u[5] = v[5];
+
+ v[0] = _mm_mullo_epi32(v[2], cospi32);
+ x = _mm_mullo_epi32(v[3], cospi32);
+ u[2] = _mm_add_epi32(v[0], x);
+ u[2] = _mm_add_epi32(u[2], rnding);
+ u[2] = _mm_srai_epi32(u[2], bit);
+
+ u[3] = _mm_sub_epi32(v[0], x);
+ u[3] = _mm_add_epi32(u[3], rnding);
+ u[3] = _mm_srai_epi32(u[3], bit);
+
+ v[0] = _mm_mullo_epi32(v[6], cospi32);
+ x = _mm_mullo_epi32(v[7], cospi32);
+ u[6] = _mm_add_epi32(v[0], x);
+ u[6] = _mm_add_epi32(u[6], rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ u[7] = _mm_sub_epi32(v[0], x);
+ u[7] = _mm_add_epi32(u[7], rnding);
+ u[7] = _mm_srai_epi32(u[7], bit);
+
+ // stage 7: same permutation and sign pattern as the even half
+ out[1] = u[0];
+ out[3] = _mm_sub_epi32(kZero, u[4]);
+ out[5] = u[6];
+ out[7] = _mm_sub_epi32(kZero, u[2]);
+ out[9] = u[3];
+ out[11] = _mm_sub_epi32(kZero, u[7]);
+ out[13] = u[5];
+ out[15] = _mm_sub_epi32(kZero, u[1]);
+}
+
+// 2-D forward transform of an 8x8 residual block.
+//   input, stride: 16-bit source samples and their row pitch.
+//   coeff:   receives the 64 32-bit coefficients (aligned 128-bit stores).
+//   tx_type: selects the config, the column/row kernels and the load flips.
+//   bd:      unused.
+// An unsupported tx_type trips assert(0) and leaves coeff untouched.
+void av1_fwd_txfm2d_8x8_sse4_1(const int16_t *input, int32_t *coeff, int stride,
+                               int tx_type, int bd) {
+  __m128i in[16], out[16];
+  const TXFM_2D_CFG *cfg = NULL;
+  int col_is_adst = 0;  // first pass kernel: 0 = fdct, 1 = fadst
+  int row_is_adst = 0;  // second pass kernel: 0 = fdct, 1 = fadst
+  int flip_ud = 0;
+  int flip_lr = 0;
+
+  (void)bd;
+
+  // The switch only picks parameters; the work is done once below.
+  switch (tx_type) {
+    case DCT_DCT: cfg = &fwd_txfm_2d_cfg_dct_dct_8; break;
+    case ADST_DCT:
+      cfg = &fwd_txfm_2d_cfg_adst_dct_8;
+      col_is_adst = 1;
+      break;
+    case DCT_ADST:
+      cfg = &fwd_txfm_2d_cfg_dct_adst_8;
+      row_is_adst = 1;
+      break;
+    case ADST_ADST:
+      cfg = &fwd_txfm_2d_cfg_adst_adst_8;
+      col_is_adst = 1;
+      row_is_adst = 1;
+      break;
+#if CONFIG_EXT_TX
+    case FLIPADST_DCT:
+      cfg = &fwd_txfm_2d_cfg_adst_dct_8;
+      col_is_adst = 1;
+      flip_ud = 1;
+      break;
+    case DCT_FLIPADST:
+      cfg = &fwd_txfm_2d_cfg_dct_adst_8;
+      row_is_adst = 1;
+      flip_lr = 1;
+      break;
+    case FLIPADST_FLIPADST:
+      cfg = &fwd_txfm_2d_cfg_adst_adst_8;
+      col_is_adst = 1;
+      row_is_adst = 1;
+      flip_ud = 1;
+      flip_lr = 1;
+      break;
+    case ADST_FLIPADST:
+      cfg = &fwd_txfm_2d_cfg_adst_adst_8;
+      col_is_adst = 1;
+      row_is_adst = 1;
+      flip_lr = 1;
+      break;
+    case FLIPADST_ADST:
+      cfg = &fwd_txfm_2d_cfg_adst_adst_8;
+      col_is_adst = 1;
+      row_is_adst = 1;
+      flip_ud = 1;
+      break;
+#endif  // CONFIG_EXT_TX
+    default: assert(0); return;
+  }
+
+  load_buffer_8x8(input, in, stride, flip_ud, flip_lr, cfg->shift[0]);
+
+  // First pass, then the intermediate rounding by -shift[1].
+  if (col_is_adst) {
+    fadst8x8_sse4_1(in, out, cfg->cos_bit_col[2]);
+  } else {
+    fdct8x8_sse4_1(in, out, cfg->cos_bit_col[2]);
+  }
+  col_txfm_8x8_rounding(out, -cfg->shift[1]);
+  transpose_8x8(out, in);
+
+  // Second pass on the transposed block, then transpose back.
+  if (row_is_adst) {
+    fadst8x8_sse4_1(in, out, cfg->cos_bit_row[2]);
+  } else {
+    fdct8x8_sse4_1(in, out, cfg->cos_bit_row[2]);
+  }
+  transpose_8x8(out, in);
+
+  write_buffer_8x8(in, coeff);
+}
+
+// Hybrid Transform 16x16
+
+// Re-packs four 8x8 quadrants into one 16x16 register layout.
+// `in` is read as four consecutive groups of 16 registers; the caller in this
+// file fills them in the order top-left, top-right, bottom-left, bottom-right.
+// Each group is consumed two registers per row, and every output row is four
+// registers: the left quadrant's pair followed by the right quadrant's pair.
+static INLINE void convert_8x8_to_16x16(const __m128i *in, __m128i *out) {
+  int half;
+
+  // half 0 produces output rows 0..7, half 1 produces output rows 8..15.
+  for (half = 0; half < 2; ++half) {
+    const __m128i *left = in + 32 * half;
+    const __m128i *right = left + 16;
+    __m128i *dst = out + 32 * half;
+    int r;
+
+    for (r = 0; r < 8; ++r) {
+      dst[4 * r + 0] = left[2 * r + 0];
+      dst[4 * r + 1] = left[2 * r + 1];
+      dst[4 * r + 2] = right[2 * r + 0];
+      dst[4 * r + 3] = right[2 * r + 1];
+    }
+  }
+}
+
+// Loads a 16x16 block of int16_t samples as four 8x8 quadrants and re-packs
+// them into `out` with convert_8x8_to_16x16().
+// flipud / fliplr exchange which source quadrant feeds the top/bottom and
+// left/right halves; the same flags are also forwarded to load_buffer_8x8().
+// `shift` is passed through to load_buffer_8x8() unchanged.
+static INLINE void load_buffer_16x16(const int16_t *input, __m128i *out,
+                                     int stride, int flipud, int fliplr,
+                                     int shift) {
+  // Staging area: 16 registers per quadrant, in the order
+  // top-left, top-right, bottom-left, bottom-right.
+  __m128i quads[64];
+
+  // Source offsets of the quadrants that end up in each half of the output.
+  const int top_row_off = flipud ? 8 * stride : 0;
+  const int bot_row_off = flipud ? 0 : 8 * stride;
+  const int left_col_off = fliplr ? 8 : 0;
+  const int right_col_off = fliplr ? 0 : 8;
+
+  // Left half of the output (first 8 columns).
+  load_buffer_8x8(input + top_row_off + left_col_off, &quads[0], stride,
+                  flipud, fliplr, shift);
+  load_buffer_8x8(input + bot_row_off + left_col_off, &quads[32], stride,
+                  flipud, fliplr, shift);
+
+  // Right half of the output (second 8 columns).
+  load_buffer_8x8(input + top_row_off + right_col_off, &quads[16], stride,
+                  flipud, fliplr, shift);
+  load_buffer_8x8(input + bot_row_off + right_col_off, &quads[48], stride,
+                  flipud, fliplr, shift);
+
+  convert_8x8_to_16x16(quads, out);
+}
+/*
+ * Runs the butterfly network of the 16-point forward DCT (stages 1-6 below)
+ * on a 16x16 block held as 64 registers, indexed [row * 4 + col].
+ *
+ * in  - 64 input registers; only read here, and only through the _epi32
+ *       intrinsics, i.e. as four 32-bit lanes each.
+ * out - 64 output registers, same indexing, written in a permuted row order.
+ * bit - selects the cosine table row (cospi_arr[bit - cos_bit_min]) and is
+ *       the right shift applied after every multiply; 1 << (bit - 1) is
+ *       added first so that the shift rounds.
+ *
+ * Every multiply step has the shape (a * c0 +/- b * c1 + rnding) >> bit.
+ */
+static void fdct16x16_sse4_1(__m128i *in, __m128i *out, int bit) {
+ const int32_t *cospi = cospi_arr[bit - cos_bit_min];
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+ const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+ const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+ const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+ const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+ const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+ const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+ const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+ const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
+ const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
+ const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
+ const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
+ const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
+ const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); // added before each >> bit
+ __m128i u[16], v[16], x;
+ const int col_num = 4; // registers per row: in/out are indexed [row * col_num + col]
+ int col;
+
+ // Each pass handles one register column (col = 0..3) across all 16 rows
+ for (col = 0; col < col_num; ++col) {
+ // stage 0: the input rows are used as-is
+ // stage 1: add/subtract mirrored rows r and 15 - r
+ u[0] = _mm_add_epi32(in[0 * col_num + col], in[15 * col_num + col]);
+ u[15] = _mm_sub_epi32(in[0 * col_num + col], in[15 * col_num + col]);
+ u[1] = _mm_add_epi32(in[1 * col_num + col], in[14 * col_num + col]);
+ u[14] = _mm_sub_epi32(in[1 * col_num + col], in[14 * col_num + col]);
+ u[2] = _mm_add_epi32(in[2 * col_num + col], in[13 * col_num + col]);
+ u[13] = _mm_sub_epi32(in[2 * col_num + col], in[13 * col_num + col]);
+ u[3] = _mm_add_epi32(in[3 * col_num + col], in[12 * col_num + col]);
+ u[12] = _mm_sub_epi32(in[3 * col_num + col], in[12 * col_num + col]);
+ u[4] = _mm_add_epi32(in[4 * col_num + col], in[11 * col_num + col]);
+ u[11] = _mm_sub_epi32(in[4 * col_num + col], in[11 * col_num + col]);
+ u[5] = _mm_add_epi32(in[5 * col_num + col], in[10 * col_num + col]);
+ u[10] = _mm_sub_epi32(in[5 * col_num + col], in[10 * col_num + col]);
+ u[6] = _mm_add_epi32(in[6 * col_num + col], in[9 * col_num + col]);
+ u[9] = _mm_sub_epi32(in[6 * col_num + col], in[9 * col_num + col]);
+ u[7] = _mm_add_epi32(in[7 * col_num + col], in[8 * col_num + col]);
+ u[8] = _mm_sub_epi32(in[7 * col_num + col], in[8 * col_num + col]);
+
+ // stage 2: add/subtract within u[0..7]; u[10..13] combined with +/-cospi32; u[8], u[9], u[14], u[15] pass through
+ v[0] = _mm_add_epi32(u[0], u[7]);
+ v[7] = _mm_sub_epi32(u[0], u[7]);
+ v[1] = _mm_add_epi32(u[1], u[6]);
+ v[6] = _mm_sub_epi32(u[1], u[6]);
+ v[2] = _mm_add_epi32(u[2], u[5]);
+ v[5] = _mm_sub_epi32(u[2], u[5]);
+ v[3] = _mm_add_epi32(u[3], u[4]);
+ v[4] = _mm_sub_epi32(u[3], u[4]);
+ v[8] = u[8];
+ v[9] = u[9];
+
+ v[10] = _mm_mullo_epi32(u[10], cospim32);
+ x = _mm_mullo_epi32(u[13], cospi32);
+ v[10] = _mm_add_epi32(v[10], x);
+ v[10] = _mm_add_epi32(v[10], rnding);
+ v[10] = _mm_srai_epi32(v[10], bit);
+
+ v[13] = _mm_mullo_epi32(u[10], cospi32);
+ x = _mm_mullo_epi32(u[13], cospim32);
+ v[13] = _mm_sub_epi32(v[13], x);
+ v[13] = _mm_add_epi32(v[13], rnding);
+ v[13] = _mm_srai_epi32(v[13], bit);
+
+ v[11] = _mm_mullo_epi32(u[11], cospim32);
+ x = _mm_mullo_epi32(u[12], cospi32);
+ v[11] = _mm_add_epi32(v[11], x);
+ v[11] = _mm_add_epi32(v[11], rnding);
+ v[11] = _mm_srai_epi32(v[11], bit);
+
+ v[12] = _mm_mullo_epi32(u[11], cospi32);
+ x = _mm_mullo_epi32(u[12], cospim32);
+ v[12] = _mm_sub_epi32(v[12], x);
+ v[12] = _mm_add_epi32(v[12], rnding);
+ v[12] = _mm_srai_epi32(v[12], bit);
+ v[14] = u[14];
+ v[15] = u[15];
+
+ // stage 3: add/subtract within v[0..3] and within v[8..15]; v[5], v[6] combined with +/-cospi32; v[4], v[7] pass through
+ u[0] = _mm_add_epi32(v[0], v[3]);
+ u[3] = _mm_sub_epi32(v[0], v[3]);
+ u[1] = _mm_add_epi32(v[1], v[2]);
+ u[2] = _mm_sub_epi32(v[1], v[2]);
+ u[4] = v[4];
+
+ u[5] = _mm_mullo_epi32(v[5], cospim32);
+ x = _mm_mullo_epi32(v[6], cospi32);
+ u[5] = _mm_add_epi32(u[5], x);
+ u[5] = _mm_add_epi32(u[5], rnding);
+ u[5] = _mm_srai_epi32(u[5], bit);
+
+ u[6] = _mm_mullo_epi32(v[5], cospi32);
+ x = _mm_mullo_epi32(v[6], cospim32);
+ u[6] = _mm_sub_epi32(u[6], x);
+ u[6] = _mm_add_epi32(u[6], rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ u[7] = v[7];
+ u[8] = _mm_add_epi32(v[8], v[11]);
+ u[11] = _mm_sub_epi32(v[8], v[11]);
+ u[9] = _mm_add_epi32(v[9], v[10]);
+ u[10] = _mm_sub_epi32(v[9], v[10]);
+ u[12] = _mm_sub_epi32(v[15], v[12]);
+ u[15] = _mm_add_epi32(v[15], v[12]);
+ u[13] = _mm_sub_epi32(v[14], v[13]);
+ u[14] = _mm_add_epi32(v[14], v[13]);
+
+ // stage 4: v[0], v[1] from (u[0] +/- u[1]) * cospi32; cospi16/cospi48 for v[2], v[3], v[9], v[10], v[13], v[14]; add/subtract within u[4..7]
+ u[0] = _mm_mullo_epi32(u[0], cospi32);
+ u[1] = _mm_mullo_epi32(u[1], cospi32);
+ v[0] = _mm_add_epi32(u[0], u[1]);
+ v[0] = _mm_add_epi32(v[0], rnding);
+ v[0] = _mm_srai_epi32(v[0], bit);
+
+ v[1] = _mm_sub_epi32(u[0], u[1]);
+ v[1] = _mm_add_epi32(v[1], rnding);
+ v[1] = _mm_srai_epi32(v[1], bit);
+
+ v[2] = _mm_mullo_epi32(u[2], cospi48);
+ x = _mm_mullo_epi32(u[3], cospi16);
+ v[2] = _mm_add_epi32(v[2], x);
+ v[2] = _mm_add_epi32(v[2], rnding);
+ v[2] = _mm_srai_epi32(v[2], bit);
+
+ v[3] = _mm_mullo_epi32(u[2], cospi16);
+ x = _mm_mullo_epi32(u[3], cospi48);
+ v[3] = _mm_sub_epi32(x, v[3]);
+ v[3] = _mm_add_epi32(v[3], rnding);
+ v[3] = _mm_srai_epi32(v[3], bit);
+
+ v[4] = _mm_add_epi32(u[4], u[5]);
+ v[5] = _mm_sub_epi32(u[4], u[5]);
+ v[6] = _mm_sub_epi32(u[7], u[6]);
+ v[7] = _mm_add_epi32(u[7], u[6]);
+ v[8] = u[8];
+
+ v[9] = _mm_mullo_epi32(u[9], cospim16);
+ x = _mm_mullo_epi32(u[14], cospi48);
+ v[9] = _mm_add_epi32(v[9], x);
+ v[9] = _mm_add_epi32(v[9], rnding);
+ v[9] = _mm_srai_epi32(v[9], bit);
+
+ v[14] = _mm_mullo_epi32(u[9], cospi48);
+ x = _mm_mullo_epi32(u[14], cospim16);
+ v[14] = _mm_sub_epi32(v[14], x);
+ v[14] = _mm_add_epi32(v[14], rnding);
+ v[14] = _mm_srai_epi32(v[14], bit);
+
+ v[10] = _mm_mullo_epi32(u[10], cospim48);
+ x = _mm_mullo_epi32(u[13], cospim16);
+ v[10] = _mm_add_epi32(v[10], x);
+ v[10] = _mm_add_epi32(v[10], rnding);
+ v[10] = _mm_srai_epi32(v[10], bit);
+
+ v[13] = _mm_mullo_epi32(u[10], cospim16);
+ x = _mm_mullo_epi32(u[13], cospim48);
+ v[13] = _mm_sub_epi32(v[13], x);
+ v[13] = _mm_add_epi32(v[13], rnding);
+ v[13] = _mm_srai_epi32(v[13], bit);
+
+ v[11] = u[11];
+ v[12] = u[12];
+ v[15] = u[15];
+
+ // stage 5: u[4..7] use cospi8/cospi56 and cospi24/cospi40; add/subtract within v[8..15]; v[0..3] pass through
+ u[0] = v[0];
+ u[1] = v[1];
+ u[2] = v[2];
+ u[3] = v[3];
+
+ u[4] = _mm_mullo_epi32(v[4], cospi56);
+ x = _mm_mullo_epi32(v[7], cospi8);
+ u[4] = _mm_add_epi32(u[4], x);
+ u[4] = _mm_add_epi32(u[4], rnding);
+ u[4] = _mm_srai_epi32(u[4], bit);
+
+ u[7] = _mm_mullo_epi32(v[4], cospi8);
+ x = _mm_mullo_epi32(v[7], cospi56);
+ u[7] = _mm_sub_epi32(x, u[7]);
+ u[7] = _mm_add_epi32(u[7], rnding);
+ u[7] = _mm_srai_epi32(u[7], bit);
+
+ u[5] = _mm_mullo_epi32(v[5], cospi24);
+ x = _mm_mullo_epi32(v[6], cospi40);
+ u[5] = _mm_add_epi32(u[5], x);
+ u[5] = _mm_add_epi32(u[5], rnding);
+ u[5] = _mm_srai_epi32(u[5], bit);
+
+ u[6] = _mm_mullo_epi32(v[5], cospi40);
+ x = _mm_mullo_epi32(v[6], cospi24);
+ u[6] = _mm_sub_epi32(x, u[6]);
+ u[6] = _mm_add_epi32(u[6], rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ u[8] = _mm_add_epi32(v[8], v[9]);
+ u[9] = _mm_sub_epi32(v[8], v[9]);
+ u[10] = _mm_sub_epi32(v[11], v[10]);
+ u[11] = _mm_add_epi32(v[11], v[10]);
+ u[12] = _mm_add_epi32(v[12], v[13]);
+ u[13] = _mm_sub_epi32(v[12], v[13]);
+ u[14] = _mm_sub_epi32(v[15], v[14]);
+ u[15] = _mm_add_epi32(v[15], v[14]);
+
+ // stage 6: v[8..15] use cospi4/60, cospi28/36, cospi20/44 and cospi12/52; u[0..7] pass through
+ v[0] = u[0];
+ v[1] = u[1];
+ v[2] = u[2];
+ v[3] = u[3];
+ v[4] = u[4];
+ v[5] = u[5];
+ v[6] = u[6];
+ v[7] = u[7];
+
+ v[8] = _mm_mullo_epi32(u[8], cospi60);
+ x = _mm_mullo_epi32(u[15], cospi4);
+ v[8] = _mm_add_epi32(v[8], x);
+ v[8] = _mm_add_epi32(v[8], rnding);
+ v[8] = _mm_srai_epi32(v[8], bit);
+
+ v[15] = _mm_mullo_epi32(u[8], cospi4);
+ x = _mm_mullo_epi32(u[15], cospi60);
+ v[15] = _mm_sub_epi32(x, v[15]);
+ v[15] = _mm_add_epi32(v[15], rnding);
+ v[15] = _mm_srai_epi32(v[15], bit);
+
+ v[9] = _mm_mullo_epi32(u[9], cospi28);
+ x = _mm_mullo_epi32(u[14], cospi36);
+ v[9] = _mm_add_epi32(v[9], x);
+ v[9] = _mm_add_epi32(v[9], rnding);
+ v[9] = _mm_srai_epi32(v[9], bit);
+
+ v[14] = _mm_mullo_epi32(u[9], cospi36);
+ x = _mm_mullo_epi32(u[14], cospi28);
+ v[14] = _mm_sub_epi32(x, v[14]);
+ v[14] = _mm_add_epi32(v[14], rnding);
+ v[14] = _mm_srai_epi32(v[14], bit);
+
+ v[10] = _mm_mullo_epi32(u[10], cospi44);
+ x = _mm_mullo_epi32(u[13], cospi20);
+ v[10] = _mm_add_epi32(v[10], x);
+ v[10] = _mm_add_epi32(v[10], rnding);
+ v[10] = _mm_srai_epi32(v[10], bit);
+
+ v[13] = _mm_mullo_epi32(u[10], cospi20);
+ x = _mm_mullo_epi32(u[13], cospi44);
+ v[13] = _mm_sub_epi32(x, v[13]);
+ v[13] = _mm_add_epi32(v[13], rnding);
+ v[13] = _mm_srai_epi32(v[13], bit);
+
+ v[11] = _mm_mullo_epi32(u[11], cospi12);
+ x = _mm_mullo_epi32(u[12], cospi52);
+ v[11] = _mm_add_epi32(v[11], x);
+ v[11] = _mm_add_epi32(v[11], rnding);
+ v[11] = _mm_srai_epi32(v[11], bit);
+
+ v[12] = _mm_mullo_epi32(u[11], cospi52);
+ x = _mm_mullo_epi32(u[12], cospi12);
+ v[12] = _mm_sub_epi32(x, v[12]);
+ v[12] = _mm_add_epi32(v[12], rnding);
+ v[12] = _mm_srai_epi32(v[12], bit);
+ // store v[] into this column in permuted row order (rows 0, 1, 2, 3, ... take v[0], v[8], v[4], v[12], ...)
+ out[0 * col_num + col] = v[0];
+ out[1 * col_num + col] = v[8];
+ out[2 * col_num + col] = v[4];
+ out[3 * col_num + col] = v[12];
+ out[4 * col_num + col] = v[2];
+ out[5 * col_num + col] = v[10];
+ out[6 * col_num + col] = v[6];
+ out[7 * col_num + col] = v[14];
+ out[8 * col_num + col] = v[1];
+ out[9 * col_num + col] = v[9];
+ out[10 * col_num + col] = v[5];
+ out[11 * col_num + col] = v[13];
+ out[12 * col_num + col] = v[3];
+ out[13 * col_num + col] = v[11];
+ out[14 * col_num + col] = v[7];
+ out[15 * col_num + col] = v[15];
+ }
+}
+/*
+ * Runs the butterfly network of the 16-point forward ADST (stages 2-9 below)
+ * on a 16x16 block held as 64 registers, indexed [row * 4 + col].
+ *
+ * in  - 64 input registers; only read here, and only through the _epi32
+ *       intrinsics, i.e. as four 32-bit lanes each.
+ * out - 64 output registers, same indexing, written in a permuted row order
+ *       with every odd output row negated.
+ * bit - selects the cosine table row (cospi_arr[bit - cos_bit_min]) and is
+ *       the right shift applied after every multiply; 1 << (bit - 1) is
+ *       added first so that the shift rounds.
+ *
+ * Every multiply step has the shape (a * c0 +/- b * c1 + rnding) >> bit.
+ */
+static void fadst16x16_sse4_1(__m128i *in, __m128i *out, int bit) {
+ const int32_t *cospi = cospi_arr[bit - cos_bit_min];
+ const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
+ const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
+ const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
+ const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
+ const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
+ const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
+ const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
+ const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
+ const __m128i cospi34 = _mm_set1_epi32(cospi[34]);
+ const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
+ const __m128i cospi42 = _mm_set1_epi32(cospi[42]);
+ const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
+ const __m128i cospi50 = _mm_set1_epi32(cospi[50]);
+ const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
+ const __m128i cospi58 = _mm_set1_epi32(cospi[58]);
+ const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
+ const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+ const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+ const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+ const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+ const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
+ const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); // added before each >> bit
+ __m128i u[16], v[16], x, y;
+ const int col_num = 4; // registers per row: in/out are indexed [row * col_num + col]
+ int col;
+
+ // Each pass handles one register column (col = 0..3) across all 16 rows
+ for (col = 0; col < col_num; ++col) {
+ // stage 0: the input rows are used as-is
+ // stage 1: no separate reordering step; stage 2 reads the input rows directly
+ // stage 2: row pairs (15,0), (13,2), (11,4), (9,6), (7,8), (5,10), (3,12), (1,14), each with its own cospi pair
+ v[0] = _mm_mullo_epi32(in[15 * col_num + col], cospi2);
+ x = _mm_mullo_epi32(in[0 * col_num + col], cospi62);
+ v[0] = _mm_add_epi32(v[0], x);
+ v[0] = _mm_add_epi32(v[0], rnding);
+ v[0] = _mm_srai_epi32(v[0], bit);
+
+ v[1] = _mm_mullo_epi32(in[15 * col_num + col], cospi62);
+ x = _mm_mullo_epi32(in[0 * col_num + col], cospi2);
+ v[1] = _mm_sub_epi32(v[1], x);
+ v[1] = _mm_add_epi32(v[1], rnding);
+ v[1] = _mm_srai_epi32(v[1], bit);
+
+ v[2] = _mm_mullo_epi32(in[13 * col_num + col], cospi10);
+ x = _mm_mullo_epi32(in[2 * col_num + col], cospi54);
+ v[2] = _mm_add_epi32(v[2], x);
+ v[2] = _mm_add_epi32(v[2], rnding);
+ v[2] = _mm_srai_epi32(v[2], bit);
+
+ v[3] = _mm_mullo_epi32(in[13 * col_num + col], cospi54);
+ x = _mm_mullo_epi32(in[2 * col_num + col], cospi10);
+ v[3] = _mm_sub_epi32(v[3], x);
+ v[3] = _mm_add_epi32(v[3], rnding);
+ v[3] = _mm_srai_epi32(v[3], bit);
+
+ v[4] = _mm_mullo_epi32(in[11 * col_num + col], cospi18);
+ x = _mm_mullo_epi32(in[4 * col_num + col], cospi46);
+ v[4] = _mm_add_epi32(v[4], x);
+ v[4] = _mm_add_epi32(v[4], rnding);
+ v[4] = _mm_srai_epi32(v[4], bit);
+
+ v[5] = _mm_mullo_epi32(in[11 * col_num + col], cospi46);
+ x = _mm_mullo_epi32(in[4 * col_num + col], cospi18);
+ v[5] = _mm_sub_epi32(v[5], x);
+ v[5] = _mm_add_epi32(v[5], rnding);
+ v[5] = _mm_srai_epi32(v[5], bit);
+
+ v[6] = _mm_mullo_epi32(in[9 * col_num + col], cospi26);
+ x = _mm_mullo_epi32(in[6 * col_num + col], cospi38);
+ v[6] = _mm_add_epi32(v[6], x);
+ v[6] = _mm_add_epi32(v[6], rnding);
+ v[6] = _mm_srai_epi32(v[6], bit);
+
+ v[7] = _mm_mullo_epi32(in[9 * col_num + col], cospi38);
+ x = _mm_mullo_epi32(in[6 * col_num + col], cospi26);
+ v[7] = _mm_sub_epi32(v[7], x);
+ v[7] = _mm_add_epi32(v[7], rnding);
+ v[7] = _mm_srai_epi32(v[7], bit);
+
+ v[8] = _mm_mullo_epi32(in[7 * col_num + col], cospi34);
+ x = _mm_mullo_epi32(in[8 * col_num + col], cospi30);
+ v[8] = _mm_add_epi32(v[8], x);
+ v[8] = _mm_add_epi32(v[8], rnding);
+ v[8] = _mm_srai_epi32(v[8], bit);
+
+ v[9] = _mm_mullo_epi32(in[7 * col_num + col], cospi30);
+ x = _mm_mullo_epi32(in[8 * col_num + col], cospi34);
+ v[9] = _mm_sub_epi32(v[9], x);
+ v[9] = _mm_add_epi32(v[9], rnding);
+ v[9] = _mm_srai_epi32(v[9], bit);
+
+ v[10] = _mm_mullo_epi32(in[5 * col_num + col], cospi42);
+ x = _mm_mullo_epi32(in[10 * col_num + col], cospi22);
+ v[10] = _mm_add_epi32(v[10], x);
+ v[10] = _mm_add_epi32(v[10], rnding);
+ v[10] = _mm_srai_epi32(v[10], bit);
+
+ v[11] = _mm_mullo_epi32(in[5 * col_num + col], cospi22);
+ x = _mm_mullo_epi32(in[10 * col_num + col], cospi42);
+ v[11] = _mm_sub_epi32(v[11], x);
+ v[11] = _mm_add_epi32(v[11], rnding);
+ v[11] = _mm_srai_epi32(v[11], bit);
+
+ v[12] = _mm_mullo_epi32(in[3 * col_num + col], cospi50);
+ x = _mm_mullo_epi32(in[12 * col_num + col], cospi14);
+ v[12] = _mm_add_epi32(v[12], x);
+ v[12] = _mm_add_epi32(v[12], rnding);
+ v[12] = _mm_srai_epi32(v[12], bit);
+
+ v[13] = _mm_mullo_epi32(in[3 * col_num + col], cospi14);
+ x = _mm_mullo_epi32(in[12 * col_num + col], cospi50);
+ v[13] = _mm_sub_epi32(v[13], x);
+ v[13] = _mm_add_epi32(v[13], rnding);
+ v[13] = _mm_srai_epi32(v[13], bit);
+
+ v[14] = _mm_mullo_epi32(in[1 * col_num + col], cospi58);
+ x = _mm_mullo_epi32(in[14 * col_num + col], cospi6);
+ v[14] = _mm_add_epi32(v[14], x);
+ v[14] = _mm_add_epi32(v[14], rnding);
+ v[14] = _mm_srai_epi32(v[14], bit);
+
+ v[15] = _mm_mullo_epi32(in[1 * col_num + col], cospi6);
+ x = _mm_mullo_epi32(in[14 * col_num + col], cospi58);
+ v[15] = _mm_sub_epi32(v[15], x);
+ v[15] = _mm_add_epi32(v[15], rnding);
+ v[15] = _mm_srai_epi32(v[15], bit);
+
+ // stage 3: add/subtract v[i] and v[i + 8] for i = 0..7
+ u[0] = _mm_add_epi32(v[0], v[8]);
+ u[8] = _mm_sub_epi32(v[0], v[8]);
+ u[1] = _mm_add_epi32(v[1], v[9]);
+ u[9] = _mm_sub_epi32(v[1], v[9]);
+ u[2] = _mm_add_epi32(v[2], v[10]);
+ u[10] = _mm_sub_epi32(v[2], v[10]);
+ u[3] = _mm_add_epi32(v[3], v[11]);
+ u[11] = _mm_sub_epi32(v[3], v[11]);
+ u[4] = _mm_add_epi32(v[4], v[12]);
+ u[12] = _mm_sub_epi32(v[4], v[12]);
+ u[5] = _mm_add_epi32(v[5], v[13]);
+ u[13] = _mm_sub_epi32(v[5], v[13]);
+ u[6] = _mm_add_epi32(v[6], v[14]);
+ u[14] = _mm_sub_epi32(v[6], v[14]);
+ u[7] = _mm_add_epi32(v[7], v[15]);
+ u[15] = _mm_sub_epi32(v[7], v[15]);
+
+ // stage 4: u[0..7] pass through; u[8..15] combined pairwise using cospi8/cospi56 and cospi24/cospi40
+ v[0] = u[0];
+ v[1] = u[1];
+ v[2] = u[2];
+ v[3] = u[3];
+ v[4] = u[4];
+ v[5] = u[5];
+ v[6] = u[6];
+ v[7] = u[7];
+
+ v[8] = _mm_mullo_epi32(u[8], cospi8);
+ x = _mm_mullo_epi32(u[9], cospi56);
+ v[8] = _mm_add_epi32(v[8], x);
+ v[8] = _mm_add_epi32(v[8], rnding);
+ v[8] = _mm_srai_epi32(v[8], bit);
+
+ v[9] = _mm_mullo_epi32(u[8], cospi56);
+ x = _mm_mullo_epi32(u[9], cospi8);
+ v[9] = _mm_sub_epi32(v[9], x);
+ v[9] = _mm_add_epi32(v[9], rnding);
+ v[9] = _mm_srai_epi32(v[9], bit);
+
+ v[10] = _mm_mullo_epi32(u[10], cospi40);
+ x = _mm_mullo_epi32(u[11], cospi24);
+ v[10] = _mm_add_epi32(v[10], x);
+ v[10] = _mm_add_epi32(v[10], rnding);
+ v[10] = _mm_srai_epi32(v[10], bit);
+
+ v[11] = _mm_mullo_epi32(u[10], cospi24);
+ x = _mm_mullo_epi32(u[11], cospi40);
+ v[11] = _mm_sub_epi32(v[11], x);
+ v[11] = _mm_add_epi32(v[11], rnding);
+ v[11] = _mm_srai_epi32(v[11], bit);
+
+ v[12] = _mm_mullo_epi32(u[12], cospim56);
+ x = _mm_mullo_epi32(u[13], cospi8);
+ v[12] = _mm_add_epi32(v[12], x);
+ v[12] = _mm_add_epi32(v[12], rnding);
+ v[12] = _mm_srai_epi32(v[12], bit);
+
+ v[13] = _mm_mullo_epi32(u[12], cospi8);
+ x = _mm_mullo_epi32(u[13], cospim56);
+ v[13] = _mm_sub_epi32(v[13], x);
+ v[13] = _mm_add_epi32(v[13], rnding);
+ v[13] = _mm_srai_epi32(v[13], bit);
+
+ v[14] = _mm_mullo_epi32(u[14], cospim24);
+ x = _mm_mullo_epi32(u[15], cospi40);
+ v[14] = _mm_add_epi32(v[14], x);
+ v[14] = _mm_add_epi32(v[14], rnding);
+ v[14] = _mm_srai_epi32(v[14], bit);
+
+ v[15] = _mm_mullo_epi32(u[14], cospi40);
+ x = _mm_mullo_epi32(u[15], cospim24);
+ v[15] = _mm_sub_epi32(v[15], x);
+ v[15] = _mm_add_epi32(v[15], rnding);
+ v[15] = _mm_srai_epi32(v[15], bit);
+
+ // stage 5: add/subtract v[i] and v[i + 4] within v[0..7] and within v[8..15]
+ u[0] = _mm_add_epi32(v[0], v[4]);
+ u[4] = _mm_sub_epi32(v[0], v[4]);
+ u[1] = _mm_add_epi32(v[1], v[5]);
+ u[5] = _mm_sub_epi32(v[1], v[5]);
+ u[2] = _mm_add_epi32(v[2], v[6]);
+ u[6] = _mm_sub_epi32(v[2], v[6]);
+ u[3] = _mm_add_epi32(v[3], v[7]);
+ u[7] = _mm_sub_epi32(v[3], v[7]);
+ u[8] = _mm_add_epi32(v[8], v[12]);
+ u[12] = _mm_sub_epi32(v[8], v[12]);
+ u[9] = _mm_add_epi32(v[9], v[13]);
+ u[13] = _mm_sub_epi32(v[9], v[13]);
+ u[10] = _mm_add_epi32(v[10], v[14]);
+ u[14] = _mm_sub_epi32(v[10], v[14]);
+ u[11] = _mm_add_epi32(v[11], v[15]);
+ u[15] = _mm_sub_epi32(v[11], v[15]);
+
+ // stage 6: u[4..7] and u[12..15] combined pairwise using cospi16/cospi48; u[0..3] and u[8..11] pass through
+ v[0] = u[0];
+ v[1] = u[1];
+ v[2] = u[2];
+ v[3] = u[3];
+
+ v[4] = _mm_mullo_epi32(u[4], cospi16);
+ x = _mm_mullo_epi32(u[5], cospi48);
+ v[4] = _mm_add_epi32(v[4], x);
+ v[4] = _mm_add_epi32(v[4], rnding);
+ v[4] = _mm_srai_epi32(v[4], bit);
+
+ v[5] = _mm_mullo_epi32(u[4], cospi48);
+ x = _mm_mullo_epi32(u[5], cospi16);
+ v[5] = _mm_sub_epi32(v[5], x);
+ v[5] = _mm_add_epi32(v[5], rnding);
+ v[5] = _mm_srai_epi32(v[5], bit);
+
+ v[6] = _mm_mullo_epi32(u[6], cospim48);
+ x = _mm_mullo_epi32(u[7], cospi16);
+ v[6] = _mm_add_epi32(v[6], x);
+ v[6] = _mm_add_epi32(v[6], rnding);
+ v[6] = _mm_srai_epi32(v[6], bit);
+
+ v[7] = _mm_mullo_epi32(u[6], cospi16);
+ x = _mm_mullo_epi32(u[7], cospim48);
+ v[7] = _mm_sub_epi32(v[7], x);
+ v[7] = _mm_add_epi32(v[7], rnding);
+ v[7] = _mm_srai_epi32(v[7], bit);
+
+ v[8] = u[8];
+ v[9] = u[9];
+ v[10] = u[10];
+ v[11] = u[11];
+
+ v[12] = _mm_mullo_epi32(u[12], cospi16);
+ x = _mm_mullo_epi32(u[13], cospi48);
+ v[12] = _mm_add_epi32(v[12], x);
+ v[12] = _mm_add_epi32(v[12], rnding);
+ v[12] = _mm_srai_epi32(v[12], bit);
+
+ v[13] = _mm_mullo_epi32(u[12], cospi48);
+ x = _mm_mullo_epi32(u[13], cospi16);
+ v[13] = _mm_sub_epi32(v[13], x);
+ v[13] = _mm_add_epi32(v[13], rnding);
+ v[13] = _mm_srai_epi32(v[13], bit);
+
+ v[14] = _mm_mullo_epi32(u[14], cospim48);
+ x = _mm_mullo_epi32(u[15], cospi16);
+ v[14] = _mm_add_epi32(v[14], x);
+ v[14] = _mm_add_epi32(v[14], rnding);
+ v[14] = _mm_srai_epi32(v[14], bit);
+
+ v[15] = _mm_mullo_epi32(u[14], cospi16);
+ x = _mm_mullo_epi32(u[15], cospim48);
+ v[15] = _mm_sub_epi32(v[15], x);
+ v[15] = _mm_add_epi32(v[15], rnding);
+ v[15] = _mm_srai_epi32(v[15], bit);
+
+ // stage 7: add/subtract v[i] and v[i + 2] within each group of four
+ u[0] = _mm_add_epi32(v[0], v[2]);
+ u[2] = _mm_sub_epi32(v[0], v[2]);
+ u[1] = _mm_add_epi32(v[1], v[3]);
+ u[3] = _mm_sub_epi32(v[1], v[3]);
+ u[4] = _mm_add_epi32(v[4], v[6]);
+ u[6] = _mm_sub_epi32(v[4], v[6]);
+ u[5] = _mm_add_epi32(v[5], v[7]);
+ u[7] = _mm_sub_epi32(v[5], v[7]);
+ u[8] = _mm_add_epi32(v[8], v[10]);
+ u[10] = _mm_sub_epi32(v[8], v[10]);
+ u[9] = _mm_add_epi32(v[9], v[11]);
+ u[11] = _mm_sub_epi32(v[9], v[11]);
+ u[12] = _mm_add_epi32(v[12], v[14]);
+ u[14] = _mm_sub_epi32(v[12], v[14]);
+ u[13] = _mm_add_epi32(v[13], v[15]);
+ u[15] = _mm_sub_epi32(v[13], v[15]);
+
+ // stage 8: the upper two of each group of four become (a * cospi32 +/- b * cospi32 + rnding) >> bit; the rest pass through
+ v[0] = u[0];
+ v[1] = u[1];
+
+ y = _mm_mullo_epi32(u[2], cospi32);
+ x = _mm_mullo_epi32(u[3], cospi32);
+ v[2] = _mm_add_epi32(y, x);
+ v[2] = _mm_add_epi32(v[2], rnding);
+ v[2] = _mm_srai_epi32(v[2], bit);
+
+ v[3] = _mm_sub_epi32(y, x);
+ v[3] = _mm_add_epi32(v[3], rnding);
+ v[3] = _mm_srai_epi32(v[3], bit);
+
+ v[4] = u[4];
+ v[5] = u[5];
+
+ y = _mm_mullo_epi32(u[6], cospi32);
+ x = _mm_mullo_epi32(u[7], cospi32);
+ v[6] = _mm_add_epi32(y, x);
+ v[6] = _mm_add_epi32(v[6], rnding);
+ v[6] = _mm_srai_epi32(v[6], bit);
+
+ v[7] = _mm_sub_epi32(y, x);
+ v[7] = _mm_add_epi32(v[7], rnding);
+ v[7] = _mm_srai_epi32(v[7], bit);
+
+ v[8] = u[8];
+ v[9] = u[9];
+
+ y = _mm_mullo_epi32(u[10], cospi32);
+ x = _mm_mullo_epi32(u[11], cospi32);
+ v[10] = _mm_add_epi32(y, x);
+ v[10] = _mm_add_epi32(v[10], rnding);
+ v[10] = _mm_srai_epi32(v[10], bit);
+
+ v[11] = _mm_sub_epi32(y, x);
+ v[11] = _mm_add_epi32(v[11], rnding);
+ v[11] = _mm_srai_epi32(v[11], bit);
+
+ v[12] = u[12];
+ v[13] = u[13];
+
+ y = _mm_mullo_epi32(u[14], cospi32);
+ x = _mm_mullo_epi32(u[15], cospi32);
+ v[14] = _mm_add_epi32(y, x);
+ v[14] = _mm_add_epi32(v[14], rnding);
+ v[14] = _mm_srai_epi32(v[14], bit);
+
+ v[15] = _mm_sub_epi32(y, x);
+ v[15] = _mm_add_epi32(v[15], rnding);
+ v[15] = _mm_srai_epi32(v[15], bit);
+
+ // stage 9: store v[] in permuted row order; every odd output row is negated (0 - v)
+ out[0 * col_num + col] = v[0];
+ out[1 * col_num + col] = _mm_sub_epi32(_mm_set1_epi32(0), v[8]);
+ out[2 * col_num + col] = v[12];
+ out[3 * col_num + col] = _mm_sub_epi32(_mm_set1_epi32(0), v[4]);
+ out[4 * col_num + col] = v[6];
+ out[5 * col_num + col] = _mm_sub_epi32(_mm_set1_epi32(0), v[14]);
+ out[6 * col_num + col] = v[10];
+ out[7 * col_num + col] = _mm_sub_epi32(_mm_set1_epi32(0), v[2]);
+ out[8 * col_num + col] = v[3];
+ out[9 * col_num + col] = _mm_sub_epi32(_mm_set1_epi32(0), v[11]);
+ out[10 * col_num + col] = v[15];
+ out[11 * col_num + col] = _mm_sub_epi32(_mm_set1_epi32(0), v[7]);
+ out[12 * col_num + col] = v[5];
+ out[13 * col_num + col] = _mm_sub_epi32(_mm_set1_epi32(0), v[13]);
+ out[14 * col_num + col] = v[9];
+ out[15 * col_num + col] = _mm_sub_epi32(_mm_set1_epi32(0), v[1]);
+ }
+}
+
+// Applies col_txfm_8x8_rounding() with the given shift to all 64 registers
+// of a 16x16 block.
+static void col_txfm_16x16_rounding(__m128i *in, int shift) {
+  // The block is walked as four consecutive runs of 16 registers (one 8x8
+  // helper call each) rather than as four register columns.
+  int offset;
+
+  for (offset = 0; offset < 64; offset += 16) {
+    col_txfm_8x8_rounding(in + offset, shift);
+  }
+}
+
+// Stores a 16x16 block (64 registers) to `output` through four
+// write_buffer_8x8() calls.
+static void write_buffer_16x16(const __m128i *in, tran_low_t *output) {
+  // Each call is given the next run of 16 registers, and the destination
+  // advances by 16 * 4 = 64 coefficients between calls.
+  const int regs_per_call = 16;
+  const int coeffs_per_call = 16 * 4;
+  int chunk;
+
+  for (chunk = 0; chunk < 4; ++chunk) {
+    write_buffer_8x8(in + regs_per_call * chunk,
+                     output + coeffs_per_call * chunk);
+  }
+}
+
+// 2-D forward 16x16 transform.
+//
+// input   - 16x16 block of int16_t samples, `stride` elements between rows.
+// coeff   - receives the 256 transform coefficients.
+// tx_type - selects DCT or ADST for the first (column) and second (row)
+//           pass, and for the FLIPADST variants whether the input is flipped
+//           vertically and/or horizontally while loading.
+// bd      - unused.
+//
+// An unsupported tx_type trips assert(0); when assertions are compiled out,
+// the function returns without writing `coeff`.
+void av1_fwd_txfm2d_16x16_sse4_1(const int16_t *input, int32_t *coeff,
+                                 int stride, int tx_type, int bd) {
+  __m128i buf0[64], buf1[64];
+  const TXFM_2D_CFG *cfg = NULL;
+  int col_is_adst = 0;  // first pass: 0 = fdct16x16, 1 = fadst16x16
+  int row_is_adst = 0;  // second pass: 0 = fdct16x16, 1 = fadst16x16
+  int flip_ud = 0;
+  int flip_lr = 0;
+
+  (void)bd;
+
+  // Pick the configuration for this transform type.
+  switch (tx_type) {
+    case DCT_DCT: cfg = &fwd_txfm_2d_cfg_dct_dct_16; break;
+    case ADST_DCT:
+      cfg = &fwd_txfm_2d_cfg_adst_dct_16;
+      col_is_adst = 1;
+      break;
+    case DCT_ADST:
+      cfg = &fwd_txfm_2d_cfg_dct_adst_16;
+      row_is_adst = 1;
+      break;
+    case ADST_ADST:
+      cfg = &fwd_txfm_2d_cfg_adst_adst_16;
+      col_is_adst = 1;
+      row_is_adst = 1;
+      break;
+#if CONFIG_EXT_TX
+    case FLIPADST_DCT:
+      cfg = &fwd_txfm_2d_cfg_adst_dct_16;
+      col_is_adst = 1;
+      flip_ud = 1;
+      break;
+    case DCT_FLIPADST:
+      cfg = &fwd_txfm_2d_cfg_dct_adst_16;
+      row_is_adst = 1;
+      flip_lr = 1;
+      break;
+    case FLIPADST_FLIPADST:
+      cfg = &fwd_txfm_2d_cfg_adst_adst_16;
+      col_is_adst = 1;
+      row_is_adst = 1;
+      flip_ud = 1;
+      flip_lr = 1;
+      break;
+    case ADST_FLIPADST:
+      cfg = &fwd_txfm_2d_cfg_adst_adst_16;
+      col_is_adst = 1;
+      row_is_adst = 1;
+      flip_lr = 1;
+      break;
+    case FLIPADST_ADST:
+      cfg = &fwd_txfm_2d_cfg_adst_adst_16;
+      col_is_adst = 1;
+      row_is_adst = 1;
+      flip_ud = 1;
+      break;
+#endif  // CONFIG_EXT_TX
+    default: assert(0); return;
+  }
+
+  // Shared pipeline: load, first pass, round, transpose, second pass,
+  // transpose back, store.
+  load_buffer_16x16(input, buf0, stride, flip_ud, flip_lr, cfg->shift[0]);
+
+  if (col_is_adst) {
+    fadst16x16_sse4_1(buf0, buf1, cfg->cos_bit_col[0]);
+  } else {
+    fdct16x16_sse4_1(buf0, buf1, cfg->cos_bit_col[0]);
+  }
+  col_txfm_16x16_rounding(buf1, -cfg->shift[1]);
+  transpose_16x16(buf1, buf0);
+
+  if (row_is_adst) {
+    fadst16x16_sse4_1(buf0, buf1, cfg->cos_bit_row[0]);
+  } else {
+    fdct16x16_sse4_1(buf0, buf1, cfg->cos_bit_row[0]);
+  }
+  transpose_16x16(buf1, buf0);
+
+  write_buffer_16x16(buf0, coeff);
+}
diff --git a/third_party/aom/av1/encoder/x86/hybrid_fwd_txfm_avx2.c b/third_party/aom/av1/encoder/x86/hybrid_fwd_txfm_avx2.c
new file mode 100644
index 0000000000..198e4e4c4c
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/hybrid_fwd_txfm_avx2.c
@@ -0,0 +1,1678 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h> // avx2
+
+#include "./av1_rtcd.h"
+#include "./aom_dsp_rtcd.h"
+
+#include "aom_dsp/x86/fwd_txfm_avx2.h"
+#include "aom_dsp/txfm_common.h"
+#include "aom_dsp/x86/txfm_common_avx2.h"
+
+// Returns the sum of all samples in a 16x16 block of int16_t values whose
+// rows start `stride` elements apart. Rows are accumulated lane-wise in
+// 16 bits first and only widened to 32 bits for the final horizontal sum.
+static int32_t get_16x16_sum(const int16_t *input, int stride) {
+  const __m256i zeros = _mm256_setzero_si256();
+  const int16_t *const block_end = input + (stride << 4);
+  const int16_t *row;
+  __m256i acc16 = _mm256_setzero_si256();
+  __m256i lo32, hi32, acc32;
+  __m128i quad;
+
+  // Consume four rows per iteration; each row is one 16-lane vector.
+  for (row = input; row < block_end; row += stride << 2) {
+    const __m256i a = _mm256_loadu_si256((__m256i const *)row);
+    const __m256i b = _mm256_loadu_si256((__m256i const *)(row + stride));
+    const __m256i c = _mm256_loadu_si256((__m256i const *)(row + 2 * stride));
+    const __m256i d = _mm256_loadu_si256((__m256i const *)(row + 3 * stride));
+
+    acc16 = _mm256_add_epi16(acc16, _mm256_add_epi16(a, b));
+    acc16 = _mm256_add_epi16(acc16, _mm256_add_epi16(c, d));
+  }
+
+  // Sign-extend the 16 lanes to 32 bits: place each value in the upper half
+  // of a 32-bit lane, then shift it back down arithmetically.
+  lo32 = _mm256_srai_epi32(_mm256_unpacklo_epi16(zeros, acc16), 16);
+  hi32 = _mm256_srai_epi32(_mm256_unpackhi_epi16(zeros, acc16), 16);
+  acc32 = _mm256_add_epi32(lo32, hi32);
+
+  // Fold 8 -> 4 partial sums inside each 128-bit half.
+  acc32 = _mm256_add_epi32(acc32, _mm256_srli_si256(acc32, 8));
+
+  // Fold the two 128-bit halves together, then the remaining two lanes.
+  quad = _mm_add_epi32(_mm256_extracti128_si256(acc32, 1),
+                       _mm256_castsi256_si128(acc32));
+  quad = _mm_add_epi32(quad, _mm_srli_si128(quad, 4));
+  return (int32_t)_mm_cvtsi128_si32(quad);
+}
+
+// DC-only 16x16 path: writes half of the block's sample sum to output[0].
+// No other output coefficient is written by this function.
+void aom_fdct16x16_1_avx2(const int16_t *input, tran_low_t *output,
+                          int stride) {
+  const int32_t block_sum = get_16x16_sum(input, stride);
+  *output = (tran_low_t)(block_sum >> 1);
+  // Clear the upper YMM halves before returning to the caller.
+  _mm256_zeroupper();
+}
+
+// Loads a 16x16 block of int16_t samples into in[0..15], one row per vector.
+//   flipud: when non-zero, rows are loaded bottom-to-top (in[0] gets row 15).
+//   fliplr: when non-zero, mm256_reverse_epi16() is applied to every row.
+// After the optional flips every 16-bit lane is shifted left by 2.
+static INLINE void load_buffer_16x16(const int16_t *input, int stride,
+                                     int flipud, int fliplr, __m256i *in) {
+  int row;
+
+  // Pass 1: fetch the rows, optionally in reverse vertical order.
+  for (row = 0; row < 16; ++row) {
+    const int src_row = flipud ? 15 - row : row;
+    in[row] = _mm256_loadu_si256((const __m256i *)(input + src_row * stride));
+  }
+
+  // Pass 2: optional horizontal flip of each row.
+  if (fliplr) {
+    for (row = 0; row < 16; ++row) {
+      mm256_reverse_epi16(&in[row]);
+    }
+  }
+
+  // Pass 3: scale every sample by 4.
+  for (row = 0; row < 16; ++row) {
+    in[row] = _mm256_slli_epi16(in[row], 2);
+  }
+}
+
+// Stores the sixteen row vectors in[0..15] to `output` via
+// storeu_output_avx2(), with consecutive rows 16 coefficients apart.
+static INLINE void write_buffer_16x16(const __m256i *in, tran_low_t *output) {
+  int row = 0;
+  while (row < 16) {
+    storeu_output_avx2(&in[row], output + 16 * row);
+    ++row;
+  }
+}
+
+// Rounding divide-by-4 of every 16-bit lane in in[0..15], in place.
+// Each lane x becomes (x + 1 + (x < 0 ? 1 : 0)) >> 2 using an arithmetic
+// shift, i.e. negative values receive one extra unit of bias before shifting.
+static void right_shift_16x16(__m256i *in) {
+  const __m256i bias = _mm256_set1_epi16(1);
+  int row;
+
+  for (row = 0; row < 16; ++row) {
+    // sign_mask is 0 for non-negative lanes and -1 (all ones) for negative.
+    const __m256i sign_mask = _mm256_srai_epi16(in[row], 15);
+    __m256i v = _mm256_add_epi16(in[row], bias);
+    v = _mm256_sub_epi16(v, sign_mask);  // subtracting -1 adds the extra bias
+    in[row] = _mm256_srai_epi16(v, 2);
+  }
+}
+
+// 16-point butterfly network (a forward DCT, going by the function name and
+// the cospi_* constants), computed in place on the sixteen vectors in[0..15].
+// The function first builds sums/differences of mirrored inputs
+// (in[k] and in[15 - k]), derives the even-indexed outputs from the sums and
+// the odd-indexed outputs from the differences, and writes every result back
+// into in[] at the index shown by the corresponding assignment.
+//
+// NOTE(review): butter_fly() is defined outside this block. It presumably
+// multiply-accumulates the interleaved 16-bit pairs held in x0/x1 with the
+// given constant pair, rounds, and packs the result back to 16 bits --
+// confirm against its definition.
+static void fdct16_avx2(__m256i *in) {
+ // sequence: cospi_L_H = pairs(L, H) and L first
+ const __m256i cospi_p16_m16 = pair256_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m256i cospi_p16_p16 = pair256_set_epi16(cospi_16_64, cospi_16_64);
+ const __m256i cospi_p24_p08 = pair256_set_epi16(cospi_24_64, cospi_8_64);
+ const __m256i cospi_m08_p24 = pair256_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m256i cospi_m24_m08 = pair256_set_epi16(-cospi_24_64, -cospi_8_64);
+
+ const __m256i cospi_p28_p04 = pair256_set_epi16(cospi_28_64, cospi_4_64);
+ const __m256i cospi_m04_p28 = pair256_set_epi16(-cospi_4_64, cospi_28_64);
+ const __m256i cospi_p12_p20 = pair256_set_epi16(cospi_12_64, cospi_20_64);
+ const __m256i cospi_m20_p12 = pair256_set_epi16(-cospi_20_64, cospi_12_64);
+
+ const __m256i cospi_p30_p02 = pair256_set_epi16(cospi_30_64, cospi_2_64);
+ const __m256i cospi_m02_p30 = pair256_set_epi16(-cospi_2_64, cospi_30_64);
+
+ const __m256i cospi_p14_p18 = pair256_set_epi16(cospi_14_64, cospi_18_64);
+ const __m256i cospi_m18_p14 = pair256_set_epi16(-cospi_18_64, cospi_14_64);
+
+ const __m256i cospi_p22_p10 = pair256_set_epi16(cospi_22_64, cospi_10_64);
+ const __m256i cospi_m10_p22 = pair256_set_epi16(-cospi_10_64, cospi_22_64);
+
+ const __m256i cospi_p06_p26 = pair256_set_epi16(cospi_6_64, cospi_26_64);
+ const __m256i cospi_m26_p06 = pair256_set_epi16(-cospi_26_64, cospi_6_64);
+
+ // u*, s*, v* are scratch registers that are reused with different meanings
+ // in each section below; t0..t7 hold the eight even-indexed outputs until
+ // they are stored; x0/x1 hold interleaved operand pairs for butter_fly().
+ __m256i u0, u1, u2, u3, u4, u5, u6, u7;
+ __m256i s0, s1, s2, s3, s4, s5, s6, s7;
+ __m256i t0, t1, t2, t3, t4, t5, t6, t7;
+ __m256i v0, v1, v2, v3;
+ __m256i x0, x1;
+
+ // 0, 4, 8, 12
+ // Sums of mirrored inputs: u[k] = in[k] + in[15 - k].
+ u0 = _mm256_add_epi16(in[0], in[15]);
+ u1 = _mm256_add_epi16(in[1], in[14]);
+ u2 = _mm256_add_epi16(in[2], in[13]);
+ u3 = _mm256_add_epi16(in[3], in[12]);
+ u4 = _mm256_add_epi16(in[4], in[11]);
+ u5 = _mm256_add_epi16(in[5], in[10]);
+ u6 = _mm256_add_epi16(in[6], in[9]);
+ u7 = _mm256_add_epi16(in[7], in[8]);
+
+ // Second level of mirrored sums: s[k] = u[k] + u[7 - k].
+ s0 = _mm256_add_epi16(u0, u7);
+ s1 = _mm256_add_epi16(u1, u6);
+ s2 = _mm256_add_epi16(u2, u5);
+ s3 = _mm256_add_epi16(u3, u4);
+
+ // 0, 8
+ v0 = _mm256_add_epi16(s0, s3);
+ v1 = _mm256_add_epi16(s1, s2);
+
+ x0 = _mm256_unpacklo_epi16(v0, v1);
+ x1 = _mm256_unpackhi_epi16(v0, v1);
+
+ // t0 is stored to in[0] and t1 to in[8] further down.
+ t0 = butter_fly(x0, x1, cospi_p16_p16);
+ t1 = butter_fly(x0, x1, cospi_p16_m16);
+
+ // 4, 12
+ v0 = _mm256_sub_epi16(s1, s2);
+ v1 = _mm256_sub_epi16(s0, s3);
+
+ x0 = _mm256_unpacklo_epi16(v0, v1);
+ x1 = _mm256_unpackhi_epi16(v0, v1);
+
+ // t2 is stored to in[4] and t3 to in[12] further down.
+ t2 = butter_fly(x0, x1, cospi_p24_p08);
+ t3 = butter_fly(x0, x1, cospi_m08_p24);
+
+ // 2, 6, 10, 14
+ // Differences of the first-level sums: u[3 - k] - u[4 + k].
+ s0 = _mm256_sub_epi16(u3, u4);
+ s1 = _mm256_sub_epi16(u2, u5);
+ s2 = _mm256_sub_epi16(u1, u6);
+ s3 = _mm256_sub_epi16(u0, u7);
+
+ // NOTE(review): the output[]/step[] labels in the trailing comments name
+ // arrays that do not exist in this function; presumably they refer to the
+ // matching variables of a scalar reference implementation -- confirm.
+ v0 = s0; // output[4]
+ v3 = s3; // output[7]
+
+ x0 = _mm256_unpacklo_epi16(s2, s1);
+ x1 = _mm256_unpackhi_epi16(s2, s1);
+
+ v2 = butter_fly(x0, x1, cospi_p16_p16); // output[5]
+ v1 = butter_fly(x0, x1, cospi_p16_m16); // output[6]
+
+ s0 = _mm256_add_epi16(v0, v1); // step[4]
+ s1 = _mm256_sub_epi16(v0, v1); // step[5]
+ s2 = _mm256_sub_epi16(v3, v2); // step[6]
+ s3 = _mm256_add_epi16(v3, v2); // step[7]
+
+ // 2, 14
+ x0 = _mm256_unpacklo_epi16(s0, s3);
+ x1 = _mm256_unpackhi_epi16(s0, s3);
+
+ // t4 is stored to in[2] and t5 to in[14] further down.
+ t4 = butter_fly(x0, x1, cospi_p28_p04);
+ t5 = butter_fly(x0, x1, cospi_m04_p28);
+
+ // 10, 6
+ x0 = _mm256_unpacklo_epi16(s1, s2);
+ x1 = _mm256_unpackhi_epi16(s1, s2);
+ // t6 is stored to in[10] and t7 to in[6] further down.
+ t6 = butter_fly(x0, x1, cospi_p12_p20);
+ t7 = butter_fly(x0, x1, cospi_m20_p12);
+
+ // 1, 3, 5, 7, 9, 11, 13, 15
+ // Differences of mirrored inputs: in[7 - k] - in[8 + k]. These are the last
+ // reads of the original in[] contents in this function.
+ s0 = _mm256_sub_epi16(in[7], in[8]); // step[8]
+ s1 = _mm256_sub_epi16(in[6], in[9]); // step[9]
+ u2 = _mm256_sub_epi16(in[5], in[10]);
+ u3 = _mm256_sub_epi16(in[4], in[11]);
+ u4 = _mm256_sub_epi16(in[3], in[12]);
+ u5 = _mm256_sub_epi16(in[2], in[13]);
+ s6 = _mm256_sub_epi16(in[1], in[14]); // step[14]
+ s7 = _mm256_sub_epi16(in[0], in[15]); // step[15]
+
+ // Every input has been consumed above, so the even-indexed results can now
+ // overwrite in[] without disturbing the remaining (odd) computation.
+ in[0] = t0;
+ in[8] = t1;
+ in[4] = t2;
+ in[12] = t3;
+ in[2] = t4;
+ in[14] = t5;
+ in[10] = t6;
+ in[6] = t7;
+
+ x0 = _mm256_unpacklo_epi16(u5, u2);
+ x1 = _mm256_unpackhi_epi16(u5, u2);
+
+ s2 = butter_fly(x0, x1, cospi_p16_p16); // step[13]
+ s5 = butter_fly(x0, x1, cospi_p16_m16); // step[10]
+
+ x0 = _mm256_unpacklo_epi16(u4, u3);
+ x1 = _mm256_unpackhi_epi16(u4, u3);
+
+ s3 = butter_fly(x0, x1, cospi_p16_p16); // step[12]
+ s4 = butter_fly(x0, x1, cospi_p16_m16); // step[11]
+
+ // Add/sub butterflies combining the untouched differences (s0, s1, s6, s7)
+ // with the four butter_fly() results just computed (s2..s5).
+ u0 = _mm256_add_epi16(s0, s4); // output[8]
+ u1 = _mm256_add_epi16(s1, s5);
+ u2 = _mm256_sub_epi16(s1, s5);
+ u3 = _mm256_sub_epi16(s0, s4);
+ u4 = _mm256_sub_epi16(s7, s3);
+ u5 = _mm256_sub_epi16(s6, s2);
+ u6 = _mm256_add_epi16(s6, s2);
+ u7 = _mm256_add_epi16(s7, s3);
+
+ // stage 4
+ // u0, u3, u4 and u7 pass through unchanged; the pairs (u1, u6) and (u2, u5)
+ // go through butter_fly().
+ s0 = u0;
+ s3 = u3;
+ s4 = u4;
+ s7 = u7;
+
+ x0 = _mm256_unpacklo_epi16(u1, u6);
+ x1 = _mm256_unpackhi_epi16(u1, u6);
+
+ s1 = butter_fly(x0, x1, cospi_m08_p24);
+ s6 = butter_fly(x0, x1, cospi_p24_p08);
+
+ x0 = _mm256_unpacklo_epi16(u2, u5);
+ x1 = _mm256_unpackhi_epi16(u2, u5);
+
+ s2 = butter_fly(x0, x1, cospi_m24_m08);
+ s5 = butter_fly(x0, x1, cospi_m08_p24);
+
+ // stage 5
+ u0 = _mm256_add_epi16(s0, s1);
+ u1 = _mm256_sub_epi16(s0, s1);
+ u2 = _mm256_sub_epi16(s3, s2);
+ u3 = _mm256_add_epi16(s3, s2);
+ u4 = _mm256_add_epi16(s4, s5);
+ u5 = _mm256_sub_epi16(s4, s5);
+ u6 = _mm256_sub_epi16(s7, s6);
+ u7 = _mm256_add_epi16(s7, s6);
+
+ // stage 6
+ // Each mirrored pair (u[k], u[7 - k]) produces two odd-indexed outputs,
+ // which are written straight into in[].
+ x0 = _mm256_unpacklo_epi16(u0, u7);
+ x1 = _mm256_unpackhi_epi16(u0, u7);
+ in[1] = butter_fly(x0, x1, cospi_p30_p02);
+ in[15] = butter_fly(x0, x1, cospi_m02_p30);
+
+ x0 = _mm256_unpacklo_epi16(u1, u6);
+ x1 = _mm256_unpackhi_epi16(u1, u6);
+ in[9] = butter_fly(x0, x1, cospi_p14_p18);
+ in[7] = butter_fly(x0, x1, cospi_m18_p14);
+
+ x0 = _mm256_unpacklo_epi16(u2, u5);
+ x1 = _mm256_unpackhi_epi16(u2, u5);
+ in[5] = butter_fly(x0, x1, cospi_p22_p10);
+ in[11] = butter_fly(x0, x1, cospi_m10_p22);
+
+ x0 = _mm256_unpacklo_epi16(u3, u4);
+ x1 = _mm256_unpackhi_epi16(u3, u4);
+ in[13] = butter_fly(x0, x1, cospi_p06_p26);
+ in[3] = butter_fly(x0, x1, cospi_m26_p06);
+}
+
+void fadst16_avx2(__m256i *in) {
+ const __m256i cospi_p01_p31 = pair256_set_epi16(cospi_1_64, cospi_31_64);
+ const __m256i cospi_p31_m01 = pair256_set_epi16(cospi_31_64, -cospi_1_64);
+ const __m256i cospi_p05_p27 = pair256_set_epi16(cospi_5_64, cospi_27_64);
+ const __m256i cospi_p27_m05 = pair256_set_epi16(cospi_27_64, -cospi_5_64);
+ const __m256i cospi_p09_p23 = pair256_set_epi16(cospi_9_64, cospi_23_64);
+ const __m256i cospi_p23_m09 = pair256_set_epi16(cospi_23_64, -cospi_9_64);
+ const __m256i cospi_p13_p19 = pair256_set_epi16(cospi_13_64, cospi_19_64);
+ const __m256i cospi_p19_m13 = pair256_set_epi16(cospi_19_64, -cospi_13_64);
+ const __m256i cospi_p17_p15 = pair256_set_epi16(cospi_17_64, cospi_15_64);
+ const __m256i cospi_p15_m17 = pair256_set_epi16(cospi_15_64, -cospi_17_64);
+ const __m256i cospi_p21_p11 = pair256_set_epi16(cospi_21_64, cospi_11_64);
+ const __m256i cospi_p11_m21 = pair256_set_epi16(cospi_11_64, -cospi_21_64);
+ const __m256i cospi_p25_p07 = pair256_set_epi16(cospi_25_64, cospi_7_64);
+ const __m256i cospi_p07_m25 = pair256_set_epi16(cospi_7_64, -cospi_25_64);
+ const __m256i cospi_p29_p03 = pair256_set_epi16(cospi_29_64, cospi_3_64);
+ const __m256i cospi_p03_m29 = pair256_set_epi16(cospi_3_64, -cospi_29_64);
+ const __m256i cospi_p04_p28 = pair256_set_epi16(cospi_4_64, cospi_28_64);
+ const __m256i cospi_p28_m04 = pair256_set_epi16(cospi_28_64, -cospi_4_64);
+ const __m256i cospi_p20_p12 = pair256_set_epi16(cospi_20_64, cospi_12_64);
+ const __m256i cospi_p12_m20 = pair256_set_epi16(cospi_12_64, -cospi_20_64);
+ const __m256i cospi_m28_p04 = pair256_set_epi16(-cospi_28_64, cospi_4_64);
+ const __m256i cospi_m12_p20 = pair256_set_epi16(-cospi_12_64, cospi_20_64);
+ const __m256i cospi_p08_p24 = pair256_set_epi16(cospi_8_64, cospi_24_64);
+ const __m256i cospi_p24_m08 = pair256_set_epi16(cospi_24_64, -cospi_8_64);
+ const __m256i cospi_m24_p08 = pair256_set_epi16(-cospi_24_64, cospi_8_64);
+ const __m256i cospi_m16_m16 = _mm256_set1_epi16((int16_t)-cospi_16_64);
+ const __m256i cospi_p16_p16 = _mm256_set1_epi16((int16_t)cospi_16_64);
+ const __m256i cospi_p16_m16 = pair256_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m256i cospi_m16_p16 = pair256_set_epi16(-cospi_16_64, cospi_16_64);
+ const __m256i zero = _mm256_setzero_si256();
+ const __m256i dct_rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING);
+ __m256i s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15;
+ __m256i x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
+ __m256i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15;
+ __m256i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
+ __m256i y0, y1;
+
+ // stage 1, s takes low 256 bits; x takes high 256 bits
+ y0 = _mm256_unpacklo_epi16(in[15], in[0]);
+ y1 = _mm256_unpackhi_epi16(in[15], in[0]);
+ s0 = _mm256_madd_epi16(y0, cospi_p01_p31);
+ x0 = _mm256_madd_epi16(y1, cospi_p01_p31);
+ s1 = _mm256_madd_epi16(y0, cospi_p31_m01);
+ x1 = _mm256_madd_epi16(y1, cospi_p31_m01);
+
+ y0 = _mm256_unpacklo_epi16(in[13], in[2]);
+ y1 = _mm256_unpackhi_epi16(in[13], in[2]);
+ s2 = _mm256_madd_epi16(y0, cospi_p05_p27);
+ x2 = _mm256_madd_epi16(y1, cospi_p05_p27);
+ s3 = _mm256_madd_epi16(y0, cospi_p27_m05);
+ x3 = _mm256_madd_epi16(y1, cospi_p27_m05);
+
+ y0 = _mm256_unpacklo_epi16(in[11], in[4]);
+ y1 = _mm256_unpackhi_epi16(in[11], in[4]);
+ s4 = _mm256_madd_epi16(y0, cospi_p09_p23);
+ x4 = _mm256_madd_epi16(y1, cospi_p09_p23);
+ s5 = _mm256_madd_epi16(y0, cospi_p23_m09);
+ x5 = _mm256_madd_epi16(y1, cospi_p23_m09);
+
+ y0 = _mm256_unpacklo_epi16(in[9], in[6]);
+ y1 = _mm256_unpackhi_epi16(in[9], in[6]);
+ s6 = _mm256_madd_epi16(y0, cospi_p13_p19);
+ x6 = _mm256_madd_epi16(y1, cospi_p13_p19);
+ s7 = _mm256_madd_epi16(y0, cospi_p19_m13);
+ x7 = _mm256_madd_epi16(y1, cospi_p19_m13);
+
+ y0 = _mm256_unpacklo_epi16(in[7], in[8]);
+ y1 = _mm256_unpackhi_epi16(in[7], in[8]);
+ s8 = _mm256_madd_epi16(y0, cospi_p17_p15);
+ x8 = _mm256_madd_epi16(y1, cospi_p17_p15);
+ s9 = _mm256_madd_epi16(y0, cospi_p15_m17);
+ x9 = _mm256_madd_epi16(y1, cospi_p15_m17);
+
+ y0 = _mm256_unpacklo_epi16(in[5], in[10]);
+ y1 = _mm256_unpackhi_epi16(in[5], in[10]);
+ s10 = _mm256_madd_epi16(y0, cospi_p21_p11);
+ x10 = _mm256_madd_epi16(y1, cospi_p21_p11);
+ s11 = _mm256_madd_epi16(y0, cospi_p11_m21);
+ x11 = _mm256_madd_epi16(y1, cospi_p11_m21);
+
+ y0 = _mm256_unpacklo_epi16(in[3], in[12]);
+ y1 = _mm256_unpackhi_epi16(in[3], in[12]);
+ s12 = _mm256_madd_epi16(y0, cospi_p25_p07);
+ x12 = _mm256_madd_epi16(y1, cospi_p25_p07);
+ s13 = _mm256_madd_epi16(y0, cospi_p07_m25);
+ x13 = _mm256_madd_epi16(y1, cospi_p07_m25);
+
+ y0 = _mm256_unpacklo_epi16(in[1], in[14]);
+ y1 = _mm256_unpackhi_epi16(in[1], in[14]);
+ s14 = _mm256_madd_epi16(y0, cospi_p29_p03);
+ x14 = _mm256_madd_epi16(y1, cospi_p29_p03);
+ s15 = _mm256_madd_epi16(y0, cospi_p03_m29);
+ x15 = _mm256_madd_epi16(y1, cospi_p03_m29);
+
+ // u takes low 256 bits; v takes high 256 bits
+ u0 = _mm256_add_epi32(s0, s8);
+ u1 = _mm256_add_epi32(s1, s9);
+ u2 = _mm256_add_epi32(s2, s10);
+ u3 = _mm256_add_epi32(s3, s11);
+ u4 = _mm256_add_epi32(s4, s12);
+ u5 = _mm256_add_epi32(s5, s13);
+ u6 = _mm256_add_epi32(s6, s14);
+ u7 = _mm256_add_epi32(s7, s15);
+
+ u8 = _mm256_sub_epi32(s0, s8);
+ u9 = _mm256_sub_epi32(s1, s9);
+ u10 = _mm256_sub_epi32(s2, s10);
+ u11 = _mm256_sub_epi32(s3, s11);
+ u12 = _mm256_sub_epi32(s4, s12);
+ u13 = _mm256_sub_epi32(s5, s13);
+ u14 = _mm256_sub_epi32(s6, s14);
+ u15 = _mm256_sub_epi32(s7, s15);
+
+ v0 = _mm256_add_epi32(x0, x8);
+ v1 = _mm256_add_epi32(x1, x9);
+ v2 = _mm256_add_epi32(x2, x10);
+ v3 = _mm256_add_epi32(x3, x11);
+ v4 = _mm256_add_epi32(x4, x12);
+ v5 = _mm256_add_epi32(x5, x13);
+ v6 = _mm256_add_epi32(x6, x14);
+ v7 = _mm256_add_epi32(x7, x15);
+
+ v8 = _mm256_sub_epi32(x0, x8);
+ v9 = _mm256_sub_epi32(x1, x9);
+ v10 = _mm256_sub_epi32(x2, x10);
+ v11 = _mm256_sub_epi32(x3, x11);
+ v12 = _mm256_sub_epi32(x4, x12);
+ v13 = _mm256_sub_epi32(x5, x13);
+ v14 = _mm256_sub_epi32(x6, x14);
+ v15 = _mm256_sub_epi32(x7, x15);
+
+ // low 256 bits rounding
+ u8 = _mm256_add_epi32(u8, dct_rounding);
+ u9 = _mm256_add_epi32(u9, dct_rounding);
+ u10 = _mm256_add_epi32(u10, dct_rounding);
+ u11 = _mm256_add_epi32(u11, dct_rounding);
+ u12 = _mm256_add_epi32(u12, dct_rounding);
+ u13 = _mm256_add_epi32(u13, dct_rounding);
+ u14 = _mm256_add_epi32(u14, dct_rounding);
+ u15 = _mm256_add_epi32(u15, dct_rounding);
+
+ u8 = _mm256_srai_epi32(u8, DCT_CONST_BITS);
+ u9 = _mm256_srai_epi32(u9, DCT_CONST_BITS);
+ u10 = _mm256_srai_epi32(u10, DCT_CONST_BITS);
+ u11 = _mm256_srai_epi32(u11, DCT_CONST_BITS);
+ u12 = _mm256_srai_epi32(u12, DCT_CONST_BITS);
+ u13 = _mm256_srai_epi32(u13, DCT_CONST_BITS);
+ u14 = _mm256_srai_epi32(u14, DCT_CONST_BITS);
+ u15 = _mm256_srai_epi32(u15, DCT_CONST_BITS);
+
+ // high 256 bits rounding
+ v8 = _mm256_add_epi32(v8, dct_rounding);
+ v9 = _mm256_add_epi32(v9, dct_rounding);
+ v10 = _mm256_add_epi32(v10, dct_rounding);
+ v11 = _mm256_add_epi32(v11, dct_rounding);
+ v12 = _mm256_add_epi32(v12, dct_rounding);
+ v13 = _mm256_add_epi32(v13, dct_rounding);
+ v14 = _mm256_add_epi32(v14, dct_rounding);
+ v15 = _mm256_add_epi32(v15, dct_rounding);
+
+ v8 = _mm256_srai_epi32(v8, DCT_CONST_BITS);
+ v9 = _mm256_srai_epi32(v9, DCT_CONST_BITS);
+ v10 = _mm256_srai_epi32(v10, DCT_CONST_BITS);
+ v11 = _mm256_srai_epi32(v11, DCT_CONST_BITS);
+ v12 = _mm256_srai_epi32(v12, DCT_CONST_BITS);
+ v13 = _mm256_srai_epi32(v13, DCT_CONST_BITS);
+ v14 = _mm256_srai_epi32(v14, DCT_CONST_BITS);
+ v15 = _mm256_srai_epi32(v15, DCT_CONST_BITS);
+
+ // Saturation pack 32-bit to 16-bit
+ x8 = _mm256_packs_epi32(u8, v8);
+ x9 = _mm256_packs_epi32(u9, v9);
+ x10 = _mm256_packs_epi32(u10, v10);
+ x11 = _mm256_packs_epi32(u11, v11);
+ x12 = _mm256_packs_epi32(u12, v12);
+ x13 = _mm256_packs_epi32(u13, v13);
+ x14 = _mm256_packs_epi32(u14, v14);
+ x15 = _mm256_packs_epi32(u15, v15);
+
+ // stage 2
+ y0 = _mm256_unpacklo_epi16(x8, x9);
+ y1 = _mm256_unpackhi_epi16(x8, x9);
+ s8 = _mm256_madd_epi16(y0, cospi_p04_p28);
+ x8 = _mm256_madd_epi16(y1, cospi_p04_p28);
+ s9 = _mm256_madd_epi16(y0, cospi_p28_m04);
+ x9 = _mm256_madd_epi16(y1, cospi_p28_m04);
+
+ y0 = _mm256_unpacklo_epi16(x10, x11);
+ y1 = _mm256_unpackhi_epi16(x10, x11);
+ s10 = _mm256_madd_epi16(y0, cospi_p20_p12);
+ x10 = _mm256_madd_epi16(y1, cospi_p20_p12);
+ s11 = _mm256_madd_epi16(y0, cospi_p12_m20);
+ x11 = _mm256_madd_epi16(y1, cospi_p12_m20);
+
+ y0 = _mm256_unpacklo_epi16(x12, x13);
+ y1 = _mm256_unpackhi_epi16(x12, x13);
+ s12 = _mm256_madd_epi16(y0, cospi_m28_p04);
+ x12 = _mm256_madd_epi16(y1, cospi_m28_p04);
+ s13 = _mm256_madd_epi16(y0, cospi_p04_p28);
+ x13 = _mm256_madd_epi16(y1, cospi_p04_p28);
+
+ y0 = _mm256_unpacklo_epi16(x14, x15);
+ y1 = _mm256_unpackhi_epi16(x14, x15);
+ s14 = _mm256_madd_epi16(y0, cospi_m12_p20);
+ x14 = _mm256_madd_epi16(y1, cospi_m12_p20);
+ s15 = _mm256_madd_epi16(y0, cospi_p20_p12);
+ x15 = _mm256_madd_epi16(y1, cospi_p20_p12);
+
+ x0 = _mm256_add_epi32(u0, u4);
+ s0 = _mm256_add_epi32(v0, v4);
+ x1 = _mm256_add_epi32(u1, u5);
+ s1 = _mm256_add_epi32(v1, v5);
+ x2 = _mm256_add_epi32(u2, u6);
+ s2 = _mm256_add_epi32(v2, v6);
+ x3 = _mm256_add_epi32(u3, u7);
+ s3 = _mm256_add_epi32(v3, v7);
+
+ v8 = _mm256_sub_epi32(u0, u4);
+ v9 = _mm256_sub_epi32(v0, v4);
+ v10 = _mm256_sub_epi32(u1, u5);
+ v11 = _mm256_sub_epi32(v1, v5);
+ v12 = _mm256_sub_epi32(u2, u6);
+ v13 = _mm256_sub_epi32(v2, v6);
+ v14 = _mm256_sub_epi32(u3, u7);
+ v15 = _mm256_sub_epi32(v3, v7);
+
+ v8 = _mm256_add_epi32(v8, dct_rounding);
+ v9 = _mm256_add_epi32(v9, dct_rounding);
+ v10 = _mm256_add_epi32(v10, dct_rounding);
+ v11 = _mm256_add_epi32(v11, dct_rounding);
+ v12 = _mm256_add_epi32(v12, dct_rounding);
+ v13 = _mm256_add_epi32(v13, dct_rounding);
+ v14 = _mm256_add_epi32(v14, dct_rounding);
+ v15 = _mm256_add_epi32(v15, dct_rounding);
+
+ v8 = _mm256_srai_epi32(v8, DCT_CONST_BITS);
+ v9 = _mm256_srai_epi32(v9, DCT_CONST_BITS);
+ v10 = _mm256_srai_epi32(v10, DCT_CONST_BITS);
+ v11 = _mm256_srai_epi32(v11, DCT_CONST_BITS);
+ v12 = _mm256_srai_epi32(v12, DCT_CONST_BITS);
+ v13 = _mm256_srai_epi32(v13, DCT_CONST_BITS);
+ v14 = _mm256_srai_epi32(v14, DCT_CONST_BITS);
+ v15 = _mm256_srai_epi32(v15, DCT_CONST_BITS);
+
+ x4 = _mm256_packs_epi32(v8, v9);
+ x5 = _mm256_packs_epi32(v10, v11);
+ x6 = _mm256_packs_epi32(v12, v13);
+ x7 = _mm256_packs_epi32(v14, v15);
+
+ u8 = _mm256_add_epi32(s8, s12);
+ u9 = _mm256_add_epi32(s9, s13);
+ u10 = _mm256_add_epi32(s10, s14);
+ u11 = _mm256_add_epi32(s11, s15);
+ u12 = _mm256_sub_epi32(s8, s12);
+ u13 = _mm256_sub_epi32(s9, s13);
+ u14 = _mm256_sub_epi32(s10, s14);
+ u15 = _mm256_sub_epi32(s11, s15);
+
+ v8 = _mm256_add_epi32(x8, x12);
+ v9 = _mm256_add_epi32(x9, x13);
+ v10 = _mm256_add_epi32(x10, x14);
+ v11 = _mm256_add_epi32(x11, x15);
+ v12 = _mm256_sub_epi32(x8, x12);
+ v13 = _mm256_sub_epi32(x9, x13);
+ v14 = _mm256_sub_epi32(x10, x14);
+ v15 = _mm256_sub_epi32(x11, x15);
+
+ u12 = _mm256_add_epi32(u12, dct_rounding);
+ u13 = _mm256_add_epi32(u13, dct_rounding);
+ u14 = _mm256_add_epi32(u14, dct_rounding);
+ u15 = _mm256_add_epi32(u15, dct_rounding);
+
+ u12 = _mm256_srai_epi32(u12, DCT_CONST_BITS);
+ u13 = _mm256_srai_epi32(u13, DCT_CONST_BITS);
+ u14 = _mm256_srai_epi32(u14, DCT_CONST_BITS);
+ u15 = _mm256_srai_epi32(u15, DCT_CONST_BITS);
+
+ v12 = _mm256_add_epi32(v12, dct_rounding);
+ v13 = _mm256_add_epi32(v13, dct_rounding);
+ v14 = _mm256_add_epi32(v14, dct_rounding);
+ v15 = _mm256_add_epi32(v15, dct_rounding);
+
+ v12 = _mm256_srai_epi32(v12, DCT_CONST_BITS);
+ v13 = _mm256_srai_epi32(v13, DCT_CONST_BITS);
+ v14 = _mm256_srai_epi32(v14, DCT_CONST_BITS);
+ v15 = _mm256_srai_epi32(v15, DCT_CONST_BITS);
+
+ x12 = _mm256_packs_epi32(u12, v12);
+ x13 = _mm256_packs_epi32(u13, v13);
+ x14 = _mm256_packs_epi32(u14, v14);
+ x15 = _mm256_packs_epi32(u15, v15);
+
+ // stage 3
+ y0 = _mm256_unpacklo_epi16(x4, x5);
+ y1 = _mm256_unpackhi_epi16(x4, x5);
+ s4 = _mm256_madd_epi16(y0, cospi_p08_p24);
+ x4 = _mm256_madd_epi16(y1, cospi_p08_p24);
+ s5 = _mm256_madd_epi16(y0, cospi_p24_m08);
+ x5 = _mm256_madd_epi16(y1, cospi_p24_m08);
+
+ y0 = _mm256_unpacklo_epi16(x6, x7);
+ y1 = _mm256_unpackhi_epi16(x6, x7);
+ s6 = _mm256_madd_epi16(y0, cospi_m24_p08);
+ x6 = _mm256_madd_epi16(y1, cospi_m24_p08);
+ s7 = _mm256_madd_epi16(y0, cospi_p08_p24);
+ x7 = _mm256_madd_epi16(y1, cospi_p08_p24);
+
+ y0 = _mm256_unpacklo_epi16(x12, x13);
+ y1 = _mm256_unpackhi_epi16(x12, x13);
+ s12 = _mm256_madd_epi16(y0, cospi_p08_p24);
+ x12 = _mm256_madd_epi16(y1, cospi_p08_p24);
+ s13 = _mm256_madd_epi16(y0, cospi_p24_m08);
+ x13 = _mm256_madd_epi16(y1, cospi_p24_m08);
+
+ y0 = _mm256_unpacklo_epi16(x14, x15);
+ y1 = _mm256_unpackhi_epi16(x14, x15);
+ s14 = _mm256_madd_epi16(y0, cospi_m24_p08);
+ x14 = _mm256_madd_epi16(y1, cospi_m24_p08);
+ s15 = _mm256_madd_epi16(y0, cospi_p08_p24);
+ x15 = _mm256_madd_epi16(y1, cospi_p08_p24);
+
+ u0 = _mm256_add_epi32(x0, x2);
+ v0 = _mm256_add_epi32(s0, s2);
+ u1 = _mm256_add_epi32(x1, x3);
+ v1 = _mm256_add_epi32(s1, s3);
+ u2 = _mm256_sub_epi32(x0, x2);
+ v2 = _mm256_sub_epi32(s0, s2);
+ u3 = _mm256_sub_epi32(x1, x3);
+ v3 = _mm256_sub_epi32(s1, s3);
+
+ u0 = _mm256_add_epi32(u0, dct_rounding);
+ v0 = _mm256_add_epi32(v0, dct_rounding);
+ u1 = _mm256_add_epi32(u1, dct_rounding);
+ v1 = _mm256_add_epi32(v1, dct_rounding);
+ u2 = _mm256_add_epi32(u2, dct_rounding);
+ v2 = _mm256_add_epi32(v2, dct_rounding);
+ u3 = _mm256_add_epi32(u3, dct_rounding);
+ v3 = _mm256_add_epi32(v3, dct_rounding);
+
+ u0 = _mm256_srai_epi32(u0, DCT_CONST_BITS);
+ v0 = _mm256_srai_epi32(v0, DCT_CONST_BITS);
+ u1 = _mm256_srai_epi32(u1, DCT_CONST_BITS);
+ v1 = _mm256_srai_epi32(v1, DCT_CONST_BITS);
+ u2 = _mm256_srai_epi32(u2, DCT_CONST_BITS);
+ v2 = _mm256_srai_epi32(v2, DCT_CONST_BITS);
+ u3 = _mm256_srai_epi32(u3, DCT_CONST_BITS);
+ v3 = _mm256_srai_epi32(v3, DCT_CONST_BITS);
+
+ in[0] = _mm256_packs_epi32(u0, v0);
+ x1 = _mm256_packs_epi32(u1, v1);
+ x2 = _mm256_packs_epi32(u2, v2);
+ x3 = _mm256_packs_epi32(u3, v3);
+
+ // Rounding on s4 + s6, s5 + s7, s4 - s6, s5 - s7
+ u4 = _mm256_add_epi32(s4, s6);
+ u5 = _mm256_add_epi32(s5, s7);
+ u6 = _mm256_sub_epi32(s4, s6);
+ u7 = _mm256_sub_epi32(s5, s7);
+
+ v4 = _mm256_add_epi32(x4, x6);
+ v5 = _mm256_add_epi32(x5, x7);
+ v6 = _mm256_sub_epi32(x4, x6);
+ v7 = _mm256_sub_epi32(x5, x7);
+
+ u4 = _mm256_add_epi32(u4, dct_rounding);
+ u5 = _mm256_add_epi32(u5, dct_rounding);
+ u6 = _mm256_add_epi32(u6, dct_rounding);
+ u7 = _mm256_add_epi32(u7, dct_rounding);
+
+ u4 = _mm256_srai_epi32(u4, DCT_CONST_BITS);
+ u5 = _mm256_srai_epi32(u5, DCT_CONST_BITS);
+ u6 = _mm256_srai_epi32(u6, DCT_CONST_BITS);
+ u7 = _mm256_srai_epi32(u7, DCT_CONST_BITS);
+
+ v4 = _mm256_add_epi32(v4, dct_rounding);
+ v5 = _mm256_add_epi32(v5, dct_rounding);
+ v6 = _mm256_add_epi32(v6, dct_rounding);
+ v7 = _mm256_add_epi32(v7, dct_rounding);
+
+ v4 = _mm256_srai_epi32(v4, DCT_CONST_BITS);
+ v5 = _mm256_srai_epi32(v5, DCT_CONST_BITS);
+ v6 = _mm256_srai_epi32(v6, DCT_CONST_BITS);
+ v7 = _mm256_srai_epi32(v7, DCT_CONST_BITS);
+
+ x4 = _mm256_packs_epi32(u4, v4);
+ in[12] = _mm256_packs_epi32(u5, v5);
+ x6 = _mm256_packs_epi32(u6, v6);
+ x7 = _mm256_packs_epi32(u7, v7);
+
+ u0 = _mm256_add_epi32(u8, u10);
+ v0 = _mm256_add_epi32(v8, v10);
+ u1 = _mm256_add_epi32(u9, u11);
+ v1 = _mm256_add_epi32(v9, v11);
+ u2 = _mm256_sub_epi32(u8, u10);
+ v2 = _mm256_sub_epi32(v8, v10);
+ u3 = _mm256_sub_epi32(u9, u11);
+ v3 = _mm256_sub_epi32(v9, v11);
+
+ u0 = _mm256_add_epi32(u0, dct_rounding);
+ v0 = _mm256_add_epi32(v0, dct_rounding);
+ u1 = _mm256_add_epi32(u1, dct_rounding);
+ v1 = _mm256_add_epi32(v1, dct_rounding);
+ u2 = _mm256_add_epi32(u2, dct_rounding);
+ v2 = _mm256_add_epi32(v2, dct_rounding);
+ u3 = _mm256_add_epi32(u3, dct_rounding);
+ v3 = _mm256_add_epi32(v3, dct_rounding);
+
+ u0 = _mm256_srai_epi32(u0, DCT_CONST_BITS);
+ v0 = _mm256_srai_epi32(v0, DCT_CONST_BITS);
+ u1 = _mm256_srai_epi32(u1, DCT_CONST_BITS);
+ v1 = _mm256_srai_epi32(v1, DCT_CONST_BITS);
+ u2 = _mm256_srai_epi32(u2, DCT_CONST_BITS);
+ v2 = _mm256_srai_epi32(v2, DCT_CONST_BITS);
+ u3 = _mm256_srai_epi32(u3, DCT_CONST_BITS);
+ v3 = _mm256_srai_epi32(v3, DCT_CONST_BITS);
+
+ x8 = _mm256_packs_epi32(u0, v0);
+ in[14] = _mm256_packs_epi32(u1, v1);
+ x10 = _mm256_packs_epi32(u2, v2);
+ x11 = _mm256_packs_epi32(u3, v3);
+
+ // Rounding on s12 + s14, s13 + s15, s12 - s14, s13 - s15
+ u12 = _mm256_add_epi32(s12, s14);
+ u13 = _mm256_add_epi32(s13, s15);
+ u14 = _mm256_sub_epi32(s12, s14);
+ u15 = _mm256_sub_epi32(s13, s15);
+
+ v12 = _mm256_add_epi32(x12, x14);
+ v13 = _mm256_add_epi32(x13, x15);
+ v14 = _mm256_sub_epi32(x12, x14);
+ v15 = _mm256_sub_epi32(x13, x15);
+
+ u12 = _mm256_add_epi32(u12, dct_rounding);
+ u13 = _mm256_add_epi32(u13, dct_rounding);
+ u14 = _mm256_add_epi32(u14, dct_rounding);
+ u15 = _mm256_add_epi32(u15, dct_rounding);
+
+ u12 = _mm256_srai_epi32(u12, DCT_CONST_BITS);
+ u13 = _mm256_srai_epi32(u13, DCT_CONST_BITS);
+ u14 = _mm256_srai_epi32(u14, DCT_CONST_BITS);
+ u15 = _mm256_srai_epi32(u15, DCT_CONST_BITS);
+
+ v12 = _mm256_add_epi32(v12, dct_rounding);
+ v13 = _mm256_add_epi32(v13, dct_rounding);
+ v14 = _mm256_add_epi32(v14, dct_rounding);
+ v15 = _mm256_add_epi32(v15, dct_rounding);
+
+ v12 = _mm256_srai_epi32(v12, DCT_CONST_BITS);
+ v13 = _mm256_srai_epi32(v13, DCT_CONST_BITS);
+ v14 = _mm256_srai_epi32(v14, DCT_CONST_BITS);
+ v15 = _mm256_srai_epi32(v15, DCT_CONST_BITS);
+
+ x12 = _mm256_packs_epi32(u12, v12);
+ x13 = _mm256_packs_epi32(u13, v13);
+ x14 = _mm256_packs_epi32(u14, v14);
+ x15 = _mm256_packs_epi32(u15, v15);
+ in[2] = x12;
+
+ // stage 4
+ y0 = _mm256_unpacklo_epi16(x2, x3);
+ y1 = _mm256_unpackhi_epi16(x2, x3);
+ s2 = _mm256_madd_epi16(y0, cospi_m16_m16);
+ x2 = _mm256_madd_epi16(y1, cospi_m16_m16);
+ s3 = _mm256_madd_epi16(y0, cospi_p16_m16);
+ x3 = _mm256_madd_epi16(y1, cospi_p16_m16);
+
+ y0 = _mm256_unpacklo_epi16(x6, x7);
+ y1 = _mm256_unpackhi_epi16(x6, x7);
+ s6 = _mm256_madd_epi16(y0, cospi_p16_p16);
+ x6 = _mm256_madd_epi16(y1, cospi_p16_p16);
+ s7 = _mm256_madd_epi16(y0, cospi_m16_p16);
+ x7 = _mm256_madd_epi16(y1, cospi_m16_p16);
+
+ y0 = _mm256_unpacklo_epi16(x10, x11);
+ y1 = _mm256_unpackhi_epi16(x10, x11);
+ s10 = _mm256_madd_epi16(y0, cospi_p16_p16);
+ x10 = _mm256_madd_epi16(y1, cospi_p16_p16);
+ s11 = _mm256_madd_epi16(y0, cospi_m16_p16);
+ x11 = _mm256_madd_epi16(y1, cospi_m16_p16);
+
+ y0 = _mm256_unpacklo_epi16(x14, x15);
+ y1 = _mm256_unpackhi_epi16(x14, x15);
+ s14 = _mm256_madd_epi16(y0, cospi_m16_m16);
+ x14 = _mm256_madd_epi16(y1, cospi_m16_m16);
+ s15 = _mm256_madd_epi16(y0, cospi_p16_m16);
+ x15 = _mm256_madd_epi16(y1, cospi_p16_m16);
+
+ // Rounding
+ u2 = _mm256_add_epi32(s2, dct_rounding);
+ u3 = _mm256_add_epi32(s3, dct_rounding);
+ u6 = _mm256_add_epi32(s6, dct_rounding);
+ u7 = _mm256_add_epi32(s7, dct_rounding);
+
+ u10 = _mm256_add_epi32(s10, dct_rounding);
+ u11 = _mm256_add_epi32(s11, dct_rounding);
+ u14 = _mm256_add_epi32(s14, dct_rounding);
+ u15 = _mm256_add_epi32(s15, dct_rounding);
+
+ u2 = _mm256_srai_epi32(u2, DCT_CONST_BITS);
+ u3 = _mm256_srai_epi32(u3, DCT_CONST_BITS);
+ u6 = _mm256_srai_epi32(u6, DCT_CONST_BITS);
+ u7 = _mm256_srai_epi32(u7, DCT_CONST_BITS);
+
+ u10 = _mm256_srai_epi32(u10, DCT_CONST_BITS);
+ u11 = _mm256_srai_epi32(u11, DCT_CONST_BITS);
+ u14 = _mm256_srai_epi32(u14, DCT_CONST_BITS);
+ u15 = _mm256_srai_epi32(u15, DCT_CONST_BITS);
+
+ v2 = _mm256_add_epi32(x2, dct_rounding);
+ v3 = _mm256_add_epi32(x3, dct_rounding);
+ v6 = _mm256_add_epi32(x6, dct_rounding);
+ v7 = _mm256_add_epi32(x7, dct_rounding);
+
+ v10 = _mm256_add_epi32(x10, dct_rounding);
+ v11 = _mm256_add_epi32(x11, dct_rounding);
+ v14 = _mm256_add_epi32(x14, dct_rounding);
+ v15 = _mm256_add_epi32(x15, dct_rounding);
+
+ v2 = _mm256_srai_epi32(v2, DCT_CONST_BITS);
+ v3 = _mm256_srai_epi32(v3, DCT_CONST_BITS);
+ v6 = _mm256_srai_epi32(v6, DCT_CONST_BITS);
+ v7 = _mm256_srai_epi32(v7, DCT_CONST_BITS);
+
+ v10 = _mm256_srai_epi32(v10, DCT_CONST_BITS);
+ v11 = _mm256_srai_epi32(v11, DCT_CONST_BITS);
+ v14 = _mm256_srai_epi32(v14, DCT_CONST_BITS);
+ v15 = _mm256_srai_epi32(v15, DCT_CONST_BITS);
+
+ in[7] = _mm256_packs_epi32(u2, v2);
+ in[8] = _mm256_packs_epi32(u3, v3);
+
+ in[4] = _mm256_packs_epi32(u6, v6);
+ in[11] = _mm256_packs_epi32(u7, v7);
+
+ in[6] = _mm256_packs_epi32(u10, v10);
+ in[9] = _mm256_packs_epi32(u11, v11);
+
+ in[5] = _mm256_packs_epi32(u14, v14);
+ in[10] = _mm256_packs_epi32(u15, v15);
+
+ in[1] = _mm256_sub_epi16(zero, x8);
+ in[3] = _mm256_sub_epi16(zero, x4);
+ in[13] = _mm256_sub_epi16(zero, x13);
+ in[15] = _mm256_sub_epi16(zero, x1);
+}
+
+#if CONFIG_EXT_TX
+// 16-point identity-type kernel: forwards `in` to txfm_scaling16_avx2() with
+// Sqrt2 as the scale argument.
+static void fidtx16_avx2(__m256i *in) {
+  txfm_scaling16_avx2(Sqrt2, in);
+}
+#endif  // CONFIG_EXT_TX
+
+// Forward 16x16 hybrid transform (AVX2).
+//
+// input: source block, read by load_buffer_16x16() using `stride`.
+// output: destination written by write_buffer_16x16().
+// stride: passed straight through to load_buffer_16x16().
+// tx_type: selects the two 1-D kernels and the flip flags used for the load.
+//
+// Every case runs the same pipeline: load (the 3rd/4th arguments of
+// load_buffer_16x16() are the flipud/fliplr flags) -> first 1-D kernel ->
+// transpose -> right_shift_16x16() -> second 1-D kernel. The final transpose,
+// the store and _mm256_zeroupper() after the switch are shared by all cases.
+void av1_fht16x16_avx2(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+ __m256i in[16];
+
+ switch (tx_type) {
+ case DCT_DCT:
+ load_buffer_16x16(input, stride, 0, 0, in);
+ fdct16_avx2(in);
+ mm256_transpose_16x16(in);
+ right_shift_16x16(in);
+ fdct16_avx2(in);
+ break;
+ case ADST_DCT:
+ load_buffer_16x16(input, stride, 0, 0, in);
+ fadst16_avx2(in);
+ mm256_transpose_16x16(in);
+ right_shift_16x16(in);
+ fdct16_avx2(in);
+ break;
+ case DCT_ADST:
+ load_buffer_16x16(input, stride, 0, 0, in);
+ fdct16_avx2(in);
+ mm256_transpose_16x16(in);
+ right_shift_16x16(in);
+ fadst16_avx2(in);
+ break;
+ case ADST_ADST:
+ load_buffer_16x16(input, stride, 0, 0, in);
+ fadst16_avx2(in);
+ mm256_transpose_16x16(in);
+ right_shift_16x16(in);
+ fadst16_avx2(in);
+ break;
+#if CONFIG_EXT_TX
+ // The FLIPADST variants reuse fadst16_avx2(); the flip itself is requested
+ // from load_buffer_16x16() through its two flag arguments.
+ case FLIPADST_DCT:
+ load_buffer_16x16(input, stride, 1, 0, in);
+ fadst16_avx2(in);
+ mm256_transpose_16x16(in);
+ right_shift_16x16(in);
+ fdct16_avx2(in);
+ break;
+ case DCT_FLIPADST:
+ load_buffer_16x16(input, stride, 0, 1, in);
+ fdct16_avx2(in);
+ mm256_transpose_16x16(in);
+ right_shift_16x16(in);
+ fadst16_avx2(in);
+ break;
+ case FLIPADST_FLIPADST:
+ load_buffer_16x16(input, stride, 1, 1, in);
+ fadst16_avx2(in);
+ mm256_transpose_16x16(in);
+ right_shift_16x16(in);
+ fadst16_avx2(in);
+ break;
+ case ADST_FLIPADST:
+ load_buffer_16x16(input, stride, 0, 1, in);
+ fadst16_avx2(in);
+ mm256_transpose_16x16(in);
+ right_shift_16x16(in);
+ fadst16_avx2(in);
+ break;
+ case FLIPADST_ADST:
+ load_buffer_16x16(input, stride, 1, 0, in);
+ fadst16_avx2(in);
+ mm256_transpose_16x16(in);
+ right_shift_16x16(in);
+ fadst16_avx2(in);
+ break;
+ // Identity-based types: fidtx16_avx2() takes the place of one or both
+ // 1-D kernels.
+ case IDTX:
+ load_buffer_16x16(input, stride, 0, 0, in);
+ fidtx16_avx2(in);
+ mm256_transpose_16x16(in);
+ right_shift_16x16(in);
+ fidtx16_avx2(in);
+ break;
+ case V_DCT:
+ load_buffer_16x16(input, stride, 0, 0, in);
+ fdct16_avx2(in);
+ mm256_transpose_16x16(in);
+ right_shift_16x16(in);
+ fidtx16_avx2(in);
+ break;
+ case H_DCT:
+ load_buffer_16x16(input, stride, 0, 0, in);
+ fidtx16_avx2(in);
+ mm256_transpose_16x16(in);
+ right_shift_16x16(in);
+ fdct16_avx2(in);
+ break;
+ case V_ADST:
+ load_buffer_16x16(input, stride, 0, 0, in);
+ fadst16_avx2(in);
+ mm256_transpose_16x16(in);
+ right_shift_16x16(in);
+ fidtx16_avx2(in);
+ break;
+ case H_ADST:
+ load_buffer_16x16(input, stride, 0, 0, in);
+ fidtx16_avx2(in);
+ mm256_transpose_16x16(in);
+ right_shift_16x16(in);
+ fadst16_avx2(in);
+ break;
+ case V_FLIPADST:
+ load_buffer_16x16(input, stride, 1, 0, in);
+ fadst16_avx2(in);
+ mm256_transpose_16x16(in);
+ right_shift_16x16(in);
+ fidtx16_avx2(in);
+ break;
+ case H_FLIPADST:
+ load_buffer_16x16(input, stride, 0, 1, in);
+ fidtx16_avx2(in);
+ mm256_transpose_16x16(in);
+ right_shift_16x16(in);
+ fadst16_avx2(in);
+ break;
+#endif // CONFIG_EXT_TX
+ // NOTE(review): when assertions are compiled out, an unknown tx_type falls
+ // through to the transpose/store below with `in` never loaded.
+ default: assert(0); break;
+ }
+ // Second transpose: undoes the orientation change made inside the switch
+ // before the result is stored.
+ mm256_transpose_16x16(in);
+ write_buffer_16x16(in, output);
+ _mm256_zeroupper();
+}
+
+// Single-coefficient 32x32 forward transform: adds up the four 16x16 quadrant
+// sums returned by get_16x16_sum(), shifts the total right by 3 and stores it
+// in output[0]. No other element of `output` is written here.
+void aom_fdct32x32_1_avx2(const int16_t *input, tran_low_t *output,
+                          int stride) {
+  // First row of the lower two quadrants.
+  const int16_t *const lower_half = input + (stride << 4);
+
+  int32_t total = get_16x16_sum(input, stride);    // upper-left quadrant
+  total += get_16x16_sum(input + 16, stride);      // upper-right quadrant
+  total += get_16x16_sum(lower_half, stride);      // lower-left quadrant
+  total += get_16x16_sum(lower_half + 16, stride); // lower-right quadrant
+
+  total >>= 3;
+  output[0] = (tran_low_t)total;
+  _mm256_zeroupper();
+}
+
+// Exchanges the first `size` vectors of a0 with the first `size` vectors of
+// a1, element by element.
+static void mm256_vectors_swap(__m256i *a0, __m256i *a1, const int size) {
+  int idx;
+  for (idx = 0; idx < size; ++idx) {
+    const __m256i held = a0[idx];
+    a0[idx] = a1[idx];
+    a1[idx] = held;
+  }
+}
+
+// Transposes a 32x32 block stored as two 32-vector arrays (16 lanes each).
+// Each of the four 16x16 quadrants is transposed in place, then the two
+// off-diagonal quadrants (second half of in0, first half of in1) trade places.
+static void mm256_transpose_32x32(__m256i *in0, __m256i *in1) {
+  __m256i *const in0_second = in0 + 16;
+  __m256i *const in1_second = in1 + 16;
+
+  mm256_transpose_16x16(in0);
+  mm256_transpose_16x16(in0_second);
+  mm256_transpose_16x16(in1);
+  mm256_transpose_16x16(in1_second);
+
+  mm256_vectors_swap(in0_second, in1, 16);
+}
+
+// Builds the 16 sums even[k] = in[k] + in[31 - k] (16-bit lanes) from a
+// 32-vector input.
+static void prepare_16x16_even(const __m256i *in, __m256i *even) {
+  int k;
+  for (k = 0; k < 16; ++k) {
+    even[k] = _mm256_add_epi16(in[k], in[31 - k]);
+  }
+}
+
+// Builds the 16 differences odd[k] = in[15 - k] - in[16 + k] (16-bit lanes)
+// from a 32-vector input, i.e. odd[0] pairs the two middle vectors and
+// odd[15] pairs the first and last.
+static void prepare_16x16_odd(const __m256i *in, __m256i *odd) {
+  int k;
+  for (k = 0; k < 16; ++k) {
+    odd[k] = _mm256_sub_epi16(in[15 - k], in[16 + k]);
+  }
+}
+
+// Merges two 16-vector halves into one 32-vector result: even[k] lands in
+// out[2 * k]; odd[k] lands in the odd-numbered slot given by odd_slot[k].
+static void collect_16col(const __m256i *even, const __m256i *odd,
+                          __m256i *out) {
+  // Destination index in out[] for odd[0], odd[1], ..., odd[15].
+  static const int odd_slot[16] = { 1, 17, 9,  25, 5, 21, 13, 29,
+                                    3, 19, 11, 27, 7, 23, 15, 31 };
+  int k;
+
+  // fdct16_avx2() already maps the output
+  for (k = 0; k < 16; ++k) {
+    out[2 * k] = even[k];
+  }
+
+  for (k = 0; k < 16; ++k) {
+    out[odd_slot[k]] = odd[k];
+  }
+}
+
+// Runs collect_16col() once per 16-column half: the first even/odd pair is
+// merged into in0, the second pair into in1.
+static void collect_coeffs(const __m256i *first_16col_even,
+                           const __m256i *first_16col_odd,
+                           const __m256i *second_16col_even,
+                           const __m256i *second_16col_odd, __m256i *in0,
+                           __m256i *in1) {
+  // first 16 columns
+  collect_16col(first_16col_even, first_16col_odd, in0);
+
+  // second 16 columns
+  collect_16col(second_16col_even, second_16col_odd, in1);
+}
+
+// In-place 16-vector kernel applied by fdct32_avx2() to the differences
+// produced by prepare_16x16_odd(); collect_16col() later places its results
+// in the odd-numbered output slots.
+//
+// The network alternates "rotation" stages (2, 4, 6, 8), which feed
+// interleaved pairs of vectors to butter_fly() with a constant pair, and
+// add/sub stages (3, 5, 7) on 16-bit lanes. x* and y* are ping-pong
+// temporaries; u0/u1 hold the low/high interleave of the current pair.
+// NOTE(review): butter_fly() is defined outside this section; presumably it
+// does the madd + rounding + pack seen elsewhere in this file - confirm.
+static void fdct16_odd_avx2(__m256i *in) {
+ // sequence: cospi_L_H = pairs(L, H) and L first
+ // (p = positive constant, m = negated constant)
+ const __m256i cospi_p16_p16 = pair256_set_epi16(cospi_16_64, cospi_16_64);
+ const __m256i cospi_m16_p16 = pair256_set_epi16(-cospi_16_64, cospi_16_64);
+ const __m256i cospi_m08_p24 = pair256_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m256i cospi_p24_p08 = pair256_set_epi16(cospi_24_64, cospi_8_64);
+ const __m256i cospi_m24_m08 = pair256_set_epi16(-cospi_24_64, -cospi_8_64);
+ const __m256i cospi_m04_p28 = pair256_set_epi16(-cospi_4_64, cospi_28_64);
+ const __m256i cospi_p28_p04 = pair256_set_epi16(cospi_28_64, cospi_4_64);
+ const __m256i cospi_m28_m04 = pair256_set_epi16(-cospi_28_64, -cospi_4_64);
+ const __m256i cospi_m20_p12 = pair256_set_epi16(-cospi_20_64, cospi_12_64);
+ const __m256i cospi_p12_p20 = pair256_set_epi16(cospi_12_64, cospi_20_64);
+ const __m256i cospi_m12_m20 = pair256_set_epi16(-cospi_12_64, -cospi_20_64);
+
+ // Constant pairs used only by the final stage (stage 8).
+ const __m256i cospi_p31_p01 = pair256_set_epi16(cospi_31_64, cospi_1_64);
+ const __m256i cospi_m01_p31 = pair256_set_epi16(-cospi_1_64, cospi_31_64);
+ const __m256i cospi_p15_p17 = pair256_set_epi16(cospi_15_64, cospi_17_64);
+ const __m256i cospi_m17_p15 = pair256_set_epi16(-cospi_17_64, cospi_15_64);
+ const __m256i cospi_p23_p09 = pair256_set_epi16(cospi_23_64, cospi_9_64);
+ const __m256i cospi_m09_p23 = pair256_set_epi16(-cospi_9_64, cospi_23_64);
+ const __m256i cospi_p07_p25 = pair256_set_epi16(cospi_7_64, cospi_25_64);
+ const __m256i cospi_m25_p07 = pair256_set_epi16(-cospi_25_64, cospi_7_64);
+ const __m256i cospi_p27_p05 = pair256_set_epi16(cospi_27_64, cospi_5_64);
+ const __m256i cospi_m05_p27 = pair256_set_epi16(-cospi_5_64, cospi_27_64);
+ const __m256i cospi_p11_p21 = pair256_set_epi16(cospi_11_64, cospi_21_64);
+ const __m256i cospi_m21_p11 = pair256_set_epi16(-cospi_21_64, cospi_11_64);
+ const __m256i cospi_p19_p13 = pair256_set_epi16(cospi_19_64, cospi_13_64);
+ const __m256i cospi_m13_p19 = pair256_set_epi16(-cospi_13_64, cospi_19_64);
+ const __m256i cospi_p03_p29 = pair256_set_epi16(cospi_3_64, cospi_29_64);
+ const __m256i cospi_m29_p03 = pair256_set_epi16(-cospi_29_64, cospi_3_64);
+
+ __m256i x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
+ __m256i y0, y1, y2, y3, y4, y5, y6, y7, y8, y9, y10, y11, y12, y13, y14, y15;
+ __m256i u0, u1;
+
+ // stage 1 is in prepare_16x16_odd()
+
+ // stage 2
+ // in[0..3] and in[12..15] pass through; the middle pairs (4,11), (5,10),
+ // (6,9), (7,8) go through butter_fly() with the cospi_16 constants.
+ y0 = in[0];
+ y1 = in[1];
+ y2 = in[2];
+ y3 = in[3];
+
+ u0 = _mm256_unpacklo_epi16(in[4], in[11]);
+ u1 = _mm256_unpackhi_epi16(in[4], in[11]);
+ y4 = butter_fly(u0, u1, cospi_m16_p16);
+ y11 = butter_fly(u0, u1, cospi_p16_p16);
+
+ u0 = _mm256_unpacklo_epi16(in[5], in[10]);
+ u1 = _mm256_unpackhi_epi16(in[5], in[10]);
+ y5 = butter_fly(u0, u1, cospi_m16_p16);
+ y10 = butter_fly(u0, u1, cospi_p16_p16);
+
+ u0 = _mm256_unpacklo_epi16(in[6], in[9]);
+ u1 = _mm256_unpackhi_epi16(in[6], in[9]);
+ y6 = butter_fly(u0, u1, cospi_m16_p16);
+ y9 = butter_fly(u0, u1, cospi_p16_p16);
+
+ u0 = _mm256_unpacklo_epi16(in[7], in[8]);
+ u1 = _mm256_unpackhi_epi16(in[7], in[8]);
+ y7 = butter_fly(u0, u1, cospi_m16_p16);
+ y8 = butter_fly(u0, u1, cospi_p16_p16);
+
+ y12 = in[12];
+ y13 = in[13];
+ y14 = in[14];
+ y15 = in[15];
+
+ // stage 3
+ // add/sub of y[k] with y[7 - k] (k = 0..3) and of y[15 - k] with y[8 + k]
+ x0 = _mm256_add_epi16(y0, y7);
+ x1 = _mm256_add_epi16(y1, y6);
+ x2 = _mm256_add_epi16(y2, y5);
+ x3 = _mm256_add_epi16(y3, y4);
+ x4 = _mm256_sub_epi16(y3, y4);
+ x5 = _mm256_sub_epi16(y2, y5);
+ x6 = _mm256_sub_epi16(y1, y6);
+ x7 = _mm256_sub_epi16(y0, y7);
+ x8 = _mm256_sub_epi16(y15, y8);
+ x9 = _mm256_sub_epi16(y14, y9);
+ x10 = _mm256_sub_epi16(y13, y10);
+ x11 = _mm256_sub_epi16(y12, y11);
+ x12 = _mm256_add_epi16(y12, y11);
+ x13 = _mm256_add_epi16(y13, y10);
+ x14 = _mm256_add_epi16(y14, y9);
+ x15 = _mm256_add_epi16(y15, y8);
+
+ // stage 4
+ // x0, x1, x6..x9, x14, x15 pass through; pairs (2,13), (3,12), (4,11),
+ // (5,10) go through butter_fly() with the cospi_8 / cospi_24 constants.
+ y0 = x0;
+ y1 = x1;
+ y6 = x6;
+ y7 = x7;
+ y8 = x8;
+ y9 = x9;
+ y14 = x14;
+ y15 = x15;
+
+ u0 = _mm256_unpacklo_epi16(x2, x13);
+ u1 = _mm256_unpackhi_epi16(x2, x13);
+ y2 = butter_fly(u0, u1, cospi_m08_p24);
+ y13 = butter_fly(u0, u1, cospi_p24_p08);
+
+ u0 = _mm256_unpacklo_epi16(x3, x12);
+ u1 = _mm256_unpackhi_epi16(x3, x12);
+ y3 = butter_fly(u0, u1, cospi_m08_p24);
+ y12 = butter_fly(u0, u1, cospi_p24_p08);
+
+ u0 = _mm256_unpacklo_epi16(x4, x11);
+ u1 = _mm256_unpackhi_epi16(x4, x11);
+ y4 = butter_fly(u0, u1, cospi_m24_m08);
+ y11 = butter_fly(u0, u1, cospi_m08_p24);
+
+ u0 = _mm256_unpacklo_epi16(x5, x10);
+ u1 = _mm256_unpackhi_epi16(x5, x10);
+ y5 = butter_fly(u0, u1, cospi_m24_m08);
+ y10 = butter_fly(u0, u1, cospi_m08_p24);
+
+ // stage 5
+ // add/sub inside each group of four: (0..3), (4..7), (8..11), (12..15)
+ x0 = _mm256_add_epi16(y0, y3);
+ x1 = _mm256_add_epi16(y1, y2);
+ x2 = _mm256_sub_epi16(y1, y2);
+ x3 = _mm256_sub_epi16(y0, y3);
+ x4 = _mm256_sub_epi16(y7, y4);
+ x5 = _mm256_sub_epi16(y6, y5);
+ x6 = _mm256_add_epi16(y6, y5);
+ x7 = _mm256_add_epi16(y7, y4);
+
+ x8 = _mm256_add_epi16(y8, y11);
+ x9 = _mm256_add_epi16(y9, y10);
+ x10 = _mm256_sub_epi16(y9, y10);
+ x11 = _mm256_sub_epi16(y8, y11);
+ x12 = _mm256_sub_epi16(y15, y12);
+ x13 = _mm256_sub_epi16(y14, y13);
+ x14 = _mm256_add_epi16(y14, y13);
+ x15 = _mm256_add_epi16(y15, y12);
+
+ // stage 6
+ // x0, x3, x4, x7, x8, x11, x12, x15 pass through; pairs (1,14), (2,13),
+ // (5,10), (6,9) go through butter_fly().
+ y0 = x0;
+ y3 = x3;
+ y4 = x4;
+ y7 = x7;
+ y8 = x8;
+ y11 = x11;
+ y12 = x12;
+ y15 = x15;
+
+ u0 = _mm256_unpacklo_epi16(x1, x14);
+ u1 = _mm256_unpackhi_epi16(x1, x14);
+ y1 = butter_fly(u0, u1, cospi_m04_p28);
+ y14 = butter_fly(u0, u1, cospi_p28_p04);
+
+ u0 = _mm256_unpacklo_epi16(x2, x13);
+ u1 = _mm256_unpackhi_epi16(x2, x13);
+ y2 = butter_fly(u0, u1, cospi_m28_m04);
+ y13 = butter_fly(u0, u1, cospi_m04_p28);
+
+ u0 = _mm256_unpacklo_epi16(x5, x10);
+ u1 = _mm256_unpackhi_epi16(x5, x10);
+ y5 = butter_fly(u0, u1, cospi_m20_p12);
+ y10 = butter_fly(u0, u1, cospi_p12_p20);
+
+ u0 = _mm256_unpacklo_epi16(x6, x9);
+ u1 = _mm256_unpackhi_epi16(x6, x9);
+ y6 = butter_fly(u0, u1, cospi_m12_m20);
+ y9 = butter_fly(u0, u1, cospi_m20_p12);
+
+ // stage 7
+ // add/sub of adjacent pairs: (0,1), (3,2), (4,5), (7,6), ...
+ x0 = _mm256_add_epi16(y0, y1);
+ x1 = _mm256_sub_epi16(y0, y1);
+ x2 = _mm256_sub_epi16(y3, y2);
+ x3 = _mm256_add_epi16(y3, y2);
+ x4 = _mm256_add_epi16(y4, y5);
+ x5 = _mm256_sub_epi16(y4, y5);
+ x6 = _mm256_sub_epi16(y7, y6);
+ x7 = _mm256_add_epi16(y7, y6);
+
+ x8 = _mm256_add_epi16(y8, y9);
+ x9 = _mm256_sub_epi16(y8, y9);
+ x10 = _mm256_sub_epi16(y11, y10);
+ x11 = _mm256_add_epi16(y11, y10);
+ x12 = _mm256_add_epi16(y12, y13);
+ x13 = _mm256_sub_epi16(y12, y13);
+ x14 = _mm256_sub_epi16(y15, y14);
+ x15 = _mm256_add_epi16(y15, y14);
+
+ // stage 8
+ // Final stage: pair (k, 15 - k) produces in[k] and in[15 - k], overwriting
+ // the caller's array.
+ u0 = _mm256_unpacklo_epi16(x0, x15);
+ u1 = _mm256_unpackhi_epi16(x0, x15);
+ in[0] = butter_fly(u0, u1, cospi_p31_p01);
+ in[15] = butter_fly(u0, u1, cospi_m01_p31);
+
+ u0 = _mm256_unpacklo_epi16(x1, x14);
+ u1 = _mm256_unpackhi_epi16(x1, x14);
+ in[1] = butter_fly(u0, u1, cospi_p15_p17);
+ in[14] = butter_fly(u0, u1, cospi_m17_p15);
+
+ u0 = _mm256_unpacklo_epi16(x2, x13);
+ u1 = _mm256_unpackhi_epi16(x2, x13);
+ in[2] = butter_fly(u0, u1, cospi_p23_p09);
+ in[13] = butter_fly(u0, u1, cospi_m09_p23);
+
+ u0 = _mm256_unpacklo_epi16(x3, x12);
+ u1 = _mm256_unpackhi_epi16(x3, x12);
+ in[3] = butter_fly(u0, u1, cospi_p07_p25);
+ in[12] = butter_fly(u0, u1, cospi_m25_p07);
+
+ u0 = _mm256_unpacklo_epi16(x4, x11);
+ u1 = _mm256_unpackhi_epi16(x4, x11);
+ in[4] = butter_fly(u0, u1, cospi_p27_p05);
+ in[11] = butter_fly(u0, u1, cospi_m05_p27);
+
+ u0 = _mm256_unpacklo_epi16(x5, x10);
+ u1 = _mm256_unpackhi_epi16(x5, x10);
+ in[5] = butter_fly(u0, u1, cospi_p11_p21);
+ in[10] = butter_fly(u0, u1, cospi_m21_p11);
+
+ u0 = _mm256_unpacklo_epi16(x6, x9);
+ u1 = _mm256_unpackhi_epi16(x6, x9);
+ in[6] = butter_fly(u0, u1, cospi_p19_p13);
+ in[9] = butter_fly(u0, u1, cospi_m13_p19);
+
+ u0 = _mm256_unpacklo_epi16(x7, x8);
+ u1 = _mm256_unpackhi_epi16(x7, x8);
+ in[7] = butter_fly(u0, u1, cospi_p03_p29);
+ in[8] = butter_fly(u0, u1, cospi_m29_p03);
+}
+
+// One 32-point pass over both 16-column halves of a 32x32 block.
+// For each half, the sums from prepare_16x16_even() go through fdct16_avx2()
+// and the differences from prepare_16x16_odd() go through fdct16_odd_avx2().
+// collect_coeffs() then writes the merged results back into in0/in1 and the
+// whole block is transposed.
+static void fdct32_avx2(__m256i *in0, __m256i *in1) {
+  __m256i even[2][16], odd[2][16];
+  __m256i *const half[2] = { in0, in1 };
+  int h;
+
+  for (h = 0; h < 2; ++h) {
+    prepare_16x16_even(half[h], even[h]);
+    fdct16_avx2(even[h]);
+
+    prepare_16x16_odd(half[h], odd[h]);
+    fdct16_odd_avx2(odd[h]);
+  }
+
+  collect_coeffs(even[0], odd[0], even[1], odd[1], in0, in1);
+
+  mm256_transpose_32x32(in0, in1);
+}
+
+// Stores a 32x32 result: for every row r, in0[r] is written at
+// output + 32 * r and in1[r] 16 elements further on, both through
+// storeu_output_avx2().
+static INLINE void write_buffer_32x32(const __m256i *in0, const __m256i *in1,
+                                      tran_low_t *output) {
+  const int stride = 32;
+  int row;
+
+  for (row = 0; row < 32; ++row) {
+    tran_low_t *const coeff = output + row * stride;
+    storeu_output_avx2(&in0[row], coeff);
+    storeu_output_avx2(&in1[row], coeff + 16);
+  }
+}
+
+#if CONFIG_EXT_TX
+// Half-right 32-point kernel for one 16-column half (32 vectors), in place.
+//  - in[0..15]:  each 16-bit lane is shifted left by 2.
+//  - in[16..31]: each lane is multiplied by Sqrt2, rounded with
+//    DCT_CONST_ROUNDING, shifted right by DCT_CONST_BITS and packed back to
+//    16 bits (signed saturation); the 16 vectors then go through
+//    fdct16_avx2().
+// The two groups are left in their original positions here.
+static void fhalfright32_16col_avx2(__m256i *in) {
+  const __m256i zero = _mm256_setzero_si256();
+  const __m256i sqrt2 = _mm256_set1_epi16(Sqrt2);
+  const __m256i dct_rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING);
+  __m256i *const second = in + 16;
+  int k;
+
+  for (k = 0; k < 16; ++k) {
+    __m256i lo, hi;
+
+    in[k] = _mm256_slli_epi16(in[k], 2);
+
+    // Interleaving with zero makes madd yield lane * Sqrt2 as 32 bits.
+    lo = _mm256_madd_epi16(_mm256_unpacklo_epi16(second[k], zero), sqrt2);
+    hi = _mm256_madd_epi16(_mm256_unpackhi_epi16(second[k], zero), sqrt2);
+
+    lo = _mm256_srai_epi32(_mm256_add_epi32(lo, dct_rounding), DCT_CONST_BITS);
+    hi = _mm256_srai_epi32(_mm256_add_epi32(hi, dct_rounding), DCT_CONST_BITS);
+
+    second[k] = _mm256_packs_epi32(lo, hi);
+  }
+
+  fdct16_avx2(second);
+}
+
+// Applies fhalfright32_16col_avx2() to each 16-column half, exchanges the
+// first and last 16 vectors inside each half, then transposes the 32x32 block.
+static void fhalfright32_avx2(__m256i *in0, __m256i *in1) {
+  __m256i *const in0_tail = in0 + 16;
+  __m256i *const in1_tail = in1 + 16;
+
+  fhalfright32_16col_avx2(in0);
+  fhalfright32_16col_avx2(in1);
+
+  mm256_vectors_swap(in0, in0_tail, 16);
+  mm256_vectors_swap(in1, in1_tail, 16);
+
+  mm256_transpose_32x32(in0, in1);
+}
+#endif // CONFIG_EXT_TX
+
+// Loads a 32x32 block as four 16x16 quadrants through load_buffer_16x16().
+// in0 receives the two quadrants of the first 16 columns (32 vectors), in1
+// those of the second 16 columns. flipud exchanges the upper and lower source
+// quadrants, fliplr exchanges the left and right ones; both flags are also
+// forwarded unchanged to every load_buffer_16x16() call.
+static INLINE void load_buffer_32x32(const int16_t *input, int stride,
+                                     int flipud, int fliplr, __m256i *in0,
+                                     __m256i *in1) {
+  // Source rows feeding the first / last 16 vectors of each output array.
+  const int16_t *const upper = flipud ? input + 16 * stride : input;
+  const int16_t *const lower = flipud ? input : input + 16 * stride;
+  // Source column offsets feeding in0 / in1.
+  const int first_col = fliplr ? 16 : 0;
+  const int second_col = fliplr ? 0 : 16;
+
+  // load first 16 columns
+  load_buffer_16x16(upper + first_col, stride, flipud, fliplr, in0);
+  load_buffer_16x16(lower + first_col, stride, flipud, fliplr, in0 + 16);
+
+  // load second 16 columns
+  load_buffer_16x16(upper + second_col, stride, flipud, fliplr, in1);
+  load_buffer_16x16(lower + second_col, stride, flipud, fliplr, in1 + 16);
+}
+
+// Rounding arithmetic right shift by `bit` applied to all 32 vectors of one
+// 16-column half. Per 16-bit lane: (x + half + sign) >> bit, where half is
+// (1 << bit) >> 1 and sign is -1 for negative x and 0 otherwise.
+static INLINE void right_shift_32x32_16col(int bit, __m256i *in) {
+  const __m256i rounding = _mm256_set1_epi16((1 << bit) >> 1);
+  int row;
+
+  for (row = 0; row < 32; ++row) {
+    const __m256i sign = _mm256_srai_epi16(in[row], 15);
+    __m256i v = _mm256_add_epi16(in[row], rounding);
+    v = _mm256_add_epi16(v, sign);
+    in[row] = _mm256_srai_epi16(v, bit);
+  }
+}
+
+// Intermediate scaling between the two passes of the 32x32 transforms:
+// applies right_shift_32x32_16col() with a 4-bit shift to both halves.
+static INLINE void right_shift_32x32(__m256i *in0, __m256i *in1) {
+  enum { kShiftBits = 4 };
+
+  right_shift_32x32_16col(kShiftBits, in0);
+  right_shift_32x32_16col(kShiftBits, in1);
+}
+
+#if CONFIG_EXT_TX
+// 32-point identity-type pass: shifts every 16-bit lane of both halves left
+// by 2, then transposes the 32x32 block.
+static void fidtx32_avx2(__m256i *in0, __m256i *in1) {
+  int row;
+
+  for (row = 0; row < 32; ++row) {
+    in0[row] = _mm256_slli_epi16(in0[row], 2);
+    in1[row] = _mm256_slli_epi16(in1[row], 2);
+  }
+
+  mm256_transpose_32x32(in0, in1);
+}
+#endif
+
+// Forward 32x32 hybrid transform (AVX2).
+//
+// input: source block, read by load_buffer_32x32() using `stride`.
+// output: destination written by write_buffer_32x32().
+// tx_type: selects the two 1-D kernels and the flipud/fliplr load flags.
+//
+// Every case runs: load -> first kernel -> right_shift_32x32() -> second
+// kernel. Each kernel used here (fdct32_avx2, fhalfright32_avx2,
+// fidtx32_avx2) ends with mm256_transpose_32x32(), so no explicit transpose
+// appears in this function. Without CONFIG_EXT_TX only DCT_DCT is handled.
+void av1_fht32x32_avx2(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+ __m256i in0[32]; // 32 rows x first 16 columns (see load_buffer_32x32())
+ __m256i in1[32]; // 32 rows x second 16 columns
+
+ switch (tx_type) {
+ case DCT_DCT:
+ load_buffer_32x32(input, stride, 0, 0, in0, in1);
+ fdct32_avx2(in0, in1);
+ right_shift_32x32(in0, in1);
+ fdct32_avx2(in0, in1);
+ break;
+#if CONFIG_EXT_TX
+ // At this size the ADST and FLIPADST positions are served by
+ // fhalfright32_avx2(); the FLIP variants differ only in the load flags.
+ case ADST_DCT:
+ load_buffer_32x32(input, stride, 0, 0, in0, in1);
+ fhalfright32_avx2(in0, in1);
+ right_shift_32x32(in0, in1);
+ fdct32_avx2(in0, in1);
+ break;
+ case DCT_ADST:
+ load_buffer_32x32(input, stride, 0, 0, in0, in1);
+ fdct32_avx2(in0, in1);
+ right_shift_32x32(in0, in1);
+ fhalfright32_avx2(in0, in1);
+ break;
+ case ADST_ADST:
+ load_buffer_32x32(input, stride, 0, 0, in0, in1);
+ fhalfright32_avx2(in0, in1);
+ right_shift_32x32(in0, in1);
+ fhalfright32_avx2(in0, in1);
+ break;
+ case FLIPADST_DCT:
+ load_buffer_32x32(input, stride, 1, 0, in0, in1);
+ fhalfright32_avx2(in0, in1);
+ right_shift_32x32(in0, in1);
+ fdct32_avx2(in0, in1);
+ break;
+ case DCT_FLIPADST:
+ load_buffer_32x32(input, stride, 0, 1, in0, in1);
+ fdct32_avx2(in0, in1);
+ right_shift_32x32(in0, in1);
+ fhalfright32_avx2(in0, in1);
+ break;
+ case FLIPADST_FLIPADST:
+ load_buffer_32x32(input, stride, 1, 1, in0, in1);
+ fhalfright32_avx2(in0, in1);
+ right_shift_32x32(in0, in1);
+ fhalfright32_avx2(in0, in1);
+ break;
+ case ADST_FLIPADST:
+ load_buffer_32x32(input, stride, 0, 1, in0, in1);
+ fhalfright32_avx2(in0, in1);
+ right_shift_32x32(in0, in1);
+ fhalfright32_avx2(in0, in1);
+ break;
+ case FLIPADST_ADST:
+ load_buffer_32x32(input, stride, 1, 0, in0, in1);
+ fhalfright32_avx2(in0, in1);
+ right_shift_32x32(in0, in1);
+ fhalfright32_avx2(in0, in1);
+ break;
+ // Identity-based types: fidtx32_avx2() takes the place of one or both
+ // kernels.
+ case IDTX:
+ load_buffer_32x32(input, stride, 0, 0, in0, in1);
+ fidtx32_avx2(in0, in1);
+ right_shift_32x32(in0, in1);
+ fidtx32_avx2(in0, in1);
+ break;
+ case V_DCT:
+ load_buffer_32x32(input, stride, 0, 0, in0, in1);
+ fdct32_avx2(in0, in1);
+ right_shift_32x32(in0, in1);
+ fidtx32_avx2(in0, in1);
+ break;
+ case H_DCT:
+ load_buffer_32x32(input, stride, 0, 0, in0, in1);
+ fidtx32_avx2(in0, in1);
+ right_shift_32x32(in0, in1);
+ fdct32_avx2(in0, in1);
+ break;
+ case V_ADST:
+ load_buffer_32x32(input, stride, 0, 0, in0, in1);
+ fhalfright32_avx2(in0, in1);
+ right_shift_32x32(in0, in1);
+ fidtx32_avx2(in0, in1);
+ break;
+ case H_ADST:
+ load_buffer_32x32(input, stride, 0, 0, in0, in1);
+ fidtx32_avx2(in0, in1);
+ right_shift_32x32(in0, in1);
+ fhalfright32_avx2(in0, in1);
+ break;
+ case V_FLIPADST:
+ load_buffer_32x32(input, stride, 1, 0, in0, in1);
+ fhalfright32_avx2(in0, in1);
+ right_shift_32x32(in0, in1);
+ fidtx32_avx2(in0, in1);
+ break;
+ case H_FLIPADST:
+ load_buffer_32x32(input, stride, 0, 1, in0, in1);
+ fidtx32_avx2(in0, in1);
+ right_shift_32x32(in0, in1);
+ fhalfright32_avx2(in0, in1);
+ break;
+#endif // CONFIG_EXT_TX
+ // NOTE(review): when assertions are compiled out, an unknown tx_type falls
+ // through to the store below with in0/in1 never loaded.
+ default: assert(0); break;
+ }
+ write_buffer_32x32(in0, in1, output);
+ _mm256_zeroupper();
+}
diff --git a/third_party/aom/av1/encoder/x86/temporal_filter_apply_sse2.asm b/third_party/aom/av1/encoder/x86/temporal_filter_apply_sse2.asm
new file mode 100644
index 0000000000..7186b6b924
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/temporal_filter_apply_sse2.asm
@@ -0,0 +1,215 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+
+%include "aom_ports/x86_abi_support.asm"
+
+; void av1_temporal_filter_apply_sse2 | arg
+; (unsigned char *frame1, | 0
+; unsigned int stride, | 1
+; unsigned char *frame2, | 2
+; unsigned int block_width, | 3
+; unsigned int block_height, | 4
+; int strength, | 5
+; int filter_weight, | 6
+; unsigned int *accumulator, | 7
+; unsigned short *count) | 8
+global sym(av1_temporal_filter_apply_sse2) PRIVATE
+sym(av1_temporal_filter_apply_sse2):
+
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 9
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ALIGN_STACK 16, rax
+ %define block_width 0
+ %define block_height 16
+ %define strength 32
+ %define filter_weight 48
+ %define rounding_bit 64
+ %define rbp_backup 80
+ %define stack_size 96
+ sub rsp, stack_size
+ mov [rsp + rbp_backup], rbp
+ ; end prolog
+
+ mov edx, arg(3) ; block_width
+ mov [rsp + block_width], rdx
+ mov edx, arg(4) ; block_height
+ mov [rsp + block_height], rdx
+ movd xmm6, arg(5) ; strength
+ movdqa [rsp + strength], xmm6 ; where strength is used, all 16 bytes are read
+
+ ; calculate the rounding bit outside the loop
+ ; 0x8000 >> (16 - strength)
+ mov rdx, 16
+ sub rdx, arg(5) ; 16 - strength
+ movq xmm4, rdx ; can't use rdx w/ shift
+ movdqa xmm5, [GLOBAL(_const_top_bit)]
+ psrlw xmm5, xmm4
+ movdqa [rsp + rounding_bit], xmm5
+
+ mov rsi, arg(0) ; src/frame1
+ mov rdx, arg(2) ; predictor frame
+ mov rdi, arg(7) ; accumulator
+ mov rax, arg(8) ; count
+
+ ; dup the filter weight and store for later
+ movd xmm0, arg(6) ; filter_weight
+ pshuflw xmm0, xmm0, 0
+ punpcklwd xmm0, xmm0
+ movdqa [rsp + filter_weight], xmm0
+
+ mov rbp, arg(1) ; stride
+ pxor xmm7, xmm7 ; zero for extraction
+
+ mov rcx, [rsp + block_width] ; rcx = block_width
+ imul rcx, [rsp + block_height] ; rcx = block_width * block_height
+ add rcx, rdx ; rcx = predictor end address (loop exit test)
+ cmp dword ptr [rsp + block_width], 8
+ jne .temporal_filter_apply_load_16 ; width != 8: one 16-byte source row
+
+.temporal_filter_apply_load_8:
+ movq xmm0, [rsi] ; first row
+ lea rsi, [rsi + rbp] ; += stride
+ punpcklbw xmm0, xmm7 ; src[ 0- 7]
+ movq xmm1, [rsi] ; second row
+ lea rsi, [rsi + rbp] ; += stride
+ punpcklbw xmm1, xmm7 ; src[ 8-15]
+ jmp .temporal_filter_apply_load_finished
+
+.temporal_filter_apply_load_16:
+ movdqa xmm0, [rsi] ; src (frame1)
+ lea rsi, [rsi + rbp] ; += stride
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, xmm7 ; src[ 0- 7]
+ punpckhbw xmm1, xmm7 ; src[ 8-15]
+
+.temporal_filter_apply_load_finished:
+ movdqa xmm2, [rdx] ; predictor (frame2)
+ movdqa xmm3, xmm2
+ punpcklbw xmm2, xmm7 ; pred[ 0- 7]
+ punpckhbw xmm3, xmm7 ; pred[ 8-15]
+
+ ; modifier = src_byte - pixel_value
+ psubw xmm0, xmm2 ; src - pred[ 0- 7]
+ psubw xmm1, xmm3 ; src - pred[ 8-15]
+
+ ; modifier *= modifier
+ pmullw xmm0, xmm0 ; modifier[ 0- 7]^2
+ pmullw xmm1, xmm1 ; modifier[ 8-15]^2
+
+ ; modifier *= 3
+ pmullw xmm0, [GLOBAL(_const_3w)]
+ pmullw xmm1, [GLOBAL(_const_3w)]
+
+ ; modifier += 0x8000 >> (16 - strength)
+ paddw xmm0, [rsp + rounding_bit]
+ paddw xmm1, [rsp + rounding_bit]
+
+ ; modifier >>= strength
+ psrlw xmm0, [rsp + strength]
+ psrlw xmm1, [rsp + strength]
+
+ ; modifier = 16 - modifier
+ ; saturation takes care of modifier > 16
+ movdqa xmm3, [GLOBAL(_const_16w)]
+ movdqa xmm2, [GLOBAL(_const_16w)]
+ psubusw xmm3, xmm1
+ psubusw xmm2, xmm0
+
+ ; modifier *= filter_weight
+ pmullw xmm2, [rsp + filter_weight]
+ pmullw xmm3, [rsp + filter_weight]
+
+ ; count
+ movdqa xmm4, [rax]
+ movdqa xmm5, [rax+16]
+ ; += modifier
+ paddw xmm4, xmm2
+ paddw xmm5, xmm3
+ ; write back
+ movdqa [rax], xmm4
+ movdqa [rax+16], xmm5
+ lea rax, [rax + 16*2] ; count += 16*(sizeof(short))
+
+ ; reload the predictor and zero-extend its bytes to words
+ pxor xmm7, xmm7
+ movdqa xmm0, [rdx]
+ lea rdx, [rdx + 16*1] ; pred += 16*(sizeof(char))
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, xmm7 ; pred[ 0- 7]
+ punpckhbw xmm1, xmm7 ; pred[ 8-15]
+
+ ; modifier *= pixel_value
+ pmullw xmm0, xmm2
+ pmullw xmm1, xmm3
+
+ ; expand to double words
+ movdqa xmm2, xmm0
+ punpcklwd xmm0, xmm7 ; [ 0- 3]
+ punpckhwd xmm2, xmm7 ; [ 4- 7]
+ movdqa xmm3, xmm1
+ punpcklwd xmm1, xmm7 ; [ 8-11]
+ punpckhwd xmm3, xmm7 ; [12-15]
+
+ ; accumulator
+ movdqa xmm4, [rdi]
+ movdqa xmm5, [rdi+16]
+ movdqa xmm6, [rdi+32]
+ movdqa xmm7, [rdi+48]
+ ; += modifier
+ paddd xmm4, xmm0
+ paddd xmm5, xmm2
+ paddd xmm6, xmm1
+ paddd xmm7, xmm3
+ ; write back
+ movdqa [rdi], xmm4
+ movdqa [rdi+16], xmm5
+ movdqa [rdi+32], xmm6
+ movdqa [rdi+48], xmm7
+ lea rdi, [rdi + 16*4] ; accumulator += 16*(sizeof(int))
+
+ cmp rdx, rcx ; consumed the whole predictor block?
+ je .temporal_filter_apply_epilog
+ pxor xmm7, xmm7 ; zero for extraction
+ cmp dword ptr [rsp + block_width], 16
+ je .temporal_filter_apply_load_16
+ jmp .temporal_filter_apply_load_8
+
+.temporal_filter_apply_epilog:
+ ; begin epilog
+ mov rbp, [rsp + rbp_backup]
+ add rsp, stack_size
+ pop rsp
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+SECTION_RODATA
+align 16
+_const_3w:
+ times 8 dw 3
+align 16
+_const_top_bit:
+ times 8 dw 1<<15
+align 16
+_const_16w:
+ times 8 dw 16
diff --git a/third_party/aom/av1/encoder/x86/wedge_utils_sse2.c b/third_party/aom/av1/encoder/x86/wedge_utils_sse2.c
new file mode 100644
index 0000000000..bf233ca4d9
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/wedge_utils_sse2.c
@@ -0,0 +1,254 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+
+#include "aom_dsp/x86/synonyms.h"
+
+#include "aom/aom_integer.h"
+
+#include "av1/common/reconinter.h"
+
+#define MAX_MASK_VALUE (1 << WEDGE_WEIGHT_BITS)
+
+/**
+ * SSE2 implementation; see av1_wedge_sse_from_residuals_c.
+ *
+ * For every sample i in [0, N) this forms
+ *   t = MAX_MASK_VALUE * r1[i] + m[i] * d[i],
+ * saturates t to the signed 16-bit range, and adds t * t to a 64-bit sum.
+ * The return value is that sum passed through
+ * ROUND_POWER_OF_TWO(sum, 2 * WEDGE_WEIGHT_BITS).
+ *
+ * r1 and d each hold N int16_t values and m holds N mask bytes. N must be a
+ * non-zero multiple of 64: the loop body runs before its exit test, so N == 0
+ * would read past the arrays.
+ * NOTE(review): the loads go through xx_load_128; its alignment requirements
+ * are not visible here, confirm them against aom_dsp/x86/synonyms.h.
+ */
+uint64_t av1_wedge_sse_from_residuals_sse2(const int16_t *r1, const int16_t *d,
+                                           const uint8_t *m, int N) {
+  // The arrays are addressed from their ends with a negative index that counts
+  // up to zero, so reaching zero is the loop's termination test.
+  int n = -N;
+  int n8 = n + 8;
+
+  uint64_t csse;
+
+  const __m128i v_mask_max_w = _mm_set1_epi16(MAX_MASK_VALUE);
+  // Selects the low 32 bits of each 64-bit lane. Written as -1 because
+  // _mm_set_epi32 takes int arguments and 0xffffffff does not fit in an int.
+  const __m128i v_zext_q = _mm_set_epi32(0, -1, 0, -1);
+
+  __m128i v_acc0_q = _mm_setzero_si128();
+
+  assert(N > 0);
+  assert(N % 64 == 0);
+
+  r1 += N;
+  d += N;
+  m += N;
+
+  do {
+    // One iteration consumes 16 samples: two vectors each of r1 and d, and a
+    // single vector of 16 mask bytes.
+    const __m128i v_r0_w = xx_load_128(r1 + n);
+    const __m128i v_r1_w = xx_load_128(r1 + n8);
+    const __m128i v_d0_w = xx_load_128(d + n);
+    const __m128i v_d1_w = xx_load_128(d + n8);
+    const __m128i v_m01_b = xx_load_128(m + n);
+
+    // Interleave (d, r1) and, below, (m, MAX_MASK_VALUE) so that a single
+    // multiply-add produces d * m + r1 * MAX_MASK_VALUE per 32-bit lane.
+    const __m128i v_rd0l_w = _mm_unpacklo_epi16(v_d0_w, v_r0_w);
+    const __m128i v_rd0h_w = _mm_unpackhi_epi16(v_d0_w, v_r0_w);
+    const __m128i v_rd1l_w = _mm_unpacklo_epi16(v_d1_w, v_r1_w);
+    const __m128i v_rd1h_w = _mm_unpackhi_epi16(v_d1_w, v_r1_w);
+    // Zero-extend the mask bytes to 16-bit words.
+    const __m128i v_m0_w = _mm_unpacklo_epi8(v_m01_b, _mm_setzero_si128());
+    const __m128i v_m1_w = _mm_unpackhi_epi8(v_m01_b, _mm_setzero_si128());
+
+    const __m128i v_m0l_w = _mm_unpacklo_epi16(v_m0_w, v_mask_max_w);
+    const __m128i v_m0h_w = _mm_unpackhi_epi16(v_m0_w, v_mask_max_w);
+    const __m128i v_m1l_w = _mm_unpacklo_epi16(v_m1_w, v_mask_max_w);
+    const __m128i v_m1h_w = _mm_unpackhi_epi16(v_m1_w, v_mask_max_w);
+
+    const __m128i v_t0l_d = _mm_madd_epi16(v_rd0l_w, v_m0l_w);
+    const __m128i v_t0h_d = _mm_madd_epi16(v_rd0h_w, v_m0h_w);
+    const __m128i v_t1l_d = _mm_madd_epi16(v_rd1l_w, v_m1l_w);
+    const __m128i v_t1h_d = _mm_madd_epi16(v_rd1h_w, v_m1h_w);
+
+    // Narrow back to 16 bits with signed saturation.
+    const __m128i v_t0_w = _mm_packs_epi32(v_t0l_d, v_t0h_d);
+    const __m128i v_t1_w = _mm_packs_epi32(v_t1l_d, v_t1h_d);
+
+    // Square and add adjacent pairs.
+    const __m128i v_sq0_d = _mm_madd_epi16(v_t0_w, v_t0_w);
+    const __m128i v_sq1_d = _mm_madd_epi16(v_t1_w, v_t1_w);
+
+    // Widen the 32-bit sums to 64 bits as unsigned values: the even lanes by
+    // masking, the odd lanes by a logical right shift, then add the two.
+    const __m128i v_sum0_q = _mm_add_epi64(_mm_and_si128(v_sq0_d, v_zext_q),
+                                           _mm_srli_epi64(v_sq0_d, 32));
+    const __m128i v_sum1_q = _mm_add_epi64(_mm_and_si128(v_sq1_d, v_zext_q),
+                                           _mm_srli_epi64(v_sq1_d, 32));
+
+    v_acc0_q = _mm_add_epi64(v_acc0_q, v_sum0_q);
+    v_acc0_q = _mm_add_epi64(v_acc0_q, v_sum1_q);
+
+    n8 += 16;
+    n += 16;
+  } while (n);
+
+  // Fold the upper 64-bit lane onto the lower one.
+  v_acc0_q = _mm_add_epi64(v_acc0_q, _mm_srli_si128(v_acc0_q, 8));
+
+#if ARCH_X86_64
+  csse = (uint64_t)_mm_cvtsi128_si64(v_acc0_q);
+#else
+  xx_storel_64(&csse, v_acc0_q);
+#endif
+
+  return ROUND_POWER_OF_TWO(csse, 2 * WEDGE_WEIGHT_BITS);
+}
+
+/**
+ * SSE2 implementation; see av1_wedge_sign_from_residuals_c.
+ *
+ * Computes the sum over all N samples of ds[i] * m[i] and returns 1 when that
+ * sum is strictly greater than limit, 0 otherwise.
+ *
+ * ds holds N int16_t values and m holds N mask bytes. N must be a non-zero
+ * multiple of 64 (each iteration consumes 64 samples and the loop body runs
+ * before its exit test) and must be below 8192 because of the 32-bit
+ * accumulators described in the body.
+ */
+int av1_wedge_sign_from_residuals_sse2(const int16_t *ds, const uint8_t *m,
+                                       int N, int64_t limit) {
+  int64_t acc;
+
+  __m128i v_sign_d;
+  __m128i v_acc0_d = _mm_setzero_si128();
+  __m128i v_acc1_d = _mm_setzero_si128();
+  __m128i v_acc_q;
+
+  // Input size limited to 8192 by the use of 32 bit accumulators and m
+  // being between [0, 64]. Overflow might happen at larger sizes,
+  // though it is practically impossible on real video input.
+  assert(N < 8192);
+  assert(N > 0);
+  assert(N % 64 == 0);
+
+  do {
+    // 64 mask bytes ...
+    const __m128i v_m01_b = xx_load_128(m);
+    const __m128i v_m23_b = xx_load_128(m + 16);
+    const __m128i v_m45_b = xx_load_128(m + 32);
+    const __m128i v_m67_b = xx_load_128(m + 48);
+
+    // ... and the matching 64 16-bit values.
+    const __m128i v_d0_w = xx_load_128(ds);
+    const __m128i v_d1_w = xx_load_128(ds + 8);
+    const __m128i v_d2_w = xx_load_128(ds + 16);
+    const __m128i v_d3_w = xx_load_128(ds + 24);
+    const __m128i v_d4_w = xx_load_128(ds + 32);
+    const __m128i v_d5_w = xx_load_128(ds + 40);
+    const __m128i v_d6_w = xx_load_128(ds + 48);
+    const __m128i v_d7_w = xx_load_128(ds + 56);
+
+    // Zero-extend the mask bytes to 16-bit words.
+    const __m128i v_m0_w = _mm_unpacklo_epi8(v_m01_b, _mm_setzero_si128());
+    const __m128i v_m1_w = _mm_unpackhi_epi8(v_m01_b, _mm_setzero_si128());
+    const __m128i v_m2_w = _mm_unpacklo_epi8(v_m23_b, _mm_setzero_si128());
+    const __m128i v_m3_w = _mm_unpackhi_epi8(v_m23_b, _mm_setzero_si128());
+    const __m128i v_m4_w = _mm_unpacklo_epi8(v_m45_b, _mm_setzero_si128());
+    const __m128i v_m5_w = _mm_unpackhi_epi8(v_m45_b, _mm_setzero_si128());
+    const __m128i v_m6_w = _mm_unpacklo_epi8(v_m67_b, _mm_setzero_si128());
+    const __m128i v_m7_w = _mm_unpackhi_epi8(v_m67_b, _mm_setzero_si128());
+
+    // Multiply and add adjacent pairs into 32-bit lanes.
+    const __m128i v_p0_d = _mm_madd_epi16(v_d0_w, v_m0_w);
+    const __m128i v_p1_d = _mm_madd_epi16(v_d1_w, v_m1_w);
+    const __m128i v_p2_d = _mm_madd_epi16(v_d2_w, v_m2_w);
+    const __m128i v_p3_d = _mm_madd_epi16(v_d3_w, v_m3_w);
+    const __m128i v_p4_d = _mm_madd_epi16(v_d4_w, v_m4_w);
+    const __m128i v_p5_d = _mm_madd_epi16(v_d5_w, v_m5_w);
+    const __m128i v_p6_d = _mm_madd_epi16(v_d6_w, v_m6_w);
+    const __m128i v_p7_d = _mm_madd_epi16(v_d7_w, v_m7_w);
+
+    // Pairwise reduction of the eight product vectors down to two.
+    const __m128i v_p01_d = _mm_add_epi32(v_p0_d, v_p1_d);
+    const __m128i v_p23_d = _mm_add_epi32(v_p2_d, v_p3_d);
+    const __m128i v_p45_d = _mm_add_epi32(v_p4_d, v_p5_d);
+    const __m128i v_p67_d = _mm_add_epi32(v_p6_d, v_p7_d);
+
+    const __m128i v_p0123_d = _mm_add_epi32(v_p01_d, v_p23_d);
+    const __m128i v_p4567_d = _mm_add_epi32(v_p45_d, v_p67_d);
+
+    v_acc0_d = _mm_add_epi32(v_acc0_d, v_p0123_d);
+    v_acc1_d = _mm_add_epi32(v_acc1_d, v_p4567_d);
+
+    ds += 64;
+    m += 64;
+
+    N -= 64;
+  } while (N);
+
+  // Sign-extend each 32-bit lane to 64 bits: the compare yields all-ones for
+  // negative lanes, and interleaving a value with that mask forms the 64-bit
+  // two's-complement equivalent. The low and high halves are then added.
+  v_sign_d = _mm_cmplt_epi32(v_acc0_d, _mm_setzero_si128());
+  v_acc0_d = _mm_add_epi64(_mm_unpacklo_epi32(v_acc0_d, v_sign_d),
+                           _mm_unpackhi_epi32(v_acc0_d, v_sign_d));
+
+  v_sign_d = _mm_cmplt_epi32(v_acc1_d, _mm_setzero_si128());
+  v_acc1_d = _mm_add_epi64(_mm_unpacklo_epi32(v_acc1_d, v_sign_d),
+                           _mm_unpackhi_epi32(v_acc1_d, v_sign_d));
+
+  v_acc_q = _mm_add_epi64(v_acc0_d, v_acc1_d);
+
+  // Fold the upper 64-bit lane onto the lower one.
+  v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_si128(v_acc_q, 8));
+
+#if ARCH_X86_64
+  // The sum is signed; keep it signed instead of detouring through uint64_t.
+  acc = (int64_t)_mm_cvtsi128_si64(v_acc_q);
+#else
+  xx_storel_64(&acc, v_acc_q);
+#endif
+
+  return acc > limit;
+}
+
+// Conditional negation of 16-bit lanes. Where a mask lane is all ones the
+// result is (v ^ -1) - (-1), i.e. -v; where it is zero the lane is unchanged.
+static INLINE __m128i negm_epi16(__m128i v_v_w, __m128i v_mask_w) {
+  const __m128i v_flipped_w = _mm_xor_si128(v_v_w, v_mask_w);
+  return _mm_sub_epi16(v_flipped_w, v_mask_w);
+}
+
+/**
+ * SSE2 implementation; see av1_wedge_compute_delta_squares_c.
+ *
+ * For every i in [0, N) writes
+ *   d[i] = a[i] * a[i] - b[i] * b[i],
+ * saturated to the signed 16-bit range.
+ *
+ * a and b each hold N int16_t inputs; d receives N int16_t outputs. N must be
+ * a non-zero multiple of 64: the loop handles 32 samples per iteration and
+ * runs its body before the exit test.
+ */
+void av1_wedge_compute_delta_squares_sse2(int16_t *d, const int16_t *a,
+                                          const int16_t *b, int N) {
+  // All-ones in the upper 16-bit word of every 32-bit pair. The explicit cast
+  // is needed because 0xffff is outside the range of the short parameters.
+  const __m128i v_neg_w =
+      _mm_set_epi16((short)0xffff, 0, (short)0xffff, 0, (short)0xffff, 0,
+                    (short)0xffff, 0);
+
+  assert(N > 0);
+  assert(N % 64 == 0);
+
+  do {
+    const __m128i v_a0_w = xx_load_128(a);
+    const __m128i v_b0_w = xx_load_128(b);
+    const __m128i v_a1_w = xx_load_128(a + 8);
+    const __m128i v_b1_w = xx_load_128(b + 8);
+    const __m128i v_a2_w = xx_load_128(a + 16);
+    const __m128i v_b2_w = xx_load_128(b + 16);
+    const __m128i v_a3_w = xx_load_128(a + 24);
+    const __m128i v_b3_w = xx_load_128(b + 24);
+
+    // Interleave into (a, b) pairs: a in the low word, b in the high word.
+    const __m128i v_ab0l_w = _mm_unpacklo_epi16(v_a0_w, v_b0_w);
+    const __m128i v_ab0h_w = _mm_unpackhi_epi16(v_a0_w, v_b0_w);
+    const __m128i v_ab1l_w = _mm_unpacklo_epi16(v_a1_w, v_b1_w);
+    const __m128i v_ab1h_w = _mm_unpackhi_epi16(v_a1_w, v_b1_w);
+    const __m128i v_ab2l_w = _mm_unpacklo_epi16(v_a2_w, v_b2_w);
+    const __m128i v_ab2h_w = _mm_unpackhi_epi16(v_a2_w, v_b2_w);
+    const __m128i v_ab3l_w = _mm_unpacklo_epi16(v_a3_w, v_b3_w);
+    const __m128i v_ab3h_w = _mm_unpackhi_epi16(v_a3_w, v_b3_w);
+
+    // Negate top word of pairs, giving (a, -b).
+    const __m128i v_abl0n_w = negm_epi16(v_ab0l_w, v_neg_w);
+    const __m128i v_abh0n_w = negm_epi16(v_ab0h_w, v_neg_w);
+    const __m128i v_abl1n_w = negm_epi16(v_ab1l_w, v_neg_w);
+    const __m128i v_abh1n_w = negm_epi16(v_ab1h_w, v_neg_w);
+    const __m128i v_abl2n_w = negm_epi16(v_ab2l_w, v_neg_w);
+    const __m128i v_abh2n_w = negm_epi16(v_ab2h_w, v_neg_w);
+    const __m128i v_abl3n_w = negm_epi16(v_ab3l_w, v_neg_w);
+    const __m128i v_abh3n_w = negm_epi16(v_ab3h_w, v_neg_w);
+
+    // (a, b) . (a, -b) = a * a - b * b in each 32-bit lane.
+    const __m128i v_r0l_d = _mm_madd_epi16(v_ab0l_w, v_abl0n_w);
+    const __m128i v_r0h_d = _mm_madd_epi16(v_ab0h_w, v_abh0n_w);
+    const __m128i v_r1l_d = _mm_madd_epi16(v_ab1l_w, v_abl1n_w);
+    const __m128i v_r1h_d = _mm_madd_epi16(v_ab1h_w, v_abh1n_w);
+    const __m128i v_r2l_d = _mm_madd_epi16(v_ab2l_w, v_abl2n_w);
+    const __m128i v_r2h_d = _mm_madd_epi16(v_ab2h_w, v_abh2n_w);
+    const __m128i v_r3l_d = _mm_madd_epi16(v_ab3l_w, v_abl3n_w);
+    const __m128i v_r3h_d = _mm_madd_epi16(v_ab3h_w, v_abh3n_w);
+
+    // Narrow to 16 bits with signed saturation.
+    const __m128i v_r0_w = _mm_packs_epi32(v_r0l_d, v_r0h_d);
+    const __m128i v_r1_w = _mm_packs_epi32(v_r1l_d, v_r1h_d);
+    const __m128i v_r2_w = _mm_packs_epi32(v_r2l_d, v_r2h_d);
+    const __m128i v_r3_w = _mm_packs_epi32(v_r3l_d, v_r3h_d);
+
+    xx_store_128(d, v_r0_w);
+    xx_store_128(d + 8, v_r1_w);
+    xx_store_128(d + 16, v_r2_w);
+    xx_store_128(d + 24, v_r3_w);
+
+    a += 32;
+    b += 32;
+    d += 32;
+    N -= 32;
+  } while (N);
+}