1 files changed, 620 insertions, 421 deletions
diff --git a/media/libaom/src/av1/encoder/encodetxb.c b/media/libaom/src/av1/encoder/encodetxb.c
index 5a31d93d73..825d52a7ae 100644
--- a/media/libaom/src/av1/encoder/encodetxb.c
+++ b/media/libaom/src/av1/encoder/encodetxb.c
@@ -23,6 +23,7 @@
 #include "av1/encoder/rdopt.h"
 #include "av1/encoder/tokenize.h"
 
+#if CONFIG_HTB_TRELLIS
 static int hbt_needs_init = 1;
 static CRC32C crc_calculator;
 static const int HBT_EOB = 16;            // also the length in opt_qcoeff
@@ -41,6 +42,7 @@ typedef struct OptTxbQcoeff {
 } OptTxbQcoeff;
 
 OptTxbQcoeff *hbt_hash_table;
+#endif  // CONFIG_HTB_TRELLIS
 
 typedef struct LevelDownStats {
   int update;
@@ -60,10 +62,19 @@ typedef struct LevelDownStats {
   int new_eob;
 } LevelDownStats;
 
+static INLINE int get_dqv(const int16_t *dequant, int coeff_idx,
+                          const qm_val_t *iqmatrix) {
+  int dqv = dequant[!!coeff_idx];
+  if (iqmatrix != NULL)
+    dqv =
+        ((iqmatrix[coeff_idx] * dqv) + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS;
+  return dqv;
+}
+
 void av1_alloc_txb_buf(AV1_COMP *cpi) {
   AV1_COMMON *cm = &cpi->common;
-  int size = ((cm->mi_rows >> cm->seq_params.mib_size_log2) + 1) *
-             ((cm->mi_cols >> cm->seq_params.mib_size_log2) + 1);
+  int size = ((cm->mi_params.mi_rows >> cm->seq_params.mib_size_log2) + 1) *
+             ((cm->mi_params.mi_cols >> cm->seq_params.mib_size_log2) + 1);
 
   av1_free_txb_buf(cpi);
   // TODO(jingning): This should be further reduced.
@@ -73,26 +84,6 @@ void av1_alloc_txb_buf(AV1_COMP *cpi) {
 
 void av1_free_txb_buf(AV1_COMP *cpi) { aom_free(cpi->coeff_buffer_base); }
 
-void av1_set_coeff_buffer(const AV1_COMP *const cpi, MACROBLOCK *const x,
-                          int mi_row, int mi_col) {
-  const AV1_COMMON *const cm = &cpi->common;
-  const int num_planes = av1_num_planes(cm);
-  int mib_size_log2 = cm->seq_params.mib_size_log2;
-  int stride = (cm->mi_cols >> mib_size_log2) + 1;
-  int offset = (mi_row >> mib_size_log2) * stride + (mi_col >> mib_size_log2);
-  CB_COEFF_BUFFER *coeff_buf = &cpi->coeff_buffer_base[offset];
-  const int txb_offset = x->cb_offset / (TX_SIZE_W_MIN * TX_SIZE_H_MIN);
-  assert(x->cb_offset < (1 << num_pels_log2_lookup[cm->seq_params.sb_size]));
-  for (int plane = 0; plane < num_planes; ++plane) {
-    x->mbmi_ext->tcoeff[plane] = coeff_buf->tcoeff[plane] + x->cb_offset;
-    x->mbmi_ext->eobs[plane] = coeff_buf->eobs[plane] + txb_offset;
-    x->mbmi_ext->txb_skip_ctx[plane] =
-        coeff_buf->txb_skip_ctx[plane] + txb_offset;
-    x->mbmi_ext->dc_sign_ctx[plane] =
-        coeff_buf->dc_sign_ctx[plane] + txb_offset;
-  }
-}
-
 static void write_golomb(aom_writer *w, int level) {
   int x = level + 1;
   int i = x;
@@ -160,7 +151,7 @@ static INLINE int get_eob_pos_token(const int eob, int *const extra) {
     t = eob_to_pos_large[e];
   }
 
-  *extra = eob - k_eob_group_start[t];
+  *extra = eob - av1_eob_group_start[t];
 
   return t;
 }
@@ -243,9 +234,9 @@ void av1_update_eob_context(int eob, TX_SIZE tx_size, TX_CLASS tx_class,
       break;
   }
 
-  if (k_eob_offset_bits[eob_pt] > 0) {
+  if (av1_eob_offset_bits[eob_pt] > 0) {
     int eob_ctx = eob_pt - 3;
-    int eob_shift = k_eob_offset_bits[eob_pt] - 1;
+    int eob_shift = av1_eob_offset_bits[eob_pt] - 1;
     int bit = (eob_extra & (1 << eob_shift)) ? 1 : 0;
 #if CONFIG_ENTROPY_STATS
     counts->eob_extra[cdf_idx][txs_ctx][plane][eob_pt][bit]++;
@@ -263,12 +254,12 @@ static int get_eob_cost(int eob, const LV_MAP_EOB_COST *txb_eob_costs,
   const int eob_multi_ctx = (tx_class == TX_CLASS_2D) ? 0 : 1;
   eob_cost = txb_eob_costs->eob_cost[eob_multi_ctx][eob_pt - 1];
 
-  if (k_eob_offset_bits[eob_pt] > 0) {
+  if (av1_eob_offset_bits[eob_pt] > 0) {
     const int eob_ctx = eob_pt - 3;
-    const int eob_shift = k_eob_offset_bits[eob_pt] - 1;
+    const int eob_shift = av1_eob_offset_bits[eob_pt] - 1;
     const int bit = (eob_extra & (1 << eob_shift)) ? 1 : 0;
     eob_cost += txb_costs->eob_extra_cost[eob_ctx][bit];
-    const int offset_bits = k_eob_offset_bits[eob_pt];
+    const int offset_bits = av1_eob_offset_bits[eob_pt];
     if (offset_bits > 1) eob_cost += av1_cost_literal(offset_bits - 1);
   }
   return eob_cost;
@@ -284,20 +275,16 @@ static INLINE int get_sign_bit_cost(tran_low_t qc, int coeff_idx,
   return av1_cost_literal(1);
 }
 
-static INLINE int get_br_cost(tran_low_t abs_qc, int ctx,
-                              const int *coeff_lps) {
-  const tran_low_t min_level = 1 + NUM_BASE_LEVELS;
-  const tran_low_t max_level = 1 + NUM_BASE_LEVELS + COEFF_BASE_RANGE;
-  (void)ctx;
-  if (abs_qc >= min_level) {
-    if (abs_qc >= max_level) {
-      return coeff_lps[COEFF_BASE_RANGE];  // COEFF_BASE_RANGE * cost0;
-    } else {
-      return coeff_lps[(abs_qc - min_level)];  //  * cost0 + cost1;
-    }
-  }
-  return 0;
-}
+static const int golomb_bits_cost[32] = {
+  0,       512,     512 * 3, 512 * 3, 512 * 5, 512 * 5, 512 * 5, 512 * 5,
+  512 * 7, 512 * 7, 512 * 7, 512 * 7, 512 * 7, 512 * 7, 512 * 7, 512 * 7,
+  512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9,
+  512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9
+};
+static const int golomb_cost_diff[32] = {
+  0,       512, 512 * 2, 0, 512 * 2, 0, 0, 0, 512 * 2, 0, 0, 0, 0, 0, 0, 0,
+  512 * 2, 0,   0,       0, 0,       0, 0, 0, 0,       0, 0, 0, 0, 0, 0, 0
+};
 
 static INLINE int get_golomb_cost(int abs_qc) {
   if (abs_qc >= 1 + NUM_BASE_LEVELS + COEFF_BASE_RANGE) {
@@ -308,6 +295,32 @@ static INLINE int get_golomb_cost(int abs_qc) {
   return 0;
 }
 
+static INLINE int get_br_cost_with_diff(tran_low_t level, const int *coeff_lps,
+                                        int *diff) {
+  const int base_range = AOMMIN(level - 1 - NUM_BASE_LEVELS, COEFF_BASE_RANGE);
+  int golomb_bits = 0;
+  if (level <= COEFF_BASE_RANGE + 1 + NUM_BASE_LEVELS)
+    *diff += coeff_lps[base_range + COEFF_BASE_RANGE + 1];
+
+  if (level >= COEFF_BASE_RANGE + 1 + NUM_BASE_LEVELS) {
+    int r = level - COEFF_BASE_RANGE - NUM_BASE_LEVELS;
+    if (r < 32) {
+      golomb_bits = golomb_bits_cost[r];
+      *diff += golomb_cost_diff[r];
+    } else {
+      golomb_bits = get_golomb_cost(level);
+      *diff += (r & (r - 1)) == 0 ? 1024 : 0;
+    }
+  }
+
+  return coeff_lps[base_range] + golomb_bits;
+}
+
+static INLINE int get_br_cost(tran_low_t level, const int *coeff_lps) {
+  const int base_range = AOMMIN(level - 1 - NUM_BASE_LEVELS, COEFF_BASE_RANGE);
+  return coeff_lps[base_range] + get_golomb_cost(level);
+}
+
 static int get_coeff_cost(const tran_low_t qc, const int scan_idx,
                           const int is_eob, const TxbInfo *const txb_info,
                           const LV_MAP_COEFF_COST *const txb_costs,
@@ -331,8 +344,7 @@ static int get_coeff_cost(const tran_low_t qc, const int scan_idx,
     if (abs_qc > NUM_BASE_LEVELS) {
       const int ctx =
           get_br_ctx(txb_info->levels, pos, txb_info->bwl, tx_class);
-      cost += get_br_cost(abs_qc, ctx, txb_costs->lps_cost[ctx]);
-      cost += get_golomb_cost(abs_qc);
+      cost += get_br_cost(abs_qc, txb_costs->lps_cost[ctx]);
     }
   }
   return cost;
@@ -464,8 +476,6 @@ void av1_txb_init_levels_c(const tran_low_t *const coeff, const int width,
   const int stride = width + TX_PAD_HOR;
   uint8_t *ls = levels;
 
-  memset(levels - TX_PAD_TOP * stride, 0,
-         sizeof(*levels) * TX_PAD_TOP * stride);
   memset(levels + stride * height, 0,
          sizeof(*levels) * (TX_PAD_BOTTOM * stride + TX_PAD_END));
 
@@ -492,36 +502,35 @@ void av1_get_nz_map_contexts_c(const uint8_t *const levels,
   }
 }
 
-void av1_write_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCKD *xd,
+void av1_write_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCK *const x,
                           aom_writer *w, int blk_row, int blk_col, int plane,
-                          TX_SIZE tx_size, const tran_low_t *tcoeff,
-                          uint16_t eob, TXB_CTX *txb_ctx) {
+                          int block, TX_SIZE tx_size) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  const CB_COEFF_BUFFER *cb_coef_buff = x->cb_coef_buff;
+  const int txb_offset =
+      x->mbmi_ext_frame->cb_offset / (TX_SIZE_W_MIN * TX_SIZE_H_MIN);
+  const uint16_t *eob_txb = cb_coef_buff->eobs[plane] + txb_offset;
+  const uint16_t eob = eob_txb[block];
+  const uint8_t *entropy_ctx = cb_coef_buff->entropy_ctx[plane] + txb_offset;
+  const int txb_skip_ctx = entropy_ctx[block] & TXB_SKIP_CTX_MASK;
   const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
   FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
-  aom_write_symbol(w, eob == 0,
-                   ec_ctx->txb_skip_cdf[txs_ctx][txb_ctx->txb_skip_ctx], 2);
+  aom_write_symbol(w, eob == 0, ec_ctx->txb_skip_cdf[txs_ctx][txb_skip_ctx], 2);
   if (eob == 0) return;
-  const PLANE_TYPE plane_type = get_plane_type(plane);
-  const TX_TYPE tx_type = av1_get_tx_type(plane_type, xd, blk_row, blk_col,
-                                          tx_size, cm->reduced_tx_set_used);
-  const TX_CLASS tx_class = tx_type_to_class[tx_type];
-  const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type);
-  const int16_t *const scan = scan_order->scan;
-  int c;
-  const int bwl = get_txb_bwl(tx_size);
-  const int width = get_txb_wide(tx_size);
-  const int height = get_txb_high(tx_size);
-
-  uint8_t levels_buf[TX_PAD_2D];
-  uint8_t *const levels = set_levels(levels_buf, width);
-  DECLARE_ALIGNED(16, int8_t, coeff_contexts[MAX_TX_SQUARE]);
-  av1_txb_init_levels(tcoeff, width, height, levels);
 
-  av1_write_tx_type(cm, xd, blk_row, blk_col, plane, tx_size, w);
+  const PLANE_TYPE plane_type = get_plane_type(plane);
+  const TX_TYPE tx_type =
+      av1_get_tx_type(xd, plane_type, blk_row, blk_col, tx_size,
+                      cm->features.reduced_tx_set_used);
+  // Only y plane's tx_type is transmitted
+  if (plane == 0) {
+    av1_write_tx_type(cm, xd, tx_type, tx_size, w);
+  }
 
   int eob_extra;
   const int eob_pt = get_eob_pos_token(eob, &eob_extra);
   const int eob_multi_size = txsize_log2_minus4[tx_size];
+  const TX_CLASS tx_class = tx_type_to_class[tx_type];
   const int eob_multi_ctx = (tx_class == TX_CLASS_2D) ? 0 : 1;
   switch (eob_multi_size) {
     case 0:
@@ -554,22 +563,35 @@ void av1_write_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCKD *xd,
       break;
   }
 
-  if (k_eob_offset_bits[eob_pt] > 0) {
+  const int eob_offset_bits = av1_eob_offset_bits[eob_pt];
+  if (eob_offset_bits > 0) {
     const int eob_ctx = eob_pt - 3;
-    int eob_shift = k_eob_offset_bits[eob_pt] - 1;
+    int eob_shift = eob_offset_bits - 1;
     int bit = (eob_extra & (1 << eob_shift)) ? 1 : 0;
     aom_write_symbol(w, bit,
                      ec_ctx->eob_extra_cdf[txs_ctx][plane_type][eob_ctx], 2);
-    for (int i = 1; i < k_eob_offset_bits[eob_pt]; i++) {
-      eob_shift = k_eob_offset_bits[eob_pt] - 1 - i;
+    for (int i = 1; i < eob_offset_bits; i++) {
+      eob_shift = eob_offset_bits - 1 - i;
       bit = (eob_extra & (1 << eob_shift)) ? 1 : 0;
       aom_write_bit(w, bit);
     }
   }
 
+  const int width = get_txb_wide(tx_size);
+  const int height = get_txb_high(tx_size);
+  uint8_t levels_buf[TX_PAD_2D];
+  uint8_t *const levels = set_levels(levels_buf, width);
+  const tran_low_t *tcoeff_txb =
+      cb_coef_buff->tcoeff[plane] + x->mbmi_ext_frame->cb_offset;
+  const tran_low_t *tcoeff = tcoeff_txb + BLOCK_OFFSET(block);
+  av1_txb_init_levels(tcoeff, width, height, levels);
+  const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type);
+  const int16_t *const scan = scan_order->scan;
+  DECLARE_ALIGNED(16, int8_t, coeff_contexts[MAX_TX_SQUARE]);
   av1_get_nz_map_contexts(levels, scan, eob, tx_size, tx_class, coeff_contexts);
 
-  for (c = eob - 1; c >= 0; --c) {
+  const int bwl = get_txb_bwl(tx_size);
+  for (int c = eob - 1; c >= 0; --c) {
     const int pos = scan[c];
     const int coeff_ctx = coeff_contexts[pos];
     const tran_low_t v = tcoeff[pos];
@@ -588,12 +610,11 @@ void av1_write_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCKD *xd,
       // level is above 1.
       const int base_range = level - 1 - NUM_BASE_LEVELS;
       const int br_ctx = get_br_ctx(levels, pos, bwl, tx_class);
+      aom_cdf_prob *cdf =
+          ec_ctx->coeff_br_cdf[AOMMIN(txs_ctx, TX_32X32)][plane_type][br_ctx];
       for (int idx = 0; idx < COEFF_BASE_RANGE; idx += BR_CDF_SIZE - 1) {
         const int k = AOMMIN(base_range - idx, BR_CDF_SIZE - 1);
-        aom_write_symbol(
-            w, k,
-            ec_ctx->coeff_br_cdf[AOMMIN(txs_ctx, TX_32X32)][plane_type][br_ctx],
-            BR_CDF_SIZE);
+        aom_write_symbol(w, k, cdf, BR_CDF_SIZE);
         if (k < BR_CDF_SIZE - 1) break;
       }
     }
@@ -601,14 +622,16 @@ void av1_write_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCKD *xd,
 
   // Loop to code all signs in the transform block,
   // starting with the sign of DC (if applicable)
-  for (c = 0; c < eob; ++c) {
+  for (int c = 0; c < eob; ++c) {
     const tran_low_t v = tcoeff[scan[c]];
     const tran_low_t level = abs(v);
     const int sign = (v < 0) ? 1 : 0;
     if (level) {
       if (c == 0) {
-        aom_write_symbol(
-            w, sign, ec_ctx->dc_sign_cdf[plane_type][txb_ctx->dc_sign_ctx], 2);
+        const int dc_sign_ctx =
+            (entropy_ctx[block] >> DC_SIGN_CTX_SHIFT) & DC_SIGN_CTX_MASK;
+        aom_write_symbol(w, sign, ec_ctx->dc_sign_cdf[plane_type][dc_sign_ctx],
+                         2);
       } else {
         aom_write_bit(w, sign);
       }
@@ -624,20 +647,8 @@ typedef struct encode_txb_args {
   aom_writer *w;
 } ENCODE_TXB_ARGS;
 
-static void write_coeffs_txb_wrap(const AV1_COMMON *cm, MACROBLOCK *x,
-                                  aom_writer *w, int plane, int block,
-                                  int blk_row, int blk_col, TX_SIZE tx_size) {
-  MACROBLOCKD *xd = &x->e_mbd;
-  tran_low_t *tcoeff = BLOCK_OFFSET(x->mbmi_ext->tcoeff[plane], block);
-  uint16_t eob = x->mbmi_ext->eobs[plane][block];
-  TXB_CTX txb_ctx = { x->mbmi_ext->txb_skip_ctx[plane][block],
-                      x->mbmi_ext->dc_sign_ctx[plane][block] };
-  av1_write_coeffs_txb(cm, xd, w, blk_row, blk_col, plane, tx_size, tcoeff, eob,
-                       &txb_ctx);
-}
-
-void av1_write_coeffs_mb(const AV1_COMMON *const cm, MACROBLOCK *x, int mi_row,
-                         int mi_col, aom_writer *w, BLOCK_SIZE bsize) {
+void av1_write_coeffs_mb(const AV1_COMMON *const cm, MACROBLOCK *x,
+                         aom_writer *w, BLOCK_SIZE bsize) {
   MACROBLOCKD *xd = &x->e_mbd;
   const int num_planes = av1_num_planes(cm);
   int block[MAX_MB_PLANE] = { 0 };
@@ -647,23 +658,20 @@ void av1_write_coeffs_mb(const AV1_COMMON *const cm, MACROBLOCK *x, int mi_row,
   const int max_blocks_wide = max_block_wide(xd, bsize, 0);
   const int max_blocks_high = max_block_high(xd, bsize, 0);
   const BLOCK_SIZE max_unit_bsize = BLOCK_64X64;
-  int mu_blocks_wide = block_size_wide[max_unit_bsize] >> tx_size_wide_log2[0];
-  int mu_blocks_high = block_size_high[max_unit_bsize] >> tx_size_high_log2[0];
+  int mu_blocks_wide = mi_size_wide[max_unit_bsize];
+  int mu_blocks_high = mi_size_high[max_unit_bsize];
   mu_blocks_wide = AOMMIN(max_blocks_wide, mu_blocks_wide);
   mu_blocks_high = AOMMIN(max_blocks_high, mu_blocks_high);
 
   for (row = 0; row < max_blocks_high; row += mu_blocks_high) {
     for (col = 0; col < max_blocks_wide; col += mu_blocks_wide) {
       for (int plane = 0; plane < num_planes; ++plane) {
-        const struct macroblockd_plane *const pd = &xd->plane[plane];
-        if (!is_chroma_reference(mi_row, mi_col, bsize, pd->subsampling_x,
-                                 pd->subsampling_y))
-          continue;
+        if (plane && !xd->is_chroma_ref) break;
         const TX_SIZE tx_size = av1_get_tx_size(plane, xd);
         const int stepr = tx_size_high_unit[tx_size];
         const int stepc = tx_size_wide_unit[tx_size];
         const int step = stepr * stepc;
-
+        const struct macroblockd_plane *const pd = &xd->plane[plane];
         const int unit_height = ROUND_POWER_OF_TWO(
             AOMMIN(mu_blocks_high + row, max_blocks_high), pd->subsampling_y);
         const int unit_width = ROUND_POWER_OF_TWO(
@@ -672,8 +680,8 @@ void av1_write_coeffs_mb(const AV1_COMMON *const cm, MACROBLOCK *x, int mi_row,
              blk_row += stepr) {
           for (int blk_col = col >> pd->subsampling_x; blk_col < unit_width;
                blk_col += stepc) {
-            write_coeffs_txb_wrap(cm, x, w, plane, block[plane], blk_row,
-                                  blk_col, tx_size);
+            av1_write_coeffs_txb(cm, x, w, blk_row, blk_col, plane,
+                                 block[plane], tx_size);
             block[plane] += step;
           }
         }
@@ -683,19 +691,19 @@ void av1_write_coeffs_mb(const AV1_COMMON *const cm, MACROBLOCK *x, int mi_row,
 }
 
 // TODO(angiebird): use this function whenever it's possible
-static int get_tx_type_cost(const AV1_COMMON *cm, const MACROBLOCK *x,
-                            const MACROBLOCKD *xd, int plane, TX_SIZE tx_size,
-                            TX_TYPE tx_type) {
+static int get_tx_type_cost(const MACROBLOCK *x, const MACROBLOCKD *xd,
+                            int plane, TX_SIZE tx_size, TX_TYPE tx_type,
+                            int reduced_tx_set_used) {
   if (plane > 0) return 0;
 
   const TX_SIZE square_tx_size = txsize_sqr_map[tx_size];
 
   const MB_MODE_INFO *mbmi = xd->mi[0];
   const int is_inter = is_inter_block(mbmi);
-  if (get_ext_tx_types(tx_size, is_inter, cm->reduced_tx_set_used) > 1 &&
+  if (get_ext_tx_types(tx_size, is_inter, reduced_tx_set_used) > 1 &&
       !xd->lossless[xd->mi[0]->segment_id]) {
     const int ext_tx_set =
-        get_ext_tx_set(tx_size, is_inter, cm->reduced_tx_set_used);
+        get_ext_tx_set(tx_size, is_inter, reduced_tx_set_used);
     if (is_inter) {
       if (ext_tx_set > 0)
         return x->inter_tx_type_costs[ext_tx_set][square_tx_size][tx_type];
@@ -715,14 +723,44 @@ static int get_tx_type_cost(const AV1_COMMON *cm, const MACROBLOCK *x,
   return 0;
 }
 
+static INLINE void update_coeff_eob_fast(int *eob, int shift,
+                                         const int16_t *dequant_ptr,
+                                         const int16_t *scan,
+                                         const tran_low_t *coeff_ptr,
+                                         tran_low_t *qcoeff_ptr,
+                                         tran_low_t *dqcoeff_ptr) {
+  // TODO(sarahparker) make this work for aomqm
+  int eob_out = *eob;
+  int zbin[2] = { dequant_ptr[0] + ROUND_POWER_OF_TWO(dequant_ptr[0] * 70, 7),
+                  dequant_ptr[1] + ROUND_POWER_OF_TWO(dequant_ptr[1] * 70, 7) };
+
+  for (int i = *eob - 1; i >= 0; i--) {
+    const int rc = scan[i];
+    const int qcoeff = qcoeff_ptr[rc];
+    const int coeff = coeff_ptr[rc];
+    const int coeff_sign = AOMSIGN(coeff);
+    int64_t abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+
+    if (((abs_coeff << (1 + shift)) < zbin[rc != 0]) || (qcoeff == 0)) {
+      eob_out--;
+      qcoeff_ptr[rc] = 0;
+      dqcoeff_ptr[rc] = 0;
+    } else {
+      break;
+    }
+  }
+
+  *eob = eob_out;
+}
+
 static AOM_FORCE_INLINE int warehouse_efficients_txb(
-    const AV1_COMMON *const cm, const MACROBLOCK *x, const int plane,
-    const int block, const TX_SIZE tx_size, const TXB_CTX *const txb_ctx,
+    const MACROBLOCK *x, const int plane, const int block,
+    const TX_SIZE tx_size, const TXB_CTX *const txb_ctx,
     const struct macroblock_plane *p, const int eob,
     const PLANE_TYPE plane_type, const LV_MAP_COEFF_COST *const coeff_costs,
-    const MACROBLOCKD *const xd, const TX_TYPE tx_type,
-    const TX_CLASS tx_class) {
-  const tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+    const MACROBLOCKD *const xd, const TX_TYPE tx_type, const TX_CLASS tx_class,
+    int reduced_tx_set_used) {
+  const tran_low_t *const qcoeff = p->qcoeff + BLOCK_OFFSET(block);
   const int txb_skip_ctx = txb_ctx->txb_skip_ctx;
   const int bwl = get_txb_bwl(tx_size);
   const int width = get_txb_wide(tx_size);
@@ -739,18 +777,19 @@ static AOM_FORCE_INLINE int warehouse_efficients_txb(
 
   av1_txb_init_levels(qcoeff, width, height, levels);
 
-  cost += get_tx_type_cost(cm, x, xd, plane, tx_size, tx_type);
+  cost += get_tx_type_cost(x, xd, plane, tx_size, tx_type, reduced_tx_set_used);
 
   cost += get_eob_cost(eob, eob_costs, coeff_costs, tx_class);
 
   av1_get_nz_map_contexts(levels, scan, eob, tx_size, tx_class, coeff_contexts);
 
-  const int(*lps_cost)[COEFF_BASE_RANGE + 1] = coeff_costs->lps_cost;
+  const int(*lps_cost)[COEFF_BASE_RANGE + 1 + COEFF_BASE_RANGE + 1] =
+      coeff_costs->lps_cost;
   int c = eob - 1;
   {
     const int pos = scan[c];
     const tran_low_t v = qcoeff[pos];
-    const int sign = v >> 31;
+    const int sign = AOMSIGN(v);
     const int level = (v ^ sign) - sign;
     const int coeff_ctx = coeff_contexts[pos];
     cost += coeff_costs->base_eob_cost[coeff_ctx][AOMMIN(level, 3) - 1];
@@ -758,11 +797,8 @@ static AOM_FORCE_INLINE int warehouse_efficients_txb(
     if (v) {
       // sign bit cost
       if (level > NUM_BASE_LEVELS) {
-        const int ctx = get_br_ctx(levels, pos, bwl, tx_class);
-        const int base_range =
-            AOMMIN(level - 1 - NUM_BASE_LEVELS, COEFF_BASE_RANGE);
-        cost += lps_cost[ctx][base_range];
-        cost += get_golomb_cost(level);
+        const int ctx = get_br_ctx_eob(pos, bwl, tx_class);
+        cost += get_br_cost(level, lps_cost[ctx]);
       }
       if (c) {
         cost += av1_cost_literal(1);
@@ -774,31 +810,28 @@ static AOM_FORCE_INLINE int warehouse_efficients_txb(
       }
     }
   }
-  const int(*base_cost)[4] = coeff_costs->base_cost;
+  const int(*base_cost)[8] = coeff_costs->base_cost;
   for (c = eob - 2; c >= 1; --c) {
     const int pos = scan[c];
     const int coeff_ctx = coeff_contexts[pos];
     const tran_low_t v = qcoeff[pos];
     const int level = abs(v);
-    const int cost0 = base_cost[coeff_ctx][AOMMIN(level, 3)];
+    cost += base_cost[coeff_ctx][AOMMIN(level, 3)];
     if (v) {
       // sign bit cost
       cost += av1_cost_literal(1);
       if (level > NUM_BASE_LEVELS) {
         const int ctx = get_br_ctx(levels, pos, bwl, tx_class);
-        const int base_range =
-            AOMMIN(level - 1 - NUM_BASE_LEVELS, COEFF_BASE_RANGE);
-        cost += lps_cost[ctx][base_range];
-        cost += get_golomb_cost(level);
+        cost += get_br_cost(level, lps_cost[ctx]);
       }
     }
-    cost += cost0;
   }
-  if (c == 0) {
+  // c == 0 after previous loop
+  {
     const int pos = scan[c];
     const tran_low_t v = qcoeff[pos];
     const int coeff_ctx = coeff_contexts[pos];
-    const int sign = v >> 31;
+    const int sign = AOMSIGN(v);
     const int level = (v ^ sign) - sign;
     cost += base_cost[coeff_ctx][AOMMIN(level, 3)];
 
@@ -809,19 +842,81 @@ static AOM_FORCE_INLINE int warehouse_efficients_txb(
       cost += coeff_costs->dc_sign_cost[dc_sign_ctx][sign01];
       if (level > NUM_BASE_LEVELS) {
         const int ctx = get_br_ctx(levels, pos, bwl, tx_class);
-        const int base_range =
-            AOMMIN(level - 1 - NUM_BASE_LEVELS, COEFF_BASE_RANGE);
-        cost += lps_cost[ctx][base_range];
-        cost += get_golomb_cost(level);
+        cost += get_br_cost(level, lps_cost[ctx]);
       }
     }
   }
   return cost;
 }
 
-int av1_cost_coeffs_txb(const AV1_COMMON *const cm, const MACROBLOCK *x,
-                        const int plane, const int block, const TX_SIZE tx_size,
-                        const TX_TYPE tx_type, const TXB_CTX *const txb_ctx) {
+static AOM_FORCE_INLINE int warehouse_efficients_txb_laplacian(
+    const MACROBLOCK *x, const int plane, const int block,
+    const TX_SIZE tx_size, const TXB_CTX *const txb_ctx, const int eob,
+    const PLANE_TYPE plane_type, const LV_MAP_COEFF_COST *const coeff_costs,
+    const MACROBLOCKD *const xd, const TX_TYPE tx_type, const TX_CLASS tx_class,
+    int reduced_tx_set_used) {
+  const int txb_skip_ctx = txb_ctx->txb_skip_ctx;
+
+  const int eob_multi_size = txsize_log2_minus4[tx_size];
+  const LV_MAP_EOB_COST *const eob_costs =
+      &x->eob_costs[eob_multi_size][plane_type];
+  int cost = coeff_costs->txb_skip_cost[txb_skip_ctx][0];
+
+  cost += get_tx_type_cost(x, xd, plane, tx_size, tx_type, reduced_tx_set_used);
+
+  cost += get_eob_cost(eob, eob_costs, coeff_costs, tx_class);
+
+  cost += av1_cost_coeffs_txb_estimate(x, plane, block, tx_size, tx_type);
+  return cost;
+}
+
+// Look up table of individual cost of coefficient by its quantization level.
+// determined based on Laplacian distribution conditioned on estimated context
+static const int costLUT[15] = { -1143, 53,   545,  825,  1031,
+                                 1209,  1393, 1577, 1763, 1947,
+                                 2132,  2317, 2501, 2686, 2871 };
+static const int const_term = (1 << AV1_PROB_COST_SHIFT);
+static const int loge_par = ((14427 << AV1_PROB_COST_SHIFT) + 5000) / 10000;
+int av1_cost_coeffs_txb_estimate(const MACROBLOCK *x, const int plane,
+                                 const int block, const TX_SIZE tx_size,
+                                 const TX_TYPE tx_type) {
+  assert(plane == 0);
+
+  int cost = 0;
+  const struct macroblock_plane *p = &x->plane[plane];
+  const SCAN_ORDER *scan_order = get_scan(tx_size, tx_type);
+  const int16_t *scan = scan_order->scan;
+  tran_low_t *qcoeff = p->qcoeff + BLOCK_OFFSET(block);
+
+  int eob = p->eobs[block];
+
+  // coeffs
+  int c = eob - 1;
+  // eob
+  {
+    const int pos = scan[c];
+    const tran_low_t v = abs(qcoeff[pos]) - 1;
+    cost += (v << (AV1_PROB_COST_SHIFT + 2));
+  }
+  // other coeffs
+  for (c = eob - 2; c >= 0; c--) {
+    const int pos = scan[c];
+    const tran_low_t v = abs(qcoeff[pos]);
+    const int idx = AOMMIN(v, 14);
+
+    cost += costLUT[idx];
+  }
+
+  // const_term does not contain DC, and log(e) does not contain eob, so both
+  // (eob-1)
+  cost += (const_term + loge_par) * (eob - 1);
+
+  return cost;
+}
+
+int av1_cost_coeffs_txb(const MACROBLOCK *x, const int plane, const int block,
+                        const TX_SIZE tx_size, const TX_TYPE tx_type,
+                        const TXB_CTX *const txb_ctx, int reduced_tx_set_used) {
   const struct macroblock_plane *p = &x->plane[plane];
   const int eob = p->eobs[block];
   const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
@@ -835,18 +930,47 @@ int av1_cost_coeffs_txb(const AV1_COMMON *const cm, const MACROBLOCK *x,
   const MACROBLOCKD *const xd = &x->e_mbd;
   const TX_CLASS tx_class = tx_type_to_class[tx_type];
 
-#define WAREHOUSE_EFFICIENTS_TXB_CASE(tx_class_literal)                        \
-  case tx_class_literal:                                                       \
-    return warehouse_efficients_txb(cm, x, plane, block, tx_size, txb_ctx, p,  \
-                                    eob, plane_type, coeff_costs, xd, tx_type, \
-                                    tx_class_literal);
-  switch (tx_class) {
-    WAREHOUSE_EFFICIENTS_TXB_CASE(TX_CLASS_2D);
-    WAREHOUSE_EFFICIENTS_TXB_CASE(TX_CLASS_HORIZ);
-    WAREHOUSE_EFFICIENTS_TXB_CASE(TX_CLASS_VERT);
-#undef WAREHOUSE_EFFICIENTS_TXB_CASE
-    default: assert(false); return 0;
+  return warehouse_efficients_txb(x, plane, block, tx_size, txb_ctx, p, eob,
+                                  plane_type, coeff_costs, xd, tx_type,
+                                  tx_class, reduced_tx_set_used);
+}
+
+int av1_cost_coeffs_txb_laplacian(const MACROBLOCK *x, const int plane,
+                                  const int block, const TX_SIZE tx_size,
+                                  const TX_TYPE tx_type,
+                                  const TXB_CTX *const txb_ctx,
+                                  const int reduced_tx_set_used,
+                                  const int adjust_eob) {
+  const struct macroblock_plane *p = &x->plane[plane];
+  int eob = p->eobs[block];
+
+  if (adjust_eob) {
+    const SCAN_ORDER *scan_order = get_scan(tx_size, tx_type);
+    const int16_t *scan = scan_order->scan;
+    tran_low_t *tcoeff = p->coeff + BLOCK_OFFSET(block);
+    tran_low_t *qcoeff = p->qcoeff + BLOCK_OFFSET(block);
+    const MACROBLOCKD *xd = &x->e_mbd;
+    const struct macroblockd_plane *const pd = &xd->plane[plane];
+    tran_low_t *dqcoeff = pd->dqcoeff + BLOCK_OFFSET(block);
+    update_coeff_eob_fast(&eob, av1_get_tx_scale(tx_size), p->dequant_QTX, scan,
+                          tcoeff, qcoeff, dqcoeff);
+    p->eobs[block] = eob;
+  }
+
+  const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
+  const PLANE_TYPE plane_type = get_plane_type(plane);
+  const LV_MAP_COEFF_COST *const coeff_costs =
+      &x->coeff_costs[txs_ctx][plane_type];
+  if (eob == 0) {
+    return coeff_costs->txb_skip_cost[txb_ctx->txb_skip_ctx][1];
   }
+
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const TX_CLASS tx_class = tx_type_to_class[tx_type];
+
+  return warehouse_efficients_txb_laplacian(
+      x, plane, block, tx_size, txb_ctx, eob, plane_type, coeff_costs, xd,
+      tx_type, tx_class, reduced_tx_set_used);
 }
 
 static int optimize_txb(TxbInfo *txb_info, const LV_MAP_COEFF_COST *txb_costs,
@@ -1014,13 +1138,8 @@ static int optimize_txb(TxbInfo *txb_info, const LV_MAP_COEFF_COST *txb_costs,
   return update;
 }
 
-// These numbers are empirically obtained.
-static const int plane_rd_mult[REF_TYPES][PLANE_TYPES] = {
-  { 17, 13 },
-  { 16, 10 },
-};
-
-void hbt_init() {
+#if CONFIG_HTB_TRELLIS
+static void hbt_init() {
   hbt_hash_table =
       aom_malloc(sizeof(OptTxbQcoeff) * HBT_TABLE_SIZE * HBT_ARRAY_LENGTH);
   memset(hbt_hash_table, 0,
@@ -1032,11 +1151,11 @@ void hbt_init() {
 
 void hbt_destroy() { aom_free(hbt_hash_table); }
 
-int hbt_hash_miss(uint32_t hbt_ctx_hash, uint32_t hbt_qc_hash,
-                  TxbInfo *txb_info, const LV_MAP_COEFF_COST *txb_costs,
-                  const LV_MAP_EOB_COST *txb_eob_costs,
-                  const struct macroblock_plane *p, int block, int fast_mode,
-                  int *rate_cost) {
+static int hbt_hash_miss(uint32_t hbt_ctx_hash, uint32_t hbt_qc_hash,
+                         TxbInfo *txb_info, const LV_MAP_COEFF_COST *txb_costs,
+                         const LV_MAP_EOB_COST *txb_eob_costs,
+                         const struct macroblock_plane *p, int block,
+                         int fast_mode, int *rate_cost) {
   (void)fast_mode;
   const int16_t *scan = txb_info->scan_order->scan;
   int prev_eob = txb_info->eob;
@@ -1098,9 +1217,9 @@ int hbt_hash_miss(uint32_t hbt_ctx_hash, uint32_t hbt_qc_hash,
   return txb_info->eob;
 }
 
-int hbt_hash_hit(uint32_t hbt_table_index, int hbt_array_index,
-                 TxbInfo *txb_info, const struct macroblock_plane *p, int block,
-                 int *rate_cost) {
+static int hbt_hash_hit(uint32_t hbt_table_index, int hbt_array_index,
+                        TxbInfo *txb_info, const struct macroblock_plane *p,
+                        int block, int *rate_cost) {
   const int16_t *scan = txb_info->scan_order->scan;
   int new_eob = 0;
   int update = 0;
@@ -1140,11 +1259,12 @@ int hbt_hash_hit(uint32_t hbt_table_index, int hbt_array_index,
   return txb_info->eob;
 }
 
-int hbt_search_match(uint32_t hbt_ctx_hash, uint32_t hbt_qc_hash,
-                     TxbInfo *txb_info, const LV_MAP_COEFF_COST *txb_costs,
-                     const LV_MAP_EOB_COST *txb_eob_costs,
-                     const struct macroblock_plane *p, int block, int fast_mode,
-                     int *rate_cost) {
+static int hbt_search_match(uint32_t hbt_ctx_hash, uint32_t hbt_qc_hash,
+                            TxbInfo *txb_info,
+                            const LV_MAP_COEFF_COST *txb_costs,
+                            const LV_MAP_EOB_COST *txb_eob_costs,
+                            const struct macroblock_plane *p, int block,
+                            int fast_mode, int *rate_cost) {
   // Check for qcoeff match
   int hbt_array_index = hbt_qc_hash % HBT_ARRAY_LENGTH;
   int hbt_table_index = hbt_ctx_hash % HBT_TABLE_SIZE;
@@ -1163,10 +1283,11 @@ int hbt_search_match(uint32_t hbt_ctx_hash, uint32_t hbt_qc_hash,
   }
 }
 
-int hbt_create_hashes(TxbInfo *txb_info, const LV_MAP_COEFF_COST *txb_costs,
-                      const LV_MAP_EOB_COST *txb_eob_costs,
-                      const struct macroblock_plane *p, int block,
-                      int fast_mode, int *rate_cost) {
+static int hbt_create_hashes(TxbInfo *txb_info,
+                             const LV_MAP_COEFF_COST *txb_costs,
+                             const LV_MAP_EOB_COST *txb_eob_costs,
+                             const struct macroblock_plane *p, int block,
+                             int fast_mode, int *rate_cost) {
   // Initialize hash table if needed.
   if (hbt_needs_init) {
     hbt_init();
@@ -1287,21 +1408,49 @@ int hbt_create_hashes(TxbInfo *txb_info, const LV_MAP_COEFF_COST *txb_costs,
   return hbt_search_match(hbt_ctx_hash, hbt_qc_hash, txb_info, txb_costs,
                           txb_eob_costs, p, block, fast_mode, rate_cost);
 }
+#endif  // CONFIG_HTB_TRELLIS
 
-static AOM_FORCE_INLINE int get_coeff_cost_simple(
+static AOM_FORCE_INLINE int get_two_coeff_cost_simple(
     int ci, tran_low_t abs_qc, int coeff_ctx,
     const LV_MAP_COEFF_COST *txb_costs, int bwl, TX_CLASS tx_class,
-    const uint8_t *levels) {
+    const uint8_t *levels, int *cost_low) {
   // this simple version assumes the coeff's scan_idx is not DC (scan_idx != 0)
   // and not the last (scan_idx != eob - 1)
   assert(ci > 0);
   int cost = txb_costs->base_cost[coeff_ctx][AOMMIN(abs_qc, 3)];
+  int diff = 0;
+  if (abs_qc <= 3) diff = txb_costs->base_cost[coeff_ctx][abs_qc + 4];
   if (abs_qc) {
     cost += av1_cost_literal(1);
     if (abs_qc > NUM_BASE_LEVELS) {
       const int br_ctx = get_br_ctx(levels, ci, bwl, tx_class);
-      cost += get_br_cost(abs_qc, br_ctx, txb_costs->lps_cost[br_ctx]);
-      cost += get_golomb_cost(abs_qc);
+      int brcost_diff = 0;
+      cost += get_br_cost_with_diff(abs_qc, txb_costs->lps_cost[br_ctx],
+                                    &brcost_diff);
+      diff += brcost_diff;
+    }
+  }
+  *cost_low = cost - diff;
+
+  return cost;
+}
+
+static INLINE int get_coeff_cost_eob(int ci, tran_low_t abs_qc, int sign,
+                                     int coeff_ctx, int dc_sign_ctx,
+                                     const LV_MAP_COEFF_COST *txb_costs,
+                                     int bwl, TX_CLASS tx_class) {
+  int cost = 0;
+  cost += txb_costs->base_eob_cost[coeff_ctx][AOMMIN(abs_qc, 3) - 1];
+  if (abs_qc != 0) {
+    if (ci == 0) {
+      cost += txb_costs->dc_sign_cost[dc_sign_ctx][sign];
+    } else {
+      cost += av1_cost_literal(1);
+    }
+    if (abs_qc > NUM_BASE_LEVELS) {
+      int br_ctx;
+      br_ctx = get_br_ctx_eob(ci, bwl, tx_class);
+      cost += get_br_cost(abs_qc, txb_costs->lps_cost[br_ctx]);
     }
   }
   return cost;
@@ -1326,9 +1475,12 @@ static INLINE int get_coeff_cost_general(int is_last, int ci, tran_low_t abs_qc,
       cost += av1_cost_literal(1);
     }
     if (abs_qc > NUM_BASE_LEVELS) {
-      const int br_ctx = get_br_ctx(levels, ci, bwl, tx_class);
-      cost += get_br_cost(abs_qc, br_ctx, txb_costs->lps_cost[br_ctx]);
-      cost += get_golomb_cost(abs_qc);
+      int br_ctx;
+      if (is_last)
+        br_ctx = get_br_ctx_eob(ci, bwl, tx_class);
+      else
+        br_ctx = get_br_ctx(levels, ci, bwl, tx_class);
+      cost += get_br_cost(abs_qc, txb_costs->lps_cost[br_ctx]);
     }
   }
   return cost;
@@ -1350,8 +1502,9 @@ static INLINE void update_coeff_general(
     TX_CLASS tx_class, int bwl, int height, int64_t rdmult, int shift,
     int dc_sign_ctx, const int16_t *dequant, const int16_t *scan,
     const LV_MAP_COEFF_COST *txb_costs, const tran_low_t *tcoeff,
-    tran_low_t *qcoeff, tran_low_t *dqcoeff, uint8_t *levels) {
-  const int dqv = dequant[si != 0];
+    tran_low_t *qcoeff, tran_low_t *dqcoeff, uint8_t *levels,
+    const qm_val_t *iqmatrix) {
+  const int dqv = get_dqv(dequant, scan[si], iqmatrix);
   const int ci = scan[si];
   const tran_low_t qc = qcoeff[ci];
   const int is_last = si == (eob - 1);
@@ -1372,13 +1525,23 @@ static INLINE void update_coeff_general(
     const int64_t rd = RDCOST(rdmult, rate, dist);
 
     tran_low_t qc_low, dqc_low;
-    get_qc_dqc_low(abs_qc, sign, dqv, shift, &qc_low, &dqc_low);
-    const tran_low_t abs_qc_low = abs_qc - 1;
-    const int64_t dist_low = get_coeff_dist(tqc, dqc_low, shift);
-    const int rate_low =
-        get_coeff_cost_general(is_last, ci, abs_qc_low, sign, coeff_ctx,
-                               dc_sign_ctx, txb_costs, bwl, tx_class, levels);
-    const int64_t rd_low = RDCOST(rdmult, rate_low, dist_low);
+    tran_low_t abs_qc_low;
+    int64_t dist_low, rd_low;
+    int rate_low;
+    if (abs_qc == 1) {
+      abs_qc_low = qc_low = dqc_low = 0;
+      dist_low = dist0;
+      rate_low = txb_costs->base_cost[coeff_ctx][0];
+    } else {
+      get_qc_dqc_low(abs_qc, sign, dqv, shift, &qc_low, &dqc_low);
+      abs_qc_low = abs_qc - 1;
+      dist_low = get_coeff_dist(tqc, dqc_low, shift);
+      rate_low =
+          get_coeff_cost_general(is_last, ci, abs_qc_low, sign, coeff_ctx,
+                                 dc_sign_ctx, txb_costs, bwl, tx_class, levels);
+    }
+
+    rd_low = RDCOST(rdmult, rate_low, dist_low);
     if (rd_low < rd) {
       qcoeff[ci] = qc_low;
       dqcoeff[ci] = dqc_low;
@@ -1397,8 +1560,8 @@ static AOM_FORCE_INLINE void update_coeff_simple(
     int bwl, int64_t rdmult, int shift, const int16_t *dequant,
     const int16_t *scan, const LV_MAP_COEFF_COST *txb_costs,
     const tran_low_t *tcoeff, tran_low_t *qcoeff, tran_low_t *dqcoeff,
-    uint8_t *levels) {
-  const int dqv = dequant[1];
+    uint8_t *levels, const qm_val_t *iqmatrix) {
+  const int dqv = get_dqv(dequant, scan[si], iqmatrix);
   (void)eob;
   // this simple version assumes the coeff's scan_idx is not DC (scan_idx != 0)
   // and not the last (scan_idx != eob - 1)
@@ -1412,28 +1575,28 @@ static AOM_FORCE_INLINE void update_coeff_simple(
     *accu_rate += txb_costs->base_cost[coeff_ctx][0];
   } else {
     const tran_low_t abs_qc = abs(qc);
-    const tran_low_t tqc = tcoeff[ci];
-    const tran_low_t dqc = dqcoeff[ci];
-    const int rate = get_coeff_cost_simple(ci, abs_qc, coeff_ctx, txb_costs,
-                                           bwl, tx_class, levels);
-    if (abs(dqc) < abs(tqc)) {
+    const tran_low_t abs_tqc = abs(tcoeff[ci]);
+    const tran_low_t abs_dqc = abs(dqcoeff[ci]);
+    int rate_low = 0;
+    const int rate = get_two_coeff_cost_simple(
+        ci, abs_qc, coeff_ctx, txb_costs, bwl, tx_class, levels, &rate_low);
+    if (abs_dqc < abs_tqc) {
       *accu_rate += rate;
       return;
     }
-    const int64_t dist = get_coeff_dist(tqc, dqc, shift);
+
+    const int64_t dist = get_coeff_dist(abs_tqc, abs_dqc, shift);
     const int64_t rd = RDCOST(rdmult, rate, dist);
 
-    const int sign = (qc < 0) ? 1 : 0;
-    tran_low_t qc_low, dqc_low;
-    get_qc_dqc_low(abs_qc, sign, dqv, shift, &qc_low, &dqc_low);
     const tran_low_t abs_qc_low = abs_qc - 1;
-    const int64_t dist_low = get_coeff_dist(tqc, dqc_low, shift);
-    const int rate_low = get_coeff_cost_simple(
-        ci, abs_qc_low, coeff_ctx, txb_costs, bwl, tx_class, levels);
+    const tran_low_t abs_dqc_low = (abs_qc_low * dqv) >> shift;
+    const int64_t dist_low = get_coeff_dist(abs_tqc, abs_dqc_low, shift);
     const int64_t rd_low = RDCOST(rdmult, rate_low, dist_low);
+
     if (rd_low < rd) {
-      qcoeff[ci] = qc_low;
-      dqcoeff[ci] = dqc_low;
+      const int sign = (qc < 0) ? 1 : 0;
+      qcoeff[ci] = (-sign ^ abs_qc_low) + sign;
+      dqcoeff[ci] = (-sign ^ abs_dqc_low) + sign;
       levels[get_padded_idx(ci, bwl)] = AOMMIN(abs_qc_low, INT8_MAX);
       *accu_rate += rate_low;
     } else {
@@ -1448,8 +1611,9 @@ static AOM_FORCE_INLINE void update_coeff_eob(
     int dc_sign_ctx, int64_t rdmult, int shift, const int16_t *dequant,
     const int16_t *scan, const LV_MAP_EOB_COST *txb_eob_costs,
     const LV_MAP_COEFF_COST *txb_costs, const tran_low_t *tcoeff,
-    tran_low_t *qcoeff, tran_low_t *dqcoeff, uint8_t *levels, int sharpness) {
-  const int dqv = dequant[si != 0];
+    tran_low_t *qcoeff, tran_low_t *dqcoeff, uint8_t *levels, int sharpness,
+    const qm_val_t *iqmatrix) {
+  const int dqv = get_dqv(dequant, scan[si], iqmatrix);
   assert(si != *eob - 1);
   const int ci = scan[si];
   const tran_low_t qc = qcoeff[ci];
@@ -1471,40 +1635,42 @@ static AOM_FORCE_INLINE void update_coeff_eob(
     int64_t rd = RDCOST(rdmult, *accu_rate + rate, *accu_dist + dist);
 
     tran_low_t qc_low, dqc_low;
-    get_qc_dqc_low(abs_qc, sign, dqv, shift, &qc_low, &dqc_low);
-    const tran_low_t abs_qc_low = abs_qc - 1;
-    const int64_t dist_low = get_coeff_dist(tqc, dqc_low, shift) - dist0;
-    const int rate_low =
-        get_coeff_cost_general(0, ci, abs_qc_low, sign, coeff_ctx, dc_sign_ctx,
-                               txb_costs, bwl, tx_class, levels);
-    const int64_t rd_low =
-        RDCOST(rdmult, *accu_rate + rate_low, *accu_dist + dist_low);
+    tran_low_t abs_qc_low;
+    int64_t dist_low, rd_low;
+    int rate_low;
+    if (abs_qc == 1) {
+      abs_qc_low = 0;
+      dqc_low = qc_low = 0;
+      dist_low = 0;
+      rate_low = txb_costs->base_cost[coeff_ctx][0];
+      rd_low = RDCOST(rdmult, *accu_rate + rate_low, *accu_dist);
+    } else {
+      get_qc_dqc_low(abs_qc, sign, dqv, shift, &qc_low, &dqc_low);
+      abs_qc_low = abs_qc - 1;
+      dist_low = get_coeff_dist(tqc, dqc_low, shift) - dist0;
+      rate_low =
+          get_coeff_cost_general(0, ci, abs_qc_low, sign, coeff_ctx,
+                                 dc_sign_ctx, txb_costs, bwl, tx_class, levels);
+      rd_low = RDCOST(rdmult, *accu_rate + rate_low, *accu_dist + dist_low);
+    }
 
     int lower_level_new_eob = 0;
     const int new_eob = si + 1;
-    uint8_t tmp_levels[3];
-    for (int ni = 0; ni < *nz_num; ++ni) {
-      const int last_ci = nz_ci[ni];
-      tmp_levels[ni] = levels[get_padded_idx(last_ci, bwl)];
-      levels[get_padded_idx(last_ci, bwl)] = 0;
-    }
-
-    const int coeff_ctx_new_eob = get_lower_levels_ctx_general(
-        1, si, bwl, height, levels, ci, tx_size, tx_class);
+    const int coeff_ctx_new_eob = get_lower_levels_ctx_eob(bwl, height, si);
     const int new_eob_cost =
         get_eob_cost(new_eob, txb_eob_costs, txb_costs, tx_class);
     int rate_coeff_eob =
-        new_eob_cost + get_coeff_cost_general(1, ci, abs_qc, sign,
-                                              coeff_ctx_new_eob, dc_sign_ctx,
-                                              txb_costs, bwl, tx_class, levels);
+        new_eob_cost + get_coeff_cost_eob(ci, abs_qc, sign, coeff_ctx_new_eob,
+                                          dc_sign_ctx, txb_costs, bwl,
+                                          tx_class);
     int64_t dist_new_eob = dist;
     int64_t rd_new_eob = RDCOST(rdmult, rate_coeff_eob, dist_new_eob);
 
     if (abs_qc_low > 0) {
       const int rate_coeff_eob_low =
-          new_eob_cost +
-          get_coeff_cost_general(1, ci, abs_qc_low, sign, coeff_ctx_new_eob,
-                                 dc_sign_ctx, txb_costs, bwl, tx_class, levels);
+          new_eob_cost + get_coeff_cost_eob(ci, abs_qc_low, sign,
+                                            coeff_ctx_new_eob, dc_sign_ctx,
+                                            txb_costs, bwl, tx_class);
       const int64_t dist_new_eob_low = dist_low;
       const int64_t rd_new_eob_low =
           RDCOST(rdmult, rate_coeff_eob_low, dist_new_eob_low);
@@ -1526,7 +1692,7 @@ static AOM_FORCE_INLINE void update_coeff_eob(
     if (sharpness == 0 && rd_new_eob < rd) {
       for (int ni = 0; ni < *nz_num; ++ni) {
         int last_ci = nz_ci[ni];
-        // levels[get_padded_idx(last_ci, bwl)] = 0;
+        levels[get_padded_idx(last_ci, bwl)] = 0;
         qcoeff[last_ci] = 0;
         dqcoeff[last_ci] = 0;
       }
@@ -1536,10 +1702,6 @@ static AOM_FORCE_INLINE void update_coeff_eob(
       *accu_dist = dist_new_eob;
       lower_level = lower_level_new_eob;
     } else {
-      for (int ni = 0; ni < *nz_num; ++ni) {
-        const int last_ci = nz_ci[ni];
-        levels[get_padded_idx(last_ci, bwl)] = tmp_levels[ni];
-      }
       *accu_rate += rate;
       *accu_dist += dist;
     }
@@ -1579,54 +1741,74 @@ static INLINE void update_skip(int *accu_rate, int64_t accu_dist, int *eob,
 int av1_optimize_txb_new(const struct AV1_COMP *cpi, MACROBLOCK *x, int plane,
                          int block, TX_SIZE tx_size, TX_TYPE tx_type,
                          const TXB_CTX *const txb_ctx, int *rate_cost,
-                         int sharpness) {
-  const AV1_COMMON *cm = &cpi->common;
+                         int sharpness, int fast_mode) {
   MACROBLOCKD *xd = &x->e_mbd;
+  struct macroblockd_plane *pd = &xd->plane[plane];
+  const struct macroblock_plane *p = &x->plane[plane];
+  const SCAN_ORDER *scan_order = get_scan(tx_size, tx_type);
+  const int16_t *scan = scan_order->scan;
+  const int shift = av1_get_tx_scale(tx_size);
+  int eob = p->eobs[block];
+  const int16_t *dequant = p->dequant_QTX;
+  const qm_val_t *iqmatrix =
+      av1_get_iqmatrix(&cpi->common.quant_params, xd, plane, tx_size, tx_type);
+  const int block_offset = BLOCK_OFFSET(block);
+  tran_low_t *qcoeff = p->qcoeff + block_offset;
+  tran_low_t *dqcoeff = pd->dqcoeff + block_offset;
+  const tran_low_t *tcoeff = p->coeff + block_offset;
+
+  // This function is not called if eob = 0.
+  assert(eob > 0);
+
+  if (fast_mode) {
+    update_coeff_eob_fast(&eob, shift, dequant, scan, tcoeff, qcoeff, dqcoeff);
+    p->eobs[block] = eob;
+    if (eob == 0) {
+      *rate_cost = av1_cost_skip_txb(x, txb_ctx, plane, tx_size);
+      return eob;
+    }
+  }
+
+  const AV1_COMMON *cm = &cpi->common;
   const PLANE_TYPE plane_type = get_plane_type(plane);
   const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
   const TX_CLASS tx_class = tx_type_to_class[tx_type];
   const MB_MODE_INFO *mbmi = xd->mi[0];
-  const struct macroblock_plane *p = &x->plane[plane];
-  struct macroblockd_plane *pd = &xd->plane[plane];
-  tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block);
-  tran_low_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
-  const tran_low_t *tcoeff = BLOCK_OFFSET(p->coeff, block);
-  const int16_t *dequant = p->dequant_QTX;
   const int bwl = get_txb_bwl(tx_size);
   const int width = get_txb_wide(tx_size);
   const int height = get_txb_high(tx_size);
   assert(width == (1 << bwl));
   const int is_inter = is_inter_block(mbmi);
-  const SCAN_ORDER *scan_order = get_scan(tx_size, tx_type);
-  const int16_t *scan = scan_order->scan;
   const LV_MAP_COEFF_COST *txb_costs = &x->coeff_costs[txs_ctx][plane_type];
   const int eob_multi_size = txsize_log2_minus4[tx_size];
   const LV_MAP_EOB_COST *txb_eob_costs =
       &x->eob_costs[eob_multi_size][plane_type];
 
-  const int shift = av1_get_tx_scale(tx_size);
-  const int64_t rdmult =
-      ((x->rdmult * plane_rd_mult[is_inter][plane_type] << (2 * (xd->bd - 8))) +
-       2) >>
+  const int rshift =
       (sharpness +
        (cpi->oxcf.aq_mode == VARIANCE_AQ && mbmi->segment_id < 4
             ? 7 - mbmi->segment_id
             : 2) +
        (cpi->oxcf.aq_mode != VARIANCE_AQ &&
-                cpi->oxcf.deltaq_mode > NO_DELTA_Q && x->sb_energy_level < 0
+                cpi->oxcf.deltaq_mode == DELTA_Q_PERCEPTUAL &&
+                cm->delta_q_info.delta_q_present_flag && x->sb_energy_level < 0
             ? (3 - x->sb_energy_level)
             : 0));
+  const int64_t rdmult =
+      (((int64_t)x->rdmult *
+        (plane_rd_mult[is_inter][plane_type] << (2 * (xd->bd - 8)))) +
+       2) >>
+      rshift;
 
   uint8_t levels_buf[TX_PAD_2D];
   uint8_t *const levels = set_levels(levels_buf, width);
 
-  av1_txb_init_levels(qcoeff, width, height, levels);
+  if (eob > 1) av1_txb_init_levels(qcoeff, width, height, levels);
 
   // TODO(angirbird): check iqmatrix
 
   const int non_skip_cost = txb_costs->txb_skip_cost[txb_ctx->txb_skip_ctx][0];
   const int skip_cost = txb_costs->txb_skip_cost[txb_ctx->txb_skip_ctx][1];
-  int eob = p->eobs[block];
   const int eob_cost = get_eob_cost(eob, txb_eob_costs, txb_costs, tx_class);
   int accu_rate = eob_cost;
   int64_t accu_dist = 0;
@@ -1642,15 +1824,14 @@ int av1_optimize_txb_new(const struct AV1_COMP *cpi, MACROBLOCK *x, int plane,
     update_coeff_general(&accu_rate, &accu_dist, si, eob, tx_size, tx_class,
                          bwl, height, rdmult, shift, txb_ctx->dc_sign_ctx,
                          dequant, scan, txb_costs, tcoeff, qcoeff, dqcoeff,
-                         levels);
+                         levels, iqmatrix);
     --si;
   } else {
     assert(abs_qc == 1);
-    const int coeff_ctx = get_lower_levels_ctx_general(
-        1, si, bwl, height, levels, ci, tx_size, tx_class);
-    accu_rate += get_coeff_cost_general(1, ci, abs_qc, sign, coeff_ctx,
-                                        txb_ctx->dc_sign_ctx, txb_costs, bwl,
-                                        tx_class, levels);
+    const int coeff_ctx = get_lower_levels_ctx_eob(bwl, height, si);
+    accu_rate +=
+        get_coeff_cost_eob(ci, abs_qc, sign, coeff_ctx, txb_ctx->dc_sign_ctx,
+                           txb_costs, bwl, tx_class);
     const tran_low_t tqc = tcoeff[ci];
     const tran_low_t dqc = dqcoeff[ci];
     const int64_t dist = get_coeff_dist(tqc, dqc, shift);
@@ -1661,12 +1842,12 @@ int av1_optimize_txb_new(const struct AV1_COMP *cpi, MACROBLOCK *x, int plane,
 
 #define UPDATE_COEFF_EOB_CASE(tx_class_literal)                            \
   case tx_class_literal:                                                   \
-    for (; si >= 0 && nz_num <= max_nz_num; --si) {                        \
+    for (; si >= 0 && nz_num <= max_nz_num && !fast_mode; --si) {          \
       update_coeff_eob(&accu_rate, &accu_dist, &eob, &nz_num, nz_ci, si,   \
                        tx_size, tx_class_literal, bwl, height,             \
                        txb_ctx->dc_sign_ctx, rdmult, shift, dequant, scan, \
                        txb_eob_costs, txb_costs, tcoeff, qcoeff, dqcoeff,  \
-                       levels, sharpness);                                 \
+                       levels, sharpness, iqmatrix);                       \
     }                                                                      \
     break;
   switch (tx_class) {
@@ -1687,7 +1868,7 @@ int av1_optimize_txb_new(const struct AV1_COMP *cpi, MACROBLOCK *x, int plane,
     for (; si >= 1; --si) {                                                    \
       update_coeff_simple(&accu_rate, si, eob, tx_size, tx_class_literal, bwl, \
                           rdmult, shift, dequant, scan, txb_costs, tcoeff,     \
-                          qcoeff, dqcoeff, levels);                            \
+                          qcoeff, dqcoeff, levels, iqmatrix);                  \
     }                                                                          \
     break;
   switch (tx_class) {
@@ -1705,10 +1886,11 @@ int av1_optimize_txb_new(const struct AV1_COMP *cpi, MACROBLOCK *x, int plane,
     update_coeff_general(&accu_rate, &dummy_dist, si, eob, tx_size, tx_class,
                          bwl, height, rdmult, shift, txb_ctx->dc_sign_ctx,
                          dequant, scan, txb_costs, tcoeff, qcoeff, dqcoeff,
-                         levels);
+                         levels, iqmatrix);
   }
 
-  const int tx_type_cost = get_tx_type_cost(cm, x, xd, plane, tx_size, tx_type);
+  const int tx_type_cost = get_tx_type_cost(x, xd, plane, tx_size, tx_type,
+                                            cm->features.reduced_tx_set_used);
   if (eob == 0)
     accu_rate += skip_cost;
   else
@@ -1728,18 +1910,20 @@ int av1_optimize_txb(const struct AV1_COMP *cpi, MACROBLOCK *x, int plane,
                      int blk_row, int blk_col, int block, TX_SIZE tx_size,
                      TXB_CTX *txb_ctx, int fast_mode, int *rate_cost) {
   const AV1_COMMON *cm = &cpi->common;
+  const int reduced_tx_set_used = cm->features.reduced_tx_set_used;
   MACROBLOCKD *const xd = &x->e_mbd;
   const PLANE_TYPE plane_type = get_plane_type(plane);
   const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
-  const TX_TYPE tx_type = av1_get_tx_type(plane_type, xd, blk_row, blk_col,
-                                          tx_size, cm->reduced_tx_set_used);
+  const TX_TYPE tx_type = av1_get_tx_type(xd, plane_type, blk_row, blk_col,
+                                          tx_size, reduced_tx_set_used);
   const MB_MODE_INFO *mbmi = xd->mi[0];
   const struct macroblock_plane *p = &x->plane[plane];
   struct macroblockd_plane *pd = &xd->plane[plane];
   const int eob = p->eobs[block];
-  tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block);
-  tran_low_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
-  const tran_low_t *tcoeff = BLOCK_OFFSET(p->coeff, block);
+  const int block_offset = BLOCK_OFFSET(block);
+  tran_low_t *qcoeff = p->qcoeff + block_offset;
+  tran_low_t *dqcoeff = pd->dqcoeff + block_offset;
+  const tran_low_t *tcoeff = p->coeff + block_offset;
   const int16_t *dequant = p->dequant_QTX;
   const int seg_eob = av1_get_max_eob(tx_size);
   const int bwl = get_txb_bwl(tx_size);
@@ -1754,25 +1938,24 @@ int av1_optimize_txb(const struct AV1_COMP *cpi, MACROBLOCK *x, int plane,
 
   const int shift = av1_get_tx_scale(tx_size);
   const int64_t rdmult =
-      ((x->rdmult * plane_rd_mult[is_inter][plane_type] << (2 * (xd->bd - 8))) +
+      (((int64_t)x->rdmult * plane_rd_mult[is_inter][plane_type]
+        << (2 * (xd->bd - 8))) +
        2) >>
       2;
   uint8_t levels_buf[TX_PAD_2D];
   uint8_t *const levels = set_levels(levels_buf, width);
-  const TX_SIZE qm_tx_size = av1_get_adjusted_tx_size(tx_size);
   const qm_val_t *iqmatrix =
-      IS_2D_TRANSFORM(tx_type)
-          ? pd->seg_iqmatrix[mbmi->segment_id][qm_tx_size]
-          : cm->giqmatrix[NUM_QM_LEVELS - 1][0][qm_tx_size];
+      av1_get_iqmatrix(&cpi->common.quant_params, xd, plane, tx_size, tx_type);
   assert(width == (1 << bwl));
-  const int tx_type_cost = get_tx_type_cost(cm, x, xd, plane, tx_size, tx_type);
+  const int tx_type_cost =
+      get_tx_type_cost(x, xd, plane, tx_size, tx_type, reduced_tx_set_used);
   TxbInfo txb_info = {
-    qcoeff,   levels,       dqcoeff,    tcoeff,  dequant, shift,
-    tx_size,  txs_ctx,      tx_type,    bwl,     width,   height,
-    eob,      seg_eob,      scan_order, txb_ctx, rdmult,  &cm->coeff_ctx_table,
-    iqmatrix, tx_type_cost,
+    qcoeff,     levels,  dqcoeff, tcoeff,   dequant,      shift, tx_size,
+    txs_ctx,    tx_type, bwl,     width,    height,       eob,   seg_eob,
+    scan_order, txb_ctx, rdmult,  iqmatrix, tx_type_cost,
   };
 
+#if CONFIG_HTB_TRELLIS
   // Hash based trellis (hbt) speed feature: avoid expensive optimize_txb calls
   // by storing the coefficient deltas in a hash table.
   // Currently disabled in speedfeatures.c
@@ -1780,7 +1963,9 @@ int av1_optimize_txb(const struct AV1_COMP *cpi, MACROBLOCK *x, int plane,
     return hbt_create_hashes(&txb_info, txb_costs, &txb_eob_costs, p, block,
                              fast_mode, rate_cost);
   }
-
+#else
+  (void)fast_mode;
+#endif  // CONFIG_HTB_TRELLIS
   av1_txb_init_levels(qcoeff, width, height, levels);
 
   const int update =
@@ -1812,34 +1997,14 @@ int av1_get_txb_entropy_context(const tran_low_t *qcoeff,
   return cul_level;
 }
 
-void av1_update_txb_context_b(int plane, int block, int blk_row, int blk_col,
-                              BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
-                              void *arg) {
-  struct tokenize_b_args *const args = arg;
-  const AV1_COMP *cpi = args->cpi;
-  const AV1_COMMON *cm = &cpi->common;
-  ThreadData *const td = args->td;
-  MACROBLOCK *const x = &td->mb;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  struct macroblock_plane *p = &x->plane[plane];
-  struct macroblockd_plane *pd = &xd->plane[plane];
-  const uint16_t eob = p->eobs[block];
-  const tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block);
-  const PLANE_TYPE plane_type = pd->plane_type;
-  const TX_TYPE tx_type = av1_get_tx_type(plane_type, xd, blk_row, blk_col,
-                                          tx_size, cm->reduced_tx_set_used);
-  const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type);
-  const int cul_level = av1_get_txb_entropy_context(qcoeff, scan_order, eob);
-  av1_set_contexts(xd, pd, plane, plane_bsize, tx_size, cul_level, blk_col,
-                   blk_row);
-}
-
-static void update_tx_type_count(const AV1_COMMON *cm, MACROBLOCKD *xd,
-                                 int blk_row, int blk_col, int plane,
-                                 TX_SIZE tx_size, FRAME_COUNTS *counts,
+static void update_tx_type_count(const AV1_COMP *cpi, const AV1_COMMON *cm,
+                                 MACROBLOCKD *xd, int blk_row, int blk_col,
+                                 int plane, TX_SIZE tx_size,
+                                 FRAME_COUNTS *counts,
                                  uint8_t allow_update_cdf) {
   MB_MODE_INFO *mbmi = xd->mi[0];
   int is_inter = is_inter_block(mbmi);
+  const int reduced_tx_set_used = cm->features.reduced_tx_set_used;
   FRAME_CONTEXT *fc = xd->tile_ctx;
 #if !CONFIG_ENTROPY_STATS
   (void)counts;
@@ -1847,15 +2012,30 @@ static void update_tx_type_count(const AV1_COMMON *cm, MACROBLOCKD *xd,
 
   // Only y plane's tx_type is updated
   if (plane > 0) return;
-  TX_TYPE tx_type = av1_get_tx_type(PLANE_TYPE_Y, xd, blk_row, blk_col, tx_size,
-                                    cm->reduced_tx_set_used);
-  if (get_ext_tx_types(tx_size, is_inter, cm->reduced_tx_set_used) > 1 &&
-      cm->base_qindex > 0 && !mbmi->skip &&
+  const TX_TYPE tx_type = av1_get_tx_type(xd, PLANE_TYPE_Y, blk_row, blk_col,
+                                          tx_size, reduced_tx_set_used);
+  if (is_inter) {
+    if (cpi->oxcf.use_inter_dct_only) {
+      assert(tx_type == DCT_DCT);
+    }
+  } else {
+    if (cpi->oxcf.use_intra_dct_only) {
+      assert(tx_type == DCT_DCT);
+    } else if (cpi->oxcf.use_intra_default_tx_only) {
+      const TX_TYPE default_type = get_default_tx_type(
+          PLANE_TYPE_Y, xd, tx_size, cpi->is_screen_content_type);
+      (void)default_type;
+      assert(tx_type == default_type);
+    }
+  }
+
+  if (get_ext_tx_types(tx_size, is_inter, reduced_tx_set_used) > 1 &&
+      cm->quant_params.base_qindex > 0 && !mbmi->skip &&
       !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
-    const int eset = get_ext_tx_set(tx_size, is_inter, cm->reduced_tx_set_used);
+    const int eset = get_ext_tx_set(tx_size, is_inter, reduced_tx_set_used);
     if (eset > 0) {
       const TxSetType tx_set_type =
-          av1_get_ext_tx_set_type(tx_size, is_inter, cm->reduced_tx_set_used);
+          av1_get_ext_tx_set_type(tx_size, is_inter, reduced_tx_set_used);
       if (is_inter) {
         if (allow_update_cdf) {
           update_cdf(fc->inter_ext_tx_cdf[eset][txsize_sqr_map[tx_size]],
@@ -1899,84 +2079,98 @@ void av1_update_and_record_txb_context(int plane, int block, int blk_row,
   MACROBLOCKD *const xd = &x->e_mbd;
   struct macroblock_plane *p = &x->plane[plane];
   struct macroblockd_plane *pd = &xd->plane[plane];
-  MB_MODE_INFO *mbmi = xd->mi[0];
   const int eob = p->eobs[block];
-  TXB_CTX txb_ctx;
-  get_txb_ctx(plane_bsize, tx_size, plane, pd->above_context + blk_col,
-              pd->left_context + blk_row, &txb_ctx);
-  const int bwl = get_txb_bwl(tx_size);
-  const int width = get_txb_wide(tx_size);
-  const int height = get_txb_high(tx_size);
-  const uint8_t allow_update_cdf = args->allow_update_cdf;
-  const TX_SIZE txsize_ctx = get_txsize_entropy_ctx(tx_size);
-  FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
-#if CONFIG_ENTROPY_STATS
-  int cdf_idx = cm->coef_cdf_category;
-#endif  // CONFIG_ENTROPY_STATS
-
+  const int block_offset = BLOCK_OFFSET(block);
+  tran_low_t *qcoeff = p->qcoeff + block_offset;
+  const PLANE_TYPE plane_type = pd->plane_type;
+  const TX_TYPE tx_type =
+      av1_get_tx_type(xd, plane_type, blk_row, blk_col, tx_size,
+                      cm->features.reduced_tx_set_used);
+  const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type);
+  tran_low_t *tcoeff;
+  assert(args->dry_run != DRY_RUN_COSTCOEFFS);
+  if (args->dry_run == OUTPUT_ENABLED) {
+    MB_MODE_INFO *mbmi = xd->mi[0];
+    TXB_CTX txb_ctx;
+    get_txb_ctx(plane_bsize, tx_size, plane,
+                pd->above_entropy_context + blk_col,
+                pd->left_entropy_context + blk_row, &txb_ctx);
+    const int bwl = get_txb_bwl(tx_size);
+    const int width = get_txb_wide(tx_size);
+    const int height = get_txb_high(tx_size);
+    const uint8_t allow_update_cdf = args->allow_update_cdf;
+    const TX_SIZE txsize_ctx = get_txsize_entropy_ctx(tx_size);
+    FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
 #if CONFIG_ENTROPY_STATS
-  ++td->counts->txb_skip[cdf_idx][txsize_ctx][txb_ctx.txb_skip_ctx][eob == 0];
+    int cdf_idx = cm->coef_cdf_category;
+    ++td->counts->txb_skip[cdf_idx][txsize_ctx][txb_ctx.txb_skip_ctx][eob == 0];
 #endif  // CONFIG_ENTROPY_STATS
-  if (allow_update_cdf) {
-    update_cdf(ec_ctx->txb_skip_cdf[txsize_ctx][txb_ctx.txb_skip_ctx], eob == 0,
-               2);
-  }
-
-  x->mbmi_ext->txb_skip_ctx[plane][block] = txb_ctx.txb_skip_ctx;
-  x->mbmi_ext->eobs[plane][block] = eob;
+    if (allow_update_cdf) {
+      update_cdf(ec_ctx->txb_skip_cdf[txsize_ctx][txb_ctx.txb_skip_ctx],
+                 eob == 0, 2);
+    }
 
-  if (eob == 0) {
-    av1_set_contexts(xd, pd, plane, plane_bsize, tx_size, 0, blk_col, blk_row);
-    return;
-  }
+    CB_COEFF_BUFFER *cb_coef_buff = x->cb_coef_buff;
+    const int txb_offset =
+        x->mbmi_ext_frame->cb_offset / (TX_SIZE_W_MIN * TX_SIZE_H_MIN);
+    uint16_t *eob_txb = cb_coef_buff->eobs[plane] + txb_offset;
+    uint8_t *const entropy_ctx = cb_coef_buff->entropy_ctx[plane] + txb_offset;
+    entropy_ctx[block] = txb_ctx.txb_skip_ctx;
+    eob_txb[block] = eob;
+
+    if (eob == 0) {
+      av1_set_entropy_contexts(xd, pd, plane, plane_bsize, tx_size, 0, blk_col,
+                               blk_row);
+      return;
+    }
+    const int segment_id = mbmi->segment_id;
+    const int seg_eob = av1_get_tx_eob(&cpi->common.seg, segment_id, tx_size);
+    tran_low_t *tcoeff_txb =
+        cb_coef_buff->tcoeff[plane] + x->mbmi_ext_frame->cb_offset;
+    tcoeff = tcoeff_txb + block_offset;
+    memcpy(tcoeff, qcoeff, sizeof(*tcoeff) * seg_eob);
+
+    uint8_t levels_buf[TX_PAD_2D];
+    uint8_t *const levels = set_levels(levels_buf, width);
+    av1_txb_init_levels(tcoeff, width, height, levels);
+    update_tx_type_count(cpi, cm, xd, blk_row, blk_col, plane, tx_size,
+                         td->counts, allow_update_cdf);
 
-  tran_low_t *tcoeff = BLOCK_OFFSET(x->mbmi_ext->tcoeff[plane], block);
-  const int segment_id = mbmi->segment_id;
-  const int seg_eob = av1_get_tx_eob(&cpi->common.seg, segment_id, tx_size);
-  const tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block);
-  memcpy(tcoeff, qcoeff, sizeof(*tcoeff) * seg_eob);
+    const TX_CLASS tx_class = tx_type_to_class[tx_type];
+    const int16_t *const scan = scan_order->scan;
 
-  uint8_t levels_buf[TX_PAD_2D];
-  uint8_t *const levels = set_levels(levels_buf, width);
-  av1_txb_init_levels(tcoeff, width, height, levels);
-  update_tx_type_count(cm, xd, blk_row, blk_col, plane, tx_size, td->counts,
-                       allow_update_cdf);
+    // record tx type usage
+    td->rd_counts.tx_type_used[tx_size][tx_type]++;
 
-  const PLANE_TYPE plane_type = pd->plane_type;
-  const TX_TYPE tx_type = av1_get_tx_type(plane_type, xd, blk_row, blk_col,
-                                          tx_size, cm->reduced_tx_set_used);
-  const TX_CLASS tx_class = tx_type_to_class[tx_type];
-  const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type);
-  const int16_t *const scan = scan_order->scan;
 #if CONFIG_ENTROPY_STATS
-  av1_update_eob_context(cdf_idx, eob, tx_size, tx_class, plane_type, ec_ctx,
-                         td->counts, allow_update_cdf);
+    av1_update_eob_context(cdf_idx, eob, tx_size, tx_class, plane_type, ec_ctx,
+                           td->counts, allow_update_cdf);
 #else
-  av1_update_eob_context(eob, tx_size, tx_class, plane_type, ec_ctx,
-                         allow_update_cdf);
+    av1_update_eob_context(eob, tx_size, tx_class, plane_type, ec_ctx,
+                           allow_update_cdf);
 #endif
 
-  DECLARE_ALIGNED(16, int8_t, coeff_contexts[MAX_TX_SQUARE]);
-  av1_get_nz_map_contexts(levels, scan, eob, tx_size, tx_class, coeff_contexts);
+    DECLARE_ALIGNED(16, int8_t, coeff_contexts[MAX_TX_SQUARE]);
+    av1_get_nz_map_contexts(levels, scan, eob, tx_size, tx_class,
+                            coeff_contexts);
 
-  for (int c = eob - 1; c >= 0; --c) {
-    const int pos = scan[c];
-    const int coeff_ctx = coeff_contexts[pos];
-    const tran_low_t v = qcoeff[pos];
-    const tran_low_t level = abs(v);
+    for (int c = eob - 1; c >= 0; --c) {
+      const int pos = scan[c];
+      const int coeff_ctx = coeff_contexts[pos];
+      const tran_low_t v = qcoeff[pos];
+      const tran_low_t level = abs(v);
 
-    if (allow_update_cdf) {
-      if (c == eob - 1) {
-        assert(coeff_ctx < 4);
-        update_cdf(
-            ec_ctx->coeff_base_eob_cdf[txsize_ctx][plane_type][coeff_ctx],
-            AOMMIN(level, 3) - 1, 3);
-      } else {
-        update_cdf(ec_ctx->coeff_base_cdf[txsize_ctx][plane_type][coeff_ctx],
-                   AOMMIN(level, 3), 4);
+      if (allow_update_cdf) {
+        if (c == eob - 1) {
+          assert(coeff_ctx < 4);
+          update_cdf(
+              ec_ctx->coeff_base_eob_cdf[txsize_ctx][plane_type][coeff_ctx],
+              AOMMIN(level, 3) - 1, 3);
+        } else {
+          update_cdf(ec_ctx->coeff_base_cdf[txsize_ctx][plane_type][coeff_ctx],
+                     AOMMIN(level, 3), 4);
+        }
       }
-    }
-    {
       if (c == eob - 1) {
         assert(coeff_ctx < 4);
 #if CONFIG_ENTROPY_STATS
@@ -1987,76 +2181,81 @@ void av1_update_and_record_txb_context(int plane, int block, int blk_row,
                                       [coeff_ctx][AOMMIN(level, 3)];
 #endif
       }
-    }
-    if (level > NUM_BASE_LEVELS) {
-      const int base_range = level - 1 - NUM_BASE_LEVELS;
-      const int br_ctx = get_br_ctx(levels, pos, bwl, tx_class);
-      for (int idx = 0; idx < COEFF_BASE_RANGE; idx += BR_CDF_SIZE - 1) {
-        const int k = AOMMIN(base_range - idx, BR_CDF_SIZE - 1);
-        if (allow_update_cdf) {
-          update_cdf(ec_ctx->coeff_br_cdf[AOMMIN(txsize_ctx, TX_32X32)]
-                                         [plane_type][br_ctx],
-                     k, BR_CDF_SIZE);
-        }
-        for (int lps = 0; lps < BR_CDF_SIZE - 1; lps++) {
+      if (level > NUM_BASE_LEVELS) {
+        const int base_range = level - 1 - NUM_BASE_LEVELS;
+        const int br_ctx = get_br_ctx(levels, pos, bwl, tx_class);
+        for (int idx = 0; idx < COEFF_BASE_RANGE; idx += BR_CDF_SIZE - 1) {
+          const int k = AOMMIN(base_range - idx, BR_CDF_SIZE - 1);
+          if (allow_update_cdf) {
+            update_cdf(ec_ctx->coeff_br_cdf[AOMMIN(txsize_ctx, TX_32X32)]
+                                           [plane_type][br_ctx],
+                       k, BR_CDF_SIZE);
+          }
+          for (int lps = 0; lps < BR_CDF_SIZE - 1; lps++) {
 #if CONFIG_ENTROPY_STATS
-          ++td->counts->coeff_lps[AOMMIN(txsize_ctx, TX_32X32)][plane_type][lps]
-                                 [br_ctx][lps == k];
+            ++td->counts->coeff_lps[AOMMIN(txsize_ctx, TX_32X32)][plane_type]
+                                   [lps][br_ctx][lps == k];
 #endif  // CONFIG_ENTROPY_STATS
-          if (lps == k) break;
-        }
+            if (lps == k) break;
+          }
 #if CONFIG_ENTROPY_STATS
-        ++td->counts->coeff_lps_multi[cdf_idx][AOMMIN(txsize_ctx, TX_32X32)]
-                                     [plane_type][br_ctx][k];
+          ++td->counts->coeff_lps_multi[cdf_idx][AOMMIN(txsize_ctx, TX_32X32)]
+                                       [plane_type][br_ctx][k];
 #endif
-        if (k < BR_CDF_SIZE - 1) break;
+          if (k < BR_CDF_SIZE - 1) break;
+        }
       }
     }
-  }
-
-  // Update the context needed to code the DC sign (if applicable)
-  if (tcoeff[0] != 0) {
-    const int dc_sign = (tcoeff[0] < 0) ? 1 : 0;
-    const int dc_sign_ctx = txb_ctx.dc_sign_ctx;
+    // Update the context needed to code the DC sign (if applicable)
+    if (tcoeff[0] != 0) {
+      const int dc_sign = (tcoeff[0] < 0) ? 1 : 0;
+      const int dc_sign_ctx = txb_ctx.dc_sign_ctx;
 #if CONFIG_ENTROPY_STATS
-    ++td->counts->dc_sign[plane_type][dc_sign_ctx][dc_sign];
+      ++td->counts->dc_sign[plane_type][dc_sign_ctx][dc_sign];
 #endif  // CONFIG_ENTROPY_STATS
-    if (allow_update_cdf)
-      update_cdf(ec_ctx->dc_sign_cdf[plane_type][dc_sign_ctx], dc_sign, 2);
-    x->mbmi_ext->dc_sign_ctx[plane][block] = dc_sign_ctx;
+      if (allow_update_cdf)
+        update_cdf(ec_ctx->dc_sign_cdf[plane_type][dc_sign_ctx], dc_sign, 2);
+      entropy_ctx[block] |= dc_sign_ctx << DC_SIGN_CTX_SHIFT;
+    }
+  } else {
+    tcoeff = qcoeff;
   }
-
   const int cul_level = av1_get_txb_entropy_context(tcoeff, scan_order, eob);
-  av1_set_contexts(xd, pd, plane, plane_bsize, tx_size, cul_level, blk_col,
-                   blk_row);
+  av1_set_entropy_contexts(xd, pd, plane, plane_bsize, tx_size, cul_level,
+                           blk_col, blk_row);
 }
 
 void av1_update_txb_context(const AV1_COMP *cpi, ThreadData *td,
-                            RUN_TYPE dry_run, BLOCK_SIZE bsize, int *rate,
-                            int mi_row, int mi_col, uint8_t allow_update_cdf) {
+                            RUN_TYPE dry_run, BLOCK_SIZE bsize,
+                            uint8_t allow_update_cdf) {
   const AV1_COMMON *const cm = &cpi->common;
   const int num_planes = av1_num_planes(cm);
   MACROBLOCK *const x = &td->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = xd->mi[0];
-  struct tokenize_b_args arg = { cpi, td, NULL, 0, allow_update_cdf };
-  (void)rate;
-  (void)mi_row;
-  (void)mi_col;
+  struct tokenize_b_args arg = { cpi, td, 0, allow_update_cdf, dry_run };
   if (mbmi->skip) {
-    av1_reset_skip_context(xd, mi_row, mi_col, bsize, num_planes);
+    av1_reset_entropy_context(xd, bsize, num_planes);
     return;
   }
 
-  if (!dry_run) {
-    av1_foreach_transformed_block(xd, bsize, mi_row, mi_col,
-                                  av1_update_and_record_txb_context, &arg,
-                                  num_planes);
-  } else if (dry_run == DRY_RUN_NORMAL) {
-    av1_foreach_transformed_block(xd, bsize, mi_row, mi_col,
-                                  av1_update_txb_context_b, &arg, num_planes);
-  } else {
-    printf("DRY_RUN_COSTCOEFFS is not supported yet\n");
-    assert(0);
+  for (int plane = 0; plane < num_planes; ++plane) {
+    if (plane && !xd->is_chroma_ref) break;
+    const struct macroblockd_plane *const pd = &xd->plane[plane];
+    const int ss_x = pd->subsampling_x;
+    const int ss_y = pd->subsampling_y;
+    const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, ss_x, ss_y);
+    av1_foreach_transformed_block_in_plane(
+        xd, plane_bsize, plane, av1_update_and_record_txb_context, &arg);
   }
 }
+
+CB_COEFF_BUFFER *av1_get_cb_coeff_buffer(const struct AV1_COMP *cpi, int mi_row,
+                                         int mi_col) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const int mib_size_log2 = cm->seq_params.mib_size_log2;
+  const int stride = (cm->mi_params.mi_cols >> mib_size_log2) + 1;
+  const int offset =
+      (mi_row >> mib_size_log2) * stride + (mi_col >> mib_size_log2);
+  return cpi->coeff_buffer_base + offset;
+}